git.saurik.com Git - apple/xnu.git/commitdiff
snapshot: xnu-4903.221.2.tar.gz    tags: macos-10141, v4903.221.2
author     Apple <opensource@apple.com>
           Thu, 6 Dec 2018 05:28:38 +0000 (05:28 +0000)
committer  Apple <opensource@apple.com>
           Thu, 6 Dec 2018 05:28:38 +0000 (05:28 +0000)
1283 files changed:
EXTERNAL_HEADERS/Makefile
EXTERNAL_HEADERS/corecrypto/cc.h
EXTERNAL_HEADERS/corecrypto/cc_config.h
EXTERNAL_HEADERS/corecrypto/cc_debug.h
EXTERNAL_HEADERS/corecrypto/cc_error.h [new file with mode: 0644]
EXTERNAL_HEADERS/corecrypto/cc_priv.h
EXTERNAL_HEADERS/corecrypto/cc_runtime_config.h
EXTERNAL_HEADERS/corecrypto/ccaes.h
EXTERNAL_HEADERS/corecrypto/ccasn1.h
EXTERNAL_HEADERS/corecrypto/ccchacha20poly1305.h
EXTERNAL_HEADERS/corecrypto/cccmac.h
EXTERNAL_HEADERS/corecrypto/ccder.h
EXTERNAL_HEADERS/corecrypto/ccdes.h
EXTERNAL_HEADERS/corecrypto/ccdigest.h
EXTERNAL_HEADERS/corecrypto/ccdigest_priv.h
EXTERNAL_HEADERS/corecrypto/ccdrbg.h
EXTERNAL_HEADERS/corecrypto/cchmac.h
EXTERNAL_HEADERS/corecrypto/cckprng.h [new file with mode: 0644]
EXTERNAL_HEADERS/corecrypto/ccmd5.h
EXTERNAL_HEADERS/corecrypto/ccmode_factory.h
EXTERNAL_HEADERS/corecrypto/ccmode_impl.h
EXTERNAL_HEADERS/corecrypto/ccmode_siv.h
EXTERNAL_HEADERS/corecrypto/ccn.h
EXTERNAL_HEADERS/corecrypto/ccrc4.h
EXTERNAL_HEADERS/corecrypto/ccrng.h
EXTERNAL_HEADERS/corecrypto/ccrsa.h
EXTERNAL_HEADERS/corecrypto/ccsha1.h
EXTERNAL_HEADERS/corecrypto/ccsha2.h
EXTERNAL_HEADERS/corecrypto/cczp.h
EXTERNAL_HEADERS/img4/api.h [new file with mode: 0644]
EXTERNAL_HEADERS/img4/environment.h [new file with mode: 0644]
EXTERNAL_HEADERS/img4/img4.h [new file with mode: 0644]
EXTERNAL_HEADERS/img4/payload.h [new file with mode: 0644]
EXTERNAL_HEADERS/ptrauth.h [new file with mode: 0644]
Makefile
README.md
SETUP/kextsymboltool/kextsymboltool.c
bsd/Makefile
bsd/arm/_mcontext.h
bsd/arm/fasttrap_isa.h
bsd/arm/types.h
bsd/bsm/audit.h
bsd/bsm/audit_internal.h
bsd/bsm/audit_kevents.h
bsd/bsm/audit_record.h
bsd/conf/Makefile.template
bsd/conf/files
bsd/conf/files.arm64
bsd/dev/arm/dtrace_isa.c
bsd/dev/arm/fasttrap_isa.c
bsd/dev/arm/fbt_arm.c
bsd/dev/arm/kern_machdep.c
bsd/dev/arm/sysctl.c
bsd/dev/arm/systemcalls.c
bsd/dev/arm/unix_signal.c
bsd/dev/arm64/cpu_in_cksum.s
bsd/dev/arm64/disassembler.c
bsd/dev/arm64/dtrace_isa.c
bsd/dev/arm64/fasttrap_isa.c
bsd/dev/arm64/fbt_arm.c
bsd/dev/arm64/sysctl.c
bsd/dev/dtrace/dtrace.c
bsd/dev/dtrace/dtrace_alloc.c [deleted file]
bsd/dev/dtrace/dtrace_glue.c
bsd/dev/dtrace/dtrace_ptss.c
bsd/dev/dtrace/fasttrap.c
bsd/dev/dtrace/fbt.c
bsd/dev/dtrace/lockstat.c
bsd/dev/dtrace/profile_prvd.c
bsd/dev/dtrace/sdt.c
bsd/dev/dtrace/sdt_subr.c
bsd/dev/dtrace/systrace.c
bsd/dev/i386/dtrace_isa.c
bsd/dev/i386/dtrace_subr_x86.c
bsd/dev/i386/fasttrap_isa.c
bsd/dev/i386/fbt_x86.c
bsd/dev/i386/systemcalls.c
bsd/dev/i386/unix_signal.c
bsd/dev/monotonic.c
bsd/kern/bsd_init.c
bsd/kern/decmpfs.c
bsd/kern/kdebug.c
bsd/kern/kern_aio.c
bsd/kern/kern_authorization.c
bsd/kern/kern_backtrace.c
bsd/kern/kern_core.c
bsd/kern/kern_credential.c
bsd/kern/kern_cs.c
bsd/kern/kern_descrip.c
bsd/kern/kern_event.c
bsd/kern/kern_exec.c
bsd/kern/kern_exit.c
bsd/kern/kern_fork.c
bsd/kern/kern_guarded.c
bsd/kern/kern_kpc.c
bsd/kern/kern_lockf.c
bsd/kern/kern_malloc.c
bsd/kern/kern_memorystatus.c
bsd/kern/kern_mib.c
bsd/kern/kern_newsysctl.c
bsd/kern/kern_ntptime.c
bsd/kern/kern_overrides.c
bsd/kern/kern_pcsamples.c
bsd/kern/kern_persona.c
bsd/kern/kern_proc.c
bsd/kern/kern_prot.c
bsd/kern/kern_resource.c
bsd/kern/kern_sig.c
bsd/kern/kern_symfile.c
bsd/kern/kern_sysctl.c
bsd/kern/kern_time.c
bsd/kern/kern_xxx.c
bsd/kern/kpi_mbuf.c
bsd/kern/mach_loader.c
bsd/kern/mach_loader.h
bsd/kern/policy_check.c
bsd/kern/proc_info.c
bsd/kern/pthread_shims.c [deleted file]
bsd/kern/subr_log.c
bsd/kern/sys_coalition.c
bsd/kern/sys_generic.c
bsd/kern/sys_persona.c
bsd/kern/sys_pipe.c
bsd/kern/sys_ulock.c
bsd/kern/syscalls.master
bsd/kern/sysv_shm.c
bsd/kern/trace_codes
bsd/kern/tty.c
bsd/kern/tty_ptmx.c
bsd/kern/ubc_subr.c
bsd/kern/uipc_domain.c
bsd/kern/uipc_mbuf.c
bsd/kern/uipc_socket.c
bsd/kern/uipc_socket2.c
bsd/kern/uipc_usrreq.c
bsd/libkern/Makefile
bsd/libkern/copyio.h [new file with mode: 0644]
bsd/libkern/libkern.h
bsd/man/man2/Makefile
bsd/man/man2/exchangedata.2
bsd/man/man2/fs_snapshot_create.2
bsd/man/man2/getsockname.2
bsd/man/man2/searchfs.2
bsd/man/man2/send.2
bsd/man/man2/setuid.2
bsd/man/man3/getiopolicy_np.3
bsd/miscfs/devfs/devfs_tree.c
bsd/miscfs/nullfs/null_vnops.c
bsd/miscfs/nullfs/nullfs.h
bsd/miscfs/specfs/spec_vnops.c
bsd/net/Makefile
bsd/net/bpf.c
bsd/net/bpf.h
bsd/net/bpf_filter.c
bsd/net/bpfdesc.h
bsd/net/classq/classq_fq_codel.h
bsd/net/content_filter.c
bsd/net/content_filter.h
bsd/net/dlil.c
bsd/net/ethernet.h
bsd/net/if.c
bsd/net/if.h
bsd/net/if_bond.c
bsd/net/if_bridge.c
bsd/net/if_fake.c
bsd/net/if_gif.c
bsd/net/if_gif.h
bsd/net/if_ipsec.c
bsd/net/if_low_power_mode.c [new file with mode: 0644]
bsd/net/if_pflog.c
bsd/net/if_pflog.h
bsd/net/if_ports_used.c
bsd/net/if_stf.c
bsd/net/if_utun.c
bsd/net/if_var.h
bsd/net/if_vlan.c
bsd/net/iptap.c
bsd/net/kpi_interface.c
bsd/net/kpi_interface.h
bsd/net/kpi_protocol.c
bsd/net/nat464_utils.c [new file with mode: 0644]
bsd/net/nat464_utils.h [new file with mode: 0644]
bsd/net/necp.c
bsd/net/necp.h
bsd/net/necp_client.c
bsd/net/net_kev.h
bsd/net/net_stubs.c
bsd/net/network_agent.c
bsd/net/network_agent.h
bsd/net/ntstat.c
bsd/net/ntstat.h
bsd/net/packet_mangler.c
bsd/net/pf.c
bsd/net/pf_ioctl.c
bsd/net/pf_pbuf.c
bsd/net/pf_pbuf.h
bsd/net/pfvar.h
bsd/net/pktap.c
bsd/net/pktap.h
bsd/net/pktsched/pktsched_fq_codel.c
bsd/net/pktsched/pktsched_fq_codel.h
bsd/net/route.c
bsd/net/rtsock.c
bsd/netinet/icmp6.h
bsd/netinet/in.c
bsd/netinet/in.h
bsd/netinet/in_arp.c
bsd/netinet/in_cksum.c
bsd/netinet/in_pcb.c
bsd/netinet/in_pcb.h
bsd/netinet/in_pcblist.c
bsd/netinet/in_rmx.c
bsd/netinet/in_tclass.c
bsd/netinet/ip6.h
bsd/netinet/ip_dummynet.c
bsd/netinet/ip_fw2.c
bsd/netinet/ip_icmp.c
bsd/netinet/ip_icmp.h
bsd/netinet/ip_input.c
bsd/netinet/ip_output.c
bsd/netinet/isakmp.h [new file with mode: 0644]
bsd/netinet/kpi_ipfilter.c
bsd/netinet/kpi_ipfilter.h
bsd/netinet/mp_pcb.h
bsd/netinet/mptcp.c
bsd/netinet/mptcp_opt.c
bsd/netinet/mptcp_opt.h
bsd/netinet/mptcp_subr.c
bsd/netinet/mptcp_var.h
bsd/netinet/raw_ip.c
bsd/netinet/tcp.h
bsd/netinet/tcp_input.c
bsd/netinet/tcp_output.c
bsd/netinet/tcp_subr.c
bsd/netinet/tcp_timer.c
bsd/netinet/tcp_usrreq.c
bsd/netinet/tcp_var.h
bsd/netinet/udp_usrreq.c
bsd/netinet6/esp_chachapoly.c
bsd/netinet6/esp_chachapoly.h
bsd/netinet6/esp_core.c
bsd/netinet6/icmp6.c
bsd/netinet6/in6.c
bsd/netinet6/in6.h
bsd/netinet6/in6_ifattach.c
bsd/netinet6/in6_mcast.c
bsd/netinet6/in6_pcb.c
bsd/netinet6/in6_proto.c
bsd/netinet6/in6_src.c
bsd/netinet6/in6_var.h
bsd/netinet6/ip6_input.c
bsd/netinet6/ip6_output.c
bsd/netinet6/ip6_var.h
bsd/netinet6/ipsec.c
bsd/netinet6/nd6.c
bsd/netinet6/nd6.h
bsd/netinet6/nd6_rtr.c
bsd/netinet6/nd6_send.c
bsd/netinet6/raw_ip6.c
bsd/netinet6/udp6_output.c
bsd/netinet6/udp6_usrreq.c
bsd/netkey/key.c
bsd/netkey/key.h
bsd/nfs/nfs_gss.c
bsd/nfs/nfs_gss.h
bsd/nfs/nfs_lock.c
bsd/nfs/nfs_serv.c
bsd/nfs/nfs_socket.c
bsd/nfs/nfs_subs.c
bsd/nfs/nfs_vfsops.c
bsd/nfs/nfs_vnops.c
bsd/pgo/profile_runtime.c
bsd/pthread/Makefile [new file with mode: 0644]
bsd/pthread/bsdthread_private.h [new file with mode: 0644]
bsd/pthread/priority_private.h [new file with mode: 0644]
bsd/pthread/pthread_priority.c [new file with mode: 0644]
bsd/pthread/pthread_shims.c [new file with mode: 0644]
bsd/pthread/pthread_workqueue.c [new file with mode: 0644]
bsd/pthread/workqueue_internal.h [new file with mode: 0644]
bsd/pthread/workqueue_syscalls.h [new file with mode: 0644]
bsd/pthread/workqueue_trace.h [new file with mode: 0644]
bsd/security/audit/audit.c
bsd/security/audit/audit.h
bsd/security/audit/audit_arg.c
bsd/security/audit/audit_bsd.c
bsd/security/audit/audit_bsm.c
bsd/security/audit/audit_bsm_klib.c
bsd/security/audit/audit_bsm_token.c
bsd/security/audit/audit_private.h
bsd/security/audit/audit_syscalls.c
bsd/sys/Makefile
bsd/sys/_types/_user64_timex.h
bsd/sys/bsdtask_info.h
bsd/sys/cdefs.h
bsd/sys/codesign.h
bsd/sys/csr.h
bsd/sys/decmpfs.h
bsd/sys/disk.h
bsd/sys/dtrace_glue.h
bsd/sys/dtrace_impl.h
bsd/sys/dtrace_ptss.h
bsd/sys/event.h
bsd/sys/eventhandler.h
bsd/sys/eventvar.h
bsd/sys/fasttrap_impl.h
bsd/sys/fbt.h
bsd/sys/filedesc.h
bsd/sys/fsctl.h
bsd/sys/guarded.h
bsd/sys/imgact.h
bsd/sys/kauth.h
bsd/sys/kdebug.h
bsd/sys/kern_memorystatus.h
bsd/sys/kern_overrides.h
bsd/sys/kpi_mbuf.h
bsd/sys/linker_set.h
bsd/sys/lockstat.h
bsd/sys/malloc.h
bsd/sys/mbuf.h
bsd/sys/mcache.h
bsd/sys/monotonic.h
bsd/sys/mount_internal.h
bsd/sys/namei.h
bsd/sys/persona.h
bsd/sys/priv.h
bsd/sys/proc.h
bsd/sys/proc_info.h
bsd/sys/proc_internal.h
bsd/sys/pthread_internal.h
bsd/sys/pthread_shims.h
bsd/sys/queue.h
bsd/sys/reason.h
bsd/sys/resource.h
bsd/sys/sdt_impl.h
bsd/sys/signal.h
bsd/sys/signalvar.h
bsd/sys/socket.h
bsd/sys/socketvar.h
bsd/sys/sockio.h
bsd/sys/spawn_internal.h
bsd/sys/stat.h
bsd/sys/sysctl.h
bsd/sys/systm.h
bsd/sys/ubc.h
bsd/sys/ubc_internal.h
bsd/sys/ulock.h
bsd/sys/user.h
bsd/sys/ux_exception.h
bsd/sys/vnode.h
bsd/sys/vnode_internal.h
bsd/sys/work_interval.h
bsd/tests/bsd_tests.c [new file with mode: 0644]
bsd/tests/ctrr_test_sysctl.c [new file with mode: 0644]
bsd/tests/pmap_test_sysctl.c [new file with mode: 0644]
bsd/uuid/uuid.h
bsd/uxkern/ux_exception.c
bsd/vfs/kpi_vfs.c
bsd/vfs/vfs_attrlist.c
bsd/vfs/vfs_bio.c
bsd/vfs/vfs_cache.c
bsd/vfs/vfs_cluster.c
bsd/vfs/vfs_cprotect.c
bsd/vfs/vfs_disk_conditioner.c
bsd/vfs/vfs_fsevents.c
bsd/vfs/vfs_fslog.c
bsd/vfs/vfs_lookup.c
bsd/vfs/vfs_subr.c
bsd/vfs/vfs_syscalls.c
bsd/vfs/vfs_vnops.c
bsd/vfs/vfs_xattr.c
bsd/vm/vm_compressor_backing_file.c
bsd/vm/vm_unix.c
bsd/vm/vnode_pager.c
config/BSDKernel.exports
config/IOKit.arm.exports
config/IOKit.arm64.exports
config/IOKit.exports
config/IOKit.x86_64.exports
config/Libkern.arm.exports
config/Libkern.arm64.exports
config/Libkern.exports
config/MACFramework.exports
config/MASTER
config/MASTER.arm
config/MASTER.arm64
config/MASTER.arm64.bcm2837 [new file with mode: 0644]
config/MASTER.x86_64
config/Makefile
config/MasterVersion
config/Private.arm.exports
config/Private.arm64.exports
config/Private.exports
config/Private.x86_64.exports
config/Unsupported.arm64.exports
config/generate_linker_aliases.sh [new file with mode: 0755]
config/newvers.pl
config/version.c
iokit/IOKit/IOBSD.h
iokit/IOKit/IOCommandGate.h
iokit/IOKit/IOEventSource.h
iokit/IOKit/IOFilterInterruptEventSource.h
iokit/IOKit/IOHibernatePrivate.h
iokit/IOKit/IOInterruptEventSource.h
iokit/IOKit/IOInterrupts.h
iokit/IOKit/IOKitKeys.h
iokit/IOKit/IOMemoryDescriptor.h
iokit/IOKit/IOMultiMemoryDescriptor.h
iokit/IOKit/IOPolledInterface.h
iokit/IOKit/IORegistryEntry.h
iokit/IOKit/IOReturn.h
iokit/IOKit/IOService.h
iokit/IOKit/IOServicePM.h
iokit/IOKit/IOSharedDataQueue.h
iokit/IOKit/IOTimerEventSource.h
iokit/IOKit/IOTypes.h
iokit/IOKit/IOWorkLoop.h
iokit/IOKit/perfcontrol/IOPerfControl.h [new file with mode: 0644]
iokit/IOKit/perfcontrol/Makefile [new file with mode: 0644]
iokit/IOKit/pwr_mgt/IOPM.h
iokit/IOKit/pwr_mgt/IOPMPowerSource.h
iokit/IOKit/pwr_mgt/IOPMPrivate.h
iokit/IOKit/pwr_mgt/RootDomain.h
iokit/IOKit/rtc/IORTCController.h
iokit/Kernel/IOCPU.cpp
iokit/Kernel/IOCommandGate.cpp
iokit/Kernel/IODMACommand.cpp
iokit/Kernel/IODataQueue.cpp
iokit/Kernel/IOEventSource.cpp
iokit/Kernel/IOFilterInterruptEventSource.cpp
iokit/Kernel/IOHibernateIO.cpp
iokit/Kernel/IOInterruptEventSource.cpp
iokit/Kernel/IOKitKernelInternal.h
iokit/Kernel/IOLib.cpp
iokit/Kernel/IOMemoryCursor.cpp
iokit/Kernel/IOMemoryDescriptor.cpp
iokit/Kernel/IOMultiMemoryDescriptor.cpp
iokit/Kernel/IONVRAM.cpp
iokit/Kernel/IOPMrootDomain.cpp
iokit/Kernel/IOPerfControl.cpp [new file with mode: 0644]
iokit/Kernel/IOPlatformExpert.cpp
iokit/Kernel/IOPolledInterface.cpp
iokit/Kernel/IORegistryEntry.cpp
iokit/Kernel/IOService.cpp
iokit/Kernel/IOServicePM.cpp
iokit/Kernel/IOServicePMPrivate.h
iokit/Kernel/IOServicePrivate.h
iokit/Kernel/IOSharedDataQueue.cpp
iokit/Kernel/IOStatistics.cpp
iokit/Kernel/IOStringFuncs.c
iokit/Kernel/IOTimerEventSource.cpp
iokit/Kernel/IOUserClient.cpp
iokit/Kernel/IOWorkLoop.cpp
iokit/Tests/Tests.cpp
iokit/bsddev/IOKitBSDInit.cpp
iokit/conf/Makefile.x86_64
iokit/conf/files
libkdd/kcdata.h
libkdd/kcdtypes.c
libkdd/kdd.xcodeproj/project.pbxproj
libkdd/tests/Tests.swift
libkdd/tests/stackshot-sample-asid [new file with mode: 0644]
libkdd/tests/stackshot-sample-asid-pagetable [new file with mode: 0644]
libkdd/tests/stackshot-sample-asid-pagetable.plist.gz [new file with mode: 0644]
libkdd/tests/stackshot-sample-asid.plist.gz [new file with mode: 0644]
libkdd/tests/stackshot-sample-cpu-times [new file with mode: 0644]
libkdd/tests/stackshot-sample-cpu-times.plist.gz [new file with mode: 0644]
libkdd/tests/stackshot-sample-stacktop [new file with mode: 0644]
libkdd/tests/stackshot-sample-stacktop.plist.gz [new file with mode: 0644]
libkdd/tests/stackshot-with-shared-cache-layout [new file with mode: 0644]
libkdd/tests/stackshot-with-shared-cache-layout.plist.gz [new file with mode: 0644]
libkern/OSKextVersion.c
libkern/c++/OSCollection.cpp
libkern/c++/OSData.cpp
libkern/c++/OSDictionary.cpp
libkern/c++/OSKext.cpp
libkern/c++/OSMetaClass.cpp
libkern/c++/OSRuntime.cpp
libkern/c++/OSSerialize.cpp
libkern/c++/OSSerializeBinary.cpp
libkern/c++/OSUnserialize.cpp
libkern/c++/OSUnserialize.y
libkern/c++/OSUnserializeXML.cpp
libkern/c++/OSUnserializeXML.y
libkern/conf/Makefile.template
libkern/conf/files
libkern/conf/files.arm64 [new file with mode: 0644]
libkern/firehose/firehose_types_private.h
libkern/firehose/private.h
libkern/firehose/tracepoint_private.h
libkern/gen/OSDebug.cpp
libkern/img4/interface.c [new file with mode: 0644]
libkern/kmod/cplus_start.c
libkern/kmod/cplus_stop.c
libkern/kxld/Makefile
libkern/libclosure/libclosuredata.c [new file with mode: 0644]
libkern/libclosure/runtime.cpp [new file with mode: 0644]
libkern/libkern/Block.h [new file with mode: 0644]
libkern/libkern/Block_private.h [new file with mode: 0644]
libkern/libkern/Makefile
libkern/libkern/OSKextLibPrivate.h
libkern/libkern/OSRuntime.h [new file with mode: 0644]
libkern/libkern/c++/OSCollection.h
libkern/libkern/c++/OSDictionary.h
libkern/libkern/c++/OSKext.h
libkern/libkern/c++/OSMetaClass.h
libkern/libkern/c++/OSSerialize.h
libkern/libkern/crc.h [new file with mode: 0644]
libkern/libkern/img4/Makefile [new file with mode: 0644]
libkern/libkern/img4/interface.h [new file with mode: 0644]
libkern/libkern/prelink.h
libkern/libkern/version.h.template
libkern/os/Makefile
libkern/os/log.c
libkern/os/log_encode.h
libkern/os/reason_private.h
libkern/os/refcnt.c [new file with mode: 0644]
libkern/os/refcnt.h [new file with mode: 0644]
libkern/os/trace_internal.h
libkern/uuid/uuid.c
libkern/zlib/crc32.c [deleted file]
libkern/zlib/z_crc32.c [new file with mode: 0644]
libsa/bootstrap.cpp
libsyscall/Libsyscall.xcconfig
libsyscall/Libsyscall.xcodeproj/project.pbxproj
libsyscall/custom/SYS.h
libsyscall/custom/__fork.s
libsyscall/custom/__getpid.s
libsyscall/custom/__sigreturn.s
libsyscall/mach/mach/mach.h
libsyscall/mach/mach/mach_right.h [new file with mode: 0644]
libsyscall/mach/mach/mach_sync_ipc.h [new file with mode: 0644]
libsyscall/mach/mach/port_descriptions.h [new file with mode: 0644]
libsyscall/mach/mach/thread_state.h
libsyscall/mach/mach_msg.c
libsyscall/mach/mach_port.c
libsyscall/mach/mach_right.c [new file with mode: 0644]
libsyscall/mach/mig_reply_port.c
libsyscall/mach/port_descriptions.c [new file with mode: 0644]
libsyscall/os/thread_self_restrict.h [new file with mode: 0644]
libsyscall/os/tsd.h
libsyscall/wrappers/__commpage_gettimeofday.c
libsyscall/wrappers/_libc_funcptr.c
libsyscall/wrappers/_libkernel_init.c
libsyscall/wrappers/_libkernel_init.h
libsyscall/wrappers/cancelable/fcntl-base.c
libsyscall/wrappers/cancelable/fcntl-cancel.c
libsyscall/wrappers/cancelable/fcntl.c
libsyscall/wrappers/coalition.c
libsyscall/wrappers/getiopolicy_np.c
libsyscall/wrappers/init_cpu_capabilities.c
libsyscall/wrappers/ioctl.c
libsyscall/wrappers/libproc/libproc.c
libsyscall/wrappers/libproc/libproc_internal.h
libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s [new file with mode: 0644]
libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c [new file with mode: 0644]
libsyscall/wrappers/skywalk/cpu_in_cksum.s [new file with mode: 0644]
libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c [new file with mode: 0644]
libsyscall/wrappers/skywalk/os_channel.c [new file with mode: 0644]
libsyscall/wrappers/skywalk/os_nexus.c [new file with mode: 0644]
libsyscall/wrappers/skywalk/os_packet.c [new file with mode: 0644]
libsyscall/wrappers/spawn/posix_spawn.c
libsyscall/wrappers/spawn/spawn_private.h
libsyscall/wrappers/stackshot.c
libsyscall/wrappers/string/index.c
libsyscall/wrappers/string/memcpy.c
libsyscall/wrappers/string/memset.c
libsyscall/wrappers/string/strcmp.c
libsyscall/wrappers/string/strcpy.c
libsyscall/wrappers/string/strings.h
libsyscall/wrappers/string/strlcpy.c
libsyscall/wrappers/string/strlen.c
libsyscall/wrappers/terminate_with_reason.c
libsyscall/wrappers/thread_register_state.c
libsyscall/wrappers/varargs_wrappers.s
libsyscall/xcodescripts/create-syscalls.pl
libsyscall/xcodescripts/mach_install_mig.sh
makedefs/MakeInc.cmd
makedefs/MakeInc.def
makedefs/MakeInc.kernel
makedefs/MakeInc.rule
makedefs/MakeInc.top
osfmk/Makefile
osfmk/arm/Makefile
osfmk/arm/arm_init.c
osfmk/arm/arm_vm_init.c
osfmk/arm/atomic.h
osfmk/arm/caches.c
osfmk/arm/commpage/commpage.c
osfmk/arm/commpage/commpage.h
osfmk/arm/conf.c [deleted file]
osfmk/arm/cpu.c
osfmk/arm/cpu_capabilities.h
osfmk/arm/cpu_common.c
osfmk/arm/cpu_data.h
osfmk/arm/cpu_data_internal.h
osfmk/arm/cpu_internal.h
osfmk/arm/cpuid.c
osfmk/arm/cpuid.h
osfmk/arm/cswitch.s
osfmk/arm/data.s
osfmk/arm/dbgwrap.h
osfmk/arm/genassym.c
osfmk/arm/locks.h
osfmk/arm/locks_arm.c
osfmk/arm/locore.s
osfmk/arm/loose_ends.c
osfmk/arm/lowmem_vectors.c
osfmk/arm/machine_cpu.h
osfmk/arm/machine_cpuid.c
osfmk/arm/machine_routines.c
osfmk/arm/machine_routines.h
osfmk/arm/machine_routines_asm.s
osfmk/arm/machine_routines_common.c
osfmk/arm/misc_protos.h
osfmk/arm/model_dep.c
osfmk/arm/monotonic_arm.c
osfmk/arm/pcb.c
osfmk/arm/pmap.c
osfmk/arm/pmap.h
osfmk/arm/pmap_public.h [new file with mode: 0644]
osfmk/arm/proc_reg.h
osfmk/arm/simple_lock.h
osfmk/arm/status.c
osfmk/arm/trap.c
osfmk/arm/trap.h
osfmk/arm64/Makefile
osfmk/arm64/arm_vm_init.c
osfmk/arm64/asm.h
osfmk/arm64/bcopy.s
osfmk/arm64/bzero.s
osfmk/arm64/caches_asm.s
osfmk/arm64/copyio.c
osfmk/arm64/cpu.c
osfmk/arm64/cswitch.s
osfmk/arm64/genassym.c
osfmk/arm64/kpc.c
osfmk/arm64/locore.s
osfmk/arm64/loose_ends.c
osfmk/arm64/lowmem_vectors.c
osfmk/arm64/lz4_decode_arm64.s
osfmk/arm64/lz4_encode_arm64.s
osfmk/arm64/machine_kpc.h
osfmk/arm64/machine_routines.c
osfmk/arm64/machine_routines_asm.s
osfmk/arm64/machine_task.c
osfmk/arm64/monotonic.h
osfmk/arm64/monotonic_arm64.c
osfmk/arm64/pcb.c
osfmk/arm64/platform_tests.c
osfmk/arm64/proc_reg.h
osfmk/arm64/sleh.c
osfmk/arm64/start.s
osfmk/arm64/status.c
osfmk/arm64/strncmp.s
osfmk/arm64/strnlen.s
osfmk/bank/bank.c
osfmk/bank/bank_internal.h
osfmk/conf/Makefile.arm64
osfmk/conf/Makefile.template
osfmk/conf/Makefile.x86_64
osfmk/conf/files
osfmk/conf/files.arm
osfmk/conf/files.arm64
osfmk/conf/files.x86_64
osfmk/console/serial_console.c
osfmk/corecrypto/cc/src/cc_try_abort.c
osfmk/corecrypto/ccaes/src/aes_tab.c [deleted file]
osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c [deleted file]
osfmk/corecrypto/ccaes/src/ccaes_private_types.h [deleted file]
osfmk/corecrypto/cchmac/src/cchmac_final.c
osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c [deleted file]
osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c [deleted file]
osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c [deleted file]
osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c [deleted file]
osfmk/corecrypto/ccmode/src/ccmode_internal.h [deleted file]
osfmk/corecrypto/ccsha1/src/ccdigest_final_64be.c
osfmk/corecrypto/ccsha1/src/ccdigest_internal.h [new file with mode: 0644]
osfmk/corecrypto/ccsha1/src/ccsha1_eay.c
osfmk/corecrypto/ccsha1/src/ccsha1_internal.h [new file with mode: 0644]
osfmk/corecrypto/ccsha2/src/ccdigest_internal.h [new file with mode: 0644]
osfmk/corecrypto/ccsha2/src/ccsha256_di.c
osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c
osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c
osfmk/corecrypto/ccsha2/src/ccsha2_internal.h
osfmk/corpses/corpse.c
osfmk/corpses/task_corpse.h
osfmk/i386/AT386/conf.c [deleted file]
osfmk/i386/AT386/model_dep.c
osfmk/i386/Makefile
osfmk/i386/acpi.c
osfmk/i386/atomic.h
osfmk/i386/bsd_i386.c
osfmk/i386/bsd_i386_native.c
osfmk/i386/commpage/commpage.c
osfmk/i386/cpu_capabilities.h
osfmk/i386/cpu_data.h
osfmk/i386/fp_simd.s
osfmk/i386/fpu.c
osfmk/i386/fpu.h
osfmk/i386/i386_init.c
osfmk/i386/i386_lock.s
osfmk/i386/i386_vm_init.c
osfmk/i386/locks.h
osfmk/i386/locks_i386.c
osfmk/i386/locks_i386_inlines.h [new file with mode: 0644]
osfmk/i386/locks_i386_opt.c [new file with mode: 0644]
osfmk/i386/machine_routines.c
osfmk/i386/machine_routines.h
osfmk/i386/machine_task.c
osfmk/i386/mp.c
osfmk/i386/mp.h
osfmk/i386/mp_desc.c
osfmk/i386/pcb.c
osfmk/i386/pcb_native.c
osfmk/i386/pmap.h
osfmk/i386/pmap_internal.h
osfmk/i386/pmap_x86_common.c
osfmk/i386/trap.c
osfmk/ipc/ipc_entry.c
osfmk/ipc/ipc_importance.c
osfmk/ipc/ipc_init.c
osfmk/ipc/ipc_kmsg.c
osfmk/ipc/ipc_kmsg.h
osfmk/ipc/ipc_mqueue.c
osfmk/ipc/ipc_mqueue.h
osfmk/ipc/ipc_notify.c
osfmk/ipc/ipc_object.c
osfmk/ipc/ipc_object.h
osfmk/ipc/ipc_port.c
osfmk/ipc/ipc_port.h
osfmk/ipc/ipc_pset.c
osfmk/ipc/ipc_pset.h
osfmk/ipc/ipc_right.c
osfmk/ipc/ipc_space.c
osfmk/ipc/ipc_voucher.c
osfmk/ipc/ipc_voucher.h
osfmk/ipc/mach_debug.c
osfmk/ipc/mach_kernelrpc.c
osfmk/ipc/mach_msg.c
osfmk/ipc/mach_port.c
osfmk/ipc/port.h
osfmk/kdp/kdp_core.c
osfmk/kdp/kdp_core.h
osfmk/kdp/ml/arm/kdp_machdep.c
osfmk/kdp/ml/x86_64/kdp_machdep.c
osfmk/kdp/processor_core.c
osfmk/kern/Makefile
osfmk/kern/ast.c
osfmk/kern/ast.h
osfmk/kern/backtrace.c
osfmk/kern/bits.h
osfmk/kern/block_hint.h
osfmk/kern/bsd_kern.c
osfmk/kern/btlog.c
osfmk/kern/btlog.h
osfmk/kern/clock.c
osfmk/kern/clock_oldops.c
osfmk/kern/coalition.c
osfmk/kern/coalition.h
osfmk/kern/cpu_quiesce.c [new file with mode: 0644]
osfmk/kern/cpu_quiesce.h [new file with mode: 0644]
osfmk/kern/cs_blobs.h
osfmk/kern/debug.c
osfmk/kern/debug.h
osfmk/kern/ecc_logging.c [new file with mode: 0644]
osfmk/kern/exc_guard.h
osfmk/kern/exc_resource.h
osfmk/kern/exception.c
osfmk/kern/gzalloc.c
osfmk/kern/host.c
osfmk/kern/host_statistics.h
osfmk/kern/ipc_kobject.c
osfmk/kern/ipc_kobject.h
osfmk/kern/ipc_mig.c
osfmk/kern/ipc_tt.c
osfmk/kern/kalloc.c
osfmk/kern/kcdata.h
osfmk/kern/kern_cdata.c
osfmk/kern/kern_cdata.h
osfmk/kern/kern_ecc.c [deleted file]
osfmk/kern/kern_monotonic.c
osfmk/kern/kern_stackshot.c
osfmk/kern/kern_types.h
osfmk/kern/kext_alloc.c
osfmk/kern/kpc.h
osfmk/kern/kpc_common.c
osfmk/kern/ledger.c
osfmk/kern/ledger.h
osfmk/kern/locks.c
osfmk/kern/locks.h
osfmk/kern/ltable.c
osfmk/kern/ltable.h
osfmk/kern/mach_node.c
osfmk/kern/machine.c
osfmk/kern/misc_protos.h
osfmk/kern/monotonic.h
osfmk/kern/policy_internal.h
osfmk/kern/printf.c
osfmk/kern/priority.c
osfmk/kern/priority_queue.c [new file with mode: 0644]
osfmk/kern/priority_queue.h [new file with mode: 0644]
osfmk/kern/processor.c
osfmk/kern/processor.h
osfmk/kern/processor_data.h
osfmk/kern/queue.h
osfmk/kern/sched.h
osfmk/kern/sched_average.c
osfmk/kern/sched_dualq.c
osfmk/kern/sched_prim.c
osfmk/kern/sched_prim.h
osfmk/kern/sched_traditional.c
osfmk/kern/sfi.c
osfmk/kern/simple_lock.h
osfmk/kern/stack.c
osfmk/kern/startup.c
osfmk/kern/sync_sema.c
osfmk/kern/sync_sema.h
osfmk/kern/syscall_subr.c
osfmk/kern/syscall_subr.h
osfmk/kern/syscall_sw.c
osfmk/kern/task.c
osfmk/kern/task.h
osfmk/kern/task_policy.c
osfmk/kern/telemetry.c
osfmk/kern/telemetry.h
osfmk/kern/test_lock.c [new file with mode: 0644]
osfmk/kern/thread.c
osfmk/kern/thread.h
osfmk/kern/thread_act.c
osfmk/kern/thread_call.c
osfmk/kern/thread_group.h
osfmk/kern/thread_policy.c
osfmk/kern/timer.c
osfmk/kern/timer.h
osfmk/kern/trustcache.h [new file with mode: 0644]
osfmk/kern/turnstile.c [new file with mode: 0644]
osfmk/kern/turnstile.h [new file with mode: 0644]
osfmk/kern/ux_handler.c [new file with mode: 0644]
osfmk/kern/ux_handler.h [new file with mode: 0644]
osfmk/kern/waitq.c
osfmk/kern/waitq.h
osfmk/kern/zalloc.c
osfmk/kern/zalloc.h
osfmk/kern/zcache.c [new file with mode: 0644]
osfmk/kern/zcache.h [new file with mode: 0644]
osfmk/kperf/Makefile
osfmk/kperf/action.c
osfmk/kperf/action.h
osfmk/kperf/arm/kperf_meminfo.c [deleted file]
osfmk/kperf/buffer.h
osfmk/kperf/callstack.c
osfmk/kperf/context.h
osfmk/kperf/kdebug_trigger.c
osfmk/kperf/kperf.c
osfmk/kperf/kperf.h
osfmk/kperf/kperf_arch.h
osfmk/kperf/kperf_timer.c
osfmk/kperf/kperf_timer.h
osfmk/kperf/kperfbsd.c
osfmk/kperf/kperfbsd.h
osfmk/kperf/lazy.c [new file with mode: 0644]
osfmk/kperf/lazy.h [new file with mode: 0644]
osfmk/kperf/meminfo.c
osfmk/kperf/meminfo.h
osfmk/kperf/pet.c
osfmk/kperf/task_samplers.c
osfmk/kperf/task_samplers.h
osfmk/kperf/thread_samplers.c
osfmk/kperf/thread_samplers.h
osfmk/kperf/x86_64/kperf_meminfo.c [deleted file]
osfmk/mach/Makefile
osfmk/mach/arm/_structs.h
osfmk/mach/arm/sdt_isa.h
osfmk/mach/arm/thread_status.h
osfmk/mach/arm/vm_param.h
osfmk/mach/branch_predicates.h [deleted file]
osfmk/mach/exc.defs
osfmk/mach/host_info.h
osfmk/mach/host_special_ports.h
osfmk/mach/i386/vm_param.h
osfmk/mach/kmod.h
osfmk/mach/mach_exc.defs
osfmk/mach/mach_host.defs
osfmk/mach/mach_port.defs
osfmk/mach/mach_traps.h
osfmk/mach/machine.h
osfmk/mach/memory_entry.defs [new file with mode: 0644]
osfmk/mach/message.h
osfmk/mach/port.h
osfmk/mach/shared_region.h
osfmk/mach/sync_policy.h
osfmk/mach/syscall_sw.h
osfmk/mach/task_info.h
osfmk/mach/task_policy.h
osfmk/mach/task_special_ports.h
osfmk/mach/thread_act.defs
osfmk/mach/thread_policy.h
osfmk/mach/vm_param.h
osfmk/mach/vm_statistics.h
osfmk/mach/vm_types.h
osfmk/mach_debug/mach_debug_types.defs
osfmk/mach_debug/zone_info.h
osfmk/machine/atomic.h
osfmk/machine/monotonic.h
osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h [deleted file]
osfmk/prng/YarrowCoreLib/include/yarrow.h [deleted file]
osfmk/prng/YarrowCoreLib/include/yarrowUtils.h [deleted file]
osfmk/prng/YarrowCoreLib/port/smf.c [deleted file]
osfmk/prng/YarrowCoreLib/src/assertverify.h [deleted file]
osfmk/prng/YarrowCoreLib/src/comp.c [deleted file]
osfmk/prng/YarrowCoreLib/src/comp.h [deleted file]
osfmk/prng/YarrowCoreLib/src/entropysources.h [deleted file]
osfmk/prng/YarrowCoreLib/src/macOnly.h [deleted file]
osfmk/prng/YarrowCoreLib/src/prng.c [deleted file]
osfmk/prng/YarrowCoreLib/src/prng.h [deleted file]
osfmk/prng/YarrowCoreLib/src/prngpriv.h [deleted file]
osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt [deleted file]
osfmk/prng/YarrowCoreLib/src/sha1mod.c [deleted file]
osfmk/prng/YarrowCoreLib/src/sha1mod.h [deleted file]
osfmk/prng/YarrowCoreLib/src/smf.h [deleted file]
osfmk/prng/YarrowCoreLib/src/userdefines.h [deleted file]
osfmk/prng/YarrowCoreLib/src/yarrowUtils.c [deleted file]
osfmk/prng/fips_sha1.c [deleted file]
osfmk/prng/fips_sha1.h [deleted file]
osfmk/prng/prng_random.c [new file with mode: 0644]
osfmk/prng/prng_yarrow.c [deleted file]
osfmk/prng/random.c [deleted file]
osfmk/prng/random.h
osfmk/tests/Makefile [new file with mode: 0644]
osfmk/tests/README.md [new file with mode: 0644]
osfmk/tests/bitmap_test.c [new file with mode: 0644]
osfmk/tests/kernel_tests.c [new file with mode: 0644]
osfmk/tests/ktest.c [new file with mode: 0644]
osfmk/tests/ktest.h [new file with mode: 0644]
osfmk/tests/ktest_accessor.c [new file with mode: 0644]
osfmk/tests/ktest_emit.c [new file with mode: 0644]
osfmk/tests/ktest_global.c [new file with mode: 0644]
osfmk/tests/ktest_internal.h [new file with mode: 0644]
osfmk/tests/pmap_tests.c [new file with mode: 0644]
osfmk/tests/test_thread_call.c [new file with mode: 0644]
osfmk/tests/xnupost.h [new file with mode: 0644]
osfmk/vm/bsd_vm.c
osfmk/vm/lz4.c
osfmk/vm/memory_object.c
osfmk/vm/memory_object.h
osfmk/vm/pmap.h
osfmk/vm/vm32_user.c
osfmk/vm/vm_apple_protect.c
osfmk/vm/vm_compressor.c
osfmk/vm/vm_compressor.h
osfmk/vm/vm_compressor_backing_store.c
osfmk/vm/vm_compressor_backing_store.h
osfmk/vm/vm_compressor_pager.c
osfmk/vm/vm_compressor_pager.h
osfmk/vm/vm_fault.c
osfmk/vm/vm_fault.h
osfmk/vm/vm_fourk_pager.c
osfmk/vm/vm_init.c
osfmk/vm/vm_kern.c
osfmk/vm/vm_kern.h
osfmk/vm/vm_map.c
osfmk/vm/vm_map.h
osfmk/vm/vm_map_store.c
osfmk/vm/vm_map_store.h
osfmk/vm/vm_map_store_ll.c
osfmk/vm/vm_map_store_ll.h
osfmk/vm/vm_map_store_rb.c
osfmk/vm/vm_map_store_rb.h
osfmk/vm/vm_object.c
osfmk/vm/vm_object.h
osfmk/vm/vm_options.h
osfmk/vm/vm_page.h
osfmk/vm/vm_pageout.c
osfmk/vm/vm_pageout.h
osfmk/vm/vm_phantom_cache.c
osfmk/vm/vm_protos.h
osfmk/vm/vm_purgeable.c
osfmk/vm/vm_purgeable_internal.h
osfmk/vm/vm_resident.c
osfmk/vm/vm_shared_region.c
osfmk/vm/vm_shared_region.h
osfmk/vm/vm_shared_region_pager.c [new file with mode: 0644]
osfmk/vm/vm_swapfile_pager.c
osfmk/vm/vm_user.c
osfmk/voucher/ipc_pthread_priority.c
osfmk/x86_64/copyio.c
osfmk/x86_64/cswitch.s
osfmk/x86_64/idt64.s
osfmk/x86_64/kpc_x86.c
osfmk/x86_64/machine_routines_asm.s
osfmk/x86_64/monotonic_x86_64.c
osfmk/x86_64/pmap.c
osfmk/x86_64/pmap_pcid.c
pexpert/arm/pe_identify_machine.c
pexpert/arm/pe_init.c
pexpert/arm/pe_serial.c
pexpert/gen/pe_gen.c
pexpert/i386/pe_serial.c
pexpert/pexpert/arm64/AMCC.h
pexpert/pexpert/arm64/BCM2837.h [new file with mode: 0644]
pexpert/pexpert/arm64/Makefile
pexpert/pexpert/arm64/arm64_common.h
pexpert/pexpert/arm64/board_config.h
pexpert/pexpert/arm64/boot.h
pexpert/pexpert/pexpert.h
san/Kasan_kasan.exports
san/Makefile
san/conf/Makefile.arm [new file with mode: 0644]
san/conf/Makefile.arm64 [new file with mode: 0644]
san/conf/Makefile.template
san/conf/files
san/conf/files.arm [new file with mode: 0644]
san/conf/files.arm64 [new file with mode: 0644]
san/kasan-arm64.c
san/kasan-blacklist
san/kasan-blacklist-arm64
san/kasan-blacklist-x86_64
san/kasan-fakestack.c
san/kasan-test.c
san/kasan-x86_64.c
san/kasan.c
san/kasan.h
san/kasan_dynamic_blacklist.c
san/kasan_internal.h
san/ubsan-blacklist [new file with mode: 0644]
san/ubsan.c [new file with mode: 0644]
san/ubsan.h [new file with mode: 0644]
san/ubsan_log.c [new file with mode: 0644]
security/mac_base.c
security/mac_framework.h
security/mac_iokit.c
security/mac_mach.c
security/mac_mach_internal.h
security/mac_policy.h
security/mac_vfs.c
tests/Makefile [new file with mode: 0644]
tests/atm_diagnostic_flag.c [new file with mode: 0644]
tests/avx.c [new file with mode: 0644]
tests/backtracing.c [new file with mode: 0644]
tests/contextswitch.c [new file with mode: 0644]
tests/cpucount.c [new file with mode: 0644]
tests/data_protection.c [new file with mode: 0644]
tests/disk_mount_conditioner-entitlements.plist [new file with mode: 0644]
tests/disk_mount_conditioner.c [new file with mode: 0644]
tests/drop_priv.c [new file with mode: 0644]
tests/exc_resource_threads.c [new file with mode: 0644]
tests/excserver.defs [new file with mode: 0644]
tests/freebsd_waitpid_nohang.c [new file with mode: 0644]
tests/gettimeofday.c [new file with mode: 0644]
tests/gettimeofday_29192647.c [new file with mode: 0644]
tests/host_notifications.c [new file with mode: 0644]
tests/host_statistics_rate_limiting.c [new file with mode: 0644]
tests/ioperf.c [new file with mode: 0644]
tests/jumbo_va_spaces_28530648.c [new file with mode: 0644]
tests/jumbo_va_spaces_28530648.entitlements [new file with mode: 0644]
tests/kdebug.c [new file with mode: 0644]
tests/kernel_mtx_perf.c [new file with mode: 0644]
tests/kernel_uuid_match.c [new file with mode: 0644]
tests/kevent_continuous_time.c [new file with mode: 0644]
tests/kevent_pty.c [new file with mode: 0644]
tests/kevent_qos.c [new file with mode: 0644]
tests/kpc.c [new file with mode: 0644]
tests/kperf.c [new file with mode: 0644]
tests/kperf_backtracing.c [new file with mode: 0644]
tests/kperf_helpers.c [new file with mode: 0644]
tests/kperf_helpers.h [new file with mode: 0644]
tests/kqueue_add_and_trigger.c [new file with mode: 0644]
tests/kqueue_close.c [new file with mode: 0644]
tests/kqueue_fifo_18776047.c [new file with mode: 0644]
tests/kqueue_file_tests.c [new file with mode: 0644]
tests/kqueue_timer_tests.c [new file with mode: 0644]
tests/launchd_plists/com.apple.xnu.test.kevent_qos.plist [new file with mode: 0644]
tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist [new file with mode: 0644]
tests/ltable_exhaustion_test.c [new file with mode: 0644]
tests/mach_boottime_usec.c [new file with mode: 0644]
tests/mach_continuous_time.c [new file with mode: 0644]
tests/mach_get_times.c [new file with mode: 0644]
tests/mach_port_deallocate_21692215.c [new file with mode: 0644]
tests/mach_port_insert_right.c [new file with mode: 0644]
tests/mach_port_mod_refs.c [new file with mode: 0644]
tests/mach_timebase_info.c [new file with mode: 0644]
tests/memorystatus_freeze_test.c [new file with mode: 0644]
tests/memorystatus_vm_map_fork.c [new file with mode: 0644]
tests/memorystatus_zone_test.c [new file with mode: 0644]
tests/mktimer_kobject.c [new file with mode: 0644]
tests/monotonic_core.c [new file with mode: 0644]
tests/net_tun_pr_35136664.c [new file with mode: 0644]
tests/net_tuntests.c [new file with mode: 0644]
tests/netbsd_utimensat.c [new file with mode: 0644]
tests/network_entitlements.plist [new file with mode: 0644]
tests/no32exec_35914211.c [new file with mode: 0644]
tests/no32exec_35914211_helper.c [new file with mode: 0644]
tests/ntp_adjtime_29192647.c [new file with mode: 0644]
tests/perf_compressor.c [new file with mode: 0644]
tests/perf_exit.c [new file with mode: 0644]
tests/perf_exit_proc.c [new file with mode: 0644]
tests/perf_kdebug.c [new file with mode: 0644]
tests/perf_spawn_fork.c [new file with mode: 0644]
tests/perf_vmfault.c [new file with mode: 0644]
tests/phys_footprint_interval_max.c [new file with mode: 0644]
tests/poll.c [new file with mode: 0644]
tests/poll_select_kevent_paired_fds.c [new file with mode: 0644]
tests/port_descriptions.c [new file with mode: 0644]
tests/private_entitlement.plist [new file with mode: 0644]
tests/proc_core_name_24152432.c [new file with mode: 0644]
tests/proc_info.c [new file with mode: 0644]
tests/proc_info_list_kthreads.c [new file with mode: 0644]
tests/proc_info_list_kthreads.entitlements [new file with mode: 0644]
tests/proc_info_udata.c [new file with mode: 0644]
tests/proc_uuid_policy_26567533.c [new file with mode: 0644]
tests/pwrite_avoid_sigxfsz_28581610.c [new file with mode: 0644]
tests/quiesce_counter.c [new file with mode: 0644]
tests/regression_17272465.c [new file with mode: 0644]
tests/remote_time.c [new file with mode: 0644]
tests/settimeofday_29193041.c [new file with mode: 0644]
tests/settimeofday_29193041.entitlements [new file with mode: 0644]
tests/settimeofday_29193041_entitled.c [new file with mode: 0644]
tests/sigchld_return.c [new file with mode: 0644]
tests/sigcont_return.c [new file with mode: 0644]
tests/socket_bind_35243417.c [new file with mode: 0644]
tests/socket_bind_35685803.c [new file with mode: 0644]
tests/socket_poll_close_25786011.c [new file with mode: 0644]
tests/stackshot.m [new file with mode: 0644]
tests/stackshot_block_owner_14362384.m [new file with mode: 0644]
tests/stackshot_idle_25570396.m [new file with mode: 0644]
tests/stackshot_spawn_exit_stress.c [new file with mode: 0644]
tests/suspended_spawn_26184412.c [new file with mode: 0644]
tests/task_for_pid_entitlement.plist [new file with mode: 0644]
tests/task_info.c [new file with mode: 0644]
tests/task_info_28439149.c [new file with mode: 0644]
tests/task_inspect.c [new file with mode: 0644]
tests/task_inspect.entitlements [new file with mode: 0644]
tests/telemetry.c [new file with mode: 0644]
tests/thread_group_set_32261625.c [new file with mode: 0644]
tests/tty_hang.c [new file with mode: 0644]
tests/turnstile_multihop.c [new file with mode: 0644]
tests/turnstile_multihop_helper.h [new file with mode: 0644]
tests/turnstile_multihop_types.h [new file with mode: 0644]
tests/turnstiles_test.c [new file with mode: 0644]
tests/utimensat.c [new file with mode: 0644]
tests/verify_kalloc_config.c [new file with mode: 0644]
tests/vm_set_max_addr_helper.c [new file with mode: 0644]
tests/vm_set_max_addr_test.c [new file with mode: 0644]
tests/voucher_entry_18826844.c [new file with mode: 0644]
tests/voucher_traps.c [new file with mode: 0644]
tests/wired_mem_bench.c [new file with mode: 0644]
tests/work_interval_test.c [new file with mode: 0644]
tests/work_interval_test.entitlements [new file with mode: 0644]
tests/workq_sigprof.c [new file with mode: 0644]
tests/xnu_quick_test.c [new file with mode: 0644]
tests/xnu_quick_test.entitlements [new file with mode: 0644]
tests/xnu_quick_test_entitled.c [new file with mode: 0644]
tests/xnu_quick_test_getsetpriority.c [new file with mode: 0644]
tests/xnu_quick_test_helpers.c [new file with mode: 0644]
tests/xnu_quick_test_helpers.h [new file with mode: 0644]
tools/lldbmacros/Makefile
tools/lldbmacros/core/cvalue.py
tools/lldbmacros/core/kernelcore.py
tools/lldbmacros/core/xnu_lldb_init.py
tools/lldbmacros/ioreg.py
tools/lldbmacros/ipc.py
tools/lldbmacros/kasan.py
tools/lldbmacros/kcdata.py
tools/lldbmacros/kevent.py
tools/lldbmacros/mbufs.py
tools/lldbmacros/memory.py
tools/lldbmacros/misc.py
tools/lldbmacros/pmap.py
tools/lldbmacros/process.py
tools/lldbmacros/scheduler.py
tools/lldbmacros/skywalk.py [new file with mode: 0755]
tools/lldbmacros/turnstile.py [new file with mode: 0755]
tools/lldbmacros/userspace.py
tools/lldbmacros/usertaskdebugging/userprocess.py
tools/lldbmacros/utils.py
tools/lldbmacros/waitq.py
tools/lldbmacros/workqueue.py [new file with mode: 0755]
tools/lldbmacros/xnu.py
tools/lldbmacros/xnudefines.py
tools/tests/MPMMTest/Makefile
tools/tests/Makefile
tools/tests/affinity/Makefile
tools/tests/darwintests/Makefile [deleted file]
tools/tests/darwintests/atm_diagnostic_flag.c [deleted file]
tools/tests/darwintests/avx.c [deleted file]
tools/tests/darwintests/backtracing.c [deleted file]
tools/tests/darwintests/contextswitch.c [deleted file]
tools/tests/darwintests/cpucount.c [deleted file]
tools/tests/darwintests/data_protection.c [deleted file]
tools/tests/darwintests/disk_mount_conditioner-entitlements.plist [deleted file]
tools/tests/darwintests/disk_mount_conditioner.c [deleted file]
tools/tests/darwintests/drop_priv.c [deleted file]
tools/tests/darwintests/freebsd_waitpid_nohang.c [deleted file]
tools/tests/darwintests/gettimeofday.c [deleted file]
tools/tests/darwintests/gettimeofday_29192647.c [deleted file]
tools/tests/darwintests/host_notifications.c [deleted file]
tools/tests/darwintests/host_statistics_rate_limiting.c [deleted file]
tools/tests/darwintests/ioperf.c [deleted file]
tools/tests/darwintests/jumbo_va_spaces_28530648.c [deleted file]
tools/tests/darwintests/jumbo_va_spaces_28530648.entitlements [deleted file]
tools/tests/darwintests/kdebug.c [deleted file]
tools/tests/darwintests/kevent_continuous_time.c [deleted file]
tools/tests/darwintests/kevent_pty.c [deleted file]
tools/tests/darwintests/kevent_qos.c [deleted file]
tools/tests/darwintests/kpc.c [deleted file]
tools/tests/darwintests/kperf.c [deleted file]
tools/tests/darwintests/kperf_backtracing.c [deleted file]
tools/tests/darwintests/kperf_helpers.c [deleted file]
tools/tests/darwintests/kperf_helpers.h [deleted file]
tools/tests/darwintests/kqueue_add_and_trigger.c [deleted file]
tools/tests/darwintests/kqueue_close.c [deleted file]
tools/tests/darwintests/kqueue_fifo_18776047.c [deleted file]
tools/tests/darwintests/kqueue_file_tests.c [deleted file]
tools/tests/darwintests/kqueue_timer_tests.c [deleted file]
tools/tests/darwintests/launchd_plists/com.apple.xnu.test.kevent_qos.plist [deleted file]
tools/tests/darwintests/mach_boottime_usec.c [deleted file]
tools/tests/darwintests/mach_continuous_time.c [deleted file]
tools/tests/darwintests/mach_get_times.c [deleted file]
tools/tests/darwintests/mach_port_deallocate_21692215.c [deleted file]
tools/tests/darwintests/mach_port_mod_refs.c [deleted file]
tools/tests/darwintests/mach_timebase_info.c [deleted file]
tools/tests/darwintests/memorystatus_vm_map_fork.c [deleted file]
tools/tests/darwintests/memorystatus_zone_test.c [deleted file]
tools/tests/darwintests/mktimer_kobject.c [deleted file]
tools/tests/darwintests/monotonic_core.c [deleted file]
tools/tests/darwintests/net_tun_pr_35136664.c [deleted file]
tools/tests/darwintests/net_tuntests.c [deleted file]
tools/tests/darwintests/netbsd_utimensat.c [deleted file]
tools/tests/darwintests/network_entitlements.plist [deleted file]
tools/tests/darwintests/no32exec_35914211.c [deleted file]
tools/tests/darwintests/no32exec_35914211_helper.c [deleted file]
tools/tests/darwintests/ntp_adjtime_29192647.c [deleted file]
tools/tests/darwintests/perf_compressor.c [deleted file]
tools/tests/darwintests/perf_exit.c [deleted file]
tools/tests/darwintests/perf_exit_proc.c [deleted file]
tools/tests/darwintests/perf_kdebug.c [deleted file]
tools/tests/darwintests/perf_spawn_fork.c [deleted file]
tools/tests/darwintests/poll.c [deleted file]
tools/tests/darwintests/poll_select_kevent_paired_fds.c [deleted file]
tools/tests/darwintests/private_entitlement.plist [deleted file]
tools/tests/darwintests/proc_core_name_24152432.c [deleted file]
tools/tests/darwintests/proc_info.c [deleted file]
tools/tests/darwintests/proc_info_udata.c [deleted file]
tools/tests/darwintests/proc_uuid_policy_26567533.c [deleted file]
tools/tests/darwintests/pwrite_avoid_sigxfsz_28581610.c [deleted file]
tools/tests/darwintests/regression_17272465.c [deleted file]
tools/tests/darwintests/remote_time.c [deleted file]
tools/tests/darwintests/settimeofday_29193041.c [deleted file]
tools/tests/darwintests/settimeofday_29193041.entitlements [deleted file]
tools/tests/darwintests/settimeofday_29193041_entitled.c [deleted file]
tools/tests/darwintests/sigchld_return.c [deleted file]
tools/tests/darwintests/sigcont_return.c [deleted file]
tools/tests/darwintests/socket_bind_35243417.c [deleted file]
tools/tests/darwintests/socket_bind_35685803.c [deleted file]
tools/tests/darwintests/socket_poll_close_25786011.c [deleted file]
tools/tests/darwintests/stackshot.m [deleted file]
tools/tests/darwintests/stackshot_block_owner_14362384.m [deleted file]
tools/tests/darwintests/stackshot_idle_25570396.m [deleted file]
tools/tests/darwintests/suspended_spawn_26184412.c [deleted file]
tools/tests/darwintests/task_for_pid_entitlement.plist [deleted file]
tools/tests/darwintests/task_info.c [deleted file]
tools/tests/darwintests/task_info_28439149.c [deleted file]
tools/tests/darwintests/task_inspect.c [deleted file]
tools/tests/darwintests/task_inspect.entitlements [deleted file]
tools/tests/darwintests/thread_group_set_32261625.c [deleted file]
tools/tests/darwintests/utimensat.c [deleted file]
tools/tests/darwintests/verify_kalloc_config.c [deleted file]
tools/tests/darwintests/voucher_entry_18826844.c [deleted file]
tools/tests/darwintests/voucher_traps.c [deleted file]
tools/tests/darwintests/work_interval_test.c [deleted file]
tools/tests/darwintests/work_interval_test.entitlements [deleted file]
tools/tests/darwintests/workq_sigprof.c [deleted file]
tools/tests/darwintests/xnu_quick_test.c [deleted file]
tools/tests/darwintests/xnu_quick_test_getsetpriority.c [deleted file]
tools/tests/darwintests/xnu_quick_test_helpers.c [deleted file]
tools/tests/darwintests/xnu_quick_test_helpers.h [deleted file]
tools/tests/personas/Makefile
tools/tests/personas/persona_mgr.c
tools/tests/personas/persona_spawn.c
tools/tests/personas/persona_test_run.sh [new file with mode: 0755]
tools/trace/bridgetime.lua
tools/trace/kqtrace.lua
tools/trace/wqtrace.lua [new file with mode: 0755]

diff --git a/EXTERNAL_HEADERS/Makefile b/EXTERNAL_HEADERS/Makefile
index b99d5b4510be2c923802852383b035fc0b2eb053..a8db883a3cccec08dd11570851893816a8a27dd8 100644 (file)
@@ -34,6 +34,9 @@ KERNEL_FILES = \
        stddef.h        \
        stdint.h
 
+KERNEL_FILES += \
+       ptrauth.h
+
 INSTALL_MI_LIST =
 
 INSTALL_MI_DIR = .
diff --git a/EXTERNAL_HEADERS/corecrypto/cc.h b/EXTERNAL_HEADERS/corecrypto/cc.h
index 7790a4faa86369c51d87dea7c0ea4a7dd74611ec..5493e41c91987ed3da55d580bcf4ab80f5471878 100644 (file)
 #define _CORECRYPTO_CC_H_
 
 #include <corecrypto/cc_config.h>
+#include <corecrypto/cc_error.h>
 #include <string.h>
 #include <stdint.h>
 
+/* Provide a general purpose macro concat method. */
+#define cc_concat_(a, b) a##b
+#define cc_concat(a, b) cc_concat_(a, b)
+
 /* Manage asserts here because a few functions in header public files do use asserts */
 #define cc_assert(x) assert(x)
 #if CC_KERNEL
 #include <assert.h>
 #endif
 
+/* Provide a static assert that can be used to create compile-type failures. */
+#define cc_static_assert(e,m)                                               \
+    ;enum { cc_concat(static_assert_, __COUNTER__) = 1/(int)(!!(e)) }
+
 /* Declare a struct element with a guarenteed alignment of _alignment_.
    The resulting struct can be used to create arrays that are aligned by
    a certain amount.  */
@@ -61,12 +70,12 @@ uint8_t b[_alignment_]; \
  @param len number of bytes to be cleared in dst
  @param dst input array
  */
-CC_NONNULL2
+CC_NONNULL((2))
 void cc_clear(size_t len, void *dst);
 
 #define cc_copy(_size_, _dst_, _src_) memcpy(_dst_, _src_, _size_)
 
-CC_INLINE CC_NONNULL2 CC_NONNULL3 CC_NONNULL4
+CC_INLINE CC_NONNULL((2, 3, 4))
 void cc_xor(size_t size, void *r, const void *s, const void *t) {
     uint8_t *_r=(uint8_t *)r;
     const uint8_t *_s=(const uint8_t *)s;
@@ -84,7 +93,7 @@ void cc_xor(size_t size, void *r, const void *s, const void *t) {
  @param ptr2 input array
  @return  returns 0 if the num bytes starting at ptr1 are identical to the num bytes starting at ptr2 and 1 if they are different or if num is 0 (empty arrays).
  */
-CC_NONNULL2 CC_NONNULL3
+CC_NONNULL((2, 3))
 int cc_cmp_safe (size_t num, const void * ptr1, const void * ptr2);
 
 /* Exchange S and T of any type.  NOTE: Both and S and T are evaluated
diff --git a/EXTERNAL_HEADERS/corecrypto/cc_config.h b/EXTERNAL_HEADERS/corecrypto/cc_config.h
index 044c8e16898af676b8470eb47d049945115c9f83..fbdb2c61c7e7afbe9e71731c55876e23265dc33e 100644 (file)
@@ -12,9 +12,9 @@
 #define _CORECRYPTO_CC_CONFIG_H_
 
 /* A word about configuration macros:
+
     Conditional configuration macros specific to corecrypto should be named CORECRYPTO_xxx
-    or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an 
+    or CCxx_yyy and be defined to be either 0 or 1 in this file. You can add an
     #ifndef #error construct at the end of this file to make sure it's always defined.
 
     They should always be tested using the #if directive, never the #ifdef directive.
 
     Configuration Macros that are defined outside of corecrypto (eg: KERNEL, DEBUG, ...)
     shall only be used in this file to define CCxxx macros.
+
     External macros should be assumed to be either undefined, defined with no value,
     or defined as true or false. We shall strive to build with -Wundef whenever possible,
     so the following construct should be used to test external macros in this file:
-  
+
          #if defined(DEBUG) && (DEBUG)
          #define CORECRYPTO_DEBUG 1
          #else
          #define CORECRYPTO_DEBUG 0
          #endif
-  
+
 
     It is acceptable to define a conditional CC_xxxx macro in an implementation file,
     to be used only in this file.
+
     The current code is not guaranteed to follow those rules, but should be fixed to.
+
     Corecrypto requires GNU and C99 compatibility.
     Typically enabled by passing --gnu --c99 to the compiler (eg. armcc)
 
 #define CORECRYPTO_SIMULATE_WINDOWS_ENVIRONMENT 0
 #define CORECRYPTO_HACK_FOR_WINDOWS_DEVELOPMENT 0 //to be removed after <rdar://problem/27304763> port corecrypto to Windows
 
-//this macro is used to turn on/off usage of transparent union in corecrypto
-//it should be commented out in corecrypto and be used only by the software that use corecrypto
-//#define CORECRYPTO_DONOT_USE_TRANSPARENT_UNION
-#if defined(__cplusplus)
-#define CORECRYPTO_USE_TRANSPARENT_UNION 0
-#elif defined(CORECRYPTO_DONOT_USE_TRANSPARENT_UNION)
- #define CORECRYPTO_USE_TRANSPARENT_UNION !CORECRYPTO_DONOT_USE_TRANSPARENT_UNION
-#else
- #define CORECRYPTO_USE_TRANSPARENT_UNION 1
-#endif
-
 #if (defined(DEBUG) && (DEBUG)) || defined(_DEBUG) //MSVC defines _DEBUG
 /* CC_DEBUG is already used in CommonCrypto */
  #define CORECRYPTO_DEBUG 1
  #define CC_RTKIT 0
 #endif
 
+#if defined(RTKITROM) && (RTKITROM)
+#define CC_RTKITROM 1
+#else
+#define CC_RTKITROM 0
+#endif
+
 #if defined(USE_SEPROM) && (USE_SEPROM)
  #define CC_USE_SEPROM 1
 #else
 // warning: pointer of type 'void *' used in arithmetic
   #pragma GCC diagnostic ignored "-Wpointer-arith"
  #endif // __arm__
+#define CC_SMALL_CODE 1
+
 #endif // CC_BASEBAND
 
+#if CC_RTKIT || CC_RTKITROM
+#define CC_SMALL_CODE 1
+#endif
+
+
+#ifndef CC_SMALL_CODE
+#define CC_SMALL_CODE 0
+#endif
+
 //CC_XNU_KERNEL_AVAILABLE indicates the availibity of XNU kernel functions,
 //like what we have on OSX, iOS, tvOS, Watch OS
-#if defined(__APPLE__) && defined(__MACH__)  
+#if defined(__APPLE__) && defined(__MACH__)
  #define CC_XNU_KERNEL_AVAILABLE 1
 #else
  #define CC_XNU_KERNEL_AVAILABLE 0
 #endif
 
 #if !defined(CCN_UNIT_SIZE)
- #if defined(__arm64__) || defined(__x86_64__)  || defined(_WIN64) 
+ #if defined(__arm64__) || defined(__x86_64__)  || defined(_WIN64)
   #define CCN_UNIT_SIZE  8
  #elif defined(__arm__) || defined(__i386__) || defined(_WIN32)
   #define CCN_UNIT_SIZE  4
 
 #if defined(_MSC_VER)
     #if defined(__clang__)
-        #define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler  
+        #define CC_ALIGNED(x) __attribute__ ((aligned(x))) //clang compiler
     #else
         #define CC_ALIGNED(x) __declspec(align(x)) //MS complier
     #endif
 
 #if defined(__arm__)
 //this is copied from <arm/arch.h>, because <arm/arch.h> is not available on SEPROM environment
- #if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__)
+#if defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7S__) || defined (__ARM_ARCH_7F__) || defined (__ARM_ARCH_7K__) || defined(__ARM_ARCH_7EM__)
   #define _ARM_ARCH_7
  #endif
 
 #elif defined(__x86_64__) || defined(__i386__)
  #define CCN_IOS                                  0
  #define CCN_OSX                                  1
-#endif 
+#endif
 
 #if CC_USE_L4 || CC_USE_S3
 /* No dynamic linking allowed in L4, e.g. avoid nonlazy symbols */
 #endif
 
 #if !defined(CC_USE_HEAP_FOR_WORKSPACE)
- #if CC_USE_S3 || CC_USE_SEPROM || CC_RTKIT
+ #if CC_USE_S3 || CC_USE_SEPROM || CC_RTKITROM
   #define CC_USE_HEAP_FOR_WORKSPACE 0
  #else
   #define CC_USE_HEAP_FOR_WORKSPACE 1
 #define CC_DISABLE_RSAKEYGEN 0 /* default */
 #endif
 
+// see rdar://problem/26636018
+#if (CCN_UNIT_SIZE == 8) && !( defined(_MSC_VER) && defined(__clang__))
+#define CCEC25519_CURVE25519DONNA_64BIT 1
+#else
+#define CCEC25519_CURVE25519DONNA_64BIT 0
+#endif
+
 //- functions implemented in assembly ------------------------------------------
 //this the list of corecrypto clients that use assembly and the clang compiler
-#if !(CC_XNU_KERNEL_AVAILABLE || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG
+#if !(CC_XNU_KERNEL_AVAILABLE || CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3) && !defined(_WIN32) && CORECRYPTO_DEBUG
  #warning "You are using the default corecrypto configuration, assembly optimizations may not be available for your platform"
 #endif
 
 // Use this macro to strictly disable assembly regardless of cpu/os/compiler/etc.
 // Our assembly code is not gcc compatible. Clang defines the __GNUC__ macro as well.
 #if !defined(CC_USE_ASM)
- #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__)
+ #if defined(_WIN32) || CC_EFI || CC_BASEBAND || CC_XNU_KERNEL_PRIVATE || (defined(__GNUC__) && !defined(__clang__)) || defined(__ANDROID_API__) || CC_RTKIT || CC_RTKITROM
   #define CC_USE_ASM 0
  #else
   #define CC_USE_ASM 1
 
 //-(1) ARM V7
 #if defined(_ARM_ARCH_7) && __clang__ && CC_USE_ASM
- #define CCN_DEDICATED_SQR      1
+ #define CCN_DEDICATED_SQR      CC_SMALL_CODE
  #define CCN_MUL_KARATSUBA      0 // no performance improvement
  #define CCN_ADD_ASM            1
  #define CCN_SUB_ASM            1
  #define CCN_SHIFT_RIGHT_ASM    1
  #define CCAES_ARM_ASM          1
  #define CCAES_INTEL_ASM        0
- #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_USE_SEPROM || CC_USE_S3
+ #if CC_KERNEL || CC_USE_L4 || CC_IBOOT || CC_RTKIT || CC_RTKITROM || CC_USE_SEPROM || CC_USE_S3
   #define CCAES_MUX             0
  #else
   #define CCAES_MUX             1
 
 //-(2) ARM 64
 #elif defined(__arm64__) && __clang__ && CC_USE_ASM
- #define CCN_DEDICATED_SQR      1
+ #define CCN_DEDICATED_SQR      CC_SMALL_CODE
  #define CCN_MUL_KARATSUBA      1 // 4*n CCN_UNIT extra memory required.
  #define CCN_ADD_ASM            1
  #define CCN_SUB_ASM            1
  #define CCSHA2_VNG_ARMV7NEON   0
  #define CCSHA256_ARMV6M_ASM    0
 
-//-(4) disable assembly  
+//-(4) disable assembly
 #else
  #if CCN_UINT128_SUPPORT_FOR_64BIT_ARCH
   #define CCN_DEDICATED_SQR     1
 
 #define CC_INLINE static inline
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-// Non null for transparent unions is ambiguous and cause problems
-// for most tools (GCC and others: 23919290).
- #define CC_NONNULL_TU(N)
-#else
- #define CC_NONNULL_TU(N)  CC_NONNULL(N)
-#endif
-
 #ifdef __GNUC__
  #define CC_NORETURN __attribute__((__noreturn__))
  #define CC_NOTHROW __attribute__((__nothrow__))
  #define CC_NONNULL(N) __attribute__((__nonnull__ N))
- #define CC_NONNULL1 __attribute__((__nonnull__(1)))
- #define CC_NONNULL2 __attribute__((__nonnull__(2)))
- #define CC_NONNULL3 __attribute__((__nonnull__(3)))
- #define CC_NONNULL4 __attribute__((__nonnull__(4)))
- #define CC_NONNULL5 __attribute__((__nonnull__(5)))
- #define CC_NONNULL6 __attribute__((__nonnull__(6)))
- #define CC_NONNULL7 __attribute__((__nonnull__(7)))
+ #define CC_NONNULL4 CC_NONNULL((4))
  #define CC_NONNULL_ALL __attribute__((__nonnull__))
  #define CC_SENTINEL __attribute__((__sentinel__))
  #define CC_CONST __attribute__((__const__))
  #define CC_UNUSED
 /*! @parseOnly */
  #define CC_NONNULL(N)
-/*! @parseOnly */
- #define CC_NORETURN
-/*! @parseOnly */
- #define CC_NOTHROW
-/*! @parseOnly */
- #define CC_NONNULL1
-/*! @parseOnly */
- #define CC_NONNULL2
-/*! @parseOnly */
- #define CC_NONNULL3
 /*! @parseOnly */
  #define CC_NONNULL4
 /*! @parseOnly */
- #define CC_NONNULL5
-/*! @parseOnly */
- #define CC_NONNULL6
+ #define CC_NORETURN
 /*! @parseOnly */
- #define CC_NONNULL7
+ #define CC_NOTHROW
 /*! @parseOnly */
  #define CC_NONNULL_ALL
 /*! @parseOnly */
diff --git a/EXTERNAL_HEADERS/corecrypto/cc_debug.h b/EXTERNAL_HEADERS/corecrypto/cc_debug.h
index 80e61a7b30595481111f4dafc3fe2002a513771f..8cd85e2796266a1e4d1e318947647090b3a7aed5 100644 (file)
@@ -26,7 +26,7 @@
     #if !CONFIG_EMBEDDED
         extern int printf(const char *format, ...) __printflike(1,2);
     #endif
-#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT
+#elif CC_USE_S3 || CC_IBOOT || CC_RTKIT || CC_RTKITROM
     #include <stdio.h>
     #define cc_printf(x...) printf(x)
 #elif defined(__ANDROID_API__)
diff --git a/EXTERNAL_HEADERS/corecrypto/cc_error.h b/EXTERNAL_HEADERS/corecrypto/cc_error.h
new file mode 100644 (file)
index 0000000..57b8ec7
--- /dev/null
+++ b/EXTERNAL_HEADERS/corecrypto/cc_error.h
@@ -0,0 +1,124 @@
+/*
+ *  cc_error.h
+ *  corecrypto
+ *
+ *  Created on 11/14/2017
+ *
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CC_ERROR_H_
+#define _CORECRYPTO_CC_ERROR_H_
+
+enum {
+    CCERR_OK = 0,
+
+    /* the default error code */
+    CCERR_INTERNAL = -1,
+
+    CCERR_INTEGRITY = -2,
+
+    CCERR_DEVICE = -3,
+    CCERR_INTERRUPTS = -4,
+    CCERR_CRYPTO_CONFIG = -5,
+    CCERR_PERMS = -6,
+    CCERR_PARAMETER = -7,
+    CCERR_MEMORY = -8,
+    CCERR_FILEDESC = -9,
+    CCERR_OUT_OF_ENTROPY = -10,
+    CCERR_ATFORK = -11,
+    CCERR_OVERFLOW = -12,
+
+    CCERR_MEMORY_ALLOC_FAIL = -13,
+
+    CCEC_GENERATE_KEY_DEFAULT_ERR = -14,
+    CCEC_GENERATE_KEY_TOO_MANY_TRIES = -15,
+    CCEC_GENERATE_KEY_MULT_FAIL = -16,
+    CCEC_GENERATE_KEY_AFF_FAIL = -17,
+    CCEC_GENERATE_KEY_CONSISTENCY = -18,
+    CCEC_GENERATE_NOT_ON_CURVE = -19,
+    CCEC_GENERATE_NOT_ENOUGH_ENTROPY = -20,
+    CCEC_GENERATE_NOT_SUPPORTED = -21,
+    CCEC_GENERATE_INVALID_INPUT = -22,
+
+    // Program error: buffer too small or encrypted message is too small
+    CCRSA_INVALID_INPUT = -23,
+    // Invalid crypto configuration: Hash length versus RSA key size
+    CCRSA_INVALID_CONFIG = -24,
+    CCRSA_ENCODING_ERROR = -25,
+    CCRSA_DECODING_ERROR = -26,
+
+    // The data is invalid (we won't say more for security)
+    CCRSA_PRIVATE_OP_ERROR = -27,
+    CCRSA_KEY_ERROR = -28,
+
+    // Key generation specific
+    CCRSA_KEYGEN_PRIME_NOT_FOUND = -29,
+    CCRSA_KEYGEN_PRIME_NEED_NEW_SEED = -30,
+    CCRSA_KEYGEN_PRIME_TOO_MANY_ITERATIONS = -31,
+    CCRSA_KEYGEN_PRIME_SEED_GENERATION_ERROR = -32,
+    CCRSA_KEYGEN_MODULUS_CRT_INV_ERROR = -33,
+    CCRSA_KEYGEN_NEXT_PRIME_ERROR = -34,
+    CCRSA_KEYGEN_SEED_X_ERROR = -35,
+    CCRSA_KEYGEN_SEED_r_ERROR = -36,
+    CCRSA_KEYGEN_KEYGEN_CONSISTENCY_FAIL = -37,
+    CCRSA_KEYGEN_R1R2_SIZE_ERROR = -38,
+    CCRSA_KEYGEN_PQ_DELTA_ERROR = -39,
+
+    CCRSA_FIPS_KEYGEN_DISABLED = -40,
+
+    CCZP_INV_ERROR = -41,
+    CCZP_INV_NO_INVERSE = -42,
+    CCZP_INV_INVALID_INPUT = -43,
+
+    CCZ_INVALID_INPUT_ERROR = -44,
+    CCZ_INVALID_RADIX_ERROR = -45,
+
+    CCDH_ERROR_DEFAULT = -46,
+    CCDH_GENERATE_KEY_TOO_MANY_TRIES = -47,
+    CCDH_NOT_SUPPORTED_CONFIGURATION = -48,
+    CCDH_SAFETY_CHECK = -49,
+    CCDH_PUBLIC_KEY_MISSING = -50,
+    CCDH_INVALID_DOMAIN_PARAMETER = -51,
+    CCDH_INVALID_INPUT = -52,
+    CCDH_DOMAIN_PARAMETER_MISMATCH = -53,
+    CCDH_GENERATE_KEY_CONSISTENCY = -54,
+
+    CCSRP_ERROR_DEFAULT = -55,
+    CCSRP_GENERATE_KEY_TOO_MANY_TRIES = -56,
+    CCSRP_NOT_SUPPORTED_CONFIGURATION = -57,
+    CCSRP_SAFETY_CHECK = -58,
+    CCSRP_PUBLIC_KEY_MISSING = -59,
+    CCSRP_INVALID_DOMAIN_PARAMETER = -60,
+
+    CCDRBG_STATUS_ERROR = -61,
+    CCDRBG_STATUS_NEED_RESEED = -62,
+    CCDRBG_STATUS_PARAM_ERROR = -63,
+    // If this value is returned, the caller must abort or panic the process for
+    // security reasons, for example in the case of a catastrophic error such as the one described in
+    // http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf
+    // ccdrbg calls abort() or panic() if they are available in the system.
+    CCDRBG_STATUS_ABORT = -64,
+
+    CCKPRNG_NEED_ENTROPY = -65,
+    CCKPRNG_ABORT = -66,
+
+    CCMODE_INVALID_INPUT = -67,
+    CCMODE_INVALID_CALL_SEQUENCE = -68,
+    CCMODE_INTEGRITY_FAILURE = -69,
+    CCMODE_NOT_SUPPORTED = -70,
+    CCMODE_INTERNAL_ERROR = -71,
+
+    // Configuration or unexpected issue
+    CCPOST_GENERIC_FAILURE = -72,
+    CCPOST_LIBRARY_ERROR = -73,
+    CCPOST_INTEGRITY_ERROR = -74,
+    // Output of the algo is not as expected
+    CCPOST_KAT_FAILURE = -75,
+};
+
+#define CCDRBG_STATUS_OK CCERR_OK
+#define CCKPRNG_OK CCERR_OK
+
+#endif /* _CORECRYPTO_CC_ERROR_H_ */
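
For orientation, a minimal sketch of how a caller might consume these unified error codes, assuming <corecrypto/ccrng.h> is also available for struct ccrng_state and the ccrng_generate macro (both appear later in this diff):

    #include <corecrypto/cc_error.h>
    #include <corecrypto/ccrng.h>

    /* Hypothetical helper: success is CCERR_OK (0), every failure is negative. */
    static int generate_or_fail(struct ccrng_state *rng, size_t nbytes, void *out)
    {
        int rc = ccrng_generate(rng, nbytes, out);
        if (rc != CCERR_OK) {
            /* e.g. CCERR_OUT_OF_ENTROPY or CCKPRNG_NEED_ENTROPY; all are < 0 */
            return rc;
        }
        return CCERR_OK;
    }
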
index 55e0eb2b881aebbaeeaabf2a301ca9647ee5bcf5..0a51e66eec069b9221e7d5c9f0d9fa4296f2a056 100644 (file)
@@ -471,6 +471,12 @@ void cc_mux2p(int s, void **r_true, void **r_false, const void *a, const void *b
     r = (~_cond&(a))|(_cond&(b)); \
 }
 
-int cc_is_compiled_with_tu(void);
+/*
+  Unfortunately, since we export this symbol, this declaration needs
+  to be in a public header to satisfy TAPI.
+
+  See fipspost_trace_priv.h for more details.
+*/
+extern const void *fipspost_trace_vtable;
 
 #endif /* _CORECRYPTO_CC_PRIV_H_ */
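
The context above shows the branch-free select used by the CC_MUX helpers, r = (~cond & a) | (cond & b). A standalone sketch of the same constant-time pattern, purely illustrative and not the corecrypto macro itself:

    #include <stdint.h>

    /* Returns a when cond == 0 and b when cond == 1, without branching:
       cond is first stretched into an all-zeros or all-ones mask. */
    static inline uintptr_t select_ct(uintptr_t cond, uintptr_t a, uintptr_t b)
    {
        uintptr_t mask = (uintptr_t)0 - (cond & 1);  /* 0x00...0 or 0xff...f */
        return (~mask & a) | (mask & b);
    }
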
index 0064c6ca6f45a5b35a8b191f4456df37d81b75f0..0d7ac528987de50a17136622c7594fbc4f9fd4f9 100644 (file)
@@ -23,6 +23,7 @@
     #define CC_HAS_SupplementalSSE3() ((cpuid_features() & CPUID_FEATURE_SSSE3) != 0)
     #define CC_HAS_AVX1() ((cpuid_features() & CPUID_FEATURE_AVX1_0) != 0)
     #define CC_HAS_AVX2() ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX2) != 0)
+    #define CC_HAS_AVX512_AND_IN_KERNEL()    ((cpuid_info()->cpuid_leaf7_features & CPUID_LEAF7_FEATURE_AVX512F) !=0)
 
 #elif CC_XNU_KERNEL_AVAILABLE
     # include <System/i386/cpu_capabilities.h>
     #define CC_HAS_SupplementalSSE3() (_cpu_capabilities & kHasSupplementalSSE3)
     #define CC_HAS_AVX1() (_cpu_capabilities & kHasAVX1_0)
     #define CC_HAS_AVX2() (_cpu_capabilities & kHasAVX2_0)
+    #define CC_HAS_AVX512_AND_IN_KERNEL() 0
 #else
     #define CC_HAS_AESNI() 0
     #define CC_HAS_SupplementalSSE3() 0
     #define CC_HAS_AVX1() 0
     #define CC_HAS_AVX2() 0
+    #define CC_HAS_AVX512_AND_IN_KERNEL()  0
 #endif
 
 #endif /* !(defined(__x86_64__) || defined(__i386__)) */
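
CC_HAS_AVX512_AND_IN_KERNEL() joins the existing x86 feature probes and evaluates to 0 outside the kernel branch. A sketch of how such probes gate dispatch on x86 builds; the function below is illustrative only, the real selection happens inside corecrypto:

    /* Illustrative only: report which code path the feature probes would pick. */
    static const char *pick_impl(void)
    {
        if (CC_HAS_AVX512_AND_IN_KERNEL())
            return "avx512";
        if (CC_HAS_AVX2())
            return "avx2";
        if (CC_HAS_SupplementalSSE3())
            return "ssse3";
        return "portable";
    }
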
index ec119b9b6fcf7a0f16abbf49c572e33b0e5527a9..281c99d22aa277b22592c8dedd93ada57f75316c 100644 (file)
@@ -45,6 +45,9 @@ extern const struct ccmode_ofb ccaes_arm_ofb_crypt_mode;
 #endif
 
 #if CCAES_MUX
+/* Runtime check to see if hardware should be used */
+int ccaes_ios_hardware_enabled(int operation);
+
 extern const struct ccmode_cbc ccaes_ios_hardware_cbc_encrypt_mode;
 extern const struct ccmode_cbc ccaes_ios_hardware_cbc_decrypt_mode;
 
@@ -86,6 +89,15 @@ extern const struct ccmode_xts ccaes_intel_xts_decrypt_opt_mode;
 extern const struct ccmode_xts ccaes_intel_xts_decrypt_aesni_mode;
 #endif
 
+#if CC_USE_L4
+extern const struct ccmode_cbc ccaes_skg_cbc_encrypt_mode;
+extern const struct ccmode_cbc ccaes_skg_cbc_decrypt_mode;
+
+extern const struct ccmode_ecb ccaes_skg_ecb_encrypt_mode;
+extern const struct ccmode_ecb ccaes_skg_ecb_decrypt_mode;
+
+extern const struct ccmode_ecb ccaes_trng_ecb_encrypt_mode;
+#endif
 
 /* Implementation Selectors: */
 const struct ccmode_ecb *ccaes_ecb_encrypt_mode(void);
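
The new ccaes_ios_hardware_enabled() probe lets callers check at run time whether the hardware CBC engines above are usable. A hedged sketch; the operation encoding passed to the probe and the ccaes_cbc_encrypt_mode() selector are assumptions, not shown in this hunk:

    #include <corecrypto/ccaes.h>

    /* Sketch: prefer the iOS hardware CBC engine when the runtime probe allows it,
       otherwise fall back to the generic selector. */
    static const struct ccmode_cbc *choose_cbc_encrypt(void)
    {
    #if CCAES_MUX
        if (ccaes_ios_hardware_enabled(1 /* encrypt; exact encoding assumed */))
            return &ccaes_ios_hardware_cbc_encrypt_mode;
    #endif
        return ccaes_cbc_encrypt_mode();   /* selector assumed from this header */
    }
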
index 28fba4eef86a1c0045daf16898e43df50e1f58e0..75aac6e6826eebc607ea4b32cdf61cc65a9da08d 100644 (file)
@@ -69,26 +69,16 @@ enum {
     CCASN1_CONSTRUCTED_SEQUENCE = CCASN1_SEQUENCE | CCASN1_CONSTRUCTED,
 };
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-typedef union {
-    const unsigned char * oid;
-} __attribute__((transparent_union)) ccoid_t;
-#define CCOID(x) ((x).oid)
-#else
-    typedef const unsigned char * ccoid_t;
+typedef const unsigned char * ccoid_t;
 #define CCOID(oid) (oid)
-#endif
-
-/* Returns *der iff *der points to a DER encoded oid that fits within *der_len. */
-ccoid_t ccoid_for_der(size_t *der_len, const uint8_t **der);
 
 /* Returns the size of an oid including it's tag and length. */
-CC_INLINE CC_PURE CC_NONNULL_TU((1))
+CC_INLINE CC_PURE CC_NONNULL((1))
 size_t ccoid_size(ccoid_t oid) {
     return 2 + CCOID(oid)[1];
 }
 
-CC_INLINE CC_PURE CC_NONNULL_TU((1)) CC_NONNULL_TU((2))
+CC_INLINE CC_PURE CC_NONNULL((1, 2))
 bool ccoid_equal(ccoid_t oid1, ccoid_t oid2) {
     return  (ccoid_size(oid1) == ccoid_size(oid2)
             && memcmp(CCOID(oid1), CCOID(oid2), ccoid_size(oid1))== 0);
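
With the transparent union gone, ccoid_t is simply a pointer to the DER bytes of an OID (tag 0x06, a length byte, then the body). A minimal sketch using the two inline helpers above; the byte string is made up for illustration and is not a registered OID:

    #include <stdbool.h>
    #include <corecrypto/ccasn1.h>

    /* "\x06\x03" = OID tag + 3-byte body; ccoid_size() returns 2 + length = 5. */
    static const unsigned char example_oid[] = "\x06\x03\x2A\x03\x04";

    static void oid_demo(void)
    {
        ccoid_t oid = example_oid;
        size_t len = ccoid_size(oid);        /* 5 for this example */
        bool same = ccoid_equal(oid, oid);   /* trivially true */
        (void)len; (void)same;
    }
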
index 3e76b81b47ea0c8eb68b82145d96fee297683ed9..4ca59e63beb08be5df4ba837f00462f463ff3f88 100644 (file)
@@ -107,8 +107,6 @@ struct ccchacha20poly1305_info {
 
 };
 
-extern const struct ccchacha20poly1305_info ccchacha20poly1305_info_default;
-
 const struct ccchacha20poly1305_info *ccchacha20poly1305_info(void);
 
 /*!
index 63a892fd646f83203d0c836233ce4eb8ed2c9139..d2e01814357a0596b9b2fc5e5101078ded7ff9ec 100644 (file)
 
 #define CMAC_BLOCKSIZE   16
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-struct cccmac_ctx {
-    uint8_t b[8];
-} CC_ALIGNED(8);
-
-typedef struct cccmac_ctx_hdr {
-    uint8_t k1[CMAC_BLOCKSIZE];
-    uint8_t k2[CMAC_BLOCKSIZE];
-    uint8_t block[CMAC_BLOCKSIZE];
-    size_t  block_nbytes;      // Number of byte occupied in block buf
-    size_t  cumulated_nbytes;  // Total size processed
-    const struct ccmode_cbc *cbc;
-    uint8_t ctx[8];
-} CC_ALIGNED(8) cccmac_ctx_hdr;
-
-
-typedef union {
-    struct cccmac_ctx *b;
-    cccmac_ctx_hdr *hdr;
-} cccmac_ctx_t __attribute__((transparent_union));
-#define cccmac_hdr_size sizeof(struct cccmac_ctx_hdr)
-
-#else
-
 struct cccmac_ctx {
     uint8_t k1[CMAC_BLOCKSIZE];
     uint8_t k2[CMAC_BLOCKSIZE];
@@ -55,8 +31,6 @@ typedef struct cccmac_ctx* cccmac_ctx_t;
 
 #define cccmac_hdr_size sizeof(struct cccmac_ctx)
 
-#endif
-
 
 #define cccmac_iv_size(_mode_)  ((_mode_)->block_size)
 #define cccmac_cbc_size(_mode_) ((_mode_)->size)
@@ -67,15 +41,9 @@ typedef struct cccmac_ctx* cccmac_ctx_t;
 #define cccmac_mode_decl(_mode_, _name_) cc_ctx_decl(struct cccmac_ctx, cccmac_ctx_size(_mode_), _name_)
 #define cccmac_mode_clear(_mode_, _name_) cc_clear(cccmac_ctx_size(_mode_), _name_)
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-/* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */
-#define cccmac_mode_ctx_start(_mode_, HC)     (((HC).hdr)->ctx)
-#define CCCMAC_HDR(HC)      (((cccmac_ctx_t)(HC)).hdr)
-#else
 /* Return a cccbc_ctx * which can be accesed with the macros in ccmode.h */
 #define cccmac_mode_ctx_start(_mode_, HC)    (HC->ctx)
 #define CCCMAC_HDR(HC)      (HC)
-#endif
 
 #define cccmac_mode_sym_ctx(_mode_, HC)     (cccbc_ctx *)(cccmac_mode_ctx_start(_mode_, HC))
 #define cccmac_mode_iv(_mode_, HC)     (cccbc_iv *)(cccmac_mode_ctx_start(_mode_, HC)+cccmac_cbc_size(_mode_))
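
Since cccmac_ctx_t is now a plain struct pointer, contexts are declared and wiped with the macros above. A hedged sketch of a one-shot CMAC; cccmac_one_shot_generate() is assumed to be the one-shot entry point declared elsewhere in this header and is not shown in this hunk:

    #include <corecrypto/cccmac.h>

    /* Sketch only: compute an AES-CMAC tag over msg with a 16-byte key. */
    static int cmac_demo(const struct ccmode_cbc *cbc, const uint8_t key[16],
                         const void *msg, size_t msg_nbytes,
                         uint8_t mac[CMAC_BLOCKSIZE])
    {
        return cccmac_one_shot_generate(cbc, 16, key, msg_nbytes, msg,
                                        CMAC_BLOCKSIZE, mac);  /* assumed API */
    }
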
index 6e2c504bef11fb032bd818ecf80538f9abdfa1cf..5bd102962f1d7ba1c4610c8b5712cee73804e851 100644 (file)
@@ -134,21 +134,21 @@ size_t ccder_sizeof_uint64(uint64_t value);
 /* Encode a tag backwards, der_end should point to one byte past the end of
    destination for the tag, returns a pointer to the first byte of the tag.
    Returns NULL if there is an encoding error. */
-CC_NONNULL2
+CC_NONNULL((2))
 uint8_t *ccder_encode_tag(ccder_tag tag, const uint8_t *der, uint8_t *der_end);
 
 /* Returns a pointer to the start of the len field.  returns NULL if there
  is an encoding error. */
-CC_NONNULL2
+CC_NONNULL((2))
 uint8_t *
 ccder_encode_len(size_t len, const uint8_t *der, uint8_t *der_end);
 
 /* der_end should point to the first byte of the content of this der item. */
-CC_NONNULL3
+CC_NONNULL((3))
 uint8_t *
 ccder_encode_tl(ccder_tag tag, size_t len, const uint8_t *der, uint8_t *der_end);
 
-CC_PURE CC_NONNULL2
+CC_PURE CC_NONNULL((2))
 uint8_t *
 ccder_encode_body_nocopy(size_t size, const uint8_t *der, uint8_t *der_end);
 
@@ -163,7 +163,7 @@ ccder_encode_constructed_tl(ccder_tag tag, const uint8_t *body_end,
 
 /* Encodes oid into der and returns
  der + ccder_sizeof_oid(oid). */
-CC_NONNULL_TU((1)) CC_NONNULL2
+CC_NONNULL((1, 2))
 uint8_t *ccder_encode_oid(ccoid_t oid, const uint8_t *der, uint8_t *der_end);
 
 CC_NONNULL((3, 4))
@@ -175,12 +175,12 @@ CC_NONNULL((2, 3))
 uint8_t *ccder_encode_integer(cc_size n, const cc_unit *s,
                               const uint8_t *der, uint8_t *der_end);
 
-CC_NONNULL3
+CC_NONNULL((3))
 uint8_t *ccder_encode_implicit_uint64(ccder_tag implicit_tag,
                                       uint64_t value,
                                       const uint8_t *der, uint8_t *der_end);
 
-CC_NONNULL2
+CC_NONNULL((2))
 uint8_t *ccder_encode_uint64(uint64_t value,
                              const uint8_t *der, uint8_t *der_end);
 
@@ -206,7 +206,7 @@ uint8_t *ccder_encode_raw_octet_string(size_t s_size, const uint8_t *s,
 
 size_t ccder_encode_eckey_size(size_t priv_size, ccoid_t oid, size_t pub_size);
 
-CC_NONNULL2 CC_NONNULL5 CC_NONNULL6  CC_NONNULL7
+CC_NONNULL((2, 5, 6, 7))
 uint8_t *ccder_encode_eckey(size_t priv_size, const uint8_t *priv_key,
                             ccoid_t oid,
                             size_t pub_size, const uint8_t *pub_key,
@@ -216,7 +216,7 @@ uint8_t *ccder_encode_eckey(size_t priv_size, const uint8_t *priv_key,
    It's inefficient – especially when you already have to convert to get to
    the form for the body.
    see encode integer for the right way to unify conversion and insertion */
-CC_NONNULL3
+CC_NONNULL((3))
 uint8_t *
 ccder_encode_body(size_t size, const uint8_t* body,
                   const uint8_t *der, uint8_t *der_end);
@@ -291,16 +291,16 @@ const uint8_t *ccder_decode_uint64(uint64_t* r,
 CC_NONNULL((2, 3, 5))
 const uint8_t *ccder_decode_seqii(cc_size n, cc_unit *r, cc_unit *s,
                                   const uint8_t *der, const uint8_t *der_end);
-CC_NONNULL_TU((1)) CC_NONNULL((3))
+CC_NONNULL((1, 3))
 const uint8_t *ccder_decode_oid(ccoid_t *oidp,
                                 const uint8_t *der, const uint8_t *der_end);
 
-CC_NONNULL((1,2,4))
+CC_NONNULL((1, 2, 4))
 const uint8_t *ccder_decode_bitstring(const uint8_t **bit_string,
                                 size_t *bit_length,
                                 const uint8_t *der, const uint8_t *der_end);
 
-CC_NONNULL_TU((4)) CC_NONNULL((1,2,3,5,6,8))
+CC_NONNULL((1, 2, 3, 4, 5, 6, 8))
 const uint8_t *ccder_decode_eckey(uint64_t *version,
                                   size_t *priv_size, const uint8_t **priv_key,
                                   ccoid_t *oid,
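
ccder encodes backwards: der_end points one byte past the end of the destination and each encode call returns the first byte it wrote (NULL on an encoding error, per the comments above). A minimal sketch using ccder_encode_uint64(), whose prototype appears in this hunk:

    #include <corecrypto/ccder.h>

    /* Returns the number of DER bytes written at the tail of buf, or 0 on error. */
    static size_t encode_one_uint64(uint64_t value, uint8_t *buf, size_t buf_nbytes)
    {
        uint8_t *der_end = buf + buf_nbytes;
        uint8_t *start = ccder_encode_uint64(value, buf, der_end);
        if (start == NULL)
            return 0;
        return (size_t)(der_end - start);   /* encoding occupies [start, der_end) */
    }
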
index b4925bd144abc424c0d09bc2939afbe96e3b57ba..31b5dadbf908a965e16f6c03b39acaa2f8180042 100644 (file)
 #define CCDES_BLOCK_SIZE 8
 #define CCDES_KEY_SIZE 8
 
-extern const struct ccmode_ecb ccdes_ltc_ecb_decrypt_mode;
-extern const struct ccmode_ecb ccdes_ltc_ecb_encrypt_mode;
-
 extern const struct ccmode_ecb ccdes3_ltc_ecb_decrypt_mode;
 extern const struct ccmode_ecb ccdes3_ltc_ecb_encrypt_mode;
-extern const struct ccmode_ecb ccdes168_ltc_ecb_encrypt_mode;
 
 const struct ccmode_ecb *ccdes_ecb_decrypt_mode(void);
 const struct ccmode_ecb *ccdes_ecb_encrypt_mode(void);
@@ -61,8 +57,8 @@ int ccdes_key_is_weak( void *key, size_t  length);
 void ccdes_key_set_odd_parity(void *key, size_t length);
 
 uint32_t
-ccdes_cbc_cksum(void *in, void *out, size_t length,
-                void *key, size_t keylen, void *ivec);
+ccdes_cbc_cksum(const void *in, void *out, size_t length,
+                const void *key, size_t key_nbytes, const void *ivec);
 
 
 #endif /* _CORECRYPTO_CCDES_H_ */
index a1b178a6018f5c7f2a1223dd77606ff2e22d54e5..52ee15123b6e0542514f79414b871fa559fd5414 100644 (file)
 
 /* To malloc a digest context for a given di, use malloc(ccdigest_di_size(di))
    and assign the result to a pointer to a struct ccdigest_ctx. */
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-struct ccdigest_ctx {
-    union {
-        uint8_t u8;
-        uint32_t u32;
-        uint64_t u64;
-        cc_unit ccn;
-    } state;
-} CC_ALIGNED(8);
-
-typedef union {
-    struct ccdigest_ctx *hdr;
-} ccdigest_ctx_t __attribute__((transparent_union));
-
-struct ccdigest_state {
-    union {
-        uint8_t u8;
-        uint32_t u32;
-        uint64_t u64;
-        cc_unit ccn;
-    } state;
-} CC_ALIGNED(8);
-
-typedef union {
-    struct ccdigest_state *hdr;
-    struct ccdigest_ctx *_ctx;
-    ccdigest_ctx_t _ctxt;
-} ccdigest_state_t __attribute__((transparent_union));
-#else //=======================================================
 struct ccdigest_ctx {
     union {
         uint8_t u8;
@@ -66,8 +37,6 @@ struct ccdigest_state {
 } CC_ALIGNED(8);
 
 typedef struct ccdigest_state *ccdigest_state_t;
-#endif //=======================================================
-
 
 struct ccdigest_info {
     size_t output_size;
@@ -99,40 +68,22 @@ struct ccdigest_info {
 #define ccdigest_di_clear(_di_, _name_) cc_clear(ccdigest_di_size(_di_), _name_)
 
 /* Digest context field accessors.  Consider the implementation private. */
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define ccdigest_state(_di_, _ctx_)      ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + sizeof(uint64_t)))
-#else
 #define ccdigest_state(_di_, _ctx_)      ((struct ccdigest_state *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + sizeof(uint64_t)))
-#endif
 
 #define ccdigest_state_u8(_di_, _ctx_)   ccdigest_u8(ccdigest_state((_di_), (_ctx_)))
 #define ccdigest_state_u32(_di_, _ctx_)  ccdigest_u32(ccdigest_state((_di_), (_ctx_)))
 #define ccdigest_state_u64(_di_, _ctx_)  ccdigest_u64(ccdigest_state((_di_), (_ctx_)))
 #define ccdigest_state_ccn(_di_, _ctx_)  ccdigest_ccn(ccdigest_state((_di_), (_ctx_)))
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define ccdigest_nbits(_di_, _ctx_)      (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8))[0])
-#define ccdigest_data(_di_, _ctx_)       (&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t))
-#define ccdigest_num(_di_, _ctx_)        (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_)).hdr->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0])
-#else
 #define ccdigest_nbits(_di_, _ctx_)      (((uint64_t *)(&((ccdigest_ctx_t)(_ctx_))->state.u8))[0])
 #define ccdigest_data(_di_, _ctx_)       (&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t))
 #define ccdigest_num(_di_, _ctx_)        (((unsigned int *)(&((ccdigest_ctx_t)(_ctx_))->state.u8 + (_di_)->state_size + sizeof(uint64_t) + (_di_)->block_size))[0])
-#endif
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-/* Digest state field accessors.  Consider the implementation private. */
-#define ccdigest_u8(_state_)             (&((ccdigest_state_t)(_state_)).hdr->state.u8)
-#define ccdigest_u32(_state_)            (&((ccdigest_state_t)(_state_)).hdr->state.u32)
-#define ccdigest_u64(_state_)            (&((ccdigest_state_t)(_state_)).hdr->state.u64)
-#define ccdigest_ccn(_state_)            (&((ccdigest_state_t)(_state_)).hdr->state.ccn)
-#else
 /* Digest state field accessors.  Consider the implementation private. */
 #define ccdigest_u8(_state_)             (&((ccdigest_state_t)(_state_))->state.u8)
 #define ccdigest_u32(_state_)            (&((ccdigest_state_t)(_state_))->state.u32)
 #define ccdigest_u64(_state_)            (&((ccdigest_state_t)(_state_))->state.u64)
 #define ccdigest_ccn(_state_)            (&((ccdigest_state_t)(_state_))->state.ccn)
-#endif
 
 /* We could just use memcpy instead of this special macro, but this allows us
    to use the optimized ccn_set() assembly routine if we have one, which for
@@ -156,23 +107,6 @@ void ccdigest_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned
 void ccdigest(const struct ccdigest_info *di, size_t len,
               const void *data, void *digest);
 
-/* test functions */
-int ccdigest_test(const struct ccdigest_info *di, size_t len,
-              const void *data, const void *digest);
-
-int ccdigest_test_chunk(const struct ccdigest_info *di, size_t len,
-                        const void *data, const void *digest, size_t chunk);
-
-struct ccdigest_vector {
-    size_t len;
-    const void *message;
-    const void *digest;
-};
-
-int ccdigest_test_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v);
-int ccdigest_test_chunk_vector(const struct ccdigest_info *di, const struct ccdigest_vector *v, size_t chunk);
-
-
 #define OID_DEF(_VALUE_)  ((const unsigned char *)_VALUE_)
 
 #define CC_DIGEST_OID_MD2       OID_DEF("\x06\x08\x2A\x86\x48\x86\xF7\x0D\x02\x02")
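
The streaming ccdigest entry points and the one-shot ccdigest() stay in the public header; only the test helpers move out. A minimal sketch of the one-shot path, where ccsha256_di() is assumed from ccsha2.h and is not part of this hunk:

    #include <corecrypto/ccdigest.h>
    #include <corecrypto/ccsha2.h>   /* ccsha256_di(); assumed */

    static void sha256_demo(const void *data, size_t len, unsigned char out[32])
    {
        const struct ccdigest_info *di = ccsha256_di();
        ccdigest(di, len, data, out);   /* init + update + final in one call */
    }
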
index e888a734d396742f1685bee22266f614d896d3c9..9d42de51948f50e9d4ac310ffe5e1663abbf5812 100644 (file)
 #include <corecrypto/ccdigest.h>
 #include <corecrypto/ccasn1.h>
 
-void ccdigest_final_common(const struct ccdigest_info *di,
-                           ccdigest_ctx_t ctx, void *digest);
-void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t,
-                         unsigned char *digest);
-void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t,
-                         unsigned char *digest);
-
-CC_INLINE CC_NONNULL_TU((1))
+CC_INLINE CC_NONNULL((1))
 bool ccdigest_oid_equal(const struct ccdigest_info *di, ccoid_t oid) {
     if(di->oid == NULL && CCOID(oid) == NULL) return true;
     if(di->oid == NULL || CCOID(oid) == NULL) return false;
index af5b010a9004d4419fad06ddeb2f10a73b9b884b..7717d0c036982c719d00c18464654b5d055bdd2a 100644 (file)
 #include <corecrypto/cc.h>
 #include <corecrypto/ccdrbg_impl.h>
 
-/* error codes */
-#define CCDRBG_STATUS_OK 0
-#define CCDRBG_STATUS_ERROR (-1)
-#define CCDRBG_STATUS_NEED_RESEED (-2)
-#define CCDRBG_STATUS_PARAM_ERROR (-3)
-// If this value is returned, the caller must abort or panic the process for security reasons.
-// for example in the case of catastrophic error in
-// http://csrc.nist.gov/publications/drafts/800-90/sp800_90a_r1_draft.pdf
-// ccdrbg calls abort() or panic(), if they are available in the system.
-#define CCDRBG_STATUS_ABORT (-4)
 /*
- * The maximum length of the entropy_input,  additional_input (max_additional_input_length) , personalization string 
+ * The maximum length of the entropy_input,  additional_input (max_additional_input_length) , personalization string
  * (max_personalization_string_length) and max_number_of_bits_per_request  are implementation dependent
- * but shall fit in a 32 bit register and be be less than or equal to the specified maximum length for the 
+ * but shall fit in a 32 bit register and be be less than or equal to the specified maximum length for the
  * selected DRBG mechanism (NIST 800-90A Section 10).
  */
 
@@ -87,9 +77,9 @@ CC_INLINE void ccdrbg_done(const struct ccdrbg_info *info,
        info->done(drbg);
 }
 
-CC_INLINE size_t ccdrbg_context_size(const struct ccdrbg_info *drbg)
+CC_INLINE size_t ccdrbg_context_size(const struct ccdrbg_info *info)
 {
-    return drbg->size;
+    return info->size;
 }
 
 
@@ -110,8 +100,6 @@ void ccdrbg_factory_nistctr(struct ccdrbg_info *info, const struct ccdrbg_nistct
  * NIST SP 800-90 HMAC_DRBG
  * the maximum security strengh of drbg is half of output size of the input hash function and it internally is limited to 256 bits
  */
-extern struct ccdrbg_info ccdrbg_nistdigest_info;
-
 struct ccdrbg_nisthmac_custom {
     const struct ccdigest_info *di;
     int strictFIPS;
@@ -119,10 +107,4 @@ struct ccdrbg_nisthmac_custom {
 
 void ccdrbg_factory_nisthmac(struct ccdrbg_info *info, const struct ccdrbg_nisthmac_custom *custom);
 
-
-/*
- * Dummy DRBG
- */
-extern struct ccdrbg_info ccdrbg_dummy_info;
-
 #endif /* _CORECRYPTO_CCDRBG_H_ */
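
The DRBG status codes move to cc_error.h (CCDRBG_STATUS_OK is now an alias for CCERR_OK) and the dummy/NIST-digest info globals disappear, so instances are built through the factory functions. A hedged sketch; ccdrbg_init() and ccdrbg_generate() are the inline wrappers defined earlier in this header and their exact parameter lists are assumed here:

    #include <corecrypto/cc_error.h>
    #include <corecrypto/ccdrbg.h>

    /* Sketch: HMAC-DRBG via the factory; state buffer size/alignment checks elided. */
    static int drbg_demo(const struct ccdigest_info *di,
                         const void *entropy, size_t entropy_nbytes,
                         void *out, size_t out_nbytes)
    {
        struct ccdrbg_info info;
        struct ccdrbg_nisthmac_custom custom = { .di = di, .strictFIPS = 0 };
        ccdrbg_factory_nisthmac(&info, &custom);

        uint8_t state[2048];   /* must be at least ccdrbg_context_size(&info) bytes */
        struct ccdrbg_state *drbg = (struct ccdrbg_state *)state;

        int rc = ccdrbg_init(&info, drbg, entropy_nbytes, entropy, 0, NULL, 0, NULL);
        if (rc == CCERR_OK)
            rc = ccdrbg_generate(&info, drbg, out_nbytes, out, 0, NULL);
        ccdrbg_done(&info, drbg);
        return rc;
    }
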
index 81c1ab835a5cff83249e9ba872c2d5ef2830db9f..048c0de14af88fa9e5df23d05e3482a92eb74c71 100644 (file)
@@ -19,14 +19,7 @@ struct cchmac_ctx {
     uint8_t b[8];
 } CC_ALIGNED(8);
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-typedef union {
-    struct cchmac_ctx *hdr;
-    ccdigest_ctx_t digest;
-} cchmac_ctx_t __attribute__((transparent_union));
-#else
 typedef struct cchmac_ctx* cchmac_ctx_t;
-#endif
 
 #define cchmac_ctx_size(STATE_SIZE, BLOCK_SIZE)  (ccdigest_ctx_size(STATE_SIZE, BLOCK_SIZE) + (STATE_SIZE))
 #define cchmac_di_size(_di_)  (cchmac_ctx_size((_di_)->state_size, (_di_)->block_size))
@@ -39,43 +32,25 @@ typedef struct cchmac_ctx* cchmac_ctx_t;
 #define cchmac_di_clear(_di_, _name_) cchmac_ctx_clear((_di_)->state_size, (_di_)->block_size, _name_)
 
 /* Return a ccdigest_ctx_t which can be accesed with the macros in ccdigest.h */
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define cchmac_digest_ctx(_di_, HC)    (((cchmac_ctx_t)(HC)).digest)
-#else
 #define cchmac_digest_ctx(_di_, HC)    ((ccdigest_ctx_t)(HC))
-#endif
 
 /* Accesors for ostate fields, this is all cchmac_ctx_t adds to the ccdigest_ctx_t. */
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define cchmac_ostate(_di_, HC)    ((struct ccdigest_state *)(((cchmac_ctx_t)(HC)).hdr->b + ccdigest_di_size(_di_)))
-#else
 #define cchmac_ostate(_di_, HC)    ((struct ccdigest_state *)(((cchmac_ctx_t)(HC))->b + ccdigest_di_size(_di_)))
-#endif
 #define cchmac_ostate8(_di_, HC)   (ccdigest_u8(cchmac_ostate(_di_, HC)))
 #define cchmac_ostate32(_di_, HC)  (ccdigest_u32(cchmac_ostate(_di_, HC)))
 #define cchmac_ostate64(_di_, HC)  (ccdigest_u64(cchmac_ostate(_di_, HC)))
 #define cchmac_ostateccn(_di_, HC) (ccdigest_ccn(cchmac_ostate(_di_, HC)))
 
 /* Convenience accessors for ccdigest_ctx_t fields. */
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define cchmac_istate(_di_, HC)    ccdigest_state(_di_, ((cchmac_ctx_t)(HC)).digest)
-#else
 #define cchmac_istate(_di_, HC)    ccdigest_state(_di_, ((ccdigest_ctx_t)(HC)))
-#endif
 #define cchmac_istate8(_di_, HC)   ccdigest_u8(cchmac_istate(_di_, HC))
 #define cchmac_istate32(_di_, HC)  ccdigest_u32(cchmac_istate(_di_, HC))
 #define cchmac_istate64(_di_, HC)  ccdigest_u64(cchmac_istate(_di_, HC))
 #define cchmac_istateccn(_di_, HC) ccdigest_ccn(cchmac_istate(_di_, HC))
 
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-#define cchmac_data(_di_, HC)      ccdigest_data(_di_, ((cchmac_ctx_t)(HC)).digest)
-#define cchmac_num(_di_, HC)       ccdigest_num(_di_, ((cchmac_ctx_t)(HC)).digest)
-#define cchmac_nbits(_di_, HC)     ccdigest_nbits(_di_, ((cchmac_ctx_t)(HC)).digest)
-#else
 #define cchmac_data(_di_, HC)      ccdigest_data(_di_, ((ccdigest_ctx_t)(HC)))
 #define cchmac_num(_di_, HC)       ccdigest_num(_di_, ((ccdigest_ctx_t)(HC)))
 #define cchmac_nbits(_di_, HC)     ccdigest_nbits(_di_, ((ccdigest_ctx_t)(HC)))
-#endif
 
 void cchmac_init(const struct ccdigest_info *di, cchmac_ctx_t ctx,
                  size_t key_len, const void *key);
@@ -88,20 +63,4 @@ void cchmac(const struct ccdigest_info *di, size_t key_len,
             const void *key, size_t data_len, const void *data,
             unsigned char *mac);
 
-/* Test functions */
-
-struct cchmac_test_input {
-    const struct ccdigest_info *di;
-    size_t key_len;
-    const void *key;
-    size_t data_len;
-    const void *data;
-    size_t mac_len;
-    const void *expected_mac;
-};
-
-int cchmac_test(const struct cchmac_test_input *input);
-int cchmac_test_chunks(const struct cchmac_test_input *input, size_t chunk_size);
-
-
 #endif /* _CORECRYPTO_CCHMAC_H_ */
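
The HMAC test-vector plumbing moves out of the public header, leaving cchmac_init() and friends plus the one-shot cchmac(), whose prototype appears just above. A minimal sketch (ccsha256_di() again assumed from ccsha2.h):

    #include <corecrypto/cchmac.h>
    #include <corecrypto/ccsha2.h>   /* ccsha256_di(); assumed */

    static void hmac_sha256_demo(const void *key, size_t key_len,
                                 const void *msg, size_t msg_len,
                                 unsigned char mac[32])
    {
        cchmac(ccsha256_di(), key_len, key, msg_len, msg, mac);
    }
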
diff --git a/EXTERNAL_HEADERS/corecrypto/cckprng.h b/EXTERNAL_HEADERS/corecrypto/cckprng.h
new file mode 100644 (file)
index 0000000..5e5bfca
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ *  cckprng.h
+ *  corecrypto
+ *
+ *  Created on 12/7/2017
+ *
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ */
+
+#ifndef _CORECRYPTO_CCKPRNG_H_
+#define _CORECRYPTO_CCKPRNG_H_
+
+#include <corecrypto/cc.h>
+
+typedef struct PRNG *PrngRef;
+typedef struct cckprng_ctx *cckprng_ctx_t;
+
+struct cckprng_ctx {
+    PrngRef prng;
+    uint64_t bytes_since_entropy;
+    uint64_t bytes_generated;
+};
+
+#define CCKPRNG_ENTROPY_INTERVAL (1 << 14)
+#define CCKPRNG_RESEED_NTICKS 50
+
+/*
+  @function cckprng_init
+  @abstract Initialize a kernel PRNG context.
+
+  @param ctx Context for this instance
+  @param nbytes Length of the seed in bytes
+  @param seed Pointer to a high-entropy seed
+
+  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
+*/
+int cckprng_init(cckprng_ctx_t ctx, size_t nbytes, const void *seed);
+
+/*
+  @function cckprng_reseed
+  @abstract Reseed a kernel PRNG context immediately.
+
+  @param ctx Context for this instance
+  @param nbytes Length of the seed in bytes
+  @param seed Pointer to a high-entropy seed
+
+  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
+*/
+int cckprng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void *seed);
+
+/*
+  @function cckprng_addentropy
+  @abstract Add entropy to a kernel PRNG context.
+
+  @param ctx Context for this instance
+  @param nbytes Length of the input entropy in bytes
+  @param entropy Pointer to input entropy
+
+  @result @p CCKPRNG_OK iff successful. Panic on @p CCKPRNG_ABORT.
+
+  @discussion Input entropy is stored internally and consumed at the
+  opportune moment. This will not necessarily be before the next call
+  to @p cckprng_generate. To force an immediate reseed, call @p
+  cckprng_reseed.
+*/
+int cckprng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void *entropy);
+
+/*
+  @function cckprng_generate
+  @abstract Generate random values for use in applications.
+
+  @param ctx Context for this instance
+  @param nbytes Length of the desired output in bytes
+  @param out Pointer to the output buffer
+
+  @result @p CCKPRNG_OK iff successful. Panic on @p
+  CCKPRNG_ABORT. Provide input to @p cckprng_addentropy on @p
+  CCKPRNG_NEED_ENTROPY.
+*/
+int cckprng_generate(cckprng_ctx_t ctx, size_t nbytes, void *out);
+
+#endif /* _CORECRYPTO_CCKPRNG_H_ */
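
Putting the documented calls together, a minimal sketch of the intended sequence: seed once, then generate, feeding entropy back when the generator reports CCKPRNG_NEED_ENTROPY (the status codes live in the new cc_error.h above):

    #include <corecrypto/cc_error.h>
    #include <corecrypto/cckprng.h>

    static int kprng_demo(cckprng_ctx_t ctx,
                          const void *seed, size_t seed_nbytes,
                          const void *entropy, size_t entropy_nbytes,
                          void *out, size_t out_nbytes)
    {
        int rc = cckprng_init(ctx, seed_nbytes, seed);
        if (rc != CCKPRNG_OK)
            return rc;                       /* CCKPRNG_ABORT: caller must panic */

        rc = cckprng_generate(ctx, out_nbytes, out);
        if (rc == CCKPRNG_NEED_ENTROPY) {
            /* cckprng_reseed() would force an immediate reseed instead. */
            rc = cckprng_addentropy(ctx, entropy_nbytes, entropy);
            if (rc == CCKPRNG_OK)
                rc = cckprng_generate(ctx, out_nbytes, out);
        }
        return rc;
    }
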
index 602fb0868974a034d63b695480074661cf48cf3f..7e97a76f22b60771f2fed881841a3410fc0782d1 100644 (file)
@@ -17,8 +17,6 @@
 #define CCMD5_OUTPUT_SIZE  16
 #define CCMD5_STATE_SIZE   16
 
-extern const uint32_t ccmd5_initial_state[4];
-
 /* Selector */
 const struct ccdigest_info *ccmd5_di(void);
 
index 668ea9d593b9d00758e2ff8a167c0bba6340d75a..a9498d1f7a03fce1ba0e11a5d61a7dc57822fc75 100644 (file)
@@ -14,7 +14,7 @@
 #include <corecrypto/ccn.h>  /* TODO: Remove dependency on this header. */
 #include <corecrypto/ccmode_impl.h>
 
-/* Function and macros defined in this file are only to be used 
+/* Function and macros defined in this file are only to be used
  within corecrypto files.
  */
 
@@ -83,68 +83,6 @@ const struct ccmode_xts *cc##_cipher_##_xts_##_dir_##_mode(void)
     return &xts##_cipher_##_##_dir_;                                            \
 }
 
-#if 0
-
-/* example of how to make the selection function thread safe */
-
-struct ccmode_cbc cc3des_cbc_mode_encrypt;
-dispatch_once_t cc3des_mode_encrypt_init_once;
-
-void cc3des_mode_encrypt_init(void *ctx) {
-    struct ccmode_ecb *ecb = cc3des_ecb_encrypt_mode();
-    ccmode_factory_cbc_encrypt(&cc3des_mode_encrypt, ecb);
-}
-
-const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) {
-    dispatch_once_f(&cc3des_mode_encrypt_init_once, NULL, cc3des_mode_encrypt_init);
-    return &cc3des_mode_encrypt;
-}
-
-struct ccmode_cbc cc3des_cbc_mode_encrypt = {
-    .n = CC3DES_LTC_ECB_ENCRYPT_N,
-    .init = ccmode_cbc_init,
-    .cbc = ccmode_cbc_encrypt,
-    .custom = &cc3des_ltc_ecb_encrypt
-};
-
-const struct ccmode_cbc *cc3des_cbc_encrypt_mode(void) {
-    return &cc3des_mode_encrypt;
-}
-
-#endif
-
-
-
-int ccmode_cbc_init(const struct ccmode_cbc *cbc, cccbc_ctx *ctx,
-                    size_t rawkey_len, const void *rawkey);
-int ccmode_cbc_decrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks,
-                       const void *in, void *out);
-int ccmode_cbc_encrypt(const cccbc_ctx *ctx, cccbc_iv *iv, size_t nblocks,
-                       const void *in, void *out);
-
-struct _ccmode_cbc_key {
-    const struct ccmode_ecb *ecb;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_cbc object for decryption. */
-#define CCMODE_FACTORY_CBC_DECRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = (ECB)->block_size, \
-.init = ccmode_cbc_init, \
-.cbc = ccmode_cbc_decrypt, \
-.custom = (ECB) \
-}
-
-/* Use this to statically initialize a ccmode_cbc object for encryption. */
-#define CCMODE_FACTORY_CBC_ENCRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cbc_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = (ECB)->block_size, \
-.init = ccmode_cbc_init, \
-.cbc = ccmode_cbc_encrypt, \
-.custom = (ECB) \
-}
-
 /* Use these function to runtime initialize a ccmode_cbc decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb decrypt mode implementation of some underlying algorithm as the ecb
@@ -160,37 +98,6 @@ void ccmode_factory_cbc_encrypt(struct ccmode_cbc *cbc,
                                 const struct ccmode_ecb *ecb);
 
 
-int ccmode_cfb_init(const struct ccmode_cfb *cfb, cccfb_ctx *ctx,
-                    size_t rawkey_len, const void *rawkey,
-                    const void *iv);
-int ccmode_cfb_decrypt(cccfb_ctx *ctx, size_t nbytes,
-                       const void *in, void *out);
-int ccmode_cfb_encrypt(cccfb_ctx *ctx, size_t nbytes,
-                       const void *in, void *out);
-struct _ccmode_cfb_key {
-    const struct ccmode_ecb *ecb;
-    size_t pad_len;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_cfb object for decryption. */
-#define CCMODE_FACTORY_CFB_DECRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = 1, \
-.init = ccmode_cfb_init, \
-.cfb = ccmode_cfb_decrypt, \
-.custom = (ECB) \
-}
-
-/* Use this to statically initialize a ccmode_cfb object for encryption. */
-#define CCMODE_FACTORY_CFB_ENCRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = 1, \
-.init = ccmode_cfb_init, \
-.cfb = ccmode_cfb_encrypt, \
-.custom = (ECB) \
-}
-
 /* Use these function to runtime initialize a ccmode_cfb decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb encrypt mode implementation of some underlying algorithm as the ecb
@@ -205,36 +112,6 @@ void ccmode_factory_cfb_decrypt(struct ccmode_cfb *cfb,
 void ccmode_factory_cfb_encrypt(struct ccmode_cfb *cfb,
                                 const struct ccmode_ecb *ecb);
 
-int ccmode_cfb8_init(const struct ccmode_cfb8 *cfb8, cccfb8_ctx *ctx,
-                     size_t rawkey_len, const void *rawkey, const void *iv);
-int ccmode_cfb8_decrypt(cccfb8_ctx *ctx, size_t nbytes,
-                        const void *in, void *out);
-int ccmode_cfb8_encrypt(cccfb8_ctx *ctx, size_t nbytes,
-                        const void *in, void *out);
-
-struct _ccmode_cfb8_key {
-    const struct ccmode_ecb *ecb;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_cfb8 object for decryption. */
-#define CCMODE_FACTORY_CFB8_DECRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = 1, \
-.init = ccmode_cfb8_init, \
-.cfb8 = ccmode_cfb8_decrypt, \
-.custom = (ECB) \
-}
-
-/* Use this to statically initialize a ccmode_cfb8 object for encryption. */
-#define CCMODE_FACTORY_CFB8_ENCRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_cfb8_key)) + 2 * ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = 1, \
-.init = ccmode_cfb8_init, \
-.cfb8 = ccmode_cfb8_encrypt, \
-.custom = (ECB) \
-}
-
 /* Use these function to runtime initialize a ccmode_cfb8 decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb decrypt mode implementation of some underlying algorithm as the ecb
@@ -249,29 +126,6 @@ void ccmode_factory_cfb8_decrypt(struct ccmode_cfb8 *cfb8,
 void ccmode_factory_cfb8_encrypt(struct ccmode_cfb8 *cfb8,
                                  const struct ccmode_ecb *ecb);
 
-int ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *ctx,
-                    size_t rawkey_len, const void *rawkey, const void *iv);
-int ccmode_ctr_setctr(const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr);
-int ccmode_ctr_crypt(ccctr_ctx *ctx, size_t nbytes,
-                     const void *in, void *out);
-
-struct _ccmode_ctr_key {
-    const struct ccmode_ecb *ecb;
-    size_t pad_offset;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_ctr object for decryption. */
-#define CCMODE_FACTORY_CTR_CRYPT(ECB_ENCRYPT) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_ctr_key)) + 2 * ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \
-.block_size = 1, \
-.ecb_block_size = (ECB_ENCRYPT)->block_size, \
-.init = ccmode_ctr_init, \
-.setctr = ccmode_ctr_setctr, \
-.ctr = ccmode_ctr_crypt, \
-.custom = (ECB_ENCRYPT) \
-}
-
 /* Use these function to runtime initialize a ccmode_ctr decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb encrypt mode implementation of some underlying algorithm as the ecb
@@ -279,68 +133,6 @@ struct _ccmode_ctr_key {
 void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr,
                               const struct ccmode_ecb *ecb);
 
-
-/* Create a gcm key from a gcm mode object.
- key must point to at least sizeof(CCMODE_GCM_KEY(ecb)) bytes of free
- storage. */
-int ccmode_gcm_init(const struct ccmode_gcm *gcm, ccgcm_ctx *ctx,
-                     size_t rawkey_len, const void *rawkey);
-int ccmode_gcm_set_iv(ccgcm_ctx *ctx, size_t iv_nbytes, const void *iv);
-int ccmode_gcm_aad(ccgcm_ctx *ctx, size_t nbytes, const void *in);
-int ccmode_gcm_decrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in,
-                        void *out);
-int ccmode_gcm_encrypt(ccgcm_ctx *ctx, size_t nbytes, const void *in,
-                        void *out);
-
-/*!
- @function  ccmode_gcm_finalize() finalizes AES-GCM call sequence
- @param key encryption or decryption key
- @param tag_nbytes length of tag in bytes
- @param tag authentication tag
- @result       0=success or non zero= error
- @discussion For decryption, the tag parameter must be the expected-tag. A secure compare is performed between the provided expected-tag and the computed-tag. If they are the same, 0 is returned. Otherwise, non zero is returned. For encryption, tag is output and provides the authentication tag.
-
- */
-int ccmode_gcm_finalize(ccgcm_ctx *key, size_t tag_nbytes, void *tag);
-int ccmode_gcm_reset(ccgcm_ctx *key);
-
-#define CCGCM_FLAGS_INIT_WITH_IV 1
-
-// Here is what the structure looks like in memory
-// [ temp space | length | *ecb | *ecb_key | table | ecb_key ]
-// size of table depends on the implementation (VNG vs factory)
-// currently, VNG and factory share the same "header" described here
-// VNG may add additional data after the header
-struct _ccmode_gcm_key {
-    // 5 blocks of temp space.
-    unsigned char H[16];       /* multiplier */
-    unsigned char X[16];       /* accumulator */
-    unsigned char Y[16];       /* counter */
-    unsigned char Y_0[16];     /* initial counter */
-    unsigned char buf[16];      /* buffer for stuff */
-
-    // State and length
-    uint16_t state;        /* state the GCM code is in */
-    uint16_t flags;        /* flags (persistent across reset) */
-    uint32_t buf_nbytes;   /* length of data in buf */
-
-    uint64_t aad_nbytes;   /* 64-bit counter used for IV and AAD */
-    uint64_t text_nbytes;  /* 64-bit counter for the plaintext PT */
-
-    // ECB
-    const struct ccmode_ecb *ecb;              // ecb mode
-    // Pointer to the ECB key in the buffer
-    void *ecb_key;                             // address of the ecb_key in u, set in init function
-    int encdec; //is it an encrypt or decrypt object
-
-    // Buffer with ECB key and H table if applicable
-    CC_ALIGNED(16) unsigned char u[]; // ecb key + tables
-};
-
-#define GCM_ECB_KEY_SIZE(ECB_ENCRYPT) \
-        ((5 * ccn_sizeof_size((ECB_ENCRYPT)->block_size)) \
-    + ccn_sizeof_size((ECB_ENCRYPT)->size))
-
 /* Use these function to runtime initialize a ccmode_gcm decrypt object (for
  example if it's part of a larger structure). For GCM you always pass a
  ecb encrypt mode implementation of some underlying algorithm as the ecb
@@ -355,72 +147,6 @@ void ccmode_factory_gcm_decrypt(struct ccmode_gcm *gcm,
 void ccmode_factory_gcm_encrypt(struct ccmode_gcm *gcm,
                                 const struct ccmode_ecb *ecb_encrypt);
 
-
-/* CCM (only NIST approved with AES) */
-int ccmode_ccm_init(const struct ccmode_ccm *ccm, ccccm_ctx *ctx,
-                     size_t rawkey_len, const void *rawkey);
-int ccmode_ccm_set_iv(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nonce_len, const void *nonce,
-                       size_t mac_size, size_t auth_len, size_t data_len);
-/* internal function */
-void ccmode_ccm_macdata(ccccm_ctx *key, ccccm_nonce *nonce_ctx, unsigned new_block, size_t nbytes, const void *in);
-/* api function - disallows only mac'd data after data to encrypt was sent */
-int ccmode_ccm_cbcmac(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in);
-/* internal function */
-void ccmode_ccm_crypt(ccccm_ctx *key, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in, void *out);
-int ccmode_ccm_decrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in,
-                        void *out);
-int ccmode_ccm_encrypt(ccccm_ctx *ctx, ccccm_nonce *nonce_ctx, size_t nbytes, const void *in,
-                        void *out);
-int ccmode_ccm_finalize(ccccm_ctx *key, ccccm_nonce *nonce_ctx, void *mac);
-int ccmode_ccm_reset(ccccm_ctx *key, ccccm_nonce *nonce_ctx);
-
-struct _ccmode_ccm_key {
-    const struct ccmode_ecb *ecb;
-    cc_unit u[];
-};
-
-struct _ccmode_ccm_nonce {
-    unsigned char A_i[16];      /* crypto block iv */
-    unsigned char B_i[16];      /* mac block iv */
-    unsigned char MAC[16];      /* crypted mac */
-    unsigned char buf[16];      /* crypt buffer */
-
-    uint32_t mode;         /* mode: IV -> AD -> DATA */
-    uint32_t buflen;       /* length of data in buf */
-    uint32_t b_i_len;      /* length of cbcmac data in B_i */
-
-    size_t nonce_size;
-    size_t mac_size;
-};
-
-/* Use this to statically initialize a ccmode_ccm object for decryption. */
-#define CCMODE_FACTORY_CCM_DECRYPT(ECB_ENCRYPT) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \
-.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \
-.block_size = 1, \
-.init = ccmode_ccm_init, \
-.set_iv = ccmode_ccm_set_iv, \
-.cbcmac = ccmode_ccm_cbcmac, \
-.ccm = ccmode_ccm_decrypt, \
-.finalize = ccmode_ccm_finalize, \
-.reset = ccmode_ccm_reset, \
-.custom = (ECB_ENCRYPT) \
-}
-
-/* Use this to statically initialize a ccmode_ccm object for encryption. */
-#define CCMODE_FACTORY_CCM_ENCRYPT(ECB_ENCRYPT) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_key)) + ccn_sizeof_size((ECB_ENCRYPT)->block_size) + ccn_sizeof_size((ECB_ENCRYPT)->size), \
-.nonce_size = ccn_sizeof_size(sizeof(struct _ccmode_ccm_nonce)), \
-.block_size = 1, \
-.init = ccmode_ccm_init, \
-.set_iv = ccmode_ccm_set_iv, \
-.cbcmac = ccmode_ccm_cbcmac, \
-.ccm = ccmode_ccm_encrypt, \
-.finalize = ccmode_ccm_finalize, \
-.reset = ccmode_ccm_reset, \
-.custom = (ECB_ENCRYPT) \
-}
-
 /* Use these function to runtime initialize a ccmode_ccm decrypt object (for
  example if it's part of a larger structure). For CCM you always pass a
  ecb encrypt mode implementation of some underlying algorithm as the ecb
@@ -436,28 +162,6 @@ void ccmode_factory_ccm_decrypt(struct ccmode_ccm *ccm,
 void ccmode_factory_ccm_encrypt(struct ccmode_ccm *ccm,
                                 const struct ccmode_ecb *ecb_encrypt);
 
-
-int ccmode_ofb_init(const struct ccmode_ofb *ofb, ccofb_ctx *ctx,
-                    size_t rawkey_len, const void *rawkey,
-                    const void *iv);
-int ccmode_ofb_crypt(ccofb_ctx *ctx, size_t nbytes,
-                     const void *in, void *out);
-
-struct _ccmode_ofb_key {
-    const struct ccmode_ecb *ecb;
-    size_t pad_len;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_ofb object. */
-#define CCMODE_FACTORY_OFB_CRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_ofb_key)) + ccn_sizeof_size((ECB)->block_size) + ccn_sizeof_size((ECB)->size), \
-.block_size = 1, \
-.init = ccmode_ofb_init, \
-.ofb = ccmode_ofb_crypt, \
-.custom = (ECB) \
-}
-
 /* Use these function to runtime initialize a ccmode_ofb encrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb encrypt mode implementation of some underlying algorithm as the ecb
@@ -465,44 +169,6 @@ struct _ccmode_ofb_key {
 void ccmode_factory_ofb_crypt(struct ccmode_ofb *ofb,
                               const struct ccmode_ecb *ecb);
 
-int ccmode_omac_decrypt(ccomac_ctx *ctx, size_t nblocks,
-                        const void *tweak, const void *in, void *out);
-int ccmode_omac_encrypt(ccomac_ctx *ctx, size_t nblocks,
-                        const void *tweak, const void *in, void *out);
-
-/* Create a omac key from a omac mode object.  The tweak_len here
- determines how long the tweak is in bytes, for each subsequent call to
- ccmode_omac->omac().
- key must point to at least sizeof(CCMODE_OMAC_KEY(ecb)) bytes of free
- storage. */
-int ccmode_omac_init(const struct ccmode_omac *omac, ccomac_ctx *ctx,
-                     size_t tweak_len, size_t rawkey_len,
-                     const void *rawkey);
-
-struct _ccmode_omac_key {
-    const struct ccmode_ecb *ecb;
-    size_t tweak_len;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_omac object for decryption. */
-#define CCMODE_FACTORY_OMAC_DECRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \
-.block_size = (ECB)->block_size, \
-.init = ccmode_omac_init, \
-.omac = ccmode_omac_decrypt, \
-.custom = (ECB) \
-}
-
-/* Use this to statically initialize a ccmode_omac object for encryption. */
-#define CCMODE_FACTORY_OMAC_ENCRYPT(ECB) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_omac_key)) + 2 * ccn_sizeof_size((ECB)->size), \
-.block_size = (ECB)->block_size, \
-.init = ccmode_omac_init, \
-.omac = ccmode_omac_encrypt, \
-.custom = (ECB) \
-}
-
 /* Use these function to runtime initialize a ccmode_omac decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb decrypt mode implementation of some underlying algorithm as the ecb
@@ -517,62 +183,6 @@ void ccmode_factory_omac_decrypt(struct ccmode_omac *omac,
 void ccmode_factory_omac_encrypt(struct ccmode_omac *omac,
                                  const struct ccmode_ecb *ecb);
 
-
-/* Function prototypes used by the macros below, do not call directly. */
-int ccmode_xts_init(const struct ccmode_xts *xts, ccxts_ctx *ctx,
-                    size_t key_nbytes, const void *data_key,
-                    const void *tweak_key);
-void ccmode_xts_key_sched(const struct ccmode_xts *xts, ccxts_ctx *ctx,
-                          size_t key_nbytes, const void *data_key,
-                          const void *tweak_key);
-void *ccmode_xts_crypt(const ccxts_ctx *ctx, ccxts_tweak *tweak,
-                       size_t nblocks, const void *in, void *out);
-int ccmode_xts_set_tweak(const ccxts_ctx *ctx, ccxts_tweak *tweak,
-                         const void *iv);
-
-
-struct _ccmode_xts_key {
-    const struct ccmode_ecb *ecb;
-    const struct ccmode_ecb *ecb_encrypt;
-    cc_unit u[];
-};
-
-struct _ccmode_xts_tweak {
-    // FIPS requires that for XTS that no more that 2^20 AES blocks may be processed for any given
-    // Key, Tweak Key, and tweak combination
-    // the bytes_processed field in the context will accumuate the number of blocks processed and
-    // will fail the encrypt/decrypt if the size is violated.  This counter will be reset to 0
-    // when set_tweak is called.
-    size_t  blocks_processed;
-    cc_unit u[];
-};
-
-/* Use this to statically initialize a ccmode_xts object for decryption. */
-#define CCMODE_FACTORY_XTS_DECRYPT(ECB, ECB_ENCRYPT) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \
-.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \
-.block_size = ecb->block_size, \
-.init = ccmode_xts_init, \
-.key_sched = ccmode_xts_key_sched, \
-.set_tweak = ccmode_xts_set_tweak, \
-.xts = ccmode_xts_crypt, \
-.custom = (ECB), \
-.custom1 = (ECB_ENCRYPT) \
-}
-
-/* Use this to statically initialize a ccmode_xts object for encryption. */
-#define CCMODE_FACTORY_XTS_ENCRYPT(ECB, ECB_ENCRYPT) { \
-.size = ccn_sizeof_size(sizeof(struct _ccmode_xts_key)) + 2 * ccn_sizeof_size((ECB)->size), \
-.tweak_size = ccn_sizeof_size(sizeof(struct _ccmode_xts_tweak)) + ccn_sizeof_size(ecb->block_size), \
-.block_size = ecb->block_size, \
-.init = ccmode_xts_init, \
-.key_sched = ccmode_xts_key_sched, \
-.set_tweak = ccmode_xts_set_tweak, \
-.xts = ccmode_xts_crypt, \
-.custom = (ECB), \
-.custom1 = (ECB_ENCRYPT) \
-}
-
 /* Use these function to runtime initialize a ccmode_xts decrypt object (for
  example if it's part of a larger structure). Normally you would pass a
  ecb decrypt mode implementation of some underlying algorithm as the ecb
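
With the CCMODE_FACTORY_* static initializers and the internal ccmode_* prototypes removed from the public header, mode objects are built through the runtime factory functions that remain. A sketch of that pattern for AES-CBC, essentially the dispatch_once example that used to sit in this header minus the once guard:

    #include <corecrypto/ccaes.h>
    #include <corecrypto/ccmode_factory.h>

    static struct ccmode_cbc aes_cbc_encrypt;

    /* Not thread-safe as written; wrap in dispatch_once (or equivalent) in real code. */
    static const struct ccmode_cbc *get_aes_cbc_encrypt_mode(void)
    {
        ccmode_factory_cbc_encrypt(&aes_cbc_encrypt, ccaes_ecb_encrypt_mode());
        return &aes_cbc_encrypt;
    }
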
index 7950541612454f95475de990e47c8bf5d7f7b789..a0c6e24bc67fc5892f9ce1ae43aaa7d2a3ef0e84 100644 (file)
@@ -35,7 +35,7 @@ struct ccmode_ecb {
  *   1- ccmod_xxx_init()
  *   2- ccmod_xxx_decrypt()
  *   3- ccmod_xxx_encrypt()
- * 
+ *
  * stateful modes CCM and GCM: They provide 7 interface functions that return error codes if a function is called out of state
  *   1- ccmod_xxx_init()
  *   2- ccmod_xxx_setiv()
@@ -131,7 +131,7 @@ struct ccmode_xts {
     size_t tweak_size;  /* first argument to ccxts_tweak_decl(). Size of the tweak structure, not the expected tweak size */
     size_t block_size;
 
-    /* Create a xts key from a xts mode object.  
+    /* Create a xts key from a xts mode object.
      key must point to at least 'size' bytes of free storage.
      tweak_key must point to at least 'tweak_size' bytes of free storage.
      key and tweak_key must differ.
@@ -139,7 +139,7 @@ struct ccmode_xts {
      */
     int (*init)(const struct ccmode_xts *xts, ccxts_ctx *ctx,
                 size_t key_nbytes, const void *data_key, const void *tweak_key);
-    
+
     void (*key_sched)(const struct ccmode_xts *xts, ccxts_ctx *ctx,
                       size_t key_nbytes, const void *data_key, const void *tweak_key);
 
@@ -174,7 +174,7 @@ struct ccmode_gcm {
     const void *custom;
 };
 
-//8- GCM mode, statful
+//8- CCM mode, stateful
 cc_aligned_struct(16) ccccm_ctx;
 cc_aligned_struct(16) ccccm_nonce;
 
@@ -193,6 +193,20 @@ struct ccmode_ccm {
     const void *custom;
 };
 
+/* We need to expose this (currently) to keep CommonCrypto happy. */
+struct _ccmode_ccm_nonce {
+    unsigned char A_i[16];      /* crypto block iv */
+    unsigned char B_i[16];      /* mac block iv */
+    unsigned char MAC[16];      /* crypted mac */
+    unsigned char buf[16];      /* crypt buffer */
+
+    uint32_t mode;         /* mode: IV -> AD -> DATA */
+    uint32_t buflen;       /* length of data in buf */
+    uint32_t b_i_len;      /* length of cbcmac data in B_i */
+
+    size_t nonce_size;
+    size_t mac_size;
+};
 
 /* OMAC mode. */
 cc_aligned_struct(16) ccomac_ctx;
index 5186e12273185f825fd02b9dbc1345e2f1724b71..99322ad2de82e6047067c94d928ac4a77a049a4c 100644 (file)
@@ -126,13 +126,4 @@ CC_INLINE int ccsiv_one_shot(const struct ccmode_siv *mode,
     return rc;
 }
 
-void ccmode_factory_siv_encrypt(struct ccmode_siv *siv,
-                                const struct ccmode_cbc *cbc,
-                                const struct ccmode_ctr *ctr);
-
-void ccmode_factory_siv_decrypt(struct ccmode_siv *siv,
-                                const struct ccmode_cbc *cbc,
-                                const struct ccmode_ctr *ctr);
-
-
 #endif /* _CORECRYPTO_CCMODE_H_ */
index afaed41ae2b4eaa5689c6781e3b1ff4e87a0991e..2d3e847c9cc6fe683ec91d65c839635632518f3f 100644 (file)
@@ -94,6 +94,8 @@ typedef struct {
 /* Returns the count (n) of a ccn vector that can represent _size_ bytes. */
 #define ccn_nof_size(_size_)  (((_size_) + CCN_UNIT_SIZE - 1) / CCN_UNIT_SIZE)
 
+#define ccn_nof_sizeof(_expr_) ccn_nof_size(sizeof (_expr_))
+
 /* Return the max number of bits a ccn vector of _n_ units can hold. */
 #define ccn_bitsof_n(_n_)  ((_n_) * CCN_UNIT_BITS)
 
@@ -283,7 +285,7 @@ typedef struct {
 #define CCN521_N  ccn_nof(521)
 
 /* Return the number of used units after stripping leading 0 units.  */
-CC_PURE CC_NONNULL2
+CC_PURE CC_NONNULL((2))
 cc_size ccn_n(cc_size n, const cc_unit *s);
 
 /* s >> k -> r return bits shifted out of least significant word in bits [0, n>
@@ -292,29 +294,13 @@ cc_size ccn_n(cc_size n, const cc_unit *s);
  word shifts.  */
 CC_NONNULL((2, 3))
 cc_unit ccn_shift_right(cc_size n, cc_unit *r, const cc_unit *s, size_t k);
-CC_NONNULL((2, 3))
-void ccn_shift_right_multi(cc_size n, cc_unit *r,const cc_unit *s, size_t k);
-
-/* s << k -> r return bits shifted out of most significant word in bits [0, n>
- { N bit, scalar -> N bit } N = n * sizeof(cc_unit) * 8
- the _multi version doesn't return the shifted bits, but does support multiple
- word shifts */
-CC_NONNULL((2, 3))
-cc_unit ccn_shift_left(cc_size n, cc_unit *r, const cc_unit *s, size_t k);
-CC_NONNULL((2, 3))
-void ccn_shift_left_multi(cc_size n, cc_unit *r, const cc_unit *s, size_t k);
 
 /* s == 0 -> return 0 | s > 0 -> return index (starting at 1) of most
  significant bit that is 1.
  { N bit } N = n * sizeof(cc_unit) * 8 */
-CC_NONNULL2
+CC_NONNULL((2))
 size_t ccn_bitlen(cc_size n, const cc_unit *s);
 
-/* Returns the number of bits which are zero before the first one bit
-   counting from least to most significant bit. */
-CC_NONNULL2
-size_t ccn_trailing_zeros(cc_size n, const cc_unit *s);
-
 /* s == 0 -> return true | s != 0 -> return false
  { N bit } N = n * sizeof(cc_unit) * 8 */
 #define ccn_is_zero(_n_, _s_) (!ccn_n(_n_, _s_))
@@ -348,9 +334,6 @@ int ccn_cmpn(cc_size ns, const cc_unit *s,
 CC_NONNULL((2, 3, 4))
 cc_unit ccn_sub(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
 
-/* |s - t| -> r return 1 iff t > s, 0 otherwise */
-cc_unit ccn_abs(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
-
 /* s - v -> r return 1 iff v > s return 0 otherwise.
  { N bit, sizeof(cc_unit) * 8 bit -> N bit } N = n * sizeof(cc_unit) * 8 */
 CC_NONNULL((2, 3))
@@ -388,23 +371,12 @@ cc_unit ccn_addn(cc_size n, cc_unit *r, const cc_unit *s,
 }
 
 
-CC_NONNULL((2, 3, 4))
-void ccn_lcm(cc_size n, cc_unit *r2n, const cc_unit *s, const cc_unit *t);
-
-
 /* s * t -> r_2n                   r_2n must not overlap with s nor t
  { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8
  { N bit, N bit -> 2N bit } N = ccn_bitsof(n) */
 CC_NONNULL((2, 3, 4))
 void ccn_mul(cc_size n, cc_unit *r_2n, const cc_unit *s, const cc_unit *t);
 
-/* s * t -> r_2n                   r_2n must not overlap with s nor t
- { n bit, n bit -> 2 * n bit } n = count * sizeof(cc_unit) * 8
- { N bit, N bit -> 2N bit } N = ccn_bitsof(n) 
- Provide a workspace for potential speedup */
-CC_NONNULL((1, 3, 4, 5))
-void ccn_mul_ws(cc_ws_t ws, cc_size count, cc_unit *r, const cc_unit *s, const cc_unit *t);
-
 /* s[0..n) * v -> r[0..n)+return value
  { N bit, sizeof(cc_unit) * 8 bit -> N + sizeof(cc_unit) * 8 bit } N = n * sizeof(cc_unit) * 8 */
 CC_NONNULL((2, 3))
@@ -422,28 +394,18 @@ CC_NONNULL((2, 3, 4))
 void ccn_mod(cc_size n, cc_unit *r, const cc_unit *a_2n, const cc_unit *d);
 #endif
 
-/* r = gcd(s, t).
-   N bit, N bit -> N bit */
-CC_NONNULL((2, 3, 4))
-void ccn_gcd(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t);
-
-/* r = gcd(s, t).
- N bit, N bit -> O bit */
-CC_NONNULL((2, 4, 6))
-void ccn_gcdn(cc_size rn, cc_unit *r, cc_size sn, const cc_unit *s, cc_size tn, const cc_unit *t);
-
 /* r = (data, len) treated as a big endian byte array, return -1 if data
  doesn't fit in r, return 0 otherwise. */
 CC_NONNULL((2, 4))
 int ccn_read_uint(cc_size n, cc_unit *r, size_t data_size, const uint8_t *data);
 
 /* r = (data, len) treated as a big endian byte array, return -1 if data
- doesn't fit in r, return 0 otherwise. 
+ doesn't fit in r, return 0 otherwise.
  ccn_read_uint strips leading zeroes and doesn't care about sign. */
 #define ccn_read_int(n, r, data_size, data) ccn_read_uint(n, r, data_size, data)
 
 /* Return actual size in bytes needed to serialize s. */
-CC_PURE CC_NONNULL2
+CC_PURE CC_NONNULL((2))
 size_t ccn_write_uint_size(cc_size n, const cc_unit *s);
 
 /* Serialize s, to out.
@@ -473,9 +435,9 @@ cc_size ccn_write_uint_padded(cc_size n, const cc_unit* s, size_t out_size, uint
 }
 
 
-/*  Return actual size in bytes needed to serialize s as int 
+/*  Return actual size in bytes needed to serialize s as int
     (adding leading zero if high bit is set). */
-CC_PURE CC_NONNULL2
+CC_PURE CC_NONNULL((2))
 size_t ccn_write_int_size(cc_size n, const cc_unit *s);
 
 /*  Serialize s, to out.
@@ -491,55 +453,25 @@ size_t ccn_write_int_size(cc_size n, const cc_unit *s);
 CC_NONNULL((2, 4))
 void ccn_write_int(cc_size n, const cc_unit *s, size_t out_size, void *out);
 
-#if CCN_DEDICATED_SQR
-
-/* s^2 -> r
- { n bit -> 2 * n bit } */
-CC_NONNULL((2, 3))
-void ccn_sqr(cc_size n, cc_unit *r, const cc_unit *s);
-
-/* s^2 -> r
- { n bit -> 2 * n bit } */
-CC_NONNULL((1, 3, 4))
-void ccn_sqr_ws(cc_ws_t ws, cc_size n, cc_unit *r, const cc_unit *s);
-
-#else
-
-/* s^2 -> r
- { n bit -> 2 * n bit } */
-CC_INLINE CC_NONNULL((2, 3))
-void ccn_sqr(cc_size n, cc_unit *r, const cc_unit *s) {
-    ccn_mul(n, r, s, s);
-}
-
-/* s^2 -> r
- { n bit -> 2 * n bit } */
-CC_INLINE CC_NONNULL((2, 3, 4))
-void ccn_sqr_ws(cc_ws_t ws, cc_size n, cc_unit *r, const cc_unit *s) {
-    ccn_mul_ws(ws, n, r, s, s);
-}
-
-#endif
-
 /* s -> r
  { n bit -> n bit } */
 CC_NONNULL((2, 3))
 void ccn_set(cc_size n, cc_unit *r, const cc_unit *s);
 
-CC_INLINE CC_NONNULL2
+CC_INLINE CC_NONNULL((2))
 void ccn_zero(cc_size n, cc_unit *r) {
     cc_zero(ccn_sizeof_n(n),r);
 }
 
-CC_INLINE CC_NONNULL2
+CC_INLINE CC_NONNULL((2))
 void ccn_clear(cc_size n, cc_unit *r) {
     cc_clear(ccn_sizeof_n(n),r);
 }
 
-CC_NONNULL2
+CC_NONNULL((2))
 void ccn_zero_multi(cc_size n, cc_unit *r, ...);
 
-CC_INLINE CC_NONNULL2
+CC_INLINE CC_NONNULL((2))
 void ccn_seti(cc_size n, cc_unit *r, cc_unit v) {
     /* assert(n > 0); */
     r[0] = v;
@@ -589,7 +521,7 @@ void ccn_setn(cc_size n, cc_unit *r, const cc_size s_size, const cc_unit *s) {
 #endif
 
 /* Swap units in r in place from cc_unit vector byte order to big endian byte order (or back). */
-CC_INLINE CC_NONNULL2
+CC_INLINE CC_NONNULL((2))
 void ccn_swap(cc_size n, cc_unit *r) {
     cc_unit *e;
     for (e = r + n - 1; r < e; ++r, --e) {
@@ -609,9 +541,9 @@ void ccn_xor(cc_size n, cc_unit *r, const cc_unit *s, const cc_unit *t) {
 }
 
 /* Debugging */
-CC_NONNULL2
+CC_NONNULL((2))
 void ccn_print(cc_size n, const cc_unit *s);
-CC_NONNULL3
+CC_NONNULL((3))
 void ccn_lprint(cc_size n, const char *label, const cc_unit *s);
 
 /* Forward declaration so we don't depend on ccrng.h. */
@@ -631,16 +563,6 @@ int ccn_random(cc_size n, cc_unit *r, struct ccrng_state *rng) {
 CC_NONNULL((2, 3))
 int ccn_random_bits(cc_size nbits, cc_unit *r, struct ccrng_state *rng);
 
-/*!
- @brief ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d) computes the reciprocal of d: recip = 2^2b/d where b=bitlen(d)
-
- @param nd      length of array d
- @param recip   returned reciprocal of size nd+1
- @param d       input number d
-*/
-CC_NONNULL((2, 3))
-int ccn_make_recip(cc_size nd, cc_unit *recip, const cc_unit *d);
-
 CC_NONNULL((6, 8))
 int ccn_div_euclid(cc_size nq, cc_unit *q, cc_size nr, cc_unit *r, cc_size na, const cc_unit *a, cc_size nd, const cc_unit *d);
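As a sketch, with all operands the same width n and q, r, a, d allocated by the caller:

    /* q = a / d and r = a mod d, each n units wide */
    int rc = ccn_div_euclid(n, q, n, r, n, a, n, d);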
 
index 3b50710a3f5cd137918c636d3f40d78f56d35cd5..eaf644d1db8471deeb49ef4b3ddd828298195839 100644 (file)
@@ -26,19 +26,8 @@ struct ccrc4_info {
     void (*crypt)(ccrc4_ctx *ctx, size_t nbytes, const void *in, void *out);
 };
 
-
 const struct ccrc4_info *ccrc4(void);
 
 extern const struct ccrc4_info ccrc4_eay;
 
-struct ccrc4_vector {
-    size_t keylen;
-    const void *key;
-    size_t datalen;
-    const void *pt;
-    const void *ct;
-};
-
-int ccrc4_test(const struct ccrc4_info *rc4, const struct ccrc4_vector *v);
-
 #endif /* _CORECRYPTO_CCRC4_H_ */
index 698f412ca76a763b3fb5b4e3a8e49c88553de3d0..c6bc18a90b339c3a511cdfdb5f6b05ff168f43ca 100644 (file)
 
 #include <corecrypto/cc.h>
 
-#define CCERR_DEVICE                   -100
-#define CCERR_INTERUPTS                -101
-#define CCERR_CRYPTO_CONFIG            -102
-#define CCERR_PERMS                    -103
-#define CCERR_PARAMETER                -104
-#define CCERR_MEMORY                   -105
-#define CCERR_FILEDESC                 -106
-#define CCERR_OUT_OF_ENTROPY           -107
-#define CCERR_INTERNAL                 -108
-#define CCERR_ATFORK                   -109
-#define CCERR_OVERFLOW                 -110
-
 #define CCRNG_STATE_COMMON                                                          \
     int (*generate)(struct ccrng_state *rng, size_t outlen, void *out);
 
@@ -55,6 +43,6 @@ struct ccrng_state {
 struct ccrng_state *ccrng(int *error);
 
 // call this macro with the rng argument set to the output of the ccrng() function
-#define ccrng_generate(rng, outlen, out) ((rng)->generate((rng), (outlen), (out)))
+#define ccrng_generate(rng, outlen, out) ((rng)->generate((struct ccrng_state *)(rng), (outlen), (out)))
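A minimal sketch of the intended call pattern described by the comment above:

    int status;
    struct ccrng_state *rng = ccrng(&status);             /* NULL on failure; status holds the error */
    uint8_t buf[16];
    if (rng != NULL) {
        status = ccrng_generate(rng, sizeof(buf), buf);   /* fill buf with random bytes */
    }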
 
 #endif /* _CORECRYPTO_CCRNG_H_ */
index c821efc4098a2c457f52e016d385b7e1ec077e49..0f70c3740221198fa8e4012ea584c4ac9bda0997 100644 (file)
 // This limit is relaxed to accommodate potential third-party consumers
 #define CCRSA_KEYGEN_MAX_NBITS 8192
 
-// Program error: buffer too small or encrypted message is too small
-#define CCRSA_INVALID_INPUT        -1
-// Invalid crypto configuration: Hash length versus RSA key size
-#define CCRSA_INVALID_CONFIG       -2
-// The data is invalid (we won't say more for security
-#define CCRSA_DECRYPTION_ERROR     -3
-
-#define CCRSA_ENCODING_ERROR       -4
-#define CCRSA_DECODING_ERROR       -5
-#define CCRSA_SIGNATURE_GEN_ERROR  -6
-
 struct ccrsa_full_ctx {
     __CCZP_ELEMENTS_DEFINITIONS(pb_)
 } CC_ALIGNED(CCN_UNIT_SIZE);
@@ -44,32 +33,9 @@ struct ccrsa_priv_ctx {
     __CCZP_ELEMENTS_DEFINITIONS(pv_)
 } CC_ALIGNED(CCN_UNIT_SIZE);
 
-
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-    typedef union {
-        cczp_t zp;
-        struct ccrsa_pub_ctx* pub;
-        struct ccrsa_full_ctx *full;
-    } ccrsa_full_ctx_t __attribute__((transparent_union));
-    typedef struct ccrsa_full_ctx ccrsa_full_ctx;
-    typedef struct ccrsa_priv_ctx ccrsa_priv_ctx;
-
-    typedef union {
-        cczp_t zp;
-        ccrsa_priv_ctx *priv;
-    } ccrsa_priv_ctx_t __attribute__((transparent_union));
-
-
-typedef ccrsa_full_ctx_t ccrsa_pub_ctx_t;
-typedef struct ccrsa_pub_ctx ccrsa_pub_ctx;
-
-#else
-    typedef struct ccrsa_full_ctx* ccrsa_full_ctx_t;
-    typedef struct ccrsa_pub_ctx* ccrsa_pub_ctx_t;
-    typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t;
-#endif
-
-
+typedef struct ccrsa_full_ctx* ccrsa_full_ctx_t;
+typedef struct ccrsa_pub_ctx* ccrsa_pub_ctx_t;
+typedef struct ccrsa_priv_ctx* ccrsa_priv_ctx_t;
 
 /*
  public key cczp      d=e^-1 mod phi(m) priv key cczp             priv key cczq             dp, dq, qinv
@@ -90,7 +56,7 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx;
 
 /* Declare a fully scheduled rsa key.  Size is the size in bytes of each ccn in
    the key.  For example to declare (on the stack or in a struct) a 1021 bit
-   rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo). 
+   rsa public key named foo use ccrsa_pub_ctx_decl(ccn_sizeof(1021), foo).
  */
 #define ccrsa_full_ctx_decl(_size_, _name_)   cc_ctx_decl(struct ccrsa_full_ctx, ccrsa_full_ctx_size(_size_), _name_)
 #define ccrsa_full_ctx_clear(_size_, _name_)  cc_clear(ccrsa_full_ctx_size(_size_), _name_)
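Following the comment above, a sketch of declaring and later wiping a full 2048-bit key; the public-key variant ccrsa_pub_ctx_decl is used the same way:

    ccrsa_full_ctx_decl(ccn_sizeof(2048), full_key);      /* stack storage for a 2048-bit key */
    /* ... generate or import into full_key ... */
    ccrsa_full_ctx_clear(ccn_sizeof(2048), full_key);     /* zero the key material when done */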
@@ -101,19 +67,9 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx;
 // The offsets are computed using pb_ccn. If any object other than ccrsa_full_ctx_t
 // or ccrsa_pub_ctx_t is passed to the macros, compiler error is generated.
 
-
-
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-//#define ccrsa_ctx_zm(_ctx_)        (((ccrsa_pub_ctx_t)(_ctx_)).zp)
-
-    CC_CONST CC_INLINE cczp_t ccrsa_ctx_zm(ccrsa_full_ctx_t _ctx_) { return ((cczp_t)(struct cczp *)((_ctx_).full)); }
-    CC_CONST CC_INLINE cc_unit  *ccrsa_ctx_m(ccrsa_full_ctx_t _ctx_){ return        ((_ctx_).full->pb_ccn);}
-    #define ccrsa_ctx_n(_ctx_)         (ccrsa_ctx_zm(_ctx_).zp->n)
-#else
-    #define ccrsa_ctx_zm(_ctx_)        ((cczp_t)(_ctx_))
-    #define ccrsa_ctx_n(_ctx_)         (ccrsa_ctx_zm(_ctx_)->n)
-    #define ccrsa_ctx_m(_ctx_)         ((_ctx_)->pb_ccn)
-#endif
+#define ccrsa_ctx_zm(_ctx_)        ((cczp_t)(_ctx_))
+#define ccrsa_ctx_n(_ctx_)         (ccrsa_ctx_zm(_ctx_)->n)
+#define ccrsa_ctx_m(_ctx_)         ((_ctx_)->pb_ccn)
 
 #define ccrsa_ctx_e(_ctx_)         (ccrsa_ctx_m(_ctx_) + 2 * ccrsa_ctx_n(_ctx_) + 1)
 #define ccrsa_ctx_d(_ctx_)         (ccrsa_ctx_m(_ctx_) + 3 * ccrsa_ctx_n(_ctx_) + 1)
@@ -121,36 +77,13 @@ typedef struct ccrsa_pub_ctx ccrsa_pub_ctx;
 // accessors to ccrsa private key fields
 // The offsets are computed using pv_ccn. If any object other than ccrsa_priv_ctx_t
 // is passed to the macros, compiler error is generated.
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-
-/* rvalue accessors to ccec_key fields. */
-CC_CONST CC_INLINE
-ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) {
-    cc_unit *p = (cc_unit *)fk.full;
-    cc_size p_size = ccrsa_ctx_n(fk);
-    p += ccn_nof_size(ccrsa_pub_ctx_size(ccn_sizeof_n(p_size))) + p_size;
-    ccrsa_priv_ctx *priv = (ccrsa_priv_ctx *)p;
-    return (ccrsa_priv_ctx_t)priv;
-}
-
-CC_CONST CC_INLINE
-ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) {
-    return (ccrsa_pub_ctx_t) fk.full;
-}
-
-#define ccrsa_ctx_private_zp(FK)   ((ccrsa_get_private_ctx_ptr(FK)).zp)
-#define ccrsa_ctx_private_zq(FK)   ((cczp_t)((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 2 * ccrsa_ctx_private_zp(FK).zp->n + 1))
-#define ccrsa_ctx_private_dp(FK)   ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 4 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp)))
-#define ccrsa_ctx_private_dq(FK)   ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 5 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp)))
-#define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK)).zp.zp->ccn + 6 * ccrsa_ctx_private_zp(FK).zp->n + 2 + ccn_nof_size(sizeof(struct cczp)))
-
-#else
 #define ccrsa_ctx_private_zp(FK)   ((cczp_t)ccrsa_get_private_ctx_ptr(FK))
 #define ccrsa_ctx_private_zq(FK)   ((cczp_t)((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 2 * ccrsa_ctx_private_zp(FK)->n + 1))
 #define ccrsa_ctx_private_dp(FK)   ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 4 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp)))
 #define ccrsa_ctx_private_dq(FK)   ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 5 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp)))
 #define ccrsa_ctx_private_qinv(FK) ((ccrsa_get_private_ctx_ptr(FK))->pv_ccn + 6 * ccrsa_ctx_private_zp(FK)->n + 2 + ccn_nof_size(sizeof(struct cczp)))
 
+/* rvalue accessors to ccec_key fields. */
 CC_CONST CC_INLINE
 ccrsa_priv_ctx_t ccrsa_get_private_ctx_ptr(ccrsa_full_ctx_t fk) {
     ccrsa_priv_ctx_t priv = (ccrsa_priv_ctx_t)(ccrsa_ctx_d(fk)+ccrsa_ctx_n(fk));
@@ -168,8 +101,6 @@ ccrsa_pub_ctx_t ccrsa_ctx_public(ccrsa_full_ctx_t fk) {
     return (ccrsa_pub_ctx_t) fk;
 }
 
-#endif
-
 /* Return exact key bit size */
 static inline size_t
 ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) {
@@ -181,13 +112,13 @@ ccrsa_pubkeylength(ccrsa_pub_ctx_t pubk) {
 #define CCRSA_PKCS1_PAD_ENCRYPT  2
 
 /* Initialize key based on modulus and e as cc_unit.  key->zp.n must already be set. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
+CC_NONNULL((1, 2, 3))
 int ccrsa_init_pub(ccrsa_pub_ctx_t key, const cc_unit *modulus,
                     const cc_unit *e);
 
 /* Initialize key based on modulus and e as big endian byte array
     key->zp.n must already be set. */
-CC_NONNULL_TU((1)) CC_NONNULL((3 ,5))
+CC_NONNULL((1, 3, 5))
 int ccrsa_make_pub(ccrsa_pub_ctx_t pubk,
                               size_t exp_nbytes, const uint8_t *exp,
                               size_t mod_nbytes, const uint8_t *mod);
@@ -196,7 +127,7 @@ int ccrsa_make_pub(ccrsa_pub_ctx_t pubk,
    the result in out. Both in and out should be cc_unit aligned and
    ccrsa_key_n(key) units long. Clients should use ccn_read_uint() to
    convert bytes to a cc_unit to use for this API.*/
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
+CC_NONNULL((1, 2, 3))
 int ccrsa_pub_crypt(ccrsa_pub_ctx_t key, cc_unit *out, const cc_unit *in);
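A sketch of the conversion the comment asks for; key, in_bytes and in_len are placeholders supplied by the caller:

    cc_size n = ccrsa_ctx_n(key);
    cc_unit in_units[n], out_units[n];                    /* ccrsa_ctx_n(key) units each */
    ccn_read_uint(n, in_units, in_len, in_bytes);         /* big-endian bytes -> cc_units */
    int rc = ccrsa_pub_crypt(key, out_units, in_units);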
 
 /* Generate an nbit rsa key pair in key, which should be allocated using
@@ -204,19 +135,19 @@ int ccrsa_pub_crypt(ccrsa_pub_ctx_t key, cc_unit *out, const cc_unit *in);
    byte array exponent e of length e_size is used as the exponent. It's an
    error to call this function with an exponent larger than nbits. rng
    must be a pointer to an initialized struct ccrng_state. */
-CC_NONNULL_TU((2)) CC_NONNULL((4, 5))
+CC_NONNULL((2, 4, 5))
 int ccrsa_generate_key(size_t nbits, ccrsa_full_ctx_t rsa_ctx,
                        size_t e_size, const void *e, struct ccrng_state *rng) CC_WARN_RESULT;
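For illustration only; whether the unit count must be pre-set is not stated here, so this sketch sets it explicitly, and rng is an initialized struct ccrng_state supplied by the caller:

    const uint8_t e65537[] = { 0x01, 0x00, 0x01 };        /* exponent 65537, big endian */
    ccrsa_full_ctx_decl(ccn_sizeof(2048), key);
    ccrsa_ctx_n(key) = ccn_nof(2048);                     /* assumption: set the unit count first */
    int rc = ccrsa_generate_key(2048, key, sizeof(e65537), e65537, rng);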
 
 /* Generate RSA key in conformance with FIPS186-4 standard */
-CC_NONNULL_TU((2)) CC_NONNULL((4, 5, 6))
+CC_NONNULL((2, 4, 5, 6))
 int
 ccrsa_generate_fips186_key(size_t nbits, ccrsa_full_ctx_t fk,
                            size_t e_size, const void *eBytes,
                            struct ccrng_state *rng1, struct ccrng_state *rng2) CC_WARN_RESULT;
 
 /* Construct RSA key from fix input in conformance with FIPS186-4 standard */
-CC_NONNULL_TU((16)) CC_NONNULL((3, 5, 7, 9, 11, 13, 15))
+CC_NONNULL((3, 5, 7, 9, 11, 13, 15, 16))
 int
 ccrsa_make_fips186_key(size_t nbits,
                        const cc_size e_n, const cc_unit *e,
@@ -262,14 +193,14 @@ ccrsa_make_fips186_key(size_t nbits,
  * @param   sigSize           The length of generated signature in bytes, which equals the size of the RSA modulus.
  * @return                   0:ok, non-zero:error
  */
-CC_NONNULL((2,3,5,7,8,9))
+CC_NONNULL((2, 3, 5, 7, 8, 9))
 int ccrsa_sign_pss(ccrsa_full_ctx_t key,
                    const struct ccdigest_info* hashAlgorithm, const struct ccdigest_info* MgfHashAlgorithm,
                    size_t saltSize, struct ccrng_state *rng,
                    size_t hSize, const uint8_t *mHash,
                    size_t *sigSize, uint8_t *sig);
 
-CC_NONNULL((2,3,5,7,9))
+CC_NONNULL((2, 3, 5, 7, 9))
 int ccrsa_verify_pss(ccrsa_pub_ctx_t key,
                      const struct ccdigest_info* di, const struct ccdigest_info* MgfDi,
                      size_t digestSize, const uint8_t *digest,
@@ -290,37 +221,38 @@ int ccrsa_verify_pss(ccrsa_pub_ctx_t key,
                         for the output signature
 
  @result     0 iff successful.
+
   @discussion Null OID is a special case, required to support RFC 4346 where the padding
  is based on SHA1+MD5. In general it is not recommended to use a NULL OID,
  except when strictly required for interoperability
 
  */
-CC_NONNULL_TU((1)) CC_NONNULL((4, 5, 6))
+CC_NONNULL((1, 4, 5, 6))
 int ccrsa_sign_pkcs1v15(ccrsa_full_ctx_t key, const uint8_t *oid,
                         size_t digest_len, const uint8_t *digest,
                         size_t *sig_len, uint8_t *sig);
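A sketch of a signing call; key, oid (the DER-encoded digest OID), digest and digest_len come from elsewhere, and sig_len is assumed to carry the buffer size in and the signature size out:

    uint8_t sig[256];                                     /* modulus-sized buffer, e.g. for a 2048-bit key */
    size_t sig_len = sizeof(sig);
    int rc = ccrsa_sign_pkcs1v15(key, oid, digest_len, digest, &sig_len, sig);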
 
 
 /*!
- @function   ccrsa_sign_pkcs1v15
- @abstract   RSA signature with PKCS#1 v1.5 format per PKCS#1 v2.2
-
- @param      key        Public key
- @param      oid        OID describing the type of digest passed in
- @param      digest_len Byte length of the digest
- @param      digest     Byte array of digest_len bytes containing the digest
- @param      sig_len    Number of byte of the signature sig.
- @param      sig        Pointer to the signature buffer of sig_len
- @param      valid      Output boolean, true if the signature is valid.
-
- @result     0 iff successful.
-  @discussion Null OID is a special case, required to support RFC 4346 where the padding
- is based on SHA1+MD5. In general it is not recommended to use a NULL OID, 
- except when strictly required for interoperability
- */
-CC_NONNULL_TU((1)) CC_NONNULL((4, 6, 7))
+  @function   ccrsa_verify_pkcs1v15
+  @abstract   RSA signature verification with PKCS#1 v1.5 format per PKCS#1 v2.2
+
+  @param      key        Public key
+  @param      oid        OID describing the type of digest passed in
+  @param      digest_len Byte length of the digest
+  @param      digest     Byte array of digest_len bytes containing the digest
+  @param      sig_len    Number of bytes of the signature sig.
+  @param      sig        Pointer to the signature buffer of sig_len bytes
+  @param      valid      Output boolean, true if the signature is valid.
+
+  @result     0 iff successful.
+
+  @discussion Null OID is a special case, required to support RFC 4346
+  where the padding is based on SHA1+MD5. In general it is not
+  recommended to use a NULL OID, except when strictly required for
+  interoperability.
+*/
+CC_NONNULL((1, 4, 6, 7))
 int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid,
                           size_t digest_len, const uint8_t *digest,
                           size_t sig_len, const uint8_t *sig,
@@ -329,80 +261,80 @@ int ccrsa_verify_pkcs1v15(ccrsa_pub_ctx_t key, const uint8_t *oid,
 /*!
  @function   ccder_encode_rsa_pub_size
  @abstract   Calculate size of public key export format data package.
+
  @param      key        Public key
+
  @result     Returns size required for encoding.
  */
 
-CC_NONNULL_TU((1))
+CC_NONNULL((1))
 size_t ccder_encode_rsa_pub_size(const ccrsa_pub_ctx_t key);
 
 /*!
  @function   ccrsa_export_priv_pkcs1
  @abstract   Export a public key.
+
  @param      key        Public key
  @param      der        Beginning of output DER buffer
  @param      der_end    End of output DER buffer
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3))
+CC_NONNULL((1, 2, 3))
 uint8_t *ccder_encode_rsa_pub(const ccrsa_pub_ctx_t key, uint8_t *der, uint8_t *der_end);
 
 
 /*!
  @function   ccder_encode_rsa_priv_size
  @abstract   Calculate size of full key exported in PKCS#1 format.
+
  @param      key        Full key
+
  @result     Returns size required for encoding.
  */
 
-CC_NONNULL_TU((1))
+CC_NONNULL((1))
 size_t ccder_encode_rsa_priv_size(const ccrsa_full_ctx_t key);
 
 /*!
  @function   ccder_encode_rsa_priv
  @abstract   Export a full key in PKCS#1 format.
+
  @param      key        Full key
  @param      der        Beginning of output DER buffer
  @param      der_end    End of output DER buffer
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3))
+CC_NONNULL((1, 2, 3))
 uint8_t *ccder_encode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, uint8_t *der_end);
 
 /*!
  @function   ccder_decode_rsa_pub_n
  @abstract   Calculate "n" for a public key imported from a data package.
         PKCS #1 format
+
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
+
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
 
-CC_NONNULL((1)) CC_NONNULL((2))
+CC_NONNULL((1, 2))
 cc_size ccder_decode_rsa_pub_n(const uint8_t *der, const uint8_t *der_end);
 
 /*!
  @function   ccder_decode_rsa_pub
  @abstract   Import a public RSA key from a package in public key format.
         PKCS #1 format
+
  @param      key          Public key (n must be set)
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
+
  @result     Key is initialized using the data in the public key message.
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3))
+CC_NONNULL((1, 2, 3))
 const uint8_t *ccder_decode_rsa_pub(const ccrsa_pub_ctx_t key, const uint8_t *der, const uint8_t *der_end);
 
 /*!
@@ -416,7 +348,7 @@ const uint8_t *ccder_decode_rsa_pub(const ccrsa_pub_ctx_t key, const uint8_t *de
  to declare the key itself.
  */
 
-CC_NONNULL((1)) CC_NONNULL((2))
+CC_NONNULL((1, 2))
 cc_size ccder_decode_rsa_pub_x509_n(const uint8_t *der, const uint8_t *der_end);
 
 /*!
@@ -430,48 +362,48 @@ cc_size ccder_decode_rsa_pub_x509_n(const uint8_t *der, const uint8_t *der_end);
  @result     Key is initialized using the data in the public key message.
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3))
+CC_NONNULL((1, 2, 3))
 const uint8_t *ccder_decode_rsa_pub_x509(const ccrsa_pub_ctx_t key, const uint8_t *der, const uint8_t *der_end);
 
 
 /*!
  @function   ccder_decode_rsa_priv_n
  @abstract   Calculate "n" for a private key imported from a data package.
+
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
+
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
 
-CC_NONNULL((1)) CC_NONNULL((2))
+CC_NONNULL((1, 2))
 cc_size ccder_decode_rsa_priv_n(const uint8_t *der, const uint8_t *der_end);
 
 /*!
  @function   ccder_decode_rsa_priv
  @abstract   Import a private RSA key from a package in PKCS#1 format.
+
  @param      key          Full key (n must be set)
  @param      der        Beginning of input DER buffer
  @param      der_end    End of input DER buffer
+
  @result     Key is initialized using the data in the public key message.
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((2)) CC_NONNULL((3))
+CC_NONNULL((1, 2, 3))
 const uint8_t *ccder_decode_rsa_priv(const ccrsa_full_ctx_t key, const uint8_t *der, const uint8_t *der_end);
 
 /*!
  @function   ccrsa_export_pub_size
  @abstract   Calculate size of public key exported data package.
+
  @param      key        Public key
+
  @result     Returns size required for encoding.
  */
 
-CC_CONST CC_INLINE CC_NONNULL_TU((1))
+CC_CONST CC_INLINE CC_NONNULL((1))
 size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) {
     return ccder_encode_rsa_pub_size(key);
 }
@@ -479,21 +411,21 @@ size_t ccrsa_export_pub_size(const ccrsa_pub_ctx_t key) {
 /*!
  @function   ccrsa_export_pub
  @abstract   Export a public key in public key format.
+
  @param      key        Public key
  @param      out_len    Allocated size
  @param      out        Output buffer
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((3))
+CC_NONNULL((1, 3))
 int ccrsa_export_pub(const ccrsa_pub_ctx_t key, size_t out_len, uint8_t *out);
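A sketch pairing the two calls, assuming the 0-on-success convention used by the inline exporters below:

    size_t len = ccrsa_export_pub_size(key);
    uint8_t *der = malloc(len);                           /* any buffer of len bytes works */
    if (der != NULL && ccrsa_export_pub(key, len, der) == 0) {
        /* der[0..len) now holds the encoded public key */
    }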
 /*!
  @function   ccrsa_import_pub_n
  @abstract   Calculate "n" for a public key imported from a data package.
+
  @param      inlen        Length of public key package data
  @param      der          pointer to public key package data
+
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
@@ -510,27 +442,27 @@ cc_size ccrsa_import_pub_n(size_t inlen, const uint8_t *der) {
 /*!
  @function   ccrsa_import_pub
  @abstract   Import a public RSA key from a package in public key format.
+
  @param      key          Public key (n must be set)
  @param      inlen        Length of public key package data
  @param      der           pointer to public key package data
+
  @result     Key is initialized using the data in the public key message.
  */
 
-CC_NONNULL_TU((1)) CC_NONNULL((3))
+CC_NONNULL((1, 3))
 int ccrsa_import_pub(ccrsa_pub_ctx_t key, size_t inlen, const uint8_t *der);
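The import path pairs ccrsa_import_pub_n with the declaration macros; a sketch with der and inlen holding the encoded key:

    cc_size n = ccrsa_import_pub_n(inlen, der);           /* units the imported key will need */
    ccrsa_pub_ctx_decl(ccn_sizeof_n(n), pubkey);
    ccrsa_ctx_n(pubkey) = n;                              /* "n must be set" per the doc above */
    int rc = ccrsa_import_pub(pubkey, inlen, der);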
 
 /*!
  @function   ccrsa_export_priv_size
  @abstract   Calculate size of full key exported in PKCS#1 format.
+
  @param      key        Full key
+
  @result     Returns size required for encoding.
  */
 
-CC_CONST CC_INLINE CC_NONNULL_TU((1))
+CC_CONST CC_INLINE CC_NONNULL((1))
 size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) {
     return ccder_encode_rsa_priv_size(key);
 }
@@ -538,13 +470,13 @@ size_t ccrsa_export_priv_size(const ccrsa_full_ctx_t key) {
 /*!
  @function   ccrsa_export_priv
  @abstract   Export a full key in PKCS#1 format.
+
  @param      key        Full key
  @param      out_len    Allocated size
  @param      out        Output buffer
  */
 
-CC_CONST CC_INLINE CC_NONNULL_TU((1)) CC_NONNULL((3))
+CC_CONST CC_INLINE CC_NONNULL((1, 3))
 int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out) {
     return (ccder_encode_rsa_priv(key, out, out+out_len) != out);
 }
@@ -552,10 +484,10 @@ int ccrsa_export_priv(const ccrsa_full_ctx_t key, size_t out_len, uint8_t *out)
 /*!
  @function   ccrsa_import_priv_n
  @abstract   Calculate size of full key exported in PKCS#1 format.
+
  @param      inlen        Length of PKCS#1 package data
  @param      der           pointer to PKCS#1 package data
+
  @result the "n" of the RSA key that would result from the import.  This can be used
  to declare the key itself.
  */
@@ -568,24 +500,24 @@ cc_size ccrsa_import_priv_n(size_t inlen, const uint8_t *der) {
 /*!
  @function   ccrsa_import_priv
  @abstract   Import a full RSA key from a package in PKCS#1 format.
+
  @param      key          Full key (n must be set)
  @param      inlen        Length of PKCS#1 package data
  @param      der           pointer to PKCS#1 package data
+
  @result     Key is initialized using the data in the PKCS#1 message.
  */
 
-CC_CONST CC_INLINE CC_NONNULL_TU((1)) CC_NONNULL((3))
+CC_CONST CC_INLINE CC_NONNULL((1, 3))
 int ccrsa_import_priv(ccrsa_full_ctx_t key, size_t inlen, const uint8_t *der) {
     return (ccder_decode_rsa_priv(key, der, der+inlen) == NULL);
 }
 
 
-CC_NONNULL_TU((1)) CC_NONNULL2
+CC_NONNULL((1, 2))
 int ccrsa_get_pubkey_components(const ccrsa_pub_ctx_t pubkey, uint8_t *modulus, size_t *modulusLength, uint8_t *exponent, size_t *exponentLength);
 
-CC_NONNULL_TU((1)) CC_NONNULL2
+CC_NONNULL((1, 2))
 int ccrsa_get_fullkey_components(const ccrsa_full_ctx_t key, uint8_t *modulus, size_t *modulusLength, uint8_t *exponent, size_t *exponentLength,
                                  uint8_t *p, size_t *pLength, uint8_t *q, size_t *qLength);
 
index 3372324b9ae2fba58cae83ae836b62732a2348eb..3f343401e62934d7b2406ea7d1c9d51a773a7181 100644 (file)
 /* sha1 selector */
 const struct ccdigest_info *ccsha1_di(void);
 
-extern const uint32_t ccsha1_initial_state[5];
-
-/* shared between several implementations */
-void ccsha1_final(const struct ccdigest_info *di, ccdigest_ctx_t,
-                  unsigned char *digest);
-
-
 /* Implementations */
 extern const struct ccdigest_info ccsha1_ltc_di;
 extern const struct ccdigest_info ccsha1_eay_di;
 
 #if  CCSHA1_VNG_INTEL
-//extern const struct ccdigest_info ccsha1_vng_intel_di;
-#if defined(__x86_64__)
-extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di;
-#endif
 extern const struct ccdigest_info ccsha1_vng_intel_SupplementalSSE3_di;
 #endif
 
index 37a646ec608509419e2339125a4fd382394f0b6d..995ef7e268cfbfae5f84cf24bcdafaf893234b1f 100644 (file)
@@ -38,33 +38,14 @@ const struct ccdigest_info *ccsha512_di(void);
 #define        CCSHA256_OUTPUT_SIZE 32
 #define        CCSHA256_STATE_SIZE  32
 extern const struct ccdigest_info ccsha256_ltc_di;
-extern const struct ccdigest_info ccsha256_v6m_di;
 #if  CCSHA2_VNG_INTEL
-#if defined __x86_64__
-extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di;
-extern const struct ccdigest_info ccsha256_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha256_vng_intel_AVX1_di;
-extern const struct ccdigest_info ccsha384_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha384_vng_intel_AVX1_di;
-extern const struct ccdigest_info ccsha384_vng_intel_SupplementalSSE3_di;
-extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di;
-extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di;
-extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di;
-#endif
 extern const struct ccdigest_info ccsha224_vng_intel_SupplementalSSE3_di;
 extern const struct ccdigest_info ccsha256_vng_intel_SupplementalSSE3_di;
 #endif
 #if  CCSHA2_VNG_ARMV7NEON
 extern const struct ccdigest_info ccsha224_vng_armv7neon_di;
 extern const struct ccdigest_info ccsha256_vng_armv7neon_di;
-extern const struct ccdigest_info ccsha384_vng_arm64_di;
-extern const struct ccdigest_info ccsha384_vng_armv7neon_di;
-extern const struct ccdigest_info ccsha512_vng_arm64_di;
-extern const struct ccdigest_info ccsha512_vng_armv7neon_di;
 #endif
-extern const uint32_t ccsha256_K[64];
-extern const uint64_t ccsha512_K[80];
 
 /* SHA224 */
 #define        CCSHA224_OUTPUT_SIZE 28
index f06b96a9d2b79d970a024c037551de085d03e633..d392432dc20171ead1a33b660a195ac4e30d5028 100644 (file)
 #include <corecrypto/ccn.h>
 #include <corecrypto/ccrng.h>
 
-/* 
- Don't use cczp_hd struct directly, except in static tables such as eliptic curve parameter definitions.
- Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by either cczp_nof_n() or cczp_short_nof_n().
+/*
+ Don't use cczp_hd struct directly, except in static tables such as elliptic curve parameter
+ definitions.
+
+ Declare cczp objects using cczp_decl_n(). It allocates cc_unit arrays of the length returned by
+ either cczp_nof_n() or cczp_short_nof_n().
 */
 
 struct cczp;
-#if CORECRYPTO_USE_TRANSPARENT_UNION
-
-typedef union {
-    cc_unit *u;
-    struct cczp *zp;
-    //cczp_const_t czp; //for automatic type cast
-    //struct cczp_prime *prime;
-} cczp_t __attribute__((transparent_union));
-
-typedef union {
-    const cc_unit *u;
-    const struct cczp *zp;
-    //const struct cczp_prime *prime;
-    cczp_t _nczp;
-} cczp_const_t __attribute__((transparent_union));
-
-#else
-    typedef struct cczp* cczp_t;
-    typedef const struct cczp* cczp_const_t;
-#endif
+
+typedef struct cczp *cczp_t;
+typedef const struct cczp *cczp_const_t;
+
 typedef void (*ccmod_func_t)(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s);
 
 // keep cczp_hd and cczp structures consistent
 // cczp_hd is typecasted to cczp to read EC curve params
 // options field is to specify Montgomery arithmetic, bit field, etc
-// make sure n is the first element see ccrsa_ctx_n macro 
+// make sure n is the first element see ccrsa_ctx_n macro
 #define __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \
-cc_size pre ## n;\
-cc_unit pre ## options;\
-ccmod_func_t pre ## mod_prime;
+    cc_size pre##n;                             \
+    cc_unit pre##options;                       \
+    ccmod_func_t pre##mod_prime;
 
-#define __CCZP_ELEMENTS_DEFINITIONS(pre) \
-__CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \
-cc_unit pre ## ccn[];
+#define __CCZP_ELEMENTS_DEFINITIONS(pre)    \
+    __CCZP_HEADER_ELEMENTS_DEFINITIONS(pre) \
+    cc_unit pre##ccn[];
 
-//cczp_hd must be defined separetly without variable length array ccn[], because it is used in sructures such as ccdh_gp_decl_n
-struct cczp_hd{
+// cczp_hd must be defined separately without variable length array ccn[], because it is used in
+// structures such as ccdh_gp_decl_n
+struct cczp_hd {
     __CCZP_HEADER_ELEMENTS_DEFINITIONS()
-}  CC_ALIGNED(CCN_UNIT_SIZE);
+} CC_ALIGNED(CCN_UNIT_SIZE);
 
 struct cczp {
     __CCZP_ELEMENTS_DEFINITIONS()
 } CC_ALIGNED(CCN_UNIT_SIZE);
 
-
 /* Return the size of an cczp where each ccn is _size_ bytes. */
 #define cczp_size(_size_) (sizeof(struct cczp) + ccn_sizeof_n(1) + 2 * (_size_))
 
@@ -79,95 +65,56 @@ struct cczp {
    with cczp_add, cczp_sub, cczp_div2, cczp_mod_inv. */
 #define cczp_short_nof_n(_n_) (ccn_nof_size(sizeof(struct cczp)) + (_n_))
 
-#define cczp_decl_n(_n_, _name_)  cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
-#define cczp_short_decl_n(_n_, _name_) cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
-
-#define cczp_clear_n(_n_, _name_)  cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
-#define cczp_short_clear_n(_n_, _name_)  cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
-
-#if CORECRYPTO_USE_TRANSPARENT_UNION 
-  #define CCZP_N(ZP) (((cczp_t)(ZP)).zp->n)
-  #define CCZP_MOD(ZP) (((cczp_t)(ZP)).zp->mod_prime)
-  #define CCZP_PRIME(ZP) (((cczp_t)(ZP)).zp->ccn)
-  #define CCZP_RECIP(ZP) (((cczp_t)(ZP)).zp->ccn + cczp_n(ZP))
-  #define CCZP_OPS(ZP) ((ZP).zp->options)
-  #define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP)
-
-CC_CONST CC_NONNULL_TU((1))
-static inline cc_size cczp_n(cczp_const_t zp) {
-    return zp.zp->n;
-}
-
-CC_CONST CC_NONNULL_TU((1))
-static inline cc_unit cczp_options(cczp_const_t zp) {
-    return zp.zp->options;
-}
-
-CC_CONST CC_NONNULL_TU((1))
-static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) {
-    return zp.zp->mod_prime;
-}
-
-CC_CONST CC_NONNULL_TU((1))
-static inline const cc_unit *cczp_prime(cczp_const_t zp) {
-    return zp.zp->ccn;
-}
-
-/* Return a pointer to the Reciprocal or Montgomery constant of zp, which is
- allocated cczp_n(zp) + 1 units long. */
-CC_CONST CC_NONNULL_TU((1))
-
-static inline const cc_unit *cczp_recip(cczp_const_t zp) {
-    return zp.zp->ccn + zp.zp->n;
-}
-
-#else
-  #define CCZP_N(ZP)     ((ZP)->n)
-  #define CCZP_MOD(ZP)   ((ZP)->mod_prime)
-  #define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP)
-  #define CCZP_PRIME(ZP) ((ZP)->ccn)
-  #define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP))
-  #define CCZP_OPS(ZP)   ((ZP)->options)
-CC_CONST CC_NONNULL_TU((1))
-static inline cc_size cczp_n(cczp_const_t zp) {
+#define cczp_decl_n(_n_, _name_) cc_ctx_decl(struct cczp, ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
+#define cczp_short_decl_n(_n_, _name_) \
+    cc_ctx_decl(struct cczp_short, ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
+
+#define cczp_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_nof_n(_n_)), _name_)
+#define cczp_short_clear_n(_n_, _name_) cc_clear(ccn_sizeof_n(cczp_short_nof_n(_n_)), _name_)
+
+#define CCZP_N(ZP) ((ZP)->n)
+#define CCZP_MOD(ZP) ((ZP)->mod_prime)
+#define CCZP_MOD_PRIME(ZP) CCZP_MOD(ZP)
+#define CCZP_PRIME(ZP) ((ZP)->ccn)
+#define CCZP_RECIP(ZP) ((ZP)->ccn + CCZP_N(ZP))
+#define CCZP_OPS(ZP) ((ZP)->options)
+CC_CONST CC_NONNULL((1)) static inline cc_size cczp_n(cczp_const_t zp)
+{
     return zp->n;
 }
 
-CC_CONST CC_NONNULL_TU((1))
-static inline cc_unit cczp_options(cczp_const_t zp) {
+CC_CONST CC_NONNULL((1)) static inline cc_unit cczp_options(cczp_const_t zp)
+{
     return zp->options;
 }
 
-CC_CONST CC_NONNULL_TU((1))
-static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp) {
+CC_CONST CC_NONNULL((1)) static inline ccmod_func_t cczp_mod_prime(cczp_const_t zp)
+{
     return zp->mod_prime;
 }
 
-CC_CONST CC_NONNULL_TU((1))
-static inline const cc_unit *cczp_prime(cczp_const_t zp) {
+CC_CONST CC_NONNULL((1)) static inline const cc_unit *cczp_prime(cczp_const_t zp)
+{
     return zp->ccn;
 }
 
 /* Return a pointer to the Reciprocal or Montgomery constant of zp, which is
  allocated cczp_n(zp) + 1 units long. */
-CC_CONST CC_NONNULL_TU((1))
+CC_CONST CC_NONNULL((1))
 
-static inline const cc_unit *cczp_recip(cczp_const_t zp) {
+    static inline const cc_unit *cczp_recip(cczp_const_t zp)
+{
     return zp->ccn + zp->n;
 }
 
-#endif
-
-
-CC_CONST CC_NONNULL_TU((1))
-CC_INLINE size_t cczp_bitlen(cczp_const_t zp) {
+CC_CONST CC_NONNULL((1)) CC_INLINE size_t cczp_bitlen(cczp_const_t zp)
+{
     return ccn_bitlen(cczp_n(zp), cczp_prime(zp));
 }
 
-
 /* Ensure both cczp_mod_prime(zp) and cczp_recip(zp) are valid. cczp_n and
    cczp_prime must have been previously initialized. */
-CC_NONNULL_TU((1))
+CC_NONNULL((1))
 int cczp_init(cczp_t zp);
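A sketch of the initialization order the comment requires, loading a 256-bit prime from big-endian bytes; prime_bytes is a placeholder:

    cc_size n = ccn_nof_size(32);                         /* 256-bit (32-byte) prime */
    cczp_decl_n(ccn_nof_size(32), zp);                    /* room for the prime and its reciprocal */
    CCZP_N(zp) = n;
    ccn_read_uint(n, CCZP_PRIME(zp), 32, prime_bytes);    /* load the prime, big endian */
    int rc = cczp_init(zp);                               /* derives mod_prime and the reciprocal */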
 
 /* Compute r = s2n mod cczp_prime(zp). Will write cczp_n(zp)
@@ -175,16 +122,14 @@ int cczp_init(cczp_t zp);
  identical they must not overlap.  Before calling this function either
  cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp)
  and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n);
+CC_NONNULL((1, 2, 3)) void cczp_mod(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s2n);
 
 /* Compute r = sn mod cczp_prime(zp). Will write cczp_n(zp)
  units to r and reads sn units from s. If r and s are not
  identical they must not overlap.  Before calling this function either
  cczp_init(zp) must have been called or both CCZP_MOD_PRIME((cc_unit *)zp)
  and CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 4))
-int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s);
+CC_NONNULL((1, 2, 4)) int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s);
 
 /* Compute r = x * y mod cczp_prime(zp). Will write cczp_n(zp) units to r
   and reads cczp_n(zp) units from both x and y. If r and x are not
@@ -192,44 +137,20 @@ int cczp_modn(cczp_const_t zp, cc_unit *r, cc_size ns, const cc_unit *s);
    calling this function either cczp_init(zp) must have been called or both
    CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be
    initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
+CC_NONNULL((1, 2, 3, 4))
 void cczp_mul(cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y);
 
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5))
-void cczp_mul_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *t, const cc_unit *x, const cc_unit *y);
-
-/* Compute r = x * x mod cczp_prime(zp). Will write cczp_n(zp) units to r
-   and reads cczp_n(zp) units from x. If r and x are not identical they must
-   not overlap. Before calling this function either cczp_init(zp) must have
-   been called or both CCZP_MOD_PRIME((cc_unit *)zp) and
-   CCZP_RECIP((cc_unit *)zp) must be initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-void cczp_sqr(cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-void cczp_sqr_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
-/* Compute r = x^(1/2) mod cczp_prime(zp). Will write cczp_n(zp) units to r
- and reads cczp_n(zp) units from x. If r and x are not identical they must
- not overlap. Before calling this function either cczp_init(zp) must have
- been called or both CCZP_MOD_PRIME((cc_unit *)zp) and
- CCZP_RECIP((cc_unit *)zp) must be initialized some other way. 
- Only support prime = 3 mod 4 */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-int cczp_sqrt(cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
 /* Compute r = m ^ e mod cczp_prime(zp), using Montgomery ladder.
    - writes cczp_n(zp) units to r
   - reads  cczp_n(zp) units from m and e
-   - if r and m are not identical they must not overlap. 
+   - if r and m are not identical they must not overlap.
    - r and e must not overlap nor be identical.
    - before calling this function either cczp_init(zp) must have been called
    or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must
    be initialized some other way.
  */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m,
-               const cc_unit *e);
+CC_NONNULL((1, 2, 3, 4))
+int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e);
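Once zp has been initialized, the exponentiation itself is a single call; r, m and e are cczp_n(zp)-unit arrays and r must not alias e:

    int rc = cczp_power(zp, r, m, e);                     /* r = m^e mod cczp_prime(zp) */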
 
 /* Compute r = m ^ e mod cczp_prime(zp), using Square Square Multiply Always.
  - writes cczp_n(zp) units to r
@@ -238,101 +159,55 @@ int cczp_power(cczp_const_t zp, cc_unit *r, const cc_unit *m,
  - r and e must not overlap nor be identical.
  - before calling this function either cczp_init(zp) must have been called
  or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must
- be initialized some other way. 
+ be initialized some other way.
+
 Important: This function is intended to be constant time but is more likely
    to leak information due to the memory cache. Use only with randomized input
  */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m,
-                const cc_unit *e);
-
-int cczp_power_ssma_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *s, const cc_unit *e);
-
-/* Compute r = m ^ e mod cczp_prime(zp). Will write cczp_n(zp) units to r and
- reads cczp_n(zp) units units from m.  Reads ebitlen bits from e.
- m must be <= to cczp_prime(zp).  If r and m are not identical they must not
- overlap. r and e must not overlap nor be identical.
- Before calling this function either cczp_init(zp) must have been called
- or both CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must
- be initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 5))
-int cczp_powern(cczp_const_t zp, cc_unit *r, const cc_unit *s,
-                size_t ebitlen, const cc_unit *e);
-
-/* Compute r = x + y mod cczp_prime(zp). Will write cczp_n(zp) units to r and
-   reads cczp_n(zp) units units from x and y. If r and x are not identical
-   they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid.
-   Can be used with cczp_short_nof_n sized cc_unit array zp. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-void cczp_add(cczp_const_t zp, cc_unit *r, const cc_unit *x,
-              const cc_unit *y);
-
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5))
-void cczp_add_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x,
-                 const cc_unit *y);
-
-/* Compute r = x - y mod cczp_prime(zp). Will write cczp_n(zp) units to r and
-   reads cczp_n(zp) units units from x and y. If r and x are not identical
-   they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid.
-   Can be used with cczp_short_nof_n sized cc_unit array zp. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-void cczp_sub(cczp_const_t zp, cc_unit *r, const cc_unit *x, const cc_unit *y);
-
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4, 5))
-void cczp_sub_ws(cc_ws_t ws, cczp_const_t zp, cc_unit *r, const cc_unit *x,
-                 const cc_unit *y);
-
-/* Compute r = x / 2 mod cczp_prime(zp). Will write cczp_n(zp) units to r and
-   reads cczp_n(zp) units units from x. If r and x are not identical
-   they must not overlap. Only cczp_n(zp) and cczp_prime(zp) need to be valid.
-   Can be used with cczp_short_nof_n sized cc_unit array zp. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-void cczp_div2(cczp_const_t zp, cc_unit *r, const cc_unit *x);
-
-/* Compute q = a_2n / cczp_prime(zd) (mod cczp_prime(zd)) . Will write cczp_n(zd)
-   units to q and r. Will read 2 * cczp_n(zd) units units from a. If r and a
-   are not identical they must not overlap. Before calling this function
-   either cczp_init(zp) must have been called or both
-   CCZP_MOD_PRIME((cc_unit *)zp) and CCZP_RECIP((cc_unit *)zp) must be
-   initialized some other way. */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3, 4))
-void cczp_div(cczp_const_t zd, cc_unit *q, cc_unit *r, const cc_unit *a_2n);
-
+CC_NONNULL((1, 2, 3, 4))
+int cczp_power_ssma(cczp_const_t zp, cc_unit *r, const cc_unit *m, const cc_unit *e);
 
 /*!
  @brief cczp_inv(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp).
- @discussion It is a general function and works for any p. It validates the inputs. r and x can overlap. It writes n =cczp_n(zp) units to r, and read n units units from x and p. The output r is overwriten only if the inverse is correctly computed. This function is not constant time in absolute sense, but it does not have data dependent 'if' statements in the code.
- @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking cczp_inv().
+ @discussion It is a general function and works for any p. It validates the inputs. r and x can
+ overlap. It writes n = cczp_n(zp) units to r, and reads n units from x and p. The output r is
+ overwritten only if the inverse is correctly computed. This function is not constant time in an
+ absolute sense, but it does not have data-dependent 'if' statements in the code.
+ @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not
+ be called before invoking cczp_inv().
  @param x input big integer
  @param r output big integer
  @return  0 if inverse exists and correctly computed.
  */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-
+CC_NONNULL((1, 2, 3))
 int cczp_inv(cczp_const_t zp, cc_unit *r, const cc_unit *x);
 
 /*!
  @brief cczp_inv_odd(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is an odd number.
  @discussion  r and x can overlap.
- @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking.
+ @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not
+ be called before invoking.
  @param x input big integer
  @param r output big integer
  @return  0 if successful
  */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
-int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x);
+CC_NONNULL((1, 2, 3)) int cczp_inv_odd(cczp_const_t zp, cc_unit *r, const cc_unit *x);
 
 /*!
- @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p) , where p=cczp_prime(zp) is a prime number number.
- @discussion r and x must NOT overlap. The excution time of the function is independent to the value of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery and non-Montgomery form of zp. It leaks the value of the prime and should only be used be used for public (not secret) primes (ex. Elliptic Curves)
-
- @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not to be called before invoking cczp_inv_field().
+ @brief cczp_inv_field(zp, r, x) computes r = x^-1 (mod p), where p=cczp_prime(zp) is a prime
+ number.
+ @discussion r and x must NOT overlap. The execution time of the function is independent of the value
+ of the input x. It works only if p is a field. That is, when p is a prime. It supports Montgomery
+ and non-Montgomery forms of zp. It leaks the value of the prime and should only be used for
+ public (not secret) primes (e.g. elliptic curves).
+
+ @param zp  The input zp. cczp_n(zp) and cczp_prime(zp) need to be valid. cczp_init(zp) need not
+ be called before invoking cczp_inv_field().
 @param x input big integer
  @param r output big integer
  @return  0 if inverse exists and correctly computed.
  */
-CC_NONNULL_TU((1)) CC_NONNULL((2, 3))
+CC_NONNULL((1, 2, 3))
 int cczp_inv_field(cczp_const_t zp, cc_unit *r, const cc_unit *x);
 
 #endif /* _CORECRYPTO_CCZP_H_ */
diff --git a/EXTERNAL_HEADERS/img4/api.h b/EXTERNAL_HEADERS/img4/api.h
new file mode 100644 (file)
index 0000000..56b875b
--- /dev/null
@@ -0,0 +1,56 @@
+/*!
+ * @header
+ * API definitions.
+ */
+#ifndef __IMG4_API_H
+#define __IMG4_API_H
+
+#ifndef __IMG4_INDIRECT
+#error "Please #include <img4/img4.h> instead of this file directly"
+#endif // __IMG4_INDIRECT
+
+#ifndef KERNEL
+#include <os/availability.h>
+#endif
+
+#if !XNU_KERNEL_PRIVATE
+#include <TargetConditionals.h>
+#endif
+
+/*!
+ * @const IMG4_API_VERSION
+ * The API version of the library. This version will be changed in accordance
+ * with new API introductions so that callers may submit code to the build that
+ * adopts those new APIs before the APIs land by using the following pattern:
+ *
+ *     #if IMG4_API_VERSION >= 20180424
+ *     img4_new_api();
+ *     #endif
+ *
+ * In this example, the library maintainer and API adopter agree on an API
+ * version of 20180424 ahead of time for the introduction of
+ * img4_new_api(). When a libdarwin with that API version is submitted, the
+ * project is rebuilt, and the new API becomes active.
+ *
+ * Breaking API changes will be both covered under this mechanism as well as
+ * individual preprocessor macros in this header that declare new behavior as
+ * required.
+ */
+#define IMG4_API_VERSION (20180112u)
+
+#if !defined(KERNEL) && !IMG4_PROJECT_BUILD
+#define IMG4_API_AVAILABLE_20180112 \
+               __API_UNAVAILABLE(macos) \
+               API_AVAILABLE(ios(12.0), tvos(12.0), watchos(5.0))
+#else
+#define IMG4_API_AVAILABLE_20180112
+#endif
+
+/*!
+ * @typedef img4_struct_version_t
+ * A type describing the version of a structure in the library.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef uint16_t img4_struct_version_t;
+
+#endif // __IMG4_API_H
diff --git a/EXTERNAL_HEADERS/img4/environment.h b/EXTERNAL_HEADERS/img4/environment.h
new file mode 100644 (file)
index 0000000..d5c4f49
--- /dev/null
@@ -0,0 +1,314 @@
+/*!
+ * @header
+ * Image4 environment interfaces.
+ */
+#ifndef __IMG4_ENVIRONMENT_H
+#define __IMG4_ENVIRONMENT_H
+
+#ifndef __IMG4_INDIRECT
+#error "Please #include <img4/img4.h> instead of this file directly"
+#endif // __IMG4_INDIRECT
+
+/*!
+ * @const IMG4_ENVIRONMENT_VERSION
+ * The version of the {@link img4_environment_t} structure supported by the
+ * implementation. See {@link _img4_environment} for complete definition.
+ */
+#define IMG4_ENVIRONMENT_VERSION ((img4_struct_version_t)0)
+
+/*!
+ * @typedef img4_crypto_selector_t
+ * A CoreCrypto selector routine.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef const struct ccdigest_info *(*img4_crypto_selector_t)(void);
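A selector is simply a function returning a ccdigest_info pointer, so corecrypto's ccsha1_di fits the signature:

    img4_crypto_selector_t select = ccsha1_di;            /* declared in <corecrypto/ccsha1.h> */
    const struct ccdigest_info *di = select();            /* digest descriptor for SHA-1 */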
+
+/*!
+ * @typedef img4_crypto_t
+ * A structure describing a crypto algorithm used by Image4.
+ *
+ * @property i4c_name
+ * The human-readable string for the crypto algorithm (e.g. "sha1").
+ *
+ * @property i4c_select
+ * The CoreCrypto selector routine for the algorithm
+ *
+ * @property i4c_hash_len
+ * The length of the hash computed by the algorithm.
+ *
+ * @property i4c_truncated_hash_len
+ * The truncated length of the hash computed by the algorithm.
+ *
+ * @property __opaque
+ * Reserved for the implementation.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef struct _img4_crypto {
+       const char *i4c_name;
+       img4_crypto_selector_t i4c_select;
+       uint32_t i4c_hash_len;
+       uint32_t i4c_truncated_hash_len;
+       const void *__opaque;
+} img4_crypto_t;
+
+/*!
+ * @const IMG4_CRYPTO_SHA1
+ * The Image4 SHA1 implementation.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT
+const img4_crypto_t _img4_crypto_sha1;
+#define IMG4_CRYPTO_SHA1 (&_img4_crypto_sha1)
+
+/*!
+ * @const IMG4_CRYPTO_SHA384
+ * The Image4 SHA-384 implementation.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT
+const img4_crypto_t _img4_crypto_sha384;
+#define IMG4_CRYPTO_SHA384 (&_img4_crypto_sha384)
+
+/*!
+ * @typedef img4_environment_t
+ * A type describing an Image4 environment.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef struct _img4_environment img4_environment_t;
+
+/*!
+ * @typedef img4_environment_get_crypto_t
+ * A function which obtains a crypto descriptor for the host environment.
+ *
+ * @param i4e
+ * The environment descriptor.
+ *
+ * @param crypto
+ * A pointer to the storage in which the pointer to the host's crypto descriptor
+ * will be written.
+ *
+ * @param ctx
+ * The context pointer supplied to {@link img4_init}.
+ *
+ * @result
+ * Upon successfully fetching the property value, zero should be returned.
+ * Otherwise, the following error codes should be returned:
+ *
+ *     [ENOENT]     The property does not exist in the environment
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_environment_get_crypto_t)(
+       const img4_environment_t *i4e,
+       const img4_crypto_t **crypto,
+       const void *ctx);
+
+/*!
+ * @typedef img4_environment_get_bool_t
+ * A function which obtains a Boolean property from the host environment.
+ *
+ * @param val
+ * A pointer to storage in which the value will be written.
+ *
+ * @param ctx
+ * The context pointer supplied to {@link img4_init}.
+ *
+ * @result
+ * Upon successfully fetching the property value, zero should be returned.
+ * Otherwise, the following error codes should be returned:
+ *
+ *     [ENOENT]     The property does not exist in the environment
+ *     [EFTYPE]     The property is not expressible as a Boolean
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_environment_get_bool_t)(
+       const img4_environment_t *i4e,
+       bool *val,
+       const void *ctx);
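To make the calling convention concrete, a sketch of a Boolean getter; the context type and the field it reads are invented for the example:

    static errno_t
    my_get_mixnmatch_prevented(const img4_environment_t *i4e, bool *val, const void *ctx)
    {
        const struct my_boot_policy *policy = ctx;        /* hypothetical context handed to img4_init */
        (void)i4e;
        if (policy == NULL) {
            return ENOENT;                                /* property not available in this environment */
        }
        *val = policy->prevent_mixnmatch;
        return 0;
    }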
+
+/*!
+ * @typedef img4_environment_get_uint32_t
+ * A function which obtains an unsigned 32-bit integer property from the host
+ * environment.
+ *
+ * @param val
+ * A pointer to storage in which the value will be written.
+ *
+ * @param ctx
+ * The context pointer supplied to {@link img4_init}.
+ *
+ * @result
+ * Upon successfully fetching the property value, zero should be returned.
+ * Otherwise, the following error codes should be returned:
+ *
+ *     [ENOENT]     The property does not exist in the environment
+ *     [EFTYPE]     The property is not expressible as an unsigned 32-bit integer
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_environment_get_uint32_t)(
+       const img4_environment_t *i4e,
+       uint32_t *val,
+       const void *ctx);
+
+/*!
+ * @typedef img4_environment_get_uint64_t
+ * A function which obtains an unsigned 64-bit integer property from the host
+ * environment.
+ *
+ * @param val
+ * A pointer to storage in which the value will be written.
+ *
+ * @param ctx
+ * The context pointer supplied to {@link img4_init}.
+ *
+ * @result
+ * Upon successfully fetching the property value, zero should be returned.
+ * Otherwise, the following error codes should be returned:
+ *
+ *     [ENOENT]     The property does not exist in the environment
+ *     [EFTYPE]     The property is not expressible as an unsigned 64-bit
+ *                  integer
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_environment_get_uint64_t)(
+       const img4_environment_t *i4e,
+       uint64_t *val,
+       const void *ctx);
+
+/*!
+ * @typedef img4_environment_get_data_t
+ * A function which obtains a property which is a raw sequence of bytes from the
+ * host environment.
+ *
+ * @param bytes
+ * A pointer to storage in which the value will be written.
+ *
+ * @param len
+ * A pointer to the length of the buffer referred to by {@link bytes}. Upon
+ * successful return, this storage should contain the number of bytes written.
+ *
+ * @param ctx
+ * The context pointer supplied to {@link img4_init}.
+ *
+ * @result
+ * Upon successfully fetching the property value, zero should be returned.
+ * Otherwise, the following error codes should be returned:
+ *
+ *     [ENOENT]     The property does not exist in the environment
+ *     [EFTYPE]     The property is not expressible as a raw sequence of bytes
+ *     [ERANGE]     The buffer was not large enough to hold the property
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_environment_get_data_t)(
+       const img4_environment_t *i4e,
+       uint8_t *bytes,
+       uint32_t *len,
+       const void *ctx);
+
+/*!
+ * @struct _img4_environment
+ * A type describing a host environment.
+ *
+ * @property i4e_version
+ * The version of the environment structure. Pass
+ * {@link IMG4_ENVIRONMENT_VERSION}.
+ *
+ * @property i4e_name
+ * A human-readable description of the environment.
+ *
+ * @property i4e_crypto
+ * A pointer to a function which returns the crypto implementation for the
+ * environment.
+ *
+ * @property i4e_cert_epoch
+ * A pointer to a function which returns the certificate epoch for the
+ * environment.
+ *
+ * @property i4e_board_id
+ * A pointer to a function which returns the board identifier for the
+ * environment.
+ *
+ * @property i4e_chip_id
+ * A pointer to a function which returns the chip design identifier for the
+ * environment.
+ *
+ * @property i4e_ecid
+ * A pointer to a function which returns the unique chip identifier for the
+ * environment.
+ *
+ * @property i4e_security_domain
+ * A pointer to a function which returns the security domain for the
+ * environment.
+ *
+ * @property i4e_cert_prod
+ * A pointer to a function which returns the certificate production status for
+ * the environment. This indicates whether the environment's leaf certificate
+ * must be production or development.
+ *
+ * - true    the environment's leaf certificate must be production
+ * - false   the environment's leaf certificate may be development
+ *
+ * @property i4e_cert_security
+ * A pointer to a function which returns the certificate security mode for the
+ * environment. This indicates whether the leaf certificate must be secure.
+ *
+ * @property i4e_ap_nonce_hash
+ * A pointer to a function which returns the hash of the AP nonce for the
+ * environment.
+ *
+ * @property i4e_prevent_mixnmatch
+ * A pointer to a function which returns whether the environment prevents mix-
+ * n-match.
+ *
+ * - true    the environment disallows mix-n-match
+ * - false   the environment allows mix-n-match
+ *
+ * @property i4e_boot_manifest_hash
+ * A pointer to a function which returns the hash of the manifest from which
+ * mix-n-match policy derives.
+ *
+ * @property i4e_eff_security
+ * A pointer to a function which returns the effective security mode for the
+ * environment.
+ *
+ * @property i4e_eff_prod
+ * A pointer to a function which returns the effective production status for the
+ * environment.
+ *
+ * @property i4e_ap_nonce_trust
+ * A pointer to a function which returns whether the AP nonce must be
+ * exclusively fetched from main memory.
+ *
+ * - true    the AP nonce hash must be fetched from main memory exclusively;
+ *           persistent storage is not trustworthy
+ * - false   the AP nonce hash may be fetched from persistent storage
+ */
+struct _img4_environment {
+       img4_struct_version_t i4e_version;
+       const char *i4e_name;
+       img4_environment_get_crypto_t i4e_crypto;
+       img4_environment_get_uint32_t i4e_cert_epoch;
+       img4_environment_get_uint32_t i4e_board_id;
+       img4_environment_get_uint32_t i4e_chip_id;
+       img4_environment_get_uint64_t i4e_ecid;
+       img4_environment_get_uint32_t i4e_security_domain;
+       img4_environment_get_bool_t i4e_cert_prod;
+       img4_environment_get_bool_t i4e_cert_security;
+       img4_environment_get_data_t i4e_ap_nonce_hash;
+       img4_environment_get_bool_t i4e_prevent_mixnmatch;
+       img4_environment_get_data_t i4e_boot_manifest_hash;
+       img4_environment_get_bool_t i4e_eff_prod;
+       img4_environment_get_bool_t i4e_eff_security;
+       img4_environment_get_bool_t i4e_ap_nonce_trust;
+} IMG4_API_AVAILABLE_20180112;
+
+/*!
+ * @const IMG4_ENVIRONMENT_PLATFORM
+ * The environment for the host that uses the default platform implementation to
+ * resolve the environment.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT
+const struct _img4_environment _img4_environment_platform;
+#define IMG4_ENVIRONMENT_PLATFORM (&_img4_environment_platform)
+
+#endif // __IMG4_ENVIRONMENT_H
diff --git a/EXTERNAL_HEADERS/img4/img4.h b/EXTERNAL_HEADERS/img4/img4.h
new file mode 100644 (file)
index 0000000..13b053f
--- /dev/null
@@ -0,0 +1,543 @@
+/*!
+ * @header
+ * Image4 interfaces. These interfaces encapsulate the basic concepts required
+ * for authenticating and validating Image4 manifests as being authoritative.
+ * These concepts are:
+ *
+ * Environment
+ * An environment is a description of a host comprised of hardware identifiers
+ * and policy configurations. For example, the environment of an iPhone may
+ * include the following hardware identifiers (among others):
+ *
+ *     ChipID
+ *     A number identifying the chip design.
+ *
+ *     BoardID
+ *     A number identifying the board.
+ *
+ *     UniqueChipID / ECID
+ *     A number uniquely identifying a specific instance of a chip.
+ *
+ * The environment also includes policy information derived by previous stages
+ * of secure boot. Examples of such policy are:
+ *
+ *     Mix-n-Match Prevention
+ *     Whether firmware payloads from multiple, valid secure boot manifests
+ *     should be prevented from being executed on the host environment. The
+ *     default is true.
+ *
+ * Manifest
+ * An Image4 manifest is a set of constraints that describe a host environment.
+ * For example, a manifest may have been signed such that it is only valid for a
+ * single host environment. In this case, the manifest may include specific
+ * values for ChipID, BoardID, UniqueChipID, etc. Such a manifest is said to be
+ * personalized for that environment.
+ *
+ * If an environment meets the constraints in a manifest, that manifest is said
+ * to be authoritative over the environment.
+ *
+ * The manifest also includes one or more objects which may be executed in the
+ * environment.
+ *
+ * Object
+ * An object is a description of a payload. An object can describe any payload,
+ * not just the payload that is in the Image4. An object describes a payload by
+ * means of its digest. Examples of objects present in a secure boot manifest
+ * are the kernelcache and the static trust cache.
+ *
+ * If an authoritative manifest accurately describes an object, then that object
+ * may be executed in the host environment. The mechanics of execution typically
+ * involve mapping its payload into a privileged memory region. For example,
+ * when the kernelcache is executed, its payload bytes are mapped into the range
+ * of memory associated with supervisor mode.
+ *
+ * Payload
+ * A payload is the raw sequence of bytes that is described by an object. When
+ * described via an Image4 object, payloads are first wrapped in Image4 encoding
+ * to associate a tag with them. The resulting series of bytes is what is
+ * contained in a .im4p file.
+ *
+ * An Image4 file may only contain a single payload (even though a manifest may
+ * describe multiple payloads through multiple objects).
+ *
+ * Tag
+ * A tag is a FourCC which can identify any of the following:
+ *
+ *     - an object property (e.g. the 'DGST' property)
+ *     - a manifest property (e.g. the 'BORD' property)
+ *     - a certificate property
+ *     - a type of object (e.g. 'krnl')
+ *
+ * Tags comprised of all-caps are reserved for the Image4 specification.
+ */
+
+
+#ifndef __IMG4_H
+#define __IMG4_H
+
+#include <os/base.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+#define __IMG4_INDIRECT 1
+
+/*
+ * This header is used in the pmap layer in xnu, which is in osfmk, which does
+ * not have access to most of the BSD headers. (But for some reason it does have
+ * access to sys/cdefs.h.) The only thing we need from that header is the
+ * errno_t typedef though, so if we can't get to it, then just typedef it
+ * ourselves.
+ */
+#if MACH_KERNEL_PRIVATE
+typedef int errno_t;
+#else
+#include <sys/types.h>
+#endif
+
+#if !IMG4_PROJECT_BUILD
+#include <img4/api.h>
+#endif
+
+__BEGIN_DECLS;
+
+/*!
+ * @typedef img4_tag_t
+ * A type describing an Image4 tag.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef uint32_t img4_tag_t;
+
+/*!
+ * @typedef img4_section_t
+ * A type describing the sections of an Image4 object.
+ *
+ * @const IMG4_SECTION_MANIFEST
+ * The manifest section.
+ *
+ * @const IMG4_SECTION_OBJECT
+ * The object section.
+ *
+ * @const IMG4_SECTION_RESTOREINFO
+ * The restore info section.
+ */
+OS_ENUM(img4_section, uint8_t,
+       IMG4_SECTION_MANIFEST,
+       IMG4_SECTION_OBJECT,
+       IMG4_SECTION_RESTOREINFO,
+) IMG4_API_AVAILABLE_20180112;
+
+/*!
+ * @typedef img4_custom_tag_handler_t
+ * A handler for a tag unrecognized by the implementation.
+ *
+ * @param tag
+ * The FourCC tag.
+ *
+ * @param ctx
+ * The user-provided context pointer given to either
+ * {@link img4_get_trusted_payload} or
+ * {@link img4_get_trusted_external_payload}.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef errno_t (*img4_custom_tag_handler_t)(
+       img4_tag_t tag,
+       img4_section_t section,
+       void *ctx);
+
+/*!
+ * @typedef img4_custom_tag_t
+ * A type describing a custom tag and its handler.
+ *
+ * @property i4ct_tag
+ * The FourCC tag.
+ *
+ * @property i4ct_section
+ * The section in which the tag is expected. If {@link IMG4_SECTION_OBJECT} is
+ * given, the object corresponding to the tag given to
+ * {@link img4_get_trusted_payload} or {@link img4_get_trusted_external_payload}
+ * will be consulted for the tag.
+ *
+ * @property i4ct_handler
+ * The handler for the tag.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef struct _img4_custom_tag {
+       img4_tag_t i4ct_tag;
+       img4_section_t i4ct_section;
+       img4_custom_tag_handler_t i4ct_handler;
+} img4_custom_tag_t;
+
+/*!
+ * @typedef img4_destructor_t
+ * A type describing a destructor routine for an Image4 object.
+ *
+ * @param ptr
+ * A pointer to the buffer to dispose of.
+ *
+ * @param len
+ * The length of the buffer.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef void (*img4_destructor_t)(
+       void *ptr,
+       size_t len);
+
+/*!
+ * @typedef img4_flags_t
+ * A flagset modifying the behavior of an {@link img4_t}.
+ *
+ * @const I4F_INIT
+ * No flags set. This value is suitable for initialization purposes.
+ *
+ * @const I4F_TRUST_MANIFEST
+ * Causes the implementation to bypass trust evaluation for the manifest, i.e.
+ * it will not verify that a manifest has been signed by Apple before trusting
+ * it.
+ *
+ * This option is for testing purposes only and is not respected on the RELEASE
+ * variant of the implementation.
+ *
+ * @const I4F_FORCE_MIXNMATCH
+ * Causes the implementation to bypass mix-n-match policy evaluation and always
+ * allow mix-n-match, irrespective of the previous boot stage's conclusion or
+ * manifest policy.
+ *
+ * This option is for testing purposes only and is not respected on the RELEASE
+ * variant of the implementation.
+ */
+OS_ENUM(img4_flags, uint64_t,
+       I4F_INIT = 0,
+       I4F_TRUST_MANIFEST = (1 << 0),
+       I4F_FORCE_MIXNMATCH = (1 << 1),
+) IMG4_API_AVAILABLE_20180112;
+
+#if TARGET_OS_OSX || defined(PLATFORM_MacOSX)
+typedef char _img4_opaque_data_64[656];
+typedef char _img4_opaque_data_32[476];
+#elif TARGET_OS_IOS || defined(PLATFORM_iPhoneOS)
+typedef char _img4_opaque_data_64[656];
+typedef char _img4_opaque_data_32[476];
+#elif TARGET_OS_WATCH || defined(PLATFORM_WatchOS)
+typedef char _img4_opaque_data_64[656];
+typedef char _img4_opaque_data_32[488];
+#elif TARGET_OS_TV || defined(PLATFORM_tvOS) || defined(PLATFORM_AppleTVOS)
+typedef char _img4_opaque_data_64[656];
+typedef char _img4_opaque_data_32[476];
+#elif TARGET_OS_BRIDGE || defined(PLATFORM_BridgeOS)
+typedef char _img4_opaque_data_64[656];
+typedef char _img4_opaque_data_32[476];
+#else
+#error "Unsupported platform"
+#endif
+
+/*!
+ * @typedef img4_t
+ * An opaque structure representing Image4 data. The Image4 data must contain a
+ * manifest and may optionally contain a payload. Neither this type nor the
+ * APIs which manipulate it are thread-safe.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef struct _img4 {
+#if __ILP64__ || __LP64__
+       _img4_opaque_data_64 __opaque;
+#else
+       _img4_opaque_data_32 __opaque;
+#endif
+} img4_t;
+
+#if TARGET_OS_OSX  || defined(PLATFORM_MacOSX)
+typedef char _img4_payload_opaque_data_64[488];
+typedef char _img4_payload_opaque_data_32[316];
+#elif TARGET_OS_IOS || defined(PLATFORM_iPhoneOS)
+typedef char _img4_payload_opaque_data_64[488];
+typedef char _img4_payload_opaque_data_32[316];
+#elif TARGET_OS_WATCH || defined(PLATFORM_WatchOS)
+typedef char _img4_payload_opaque_data_64[488];
+typedef char _img4_payload_opaque_data_32[316];
+#elif TARGET_OS_TV || defined(PLATFORM_tvOS) || defined(PLATFORM_AppleTVOS)
+typedef char _img4_payload_opaque_data_64[488];
+typedef char _img4_payload_opaque_data_32[316];
+#elif TARGET_OS_BRIDGE || defined(PLATFORM_BridgeOS)
+typedef char _img4_payload_opaque_data_64[488];
+typedef char _img4_payload_opaque_data_32[316];
+#else
+#error "Unsupported platform"
+#endif
+
+/*!
+ * @typedef img4_payload_t
+ * An opaque structure describing Image4 payload data. Neither this type nor the
+ * APIs which manipulate it are thread-safe.
+ */
+IMG4_API_AVAILABLE_20180112
+typedef struct _img4_payload {
+#if __ILP64__ || __LP64__
+       _img4_payload_opaque_data_64 __opaque;
+#else
+       _img4_payload_opaque_data_32 __opaque;
+#endif
+} img4_payload_t;
+
+#if !IMG4_PROJECT_BUILD
+#include <img4/environment.h>
+#include <img4/payload.h>
+#endif
+
+/*!
+ * @function img4_init
+ * Initializes an Image4.
+ *
+ * @param i4
+ * A pointer to the storage to initialize.
+ *
+ * @param flags
+ * Flags to modify initialization.
+ *
+ * @param bytes
+ * The Image4 data from which to initialize. If a destructor is provided,
+ * control of this buffer transfers to the Image4.
+ *
+ * @param len
+ * The length of the Image4 data.
+ *
+ * @param destructor
+ * A destructor for the Image4 data. May be NULL if the buffer does not require
+ * explicit deallocation (e.g. because the buffer is stack data).
+ *
+ * @result
+ * Upon success, zero is returned. The implementation may also return one of the
+ * following error codes directly:
+ *
+ *     [EILSEQ]     The data is not valid Image4 data
+ *     [EFTYPE]     The data does not contain an Image4 manifest
+ *
+ * @discussion
+ * The bytes given to this routine must represent an Image4 manifest. They may
+ * optionally also represent an Image4 payload.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3
+errno_t
+img4_init(img4_t *i4, img4_flags_t flags, const uint8_t *bytes, size_t len,
+               img4_destructor_t destructor);
+
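A minimal usage sketch (editorial, not part of the header): a caller that retains ownership of the manifest buffer may pass a NULL destructor, per the documentation above. The wrapper name and the origin of the buffer are assumptions.

    static errno_t
    example_img4_setup(img4_t *i4, const uint8_t *manifest_bytes, size_t manifest_len)
    {
        /* NULL destructor: the caller keeps ownership of manifest_bytes,
         * so destroying the img4_t will not attempt to free it. */
        return img4_init(i4, I4F_INIT, manifest_bytes, manifest_len, NULL);
    }
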
+/*!
+ * @function img4_set_custom_tag_handler
+ * Sets custom tag handlers for an Image4. These handlers are invoked during
+ * trust evaluation of the Image4.
+ *
+ * @param i4
+ * The Image4 to modify.
+ *
+ * @param tags
+ * An array of custom tag structures which specify the custom tags expected.
+ * This must be constant storage. Passing heap or stack storage will result in
+ * undefined behavior.
+ *
+ * @param tags_cnt
+ * The number of items in the {@link tags} array.
+ *
+ * @discussion
+ * Invocations of custom tag handlers occur during trust evaluation. You should
+ * not assume that the Image4 is trusted within the scope of a custom tag
+ * handler. Trustworthiness can only be determined by consulting the return
+ * value of {@link img4_get_trusted_payload} or
+ * {@link img4_get_trusted_external_payload}.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_NONNULL1 OS_NONNULL2
+void
+img4_set_custom_tag_handler(img4_t *i4,
+               const img4_custom_tag_t *tags, size_t tags_cnt);
+
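A hedged sketch of registering a custom tag handler. The lowercase tag 'love', the handler body, and the wrapper names are placeholders; the tags array is placed in constant storage as required above.

    static errno_t
    example_tag_handler(img4_tag_t tag, img4_section_t section, void *ctx)
    {
        (void)tag; (void)section; (void)ctx;
        return 0; /* accept the tag */
    }

    /* Constant storage, as required; 'love' is a placeholder lowercase FourCC. */
    static const img4_custom_tag_t example_tags[] = {
        { 'love', IMG4_SECTION_MANIFEST, example_tag_handler },
    };

    /* img4_set_custom_tag_handler(&i4, example_tags, 1); */
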
+/*!
+ * @function img4_get_trusted_payload
+ * Obtains the trusted payload bytes from the Image4.
+ *
+ * @param i4
+ * The Image4 to query.
+ *
+ * @param tag
+ * The tag for the payload to obtain.
+ *
+ * @param env
+ * The environment against which to validate the Image4.
+ *
+ * @param ctx
+ * The context pointer to pass to the routines defined in the environment (if
+ * a custom environment was passed) and to any custom tag handlers.
+ *
+ * @param bytes
+ * A pointer to the storage where the pointer to the payload buffer will be
+ * written on success.
+ *
+ * @param len
+ * A pointer to the storage where the length of the payload buffer will be
+ * written on success.
+ *
+ * @result
+ * Upon success, zero is returned. The implementation may also return one of the
+ * following error codes directly:
+ *
+ *     [ENOENT]     The Image4 does not contain a payload for the specified tag
+ *     [EAUTH]      The Image4 manifest was not authentic
+ *     [EACCES]     The environment given does not satisfy the manifest
+ *                  constraints
+ *     [EACCES]     The environment and manifest do not agree on a digest
+ *                  algorithm
+ *     [EILSEQ]     The payload for the given tag does not match its description
+ *                  in the manifest
+ *     [EIO]        The payload could not be fetched
+ *
+ * Additionally, errors from the routines specified in the
+ * {@link img4_environment_t} may be returned.
+ *
+ * @discussion
+ * This routine will perform the following validation:
+ *
+ *     1. Validate that the Image4 manifest is authentic (i.e. was signed by
+ *        Apple)
+ *     2. Validate that the given environment satisfies the constraints in the
+ *        manifest
+ *     3. Validate that the measurement of the payload for the given tag matches
+ *        the measurement in the manifest
+ *
+ * If any one of these validation checks fails, the payload is considered
+ * untrustworthy and is not returned.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL5 OS_NONNULL6
+errno_t
+img4_get_trusted_payload(img4_t *i4, img4_tag_t tag,
+               const img4_environment_t *env, void *ctx,
+               const uint8_t **bytes, size_t *len);
+
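A sketch of the validate-then-use flow documented above. It assumes the host's default environment, that the multi-character constant 'krnl' is an acceptable way to spell the kernelcache FourCC, and that no context pointer is needed; error handling is left to the caller.

    static errno_t
    example_get_kernelcache(img4_t *i4, const uint8_t **bytes, size_t *len)
    {
        /* IMG4_ENVIRONMENT_PLATFORM resolves the host environment;
         * ctx is unused because no custom environment or tags are set. */
        return img4_get_trusted_payload(i4, 'krnl', IMG4_ENVIRONMENT_PLATFORM,
                NULL, bytes, len);
    }
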
+/*!
+ * @function img4_get_trusted_external_payload
+ * Obtains the trusted payload bytes from the external Image4 payload after
+ * validating them against the object description in the Image4's manifest.
+ *
+ * @param i4
+ * The Image4 to query.
+ *
+ * @param payload
+ * The payload to validate.
+ *
+ * @param env
+ * The environment against which to validate the Image4.
+ *
+ * @param ctx
+ * The context pointer to pass to the routines defined in the environment and to
+ * any custom tag handlers.
+ *
+ * @param bytes
+ * A pointer to the storage where the pointer to the payload buffer will be
+ * written on success.
+ *
+ * @param len
+ * A pointer to the storage where the length of the payload buffer will be
+ * written on success.
+ *
+ * @result
+ * Upon success, zero is returned. The implementation may also return one of the
+ * following error codes directly:
+ *
+ *     [ENOENT]     The Image4 does not contain an object describing the given
+ *                  payload
+ *     [EAUTH]      The Image4 manifest was not authentic
+ *     [EACCES]     The environment given does not satisfy the manifest
+ *                  constraints
+ *     [EACCES]     The environment and manifest do not agree on a digest
+ *                  algorithm
+ *     [EILSEQ]     The payload for the given tag does not match its description
+ *                  in the manifest
+ *     [EIO]        The payload could not be fetched
+ *
+ * Otherwise, an error from the underlying Image4 implementation will be
+ * returned.
+ *
+ * @discussion
+ * This routine performs the same validation steps as
+ * {@link img4_get_trusted_payload}.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL2
+errno_t
+img4_get_trusted_external_payload(img4_t *i4, img4_payload_t *payload,
+               const img4_environment_t *env, void *ctx,
+               const uint8_t **bytes, size_t *len);
+
+/*!
+ * @function img4_get_entitlement_bool
+ * Queries the Image4 manifest for a Boolean entitlement value.
+ *
+ * @param i4
+ * The Image4 to query.
+ *
+ * @param entitlement
+ * The tag for the entitlement to query.
+ *
+ * @result
+ * The Boolean value of the entitlement. If the entitlement was not present,
+ * false is returned. If the entitlement was present but did not have a Boolean
+ * value, false is returned.
+ *
+ * @discussion
+ * This routine does not trigger validation of the Image4. Therefore the
+ * result of this routine cannot be used to confer trust without also having
+ * obtained a valid payload.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1
+bool
+img4_get_entitlement_bool(img4_t *i4, img4_tag_t entitlement);
+
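For illustration only: querying a hypothetical, non-reserved (lowercase) entitlement tag. Per the discussion above, the result by itself does not establish trust.

    static bool
    example_debug_entitled(img4_t *i4)
    {
        /* 'dbgx' is a hypothetical entitlement tag used only for this sketch. */
        return img4_get_entitlement_bool(i4, 'dbgx');
    }
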
+/*!
+ * @function img4_get_object_entitlement_bool
+ * Queries the specified object in the Image4 manifest for a Boolean entitlement
+ * value.
+ *
+ * @param i4
+ * The Image4 to query.
+ *
+ * @param object
+ * The tag for the object to query.
+ *
+ * @param entitlement
+ * The tag for the entitlement to query.
+ *
+ * @result
+ * The Boolean value of the entitlement. If the entitlement was not present,
+ * false is returned. If the entitlement was present but did not have a Boolean
+ * value, false is returned. If the object specified was not present, false is
+ * returned.
+ *
+ * @discussion
+ * See discussion for {@link img4_get_entitlement_bool}.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1
+bool
+img4_get_object_entitlement_bool(img4_t *i4, img4_tag_t object,
+               img4_tag_t entitlement);
+
+/*!
+ * @function img4_destroy
+ * Destroys an Image4 and disposes of associated resources.
+ *
+ * @param i4
+ * The Image4 to destroy.
+ *
+ * @discussion
+ * The destructor passed to {@link img4_init} is called as a result of this
+ * routine, if any was set.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_NONNULL1
+void
+img4_destroy(img4_t *i4);
+
+__END_DECLS;
+
+#endif // __IMG4_H
diff --git a/EXTERNAL_HEADERS/img4/payload.h b/EXTERNAL_HEADERS/img4/payload.h
new file mode 100644 (file)
index 0000000..5a3ba81
--- /dev/null
@@ -0,0 +1,70 @@
+/*!
+ * @header
+ * Image4 payload interfaces. These interfaces provide a lightweight type for
+ * working with an Image4 payload that is described by a separate manifest (e.g.
+ * a .im4p file whose contents are described by an object in a manifest from a
+ * .im4m file).
+ *
+ * No direct access is provided to the raw payload bytes encapsulated by the
+ * Image4 payload by design. The intent is that in order to access the raw
+ * bytes, the payload object must be validated against a manifest object using
+ * the {@link img4_get_trusted_external_payload} interface.
+ */
+#ifndef __IMG4_PAYLOAD_H
+#define __IMG4_PAYLOAD_H
+
+#ifndef __IMG4_INDIRECT
+#error "Please #include <img4/img4.h> instead of this file directly"
+#endif // __IMG4_INDIRECT
+
+/*!
+ * @function img4_payload_init
+ *
+ * @param i4p
+ * A pointer to the payload object to initialize.
+ *
+ * @param tag
+ * The expected tag for the payload.
+ *
+ * @param bytes
+ * The buffer containing the Image4 payload.
+ *
+ * @param len
+ * The length of the buffer.
+ *
+ * @param destructor
+ * A pointer to a routine to dispose of the buffer. May be NULL if the buffer
+ * does not require explicit disposal (e.g. the buffer is stack memory).
+ *
+ * @result
+ * Upon success, zero is returned. Otherwise, one of the following error codes:
+ *
+ *     [EILSEQ]     The data is not valid Image4 data
+ *     [EFTYPE]     The data does not contain an Image4 payload
+ *     [ENOENT]     The bytes do not contain a payload for the specified tag
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_WARN_RESULT OS_NONNULL1 OS_NONNULL3 OS_NONNULL5
+errno_t
+img4_payload_init(img4_payload_t *i4p, img4_tag_t tag,
+               const uint8_t *bytes, size_t len, img4_destructor_t destructor);
+
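A sketch pairing img4_payload_init with img4_get_trusted_external_payload from img4.h. It assumes the caller owns the .im4p buffer (hence the NULL destructor) and that the returned bytes reference that caller-owned buffer; the wrapper name is illustrative.

    static errno_t
    example_validate_im4p(img4_t *manifest, img4_tag_t tag,
            const uint8_t *im4p, size_t im4p_len,
            const uint8_t **bytes, size_t *len)
    {
        img4_payload_t i4p;
        errno_t err;

        err = img4_payload_init(&i4p, tag, im4p, im4p_len, NULL);
        if (err) {
            return err;
        }
        err = img4_get_trusted_external_payload(manifest, &i4p,
                IMG4_ENVIRONMENT_PLATFORM, NULL, bytes, len);
        /* NULL destructor above, so this releases only internal resources. */
        img4_payload_destroy(&i4p);
        return err;
    }
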
+/*!
+ * @function img4_payload_destroy
+ * Disposes of the resources associated with the payload object.
+ *
+ * @param i4p
+ * The payload object of which to dispose.
+ *
+ * @discussion
+ * This routine does not deallocate the storage for the payload object itself,
+ * only the associated resources. This routine will cause the destructor given
+ * in {@link img4_payload_init} to be called, if any.
+ */
+IMG4_API_AVAILABLE_20180112
+OS_EXPORT OS_NONNULL1
+void
+img4_payload_destroy(img4_payload_t *i4p);
+
+#endif // __IMG4_PAYLOAD_H
+
diff --git a/EXTERNAL_HEADERS/ptrauth.h b/EXTERNAL_HEADERS/ptrauth.h
new file mode 100644 (file)
index 0000000..b6db0fb
--- /dev/null
@@ -0,0 +1,338 @@
+/*===---- ptrauth.h - Pointer authentication -------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PTRAUTH_H
+#define __PTRAUTH_H
+
+#include <stdint.h>
+
+typedef enum {
+  ptrauth_key_asia = 0,
+  ptrauth_key_asib = 1,
+  ptrauth_key_asda = 2,
+  ptrauth_key_asdb = 3,
+
+  /* A process-independent key which can be used to sign code pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_independent_code = ptrauth_key_asia,
+
+  /* A process-specific key which can be used to sign code pointers.
+     Signing and authenticating with this key is enforced even in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_dependent_code = ptrauth_key_asib,
+
+  /* A process-independent key which can be used to sign data pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_independent_data = ptrauth_key_asda,
+
+  /* A process-specific key which can be used to sign data pointers.
+     Signing and authenticating with this key is a no-op in processes
+     which disable ABI pointer authentication. */
+  ptrauth_key_process_dependent_data = ptrauth_key_asdb,
+
+  /* The key used to sign C function pointers.
+     The extra data is always 0. */
+  ptrauth_key_function_pointer = ptrauth_key_process_independent_code,
+
+  /* The key used to sign return addresses on the stack.
+     The extra data is based on the storage address of the return address.
+     On ARM64, that is always the storage address of the return address plus 8
+     (or, in other words, the value of the stack pointer on function entry) */
+  ptrauth_key_return_address = ptrauth_key_process_dependent_code,
+
+  /* The key used to sign frame pointers on the stack.
+     The extra data is based on the storage address of the frame pointer.
+     On ARM64, that is always the storage address of the frame pointer plus 16
+     (or, in other words, the value of the stack pointer on function entry) */
+  ptrauth_key_frame_pointer = ptrauth_key_process_dependent_data,
+
+  /* The key used to sign block function pointers, including:
+       invocation functions,
+       block object copy functions,
+       block object destroy functions,
+       __block variable copy functions, and
+       __block variable destroy functions.
+     The extra data is always the address at which the function pointer
+     is stored.
+
+     Note that block object pointers themselves (i.e. the direct
+     representations of values of block-pointer type) are not signed. */
+  ptrauth_key_block_function = ptrauth_key_asia,
+
+  /* The key used to sign C++ v-table pointers.
+     The extra data is always 0. */
+  ptrauth_key_cxx_vtable_pointer = ptrauth_key_asda,
+
+  /* Other pointers signed under the ABI use private ABI rules. */
+
+} ptrauth_key;
+
+/* An integer type of the appropriate size for an extra-data argument. */
+typedef uintptr_t ptrauth_extra_data_t;
+
+/* An integer type of the appropriate size for a generic signature. */
+typedef uintptr_t ptrauth_generic_signature_t;
+
+/* A signed pointer value embeds the original pointer together with
+   a signature that attests to the validity of that pointer.  Because
+   this signature must use only "spare" bits of the pointer, a
+   signature's validity is probabilistic in practice: it is unlikely
+   but still plausible that an invalidly-derived signature will
+   somehow equal the correct signature and therefore successfully
+   authenticate.  Nonetheless, this scheme provides a strong degree
+   of protection against certain kinds of attacks. */
+
+/* Authenticating a pointer that was not signed with the given key
+   and extra-data value will (likely) fail.  However, an
+   authentication failure will not lead immediately to a trap.
+   Instead, it will yield a value which is guaranteed to trap
+   if actually dereferenced. */
+
+/* The null function pointer is always the all-zero bit pattern.
+   Signing an all-zero bit pattern will embed a (likely) non-zero
+   signature in the result, and so the result will not seem to be
+   a null function pointer.  Authenticating this value will yield
+   a null function pointer back.  However, authenticating an
+   all-zero bit pattern will probably fail, because the
+   authentication will expect a (likely) non-zero signature to be
+   embedded in the value.
+
+   Because of this, if a pointer may validly be null, you should
+   check for null before attempting to authenticate it. */
+
+#ifdef __PTRAUTH_INTRINSICS__
+
+/* Strip the signature from a value without authenticating it.
+
+   If the value is a function pointer, the result will not be a
+   legal function pointer because of the missing signature, and
+   attempting to call it will result in an authentication failure.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The result will have the same type as the original value. */
+#define ptrauth_strip(__value, __key) \
+  __builtin_ptrauth_strip(__value, __key)
+
+/* Blend a pointer and a small integer to form a new extra-data
+   discriminator.  Not all bits of the inputs are guaranteed to
+   contribute to the result.
+
+   On ARM64, only the low 16 bits of the integer will be considered.
+
+   For the purposes of ptrauth_sign_constant, the result of calling
+   this function is considered a constant expression if the arguments
+   are constant.  Some restrictions may be imposed on the pointer.
+
+   The first argument must be an expression of pointer type.
+   The second argument must be an expression of integer type.
+   The result will have type uintptr_t. */
+#define ptrauth_blend_discriminator(__pointer, __integer) \
+  __builtin_ptrauth_blend_discriminator(__pointer, __integer)
+
+/* Add a signature to the given pointer value using a specific key,
+   using the given extra data as a salt to the signing process.
+
+   The value must be a constant expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be a constant expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This is a constant expression if the extra data is an integer or
+   null pointer constant. */
+#define ptrauth_sign_constant(__value, __key, __data) \
+  __builtin_ptrauth_sign_constant(__value, __key, __data)
+
+/* Add a signature to the given pointer value using a specific key,
+   using the given extra data as a salt to the signing process.
+
+   This operation does not authenticate the original value and is
+   therefore potentially insecure if an attacker could possibly
+   control that value.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value. */
+#define ptrauth_sign_unauthenticated(__value, __key, __data) \
+  __builtin_ptrauth_sign_unauthenticated(__value, __key, __data)
+
+/* Authenticate a pointer using one scheme and resign it using another.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed.
+
+   Do not pass a null pointer to this function. A null pointer
+   will not successfully authenticate. */
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) \
+  __builtin_ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data)
+
+/* Authenticate a pointer using one scheme and resign it as a C
+   function pointer.
+
+   If the result is subsequently authenticated using the new scheme, that
+   authentication is guaranteed to fail if and only if the initial
+   authentication failed.
+
+   The value must be an expression of function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   This operation is guaranteed to not leave the intermediate value
+   available for attack before it is re-signed. Additionally, if this
+   expression is used syntactically as the function expression in a
+   call, only a single authentication will be performed. */
+#define ptrauth_auth_function(__value, __old_key, __old_data) \
+  ptrauth_auth_and_resign(__value, __old_key, __old_data, ptrauth_key_function_pointer, 0)
+
+/* Authenticate a data pointer.
+
+   The value must be an expression of non-function pointer type.
+   The key must be a constant expression of type ptrauth_key.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have the same type as the original value.
+
+   If the authentication fails, dereferencing the resulting pointer
+   will fail. */
+#define ptrauth_auth_data(__value, __old_key, __old_data) \
+  __builtin_ptrauth_auth(__value, __old_key, __old_data)
+
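A hedged sketch of signing and later authenticating a data pointer with the intrinsics above. The struct layout, the 0x1234 tag, and the null-handling policy are assumptions; the fallback macros below make the same code compile as plain stores and loads when the intrinsics are unavailable.

    struct example_slot {
        void *target;
    };

    static void
    example_slot_store(struct example_slot *s, void *value)
    {
        /* Store null unsigned so the null check in the load path stays valid. */
        if (value == 0) {
            s->target = 0;
            return;
        }
        /* Tie the signature to the storage address blended with a constant tag. */
        s->target = ptrauth_sign_unauthenticated(value,
                ptrauth_key_process_independent_data,
                ptrauth_blend_discriminator(&s->target, 0x1234));
    }

    static void *
    example_slot_load(struct example_slot *s)
    {
        /* Per the note above, check for null before authenticating. */
        if (s->target == 0)
            return 0;
        return ptrauth_auth_data(s->target,
                ptrauth_key_process_independent_data,
                ptrauth_blend_discriminator(&s->target, 0x1234));
    }
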
+/* Return an extra-discriminator value which can validly be used
+   as the second argument to ptrauth_blend_discriminator or the
+   third argument to the __ptrauth qualifier.
+
+   The argument must be a string literal.
+   A call to this function is an integer constant expression. */
+#define ptrauth_string_discriminator(__string) \
+  __builtin_ptrauth_string_discriminator(__string)
+
+/* Compute a full pointer-width generic signature for the given
+   value, using the given data as a salt.
+
+   This generic signature is process-independent, but may not be
+   consistent across reboots.
+
+   This can be used to validate the integrity of arbitrary data
+   by storing a signature for that data together with it.  Because
+   the signature is pointer-sized, if the stored signature matches
+   the result of re-signing the current data, a match provides very
+   strong evidence that the data has not been corrupted.
+
+   The value must be an expression of pointer or integer type; if
+   an integer, it will be coerced to uintptr_t.
+   The extra data must be an expression of pointer or integer type;
+   if an integer, it will be coerced to ptrauth_extra_data_t.
+   The result will have type ptrauth_generic_signature_t.
+
+   This operation will compute a meaningful signature even in processes
+   which disable ABI pointer authentication. */
+#define ptrauth_sign_generic_data(__value, __data) \
+  __builtin_ptrauth_sign_generic_data(__value, __data)
+
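A sketch of the integrity-check pattern described above: store a generic signature alongside the data and recompute it on read. The record layout and the choice to salt with the record's address are assumptions of this example.

    struct example_record {
        uint64_t value;
        ptrauth_generic_signature_t sig;
    };

    static void
    example_record_seal(struct example_record *r, uint64_t value)
    {
        r->value = value;
        /* Salt with the record address so a copy elsewhere will not verify. */
        r->sig = ptrauth_sign_generic_data(r->value, r);
    }

    static int
    example_record_verify(const struct example_record *r)
    {
        return r->sig == ptrauth_sign_generic_data(r->value, r);
    }
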
+
+/* Define some standard __ptrauth qualifiers used in the ABI. */
+#define __ptrauth_function_pointer            \
+  __ptrauth(ptrauth_key_function_pointer,0,0)
+#define __ptrauth_return_address              \
+  __ptrauth(ptrauth_key_return_address,1,0)
+#define __ptrauth_block_invocation_pointer    \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_block_copy_helper           \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_block_destroy_helper        \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_block_byref_copy_helper     \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_block_byref_destroy_helper  \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_objc_method_list_imp        \
+  __ptrauth(ptrauth_key_function_pointer,1,0)
+#define __ptrauth_cxx_vtable_pointer          \
+  __ptrauth(ptrauth_key_cxx_vtable_pointer,0,0)
+#define __ptrauth_cxx_vtt_vtable_pointer      \
+  __ptrauth(ptrauth_key_cxx_vtable_pointer,0,0)
+#define __ptrauth_swift_heap_object_destructor \
+  __ptrauth(ptrauth_key_function_pointer,1,0xbbbf)
+
+/* Some situations in the C++ and Swift ABIs use declaration-specific
+   or type-specific extra discriminators. */
+#define __ptrauth_cxx_virtual_function_pointer(__declkey) \
+  __ptrauth(ptrauth_key_function_pointer,1,__declkey)
+#define __ptrauth_swift_function_pointer(__typekey) \
+  __ptrauth(ptrauth_key_function_pointer,0,__typekey)
+#define __ptrauth_swift_class_method_pointer(__declkey) \
+  __ptrauth(ptrauth_key_function_pointer,1,__declkey)
+#define __ptrauth_swift_protocol_witness_function_pointer(__declkey) \
+  __ptrauth(ptrauth_key_function_pointer,1,__declkey)
+#define __ptrauth_swift_value_witness_function_pointer(__key) \
+  __ptrauth(ptrauth_key_function_pointer,1,__key)
+
+#else
+
+#define ptrauth_strip(__value, __key) __value
+#define ptrauth_blend_discriminator(__pointer, __integer) ((uintptr_t)0)
+#define ptrauth_sign_constant(__value, __key, __data) __value
+#define ptrauth_sign_unauthenticated(__value, __key, __data) __value
+#define ptrauth_auth_and_resign(__value, __old_key, __old_data, __new_key, __new_data) __value
+#define ptrauth_auth_function(__value, __old_key, __old_data) __value
+#define ptrauth_auth_data(__value, __old_key, __old_data) __value
+#define ptrauth_string_discriminator(__string) ((int)0)
+#define ptrauth_sign_generic_data(__value, __data) ((ptrauth_generic_signature_t)0)
+
+#define __ptrauth_function_pointer
+#define __ptrauth_return_address
+#define __ptrauth_block_invocation_pointer
+#define __ptrauth_block_copy_helper
+#define __ptrauth_block_destroy_helper
+#define __ptrauth_block_byref_copy_helper
+#define __ptrauth_block_byref_destroy_helper
+#define __ptrauth_objc_method_list_imp
+#define __ptrauth_cxx_vtable_pointer
+#define __ptrauth_cxx_vtt_vtable_pointer
+#define __ptrauth_swift_heap_object_destructor
+#define __ptrauth_cxx_virtual_function_pointer(__declkey)
+#define __ptrauth_swift_function_pointer(__typekey)
+#define __ptrauth_swift_class_method_pointer(__declkey)
+#define __ptrauth_swift_protocol_witness_function_pointer(__declkey)
+#define __ptrauth_swift_value_witness_function_pointer(__key)
+
+#endif /* __PTRAUTH_INTRINSICS__ */
+
+#endif /* __PTRAUTH_H */
index 1660223f5d5a0448a921997c57a5b8e5304c8a05..31de51ae8358fb8ee3b1b35d7c52f4b991736181 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -220,7 +220,7 @@ EXPINC_SUBDIRS_X86_64H = $(EXPINC_SUBDIRS)
 EXPINC_SUBDIRS_ARM = $(EXPINC_SUBDIRS)
 EXPINC_SUBDIRS_ARM64 = $(EXPINC_SUBDIRS)
 
-SETUP_SUBDIRS = SETUP san
+SETUP_SUBDIRS = SETUP osfmk san
 
 COMP_SUBDIRS_X86_64 = $(ALL_SUBDIRS)
 COMP_SUBDIRS_X86_64H = $(ALL_SUBDIRS)
@@ -240,7 +240,17 @@ endif # all other RC_ProjectName
 
 installapi_libkdd installhdrs_libkdd install_libkdd:
        cd libkdd; \
-               xcodebuild -target libkdd $(subst _libkdd,,$@)  \
+               xcodebuild -target Default $(subst _libkdd,,$@) \
+                       "SRCROOT=$(SRCROOT)/libkdd"             \
+                       "OBJROOT=$(OBJROOT)"                    \
+                       "SYMROOT=$(SYMROOT)"                    \
+                       "DSTROOT=$(DSTROOT)"                    \
+                       "SDKROOT=$(SDKROOT)"
+
+
+installapi_libkdd_tests installhdrs_libkdd_tests install_libkdd_tests:
+       cd libkdd; \
+               xcodebuild -target tests $(subst _libkdd_tests,,$@)     \
                        "SRCROOT=$(SRCROOT)/libkdd"             \
                        "OBJROOT=$(OBJROOT)"                    \
                        "SYMROOT=$(SYMROOT)"                    \
@@ -250,7 +260,7 @@ installapi_libkdd installhdrs_libkdd install_libkdd:
 
 installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host:
        cd libkdd; \
-               xcodebuild -target kdd.framework $(subst _libkdd_host,,$@)      \
+               xcodebuild -configuration ReleaseHost -target kdd.framework $(subst _libkdd_host,,$@)   \
                        "SRCROOT=$(SRCROOT)/libkdd"             \
                        "OBJROOT=$(OBJROOT)"                    \
                        "SYMROOT=$(SYMROOT)"                    \
@@ -265,3 +275,5 @@ installapi_libkdd_host installhdrs_libkdd_host install_libkdd_host:
 xnu_tests:
        $(MAKE) -C $(SRCROOT)/tools/tests       $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \
                SRCROOT=$(SRCROOT)/tools/tests
+       $(MAKE) -C $(SRCROOT)/tests     $(if $(filter -j,$(MAKEFLAGS)),,$(MAKEJOBS)) \
+               SRCROOT=$(SRCROOT)/tests
index dc1bbbae6aefe2ed7b421f7f97aa9a87ee08d482..0e9d6b70848df372e7f22dcc8acb4c048f0be577 100644 (file)
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@ What is XNU?
 ===========
 
 XNU kernel is part of the Darwin operating system for use in macOS and iOS operating systems. XNU is an acronym for X is Not Unix.
-XNU is a hybrid kernel combining the Mach kernel developed at Carnegie Mellon University with components from FreeBSD and C++ API for writing drivers called IOKit.
+XNU is a hybrid kernel combining the Mach kernel developed at Carnegie Mellon University with components from FreeBSD and C++ API for writing drivers called IOKit.
 XNU runs on x86_64 for both single processor and multi-processor configurations.
 
 XNU Source Tree
@@ -190,8 +190,8 @@ The header files in framework's `PrivateHeaders` are only available for ** Apple
 
 The directory containing the header file should have a Makefile that
 creates the list of files that should be installed at different locations.
-If you are adding first header file in a directory, you will need to
-create Makefile similar to xnu/bsd/sys/Makefile.
+If you are adding the first header file in a directory, you will need to
+create Makefile similar to `xnu/bsd/sys/Makefile`.
 
 Add your header file to the correct file list depending on where you want
 to install it. The default locations where the header files are installed
@@ -213,7 +213,13 @@ from each file list are -
        `$(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders`
 
 The Makefile combines the file lists mentioned above into different
-install lists which are used by build system to install the header files.
+install lists which are used by the build system to install the header files. There
+are two types of install lists: machine-dependent and machine-independent.
+These lists are indicated by the presence of `MD` and `MI` in the build
+setting, respectively. If your header is architecture-specific, then you should
+use a machine-dependent install list (e.g. `INSTALL_MD_LIST`). If your header
+should be installed for all architectures, then you should use a
+machine-independent install list (e.g. `INSTALL_MI_LIST`).
 
 If the install list that you are interested does not exist, create it
 by adding the appropriate file lists.  The default install lists, its
@@ -270,28 +276,53 @@ want to export a function only to kernel level but not user level.
 
  Some pre-defined macros and their descriptions are -
 
-    a. `PRIVATE` : If true, code is available to all of the xnu kernel and is
-       not available in kernel extensions and user level header files.  The
-       header files installed in all the paths described above in (1) will not
-       have code enclosed within this macro.
-
-    b. `KERNEL_PRIVATE` : If true, code is available to all of the xnu kernel and Apple
-        internal kernel extensions.
-
-    c. `BSD_KERNEL_PRIVATE` : If true, code is available to the xnu/bsd part of
-       the kernel and is not available to rest of the kernel, kernel extensions
-       and user level header files.  The header files installed in all the
-       paths described above in (1) will not have code enclosed within this macro.
-
-    d. `KERNEL` :  If true, code is available only in kernel and kernel
-       extensions and is not available in user level header files.  Only the
+    a. `PRIVATE` : If defined, enclosed definitions are considered System
+       Private Interfaces. These are visible within xnu and
+       exposed in user/kernel headers installed within the AppleInternal
+       "PrivateHeaders" sections of the System and Kernel frameworks.
+    b. `KERNEL_PRIVATE` : If defined, enclosed code is available to all of xnu
+       kernel and Apple internal kernel extensions and omitted from user
+       headers.
+    c. `BSD_KERNEL_PRIVATE` : If defined, enclosed code is visible exclusively
+       within the xnu/bsd module.
+    d. `MACH_KERNEL_PRIVATE`: If defined, enclosed code is visible exclusively
+       within the xnu/osfmk module.
+    e. `XNU_KERNEL_PRIVATE`: If defined, enclosed code is visible exclusively
+       within xnu.
+    f. `KERNEL` :  If defined, enclosed code is available within xnu and kernel
+       extensions and is not visible in user level header files.  Only the
        header files installed in following paths will have the code -
 
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/Headers
             $(DSTROOT)/System/Library/Frameworks/Kernel.framework/PrivateHeaders
 
-       you should check [Testing the kernel][] for details.
+Conditional compilation
+=======================
 
+`xnu` offers the following mechanisms for conditionally compiling code:
+
+    a. *CPU Characteristics* If the code you are guarding has specific
+    characterstics that will vary only based on the CPU architecture being
+    targeted, use this option. Prefer checking for features of the
+    architecture (e.g. `__LP64__`, `__LITTLE_ENDIAN__`, etc.).
+    b. *New Features* If the code you are guarding, when taken together,
+    implements a feature, you should define a new feature in `config/MASTER`
+    and use the resulting `CONFIG` preprocessor token (e.g. for a feature
+    named `config_virtual_memory`, check for `#if CONFIG_VIRTUAL_MEMORY`).
+    This practice ensures that existing features may be brought to other
+    platforms by simply changing a feature switch.
+    c. *Existing Features* You can use existing features if your code is
+    strongly tied to them (e.g. use `SECURE_KERNEL` if your code implements
+    new functionality that is exclusively relevant to the trusted kernel and
+    updates the definition/understanding of what being a trusted kernel means).
+
+It is recommended that you avoid compiling based on the target platform. `xnu`
+does not define the platform macros from `TargetConditionals.h`
+(`TARGET_OS_OSX`, `TARGET_OS_IOS`, etc.).
+
+
+There is a `TARGET_OS_EMBEDDED` macro, but this should be avoided as it is in
+general too broad a definition for most functionality.
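As an illustrative (non-normative) example, code gated on the `config_virtual_memory` feature named above would be compiled only when the corresponding switch is enabled in `config/MASTER`:

    #if CONFIG_VIRTUAL_MEMORY
    /* Declarations and code that exist only when the feature is configured. */
    extern void virtual_memory_feature_init(void);
    #endif /* CONFIG_VIRTUAL_MEMORY */
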
 
 How to add a new syscall
 ========================
index edb6dfaeaaf5e6698ba2c482240491a9463c9f48..7e0d49a2d10a6f9f639f72687a2af4a0d7964d12 100644 (file)
@@ -474,6 +474,10 @@ lookup_arch(const char *archstring)
        static const NXArchInfo archlist[] = {
                { "x86_64", 0x01000007 /* CPU_TYPE_X86_64 */, 3 /* CPU_SUBTYPE_X86_64_ALL */, NX_LittleEndian, NULL },
                { "x86_64h", 0x01000007 /* CPU_TYPE_X86_64 */, 8 /* CPU_SUBTYPE_X86_64_H */, NX_LittleEndian, NULL },
+               { "armv7", 12 /* CPU_TYPE_ARM */, 9 /* CPU_SUBTYPE_ARM_V7 */, NX_LittleEndian, NULL },
+               { "armv7s", 12 /* CPU_TYPE_ARM */, 11 /* CPU_SUBTYPE_ARM_V7S */, NX_LittleEndian, NULL },
+               { "armv7k", 12 /* CPU_TYPE_ARM */, 12 /* CPU_SUBTYPE_ARM_V7K */, NX_LittleEndian, NULL },
+               { "arm64", 0x0100000c /* CPU_TYPE_ARM64 */, 0 /* CPU_SUBTYPE_ARM64_ALL */, NX_LittleEndian, NULL },
        };
        unsigned long i;
 
index c0cdd42fda390b8c6d4c1d348e63bba5d2b8a0bd..f79dc7046f61288d4a3d927721ba3844e9772952 100644 (file)
@@ -19,6 +19,7 @@ INSTINC_SUBDIRS = \
        netkey \
        nfs \
        security \
+       pthread \
        sys \
        uuid \
        vfs
@@ -49,6 +50,7 @@ EXPINC_SUBDIRS = \
        netinet6 \
        netkey \
        security \
+       pthread \
        sys \
        uuid \
        vfs \
index 5a2e735cf6b9ff461fa5fddb6d1684acabf76a5b..7d03ebe75e7182a819f10e30388e8d269c09e67a 100644 (file)
@@ -79,7 +79,7 @@ _STRUCT_MCONTEXT64
 
 #ifndef _MCONTEXT_T
 #define _MCONTEXT_T
-#if defined(__LP64__)
+#if defined(__arm64__)
 typedef _STRUCT_MCONTEXT64     *mcontext_t;
 #define _STRUCT_MCONTEXT _STRUCT_MCONTEXT64
 #else
index eb577a43f66db11efedfeb0d20c13a2a3ea15fe6..e72118ebcf0793a4c5fe3254ff9e634deacd0263 100644 (file)
@@ -107,6 +107,7 @@ typedef struct fasttrap_machtp {
 #define FASTTRAP_T_ARM64_ADR                   36
 #define FASTTRAP_T_ARM64_PRFM                  37
 #define FASTTRAP_T_ARM64_EXCLUSIVE_MEM         38
+#define FASTTRAP_T_ARM64_RETAB                 39
 #endif
 
 #if defined (__arm__)                           
@@ -130,6 +131,8 @@ typedef struct fasttrap_machtp {
 #define FASTTRAP_FN_ARM 1
 #define FASTTRAP_FN_THUMB 2
 #define FASTTRAP_FN_USDT 3
+#define FASTTRAP_FN_ARM64 4
+#define FASTTRAP_FN_ARM64_32 5
 
 #define ARM_RM(x) ((x) & 0xF)
 #define ARM_RS(x) (((x) >> 8) & 0xF)
@@ -221,6 +224,9 @@ typedef struct fasttrap_machtp {
 
 #define FASTTRAP_ARM64_OP_MASK_EXCL_MEM                0x3f000000 /* Bits  to check for exclusive memory operation */
 #define FASTTRAP_ARM64_OP_VALUE_EXCL_MEM       0x08000000 /* Value to find */
+
+#define FASTTRAP_ARM64_OP_MASK_RETAB           0xfffffc1f /* Bits to check for retab Rt */
+#define FASTTRAP_ARM64_OP_VALUE_RETAB          0xd65f0c1f /* Value to find */
 #endif /* defined(__arm64__) */
 
 #ifdef __cplusplus
index 18906141cd84de6c2ff3aa2f478051b487ba298b..e84405b12e2acc1c98ca4eb985ea1052e8ca9b65 100644 (file)
@@ -128,10 +128,17 @@ typedef __int32_t         user32_ssize_t;
 typedef __int32_t              user32_long_t;
 typedef __uint32_t             user32_ulong_t;
 typedef __int32_t              user32_time_t;
-#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
-typedef __int64_t              user32_off_t;
+
+/*
+ * This alignment is required to ensure symmetry between userspace and kernelspace
+ * when the kernel is 64-bit and the user application is 32-bit. All currently
+ * supported ARM slices (arm64/armv7k/arm64_32) contain the same type alignment
+ * ABI so this alignment isn't needed for ARM.
+ */
+#if defined(__x86_64__)
+typedef __int64_t              user32_off_t __attribute__((aligned(4)));
 #else
-typedef __int64_t              user32_off_t  __attribute__((aligned(4)));
+typedef __int64_t              user32_off_t;
 #endif
 
 #endif /* KERNEL */
index 525363788fc8a18dea9f2417dd8207c2d3733e63..1f6b2476eb22eecb14ea80b215f8143c0ffe6658 100644 (file)
@@ -79,9 +79,9 @@
 /*
  * IPC types.
  */
-#define        AT_IPC_MSG      ((u_char)1)     /* Message IPC id. */
-#define        AT_IPC_SEM      ((u_char)2)     /* Semaphore IPC id. */
-#define        AT_IPC_SHM      ((u_char)3)     /* Shared mem IPC id. */
+#define        AT_IPC_MSG      ((unsigned char)1)      /* Message IPC id. */
+#define        AT_IPC_SEM      ((unsigned char)2)      /* Semaphore IPC id. */
+#define        AT_IPC_SHM      ((unsigned char)3)      /* Shared mem IPC id. */
 
 /*
  * Audit conditions.
 #define        A_SETCOND       38
 #define        A_GETSFLAGS     39
 #define        A_SETSFLAGS     40
+#define        A_GETCTLMODE    41
+#define        A_SETCTLMODE    42
+#define        A_GETEXPAFTER   43
+#define        A_SETEXPAFTER   44
 
 /*
  * Audit policy controls.
 #define        AU_IPv4         4
 #define        AU_IPv6         16
 
+/*
+ * Reserved audit class mask indicating which classes are unable to have
+ * events added or removed by unentitled processes.
+ */
+#define AU_CLASS_MASK_RESERVED 0x10000000
+
+/*
+ * Audit control modes
+ */
+#define AUDIT_CTLMODE_NORMAL ((unsigned char)1)
+#define AUDIT_CTLMODE_EXTERNAL ((unsigned char)2)
+
+/*
+ * Audit file expire_after op modes
+ */
+#define AUDIT_EXPIRE_OP_AND ((unsigned char)0)
+#define AUDIT_EXPIRE_OP_OR ((unsigned char)1)
+
 __BEGIN_DECLS
 
 typedef        uid_t           au_id_t;
@@ -175,6 +197,7 @@ typedef     u_int16_t       au_event_t;
 typedef        u_int16_t       au_emod_t;
 typedef        u_int32_t       au_class_t;
 typedef        u_int64_t       au_asflgs_t __attribute__ ((aligned (8)));
+typedef        unsigned char   au_ctlmode_t;
 
 struct au_tid {
        dev_t           port;
@@ -237,6 +260,13 @@ struct au_session {
 };
 typedef struct au_session       au_session_t;
 
+struct au_expire_after {
+       time_t age;             /* Age after which trail files should be expired */
+       size_t size;    /* Aggregate trail size when files should be expired */
+       unsigned char op_type; /* Operator used with the above values to determine when files should be expired */
+};
+typedef struct au_expire_after au_expire_after_t;
+
 /*
  * Contents of token_t are opaque outside of libbsm.
  */
index 71a51307ab98faa2b077ec9b70233eff4868ed89..c2103f32c595333b682518e01fdf7b135a8725f8 100644 (file)
@@ -75,6 +75,7 @@ typedef       struct au_record        au_record_t;
 #define        AUDIT_HEADER_SIZE       18
 #define        MAX_AUDIT_HEADER_SIZE   (5*sizeof(u_int32_t)+18)
 #define        AUDIT_TRAILER_SIZE      7
+#define        MAX_AUDIT_IDENTITY_SIZE 179
 
 /*
  * BSM token streams store fields in big endian byte order, so as to be
index 391425f912c57316c39f704087f0fa501b0fe259..31e6353d7cf7c2ddfc062c8a1e70378682094f78 100644 (file)
 #define        AUE_WATCHEVENT          AUE_NULL
 #define        AUE_WORKQOPEN           AUE_NULL
 #define        AUE_WORKQOPS            AUE_NULL
+#define        AUE_WORKLOOPCTL         AUE_NULL
 #define        AUE_PERSONA             AUE_NULL
 #define        AUE_USRCTL              AUE_NULL
 #define        AUE_NEXUS               AUE_NULL
index 2b6ae891a836a1c659d2b6ec9a270e752c2c7e63..bedcb800a5620a83f80061c03198e26dd182074f 100644 (file)
 #define        AUT_SOCKINET128         0x81            /* XXX */
 #define        AUT_SOCKUNIX            0x82            /* XXX */
 
+/* Apple specific tokens*/
+#define        AUT_IDENTITY            0xed
+#define        AUT_KRB5_PRINCIPAL      0xee
+#define        AUT_CERT_HASH           0xef
+
 /* print values for the arbitrary token */
 #define AUP_BINARY      0
 #define AUP_OCTAL       1
@@ -272,14 +277,21 @@ token_t   *au_to_subject64_ex(au_id_t auid, uid_t euid, gid_t egid, uid_t ruid,
 #if defined(_KERNEL) || defined(KERNEL)
 token_t        *au_to_exec_args(char *args, int argc);
 token_t        *au_to_exec_env(char *envs, int envc);
+token_t        *au_to_certificate_hash(char *hash, int hashc);
+token_t        *au_to_krb5_principal(char *principal, int princ);
 #else
 token_t        *au_to_exec_args(char **argv);
 token_t        *au_to_exec_env(char **envp);
+token_t        *au_to_certificate_hash(char **hash);
+token_t        *au_to_krb5_principal(char **principal);
 #endif
 token_t        *au_to_text(const char *text);
 token_t        *au_to_kevent(struct kevent *kev);
 token_t        *au_to_trailer(int rec_size);
 token_t        *au_to_zonename(const char *zonename);
+token_t        *au_to_identity(uint32_t signer_type, const char* signing_id,
+           u_char signing_id_trunc, const char* team_id, u_char team_id_trunc,
+           uint8_t* cdhash, uint16_t cdhash_len);
 
 /*
  * BSM library routines for converting between local and BSM constant spaces.
index afe23cf346d0e4a60088637c59d20f87c2bf71a9..c38c2ffb6fb8ada0f7b9d97dbe1331e7d85ff7a5 100644 (file)
@@ -160,6 +160,7 @@ OBJS_NO_CAST_ALIGN =                        \
                dtrace.o                \
                fasttrap.o              \
                fasttrap_isa.o          \
+               fbt.o                   \
                fbt_arm.o               \
                fbt_x86.o               \
                if_bond.o               \
@@ -228,6 +229,7 @@ $(foreach file,$(OBJS_NO_CAST_ALIGN),$(eval $(call add_perfile_cflags,$(file),-W
 OBJS_NO_PACKED_ADDRESS =    \
                ah_core.o           \
                ah_input.o          \
+               dlil.o              \
                esp_input.o         \
                esp_output.o        \
                frag6.o             \
@@ -242,6 +244,7 @@ OBJS_NO_PACKED_ADDRESS =    \
                ipsec.o             \
                mld6.o              \
                mptcp_opt.o         \
+               nat464_utils.o      \
                nd6.o               \
                nd6_nbr.o           \
                nd6_prproxy.o       \
index 4bf42f392f08a9a13ebe12ce49d67f5b2eb8c5f0..65e1393f680f243f336cc3d75847ba5d0739a785 100644 (file)
@@ -117,7 +117,6 @@ bsd/dev/dtrace/lockstat.c           optional config_dtrace
 bsd/dev/dtrace/dtrace_ptss.c           optional config_dtrace
 bsd/dev/dtrace/dtrace_subr.c           optional config_dtrace
 bsd/dev/dtrace/dtrace_glue.c           standard
-bsd/dev/dtrace/dtrace_alloc.c          optional config_dtrace
 bsd/dev/dtrace/blist.c                 optional config_dtrace
 bsd/dev/dtrace/fbt.c                   optional config_dtrace
 bsd/dev/dtrace/sdt.c                   optional config_dtrace
@@ -218,6 +217,7 @@ bsd/net/net_perf.c                  optional networking
 bsd/net/if_gif.c                       optional gif
 bsd/net/if_stf.c                       optional stf
 bsd/net/if_ports_used.c                        optional networking
+bsd/net/if_low_power_mode.c            optional networking
 bsd/net/kpi_interface.c                optional networking
 bsd/net/kpi_protocol.c         optional networking
 bsd/net/kpi_interfacefilter.c  optional networking
@@ -228,6 +228,7 @@ bsd/net/necp.c                              optional necp
 bsd/net/necp_client.c          optional necp
 bsd/net/network_agent.c                        optional networking
 bsd/net/if_pflog.c                     optional pflog
+bsd/net/nat464_utils.c                 optional networking
 bsd/net/pf.c                           optional pf
 bsd/net/pf_if.c                                optional pf
 bsd/net/pf_ioctl.c                     optional pf
@@ -380,6 +381,10 @@ bsd/security/audit/audit_session.c         standard
 bsd/security/audit/audit_syscalls.c            standard
 bsd/security/audit/audit_worker.c              optional config_audit
 
+bsd/pthread/pthread_shims.c            standard
+bsd/pthread/pthread_priority.c         standard
+bsd/pthread/pthread_workqueue.c                standard
+
 bsd/kern/bsd_init.c                    standard
 ./init_sysent.c                                standard
 bsd/kern/kdebug.c                      standard
@@ -469,7 +474,6 @@ bsd/kern/posix_shm.c                        standard
 bsd/kern/qsort.c                               standard
 bsd/kern/kpi_socket.c                  optional sockets
 bsd/kern/kpi_socketfilter.c            optional sockets
-bsd/kern/pthread_shims.c               standard
 bsd/kern/proc_info.c                   standard
 bsd/kern/process_policy.c              standard
 bsd/kern/kern_overrides.c              standard
@@ -503,4 +507,6 @@ bsd/miscfs/nullfs/null_subr.c       optional nullfs
 bsd/miscfs/nullfs/null_vfsops.c     optional nullfs
 bsd/miscfs/nullfs/null_vnops.c      optional nullfs
 
+bsd/tests/bsd_tests.c                  optional config_xnupost
+bsd/tests/pmap_test_sysctl.c           optional config_xnupost
 
index 64009971c44e6d971f8959bb4ed68ed672eaf5af..7761c03acbc1027e78507c3a28db5dad8d53a42a 100644 (file)
@@ -9,6 +9,7 @@ bsd/dev/arm/unix_signal.c       standard
 
 bsd/dev/arm64/cpu_in_cksum.s   standard
 
+
 bsd/dev/arm64/dtrace_isa.c     optional config_dtrace
 bsd/dev/arm64/dtrace_subr_arm.c        optional config_dtrace
 bsd/dev/arm64/fbt_arm.c                optional config_dtrace
index d38831ba3843458b27389777d3f0c34d90131e40..07397b4b8018b72e3ce7bc757a25b4caae1d3b31 100644 (file)
@@ -28,6 +28,7 @@
 
 #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from
                                         * mach/ppc/thread_status.h */
+#include <arm/caches_internal.h>
 #include <arm/proc_reg.h>
 
 #include <kern/thread.h>
@@ -175,12 +176,16 @@ uint64_t
 dtrace_getreg(struct regs * savearea, uint_t reg)
 {
        struct arm_saved_state *regs = (struct arm_saved_state *) savearea;
-       
+       if (regs == NULL) {
+               DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+               return (0);
+       }
        /* beyond register limit? */
        if (reg > ARM_SAVED_STATE32_COUNT - 1) {
                DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
                return (0);
        }
+
        return (uint64_t) ((unsigned int *) (&(regs->r)))[reg];
 }
 
@@ -629,3 +634,12 @@ dtrace_arm_condition_true(int cond, int cpsr)
 
        return taken;
 }
+
+void dtrace_flush_caches(void)
+{
+       /* TODO There were some problems with flushing just the cache line that had been modified.
+        * For now, we'll flush the entire cache, until we figure out how to flush just the patched block.
+        */
+       FlushPoU_Dcache();
+       InvalidatePoU_Icache();
+}
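
The new dtrace_flush_caches() centralizes the ARM cache maintenance that fasttrap previously did in its private patchInst()/flush_caches() helpers (removed in the next file). The ordering is the important part: clean the modified data-cache lines to the point of unification first, then invalidate the instruction cache so the next fetch of the patched address sees the new opcode. A minimal sketch of the expected calling pattern; uwrite() is the generic fasttrap user-memory writer from the diff, while the wrapper name itself is hypothetical:

/* Sketch only, not the shipped call site. */
static int
patch_and_sync(proc_t *p, uint32_t instr, user_size_t size, user_addr_t pc)
{
        if (uwrite(p, &instr, size, pc) != 0)   /* write the trap opcode */
                return -1;

        /* Clean the D-cache to the PoU, then invalidate the I-cache, in
         * that order, so the core cannot refetch the stale instruction. */
        dtrace_flush_caches();
        return 0;
}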
index d48b48a71bdbc91e6c5cdb2738912bf04d1eb3cb..07d41a2283ba759637b0c49bdd992575efbf7004 100644 (file)
@@ -113,16 +113,6 @@ extern int dtrace_decode_thumb(uint32_t instr);
 
 extern int dtrace_arm_condition_true(int cond, int cpsr);
 
-static
-void flush_caches(void)
-{
-       /* TODO There were some problems with flushing just the cache line that had been modified.
-        * For now, we'll flush the entire cache, until we figure out how to flush just the patched block.
-        */
-       FlushPoU_Dcache();
-       InvalidatePoU_Icache();
-}
-
 int
 fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp,
                         user_addr_t pc, fasttrap_probe_type_t type)
@@ -202,90 +192,6 @@ fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp,
        return (0);
 }
 
-// These are not exported from vm_map.h.
-extern kern_return_t vm_map_write_user(vm_map_t map, void *src_p, vm_map_address_t dst_addr, vm_size_t size);
-
-/* Patches the instructions. Almost like uwrite, but need special instructions on ARM to flush the caches. */
-static
-int patchInst(proc_t *p, void *buf, user_size_t len, user_addr_t a)
-{
-       kern_return_t ret;
-
-       ASSERT(p != NULL);
-       ASSERT(p->task != NULL);
-
-       task_t task = p->task;
-
-       /*
-        * Grab a reference to the task vm_map_t to make sure
-        * the map isn't pulled out from under us.
-        *
-        * Because the proc_lock is not held at all times on all code
-        * paths leading here, it is possible for the proc to have
-        * exited. If the map is null, fail.
-        */
-       vm_map_t map = get_task_map_reference(task);
-       if (map) {
-               /* Find the memory permissions. */
-               uint32_t nestingDepth=999999;
-               vm_region_submap_short_info_data_64_t info;
-               mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
-               mach_vm_address_t address = (mach_vm_address_t)a;
-               mach_vm_size_t sizeOfRegion = (mach_vm_size_t)len;
-
-               ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count);
-               if (ret != KERN_SUCCESS)
-                       goto done;
-
-               vm_prot_t reprotect;
-
-               if (!(info.protection & VM_PROT_WRITE)) {
-                       /* Save the original protection values for restoration later */
-                       reprotect = info.protection;
-                       if (info.max_protection & VM_PROT_WRITE) {
-                               /* The memory is not currently writable, but can be made writable. */
-                               /* Making it both writable and executable at the same time causes warning on embedded */
-                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE);
-                       } else {
-                               /*
-                                * The memory is not currently writable, and cannot be made writable. We need to COW this memory.
-                                *
-                                * Strange, we can't just say "reprotect | VM_PROT_COPY", that fails.
-                                */
-                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE);
-                       }
-
-                       if (ret != KERN_SUCCESS)
-                               goto done;
-
-               } else {
-                       /* The memory was already writable. */
-                       reprotect = VM_PROT_NONE;
-               }
-
-               ret = vm_map_write_user( map,
-                                        buf,
-                                        (vm_map_address_t)a,
-                                        (vm_size_t)len);
-
-               flush_caches();
-
-               if (ret != KERN_SUCCESS)
-                       goto done;
-
-               if (reprotect != VM_PROT_NONE) {
-                       ASSERT(reprotect & VM_PROT_EXECUTE);
-                       ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect);
-               }
-
-done:
-               vm_map_deallocate(map);
-       } else
-               ret = KERN_TERMINATED;
-
-       return (int)ret;
-}
-
 int
 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
 {
@@ -299,7 +205,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
                instr = FASTTRAP_ARM_INSTR;
        }
 
-       if (patchInst(p, &instr, size, tp->ftt_pc) != 0)
+       if (uwrite(p, &instr, size, tp->ftt_pc) != 0)
                return (-1);
 
        tp->ftt_installed = 1;
@@ -327,7 +233,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
                if (instr != FASTTRAP_ARM_INSTR)
                        goto end;
        }
-       if (patchInst(p, &tp->ftt_instr, size, tp->ftt_pc) != 0)
+       if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0)
                return (-1);
 
 end:
@@ -1154,7 +1060,7 @@ fasttrap_pid_probe(arm_saved_state_t *regs)
                                SET32(scratch+i, FASTTRAP_ARM_RET_INSTR); i += 4;
                        }
 
-                       if (patchInst(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) {
+                       if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) {
                                fasttrap_sigtrap(p, uthread, pc);
                                new_pc = pc;
                                break;
index c594f9c92c60c69cf2a1ebfd4ab0685257531b42..9205cfb210cd45a503448a6a50b64ed13552440f 100644 (file)
@@ -99,9 +99,6 @@ extern int                    fbt_probetab_mask;
 
 kern_return_t fbt_perfCallback(int, struct arm_saved_state *, __unused int, __unused int);
 
-static int fbt_uninstrumented_arm = 0;
-static const int fbt_log_uninstrumented = 0;
-
 extern int dtrace_arm_condition_true(int cond, int cpsr);
 
 
@@ -212,7 +209,7 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval)
                                }
                                CPU->cpu_dtrace_invop_underway = 0;
                        }
-               
+
                        /*
                                On other architectures, we return a DTRACE constant to let the callback function
                                know what was replaced. On the ARM, since the function prologue/epilogue machine code
@@ -256,7 +253,7 @@ fbt_perfCallback(
                                 );
 
                emul = dtrace_invop(regs->pc, (uintptr_t*) regs, regs->r[0]);
-               
+
                __asm__ volatile(
                        "Ldtrace_invop_callsite_post_label:\n"
                        ".data\n"
@@ -335,7 +332,7 @@ fbt_perfCallback(
 }
 
 void
-fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart)
+fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t *instrHigh)
 {
        unsigned int    j;
         int            doenable = 0;
@@ -344,11 +341,11 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        fbt_probe_t     *newfbt, *retfbt, *entryfbt;
        machine_inst_t *instr, *pushinstr = NULL, *limit, theInstr;
        int             foundPushLR, savedRegs;
-       
+
        /*
         * Guard against null symbols
         */
-       if (!symbolStart || !instrLow || !instrHigh) {
+       if (!symbolStart || !instrHigh || instrHigh < symbolStart) {
                kprintf("dtrace: %s has an invalid address\n", symbolName);
                return;
        }
@@ -360,7 +357,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        savedRegs = -1;
        limit = (machine_inst_t *)instrHigh;
        for (j = 0, instr = symbolStart, theInstr = 0;
-            (j < 8) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr)); j++, instr++)
+            (j < 8) && instr < instrHigh; j++, instr++)
        {
                theInstr = *instr;
                if (FBT_IS_THUMB_PUSH_LR(theInstr)) {
@@ -390,7 +387,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP);
        newfbt->fbtp_next = NULL;
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
-               
+
        if (thisid != 0) {
                /*
                 * The dtrace_probe previously existed, so we have to hook
@@ -432,7 +429,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        newfbt->fbtp_currentval = 0;
        newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
        fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;
-               
+
        if (doenable)
                fbt_enable(NULL, newfbt->fbtp_id, newfbt);
 
@@ -446,7 +443,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        doenable=0;
 
        thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
-               
+
        if (thisid != 0) {
                /* The dtrace_probe previously existed, so we have to
                 * find the end of the existing fbt chain.  If we find
@@ -501,7 +498,7 @@ again:
         * OK, it's an instruction.
         */
        theInstr = *instr;
-               
+
        /* Walked onto the start of the next routine? If so, bail out from this function */
        if (FBT_IS_THUMB_PUSH_LR(theInstr)) {
                if (!retfbt)
@@ -560,7 +557,7 @@ again:
         */
 
        newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP);
-       newfbt->fbtp_next = NULL;       
+       newfbt->fbtp_next = NULL;
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
 
        if (retfbt == NULL) {
@@ -593,89 +590,3 @@ again:
        goto again;
 }
 
-void
-fbt_provide_module_kernel_syms(struct modctl *ctl)
-{
-       kernel_mach_header_t            *mh;
-       struct load_command             *cmd;
-       kernel_segment_command_t        *orig_ts = NULL, *orig_le = NULL;
-       struct symtab_command           *orig_st = NULL;
-       kernel_nlist_t                  *sym = NULL;
-       char                            *strings;
-       uintptr_t                       instrLow, instrHigh;
-       char                            *modname;
-       unsigned int                    i;
-
-       mh = (kernel_mach_header_t *)(ctl->mod_address);
-       modname = ctl->mod_modname;
-       
-       /*
-        * Employees of dtrace and their families are ineligible.  Void
-        * where prohibited.
-        */
-
-       if (mh->magic != MH_MAGIC_KERNEL)
-               return;
-       
-       cmd = (struct load_command *) & mh[1];
-       for (i = 0; i < mh->ncmds; i++) {
-               if (cmd->cmd == LC_SEGMENT_KERNEL) {
-                       kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
-
-                       if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
-                               orig_ts = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
-                               orig_le = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, ""))
-                               orig_ts = orig_sg;      /* kexts have a single
-                                                        * unnamed segment */
-               } else if (cmd->cmd == LC_SYMTAB)
-                       orig_st = (struct symtab_command *) cmd;
-
-               cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);
-       }
-
-       if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
-               return;
-
-       sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
-       strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
-
-       /* Find extent of the TEXT section */
-       instrLow = (uintptr_t) orig_ts->vmaddr;
-       instrHigh = (uintptr_t) (orig_ts->vmaddr + orig_ts->vmsize);
-
-       for (i = 0; i < orig_st->nsyms; i++) {
-               uint8_t         n_type = sym[i].n_type & (N_TYPE | N_EXT);
-               char           *name = strings + sym[i].n_un.n_strx;
-
-               /* Check that the symbol is a global and that it has a name. */
-               if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
-                       continue;
-
-               if (0 == sym[i].n_un.n_strx)    /* iff a null, "", name. */
-                       continue;
-
-               /* Lop off omnipresent leading underscore. */
-               if (*name == '_')
-                       name += 1;
-
-
-               if (sym[i].n_sect == 1 && !(sym[i].n_desc & N_ARM_THUMB_DEF)) {
-                       /* A function but not a Thumb function */
-                       fbt_uninstrumented_arm++;
-                       if (fbt_log_uninstrumented)
-                               kprintf("dtrace: fbt: Skipping ARM mode function %s at %08x\n",name,(unsigned)sym[i].n_value);
-
-                       continue;
-               }
-
-                /*
-                * We're only blacklisting functions in the kernel for now.
-                */
-               if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
-                       continue;
-
-               fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value);
-       }
-}
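
With fbt_provide_module_kernel_syms() removed from the ARM file, fbt_provide_probe() now takes the symbol start and an upper bound directly, and rejects a NULL start or an end pointer that precedes it. A sketch, under assumptions, of how a shared symbol-table walker would drive the new signature; sym, strings, and text_end stand in for the nlist table, string table, and end of the __TEXT segment:

/* Sketch only: per-symbol call into the new interface. */
static void
provide_probes_for_symbols(struct modctl *ctl, const char *modname,
    kernel_nlist_t *sym, unsigned int nsyms, char *strings,
    machine_inst_t *text_end)
{
        for (unsigned int i = 0; i < nsyms; i++) {
                const char *name = strings + sym[i].n_un.n_strx;

                if (sym[i].n_un.n_strx == 0)
                        continue;               /* unnamed symbol */
                if (*name == '_')
                        name++;                 /* drop the leading underscore */

                /* fbt_provide_probe() walks from the symbol start toward
                 * text_end, stopping at the next prologue it recognizes. */
                fbt_provide_probe(ctl, modname, name,
                    (machine_inst_t *)sym[i].n_value, text_end);
        }
}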
index 312952ac948feeabfee2ed74502077c941195b12..2c27afaf212acba9c12756006b343118fb9c0e5e 100644 (file)
@@ -14,6 +14,7 @@
 #include       <mach/machine.h>
 #include       <kern/cpu_number.h>
 #include       <machine/exec.h>
+#include       <pexpert/arm64/board_config.h>
 
 #if __arm64__
 extern int bootarg_no64exec;   /* bsd_init.c */
@@ -49,7 +50,8 @@ int
 grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
 {
 #if __arm64__
-       cpu_subtype_t hostsubtype = (exectype & CPU_ARCH_ABI64) ? cpu_subtype() : cpu_subtype32();
+       cpu_subtype_t hostsubtype =
+               (exectype & CPU_ARCH_ABI64) ? cpu_subtype() : cpu_subtype32();
 #else
        cpu_subtype_t hostsubtype = cpu_subtype();
 #endif /* __arm64__ */
@@ -63,14 +65,14 @@ grade_binary(cpu_type_t exectype, cpu_subtype_t execsubtype)
                case CPU_SUBTYPE_ARM64_V8:
                        switch (execsubtype) {
                        case CPU_SUBTYPE_ARM64_V8:
-                               return 9;
+                               return 10;
                        case CPU_SUBTYPE_ARM64_ALL:
-                               return 8;
+                               return 9;
                        }
                        break;
 
+               } /* switch (hostsubtype) */
 
-               break;
 #else /* __arm64__ */
 
        case CPU_TYPE_ARM:
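
The grade bump (9 to 10 for an exact CPU_SUBTYPE_ARM64_V8 match, 8 to 9 for ARM64_ALL) keeps the relative ordering while leaving headroom for a more specific subtype to grade higher. A rough sketch of how exec-time slice selection consumes these grades; the slice struct and loop are illustrative, not the xnu fat-loader code:

struct slice { cpu_type_t cputype; cpu_subtype_t cpusubtype; };

/* Sketch: pick the best-graded slice of a fat binary. */
static int
pick_best_slice(const struct slice *arches, int narches)
{
        int best = -1, best_grade = 0;

        for (int i = 0; i < narches; i++) {
                int g = grade_binary(arches[i].cputype, arches[i].cpusubtype);
                if (g > best_grade) {   /* 0 means "cannot execute at all" */
                        best_grade = g;
                        best = i;
                }
        }
        /* An exact ARM64_V8 slice (grade 10) still beats ARM64_ALL (grade 9). */
        return best;
}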
index a1ee66f16231143c1f83b4d2aa184156efe659de..d97e80e219479a0ff821e3348be3d2921ca190ee 100644 (file)
@@ -7,6 +7,11 @@
 
 #include <machine/machine_routines.h>
 
+#include <mach/host_info.h>
+#include <mach/mach_host.h>
+#include <arm/cpuid.h>
+#include <libkern/libkern.h>
+
 extern int     trap_on_alignment_fault;
 extern uint64_t        wake_abstime;
 
@@ -58,3 +63,121 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime,
             0, 0, sysctl_wake_conttime, "I",
             "Continuous Time at the last wakeup");
 
+/*
+ * For source compatibility, here are some machdep.cpu mibs that
+ * use host_info() to simulate reasonable answers.
+ */
+
+SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
+       "CPU info");
+
+static int
+arm_host_info SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+
+       host_basic_info_data_t hinfo;
+       mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+#define BSD_HOST       1
+       kern_return_t kret = host_info((host_t)BSD_HOST,
+               HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
+       if (KERN_SUCCESS != kret)
+               return (EINVAL);
+
+       if (sizeof (uint32_t) != arg2)
+               panic("size mismatch");
+
+       uintptr_t woffset = (uintptr_t)arg1 / sizeof (uint32_t);
+       uint32_t datum = *(uint32_t *)(((uint32_t *)&hinfo) + woffset);
+       return (SYSCTL_OUT(req, &datum, sizeof (datum)));
+}
+
+/*
+ * machdep.cpu.cores_per_package
+ *
+ * x86: derived from CPUID data.
+ * ARM: how many physical cores we have in the AP; aka hw.physicalcpu_max
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, physical_cpu_max),
+       sizeof (integer_t),
+       arm_host_info, "I", "CPU cores per package");
+
+/*
+ * machdep.cpu.core_count
+ *
+ * x86: derived from CPUID data.
+ * ARM: # active physical cores in the AP; aka hw.physicalcpu
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, physical_cpu),
+       sizeof (integer_t),
+       arm_host_info, "I", "Number of enabled cores per package");
+
+/*
+ * machdep.cpu.logical_per_package
+ *
+ * x86: derived from CPUID data. Returns ENOENT if HTT bit not set, but
+ *      most x64 CPUs have that, so assume it's available.
+ * ARM: total # logical cores in the AP; aka hw.logicalcpu_max
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, logical_cpu_max),
+       sizeof (integer_t),
+       arm_host_info, "I", "CPU logical cpus per package");
+
+/*
+ * machdep.cpu.thread_count
+ *
+ * x86: derived from CPUID data.
+ * ARM: # active logical cores in the AP; aka hw.logicalcpu
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, logical_cpu),
+       sizeof (integer_t),
+       arm_host_info, "I", "Number of enabled threads per package");
+
+/*
+ * machdep.cpu.brand_string
+ *
+ * x86: derived from CPUID data.
+ * ARM: cons something up from the CPUID register. Could include cpufamily
+ *     here and map it to a "marketing" name, but there's no obvious need;
+ *      the value is already exported via the commpage. So keep it simple.
+ */
+static int
+make_brand_string SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+       __unused void *unused_arg1 = arg1;
+       __unused int unused_arg2 = arg2;
+
+       const char *impl;
+
+       switch (cpuid_info()->arm_info.arm_implementor) {
+       case CPU_VID_APPLE:
+               impl = "Apple";
+               break;
+       case CPU_VID_ARM:
+               impl = "ARM";
+               break;
+       default:
+               impl = "ARM architecture";
+               break;
+       }
+       char buf[80];
+       snprintf(buf, sizeof (buf), "%s processor", impl);
+       return (SYSCTL_OUT(req, buf, strlen(buf) + 1));
+}
+
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string,
+       CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
+       0, 0, make_brand_string, "A", "CPU brand string");
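
These nodes exist purely for source compatibility with x86 callers of machdep.cpu.*; on ARM they are synthesized from host_info() (core/thread counts) and the CPUID implementor field (brand string). A small, runnable user-space check using the standard sysctlbyname(3) interface, with the MIB names taken from the diff:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
        int cores = 0;
        size_t len = sizeof(cores);
        char brand[80];
        size_t blen = sizeof(brand);

        /* Served by arm_host_info() via offsetof(host_basic_info_data_t, physical_cpu). */
        if (sysctlbyname("machdep.cpu.core_count", &cores, &len, NULL, 0) == 0)
                printf("machdep.cpu.core_count   = %d\n", cores);

        /* Served by make_brand_string(): "<implementor> processor". */
        if (sysctlbyname("machdep.cpu.brand_string", brand, &blen, NULL, 0) == 0)
                printf("machdep.cpu.brand_string = %s\n", brand);

        return 0;
}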
index df9f22d09e5084cbc3960a0681f293761d96932a..2fa6a6580f1c9081ee5ae7bbc45e8eb35cfe1973 100644 (file)
@@ -10,6 +10,7 @@
 #include <kern/sched_prim.h>
 #include <mach/machine/thread_status.h>
 #include <mach/thread_act.h>
+#include <machine/machine_routines.h>
 #include <arm/thread.h>
 #include <arm/proc_reg.h>
 #include <pexpert/pexpert.h>
@@ -39,7 +40,7 @@ unix_syscall(struct arm_saved_state * regs, thread_t thread_act,
 
 static int     arm_get_syscall_args(uthread_t, struct arm_saved_state *, struct sysent *);
 static int     arm_get_u32_syscall_args(uthread_t, arm_saved_state32_t *, struct sysent *);
-static void    arm_prepare_u32_syscall_return(struct sysent *, arm_saved_state32_t *, uthread_t, int);
+static void    arm_prepare_u32_syscall_return(struct sysent *, arm_saved_state_t *, uthread_t, int);
 static void    arm_prepare_syscall_return(struct sysent *, struct arm_saved_state *, uthread_t, int);
 static int     arm_get_syscall_number(struct arm_saved_state *);
 static void    arm_trace_unix_syscall(int, struct arm_saved_state *);
@@ -274,16 +275,20 @@ unix_syscall_return(int error)
 }
 
 static void
-arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state32_t *regs, uthread_t uthread, int error)
+arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state_t *regs, uthread_t uthread, int error)
 {
+       assert(is_saved_state32(regs));
+
+       arm_saved_state32_t *ss32 = saved_state32(regs);
+
        if (error == ERESTART) {
-               regs->pc -= 4;
+               ss32->pc -= 4;
        } else if (error != EJUSTRETURN) {
                if (error) {
-                       regs->save_r0 = error;
-                       regs->save_r1 = 0;
+                       ss32->save_r0 = error;
+                       ss32->save_r1 = 0;
                        /* set the carry bit to execute cerror routine */
-                       regs->cpsr |= PSR_CF;
+                       ss32->cpsr |= PSR_CF;
                        unix_syscall_return_kprintf("error: setting carry to trigger cerror call\n");
                } else {        /* (not error) */
                        switch (callp->sy_return_type) {
@@ -294,12 +299,12 @@ arm_prepare_u32_syscall_return(struct sysent *callp, arm_saved_state32_t *regs,
                        case _SYSCALL_RET_SIZE_T:
                        case _SYSCALL_RET_SSIZE_T:
                        case _SYSCALL_RET_UINT64_T:
-                               regs->save_r0 = uthread->uu_rval[0];
-                               regs->save_r1 = uthread->uu_rval[1];
+                               ss32->save_r0 = uthread->uu_rval[0];
+                               ss32->save_r1 = uthread->uu_rval[1];
                                break;
                        case _SYSCALL_RET_NONE:
-                               regs->save_r0 = 0;
-                               regs->save_r1 = 0;
+                               ss32->save_r0 = 0;
+                               ss32->save_r1 = 0;
                                break;
                        default:
                                panic("unix_syscall: unknown return type");
@@ -436,7 +441,7 @@ arm_clear_syscall_error(struct arm_saved_state * state)
 }
 
 #elif defined(__arm64__)
-static void arm_prepare_u64_syscall_return(struct sysent *, arm_saved_state64_t *, uthread_t, int);
+static void arm_prepare_u64_syscall_return(struct sysent *, arm_saved_state_t *, uthread_t, int);
 static int arm_get_u64_syscall_args(uthread_t, arm_saved_state64_t *, struct sysent *);
 
 static int
@@ -460,6 +465,10 @@ arm_get_u64_syscall_args(uthread_t uthread, arm_saved_state64_t *regs, struct sy
 {
        int indirect_offset, regparams;
        
+#if CONFIG_REQUIRES_U32_MUNGING
+       sy_munge_t *mungerp;
+#endif
+
        indirect_offset = (regs->x[ARM64_SYSCALL_CODE_REG_NUM] == 0) ? 1 : 0;
        regparams = 9 - indirect_offset;
 
@@ -472,6 +481,30 @@ arm_get_u64_syscall_args(uthread_t uthread, arm_saved_state64_t *regs, struct sy
        }
 
        memcpy(&uthread->uu_arg[0], &regs->x[indirect_offset], callp->sy_narg * sizeof(uint64_t));
+
+#if CONFIG_REQUIRES_U32_MUNGING
+       /*
+        * The indirect system call interface is vararg based.  For armv7k, arm64_32,
+        * and arm64, this means we simply lay the values down on the stack, padded to
+        * a width multiple (4 bytes for armv7k and arm64_32, 8 bytes for arm64).
+        * The arm64(_32) stub for syscall will load this data into the registers and
+        * then trap.  This gives us register state that corresponds to what we would
+        * expect from an armv7 task, so in this particular case we need to munge the
+        * arguments.
+        *
+        * TODO: Is there a cleaner way to do this check?  What we're actually
+        * interested in is whether the task is arm64_32.  We don't appear to guarantee
+        * that uu_proc is populated here, which is why this currently uses the
+        * thread_t.
+        */
+       mungerp = callp->sy_arg_munge32;
+       assert(uthread->uu_thread);
+
+       if (indirect_offset && !ml_thread_is64bit(uthread->uu_thread)) {
+               (*mungerp)(&uthread->uu_arg[0]);
+       }
+#endif
+
        return 0;
 }
 /*
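
The new CONFIG_REQUIRES_U32_MUNGING block applies callp->sy_arg_munge32 when an arm64_32 thread makes an indirect syscall, because the register image then carries arguments packed in the 32-bit layout. An illustrative munger (not one of the real, per-signature xnu mungers such as munge_w, munge_ww, munge_wl) shows what that expansion does: packed 32-bit words are widened in place into 64-bit uu_arg[] slots, working backwards so nothing is clobbered before it is read:

#include <stdint.h>

/* Sketch of a two-word munger. */
static void
munge_ww_sketch(void *args)
{
        uint64_t *out = (uint64_t *)args;   /* 64-bit uu_arg[] slots   */
        uint32_t *in  = (uint32_t *)args;   /* packed 32-bit arguments */

        out[1] = in[1];     /* widen the second word first ...              */
        out[0] = in[0];     /* ... then the first, so in[1] is not clobbered. */
}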
@@ -550,45 +583,49 @@ static void
 arm_prepare_syscall_return(struct sysent *callp, struct arm_saved_state *state, uthread_t uthread, int error) 
 {
        if (is_saved_state32(state)) {
-               arm_prepare_u32_syscall_return(callp, saved_state32(state), uthread, error);
+               arm_prepare_u32_syscall_return(callp, state, uthread, error);
        } else {
-               arm_prepare_u64_syscall_return(callp, saved_state64(state), uthread, error);
+               arm_prepare_u64_syscall_return(callp, state, uthread, error);
        }
 }
 
 static void
-arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state64_t *regs, uthread_t uthread, int error)
+arm_prepare_u64_syscall_return(struct sysent *callp, arm_saved_state_t *regs, uthread_t uthread, int error)
 {
+       assert(is_saved_state64(regs));
+
+       arm_saved_state64_t *ss64 = saved_state64(regs);
+
        if (error == ERESTART) {
-               regs->pc -= 4;
+               ss64->pc -= 4;
        } else if (error != EJUSTRETURN) {
                if (error) {
-                       regs->x[0] = error;
-                       regs->x[1] = 0;
+                       ss64->x[0] = error;
+                       ss64->x[1] = 0;
                        /* 
                         * Set the carry bit to execute cerror routine.
                         * ARM64_TODO: should we have a separate definition?  
                         * The bits are the same.
                         */
-                       regs->cpsr |= PSR_CF; 
+                       ss64->cpsr |= PSR_CF;
                        unix_syscall_return_kprintf("error: setting carry to trigger cerror call\n");
                } else {        /* (not error) */
                        switch (callp->sy_return_type) {
                        case _SYSCALL_RET_INT_T:
-                               regs->x[0] = uthread->uu_rval[0];
-                               regs->x[1] = uthread->uu_rval[1];
+                               ss64->x[0] = uthread->uu_rval[0];
+                               ss64->x[1] = uthread->uu_rval[1];
                                break;
                        case _SYSCALL_RET_UINT_T:
-                               regs->x[0] = (u_int)uthread->uu_rval[0];
-                               regs->x[1] = (u_int)uthread->uu_rval[1];
+                               ss64->x[0] = (u_int)uthread->uu_rval[0];
+                               ss64->x[1] = (u_int)uthread->uu_rval[1];
                                break;
                        case _SYSCALL_RET_OFF_T:
                        case _SYSCALL_RET_ADDR_T:
                        case _SYSCALL_RET_SIZE_T:
                        case _SYSCALL_RET_SSIZE_T:
                        case _SYSCALL_RET_UINT64_T:
-                               regs->x[0] = *((uint64_t *)(&uthread->uu_rval[0]));
-                               regs->x[1] = 0;
+                               ss64->x[0] = *((uint64_t *)(&uthread->uu_rval[0]));
+                               ss64->x[1] = 0;
                                break;
                        case _SYSCALL_RET_NONE:
                                break;
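
Both return-preparation helpers now take the opaque arm_saved_state_t, assert its flavor, and convert at the point of use instead of trusting the caller to have picked the right 32- or 64-bit view. A compact sketch of that pattern; is_saved_state32() and saved_state32() are the existing accessors the diff itself uses:

/* Sketch of the flavor-checked accessor pattern. */
static void
set_return_values32(arm_saved_state_t *state, uint32_t r0, uint32_t r1)
{
        assert(is_saved_state32(state));        /* catch mixed-up flavors early */

        arm_saved_state32_t *ss32 = saved_state32(state);
        ss32->save_r0 = r0;
        ss32->save_r1 = r1;
}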
index 51c4d7e483d897c72eb0368bf1bc5f1aac6dc743..0bc0108160160b9c21733b050eb02dff0d5d4d0a 100644 (file)
@@ -30,8 +30,14 @@ extern struct arm_saved_state *get_user_regs(thread_t);
 extern user_addr_t thread_get_cthread_self(void);
 extern kern_return_t thread_getstatus(thread_t act, int flavor,
                thread_state_t tstate, mach_msg_type_number_t *count);
+extern kern_return_t thread_getstatus_to_user(thread_t act, int flavor,
+               thread_state_t tstate, mach_msg_type_number_t *count);
+extern kern_return_t machine_thread_state_convert_to_user(thread_t act, int flavor,
+               thread_state_t tstate, mach_msg_type_number_t *count);
 extern kern_return_t thread_setstatus(thread_t thread, int flavor,
                thread_state_t tstate, mach_msg_type_number_t count);
+extern kern_return_t thread_setstatus_from_user(thread_t thread, int flavor,
+               thread_state_t tstate, mach_msg_type_number_t count);
 /* XXX Put these someplace smarter... */
 typedef struct mcontext32 mcontext32_t; 
 typedef struct mcontext64 mcontext64_t;
@@ -50,18 +56,24 @@ typedef struct mcontext64 mcontext64_t;
 #endif
 
 static int
-sendsig_get_state32(thread_t th_act, mcontext32_t *mcp)
+sendsig_get_state32(thread_t th_act, arm_thread_state_t *ts, mcontext32_t *mcp)
 {
        void *tstate;
        mach_msg_type_number_t state_count;
 
-       assert(!proc_is64bit(current_proc()));
+       assert(!proc_is64bit_data(current_proc()));
 
-       tstate = (void *) &mcp->ss;
+       tstate = (void *) ts;
        state_count = ARM_THREAD_STATE_COUNT;
        if (thread_getstatus(th_act, ARM_THREAD_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
                return EINVAL;
 
+       mcp->ss = *ts;
+       tstate = (void *) &mcp->ss;
+       state_count = ARM_THREAD_STATE_COUNT;
+       if (machine_thread_state_convert_to_user(th_act, ARM_THREAD_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
+               return EINVAL;
+
        tstate = (void *) &mcp->es;
        state_count = ARM_EXCEPTION_STATE_COUNT;
        if (thread_getstatus(th_act, ARM_EXCEPTION_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
@@ -69,7 +81,7 @@ sendsig_get_state32(thread_t th_act, mcontext32_t *mcp)
 
        tstate = (void *) &mcp->fs;
        state_count = ARM_VFP_STATE_COUNT;
-       if (thread_getstatus(th_act, ARM_VFP_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
+       if (thread_getstatus_to_user(th_act, ARM_VFP_STATE, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
                return EINVAL;
 
        return 0;
@@ -77,25 +89,31 @@ sendsig_get_state32(thread_t th_act, mcontext32_t *mcp)
 
 #if defined(__arm64__)
 struct user_sigframe64 {
-       /* We can pass the last arg in a register for ARM64 */
+       /* We can pass the last two args in registers for ARM64 */
        user64_siginfo_t        sinfo;
        struct user_ucontext64  uctx;
        mcontext64_t            mctx;
 };
 
 static int
-sendsig_get_state64(thread_t th_act, mcontext64_t *mcp)
+sendsig_get_state64(thread_t th_act, arm_thread_state64_t *ts, mcontext64_t *mcp)
 {
        void *tstate;
        mach_msg_type_number_t state_count;
 
-       assert(proc_is64bit(current_proc()));
+       assert(proc_is64bit_data(current_proc()));
 
-       tstate = (void *) &mcp->ss;
+       tstate = (void *) ts;
        state_count = ARM_THREAD_STATE64_COUNT;
        if (thread_getstatus(th_act, ARM_THREAD_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
                return EINVAL;
 
+       mcp->ss = *ts;
+       tstate = (void *) &mcp->ss;
+       state_count = ARM_THREAD_STATE64_COUNT;
+       if (machine_thread_state_convert_to_user(th_act, ARM_THREAD_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
+               return EINVAL;
+
        tstate = (void *) &mcp->es;
        state_count = ARM_EXCEPTION_STATE64_COUNT;
        if (thread_getstatus(th_act, ARM_EXCEPTION_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
@@ -103,7 +121,7 @@ sendsig_get_state64(thread_t th_act, mcontext64_t *mcp)
 
        tstate = (void *) &mcp->ns;
        state_count = ARM_NEON_STATE64_COUNT;
-       if (thread_getstatus(th_act, ARM_NEON_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
+       if (thread_getstatus_to_user(th_act, ARM_NEON_STATE64, (thread_state_t) tstate, &state_count) != KERN_SUCCESS)
                return EINVAL;
 
        return 0;
@@ -127,15 +145,16 @@ sendsig_fill_uctx64(user_ucontext64_t *uctx, int oonstack, int mask, user64_addr
 static kern_return_t
 sendsig_set_thread_state64(arm_thread_state64_t *regs, 
                user64_addr_t catcher, int infostyle, int sig, user64_addr_t p_sinfo, 
-               user64_addr_t p_uctx, user64_addr_t trampact, user64_addr_t sp, thread_t th_act)
+               user64_addr_t p_uctx, user64_addr_t token, user64_addr_t trampact, user64_addr_t sp, thread_t th_act)
 {
-       assert(proc_is64bit(current_proc()));
+       assert(proc_is64bit_data(current_proc()));
 
        regs->x[0] = catcher;
        regs->x[1] = infostyle;
        regs->x[2] = sig;
        regs->x[3] = p_sinfo;
        regs->x[4] = p_uctx;
+       regs->x[5] = token;
        regs->pc = trampact;
        regs->cpsr = PSR64_USER64_DEFAULT;
        regs->sp = sp;
@@ -165,7 +184,7 @@ sendsig_set_thread_state32(arm_thread_state_t *regs,
                user32_addr_t trampact, user32_addr_t sp, thread_t th_act)
 {
 
-       assert(!proc_is64bit(current_proc()));
+       assert(!proc_is64bit_data(current_proc()));
 
        regs->r[0] = catcher;
        regs->r[1] = infostyle;
@@ -220,6 +239,7 @@ sendsig_do_dtrace(uthread_t ut, user_siginfo_t *sinfo, int sig, user_addr_t catc
        
 struct user_sigframe32 {
        user32_addr_t           puctx;
+       user32_addr_t           token;
        user32_siginfo_t        sinfo;
        struct user_ucontext32  uctx;
        mcontext32_t            mctx;
@@ -238,6 +258,16 @@ sendsig(
        __unused uint32_t code
 )
 {
+       union {
+               struct ts32 {
+                       arm_thread_state_t ss;
+               } ts32;
+#if defined(__arm64__)
+               struct ts64 {
+                       arm_thread_state64_t ss;
+               } ts64;
+#endif
+       } ts;
        union { 
                struct user_sigframe32 uf32;
 #if defined(__arm64__)
@@ -252,10 +282,13 @@ sendsig(
        thread_t        th_act;
        struct uthread *ut;
        user_size_t     stack_size = 0;
+       user_addr_t     p_uctx, token_uctx;
+       kern_return_t   kr;
 
        th_act = current_thread();
        ut = get_bsdthread_info(th_act);
 
+       bzero(&ts, sizeof(ts));
        bzero(&user_frame, sizeof(user_frame));
 
        if (p->p_sigacts->ps_siginfo & sigmask(sig))
@@ -269,16 +302,16 @@ sendsig(
        /*
         * Get sundry thread state.
         */
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
 #ifdef __arm64__
-               if (sendsig_get_state64(th_act, &user_frame.uf64.mctx) != 0) {
+               if (sendsig_get_state64(th_act, &ts.ts64.ss, &user_frame.uf64.mctx) != 0) {
                        goto bad2;
                }
 #else
        panic("Shouldn't have 64-bit thread states on a 32-bit kernel.");
 #endif
        } else {
-               if (sendsig_get_state32(th_act, &user_frame.uf32.mctx) != 0) {
+               if (sendsig_get_state32(th_act, &ts.ts32.ss, &user_frame.uf32.mctx) != 0) {
                        goto bad2;
                }
        }
@@ -297,15 +330,15 @@ sendsig(
                 * Get stack pointer, and allocate enough space
                 * for signal handler data.
                 */
-               if (proc_is64bit(p)) {
+               if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
-                       sp = CAST_USER_ADDR_T(user_frame.uf64.mctx.ss.sp);
+                       sp = CAST_USER_ADDR_T(ts.ts64.ss.sp);
                        sp = (sp - sizeof(user_frame.uf64) - C_64_REDZONE_LEN) & ~0xf; /* Make sure to align to 16 bytes and respect red zone */
 #else
                        panic("Shouldn't have 64-bit thread states on a 32-bit kernel.");
 #endif
                } else {
-                       sp = CAST_USER_ADDR_T(user_frame.uf32.mctx.ss.sp);
+                       sp = CAST_USER_ADDR_T(ts.ts32.ss.sp);
                        sp -= sizeof(user_frame.uf32);
 #if defined(__arm__) && (__BIGGEST_ALIGNMENT__ > 4)
                        sp &= ~0xf; /* Make sure to align to 16 bytes for armv7k */
@@ -318,7 +351,7 @@ sendsig(
        /*
         * Fill in ucontext (points to mcontext, i.e. thread states).
         */
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
                sendsig_fill_uctx64(&user_frame.uf64.uctx, oonstack, mask, sp, (user64_size_t)stack_size,
                                (user64_addr_t)&((struct user_sigframe64*)sp)->mctx);
@@ -336,16 +369,16 @@ sendsig(
        bzero((caddr_t) & sinfo, sizeof(sinfo));
        sinfo.si_signo = sig;
 
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
-               sinfo.si_addr = user_frame.uf64.mctx.ss.pc;
-               sinfo.pad[0] = user_frame.uf64.mctx.ss.sp;
+               sinfo.si_addr = ts.ts64.ss.pc;
+               sinfo.pad[0] = ts.ts64.ss.sp;
 #else
                panic("Shouldn't have 64-bit thread states on a 32-bit kernel.");
 #endif
        } else {
-               sinfo.si_addr = user_frame.uf32.mctx.ss.pc;
-               sinfo.pad[0] = user_frame.uf32.mctx.ss.sp;
+               sinfo.si_addr = ts.ts32.ss.pc;
+               sinfo.pad[0] = ts.ts32.ss.sp;
        }
 
        switch (sig) {
@@ -368,7 +401,7 @@ sendsig(
                break;
 
        case SIGBUS:
-               if (proc_is64bit(p)) {
+               if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
                        sinfo.si_addr = user_frame.uf64.mctx.es.far;
 #else
@@ -382,7 +415,7 @@ sendsig(
                break;
 
        case SIGSEGV:
-               if (proc_is64bit(p)) {
+               if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
                        sinfo.si_addr = user_frame.uf64.mctx.es.far;
 #else
@@ -460,40 +493,64 @@ sendsig(
        /* 
         * Copy signal-handling frame out to user space, set thread state.
         */
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
+               user64_addr_t token;
+
                /*
                 * mctx filled in when we get state.  uctx filled in by 
                 * sendsig_fill_uctx64(). We fill in the sinfo now.
                 */
                siginfo_user_to_user64(&sinfo, &user_frame.uf64.sinfo);
 
+               p_uctx = (user_addr_t)&((struct user_sigframe64*)sp)->uctx;
+               /*
+                * Generate the validation token for sigreturn
+                */
+               token_uctx = p_uctx;
+               kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx);
+               assert(kr == KERN_SUCCESS);
+               token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token;
+
                if (copyout(&user_frame.uf64, sp, sizeof(user_frame.uf64)) != 0) {
                        goto bad; 
                } 
 
-               if (sendsig_set_thread_state64(&user_frame.uf64.mctx.ss,
+               if (sendsig_set_thread_state64(&ts.ts64.ss,
                        catcher, infostyle, sig, (user64_addr_t)&((struct user_sigframe64*)sp)->sinfo,
-                       (user64_addr_t)&((struct user_sigframe64*)sp)->uctx, trampact, sp, th_act) != KERN_SUCCESS)
+                       (user64_addr_t)p_uctx, token, trampact, sp, th_act) != KERN_SUCCESS)
                        goto bad;
 
 #else
        panic("Shouldn't have 64-bit thread states on a 32-bit kernel.");
 #endif
        } else {
+               user32_addr_t token;
+
                /*
                 * mctx filled in when we get state.  uctx filled in by 
-                * sendsig_fill_uctx32(). We fill in the sinfo and *pointer* 
-                * to uctx now.
+                * sendsig_fill_uctx32(). We fill in the sinfo, *pointer*
+                * to uctx and token now.
                 */
                siginfo_user_to_user32(&sinfo, &user_frame.uf32.sinfo);
-               user_frame.uf32.puctx = (user32_addr_t) &((struct user_sigframe32*)sp)->uctx;
+
+               p_uctx = (user_addr_t)&((struct user_sigframe32*)sp)->uctx;
+               /*
+                * Generate the validation token for sigreturn
+                */
+               token_uctx = (user_addr_t)p_uctx;
+               kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx);
+               assert(kr == KERN_SUCCESS);
+               token = (user32_addr_t)token_uctx ^ (user32_addr_t)ps->ps_sigreturn_token;
+
+               user_frame.uf32.puctx = (user32_addr_t)p_uctx;
+               user_frame.uf32.token = token;
 
                if (copyout(&user_frame.uf32, sp, sizeof(user_frame.uf32)) != 0) {
                        goto bad; 
                } 
 
-               if (sendsig_set_thread_state32(&user_frame.uf32.mctx.ss,
+               if (sendsig_set_thread_state32(&ts.ts32.ss,
                        CAST_DOWN_EXPLICIT(user32_addr_t, catcher), infostyle, sig, (user32_addr_t)&((struct user_sigframe32*)sp)->sinfo,
                        CAST_DOWN_EXPLICIT(user32_addr_t, trampact), CAST_DOWN_EXPLICIT(user32_addr_t, sp), th_act) != KERN_SUCCESS)
                        goto bad;
@@ -530,7 +587,7 @@ sigreturn_copyin_ctx32(struct user_ucontext32 *uctx, mcontext32_t *mctx, user_ad
 {
        int error;
 
-       assert(!proc_is64bit(current_proc()));
+       assert(!proc_is64bit_data(current_proc()));
 
        error = copyin(uctx_addr, uctx, sizeof(*uctx));
        if (error) {
@@ -557,7 +614,7 @@ sigreturn_copyin_ctx32(struct user_ucontext32 *uctx, mcontext32_t *mctx, user_ad
 static int
 sigreturn_set_state32(thread_t th_act, mcontext32_t *mctx) 
 {
-       assert(!proc_is64bit(current_proc()));
+       assert(!proc_is64bit_data(current_proc()));
 
        /* validate the thread state, set/reset appropriate mode bits in cpsr */
 #if defined(__arm__)
@@ -568,10 +625,10 @@ sigreturn_set_state32(thread_t th_act, mcontext32_t *mctx)
 #error Unknown architecture.
 #endif
 
-       if (thread_setstatus(th_act, ARM_THREAD_STATE, (void *)&mctx->ss, ARM_THREAD_STATE_COUNT) != KERN_SUCCESS) {
+       if (thread_setstatus_from_user(th_act, ARM_THREAD_STATE, (void *)&mctx->ss, ARM_THREAD_STATE_COUNT) != KERN_SUCCESS) {
                return (EINVAL);
        }
-       if (thread_setstatus(th_act, ARM_VFP_STATE, (void *)&mctx->fs, ARM_VFP_STATE_COUNT) != KERN_SUCCESS) {
+       if (thread_setstatus_from_user(th_act, ARM_VFP_STATE, (void *)&mctx->fs, ARM_VFP_STATE_COUNT) != KERN_SUCCESS) {
                return (EINVAL);
        }
 
@@ -584,7 +641,7 @@ sigreturn_copyin_ctx64(struct user_ucontext64 *uctx, mcontext64_t *mctx, user_ad
 {
        int error;
 
-       assert(proc_is64bit(current_proc()));
+       assert(proc_is64bit_data(current_proc()));
 
        error = copyin(uctx_addr, uctx, sizeof(*uctx));
        if (error) {
@@ -611,15 +668,15 @@ sigreturn_copyin_ctx64(struct user_ucontext64 *uctx, mcontext64_t *mctx, user_ad
 static int
 sigreturn_set_state64(thread_t th_act, mcontext64_t *mctx) 
 {
-       assert(proc_is64bit(current_proc()));
+       assert(proc_is64bit_data(current_proc()));
 
        /* validate the thread state, set/reset appropriate mode bits in cpsr */
        mctx->ss.cpsr = (mctx->ss.cpsr & ~PSR64_MODE_MASK) | PSR64_USER64_DEFAULT;
 
-       if (thread_setstatus(th_act, ARM_THREAD_STATE64, (void *)&mctx->ss, ARM_THREAD_STATE64_COUNT) != KERN_SUCCESS) {
+       if (thread_setstatus_from_user(th_act, ARM_THREAD_STATE64, (void *)&mctx->ss, ARM_THREAD_STATE64_COUNT) != KERN_SUCCESS) {
                return (EINVAL);
        }
-       if (thread_setstatus(th_act, ARM_NEON_STATE64, (void *)&mctx->ns, ARM_NEON_STATE64_COUNT) != KERN_SUCCESS) {
+       if (thread_setstatus_from_user(th_act, ARM_NEON_STATE64, (void *)&mctx->ns, ARM_NEON_STATE64_COUNT) != KERN_SUCCESS) {
                return (EINVAL);
        }
 
@@ -648,14 +705,18 @@ sigreturn(
 #endif
        } mctx;
 
+       struct sigacts *ps = p->p_sigacts;
        int             error, sigmask = 0, onstack = 0;
        thread_t        th_act;
        struct uthread *ut;
+       uint32_t        sigreturn_validation;
+       user_addr_t     token_uctx;
+       kern_return_t   kr;
 
        th_act = current_thread();
        ut = (struct uthread *) get_bsdthread_info(th_act);
 
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
                error = sigreturn_copyin_ctx64(&uctx.uc64, &mctx.mc64, uap->uctx);
                if (error != 0) {
@@ -686,18 +747,54 @@ sigreturn(
        if (ut->uu_siglist & ~ut->uu_sigmask)
                signal_setast(current_thread());
 
-       if (proc_is64bit(p)) {
+       sigreturn_validation = atomic_load_explicit(
+                       &ps->ps_sigreturn_validation, memory_order_relaxed);
+       token_uctx = uap->uctx;
+       kr = machine_thread_siguctx_pointer_convert_to_user(th_act, &token_uctx);
+       assert(kr == KERN_SUCCESS);
+
+       if (proc_is64bit_data(p)) {
 #if defined(__arm64__)
+               user64_addr_t token;
+               token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token;
+               if ((user64_addr_t)uap->token != token) {
+#if DEVELOPMENT || DEBUG
+                       printf("process %s[%d] sigreturn token mismatch: received 0x%llx expected 0x%llx\n",
+                                       p->p_comm, p->p_pid, (user64_addr_t)uap->token, token);
+#endif /* DEVELOPMENT || DEBUG */
+                       if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) {
+                               return EINVAL;
+                       }
+               }
                error = sigreturn_set_state64(th_act, &mctx.mc64);
                if (error != 0) {
+#if DEVELOPMENT || DEBUG
+               printf("process %s[%d] sigreturn set_state64 error %d\n",
+                               p->p_comm, p->p_pid, error);
+#endif /* DEVELOPMENT || DEBUG */
                        return error;
                }
 #else
                panic("Shouldn't have 64-bit thread states on a 32-bit kernel.");
 #endif
        } else {
+               user32_addr_t token;
+               token = (user32_addr_t)token_uctx ^ (user32_addr_t)ps->ps_sigreturn_token;
+               if ((user32_addr_t)uap->token != token) {
+#if DEVELOPMENT || DEBUG
+                       printf("process %s[%d] sigreturn token mismatch: received 0x%x expected 0x%x\n",
+                                       p->p_comm, p->p_pid, (user32_addr_t)uap->token, token);
+#endif /* DEVELOPMENT || DEBUG */
+                       if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) {
+                               return EINVAL;
+                       }
+               }
                error = sigreturn_set_state32(th_act, &mctx.mc32);
                if (error != 0) {
+#if DEVELOPMENT || DEBUG
+               printf("process %s[%d] sigreturn sigreturn_set_state32 error %d\n",
+                               p->p_comm, p->p_pid, error);
+#endif /* DEVELOPMENT || DEBUG */
                        return error;
                }
        }
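
The token that sendsig() now pushes (x5 for 64-bit frames, uf32.token for 32-bit ones) is the user-visible ucontext pointer XORed with the per-process ps_sigreturn_token secret; sigreturn() recomputes it and rejects a mismatch unless validation has been disabled for the process. A simplified sketch with flat 64-bit types:

#include <stdint.h>

/* sendsig(): token stored alongside the signal frame. */
static uint64_t
make_sigreturn_token(uint64_t uctx_user_addr, uint64_t ps_sigreturn_token)
{
        return uctx_user_addr ^ ps_sigreturn_token;
}

/* sigreturn(): accept only if the token round-trips, unless validation
 * has been explicitly disabled (PS_SIGRETURN_VALIDATION_DISABLED). */
static int
sigreturn_token_ok(uint64_t presented, uint64_t uctx_user_addr,
    uint64_t ps_sigreturn_token, int validation_disabled)
{
        if (presented == (uctx_user_addr ^ ps_sigreturn_token))
                return 1;
        return validation_disabled ? 1 : 0;
}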
@@ -706,32 +803,22 @@ sigreturn(
 }
 
 /*
- * machine_exception() performs MD translation
- * of a mach exception to a unix signal and code.
+ * machine_exception() performs machine-dependent translation
+ * of a mach exception to a unix signal.
  */
-
-boolean_t
-machine_exception(
-                 int exception,
-                 mach_exception_subcode_t code,
-                 __unused mach_exception_subcode_t subcode,
-                 int *unix_signal,
-                 mach_exception_subcode_t * unix_code
-)
+int
+machine_exception(int                           exception,
+         __unused mach_exception_code_t         code,
+         __unused mach_exception_subcode_t      subcode)
 {
        switch (exception) {
-       case EXC_BAD_INSTRUCTION:
-               *unix_signal = SIGILL;
-               *unix_code = code;
-               break;
-
-       case EXC_ARITHMETIC:
-               *unix_signal = SIGFPE;
-               *unix_code = code;
-               break;
+               case EXC_BAD_INSTRUCTION:
+                       return SIGILL;
 
-       default:
-               return (FALSE);
+               case EXC_ARITHMETIC:
+                       return SIGFPE;
        }
-       return (TRUE);
+
+       return 0;
 }
+
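
machine_exception() now returns the UNIX signal directly, or 0 when there is no machine-dependent translation, instead of filling out-parameters and returning a boolean. A sketch of a caller adapted to the new contract; the fallback helper named here is hypothetical, not an xnu function:

static int
exception_to_unix_signal(int exception, mach_exception_code_t code,
    mach_exception_subcode_t subcode)
{
        int sig = machine_exception(exception, code, subcode);
        if (sig == 0)
                sig = generic_exception_translation(exception, code);  /* hypothetical */
        return sig;
}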
index 00a00c6678942aed9d9a94a4c4d493324025317a..86d892aa35bc27a4742ad16ec9fa1d3d7e445ddc 100644 (file)
  * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
  * authentic; it only cares about 3 fields.
  */
+#if defined(__LP64__)
 #define        M_NEXT  0
 #define        M_DATA  16      // 8-byte address, would be aligned to 8-byte boundary
 #define        M_LEN   24
+#else
+#define        M_NEXT  0
+#define        M_DATA  8
+#define        M_LEN   12
+#endif
 
        .globl  _os_cpu_in_cksum_mbuf
        .text
@@ -98,6 +104,14 @@ _os_cpu_in_cksum_mbuf:
        #define Wmlen                   w6
        #define t       x7
        #define data    x8
+#if defined(__LP64__)
+       #define ptr_m           x0
+       #define ptr_data        x8
+#else
+       #define ptr_m           w0
+       #define ptr_data        w8
+#endif
+
 
        mov     needs_swap, #0          // needs_swap = FALSE;
        mov     started_on_odd, #0      // started_on_odd = FALSE;
@@ -128,7 +142,7 @@ _os_cpu_in_cksum_mbuf:
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len;
        cmp     mlen, off
        b.le    1f
-       ldr     data, [m, #M_DATA]      // mtod(m, uint8_t *)
+       ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)
        sub     mlen, mlen, off         // mlen -= off;
        add     data, data, off         // data = mtod(m, uint8_t *) + off;
        b       L_post_initial_offset
@@ -138,7 +152,7 @@ _os_cpu_in_cksum_mbuf:
        mov     x0, x3
        ret     lr
 2:
-       ldr     m, [m, #M_NEXT]
+       ldr     ptr_m, [m, #M_NEXT]
        b       0b
 
 L_loop:        // for (; len > 0; m = m->m_next) {
@@ -152,7 +166,7 @@ L_loop:     // for (; len > 0; m = m->m_next) {
  */
        cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len;
-       ldr     data, [m, #M_DATA]      // mtod(m, uint8_t *)
+       ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)
 
 L_post_initial_offset:
 /*
@@ -374,7 +388,7 @@ L0_bytes:
 
 L_continue:
        cmp     len, #0
-       ldr     m, [m, #M_NEXT]                 // m = m->m_next
+       ldr     ptr_m, [m, #M_NEXT]                     // m = m->m_next
        b.gt    L_loop
 
 /*
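
The assembly only ever touches m_next, m_data and m_len, so the new #else branch just supplies the offsets those fields land at when pointers are 4 bytes wide (arm64_32) instead of 8. A sketch of the "mbuf-like" view that yields exactly the M_NEXT/M_DATA/M_LEN constants above on both ABIs; the field names are illustrative and the real mbuf header carries more fields after m_len:

#include <stddef.h>
#include <stdint.h>

struct cksum_mbuf_view {
        struct cksum_mbuf_view *m_next;     /* M_NEXT: 0 on both ABIs          */
        void                   *m_nextpkt;  /* skipped by the checksum code    */
        void                   *m_data;     /* M_DATA: 16 (LP64) / 8 (ILP32)   */
        int32_t                 m_len;      /* M_LEN : 24 (LP64) / 12 (ILP32)  */
};

_Static_assert(offsetof(struct cksum_mbuf_view, m_data) ==
    (sizeof(void *) == 8 ? 16 : 8), "M_DATA offset");
_Static_assert(offsetof(struct cksum_mbuf_view, m_len) ==
    (sizeof(void *) == 8 ? 24 : 12), "M_LEN offset");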
index a00f8d0ebd0d1b279f6ada6878ba2653211a38c9..7195d0d72218e6c6dc8e750498fb8dbfa910ae4e 100644 (file)
@@ -1124,7 +1124,8 @@ struct arm64_decode_entry arm64_decode_table[] = {
                { .mask = FASTTRAP_ARM64_OP_MASK_ADRP,          .value = FASTTRAP_ARM64_OP_VALUE_ADRP,          .type = FASTTRAP_T_ARM64_ADRP },
                { .mask = FASTTRAP_ARM64_OP_MASK_ADR,           .value = FASTTRAP_ARM64_OP_VALUE_ADR,           .type = FASTTRAP_T_ARM64_ADR },
                { .mask = FASTTRAP_ARM64_OP_MASK_PRFM,          .value = FASTTRAP_ARM64_OP_VALUE_PRFM,          .type = FASTTRAP_T_ARM64_PRFM },
-               { .mask = FASTTRAP_ARM64_OP_MASK_EXCL_MEM,      .value = FASTTRAP_ARM64_OP_VALUE_EXCL_MEM,      .type = FASTTRAP_T_ARM64_EXCLUSIVE_MEM }}; 
+               { .mask = FASTTRAP_ARM64_OP_MASK_EXCL_MEM,      .value = FASTTRAP_ARM64_OP_VALUE_EXCL_MEM,      .type = FASTTRAP_T_ARM64_EXCLUSIVE_MEM },
+               { .mask = FASTTRAP_ARM64_OP_MASK_RETAB,         .value = FASTTRAP_ARM64_OP_VALUE_RETAB,         .type = FASTTRAP_T_ARM64_RETAB }};
 
 #define NUM_DECODE_ENTRIES (sizeof(arm64_decode_table) / sizeof(struct arm64_decode_entry))
 
index 3f81cb706dd4c313e4ce5bde5a25b947505a8517..bd2716b95ce93ed13f6ded63f21d52c93b23b3ba 100644 (file)
 
 #define MACH__POSIX_C_SOURCE_PRIVATE 1 /* pulls in suitable savearea from
                                         * mach/ppc/thread_status.h */
+#include <arm/caches_internal.h>
 #include <arm/proc_reg.h>
 
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 
+#if __has_include(<ptrauth.h>)
+#include <ptrauth.h>
+#endif
 #include <stdarg.h>
 #include <string.h>
 #include <sys/malloc.h>
@@ -194,6 +198,11 @@ dtrace_getreg(struct regs * savearea, uint_t reg)
 {
        struct arm_saved_state *regs = (struct arm_saved_state *) savearea;
 
+       if (regs == NULL) {
+               DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+               return (0);
+       }
+
        if (is_saved_state32(regs)) {
                // Fix special registers if user is 32 bits
                switch (reg) {
@@ -231,7 +240,7 @@ dtrace_getustack_common(uint64_t * pcstack, int pcstack_limit, user_addr_t pc,
                        user_addr_t sp)
 {
        int ret = 0;
-       boolean_t is64bit = proc_is64bit(current_proc());
+       boolean_t is64bit = proc_is64bit_data(current_proc());
        
        ASSERT(pcstack == NULL || pcstack_limit > 0);
 
@@ -359,7 +368,7 @@ void
 dtrace_getufpstack(uint64_t * pcstack, uint64_t * fpstack, int pcstack_limit)
 {
        thread_t        thread = current_thread();
-       boolean_t       is64bit = proc_is64bit(current_proc());
+       boolean_t       is64bit = proc_is64bit_data(current_proc());
        savearea_t      *regs;
        user_addr_t     pc, sp;
        volatile        uint16_t  *flags = (volatile uint16_t *) & cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
@@ -608,7 +617,11 @@ dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, dtrace_vstate_t *vs
 
        for (i = 1; i <= aframes; ++i) {
                fp = fp->backchain;
+#if __has_feature(ptrauth_returns)
+               pc = (uintptr_t)ptrauth_strip((void*)fp->retaddr, ptrauth_key_return_address);
+#else
                pc = fp->retaddr;
+#endif
 
                if (dtrace_invop_callsite_pre != NULL
                    && pc >  (uintptr_t) dtrace_invop_callsite_pre
@@ -628,7 +641,7 @@ dtrace_getarg(int arg, int aframes, dtrace_mstate_t *mstate, dtrace_vstate_t *vs
                        } else {
                                /* the argument will be found in the stack */
                                fp = (struct frame*) saved_state->sp;
-                               stack = (uintptr_t*) &fp[1]; 
+                               stack = (uintptr_t*) &fp[1];
                                arg -= (inreg + 1);
                        }
 
@@ -694,3 +707,12 @@ dtrace_toxic_ranges(void (*func)(uintptr_t base, uintptr_t limit))
                        func(VM_MAX_KERNEL_ADDRESS + 1, ~(uintptr_t)0);
 }
 
+void dtrace_flush_caches(void)
+{
+       /* TODO There were some problems with flushing just the cache line that had been modified.
+        * For now, we'll flush the entire cache, until we figure out how to flush just the patched block.
+        */
+       FlushPoU_Dcache();
+       InvalidatePoU_Icache();
+}
+
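
With pointer authentication enabled (__has_feature(ptrauth_returns)), the return address read out of a stack frame still carries its signature in the upper bits, so dtrace_getarg() must strip it before comparing against the dtrace_invop_callsite bounds. A small sketch of the same idiom using the <ptrauth.h> intrinsics the diff includes:

#if __has_include(<ptrauth.h>)
#include <ptrauth.h>
#endif
#ifndef __has_feature
#define __has_feature(x) 0
#endif
#include <stdint.h>

/* Return the raw (unsigned) form of a saved return address. */
static uintptr_t
strip_return_address(uintptr_t saved_lr)
{
#if __has_feature(ptrauth_returns)
        /* Drop the PAC bits; use the result only for comparisons and
         * arithmetic, never for an authenticated return. */
        return (uintptr_t)ptrauth_strip((void *)saved_lr, ptrauth_key_return_address);
#else
        return saved_lr;
#endif
}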
index c0af6a9e2982f5e0d2fe22913fede970a15aeebf..8643cbd929f0f19404efd8a136d139bf64699564 100644 (file)
@@ -36,7 +36,6 @@
 #define _KERNEL                        /* Solaris vs. Darwin */
 #endif
 #endif
-
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
 
 #include <pexpert/pexpert.h>
 
+#if __has_include(<ptrauth.h>)
+#include <ptrauth.h>
+#endif
+
+
 extern dtrace_id_t dtrace_probeid_error;
 
 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
@@ -117,17 +121,6 @@ extern int dtrace_decode_thumb(uint32_t instr);
 #define ARM_LDR_UF (1 << 23)
 #define ARM_LDR_BF (1 << 22)
 
-static void
-flush_caches(void)
-{
-       /* TODO There were some problems with flushing just the cache line that had been modified.
-        * For now, we'll flush the entire cache, until we figure out how to flush just the patched block.
-        */
-       FlushPoU_Dcache();
-       InvalidatePoU_Icache();
-}
-
-
 static int fasttrap_tracepoint_init32 (proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t);
 static int fasttrap_tracepoint_init64 (proc_t *, fasttrap_tracepoint_t *, user_addr_t, fasttrap_probe_type_t);
 
@@ -135,7 +128,7 @@ int
 fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp,
                         user_addr_t pc, fasttrap_probe_type_t type)
 {
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
                return fasttrap_tracepoint_init64(p, tp, pc, type);
        } else {
                return fasttrap_tracepoint_init32(p, tp, pc, type);
@@ -250,6 +243,8 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp,
        if (tp->ftt_fntype != FASTTRAP_FN_DONE_INIT) {
                switch(tp->ftt_fntype) {
                case FASTTRAP_FN_UNKNOWN:
+               case FASTTRAP_FN_ARM64:
+               case FASTTRAP_FN_ARM64_32:
                        /*
                         * On arm64 there is no distinction between
                         * arm vs. thumb mode instruction types.
@@ -299,90 +294,6 @@ fasttrap_tracepoint_init64(proc_t *p, fasttrap_tracepoint_t *tp,
        return (0);
 }
 
-// These are not exported from vm_map.h.
-extern kern_return_t vm_map_write_user(vm_map_t map, void *src_p, vm_map_address_t dst_addr, vm_size_t size);
-
-/* Patches the instructions. Almost like uwrite, but need special instructions on ARM to flush the caches. */
-static
-int patchInst(proc_t *p, void *buf, user_size_t len, user_addr_t a)
-{
-       kern_return_t ret;
-
-       ASSERT(p != NULL);
-       ASSERT(p->task != NULL);
-
-       task_t task = p->task;
-
-       /*
-        * Grab a reference to the task vm_map_t to make sure
-        * the map isn't pulled out from under us.
-        *
-        * Because the proc_lock is not held at all times on all code
-        * paths leading here, it is possible for the proc to have
-        * exited. If the map is null, fail.
-        */
-       vm_map_t map = get_task_map_reference(task);
-       if (map) {
-               /* Find the memory permissions. */
-               uint32_t nestingDepth=999999;
-               vm_region_submap_short_info_data_64_t info;
-               mach_msg_type_number_t count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
-               mach_vm_address_t address = (mach_vm_address_t)a;
-               mach_vm_size_t sizeOfRegion = (mach_vm_size_t)len;
-
-               ret = mach_vm_region_recurse(map, &address, &sizeOfRegion, &nestingDepth, (vm_region_recurse_info_t)&info, &count);
-               if (ret != KERN_SUCCESS)
-                       goto done;
-
-               vm_prot_t reprotect;
-
-               if (!(info.protection & VM_PROT_WRITE)) {
-                       /* Save the original protection values for restoration later */
-                       reprotect = info.protection;
-                       if (info.max_protection & VM_PROT_WRITE) {
-                               /* The memory is not currently writable, but can be made writable. */
-                               /* Making it both writable and executable at the same time causes warning on embedded */
-                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE);
-                       } else {
-                               /*
-                                * The memory is not currently writable, and cannot be made writable. We need to COW this memory.
-                                *
-                                * Strange, we can't just say "reprotect | VM_PROT_COPY", that fails.
-                                */
-                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, VM_PROT_COPY | VM_PROT_READ | VM_PROT_WRITE);
-                       }
-
-                       if (ret != KERN_SUCCESS)
-                               goto done;
-
-               } else {
-                       /* The memory was already writable. */
-                       reprotect = VM_PROT_NONE;
-               }
-
-               ret = vm_map_write_user( map,
-                                        buf,
-                                        (vm_map_address_t)a,
-                                        (vm_size_t)len);
-
-               flush_caches();
-
-               if (ret != KERN_SUCCESS)
-                       goto done;
-
-               if (reprotect != VM_PROT_NONE) {
-                       ASSERT(reprotect & VM_PROT_EXECUTE);
-                       ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect);
-               }
-
-done:
-               vm_map_deallocate(map);
-       } else
-               ret = KERN_TERMINATED;
-
-       return (int)ret;
-}
-
 int
 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
 {
@@ -390,7 +301,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
        uint32_t instr;
        int size;
 
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
                size = 4;
                instr = FASTTRAP_ARM64_INSTR;
        }
@@ -403,7 +314,7 @@ fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
                }
        }
 
-       if (patchInst(p, &instr, size, tp->ftt_pc) != 0)
+       if (uwrite(p, &instr, size, tp->ftt_pc) != 0)
                return (-1);
 
        tp->ftt_installed = 1;
@@ -418,7 +329,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
        uint32_t instr;
        int size;
 
-       if (proc_is64bit(p)) {
+       if (proc_is64bit_data(p)) {
                /*
                 * Distinguish between read or write failures and a changed
                 * instruction.
@@ -447,7 +358,7 @@ fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
                }
        }
 
-       if (patchInst(p, &tp->ftt_instr, size, tp->ftt_pc) != 0)
+       if (uwrite(p, &tp->ftt_instr, size, tp->ftt_pc) != 0)
                return (-1);
 
 end:
@@ -501,7 +412,7 @@ fasttrap_return_common(proc_t *p, arm_saved_state_t *regs, user_addr_t pc, user_
                }
                else {
                        /* ARM64_TODO  - check for FASTTRAP_T_RET */
-                       if ((tp->ftt_type != FASTTRAP_T_ARM64_RET) &&
+                       if ((tp->ftt_type != FASTTRAP_T_ARM64_RET && tp->ftt_type != FASTTRAP_T_ARM64_RETAB) &&
                                new_pc - probe->ftp_faddr < probe->ftp_fsize)
                                continue;
                }
@@ -1214,7 +1125,7 @@ fasttrap_pid_probe_handle_patched_instr32(arm_saved_state_t *state, fasttrap_tra
                                SET32(scratch+i, FASTTRAP_ARM32_RET_INSTR); i += 4;
                        }
 
-                       if (patchInst(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) {
+                       if (uwrite(p, scratch, i, uthread->t_dtrace_scratch->write_addr) != KERN_SUCCESS) {
                                fasttrap_sigtrap(p, uthread, pc);
                                new_pc = pc;
                                break;
@@ -1280,7 +1191,7 @@ fasttrap_pid_probe_thunk_instr64(arm_saved_state_t *state, fasttrap_tracepoint_t
                return;
        }
 
-       if (patchInst(p, local_scratch, (num_instrs + 1) * sizeof(uint32_t), user_scratch_area) != KERN_SUCCESS) {
+       if (uwrite(p, local_scratch, (num_instrs + 1) * sizeof(uint32_t), user_scratch_area) != KERN_SUCCESS) {
                fasttrap_sigtrap(p, uthread, pc);
                *pc_out = pc;
                return;
@@ -1292,6 +1203,7 @@ fasttrap_pid_probe_thunk_instr64(arm_saved_state_t *state, fasttrap_tracepoint_t
        /* We may or may not be about to run a return probe (but we wouldn't thunk ret lr)*/
        uthread->t_dtrace_ret = (tp->ftt_retids != NULL);
        assert(tp->ftt_type != FASTTRAP_T_ARM64_RET);
+       assert(tp->ftt_type != FASTTRAP_T_ARM64_RETAB);
 
        /* Set address of instruction we've patched */
        uthread->t_dtrace_pc = pc;
@@ -1729,10 +1641,22 @@ fasttrap_pid_probe_handle_patched_instr64(arm_saved_state_t *state, fasttrap_tra
 
                        /* Set PC to register value (xzr, not sp) */
                        new_pc = get_saved_state64_regno(regs64, regno, 1);
+
                        *was_simulated = 1;
                        break;
                }
+               case FASTTRAP_T_ARM64_RETAB:
+               {
+                       /* Set PC to the return address in LR (x30), stripping its PAC signature */
+                       new_pc = get_saved_state64_regno(regs64, 30, 1);
+#if __has_feature(ptrauth_calls)
+                       new_pc = (user_addr_t) ptrauth_strip((void *)new_pc, ptrauth_key_return_address);
+#endif

+                       *was_simulated = 1;
+                       break;
+               }
                /*
                 * End branches.
                 */
index c2f348f9ac33ef2e8d15e0fcbfb7511ecef92f3e..3364a066e478fde9de9e80fa93a0c6ab5dda93fd 100644 (file)
 
 #include <sys/dtrace_glue.h>
 
+#if __has_include(<ptrauth.h>)
+#include <ptrauth.h>
+#endif
+
 #define DTRACE_INVOP_PUSH_FRAME 11
 
 #define DTRACE_INVOP_NOP_SKIP          4
@@ -90,7 +94,7 @@
        (((x) & 0xffc07fff) == 0xa9407bfd || ((x) & 0xffc07fff) == 0xa8c07bfd)
 
 #define FBT_IS_ARM64_ADD_FP_SP(x)      (((x) & 0xffc003ff) == 0x910003fd)      /* add fp, sp, #val  (add fp, sp, #0 == mov fp, sp) */
-#define FBT_IS_ARM64_RET(x)            ((x) == 0xd65f03c0)                     /* ret */
+#define FBT_IS_ARM64_RET(x)            (((x) == 0xd65f03c0) || ((x) == 0xd65f0fff))                    /* ret, retab */
 
 
 #define FBT_B_MASK                     0xff000000
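
The widened macro treats two encodings as a function return: 0xd65f03c0 is `ret` (branch to x30) and 0xd65f0fff is `retab`, which authenticates x30 with the B key before returning. A tiny standalone check of the rule, using only the literal opcodes above (0xd503201f, `nop`, serves as a non-matching control):

#include <assert.h>

#define FBT_IS_ARM64_RET(x)	(((x) == 0xd65f03c0) || ((x) == 0xd65f0fff))	/* ret, retab */

int
main(void)
{
	assert(FBT_IS_ARM64_RET(0xd65f03c0));	/* ret   */
	assert(FBT_IS_ARM64_RET(0xd65f0fff));	/* retab */
	assert(!FBT_IS_ARM64_RET(0xd503201f));	/* nop: not a return */
	return 0;
}
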
@@ -128,19 +132,19 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval)
                                if (fbt->fbtp_roffset == 0) {
                                        /*
                                         * Stack looks like this:
-                                        *      
+                                        *
                                         *      [Higher addresses]
-                                        *      
+                                        *
                                         *      Frame of caller
                                         *      Extra args for callee
-                                        *      ------------------------ 
+                                        *      ------------------------
                                         *      Frame from traced function: <previous sp (e.g. 0x1000), return address>
                                         *      ------------------------
                                         *      arm_context_t
                                         *      ------------------------
                                         *      Frame from trap handler:  <previous sp (e.g. 0x1000) , traced PC >
                                         *                              The traced function never got to mov fp, sp,
-                                        *                              so there is no frame in the backtrace pointing 
+                                        *                              so there is no frame in the backtrace pointing
                                         *                              to the frame on the stack containing the LR in the
                                         *                              caller.
                                         *      ------------------------
@@ -155,29 +159,29 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval)
 
                                        arm_saved_state_t *regs = (arm_saved_state_t *)(&((arm_context_t *)stack)->ss);
 
-                                       /* 
-                                        * cpu_dtrace_caller compensates for fact that the traced function never got to update its fp. 
-                                        * When walking the stack, when we reach the frame where we extract a PC in the patched 
+                                       /*
+                                        * cpu_dtrace_caller compensates for fact that the traced function never got to update its fp.
+                                        * When walking the stack, when we reach the frame where we extract a PC in the patched
                                         * function, we put the cpu_dtrace_caller in the backtrace instead.  The next frame we extract
-                                        * will be in the caller's caller, so we output a backtrace starting at the caller and going 
+                                        * will be in the caller's caller, so we output a backtrace starting at the caller and going
                                         * sequentially up the stack.
                                         */
-                                       CPU->cpu_dtrace_caller = get_saved_state_lr(regs); 
+                                       CPU->cpu_dtrace_caller = get_saved_state_lr(regs);
                                        dtrace_probe(fbt->fbtp_id, get_saved_state_reg(regs, 0), get_saved_state_reg(regs, 1),
                                            get_saved_state_reg(regs, 2), get_saved_state_reg(regs, 3),get_saved_state_reg(regs, 4));
                                        CPU->cpu_dtrace_caller = 0;
                                } else {
                                        /*
                                         * When fbtp_roffset is non-zero, we know we are handling a return probe point.
-                                        * 
+                                        *
                                         *
                                         * Stack looks like this, as we've already popped the frame in the traced callee, and
                                         * we trap with lr set to the return address in the caller.
                                         *      [Higher addresses]
-                                        *      
+                                        *
                                         *      Frame of caller
                                         *      Extra args for callee
-                                        *      ------------------------ 
+                                        *      ------------------------
                                         *      arm_context_t
                                         *      ------------------------
                                         *      Frame from trap handler:  <sp at time of trap, traced PC >
@@ -198,7 +202,7 @@ fbt_invop(uintptr_t addr, uintptr_t * stack, uintptr_t rval)
                                }
                                CPU->cpu_dtrace_invop_underway = 0;
                        }
-               
+
                        /*
                                On other architectures, we return a DTRACE constant to let the callback function
                                know what was replaced. On the ARM, since the function prologue/epilogue machine code
@@ -280,8 +284,11 @@ fbt_perfCallback(
                        retval = KERN_SUCCESS;
                } else if (FBT_IS_ARM64_RET(emul)) {
                        lr = get_saved_state_lr(regs);
+#if __has_feature(ptrauth_calls)
+                       lr = (user_addr_t) ptrauth_strip((void *)lr, ptrauth_key_return_address);
+#endif
                        set_saved_state_pc(regs, lr);
-                       retval = KERN_SUCCESS;                  
+                       retval = KERN_SUCCESS;
                } else if (FBT_IS_ARM64_B_INSTR(emul)) {
                        pc = get_saved_state_pc(regs);
                        imm = FBT_GET_ARM64_B_IMM(emul);
@@ -301,20 +308,19 @@ fbt_perfCallback(
 }
 
 void
-fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart)
+fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t *instrHigh)
 {
-       unsigned int    j;
         int            doenable = 0;
        dtrace_id_t     thisid;
 
        fbt_probe_t     *newfbt, *retfbt, *entryfbt;
        machine_inst_t *instr, *pushinstr = NULL, *limit, theInstr;
        int             foundPushLR, savedRegs;
-       
+
        /*
-        * Guard against null symbols
+        * Guard against null and invalid symbols
         */
-       if (!symbolStart || !instrLow || !instrHigh) {
+       if (!symbolStart || !instrHigh || instrHigh < symbolStart) {
                kprintf("dtrace: %s has an invalid address\n", symbolName);
                return;
        }
@@ -322,15 +328,13 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        /*
         * Assume the compiler doesn't schedule instructions in the prologue.
         */
-
        foundPushLR = 0;
        savedRegs = -1;
        limit = (machine_inst_t *)instrHigh;
 
        assert(sizeof(*instr) == 4);
 
-       for (j = 0, instr = symbolStart, theInstr = 0;
-            (j < 8) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr)); j++, instr++)
+       for (instr = symbolStart, theInstr = 0; instr < instrHigh; instr++)
        {
                /*
                 * Count the number of time we pushed something onto the stack
@@ -361,7 +365,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP);
        newfbt->fbtp_next = NULL;
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
-               
+
        if (thisid != 0) {
                /*
                 * The dtrace_probe previously existed, so we have to hook
@@ -417,7 +421,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        doenable=0;
 
        thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
-               
+
        if (thisid != 0) {
                /* The dtrace_probe previously existed, so we have to
                 * find the end of the existing fbt chain.  If we find
@@ -455,7 +459,7 @@ again:
         * OK, it's an instruction.
         */
        theInstr = *instr;
-               
+
        /* Walked onto the start of the next routine? If so, bail out from this function */
        if (FBT_IS_ARM64_FRAME_PUSH(theInstr)) {
                if (!retfbt)
@@ -498,7 +502,7 @@ again:
                return;
 
        newfbt = kmem_zalloc(sizeof(fbt_probe_t), KM_SLEEP);
-       newfbt->fbtp_next = NULL;       
+       newfbt->fbtp_next = NULL;
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
 
        if (retfbt == NULL) {
@@ -529,80 +533,3 @@ again:
        instr++;
        goto again;
 }
-
-void
-fbt_provide_module_kernel_syms(struct modctl *ctl)
-{
-       kernel_mach_header_t            *mh;
-       struct load_command             *cmd;
-       kernel_segment_command_t        *orig_ts = NULL, *orig_le = NULL;
-       struct symtab_command           *orig_st = NULL;
-       kernel_nlist_t                  *sym = NULL;
-       char                            *strings;
-       uintptr_t                       instrLow, instrHigh;
-       char                            *modname;
-       unsigned int                    i;
-
-       mh = (kernel_mach_header_t *)(ctl->mod_address);
-       modname = ctl->mod_modname;
-       
-       /*
-        * Employees of dtrace and their families are ineligible.  Void
-        * where prohibited.
-        */
-
-       if (mh->magic != MH_MAGIC_KERNEL)
-               return;
-
-       cmd = (struct load_command *) & mh[1];
-       for (i = 0; i < mh->ncmds; i++) {
-               if (cmd->cmd == LC_SEGMENT_KERNEL) {
-                       kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
-
-                       if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
-                               orig_ts = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
-                               orig_le = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, ""))
-                               orig_ts = orig_sg;      /* kexts have a single
-                                                        * unnamed segment */
-               } else if (cmd->cmd == LC_SYMTAB)
-                       orig_st = (struct symtab_command *) cmd;
-
-               cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);
-       }
-
-       if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
-               return;
-
-       sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
-       strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
-
-       /* Find extent of the TEXT section */
-       instrLow = (uintptr_t) orig_ts->vmaddr;
-       instrHigh = (uintptr_t) (orig_ts->vmaddr + orig_ts->vmsize);
-
-       for (i = 0; i < orig_st->nsyms; i++) {
-               uint8_t         n_type = sym[i].n_type & (N_TYPE | N_EXT);
-               char           *name = strings + sym[i].n_un.n_strx;
-
-               /* Check that the symbol is a global and that it has a name. */
-               if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
-                       continue;
-
-               if (0 == sym[i].n_un.n_strx)    /* iff a null, "", name. */
-                       continue;
-
-               /* Lop off omnipresent leading underscore. */
-               if (*name == '_')
-                       name += 1;
-
-                /*
-                * We're only blacklisting functions in the kernel for now.
-                */
-               if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
-                       continue;
-
-               fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value);
-       }
-}
index 22dcc12d7e69a942c58a4af0f0b08b101616bf69..deb952d44aaef2ad793a7d4cc239cbc7bdbc4b84 100644 (file)
@@ -7,6 +7,11 @@
 
 #include <machine/machine_routines.h>
 
+#include <mach/host_info.h>
+#include <mach/mach_host.h>
+#include <arm/cpuid.h>
+#include <libkern/libkern.h>
+
 extern uint64_t        wake_abstime;
 
 static
@@ -53,3 +58,121 @@ SYSCTL_PROC(_machdep, OID_AUTO, wake_conttime,
             "Continuous Time at the last wakeup");
 
 
+/*
+ * For source compatibility, here are some machdep.cpu MIBs that
+ * use host_info() to simulate reasonable answers.
+ */
+
+SYSCTL_NODE(_machdep, OID_AUTO, cpu, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
+       "CPU info");
+
+static int
+arm_host_info SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+
+       host_basic_info_data_t hinfo;
+       mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+#define BSD_HOST       1
+       kern_return_t kret = host_info((host_t)BSD_HOST,
+               HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
+       if (KERN_SUCCESS != kret)
+               return (EINVAL);
+
+       if (sizeof (uint32_t) != arg2)
+               panic("size mismatch");
+
+       uintptr_t woffset = (uintptr_t)arg1 / sizeof (uint32_t);
+       uint32_t datum = *(uint32_t *)(((uint32_t *)&hinfo) + woffset);
+       return (SYSCTL_OUT(req, &datum, sizeof (datum)));
+}
+
+/*
+ * machdep.cpu.cores_per_package
+ *
+ * x86: derived from CPUID data.
+ * ARM: how many physical cores we have in the AP; aka hw.physicalcpu_max
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, cores_per_package,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, physical_cpu_max),
+       sizeof (integer_t),
+       arm_host_info, "I", "CPU cores per package");
+
+/*
+ * machdep.cpu.core_count
+ *
+ * x86: derived from CPUID data.
+ * ARM: # active physical cores in the AP; aka hw.physicalcpu
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, core_count,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, physical_cpu),
+       sizeof (integer_t),
+       arm_host_info, "I", "Number of enabled cores per package");
+
+/*
+ * machdep.cpu.logical_per_package
+ *
+ * x86: derived from CPUID data. Returns ENOENT if HTT bit not set, but
+ *      most x64 CPUs have that, so assume it's available.
+ * ARM: total # logical cores in the AP; aka hw.logicalcpu_max
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, logical_per_package,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, logical_cpu_max),
+       sizeof (integer_t),
+       arm_host_info, "I", "CPU logical cpus per package");
+
+/*
+ * machdep.cpu.thread_count
+ *
+ * x86: derived from CPUID data.
+ * ARM: # active logical cores in the AP; aka hw.logicalcpu
+ */
+static
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, thread_count,
+       CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_LOCKED,
+       (void *)offsetof(host_basic_info_data_t, logical_cpu),
+       sizeof (integer_t),
+       arm_host_info, "I", "Number of enabled threads per package");
+
+/*
+ * machdep.cpu.brand_string
+ *
+ * x86: derived from CPUID data.
+ * ARM: cons something up from the CPUID register. Could include cpufamily
+ *     here and map it to a "marketing" name, but there's no obvious need;
+ *      the value is already exported via the commpage. So keep it simple.
+ */
+static int
+make_brand_string SYSCTL_HANDLER_ARGS
+{
+       __unused struct sysctl_oid *unused_oidp = oidp;
+       __unused void *unused_arg1 = arg1;
+       __unused int unused_arg2 = arg2;
+
+       const char *impl;
+
+       switch (cpuid_info()->arm_info.arm_implementor) {
+       case CPU_VID_APPLE:
+               impl = "Apple";
+               break;
+       case CPU_VID_ARM:
+               impl = "ARM";
+               break;
+       default:
+               impl = "ARM architecture";
+               break;
+       }
+       char buf[80];
+       snprintf(buf, sizeof (buf), "%s processor", impl);
+       return (SYSCTL_OUT(req, buf, strlen(buf) + 1));
+}
+
+SYSCTL_PROC(_machdep_cpu, OID_AUTO, brand_string,
+       CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
+       0, 0, make_brand_string, "A", "CPU brand string");
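
Because these OIDs only repackage what host_info() already reports, existing userland code written against the x86 names keeps working. For example, an ordinary user-space program (not part of this change) can read the new MIBs with sysctlbyname:

#include <stdio.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int
main(void)
{
	int cores = 0;
	size_t len = sizeof(cores);
	if (sysctlbyname("machdep.cpu.core_count", &cores, &len, NULL, 0) == 0)
		printf("core_count: %d\n", cores);

	char brand[80];
	len = sizeof(brand);
	if (sysctlbyname("machdep.cpu.brand_string", brand, &len, NULL, 0) == 0)
		printf("brand_string: %s\n", brand);

	return 0;
}
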
index 587a8b43860f6a6de3c73868e66fd94de6e96520..a83adc712e2dc1f68406b6c4b38320652afa2572 100644 (file)
@@ -120,6 +120,8 @@ extern kmod_info_t g_kernel_kmod_info;
 
 extern void dtrace_suspend(void);
 extern void dtrace_resume(void);
+extern void dtrace_early_init(void);
+extern int dtrace_keep_kernel_symbols(void);
 extern void dtrace_init(void);
 extern void helper_init(void);
 extern void fasttrap_init(void);
@@ -131,6 +133,7 @@ extern void dtrace_postinit(void);
 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
 extern void dtrace_proc_exec(proc_t*);
 extern void dtrace_proc_exit(proc_t*);
+
 /*
  * DTrace Tunable Variables
  *
@@ -205,13 +208,14 @@ unsigned int      dtrace_max_cpus = 0;            /* number of enabled cpus */
  */
 static dev_info_t      *dtrace_devi;           /* device info */
 static vmem_t          *dtrace_arena;          /* probe ID arena */
-static taskq_t         *dtrace_taskq;          /* task queue */
 static dtrace_probe_t  **dtrace_probes;        /* array of all probes */
 static int             dtrace_nprobes;         /* number of probes */
 static dtrace_provider_t *dtrace_provider;     /* provider list */
 static dtrace_meta_t   *dtrace_meta_pid;       /* user-land meta provider */
 static int             dtrace_opens;           /* number of opens */
 static int             dtrace_helpers;         /* number of helpers */
+static dtrace_hash_t   *dtrace_strings;
+static dtrace_hash_t   *dtrace_byprov;         /* probes hashed by provider */
 static dtrace_hash_t   *dtrace_bymod;          /* probes hashed by module */
 static dtrace_hash_t   *dtrace_byfunc;         /* probes hashed by function */
 static dtrace_hash_t   *dtrace_byname;         /* probes hashed by name */
@@ -237,7 +241,7 @@ static int          dtrace_dof_mode;        /* See dtrace_impl.h for a description of Darwin's
                         */
 int                    dtrace_kernel_symbol_mode;      /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
 static uint32_t                dtrace_wake_clients;
-
+static uint8_t      dtrace_kerneluuid[16];     /* the 128-bit uuid */
 
 /*
  * To save memory, some common memory allocations are given a
@@ -328,17 +332,17 @@ dtrace_enable_nullop(void)
     return (0);
 }
 
-static dtrace_pops_t   dtrace_provider_ops = {
-       (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
-       (void (*)(void *, struct modctl *))dtrace_nullop,
-       (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop,
-       (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
-       (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
-       (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
-       NULL,
-       NULL,
-       NULL,
-       (void (*)(void *, dtrace_id_t, void *))dtrace_nullop
+static dtrace_pops_t dtrace_provider_ops = {
+       .dtps_provide = (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop,
+       .dtps_provide_module =  (void (*)(void *, struct modctl *))dtrace_nullop,
+       .dtps_enable =  (int (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+       .dtps_disable = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+       .dtps_suspend = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+       .dtps_resume =  (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
+       .dtps_getargdesc =      NULL,
+       .dtps_getargval =       NULL,
+       .dtps_usermode =        NULL,
+       .dtps_destroy = (void (*)(void *, dtrace_id_t, void *))dtrace_nullop,
 };
 
 static dtrace_id_t     dtrace_probeid_begin;   /* special BEGIN probe */
@@ -393,18 +397,22 @@ static lck_mtx_t dtrace_errlock;
  * outside of the implementation.  There is no real structure to this cpp
  * mishmash -- but is there ever?
  */
-#define        DTRACE_HASHSTR(hash, probe)     \
-       dtrace_hash_str(*((char **)((uintptr_t)(probe) + (hash)->dth_stroffs)))
 
-#define        DTRACE_HASHNEXT(hash, probe)    \
-       (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_nextoffs)
+#define        DTRACE_GETSTR(hash, elm)        \
+       (hash->dth_getstr(elm, hash->dth_stroffs))
+
+#define        DTRACE_HASHSTR(hash, elm)       \
+       dtrace_hash_str(DTRACE_GETSTR(hash, elm))
+
+#define        DTRACE_HASHNEXT(hash, elm)      \
+       (void**)((uintptr_t)(elm) + (hash)->dth_nextoffs)
 
-#define        DTRACE_HASHPREV(hash, probe)    \
-       (dtrace_probe_t **)((uintptr_t)(probe) + (hash)->dth_prevoffs)
+#define        DTRACE_HASHPREV(hash, elm)      \
+       (void**)((uintptr_t)(elm) + (hash)->dth_prevoffs)
 
 #define        DTRACE_HASHEQ(hash, lhs, rhs)   \
-       (strcmp(*((char **)((uintptr_t)(lhs) + (hash)->dth_stroffs)), \
-           *((char **)((uintptr_t)(rhs) + (hash)->dth_stroffs))) == 0)
+       (strcmp(DTRACE_GETSTR(hash, lhs), \
+           DTRACE_GETSTR(hash, rhs)) == 0)
 
 #define        DTRACE_AGGHASHSIZE_SLEW         17
 
@@ -756,6 +764,9 @@ sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
        if (value <= 0)
                return (ERANGE);
 
+       if (value >= dtrace_copy_maxsize())
+               return (ERANGE);
+
        lck_mtx_lock(&dtrace_lock);
                dtrace_dof_maxsize = value;
        lck_mtx_unlock(&dtrace_lock);
@@ -851,6 +862,15 @@ SYSCTL_PROC(_kern_dtrace, OID_AUTO, provide_private_probes,
        &dtrace_provide_private_probes, 0,
        sysctl_dtrace_provide_private_probes, "I", "provider must provide the private probes");
 
+/*
+ * kern.dtrace.dof_mode
+ *
+ * Returns the current DOF mode.
+ * This value is read-only.
+ */
+SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
+       &dtrace_dof_mode, 0, "dtrace dof mode");
+
 /*
  * DTrace Probe Context Functions
  *
@@ -7012,12 +7032,33 @@ dtrace_hash_str(const char *p)
        return (hval);
 }
 
+static const char*
+dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
+{
+#pragma unused(offs)
+       dtrace_probe_t *probe = (dtrace_probe_t*)elm;
+       return probe->dtpr_provider->dtpv_name;
+}
+
+static const char*
+dtrace_strkey_offset(void *elm, uintptr_t offs)
+{
+       return ((char *)((uintptr_t)(elm) + offs));
+}
+
+static const char*
+dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
+{
+       return *((char **)((uintptr_t)(elm) + offs));
+}
+
 static dtrace_hash_t *
-dtrace_hash_create(uintptr_t stroffs, uintptr_t nextoffs, uintptr_t prevoffs)
+dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
 {
        dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
 
-       hash->dth_stroffs = stroffs;
+       hash->dth_getstr = func;
+       hash->dth_stroffs = arg;
        hash->dth_nextoffs = nextoffs;
        hash->dth_prevoffs = prevoffs;
 
@@ -7066,10 +7107,10 @@ dtrace_hash_resize(dtrace_hash_t *hash)
 
        for (i = 0; i < size; i++) {
                for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
-                       dtrace_probe_t *probe = bucket->dthb_chain;
+                       void *elm = bucket->dthb_chain;
 
-                       ASSERT(probe != NULL);
-                       ndx = DTRACE_HASHSTR(hash, probe) & new_mask;
+                       ASSERT(elm != NULL);
+                       ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
 
                        next = bucket->dthb_next;
                        bucket->dthb_next = new_tab[ndx];
@@ -7084,12 +7125,12 @@ dtrace_hash_resize(dtrace_hash_t *hash)
 }
 
 static void
-dtrace_hash_add(dtrace_hash_t *hash, dtrace_probe_t *new)
+dtrace_hash_add(dtrace_hash_t *hash, void *new)
 {
        int hashval = DTRACE_HASHSTR(hash, new);
        int ndx = hashval & hash->dth_mask;
        dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
-       dtrace_probe_t **nextp, **prevp;
+       void **nextp, **prevp;
 
        for (; bucket != NULL; bucket = bucket->dthb_next) {
                if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
@@ -7122,23 +7163,29 @@ add:
        bucket->dthb_len++;
 }
 
-static dtrace_probe_t *
-dtrace_hash_lookup(dtrace_hash_t *hash, dtrace_probe_t *template)
+static void *
+dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
 {
-       int hashval = DTRACE_HASHSTR(hash, template);
+       int hashval = dtrace_hash_str(str);
        int ndx = hashval & hash->dth_mask;
        dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 
        for (; bucket != NULL; bucket = bucket->dthb_next) {
-               if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
+               if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
                        return (bucket->dthb_chain);
        }
 
        return (NULL);
 }
 
+static dtrace_probe_t *
+dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
+{
+       return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
+}
+
 static int
-dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
+dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
 {
        int hashval = DTRACE_HASHSTR(hash, template);
        int ndx = hashval & hash->dth_mask;
@@ -7153,19 +7200,19 @@ dtrace_hash_collisions(dtrace_hash_t *hash, dtrace_probe_t *template)
 }
 
 static void
-dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
+dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
 {
-       int ndx = DTRACE_HASHSTR(hash, probe) & hash->dth_mask;
+       int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
        dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
 
-       dtrace_probe_t **prevp = DTRACE_HASHPREV(hash, probe);
-       dtrace_probe_t **nextp = DTRACE_HASHNEXT(hash, probe);
+       void **prevp = DTRACE_HASHPREV(hash, elm);
+       void **nextp = DTRACE_HASHNEXT(hash, elm);
 
        /*
-        * Find the bucket that we're removing this probe from.
+        * Find the bucket that we're removing this elm from.
         */
        for (; bucket != NULL; bucket = bucket->dthb_next) {
-               if (DTRACE_HASHEQ(hash, bucket->dthb_chain, probe))
+               if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
                        break;
        }
 
@@ -7174,12 +7221,12 @@ dtrace_hash_remove(dtrace_hash_t *hash, dtrace_probe_t *probe)
        if (*prevp == NULL) {
                if (*nextp == NULL) {
                        /*
-                        * The removed probe was the only probe on this
+                        * The removed element was the only element on this
                         * bucket; we need to remove the bucket.
                         */
                        dtrace_hashbucket_t *b = hash->dth_tab[ndx];
 
-                       ASSERT(bucket->dthb_chain == probe);
+                       ASSERT(bucket->dthb_chain == elm);
                        ASSERT(b != NULL);
 
                        if (b == bucket) {
@@ -7219,20 +7266,63 @@ dtrace_badattr(const dtrace_attribute_t *a)
 }
 
 /*
- * Return a duplicate copy of a string.  If the specified string is NULL,
- * this function returns a zero-length string.
- * APPLE NOTE: Darwin employs size bounded string operation.
+ * Returns a dtrace-managed copy of a string, and will
+ * deduplicate copies of the same string.
+ * If the specified string is NULL, returns an empty string
  */
 static char *
-dtrace_strdup(const char *str)
+dtrace_strref(const char *str)
 {
+       dtrace_string_t *s = NULL;
        size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
-       char *new = kmem_zalloc(bufsize, KM_SLEEP);
 
-       if (str != NULL)
-               (void) strlcpy(new, str, bufsize);
+       LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
 
-       return (new);
+       if (str == NULL)
+               str = "";
+
+       for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
+            s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
+               if (strncmp(str, s->dtst_str, bufsize) != 0) {
+                       continue;
+               }
+               ASSERT(s->dtst_refcount != UINT32_MAX);
+               s->dtst_refcount++;
+               return s->dtst_str;
+       }
+
+       s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
+       s->dtst_refcount = 1;
+       (void) strlcpy(s->dtst_str, str, bufsize);
+
+       dtrace_hash_add(dtrace_strings, s);
+
+       return s->dtst_str;
+}
+
+static void
+dtrace_strunref(const char *str)
+{
+       ASSERT(str != NULL);
+       dtrace_string_t *s = NULL;
+       size_t bufsize = strlen(str) + 1;
+
+       LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
+
+       for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
+            s = *(DTRACE_HASHNEXT(dtrace_strings, s)))  {
+               if (strncmp(str, s->dtst_str, bufsize) != 0) {
+                       continue;
+               }
+               ASSERT(s->dtst_refcount != 0);
+               s->dtst_refcount--;
+               if (s->dtst_refcount == 0) {
+                       dtrace_hash_remove(dtrace_strings, s);
+                       kmem_free(s, sizeof(dtrace_string_t) + bufsize);
+               }
+               return;
+       }
+       panic("attempt to unref non-existent string %s", str);
 }
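
Every dtrace_strref() must eventually be balanced by a dtrace_strunref() on the same pointer, and both calls must run with dtrace_lock held, as the LCK_MTX_ASSERTs above require; the probe-key and probe-creation paths below follow exactly this pattern. A condensed sketch of the lifecycle:

/* Illustrative only: intern a name, use the shared copy, release it. */
static void
string_table_example(void)
{
	lck_mtx_lock(&dtrace_lock);

	char *name = dtrace_strref("fbt");	/* refcounted, deduplicated copy */
	/* ... store in a probe key, compare by pointer, hash it, ... */
	dtrace_strunref(name);			/* freed once the count reaches 0 */

	lck_mtx_unlock(&dtrace_lock);
}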
 
 #define        DTRACE_ISALPHA(c)       \
@@ -7529,9 +7619,27 @@ static int
 dtrace_match_string(const char *s, const char *p, int depth)
 {
 #pragma unused(depth) /* __APPLE__ */
+       return (s != NULL && s == p);
+}
 
-       /* APPLE NOTE: Darwin employs size bounded string operation. */
-       return (s != NULL && strncmp(s, p, strlen(s) + 1) == 0);
+/*ARGSUSED*/
+static int
+dtrace_match_module(const char *s, const char *p, int depth)
+{
+#pragma unused(depth) /* __APPLE__ */
+       size_t len;
+       if (s == NULL || p == NULL)
+               return (0);
+
+       len = strlen(p);
+
+       if (strncmp(p, s, len) != 0)
+               return (0);
+
+       if (s[len] == '.' || s[len] == '\0')
+               return (1);
+
+       return (0);
 }
 
 /*ARGSUSED*/
@@ -7554,7 +7662,18 @@ static int
 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
     zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
 {
-       dtrace_probe_t template, *probe;
+       dtrace_probe_t *probe;
+       dtrace_provider_t prov_template = {
+               .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
+       };
+
+       dtrace_probe_t template = {
+               .dtpr_provider = &prov_template,
+               .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
+               .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
+               .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
+       };
+
        dtrace_hash_t *hash = NULL;
        int len, rc, best = INT_MAX, nmatched = 0;
        dtrace_id_t i;
@@ -7575,16 +7694,19 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
                return (nmatched);
        }
 
-       template.dtpr_mod =  (char *)(uintptr_t)pkp->dtpk_mod;
-       template.dtpr_func = (char *)(uintptr_t)pkp->dtpk_func;
-       template.dtpr_name = (char *)(uintptr_t)pkp->dtpk_name;
-
        /*
-        * We want to find the most distinct of the module name, function
-        * name, and name.  So for each one that is not a glob pattern or
-        * empty string, we perform a lookup in the corresponding hash and
-        * use the hash table with the fewest collisions to do our search.
+        * We want to find the most distinct of the provider name, module name,
+        * function name, and name.  So for each one that is not a glob
+        * pattern or empty string, we perform a lookup in the corresponding
+        * hash and use the hash table with the fewest collisions to do our
+        * search.
         */
+       if (pkp->dtpk_pmatch == &dtrace_match_string &&
+           (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
+               best = len;
+               hash = dtrace_byprov;
+       }
+
        if (pkp->dtpk_mmatch == &dtrace_match_string &&
            (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
                best = len;
@@ -7671,6 +7793,24 @@ dtrace_probekey_func(const char *p)
        return (&dtrace_match_string);
 }
 
+static dtrace_probekey_f *
+dtrace_probekey_module_func(const char *p)
+{
+       LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
+
+       dtrace_probekey_f *f = dtrace_probekey_func(p);
+       if (f == &dtrace_match_string) {
+               dtrace_probe_t template = {
+                       .dtpr_mod = (char *)(uintptr_t)p,
+               };
+               if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
+                       return (&dtrace_match_module);
+               }
+               return (&dtrace_match_string);
+       }
+       return f;
+}
+
 /*
  * Build a probe comparison key for use with dtrace_match_probe() from the
  * given probe description.  By convention, a null key only matches anchored
@@ -7680,16 +7820,17 @@ dtrace_probekey_func(const char *p)
 static void
 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
 {
-       pkp->dtpk_prov = pdp->dtpd_provider;
+       pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
        pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
 
-       pkp->dtpk_mod = pdp->dtpd_mod;
-       pkp->dtpk_mmatch = dtrace_probekey_func(pdp->dtpd_mod);
+       pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
+       pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
 
-       pkp->dtpk_func = pdp->dtpd_func;
+       pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
        pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
 
-       pkp->dtpk_name = pdp->dtpd_name;
+       pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
        pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
 
        pkp->dtpk_id = pdp->dtpd_id;
@@ -7702,6 +7843,15 @@ dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
                pkp->dtpk_fmatch = &dtrace_match_nonzero;
 }
 
+static void
+dtrace_probekey_release(dtrace_probekey_t *pkp)
+{
+       dtrace_strunref(pkp->dtpk_prov);
+       dtrace_strunref(pkp->dtpk_mod);
+       dtrace_strunref(pkp->dtpk_func);
+       dtrace_strunref(pkp->dtpk_name);
+}
+
 static int
 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
 {
@@ -7779,13 +7929,6 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
 
        provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
 
-       /* APPLE NOTE: Darwin employs size bounded string operation. */
-       {
-       size_t bufsize = strlen(name) + 1;
-       provider->dtpv_name = kmem_alloc(bufsize, KM_SLEEP);
-       (void) strlcpy(provider->dtpv_name, name, bufsize);
-       }
-
        provider->dtpv_attr = *pap;
        provider->dtpv_priv.dtpp_flags = priv;
        if (cr != NULL) {
@@ -7820,6 +7963,9 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
        if (pops == &dtrace_provider_ops) {
                LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
                LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
+
+               provider->dtpv_name = dtrace_strref(name);
+
                ASSERT(dtrace_anon.dta_enabling == NULL);
 
                /*
@@ -7834,6 +7980,8 @@ dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
        lck_mtx_lock(&dtrace_provider_lock);
        lck_mtx_lock(&dtrace_lock);
 
+       provider->dtpv_name = dtrace_strref(name);
+
        /*
         * If there is at least one provider registered, we'll add this
         * provider after the first provider.
@@ -7878,8 +8026,11 @@ dtrace_unregister(dtrace_provider_id_t id)
 {
        dtrace_provider_t *old = (dtrace_provider_t *)id;
        dtrace_provider_t *prev = NULL;
-       int i, self = 0;
-       dtrace_probe_t *probe, *first = NULL;
+       int self = 0;
+       dtrace_probe_t *probe, *first = NULL, *next = NULL;
+       dtrace_probe_t template = {
+               .dtpr_provider = old
+       };
 
        if (old->dtpv_pops.dtps_enable ==
            (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
@@ -7940,14 +8091,12 @@ dtrace_unregister(dtrace_provider_id_t id)
         * All of the probes for this provider are disabled; we can safely
         * remove all of them from their hash chains and from the probe array.
         */
-       for (i = 0; i < dtrace_nprobes && old->dtpv_probe_count!=0; i++) {
-               if ((probe = dtrace_probes[i]) == NULL)
-                       continue;
-
+       for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
+           probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
                if (probe->dtpr_provider != old)
                        continue;
 
-               dtrace_probes[i] = NULL;
+               dtrace_probes[probe->dtpr_id - 1] = NULL;
                old->dtpv_probe_count--;
 
                dtrace_hash_remove(dtrace_bymod, probe);
@@ -7958,11 +8107,19 @@ dtrace_unregister(dtrace_provider_id_t id)
                        first = probe;
                        probe->dtpr_nextmod = NULL;
                } else {
+                       /*
+                        * Use nextmod as the chain of probes to remove
+                        */
                        probe->dtpr_nextmod = first;
                        first = probe;
                }
        }
 
+       for (probe = first; probe != NULL; probe = next) {
+               next = probe->dtpr_nextmod;
+               dtrace_hash_remove(dtrace_byprov, probe);
+       }
+
        /*
         * The provider's probes have been removed from the hash chains and
         * from the probe array.  Now issue a dtrace_sync() to be sure that
@@ -7970,14 +8127,14 @@ dtrace_unregister(dtrace_provider_id_t id)
         */
        dtrace_sync();
 
-       for (probe = first; probe != NULL; probe = first) {
-               first = probe->dtpr_nextmod;
+       for (probe = first; probe != NULL; probe = next) {
+               next = probe->dtpr_nextmod;
 
                old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
                    probe->dtpr_arg);
-               kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
-               kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
-               kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+               dtrace_strunref(probe->dtpr_mod);
+               dtrace_strunref(probe->dtpr_func);
+               dtrace_strunref(probe->dtpr_name);
                vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
                zfree(dtrace_probe_t_zone, probe);
        }
@@ -7998,13 +8155,14 @@ dtrace_unregister(dtrace_provider_id_t id)
                prev->dtpv_next = old->dtpv_next;
        }
 
+       dtrace_strunref(old->dtpv_name);
+
        if (!self) {
                lck_mtx_unlock(&dtrace_lock);
                lck_mtx_unlock(&mod_lock);
                lck_mtx_unlock(&dtrace_provider_lock);
        }
 
-       kmem_free(old->dtpv_name, strlen(old->dtpv_name) + 1);
        kmem_free(old, sizeof (dtrace_provider_t));
 
        return (0);
@@ -8054,8 +8212,10 @@ int
 dtrace_condense(dtrace_provider_id_t id)
 {
        dtrace_provider_t *prov = (dtrace_provider_t *)id;
-       int i;
-       dtrace_probe_t *probe;
+       dtrace_probe_t *probe, *first = NULL;
+       dtrace_probe_t template = {
+               .dtpr_provider = prov
+       };
 
        /*
         * Make sure this isn't the dtrace provider itself.
@@ -8069,9 +8229,8 @@ dtrace_condense(dtrace_provider_id_t id)
        /*
         * Attempt to destroy the probes associated with this provider.
         */
-       for (i = 0; i < dtrace_nprobes; i++) {
-               if ((probe = dtrace_probes[i]) == NULL)
-                       continue;
+       for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
+           probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
 
                if (probe->dtpr_provider != prov)
                        continue;
@@ -8079,20 +8238,35 @@ dtrace_condense(dtrace_provider_id_t id)
                if (probe->dtpr_ecb != NULL)
                        continue;
 
-               dtrace_probes[i] = NULL;
+               dtrace_probes[probe->dtpr_id - 1] = NULL;
                prov->dtpv_probe_count--;
 
                dtrace_hash_remove(dtrace_bymod, probe);
                dtrace_hash_remove(dtrace_byfunc, probe);
                dtrace_hash_remove(dtrace_byname, probe);
 
-               prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, i + 1,
+               prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
                    probe->dtpr_arg);
-               kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
-               kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
-               kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+               dtrace_strunref(probe->dtpr_mod);
+               dtrace_strunref(probe->dtpr_func);
+               dtrace_strunref(probe->dtpr_name);
+               if (first == NULL) {
+                       first = probe;
+                       probe->dtpr_nextmod = NULL;
+               } else {
+                       /*
+                        * Use nextmod as the chain of probes to remove
+                        */
+                       probe->dtpr_nextmod = first;
+                       first = probe;
+               }
+       }
+
+       for (probe = first; probe != NULL; probe = first) {
+               first = probe->dtpr_nextmod;
+               dtrace_hash_remove(dtrace_byprov, probe);
+               vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
                zfree(dtrace_probe_t_zone, probe);
-               vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
        }
 
        lck_mtx_unlock(&dtrace_lock);
@@ -8136,13 +8310,14 @@ dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
 
        probe->dtpr_id = id;
        probe->dtpr_gen = dtrace_probegen++;
-       probe->dtpr_mod = dtrace_strdup(mod);
-       probe->dtpr_func = dtrace_strdup(func);
-       probe->dtpr_name = dtrace_strdup(name);
+       probe->dtpr_mod = dtrace_strref(mod);
+       probe->dtpr_func = dtrace_strref(func);
+       probe->dtpr_name = dtrace_strref(name);
        probe->dtpr_arg = arg;
        probe->dtpr_aframes = aframes;
        probe->dtpr_provider = provider;
 
+       dtrace_hash_add(dtrace_byprov, probe);
        dtrace_hash_add(dtrace_bymod, probe);
        dtrace_hash_add(dtrace_byfunc, probe);
        dtrace_hash_add(dtrace_byname, probe);
@@ -8225,19 +8400,23 @@ dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
        dtrace_id_t id;
        int match;
 
-       pkey.dtpk_prov = ((dtrace_provider_t *)prid)->dtpv_name;
+       lck_mtx_lock(&dtrace_lock);
+
+       pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
        pkey.dtpk_pmatch = &dtrace_match_string;
-       pkey.dtpk_mod = mod;
+       pkey.dtpk_mod = dtrace_strref(mod);
        pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
-       pkey.dtpk_func = func;
+       pkey.dtpk_func = dtrace_strref(func);
        pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
-       pkey.dtpk_name = name;
+       pkey.dtpk_name = dtrace_strref(name);
        pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
        pkey.dtpk_id = DTRACE_IDNONE;
 
-       lck_mtx_lock(&dtrace_lock);
        match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
            dtrace_probe_lookup_match, &id, NULL);
+
+       dtrace_probekey_release(&pkey);
+
        lck_mtx_unlock(&dtrace_lock);
 
        ASSERT(match == 1 || match == 0);
@@ -8382,6 +8561,7 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtr
        uint32_t priv;
        uid_t uid;
        zoneid_t zoneid;
+       int err;
 
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
 
@@ -8400,8 +8580,11 @@ dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtr
        dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
            &priv, &uid, &zoneid);
 
-       return (dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable,
-           enab, ep));
+       err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
+
+       dtrace_probekey_release(&pkey);
+
+       return err;
 }
 
 /*
@@ -8637,14 +8820,6 @@ dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
 
        meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
        meta->dtm_mops = *mops;
-
-       /* APPLE NOTE: Darwin employs size bounded string operation. */
-       {
-       size_t bufsize = strlen(name) + 1;
-       meta->dtm_name = kmem_alloc(bufsize, KM_SLEEP);
-       (void) strlcpy(meta->dtm_name, name, bufsize);
-       }
-
        meta->dtm_arg = arg;
 
        lck_mtx_lock(&dtrace_meta_lock);
@@ -8655,11 +8830,12 @@ dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
                lck_mtx_unlock(&dtrace_meta_lock);
                cmn_err(CE_WARN, "failed to register meta-register %s: "
                    "user-land meta-provider exists", name);
-               kmem_free(meta->dtm_name, strlen(meta->dtm_name) + 1);
                kmem_free(meta, sizeof (dtrace_meta_t));
                return (EINVAL);
        }
 
+       meta->dtm_name = dtrace_strref(name);
+
        dtrace_meta_pid = meta;
        *idp = (dtrace_meta_provider_id_t)meta;
 
@@ -8718,10 +8894,11 @@ dtrace_meta_unregister(dtrace_meta_provider_id_t id)
 
        *pp = NULL;
 
+       dtrace_strunref(old->dtm_name);
+
        lck_mtx_unlock(&dtrace_lock);
        lck_mtx_unlock(&dtrace_meta_lock);
 
-       kmem_free(old->dtm_name, strlen(old->dtm_name) + 1);
        kmem_free(old, sizeof (dtrace_meta_t));
 
        return (0);
@@ -12024,7 +12201,7 @@ dtrace_dof_create(dtrace_state_t *state)
 
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
 
-       dof = dt_kmem_zalloc_aligned(len, 8, KM_SLEEP);
+       dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
        dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
        dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
        dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
@@ -12102,11 +12279,11 @@ dtrace_dof_copyin(user_addr_t uarg, int *errp)
                return (NULL);
        }
 
-       dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
+       dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
 
         if (copyin(uarg, dof, hdr.dofh_loadsz) != 0  ||
          dof->dofh_loadsz != hdr.dofh_loadsz) {
-           dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
+           kmem_free_aligned(dof, hdr.dofh_loadsz);
            *errp = EFAULT;
            return (NULL);
        }           
@@ -12146,10 +12323,10 @@ dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
                return (NULL);
        }
 
-       dof = dt_kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
+       dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
 
        if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
-               dt_kmem_free_aligned(dof, hdr.dofh_loadsz);
+               kmem_free_aligned(dof, hdr.dofh_loadsz);
                *errp = EFAULT;
                return (NULL);
        }
@@ -12160,13 +12337,13 @@ dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
 static void
 dtrace_dof_destroy(dof_hdr_t *dof)
 {
-       dt_kmem_free_aligned(dof, dof->dofh_loadsz);
+       kmem_free_aligned(dof, dof->dofh_loadsz);
 }
 
 static dof_hdr_t *
 dtrace_dof_property(const char *name)
 {
-       unsigned int len;
+       unsigned int len = 0;
        dof_hdr_t *dof;
 
        if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
@@ -12177,7 +12354,7 @@ dtrace_dof_property(const char *name)
                return NULL;
        }
 
-       dof = dt_kmem_alloc_aligned(len, 8, KM_SLEEP);
+       dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
 
        if (!PEReadNVRAMProperty(name, dof, &len)) {
                dtrace_dof_destroy(dof);
@@ -12789,8 +12966,8 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
                return (-1);
        }
 
-       if (dof->dofh_secsize == 0) {
-               dtrace_dof_error(dof, "zero section header size");
+       if (dof->dofh_secsize < sizeof(dof_sec_t)) {
+               dtrace_dof_error(dof, "invalid section header size");
                return (-1);
        }
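Annotation: the tightened check above rejects any dofh_secsize smaller than a full dof_sec_t rather than only a zero size. Section headers are located by multiplying an index by dofh_secsize, so an undersized value would make later header reads overlap or run short. A standalone sketch of the same validation, using the DOF types from <sys/dtrace.h> but a made-up helper name:

#include <sys/dtrace.h>

/* Hypothetical helper mirroring the stricter check above: every section
 * header slot must be at least sizeof (dof_sec_t) bytes wide. */
static int
dof_secsize_ok(const dof_hdr_t *dof)
{
        return (dof->dofh_secsize >= sizeof (dof_sec_t));
}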
 
@@ -13183,7 +13360,7 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
                major = ddi_driver_major(dtrace_devi);
        }
 
-       state->dts_dev = makedevice(major, minor);
+       state->dts_dev = makedev(major, minor);
 
        if (devp != NULL)
                *devp = state->dts_dev;
@@ -13231,6 +13408,10 @@ dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
         * the normal checks are bypassed.
         */
 #if defined(__APPLE__)
+       if (cr != NULL) {
+               kauth_cred_ref(cr);
+               state->dts_cred.dcr_cred = cr;
+       }
        if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
                if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
                        /*
@@ -13927,7 +14108,7 @@ dtrace_state_destroy(dtrace_state_t *state)
         * Release the credential hold we took in dtrace_state_create().
         */
        if (state->dts_cred.dcr_cred != NULL)
-               crfree(state->dts_cred.dcr_cred);
+               kauth_cred_unref(&state->dts_cred.dcr_cred);
 
        /*
         * Now we can safely disable and destroy any enabled probes.  Because
@@ -14006,6 +14187,20 @@ dtrace_state_destroy(dtrace_state_t *state)
 /*
  * DTrace Anonymous Enabling Functions
  */
+
+int
+dtrace_keep_kernel_symbols(void)
+{
+       if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
+               return 0;
+       }
+
+       if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
+               return 1;
+
+       return 0;
+}
+
 static dtrace_state_t *
 dtrace_anon_grab(void)
 {
@@ -14049,6 +14244,7 @@ dtrace_anon_property(void)
                        break;
                }
 
+#ifdef illumos
                /*
                 * We want to create anonymous state, so we need to transition
                 * the kernel debugger to indicate that DTrace is active.  If
@@ -14061,6 +14257,7 @@ dtrace_anon_property(void)
                        dtrace_dof_destroy(dof);
                        break;
                }
+#endif
 
                /*
                 * If we haven't allocated an anonymous state, we'll do so now.
@@ -14308,6 +14505,7 @@ dtrace_helper_destroygen(proc_t* p, int gen)
        dtrace_vstate_t *vstate;
        uint_t i;
 
+       LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
 
        if (help == NULL || gen > help->dthps_generation)
@@ -14373,13 +14571,11 @@ dtrace_helper_destroygen(proc_t* p, int gen)
                /*
                 * If we have a meta provider, remove this helper provider.
                 */
-               lck_mtx_lock(&dtrace_meta_lock);
                if (dtrace_meta_pid != NULL) {
                        ASSERT(dtrace_deferred_pid == NULL);
                        dtrace_helper_provider_remove(&prov->dthp_prov,
                            p);
                }
-               lck_mtx_unlock(&dtrace_meta_lock);
 
                dtrace_helper_provider_destroy(prov);
 
@@ -14485,9 +14681,9 @@ static void
 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
     dof_helper_t *dofhp)
 {
+       LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
 
-       lck_mtx_lock(&dtrace_meta_lock);
        lck_mtx_lock(&dtrace_lock);
 
        if (!dtrace_attached() || dtrace_meta_pid == NULL) {
@@ -14536,8 +14732,6 @@ dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
                                p);
                }
        }
-
-       lck_mtx_unlock(&dtrace_meta_lock);
 }
 
 static int
@@ -14843,6 +15037,7 @@ dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
        int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
        uintptr_t daddr = (uintptr_t)dof;
 
+       LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
        LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
 
        if ((help = p->p_dtrace_helpers) == NULL)
@@ -15008,7 +15203,7 @@ dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claim
         * Any existing helpers force non-lazy behavior.
         */
        if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
-               lck_mtx_lock(&p->p_dtrace_sprlock);
+               dtrace_sprlock(p);
 
                dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
                unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
@@ -15071,7 +15266,7 @@ dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claim
 #endif /* DEBUG */
 
 unlock:
-               lck_mtx_unlock(&p->p_dtrace_sprlock);
+               dtrace_sprunlock(p);
        } else {
                rval = EACCES;
        }
@@ -15101,7 +15296,7 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation)
         * Any existing helpers force non-lazy behavior.
         */
        if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
-               lck_mtx_lock(&p->p_dtrace_sprlock);
+               dtrace_sprlock(p);
 
                dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
                
@@ -15158,9 +15353,8 @@ dtrace_lazy_dofs_remove(proc_t *p, int generation)
 #endif
 
                }
-
-               lck_mtx_unlock(&p->p_dtrace_sprlock);
-       } else {                
+               dtrace_sprunlock(p);
+       } else {
                rval = EACCES;
        }
        
@@ -15173,14 +15367,14 @@ void
 dtrace_lazy_dofs_destroy(proc_t *p)
 {
        lck_rw_lock_shared(&dtrace_dof_mode_lock);
-       lck_mtx_lock(&p->p_dtrace_sprlock);
+       dtrace_sprlock(p);
        
        ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
 
        dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
        p->p_dtrace_lazy_dofs = NULL;
 
-       lck_mtx_unlock(&p->p_dtrace_sprlock);
+       dtrace_sprunlock(p);
        lck_rw_unlock_shared(&dtrace_dof_mode_lock);
 
        if (lazy_dofs) {
@@ -15205,7 +15399,7 @@ dtrace_lazy_dofs_process(proc_t *p) {
         * fault in the dof. We could fix this by holding locks longer,
         * but the errors are benign.
         */
-       lck_mtx_lock(&p->p_dtrace_sprlock);
+       dtrace_sprlock(p);
 
 
        ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
@@ -15214,8 +15408,8 @@ dtrace_lazy_dofs_process(proc_t *p) {
        dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
        p->p_dtrace_lazy_dofs = NULL;
 
-       lck_mtx_unlock(&p->p_dtrace_sprlock);
-
+       dtrace_sprunlock(p);
+       lck_mtx_lock(&dtrace_meta_lock);
        /*
         * Process each dof_helper_t
         */
@@ -15270,8 +15464,10 @@ dtrace_lazy_dofs_process(proc_t *p) {
                                lck_mtx_unlock(&dtrace_lock);
                        }
                }
-
+               lck_mtx_unlock(&dtrace_meta_lock);
                kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
+       } else {
+               lck_mtx_unlock(&dtrace_meta_lock);
        }
 }
 
@@ -15295,7 +15491,7 @@ dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
        LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
 
        lck_rw_lock_shared(&dtrace_dof_mode_lock);
-       lck_mtx_lock(&parent->p_dtrace_sprlock);
+       dtrace_sprlock(parent);
 
        /*
         * We need to make sure that the transition to lazy dofs -> helpers
@@ -15315,12 +15511,12 @@ dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
                bcopy(parent_dofs, child_dofs, parent_dofs_size);
        }
 
-       lck_mtx_unlock(&parent->p_dtrace_sprlock);
+       dtrace_sprunlock(parent);
 
        if (child_dofs) {
-               lck_mtx_lock(&child->p_dtrace_sprlock);
+               dtrace_sprlock(child);
                child->p_dtrace_lazy_dofs = child_dofs;
-               lck_mtx_unlock(&child->p_dtrace_sprlock);
+               dtrace_sprunlock(child);
                /**
                 * We process the DOF at this point if the mode is set to
                 * LAZY_OFF. This can happen if DTrace is still processing the
@@ -15365,6 +15561,7 @@ dtrace_helpers_destroy(proc_t* p)
        dtrace_vstate_t *vstate;
        uint_t i;
 
+       lck_mtx_lock(&dtrace_meta_lock);
        lck_mtx_lock(&dtrace_lock);
 
        ASSERT(p->p_dtrace_helpers != NULL);
@@ -15398,7 +15595,6 @@ dtrace_helpers_destroy(proc_t* p)
         * Destroy the helper providers.
         */
        if (help->dthps_maxprovs > 0) {
-               lck_mtx_lock(&dtrace_meta_lock);
                if (dtrace_meta_pid != NULL) {
                        ASSERT(dtrace_deferred_pid == NULL);
 
@@ -15428,7 +15624,6 @@ dtrace_helpers_destroy(proc_t* p)
                        lck_mtx_unlock(&dtrace_lock);
                }
 
-               lck_mtx_unlock(&dtrace_meta_lock);
 
                for (i = 0; i < help->dthps_nprovs; i++) {
                        dtrace_helper_provider_destroy(help->dthps_provs[i]);
@@ -15447,6 +15642,7 @@ dtrace_helpers_destroy(proc_t* p)
 
        --dtrace_helpers;
        lck_mtx_unlock(&dtrace_lock);
+       lck_mtx_unlock(&dtrace_meta_lock);
 }
 
 static void
@@ -15459,6 +15655,7 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to)
        uint_t i;
        int j, sz, hasprovs = 0;
 
+       lck_mtx_lock(&dtrace_meta_lock);
        lck_mtx_lock(&dtrace_lock);
        ASSERT(from->p_dtrace_helpers != NULL);
        ASSERT(dtrace_helpers > 0);
@@ -15530,6 +15727,8 @@ dtrace_helpers_duplicate(proc_t *from, proc_t *to)
 
        if (hasprovs)
                dtrace_helper_provider_register(to, newhelp, NULL);
+
+       lck_mtx_unlock(&dtrace_meta_lock);
 }
 
 /**
@@ -15550,7 +15749,7 @@ dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
                 * the p_dtrace_sprlock lock. A full sprlock would
                 * task_suspend the parent.
                 */
-               lck_mtx_lock(&parent_proc->p_dtrace_sprlock);
+               dtrace_sprlock(parent_proc);
 
                /*
                 * Remove all DTrace tracepoints from the child process. We
@@ -15561,7 +15760,7 @@ dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
                        dtrace_fasttrap_fork(parent_proc, child_proc);
                }
 
-               lck_mtx_unlock(&parent_proc->p_dtrace_sprlock);
+               dtrace_sprunlock(parent_proc);
 
                /*
                 * Duplicate any lazy dof(s). This must be done while NOT
@@ -15851,7 +16050,7 @@ dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
                ctl->mod_loaded = 1;
                ctl->mod_flags = 0;
                ctl->mod_user_symbols = NULL;
-               
+
                /*
                 * Find the UUID for this module, if it has one
                 */
@@ -15870,6 +16069,15 @@ dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
                
                if (ctl->mod_address == g_kernel_kmod_info.address) {
                        ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
+                       memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
+               }
+               /*
+                * Static kexts have a UUID that is not used for symbolication, as all their
+                * symbols are in kernel
+                */
+               else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
+                       memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
+                       ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
                }
        }
        dtrace_modctl_add(ctl);
@@ -16078,6 +16286,7 @@ syncloop:
                probe->dtpr_provider->dtpv_probe_count--;                                       
 
                next = probe->dtpr_nextmod;
+               dtrace_hash_remove(dtrace_byprov, probe);
                dtrace_hash_remove(dtrace_bymod, probe);
                dtrace_hash_remove(dtrace_byfunc, probe);
                dtrace_hash_remove(dtrace_byname, probe);
@@ -16103,9 +16312,9 @@ syncloop:
                prov = probe->dtpr_provider;
                prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
                    probe->dtpr_arg);
-               kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
-               kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
-               kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+               dtrace_strunref(probe->dtpr_mod);
+               dtrace_strunref(probe->dtpr_func);
+               dtrace_strunref(probe->dtpr_name);
                vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
 
                zfree(dtrace_probe_t_zone, probe);
@@ -16242,9 +16451,8 @@ dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
  */
 /*ARGSUSED*/
 static int
-dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+dtrace_attach(dev_info_t *devi)
 {
-#pragma unused(cmd) /* __APPLE__ */
        dtrace_provider_id_t id;
        dtrace_state_t *state = NULL;
        dtrace_enabling_t *enab;
@@ -16254,8 +16462,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
        lck_mtx_lock(&dtrace_lock);
 
        /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
-
-       ddi_report_dev(devi);
        dtrace_devi = devi;
 
        dtrace_modload = dtrace_module_loaded;
@@ -16274,8 +16480,6 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 
        dtrace_arena = vmem_create("dtrace", (void *)1, UINT32_MAX, 1,
            NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
-       dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
-           1, INT_MAX, 0);
 
        dtrace_state_cache = kmem_cache_create("dtrace_state_cache",
            sizeof (dtrace_dstate_percpu_t) * (int)NCPU, DTRACE_STATE_ALIGN,
@@ -16283,15 +16487,23 @@ dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 
        LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
 
-       dtrace_bymod = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_mod),
+       dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
+           0, /* unused */
+           offsetof(dtrace_probe_t, dtpr_nextprov),
+           offsetof(dtrace_probe_t, dtpr_prevprov));
+
+       dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
+           offsetof(dtrace_probe_t, dtpr_mod),
            offsetof(dtrace_probe_t, dtpr_nextmod),
            offsetof(dtrace_probe_t, dtpr_prevmod));
 
-       dtrace_byfunc = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_func),
+       dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
+           offsetof(dtrace_probe_t, dtpr_func),
            offsetof(dtrace_probe_t, dtpr_nextfunc),
            offsetof(dtrace_probe_t, dtpr_prevfunc));
 
-       dtrace_byname = dtrace_hash_create(offsetof(dtrace_probe_t, dtpr_name),
+       dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
+           offsetof(dtrace_probe_t, dtpr_name),
            offsetof(dtrace_probe_t, dtpr_nextname),
            offsetof(dtrace_probe_t, dtpr_prevname));
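Annotation: dtrace_hash_create() now takes a key-extraction function plus the byte offsets of the key and of the per-chain next/prev links, so one hash implementation can index dtrace_probe_t by provider, module, function, or name. A simplified illustration of that offset-driven, intrusive chaining follows; single chain, hypothetical names, not the xnu code.

#include <stddef.h>
#include <string.h>

/* Read a field of the given type at a byte offset inside an object. */
#define FIELD(obj, off, type)   (*(type *)((char *)(obj) + (off)))

struct ihash {
        size_t key_off;         /* offset of a (char *) key inside the element */
        size_t next_off;        /* offset of the element's next-pointer field  */
};

static void *
ihash_find(const struct ihash *h, void *head, const char *key)
{
        void *obj;

        for (obj = head; obj != NULL; obj = FIELD(obj, h->next_off, void *)) {
                if (strcmp(FIELD(obj, h->key_off, char *), key) == 0)
                        return obj;
        }
        return NULL;
}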
 
@@ -16443,6 +16655,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
        dtrace_opens++;
        dtrace_membar_producer();
 
+#ifdef illumos
        /*
         * If the kernel debugger is active (that is, if the kernel debugger
         * modified text in some way), we won't allow the open.
@@ -16453,13 +16666,17 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
                lck_mtx_unlock(&cpu_lock);
                return (EBUSY);
        }
+#endif
 
        rv = dtrace_state_create(devp, cred_p, &state);
        lck_mtx_unlock(&cpu_lock);
 
        if (rv != 0 || state == NULL) {
-               if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
+               if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
+#ifdef illumos
                        (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
+#endif
+               }
                lck_mtx_unlock(&dtrace_lock);
                /* propagate EAGAIN or ERESTART */
                return (rv);
@@ -16557,9 +16774,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
         * Only relinquish control of the kernel debugger interface when there
         * are no consumers and no anonymous enablings.
         */
-       if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
+       if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
+#ifdef illumos
                (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
-       
+#endif
+       }
+
        lck_mtx_unlock(&dtrace_lock);
        lck_mtx_unlock(&cpu_lock);
 
@@ -16700,6 +16920,7 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
                                        dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
                                        
                                        if (dof != NULL) {                                      
+                                               lck_mtx_lock(&dtrace_meta_lock);
                                                lck_mtx_lock(&dtrace_lock);
                                                
                                                /*
@@ -16711,6 +16932,7 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
                                                }
                                                
                                                lck_mtx_unlock(&dtrace_lock);
+                                               lck_mtx_unlock(&dtrace_meta_lock);
                                        }
                                } while (++i < multi_dof->dofiod_count && rval == 0);
                        }
@@ -16751,9 +16973,11 @@ dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
                         * EACCES means non-lazy
                         */
                        if (rval == EACCES) {
+                               lck_mtx_lock(&dtrace_meta_lock);
                                lck_mtx_lock(&dtrace_lock);
                                rval = dtrace_helper_destroygen(p, generation);
                                lck_mtx_unlock(&dtrace_lock);
+                               lck_mtx_unlock(&dtrace_meta_lock);
                        }
 
                        return (rval);
@@ -17106,17 +17330,15 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                        desc.dtpd_id++;
                }
 
-               if (cmd == DTRACEIOC_PROBEMATCH)  {
-                       dtrace_probekey(&desc, &pkey);
-                       pkey.dtpk_id = DTRACE_IDNONE;
-               }
-
                dtrace_cred2priv(cr, &priv, &uid, &zoneid);
 
                lck_mtx_lock(&dtrace_lock);
 
-               if (cmd == DTRACEIOC_PROBEMATCH) {
-                        /* Quiet compiler warning */
+               if (cmd == DTRACEIOC_PROBEMATCH)  {
+                       dtrace_probekey(&desc, &pkey);
+                       pkey.dtpk_id = DTRACE_IDNONE;
+
+                       /* Quiet compiler warning */
                        for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
                                if ((probe = dtrace_probes[i - 1]) != NULL &&
                                        (m = dtrace_match_probe(probe, &pkey,
@@ -17128,6 +17350,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                                lck_mtx_unlock(&dtrace_lock);
                                return (EINVAL);
                        }
+                       dtrace_probekey_release(&pkey);
 
                } else {
                         /* Quiet compiler warning */
@@ -17639,7 +17862,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                                        ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
 
                                ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
-                               if (!MOD_SYMBOLS_DONE(ctl)) {
+                               if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
                                        dtmul_count++;
                                        rval = EINVAL;
                                }
@@ -17695,7 +17918,7 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                         * are available, add user syms if the module might use them.
                         */
                        ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
-                       if (!MOD_SYMBOLS_DONE(ctl)) {
+                       if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
                                UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
                                if (dtmul_count++ < uuids_list->dtmul_count) {
                                        memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
@@ -17811,32 +18034,24 @@ dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv
                                ctl->mod_flags |= MODCTL_FBT_PROVIDE_PRIVATE_PROBES;
 
                        ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
-                       if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl)) {
-                               if (memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
-                                       /* BINGO! */
-                                       ctl->mod_user_symbols = module_symbols;
-                                       break;
-                               }
+                       if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
+                               dtrace_provider_t *prv;
+                               ctl->mod_user_symbols = module_symbols;
+
+                               /*
+                                * We're going to call each providers per-module provide operation
+                                * specifying only this module.
+                                */
+                               for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
+                                       prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
+                               /*
+                                * We gave every provider a chance to provide with the user syms, go ahead and clear them
+                                */
+                               ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
                        }
                        ctl = ctl->mod_next;
                }
 
-               if (ctl) {
-                       dtrace_provider_t *prv;
-
-                       /*
-                        * We're going to call each providers per-module provide operation
-                        * specifying only this module.
-                        */
-                       for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
-                               prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl); 
-                                               
-                       /*
-                        * We gave every provider a chance to provide with the user syms, go ahead and clear them
-                        */
-                       ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
-               }
-               
                lck_mtx_unlock(&mod_lock);
                lck_mtx_unlock(&dtrace_provider_lock);
 
@@ -17972,9 +18187,13 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
        dtrace_probes = NULL;
        dtrace_nprobes = 0;
 
+       dtrace_hash_destroy(dtrace_strings);
+       dtrace_hash_destroy(dtrace_byprov);
        dtrace_hash_destroy(dtrace_bymod);
        dtrace_hash_destroy(dtrace_byfunc);
        dtrace_hash_destroy(dtrace_byname);
+       dtrace_strings = NULL;
+       dtrace_byprov = NULL;
        dtrace_bymod = NULL;
        dtrace_byfunc = NULL;
        dtrace_byname = NULL;
@@ -18002,6 +18221,7 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
        lck_mtx_unlock(&dtrace_lock);
        lck_mtx_unlock(&dtrace_provider_lock);
 
+#ifdef illumos
        /*
         * We don't destroy the task queue until after we have dropped our
         * locks (taskq_destroy() may block on running tasks).  To prevent
@@ -18012,6 +18232,7 @@ dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
         */
        taskq_destroy(dtrace_taskq);
        dtrace_taskq = NULL;
+#endif
 
        return (DDI_SUCCESS);
 }
@@ -18223,6 +18444,19 @@ lck_grp_t* dtrace_lck_grp;
 
 static int gMajDevNo;
 
+void dtrace_early_init (void)
+{
+       dtrace_restriction_policy_load();
+
+       /*
+        * See dtrace_impl.h for a description of kernel symbol modes.
+        * The default is to wait for symbols from userspace (lazy symbols).
+        */
+       if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
+               dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
+       }
+}
+
 void
 dtrace_init( void )
 {
@@ -18274,13 +18508,6 @@ dtrace_init( void )
                        return;
                }
 
-#if defined(DTRACE_MEMORY_ZONES)
-               /*
-                * Initialize the dtrace kalloc-emulation zones.
-                */
-               dtrace_alloc_init();
-#endif /* DTRACE_MEMORY_ZONES */
-
                /*
                 * Allocate the dtrace_probe_t zone
                 */
@@ -18347,6 +18574,11 @@ dtrace_init( void )
 
                (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
 
+               dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
+                   offsetof(dtrace_string_t, dtst_str),
+                   offsetof(dtrace_string_t, dtst_next),
+                   offsetof(dtrace_string_t, dtst_prev));
+
                dtrace_isa_init();
                /*
                 * See dtrace_impl.h for a description of dof modes.
@@ -18386,16 +18618,6 @@ dtrace_init( void )
                                break;
                }
 
-               /*
-                * See dtrace_impl.h for a description of kernel symbol modes.
-                * The default is to wait for symbols from userspace (lazy symbols).
-                */
-               if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
-                       dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
-               }
-
-               dtrace_restriction_policy_load();
-
                gDTraceInited = 1;
 
        } else
@@ -18410,7 +18632,7 @@ dtrace_postinit(void)
         * run. That way, anonymous DOF enabled under dtrace_attach() is safe
         * to go.
         */
-       dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0), 0 ); /* Punning a dev_t to a dev_info_t* */
+       dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
        
        /*
         * Add the mach_kernel to the module list for lazy processing
diff --git a/bsd/dev/dtrace/dtrace_alloc.c b/bsd/dev/dtrace/dtrace_alloc.c
[The per-file header for this deleted file appears to have lost its "--- a/bsd/dev/dtrace/dtrace_alloc.c" source line in extraction; the hunk below removes the whole file.]
deleted file mode 100644 (file)
index e43ca8c..0000000
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2005-2007 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
- * DTrace kalloc emulation.
- *
- * This is a subset of kalloc functionality, to allow dtrace
- * specific allocation to be accounted for separately from the
- * general kalloc pool.
- *
- * Note that allocations greater than dalloc_max still go into
- * the kalloc.large bucket, as it seems impossible to emulate
- * that functionality in the bsd kern.
- */
-
-#include <stdarg.h>
-#include <string.h>
-#include <sys/malloc.h>
-#include <sys/dtrace.h>
-#include <kern/zalloc.h>
-
-#if defined(DTRACE_MEMORY_ZONES)
-
-#define DTRACE_ALLOC_MINSIZE 16
-
-vm_size_t dtrace_alloc_max;
-vm_size_t dtrace_alloc_max_prerounded;
-int first_d_zone = -1;
-struct zone *d_zone[16];
-static const char *d_zone_name[16] = {
-       "dtrace.1",             "dtrace.2",
-       "dtrace.4",             "dtrace.8",
-       "dtrace.16",            "dtrace.32",
-       "dtrace.64",            "dtrace.128",
-       "dtrace.256",           "dtrace.512",
-       "dtrace.1024",          "dtrace.2048",
-       "dtrace.4096",          "dtrace.8192",
-       "dtrace.16384",         "dtrace.32768"
-};
-
-unsigned long d_zone_max[16] = {
-      1024,            /*      1 Byte  */
-      1024,            /*      2 Byte  */
-      1024,            /*      4 Byte  */
-      1024,            /*      8 Byte  */
-      1024,            /*     16 Byte  */
-      4096,            /*     32 Byte  */
-      4096,            /*     64 Byte  */
-      4096,            /*    128 Byte  */
-      4096,            /*    256 Byte  */
-      1024,            /*    512 Byte  */
-      1024,            /*   1024 Byte  */
-      1024,            /*   2048 Byte  */
-      1024,            /*   4096 Byte  */
-      4096,            /*   8192 Byte  */
-      64,              /*  16384 Byte  */
-      64,              /*  32768 Byte  */
-};
-
-void dtrace_alloc_init(void)
-{
-       vm_size_t size;
-       int i;
-
-       if (PAGE_SIZE < 16*1024)
-               dtrace_alloc_max = 16*1024;
-       else
-               dtrace_alloc_max = PAGE_SIZE;
-       dtrace_alloc_max_prerounded = dtrace_alloc_max / 2 + 1;
-
-       /*
-        *      Allocate a zone for each size we are going to handle.
-        *      We specify non-paged memory.
-        */
-       for (i = 0, size = 1; size < dtrace_alloc_max; i++, size <<= 1) {
-               if (size < DTRACE_ALLOC_MINSIZE) {
-                       d_zone[i] = NULL;
-                       continue;
-               }
-               if (size == DTRACE_ALLOC_MINSIZE) {
-                       first_d_zone = i;
-               }
-               d_zone[i] = zinit(size, d_zone_max[i] * size, size, d_zone_name[i]);
-       }
-}
-
-void *dtrace_alloc(vm_size_t size)
-{
-       int zindex;
-       vm_size_t allocsize;
-
-       /*
-        * If size is too large for a zone, then use kmem_alloc.
-        * (We use kmem_alloc instead of kmem_alloc_kobject so that
-        * krealloc can use kmem_realloc.)
-        */
-
-       if (size >= dtrace_alloc_max_prerounded) {
-               return _MALLOC(size, M_TEMP, M_WAITOK);
-       }
-
-       /* compute the size of the block that we will actually allocate */
-       allocsize = DTRACE_ALLOC_MINSIZE;
-       zindex = first_d_zone;
-       while (allocsize < size) {
-               allocsize <<= 1;
-               zindex++;
-       }
-
-       return(zalloc_canblock(d_zone[zindex], TRUE));
-}
-
-void dtrace_free(void *data, vm_size_t size)
-{
-       int zindex;
-       vm_size_t freesize;
-
-       if (size >= dtrace_alloc_max_prerounded) {
-               _FREE(data, M_TEMP);
-               return;
-       }
-
-       /* compute the size of the block that we actually allocated from */
-       freesize = DTRACE_ALLOC_MINSIZE;
-       zindex = first_d_zone;
-       while (freesize < size) {
-               freesize <<= 1;
-               zindex++;
-       }
-
-       /* free to the appropriate zone */
-       zfree(d_zone[zindex], data);
-}
-
-#endif /* DTRACE_MEMORY_ZONES */
diff --git a/bsd/dev/dtrace/dtrace_glue.c b/bsd/dev/dtrace/dtrace_glue.c
index d47102fe604c78e195e39917aadc47d8612f39b6..bfda934bc2107e7d1d127fc8d197ba3340b82373 100644 (file)
@@ -51,6 +51,7 @@
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <libkern/OSAtomic.h>
+#include <libkern/OSKextLibPrivate.h>
 #include <kern/kern_types.h>
 #include <kern/timer_call.h>
 #include <kern/thread_call.h>
 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
 
+void
+dtrace_sprlock(proc_t *p)
+{
+       lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       lck_mtx_lock(&p->p_dtrace_sprlock);
+}
+
+void
+dtrace_sprunlock(proc_t *p)
+{
+       lck_mtx_unlock(&p->p_dtrace_sprlock);
+
+}
+
 /* Not called from probe context */
-proc_t * 
+proc_t *
 sprlock(pid_t pid)
 {
        proc_t* p;
@@ -84,9 +99,9 @@ sprlock(pid_t pid)
 
        task_suspend_internal(p->task);
 
-       proc_lock(p);
+       dtrace_sprlock(p);
 
-       lck_mtx_lock(&p->p_dtrace_sprlock);
+       proc_lock(p);
 
        return p;
 }
@@ -96,10 +111,10 @@ void
 sprunlock(proc_t *p)
 {
        if (p != PROC_NULL) {
-               lck_mtx_unlock(&p->p_dtrace_sprlock);
-
                proc_unlock(p);
 
+               dtrace_sprunlock(p);
+
                task_resume_internal(p->task);
 
                proc_rele(p);
@@ -184,7 +199,7 @@ uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a)
 
                        if (info.max_protection & VM_PROT_WRITE) {
                                /* The memory is not currently writable, but can be made writable. */
-                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, reprotect | VM_PROT_WRITE);
+                               ret = mach_vm_protect (map, (mach_vm_offset_t)a, (mach_vm_size_t)len, 0, (reprotect & ~VM_PROT_EXECUTE) | VM_PROT_WRITE);
                        } else {
                                /*
                                 * The memory is not currently writable, and cannot be made writable. We need to COW this memory.
@@ -207,6 +222,8 @@ uwrite(proc_t *p, void *buf, user_size_t len, user_addr_t a)
                                         (vm_map_address_t)a,
                                         (vm_size_t)len);
 
+               dtrace_flush_caches();
+
                if (ret != KERN_SUCCESS)
                        goto done;
 
@@ -270,10 +287,6 @@ PRIV_POLICY_ONLY(void *cr, int priv, int boolean)
        return kauth_cred_issuser(cr); /* XXX TODO: HAS_PRIVILEGE(cr, priv); */
 }
 
-/* XXX Get around const poisoning using structure assigns */
-gid_t
-crgetgid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getgid(&copy_cr); }
-
 uid_t
 crgetuid(const cred_t *cr) { cred_t copy_cr = *cr; return kauth_cred_getuid(&copy_cr); }
 
@@ -577,15 +590,6 @@ cyclic_remove(cyclic_id_t cyclic)
        }
 }
 
-/*
- * ddi
- */
-void
-ddi_report_dev(dev_info_t *devi)
-{
-#pragma unused(devi)
-}
-
 kern_return_t _dtrace_register_anon_DOF(char *, uchar_t *, uint_t);
 
 kern_return_t
@@ -630,29 +634,6 @@ getminor ( dev_t d )
        return (minor_t) minor(d);
 }
 
-dev_t 
-makedevice(major_t major, minor_t minor)
-{
-       return makedev( major, minor );
-}
-
-int ddi_getprop(dev_t dev, dev_info_t *dip, int flags, const char *name, int defvalue)
-{
-#pragma unused(dev, dip, flags, name)
-
-       return defvalue;
-}
-
-/*
- * Kernel Debug Interface
- */
-int
-kdi_dtrace_set(kdi_dtrace_set_t ignore)
-{
-#pragma unused(ignore)
-       return 0; /* Success */
-}
-
 extern void Debugger(const char*);
 
 void
@@ -663,7 +644,7 @@ debug_enter(char *c) { Debugger(c); }
  */
 
 void *
-dt_kmem_alloc(size_t size, int kmflag)
+dt_kmem_alloc_site(size_t size, int kmflag, vm_allocation_site_t *site)
 {
 #pragma unused(kmflag)
 
@@ -671,15 +652,12 @@ dt_kmem_alloc(size_t size, int kmflag)
  * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact).
  * Requests larger than 8K with M_NOWAIT fail in kalloc_canblock.
  */
-#if defined(DTRACE_MEMORY_ZONES)
-       return dtrace_alloc(size);
-#else
-       return kalloc(size);
-#endif
+       vm_size_t vsize = size;
+       return kalloc_canblock(&vsize, TRUE, site);
 }
 
 void *
-dt_kmem_zalloc(size_t size, int kmflag)
+dt_kmem_zalloc_site(size_t size, int kmflag, vm_allocation_site_t *site)
 {
 #pragma unused(kmflag)
 
@@ -687,11 +665,8 @@ dt_kmem_zalloc(size_t size, int kmflag)
  * We ignore the M_NOWAIT bit in kmflag (all of kmflag, in fact).
  * Requests larger than 8K with M_NOWAIT fail in kalloc_canblock.
  */
-#if defined(DTRACE_MEMORY_ZONES)
-       void* buf = dtrace_alloc(size);
-#else
-       void* buf = kalloc(size);
-#endif
+       vm_size_t vsize = size;
+       void* buf = kalloc_canblock(&vsize, TRUE, site);
 
        if(!buf)
                return NULL;
@@ -713,21 +688,18 @@ dt_kmem_free(void *buf, size_t size)
 
        ASSERT(size > 0);
 
-#if defined(DTRACE_MEMORY_ZONES)
-       dtrace_free(buf, size);
-#else
        kfree(buf, size);
-#endif
 }
 
 
 
 /*
- * aligned kmem allocator
+ * aligned dt_kmem allocator
  * align should be a power of two
  */
 
-void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag)
+void*
+dt_kmem_alloc_aligned_site(size_t size, size_t align, int kmflag, vm_allocation_site_t *site)
 {
        void *mem, **addr_to_free;
        intptr_t mem_aligned;
@@ -742,7 +714,7 @@ void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag)
         * the address to free and the total size of the buffer.
         */
        hdr_size = sizeof(size_t) + sizeof(void*);
-       mem = dt_kmem_alloc(size + align + hdr_size, kmflag);
+       mem = dt_kmem_alloc_site(size + align + hdr_size, kmflag, site);
        if (mem == NULL)
                return NULL;
 
@@ -759,11 +731,12 @@ void* dt_kmem_alloc_aligned(size_t size, size_t align, int kmflag)
        return (void*) mem_aligned;
 }
 
-void* dt_kmem_zalloc_aligned(size_t size, size_t align, int kmflag)
+void*
+dt_kmem_zalloc_aligned_site(size_t size, size_t align, int kmflag, vm_allocation_site_t *s)
 {
        void* buf;
 
-       buf = dt_kmem_alloc_aligned(size, align, kmflag);
+       buf = dt_kmem_alloc_aligned_site(size, align, kmflag, s);
 
        if(!buf)
                return NULL;
@@ -773,7 +746,8 @@ void* dt_kmem_zalloc_aligned(size_t size, size_t align, int kmflag)
        return buf;
 }
 
-void dt_kmem_free_aligned(void* buf, size_t size)
+void
+dt_kmem_free_aligned(void* buf, size_t size)
 {
 #pragma unused(size)
        intptr_t ptr = (intptr_t) buf;
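Annotation: dt_kmem_alloc_aligned_site() keeps the same strategy as the function it renames: over-allocate by align plus a small header, round up to the requested alignment, and stash the raw pointer (and total size) just below the address handed back so dt_kmem_free_aligned() can recover them. A user-space sketch of that bookkeeping, with malloc/free standing in for the kernel allocator and only the pointer stored for brevity; align is assumed to be a power of two, as the comment above requires.

#include <stdint.h>
#include <stdlib.h>

/* Over-allocate, align, and remember the raw pointer just below the
 * aligned block so it can be freed later.  Sketch only: the kernel code
 * also records the total size in the hidden header. */
static void *
alloc_aligned(size_t size, size_t align)
{
        size_t    hdr = sizeof (void *);
        char     *raw = malloc(size + align + hdr);
        uintptr_t aligned;

        if (raw == NULL)
                return NULL;
        aligned = ((uintptr_t)raw + hdr + align - 1) & ~((uintptr_t)align - 1);
        ((void **)aligned)[-1] = raw;           /* hidden header */
        return (void *)aligned;
}

static void
free_aligned(void *buf)
{
        if (buf != NULL)
                free(((void **)buf)[-1]);
}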
@@ -829,44 +803,6 @@ kmem_cache_destroy(kmem_cache_t *cp)
 #pragma unused(cp)
 }
 
-/*
- * taskq
- */
-extern void thread_call_setup(thread_call_t, thread_call_func_t, thread_call_param_t); /* XXX MACH_KERNEL_PRIVATE */
-
-static void
-_taskq_apply( task_func_t func, thread_call_param_t arg )
-{
-       func( (void *)arg );
-}
-
-taskq_t *
-taskq_create(const char *name, int nthreads, pri_t pri, int minalloc,
-    int maxalloc, uint_t flags)
-{
-#pragma unused(name,nthreads,pri,minalloc,maxalloc,flags)
-
-       return (taskq_t *)thread_call_allocate( (thread_call_func_t)_taskq_apply, NULL );
-}
-
-taskqid_t
-taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
-{
-#pragma unused(flags)
-       thread_call_setup( (thread_call_t) tq, (thread_call_func_t)_taskq_apply, (thread_call_param_t)func );
-       thread_call_enter1( (thread_call_t) tq, (thread_call_param_t)arg );
-       return (taskqid_t) tq /* for lack of anything better */;
-}
-
-void   
-taskq_destroy(taskq_t *tq)
-{
-       thread_call_cancel( (thread_call_t) tq );
-       thread_call_free( (thread_call_t) tq );
-}
-
-pri_t maxclsyspri;
-
 /*
  * vmem (Solaris "slab" allocator) used by DTrace solely to hand out resource ids
  */
@@ -1182,19 +1118,26 @@ dtrace_copyoutstr(uintptr_t src, user_addr_t dst, size_t len, volatile uint16_t
 
 extern const int copysize_limit_panic;
 
+int dtrace_copy_maxsize(void)
+{
+       return copysize_limit_panic;
+}
+
+
 int
 dtrace_buffer_copyout(const void *kaddr, user_addr_t uaddr, vm_size_t nbytes)
 {
+       int maxsize = dtrace_copy_maxsize();
        /*
         * Partition the copyout in copysize_limit_panic-sized chunks
         */
-       while (nbytes >= (vm_size_t)copysize_limit_panic) {
-               if (copyout(kaddr, uaddr, copysize_limit_panic) != 0)
+       while (nbytes >= (vm_size_t)maxsize) {
+               if (copyout(kaddr, uaddr, maxsize) != 0)
                        return (EFAULT);
 
-               nbytes -= copysize_limit_panic;
-               uaddr += copysize_limit_panic;
-               kaddr += copysize_limit_panic;
+               nbytes -= maxsize;
+               uaddr += maxsize;
+               kaddr += maxsize;
        }
        if (nbytes > 0) {
                if (copyout(kaddr, uaddr, nbytes) != 0)
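Annotation: dtrace_buffer_copyout() above now reads the chunk limit through the new dtrace_copy_maxsize() accessor but keeps the same partitioning, full maxsize chunks first and then one short tail copy. A self-contained rendering of that loop, with memcpy standing in for copyout() and the limit passed as a parameter; the real limit, copysize_limit_panic, is defined elsewhere in the kernel and assumed non-zero here.

#include <stddef.h>
#include <string.h>

/* Copy nbytes in chunks no larger than maxsize, then copy the remainder.
 * Sketch of the partitioning above; memcpy replaces copyout() so this can
 * run outside the kernel.  maxsize must be non-zero. */
static void
copy_chunked(char *dst, const char *src, size_t nbytes, size_t maxsize)
{
        while (nbytes >= maxsize) {
                memcpy(dst, src, maxsize);
                dst += maxsize;
                src += maxsize;
                nbytes -= maxsize;
        }
        if (nbytes > 0)
                memcpy(dst, src, nbytes);
}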
@@ -1321,22 +1264,6 @@ fuword64(user_addr_t uaddr, uint64_t *value)
        return 0;
 }
 
-void
-fuword8_noerr(user_addr_t uaddr, uint8_t *value)
-{
-       if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint8_t))) {
-               *value = 0;
-       }
-}
-
-void
-fuword16_noerr(user_addr_t uaddr, uint16_t *value)
-{
-       if (copyin((const user_addr_t)uaddr, (char *)value, sizeof(uint16_t))) {
-               *value = 0;
-       }
-}
-
 void
 fuword32_noerr(user_addr_t uaddr, uint32_t *value)
 {
@@ -1373,27 +1300,6 @@ suword32(user_addr_t addr, uint32_t value)
        return 0;
 }
 
-int
-suword16(user_addr_t addr, uint16_t value)
-{
-       if (copyout((const void *)&value, addr, sizeof(value)) != 0) {
-               return -1;
-       }
-
-       return 0;
-}
-
-int
-suword8(user_addr_t addr, uint8_t value)
-{
-       if (copyout((const void *)&value, addr, sizeof(value)) != 0) {
-               return -1;
-       }
-
-       return 0;
-}
-
-
 /*
  * Miscellaneous
  */
@@ -1537,6 +1443,12 @@ dtrace_getstackdepth(int aframes)
        return (depth - aframes);
 }
 
+int
+dtrace_addr_in_module(void* addr, struct modctl *ctl)
+{
+       return OSKextKextForAddress(addr) == (void*)ctl->mod_address;
+}
+
 /*
  * Unconsidered
  */
diff --git a/bsd/dev/dtrace/dtrace_ptss.c b/bsd/dev/dtrace/dtrace_ptss.c
index 1ce9c28d8707a6702c5d868405562f046f2dccb9..6741f5563410f1d3cc7203e94d5aea1fc23343fe 100644 (file)
@@ -167,7 +167,6 @@ dtrace_ptss_allocate_page(struct proc* p)
 
        mach_vm_size_t size = PAGE_MAX_SIZE;
        mach_vm_offset_t addr = 0;
-#if CONFIG_EMBEDDED
        mach_vm_offset_t write_addr = 0;
        /* 
         * The embedded OS has extra permissions for writable and executable pages.
@@ -175,16 +174,11 @@ dtrace_ptss_allocate_page(struct proc* p)
         */
        vm_prot_t cur_protection = VM_PROT_READ|VM_PROT_EXECUTE;
        vm_prot_t max_protection = VM_PROT_READ|VM_PROT_EXECUTE|VM_PROT_WRITE;
-#else
-       vm_prot_t cur_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE;
-       vm_prot_t max_protection = VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE;
-#endif /* CONFIG_EMBEDDED */
 
-       kern_return_t kr = mach_vm_map_kernel(map, &addr, size, 0, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE, IPC_PORT_NULL, 0, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT);
+       kern_return_t kr = mach_vm_map_kernel(map, &addr, size, 0, VM_FLAGS_ANYWHERE, VM_MAP_KERNEL_FLAGS_NONE, VM_KERN_MEMORY_NONE, IPC_PORT_NULL, 0, FALSE, cur_protection, max_protection, VM_INHERIT_DEFAULT);
        if (kr != KERN_SUCCESS) {
                goto err;
        }
-#if CONFIG_EMBEDDED
        /*
         * If on embedded, remap the scratch space as writable at another
         * virtual address
@@ -196,14 +190,12 @@ dtrace_ptss_allocate_page(struct proc* p)
        kr = mach_vm_protect (map, (mach_vm_offset_t)write_addr, (mach_vm_size_t)size, 0, VM_PROT_READ | VM_PROT_WRITE);
        if (kr != KERN_SUCCESS)
                goto err;
-#endif
+
        // Chain the page entries.
        int i;
        for (i=0; i<DTRACE_PTSS_ENTRIES_PER_PAGE; i++) {
                ptss_page->entries[i].addr = addr + (i * DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD);
-#if CONFIG_EMBEDDED
                ptss_page->entries[i].write_addr = write_addr + (i * DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD);
-#endif
                ptss_page->entries[i].next = &ptss_page->entries[i+1];
        }
 
@@ -243,10 +235,8 @@ dtrace_ptss_free_page(struct proc* p, struct dtrace_ptss_page* ptss_page)
        // Silent failures, no point in checking return code.
        mach_vm_deallocate(map, addr, size);
 
-#ifdef CONFIG_EMBEDDED
        mach_vm_address_t write_addr = ptss_page->entries[0].write_addr;
        mach_vm_deallocate(map, write_addr, size);
-#endif
 
        vm_map_deallocate(map);
 }
diff --git a/bsd/dev/dtrace/fasttrap.c b/bsd/dev/dtrace/fasttrap.c
index 8425b98af29af8a6e7af711e5d3c1c0dc3bb5404..9ce7ccc9cd1e16847cffc20e6356f881b460553d 100644 (file)
@@ -46,6 +46,8 @@
 #include <sys/dtrace_impl.h>
 #include <sys/proc.h>
 
+#include <security/mac_framework.h>
+
 #include <miscfs/devfs/devfs.h>
 #include <sys/proc_internal.h>
 #include <sys/dtrace_glue.h>
@@ -143,7 +145,6 @@ qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
  *     never hold the provider lock and creation lock simultaneously
  */
 
-static dev_info_t *fasttrap_devi;
 static dtrace_meta_provider_id_t fasttrap_meta_id;
 
 static thread_t fasttrap_cleanup_thread;
@@ -401,7 +402,6 @@ fasttrap_pid_cleanup_providers(void)
        return later;
 }
 
-#ifdef FASTTRAP_ASYNC_REMOVE
 typedef struct fasttrap_tracepoint_spec {
        pid_t fttps_pid;
        user_addr_t fttps_pc;
@@ -473,13 +473,13 @@ fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp)
        s->fttps_pc = tp->ftt_pc;
 
        if (fasttrap_cur_retired == fasttrap_retired_size) {
-               fasttrap_retired_size *= 2;
                fasttrap_tracepoint_spec_t *new_retired = kmem_zalloc(
-                                       fasttrap_retired_size *
-                                       sizeof(fasttrap_tracepoint_t*),
+                                       fasttrap_retired_size * 2 *
+                                       sizeof(*fasttrap_retired_spec),
                                        KM_SLEEP);
-               memcpy(new_retired, fasttrap_retired_spec, sizeof(fasttrap_tracepoint_t*) * fasttrap_retired_size);
-               kmem_free(fasttrap_retired_spec, sizeof(fasttrap_tracepoint_t*) * (fasttrap_retired_size / 2));
+               memcpy(new_retired, fasttrap_retired_spec, sizeof(*fasttrap_retired_spec) * fasttrap_retired_size);
+               kmem_free(fasttrap_retired_spec, sizeof(*fasttrap_retired_spec) * fasttrap_retired_size);
+               fasttrap_retired_size *= 2;
                fasttrap_retired_spec = new_retired;
        }
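Annotation: the old retirement path doubled fasttrap_retired_size before it allocated, copied, and freed, so the copy and free used the wrong byte counts and sized elements as pointers rather than specs; the fix above derives every size from sizeof(*fasttrap_retired_spec) and the old count, and doubles the count only after the new array is in place. A generic sketch of that safer grow-by-doubling pattern, malloc-based and with hypothetical names:

#include <stdlib.h>
#include <string.h>

/* Double the capacity of an array of fixed-size elements.  All sizes are
 * derived from the element size and the *old* count; the count is updated
 * only once the copy has succeeded. */
static int
grow_array(void **arrayp, size_t *countp, size_t elem_size)
{
        size_t  old_count = *countp;
        void   *bigger = calloc(old_count * 2, elem_size);

        if (bigger == NULL)
                return -1;
        memcpy(bigger, *arrayp, old_count * elem_size);
        free(*arrayp);
        *arrayp = bigger;
        *countp = old_count * 2;
        return 0;
}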
 
@@ -489,15 +489,6 @@ fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp)
 
        fasttrap_pid_cleanup(FASTTRAP_CLEANUP_TRACEPOINT);
 }
-#else
-void fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp)
-{
-       if (tp->ftt_retired)
-               return;
-
-       fasttrap_tracepoint_remove(p, tp);
-}
-#endif
 
 static void
 fasttrap_pid_cleanup_compute_priority(void)
@@ -533,11 +524,9 @@ fasttrap_pid_cleanup_cb(void)
                if (work & FASTTRAP_CLEANUP_PROVIDER) {
                        later = fasttrap_pid_cleanup_providers();
                }
-#ifdef FASTTRAP_ASYNC_REMOVE
                if (work & FASTTRAP_CLEANUP_TRACEPOINT) {
                        fasttrap_tracepoint_cleanup();
                }
-#endif
                lck_mtx_lock(&fasttrap_cleanup_mtx);
 
                fasttrap_pid_cleanup_compute_priority();
@@ -1162,6 +1151,25 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
            return(0);
        }
 
+       if ((p->p_csflags & (CS_KILL|CS_HARD))) {
+               proc_unlock(p);
+               for (i = 0; i < DTRACE_NCLIENTS; i++) {
+                       dtrace_state_t *state = dtrace_state_get(i);
+                       if (state == NULL)
+                               continue;
+                       if (state->dts_cred.dcr_cred == NULL)
+                               continue;
+                       mac_proc_check_get_task(state->dts_cred.dcr_cred, p);
+               }
+               rc = cs_allow_invalid(p);
+               proc_lock(p);
+               if (rc == 0) {
+                       sprunlock(p);
+                       cmn_err(CE_WARN, "process doesn't allow invalid code pages, failing to install fasttrap probe\n");
+                       return (0);
+               }
+       }
+
        /*
         * APPLE NOTE: We do not have an equivalent thread structure to Solaris.
         * Solaris uses its ulwp_t struct for scratch space to support the pid provider.
@@ -1380,29 +1388,29 @@ static const dtrace_pattr_t pid_attr = {
 };
 
 static dtrace_pops_t pid_pops = {
-       fasttrap_pid_provide,
-       NULL,
-       fasttrap_pid_enable,
-       fasttrap_pid_disable,
-       NULL,
-       NULL,
-       fasttrap_pid_getargdesc,
-       fasttrap_pid_getarg,
-       NULL,
-       fasttrap_pid_destroy
+       .dtps_provide =         fasttrap_pid_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          fasttrap_pid_enable,
+       .dtps_disable =         fasttrap_pid_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      fasttrap_pid_getargdesc,
+       .dtps_getargval =       fasttrap_pid_getarg,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         fasttrap_pid_destroy
 };
 
 static dtrace_pops_t usdt_pops = {
-       fasttrap_pid_provide,
-       NULL,
-       fasttrap_pid_enable,
-       fasttrap_pid_disable,
-       NULL,
-       NULL,
-       fasttrap_pid_getargdesc,
-       fasttrap_usdt_getarg,
-       NULL,
-       fasttrap_pid_destroy
+       .dtps_provide =         fasttrap_pid_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          fasttrap_pid_enable,
+       .dtps_disable =         fasttrap_pid_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      fasttrap_pid_getargdesc,
+       .dtps_getargval =       fasttrap_usdt_getarg,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         fasttrap_pid_destroy
 };
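Annotation: the pid_pops and usdt_pops tables above move from positional to designated initializers, which ties each function pointer to the dtrace_pops_t field it fills and leaves any unnamed field zero or NULL even if the structure gains or reorders members. A toy example of the same idiom with a hypothetical ops structure:

/* Hypothetical ops table illustrating designated initializers: fields are
 * named explicitly and anything omitted (here .reset) is zero-initialized. */
struct example_ops {
        void    (*start)(void);
        void    (*stop)(void);
        void    (*reset)(void);
};

static void example_noop(void) { }

static struct example_ops example_ops = {
        .start = example_noop,
        .stop  = example_noop,
        /* .reset intentionally left NULL */
};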
 
 static fasttrap_proc_t *
@@ -1593,10 +1601,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons
         * APPLE NOTE:  We have no equivalent to crhold,
         * even though there is a cr_ref filed in ucred.
         */
-       // lck_mtx_lock(&p->p_crlock;
-       crhold(p->p_ucred);
-       cred = p->p_ucred;
-       // lck_mtx_unlock(&p->p_crlock);
+       cred = kauth_cred_proc_ref(p);
        proc_unlock(p);
 
        new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
@@ -1625,7 +1630,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons
                        lck_mtx_lock(&fp->ftp_mtx);
                        lck_mtx_unlock(&bucket->ftb_mtx);
                        fasttrap_provider_free(new_fp);
-                       crfree(cred);
+                       kauth_cred_unref(&cred);
                        return (fp);
                }
        }
@@ -1647,7 +1652,7 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons
            &new_fp->ftp_provid) != 0) {
                lck_mtx_unlock(&bucket->ftb_mtx);
                fasttrap_provider_free(new_fp);
-               crfree(cred);
+               kauth_cred_unref(&cred);
                return (NULL);
        }
 
@@ -1657,7 +1662,8 @@ fasttrap_provider_lookup(proc_t *p, fasttrap_provider_type_t provider_type, cons
        lck_mtx_lock(&new_fp->ftp_mtx);
        lck_mtx_unlock(&bucket->ftb_mtx);
 
-       crfree(cred);
+       kauth_cred_unref(&cred);
+
        return (new_fp);
 }
 
@@ -1850,16 +1856,6 @@ fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
        if (p == PROC_NULL)
                return (ESRCH);
 
-       /*
-        * Set that the process is allowed to run modified code and
-        * bail if it is not allowed to
-        */
-#if CONFIG_EMBEDDED
-       if ((p->p_csflags & (CS_KILL|CS_HARD)) && !cs_allow_invalid(p)) {
-               proc_rele(p);
-               return (EPERM);
-       }
-#endif
        if ((provider = fasttrap_provider_lookup(p, pdata->ftps_provider_type,
                                                 provider_name, &pid_attr)) == NULL) {
                proc_rele(p);
@@ -2339,10 +2335,10 @@ fasttrap_meta_provider_name(void *arg)
 }
 
 static dtrace_mops_t fasttrap_mops = {
-       fasttrap_meta_create_probe,
-       fasttrap_meta_provide,
-       fasttrap_meta_remove,
-       fasttrap_meta_provider_name
+       .dtms_create_probe =    fasttrap_meta_create_probe,
+       .dtms_provide_proc =    fasttrap_meta_provide,
+       .dtms_remove_proc =     fasttrap_meta_remove,
+       .dtms_provider_name =   fasttrap_meta_provider_name
 };
 
 /*
@@ -2522,22 +2518,11 @@ err:
        return (EINVAL);
 }
 
-static int
-fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+static void
+fasttrap_attach(void)
 {
        ulong_t nent;
-
-       switch (cmd) {
-       case DDI_ATTACH:
-               break;
-       case DDI_RESUME:
-               return (DDI_SUCCESS);
-       default:
-               return (DDI_FAILURE);
-       }
-
-       ddi_report_dev(devi);
-       fasttrap_devi = devi;
+       unsigned int i;
 
        /*
         * Install our hooks into fork(2), exec(2), and exit(2).
@@ -2553,17 +2538,6 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
         */
        fasttrap_max = (sane_size >> 28) * 100000;
 
-#if CONFIG_EMBEDDED
-#if defined(__LP64__)
-       /*
-        * On embedded, the zone map does not grow with the memory size over 1GB
-        * (see osfmk/vm/vm_init.c)
-        */
-       if (fasttrap_max > 400000) {
-               fasttrap_max = 400000;
-       }
-#endif
-#endif
        if (fasttrap_max == 0)
                fasttrap_max = 50000;
 
@@ -2573,8 +2547,12 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
        /*
         * Conjure up the tracepoints hashtable...
         */
+#ifdef illumos
        nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
            "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
+#else
+       nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+#endif
 
        if (nent <= 0 || nent > 0x1000000)
                nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
@@ -2589,11 +2567,7 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
            sizeof (fasttrap_bucket_t), KM_SLEEP);
        ASSERT(fasttrap_tpoints.fth_table != NULL);
 
-       /*
-        * APPLE NOTE:  explicitly initialize all locks...
-        */
-       unsigned int i;
-       for (i=0; i<fasttrap_tpoints.fth_nent; i++) {
+       for (i = 0; i < fasttrap_tpoints.fth_nent; i++) {
                lck_mtx_init(&fasttrap_tpoints.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
        }
 
@@ -2611,10 +2585,7 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
            sizeof (fasttrap_bucket_t), KM_SLEEP);
        ASSERT(fasttrap_provs.fth_table != NULL);
 
-       /*
-        * APPLE NOTE: explicitly initialize all locks...
-        */
-       for (i=0; i<fasttrap_provs.fth_nent; i++) {
+       for (i = 0; i < fasttrap_provs.fth_nent; i++) {
                lck_mtx_init(&fasttrap_provs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
        }
 
@@ -2632,17 +2603,14 @@ fasttrap_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
            sizeof (fasttrap_bucket_t), KM_SLEEP);
        ASSERT(fasttrap_procs.fth_table != NULL);
 
-       /*
-        * APPLE NOTE: explicitly initialize all locks...
-        */
-       for (i=0; i<fasttrap_procs.fth_nent; i++) {
+#ifndef illumos
+       for (i = 0; i < fasttrap_procs.fth_nent; i++) {
                lck_mtx_init(&fasttrap_procs.fth_table[i].ftb_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
        }
+#endif
 
        (void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
            &fasttrap_meta_id);
-
-       return (DDI_SUCCESS);
 }
 
 static int 
@@ -2676,7 +2644,7 @@ _fasttrap_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
                return 0;
 }
 
-static int gFasttrapInited = 0;
+static int fasttrap_inited = 0;
 
 #define FASTTRAP_MAJOR  -24 /* let the kernel pick the device number */
 
@@ -2714,7 +2682,7 @@ fasttrap_init( void )
         *
         * The reason is to delay allocating the (rather large) resources as late as possible.
         */
-       if (0 == gFasttrapInited) {
+       if (!fasttrap_inited) {
                int majdevno = cdevsw_add(FASTTRAP_MAJOR, &fasttrap_cdevsw);
 
                if (majdevno < 0) {
@@ -2763,12 +2731,7 @@ fasttrap_init( void )
                lck_mtx_init(&fasttrap_cleanup_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
                lck_mtx_init(&fasttrap_count_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
 
-               if (DDI_FAILURE == fasttrap_attach((dev_info_t *)(uintptr_t)device, 0 )) {
-                       // FIX ME! Do we remove the devfs node here?
-                       // What kind of error reporting?
-                       printf("fasttrap_init: Call to fasttrap_attach failed.\n");
-                       return;
-               }
+               fasttrap_attach();
 
                /*
                 * Start the fasttrap cleanup thread
@@ -2779,14 +2742,12 @@ fasttrap_init( void )
                }
                thread_set_thread_name(fasttrap_cleanup_thread, "dtrace_fasttrap_cleanup_thread");
 
-#ifdef FASTTRAP_ASYNC_REMOVE
                fasttrap_retired_size = DEFAULT_RETIRED_SIZE;
-               fasttrap_retired_spec = kmem_zalloc(fasttrap_retired_size * sizeof(fasttrap_tracepoint_t*),
+               fasttrap_retired_spec = kmem_zalloc(fasttrap_retired_size * sizeof(*fasttrap_retired_spec),
                                        KM_SLEEP);
                lck_mtx_init(&fasttrap_retired_mtx, fasttrap_lck_grp, fasttrap_lck_attr);
-#endif
 
-               gFasttrapInited = 1;
+               fasttrap_inited = 1;
        }
 }
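One small idiom in the hunk above is sizing the retired-tracepoint allocation with sizeof(*fasttrap_retired_spec) instead of spelling out the element type. A minimal sketch of the same idiom, assuming plain calloc in place of kmem_zalloc and an invented tracepoint struct:

/*
 * Sketch of the sizeof(*ptr) allocation idiom (illustrative names;
 * malloc-family calls stand in for kmem_zalloc). Sizing off the
 * object the pointer refers to keeps the allocation correct if the
 * element type is later changed in one place.
 */
#include <stdio.h>
#include <stdlib.h>

struct tracepoint { unsigned long addr; int flags; };

int main(void)
{
        size_t n = 8;
        struct tracepoint **retired;

        /* Element size follows the declared type of 'retired' automatically. */
        retired = calloc(n, sizeof(*retired));
        if (retired == NULL)
                return 1;

        printf("allocated %zu bytes for %zu slots\n", n * sizeof(*retired), n);
        free(retired);
        return 0;
}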
 
index d90a7b15d5a391952d5959dc11fcde2a54275c2e..d785547ca6dabd58df1fb0e7b7f49d351be62fa7 100644 (file)
@@ -67,10 +67,13 @@ extern kern_return_t fbt_perfCallback(int, struct savearea_t *, uintptr_t *, __u
 #error Unknown architecture
 #endif
 
+__private_extern__
+void
+qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
+
 #define        FBT_ADDR2NDX(addr)      ((((uintptr_t)(addr)) >> 4) & fbt_probetab_mask)
 #define        FBT_PROBETAB_SIZE       0x8000          /* 32k entries -- 128K total */
 
-static dev_info_t              *fbt_devi;
 static int                             fbt_probetab_size;
 dtrace_provider_id_t   fbt_id;
 fbt_probe_t                            **fbt_probetab;
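For context, FBT_ADDR2NDX above hashes a patch-point address into the probe table by shifting off the low bits and masking with fbt_probetab_mask, which presumes the table size is a power of two (FBT_PROBETAB_SIZE is 0x8000). A standalone sketch of that bucket computation, with an illustrative table size and made-up addresses:

/*
 * Sketch of the FBT_ADDR2NDX-style bucket hash (illustrative table
 * size and addresses). Dropping the low bits with a shift and masking
 * with size-1 maps an address to a bucket without a modulo; this only
 * works when the table size is a power of two.
 */
#include <stdint.h>
#include <stdio.h>

#define TABLE_SIZE 0x8000u                       /* must be a power of two */
#define ADDR2NDX(addr) ((((uintptr_t)(addr)) >> 4) & (TABLE_SIZE - 1))

int main(void)
{
        uintptr_t addrs[] = { 0x80400000u, 0x80400010u, 0x804abc30u };

        for (unsigned i = 0; i < sizeof(addrs) / sizeof(addrs[0]); i++)
                printf("%#lx -> bucket %lu\n",
                    (unsigned long)addrs[i], (unsigned long)ADDR2NDX(addrs[i]));
        return 0;
}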
@@ -91,12 +94,23 @@ void fbt_init( void );
 static const char * critical_blacklist[] =
 {
        "Call_DebuggerC",
+       "DebuggerCall",
+       "DebuggerTrapWithState",
+       "DebuggerXCallEnter",
+       "IOCPURunPlatformPanicActions",
+       "PEARMDebugPanicHook",
+       "PEHaltRestart",
+       "SavePanicInfo",
        "SysChoked",
        "_ZN9IOService14newTemperatureElPS_", /* IOService::newTemperature */
        "_ZN9IOService26temperatureCriticalForZoneEPS_", /* IOService::temperatureCriticalForZone */
        "_ZNK6OSData14getBytesNoCopyEv", /* Data::getBytesNoCopy, IOHibernateSystemWake path */
+       "__ZN16IOPlatformExpert11haltRestartEj",
+       "__ZN18IODTPlatformExpert11haltRestartEj",
+       "__ZN9IODTNVRAM13savePanicInfoEPhy"
        "_disable_preemption",
        "_enable_preemption",
+       "alternate_debugger_enter",
        "bcopy_phys",
        "console_cpu_alloc",
        "console_cpu_free",
@@ -136,12 +150,18 @@ static const char * critical_blacklist[] =
        "enter_lohandler",
        "fbt_invop",
        "fbt_perfCallback",
+       "get_preemption_level"
        "get_threadtask",
        "handle_pending_TLB_flushes",
        "hw_compare_and_store",
        "interrupt",
+       "is_saved_state32",
+       "kernel_preempt_check",
        "kernel_trap",
        "kprintf",
+       "ks_dispatch_kernel",
+       "ks_dispatch_user",
+       "ks_kernel_trap",
        "lo_alltraps",
        "lock_debugger",
        "machine_idle_cstate",
@@ -153,6 +173,9 @@ static const char * critical_blacklist[] =
        "nanotime_to_absolutetime",
        "packA",
        "panic",
+       "phystokv",
+       "phystokv_range",
+       "pltrace",
        "pmKextRegister",
        "pmMarkAllCPUsOff",
        "pmSafeMode",
@@ -167,18 +190,28 @@ static const char * critical_blacklist[] =
        "power_management_init",
        "preemption_underflow_panic",
        "register_cpu_setup_func",
+       "ret64_iret"
+       "ret_to_user"
+       "return_to_kernel",
+       "return_to_user",
+       "saved_state64",
        "sdt_invop",
        "sprlock",
        "sprunlock",
+       "strlen",
+       "strncmp",
        "t_invop",
        "tmrCvt",
-       "uread",
-       "uwrite",
+       "trap_from_kernel",
+       "uart_putc",
        "unlock_debugger",
        "unpackA",
        "unregister_cpu_setup_func",
+       "uread",
+       "uwrite",
        "vstart"
 };
+
 #define CRITICAL_BLACKLIST_COUNT (sizeof(critical_blacklist)/sizeof(critical_blacklist[0]))
 
 /*
@@ -192,6 +225,7 @@ static const char * probe_ctx_closure[] =
        "IS_64BIT_PROCESS",
        "OSCompareAndSwap",
        "SetIdlePop",
+       "__dtrace_probe",
        "absolutetime_to_microtime",
        "act_set_astbsd",
        "arm_init_idle_cpu",
@@ -287,7 +321,7 @@ fbt_module_excluded(struct modctl* ctl)
        if (ctl->mod_address == 0 || ctl->mod_size == 0) {
                return TRUE;
        }
-       
+
        if (ctl->mod_loaded == 0) {
                return TRUE;
        }
@@ -434,9 +468,12 @@ fbt_excluded(const char* name)
                return TRUE;
 #endif
 
-
 #ifdef __x86_64__
        if (LIT_STRNSTART(name, "machine_") ||
+               LIT_STRNSTART(name, "idt64") ||
+               LIT_STRNSTART(name, "ks_") ||
+               LIT_STRNSTART(name, "hndl_") ||
+               LIT_STRNSTART(name, "_intr_") ||
                LIT_STRNSTART(name, "mapping_") ||
                LIT_STRNSTART(name, "tsc_") ||
                LIT_STRNSTART(name, "pmCPU") ||
@@ -532,7 +569,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg)
     for (; fbt != NULL; fbt = fbt->fbtp_next) {
 
        ctl = fbt->fbtp_ctl;
-       
+
        if (!ctl->mod_loaded) {
                if (fbt_verbose) {
                        cmn_err(CE_NOTE, "fbt is failing for probe %s "
@@ -556,7 +593,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg)
                }
 
                continue;
-       }       
+       }
 
        dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback);
        if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) {
@@ -576,7 +613,7 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg)
                kasan_fakestack_suspend();
 #endif
 
-               (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, 
+               (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint,
                                                                sizeof(fbt->fbtp_patchval));
                /*
                 * Make the patched instruction visible via a data + instruction
@@ -590,9 +627,9 @@ fbt_enable(void *arg, dtrace_id_t id, void *parg)
        }
 
     }
-    
+
     dtrace_membar_consumer();
-    
+
     return (0);
 }
 
@@ -606,12 +643,12 @@ fbt_disable(void *arg, dtrace_id_t id, void *parg)
 
        for (; fbt != NULL; fbt = fbt->fbtp_next) {
            ctl = fbt->fbtp_ctl;
-           
+
            if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt))
                continue;
 
            if (fbt->fbtp_currentval != fbt->fbtp_savedval) {
-               (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, 
+               (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint,
                                                                sizeof(fbt->fbtp_savedval));
                /*
                 * Make the patched instruction visible via a data + instruction
@@ -647,19 +684,19 @@ fbt_suspend(void *arg, dtrace_id_t id, void *parg)
            if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt))
                continue;
 
-           (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint, 
+           (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_savedval, (vm_offset_t)fbt->fbtp_patchpoint,
                                                                sizeof(fbt->fbtp_savedval));
-               
+
                /*
                 * Make the patched instruction visible via a data + instruction
                 * cache flush for the platforms that need it
                 */
                flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0);
                invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_savedval), 0);
-               
+
                fbt->fbtp_currentval = fbt->fbtp_savedval;
        }
-       
+
        dtrace_membar_consumer();
 }
 
@@ -677,7 +714,7 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg)
            ASSERT(ctl->mod_nenabled > 0);
            if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt))
                continue;
-       
+
            dtrace_casptr(&tempDTraceTrapHook, NULL, fbt_perfCallback);
            if (tempDTraceTrapHook != (perfCallback)fbt_perfCallback) {
                if (fbt_verbose) {
@@ -687,123 +724,21 @@ fbt_resume(void *arg, dtrace_id_t id, void *parg)
                }
                return;
            }
-       
-           (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint, 
+
+           (void)ml_nofault_copy( (vm_offset_t)&fbt->fbtp_patchval, (vm_offset_t)fbt->fbtp_patchpoint,
                                                                sizeof(fbt->fbtp_patchval));
 
-#if CONFIG_EMBEDDED
                /*
                 * Make the patched instruction visible via a data + instruction cache flush.
                 */
                flush_dcache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0);
                invalidate_icache((vm_offset_t)fbt->fbtp_patchpoint,(vm_size_t)sizeof(fbt->fbtp_patchval), 0);
-#endif
-               
-           fbt->fbtp_currentval = fbt->fbtp_patchval;
-       }
-       
-       dtrace_membar_consumer();
-}
 
-/*
- * APPLE NOTE: fbt_getargdesc not implemented
- */
-#if !defined(__APPLE__)
-/*ARGSUSED*/
-static void
-fbt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
-{
-       fbt_probe_t *fbt = parg;
-       struct modctl *ctl = fbt->fbtp_ctl;
-       struct module *mp = ctl->mod_mp;
-       ctf_file_t *fp = NULL, *pfp;
-       ctf_funcinfo_t f;
-       int error;
-       ctf_id_t argv[32], type;
-       int argc = sizeof (argv) / sizeof (ctf_id_t);
-       const char *parent;
-
-       if (!ctl->mod_loaded || (ctl->mod_loadcnt != fbt->fbtp_loadcnt))
-               goto err;
-
-       if (fbt->fbtp_roffset != 0 && desc->dtargd_ndx == 0) {
-               (void) strlcpy(desc->dtargd_native, "int",
-                              sizeof(desc->dtargd_native));
-               return;
-       }
-
-       if ((fp = ctf_modopen(mp, &error)) == NULL) {
-               /*
-                * We have no CTF information for this module -- and therefore
-                * no args[] information.
-                */
-               goto err;
-       }
-
-       /*
-        * If we have a parent container, we must manually import it.
-        */
-       if ((parent = ctf_parent_name(fp)) != NULL) {
-               struct modctl *mp = &modules;
-               struct modctl *mod = NULL;
-
-               /*
-                * We must iterate over all modules to find the module that
-                * is our parent.
-                */
-               do {
-                       if (strcmp(mp->mod_modname, parent) == 0) {
-                               mod = mp;
-                               break;
-                       }
-               } while ((mp = mp->mod_next) != &modules);
-
-               if (mod == NULL)
-                       goto err;
-
-               if ((pfp = ctf_modopen(mod->mod_mp, &error)) == NULL) {
-                       goto err;
-               }
-
-               if (ctf_import(fp, pfp) != 0) {
-                       ctf_close(pfp);
-                       goto err;
-               }
-
-               ctf_close(pfp);
-       }
-
-       if (ctf_func_info(fp, fbt->fbtp_symndx, &f) == CTF_ERR)
-               goto err;
-
-       if (fbt->fbtp_roffset != 0) {
-               if (desc->dtargd_ndx > 1)
-                       goto err;
-
-               ASSERT(desc->dtargd_ndx == 1);
-               type = f.ctc_return;
-       } else {
-               if (desc->dtargd_ndx + 1 > f.ctc_argc)
-                       goto err;
-
-               if (ctf_func_args(fp, fbt->fbtp_symndx, argc, argv) == CTF_ERR)
-                       goto err;
-
-               type = argv[desc->dtargd_ndx];
-       }
-
-       if (ctf_type_name(fp, type, desc->dtargd_native,
-           DTRACE_ARGTYPELEN) != NULL) {
-               ctf_close(fp);
-               return;
+           fbt->fbtp_currentval = fbt->fbtp_patchval;
        }
-err:
-       if (fp != NULL)
-               ctf_close(fp);
 
-       desc->dtargd_ndx = DTRACE_ARGNONE;
+       dtrace_membar_consumer();
 }
-#endif /* __APPLE__ */
 
 static void
 fbt_provide_module_user_syms(struct modctl *ctl)
@@ -827,11 +762,8 @@ fbt_provide_module_user_syms(struct modctl *ctl)
                        if (*name == '_')
                                name += 1;
 
-                       /*
-                        * We're only blacklisting functions in the kernel for now.
-                        */
-                        if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
-                               continue;
+                       if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
+                               continue;
 
                        /*
                         * Ignore symbols with a null address
@@ -839,11 +771,139 @@ fbt_provide_module_user_syms(struct modctl *ctl)
                        if (!symbol->dtsym_addr)
                                continue;
 
-                       fbt_provide_probe(ctl, (uintptr_t)symbol->dtsym_addr, (uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size), modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr);
+                       /*
+                        * Ignore symbols not part of this module
+                        */
+                       if (!dtrace_addr_in_module((void*)symbol->dtsym_addr, ctl))
+                               continue;
+
+                       fbt_provide_probe(ctl, modname, name, (machine_inst_t*)(uintptr_t)symbol->dtsym_addr, (machine_inst_t*)(uintptr_t)(symbol->dtsym_addr + symbol->dtsym_size));
                }
        }
 }
+static void
+fbt_provide_kernel_section(struct modctl *ctl, kernel_section_t *sect, kernel_nlist_t *sym, uint32_t nsyms, const char *strings)
+{
+       uintptr_t sect_start = (uintptr_t)sect->addr;
+       uintptr_t sect_end = (uintptr_t)sect->size + sect->addr;
+       unsigned int i;
+
+       if ((sect->flags & S_ATTR_PURE_INSTRUCTIONS) != S_ATTR_PURE_INSTRUCTIONS) {
+               return;
+       }
+
+       for (i = 0; i < nsyms; i++) {
+               uint8_t         n_type = sym[i].n_type & (N_TYPE | N_EXT);
+               const char           *name = strings + sym[i].n_un.n_strx;
+               uint64_t limit;
+
+               if (sym[i].n_value < sect_start || sym[i].n_value > sect_end)
+                       continue;
+
+               /* Check that the symbol is a global and that it has a name. */
+               if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
+                       continue;
+
+               if (0 == sym[i].n_un.n_strx)    /* iff a null, "", name. */
+                       continue;
+
+               /* Lop off omnipresent leading underscore. */
+               if (*name == '_')
+                       name += 1;
+
+#if defined(__arm__)
+               // Skip non-thumb functions on arm32
+               if (sym[i].n_sect == 1 && !(sym[i].n_desc & N_ARM_THUMB_DEF)) {
+                       continue;
+               }
+#endif /* defined(__arm__) */
+
+               if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
+                       continue;
+
+               /*
+                * Find the function boundary by looking at either the
+                * end of the section or the beginning of the next symbol
+                */
+               if (i == nsyms - 1) {
+                       limit = sect_end;
+               }
+               else {
+                       limit = sym[i + 1].n_value;
+               }
+
+               fbt_provide_probe(ctl, ctl->mod_modname, name, (machine_inst_t*)sym[i].n_value, (machine_inst_t*)limit);
+       }
+
+}
+
+static int
+fbt_sym_cmp(const void *ap, const void *bp)
+{
+       return (int)(((const kernel_nlist_t*)ap)->n_value - ((const kernel_nlist_t*)bp)->n_value);
+}
+
+static void
+fbt_provide_module_kernel_syms(struct modctl *ctl)
+{
+       kernel_mach_header_t *mh = (kernel_mach_header_t *)(ctl->mod_address);
+       kernel_segment_command_t *seg;
+       struct load_command *cmd;
+       kernel_segment_command_t *linkedit = NULL;
+       struct symtab_command *symtab = NULL;
+       kernel_nlist_t *syms = NULL, *sorted_syms = NULL;
+       const char *strings;
+       unsigned int i;
+       size_t symlen;
+
+       if (mh->magic != MH_MAGIC_KERNEL)
+               return;
+
+       cmd = (struct load_command *) &mh[1];
+       for (i = 0; i < mh->ncmds; i++) {
+               if (cmd->cmd == LC_SEGMENT_KERNEL) {
+                       kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
+                       if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
+                               linkedit = orig_sg;
+               } else if (cmd->cmd == LC_SYMTAB) {
+                       symtab = (struct symtab_command *) cmd;
+               }
+               if (symtab && linkedit) {
+                       break;
+               }
+               cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);
+       }
 
+       if ((symtab == NULL) || (linkedit == NULL)) {
+               return;
+       }
+
+       syms = (kernel_nlist_t *)(linkedit->vmaddr + symtab->symoff - linkedit->fileoff);
+       strings = (const char *)(linkedit->vmaddr + symtab->stroff - linkedit->fileoff);
+
+       /*
+        * Make a copy of the symbol table and sort it to not cross into the next function
+        * when disassembling the function
+        */
+       symlen = sizeof(kernel_nlist_t) * symtab->nsyms;
+       sorted_syms = kmem_alloc(symlen, KM_SLEEP);
+       bcopy(syms, sorted_syms, symlen);
+       qsort(sorted_syms, symtab->nsyms, sizeof(kernel_nlist_t), fbt_sym_cmp);
+
+       for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) {
+               kernel_section_t *sect = firstsect(seg);
+
+               if (strcmp(seg->segname, "__KLD") == 0) {
+                       continue;
+               }
+
+               for (sect = firstsect(seg); sect != NULL; sect = nextsect(seg, sect)) {
+                       fbt_provide_kernel_section(ctl, sect, sorted_syms, symtab->nsyms, strings);
+               }
+       }
+
+       kmem_free(sorted_syms, symlen);
+}
 
 void
 fbt_provide_module(void *arg, struct modctl *ctl)
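fbt_provide_module_kernel_syms above copies the kernel symbol table, sorts the copy by address with the newly declared in-kernel qsort, and then takes each symbol's end from either the next symbol's address or the end of the section, so the disassembler never runs past a function boundary. The sketch below reproduces only that boundary computation on invented symbols; the comparator uses an explicit three-way compare rather than subtraction purely to keep the illustration overflow-safe.

/*
 * Sketch of the function-boundary computation (illustrative data;
 * not the Mach-O nlist parsing). Sorting symbols by address lets the
 * end of each function be taken from the next symbol's address, or
 * from the end of the section for the last one.
 */
#include <stdio.h>
#include <stdlib.h>

struct sym { const char *name; unsigned long addr; };

static int
sym_cmp(const void *ap, const void *bp)
{
        unsigned long a = ((const struct sym *)ap)->addr;
        unsigned long b = ((const struct sym *)bp)->addr;
        return (a > b) - (a < b);       /* three-way compare, no overflow */
}

int main(void)
{
        struct sym syms[] = {
                { "baz", 0x1200 }, { "foo", 0x1000 }, { "bar", 0x1080 },
        };
        size_t nsyms = sizeof(syms) / sizeof(syms[0]);
        unsigned long sect_end = 0x1400;

        qsort(syms, nsyms, sizeof(syms[0]), sym_cmp);

        for (size_t i = 0; i < nsyms; i++) {
                unsigned long limit = (i == nsyms - 1) ? sect_end : syms[i + 1].addr;
                printf("%s: [0x%lx, 0x%lx)\n", syms[i].name, syms[i].addr, limit);
        }
        return 0;
}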
@@ -893,16 +953,16 @@ static dtrace_pattr_t fbt_attr = {
 };
 
 static dtrace_pops_t fbt_pops = {
-       NULL,
-       fbt_provide_module,
-       fbt_enable,
-       fbt_disable,
-       fbt_suspend,
-       fbt_resume,
-       NULL, /*  APPLE NOTE: fbt_getargdesc not implemented */
-       NULL,
-       NULL,
-       fbt_destroy
+       .dtps_provide =         NULL,
+       .dtps_provide_module =  fbt_provide_module,
+       .dtps_enable =          fbt_enable,
+       .dtps_disable =         fbt_disable,
+       .dtps_suspend =         fbt_suspend,
+       .dtps_resume =          fbt_resume,
+       .dtps_getargdesc =      NULL, /* APPLE NOTE: fbt_getargdesc implemented in userspace */
+       .dtps_getargval =       NULL,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         fbt_destroy
 };
 
 static void
@@ -916,17 +976,8 @@ fbt_cleanup(dev_info_t *devi)
 }
 
 static int
-fbt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+fbt_attach(dev_info_t *devi)
 {
-       switch (cmd) {
-       case DDI_ATTACH:
-               break;
-       case DDI_RESUME:
-               return (DDI_SUCCESS);
-       default:
-               return (DDI_FAILURE);
-       }
-
        if (fbt_probetab_size == 0)
                fbt_probetab_size = FBT_PROBETAB_SIZE;
 
@@ -944,9 +995,6 @@ fbt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
                return (DDI_FAILURE);
        }
 
-       ddi_report_dev(devi);
-       fbt_devi = devi;
-
        return (DDI_SUCCESS);
 }
 
@@ -1024,8 +1072,6 @@ static struct cdevsw fbt_cdevsw =
        0                                       /* type */
 };
 
-static int fbt_inited = 0;
-
 #undef kmem_alloc /* from its binding to dt_kmem_alloc glue */
 #undef kmem_free /* from its binding to dt_kmem_free glue */
 #include <vm/vm_kern.h>
@@ -1033,22 +1079,15 @@ static int fbt_inited = 0;
 void
 fbt_init( void )
 {
-       if (0 == fbt_inited)
-       {
-               int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw);
-               
-               if (majdevno < 0) {
-                       printf("fbt_init: failed to allocate a major number!\n");
-                       return;
-               }
-               
-               PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof (ignore_fbt_blacklist));
+       int majdevno = cdevsw_add(FBT_MAJOR, &fbt_cdevsw);
 
-               fbt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH );
-               
-               fbt_inited = 1; /* Ensure this initialization occurs just one time. */
+       if (majdevno < 0) {
+               printf("fbt_init: failed to allocate a major number!\n");
+               return;
        }
-       else
-               panic("fbt_init: called twice!\n");
+
+       PE_parse_boot_argn("IgnoreFBTBlacklist", &ignore_fbt_blacklist, sizeof (ignore_fbt_blacklist));
+
+       fbt_attach((dev_info_t*)(uintptr_t)majdevno);
 }
 #undef FBT_MAJOR
index 2aebd0a1ed036fa8ce40cbbfe6b23ae4f1f95595..3d06c46f44daf4c9e3918fa52213145726d8dc4e 100644 (file)
@@ -176,25 +176,6 @@ lockstat_probe_t lockstat_probes[] =
 
 dtrace_id_t lockstat_probemap[LS_NPROBES];
 
-#if CONFIG_DTRACE
-#if defined(__x86_64__)
-extern void lck_mtx_lock_lockstat_patch_point(void);
-extern void lck_mtx_try_lock_lockstat_patch_point(void);
-extern void lck_mtx_try_lock_spin_lockstat_patch_point(void);
-extern void lck_mtx_unlock_lockstat_patch_point(void);
-extern void lck_mtx_lock_ext_lockstat_patch_point(void);
-extern void lck_mtx_ext_unlock_lockstat_patch_point(void);
-extern void lck_mtx_lock_spin_lockstat_patch_point(void);
-#endif
-#if defined (__arm__)
-
-#endif
-
-#if defined (__arm64__)
-
-#endif
-#endif /* CONFIG_DTRACE */
-
 typedef struct lockstat_assembly_probe {
        int lsap_probe;
        vm_offset_t * lsap_patch_point;
@@ -203,26 +184,8 @@ typedef struct lockstat_assembly_probe {
 
        lockstat_assembly_probe_t assembly_probes[] =
        {
-#if CONFIG_DTRACE
-#if defined(__x86_64__)
-               /*
-                * On x86 these points are better done via hot patches, which ensure
-                * there is zero overhead when not in use.  On x86 these patch points
-                * are swapped between the return instruction and a no-op, with the
-                * Dtrace call following the return.
-                */ 
-               { LS_LCK_MTX_LOCK_ACQUIRE,              (vm_offset_t *) lck_mtx_lock_lockstat_patch_point },
-               { LS_LCK_MTX_TRY_LOCK_ACQUIRE,          (vm_offset_t *) lck_mtx_try_lock_lockstat_patch_point },
-               { LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE,     (vm_offset_t *) lck_mtx_try_lock_spin_lockstat_patch_point },
-               { LS_LCK_MTX_UNLOCK_RELEASE,            (vm_offset_t *) lck_mtx_unlock_lockstat_patch_point },
-               { LS_LCK_MTX_EXT_LOCK_ACQUIRE,          (vm_offset_t *) lck_mtx_lock_ext_lockstat_patch_point },
-               { LS_LCK_MTX_EXT_UNLOCK_RELEASE,        (vm_offset_t *) lck_mtx_ext_unlock_lockstat_patch_point },
-               { LS_LCK_MTX_LOCK_SPIN_ACQUIRE,         (vm_offset_t *) lck_mtx_lock_spin_lockstat_patch_point },
-#endif
-               /* No assembly patch points for ARM */
-#endif /* CONFIG_DTRACE */
                { LS_LCK_INVALID, NULL }
-};
+       };
 
 
 /*
@@ -290,7 +253,6 @@ lockstat_probe_wrapper(int probe, uintptr_t lp, int rwflag)
        }
 }
 
-static dev_info_t      *lockstat_devi; /* saved in xxattach() for xxinfo() */
 static dtrace_provider_id_t lockstat_id;
 
 /*ARGSUSED*/
@@ -410,30 +372,21 @@ static dtrace_pattr_t lockstat_attr = {
 };
 
 static dtrace_pops_t lockstat_pops = {
-       lockstat_provide,
-       NULL,
-       lockstat_enable,
-       lockstat_disable,
-       NULL,
-       NULL,
-       lockstat_getargdesc,
-       NULL,
-       NULL,
-       lockstat_destroy
+       .dtps_provide =         lockstat_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          lockstat_enable,
+       .dtps_disable =         lockstat_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      lockstat_getargdesc,
+       .dtps_getargval =       NULL,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         lockstat_destroy
 };
 
 static int
-lockstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+lockstat_attach(dev_info_t *devi)
 {
-       switch (cmd) {
-       case DDI_ATTACH:
-               break;
-       case DDI_RESUME:
-               return (DDI_SUCCESS);
-       default:
-               return (DDI_FAILURE);
-       }
-
        if (ddi_create_minor_node(devi, "lockstat", S_IFCHR, 0,
            DDI_PSEUDO, 0) == DDI_FAILURE ||
            dtrace_register("lockstat", &lockstat_attr, DTRACE_PRIV_KERNEL,
@@ -445,8 +398,6 @@ lockstat_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
        lockstat_probe = dtrace_probe;
        membar_producer();
 
-       ddi_report_dev(devi);
-       lockstat_devi = devi;
        return (DDI_SUCCESS);
 }
 
@@ -482,25 +433,17 @@ static struct cdevsw lockstat_cdevsw =
        0                                       /* type */
 };
 
-static int gLockstatInited = 0;
-
 void lockstat_init( void );
 
 void lockstat_init( void )
 {
-       if (0 == gLockstatInited)
-       {
-               int majdevno = cdevsw_add(LOCKSTAT_MAJOR, &lockstat_cdevsw);
-               
-               if (majdevno < 0) {
-                       printf("lockstat_init: failed to allocate a major number!\n");
-                       gLockstatInited = 0;
-                       return;
-               }
+       int majdevno = cdevsw_add(LOCKSTAT_MAJOR, &lockstat_cdevsw);
+
+       if (majdevno < 0) {
+               printf("lockstat_init: failed to allocate a major number!\n");
+               return;
+       }
 
-               lockstat_attach( (dev_info_t    *)(uintptr_t)majdevno, DDI_ATTACH );
-               gLockstatInited = 1;
-       } else
-               panic("lockstat_init: called twice!\n");
+       lockstat_attach((dev_info_t*)(uintptr_t)majdevno);
 }
 #undef LOCKSTAT_MAJOR
index 259fab8bcb298898451e2ec3b78a300b8caa060a..a76f901c4d13ec52454db9f4b49517bb29f9462d 100644 (file)
@@ -65,7 +65,6 @@ extern struct arm_saved_state *find_kern_regs(thread_t);
 
 extern void profile_init(void);
 
-static dev_info_t *profile_devi;
 static dtrace_provider_id_t profile_id;
 
 /*
@@ -645,30 +644,21 @@ static dtrace_pattr_t profile_attr = {
 };
 
 static dtrace_pops_t profile_pops = {
-       profile_provide,
-       NULL,
-       profile_enable,
-       profile_disable,
-       NULL,
-       NULL,
-       profile_getargdesc,
-       profile_getarg,
-       profile_usermode,
-       profile_destroy
+       .dtps_provide =         profile_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          profile_enable,
+       .dtps_disable =         profile_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      profile_getargdesc,
+       .dtps_getargval =       profile_getarg,
+       .dtps_usermode =        profile_usermode,
+       .dtps_destroy =         profile_destroy
 };
 
 static int
-profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+profile_attach(dev_info_t *devi)
 {
-       switch (cmd) {
-       case DDI_ATTACH:
-               break;
-       case DDI_RESUME:
-               return (DDI_SUCCESS);
-       default:
-               return (DDI_FAILURE);
-       }
-
        if (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
            DDI_PSEUDO, 0) == DDI_FAILURE ||
            dtrace_register("profile", &profile_attr,
@@ -680,8 +670,6 @@ profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 
        profile_max = PROFILE_MAX_DEFAULT;
 
-       ddi_report_dev(devi);
-       profile_devi = devi;
        return (DDI_SUCCESS);
 }
 
@@ -741,24 +729,15 @@ static struct cdevsw profile_cdevsw =
        0                                       /* type */
 };
 
-static int gProfileInited = 0;
-
 void profile_init( void )
 {
-       if (0 == gProfileInited)
-       {
-               int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw);
-               
-               if (majdevno < 0) {
-                       printf("profile_init: failed to allocate a major number!\n");
-                       gProfileInited = 0;
-                       return;
-               }
+       int majdevno = cdevsw_add(PROFILE_MAJOR, &profile_cdevsw);
 
-               profile_attach( (dev_info_t     *)(uintptr_t)majdevno, DDI_ATTACH );
+       if (majdevno < 0) {
+               printf("profile_init: failed to allocate a major number!\n");
+               return;
+       }
 
-               gProfileInited = 1;
-       } else
-               panic("profile_init: called twice!\n");
+       profile_attach( (dev_info_t*)(uintptr_t)majdevno);
 }
 #undef PROFILE_MAJOR
index 35937cdf3c7234253af5661c15e63704485ab8da..1abf6f14c69193409daa7c9ec1fbd68ae472dbcc 100644 (file)
@@ -81,7 +81,6 @@ extern kern_return_t fbt_perfCallback(int, struct savearea_t *, uintptr_t *, int
 
 #define DTRACE_PROBE_PREFIX "_dtrace_probe$"
 
-static dev_info_t              *sdt_devi;
 static int                     sdt_verbose = 0;
 sdt_probe_t            **sdt_probetab;
 int                    sdt_probetab_size;
@@ -328,23 +327,22 @@ err:
 }
 
 static dtrace_pops_t sdt_pops = {
-       NULL,
-       sdt_provide_module,
-       sdt_enable,
-       sdt_disable,
-       NULL,
-       NULL,
-       sdt_getargdesc,
-       sdt_getarg,
-       NULL,
-       sdt_destroy
+       .dtps_provide =         NULL,
+       .dtps_provide_module =  sdt_provide_module,
+       .dtps_enable =          sdt_enable,
+       .dtps_disable =         sdt_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      sdt_getargdesc,
+       .dtps_getargval =       sdt_getarg,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         sdt_destroy,
 };
 
 /*ARGSUSED*/
 static int
-sdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+sdt_attach(dev_info_t *devi)
 {
-#pragma unused(cmd)
        sdt_provider_t *prov;
 
        if (ddi_create_minor_node(devi, "sdt", S_IFCHR,
@@ -354,9 +352,6 @@ sdt_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
                return (DDI_FAILURE);
        }
 
-       ddi_report_dev(devi);
-       sdt_devi = devi;
-
        if (sdt_probetab_size == 0)
                sdt_probetab_size = SDT_PROBETAB_SIZE;
 
@@ -446,164 +441,162 @@ static struct cdevsw sdt_cdevsw =
        0                                       /* type */
 };
 
-static int gSDTInited = 0;
 static struct modctl g_sdt_kernctl;
 static struct module g_sdt_mach_module;
 
 #include <mach-o/nlist.h>
 #include <libkern/kernel_mach_header.h>
 
-void sdt_init( void )
+void sdt_early_init( void )
 {
-       if (0 == gSDTInited)
-       {
-               int majdevno = cdevsw_add(SDT_MAJOR, &sdt_cdevsw);
+       if (dtrace_sdt_probes_restricted()) {
+               return;
+       }
+       if (MH_MAGIC_KERNEL != _mh_execute_header.magic) {
+               g_sdt_kernctl.mod_address = (vm_address_t)NULL;
+               g_sdt_kernctl.mod_size = 0;
+       } else {
+               kernel_mach_header_t        *mh;
+               struct load_command         *cmd;
+               kernel_segment_command_t    *orig_ts = NULL, *orig_le = NULL;
+               struct symtab_command       *orig_st = NULL;
+               kernel_nlist_t              *sym = NULL;
+               char                        *strings;
+               unsigned int                i;
                
-               if (majdevno < 0) {
-                       printf("sdt_init: failed to allocate a major number!\n");
-                       gSDTInited = 0;
-                       return;
+               g_sdt_mach_module.sdt_nprobes = 0;
+               g_sdt_mach_module.sdt_probes = NULL;
+               
+               g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module;
+               g_sdt_kernctl.mod_size = 0;
+               strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME);
+               
+               g_sdt_kernctl.mod_next = NULL;
+               g_sdt_kernctl.mod_stale = NULL;
+               g_sdt_kernctl.mod_id = 0;
+               g_sdt_kernctl.mod_loadcnt = 1;
+               g_sdt_kernctl.mod_loaded = 1;
+               g_sdt_kernctl.mod_flags = 0;
+               g_sdt_kernctl.mod_nenabled = 0;
+               
+               mh = &_mh_execute_header;
+               cmd = (struct load_command*) &mh[1];
+               for (i = 0; i < mh->ncmds; i++) {
+                       if (cmd->cmd == LC_SEGMENT_KERNEL) {
+                               kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
+                               
+                               if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
+                                       orig_ts = orig_sg;
+                               else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
+                                       orig_le = orig_sg;
+                               else if (LIT_STRNEQL(orig_sg->segname, ""))
+                                       orig_ts = orig_sg; /* kexts have a single unnamed segment */
+                       }
+                       else if (cmd->cmd == LC_SYMTAB)
+                               orig_st = (struct symtab_command *) cmd;
+                       
+                       cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
                }
-
-               if (dtrace_sdt_probes_restricted()) {
+               
+               if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
                        return;
-               }
-
-               if (MH_MAGIC_KERNEL != _mh_execute_header.magic) {
-                       g_sdt_kernctl.mod_address = (vm_address_t)NULL;
-                       g_sdt_kernctl.mod_size = 0;
-               } else {
-                       kernel_mach_header_t        *mh;
-                       struct load_command         *cmd;
-                       kernel_segment_command_t    *orig_ts = NULL, *orig_le = NULL;
-                       struct symtab_command       *orig_st = NULL;
-                       kernel_nlist_t              *sym = NULL;
-                       char                        *strings;
-                       unsigned int                i;
+               
+               sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
+               strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
+               
+               for (i = 0; i < orig_st->nsyms; i++) {
+                       uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT);
+                       char *name = strings + sym[i].n_un.n_strx;
+                       const char *prev_name;
+                       unsigned long best;
+                       unsigned int j;
                        
-                       g_sdt_mach_module.sdt_nprobes = 0;
-                       g_sdt_mach_module.sdt_probes = NULL;
+                       /* Check that the symbol is a global and that it has a name. */
+                       if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
+                               continue;
                        
-                       g_sdt_kernctl.mod_address = (vm_address_t)&g_sdt_mach_module;
-                       g_sdt_kernctl.mod_size = 0;
-                       strncpy((char *)&(g_sdt_kernctl.mod_modname), "mach_kernel", KMOD_MAX_NAME);
+                       if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */
+                               continue;
                        
-                       g_sdt_kernctl.mod_next = NULL;
-                       g_sdt_kernctl.mod_stale = NULL;
-                       g_sdt_kernctl.mod_id = 0;
-                       g_sdt_kernctl.mod_loadcnt = 1;
-                       g_sdt_kernctl.mod_loaded = 1;
-                       g_sdt_kernctl.mod_flags = 0;
-                       g_sdt_kernctl.mod_nenabled = 0;
+                       /* Lop off omnipresent leading underscore. */
+                       if (*name == '_')
+                               name += 1;
                        
-                       mh = &_mh_execute_header;
-                       cmd = (struct load_command*) &mh[1];
-                       for (i = 0; i < mh->ncmds; i++) {
-                               if (cmd->cmd == LC_SEGMENT_KERNEL) {
-                                       kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
-                                       
-                                       if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
-                                               orig_ts = orig_sg;
-                                       else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
-                                               orig_le = orig_sg;
-                                       else if (LIT_STRNEQL(orig_sg->segname, ""))
-                                               orig_ts = orig_sg; /* kexts have a single unnamed segment */
-                               }
-                               else if (cmd->cmd == LC_SYMTAB)
-                                       orig_st = (struct symtab_command *) cmd;
+                       if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) {
+                               sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP);
+                               int len = strlen(name) + 1;
                                
-                               cmd = (struct load_command *) ((uintptr_t) cmd + cmd->cmdsize);
-                       }
-                       
-                       if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
-                               return;
-                       
-                       sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
-                       strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
-                       
-                       for (i = 0; i < orig_st->nsyms; i++) {
-                               uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT);
-                               char *name = strings + sym[i].n_un.n_strx;
-                               const char *prev_name;
-                               unsigned long best;
-                               unsigned int j;
+                               sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP);
+                               strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */
                                
-                               /* Check that the symbol is a global and that it has a name. */
-                               if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
-                                       continue;
+                               prev_name = "<unknown>";
+                               best = 0;
                                
-                               if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */
-                                       continue;
-                               
-                               /* Lop off omnipresent leading underscore. */
-                               if (*name == '_')
-                                       name += 1;
-                               
-                               if (strncmp(name, DTRACE_PROBE_PREFIX, sizeof(DTRACE_PROBE_PREFIX) - 1) == 0) {
-                                       sdt_probedesc_t *sdpd = kmem_alloc(sizeof(sdt_probedesc_t), KM_SLEEP);
-                                       int len = strlen(name) + 1;
-                                       
-                                       sdpd->sdpd_name = kmem_alloc(len, KM_SLEEP);
-                                       strncpy(sdpd->sdpd_name, name, len); /* NUL termination is ensured. */
-                                       
-                                       prev_name = "<unknown>";
-                                       best = 0;
-                                       
-                                       /*
-                                        * Find the symbol immediately preceding the sdt probe site just discovered,
-                                        * that symbol names the function containing the sdt probe.
-                                        */
-                                       for (j = 0; j < orig_st->nsyms; j++) {
-                                               uint8_t jn_type = sym[j].n_type & N_TYPE;
-                                               char *jname = strings + sym[j].n_un.n_strx;
-                                               
-                                               if ((N_SECT != jn_type && N_ABS != jn_type))
-                                                       continue;
-                                               
-                                               if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */
-                                                       continue;
-                                               
-                                               if (*jname == '_')
-                                                       jname += 1;
-                                               
-                                               if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value)
-                                                       continue;
-                                               
-                                               if ((unsigned long)sym[j].n_value > best) {
-                                                       best = (unsigned long)sym[j].n_value;
-                                                       prev_name = jname;
-                                               }
+                               /*
+                                * Find the symbol immediately preceding the sdt probe site just discovered,
+                                * that symbol names the function containing the sdt probe.
+                                */
+                               for (j = 0; j < orig_st->nsyms; j++) {
+                                       uint8_t jn_type = sym[j].n_type & N_TYPE;
+                                       char *jname = strings + sym[j].n_un.n_strx;
+
+                                       if ((N_SECT != jn_type && N_ABS != jn_type))
+                                               continue;
+
+                                       if (0 == sym[j].n_un.n_strx) /* iff a null, "", name. */
+                                               continue;
+
+                                       if (*jname == '_')
+                                               jname += 1;
+
+                                       if (*(unsigned long *)sym[i].n_value <= (unsigned long)sym[j].n_value)
+                                               continue;
+
+                                       if ((unsigned long)sym[j].n_value > best) {
+                                               best = (unsigned long)sym[j].n_value;
+                                               prev_name = jname;
                                        }
-                                       
-                                       sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP);
-                                       strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. */
-                                       
-                                       sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value;
+                               }
+
+                               sdpd->sdpd_func = kmem_alloc((len = strlen(prev_name) + 1), KM_SLEEP);
+                               strncpy(sdpd->sdpd_func, prev_name, len); /* NUL termination is ensured. */
+                               
+                               sdpd->sdpd_offset = *(unsigned long *)sym[i].n_value;
 #if defined(__arm__)
-                                       /* PR8353094 - mask off thumb-bit */
-                                       sdpd->sdpd_offset &= ~0x1U;
+                               /* PR8353094 - mask off thumb-bit */
+                               sdpd->sdpd_offset &= ~0x1U;
 #elif defined(__arm64__)
-                                       sdpd->sdpd_offset &= ~0x1LU;
+                               sdpd->sdpd_offset &= ~0x1LU;
 #endif  /* __arm__ */
 
 #if 0
-                                       printf("sdt_init: sdpd_offset=0x%lx, n_value=0x%lx, name=%s\n",
-                                           sdpd->sdpd_offset,  *(unsigned long *)sym[i].n_value, name);
+                               printf("sdt_init: sdpd_offset=0x%lx, n_value=0x%lx, name=%s\n",
+                                   sdpd->sdpd_offset,  *(unsigned long *)sym[i].n_value, name);
 #endif
 
-                                       sdpd->sdpd_next = g_sdt_mach_module.sdt_probes;
-                                       g_sdt_mach_module.sdt_probes = sdpd;
-                               } else {
-                                       prev_name = name;
-                               }
+                               sdpd->sdpd_next = g_sdt_mach_module.sdt_probes;
+                               g_sdt_mach_module.sdt_probes = sdpd;
+                       } else {
+                               prev_name = name;
                        }
                }
+       }
+}
+
+void sdt_init( void )
+{
+       int majdevno = cdevsw_add(SDT_MAJOR, &sdt_cdevsw);
                
-               sdt_attach( (dev_info_t *)(uintptr_t)majdevno, DDI_ATTACH );
-               
-               gSDTInited = 1;
-       } else
-               panic("sdt_init: called twice!\n");
+       if (majdevno < 0) {
+               printf("sdt_init: failed to allocate a major number!\n");
+               return;
+       }
+
+       if (dtrace_sdt_probes_restricted()) {
+               return;
+       }
+
+       sdt_attach((dev_info_t*)(uintptr_t)majdevno);
 }
 
 #undef SDT_MAJOR
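The sdt_early_init scan above recognizes probe sites by testing each symbol name against DTRACE_PROBE_PREFIX with strncmp(name, prefix, sizeof(prefix) - 1). A standalone sketch of that prefix test, using made-up symbol names and a local PROBE_PREFIX macro that mirrors the value defined earlier in this file:

/*
 * Sketch of the prefix test used while scanning the symbol table
 * (illustrative symbol names). sizeof("literal") counts the trailing
 * NUL, so sizeof(PREFIX) - 1 is exactly the prefix length, which is
 * what strncmp needs for a "starts with" match.
 */
#include <stdio.h>
#include <string.h>

#define PROBE_PREFIX "_dtrace_probe$"

static int
has_probe_prefix(const char *name)
{
        return strncmp(name, PROBE_PREFIX, sizeof(PROBE_PREFIX) - 1) == 0;
}

int main(void)
{
        const char *syms[] = {
                "_dtrace_probe$proc$exec",
                "fbt_provide_module",
        };

        for (unsigned i = 0; i < sizeof(syms) / sizeof(syms[0]); i++)
                printf("%-28s -> %s\n", syms[i],
                    has_probe_prefix(syms[i]) ? "probe site" : "ordinary symbol");
        return 0;
}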
index 3fc2b9aa097c390c131d842346eeefd1f86b43b9..03174ee080d89d548fe67aba329820a09c3c3dfb 100644 (file)
@@ -136,6 +136,7 @@ sdt_argdesc_t sdt_args[] = {
        { "proc", "exec-failure", 0, 0, "int", NULL },
        /* proc:::exec-success has no arguments */
        { "proc", "exit", 0, 0, "int", NULL },
+       { "proc", "exited", 0, 0, "struct proc *", "psinfo_t *"},
        { "proc", "fault", 0, 0, "int", NULL },
        { "proc", "fault", 1, 1, "siginfo_t *", NULL },
        { "proc", "lwp-create", 0, 0, "struct thread *", "lwpsinfo_t *" },
index 9c0c34f6317dfd79eb05e2ac4fec39ce64bef188..10ba834332579255c66486cdb947416cee8538d7 100644 (file)
@@ -376,7 +376,6 @@ dtrace_systrace_syscall_return(unsigned short code, int rval, int *rv)
 #error 1 << SYSTRACE_SHIFT must exceed number of system calls
 #endif
 
-static dev_info_t *systrace_devi;
 static dtrace_provider_id_t systrace_id;
 
 /*
@@ -532,31 +531,22 @@ static dtrace_pattr_t systrace_attr = {
 };
 
 static dtrace_pops_t systrace_pops = {
-       systrace_provide,
-       NULL,
-       systrace_enable,
-       systrace_disable,
-       NULL,
-       NULL,
-       systrace_getargdesc,
-       systrace_getargval,
-       NULL,
-       systrace_destroy
+       .dtps_provide =         systrace_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          systrace_enable,
+       .dtps_disable =         systrace_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      systrace_getargdesc,
+       .dtps_getargval =       systrace_getargval,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         systrace_destroy
 };
 
 static int
-systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+systrace_attach(dev_info_t *devi)
 {
-       switch (cmd) {
-       case DDI_ATTACH:
-               break;
-       case DDI_RESUME:
-               return (DDI_SUCCESS);
-       default:
-               return (DDI_FAILURE);
-       }
-
-       systrace_probe = (void(*))&dtrace_probe;
+       systrace_probe = (void*)&dtrace_probe;
        membar_enter();
 
        if (ddi_create_minor_node(devi, "systrace", S_IFCHR, 0,
@@ -568,9 +558,6 @@ systrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
                return (DDI_FAILURE);
        }
 
-       ddi_report_dev(devi);
-       systrace_devi = devi;
-
        return (DDI_SUCCESS);
 }
 
@@ -657,7 +644,6 @@ void (*machtrace_probe)(dtrace_id_t, uint64_t, uint64_t,
 
 static uint64_t machtrace_getarg(void *, dtrace_id_t, void *, int, int);       
 
-static dev_info_t *machtrace_devi;
 static dtrace_provider_id_t machtrace_id;
 
 static kern_return_t
@@ -897,30 +883,21 @@ static dtrace_pattr_t machtrace_attr = {
 };
 
 static dtrace_pops_t machtrace_pops = {
-       machtrace_provide,
-       NULL,
-       machtrace_enable,
-       machtrace_disable,
-       NULL,
-       NULL,
-       NULL,
-       machtrace_getarg,
-       NULL,
-       machtrace_destroy
+       .dtps_provide =         machtrace_provide,
+       .dtps_provide_module =  NULL,
+       .dtps_enable =          machtrace_enable,
+       .dtps_disable =         machtrace_disable,
+       .dtps_suspend =         NULL,
+       .dtps_resume =          NULL,
+       .dtps_getargdesc =      NULL,
+       .dtps_getargval =       machtrace_getarg,
+       .dtps_usermode =        NULL,
+       .dtps_destroy =         machtrace_destroy
 };
 
 static int
-machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
+machtrace_attach(dev_info_t *devi)
 {
-       switch (cmd) {
-               case DDI_ATTACH:
-                       break;
-               case DDI_RESUME:
-                       return (DDI_SUCCESS);
-               default:
-                       return (DDI_FAILURE);
-       }
-
        machtrace_probe = dtrace_probe;
        membar_enter();
        
@@ -928,14 +905,11 @@ machtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
                                DDI_PSEUDO, 0) == DDI_FAILURE ||
                        dtrace_register("mach_trap", &machtrace_attr, DTRACE_PRIV_USER, NULL,
                                &machtrace_pops, NULL, &machtrace_id) != 0) {
-                machtrace_probe = (void (*))&systrace_stub;
+                machtrace_probe = (void*)&systrace_stub;
                ddi_remove_minor_node(devi, NULL);
                return (DDI_FAILURE);
        }
 
-       ddi_report_dev(devi);
-       machtrace_devi = devi;
-
        return (DDI_SUCCESS);
 }
 
@@ -971,31 +945,23 @@ static struct cdevsw systrace_cdevsw =
        0                                       /* type */
 };
 
-static int gSysTraceInited = 0;
-
 void systrace_init( void );
 
 void systrace_init( void )
 {
-       if (0 == gSysTraceInited) {
-               if (dtrace_sdt_probes_restricted()) {
-                       return;
-               }
-
-               int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
+       if (dtrace_sdt_probes_restricted()) {
+               return;
+       }
 
-               if (majdevno < 0) {
-                       printf("systrace_init: failed to allocate a major number!\n");
-                       gSysTraceInited = 0;
-                       return;
-               }
+       int majdevno = cdevsw_add(SYSTRACE_MAJOR, &systrace_cdevsw);
 
-               systrace_attach( (dev_info_t    *)(uintptr_t)majdevno, DDI_ATTACH );
-               machtrace_attach( (dev_info_t   *)(uintptr_t)majdevno, DDI_ATTACH );
+       if (majdevno < 0) {
+               printf("systrace_init: failed to allocate a major number!\n");
+               return;
+       }
 
-               gSysTraceInited = 1;
-       } else
-               panic("systrace_init: called twice!\n");
+       systrace_attach((dev_info_t*)(uintptr_t)majdevno);
+       machtrace_attach((dev_info_t*)(uintptr_t)majdevno);
 }
 #undef SYSTRACE_MAJOR
 
@@ -1012,7 +978,7 @@ systrace_getargval(void *arg, dtrace_id_t id, void *parg, int argno, int aframes
                uargs = uthread->t_dtrace_syscall_args;
        if (!uargs)
                return(0);
-       if (argno < 0 || argno > SYSTRACE_NARGS)
+       if (argno < 0 || argno >= SYSTRACE_NARGS)
                return(0);
 
        DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
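
The machtrace_pops hunk above replaces the positional initializer with C99 designated initializers, so a reordered or newly added callback slot in dtrace_pops_t can no longer silently shift every entry; the systrace_getargval hunk also tightens the index check from '>' to '>=', since valid argument indices run 0 through SYSTRACE_NARGS - 1. A minimal sketch of the designated-initializer idiom, with the struct and function names invented for illustration (not taken from xnu):

    /* Hypothetical provider-ops table, similar in shape to dtrace_pops_t. */
    struct provider_ops {
        void (*ops_provide)(void *arg);
        int  (*ops_enable)(void *arg, int id);
        void (*ops_disable)(void *arg, int id);
        void (*ops_destroy)(void *arg, int id);
    };

    static void demo_provide(void *arg) { (void)arg; }
    static int  demo_enable(void *arg, int id) { (void)arg; (void)id; return 0; }
    static void demo_disable(void *arg, int id) { (void)arg; (void)id; }
    static void demo_destroy(void *arg, int id) { (void)arg; (void)id; }

    /*
     * Designated initializers name each slot explicitly; any slot left out is
     * zero-filled (NULL), so optional callbacks need no positional placeholders.
     */
    static const struct provider_ops demo_ops = {
        .ops_provide = demo_provide,
        .ops_enable  = demo_enable,
        .ops_disable = demo_disable,
        .ops_destroy = demo_destroy,
    };

    int main(void) { demo_ops.ops_provide(0); return demo_ops.ops_enable(0, 1); }
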
index cb6795580b070b92481bedc2c9b473fb96f77bb6..458fc15b3625efedbea5db8f09baeecd0ca99f81 100644 (file)
@@ -183,6 +183,11 @@ dtrace_getreg(struct regs *savearea, uint_t reg)
        boolean_t is64Bit = proc_is64bit(current_proc());
        x86_saved_state_t *regs = (x86_saved_state_t *)savearea;
 
+       if (regs == NULL) {
+               DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
+               return (0);
+       }
+
        if (is64Bit) {
            if (reg <= SS) {
                reg = regmap[reg];
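
The hunk above adds an early return when dtrace_getreg() is handed a NULL register snapshot, flagging an illegal operation instead of dereferencing the null pointer. A tiny stand-alone sketch of that guard, with the state structure and fault flag made up for illustration:

    #include <stdint.h>
    #include <stdio.h>

    struct saved_regs { uint64_t gpr[16]; };   /* hypothetical saved state */
    static int cpu_illop_flag;                 /* hypothetical fault flag  */

    static uint64_t
    get_reg(const struct saved_regs *regs, unsigned int reg)
    {
        if (regs == NULL) {          /* probe fired with no register snapshot */
            cpu_illop_flag = 1;      /* record the fault instead of crashing  */
            return 0;
        }
        return reg < 16 ? regs->gpr[reg] : 0;
    }

    int
    main(void)
    {
        printf("%llu\n", (unsigned long long)get_reg(NULL, 3));  /* prints 0 */
        return 0;
    }
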
index c4ab38a31351d5240098e758f6861597a84db9f8..a5064d68898fec7f79268a987559b82b3448e591 100644 (file)
@@ -307,3 +307,9 @@ dtrace_safe_defer_signal(void)
 
        return 0;
 }
+
+void
+dtrace_flush_caches(void)
+{
+
+}
index a70039322d766d039226d1eead899b8ec66ddd24..0e9e9784979cb718d9247983e17a00c2aef394fa 100644 (file)
@@ -1384,7 +1384,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
 
                case FASTTRAP_T_COMMON:
                {
-                       user_addr_t addr;
+                       user_addr_t addr, write_addr;
                        uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
                        uint_t i = 0;
 
@@ -1428,8 +1428,9 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                         */
 
                        addr = uthread->t_dtrace_scratch->addr;
+                       write_addr = uthread->t_dtrace_scratch->write_addr;
 
-                       if (addr == 0LL) {
+                       if (addr == 0LL || write_addr == 0LL) {
                                fasttrap_sigtrap(p, uthread, pc); // Should be killing target proc
                                new_pc = pc;
                                break;
@@ -1458,7 +1459,7 @@ fasttrap_pid_probe32(x86_saved_state_t *regs)
                        
                        ASSERT(i <= sizeof (scratch));
 
-                       if (fasttrap_copyout(scratch, addr, i)) {
+                       if (fasttrap_copyout(scratch, write_addr, i)) {
                                fasttrap_sigtrap(p, uthread, pc);
                                new_pc = pc;
                                break;
@@ -1938,7 +1939,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
 
                case FASTTRAP_T_COMMON:
                {
-                       user_addr_t addr;
+                       user_addr_t addr, write_addr;
                        uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 22];
                        uint_t i = 0;
                        
@@ -2026,8 +2027,9 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
                         */
 
                        addr = uthread->t_dtrace_scratch->addr;
+                       write_addr = uthread->t_dtrace_scratch->write_addr;
 
-                       if (addr == 0LL) {
+                       if (addr == 0LL || write_addr == 0LL) {
                                fasttrap_sigtrap(p, uthread, pc); // Should be killing target proc
                                new_pc = pc;
                                break;
@@ -2117,7 +2119,7 @@ fasttrap_pid_probe64(x86_saved_state_t *regs)
 
                        ASSERT(i <= sizeof (scratch));
 
-                       if (fasttrap_copyout(scratch, addr, i)) {
+                       if (fasttrap_copyout(scratch, write_addr, i)) {
                                fasttrap_sigtrap(p, uthread, pc);
                                new_pc = pc;
                                break;
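
In the fasttrap hunks above, the per-thread scratch space now carries a separate write_addr and a missing writable alias is treated like a missing scratch page: the emulated instructions are copied out through write_addr, apparently so the traced thread can keep executing from addr while the kernel writes through a writable alias of the same backing memory. A rough user-space sketch of that dual-mapping idea under that assumption (structure and helper names are hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical per-thread scratch descriptor: one region, two mappings. */
    struct scratch_space {
        uint64_t addr;        /* address the thread will execute from      */
        uint64_t write_addr;  /* writable alias of the same backing memory */
    };

    /* Stand-in for copyout(): returns 0 on success. */
    static int
    copy_to_user(const void *src, uint64_t dst, size_t len)
    {
        (void)src; (void)dst; (void)len;
        return 0;
    }

    static int
    emit_emulated_instrs(const struct scratch_space *s,
        const uint8_t *instrs, size_t len, uint64_t *exec_pc)
    {
        /* Refuse to continue if either mapping is missing. */
        if (s->addr == 0 || s->write_addr == 0)
            return -1;

        /* Write through the writable alias... */
        if (copy_to_user(instrs, s->write_addr, len) != 0)
            return -1;

        /* ...and resume execution at the executable mapping. */
        *exec_pc = s->addr;
        return 0;
    }

    int
    main(void)
    {
        struct scratch_space s = { 0x1000, 0x2000 };
        uint8_t nop = 0x90;
        uint64_t pc = 0;
        return emit_emulated_instrs(&s, &nop, 1, &pc);
    }
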
index 2ff70daee44c80274941aad1a96482873af09597..63d1a843015983165b4ac9b8235ec3a54345b356 100644 (file)
@@ -36,7 +36,7 @@
 #include <kern/thread.h>
 #include <mach/thread_status.h>
 #include <mach/vm_param.h>
-#include <mach-o/loader.h> 
+#include <mach-o/loader.h>
 #include <mach-o/nlist.h>
 #include <libkern/kernel_mach_header.h>
 #include <libkern/OSAtomic.h>
@@ -110,7 +110,7 @@ int
 fbt_invop(uintptr_t addr, uintptr_t *state, uintptr_t rval)
 {
        fbt_probe_t *fbt = fbt_probetab[FBT_ADDR2NDX(addr)];
-       
+
        for (; fbt != NULL; fbt = fbt->fbtp_hashnext) {
                if ((uintptr_t)fbt->fbtp_patchpoint == addr) {
 
@@ -180,7 +180,7 @@ fbt_perfCallback(
                        "_dtrace_invop_callsite_post:\n"
                        "  .quad Ldtrace_invop_callsite_post_label\n"
                        ".text\n"
-                                );             
+                                );
 
                switch (emul) {
                case DTRACE_INVOP_NOP:
@@ -198,7 +198,7 @@ fbt_perfCallback(
                case DTRACE_INVOP_LEAVE:
 /*
  * Emulate first micro-op of patched leave: mov %rbp,%rsp
- * fp points just below the return address slot for target's ret 
+ * fp points just below the return address slot for target's ret
  * and at the slot holding the frame pointer saved by the target's prologue.
  */
                        fp = saved_state->rbp;
@@ -247,7 +247,7 @@ fbt_perfCallback(
 
                        retval = KERN_SUCCESS;
                        break;
-                       
+
                default:
                        retval = KERN_FAILURE;
                        break;
@@ -263,7 +263,7 @@ fbt_perfCallback(
 }
 
 void
-fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, char *modname, char* symbolName, machine_inst_t* symbolStart)
+fbt_provide_probe(struct modctl *ctl, const char *modname, const char* symbolName, machine_inst_t* symbolStart, machine_inst_t* instrHigh)
 {
        unsigned int                    j;
        unsigned int                    doenable = 0;
@@ -272,37 +272,36 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        fbt_probe_t *newfbt, *retfbt, *entryfbt;
        machine_inst_t *instr, *limit, theInstr, i1, i2, i3;
        int size;
-               
+
        /*
         * Guard against null symbols
         */
-       if (!symbolStart || !instrLow || !instrHigh) {
+       if (!symbolStart || !instrHigh || instrHigh < symbolStart) {
                kprintf("dtrace: %s has an invalid address\n", symbolName);
                return;
        }
 
        for (j = 0, instr = symbolStart, theInstr = 0;
-            (j < 4) && ((uintptr_t)instr >= instrLow) && (instrHigh > (uintptr_t)(instr + 2)); 
-            j++) {
+            (j < 4) && (instrHigh > (instr + 2)); j++) {
                theInstr = instr[0];
                if (theInstr == FBT_PUSH_RBP || theInstr == FBT_RET || theInstr == FBT_RET_IMM16)
                        break;
-               
+
                if ((size = dtrace_instr_size(instr)) <= 0)
                        break;
-               
+
                instr += size;
        }
-       
+
        if (theInstr != FBT_PUSH_RBP)
                return;
-       
+
        i1 = instr[1];
        i2 = instr[2];
        i3 = instr[3];
-       
+
        limit = (machine_inst_t *)instrHigh;
-       
+
        if (i1 == FBT_REX_RSP_RBP && i2 == FBT_MOV_RSP_RBP0 && i3 == FBT_MOV_RSP_RBP1) {
                instr += 1; /* Advance to the mov %rsp,%rbp */
                theInstr = i1;
@@ -319,26 +318,26 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
                 * 000006d8        pushl   %ebp
                 * 000006d9        movl    $0x00000004,%edx
                 * 000006de        movl    %esp,%ebp
-                * 
+                *
                 * Try the next instruction, to see if it is a movl %esp,%ebp
                 */
-               
+
                instr += 1; /* Advance past the pushl %ebp */
                if ((size = dtrace_instr_size(instr)) <= 0)
                        return;
-               
+
                instr += size;
-               
+
                if ((instr + 1) >= limit)
                        return;
-               
+
                i1 = instr[0];
                i2 = instr[1];
-               
+
                if (!(i1 == FBT_MOVL_ESP_EBP0_V0 && i2 == FBT_MOVL_ESP_EBP1_V0) &&
                    !(i1 == FBT_MOVL_ESP_EBP0_V1 && i2 == FBT_MOVL_ESP_EBP1_V1))
                        return;
-               
+
                /* instr already points at the movl %esp,%ebp */
                theInstr = i1;
        }
@@ -346,7 +345,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_ENTRY);
        newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
-       
+
        if (thisid != 0) {
                /*
                 * The dtrace_probe previously existed, so we have to hook
@@ -360,13 +359,13 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
                for(; entryfbt != NULL; entryfbt = entryfbt->fbtp_next) {
                        if (entryfbt->fbtp_currentval == entryfbt->fbtp_patchval)
                                doenable++;
-                       
+
                        if (entryfbt->fbtp_next == NULL) {
                                entryfbt->fbtp_next = newfbt;
                                newfbt->fbtp_id = entryfbt->fbtp_id;
                                break;
                        }
-               }                   
+               }
        }
        else {
                /*
@@ -377,7 +376,7 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
                newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname, symbolName, FBT_ENTRY, FBT_AFRAMES_ENTRY, newfbt);
                doenable = 0;
        }
-       
+
        newfbt->fbtp_patchpoint = instr;
        newfbt->fbtp_ctl = ctl;
        newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
@@ -387,18 +386,18 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
        newfbt->fbtp_currentval = 0;
        newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(instr)];
        fbt_probetab[FBT_ADDR2NDX(instr)] = newfbt;
-       
+
        if (doenable)
                fbt_enable(NULL, newfbt->fbtp_id, newfbt);
-       
+
        /*
         * The fbt entry chain is in place, one entry point per symbol.
         * The fbt return chain can have multiple return points per symbol.
         * Here we find the end of the fbt return chain.
         */
-       
+
        doenable=0;
-       
+
        thisid = dtrace_probe_lookup(fbt_id, modname, symbolName, FBT_RETURN);
        if (thisid != 0) {
                /* The dtrace_probe previously existed, so we have to
@@ -420,11 +419,11 @@ fbt_provide_probe(struct modctl *ctl, uintptr_t instrLow, uintptr_t instrHigh, c
                doenable = 0;
                retfbt = NULL;
        }
-       
+
 again:
        if (instr >= limit)
                return;
-       
+
        /*
         * If this disassembly fails, then we've likely walked off into
         * a jump table or some other unsuitable area.  Bail out of the
@@ -432,7 +431,7 @@ again:
         */
        if ((size = dtrace_instr_size(instr)) <= 0)
                return;
-       
+
        /*
         * We (desperately) want to avoid erroneously instrumenting a
         * jump table, especially given that our markers are pretty
@@ -447,66 +446,66 @@ again:
        for (j = 0; j < sizeof (uintptr_t); j++) {
                uintptr_t check = (uintptr_t)instr - j;
                uint8_t *ptr;
-               
+
                if (check < (uintptr_t)symbolStart)
                        break;
-               
+
                if (check + sizeof (uintptr_t) > (uintptr_t)limit)
                        continue;
-               
+
                ptr = *(uint8_t **)check;
-               
+
                if (ptr >= (uint8_t *)symbolStart && ptr < limit) {
                        instr += size;
                        goto again;
                }
        }
-       
+
        /*
         * OK, it's an instruction.
         */
        theInstr = instr[0];
-       
+
        /* Walked onto the start of the next routine? If so, bail out of this function. */
        if (theInstr == FBT_PUSH_RBP)
                return;
-       
+
        if (!(size == 1 && (theInstr == FBT_POP_RBP || theInstr == FBT_LEAVE))) {
                instr += size;
                goto again;
        }
-       
+
        /*
         * Found the pop %rbp; or leave.
         */
        machine_inst_t *patch_instr = instr;
-       
+
        /*
         * Scan forward for a "ret", or "jmp".
         */
        instr += size;
        if (instr >= limit)
                return;
-       
+
        size = dtrace_instr_size(instr);
        if (size <= 0) /* Failed instruction decode? */
                return;
-       
+
        theInstr = instr[0];
-       
+
        if (!(size == FBT_RET_LEN && (theInstr == FBT_RET)) &&
            !(size == FBT_RET_IMM16_LEN && (theInstr == FBT_RET_IMM16)) &&
            !(size == FBT_JMP_SHORT_REL_LEN && (theInstr == FBT_JMP_SHORT_REL)) &&
            !(size == FBT_JMP_NEAR_REL_LEN && (theInstr == FBT_JMP_NEAR_REL)) &&
            !(size == FBT_JMP_FAR_ABS_LEN && (theInstr == FBT_JMP_FAR_ABS)))
                return;
-       
+
        /*
         * pop %rbp; ret; or leave; ret; or leave; jmp tailCalledFun; -- We have a winner!
         */
        newfbt = kmem_zalloc(sizeof (fbt_probe_t), KM_SLEEP);
        strlcpy( (char *)&(newfbt->fbtp_name), symbolName, MAX_FBTP_NAME_CHARS );
-       
+
        if (retfbt == NULL) {
                newfbt->fbtp_id = dtrace_probe_create(fbt_id, modname,
                                                      symbolName, FBT_RETURN, FBT_AFRAMES_RETURN, newfbt);
@@ -514,12 +513,12 @@ again:
                retfbt->fbtp_next = newfbt;
                newfbt->fbtp_id = retfbt->fbtp_id;
        }
-       
+
        retfbt = newfbt;
        newfbt->fbtp_patchpoint = patch_instr;
        newfbt->fbtp_ctl = ctl;
        newfbt->fbtp_loadcnt = ctl->mod_loadcnt;
-       
+
        if (*patch_instr == FBT_POP_RBP) {
                newfbt->fbtp_rval = DTRACE_INVOP_POP_RBP;
        } else {
@@ -528,87 +527,16 @@ again:
        }
        newfbt->fbtp_roffset =
        (uintptr_t)(patch_instr - (uint8_t *)symbolStart);
-       
+
        newfbt->fbtp_savedval = *patch_instr;
        newfbt->fbtp_patchval = FBT_PATCHVAL;
        newfbt->fbtp_hashnext = fbt_probetab[FBT_ADDR2NDX(patch_instr)];
        fbt_probetab[FBT_ADDR2NDX(patch_instr)] = newfbt;
-       
+
        if (doenable)
                fbt_enable(NULL, newfbt->fbtp_id, newfbt);
-       
+
        instr += size;
        goto again;
 }
 
-void
-fbt_provide_module_kernel_syms(struct modctl *ctl)
-{
-       kernel_mach_header_t            *mh;
-       struct load_command             *cmd;
-       kernel_segment_command_t        *orig_ts = NULL, *orig_le = NULL;
-       struct symtab_command           *orig_st = NULL;
-       kernel_nlist_t                  *sym = NULL;
-       char                            *strings;
-       uintptr_t                       instrLow, instrHigh;
-       char                            *modname;
-       unsigned int                    i;
-       
-       mh = (kernel_mach_header_t *)(ctl->mod_address);
-       modname = ctl->mod_modname;
-       
-       if (mh->magic != MH_MAGIC_KERNEL)
-               return;
-       
-       cmd = (struct load_command *) &mh[1];
-       for (i = 0; i < mh->ncmds; i++) {
-               if (cmd->cmd == LC_SEGMENT_KERNEL) {
-                       kernel_segment_command_t *orig_sg = (kernel_segment_command_t *) cmd;
-                       
-                       if (LIT_STRNEQL(orig_sg->segname, SEG_TEXT))
-                               orig_ts = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, SEG_LINKEDIT))
-                               orig_le = orig_sg;
-                       else if (LIT_STRNEQL(orig_sg->segname, ""))
-                               orig_ts = orig_sg; /* kexts have a single unnamed segment */
-               }
-               else if (cmd->cmd == LC_SYMTAB)
-                       orig_st = (struct symtab_command *) cmd;
-               
-               cmd = (struct load_command *) ((caddr_t) cmd + cmd->cmdsize);
-       }
-       
-       if ((orig_ts == NULL) || (orig_st == NULL) || (orig_le == NULL))
-               return;
-       
-       sym = (kernel_nlist_t *)(orig_le->vmaddr + orig_st->symoff - orig_le->fileoff);
-       strings = (char *)(orig_le->vmaddr + orig_st->stroff - orig_le->fileoff);
-       
-       /* Find extent of the TEXT section */
-       instrLow = (uintptr_t)orig_ts->vmaddr;
-       instrHigh = (uintptr_t)(orig_ts->vmaddr + orig_ts->vmsize);
-       
-       for (i = 0; i < orig_st->nsyms; i++) {
-               uint8_t n_type = sym[i].n_type & (N_TYPE | N_EXT);
-               char *name = strings + sym[i].n_un.n_strx;
-               
-               /* Check that the symbol is a global and that it has a name. */
-               if (((N_SECT | N_EXT) != n_type && (N_ABS | N_EXT) != n_type))
-                       continue;
-               
-               if (0 == sym[i].n_un.n_strx) /* iff a null, "", name. */
-                       continue;
-               
-               /* Lop off omnipresent leading underscore. */                   
-               if (*name == '_')
-                       name += 1;
-               
-               /*
-                * We're only blacklisting functions in the kernel for now.
-                */
-               if (MOD_IS_MACH_KERNEL(ctl) && fbt_excluded(name))
-                       continue;
-               
-               fbt_provide_probe(ctl, instrLow, instrHigh, modname, name, (machine_inst_t*)sym[i].n_value);
-       }
-}
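
The back-scan loop above ("We (desperately) want to avoid erroneously instrumenting a jump table") checks whether any pointer-sized word overlapping the candidate instruction points back into the function; if so, the bytes are treated as table data rather than code and the scan moves on. A compact user-space rendering of that heuristic, with the function and variable names invented and memcpy used in place of the kernel's direct dereference:

    #include <stdint.h>
    #include <stdbool.h>
    #include <string.h>
    #include <stdio.h>

    /*
     * Return true if any pointer-sized word overlapping 'candidate' holds a
     * value that points back into [func_start, func_end), which suggests the
     * bytes are jump-table entries rather than an instruction.
     */
    static bool
    looks_like_jump_table(const uint8_t *candidate,
        const uint8_t *func_start, const uint8_t *func_end)
    {
        for (size_t j = 0; j < sizeof(uintptr_t); j++) {
            const uint8_t *check = candidate - j;
            uintptr_t word;

            if (check < func_start)
                break;                    /* ran off the front of the function */
            if (check + sizeof(uintptr_t) > func_end)
                continue;                 /* word would read past the function */

            memcpy(&word, check, sizeof(word));
            if (word >= (uintptr_t)func_start && word < (uintptr_t)func_end)
                return true;              /* word points back into the function */
        }
        return false;
    }

    int
    main(void)
    {
        uint8_t fake_func[64] = { 0 };
        uintptr_t back_ref = (uintptr_t)&fake_func[8];

        /* Plant a jump-table-like entry pointing back into the buffer. */
        memcpy(&fake_func[32], &back_ref, sizeof(back_ref));

        printf("offset 32 looks like a table: %d\n",
            looks_like_jump_table(&fake_func[32], fake_func, fake_func + 64));
        printf("offset 4 looks like a table:  %d\n",
            looks_like_jump_table(&fake_func[4], fake_func, fake_func + 64));
        return 0;
    }
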
index 3a8d5dffa6d9f441e3e3f66fb12f318070b57ec6..1fa041589829caad1d1132ca6640d8e5d294f3cc 100644 (file)
@@ -34,7 +34,6 @@
 #include <kern/debug.h>
 #include <mach/machine/thread_status.h>
 #include <mach/thread_act.h>
-#include <mach/branch_predicates.h>
 
 #include <sys/kernel.h>
 #include <sys/vm.h>
index 1c271607f6423ded1d1c3b6a1b8624e30e3f837f..88e615b8ba4130a83b7b96373d781126cf599b4d 100644 (file)
@@ -46,6 +46,9 @@
 #include <sys/sysent.h>
 #include <sys/ucontext.h>
 #include <sys/wait.h>
+
+#include <sys/ux_exception.h>
+
 #include <mach/thread_act.h>   /* for thread_abort_safely */
 #include <mach/thread_status.h>        
 
@@ -62,8 +65,6 @@
 
 
 /* Forward: */
-extern boolean_t machine_exception(int, mach_exception_code_t, 
-               mach_exception_subcode_t, int *, mach_exception_subcode_t *);
 extern kern_return_t thread_getstatus(thread_t act, int flavor,
                        thread_state_t tstate, mach_msg_type_number_t *count);
 extern kern_return_t thread_setstatus(thread_t thread, int flavor,
@@ -99,6 +100,7 @@ struct sigframe32 {
        int             sig;
        user32_addr_t   sinfo;  /* siginfo32_t* */
        user32_addr_t   uctx;   /* struct ucontext32 */
+       user32_addr_t   token;
 };
 
 /*
@@ -190,6 +192,8 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
        int stack_size = 0;
        int infostyle = UC_TRAD;
        xstate_t        sig_xstate;
+       user_addr_t     token_uctx;
+       kern_return_t   kr;
 
        thread = current_thread();
        ut = get_bsdthread_info(thread);
@@ -216,6 +220,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
        if (proc_is64bit(p)) {
                x86_thread_state64_t    *tstate64;
                struct user_ucontext64  uctx64;
+               user64_addr_t token;
 
                flavor = x86_THREAD_STATE64;
                state_count = x86_THREAD_STATE64_COUNT;
@@ -273,6 +278,14 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
                 */
                ua_fp -= sizeof(user_addr_t);
 
+               /*
+                * Generate the validation token for sigreturn
+                */
+               token_uctx = ua_uctxp;
+               kr = machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx);
+               assert(kr == KERN_SUCCESS);
+               token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token;
+
                /*
                 * Build the signal context to be used by sigreturn.
                 */
@@ -318,11 +331,12 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
                tstate64->rdx = sig;
                tstate64->rcx = ua_sip;
                tstate64->r8  = ua_uctxp;
-
+               tstate64->r9  = token;
        } else {
                x86_thread_state32_t    *tstate32;
                struct user_ucontext32  uctx32;
                struct sigframe32       frame32;
+               user32_addr_t token;
 
                flavor = x86_THREAD_STATE32;
                state_count = x86_THREAD_STATE32_COUNT;
@@ -380,6 +394,15 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
                 */
                ua_fp -= sizeof(frame32.retaddr);
 
+               /*
+                * Generate the validation token for sigreturn
+                */
+               token_uctx = ua_uctxp;
+               kr = machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx);
+               assert(kr == KERN_SUCCESS);
+               token = CAST_DOWN_EXPLICIT(user32_addr_t, token_uctx) ^
+                               CAST_DOWN_EXPLICIT(user32_addr_t, ps->ps_sigreturn_token);
+
                /* 
                 * Build the argument list for the signal handler.
                 * Handler should call sigreturn to get out of it
@@ -390,6 +413,7 @@ sendsig(struct proc *p, user_addr_t ua_catcher, int sig, int mask, __unused uint
                frame32.catcher = CAST_DOWN_EXPLICIT(user32_addr_t, ua_catcher);
                frame32.sinfo = CAST_DOWN_EXPLICIT(user32_addr_t, ua_sip);
                frame32.uctx = CAST_DOWN_EXPLICIT(user32_addr_t, ua_uctxp);
+               frame32.token = token;
 
                if (copyout((caddr_t)&frame32, ua_fp, sizeof (frame32))) 
                        goto bad;
@@ -674,6 +698,7 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
 
        thread_t thread = current_thread();
        struct uthread * ut;
+       struct sigacts *ps = p->p_sigacts;
        int     error;
        int     onstack = 0;
 
@@ -685,6 +710,9 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
        void                *  fs;
        int                    rval = EJUSTRETURN;
        xstate_t               sig_xstate;
+       uint32_t            sigreturn_validation;
+       user_addr_t         token_uctx;
+       kern_return_t       kr;
 
        ut = (struct uthread *)get_bsdthread_info(thread);
 
@@ -704,8 +732,15 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
 
        sig_xstate = current_xstate();
 
+       sigreturn_validation = atomic_load_explicit(
+                       &ps->ps_sigreturn_validation, memory_order_relaxed);
+       token_uctx = uap->uctx;
+       kr = machine_thread_siguctx_pointer_convert_to_user(thread, &token_uctx);
+       assert(kr == KERN_SUCCESS);
+
        if (proc_is64bit(p)) {
                struct user_ucontext64  uctx64;
+               user64_addr_t token;
 
                if ((error = copyin(uap->uctx, (void *)&uctx64, sizeof (uctx64))))
                        return(error);
@@ -724,8 +759,19 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
                fs_count  = thread_state64[sig_xstate].state_count;
                fs = (void *)&mctxp->mctx_avx64.fs;
 
+               token = (user64_addr_t)token_uctx ^ (user64_addr_t)ps->ps_sigreturn_token;
+               if ((user64_addr_t)uap->token != token) {
+#if DEVELOPMENT || DEBUG
+                       printf("process %s[%d] sigreturn token mismatch: received 0x%llx expected 0x%llx\n",
+                                       p->p_comm, p->p_pid, (user64_addr_t)uap->token, token);
+#endif /* DEVELOPMENT || DEBUG */
+                       if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) {
+                               rval = EINVAL;
+                       }
+               }
       } else {
                struct user_ucontext32  uctx32;
+               user32_addr_t token;
 
                if ((error = copyin(uap->uctx, (void *)&uctx32, sizeof (uctx32)))) 
                        return(error);
@@ -743,6 +789,18 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
                fs_flavor = thread_state32[sig_xstate].flavor;
                fs_count  = thread_state32[sig_xstate].state_count;
                fs = (void *)&mctxp->mctx_avx32.fs;
+
+               token = CAST_DOWN_EXPLICIT(user32_addr_t, uap->uctx) ^
+                               CAST_DOWN_EXPLICIT(user32_addr_t, ps->ps_sigreturn_token);
+               if ((user32_addr_t)uap->token != token) {
+#if DEVELOPMENT || DEBUG
+                       printf("process %s[%d] sigreturn token mismatch: received 0x%x expected 0x%x\n",
+                                       p->p_comm, p->p_pid, (user32_addr_t)uap->token, token);
+#endif /* DEVELOPMENT || DEBUG */
+                       if (sigreturn_validation != PS_SIGRETURN_VALIDATION_DISABLED) {
+                               rval = EINVAL;
+                       }
+               }
        }
 
        if (onstack)
@@ -752,12 +810,21 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
 
        if (ut->uu_siglist & ~ut->uu_sigmask)
                signal_setast(thread);
+
+       if (rval == EINVAL) {
+               goto error_ret;
+       }
+
        /*
         * thread_set_state() does all the needed checks for the passed in
         * content
         */
        if (thread_setstatus(thread, ts_flavor, ts, ts_count) != KERN_SUCCESS) {
                rval = EINVAL;
+#if DEVELOPMENT || DEBUG
+               printf("process %s[%d] sigreturn thread_setstatus error %d\n",
+                               p->p_comm, p->p_pid, rval);
+#endif /* DEVELOPMENT || DEBUG */
                goto error_ret;
        }
        
@@ -765,6 +832,10 @@ sigreturn(struct proc *p, struct sigreturn_args *uap, __unused int *retval)
 
        if (thread_setstatus(thread, fs_flavor, fs, fs_count)  != KERN_SUCCESS) {
                rval = EINVAL;
+#if DEVELOPMENT || DEBUG
+               printf("process %s[%d] sigreturn thread_setstatus error %d\n",
+                               p->p_comm, p->p_pid, rval);
+#endif /* DEVELOPMENT || DEBUG */
                goto error_ret;
 
        }
@@ -774,55 +845,39 @@ error_ret:
 
 
 /*
- * machine_exception() performs MD translation
- * of a mach exception to a unix signal and code.
+ * machine_exception() performs machine-dependent translation
+ * of a mach exception to a unix signal.
  */
-
-boolean_t
-machine_exception(
-       int                             exception,
-       mach_exception_code_t           code,
-       __unused mach_exception_subcode_t subcode,
-       int                             *unix_signal,
-       mach_exception_code_t           *unix_code)
+int
+machine_exception(int                           exception,
+                  mach_exception_code_t         code,
+         __unused mach_exception_subcode_t      subcode)
 {
-
        switch(exception) {
-
-       case EXC_BAD_ACCESS:
-               /* Map GP fault to SIGSEGV, otherwise defer to caller */
-               if (code == EXC_I386_GPFLT) {
-                       *unix_signal = SIGSEGV;
-                       *unix_code = code;
+               case EXC_BAD_ACCESS:
+                       /* Map GP fault to SIGSEGV, otherwise defer to caller */
+                       if (code == EXC_I386_GPFLT) {
+                               return SIGSEGV;
+                       }
                        break;
-               }
-               return(FALSE);
 
-       case EXC_BAD_INSTRUCTION:
-               *unix_signal = SIGILL;
-               *unix_code = code;
-               break;
+               case EXC_BAD_INSTRUCTION:
+                       return SIGILL;
 
-       case EXC_ARITHMETIC:
-               *unix_signal = SIGFPE;
-               *unix_code = code;
-               break;
+               case EXC_ARITHMETIC:
+                       return SIGFPE;
 
-       case EXC_SOFTWARE:
-               if (code == EXC_I386_BOUND) {
-                       /*
-                        * Map #BR, the Bound Range Exceeded exception, to
-                        * SIGTRAP.
-                        */
-                       *unix_signal = SIGTRAP;
-                       *unix_code = code;
+               case EXC_SOFTWARE:
+                       if (code == EXC_I386_BOUND) {
+                               /*
+                                * Map #BR, the Bound Range Exceeded exception, to
+                                * SIGTRAP.
+                                */
+                               return SIGTRAP;
+                       }
                        break;
-               }
-
-       default:
-               return(FALSE);
        }
-   
-       return(TRUE);
+
+       return 0;
 }
 
index 19a7cf3a655476c320e2f313cd3d1e7fda61410e..8a5d276e30ecfec558743174e4755326c088c52b 100644 (file)
 #include <sys/types.h>
 #include <sys/monotonic.h>
 
-static int mt_dev_open(dev_t dev, int flags, int devtype, struct proc *p);
-static int mt_dev_close(dev_t dev, int flags, int devtype, struct proc *p);
-static int mt_dev_ioctl(dev_t dev, unsigned long cmd, char *uptr, int fflag,
-               struct proc *p);
+static int mt_cdev_open(dev_t dev, int flags, int devtype, proc_t p);
+static int mt_cdev_close(dev_t dev, int flags, int devtype, proc_t p);
+static int mt_cdev_ioctl(dev_t dev, unsigned long cmd, char *uptr, int fflag,
+               proc_t p);
+
+#define MT_NODE "monotonic"
 
 static struct cdevsw mt_cdevsw = {
-       .d_open = mt_dev_open,
-       .d_close = mt_dev_close,
-       .d_read = eno_rdwrt,
-       .d_write = eno_rdwrt,
-       .d_ioctl = mt_dev_ioctl,
-       .d_stop = eno_stop,
-       .d_reset = eno_reset,
-       .d_ttys = NULL,
-       .d_select = eno_select,
-       .d_mmap = eno_mmap,
-       .d_strategy = eno_strat,
-       .d_type = 0
+       .d_open = mt_cdev_open,
+       .d_close = mt_cdev_close,
+       .d_ioctl = mt_cdev_ioctl,
+
+       .d_read = eno_rdwrt, .d_write = eno_rdwrt, .d_stop = eno_stop,
+       .d_reset = eno_reset, .d_ttys = NULL, .d_select = eno_select,
+       .d_mmap = eno_mmap, .d_strategy = eno_strat, .d_type = 0
 };
 
 /*
  * Written at initialization, read-only thereafter.
  */
 lck_grp_t *mt_lock_grp = NULL;
-
 static int mt_dev_major;
-decl_lck_mtx_data(static, mt_dev_mtxs[MT_NDEVS]);
-static bool mt_dev_owned[MT_NDEVS];
+
+static mt_device_t
+mt_get_device(dev_t devnum)
+{
+       return &mt_devices[minor(devnum)];
+}
+
+static void
+mt_device_lock(mt_device_t dev)
+{
+       lck_mtx_lock(&dev->mtd_lock);
+}
 
 static void
-mt_dev_lock(dev_t dev)
+mt_device_unlock(mt_device_t dev)
 {
-       lck_mtx_lock(&mt_dev_mtxs[minor(dev)]);
+       lck_mtx_unlock(&dev->mtd_lock);
 }
 
 static void
-mt_dev_unlock(dev_t dev)
+mt_device_assert_lock_held(__assert_only mt_device_t dev)
 {
-       lck_mtx_unlock(&mt_dev_mtxs[minor(dev)]);
+       LCK_MTX_ASSERT(&dev->mtd_lock, LCK_MTX_ASSERT_OWNED);
 }
 
 static void
-mt_dev_assert_lock_held(__assert_only dev_t dev)
+mt_device_assert_inuse(__assert_only mt_device_t dev)
 {
-       LCK_MTX_ASSERT(&mt_dev_mtxs[minor(dev)], LCK_MTX_ASSERT_OWNED);
+       assert(dev->mtd_inuse == true);
 }
 
 int
 mt_dev_init(void)
 {
-       lck_grp_attr_t *lock_grp_attr = NULL;
-       int devices = 0;
-
-       lock_grp_attr = lck_grp_attr_alloc_init();
-       mt_lock_grp = lck_grp_alloc_init("monotonic", lock_grp_attr);
-       lck_grp_attr_free(lock_grp_attr);
+       mt_lock_grp = lck_grp_alloc_init(MT_NODE, LCK_GRP_ATTR_NULL);
+       assert(mt_lock_grp != NULL);
 
        mt_dev_major = cdevsw_add(-1 /* allocate a major number */, &mt_cdevsw);
        if (mt_dev_major < 0) {
                panic("monotonic: cdevsw_add failed: %d", mt_dev_major);
-               __builtin_trap();
+               __builtin_unreachable();
        }
 
        for (int i = 0; i < MT_NDEVS; i++) {
-               dev_t dev;
-               void *dn;
-               int error;
-
-               error = monotonic_devs[i].mtd_init();
-               if (error) {
+               if (mt_devices[i].mtd_init(&mt_devices[i])) {
                        continue;
                }
 
-               dev = makedev(mt_dev_major, i);
-               dn = devfs_make_node(dev,
-                               DEVFS_CHAR, UID_ROOT, GID_WINDOWSERVER, 0666,
-                               monotonic_devs[i].mtd_name);
-               if (dn == NULL) {
+               assert(mt_devices[i].mtd_ncounters > 0);
+
+               dev_t dev = makedev(mt_dev_major, i);
+               char name[128];
+               snprintf(name, sizeof(name), MT_NODE "/%s", mt_devices[i].mtd_name);
+               void *node = devfs_make_node(dev, DEVFS_CHAR, UID_ROOT,
+                               GID_WINDOWSERVER, 0666, name);
+               if (!node) {
                        panic("monotonic: devfs_make_node failed for '%s'",
-                                       monotonic_devs[i].mtd_name);
-                       __builtin_trap();
+                                       mt_devices[i].mtd_name);
+                       __builtin_unreachable();
                }
 
-               lck_mtx_init(&mt_dev_mtxs[i], mt_lock_grp, LCK_ATTR_NULL);
-
-               devices++;
+               lck_mtx_init(&mt_devices[i].mtd_lock, mt_lock_grp, LCK_ATTR_NULL);
        }
 
        return 0;
 }
 
 static int
-mt_dev_open(dev_t dev, __unused int flags, __unused int devtype,
-               __unused struct proc *p)
+mt_cdev_open(dev_t devnum, __unused int flags, __unused int devtype,
+               __unused proc_t p)
 {
        int error = 0;
 
-       mt_dev_lock(dev);
-
-       if (mt_dev_owned[minor(dev)]) {
+       mt_device_t dev = mt_get_device(devnum);
+       mt_device_lock(dev);
+       if (dev->mtd_inuse) {
                error = EBUSY;
-               goto out;
+       } else {
+               dev->mtd_inuse = true;
        }
+       mt_device_unlock(dev);
 
-       mt_dev_owned[minor(dev)] = true;
-
-out:
-       mt_dev_unlock(dev);
        return error;
 }
 
 static int
-mt_dev_close(dev_t dev, __unused int flags, __unused int devtype,
+mt_cdev_close(dev_t devnum, __unused int flags, __unused int devtype,
                __unused struct proc *p)
 {
-       mt_dev_lock(dev);
-
-       assert(mt_dev_owned[minor(dev)]);
-       mt_dev_owned[minor(dev)] = false;
-
-       monotonic_devs[minor(dev)].mtd_reset();
+       mt_device_t dev = mt_get_device(devnum);
 
-       mt_dev_unlock(dev);
+       mt_device_lock(dev);
+       mt_device_assert_inuse(dev);
+       dev->mtd_inuse = false;
+       dev->mtd_reset();
+       mt_device_unlock(dev);
 
        return 0;
 }
 
 static int
-mt_ctl_add(dev_t dev, user_addr_t uptr, __unused int flags,
-               __unused struct proc *p)
+mt_ctl_add(mt_device_t dev, user_addr_t uptr)
 {
        int error;
        uint32_t ctr;
        union monotonic_ctl_add ctl;
 
-       mt_dev_assert_lock_held(dev);
+       mt_device_assert_lock_held(dev);
 
        error = copyin(uptr, &ctl, sizeof(ctl.in));
        if (error) {
                return error;
        }
 
-       error = monotonic_devs[minor(dev)].mtd_add(&ctl.in.config, &ctr);
+       error = dev->mtd_add(&ctl.in.config, &ctr);
        if (error) {
                return error;
        }
@@ -198,14 +192,12 @@ mt_ctl_add(dev_t dev, user_addr_t uptr, __unused int flags,
 }
 
 static int
-mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags,
-               __unused struct proc *p)
+mt_ctl_counts(mt_device_t dev, user_addr_t uptr)
 {
        int error;
-       uint64_t ctrs;
        union monotonic_ctl_counts ctl;
 
-       mt_dev_assert_lock_held(dev);
+       mt_device_assert_lock_held(dev);
 
        error = copyin(uptr, &ctl, sizeof(ctl.in));
        if (error) {
@@ -215,11 +207,12 @@ mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags,
        if (ctl.in.ctr_mask == 0) {
                return EINVAL;
        }
-       ctrs = __builtin_popcountll(ctl.in.ctr_mask);
 
        {
-               uint64_t counts[ctrs];
-               error = monotonic_devs[minor(dev)].mtd_read(ctl.in.ctr_mask, counts);
+               uint64_t counts[dev->mtd_nmonitors][dev->mtd_ncounters];
+               memset(counts, 0,
+                               dev->mtd_ncounters * dev->mtd_nmonitors * sizeof(counts[0][0]));
+               error = dev->mtd_read(ctl.in.ctr_mask, (uint64_t *)counts);
                if (error) {
                        return error;
                }
@@ -234,39 +227,40 @@ mt_ctl_counts(dev_t dev, user_addr_t uptr, __unused int flags,
 }
 
 static int
-mt_ctl_enable(dev_t dev, user_addr_t uptr)
+mt_ctl_enable(mt_device_t dev, user_addr_t uptr)
 {
        int error;
        union monotonic_ctl_enable ctl;
 
-       mt_dev_assert_lock_held(dev);
+       mt_device_assert_lock_held(dev);
 
        error = copyin(uptr, &ctl, sizeof(ctl));
        if (error) {
                return error;
        }
 
-       monotonic_devs[minor(dev)].mtd_enable(ctl.in.enable);
+       dev->mtd_enable(ctl.in.enable);
 
        return 0;
 }
 
 static int
-mt_ctl_reset(dev_t dev)
+mt_ctl_reset(mt_device_t dev)
 {
-       mt_dev_assert_lock_held(dev);
-       monotonic_devs[minor(dev)].mtd_reset();
+       mt_device_assert_lock_held(dev);
+       dev->mtd_reset();
        return 0;
 }
 
 static int
-mt_dev_ioctl(dev_t dev, unsigned long cmd, char *arg, int flags,
-               struct proc *p)
+mt_cdev_ioctl(dev_t devnum, unsigned long cmd, char *arg, __unused int flags,
+               __unused proc_t p)
 {
-       int error;
+       int error = ENODEV;
        user_addr_t uptr = *(user_addr_t *)(void *)arg;
 
-       mt_dev_lock(dev);
+       mt_device_t dev = mt_get_device(devnum);
+       mt_device_lock(dev);
 
        switch (cmd) {
        case MT_IOC_RESET:
@@ -274,7 +268,7 @@ mt_dev_ioctl(dev_t dev, unsigned long cmd, char *arg, int flags,
                break;
 
        case MT_IOC_ADD:
-               error = mt_ctl_add(dev, uptr, flags, p);
+               error = mt_ctl_add(dev, uptr);
                break;
 
        case MT_IOC_ENABLE:
@@ -282,15 +276,26 @@ mt_dev_ioctl(dev_t dev, unsigned long cmd, char *arg, int flags,
                break;
 
        case MT_IOC_COUNTS:
-               error = mt_ctl_counts(dev, uptr, flags, p);
+               error = mt_ctl_counts(dev, uptr);
                break;
 
+       case MT_IOC_GET_INFO: {
+               union monotonic_ctl_info info = {
+                       .out = {
+                               .nmonitors = dev->mtd_nmonitors,
+                               .ncounters = dev->mtd_ncounters,
+                       },
+               };
+               error = copyout(&info, uptr, sizeof(info));
+               break;
+       }
+
        default:
                error = ENODEV;
                break;
        }
 
-       mt_dev_unlock(dev);
+       mt_device_unlock(dev);
 
        return error;
 }
@@ -413,47 +418,26 @@ SYSCTL_DECL(_kern_monotonic);
 SYSCTL_NODE(_kern, OID_AUTO, monotonic, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
                "monotonic");
 
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, supported,
-               CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_SUPPORTED, sizeof(int), mt_sysctl, "I",
-               "whether monotonic is supported");
+#define MT_SYSCTL(NAME, ARG, SIZE, SIZESTR, DESC) \
+               SYSCTL_PROC(_kern_monotonic, OID_AUTO, NAME, \
+               CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED, \
+               (void *)(ARG), SIZE, mt_sysctl, SIZESTR, DESC)
 
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, debug,
-               CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED,
-               (void *)MT_DEBUG, sizeof(int), mt_sysctl, "I",
+MT_SYSCTL(supported, MT_SUPPORTED, sizeof(int), "I",
+               "whether monotonic is supported");
+MT_SYSCTL(debug, MT_DEBUG, sizeof(int), "I",
                "whether monotonic is printing debug messages");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, pmis,
-               CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_PMIS, sizeof(uint64_t), mt_sysctl, "Q",
-               "how many PMIs have been seen");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, retrograde_updates,
-               CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_RETROGRADE, sizeof(uint64_t), mt_sysctl, "Q",
-               "how many times a counter appeared to go backwards");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, task_thread_counting,
-               CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MASKED,
-               (void *)MT_TASK_THREAD, sizeof(int), mt_sysctl, "I",
-               "task and thread counting enabled");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, kdebug_test,
-               CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_KDBG_TEST, sizeof(int), mt_sysctl, "O",
-               "test that kdebug integration works");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_cpu_perf,
-               CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_FIX_CPU_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O",
+MT_SYSCTL(pmis, MT_PMIS, sizeof(uint64_t), "Q",
+               "number of PMIs seen");
+MT_SYSCTL(retrograde_updates, MT_RETROGRADE, sizeof(uint64_t), "Q",
+               "number of times a counter appeared to go backwards");
+MT_SYSCTL(task_thread_counting, MT_TASK_THREAD, sizeof(int), "I",
+               "whether task and thread counting is enabled");
+MT_SYSCTL(kdebug_test, MT_KDBG_TEST, sizeof(int), "O",
+		"test that kdebug integration works");
+MT_SYSCTL(fixed_cpu_perf, MT_FIX_CPU_PERF, sizeof(uint64_t) * 2, "O",
                "overhead of accessing the current CPU's counters");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_thread_perf,
-               CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_FIX_THREAD_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O",
+MT_SYSCTL(fixed_thread_perf, MT_FIX_THREAD_PERF, sizeof(uint64_t) * 2, "O",
                "overhead of accessing the current thread's counters");
-
-SYSCTL_PROC(_kern_monotonic, OID_AUTO, fixed_task_perf,
-               CTLFLAG_RW | CTLFLAG_MASKED | CTLFLAG_LOCKED,
-               (void *)MT_FIX_TASK_PERF, sizeof(uint64_t) * 2, mt_sysctl, "O",
+MT_SYSCTL(fixed_task_perf, MT_FIX_TASK_PERF, sizeof(uint64_t) * 2, "O",
                "overhead of accessing the current task's counters");
index 36b5db056159d5c07df36052c0c277b08d4e98be..23e115db01ae29a388f9897dcbd1dfdeb9ae401c 100644 (file)
 #include <kern/task.h>
 #include <kern/ast.h>
 #include <kern/kalloc.h>
-#include <mach/mach_host.h>
+#include <kern/ux_handler.h>            /* for ux_handler_setup() */
 
 #include <mach/vm_param.h>
 
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 
-#include <sys/ux_exception.h>  /* for ux_exception_port */
-
 #include <sys/reboot.h>
-#include <mach/exception_types.h>
 #include <dev/busvar.h>                        /* for pseudo_inits */
 #include <sys/kdebug.h>
 #include <sys/monotonic.h>
 #include <kern/clock.h>
 #include <mach/kern_return.h>
 #include <mach/thread_act.h>           /* for thread_resume() */
-#include <mach/task.h>                 /* for task_set_exception_ports() */
-#include <sys/ux_exception.h>          /* for ux_handler() */
 #include <sys/ubc_internal.h>          /* for ubc_init() */
 #include <sys/mcache.h>                        /* for mcache_init() */
 #include <sys/mbuf.h>                  /* for mbinit() */
 #include <net/if_gif.h>                        /* for gif_init() */
 #include <vm/vm_protos.h>              /* for vnode_pager_bootstrap() */
 #include <miscfs/devfs/devfsdefs.h>    /* for devfs_kernel_mount() */
-#include <mach/host_priv.h>            /* for host_set_exception_ports() */
-#include <kern/host.h>                 /* for host_priv_self() */
 #include <vm/vm_kern.h>                        /* for kmem_suballoc() */
 #include <sys/semaphore.h>             /* for psem_lock_init() */
 #include <sys/msgbuf.h>                        /* for log_setsize() */
 #include <machine/pal_routines.h>
 #include <console/video_console.h>
 
+#if CONFIG_XNUPOST
+#include <tests/xnupost.h>
+#endif
 
 void * get_user_regs(thread_t);                /* XXX kludge for <machine/thread.h> */
 void IOKitInitializeTime(void);                /* XXX */
@@ -403,6 +399,10 @@ lck_attr_t * proc_lck_attr;
 lck_mtx_t * proc_list_mlock;
 lck_mtx_t * proc_klist_mlock;
 
+#if CONFIG_XNUPOST
+lck_grp_t * sysctl_debug_test_stackshot_owner_grp;
+lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx;
+#endif /* !CONFIG_XNUPOST */
 
 extern lck_mtx_t * execargs_cache_lock;
 
@@ -500,6 +500,12 @@ bsd_init(void)
 #endif
        proc_kqhashlock_grp = lck_grp_alloc_init("proc-kqhashlock",  proc_lck_grp_attr);
        proc_knhashlock_grp = lck_grp_alloc_init("proc-knhashlock",  proc_lck_grp_attr);
+#if CONFIG_XNUPOST
+       sysctl_debug_test_stackshot_owner_grp = lck_grp_alloc_init("test-stackshot-owner-grp", LCK_GRP_ATTR_NULL);
+       sysctl_debug_test_stackshot_owner_init_mtx = lck_mtx_alloc_init(
+                                                       sysctl_debug_test_stackshot_owner_grp, 
+                                                       LCK_ATTR_NULL);
+#endif /* !CONFIG_XNUPOST */
        /* Allocate proc lock attribute */
        proc_lck_attr = lck_attr_alloc_init();
 #if 0
@@ -553,9 +559,6 @@ bsd_init(void)
 #endif
 #endif /* MAC */
 
-       /* Initialize System Override call */
-       init_system_override();
-       
        ulock_initialize();
 
        /*
@@ -1051,6 +1054,9 @@ bsd_init(void)
        consider_zone_gc(FALSE);
 #endif
 
+       /* Initialize System Override call */
+       init_system_override();
+       
        bsd_init_kprintf("done\n");
 }
 
@@ -1058,21 +1064,11 @@ void
 bsdinit_task(void)
 {
        proc_t p = current_proc();
-       struct uthread *ut;
-       thread_t thread;
 
        process_name("init", p);
 
-       ux_handler_init();
-
-       thread = current_thread();
-       (void) host_set_exception_ports(host_priv_self(),
-                                       EXC_MASK_ALL & ~(EXC_MASK_RPC_ALERT),//pilotfish (shark) needs this port
-                                       (mach_port_t) ux_exception_port,
-                                       EXCEPTION_DEFAULT| MACH_EXCEPTION_CODES,
-                                       0);
-
-       ut = (uthread_t)get_bsdthread_info(thread);
+       /* Set up exception-to-signal reflection */
+       ux_handler_setup();
 
 #if CONFIG_MACF
        mac_cred_label_associate_user(p->p_ucred);
@@ -1080,6 +1076,13 @@ bsdinit_task(void)
 
     vm_init_before_launchd();
 
+#if CONFIG_XNUPOST
+       int result = bsd_list_tests();
+       result = bsd_do_post();
+       if (result != 0) {
+               panic("bsd_do_post: Tests failed with result = 0x%08x\n", result);
+       }
+#endif
 
        bsd_init_kprintf("bsd_do_post - done");
 
index ebca2527174c92d56c2c99a999f1ec3e95c8c9e8..3e62edb97c2459b03bf0c7c8663d06a9dba6c264 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -486,17 +486,28 @@ decmpfs_fetch_compressed_header(vnode_t vp, decmpfs_cnode *cp, decmpfs_header **
      and return ERANGE in that case
      */
     
-       size_t read_size             = 0;
-       size_t attr_size             = 0;
+    size_t read_size             = 0;
+    size_t attr_size             = 0;
     uio_t attr_uio               = NULL;
     int err                      = 0;
     char *data                   = NULL;
+    const bool no_additional_data= ((cp != NULL)
+        && (cp->cmp_type != 0)
+        && (cp->cmp_minimal_xattr != 0));
+    char uio_buf[ UIO_SIZEOF(1) ];
     decmpfs_header *hdr = NULL;
-       char uio_buf[ UIO_SIZEOF(1) ];
+
+    /*
+     * Trace the following parameters on entry with event-id 0x03120004
+     *
+     * @vp->v_id:       vnode-id for which to fetch compressed header.
+     * @no_additional_data: If set true then xattr didn't have any extra data.
+     * @returnInvalid:  return the header even though the type is out of range.
+     */
+    DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_COMPRESSED_HEADER, vp->v_id,
+        no_additional_data, returnInvalid);
     
-    if ((cp != NULL) &&
-        (cp->cmp_type != 0) &&
-        (cp->cmp_minimal_xattr != 0)) {
+    if (no_additional_data) {
         /* this file's xattr didn't have any extra data when we fetched it, so we can synthesize a header from the data in the cnode */
         
         MALLOC(data, char *, sizeof(decmpfs_header), M_TEMP, M_WAITOK);
@@ -571,6 +582,13 @@ out:
     } else {
         *hdrOut = hdr;
     }
+    /*
+     * Trace the following parameters on return with event-id 0x03120004.
+     *
+     * @vp->v_id:       vnode-id for which to fetch compressed header.
+     * @err:            value returned from this function.
+     */
+    DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FETCH_COMPRESSED_HEADER, vp->v_id, err);
     return err;
 }
 
@@ -679,14 +697,15 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
      */
     
     int ret = 0;
-       int error = 0;
-       uint32_t cmp_state;
-       struct vnode_attr va_fetch;
+    int error = 0;
+    uint32_t cmp_state;
+    struct vnode_attr va_fetch;
     decmpfs_header *hdr = NULL;
     mount_t mp = NULL;
-       int cnode_locked = 0;
+    int cnode_locked = 0;
     int saveInvalid = 0; // save the header data even though the type was out of range
     uint64_t decompression_flags = 0;
+    bool is_mounted, is_local_fs;
        
     if (vnode_isnamedstream(vp)) {
         /*
@@ -721,9 +740,25 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
         ret = FILE_IS_NOT_COMPRESSED;
         goto done;
     }
-    
-    mp = vnode_mount(vp); 
-    if (mp == NULL) {
+
+    is_mounted = false;
+    is_local_fs = false;
+    mp = vnode_mount(vp);
+    if (mp)
+        is_mounted = true;
+    if (is_mounted)
+        is_local_fs = ((mp->mnt_flag & MNT_LOCAL));
+    /*
+     * Trace the following parameters on entry with event-id 0x03120014.
+     *
+     * @vp->v_id:       vnode-id of the file being queried.
+     * @is_mounted:     set to true if @vp belongs to a mounted fs.
+     * @is_local_fs:    set to true if @vp belongs to local fs.
+     */
+    DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id,
+        is_mounted, is_local_fs);
+
+    if (!is_mounted) {
         /*
          this should only be true before we mount the root filesystem
          we short-cut this return to avoid the call to getattr below, which
@@ -732,7 +767,8 @@ decmpfs_file_is_compressed(vnode_t vp, decmpfs_cnode *cp)
         ret = FILE_IS_NOT_COMPRESSED;
         goto done;
     }
-    if ((mp->mnt_flag & MNT_LOCAL) == 0) {
+
+    if (!is_local_fs) {
         /* compression only supported on local filesystems */
         ret = FILE_IS_NOT_COMPRESSED;
         goto done;
@@ -811,17 +847,25 @@ done:
        if (cnode_locked) decmpfs_unlock_compressed_data(cp, 1);
     
     if (hdr) FREE(hdr, M_TEMP);
-       
-       switch(ret) {
-        case FILE_IS_NOT_COMPRESSED:
-                       return 0;
-        case FILE_IS_COMPRESSED:
-        case FILE_IS_CONVERTING:
-                       return 1;
-        default:
-            /* unknown state, assume file is not compressed */
-            ErrorLogWithPath("unknown ret %d\n", ret);
-            return 0;
+    /*
+     * Trace the following parameters on return with event-id 0x03120014.
+     *
+     * @vp->v_id:       vnode-id of the file being queried.
+     * @return:         set to 1 if file is compressed.
+     */
+    switch(ret) {
+    case FILE_IS_NOT_COMPRESSED:
+           DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 0);
+           return 0;
+    case FILE_IS_COMPRESSED:
+    case FILE_IS_CONVERTING:
+           DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 1);
+           return 1;
+    default:
+           /* unknown state, assume file is not compressed */
+           DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FILE_IS_COMPRESSED, vp->v_id, 0);
+           ErrorLogWithPath("unknown ret %d\n", ret);
+           return 0;
     }
 }
 
@@ -1058,7 +1102,20 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h
         err = 0;
         goto out;
     }
-    
+
+    /*
+     * Trace the following parameters on entry with event-id 0x03120008.
+     *
+     * @vp->v_id:       vnode-id of the file being decompressed.
+     * @hdr->compression_type: compression type.
+     * @offset:         offset from where to fetch uncompressed data.
+     * @size:           amount of uncompressed data to fetch.
+     *
+     * Please NOTE: @offset and @size can overflow in theory but
+     * here it is safe.
+     */
+    DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id,
+        hdr->compression_type, (int)offset, (int)size);
     lck_rw_lock_shared(decompressorsLock);
     decmpfs_fetch_uncompressed_data_func fetch = decmp_get_func(vp, hdr->compression_type, fetch);
     if (fetch) {
@@ -1079,7 +1136,17 @@ decmpfs_fetch_uncompressed_data(vnode_t vp, decmpfs_cnode *cp, decmpfs_header *h
         err = ENOTSUP;
         lck_rw_unlock_shared(decompressorsLock);
     }
-    
+    /*
+     * Trace the following parameters on return with event-id 0x03120008.
+     *
+     * @vp->v_id:       vnode-id of the file being decompressed.
+     * @bytes_read:     amount of uncompressed bytes fetched in bytes.
+     * @err:            value returned from this function.
+     *
+     * Please NOTE: @bytes_read can overflow in theory but here it is safe.
+     */
+    DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FETCH_UNCOMPRESSED_DATA, vp->v_id,
+        (int)*bytes_read, err);
 out:
     return err;
 }
@@ -1512,8 +1579,15 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
      call out to the decompressor to free remove any data associated with this compressed file
      then delete the file's compression xattr
      */
-    
     decmpfs_header *hdr = NULL;
+
+    /*
+     * Trace the following parameters on entry with event-id 0x03120010.
+     *
+     * @vp->v_id:       vnode-id of the file for which to free compressed data.
+     */
+    DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id);
+
     int err = decmpfs_fetch_compressed_header(vp, cp, &hdr, 0);
     if (err) {
         ErrorLogWithPath("decmpfs_fetch_compressed_header err %d\n", err);
@@ -1532,6 +1606,13 @@ decmpfs_free_compressed_data(vnode_t vp, decmpfs_cnode *cp)
             ErrorLogWithPath("decompressor err %d\n", err);
         }
     }
+    /*
+     * Trace the following parameters on return with event-id 0x03120010.
+     *
+     * @vp->v_id:       vnode-id of the file for which to free compressed data.
+     * @err:            value returned from this function.
+     */
+    DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_FREE_COMPRESSED_DATA, vp->v_id, err);
     
     /* delete the xattr */
        err = vn_removexattr(vp, DECMPFS_XATTR_NAME, 0, decmpfs_ctx);
@@ -1585,10 +1666,23 @@ decmpfs_decompress_file(vnode_t vp, decmpfs_cnode *cp, off_t toSize, int truncat
        uint32_t new_state           = 0;
        int update_file_state        = 0;
        int allocSize                = 0;
-       decmpfs_header *hdr = NULL;
+       decmpfs_header *hdr          = NULL;
        int cmpdata_locked           = 0;
        off_t remaining              = 0;
        uint64_t uncompressed_size   = 0;
+
+       /*
+        * Trace the following parameters on entry with event-id 0x03120000.
+        *
+        * @vp->v_id:           vnode-id of the file being decompressed.
+        * @toSize:             uncompress given bytes of the file.
+        * @truncate_okay:      on error it is OK to truncate.
+        * @skiplock:           compressed data is locked, skip locking again.
+        *
+        * Please NOTE: @toSize can overflow in theory but here it is safe.
+        */
+       DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_DECOMPRESS_FILE, vp->v_id,
+               (int)toSize, truncate_okay, skiplock);
        
        if (!skiplock) {
                decmpfs_lock_compressed_data(cp, 1); cmpdata_locked = 1;
@@ -1786,7 +1880,13 @@ out:
        }
        
        if (cmpdata_locked) decmpfs_unlock_compressed_data(cp, 1);
-       
+       /*
+        * Trace the following parameters on return with event-id 0x03120000.
+        *
+        * @vp->v_id:   vnode-id of the file being decompressed.
+        * @err:        value returned from this function.
+        */
+       DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_DECOMPRESS_FILE, vp->v_id, err);
        return err;
 }
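
A note on the trace points added throughout decmpfs.c above: the event-ids quoted in the comments (0x03120000, 0x03120008, 0x03120010) follow the usual kdebug packing of class, subclass and code. The sketch below is illustrative only; the SK_* names are invented here, and the kernel builds these ids from its own DECMPDBG_* constants and kdebug macros.

/* Illustrative decomposition of the decmpfs trace ids referenced above. */
#include <stdint.h>

/* Usual kdebug packing: class << 24 | subclass << 16 | code << 2. */
#define SK_KDBG_CODE(cls, sub, code) \
        ((uint32_t)((((cls) & 0xffu) << 24) | (((sub) & 0xffu) << 16) | (((code) & 0x3fffu) << 2)))

/*
 * SK_KDBG_CODE(0x03, 0x12, 0) == 0x03120000  ->  DECMPDBG_DECOMPRESS_FILE
 * SK_KDBG_CODE(0x03, 0x12, 2) == 0x03120008  ->  DECMPDBG_FETCH_UNCOMPRESSED_DATA
 * SK_KDBG_CODE(0x03, 0x12, 4) == 0x03120010  ->  DECMPDBG_FREE_COMPRESSED_DATA
 * (0x03 is the file-system class; 0x12 is inferred to be the decmpfs subclass.)
 */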
 
index 5b424e71982535aa426972214b35ab2c0cff9d16..66361ad0dc008a010c747225f17a25720edec7b1 100644 (file)
@@ -182,6 +182,12 @@ static void typefilter_reject_all(typefilter_t tf)
        memset(tf, 0, KDBG_TYPEFILTER_BITMAP_SIZE);
 }
 
+static void typefilter_allow_all(typefilter_t tf)
+{
+       assert(tf != NULL);
+       memset(tf, ~0, KDBG_TYPEFILTER_BITMAP_SIZE);
+}
+
 static void typefilter_allow_class(typefilter_t tf, uint8_t class)
 {
        assert(tf != NULL);
@@ -248,6 +254,8 @@ kdbg_timestamp(void)
        }
 }
 
+static int kdbg_debug = 0;
+
 #if KDEBUG_MOJO_TRACE
 #include <sys/kdebugevents.h>
 static void kdebug_serial_print( /* forward */
@@ -303,7 +311,6 @@ static void delete_buffers(void);
 
 extern int tasks_count;
 extern int threads_count;
-extern char *proc_best_name(proc_t p);
 extern void IOSleep(int);
 
 /* trace enable status */
@@ -606,7 +613,7 @@ kdbg_set_flags(int slowflag, int enableflag, boolean_t enabled)
 /*
  * Disable wrapping and return true if trace wrapped, false otherwise.
  */
-boolean_t
+static boolean_t
 disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
 {
        boolean_t wrapped;
@@ -626,8 +633,8 @@ disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags)
        return wrapped;
 }
 
-void
-enable_wrap(uint32_t old_slowcheck, boolean_t lostevents)
+static void
+enable_wrap(uint32_t old_slowcheck)
 {
        int s = ml_set_interrupts_enabled(FALSE);
        lck_spin_lock(kds_spin_lock);
@@ -637,9 +644,6 @@ enable_wrap(uint32_t old_slowcheck, boolean_t lostevents)
        if ( !(old_slowcheck & SLOW_NOLOG))
                kd_ctrl_page.kdebug_slowcheck &= ~SLOW_NOLOG;
 
-       if (lostevents == TRUE)
-               kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED;
-
        lck_spin_unlock(kds_spin_lock);
        ml_set_interrupts_enabled(s);
 }
@@ -861,13 +865,20 @@ allocate_storage_unit(int cpu)
                if (kdsp_actual->kds_bufindx < EVENTS_PER_STORAGE_UNIT)
                        goto out;
        }
-       
+
        if ((kdsp = kd_ctrl_page.kds_free_list).raw != KDS_PTR_NULL) {
+               /*
+                * If there's a free page, grab it from the free list.
+                */
                kdsp_actual = POINTER_FROM_KDS_PTR(kdsp);
                kd_ctrl_page.kds_free_list = kdsp_actual->kds_next;
 
                kd_ctrl_page.kds_inuse_count++;
        } else {
+               /*
+                * Otherwise, we're going to lose events and repurpose the oldest
+                * storage unit we can find.
+                */
                if (kd_ctrl_page.kdebug_flags & KDBG_NOWRAP) {
                        kd_ctrl_page.kdebug_slowcheck |= SLOW_NOLOG;
                        kdbp->kd_lostevents = TRUE;
@@ -929,7 +940,9 @@ allocate_storage_unit(int cpu)
                } else
                        kdbp_vict->kd_lostevents = TRUE;
 
-               kd_ctrl_page.oldest_time = oldest_ts;
+               if (kd_ctrl_page.oldest_time < oldest_ts) {
+                       kd_ctrl_page.oldest_time = oldest_ts;
+               }
                kd_ctrl_page.kdebug_flags |= KDBG_WRAPPED;
        }
        kdsp_actual->kds_timestamp = kdbg_timestamp();
@@ -939,7 +952,7 @@ allocate_storage_unit(int cpu)
 
        kdsp_actual->kds_lostevents = kdbp->kd_lostevents;
        kdbp->kd_lostevents = FALSE;
-       kdsp_actual->kds_bufindx  = 0;
+       kdsp_actual->kds_bufindx = 0;
 
        if (kdbp->kd_list_head.raw == KDS_PTR_NULL)
                kdbp->kd_list_head = kdsp;
@@ -1130,24 +1143,72 @@ out1:
        }
 }
 
+/*
+ * Check if the given debug ID is allowed to be traced on the current process.
+ *
+ * Returns true if allowed and false otherwise.
+ */
+static inline bool
+kdebug_debugid_procfilt_allowed(uint32_t debugid)
+{
+       uint32_t procfilt_flags = kd_ctrl_page.kdebug_flags &
+                       (KDBG_PIDCHECK | KDBG_PIDEXCLUDE);
+
+       if (!procfilt_flags) {
+               return true;
+       }
+
+       /*
+        * DBG_TRACE and MACH_SCHED tracepoints ignore the process filter.
+        */
+       if ((debugid & 0xffff0000) == MACHDBG_CODE(DBG_MACH_SCHED, 0) ||
+               (debugid >> 24 == DBG_TRACE)) {
+               return true;
+       }
+
+       struct proc *curproc = current_proc();
+       /*
+        * If the process is missing (early in boot), allow it.
+        */
+       if (!curproc) {
+               return true;
+       }
+
+       if (procfilt_flags & KDBG_PIDCHECK) {
+               /*
+                * Allow only processes marked with the kdebug bit.
+                */
+               return curproc->p_kdebug;
+       } else if (procfilt_flags & KDBG_PIDEXCLUDE) {
+               /*
+                * Exclude any process marked with the kdebug bit.
+                */
+               return !curproc->p_kdebug;
+       } else {
+               panic("kdebug: invalid procfilt flags %x", kd_ctrl_page.kdebug_flags);
+               __builtin_unreachable();
+       }
+}
+
 static void
 kernel_debug_internal(
-       boolean_t only_filter,
-       uint32_t  debugid,
+       uint32_t debugid,
        uintptr_t arg1,
        uintptr_t arg2,
        uintptr_t arg3,
        uintptr_t arg4,
-       uintptr_t arg5)
+       uintptr_t arg5,
+       uint64_t flags)
 {
-       struct proc     *curproc;
-       uint64_t        now;
-       uint32_t        bindx;
-       kd_buf          *kd;
-       int             cpu;
+       uint64_t now;
+       uint32_t bindx;
+       kd_buf *kd;
+       int cpu;
        struct kd_bufinfo *kdbp;
        struct kd_storage *kdsp_actual;
-       union  kds_ptr kds_raw;
+       union kds_ptr kds_raw;
+       bool only_filter = flags & KDBG_FLAG_FILTERED;
+       bool observe_procfilt = !(flags & KDBG_FLAG_NOPROCFILT);
 
        if (kd_ctrl_page.kdebug_slowcheck) {
                if ((kd_ctrl_page.kdebug_slowcheck & SLOW_NOLOG) ||
@@ -1156,29 +1217,9 @@ kernel_debug_internal(
                        goto out1;
                }
 
-               if ( !ml_at_interrupt_context()) {
-                       if (kd_ctrl_page.kdebug_flags & KDBG_PIDCHECK) {
-                               /*
-                                * If kdebug flag is not set for current proc, return
-                                */
-                               curproc = current_proc();
-
-                               if ((curproc && !(curproc->p_kdebug)) &&
-                                   ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) &&
-                                     (debugid >> 24 != DBG_TRACE))
-                                       goto out1;
-                       }
-                       else if (kd_ctrl_page.kdebug_flags & KDBG_PIDEXCLUDE) {
-                               /*
-                                * If kdebug flag is set for current proc, return
-                                */
-                               curproc = current_proc();
-
-                               if ((curproc && curproc->p_kdebug) &&
-                                   ((debugid & 0xffff0000) != (MACHDBG_CODE(DBG_MACH_SCHED, 0) | DBG_FUNC_NONE)) &&
-                                     (debugid >> 24 != DBG_TRACE))
-                                       goto out1;
-                       }
+               if (!ml_at_interrupt_context() && observe_procfilt &&
+                               !kdebug_debugid_procfilt_allowed(debugid)) {
+                       goto out1;
                }
 
                if (kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK) {
@@ -1186,14 +1227,14 @@ kernel_debug_internal(
                                goto record_event;
 
                        goto out1;
-               } else if (only_filter == TRUE) {
+               } else if (only_filter) {
                        goto out1;
                }
                else if (kd_ctrl_page.kdebug_flags & KDBG_RANGECHECK) {
                        /* Always record trace system info */
                        if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE)
                                goto record_event;
-                               
+
                        if (debugid < kdlog_beg || debugid > kdlog_end)
                                goto out1;
                }
@@ -1201,14 +1242,14 @@ kernel_debug_internal(
                        /* Always record trace system info */
                        if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE)
                                goto record_event;
-               
+
                        if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 &&
                            (debugid & KDBG_EVENTID_MASK) != kdlog_value2 &&
                            (debugid & KDBG_EVENTID_MASK) != kdlog_value3 &&
                            (debugid & KDBG_EVENTID_MASK) != kdlog_value4)
                                goto out1;
                }
-       } else if (only_filter == TRUE) {
+       } else if (only_filter) {
                goto out1;
        }
 
@@ -1237,7 +1278,7 @@ retry_q:
        } else {
                kdsp_actual = NULL;
                bindx = EVENTS_PER_STORAGE_UNIT;
-       }       
+       }
 
        if (kdsp_actual == NULL || bindx >= EVENTS_PER_STORAGE_UNIT) {
                if (allocate_storage_unit(cpu) == FALSE) {
@@ -1249,6 +1290,7 @@ retry_q:
                }
                goto retry_q;
        }
+
        now = kdbg_timestamp() & KDBG_TIMESTAMP_MASK;
 
        if ( !OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx))
@@ -1296,8 +1338,8 @@ kernel_debug(
        uintptr_t       arg4,
        __unused uintptr_t arg5)
 {
-       kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4,
-               (uintptr_t)thread_tid(current_thread()));
+       kernel_debug_internal(debugid, arg1, arg2, arg3, arg4,
+               (uintptr_t)thread_tid(current_thread()), 0);
 }
 
 void
@@ -1309,19 +1351,31 @@ kernel_debug1(
        uintptr_t       arg4,
        uintptr_t       arg5)
 {
-       kernel_debug_internal(FALSE, debugid, arg1, arg2, arg3, arg4, arg5);
+       kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0);
+}
+
+void
+kernel_debug_flags(
+       uint32_t debugid,
+       uintptr_t arg1,
+       uintptr_t arg2,
+       uintptr_t arg3,
+       uintptr_t arg4,
+       uint64_t flags)
+{
+       kernel_debug_internal(debugid, arg1, arg2, arg3, arg4,
+               (uintptr_t)thread_tid(current_thread()), flags);
 }
 
 void
 kernel_debug_filtered(
-       uint32_t  debugid,
+       uint32_t debugid,
        uintptr_t arg1,
        uintptr_t arg2,
        uintptr_t arg3,
        uintptr_t arg4)
 {
-       kernel_debug_internal(TRUE, debugid, arg1, arg2, arg3, arg4,
-               (uintptr_t)thread_tid(current_thread()));
+       kernel_debug_flags(debugid, arg1, arg2, arg3, arg4, KDBG_FLAG_FILTERED);
 }
 
 void
@@ -1358,10 +1412,10 @@ kernel_debug_string_simple(uint32_t eventid, const char *str)
                debugid |= DBG_FUNC_END;
        }
 
-       kernel_debug_internal(FALSE, debugid, str_buf[0],
-                                             str_buf[1],
-                                             str_buf[2],
-                                             str_buf[3], thread_id);
+       kernel_debug_internal(debugid, str_buf[0],
+                                      str_buf[1],
+                                      str_buf[2],
+                                      str_buf[3], thread_id, 0);
 
        debugid &= KDBG_EVENTID_MASK;
        int i = 4;
@@ -1372,10 +1426,10 @@ kernel_debug_string_simple(uint32_t eventid, const char *str)
                if ((written + (4 * sizeof(uintptr_t))) >= len) {
                        debugid |= DBG_FUNC_END;
                }
-               kernel_debug_internal(FALSE, debugid, str_buf[i],
-                                                     str_buf[i + 1],
-                                                     str_buf[i + 2],
-                                                     str_buf[i + 3], thread_id);
+               kernel_debug_internal(debugid, str_buf[i],
+                                              str_buf[i + 1],
+                                              str_buf[i + 2],
+                                              str_buf[i + 3], thread_id, 0);
        }
 }
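
Taken together, the changes above replace the boolean only_filter parameter with a flags word on kernel_debug_internal() and add kernel_debug_flags() as the flag-bearing entry point. Below is a minimal usage sketch, assuming only what this diff shows; MY_SUBSYS_EVENT is a hypothetical event-id macro.

/* Sketch only: MY_SUBSYS_EVENT stands in for a real KDBG_EVENTID(...). */
static void
sketch_emit_events(void)
{
        /* Recorded only when a typefilter explicitly allows it; this wrapper
         * now just passes KDBG_FLAG_FILTERED to kernel_debug_flags(). */
        kernel_debug_filtered(MY_SUBSYS_EVENT | DBG_FUNC_NONE, 1, 2, 3, 4);

        /* Recorded even for processes excluded by the PID filter; the
         * KDBG_RELEASE_NOPROCFILT macros exercised in kdbg_test() further
         * down presumably expand to calls of this shape. */
        kernel_debug_flags(MY_SUBSYS_EVENT | DBG_FUNC_START, 1, 2, 3, 4,
            KDBG_FLAG_NOPROCFILT);
}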
 
@@ -1545,6 +1599,7 @@ kdebug_typefilter(__unused struct proc* p,
                                    TYPEFILTER_ALLOC_SIZE,              // initial size
                                    0,                                  // mask (alignment?)
                                    VM_FLAGS_ANYWHERE,                  // flags
+                                   VM_MAP_KERNEL_FLAGS_NONE,
                                    VM_KERN_MEMORY_NONE,
                                    kdbg_typefilter_memory_entry,       // port (memory entry!)
                                    0,                                  // offset (in memory entry)
@@ -1601,12 +1656,9 @@ int kdebug_trace64(__unused struct proc *p, struct kdebug_trace64_args *uap, __u
                return err;
        }
 
-       kernel_debug_internal(FALSE, uap->code,
-                             (uintptr_t)uap->arg1,
-                             (uintptr_t)uap->arg2,
-                             (uintptr_t)uap->arg3,
-                             (uintptr_t)uap->arg4,
-                             (uintptr_t)thread_tid(current_thread()));
+       kernel_debug_internal(uap->code, (uintptr_t)uap->arg1,
+                       (uintptr_t)uap->arg2, (uintptr_t)uap->arg3, (uintptr_t)uap->arg4,
+                       (uintptr_t)thread_tid(current_thread()), 0);
 
        return(0);
 }
@@ -1651,9 +1703,8 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr,
 
        /* if the ID is being invalidated, just emit that */
        if (str_id != 0 && str_len == 0) {
-               kernel_debug_internal(FALSE, trace_debugid | DBG_FUNC_START | DBG_FUNC_END,
-                                     (uintptr_t)debugid, (uintptr_t)str_id, 0, 0,
-                                     thread_id);
+               kernel_debug_internal(trace_debugid | DBG_FUNC_START | DBG_FUNC_END,
+                               (uintptr_t)debugid, (uintptr_t)str_id, 0, 0, thread_id, 0);
                return str_id;
        }
 
@@ -1669,9 +1720,8 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr,
                trace_debugid |= DBG_FUNC_END;
        }
 
-       kernel_debug_internal(FALSE, trace_debugid, (uintptr_t)debugid,
-                             (uintptr_t)str_id, str[0],
-                                                str[1], thread_id);
+       kernel_debug_internal(trace_debugid, (uintptr_t)debugid, (uintptr_t)str_id,
+                       str[0], str[1], thread_id, 0);
 
        trace_debugid &= KDBG_EVENTID_MASK;
        i = 2;
@@ -1681,10 +1731,10 @@ kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr,
                if ((written + (4 * sizeof(uintptr_t))) >= str_len) {
                        trace_debugid |= DBG_FUNC_END;
                }
-               kernel_debug_internal(FALSE, trace_debugid, str[i],
-                                                           str[i + 1],
-                                                           str[i + 2],
-                                                           str[i + 3], thread_id);
+               kernel_debug_internal(trace_debugid, str[i],
+                                                    str[i + 1],
+                                                    str[i + 2],
+                                                    str[i + 3], thread_id, 0);
        }
 
        return str_id;
@@ -2276,8 +2326,11 @@ kdebug_reset(void)
 void
 kdebug_free_early_buf(void)
 {
-       /* Must be done with the buffer, so release it back to the VM. */
+#if !CONFIG_EMBEDDED
+       /* Must be done with the buffer, so release it back to the VM.
+        * On embedded targets this buffer is freed when the BOOTDATA segment is freed. */
        ml_static_mfree((vm_offset_t)&kd_early_buffer, sizeof(kd_early_buffer));
+#endif
 }
 
 int
@@ -2468,6 +2521,7 @@ kdbg_enable_typefilter(void)
 static void
 kdbg_disable_typefilter(void)
 {
+       bool notify_iops = kd_ctrl_page.kdebug_flags & KDBG_TYPEFILTER_CHECK;
        kd_ctrl_page.kdebug_flags &= ~KDBG_TYPEFILTER_CHECK;
 
        if ((kd_ctrl_page.kdebug_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE))) {
@@ -2476,6 +2530,17 @@ kdbg_disable_typefilter(void)
                kdbg_set_flags(SLOW_CHECKS, 0, FALSE);
        }
        commpage_update_kdebug_state();
+
+       if (notify_iops) {
+               /*
+                * Notify IOPs that the typefilter will now allow everything.
+                * Otherwise, they won't know a typefilter is no longer in
+                * effect.
+                */
+               typefilter_allow_all(kdbg_typefilter);
+               kdbg_iop_list_callback(kd_ctrl_page.kdebug_iops,
+                               KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter);
+       }
 }
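
kdbg_disable_typefilter() now flips the bitmap to all-ones before notifying IOPs, since IOPs only ever see the bitmap and would otherwise keep filtering. As background, a typefilter holds one bit per (class, subclass) pair; here is a rough sketch of the lookup with an illustrative helper name (the kernel uses its own typefilter/bitmap helpers):

/* Sketch: 2^16 bits, one per (class, subclass) pair of a debugid. */
#include <stdbool.h>
#include <stdint.h>

static bool
sketch_typefilter_allows(const uint8_t *tf, uint32_t debugid)
{
        uint16_t csc = (uint16_t)(debugid >> 16);       /* class + subclass */
        return (tf[csc / 8] & (1u << (csc % 8))) != 0;  /* set bit == allowed */
}
/* Hence memset(tf, ~0, ...) in typefilter_allow_all() admits every event and
 * memset(tf, 0, ...) in typefilter_reject_all() drops everything. */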
 
 uint32_t
@@ -3587,7 +3652,6 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
        uint32_t tempbuf_number;
        uint32_t old_kdebug_flags;
        uint32_t old_kdebug_slowcheck;
-       boolean_t lostevents = FALSE;
        boolean_t out_of_events = FALSE;
        boolean_t wrapped = FALSE;
 
@@ -3641,14 +3705,11 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
        }
 
        /*
-        * If the buffers have wrapped, capture the earliest time where there
-        * are events for all CPUs and do not emit additional lost events for
+        * If the buffers have wrapped, do not emit additional lost events for the
         * oldest storage units.
         */
        if (wrapped) {
-               barrier_min = kd_ctrl_page.oldest_time;
                kd_ctrl_page.kdebug_flags &= ~KDBG_WRAPPED;
-               kd_ctrl_page.oldest_time = 0;
 
                for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) {
                        if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
@@ -3658,13 +3719,23 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
                        kdsp_actual->kds_lostevents = FALSE;
                }
        }
+       /*
+        * Capture the earliest time at which there are events for all CPUs and
+        * don't emit events with earlier timestamps.
+        */
+       barrier_min = kd_ctrl_page.oldest_time;
 
        while (count) {
                tempbuf = kdcopybuf;
                tempbuf_number = 0;
 
                if (wrapped) {
-                       /* Trace a single lost events event for wrapping. */
+                       /*
+                        * Emit a lost events tracepoint to indicate that previous events
+                        * were lost -- the thread map cannot be trusted.  A new one must
+                        * be taken so tools can analyze the trace in a backwards-facing
+                        * fashion.
+                        */
                        kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0);
                        *tempbuf = lostevent;
                        wrapped = FALSE;
@@ -3673,94 +3744,138 @@ kdbg_read(user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uin
 
                /* While space left in merged events scratch buffer. */
                while (tempbuf_count) {
+                       bool lostevents = false;
+                       int lostcpu = 0;
                        earliest_time = UINT64_MAX;
                        min_kdbp = NULL;
                        min_cpu = 0;
 
-                       /* Check each CPU's buffers. */
+                       /* Check each CPU's buffers for the earliest event. */
                        for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page.kdebug_cpus; cpu++, kdbp++) {
-                               /* Skip CPUs without data. */
+                               /* Skip CPUs without data in their oldest storage unit. */
                                if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
 next_cpu:
                                        continue;
                                }
-                               /* Debugging aid: maintain a copy of the "kdsp"
-                                * index.
-                                */
-                               volatile union kds_ptr kdsp_shadow;
-
-                               kdsp_shadow = kdsp;
-
                                /* From CPU data to buffer header to buffer. */
                                kdsp_actual = POINTER_FROM_KDS_PTR(kdsp);
 
-                               volatile struct kd_storage *kdsp_actual_shadow;
-
-                               kdsp_actual_shadow = kdsp_actual;
-
-                               /* Skip buffer if there are no events left. */
+next_event:
+                               /* The next event to be read from this buffer. */
                                rcursor = kdsp_actual->kds_readlast;
 
+                               /* Skip this buffer if there are no events left. */
                                if (rcursor == kdsp_actual->kds_bufindx) {
                                        continue;
                                }
 
-                               t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
-
-                               /* Ignore events that have aged out due to wrapping. */
-                               while (t < barrier_min) {
-                                       rcursor = ++kdsp_actual->kds_readlast;
-
-                                       if (rcursor >= EVENTS_PER_STORAGE_UNIT) {
-                                               release_storage_unit(cpu, kdsp.raw);
+                               /*
+                                * Check that this storage unit wasn't stolen and events were
+                                * lost.  This must have happened while wrapping was disabled
+                                * in this function.
+                                */
+                               if (kdsp_actual->kds_lostevents) {
+                                       lostevents = true;
+                                       kdsp_actual->kds_lostevents = FALSE;
 
-                                               if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
-                                                       goto next_cpu;
-                                               }
-                                               kdsp_shadow = kdsp;
-                                               kdsp_actual = POINTER_FROM_KDS_PTR(kdsp);
-                                               kdsp_actual_shadow = kdsp_actual;
-                                               rcursor = kdsp_actual->kds_readlast;
+                                       /*
+                                        * The earliest event we can trust is the first one in this
+                                        * stolen storage unit.
+                                        */
+                                       uint64_t lost_time =
+                                                       kdbg_get_timestamp(&kdsp_actual->kds_records[0]);
+                                       if (kd_ctrl_page.oldest_time < lost_time) {
+                                               /*
+                                                * If this is the first time we've seen lost events for
+                                                * this gap, record its timestamp as the oldest
+                                                * timestamp we're willing to merge for the lost events
+                                                * tracepoint.
+                                                */
+                                               kd_ctrl_page.oldest_time = barrier_min = lost_time;
+                                               lostcpu = cpu;
                                        }
-
-                                       t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
                                }
 
+                               t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
+
                                if ((t > barrier_max) && (barrier_max > 0)) {
+                                       if (kdbg_debug) {
+                                               printf("kdebug: FUTURE EVENT: debugid %#8x: "
+                                                               "time %lld from CPU %u "
+                                                               "(barrier at time %lld, read %lu events)\n",
+                                                               kdsp_actual->kds_records[rcursor].debugid,
+                                                               t, cpu, barrier_max, *number + tempbuf_number);
+                                       }
                                        /*
-                                        * Need to flush IOPs again before we
-                                        * can sort any more data from the
-                                        * buffers.
+                                        * Need to flush IOPs again before we can sort any more
+                                        * data from the buffers.
                                         */
                                        out_of_events = TRUE;
                                        break;
                                }
                                if (t < kdsp_actual->kds_timestamp) {
                                        /*
-                                        * indicates we've not yet completed filling
-                                        * in this event...
-                                        * this should only occur when we're looking
-                                        * at the buf that the record head is utilizing
-                                        * we'll pick these events up on the next
-                                        * call to kdbg_read
-                                        * we bail at this point so that we don't
-                                        * get an out-of-order timestream by continuing
-                                        * to read events from the other CPUs' timestream(s)
+                                        * This indicates the event emitter hasn't completed
+                                        * filling in the event (because we're looking at the
+                                        * buffer that the record head is using).  The max barrier
+                                        * timestamp should have saved us from seeing these kinds
+                                        * of things, but other CPUs might be slow on the uptake.
+                                        *
+                                        * Bail out so we don't get out-of-order events by
+                                        * continuing to read events from other CPUs' events.
                                         */
                                        out_of_events = TRUE;
                                        break;
                                }
+
+                               /*
+                                * Ignore events that have aged out due to wrapping or storage
+                                * unit exhaustion while merging events.
+                                */
+                               if (t < barrier_min) {
+                                       kdsp_actual->kds_readlast++;
+
+                                       if (kdsp_actual->kds_readlast >= EVENTS_PER_STORAGE_UNIT) {
+                                               release_storage_unit(cpu, kdsp.raw);
+
+                                               if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
+                                                       goto next_cpu;
+                                               }
+                                               kdsp_actual = POINTER_FROM_KDS_PTR(kdsp);
+                                       }
+
+                                       goto next_event;
+                               }
+
+                               /*
+                                * Don't worry about merging any events -- just walk through
+                                * the CPUs and find the latest timestamp of lost events.
+                                */
+                               if (lostevents) {
+                                       continue;
+                               }
+
                                if (t < earliest_time) {
                                        earliest_time = t;
                                        min_kdbp = kdbp;
                                        min_cpu = cpu;
                                }
                        }
-                       if (min_kdbp == NULL || out_of_events == TRUE) {
+                       if (lostevents) {
                                /*
-                                * all buffers ran empty
+                                * If any lost events were hit in the buffers, emit an event
+                                * with the latest timestamp.
                                 */
+                               kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu);
+                               *tempbuf = lostevent;
+                               tempbuf->arg1 = 1;
+                               goto nextevent;
+                       }
+                       if (min_kdbp == NULL) {
+                               /* All buffers ran empty. */
                                out_of_events = TRUE;
+                       }
+                       if (out_of_events) {
                                break;
                        }
 
@@ -3774,11 +3889,12 @@ next_cpu:
                                release_storage_unit(min_cpu, kdsp.raw);
 
                        /*
-                        * Watch for out of order timestamps
+                        * Watch for out of order timestamps (from IOPs).
                         */
                        if (earliest_time < min_kdbp->kd_prev_timebase) {
                                /*
                                 * If we haven't already, emit a retrograde events event.
+                                * Otherwise, ignore this event.
                                 */
                                if (traced_retrograde) {
                                        continue;
@@ -3803,6 +3919,14 @@ nextevent:
                                break;
                }
                if (tempbuf_number) {
+                       /*
+                        * Remember the latest timestamp of events that we've merged so we
+                        * don't think we've lost events later.
+                        */
+                       uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1);
+                       if (kd_ctrl_page.oldest_time < latest_time) {
+                               kd_ctrl_page.oldest_time = latest_time;
+                       }
                        if (file_version == RAW_VERSION3) {
                                if ( !(kdbg_write_v3_event_chunk_header(buffer, V3_RAW_EVENTS, (tempbuf_number * sizeof(kd_buf)), vp, ctx))) {
                                        error = EFAULT;
@@ -3820,7 +3944,7 @@ nextevent:
                                error = kdbg_write_to_vnode((caddr_t)kdcopybuf, write_size, vp, ctx, RAW_file_offset);
                                if (!error)
                                        RAW_file_offset += write_size;
-       
+
                                if (RAW_file_written >= RAW_FLUSH_SIZE) {
                                        error = VNOP_FSYNC(vp, MNT_NOWAIT, ctx);
 
@@ -3849,7 +3973,7 @@ check_error:
                        tempbuf_count = KDCOPYBUF_COUNT;
        }
        if ( !(old_kdebug_flags & KDBG_NOWRAP)) {
-               enable_wrap(old_kdebug_slowcheck, lostevents);
+               enable_wrap(old_kdebug_slowcheck);
        }
        thread_clear_eager_preempt(current_thread());
        return (error);
@@ -3883,6 +4007,12 @@ kdbg_test(size_t flavor)
                KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
                KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;
 
+               KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code)); code++;
+               KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1); code++;
+               KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2); code++;
+               KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
+               KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;
+
                KDBG_DEBUG(KDEBUG_TEST_CODE(code)); code++;
                KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1); code++;
                KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2); code++;
@@ -3906,6 +4036,7 @@ kdbg_test(size_t flavor)
                                (uintptr_t)thread_tid(current_thread()));
                code++;
                break;
+
        default:
                return ENOTSUP;
        }
@@ -4181,6 +4312,10 @@ SYSCTL_PROC(_kern_kdbg, OID_AUTO, experimental_continuous,
                sizeof(int), kdbg_sysctl_continuous, "I",
                "Set kdebug to use mach_continuous_time");
 
+SYSCTL_INT(_kern_kdbg, OID_AUTO, debug,
+               CTLFLAG_RW | CTLFLAG_LOCKED,
+               &kdbg_debug, 0, "Set kdebug debug mode");
+
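
The kern.kdbg.debug sysctl registered above gates the new "FUTURE EVENT" diagnostics printed from kdbg_read(). A small userspace sketch of toggling it (illustrative; needs the usual privileges):

/* Userspace sketch: enable the kdebug debug diagnostics added above. */
#include <sys/sysctl.h>

static int
enable_kdbg_debug(void)
{
        int on = 1;
        return sysctlbyname("kern.kdbg.debug", NULL, NULL, &on, sizeof(on));
}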
 SYSCTL_QUAD(_kern_kdbg, OID_AUTO, oldest_time,
                CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED,
                &kd_ctrl_page.oldest_time,
index 08f8d135fd126d621d9dd82fd170e83bf053bb09..b0a82bb82c7a33069e213ea6c41840be7cf2d712 100644 (file)
@@ -1506,6 +1506,7 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
        OSIncrementAtomic(&lio_contexts_alloced);
 #endif /* DEBUG */
 
+       free_context = TRUE;
        bzero(lio_context, sizeof(aio_lio_context));
        
        aiocbpp = aio_copy_in_list(p, uap->aiocblist, uap->nent);
@@ -1527,6 +1528,7 @@ lio_listio(proc_t p, struct lio_listio_args *uap, int *retval )
        }
 
        /* process list of aio requests */
+       free_context = FALSE;
        lio_context->io_issued = uap->nent;
        lio_context->io_waiter = uap->mode == LIO_WAIT ? 1 : 0; /* Should it be freed by last AIO */
        for ( i = 0; i < uap->nent; i++ ) {
@@ -1645,7 +1647,7 @@ ExitRoutine:
                FREE( entryp_listp, M_TEMP );
        if ( aiocbpp != NULL )
                FREE( aiocbpp, M_TEMP );
-       if ((lio_context != NULL) && ((lio_context->io_issued == 0) || (free_context == TRUE))) {
+       if (free_context) {
                free_lio_context(lio_context);
        }
        
index d6bf9cf91a8fa17874d7d4b76e325c2b74324c73..574cc24c8e3e5cad8516238324df61eebae7161e 100644 (file)
@@ -512,6 +512,10 @@ kauth_authorize_process_callback(kauth_cred_t credential, __unused void *idata,
  *             arg0 is pointer to vnode (vnode *) for file to be closed.
  *             arg1 is pointer to path (char *) of file to be closed.
  *             arg2 is close flags.
+ * arguments passed to KAUTH_FILEOP_WILL_RENAME listeners
+ *             arg0 is pointer to vnode (vnode *) of the file being renamed
+ *             arg1 is pointer to the "from" path (char *)
+ *             arg2 is pointer to the "to" path (char *)
  * arguments passed to KAUTH_FILEOP_RENAME listeners
  *             arg0 is pointer to "from" path (char *).
  *             arg1 is pointer to "to" path (char *).
@@ -550,7 +554,10 @@ kauth_authorize_fileop(kauth_cred_t credential, kauth_action_t action, uintptr_t
                return(0);
        }
 
-       if (action == KAUTH_FILEOP_OPEN || action == KAUTH_FILEOP_CLOSE || action == KAUTH_FILEOP_EXEC) {
+       if (action == KAUTH_FILEOP_OPEN ||
+           action == KAUTH_FILEOP_CLOSE ||
+           action == KAUTH_FILEOP_EXEC ||
+           action == KAUTH_FILEOP_WILL_RENAME) {
                /* get path to the given vnode as a convenience to our listeners.
                 */
                namep = get_pathbuff();
@@ -559,8 +566,15 @@ kauth_authorize_fileop(kauth_cred_t credential, kauth_action_t action, uintptr_t
                        release_pathbuff(namep);
                        return(0);
                }
-               if (action == KAUTH_FILEOP_CLOSE) {
-                       arg2 = arg1;  /* close has some flags that come in via arg1 */
+               if (action == KAUTH_FILEOP_CLOSE ||
+                   action == KAUTH_FILEOP_WILL_RENAME) {
+                       /*
+                        * - Close has some flags that come in via arg1.
+                        * - Will-rename wants to pass the vnode and
+                        *   both paths to the listeners ("to" path
+                        *   starts in arg1, moves to arg2).
+                        */
+                       arg2 = arg1;
                }
                arg1 = (uintptr_t)namep;
        }       
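
The new KAUTH_FILEOP_WILL_RENAME notification documented above hands listeners the vnode plus both paths. Below is a minimal sketch of a fileop-scope listener consuming it, assuming the standard kauth listener API; the function name is illustrative and, as with other fileop notifications, the result is advisory only.

#include <sys/kauth.h>
#include <sys/vnode.h>

/* Illustrative listener showing the WILL_RENAME argument layout. */
static int
sketch_fileop_listener(kauth_cred_t cred, void *idata, kauth_action_t action,
    uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
{
        if (action == KAUTH_FILEOP_WILL_RENAME) {
                vnode_t vp       = (vnode_t)arg0;        /* file being renamed */
                const char *from = (const char *)arg1;   /* current path */
                const char *to   = (const char *)arg2;   /* destination path */
                (void)vp; (void)from; (void)to;
        }
        (void)cred; (void)idata; (void)arg3;
        return KAUTH_RESULT_DEFER;
}

/* Registration, e.g. from a kext start routine (illustrative):
 *   kauth_listen_scope(KAUTH_SCOPE_FILEOP, sketch_fileop_listener, NULL);
 */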
@@ -948,7 +962,6 @@ out:
 int
 kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp)
 {
-       user_addr_t uaddr, known_bound;
        int error;
        kauth_filesec_t fsec;
        u_int32_t count;
@@ -965,10 +978,18 @@ kauth_copyinfilesec(user_addr_t xsecurity, kauth_filesec_t *xsecdestpp)
         *
         * The upper bound must be less than KAUTH_ACL_MAX_ENTRIES.  The
         * value here is fairly arbitrary.  It's ok to have a zero count.
+        *
+        * Because we're just using these values to make a guess about the
+        * number of entries, the actual address doesn't matter, only their
+        * relative offsets into the page.  We take advantage of this to
+        * avoid an overflow in the rounding step (this is a user-provided
+        * parameter, so caution pays off).
         */
-       known_bound = xsecurity +  KAUTH_FILESEC_SIZE(0);
-       uaddr = mach_vm_round_page(known_bound);
-       count = (uaddr - known_bound) / sizeof(struct kauth_ace);
+       {
+               user_addr_t known_bound = (xsecurity & PAGE_MASK) + KAUTH_FILESEC_SIZE(0);
+               user_addr_t uaddr = mach_vm_round_page(known_bound);
+               count = (uaddr - known_bound) / sizeof(struct kauth_ace);
+       }
        if (count > 32)
                count = 32;
 restart:
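
The rewritten bound computation above deliberately uses only the pointer's offset within its page, so the addition cannot wrap on a hostile user address while the entry-count guess stays the same. A standalone sketch of the arithmetic with stand-in constants (the real code uses KAUTH_FILESEC_SIZE(0), mach_vm_round_page() and sizeof(struct kauth_ace)):

#include <stdint.h>
#include <stdio.h>

#define SK_PAGE_SIZE 4096u
#define SK_PAGE_MASK (SK_PAGE_SIZE - 1)
#define SK_HDR_SIZE  52u        /* stand-in for KAUTH_FILESEC_SIZE(0) */
#define SK_ACE_SIZE  24u        /* stand-in for sizeof(struct kauth_ace) */

int
main(void)
{
        uint64_t xsecurity = UINT64_MAX - 15;   /* hostile pointer near the top */

        /* Old form: xsecurity + SK_HDR_SIZE wraps past 2^64, so the rounded
         * bound lands below known_bound and the subtraction goes wrong. */

        /* New form: only the in-page offset participates, so no wrap. */
        uint64_t known_bound = (xsecurity & SK_PAGE_MASK) + SK_HDR_SIZE;
        uint64_t uaddr = (known_bound + SK_PAGE_MASK) & ~(uint64_t)SK_PAGE_MASK;
        uint64_t count = (uaddr - known_bound) / SK_ACE_SIZE;   /* later clamped to 32 */

        printf("guessed ACE count: %llu\n", (unsigned long long)count);
        return 0;
}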
index 9b175b00926ae58991cfa055d6de56218b3edd84..925994950dc6390348e9a4b960e57dcf40357c84 100644 (file)
@@ -68,7 +68,7 @@ backtrace_sysctl SYSCTL_HANDLER_ARGS
        if (!bt) {
                return ENOBUFS;
        }
-
+       bzero(bt, sizeof(uintptr_t) * bt_len);
        err = backtrace_user(bt, bt_len, &bt_filled, &user_64);
        if (err) {
                goto out;
index 73a9a454b8ffd7f2376a47c3149f9b37e7b4814f..07acd675c7f7c170fb73c04f787a75fbd0a2b8ae 100644 (file)
@@ -134,7 +134,7 @@ process_cpu_type(proc_t core_proc)
 {
        cpu_type_t what_we_think;
 #if defined (__i386__) || defined (__x86_64__)
-    if (IS_64BIT_PROCESS(core_proc)) {
+       if (IS_64BIT_PROCESS(core_proc)) {
                what_we_think = CPU_TYPE_X86_64;
        } else {
                what_we_think = CPU_TYPE_I386;
@@ -146,6 +146,7 @@ process_cpu_type(proc_t core_proc)
                what_we_think = CPU_TYPE_ARM;
        }
 #endif
+
        return what_we_think;
 }
 
@@ -154,13 +155,13 @@ process_cpu_subtype(proc_t core_proc)
 {
        cpu_type_t what_we_think;
 #if defined (__i386__) || defined (__x86_64__)
-    if (IS_64BIT_PROCESS(core_proc)) {
+       if (IS_64BIT_PROCESS(core_proc)) {
                what_we_think = CPU_SUBTYPE_X86_64_ALL;
        } else {
                what_we_think = CPU_SUBTYPE_I386_ALL;
        }
 #elif defined (__arm__) || defined(__arm64__)
-    if (IS_64BIT_PROCESS(core_proc)) {
+       if (IS_64BIT_PROCESS(core_proc)) {
                what_we_think = CPU_SUBTYPE_ARM64_ALL;
        } else {
                what_we_think = CPU_SUBTYPE_ARM_ALL;
index c433affeaacc05b86deb890846d7ec4a42c7e984..141807dc8630d195ddfc8f337c5fd361150c622d 100644 (file)
@@ -789,7 +789,7 @@ kauth_resolver_getwork_continue(int result)
 
        thread = current_thread();
        ut = get_bsdthread_info(thread);
-       message = ut->uu_kevent.uu_kauth.message;
+       message = ut->uu_save.uus_kauth.message;
        return(kauth_resolver_getwork2(message));
 }
 
@@ -916,7 +916,7 @@ kauth_resolver_getwork(user_addr_t message)
                thread_t thread = current_thread();
                struct uthread *ut = get_bsdthread_info(thread);
 
-               ut->uu_kevent.uu_kauth.message = message;
+               ut->uu_save.uus_kauth.message = message;
                error = msleep0(&kauth_resolver_unsubmitted, kauth_resolver_mtx, PCATCH, "GRGetWork", 0, kauth_resolver_getwork_continue);
                KAUTH_RESOLVER_UNLOCK();
                /*
index 92d1c43bea64332a043e658fc485a67aeddff5ae..2b40cea3e43f610f7222fb3271d4e59298d119cd 100644 (file)
@@ -79,19 +79,42 @@ unsigned long cs_procs_invalidated = 0;
 int cs_force_kill = 0;
 int cs_force_hard = 0;
 int cs_debug = 0;
+// If set, AMFI will error out early on unsigned code, before evaluating the normal policy.
+int cs_debug_fail_on_unsigned_code = 0;
+// If the previous mode is enabled, we count the resulting failures here.
+unsigned int cs_debug_unsigned_exec_failures = 0;
+unsigned int cs_debug_unsigned_mmap_failures = 0;
+
 #if SECURE_KERNEL
-const int cs_enforcement_enable = 1;
+/*
+Here we split cs_enforcement_enable into cs_system_enforcement_enable and cs_process_enforcement_enable
+
+cs_system_enforcement_enable governs whether or not system level code signing enforcement mechanisms
+are applied on the system. Today, the only such mechanism is code signing enforcement of the dyld shared
+cache.
+
+cs_process_enforcement_enable governs whether code signing enforcement mechanisms are applied to all
+processes or only those that opt into such enforcement.
+
+(On iOS and related, both of these are set by default. On macOS, only cs_system_enforcement_enable
+is set by default. Processes can then be opted into code signing enforcement on a case by case basis.)
+ */
+const int cs_system_enforcement_enable = 1;
+const int cs_process_enforcement_enable = 1;
 const int cs_library_val_enable = 1;
 #else /* !SECURE_KERNEL */
 int cs_enforcement_panic=0;
 int cs_relax_platform_task_ports = 0;
 
 #if CONFIG_ENFORCE_SIGNED_CODE
-#define DEFAULT_CS_ENFORCEMENT_ENABLE 1
+#define DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE 1
+#define DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE 1
 #else
-#define DEFAULT_CS_ENFORCEMENT_ENABLE 0
+#define DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE 1
+#define DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE 0
 #endif
-SECURITY_READ_ONLY_LATE(int) cs_enforcement_enable = DEFAULT_CS_ENFORCEMENT_ENABLE;
+SECURITY_READ_ONLY_LATE(int) cs_system_enforcement_enable = DEFAULT_CS_SYSTEM_ENFORCEMENT_ENABLE;
+SECURITY_READ_ONLY_LATE(int) cs_process_enforcement_enable = DEFAULT_CS_PROCESS_ENFORCEMENT_ENABLE;
 
 #if CONFIG_ENFORCE_LIBRARY_VALIDATION
 #define DEFAULT_CS_LIBRARY_VA_ENABLE 1
@@ -108,15 +131,22 @@ static lck_grp_t *cs_lockgrp;
 SYSCTL_INT(_vm, OID_AUTO, cs_force_kill, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_kill, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_force_hard, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_force_hard, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_debug, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_debug, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, cs_debug_fail_on_unsigned_code, CTLFLAG_RW | CTLFLAG_LOCKED,
+                          &cs_debug_fail_on_unsigned_code, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, cs_debug_unsigned_exec_failures, CTLFLAG_RD | CTLFLAG_LOCKED,
+                          &cs_debug_unsigned_exec_failures, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, cs_debug_unsigned_mmap_failures, CTLFLAG_RD | CTLFLAG_LOCKED,
+                          &cs_debug_unsigned_mmap_failures, 0, "");
 
 SYSCTL_INT(_vm, OID_AUTO, cs_all_vnodes, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_all_vnodes, 0, "");
 
 #if !SECURE_KERNEL
-SYSCTL_INT(_vm, OID_AUTO, cs_enforcement, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_enforcement_enable, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, cs_system_enforcement, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_system_enforcement_enable, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, cs_process_enforcement, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_process_enforcement_enable, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, cs_enforcement_panic, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_enforcement_panic, 0, "");
 
 #if !CONFIG_ENFORCE_LIBRARY_VALIDATION
-SYSCTL_INT(_vm, OID_AUTO, cs_library_validation, CTLFLAG_RW | CTLFLAG_LOCKED, &cs_library_val_enable, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, cs_library_validation, CTLFLAG_RD | CTLFLAG_LOCKED, &cs_library_val_enable, 0, "");
 #endif
 #endif /* !SECURE_KERNEL */
 
@@ -125,17 +155,20 @@ int panic_on_cs_killed = 0;
 void
 cs_init(void)
 {
-#if MACH_ASSERT && __x86_64__
+#if MACH_ASSERT
+#if PLATFORM_WatchOS || __x86_64__
        panic_on_cs_killed = 1;
-#endif /* MACH_ASSERT && __x86_64__ */
+#endif /* watchos || x86_64 */
+#endif /* MACH_ASSERT */
        PE_parse_boot_argn("panic_on_cs_killed", &panic_on_cs_killed,
                           sizeof (panic_on_cs_killed));
 #if !SECURE_KERNEL
        int disable_cs_enforcement = 0;
        PE_parse_boot_argn("cs_enforcement_disable", &disable_cs_enforcement, 
                           sizeof (disable_cs_enforcement));
-       if (disable_cs_enforcement) {
-               cs_enforcement_enable = 0;
+       if (disable_cs_enforcement && PE_i_can_has_debugger(NULL) != 0) {
+               cs_system_enforcement_enable = 0;
+               cs_process_enforcement_enable = 0;
        } else {
                int panic = 0;
                PE_parse_boot_argn("cs_enforcement_panic", &panic, sizeof(panic));
@@ -165,7 +198,7 @@ cs_allow_invalid(struct proc *p)
 #if MACH_ASSERT
        lck_mtx_assert(&p->p_mlock, LCK_MTX_ASSERT_NOTOWNED);
 #endif
-#if CONFIG_MACF && CONFIG_ENFORCE_SIGNED_CODE
+#if CONFIG_MACF
        /* There needs to be a MAC policy to implement this hook, or else the
         * kill bits will be cleared here every time. If we have 
         * CONFIG_ENFORCE_SIGNED_CODE, we can assume there is a policy
@@ -262,10 +295,10 @@ cs_invalid_page(addr64_t vaddr, boolean_t *cs_killed)
  */
 
 int
-cs_enforcement(struct proc *p)
+cs_process_enforcement(struct proc *p)
 {
 
-       if (cs_enforcement_enable)
+       if (cs_process_enforcement_enable)
                return 1;
        
        if (p == NULL)
@@ -277,6 +310,18 @@ cs_enforcement(struct proc *p)
        return 0;
 }
 
+int
+cs_process_global_enforcement(void)
+{
+       return cs_process_enforcement_enable ? 1 : 0;
+}
+
+int
+cs_system_enforcement(void)
+{
+       return cs_system_enforcement_enable ? 1 : 0;
+}
+
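+
To make the split described in the comment further up concrete, here is a minimal sketch of how a hypothetical check might consult the two new accessors; the call site and helper name are illustrative, not taken from this commit.

/* Illustrative only; mirrors the semantics described in the comment above. */
static void
sketch_cs_policy_checks(struct proc *p)
{
        if (cs_system_enforcement()) {
                /* System-wide mechanisms, e.g. code-signing enforcement of
                 * the dyld shared cache, are in effect regardless of any
                 * per-process opt-in. */
        }

        if (cs_process_enforcement(p)) {
                /* This process is subject to code-signing enforcement, either
                 * because cs_process_enforcement_enable is set globally or
                 * because it opted in with CS_ENFORCEMENT. */
        }
}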
 /*
  * Returns whether a given process is still valid.
  */
@@ -312,6 +357,18 @@ cs_require_lv(struct proc *p)
        return 0;
 }
 
+int
+csproc_forced_lv(struct proc* p)
+{
+       if (p == NULL) {
+               p = current_proc();
+       }
+       if (p != NULL && (p->p_csflags & CS_FORCED_LV)) {
+               return 1;
+       }
+       return 0;
+}
+
 /*
  * <rdar://problem/24634089> added to allow system level library
  *  validation check at mac_cred_label_update_execve time
@@ -610,6 +667,56 @@ csproc_clear_platform_binary(struct proc *p)
 }
 #endif
 
+void
+csproc_disable_enforcement(struct proc* __unused p)
+{
+#if !CONFIG_ENFORCE_SIGNED_CODE
+       if (p != NULL) {
+               proc_lock(p);
+               p->p_csflags &= (~CS_ENFORCEMENT);
+               proc_unlock(p);
+       }
+#endif
+}
+
+/* Function: csproc_mark_invalid_allowed
+ *
+ * Description: Mark the process as being allowed to go invalid. Called as part of
+ *             task_for_pid and ptrace policy. Note CS_INVALID_ALLOWED only matters for
+ *             processes that have been opted into CS_ENFORCEMENT.
+ */
+void
+csproc_mark_invalid_allowed(struct proc* __unused p)
+{
+#if !CONFIG_ENFORCE_SIGNED_CODE
+       if (p != NULL) {
+               proc_lock(p);
+               p->p_csflags |= CS_INVALID_ALLOWED;
+               proc_unlock(p);
+       }
+#endif
+}
+
+/*
+ * Function: csproc_check_invalid_allowed
+ *
+ * Description: Returns 1 if the process has been marked as allowed to go invalid
+ *             because it gave its task port to an allowed process.
+ */
+int
+csproc_check_invalid_allowed(struct proc* __unused p)
+{
+#if !CONFIG_ENFORCE_SIGNED_CODE
+       if (p == NULL) {
+               p = current_proc();
+       }
+
+       if (p != NULL && (p->p_csflags & CS_INVALID_ALLOWED))
+               return 1;
+#endif
+       return 0;
+}
+
 /*
  * Function: csproc_get_prod_signed
  *
@@ -908,6 +1015,12 @@ cs_restricted(struct proc *p)
        return (p->p_csflags & CS_RESTRICT) ? 1 : 0;
 }
 
+int
+csproc_hardened_runtime(struct proc* p)
+{
+       return (p->p_csflags & CS_RUNTIME) ? 1 : 0;
+}
+
 /*
  * Function: csfg_get_path
  *
index 48904239ef98e32928cf7e512b69e072a3681f86..efc8616f7fbeab25555248667f3dbd98771dd625 100644 (file)
@@ -776,7 +776,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
        char *pop;
        struct vnode *vp = NULLVP;      /* for AUDIT_ARG() at end */
        int i, tmp, error, error2, flg = 0;
-       struct flock fl;
+       struct flock fl = {};
        struct flocktimeout fltimeout;
        struct timespec *timeout = NULL;
        struct vfs_context context;
@@ -1139,10 +1139,8 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
 
        case F_GETLK:
        case F_OFD_GETLK:
-#if CONFIG_EMBEDDED
        case F_GETLKPID:
        case F_OFD_GETLKPID:
-#endif
                if (fp->f_type != DTYPE_VNODE) {
                        error = EBADF;
                        goto out;
@@ -1553,7 +1551,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
 
        case F_LOG2PHYS:
        case F_LOG2PHYS_EXT: {
-               struct log2phys l2p_struct;    /* structure for allocate command */
+               struct log2phys l2p_struct = {};    /* structure for allocate command */
                int devBlockSize;
 
                off_t file_offset = 0;
@@ -1865,12 +1863,16 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        if (uap->cmd == F_ADDFILESIGS_FOR_DYLD_SIM) {
                                error = ubc_cs_blob_revalidate(vp, blob, NULL, blob_add_flags);
                                if (error) {
-                                       vnode_put(vp);
-                                       goto outdrop;
+                                       blob = NULL;
+                                       if (error != EAGAIN) {
+                                               vnode_put(vp);
+                                               goto outdrop;
+                                       }
                                }
                        }
+               }
 
-               } else {
+               if (blob == NULL) {
                        /*
                         * An arbitrary limit, to prevent someone from mapping in a 20GB blob.  This should cover
                         * our use cases for the immediate future, but note that at the time of this commit, some
@@ -2086,7 +2088,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                        .len = CP_MAX_WRAPPEDKEYSIZE,
                };
 
-               MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK);
+               MALLOC(k.key, char *, k.len, M_TEMP, M_WAITOK | M_ZERO);
 
                error = VNOP_IOCTL(vp, F_TRANSCODEKEY, (caddr_t)&k, 1, &context);
 
@@ -2168,7 +2170,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                /* For now, special case HFS+ only, since this is SPI. */
                src_vp = (struct vnode *)fp->f_data;
                if (src_vp->v_tag != VT_HFS) {
-                       error = EINVAL;
+                       error = ENOTSUP;
                        goto out;
                }
 
@@ -2188,7 +2190,7 @@ fcntl_nocancel(proc_t p, struct fcntl_nocancel_args *uap, int32_t *retval)
                dst_vp = (struct vnode *)fp2->f_data;
                if (dst_vp->v_tag != VT_HFS) {
                        fp_drop(p, fd2, fp2, 1);
-                       error = EINVAL;
+                       error = ENOTSUP;
                        goto out;
                }
 
@@ -2886,7 +2888,7 @@ close_internal_locked(proc_t p, int fd, struct fileproc *fp, int flags)
        }
 
        if (fd < fdp->fd_knlistsize)
-               knote_fdclose(p, fd, FALSE);
+               knote_fdclose(p, fd);
 
        if (fp->f_flags & FP_WAITEVENT)
                (void)waitevent_close(p, fp);
@@ -4694,9 +4696,8 @@ fdexec(proc_t p, short flags, int self_exec)
         * If the current thread is bound as a workq/workloop
         * servicing thread, we need to unbind it first.
         */
-       if (ut->uu_kqueue_bound && self_exec) {
-               kevent_qos_internal_unbind(p, 0, self,
-                                          ut->uu_kqueue_flags);
+       if (ut->uu_kqr_bound && self_exec) {
+               kqueue_threadreq_unbind(p, ut->uu_kqr_bound);
        }
 
        proc_fdlock(p);
@@ -5048,6 +5049,12 @@ fdfree(proc_t p)
        assert(fdp->fd_knlistsize == -1);
        assert(fdp->fd_knhashmask == 0);
 
+       /*
+        * Deallocate all workloops that still have outstanding retains
+        * because they were created with scheduling parameters.
+        */
+       kqworkloops_dealloc(p);
+
        /* close file descriptors */
        if (fdp->fd_nfiles > 0 && fdp->fd_ofiles) {
                for (i = fdp->fd_lastfile; i >= 0; i--) {
index f07aa6d17ba1a1cebe1487a92991c02d8f97102d..d8096ba03cf605d5d97eb3f9eabadd8fc0cd9784 100644 (file)
@@ -55,7 +55,7 @@
  *     @(#)kern_event.c       1.0 (3/31/2000)
  */
 #include <stdint.h>
-#include <stdatomic.h>
+#include <machine/atomic.h>
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -87,6 +87,7 @@
 #include <sys/kdebug.h>
 #include <sys/reason.h>
 #include <os/reason_private.h>
+#include <pexpert/pexpert.h>
 
 #include <kern/locks.h>
 #include <kern/clock.h>
 #include <kern/thread.h>
 #include <kern/kcdata.h>
 
+#include <pthread/priority_private.h>
+#include <pthread/workqueue_syscalls.h>
+#include <pthread/workqueue_internal.h>
 #include <libkern/libkern.h>
 #include <libkern/OSAtomic.h>
 
@@ -119,25 +123,10 @@ extern mach_port_name_t ipc_entry_name_mask(mach_port_name_t name); /* osfmk/ipc
 
 #define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
 
-/*
- * JMM - this typedef needs to be unified with pthread_priority_t
- *       and mach_msg_priority_t. It also needs to be the same type
- *       everywhere.
- */
-typedef int32_t qos_t;
-
 MALLOC_DEFINE(M_KQUEUE, "kqueue", "memory for kqueue system");
 
 #define        KQ_EVENT        NO_EVENT64
 
-#define KNUSE_NONE       0x0
-#define KNUSE_STEAL_DROP 0x1
-#define KNUSE_BOOST      0x2
-static int kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags);
-static int kqlock2knotedrop(struct kqueue *kq, struct knote *kn);
-static int kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags);
-static int knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags);
-
 static int kqueue_read(struct fileproc *fp, struct uio *uio,
                int flags, vfs_context_t ctx);
 static int kqueue_write(struct fileproc *fp, struct uio *uio,
@@ -166,7 +155,7 @@ static void kevent_put_kq(struct proc *p, kqueue_id_t id, struct fileproc *fp, s
 static int kevent_internal(struct proc *p,
                           kqueue_id_t id, kqueue_id_t *id_out,
                           user_addr_t changelist, int nchanges,
-                          user_addr_t eventlist, int nevents, 
+                          user_addr_t eventlist, int nevents,
                           user_addr_t data_out, uint64_t data_available,
                           unsigned int flags, user_addr_t utimeout,
                           kqueue_continue_t continuation,
@@ -177,39 +166,34 @@ static int kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp,
                          struct proc *p, unsigned int flags);
 char * kevent_description(struct kevent_internal_s *kevp, char *s, size_t n);
 
+static int kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev);
+static void kevent_register_wait_block(struct turnstile *ts, thread_t handoff_thread,
+               struct knote_lock_ctx *knlc, thread_continue_t cont,
+               struct _kevent_register *cont_args) __dead2;
+static void kevent_register_wait_return(struct _kevent_register *cont_args) __dead2;
+static void kevent_register_wait_cleanup(struct knote *kn);
+static inline void kqueue_release_last(struct proc *p, kqueue_t kqu);
 static void kqueue_interrupt(struct kqueue *kq);
 static int kevent_callback(struct kqueue *kq, struct kevent_internal_s *kevp,
-                          void *data);
+               void *data);
 static void kevent_continue(struct kqueue *kq, void *data, int error);
 static void kqueue_scan_continue(void *contp, wait_result_t wait_result);
 static int kqueue_process(struct kqueue *kq, kevent_callback_t callback, void *callback_data,
-                          struct filt_process_s *process_data, int *countp, struct proc *p);
-static struct kqtailq *kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index);
-static struct kqtailq *kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index);
+               struct filt_process_s *process_data, int *countp);
 static int kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index);
 
-static struct kqtailq *kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index);
+static struct kqtailq *kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn);
+static void kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos, int flags);
 
-static void kqworkq_request_thread(struct kqworkq *kqwq, kq_index_t qos_index);
-static void kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index);
-static void kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index);
-static void kqworkq_bind_thread_impl(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
-static void kqworkq_unbind_thread(struct kqworkq *kqwq, kq_index_t qos_index, thread_t thread, unsigned int flags);
+static void kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn, kq_index_t qos);
+static void kqworkq_unbind(proc_t p, struct kqrequest *kqr);
+static thread_qos_t kqworkq_unbind_locked(struct kqworkq *kqwq, struct kqrequest *kqr, thread_t thread);
 static struct kqrequest *kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index);
 
-enum {
-       KQWL_UO_NONE = 0,
-       KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI = 0x1,
-       KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI = 0x2,
-       KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS = 0x4,
-       KQWL_UO_UPDATE_OVERRIDE_LAZY = 0x8
-};
-
-static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t qos_index, kq_index_t override_index, uint32_t flags);
-static void kqworkloop_bind_thread_impl(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
-static void kqworkloop_unbind_thread(struct kqworkloop *kqwl, thread_t thread, unsigned int flags);
-static inline kq_index_t kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *);
-static void kqworkloop_update_suppress_sync_count(struct kqrequest *kqr, uint32_t flags);
+static void kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index);
+static void kqworkloop_unbind(proc_t p, struct kqworkloop *kwql);
+static thread_qos_t kqworkloop_unbind_locked(struct kqworkloop *kwql, thread_t thread);
+static kq_index_t kqworkloop_owner_override(struct kqworkloop *kqwl);
 enum {
        KQWL_UTQ_NONE,
        /*
@@ -223,6 +207,8 @@ enum {
        KQWL_UTQ_UPDATE_WAKEUP_QOS,
        KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
        KQWL_UTQ_RECOMPUTE_WAKEUP_QOS,
+       KQWL_UTQ_UNBINDING, /* attempt to rebind */
+       KQWL_UTQ_PARKING,
        /*
         * The wakeup override is for suppressed knotes that have fired again at
         * a higher QoS than the one for which they are suppressed already.
@@ -231,35 +217,26 @@ enum {
        KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
        KQWL_UTQ_RESET_WAKEUP_OVERRIDE,
        /*
-        * The async QoS is the maximum QoS of an event enqueued on this workloop in
+        * The QoS is the maximum QoS of an event enqueued on this workloop in
         * userland. It is copied from the only EVFILT_WORKLOOP knote with
         * a NOTE_WL_THREAD_REQUEST bit set allowed on this workloop. If there is no
         * such knote, this QoS is 0.
         */
-       KQWL_UTQ_SET_ASYNC_QOS,
-       /*
-        * The sync waiters QoS is the maximum QoS of any thread blocked on an
-        * EVFILT_WORKLOOP knote marked with the NOTE_WL_SYNC_WAIT bit.
-        * If there is no such knote, this QoS is 0.
-        */
-       KQWL_UTQ_SET_SYNC_WAITERS_QOS,
+       KQWL_UTQ_SET_QOS_INDEX,
        KQWL_UTQ_REDRIVE_EVENTS,
 };
 static void kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos);
 static void kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index);
+static int kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags);
 
 static int knote_process(struct knote *kn, kevent_callback_t callback, void *callback_data,
-                        struct filt_process_s *process_data, struct proc *p);
-#if 0
-static void knote_put(struct knote *kn);
-#endif
+                        struct filt_process_s *process_data);
 
 static int kq_add_knote(struct kqueue *kq, struct knote *kn,
-               struct kevent_internal_s *kev, struct proc *p, int *knoteuse_flags);
+               struct knote_lock_ctx *knlc, struct proc *p);
 static struct knote *kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev, bool is_fd, struct proc *p);
-static void kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p, kn_status_t *kn_status, uint16_t *kq_state);
 
-static void knote_drop(struct knote *kn, struct proc *p);
+static void knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc);
 static struct knote *knote_alloc(void);
 static void knote_free(struct knote *kn);
 
@@ -276,109 +253,56 @@ static void knote_suppress(struct knote *kn);
 static void knote_unsuppress(struct knote *kn);
 static void knote_wakeup(struct knote *kn);
 
-static kq_index_t knote_get_queue_index(struct knote *kn);
-static struct kqtailq *knote_get_queue(struct knote *kn);
-static kq_index_t knote_get_req_index(struct knote *kn);
-static kq_index_t knote_get_qos_index(struct knote *kn);
-static void knote_set_qos_index(struct knote *kn, kq_index_t qos_index);
+static bool knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn,
+               int result, thread_qos_t *qos_out);
+static void knote_apply_qos_override(struct knote *kn, kq_index_t qos_index);
+static void knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result);
+static void knote_reset_priority(struct knote *kn, pthread_priority_t pp);
 static kq_index_t knote_get_qos_override_index(struct knote *kn);
-static kq_index_t knote_get_sync_qos_override_index(struct knote *kn);
-static void knote_set_qos_override_index(struct knote *kn, kq_index_t qos_index, boolean_t override_is_sync);
 static void knote_set_qos_overcommit(struct knote *kn);
 
-static int filt_fileattach(struct knote *kn, struct kevent_internal_s *kev);
-SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
-       .f_isfd = 1,
-       .f_attach = filt_fileattach,
-};
+static zone_t knote_zone;
+static zone_t kqfile_zone;
+static zone_t kqworkq_zone;
+static zone_t kqworkloop_zone;
+#if DEVELOPMENT || DEBUG
+#define KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK  (1U << 0)
+#define KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS     (1U << 1)
+#define KEVENT_PANIC_BOOT_ARG_INITIALIZED        (1U << 31)
 
-static void filt_kqdetach(struct knote *kn);
-static int filt_kqueue(struct knote *kn, long hint);
-static int filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
-       .f_isfd = 1,
-       .f_detach = filt_kqdetach,
-       .f_event = filt_kqueue,
-       .f_touch = filt_kqtouch,
-       .f_process = filt_kqprocess,
-};
+#define KEVENT_PANIC_DEFAULT_VALUE (0)
+static uint32_t
+kevent_debug_flags(void)
+{
+       static uint32_t flags = KEVENT_PANIC_DEFAULT_VALUE;
+
+       if ((flags & KEVENT_PANIC_BOOT_ARG_INITIALIZED) == 0) {
+               uint32_t value = 0;
+               if (!PE_parse_boot_argn("kevent_debug", &value, sizeof(value))) {
+                       value = KEVENT_PANIC_DEFAULT_VALUE;
+               }
+               value |= KEVENT_PANIC_BOOT_ARG_INITIALIZED;
+               os_atomic_store(&flags, value, relaxed);
+       }
+       return flags;
+}
+#endif
+
+#define        KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
 
 /* placeholder for not-yet-implemented filters */
 static int filt_badattach(struct knote *kn, struct kevent_internal_s *kev);
+static int filt_badevent(struct knote *kn, long hint);
 SECURITY_READ_ONLY_EARLY(static struct filterops) bad_filtops = {
        .f_attach = filt_badattach,
 };
 
-static int filt_procattach(struct knote *kn, struct kevent_internal_s *kev);
-static void filt_procdetach(struct knote *kn);
-static int filt_proc(struct knote *kn, long hint);
-static int filt_proctouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
-       .f_attach = filt_procattach,
-       .f_detach = filt_procdetach,
-       .f_event = filt_proc,
-       .f_touch = filt_proctouch,
-       .f_process = filt_procprocess,
-};
-
 #if CONFIG_MEMORYSTATUS
 extern const struct filterops memorystatus_filtops;
 #endif /* CONFIG_MEMORYSTATUS */
-
 extern const struct filterops fs_filtops;
-
 extern const struct filterops sig_filtops;
-
-static zone_t knote_zone;
-static zone_t kqfile_zone;
-static zone_t kqworkq_zone;
-static zone_t kqworkloop_zone;
-
-#define        KN_HASH(val, mask)      (((val) ^ (val >> 8)) & (mask))
-
-/* Mach portset filter */
 extern const struct filterops machport_filtops;
-
-/* User filter */
-static int filt_userattach(struct knote *kn, struct kevent_internal_s *kev);
-static void filt_userdetach(struct knote *kn);
-static int filt_user(struct knote *kn, long hint);
-static int filt_usertouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_userprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
-       .f_attach = filt_userattach,
-       .f_detach = filt_userdetach,
-       .f_event = filt_user,
-       .f_touch = filt_usertouch,
-       .f_process = filt_userprocess,
-};
-
-static lck_spin_t _filt_userlock;
-static void filt_userlock(void);
-static void filt_userunlock(void);
-
-/* Workloop filter */
-static bool filt_wlneeds_boost(struct kevent_internal_s *kev);
-static int filt_wlattach(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_wlpost_attach(struct knote *kn, struct  kevent_internal_s *kev);
-static void filt_wldetach(struct knote *kn);
-static int filt_wlevent(struct knote *kn, long hint);
-static int filt_wltouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_wldrop_and_unlock(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_wlprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
-       .f_needs_boost = filt_wlneeds_boost,
-       .f_attach = filt_wlattach,
-       .f_post_attach = filt_wlpost_attach,
-       .f_detach = filt_wldetach,
-       .f_event = filt_wlevent,
-       .f_touch = filt_wltouch,
-       .f_drop_and_unlock = filt_wldrop_and_unlock,
-       .f_process = filt_wlprocess,
-};
-
 extern const struct filterops pipe_rfiltops;
 extern const struct filterops pipe_wfiltops;
 extern const struct filterops ptsd_kqops;
@@ -394,7 +318,12 @@ extern const struct filterops fsevent_filtops;
 extern const struct filterops vnode_filtops;
 extern const struct filterops tty_filtops;
 
+const static struct filterops file_filtops;
+const static struct filterops kqread_filtops;
+const static struct filterops proc_filtops;
 const static struct filterops timer_filtops;
+const static struct filterops user_filtops;
+const static struct filterops workloop_filtops;
 
 /*
  *
@@ -403,170 +332,93 @@ const static struct filterops timer_filtops;
  * - Add a new "EVFILT_" option value to bsd/sys/event.h (typically a negative value)
  *   in the exported section of the header
  * - Update the EVFILT_SYSCOUNT value to reflect the new addition
- * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end 
+ * - Add a filterops to the sysfilt_ops array. Public filters should be added at the end
  *   of the Public Filters section in the array.
  * Private filters:
  * - Add a new "EVFILT_" value to bsd/sys/event.h (typically a positive value)
  *   in the XNU_KERNEL_PRIVATE section of the header
  * - Update the EVFILTID_MAX value to reflect the new addition
- * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of 
- *   the Private filters section of the array. 
+ * - Add a filterops to the sysfilt_ops. Private filters should be added at the end of
+ *   the Private filters section of the array.
  */
 SECURITY_READ_ONLY_EARLY(static struct filterops *) sysfilt_ops[EVFILTID_MAX] = {
        /* Public Filters */
-       [~EVFILT_READ]                                  = &file_filtops,
-       [~EVFILT_WRITE]                                 = &file_filtops,
-       [~EVFILT_AIO]                                   = &bad_filtops,
-       [~EVFILT_VNODE]                                 = &file_filtops,
-       [~EVFILT_PROC]                                  = &proc_filtops,
-       [~EVFILT_SIGNAL]                                = &sig_filtops,
-       [~EVFILT_TIMER]                                 = &timer_filtops,
-       [~EVFILT_MACHPORT]                              = &machport_filtops,
-       [~EVFILT_FS]                                    = &fs_filtops,
-       [~EVFILT_USER]                                  = &user_filtops,
-                                                                         &bad_filtops,
-                                                                         &bad_filtops,
-       [~EVFILT_SOCK]                                  = &file_filtops,
+       [~EVFILT_READ]                  = &file_filtops,
+       [~EVFILT_WRITE]                 = &file_filtops,
+       [~EVFILT_AIO]                   = &bad_filtops,
+       [~EVFILT_VNODE]                 = &file_filtops,
+       [~EVFILT_PROC]                  = &proc_filtops,
+       [~EVFILT_SIGNAL]                = &sig_filtops,
+       [~EVFILT_TIMER]                 = &timer_filtops,
+       [~EVFILT_MACHPORT]              = &machport_filtops,
+       [~EVFILT_FS]                    = &fs_filtops,
+       [~EVFILT_USER]                  = &user_filtops,
+                                         &bad_filtops,
+       [~EVFILT_VM]                    = &bad_filtops,
+       [~EVFILT_SOCK]                  = &file_filtops,
 #if CONFIG_MEMORYSTATUS
-       [~EVFILT_MEMORYSTATUS]                  = &memorystatus_filtops,
+       [~EVFILT_MEMORYSTATUS]          = &memorystatus_filtops,
 #else
-       [~EVFILT_MEMORYSTATUS]                  = &bad_filtops,
+       [~EVFILT_MEMORYSTATUS]          = &bad_filtops,
 #endif
-       [~EVFILT_EXCEPT]                                = &file_filtops,
-
+       [~EVFILT_EXCEPT]                = &file_filtops,
        [~EVFILT_WORKLOOP]              = &workloop_filtops,
 
        /* Private filters */
-       [EVFILTID_KQREAD]                               = &kqread_filtops,
-       [EVFILTID_PIPE_R]                               = &pipe_rfiltops,
-       [EVFILTID_PIPE_W]                               = &pipe_wfiltops,
-       [EVFILTID_PTSD]                                 = &ptsd_kqops,
-       [EVFILTID_SOREAD]                               = &soread_filtops,
-       [EVFILTID_SOWRITE]                              = &sowrite_filtops,
-       [EVFILTID_SCK]                                  = &sock_filtops,
-       [EVFILTID_SOEXCEPT]                     = &soexcept_filtops,
-       [EVFILTID_SPEC]                                 = &spec_filtops,
-       [EVFILTID_BPFREAD]                              = &bpfread_filtops,
-       [EVFILTID_NECP_FD]                              = &necp_fd_rfiltops,
-       [EVFILTID_FSEVENT]                              = &fsevent_filtops,
-       [EVFILTID_VN]                                   = &vnode_filtops,
-       [EVFILTID_TTY]                                  = &tty_filtops,
-       [EVFILTID_PTMX]                                 = &ptmx_kqops,
+       [EVFILTID_KQREAD]               = &kqread_filtops,
+       [EVFILTID_PIPE_R]               = &pipe_rfiltops,
+       [EVFILTID_PIPE_W]               = &pipe_wfiltops,
+       [EVFILTID_PTSD]                 = &ptsd_kqops,
+       [EVFILTID_SOREAD]               = &soread_filtops,
+       [EVFILTID_SOWRITE]              = &sowrite_filtops,
+       [EVFILTID_SCK]                  = &sock_filtops,
+       [EVFILTID_SOEXCEPT]             = &soexcept_filtops,
+       [EVFILTID_SPEC]                 = &spec_filtops,
+       [EVFILTID_BPFREAD]              = &bpfread_filtops,
+       [EVFILTID_NECP_FD]              = &necp_fd_rfiltops,
+       [EVFILTID_FSEVENT]              = &fsevent_filtops,
+       [EVFILTID_VN]                   = &vnode_filtops,
+       [EVFILTID_TTY]                  = &tty_filtops,
+       [EVFILTID_PTMX]                 = &ptmx_kqops,
 };
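
Editor's note: the block comment above sysfilt_ops lists the steps for registering a new filter. Purely as a sketch of those steps (EVFILT_EXAMPLE, filt_exampleattach and example_filtops are invented names; the EVFILT_ constant and the EVFILT_SYSCOUNT bump in bsd/sys/event.h are assumed, not part of this change), a minimal public filter entry could look like:

static int
filt_exampleattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
{
	/* nothing is pending at attach time, so do not activate the knote */
	return 0;
}

SECURITY_READ_ONLY_EARLY(static struct filterops) example_filtops = {
	.f_attach = filt_exampleattach,
};

/* sysfilt_ops, at the end of the Public Filters section:   */
/*      [~EVFILT_EXAMPLE]       = &example_filtops,          */
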
 
 /* waitq prepost callback */
 void waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos);
 
-#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
-#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
-#endif
-#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
-#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG    0x80000000 /* request overcommit threads */
-#endif
-#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
-#define _PTHREAD_PRIORITY_QOS_CLASS_MASK    0x003fff00  /* QoS class mask */
-#endif
-#ifndef _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32
-#define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32 8
-#endif
-
-static inline __kdebug_only
-uintptr_t
-kqr_thread_id(struct kqrequest *kqr)
-{
-       return (uintptr_t)thread_tid(kqr->kqr_thread);
-}
-
-static inline
-boolean_t is_workqueue_thread(thread_t thread)
-{
-       return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
-}
-
-static inline
-void knote_canonicalize_kevent_qos(struct knote *kn)
-{
-       struct kqueue *kq = knote_get_kq(kn);
-       unsigned long canonical;
-
-       if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
-               return;
-
-       /* preserve manager and overcommit flags in this case */
-       canonical = pthread_priority_canonicalize(kn->kn_qos, FALSE);
-       kn->kn_qos = (qos_t)canonical;
-}
-
-static inline
-kq_index_t qos_index_from_qos(struct knote *kn, qos_t qos, boolean_t propagation)
+static inline struct kqworkloop *
+kqr_kqworkloop(struct kqrequest *kqr)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t qos_index;
-       unsigned long flags = 0;
-
-       if ((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0)
-               return QOS_INDEX_KQFILE;
-
-       qos_index = (kq_index_t)thread_qos_from_pthread_priority(
-                               (unsigned long)qos, &flags);
-       
-       if (kq->kq_state & KQ_WORKQ) {
-               /* workq kqueues support requesting a manager thread (non-propagation) */
-               if (!propagation && (flags & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG))
-                       return KQWQ_QOS_MANAGER;
+       if (kqr->kqr_state & KQR_WORKLOOP) {
+               return __container_of(kqr, struct kqworkloop, kqwl_request);
        }
-
-       return qos_index;
-}
-
-static inline
-qos_t qos_from_qos_index(kq_index_t qos_index)
-{
-       /* should only happen for KQ_WORKQ */
-       if (qos_index == KQWQ_QOS_MANAGER) 
-               return  _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
-
-       if (qos_index == 0)
-               return THREAD_QOS_UNSPECIFIED;
-
-       /* Should have support from pthread kext support */
-       return (1 << (qos_index - 1 + 
-                     _PTHREAD_PRIORITY_QOS_CLASS_SHIFT_32));
+       return NULL;
 }
 
-/* kqr lock must be held */
-static inline
-unsigned long pthread_priority_for_kqrequest(
-       struct kqrequest *kqr,
-       kq_index_t qos_index)
+static inline kqueue_t
+kqr_kqueue(proc_t p, struct kqrequest *kqr)
 {
-       unsigned long priority = qos_from_qos_index(qos_index);
-       if (kqr->kqr_state & KQR_THOVERCOMMIT) {
-               priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
+       kqueue_t kqu;
+       if (kqr->kqr_state & KQR_WORKLOOP) {
+               kqu.kqwl = kqr_kqworkloop(kqr);
+       } else {
+               kqu.kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
+               assert(kqr >= kqu.kqwq->kqwq_request &&
+                               kqr < kqu.kqwq->kqwq_request + KQWQ_NBUCKETS);
        }
-       return priority;
+       return kqu;
 }
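
Editor's note: kqr_kqworkloop() above relies on __container_of to step from the embedded kqwl_request field back to its enclosing struct kqworkloop. A stand-alone sketch of that pattern, with invented example_* names rather than the xnu types or macro:

#include <stddef.h>

struct example_request { int state; };

struct example_workloop {
	long                   ewl_id;
	struct example_request ewl_request;   /* embedded, like kqwl_request */
};

static inline struct example_workloop *
example_workloop_from_request(struct example_request *req)
{
	/* subtract the field offset to recover the containing structure */
	return (struct example_workloop *)((char *)req -
	    offsetof(struct example_workloop, ewl_request));
}
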
 
-static inline
-kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
+static inline boolean_t
+is_workqueue_thread(thread_t thread)
 {
-#pragma unused(thread)
-       kq_index_t qos_index;
-
-       if (flags & KEVENT_FLAG_WORKQ_MANAGER)
-               return KQWQ_QOS_MANAGER;
-
-       qos_index = (kq_index_t)qos_class;
-       assert(qos_index > 0 && qos_index < KQWQ_QOS_MANAGER);
-
-       return qos_index;
+       return (thread_get_tag(thread) & THREAD_TAG_WORKQUEUE);
 }
 
 /*
  * kqueue/note lock implementations
  *
  *     The kqueue lock guards the kq state, the state of its queues,
- *     and the kqueue-aware status and use counts of individual knotes.
+ *     and the kqueue-aware status and locks of individual knotes.
  *
  *     The kqueue workq lock is used to protect state guarding the
  *     interaction of the kqueue with the workq.  This state cannot
@@ -580,26 +432,47 @@ kq_index_t qos_index_for_servicer(int qos_class, thread_t thread, int flags)
  *     by calling the filter to get a [consistent] snapshot of that
  *     data.
  */
-lck_grp_attr_t * kq_lck_grp_attr;
-lck_grp_t * kq_lck_grp;
-lck_attr_t * kq_lck_attr;
+static lck_grp_attr_t *kq_lck_grp_attr;
+static lck_grp_t *kq_lck_grp;
+static lck_attr_t *kq_lck_attr;
+
+static inline void
+kqlock(kqueue_t kqu)
+{
+       lck_spin_lock(&kqu.kq->kq_lock);
+}
+
+static inline void
+kqlock_held(__assert_only kqueue_t kqu)
+{
+       LCK_SPIN_ASSERT(&kqu.kq->kq_lock, LCK_ASSERT_OWNED);
+}
+
+static inline void
+kqunlock(kqueue_t kqu)
+{
+       lck_spin_unlock(&kqu.kq->kq_lock);
+}
 
 static inline void
-kqlock(struct kqueue *kq)
+kq_req_lock(kqueue_t kqu)
 {
-       lck_spin_lock(&kq->kq_lock);
+       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
+       lck_spin_lock(&kqu.kq->kq_reqlock);
 }
 
 static inline void
-kqlock_held(__assert_only struct kqueue *kq)
+kq_req_unlock(kqueue_t kqu)
 {
-       LCK_SPIN_ASSERT(&kq->kq_lock, LCK_ASSERT_OWNED);
+       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
+       lck_spin_unlock(&kqu.kq->kq_reqlock);
 }
 
 static inline void
-kqunlock(struct kqueue *kq)
+kq_req_held(__assert_only kqueue_t kqu)
 {
-       lck_spin_unlock(&kq->kq_lock);
+       assert(kqu.kq->kq_state & (KQ_WORKLOOP | KQ_WORKQ));
+       LCK_SPIN_ASSERT(&kqu.kq->kq_reqlock, LCK_ASSERT_OWNED);
 }
 
 static inline void
@@ -614,243 +487,313 @@ knhash_unlock(proc_t p)
        lck_mtx_unlock(&p->p_fd->fd_knhashlock);
 }
 
+#pragma mark knote locks
 
 /*
- * Convert a kq lock to a knote use reference.
+ * Enum used by the knote_lock_* functions.
  *
- *     If the knote is being dropped, or has
- *  vanished, we can't get a use reference.
- *  Just return with it still locked.
+ * KNOTE_KQ_LOCK_ALWAYS
+ *   The function will always return with the kq lock held.
  *
- *     - kq locked at entry
- *     - unlock on exit if we get the use reference
+ * KNOTE_KQ_LOCK_ON_SUCCESS
+ *   The function will return with the kq lock held if it was successful
+ *   (knote_lock() is the only function that can fail).
+ *
+ * KNOTE_KQ_LOCK_ON_FAILURE
+ *   The function will return with the kq lock held if it was unsuccessful
+ *   (knote_lock() is the only function that can fail).
+ *
+ * KNOTE_KQ_UNLOCK:
+ *   The function returns with the kq unlocked.
  */
-static int
-kqlock2knoteuse(struct kqueue *kq, struct knote *kn, int flags)
+#define KNOTE_KQ_LOCK_ALWAYS      0x0
+#define KNOTE_KQ_LOCK_ON_SUCCESS  0x1
+#define KNOTE_KQ_LOCK_ON_FAILURE  0x2
+#define KNOTE_KQ_UNLOCK           0x3
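
Editor's note: to make the lock modes above concrete, here is a hypothetical calling sequence; example_touch_locked() is an invented helper and the DEBUG-only initialization of the knote_lock_ctx is elided. It sketches the intended usage rather than code from this change:

static void
example_touch_locked(struct kqueue *kq, struct knote *kn)
{
	struct knote_lock_ctx knlc;

	kqlock(kq);
	if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
		/* knote was dropped concurrently; the kq lock is already released */
		return;
	}
	/* ... work on the knote while holding both the kq and knote locks ... */
	knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
}
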
+
+#if DEBUG || DEVELOPMENT
+__attribute__((noinline, not_tail_called, disable_tail_calls))
+void knote_lock_ctx_chk(struct knote_lock_ctx *knlc)
 {
-       if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
-               return (0);
+       /* evil hackery to make sure no one forgets to unlock */
+       assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
+}
+#endif
 
-       assert(kn->kn_status & KN_ATTACHED);
-       kn->kn_inuse++;
-       if (flags & KNUSE_BOOST) {
-               set_thread_rwlock_boost();
+static struct knote_lock_ctx *
+knote_lock_ctx_find(struct kqueue *kq, struct knote *kn)
+{
+       struct knote_lock_ctx *ctx;
+       LIST_FOREACH(ctx, &kq->kq_knlocks, knlc_le) {
+               if (ctx->knlc_knote == kn) return ctx;
        }
-       kqunlock(kq);
-       return (1);
+       panic("knote lock context not found: %p", kn);
+       __builtin_trap();
 }
 
-/*
- *     - kq locked at entry
- *     - kq unlocked at exit
- */
-__disable_tail_calls
-static wait_result_t
-knoteusewait(struct kqueue *kq, struct knote *kn)
-{
-       kn->kn_status |= KN_USEWAIT;
-       waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                       CAST_EVENT64_T(&kn->kn_status),
-                       THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
+/* slowpath of knote_lock() */
+__attribute__((noinline))
+static bool __result_use_check
+knote_lock_slow(struct kqueue *kq, struct knote *kn,
+               struct knote_lock_ctx *knlc, int kqlocking)
+{
+       kqlock_held(kq);
+
+       struct knote_lock_ctx *owner_lc = knote_lock_ctx_find(kq, kn);
+       thread_t owner_thread = owner_lc->knlc_thread;
+
+#if DEBUG || DEVELOPMENT
+       knlc->knlc_state = KNOTE_LOCK_CTX_WAITING;
+#endif
+
+       thread_reference(owner_thread);
+       TAILQ_INSERT_TAIL(&owner_lc->knlc_head, knlc, knlc_tqe);
+       assert_wait(&kn->kn_status, THREAD_UNINT | THREAD_WAIT_NOREPORT);
        kqunlock(kq);
-       return thread_block(THREAD_CONTINUE_NULL);
-}
 
-static bool
-knoteuse_needs_boost(struct knote *kn, struct kevent_internal_s *kev)
-{
-       if (knote_fops(kn)->f_needs_boost) {
-               return knote_fops(kn)->f_needs_boost(kev);
+       if (thread_handoff_deallocate(owner_thread) == THREAD_RESTART) {
+               if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
+                               kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
+                       kqlock(kq);
+               }
+#if DEBUG || DEVELOPMENT
+               assert(knlc->knlc_state == KNOTE_LOCK_CTX_WAITING);
+               knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
+#endif
+               return false;
        }
-       return false;
+#if DEBUG || DEVELOPMENT
+               assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
+#endif
+       if (kqlocking == KNOTE_KQ_LOCK_ALWAYS ||
+                       kqlocking == KNOTE_KQ_LOCK_ON_SUCCESS) {
+               kqlock(kq);
+       }
+       return true;
 }
 
 /*
- * Convert from a knote use reference back to kq lock.
- *
- *     Drop a use reference and wake any waiters if
- *     this is the last one.
+ * Attempts to take the "knote" lock.
  *
- *  If someone is trying to drop the knote, but the
- *  caller has events they must deliver, take
- *  responsibility for the drop later - and wake the
- *  other attempted dropper in a manner that informs
- *  him of the transfer of responsibility.
+ * Called with the kqueue lock held.
  *
- *     The exit return indicates if the knote is still alive
- *  (or if not, the other dropper has been given the green
- *  light to drop it).
- *
- *  The kqueue lock is re-taken unconditionally.
+ * Returns true if the knote lock is acquired, false if it has been dropped
  */
-static int
-knoteuse2kqlock(struct kqueue *kq, struct knote *kn, int flags)
+static bool __result_use_check
+knote_lock(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
+               int kqlocking)
 {
-       int dropped = 0;
-       int steal_drop = (flags & KNUSE_STEAL_DROP);
+       kqlock_held(kq);
 
-       kqlock(kq);
-       if (flags & KNUSE_BOOST) {
-               clear_thread_rwlock_boost();
+#if DEBUG || DEVELOPMENT
+       assert(knlc->knlc_state == KNOTE_LOCK_CTX_UNLOCKED);
+#endif
+       knlc->knlc_knote = kn;
+       knlc->knlc_thread = current_thread();
+       TAILQ_INIT(&knlc->knlc_head);
+
+       if (__improbable(kn->kn_status & KN_LOCKED)) {
+               return knote_lock_slow(kq, kn, knlc, kqlocking);
        }
 
-       if (--kn->kn_inuse == 0) {
+       /*
+        * When the knote will be dropped, the knote lock is taken before
+        * KN_DROPPING is set, and then the knote will be removed from any
+        * hash table that references it before the lock is canceled.
+        */
+       assert((kn->kn_status & KN_DROPPING) == 0);
+       LIST_INSERT_HEAD(&kq->kq_knlocks, knlc, knlc_le);
+       kn->kn_status |= KN_LOCKED;
+#if DEBUG || DEVELOPMENT
+       knlc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
+#endif
 
-               if ((kn->kn_status & KN_ATTACHING) != 0) {
-                       kn->kn_status &= ~KN_ATTACHING;
-               }
+       if (kqlocking == KNOTE_KQ_UNLOCK ||
+                       kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
+               kqunlock(kq);
+       }
+       return true;
+}
 
-               if ((kn->kn_status & KN_USEWAIT) != 0) {
-                       wait_result_t result;
+/*
+ * Unlocks a knote successfully locked with knote_lock().
+ *
+ * Called with the kqueue lock held.
+ *
+ * Returns with the kqueue lock held according to KNOTE_KQ_* flags
+ */
+static void
+knote_unlock(struct kqueue *kq, struct knote *kn,
+               struct knote_lock_ctx *knlc, int flags)
+{
+       kqlock_held(kq);
 
-                       /* If we need to, try and steal the drop */
-                       if (kn->kn_status & KN_DROPPING) {
-                               if (steal_drop && !(kn->kn_status & KN_STOLENDROP)) {
-                                       kn->kn_status |= KN_STOLENDROP;
-                               } else {
-                                       dropped = 1;
-                               }
-                       }
+       assert(knlc->knlc_knote == kn);
+       assert(kn->kn_status & KN_LOCKED);
+#if DEBUG || DEVELOPMENT
+       assert(knlc->knlc_state == KNOTE_LOCK_CTX_LOCKED);
+#endif
 
-                       /* wakeup indicating if ANY USE stole the drop */
-                       result = (kn->kn_status & KN_STOLENDROP) ?
-                                THREAD_RESTART : THREAD_AWAKENED;
+       struct knote_lock_ctx *next_owner_lc = TAILQ_FIRST(&knlc->knlc_head);
 
-                       kn->kn_status &= ~KN_USEWAIT;
-                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                          CAST_EVENT64_T(&kn->kn_status),
-                                          result,
-                                          WAITQ_ALL_PRIORITIES);
-               } else {
-                       /* should have seen use-wait if dropping with use refs */
-                       assert((kn->kn_status & (KN_DROPPING|KN_STOLENDROP)) == 0);
-               }
+       LIST_REMOVE(knlc, knlc_le);
 
-       } else if (kn->kn_status & KN_DROPPING) {
-               /* not the last ref but want to steal a drop if present */
-               if (steal_drop && ((kn->kn_status & KN_STOLENDROP) == 0)) {
-                       kn->kn_status |= KN_STOLENDROP;
+       if (next_owner_lc) {
+               assert(next_owner_lc->knlc_knote == kn);
+               TAILQ_REMOVE(&knlc->knlc_head, next_owner_lc, knlc_tqe);
 
-                       /* but we now have to wait to be the last ref */
-                       knoteusewait(kq, kn);
-                       kqlock(kq);
-               } else {
-                       dropped = 1;
-               }
+               assert(TAILQ_EMPTY(&next_owner_lc->knlc_head));
+               TAILQ_CONCAT(&next_owner_lc->knlc_head, &knlc->knlc_head, knlc_tqe);
+               LIST_INSERT_HEAD(&kq->kq_knlocks, next_owner_lc, knlc_le);
+#if DEBUG || DEVELOPMENT
+               next_owner_lc->knlc_state = KNOTE_LOCK_CTX_LOCKED;
+#endif
+       } else {
+               kn->kn_status &= ~KN_LOCKED;
        }
-
-       return (!dropped);
+       if (kn->kn_inuse == 0) {
+               /*
+                * No f_event() in flight anymore, we can leave QoS "Merge" mode
+                *
+                * See knote_should_apply_qos_override()
+                */
+               kn->kn_status &= ~KN_MERGE_QOS;
+       }
+       if (flags & KNOTE_KQ_UNLOCK) {
+               kqunlock(kq);
+       }
+       if (next_owner_lc) {
+               thread_wakeup_thread(&kn->kn_status, next_owner_lc->knlc_thread);
+       }
+#if DEBUG || DEVELOPMENT
+       knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
+#endif
 }
 
 /*
- * Convert a kq lock to a knote use reference
- * (for the purpose of detaching AND vanishing it).
+ * Aborts all waiters for a knote lock, and unlocks the knote.
  *
- *     If the knote is being dropped, we can't get
- *     a detach reference, so wait for the knote to
- *  finish dropping before returning.
+ * Called with the kqueue lock held.
  *
- *  If the knote is being used for other purposes,
- *  we cannot detach it until those uses are done
- *  as well. Again, just wait for them to finish
- *  (caller will start over at lookup).
- *
- *     - kq locked at entry
- *     - unlocked on exit
+ * Returns with the kqueue lock held according to KNOTE_KQ_* flags
  */
-static int
-kqlock2knotedetach(struct kqueue *kq, struct knote *kn, int flags)
+static void
+knote_unlock_cancel(struct kqueue *kq, struct knote *kn,
+               struct knote_lock_ctx *knlc, int kqlocking)
 {
-       if ((kn->kn_status & KN_DROPPING) || kn->kn_inuse) {
-               /* have to wait for dropper or current uses to go away */
-               knoteusewait(kq, kn);
-               return (0);
+       kqlock_held(kq);
+
+       assert(knlc->knlc_knote == kn);
+       assert(kn->kn_status & KN_LOCKED);
+       assert(kn->kn_status & KN_DROPPING);
+
+       LIST_REMOVE(knlc, knlc_le);
+       kn->kn_status &= ~KN_LOCKED;
+
+       if (kqlocking == KNOTE_KQ_UNLOCK ||
+                       kqlocking == KNOTE_KQ_LOCK_ON_FAILURE) {
+               kqunlock(kq);
        }
-       assert((kn->kn_status & KN_VANISHED) == 0);
-       assert(kn->kn_status & KN_ATTACHED);
-       kn->kn_status &= ~KN_ATTACHED;
-       kn->kn_status |= KN_VANISHED;
-       if (flags & KNUSE_BOOST) {
-               clear_thread_rwlock_boost();
+       if (!TAILQ_EMPTY(&knlc->knlc_head)) {
+               thread_wakeup_with_result(&kn->kn_status, THREAD_RESTART);
        }
-       kn->kn_inuse++;
-       kqunlock(kq);
-       return (1);
+#if DEBUG || DEVELOPMENT
+       knlc->knlc_state = KNOTE_LOCK_CTX_UNLOCKED;
+#endif
 }
 
 /*
- * Convert a kq lock to a knote drop reference.
+ * Call the f_event hook of a given filter.
  *
- *     If the knote is in use, wait for the use count
- *     to subside.  We first mark our intention to drop
- *     it - keeping other users from "piling on."
- *     If we are too late, we have to wait for the
- *     other drop to complete.
- *
- *     - kq locked at entry
- *     - always unlocked on exit.
- *     - caller can't hold any locks that would prevent
- *       the other dropper from completing.
+ * Takes a use count to protect against concurrent drops.
  */
-static int
-kqlock2knotedrop(struct kqueue *kq, struct knote *kn)
+static void
+knote_call_filter_event(struct kqueue *kq, struct knote *kn, long hint)
 {
-       int oktodrop;
-       wait_result_t result;
+       int result, dropping = 0;
 
-       oktodrop = ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) == 0);
-       /* if another thread is attaching, they will become the dropping thread */
-       kn->kn_status |= KN_DROPPING;
-       knote_unsuppress(kn);
-       knote_dequeue(kn);
-       if (oktodrop) {
-               if (kn->kn_inuse == 0) {
-                       kqunlock(kq);
-                       return (oktodrop);
+       kqlock_held(kq);
+
+       if (kn->kn_status & (KN_DROPPING | KN_VANISHED))
+               return;
+
+       kn->kn_inuse++;
+       kqunlock(kq);
+       result = filter_call(knote_fops(kn), f_event(kn, hint));
+       kqlock(kq);
+
+       dropping = (kn->kn_status & KN_DROPPING);
+
+       if (!dropping && (result & FILTER_ACTIVE)) {
+               if (result & FILTER_ADJUST_EVENT_QOS_BIT)
+                       knote_adjust_qos(kq, kn, result);
+               knote_activate(kn);
+       }
+
+       if (--kn->kn_inuse == 0) {
+               if ((kn->kn_status & KN_LOCKED) == 0) {
+                       /*
+                        * We're the last f_event() call and there's no other f_* call in
+                        * flight, we can leave QoS "Merge" mode.
+                        *
+                        * See knote_should_apply_qos_override()
+                        */
+                       kn->kn_status &= ~KN_MERGE_QOS;
+               }
+               if (dropping) {
+                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
+                                       CAST_EVENT64_T(&kn->kn_inuse),
+                                       THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
                }
        }
-       result = knoteusewait(kq, kn);
-       /* THREAD_RESTART == another thread stole the knote drop */
-       return (result == THREAD_AWAKENED);
 }
 
-#if 0
 /*
- * Release a knote use count reference.
+ * Called by knote_drop() to wait for the last f_event() caller to be done.
+ *
+ *     - kq locked at entry
+ *     - kq unlocked at exit
  */
 static void
-knote_put(struct knote *kn)
+knote_wait_for_filter_events(struct kqueue *kq, struct knote *kn)
 {
-       struct kqueue *kq = knote_get_kq(kn);
+       wait_result_t wr = THREAD_NOT_WAITING;
 
-       kqlock(kq);
-       if (--kn->kn_inuse == 0) {
-               if ((kn->kn_status & KN_USEWAIT) != 0) {
-                       kn->kn_status &= ~KN_USEWAIT;
-                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                          CAST_EVENT64_T(&kn->kn_status),
-                                          THREAD_AWAKENED,
-                                          WAITQ_ALL_PRIORITIES);
-               }
+       kqlock_held(kq);
+
+       assert(kn->kn_status & KN_DROPPING);
+
+       if (kn->kn_inuse) {
+               wr = waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
+                               CAST_EVENT64_T(&kn->kn_inuse),
+                               THREAD_UNINT | THREAD_WAIT_NOREPORT, TIMEOUT_WAIT_FOREVER);
        }
        kqunlock(kq);
+       if (wr == THREAD_WAITING) {
+               thread_block(THREAD_CONTINUE_NULL);
+       }
 }
-#endif
+
+#pragma mark file_filtops
 
 static int
 filt_fileattach(struct knote *kn, struct kevent_internal_s *kev)
 {
-       return (fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current()));
+       return fo_kqfilter(kn->kn_fp, kn, kev, vfs_context_current());
 }
 
-#define        f_flag f_fglob->fg_flag
-#define        f_msgcount f_fglob->fg_msgcount
-#define        f_cred f_fglob->fg_cred
-#define        f_ops f_fglob->fg_ops
-#define        f_offset f_fglob->fg_offset
-#define        f_data f_fglob->fg_data
-
-static void
+SECURITY_READ_ONLY_EARLY(static struct filterops) file_filtops = {
+       .f_isfd = 1,
+       .f_attach = filt_fileattach,
+};
+
+#pragma mark kqread_filtops
+
+#define        f_flag f_fglob->fg_flag
+#define        f_ops f_fglob->fg_ops
+#define        f_data f_fglob->fg_data
+
+static void
 filt_kqdetach(struct knote *kn)
 {
        struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
@@ -861,15 +804,12 @@ filt_kqdetach(struct knote *kn)
        kqunlock(kq);
 }
 
-/*ARGSUSED*/
 static int
 filt_kqueue(struct knote *kn, __unused long hint)
 {
        struct kqueue *kq = (struct kqueue *)kn->kn_fp->f_data;
-       int count;
 
-       count = kq->kq_count;
-       return (count > 0);
+       return (kq->kq_count > 0);
 }
 
 static int
@@ -881,8 +821,6 @@ filt_kqtouch(struct knote *kn, struct kevent_internal_s *kev)
 
        kqlock(kq);
        kn->kn_data = kq->kq_count;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
        res = (kn->kn_data > 0);
 
        kqunlock(kq);
@@ -910,7 +848,15 @@ filt_kqprocess(struct knote *kn, struct filt_process_s *data, struct kevent_inte
        return res;
 }
 
-#pragma mark EVFILT_PROC
+SECURITY_READ_ONLY_EARLY(static struct filterops) kqread_filtops = {
+       .f_isfd = 1,
+       .f_detach = filt_kqdetach,
+       .f_event = filt_kqueue,
+       .f_touch = filt_kqtouch,
+       .f_process = filt_kqprocess,
+};
+
+#pragma mark proc_filtops
 
 static int
 filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
@@ -920,15 +866,13 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
        assert(PID_MAX < NOTE_PDATAMASK);
 
        if ((kn->kn_sfflags & (NOTE_TRACK | NOTE_TRACKERR | NOTE_CHILD)) != 0) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = ENOTSUP;
+               knote_set_error(kn, ENOTSUP);
                return 0;
        }
 
        p = proc_find(kn->kn_id);
        if (p == NULL) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = ESRCH;
+               knote_set_error(kn, ESRCH);
                return 0;
        }
 
@@ -946,8 +890,7 @@ filt_procattach(struct knote *kn, __unused struct kevent_internal_s *kev)
                                break;  /* parent-in-waiting => ok */
 
                        proc_rele(p);
-                       kn->kn_flags = EV_ERROR;
-                       kn->kn_data = EACCES;
+                       knote_set_error(kn, EACCES);
                        return 0;
                } while (0);
 
@@ -1022,7 +965,7 @@ filt_proc(struct knote *kn, long hint)
                         */
                        return 0;
                }
-       }                                       
+       }
 
        /*
         * if the user is interested in this event, record it.
@@ -1040,7 +983,7 @@ filt_proc(struct knote *kn, long hint)
 
        /*
         * The kernel has a wrapper in place that returns the same data
-        * as is collected here, in kn_data.  Any changes to how 
+        * as is collected here, in kn_data.  Any changes to how
         * NOTE_EXITSTATUS and NOTE_EXIT_DETAIL are collected
         * should also be reflected in the proc_pidnoteexit() wrapper.
         */
@@ -1054,7 +997,7 @@ filt_proc(struct knote *kn, long hint)
                        kn->kn_fflags |= NOTE_EXIT_DETAIL;
                        if ((kn->kn_ptr.p_proc->p_lflag &
                             P_LTERM_DECRYPTFAIL) != 0) {
-                               kn->kn_data |= NOTE_EXIT_DECRYPTFAIL; 
+                               kn->kn_data |= NOTE_EXIT_DECRYPTFAIL;
                        }
                        if ((kn->kn_ptr.p_proc->p_lflag &
                             P_LTERM_JETSAM) != 0) {
@@ -1103,8 +1046,6 @@ filt_proctouch(struct knote *kn, struct kevent_internal_s *kev)
 
        /* accept new filter flags and mask off output events no longer interesting */
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* restrict the current results to the (smaller?) set of new interest */
        /*
@@ -1138,9 +1079,22 @@ filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in
        return res;
 }
 
+SECURITY_READ_ONLY_EARLY(static struct filterops) proc_filtops = {
+       .f_attach = filt_procattach,
+       .f_detach = filt_procdetach,
+       .f_event = filt_proc,
+       .f_touch = filt_proctouch,
+       .f_process = filt_procprocess,
+};
 
-#pragma mark EVFILT_TIMER
+#pragma mark timer_filtops
 
+struct filt_timer_params {
+       uint64_t deadline; /* deadline in abs/cont time
+                                                 (or 0 if NOTE_ABSOLUTE and deadline is in past) */
+       uint64_t leeway;   /* leeway in abstime, or 0 if none */
+       uint64_t interval; /* interval in abstime or 0 if non-repeating timer */
+};
 
 /*
  * Values stored in the knote at rest (using Mach absolute time units)
@@ -1150,23 +1104,36 @@ filt_procprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in
  * kn->kn_ext[1]        leeway value
  * kn->kn_sdata         interval timer: the interval
  *                      absolute/deadline timer: 0
- * kn->kn_data          fire count
+ * kn->kn_hookid        timer state
+ *
+ * TIMER_IDLE:
+ *   The timer has either never been scheduled or been cancelled.
+ *   It is safe to schedule a new one in this state.
+ *
+ * TIMER_ARMED:
+ *   The timer has been scheduled
+ *
+ * TIMER_FIRED:
+ *   The timer has fired and an event needs to be delivered.
+ *   When in this state, the callout may still be running.
+ *
+ * TIMER_IMMEDIATE:
+ *   The timer has fired at registration time, and the callout was never
+ *   dispatched.
  */
+#define TIMER_IDLE       0x0
+#define        TIMER_ARMED      0x1
+#define TIMER_FIRED      0x2
+#define TIMER_IMMEDIATE  0x3
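
Editor's note: the timer states above form a small state machine stored in kn_hookid. The following self-contained sketch shows the ARMED-to-FIRED transition using C11 <stdatomic.h> in place of xnu's os_atomic_* wrappers; all EXAMPLE_* and example_* names are invented:

#include <stdatomic.h>
#include <stdbool.h>

#define EXAMPLE_TIMER_IDLE   0x0
#define EXAMPLE_TIMER_ARMED  0x1
#define EXAMPLE_TIMER_FIRED  0x2

/* Returns true if this caller won the race and may deliver the event. */
static bool
example_timer_fire(_Atomic int *state)
{
	int expected = EXAMPLE_TIMER_ARMED;

	/* a concurrent cancel that already moved the state back to IDLE wins */
	return atomic_compare_exchange_strong(state, &expected, EXAMPLE_TIMER_FIRED);
}
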
 
-static lck_mtx_t _filt_timerlock;
-
-static void filt_timerlock(void)   { lck_mtx_lock(&_filt_timerlock);   }
-static void filt_timerunlock(void) { lck_mtx_unlock(&_filt_timerlock); }
-
-static inline void filt_timer_assert_locked(void)
+static void
+filt_timer_set_params(struct knote *kn, struct filt_timer_params *params)
 {
-       LCK_MTX_ASSERT(&_filt_timerlock, LCK_MTX_ASSERT_OWNED);
+       kn->kn_ext[0] = params->deadline;
+       kn->kn_ext[1] = params->leeway;
+       kn->kn_sdata  = params->interval;
 }
 
-/* state flags stored in kn_hookid */
-#define        TIMER_RUNNING           0x1
-#define        TIMER_CANCELWAIT        0x2
-
 /*
  * filt_timervalidate - process data from user
  *
@@ -1177,20 +1144,21 @@ static inline void filt_timer_assert_locked(void)
  *      kn_sfflags      style of timer, unit of measurement
  *
  * Output:
- *      kn_sdata        either interval in abstime or 0 if non-repeating timer
- *      ext[0]          fire deadline in abs/cont time
- *                      (or 0 if NOTE_ABSOLUTE and deadline is in past)
+ *      struct filt_timer_params to apply to the filter with
+ *      filt_timer_set_params when changes are ready to be committed.
  *
  * Returns:
  *      EINVAL          Invalid user data parameters
+ *      ERANGE          Various overflows with the parameters
  *
  * Called with timer filter lock held.
  */
 static int
-filt_timervalidate(struct knote *kn)
+filt_timervalidate(const struct kevent_internal_s *kev,
+               struct filt_timer_params *params)
 {
        /*
-        * There are 4 knobs that need to be chosen for a timer registration:
+        * There are 5 knobs that need to be chosen for a timer registration:
         *
         * A) Units of time (what is the time duration of the specified number)
         *      Absolute and interval take:
@@ -1220,13 +1188,11 @@ filt_timervalidate(struct knote *kn)
         *              expires when mach_continuous_time() is > the passed in value.
         */
 
-       filt_timer_assert_locked();
-
        uint64_t multiplier;
 
        boolean_t use_abstime = FALSE;
 
-       switch (kn->kn_sfflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) {
+       switch (kev->fflags & (NOTE_SECONDS|NOTE_USECONDS|NOTE_NSECONDS|NOTE_MACHTIME)) {
        case NOTE_SECONDS:
                multiplier = NSEC_PER_SEC;
                break;
@@ -1248,31 +1214,33 @@ filt_timervalidate(struct knote *kn)
        }
 
        /* transform the leeway in kn_ext[1] to same time scale */
-       if (kn->kn_sfflags & NOTE_LEEWAY) {
+       if (kev->fflags & NOTE_LEEWAY) {
                uint64_t leeway_abs;
 
                if (use_abstime) {
-                       leeway_abs = (uint64_t)kn->kn_ext[1];
+                       leeway_abs = (uint64_t)kev->ext[1];
                } else  {
                        uint64_t leeway_ns;
-                       if (os_mul_overflow((uint64_t)kn->kn_ext[1], multiplier, &leeway_ns))
+                       if (os_mul_overflow((uint64_t)kev->ext[1], multiplier, &leeway_ns))
                                return (ERANGE);
 
                        nanoseconds_to_absolutetime(leeway_ns, &leeway_abs);
                }
 
-               kn->kn_ext[1] = leeway_abs;
+               params->leeway = leeway_abs;
+       } else {
+               params->leeway = 0;
        }
 
-       if (kn->kn_sfflags & NOTE_ABSOLUTE) {
+       if (kev->fflags & NOTE_ABSOLUTE) {
                uint64_t deadline_abs;
 
                if (use_abstime) {
-                       deadline_abs = (uint64_t)kn->kn_sdata;
+                       deadline_abs = (uint64_t)kev->data;
                } else {
                        uint64_t calendar_deadline_ns;
 
-                       if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &calendar_deadline_ns))
+                       if (os_mul_overflow((uint64_t)kev->data, multiplier, &calendar_deadline_ns))
                                return (ERANGE);
 
                        /* calendar_deadline_ns is in nanoseconds since the epoch */
@@ -1306,7 +1274,7 @@ filt_timervalidate(struct knote *kn)
                                 * it does not change the calendar timebase.
                                 */
 
-                               if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
+                               if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
                                        clock_continuoustime_interval_to_deadline(interval_abs,
                                                                                  &deadline_abs);
                                else
@@ -1317,9 +1285,9 @@ filt_timervalidate(struct knote *kn)
                        }
                }
 
-               kn->kn_ext[0] = deadline_abs;
-               kn->kn_sdata  = 0;       /* NOTE_ABSOLUTE is non-repeating */
-       } else if (kn->kn_sdata < 0) {
+               params->deadline = deadline_abs;
+               params->interval = 0; /* NOTE_ABSOLUTE is non-repeating */
+       } else if (kev->data < 0) {
                /*
                 * Negative interval timers fire immediately, once.
                 *
@@ -1333,16 +1301,16 @@ filt_timervalidate(struct knote *kn)
                 * We now skip the power-wasting hot spin phase and go straight to the idle phase.
                 */
 
-               kn->kn_sdata  = 0;      /* non-repeating */
-               kn->kn_ext[0] = 0;      /* expire immediately */
+               params->deadline = 0; /* expire immediately */
+               params->interval = 0; /* non-repeating */
        } else {
                uint64_t interval_abs = 0;
 
                if (use_abstime) {
-                       interval_abs = (uint64_t)kn->kn_sdata;
+                       interval_abs = (uint64_t)kev->data;
                } else {
                        uint64_t interval_ns;
-                       if (os_mul_overflow((uint64_t)kn->kn_sdata, multiplier, &interval_ns))
+                       if (os_mul_overflow((uint64_t)kev->data, multiplier, &interval_ns))
                                return (ERANGE);
 
                        nanoseconds_to_absolutetime(interval_ns, &interval_abs);
@@ -1350,117 +1318,93 @@ filt_timervalidate(struct knote *kn)
 
                uint64_t deadline = 0;
 
-               if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
+               if (kev->fflags & NOTE_MACH_CONTINUOUS_TIME)
                        clock_continuoustime_interval_to_deadline(interval_abs, &deadline);
                else
                        clock_absolutetime_interval_to_deadline(interval_abs, &deadline);
 
-               kn->kn_sdata  = interval_abs;   /* default to a repeating timer */
-               kn->kn_ext[0] = deadline;
+               params->deadline = deadline;
+               params->interval = interval_abs;
        }
 
        return (0);
 }
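
Editor's note: filt_timervalidate() above scales user-supplied values into Mach absolute time before computing a deadline. A minimal sketch of that conversion for the NOTE_SECONDS case, using the same os_mul_overflow()/nanoseconds_to_absolutetime() calls as the code above (example_seconds_to_abstime() itself is an invented helper):

static int
example_seconds_to_abstime(uint64_t seconds, uint64_t *abstime)
{
	uint64_t ns;

	/* reject values whose nanosecond representation would overflow */
	if (os_mul_overflow(seconds, NSEC_PER_SEC, &ns))
		return ERANGE;

	nanoseconds_to_absolutetime(ns, abstime);
	return 0;
}
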
 
-
-
-
 /*
  * filt_timerexpire - the timer callout routine
- *
- * Just propagate the timer event into the knote
- * filter routine (by going through the knote
- * synchronization point).  Pass a hint to
- * indicate this is a real event, not just a
- * query from above.
  */
 static void
 filt_timerexpire(void *knx, __unused void *spare)
 {
-       struct klist timer_list;
        struct knote *kn = knx;
+       int v;
 
-       filt_timerlock();
-
-       kn->kn_hookid &= ~TIMER_RUNNING;
-
-       /* no "object" for timers, so fake a list */
-       SLIST_INIT(&timer_list);
-       SLIST_INSERT_HEAD(&timer_list, kn, kn_selnext);
-
-       KNOTE(&timer_list, 1);
-
-       /* if someone is waiting for timer to pop */
-       if (kn->kn_hookid & TIMER_CANCELWAIT) {
+       if (os_atomic_cmpxchgv(&kn->kn_hookid, TIMER_ARMED, TIMER_FIRED,
+                       &v, relaxed)) {
+               // our f_event always would say FILTER_ACTIVE,
+               // so be leaner and just do it.
                struct kqueue *kq = knote_get_kq(kn);
-               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                  CAST_EVENT64_T(&kn->kn_hook),
-                                  THREAD_AWAKENED,
-                                  WAITQ_ALL_PRIORITIES);
-
-               kn->kn_hookid &= ~TIMER_CANCELWAIT;
+               kqlock(kq);
+               knote_activate(kn);
+               kqunlock(kq);
+       } else {
+               /*
+                * From TIMER_ARMED, the only allowed transitions are:
+                * - to TIMER_FIRED through the timer callout just above
+                * - to TIMER_IDLE due to filt_timercancel() which will wait for the
+                *   timer callout (and any possible invocation of filt_timerexpire) to
+                *   have finished before the state is changed again.
+                */
+               assert(v == TIMER_IDLE);
        }
-
-       filt_timerunlock();
 }
 
-/*
- * Cancel a running timer (or wait for the pop).
- * Timer filter lock is held.
- * May drop and retake the timer filter lock.
- */
 static void
 filt_timercancel(struct knote *kn)
 {
-       filt_timer_assert_locked();
-
-       assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
-
-       /* if no timer, then we're good */
-       if ((kn->kn_hookid & TIMER_RUNNING) == 0)
-               return;
-
-       thread_call_t callout = (thread_call_t)kn->kn_hook;
-
-       /* cancel the callout if we can */
-       if (thread_call_cancel(callout)) {
-               kn->kn_hookid &= ~TIMER_RUNNING;
-               return;
+       if (os_atomic_xchg(&kn->kn_hookid, TIMER_IDLE, relaxed) == TIMER_ARMED) {
+               /* cancel the thread call and wait for any filt_timerexpire in flight */
+               thread_call_cancel_wait((thread_call_t)kn->kn_hook);
        }
+}
 
-       /* cancel failed, we have to wait for the in-flight expire routine */
-
-       kn->kn_hookid |= TIMER_CANCELWAIT;
-
-       struct kqueue *kq = knote_get_kq(kn);
-
-       waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                           CAST_EVENT64_T(&kn->kn_hook),
-                           THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
+/*
+ * Does this deadline need a timer armed for it, or has it expired?
+ */
+static bool
+filt_timer_is_ready(struct knote *kn)
+{
+       uint64_t now, deadline = kn->kn_ext[0];
 
-       filt_timerunlock();
-       thread_block(THREAD_CONTINUE_NULL);
-       filt_timerlock();
+       if (deadline == 0) {
+               return true;
+       }
 
-       assert((kn->kn_hookid & TIMER_CANCELWAIT) == 0);
-       assert((kn->kn_hookid & TIMER_RUNNING) == 0);
+       if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME) {
+               now = mach_continuous_time();
+       } else {
+               now = mach_absolute_time();
+       }
+       return deadline <= now;
 }
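For illustration only, the same readiness check can be expressed with the userspace Mach time APIs; ns_to_mach_time() is a hypothetical helper, not something defined in this file.

#include <mach/mach_time.h>
#include <stdbool.h>
#include <stdint.h>

/* hypothetical helper: nanoseconds -> Mach time units via the timebase */
static uint64_t
ns_to_mach_time(uint64_t ns)
{
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);
        return ns * tb.denom / tb.numer;
}

/* same shape as filt_timer_is_ready(): a deadline of 0 means "already expired" */
static bool
deadline_is_ready(uint64_t deadline, bool continuous)
{
        if (deadline == 0)
                return true;
        uint64_t now = continuous ? mach_continuous_time() : mach_absolute_time();
        return deadline <= now;
}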
 
+/*
+ * Arm a timer
+ *
+ * It is the responsibility of the caller to make sure the timer call
+ * has completed or been cancelled properly prior to arming it.
+ */
 static void
 filt_timerarm(struct knote *kn)
 {
-       filt_timer_assert_locked();
-
-       assert((kn->kn_hookid & TIMER_RUNNING) == 0);
-
-       thread_call_t callout = (thread_call_t)kn->kn_hook;
-
        uint64_t deadline = kn->kn_ext[0];
        uint64_t leeway   = kn->kn_ext[1];
 
        int filter_flags = kn->kn_sfflags;
        unsigned int timer_flags = 0;
 
+       assert(os_atomic_load(&kn->kn_hookid, relaxed) == TIMER_IDLE);
+
        if (filter_flags & NOTE_CRITICAL)
                timer_flags |= THREAD_CALL_DELAY_USER_CRITICAL;
        else if (filter_flags & NOTE_BACKGROUND)
@@ -1474,85 +1418,51 @@ filt_timerarm(struct knote *kn)
        if (filter_flags & NOTE_MACH_CONTINUOUS_TIME)
                timer_flags |= THREAD_CALL_CONTINUOUS;
 
-       thread_call_enter_delayed_with_leeway(callout, NULL,
-                                             deadline, leeway,
-                                             timer_flags);
-
-       kn->kn_hookid |= TIMER_RUNNING;
-}
-
-/*
- * Does this knote need a timer armed for it, or should it be ready immediately?
- */
-static boolean_t
-filt_timer_is_ready(struct knote *kn)
-{
-       uint64_t now;
-
-       if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
-               now = mach_continuous_time();
-       else
-               now = mach_absolute_time();
-
-       uint64_t deadline = kn->kn_ext[0];
-
-       if (deadline < now)
-               return TRUE;
-       else
-               return FALSE;
+       os_atomic_store(&kn->kn_hookid, TIMER_ARMED, relaxed);
+       thread_call_enter_delayed_with_leeway((thread_call_t)kn->kn_hook, NULL,
+                       deadline, leeway, timer_flags);
 }
 
 /*
  * Allocate a thread call for the knote's lifetime, and kick off the timer.
  */
 static int
-filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev)
+filt_timerattach(struct knote *kn, struct kevent_internal_s *kev)
 {
        thread_call_t callout;
+       struct filt_timer_params params;
        int error;
 
+       if ((error = filt_timervalidate(kev, &params)) != 0) {
+               knote_set_error(kn, error);
+               return 0;
+       }
+
        callout = thread_call_allocate_with_options(filt_timerexpire,
                        (thread_call_param_t)kn, THREAD_CALL_PRIORITY_HIGH,
                        THREAD_CALL_OPTIONS_ONCE);
 
        if (NULL == callout) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = ENOMEM;
-               return 0;
-       }
-
-       filt_timerlock();
-
-       if ((error = filt_timervalidate(kn)) != 0) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data  = error;
-               filt_timerunlock();
-
-               __assert_only boolean_t freed = thread_call_free(callout);
-               assert(freed);
+               knote_set_error(kn, ENOMEM);
                return 0;
        }
 
-       kn->kn_hook = (void*)callout;
-       kn->kn_hookid = 0;
+       filt_timer_set_params(kn, &params);
+       kn->kn_hook = callout;
        kn->kn_flags |= EV_CLEAR;
+       os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
 
        /* NOTE_ABSOLUTE implies EV_ONESHOT */
        if (kn->kn_sfflags & NOTE_ABSOLUTE)
                kn->kn_flags |= EV_ONESHOT;
 
-       boolean_t timer_ready = FALSE;
-
-       if ((timer_ready = filt_timer_is_ready(kn))) {
-               /* cause immediate expiration */
-               kn->kn_data = 1;
+       if (filt_timer_is_ready(kn)) {
+               os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
+               return FILTER_ACTIVE;
        } else {
                filt_timerarm(kn);
+               return 0;
        }
-
-       filt_timerunlock();
-
-       return timer_ready;
 }
 
 /*
@@ -1561,34 +1471,17 @@ filt_timerattach(struct knote *kn, __unused struct kevent_internal_s *kev)
 static void
 filt_timerdetach(struct knote *kn)
 {
-       thread_call_t callout;
-
-       filt_timerlock();
-
-       callout = (thread_call_t)kn->kn_hook;
-       filt_timercancel(kn);
-
-       filt_timerunlock();
+       __assert_only boolean_t freed;
 
-       __assert_only boolean_t freed = thread_call_free(callout);
+       /*
+        * Unconditionally cancel to make sure there can't be any filt_timerexpire()
+        * running anymore.
+        */
+       thread_call_cancel_wait((thread_call_t)kn->kn_hook);
+       freed = thread_call_free((thread_call_t)kn->kn_hook);
        assert(freed);
 }
 
-/*
- * filt_timerevent - post events to a timer knote
- *
- * Called in the context of filt_timerexpire with
- * the filt_timerlock held
- */
-static int
-filt_timerevent(struct knote *kn, __unused long hint)
-{
-       filt_timer_assert_locked();
-
-       kn->kn_data = 1;
-       return (1);
-}
-
 /*
  * filt_timertouch - update timer knote with new user input
  *
@@ -1597,54 +1490,36 @@ filt_timerevent(struct knote *kn, __unused long hint)
  * pops have gone off (in kn_data).
  */
 static int
-filt_timertouch(
-       struct knote *kn,
-       struct kevent_internal_s *kev)
+filt_timertouch(struct knote *kn, struct kevent_internal_s *kev)
 {
+       struct filt_timer_params params;
+       uint32_t changed_flags = (kn->kn_sfflags ^ kev->fflags);
        int error;
 
-       filt_timerlock();
-
-       /*
-        * cancel current call - drops and retakes lock
-        * TODO: not safe against concurrent touches?
-        */
-       filt_timercancel(kn);
+       if (changed_flags & NOTE_ABSOLUTE) {
+               kev->flags |= EV_ERROR;
+               kev->data = EINVAL;
+               return 0;
+       }
 
-       /* clear if the timer had previously fired, the user no longer wants to see it */
-       kn->kn_data = 0;
+       if ((error = filt_timervalidate(kev, &params)) != 0) {
+               kev->flags |= EV_ERROR;
+               kev->data = error;
+               return 0;
+       }
 
        /* capture the new values used to compute deadline */
-       kn->kn_sdata = kev->data;
+       filt_timercancel(kn);
+       filt_timer_set_params(kn, &params);
        kn->kn_sfflags = kev->fflags;
-       kn->kn_ext[0] = kev->ext[0];
-       kn->kn_ext[1] = kev->ext[1];
-
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
-
-       /* recalculate deadline */
-       error = filt_timervalidate(kn);
-       if (error) {
-               /* no way to report error, so mark it in the knote */
-               kn->kn_flags |= EV_ERROR;
-               kn->kn_data = error;
-               filt_timerunlock();
-               return 1;
-       }
 
-       boolean_t timer_ready = FALSE;
-
-       if ((timer_ready = filt_timer_is_ready(kn))) {
-               /* cause immediate expiration */
-               kn->kn_data = 1;
+       if (filt_timer_is_ready(kn)) {
+               os_atomic_store(&kn->kn_hookid, TIMER_IMMEDIATE, relaxed);
+               return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
        } else {
                filt_timerarm(kn);
+               return FILTER_UPDATE_REQ_QOS;
        }
-
-       filt_timerunlock();
-
-       return timer_ready;
 }
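Hypothetical userspace sketch of what reaches filt_timertouch(): issuing EV_ADD for an ident that is already registered updates the existing knote rather than creating a new one. retune_timer() and its parameters are illustrative names only.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>

static int
retune_timer(int kq, uintptr_t ident, int64_t new_interval_ms)
{
        struct kevent kev;

        /* EV_ADD on an existing ident takes the f_touch path, re-arming the timer */
        EV_SET(&kev, ident, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, new_interval_ms, NULL);
        return kevent(kq, &kev, 1, NULL, 0, NULL);
}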
 
 /*
@@ -1660,24 +1535,43 @@ filt_timerprocess(
        __unused struct filt_process_s *data,
        struct kevent_internal_s *kev)
 {
-       filt_timerlock();
-
-       if (kn->kn_data == 0 || (kn->kn_hookid & TIMER_CANCELWAIT)) {
+       /*
+        * filt_timerprocess is serialized with any filter routine except for
+        * filt_timerexpire, which atomically does a TIMER_ARMED -> TIMER_FIRED
+        * transition and, on success, activates the knote.
+        *
+        * Hence we don't need atomic read-modify-write of the state; it is enough
+        * to peek for the "FIRED" state, and if we see it, simple state machine
+        * transitions are safe.
+        */
+       switch (os_atomic_load(&kn->kn_hookid, relaxed)) {
+       case TIMER_IDLE:
+       case TIMER_ARMED:
                /*
-                * kn_data = 0:
-                * The timer hasn't yet fired, so there's nothing to deliver
-                * TIMER_CANCELWAIT:
-                * touch is in the middle of canceling the timer,
-                * so don't deliver or re-arm anything
-                *
                 * This can happen if a touch resets a timer that had fired
                 * without being processed
                 */
-               filt_timerunlock();
                return 0;
        }
 
-       if (kn->kn_sdata != 0 && ((kn->kn_flags & EV_ERROR) == 0)) {
+       os_atomic_store(&kn->kn_hookid, TIMER_IDLE, relaxed);
+
+       /*
+        * Copy out the interesting kevent state,
+        * but don't leak out the raw time calculations.
+        *
+        * TODO: potential enhancements - tell the user about:
+        *      - deadline to which this timer thought it was expiring
+        *      - return kn_sfflags in the fflags field so the client can know
+        *        under what flags the timer fired
+        */
+       *kev = kn->kn_kevent;
+       kev->ext[0] = 0;
+       /* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
+
+       if (kn->kn_sdata == 0) {
+               kev->data = 1;
+       } else {
                /*
                 * This is a 'repeating' timer, so we have to emit
                 * how many intervals expired between the arm
@@ -1687,9 +1581,6 @@ filt_timerprocess(
                 * this could easily be done in the client...
                 */
 
-               /* The timer better have had expired... */
-               assert((kn->kn_hookid & TIMER_RUNNING) == 0);
-
                uint64_t now;
 
                if (kn->kn_sfflags & NOTE_MACH_CONTINUOUS_TIME)
@@ -1713,18 +1604,11 @@ filt_timerprocess(
                 * and be in repeating mode, so therefore it must have been
                 * more than 'interval' time since the attach or last
                 * successful touch.
-                *
-                * An unsuccessful touch would:
-                * disarm the timer
-                * clear kn_data
-                * clear kn_sdata
-                * set EV_ERROR
-                * all of which will prevent this code from running.
                 */
                assert(num_fired > 0);
 
                /* report how many intervals have elapsed to the user */
-               kn->kn_data = (int64_t) num_fired;
+               kev->data = (int64_t)num_fired;
 
                /* We only need to re-arm the timer if it's not about to be destroyed */
                if ((kn->kn_flags & EV_ONESHOT) == 0) {
@@ -1735,62 +1619,33 @@ filt_timerprocess(
 
                        kn->kn_ext[0] = new_deadline;
 
+                       /*
+                        * This can't shortcut setting up the thread call, because
+                        * knote_process deactivates EV_CLEAR knotes unconditionally.
+                        */
                        filt_timerarm(kn);
                }
        }
 
-       /*
-        * Copy out the interesting kevent state,
-        * but don't leak out the raw time calculations.
-        *
-        * TODO: potential enhancements - tell the user about:
-        *      - deadline to which this timer thought it was expiring
-        *      - return kn_sfflags in the fflags field so the client can know
-        *        under what flags the timer fired
-        */
-       *kev = kn->kn_kevent;
-       kev->ext[0] = 0;
-       /* kev->ext[1] = 0;  JMM - shouldn't we hide this too? */
-
-       /* we have delivered the event, reset the timer pop count */
-       kn->kn_data = 0;
-
-       filt_timerunlock();
-       return 1;
+       return FILTER_ACTIVE;
 }
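A hedged sketch of the elapsed-interval arithmetic the repeating-timer branch above relies on; the exact computation of num_fired sits in the elided part of this hunk, and intervals_elapsed() is an illustrative helper, not kernel code.

#include <stdint.h>

static uint64_t
intervals_elapsed(uint64_t deadline, uint64_t interval, uint64_t now,
    uint64_t *next_deadline)
{
        uint64_t num_fired = 0;

        if (now >= deadline) {
                /* the armed deadline counts as the first pop */
                num_fired = 1 + (now - deadline) / interval;
        }
        /* the next deadline stays phase-locked to the original one */
        *next_deadline = deadline + num_fired * interval;
        return num_fired;
}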
 
 SECURITY_READ_ONLY_EARLY(static struct filterops) timer_filtops = {
+       .f_extended_codes = true,
        .f_attach   = filt_timerattach,
        .f_detach   = filt_timerdetach,
-       .f_event    = filt_timerevent,
+       .f_event    = filt_badevent,
        .f_touch    = filt_timertouch,
        .f_process  = filt_timerprocess,
 };
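An illustrative userspace program exercising timer_filtops end to end: a periodic EVFILT_TIMER whose data field reports how many intervals elapsed, matching the kev->data accounting in filt_timerprocess(). Error handling is intentionally minimal.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct kevent kev, out;
        int kq = kqueue();

        if (kq < 0)
                err(1, "kqueue");

        /* periodic timer, ident 1; the default data units are milliseconds */
        EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ENABLE, 0, 500, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                err(1, "kevent register");

        if (kevent(kq, NULL, 0, &out, 1, NULL) < 0)
                err(1, "kevent wait");

        /* data reports how many intervals elapsed since the last delivery */
        printf("timer fired, %lld interval(s) elapsed\n", (long long)out.data);
        return 0;
}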
 
-
-#pragma mark EVFILT_USER
-
-
-static void
-filt_userlock(void)
-{
-       lck_spin_lock(&_filt_userlock);
-}
-
-static void
-filt_userunlock(void)
-{
-       lck_spin_unlock(&_filt_userlock);
-}
+#pragma mark user_filtops
 
 static int
 filt_userattach(struct knote *kn, __unused struct kevent_internal_s *kev)
 {
-       /* EVFILT_USER knotes are not attached to anything in the kernel */
-       /* Cant discover this knote until after attach - so no lock needed */
-       kn->kn_hook = NULL;
        if (kn->kn_sfflags & NOTE_TRIGGER) {
-               kn->kn_hookid = 1;
+               kn->kn_hookid = FILTER_ACTIVE;
        } else {
                kn->kn_hookid = 0;
        }
@@ -1804,24 +1659,10 @@ filt_userdetach(__unused struct knote *kn)
 }
 
 static int
-filt_user(
-       __unused struct knote *kn,
-       __unused long hint)
-{
-       panic("filt_user");
-       return 0;
-}
-
-static int
-filt_usertouch(
-       struct knote *kn,
-       struct kevent_internal_s *kev)
+filt_usertouch(struct knote *kn, struct kevent_internal_s *kev)
 {
        uint32_t ffctrl;
        int fflags;
-       int active;
-
-       filt_userlock();
 
        ffctrl = kev->fflags & NOTE_FFCTRLMASK;
        fflags = kev->fflags & NOTE_FFLAGSMASK;
@@ -1840,17 +1681,10 @@ filt_usertouch(
        }
        kn->kn_sdata = kev->data;
 
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
-
        if (kev->fflags & NOTE_TRIGGER) {
-               kn->kn_hookid = 1;
+               kn->kn_hookid = FILTER_ACTIVE;
        }
-       active = kn->kn_hookid;
-
-       filt_userunlock();
-
-       return (active);
+       return (int)kn->kn_hookid;
 }
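The NOTE_FFCTRLMASK switch that filt_usertouch applies is elided between the hunks above; this sketch restates the conventional EVFILT_USER semantics for it, assuming the standard NOTE_FFNOP/NOTE_FFAND/NOTE_FFOR/NOTE_FFCOPY meanings.

#include <sys/event.h>
#include <stdint.h>

static uint32_t
apply_ffctrl(uint32_t saved_fflags, uint32_t incoming)
{
        uint32_t ffctrl = incoming & NOTE_FFCTRLMASK;
        uint32_t fflags = incoming & NOTE_FFLAGSMASK;

        switch (ffctrl) {
        case NOTE_FFNOP:        /* leave the saved flags alone */
                break;
        case NOTE_FFAND:
                saved_fflags &= fflags;
                break;
        case NOTE_FFOR:
                saved_fflags |= fflags;
                break;
        case NOTE_FFCOPY:
                saved_fflags = fflags;
                break;
        }
        return saved_fflags;
}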
 
 static int
@@ -1859,34 +1693,32 @@ filt_userprocess(
        __unused struct filt_process_s *data,
        struct kevent_internal_s *kev)
 {
-       filt_userlock();
-
-       if (kn->kn_hookid == 0) {
-               filt_userunlock();
-               return 0;
-       }
+       int result = (int)kn->kn_hookid;
 
-       *kev = kn->kn_kevent;
-       kev->fflags = (volatile UInt32)kn->kn_sfflags;
-       kev->data = kn->kn_sdata;
-       if (kn->kn_flags & EV_CLEAR) {
-               kn->kn_hookid = 0;
-               kn->kn_data = 0;
-               kn->kn_fflags = 0;
+       if (result) {
+               *kev = kn->kn_kevent;
+               kev->fflags = kn->kn_sfflags;
+               kev->data = kn->kn_sdata;
+               if (kn->kn_flags & EV_CLEAR) {
+                       kn->kn_hookid = 0;
+                       kn->kn_data = 0;
+                       kn->kn_fflags = 0;
+               }
        }
-       filt_userunlock();
 
-       return 1;
+       return result;
 }
 
-#pragma mark EVFILT_WORKLOOP
+SECURITY_READ_ONLY_EARLY(static struct filterops) user_filtops = {
+       .f_extended_codes = true,
+       .f_attach  = filt_userattach,
+       .f_detach  = filt_userdetach,
+       .f_event   = filt_badevent,
+       .f_touch   = filt_usertouch,
+       .f_process = filt_userprocess,
+};
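Hypothetical userspace usage of user_filtops: register an EVFILT_USER knote, then activate it with NOTE_TRIGGER, which is the kev->fflags path filt_usertouch handles above.

#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
        struct kevent kev, out;
        int kq = kqueue();

        if (kq < 0)
                err(1, "kqueue");

        /* attach: NOTE_TRIGGER is not set yet, so kn_hookid starts at 0 */
        EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                err(1, "register");

        /* touch: NOTE_TRIGGER marks the knote active */
        EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                err(1, "trigger");

        if (kevent(kq, NULL, 0, &out, 1, NULL) < 0)
                err(1, "wait");
        printf("EVFILT_USER fired (fflags=0x%x)\n", out.fflags);
        return 0;
}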
 
-#if DEBUG || DEVELOPMENT
-/*
- * see src/queue_internal.h in libdispatch
- */
-#define DISPATCH_QUEUE_ENQUEUED 0x1ull
-#endif
+#pragma mark workloop_filtops
 
 static inline void
 filt_wllock(struct kqworkloop *kqwl)
@@ -1900,117 +1732,188 @@ filt_wlunlock(struct kqworkloop *kqwl)
        lck_mtx_unlock(&kqwl->kqwl_statelock);
 }
 
-static inline void
-filt_wlheld(__assert_only struct kqworkloop *kqwl)
-{
-       LCK_MTX_ASSERT(&kqwl->kqwl_statelock, LCK_MTX_ASSERT_OWNED);
-}
-
-#define WL_OWNER_SUSPENDED    ((thread_t)(~0ull))  /* special owner when suspended */
-
+/*
+ * Returns true when the interlock for the turnstile is the workqueue lock
+ *
+ * When this is the case, all turnstile operations are delegated
+ * to the workqueue subsystem.
+ *
+ * This is required because kqueue_threadreq_bind_prepost only holds the
+ * workqueue lock but needs to move the inheritor from the workloop turnstile
+ * away from the creator thread, so that this now-fulfilled request can no
+ * longer be picked up by other threads.
+ */
 static inline bool
-filt_wlowner_is_valid(thread_t owner)
+filt_wlturnstile_interlock_is_workq(struct kqworkloop *kqwl)
 {
-       return owner != THREAD_NULL && owner != WL_OWNER_SUSPENDED;
+       struct kqrequest *kqr = &kqwl->kqwl_request;
+       return (kqr->kqr_state & KQR_THREQUESTED) &&
+                       (kqr->kqr_thread == THREAD_NULL);
 }
 
-static inline bool
-filt_wlshould_end_ownership(struct kqworkloop *kqwl,
-               struct kevent_internal_s *kev, int error)
+static void
+filt_wlupdate_inheritor(struct kqworkloop *kqwl, struct turnstile *ts,
+               turnstile_update_flags_t flags)
 {
-       thread_t owner = kqwl->kqwl_owner;
-       return (error == 0 || error == ESTALE) &&
-                       (kev->fflags & NOTE_WL_END_OWNERSHIP) &&
-                       (owner == current_thread() || owner == WL_OWNER_SUSPENDED);
-}
+       turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
+       struct kqrequest *kqr = &kqwl->kqwl_request;
 
-static inline bool
-filt_wlshould_update_ownership(struct kevent_internal_s *kev, int error)
-{
-       return error == 0 && (kev->fflags & NOTE_WL_DISCOVER_OWNER) &&
-                       kev->ext[EV_EXTIDX_WL_ADDR];
-}
+       /*
+        * binding to the workq should always happen through
+        * workq_kern_threadreq_update_inheritor()
+        */
+       assert(!filt_wlturnstile_interlock_is_workq(kqwl));
 
-static inline bool
-filt_wlshould_set_async_qos(struct kevent_internal_s *kev, int error,
-               kq_index_t async_qos)
-{
-       if (error != 0) {
-               return false;
+       if ((inheritor = kqwl->kqwl_owner)) {
+               flags |= TURNSTILE_INHERITOR_THREAD;
+       } else if ((inheritor = kqr->kqr_thread)) {
+               flags |= TURNSTILE_INHERITOR_THREAD;
        }
-       if (async_qos != THREAD_QOS_UNSPECIFIED) {
-               return true;
-       }
-       if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
-               /* see filt_wlprocess() */
-               return true;
-       }
-       return false;
+
+       turnstile_update_inheritor(ts, inheritor, flags);
 }
 
+#define FILT_WLATTACH 0
+#define FILT_WLTOUCH  1
+#define FILT_WLDROP   2
+
 __result_use_check
 static int
-filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev,
-               int error, kq_index_t async_qos)
+filt_wlupdate(struct kqworkloop *kqwl, struct knote *kn,
+               struct kevent_internal_s *kev, kq_index_t qos_index, int op)
 {
+       user_addr_t uaddr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
        struct kqrequest *kqr = &kqwl->kqwl_request;
        thread_t cur_owner, new_owner, extra_thread_ref = THREAD_NULL;
-       kq_index_t cur_override = THREAD_QOS_UNSPECIFIED;
-       kq_index_t old_owner_override = THREAD_QOS_UNSPECIFIED;
-       boolean_t ipc_override_is_sync = false;
-       boolean_t old_owner_override_is_sync = false;
-       int action = KQWL_UTQ_NONE;
+       kq_index_t cur_owner_override = THREAD_QOS_UNSPECIFIED;
+       int action = KQWL_UTQ_NONE, error = 0;
+       bool needs_wake = false, needs_wllock = false;
+       uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
+       uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
+       uint64_t udata = 0;
+
+       if (kev->fflags & (NOTE_WL_END_OWNERSHIP | NOTE_WL_DISCOVER_OWNER)) {
+               /*
+                * If we might change the kqwl_owner,
+                * then we need to hold the filt_wllock().
+                */
+               needs_wllock = true;
+       } else if (kqr->kqr_thread == current_thread()) {
+               /*
+                * <rdar://problem/41531764> Servicer updates need to be serialized with
+                * any ownership change too, as the kqr_thread value influences the
+                * outcome of handling NOTE_WL_DISCOVER_OWNER.
+                */
+               needs_wllock = true;
+       }
 
-       filt_wlheld(kqwl);
+       if (needs_wllock) {
+               filt_wllock(kqwl);
+               /*
+                * The kqwl owner is set under both the req and filter lock,
+                * meaning it's fine to look at it under any.
+                */
+               new_owner = cur_owner = kqwl->kqwl_owner;
+       } else {
+               new_owner = cur_owner = THREAD_NULL;
+       }
 
        /*
-        * The owner is only changed under both the filt_wllock and the
-        * kqwl_req_lock. Looking at it with either one held is fine.
+        * Phase 1:
+        *
+        * If asked, load the uint64 value at the user-provided address and compare
+        * it against the passed-in mask and expected value.
+        *
+        * If NOTE_WL_DISCOVER_OWNER is specified, translate the loaded name as
+        * a thread reference.
+        *
+        * If NOTE_WL_END_OWNERSHIP is specified and the currently known owner is
+        * the current thread, then end ownership.
+        *
+        * Lastly decide whether we need to perform a QoS update.
         */
-       cur_owner = kqwl->kqwl_owner;
-       if (filt_wlshould_end_ownership(kqwl, kev, error)) {
-               new_owner = THREAD_NULL;
-       } else if (filt_wlshould_update_ownership(kev, error)) {
-               /*
-                * Decipher the owner port name, and translate accordingly.
-                * The low 2 bits were borrowed for other flags, so mask them off.
-                */
-               uint64_t udata = kev->ext[EV_EXTIDX_WL_VALUE];
-               mach_port_name_t new_owner_name = (mach_port_name_t)udata & ~0x3;
-               if (new_owner_name != MACH_PORT_NULL) {
-                       new_owner_name = ipc_entry_name_mask(new_owner_name);
-               }
-
-               if (MACH_PORT_VALID(new_owner_name)) {
-                       new_owner = port_name_to_thread(new_owner_name);
-                       if (new_owner == THREAD_NULL)
-                               return EOWNERDEAD;
-                       extra_thread_ref = new_owner;
-               } else if (new_owner_name == MACH_PORT_DEAD) {
-                       new_owner = WL_OWNER_SUSPENDED;
-               } else {
+       if (uaddr) {
+               error = copyin_word(uaddr, &udata, sizeof(udata));
+               if (error) {
+                       goto out;
+               }
+
+               /* Update state as copied in.  */
+               kev->ext[EV_EXTIDX_WL_VALUE] = udata;
+
+               if ((udata & mask) != (kdata & mask)) {
+                       error = ESTALE;
+               } else if (kev->fflags & NOTE_WL_DISCOVER_OWNER) {
                        /*
-                        * We never want to learn a new owner that is NULL.
-                        * Ownership should be ended with END_OWNERSHIP.
+                        * Decipher the owner port name, and translate accordingly.
+                        * The low 2 bits were borrowed for other flags, so mask them off.
+                        *
+                        * Then attempt translation to a thread reference or fail.
                         */
-                       new_owner = cur_owner;
+                       mach_port_name_t name = (mach_port_name_t)udata & ~0x3;
+                       if (name != MACH_PORT_NULL) {
+                               name = ipc_entry_name_mask(name);
+                               extra_thread_ref = port_name_to_thread(name);
+                               if (extra_thread_ref == THREAD_NULL) {
+                                       error = EOWNERDEAD;
+                                       goto out;
+                               }
+                               new_owner = extra_thread_ref;
+                       }
                }
-       } else {
-               new_owner = cur_owner;
        }
 
-       if (filt_wlshould_set_async_qos(kev, error, async_qos)) {
-               action = KQWL_UTQ_SET_ASYNC_QOS;
+       if ((kev->fflags & NOTE_WL_END_OWNERSHIP) && new_owner == current_thread()) {
+               new_owner = THREAD_NULL;
+       }
+
+       if (error == 0) {
+               if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && (kev->flags & EV_DELETE)) {
+                       action = KQWL_UTQ_SET_QOS_INDEX;
+               } else if (qos_index && kqr->kqr_qos_index != qos_index) {
+                       action = KQWL_UTQ_SET_QOS_INDEX;
+               }
+
+               if (op == FILT_WLTOUCH) {
+                       /*
+                        * Save off any additional fflags/data we just accepted,
+                        * but only keep the last round of "update" bits we acted on,
+                        * which helps debugging a lot.
+                        */
+                       kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
+                       kn->kn_sfflags |= kev->fflags;
+                       kn->kn_sdata = kev->data;
+                       if (kev->fflags & NOTE_WL_SYNC_WAKE) {
+                               needs_wake = (kn->kn_hook != THREAD_NULL);
+                       }
+               } else if (op == FILT_WLDROP) {
+                       if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
+                                       NOTE_WL_SYNC_WAIT) {
+                               /*
+                                * When deleting a SYNC_WAIT knote that hasn't been woken up
+                                * explicitly, issue a wake up.
+                                */
+                               kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
+                               needs_wake = (kn->kn_hook != THREAD_NULL);
+                       }
+               }
        }
-       if (cur_owner == new_owner && action == KQWL_UTQ_NONE) {
+
+       /*
+        * Phase 2:
+        *
+        * Commit ownership and QoS changes if any, possibly wake up waiters
+        */
+
+       if (cur_owner == new_owner && action == KQWL_UTQ_NONE && !needs_wake) {
                goto out;
        }
 
-       kqwl_req_lock(kqwl);
+       kq_req_lock(kqwl);
 
        /* If already tracked as servicer, don't track as owner */
-       if ((kqr->kqr_state & KQR_BOUND) && new_owner == kqr->kqr_thread) {
-               kqwl->kqwl_owner = new_owner = THREAD_NULL;
+       if (new_owner == kqr->kqr_thread) {
+               new_owner = THREAD_NULL;
        }
 
        if (cur_owner != new_owner) {
@@ -2019,30 +1922,24 @@ filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev,
                        /* we just transferred this ref to kqwl_owner */
                        extra_thread_ref = THREAD_NULL;
                }
-               cur_override = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
-               old_owner_override = kqr->kqr_dsync_owner_qos;
-               old_owner_override_is_sync = kqr->kqr_owner_override_is_sync;
+               cur_owner_override = kqworkloop_owner_override(kqwl);
+
+               if (cur_owner) {
+                       thread_ends_owning_workloop(cur_owner);
+               }
 
-               if (filt_wlowner_is_valid(new_owner)) {
+               if (new_owner) {
                        /* override it before we drop the old */
-                       if (cur_override != THREAD_QOS_UNSPECIFIED) {
-                               thread_add_ipc_override(new_owner, cur_override);
+                       if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
+                               thread_add_ipc_override(new_owner, cur_owner_override);
                        }
-                       if (ipc_override_is_sync) {
-                               thread_add_sync_ipc_override(new_owner);
-                       }
-                       /* Update the kqr to indicate that owner has sync ipc override */
-                       kqr->kqr_dsync_owner_qos = cur_override;
-                       kqr->kqr_owner_override_is_sync = ipc_override_is_sync;
                        thread_starts_owning_workloop(new_owner);
-                       if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED) {
+                       if ((kqr->kqr_state & KQR_THREQUESTED) && !kqr->kqr_thread) {
                                if (action == KQWL_UTQ_NONE) {
                                        action = KQWL_UTQ_REDRIVE_EVENTS;
                                }
                        }
-               } else if (new_owner == THREAD_NULL) {
-                       kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
-                       kqr->kqr_owner_override_is_sync = false;
+               } else {
                        if ((kqr->kqr_state & (KQR_THREQUESTED | KQR_WAKEUP)) == KQR_WAKEUP) {
                                if (action == KQWL_UTQ_NONE) {
                                        action = KQWL_UTQ_REDRIVE_EVENTS;
@@ -2051,74 +1948,100 @@ filt_wlupdateowner(struct kqworkloop *kqwl, struct kevent_internal_s *kev,
                }
        }
 
+       struct turnstile *ts = kqwl->kqwl_turnstile;
+       bool wl_inheritor_updated = false;
+
        if (action != KQWL_UTQ_NONE) {
-               kqworkloop_update_threads_qos(kqwl, action, async_qos);
+               kqworkloop_update_threads_qos(kqwl, action, qos_index);
        }
 
-       kqwl_req_unlock(kqwl);
-
-       /* Now that we are unlocked, drop the override and ref on old owner */
-       if (new_owner != cur_owner && filt_wlowner_is_valid(cur_owner)) {
-               if (old_owner_override != THREAD_QOS_UNSPECIFIED) {
-                       thread_drop_ipc_override(cur_owner);
+       if (cur_owner != new_owner && ts) {
+               if (action == KQWL_UTQ_REDRIVE_EVENTS) {
+                       /*
+                        * Note that when action is KQWL_UTQ_REDRIVE_EVENTS,
+                        * the code went through workq_kern_threadreq_initiate()
+                        * and the workqueue has set the inheritor already
+                        */
+                       assert(filt_wlturnstile_interlock_is_workq(kqwl));
+               } else if (filt_wlturnstile_interlock_is_workq(kqwl)) {
+                       workq_kern_threadreq_lock(kqwl->kqwl_p);
+                       workq_kern_threadreq_update_inheritor(kqwl->kqwl_p, kqr, new_owner,
+                                       ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       workq_kern_threadreq_unlock(kqwl->kqwl_p);
+                       if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
+                               /*
+                                * If the workq is no longer the interlock, then
+                                * workq_kern_threadreq_update_inheritor() has finished a bind
+                                * and we need to fall back to the regular path.
+                                */
+                               filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       }
+                       wl_inheritor_updated = true;
+               } else {
+                       filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       wl_inheritor_updated = true;
                }
-               if (old_owner_override_is_sync) {
-                       thread_drop_sync_ipc_override(cur_owner);
+
+               /*
+                * We need a turnstile reference because we are dropping the interlock
+                * and the caller has not called turnstile_prepare.
+                */
+               if (wl_inheritor_updated) {
+                       turnstile_reference(ts);
                }
-               thread_ends_owning_workloop(cur_owner);
-               thread_deallocate(cur_owner);
        }
 
-out:
-       if (extra_thread_ref) {
-               thread_deallocate(extra_thread_ref);
+       if (needs_wake && ts) {
+               waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T((event_t)kn),
+                               (thread_t)kn->kn_hook, THREAD_AWAKENED);
        }
-       return error;
-}
 
-static int
-filt_wldebounce(
-       struct kqworkloop *kqwl,
-       struct kevent_internal_s *kev,
-       int default_result)
-{
-       user_addr_t addr = CAST_USER_ADDR_T(kev->ext[EV_EXTIDX_WL_ADDR]);
-       uint64_t udata;
-       int error;
+       kq_req_unlock(kqwl);
 
-       /* we must have the workloop state mutex held */
-       filt_wlheld(kqwl);
+       if (wl_inheritor_updated) {
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+               turnstile_deallocate(ts);
+       }
 
-       /* Do we have a debounce address to work with? */
-       if (addr) {
-               uint64_t kdata = kev->ext[EV_EXTIDX_WL_VALUE];
-               uint64_t mask = kev->ext[EV_EXTIDX_WL_MASK];
+out:
+       /*
+        * Phase 3:
+        *
+        * Unlock and clean up any lingering references.
+        */
+       if (needs_wllock) {
+               filt_wlunlock(kqwl);
+       }
 
-               error = copyin_word(addr, &udata, sizeof(udata));
-               if (error) {
-                       return error;
-               }
+#if CONFIG_WORKLOOP_DEBUG
+       KQWL_HISTORY_WRITE_ENTRY(kqwl, {
+               .updater = current_thread(),
+               .servicer = kqr->kqr_thread, /* Note: racy */
+               .old_owner = cur_owner,
+               .new_owner = new_owner,
 
-               /* update state as copied in */
-               kev->ext[EV_EXTIDX_WL_VALUE] = udata;
+               .kev_ident  = kev->ident,
+               .error      = (int16_t)error,
+               .kev_flags  = kev->flags,
+               .kev_fflags = kev->fflags,
 
-               /* If the masked bits don't match, reject it as stale */
-               if ((udata & mask) != (kdata & mask)) {
-                       return ESTALE;
-               }
+               .kev_mask   = mask,
+               .kev_value  = kdata,
+               .in_value   = udata,
+       });
+#endif // CONFIG_WORKLOOP_DEBUG
 
-#if DEBUG || DEVELOPMENT
-               if ((kev->fflags & NOTE_WL_THREAD_REQUEST) && !(kev->flags & EV_DELETE)) {
-                       if ((udata & DISPATCH_QUEUE_ENQUEUED) == 0 &&
-                                       (udata >> 48) != 0 && (udata >> 48) != 0xffff) {
-                               panic("kevent: workloop %#016llx is not enqueued "
-                                               "(kev:%p dq_state:%#016llx)", kev->udata, kev, udata);
-                       }
+       if (cur_owner && new_owner != cur_owner) {
+               if (cur_owner_override != THREAD_QOS_UNSPECIFIED) {
+                       thread_drop_ipc_override(cur_owner);
                }
-#endif
+               thread_deallocate(cur_owner);
        }
 
-       return default_result;
+       if (extra_thread_ref) {
+               thread_deallocate(extra_thread_ref);
+       }
+       return error;
 }
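A tiny sketch of the Phase 1 debounce above: the 64-bit word fetched by copyin_word() must match the expected value under the caller-supplied mask, or the update is rejected as ESTALE. wl_debounce() is an illustrative name, not a function in this file.

#include <errno.h>
#include <stdint.h>

static int
wl_debounce(uint64_t udata, uint64_t kdata, uint64_t mask)
{
        /* udata: word copied in from userspace; kdata/mask: caller's expectation */
        return ((udata & mask) == (kdata & mask)) ? 0 : ESTALE;
}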
 
 /*
@@ -2129,59 +2052,14 @@ filt_wldebounce(
  * - data is set to the error if any
  */
 static inline void
-filt_wlremember_last_update(
-       __assert_only struct kqworkloop *kqwl,
-       struct knote *kn,
-       struct kevent_internal_s *kev,
-       int error)
+filt_wlremember_last_update(struct knote *kn, struct kevent_internal_s *kev,
+               int error)
 {
-       filt_wlheld(kqwl);
        kn->kn_fflags = kev->fflags;
        kn->kn_data = error;
        memcpy(kn->kn_ext, kev->ext, sizeof(kev->ext));
 }
 
-/*
- * Return which operations on EVFILT_WORKLOOP need to be protected against
- * knoteusewait() causing priority inversions.
- */
-static bool
-filt_wlneeds_boost(struct kevent_internal_s *kev)
-{
-       if (kev == NULL) {
-               /*
-                * this is an f_process() usecount, and it can cause a drop to wait
-                */
-               return true;
-       }
-       if (kev->fflags & NOTE_WL_THREAD_REQUEST) {
-               /*
-                * All operations on thread requests may starve drops or re-attach of
-                * the same knote, all of them need boosts. None of what we do under
-                * thread-request usecount holds blocks anyway.
-                */
-               return true;
-       }
-       if (kev->fflags & NOTE_WL_SYNC_WAIT) {
-               /*
-                * this may call filt_wlwait() and we don't want to hold any boost when
-                * woken up, this would cause background threads contending on
-                * dispatch_sync() to wake up at 64 and be preempted immediately when
-                * this drops.
-                */
-               return false;
-       }
-
-       /*
-        * SYNC_WAIT knotes when deleted don't need to be rushed, there's no
-        * detach/reattach race with these ever. In addition to this, when the
-        * SYNC_WAIT knote is dropped, the caller is no longer receiving the
-        * workloop overrides if any, and we'd rather schedule other threads than
-        * him, he's not possibly stalling anything anymore.
-        */
-       return (kev->flags & EV_DELETE) == 0;
-}
-
 static int
 filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
 {
@@ -2199,7 +2077,7 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
        if (kev->ident == 0 && kev->udata == 0 && kev->fflags == 0) {
                struct kqrequest *kqr = &kqwl->kqwl_request;
 
-               kqwl_req_lock(kqwl);
+               kq_req_lock(kqwl);
                kev->fflags = 0;
                if (kqr->kqr_dsync_waiters) {
                        kev->fflags |= NOTE_WL_SYNC_WAIT;
@@ -2207,21 +2085,16 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
                if (kqr->kqr_qos_index) {
                        kev->fflags |= NOTE_WL_THREAD_REQUEST;
                }
-               if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
-                       kev->ext[0] = ~0ull;
-               } else {
-                       kev->ext[0] = thread_tid(kqwl->kqwl_owner);
-               }
+               kev->ext[0] = thread_tid(kqwl->kqwl_owner);
                kev->ext[1] = thread_tid(kqwl->kqwl_request.kqr_thread);
                kev->ext[2] = thread_owned_workloops_count(current_thread());
                kev->ext[3] = kn->kn_kevent.ext[3];
-               kqwl_req_unlock(kqwl);
+               kq_req_unlock(kqwl);
                error = EBUSY;
                goto out;
        }
 #endif
 
-       /* Some simple validation */
        int command = (kn->kn_sfflags & NOTE_WL_COMMANDS_MASK);
        switch (command) {
        case NOTE_WL_THREAD_REQUEST:
@@ -2229,19 +2102,22 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
                        error = EINVAL;
                        goto out;
                }
-               qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
-               if (qos_index < THREAD_QOS_MAINTENANCE ||
-                               qos_index > THREAD_QOS_USER_INTERACTIVE) {
+               qos_index = _pthread_priority_thread_qos(kn->kn_qos);
+               if (qos_index == THREAD_QOS_UNSPECIFIED) {
                        error = ERANGE;
                        goto out;
                }
+               if (kqwl->kqwl_request.kqr_qos_index) {
+                       /*
+                        * There already is a thread request, and well, you're only allowed
+                        * one per workloop, so fail the attach.
+                        */
+                       error = EALREADY;
+                       goto out;
+               }
                break;
        case NOTE_WL_SYNC_WAIT:
        case NOTE_WL_SYNC_WAKE:
-               if (kq->kq_state & KQ_NO_WQ_THREAD) {
-                       error = ENOTSUP;
-                       goto out;
-               }
                if (kn->kn_id == kqwl->kqwl_dynamicid) {
                        error = EINVAL;
                        goto out;
@@ -2260,139 +2136,131 @@ filt_wlattach(struct knote *kn, struct kevent_internal_s *kev)
                goto out;
        }
 
-       filt_wllock(kqwl);
-       kn->kn_hook = NULL;
-
-       if (command == NOTE_WL_THREAD_REQUEST && kqwl->kqwl_request.kqr_qos_index) {
-               /*
-                * There already is a thread request, and well, you're only allowed
-                * one per workloop, so fail the attach.
-                *
-                * Note: kqr_qos_index is always set with the wllock held, so we
-                * don't need to take the kqr lock.
-                */
-               error = EALREADY;
-       } else {
-               /* Make sure user and kernel are in agreement on important state */
-               error = filt_wldebounce(kqwl, kev, 0);
-       }
+       error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLATTACH);
 
-       error = filt_wlupdateowner(kqwl, kev, error, qos_index);
-       filt_wlunlock(kqwl);
 out:
        if (error) {
-               kn->kn_flags |= EV_ERROR;
                /* If userland wants ESTALE to be hidden, fail the attach anyway */
                if (error == ESTALE && (kn->kn_sfflags & NOTE_WL_IGNORE_ESTALE)) {
                        error = 0;
                }
-               kn->kn_data = error;
+               knote_set_error(kn, error);
                return 0;
        }
-
+       if (command == NOTE_WL_SYNC_WAIT) {
+               return kevent_register_wait_prepare(kn, kev);
+       }
        /* Just attaching the thread request successfully will fire it */
-       return command == NOTE_WL_THREAD_REQUEST;
+       if (command == NOTE_WL_THREAD_REQUEST) {
+               /*
+                * Thread Request knotes need an explicit touch to be active again,
+                * so delivering an event needs to also consume it.
+                */
+               kn->kn_flags |= EV_CLEAR;
+               return FILTER_ACTIVE;
+       }
+       return 0;
 }
 
-__attribute__((noinline,not_tail_called))
-static int
-filt_wlwait(struct kqworkloop           *kqwl,
-            struct knote                *kn,
-            struct kevent_internal_s    *kev)
+static void __dead2
+filt_wlwait_continue(void *parameter, wait_result_t wr)
 {
-       filt_wlheld(kqwl);
-       assert((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0);
+       struct _kevent_register *cont_args = parameter;
+       struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
+       struct kqrequest *kqr = &kqwl->kqwl_request;
 
-       /*
-        * Hint to the wakeup side that this thread is waiting.  Also used by
-        * stackshot for waitinfo.
-        */
-       kn->kn_hook = current_thread();
+       kq_req_lock(kqwl);
+       kqr->kqr_dsync_waiters--;
+       if (filt_wlturnstile_interlock_is_workq(kqwl)) {
+               workq_kern_threadreq_lock(kqwl->kqwl_p);
+               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
+               workq_kern_threadreq_unlock(kqwl->kqwl_p);
+       } else {
+               turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, NULL);
+       }
+       kq_req_unlock(kqwl);
 
-       thread_set_pending_block_hint(current_thread(), kThreadWaitWorkloopSyncWait);
+       turnstile_cleanup();
 
-       wait_result_t wr = assert_wait(kn, THREAD_ABORTSAFE);
+       if (wr == THREAD_INTERRUPTED) {
+               cont_args->kev.flags |= EV_ERROR;
+               cont_args->kev.data = EINTR;
+       } else if (wr != THREAD_AWAKENED) {
+               panic("Unexpected wait result: %d", wr);
+       }
 
-       if (wr == THREAD_WAITING) {
-               kq_index_t qos_index = qos_index_from_qos(kn, kev->qos, TRUE);
-               struct kqrequest *kqr = &kqwl->kqwl_request;
+       kevent_register_wait_return(cont_args);
+}
 
-               thread_t thread_to_handoff = THREAD_NULL; /* holds +1 thread ref */
+/*
+ * Called with the workloop mutex held; most of the time this never returns,
+ * as it calls filt_wlwait_continue through a continuation.
+ */
+static void __dead2
+filt_wlpost_register_wait(struct uthread *uth, struct knote_lock_ctx *knlc,
+               struct _kevent_register *cont_args)
+{
+       struct kqworkloop *kqwl = (struct kqworkloop *)cont_args->kq;
+       struct kqrequest *kqr = &kqwl->kqwl_request;
+       struct turnstile *ts;
+       bool workq_locked = false;
 
-               thread_t kqwl_owner = kqwl->kqwl_owner;
-               if (filt_wlowner_is_valid(kqwl_owner)) {
-                       thread_reference(kqwl_owner);
-                       thread_to_handoff = kqwl_owner;
-               }
+       kq_req_lock(kqwl);
 
-               kqwl_req_lock(kqwl);
+       kqr->kqr_dsync_waiters++;
 
-               if (qos_index) {
-                       assert(kqr->kqr_dsync_waiters < UINT16_MAX);
-                       kqr->kqr_dsync_waiters++;
-                       if (qos_index > kqr->kqr_dsync_waiters_qos) {
-                               kqworkloop_update_threads_qos(kqwl,
-                                               KQWL_UTQ_SET_SYNC_WAITERS_QOS, qos_index);
-                       }
-               }
+       if (filt_wlturnstile_interlock_is_workq(kqwl)) {
+               workq_kern_threadreq_lock(kqwl->kqwl_p);
+               workq_locked = true;
+       }
 
-               if ((kqr->kqr_state & KQR_BOUND) && thread_to_handoff == THREAD_NULL) {
-                       assert(kqr->kqr_thread != THREAD_NULL);
-                       thread_t servicer = kqr->kqr_thread;
+       ts = turnstile_prepare((uintptr_t)kqwl, &kqwl->kqwl_turnstile,
+                       TURNSTILE_NULL, TURNSTILE_WORKLOOPS);
 
-                       thread_reference(servicer);
-                       thread_to_handoff = servicer;
+       if (workq_locked) {
+               workq_kern_threadreq_update_inheritor(kqwl->kqwl_p,
+                               &kqwl->kqwl_request, kqwl->kqwl_owner, ts,
+                               TURNSTILE_DELAYED_UPDATE);
+               if (!filt_wlturnstile_interlock_is_workq(kqwl)) {
+                       /*
+                        * if the interlock is no longer the workqueue lock,
+                        * then we don't need to hold it anymore.
+                        */
+                       workq_kern_threadreq_unlock(kqwl->kqwl_p);
+                       workq_locked = false;
                }
+       }
+       if (!workq_locked) {
+               /*
+                * If the interlock is the workloop's, then it's our responsibility to
+                * call update_inheritor, so just do it.
+                */
+               filt_wlupdate_inheritor(kqwl, ts, TURNSTILE_DELAYED_UPDATE);
+       }
 
-               kqwl_req_unlock(kqwl);
-
-               filt_wlunlock(kqwl);
-
-               /* TODO: use continuation based blocking <rdar://problem/31299584> */
-
-               /* consume a refcount on thread_to_handoff, then thread_block() */
-               wr = thread_handoff(thread_to_handoff);
-               thread_to_handoff = THREAD_NULL;
-
-               filt_wllock(kqwl);
-
-               /* clear waiting state (only one waiting thread - so no race) */
-               assert(kn->kn_hook == current_thread());
+       thread_set_pending_block_hint(uth->uu_thread, kThreadWaitWorkloopSyncWait);
+       waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(cont_args->knote),
+                       THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
 
-               if (qos_index) {
-                       kqwl_req_lock(kqwl);
-                       assert(kqr->kqr_dsync_waiters > 0);
-                       if (--kqr->kqr_dsync_waiters == 0) {
-                               assert(kqr->kqr_dsync_waiters_qos);
-                               kqworkloop_update_threads_qos(kqwl,
-                                               KQWL_UTQ_SET_SYNC_WAITERS_QOS, 0);
-                       }
-                       kqwl_req_unlock(kqwl);
-               }
+       if (workq_locked) {
+               workq_kern_threadreq_unlock(kqwl->kqwl_p);
        }
 
-       kn->kn_hook = NULL;
-
-       switch (wr) {
-       case THREAD_AWAKENED:
-               return 0;
-       case THREAD_INTERRUPTED:
-               return EINTR;
-       case THREAD_RESTART:
-               return ECANCELED;
-       default:
-               panic("filt_wlattach: unexpected wait result %d", wr);
-               return EINVAL;
+       thread_t thread = kqwl->kqwl_owner ?: kqr->kqr_thread;
+       if (thread) {
+               thread_reference(thread);
        }
+       kq_req_unlock(kqwl);
+
+       kevent_register_wait_block(ts, thread, knlc, filt_wlwait_continue, cont_args);
 }
 
 /* called in stackshot context to report the thread responsible for blocking this thread */
 void
 kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
-                                  event64_t event,
-                                  thread_waitinfo_t *waitinfo)
+               event64_t event, thread_waitinfo_t *waitinfo)
 {
-       struct knote *kn = (struct knote*) event;
+       struct knote *kn = (struct knote *)event;
        assert(kdp_is_in_zone(kn, "knote zone"));
 
        assert(kn->kn_hook == thread);
@@ -2407,9 +2275,7 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
        thread_t kqwl_owner = kqwl->kqwl_owner;
        thread_t servicer = kqr->kqr_thread;
 
-       if (kqwl_owner == WL_OWNER_SUSPENDED) {
-               waitinfo->owner = STACKSHOT_WAITOWNER_SUSPENDED;
-       } else if (kqwl_owner != THREAD_NULL) {
+       if (kqwl_owner != THREAD_NULL) {
                assert(kdp_is_in_zone(kqwl_owner, "threads"));
 
                waitinfo->owner = thread_tid(kqwl->kqwl_owner);
@@ -2424,205 +2290,82 @@ kdp_workloop_sync_wait_find_owner(__assert_only thread_t thread,
        }
 
        waitinfo->context = kqwl->kqwl_dynamicid;
-
-       return;
-}
-
-/*
- * Takes kqueue locked, returns locked, may drop in the middle and/or block for a while
- */
-static int
-filt_wlpost_attach(struct knote *kn, struct  kevent_internal_s *kev)
-{
-       struct kqueue *kq = knote_get_kq(kn);
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       int error = 0;
-
-       if (kev->fflags & NOTE_WL_SYNC_WAIT) {
-               if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
-                       filt_wllock(kqwl);
-                       /* if the wake has already preposted, don't wait */
-                       if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
-                               error = filt_wlwait(kqwl, kn, kev);
-                       filt_wlunlock(kqwl);
-                       knoteuse2kqlock(kq, kn, KNUSE_NONE);
-               }
-       }
-       return error;
 }
 
 static void
 filt_wldetach(__assert_only struct knote *kn)
 {
        assert(knote_get_kq(kn)->kq_state & KQ_WORKLOOP);
-
-       /*
-        * Thread requests have nothing to detach.
-        * Sync waiters should have been aborted out
-        * and drop their refs before we could drop/
-        * detach their knotes.
-        */
-       assert(kn->kn_hook == NULL);
-}
-
-static int
-filt_wlevent(
-       __unused struct knote *kn,
-       __unused long hint)
-{
-       panic("filt_wlevent");
-       return 0;
+       if (kn->kn_hook) {
+               kevent_register_wait_cleanup(kn);
+       }
 }
 
 static int
-filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev)
+filt_wlvalidate_kev_flags(struct knote *kn, struct kevent_internal_s *kev,
+               thread_qos_t *qos_index)
 {
        int new_commands = kev->fflags & NOTE_WL_COMMANDS_MASK;
        int sav_commands = kn->kn_sfflags & NOTE_WL_COMMANDS_MASK;
-       int error = 0;
+
+       if ((kev->fflags & NOTE_WL_DISCOVER_OWNER) && (kev->flags & EV_DELETE)) {
+               return EINVAL;
+       }
+       if (kev->fflags & NOTE_WL_UPDATE_QOS) {
+               if (kev->flags & EV_DELETE) {
+                       return EINVAL;
+               }
+               if (sav_commands != NOTE_WL_THREAD_REQUEST) {
+                       return EINVAL;
+               }
+               if (!(*qos_index = _pthread_priority_thread_qos(kev->qos))) {
+                       return ERANGE;
+               }
+       }
 
        switch (new_commands) {
        case NOTE_WL_THREAD_REQUEST:
                /* thread requests can only update themselves */
-               if (sav_commands != new_commands)
-                       error = EINVAL;
+               if (sav_commands != NOTE_WL_THREAD_REQUEST)
+                       return EINVAL;
                break;
 
        case NOTE_WL_SYNC_WAIT:
                if (kev->fflags & NOTE_WL_END_OWNERSHIP)
-                       error = EINVAL;
-               /* FALLTHROUGH */
+                       return EINVAL;
+               goto sync_checks;
+
        case NOTE_WL_SYNC_WAKE:
-               /* waits and wakes can update themselves or their counterparts */
+       sync_checks:
                if (!(sav_commands & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)))
-                       error = EINVAL;
-               if (kev->fflags & NOTE_WL_UPDATE_QOS)
-                       error = EINVAL;
+                       return EINVAL;
                if ((kev->flags & (EV_ENABLE | EV_DELETE)) == EV_ENABLE)
-                       error = EINVAL;
-               if (kev->flags & EV_DELETE) {
-                       /*
-                        * Really this is not supported: there is absolutely no reason
-                        * whatsoever to want to fail the drop of a NOTE_WL_SYNC_WAIT knote.
-                        */
-                       if (kev->ext[EV_EXTIDX_WL_ADDR] && kev->ext[EV_EXTIDX_WL_MASK]) {
-                               error = EINVAL;
-                       }
-               }
+                       return EINVAL;
                break;
 
        default:
-               error = EINVAL;
-       }
-       if ((kev->flags & EV_DELETE) && (kev->fflags & NOTE_WL_DISCOVER_OWNER)) {
-               error = EINVAL;
+               return EINVAL;
        }
-       return error;
+       return 0;
 }
 
 static int
-filt_wltouch(
-       struct knote *kn,
-       struct kevent_internal_s *kev)
+filt_wltouch(struct knote *kn, struct kevent_internal_s *kev)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       int error = 0;
-       struct kqworkloop *kqwl;
-
-       assert(kq->kq_state & KQ_WORKLOOP);
-       kqwl = (struct kqworkloop *)kq;
+       struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
+       thread_qos_t qos_index = THREAD_QOS_UNSPECIFIED;
 
-       error = filt_wlvalidate_kev_flags(kn, kev);
+       int error = filt_wlvalidate_kev_flags(kn, kev, &qos_index);
        if (error) {
                goto out;
        }
 
-       filt_wllock(kqwl);
-
-       /* Make sure user and kernel are in agreement on important state */
-       error = filt_wldebounce(kqwl, kev, 0);
-       if (error) {
-               error = filt_wlupdateowner(kqwl, kev, error, 0);
-               goto out_unlock;
-       }
-
-       int new_command = kev->fflags & NOTE_WL_COMMANDS_MASK;
-       switch (new_command) {
-       case NOTE_WL_THREAD_REQUEST:
-               assert(kqwl->kqwl_request.kqr_qos_index != THREAD_QOS_UNSPECIFIED);
-               break;
-
-       case NOTE_WL_SYNC_WAIT:
-               /*
-                * we need to allow waiting several times on the same knote because
-                * of EINTR. If it's already woken though, it won't block.
-                */
-               break;
-
-       case NOTE_WL_SYNC_WAKE:
-               if (kn->kn_sfflags & NOTE_WL_SYNC_WAKE) {
-                       /* disallow waking the same knote twice */
-                       error = EALREADY;
-                       goto out_unlock;
-               }
-               if (kn->kn_hook) {
-                       thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
-               }
-               break;
-
-       default:
-               error = EINVAL;
-               goto out_unlock;
-       }
-
-       /*
-        * Save off any additional fflags/data we just accepted
-        * But only keep the last round of "update" bits we acted on which helps
-        * debugging a lot.
-        */
-       kn->kn_sfflags &= ~NOTE_WL_UPDATES_MASK;
-       kn->kn_sfflags |= kev->fflags;
-       kn->kn_sdata = kev->data;
-
-       kq_index_t qos_index = THREAD_QOS_UNSPECIFIED;
-
-       if (kev->fflags & NOTE_WL_UPDATE_QOS) {
-               qos_t qos = pthread_priority_canonicalize(kev->qos, FALSE);
-
-               if (kn->kn_qos != qos) {
-                       qos_index = qos_index_from_qos(kn, qos, FALSE);
-                       if (qos_index == THREAD_QOS_UNSPECIFIED) {
-                               error = ERANGE;
-                               goto out_unlock;
-                       }
-                       kqlock(kq);
-                       if (kn->kn_status & KN_QUEUED) {
-                               knote_dequeue(kn);
-                               knote_set_qos_index(kn, qos_index);
-                               knote_enqueue(kn);
-                               knote_wakeup(kn);
-                       } else {
-                               knote_set_qos_index(kn, qos_index);
-                       }
-                       kn->kn_qos = qos;
-                       kqunlock(kq);
-               }
-       }
-
-       error = filt_wlupdateowner(kqwl, kev, 0, qos_index);
+       error = filt_wlupdate(kqwl, kn, kev, qos_index, FILT_WLTOUCH);
+       filt_wlremember_last_update(kn, kev, error);
        if (error) {
-               goto out_unlock;
-       }
-
-       if (new_command == NOTE_WL_SYNC_WAIT) {
-               /* if the wake has already preposted, don't wait */
-               if ((kn->kn_sfflags & NOTE_WL_SYNC_WAKE) == 0)
-                       error = filt_wlwait(kqwl, kn, kev);
+               goto out;
        }
 
-out_unlock:
-       filt_wlremember_last_update(kqwl, kn, kev, error);
-       filt_wlunlock(kqwl);
 out:
        if (error) {
                if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
@@ -2633,144 +2376,46 @@ out:
                kev->data = error;
                return 0;
        }
+       int command = kev->fflags & NOTE_WL_COMMANDS_MASK;
+       if (command == NOTE_WL_SYNC_WAIT && !(kn->kn_sfflags & NOTE_WL_SYNC_WAKE)) {
+               return kevent_register_wait_prepare(kn, kev);
+       }
        /* Just touching the thread request successfully will fire it */
-       return new_command == NOTE_WL_THREAD_REQUEST;
+       if (command == NOTE_WL_THREAD_REQUEST) {
+               if (kev->fflags & NOTE_WL_UPDATE_QOS) {
+                       return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
+               }
+               return FILTER_ACTIVE;
+       }
+       return 0;
 }
 
-static int
-filt_wldrop_and_unlock(
-       struct knote *kn,
-       struct kevent_internal_s *kev)
+static bool
+filt_wlallow_drop(struct knote *kn, struct kevent_internal_s *kev)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       int error = 0, knoteuse_flags = KNUSE_NONE;
-
-       kqlock_held(kq);
+       struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
 
-       assert(kev->flags & EV_DELETE);
-       assert(kq->kq_state & KQ_WORKLOOP);
-
-       error = filt_wlvalidate_kev_flags(kn, kev);
+       int error = filt_wlvalidate_kev_flags(kn, kev, NULL);
        if (error) {
                goto out;
        }
 
-       if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
-               knoteuse_flags |= KNUSE_BOOST;
-       }
-
-       /* take a usecount to allow taking the filt_wllock */
-       if (!kqlock2knoteuse(kq, kn, knoteuse_flags)) {
-               /* knote is being dropped already */
-               error = EINPROGRESS;
-               goto out;
-       }
-
-       filt_wllock(kqwl);
-
-       /*
-        * Make sure user and kernel are in agreement on important state
-        *
-        * Userland will modify bits to cause this to fail for the touch / drop
-        * race case (when a drop for a thread request quiescing comes in late after
-        * the workloop has been woken up again).
-        */
-       error = filt_wldebounce(kqwl, kev, 0);
-
-       if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
-               /* knote is no longer alive */
-               error = EINPROGRESS;
-               goto out_unlock;
-       }
-
-       if (!error && (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) && kn->kn_inuse) {
-               /*
-                * There is a concurrent drop or touch happening, we can't resolve this,
-                * userland has to redrive.
-                *
-                * The race we're worried about here is the following:
-                *
-                *   f_touch               |  f_drop_and_unlock
-                * ------------------------+--------------------------------------------
-                *                         | kqlock()
-                *                         | kqlock2knoteuse()
-                *                         | filt_wllock()
-                *                         | debounces successfully
-                *  kqlock()               |
-                *  kqlock2knoteuse        |
-                *  filt_wllock() <BLOCKS> |
-                *                         | knoteuse2kqlock()
-                *                         | filt_wlunlock()
-                *                         | kqlock2knotedrop() <BLOCKS, WAKES f_touch>
-                *  debounces successfully |
-                *  filt_wlunlock()        |
-                *  caller WAKES f_drop    |
-                *                         | performs drop, but f_touch should have won
-                *
-                * So if the usecount is not 0 here, we need to wait for it to drop and
-                * redrive the whole logic (including looking up the knote again).
-                */
-               filt_wlunlock(kqwl);
-               knoteusewait(kq, kn);
-               return ERESTART;
-       }
-
-       /*
-        * If error is 0 this will set kqr_qos_index to THREAD_QOS_UNSPECIFIED
-        *
-        * If error is 0 or ESTALE this may drop ownership and cause a thread
-        * request redrive, however the kqlock is held which prevents f_process() to
-        * run until we did the drop for real.
-        */
-       error = filt_wlupdateowner(kqwl, kev, error, 0);
+       error = filt_wlupdate(kqwl, kn, kev, 0, FILT_WLDROP);
+       filt_wlremember_last_update(kn, kev, error);
        if (error) {
-               goto out_unlock;
-       }
-
-       if ((kn->kn_sfflags & (NOTE_WL_SYNC_WAIT | NOTE_WL_SYNC_WAKE)) ==
-                       NOTE_WL_SYNC_WAIT) {
-               /*
-                * When deleting a SYNC_WAIT knote that hasn't been woken up
-                * explicitly, issue a wake up.
-                */
-               kn->kn_sfflags |= NOTE_WL_SYNC_WAKE;
-               if (kn->kn_hook) {
-                       thread_wakeup_thread((event_t)kn, (thread_t)kn->kn_hook);
-               }
+               goto out;
        }
 
-out_unlock:
-       filt_wlremember_last_update(kqwl, kn, kev, error);
-       filt_wlunlock(kqwl);
-
 out:
-       if (error == 0) {
-               /* If nothing failed, do the regular knote drop. */
-               if (kqlock2knotedrop(kq, kn)) {
-                       knote_drop(kn, current_proc());
-               } else {
-                       error = EINPROGRESS;
+       if (error) {
+               if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
+                       return false;
                }
-       } else {
-               kqunlock(kq);
-       }
-       if (error == ESTALE && (kev->fflags & NOTE_WL_IGNORE_ESTALE)) {
-               error = 0;
-       }
-       if (error == EINPROGRESS) {
-               /*
-                * filt_wlprocess() makes sure that no event can be delivered for
-                * NOTE_WL_THREAD_REQUEST knotes once a drop is happening, and
-                * NOTE_WL_SYNC_* knotes are never fired.
-                *
-                * It means that EINPROGRESS is about a state that userland cannot
-                * observe for this filter (an event being delivered concurrently from
-                * a drop), so silence the error.
-                */
-               error = 0;
+               kev->flags |= EV_ERROR;
+               kev->data = error;
+               return false;
        }
-       return error;
+       return true;
 }
 
 static int
@@ -2779,66 +2424,87 @@ filt_wlprocess(
        __unused struct filt_process_s *data,
        struct kevent_internal_s *kev)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       struct kqrequest *kqr = &kqwl->kqwl_request;
+       struct kqworkloop *kqwl = (struct kqworkloop *)knote_get_kq(kn);
        int rc = 0;
 
-       assert(kq->kq_state & KQ_WORKLOOP);
-
-       /* only thread requests should get here */
        assert(kn->kn_sfflags & NOTE_WL_THREAD_REQUEST);
-       if (kn->kn_sfflags & NOTE_WL_THREAD_REQUEST) {
-               filt_wllock(kqwl);
-               assert(kqr->kqr_qos_index != THREAD_QOS_UNSPECIFIED);
-               if (kqwl->kqwl_owner) {
+
+       filt_wllock(kqwl);
+
+       if (kqwl->kqwl_owner) {
+               /*
+                * <rdar://problem/33584321> userspace can sometimes cause a process
+                * of the thread request knote even though the delivered events did
+                * not trigger a drain session.
+                *
+                * When that happens, the automatic deactivation done as part of
+                * processing would swallow the event, so we have to activate the
+                * knote again.
+                */
+               kqlock(kqwl);
+               knote_activate(kn);
+               kqunlock(kqwl);
+       } else {
+#if DEBUG || DEVELOPMENT
+               if (kevent_debug_flags() & KEVENT_PANIC_ON_NON_ENQUEUED_PROCESS) {
                        /*
-                        * <rdar://problem/33584321> userspace sometimes due to events being
-                        * delivered but not triggering a drain session can cause a process
-                        * of the thread request knote.
-                        *
-                        * When that happens, the automatic deactivation due to process
-                        * would swallow the event, so we have to activate the knote again.
+                        * see src/queue_internal.h in libdispatch
                         */
-                       kqlock(kq);
-                       knote_activate(kn);
-                       kqunlock(kq);
-               } else if (kqr->kqr_qos_index) {
-#if DEBUG || DEVELOPMENT
+#define DISPATCH_QUEUE_ENQUEUED 0x1ull
                        user_addr_t addr = CAST_USER_ADDR_T(kn->kn_ext[EV_EXTIDX_WL_ADDR]);
                        task_t t = current_task();
                        uint64_t val;
                        if (addr && task_is_active(t) && !task_is_halting(t) &&
                                        copyin_word(addr, &val, sizeof(val)) == 0 &&
                                        val && (val & DISPATCH_QUEUE_ENQUEUED) == 0 &&
-                                       (val >> 48) != 0 && (val >> 48) != 0xffff) {
+                                       (val >> 48) != 0xdead && (val >> 48) != 0 && (val >> 48) != 0xffff) {
                                panic("kevent: workloop %#016llx is not enqueued "
                                                "(kn:%p dq_state:%#016llx kev.dq_state:%#016llx)",
-                                               kn->kn_udata, kn, val,
-                                               kn->kn_ext[EV_EXTIDX_WL_VALUE]);
+                                               kn->kn_udata, kn, val, kn->kn_ext[EV_EXTIDX_WL_VALUE]);
                        }
-#endif
-                       *kev = kn->kn_kevent;
-                       kev->fflags = kn->kn_sfflags;
-                       kev->data = kn->kn_sdata;
-                       kev->qos = kn->kn_qos;
-                       rc = 1;
                }
-               filt_wlunlock(kqwl);
+#endif
+               *kev = kn->kn_kevent;
+               kev->fflags = kn->kn_sfflags;
+               kev->data = kn->kn_sdata;
+               kev->qos = kn->kn_qos;
+               rc |= FILTER_ACTIVE;
+       }
+
+       filt_wlunlock(kqwl);
+
+       if (rc & FILTER_ACTIVE) {
+               workq_thread_set_max_qos(kqwl->kqwl_p, &kqwl->kqwl_request);
        }
        return rc;
 }
 
+SECURITY_READ_ONLY_EARLY(static struct filterops) workloop_filtops = {
+       .f_extended_codes = true,
+       .f_attach  = filt_wlattach,
+       .f_detach  = filt_wldetach,
+       .f_event   = filt_badevent,
+       .f_touch   = filt_wltouch,
+       .f_process = filt_wlprocess,
+       .f_allow_drop = filt_wlallow_drop,
+       .f_post_register_wait = filt_wlpost_register_wait,
+};
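
workloop_filtops sets f_extended_codes, so its callbacks return a bitmask of FILTER_* codes (FILTER_ACTIVE, FILTER_UPDATE_REQ_QOS, FILTER_REGISTER_WAIT) rather than the historical 0/1. A user-space sketch of that calling convention follows; the struct layout and bit values are illustrative stand-ins, not the kernel definitions.

#include <stdbool.h>
#include <stdio.h>

#define FILTER_ACTIVE          0x1u   /* placeholder bit values */
#define FILTER_REGISTER_WAIT   0x2u
#define FILTER_UPDATE_REQ_QOS  0x4u

struct toy_knote {
	bool fired;
};

struct toy_filterops {
	bool f_extended_codes;                   /* returns are bitmasks, not 0/1 */
	unsigned (*f_touch)(struct toy_knote *);
};

static unsigned
toy_touch(struct toy_knote *kn)
{
	(void)kn;
	/* a successful touch both activates the knote and asks for a QoS update */
	return FILTER_ACTIVE | FILTER_UPDATE_REQ_QOS;
}

static const struct toy_filterops toy_ops = {
	.f_extended_codes = true,
	.f_touch = toy_touch,
};

int
main(void)
{
	struct toy_knote kn = { false };
	unsigned rc = toy_ops.f_touch(&kn);

	if (rc & FILTER_ACTIVE)
		kn.fired = true;                 /* caller delivers/queues the event */
	if (rc & FILTER_UPDATE_REQ_QOS)
		printf("re-evaluate the thread request QoS\n");
	return kn.fired ? 0 : 1;
}
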
+
 #pragma mark kevent / knotes
 
 /*
  * JMM - placeholder for not-yet-implemented filters
  */
+static int
+filt_badevent(struct knote *kn, long hint)
+{
+       panic("%s[%d](%p, %ld)", __func__, kn->kn_filter, kn, hint);
+       return 0;
+}
+
 static int
 filt_badattach(__unused struct knote *kn, __unused struct kevent_internal_s *kev)
 {
-       kn->kn_flags |= EV_ERROR;
-       kn->kn_data = ENOTSUP;
+       knote_set_error(kn, ENOTSUP);
        return 0;
 }
 
@@ -2849,7 +2515,6 @@ kqueue_alloc(struct proc *p, unsigned int flags)
        struct kqueue *kq = NULL;
        int policy;
        void *hook = NULL;
-       uint64_t kq_addr_offset;
 
        if (flags & KEVENT_FLAG_WORKQ) {
                struct kqworkq *kqwq;
@@ -2865,16 +2530,29 @@ kqueue_alloc(struct proc *p, unsigned int flags)
                kqwq->kqwq_state = KQ_WORKQ;
 
                for (i = 0; i < KQWQ_NBUCKETS; i++) {
-                       TAILQ_INIT(&kq->kq_queue[i]);
+                       TAILQ_INIT(&kqwq->kqwq_queue[i]);
                }
-               for (i = 0; i < KQWQ_NQOS; i++) {
+               for (i = 0; i < KQWQ_NBUCKETS; i++) {
+                       if (i != KQWQ_QOS_MANAGER) {
+                               /*
+                                * Because of how the bucketized system works, we mix overcommit
+                                * sources with non-overcommit ones: each time we move a knote from
+                                * one bucket to the next due to overrides, we would have to track
+                                * overcommitness, and it's really not worth it in a
+                                * workloop-enabled world to track this faithfully.
+                                *
+                                * Incidentally, this behaves like the original manager-based
+                                * kqwq where event delivery always happened (hence is
+                                * "overcommit")
+                                */
+                               kqwq->kqwq_request[i].kqr_state |= KQR_THOVERCOMMIT;
+                       }
                        kqwq->kqwq_request[i].kqr_qos_index = i;
+                       TAILQ_INIT(&kqwq->kqwq_request[i].kqr_suppressed);
                }
 
-               lck_spin_init(&kqwq->kqwq_reqlock, kq_lck_grp, kq_lck_attr);
                policy = SYNC_POLICY_FIFO;
                hook = (void *)kqwq;
-               
        } else if (flags & KEVENT_FLAG_WORKLOOP) {
                struct kqworkloop *kqwl;
                int i;
@@ -2887,41 +2565,36 @@ kqueue_alloc(struct proc *p, unsigned int flags)
 
                kqwl->kqwl_state = KQ_WORKLOOP | KQ_DYNAMIC;
                kqwl->kqwl_retains = 1; /* donate a retain to creator */
+               kqwl->kqwl_request.kqr_state = KQR_WORKLOOP;
 
                kq = &kqwl->kqwl_kqueue;
                for (i = 0; i < KQWL_NBUCKETS; i++) {
-                       TAILQ_INIT(&kq->kq_queue[i]);
+                       TAILQ_INIT(&kqwl->kqwl_queue[i]);
                }
                TAILQ_INIT(&kqwl->kqwl_request.kqr_suppressed);
 
-               lck_spin_init(&kqwl->kqwl_reqlock, kq_lck_grp, kq_lck_attr);
                lck_mtx_init(&kqwl->kqwl_statelock, kq_lck_grp, kq_lck_attr);
 
                policy = SYNC_POLICY_FIFO;
-               if (flags & KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD) {
-                       policy |= SYNC_POLICY_PREPOST;
-                       kq->kq_state |= KQ_NO_WQ_THREAD;
-               } else {
-                       hook = (void *)kqwl;
-               }
-               
+               hook = (void *)kqwl;
        } else {
                struct kqfile *kqf;
-               
+
                kqf = (struct kqfile *)zalloc(kqfile_zone);
                if (kqf == NULL)
                        return NULL;
 
                kq = &kqf->kqf_kqueue;
                bzero(kqf, sizeof (struct kqfile));
-               TAILQ_INIT(&kq->kq_queue[0]);
+               TAILQ_INIT(&kqf->kqf_queue);
                TAILQ_INIT(&kqf->kqf_suppressed);
-               
+
                policy = SYNC_POLICY_FIFO | SYNC_POLICY_PREPOST;
        }
 
        waitq_set_init(&kq->kq_wqs, policy, NULL, hook);
        lck_spin_init(&kq->kq_lock, kq_lck_grp, kq_lck_attr);
+       lck_spin_init(&kq->kq_reqlock, kq_lck_grp, kq_lck_attr);
        kq->kq_p = p;
 
        if (fdp->fd_knlistsize < 0) {
@@ -2931,19 +2604,16 @@ kqueue_alloc(struct proc *p, unsigned int flags)
                proc_fdunlock(p);
        }
 
-       kq_addr_offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
-       /* Assert that the address can be pointer compacted for use with knote */
-       assert(kq_addr_offset < (uint64_t)(1ull << KNOTE_KQ_BITSIZE));
        return (kq);
 }
 
 /*
  * knotes_dealloc - detach all knotes for the process and drop them
  *
- *             Called with proc_fdlock held.
- *             Returns with it locked.
- *             May drop it temporarily.
- *             Process is in such a state that it will not try to allocate
+ *             Called with proc_fdlock held.
+ *             Returns with it locked.
+ *             May drop it temporarily.
+ *             Process is in such a state that it will not try to allocate
  *             any more knotes during this process (stopped for exit or exec).
  */
 void
@@ -2962,10 +2632,7 @@ knotes_dealloc(proc_t p)
                                kq = knote_get_kq(kn);
                                kqlock(kq);
                                proc_fdunlock(p);
-                               /* drop it ourselves or wait */
-                               if (kqlock2knotedrop(kq, kn)) {
-                                       knote_drop(kn, p);
-                               }
+                               knote_drop(kq, kn, NULL);
                                proc_fdlock(p);
                        }
                }
@@ -2985,10 +2652,7 @@ knotes_dealloc(proc_t p)
                                kq = knote_get_kq(kn);
                                kqlock(kq);
                                knhash_unlock(p);
-                               /* drop it ourselves or wait */
-                               if (kqlock2knotedrop(kq, kn)) {
-                                       knote_drop(kn, p);
-                               }
+                               knote_drop(kq, kn, NULL);
                                knhash_lock(p);
                        }
                }
@@ -3006,11 +2670,43 @@ knotes_dealloc(proc_t p)
        proc_fdlock(p);
 }
 
+/*
+ * kqworkloop_invalidate
+ *
+ * Invalidate ownership of a workloop.
+ *
+ * This is meant to be used so that any remnant of overrides and ownership
+ * information is dropped before the kqworkloop is removed from the global
+ * hash table, so that no ghost workloop ownership is left behind.
+ *
+ * Possibly returns a thread to deallocate in a safe context.
+ */
+static thread_t
+kqworkloop_invalidate(struct kqworkloop *kqwl)
+{
+       thread_t cur_owner = kqwl->kqwl_owner;
+
+       assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
+       if (cur_owner) {
+               /*
+                * If the kqueue had an owner that prevented the thread request to
+                * go through, then no unbind happened, and we may have lingering
+                * overrides to drop.
+                */
+               if (kqworkloop_owner_override(kqwl) != THREAD_QOS_UNSPECIFIED) {
+                       thread_drop_ipc_override(cur_owner);
+               }
+               thread_ends_owning_workloop(cur_owner);
+               kqwl->kqwl_owner = THREAD_NULL;
+       }
+
+       return cur_owner;
+}
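
kqworkloop_invalidate hands its caller a thread reference to drop later because the release may not be safe under the locks held at the call sites (kqueue_dealloc and kqueue_release_last both call thread_deallocate only after unlocking). A plain pthread sketch of that steal-under-lock, release-later pattern, using toy types rather than kernel ones:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct owner {
	int refs;
};

static struct owner *
owner_clear_locked(pthread_mutex_t *lock, struct owner **slot)
{
	struct owner *cur;

	pthread_mutex_lock(lock);
	cur = *slot;          /* steal the reference while holding the lock */
	*slot = NULL;
	pthread_mutex_unlock(lock);
	return cur;           /* caller releases it in a safe context */
}

static void
owner_release(struct owner *o)
{
	if (o && --o->refs == 0)
		free(o);
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct owner *slot = calloc(1, sizeof(*slot));

	slot->refs = 1;
	struct owner *cur = owner_clear_locked(&lock, &slot);
	owner_release(cur);                     /* deferred, outside the lock */
	printf("owner cleared: %s\n", slot == NULL ? "yes" : "no");
	return 0;
}
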
 
 /*
  * kqueue_dealloc - detach all knotes from a kqueue and free it
  *
- *     We walk each list looking for knotes referencing this
+ *     We walk each list looking for knotes referencing
  *     this kqueue.  If we find one, we try to drop it.  But
  *     if we fail to get a drop reference, that will wait
  *     until it is dropped.  So, we can just restart again
@@ -3039,7 +2735,13 @@ kqueue_dealloc(struct kqueue *kq)
        p = kq->kq_p;
        fdp = p->p_fd;
 
+       /*
+        * Workloops are refcounted by their knotes, so there's no point
+        * spending a lot of time under these locks just to deallocate one.
+        */
        if ((kq->kq_state & KQ_WORKLOOP) == 0) {
+               KNOTE_LOCK_CTX(knlc);
+
                proc_fdlock(p);
                for (i = 0; i < fdp->fd_knlistsize; i++) {
                        kn = SLIST_FIRST(&fdp->fd_knlist[i]);
@@ -3047,9 +2749,8 @@ kqueue_dealloc(struct kqueue *kq)
                                if (kq == knote_get_kq(kn)) {
                                        kqlock(kq);
                                        proc_fdunlock(p);
-                                       /* drop it ourselves or wait */
-                                       if (kqlock2knotedrop(kq, kn)) {
-                                               knote_drop(kn, p);
+                                       if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                                               knote_drop(kq, kn, &knlc);
                                        }
                                        proc_fdlock(p);
                                        /* start over at beginning of list */
@@ -3059,6 +2760,7 @@ kqueue_dealloc(struct kqueue *kq)
                                kn = SLIST_NEXT(kn, kn_link);
                        }
                }
+
                knhash_lock(p);
                proc_fdunlock(p);
 
@@ -3069,9 +2771,8 @@ kqueue_dealloc(struct kqueue *kq)
                                        if (kq == knote_get_kq(kn)) {
                                                kqlock(kq);
                                                knhash_unlock(p);
-                                               /* drop it ourselves or wait */
-                                               if (kqlock2knotedrop(kq, kn)) {
-                                                       knote_drop(kn, p);
+                                               if (knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                                                       knote_drop(kq, kn, &knlc);
                                                }
                                                knhash_lock(p);
                                                /* start over at beginning of list */
@@ -3087,28 +2788,17 @@ kqueue_dealloc(struct kqueue *kq)
 
        if (kq->kq_state & KQ_WORKLOOP) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               struct kqrequest *kqr = &kqwl->kqwl_request;
-               thread_t cur_owner = kqwl->kqwl_owner;
+               thread_t cur_owner = kqworkloop_invalidate(kqwl);
 
-               assert(TAILQ_EMPTY(&kqwl->kqwl_request.kqr_suppressed));
-               if (filt_wlowner_is_valid(cur_owner)) {
-                       /*
-                        * If the kqueue had an owner that prevented the thread request to
-                        * go through, then no unbind happened, and we may have lingering
-                        * overrides to drop.
-                        */
-                       if (kqr->kqr_dsync_owner_qos != THREAD_QOS_UNSPECIFIED) {
-                               thread_drop_ipc_override(cur_owner);
-                               kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
-                       }
+               if (cur_owner) thread_deallocate(cur_owner);
 
-                       if (kqr->kqr_owner_override_is_sync) {
-                               thread_drop_sync_ipc_override(cur_owner);
-                               kqr->kqr_owner_override_is_sync = 0;
-                       }
-                       thread_ends_owning_workloop(cur_owner);
-                       thread_deallocate(cur_owner);
-                       kqwl->kqwl_owner = THREAD_NULL;
+               if (kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
+                       struct turnstile *ts;
+                       turnstile_complete((uintptr_t)kqwl, &kqwl->kqwl_turnstile, &ts);
+                       turnstile_cleanup();
+                       turnstile_deallocate(ts);
+               } else {
+                       assert(kqwl->kqwl_turnstile == NULL);
                }
        }
 
@@ -3118,23 +2808,18 @@ kqueue_dealloc(struct kqueue *kq)
         */
        waitq_set_deinit(&kq->kq_wqs);
        lck_spin_destroy(&kq->kq_lock, kq_lck_grp);
+       lck_spin_destroy(&kq->kq_reqlock, kq_lck_grp);
 
        if (kq->kq_state & KQ_WORKQ) {
-               struct kqworkq *kqwq = (struct kqworkq *)kq;
-
-               lck_spin_destroy(&kqwq->kqwq_reqlock, kq_lck_grp);
-               zfree(kqworkq_zone, kqwq);
+               zfree(kqworkq_zone, (struct kqworkq *)kq);
        } else if (kq->kq_state & KQ_WORKLOOP) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
                assert(kqwl->kqwl_retains == 0);
-               lck_spin_destroy(&kqwl->kqwl_reqlock, kq_lck_grp);
                lck_mtx_destroy(&kqwl->kqwl_statelock, kq_lck_grp);
                zfree(kqworkloop_zone, kqwl);
        } else {
-               struct kqfile *kqf = (struct kqfile *)kq;
-
-               zfree(kqfile_zone, kqf);
+               zfree(kqfile_zone, (struct kqfile *)kq);
        }
 }
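
The loops above drop the fd/hash lock around each knote_drop and then restart the scan from the head of the list, since the list may have changed while the lock was not held. A compact user-space mimic of that idiom using the BSD <sys/queue.h> SLIST macros and a pthread mutex (toy types, not kernel code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/queue.h>

struct node {
	SLIST_ENTRY(node) link;
	int needs_drop;
};

SLIST_HEAD(nodelist, node);

static void
drop_all(struct nodelist *list, pthread_mutex_t *lock)
{
	struct node *n;

	pthread_mutex_lock(lock);
restart:
	SLIST_FOREACH(n, list, link) {
		if (n->needs_drop) {
			SLIST_REMOVE(list, n, node, link);
			pthread_mutex_unlock(lock);   /* the drop itself may block */
			free(n);
			pthread_mutex_lock(lock);
			goto restart;                 /* list may have changed */
		}
	}
	pthread_mutex_unlock(lock);
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct nodelist list = SLIST_HEAD_INITIALIZER(list);

	for (int i = 0; i < 3; i++) {
		struct node *n = calloc(1, sizeof(*n));
		n->needs_drop = 1;
		SLIST_INSERT_HEAD(&list, n, link);
	}
	drop_all(&list, &lock);
	printf("empty: %s\n", SLIST_EMPTY(&list) ? "yes" : "no");
	return 0;
}
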
 
@@ -3159,18 +2844,16 @@ kqueue_retain(struct kqueue *kq)
 #define KQUEUE_MIGHT_BE_LAST_REF 1
 
 static inline int
-kqueue_release(struct kqueue *kq, __assert_only int possibly_last)
+kqueue_release(kqueue_t kqu, __assert_only int possibly_last)
 {
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-
-       if ((kq->kq_state & KQ_DYNAMIC) == 0) {
+       if ((kqu.kq->kq_state & KQ_DYNAMIC) == 0) {
                return 0;
        }
 
-       assert(kq->kq_state & KQ_WORKLOOP); /* for now */
-       uint32_t refs = OSDecrementAtomic(&kqwl->kqwl_retains);
+       assert(kqu.kq->kq_state & KQ_WORKLOOP); /* for now */
+       uint32_t refs = OSDecrementAtomic(&kqu.kqwl->kqwl_retains);
        if (__improbable(refs == 0)) {
-               panic("kq(%p) over-release", kq);
+               panic("kq(%p) over-release", kqu.kq);
        }
        if (refs == 1) {
                assert(possibly_last);
@@ -3219,7 +2902,7 @@ kqueue(struct proc *p, __unused struct kqueue_args *uap, int32_t *retval)
 
 static int
 kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p,
-    unsigned int flags)
+               unsigned int flags)
 {
        int advance;
        int error;
@@ -3271,7 +2954,7 @@ kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p
                kevp->data = kev64.data;
                kevp->ext[0] = kev64.ext[0];
                kevp->ext[1] = kev64.ext[1];
-               
+
        } else {
                struct kevent_qos_s kevqos;
 
@@ -3301,13 +2984,13 @@ kevent_copyin(user_addr_t *addrp, struct kevent_internal_s *kevp, struct proc *p
 
 static int
 kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *p,
-    unsigned int flags)
+               unsigned int flags)
 {
        user_addr_t addr = *addrp;
        int advance;
        int error;
 
-       /* 
+       /*
         * fully initialize the different output event structure
         * types from the internal kevent (and some universal
         * defaults for fields not represented in the internal
@@ -3321,7 +3004,7 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *
 
                        advance = sizeof (kev64);
                        bzero(&kev64, advance);
-                       
+
                        /*
                         * deal with the special case of a user-supplied
                         * value of (uintptr_t)-1.
@@ -3367,7 +3050,7 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *
                error = copyout((caddr_t)&kev64, addr, advance);
        } else {
                struct kevent_qos_s kevqos;
-          
+
                advance = sizeof (struct kevent_qos_s);
                if (flags & KEVENT_FLAG_STACK_EVENTS) {
                        addr -= advance;
@@ -3397,10 +3080,11 @@ kevent_copyout(struct kevent_internal_s *kevp, user_addr_t *addrp, struct proc *
 }
 
 static int
-kevent_get_data_size(struct proc *p, 
-                     uint64_t data_available,
-                     unsigned int flags,
-                     user_size_t *residp)
+kevent_get_data_size(
+               struct proc *p,
+               uint64_t data_available,
+               unsigned int flags,
+               user_size_t *residp)
 {
        user_size_t resid;
        int error = 0;
@@ -3427,10 +3111,11 @@ kevent_get_data_size(struct proc *p,
 }
 
 static int
-kevent_put_data_size(struct proc *p, 
-                     uint64_t data_available,
-                     unsigned int flags,
-                     user_size_t resid)
+kevent_put_data_size(
+               struct proc *p,
+               uint64_t data_available,
+               unsigned int flags,
+               user_size_t resid)
 {
        int error = 0;
 
@@ -3453,7 +3138,6 @@ kevent_put_data_size(struct proc *p,
  *
  *     assume we inherit a use count on the kq fileglob.
  */
-
 __attribute__((noreturn))
 static void
 kevent_continue(__unused struct kqueue *kq, void *data, int error)
@@ -3553,13 +3237,13 @@ kevent_qos(struct proc *p, struct kevent_qos_args *uap, int32_t *retval)
                               retval);
 }
 
-int 
-kevent_qos_internal(struct proc *p, int fd, 
+int
+kevent_qos_internal(struct proc *p, int fd,
                    user_addr_t changelist, int nchanges,
                    user_addr_t eventlist, int nevents,
                    user_addr_t data_out, user_size_t *data_available,
-                   unsigned int flags, 
-                   int32_t *retval) 
+                   unsigned int flags,
+                   int32_t *retval)
 {
        return kevent_internal(p,
                               (kqueue_id_t)fd, NULL,
@@ -3594,8 +3278,8 @@ kevent_id_internal(struct proc *p, kqueue_id_t *id,
                    user_addr_t changelist, int nchanges,
                    user_addr_t eventlist, int nevents,
                    user_addr_t data_out, user_size_t *data_available,
-                   unsigned int flags, 
-                   int32_t *retval) 
+                   unsigned int flags,
+                   int32_t *retval)
 {
        return kevent_internal(p,
                               *id, id,
@@ -3607,7 +3291,7 @@ kevent_id_internal(struct proc *p, kqueue_id_t *id,
                               NULL,
                               retval);
 }
+
 static int
 kevent_get_timeout(struct proc *p,
                   user_addr_t utimeout,
@@ -3805,13 +3489,16 @@ kqueue_hash_lookup(struct proc *p, kqueue_id_t id)
 }
 
 static inline void
-kqueue_release_last(struct proc *p, struct kqueue *kq)
+kqueue_release_last(struct proc *p, kqueue_t kqu)
 {
+       struct kqueue *kq = kqu.kq;
        if (kq->kq_state & KQ_DYNAMIC) {
                kqhash_lock(p);
                if (kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF)) {
+                       thread_t cur_owner = kqworkloop_invalidate(kqu.kqwl);
                        kqueue_hash_remove(p, kq);
                        kqhash_unlock(p);
+                       if (cur_owner) thread_deallocate(cur_owner);
                        kqueue_dealloc(kq);
                } else {
                        kqhash_unlock(p);
@@ -3819,93 +3506,118 @@ kqueue_release_last(struct proc *p, struct kqueue *kq)
        }
 }
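
kqueue_release and kqueue_release_last now take a kqueue_t argument and read it through kqu.kq or kqu.kqwl. The sketch below shows that union-of-pointer-views idea in isolation; the toy structs stand in for the real kqueue types, and the actual union is declared in the XNU event headers.

#include <stdio.h>

struct base    { int state; };
struct derived { struct base b; int extra; };   /* base must come first */

typedef union {
	struct base    *kq;
	struct derived *kqwl;
} any_t;   /* one parameter type, no casts at the call sites */

static int
get_state(any_t any)
{
	return any.kq->state;   /* shared fields go through the base view */
}

int
main(void)
{
	struct derived d = { .b = { .state = 7 }, .extra = 1 };
	any_t any = { .kqwl = &d };

	printf("%d %d\n", get_state(any), any.kqwl->extra);
	return 0;
}
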
 
-static struct kqueue *
-kevent_get_bound_kq(__assert_only struct proc *p, thread_t thread,
-                    unsigned int kev_flags, unsigned int kq_flags)
+/*
+ * kqworkloops_dealloc - rebalance retains on kqworkloops created with
+ * scheduling parameters
+ *
+ *             Called with proc_fdlock held.
+ *             Returns with it locked.
+ *             Process is in such a state that it will not try to allocate
+ *             any more knotes during this process (stopped for exit or exec).
+ */
+void
+kqworkloops_dealloc(proc_t p)
 {
-       struct kqueue *kq;
-       struct uthread *ut = get_bsdthread_info(thread);
+       struct filedesc *fdp = p->p_fd;
+       struct kqlist *list;
+       struct kqworkloop *kqwl, *kqwln;
+       struct kqlist tofree;
+       int i;
+
+       if (!(fdp->fd_flags & FD_WORKLOOP)) {
+               return;
+       }
+
+       SLIST_INIT(&tofree);
+
+       kqhash_lock(p);
+       assert(fdp->fd_kqhashmask != 0);
 
-       assert(p == get_bsdthreadtask_info(thread));
+       for (i = 0; i <= (int)fdp->fd_kqhashmask; i++) {
+               list = &fdp->fd_kqhash[i];
+               SLIST_FOREACH_SAFE(kqwl, list, kqwl_hashlink, kqwln) {
+                       /*
+                        * kqworkloops that have scheduling parameters have an
+                        * implicit retain from kqueue_workloop_ctl that needs
+                        * to be balanced on process exit.
+                        */
+                       assert(kqwl->kqwl_params);
+                       SLIST_REMOVE(list, kqwl, kqworkloop, kqwl_hashlink);
+                       SLIST_INSERT_HEAD(&tofree, kqwl, kqwl_hashlink);
+               }
+       }
 
-       if (!(ut->uu_kqueue_flags & kev_flags))
-               return NULL;
+       kqhash_unlock(p);
 
-       kq = ut->uu_kqueue_bound;
-       if (!kq)
-               return NULL;
+       SLIST_FOREACH_SAFE(kqwl, &tofree, kqwl_hashlink, kqwln) {
+               struct kqueue *kq = (struct kqueue *)kqwl;
+               __assert_only bool released;
+               released = kqueue_release(kq, KQUEUE_MIGHT_BE_LAST_REF);
+               assert(released);
+               kqueue_dealloc(kq);
+       }
+}
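
Unlike the restart-scan idiom used for knote teardown, kqworkloops_dealloc detaches every victim onto a private tofree list in a single pass under the hash lock and only then, unlocked, does the releases. A minimal user-space sketch of that collect-then-free pattern (toy types, BSD <sys/queue.h>):

#include <pthread.h>
#include <stdlib.h>
#include <sys/queue.h>

struct wl {
	SLIST_ENTRY(wl) hashlink;
};

SLIST_HEAD(wllist, wl);

static void
dealloc_all(struct wllist *hash_bucket, pthread_mutex_t *hash_lock)
{
	struct wllist tofree = SLIST_HEAD_INITIALIZER(tofree);
	struct wl *w, *wn;

	pthread_mutex_lock(hash_lock);
	SLIST_FOREACH_SAFE(w, hash_bucket, hashlink, wn) {
		SLIST_REMOVE(hash_bucket, w, wl, hashlink);
		SLIST_INSERT_HEAD(&tofree, w, hashlink);
	}
	pthread_mutex_unlock(hash_lock);

	/* the expensive teardown runs unlocked, one pass, no restarts needed */
	SLIST_FOREACH_SAFE(w, &tofree, hashlink, wn) {
		free(w);
	}
}

int
main(void)
{
	pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	struct wllist bucket = SLIST_HEAD_INITIALIZER(bucket);

	for (int i = 0; i < 4; i++) {
		struct wl *w = calloc(1, sizeof(*w));
		SLIST_INSERT_HEAD(&bucket, w, hashlink);
	}
	dealloc_all(&bucket, &lock);
	return 0;
}
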
 
-       if (!(kq->kq_state & kq_flags))
-               return NULL;
+static struct kqueue *
+kevent_get_bound_kqworkloop(thread_t thread)
+{
+       struct uthread *ut = get_bsdthread_info(thread);
+       struct kqrequest *kqr = ut->uu_kqr_bound;
 
-       return kq;
+       return kqr ? (struct kqueue *)kqr_kqworkloop(kqr) : NULL;
 }
 
 static int
-kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct fileproc **fpp, int *fdp, struct kqueue **kqp)
+kevent_get_kq(struct proc *p, kqueue_id_t id, workq_threadreq_param_t *trp,
+               unsigned int flags, struct fileproc **fpp, int *fdp,
+               struct kqueue **kqp)
 {
        struct filedesc *descp = p->p_fd;
        struct fileproc *fp = NULL;
-       struct kqueue *kq;
+       struct kqueue *kq = NULL;
        int fd = 0;
        int error = 0;
+       thread_t th = current_thread();
+
+       assert(!trp || (flags & KEVENT_FLAG_WORKLOOP));
 
        /* Was the workloop flag passed?  Then it is for sure only a workloop */
        if (flags & KEVENT_FLAG_DYNAMIC_KQUEUE) {
                assert(flags & KEVENT_FLAG_WORKLOOP);
+               assert(!trp || (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST));
+               kq = kevent_get_bound_kqworkloop(th);
+
+               /*
+                * When kevent_id_internal is called from within the
+                * kernel and the passed 'id' value is '-1', we look
+                * for the currently bound workloop kq.
+                */
                if (id == (kqueue_id_t)-1 &&
                    (flags & KEVENT_FLAG_KERNEL) &&
                    (flags & KEVENT_FLAG_WORKLOOP)) {
 
-                       assert(is_workqueue_thread(current_thread()));
-
-                       /*
-                        * when kevent_id_internal is called from within the
-                        * kernel, and the passed 'id' value is '-1' then we
-                        * look for the currently bound workloop kq.
-                        *
-                        * Until pthread kext avoids calling in to kevent_id_internal
-                        * for threads whose fulfill is canceled, calling in unbound
-                        * can't be fatal.
-                        */
-                       kq = kevent_get_bound_kq(p, current_thread(),
-                                                KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
-                       if (kq) {
-                               kqueue_retain(kq);
-                       } else {
-                               struct uthread *ut = get_bsdthread_info(current_thread());
-
-                               /* If thread is unbound due to cancel, just return an error */
-                               if (ut->uu_kqueue_flags == KEVENT_FLAG_WORKLOOP_CANCELED) {
-                                       ut->uu_kqueue_flags = 0;
-                                       error = ECANCELED;
-                               } else {
-                                       panic("Unbound thread called kevent_internal with id=-1"
-                                             " uu_kqueue_flags:0x%x, uu_kqueue_bound:%p",
-                                             ut->uu_kqueue_flags, ut->uu_kqueue_bound);
-                               }
+                       if (!is_workqueue_thread(th) || !kq) {
+                               return EINVAL;
                        }
 
-                       *fpp = NULL;
-                       *fdp = 0;
-                       *kqp = kq;
-                       return error;
+                       kqueue_retain(kq);
+                       goto out;
+               }
+
+               if (id == 0 || id == (kqueue_id_t)-1) {
+                       return EINVAL;
                }
 
                /* try shortcut on kq lookup for bound threads */
-               kq = kevent_get_bound_kq(p, current_thread(), KEVENT_FLAG_WORKLOOP, KQ_WORKLOOP);
                if (kq != NULL && ((struct kqworkloop *)kq)->kqwl_dynamicid == id) {
 
                        if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
-                               error = EEXIST;
-                               kq = NULL;
-                               goto out;
+                               return EEXIST;
                        }
 
                        /* retain a reference while working with this kq. */
                        assert(kq->kq_state & KQ_DYNAMIC);
                        kqueue_retain(kq);
-                       error = 0;
                        goto out;
                }
 
@@ -3916,39 +3628,45 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro
                        kqhash_unlock(p);
 
                        if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST) {
-                               error = ENOENT;
-                               goto out;
+                               return ENOENT;
                        }
 
                        struct kqueue *alloc_kq;
                        alloc_kq = kqueue_alloc(p, flags);
-                       if (alloc_kq) {
-                               kqhash_lock(p);
-                               kqueue_hash_init_if_needed(p);
-                               kq = kqueue_hash_lookup(p, id);
-                               if (kq == NULL) {
-                                       /* insert our new one */
-                                       kq = alloc_kq;
-                                       kqueue_hash_insert(p, id, kq);
-                                       kqhash_unlock(p);
-                               } else {
-                                       /* lost race, retain existing workloop */
-                                       kqueue_retain(kq);
-                                       kqhash_unlock(p);
-                                       kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
-                                       kqueue_dealloc(alloc_kq);
-                               }
-                       } else {
-                               error = ENOMEM;
-                               goto out;
+                       if (!alloc_kq) {
+                               return ENOMEM;
+                       }
+
+                       kqhash_lock(p);
+                       kqueue_hash_init_if_needed(p);
+                       kq = kqueue_hash_lookup(p, id);
+                       if (kq == NULL) {
+                               /* insert our new one */
+                               kq = alloc_kq;
+                               if (trp) {
+                                       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+                                       kqwl->kqwl_params = trp->trp_value;
+                               }
+                               kqueue_hash_insert(p, id, kq);
+                               kqhash_unlock(p);
+                       } else if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
+                               /* lost race and caller wants an error */
+                               kqhash_unlock(p);
+                               kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
+                               kqueue_dealloc(alloc_kq);
+                               return EEXIST;
+                       } else {
+                               /* lost race, retain existing workloop */
+                               kqueue_retain(kq);
+                               kqhash_unlock(p);
+                               kqueue_release(alloc_kq, KQUEUE_MIGHT_BE_LAST_REF);
+                               kqueue_dealloc(alloc_kq);
                        }
                } else {
 
                        if (flags & KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST) {
                                kqhash_unlock(p);
-                               kq = NULL;
-                               error =  EEXIST;
-                               goto out;
+                               return  EEXIST;
                        }
 
                        /* retain a reference while working with this kq. */
@@ -3956,7 +3674,7 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro
                        kqueue_retain(kq);
                        kqhash_unlock(p);
                }
-               
+
        } else if (flags & KEVENT_FLAG_WORKQ) {
                /* must already exist for bound threads. */
                if (flags & KEVENT_FLAG_KERNEL) {
@@ -3972,8 +3690,9 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro
                kq = descp->fd_wqkqueue;
                if (kq == NULL) {
                        struct kqueue *alloc_kq = kqueue_alloc(p, KEVENT_FLAG_WORKQ);
-                       if (alloc_kq == NULL)
+                       if (alloc_kq == NULL) {
                                return ENOMEM;
+                       }
 
                        knhash_lock(p);
                        if (descp->fd_wqkqueue == NULL) {
@@ -3996,13 +3715,13 @@ kevent_get_kq(struct proc *p, kqueue_id_t id, unsigned int flags, struct filepro
                if (fp != NULL)
                        fp_drop(p, fd, fp, 0);
                return error;
-       } 
+       }
 
 out:
        *fpp = fp;
        *fdp = fd;
        *kqp = kq;
-       
+
        return error;
 }
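
The dynamic-workloop path above allocates before taking the hash lock, redoes the lookup with the lock held, and either publishes the fresh kqueue or, if the race was lost, retains the existing one and discards the allocation. A small user-space mimic of that pattern, with a single slot standing in for the per-process hash:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refs;
};

static pthread_mutex_t slot_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *slot;          /* stands in for the kqueue hash */

static struct obj *
get_or_create(void)
{
	struct obj *fresh = calloc(1, sizeof(*fresh));   /* may sleep: unlocked */
	struct obj *found;

	if (fresh == NULL)
		return NULL;
	fresh->refs = 1;

	pthread_mutex_lock(&slot_lock);
	found = slot;
	if (found == NULL) {
		slot = fresh;                 /* we won: publish our allocation */
		found = fresh;
	} else {
		found->refs++;                /* lost the race: retain existing */
	}
	pthread_mutex_unlock(&slot_lock);

	if (found != fresh)
		free(fresh);                  /* discard the unused allocation */
	return found;
}

int
main(void)
{
	struct obj *a = get_or_create();
	struct obj *b = get_or_create();

	printf("same object: %s, refs: %d\n", a == b ? "yes" : "no", b->refs);
	return 0;
}
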
 
@@ -4048,7 +3767,7 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread)
        proc_t p = current_proc();
        struct filedesc *fdp = p->p_fd;
        kqueue_id_t workloop_id = 0;
-       os_reason_t reason;
+       os_reason_t reason = OS_REASON_NULL;
        mach_vm_address_t addr;
        uint32_t reason_size;
 
@@ -4067,7 +3786,6 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread)
                }
        }
        kqhash_unlock(p);
-       assert(workloop_id);
 
        reason = os_reason_create(OS_REASON_LIBSYSTEM,
                        OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK);
@@ -4082,21 +3800,26 @@ kevent_exit_on_workloop_ownership_leak(thread_t thread)
                goto out;
        }
 
-       struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
+       if (workloop_id) {
+               struct kcdata_descriptor *kcd = &reason->osr_kcd_descriptor;
 
-       if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
-                       sizeof(workloop_id), &addr) == KERN_SUCCESS) {
-               kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
-       }
+               if (kcdata_get_memory_addr(kcd, EXIT_REASON_WORKLOOP_ID,
+                               sizeof(workloop_id), &addr) == KERN_SUCCESS) {
+                       kcdata_memcpy(kcd, addr, &workloop_id, sizeof(workloop_id));
+               }
 
-       uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
-       if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
-                       sizeof(serial_no), &addr) == KERN_SUCCESS) {
-               kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
+               uint64_t serial_no = kevent_workloop_serial_no_copyin(p, workloop_id);
+               if (serial_no && kcdata_get_memory_addr(kcd, EXIT_REASON_DISPATCH_QUEUE_NO,
+                               sizeof(serial_no), &addr) == KERN_SUCCESS) {
+                       kcdata_memcpy(kcd, addr, &serial_no, sizeof(serial_no));
+               }
        }
-
 out:
 #if DEVELOPMENT || DEBUG
+       if (kevent_debug_flags() & KEVENT_PANIC_ON_WORKLOOP_OWNERSHIP_LEAK) {
+               panic("thread %p in task %p is leaked workloop 0x%016llx ownership",
+                               thread, p->task, workloop_id);
+       }
        psignal_try_thread_with_reason(p, thread, SIGABRT, reason);
        return 0;
 #else
@@ -4105,139 +3828,8 @@ out:
 #endif
 }
 
-
-static int
-kevent_servicer_detach_preflight(thread_t thread, unsigned int flags, struct kqueue *kq)
-{
-       int error = 0;
-       struct kqworkloop *kqwl;
-       struct uthread *ut;
-       struct kqrequest *kqr;
-
-       if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
-               return EINVAL;
-
-       /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
-       if (!(kq->kq_state & KQ_NO_WQ_THREAD))
-               return EINVAL;
-
-       /* allow detach only on not wq threads */
-       if (is_workqueue_thread(thread))
-               return EINVAL;
-
-       /* check that the current thread is bound to the requested wq */
-       ut = get_bsdthread_info(thread);
-       if (ut->uu_kqueue_bound != kq)
-               return EINVAL;
-
-       kqwl = (struct kqworkloop *)kq;
-       kqwl_req_lock(kqwl);
-       kqr = &kqwl->kqwl_request;
-
-       /* check that the wq is bound to the thread */
-       if ((kqr->kqr_state & KQR_BOUND) == 0  || (kqr->kqr_thread != thread))
-               error = EINVAL;
-
-       kqwl_req_unlock(kqwl);
-
-       return error;
-}
-
-static void
-kevent_servicer_detach_thread(struct proc *p, kqueue_id_t id, thread_t thread,
-               unsigned int flags, struct kqueue *kq)
-{
-       struct kqworkloop *kqwl;
-       struct uthread *ut;
-
-       assert((flags & KEVENT_FLAG_WORKLOOP) && (kq->kq_state & KQ_WORKLOOP));
-
-       /* allow detach only on not wqthreads threads */
-       assert(!is_workqueue_thread(thread));
-
-       /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads */
-       assert(kq->kq_state & KQ_NO_WQ_THREAD);
-
-       /* check that the current thread is bound to the requested kq */
-       ut = get_bsdthread_info(thread);
-       assert(ut->uu_kqueue_bound == kq);
-
-       kqwl = (struct kqworkloop *)kq;
-
-       kqlock(kq);
-
-       /* unbind the thread.
-        * unbind itself checks if still processing and ends it.
-        */
-       kqworkloop_unbind_thread(kqwl, thread, flags);
-
-       kqunlock(kq);
-
-       kevent_put_kq(p, id, NULL, kq);
-
-       return;
-}
-
-static int
-kevent_servicer_attach_thread(thread_t thread, unsigned int flags, struct kqueue *kq)
-{
-       int error = 0;
-       struct kqworkloop *kqwl;
-       struct uthread *ut;
-       struct kqrequest *kqr;
-
-       if (!(flags & KEVENT_FLAG_WORKLOOP) || !(kq->kq_state & KQ_WORKLOOP))
-               return EINVAL;
-
-       /* only kq created with KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD from userspace can have attached threads*/
-       if (!(kq->kq_state & KQ_NO_WQ_THREAD))
-               return EINVAL;
-
-       /* allow attach only on not wqthreads */
-       if (is_workqueue_thread(thread))
-               return EINVAL;
-
-       /* check that the thread is not already bound */
-       ut = get_bsdthread_info(thread);
-       if (ut->uu_kqueue_bound != NULL)
-               return EINVAL;
-
-       assert(ut->uu_kqueue_flags == 0);
-
-       kqlock(kq);
-       kqwl = (struct kqworkloop *)kq;
-       kqwl_req_lock(kqwl);
-       kqr = &kqwl->kqwl_request;
-
-       /* check that the kqueue is not already bound */
-       if (kqr->kqr_state & (KQR_BOUND | KQR_THREQUESTED | KQR_DRAIN)) {
-               error = EINVAL;
-               goto out;
-       }
-
-       assert(kqr->kqr_thread == NULL);
-       assert((kqr->kqr_state & KQR_PROCESSING) == 0);
-
-       kqr->kqr_state |= KQR_THREQUESTED;
-       kqr->kqr_qos_index = THREAD_QOS_UNSPECIFIED;
-       kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
-       kqr->kqr_dsync_owner_qos = THREAD_QOS_UNSPECIFIED;
-       kqr->kqr_owner_override_is_sync = 0;
-
-       kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
-
-       /* get a ref on the wlkq on behalf of the attached thread */
-       kqueue_retain(kq);
-
-out:
-       kqwl_req_unlock(kqwl);
-       kqunlock(kq);
-
-       return error;
-}
-
-static inline
-boolean_t kevent_args_requesting_events(unsigned int flags, int nevents)
+static inline boolean_t
+kevent_args_requesting_events(unsigned int flags, int nevents)
 {
        return (!(flags & KEVENT_FLAG_ERROR_EVENTS) && nevents > 0);
 }
@@ -4248,55 +3840,51 @@ kevent_internal(struct proc *p,
                user_addr_t changelist, int nchanges,
                user_addr_t ueventlist, int nevents,
                user_addr_t data_out, uint64_t data_available,
-               unsigned int flags, 
+               unsigned int flags,
                user_addr_t utimeout,
                kqueue_continue_t continuation,
                int32_t *retval)
 {
-       struct _kevent *cont_args;
        uthread_t ut;
        struct kqueue *kq;
        struct fileproc *fp = NULL;
        int fd = 0;
        struct kevent_internal_s kev;
-       int error, noutputs;
+       int error, noutputs, register_rc;
+       bool needs_end_processing = false;
        struct timeval atv;
        user_size_t data_size;
        user_size_t data_resid;
        thread_t thread = current_thread();
+       KNOTE_LOCK_CTX(knlc);
 
        /* Don't allow user-space threads to process output events from the workq kqs */
        if (((flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_KERNEL)) == KEVENT_FLAG_WORKQ) &&
            kevent_args_requesting_events(flags, nevents))
                return EINVAL;
 
+       if (flags & KEVENT_FLAG_PARKING) {
+               if (!kevent_args_requesting_events(flags, nevents) || id != (kqueue_id_t)-1)
+                       return EINVAL;
+       }
+
        /* restrict dynamic kqueue allocation to workloops (for now) */
        if ((flags & (KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP)) == KEVENT_FLAG_DYNAMIC_KQUEUE)
                return EINVAL;
 
        if ((flags & (KEVENT_FLAG_WORKLOOP)) && (flags & (KEVENT_FLAG_WORKQ)))
-                return EINVAL;
+               return EINVAL;
 
-       if (flags & (KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH |
-           KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD)) {
+       if (flags & (KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)) {
 
                /* allowed only on workloops when calling kevent_id from user-space */
                if (!(flags & KEVENT_FLAG_WORKLOOP) || (flags & KEVENT_FLAG_KERNEL) || !(flags & KEVENT_FLAG_DYNAMIC_KQUEUE))
                        return EINVAL;
-
-               /* cannot attach and detach simultaneously*/
-               if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) && (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH))
-                       return EINVAL;
-
-               /* cannot ask for events and detach */
-               if ((flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) && kevent_args_requesting_events(flags, nevents))
-                       return EINVAL;
-
        }
 
        /* prepare to deal with stack-wise allocation of out events */
        if (flags & KEVENT_FLAG_STACK_EVENTS) {
-               int scale = ((flags & KEVENT_FLAG_LEGACY32) ? 
+               int scale = ((flags & KEVENT_FLAG_LEGACY32) ?
                             (IS_64BIT_PROCESS(p) ? sizeof(struct user64_kevent) :
                                                    sizeof(struct user32_kevent)) :
                             ((flags & KEVENT_FLAG_LEGACY64) ? sizeof(struct kevent64_s) :
@@ -4308,47 +3896,56 @@ kevent_internal(struct proc *p,
        error = kevent_get_timeout(p, utimeout, flags, &atv);
        if (error)
                return error;
-       
+
        /* copyin initial value of data residual from data_available */
        error = kevent_get_data_size(p, data_available, flags, &data_size);
        if (error)
                return error;
 
        /* get the kq we are going to be working on */
-       error = kevent_get_kq(p, id, flags, &fp, &fd, &kq);
+       error = kevent_get_kq(p, id, NULL, flags, &fp, &fd, &kq);
+#if CONFIG_WORKLOOP_DEBUG
+       ut = (uthread_t)get_bsdthread_info(thread);
+       UU_KEVENT_HISTORY_WRITE_ENTRY(ut, {
+               .uu_kqid = id,
+               .uu_kq = error ? NULL : kq,
+               .uu_error = error,
+               .uu_nchanges = nchanges,
+               .uu_nevents = nevents,
+               .uu_flags = flags,
+       });
+#endif // CONFIG_WORKLOOP_DEBUG
        if (error)
                return error;
 
        /* only bound threads can receive events on workloops */
-       if ((flags & KEVENT_FLAG_WORKLOOP) && kevent_args_requesting_events(flags, nevents)) {
-               ut = (uthread_t)get_bsdthread_info(thread);
-               if (ut->uu_kqueue_bound != kq) {
-                       error = EXDEV;
-                       goto out;
-               }
+       if (flags & KEVENT_FLAG_WORKLOOP) {
+               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+               struct kqrequest *kqr = &kqwl->kqwl_request;
 
-       }
+               assert(kq->kq_state & KQ_WORKLOOP);
 
-       /* attach the current thread if necessary */
-       if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH) {
-               error = kevent_servicer_attach_thread(thread, flags, kq);
-               if (error)
-                       goto out;
-       }
-       else {
-               /* before processing events and committing to the system call, return an error if the thread cannot be detached when requested */
-               if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
-                       error = kevent_servicer_detach_preflight(thread, flags, kq);
-                       if (error)
+               if (kevent_args_requesting_events(flags, nevents)) {
+                       if (kq != kevent_get_bound_kqworkloop(thread)) {
+                               error = EXDEV;
                                goto out;
+                       }
+
+                       kq_req_lock(kqwl);
+                       /*
+                        * Disable the R2K notification while doing a register: if the
+                        * caller wants events too, we don't want the AST to be set if we
+                        * will process these events soon.
+                        */
+                       kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
+                       needs_end_processing = true;
+                       kq_req_unlock(kq);
+               }
+
+               if (id_out) {
+                       *id_out = kqwl->kqwl_dynamicid;
                }
-       }
 
-       if (id_out && kq && (flags & KEVENT_FLAG_WORKLOOP)) {
-               assert(kq->kq_state & KQ_WORKLOOP);
-               struct kqworkloop *kqwl;
-               kqwl = (struct kqworkloop *)kq;
-               *id_out = kqwl->kqwl_dynamicid;
        }
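
Two userspace-visible consequences of the workloop block above: asking a dynamic workloop for events from a thread that is not currently bound to it fails with EXDEV, while a pure register (no eventlist) is allowed from any thread, and the workloop's dynamic identifier is reported back through id_out. A minimal sketch, assuming the private kevent_id()/KEVENT_FLAG_* interface declared in <sys/event.h> on this release; the identifier and descriptor are hypothetical:

    #include <sys/event.h>
    #include <stdint.h>

    /*
     * Sketch only: register a read knote on a dynamic workloop.  With no
     * eventlist this is a pure register and may run on any thread; a call
     * that also requests events, made from a thread that is not bound to
     * this workloop, would return EXDEV instead.
     */
    static int
    workloop_register_read(kqueue_id_t id, int fd)
    {
        struct kevent_qos_s kev = {
            .ident  = (uint64_t)fd,
            .filter = EVFILT_READ,
            .flags  = EV_ADD | EV_ENABLE,
        };

        return kevent_id(id, &kev, 1, NULL, 0, NULL, NULL,
            KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_DYNAMIC_KQUEUE |
            KEVENT_FLAG_ERROR_EVENTS);
    }
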
 
        /* register all the change requests the user provided... */
@@ -4361,11 +3958,43 @@ kevent_internal(struct proc *p,
                /* Make sure user doesn't pass in any system flags */
                kev.flags &= ~EV_SYSFLAGS;
 
-               kevent_register(kq, &kev, p);
+               register_rc = kevent_register(kq, &kev, &knlc);
+               if (register_rc & FILTER_REGISTER_WAIT) {
+                       kqlock_held(kq);
+
+                       // f_post_register_wait is meant to call a continuation and not to
+                       // return, which is why we don't support FILTER_REGISTER_WAIT if
+                       // KEVENT_FLAG_ERROR_EVENTS is not passed, or if the event that
+                       // waits isn't the last.
+                       //
+                       // It is implementable, but not used by any userspace code at the
+                       // moment, so for now return ENOTSUP if someone tries to do it.
+                       if (nchanges == 1 && nevents >= 1 && (flags & KEVENT_FLAG_ERROR_EVENTS)) {
+                               struct _kevent_register *cont_args;
+                               /* store the continuation/completion data in the uthread */
+                               ut = (uthread_t)get_bsdthread_info(thread);
+                               cont_args = &ut->uu_save.uus_kevent_register;
+                               cont_args->kev = kev;
+                               cont_args->kq = kq;
+                               cont_args->fp = fp;
+                               cont_args->fd = fd;
+                               cont_args->ueventlist = ueventlist;
+                               cont_args->flags = flags;
+                               cont_args->retval = retval;
+                               cont_args->eventcount = nevents;
+                               cont_args->eventout = noutputs;
+                               knote_fops(cont_args->knote)->f_post_register_wait(ut, &knlc, cont_args);
+                               panic("f_post_register_wait returned (kev: %p)", &kev);
+                       }
+
+                       kev.flags |= EV_ERROR;
+                       kev.data = ENOTSUP;
+                       knote_unlock(kq, knlc.knlc_knote, &knlc, KNOTE_KQ_UNLOCK);
+               }
 
-               if (nevents > 0 &&
-                   ((kev.flags & EV_ERROR) || (kev.flags & EV_RECEIPT))) {
-                       if (kev.flags & EV_RECEIPT) {
+               // keep in sync with kevent_register_wait_return()
+               if (nevents > 0 && (kev.flags & (EV_ERROR|EV_RECEIPT))) {
+                       if ((kev.flags & EV_ERROR) == 0) {
                                kev.flags |= EV_ERROR;
                                kev.data = 0;
                        }
@@ -4386,9 +4015,10 @@ kevent_internal(struct proc *p,
 
        /* process pending events */
        if (nevents > 0 && noutputs == 0 && error == 0) {
+               struct _kevent *cont_args;
                /* store the continuation/completion data in the uthread */
                ut = (uthread_t)get_bsdthread_info(thread);
-               cont_args = &ut->uu_kevent.ss_kevent;
+               cont_args = &ut->uu_save.uus_kevent;
                cont_args->fp = fp;
                cont_args->fd = fd;
                cont_args->retval = retval;
@@ -4402,6 +4032,11 @@ kevent_internal(struct proc *p,
                cont_args->process_data.fp_data_size = data_size;
                cont_args->process_data.fp_data_resid = data_size;
 
+               /*
+                * kqworkloop_end_processing() will happen at the end of kqueue_scan()
+                */
+               needs_end_processing = false;
+
                error = kqueue_scan(kq, kevent_callback,
                                    continuation, cont_args,
                                    &cont_args->process_data,
@@ -4418,13 +4053,16 @@ kevent_internal(struct proc *p,
                }
        }
 
-       /* detach the current thread if necessary */
-       if (flags & KEVENT_FLAG_WORKLOOP_SERVICER_DETACH) {
-               assert(fp == NULL);
-               kevent_servicer_detach_thread(p, id, thread, flags, kq);
-       }
-
 out:
+       if (__improbable(needs_end_processing)) {
+               /*
+                * If we didn't go through kqworkloop_end_processing(),
+                * we need to do it here.
+                */
+               kqlock(kq);
+               kqworkloop_end_processing((struct kqworkloop *)kq, 0, 0);
+               kqunlock(kq);
+       }
        kevent_put_kq(p, id, fp, kq);
 
        /* don't restart after signals... */
@@ -4446,7 +4084,7 @@ out:
  */
 static int
 kevent_callback(__unused struct kqueue *kq, struct kevent_internal_s *kevp,
-    void *data)
+               void *data)
 {
        struct _kevent *cont_args;
        int error;
@@ -4493,6 +4131,122 @@ kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
        return (s);
 }
 
+static int
+kevent_register_validate_priority(struct kqueue *kq, struct knote *kn,
+               struct kevent_internal_s *kev)
+{
+       /* We don't care about the priority of a disabled or deleted knote */
+       if (kev->flags & (EV_DISABLE | EV_DELETE)) {
+               return 0;
+       }
+
+       if (kq->kq_state & KQ_WORKLOOP) {
+               /*
+                * Workloops need valid priorities with a QOS (excluding manager) for
+                * any enabled knote.
+                *
+                * When it is pre-existing, just make sure it has a valid QoS as
+                * kevent_register() will not use the incoming priority (filters do
+                * have the responsibility to validate it again, see filt_wltouch).
+                *
+                * If the knote is being made, validate the incoming priority.
+                */
+               if (!_pthread_priority_thread_qos(kn ? kn->kn_qos : kev->qos)) {
+                       return ERANGE;
+               }
+       }
+
+       return 0;
+}
+
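
kevent_register_validate_priority() makes the workloop QoS requirement explicit: any enabled knote must carry a pthread priority that encodes a real (non-manager) QoS class, otherwise the register fails with ERANGE, which is reported through EV_ERROR/data when KEVENT_FLAG_ERROR_EVENTS is used. Building on the earlier sketch, only the qos field of the change differs; make_qos_priority() below is a hypothetical stand-in for whatever encoding the caller uses to build a pthread priority:

    #include <sys/event.h>
    #include <sys/qos.h>
    #include <stdint.h>

    /* Sketch only: a QoS-less priority on an enabled workloop knote is
     * rejected with ERANGE; make_qos_priority() is hypothetical. */
    extern int32_t make_qos_priority(qos_class_t qc);  /* hypothetical encoder */

    struct kevent_qos_s kev = {
        .ident  = (uint64_t)fd,                        /* hypothetical descriptor */
        .filter = EVFILT_READ,
        .flags  = EV_ADD | EV_ENABLE,
        .qos    = make_qos_priority(QOS_CLASS_DEFAULT),/* must encode a QoS class */
    };
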
+/*
+ * Prepare a filter for waiting after register.
+ *
+ * The f_post_register_wait hook will be called later by kevent_register()
+ * and should call kevent_register_wait_block()
+ */
+static int
+kevent_register_wait_prepare(struct knote *kn, struct kevent_internal_s *kev)
+{
+       thread_t thread = current_thread();
+       struct uthread *uth = get_bsdthread_info(thread);
+
+       assert(knote_fops(kn)->f_extended_codes);
+
+       if (kn->kn_hook == NULL) {
+               thread_reference(thread);
+               kn->kn_hook = thread;
+       } else if (kn->kn_hook != thread) {
+               /*
+                * kn_hook may be set from a previous aborted wait.
+                * However, it has to be from the same thread.
+                */
+               kev->flags |= EV_ERROR;
+               kev->data = EXDEV;
+               return 0;
+       }
+
+       uth->uu_save.uus_kevent_register.knote = kn;
+       return FILTER_REGISTER_WAIT;
+}
+
+/*
+ * Clean up a kevent_register_wait_prepare() effect for threads that have been
+ * aborted instead of properly woken up with thread_wakeup_thread().
+ */
+static void
+kevent_register_wait_cleanup(struct knote *kn)
+{
+       thread_t thread = kn->kn_hook;
+       kn->kn_hook = NULL;
+       thread_deallocate(thread);
+}
+
+/*
+ * Must be called at the end of an f_post_register_wait call from a filter.
+ */
+static void
+kevent_register_wait_block(struct turnstile *ts, thread_t thread,
+               struct knote_lock_ctx *knlc, thread_continue_t cont,
+               struct _kevent_register *cont_args)
+{
+       knote_unlock(cont_args->kq, cont_args->knote, knlc, KNOTE_KQ_UNLOCK);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+       cont_args->handoff_thread = thread;
+       thread_handoff_parameter(thread, cont, cont_args);
+}
+
+/*
+ * Called by filters using an f_post_register_wait hook to return from their wait.
+ */
+static void
+kevent_register_wait_return(struct _kevent_register *cont_args)
+{
+       struct kqueue *kq = cont_args->kq;
+       proc_t p = kq->kq_p;
+       struct kevent_internal_s *kev = &cont_args->kev;
+       int error = 0;
+
+       if (cont_args->handoff_thread) {
+               thread_deallocate(cont_args->handoff_thread);
+       }
+
+       if (kev->flags & (EV_ERROR|EV_RECEIPT)) {
+               if ((kev->flags & EV_ERROR) == 0) {
+                       kev->flags |= EV_ERROR;
+                       kev->data = 0;
+               }
+               error = kevent_copyout(kev, &cont_args->ueventlist, p, cont_args->flags);
+               if (error == 0) cont_args->eventout++;
+       }
+
+       kevent_put_kq(p, cont_args->fd, cont_args->fp, kq);
+       if (error == 0) {
+               *cont_args->retval = cont_args->eventout;
+       }
+       unix_syscall_return(error);
+}
+
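
The three helpers above define the protocol a filter follows when a register must block until another thread acts on it: the filter's attach/touch path calls kevent_register_wait_prepare() and returns FILTER_REGISTER_WAIT, kevent_internal() then invokes the filter's f_post_register_wait hook, which hands the waiting thread off via kevent_register_wait_block(), and the continuation finishes the syscall with kevent_register_wait_return(). The workloop filter is the in-tree user of this path; the sketch below is a hypothetical filter, with filt_example_*, example_must_wait() and example_turnstile_for() made up for illustration:

    /*
     * Hypothetical filter illustrating the post-register wait protocol.
     */
    extern bool example_must_wait(struct knote *kn,
            struct kevent_internal_s *kev);            /* hypothetical */
    extern struct turnstile *example_turnstile_for(struct knote *kn); /* hypothetical */
    static void filt_example_register_continue(void *param, wait_result_t wr);

    static int
    filt_example_touch(struct knote *kn, struct kevent_internal_s *kev)
    {
        if (example_must_wait(kn, kev)) {
            /* records current_thread() in kn->kn_hook and stashes the
             * knote in the uthread; kevent_internal() will then invoke
             * f_post_register_wait instead of returning */
            return kevent_register_wait_prepare(kn, kev);
        }
        return FILTER_ACTIVE;
    }

    static void
    filt_example_post_register_wait(struct uthread *uth __unused,
            struct knote_lock_ctx *knlc, struct _kevent_register *cont_args)
    {
        struct knote *kn = cont_args->knote;
        struct turnstile *ts = example_turnstile_for(kn);

        /* unlocks the knote, completes the turnstile hand-off and blocks;
         * the continuation runs once the waiter is woken (or aborted) */
        kevent_register_wait_block(ts, kn->kn_hook, knlc,
            filt_example_register_continue, cont_args);
    }

    static void
    filt_example_register_continue(void *param, wait_result_t wr __unused)
    {
        /* copies out the kevent if needed, drops the kq reference and
         * returns from the syscall on behalf of the original caller */
        kevent_register_wait_return(param);
    }
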
 /*
  * kevent_register - add a new event to a kqueue
  *
@@ -4507,17 +4261,15 @@ kevent_description(struct kevent_internal_s *kevp, char *s, size_t n)
  *     caller holds a reference on the kqueue
  */
 
-void
+int
 kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
-    __unused struct proc *ctxp)
+               struct knote_lock_ctx *knlc)
 {
        struct proc *p = kq->kq_p;
        const struct filterops *fops;
        struct knote *kn = NULL;
-       int result = 0;
-       int error = 0;
+       int result = 0, error = 0;
        unsigned short kev_flags = kev->flags;
-       int knoteuse_flags = KNUSE_NONE;
 
        if (kev->filter < 0) {
                if (kev->filter + EVFILT_SYSCOUNT < 0) {
@@ -4532,7 +4284,7 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
 
        /* restrict EV_VANISHED to adding udata-specific dispatch kevents */
        if ((kev->flags & EV_VANISHED) &&
-           (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
+                       (kev->flags & (EV_ADD | EV_DISPATCH2)) != (EV_ADD | EV_DISPATCH2)) {
                error = EINVAL;
                goto out;
        }
@@ -4557,279 +4309,249 @@ kevent_register(struct kqueue *kq, struct kevent_internal_s *kev,
        }
 
 restart:
-
        /* find the matching knote from the fd tables/hashes */
        kn = kq_find_knote_and_kq_lock(kq, kev, fops->f_isfd, p);
+       error = kevent_register_validate_priority(kq, kn, kev);
+       result = 0;
+       if (error) {
+               goto out;
+       }
 
-       if (kn == NULL) {
-               if (kev->flags & EV_ADD) {
-                       struct fileproc *knote_fp = NULL;
+       if (kn == NULL && (kev->flags & EV_ADD) == 0) {
+               /*
+                * No knote found, EV_ADD wasn't specified
+                */
 
-                       /* grab a file reference for the new knote */
-                       if (fops->f_isfd) {
-                               if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
-                                       goto out;
-                               }
-                       }
+               if ((kev_flags & EV_ADD) && (kev_flags & EV_DELETE) &&
+                               (kq->kq_state & KQ_WORKLOOP)) {
+                       /*
+                        * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
+                        * that doesn't care about ENOENT, so just pretend the deletion
+                        * happened.
+                        */
+               } else {
+                       error = ENOENT;
+               }
+               goto out;
+
+       } else if (kn == NULL) {
+               /*
+                * No knote found, need to attach a new one (attach)
+                */
+
+               struct fileproc *knote_fp = NULL;
 
-                       kn = knote_alloc();
-                       if (kn == NULL) {
-                               error = ENOMEM;
-                               if (knote_fp != NULL)
-                                       fp_drop(p, kev->ident, knote_fp, 0);
+               /* grab a file reference for the new knote */
+               if (fops->f_isfd) {
+                       if ((error = fp_lookup(p, kev->ident, &knote_fp, 0)) != 0) {
                                goto out;
                        }
+               }
 
-                       kn->kn_fp = knote_fp;
-                       knote_set_kq(kn, kq);
-                       kqueue_retain(kq); /* retain a kq ref */
-                       kn->kn_filtid = ~kev->filter;
-                       kn->kn_inuse = 1;  /* for f_attach() */
-                       kn->kn_status = KN_ATTACHING | KN_ATTACHED;
-
-                       /* was vanish support requested */
-                       if (kev->flags & EV_VANISHED) {
-                               kev->flags &= ~EV_VANISHED;
-                               kn->kn_status |= KN_REQVANISH;
-                       }
+               kn = knote_alloc();
+               if (kn == NULL) {
+                       error = ENOMEM;
+                       if (knote_fp != NULL)
+                               fp_drop(p, kev->ident, knote_fp, 0);
+                       goto out;
+               }
 
-                       /* snapshot matching/dispatching protcol flags into knote */
-                       if (kev->flags & EV_DISPATCH)
-                               kn->kn_status |= KN_DISPATCH;
-                       if (kev->flags & EV_UDATA_SPECIFIC)
-                               kn->kn_status |= KN_UDATA_SPECIFIC;
+               kn->kn_fp = knote_fp;
+               kn->kn_kq_packed = (intptr_t)(struct kqueue *)kq;
+               kqueue_retain(kq); /* retain a kq ref */
+               kn->kn_filtid = ~kev->filter;
+               kn->kn_status = KN_ATTACHING | KN_ATTACHED;
 
-                       /*
-                        * copy the kevent state into knote
-                        * protocol is that fflags and data
-                        * are saved off, and cleared before
-                        * calling the attach routine.
-                        */
-                       kn->kn_kevent = *kev;
-                       kn->kn_sfflags = kev->fflags;
-                       kn->kn_sdata = kev->data;
-                       kn->kn_fflags = 0;
-                       kn->kn_data = 0;
+               /* was vanish support requested */
+               if (kev->flags & EV_VANISHED) {
+                       kev->flags &= ~EV_VANISHED;
+                       kn->kn_status |= KN_REQVANISH;
+               }
 
-                       /* invoke pthread kext to convert kevent qos to thread qos */
-                       knote_canonicalize_kevent_qos(kn);
-                       knote_set_qos_index(kn, qos_index_from_qos(kn, kn->kn_qos, FALSE));
+               /* snapshot matching/dispatching protocol flags into knote */
+               if (kev->flags & EV_DISPATCH)
+                       kn->kn_status |= KN_DISPATCH;
+               if (kev->flags & EV_UDATA_SPECIFIC)
+                       kn->kn_status |= KN_UDATA_SPECIFIC;
+               if (kev->flags & EV_DISABLE)
+                       kn->kn_status |= KN_DISABLED;
 
-                       /* before anyone can find it */
-                       if (kev->flags & EV_DISABLE) {
-                               /*
-                                * do this before anyone can find it,
-                                * this can't call knote_disable() because it expects having
-                                * the kqlock held
-                                */
-                               kn->kn_status |= KN_DISABLED;
-                       }
+               /*
+                * copy the kevent state into knote
+                * protocol is that fflags and data
+                * are saved off, and cleared before
+                * calling the attach routine.
+                */
+               kn->kn_kevent = *kev;
+               kn->kn_sfflags = kev->fflags;
+               kn->kn_sdata = kev->data;
+               kn->kn_fflags = 0;
+               kn->kn_data = 0;
+               knote_reset_priority(kn, kev->qos);
 
-                       /* Add the knote for lookup thru the fd table */
-                       error = kq_add_knote(kq, kn, kev, p, &knoteuse_flags);
-                       if (error) {
-                               (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
-                               knote_free(kn);
-                               if (knote_fp != NULL)
-                                       fp_drop(p, kev->ident, knote_fp, 0);
-
-                               if (error == ERESTART) {
-                                       error = 0;
-                                       goto restart;
-                               }
-                               goto out;
+               /* Add the knote for lookup thru the fd table */
+               error = kq_add_knote(kq, kn, knlc, p);
+               if (error) {
+                       (void)kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
+                       knote_free(kn);
+                       if (knote_fp != NULL)
+                               fp_drop(p, kev->ident, knote_fp, 0);
+
+                       if (error == ERESTART) {
+                               goto restart;
                        }
+                       goto out;
+               }
+
+               /* fp reference count now applies to knote */
 
-                       /* fp reference count now applies to knote */
-                       /* rwlock boost is now held */
+               /*
+                * we can't use filter_call() because f_attach can change the filter ops
+                * for a filter that supports f_extended_codes, so we need to reload
+                * knote_fops() and not use `fops`.
+                */
+               result = fops->f_attach(kn, kev);
+               if (result && !knote_fops(kn)->f_extended_codes) {
+                       result = FILTER_ACTIVE;
+               }
 
-                       /* call filter attach routine */
-                       result = fops->f_attach(kn, kev);
+               kqlock(kq);
 
+               if (kn->kn_flags & EV_ERROR) {
                        /*
-                        * Trade knote use count for kq lock.
-                        * Cannot be dropped because we held
-                        * KN_ATTACHING throughout.
+                        * Failed to attach correctly, so drop.
                         */
-                       knoteuse2kqlock(kq, kn, KNUSE_STEAL_DROP | knoteuse_flags);
+                       kn->kn_status &= ~(KN_ATTACHED | KN_ATTACHING);
+                       error = kn->kn_data;
+                       knote_drop(kq, kn, knlc);
+                       result = 0;
+                       goto out;
+               }
 
-                       if (kn->kn_flags & EV_ERROR) {
-                               /*
-                                * Failed to attach correctly, so drop.
-                                * All other possible users/droppers
-                                * have deferred to us.  Save the error
-                                * to return to our caller.
-                                */
-                               kn->kn_status &= ~KN_ATTACHED;
-                               kn->kn_status |= KN_DROPPING;
-                               error = kn->kn_data;
-                               kqunlock(kq);
-                               knote_drop(kn, p);
-                               goto out;
-                       }
-
-                       /* end "attaching" phase - now just attached */
-                       kn->kn_status &= ~KN_ATTACHING;
-
-                       if (kn->kn_status & KN_DROPPING) {
-                               /*
-                                * Attach succeeded, but someone else
-                                * deferred their drop - now we have
-                                * to do it for them.
-                                */
-                               kqunlock(kq);
-                               knote_drop(kn, p);
-                               goto out;
-                       }
+               /*
+                * end "attaching" phase - now just attached
+                *
+                * Mark the thread request overcommit, if appropriate
+                *
+                * If the attach routine indicated that an
+                * event is already fired, activate the knote.
+                */
+               kn->kn_status &= ~KN_ATTACHING;
+               knote_set_qos_overcommit(kn);
 
-                       /* Mark the thread request overcommit - if appropos */
-                       knote_set_qos_overcommit(kn);
+               if (result & FILTER_ACTIVE) {
+                       if (result & FILTER_ADJUST_EVENT_QOS_BIT)
+                               knote_adjust_qos(kq, kn, result);
+                       knote_activate(kn);
+               }
 
-                       /*
-                        * If the attach routine indicated that an
-                        * event is already fired, activate the knote.
-                        */
-                       if (result)
-                               knote_activate(kn);
+       } else if (!knote_lock(kq, kn, knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
 
-                       if (knote_fops(kn)->f_post_attach) {
-                               error = knote_fops(kn)->f_post_attach(kn, kev);
-                               if (error) {
-                                       kqunlock(kq);
-                                       goto out;
-                               }
-                       }
+               /*
+                * The knote was dropped while we were waiting for the lock,
+                * we need to re-evaluate entirely
+                */
 
-               } else {
-                       if ((kev_flags & (EV_ADD | EV_DELETE)) == (EV_ADD | EV_DELETE) &&
-                                       (kq->kq_state & KQ_WORKLOOP)) {
-                               /*
-                                * For workloops, understand EV_ADD|EV_DELETE as a "soft" delete
-                                * that doesn't care about ENOENT, so just pretend the deletion
-                                * happened.
-                                */
-                       } else {
-                               error = ENOENT;
-                       }
-                       goto out;
-               }
+               goto restart;
 
-       } else {
-               /* existing knote: kqueue lock already taken by kq_find_knote_and_kq_lock */
+       } else if (kev->flags & EV_DELETE) {
+               /*
+                * Deletion of a knote (drop)
+                *
+                * If the filter wants to filter drop events, let it do so.
+                *
+                * defer-delete: when trying to delete a disabled EV_DISPATCH2 knote,
+                * we must wait for the knote to be re-enabled (unless it is being
+                * re-enabled atomically here).
+                */
 
-               if ((kn->kn_status & (KN_DROPPING | KN_ATTACHING)) != 0) {
-                       /*
-                        * The knote is not in a stable state, wait for that
-                        * transition to complete and then redrive the lookup.
-                        */
-                       knoteusewait(kq, kn);
-                       goto restart;
-               }
+               if (knote_fops(kn)->f_allow_drop) {
+                       bool drop;
 
-               if (kev->flags & EV_DELETE) {
+                       kqunlock(kq);
+                       drop = knote_fops(kn)->f_allow_drop(kn, kev);
+                       kqlock(kq);
 
-                       /*
-                        * If attempting to delete a disabled dispatch2 knote,
-                        * we must wait for the knote to be re-enabled (unless
-                        * it is being re-enabled atomically here).
-                        */
-                       if ((kev->flags & EV_ENABLE) == 0 &&
-                           (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
-                                            (KN_DISPATCH2 | KN_DISABLED)) {
-                               kn->kn_status |= KN_DEFERDELETE;
-                               kqunlock(kq);
-                               error = EINPROGRESS;
-                       } else if (knote_fops(kn)->f_drop_and_unlock) {
-                               /*
-                                * The filter has requested to handle EV_DELETE events
-                                *
-                                * ERESTART means the kevent has to be re-evaluated
-                                */
-                               error = knote_fops(kn)->f_drop_and_unlock(kn, kev);
-                               if (error == ERESTART) {
-                                       error = 0;
-                                       goto restart;
-                               }
-                       } else if (kqlock2knotedrop(kq, kn)) {
-                               /* standard/default EV_DELETE path */
-                               knote_drop(kn, p);
-                       } else {
-                               /*
-                                * The kqueue is unlocked, it's not being
-                                * dropped, and kqlock2knotedrop returned 0:
-                                * this means that someone stole the drop of
-                                * the knote from us.
-                                */
-                               error = EINPROGRESS;
-                       }
-                       goto out;
+                       if (!drop) goto out_unlock;
                }
 
-               /*
-                * If we are re-enabling a deferred-delete knote,
-                * just enable it now and avoid calling the
-                * filter touch routine (it has delivered its
-                * last event already).
-                */
-               if ((kev->flags & EV_ENABLE) &&
-                   (kn->kn_status & KN_DEFERDELETE)) {
-                       assert(kn->kn_status & KN_DISABLED);
-                       knote_activate(kn);
-                       knote_enable(kn);
-                       kqunlock(kq);
-                       goto out;
+               if ((kev->flags & EV_ENABLE) == 0 &&
+                               (kn->kn_status & (KN_DISPATCH2 | KN_DISABLED)) ==
+                               (KN_DISPATCH2 | KN_DISABLED)) {
+                       kn->kn_status |= KN_DEFERDELETE;
+                       error = EINPROGRESS;
+                       goto out_unlock;
                }
 
-               /*
-                * If we are disabling, do it before unlocking and
-                * calling the touch routine (so no processing can
-                * see the new kevent state before the disable is
-                * applied).
-                */
-               if (kev->flags & EV_DISABLE)
-                       knote_disable(kn);
+               knote_drop(kq, kn, knlc);
+               goto out;
 
+       } else {
                /*
-                * Convert the kqlock to a use reference on the
-                * knote so we can call the filter touch routine.
+                * Regular update of a knote (touch)
+                *
+                * Call touch routine to notify filter of changes in filter values
+                * (and to re-determine if any events are fired).
+                *
+                * If the knote is in defer-delete, avoid calling the filter touch
+                * routine (it has delivered its last event already).
+                *
+                * If the touch routine had no failure,
+                * apply the requested side effects to the knote.
                 */
-               if (knoteuse_needs_boost(kn, kev)) {
-                       knoteuse_flags |= KNUSE_BOOST;
-               }
-               if (kqlock2knoteuse(kq, kn, knoteuse_flags)) {
-                       /*
-                        * Call touch routine to notify filter of changes
-                        * in filter values (and to re-determine if any
-                        * events are fired).
-                        */
-                       result = knote_fops(kn)->f_touch(kn, kev);
 
-                       /* Get the kq lock back (don't defer droppers). */
-                       if (!knoteuse2kqlock(kq, kn, knoteuse_flags)) {
-                               kqunlock(kq);
-                               goto out;
+               if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
+                       if (kev->flags & EV_ENABLE) {
+                               result = FILTER_ACTIVE;
                        }
+               } else {
+                       kqunlock(kq);
+                       result = filter_call(knote_fops(kn), f_touch(kn, kev));
+                       kqlock(kq);
+               }
 
-                       /* Handle errors during touch routine */
-                       if (kev->flags & EV_ERROR) {
-                               error = kev->data;
-                               kqunlock(kq);
-                               goto out;
+               if (kev->flags & EV_ERROR) {
+                       result = 0;
+               } else {
+                       /* accept new kevent state */
+                       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
+                               kn->kn_udata = kev->udata;
+                       if (kev->flags & EV_DISABLE)
+                               knote_disable(kn);
+                       if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT))
+                               knote_dequeue(kn);
+                       if ((result & FILTER_UPDATE_REQ_QOS) &&
+                                       kev->qos && kev->qos != kn->kn_qos) {
+                               knote_reset_priority(kn, kev->qos);
                        }
-
-                       /* Activate it if the touch routine said to */
-                       if (result)
+                       if (result & FILTER_ACTIVE) {
+                               thread_qos_t qos;
+                               if (result & FILTER_ADJUST_EVENT_QOS_BIT) {
+                                       if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
+                                               knote_apply_qos_override(kn, qos);
+                                       }
+                               }
                                knote_activate(kn);
+                       }
+                       if (result & (FILTER_UPDATE_REQ_QOS | FILTER_ADJUST_EVENT_QOS_BIT)) {
+                               if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
+                                       knote_wakeup(kn);
+                               }
+                       }
+                       if (kev->flags & EV_ENABLE)
+                               knote_enable(kn);
                }
-
-               /* Enable the knote if called for */
-               if (kev->flags & EV_ENABLE)
-                       knote_enable(kn);
-
        }
 
-       /* still have kqlock held and knote is valid */
-       kqunlock(kq);
+out_unlock:
+       if ((result & FILTER_REGISTER_WAIT) == 0) {
+               /*
+                * When the filter asked for a post-register wait,
+                * we leave the knote and kqueue locked for kevent_internal()
+                * to call the filter's f_post_register_wait hook.
+                */
+               knote_unlock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
+       }
 
 out:
        /* output local errors through the kevent */
@@ -4837,9 +4559,9 @@ out:
                kev->flags |= EV_ERROR;
                kev->data = error;
        }
+       return result;
 }
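
Two deletion behaviours of the rewritten kevent_register() are worth spelling out for callers: on workloops, EV_ADD|EV_DELETE is treated as a "soft" delete that never reports ENOENT, and deleting a currently-disabled EV_DISPATCH2 knote is deferred with EINPROGRESS unless EV_ENABLE is passed in the same change. A hedged sketch of the corresponding change entries, under the same kevent_qos_s/kevent_id() assumptions as the earlier sketches; identifiers are hypothetical:

    /* Sketch only: "soft" delete on a workloop -- succeeds even if no
     * matching knote exists, instead of returning ENOENT. */
    struct kevent_qos_s soft_del = {
        .ident  = ident,                 /* hypothetical */
        .filter = EVFILT_READ,
        .flags  = EV_ADD | EV_DELETE,
    };

    /* Sketch only: deleting a disabled EV_DISPATCH2 knote.  Without
     * EV_ENABLE the drop is deferred and EINPROGRESS is reported;
     * re-enabling atomically lets the delete proceed. */
    struct kevent_qos_s del_now = {
        .ident  = ident,
        .filter = EVFILT_READ,
        .flags  = EV_DELETE | EV_ENABLE,
    };
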
 
-
 /*
  * knote_process - process a triggered event
  *
@@ -4861,16 +4583,17 @@ out:
  *     kqueue locked on entry and exit - but may be dropped
  */
 static int
-knote_process(struct knote *kn,        
+knote_process(struct knote *kn,
        kevent_callback_t callback,
        void *callback_data,
-       struct filt_process_s *process_data,
-       struct proc *p)
+       struct filt_process_s *process_data)
 {
        struct kevent_internal_s kev;
        struct kqueue *kq = knote_get_kq(kn);
-       int result = 0;
+       KNOTE_LOCK_CTX(knlc);
+       int result = FILTER_ACTIVE;
        int error = 0;
+       bool drop = false;
 
        bzero(&kev, sizeof(kev));
 
@@ -4897,110 +4620,93 @@ knote_process(struct knote *kn,
                              kn->kn_status | (kn->kn_id << 32), kn->kn_filtid);
        }
 
+       if ((kn->kn_status & KN_DROPPING) ||
+                       !knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS)) {
+               /*
+                * When the knote is dropping or has dropped,
+                * then there's nothing we want to process.
+                */
+               return EJUSTRETURN;
+       }
+
        /*
         * For deferred-drop or vanished events, we just create a fake
         * event to acknowledge end-of-life.  Otherwise, we call the
         * filter's process routine to snapshot the kevent state under
         * the filter's locking protocol.
+        *
+        * suppress knotes to avoid returning the same event multiple times in
+        * a single call.
         */
+       knote_suppress(kn);
+
        if (kn->kn_status & (KN_DEFERDELETE | KN_VANISHED)) {
                /* create fake event */
                kev.filter = kn->kn_filter;
                kev.ident = kn->kn_id;
-               kev.qos = kn->kn_qos;
-               kev.flags = (kn->kn_status & KN_DEFERDELETE) ? 
-                           EV_DELETE : EV_VANISHED;
+               kev.flags = (kn->kn_status & KN_DEFERDELETE) ? EV_DELETE : EV_VANISHED;
                kev.flags |= (EV_DISPATCH2 | EV_ONESHOT);
                kev.udata = kn->kn_udata;
-               result = 1;
-
-               knote_suppress(kn);
        } else {
-               int flags = KNUSE_NONE;
                /* deactivate - so new activations indicate a wakeup */
                knote_deactivate(kn);
 
-               /* suppress knotes to avoid returning the same event multiple times in a single call. */
-               knote_suppress(kn);
-
-               if (knoteuse_needs_boost(kn, NULL)) {
-                       flags |= KNUSE_BOOST;
-               }
-               /* convert lock to a knote use reference */
-               if (!kqlock2knoteuse(kq, kn, flags))
-                       panic("dropping knote found on queue\n");
-
-               /* call out to the filter to process with just a ref */
-               result = knote_fops(kn)->f_process(kn, process_data, &kev);
-               if (result) flags |= KNUSE_STEAL_DROP;
+               kqunlock(kq);
+               result = filter_call(knote_fops(kn), f_process(kn, process_data, &kev));
+               kqlock(kq);
+       }
 
-               /*
-                * convert our reference back to a lock. accept drop
-                * responsibility from others if we've committed to
-                * delivering event data.
-                */
-               if (!knoteuse2kqlock(kq, kn, flags)) {
-                       /* knote dropped */
-                       kn = NULL;
+       /*
+        * Determine how to dispatch the knote for future event handling.
+        * not-fired: just return (do not callout, leave deactivated).
+        * One-shot:  If dispatch2, enter deferred-delete mode (unless this
+        *            is the deferred delete event delivery itself).  Otherwise,
+        *            drop it.
+        * Dispatch:  don't clear state, just mark it disabled.
+        * Cleared:   just leave it deactivated.
+        * Others:    re-activate as there may be more events to handle.
+        *            This will not wake up more handlers right now, but
+        *            at the completion of handling events it may trigger
+        *            more handler threads (TODO: optimize based on more than
+        *            just this one event being detected by the filter).
+        */
+       if ((result & FILTER_ACTIVE) == 0) {
+               if ((kn->kn_status & (KN_ACTIVE | KN_STAYACTIVE)) == 0) {
+                       /*
+                        * Stay active knotes should not be unsuppressed or we'd create an
+                        * infinite loop.
+                        *
+                        * Some knotes (like EVFILT_WORKLOOP) can be reactivated from
+                        * within f_process() but that doesn't necessarily make them
+                        * ready to process, so we should leave them be.
+                        *
+                        * For other knotes, since we will not return an event,
+                        * there's no point keeping the knote suppressed.
+                        */
+                       knote_unsuppress(kn);
                }
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+               return EJUSTRETURN;
        }
 
-       if (kn != NULL) {
-               /*
-                * Determine how to dispatch the knote for future event handling.
-                * not-fired: just return (do not callout, leave deactivated).
-                * One-shot:  If dispatch2, enter deferred-delete mode (unless this is
-                *            is the deferred delete event delivery itself).  Otherwise,
-                *            drop it.
-                * stolendrop:We took responsibility for someone else's drop attempt.
-                *            treat this just like one-shot and prepare to turn it back
-                *            into a deferred delete if required.
-                * Dispatch:  don't clear state, just mark it disabled.
-                * Cleared:   just leave it deactivated.
-                * Others:    re-activate as there may be more events to handle.
-                *            This will not wake up more handlers right now, but
-                *            at the completion of handling events it may trigger
-                *            more handler threads (TODO: optimize based on more than
-                *            just this one event being detected by the filter).
-                */
+       if (result & FILTER_ADJUST_EVENT_QOS_BIT)
+               knote_adjust_qos(kq, kn, result);
+       kev.qos = _pthread_priority_combine(kn->kn_qos, kn->kn_qos_override);
 
-               if (result == 0)
-                       return (EJUSTRETURN);
-
-               if ((kev.flags & EV_ONESHOT) || (kn->kn_status & KN_STOLENDROP)) {
-                       if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
-                               /* defer dropping non-delete oneshot dispatch2 events */
-                               kn->kn_status |= KN_DEFERDELETE;
-                               knote_disable(kn);
-
-                               /* if we took over another's drop clear those flags here */
-                               if (kn->kn_status & KN_STOLENDROP) {
-                                       assert(kn->kn_status & KN_DROPPING);
-                                       /*
-                                        * the knote will be dropped when the
-                                        * deferred deletion occurs
-                                        */
-                                       kn->kn_status &= ~(KN_DROPPING|KN_STOLENDROP);
-                               }
-                       } else if (kn->kn_status & KN_STOLENDROP) {
-                               /* We now own the drop of the knote. */
-                               assert(kn->kn_status & KN_DROPPING);
-                               knote_unsuppress(kn);
-                               kqunlock(kq);
-                               knote_drop(kn, p);
-                               kqlock(kq);
-                       } else if (kqlock2knotedrop(kq, kn)) {
-                               /* just EV_ONESHOT, _not_ DISPATCH2 */
-                               knote_drop(kn, p);
-                               kqlock(kq);
-                       }
-               } else if (kn->kn_status & KN_DISPATCH) {
-                       /* disable all dispatch knotes */
+       if (kev.flags & EV_ONESHOT) {
+               if ((kn->kn_status & (KN_DISPATCH2 | KN_DEFERDELETE)) == KN_DISPATCH2) {
+                       /* defer dropping non-delete oneshot dispatch2 events */
+                       kn->kn_status |= KN_DEFERDELETE;
                        knote_disable(kn);
-               } else if ((kev.flags & EV_CLEAR) == 0) {
-                       /* re-activate in case there are more events */
-                       knote_activate(kn);
+               } else {
+                       drop = true;
                }
+       } else if (kn->kn_status & KN_DISPATCH) {
+               /* disable all dispatch knotes */
+               knote_disable(kn);
+       } else if ((kev.flags & EV_CLEAR) == 0) {
+               /* re-activate in case there are more events */
+               knote_activate(kn);
        }
 
        /*
@@ -5008,80 +4714,137 @@ knote_process(struct knote *kn,
         * If we have to detach and drop the knote, do
         * it while we have the kq unlocked.
         */
-       if (result) {
-               kqunlock(kq);
-               error = (callback)(kq, &kev, callback_data);
-               kqlock(kq);
+       if (drop) {
+               knote_drop(kq, kn, &knlc);
+       } else {
+               knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
        }
-       return (error);
-}
 
+       if (kev.flags & EV_VANISHED) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KNOTE_VANISHED),
+                             kev.ident, kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
+                             kn->kn_filtid);
+       }
+
+       error = (callback)(kq, &kev, callback_data);
+       kqlock(kq);
+       return error;
+}
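
The dispatch decision documented above has a well-known userspace counterpart: an EV_DISPATCH knote is disabled after each delivery and has to be re-armed with EV_ENABLE, an EV_ONESHOT knote (without EV_DISPATCH2) is dropped after delivery, and EV_CLEAR simply leaves the knote deactivated. A minimal sketch using the public kevent64() interface; kq, fd and handle_read() are hypothetical:

    #include <sys/event.h>
    #include <stddef.h>

    extern void handle_read(int fd);            /* hypothetical handler */

    /* Sketch only: classic EV_DISPATCH re-arm loop. */
    static void
    dispatch_loop(int kq, int fd)
    {
        struct kevent64_s kev;

        /* one delivery at a time: the knote is disabled after each event */
        EV_SET64(&kev, fd, EVFILT_READ, EV_ADD | EV_DISPATCH, 0, 0, 0, 0, 0);
        kevent64(kq, &kev, 1, NULL, 0, 0, NULL);

        for (;;) {
            if (kevent64(kq, NULL, 0, &kev, 1, 0, NULL) <= 0)
                break;
            handle_read((int)kev.ident);

            /* re-arm: without this the knote stays disabled */
            EV_SET64(&kev, kev.ident, EVFILT_READ, EV_ENABLE, 0, 0,
                kev.udata, 0, 0);
            kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
        }
    }
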
 
 /*
- * Return 0 to indicate that processing should proceed,
- * -1 if there is nothing to process.
- *
- * Called with kqueue locked and returns the same way,
- * but may drop lock temporarily.
+ * Returns -1 if the kqueue was unbound and processing should not happen
  */
+#define KQWQAE_BEGIN_PROCESSING 1
+#define KQWQAE_END_PROCESSING   2
+#define KQWQAE_UNBIND           3
 static int
-kqworkq_begin_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
+kqworkq_acknowledge_events(struct kqworkq *kqwq, struct kqrequest *kqr,
+               int kevent_flags, int kqwqae_op)
 {
-       struct kqrequest *kqr;
-       thread_t self = current_thread();
-       __assert_only struct uthread *ut = get_bsdthread_info(self);
-
-       assert(kqwq->kqwq_state & KQ_WORKQ);
-       assert(qos_index < KQWQ_NQOS);
+       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED;
+       thread_t thread = kqr->kqr_thread;
+       struct knote *kn;
+       int rc = 0;
+       bool seen_stayactive = false, unbind;
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
-                     flags, qos_index);
+       kqlock_held(&kqwq->kqwq_kqueue);
 
-       kqwq_req_lock(kqwq);
+       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
+               /*
+                * Return suppressed knotes to their original state.
+                * For workq kqueues, suppressed ones that are still
+                * truly active (not just forced into the queue) will
+                * set flags we check below to see if anything got
+                * woken up.
+                */
+               while ((kn = TAILQ_FIRST(&kqr->kqr_suppressed)) != NULL) {
+                       assert(kn->kn_status & KN_SUPPRESSED);
+                       knote_unsuppress(kn);
+                       if (kn->kn_status & KN_STAYACTIVE) {
+                               seen_stayactive = true;
+                       }
+               }
+       }
 
-       kqr = kqworkq_get_request(kqwq, qos_index);
+       kq_req_lock(kqwq);
 
-       /* manager skips buckets that haven't asked for its help */
-       if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
+#if DEBUG || DEVELOPMENT
+       thread_t self = current_thread();
+       struct uthread *ut = get_bsdthread_info(self);
 
-               /* If nothing for manager to do, just return */
-               if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
-                       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
-                                               0, kqr->kqr_state);
-                       kqwq_req_unlock(kqwq);
-                       return -1;
+       assert(kqr->kqr_state & KQR_THREQUESTED);
+       assert(kqr->kqr_thread == self);
+       assert(ut->uu_kqr_bound == kqr);
+#endif // DEBUG || DEVELOPMENT
+
+       if (kqwqae_op == KQWQAE_UNBIND) {
+               unbind = true;
+       } else if ((kevent_flags & KEVENT_FLAG_PARKING) == 0) {
+               unbind = false;
+       } else if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive) {
+               /*
+                * When we unsuppress stayactive knotes, for the kind that are hooked
+                * through select, we need to process once before we can assert there's
+                * no event pending. Hence we can't unbind during BEGIN PROCESSING.
+                */
+               unbind = false;
+       } else {
+               unbind = ((kqr->kqr_state & KQR_WAKEUP) == 0);
+       }
+       if (unbind) {
+               old_override = kqworkq_unbind_locked(kqwq, kqr, thread);
+               rc = -1;
+               /*
+                * request a new thread if we didn't process the whole queue or real events
+                * have happened (not just putting stay-active events back).
+                */
+               if (kqr->kqr_state & KQR_WAKEUP) {
+                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr,
+                                       kqr->kqr_qos_index, 0);
                }
-               /* bind manager thread from this time on */
-               kqworkq_bind_thread_impl(kqwq, qos_index, self, flags);
+       }
 
-       } else {
-               /* We should already be bound to this kqueue */
-               assert(kqr->kqr_state & KQR_BOUND);
-               assert(kqr->kqr_thread == self);
-               assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
-               assert(ut->uu_kqueue_qos_index == qos_index);
-               assert((ut->uu_kqueue_flags & flags) == ut->uu_kqueue_flags);
+       if (rc == 0) {
+               /*
+                * Reset wakeup bit to notice events firing while we are processing,
+                * as we cannot rely on the bucket queue emptiness because of stay
+                * active knotes.
+                */
+               kqr->kqr_state &= ~KQR_WAKEUP;
        }
 
-       /*
-        * we should have been requested to be here
-        * and nobody else should still be processing
-        */
-       assert(kqr->kqr_state & KQR_WAKEUP);
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert((kqr->kqr_state & KQR_PROCESSING) == 0);
+       kq_req_unlock(kqwq);
 
-       /* reset wakeup trigger to catch new events after we start processing */
-       kqr->kqr_state &= ~KQR_WAKEUP;
+       if (old_override) {
+               thread_drop_ipc_override(thread);
+       }
 
-       /* convert to processing mode */
-       kqr->kqr_state |= KQR_PROCESSING;
+       return rc;
+}
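
For readability, the unbind decision made by kqworkq_acknowledge_events() can be restated as a pure predicate; this is only a sketch mirroring the logic above, not kernel code:

    #include <stdbool.h>

    /* Sketch: restatement of the unbind decision above. */
    static bool
    kqworkq_should_unbind(int kqwqae_op, int kevent_flags,
            bool seen_stayactive, bool wakeup_pending)
    {
        if (kqwqae_op == KQWQAE_UNBIND)
            return true;            /* explicit unbind request */
        if ((kevent_flags & KEVENT_FLAG_PARKING) == 0)
            return false;           /* thread is not trying to park */
        if (kqwqae_op == KQWQAE_BEGIN_PROCESSING && seen_stayactive)
            return false;           /* stay-active knotes must be processed once */
        return !wakeup_pending;     /* park only if nothing new fired */
    }
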
+
+/*
+ * Return 0 to indicate that processing should proceed,
+ * -1 if there is nothing to process.
+ *
+ * Called with kqueue locked and returns the same way,
+ * but may drop lock temporarily.
+ */
+static int
+kqworkq_begin_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
+               int kevent_flags)
+{
+       int rc = 0;
+
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_START,
+                       0, kqr->kqr_qos_index);
+
+       rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
+                       KQWQAE_BEGIN_PROCESSING);
 
        KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_PROCESS_BEGIN) | DBG_FUNC_END,
-                     kqr_thread_id(kqr), kqr->kqr_state);
+                       thread_tid(kqr->kqr_thread), kqr->kqr_state);
 
-       kqwq_req_unlock(kqwq);
-       return 0;
+       return rc;
 }
 
 static inline bool
@@ -5102,10 +4865,11 @@ kqworkloop_is_processing_on_current_thread(struct kqworkloop *kqwl)
        return false;
 }
 
-static void
-kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_override)
+static thread_qos_t
+kqworkloop_acknowledge_events(struct kqworkloop *kqwl)
 {
        struct kqrequest *kqr = &kqwl->kqwl_request;
+       kq_index_t qos = THREAD_QOS_UNSPECIFIED;
        struct knote *kn, *tmp;
 
        kqlock_held(&kqwl->kqwl_kqueue);
@@ -5119,48 +4883,112 @@ kqworkloop_acknowledge_events(struct kqworkloop *kqwl, boolean_t clear_ipc_overr
                if (knote_fops(kn)->f_adjusts_qos && (kn->kn_status & KN_DISABLED) &&
                                (kn->kn_status & (KN_STAYACTIVE | KN_DROPPING)) == 0 &&
                                (kn->kn_flags & (EV_DISPATCH | EV_DISABLE)) == EV_DISPATCH) {
-                       /*
-                        * When called from unbind, clear the sync ipc override on the knote
-                        * for events which are delivered.
-                        */
-                       if (clear_ipc_override) {
-                               knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
-                       }
+                       qos = MAX(qos, knote_get_qos_override_index(kn));
                        continue;
                }
                knote_unsuppress(kn);
        }
+
+       return qos;
 }
 
 static int
-kqworkloop_begin_processing(struct kqworkloop *kqwl,
-               __assert_only unsigned int flags)
+kqworkloop_begin_processing(struct kqworkloop *kqwl, unsigned int kevent_flags)
 {
        struct kqrequest *kqr = &kqwl->kqwl_request;
        struct kqueue *kq = &kqwl->kqwl_kqueue;
+       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
+       thread_t thread = kqr->kqr_thread;
+       int rc = 0, op = KQWL_UTQ_NONE;
 
        kqlock_held(kq);
 
        KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_START,
-                     kqwl->kqwl_dynamicid, flags, 0);
-
-       kqwl_req_lock(kqwl);
+                     kqwl->kqwl_dynamicid, 0, 0);
 
        /* nobody else should still be processing */
-       assert((kqr->kqr_state & KQR_PROCESSING) == 0);
        assert((kq->kq_state & KQ_PROCESSING) == 0);
 
-       kqr->kqr_state |= KQR_PROCESSING | KQR_R2K_NOTIF_ARMED;
        kq->kq_state |= KQ_PROCESSING;
 
-       kqwl_req_unlock(kqwl);
+       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
+               op = KQWL_UTQ_RESET_WAKEUP_OVERRIDE;
+       }
+
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
+               /*
+                * When "parking" we want to process events and if no events are found
+                * unbind.
+                *
+                * However, non-overcommit threads sometimes park even when they have
+                * more work so that the pool can narrow.  For these, we need to unbind
+                * early, so that calling kqworkloop_update_threads_qos() can ask the
+                * workqueue subsystem whether the thread should park despite having
+                * pending events.
+                */
+               if (kqr->kqr_state & KQR_THOVERCOMMIT) {
+                       op = KQWL_UTQ_PARKING;
+               } else {
+                       op = KQWL_UTQ_UNBINDING;
+               }
+       }
+       if (op == KQWL_UTQ_NONE) {
+               goto done;
+       }
+
+       qos_override = kqworkloop_acknowledge_events(kqwl);
 
-       kqworkloop_acknowledge_events(kqwl, FALSE);
+       kq_req_lock(kqwl);
+
+       if (op == KQWL_UTQ_UNBINDING) {
+               old_override = kqworkloop_unbind_locked(kqwl, thread);
+               (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
+       }
+       kqworkloop_update_threads_qos(kqwl, op, qos_override);
+       if (op == KQWL_UTQ_PARKING) {
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
+                       /*
+                        * We cannot trust KQR_WAKEUP when looking at stay active knotes.
+                        * We need to process once, and kqworkloop_end_processing will
+                        * handle the unbind.
+                        */
+               } else if ((kqr->kqr_state & KQR_WAKEUP) == 0 || kqwl->kqwl_owner) {
+                       old_override = kqworkloop_unbind_locked(kqwl, thread);
+                       (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
+                       rc = -1;
+               }
+       } else if (op == KQWL_UTQ_UNBINDING) {
+               if (kqr->kqr_thread == thread) {
+                       /*
+                        * The thread request fired again, passed the admission check and
+                        * got bound to the current thread again.
+                        */
+               } else {
+                       rc = -1;
+               }
+       }
 
+       if (rc == 0) {
+               /*
+                * Reset wakeup bit to notice stay active events firing while we are
+                * processing, as we cannot rely on the stayactive bucket emptiness.
+                */
+               kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
+       } else {
+               kq->kq_state &= ~KQ_PROCESSING;
+       }
+
+       kq_req_unlock(kqwl);
+
+       if (old_override) {
+               thread_drop_ipc_override(thread);
+       }
+
+done:
        KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_BEGIN) | DBG_FUNC_END,
-                     kqwl->kqwl_dynamicid, flags, 0);
+                     kqwl->kqwl_dynamicid, 0, 0);
 
-       return 0;
+       return rc;
 }
 
 /*
@@ -5172,22 +5000,15 @@ kqworkloop_begin_processing(struct kqworkloop *kqwl,
  * May block.
  */
 static int
-kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int flags)
+kqfile_begin_processing(struct kqueue *kq)
 {
        struct kqtailq *suppressq;
 
        kqlock_held(kq);
 
-       if (kq->kq_state & KQ_WORKQ) {
-               return kqworkq_begin_processing((struct kqworkq *)kq, qos_index, flags);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               return kqworkloop_begin_processing((struct kqworkloop*)kq, flags);
-       }
-
+       assert((kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) == 0);
        KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_START,
-                     VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
-
-       assert(qos_index == QOS_INDEX_KQFILE);
+                     VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
 
        /* wait to become the exclusive processing thread */
        for (;;) {
@@ -5202,11 +5023,11 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl
 
                /* if someone else is processing the queue, wait */
                kq->kq_state |= KQ_PROCWAIT;
-               suppressq = kqueue_get_suppressed_queue(kq, qos_index);
+               suppressq = kqueue_get_suppressed_queue(kq, NULL);
                waitq_assert_wait64((struct waitq *)&kq->kq_wqs,
-                                   CAST_EVENT64_T(suppressq),
-                                   THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
-               
+                               CAST_EVENT64_T(suppressq), THREAD_UNINT | THREAD_WAIT_NOREPORT,
+                               TIMEOUT_WAIT_FOREVER);
+
                kqunlock(kq);
                thread_block(THREAD_CONTINUE_NULL);
                kqlock(kq);
@@ -5219,7 +5040,7 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl
        kq->kq_state &= ~KQ_WAKEUP;
 
        /* anything left to process? */
-       if (kqueue_queue_empty(kq, qos_index)) {
+       if (kqueue_queue_empty(kq, QOS_INDEX_KQFILE)) {
                KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_BEGIN) | DBG_FUNC_END,
                              VM_KERNEL_UNSLIDE_OR_PERM(kq), 1);
                return -1;
@@ -5235,697 +5056,364 @@ kqueue_begin_processing(struct kqueue *kq, kq_index_t qos_index, unsigned int fl
 }
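
/*
 * Illustrative sketch (not xnu code): the KQ_PROCESSING / KQ_PROCWAIT
 * handshake above behaves like a single-owner "processing" flag with waiters
 * parked on a condition; kqfile_end_processing() further below broadcasts to
 * those waiters.  The userspace analogy here uses pthreads and hypothetical
 * names (struct kqmodel, begin_processing, end_processing) purely to show the
 * control flow under the assumption that one thread at a time may scan a
 * plain kqueue file.
 */
#include <pthread.h>
#include <stdbool.h>

struct kqmodel {
        pthread_mutex_t lock;       /* stands in for the kqueue lock */
        pthread_cond_t  procwait;   /* stands in for the suppression-queue waitq */
        bool            processing; /* stands in for KQ_PROCESSING */
};

/* Become the exclusive processing thread, blocking while someone else is. */
static void
begin_processing(struct kqmodel *kq)
{
        pthread_mutex_lock(&kq->lock);
        while (kq->processing) {
                /* analogue of setting KQ_PROCWAIT and blocking on the waitq */
                pthread_cond_wait(&kq->procwait, &kq->lock);
        }
        kq->processing = true;          /* claim the processing role */
        pthread_mutex_unlock(&kq->lock);
}

/* Give up the processing role and wake any thread parked in begin_processing(). */
static void
end_processing(struct kqmodel *kq)
{
        pthread_mutex_lock(&kq->lock);
        kq->processing = false;
        pthread_cond_broadcast(&kq->procwait); /* waitq_wakeup64_all() analogue */
        pthread_mutex_unlock(&kq->lock);
}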
 
 /*
- *     kqworkq_end_processing - Complete the processing of a workq kqueue
- *
- *     We may have to request new threads.
- *     This can happen there are no waiting processing threads and:
- *     - there were active events we never got to (count > 0)
- *     - we pended waitq hook callouts during processing
- *     - we pended wakeups while processing (or unsuppressing)
+ * Try to end the processing, only called when a workq thread is attempting to
+ * park (KEVENT_FLAG_PARKING is set).
  *
- *     Called with kqueue lock held.
+ * When returning -1, the kqworkq is setup again so that it is ready to be
+ * processed.
  */
-static void
-kqworkq_end_processing(struct kqworkq *kqwq, kq_index_t qos_index, int flags)
+static int
+kqworkq_end_processing(struct kqworkq *kqwq, struct kqrequest *kqr,
+               int kevent_flags)
 {
-#pragma unused(flags)
-
-       struct kqueue *kq = &kqwq->kqwq_kqueue;
-       struct kqtailq *suppressq = kqueue_get_suppressed_queue(kq, qos_index);
-
-       thread_t self = current_thread();
-       struct uthread *ut = get_bsdthread_info(self);
-       struct knote *kn;
-       struct kqrequest *kqr;
-       thread_t thread;
-
-       assert(kqwq->kqwq_state & KQ_WORKQ);
-       assert(qos_index < KQWQ_NQOS);
-
-       /* Are we really bound to this kqueue? */
-       if (ut->uu_kqueue_bound != kq) {
-               assert(ut->uu_kqueue_bound == kq);
-               return;
-       }
-
-       kqr = kqworkq_get_request(kqwq, qos_index);
-
-       kqwq_req_lock(kqwq);
-
-       /* Do we claim to be manager? */
-       if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
-
-               /* bail if not bound that way */
-               if (ut->uu_kqueue_qos_index != KQWQ_QOS_MANAGER ||
-                   (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0) {
-                       assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
-                       assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
-                       kqwq_req_unlock(kqwq);
-                       return;
-               }
-
-               /* bail if this request wasn't already getting manager help */
-               if ((kqr->kqr_state & KQWQ_THMANAGER) == 0 ||
-                   (kqr->kqr_state & KQR_PROCESSING) == 0) {
-                       kqwq_req_unlock(kqwq);
-                       return;
-               }
-       } else {
-               if (ut->uu_kqueue_qos_index != qos_index ||
-                   (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER)) {
-                       assert(ut->uu_kqueue_qos_index == qos_index);
-                       assert((ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER) == 0);
-                       kqwq_req_unlock(kqwq);
-                       return;
-               }
-       }
-
-       assert(kqr->kqr_state & KQR_BOUND);
-       thread = kqr->kqr_thread;
-       assert(thread == self);
-
-       assert(kqr->kqr_state & KQR_PROCESSING);
-
-       /* If we didn't drain the whole queue, re-mark a wakeup being needed */
-       if (!kqueue_queue_empty(kq, qos_index))
+       if (!kqueue_queue_empty(&kqwq->kqwq_kqueue, kqr->kqr_qos_index)) {
+               /* remember we didn't process everything */
+               kq_req_lock(kqwq);
                kqr->kqr_state |= KQR_WAKEUP;
-
-       kqwq_req_unlock(kqwq);
-
-       /*
-        * Return suppressed knotes to their original state.
-        * For workq kqueues, suppressed ones that are still
-        * truly active (not just forced into the queue) will
-        * set flags we check below to see if anything got
-        * woken up.
-        */
-       while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
-               assert(kn->kn_status & KN_SUPPRESSED);
-               knote_unsuppress(kn);
+               kq_req_unlock(kqwq);
        }
 
-       kqwq_req_lock(kqwq);
-
-       /* Indicate that we are done processing this request */
-       kqr->kqr_state &= ~KQR_PROCESSING;
-
-       /*
-        * Drop our association with this one request and its
-        * override on us.
-        */
-       kqworkq_unbind_thread(kqwq, qos_index, thread, flags);
-
-       /*
-        * request a new thread if we didn't process the whole
-        * queue or real events have happened (not just putting
-        * stay-active events back).
-        */
-       if (kqr->kqr_state & KQR_WAKEUP) {
-               if (kqueue_queue_empty(kq, qos_index)) {
-                       kqr->kqr_state &= ~KQR_WAKEUP;
-               } else {
-                       kqworkq_request_thread(kqwq, qos_index);
-               }
-       }
-       kqwq_req_unlock(kqwq);
-}
-
-static void
-kqworkloop_end_processing(struct kqworkloop *kqwl, int nevents,
-               unsigned int flags)
-{
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       struct kqueue *kq = &kqwl->kqwl_kqueue;
-
-       kqlock_held(kq);
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
-                       kqwl->kqwl_dynamicid, flags, 0);
-
-       if ((kq->kq_state & KQ_NO_WQ_THREAD) && nevents == 0 &&
-                       (flags & KEVENT_FLAG_IMMEDIATE) == 0) {
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
                /*
-                * <rdar://problem/31634014> We may soon block, but have returned no
-                * kevents that need to be kept supressed for overriding purposes.
-                *
-                * It is hence safe to acknowledge events and unsuppress everything, so
-                * that if we block we can observe all events firing.
+                * if acknowledge events "succeeds" it means there are events,
+                * which is a failure condition for end_processing.
                 */
-               kqworkloop_acknowledge_events(kqwl, TRUE);
-       }
-
-       kqwl_req_lock(kqwl);
-
-       assert(kqr->kqr_state & KQR_PROCESSING);
-       assert(kq->kq_state & KQ_PROCESSING);
-
-       kq->kq_state &= ~KQ_PROCESSING;
-       kqr->kqr_state &= ~KQR_PROCESSING;
-       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
-
-       kqwl_req_unlock(kqwl);
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
-                       kqwl->kqwl_dynamicid, flags, 0);
-}
-
-/*
- * Called with kqueue lock held.
- */
-static void
-kqueue_end_processing(struct kqueue *kq, kq_index_t qos_index,
-               int nevents, unsigned int flags)
-{
-       struct knote *kn;
-       struct kqtailq *suppressq;
-       int procwait;
-
-       kqlock_held(kq);
-
-       assert((kq->kq_state & KQ_WORKQ) == 0);
-
-       if (kq->kq_state & KQ_WORKLOOP) {
-               return kqworkloop_end_processing((struct kqworkloop *)kq, nevents, flags);
-       }
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
-                     VM_KERNEL_UNSLIDE_OR_PERM(kq), flags);
-
-       assert(qos_index == QOS_INDEX_KQFILE);
-
-       /*
-        * Return suppressed knotes to their original state.
-        */
-       suppressq = kqueue_get_suppressed_queue(kq, qos_index);
-       while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
-               assert(kn->kn_status & KN_SUPPRESSED);
-               knote_unsuppress(kn);
+               int rc = kqworkq_acknowledge_events(kqwq, kqr, kevent_flags,
+                               KQWQAE_END_PROCESSING);
+               if (rc == 0) {
+                       return -1;
+               }
        }
 
-       procwait = (kq->kq_state & KQ_PROCWAIT);
-       kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
-
-       if (procwait) {
-               /* first wake up any thread already waiting to process */
-               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                  CAST_EVENT64_T(suppressq),
-                                  THREAD_AWAKENED,
-                                  WAITQ_ALL_PRIORITIES);
-       }
+       return 0;
 }
 
 /*
- *     kqwq_internal_bind - bind thread to processing workq kqueue
+ * Try to end the processing, only called when a workq thread is attempting to
+ * park (KEVENT_FLAG_PARKING is set).
  *
- *     Determines if the provided thread will be responsible for
- *     servicing the particular QoS class index specified in the
- *     parameters. Once the binding is done, any overrides that may
- *     be associated with the cooresponding events can be applied.
+ * When returning -1, the kqworkloop is setup again so that it is ready to be
+ * processed (as if kqworkloop_begin_processing had just been called).
  *
- *     This should be called as soon as the thread identity is known,
- *     preferably while still at high priority during creation.
- *
- *  - caller holds a reference on the process (and workq kq)
- *     - the thread MUST call kevent_qos_internal after being bound
- *       or the bucket of events may never be delivered.  
- *     - Nothing locked
- *    (unless this is a synchronous bind, then the request is locked)
+ * If successful and KEVENT_FLAG_PARKING was set in the kevent_flags,
+ * the kqworkloop is unbound from its servicer as a side effect.
  */
 static int
-kqworkq_internal_bind(
-       struct proc *p,
-       kq_index_t qos_index,
-       thread_t thread,
-       unsigned int flags)
-{
-       struct kqueue *kq;
-       struct kqworkq *kqwq;
-       struct kqrequest *kqr;
-       struct uthread *ut = get_bsdthread_info(thread);
-
-       /* If no process workq, can't be our thread. */
-       kq = p->p_fd->fd_wqkqueue;
-
-       if (kq == NULL)
-               return 0;
-
-       assert(kq->kq_state & KQ_WORKQ);
-       kqwq = (struct kqworkq *)kq;
-
-       /*
-        * No need to bind the manager thread to any specific
-        * bucket, but still claim the thread.
-        */
-       if (qos_index == KQWQ_QOS_MANAGER) {
-               assert(ut->uu_kqueue_bound == NULL);
-               assert(flags & KEVENT_FLAG_WORKQ_MANAGER);
-               ut->uu_kqueue_bound = kq;
-               ut->uu_kqueue_qos_index = qos_index;
-               ut->uu_kqueue_flags = flags;
-
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
-                             thread_tid(thread), flags, qos_index);
-
-               return 1;
-       }
-
-       /*
-        * If this is a synchronous bind callback, the request
-        * lock is already held, so just do the bind.
-        */
-       if (flags & KEVENT_FLAG_SYNCHRONOUS_BIND) {
-               kqwq_req_held(kqwq);
-               /* strip out synchronout bind flag */
-               flags &= ~KEVENT_FLAG_SYNCHRONOUS_BIND;
-               kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
-               return 1;
-       }
-
-       /*
-        * check the request that corresponds to our qos_index
-        * to see if there is an outstanding request.
-        */
-       kqr = kqworkq_get_request(kqwq, qos_index);
-       assert(kqr->kqr_qos_index == qos_index);
-       kqwq_req_lock(kqwq);
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND),
-                     thread_tid(thread), flags, qos_index, kqr->kqr_state);
-
-       if ((kqr->kqr_state & KQR_THREQUESTED) &&
-           (kqr->kqr_state & KQR_PROCESSING) == 0) {
-
-               if ((kqr->kqr_state & KQR_BOUND) &&
-                   thread == kqr->kqr_thread) {
-                       /* duplicate bind - claim the thread */
-                       assert(ut->uu_kqueue_bound == kq);
-                       assert(ut->uu_kqueue_qos_index == qos_index);
-                       kqwq_req_unlock(kqwq);
-                       return 1;
-               }
-               if ((kqr->kqr_state & (KQR_BOUND | KQWQ_THMANAGER)) == 0) {
-                       /* ours to bind to */
-                       kqworkq_bind_thread_impl(kqwq, qos_index, thread, flags);
-                       kqwq_req_unlock(kqwq);
-                       return 1;
-               }
-       }
-       kqwq_req_unlock(kqwq);
-       return 0;
-}
-
-static void
-kqworkloop_bind_thread_impl(struct kqworkloop *kqwl,
-                            thread_t thread,
-                            __assert_only unsigned int flags)
+kqworkloop_end_processing(struct kqworkloop *kqwl, int flags, int kevent_flags)
 {
-       assert(flags & KEVENT_FLAG_WORKLOOP);
-
-       /* the request object must be locked */
-       kqwl_req_held(kqwl);
-
+       struct kqueue *kq = &kqwl->kqwl_kqueue;
        struct kqrequest *kqr = &kqwl->kqwl_request;
-       struct uthread *ut = get_bsdthread_info(thread);
-       boolean_t ipc_override_is_sync;
-       kq_index_t qos_index = kqworkloop_combined_qos(kqwl, &ipc_override_is_sync);
-
-       /* nobody else bound so finally bind (as a workloop) */
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == 0);
-       assert(thread != kqwl->kqwl_owner);
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND),
-                     kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
-                     qos_index,
-                     (uintptr_t)(((uintptr_t)kqr->kqr_override_index << 16) |
-                     (((uintptr_t)kqr->kqr_state) << 8) |
-                     ((uintptr_t)ipc_override_is_sync)));
+       thread_qos_t old_override = THREAD_QOS_UNSPECIFIED, qos_override;
+       thread_t thread = kqr->kqr_thread;
+       int rc = 0;
 
-       kqr->kqr_state |= KQR_BOUND | KQR_R2K_NOTIF_ARMED;
-       kqr->kqr_thread = thread;
+       kqlock_held(kq);
 
-       /* bind the workloop to the uthread */
-       ut->uu_kqueue_bound = (struct kqueue *)kqwl;
-       ut->uu_kqueue_flags = flags;
-       ut->uu_kqueue_qos_index = qos_index;
-       assert(ut->uu_kqueue_override_is_sync == 0);
-       ut->uu_kqueue_override_is_sync = ipc_override_is_sync;
-       if (qos_index) {
-               thread_add_ipc_override(thread, qos_index);
-       }
-       if (ipc_override_is_sync) {
-               thread_add_sync_ipc_override(thread);
-       }
-}
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_START,
+                       kqwl->kqwl_dynamicid, 0, 0);
 
-/*
- *  workloop_fulfill_threadreq - bind thread to processing workloop
- *
- * The provided thread will be responsible for delivering events
- * associated with the given kqrequest.  Bind it and get ready for
- * the thread to eventually arrive.
- *
- * If WORKLOOP_FULFILL_THREADREQ_SYNC is specified, the callback
- * within the context of the pthread_functions->workq_threadreq
- * callout.  In this case, the request structure is already locked.
- */
-int
-workloop_fulfill_threadreq(struct proc *p,
-                           workq_threadreq_t req,
-                           thread_t thread,
-                           int flags)
-{
-       int sync = (flags & WORKLOOP_FULFILL_THREADREQ_SYNC);
-       int cancel = (flags & WORKLOOP_FULFILL_THREADREQ_CANCEL);
-       struct kqrequest *kqr;
-       struct kqworkloop *kqwl;
+       if (flags & KQ_PROCESSING) {
+               assert(kq->kq_state & KQ_PROCESSING);
 
-       kqwl = (struct kqworkloop *)((uintptr_t)req -
-                                    offsetof(struct kqworkloop, kqwl_request) -
-                                    offsetof(struct kqrequest, kqr_req));
-       kqr = &kqwl->kqwl_request;
+               /*
+                * If we still have queued stayactive knotes, remember we didn't finish
+                * processing all of them.  This should be extremely rare and would
+                * require a lot of them to be registered and fired.
+                */
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[KQWL_BUCKET_STAYACTIVE])) {
+                       kq_req_lock(kqwl);
+                       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS,
+                                       KQWL_BUCKET_STAYACTIVE);
+                       kq_req_unlock(kqwl);
+               }
 
-       /* validate we're looking at something valid */
-       if (kqwl->kqwl_p != p ||
-           (kqwl->kqwl_state & KQ_WORKLOOP) == 0) {
-               assert(kqwl->kqwl_p == p);
-               assert(kqwl->kqwl_state & KQ_WORKLOOP);
-               return EINVAL;
+               /*
+                * When KEVENT_FLAG_PARKING is set, we need to attempt an unbind while
+                * still under the lock.
+                *
+                * So we do everything kqworkloop_unbind() would do, but because we're
+                * inside kqueue_process(), if the workloop actually received events
+                * while our locks were dropped, we have the opportunity to fail the end
+                * processing and loop again.
+                *
+                * This avoids going through the process-wide workqueue lock, and
+                * hence scales better.
+                */
+               if (kevent_flags & KEVENT_FLAG_PARKING) {
+                       qos_override = kqworkloop_acknowledge_events(kqwl);
+               }
        }
-       
-       if (!sync)
-               kqwl_req_lock(kqwl);
 
-       /* Should be a pending request */
-       if ((kqr->kqr_state & KQR_BOUND) ||
-           (kqr->kqr_state & KQR_THREQUESTED) == 0) {
+       kq_req_lock(kqwl);
 
-               assert((kqr->kqr_state & KQR_BOUND) == 0);
-               assert(kqr->kqr_state & KQR_THREQUESTED);
-               if (!sync)
-                       kqwl_req_unlock(kqwl);
-               return EINPROGRESS;
+       if (kevent_flags & KEVENT_FLAG_PARKING) {
+               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_PARKING, qos_override);
+               if ((kqr->kqr_state & KQR_WAKEUP) && !kqwl->kqwl_owner) {
+                       /*
+                        * Reset wakeup bit to notice stay active events firing while we are
+                        * processing, as we cannot rely on the stayactive bucket emptiness.
+                        */
+                       kqr->kqr_wakeup_indexes &= ~KQWL_STAYACTIVE_FIRED_BIT;
+                       rc = -1;
+               } else {
+                       old_override = kqworkloop_unbind_locked(kqwl, thread);
+                       (void)kqueue_release(kqwl, KQUEUE_CANT_BE_LAST_REF);
+                       kq->kq_state &= ~flags;
+               }
+       } else {
+               kq->kq_state &= ~flags;
+               kqr->kqr_state |= KQR_R2K_NOTIF_ARMED;
+               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
        }
 
-       assert((kqr->kqr_state & KQR_DRAIN) == 0);
+       kq_req_unlock(kqwl);
 
-       /*
-        * Is it a cancel indication from pthread.
-        * If so, we must be exiting/exec'ing. Forget
-        * our pending request.
-        */
-       if (cancel) {
-               kqr->kqr_state &= ~KQR_THREQUESTED;
-               kqr->kqr_state |= KQR_DRAIN;
-       } else {
-               /* do the actual bind? */
-               kqworkloop_bind_thread_impl(kqwl, thread, KEVENT_FLAG_WORKLOOP);
+       if (old_override) {
+               thread_drop_ipc_override(thread);
        }
 
-       if (!sync)
-               kqwl_req_unlock(kqwl);
-
-       if (cancel)
-               kqueue_release_last(p, &kqwl->kqwl_kqueue); /* may dealloc kq */
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_PROCESS_END) | DBG_FUNC_END,
+                       kqwl->kqwl_dynamicid, 0, 0);
 
-       return 0;
+       return rc;
 }
-       
 
 /*
- *     kevent_qos_internal_bind - bind thread to processing kqueue
- *
- *     Indicates that the provided thread will be responsible for
- *     servicing the particular QoS class index specified in the
- *     parameters. Once the binding is done, any overrides that may
- *     be associated with the cooresponding events can be applied.
- *
- *     This should be called as soon as the thread identity is known,
- *     preferably while still at high priority during creation.
- *
- *  - caller holds a reference on the kqueue.
- *     - the thread MUST call kevent_qos_internal after being bound
- *       or the bucket of events may never be delivered.  
- *     - Nothing locked (may take mutex or block).
+ * Called with kqueue lock held.
  */
-
-int
-kevent_qos_internal_bind(
-       struct proc *p,
-       int qos_class,
-       thread_t thread,
-       unsigned int flags)
-{
-       kq_index_t qos_index;
-
-       assert(flags & KEVENT_FLAG_WORKQ);
-
-       if (thread == THREAD_NULL || (flags & KEVENT_FLAG_WORKQ) == 0) {
-               return EINVAL;
-       }
-
-       /* get the qos index we're going to service */
-       qos_index = qos_index_for_servicer(qos_class, thread, flags);
-
-       if (kqworkq_internal_bind(p, qos_index, thread, flags))
-               return 0;
-
-       return EINPROGRESS;
-}
-
-
 static void
-kqworkloop_internal_unbind(
-       struct proc *p,
-       thread_t thread,
-       unsigned int flags)
+kqfile_end_processing(struct kqueue *kq)
 {
-       struct kqueue *kq;
-       struct kqworkloop *kqwl;
-       struct uthread *ut = get_bsdthread_info(thread);
-
-       assert(ut->uu_kqueue_bound != NULL);
-       kq = ut->uu_kqueue_bound;
-       assert(kq->kq_state & KQ_WORKLOOP);
-       kqwl = (struct kqworkloop *)kq;
-
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND),
-                     kqwl->kqwl_dynamicid, (uintptr_t)thread_tid(thread),
-                     flags, 0);
-
-       if (!(kq->kq_state & KQ_NO_WQ_THREAD)) {
-               assert(is_workqueue_thread(thread));
-
-               kqlock(kq);
-               kqworkloop_unbind_thread(kqwl, thread, flags);
-               kqunlock(kq);
-
-               /* If last reference, dealloc the workloop kq */
-               kqueue_release_last(p, kq);
-       } else {
-               assert(!is_workqueue_thread(thread));
-               kevent_servicer_detach_thread(p, kqwl->kqwl_dynamicid, thread, flags, kq);
-       }
-}
-
-static void
-kqworkq_internal_unbind(
-       struct proc *p,
-       kq_index_t qos_index,
-       thread_t thread,
-       unsigned int flags)
-{
-       struct kqueue *kq;
-       struct kqworkq *kqwq;
-       struct uthread *ut;
-       kq_index_t end_index;
-
-       assert(thread == current_thread());
-       ut = get_bsdthread_info(thread);
-
-       kq = p->p_fd->fd_wqkqueue;
-       assert(kq->kq_state & KQ_WORKQ);
-       assert(ut->uu_kqueue_bound == kq);
+       struct knote *kn;
+       struct kqtailq *suppressq;
+       int procwait;
 
-       kqwq = (struct kqworkq *)kq;
+       kqlock_held(kq);
 
-       /* end servicing any requests we might own */
-       end_index = (qos_index == KQWQ_QOS_MANAGER) ? 
-           0 : qos_index;
-       kqlock(kq);
+       assert((kq->kq_state & (KQ_WORKQ|KQ_WORKLOOP)) == 0);
 
-       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND),
-                     (uintptr_t)thread_tid(thread), flags, qos_index);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQ_PROCESS_END),
+                       VM_KERNEL_UNSLIDE_OR_PERM(kq), 0);
 
-       do {
-               kqworkq_end_processing(kqwq, qos_index, flags);
-       } while (qos_index-- > end_index);
+       /*
+        * Return suppressed knotes to their original state.
+        */
+       suppressq = kqueue_get_suppressed_queue(kq, NULL);
+       while ((kn = TAILQ_FIRST(suppressq)) != NULL) {
+               assert(kn->kn_status & KN_SUPPRESSED);
+               knote_unsuppress(kn);
+       }
 
-       ut->uu_kqueue_bound = NULL;
-       ut->uu_kqueue_qos_index = 0;
-       ut->uu_kqueue_flags = 0;
+       procwait = (kq->kq_state & KQ_PROCWAIT);
+       kq->kq_state &= ~(KQ_PROCESSING | KQ_PROCWAIT);
 
-       kqunlock(kq);
+       if (procwait) {
+               /* first wake up any thread already waiting to process */
+               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
+                                  CAST_EVENT64_T(suppressq),
+                                  THREAD_AWAKENED,
+                                  WAITQ_ALL_PRIORITIES);
+       }
 }
 
-/*
- *     kevent_qos_internal_unbind - unbind thread from processing kqueue
- *
- *     End processing the per-QoS bucket of events and allow other threads
- *     to be requested for future servicing.  
- *
- *     caller holds a reference on the kqueue.
- *     thread is the current thread.
- */
-
-int
-kevent_qos_internal_unbind(
-       struct proc *p,
-       int qos_class,
-       thread_t thread,
-       unsigned int flags)
+static int
+kqueue_workloop_ctl_internal(proc_t p, uintptr_t cmd, uint64_t __unused options,
+               struct kqueue_workloop_params *params, int *retval)
 {
-#pragma unused(qos_class)
-
-       struct uthread *ut;
+       int error = 0;
+       int fd;
+       struct fileproc *fp;
        struct kqueue *kq;
-       unsigned int bound_flags;
-       bool check_flags;
+       struct kqworkloop *kqwl;
+       struct filedesc *fdp = p->p_fd;
+       workq_threadreq_param_t trp = { };
 
-       ut = get_bsdthread_info(thread);
-       if (ut->uu_kqueue_bound == NULL) {
-               /* early out if we are already unbound */
-               assert(ut->uu_kqueue_flags == 0);
-               assert(ut->uu_kqueue_qos_index == 0);
-               assert(ut->uu_kqueue_override_is_sync == 0);
-               return EALREADY;
-       }
+       switch (cmd) {
+       case KQ_WORKLOOP_CREATE:
+               if (!params->kqwlp_flags) {
+                       error = EINVAL;
+                       break;
+               }
 
-       assert(flags & (KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP));
-       assert(thread == current_thread());
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) &&
+                               (params->kqwlp_sched_pri < 1 ||
+                                params->kqwlp_sched_pri > 63 /* MAXPRI_USER */)) {
+                       error = EINVAL;
+                       break;
+               }
 
-       check_flags = flags & KEVENT_FLAG_UNBIND_CHECK_FLAGS;
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) &&
+                               invalid_policy(params->kqwlp_sched_pol)) {
+                       error = EINVAL;
+                       break;
+               }
 
-       /* Get the kqueue we started with */
-       kq = ut->uu_kqueue_bound;
-       assert(kq != NULL);
-       assert(kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP));
+               if ((params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) &&
+                               (params->kqwlp_cpu_percent <= 0 ||
+                                params->kqwlp_cpu_percent > 100 ||
+                                params->kqwlp_cpu_refillms <= 0 ||
+                                params->kqwlp_cpu_refillms > 0x00ffffff)) {
+                       error = EINVAL;
+                       break;
+               }
 
-       /* get flags and QoS parameters we started with */
-       bound_flags = ut->uu_kqueue_flags;
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_PRI) {
+                       trp.trp_flags |= TRP_PRIORITY;
+                       trp.trp_pri = params->kqwlp_sched_pri;
+               }
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_SCHED_POL) {
+                       trp.trp_flags |= TRP_POLICY;
+                       trp.trp_pol = params->kqwlp_sched_pol;
+               }
+               if (params->kqwlp_flags & KQ_WORKLOOP_CREATE_CPU_PERCENT) {
+                       trp.trp_flags |= TRP_CPUPERCENT;
+                       trp.trp_cpupercent = (uint8_t)params->kqwlp_cpu_percent;
+                       trp.trp_refillms = params->kqwlp_cpu_refillms;
+               }
 
-       /* Unbind from the class of workq */
-       if (kq->kq_state & KQ_WORKQ) {
-               if (check_flags && !(flags & KEVENT_FLAG_WORKQ)) {
-                       return EINVAL;
+               error = kevent_get_kq(p, params->kqwlp_id, &trp,
+                               KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                               KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                               KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST, &fp, &fd, &kq);
+               if (error) {
+                       break;
                }
 
-               kqworkq_internal_unbind(p, ut->uu_kqueue_qos_index, thread, bound_flags);
-       } else {
-               if (check_flags && !(flags & KEVENT_FLAG_WORKLOOP)) {
-                       return EINVAL;
+               if (!(fdp->fd_flags & FD_WORKLOOP)) {
+                       /* FD_WORKLOOP indicates we've ever created a workloop
+                        * via this syscall but it's only ever added to a process, never
+                        * removed.
+                        */
+                       proc_fdlock(p);
+                       fdp->fd_flags |= FD_WORKLOOP;
+                       proc_fdunlock(p);
+               }
+               break;
+       case KQ_WORKLOOP_DESTROY:
+               error = kevent_get_kq(p, params->kqwlp_id, NULL,
+                               KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                               KEVENT_FLAG_DYNAMIC_KQUEUE | KEVENT_FLAG_WORKLOOP |
+                               KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST, &fp, &fd, &kq);
+               if (error) {
+                       break;
+               }
+               kqlock(kq);
+               kqwl = (struct kqworkloop *)kq;
+               trp.trp_value = kqwl->kqwl_params;
+               if (trp.trp_flags && !(trp.trp_flags & TRP_RELEASED)) {
+                       trp.trp_flags |= TRP_RELEASED;
+                       kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
+               } else {
+                       error = EINVAL;
                }
+               kqunlock(kq);
+               kqueue_release_last(p, kq);
+               break;
+       }
+       *retval = 0;
+       return error;
+}
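
/*
 * Illustrative sketch (assumption-level; not the real struct
 * kqueue_workloop_params layout): the range checks KQ_WORKLOOP_CREATE applies
 * above, gathered into one helper.  struct wl_params_model and the WLP_*
 * flags merely mirror the fields and limits named in this file;
 * invalid_policy() is kernel-internal and therefore only noted in a comment.
 */
#include <stdbool.h>
#include <stdint.h>

#define WLP_SCHED_PRI   0x1     /* stands in for KQ_WORKLOOP_CREATE_SCHED_PRI */
#define WLP_SCHED_POL   0x2     /* stands in for KQ_WORKLOOP_CREATE_SCHED_POL */
#define WLP_CPU_PERCENT 0x4     /* stands in for KQ_WORKLOOP_CREATE_CPU_PERCENT */

struct wl_params_model {
        uint32_t flags;
        int32_t  sched_pri;     /* 1..63 (MAXPRI_USER) when WLP_SCHED_PRI is set */
        int32_t  sched_pol;     /* must pass the scheduler's policy check when set */
        int32_t  cpu_percent;   /* 1..100 when WLP_CPU_PERCENT is set */
        int32_t  cpu_refillms;  /* 1..0x00ffffff when WLP_CPU_PERCENT is set */
};

static bool
wl_params_valid(const struct wl_params_model *p)
{
        if (p->flags == 0)
                return false;                   /* creation needs at least one flag */
        if ((p->flags & WLP_SCHED_PRI) &&
            (p->sched_pri < 1 || p->sched_pri > 63))
                return false;
        if ((p->flags & WLP_CPU_PERCENT) &&
            (p->cpu_percent <= 0 || p->cpu_percent > 100 ||
             p->cpu_refillms <= 0 || p->cpu_refillms > 0x00ffffff))
                return false;
        /* the WLP_SCHED_POL value is validated with invalid_policy() in the kernel */
        return true;
}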
+
+int
+kqueue_workloop_ctl(proc_t p, struct kqueue_workloop_ctl_args *uap, int *retval)
+{
+       struct kqueue_workloop_params params = {
+               .kqwlp_id = 0,
+       };
+       if (uap->sz < sizeof(params.kqwlp_version)) {
+               return EINVAL;
+       }
 
-               kqworkloop_internal_unbind(p, thread, bound_flags);
+       size_t copyin_sz = MIN(sizeof(params), uap->sz);
+       int rv = copyin(uap->addr, &params, copyin_sz);
+       if (rv) {
+               return rv;
        }
 
-       return 0;
+       if (params.kqwlp_version != (int)uap->sz) {
+               return EINVAL;
+       }
+
+       return kqueue_workloop_ctl_internal(p, uap->cmd, uap->options, &params,
+                       retval);
 }
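
/*
 * Illustrative sketch of the size-as-version copyin pattern used above, with
 * hypothetical names and memcpy standing in for copyin().  The caller passes
 * sizeof(its struct) as uap->sz; the kernel copies at most the portion it
 * understands, zero-fills the rest, and rejects the call if the embedded
 * version field does not equal the size the caller claimed.
 */
#include <errno.h>
#include <stdint.h>
#include <string.h>

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

struct params_model {
        int32_t version;        /* userspace stores sizeof(its struct) here */
        int64_t id;
};

static int
copyin_versioned(const void *uaddr, size_t usz, struct params_model *out)
{
        if (usz < sizeof(out->version))
                return EINVAL;                          /* cannot even read the version */

        memset(out, 0, sizeof(*out));                   /* unseen fields default to 0 */
        memcpy(out, uaddr, MIN(sizeof(*out), usz));     /* copyin() analogue */

        if (out->version != (int32_t)usz)
                return EINVAL;                          /* size and version must agree */
        return 0;
}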
 
 /*
  * kqueue_process - process the triggered events in a kqueue
  *
- *     Walk the queued knotes and validate that they are
- *     really still triggered events by calling the filter
- *     routines (if necessary).  Hold a use reference on
- *     the knote to avoid it being detached. For each event
- *     that is still considered triggered, invoke the
- *     callback routine provided.
+ *     Walk the queued knotes and validate that they are really still triggered
+ *     events by calling the filter routines (if necessary).
+ *
+ *     For each event that is still considered triggered, invoke the callback
+ *     routine provided.
  *
  *     caller holds a reference on the kqueue.
  *     kqueue locked on entry and exit - but may be dropped
  *     kqueue list locked (held for duration of call)
  */
-
 static int
 kqueue_process(struct kqueue *kq,
-    kevent_callback_t callback,
-    void *callback_data,
-    struct filt_process_s *process_data,
-    int *countp,
-    struct proc *p)
+               kevent_callback_t callback,
+               void *callback_data,
+               struct filt_process_s *process_data,
+               int *countp)
 {
-       unsigned int flags = process_data ? process_data->fp_flags : 0;
        struct uthread *ut = get_bsdthread_info(current_thread());
-       kq_index_t start_index, end_index, i;
+       struct kqrequest *kqr = ut->uu_kqr_bound;
        struct knote *kn;
-       int nevents = 0;
-       int error = 0;
+       unsigned int flags = process_data ? process_data->fp_flags : 0;
+       int nevents = 0, error = 0, rc = 0;
+       struct kqtailq *base_queue, *queue;
+       kqueue_t kqu = { .kq = kq };
+#if DEBUG || DEVELOPMENT
+       int retries = 64;
+#endif
 
-       /*
-        * Based on the mode of the kqueue and the bound QoS of the servicer,
-        * determine the range of thread requests that need checking
-        */
        if (kq->kq_state & KQ_WORKQ) {
-               if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
-                       start_index = KQWQ_QOS_MANAGER;
-               } else if (ut->uu_kqueue_bound != kq) {
+               if (kqr == NULL || (kqr->kqr_state & KQR_WORKLOOP)) {
                        return EJUSTRETURN;
-               } else {
-                       start_index = ut->uu_kqueue_qos_index;
                }
-
-               /* manager services every request in a workq kqueue */
-               assert(start_index > 0 && start_index <= KQWQ_QOS_MANAGER);
-               end_index = (start_index == KQWQ_QOS_MANAGER) ? 0 : start_index;
-
+               rc = kqworkq_begin_processing(kqu.kqwq, kqr, flags);
        } else if (kq->kq_state & KQ_WORKLOOP) {
-               if (ut->uu_kqueue_bound != kq)
+               if (ut->uu_kqr_bound != &kqu.kqwl->kqwl_request) {
                        return EJUSTRETURN;
-
-               /*
-                * Single request servicing
-                * we want to deliver all events, regardless of the QOS
-                */
-               start_index = end_index = THREAD_QOS_UNSPECIFIED;
+               }
+               rc = kqworkloop_begin_processing(kqu.kqwl, flags);
        } else {
-               start_index = end_index = QOS_INDEX_KQFILE;
+               rc = kqfile_begin_processing(kq);
        }
-       
-       i = start_index;
 
-       do {
-               if (kqueue_begin_processing(kq, i, flags) == -1) {
-                       *countp = 0;
-                       /* Nothing to process */
-                       continue;
-               }
+       if (rc == -1) {
+               /* Nothing to process */
+               *countp = 0;
+               return 0;
+       }
 
-               /*
-                * loop through the enqueued knotes associated with this request,
-                * processing each one. Each request may have several queues
-                * of knotes to process (depending on the type of kqueue) so we
-                * have to loop through all the queues as long as we have additional
-                * space.
-                */
-               error = 0;
+       /*
+        * loop through the enqueued knotes associated with this request,
+        * processing each one. Each request may have several queues
+        * of knotes to process (depending on the type of kqueue) so we
+        * have to loop through all the queues as long as we have additional
+        * space.
+        */
 
-               struct kqtailq *base_queue = kqueue_get_base_queue(kq, i);
-               struct kqtailq *queue = kqueue_get_high_queue(kq, i);
-               do {
-                       while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
-                               error = knote_process(kn, callback, callback_data, process_data, p);
-                               if (error == EJUSTRETURN) {
-                                       error = 0;
-                               } else {
-                                       nevents++;
-                               }
-                               /* error is EWOULDBLOCK when the out event array is full */
-                       }
-               } while (error == 0 && queue-- > base_queue);
+process_again:
+       if (kq->kq_state & KQ_WORKQ) {
+               base_queue = queue = &kqu.kqwq->kqwq_queue[kqr->kqr_qos_index];
+       } else if (kq->kq_state & KQ_WORKLOOP) {
+               base_queue = &kqu.kqwl->kqwl_queue[0];
+               queue = &kqu.kqwl->kqwl_queue[KQWL_NBUCKETS - 1];
+       } else {
+               base_queue = queue = &kq->kq_queue[QOS_INDEX_KQFILE];
+       }
 
-               if ((kq->kq_state & KQ_WORKQ) == 0) {
-                       kqueue_end_processing(kq, i, nevents, flags);
+       do {
+               while (error == 0 && (kn = TAILQ_FIRST(queue)) != NULL) {
+                       error = knote_process(kn, callback, callback_data, process_data);
+                       if (error == EJUSTRETURN) {
+                               error = 0;
+                       } else {
+                               nevents++;
+                       }
+                       /* error is EWOULDBLOCK when the out event array is full */
                }
 
                if (error == EWOULDBLOCK) {
@@ -5933,10 +5421,40 @@ kqueue_process(struct kqueue *kq,
                        error = 0;
                        break;
                }
-       } while (i-- > end_index);
+       } while (queue-- > base_queue);
 
        *countp = nevents;
-       return (error);
+
+       /*
+        * If KEVENT_FLAG_PARKING is set, and no kevents have been returned,
+        * we want to unbind the kqrequest from the thread.
+        *
+        * However, because the kq locks are dropped several times during processing,
+        * new knotes may have fired again, in which case, we want to fail the end
+        * processing and process again, until it converges.
+        *
+        * If we returned events however, end processing never fails.
+        */
+       if (error || nevents) flags &= ~KEVENT_FLAG_PARKING;
+       if (kq->kq_state & KQ_WORKQ) {
+               rc = kqworkq_end_processing(kqu.kqwq, kqr, flags);
+       } else if (kq->kq_state & KQ_WORKLOOP) {
+               rc = kqworkloop_end_processing(kqu.kqwl, KQ_PROCESSING, flags);
+       } else {
+               kqfile_end_processing(kq);
+               rc = 0;
+       }
+       if (rc == -1) {
+               assert(flags & KEVENT_FLAG_PARKING);
+#if DEBUG || DEVELOPMENT
+               if (retries-- == 0) {
+                       panic("kevent: way too many knote_process retries, kq: %p (0x%02x)",
+                                       kq, kq->kq_state);
+               }
+#endif
+               goto process_again;
+       }
+       return error;
 }
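
/*
 * Illustrative standalone model (hypothetical, single-threaded) of the
 * "process until end-processing succeeds" convergence loop that
 * kqueue_process() runs for KEVENT_FLAG_PARKING.  A plain counter stands in
 * for the knote queues and a decrementing "refires" counter simulates
 * stay-active knotes firing while the kq locks are dropped; the retry bound
 * mirrors the DEBUG || DEVELOPMENT check above.
 */
#include <assert.h>
#include <stdio.h>

static int pending = 3;  /* stands in for queued knotes */
static int refires = 2;  /* a source that fires again twice after being drained */

static int
process_once(void)
{
        int delivered = pending;

        pending = 0;
        if (refires > 0) {      /* simulate knotes firing while locks are dropped */
                refires--;
                pending = 1;
        }
        return delivered;
}

/* Fails (-1) when new events arrived since the last process_once(). */
static int
try_end_processing(void)
{
        return pending ? -1 : 0;
}

static void
drain_until_quiescent(void)
{
        int retries = 64;       /* mirrors the DEBUG || DEVELOPMENT retry bound */

        for (;;) {
                (void)process_once();
                if (try_end_processing() == 0)
                        break;  /* parked: nothing fired while we were processing */
                assert(--retries > 0);
        }
}

int
main(void)
{
        drain_until_quiescent();
        printf("quiescent after draining\n");
        return 0;
}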
 
 static void
@@ -5944,7 +5462,7 @@ kqueue_scan_continue(void *data, wait_result_t wait_result)
 {
        thread_t self = current_thread();
        uthread_t ut = (uthread_t)get_bsdthread_info(self);
-       struct _kqueue_scan * cont_args = &ut->uu_kevent.ss_kqueue_scan;
+       struct _kqueue_scan * cont_args = &ut->uu_save.uus_kqueue_scan;
        struct kqueue *kq = (struct kqueue *)data;
        struct filt_process_s *process_data = cont_args->process_data;
        int error;
@@ -5955,8 +5473,8 @@ kqueue_scan_continue(void *data, wait_result_t wait_result)
        case THREAD_AWAKENED: {
                kqlock(kq);
        retry:
-               error = kqueue_process(kq, cont_args->call, cont_args->data, 
-                                      process_data, &count, current_proc());
+               error = kqueue_process(kq, cont_args->call, cont_args->data,
+                                      process_data, &count);
                if (error == 0 && count == 0) {
                        if (kq->kq_state & KQ_DRAIN) {
                                kqunlock(kq);
@@ -6011,7 +5529,6 @@ kqueue_scan_continue(void *data, wait_result_t wait_result)
  *     The callback routine must be valid.
  *     The caller must hold a use-count reference on the kq.
  */
-
 int
 kqueue_scan(struct kqueue *kq,
            kevent_callback_t callback,
@@ -6019,7 +5536,7 @@ kqueue_scan(struct kqueue *kq,
            void *callback_data,
            struct filt_process_s *process_data,
            struct timeval *atvp,
-           struct proc *p)
+           __unused struct proc *p)
 {
        thread_continue_t cont = THREAD_CONTINUE_NULL;
        unsigned int flags;
@@ -6047,7 +5564,7 @@ kqueue_scan(struct kqueue *kq,
                 */
                kqlock(kq);
                error = kqueue_process(kq, callback, callback_data,
-                                      process_data, &count, p);
+                                      process_data, &count);
                if (error || count)
                        break; /* lock still held */
 
@@ -6070,12 +5587,12 @@ kqueue_scan(struct kqueue *kq,
                                deadline -= now;
                                clock_absolutetime_interval_to_deadline(deadline, &deadline);
                        } else {
-                               deadline = 0;   /* block forever */
+                               deadline = 0;   /* block forever */
                        }
 
                        if (continuation) {
                                uthread_t ut = (uthread_t)get_bsdthread_info(current_thread());
-                               struct _kqueue_scan *cont_args = &ut->uu_kevent.ss_kqueue_scan;
+                               struct _kqueue_scan *cont_args = &ut->uu_save.uus_kqueue_scan;
 
                                cont_args->call = callback;
                                cont_args->cont = continuation;
@@ -6134,9 +5651,9 @@ kqueue_scan(struct kqueue *kq,
 /*ARGSUSED*/
 static int
 kqueue_read(__unused struct fileproc *fp,
-    __unused struct uio *uio,
-    __unused int flags,
-    __unused vfs_context_t ctx)
+               __unused struct uio *uio,
+               __unused int flags,
+               __unused vfs_context_t ctx)
 {
        return (ENXIO);
 }
@@ -6144,9 +5661,9 @@ kqueue_read(__unused struct fileproc *fp,
 /*ARGSUSED*/
 static int
 kqueue_write(__unused struct fileproc *fp,
-    __unused struct uio *uio,
-    __unused int flags,
-    __unused vfs_context_t ctx)
+               __unused struct uio *uio,
+               __unused int flags,
+               __unused vfs_context_t ctx)
 {
        return (ENXIO);
 }
@@ -6154,9 +5671,9 @@ kqueue_write(__unused struct fileproc *fp,
 /*ARGSUSED*/
 static int
 kqueue_ioctl(__unused struct fileproc *fp,
-    __unused u_long com,
-    __unused caddr_t data,
-    __unused vfs_context_t ctx)
+               __unused u_long com,
+               __unused caddr_t data,
+               __unused vfs_context_t ctx)
 {
        return (ENOTTY);
 }
@@ -6164,7 +5681,7 @@ kqueue_ioctl(__unused struct fileproc *fp,
 /*ARGSUSED*/
 static int
 kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
-    __unused vfs_context_t ctx)
+               __unused vfs_context_t ctx)
 {
        struct kqueue *kq = (struct kqueue *)fp->f_data;
        struct kqtailq *queue;
@@ -6189,7 +5706,7 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
         */
        if (wq_link_id != NULL) {
                thread_t cur_act = current_thread();
-               struct uthread * ut = get_bsdthread_info(cur_act);
+               struct uthread * ut = get_bsdthread_info(cur_act);
 
                kq->kq_state |= KQ_SEL;
                waitq_link((struct waitq *)&kq->kq_wqs, ut->uu_wqset,
@@ -6212,12 +5729,12 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
                memcpy(wq_link_id, (void *)&wqptr, sizeof(void *));
        }
 
-       if (kqueue_begin_processing(kq, QOS_INDEX_KQFILE, 0) == -1) {
+       if (kqfile_begin_processing(kq) == -1) {
                kqunlock(kq);
                return (0);
        }
 
-       queue = kqueue_get_base_queue(kq, QOS_INDEX_KQFILE);
+       queue = &kq->kq_queue[QOS_INDEX_KQFILE];
        if (!TAILQ_EMPTY(queue)) {
                /*
                 * there is something queued - but it might be a
@@ -6239,26 +5756,27 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
                 * There were no regular events on the queue, so take
                 * a deeper look at the stay-queued ones we suppressed.
                 */
-               suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
+               suppressq = kqueue_get_suppressed_queue(kq, NULL);
                while ((kn = (struct knote *)TAILQ_FIRST(suppressq)) != NULL) {
-                       unsigned peek = 1;
-
-                       assert(!knoteuse_needs_boost(kn, NULL));
+                       KNOTE_LOCK_CTX(knlc);
+                       int result = 0;
 
                        /* If didn't vanish while suppressed - peek at it */
-                       if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
-                               peek = knote_fops(kn)->f_peek(kn);
-
-                               /* if it dropped while getting lock - move on */
-                               if (!knoteuse2kqlock(kq, kn, KNUSE_NONE))
-                                       continue;
+                       if ((kn->kn_status & KN_DROPPING) || !knote_lock(kq, kn, &knlc,
+                                       KNOTE_KQ_LOCK_ON_FAILURE)) {
+                               continue;
                        }
 
+                       result = filter_call(knote_fops(kn), f_peek(kn));
+
+                       kqlock(kq);
+                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_LOCK_ALWAYS);
+
                        /* unsuppress it */
                        knote_unsuppress(kn);
 
                        /* has data or it has to report a vanish */
-                       if (peek > 0) {
+                       if (result & FILTER_ACTIVE) {
                                retnum = 1;
                                goto out;
                        }
@@ -6266,7 +5784,7 @@ kqueue_select(struct fileproc *fp, int which, void *wq_link_id,
        }
 
 out:
-       kqueue_end_processing(kq, QOS_INDEX_KQFILE, retnum, 0);
+       kqfile_end_processing(kq);
        kqunlock(kq);
        return (retnum);
 }
@@ -6286,6 +5804,13 @@ kqueue_close(struct fileglob *fg, __unused vfs_context_t ctx)
        return (0);
 }
 
+/*
+ * Max depth of the nested kq path that can be created.
+ * Note that this has to be less than the size of kq_level
+ * to avoid wrapping around and mislabeling the level.
+ */
+#define MAX_NESTED_KQ 1000
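
/*
 * Illustrative sketch (hypothetical helper, not xnu code) of the
 * nesting-level rule that kqueue_kqfilter() enforces below: a kqueue watching
 * another kqueue keeps its level strictly above the child's, starting at 2,
 * and the chain may be at most MAX_NESTED_KQ deep so kq_level cannot wrap
 * around and mislabel the level.
 */
#include <stdbool.h>
#include <stdint.h>

#define MAX_NESTED_KQ_MODEL 1000

/* Returns false when attaching would nest too deep; otherwise raises *parent_level. */
static bool
update_parent_level(uint16_t *parent_level, uint16_t child_level)
{
        uint16_t plevel = (*parent_level == 0) ? 2 : *parent_level;

        if (plevel < child_level + 1) {
                if (child_level + 1 > MAX_NESTED_KQ_MODEL)
                        return false;           /* rejected with EINVAL in the kernel */
                plevel = (uint16_t)(child_level + 1);
        }
        *parent_level = plevel;
        return true;
}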
+
 /*ARGSUSED*/
 /*
  * The callers has taken a use-count reference on this kqueue and will donate it
@@ -6299,13 +5824,12 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
        struct kqfile *kqf = (struct kqfile *)kn->kn_fp->f_data;
        struct kqueue *kq = &kqf->kqf_kqueue;
        struct kqueue *parentkq = knote_get_kq(kn);
+       uint16_t plevel = 0;
 
        assert((kqf->kqf_state & KQ_WORKQ) == 0);
 
-       if (parentkq == kq ||
-           kn->kn_filter != EVFILT_READ) {
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+       if (parentkq == kq || kn->kn_filter != EVFILT_READ) {
+               knote_set_error(kn, EINVAL);
                return 0;
        }
 
@@ -6318,6 +5842,8 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
         * into another kqueue at a lower level than the potential
         * child (because it could indicate a cycle).  If that test
         * passes, we just mark the nesting levels accordingly.
+        *
+        * Only up to MAX_NESTED_KQ can be nested.
         */
 
        kqlock(parentkq);
@@ -6325,15 +5851,21 @@ kqueue_kqfilter(__unused struct fileproc *fp, struct knote *kn,
            parentkq->kq_level < kq->kq_level)
        {
                kqunlock(parentkq);
-               kn->kn_flags = EV_ERROR;
-               kn->kn_data = EINVAL;
+               knote_set_error(kn, EINVAL);
                return 0;
        } else {
                /* set parent level appropriately */
-               if (parentkq->kq_level == 0)
-                       parentkq->kq_level = 2;
-               if (parentkq->kq_level < kq->kq_level + 1)
-                       parentkq->kq_level = kq->kq_level + 1;
+               plevel = (parentkq->kq_level == 0)? 2: parentkq->kq_level;
+               if (plevel < kq->kq_level + 1) {
+                       if (kq->kq_level + 1 > MAX_NESTED_KQ) {
+                               kqunlock(parentkq);
+                               knote_set_error(kn, EINVAL);
+                               return 0;
+                       }
+                       plevel = kq->kq_level + 1;
+               }
+
+               parentkq->kq_level = plevel;
                kqunlock(parentkq);
 
                kn->kn_filtid = EVFILTID_KQREAD;
@@ -6408,10 +5940,8 @@ kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
 }
 
 /*
- * Interact with the pthread kext to request a servicing there.
- * Eventually, this will request threads at specific QoS levels.
- * For now, it only requests a dispatch-manager-QoS thread, and
- * only one-at-a-time.
+ * Interact with the pthread kext to request a servicing there at a specific QoS
+ * level.
  *
  * - Caller holds the workq request lock
  *
@@ -6419,279 +5949,296 @@ kqueue_stat(struct kqueue *kq, void *ub, int isstat64, proc_t p)
  *   so cannot do anything that could recurse on that.
  */
 static void
-kqworkq_request_thread(
-       struct kqworkq *kqwq, 
-       kq_index_t qos_index)
+kqueue_threadreq_initiate(struct kqueue *kq, struct kqrequest *kqr,
+               kq_index_t qos, int flags)
 {
-       struct kqrequest *kqr;
-
-       assert(kqwq->kqwq_state & KQ_WORKQ);
-       assert(qos_index < KQWQ_NQOS);
-
-       kqr = kqworkq_get_request(kqwq, qos_index);
-
        assert(kqr->kqr_state & KQR_WAKEUP);
+       assert(kqr->kqr_thread == THREAD_NULL);
+       assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
+       struct turnstile *ts = TURNSTILE_NULL;
 
-       /* 
-        * If we have already requested a thread, and it hasn't
-        * started processing yet, there's no use hammering away
-        * on the pthread kext.
-        */
-       if (kqr->kqr_state & KQR_THREQUESTED)
+       if (workq_is_exiting(kq->kq_p)) {
                return;
+       }
 
-       assert((kqr->kqr_state & KQR_BOUND) == 0);
-
-       /* request additional workq threads if appropriate */
-       if (pthread_functions != NULL &&
-           pthread_functions->workq_reqthreads != NULL) {
-               unsigned int flags = KEVENT_FLAG_WORKQ;
-               unsigned long priority;
-               thread_t wqthread;
+       /* Add a thread request reference on the kqueue. */
+       kqueue_retain(kq);
 
-               /* Compute the appropriate pthread priority */
-               priority = qos_from_qos_index(qos_index);
+       kq_req_held(kq);
 
-#if 0
-               /* JMM - for now remain compatible with old invocations */
-               /* set the over-commit flag on the request if needed */
-               if (kqr->kqr_state & KQR_THOVERCOMMIT)
-                       priority |= _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
-#endif /* 0 */
-
-               /* Compute a priority based on qos_index. */
-               struct workq_reqthreads_req_s request = {
-                       .priority = priority,
-                       .count = 1
-               };
-
-               /* mark that we are making a request */
-               kqr->kqr_state |= KQR_THREQUESTED;
-               if (qos_index == KQWQ_QOS_MANAGER)
-                       kqr->kqr_state |= KQWQ_THMANAGER;
+       if (kq->kq_state & KQ_WORKLOOP) {
+               __assert_only struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
+               assert(kqwl->kqwl_owner == THREAD_NULL);
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
+                               kqwl->kqwl_dynamicid, 0, qos, kqr->kqr_state);
+               ts = kqwl->kqwl_turnstile;
+       } else {
+               assert(kq->kq_state & KQ_WORKQ);
                KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_THREQUEST),
-                             0, qos_index,
-                             (((uintptr_t)kqr->kqr_override_index << 8) |
-                              (uintptr_t)kqr->kqr_state));
-               wqthread = (*pthread_functions->workq_reqthreads)(kqwq->kqwq_p, 1, &request);
-
-               /* We've been switched to the emergency/manager thread */
-               if (wqthread == (thread_t)-1) {
-                       assert(qos_index != KQWQ_QOS_MANAGER);
-                       kqr->kqr_state |= KQWQ_THMANAGER;
-                       return;
-               }
+                               -1, 0, qos, kqr->kqr_state);
+       }
+
+       kqr->kqr_state |= KQR_THREQUESTED;
 
+       /*
+        * New-style thread request supported.
+        * Provide the pthread kext a pointer to a workq_threadreq_s structure for
+        * its use until a corresponding kqueue_threadreq_bind callback.
+        */
+       if ((kq->kq_state & KQ_WORKLOOP) && current_proc() == kq->kq_p) {
+               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
+       }
+       if (qos == KQWQ_QOS_MANAGER) {
+               qos = WORKQ_THREAD_QOS_MANAGER;
+       }
+       if (!workq_kern_threadreq_initiate(kq->kq_p, kqr, ts, qos, flags)) {
                /*
-                * bind the returned thread identity
-                * This goes away when we switch to synchronous callback
-                * binding from the pthread kext.
+                * Process is shutting down or exec'ing.
+                * All the kqueues are going to be cleaned up
+                * soon. Forget we even asked for a thread -
+                * and make sure we don't ask for more.
                 */
-               if (wqthread != NULL) {
-                       kqworkq_bind_thread_impl(kqwq, qos_index, wqthread, flags);
-               }
+               kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
+               kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
        }
 }
 
 /*
- * If we aren't already busy processing events [for this QoS],
- * request workq thread support as appropriate.
+ * kqueue_threadreq_bind_prepost - prepost the bind to kevent
  *
- * TBD - for now, we don't segregate out processing by QoS.
+ * This is used when kqueue_threadreq_bind may cause a lock inversion.
+ */
+void
+kqueue_threadreq_bind_prepost(struct proc *p __unused, workq_threadreq_t req,
+               thread_t thread)
+{
+       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
+       struct uthread *ut = get_bsdthread_info(thread);
+
+       req->tr_binding_thread = thread;
+       ut->uu_kqr_bound = kqr;
+       req->tr_state = TR_STATE_BINDING;
+
+       struct kqworkloop *kqwl = kqr_kqworkloop(kqr);
+       if (kqwl && kqwl->kqwl_turnstile) {
+               struct turnstile *ts = kqwl->kqwl_turnstile;
+               /*
+                * While a thread request is in flight, the workqueue
+                * is the interlock for the turnstile and can update the inheritor.
+                */
+               turnstile_update_inheritor(ts, thread, TURNSTILE_IMMEDIATE_UPDATE |
+                               TURNSTILE_INHERITOR_THREAD);
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       }
+}
+
+/*
+ * kqueue_threadreq_bind_commit - commit a bind prepost
  *
- * - May be called with the kqueue's wait queue set locked,
- *   so cannot do anything that could recurse on that.
+ * The workq code has to commit any binding prepost before the thread has
+ * a chance to come back to userspace (and do kevent syscalls) or be aborted.
  */
+void
+kqueue_threadreq_bind_commit(struct proc *p, thread_t thread)
+{
+       struct uthread *ut = get_bsdthread_info(thread);
+       struct kqrequest *kqr = ut->uu_kqr_bound;
+       kqueue_t kqu = kqr_kqueue(p, kqr);
+
+       kq_req_lock(kqu);
+       if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, &kqr->kqr_req, thread, 0);
+       }
+       kq_req_unlock(kqu);
+}
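
/*
 * Illustrative minimal model (hypothetical types, not xnu code) of the
 * two-phase bind implemented above: kqueue_threadreq_bind_prepost() records
 * the chosen servicer and marks the request BINDING when completing the bind
 * right away would invert lock ordering; kqueue_threadreq_bind_commit()
 * finishes it later, but only if nothing completed the bind in between.
 * After a commit the request goes back to IDLE with the servicer recorded,
 * mirroring what kqueue_threadreq_bind() below does with kqr_thread.
 */
#include <stdio.h>

typedef enum { TR_MODEL_IDLE, TR_MODEL_BINDING } tr_state_model_t;

struct threadreq_model {
        tr_state_model_t state;
        int binding_thread;     /* stands in for tr_binding_thread */
        int bound_thread;       /* stands in for kqr_thread */
};

static void
bind_prepost(struct threadreq_model *tr, int thread)
{
        tr->binding_thread = thread;
        tr->state = TR_MODEL_BINDING;   /* full bind deferred to avoid a lock inversion */
}

static void
bind_commit(struct threadreq_model *tr, int thread)
{
        if (tr->state == TR_MODEL_BINDING) {    /* still pending: complete the bind now */
                tr->bound_thread = thread;
                tr->binding_thread = 0;
                tr->state = TR_MODEL_IDLE;
        }
}

int
main(void)
{
        struct threadreq_model tr = { TR_MODEL_IDLE, 0, 0 };

        bind_prepost(&tr, 42);
        bind_commit(&tr, 42);
        printf("bound servicer: %d\n", tr.bound_thread);
        return 0;
}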
+
 static void
-kqworkq_request_help(
-       struct kqworkq *kqwq, 
-       kq_index_t qos_index)
+kqueue_threadreq_modify(struct kqueue *kq, struct kqrequest *kqr, kq_index_t qos)
 {
-       struct kqrequest *kqr;
+       assert(kqr->kqr_state & KQR_THREQUESTED);
+       assert(kqr->kqr_thread == THREAD_NULL);
 
-       /* convert to thread qos value */
-       assert(qos_index < KQWQ_NQOS);
-       
-       kqwq_req_lock(kqwq);
-       kqr = kqworkq_get_request(kqwq, qos_index);
+       kq_req_held(kq);
 
-       if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
-               /* Indicate that we needed help from this request */
-               kqr->kqr_state |= KQR_WAKEUP;
+       int flags = 0;
+       if ((kq->kq_state & KQ_WORKLOOP) && kq->kq_p == current_proc()) {
+               flags |= WORKQ_THREADREQ_SET_AST_ON_FAILURE;
+       }
+       workq_kern_threadreq_modify(kq->kq_p, kqr, qos, flags);
+}
+
+/*
+ * kqueue_threadreq_bind - bind thread to processing kqrequest
+ *
+ * The provided thread will be responsible for delivering events
+ * associated with the given kqrequest.  Bind it and get ready for
+ * the thread to eventually arrive.
+ */
+void
+kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req, thread_t thread,
+               unsigned int flags)
+{
+       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
+       kqueue_t kqu = kqr_kqueue(p, kqr);
+       struct uthread *ut = get_bsdthread_info(thread);
+
+       kq_req_held(kqu);
+
+       assert(kqr->kqr_state & KQR_THREQUESTED);
+       assert(kqr->kqr_thread == THREAD_NULL);
+       assert(ut->uu_kqueue_override == 0);
+
+       if (kqr->kqr_req.tr_state == TR_STATE_BINDING) {
+               assert(ut->uu_kqr_bound == kqr);
+               assert(kqr->kqr_req.tr_binding_thread == thread);
+               kqr->kqr_req.tr_state = TR_STATE_IDLE;
+               kqr->kqr_req.tr_binding_thread = NULL;
+       } else {
+               assert(ut->uu_kqr_bound == NULL);
+       }
+
+       ut->uu_kqr_bound = kqr;
+       kqr->kqr_thread = thread;
+
+       if (kqu.kq->kq_state & KQ_WORKLOOP) {
+               struct turnstile *ts = kqu.kqwl->kqwl_turnstile;
+
+               if (__improbable(thread == kqu.kqwl->kqwl_owner)) {
+                       /*
+                        * <rdar://problem/38626999> shows that asserting here is not ok.
+                        *
+                        * This is not supposed to happen for correct use of the interface,
+                        * but it is sadly possible for userspace (with the help of memory
+                        * corruption, such as over-release of a dispatch queue) to make
+                        * the creator thread the "owner" of a workloop.
+                        *
+                        * Once that happens, and that creator thread picks up the same
+                        * workloop as a servicer, we trip this codepath. We need to fix up
+                        * the state to forget about this thread being the owner, as the
+                        * entire workloop state machine expects servicers to never be
+                        * owners and everything would basically go downhill from here.
+                        */
+                       kqu.kqwl->kqwl_owner = THREAD_NULL;
+                       if (kqworkloop_owner_override(kqu.kqwl)) {
+                               thread_drop_ipc_override(thread);
+                       }
+                       thread_ends_owning_workloop(thread);
+               }
 
-               /* Go assure a thread request has been made */
-               kqworkq_request_thread(kqwq, qos_index);
+               if (ts && (flags & KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE) == 0) {
+                       /*
+                        * Past this point, the interlock is the kq req lock again,
+                        * so we can fix the inheritor for good.
+                        */
+                       filt_wlupdate_inheritor(kqu.kqwl, ts, TURNSTILE_IMMEDIATE_UPDATE);
+                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               }
+
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_BIND), kqu.kqwl->kqwl_dynamicid,
+                               thread_tid(thread), kqr->kqr_qos_index,
+                               (kqr->kqr_override_index << 16) | kqr->kqr_state);
+
+               ut->uu_kqueue_override = kqr->kqr_override_index;
+               if (kqr->kqr_override_index) {
+                       thread_add_ipc_override(thread, kqr->kqr_override_index);
+               }
+       } else {
+               assert(kqr->kqr_override_index == 0);
+
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_BIND), -1,
+                               thread_tid(thread), kqr->kqr_qos_index,
+                               (kqr->kqr_override_index << 16) | kqr->kqr_state);
        }
-       kqwq_req_unlock(kqwq);
 }
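
   [Illustrative sketch, not part of this change: the KDBG_FILTERED() calls above pack two small
   fields into a single trace argument as (override << 16) | state. A standalone sketch of that
   packing and its inverse, assuming 16-bit field widths for illustration:

   #include <stdint.h>

   static uint64_t
   pack_trace_arg(uint16_t override_index, uint16_t state)
   {
           return ((uint64_t)override_index << 16) | state;
   }

   static void
   unpack_trace_arg(uint64_t arg, uint16_t *override_index, uint16_t *state)
   {
           *override_index = (uint16_t)(arg >> 16);
           *state          = (uint16_t)(arg & 0xffff);
   }
   ]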
 
-static void
-kqworkloop_threadreq_impl(struct kqworkloop *kqwl, kq_index_t qos_index)
+/*
+ * kqueue_threadreq_cancel - abort a pending thread request
+ *
+ * Called when exiting/exec'ing. Forget our pending request.
+ */
+void
+kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
-       int op, ret;
+       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
+       kqueue_t kqu = kqr_kqueue(p, kqr);
 
-       assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
+       kq_req_lock(kqu);
 
-       /*
-        * New-style thread request supported. Provide
-        * the pthread kext a pointer to a workq_threadreq_s
-        * structure for its use until a corresponding
-        * workloop_fulfill_threqreq callback.
-        */
-       if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
-               op = WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL;
-       } else {
-               op = WORKQ_THREADREQ_WORKLOOP;
-       }
-again:
-       ret = (*pthread_functions->workq_threadreq)(kqwl->kqwl_p, &kqr->kqr_req,
-                       WORKQ_THREADREQ_WORKLOOP, pri, 0);
-       switch (ret) {
-       case ENOTSUP:
-               assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
-               op = WORKQ_THREADREQ_WORKLOOP;
-               goto again;
-
-       case ECANCELED:
-       case EINVAL:
-               /*
-                * Process is shutting down or exec'ing.
-                * All the kqueues are going to be cleaned up
-                * soon. Forget we even asked for a thread -
-                * and make sure we don't ask for more.
-                */
-               kqueue_release((struct kqueue *)kqwl, KQUEUE_CANT_BE_LAST_REF);
-               kqr->kqr_state &= ~KQR_THREQUESTED;
-               kqr->kqr_state |= KQR_DRAIN;
-               break;
+       assert(kqr->kqr_thread == THREAD_NULL);
+       assert(kqr->kqr_state & KQR_THREQUESTED);
+       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
 
-       case EAGAIN:
-               assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
-               act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
-               break;
+       kq_req_unlock(kqu);
 
-       default:
-               assert(ret == 0);
-       }
+       kqueue_release_last(p, kqu); /* may dealloc kqu */
 }
 
-static void
-kqworkloop_threadreq_modify(struct kqworkloop *kqwl, kq_index_t qos_index)
+workq_threadreq_param_t
+kqueue_threadreq_workloop_param(workq_threadreq_t req)
 {
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       unsigned long pri = pthread_priority_for_kqrequest(kqr, qos_index);
-       int ret, op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
+       struct kqrequest *kqr = __container_of(req, struct kqrequest, kqr_req);
+       struct kqworkloop *kqwl;
+       workq_threadreq_param_t trp;
 
-       assert((kqr->kqr_state & (KQR_THREQUESTED | KQR_BOUND)) == KQR_THREQUESTED);
+       assert(kqr->kqr_state & KQR_WORKLOOP);
+       kqwl = __container_of(kqr, struct kqworkloop, kqwl_request);
+       trp.trp_value = kqwl->kqwl_params;
+       return trp;
+}
 
-       if (current_proc() == kqwl->kqwl_kqueue.kq_p) {
-               op = WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL;
+/*
+ *     kqueue_threadreq_unbind - unbind thread from processing kqueue
+ *
+ *     End processing the per-QoS bucket of events and allow other threads
+ *     to be requested for future servicing.
+ *
+ *     caller holds a reference on the kqueue.
+ */
+void
+kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr)
+{
+       if (kqr->kqr_state & KQR_WORKLOOP) {
+               kqworkloop_unbind(p, kqr_kqworkloop(kqr));
        } else {
-               op = WORKQ_THREADREQ_CHANGE_PRI;
-       }
-again:
-       ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
-                       &kqr->kqr_req, op, pri, 0);
-       switch (ret) {
-       case ENOTSUP:
-               assert(op == WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL);
-               op = WORKQ_THREADREQ_CHANGE_PRI;
-               goto again;
-
-       case EAGAIN:
-               assert(op == WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL);
-               act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
-               break;
-
-       case ECANCELED:
-       case EINVAL:
-       case 0:
-               break;
-
-       default:
-               assert(ret == 0);
+               kqworkq_unbind(p, kqr);
        }
 }
 
 /*
- * Interact with the pthread kext to request a servicing thread.
- * This will request a single thread at the highest QoS level
- * for which there is work (whether that was the requested QoS
- * for an event or an override applied to a lower-QoS request).
+ * If we aren't already busy processing events [for this QoS],
+ * request workq thread support as appropriate.
  *
- * - Caller holds the workloop request lock
+ * TBD - for now, we don't segregate out processing by QoS.
  *
  * - May be called with the kqueue's wait queue set locked,
  *   so cannot do anything that could recurse on that.
  */
 static void
-kqworkloop_request_thread(struct kqworkloop *kqwl, kq_index_t qos_index)
+kqworkq_request_help(struct kqworkq *kqwq, kq_index_t qos_index)
 {
        struct kqrequest *kqr;
 
-       assert(kqwl->kqwl_state & KQ_WORKLOOP);
-
-       kqr = &kqwl->kqwl_request;
-
-       assert(kqwl->kqwl_owner == THREAD_NULL);
-       assert((kqr->kqr_state & KQR_BOUND) == 0);
-       assert((kqr->kqr_state & KQR_THREQUESTED) == 0);
-       assert(!(kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD));
-
-       /* If we're draining thread requests, just bail */
-       if (kqr->kqr_state & KQR_DRAIN)
-               return;
-
-       if (pthread_functions != NULL &&
-                       pthread_functions->workq_threadreq != NULL) {
-               /*
-                * set request state flags, etc... before calling pthread
-                * This assures they are set before a possible synchronous
-                * callback to workloop_fulfill_threadreq().
-                */
-               kqr->kqr_state |= KQR_THREQUESTED;
+       /* convert to thread qos value */
+       assert(qos_index < KQWQ_NBUCKETS);
 
-               /* Add a thread request reference on the kqueue. */
-               kqueue_retain((struct kqueue *)kqwl);
+       kq_req_lock(kqwq);
+       kqr = kqworkq_get_request(kqwq, qos_index);
 
-               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THREQUEST),
-                             kqwl->kqwl_dynamicid,
-                             0, qos_index, kqr->kqr_state);
-               kqworkloop_threadreq_impl(kqwl, qos_index);
-       } else {
-               panic("kqworkloop_request_thread");
-               return;
+       if ((kqr->kqr_state & KQR_WAKEUP) == 0) {
+               kqr->kqr_state |= KQR_WAKEUP;
+               if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
+                       kqueue_threadreq_initiate(&kqwq->kqwq_kqueue, kqr, qos_index, 0);
+               }
        }
+       kq_req_unlock(kqwq);
 }
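
   [Illustrative sketch, not part of this change: kqworkq_request_help() notes the wakeup and only
   initiates a new thread request when none is already outstanding. A minimal sketch of that
   idempotent "mark wakeup, request at most once" pattern, with hypothetical flag names:

   #include <stdbool.h>
   #include <stdint.h>

   #define R_WAKEUP      0x1u   /* events pending in this bucket */
   #define R_THREQUESTED 0x2u   /* a servicer thread was already requested */

   /* Returns true when the caller should actually initiate a thread request. */
   static bool
   request_help(uint32_t *state)
   {
           if (*state & R_WAKEUP)
                   return false;            /* already noted, nothing more to do */
           *state |= R_WAKEUP;
           if ((*state & R_THREQUESTED) == 0) {
                   *state |= R_THREQUESTED;
                   return true;
           }
           return false;
   }
   ]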
 
-static void
-kqworkloop_update_sync_override_state(struct kqworkloop *kqwl, boolean_t sync_ipc_override)
-{
-       struct kqrequest *kqr = &kqwl->kqwl_request;
-       kqwl_req_lock(kqwl);
-       kqr->kqr_has_sync_override = sync_ipc_override;
-       kqwl_req_unlock(kqwl);
-
-}
-
-static inline kq_index_t
-kqworkloop_combined_qos(struct kqworkloop *kqwl, boolean_t *ipc_override_is_sync)
+static kq_index_t
+kqworkloop_owner_override(struct kqworkloop *kqwl)
 {
        struct kqrequest *kqr = &kqwl->kqwl_request;
-       kq_index_t override;
-
-       *ipc_override_is_sync = FALSE;
-       override = MAX(MAX(kqr->kqr_qos_index, kqr->kqr_override_index),
-                                       kqr->kqr_dsync_waiters_qos);
-
-       if (kqr->kqr_sync_suppress_count > 0 || kqr->kqr_has_sync_override) {
-               *ipc_override_is_sync = TRUE;
-               override = THREAD_QOS_USER_INTERACTIVE;
-       }
-       return override;
+       return MAX(kqr->kqr_qos_index, kqr->kqr_override_index);
 }
 
 static inline void
@@ -6699,12 +6246,10 @@ kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
 {
        struct kqrequest *kqr = &kqwl->kqwl_request;
 
-       kqwl_req_held(kqwl);
+       kq_req_held(kqwl);
 
        if (kqr->kqr_state & KQR_R2K_NOTIF_ARMED) {
-               assert(kqr->kqr_state & KQR_BOUND);
                assert(kqr->kqr_thread);
-
                kqr->kqr_state &= ~KQR_R2K_NOTIF_ARMED;
                act_set_astkevent(kqr->kqr_thread, AST_KEVENT_RETURN_TO_KERNEL);
        }
@@ -6713,17 +6258,13 @@ kqworkloop_request_fire_r2k_notification(struct kqworkloop *kqwl)
 static void
 kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
 {
-       const uint8_t KQWL_STAYACTIVE_FIRED_BIT = (1 << 0);
-
        struct kqrequest *kqr = &kqwl->kqwl_request;
-       boolean_t old_ipc_override_is_sync = FALSE;
-       kq_index_t old_qos = kqworkloop_combined_qos(kqwl, &old_ipc_override_is_sync);
        struct kqueue *kq = &kqwl->kqwl_kqueue;
-       bool static_thread = (kq->kq_state & KQ_NO_WQ_THREAD);
+       kq_index_t old_owner_override = kqworkloop_owner_override(kqwl);
        kq_index_t i;
 
        /* must hold the kqr lock */
-       kqwl_req_held(kqwl);
+       kq_req_held(kqwl);
 
        switch (op) {
        case KQWL_UTQ_UPDATE_WAKEUP_QOS:
@@ -6742,7 +6283,6 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                        kqr->kqr_wakeup_indexes |= KQWL_STAYACTIVE_FIRED_BIT;
                        qos = kqr->kqr_stayactive_qos;
                        assert(qos);
-                       assert(!static_thread);
                }
                if (kqr->kqr_wakeup_indexes & (1 << qos)) {
                        assert(kqr->kqr_state & KQR_WAKEUP);
@@ -6752,7 +6292,7 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                kqr->kqr_wakeup_indexes |= (1 << qos);
                kqr->kqr_state |= KQR_WAKEUP;
                kqworkloop_request_fire_r2k_notification(kqwl);
-               goto recompute_async;
+               goto recompute;
 
        case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
                assert(qos);
@@ -6761,19 +6301,25 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                        if (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT) {
                                assert(kqr->kqr_state & KQR_WAKEUP);
                                kqr->kqr_wakeup_indexes |= (1 << qos);
-                               goto recompute_async;
+                               goto recompute;
                        }
                }
                break;
 
+       case KQWL_UTQ_PARKING:
+       case KQWL_UTQ_UNBINDING:
+               kqr->kqr_override_index = qos;
+               /* FALLTHROUGH */
        case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
-               kqlock_held(kq); // to look at kq_queues
-               kqr->kqr_has_sync_override = FALSE;
+               if (op == KQWL_UTQ_RECOMPUTE_WAKEUP_QOS) {
+                       assert(qos == THREAD_QOS_UNSPECIFIED);
+               }
+               kqlock_held(kqwl); // to look at kq_queues
                i = KQWL_BUCKET_STAYACTIVE;
                if (TAILQ_EMPTY(&kqr->kqr_suppressed)) {
                        kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
                }
-               if (!TAILQ_EMPTY(&kq->kq_queue[i]) &&
+               if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i]) &&
                                (kqr->kqr_wakeup_indexes & KQWL_STAYACTIVE_FIRED_BIT)) {
                        /*
                         * If the KQWL_STAYACTIVE_FIRED_BIT is set, it means a stay active
@@ -6787,13 +6333,8 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                        kqr->kqr_wakeup_indexes = 0;
                }
                for (i = THREAD_QOS_UNSPECIFIED + 1; i < KQWL_BUCKET_STAYACTIVE; i++) {
-                       if (!TAILQ_EMPTY(&kq->kq_queue[i])) {
+                       if (!TAILQ_EMPTY(&kqwl->kqwl_queue[i])) {
                                kqr->kqr_wakeup_indexes |= (1 << i);
-                               struct knote *kn = TAILQ_FIRST(&kqwl->kqwl_kqueue.kq_queue[i]);
-                               if (i == THREAD_QOS_USER_INTERACTIVE &&
-                                   kn->kn_qos_override_is_sync) {
-                                       kqr->kqr_has_sync_override = TRUE;
-                               }
                        }
                }
                if (kqr->kqr_wakeup_indexes) {
@@ -6802,20 +6343,18 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                } else {
                        kqr->kqr_state &= ~KQR_WAKEUP;
                }
-               assert(qos == THREAD_QOS_UNSPECIFIED);
-               goto recompute_async;
+               goto recompute;
 
        case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
-               kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
-               assert(qos == THREAD_QOS_UNSPECIFIED);
-               goto recompute_async;
+               kqr->kqr_override_index = qos;
+               goto recompute;
 
        case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
-       recompute_async:
+       recompute:
                /*
-                * When modifying the wakeup QoS or the async override QoS, we always
-                * need to maintain our invariant that kqr_override_index is at least as
-                * large as the highest QoS for which an event is fired.
+                * When modifying the wakeup QoS or the override QoS, we always need to
+                * maintain our invariant that kqr_override_index is at least as large
+                * as the highest QoS for which an event is fired.
                 *
                 * However this override index can be larger when there is an overridden
                 * suppressed knote pushing on the kqueue.
@@ -6831,96 +6370,44 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
        case KQWL_UTQ_REDRIVE_EVENTS:
                break;
 
-       case KQWL_UTQ_SET_ASYNC_QOS:
-               filt_wlheld(kqwl);
+       case KQWL_UTQ_SET_QOS_INDEX:
                kqr->kqr_qos_index = qos;
                break;
 
-       case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
-               filt_wlheld(kqwl);
-               kqr->kqr_dsync_waiters_qos = qos;
-               break;
-
        default:
                panic("unknown kqwl thread qos update operation: %d", op);
        }
 
-       boolean_t new_ipc_override_is_sync = FALSE;
-       kq_index_t new_qos = kqworkloop_combined_qos(kqwl, &new_ipc_override_is_sync);
        thread_t kqwl_owner = kqwl->kqwl_owner;
        thread_t servicer = kqr->kqr_thread;
-       __assert_only int ret;
+       boolean_t qos_changed = FALSE;
+       kq_index_t new_owner_override = kqworkloop_owner_override(kqwl);
 
        /*
         * Apply the diffs to the owner if applicable
         */
-       if (filt_wlowner_is_valid(kqwl_owner)) {
+       if (kqwl_owner) {
 #if 0
                /* JMM - need new trace hooks for owner overrides */
                KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
-                               kqwl->kqwl_dynamicid,
-                               (kqr->kqr_state & KQR_BOUND) ? thread_tid(kqwl_owner) : 0,
-                               (kqr->kqr_qos_index << 8) | new_qos,
-                               (kqr->kqr_override_index << 8) | kqr->kqr_state);
+                               kqwl->kqwl_dynamicid, thread_tid(kqwl_owner), kqr->kqr_qos_index,
+                               (kqr->kqr_override_index << 16) | kqr->kqr_state);
 #endif
-               if (new_qos == kqr->kqr_dsync_owner_qos) {
+               if (new_owner_override == old_owner_override) {
                        // nothing to do
-               } else if (kqr->kqr_dsync_owner_qos == THREAD_QOS_UNSPECIFIED) {
-                       thread_add_ipc_override(kqwl_owner, new_qos);
-               } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
+               } else if (old_owner_override == THREAD_QOS_UNSPECIFIED) {
+                       thread_add_ipc_override(kqwl_owner, new_owner_override);
+               } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
                        thread_drop_ipc_override(kqwl_owner);
-               } else /* kqr->kqr_dsync_owner_qos != new_qos */ {
-                       thread_update_ipc_override(kqwl_owner, new_qos);
-               }
-               kqr->kqr_dsync_owner_qos = new_qos;
-
-               if (new_ipc_override_is_sync &&
-                       !kqr->kqr_owner_override_is_sync) {
-                       thread_add_sync_ipc_override(kqwl_owner);
-               } else if (!new_ipc_override_is_sync &&
-                       kqr->kqr_owner_override_is_sync) {
-                       thread_drop_sync_ipc_override(kqwl_owner);
+               } else /*  old_owner_override != new_owner_override */ {
+                       thread_update_ipc_override(kqwl_owner, new_owner_override);
                }
-               kqr->kqr_owner_override_is_sync = new_ipc_override_is_sync;
        }
 
        /*
         * apply the diffs to the servicer
         */
-       if (static_thread) {
-               /*
-                * Statically bound thread
-                *
-                * These threads don't participates in QoS overrides today, just wakeup
-                * the thread blocked on this kqueue if a new event arrived.
-                */
-
-               switch (op) {
-               case KQWL_UTQ_UPDATE_WAKEUP_QOS:
-               case KQWL_UTQ_UPDATE_STAYACTIVE_QOS:
-               case KQWL_UTQ_RECOMPUTE_WAKEUP_QOS:
-                       break;
-
-               case KQWL_UTQ_RESET_WAKEUP_OVERRIDE:
-               case KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE:
-               case KQWL_UTQ_REDRIVE_EVENTS:
-               case KQWL_UTQ_SET_ASYNC_QOS:
-               case KQWL_UTQ_SET_SYNC_WAITERS_QOS:
-                       panic("should never be called");
-                       break;
-               }
-
-               kqlock_held(kq);
-
-               if ((kqr->kqr_state & KQR_BOUND) && (kqr->kqr_state & KQR_WAKEUP)) {
-                       assert(servicer && !is_workqueue_thread(servicer));
-                       if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
-                               kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
-                               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
-                                               THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
-                       }
-               }
-       } else if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
+       if ((kqr->kqr_state & KQR_THREQUESTED) == 0) {
                /*
                 * No servicer, nor thread-request
                 *
@@ -6929,70 +6416,54 @@ kqworkloop_update_threads_qos(struct kqworkloop *kqwl, int op, kq_index_t qos)
                 * first place.
                 */
 
-               if (kqwl_owner == THREAD_NULL && (kqr->kqr_state & KQR_WAKEUP)) {
-                       kqworkloop_request_thread(kqwl, new_qos);
+               if (kqwl_owner == NULL && (kqr->kqr_state & KQR_WAKEUP)) {
+                       int initiate_flags = 0;
+                       if (op == KQWL_UTQ_UNBINDING) {
+                               initiate_flags = WORKQ_THREADREQ_ATTEMPT_REBIND;
+                       }
+                       kqueue_threadreq_initiate(kq, kqr, new_owner_override,
+                                       initiate_flags);
                }
-       } else if ((kqr->kqr_state & KQR_BOUND) == 0 &&
-                       (kqwl_owner || (kqr->kqr_state & KQR_WAKEUP) == 0)) {
+       } else if (servicer) {
                /*
-                * No servicer, thread request in flight we want to cancel
+                * Servicer in flight
                 *
-                * We just got rid of the last knote of the kqueue or noticed an owner
-                * with a thread request still in flight, take it back.
+                * Just apply the diff to the servicer
                 */
-               ret = (*pthread_functions->workq_threadreq_modify)(kqwl->kqwl_p,
-                               &kqr->kqr_req, WORKQ_THREADREQ_CANCEL, 0, 0);
-               if (ret == 0) {
-                       kqr->kqr_state &= ~KQR_THREQUESTED;
-                       kqueue_release(kq, KQUEUE_CANT_BE_LAST_REF);
+               struct uthread *ut = get_bsdthread_info(servicer);
+               if (ut->uu_kqueue_override != kqr->kqr_override_index) {
+                       if (ut->uu_kqueue_override == THREAD_QOS_UNSPECIFIED) {
+                               thread_add_ipc_override(servicer, kqr->kqr_override_index);
+                       } else if (kqr->kqr_override_index == THREAD_QOS_UNSPECIFIED) {
+                               thread_drop_ipc_override(servicer);
+                       } else /* ut->uu_kqueue_override != kqr->kqr_override_index */ {
+                               thread_update_ipc_override(servicer, kqr->kqr_override_index);
+                       }
+                       ut->uu_kqueue_override = kqr->kqr_override_index;
+                       qos_changed = TRUE;
                }
-       } else {
-               boolean_t qos_changed = FALSE;
-
+       } else if (new_owner_override == THREAD_QOS_UNSPECIFIED) {
                /*
-                * Servicer or request is in flight
+                * No events to deliver anymore.
                 *
-                * Just apply the diff to the servicer or the thread request
+                * However canceling with turnstiles is challenging, so the fact that
+                * the request isn't useful will be discovered by the servicer itself
+                * later on.
                 */
-               if (kqr->kqr_state & KQR_BOUND) {
-                       servicer = kqr->kqr_thread;
-                       struct uthread *ut = get_bsdthread_info(servicer);
-                       if (ut->uu_kqueue_qos_index != new_qos) {
-                               if (ut->uu_kqueue_qos_index == THREAD_QOS_UNSPECIFIED) {
-                                       thread_add_ipc_override(servicer, new_qos);
-                               } else if (new_qos == THREAD_QOS_UNSPECIFIED) {
-                                       thread_drop_ipc_override(servicer);
-                               } else /* ut->uu_kqueue_qos_index != new_qos */ {
-                                       thread_update_ipc_override(servicer, new_qos);
-                               }
-                               ut->uu_kqueue_qos_index = new_qos;
-                               qos_changed = TRUE;
-                       }
+       } else if (old_owner_override != new_owner_override) {
+               /*
+                * Request is in flight
+                *
+                * Apply the diff to the thread request
+                */
+               kqueue_threadreq_modify(kq, kqr, new_owner_override);
+               qos_changed = TRUE;
+       }
 
-                       if (new_ipc_override_is_sync != ut->uu_kqueue_override_is_sync) {
-                               if (new_ipc_override_is_sync &&
-                                   !ut->uu_kqueue_override_is_sync) {
-                                       thread_add_sync_ipc_override(servicer);
-                               } else if (!new_ipc_override_is_sync &&
-                                       ut->uu_kqueue_override_is_sync) {
-                                       thread_drop_sync_ipc_override(servicer);
-                               }
-                               ut->uu_kqueue_override_is_sync = new_ipc_override_is_sync;
-                               qos_changed = TRUE;
-                       }
-               } else if (old_qos != new_qos) {
-                       assert(new_qos);
-                       kqworkloop_threadreq_modify(kqwl, new_qos);
-                       qos_changed = TRUE;
-               }
-               if (qos_changed) {
-                       servicer = kqr->kqr_thread;
-                       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST),
-                               kqwl->kqwl_dynamicid,
-                               (kqr->kqr_state & KQR_BOUND) ? thread_tid(servicer) : 0,
-                               (kqr->kqr_qos_index << 16) | (new_qos << 8) | new_ipc_override_is_sync,
-                               (kqr->kqr_override_index << 8) | kqr->kqr_state);
-               }
+       if (qos_changed) {
+               KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_THADJUST), kqwl->kqwl_dynamicid,
+                               thread_tid(kqr->kqr_thread), kqr->kqr_qos_index,
+                               (kqr->kqr_override_index << 16) | kqr->kqr_state);
        }
 }
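
   [Illustrative sketch, not part of this change: both the owner and the servicer above receive the
   QoS change as a diff, using add/update/drop depending on whether the old or new value is
   unspecified. A standalone sketch of that dance, with hypothetical stand-ins for the
   thread_*_ipc_override() calls:

   #include <stdint.h>

   #define QOS_UNSPECIFIED 0u

   static void override_add(uint32_t qos)    { (void)qos; /* thread_add_ipc_override() stand-in */ }
   static void override_update(uint32_t qos) { (void)qos; /* thread_update_ipc_override() stand-in */ }
   static void override_drop(void)           { /* thread_drop_ipc_override() stand-in */ }

   static void
   apply_override_diff(uint32_t old_qos, uint32_t new_qos)
   {
           if (new_qos == old_qos) {
                   /* nothing to do */
           } else if (old_qos == QOS_UNSPECIFIED) {
                   override_add(new_qos);
           } else if (new_qos == QOS_UNSPECIFIED) {
                   override_drop();
           } else {
                   override_update(new_qos);
           }
   }
   ]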
 
@@ -7002,179 +6473,179 @@ kqworkloop_request_help(struct kqworkloop *kqwl, kq_index_t qos_index)
        /* convert to thread qos value */
        assert(qos_index < KQWL_NBUCKETS);
 
-       kqwl_req_lock(kqwl);
+       kq_req_lock(kqwl);
        kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_QOS, qos_index);
-       kqwl_req_unlock(kqwl);
+       kq_req_unlock(kqwl);
 }
 
-/*
- * These arrays described the low and high qindexes for a given qos_index.
- * The values come from the chart in <sys/eventvar.h> (must stay in sync).
- */
-static kq_index_t _kqwq_base_index[KQWQ_NQOS] = {0, 0, 6, 11, 15, 18, 20, 21};
-static kq_index_t _kqwq_high_index[KQWQ_NQOS] = {0, 5, 10, 14, 17, 19, 20, 21};
-
 static struct kqtailq *
-kqueue_get_base_queue(struct kqueue *kq, kq_index_t qos_index)
+kqueue_get_queue(struct kqueue *kq, kq_index_t qos_index)
 {
        if (kq->kq_state & KQ_WORKQ) {
-               assert(qos_index < KQWQ_NQOS);
-               return &kq->kq_queue[_kqwq_base_index[qos_index]];
+               assert(qos_index < KQWQ_NBUCKETS);
        } else if (kq->kq_state & KQ_WORKLOOP) {
                assert(qos_index < KQWL_NBUCKETS);
-               return &kq->kq_queue[qos_index];
        } else {
                assert(qos_index == QOS_INDEX_KQFILE);
-               return &kq->kq_queue[QOS_INDEX_KQFILE];
        }
+       static_assert(offsetof(struct kqueue, kq_queue) == sizeof(struct kqueue),
+                       "struct kqueue::kq_queue must be exactly at the end");
+       return &kq->kq_queue[qos_index];
+}
+
+static int
+kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
+{
+       return TAILQ_EMPTY(kqueue_get_queue(kq, qos_index));
 }
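
   [Illustrative sketch, not part of this change: the static_assert above appears to rely on
   kq_queue being the variable-length tail of struct kqueue, so each kqueue flavor can size the
   bucket array at allocation time. A standalone sketch of that layout trick with hypothetical
   types:

   #include <stddef.h>
   #include <stdlib.h>

   struct bucket { int head; };

   struct table {
           int           state;
           struct bucket buckets[];   /* tail array; length chosen per allocation */
   };

   /* Allocate a table whose tail array holds nbuckets entries. */
   static struct table *
   table_alloc(size_t nbuckets)
   {
           return calloc(1, offsetof(struct table, buckets) +
               nbuckets * sizeof(struct bucket));
   }
   ]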
 
 static struct kqtailq *
-kqueue_get_high_queue(struct kqueue *kq, kq_index_t qos_index)
+kqueue_get_suppressed_queue(kqueue_t kq, struct knote *kn)
 {
-       if (kq->kq_state & KQ_WORKQ) {
-               assert(qos_index < KQWQ_NQOS);
-               return &kq->kq_queue[_kqwq_high_index[qos_index]];
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               assert(qos_index < KQWL_NBUCKETS);
-               return &kq->kq_queue[KQWL_BUCKET_STAYACTIVE];
+       if (kq.kq->kq_state & KQ_WORKQ) {
+               return &kqworkq_get_request(kq.kqwq, kn->kn_qos_index)->kqr_suppressed;
+       } else if (kq.kq->kq_state & KQ_WORKLOOP) {
+               return &kq.kqwl->kqwl_request.kqr_suppressed;
        } else {
-               assert(qos_index == QOS_INDEX_KQFILE);
-               return &kq->kq_queue[QOS_INDEX_KQFILE];
+               return &kq.kqf->kqf_suppressed;
        }
 }
 
-static int
-kqueue_queue_empty(struct kqueue *kq, kq_index_t qos_index)
+static struct turnstile *
+kqueue_get_turnstile(kqueue_t kqu, bool can_alloc)
 {
-       struct kqtailq *base_queue = kqueue_get_base_queue(kq, qos_index);
-       struct kqtailq *queue = kqueue_get_high_queue(kq, qos_index);
+       uint8_t kqr_state;
 
-       do {
-               if (!TAILQ_EMPTY(queue))
-                       return 0;
-       } while (queue-- > base_queue);
-       return 1;
-}
+       if ((kqu.kq->kq_state & KQ_WORKLOOP) == 0) {
+               return TURNSTILE_NULL;
+       }
 
-static struct kqtailq *
-kqueue_get_suppressed_queue(struct kqueue *kq, kq_index_t qos_index)
-{
-    struct kqtailq *res;
-       struct kqrequest *kqr;
+       kqr_state = os_atomic_load(&kqu.kqwl->kqwl_request.kqr_state, relaxed);
+       if (kqr_state & KQR_ALLOCATED_TURNSTILE) {
+               /* force a dependency to pair with the atomic or with release below */
+               return os_atomic_load_with_dependency_on(&kqu.kqwl->kqwl_turnstile,
+                               kqr_state);
+       }
 
-       if (kq->kq_state & KQ_WORKQ) {
-               struct kqworkq *kqwq = (struct kqworkq *)kq;
+       if (!can_alloc) {
+               return TURNSTILE_NULL;
+       }
 
-               kqr = kqworkq_get_request(kqwq, qos_index);
-               res = &kqr->kqr_suppressed;
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       struct turnstile *ts = turnstile_alloc(), *free_ts = TURNSTILE_NULL;
+
+       kq_req_lock(kqu);
+       if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
+               workq_kern_threadreq_lock(kqu.kqwl->kqwl_p);
+       }
 
-               kqr = &kqwl->kqwl_request;
-               res = &kqr->kqr_suppressed;
+       if (kqu.kqwl->kqwl_request.kqr_state & KQR_ALLOCATED_TURNSTILE) {
+               free_ts = ts;
+               ts = kqu.kqwl->kqwl_turnstile;
        } else {
-               struct kqfile *kqf = (struct kqfile *)kq;
-               res = &kqf->kqf_suppressed;
+               ts = turnstile_prepare((uintptr_t)kqu.kqwl, &kqu.kqwl->kqwl_turnstile,
+                               ts, TURNSTILE_WORKLOOPS);
+
+               /* release-barrier to pair with the unlocked load of kqwl_turnstile above */
+               os_atomic_or(&kqu.kqwl->kqwl_request.kqr_state,
+                               KQR_ALLOCATED_TURNSTILE, release);
        }
-       return res;
-}
 
-static kq_index_t
-knote_get_queue_index(struct knote *kn)
-{
-       kq_index_t override_index = knote_get_qos_override_index(kn);
-       kq_index_t qos_index = knote_get_qos_index(kn);
-       struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t res;
+       if (filt_wlturnstile_interlock_is_workq(kqu.kqwl)) {
+               workq_kern_threadreq_unlock(kqu.kqwl->kqwl_p);
+       }
+       kq_req_unlock(kqu.kqwl);
 
-       if (kq->kq_state & KQ_WORKQ) {
-               res = _kqwq_base_index[qos_index];
-               if (override_index > qos_index)
-                       res += override_index - qos_index;
-               assert(res <= _kqwq_high_index[qos_index]);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               res = MAX(override_index, qos_index);
-               assert(res < KQWL_NBUCKETS);
-       } else {
-               assert(qos_index == QOS_INDEX_KQFILE);
-               assert(override_index == QOS_INDEX_KQFILE);
-               res = QOS_INDEX_KQFILE;
+       if (free_ts) {
+               turnstile_deallocate(free_ts);
        }
-       return res;
+       return ts;
 }
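
   [Illustrative sketch, not part of this change: kqueue_get_turnstile() reads kqwl_turnstile
   without the lock once KQR_ALLOCATED_TURNSTILE is set, and otherwise allocates and publishes it
   with a release barrier under the lock. A userspace sketch of the same allocate-once/publish
   pattern using C11 atomics; all names are hypothetical:

   #include <stdatomic.h>
   #include <stdbool.h>
   #include <stdlib.h>
   #include <pthread.h>

   struct lazy {
           _Atomic(void *) obj;    /* NULL until the object has been published */
           pthread_mutex_t lock;   /* serializes the allocation slow path */
   };

   static void *
   lazy_get(struct lazy *l, bool can_alloc)
   {
           void *p = atomic_load_explicit(&l->obj, memory_order_acquire);
           if (p != NULL || !can_alloc)
                   return p;                        /* fast path, no lock taken */

           void *fresh = malloc(64);                /* stand-in for turnstile_alloc() */
           pthread_mutex_lock(&l->lock);
           p = atomic_load_explicit(&l->obj, memory_order_relaxed);
           if (p == NULL) {
                   /* release pairs with the acquire load on the fast path */
                   atomic_store_explicit(&l->obj, fresh, memory_order_release);
                   p = fresh;
                   fresh = NULL;
           }
           pthread_mutex_unlock(&l->lock);
           free(fresh);                             /* drop our copy if we lost the race */
           return p;
   }
   ]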
 
-static struct kqtailq *
-knote_get_queue(struct knote *kn)
+struct turnstile *
+kqueue_turnstile(struct kqueue *kq)
 {
-       kq_index_t qindex = knote_get_queue_index(kn);
-
-       return &(knote_get_kq(kn))->kq_queue[qindex];
+       return kqueue_get_turnstile(kq, false);
 }
 
-static kq_index_t
-knote_get_req_index(struct knote *kn)
+struct turnstile *
+kqueue_alloc_turnstile(struct kqueue *kq)
 {
-       return kn->kn_req_index;
+       return kqueue_get_turnstile(kq, true);
 }
 
-static kq_index_t
-knote_get_qos_index(struct knote *kn)
+static struct kqtailq *
+knote_get_queue(struct knote *kn)
 {
-       return kn->kn_qos_index;
+       return kqueue_get_queue(knote_get_kq(kn), kn->kn_qos_index);
 }
 
 static void
-knote_set_qos_index(struct knote *kn, kq_index_t qos_index)
+knote_reset_priority(struct knote *kn, pthread_priority_t pp)
 {
        struct kqueue *kq = knote_get_kq(kn);
+       kq_index_t qos = _pthread_priority_thread_qos(pp);
 
-       assert(qos_index < KQWQ_NQOS);
        assert((kn->kn_status & KN_QUEUED) == 0);
 
        if (kq->kq_state & KQ_WORKQ) {
-               assert(qos_index > THREAD_QOS_UNSPECIFIED);
+               if (qos == THREAD_QOS_UNSPECIFIED) {
+                       /* On workqueues, outside of QoS means MANAGER */
+                       qos = KQWQ_QOS_MANAGER;
+                       pp = _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
+               } else {
+                       pp = _pthread_priority_normalize(pp);
+               }
        } else if (kq->kq_state & KQ_WORKLOOP) {
-               /* XXX this policy decision shouldn't be here */
-               if (qos_index == THREAD_QOS_UNSPECIFIED)
-                       qos_index = THREAD_QOS_LEGACY;
-       } else
-               qos_index = QOS_INDEX_KQFILE;
+               assert((pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) == 0);
+               pp = _pthread_priority_normalize(pp);
+       } else {
+               pp = _pthread_unspecified_priority();
+               qos = THREAD_QOS_UNSPECIFIED;
+       }
 
-       /* always set requested */
-       kn->kn_req_index = qos_index;
+       kn->kn_qos = pp;
+       kn->kn_req_index = qos;
+
+       if ((kn->kn_status & KN_MERGE_QOS) == 0 || qos > kn->kn_qos_override) {
+               /* Never lower QoS when in "Merge" mode */
+               kn->kn_qos_override = qos;
+       }
 
        /* only adjust in-use qos index when not suppressed */
-       if ((kn->kn_status & KN_SUPPRESSED) == 0)
-               kn->kn_qos_index = qos_index;
+       if ((kn->kn_status & KN_SUPPRESSED) == 0) {
+               kn->kn_qos_index = qos;
+       } else if (kq->kq_state & KQ_WORKQ) {
+               kqworkq_update_override((struct kqworkq *)kq, kn, qos);
+       } else if (kq->kq_state & KQ_WORKLOOP) {
+               kqworkloop_update_override((struct kqworkloop *)kq, qos);
+       }
 }
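
   [Illustrative sketch, not part of this change: knote_reset_priority() derives a thread QoS from
   the packed pthread priority and normalizes the stored value. The real pthread_priority_t
   encoding is not reproduced here; the sketch below uses a purely hypothetical layout only to
   show what extracting the QoS class and stripping everything else might look like:

   #include <stdint.h>

   /* Hypothetical packed-priority layout (NOT the real pthread_priority_t
    * encoding): QoS class in the low byte, flag bits in the top byte. */
   #define PRIO_QOS_MASK       0x000000ffu
   #define PRIO_FLAGS_MASK     0xff000000u
   #define PRIO_FLAG_MANAGER   0x01000000u

   static inline uint32_t
   prio_thread_qos(uint32_t pp)
   {
           return pp & PRIO_QOS_MASK;
   }

   /* "Normalize": keep only the QoS class and the flag bits. */
   static inline uint32_t
   prio_normalize(uint32_t pp)
   {
           return pp & (PRIO_QOS_MASK | PRIO_FLAGS_MASK);
   }
   ]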
 
 static void
 knote_set_qos_overcommit(struct knote *kn)
 {
        struct kqueue *kq = knote_get_kq(kn);
-       struct kqrequest *kqr;
 
        /* turn overcommit on for the appropriate thread request? */
-       if (kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
-               if (kq->kq_state & KQ_WORKQ) {
-                       kq_index_t qos_index = knote_get_qos_index(kn);
-                       struct kqworkq *kqwq = (struct kqworkq *)kq;
-
-                       kqr = kqworkq_get_request(kqwq, qos_index);
-
-                       kqwq_req_lock(kqwq);
-                       kqr->kqr_state |= KQR_THOVERCOMMIT;
-                       kqwq_req_unlock(kqwq);
-               } else if (kq->kq_state & KQ_WORKLOOP) {
-                       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       if ((kn->kn_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) &&
+                       (kq->kq_state & KQ_WORKLOOP)) {
+               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+               struct kqrequest *kqr = &kqwl->kqwl_request;
 
-                       kqr = &kqwl->kqwl_request;
+               /*
+                * This test is racy, but since we never remove this bit,
+                * it allows us to avoid taking a lock.
+                */
+               if (kqr->kqr_state & KQR_THOVERCOMMIT) {
+                       return;
+               }
 
-                       kqwl_req_lock(kqwl);
-                       kqr->kqr_state |= KQR_THOVERCOMMIT;
-                       kqwl_req_unlock(kqwl);
+               kq_req_lock(kqwl);
+               kqr->kqr_state |= KQR_THOVERCOMMIT;
+               if (!kqr->kqr_thread && (kqr->kqr_state & KQR_THREQUESTED)) {
+                       kqueue_threadreq_modify(kq, kqr, kqr->kqr_req.tr_qos);
                }
+               kq_req_unlock(kqwl);
        }
 }
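
   [Illustrative sketch, not part of this change: the comment above notes that testing
   KQR_THOVERCOMMIT without the lock is safe because the bit is never cleared. A small sketch of
   that "sticky flag" pattern with C11 atomics and hypothetical names:

   #include <stdatomic.h>
   #include <pthread.h>

   #define F_OVERCOMMIT 0x1u

   struct obj {
           pthread_mutex_t lock;
           atomic_uint     flags;   /* bits are only ever set, never cleared */
   };

   static void
   set_overcommit(struct obj *o)
   {
           /* Racy unlocked test is fine: once the bit is set it stays set. */
           if (atomic_load_explicit(&o->flags, memory_order_relaxed) & F_OVERCOMMIT)
                   return;

           pthread_mutex_lock(&o->lock);
           atomic_fetch_or_explicit(&o->flags, F_OVERCOMMIT, memory_order_relaxed);
           /* any dependent bookkeeping happens here, under the lock */
           pthread_mutex_unlock(&o->lock);
   }
   ]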
 
@@ -7185,490 +6656,309 @@ knote_get_qos_override_index(struct knote *kn)
 }
 
 static void
-knote_set_qos_override_index(struct knote *kn, kq_index_t override_index,
-               boolean_t override_is_sync)
-{
-       struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t qos_index = knote_get_qos_index(kn);
-       kq_index_t old_override_index = knote_get_qos_override_index(kn);
-       boolean_t old_override_is_sync = kn->kn_qos_override_is_sync;
-       uint32_t flags = 0;
-
-       assert((kn->kn_status & KN_QUEUED) == 0);
-
-       if (override_index == KQWQ_QOS_MANAGER) {
-               assert(qos_index == KQWQ_QOS_MANAGER);
-       } else {
-               assert(override_index < KQWQ_QOS_MANAGER);
-       }
-
-       kn->kn_qos_override = override_index;
-       kn->kn_qos_override_is_sync = override_is_sync;
-
-       /*
-        * If this is a workq/workloop kqueue, apply the override to the
-        * servicing thread.
-        */
-       if (kq->kq_state & KQ_WORKQ)  {
-               struct kqworkq *kqwq = (struct kqworkq *)kq;
-
-               assert(qos_index > THREAD_QOS_UNSPECIFIED);
-               kqworkq_update_override(kqwq, qos_index, override_index);
-       } else if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-
-               if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
-                       flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
-
-                       if (override_index == THREAD_QOS_USER_INTERACTIVE
-                                       && override_is_sync) {
-                               flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
-                       }
-
-                       if (old_override_index == THREAD_QOS_USER_INTERACTIVE
-                                       && old_override_is_sync) {
-                               flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
-                       }
-               }
-
-               assert(qos_index > THREAD_QOS_UNSPECIFIED);
-               kqworkloop_update_override(kqwl, qos_index, override_index, flags);
-       }
-}
-
-static kq_index_t
-knote_get_sync_qos_override_index(struct knote *kn)
-{
-       return kn->kn_qos_sync_override;
-}
-
-static void
-kqworkq_update_override(struct kqworkq *kqwq, kq_index_t qos_index, kq_index_t override_index)
+kqworkq_update_override(struct kqworkq *kqwq, struct knote *kn,
+               kq_index_t override_index)
 {
        struct kqrequest *kqr;
        kq_index_t old_override_index;
+       kq_index_t queue_index = kn->kn_qos_index;
 
-       if (override_index <= qos_index) {
+       if (override_index <= queue_index) {
                return;
        }
 
-       kqr = kqworkq_get_request(kqwq, qos_index);
+       kqr = kqworkq_get_request(kqwq, queue_index);
 
-       kqwq_req_lock(kqwq);
+       kq_req_lock(kqwq);
        old_override_index = kqr->kqr_override_index;
        if (override_index > MAX(kqr->kqr_qos_index, old_override_index)) {
                kqr->kqr_override_index = override_index;
 
                /* apply the override to [incoming?] servicing thread */
-               if (kqr->kqr_state & KQR_BOUND) {
-                       thread_t wqthread = kqr->kqr_thread;
-
-                       /* only apply if non-manager */
-                       assert(wqthread);
-                   if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
-                               if (old_override_index)
-                                       thread_update_ipc_override(wqthread, override_index);
-                               else
-                                       thread_add_ipc_override(wqthread, override_index);
-                       }
+               if (kqr->kqr_thread) {
+                       if (old_override_index)
+                               thread_update_ipc_override(kqr->kqr_thread, override_index);
+                       else
+                               thread_add_ipc_override(kqr->kqr_thread, override_index);
                }
        }
-       kqwq_req_unlock(kqwq);
+       kq_req_unlock(kqwq);
 }
 
-/* called with the kqworkq lock held */
 static void
-kqworkq_bind_thread_impl(
-       struct kqworkq *kqwq,
-       kq_index_t qos_index,
-       thread_t thread,
-       unsigned int flags)
+kqworkloop_update_override(struct kqworkloop *kqwl, kq_index_t override_index)
 {
-       /* request lock must be held */
-       kqwq_req_held(kqwq);
-
-       struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-
-       if (qos_index == KQWQ_QOS_MANAGER)
-               flags |= KEVENT_FLAG_WORKQ_MANAGER;
-
-       struct uthread *ut = get_bsdthread_info(thread);
-
-       /* 
-        * If this is a manager, and the manager request bit is
-        * not set, assure no other thread is bound. If the bit
-        * is set, make sure the old thread is us (or not set).
-        */
-       if (flags & KEVENT_FLAG_WORKQ_MANAGER) {
-               if ((kqr->kqr_state & KQR_BOUND) == 0) {
-                       kqr->kqr_state |= (KQR_BOUND | KQWQ_THMANAGER);
-                       TAILQ_INIT(&kqr->kqr_suppressed);
-                       kqr->kqr_thread = thread;
-                       ut->uu_kqueue_bound = (struct kqueue *)kqwq;
-                       ut->uu_kqueue_qos_index = KQWQ_QOS_MANAGER;
-                       ut->uu_kqueue_flags = (KEVENT_FLAG_WORKQ | 
-                                              KEVENT_FLAG_WORKQ_MANAGER);
-               } else {
-                       assert(kqr->kqr_state & KQR_BOUND);
-                       assert(thread == kqr->kqr_thread);
-                       assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
-                       assert(ut->uu_kqueue_qos_index == KQWQ_QOS_MANAGER);
-                       assert(ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ_MANAGER);
-               }
-               return;
-       }
-
-       /* Just a normal one-queue servicing thread */
-       assert(kqr->kqr_state & KQR_THREQUESTED);
-       assert(kqr->kqr_qos_index == qos_index);
-
-       if ((kqr->kqr_state & KQR_BOUND) == 0) {
-               kqr->kqr_state |= KQR_BOUND;
-               TAILQ_INIT(&kqr->kqr_suppressed);
-               kqr->kqr_thread = thread;
-
-               /* apply an ipc QoS override if one is needed */
-               if (kqr->kqr_override_index) {
-                       assert(kqr->kqr_qos_index);
-                       assert(kqr->kqr_override_index > kqr->kqr_qos_index);
-                       assert(thread_get_ipc_override(thread) == THREAD_QOS_UNSPECIFIED);
-                       thread_add_ipc_override(thread, kqr->kqr_override_index);
-               }
-
-               /* indicate that we are processing in the uthread */
-               ut->uu_kqueue_bound = (struct kqueue *)kqwq;
-               ut->uu_kqueue_qos_index = qos_index;
-               ut->uu_kqueue_flags = flags;
-       } else {
-               /*
-                * probably syncronously bound AND post-request bound
-                * this logic can go away when we get rid of post-request bind
-                */
-               assert(kqr->kqr_state & KQR_BOUND);
-               assert(thread == kqr->kqr_thread);
-               assert(ut->uu_kqueue_bound == (struct kqueue *)kqwq);
-               assert(ut->uu_kqueue_qos_index == qos_index);
-               assert((ut->uu_kqueue_flags & flags) == flags);
-       }
+       kq_req_lock(kqwl);
+       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
+                       override_index);
+       kq_req_unlock(kqwl);
 }
 
-static void
-kqworkloop_update_override(
-       struct kqworkloop *kqwl,
-       kq_index_t qos_index,
-       kq_index_t override_index,
-       uint32_t flags)
+static thread_qos_t
+kqworkloop_unbind_locked(struct kqworkloop *kqwl, thread_t thread)
 {
+       struct uthread *ut = get_bsdthread_info(thread);
        struct kqrequest *kqr = &kqwl->kqwl_request;
+       kq_index_t ipc_override = ut->uu_kqueue_override;
 
-       kqwl_req_lock(kqwl);
-
-       /* Do not override on attached threads */
-       if (kqr->kqr_state & KQR_BOUND) {
-               assert(kqr->kqr_thread);
-
-               if (kqwl->kqwl_kqueue.kq_state & KQ_NO_WQ_THREAD) {
-                       kqwl_req_unlock(kqwl);
-                       assert(!is_workqueue_thread(kqr->kqr_thread));
-                       return;
-               }
-       }
-
-       /* Update sync ipc counts on kqr for suppressed knotes */
-       if (flags & KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS) {
-               kqworkloop_update_suppress_sync_count(kqr, flags);
-       }
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWL_UNBIND), kqwl->kqwl_dynamicid,
+                       thread_tid(thread), 0, 0);
 
-       if ((flags & KQWL_UO_UPDATE_OVERRIDE_LAZY) == 0) {
-               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_WAKEUP_OVERRIDE,
-                       MAX(qos_index, override_index));
-       }
-       kqwl_req_unlock(kqwl);
-}
+       kq_req_held(kqwl);
+       assert(ut->uu_kqr_bound == kqr);
+       ut->uu_kqr_bound = NULL;
+       ut->uu_kqueue_override = THREAD_QOS_UNSPECIFIED;
 
-static void
-kqworkloop_update_suppress_sync_count(
-       struct kqrequest *kqr,
-       uint32_t flags)
-{
-       if (flags & KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI) {
-               kqr->kqr_sync_suppress_count++;
+       if (kqwl->kqwl_owner == NULL && kqwl->kqwl_turnstile) {
+               turnstile_update_inheritor(kqwl->kqwl_turnstile,
+                               TURNSTILE_INHERITOR_NULL, TURNSTILE_IMMEDIATE_UPDATE);
+               turnstile_update_inheritor_complete(kqwl->kqwl_turnstile,
+                               TURNSTILE_INTERLOCK_HELD);
        }
 
-       if (flags & KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI) {
-               assert(kqr->kqr_sync_suppress_count > 0);
-               kqr->kqr_sync_suppress_count--;
-       }
+       kqr->kqr_thread = NULL;
+       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
+       return ipc_override;
 }
 
 /*
- *     kqworkloop_unbind_thread - Unbind the servicer thread of a workloop kqueue
+ *     kqworkloop_unbind - Unbind the servicer thread of a workloop kqueue
  *
- *     It will end the processing phase in case it was still processing:
- *
- *     We may have to request a new thread for not KQ_NO_WQ_THREAD workloop.
- *     This can happen if :
- *     - there were active events at or above our QoS we never got to (count > 0)
+ *     It will acknowledge events, and possibly request a new thread if:
+ *     - there were active events left
  *     - we pended waitq hook callouts during processing
  *     - we pended wakeups while processing (or unsuppressing)
  *
  *     Called with kqueue lock held.
  */
-
 static void
-kqworkloop_unbind_thread(
-       struct kqworkloop *kqwl,
-       thread_t thread,
-       __unused unsigned int flags)
+kqworkloop_unbind(proc_t p, struct kqworkloop *kqwl)
 {
        struct kqueue *kq = &kqwl->kqwl_kqueue;
        struct kqrequest *kqr = &kqwl->kqwl_request;
+       thread_t thread = kqr->kqr_thread;
+       int op = KQWL_UTQ_PARKING;
+       kq_index_t ipc_override, qos_override = THREAD_QOS_UNSPECIFIED;
 
-       kqlock_held(kq);
+       assert(thread == current_thread());
 
-       assert((kq->kq_state & KQ_PROCESSING) == 0);
-       if (kq->kq_state & KQ_PROCESSING) {
-               return;
-       }
+       kqlock(kqwl);
 
        /*
         * Forcing the KQ_PROCESSING flag allows for QoS updates because of
         * unsuppressing knotes not to be applied until the eventual call to
         * kqworkloop_update_threads_qos() below.
         */
-       kq->kq_state |= KQ_PROCESSING;
-       kqworkloop_acknowledge_events(kqwl, TRUE);
-       kq->kq_state &= ~KQ_PROCESSING;
-
-       kqwl_req_lock(kqwl);
-
-       /* deal with extraneous unbinds in release kernels */
-       assert((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) == KQR_BOUND);
-       if ((kqr->kqr_state & (KQR_BOUND | KQR_PROCESSING)) != KQR_BOUND) {
-               kqwl_req_unlock(kqwl);
-               return;
+       assert((kq->kq_state & KQ_PROCESSING) == 0);
+       if (!TAILQ_EMPTY(&kqr->kqr_suppressed)) {
+               kq->kq_state |= KQ_PROCESSING;
+               qos_override = kqworkloop_acknowledge_events(kqwl);
+               kq->kq_state &= ~KQ_PROCESSING;
        }
 
-       assert(thread == current_thread());
-       assert(kqr->kqr_thread == thread);
-       if (kqr->kqr_thread != thread) {
-               kqwl_req_unlock(kqwl);
-           return;
-       }
+       kq_req_lock(kqwl);
 
-       struct uthread *ut = get_bsdthread_info(thread);
-       kq_index_t old_qos_index = ut->uu_kqueue_qos_index;
-       boolean_t ipc_override_is_sync = ut->uu_kqueue_override_is_sync;
-       ut->uu_kqueue_bound = NULL;
-       ut->uu_kqueue_qos_index = 0;
-       ut->uu_kqueue_override_is_sync = 0;
-       ut->uu_kqueue_flags = 0;
-
-       /* unbind the servicer thread, drop overrides */
-       kqr->kqr_thread = NULL;
-       kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
-       kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RECOMPUTE_WAKEUP_QOS, 0);
+       ipc_override = kqworkloop_unbind_locked(kqwl, thread);
+       kqworkloop_update_threads_qos(kqwl, op, qos_override);
+
+       kq_req_unlock(kqwl);
 
-       kqwl_req_unlock(kqwl);
+       kqunlock(kqwl);
 
        /*
         * Drop the override on the current thread last, after the call to
         * kqworkloop_update_threads_qos above.
         */
-       if (old_qos_index) {
+       if (ipc_override) {
                thread_drop_ipc_override(thread);
        }
-       if (ipc_override_is_sync) {
-               thread_drop_sync_ipc_override(thread);
-       }
+
+       /* If last reference, dealloc the workloop kq */
+       kqueue_release_last(p, kqwl);
 }
 
-/* called with the kqworkq lock held */
-static void
-kqworkq_unbind_thread(
-       struct kqworkq *kqwq,
-       kq_index_t qos_index,
-       thread_t thread, 
-       __unused unsigned int flags)
+static thread_qos_t
+kqworkq_unbind_locked(__assert_only struct kqworkq *kqwq,
+               struct kqrequest *kqr, thread_t thread)
 {
-       struct kqrequest *kqr = kqworkq_get_request(kqwq, qos_index);
-       kq_index_t override_index = 0;
-
-       /* request lock must be held */
-       kqwq_req_held(kqwq);
-
-       assert(thread == current_thread());
-
-       if ((kqr->kqr_state & KQR_BOUND) == 0) {
-               assert(kqr->kqr_state & KQR_BOUND);
-               return;
-       }
-
-       assert(kqr->kqr_thread == thread);
-       assert(TAILQ_EMPTY(&kqr->kqr_suppressed));
+       struct uthread *ut = get_bsdthread_info(thread);
+       kq_index_t old_override = kqr->kqr_override_index;
 
-       /* 
-        * If there is an override, drop it from the current thread
-        * and then we are free to recompute (a potentially lower)
-        * minimum override to apply to the next thread request.
-        */
-       if (kqr->kqr_override_index) {
-               struct kqtailq *base_queue = kqueue_get_base_queue(&kqwq->kqwq_kqueue, qos_index);
-               struct kqtailq *queue = kqueue_get_high_queue(&kqwq->kqwq_kqueue, qos_index);
+       KDBG_FILTERED(KEV_EVTID(BSD_KEVENT_KQWQ_UNBIND), -1,
+                       thread_tid(kqr->kqr_thread), kqr->kqr_qos_index, 0);
 
-               /* if not bound to a manager thread, drop the current ipc override */
-               if ((kqr->kqr_state & KQWQ_THMANAGER) == 0) {
-                       thread_drop_ipc_override(thread);
-               }
+       kq_req_held(kqwq);
+       assert(ut->uu_kqr_bound == kqr);
+       ut->uu_kqr_bound = NULL;
+       kqr->kqr_thread = NULL;
+       kqr->kqr_state &= ~(KQR_THREQUESTED | KQR_R2K_NOTIF_ARMED);
+       kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
 
-               /* recompute the new override */
-               do {
-                       if (!TAILQ_EMPTY(queue)) {
-                               override_index = queue - base_queue + qos_index;
-                               break;
-                       }
-               } while (queue-- > base_queue);
-       }
+       return old_override;
+}
 
-       /* Mark it unbound */
-       kqr->kqr_thread = NULL;
-       kqr->kqr_state &= ~(KQR_BOUND | KQR_THREQUESTED | KQWQ_THMANAGER);
+/*
+ *     kqworkq_unbind - unbind of a workq kqueue from a thread
+ *
+ *     We may have to request new threads.
+ *     This can happen if there are no waiting processing threads and:
+ *     - there were active events we never got to (count > 0)
+ *     - we pended waitq hook callouts during processing
+ *     - we pended wakeups while processing (or unsuppressing)
+ */
+static void
+kqworkq_unbind(proc_t p, struct kqrequest *kqr)
+{
+       struct kqworkq *kqwq = (struct kqworkq *)p->p_fd->fd_wqkqueue;
+       __assert_only int rc;
 
-       /* apply the new override */
-       if (override_index > kqr->kqr_qos_index) {
-               kqr->kqr_override_index = override_index;
-       } else {
-               kqr->kqr_override_index = THREAD_QOS_UNSPECIFIED;
-       }
+       kqlock(kqwq);
+       rc = kqworkq_acknowledge_events(kqwq, kqr, 0, KQWQAE_UNBIND);
+       assert(rc == -1);
+       kqunlock(kqwq);
 }
 
 struct kqrequest *
 kqworkq_get_request(struct kqworkq *kqwq, kq_index_t qos_index)
 {
-       assert(qos_index < KQWQ_NQOS);
+       assert(qos_index < KQWQ_NBUCKETS);
        return &kqwq->kqwq_request[qos_index];
 }
 
-void
-knote_adjust_qos(struct knote *kn, qos_t new_qos, qos_t new_override, kq_index_t sync_override_index)
+static void
+knote_apply_qos_override(struct knote *kn, kq_index_t qos_index)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       boolean_t override_is_sync = FALSE;
-
-       if (kq->kq_state & (KQ_WORKQ | KQ_WORKLOOP)) {
-               kq_index_t new_qos_index;
-               kq_index_t new_override_index;
-               kq_index_t servicer_qos_index;
-
-               new_qos_index = qos_index_from_qos(kn, new_qos, FALSE);
-               new_override_index = qos_index_from_qos(kn, new_override, TRUE);
+       assert((kn->kn_status & KN_QUEUED) == 0);
 
-               /* make sure the servicer qos acts as a floor */
-               servicer_qos_index = qos_index_from_qos(kn, kn->kn_qos, FALSE);
-               if (servicer_qos_index > new_qos_index)
-                       new_qos_index = servicer_qos_index;
-               if (servicer_qos_index > new_override_index)
-                       new_override_index = servicer_qos_index;
-               if (sync_override_index >= new_override_index) {
-                       new_override_index = sync_override_index;
-                       override_is_sync = TRUE;
-               }
+       kn->kn_qos_override = qos_index;
 
-               kqlock(kq);
-               if (new_qos_index != knote_get_req_index(kn) ||
-                   new_override_index != knote_get_qos_override_index(kn) ||
-                   override_is_sync != kn->kn_qos_override_is_sync) {
-                       if (kn->kn_status & KN_QUEUED) {
-                               knote_dequeue(kn);
-                               knote_set_qos_index(kn, new_qos_index);
-                               knote_set_qos_override_index(kn, new_override_index, override_is_sync);
-                               knote_enqueue(kn);
-                               knote_wakeup(kn);
-                       } else {
-                               knote_set_qos_index(kn, new_qos_index);
-                               knote_set_qos_override_index(kn, new_override_index, override_is_sync);
-                       }
+       if (kn->kn_status & KN_SUPPRESSED) {
+               struct kqueue *kq = knote_get_kq(kn);
+               /*
+                * For suppressed events, the kn_qos_index field cannot be touched as it
+                * allows us to know which suppression queue the knote is on for a kqworkq.
+                *
+                * Also, there's no natural push applied on the kqueues when this field
+                * changes anyway. We hence need to apply manual overrides in this case,
+                * which will be cleared when the events are later acknowledged.
+                */
+               if (kq->kq_state & KQ_WORKQ) {
+                       kqworkq_update_override((struct kqworkq *)kq, kn, qos_index);
+               } else {
+                       kqworkloop_update_override((struct kqworkloop *)kq, qos_index);
                }
-               kqunlock(kq);
+       } else {
+               kn->kn_qos_index = qos_index;
        }
 }
 
-void
-knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq)
+static bool
+knote_should_apply_qos_override(struct kqueue *kq, struct knote *kn, int result,
+               thread_qos_t *qos_out)
 {
-       struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t old_sync_override;
-       kq_index_t qos_index = knote_get_qos_index(kn);
-       uint32_t flags = 0;
-
-       /* Tracking only happens for UI qos */
-       if (sync_qos != THREAD_QOS_USER_INTERACTIVE &&
-               sync_qos != THREAD_QOS_UNSPECIFIED) {
-               return;
-       }
-
-       if (lock_kq)
-               kqlock(kq);
+       thread_qos_t qos_index = (result >> FILTER_ADJUST_EVENT_QOS_SHIFT) & 7;
 
-       if (kq->kq_state & KQ_WORKLOOP) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
+       kqlock_held(kq);
 
-               old_sync_override = knote_get_sync_qos_override_index(kn);
-               if (old_sync_override != sync_qos) {
-                       kn->kn_qos_sync_override = sync_qos;
+       assert(result & FILTER_ADJUST_EVENT_QOS_BIT);
+       assert(qos_index < THREAD_QOS_LAST);
 
-                       /* update sync ipc counters for suppressed knotes */
-                       if ((kn->kn_status & KN_SUPPRESSED) == KN_SUPPRESSED) {
-                               flags = flags | KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS;
+       /*
+        * Early exit for knotes that should not change QoS
+        *
+        * It is safe to test kn_req_index against MANAGER / STAYACTIVE because
+        * those kn_req_index values never change for the knote's entire
+        * lifetime.
+        */
+       if (__improbable(!knote_fops(kn)->f_adjusts_qos)) {
+               panic("filter %d cannot change QoS", kn->kn_filtid);
+       } else if (kq->kq_state & KQ_WORKLOOP) {
+               if (kn->kn_req_index == KQWL_BUCKET_STAYACTIVE) {
+                       return false;
+               }
+       } else if (kq->kq_state & KQ_WORKQ) {
+               if (kn->kn_req_index == KQWQ_QOS_MANAGER) {
+                       return false;
+               }
+       } else {
+               return false;
+       }
 
-                               /* Do not recalculate kqwl override, it would be done later */
-                               flags = flags | KQWL_UO_UPDATE_OVERRIDE_LAZY;
+       /*
+        * knotes with the FALLBACK flag only use their registration QoS if the
+        * incoming event has no QoS; without the flag, the registration QoS
+        * acts as a floor.
+        */
+       if (kn->kn_qos & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
+               if (qos_index == THREAD_QOS_UNSPECIFIED)
+                       qos_index = kn->kn_req_index;
+       } else {
+               if (qos_index < kn->kn_req_index)
+                       qos_index = kn->kn_req_index;
+       }
+       if ((kn->kn_status & KN_MERGE_QOS) && (qos_index < kn->kn_qos_override)) {
+               /* Never lower QoS when in "Merge" mode */
+               return false;
+       }
 
-                               if (sync_qos == THREAD_QOS_USER_INTERACTIVE) {
-                                       flags = flags | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI;
-                               }
+       if ((kn->kn_status & KN_LOCKED) && kn->kn_inuse) {
+               /*
+                * When we're trying to update the QoS override while both an
+                * f_event() and other f_* calls are running concurrently, any of these
+                * in-flight calls may want to perform overrides that aren't properly
+                * serialized with each other.
+                *
+                * The first update that observes this racy situation enters a "Merge"
+                * mode which causes subsequent override requests to saturate the
+                * override instead of replacing its value.
+                *
+                * This mode is left when knote_unlock() or knote_call_filter_event()
+                * observe that no other f_* routine is in flight.
+                */
+               kn->kn_status |= KN_MERGE_QOS;
+       }
 
-                               if (old_sync_override == THREAD_QOS_USER_INTERACTIVE) {
-                                       flags = flags | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI;
-                               }
+       if (kn->kn_qos_override == qos_index) {
+               return false;
+       }
 
-                               kqworkloop_update_override(kqwl, qos_index, sync_qos,
-                                       flags);
-                       }
+       *qos_out = qos_index;
+       return true;
+}
 
+static void
+knote_adjust_qos(struct kqueue *kq, struct knote *kn, int result)
+{
+       thread_qos_t qos;
+       if (knote_should_apply_qos_override(kq, kn, result, &qos)) {
+               knote_dequeue(kn);
+               knote_apply_qos_override(kn, qos);
+               if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
+                       knote_wakeup(kn);
                }
        }
-       if (lock_kq)
-               kqunlock(kq);
 }
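
/*
 * A minimal user-space sketch of the override policy above, under simplified
 * assumptions: sample_knote, the QOS_* constants and the fallback/merging
 * fields below are illustrative stand-ins only, not kernel APIs.  It shows
 * the two rules the code implements: without the FALLBACK flag the
 * registration QoS acts as a floor (with it, only as a fallback), and once
 * "merge" mode is entered the override can only be raised, never lowered.
 */
#include <stdbool.h>
#include <stdio.h>

enum { QOS_UNSPECIFIED = 0, QOS_UTILITY = 2, QOS_DEFAULT = 3, QOS_USER_INITIATED = 4 };

struct sample_knote {
    int  req_qos;        /* QoS requested at registration time */
    int  qos_override;   /* override currently applied to the servicer */
    bool fallback;       /* registration QoS is a fallback, not a floor */
    bool merging;        /* concurrent filter calls were observed */
};

/* Returns true when the override actually changed. */
static bool
sample_apply_override(struct sample_knote *kn, int event_qos)
{
    int qos = event_qos;

    if (kn->fallback) {
        if (qos == QOS_UNSPECIFIED)
            qos = kn->req_qos;          /* fallback only */
    } else if (qos < kn->req_qos) {
        qos = kn->req_qos;              /* registration QoS is a floor */
    }

    if (kn->merging && qos < kn->qos_override)
        return false;                   /* saturate: never lower while merging */
    if (qos == kn->qos_override)
        return false;

    kn->qos_override = qos;
    return true;
}

int
main(void)
{
    struct sample_knote kn = {
        .req_qos = QOS_DEFAULT, .qos_override = QOS_DEFAULT,
        .fallback = false, .merging = true,
    };

    /* A low-QoS event cannot lower the override while merging. */
    bool changed = sample_apply_override(&kn, QOS_UTILITY);
    printf("changed=%d override=%d\n", changed, kn.qos_override);

    /* A higher-QoS event still raises it. */
    changed = sample_apply_override(&kn, QOS_USER_INITIATED);
    printf("changed=%d override=%d\n", changed, kn.qos_override);
    return 0;
}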
 
 static void
 knote_wakeup(struct knote *kn)
 {
        struct kqueue *kq = knote_get_kq(kn);
-       kq_index_t qos_index = knote_get_qos_index(kn);
 
        kqlock_held(kq);
 
        if (kq->kq_state & KQ_WORKQ) {
-               /* request a servicing thread */
                struct kqworkq *kqwq = (struct kqworkq *)kq;
 
-               kqworkq_request_help(kqwq, qos_index);
-
+               kqworkq_request_help(kqwq, kn->kn_qos_index);
        } else if (kq->kq_state & KQ_WORKLOOP) {
-               /* request a servicing thread */
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
-               if (kqworkloop_is_processing_on_current_thread(kqwl)) {
-                       /*
-                        * kqworkloop_end_processing() will perform the required QoS
-                        * computations when it unsets the processing mode.
-                        */
-                       return;
+               /*
+                * kqworkloop_end_processing() will perform the required QoS
+                * computations when it unsets the processing mode.
+                */
+               if (!kqworkloop_is_processing_on_current_thread(kqwl)) {
+                       kqworkloop_request_help(kqwl, kn->kn_qos_index);
                }
-               kqworkloop_request_help(kqwl, qos_index);
        } else {
                struct kqfile *kqf = (struct kqfile *)kq;
 
@@ -7679,10 +6969,8 @@ knote_wakeup(struct knote *kn)
                /* wakeup a thread waiting on this queue */
                if (kq->kq_state & (KQ_SLEEP | KQ_SEL)) {
                        kq->kq_state &= ~(KQ_SLEEP | KQ_SEL);
-                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                          KQ_EVENT,
-                                          THREAD_AWAKENED,
-                                          WAITQ_ALL_PRIORITIES);
+                       waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, KQ_EVENT,
+                                       THREAD_AWAKENED, WAITQ_ALL_PRIORITIES);
                }
 
                /* wakeup other kqueues/select sets we're inside */
@@ -7714,8 +7002,8 @@ kqueue_interrupt(struct kqueue *kq)
                assert(kq->kq_state & KQ_PROCESSING);
 
                kq->kq_state &= ~KQ_PROCWAIT;
-               suppressq = kqueue_get_suppressed_queue(kq, QOS_INDEX_KQFILE);
-               (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs, 
+               suppressq = kqueue_get_suppressed_queue(kq, NULL);
+               (void)waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
                                         CAST_EVENT64_T(suppressq),
                                         THREAD_RESTART,
                                         WAITQ_ALL_PRIORITIES);
@@ -7744,7 +7032,6 @@ waitq_set__CALLING_PREPOST_HOOK__(void *kq_hook, void *knote_hook, int qos)
                struct kqworkq *kqwq = (struct kqworkq *)kq;
 
                kqworkq_request_help(kqwq, KQWQ_QOS_MANAGER);
-
        } else if (kq->kq_state & KQ_WORKLOOP) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
@@ -7768,8 +7055,7 @@ klist_init(struct klist *list)
  *     the hint) and not deadlock itself.
  *
  *     The object lock should also hold off pending
- *     detach/drop operations.  But we'll prevent it here
- *     too (by taking a use reference) - just in case.
+ *     detach/drop operations.
  */
 void
 knote(struct klist *list, long hint)
@@ -7778,23 +7064,8 @@ knote(struct klist *list, long hint)
 
        SLIST_FOREACH(kn, list, kn_selnext) {
                struct kqueue *kq = knote_get_kq(kn);
-
                kqlock(kq);
-
-               assert(!knoteuse_needs_boost(kn, NULL));
-
-               /* If we can get a use reference - deliver event */
-               if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
-                       int result;
-
-                       /* call the event with only a use count */
-                       result = knote_fops(kn)->f_event(kn, hint);
-
-                       /* if its not going away and triggered */
-                       if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
-                               knote_activate(kn);
-                       /* kq lock held */
-               }
+               knote_call_filter_event(kq, kn, hint);
                kqunlock(kq);
        }
 }
@@ -7845,32 +7116,45 @@ knote_vanish(struct klist *list)
 
        SLIST_FOREACH_SAFE(kn, list, kn_selnext, kn_next) {
                struct kqueue *kq = knote_get_kq(kn);
-               int result;
 
                kqlock(kq);
-
-               assert(!knoteuse_needs_boost(kn, NULL));
-
-               if ((kn->kn_status & KN_DROPPING) == 0) {
+               if (kn->kn_status & KN_REQVANISH) {
                        /* If EV_VANISH supported - prepare to deliver one */
-                       if (kn->kn_status & KN_REQVANISH) {
-                               kn->kn_status |= KN_VANISHED;
-                               knote_activate(kn);
-
-                       } else if (kqlock2knoteuse(kq, kn, KNUSE_NONE)) {
-                               /* call the event with only a use count */
-                               result = knote_fops(kn)->f_event(kn, NOTE_REVOKE);
-
-                               /* if its not going away and triggered */
-                               if (knoteuse2kqlock(kq, kn, KNUSE_NONE) && result)
-                                       knote_activate(kn);
-                               /* lock held again */
-                       }
+                       kn->kn_status |= KN_VANISHED;
+                       knote_activate(kn);
+               } else {
+                       knote_call_filter_event(kq, kn, NOTE_REVOKE);
                }
                kqunlock(kq);
        }
 }
 
+/*
+ * Force a lazy allocation of the waitqset link
+ * of the kq_wqs associated with the kn
+ * if it wasn't already allocated.
+ *
+ * This allows knote_link_waitq to never block
+ * if reserved_link is not NULL.
+ */
+void
+knote_link_waitqset_lazy_alloc(struct knote *kn)
+{
+       struct kqueue *kq = knote_get_kq(kn);
+       waitq_set_lazy_init_link(&kq->kq_wqs);
+}
+
+/*
+ * Check if a lazy allocation for the waitqset link
+ * of the kq_wqs is needed.
+ */
+boolean_t
+knote_link_waitqset_should_lazy_alloc(struct knote *kn)
+{
+       struct kqueue *kq = knote_get_kq(kn);
+       return waitq_set_should_lazy_init_link(&kq->kq_wqs);
+}
+
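/*
 * A standalone sketch of the "pre-allocate outside the lock" pattern the two
 * helpers above expose, under simplified assumptions: the waitq/waitq_set
 * machinery is kernel-internal, so this model substitutes a malloc'd link and
 * a pthread mutex.  The point is that the cheap should-alloc check and the
 * (possibly blocking) allocation both happen before any lock is taken, so the
 * locked linking step never has to allocate.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct lazy_linked {
    pthread_mutex_t lock;
    void *link;                   /* lazily allocated, like the kq_wqs link */
};

static bool
should_lazy_alloc(const struct lazy_linked *obj)
{
    return obj->link == NULL;     /* cheap check, done before taking locks */
}

static void
lazy_alloc_link(struct lazy_linked *obj)
{
    if (obj->link == NULL)
        obj->link = malloc(64);   /* may block: only done while unlocked */
}

/* The locked path: with the link pre-allocated it never allocates or blocks. */
static int
link_locked(struct lazy_linked *obj)
{
    pthread_mutex_lock(&obj->lock);
    int linked = (obj->link != NULL);
    pthread_mutex_unlock(&obj->lock);
    return linked;
}

int
main(void)
{
    struct lazy_linked obj = { .lock = PTHREAD_MUTEX_INITIALIZER, .link = NULL };

    if (should_lazy_alloc(&obj))  /* cf. knote_link_waitqset_should_lazy_alloc */
        lazy_alloc_link(&obj);    /* cf. knote_link_waitqset_lazy_alloc */

    int ok = link_locked(&obj);   /* cf. knote_link_waitq with reserved_link */
    free(obj.link);
    return ok ? 0 : 1;
}
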
 /*
  * For a given knote, link a provided wait queue directly with the kqueue.
  * Wakeups will happen via recursive wait queue support.  But nothing will move
@@ -7880,7 +7164,8 @@ knote_vanish(struct klist *list)
  * kqueue and knote references are held by caller.
  * waitq locked by caller.
  *
- * caller provides the wait queue link structure.
+ * caller provides the wait queue link structure and ensures that the kq->kq_wqs
+ * is linked by previously calling knote_link_waitqset_lazy_alloc.
  */
 int
 knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link)
@@ -7920,17 +7205,15 @@ knote_unlink_waitq(struct knote *kn, struct waitq *wq)
 /*
  * remove all knotes referencing a specified fd
  *
- * Essentially an inlined knote_remove & knote_drop
- * when we know for sure that the thing is a file
- *
  * Entered with the proc_fd lock already held.
  * It returns the same way, but may drop it temporarily.
  */
 void
-knote_fdclose(struct proc *p, int fd, int force)
+knote_fdclose(struct proc *p, int fd)
 {
        struct klist *list;
        struct knote *kn;
+       KNOTE_LOCK_CTX(knlc);
 
 restart:
        list = &p->p_fd->fd_knlist[fd];
@@ -7948,45 +7231,28 @@ restart:
                 * transition it to vanished mode (or skip over
                 * it if already vanished).
                 */
-               if (!force && (kn->kn_status & KN_REQVANISH)) {
-
-                       if ((kn->kn_status & KN_VANISHED) == 0) {
-                               proc_fdunlock(p);
-
-                               assert(!knoteuse_needs_boost(kn, NULL));
-
-                               /* get detach reference (also marks vanished) */
-                               if (kqlock2knotedetach(kq, kn, KNUSE_NONE)) {
-                                       /* detach knote and drop fp use reference */
-                                       knote_fops(kn)->f_detach(kn);
-                                       if (knote_fops(kn)->f_isfd)
-                                               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
-
-                                       /* activate it if it's still in existence */
-                                       if (knoteuse2kqlock(kq, kn, KNUSE_NONE)) {
-                                               knote_activate(kn);
-                                       }
-                                       kqunlock(kq);
-                               }
-                               proc_fdlock(p);
-                               goto restart;
-                       } else {
-                               kqunlock(kq);
-                               continue;
-                       }
+               if (kn->kn_status & KN_VANISHED) {
+                       kqunlock(kq);
+                       continue;
                }
 
                proc_fdunlock(p);
+               if (!knote_lock(kq, kn, &knlc, KNOTE_KQ_LOCK_ON_SUCCESS)) {
+                       /* the knote was dropped by someone, nothing to do */
+               } else if (kn->kn_status & KN_REQVANISH) {
+                       kn->kn_status |= KN_VANISHED;
+                       kn->kn_status &= ~KN_ATTACHED;
 
-               /*
-                * Convert the kq lock to a drop ref.
-                * If we get it, go ahead and drop it.
-                * Otherwise, we waited for the blocking
-                * condition to complete. Either way,
-                * we dropped the fdlock so start over.
-                */
-               if (kqlock2knotedrop(kq, kn)) {
-                       knote_drop(kn, p);
+                       kqunlock(kq);
+                       knote_fops(kn)->f_detach(kn);
+                       if (knote_fops(kn)->f_isfd)
+                               fp_drop(p, kn->kn_id, kn->kn_fp, 0);
+                       kqlock(kq);
+
+                       knote_activate(kn);
+                       knote_unlock(kq, kn, &knlc, KNOTE_KQ_UNLOCK);
+               } else {
+                       knote_drop(kq, kn, &knlc);
                }
 
                proc_fdlock(p);
@@ -7994,7 +7260,7 @@ restart:
        }
 }
 
-/* 
+/*
  * knote_fdfind - lookup a knote in the fd table for process
  *
  * If the filter is file-based, lookup based on fd index.
@@ -8009,15 +7275,15 @@ restart:
  */
 static struct knote *
 knote_fdfind(struct kqueue *kq,
-             struct kevent_internal_s *kev,
-            bool is_fd,
-             struct proc *p)
+               struct kevent_internal_s *kev,
+               bool is_fd,
+               struct proc *p)
 {
        struct filedesc *fdp = p->p_fd;
        struct klist *list = NULL;
        struct knote *kn = NULL;
 
-       /* 
+       /*
         * determine where to look for the knote
         */
        if (is_fd) {
@@ -8036,7 +7302,7 @@ knote_fdfind(struct kqueue *kq,
        if (list != NULL) {
                SLIST_FOREACH(kn, list, kn_link) {
                        if (kq == knote_get_kq(kn) &&
-                           kev->ident == kn->kn_id && 
+                           kev->ident == kn->kn_id &&
                            kev->filter == kn->kn_filter) {
                                if (kev->flags & EV_UDATA_SPECIFIC) {
                                        if ((kn->kn_status & KN_UDATA_SPECIFIC) &&
@@ -8067,9 +7333,8 @@ knote_fdfind(struct kqueue *kq,
  * Takes a rwlock boost if inserting the knote is successful.
  */
 static int
-kq_add_knote(struct kqueue *kq, struct knote *kn,
-             struct kevent_internal_s *kev,
-             struct proc *p, int *knoteuse_flags)
+kq_add_knote(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc,
+               struct proc *p)
 {
        struct filedesc *fdp = p->p_fd;
        struct klist *list = NULL;
@@ -8081,7 +7346,7 @@ kq_add_knote(struct kqueue *kq, struct knote *kn,
        else
                knhash_lock(p);
 
-       if (knote_fdfind(kq, kev, is_fd, p) != NULL) {
+       if (knote_fdfind(kq, &kn->kn_kevent, is_fd, p) != NULL) {
                /* found an existing knote: we can't add this one */
                ret = ERESTART;
                goto out_locked;
@@ -8092,8 +7357,7 @@ kq_add_knote(struct kqueue *kq, struct knote *kn,
                if (fdp->fd_knhashmask == 0) {
                        u_long size = 0;
 
-                       list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE,
-                                                 &size);
+                       list = hashinit(CONFIG_KN_HASHSIZE, M_KQUEUE, &size);
                        if (list == NULL) {
                                ret = ENOMEM;
                                goto out_locked;
@@ -8154,11 +7418,10 @@ kq_add_knote(struct kqueue *kq, struct knote *kn,
        }
 
 out_locked:
-       if (ret == 0 && knoteuse_needs_boost(kn, kev)) {
-               set_thread_rwlock_boost();
-               *knoteuse_flags = KNUSE_BOOST;
-       } else {
-               *knoteuse_flags = KNUSE_NONE;
+       if (ret == 0) {
+               kqlock(kq);
+               assert((kn->kn_status & KN_LOCKED) == 0);
+               (void)knote_lock(kq, kn, knlc, KNOTE_KQ_UNLOCK);
        }
        if (is_fd)
                proc_fdunlock(p);
@@ -8170,8 +7433,6 @@ out_locked:
 
 /*
  * kq_remove_knote - remove a knote from the fd table for process
- * and copy kn_status an kq_state while holding kqlock and
- * fd table locks.
  *
  * If the filter is file-based, remove based on fd index.
  * Otherwise remove from the hash based on the ident.
@@ -8180,10 +7441,11 @@ out_locked:
  */
 static void
 kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
-       kn_status_t *kn_status, uint16_t *kq_state)
+               struct knote_lock_ctx *knlc)
 {
        struct filedesc *fdp = p->p_fd;
        struct klist *list = NULL;
+       uint16_t kq_state;
        bool is_fd;
 
        is_fd = knote_fops(kn)->f_isfd;
@@ -8202,14 +7464,19 @@ kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
        SLIST_REMOVE(list, kn, knote, kn_link);
 
        kqlock(kq);
-       *kn_status = kn->kn_status;
-       *kq_state = kq->kq_state;
-       kqunlock(kq);
-
+       kq_state = kq->kq_state;
+       if (knlc) {
+               knote_unlock_cancel(kq, kn, knlc, KNOTE_KQ_UNLOCK);
+       } else {
+               kqunlock(kq);
+       }
        if (is_fd)
                proc_fdunlock(p);
        else
                knhash_unlock(p);
+
+       if (kq_state & KQ_DYNAMIC)
+               kqueue_release_last(p, kq);
 }
 
 /*
@@ -8220,10 +7487,8 @@ kq_remove_knote(struct kqueue *kq, struct knote *kn, struct proc *p,
  */
 
 static struct knote *
-kq_find_knote_and_kq_lock(struct kqueue *kq,
-             struct kevent_internal_s *kev,
-            bool is_fd,
-             struct proc *p)
+kq_find_knote_and_kq_lock(struct kqueue *kq, struct kevent_internal_s *kev,
+               bool is_fd, struct proc *p)
 {
        struct knote * ret;
 
@@ -8248,71 +7513,41 @@ kq_find_knote_and_kq_lock(struct kqueue *kq,
 /*
  * knote_drop - disconnect and drop the knote
  *
- * Called with the kqueue unlocked and holding a
- * "drop reference" on the knote in question.
- * This reference is most often aquired thru a call
- * to kqlock2knotedrop(). But it can also be acquired
- * through stealing a drop reference via a call to
- * knoteuse2knotedrop() or during the initial attach
- * of the knote.
+ * Called with the kqueue locked, returns with the kqueue unlocked.
+ *
+ * If a knote locking context is passed, it is canceled.
  *
  * The knote may have already been detached from
  * (or not yet attached to) its source object.
  */
 static void
-knote_drop(struct knote *kn, __unused struct proc *ctxp)
+knote_drop(struct kqueue *kq, struct knote *kn, struct knote_lock_ctx *knlc)
 {
-       struct kqueue *kq = knote_get_kq(kn);
        struct proc *p = kq->kq_p;
-       kn_status_t kn_status;
-       uint16_t kq_state;
+
+       kqlock_held(kq);
+
+       assert((kn->kn_status & KN_DROPPING) == 0);
+       if (knlc == NULL) {
+               assert((kn->kn_status & KN_LOCKED) == 0);
+       }
+       kn->kn_status |= KN_DROPPING;
+
+       knote_unsuppress(kn);
+       knote_dequeue(kn);
+       knote_wait_for_filter_events(kq, kn);
 
        /* If we are attached, disconnect from the source first */
        if (kn->kn_status & KN_ATTACHED) {
                knote_fops(kn)->f_detach(kn);
        }
 
-       /* Remove the source from the appropriate hash */
-       kq_remove_knote(kq, kn, p, &kn_status, &kq_state);
-
-       /*
-        * If a kqueue_dealloc is happening in parallel for the kq
-        * pointed by the knote the kq could be aready deallocated
-        * at this point.
-        * Do not access the kq after the kq_remove_knote if it is
-        * not a KQ_DYNAMIC.
-        */
-
-       /* determine if anyone needs to know about the drop */
-       assert((kn_status & (KN_DROPPING | KN_SUPPRESSED | KN_QUEUED)) == KN_DROPPING);
-
-       /*
-        * If KN_USEWAIT is set, some other thread was trying to drop the kn.
-        * Or it was in kqueue_dealloc, so the kqueue_dealloc did not happen
-        * because that thread was waiting on this wake, or it was a drop happening
-        * because of a kevent_register that takes a reference on the kq, and therefore
-        * the kq cannot be deallocated in parallel.
-        *
-        * It is safe to access kq->kq_wqs if needswakeup is set.
-        */
-       if (kn_status & KN_USEWAIT)
-               waitq_wakeup64_all((struct waitq *)&kq->kq_wqs,
-                                  CAST_EVENT64_T(&kn->kn_status),
-                                  THREAD_RESTART,
-                                  WAITQ_ALL_PRIORITIES);
-
+       /* kq may be freed when kq_remove_knote() returns */
+       kq_remove_knote(kq, kn, p, knlc);
        if (knote_fops(kn)->f_isfd && ((kn->kn_status & KN_VANISHED) == 0))
                fp_drop(p, kn->kn_id, kn->kn_fp, 0);
 
        knote_free(kn);
-
-       /*
-        * release reference on dynamic kq (and free if last).
-        * Will only be last if this is from fdfree, etc...
-        * because otherwise processing thread has reference.
-        */
-       if (kq_state & KQ_DYNAMIC)
-               kqueue_release_last(p, kq);
 }
 
 /* called with kqueue lock held */
@@ -8350,9 +7585,6 @@ knote_enable(struct knote *kn)
        kn->kn_status &= ~KN_DISABLED;
 
        if (kn->kn_status & KN_SUPPRESSED) {
-               /* Clear the sync qos on the knote */
-               knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
-
                /*
                 * it is possible for userland to have knotes registered for a given
                 * workloop `wl_orig` but really handled on another workloop `wl_new`.
@@ -8401,18 +7633,8 @@ knote_suppress(struct knote *kn)
 
        knote_dequeue(kn);
        kn->kn_status |= KN_SUPPRESSED;
-       suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
+       suppressq = kqueue_get_suppressed_queue(kq, kn);
        TAILQ_INSERT_TAIL(suppressq, kn, kn_tqe);
-
-       if ((kq->kq_state & KQ_WORKLOOP) &&
-            knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
-            kn->kn_qos_override_is_sync) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               /* update the sync qos override counter for suppressed knotes */
-               kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
-                       knote_get_qos_override_index(kn),
-                       (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_NEW_OVERRIDE_IS_SYNC_UI));
-       }
 }
 
 /* called with kqueue lock held */
@@ -8427,70 +7649,41 @@ knote_unsuppress(struct knote *kn)
        if ((kn->kn_status & KN_SUPPRESSED) == 0)
                return;
 
-       /* Clear the sync qos on the knote */
-       knote_adjust_sync_qos(kn, THREAD_QOS_UNSPECIFIED, FALSE);
-
        kn->kn_status &= ~KN_SUPPRESSED;
-       suppressq = kqueue_get_suppressed_queue(kq, knote_get_qos_index(kn));
+       suppressq = kqueue_get_suppressed_queue(kq, kn);
        TAILQ_REMOVE(suppressq, kn, kn_tqe);
 
-       /* udate in-use qos to equal requested qos */
-       kn->kn_qos_index = kn->kn_req_index;
+       /*
+        * If the knote is no longer active, reset its push,
+        * and resynchronize kn_qos_index with kn_qos_override
+        */
+       if ((kn->kn_status & KN_ACTIVE) == 0) {
+               kn->kn_qos_override = kn->kn_req_index;
+       }
+       kn->kn_qos_index = kn->kn_qos_override;
 
        /* don't wakeup if unsuppressing just a stay-active knote */
        if (knote_enqueue(kn) && (kn->kn_status & KN_ACTIVE)) {
                knote_wakeup(kn);
        }
 
-       if ((kq->kq_state & KQ_WORKLOOP) && !(kq->kq_state & KQ_NO_WQ_THREAD) &&
-            knote_get_qos_override_index(kn) == THREAD_QOS_USER_INTERACTIVE &&
-            kn->kn_qos_override_is_sync) {
+       if ((kq->kq_state & KQ_WORKLOOP) && TAILQ_EMPTY(suppressq)) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
 
-               /* update the sync qos override counter for suppressed knotes */
-               kqworkloop_update_override(kqwl, knote_get_qos_index(kn),
-                       knote_get_qos_override_index(kn),
-                       (KQWL_UO_UPDATE_SUPPRESS_SYNC_COUNTERS | KQWL_UO_OLD_OVERRIDE_IS_SYNC_UI));
-       }
-
-       if (TAILQ_EMPTY(suppressq) && (kq->kq_state & KQ_WORKLOOP) &&
-                       !(kq->kq_state & KQ_NO_WQ_THREAD)) {
-               struct kqworkloop *kqwl = (struct kqworkloop *)kq;
                if (kqworkloop_is_processing_on_current_thread(kqwl)) {
                        /*
-                        * kqworkloop_end_processing() will perform the required QoS
-                        * computations when it unsets the processing mode.
+                        * kqworkloop_end_processing() or kqworkloop_begin_processing()
+                        * will perform the required QoS computations when the processing
+                        * mode changes.
                         */
                } else {
-                       kqwl_req_lock(kqwl);
+                       kq_req_lock(kqwl);
                        kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_RESET_WAKEUP_OVERRIDE, 0);
-                       kqwl_req_unlock(kqwl);
+                       kq_req_unlock(kqwl);
                }
        }
 }
 
-/* called with kqueue lock held */
-static void
-knote_update_sync_override_state(struct knote *kn)
-{
-       struct kqtailq *queue = knote_get_queue(kn);
-       struct kqueue *kq = knote_get_kq(kn);
-
-       if (!(kq->kq_state & KQ_WORKLOOP) ||
-           knote_get_queue_index(kn) != THREAD_QOS_USER_INTERACTIVE)
-               return;
-
-       /* Update the sync ipc state on workloop */
-       struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-       boolean_t sync_ipc_override = FALSE;
-       if (!TAILQ_EMPTY(queue)) {
-               struct knote *kn_head = TAILQ_FIRST(queue);
-               if (kn_head->kn_qos_override_is_sync)
-                       sync_ipc_override = TRUE;
-       }
-       kqworkloop_update_sync_override_state(kqwl, sync_ipc_override);
-}
-
 /* called with kqueue lock held */
 static int
 knote_enqueue(struct knote *kn)
@@ -8504,15 +7697,9 @@ knote_enqueue(struct knote *kn)
                struct kqueue *kq = knote_get_kq(kn);
 
                kqlock_held(kq);
-               /* insert at head for sync ipc waiters */
-               if (kn->kn_qos_override_is_sync) {
-                       TAILQ_INSERT_HEAD(queue, kn, kn_tqe);
-               } else {
-                       TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
-               }
+               TAILQ_INSERT_TAIL(queue, kn, kn_tqe);
                kn->kn_status |= KN_QUEUED;
                kq->kq_count++;
-               knote_update_sync_override_state(kn);
                return 1;
        }
        return ((kn->kn_status & KN_STAYACTIVE) != 0);
@@ -8535,7 +7722,6 @@ knote_dequeue(struct knote *kn)
        TAILQ_REMOVE(queue, kn, kn_tqe);
        kn->kn_status &= ~KN_QUEUED;
        kq->kq_count--;
-       knote_update_sync_override_state(kn);
 }
 
 void
@@ -8561,12 +7747,6 @@ knote_init(void)
        /* Allocate kq lock attribute */
        kq_lck_attr = lck_attr_alloc_init();
 
-       /* Initialize the timer filter lock */
-       lck_mtx_init(&_filt_timerlock, kq_lck_grp, kq_lck_attr);
-
-       /* Initialize the user filter lock */
-       lck_spin_init(&_filt_userlock, kq_lck_grp, kq_lck_attr);
-
 #if CONFIG_MEMORYSTATUS
        /* Initialize the memorystatus list lock */
        memorystatus_kevent_init(kq_lck_grp, kq_lck_attr);
@@ -8583,15 +7763,16 @@ knote_fops(struct knote *kn)
 static struct knote *
 knote_alloc(void)
 {
-       struct knote *kn;
-       kn = ((struct knote *)zalloc(knote_zone));
-       *kn = (struct knote) { .kn_qos_override = 0, .kn_qos_sync_override = 0, .kn_qos_override_is_sync = 0 };
+       struct knote *kn = ((struct knote *)zalloc(knote_zone));
+       bzero(kn, sizeof(struct knote));
        return kn;
 }
 
 static void
 knote_free(struct knote *kn)
 {
+       assert(kn->kn_inuse == 0);
+       assert((kn->kn_status & KN_LOCKED) == 0);
        zfree(knote_zone, kn);
 }
 
@@ -8623,7 +7804,7 @@ static lck_rw_t *kev_rwlock = &kev_lck_data;
 static int kev_attach(struct socket *so, int proto, struct proc *p);
 static int kev_detach(struct socket *so);
 static int kev_control(struct socket *so, u_long cmd, caddr_t data,
-    struct ifnet *ifp, struct proc *p);
+               struct ifnet *ifp, struct proc *p);
 static lck_mtx_t * event_getlock(struct socket *, int);
 static int event_lock(struct socket *, int, void *);
 static int event_unlock(struct socket *, int, void *);
@@ -8658,8 +7839,8 @@ SYSCTL_NODE(_net_systm, OID_AUTO, kevt,
 
 struct kevtstat kevtstat;
 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, stats,
-    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
-    kevt_getstat, "S,kevtstat", "");
+               CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
+               kevt_getstat, "S,kevtstat", "");
 
 SYSCTL_PROC(_net_systm_kevt, OID_AUTO, pcblist,
        CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
@@ -8906,7 +8087,7 @@ kev_detach(struct socket *so)
  */
 errno_t kev_vendor_code_find(
        const char      *string,
-       u_int32_t       *out_vendor_code)
+       u_int32_t       *out_vendor_code)
 {
        if (strlen(string) >= KEV_VENDOR_CODE_MAX_STR_LEN) {
                return (EINVAL);
@@ -8925,7 +8106,7 @@ kev_msg_post(struct kev_msg *event_msg)
        if (event_msg == NULL)
                return (EINVAL);
 
-       /* 
+       /*
         * Limit third parties to posting events for registered vendor codes
         * only
         */
@@ -9050,10 +8231,10 @@ kev_post_msg(struct kev_msg *event_msg)
 
 static int
 kev_control(struct socket *so,
-    u_long cmd,
-    caddr_t data,
-    __unused struct ifnet *ifp,
-    __unused struct proc *p)
+               u_long cmd,
+               caddr_t data,
+               __unused struct ifnet *ifp,
+               __unused struct proc *p)
 {
        struct kev_request *kev_req = (struct kev_request *) data;
        struct kern_event_pcb  *ev_pcb;
@@ -9255,6 +8436,7 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
 {
        struct kqworkloop *kqwl = (struct kqworkloop *)kq;
        struct kqrequest *kqr = &kqwl->kqwl_request;
+       workq_threadreq_param_t trp = {};
        int err;
 
        if ((kq->kq_state & KQ_WORKLOOP) == 0) {
@@ -9265,25 +8447,33 @@ fill_kqueue_dyninfo(struct kqueue *kq, struct kqueue_dyninfo *kqdi)
                return err;
        }
 
-       kqwl_req_lock(kqwl);
-
-       if (kqr->kqr_thread) {
-               kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
-       }
-
-       if (kqwl->kqwl_owner == WL_OWNER_SUSPENDED) {
-               kqdi->kqdi_owner = ~0ull;
-       } else {
-               kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
-       }
+       kq_req_lock(kqwl);
 
+       kqdi->kqdi_servicer = thread_tid(kqr->kqr_thread);
+       kqdi->kqdi_owner = thread_tid(kqwl->kqwl_owner);
        kqdi->kqdi_request_state = kqr->kqr_state;
        kqdi->kqdi_async_qos = kqr->kqr_qos_index;
        kqdi->kqdi_events_qos = kqr->kqr_override_index;
        kqdi->kqdi_sync_waiters = kqr->kqr_dsync_waiters;
-       kqdi->kqdi_sync_waiter_qos = kqr->kqr_dsync_waiters_qos;
+       kqdi->kqdi_sync_waiter_qos = 0;
+
+       trp.trp_value = kqwl->kqwl_params;
+       if (trp.trp_flags & TRP_PRIORITY)
+               kqdi->kqdi_pri = trp.trp_pri;
+       else
+               kqdi->kqdi_pri = 0;
 
-       kqwl_req_unlock(kqwl);
+       if (trp.trp_flags & TRP_POLICY)
+               kqdi->kqdi_pol = trp.trp_pol;
+       else
+               kqdi->kqdi_pol = 0;
+
+       if (trp.trp_flags & TRP_CPUPERCENT)
+               kqdi->kqdi_cpupercent = trp.trp_cpupercent;
+       else
+               kqdi->kqdi_cpupercent = 0;
+
+       kq_req_unlock(kqwl);
 
        return 0;
 }
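
/*
 * A small sketch of the flags-guarded parameter unpacking used just above,
 * with stand-in names: the real workq_threadreq_param_t layout and TRP_*
 * bits are private to the pthread/workqueue code, so SAMPLE_TRP_* and the
 * structs below are illustrative only.  Any field whose flag is not set is
 * reported as 0, mirroring how kqdi_pri/kqdi_pol/kqdi_cpupercent are filled.
 */
#include <stdint.h>
#include <stdio.h>

#define SAMPLE_TRP_PRIORITY    0x1u
#define SAMPLE_TRP_POLICY      0x2u
#define SAMPLE_TRP_CPUPERCENT  0x4u

struct sample_trp {
    uint32_t flags;
    int32_t  pri;
    int32_t  pol;
    int32_t  cpupercent;
};

struct sample_dyninfo {
    int32_t pri, pol, cpupercent;
};

static void
sample_fill_dyninfo(const struct sample_trp *trp, struct sample_dyninfo *di)
{
    di->pri        = (trp->flags & SAMPLE_TRP_PRIORITY)   ? trp->pri        : 0;
    di->pol        = (trp->flags & SAMPLE_TRP_POLICY)     ? trp->pol        : 0;
    di->cpupercent = (trp->flags & SAMPLE_TRP_CPUPERCENT) ? trp->cpupercent : 0;
}

int
main(void)
{
    /* Only the priority was set explicitly; policy and CPU percent were not. */
    struct sample_trp trp = { .flags = SAMPLE_TRP_PRIORITY, .pri = 47 };
    struct sample_dyninfo di;

    sample_fill_dyninfo(&trp, &di);
    printf("pri=%d pol=%d cpupercent=%d\n", di.pri, di.pol, di.cpupercent);
    return 0;
}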
@@ -9293,6 +8483,7 @@ void
 knote_markstayactive(struct knote *kn)
 {
        struct kqueue *kq = knote_get_kq(kn);
+       kq_index_t qos;
 
        kqlock(kq);
        kn->kn_status |= KN_STAYACTIVE;
@@ -9302,20 +8493,28 @@ knote_markstayactive(struct knote *kn)
         * established before it is fully attached.
         */
        assert(kn->kn_status & KN_ATTACHING);
+       assert((kn->kn_status & (KN_QUEUED | KN_SUPPRESSED)) == 0);
 
        /* handle all stayactive knotes on the (appropriate) manager */
        if (kq->kq_state & KQ_WORKQ) {
-               knote_set_qos_index(kn, KQWQ_QOS_MANAGER);
+               qos = KQWQ_QOS_MANAGER;
        } else if (kq->kq_state & KQ_WORKLOOP) {
                struct kqworkloop *kqwl = (struct kqworkloop *)kq;
-               kqwl_req_lock(kqwl);
-               assert(kn->kn_req_index && kn->kn_req_index < THREAD_QOS_LAST);
-               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS,
-                               kn->kn_req_index);
-               kqwl_req_unlock(kqwl);
-               knote_set_qos_index(kn, KQWL_BUCKET_STAYACTIVE);
+
+               qos = _pthread_priority_thread_qos(kn->kn_qos);
+               assert(qos && qos < THREAD_QOS_LAST);
+               kq_req_lock(kq);
+               kqworkloop_update_threads_qos(kqwl, KQWL_UTQ_UPDATE_STAYACTIVE_QOS, qos);
+               kq_req_unlock(kq);
+               qos = KQWL_BUCKET_STAYACTIVE;
+       } else {
+               qos = THREAD_QOS_UNSPECIFIED;
        }
 
+       kn->kn_req_index = qos;
+       kn->kn_qos_override = qos;
+       kn->kn_qos_index = qos;
+
        knote_activate(kn);
        kqunlock(kq);
 }
@@ -9546,7 +8745,7 @@ pid_kqueue_extinfo(proc_t p, struct kqueue *kq, user_addr_t ubuf,
        assert(bufsize >= sizeof(struct kevent_extinfo) * min(buflen, nknotes));
        err = copyout(kqext, ubuf, sizeof(struct kevent_extinfo) * min(buflen, nknotes));
 
- out:
+out:
        if (kqext) {
                kfree(kqext, buflen * sizeof(struct kevent_extinfo));
                kqext = NULL;
@@ -9631,14 +8830,6 @@ kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize)
        return (int)nuptrs;
 }
 
-static void
-kevent_redrive_proc_thread_request(proc_t p)
-{
-       __assert_only int ret;
-       ret = (*pthread_functions->workq_threadreq)(p, NULL, WORKQ_THREADREQ_REDRIVE, 0, 0);
-       assert(ret == 0 || ret == ECANCELED);
-}
-
 static void
 kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
 {
@@ -9649,12 +8840,8 @@ kevent_set_return_to_kernel_user_tsd(proc_t p, thread_t thread)
        uint64_t ast_flags64 = 0;
        struct uthread *ut = get_bsdthread_info(thread);
 
-       if (ut->uu_kqueue_bound != NULL) {
-               if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKLOOP) {
-                       ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
-               } else if (ut->uu_kqueue_flags & KEVENT_FLAG_WORKQ) {
-                       ast_flags64 |= R2K_WORKQ_PENDING_EVENTS;
-               }
+       if (ut->uu_kqr_bound != NULL) {
+               ast_flags64 |= R2K_WORKLOOP_PENDING_EVENTS;
        }
 
        if (ast_flags64 == 0) {
@@ -9685,7 +8872,7 @@ kevent_ast(thread_t thread, uint16_t bits)
        proc_t p = current_proc();
 
        if (bits & AST_KEVENT_REDRIVE_THREADREQ) {
-               kevent_redrive_proc_thread_request(p);
+               workq_kern_threadreq_redrive(p, WORKQ_THREADREQ_CAN_CREATE_THREADS);
        }
        if (bits & AST_KEVENT_RETURN_TO_KERNEL) {
                kevent_set_return_to_kernel_user_tsd(p, thread);
@@ -9702,8 +8889,6 @@ kevent_sysctl SYSCTL_HANDLER_ARGS
 #pragma unused(oidp, arg2)
        uintptr_t type = (uintptr_t)arg1;
        uint64_t bound_id = 0;
-       struct uthread *ut;
-       struct kqueue *kq;
 
        if (type != KEVENT_SYSCTL_BOUND_ID) {
                return EINVAL;
@@ -9713,16 +8898,16 @@ kevent_sysctl SYSCTL_HANDLER_ARGS
                return EINVAL;
        }
 
-       ut = get_bsdthread_info(current_thread());
+       struct uthread *ut = get_bsdthread_info(current_thread());
        if (!ut) {
                return EFAULT;
        }
 
-       kq = ut->uu_kqueue_bound;
-       if (kq) {
-               if (kq->kq_state & KQ_WORKLOOP) {
-                       bound_id = ((struct kqworkloop *)kq)->kqwl_dynamicid;
-               } else if (kq->kq_state & KQ_WORKQ) {
+       struct kqrequest *kqr = ut->uu_kqr_bound;
+       if (kqr) {
+               if (kqr->kqr_state & KQR_WORKLOOP) {
+                       bound_id = kqr_kqworkloop(kqr)->kqwl_dynamicid;
+               } else {
                        bound_id = -1;
                }
        }
index e5ac6b2c7c48a1f663e25852396140eeddcb5210..5e145cfac6b2c1e18946099053f9bbd5b59bf715 100644 (file)
 #include <sys/kern_memorystatus.h>
 #endif
 
+extern boolean_t vm_darkwake_mode;
+
 #if CONFIG_DTRACE
 /* Do not include dtrace.h, it redefines kmem_[alloc/free] */
 extern void dtrace_proc_exec(proc_t);
@@ -177,7 +179,13 @@ static void (*dtrace_proc_waitfor_hook)(proc_t) = NULL;
 #endif
 
 /* support for child creation in exec after vfork */
-thread_t fork_create_child(task_t parent_task, coalition_t *parent_coalition, proc_t child_proc, int inherit_memory, int is64bit, int in_exec);
+thread_t fork_create_child(task_t parent_task,
+                                                  coalition_t *parent_coalition,
+                                                  proc_t child_proc,
+                                                  int inherit_memory,
+                                                  int is_64bit_addr,
+                                                  int is_64bit_data,
+                                                  int in_exec);
 void vfork_exit(proc_t p, int rv);
 extern void proc_apply_task_networkbg_internal(proc_t, thread_t);
 extern void task_set_did_exec_flag(task_t task);
@@ -727,11 +735,10 @@ activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *resul
        int ret;
 
        task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, 0);
-       if (result->is64bit) {
-               task_set_64bit(task, TRUE);
+       task_set_64bit(task, result->is_64bit_addr, result->is_64bit_data);
+       if (result->is_64bit_addr) {
                OSBitOrAtomic(P_LP64, &p->p_flag);
        } else {
-               task_set_64bit(task, FALSE);
                OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
        }
 
@@ -833,7 +840,7 @@ exec_mach_imgact(struct image_params *imgp)
        vm_map_t old_map = VM_MAP_NULL;
        vm_map_t map = VM_MAP_NULL;
        load_return_t           lret;
-       load_result_t           load_result;
+       load_result_t           load_result = {};
        struct _posix_spawnattr *psa = NULL;
        int                     spawn = (imgp->ip_flags & IMGPF_SPAWN);
        int                     vfexec = (imgp->ip_flags & IMGPF_VFORK_EXEC);
@@ -879,8 +886,9 @@ exec_mach_imgact(struct image_params *imgp)
        thread = current_thread();
        uthread = get_bsdthread_info(thread);
 
-       if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64)
-               imgp->ip_flags |= IMGPF_IS_64BIT;
+       if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64) {
+               imgp->ip_flags |= IMGPF_IS_64BIT_ADDR | IMGPF_IS_64BIT_DATA;
+       }
 
        /* If posix_spawn binprefs exist, respect those prefs. */
        psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
@@ -913,6 +921,8 @@ grade:
                goto bad;
        }
 
+
+
        /* Copy in arguments/environment from the old process */
        error = exec_extract_strings(imgp);
        if (error)
@@ -931,7 +941,13 @@ grade:
         * new child process.
         */
        if (vfexec) {
-               imgp->ip_new_thread = fork_create_child(task, NULL, p, FALSE, (imgp->ip_flags & IMGPF_IS_64BIT), FALSE);
+               imgp->ip_new_thread = fork_create_child(task,
+                                                                                               NULL,
+                                                                                               p,
+                                                                                               FALSE,
+                                                                                               (imgp->ip_flags & IMGPF_IS_64BIT_ADDR),
+                                                                                               (imgp->ip_flags & IMGPF_IS_64BIT_DATA),
+                                                                                               FALSE);
                /* task and thread ref returned, will be released in __mac_execve */
                if (imgp->ip_new_thread == NULL) {
                        error = ENOMEM;
@@ -1002,7 +1018,7 @@ grade:
                imgp->ip_csflags |= load_result.csflags & 
                        (CS_VALID|CS_SIGNED|CS_DEV_CODE|
                         CS_HARD|CS_KILL|CS_RESTRICT|CS_ENFORCEMENT|CS_REQUIRE_LV|
-                        CS_ENTITLEMENTS_VALIDATED|CS_DYLD_PLATFORM|
+                        CS_FORCED_LV|CS_ENTITLEMENTS_VALIDATED|CS_DYLD_PLATFORM|CS_RUNTIME|
                         CS_ENTITLEMENT_FLAGS|
                         CS_EXEC_SET_HARD|CS_EXEC_SET_KILL|CS_EXEC_SET_ENFORCEMENT);
        } else {
@@ -1027,7 +1043,9 @@ grade:
        /*
         * Set up the system reserved areas in the new address space.
         */
-       vm_map_exec(map, task, load_result.is64bit, (void *)p->p_fd->fd_rdir, cpu_type());
+       int cpu_subtype;
+       cpu_subtype = 0; /* all cpu_subtypes use the same shared region */
+       vm_map_exec(map, task, load_result.is_64bit_addr, (void *)p->p_fd->fd_rdir, cpu_type(), cpu_subtype);
 
        /*
         * Close file descriptors which specify close-on-exec.
@@ -1129,7 +1147,7 @@ grade:
 
        if (load_result.dynlinker) {
                uint64_t        ap;
-               int                     new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
+               int                     new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
 
                /* Adjust the stack */
                ap = thread_adjuserstack(thread, -new_ptr_size);
@@ -1201,6 +1219,12 @@ grade:
        }
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+#if __arm64__
+       if (load_result.legacy_footprint) {
+               task_set_legacy_footprint(task, TRUE);
+       }
+#endif /* __arm64__ */
+
        pal_dbg_set_task_name(task);
 
        /*
@@ -1525,14 +1549,20 @@ encapsulated_binary:
                }
        }
 
-       /*
-        * Call out to allow 3rd party notification of exec. 
-        * Ignore result of kauth_authorize_fileop call.
-        */
-       if (error == 0 && kauth_authorize_fileop_has_listeners()) {
-               kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context),
-                                       KAUTH_FILEOP_EXEC,
-                                       (uintptr_t)ndp->ni_vp, 0);
+       if (error == 0) {
+               if (imgp->ip_flags & IMGPF_INTERPRET && ndp->ni_vp) {
+                       AUDIT_ARG(vnpath, ndp->ni_vp, ARG_VNODE2);
+               }
+
+               /*
+                * Call out to allow 3rd party notification of exec.
+                * Ignore result of kauth_authorize_fileop call.
+                */
+               if (kauth_authorize_fileop_has_listeners()) {
+                       kauth_authorize_fileop(vfs_context_ucred(imgp->ip_vfs_context),
+                                               KAUTH_FILEOP_EXEC,
+                                               (uintptr_t)ndp->ni_vp, 0);
+               }
        }
 bad:
        proc_transend(p, 0);
@@ -2228,6 +2258,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
        int portwatch_count = 0;
        ipc_port_t * portwatch_ports = NULL;
        vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports);
+       task_t old_task = current_task();
        task_t new_task = NULL;
        boolean_t should_release_proc_ref = FALSE;
        void *inherit = NULL;
@@ -2255,7 +2286,7 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
        imgp->ip_vattr = vap;
        imgp->ip_origvattr = origvap;
        imgp->ip_vfs_context = &context;
-       imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE);
+       imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE);
        imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
        imgp->ip_mac_return = 0;
        imgp->ip_px_persona = NULL;
@@ -2296,9 +2327,10 @@ posix_spawn(proc_t ap, struct posix_spawn_args *uap, int32_t *retval)
                         * This is a bit fragile: <rdar://problem/16427422>
                         */
 
-                       if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset) != 0)) 
-                       goto bad;
-               
+                       if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset)) != 0) {
+                               goto bad;
+                       }
+
                        bzero( (void *)( (unsigned long) &px_sa + px_sa_offset), sizeof(px_sa) - px_sa_offset );        
 
                        imgp->ip_px_sa = &px_sa;
@@ -2588,8 +2620,13 @@ do_fork1:
                 * During exec any transition from new_task -> proc is fine, but don't allow
                 * transition from proc->task, since it will modify old_task.
                 */
-               imgp->ip_new_thread = fork_create_child(current_task(),
-                                       NULL, p, FALSE, p->p_flag & P_LP64, TRUE);
+               imgp->ip_new_thread = fork_create_child(old_task,
+                                                                                               NULL,
+                                                                                               p,
+                                                                                               FALSE,
+                                                                                               p->p_flag & P_LP64,
+                                                                                               task_get_64bit_data(old_task),
+                                                                                               TRUE);
                /* task and thread ref returned by fork_create_child */
                if (imgp->ip_new_thread == NULL) {
                        error = ENOMEM;
@@ -2797,9 +2834,18 @@ do_fork1:
        error = exec_activate_image(imgp);
        
        if (error == 0 && !spawn_no_exec) {
-               p = proc_exec_switch_task(p, current_task(), new_task, imgp->ip_new_thread);
+               p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread);
                /* proc ref returned */
                should_release_proc_ref = TRUE;
+
+               /*
+                * Need to transfer pending watch port boosts to the new task while still making
+                * sure that the old task remains in the importance linkage. Create an importance
+                * linkage from old task to new task, then switch the task importance base
+                * of old task and new task. After the switch the port watch boost will be
+                * boosting the new task and new task will be donating importance to old task.
+                */
+               inherit = ipc_importance_exec_switch_task(old_task, new_task);
        }
 
        if (error == 0) {
@@ -2926,6 +2972,9 @@ bad:
 
                }
 #endif /* CONFIG_MEMORYSTATUS */
+               if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > 0) {
+                       task_set_thread_limit(new_task, (uint16_t)px_sa.psa_thread_limit);
+               }
        }
 
        /*
@@ -2966,14 +3015,14 @@ bad:
                error = proc_transstart(p, 0, 0);
 
                if (error == 0) {
-                       task_bank_init(get_threadtask(imgp->ip_new_thread));
+                       task_bank_init(new_task);
                        proc_transend(p, 0);
                }
        }
 
        /* Inherit task role from old task to new task for exec */
        if (error == 0 && !spawn_no_exec) {
-               proc_inherit_task_role(get_threadtask(imgp->ip_new_thread), current_task());
+               proc_inherit_task_role(new_task, old_task);
        }
 
        /*
@@ -2993,20 +3042,20 @@ bad:
        }
 
        /*
-        * Need to transfer pending watch port boosts to the new task while still making
-        * sure that the old task remains in the importance linkage. Create an importance
-        * linkage from old task to new task, then switch the task importance base
-        * of old task and new task. After the switch the port watch boost will be
-        * boosting the new task and new task will be donating importance to old task.
+        * Apply the requested maximum address.
         */
-       if (error == 0 && task_did_exec(current_task())) {
-               inherit = ipc_importance_exec_switch_task(current_task(), get_threadtask(imgp->ip_new_thread));
+       if (error == 0 && imgp->ip_px_sa != NULL) {
+               struct _posix_spawnattr *psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
+
+               if (psa->psa_max_addr) {
+                       vm_map_set_max_addr(get_task_map(new_task), psa->psa_max_addr);
+               }
        }
 
        if (error == 0) {
-               /* Apply the main thread qos */         
+               /* Apply the main thread qos */
                thread_t main_thread = imgp->ip_new_thread;
-               task_set_main_thread_qos(get_threadtask(imgp->ip_new_thread), main_thread);
+               task_set_main_thread_qos(new_task, main_thread);
 
 #if CONFIG_MACF
                /*
@@ -3014,7 +3063,7 @@ bad:
                 * a jumbo-size map.
                 */
                if (mac_proc_check_map_anon(p, 0, 0, 0, MAP_JIT, NULL) == 0) {
-                       vm_map_set_jumbo(get_task_map(p->task));
+                       vm_map_set_jumbo(get_task_map(new_task));
                }
 #endif /* CONFIG_MACF */
        }
@@ -3129,11 +3178,22 @@ bad:
                (*dtrace_proc_waitfor_hook)(p);
        }
 #endif
+
+#if CONFIG_AUDIT
+       if (!error && AUDIT_ENABLED() && p) {
+               /* Add the CDHash of the new process to the audit record */
+               uint8_t *cdhash = cs_get_cdhash(p);
+               if (cdhash) {
+                       AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN);
+               }
+       }
+#endif
+
        /*
         * clear bsd_info from old task if it did exec.
         */
-       if (task_did_exec(current_task())) {
-               set_bsdtask_info(current_task(), NULL);
+       if (task_did_exec(old_task)) {
+               set_bsdtask_info(old_task, NULL);
        }
 
        /* clear bsd_info from new task and terminate it if exec failed  */
@@ -3177,9 +3237,9 @@ bad:
         * switch the tasks, terminating the current task without the switch would
         * result in losing the SIGKILL status.
         */
-       if (task_did_exec(current_task())) {
+       if (task_did_exec(old_task)) {
                /* Terminate the current task, since exec will start in new task */
-               task_terminate_internal(current_task());
+               task_terminate_internal(old_task);
        }
 
        /* Release the thread ref returned by fork_create_child/fork1 */
@@ -3413,6 +3473,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        int is_64 = IS_64BIT_PROCESS(p);
        struct vfs_context context;
        struct uthread  *uthread;
+       task_t old_task = current_task();
        task_t new_task = NULL;
        boolean_t should_release_proc_ref = FALSE;
        boolean_t exec_done = FALSE;
@@ -3441,7 +3502,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        imgp->ip_vattr = vap;
        imgp->ip_origvattr = origvap;
        imgp->ip_vfs_context = &context;
-       imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE);
+       imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE) | ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE);
        imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
        imgp->ip_mac_return = 0;
        imgp->ip_cs_error = OS_REASON_NULL;
@@ -3487,8 +3548,13 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                 * During exec any transition from new_task -> proc is fine, but don't allow
                 * transition from proc->task, since it will modify old_task.
                 */
-               imgp->ip_new_thread = fork_create_child(current_task(),
-                                       NULL, p, FALSE, p->p_flag & P_LP64, TRUE);
+               imgp->ip_new_thread = fork_create_child(old_task,
+                                                                                               NULL,
+                                                                                               p,
+                                                                                               FALSE,
+                                                                                               p->p_flag & P_LP64,
+                                                                                               task_get_64bit_data(old_task),
+                                                                                               TRUE);
                /* task and thread ref returned by fork_create_child */
                if (imgp->ip_new_thread == NULL) {
                        error = ENOMEM;
@@ -3511,9 +3577,18 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        }
 
        if (!error && !in_vfexec) {
-               p = proc_exec_switch_task(p, current_task(), new_task, imgp->ip_new_thread);
+               p = proc_exec_switch_task(p, old_task, new_task, imgp->ip_new_thread);
                /* proc ref returned */
                should_release_proc_ref = TRUE;
+
+               /*
+                * Need to transfer pending watch port boosts to the new task while still making
+                * sure that the old task remains in the importance linkage. Create an importance
+                * linkage from old task to new task, then switch the task importance base
+                * of old task and new task. After the switch the port watch boost will be
+                * boosting the new task and new task will be donating importance to old task.
+                */
+               inherit = ipc_importance_exec_switch_task(old_task, new_task);
        }
 
        kauth_cred_unref(&context.vc_ucred);
@@ -3562,7 +3637,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
        }
 
        if (!error) {
-               task_bank_init(get_threadtask(imgp->ip_new_thread));
+               task_bank_init(new_task);
                proc_transend(p, 0);
 
                /* Sever any extant thread affinity */
@@ -3570,7 +3645,7 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
 
                /* Inherit task role from old task to new task for exec */
                if (!in_vfexec) {
-                       proc_inherit_task_role(get_threadtask(imgp->ip_new_thread), current_task());
+                       proc_inherit_task_role(new_task, old_task);
                }
 
                thread_t main_thread = imgp->ip_new_thread;
@@ -3587,6 +3662,14 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                }
 #endif /* CONFIG_MACF */
 
+               if (vm_darkwake_mode == TRUE) {
+                       /*
+                        * This process is being launched when the system
+                        * is in darkwake. So mark it specially. This will
+                        * cause all its pages to be entered in the background Q.
+                        */
+                       task_set_darkwake_mode(new_task, vm_darkwake_mode);
+               }
 
 #if CONFIG_DTRACE
                dtrace_thread_didexec(imgp->ip_new_thread);
@@ -3595,6 +3678,16 @@ __mac_execve(proc_t p, struct __mac_execve_args *uap, int32_t *retval)
                        (*dtrace_proc_waitfor_hook)(p);
 #endif
 
+#if CONFIG_AUDIT
+               if (!error && AUDIT_ENABLED() && p) {
+                       /* Add the CDHash of the new process to the audit record */
+                       uint8_t *cdhash = cs_get_cdhash(p);
+                       if (cdhash) {
+                               AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN);
+                       }
+               }
+#endif
+
                if (in_vfexec) {
                        vfork_return(p, retval, p->p_pid);
                }
@@ -3607,8 +3700,8 @@ exit_with_error:
        /*
         * clear bsd_info from old task if it did exec.
         */
-       if (task_did_exec(current_task())) {
-               set_bsdtask_info(current_task(), NULL);
+       if (task_did_exec(old_task)) {
+               set_bsdtask_info(old_task, NULL);
        }
 
        /* clear bsd_info from new task and terminate it if exec failed  */
@@ -3617,26 +3710,15 @@ exit_with_error:
                task_terminate_internal(new_task);
        }
 
-       /*
-        * Need to transfer pending watch port boosts to the new task while still making
-        * sure that the old task remains in the importance linkage. Create an importance
-        * linkage from old task to new task, then switch the task importance base
-        * of old task and new task. After the switch the port watch boost will be
-        * boosting the new task and new task will be donating importance to old task.
-        */
-       if (error == 0 && task_did_exec(current_task())) {
-               inherit = ipc_importance_exec_switch_task(current_task(), get_threadtask(imgp->ip_new_thread));
-       }
-
        if (imgp != NULL) {
                /*
                 * Do not terminate the current task, if proc_exec_switch_task did not
                 * switch the tasks, terminating the current task without the switch would
         * result in losing the SIGKILL status.
                 */
-               if (task_did_exec(current_task())) {
+               if (task_did_exec(old_task)) {
                        /* Terminate the current task, since exec will start in new task */
-                       task_terminate_internal(current_task());
+                       task_terminate_internal(old_task);
                }
 
                /* Release the thread ref returned by fork_create_child */
@@ -3813,7 +3895,7 @@ static int
 exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp)
 {
        proc_t p = vfs_context_proc(imgp->ip_vfs_context);
-       int     ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
+       int     ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
        int     ptr_area_size;
        void *ptr_buffer_start, *ptr_buffer;
        int string_size;
@@ -3887,8 +3969,7 @@ exec_copyout_strings(struct image_params *imgp, user_addr_t *stackp)
         * Need room for one pointer for each string, plus
         * one for the NULLs terminating the argv, envv, and apple areas.
         */
-       ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) *
-           ptr_size;
+       ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + 3) * ptr_size;
        stack -= ptr_area_size;
        ptr_area = stack;
 
@@ -4014,8 +4095,8 @@ static int
 exec_extract_strings(struct image_params *imgp)
 {
        int error = 0;
-       int     ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT) ? 8 : 4;
-       int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
+       int     ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT_ADDR) ? 8 : 4;
+       int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
        user_addr_t     argv = imgp->ip_user_argv;
        user_addr_t     envv = imgp->ip_user_envv;
 
@@ -4220,6 +4301,12 @@ bad:
 #define        ENTROPY_VALUES 2
 #define ENTROPY_KEY "malloc_entropy="
 
+/*
+ * libplatform needs a random pointer-obfuscation value when it is initialized.
+ */
+#define PTR_MUNGE_VALUES 1
+#define PTR_MUNGE_KEY "ptr_munge="
+
 /*
  * System malloc engages nanozone for UIAPP.
  */
@@ -4278,7 +4365,7 @@ exec_add_apple_strings(struct image_params *imgp,
                       const load_result_t *load_result)
 {
        int error;
-       int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT) ? 8 : 4;
+       int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? 8 : 4;
 
        /* exec_save_path stored the first string */
        imgp->ip_applec = 1;
@@ -4335,6 +4422,16 @@ exec_add_apple_strings(struct image_params *imgp,
        }
        imgp->ip_applec++;
 
+       /*
+        * Supply libpthread & libplatform with a random value to use for pointer
+        * obfuscation.
+        */
+       error = exec_add_entropy_key(imgp, PTR_MUNGE_KEY, PTR_MUNGE_VALUES, FALSE);
+       if (error) {
+               goto bad;
+       }
+       imgp->ip_applec++;
+
        /* 
         * Add MAIN_STACK_KEY: Supplies the address and size of the main thread's
         * stack if it was allocated by the kernel.
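A rough consumer-side sketch, not part of this commit, of how a runtime such as libpthread or libplatform might read the new ptr_munge value back out of the apple string array that execve() places after envp; the "ptr_munge=0x<hex>" formatting is assumed by analogy with the malloc_entropy= key, and the helper name is hypothetical:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical user-space helper: locate and parse "ptr_munge=<value>" in the
 * apple string array.  Assumes a single 0x-prefixed hex value, matching
 * PTR_MUNGE_VALUES == 1 above. */
static uint64_t
parse_ptr_munge(const char **apple)
{
        static const char key[] = "ptr_munge=";

        for (size_t i = 0; apple != NULL && apple[i] != NULL; i++) {
                if (strncmp(apple[i], key, sizeof(key) - 1) == 0) {
                        return (uint64_t)strtoull(apple[i] + sizeof(key) - 1, NULL, 0);
                }
        }
        return 0;       /* no entry: caller falls back to an unobfuscated value */
}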
@@ -5050,7 +5147,7 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path)
        }
 
        if (proc_is64bit(p)) {
-               user64_addr_t argv64bit[3];
+               user64_addr_t argv64bit[3] = {};
 
                argv64bit[0] = argv0;
                argv64bit[1] = argv1;
@@ -5060,7 +5157,7 @@ load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path)
                if (error)
                        return error;
        } else {
-               user32_addr_t argv32bit[3];
+               user32_addr_t argv32bit[3] = {};
 
                argv32bit[0] = (user32_addr_t)argv0;
                argv32bit[1] = (user32_addr_t)argv1;
@@ -5694,7 +5791,7 @@ static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, loa
                 FALSE, VM_KERN_MEMORY_NONE,
                 THREAD_UNINT, NULL, 0);
        
-       if (imgp->ip_flags & IMGPF_IS_64BIT) {
+       if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) {
                expected_all_image_infos_size = sizeof(struct user64_dyld_all_image_infos);
        } else {
                expected_all_image_infos_size = sizeof(struct user32_dyld_all_image_infos);
@@ -5740,7 +5837,7 @@ static void exec_prefault_data(proc_t p __unused, struct image_params *imgp, loa
                        user_addr_t dyld_all_image_infos_address;
                        user_addr_t dyld_slide_amount;
 
-                       if (imgp->ip_flags & IMGPF_IS_64BIT) {
+                       if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) {
                                notification_address = all_image_infos.infos64.notification;
                                dyld_image_address = all_image_infos.infos64.dyldImageLoadAddress;
                                dyld_version_address = all_image_infos.infos64.dyldVersion;
index b2e226f0691012b2872e656b259c19d19079fdde..edffa18547d1003681f36a951174c6f45531d9fd 100644 (file)
@@ -312,6 +312,21 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
        unsigned int pflags = 0;
        uint64_t max_footprint_mb;
        uint64_t max_footprint;
+
+    uint64_t ledger_internal;
+    uint64_t ledger_internal_compressed;
+    uint64_t ledger_iokit_mapped;
+    uint64_t ledger_alternate_accounting;
+    uint64_t ledger_alternate_accounting_compressed;
+    uint64_t ledger_purgeable_nonvolatile;
+    uint64_t ledger_purgeable_nonvolatile_compressed;
+    uint64_t ledger_page_table;
+    uint64_t ledger_phys_footprint;
+    uint64_t ledger_phys_footprint_lifetime_max;
+    uint64_t ledger_network_nonvolatile;
+    uint64_t ledger_network_nonvolatile_compressed;
+    uint64_t ledger_wired_mem;
+
        void *crash_info_ptr = task_get_corpseinfo(corpse_task);
 
 #if CONFIG_MEMORYSTATUS
@@ -412,6 +427,72 @@ populate_corpse_crashinfo(proc_t p, task_t corpse_task, struct rusage_superset *
                kcdata_memcpy(crash_info_ptr, uaddr, &max_footprint_mb, sizeof(max_footprint_mb));
        }
 
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX, sizeof(ledger_phys_footprint_lifetime_max), &uaddr)) {
+        ledger_phys_footprint_lifetime_max = get_task_phys_footprint_lifetime_max(p->task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_phys_footprint_lifetime_max, sizeof(ledger_phys_footprint_lifetime_max));
+    }
+
+    // In the forking case, the current ledger info is copied into the corpse while the original task is suspended for consistency
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_INTERNAL, sizeof(ledger_internal), &uaddr)) {
+        ledger_internal = get_task_internal(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_internal, sizeof(ledger_internal));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED, sizeof(ledger_internal_compressed), &uaddr)) {
+        ledger_internal_compressed = get_task_internal_compressed(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_internal_compressed, sizeof(ledger_internal_compressed));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_IOKIT_MAPPED, sizeof(ledger_iokit_mapped), &uaddr)) {
+        ledger_iokit_mapped = get_task_iokit_mapped(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_iokit_mapped, sizeof(ledger_iokit_mapped));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING, sizeof(ledger_alternate_accounting), &uaddr)) {
+        ledger_alternate_accounting = get_task_alternate_accounting(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_alternate_accounting, sizeof(ledger_alternate_accounting));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED, sizeof(ledger_alternate_accounting_compressed), &uaddr)) {
+        ledger_alternate_accounting_compressed = get_task_alternate_accounting_compressed(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_alternate_accounting_compressed, sizeof(ledger_alternate_accounting_compressed));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE, sizeof(ledger_purgeable_nonvolatile), &uaddr)) {
+        ledger_purgeable_nonvolatile = get_task_purgeable_nonvolatile(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_purgeable_nonvolatile, sizeof(ledger_purgeable_nonvolatile));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED, sizeof(ledger_purgeable_nonvolatile_compressed), &uaddr)) {
+        ledger_purgeable_nonvolatile_compressed = get_task_purgeable_nonvolatile_compressed(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_purgeable_nonvolatile_compressed, sizeof(ledger_purgeable_nonvolatile_compressed));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PAGE_TABLE, sizeof(ledger_page_table), &uaddr)) {
+        ledger_page_table = get_task_page_table(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_page_table, sizeof(ledger_page_table));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT, sizeof(ledger_phys_footprint), &uaddr)) {
+        ledger_phys_footprint = get_task_phys_footprint(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_phys_footprint, sizeof(ledger_phys_footprint));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE, sizeof(ledger_network_nonvolatile), &uaddr)) {
+        ledger_network_nonvolatile = get_task_network_nonvolatile(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_network_nonvolatile, sizeof(ledger_network_nonvolatile));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED, sizeof(ledger_network_nonvolatile_compressed), &uaddr)) {
+        ledger_network_nonvolatile_compressed = get_task_network_nonvolatile_compressed(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_network_nonvolatile_compressed, sizeof(ledger_network_nonvolatile_compressed));
+    }
+
+    if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, TASK_CRASHINFO_LEDGER_WIRED_MEM, sizeof(ledger_wired_mem), &uaddr)) {
+        ledger_wired_mem = get_task_wired_mem(corpse_task);
+        kcdata_memcpy(crash_info_ptr, uaddr, &ledger_wired_mem, sizeof(ledger_wired_mem));
+    }
+
        bzero(&pwqinfo, sizeof(struct proc_workqueueinfo));
        retval = fill_procworkqueue(p, &pwqinfo);
        if (retval == 0) {
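The twelve ledger blocks added above all follow the same reserve-then-copy kcdata pattern; a local helper macro along these lines could express it once. The macro name is hypothetical and it captures crash_info_ptr and uaddr from the enclosing function, but kcdata_get_memory_addr() and kcdata_memcpy() are used exactly as in the surrounding code:

/* Hypothetical helper, shown only to make the repeated pattern explicit. */
#define CORPSE_LEDGER_ENTRY(tag, var, getter, task)                                 \
        do {                                                                        \
                if (KERN_SUCCESS == kcdata_get_memory_addr(crash_info_ptr, (tag),  \
                    sizeof(var), &uaddr)) {                                        \
                        (var) = getter(task);                                      \
                        kcdata_memcpy(crash_info_ptr, uaddr, &(var), sizeof(var)); \
                }                                                                   \
        } while (0)

/* Example use, equivalent to the last block above:
 *   CORPSE_LEDGER_ENTRY(TASK_CRASHINFO_LEDGER_WIRED_MEM, ledger_wired_mem,
 *                       get_task_wired_mem, corpse_task);
 */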
@@ -614,7 +695,7 @@ abort_with_payload_internal(proc_t p,
                                        reason_code, 0, 0);
 
        exit_reason = build_userspace_exit_reason(reason_namespace, reason_code,
-                       payload, payload_size, reason_string, reason_flags);
+                       payload, payload_size, reason_string, reason_flags | OS_REASON_FLAG_ABORT);
 
        if (internal_flags & OS_REASON_IFLAG_USER_FAULT) {
                mach_exception_code_t code = 0;
@@ -1065,7 +1146,7 @@ proc_exit(proc_t p)
        /* if any pending cpu limits action, clear it */
        task_clear_cpuusage(p->task, TRUE);
 
-       workqueue_mark_exiting(p);
+       workq_mark_exiting(p);
 
        _aio_exit( p );
 
@@ -1079,7 +1160,7 @@ proc_exit(proc_t p)
         * Once all the knotes, kqueues & workloops are destroyed, get rid of the
         * workqueue.
         */
-       workqueue_exit(p);
+       workq_exit(p);
 
        if (uth->uu_lowpri_window) {
                /*
@@ -1361,8 +1442,6 @@ proc_exit(proc_t p)
        proc_limitdrop(p, 1);
        p->p_limit = NULL;
 
-       vm_purgeable_disown(p->task);
-
        /*
         * Finish up by terminating the task
         * and halt this thread (only if a
@@ -1432,6 +1511,7 @@ proc_exit(proc_t p)
                 * The write is to an int and is coherent. Also parent is
                 *  keyed off of list lock for reaping
                 */
+               DTRACE_PROC2(exited, proc_t, p, int, exitval);
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
                        BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END,
                        pid, exitval, 0, 0, 0);
@@ -1455,6 +1535,7 @@ proc_exit(proc_t p)
                 * The write is to an int and is coherent. Also parent is
                 *  keyed off of list lock for reaping
                 */
+               DTRACE_PROC2(exited, proc_t, p, int, exitval);
                proc_list_lock();
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON,
                        BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END,
@@ -1716,7 +1797,7 @@ wait1continue(int result)
        thread = current_thread();
        uth = (struct uthread *)get_bsdthread_info(thread);
 
-       wait4_data = &uth->uu_kevent.uu_wait4_data;
+       wait4_data = &uth->uu_save.uus_wait4_data;
        uap = wait4_data->args;
        retval = wait4_data->retval;
        return(wait4_nocancel(p, uap, retval));
@@ -1763,6 +1844,14 @@ loop1:
                /* XXX This is racy because we don't get the lock!!!! */
 
                if (p->p_listflag & P_LIST_WAITING) {
+
+                       /* we're not using a continuation here but we still need to stash
+                        * the args for stackshot. */
+                       uth = current_uthread();
+                       wait4_data = &uth->uu_save.uus_wait4_data;
+                       wait4_data->args = uap;
+                       thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess);
+
                        (void)msleep(&p->p_stat, proc_list_mlock, PWAIT, "waitcoll", 0);
                        goto loop1;
                }
@@ -1897,10 +1986,11 @@ loop1:
 
        /* Save arguments for continuation. Backing storage is in uthread->uu_arg, and will not be deallocated */
        uth = current_uthread();
-       wait4_data = &uth->uu_kevent.uu_wait4_data;
+       wait4_data = &uth->uu_save.uus_wait4_data;
        wait4_data->args = uap;
        wait4_data->retval = retval;
 
+       thread_set_pending_block_hint(current_thread(), kThreadWaitOnProcess);
        if ((error = msleep0((caddr_t)q, proc_list_mlock, PWAIT | PCATCH | PDROP, "wait", 0, wait1continue)))
                return (error);
 
@@ -1937,7 +2027,7 @@ waitidcontinue(int result)
        thread = current_thread();
        uth = (struct uthread *)get_bsdthread_info(thread);
 
-       waitid_data = &uth->uu_kevent.uu_waitid_data;
+       waitid_data = &uth->uu_save.uus_waitid_data;
        uap = waitid_data->args;
        retval = waitid_data->retval;
        return(waitid_nocancel(p, uap, retval));
@@ -2161,7 +2251,7 @@ loop1:
 
        /* Save arguments for continuation. Backing storage is in uthread->uu_arg, and will not be deallocated */
        uth = current_uthread();
-       waitid_data = &uth->uu_kevent.uu_waitid_data;
+       waitid_data = &uth->uu_save.uus_waitid_data;
        waitid_data->args = uap;
        waitid_data->retval = retval;
 
@@ -2725,3 +2815,20 @@ munge_user32_rusage(struct rusage *a_rusage_p, struct user32_rusage *a_user_rusa
        a_user_rusage_p->ru_nvcsw = a_rusage_p->ru_nvcsw;
        a_user_rusage_p->ru_nivcsw = a_rusage_p->ru_nivcsw;
 }
+
+void
+kdp_wait4_find_process(thread_t thread, __unused event64_t wait_event, thread_waitinfo_t *waitinfo)
+{
+       assert(thread != NULL);
+       assert(waitinfo != NULL);
+
+       struct uthread *ut = get_bsdthread_info(thread);
+       waitinfo->context = 0;
+       // ensure wmesg is consistent with a thread waiting in wait4
+       assert(!strcmp(ut->uu_wmesg, "waitcoll") || !strcmp(ut->uu_wmesg, "wait"));
+       struct wait4_nocancel_args *args = ut->uu_save.uus_wait4_data.args;
+       // May not actually contain a pid; this is just the argument to wait4.
+       // See man wait4 for other valid wait4 arguments.
+       waitinfo->owner = args->pid;
+}
+
index 952b6f8fb3c5fee1b0d9a89c4913eedb7b947633..2fb8a03d61794ebeb4fe93afca9dd80d7b283252 100644 (file)
@@ -158,7 +158,13 @@ extern boolean_t task_is_exec_copy(task_t);
 thread_t cloneproc(task_t, coalition_t *, proc_t, int, int);
 proc_t forkproc(proc_t);
 void forkproc_free(proc_t);
-thread_t fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t child, int inherit_memory, int is64bit, int in_exec);
+thread_t fork_create_child(task_t parent_task,
+                                                  coalition_t *parent_coalitions,
+                                                  proc_t child,
+                                                  int inherit_memory,
+                                                  int is_64bit_addr,
+                                                  int is_64bit_data,
+                                                  int in_exec);
 void proc_vfork_begin(proc_t parent_proc);
 void proc_vfork_end(proc_t parent_proc);
 
@@ -738,14 +744,15 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval)
  *
  * Parameters: parent_task             parent task
  *             parent_coalitions       parent's set of coalitions
- *             child_proc              child process
+ *             child_proc                      child process
  *             inherit_memory          TRUE, if the parent's address space is
- *                                     to be inherited by the child
- *             is64bit                 TRUE, if the child being created will
- *                                     be associated with a 64 bit process
- *                                     rather than a 32 bit process
- *             in_exec                 TRUE, if called from execve or posix spawn set exec
- *                                     FALSE, if called from fork or vfexec
+ *                                                     to be inherited by the child
+ *             is_64bit_addr           TRUE, if the child being created will
+ *                                                     be associated with a 64 bit address space
+ *             is_64bit_data           TRUE if the child being created will use a
+ *                                                     64-bit register state
+ *             in_exec                         TRUE, if called from execve or posix spawn set exec
+ *                                                     FALSE, if called from fork or vfexec
  *
  * Note:       This code is called in the fork() case, from the execve() call
  *             graph, if implementing an execve() following a vfork(), from
@@ -764,7 +771,13 @@ vfork_return(proc_t child_proc, int32_t *retval, int rval)
  *             in this case, 'inherit_memory' MUST be FALSE.
  */
 thread_t
-fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t child_proc, int inherit_memory, int is64bit, int in_exec)
+fork_create_child(task_t parent_task,
+                                 coalition_t *parent_coalitions,
+                                 proc_t child_proc,
+                                 int inherit_memory,
+                                 int is_64bit_addr,
+                                 int is_64bit_data,
+                                 int in_exec)
 {
        thread_t        child_thread = NULL;
        task_t          child_task;
@@ -774,7 +787,8 @@ fork_create_child(task_t parent_task, coalition_t *parent_coalitions, proc_t chi
        result = task_create_internal(parent_task,
                                        parent_coalitions,
                                        inherit_memory,
-                                       is64bit,
+                                       is_64bit_addr,
+                                       is_64bit_data,
                                        TF_LRETURNWAIT | TF_LRETURNWAITER,         /* All created threads will wait in task_wait_to_return */
                                        in_exec ? TPF_EXEC_COPY : TPF_NONE,   /* Mark the task exec copy if in execve */
                                        &child_task);
@@ -968,7 +982,26 @@ cloneproc(task_t parent_task, coalition_t *parent_coalitions, proc_t parent_proc
                goto bad;
        }
 
-       child_thread = fork_create_child(parent_task, parent_coalitions, child_proc, inherit_memory, parent_proc->p_flag & P_LP64, FALSE);
+       /*
+        * In the case where the parent_task is TASK_NULL (during the init path)
+        * we make the assumption that the register size will be the same as the
+        * address space size since there's no way to determine the possible
+        * register size until an image is exec'd.
+        *
+        * The only architecture that has different address space and register sizes
+        * (arm64_32) isn't being used within kernel-space, so the above assumption
+        * always holds true for the init path.
+        */
+       const int parent_64bit_addr = parent_proc->p_flag & P_LP64;
+       const int parent_64bit_data = (parent_task == TASK_NULL) ? parent_64bit_addr : task_get_64bit_data(parent_task);
+
+       child_thread = fork_create_child(parent_task,
+                                                                        parent_coalitions,
+                                                                        child_proc,
+                                                                        inherit_memory,
+                                                                        parent_64bit_addr,
+                                                                        parent_64bit_data,
+                                                                        FALSE);
 
        if (child_thread == NULL) {
                /*
@@ -980,11 +1013,9 @@ cloneproc(task_t parent_task, coalition_t *parent_coalitions, proc_t parent_proc
        }
 
        child_task = get_threadtask(child_thread);
-       if (parent_proc->p_flag & P_LP64) {
-               task_set_64bit(child_task, TRUE);
+       if (parent_64bit_addr) {
                OSBitOrAtomic(P_LP64, (UInt32 *)&child_proc->p_flag);
        } else {
-               task_set_64bit(child_task, FALSE);
                OSBitAndAtomic(~((uint32_t)P_LP64), (UInt32 *)&child_proc->p_flag);
        }
 
@@ -1110,7 +1141,10 @@ forkproc_free(proc_t p)
 
        /* Free allocated memory */
        FREE_ZONE(p->p_sigacts, sizeof *p->p_sigacts, M_SIGACTS);
+       p->p_sigacts = NULL;
        FREE_ZONE(p->p_stats, sizeof *p->p_stats, M_PSTATS);
+       p->p_stats = NULL;
+
        proc_checkdeadrefs(p);
        FREE_ZONE(p, sizeof *p, M_PROC);
 }
@@ -1162,6 +1196,7 @@ forkproc(proc_t parent_proc)
        if (child_proc->p_sigacts == NULL) {
                printf("forkproc: M_SUBPROC zone exhausted (p_sigacts)\n");
                FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS);
+               child_proc->p_stats = NULL;
                FREE_ZONE(child_proc, sizeof *child_proc, M_PROC);
                child_proc = NULL;
                goto bad;
@@ -1171,7 +1206,9 @@ forkproc(proc_t parent_proc)
        child_proc->p_rcall = thread_call_allocate((thread_call_func_t)realitexpire, child_proc);
        if (child_proc->p_rcall == NULL) {
                FREE_ZONE(child_proc->p_sigacts, sizeof *child_proc->p_sigacts, M_SIGACTS);
+               child_proc->p_sigacts = NULL;
                FREE_ZONE(child_proc->p_stats, sizeof *child_proc->p_stats, M_PSTATS);
+               child_proc->p_stats = NULL;
                FREE_ZONE(child_proc, sizeof *child_proc, M_PROC);
                child_proc = NULL;
                goto bad;
@@ -1266,7 +1303,7 @@ retry:
        if (parent_proc->p_flag & P_PROFIL)
                startprofclock(child_proc);
 
-       child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY));
+       child_proc->p_vfs_iopolicy = (parent_proc->p_vfs_iopolicy & (P_VFS_IOPOLICY_VALID_MASK));
 
        /*
         * Note that if the current thread has an assumed identity, this
@@ -1416,7 +1453,7 @@ retry:
        child_proc->p_memstat_memlimit_active   = 0;
        child_proc->p_memstat_memlimit_inactive = 0;
 #if CONFIG_FREEZE
-       child_proc->p_memstat_suspendedfootprint = 0;
+       child_proc->p_memstat_freeze_sharedanon_pages = 0;
 #endif
        child_proc->p_memstat_dirty = 0;
        child_proc->p_memstat_idledeadline = 0;
@@ -1646,12 +1683,8 @@ uthread_cleanup(task_t task, void *uthread, void * bsd_info)
         */
        assert(uth->uu_ar == NULL);
 
-       if (uth->uu_kqueue_bound) {
-               kevent_qos_internal_unbind(p,
-                                          0, /* didn't save qos_class */
-                                          uth->uu_thread,
-                                          uth->uu_kqueue_flags);
-               assert(uth->uu_kqueue_override_is_sync == 0);
+       if (uth->uu_kqr_bound) {
+               kqueue_threadreq_unbind(p, uth->uu_kqr_bound);
        }
 
        sel = &uth->uu_select;
index ea583e9cf014a192c4833978152749c8ea796c52..795eb56676855b8e7ca3f19c61a5af977c692548 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -53,6 +53,7 @@
 #include <security/mac_policy.h>
 #include <pexpert/pexpert.h>
 #include <sys/sysctl.h>
+#include <sys/reason.h>
 #endif
 
 
@@ -1202,11 +1203,39 @@ vng_file_label_destroy(struct label *label)
        lck_rw_unlock_exclusive(&llock);
 }
 
+static os_reason_t
+vng_reason_from_pathname(const char *path, uint32_t pathlen)
+{
+       os_reason_t r = os_reason_create(OS_REASON_GUARD, GUARD_REASON_VNODE);
+       if (NULL == r)
+               return (r);
+       /*
+        * If the pathname is very long, just keep the trailing part
+        */
+       const uint32_t pathmax = 3 * EXIT_REASON_USER_DESC_MAX_LEN / 4;
+       if (pathlen > pathmax) {
+               path += (pathlen - pathmax);
+               pathlen = pathmax;
+       }
+       uint32_t rsize = kcdata_estimate_required_buffer_size(1, pathlen);
+       if (0 == os_reason_alloc_buffer(r, rsize)) {
+               struct kcdata_descriptor *kcd = &r->osr_kcd_descriptor;
+               mach_vm_address_t addr;
+               if (kcdata_get_memory_addr(kcd,
+                   EXIT_REASON_USER_DESC, pathlen, &addr) == KERN_SUCCESS) {
+                       kcdata_memcpy(kcd, addr, path, pathlen);
+                       return (r);
+               }
+       }
+       os_reason_free(r);
+       return (OS_REASON_NULL);
+}
+
 static int vng_policy_flags;
 
 static int
 vng_guard_violation(const struct vng_info *vgi,
-    unsigned opval, const char *nm)
+    unsigned opval, vnode_t vp)
 {
        int retval = 0;
 
@@ -1215,7 +1244,7 @@ vng_guard_violation(const struct vng_info *vgi,
                retval = EPERM;
        }
 
-       if (vng_policy_flags & kVNG_POLICY_LOGMSG) {
+       if (vng_policy_flags & (kVNG_POLICY_LOGMSG|kVNG_POLICY_UPRINTMSG)) {
                /* log a message */
                const char *op;
                switch (opval) {
@@ -1244,16 +1273,33 @@ vng_guard_violation(const struct vng_info *vgi,
                        op = "(unknown)";
                        break;
                }
+
+               const char *nm = vnode_getname(vp);
                proc_t p = current_proc();
                const struct vng_owner *vgo;
                TAILQ_FOREACH(vgo, &vgi->vgi_owners, vgo_link) {
-                       printf("%s[%d]: %s%s: '%s' guarded by %s[%d] (0x%llx)\n",
-                           proc_name_address(p), proc_pid(p), op,
-                           0 != retval ? " denied" : "",
-                           NULL != nm ? nm : "(unknown)",
-                           proc_name_address(vgo->vgo_p), proc_pid(vgo->vgo_p),
-                           vgi->vgi_guard);
+                       const char fmt[] =
+                           "%s[%d]: %s%s: '%s' guarded by %s[%d] (0x%llx)\n";
+
+                       if (vng_policy_flags & kVNG_POLICY_LOGMSG) {
+                               printf(fmt,
+                                   proc_name_address(p), proc_pid(p), op,
+                                   0 != retval ? " denied" : "",
+                                   NULL != nm ? nm : "(unknown)",
+                                   proc_name_address(vgo->vgo_p),
+                                   proc_pid(vgo->vgo_p), vgi->vgi_guard);
+                       }
+                       if (vng_policy_flags & kVNG_POLICY_UPRINTMSG) {
+                               uprintf(fmt,
+                                   proc_name_address(p), proc_pid(p), op,
+                                   0 != retval ? " denied" : "",
+                                   NULL != nm ? nm : "(unknown)",
+                                   proc_name_address(vgo->vgo_p),
+                                   proc_pid(vgo->vgo_p), vgi->vgi_guard);
+                       }
                }
+               if (NULL != nm)
+                       vnode_putname(nm);
        }
 
        if (vng_policy_flags & (kVNG_POLICY_EXC|kVNG_POLICY_EXC_CORPSE)) {
@@ -1270,8 +1316,20 @@ vng_guard_violation(const struct vng_info *vgi,
                subcode = vgi->vgi_guard;
 
                if (vng_policy_flags & kVNG_POLICY_EXC_CORPSE) {
-                       task_violated_guard(code, subcode, NULL);
-                       /* not fatal */
+                       char *path;
+                       int len = MAXPATHLEN;
+                       MALLOC(path, char *, len, M_TEMP, M_WAITOK);
+                       os_reason_t r = NULL;
+                       if (NULL != path) {
+                               vn_getpath(vp, path, &len);
+                               if (*path && len)
+                                       r = vng_reason_from_pathname(path, len);
+                       }
+                       task_violated_guard(code, subcode, r); /* not fatal */
+                       if (NULL != r)
+                               os_reason_free(r);
+                       if (NULL != path)
+                               FREE(path, M_TEMP);
                } else {
                        thread_t t = current_thread();
                        thread_guard_violation(t, code, subcode);
@@ -1281,11 +1339,11 @@ vng_guard_violation(const struct vng_info *vgi,
                psignal(p, SIGKILL);
        }
 
-       return retval;
+       return (retval);
 }
 
 /*
- * A vnode guard was tripped on this thread.
+ * A fatal vnode guard was tripped on this thread.
  *
  * (Invoked before returning to userland from the syscall handler.)
  */
@@ -1305,11 +1363,11 @@ vn_guard_ast(thread_t __unused t,
 static int
 vng_vnode_check_rename(kauth_cred_t __unused cred,
     struct vnode *__unused dvp, struct label *__unused dlabel,
-    struct vnode *__unused vp, struct label *label,
-    struct componentname *cnp,
+    struct vnode *vp, struct label *label,
+    struct componentname *__unused cnp,
     struct vnode *__unused tdvp, struct label *__unused tdlabel,
-    struct vnode *__unused tvp, struct label *tlabel,
-    struct componentname *tcnp)
+    struct vnode *tvp, struct label *tlabel,
+    struct componentname *__unused tcnp)
 {
        int error = 0;
        if (NULL != label || NULL != tlabel) {
@@ -1317,17 +1375,16 @@ vng_vnode_check_rename(kauth_cred_t __unused cred,
                const struct vng_info *vgi =
                    vng_lbl_get_withattr(label, VNG_RENAME_FROM);
                if (NULL != vgi)
-                       error = vng_guard_violation(vgi,
-                           VNG_RENAME_FROM, cnp->cn_nameptr);
+                       error = vng_guard_violation(vgi, VNG_RENAME_FROM, vp);
                if (0 == error) {
                        vgi = vng_lbl_get_withattr(tlabel, VNG_RENAME_TO);
                        if (NULL != vgi)
                                error = vng_guard_violation(vgi,
-                                   VNG_RENAME_TO, tcnp->cn_nameptr);
+                                   VNG_RENAME_TO, tvp);
                }
                lck_rw_unlock_shared(&llock);
        }
-       return error;
+       return (error);
 }
 
 static int
@@ -1340,21 +1397,17 @@ vng_vnode_check_link(kauth_cred_t __unused cred,
                lck_rw_lock_shared(&llock);
                const struct vng_info *vgi =
                        vng_lbl_get_withattr(label, VNG_LINK);
-               if (vgi) {
-                       const char *nm = vnode_getname(vp);
-                       error = vng_guard_violation(vgi, VNG_LINK, nm);
-                       if (nm)
-                               vnode_putname(nm);
-               }
+               if (vgi)
+                       error = vng_guard_violation(vgi, VNG_LINK, vp);
                lck_rw_unlock_shared(&llock);
        }
-       return error;
+       return (error);
 }
 
 static int
 vng_vnode_check_unlink(kauth_cred_t __unused cred,
     struct vnode *__unused dvp, struct label *__unused dlabel,
-    struct vnode *__unused vp, struct label *label, struct componentname *cnp)
+    struct vnode *vp, struct label *label, struct componentname *__unused cnp)
 {
        int error = 0;
        if (NULL != label) {
@@ -1362,11 +1415,10 @@ vng_vnode_check_unlink(kauth_cred_t __unused cred,
                const struct vng_info *vgi =
                    vng_lbl_get_withattr(label, VNG_UNLINK);
                if (vgi)
-                       error = vng_guard_violation(vgi, VNG_UNLINK,
-                           cnp->cn_nameptr);
+                       error = vng_guard_violation(vgi, VNG_UNLINK, vp);
                lck_rw_unlock_shared(&llock);
        }
-       return error;
+       return (error);
 }
 
 /*
@@ -1388,16 +1440,12 @@ vng_vnode_check_write(kauth_cred_t __unused actv_cred,
                                if (vgo->vgo_p == p)
                                        goto done;
                        }
-                       const char *nm = vnode_getname(vp);
-                       error = vng_guard_violation(vgi,
-                           VNG_WRITE_OTHER, nm);
-                       if (nm)
-                               vnode_putname(nm);
+                       error = vng_guard_violation(vgi, VNG_WRITE_OTHER, vp);
                }
        done:
                lck_rw_unlock_shared(&llock);
        }
-       return error;
+       return (error);
 }
 
 /*
@@ -1420,11 +1468,7 @@ vng_vnode_check_truncate(kauth_cred_t __unused actv_cred,
                                if (vgo->vgo_p == p)
                                        goto done;
                        }
-                       const char *nm = vnode_getname(vp);
-                       error = vng_guard_violation(vgi,
-                           VNG_TRUNC_OTHER, nm);
-                       if (nm)
-                               vnode_putname(nm);
+                       error = vng_guard_violation(vgi, VNG_TRUNC_OTHER, vp);
                }
        done:
                lck_rw_unlock_shared(&llock);
@@ -1442,26 +1486,28 @@ vng_vnode_check_exchangedata(kauth_cred_t __unused cred,
                lck_rw_lock_shared(&llock);
                const struct vng_info *vgi =
                        vng_lbl_get_withattr(flabel, VNG_EXCHDATA);
-               if (NULL != vgi) {
-                        const char *nm = vnode_getname(fvp);
-                       error = vng_guard_violation(vgi,
-                           VNG_EXCHDATA, nm);
-                       if (nm)
-                               vnode_putname(nm);
-               }
+               if (NULL != vgi)
+                       error = vng_guard_violation(vgi, VNG_EXCHDATA, fvp);
                if (0 == error) {
                        vgi = vng_lbl_get_withattr(slabel, VNG_EXCHDATA);
-                       if (NULL != vgi) {
-                               const char *nm = vnode_getname(svp);
+                       if (NULL != vgi)
                                error = vng_guard_violation(vgi,
-                                   VNG_EXCHDATA, nm);
-                               if (nm)
-                                       vnode_putname(nm);
-                       }
+                                   VNG_EXCHDATA, svp);
                }
                lck_rw_unlock_shared(&llock);
        }
-       return error;
+       return (error);
+}
+
+/* Intercept open-time truncations (by "other") of a guarded vnode */
+
+static int
+vng_vnode_check_open(kauth_cred_t cred,
+    struct vnode *vp, struct label *label, int acc_mode)
+{
+       if (0 == (acc_mode & O_TRUNC))
+               return (0);
+       return (vng_vnode_check_truncate(cred, NULL, vp, label));
 }
 
 /*
@@ -1484,6 +1530,7 @@ SECURITY_READ_ONLY_EARLY(static struct mac_policy_ops) vng_policy_ops = {
        .mpo_vnode_check_write = vng_vnode_check_write,
        .mpo_vnode_check_truncate = vng_vnode_check_truncate,
        .mpo_vnode_check_exchangedata = vng_vnode_check_exchangedata,
+       .mpo_vnode_check_open = vng_vnode_check_open,
 
        .mpo_policy_syscall = vng_policy_syscall,
        .mpo_policy_init = vng_init,
@@ -1513,7 +1560,8 @@ vnguard_policy_init(void)
 {
        if (0 == PE_i_can_has_debugger(NULL))
                return;
-       vng_policy_flags = kVNG_POLICY_LOGMSG | kVNG_POLICY_EXC_CORPSE;
+       vng_policy_flags = kVNG_POLICY_LOGMSG |
+               kVNG_POLICY_EXC_CORPSE | kVNG_POLICY_UPRINTMSG;
        PE_parse_boot_argn("vnguard", &vng_policy_flags, sizeof (vng_policy_flags));
        if (vng_policy_flags)
                mac_policy_register(&vng_policy_conf, &vng_policy_handle, NULL);
index 098b7349f7ad640fa07afa57399cdb0ca66517ed..96700a924d516c6d95c6aafc32ffd90cc065c548 100644 (file)
@@ -66,24 +66,10 @@ static lck_grp_attr_t *sysctl_lckgrp_attr = NULL;
 static lck_grp_t *sysctl_lckgrp = NULL;
 static lck_mtx_t sysctl_lock;
 
-#if defined(__x86_64__)
-/* 18 cores, 7 counters each */
-#define KPC_MAX_COUNTERS_COPIED (18 * 7)
-#elif defined(__arm64__)
-#include <pexpert/arm64/board_config.h>
-#if defined(CPU_COUNT)
-#define KPC_MAX_COUNTERS_COPIED (CPU_COUNT * 10)
-#else /* defined(CPU_COUNT) */
-#define KPC_MAX_COUNTERS_COPIED (2 * 10)
-#endif /* !defined(CPU_COUNT) */
-#elif defined(__arm__)
-#define KPC_MAX_COUNTERS_COPIED (16)
-#else /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */
-#error "unknown architecture for kpc buffer sizes"
-#endif /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */
-
-static_assert((KPC_MAX_COUNTERS_COPIED * sizeof(uint64_t)) < 1024,
-               "kpc's stack could grow too large");
+/*
+ * Another element is needed to hold the CPU number when getting counter values.
+ */
+#define KPC_MAX_BUF_LEN (KPC_MAX_COUNTERS_COPIED + 1)
 
 typedef int (*setget_func_t)(int);
 
@@ -101,6 +87,29 @@ kpc_init(void)
        kpc_initted = 1;
 }
 
+static uint64_t *
+kpc_get_bigarray(uint32_t *size_out)
+{
+       static uint64_t *bigarray = NULL;
+
+       LCK_MTX_ASSERT(&sysctl_lock, LCK_MTX_ASSERT_OWNED);
+
+       uint32_t size = kpc_get_counterbuf_size() + sizeof(uint64_t);
+       *size_out = size;
+
+       if (bigarray) {
+               return bigarray;
+       }
+
+       /*
+        * Another element is needed to hold the CPU number when getting counter
+        * values.
+        */
+       bigarray = kalloc_tag(size, VM_KERN_MEMORY_DIAG);
+       assert(bigarray != NULL);
+       return bigarray;
+}
+
 /* abstract sysctl handlers */
 static int
 sysctl_get_int( struct sysctl_oid *oidp, struct sysctl_req *req,
@@ -276,8 +285,8 @@ static int
 sysctl_get_bigarray(struct sysctl_req *req,
                int (*get_fn)(uint32_t, uint32_t*, void*))
 {
-       uint64_t buf[KPC_MAX_COUNTERS_COPIED] = {};
-       uint32_t bufsize = sizeof(buf);
+       uint32_t bufsize = 0;
+       uint64_t *buf = kpc_get_bigarray(&bufsize);
        uint32_t arg = 0;
 
        /* get the argument */
@@ -286,9 +295,9 @@ sysctl_get_bigarray(struct sysctl_req *req,
                return error;
        }
 
-       error = get_fn(arg, &bufsize, &buf);
+       error = get_fn(arg, &bufsize, buf);
        if (!error) {
-               error = SYSCTL_OUT(req, &buf, bufsize);
+               error = SYSCTL_OUT(req, buf, bufsize);
        }
 
        return error;
@@ -318,10 +327,11 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg),
                int (*get_fn)(uint32_t, void*), int (*set_fn)(uint32_t, void*))
 {
        int error = 0;
-       uint64_t buf[KPC_MAX_COUNTERS_COPIED] = {};
-       uint32_t bufsize = sizeof(buf);
        uint64_t arg;
 
+       uint32_t bufsize = 0;
+       uint64_t *buf = kpc_get_bigarray(&bufsize);
+
        /* get the config word */
        error = SYSCTL_IN(req, &arg, sizeof(arg));
        if (error) {
@@ -337,11 +347,11 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg),
        /* if writing */
        if (req->newptr) {
                /* copy the rest -- SYSCTL_IN knows the copyin should be shifted */
-               error = SYSCTL_IN(req, &buf, regsize);
+               error = SYSCTL_IN(req, buf, regsize);
 
                /* SYSCTL_IN failure means only need to read */
                if (!error) {
-                       error = set_fn((uint32_t)arg, &buf);
+                       error = set_fn((uint32_t)arg, buf);
                        if (error) {
                                return error;
                        }
@@ -350,12 +360,12 @@ sysctl_getset_bigarray(struct sysctl_req *req, int (*size_fn)(uint32_t arg),
 
        /* if reading */
        if (req->oldptr) {
-               error = get_fn((uint32_t)arg, &buf);
+               error = get_fn((uint32_t)arg, buf);
                if (error) {
                        return error;
                }
 
-               error = SYSCTL_OUT(req, &buf, regsize);
+               error = SYSCTL_OUT(req, buf, regsize);
        }
 
        return error;
@@ -369,8 +379,13 @@ kpc_sysctl SYSCTL_HANDLER_ARGS
        // __unused struct sysctl_oid *unused_oidp = oidp;
        (void)arg2;
 
-       if( !kpc_initted )
+       if (!kpc_initted) {
                panic("kpc_init not called");
+       }
+
+       if (!kpc_supported) {
+               return ENOTSUP;
+       }
 
        ktrace_lock();
 
@@ -486,74 +501,74 @@ SYSCTL_NODE(, OID_AUTO, kpc, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
 
 /* values */
 SYSCTL_PROC(_kpc, OID_AUTO, classes,
-            CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_CLASSES, 
             sizeof(int), kpc_sysctl, "I", "Available classes");
 
 SYSCTL_PROC(_kpc, OID_AUTO, counting,
-            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_COUNTING, 
             sizeof(int), kpc_sysctl, "I", "PMCs counting");
 
 SYSCTL_PROC(_kpc, OID_AUTO, thread_counting,
-            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_THREAD_COUNTING, 
             sizeof(int), kpc_sysctl, "I", "Thread accumulation");
 
 SYSCTL_PROC(_kpc, OID_AUTO, pmu_version,
-            CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void *)REQ_PMU_VERSION,
             sizeof(int), kpc_sysctl, "I", "PMU version for hardware");
 
 /* faux values */
 SYSCTL_PROC(_kpc, OID_AUTO, config_count,
-            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_CONFIG_COUNT, 
             sizeof(int), kpc_sysctl, "S", "Config count");
 
 SYSCTL_PROC(_kpc, OID_AUTO, counter_count,
-            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_COUNTER_COUNT, 
             sizeof(int), kpc_sysctl, "S", "Counter count");
 
 SYSCTL_PROC(_kpc, OID_AUTO, sw_inc,
-            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY,
+            CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_SW_INC, 
             sizeof(int), kpc_sysctl, "S", "Software increment");
 
 /* arrays */
 SYSCTL_PROC(_kpc, OID_AUTO, thread_counters,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_THREAD_COUNTERS, 
             sizeof(uint64_t), kpc_sysctl, 
             "QU", "Current thread counters");
 
 SYSCTL_PROC(_kpc, OID_AUTO, counters,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_COUNTERS, 
             sizeof(uint64_t), kpc_sysctl, 
             "QU", "Current counters");
 
 SYSCTL_PROC(_kpc, OID_AUTO, shadow_counters,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_SHADOW_COUNTERS, 
             sizeof(uint64_t), kpc_sysctl, 
             "QU", "Current shadow counters");
 
 SYSCTL_PROC(_kpc, OID_AUTO, config,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_CONFIG, 
             sizeof(uint64_t), kpc_sysctl, 
             "QU", "Set counter configs");
 
 SYSCTL_PROC(_kpc, OID_AUTO, period,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_PERIOD, 
             sizeof(uint64_t), kpc_sysctl, 
             "QU", "Set counter periods");
 
 SYSCTL_PROC(_kpc, OID_AUTO, actionid,
-            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY,
+            CTLFLAG_RD|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED,
             (void*)REQ_ACTIONID, 
             sizeof(uint32_t), kpc_sysctl, 
             "QU", "Set counter actionids");
index 5284f060cf419e82a54c942a0fff43c8966db89b..32ab96fd76053f7223d318868bb629f1be11aa82 100644 (file)
@@ -330,12 +330,10 @@ lf_advlock(struct vnop_advlock_args *ap)
                FREE(lock, M_LOCKF);
                break;
 
-#if CONFIG_EMBEDDED
        case F_GETLKPID:
                error = lf_getlock(lock, fl, fl->l_pid);
                FREE(lock, M_LOCKF);
                break;
-#endif
 
        default:
                FREE(lock, M_LOCKF);
index 6e28f3f775ecfc636da8cfdb47a51d7f90688597..56583973882d88c133a99aeace826abdb2b4caa9 100644 (file)
@@ -303,7 +303,8 @@ const char *memname[] = {
        "Event Handler",/* 125 M_EVENTHANDLER */
        "Link Layer Table",     /* 126 M_LLTABLE */
        "Network Work Queue",   /* 127 M_NWKWQ */
-       ""
+       "Content Filter", /* 128 M_CFIL */
+       ""
 };
 
 /* for use with kmzones.kz_zalloczone */
@@ -491,6 +492,7 @@ struct kmzones {
        { 0,            KMZ_MALLOC, FALSE },            /* 125 M_EVENTHANDLER */
        { 0,            KMZ_MALLOC, FALSE },            /* 126 M_LLTABLE */
        { 0,            KMZ_MALLOC, FALSE },            /* 127 M_NWKWQ */
+       { 0,            KMZ_MALLOC, FALSE },            /* 128 M_CFIL */
 #undef SOS
 #undef SOX
 };
index a2de71f0f1a1246edb0ecef1a02a25246c1e201f..f52c05c1aae53b1d7a55da125846752454033787 100644 (file)
 
 #include <mach/machine/sdt.h>
 #include <libkern/section_keywords.h>
+#include <stdatomic.h>
 
 /* For logging clarity */
 static const char *memorystatus_kill_cause_name[] = {
-       ""                      ,
-       "jettisoned"            ,       /* kMemorystatusKilled                  */
-       "highwater"             ,       /* kMemorystatusKilledHiwat             */
-       "vnode-limit"           ,       /* kMemorystatusKilledVnodes            */
-       "vm-pageshortage"       ,       /* kMemorystatusKilledVMPageShortage    */
-       "vm-thrashing"          ,       /* kMemorystatusKilledVMThrashing       */
-       "fc-thrashing"          ,       /* kMemorystatusKilledFCThrashing       */
-       "per-process-limit"     ,       /* kMemorystatusKilledPerProcessLimit   */
-       "diagnostic"            ,       /* kMemorystatusKilledDiagnostic        */
-       "idle-exit"             ,       /* kMemorystatusKilledIdleExit          */
-       "zone-map-exhaustion"   ,       /* kMemorystatusKilledZoneMapExhaustion */
+       ""                                                              ,               /* kMemorystatusInvalid                                                 */
+       "jettisoned"                                    ,               /* kMemorystatusKilled                                                  */
+       "highwater"                                             ,               /* kMemorystatusKilledHiwat                                             */
+       "vnode-limit"                                   ,               /* kMemorystatusKilledVnodes                                    */
+       "vm-pageshortage"                               ,               /* kMemorystatusKilledVMPageShortage                    */
+       "proc-thrashing"                                ,               /* kMemorystatusKilledProcThrashing                             */
+       "fc-thrashing"                                  ,               /* kMemorystatusKilledFCThrashing                               */
+       "per-process-limit"                             ,               /* kMemorystatusKilledPerProcessLimit                   */
+       "disk-space-shortage"                   ,               /* kMemorystatusKilledDiskSpaceShortage                 */
+       "idle-exit"                                             ,               /* kMemorystatusKilledIdleExit                                  */
+       "zone-map-exhaustion"                   ,               /* kMemorystatusKilledZoneMapExhaustion                 */
+       "vm-compressor-thrashing"               ,               /* kMemorystatusKilledVMCompressorThrashing             */
+       "vm-compressor-space-shortage"  ,               /* kMemorystatusKilledVMCompressorSpaceShortage */
 };
 
 static const char *
@@ -115,8 +118,9 @@ static boolean_t
 is_reason_thrashing(unsigned cause)
 {
        switch (cause) {
-       case kMemorystatusKilledVMThrashing:
        case kMemorystatusKilledFCThrashing:
+       case kMemorystatusKilledVMCompressorThrashing:
+       case kMemorystatusKilledVMCompressorSpaceShortage:
                return TRUE;
        default:
                return FALSE;
@@ -280,12 +284,11 @@ boolean_t is_knote_registered_modify_task_pressure_bits(struct knote*, int, task
 void memorystatus_klist_reset_all_for_level(vm_pressure_level_t pressure_level_to_clear);
 void memorystatus_send_low_swap_note(void);
 
-int memorystatus_wakeup = 0;
-
 unsigned int memorystatus_level = 0;
 
 static int memorystatus_list_count = 0;
 
+
 #define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
 
 typedef struct memstat_bucket {
@@ -303,8 +306,16 @@ int system_procs_aging_band = JETSAM_PRIORITY_AGING_BAND1;
 int applications_aging_band = JETSAM_PRIORITY_IDLE;
 
 #define isProcessInAgingBands(p)       ((isSysProc(p) && system_procs_aging_band && (p->p_memstat_effectivepriority == system_procs_aging_band)) || (isApp(p) && applications_aging_band && (p->p_memstat_effectivepriority == applications_aging_band)))
-#define isApp(p)                       (! (p->p_memstat_dirty & P_DIRTY_TRACK))
-#define isSysProc(p)                   ((p->p_memstat_dirty & P_DIRTY_TRACK))
+
+/*
+ * Checking the p_memstat_state almost always requires the proc_list_lock
+ * because the jetsam thread could be on the other core changing the state.
+ *
+ * App -- almost always managed by a system process. Always has dirty tracking OFF. Can include extensions too.
+ * System Processes -- not managed by anybody. Always have dirty tracking ON. Can include extensions (here) too.
+ */
+#define isApp(p)                       ((p->p_memstat_state & P_MEMSTAT_MANAGED) || ! (p->p_memstat_dirty & P_DIRTY_TRACK))
+#define isSysProc(p)                   ( ! (p->p_memstat_state & P_MEMSTAT_MANAGED) || (p->p_memstat_dirty & P_DIRTY_TRACK))
 
 #define        kJetsamAgingPolicyNone                          (0)
 #define kJetsamAgingPolicyLegacy                       (1)
@@ -598,9 +609,12 @@ static uint32_t kill_under_pressure_cause = 0;
  * default jetsam snapshot support
  */
 static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
+static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot_copy;
 #define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
 static unsigned int memorystatus_jetsam_snapshot_count = 0;
+static unsigned int memorystatus_jetsam_snapshot_copy_count = 0;
 static unsigned int memorystatus_jetsam_snapshot_max = 0;
+static unsigned int memorystatus_jetsam_snapshot_size = 0;
 static uint64_t memorystatus_jetsam_snapshot_last_timestamp = 0;
 static uint64_t memorystatus_jetsam_snapshot_timeout = 0;
 #define JETSAM_SNAPSHOT_TIMEOUT_SECS 30
@@ -615,7 +629,7 @@ static boolean_t memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memory
 static void memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause, uint64_t killtime);
 
 static void memorystatus_clear_errors(void);
-static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
+static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages);
 static void memorystatus_get_task_phys_footprint_page_counts(task_t task,
                                                             uint64_t *internal_pages, uint64_t *internal_compressed_pages,
                                                             uint64_t *purgeable_nonvolatile_pages, uint64_t *purgeable_nonvolatile_compressed_pages,
@@ -629,7 +643,7 @@ static uint32_t memorystatus_build_state(proc_t p);
 
 static boolean_t memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause, os_reason_t jetsam_reason, int32_t *priority, uint32_t *errors);
 static boolean_t memorystatus_kill_top_process_aggressive(uint32_t cause, int aggr_count, int32_t priority_max, uint32_t *errors);
-static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors);
+static boolean_t memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors);
 static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors, boolean_t *purged);
 
 static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
@@ -691,7 +705,14 @@ int32_t max_kill_priority = JETSAM_PRIORITY_IDLE;
 #endif /* CONFIG_JETSAM */
 
 unsigned int memorystatus_frozen_count = 0;
+unsigned int memorystatus_frozen_processes_max = 0;
+unsigned int memorystatus_frozen_shared_mb = 0;
+unsigned int memorystatus_frozen_shared_mb_max = 0;
+unsigned int memorystatus_freeze_shared_mb_per_process_max = 0; /* Max. MB allowed per process to be freezer-eligible. */
+unsigned int memorystatus_freeze_private_shared_pages_ratio = 2; /* Ratio of private:shared pages for a process to be freezer-eligible. */
 unsigned int memorystatus_suspended_count = 0;
+unsigned int memorystatus_thaw_count = 0;
+unsigned int memorystatus_refreeze_eligible_count = 0; /* # of processes currently thawed i.e. have state on disk & in-memory */
 
 #if VM_PRESSURE_EVENTS
 
@@ -715,6 +736,21 @@ boolean_t memorystatus_hwm_candidates = 0;
 
 static int memorystatus_send_note(int event_code, void *data, size_t data_length);
 
+/*
+ * This value is the threshold that a process must meet to be considered for scavenging.
+ */
+#if CONFIG_EMBEDDED
+#define VM_PRESSURE_MINIMUM_RSIZE              6       /* MB */
+#else /* CONFIG_EMBEDDED */
+#define VM_PRESSURE_MINIMUM_RSIZE              10      /* MB */
+#endif /* CONFIG_EMBEDDED */
+
+uint32_t vm_pressure_task_footprint_min = VM_PRESSURE_MINIMUM_RSIZE;
+
+#if DEVELOPMENT || DEBUG
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_task_footprint_min, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pressure_task_footprint_min, 0, "");
+#endif /* DEVELOPMENT || DEBUG */
+
 #endif /* VM_PRESSURE_EVENTS */
 
 
@@ -728,12 +764,24 @@ extern boolean_t kill_on_no_paging_space;
 #endif /* DEVELOPMENT || DEBUG */
 
 
+/*
+ * Table that expresses the probability of a process
+ * being used in the next hour.
+ */
+typedef struct memorystatus_internal_probabilities {
+       char proc_name[MAXCOMLEN + 1];
+       int use_probability;
+} memorystatus_internal_probabilities_t;
+
+static memorystatus_internal_probabilities_t *memorystatus_global_probabilities_table = NULL;
+static size_t memorystatus_global_probabilities_size = 0;
+
 /* Freeze */
 
 #if CONFIG_FREEZE
-
 boolean_t memorystatus_freeze_enabled = FALSE;
 int memorystatus_freeze_wakeup = 0;
+int memorystatus_freeze_jetsam_band = 0; /* the jetsam band which will contain P_MEMSTAT_FROZEN processes */
 
 lck_grp_attr_t *freezer_lck_grp_attr;
 lck_grp_t *freezer_lck_grp;
@@ -741,8 +789,11 @@ static lck_mtx_t freezer_mutex;
 
 static inline boolean_t memorystatus_can_freeze_processes(void);
 static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
-
+static boolean_t memorystatus_is_process_eligible_for_freeze(proc_t p);
 static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
+static boolean_t memorystatus_freeze_thread_should_run(void);
+
+void memorystatus_disable_freeze(void);
 
 /* Thresholds */
 static unsigned int memorystatus_freeze_threshold = 0;
@@ -753,24 +804,37 @@ static unsigned int memorystatus_freeze_pages_max = 0;
 static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
 
 static unsigned int memorystatus_freeze_daily_mb_max = FREEZE_DAILY_MB_MAX_DEFAULT;
+static uint64_t         memorystatus_freeze_budget_pages_remaining = 0; //remaining # of pages that can be frozen to disk
+static boolean_t memorystatus_freeze_degradation = FALSE; //protected by the freezer mutex. Signals we are in a degraded freeze mode.
+
+static unsigned int memorystatus_max_frozen_demotions_daily = 0;
+static unsigned int memorystatus_thaw_count_demotion_threshold = 0;
 
 /* Stats */
-static uint64_t memorystatus_freeze_count = 0;
 static uint64_t memorystatus_freeze_pageouts = 0;
 
 /* Throttling */
+#define DEGRADED_WINDOW_MINS   (30)
+#define NORMAL_WINDOW_MINS     (24 * 60)
+
 static throttle_interval_t throttle_intervals[] = {
-       {      60,  8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */
-       { 24 * 60,  1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */
+       { DEGRADED_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
+       { NORMAL_WINDOW_MINS, 1, 0, 0, { 0, 0 }},
 };
+throttle_interval_t *degraded_throttle_window = &throttle_intervals[0];
+throttle_interval_t *normal_throttle_window = &throttle_intervals[1];
 
-static uint64_t memorystatus_freeze_throttle_count = 0;
+extern uint64_t vm_swap_get_free_space(void);
+extern boolean_t vm_swap_max_budget(uint64_t *);
 
-static unsigned int memorystatus_suspended_footprint_total = 0;        /* pages */
+static void memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed);
 
-extern uint64_t vm_swap_get_free_space(void);
+static uint64_t memorystatus_freezer_thread_next_run_ts = 0;
 
-static boolean_t memorystatus_freeze_update_throttle(void);
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_frozen_count, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_thaw_count, 0, "");
+SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
+SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_budget_pages_remaining, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_budget_pages_remaining, "");
 
 #endif /* CONFIG_FREEZE */
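The freezer statistics surfaced above are plain read-only sysctls, so they can be sampled from userland with fixed-size reads on CONFIG_FREEZE kernels. A short sketch, assuming the node names follow the SYSCTL_* declarations:

#include <stdio.h>
#include <stdint.h>
#include <sys/sysctl.h>

int
main(void)
{
    unsigned int frozen = 0;
    uint64_t budget = 0;
    size_t len;

    /* UINT node; note it now reports memorystatus_frozen_count. */
    len = sizeof(frozen);
    if (sysctlbyname("kern.memorystatus_freeze_count", &frozen, &len, NULL, 0) == 0) {
        printf("frozen processes: %u\n", frozen);
    }

    /* QUAD node: remaining freezer budget, in pages. */
    len = sizeof(budget);
    if (sysctlbyname("kern.memorystatus_freeze_budget_pages_remaining", &budget, &len, NULL, 0) == 0) {
        printf("freeze budget remaining: %llu pages\n", (unsigned long long)budget);
    }

    return 0;
}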
 
@@ -1135,18 +1199,41 @@ SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|C
 
 #if CONFIG_FREEZE
 
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_jetsam_band, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_jetsam_band, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_daily_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_daily_mb_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_degraded_mode, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_degradation, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
 
-SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
-SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
-SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_refreeze_eligible_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_refreeze_eligible_count, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_processes_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_frozen_processes_max, 0, "");
+
+/*
+ * Max. shared-anonymous memory in MB that can be held by frozen processes in the high jetsam band.
+ * "0" means no limit.
+ * Default is 10% of system-wide task limit.
+ */
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_frozen_shared_mb, 0, "");
+
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_shared_mb_per_process_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_shared_mb_per_process_max, 0, "");
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_private_shared_pages_ratio, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_private_shared_pages_ratio, 0, "");
+
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
 
+/*
+ * max. # of frozen process demotions we will allow in our daily cycle.
+ */
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_max_freeze_demotions_daily, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_max_frozen_demotions_daily, 0, "");
+/*
+ * min # of thaws needed by a process to protect it from getting demoted into the IDLE band.
+ */
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_thaw_count_demotion_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_thaw_count_demotion_threshold, 0, "");
+
 boolean_t memorystatus_freeze_throttle_enabled = TRUE;
 SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
 
@@ -1160,8 +1247,10 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
 #pragma unused(arg1, arg2)
        int error, pid = 0;
        proc_t p;
+       int freezer_error_code = 0;
 
        if (memorystatus_freeze_enabled == FALSE) {
+               printf("sysctl_freeze: Freeze is DISABLED\n");
                return ENOTSUP;
        }
 
@@ -1179,21 +1268,22 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
 
        p = proc_find(pid);
        if (p != NULL) {
-               uint32_t purgeable, wired, clean, dirty;
-               boolean_t shared;
-               uint32_t max_pages = 0;
+               uint32_t purgeable, wired, clean, dirty, shared;
+               uint32_t max_pages = 0, state = 0;
 
                if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-
-                       unsigned int avail_swap_space = 0; /* in pages. */
-
                        /*
                         * Freezer backed by the compressor and swap file(s)
-                        * while will hold compressed data.
+                        * will hold compressed data.
+                        *
+                        * We don't care about the global freezer budget or the process's (min/max) budget here.
+                        * The freeze sysctl is meant to force-freeze a process.
+                        *
+                        * We also don't update any global or process stats on this path, so that the jetsam/freeze
+                        * logic remains unaffected. The tasks we're performing here are: freeze the process, set the
+                        * P_MEMSTAT_FROZEN bit, and elevate the process to a higher band (if the freezer is active).
                         */
-                       avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
-
-                       max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
+                       max_pages = memorystatus_freeze_pages_max;
 
                } else {
                        /*
@@ -1202,16 +1292,87 @@ sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
                        max_pages = UINT32_MAX - 1;
                }
 
-               error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
-               proc_rele(p);
+               proc_list_lock();
+               state = p->p_memstat_state;
+               proc_list_unlock();
+
+               /*
+                * The jetsam path also verifies that the process is a suspended App. We don't care about that here.
+                * We simply ensure that jetsam is not already working on the process and that the process has not
+                * explicitly disabled freezing.
+                */
+               if (state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED)) {
+                       printf("sysctl_freeze: p_memstat_state check failed, process is%s%s%s\n",
+                                       (state & P_MEMSTAT_TERMINATED) ? " terminated" : "",
+                                       (state & P_MEMSTAT_LOCKED) ? " locked" : "",
+                                       (state & P_MEMSTAT_FREEZE_DISABLED) ? " unfreezable" : "");
+
+                       proc_rele(p);
+                       lck_mtx_unlock(&freezer_mutex);
+                       return EPERM;
+               }
+
+               error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+
+               if (error) {
+                       char reason[128] = "unknown";
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", 128);
+                       }
 
-               if (error)
-                       error = EIO;
+                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", 128);
+                       }
+                       
+                       printf("sysctl_freeze: task_freeze failed: %s\n", reason);
+
+                       if (error == KERN_NO_SPACE) {
+                               /* Make it easy to distinguish between failures due to low compressor/swap space and other failures. */
+                               error = ENOSPC;
+                       } else {
+                               error = EIO;
+                       }
+               } else {
+                       proc_list_lock();
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
+                       }
+                       p->p_memstat_frozen_count++;
+
+
+                       proc_list_unlock();
+
+                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                               /*
+                                * We elevate only if we are going to swap out the data.
+                                */
+                               error = memorystatus_update_inactive_jetsam_priority_band(pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
+                                               memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (error) {
+                                       printf("sysctl_freeze: Elevating frozen process to higher jetsam band failed with %d\n", error);
+                               }
+                       }
+               }
+
+               proc_rele(p);
 
                lck_mtx_unlock(&freezer_mutex);
                return error;
+       } else {
+               printf("sysctl_freeze: Invalid process\n");
        }
 
+
        lck_mtx_unlock(&freezer_mutex);
        return EINVAL;
 }
@@ -1242,10 +1403,23 @@ sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
                p = proc_find(pid);
                if (p != NULL) {
                        error = task_thaw(p->task);
-                       proc_rele(p);
 
-                       if (error)
+                       if (error) {
                                error = EIO;
+                       } else {
+                               /*
+                                * task_thaw() succeeded.
+                                *
+                                * We increment memorystatus_frozen_count on the sysctl freeze path.
+                                * And so we leave P_MEMSTAT_FROZEN set so that the frozen count is
+                                * decremented when this process exits.
+                                *
+                                * proc_list_lock();
+                                * p->p_memstat_state &= ~P_MEMSTAT_FROZEN;
+                                * proc_list_unlock();
+                                */
+                       }
+                       proc_rele(p);
                        return error;
                }
        }
@@ -1256,6 +1430,194 @@ sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
     0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
 
+typedef struct _global_freezable_status{
+       boolean_t       freeze_pages_threshold_crossed;
+       boolean_t       freeze_eligible_procs_available;
+       boolean_t       freeze_scheduled_in_future;
+}global_freezable_status_t;
+
+typedef struct _proc_freezable_status{
+       boolean_t       freeze_has_memstat_state;
+       boolean_t       freeze_has_pages_min;
+       int             freeze_has_probability;
+       boolean_t       freeze_attempted;
+       uint32_t        p_memstat_state;
+       uint32_t        p_pages;
+       int             p_freeze_error_code;
+       int             p_pid;
+       char            p_name[MAXCOMLEN + 1];
+}proc_freezable_status_t;
+
+#define MAX_FREEZABLE_PROCESSES 100
+
+static int
+memorystatus_freezer_get_status(user_addr_t buffer, size_t buffer_size, int32_t *retval) 
+{
+       uint32_t                        proc_count = 0, i = 0;
+       global_freezable_status_t       *list_head;
+       proc_freezable_status_t         *list_entry;
+       size_t                          list_size = 0;
+       proc_t                          p;
+       memstat_bucket_t                *bucket;
+       uint32_t                        state = 0, pages = 0, entry_count = 0;
+       boolean_t                       try_freeze = TRUE;
+       int                             error = 0, probability_of_use = 0;
+
+
+       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
+               return ENOTSUP;
+       }
+
+       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
+
+       if (buffer_size < list_size) {
+               return EINVAL;
+       }
+
+       list_head = (global_freezable_status_t*)kalloc(list_size);
+       if (list_head == NULL) {
+               return ENOMEM;
+       }
+
+       memset(list_head, 0, list_size);
+
+       list_size = sizeof(global_freezable_status_t);
+
+       proc_list_lock();
+
+       uint64_t curr_time = mach_absolute_time();
+
+       list_head->freeze_pages_threshold_crossed = (memorystatus_available_pages < memorystatus_freeze_threshold);
+       list_head->freeze_eligible_procs_available = ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold);
+       list_head->freeze_scheduled_in_future = (curr_time < memorystatus_freezer_thread_next_run_ts);
+
+       list_entry = (proc_freezable_status_t*) ((uintptr_t)list_head + sizeof(global_freezable_status_t));
+
+       bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
+       
+       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
+
+       p = memorystatus_get_first_proc_locked(&i, FALSE);
+       proc_count++;
+
+       while ((proc_count <= MAX_FREEZABLE_PROCESSES) &&
+              (p) &&
+              (list_size < buffer_size)) {
+
+               if (isApp(p) == FALSE) {
+                       p = memorystatus_get_next_proc_locked(&i, p, FALSE);
+                       proc_count++;
+                       continue;
+               }
+
+               strlcpy(list_entry->p_name, p->p_name, MAXCOMLEN + 1);
+
+               list_entry->p_pid = p->p_pid;
+       
+               state = p->p_memstat_state;
+
+               if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) ||
+                       !(state & P_MEMSTAT_SUSPENDED)) {
+
+                       try_freeze = list_entry->freeze_has_memstat_state = FALSE;
+               } else {
+                       try_freeze = list_entry->freeze_has_memstat_state = TRUE;
+               }
+
+               list_entry->p_memstat_state = state;
+
+               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+               if (pages < memorystatus_freeze_pages_min) {
+                       try_freeze = list_entry->freeze_has_pages_min = FALSE;
+               } else {
+                       list_entry->freeze_has_pages_min = TRUE;
+                       if (try_freeze != FALSE) {
+                               try_freeze = TRUE;
+                       }
+               }
+
+               list_entry->p_pages = pages;
+       
+               if (entry_count) {
+                       uint32_t j = 0;
+                       for (j = 0; j < entry_count; j++ ) {
+                               if (strncmp(memorystatus_global_probabilities_table[j].proc_name,
+                                           p->p_name,
+                                           MAXCOMLEN + 1) == 0) {
+
+                                       probability_of_use = memorystatus_global_probabilities_table[j].use_probability;
+                                       break;
+                               }
+                       }
+
+                       list_entry->freeze_has_probability = probability_of_use;
+
+                       if (probability_of_use && try_freeze != FALSE) {
+                               try_freeze = TRUE;
+                       } else {
+                               try_freeze = FALSE;
+                       }
+               } else {
+                       if (try_freeze != FALSE) {
+                               try_freeze = TRUE;
+                       }
+                       list_entry->freeze_has_probability = -1;
+               }
+
+               if (try_freeze) {
+               
+                       uint32_t purgeable, wired, clean, dirty, shared;
+                       uint32_t max_pages = 0;
+                       int freezer_error_code = 0;
+
+                       error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, TRUE /* eval only */);
+
+                       if (error) {
+                               list_entry->p_freeze_error_code = freezer_error_code;
+                       }
+
+                       list_entry->freeze_attempted = TRUE;
+               }
+
+               list_entry++;
+
+               list_size += sizeof(proc_freezable_status_t);
+               
+               p = memorystatus_get_next_proc_locked(&i, p, FALSE);
+               proc_count++;
+       }
+       
+       proc_list_unlock();
+
+       buffer_size = list_size;
+
+       error = copyout(list_head, buffer, buffer_size);
+       if (error == 0) {
+               *retval = buffer_size;
+       } else {
+               *retval = 0;
+       }
+
+       list_size = sizeof(global_freezable_status_t) + (sizeof(proc_freezable_status_t) * MAX_FREEZABLE_PROCESSES);
+       kfree(list_head, list_size);
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_freezer_get_status: returning %d (%lu - size)\n", error, (unsigned long)list_size);
+       
+       return error;
+}
+
+static int
+memorystatus_freezer_control(int32_t flags, user_addr_t buffer, size_t buffer_size, int32_t *retval)
+{
+       int err = ENOTSUP;
+
+       if (flags == FREEZER_CONTROL_GET_STATUS) {
+               err = memorystatus_freezer_get_status(buffer, buffer_size, retval);
+       }
+
+       return err;
+}
+
 #endif /* CONFIG_FREEZE */
 
 #endif /* DEVELOPMENT || DEBUG */
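memorystatus_freezer_control() is reached from userland through the memorystatus_control() syscall wrapper; on this DEVELOPMENT || DEBUG path the caller passes FREEZER_CONTROL_GET_STATUS and a buffer that receives one global_freezable_status_t followed by up to MAX_FREEZABLE_PROCESSES proc_freezable_status_t entries. A sketch is below; the MEMORYSTATUS_CMD_FREEZER_CONTROL command number and the declaration of memorystatus_control() in the private <sys/kern_memorystatus.h> header are assumptions, with placeholder values marked as such.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/types.h>

/* Private syscall wrapper; assumed to be declared in <sys/kern_memorystatus.h>. */
extern int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags,
    void *buffer, size_t buffersize);

/* Placeholder values -- take the real ones from the private header. */
#ifndef MEMORYSTATUS_CMD_FREEZER_CONTROL
#define MEMORYSTATUS_CMD_FREEZER_CONTROL 20
#endif
#ifndef FREEZER_CONTROL_GET_STATUS
#define FREEZER_CONTROL_GET_STATUS 1
#endif

int
main(void)
{
    /*
     * The handler rejects buffers smaller than its computed list size
     * (EINVAL), so oversize generously; on success the return value is
     * the number of bytes actually written (the *retval above).
     */
    size_t bufsize = 64 * 1024;
    void *buf = calloc(1, bufsize);
    if (buf == NULL) {
        return 1;
    }

    int used = memorystatus_control(MEMORYSTATUS_CMD_FREEZER_CONTROL, 0,
        FREEZER_CONTROL_GET_STATUS, buf, bufsize);
    if (used < 0) {
        perror("memorystatus_control(FREEZER_CONTROL_GET_STATUS)");
        free(buf);
        return 1;
    }

    printf("freezer status: %d bytes (global header + per-process entries)\n", used);
    free(buf);
    return 0;
}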
@@ -1390,7 +1752,7 @@ static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_inde
        p = TAILQ_FIRST(&current_bucket->list);
 
        while (p) {
-               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
+               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
                max_pages = pages;
                max_proc = p;
                prev_max_proc = p;
@@ -1398,7 +1760,7 @@ static void memorystatus_sort_by_largest_process_locked(unsigned int bucket_inde
                while ((next_p = TAILQ_NEXT(p, p_memstat_list)) != NULL) {
                        /* traversing list until we find next largest process */
                        p=next_p;
-                       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
+                       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
                        if (pages > max_pages) {
                                max_pages = pages;
                                max_proc = p;
@@ -1459,16 +1821,65 @@ static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc
        return next_p;
 }
 
+/*
+ * Structure to hold state for a jetsam thread.
+ * Typically there should be a single jetsam thread
+ * unless parallel jetsam is enabled.
+ */
+struct jetsam_thread_state {
+       boolean_t       inited; /* if the thread is initialized */
+       int             memorystatus_wakeup; /* wake channel */
+       int             index; /* jetsam thread index */
+       thread_t        thread; /* jetsam thread pointer */
+} *jetsam_threads;
+
+/* Maximum number of jetsam threads allowed */
+#define JETSAM_THREADS_LIMIT   3
+
+/* Number of active jetsam threads */
+_Atomic int active_jetsam_threads = 1;
+
+/* Number of maximum jetsam threads configured */
+int max_jetsam_threads = JETSAM_THREADS_LIMIT;
+
+/*
+ * Global switch for enabling fast jetsam. Fast jetsam is
+ * hooked up via the system_override() system call. It has the
+ * following effects:
+ * - Raise the jetsam threshold ("clear-the-deck")
+ * - Enable parallel jetsam on eligible devices
+ */
+int fast_jetsam_enabled = 0;
+
+/* Routine to find the jetsam state structure for the current jetsam thread */
+static inline struct jetsam_thread_state *
+jetsam_current_thread(void)
+{
+       for (int thr_id = 0; thr_id < max_jetsam_threads; thr_id++) {
+               if (jetsam_threads[thr_id].thread == current_thread())
+                       return &(jetsam_threads[thr_id]);
+       }
+       panic("jetsam_current_thread() is being called from a non-jetsam thread\n");
+       /* Control should not reach here */
+       return NULL;
+}
+
+
 __private_extern__ void
 memorystatus_init(void)
 {
-       thread_t thread = THREAD_NULL;
        kern_return_t result;
        int i;
 
 #if CONFIG_FREEZE
+       memorystatus_freeze_jetsam_band = JETSAM_PRIORITY_UI_SUPPORT;
+       memorystatus_frozen_processes_max = FREEZE_PROCESSES_MAX;
+       memorystatus_frozen_shared_mb_max = ((MAX_FROZEN_SHARED_MB_PERCENT * max_task_footprint_mb) / 100); /* 10% of the system wide task limit */
+       memorystatus_freeze_shared_mb_per_process_max = (memorystatus_frozen_shared_mb_max / 4);
        memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
        memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
+       memorystatus_max_frozen_demotions_daily = MAX_FROZEN_PROCESS_DEMOTIONS;
+       memorystatus_thaw_count_demotion_threshold = MIN_THAW_DEMOTION_THRESHOLD;
 #endif
 
 #if DEVELOPMENT || DEBUG
@@ -1591,13 +2002,22 @@ memorystatus_init(void)
 #endif /* CONFIG_JETSAM */
 
        memorystatus_jetsam_snapshot_max = maxproc;
+
+       memorystatus_jetsam_snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+               (sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
+
        memorystatus_jetsam_snapshot = 
-               (memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
-               sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
+               (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
        if (!memorystatus_jetsam_snapshot) {
                panic("Could not allocate memorystatus_jetsam_snapshot");
        }
 
+       memorystatus_jetsam_snapshot_copy =
+               (memorystatus_jetsam_snapshot_t*)kalloc(memorystatus_jetsam_snapshot_size);
+       if (!memorystatus_jetsam_snapshot_copy) {
+               panic("Could not allocate memorystatus_jetsam_snapshot_copy");
+       }
+
        nanoseconds_to_absolutetime((uint64_t)JETSAM_SNAPSHOT_TIMEOUT_SECS * NSEC_PER_SEC, &memorystatus_jetsam_snapshot_timeout);
 
        memset(&memorystatus_at_boot_snapshot, 0, sizeof(memorystatus_jetsam_snapshot_t));
@@ -1606,11 +2026,41 @@ memorystatus_init(void)
        memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
 #endif
        
-       result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
-       if (result == KERN_SUCCESS) {
-               thread_deallocate(thread);
-       } else {
-               panic("Could not create memorystatus_thread");
+       /* Check the boot-arg to see if fast jetsam is allowed */
+       if (!PE_parse_boot_argn("fast_jetsam_enabled", &fast_jetsam_enabled, sizeof (fast_jetsam_enabled))) {
+               fast_jetsam_enabled = 0;
+       }
+
+       /* Check the boot-arg to configure the maximum number of jetsam threads */
+       if (!PE_parse_boot_argn("max_jetsam_threads", &max_jetsam_threads, sizeof (max_jetsam_threads))) {
+               max_jetsam_threads = JETSAM_THREADS_LIMIT;
+       }
+
+       /* Restrict the maximum number of jetsam threads to JETSAM_THREADS_LIMIT */
+       if (max_jetsam_threads > JETSAM_THREADS_LIMIT) {
+               max_jetsam_threads = JETSAM_THREADS_LIMIT;
+        }
+
+       /* For low CPU systems disable fast jetsam mechanism */
+       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+               max_jetsam_threads = 1;
+               fast_jetsam_enabled = 0;
+       }
+
+       /* Initialize the jetsam_threads state array */
+       jetsam_threads = kalloc(sizeof(struct jetsam_thread_state) * max_jetsam_threads);
+
+       /* Initialize all the jetsam threads */
+       for (i = 0; i < max_jetsam_threads; i++) {
+
+               result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &jetsam_threads[i].thread);
+               if (result == KERN_SUCCESS) {
+                       jetsam_threads[i].inited = FALSE;
+                       jetsam_threads[i].index = i;
+                       thread_deallocate(jetsam_threads[i].thread);
+               } else {
+                       panic("Could not create memorystatus_thread %d", i);
+               }
        }
 }
 
@@ -1658,15 +2108,20 @@ memorystatus_do_kill(proc_t p, uint32_t cause, os_reason_t jetsam_reason) {
                       (uint64_t)memorystatus_available_pages);
        }
 
+       /*
+        * The jetsam_reason (os_reason_t) has enough information about the kill cause.
+        * We don't really need jetsam_flags anymore, so it's okay that not all possible kill causes have been mapped.
+        */
        int jetsam_flags = P_LTERM_JETSAM;
        switch (cause) {
-               case kMemorystatusKilledHiwat:                  jetsam_flags |= P_JETSAM_HIWAT; break;
-               case kMemorystatusKilledVnodes:                 jetsam_flags |= P_JETSAM_VNODE; break;
-               case kMemorystatusKilledVMPageShortage:         jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
-               case kMemorystatusKilledVMThrashing:            jetsam_flags |= P_JETSAM_VMTHRASHING; break;
-               case kMemorystatusKilledFCThrashing:            jetsam_flags |= P_JETSAM_FCTHRASHING; break;
-               case kMemorystatusKilledPerProcessLimit:        jetsam_flags |= P_JETSAM_PID; break;
-               case kMemorystatusKilledIdleExit:               jetsam_flags |= P_JETSAM_IDLEEXIT; break;
+               case kMemorystatusKilledHiwat:                                          jetsam_flags |= P_JETSAM_HIWAT; break;
+               case kMemorystatusKilledVnodes:                                         jetsam_flags |= P_JETSAM_VNODE; break;
+               case kMemorystatusKilledVMPageShortage:                         jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
+               case kMemorystatusKilledVMCompressorThrashing:
+               case kMemorystatusKilledVMCompressorSpaceShortage:      jetsam_flags |= P_JETSAM_VMTHRASHING; break;
+               case kMemorystatusKilledFCThrashing:                            jetsam_flags |= P_JETSAM_FCTHRASHING; break;
+               case kMemorystatusKilledPerProcessLimit:                        jetsam_flags |= P_JETSAM_PID; break;
+               case kMemorystatusKilledIdleExit:                                       jetsam_flags |= P_JETSAM_IDLEEXIT; break;
        }
        error = jetsam_do_kill(p, jetsam_flags, jetsam_reason);
 
@@ -1702,7 +2157,7 @@ memorystatus_check_levels_locked(void) {
  */
 
 int
-memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, boolean_t effective_now)
+memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags, int jetsam_prio, boolean_t effective_now)
 {
        int error = 0;  
        boolean_t enable = FALSE;
@@ -1734,7 +2189,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags,
                                memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
                                if (effective_now) {
-                                       if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                       if (p->p_memstat_effectivepriority < jetsam_prio) {
                                                if(memorystatus_highwater_enabled) {
                                                        /*
                                                         * Process is about to transition from
@@ -1746,7 +2201,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags,
                                                        CACHE_ACTIVE_LIMITS_LOCKED(p, is_fatal);
                                                        task_set_phys_footprint_limit_internal(p->task, (p->p_memstat_memlimit > 0) ? p->p_memstat_memlimit : -1, NULL, use_active, is_fatal);
                                                }
-                                               memorystatus_update_priority_locked(p, JETSAM_PRIORITY_ELEVATED_INACTIVE, FALSE, FALSE);
+                                               memorystatus_update_priority_locked(p, jetsam_prio, FALSE, FALSE);
                                        }
                                } else {
                                        if (isProcessInAgingBands(p)) {
@@ -1759,7 +2214,7 @@ memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t op_flags,
                                memorystatus_invalidate_idle_demotion_locked(p, TRUE);
 
                                if (effective_now) {
-                                       if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                       if (p->p_memstat_effectivepriority == jetsam_prio) {
                                                memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, FALSE, TRUE);
                                        }
                                } else {
@@ -2122,21 +2577,48 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser
                         * We can, however, override that if the process has an 'elevated inactive jetsam band' attribute.
                         */
 
-                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
-                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
-
+                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+                               /*
+                                * 2 types of processes can use the non-standard elevated inactive band:
+                                * - Frozen processes that always land in memorystatus_freeze_jetsam_band
+                                * OR
+                                * - processes that specifically opt-in to the elevated inactive support e.g. docked processes.
+                                */
+#if CONFIG_FREEZE
+                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
+                                       if (priority <= memorystatus_freeze_jetsam_band) {
+                                               priority = memorystatus_freeze_jetsam_band;
+                                       } 
+                               } else
+#endif /* CONFIG_FREEZE */
+                               {
+                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
+                                       }
+                               }
                                assert(! (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS));
                        }
                } else if (isApp(p)) {
                
                        /*
                         * Check to see if the application is being lowered in jetsam priority. If so, and:
-                        * - it has an 'elevated inactive jetsam band' attribute, then put it in the JETSAM_PRIORITY_ELEVATED_INACTIVE band.
+                        * - it has an 'elevated inactive jetsam band' attribute, then put it in the appropriate band.
                         * - it is a normal application, then let it age in the aging band if that policy is in effect.
                         */
        
-                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE && (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND)) {
-                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
+                       if (p->p_memstat_state & P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND) {
+#if CONFIG_FREEZE
+                               if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
+                                       if (priority <= memorystatus_freeze_jetsam_band) {
+                                               priority = memorystatus_freeze_jetsam_band;
+                                       } 
+                               } else 
+#endif /* CONFIG_FREEZE */
+                               {
+                                       if (priority <= JETSAM_PRIORITY_ELEVATED_INACTIVE) {
+                                               priority = JETSAM_PRIORITY_ELEVATED_INACTIVE;
+                                       }
+                               }
                        } else {
 
                                if (applications_aging_band) {
@@ -2259,6 +2741,15 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser
                if (now > p->p_memstat_idle_start) {
                        p->p_memstat_idle_delta = now - p->p_memstat_idle_start;
                }
+
+               /*
+                * About to become active and so memory footprint could change.
+                * So mark it eligible for freeze-considerations next time around.
+                */
+               if (p->p_memstat_state & P_MEMSTAT_FREEZE_IGNORE) {
+                       p->p_memstat_state &= ~P_MEMSTAT_FREEZE_IGNORE;
+               }
+
        } else if (priority == JETSAM_PRIORITY_IDLE) {
                /*
                 * Transitioning into the idle priority bucket.
@@ -2267,6 +2758,8 @@ memorystatus_update_priority_locked(proc_t p, int priority, boolean_t head_inser
                p->p_memstat_idle_start = mach_absolute_time();
        }
 
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);
+
        p->p_memstat_effectivepriority = priority;
 
 #if CONFIG_SECLUDED_MEMORY
@@ -2536,11 +3029,18 @@ memorystatus_remove(proc_t p, boolean_t locked)
 
 #if CONFIG_FREEZE    
        if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
+
+               if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
+                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count--;
+               }
+
                memorystatus_frozen_count--;
+               memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
+               p->p_memstat_freeze_sharedanon_pages = 0;
        }
 
        if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
-               memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
                memorystatus_suspended_count--;
        }
 #endif
@@ -2587,8 +3087,15 @@ memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
                return EINVAL;
        }
 
-       /* Deferral is only relevant if idle exit is specified */
+       /* Only one type of DEFER behavior is allowed.*/
        if ((pcontrol & PROC_DIRTY_DEFER) && 
+           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) {
+               return EINVAL;
+       }
+
+       /* Deferral is only relevant if idle exit is specified */
+       if (((pcontrol & PROC_DIRTY_DEFER) ||
+           (pcontrol & PROC_DIRTY_DEFER_ALWAYS)) &&
           !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
                return EINVAL;
        }
@@ -2714,12 +3221,18 @@ memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
 
 
        /* This can be set and cleared exactly once. */
-       if (pcontrol & PROC_DIRTY_DEFER) {
+       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
 
-               if ( !(old_dirty & P_DIRTY_DEFER)) {
+               if ((pcontrol & (PROC_DIRTY_DEFER)) &&
+                   !(old_dirty & P_DIRTY_DEFER)) {
                        p->p_memstat_dirty |= P_DIRTY_DEFER;
                }
 
+               if ((pcontrol & (PROC_DIRTY_DEFER_ALWAYS)) &&
+                   !(old_dirty & P_DIRTY_DEFER_ALWAYS)) {
+                       p->p_memstat_dirty |= P_DIRTY_DEFER_ALWAYS;
+               }
+
                defer_now = TRUE;
        }
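For reference, the PROC_DIRTY_* control bits handled here are set from userland; the sketch below uses the private libproc wrappers proc_track_dirty()/proc_set_dirty(), whose availability (and the exported spelling and values of the flag constants) is assumed rather than shown in this diff.

#include <stdbool.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>

/* Private libproc wrappers (libproc_internal.h); assumed available. */
extern int proc_track_dirty(pid_t pid, uint32_t flags);
extern int proc_set_dirty(pid_t pid, bool dirty);

/* Flag names as used by the validation code above; placeholder values. */
#ifndef PROC_DIRTY_TRACK
#define PROC_DIRTY_TRACK            0x1
#endif
#ifndef PROC_DIRTY_ALLOWS_IDLE_EXIT
#define PROC_DIRTY_ALLOWS_IDLE_EXIT 0x2
#endif
#ifndef PROC_DIRTY_DEFER_ALWAYS
#define PROC_DIRTY_DEFER_ALWAYS     0x10
#endif

int
main(void)
{
    /*
     * Opt into dirty tracking with idle exit, asking for the deferral
     * (aging band) window on every dirty->clean transition rather than
     * only the one-shot window granted at launch (PROC_DIRTY_DEFER).
     */
    if (proc_track_dirty(getpid(),
            PROC_DIRTY_TRACK | PROC_DIRTY_ALLOWS_IDLE_EXIT | PROC_DIRTY_DEFER_ALWAYS) != 0) {
        return 1;
    }

    proc_set_dirty(getpid(), true);    /* doing work: protected from idle exit */
    /* ... */
    proc_set_dirty(getpid(), false);   /* clean again: deferral timer re-arms each time */

    return 0;
}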
 
@@ -2901,6 +3414,8 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
                        /*
                         * Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band OR it might be heading back
                         * there once it's clean again. For the legacy case, this only applies if it has some protection window left.
+                        * P_DIRTY_DEFER: one-time protection window given at launch
+                        * P_DIRTY_DEFER_ALWAYS: protection window given for every dirty->clean transition. Like non-legacy mode.
                         *
                         * Non-Legacy mode: P_DIRTY_AGING_IN_PROGRESS means the process is in the aging band. It will always stop over
                         * in that band on its way to IDLE.
@@ -2924,9 +3439,11 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
                                 */
 
                                if (jetsam_aging_policy == kJetsamAgingPolicyLegacy) {
-                                       if (mach_absolute_time() >= p->p_memstat_idledeadline) {
+                                       if (((p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) == FALSE) &&
+                                           (mach_absolute_time() >= p->p_memstat_idledeadline)) {
                                                /*
-                                                * The process' deadline has expired. It currently
+                                                * The process hasn't enrolled in the "always defer after dirty"
+                                                * mode and its deadline has expired. It currently
                                                 * does not reside in any of the aging buckets.
                                                 * 
                                                 * It's on its way to the JETSAM_PRIORITY_IDLE 
@@ -2942,12 +3459,16 @@ memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
                                                reschedule = TRUE;
                                        } else {
                                                /*
-                                                * It still has some protection window left and so
+                                                * Process enrolled in "always stop in deferral band after dirty" OR
+                                                * it still has some protection window left and so
                                                 * we just re-arm the timer without modifying any
                                                 * state on the process iff it still wants into that band.
                                                 */
 
-                                               if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
+                                               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
+                                                       memorystatus_schedule_idle_demotion_locked(p, TRUE);
+                                                       reschedule = TRUE;
+                                               } else if (p->p_memstat_dirty & P_DIRTY_AGING_IN_PROGRESS) {
                                                        memorystatus_schedule_idle_demotion_locked(p, FALSE);
                                                        reschedule = TRUE;
                                                }
@@ -3077,7 +3598,7 @@ memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {
                goto exit;
        } 
 
-       if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER)) == 0) {
+       if (!pcontrol || (pcontrol & (PROC_DIRTY_LAUNCH_IN_PROGRESS | PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) == 0) {
                ret = EINVAL;
                goto exit;
        }
@@ -3087,16 +3608,19 @@ memorystatus_dirty_clear(proc_t p, uint32_t pcontrol) {
        }
 
        /* This can be set and cleared exactly once. */
-       if (pcontrol & PROC_DIRTY_DEFER) {
+       if (pcontrol & (PROC_DIRTY_DEFER | PROC_DIRTY_DEFER_ALWAYS)) {
 
-               if (p->p_memstat_dirty & P_DIRTY_DEFER) {
+               if (p->p_memstat_dirty & P_DIRTY_DEFER) {
+                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER);
+               }
 
-                       p->p_memstat_dirty &= ~P_DIRTY_DEFER;
+               if (p->p_memstat_dirty & P_DIRTY_DEFER_ALWAYS) {
+                       p->p_memstat_dirty &= ~(P_DIRTY_DEFER_ALWAYS);
+               }
 
-                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
-                       memorystatus_update_idle_priority_locked(p);
-                       memorystatus_reschedule_idle_demotion_locked();
-               }
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+               memorystatus_update_idle_priority_locked(p);
+               memorystatus_reschedule_idle_demotion_locked();
        }
 
        ret = 0;
@@ -3156,12 +3680,10 @@ memorystatus_on_suspend(proc_t p)
 {
 #if CONFIG_FREEZE
        uint32_t pages;
-       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
+       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
 #endif
        proc_list_lock();
 #if CONFIG_FREEZE
-       p->p_memstat_suspendedfootprint = pages;
-       memorystatus_suspended_footprint_total += pages;
        memorystatus_suspended_count++;
 #endif
        p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
@@ -3181,17 +3703,36 @@ memorystatus_on_resume(proc_t p)
 #if CONFIG_FREEZE
        frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
        if (frozen) {
-               memorystatus_frozen_count--;
-               p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
+               /*
+                * Now that we don't _thaw_ a process completely,
+                * resuming it (and having some on-demand swapins)
+                * shouldn't preclude it from being counted as frozen.
+                *
+                * memorystatus_frozen_count--;
+                *
+                * We preserve the P_MEMSTAT_FROZEN state since the process
+                * could have state on disk AND so will deserve some protection
+                * in the jetsam bands.
+                */
+               if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == 0) {
+                       p->p_memstat_state |= P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count++;
+               }
+               p->p_memstat_thaw_count++;
+
+               memorystatus_thaw_count++;
        }
 
-       memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
        memorystatus_suspended_count--;
        
        pid = p->p_pid;
 #endif
 
-       p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
+       /*
+        * P_MEMSTAT_FROZEN will remain unchanged. This used to be:
+        * p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
+        */
+       p->p_memstat_state &= ~P_MEMSTAT_SUSPENDED;
 
        proc_list_unlock();
     
@@ -3227,7 +3768,7 @@ memorystatus_build_state(proc_t p) {
        if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
                snapshot_state |= kMemorystatusFrozen;
        }
-       if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
+       if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
                snapshot_state |= kMemorystatusWasThawed;
        }
        
@@ -3296,19 +3837,49 @@ kill_idle_exit_proc(void)
 }
 
 static void
-memorystatus_thread_wake(void) {
-       thread_wakeup((event_t)&memorystatus_wakeup);
+memorystatus_thread_wake(void)
+{      
+       int thr_id = 0;
+       int active_thr = atomic_load(&active_jetsam_threads);
+
+       /* Wakeup all the jetsam threads */
+       for (thr_id = 0; thr_id < active_thr; thr_id++) {
+               thread_wakeup((event_t)&jetsam_threads[thr_id].memorystatus_wakeup); 
+       }       
+}      
+               
+#if CONFIG_JETSAM
+
+static void    
+memorystatus_thread_pool_max()
+{
+       /* Increase the jetsam thread pool to max_jetsam_threads */
+       int max_threads = max_jetsam_threads;
+       printf("Expanding memorystatus pool to %d!\n", max_threads);
+       atomic_store(&active_jetsam_threads, max_threads);
+}
+
+static void
+memorystatus_thread_pool_default()
+{
+       /* Restore the jetsam thread pool to a single thread */
+       printf("Reverting memorystatus pool back to 1\n");
+       atomic_store(&active_jetsam_threads, 1);
 }
 
+#endif /* CONFIG_JETSAM */
+
 extern void vm_pressure_response(void);
 
 static int
 memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
 {
+       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
+
        if (interval_ms) {
-               assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
+               assert_wait_timeout(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT, interval_ms, NSEC_PER_MSEC);
        } else {
-               assert_wait(&memorystatus_wakeup, THREAD_UNINT);
+               assert_wait(&jetsam_thread->memorystatus_wakeup, THREAD_UNINT);
        }
        
        return thread_block(continuation);   
@@ -3380,6 +3951,192 @@ memorystatus_action_needed(void)
 #endif /* CONFIG_EMBEDDED */
 }
 
+#if CONFIG_FREEZE
+extern void            vm_swap_consider_defragmenting(int);
+
+/*
+ * This routine will _jetsam_ all frozen processes
+ * and reclaim the swap space immediately.
+ *
+ * So freeze has to be DISABLED when we call this routine.
+ */
+
+void
+memorystatus_disable_freeze(void)
+{
+       memstat_bucket_t *bucket;
+       int bucket_count = 0, retries = 0;
+       boolean_t retval = FALSE, killed = FALSE;
+       uint32_t errors = 0, errors_over_prev_iteration = 0;
+       os_reason_t jetsam_reason = 0;
+       unsigned int band = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+
+       assert(memorystatus_freeze_enabled == FALSE);
+
+       jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE);
+       if (jetsam_reason == OS_REASON_NULL) {
+               printf("memorystatus_disable_freeze: failed to allocate jetsam reason\n");
+       }
+
+       /*
+        * Let's relocate all frozen processes into band 8. Demoted frozen processes
+        * are sitting in band 0 currently and it's possible to have a frozen process
+        * in the FG band being actively used. We don't reset its frozen state when
+        * it is resumed because it has state on disk.
+        *
+        * We choose to do this relocation rather than implement a new 'kill frozen'
+        * process function for these reasons:
+        * - duplication of code: too many kill functions exist and we need to rework them better.
+        * - disk-space-shortage kills are rare
+        * - not having the 'real' jetsam band at the time of this frozen kill won't preclude us
+        *   from answering any important questions re. jetsam policy/effectiveness.
+        *
+        * This is essentially what memorystatus_update_inactive_jetsam_priority_band() does while
+        * avoiding the application of memory limits.
+        */
+
+again:
+       proc_list_lock();
+
+       band = JETSAM_PRIORITY_IDLE;
+       p = PROC_NULL;
+       next_p = PROC_NULL;
+
+       next_p = memorystatus_get_first_proc_locked(&band, TRUE);
+       while (next_p) {
+
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&band, p, TRUE);
+
+               if (p->p_memstat_effectivepriority > JETSAM_PRIORITY_FOREGROUND) {
+                       break;
+               }
+
+               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                       continue;
+               }
+
+               if (p->p_memstat_state & P_MEMSTAT_ERROR) {
+                       p->p_memstat_state &= ~P_MEMSTAT_ERROR;
+               }
+
+               if (p->p_memstat_effectivepriority == memorystatus_freeze_jetsam_band) {
+                       continue;
+               }
+
+               /*
+                * We explicitly add this flag here so the process looks like a normal
+                * frozen process i.e. P_MEMSTAT_FROZEN and P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND.
+                * We don't bother with assigning the 'active' memory
+                * limits at this point because we are going to be killing it soon below.
+                */
+               p->p_memstat_state |= P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+               memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+
+               memorystatus_update_priority_locked(p, memorystatus_freeze_jetsam_band, FALSE, TRUE);
+       }
+
+       bucket = &memstat_bucket[memorystatus_freeze_jetsam_band];
+       bucket_count = bucket->count;
+       proc_list_unlock();
+
+       /*
+        * Bucket count is already stale at this point. But, we don't expect
+        * freezing to continue since we have already disabled the freeze functionality.
+        * However, an existing freeze might be in progress. So we might miss that process
+        * in the first go-around. We hope to catch it in the next.
+        */
+
+       errors_over_prev_iteration = 0;
+       while (bucket_count) {
+
+               bucket_count--;
+
+               /*
+                * memorystatus_kill_elevated_process() drops a reference,
+                * so take another one so we can continue to use this exit reason
+                * even after it returns.
+                */
+
+               os_reason_ref(jetsam_reason);
+               retval = memorystatus_kill_elevated_process(
+                       kMemorystatusKilledDiskSpaceShortage,
+                       jetsam_reason,
+                       memorystatus_freeze_jetsam_band,
+                       0, /* the iteration of aggressive jetsam; ignored here */
+                       &errors);
+
+               if (errors > 0) {
+                       printf("memorystatus_disable_freeze: memorystatus_kill_elevated_process returned %d error(s)\n", errors);
+                       errors_over_prev_iteration += errors;
+                       errors = 0;
+               }
+
+               if (retval == 0) {
+                       /*
+                        * No frozen processes left to kill.
+                        */
+                       break;
+               }
+
+               killed = TRUE;
+       }
+
+       proc_list_lock();
+
+       if (memorystatus_frozen_count) {
+               /*
+                * A frozen process snuck in, so go
+                * back around to kill it. That
+                * process may have been resumed and
+                * put into the FG band too. So we
+                * have to do the relocation again.
+                */
+               assert(memorystatus_freeze_enabled == FALSE);
+
+               retries++;
+               if (retries < 3) {
+                       proc_list_unlock();
+                       goto again;
+               }
+#if DEVELOPMENT || DEBUG
+               panic("memorystatus_disable_freeze: Failed to kill all frozen processes, memorystatus_frozen_count = %d, errors = %d",
+                               memorystatus_frozen_count, errors_over_prev_iteration);
+#endif /* DEVELOPMENT || DEBUG */
+       }
+       proc_list_unlock();
+
+       os_reason_free(jetsam_reason);
+
+       if (killed) {
+
+               vm_swap_consider_defragmenting(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM);
+
+               proc_list_lock();
+               size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
+                       sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
+               uint64_t timestamp_now = mach_absolute_time();
+               memorystatus_jetsam_snapshot->notification_time = timestamp_now;
+               memorystatus_jetsam_snapshot->js_gencount++;
+               if (memorystatus_jetsam_snapshot_count > 0 && (memorystatus_jetsam_snapshot_last_timestamp == 0 ||
+                               timestamp_now > memorystatus_jetsam_snapshot_last_timestamp + memorystatus_jetsam_snapshot_timeout)) {
+                       proc_list_unlock();
+                       int ret = memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
+                       if (!ret) {
+                               proc_list_lock();
+                               memorystatus_jetsam_snapshot_last_timestamp = timestamp_now;
+                               proc_list_unlock();
+                       }
+               } else {
+                       proc_list_unlock();
+               }
+       }
+
+       return;
+}
+#endif /* CONFIG_FREEZE */
+
 static boolean_t
 memorystatus_act_on_hiwat_processes(uint32_t *errors, uint32_t *hwm_kill, boolean_t *post_snapshot, __unused boolean_t *is_critical)
 {
@@ -3557,6 +4314,7 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_
                                killed = memorystatus_kill_elevated_process(
                                        cause,
                                        jetsam_reason,
+                                       JETSAM_PRIORITY_ELEVATED_INACTIVE,
                                        jld_eval_aggressive_count,
                                        &errors);
 
@@ -3582,11 +4340,11 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_
 
                        /*
                         * memorystatus_kill_top_process_aggressive() allocates its own
-                        * jetsam_reason so the kMemorystatusKilledVMThrashing cause
+                        * jetsam_reason so the kMemorystatusKilledProcThrashing cause
                         * is consistent throughout the aggressive march.
                         */
                        killed = memorystatus_kill_top_process_aggressive(
-                               kMemorystatusKilledVMThrashing,
+                               kMemorystatusKilledProcThrashing,
                                jld_eval_aggressive_count, 
                                jld_priority_band_max, 
                                &errors);
@@ -3609,26 +4367,31 @@ memorystatus_act_aggressive(uint32_t cause, os_reason_t jetsam_reason, int *jld_
 static void
 memorystatus_thread(void *param __unused, wait_result_t wr __unused)
 {
-       static boolean_t is_vm_privileged = FALSE;
-
        boolean_t post_snapshot = FALSE;
        uint32_t errors = 0;
        uint32_t hwm_kill = 0;
        boolean_t sort_flag = TRUE;
        boolean_t corpse_list_purged = FALSE;
        int     jld_idle_kills = 0;
+       struct jetsam_thread_state *jetsam_thread = jetsam_current_thread();
 
-       if (is_vm_privileged == FALSE) {
+       if (jetsam_thread->inited == FALSE) {
                /* 
                 * It's the first time the thread has run, so just mark the thread as privileged and block.
                 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
                 */
+
+               char name[32];
                thread_wire(host_priv_self(), current_thread(), TRUE);
-               is_vm_privileged = TRUE;
-               
-               if (vm_restricted_to_single_processor == TRUE)
-                       thread_vm_bind_group_add();
-               thread_set_thread_name(current_thread(), "VM_memorystatus");
+               snprintf(name, 32, "VM_memorystatus_%d", jetsam_thread->index + 1);
+
+               if (jetsam_thread->index == 0) {
+                       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
+                               thread_vm_bind_group_add();
+                       }
+               }
+               thread_set_thread_name(current_thread(), name);
+               jetsam_thread->inited = TRUE;
                memorystatus_thread_block(0, memorystatus_thread);
        }
        
@@ -3659,8 +4422,11 @@ memorystatus_thread(void *param __unused, wait_result_t wr __unused)
                        case kMemorystatusKilledFCThrashing:
                                jetsam_reason_code = JETSAM_REASON_MEMORY_FCTHRASHING;
                                break;
-                       case kMemorystatusKilledVMThrashing:
-                               jetsam_reason_code = JETSAM_REASON_MEMORY_VMTHRASHING;
+                       case kMemorystatusKilledVMCompressorThrashing:
+                               jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING;
+                               break;
+                       case kMemorystatusKilledVMCompressorSpaceShortage:
+                               jetsam_reason_code = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE;
                                break;
                        case kMemorystatusKilledZoneMapExhaustion:
                                jetsam_reason_code = JETSAM_REASON_ZONE_MAP_EXHAUSTION;
@@ -3863,7 +4629,7 @@ memorystatus_on_ledger_footprint_exceeded(boolean_t warning, boolean_t memlimit_
                jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_PERPROCESSLIMIT);
                if (jetsam_reason == NULL) {
                        printf("task_exceeded footprint: failed to allocate jetsam reason\n");
-               } else if (corpse_for_fatal_memkill != 0) {
+               } else if (corpse_for_fatal_memkill != 0 && proc_send_synchronous_EXC_RESOURCE(p) == FALSE) {
                        /* Set OS_REASON_FLAG_GENERATE_CRASH_REPORT to generate corpse */
                        jetsam_reason->osr_flags |= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
                }
@@ -4254,7 +5020,6 @@ memorystatus_allowed_vm_map_fork(task_t task)
 #if CONFIG_EMBEDDED
 
        uint64_t footprint_in_bytes;
-       uint64_t purgeable_in_bytes;
        uint64_t max_allowed_bytes;
 
        if (max_task_footprint_mb == 0) {
@@ -4262,17 +5027,12 @@ memorystatus_allowed_vm_map_fork(task_t task)
                return (is_allowed);
        }
 
-       purgeable_in_bytes = get_task_purgeable_size(task);
        footprint_in_bytes = get_task_phys_footprint(task);
 
        /*
-        * Maximum is half the system-wide task limit.
+        * Maximum is 1/4 of the system-wide task limit.
         */
-       max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 1;
-
-       if (footprint_in_bytes > purgeable_in_bytes) {
-               footprint_in_bytes -= purgeable_in_bytes;
-       }
+       max_allowed_bytes = ((uint64_t)max_task_footprint_mb * 1024 * 1024) >> 2;
 
        if (footprint_in_bytes > max_allowed_bytes) {
                printf("memorystatus disallowed vm_map_fork %lld  %lld\n", footprint_in_bytes, max_allowed_bytes);
@@ -4287,7 +5047,7 @@ memorystatus_allowed_vm_map_fork(task_t task)
 }
 
 static void
-memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
+memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint_lifetime, uint32_t *purgeable_pages)
 {
        assert(task);
        assert(footprint);
@@ -4298,11 +5058,6 @@ memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *ma
        assert(((uint32_t)pages) == pages);
        *footprint = (uint32_t)pages;
 
-       if (max_footprint) {
-               pages = (get_task_phys_footprint_recent_max(task) / PAGE_SIZE_64);
-               assert(((uint32_t)pages) == pages);
-               *max_footprint = (uint32_t)pages;
-       }
        if (max_footprint_lifetime) {
                pages = (get_task_resident_max(task) / PAGE_SIZE_64);
                assert(((uint32_t)pages) == pages);
@@ -4371,6 +5126,8 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause,
 
        unsigned int i;
 
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+
        if (memorystatus_jetsam_snapshot_count == 0) {
                /*
                 * No active snapshot.
@@ -4411,6 +5168,11 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause,
                        entry->jse_killtime = killtime;
                        entry->jse_gencount = snapshot->js_gencount;
                        entry->jse_idle_delta = p->p_memstat_idle_delta;
+#if CONFIG_FREEZE
+                       entry->jse_thaw_count = p->p_memstat_thaw_count;
+#else /* CONFIG_FREEZE */
+                       entry->jse_thaw_count = 0;
+#endif /* CONFIG_FREEZE */
 
                        /*
                         * If a process has moved between bands since snapshot was
@@ -4430,13 +5192,11 @@ memorystatus_update_jetsam_snapshot_entry_locked(proc_t p, uint32_t kill_cause,
                          */
 
                         uint32_t pages              = 0;
-                        uint32_t max_pages          = 0;
                         uint32_t max_pages_lifetime = 0;
                         uint32_t purgeable_pages    = 0;
 
-                        memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
+                        memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
                         entry->pages              = (uint64_t)pages;
-                        entry->max_pages          = (uint64_t)max_pages;
                         entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
                         entry->purgeable_pages    = (uint64_t)purgeable_pages;
 
@@ -4546,6 +5306,25 @@ void memorystatus_pages_update(unsigned int pages_avail)
                        memorystatus_thread_wake();
                }
        }
+#if CONFIG_FREEZE
+       /*
+        * We can't grab the freezer_mutex here even though that synchronization would be correct to inspect
+        * the # of frozen processes and wake up the freezer thread. The reason is that we can enter this
+        * code with (possibly) the page-queue locks held and preemption disabled, so trying to grab a mutex here
+        * will result in the "mutex with preemption disabled" panic.
+        */
+
+       if (memorystatus_freeze_thread_should_run() == TRUE) {
+               /*
+                * The freezer thread is usually woken up by some user-space call i.e. pid_hibernate(any process).
+                * That trigger isn't invoked often enough and so we are enabling this explicit wakeup here. 
+                */
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       thread_wakeup((event_t)&memorystatus_freeze_wakeup);
+               }
+       }
+#endif /* CONFIG_FREEZE */
+
 #else /* VM_PRESSURE_EVENTS */
 
        boolean_t critical, delta;
@@ -4578,7 +5357,6 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna
        clock_sec_t                     tv_sec;
        clock_usec_t                    tv_usec;
        uint32_t pages = 0;
-       uint32_t max_pages = 0;
        uint32_t max_pages_lifetime = 0;
        uint32_t purgeable_pages = 0;
        uint64_t internal_pages                         = 0;
@@ -4598,9 +5376,8 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna
        strlcpy(&entry->name[0], p->p_name, sizeof(entry->name));
        entry->priority = p->p_memstat_effectivepriority;
 
-       memorystatus_get_task_page_counts(p->task, &pages, &max_pages, &max_pages_lifetime, &purgeable_pages);
+       memorystatus_get_task_page_counts(p->task, &pages, &max_pages_lifetime, &purgeable_pages);
        entry->pages              = (uint64_t)pages;
-       entry->max_pages          = (uint64_t)max_pages;
        entry->max_pages_lifetime = (uint64_t)max_pages_lifetime;
        entry->purgeable_pages    = (uint64_t)purgeable_pages;
 
@@ -4627,8 +5404,8 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna
        entry->fds       = p->p_fd->fd_nfiles;
 
        absolutetime_to_microtime(get_task_cpu_time(p->task), &tv_sec, &tv_usec);
-       entry->cpu_time.tv_sec = tv_sec;
-       entry->cpu_time.tv_usec = tv_usec;
+       entry->cpu_time.tv_sec = (int64_t)tv_sec;
+       entry->cpu_time.tv_usec = (int64_t)tv_usec;
 
        assert(p->p_stats != NULL);
        entry->jse_starttime =  p->p_stats->ps_start;   /* abstime process started */
@@ -4638,6 +5415,12 @@ memorystatus_init_jetsam_snapshot_entry_locked(proc_t p, memorystatus_jetsam_sna
 
        entry->jse_idle_delta = p->p_memstat_idle_delta; /* Most recent timespan spent in idle-band */
 
+#if CONFIG_FREEZE
+       entry->jse_thaw_count = p->p_memstat_thaw_count;
+#else /* CONFIG_FREEZE */
+       entry->jse_thaw_count = 0;
+#endif /* CONFIG_FREEZE */
+
        proc_coalitionids(p, cids);
        entry->jse_coalition_jetsam_id = cids[COALITION_TYPE_JETSAM];
 
@@ -4699,6 +5482,8 @@ memorystatus_init_jetsam_snapshot_locked(memorystatus_jetsam_snapshot_t *od_snap
        memorystatus_jetsam_snapshot_entry_t *snapshot_list = NULL;
        unsigned int snapshot_max = 0;
 
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+
        if (od_snapshot) {
                /*
                 * This is an on_demand snapshot
@@ -4815,7 +5600,7 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool
        pid_t aPid = 0;
        uint32_t aPid_ep = 0;
 
-       uint64_t killtime = 0;
+       uint64_t        killtime = 0;
         clock_sec_t     tv_sec;
         clock_usec_t    tv_usec;
         uint32_t        tv_msec;
@@ -4891,7 +5676,11 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool
                        }
                }
 
-               memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusKilledDiagnostic, killtime);
+               proc_list_lock();
+               /* This diagnostic code is going away soon. Ignore the kMemorystatusInvalid cause here. */
+               memorystatus_update_jetsam_snapshot_entry_locked(p, kMemorystatusInvalid, killtime);
+               proc_list_unlock();
+
                p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
 
                if (p) {
@@ -4901,7 +5690,9 @@ memorystatus_kill_proc(proc_t p, uint32_t cause, os_reason_t jetsam_reason, bool
        } else
 #endif /* CONFIG_JETSAM && (DEVELOPMENT || DEBUG) */
        {
+               proc_list_lock();
                memorystatus_update_jetsam_snapshot_entry_locked(p, cause, killtime);
+               proc_list_unlock();
 
                char kill_reason_string[128];
 
@@ -4977,7 +5768,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause
         *
         * kMemorystatusKilledZoneMapExhaustion
         * AND
-        * kMemorystatusKilledVMThrashing
+        * kMemorystatusKilledVMCompressorSpaceShortage
         *
         * If we are here because of kMemorystatusKilledZoneMapExhaustion, we will consider
         * any and all processes as eligible kill candidates since we need to avoid a panic.
@@ -5047,7 +5838,7 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause
 
 #if CONFIG_FREEZE
                boolean_t skip;
-               boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
+               boolean_t reclaim_proc = !(p->p_memstat_state & P_MEMSTAT_LOCKED);
                if (any || reclaim_proc) {
                        skip = FALSE;
                } else {
@@ -5068,7 +5859,6 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause
                                 */
                                p->p_memstat_state |= P_MEMSTAT_TERMINATED;
 
-                               proc_list_unlock();
                        } else {
                                /*
                                 * We need to restart the search again because
@@ -5095,6 +5885,8 @@ memorystatus_kill_top_process(boolean_t any, boolean_t sort_flag, uint32_t cause
                                new_snapshot = TRUE;
                        }
 
+                       proc_list_unlock();
+
                        freed_mem = memorystatus_kill_proc(p, cause, jetsam_reason, &killed); /* purged and/or killed 'p' */
                        /* Success? */
                        if (freed_mem) {
@@ -5369,7 +6161,9 @@ exit:
 
        /* Clear snapshot if freshly captured and no target was found */
        if (new_snapshot && (kill_count == 0)) {
+           proc_list_lock();
            memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
+           proc_list_unlock();
        }
        
        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
@@ -5531,13 +6325,12 @@ exit:
  *         false -- no pinned process was jetsammed
  */
 static boolean_t
-memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, int aggr_count, uint32_t *errors)
+memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, unsigned int band, int aggr_count, uint32_t *errors)
 {
        pid_t aPid = 0;
        proc_t p = PROC_NULL, next_p = PROC_NULL;
        boolean_t new_snapshot = FALSE, killed = FALSE;
        int kill_count = 0;
-       unsigned int i = JETSAM_PRIORITY_ELEVATED_INACTIVE;
        uint32_t aPid_ep;
        uint64_t killtime = 0;
         clock_sec_t     tv_sec;
@@ -5548,13 +6341,21 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, in
        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
                memorystatus_available_pages, 0, 0, 0, 0);
 
+#if CONFIG_FREEZE
+       boolean_t consider_frozen_only = FALSE;
+
+       if (band == (unsigned int) memorystatus_freeze_jetsam_band) {
+               consider_frozen_only = TRUE;
+       }
+#endif /* CONFIG_FREEZE */
+
        proc_list_lock();
 
-       next_p = memorystatus_get_first_proc_locked(&i, FALSE);
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
        while (next_p) {
 
                p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&i, p, FALSE);
+               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
 
                aPid = p->p_pid;
                aPid_ep = p->p_memstat_effectivepriority;
@@ -5571,10 +6372,14 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, in
                }
 
 #if CONFIG_FREEZE
+               if (consider_frozen_only && ! (p->p_memstat_state & P_MEMSTAT_FROZEN)) {
+                       continue;
+               }
+
                if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
                        continue;
                }
-#endif
+#endif /* CONFIG_FREEZE */
 
 #if DEVELOPMENT || DEBUG
                MEMORYSTATUS_DEBUG(1, "jetsam: elevated%d process pid %d [%s] - memorystatus_available_pages: %d\n",
@@ -5644,7 +6449,7 @@ memorystatus_kill_elevated_process(uint32_t cause, os_reason_t jetsam_reason, in
                 * And, we hold the proc_list_lock at this point.
                 */
 
-               next_p = memorystatus_get_first_proc_locked(&i, FALSE);
+               next_p = memorystatus_get_first_proc_locked(&band, FALSE);
        }
 
        proc_list_unlock();
@@ -5673,8 +6478,12 @@ memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
         * NOTE: If a new async kill cause is added, make sure to update memorystatus_thread() to
         * add the appropriate exit reason code mapping.
         */
-       if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage && cause != kMemorystatusKilledVMThrashing &&
-                                  cause != kMemorystatusKilledFCThrashing && cause != kMemorystatusKilledZoneMapExhaustion)) {
+       if ((victim_pid != -1) ||
+                       (cause != kMemorystatusKilledVMPageShortage &&
+                       cause != kMemorystatusKilledVMCompressorThrashing &&
+                       cause != kMemorystatusKilledVMCompressorSpaceShortage &&
+                       cause != kMemorystatusKilledFCThrashing &&
+                       cause != kMemorystatusKilledZoneMapExhaustion)) {
                return FALSE;
        }
     
@@ -5684,20 +6493,34 @@ memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
 }
 
 boolean_t
-memorystatus_kill_on_VM_thrashing(boolean_t async) {
+memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async) {
        if (async) {
-               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorSpaceShortage);
        } else {
-               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMTHRASHING);
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE);
                if (jetsam_reason == OS_REASON_NULL) {
-                       printf("memorystatus_kill_on_VM_thrashing -- sync: failed to allocate jetsam reason\n");
+                       printf("memorystatus_kill_on_VM_compressor_space_shortage -- sync: failed to allocate jetsam reason\n");
                }
 
-               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing, jetsam_reason);
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorSpaceShortage, jetsam_reason);
        }
 }
 
 #if CONFIG_JETSAM
+boolean_t
+memorystatus_kill_on_VM_compressor_thrashing(boolean_t async) {
+       if (async) {
+               return memorystatus_kill_process_async(-1, kMemorystatusKilledVMCompressorThrashing);
+       } else {
+               os_reason_t jetsam_reason = os_reason_create(OS_REASON_JETSAM, JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING);
+               if (jetsam_reason == OS_REASON_NULL) {
+                       printf("memorystatus_kill_on_VM_compressor_thrashing -- sync: failed to allocate jetsam reason\n");
+               }
+
+               return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMCompressorThrashing, jetsam_reason);
+       }
+}
+
 boolean_t 
 memorystatus_kill_on_VM_page_shortage(boolean_t async) {
        if (async) {
@@ -5768,18 +6591,89 @@ memorystatus_freeze_init(void)
        freezer_lck_grp = lck_grp_alloc_init("freezer", freezer_lck_grp_attr);
 
        lck_mtx_init(&freezer_mutex, freezer_lck_grp, NULL);
-               
+
+       /*
+        * This is just the default value if the underlying
+        * storage device doesn't have any specific budget.
+        * We check with the storage layer in memorystatus_freeze_update_throttle()
+        * before we start our freezing the first time.
+        * before we start freezing for the first time.
+       memorystatus_freeze_budget_pages_remaining = (memorystatus_freeze_daily_mb_max * 1024 * 1024) / PAGE_SIZE;
+
        result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
        if (result == KERN_SUCCESS) {
+
+               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_IO, THROTTLE_LEVEL_COMPRESSOR_TIER2);
+               proc_set_thread_policy(thread, TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+               thread_set_thread_name(thread, "VM_freezer");
+
                thread_deallocate(thread);
        } else {
                panic("Could not create memorystatus_freeze_thread");
        }
 }
 
+static boolean_t
+memorystatus_is_process_eligible_for_freeze(proc_t p)
+{
+       /*
+        * Called with proc_list_lock held.
+        */
+
+       LCK_MTX_ASSERT(proc_list_mlock, LCK_MTX_ASSERT_OWNED);
+
+       boolean_t should_freeze = FALSE;
+       uint32_t state = 0, entry_count = 0, pages = 0, i = 0;
+       int probability_of_use = 0;
+
+       if (isApp(p) == FALSE) {
+               goto out;
+       }
+
+       state = p->p_memstat_state;
+
+       if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FREEZE_DISABLED | P_MEMSTAT_FREEZE_IGNORE)) ||
+                       !(state & P_MEMSTAT_SUSPENDED)) {
+               goto out;
+       }
+
+       /* Only freeze processes meeting our minimum resident page criteria */
+       memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL);
+       if (pages < memorystatus_freeze_pages_min) {
+               goto out;
+       }
+
+       entry_count = (memorystatus_global_probabilities_size / sizeof(memorystatus_internal_probabilities_t));
+
+       if (entry_count) {
+
+               for (i=0; i < entry_count; i++ ) {
+                       if (strncmp(memorystatus_global_probabilities_table[i].proc_name,
+                                   p->p_name,
+                                   MAXCOMLEN + 1) == 0) {
+
+                               probability_of_use = memorystatus_global_probabilities_table[i].use_probability;
+                               break;
+                       }
+               }
+
+               if (probability_of_use == 0) {
+                       goto out;
+               }
+       }
+
+       should_freeze = TRUE;
+out:
+       return should_freeze;
+}
+
 /*
  * Synchronously freeze the passed proc. Called with a reference to the proc held.
  *
+ * Doesn't deal with re-freezing because this is called on a specific process and
+ * not by the freezer thread. If that changes, we'll have to teach it about
+ * refreezing a frozen process.
+ *
  * Returns EINVAL or the value returned by task_freeze().
  */
 int
@@ -5788,69 +6682,49 @@ memorystatus_freeze_process_sync(proc_t p)
        int ret = EINVAL;
        pid_t aPid = 0;
        boolean_t memorystatus_freeze_swap_low = FALSE;
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
-               memorystatus_available_pages, 0, 0, 0, 0);
+       int     freezer_error_code = 0;
 
        lck_mtx_lock(&freezer_mutex);
 
        if (p == NULL) {
+               printf("memorystatus_freeze_process_sync: Invalid process\n");
                goto exit;
        }
 
        if (memorystatus_freeze_enabled == FALSE) {
+               printf("memorystatus_freeze_process_sync: Freezing is DISABLED\n");
                goto exit;
        }
 
        if (!memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
+               printf("memorystatus_freeze_process_sync: Low compressor and/or low swap space...skipping freeze\n");
                goto exit;
        }
 
-       if (memorystatus_freeze_update_throttle()) {
-               printf("memorystatus_freeze_process_sync: in throttle, ignorning freeze\n");
-               memorystatus_freeze_throttle_count++;
+       memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+       if (!memorystatus_freeze_budget_pages_remaining) {
+               printf("memorystatus_freeze_process_sync: exit with NO available budget\n");
                goto exit;
        }
 
        proc_list_lock();
 
        if (p != NULL) {
-               uint32_t purgeable, wired, clean, dirty, state;
-               uint32_t max_pages, pages, i;
-               boolean_t shared;
+               uint32_t purgeable, wired, clean, dirty, shared;
+               uint32_t max_pages, i;
 
                aPid = p->p_pid;
-               state = p->p_memstat_state;
 
                /* Ensure the process is eligible for freezing */
-               if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
-                       proc_list_unlock();
-                       goto exit;
-               }
-
-               /* Only freeze processes meeting our minimum resident page criteria */
-               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
-               if (pages < memorystatus_freeze_pages_min) {
+               if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
                        proc_list_unlock();
                        goto exit;
                }
 
                if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
 
-                       unsigned int avail_swap_space = 0; /* in pages. */
-
-                       /*
-                        * Freezer backed by the compressor and swap file(s)
-                        * while will hold compressed data.
-                        */
-                       avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
-
-                       max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
+                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
 
-                       if (max_pages < memorystatus_freeze_pages_min) {
-                               proc_list_unlock();
-                               goto exit;
-                       }
                } else {
                        /*
                         * We only have the compressor without any swap.
@@ -5862,7 +6736,13 @@ memorystatus_freeze_process_sync(proc_t p)
                p->p_memstat_state |= P_MEMSTAT_LOCKED;
                proc_list_unlock();
 
-               ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
+                       memorystatus_available_pages, 0, 0, 0, 0);
+
+               ret = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
+                       memorystatus_available_pages, aPid, 0, 0, 0);
 
                DTRACE_MEMORYSTATUS6(memorystatus_freeze, proc_t, p, unsigned int, memorystatus_available_pages, boolean_t, purgeable, unsigned int, wired, uint32_t, clean, uint32_t, dirty);
 
@@ -5872,99 +6752,188 @@ memorystatus_freeze_process_sync(proc_t p)
                                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
 
                proc_list_lock();
-               p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
 
                if (ret == KERN_SUCCESS) {
+
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...done",
+                               aPid, (*p->p_name ? p->p_name : "unknown"));
+
                        memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
 
-                       memorystatus_frozen_count++;
+                       p->p_memstat_freeze_sharedanon_pages += shared;
 
-                       p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
+                       memorystatus_frozen_shared_mb += shared;
 
-                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-                               /* Update stats */
-                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
-                                       throttle_intervals[i].pageouts += dirty;
-                               }
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
                        }
 
-                       memorystatus_freeze_pageouts += dirty;
-                       memorystatus_freeze_count++;
+                       p->p_memstat_frozen_count++;
 
+                       /*
+                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
+                        * to its higher jetsam band.
+                        */
                        proc_list_unlock();
 
                        memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+
+                       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+
+                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE,
+                                               memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (ret) {
+                                       printf("Elevating the frozen process failed with %d\n", ret);
+                                       /* not fatal */
+                                       ret = 0;
+                               }
+
+                               proc_list_lock();
+
+                               /* Update stats */
+                               for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
+                                       throttle_intervals[i].pageouts += dirty;
+                               }
+                       } else {
+                               proc_list_lock();
+                       }
+
+                       memorystatus_freeze_pageouts += dirty;
+
+                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
+                               /*
+                                * Add some eviction logic here? At some point should we
+                                * jetsam a process to get back its swap space so that we
+                                * can freeze a more eligible process at this moment in time?
+                                */
+                       }
                } else {
-                       proc_list_unlock();
+                       char reason[128];
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", 128);
+                       }
+
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (specific) pid %d [%s]...skipped (%s)",
+                               aPid, (*p->p_name ? p->p_name : "unknown"), reason);
+                       p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
                }
+
+               p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+               proc_list_unlock();
        }
 
 exit:
        lck_mtx_unlock(&freezer_mutex);
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
-               memorystatus_available_pages, aPid, 0, 0, 0);
 
        return ret;
 }
 
 static int
-memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
+memorystatus_freeze_top_process(void)
 {
        pid_t aPid = 0;
        int ret = -1;
        proc_t p = PROC_NULL, next_p = PROC_NULL;
        unsigned int i = 0;
-
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
-               memorystatus_available_pages, 0, 0, 0, 0);
+       unsigned int band = JETSAM_PRIORITY_IDLE;
+       boolean_t refreeze_processes = FALSE;
 
        proc_list_lock();
-       
-       next_p = memorystatus_get_first_proc_locked(&i, TRUE);
+
+       if (memorystatus_frozen_count >= memorystatus_frozen_processes_max) {
+               /*
+                * The freezer is already full, but since we are here, let's
+                * try to refreeze any processes we might have thawed
+                * in the past and push their compressed state out.
+                */
+               refreeze_processes = TRUE;
+               band = (unsigned int) memorystatus_freeze_jetsam_band;
+       }
+
+ freeze_process:
+
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
        while (next_p) {
                kern_return_t kr;
-               uint32_t purgeable, wired, clean, dirty;
-               boolean_t shared;
-               uint32_t pages;
+               uint32_t purgeable, wired, clean, dirty, shared;
                uint32_t max_pages = 0;
-               uint32_t state;
+               int     freezer_error_code = 0;
                
                p = next_p;
-               next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
+               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
 
                aPid = p->p_pid;
-               state = p->p_memstat_state;
 
-               /* Ensure the process is eligible for freezing */
-               if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
-                       continue; // with lock held
+               if (p->p_memstat_effectivepriority != (int32_t) band) {
+                       /*
+                        * We shouldn't be freezing processes outside the
+                        * prescribed band.
+                        */
+                       break;
                }
-                                       
-               /* Only freeze processes meeting our minimum resident page criteria */
-               memorystatus_get_task_page_counts(p->task, &pages, NULL, NULL, NULL);
-               if (pages < memorystatus_freeze_pages_min) {
-                       continue; // with lock held
-               } 
 
-               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+               /* Ensure the process is eligible for (re-)freezing */
+               if (refreeze_processes) {
+                       /*
+                        * Has to have been frozen once before.
+                        */
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                               continue;
+                       }
+
+                       /*
+                        * Has to have been resumed once before.
+                        */
+                       if ((p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) == FALSE) {
+                               continue;
+                       }
+
+                       /*
+                        * Not currently being looked at for something.
+                        */
+                       if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+                               continue;
+                       }
 
-                       /* Ensure there's enough free space to freeze this process. */
+                       /*
+                        * We are going to try and refreeze and so re-evaluate
+                        * the process. We don't want to double count the shared
+                        * memory. So deduct the old snapshot here.
+                        */
+                       memorystatus_frozen_shared_mb -= p->p_memstat_freeze_sharedanon_pages;
+                       p->p_memstat_freeze_sharedanon_pages = 0;
 
-                       unsigned int avail_swap_space = 0; /* in pages. */
+                       p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                       memorystatus_refreeze_eligible_count--;
 
+               } else {
+                       if (memorystatus_is_process_eligible_for_freeze(p) == FALSE) {
+                               continue; // with lock held
+                       }
+               }
+
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
                        /*
                         * Freezer backed by the compressor and swap file(s)
-                        * while will hold compressed data.
+                        * will hold compressed data.
                         */
-                       avail_swap_space = vm_swap_get_free_space() / PAGE_SIZE_64;
 
-                       max_pages = MIN(avail_swap_space, memorystatus_freeze_pages_max);
+                       max_pages = MIN(memorystatus_freeze_pages_max, memorystatus_freeze_budget_pages_remaining);
 
-                       if (max_pages < memorystatus_freeze_pages_min) {
-                               *memorystatus_freeze_swap_low = TRUE;
-                               proc_list_unlock();
-                               goto exit;
-                       }
                } else {
                        /*
                         * We only have the compressor pool.
@@ -5976,60 +6945,174 @@ memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
                p->p_memstat_state |= P_MEMSTAT_LOCKED;
 
                p = proc_ref_locked(p);
-               proc_list_unlock();        
                if (!p) {
-                       goto exit;
+                       break;
                }
-        
-               kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
+
+               proc_list_unlock();
+
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
+                       memorystatus_available_pages, 0, 0, 0, 0);
+
+               kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, &freezer_error_code, FALSE /* eval only */);
                
+               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
+                       memorystatus_available_pages, aPid, 0, 0, 0);
+
                MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
                        "memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, max_pages %d, shared %d\n", 
                                   (kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (*p->p_name ? p->p_name : "(unknown)"),
                                   memorystatus_available_pages, purgeable, wired, clean, dirty, max_pages, shared);
      
                proc_list_lock();
-               p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
                
                /* Success? */
                if (KERN_SUCCESS == kr) {
+
+                       if (refreeze_processes) {
+                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: Refreezing (general) pid %d [%s]...done",
+                               aPid, (*p->p_name ? p->p_name : "unknown"));
+                       } else {
+                               os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...done",
+                               aPid, (*p->p_name ? p->p_name : "unknown"));
+                       }
+
                        memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
                        
-                       memorystatus_frozen_count++;
-                       
-                       p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
+                       p->p_memstat_freeze_sharedanon_pages += shared;
+
+                       memorystatus_frozen_shared_mb += shared;
+
+                       if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == 0) {
+                               p->p_memstat_state |= P_MEMSTAT_FROZEN;
+                               memorystatus_frozen_count++;
+                       }
+
+                       p->p_memstat_frozen_count++;
+
+                       /*
+                        * Still keeping the P_MEMSTAT_LOCKED bit till we are actually done elevating this frozen process
+                        * to its higher jetsam band.
+                        */
+                       proc_list_unlock();
+
+                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
 
                        if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+
+                               ret = memorystatus_update_inactive_jetsam_priority_band(p->p_pid, MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE, memorystatus_freeze_jetsam_band, TRUE);
+
+                               if (ret) {
+                                       printf("Elevating the frozen process failed with %d\n", ret);
+                                       /* not fatal */
+                                       ret = 0;
+                               }
+
+                               proc_list_lock();
+
                                /* Update stats */
                                for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
                                        throttle_intervals[i].pageouts += dirty;
                                }
+                       } else {
+                               proc_list_lock();
                        }
 
                        memorystatus_freeze_pageouts += dirty;
-                       memorystatus_freeze_count++;
-
-                       proc_list_unlock();
 
-                       memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
+                       if (memorystatus_frozen_count == (memorystatus_frozen_processes_max - 1)) {
+                               /*
+                                * Add some eviction logic here? At some point should we
+                                * jetsam a process to get back its swap space so that we
+                                * can freeze a more eligible process at this moment in time?
+                                */
+                       }
 
-                       /* Return KERN_SUCESS */
+                       /* Return KERN_SUCCESS */
                        ret = kr;
 
+                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+                       proc_rele_locked(p);
+
+                       /*
+                        * We froze a process successfully. We can stop now
+                        * and see if that helped.
+                        */
+
+                       break;
                } else {
-                       proc_list_unlock();
+
+                       p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
+
+                       if (refreeze_processes == TRUE) {
+                               if ((freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) ||
+                                   (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO)) {
+                                       /*
+                                        * Keeping this prior-frozen process in this high band when
+                                        * we failed to re-freeze it due to bad shared memory usage
+                                        * could cause excessive pressure on the lower bands.
+                                        * We need to demote it for now. It'll get re-evaluated next
+                                        * time because we don't set the P_MEMSTAT_FREEZE_IGNORE
+                                        * bit.
+                                        */
+
+                                       p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+                                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+                                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE);
+                               }
+                       } else {
+                               p->p_memstat_state |= P_MEMSTAT_FREEZE_IGNORE;
+                       }
+
+                       proc_rele_locked(p);
+
+                       char reason[128];
+                       if (freezer_error_code == FREEZER_ERROR_EXCESS_SHARED_MEMORY) {
+                               strlcpy(reason, "too much shared memory", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO) {
+                               strlcpy(reason, "low private-shared pages ratio", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_COMPRESSOR_SPACE) {
+                               strlcpy(reason, "no compressor space", 128);
+                       }
+
+                       if (freezer_error_code == FREEZER_ERROR_NO_SWAP_SPACE) {
+                               strlcpy(reason, "no swap space", 128);
+                       }
+
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: freezing (general) pid %d [%s]...skipped (%s)",
+                               aPid, (*p->p_name ? p->p_name : "unknown"), reason);
+
+                       if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
+                               break;
+                       }
                }
-        
-               proc_rele(p);
-               goto exit;
+       }
+
+       if ((ret == -1) &&
+           (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD) &&
+           (refreeze_processes == FALSE)) {
+               /*
+                * We failed to freeze a process from the IDLE
+                * band AND we have some thawed processes
+                * AND haven't tried refreezing as yet.
+                * Let's try and re-freeze processes in the
+                * frozen band that have been resumed in the past
+                * and so have brought in state from disk.
+                */
+
+               band = (unsigned int) memorystatus_freeze_jetsam_band;
+
+               refreeze_processes = TRUE;
+
+               goto freeze_process;
        }
        
        proc_list_unlock();
        
-exit:
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
-               memorystatus_available_pages, aPid, 0, 0, 0);
-       
        return ret;
 }
 
@@ -6041,22 +7124,8 @@ memorystatus_can_freeze_processes(void)
        proc_list_lock();
        
        if (memorystatus_suspended_count) {
-               uint32_t average_resident_pages, estimated_processes;
-        
-               /* Estimate the number of suspended processes we can fit */
-               average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
-               estimated_processes = memorystatus_suspended_count +
-                       ((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
-
-               /* If it's predicted that no freeze will occur, lower the threshold temporarily */
-               if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
-                       memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
-               } else {
-                       memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
-               }
 
-               MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n", 
-                       memorystatus_suspended_count, average_resident_pages, estimated_processes);
+               memorystatus_freeze_suspended_threshold = MIN(memorystatus_freeze_suspended_threshold, FREEZE_SUSPENDED_THRESHOLD_DEFAULT);
        
                if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
                        ret = TRUE;
@@ -6126,67 +7195,254 @@ memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
        return can_freeze;
 }
 
+/*
+ * This function evaluates if the currently frozen processes deserve
+ * to stay in the higher jetsam band. If the # of thaws of a process
+ * is below our threshold, then we will demote that process into the IDLE
+ * band and put it at the head. We don't immediately kill the process here
+ * because it already has state on disk and so it might be worth giving
+ * it another shot at getting thawed/resumed and used.
+ */
 static void
-memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
+memorystatus_demote_frozen_processes(void)
 {
-       unsigned int freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
-       if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
-               if (!interval->max_pageouts) {
-                       interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / (24 * 60)));
-               } else {
-                       printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
+       unsigned int band = (unsigned int) memorystatus_freeze_jetsam_band;
+       unsigned int demoted_proc_count = 0;
+       proc_t p = PROC_NULL, next_p = PROC_NULL;
+
+       proc_list_lock();
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               /*
+                * Freeze has been disabled likely to
+                * reclaim swap space. So don't change
+                * any state on the frozen processes.
+                */
+               proc_list_unlock();
+               return;
+       }
+
+       next_p = memorystatus_get_first_proc_locked(&band, FALSE);
+       while (next_p) {
+
+               p = next_p;
+               next_p = memorystatus_get_next_proc_locked(&band, p, FALSE);
+
+               if ((p->p_memstat_state & P_MEMSTAT_FROZEN) == FALSE) {
+                       continue;
                }
-               interval->ts.tv_sec = interval->mins * 60;
-               interval->ts.tv_nsec = 0;
-               ADD_MACH_TIMESPEC(&interval->ts, ts);
-               /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
-               if (interval->pageouts > interval->max_pageouts) {
-                       interval->pageouts -= interval->max_pageouts;
-               } else {
-                       interval->pageouts = 0;
+
+               if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
+                       continue;
                }
-               interval->throttle = FALSE;
-       } else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
-               printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
-               interval->throttle = TRUE;
-       }       
 
-       MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n", 
-               interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60, 
-               interval->throttle ? "on" : "off");
+               if (p->p_memstat_thaw_count < memorystatus_thaw_count_demotion_threshold) {
+                       p->p_memstat_state &= ~P_MEMSTAT_USE_ELEVATED_INACTIVE_BAND;
+                       memorystatus_invalidate_idle_demotion_locked(p, TRUE);
+
+                       memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE, TRUE, TRUE);
+#if DEVELOPMENT || DEBUG
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus_demote_frozen_process pid %d [%s]",
+                                  p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+#endif /* DEVELOPMENT || DEBUG */
+
+                       /*
+                        * The freezer thread will consider this a normal app to be frozen
+                        * because it is in the IDLE band. So we don't need the
+                        * P_MEMSTAT_REFREEZE_ELIGIBLE state here. Also, if it gets resumed
+                        * we'll correctly count it as eligible for re-freeze again.
+                        *
+                        * We don't drop the frozen count because this process still has
+                        * state on disk. So there's a chance it gets resumed and then it
+                        * should land in the higher jetsam band. For that it needs to
+                        * remain marked frozen.
+                        */
+                       if (p->p_memstat_state & P_MEMSTAT_REFREEZE_ELIGIBLE) {
+                               p->p_memstat_state &= ~P_MEMSTAT_REFREEZE_ELIGIBLE;
+                               memorystatus_refreeze_eligible_count--;
+                       }
+
+                       demoted_proc_count++;
+               }
+
+               if (demoted_proc_count == memorystatus_max_frozen_demotions_daily) {
+                       break;
+               }
+       }
+
+       memorystatus_thaw_count = 0;
+       proc_list_unlock();
 }
 
-static boolean_t
-memorystatus_freeze_update_throttle(void) 
+
+/*
+ * This function will do 4 things:
+ *
+ * 1) check to see if we are currently in a degraded freezer mode, and if so:
+ *     - check to see if our window has expired and we should exit this mode, OR,
+ *     - return a budget based on the degraded throttle window's max. pageouts vs current pageouts.
+ *
+ * 2) check to see if we are in a NEW normal window and update the normal throttle window's params.
+ *
+ * 3) check what the current normal window allows for a budget.
+ *
+ * 4) calculate the current rate of pageouts for DEGRADED_WINDOW_MINS duration. If that rate is below
+ *    what we would normally expect, then we are running low on our daily budget and need to enter
+ *    degraded perf. mode.
+ */
+
+static void
+memorystatus_freeze_update_throttle(uint64_t *budget_pages_allowed)
 {
        clock_sec_t sec;
        clock_nsec_t nsec;
        mach_timespec_t ts;
-       uint32_t i;
-       boolean_t throttled = FALSE;
+
+       unsigned int freeze_daily_pageouts_max = 0;
 
 #if DEVELOPMENT || DEBUG
-       if (!memorystatus_freeze_throttle_enabled)
-               return FALSE;
+       if (!memorystatus_freeze_throttle_enabled) {
+               /*
+                * No throttling... we can use the full budget every time.
+                */
+               *budget_pages_allowed = UINT64_MAX;
+               return;
+       }
 #endif
 
        clock_get_system_nanotime(&sec, &nsec);
        ts.tv_sec = sec;
        ts.tv_nsec = nsec;
-       
-       /* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
-        *
-        * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
-        * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
-        * order to allow for bursts of activity.
-        */
-       for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
-               memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
-               if (throttle_intervals[i].throttle == TRUE)
-                       throttled = TRUE;
-       }                                                               
 
-       return throttled;
+       struct throttle_interval_t *interval = NULL;
+
+       if (memorystatus_freeze_degradation == TRUE) {
+
+               interval = degraded_throttle_window;
+
+               if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
+                       memorystatus_freeze_degradation = FALSE;
+                       interval->pageouts = 0;
+                       interval->max_pageouts = 0;
+
+               } else {
+                       *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
+               }
+       }
+
+       interval = normal_throttle_window;
+
+       if (CMP_MACH_TIMESPEC(&ts, &interval->ts) >= 0) {
+               /*
+                * New throttle window.
+                * Rollover any unused budget.
+                * Also ask the storage layer what the new budget needs to be.
+                */
+               uint64_t freeze_daily_budget = 0;
+               unsigned int daily_budget_pageouts = 0;
+
+               if (vm_swap_max_budget(&freeze_daily_budget)) {
+                       memorystatus_freeze_daily_mb_max = (freeze_daily_budget / (1024 * 1024));
+                       os_log_with_startup_serial(OS_LOG_DEFAULT, "memorystatus: memorystatus_freeze_daily_mb_max set to %dMB\n", memorystatus_freeze_daily_mb_max);
+               }
+
+               freeze_daily_pageouts_max = memorystatus_freeze_daily_mb_max * (1024 * 1024 / PAGE_SIZE);
+
+               daily_budget_pageouts =  (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
+               interval->max_pageouts = (interval->max_pageouts - interval->pageouts) + daily_budget_pageouts;
+
+               interval->ts.tv_sec = interval->mins * 60;
+               interval->ts.tv_nsec = 0;
+               ADD_MACH_TIMESPEC(&interval->ts, &ts);
+               /* Since we update the throttle stats pre-freeze, adjust for overshoot here */
+               if (interval->pageouts > interval->max_pageouts) {
+                       interval->pageouts -= interval->max_pageouts;
+               } else {
+                       interval->pageouts = 0;
+               }
+               *budget_pages_allowed = interval->max_pageouts;
+
+               memorystatus_demote_frozen_processes();
+
+       } else {
+               /*
+                * Current throttle window.
+                * Deny freezing if we have no budget left.
+                * Try graceful degradation if:
+                * - the budget left is within 25% of the daily budget, and
+                * - the pageout rate that remaining budget allows is below our normal expectations.
+                */
+
+#if DEVELOPMENT || DEBUG
+               /*
+                * This can only happen in the INTERNAL configs because we allow modifying the daily budget for testing.
+                */
+
+               if (freeze_daily_pageouts_max > interval->max_pageouts) {
+                       /*
+                        * We just bumped the daily budget. Re-evaluate our normal window params.
+                        */
+                       interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * freeze_daily_pageouts_max) / NORMAL_WINDOW_MINS));
+                       memorystatus_freeze_degradation = FALSE; //we'll re-evaluate this below...
+               }
+#endif /* DEVELOPMENT || DEBUG */
+
+               if (memorystatus_freeze_degradation == FALSE) {
+
+                       if (interval->pageouts >= interval->max_pageouts) {
+
+                               *budget_pages_allowed = 0;
+
+                       } else {
+
+                               int budget_left = interval->max_pageouts - interval->pageouts;
+                               int budget_threshold = (freeze_daily_pageouts_max * FREEZE_DEGRADATION_BUDGET_THRESHOLD) / 100;
+
+                               mach_timespec_t time_left = {0,0};
+
+                               time_left.tv_sec = interval->ts.tv_sec;
+                               time_left.tv_nsec = 0;
+
+                               SUB_MACH_TIMESPEC(&time_left, &ts);
+
+                               if (budget_left <= budget_threshold) {
+
+                                       /*
+                                        * For the current normal window, calculate how much we would pageout in a DEGRADED_WINDOW_MINS duration.
+                                        * And also calculate what we would pageout for the same DEGRADED_WINDOW_MINS duration if we had the full
+                                        * daily pageout budget.
+                                        */
+
+                                       unsigned int current_budget_rate_allowed = ((budget_left / time_left.tv_sec) / 60) * DEGRADED_WINDOW_MINS;
+                                       unsigned int normal_budget_rate_allowed = (freeze_daily_pageouts_max / NORMAL_WINDOW_MINS) * DEGRADED_WINDOW_MINS;
+
+                                       /*
+                                        * The rate of pageouts the remaining budget allows is below the
+                                        * normal rate, i.e. we have less than our normal budget left, so
+                                        * we enter degraded mode.
+                                        */
+
+                                       if (current_budget_rate_allowed < normal_budget_rate_allowed) {
+
+                                               memorystatus_freeze_degradation = TRUE;
+                                               degraded_throttle_window->max_pageouts = current_budget_rate_allowed;
+                                               degraded_throttle_window->pageouts = 0;
+
+                                               /*
+                                                * Switch over to the degraded throttle window so the budget
+                                                * doled out is based on that window.
+                                                */
+                                               interval = degraded_throttle_window;
+                                       }
+                               }
+
+                               *budget_pages_allowed = interval->max_pageouts - interval->pageouts;
+                       }
+               }
+       }
+
+       MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
+               interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts.tv_sec) / 60,
+               interval->throttle ? "on" : "off");
 }
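To make the budget arithmetic above concrete, here is a small worked sketch. The 1 GB daily budget, 16 KB page size, burst_multiple of 1 and NORMAL_WINDOW_MINS of 24 * 60 are assumed example values, not taken from this change:

    /* daily MB budget converted to a pageout budget */
    unsigned int freeze_daily_pageouts_max = 1024 * (1024 * 1024 / 16384);   /* 65536 pages */
    /* full normal-window budget: burst_multiple * ((mins * daily max) / NORMAL_WINDOW_MINS) */
    unsigned int daily_budget_pageouts = 1 * ((1440ULL * 65536ULL) / 1440);  /* 65536 pages */
    /* degradation is only considered once <= 25% of the daily budget remains */
    unsigned int budget_threshold = (65536 * 25) / 100;                      /* 16384 pages */

Unused pages from the previous window are rolled into the new max_pageouts, so a quiet window effectively enlarges the next one's budget.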
 
 static void
@@ -6195,23 +7451,77 @@ memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
        static boolean_t memorystatus_freeze_swap_low = FALSE;
 
        lck_mtx_lock(&freezer_mutex);
+
        if (memorystatus_freeze_enabled) {
-               if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
-                       /* Only freeze if we've not exceeded our pageout budgets.*/
-                       if (!memorystatus_freeze_update_throttle()) {
-                               memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
-                       } else {
-                               printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
-                               memorystatus_freeze_throttle_count++; /* Throttled, update stats */
+
+               if ((memorystatus_frozen_count < memorystatus_frozen_processes_max) ||
+                   (memorystatus_refreeze_eligible_count >= MIN_THAW_REFREEZE_THRESHOLD)) {
+
+                       if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
+
+                               /* Only freeze if we've not exceeded our pageout budgets.*/
+                               memorystatus_freeze_update_throttle(&memorystatus_freeze_budget_pages_remaining);
+
+                               if (memorystatus_freeze_budget_pages_remaining) {
+                                       memorystatus_freeze_top_process();
+                               }
                        }
                }
        }
-       lck_mtx_unlock(&freezer_mutex);
+
+       /*
+        * We use memorystatus_apps_idle_delay_time because if/when we adopt aging for applications,
+        * it'll tie neatly into running the freezer once we age an application.
+        *
+        * Till then, it serves as a good interval that can be tuned via a sysctl too.
+        */
+       memorystatus_freezer_thread_next_run_ts = mach_absolute_time() + memorystatus_apps_idle_delay_time;
 
        assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
+       lck_mtx_unlock(&freezer_mutex);
+
        thread_block((thread_continue_t) memorystatus_freeze_thread);   
 }
 
+static boolean_t
+memorystatus_freeze_thread_should_run(void)
+{
+       /*
+        * No freezer_mutex held here...see why near call-site
+        * within memorystatus_pages_update().
+        */
+
+       boolean_t should_run = FALSE;
+
+       if (memorystatus_freeze_enabled == FALSE) {
+               goto out;
+       }
+
+       if (memorystatus_available_pages > memorystatus_freeze_threshold) {
+               goto out;
+       }
+
+       if ((memorystatus_frozen_count >= memorystatus_frozen_processes_max) &&
+           (memorystatus_refreeze_eligible_count < MIN_THAW_REFREEZE_THRESHOLD)) {
+               goto out;
+       }
+
+       if (memorystatus_frozen_shared_mb_max && (memorystatus_frozen_shared_mb >= memorystatus_frozen_shared_mb_max)) {
+               goto out;
+       }
+
+       uint64_t curr_time = mach_absolute_time();
+
+       if (curr_time < memorystatus_freezer_thread_next_run_ts) {
+               goto out;
+       }
+
+       should_run = TRUE;
+
+out:
+       return should_run;
+}
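The call-site mentioned in the comment above is not part of this hunk; a minimal sketch of how memorystatus_pages_update() would wake the freezer thread once these checks pass (assuming the standard assert_wait()/thread_wakeup() pairing used by memorystatus_freeze_thread() above):

    if (memorystatus_freeze_thread_should_run() == TRUE) {
            /* wake the continuation blocked on memorystatus_freeze_wakeup */
            thread_wakeup((event_t)&memorystatus_freeze_wakeup);
    }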
+
 static int
 sysctl_memorystatus_do_fastwake_warmup_all  SYSCTL_HANDLER_ARGS
 {
@@ -6494,7 +7804,17 @@ memorystatus_bg_pressure_eligible(proc_t p) {
        if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
                 eligible = TRUE;
        }
-        
+
+       if (p->p_memstat_effectivepriority < JETSAM_PRIORITY_BACKGROUND_OPPORTUNISTIC) {
+               /*
+                * IDLE and IDLE_DEFERRED bands contain processes
+                * that have dropped memory to be under their inactive
+                * memory limits. And so they can't really give back
+                * anything.
+                */
+               eligible = FALSE;
+       }
+
        proc_list_unlock();
        
        return eligible;
@@ -6510,14 +7830,18 @@ memorystatus_is_foreground_locked(proc_t p) {
  * This is meant for stackshot and kperf -- it does not take the proc_list_lock
  * to access the p_memstat_dirty field.
  */
-boolean_t
-memorystatus_proc_is_dirty_unsafe(void *v)
+void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit)
 {
        if (!v) {
-               return FALSE;
+               *is_dirty = FALSE; 
+               *is_dirty_tracked = FALSE; 
+               *allow_idle_exit = FALSE;
+       } else {
+               proc_t p = (proc_t)v;
+               *is_dirty = (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
+               *is_dirty_tracked = (p->p_memstat_dirty & P_DIRTY_TRACK) != 0;
+               *allow_idle_exit = (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) != 0;
        }
-       proc_t p = (proc_t)v;
-       return (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) != 0;
 }
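With the new signature, a lockless consumer such as stackshot reads all three flags in one call; a sketch:

    boolean_t is_dirty = FALSE, is_dirty_tracked = FALSE, allow_idle_exit = FALSE;

    /* p is a proc_t captured by the caller; no proc_list_lock is taken here */
    memorystatus_proc_flags_unsafe((void *)p, &is_dirty, &is_dirty_tracked, &allow_idle_exit);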
 
 #endif /* CONFIG_MEMORYSTATUS */
@@ -6539,14 +7863,6 @@ vm_pressure_level_t      memorystatus_manual_testing_level = kVMPressureNormal;
 extern struct knote *
 vm_pressure_select_optimal_candidate_to_notify(struct klist *, int, boolean_t);
 
-/*
- * This value is the threshold that a process must meet to be considered for scavenging.
- */
-#if CONFIG_EMBEDDED
-#define VM_PRESSURE_MINIMUM_RSIZE              1       /* MB */
-#else /* CONFIG_EMBEDDED */
-#define VM_PRESSURE_MINIMUM_RSIZE              10      /* MB */
-#endif /* CONFIG_EMBEDDED */
 
 #define VM_PRESSURE_NOTIFY_WAIT_PERIOD         10000   /* milliseconds */
 
@@ -6788,7 +8104,7 @@ vm_pressure_select_optimal_candidate_to_notify(struct klist *candidate_list, int
                  */
                 resident_size = (get_task_phys_footprint(t))/(1024*1024ULL);  /* MB */
 
-                if (resident_size >= VM_PRESSURE_MINIMUM_RSIZE) {
+                if (resident_size >= vm_pressure_task_footprint_min) {
 
                        if (level > 0) {
                                /*
@@ -7179,8 +8495,6 @@ SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG
 
 #endif /* DEBUG || DEVELOPMENT */
 
-extern int memorystatus_purge_on_warning;
-extern int memorystatus_purge_on_critical;
 
 static int
 sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
@@ -7229,12 +8543,12 @@ sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
        } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
 
                memorystatus_manual_testing_level = kVMPressureWarning;
-               force_purge = memorystatus_purge_on_warning;
+               force_purge = vm_pageout_state.memorystatus_purge_on_warning;
 
        } else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
 
                memorystatus_manual_testing_level = kVMPressureCritical;
-               force_purge = memorystatus_purge_on_critical;
+               force_purge = vm_pageout_state.memorystatus_purge_on_critical;
        }
 
        memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
@@ -7284,14 +8598,13 @@ SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_
     0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
 
 
-extern int memorystatus_purge_on_warning;
-extern int memorystatus_purge_on_urgent;
-extern int memorystatus_purge_on_critical;
-
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
-SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_warning, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_urgent, 0, "");
+SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pageout_state.memorystatus_purge_on_critical, 0, "");
 
+#if DEBUG || DEVELOPMENT
+SYSCTL_UINT(_kern, OID_AUTO, memorystatus_vm_pressure_events_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &vm_pressure_events_enabled, 0, "");
+#endif
 
 #endif /* VM_PRESSURE_EVENTS */
 
@@ -7506,6 +8819,32 @@ memorystatus_update_levels_locked(boolean_t critical_only) {
 #endif
 }
 
+void
+memorystatus_fast_jetsam_override(boolean_t enable_override)
+{
+       /* If fast jetsam is not enabled, simply return */
+       if (!fast_jetsam_enabled)
+               return;
+
+       if (enable_override) {
+               if ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)
+                       return;
+               proc_list_lock();
+               memorystatus_jetsam_policy |= kPolicyMoreFree;
+               memorystatus_thread_pool_max();
+               memorystatus_update_levels_locked(TRUE);
+               proc_list_unlock();
+       } else {
+               if ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0)
+                       return;
+               proc_list_lock();
+               memorystatus_jetsam_policy &= ~kPolicyMoreFree;
+               memorystatus_thread_pool_default();
+               memorystatus_update_levels_locked(TRUE);
+               proc_list_unlock();
+       }
+}
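The sysctl handler below funnels kern.memorystatus_policy_more_free through this helper, so the override can also be flipped from a privileged userspace tool; a sketch using sysctlbyname(3) (root required):

    #include <sys/sysctl.h>

    int enable = 1;   /* 0 restores the default thread pool and memory levels */
    sysctlbyname("kern.memorystatus_policy_more_free", NULL, NULL, &enable, sizeof(enable));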
+
 
 static int
 sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
@@ -7525,27 +8864,12 @@ sysctl_kern_memorystatus_policy_more_free SYSCTL_HANDLER_ARGS
        if (error || !req->newptr)
                return (error);
 
-       if ((more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == kPolicyMoreFree)) ||
-           (!more_free && ((memorystatus_jetsam_policy & kPolicyMoreFree) == 0))) {
-
-               /*
-                * No change in state.
-                */
-               return 0;
-       }
-
-       proc_list_lock();
-
        if (more_free) {
-               memorystatus_jetsam_policy |= kPolicyMoreFree;
+               memorystatus_fast_jetsam_override(true);
        } else {
-               memorystatus_jetsam_policy &= ~kPolicyMoreFree;
+               memorystatus_fast_jetsam_override(false);
        }
 
-       memorystatus_update_levels_locked(TRUE);
-
-       proc_list_unlock();
-
        return 0;
 }
 SYSCTL_PROC(_kern, OID_AUTO, memorystatus_policy_more_free, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
@@ -7587,6 +8911,35 @@ memorystatus_get_at_boot_snapshot(memorystatus_jetsam_snapshot_t **snapshot, siz
        return 0;
 }
 
+/*
+ * Get the previous fully populated snapshot
+ */
+static int
+memorystatus_get_jetsam_snapshot_copy(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
+       size_t input_size = *snapshot_size;
+
+       if (memorystatus_jetsam_snapshot_copy_count > 0) {
+               *snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_copy_count));
+       } else {
+               *snapshot_size = 0;
+       }
+
+       if (size_only) {
+               return 0;
+       }
+
+       if (input_size < *snapshot_size) {
+               return EINVAL;
+       }
+
+       *snapshot = memorystatus_jetsam_snapshot_copy;
+
+       MEMORYSTATUS_DEBUG(7, "memorystatus_get_jetsam_snapshot_copy: returned inputsize (%ld), snapshot_size(%ld), listcount(%ld)\n",
+                                  (long)input_size, (long)*snapshot_size, (long)memorystatus_jetsam_snapshot_copy_count);
+
+       return 0;
+}
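From userspace the new MEMORYSTATUS_SNAPSHOT_COPY flag follows the same size-then-fetch pattern as the other snapshot variants; a sketch, assuming the private memorystatus_control() wrapper and types from the kern_memorystatus.h SPI:

    #include <stdlib.h>

    /* size-only query: pass no buffer, the required size comes back via the size-only path */
    int size = memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
                                    MEMORYSTATUS_SNAPSHOT_COPY, NULL, 0);

    /* fetch the previous fully populated snapshot */
    memorystatus_jetsam_snapshot_t *snap = malloc(size);
    memorystatus_control(MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT, 0,
                         MEMORYSTATUS_SNAPSHOT_COPY, snap, size);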
+
 static int
 memorystatus_get_on_demand_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
        size_t input_size = *snapshot_size;
@@ -7680,17 +9033,16 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b
                is_default_snapshot = TRUE;
                error = memorystatus_get_jetsam_snapshot(&snapshot, &buffer_size, size_only);
        } else {
-               if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
+               if (flags & ~(MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT | MEMORYSTATUS_SNAPSHOT_COPY)) {
                        /*
                         * Unsupported bit set in flag.
                         */
                        return EINVAL;
                }
 
-               if ((flags & (MEMORYSTATUS_SNAPSHOT_ON_DEMAND | MEMORYSTATUS_SNAPSHOT_AT_BOOT)) ==
-                   (MEMORYSTATUS_SNAPSHOT_ON_DEMAND |  MEMORYSTATUS_SNAPSHOT_AT_BOOT)) {
+               if (flags & (flags - 0x1)) {
                        /*
-                        * Can't have both set at the same time.
+                        * Can't have multiple flags set at the same time:
+                        * (flags & (flags - 1)) is non-zero iff more than one bit is set.
                         */
                        return EINVAL;
                }
@@ -7706,6 +9058,8 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b
                } else if (flags & MEMORYSTATUS_SNAPSHOT_AT_BOOT) {
                        is_at_boot_snapshot = TRUE;
                        error = memorystatus_get_at_boot_snapshot(&snapshot, &buffer_size, size_only);
+               } else if (flags & MEMORYSTATUS_SNAPSHOT_COPY) {
+                       error = memorystatus_get_jetsam_snapshot_copy(&snapshot, &buffer_size, size_only);
                } else {
                        /*
                         * Invalid flag setting.
@@ -7726,14 +9080,20 @@ memorystatus_cmd_get_jetsam_snapshot(int32_t flags, user_addr_t buffer, size_t b
         *      clearing the buffer means, free it.
         * If working with the at_boot snapshot
         *      there is nothing to clear or update.
+        * If working with a copy of the snapshot
+        *      there is nothing to clear or update.
         */
        if (!size_only) {
                if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
                        if (is_default_snapshot) {
                                /*
                                 * The jetsam snapshot is never freed, its count is simply reset.
+                                * However, we make a copy for any parties that might be interested
+                                * in the previous fully populated snapshot.
                                 */
                                proc_list_lock();
+                               memcpy(memorystatus_jetsam_snapshot_copy, memorystatus_jetsam_snapshot, memorystatus_jetsam_snapshot_size);
+                               memorystatus_jetsam_snapshot_copy_count = memorystatus_jetsam_snapshot_count;
                                snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
                                memorystatus_jetsam_snapshot_last_timestamp = 0;
                                proc_list_unlock();
@@ -7759,10 +9119,9 @@ out:
 }
 
 /*
- *     Routine:        memorystatus_cmd_grp_set_properties
- *     Purpose:        Update properties for a group of processes.
+ *     Routine:        memorystatus_cmd_grp_set_priorities
+ *     Purpose:        Update priorities for a group of processes.
  *
- *     Supported Properties:
  *     [priority]
  *             Move each process out of its effective priority
  *             band and into a new priority band.
@@ -7794,18 +9153,9 @@ out:
  */
 
 
-/* This internal structure can expand when we add support for more properties */
-typedef        struct memorystatus_internal_properties
-{
-       proc_t proc;
-       int32_t priority;  /* see memorytstatus_priority_entry_t : priority */
-} memorystatus_internal_properties_t;
-       
-
 static int
-memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
-
-#pragma unused (flags)
+memorystatus_cmd_grp_set_priorities(user_addr_t buffer, size_t buffer_size)
+{
 
        /*
         * We only handle setting priority
@@ -7813,10 +9163,15 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu
         */
 
        int error = 0;
-       memorystatus_priority_entry_t *entries = NULL;
+       memorystatus_properties_entry_v1_t *entries = NULL;
        uint32_t entry_count = 0;
 
        /* This will be the ordered proc list */
+       typedef struct memorystatus_internal_properties {
+               proc_t proc;
+               int32_t priority;
+       } memorystatus_internal_properties_t;
+
        memorystatus_internal_properties_t *table = NULL;
        size_t table_size = 0;
        uint32_t table_count = 0;
@@ -7829,24 +9184,34 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu
        proc_t p;
 
        /* Verify inputs */
-       if ((buffer == USER_ADDR_NULL) || (buffer_size == 0) || ((buffer_size % sizeof(memorystatus_priority_entry_t)) != 0)) {
+       if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
                error = EINVAL;
                goto out;
        }
 
-       entry_count = (buffer_size / sizeof(memorystatus_priority_entry_t));
-       if ((entries = (memorystatus_priority_entry_t *)kalloc(buffer_size)) == NULL) {
+       entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
+       if ((entries = (memorystatus_properties_entry_v1_t *)kalloc(buffer_size)) == NULL) {
                error = ENOMEM;
                goto out;
        }
 
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, entry_count, 0, 0, 0, 0);
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, 0, 0, 0);
 
        if ((error = copyin(buffer, entries, buffer_size)) != 0) {
                goto out;
        }
 
        /* Verify sanity of input priorities */
+       if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
+               if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
+                       error = EINVAL;
+                       goto out;
+               }
+       } else {
+               error = EINVAL;
+               goto out;
+       }
        for (i=0; i < entry_count; i++) {
                if (entries[i].priority == -1) {
                        /* Use as shorthand for default priority */
@@ -7938,9 +9303,9 @@ memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t bu
         * then some pids were not found in a jetsam band.
         * harmless but interesting...
         */
-       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, entry_count, table_count, 0, 0, 0);
-       
 out:
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY, entry_count, table_count, 0, 0);
+       
        if (entries)
                kfree(entries, buffer_size);
        if (table)
@@ -7949,6 +9314,123 @@ out:
        return (error);
 }
 
+static int
+memorystatus_cmd_grp_set_probabilities(user_addr_t buffer, size_t buffer_size)
+{
+       int error = 0;
+       memorystatus_properties_entry_v1_t *entries = NULL;
+       uint32_t entry_count = 0, i = 0;
+       memorystatus_internal_probabilities_t *tmp_table_new = NULL, *tmp_table_old = NULL;
+       size_t tmp_table_new_size = 0, tmp_table_old_size = 0;
+
+       /* Verify inputs */
+       if ((buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
+               error = EINVAL;
+               goto out;
+       }
+
+       entry_count = (buffer_size / sizeof(memorystatus_properties_entry_v1_t));
+
+       if ((entries = (memorystatus_properties_entry_v1_t *) kalloc(buffer_size)) == NULL) {
+               error = ENOMEM;
+               goto out;
+       }
+
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_START, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, 0, 0, 0);
+
+       if ((error = copyin(buffer, entries, buffer_size)) != 0) {
+               goto out;
+       }
+
+       if (entries[0].version == MEMORYSTATUS_MPE_VERSION_1) {
+               if ((buffer_size % MEMORYSTATUS_MPE_VERSION_1_SIZE) != 0) {
+                       error = EINVAL;
+                       goto out;
+               }
+       } else {
+               error = EINVAL;
+               goto out;
+       }
+       /* Verify sanity of input priorities */
+       for (i=0; i < entry_count; i++) {
+               /*
+                * 0 - low probability of use.
+                * 1 - high probability of use.
+                *
+                * Keeping this field an int (& not a bool) to allow 
+                * us to experiment with different values/approaches
+                * later on.
+                */
+               if (entries[i].use_probability > 1) {
+                       error = EINVAL;
+                       goto out;
+               }
+       }
+
+       tmp_table_new_size = sizeof(memorystatus_internal_probabilities_t) * entry_count;
+
+       if ( (tmp_table_new = (memorystatus_internal_probabilities_t *) kalloc(tmp_table_new_size)) == NULL) {
+               error = ENOMEM;
+               goto out;
+       }
+       memset(tmp_table_new, 0, tmp_table_new_size);
+
+       proc_list_lock();
+
+       if (memorystatus_global_probabilities_table) {
+               tmp_table_old = memorystatus_global_probabilities_table;
+               tmp_table_old_size = memorystatus_global_probabilities_size;
+       }
+
+       memorystatus_global_probabilities_table = tmp_table_new;
+       memorystatus_global_probabilities_size = tmp_table_new_size;
+       tmp_table_new = NULL;
+
+       for (i=0; i < entry_count; i++ ) {
+               /* Build the table data  */
+               strlcpy(memorystatus_global_probabilities_table[i].proc_name, entries[i].proc_name, MAXCOMLEN + 1);
+               memorystatus_global_probabilities_table[i].use_probability = entries[i].use_probability;
+       }
+
+       proc_list_unlock();
+       
+out:
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_GRP_SET_PROP) | DBG_FUNC_END, MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, entry_count, tmp_table_new_size, 0, 0);
+
+       if (entries) {
+               kfree(entries, buffer_size);
+               entries = NULL;
+       }
+
+       if (tmp_table_old) {
+               kfree(tmp_table_old, tmp_table_old_size);
+               tmp_table_old = NULL;
+       }
+
+       return (error);
+
+}
+
+static int
+memorystatus_cmd_grp_set_properties(int32_t flags, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval)
+{
+       int error = 0;
+
+       if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) == MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY) {
+
+               error = memorystatus_cmd_grp_set_priorities(buffer, buffer_size);
+
+       } else if ((flags & MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) == MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY) {
+
+               error = memorystatus_cmd_grp_set_probabilities(buffer, buffer_size);
+
+       } else {
+               error = EINVAL;
+       }
+
+       return error;
+}
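A client supplying per-process use probabilities would hand the dispatcher above an array of v1 entries together with the matching flag; a sketch (the process names are placeholders, and the exact struct layout lives in the private kern_memorystatus.h):

    memorystatus_properties_entry_v1_t entries[2] = {};

    entries[0].version = MEMORYSTATUS_MPE_VERSION_1;
    strlcpy(entries[0].proc_name, "FrequentlyUsedApp", sizeof(entries[0].proc_name));
    entries[0].use_probability = 1;               /* high probability of re-use */

    entries[1].version = MEMORYSTATUS_MPE_VERSION_1;
    strlcpy(entries[1].proc_name, "RarelyUsedApp", sizeof(entries[1].proc_name));
    entries[1].use_probability = 0;               /* low probability of re-use */

    memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0,
                         MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY,
                         entries, sizeof(entries));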
 
 /*
  * This routine is used to update a process's jetsam priority position and stored user_data.
@@ -8313,9 +9795,131 @@ proc_get_memstat_priority(proc_t p, boolean_t effective_priority)
        return 0;
 }
 
+static int
+memorystatus_get_process_is_managed(pid_t pid, int *is_managed)
+{
+       proc_t p = NULL;
+
+       /* Validate inputs */
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       proc_list_lock();
+       *is_managed = ((p->p_memstat_state & P_MEMSTAT_MANAGED) ? 1 : 0);
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
+
+static int
+memorystatus_set_process_is_managed(pid_t pid, boolean_t set_managed)
+{
+       proc_t p = NULL;
+
+       /* Validate inputs */
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       proc_list_lock();
+       if (set_managed == TRUE) {
+               p->p_memstat_state |= P_MEMSTAT_MANAGED;
+       } else {
+               p->p_memstat_state &= ~P_MEMSTAT_MANAGED;
+       }
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
+
+static int
+memorystatus_get_process_is_freezable(pid_t pid, int *is_freezable)
+{
+       proc_t p = PROC_NULL;
+
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       /*
+        * Only allow this on the current proc for now.
+        * We can check for privileges and allow targeting another process in the future.
+        */
+       if (p != current_proc()) {
+               proc_rele(p);
+               return EPERM;
+       }
+
+       proc_list_lock();
+       *is_freezable = ((p->p_memstat_state & P_MEMSTAT_FREEZE_DISABLED) ? 0 : 1);
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
+
+static int
+memorystatus_set_process_is_freezable(pid_t pid, boolean_t is_freezable)
+{
+       proc_t p = PROC_NULL;
+
+       if (pid == 0) {
+               return EINVAL;
+       }
+
+       p = proc_find(pid);
+       if (!p) {
+               return ESRCH;
+       }
+
+       /*
+        * Only allow this on the current proc for now.
+        * We can check for privileges and allow targeting another process in the future.
+        */
+       if (p != current_proc()) {
+               proc_rele(p);
+               return EPERM;
+       }
+
+       proc_list_lock();
+       if (is_freezable == FALSE) {
+               /* Freeze preference set to FALSE. Set the P_MEMSTAT_FREEZE_DISABLED bit. */
+               p->p_memstat_state |= P_MEMSTAT_FREEZE_DISABLED;
+               printf("memorystatus_set_process_is_freezable: disabling freeze for pid %d [%s]\n",
+                               p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+       } else {
+               p->p_memstat_state &= ~P_MEMSTAT_FREEZE_DISABLED;
+               printf("memorystatus_set_process_is_freezable: enabling freeze for pid %d [%s]\n",
+                               p->p_pid, (*p->p_name ? p->p_name : "unknown"));
+       }
+       proc_rele_locked(p);
+       proc_list_unlock();
+
+       return 0;
+}
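Because the two freezability commands skip the entitlement check in memorystatus_control() below and only operate on the calling process, an app can manage its own freeze preference; a sketch:

    #include <unistd.h>

    /* opt the current process out of freezing (flags == 0 clears freezability) */
    memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);

    /* read it back: 1 == freezable, 0 == freeze disabled */
    int freezable = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);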
+
 int
 memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
        int error = EINVAL;
+       boolean_t skip_auth_check = FALSE;
        os_reason_t jetsam_reason = OS_REASON_NULL;
 
 #if !CONFIG_JETSAM
@@ -8323,8 +9927,13 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
        #pragma unused(jetsam_reason)
 #endif
 
-       /* Need to be root or have entitlement */
-       if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT)) {
+       /* We don't need entitlements if we're setting/querying the freeze preference for a process. Skip the check below. */
+       if (args->command == MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE || args->command == MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE) {
+               skip_auth_check = TRUE;
+       }
+
+       /* Need to be root or have entitlement. */
+       if (!kauth_cred_issuser(kauth_cred_get()) && !IOTaskHasEntitlement(current_task(), MEMORYSTATUS_ENTITLEMENT) && !skip_auth_check) {
                error = EPERM;
                goto out;
        }
@@ -8430,9 +10039,32 @@ memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *
 
        case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE:
        case MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE:
-               error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, args->flags ? TRUE : FALSE);
+               error = memorystatus_update_inactive_jetsam_priority_band(args->pid, args->command, JETSAM_PRIORITY_ELEVATED_INACTIVE, args->flags ? TRUE : FALSE);
+               break;
+       case MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED:
+               error = memorystatus_set_process_is_managed(args->pid, args->flags);
                break;
 
+       case MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED:
+               error = memorystatus_get_process_is_managed(args->pid, ret);
+               break;
+
+       case MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE:
+               error = memorystatus_set_process_is_freezable(args->pid, args->flags ? TRUE : FALSE);
+               break;
+
+       case MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE:
+               error = memorystatus_get_process_is_freezable(args->pid, ret);
+               break;
+
+#if CONFIG_FREEZE
+#if DEVELOPMENT || DEBUG
+       case MEMORYSTATUS_CMD_FREEZER_CONTROL:
+               error = memorystatus_freezer_control(args->flags, args->buffer, args->buffersize, ret);
+               break;
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_FREEZE */
+
        default:
                break;
        }
@@ -8592,9 +10224,6 @@ filt_memorystatustouch(struct knote *kn, struct kevent_internal_s *kev)
        }
 #endif /* !CONFIG_EMBEDDED */
 
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
-
        /*
         * reset the output flags based on a
         * combination of the old events and
@@ -8981,9 +10610,19 @@ int
 memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
 {
 #if !CONFIG_JETSAM
-       if (!p || (!isApp(p)) || (p->p_memstat_state & P_MEMSTAT_INTERNAL)) {
+       if (!p || (!isApp(p)) || (p->p_memstat_state & (P_MEMSTAT_INTERNAL | P_MEMSTAT_MANAGED))) {
                /*
                 * Ineligible processes OR system processes e.g. launchd.
+                *
+                * We also skip processes that have the P_MEMSTAT_MANAGED bit set, i.e.
+                * they're managed by assertiond. These are iOS apps that have been ported
+                * to macOS. assertiond might be in the process of modifying the app's
+                * priority / memory limit - so it might have the proc_list lock, and then try
+                * to take the task lock. Meanwhile we've entered this function with the task lock
+                * held, and we need the proc_list lock below. So we'll deadlock with assertiond.
+                *
+                * It should be fine to read the P_MEMSTAT_MANAGED bit without the proc_list
+                * lock here, since assertiond only sets this bit on process launch.
                 */
                return -1;
        }
@@ -9075,6 +10714,8 @@ memorystatus_update_priority_for_appnap(proc_t p, boolean_t is_appnap)
                p->p_memstat_idle_start = mach_absolute_time();
        }
 
+       KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CHANGE_PRIORITY), p->p_pid, priority, p->p_memstat_effectivepriority, 0, 0);
+
        p->p_memstat_effectivepriority = priority;
 
        proc_list_unlock();
index 35115e557ec8afca31263684985875b3d4c473f8..08cd860d6b6fca319a63df5dcd67e6cd7d10fd99 100644 (file)
@@ -503,7 +503,9 @@ int watchpoint_flag = -1;
 int breakpoint_flag = -1;
 int gNeon = -1;
 int gNeonHpfp = -1;
+int gNeonFp16 = -1;
 int gARMv81Atomics = 0;
+int gARMv8Crc32 = 0;
 
 #if defined (__arm__)
 int arm64_flag = 0;
@@ -517,7 +519,9 @@ SYSCTL_INT(_hw_optional, OID_AUTO, watchpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFL
 SYSCTL_INT(_hw_optional, OID_AUTO, breakpoint, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &breakpoint_flag, 0, "");
 SYSCTL_INT(_hw_optional, OID_AUTO, neon, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeon, 0, "");
 SYSCTL_INT(_hw_optional, OID_AUTO, neon_hpfp, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonHpfp, 0, "");
+SYSCTL_INT(_hw_optional, OID_AUTO, neon_fp16, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gNeonFp16, 0, "");
 SYSCTL_INT(_hw_optional, OID_AUTO, armv8_1_atomics, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv81Atomics, 0, "");
+SYSCTL_INT(_hw_optional, OID_AUTO, armv8_crc32, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, &gARMv8Crc32, 0, "");
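Userspace can probe the two new capability bits through the usual hw.optional path; a sketch using sysctlbyname(3):

    #include <sys/sysctl.h>

    int fp16 = 0, crc32 = 0;
    size_t len = sizeof(fp16);
    sysctlbyname("hw.optional.neon_fp16", &fp16, &len, NULL, 0);
    len = sizeof(crc32);
    sysctlbyname("hw.optional.armv8_crc32", &crc32, &len, NULL, 0);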
 
 /*
  * Without this little ifdef dance, the preprocessor replaces "arm64" with "1",
@@ -600,6 +604,7 @@ sysctl_mib_init(void)
        arm_mvfp_info_t *mvfp_info = arm_mvfp_info();
        gNeon = mvfp_info->neon;
        gNeonHpfp = mvfp_info->neon_hpfp;
+       gNeonFp16 = mvfp_info->neon_fp16;
 
        cacheconfig[0] = ml_get_max_cpus();
        cacheconfig[1] = 1;
index 4103009feae22f3a40f5212ac48aee7906cd29b3..0381325a9fdebd715e8b3eac6fda50fa51c3933c 100644 (file)
@@ -84,6 +84,7 @@
 #include <security/mac_framework.h>
 #endif
 
+
 lck_grp_t * sysctl_lock_group = NULL;
 lck_rw_t * sysctl_geometry_lock = NULL;
 lck_mtx_t * sysctl_unlocked_node_lock = NULL;
@@ -206,6 +207,7 @@ sysctl_register_oid(struct sysctl_oid *new_oidp)
                        new_oidp->oid_number = oidp->oid_number;
        }
 
+
        /*
         * Insert the oid into the parent's list in order.
         */
@@ -263,6 +265,7 @@ sysctl_unregister_oid(struct sysctl_oid *oidp)
                }
        }
 
+
        /*
         * We've removed it from the list at this point, but we don't want
         * to return to the caller until all handler references have drained
@@ -642,7 +645,7 @@ sysctl_sysctl_name(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
        int error = 0;
        struct sysctl_oid *oid;
        struct sysctl_oid_list *lsp = &sysctl__children, *lsp2;
-       char tempbuf[10];
+       char tempbuf[10] = {};
 
        lck_rw_lock_shared(sysctl_geometry_lock);
        while (namelen) {
@@ -834,7 +837,7 @@ sysctl_sysctl_next(__unused struct sysctl_oid *oidp, void *arg1, int arg2,
        int i, j, error;
        struct sysctl_oid *oid;
        struct sysctl_oid_list *lsp = &sysctl__children;
-       int newoid[CTL_MAXNAME];
+       int newoid[CTL_MAXNAME] = {};
 
        lck_rw_lock_shared(sysctl_geometry_lock);
        i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid);
@@ -966,7 +969,7 @@ sysctl_sysctl_name2oid(__unused struct sysctl_oid *oidp, __unused void *arg1,
        __unused int arg2, struct sysctl_req *req)
 {
        char *p;
-       int error, oid[CTL_MAXNAME];
+       int error, oid[CTL_MAXNAME] = {};
        u_int len = 0;          /* set by name2oid() */
 
        if (req->newlen < 1) 
@@ -1327,6 +1330,7 @@ sysctl_root(boolean_t from_kernel, boolean_t string_is_canonical, char *namestri
        int i;
        struct sysctl_oid *oid;
        struct sysctl_oid_list *lsp = &sysctl__children;
+       sysctl_handler_t oid_handler = NULL;
        int error;
        boolean_t unlocked_node_found = FALSE;
        boolean_t namestring_started = FALSE;
@@ -1464,7 +1468,12 @@ found:
            (error = proc_suser(req->p)))
                goto err;
 
-       if (!oid->oid_handler) {
+       /*
+        * sysctl_unregister_oid() may change the handler value, so grab it
+        * under the lock.
+        */
+       oid_handler = oid->oid_handler;
+       if (!oid_handler) {
            error = EINVAL;
                goto err;
        }
@@ -1503,14 +1512,11 @@ found:
                lck_mtx_lock(sysctl_unlocked_node_lock);
        }
 
+
        if ((oid->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
-               i = (oid->oid_handler) (oid,
-                                       name + indx, namelen - indx,
-                                       req);
+               i = oid_handler(oid, name + indx, namelen - indx, req);
        } else {
-               i = (oid->oid_handler) (oid,
-                                       oid->oid_arg1, oid->oid_arg2,
-                                       req);
+               i = oid_handler(oid, oid->oid_arg1, oid->oid_arg2, req);
        }
        error = i;
 
index 915f4c4b78fdce66ae4784354118f0bff6d12331..937f12d5d40895fe74cd8bedca395928a8fa0304 100644 (file)
@@ -293,11 +293,11 @@ ntp_gettime(struct proc *p, struct ntp_gettime_args *uap, __unused int32_t *retv
 }
 
 int
-ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retval)
+ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int32_t *retval)
 {
-       struct timex ntv;
+       struct timex ntv = {};
        long freq;
-       int modes;
+       unsigned int modes;
        int error, ret = 0;
        clock_sec_t sec;
        clock_usec_t microsecs;
@@ -334,7 +334,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv
 
 #if DEVELOPEMNT || DEBUG
        if (g_should_log_clock_adjustments) {
-               os_log(OS_LOG_DEFAULT, "%s:BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n",
+               os_log(OS_LOG_DEFAULT, "%s: BEFORE modes %u offset %ld freq %ld status %d constant %ld time_adjtime %lld\n",
                       __func__, ntv.modes, ntv.offset, ntv.freq, ntv.status, ntv.constant, time_adjtime);
        }
 #endif
@@ -429,8 +429,8 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv
 
 #if DEVELOPEMNT || DEBUG
        if (g_should_log_clock_adjustments) {
-               os_log(OS_LOG_DEFAULT, "%s:AFTER offset %lld freq %lld status %d constant %ld time_adjtime %lld\n",
-                      __func__, time_offset, time_freq, time_status, time_constant, time_adjtime);
+               os_log(OS_LOG_DEFAULT, "%s: AFTER modes %u offset %lld freq %lld status %d constant %ld time_adjtime %lld\n",
+                      __func__, modes, time_offset, time_freq, time_status, time_constant, time_adjtime);
        }
 #endif
 
@@ -441,6 +441,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv
        if (IS_64BIT_PROCESS(p)) {
                struct user64_timex user_ntv = {};
 
+               user_ntv.modes = modes;
                if (time_status & STA_NANO)
                        user_ntv.offset = L_GINT(time_offset);
                else
@@ -465,6 +466,7 @@ ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, __unused int32_t *retv
        else{
                struct user32_timex user_ntv = {};
 
+               user_ntv.modes = modes;
                if (time_status & STA_NANO)
                        user_ntv.offset = L_GINT(time_offset);
                else
index 8e70d80f03f5841633bdbc1a79d3680cef6db3a5..0d7ece73db85640b179cef3a84c02a73bfb11cd5 100644 (file)
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-/*
- * System Overrides syscall implementation
- */
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -52,6 +48,7 @@
 #include <sys/kdebug.h>
 #include <sys/sysproto.h>
 #include <sys/msgbuf.h>
+#include <sys/kern_memorystatus.h>
 
 /* Mutex for global system override state */
 static lck_mtx_t       sys_override_lock;
@@ -59,9 +56,33 @@ static lck_grp_t        *sys_override_mtx_grp;
 static lck_attr_t       *sys_override_mtx_attr;
 static lck_grp_attr_t   *sys_override_mtx_grp_attr;
 
-/* Assertion counts for system properties */
+/* 
+ * Assertion counts for system properties (add new ones for each new mechanism)
+ *
+ * The assertion count management for system overrides is as follows:
+ *
+ * - All assertion counts are protected by the sys_override_lock.
+ *
+ * - Each caller of system_override() increments the assertion count for the 
+ *   mechanism it specified in the flags. The caller then blocks for the 
+ *   timeout specified in the system call. 
+ *
+ * - At the end of the timeout, the caller thread wakes up and decrements the 
+ *   assertion count for the mechanism it originally took an assertion on.
+ *
+ * - If another caller calls system_override() to disable the override 
+ *   for a mechanism, it simply disables the mechanism without changing any 
+ *   assertion counts. That way, the assertion counts are properly balanced. 
+ *
+ * One thing to note is that a SYS_OVERRIDE_DISABLE disables the overrides 
+ * for a mechanism irrespective of how many clients requested that override.
+ * That makes the implementation simpler and avoids keeping a lot of process 
+ * specific state in the kernel.
+ *
+ */
 static int64_t         io_throttle_assert_cnt;
 static int64_t         cpu_throttle_assert_cnt;
+static int64_t         fast_jetsam_assert_cnt;
 
 /* Wait Channel for system override */
 static uint64_t                sys_override_wait;
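
The comment above describes a plain counting discipline: the first requester (0 -> 1) turns an override on, the last expiring timeout (1 -> 0) turns it back off, and a disable request restores default behavior without touching the counts. Below is a minimal standalone sketch of that discipline; the names and the single mechanism are illustrative, not the kernel's.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static int64_t io_assert_cnt;           /* stands in for io_throttle_assert_cnt */
    static bool    io_throttling = true;    /* the system's default behavior        */

    static void override_begin(void) { if (io_assert_cnt++ == 0) io_throttling = false; }
    static void override_end(void)   { assert(io_assert_cnt > 0); if (--io_assert_cnt == 0) io_throttling = true; }
    static void override_abort(void) { if (io_assert_cnt > 0) io_throttling = true; /* counts untouched */ }

    int main(void)
    {
        override_begin();        /* caller A takes the override: 0 -> 1, throttling off  */
        override_begin();        /* caller B takes it too:       1 -> 2                  */
        override_abort();        /* SYS_OVERRIDE_DISABLE: throttling restored early      */
        override_end();          /* A's timeout expires:         2 -> 1                  */
        override_end();          /* B's timeout expires:         1 -> 0, counts balanced */
        assert(io_assert_cnt == 0 && io_throttling);
        return 0;
    }
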
@@ -69,19 +90,13 @@ static uint64_t             sys_override_wait;
 /* Global variable to indicate if system_override is enabled */
 int                    sys_override_enabled;
 
-/* Sysctl definition for sys_override_enabled */
-SYSCTL_INT(_debug, OID_AUTO, sys_override_enabled, CTLFLAG_RW | CTLFLAG_LOCKED, &sys_override_enabled, 0, "");
-
-/* Forward Declarations */
-static void enable_system_override(uint64_t flags);
-static void disable_system_override(uint64_t flags);
+/* Helper routines */
+static void system_override_begin(uint64_t flags);
+static void system_override_end(uint64_t flags);
+static void system_override_abort(uint64_t flags);
+static void system_override_callouts(uint64_t flags, boolean_t enable_override);
 static __attribute__((noinline)) void PROCESS_OVERRIDING_SYSTEM_DEFAULTS(uint64_t timeout);
 
-/***************************** system_override ********************/
-/*
- * int system_override(uint64_t timeout, uint64_t flags);
- */
-
 void
 init_system_override()
 {
@@ -89,7 +104,7 @@ init_system_override()
        sys_override_mtx_grp = lck_grp_alloc_init("system_override", sys_override_mtx_grp_attr);
        sys_override_mtx_attr = lck_attr_alloc_init();
        lck_mtx_init(&sys_override_lock, sys_override_mtx_grp, sys_override_mtx_attr);
-       io_throttle_assert_cnt = cpu_throttle_assert_cnt = 0;
+       io_throttle_assert_cnt = cpu_throttle_assert_cnt = fast_jetsam_assert_cnt = 0;
        sys_override_enabled = 1;
 }
 
@@ -106,37 +121,28 @@ system_override(__unused struct proc *p, struct system_override_args * uap, __un
                goto out;
        }       
 
-       /* Check to see if some flags are specified. */
+       /* Check to see if sane flags are specified. */
        if ((flags & ~SYS_OVERRIDE_FLAGS_MASK) != 0) {
                error = EINVAL;
                goto out;
        }
 
-       if (flags == SYS_OVERRIDE_DISABLE) {
-               
-               printf("Process %s [%d] disabling system_override()\n", current_proc()->p_comm, current_proc()->p_pid);
-
-               lck_mtx_lock(&sys_override_lock);
-               
-               if (io_throttle_assert_cnt > 0)
-                       sys_override_io_throttle(THROTTLE_IO_ENABLE);
-               if (cpu_throttle_assert_cnt > 0)
-                       sys_override_cpu_throttle(CPU_THROTTLE_ENABLE);
-
-               sys_override_enabled = 0;
-                               
-               lck_mtx_unlock(&sys_override_lock);
-
+       /* Make sure that the system override syscall has been initialized */
+       if (!sys_override_enabled) {
+               error = EINVAL;
                goto out;
        }
 
        lck_mtx_lock(&sys_override_lock);
 
-       enable_system_override(flags);
-
-       PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout);
-
-       disable_system_override(flags);
+       if (flags & SYS_OVERRIDE_DISABLE) {
+               flags &= ~SYS_OVERRIDE_DISABLE;
+               system_override_abort(flags);
+       } else {
+               system_override_begin(flags);
+               PROCESS_OVERRIDING_SYSTEM_DEFAULTS(timeout);
+               system_override_end(flags);
+       }
 
        lck_mtx_unlock(&sys_override_lock);
 
@@ -145,62 +151,164 @@ out:
 }
 
 /*
- * Call for enabling global system override.
- * This should be called only with the sys_override_lock held.
+ * Helper routines for enabling/disabling system overrides for various mechanisms.
+ * These routines should be called with the sys_override_lock held. Each subsystem 
+ * which is hooked into the override service provides two routines:
+ * 
+ * - void sys_override_foo_init(void);
+ * Routine to initialize the subsystem or the data needed for the override to work.
+ * This routine is optional and if a subsystem needs it, it should be invoked from 
+ * init_system_override().
+ * 
+ * - void sys_override_foo(boolean_t enable_override);
+ * Routine to enable/disable the override mechanism for that subsystem. A value of 
+ * true indicates that the mechanism should be overridden and the special behavior 
+ * should begin. A false value indicates that the subsystem should return to default 
+ * behavior. This routine is mandatory and should be invoked as part of the helper 
+ * routines if the flags passed in the syscall match the subsystem. Also, this 
+ * routine should preferably be idempotent.
+ */
+
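
As a concrete illustration of the contract above, a hypothetical per-subsystem hook might look like the sketch below. The "foo" subsystem, its flag, and the use of bool instead of boolean_t are invented for illustration; they are not part of this commit.

    #include <stdbool.h>

    static bool foo_overridden = false;     /* illustrative subsystem state */

    /* Hypothetical sys_override_foo(): idempotent enable/disable of the override. */
    void
    sys_override_foo(bool enable_override)
    {
        if (enable_override == foo_overridden)
            return;                         /* repeated calls with the same value are no-ops */
        foo_overridden = enable_override;
        /* ... switch the subsystem between overridden and default behavior here ... */
    }
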
+static void
+system_override_callouts(uint64_t flags, boolean_t enable_override)
+{
+       switch (flags) {
+               case SYS_OVERRIDE_IO_THROTTLE:
+                       if (enable_override) {
+                               KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_START, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       } else {
+                               KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_END, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       }
+                       sys_override_io_throttle(enable_override);
+                       break;
+
+               case SYS_OVERRIDE_CPU_THROTTLE:
+                       if (enable_override) {
+                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_START, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       } else {
+                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_END, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       }
+                       sys_override_cpu_throttle(enable_override);
+                       break;
+
+               case SYS_OVERRIDE_FAST_JETSAM:
+                       if (enable_override) {
+                               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FAST_JETSAM) | DBG_FUNC_START, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       } else {
+                               KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FAST_JETSAM) | DBG_FUNC_END, 
+                                       current_proc()->p_pid, 0, 0, 0, 0);
+                       }
+#if CONFIG_JETSAM
+                       memorystatus_fast_jetsam_override(enable_override);
+#endif /* CONFIG_JETSAM */
+                       break;
+
+               default:
+                       panic("Unknown option to system_override_callouts(): %llu\n", flags);
+       }
+}
+
+/*
+ * system_override_begin(uint64_t flags)
+ *
+ * Routine to start a system override if the assertion count 
+ * transitions from 0->1 for a specified mechanism.
  */
 static void
-enable_system_override(uint64_t flags)
+system_override_begin(uint64_t flags)
 {
+       lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED);
        
        if (flags & SYS_OVERRIDE_IO_THROTTLE) {
-               if ((io_throttle_assert_cnt == 0) && sys_override_enabled) {
-                       /* Disable I/O Throttling */
-                       printf("Process %s [%d] disabling system-wide I/O Throttling\n", current_proc()->p_comm, current_proc()->p_pid);
-                       sys_override_io_throttle(THROTTLE_IO_DISABLE);
+               if (io_throttle_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, true);
                }
-               KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_START, current_proc()->p_pid, 0, 0, 0, 0);
                io_throttle_assert_cnt++;
        }
        
        if (flags & SYS_OVERRIDE_CPU_THROTTLE) {
-               if ((cpu_throttle_assert_cnt == 0) && sys_override_enabled) {
-                       /* Disable CPU Throttling */
-                       printf("Process %s [%d] disabling system-wide CPU Throttling\n", current_proc()->p_comm, current_proc()->p_pid);
-                       sys_override_cpu_throttle(CPU_THROTTLE_DISABLE);
+               if (cpu_throttle_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, true);
                }
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_START, current_proc()->p_pid, 0, 0, 0, 0);
                cpu_throttle_assert_cnt++;
        }
+       
+       if (flags & SYS_OVERRIDE_FAST_JETSAM) {
+               if (fast_jetsam_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, true);
+               }
+               fast_jetsam_assert_cnt++;
+       }
 
 }
 
 /*
- * Call for disabling global system override.
- * This should be called only with the sys_override_lock held.
+ * system_override_end(uint64_t flags)
+ *
+ * Routine to end a system override if the assertion count 
+ * transitions from 1->0 for a specified mechanism.
  */
 static void
-disable_system_override(uint64_t flags)
+system_override_end(uint64_t flags)
 {
 
+       lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED);
+       
        if (flags & SYS_OVERRIDE_IO_THROTTLE) {
                assert(io_throttle_assert_cnt > 0);
                io_throttle_assert_cnt--;
-               KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_THROTTLE, IO_THROTTLE_DISABLE) | DBG_FUNC_END, current_proc()->p_pid, 0, 0, 0, 0);
-               if ((io_throttle_assert_cnt == 0) && sys_override_enabled) {
-                       /* Enable I/O Throttling */
-                       sys_override_io_throttle(THROTTLE_IO_ENABLE);
+               if (io_throttle_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, false);
                }
        }
 
        if (flags & SYS_OVERRIDE_CPU_THROTTLE) {
                assert(cpu_throttle_assert_cnt > 0);
                cpu_throttle_assert_cnt--;
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_CPU_THROTTLE_DISABLE) | DBG_FUNC_END, current_proc()->p_pid, 0, 0, 0, 0);
-               if ((cpu_throttle_assert_cnt == 0) && sys_override_enabled) {
-                       /* Enable CPU Throttling */
-                       sys_override_cpu_throttle(CPU_THROTTLE_ENABLE);
+               if (cpu_throttle_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, false);
                }
        }
+
+       if (flags & SYS_OVERRIDE_FAST_JETSAM) {
+               assert(fast_jetsam_assert_cnt > 0);
+               fast_jetsam_assert_cnt--;
+               if (fast_jetsam_assert_cnt == 0) {
+                       system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, false);
+               }
+       }
+
+}
+
+/*
+ * system_override_abort(uint64_t flags)
+ *
+ * Routine to abort a system override (if one was active) 
+ * irrespective of the assertion counts and number of blocked 
+ * requestors.
+ */
+static void
+system_override_abort(uint64_t flags)
+{
+       
+       lck_mtx_assert(&sys_override_lock, LCK_MTX_ASSERT_OWNED);
+       
+       if ((flags & SYS_OVERRIDE_IO_THROTTLE) && (io_throttle_assert_cnt > 0)) {
+               system_override_callouts(SYS_OVERRIDE_IO_THROTTLE, false);
+       }
+
+       if ((flags & SYS_OVERRIDE_CPU_THROTTLE) && (cpu_throttle_assert_cnt > 0))  {
+               system_override_callouts(SYS_OVERRIDE_CPU_THROTTLE, false);
+       }
+
+       if ((flags & SYS_OVERRIDE_FAST_JETSAM) && (fast_jetsam_assert_cnt > 0))  {
+               system_override_callouts(SYS_OVERRIDE_FAST_JETSAM, false);
+       }
 }
 
 static __attribute__((noinline)) void
index 887029225c46e7b6b78adbd0611688e088dbe2fd..eaedcc705d75d76d6be4fb2a777a2d454d0feddd 100644 (file)
@@ -228,7 +228,7 @@ pcsamples_control(int *name, __unused u_int namelen, user_addr_t where, size_t *
     int ret=0;
     size_t size=*sizep;
     int value = name[1];
-    pcinfo_t pc_bufinfo;
+    pcinfo_t pc_bufinfo = {};
     pid_t *pidcheck;
 
     pid_t curpid;
index 37df0b5a0fe80a7811ce89751d0b2917a386a158..e05e8d4249cc4c1d4ee8085a8f307f3d39fcda30 100644 (file)
 #include <sys/proc_info.h>
 #include <sys/resourcevar.h>
 
-#define pna_info(fmt, ...) \
-       printf("%s:  " fmt "\n", __func__, ## __VA_ARGS__)
-
+#include <os/log.h>
 #define pna_err(fmt, ...) \
-       printf("ERROR[%s]:  " fmt "\n", __func__, ## __VA_ARGS__)
+       os_log_error(OS_LOG_DEFAULT, "ERROR: " fmt, ## __VA_ARGS__)
 
 #define MAX_PERSONAS     512
 
 #define PERSONA_SYSTEM_UID    ((uid_t)99)
 #define PERSONA_SYSTEM_LOGIN  "system"
 
+#define PERSONA_ALLOC_TOKEN   (0x7a0000ae)
+#define PERSONA_INIT_TOKEN    (0x7500005e)
 #define PERSONA_MAGIC         (0x0aa55aa0)
+#define persona_initialized(p) ((p)->pna_valid == PERSONA_MAGIC || (p)->pna_valid == PERSONA_INIT_TOKEN)
 #define persona_valid(p)      ((p)->pna_valid == PERSONA_MAGIC)
 #define persona_mkinvalid(p)  ((p)->pna_valid = ~(PERSONA_MAGIC))
 
@@ -74,6 +75,8 @@ lck_attr_t *persona_lck_attr;
 lck_grp_t *persona_lck_grp;
 lck_grp_attr_t *persona_lck_grp_attr;
 
+os_refgrp_decl(static, persona_refgrp, "persona", NULL);
+
 static zone_t persona_zone;
 
 kauth_cred_t g_default_persona_cred;
@@ -126,15 +129,18 @@ void personas_bootstrap(void)
        g_system_persona = persona_alloc(PERSONA_SYSTEM_UID,
                                         PERSONA_SYSTEM_LOGIN,
                                         PERSONA_SYSTEM, NULL);
+       int err = persona_init_begin(g_system_persona);
+       assert(err == 0);
+
+       persona_init_end(g_system_persona, err);
+
        assert(g_system_persona != NULL);
 }
 
 struct persona *persona_alloc(uid_t id, const char *login, int type, int *error)
 {
-       struct persona *persona, *tmp;
+       struct persona *persona;
        int err = 0;
-       kauth_cred_t tmp_cred;
-       gid_t new_group;
 
        if (!login) {
                pna_err("Must provide a login name for a new persona!");
@@ -167,10 +173,11 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error)
        }
 
        strncpy(persona->pna_login, login, sizeof(persona->pna_login)-1);
+       persona_dbg("Starting persona allocation for: '%s'", persona->pna_login);
 
        LIST_INIT(&persona->pna_members);
        lck_mtx_init(&persona->pna_lock, persona_lck_grp, persona_lck_attr);
-       persona->pna_refcount = 1;
+       os_ref_init(&persona->pna_refcount, &persona_refgrp);
 
        /*
         * Setup initial (temporary) kauth_cred structure
@@ -184,18 +191,71 @@ struct persona *persona_alloc(uid_t id, const char *login, int type, int *error)
                goto out_error;
        }
 
+       persona->pna_type = type;
+       persona->pna_id = id;
+       persona->pna_valid = PERSONA_ALLOC_TOKEN;
+
+       /*
+        * NOTE: this persona has not been fully initialized. A subsequent
+        * call to persona_init_begin() followed by persona_init_end() will make
+        * the persona visible to the rest of the system.
+        */
+       if (error) {
+               *error = 0;
+       }
+       return persona;
+
+out_error:
+       (void)hw_atomic_add(&g_total_personas, -1);
+       zfree(persona_zone, persona);
+       if (error) {
+               *error = err;
+       }
+       return NULL;
+}
+
+/**
+ * persona_init_begin
+ *
+ * This function begins initialization of a persona. It first acquires the
+ * global persona list lock via lock_personas(), then selects an appropriate
+ * persona ID and sets up the persona's credentials. This function *must* be
+ * followed by a call to persona_init_end(), which will mark the persona
+ * structure as valid.
+ *
+ * Conditions:
+ *     persona has been allocated via persona_alloc()
+ *     nothing locked
+ *
+ * Returns:
+ *     global persona list is locked (even on error)
+ */
+int persona_init_begin(struct persona *persona)
+{
+       struct persona *tmp;
+       int err = 0;
+       kauth_cred_t tmp_cred;
+       gid_t new_group;
+       uid_t id;
+
+       if (!persona || (persona->pna_valid != PERSONA_ALLOC_TOKEN)) {
+               return EINVAL;
+       }
+
+       id = persona->pna_id;
+
        lock_personas();
 try_again:
-       if (id != PERSONA_ID_NONE)
-               persona->pna_id = id;
-       else
+       if (id == PERSONA_ID_NONE)
                persona->pna_id = g_next_persona_id;
 
-       persona_dbg("Adding %d (%s) to global list...", persona->pna_id, persona->pna_login);
+       persona_dbg("Beginning Initialization of %d:%d (%s)...", id, persona->pna_id, persona->pna_login);
 
        err = 0;
        LIST_FOREACH(tmp, &all_personas, pna_list) {
-               if (id == PERSONA_ID_NONE && tmp->pna_id == id) {
+               persona_lock(tmp);
+               if (id == PERSONA_ID_NONE && tmp->pna_id == persona->pna_id) {
+                       persona_unlock(tmp);
                        /*
                         * someone else manually claimed this ID, and we're
                         * trying to allocate an ID for the caller: try again
@@ -203,8 +263,9 @@ try_again:
                        g_next_persona_id += PERSONA_ID_STEP;
                        goto try_again;
                }
-               if (strncmp(tmp->pna_login, login, sizeof(tmp->pna_login)) == 0
-                   || tmp->pna_id == id) {
+               if (strncmp(tmp->pna_login, persona->pna_login, sizeof(tmp->pna_login)) == 0 ||
+                   tmp->pna_id == persona->pna_id) {
+                       persona_unlock(tmp);
                        /*
                         * Disallow use of identical login names and re-use
                         * of previously allocated persona IDs
@@ -212,9 +273,10 @@ try_again:
                        err = EEXIST;
                        break;
                }
+               persona_unlock(tmp);
        }
        if (err)
-               goto out_unlock;
+               goto out;
 
        /* ensure the cred has proper UID/GID defaults */
        kauth_cred_ref(persona->pna_cred);
@@ -227,7 +289,7 @@ try_again:
 
        if (!persona->pna_cred) {
                err = EACCES;
-               goto out_unlock;
+               goto out;
        }
 
        /* it should be a member of exactly 1 group (equal to its UID) */
@@ -243,54 +305,79 @@ try_again:
 
        if (!persona->pna_cred) {
                err = EACCES;
-               goto out_unlock;
+               goto out;
        }
 
-       persona->pna_type = type;
-
-       /* insert the, now valid, persona into the global list! */
-       persona->pna_valid = PERSONA_MAGIC;
-       LIST_INSERT_HEAD(&all_personas, persona, pna_list);
-
        /* if the kernel supplied the persona ID, increment for next time */
        if (id == PERSONA_ID_NONE)
                g_next_persona_id += PERSONA_ID_STEP;
 
-out_unlock:
-       unlock_personas();
+       persona->pna_valid = PERSONA_INIT_TOKEN;
 
-       if (err) {
-               switch (err) {
-               case EEXIST:
-                       persona_dbg("Login '%s' (%d) already exists",
-                                   login, persona->pna_id);
-                       break;
-               case EACCES:
-                       persona_dbg("kauth_error for persona:%d", persona->pna_id);
-                       break;
-               default:
-                       persona_dbg("Unknown error:%d", err);
-               }
-               goto out_error;
+out:
+       if (err != 0) {
+               persona_dbg("ERROR:%d while initializing %d:%d (%s)...", err, id, persona->pna_id, persona->pna_login);
+               /*
+                * mark the persona with an error so that persona_init_end()
+                * will *not* add it to the global list.
+                */
+               persona->pna_id = PERSONA_ID_NONE;
        }
 
-       return persona;
+       /*
+        * leave the global persona list locked: it will be
+        * unlocked in a call to persona_init_end()
+        */
+       return err;
+}
 
-out_error:
-       (void)hw_atomic_add(&g_total_personas, -1);
-       zfree(persona_zone, persona);
-       if (error)
-               *error = err;
-       return NULL;
+/**
+ * persona_init_end
+ *
+ * This function finalizes the persona initialization by marking it valid and
+ * adding it to the global list of personas. After unlocking the global list,
+ * the persona will be visible to the rest of the system. The function will
+ * only mark the persona valid if the input parameter 'error' is 0.
+ *
+ * Conditions:
+ *     persona is initialized via persona_init_begin()
+ *     global persona list is locked via lock_personas()
+ *
+ * Returns:
+ *     global persona list is unlocked
+ */
+void persona_init_end(struct persona *persona, int error)
+{
+       if (persona == NULL) {
+               return;
+       }
+
+       /*
+        * If the pna_valid member is set to the INIT_TOKEN value, then it has
+        * successfully gone through persona_init_begin(), and we can mark it
+        * valid and make it visible to the rest of the system. However, if
+        * there was an error either during initialization or otherwise, we
+        * need to decrement the global count of personas because this one
+        * will be disposed of by the caller's invocation of persona_put().
+        */
+       if (error != 0 || persona->pna_valid == PERSONA_ALLOC_TOKEN) {
+               persona_dbg("ERROR:%d after initialization of %d (%s)", error, persona->pna_id, persona->pna_login);
+               /* remove this persona from the global count */
+               (void)hw_atomic_add(&g_total_personas, -1);
+       } else if (error == 0 &&
+                  persona->pna_valid == PERSONA_INIT_TOKEN) {
+               persona->pna_valid = PERSONA_MAGIC;
+               LIST_INSERT_HEAD(&all_personas, persona, pna_list);
+               persona_dbg("Initialization of %d (%s) Complete.", persona->pna_id, persona->pna_login);
+       }
+
+       unlock_personas();
 }
 
 static struct persona *persona_get_locked(struct persona *persona)
 {
-       if (persona->pna_refcount) {
-               persona->pna_refcount++;
-               return persona;
-       }
-       return NULL;
+       os_ref_retain_locked(&persona->pna_refcount);
+       return persona;
 }
 
 struct persona *persona_get(struct persona *persona)
@@ -313,9 +400,8 @@ void persona_put(struct persona *persona)
                return;
 
        persona_lock(persona);
-       if (persona->pna_refcount >= 0) {
-               if (--(persona->pna_refcount) == 0)
-                       destroy = 1;
+       if (os_ref_release_locked(&persona->pna_refcount) == 0) {
+               destroy = 1;
        }
        persona_unlock(persona);
 
@@ -851,7 +937,7 @@ int persona_set_cred(struct persona *persona, kauth_cred_t cred)
                return EINVAL;
 
        persona_lock(persona);
-       if (!persona_valid(persona)) {
+       if (!persona_initialized(persona)) {
                ret = EINVAL;
                goto out_unlock;
        }
@@ -888,7 +974,7 @@ int persona_set_cred_from_proc(struct persona *persona, proc_t proc)
                return EINVAL;
 
        persona_lock(persona);
-       if (!persona_valid(persona)) {
+       if (!persona_initialized(persona)) {
                ret = EINVAL;
                goto out_unlock;
        }
@@ -969,7 +1055,7 @@ int persona_set_gid(struct persona *persona, gid_t gid)
                return EINVAL;
 
        persona_lock(persona);
-       if (!persona_valid(persona)) {
+       if (!persona_initialized(persona)) {
                ret = EINVAL;
                goto out_unlock;
        }
@@ -1016,7 +1102,7 @@ int persona_set_groups(struct persona *persona, gid_t *groups, unsigned ngroups,
                return EINVAL;
 
        persona_lock(persona);
-       if (!persona_valid(persona)) {
+       if (!persona_initialized(persona)) {
                ret = EINVAL;
                goto out_unlock;
        }
index f25390cb4955603dee16b057aefcf616a0db4d0d..3107ae6d058d41d3128eeb029936b180b6651e18 100644 (file)
@@ -162,10 +162,6 @@ extern struct tty cons;
 
 extern int cs_debug;
 
-#if DEVELOPMENT || DEBUG
-extern int cs_enforcement_enable;
-#endif
-
 #if DEBUG
 #define __PROC_INTERNAL_DEBUG 1
 #endif
@@ -188,14 +184,10 @@ typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
 
 static void orphanpg(struct pgrp * pg);
 void proc_name_kdp(task_t t, char * buf, int size);
-void * proc_get_uthread_uu_threadlist(void * uthread_v);
 int proc_threadname_kdp(void * uth, char * buf, size_t size);
 void proc_starttime_kdp(void * p, unaligned_u64 *tv_sec, unaligned_u64 *tv_usec, unaligned_u64 *abstime);
 char * proc_name_address(void * p);
 
-/* TODO: make a header that's exported and usable in osfmk */
-char* proc_best_name(proc_t p);
-
 static void  pgrp_add(struct pgrp * pgrp, proc_t parent, proc_t child);
 static void pgrp_remove(proc_t p);
 static void pgrp_replace(proc_t p, struct pgrp *pgrp);
@@ -212,9 +204,6 @@ struct fixjob_iterargs {
 
 int fixjob_callback(proc_t, void *);
 
-uint64_t get_current_unique_pid(void);
-
-
 uint64_t
 get_current_unique_pid(void)
 {
@@ -1010,6 +999,17 @@ proc_exiting(proc_t p)
        return(retval? 1: 0);
 }
 
+int
+proc_in_teardown(proc_t p)
+{
+       int retval = 0;
+
+       if (p)
+               retval = p->p_lflag & P_LPEXIT;
+       return(retval? 1: 0);
+
+}
+
 int
 proc_forcequota(proc_t p)
 {
@@ -1079,6 +1079,13 @@ proc_is64bit(proc_t p)
        return(IS_64BIT_PROCESS(p));
 }
 
+int
+proc_is64bit_data(proc_t p)
+{
+       assert(p->task);
+       return (int)task_get_64bit_data(p->task);
+}
+
 int
 proc_pidversion(proc_t p)
 {
@@ -1950,6 +1957,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
                case CS_OPS_ENTITLEMENTS_BLOB:
                case CS_OPS_IDENTITY:
                case CS_OPS_BLOB:
+               case CS_OPS_TEAMID:
                        break;  /* not restricted to root */
                default:
                        if (forself == 0 && kauth_cred_issuser(kauth_cred_get()) != TRUE)
@@ -2000,12 +2008,16 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
 
                        proc_lock(pt);
                        retflags = pt->p_csflags;
-                       if (cs_enforcement(pt))
+                       if (cs_process_enforcement(pt))
                                retflags |= CS_ENFORCEMENT;
                        if (csproc_get_platform_binary(pt))
                                retflags |= CS_PLATFORM_BINARY;
                        if (csproc_get_platform_path(pt))
                                retflags |= CS_PLATFORM_PATH;
+                       // Don't return CS_REQUIRE_LV if we turned it on with CS_FORCED_LV, but still report CS_FORCED_LV
+                       if ((pt->p_csflags & CS_FORCED_LV) == CS_FORCED_LV) {
+                               retflags &= (~CS_REQUIRE_LV);
+                       }
                        proc_unlock(pt);
 
                        if (uaddr != USER_ADDR_NULL)
@@ -2154,7 +2166,8 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
                        error = csops_copy_token(start, length, usize, uaddr);
                        break;
                }
-               case CS_OPS_IDENTITY: {
+               case CS_OPS_IDENTITY:
+               case CS_OPS_TEAMID: {
                        const char *identity;
                        uint8_t fakeheader[8];
                        uint32_t idlen;
@@ -2178,7 +2191,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
                                break;
                        }
 
-                       identity = cs_identity_get(pt);
+                       identity = ops == CS_OPS_TEAMID ? csproc_get_teamid(pt) : cs_identity_get(pt);
                        proc_unlock(pt);
                        if (identity == NULL) {
                                error = ENOENT;
@@ -2209,7 +2222,7 @@ csops_internal(pid_t pid, int ops, user_addr_t uaddr, user_size_t usersize, user
 
                case CS_OPS_CLEARPLATFORM:
 #if DEVELOPMENT || DEBUG
-                       if (cs_enforcement_enable) {
+                       if (cs_process_global_enforcement()) {
                                error = ENOTSUP;
                                break;
                        }
@@ -2248,7 +2261,7 @@ proc_iterate(
        proc_iterate_fn_t filterfn,
        void *filterarg)
 {
-       pid_t *pid_list;
+       pid_t *pid_list = NULL;
        vm_size_t pid_list_size = 0;
        vm_size_t pid_list_size_needed = 0;
        int pid_count = 0;
@@ -2260,7 +2273,7 @@ proc_iterate(
        for (;;) {
                proc_list_lock();
 
-               pid_count_available = nprocs + 1; //kernel_task is not counted in nprocs
+               pid_count_available = nprocs + 1 /* kernel_task not counted in nprocs */;
                assert(pid_count_available > 0);
 
                pid_list_size_needed = pid_count_available * sizeof(pid_t);
@@ -2278,6 +2291,7 @@ proc_iterate(
                }
                pid_list_size = pid_list_size_needed;
        }
+       assert(pid_list != NULL);
 
        /* filter pids into pid_list */
 
@@ -3229,7 +3243,7 @@ extern boolean_t kill_on_no_paging_space;
 #endif /* DEVELOPMENT || DEBUG */
 
 #define MB_SIZE        (1024 * 1024ULL)
-boolean_t      memorystatus_kill_on_VM_thrashing(boolean_t);
+boolean_t      memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
 
 extern int32_t max_kill_priority;
 extern int     memorystatus_get_proccnt_upto_priority(int32_t max_bucket_index);
@@ -3305,7 +3319,7 @@ no_paging_space_action()
        if (memorystatus_get_proccnt_upto_priority(max_kill_priority) > 0) {
 
                last_no_space_action = now;
-               memorystatus_kill_on_VM_thrashing(TRUE /* async */);
+               memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
                return (1);
        }
 
@@ -3432,11 +3446,17 @@ proc_chrooted(proc_t p)
        return retval;
 }
 
-void *
-proc_get_uthread_uu_threadlist(void * uthread_v)
+boolean_t
+proc_send_synchronous_EXC_RESOURCE(proc_t p)
 {
-       uthread_t uth = (uthread_t)uthread_v;
-       return (uth != NULL) ? uth->uu_threadlist : NULL;
+       if (p == PROC_NULL)
+               return FALSE;
+
+       /* Send sync EXC_RESOURCE if the process is traced */
+       if (ISSET(p->p_lflag, P_LTRACED)) {
+               return TRUE;
+       }
+       return FALSE;
 }
 
 #ifdef CONFIG_32BIT_TELEMETRY
index 9d825afcbf94c302d8011f8fdeb5cc9c58ab22c0..36beb273735e442be113b9625ba29f5e9051053c 100644 (file)
@@ -669,6 +669,12 @@ out:
  *             real, effective, or saved user or group IDs since beginning
  *             execution.
  */
+int
+proc_issetugid (proc_t p)
+{
+       return (p->p_flag & P_SUGID) ? 1 : 0;
+}
+
 int
 issetugid(proc_t p, __unused struct issetugid_args *uap, int32_t *retval)
 {
@@ -681,7 +687,7 @@ issetugid(proc_t p, __unused struct issetugid_args *uap, int32_t *retval)
         * that libc *might* have put in their data segment.
         */
 
-       *retval = (p->p_flag & P_SUGID) ? 1 : 0;
+       *retval = proc_issetugid(p);
        return (0);
 }
 
index 780159263d051645e27aea413e9f7592123608a2..cf55f42d5b4d62d6a9a13a5c9bebf57095f9010c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1232,8 +1232,8 @@ int
 getrusage(struct proc *p, struct getrusage_args *uap, __unused int32_t *retval)
 {
        struct rusage *rup, rubuf;
-       struct user64_rusage rubuf64;
-       struct user32_rusage rubuf32;
+       struct user64_rusage rubuf64 = {};
+       struct user32_rusage rubuf32 = {};
        size_t retsize = sizeof(rubuf);                 /* default: 32 bits */
        caddr_t retbuf = (caddr_t)&rubuf;               /* default: 32 bits */
        struct timeval utime;
@@ -1421,6 +1421,13 @@ proc_limitreplace(proc_t p)
        return(0);
 }
 
+static int
+iopolicysys_disk(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+static int
+iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+static int
+iopolicysys_vfs_atime_updates(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
+
 /*
  * iopolicysys
  *
@@ -1433,12 +1440,6 @@ proc_limitreplace(proc_t p)
  *             EINVAL                          Invalid command or invalid policy arguments
  *
  */
-
-static int
-iopolicysys_disk(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
-static int
-iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param);
-
 int
 iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
 {
@@ -1459,7 +1460,12 @@ iopolicysys(struct proc *p, struct iopolicysys_args *uap, int32_t *retval)
                                goto out;
                        break;
                case IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY:
-                       error = iopolicysys_vfs(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+                       error = iopolicysys_vfs_hfs_case_sensitivity(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
+                       if (error)
+                               goto out;
+                       break;
+               case IOPOL_TYPE_VFS_ATIME_UPDATES:
+                       error = iopolicysys_vfs_atime_updates(p, uap->cmd, iop_param.iop_scope, iop_param.iop_policy, &iop_param);
                        if (error)
                                goto out;
                        break;
@@ -1600,7 +1606,7 @@ out:
 }
 
 static int
-iopolicysys_vfs(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param)
+iopolicysys_vfs_hfs_case_sensitivity(struct proc *p, int cmd, int scope, int policy, struct _iopol_param_t *iop_param)
 {
        int                     error = 0;
 
@@ -1668,6 +1674,93 @@ out:
        return (error);
 }
 
+static inline int
+get_thread_atime_policy(struct uthread *ut)
+{
+       return (ut->uu_flag & UT_ATIME_UPDATE)? IOPOL_ATIME_UPDATES_OFF: IOPOL_ATIME_UPDATES_DEFAULT;
+}
+
+static inline void
+set_thread_atime_policy(struct uthread *ut, int policy)
+{
+       if (policy == IOPOL_ATIME_UPDATES_OFF) {
+               ut->uu_flag |= UT_ATIME_UPDATE;
+       } else {
+               ut->uu_flag &= ~UT_ATIME_UPDATE;
+       }
+}
+
+static inline void
+set_task_atime_policy(struct proc *p, int policy)
+{
+       if (policy == IOPOL_ATIME_UPDATES_OFF) {
+               OSBitOrAtomic16((uint16_t)P_VFS_IOPOLICY_ATIME_UPDATES, &p->p_vfs_iopolicy);
+       } else {
+               OSBitAndAtomic16(~((uint16_t)P_VFS_IOPOLICY_ATIME_UPDATES), &p->p_vfs_iopolicy);
+       }
+}
+
+static inline int
+get_task_atime_policy(struct proc *p)
+{
+       return (p->p_vfs_iopolicy & P_VFS_IOPOLICY_ATIME_UPDATES)? IOPOL_ATIME_UPDATES_OFF: IOPOL_ATIME_UPDATES_DEFAULT;
+}
+
+static int
+iopolicysys_vfs_atime_updates(struct proc *p __unused, int cmd, int scope, int policy, struct _iopol_param_t *iop_param)
+{
+       int                     error = 0;
+       thread_t                thread;
+
+       /* Validate scope */
+       switch (scope) {
+               case IOPOL_SCOPE_THREAD:
+                       thread = current_thread();
+                       break;
+               case IOPOL_SCOPE_PROCESS:
+                       thread = THREAD_NULL;
+                       break;
+               default:
+                       error = EINVAL;
+                       goto out;
+       }
+
+       /* Validate policy */
+       if (cmd == IOPOL_CMD_SET) {
+               switch (policy) {
+                       case IOPOL_ATIME_UPDATES_DEFAULT:
+                       case IOPOL_ATIME_UPDATES_OFF:
+                               break;
+                       default:
+                               error = EINVAL;
+                               goto out;
+               }
+       }
+
+       /* Perform command */
+       switch(cmd) {
+               case IOPOL_CMD_SET:
+                       if (thread != THREAD_NULL)
+                               set_thread_atime_policy(get_bsdthread_info(thread), policy);
+                       else
+                               set_task_atime_policy(p, policy);
+                       break;
+               case IOPOL_CMD_GET:
+                       if (thread != THREAD_NULL)
+                               policy = get_thread_atime_policy(get_bsdthread_info(thread));
+                       else
+                               policy = get_task_atime_policy(p);
+                       iop_param->iop_policy = policy;
+                       break;
+               default:
+                       error = EINVAL; /* unknown command */
+                       break;
+       }
+
+out:
+       return (error);
+}
+
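
From userspace the new policy is reached through the existing iopolicy wrappers. A hedged example follows; it assumes the IOPOL_TYPE_VFS_ATIME_UPDATES, IOPOL_SCOPE_PROCESS, and IOPOL_ATIME_UPDATES_OFF constants introduced by this commit are visible in the SDK headers and that setiopolicy_np()/getiopolicy_np() pass them through unchanged.

    #include <stdio.h>
    #include <sys/resource.h>

    int
    main(void)
    {
        /* Opt this process out of access-time updates on supporting filesystems. */
        if (setiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES, IOPOL_SCOPE_PROCESS,
                           IOPOL_ATIME_UPDATES_OFF) != 0) {
            perror("setiopolicy_np");
            return 1;
        }
        printf("atime policy now: %d\n",
               getiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES, IOPOL_SCOPE_PROCESS));
        return 0;
    }
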
 /* BSD call back function for task_policy networking changes */
 void
 proc_apply_task_networkbg(void * bsd_info, thread_t thread)
@@ -1697,8 +1790,11 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor)
        case RUSAGE_INFO_V4:
                ru->ri_logical_writes = get_task_logical_writes(p->task);
                ru->ri_lifetime_max_phys_footprint = get_task_phys_footprint_lifetime_max(p->task);
+#if CONFIG_LEDGER_INTERVAL_MAX
+               ru->ri_interval_max_phys_footprint = get_task_phys_footprint_interval_max(p->task, FALSE);
+#endif
                fill_task_monotonic_rusage(p->task, ru);
-        /* fall through */
+       /* fall through */
 
        case RUSAGE_INFO_V3:
                fill_task_qos_rusage(p->task, ru);
@@ -1736,7 +1832,7 @@ gather_rusage_info(proc_t p, rusage_info_current *ru, int flavor)
 int
 proc_get_rusage(proc_t p, int flavor, user_addr_t buffer, __unused int is_zombie)
 {
-       rusage_info_current ri_current;
+       rusage_info_current ri_current = {};
 
        int error = 0;
        size_t size = 0;
@@ -1811,6 +1907,9 @@ mach_to_bsd_rv(int mach_rv)
  * uap->flavor available flavors:
  *
  *     RLIMIT_WAKEUPS_MONITOR
+ *     RLIMIT_CPU_USAGE_MONITOR
+ *     RLIMIT_THREAD_CPULIMITS
+ *     RLIMIT_FOOTPRINT_INTERVAL
  */
 int
 proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *uap, __unused int32_t *retval)
@@ -1821,6 +1920,10 @@ proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *ua
        uint32_t cpumon_flags;
        uint32_t cpulimits_flags;
        kauth_cred_t my_cred, target_cred;
+#if CONFIG_LEDGER_INTERVAL_MAX
+       uint32_t footprint_interval_flags;      
+       uint64_t interval_max_footprint;
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
 
        /* -1 implicitly means our own process (perhaps even the current thread for per-thread attributes) */
        if (uap->pid == -1) {
@@ -1883,6 +1986,20 @@ proc_rlimit_control(__unused struct proc *p, struct proc_rlimit_control_args *ua
 
                error = mach_to_bsd_rv(thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, percent, ns_refill));
                break;
+
+#if CONFIG_LEDGER_INTERVAL_MAX
+       case RLIMIT_FOOTPRINT_INTERVAL:
+               footprint_interval_flags = uap->arg; // XXX temporarily stashing flags in argp (12592127)
+               /*
+                * There is currently only one option for this flavor.
+                */
+               if ((footprint_interval_flags & FOOTPRINT_INTERVAL_RESET) == 0) {
+                       error = EINVAL;
+                       break;
+               }
+               interval_max_footprint = get_task_phys_footprint_interval_max(targetp->task, TRUE);
+               break;
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
        default:
                error = EINVAL;
                break;
index 849562ccaf2890333013775d13fee33926babb84..c444668bd6e750220cbfd7cca2a55de644df750f 100644 (file)
 
 #include <sys/sdt.h>
 #include <sys/codesign.h>
+#include <sys/random.h>
 #include <libkern/section_keywords.h>
 
 #if CONFIG_MACF
@@ -261,6 +262,11 @@ __sigaction_user32_to_kern(struct __user32_sigaction *in, struct __kern_sigactio
        out->sa_tramp = CAST_USER_ADDR_T(in->sa_tramp);
        out->sa_mask = in->sa_mask;
        out->sa_flags = in->sa_flags;
+
+       kern_return_t kr;
+       kr = machine_thread_function_pointers_convert_from_user(current_thread(),
+                       &out->sa_tramp, 1);
+       assert(kr == KERN_SUCCESS);
 }
 
 static void
@@ -270,6 +276,11 @@ __sigaction_user64_to_kern(struct __user64_sigaction *in, struct __kern_sigactio
        out->sa_tramp = in->sa_tramp;
        out->sa_mask = in->sa_mask;
        out->sa_flags = in->sa_flags;
+
+       kern_return_t kr;
+       kr = machine_thread_function_pointers_convert_from_user(current_thread(),
+                       &out->sa_tramp, 1);
+       assert(kr == KERN_SUCCESS);
 }
 
 #if SIGNAL_DEBUG
@@ -444,6 +455,7 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval)
 
        int signum;
        int bit, error=0;
+       uint32_t sigreturn_validation = PS_SIGRETURN_VALIDATION_DEFAULT;
 
        signum = uap->signum;
        if (signum <= 0 || signum >= NSIG ||
@@ -462,6 +474,9 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval)
                }
                if (error)
                        return (error);
+
+               sigreturn_validation = (__vec.sa_flags & SA_VALIDATE_SIGRETURN_FROM_SIGTRAMP) ?
+                               PS_SIGRETURN_VALIDATION_ENABLED : PS_SIGRETURN_VALIDATION_DISABLED;
                __vec.sa_flags &= SA_USERSPACE_MASK; /* Only pass on valid sa_flags */
 
                if ((__vec.sa_flags & SA_SIGINFO) || __vec.sa_handler != SIG_DFL) {
@@ -488,8 +503,6 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval)
                        sa->sa_flags |= SA_SIGINFO;
                if (ps->ps_signodefer & bit)
                        sa->sa_flags |= SA_NODEFER;
-               if (ps->ps_64regset & bit)
-                       sa->sa_flags |= SA_64REGSET;
                if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDSTOP))
                        sa->sa_flags |= SA_NOCLDSTOP;
                if ((signum == SIGCHLD) && (p->p_flag & P_NOCLDWAIT))
@@ -509,6 +522,13 @@ sigaction(proc_t p, struct sigaction_args *uap, __unused int32_t *retval)
        }
 
        if (uap->nsa) {
+               uint32_t old_sigreturn_validation = atomic_load_explicit(
+                               &ps->ps_sigreturn_validation, memory_order_relaxed);
+               if (old_sigreturn_validation == PS_SIGRETURN_VALIDATION_DEFAULT) {
+                       atomic_compare_exchange_strong_explicit(&ps->ps_sigreturn_validation,
+                                       &old_sigreturn_validation, sigreturn_validation,
+                                       memory_order_relaxed, memory_order_relaxed);
+               }
                error = setsigvec(p, current_thread(), signum, &__vec, FALSE);
        }
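
The load plus compare-and-swap added above implements a first-writer-wins rule: the first sigaction() call that installs a handler after exec decides the process-wide sigreturn-validation mode, and later calls cannot change it. A standalone sketch of that idiom follows; the enum values and names are illustrative, not the kernel's.

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { MODE_DEFAULT = 0, MODE_ENABLED = 1, MODE_DISABLED = 2 };
    static _Atomic uint32_t validation_mode = MODE_DEFAULT;

    static void
    set_mode_once(uint32_t requested)
    {
        uint32_t expected = MODE_DEFAULT;
        /* Succeeds only while the mode is still DEFAULT; losers keep the winner's value. */
        atomic_compare_exchange_strong_explicit(&validation_mode, &expected, requested,
                                                memory_order_relaxed, memory_order_relaxed);
    }

    int
    main(void)
    {
        set_mode_once(MODE_ENABLED);     /* first sigaction(): DEFAULT -> ENABLED      */
        set_mode_once(MODE_DISABLED);    /* later call: CAS fails, mode stays ENABLED  */
        printf("%u\n", (unsigned)atomic_load(&validation_mode));   /* prints 1 */
        return 0;
    }
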
 
@@ -673,10 +693,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio
                ps->ps_siginfo |= bit;
        else
                ps->ps_siginfo &= ~bit;
-       if (sa->sa_flags & SA_64REGSET)
-               ps->ps_64regset |= bit;
-       else
-               ps->ps_64regset &= ~bit;
        if ((sa->sa_flags & SA_RESTART) == 0)
                ps->ps_sigintr |= bit;
        else
@@ -685,10 +701,6 @@ setsigvec(proc_t p, __unused thread_t thread, int signum, struct __kern_sigactio
                ps->ps_sigonstack |= bit;
        else
                ps->ps_sigonstack &= ~bit;
-       if (sa->sa_flags & SA_USERTRAMP)
-               ps->ps_usertramp |= bit;
-       else
-               ps->ps_usertramp &= ~bit;
        if (sa->sa_flags & SA_RESETHAND)
                ps->ps_sigreset |= bit;
        else
@@ -786,6 +798,11 @@ execsigs(proc_t p, thread_t thread)
                ps->ps_sigact[nc] = SIG_DFL;
        }
 
+       atomic_store_explicit(&ps->ps_sigreturn_validation,
+                       PS_SIGRETURN_VALIDATION_DEFAULT, memory_order_relaxed);
+       /* Generate random token value used to validate sigreturn arguments */
+       read_random(&ps->ps_sigreturn_token, sizeof(ps->ps_sigreturn_token));
+
        /*
         * Reset stack state to the user stack.
         * Clear set of signals caught on the signal stack.
@@ -1678,6 +1695,15 @@ terminate_with_payload_internal(struct proc *cur_proc, int target_pid, uint32_t
                return EPERM;
        }
 
+       if (target_pid != cur_proc->p_pid) {
+               /*
+                * FLAG_ABORT should only be set on terminate_with_reason(getpid()) that
+                * was a fallback from an unsuccessful abort_with_reason(). In that case,
+                * the caller's pid matches the target pid. Otherwise, remove the flag.
+                */
+               reason_flags &= ~((typeof(reason_flags))OS_REASON_FLAG_ABORT);
+       }
+
        KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
                                        target_proc->p_pid, reason_namespace,
                                        reason_code, 0, 0);
@@ -3063,6 +3089,7 @@ postsig_locked(int signum)
        uint32_t code;
        int mask, returnmask;
        struct uthread * ut;
+       os_reason_t ut_exit_reason = OS_REASON_NULL;
 
 #if DIAGNOSTIC
        if (signum == 0)
@@ -3093,6 +3120,15 @@ postsig_locked(int signum)
                 * the process.  (Other cases were ignored above.)
                 */
                sig_lock_to_exit(p);
+
+               /*
+                * exit_with_reason() below will consume a reference to the thread's exit reason, so we take another
+                * reference so the thread still has one even after we call exit_with_reason(). The thread's reference will
+                * ultimately be destroyed in uthread_cleanup().
+                */
+               ut_exit_reason = ut->uu_exit_reason;
+               os_reason_ref(ut_exit_reason);
+
                p->p_acflag |= AXSIG;
                if (sigprop[signum] & SA_CORE) {
                        p->p_sigacts->ps_sig = signum;
@@ -3132,12 +3168,7 @@ postsig_locked(int signum)
                KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_FRCEXIT) | DBG_FUNC_NONE,
                                              p->p_pid, W_EXITCODE(0, signum), 3, 0, 0);
 
-               /*
-                * exit_with_reason() will consume a reference to the thread's exit reason, so we take another
-                * reference for the thread. This reference will be destroyed in uthread_cleanup().
-                */
-               os_reason_ref(ut->uu_exit_reason);
-               exit_with_reason(p, W_EXITCODE(0, signum), (int *)NULL, TRUE, TRUE, 0, ut->uu_exit_reason);
+               exit_with_reason(p, W_EXITCODE(0, signum), (int *)NULL, TRUE, TRUE, 0, ut_exit_reason);
 
                proc_lock(p);
                return;
@@ -3266,11 +3297,8 @@ filt_signaltouch(
 
        proc_klist_lock();
 
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
-       /* 
-        * No data to save - 
-        * just capture if it is already fired
+       /*
+        * No data to save - just capture if it is already fired
         */
        res = (kn->kn_data > 0);
 
index 46018b2de98b37a3e2c85393c24e0f2791128cfc..a88a51ca8774f6ac6f92c209b0c8da6b22987315 100644 (file)
@@ -52,6 +52,7 @@
 #include <sys/disk.h>
 #include <sys/conf.h>
 #include <sys/content_protection.h>
+#include <sys/fsctl.h>
 
 #include <mach-o/loader.h>
 #include <mach-o/nlist.h>
@@ -61,6 +62,9 @@
 #include <pexpert/pexpert.h>
 #include <IOKit/IOPolledInterface.h>
 
+#define HIBERNATE_MIN_PHYSICAL_LBA    (34)
+#define HIBERNATE_MIN_FILE_SIZE       (1024*1024)
+
 /* This function is called from kern_sysctl in the current process context;
  * it is exported with the System6.0.exports, but this appears to be a legacy
  * export, as there are no internal consumers.
@@ -75,13 +79,15 @@ get_kernel_symfile(__unused proc_t p, __unused char const **symfile)
 
 struct kern_direct_file_io_ref_t
 {
-    vfs_context_t  ctx;
-    struct vnode * vp;
-    dev_t          device;
-    uint32_t      blksize;
-    off_t          filelength;
-    char           cf;
-    char           pinned;
+    vfs_context_t      ctx;
+    struct vnode      * vp;
+    dev_t               device;
+    uint32_t           blksize;
+    off_t               filelength;
+    char                cf;
+    char                pinned;
+    char                frozen;
+    char                wbcranged;
 };
 
 
@@ -201,7 +207,7 @@ extern uint32_t freespace_mb(vnode_t vp);
 
 struct kern_direct_file_io_ref_t *
 kern_open_file_for_direct_io(const char * name, 
-                            boolean_t create_file,
+                            uint32_t iflags,
                             kern_get_file_extents_callback_t callback, 
                             void * callback_ref,
                              off_t set_file_size,
@@ -219,17 +225,18 @@ kern_open_file_for_direct_io(const char * name,
 
     proc_t            p;
     struct vnode_attr va;
+    dk_apfs_wbc_range_t wbc_range;
     int               error;
     off_t             f_offset;
     uint64_t          fileblk;
     size_t            filechunk;
-    uint64_t          physoffset;
+    uint64_t          physoffset, minoffset;
     dev_t             device;
     dev_t             target = 0;
     int               isssd = 0;
     uint32_t          flags = 0;
     uint32_t          blksize;
-    off_t             maxiocount, count, segcount;
+    off_t             maxiocount, count, segcount, wbctotal;
     boolean_t         locked = FALSE;
     int               fmode, cmode;
     struct            nameidata nd;
@@ -253,7 +260,7 @@ kern_open_file_for_direct_io(const char * name,
     p = kernproc;
     ref->ctx = vfs_context_kernel();
 
-    fmode  = (create_file) ? (O_CREAT | FWRITE) : FWRITE;
+    fmode  = (kIOPolledFileCreate & iflags) ? (O_CREAT | FWRITE) : FWRITE;
     cmode =  S_IRUSR | S_IWUSR;
     ndflags = NOFOLLOW;
     NDINIT(&nd, LOOKUP, OP_OPEN, ndflags, UIO_SYSSPACE, CAST_USER_ADDR_T(name), ref->ctx);
@@ -276,10 +283,10 @@ kern_open_file_for_direct_io(const char * name,
 
     if (write_file_addr && write_file_len)
     {
-       if ((error = kern_write_file(ref, write_file_offset, write_file_addr, write_file_len, IO_SKIP_ENCRYPTION))) {
-               kprintf("kern_write_file() failed with error: %d\n", error);
-               goto out;
-       }
+        if ((error = kern_write_file(ref, write_file_offset, write_file_addr, write_file_len, IO_SKIP_ENCRYPTION))) {
+            kprintf("kern_write_file() failed with error: %d\n", error);
+            goto out;
+        }
     }
 
     VATTR_INIT(&va);
@@ -292,6 +299,7 @@ kern_open_file_for_direct_io(const char * name,
     error = EFAULT;
     if (vnode_getattr(ref->vp, &va, ref->ctx)) goto out;
 
+    wbctotal = 0;
     mpFree = freespace_mb(ref->vp);
     mpFree <<= 20;
     kprintf("kern_direct_file(%s): vp size %qd, alloc %qd, mp free %qd, keep free %qd\n", 
@@ -309,8 +317,31 @@ kern_open_file_for_direct_io(const char * name,
         p2 = p;
         do_ioctl = &file_ioctl;
 
+        if (kIOPolledFileHibernate & iflags)
+        {
+            error = do_ioctl(p1, p2, DKIOCAPFSGETWBCRANGE, (caddr_t) &wbc_range);
+            ref->wbcranged = (error == 0);
+        }
+        if (ref->wbcranged)
+        {
+            uint32_t idx;
+            assert(wbc_range.count <= (sizeof(wbc_range.extents) / sizeof(wbc_range.extents[0])));
+            for (idx = 0; idx < wbc_range.count; idx++) wbctotal += wbc_range.extents[idx].length;
+            kprintf("kern_direct_file(%s): wbc %qd\n", name, wbctotal);
+            if (wbctotal) target = wbc_range.dev;
+        }
+
         if (set_file_size)
         {
+            if (wbctotal)
+            {
+                if (wbctotal >= set_file_size) set_file_size = HIBERNATE_MIN_FILE_SIZE;
+                else
+                {
+                    set_file_size -= wbctotal;
+                    if (set_file_size < HIBERNATE_MIN_FILE_SIZE) set_file_size = HIBERNATE_MIN_FILE_SIZE;
+                }
+            }
             if (fs_free_size)
             {
                mpFree += va.va_data_alloc;
@@ -354,6 +385,8 @@ kern_open_file_for_direct_io(const char * name,
     if (error)
         goto out;
 
+    minoffset = HIBERNATE_MIN_PHYSICAL_LBA * ref->blksize;
+
     if (ref->vp->v_type != VREG)
     {
         error = do_ioctl(p1, p2, DKIOCGETBLOCKCOUNT, (caddr_t) &fileblk);
@@ -361,12 +394,18 @@ kern_open_file_for_direct_io(const char * name,
        ref->filelength = fileblk * ref->blksize;    
     }
 
-    // pin logical extents
+    // pin logical extents, CS version
 
     error = kern_ioctl_file_extents(ref, _DKIOCCSPINEXTENT, 0, ref->filelength);
     if (error && (ENOTTY != error)) goto out;
     ref->pinned = (error == 0);
 
+    // pin logical extents, apfs version
+
+    error = VNOP_IOCTL(ref->vp, FSCTL_FREEZE_EXTENTS, NULL, 0, ref->ctx);
+    if (error && (ENOTTY != error)) goto out;
+    ref->frozen = (error == 0);
+
     // generate the block list
 
     error = do_ioctl(p1, p2, DKIOCLOCKPHYSICALEXTENTS, NULL);
@@ -412,6 +451,9 @@ kern_open_file_for_direct_io(const char * name,
                 error = ENOTSUP;
                 goto out;
             }
+
+            assert(getphysreq.offset >= minoffset);
+
 #if HIBFRAGMENT
            uint64_t rev;
            for (rev = 4096; rev <= getphysreq.length; rev += 4096)
@@ -424,6 +466,15 @@ kern_open_file_for_direct_io(const char * name,
             physoffset += getphysreq.length;
         }
     }
+    if (ref->wbcranged)
+    {
+        uint32_t idx;
+        for (idx = 0; idx < wbc_range.count; idx++)
+        {
+            assert(wbc_range.extents[idx].offset >= minoffset);
+            callback(callback_ref, wbc_range.extents[idx].offset, wbc_range.extents[idx].length);
+        }
+    }
     callback(callback_ref, 0ULL, 0ULL);
 
     if (ref->vp->v_type == VREG) p1 = &target;
@@ -529,15 +580,24 @@ out:
 
     if (error && ref)
     {
-       if (ref->vp)
-       {
-           (void) kern_ioctl_file_extents(ref, _DKIOCCSUNPINEXTENT, 0, (ref->pinned && ref->cf) ? ref->filelength : 0);
-           vnode_close(ref->vp, FWRITE, ref->ctx);
-           ref->vp = NULLVP;
-       }
-       ref->ctx = NULL;
-       kfree(ref, sizeof(struct kern_direct_file_io_ref_t));
-       ref = NULL;
+        if (ref->vp)
+        {
+            (void) kern_ioctl_file_extents(ref, _DKIOCCSUNPINEXTENT, 0, (ref->pinned && ref->cf) ? ref->filelength : 0);
+
+            if (ref->frozen)
+            {
+                (void) VNOP_IOCTL(ref->vp, FSCTL_THAW_EXTENTS, NULL, 0, ref->ctx);
+            }
+            if (ref->wbcranged)
+            {
+                (void) do_ioctl(p1, p2, DKIOCAPFSRELEASEWBCRANGE, (caddr_t) NULL);
+            }
+            vnode_close(ref->vp, FWRITE, ref->ctx);
+            ref->vp = NULLVP;
+        }
+        ref->ctx = NULL;
+        kfree(ref, sizeof(struct kern_direct_file_io_ref_t));
+        ref = NULL;
     }
 
     return(ref);
@@ -586,6 +646,9 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref,
         void * p1;
         void * p2;
 
+        discard_offset = ((discard_offset + ref->blksize - 1) & ~(((off_t) ref->blksize) - 1));
+        discard_end    = ((discard_end)                       & ~(((off_t) ref->blksize) - 1));
+
         if (ref->vp->v_type == VREG)
         {
             p1 = &ref->device;
@@ -616,6 +679,15 @@ kern_close_file_for_direct_io(struct kern_direct_file_io_ref_t * ref,
             (void) kern_ioctl_file_extents(ref, DKIOCUNMAP, discard_offset, (ref->cf) ? ref->filelength : discard_end);
         }
 
+        if (ref->frozen)
+        {
+            (void) VNOP_IOCTL(ref->vp, FSCTL_THAW_EXTENTS, NULL, 0, ref->ctx);
+        }
+        if (ref->wbcranged)
+        {
+            (void) do_ioctl(p1, p2, DKIOCAPFSRELEASEWBCRANGE, (caddr_t) NULL);
+        }
+
         if (addr && write_length)
         {
             (void) kern_write_file(ref, write_offset, addr, write_length, IO_SKIP_ENCRYPTION);
index f6ed41035cc32078d3cd10487e199096c35b3b6c..d937e9e4ff02f37eaa4b35e5c08e357e412b4a49 100644 (file)
 #include <kern/thread_group.h>
 #include <kern/processor.h>
 #include <kern/cpu_number.h>
+#include <kern/cpu_quiesce.h>
 #include <kern/debug.h>
 #include <kern/sched_prim.h>
 #include <vm/vm_kern.h>
@@ -186,8 +187,6 @@ extern unsigned int vm_max_batch;
 extern unsigned int vm_page_free_min;
 extern unsigned int vm_page_free_target;
 extern unsigned int vm_page_free_reserved;
-extern unsigned int vm_page_speculative_percentage;
-extern unsigned int vm_page_speculative_q_age_ms;
 
 #if (DEVELOPMENT || DEBUG)
 extern uint32_t        vm_page_creation_throttled_hard;
@@ -305,6 +304,13 @@ STATIC int sysctl_singleuser(struct sysctl_oid *oidp, void *arg1, int arg2, stru
 STATIC int sysctl_minimalboot(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 STATIC int sysctl_slide(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
 
+#ifdef CONFIG_XNUPOST
+#include <tests/xnupost.h>
+
+STATIC int sysctl_debug_test_oslog_ctl(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+STATIC int sysctl_debug_test_stackshot_mutex_owner(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+STATIC int sysctl_debug_test_stackshot_rwlck_owner(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req);
+#endif
 
 extern void IORegistrySetOSBuildVersion(char * build_version); 
 
@@ -1269,6 +1275,7 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where,
        caddr_t data;
        size_t argslen=0;
        int size;
+       vm_size_t alloc_size = 0;
        vm_offset_t     copy_start, copy_end;
        kern_return_t ret;
        int pid;
@@ -1383,12 +1390,13 @@ sysctl_procargsx(int *name, u_int namelen, user_addr_t where,
        if (proc_map == NULL)
                return(EINVAL);
 
-
-       ret = kmem_alloc(kernel_map, &copy_start, round_page(arg_size), VM_KERN_MEMORY_BSD);
+       alloc_size = round_page(arg_size);
+       ret = kmem_alloc(kernel_map, &copy_start, alloc_size, VM_KERN_MEMORY_BSD);
        if (ret != KERN_SUCCESS) {
                vm_map_deallocate(proc_map);
                return(ENOMEM);
        }
+       bzero((void *)copy_start, alloc_size);
 
        copy_end = round_page(copy_start + arg_size);
 
@@ -1622,6 +1630,11 @@ SYSCTL_STRING(_kern, KERN_VERSION, version,
 SYSCTL_STRING(_kern, OID_AUTO, uuid, 
                CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, 
                &kernel_uuid_string[0], 0, "");
+
+SYSCTL_STRING(_kern, OID_AUTO, osbuildconfig,
+               CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_MASKED,
+               &osbuild_config[0], 0, "");
+
 #if DEBUG
 #ifndef DKPR
 #define DKPR 1
@@ -1758,6 +1771,21 @@ SYSCTL_PROC(_kern, OID_AUTO, bootargs,
        NULL, 0,
        sysctl_sysctl_bootargs, "A", "bootargs");
 
+STATIC int
+sysctl_kernelcacheuuid(struct sysctl_oid *oidp, void *arg1, int arg2, struct sysctl_req *req)
+{
+    int rval = ENOENT;
+    if (kernelcache_uuid_valid) {
+        rval = sysctl_handle_string(oidp, arg1, arg2, req);
+    }
+    return rval;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, kernelcacheuuid,
+        CTLFLAG_RD | CTLFLAG_KERN | CTLTYPE_STRING | CTLFLAG_LOCKED,
+        kernelcache_uuid_string, sizeof(kernelcache_uuid_string),
+        sysctl_kernelcacheuuid, "A", "");
+
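As an illustration only (not part of this change), reading the new node from userland could look like the sketch below; the 64-byte buffer size is an assumption, and ENOENT is the expected failure until kernelcache_uuid_valid is set.

#include <stdio.h>
#include <sys/sysctl.h>

int
main(void)
{
	char uuid[64] = { 0 };          /* assumed large enough for a UUID string */
	size_t len = sizeof(uuid);

	if (sysctlbyname("kern.kernelcacheuuid", uuid, &len, NULL, 0) == 0)
		printf("kernelcache UUID: %s\n", uuid);
	else
		perror("kern.kernelcacheuuid");  /* ENOENT while no kernelcache UUID is recorded */
	return 0;
}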
 SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, 
                CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
                &maxfiles, 0, "");
@@ -2017,11 +2045,11 @@ SYSCTL_UINT(_kern, OID_AUTO, vm_page_free_reserved,
 
 SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_percentage,
                CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
-               &vm_page_speculative_percentage, 0, "");
+               &vm_pageout_state.vm_page_speculative_percentage, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, vm_page_speculative_q_age_ms,
                CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
-               &vm_page_speculative_q_age_ms, 0, "");
+               &vm_pageout_state.vm_page_speculative_q_age_ms, 0, "");
 
 SYSCTL_UINT(_kern, OID_AUTO, vm_max_delayed_work_limit,
                CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
@@ -2585,6 +2613,322 @@ sysctl_vm_toggle_address_reuse(__unused struct sysctl_oid *oidp, __unused void *
 
 SYSCTL_PROC(_debug, OID_AUTO, toggle_address_reuse, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_toggle_address_reuse,"I","");
 
+#ifdef CONFIG_XNUPOST
+
+extern int xnupost_export_testdata(void *outp, uint32_t size, uint32_t *lenp);
+extern uint32_t xnupost_get_estimated_testdata_size(void);
+
+extern int xnupost_reset_all_tests(void);
+
+STATIC int
+sysctl_handle_xnupost_get_tests SYSCTL_HANDLER_ARGS
+{
+       /* fixup unused arguments warnings */
+       __unused int _oa2                  = arg2;
+       __unused void * _oa1               = arg1;
+       __unused struct sysctl_oid * _oidp = oidp;
+
+       int error          = 0;
+       user_addr_t oldp   = 0;
+       user_addr_t newp   = 0;
+       uint32_t usedbytes = 0;
+
+       oldp = req->oldptr;
+       newp = req->newptr;
+
+       if (newp)
+               return ENOTSUP;
+
+       if ((void *)oldp == NULL) {
+               /* return estimated size for second call where info can be placed */
+               req->oldidx = xnupost_get_estimated_testdata_size();
+       } else {
+               error       = xnupost_export_testdata((void *)oldp, req->oldlen, &usedbytes);
+               req->oldidx = usedbytes;
+       }
+
+       return error;
+}
+
+SYSCTL_PROC(_debug,
+            OID_AUTO,
+            xnupost_get_tests,
+            CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_LOCKED,
+            0,
+            0,
+            sysctl_handle_xnupost_get_tests,
+            "-",
+            "read xnupost test data in kernel");
+
+STATIC int
+sysctl_debug_xnupost_ctl SYSCTL_HANDLER_ARGS
+{
+       /* fixup unused arguments warnings */
+       __unused int _oa2                  = arg2;
+       __unused void * _oa1               = arg1;
+       __unused struct sysctl_oid * _oidp = oidp;
+
+#define ARRCOUNT 4
+       /*
+        * INPUT: ACTION,  PARAM1, PARAM2, PARAM3
+        * OUTPUT: RESULTCODE, ADDITIONAL DATA
+        */
+       int32_t outval[ARRCOUNT] = {0};
+       int32_t input[ARRCOUNT]  = {0};
+       int32_t out_size         = sizeof(outval);
+       int32_t in_size          = sizeof(input);
+       int error                = 0;
+
+       /* if this is NULL call to find out size, send out size info */
+       if (!req->newptr) {
+               goto out;
+       }
+
+       /* pull in provided value from userspace */
+       error = SYSCTL_IN(req, &input[0], in_size);
+       if (error)
+               return error;
+
+       if (input[0] == XTCTL_RESET_TESTDATA) {
+               outval[0] = xnupost_reset_all_tests();
+               goto out;
+       }
+
+out:
+       error = SYSCTL_OUT(req, &outval[0], out_size);
+       return error;
+}
+
+SYSCTL_PROC(_debug,
+            OID_AUTO,
+            xnupost_testctl,
+            CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
+            0,
+            0,
+            sysctl_debug_xnupost_ctl,
+            "I",
+            "xnupost control for kernel testing");
+
+extern void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t arraycount);
+
+STATIC int
+sysctl_debug_test_oslog_ctl(__unused struct sysctl_oid * oidp, __unused void * arg1, __unused int arg2, struct sysctl_req * req)
+{
+#define ARRCOUNT 4
+       int32_t outval[ARRCOUNT] = {0};
+       int32_t input[ARRCOUNT]  = {0};
+       int32_t size_outval      = sizeof(outval);
+       int32_t size_inval       = sizeof(input);
+       int32_t error;
+
+       /* if this is NULL call to find out size, send out size info */
+       if (!req->newptr) {
+               error = SYSCTL_OUT(req, &outval[0], size_outval);
+               return error;
+       }
+
+       /* pull in provided value from userspace */
+       error = SYSCTL_IN(req, &input[0], size_inval);
+       if (error)
+               return error;
+
+       test_oslog_handleOSLogCtl(input, outval, ARRCOUNT);
+
+       error = SYSCTL_OUT(req, &outval[0], size_outval);
+
+       return error;
+}
+
+SYSCTL_PROC(_debug,
+            OID_AUTO,
+            test_OSLogCtl,
+            CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_LOCKED,
+            0,
+            0,
+            sysctl_debug_test_oslog_ctl,
+            "I",
+            "testing oslog in kernel");
+
+#include <mach/task.h>
+#include <mach/semaphore.h>
+
+extern lck_grp_t * sysctl_debug_test_stackshot_owner_grp; /* used for both mutexes and rwlocks */
+extern lck_mtx_t * sysctl_debug_test_stackshot_owner_init_mtx; /* used to protect lck_*_init */
+
+/* This is a sysctl for testing collection of owner info on a lock in kernel space. A multi-threaded
+ * test from userland sets this sysctl in such a way that a thread blocks in kernel mode, and a
+ * stackshot is taken to see if the owner of the lock can be identified.
+ *
+ * We can't return to userland with a kernel lock held, so be sure to unlock before we leave.
+ * The semaphores allow us to artificially create cases where the lock is being held and the
+ * thread is hanging / taking a long time to do something. */
+
+volatile char      sysctl_debug_test_stackshot_mtx_inited = 0;
+semaphore_t        sysctl_debug_test_stackshot_mutex_sem;
+lck_mtx_t          sysctl_debug_test_stackshot_owner_lck;
+
+#define SYSCTL_DEBUG_MTX_ACQUIRE_WAIT   1
+#define SYSCTL_DEBUG_MTX_ACQUIRE_NOWAIT 2
+#define SYSCTL_DEBUG_MTX_SIGNAL         3
+#define SYSCTL_DEBUG_MTX_TEARDOWN       4
+
+STATIC int
+sysctl_debug_test_stackshot_mutex_owner(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       long long option = -1;
+       /* if the user tries to read the sysctl, we tell them what the address of the lock is (to test against stackshot's output) */
+       long long mtx_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_lck);
+       int error = sysctl_io_number(req, mtx_unslid_addr, sizeof(long long), (void*)&option, NULL);
+
+       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+       if (!sysctl_debug_test_stackshot_mtx_inited) {
+               lck_mtx_init(&sysctl_debug_test_stackshot_owner_lck,
+                               sysctl_debug_test_stackshot_owner_grp,
+                               LCK_ATTR_NULL);
+               semaphore_create(kernel_task,
+                               &sysctl_debug_test_stackshot_mutex_sem,
+                               SYNC_POLICY_FIFO, 0);
+               sysctl_debug_test_stackshot_mtx_inited = 1;
+       }
+       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+
+       if (!error) {
+               switch(option) {
+                       case SYSCTL_DEBUG_MTX_ACQUIRE_NOWAIT:
+                               lck_mtx_lock(&sysctl_debug_test_stackshot_owner_lck);
+                               lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_lck);
+                               break;
+                       case SYSCTL_DEBUG_MTX_ACQUIRE_WAIT:
+                               lck_mtx_lock(&sysctl_debug_test_stackshot_owner_lck);
+                               semaphore_wait(sysctl_debug_test_stackshot_mutex_sem);
+                               lck_mtx_unlock(&sysctl_debug_test_stackshot_owner_lck);
+                               break;
+                       case SYSCTL_DEBUG_MTX_SIGNAL:
+                               semaphore_signal(sysctl_debug_test_stackshot_mutex_sem);
+                               break;
+                       case SYSCTL_DEBUG_MTX_TEARDOWN:
+                               lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+
+                               lck_mtx_destroy(&sysctl_debug_test_stackshot_owner_lck,
+                                               sysctl_debug_test_stackshot_owner_grp);
+                               semaphore_destroy(kernel_task,
+                                               sysctl_debug_test_stackshot_mutex_sem);
+                               sysctl_debug_test_stackshot_mtx_inited = 0;
+
+                               lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+                               break;
+                       case -1: /* user just wanted to read the value, so do nothing */
+                               break;
+                       default:
+                               error = EINVAL;
+                               break;
+               }
+       }
+       return error;
+}
+
+/* We can't return to userland with a kernel rwlock held, so be sure to unlock before we leave.
+ * The semaphores allow us to artificially create cases where the lock is being held and the
+ * thread is hanging / taking a long time to do something. */
+
+SYSCTL_PROC(_debug,
+            OID_AUTO,
+            test_MutexOwnerCtl,
+            CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
+            0,
+            0,
+            sysctl_debug_test_stackshot_mutex_owner,
+            "-",
+            "Testing mutex owner in kernel");
+
+volatile char sysctl_debug_test_stackshot_rwlck_inited = 0;
+lck_rw_t      sysctl_debug_test_stackshot_owner_rwlck;
+semaphore_t   sysctl_debug_test_stackshot_rwlck_sem;
+
+#define SYSCTL_DEBUG_KRWLCK_RACQUIRE_NOWAIT 1
+#define SYSCTL_DEBUG_KRWLCK_RACQUIRE_WAIT   2
+#define SYSCTL_DEBUG_KRWLCK_WACQUIRE_NOWAIT 3
+#define SYSCTL_DEBUG_KRWLCK_WACQUIRE_WAIT   4
+#define SYSCTL_DEBUG_KRWLCK_SIGNAL          5
+#define SYSCTL_DEBUG_KRWLCK_TEARDOWN        6
+
+STATIC int
+sysctl_debug_test_stackshot_rwlck_owner(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       long long option = -1;
+       /* if the user tries to read the sysctl, we tell them what the address of the lock is 
+        * (to test against stackshot's output) */
+       long long rwlck_unslid_addr = (long long)VM_KERNEL_UNSLIDE_OR_PERM(&sysctl_debug_test_stackshot_owner_rwlck);
+       int error = sysctl_io_number(req, rwlck_unslid_addr, sizeof(long long), (void*)&option, NULL);
+
+       lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+       if (!sysctl_debug_test_stackshot_rwlck_inited) {
+               lck_rw_init(&sysctl_debug_test_stackshot_owner_rwlck,
+                               sysctl_debug_test_stackshot_owner_grp,
+                               LCK_ATTR_NULL);
+               semaphore_create(kernel_task,
+                               &sysctl_debug_test_stackshot_rwlck_sem,
+                               SYNC_POLICY_FIFO,
+                               0);
+               sysctl_debug_test_stackshot_rwlck_inited = 1;
+       }
+       lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+
+       if (!error) {
+               switch(option) {
+                       case SYSCTL_DEBUG_KRWLCK_RACQUIRE_NOWAIT:
+                               lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED);
+                               lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED);
+                               break;
+                       case SYSCTL_DEBUG_KRWLCK_RACQUIRE_WAIT:
+                               lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED);
+                               semaphore_wait(sysctl_debug_test_stackshot_rwlck_sem);
+                               lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_SHARED);
+                               break;
+                       case SYSCTL_DEBUG_KRWLCK_WACQUIRE_NOWAIT:
+                               lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE);
+                               lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE);
+                               break;
+                       case SYSCTL_DEBUG_KRWLCK_WACQUIRE_WAIT:
+                               lck_rw_lock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE);
+                               semaphore_wait(sysctl_debug_test_stackshot_rwlck_sem);
+                               lck_rw_unlock(&sysctl_debug_test_stackshot_owner_rwlck, LCK_RW_TYPE_EXCLUSIVE);
+                               break;
+                       case SYSCTL_DEBUG_KRWLCK_SIGNAL:
+                               semaphore_signal(sysctl_debug_test_stackshot_rwlck_sem);
+                               break;
+                       case SYSCTL_DEBUG_KRWLCK_TEARDOWN:
+                               lck_mtx_lock(sysctl_debug_test_stackshot_owner_init_mtx);
+
+                               lck_rw_destroy(&sysctl_debug_test_stackshot_owner_rwlck,
+                                               sysctl_debug_test_stackshot_owner_grp);
+                               semaphore_destroy(kernel_task,
+                                               sysctl_debug_test_stackshot_rwlck_sem);
+                               sysctl_debug_test_stackshot_rwlck_inited = 0;
+
+                               lck_mtx_unlock(sysctl_debug_test_stackshot_owner_init_mtx);
+                               break;
+                       case -1: /* user just wanted to read the value, so do nothing */
+                               break;
+                       default:
+                               error = EINVAL;
+                               break;
+               }
+       }
+       return error;
+}
+
+
+SYSCTL_PROC(_debug,
+            OID_AUTO,
+            test_RWLockOwnerCtl,
+            CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
+            0,
+            0,
+            sysctl_debug_test_stackshot_rwlck_owner,
+            "-",
+            "Testing rwlock owner in kernel");
+#endif /* CONFIG_XNUPOST */
 
 STATIC int
 sysctl_swapusage
@@ -2620,6 +2964,7 @@ SYSCTL_PROC(_vm, VM_SWAPUSAGE, swapusage,
 
 #if CONFIG_FREEZE
 extern void vm_page_reactivate_all_throttled(void);
+extern void memorystatus_disable_freeze(void);
 
 static int
 sysctl_freeze_enabled SYSCTL_HANDLER_ARGS
@@ -2632,7 +2977,7 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS
        if (error || !req->newptr)
                return (error);
 
-       if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
+       if (! VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
                //assert(req->newptr);
                printf("Failed attempt to set vm.freeze_enabled sysctl\n");
                return EINVAL;
@@ -2647,14 +2992,62 @@ sysctl_freeze_enabled SYSCTL_HANDLER_ARGS
        
        if (disabled) {
                vm_page_reactivate_all_throttled();
+               memorystatus_disable_freeze();
        }
        
        return (0);
 }
 
-SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW, &memorystatus_freeze_enabled, 0, sysctl_freeze_enabled, "I", "");
+SYSCTL_PROC(_vm, OID_AUTO, freeze_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_ANYBODY, &memorystatus_freeze_enabled, 0, sysctl_freeze_enabled, "I", "");
 #endif /* CONFIG_FREEZE */
 
+#if DEVELOPMENT || DEBUG
+extern int vm_num_swap_files_config;
+extern int vm_num_swap_files;
+extern lck_mtx_t vm_swap_data_lock;
+#define VM_MAX_SWAP_FILE_NUM           100
+
+static int
+sysctl_vm_config_num_swap_files SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error = 0, val = vm_num_swap_files_config;
+
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || !req->newptr) {
+               goto out;
+       }
+
+       if (!VM_CONFIG_SWAP_IS_ACTIVE && !VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+               printf("Swap is disabled\n");
+               error = EINVAL;
+               goto out;
+       }
+
+       lck_mtx_lock(&vm_swap_data_lock);
+
+       if (val < vm_num_swap_files) {
+               printf("Cannot configure fewer swap files than already exist.\n");
+               error = EINVAL;
+               lck_mtx_unlock(&vm_swap_data_lock);
+               goto out;
+       }
+
+       if (val > VM_MAX_SWAP_FILE_NUM) {
+               printf("Capping number of swap files to upper bound.\n");
+               val = VM_MAX_SWAP_FILE_NUM;
+       }
+
+       vm_num_swap_files_config = val;
+       lck_mtx_unlock(&vm_swap_data_lock);
+out:
+
+       return (error);
+}
+
+SYSCTL_PROC(_debug, OID_AUTO, num_swap_files_configured, CTLFLAG_ANYBODY | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, 0, 0, sysctl_vm_config_num_swap_files, "I", "");
+#endif /* DEVELOPMENT || DEBUG */
+
 /* this kernel does NOT implement shared_region_make_private_np() */
 SYSCTL_INT(_kern, KERN_SHREG_PRIVATIZABLE, shreg_private, 
                CTLFLAG_RD | CTLFLAG_LOCKED, 
@@ -2685,8 +3078,9 @@ fetch_process_cputype(
        }
 
        ret = cpu_type() & ~CPU_ARCH_MASK;
-       if (IS_64BIT_PROCESS(p))
+       if (IS_64BIT_PROCESS(p)) {
                ret |= CPU_ARCH_ABI64;
+       }
 
        *cputype = ret;
        
@@ -2823,10 +3217,16 @@ SYSCTL_INT(_vm, OID_AUTO, vm_copy_src_large, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_ma
 
 
 extern uint32_t        vm_page_external_count;
-extern uint32_t        vm_page_filecache_min;
 
 SYSCTL_INT(_vm, OID_AUTO, vm_page_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_external_count, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_page_filecache_min, 0, "");
+
+SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_filecache_min, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_page_xpmapped_min, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_xpmapped_min, 0, "");
+
+#if DEVELOPMENT || DEBUG
+SYSCTL_INT(_vm, OID_AUTO, vm_page_filecache_min_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_filecache_min_divisor, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_page_xpmapped_min_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_state.vm_page_xpmapped_min_divisor, 0, "");
+#endif
 
 extern int     vm_compressor_mode;
 extern int     vm_compressor_is_active;
@@ -2841,15 +3241,95 @@ extern uint32_t compressor_sample_min_in_msecs;
 extern uint32_t        compressor_sample_max_in_msecs;
 extern uint32_t        compressor_thrashing_threshold_per_10msecs;
 extern uint32_t        compressor_thrashing_min_per_10msecs;
+extern uint32_t vm_compressor_time_thread;
+
+#if DEVELOPMENT || DEBUG
 extern uint32_t        vm_compressor_minorcompact_threshold_divisor;
 extern uint32_t        vm_compressor_majorcompact_threshold_divisor;
 extern uint32_t        vm_compressor_unthrottle_threshold_divisor;
 extern uint32_t        vm_compressor_catchup_threshold_divisor;
-extern uint32_t vm_compressor_time_thread;
-#if DEVELOPMENT || DEBUG
+
+extern uint32_t        vm_compressor_minorcompact_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_majorcompact_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_unthrottle_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_catchup_threshold_divisor_overridden;
+
 extern vmct_stats_t vmct_stats;
+
+
+STATIC int
+sysctl_minorcompact_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value, changed;
+       int error = sysctl_io_number(req, vm_compressor_minorcompact_threshold_divisor, sizeof(int), &new_value, &changed);
+
+       if (changed) {
+               vm_compressor_minorcompact_threshold_divisor = new_value;
+               vm_compressor_minorcompact_threshold_divisor_overridden = 1;
+       }
+       return(error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, compressor_minorcompact_threshold_divisor,
+           CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+           0, 0, sysctl_minorcompact_threshold_divisor, "I", "");
+
+
+STATIC int
+sysctl_majorcompact_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value, changed;
+       int error = sysctl_io_number(req, vm_compressor_majorcompact_threshold_divisor, sizeof(int), &new_value, &changed);
+
+       if (changed) {
+               vm_compressor_majorcompact_threshold_divisor = new_value;
+               vm_compressor_majorcompact_threshold_divisor_overridden = 1;
+       }
+       return(error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, compressor_majorcompact_threshold_divisor,
+           CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+           0, 0, sysctl_majorcompact_threshold_divisor, "I", "");
+
+
+STATIC int
+sysctl_unthrottle_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value, changed;
+       int error = sysctl_io_number(req, vm_compressor_unthrottle_threshold_divisor, sizeof(int), &new_value, &changed);
+
+       if (changed) {
+               vm_compressor_unthrottle_threshold_divisor = new_value;
+               vm_compressor_unthrottle_threshold_divisor_overridden = 1;
+       }
+       return(error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, compressor_unthrottle_threshold_divisor,
+           CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+           0, 0, sysctl_unthrottle_threshold_divisor, "I", "");
+
+
+STATIC int
+sysctl_catchup_threshold_divisor(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value, changed;
+       int error = sysctl_io_number(req, vm_compressor_catchup_threshold_divisor, sizeof(int), &new_value, &changed);
+
+       if (changed) {
+               vm_compressor_catchup_threshold_divisor = new_value;
+               vm_compressor_catchup_threshold_divisor_overridden = 1;
+       }
+       return(error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, compressor_catchup_threshold_divisor,
+           CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+           0, 0, sysctl_catchup_threshold_divisor, "I", "");
 #endif
 
+
 SYSCTL_QUAD(_vm, OID_AUTO, compressor_input_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_input_bytes, "");
 SYSCTL_QUAD(_vm, OID_AUTO, compressor_compressed_bytes, CTLFLAG_RD | CTLFLAG_LOCKED, &c_segment_compressed_bytes, "");
 SYSCTL_QUAD(_vm, OID_AUTO, compressor_bytes_used, CTLFLAG_RD | CTLFLAG_LOCKED, &compressor_bytes_used, "");
@@ -2866,10 +3346,6 @@ SYSCTL_INT(_vm, OID_AUTO, compressor_sample_min_in_msecs, CTLFLAG_RW | CTLFLAG_L
 SYSCTL_INT(_vm, OID_AUTO, compressor_sample_max_in_msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_sample_max_in_msecs, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, compressor_thrashing_threshold_per_10msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_thrashing_threshold_per_10msecs, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, compressor_thrashing_min_per_10msecs, CTLFLAG_RW | CTLFLAG_LOCKED, &compressor_thrashing_min_per_10msecs, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, compressor_minorcompact_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_minorcompact_threshold_divisor, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, compressor_majorcompact_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_majorcompact_threshold_divisor, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, compressor_unthrottle_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_unthrottle_threshold_divisor, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, compressor_catchup_threshold_divisor, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_compressor_catchup_threshold_divisor, 0, "");
 
 SYSCTL_STRING(_vm, OID_AUTO, swapfileprefix, CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED, swapfilename, sizeof(swapfilename) - SWAPFILENAME_INDEX_LEN, "");
 
@@ -2985,8 +3461,6 @@ extern uint32_t   vm_page_background_external_count;
 extern uint32_t        vm_page_background_mode;
 extern uint32_t        vm_page_background_exclude_external;
 extern uint64_t        vm_page_background_promoted_count;
-extern uint64_t vm_pageout_considered_bq_internal;
-extern uint64_t vm_pageout_considered_bq_external;
 extern uint64_t vm_pageout_rejected_bq_internal;
 extern uint64_t vm_pageout_rejected_bq_external;
 
@@ -2998,12 +3472,38 @@ SYSCTL_INT(_vm, OID_AUTO, vm_page_background_internal_count, CTLFLAG_RD | CTLFLA
 SYSCTL_INT(_vm, OID_AUTO, vm_page_background_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_external_count, 0, "");
 
 SYSCTL_QUAD(_vm, OID_AUTO, vm_page_background_promoted_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_background_promoted_count, "");
-SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_considered_bq_internal, "");
-SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_considered_bq_external, "");
+SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_considered_bq_internal, "");
+SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_considered_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_considered_bq_external, "");
 SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_rejected_bq_internal, "");
 SYSCTL_QUAD(_vm, OID_AUTO, vm_pageout_rejected_bq_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_rejected_bq_external, "");
 
-#endif
+#endif /* CONFIG_BACKGROUND_QUEUE */
+
+extern void vm_update_darkwake_mode(boolean_t);
+extern boolean_t vm_darkwake_mode;
+
+STATIC int
+sysctl_toggle_darkwake_mode(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       int new_value, changed;
+       int error = sysctl_io_number(req, vm_darkwake_mode, sizeof(int), &new_value, &changed);
+
+       if ( !error && changed) {
+
+               if (new_value != 0 && new_value != 1) {
+                       printf("Error: Invalid value passed to darkwake sysctl. Acceptable: 0 or 1.\n");
+                       error = EINVAL;
+               } else {
+                       vm_update_darkwake_mode((boolean_t) new_value);
+               }
+       }
+
+       return(error);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, darkwake_mode,
+           CTLTYPE_INT | CTLFLAG_LOCKED | CTLFLAG_RW,
+           0, 0, sysctl_toggle_darkwake_mode, "I", "");
 
 #if (DEVELOPMENT || DEBUG)
 
@@ -3020,11 +3520,9 @@ extern uint32_t vm_pageout_memorystatus_fb_factor_dr;
 SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_nr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_nr, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, vm_pageout_memorystatus_fb_factor_dr, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_memorystatus_fb_factor_dr, 0, "");
 
-extern uint32_t vm_grab_anon_overrides;
-extern uint32_t vm_grab_anon_nops;
 
-SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_overrides, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_grab_anon_nops, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_overrides, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_overrides, 0, "");
+SYSCTL_INT(_vm, OID_AUTO, vm_grab_anon_nops, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_pageout_debug.vm_grab_anon_nops, 0, "");
 
 /* log message counters for persistence mode */
 extern uint32_t oslog_p_total_msgcount;
@@ -3097,6 +3595,29 @@ SYSCTL_STRING(_kern, OID_AUTO, sched,
                          sched_string, sizeof(sched_string),
                          "Timeshare scheduler implementation");
 
+#if CONFIG_QUIESCE_COUNTER
+static int
+sysctl_cpu_quiescent_counter_interval SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+
+       int error = sysctl_handle_int(oidp, &cpu_checkin_min_interval_us, 0, req);
+       if (error || !req->newptr)
+               return error;
+
+       cpu_quiescent_counter_set_min_interval_us(cpu_checkin_min_interval_us);
+
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, cpu_checkin_interval,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+            0, 0,
+            sysctl_cpu_quiescent_counter_interval, "I",
+            "Quiescent CPU checkin interval (microseconds)");
+#endif /* CONFIG_QUIESCE_COUNTER */
+
+
 /*
  * Only support runtime modification on embedded platforms
  * with development config enabled
@@ -3531,6 +4052,10 @@ SYSCTL_PROC(_debug, OID_AUTO, debugger_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFL
 SYSCTL_PROC(_debug, OID_AUTO, spinlock_panic_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_MASKED, 0, 0, sysctl_spinlock_panic_test, "A", "spinlock panic test");
 SYSCTL_PROC(_debug, OID_AUTO, simultaneous_panic_test, CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_MASKED, 0, 0, sysctl_simultaneous_panic_test, "A", "simultaneous panic test");
 
+extern int exc_resource_threads_enabled;
+
+SYSCTL_INT(_kern, OID_AUTO, exc_resource_threads_enabled, CTLFLAG_RD | CTLFLAG_LOCKED, &exc_resource_threads_enabled, 0, "exc_resource thread limit enabled");
+
 
 #endif /* DEVELOPMENT || DEBUG */
 
@@ -3575,3 +4100,427 @@ SYSCTL_PROC(_kern, OID_AUTO, grade_cputype,
             CTLFLAG_RW|CTLFLAG_ANYBODY|CTLFLAG_MASKED|CTLFLAG_LOCKED|CTLTYPE_OPAQUE,
             0, 0, &sysctl_grade_cputype, "S",
             "grade value of cpu_type_t+cpu_sub_type_t");
+
+
+#if DEVELOPMENT || DEBUG
+
+static atomic_int wedge_thread_should_wake = 0;
+
+static int
+unwedge_thread SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+
+       atomic_store(&wedge_thread_should_wake, 1);
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, unwedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, unwedge_thread, "I", "unwedge the thread wedged by kern.wedge_thread");
+
+extern uintptr_t phys_carveout_pa;
+SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_pa, CTLFLAG_RD | CTLFLAG_LOCKED,
+               &phys_carveout_pa,
+               "base physical address of the phys_carveout_mb boot-arg region");
+extern size_t phys_carveout_size;
+SYSCTL_LONG(_kern, OID_AUTO, phys_carveout_size, CTLFLAG_RD | CTLFLAG_LOCKED,
+               &phys_carveout_size,
+               "size in bytes of the phys_carveout_mb boot-arg region");
+
+static int
+wedge_thread SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)     
+       
+       int error, val = 0; 
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error; 
+       }
+       
+       uint64_t interval = 1;
+       nanoseconds_to_absolutetime(1000 * 1000 * 50, &interval);
+
+       atomic_store(&wedge_thread_should_wake, 0);
+       while (!atomic_load(&wedge_thread_should_wake)) {
+               tsleep1(NULL, 0, "wedge_thread", mach_absolute_time()+interval, NULL);
+       }
+       
+       return 0;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, wedge_thread, CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED, 0, 0, wedge_thread, "I", "wedge this thread so it cannot be cleaned up");
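A hypothetical harness for the wedge/unwedge pair (a sketch under the assumption of a DEVELOPMENT || DEBUG kernel; node names come from the two SYSCTL_PROC declarations): one thread wedges itself in the kernel and the main thread releases it.

#include <pthread.h>
#include <unistd.h>
#include <sys/sysctl.h>

static void *
wedge(void *arg)
{
	int one = 1;

	/* Blocks in the handler's tsleep1() loop until kern.unwedge_thread is written. */
	sysctlbyname("kern.wedge_thread", NULL, NULL, &one, sizeof(one));
	return arg;
}

int
main(void)
{
	pthread_t t;
	int one = 1;

	pthread_create(&t, NULL, wedge, NULL);
	sleep(1);               /* crude: give the thread time to wedge itself */
	sysctlbyname("kern.unwedge_thread", NULL, NULL, &one, sizeof(one));
	pthread_join(t, NULL);
	return 0;
}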
+
+static int
+sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS;
+static int
+sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS;
+int
+tstile_test_prim_lock(boolean_t use_hashtable);
+int
+tstile_test_prim_unlock(boolean_t use_hashtable);
+
+#define SYSCTL_TURNSTILE_TEST_DEFAULT                   1
+#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE          2
+
+static int
+sysctl_turnstile_test_prim_lock SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+       boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false;
+       return tstile_test_prim_lock(use_hashtable);
+}
+
+static int
+sysctl_turnstile_test_prim_unlock SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+       boolean_t use_hashtable = (val == SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE) ? true : false;
+       return tstile_test_prim_unlock(use_hashtable);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_lock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_turnstile_test_prim_lock, "I", "turnstiles test lock");
+
+SYSCTL_PROC(_kern, OID_AUTO, turnstiles_test_unlock, CTLFLAG_WR | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_turnstile_test_prim_unlock, "I", "turnstiles test unlock");
+
+int
+turnstile_get_boost_stats_sysctl(void *req);
+int
+turnstile_get_unboost_stats_sysctl(void *req);
+static int
+sysctl_turnstile_boost_stats SYSCTL_HANDLER_ARGS;
+static int
+sysctl_turnstile_unboost_stats SYSCTL_HANDLER_ARGS;
+extern uint64_t thread_block_on_turnstile_count;
+extern uint64_t thread_block_on_regular_waitq_count;
+
+static int
+sysctl_turnstile_boost_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       return turnstile_get_boost_stats_sysctl(req);
+}
+
+static int
+sysctl_turnstile_unboost_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       return turnstile_get_unboost_stats_sysctl(req);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, turnstile_boost_stats, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLTYPE_STRUCT,
+       0, 0, sysctl_turnstile_boost_stats, "S", "turnstiles boost stats");
+SYSCTL_PROC(_kern, OID_AUTO, turnstile_unboost_stats, CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLTYPE_STRUCT,
+       0, 0, sysctl_turnstile_unboost_stats, "S", "turnstiles unboost stats");
+SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_turnstile,
+       CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       &thread_block_on_turnstile_count, "thread blocked on turnstile count");
+SYSCTL_QUAD(_kern, OID_AUTO, thread_block_count_on_reg_waitq,
+       CTLFLAG_RD | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       &thread_block_on_regular_waitq_count, "thread blocked on regular waitq count");
+
+static int
+sysctl_lck_mtx_test_lock SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+
+       if (val == 1) {
+               lck_mtx_test_init();
+               lck_mtx_test_lock();
+       }
+
+       return 0;
+}
+
+static int
+sysctl_lck_mtx_test_unlock SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+
+       if (val == 1) {
+               lck_mtx_test_init();
+               lck_mtx_test_unlock();
+       }
+
+       return 0;
+}
+
+static int
+sysctl_erase_all_test_mtx_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int error, val = 0;
+       error = sysctl_handle_int(oidp, &val, 0, req);
+       if (error || val == 0) {
+               return error;
+       }
+
+       if (val == 1) {
+               lck_mtx_test_init();
+               erase_all_test_mtx_stats();
+       }
+
+       return 0;
+}
+
+static int
+sysctl_get_test_mtx_stats SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       char* buffer;
+       int size, buffer_size, error;
+
+       buffer_size = 1000;
+       buffer = kalloc(buffer_size);
+       if (!buffer)
+               panic("Impossible to allocate memory for %s\n", __func__);
+
+       lck_mtx_test_init();
+
+       size = get_test_mtx_stats_string(buffer, buffer_size);
+
+       error = sysctl_io_string(req, buffer, size, 0, NULL);
+
+       kfree(buffer, buffer_size);
+
+       return error;
+}
+
+static int
+sysctl_test_mtx_uncontended SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       char* buffer;
+       int buffer_size, offset, error, iter;
+       char input_val[40];
+
+       if (!req->newptr) {
+               return 0;
+       }
+
+       if (!req->oldptr) {
+               return EINVAL;
+       }
+
+       if (req->newlen >= sizeof(input_val)) {
+               return EINVAL;
+       }
+
+       error = SYSCTL_IN(req, input_val, req->newlen);
+       if (error) {
+               return error;
+       }
+       input_val[req->newlen] = '\0';
+
+       sscanf(input_val, "%d", &iter);
+
+       if (iter <= 0) {
+               printf("%s requested %d iterations, not starting the test\n", __func__, iter);
+               return EINVAL;
+       }
+
+       lck_mtx_test_init();
+
+       buffer_size = 2000;
+       offset = 0;
+       buffer = kalloc(buffer_size);
+       if (!buffer)
+               panic("Impossible to allocate memory for %s\n", __func__);
+       memset(buffer, 0, buffer_size);
+
+       printf("%s starting uncontended mutex test with %d iterations\n", __func__, iter);
+
+       offset = snprintf(buffer, buffer_size, "STATS INNER LOOP");
+       offset += lck_mtx_test_mtx_uncontended(iter, &buffer[offset], buffer_size - offset);
+
+       offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP");
+       offset += lck_mtx_test_mtx_uncontended_loop_time(iter, &buffer[offset], buffer_size - offset);
+
+       error = SYSCTL_OUT(req, buffer, offset);
+
+       kfree(buffer, buffer_size);
+       return error;
+}
+
+static int
+sysctl_test_mtx_contended SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       char* buffer;
+       int buffer_size, offset, error, iter;
+       char input_val[40];
+
+       printf("%s called\n", __func__);
+
+       if (!req->newptr) {
+               return 0;
+       }
+
+       if (!req->oldptr) {
+               return EINVAL;
+       }
+
+       if (req->newlen >= sizeof(input_val)) {
+               return EINVAL;
+       }
+
+       error = SYSCTL_IN(req, input_val, req->newlen);
+       if (error) {
+               return error;
+       }
+       input_val[req->newlen] = '\0';
+
+       sscanf(input_val, "%d", &iter);
+
+       if (iter <= 0) {
+               printf("%s requested %d iterations, not starting the test\n", __func__, iter);
+               return EINVAL;
+       }
+
+       lck_mtx_test_init();
+
+       erase_all_test_mtx_stats();
+
+       buffer_size = 1000;
+       offset = 0;
+       buffer = kalloc(buffer_size);
+       if (!buffer)
+               panic("Impossible to allocate memory for %s\n", __func__);
+       memset(buffer, 0, buffer_size);
+
+       printf("%s starting contended mutex test with %d iterations\n", __func__, iter);
+
+       offset = snprintf(buffer, buffer_size, "STATS INNER LOOP");
+       offset += lck_mtx_test_mtx_contended(iter, &buffer[offset], buffer_size - offset);
+
+       printf("%s starting contended mutex loop test with %d iterations\n", __func__, iter);
+
+       offset += snprintf(&buffer[offset], buffer_size - offset, "\nSTATS OUTER LOOP");
+       offset += lck_mtx_test_mtx_contended_loop_time(iter, &buffer[offset], buffer_size - offset);
+
+       error = SYSCTL_OUT(req, buffer, offset);
+
+       kfree(buffer, buffer_size);
+
+       return error;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_lock, CTLFLAG_WR | CTLFLAG_MASKED | CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_lck_mtx_test_lock, "I", "lck mtx test lock");
+
+SYSCTL_PROC(_kern, OID_AUTO, lck_mtx_test_unlock, CTLFLAG_WR | CTLFLAG_MASKED |CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_lck_mtx_test_unlock, "I", "lck mtx test unlock");
+
+SYSCTL_PROC(_kern, OID_AUTO, erase_all_test_mtx_stats, CTLFLAG_WR | CTLFLAG_MASKED |CTLFLAG_ANYBODY | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_erase_all_test_mtx_stats, "I", "erase test_mtx statistics");
+
+SYSCTL_PROC(_kern, OID_AUTO, get_test_mtx_stats, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED| CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_get_test_mtx_stats, "A", "get test_mtx statistics");
+
+SYSCTL_PROC(_kern, OID_AUTO, test_mtx_contended, CTLTYPE_STRING | CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_test_mtx_contended, "A", "get statistics for contended mtx test");
+
+SYSCTL_PROC(_kern, OID_AUTO, test_mtx_uncontended, CTLTYPE_STRING | CTLFLAG_MASKED | CTLFLAG_RW | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_test_mtx_uncontended, "A", "get statistics for uncontended mtx test");
+
+#if defined (__x86_64__)
+
+semaphore_t sysctl_test_panic_with_thread_sem;
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Winfinite-recursion" /* rdar://38801963 */
+__attribute__((noreturn))
+static void
+panic_thread_test_child_spin(void * arg, wait_result_t wres)
+{
+       static int panic_thread_recurse_count = 5;
+
+       if (panic_thread_recurse_count > 0) {
+               panic_thread_recurse_count--;
+               panic_thread_test_child_spin(arg, wres);
+       }
+
+       semaphore_signal(sysctl_test_panic_with_thread_sem);
+       while (1) { ; }
+}
+#pragma clang diagnostic pop
+
+static void
+panic_thread_test_child_park(void * arg __unused, wait_result_t wres __unused)
+{
+       int event;
+
+       assert_wait(&event, THREAD_UNINT);
+       semaphore_signal(sysctl_test_panic_with_thread_sem);
+       thread_block(panic_thread_test_child_park);
+}
+
+static int
+sysctl_test_panic_with_thread SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2)
+       int rval = 0;
+       char str[16] = { '\0' };
+       thread_t child_thread = THREAD_NULL;
+
+       rval = sysctl_handle_string(oidp, str, sizeof(str), req);
+       if (rval != 0 || !req->newptr) {
+               return EINVAL;
+       }
+
+       semaphore_create(kernel_task, &sysctl_test_panic_with_thread_sem, SYNC_POLICY_FIFO, 0);
+
+       /* Create thread to spin or park in continuation */
+       if (strncmp("spin", str, strlen("spin")) == 0) {
+               if (kernel_thread_start(panic_thread_test_child_spin, NULL, &child_thread) != KERN_SUCCESS) {
+                       semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem);
+                       return EBUSY;
+               }
+       } else if (strncmp("continuation", str, strlen("continuation")) == 0) {
+               if (kernel_thread_start(panic_thread_test_child_park, NULL, &child_thread) != KERN_SUCCESS) {
+                       semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem);
+                       return EBUSY;
+               }
+       } else {
+               semaphore_destroy(kernel_task, sysctl_test_panic_with_thread_sem);
+               return EINVAL;
+       }
+
+       semaphore_wait(sysctl_test_panic_with_thread_sem);
+
+       panic_with_thread_context(0, NULL, 0, child_thread, "testing panic_with_thread_context for thread %p", child_thread);
+
+       /* Not reached */
+       return EINVAL;
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, test_panic_with_thread, CTLFLAG_MASKED | CTLFLAG_KERN | CTLFLAG_LOCKED | CTLFLAG_WR | CTLTYPE_STRING,
+               0, 0, sysctl_test_panic_with_thread, "A", "test panic flow for backtracing a different thread");
+#endif /* defined (__x86_64__) */
+#endif /* DEVELOPMENT || DEBUG */
index 5ce07288a9f3fa62fb8354956dc508c33f580848..ba2e6c9901c9e3cc42d81c5792593da62c55b693 100644 (file)
@@ -103,6 +103,7 @@ static void         setthetime(
                                        struct timeval  *tv);
 
 void time_zone_slock_init(void);
+static boolean_t timeval_fixusec(struct timeval *t1);
 
 /*
  * Time of day and interval timer support.
@@ -209,8 +210,10 @@ settimeofday(__unused struct proc *p, struct settimeofday_args  *uap, __unused i
        if (uap->tzp && (error = copyin(uap->tzp, (caddr_t)&atz, sizeof(atz))))
                return (error);
        if (uap->tv) {
-               timevalfix(&atv);
-               if (atv.tv_sec < 0 || (atv.tv_sec == 0 && atv.tv_usec < 0))
+               /* only positive values of sec/usec are accepted */
+               if (atv.tv_sec < 0 || atv.tv_usec < 0)
+                       return (EPERM);
+               if (!timeval_fixusec(&atv))
                        return (EPERM);
                setthetime(&atv);
        }
@@ -711,6 +714,22 @@ timevalfix(
        }
 }
 
+static boolean_t
+timeval_fixusec(
+       struct timeval *t1)
+{
+       assert(t1->tv_usec >= 0);
+       assert(t1->tv_sec >= 0);
+
+       if (t1->tv_usec >= 1000000) {
+               if (os_add_overflow(t1->tv_sec, t1->tv_usec / 1000000, &t1->tv_sec))
+                       return FALSE;
+               t1->tv_usec = t1->tv_usec % 1000000;
+       }
+
+       return TRUE;
+}
+
 /*
  * Return the best possible estimate of the time in the timeval
  * to which tvp points.
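To make the normalization in timeval_fixusec concrete, here is a userland restatement (a sketch only; __builtin_add_overflow stands in for the kernel's os_add_overflow): an oversized tv_usec carries whole seconds into tv_sec, and the function fails only when that carry would overflow tv_sec.

#include <assert.h>
#include <stdbool.h>
#include <sys/time.h>

/* Userland restatement of the kernel's timeval_fixusec(), for illustration. */
static bool
fixusec(struct timeval *t)
{
	if (t->tv_usec >= 1000000) {
		if (__builtin_add_overflow(t->tv_sec, t->tv_usec / 1000000, &t->tv_sec))
			return false;
		t->tv_usec %= 1000000;
	}
	return true;
}

int
main(void)
{
	struct timeval tv = { .tv_sec = 10, .tv_usec = 2500000 };

	/* 2,500,000 usec carries 2 seconds: result is 12 sec, 500,000 usec. */
	assert(fixusec(&tv) && tv.tv_sec == 12 && tv.tv_usec == 500000);
	return 0;
}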
index c922c1641894ee97851121a48153fbb060e48531..889ccd6c79f4eca68a92c5d0c56e9704afd3931f 100644 (file)
@@ -115,8 +115,10 @@ reboot(struct proc *p, struct reboot_args *uap, __unused int32_t *retval)
        if (uap->opt & RB_COMMAND)
                 return ENOSYS;
 
-        if (uap->opt & RB_PANIC) {
-               error = copyinstr(uap->command, (void *)message, sizeof(message), (size_t *)&dummy);
+        if (uap->opt & RB_PANIC && uap->msg != USER_ADDR_NULL) {
+               if (copyinstr(uap->msg, (void *)message, sizeof(message), (size_t *)&dummy)) {
+                       strncpy(message, "user space RB_PANIC message copyin failed", sizeof(message)-1);
+               }
         }
 
 #if CONFIG_MACF
@@ -139,7 +141,7 @@ skip_cred_check:
                OSBitOrAtomic(P_REBOOT, &p->p_flag);  /* No more signals for this proc */
                error = reboot_kernel(uap->opt, message);
        }
-       return(error);
+       return error;
 }
 
 int
index 6cb79b10e8bcc66f8b7a159f709ea7e32f4b82a0..2d8e97be26c14f40f090b4c9f287b721621026cb 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -386,7 +386,7 @@ mbuf_freem_list(mbuf_t mbuf)
 size_t
 mbuf_leadingspace(const mbuf_t mbuf)
 {
-       return (m_leadingspace(mbuf));
+       return (M_LEADINGSPACE(mbuf));
 }
 
 /*
@@ -397,7 +397,7 @@ mbuf_leadingspace(const mbuf_t mbuf)
 size_t
 mbuf_trailingspace(const mbuf_t mbuf)
 {
-       return (m_trailingspace(mbuf));
+       return (M_TRAILINGSPACE(mbuf));
 }
 
 /* Manipulation */
@@ -1725,6 +1725,21 @@ get_tx_compl_callback_index(mbuf_tx_compl_func callback)
        return (i);
 }
 
+mbuf_tx_compl_func
+m_get_tx_compl_callback(u_int32_t idx)
+{
+       mbuf_tx_compl_func cb;
+
+       if (idx >= MAX_MBUF_TX_COMPL_FUNC) {
+               ASSERT(0);
+               return (NULL);
+       }
+       lck_rw_lock_shared(mbuf_tx_compl_tbl_lock);
+       cb = mbuf_tx_compl_table[idx];
+       lck_rw_unlock_shared(mbuf_tx_compl_tbl_lock);
+       return (cb);
+}
+
 errno_t
 mbuf_register_tx_compl_callback(mbuf_tx_compl_func callback)
 {
index c63a8d8cde8fcc6b90a0792efc24e803d5db33bb..18d2239baf6899d5e347a50ab66076c498224ef1 100644 (file)
@@ -121,7 +121,8 @@ static const load_result_t load_result_null = {
        .needs_dynlinker = 0,
        .validentry = 0,
        .using_lcmain = 0,
-       .is64bit = 0,
+       .is_64bit_addr = 0,
+       .is_64bit_data = 0,
        .csflags = 0,
        .has_pagezero = 0,
        .uuid = { 0 },
@@ -314,8 +315,8 @@ note_all_image_info_section(const struct segment_command_64 *scp,
  * in exchange for better binary compatibility for legacy apps built
  * before 16KB-alignment was enforced.
  */
-int fourk_binary_compatibility_unsafe = TRUE;
-int fourk_binary_compatibility_allow_wx = FALSE;
+const int fourk_binary_compatibility_unsafe = TRUE;
+const int fourk_binary_compatibility_allow_wx = FALSE;
 #endif /* __arm64__ */
 
 load_return_t
@@ -349,7 +350,8 @@ load_machfile(
                return(LOAD_BADMACHO);
        }
 
-       result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT);
+       result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR);
+       result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA);
 
        task_t ledger_task;
        if (imgp->ip_new_thread) {
@@ -359,14 +361,14 @@ load_machfile(
        }
        pmap = pmap_create(get_task_ledger(ledger_task),
                           (vm_map_size_t) 0,
-                          result->is64bit);
+                          result->is_64bit_addr);
        map = vm_map_create(pmap,
                        0,
-                       vm_compute_max_offset(result->is64bit),
+                       vm_compute_max_offset(result->is_64bit_addr),
                        TRUE);
 
 #if defined(__arm64__)
-       if (result->is64bit) {
+       if (result->is_64bit_addr) {
                /* enforce 16KB alignment of VM map entries */
                vm_map_set_page_shift(map, SIXTEENK_PAGE_SHIFT);
        } else {
@@ -383,8 +385,10 @@ load_machfile(
         * flag (CS_ENFORCEMENT) is not set yet, but we can use the
         * global flag.
         */
-       if ( !cs_enforcement(NULL) && (header->flags & MH_ALLOW_STACK_EXECUTION) )
+       if ( !cs_process_global_enforcement() && (header->flags & MH_ALLOW_STACK_EXECUTION) ) {
                vm_map_disable_NX(map);
+               // TODO: Message Trace or log that this is happening
+       }
 #endif
 
        /* Forcibly disallow execution from data pages on even if the arch
@@ -418,7 +422,8 @@ load_machfile(
        /*
         * re-set the bitness on the load result since we cleared the load result above.
         */
-       result->is64bit = ((imgp->ip_flags & IMGPF_IS_64BIT) == IMGPF_IS_64BIT);
+       result->is_64bit_addr = ((imgp->ip_flags & IMGPF_IS_64BIT_ADDR) == IMGPF_IS_64BIT_ADDR);
+       result->is_64bit_data = ((imgp->ip_flags & IMGPF_IS_64BIT_DATA) == IMGPF_IS_64BIT_DATA);
 
        lret = parse_machfile(vp, map, thread, header, file_offset, macho_size,
                              0, aslr_page_offset, dyld_aslr_page_offset, result,
@@ -433,7 +438,7 @@ load_machfile(
        /*
         * On x86, for compatibility, don't enforce the hard page-zero restriction for 32-bit binaries.
         */
-       if (!result->is64bit) {
+       if (!result->is_64bit_addr) {
                enforce_hard_pagezero = FALSE;
        }
 
@@ -443,7 +448,7 @@ load_machfile(
         */
 #define VM_MAP_HIGH_START_BITS_COUNT 8
 #define VM_MAP_HIGH_START_BITS_SHIFT 27
-       if (result->is64bit &&
+       if (result->is_64bit_addr &&
            (imgp->ip_flags & IMGPF_HIGH_BITS_ASLR)) {
                int random_bits;
                vm_map_offset_t high_start;
@@ -462,7 +467,7 @@ load_machfile(
        if (enforce_hard_pagezero &&
            (vm_map_has_hard_pagezero(map, 0x1000) == FALSE)) {
 #if __arm64__
-               if (!result->is64bit && /* not 64-bit */
+               if (!result->is_64bit_addr && /* not 64-bit address space */
                    !(header->flags & MH_PIE) &&          /* not PIE */
                    (vm_map_page_shift(map) != FOURK_PAGE_SHIFT ||
                     PAGE_SHIFT != FOURK_PAGE_SHIFT) && /* page size != 4KB */
@@ -513,9 +518,9 @@ load_machfile(
                        return (LOAD_FAILURE);
                }
                proc_transcommit(p, 0);
-               workqueue_mark_exiting(p);
+               workq_mark_exiting(p);
                task_complete_halt(task);
-               workqueue_exit(p);
+               workq_exit(p);
 
                /*
                 * Roll up accounting info to new task. The roll up is done after
@@ -527,7 +532,7 @@ load_machfile(
        *mapp = map;
 
 #ifdef CONFIG_32BIT_TELEMETRY
-       if (!result->is64bit) {
+       if (!result->is_64bit_data) {
                /*
                 * This may not need to be an AST; we merely need to ensure that
                 * we gather telemetry at the point where all of the information
@@ -863,7 +868,6 @@ parse_machfile(
                        switch(lcp->cmd) {
                        case LC_SEGMENT: {
                                struct segment_command *scp = (struct segment_command *) lcp;
-
                                if (pass == 0) {
                                        if (is_dyld && scp->vmaddr == 0 && scp->fileoff == 0) {
                                                dyld_no_load_addr = TRUE;
@@ -926,7 +930,6 @@ parse_machfile(
                                                   map,
                                                   slide,
                                                   result);
-
                                if (ret == LOAD_SUCCESS && scp->fileoff == 0 && scp->filesize > 0) {
                                        /* Enforce a single segment mapping offset zero, with R+X
                                         * protection. */
@@ -1052,7 +1055,7 @@ parse_machfile(
                                        /*
                                         * Allow injections to be ignored on devices w/o enforcement enabled
                                         */
-                                       if (!cs_enforcement(NULL))
+                                       if (!cs_process_global_enforcement())
                                            ret = LOAD_SUCCESS; /* ignore error */
 
                                } else {
@@ -1081,7 +1084,7 @@ parse_machfile(
                                                     if (cs_debug)
                                                             printf("CODE SIGNING: %s[%d]: invalid initial page at offset %lld validated:%d tainted:%d csflags:0x%x\n", 
                                                                    vp->v_name, p->p_pid, (long long)(file_offset + off), valid, tainted, result->csflags);
-                                                    if (cs_enforcement(NULL) ||
+                                                    if (cs_process_global_enforcement() ||
                                                         (result->csflags & (CS_HARD|CS_KILL|CS_ENFORCEMENT))) {
                                                             ret = LOAD_FAILURE;
                                                     }
@@ -1133,6 +1136,22 @@ parse_machfile(
                                }
                                break;
 #endif
+#if __arm64__
+                       case LC_VERSION_MIN_IPHONEOS: {
+                               struct version_min_command *vmc;
+
+                               if (pass != 1) {
+                                       break;
+                               }
+                               vmc = (struct version_min_command *) lcp;
+                               if (vmc->sdk < (12 << 16)) {
+                                       /* app built with a pre-iOS12 SDK: apply legacy footprint mitigation */
+                                       result->legacy_footprint = TRUE;
+                               }
+//                             printf("FBDP %s:%d vp %p (%s) sdk %d.%d.%d -> legacy_footprint=%d\n", __FUNCTION__, __LINE__, vp, vp->v_name, (vmc->sdk >> 16), ((vmc->sdk & 0xFF00) >> 8), (vmc->sdk & 0xFF), result->legacy_footprint);
+                               break;
+                       }
+#endif /* __arm64__ */
                        default:
                                /* Other commands are ignored by the kernel */
                                ret = LOAD_SUCCESS;
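
For clarity, the SDK version packing used in the comparison above (the same decoding appears in the commented-out debug printf): the major version sits in the upper 16 bits, the minor in bits 15..8 and the patch in bits 7..0, so the threshold 12 << 16 is SDK 12.0.0. A small decoding sketch, using only the fields shown in this hunk:

        uint32_t sdk_major = vmc->sdk >> 16;            /* e.g. 11 for an iOS 11.4 SDK */
        uint32_t sdk_minor = (vmc->sdk & 0xFF00) >> 8;  /* 4 */
        uint32_t sdk_patch = vmc->sdk & 0xFF;           /* 0 */
        /* 0x000B0400 < 0x000C0000, so such a binary gets legacy_footprint = TRUE */
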
@@ -1146,7 +1165,7 @@ parse_machfile(
        }
 
        if (ret == LOAD_SUCCESS) {
-               if(!got_code_signatures && cs_enforcement(NULL)) {
+               if(!got_code_signatures && cs_process_global_enforcement()) {
                        ret = LOAD_FAILURE;
                }
 
@@ -1168,7 +1187,7 @@ parse_machfile(
                        if (result->thread_count == 0) {
                                ret = LOAD_FAILURE;
                        }
-#if CONFIG_EMBEDDED
+#if CONFIG_ENFORCE_SIGNED_CODE
                        if (result->needs_dynlinker && !(result->csflags & CS_DYLD_PLATFORM)) {
                                ret = LOAD_FAILURE;
                        }
@@ -1308,7 +1327,8 @@ map_segment(
        vm_map_offset_t         file_start,
        vm_map_offset_t         file_end,
        vm_prot_t               initprot,
-       vm_prot_t               maxprot)
+       vm_prot_t               maxprot,
+       load_result_t           *result)
 {
        vm_map_offset_t cur_offset, cur_start, cur_end;
        kern_return_t   ret;
@@ -1410,6 +1430,23 @@ map_segment(
                        /* regular mapping for the middle */
                        cur_vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
                }
+
+#if CONFIG_EMBEDDED
+               (void) result;
+#else /* CONFIG_EMBEDDED */
+               /*
+                * This process doesn't have its new csflags (from
+                * the image being loaded) yet, so tell VM to override the
+                * current process's CS_ENFORCEMENT for this mapping.
+                */
+               if (result->csflags & CS_ENFORCEMENT) {
+                       cur_vmk_flags.vmkf_cs_enforcement = TRUE;
+               } else {
+                       cur_vmk_flags.vmkf_cs_enforcement = FALSE;
+               }
+               cur_vmk_flags.vmkf_cs_enforcement_override = TRUE;
+#endif /* CONFIG_EMBEDDED */
+
                cur_end = vm_map_trunc_page(vm_start + (file_end -
                                                        file_start),
                                            effective_page_mask);
@@ -1785,7 +1822,8 @@ load_segment(
                                  file_start,
                                  file_end,
                                  initprot,
-                                 maxprot);
+                                 maxprot,
+                                 result);
                if (ret) {
                        return LOAD_NOSPACE;
                }
@@ -1843,7 +1881,8 @@ load_segment(
                                 0,
                                 delta_size,
                                 scp->initprot,
-                                scp->maxprot);
+                                scp->maxprot,
+                                result);
                if (kr != KERN_SUCCESS) {
                        return(LOAD_NOSPACE);
                }
@@ -1960,7 +1999,7 @@ load_main(
        }
 
        /* use default location for stack */
-       ret = thread_userstackdefault(&addr, result->is64bit);
+       ret = thread_userstackdefault(&addr, result->is_64bit_addr);
        if (ret != KERN_SUCCESS)
                return(LOAD_FAILURE);
 
@@ -2001,7 +2040,6 @@ load_unixthread(
        load_return_t   ret;
        int customstack =0;
        mach_vm_offset_t addr;
-       
        if (tcp->cmdsize < sizeof(*tcp))
                return (LOAD_BADMACHO);
        if (result->thread_count != 0) {
@@ -2012,15 +2050,15 @@ load_unixthread(
                return (LOAD_SUCCESS);
        
        ret = load_threadstack(thread,
-                      (uint32_t *)(((vm_offset_t)tcp) + 
-                               sizeof(struct thread_command)),
-                      tcp->cmdsize - sizeof(struct thread_command),
-                      &addr, &customstack, result);
+                               (uint32_t *)(((vm_offset_t)tcp) +
+                                       sizeof(struct thread_command)),
+                               tcp->cmdsize - sizeof(struct thread_command),
+                               &addr, &customstack, result);
        if (ret != LOAD_SUCCESS)
                return(ret);
 
        /* LC_UNIXTHREAD optionally specifies stack size and location */
-    
+
        if (!customstack) {
                result->user_stack_alloc_size = MAXSSIZ;
        }
@@ -2030,10 +2068,10 @@ load_unixthread(
        result->user_stack -= slide;
 
        ret = load_threadentry(thread,
-                      (uint32_t *)(((vm_offset_t)tcp) + 
-                               sizeof(struct thread_command)),
-                      tcp->cmdsize - sizeof(struct thread_command),
-                      &addr);
+                               (uint32_t *)(((vm_offset_t)tcp) +
+                                       sizeof(struct thread_command)),
+                               tcp->cmdsize - sizeof(struct thread_command),
+                               &addr);
        if (ret != LOAD_SUCCESS)
                return(ret);
 
@@ -2046,9 +2084,9 @@ load_unixthread(
        result->entry_point += slide;
 
        ret = load_threadstate(thread,
-                      (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)),
-                      tcp->cmdsize - sizeof(struct thread_command),
-                      result);
+                               (uint32_t *)(((vm_offset_t)tcp) + sizeof(struct thread_command)),
+                               tcp->cmdsize - sizeof(struct thread_command),
+                               result);
        if (ret != LOAD_SUCCESS)
                return (ret);
 
@@ -2148,7 +2186,7 @@ load_threadstack(
                 * to the appropriate type in thread_userstack() based on
                 * the value of flavor.
                 */
-               ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack, result->is64bit);
+               ret = thread_userstack(thread, flavor, (thread_state_t)ts, size, user_stack, customstack, result->is_64bit_data);
                if (ret != KERN_SUCCESS) {
                        return(LOAD_FAILURE);
                }
@@ -2304,7 +2342,8 @@ load_dylinker(
                goto novp_out;
 
        *myresult = load_result_null;
-       myresult->is64bit = result->is64bit;
+       myresult->is_64bit_addr = result->is_64bit_addr;
+       myresult->is_64bit_data = result->is_64bit_data;
 
        ret = parse_machfile(vp, map, thread, header, file_offset,
                             macho_size, depth, slide, 0, myresult, result, imgp);
@@ -2373,23 +2412,44 @@ load_code_signature(
        }
 
        blob = ubc_cs_blob_get(vp, cputype, macho_offset);
+
        if (blob != NULL) {
                /* we already have a blob for this vnode and cputype */
-               if (blob->csb_cpu_type == cputype &&
-                   blob->csb_base_offset == macho_offset) {
-                       /* it matches the blob we want here, lets verify the version */
-                       if(0 != ubc_cs_generation_check(vp)) {
-                               if (0 != ubc_cs_blob_revalidate(vp, blob, imgp, 0)) {
-                                       ret = LOAD_FAILURE; /* set error same as from ubc_cs_blob_add */
-                                       goto out;
-                               }
-                       }
-                       ret = LOAD_SUCCESS;
-               } else {
+               if (blob->csb_cpu_type != cputype ||
+                   blob->csb_base_offset != macho_offset) {
                        /* the blob has changed for this vnode: fail ! */
                        ret = LOAD_BADMACHO;
+                       goto out;
                }
-               goto out;
+
+               /* It matches the blob we want here, let's verify the version */
+               if (ubc_cs_generation_check(vp) == 0) {
+                       /* No need to revalidate, we're good! */
+                       ret = LOAD_SUCCESS;
+                       goto out;
+               }
+
+               /* That blob may be stale, let's revalidate. */
+               error = ubc_cs_blob_revalidate(vp, blob, imgp, 0);
+               if (error == 0) {
+                       /* Revalidation succeeded, we're good! */
+                       ret = LOAD_SUCCESS;
+                       goto out;
+               }
+
+               if (error != EAGAIN) {
+                       printf("load_code_signature: revalidation failed: %d\n", error);
+                       ret = LOAD_FAILURE;
+                       goto out;
+               }
+
+               assert(error == EAGAIN);
+
+               /*
+                * Revalidation was not possible for this blob. We just continue as if there was no blob,
+                * rereading the signature, and ubc_cs_blob_add will do the right thing.
+                */
+               blob = NULL;
        }
 
        blob_size = lcp->datasize;
index b564d12017bdd22b2990b232ce993339a5fc0abc..7870e8e84c5ba47086ca303f7ec2e6e7eb6c28dd 100644 (file)
@@ -67,13 +67,16 @@ typedef struct _load_result {
        int                     thread_count;
        unsigned int
                /* boolean_t */ unixproc        :1,
-                               needs_dynlinker : 1,
-                               dynlinker       :1,
-                               validentry      :1,
-                               has_pagezero    :1,
-                               using_lcmain    :1,
-                               is64bit         :1,
-                                               :0;
+                               needs_dynlinker         :1,
+                               dynlinker                       :1,
+                               validentry                      :1,
+                               has_pagezero            :1,
+                               using_lcmain            :1,
+#if __arm64__
+                               legacy_footprint        :1,
+#endif /* __arm64__ */
+                               is_64bit_addr           :1,
+                               is_64bit_data           :1;
        unsigned int            csflags;
        unsigned char           uuid[16];
        mach_vm_address_t       min_vm_addr;
index 52a91f33bee73c74c3852a4dc7adbb91179854cc..527c89c167d4996155eff98635e79b98ac7ffd8a 100644 (file)
@@ -119,7 +119,7 @@ common_hook(void)
        return rv;
 }
 
-#if (MAC_POLICY_OPS_VERSION != 53)
+#if (MAC_POLICY_OPS_VERSION != 55)
 # error "struct mac_policy_ops doesn't match definition in mac_policy.h"
 #endif
 /*
@@ -268,9 +268,9 @@ const static struct mac_policy_ops policy_ops = {
        CHECK_SET_HOOK(proc_check_inherit_ipc_ports)
        CHECK_SET_HOOK(vnode_check_rename)
        CHECK_SET_HOOK(kext_check_query)
-       CHECK_SET_HOOK(iokit_check_nvram_get)
-       CHECK_SET_HOOK(iokit_check_nvram_set)
-       CHECK_SET_HOOK(iokit_check_nvram_delete)
+       CHECK_SET_HOOK(proc_notify_exec_complete)
+       .mpo_reserved5 = (mpo_reserved_hook_t *)common_hook,
+       .mpo_reserved6 = (mpo_reserved_hook_t *)common_hook,
        CHECK_SET_HOOK(proc_check_expose_task)
        CHECK_SET_HOOK(proc_check_set_host_special_port)
        CHECK_SET_HOOK(proc_check_set_host_exception_port)
index 52eaeda02f4dd051cf6f352b738fa1870f843ac5..2de42fa302b92c0868ce817656487b72bee9b61d 100644 (file)
@@ -67,6 +67,7 @@
 #include <mach/task_info.h>
 #include <mach/thread_info.h>
 #include <mach/vm_region.h>
+#include <mach/vm_types.h>
 
 #include <sys/mount_internal.h>
 #include <sys/proc_info.h>
@@ -154,9 +155,9 @@ int __attribute__ ((noinline)) proc_pidfdlist(proc_t p, user_addr_t buffer, uint
 int __attribute__ ((noinline)) proc_pidbsdinfo(proc_t p, struct proc_bsdinfo *pbsd, int zombie);
 int __attribute__ ((noinline)) proc_pidshortbsdinfo(proc_t p, struct proc_bsdshortinfo *pbsd_shortp, int zombie);
 int __attribute__ ((noinline)) proc_pidtaskinfo(proc_t p, struct proc_taskinfo *ptinfo);
-int __attribute__ ((noinline)) proc_pidthreadinfo(proc_t p, uint64_t arg,  int thuniqueid, struct proc_threadinfo *pthinfo);
+int __attribute__ ((noinline)) proc_pidthreadinfo(proc_t p, uint64_t arg,  bool thuniqueid, struct proc_threadinfo *pthinfo);
 int __attribute__ ((noinline)) proc_pidthreadpathinfo(proc_t p, uint64_t arg,  struct proc_threadwithpathinfo *pinfo);
-int __attribute__ ((noinline)) proc_pidlistthreads(proc_t p,  user_addr_t buffer, uint32_t buffersize, int32_t *retval);
+int __attribute__ ((noinline)) proc_pidlistthreads(proc_t p,  bool thuniqueid, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
 int __attribute__ ((noinline)) proc_pidregioninfo(proc_t p, uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
 int __attribute__ ((noinline)) proc_pidregionpathinfo(proc_t p,  uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
 int __attribute__ ((noinline)) proc_pidregionpathinfo2(proc_t p,  uint64_t arg, user_addr_t buffer, uint32_t buffersize, int32_t *retval);
@@ -796,7 +797,7 @@ proc_pidtaskinfo(proc_t p, struct proc_taskinfo * ptinfo)
 
 
 int 
-proc_pidthreadinfo(proc_t p, uint64_t arg,  int thuniqueid, struct proc_threadinfo *pthinfo)
+proc_pidthreadinfo(proc_t p, uint64_t arg, bool thuniqueid, struct proc_threadinfo *pthinfo)
 {
        int error = 0;
        uint64_t threadaddr = (uint64_t)arg;
@@ -926,7 +927,7 @@ proc_pidthreadpathinfo(proc_t p, uint64_t arg,  struct proc_threadwithpathinfo *
 
 
 int 
-proc_pidlistthreads(proc_t p,  user_addr_t buffer, uint32_t  buffersize, int32_t *retval)
+proc_pidlistthreads(proc_t p, bool thuniqueid, user_addr_t buffer, uint32_t  buffersize, int32_t *retval)
 {
        uint32_t count = 0;
        int ret = 0;
@@ -950,7 +951,7 @@ proc_pidlistthreads(proc_t p,  user_addr_t buffer, uint32_t  buffersize, int32_t
                return(ENOMEM);
        bzero(kbuf, numthreads * sizeof(uint64_t));
        
-       ret = fill_taskthreadlist(p->task, kbuf, numthreads);
+       ret = fill_taskthreadlist(p->task, kbuf, numthreads, thuniqueid);
        
        error = copyout(kbuf, buffer, ret);
        kfree(kbuf, numthreads * sizeof(uint64_t));
@@ -1357,7 +1358,7 @@ proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t  buffer
 
        switch (flavor) {
                case PROC_PIDORIGINATOR_UUID: {
-                       uuid_t uuid;
+                       uuid_t uuid = {};
 
                        error = proc_pidoriginatoruuid(uuid, sizeof(uuid));
                        if (error != 0)
@@ -1385,7 +1386,7 @@ proc_pidoriginatorinfo(int pid, int flavor, user_addr_t buffer, uint32_t  buffer
                break;
 
                case PROC_PIDORIGINATOR_BGSTATE: {
-                       uint32_t is_backgrounded;
+                       uint32_t is_backgrounded = 0;
                        error = proc_get_originatorbgstate(&is_backgrounded);
                        if (error)
                                goto out;
@@ -1684,7 +1685,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
        int shortversion = 0;
        uint32_t size;
        int zombie = 0;
-       int thuniqueid = 0;
+       bool thuniqueid = false;
        int uniqidversion = 0;
        boolean_t check_same_user;
 
@@ -1706,6 +1707,9 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                case PROC_PIDTHREADINFO:
                        size = PROC_PIDTHREADINFO_SIZE;
                        break;
+               case PROC_PIDLISTTHREADIDS:
+                       size = PROC_PIDLISTTHREADIDS_SIZE;
+                       break;
                case PROC_PIDLISTTHREADS:
                        size = PROC_PIDLISTTHREADS_SIZE;
                        break;
@@ -1788,6 +1792,12 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                                size = 0;
                        }
                        break;
+               case PROC_PIDVMRTFAULTINFO:
+                       size = sizeof(vm_rtfault_record_t);
+                       if (buffer == USER_ADDR_NULL) {
+                               size = 0;
+                       }
+                       break;
                default:
                        return(EINVAL);
        }
@@ -1917,7 +1927,7 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                break;
 
                case PROC_PIDTHREADID64INFO:
-                       thuniqueid = 1;
+                       thuniqueid = true;
                case PROC_PIDTHREADINFO:{
                struct proc_threadinfo pthinfo;
 
@@ -1930,8 +1940,10 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                }
                break;
 
+               case PROC_PIDLISTTHREADIDS:
+                       thuniqueid = true;
                case PROC_PIDLISTTHREADS:{
-                       error =  proc_pidlistthreads(p,  buffer, buffersize, retval);
+                       error =  proc_pidlistthreads(p, thuniqueid, buffer, buffersize, retval);
                }
                break;
 
@@ -2070,7 +2082,48 @@ proc_pidinfo(int pid, int flavor, uint64_t arg, user_addr_t buffer, uint32_t  bu
                case PROC_PIDLISTDYNKQUEUES:
                        error = kevent_copyout_proc_dynkqids(p, buffer, buffersize, retval);
                        break;
+               case PROC_PIDVMRTFAULTINFO: {
+                       /* This interface can only be employed on the current
+                        * process. We will eventually enforce an entitlement.
+                        */
+                       *retval = 0;
+
+                       if (p != current_proc()) {
+                               error = EINVAL;
+                               break;
+                       }
+
+                       size_t kbufsz = MIN(buffersize, vmrtfaultinfo_bufsz());
+                       void *vmrtfbuf = kalloc(kbufsz);
+
+                       if (vmrtfbuf == NULL) {
+                               error = ENOMEM;
+                               break;
+                       }
+
+                       bzero(vmrtfbuf, kbufsz);
+
+                       uint64_t effpid = get_current_unique_pid();
+                       /* The VM may choose to provide more comprehensive records
+                        * for root-privileged users on internal configurations.
+                        */
+                       boolean_t isroot = (suser(kauth_cred_get(), (u_short *)0) == 0);
+                       int vmf_residue = vmrtf_extract(effpid, isroot, kbufsz, vmrtfbuf, retval);
+                       int vmfsz = *retval * sizeof(vm_rtfault_record_t);
+
+                       error = 0;
+                       if (vmfsz) {
+                               error = copyout(vmrtfbuf, buffer, vmfsz);
+                       }
 
+                       if (error == 0) {
+                               if (vmf_residue) {
+                                       error = ENOMEM;
+                               }
+                       }
+                       kfree(vmrtfbuf, kbufsz);
+               }
+                       break;
                default:
                        error = ENOTSUP;
                        break;
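
A hypothetical userspace sketch (not part of this diff) of the new PROC_PIDLISTTHREADIDS flavor, assuming it is exported to user level through <sys/proc_info.h> and libproc's proc_pidinfo(); the related PROC_PIDVMRTFAULTINFO flavor is limited to the calling process, as the comment above notes:

        #include <libproc.h>
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>

        int main(void)
        {
                uint64_t tids[128];
                /* proc_pidinfo returns the number of bytes copied out */
                int nbytes = proc_pidinfo(getpid(), PROC_PIDLISTTHREADIDS, 0,
                                          tids, sizeof(tids));
                for (int i = 0; i < nbytes / (int)sizeof(uint64_t); i++)
                        printf("thread id %llu\n", (unsigned long long)tids[i]);
                return 0;
        }
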
diff --git a/bsd/kern/pthread_shims.c b/bsd/kern/pthread_shims.c
deleted file mode 100644 (file)
index 66fa1d7..0000000
+++ /dev/null
@@ -1,734 +0,0 @@
-/*
- * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-#define PTHREAD_INTERNAL 1
-
-#include <stdatomic.h>
-#include <kern/debug.h>
-#include <kern/mach_param.h>
-#include <kern/sched_prim.h>
-#include <kern/task.h>
-#include <kern/thread.h>
-#include <kern/affinity.h>
-#include <kern/zalloc.h>
-#include <kern/policy_internal.h>
-
-#include <machine/machine_routines.h>
-#include <mach/task.h>
-#include <mach/thread_act.h>
-#include <sys/param.h>
-#include <sys/eventvar.h>
-#include <sys/pthread_shims.h>
-#include <sys/proc_info.h>
-#include <sys/proc_internal.h>
-#include <sys/sysproto.h>
-#include <sys/systm.h>
-#include <vm/vm_map.h>
-#include <vm/vm_protos.h>
-#include <kern/kcdata.h>
-
-/* version number of the in-kernel shims given to pthread.kext */
-#define PTHREAD_SHIMS_VERSION 1
-
-/* on arm, the callbacks structure has two #ifdef arm pointers */
-#if defined(__arm__)
-#define PTHREAD_CALLBACK_MEMBER map_is_1gb
-#else
-#define PTHREAD_CALLBACK_MEMBER ml_get_max_cpus
-#endif
-
-/* compile time asserts to check the length of structures in pthread_shims.h */
-static_assert((sizeof(struct pthread_functions_s) - offsetof(struct pthread_functions_s, psynch_rw_yieldwrlock) - sizeof(void*)) == (sizeof(void*) * 100));
-static_assert((sizeof(struct pthread_callbacks_s) - offsetof(struct pthread_callbacks_s, PTHREAD_CALLBACK_MEMBER) - sizeof(void*)) == (sizeof(void*) * 100));
-
-/* old pthread code had definitions for these as they don't exist in headers */
-extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
-extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
-
-#define PTHREAD_STRUCT_ACCESSOR(get, set, rettype, structtype, member) \
-       static rettype \
-       get(structtype x) { \
-               return (x)->member; \
-       } \
-       static void \
-       set(structtype x, rettype y) { \
-               (x)->member = y; \
-       }
-
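
As a rough illustration (not in the diff), each PTHREAD_STRUCT_ACCESSOR invocation below expands to a trivial getter/setter pair over the named proc/uthread field; for example the p_pthsize accessor becomes approximately:

        static int
        proc_get_pthsize(struct proc *x) {
                return (x)->p_pthsize;
        }
        static void
        proc_set_pthsize(struct proc *x, int y) {
                (x)->p_pthsize = y;
        }
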
-PTHREAD_STRUCT_ACCESSOR(proc_get_threadstart, proc_set_threadstart, user_addr_t, struct proc*, p_threadstart);
-PTHREAD_STRUCT_ACCESSOR(proc_get_pthsize, proc_set_pthsize, int, struct proc*, p_pthsize);
-PTHREAD_STRUCT_ACCESSOR(proc_get_wqthread, proc_set_wqthread, user_addr_t, struct proc*, p_wqthread);
-PTHREAD_STRUCT_ACCESSOR(proc_get_stack_addr_hint, proc_set_stack_addr_hint, user_addr_t, struct proc *, p_stack_addr_hint);
-PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_offset, proc_set_dispatchqueue_offset, uint64_t, struct proc*, p_dispatchqueue_offset);
-PTHREAD_STRUCT_ACCESSOR(proc_get_dispatchqueue_serialno_offset, proc_set_dispatchqueue_serialno_offset, uint64_t, struct proc*, p_dispatchqueue_serialno_offset);
-PTHREAD_STRUCT_ACCESSOR(proc_get_pthread_tsd_offset, proc_set_pthread_tsd_offset, uint32_t, struct proc *, p_pth_tsd_offset);
-PTHREAD_STRUCT_ACCESSOR(proc_get_mach_thread_self_tsd_offset, proc_set_mach_thread_self_tsd_offset, uint64_t, struct proc *, p_mach_thread_self_offset);
-PTHREAD_STRUCT_ACCESSOR(proc_get_pthhash, proc_set_pthhash, void*, struct proc*, p_pthhash);
-PTHREAD_STRUCT_ACCESSOR(proc_get_return_to_kernel_offset, proc_set_return_to_kernel_offset, uint64_t, struct proc*, p_return_to_kernel_offset);
-PTHREAD_STRUCT_ACCESSOR(proc_get_user_stack, proc_set_user_stack, user_addr_t, struct proc*, user_stack);
-
-PTHREAD_STRUCT_ACCESSOR(uthread_get_threadlist, uthread_set_threadlist, void*, struct uthread*, uu_threadlist);
-PTHREAD_STRUCT_ACCESSOR(uthread_get_sigmask, uthread_set_sigmask, sigset_t, struct uthread*, uu_sigmask);
-PTHREAD_STRUCT_ACCESSOR(uthread_get_returnval, uthread_set_returnval, int, struct uthread*, uu_rval[0]);
-
-#define WQPTR_IS_INITING_VALUE ((void *)~(uintptr_t)0)
-
-static void *
-proc_get_wqptr(struct proc *p) {
-       void *wqptr =  p->p_wqptr;
-       return (wqptr == WQPTR_IS_INITING_VALUE) ? NULL : wqptr;
-}
-static void
-proc_set_wqptr(struct proc *p, void *y) {
-       proc_lock(p);
-
-       assert(y == NULL || p->p_wqptr == WQPTR_IS_INITING_VALUE);
-
-       p->p_wqptr = y;
-
-       if (y != NULL){
-               wakeup(&p->p_wqptr);
-       }
-
-       proc_unlock(p);
-}
-static boolean_t
-proc_init_wqptr_or_wait(struct proc *p) {
-       proc_lock(p);
-
-       if (p->p_wqptr == NULL){
-               p->p_wqptr = WQPTR_IS_INITING_VALUE;
-               proc_unlock(p);
-
-               return TRUE;
-       } else if (p->p_wqptr == WQPTR_IS_INITING_VALUE){
-               assert_wait(&p->p_wqptr, THREAD_UNINT);
-               proc_unlock(p);
-               thread_block(THREAD_CONTINUE_NULL);
-
-               return FALSE;
-       } else {
-               proc_unlock(p);
-
-               return FALSE;
-       }
-}
-
-__attribute__((noreturn))
-static void
-pthread_returning_to_userspace(void)
-{
-       thread_exception_return();
-}
-
-__attribute__((noreturn))
-static void
-pthread_bootstrap_return(void)
-{
-       thread_bootstrap_return();
-}
-
-static uint32_t
-get_task_threadmax(void) {
-       return task_threadmax;
-}
-
-static task_t
-proc_get_task(struct proc *p) {
-       return p->task;
-}
-
-static uint64_t
-proc_get_register(struct proc *p) {
-       return (p->p_lflag & P_LREGISTER);
-}
-
-static void
-proc_set_register(struct proc *p) {
-       proc_setregister(p);
-}
-
-static void*
-uthread_get_uukwe(struct uthread *t)
-{
-       return &t->uu_kevent.uu_kwe;
-}
-
-static int
-uthread_is_cancelled(struct uthread *t)
-{
-       return (t->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL;
-}
-
-static vm_map_t
-_current_map(void)
-{
-       return current_map();
-}
-
-static boolean_t
-qos_main_thread_active(void)
-{
-       return TRUE;
-}
-
-#if defined(__arm__)
-/* On iOS, the stack placement depends on the address space size */
-static uint32_t
-map_is_1gb(vm_map_t map)
-{
-       return ((!vm_map_is_64bit(map)) && (get_map_max(map) == ml_get_max_offset(FALSE, MACHINE_MAX_OFFSET_MIN)));
-}
-#endif
-
-static int proc_usynch_get_requested_thread_qos(struct uthread *uth)
-{
-       thread_t        thread = uth ? uth->uu_thread : current_thread();
-       int                     requested_qos;
-
-       requested_qos = proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS);
-
-       /*
-        * For the purposes of userspace synchronization, it doesn't make sense to
-        * place an override of UNSPECIFIED on another thread, if the current thread
-        * doesn't have any QoS set. In these cases, upgrade to
-        * THREAD_QOS_USER_INTERACTIVE.
-        */
-       if (requested_qos == THREAD_QOS_UNSPECIFIED) {
-               requested_qos = THREAD_QOS_USER_INTERACTIVE;
-       }
-
-       return requested_qos;
-}
-
-static int
-proc_usynch_thread_qos_add_override_for_resource_check_owner(thread_t thread,
-               int override_qos, boolean_t first_override_for_resource,
-               user_addr_t resource, int resource_type,
-               user_addr_t user_lock_addr, mach_port_name_t user_lock_owner)
-{
-       return proc_thread_qos_add_override_check_owner(thread, override_qos,
-                       first_override_for_resource, resource, resource_type,
-                       user_lock_addr, user_lock_owner);
-}
-
-static boolean_t
-proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *uth,
-               uint64_t tid, int override_qos, boolean_t first_override_for_resource,
-               user_addr_t resource, int resource_type)
-{
-       thread_t thread = uth ? uth->uu_thread : THREAD_NULL;
-
-       return proc_thread_qos_add_override(task, thread, tid, override_qos,
-                       first_override_for_resource, resource, resource_type);
-}
-
-static boolean_t
-proc_usynch_thread_qos_remove_override_for_resource(task_t task,
-               struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type)
-{
-       thread_t thread = uth ? uth->uu_thread : THREAD_NULL;
-
-       return proc_thread_qos_remove_override(task, thread, tid, resource, resource_type);
-}
-
-static boolean_t
-proc_usynch_thread_qos_reset_override_for_resource(task_t task,
-               struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type)
-{
-       thread_t thread = uth ? uth->uu_thread : THREAD_NULL;
-
-       return proc_thread_qos_reset_override(task, thread, tid, resource, resource_type);
-}
-
-static boolean_t
-proc_usynch_thread_qos_squash_override_for_resource(thread_t thread,
-               user_addr_t resource, int resource_type)
-{
-       return proc_thread_qos_squash_override(thread, resource, resource_type);
-}
-
-/* kernel (core) to kext shims */
-
-void
-pthread_init(void)
-{
-       if (!pthread_functions) {
-               panic("pthread kernel extension not loaded (function table is NULL).");
-       }
-       pthread_functions->pthread_init();
-}
-
-int
-fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
-{
-       return pthread_functions->fill_procworkqueue(p, pwqinfo);
-}
-
-/*
- * Returns true if the workqueue flags are available, and will fill
- * in exceeded_total and exceeded_constrained.
- */
-boolean_t
-workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
-                           boolean_t *exceeded_constrained)
-{
-       proc_t p = v;
-       struct proc_workqueueinfo pwqinfo;
-       int err;
-
-       assert(p != NULL);
-       assert(exceeded_total != NULL);
-       assert(exceeded_constrained != NULL);
-
-       err = fill_procworkqueue(p, &pwqinfo);
-       if (err) {
-               return FALSE;
-       }
-       if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
-               return FALSE;
-       }
-
-       *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
-       *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
-
-       return TRUE;
-}
-
-uint32_t
-workqueue_get_pwq_state_kdp(void * v)
-{
-       static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) == kTaskWqExceededConstrainedThreadLimit);
-       static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) == kTaskWqExceededTotalThreadLimit);
-       static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
-       static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT | WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);
-       proc_t p = v;
-       if (pthread_functions == NULL || pthread_functions->get_pwq_state_kdp == NULL)
-               return 0;
-       else
-               return pthread_functions->get_pwq_state_kdp(p);
-}
-
-void
-workqueue_exit(struct proc *p)
-{
-       pthread_functions->workqueue_exit(p);
-}
-
-void
-workqueue_mark_exiting(struct proc *p)
-{
-       pthread_functions->workqueue_mark_exiting(p);
-}
-
-void
-workqueue_thread_yielded(void)
-{
-       pthread_functions->workqueue_thread_yielded();
-}
-
-sched_call_t
-workqueue_get_sched_callback(void)
-{
-       if (pthread_functions->workqueue_get_sched_callback) {
-               return pthread_functions->workqueue_get_sched_callback();
-       }
-       return NULL;
-}
-
-void
-pth_proc_hashinit(proc_t p)
-{
-       pthread_functions->pth_proc_hashinit(p);
-}
-
-void
-pth_proc_hashdelete(proc_t p)
-{
-       pthread_functions->pth_proc_hashdelete(p);
-}
-
-/* syscall shims */
-int
-bsdthread_create(struct proc *p, struct bsdthread_create_args *uap, user_addr_t *retval)
-{
-       return pthread_functions->bsdthread_create(p, uap->func, uap->func_arg, uap->stack, uap->pthread, uap->flags, retval);
-}
-
-int
-bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused int32_t *retval)
-{
-       if (pthread_functions->version >= 1) {
-               return pthread_functions->bsdthread_register2(p, uap->threadstart, uap->wqthread,
-                                                                                                         uap->flags, uap->stack_addr_hint, 
-                                                                                                         uap->targetconc_ptr, uap->dispatchqueue_offset,
-                                                                                                         uap->tsd_offset, retval);             
-       } else {
-               return pthread_functions->bsdthread_register(p, uap->threadstart, uap->wqthread,
-                                                                                                        uap->flags, uap->stack_addr_hint,
-                                                                                                        uap->targetconc_ptr, uap->dispatchqueue_offset,
-                                                                                                        retval);
-       }
-}
-
-int
-bsdthread_terminate(struct proc *p, struct bsdthread_terminate_args *uap, int32_t *retval)
-{
-       return pthread_functions->bsdthread_terminate(p, uap->stackaddr, uap->freesize, uap->port, uap->sem, retval);
-}
-
-int
-bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
-{
-    return pthread_functions->bsdthread_ctl(p, uap->cmd, uap->arg1, uap->arg2, uap->arg3, retval);
-}
-
-
-int
-thread_selfid(struct proc *p, __unused struct thread_selfid_args *uap, uint64_t *retval)
-{
-       return pthread_functions->thread_selfid(p, retval);
-}
-
-int
-workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
-{
-       return pthread_functions->workq_kernreturn(p, uap->options, uap->item, uap->affinity, uap->prio, retval);
-}
-
-int
-workq_open(struct proc *p, __unused struct workq_open_args  *uap, int32_t *retval)
-{
-       return pthread_functions->workq_open(p, retval);
-}
-
-/* pthread synchroniser syscalls */
-
-int
-psynch_mutexwait(proc_t p, struct psynch_mutexwait_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_mutexwait(p, uap->mutex, uap->mgen, uap->ugen, uap->tid, uap->flags, retval);
-}
-
-int
-psynch_mutexdrop(proc_t p, struct psynch_mutexdrop_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_mutexdrop(p, uap->mutex, uap->mgen, uap->ugen, uap->tid, uap->flags, retval);
-}
-
-int
-psynch_cvbroad(proc_t p, struct psynch_cvbroad_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_cvbroad(p, uap->cv, uap->cvlsgen, uap->cvudgen, uap->flags, uap->mutex, uap->mugen, uap->tid, retval);
-}
-
-int
-psynch_cvsignal(proc_t p, struct psynch_cvsignal_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_cvsignal(p, uap->cv, uap->cvlsgen, uap->cvugen, uap->thread_port, uap->mutex, uap->mugen, uap->tid, uap->flags, retval);
-}
-
-int
-psynch_cvwait(proc_t p, struct psynch_cvwait_args * uap, uint32_t * retval)
-{
-       return pthread_functions->psynch_cvwait(p, uap->cv, uap->cvlsgen, uap->cvugen, uap->mutex, uap->mugen, uap->flags, uap->sec, uap->nsec, retval);
-}
-
-int
-psynch_cvclrprepost(proc_t p, struct psynch_cvclrprepost_args * uap, int *retval)
-{
-       return pthread_functions->psynch_cvclrprepost(p, uap->cv, uap->cvgen, uap->cvugen, uap->cvsgen, uap->prepocnt, uap->preposeq, uap->flags, retval);
-}
-
-int
-psynch_rw_longrdlock(proc_t p, struct psynch_rw_longrdlock_args * uap,  uint32_t *retval)
-{
-       return pthread_functions->psynch_rw_longrdlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
-}
-
-int
-psynch_rw_rdlock(proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t * retval)
-{
-       return pthread_functions->psynch_rw_rdlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
-}
-
-int
-psynch_rw_unlock(proc_t p, struct psynch_rw_unlock_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_rw_unlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
-}
-
-int
-psynch_rw_unlock2(__unused proc_t p, __unused struct psynch_rw_unlock2_args *uap, __unused uint32_t *retval)
-{
-       return ENOTSUP;
-}
-
-int
-psynch_rw_wrlock(proc_t p, struct psynch_rw_wrlock_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_rw_wrlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
-}
-
-int
-psynch_rw_yieldwrlock(proc_t p, struct psynch_rw_yieldwrlock_args *uap, uint32_t *retval)
-{
-       return pthread_functions->psynch_rw_yieldwrlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
-}
-
-int
-psynch_rw_upgrade(__unused proc_t p, __unused struct psynch_rw_upgrade_args * uap, __unused uint32_t *retval)
-{
-       return 0;
-}
-
-int
-psynch_rw_downgrade(__unused proc_t p, __unused struct psynch_rw_downgrade_args * uap, __unused int *retval)
-{
-       return 0;
-}
-
-int
-thread_qos_from_pthread_priority(unsigned long priority, unsigned long *flags)
-{
-       return pthread_functions->thread_qos_from_pthread_priority(priority, flags);
-}
-
-unsigned long
-pthread_priority_canonicalize(unsigned long priority, boolean_t propagation)
-{
-       return pthread_functions->pthread_priority_canonicalize2(priority, propagation);
-}
-
-boolean_t
-workq_thread_has_been_unbound(thread_t th, int qos_class)
-{
-       if (pthread_functions->workq_thread_has_been_unbound) {
-               return pthread_functions->workq_thread_has_been_unbound(th, qos_class);
-       } else {
-               panic("pthread kext does not support workq_thread_has_been_unbound");
-               return false;
-       }
-}
-
-void
-kdp_pthread_find_owner(thread_t thread, struct stackshot_thread_waitinfo *waitinfo)
-{
-       if (pthread_functions->pthread_find_owner)
-               pthread_functions->pthread_find_owner(thread, waitinfo);
-}
-
-void *
-kdp_pthread_get_thread_kwq(thread_t thread)
-{
-       if (pthread_functions->pthread_get_thread_kwq)
-               return pthread_functions->pthread_get_thread_kwq(thread);
-
-       return NULL;
-}
-
-static void
-thread_will_park_or_terminate(thread_t thread)
-{
-       if (thread_owned_workloops_count(thread)) {
-               (void)kevent_exit_on_workloop_ownership_leak(thread);
-       }
-}
-
-#if defined(__arm64__)
-static unsigned __int128
-atomic_fetch_add_128_relaxed(_Atomic unsigned __int128 *ptr, unsigned __int128 value)
-{
-       return atomic_fetch_add_explicit(ptr, value, memory_order_relaxed);
-}
-
-static unsigned __int128
-atomic_load_128_relaxed(_Atomic unsigned __int128 *ptr)
-{
-       return atomic_load_explicit(ptr, memory_order_relaxed);
-}
-#endif
-
-/*
- * The callbacks structure (defined in pthread_shims.h) contains a collection
- * of kernel functions that were not deemed sensible to expose as a KPI to all
- * kernel extensions. So the kext is given them in the form of a structure of
- * function pointers.
- */
-static const struct pthread_callbacks_s pthread_callbacks = {
-       .version = PTHREAD_SHIMS_VERSION,
-       .config_thread_max = CONFIG_THREAD_MAX,
-       .get_task_threadmax = get_task_threadmax,
-
-       .proc_get_threadstart = proc_get_threadstart,
-       .proc_set_threadstart = proc_set_threadstart,
-       .proc_get_pthsize = proc_get_pthsize,
-       .proc_set_pthsize = proc_set_pthsize,
-       .proc_get_wqthread = proc_get_wqthread,
-       .proc_set_wqthread = proc_set_wqthread,
-       .proc_get_dispatchqueue_offset = proc_get_dispatchqueue_offset,
-       .proc_set_dispatchqueue_offset = proc_set_dispatchqueue_offset,
-       .proc_get_wqptr = proc_get_wqptr,
-       .proc_set_wqptr = proc_set_wqptr,
-       .proc_get_pthhash = proc_get_pthhash,
-       .proc_set_pthhash = proc_set_pthhash,
-       .proc_get_task = proc_get_task,
-       .proc_lock = proc_lock,
-       .proc_unlock = proc_unlock,
-       .proc_get_register = proc_get_register,
-       .proc_set_register = proc_set_register,
-
-       /* kernel IPI interfaces */
-       .ipc_port_copyout_send = ipc_port_copyout_send,
-       .task_get_ipcspace = get_task_ipcspace,
-       .vm_map_page_info = vm_map_page_info,
-       .vm_map_switch = vm_map_switch,
-       .thread_set_wq_state32 = thread_set_wq_state32,
-#if !defined(__arm__)
-       .thread_set_wq_state64 = thread_set_wq_state64,
-#endif
-
-       .uthread_get_threadlist = uthread_get_threadlist,
-       .uthread_set_threadlist = uthread_set_threadlist,
-       .uthread_get_sigmask = uthread_get_sigmask,
-       .uthread_set_sigmask = uthread_set_sigmask,
-       .uthread_get_uukwe = uthread_get_uukwe,
-       .uthread_get_returnval = uthread_get_returnval,
-       .uthread_set_returnval = uthread_set_returnval,
-       .uthread_is_cancelled = uthread_is_cancelled,
-
-       .thread_exception_return = pthread_returning_to_userspace,
-       .thread_bootstrap_return = pthread_bootstrap_return,
-       .unix_syscall_return = unix_syscall_return,
-
-       .absolutetime_to_microtime = absolutetime_to_microtime,
-
-       .thread_set_workq_pri = thread_set_workq_pri,
-       .thread_set_workq_qos = thread_set_workq_qos,
-
-       .get_bsdthread_info = (void*)get_bsdthread_info,
-       .thread_sched_call = thread_sched_call,
-       .thread_static_param = thread_static_param,
-       .thread_create_workq = thread_create_workq,
-       .thread_policy_set_internal = thread_policy_set_internal,
-       .thread_policy_get = thread_policy_get,
-       .thread_set_voucher_name = thread_set_voucher_name,
-
-       .thread_affinity_set = thread_affinity_set,
-
-       .zalloc = zalloc,
-       .zfree = zfree,
-       .zinit = zinit,
-
-       .workloop_fulfill_threadreq = workloop_fulfill_threadreq,
-
-       .__pthread_testcancel = __pthread_testcancel,
-
-       .mach_port_deallocate = mach_port_deallocate,
-       .semaphore_signal_internal_trap = semaphore_signal_internal_trap,
-       .current_map = _current_map,
-       .thread_create = thread_create,
-       .thread_resume = thread_resume,
-
-       .convert_thread_to_port = convert_thread_to_port,
-       .ml_get_max_cpus = (void*)ml_get_max_cpus,
-
-#if defined(__arm__)
-       .map_is_1gb = map_is_1gb,
-#endif
-#if defined(__arm64__)
-       .atomic_fetch_add_128_relaxed = atomic_fetch_add_128_relaxed,
-       .atomic_load_128_relaxed = atomic_load_128_relaxed,
-#endif
-
-       .proc_get_dispatchqueue_serialno_offset = proc_get_dispatchqueue_serialno_offset,
-       .proc_set_dispatchqueue_serialno_offset = proc_set_dispatchqueue_serialno_offset,
-
-       .proc_get_stack_addr_hint = proc_get_stack_addr_hint,
-       .proc_set_stack_addr_hint = proc_set_stack_addr_hint,
-       .proc_get_pthread_tsd_offset = proc_get_pthread_tsd_offset,
-       .proc_set_pthread_tsd_offset = proc_set_pthread_tsd_offset,
-       .proc_get_mach_thread_self_tsd_offset = proc_get_mach_thread_self_tsd_offset,
-       .proc_set_mach_thread_self_tsd_offset = proc_set_mach_thread_self_tsd_offset,
-
-       .thread_set_tsd_base = thread_set_tsd_base,
-
-       .proc_usynch_get_requested_thread_qos = proc_usynch_get_requested_thread_qos,
-
-       .qos_main_thread_active = qos_main_thread_active,
-
-       .proc_usynch_thread_qos_add_override_for_resource_check_owner = proc_usynch_thread_qos_add_override_for_resource_check_owner,
-       .proc_usynch_thread_qos_add_override_for_resource = proc_usynch_thread_qos_add_override_for_resource,
-       .proc_usynch_thread_qos_remove_override_for_resource = proc_usynch_thread_qos_remove_override_for_resource,
-       .proc_usynch_thread_qos_reset_override_for_resource = proc_usynch_thread_qos_reset_override_for_resource,
-
-       .proc_init_wqptr_or_wait = proc_init_wqptr_or_wait,
-
-       .thread_set_tag = thread_set_tag,
-       .thread_get_tag = thread_get_tag,
-
-       .proc_usynch_thread_qos_squash_override_for_resource = proc_usynch_thread_qos_squash_override_for_resource,
-       .task_get_default_manager_qos = task_get_default_manager_qos,
-       .thread_create_workq_waiting = thread_create_workq_waiting,
-
-       .proc_get_return_to_kernel_offset = proc_get_return_to_kernel_offset,
-       .proc_set_return_to_kernel_offset = proc_set_return_to_kernel_offset,
-       .thread_will_park_or_terminate = thread_will_park_or_terminate,
-
-       .qos_max_parallelism = qos_max_parallelism,
-
-       .proc_get_user_stack = proc_get_user_stack,
-       .proc_set_user_stack = proc_set_user_stack,
-};
-
-pthread_callbacks_t pthread_kern = &pthread_callbacks;
-pthread_functions_t pthread_functions = NULL;
-
-/*
- * pthread_kext_register is called by pthread.kext upon load, it has to provide
- * us with a function pointer table of pthread internal calls. In return, this
- * file provides it with a table of function pointers it needs.
- */
-
-void
-pthread_kext_register(pthread_functions_t fns, pthread_callbacks_t *callbacks)
-{
-       if (pthread_functions != NULL) {
-               panic("Re-initialisation of pthread kext callbacks.");
-       }
-
-       if (callbacks != NULL) {
-               *callbacks = &pthread_callbacks;
-       } else {
-               panic("pthread_kext_register called without callbacks pointer.");
-       }
-
-       if (fns) {
-               pthread_functions = fns;
-       }
-}
index cde2de0bedd85a73761a37b378d289721e431907..ff45d5c8a58b9eb2ffbddb0e243a18a40632b44d 100644 (file)
@@ -161,6 +161,9 @@ int oslog_stream_open = 0;
 int    oslog_stream_buf_size = OSLOG_STREAM_BUF_SIZE;
 int    oslog_stream_num_entries = OSLOG_NUM_STREAM_ENTRIES;
 
+uint8_t __firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT;
+uint8_t __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES;
+
 /* oslogsoftc only valid while oslog_open=1 */
 struct oslogsoftc {
        int     sc_state;               /* see above for possibilities */
@@ -784,7 +787,7 @@ int
 oslogioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __unused struct proc *p)
 {
        int ret = 0;
-       mach_vm_size_t buffer_size = (FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_CHUNK_SIZE);
+       mach_vm_size_t buffer_size = (__firehose_buffer_kernel_chunk_count * FIREHOSE_CHUNK_SIZE);
        firehose_buffer_map_info_t map_info = {0, 0};
        firehose_buffer_t kernel_firehose_buffer = NULL;
        mach_vm_address_t user_addr = 0;
@@ -809,6 +812,7 @@ oslogioctl(__unused dev_t dev, u_long com, caddr_t data, __unused int flag, __un
                                          buffer_size,
                                          0, /*  mask */
                                          VM_FLAGS_ANYWHERE,
+                                         VM_MAP_KERNEL_FLAGS_NONE,
                                          VM_KERN_MEMORY_NONE,
                                          mem_entry_ptr,
                                          0, /* offset */
@@ -876,7 +880,18 @@ void
 oslog_init(void)
 {
        kern_return_t kr;
-       vm_size_t size = FIREHOSE_BUFFER_KERNEL_CHUNK_COUNT * FIREHOSE_CHUNK_SIZE;
+       if (!PE_parse_boot_argn("firehose_chunk_count", &__firehose_buffer_kernel_chunk_count, sizeof(__firehose_buffer_kernel_chunk_count))) {
+               __firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT;
+       }
+       if (!PE_parse_boot_argn("firehose_io_pages", &__firehose_num_kernel_io_pages, sizeof(__firehose_num_kernel_io_pages))) {
+               __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES;
+       }
+       if (!__firehose_kernel_configuration_valid(__firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages)) {
+               printf("illegal firehose configuration %u/%u, using defaults\n", __firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages);
+               __firehose_buffer_kernel_chunk_count = FIREHOSE_BUFFER_KERNEL_DEFAULT_CHUNK_COUNT;
+               __firehose_num_kernel_io_pages = FIREHOSE_BUFFER_KERNEL_DEFAULT_IO_PAGES;
+       }
+       vm_size_t size = __firehose_buffer_kernel_chunk_count * FIREHOSE_CHUNK_SIZE;
 
        oslog_lock_init();
 
@@ -891,7 +906,7 @@ oslog_init(void)
        /* register buffer with firehose */
        kernel_firehose_addr = (vm_offset_t)__firehose_buffer_create((size_t *) &size);
 
-       kprintf("oslog_init completed\n");
+       printf("oslog_init completed, %u chunks, %u io pages\n", __firehose_buffer_kernel_chunk_count, __firehose_num_kernel_io_pages);
 }
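
Since the chunk count and I/O page count are now taken from boot-args, a plausible way to try non-default values on a development machine would be setting boot-args to "firehose_chunk_count=8 firehose_io_pages=4" (e.g. via nvram); as the hunk above shows, out-of-range combinations are rejected by __firehose_kernel_configuration_valid and fall back to the defaults.
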
 
 /*
@@ -1333,3 +1348,46 @@ log_dmesg(user_addr_t buffer, uint32_t buffersize, int32_t * retval)
        return (error);
 }
 
+#ifdef CONFIG_XNUPOST
+
+uint32_t find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count);
+
+/*
+ * returns count of pattern found in systemlog buffer.
+ * stops searching further if count reaches expected_count.
+ */
+uint32_t
+find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count)
+{
+       int match_count = 0;
+       int i           = 0;
+       int j           = 0;
+       int no_match    = 0;
+       int pos         = 0;
+       char ch         = 0;
+
+       if (pattern == NULL || len == 0 || expected_count == 0) {
+               return 0;
+       }
+
+       for (i = 0; i < msgbufp->msg_size; i++) {
+               no_match = 0;
+               for (j = 0; j < (int)len; j++) {
+                       pos = (msgbufp->msg_bufx + i + j) % msgbufp->msg_size;
+                       ch  = msgbufp->msg_bufc[pos];
+                       if (ch != pattern[j]) {
+                               no_match = 1;
+                               break;
+                       }
+               }
+               if (no_match == 0) {
+                       match_count++;
+                       if (match_count >= expected_count) {
+                               break;
+                       }
+               }
+       }
+       return match_count;
+}
+
+#endif
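
A minimal caller sketch for the helper above (hypothetical, not in the diff), checking that a known banner reached the message buffer during a CONFIG_XNUPOST run:

        uint32_t hits = find_pattern_in_buffer("oslog_init completed",
            (uint32_t)strlen("oslog_init completed"), 1);
        if (hits == 0) {
                printf("expected oslog_init banner not found in msgbuf\n");
        }
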
index fec4bfa44b16cafa59af30b99ac5714062f5ad7b..4741bfc16536e08712b40d5f1863b351ae1ca5fc 100644 (file)
@@ -315,7 +315,7 @@ static int sysctl_coalition_get_ids SYSCTL_HANDLER_ARGS
        int error, pid;
        proc_t tproc;
        uint64_t value;
-       uint64_t ids[COALITION_NUM_TYPES];
+       uint64_t ids[COALITION_NUM_TYPES] = {};
 
 
        error = SYSCTL_IN(req, &value, sizeof(value));
@@ -349,7 +349,7 @@ static int sysctl_coalition_get_roles SYSCTL_HANDLER_ARGS
        int error, pid;
        proc_t tproc;
        int value;
-       int roles[COALITION_NUM_TYPES];
+       int roles[COALITION_NUM_TYPES] = {};
 
 
        error = SYSCTL_IN(req, &value, sizeof(value));
index 9cfbe7e9176d473723d6898c581d857b6dead11f..ad3d80a6bd9eb902f0a90cfd327efec970e061e7 100644 (file)
@@ -1085,7 +1085,7 @@ select_internal(struct proc *p, struct select_nocancel_args *uap, uint64_t timeo
        th_act = current_thread();
        uth = get_bsdthread_info(th_act);
        sel = &uth->uu_select;
-       seldata = &uth->uu_kevent.ss_select_data;
+       seldata = &uth->uu_save.uus_select_data;
        *retval = 0;
 
        seldata->args = uap;
@@ -1270,7 +1270,7 @@ selprocess(int error, int sel_pass)
        th_act = current_thread();
        uth = get_bsdthread_info(th_act);
        sel = &uth->uu_select;
-       seldata = &uth->uu_kevent.ss_select_data;
+       seldata = &uth->uu_save.uus_select_data;
        uap = seldata->args;
        retval = seldata->retval;
 
@@ -1483,16 +1483,7 @@ static uint64_t sellinkfp(struct fileproc *fp, void **wq_data, struct waitq_set
        }
 
        if ((fp->f_flags & FP_SELCONFLICT) == FP_SELCONFLICT) {
-               /*
-                * The conflict queue requires disabling interrupts, so we
-                * need to explicitly reserve a link object to avoid a
-                * panic/assert in the waitq code. Hopefully this extra step
-                * can be avoided if we can split the waitq structure into
-                * blocking and linkage sub-structures.
-                */
-               uint64_t reserved_link = waitq_link_reserve(&select_conflict_queue);
-               waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, &reserved_link);
-               waitq_link_release(reserved_link);
+               waitq_link(&select_conflict_queue, wqset, WAITQ_SHOULD_LOCK, NULL);
        }
 
        /*
@@ -1610,6 +1601,8 @@ selscan(struct proc *p, struct _select *sel, struct _select_data * seldata,
                                                fp->f_flags |= FP_SELCONFLICT;
                                        else
                                                fp->f_flags |= FP_INSELECT;
+
+                                       waitq_set_lazy_init_link(wqset);
                                }
 
                                context.vc_ucred = fp->f_cred;
@@ -1731,6 +1724,8 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
        OSBitOrAtomic(P_SELECT, &p->p_flag);
        for (i = 0; i < nfds; i++) {
                short events = fds[i].events;
+               KNOTE_LOCK_CTX(knlc);
+               __assert_only int rc;
 
                /* per spec, ignore fd values below zero */
                if (fds[i].fd < 0) {
@@ -1749,14 +1744,16 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                        kev.filter = EVFILT_READ;
                        if (events & ( POLLPRI | POLLRDBAND ))
                                kev.flags |= EV_OOBAND;
-                       kevent_register(kq, &kev, p);
+                       rc = kevent_register(kq, &kev, &knlc);
+                       assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
                /* Handle output events */
                if ((kev.flags & EV_ERROR) == 0 &&
                    (events & ( POLLOUT | POLLWRNORM | POLLWRBAND ))) {
                        kev.filter = EVFILT_WRITE;
-                       kevent_register(kq, &kev, p);
+                       rc = kevent_register(kq, &kev, &knlc);
+                       assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
                /* Handle BSD extension vnode events */
@@ -1772,7 +1769,8 @@ poll_nocancel(struct proc *p, struct poll_nocancel_args *uap, int32_t *retval)
                                kev.fflags |= NOTE_LINK;
                        if (events & POLLWRITE)
                                kev.fflags |= NOTE_WRITE;
-                       kevent_register(kq, &kev, p);
+                       rc = kevent_register(kq, &kev, &knlc);
+                       assert((rc & FILTER_REGISTER_WAIT) == 0);
                }
 
                if (kev.flags & EV_ERROR) {
@@ -2028,7 +2026,7 @@ seldrop_locked(struct proc *p, u_int32_t *ibits, int nfd, int lim, int *need_wak
        }
 
        nw = howmany(nfd, NFDBITS);
-       seldata = &uth->uu_kevent.ss_select_data;
+       seldata = &uth->uu_save.uus_select_data;
 
        nc = 0;
        for (msk = 0; msk < 3; msk++) {
@@ -2741,7 +2739,7 @@ waitevent(proc_t p, struct waitevent_args *uap, int *retval)
        union {
                struct eventreq64 er64;
                struct eventreq32 er32;
-       } uer;
+       } uer = {};
 
        interval = 0;
 
@@ -3112,7 +3110,7 @@ gethostuuid(struct proc *p, struct gethostuuid_args *uap, __unused int32_t *retv
        kern_return_t kret;
        int error;
        mach_timespec_t mach_ts;        /* for IOKit call */
-       __darwin_uuid_t uuid_kern;      /* for IOKit call */
+       __darwin_uuid_t uuid_kern = {}; /* for IOKit call */
 
        if (!uap->spi) {
 #if CONFIG_EMBEDDED
@@ -3227,7 +3225,7 @@ ledger(struct proc *p, struct ledger_args *args, __unused int32_t *retval)
                }
 #endif
                case LEDGER_INFO: {
-                       struct ledger_info info;
+                       struct ledger_info info = {};
 
                        rval = ledger_info(task, &info);
                        proc_rele(proc);
@@ -3287,6 +3285,9 @@ telemetry(__unused struct proc *p, struct telemetry_args *args, __unused int32_t
        case TELEMETRY_CMD_TIMER_EVENT:
                error = telemetry_timer_event(args->deadline, args->interval, args->leeway);
                break;
+       case TELEMETRY_CMD_PMI_SETUP:
+               error = telemetry_pmi_setup((enum telemetry_pmi)args->deadline, args->interval);
+               break;
 #endif /* CONFIG_TELEMETRY */
        case TELEMETRY_CMD_VOUCHER_NAME:
                if (thread_set_voucher_name((mach_port_name_t)args->deadline))
@@ -3681,6 +3682,26 @@ SYSCTL_PROC(_kern, OID_AUTO, wqset_clear_preposts, CTLTYPE_QUAD | CTLFLAG_RW | C
            0, 0, sysctl_wqset_clear_preposts, "Q", "clear preposts on given waitq set");
 
 #endif /* CONFIG_WAITQ_DEBUG */
+
+static int
+sysctl_waitq_set_nelem SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+        int nelem;
+
+       /* Read only  */
+       if (req->newptr != USER_ADDR_NULL)
+               return (EPERM);
+
+       nelem = sysctl_helper_waitq_set_nelem();
+
+       return SYSCTL_OUT(req, &nelem, sizeof(nelem));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, n_ltable_entries, CTLFLAG_RD | CTLFLAG_LOCKED,
+                0, 0, sysctl_waitq_set_nelem, "I", "ltable elements currently used");
+
+
 #endif /* DEVELOPMENT || DEBUG */
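The new sysctl_waitq_set_nelem handler rejects writes (EPERM whenever req->newptr is set) and reports the number of link-table elements in use through the read-only kern.n_ltable_entries OID registered just above. Because the whole block sits inside #if DEVELOPMENT || DEBUG, the OID only exists on development kernels. A small userspace sketch of reading it with sysctlbyname(3), assuming such a kernel:

/* Sketch: read kern.n_ltable_entries with sysctlbyname(3).
 * Only expected to exist on DEVELOPMENT/DEBUG xnu builds. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/sysctl.h>

int main(void)
{
        int nelem = 0;
        size_t len = sizeof(nelem);

        if (sysctlbyname("kern.n_ltable_entries", &nelem, &len, NULL, 0) != 0) {
                /* ENOENT on kernels that don't register the OID */
                fprintf(stderr, "kern.n_ltable_entries: %s\n", strerror(errno));
                return 1;
        }
        printf("ltable elements in use: %d\n", nelem);
        return 0;
}

On a release kernel the call is expected to fail with ENOENT, and attempting to set the value (e.g. via sysctl -w) should fail with EPERM on any kernel that does register it, since the handler refuses a non-null newptr.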
 
 
index 7f33f6769f6eb0789c4ab4abe2001e971e7cea5d..d00584e9792abe693a91a38fd51aa8d7a24ec826 100644 (file)
@@ -105,10 +105,15 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
        if (!persona)
                return error;
 
+       error = persona_init_begin(persona);
+       if (error) {
+               goto out_persona_err;
+       }
+
        if (kinfo.persona_gid) {
                error = persona_set_gid(persona, kinfo.persona_gid);
                if (error)
-                       goto out_error;
+                       goto out_persona_err;
        }
 
        if (kinfo.persona_ngroups > 0) {
@@ -120,13 +125,21 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
                                           kinfo.persona_ngroups,
                                           kinfo.persona_gmuid);
                if (error)
-                       goto out_error;
+                       goto out_persona_err;
        }
 
        error = copyout(&persona->pna_id, idp, sizeof(persona->pna_id));
-       if (error)
-               goto out_error;
+       if (error) {
+               goto out_persona_err;
+       }
+
+       kinfo.persona_id = persona->pna_id;
        error = kpersona_copyout(&kinfo, infop);
+       if (error) {
+               goto out_persona_err;
+       }
+
+       persona_init_end(persona, error);
 
        /*
         * On success, we have a persona structure in the global list with a
@@ -135,8 +148,13 @@ static int kpersona_alloc_syscall(user_addr_t infop, user_addr_t idp)
         */
        return error;
 
-out_error:
+out_persona_err:
+       assert(error != 0);
+       persona_init_end(persona, error);
+
+#if PERSONA_DEBUG
        printf("%s:  ERROR:%d\n", __func__, error);
+#endif
        if (persona)
                persona_put(persona);
        return error;
@@ -204,8 +222,8 @@ static int kpersona_info_syscall(user_addr_t idp, user_addr_t infop)
        if (!persona)
                return ESRCH;
 
-       persona_dbg("FOUND: persona:%p, id:%d, gid:%d, login:\"%s\"",
-                   persona, persona->pna_id, persona_get_gid(persona),
+       persona_dbg("FOUND: persona: id:%d, gid:%d, login:\"%s\"",
+                   persona->pna_id, persona_get_gid(persona),
                    persona->pna_login);
 
        memset(&kinfo, 0, sizeof(kinfo));
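The reworked kpersona_alloc_syscall brackets every configuration step between persona_init_begin() and persona_init_end(), routes all failures through the single out_persona_err label (which asserts error != 0, finalizes the init with that error, and drops the reference), and only logs under PERSONA_DEBUG. A compact, purely illustrative sketch of that two-phase init shape in plain C follows; none of these names (obj_begin, obj_end, obj_put, ...) are xnu APIs:

/* Hypothetical sketch of the begin / configure / end-with-error pattern.
 * All names here are made up for illustration. */
#include <assert.h>
#include <errno.h>
#include <stdio.h>

struct obj { int configured; };

static int  obj_begin(struct obj *o)             { o->configured = 0; return 0; }
static int  obj_set_option(struct obj *o, int v) { o->configured = v; return v < 0 ? EINVAL : 0; }
static void obj_end(struct obj *o, int error)    { (void)o; (void)error; /* publish or tear down */ }
static void obj_put(struct obj *o)               { (void)o; /* drop reference */ }

static int obj_create(struct obj *o, int option)
{
        int error = obj_begin(o);
        if (error)
                goto out_err;

        error = obj_set_option(o, option);
        if (error)
                goto out_err;

        /* success: commit the object, caller keeps the reference */
        obj_end(o, 0);
        return 0;

out_err:
        assert(error != 0);
        obj_end(o, error);   /* finalize knowing init failed */
        obj_put(o);          /* release the reference */
        return error;
}

int main(void)
{
        struct obj o;
        printf("ok=%d bad=%d\n", obj_create(&o, 1), obj_create(&o, -1));
        return 0;
}

Funnelling every failure through one label is what keeps the begin/end pairing balanced on all paths, which the error handling in the diff above relies on.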
index 9e8b346e925615c09dfa09cc98555673b9b92c8e..c6725337b5b87fd024a1e95685df2d271af7baa3 100644 (file)
@@ -1419,7 +1419,7 @@ filt_piperead(struct knote *kn, long hint)
 
        return filt_piperead_common(kn, rpipe);
 }
-       
+
 static int
 filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev)
 {
@@ -1431,8 +1431,6 @@ filt_pipereadtouch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new inputs (and save the low water threshold and flag) */
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* identify if any events are now fired */
        retval = filt_piperead_common(kn, rpipe);
@@ -1515,8 +1513,6 @@ filt_pipewritetouch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new kevent data (and save off lowat threshold and flag) */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* determine if any event is now deemed fired */
        res = filt_pipewrite_common(kn, rpipe);
index c8bf0da868a8a53c0a7120d3723869adfc2ba99f..0d8664c7e022cef8fc44a1a189708d8cf4d406b3 100644 (file)
@@ -53,6 +53,7 @@
 #include <kern/telemetry.h>
 #include <kern/waitq.h>
 #include <kern/sched_prim.h>
+#include <kern/turnstile.h>
 #include <kern/zalloc.h>
 #include <kern/debug.h>
 
  */
 
 static lck_grp_t *ull_lck_grp;
-static lck_mtx_t ull_table_lock;
 
-#define ull_global_lock()       lck_mtx_lock(&ull_table_lock)
-#define ull_global_unlock()     lck_mtx_unlock(&ull_table_lock)
-
-#define ull_lock(ull)           lck_mtx_lock(&ull->ull_lock)
-#define ull_unlock(ull)         lck_mtx_unlock(&ull->ull_lock)
-#define ull_assert_owned(ull)  LCK_MTX_ASSERT(&ull->ull_lock, LCK_MTX_ASSERT_OWNED)
+typedef lck_spin_t ull_lock_t;
+#define ull_lock_init(ull)      lck_spin_init(&ull->ull_lock, ull_lck_grp, NULL)
+#define ull_lock_destroy(ull)   lck_spin_destroy(&ull->ull_lock, ull_lck_grp)
+#define ull_lock(ull)           lck_spin_lock(&ull->ull_lock)
+#define ull_unlock(ull)         lck_spin_unlock(&ull->ull_lock)
+#define ull_assert_owned(ull)   LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_OWNED)
+#define ull_assert_notwned(ull) LCK_SPIN_ASSERT(&ull->ull_lock, LCK_ASSERT_NOTOWNED)
 
 #define ULOCK_TO_EVENT(ull)   ((event_t)ull)
 #define EVENT_TO_ULOCK(event) ((ull_t *)event)
@@ -119,25 +120,22 @@ typedef struct ull {
        thread_t        ull_owner; /* holds +1 thread reference */
        ulk_t           ull_key;
        ulk_t           ull_saved_key;
-       lck_mtx_t       ull_lock;
+       ull_lock_t      ull_lock;
+       uint            ull_bucket_index;
        int32_t         ull_nwaiters;
        int32_t         ull_max_nwaiters;
        int32_t         ull_refcount;
-       struct promote_token ull_promote_token;
-       queue_chain_t   ull_hash_link;
        uint8_t         ull_opcode;
+       struct turnstile *ull_turnstile;
+       queue_chain_t   ull_hash_link;
 } ull_t;
 
-static const bool ull_debug = false;
-
 extern void ulock_initialize(void);
 
 #define ULL_MUST_EXIST 0x0001
-static ull_t *ull_get(ulk_t *, uint32_t);
+static ull_t *ull_get(ulk_t *, uint32_t, ull_t **);
 static void ull_put(ull_t *);
 
-static thread_t ull_promote_owner_locked(ull_t* ull, thread_t thread);
-
 #if DEVELOPMENT || DEBUG
 static int ull_simulate_copyin_fault = 0;
 
@@ -154,15 +152,23 @@ ull_dump(ull_t *ull)
        kprintf("ull_refcount\t%d\n", ull->ull_refcount);
        kprintf("ull_opcode\t%d\n\n", ull->ull_opcode);
        kprintf("ull_owner\t0x%llx\n\n", thread_tid(ull->ull_owner));
-       kprintf("ull_promote_token\t%d, %d\n\n", ull->ull_promote_token.pt_basepri, ull->ull_promote_token.pt_qos);
+       kprintf("ull_turnstile\t%p\n\n", ull->ull_turnstile);
 }
 #endif
 
+typedef struct ull_bucket {
+       queue_head_t ulb_head;
+       lck_spin_t   ulb_lock;
+} ull_bucket_t;
+
 static int ull_hash_buckets;
-static queue_head_t *ull_bucket;
+static ull_bucket_t *ull_bucket;
 static uint32_t ull_nzalloc = 0;
 static zone_t ull_zone;
 
+#define ull_bucket_lock(i)       lck_spin_lock(&ull_bucket[i].ulb_lock)
+#define ull_bucket_unlock(i)     lck_spin_unlock(&ull_bucket[i].ulb_lock)
+
 static __inline__ uint32_t
 ull_hash_index(char *key, size_t length)
 {
@@ -185,7 +191,6 @@ void
 ulock_initialize(void)
 {
        ull_lck_grp = lck_grp_alloc_init("ulocks", NULL);
-       lck_mtx_init(&ull_table_lock, ull_lck_grp, NULL);
 
        assert(thread_max > 16);
        /* Size ull_hash_buckets based on thread_max.
@@ -196,11 +201,12 @@ ulock_initialize(void)
        kprintf("%s>thread_max=%d, ull_hash_buckets=%d\n", __FUNCTION__, thread_max, ull_hash_buckets);
        assert(ull_hash_buckets >= thread_max/4);
 
-       ull_bucket = (queue_head_t *)kalloc(sizeof(queue_head_t) * ull_hash_buckets);
+       ull_bucket = (ull_bucket_t *)kalloc(sizeof(ull_bucket_t) * ull_hash_buckets);
        assert(ull_bucket != NULL);
 
        for (int i = 0; i < ull_hash_buckets; i++) {
-               queue_init(&ull_bucket[i]);
+               queue_init(&ull_bucket[i].ulb_head);
+               lck_spin_init(&ull_bucket[i].ulb_lock, ull_lck_grp, NULL);
        }
 
        ull_zone = zinit(sizeof(ull_t),
@@ -218,30 +224,30 @@ static int
 ull_hash_dump(pid_t pid)
 {
        int count = 0;
-       ull_global_lock();
        if (pid == 0) {
                kprintf("%s>total number of ull_t allocated %d\n", __FUNCTION__, ull_nzalloc);
                kprintf("%s>BEGIN\n", __FUNCTION__);
        }
        for (int i = 0; i < ull_hash_buckets; i++) {
-               if (!queue_empty(&ull_bucket[i])) {
+               ull_bucket_lock(i);
+               if (!queue_empty(&ull_bucket[i].ulb_head)) {
                        ull_t *elem;
                        if (pid == 0) {
                                kprintf("%s>index %d:\n", __FUNCTION__, i);
                        }
-                       qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) {
+                       qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) {
                                if ((pid == 0) || (pid == elem->ull_key.ulk_pid)) {
                                        ull_dump(elem);
                                        count++;
                                }
                        }
                }
+               ull_bucket_unlock(i);
        }
        if (pid == 0) {
                kprintf("%s>END\n", __FUNCTION__);
                ull_nzalloc = 0;
        }
-       ull_global_unlock();
        return count;
 }
 #endif
@@ -255,14 +261,15 @@ ull_alloc(ulk_t *key)
        ull->ull_refcount = 1;
        ull->ull_key = *key;
        ull->ull_saved_key = *key;
+       ull->ull_bucket_index = ULL_INDEX(key);
        ull->ull_nwaiters = 0;
        ull->ull_max_nwaiters = 0;
        ull->ull_opcode = 0;
 
        ull->ull_owner = THREAD_NULL;
-       ull->ull_promote_token = PROMOTE_TOKEN_INIT;
+       ull->ull_turnstile = TURNSTILE_NULL;
 
-       lck_mtx_init(&ull->ull_lock, ull_lck_grp, NULL);
+       ull_lock_init(ull);
 
        ull_nzalloc++;
        return ull;
@@ -272,10 +279,11 @@ static void
 ull_free(ull_t *ull)
 {
        assert(ull->ull_owner == THREAD_NULL);
+       assert(ull->ull_turnstile == TURNSTILE_NULL);
 
-       LCK_MTX_ASSERT(&ull->ull_lock, LCK_ASSERT_NOTOWNED);
+       ull_assert_notwned(ull);
 
-       lck_mtx_destroy(&ull->ull_lock, ull_lck_grp);
+       ull_lock_destroy(ull);
 
        zfree(ull_zone, ull);
 }
@@ -283,17 +291,17 @@ ull_free(ull_t *ull)
 /* Finds an existing ulock structure (ull_t), or creates a new one.
  * If MUST_EXIST flag is set, returns NULL instead of creating a new one.
  * The ulock structure is returned with ull_lock locked
- *
- * TODO: Per-bucket lock to reduce contention on global lock
  */
 static ull_t *
-ull_get(ulk_t *key, uint32_t flags)
+ull_get(ulk_t *key, uint32_t flags, ull_t **unused_ull)
 {
        ull_t *ull = NULL;
        uint i = ULL_INDEX(key);
+       ull_t *new_ull = (flags & ULL_MUST_EXIST) ? NULL : ull_alloc(key);
        ull_t *elem;
-       ull_global_lock();
-       qe_foreach_element(elem, &ull_bucket[i], ull_hash_link) {
+
+       ull_bucket_lock(i);
+       qe_foreach_element(elem, &ull_bucket[i].ulb_head, ull_hash_link) {
                ull_lock(elem);
                if (ull_key_match(&elem->ull_key, key)) {
                        ull = elem;
@@ -305,30 +313,31 @@ ull_get(ulk_t *key, uint32_t flags)
        if (ull == NULL) {
                if (flags & ULL_MUST_EXIST) {
                        /* Must already exist (called from wake) */
-                       ull_global_unlock();
+                       ull_bucket_unlock(i);
+                       assert(new_ull == NULL);
+                       assert(unused_ull == NULL);
                        return NULL;
                }
 
-               /* NRG maybe drop the ull_global_lock before the kalloc,
-                * then take the lock and check again for a key match
-                * and either use the new ull_t or free it.
-                */
-
-               ull = ull_alloc(key);
-
-               if (ull == NULL) {
-                       ull_global_unlock();
+               if (new_ull == NULL) {
+                       /* Alloc above failed */
+                       ull_bucket_unlock(i);
                        return NULL;
                }
 
+               ull = new_ull;
                ull_lock(ull);
-
-               enqueue(&ull_bucket[i], &ull->ull_hash_link);
+               enqueue(&ull_bucket[i].ulb_head, &ull->ull_hash_link);
+       } else if (!(flags & ULL_MUST_EXIST)) {
+               assert(new_ull);
+               assert(unused_ull);
+               assert(*unused_ull == NULL);
+               *unused_ull = new_ull;
        }
 
        ull->ull_refcount++;
 
-       ull_global_unlock();
+       ull_bucket_unlock(i);
 
        return ull; /* still locked */
 }
@@ -348,38 +357,56 @@ ull_put(ull_t *ull)
                return;
        }
 
-       ull_global_lock();
+       ull_bucket_lock(ull->ull_bucket_index);
        remqueue(&ull->ull_hash_link);
-       ull_global_unlock();
+       ull_bucket_unlock(ull->ull_bucket_index);
 
-#if DEVELOPMENT || DEBUG
-       if (ull_debug) {
-               kprintf("%s>", __FUNCTION__);
-               ull_dump(ull);
-       }
-#endif
        ull_free(ull);
 }
 
+static void ulock_wait_continue(void *, wait_result_t);
+static void ulock_wait_cleanup(ull_t *, thread_t, thread_t, int32_t *);
+
+inline static int
+wait_result_to_return_code(wait_result_t wr)
+{
+       int ret = 0;
+
+       switch (wr) {
+       case THREAD_AWAKENED:
+               break;
+       case THREAD_TIMED_OUT:
+               ret = ETIMEDOUT;
+               break;
+       case THREAD_INTERRUPTED:
+       case THREAD_RESTART:
+       default:
+               ret = EINTR;
+               break;
+       }
+
+       return ret;
+}
+
 int
 ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
 {
        uint opcode = args->operation & UL_OPCODE_MASK;
        uint flags = args->operation & UL_FLAGS_MASK;
+
+       if (flags & ULF_WAIT_CANCEL_POINT) {
+               __pthread_testcancel(1);
+       }
+
        int ret = 0;
        thread_t self = current_thread();
-       int id = thread_tid(self);
        ulk_t key;
 
        /* involved threads - each variable holds +1 ref if not null */
        thread_t owner_thread   = THREAD_NULL;
        thread_t old_owner      = THREAD_NULL;
-       thread_t old_lingering_owner = THREAD_NULL;
-       sched_call_t workq_callback = NULL;
 
-       if (ull_debug) {
-               kprintf("[%d]%s>ENTER opcode %d addr %llx value %llx timeout %d flags %x\n", id, __FUNCTION__, opcode, (unsigned long long)(args->addr), args->value, args->timeout, flags);
-       }
+       ull_t *unused_ull = NULL;
 
        if ((flags & ULF_WAIT_MASK) != flags) {
                ret = EINVAL;
@@ -395,11 +422,6 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        case UL_COMPARE_AND_WAIT:
                break;
        default:
-               if (ull_debug) {
-                       kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n",
-                               id, __FUNCTION__, opcode,
-                               (unsigned long long)(args->addr), flags);
-               }
                ret = EINVAL;
                goto munge_retval;
        }
@@ -415,12 +437,7 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        key.ulk_pid = p->p_pid;
        key.ulk_addr = args->addr;
 
-       if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) {
-               workq_callback = workqueue_get_sched_callback();
-               workq_callback = thread_disable_sched_call(self, workq_callback);
-       }
-
-       ull_t *ull = ull_get(&key, 0);
+       ull_t *ull = ull_get(&key, 0, &unused_ull);
        if (ull == NULL) {
                ret = ENOMEM;
                goto munge_retval;
@@ -436,9 +453,8 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        if (ull->ull_opcode == 0) {
                ull->ull_opcode = opcode;
        } else if (ull->ull_opcode != opcode) {
-               ull_unlock(ull);
                ret = EDOM;
-               goto out;
+               goto out_locked;
        }
 
        /*
@@ -446,14 +462,12 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
         * but we have to read the userspace value under the ull lock for correctness.
         *
         * Until <rdar://problem/24999882> exists,
-        * fake it by disabling preemption across copyin, which forces any
+        * holding the ull spinlock across copyin forces any
         * vm_fault we encounter to fail.
         */
        uint64_t val64; /* copyin_word always zero-extends to 64-bits */
 
-       disable_preemption();
        int copy_ret = copyin_word(args->addr, &val64, sizeof(value));
-       enable_preemption();
 
        value = (uint32_t)val64;
 
@@ -467,23 +481,16 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
        }
 #endif
        if (copy_ret != 0) {
-               ull_unlock(ull);
-
                /* copyin() will return an error if the access to the user addr would have faulted,
                 * so just return and let the user level code fault it in.
                 */
                ret = copy_ret;
-               goto out;
+               goto out_locked;
        }
 
        if (value != args->value) {
                /* Lock value has changed from expected so bail out */
-               ull_unlock(ull);
-               if (ull_debug) {
-                       kprintf("[%d]%s>Lock value %d has changed from expected %d so bail out\n",
-                               id, __FUNCTION__, value, (uint32_t)(args->value));
-               }
-               goto out;
+               goto out_locked;
        }
 
        if (set_owner) {
@@ -496,9 +503,8 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
                         * Translation failed - even though the lock value is up to date,
                         * whatever was stored in the lock wasn't actually a thread port.
                         */
-                       ull_unlock(ull);
                        ret = EOWNERDEAD;
-                       goto out;
+                       goto out_locked;
                }
                /* owner_thread has a +1 reference */
 
@@ -511,54 +517,102 @@ ulock_wait(struct proc *p, struct ulock_wait_args *args, int32_t *retval)
                 *              and is heading toward the kernel to call ull_wake.
                 *              If so, it's going to have to wait for the ull mutex.
                 *
-                * Therefore, I can promote its priority to match mine, and I can rely on it to
-                * come by later to issue the wakeup and lose its promotion.
+                * Therefore, I can ask the turnstile to promote its priority, and I can rely
+                * on it to come by later to issue the wakeup and lose its promotion.
                 */
 
-               old_owner = ull_promote_owner_locked(ull, owner_thread);
+               /* Return the +1 ref from the ull_owner field */
+               old_owner = ull->ull_owner;
+               ull->ull_owner = THREAD_NULL;
+
+               if (owner_thread != THREAD_NULL) {
+                       /* The ull_owner field now owns a +1 ref on owner_thread */
+                       thread_reference(owner_thread);
+                       ull->ull_owner = owner_thread;
+               }
        }
 
        wait_result_t wr;
        uint32_t timeout = args->timeout;
+       uint64_t deadline = TIMEOUT_WAIT_FOREVER;
+       wait_interrupt_t interruptible = THREAD_ABORTSAFE;
+       struct turnstile *ts;
+
+       ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile,
+                              TURNSTILE_NULL, TURNSTILE_ULOCK);
        thread_set_pending_block_hint(self, kThreadWaitUserLock);
+
+       if (flags & ULF_WAIT_WORKQ_DATA_CONTENTION) {
+               interruptible |= THREAD_WAIT_NOREPORT;
+       }
+
        if (timeout) {
-               wr = assert_wait_timeout(ULOCK_TO_EVENT(ull), THREAD_ABORTSAFE, timeout, NSEC_PER_USEC);
-       } else {
-               wr = assert_wait(ULOCK_TO_EVENT(ull), THREAD_ABORTSAFE);
+               clock_interval_to_deadline(timeout, NSEC_PER_USEC, &deadline);
        }
 
+       turnstile_update_inheritor(ts, owner_thread,
+               (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       wr = waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                       interruptible, deadline);
+
        ull_unlock(ull);
 
-       if (ull_debug) {
-               kprintf("[%d]%s>after assert_wait() returned %d\n", id, __FUNCTION__, wr);
+       if (unused_ull) {
+               ull_free(unused_ull);
+               unused_ull = NULL;
        }
 
-       if (set_owner && owner_thread != THREAD_NULL && wr == THREAD_WAITING) {
-               wr = thread_handoff(owner_thread);
-               /* owner_thread ref is consumed */
-               owner_thread = THREAD_NULL;
-       } else {
-               /* NRG At some point this should be a continuation based block, so that we can avoid saving the full kernel context. */
-               wr = thread_block(NULL);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       if (wr == THREAD_WAITING) {
+               uthread_t uthread = (uthread_t)get_bsdthread_info(self);
+               uthread->uu_save.uus_ulock_wait_data.retval = retval;
+               uthread->uu_save.uus_ulock_wait_data.flags = flags;
+               uthread->uu_save.uus_ulock_wait_data.owner_thread = owner_thread;
+               uthread->uu_save.uus_ulock_wait_data.old_owner = old_owner;
+               if (set_owner && owner_thread != THREAD_NULL) {
+                       thread_handoff_parameter(owner_thread, ulock_wait_continue, ull);
+               } else {
+                       assert(owner_thread == THREAD_NULL);
+                       thread_block_parameter(ulock_wait_continue, ull);
+               }
+               /* NOT REACHED */
        }
-       if (ull_debug) {
-               kprintf("[%d]%s>thread_block() returned %d\n", id, __FUNCTION__, wr);
+
+       ret = wait_result_to_return_code(wr);
+
+       ull_lock(ull);
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+
+out_locked:
+       ulock_wait_cleanup(ull, owner_thread, old_owner, retval);
+
+       if (unused_ull) {
+               ull_free(unused_ull);
+               unused_ull = NULL;
        }
-       switch (wr) {
-       case THREAD_AWAKENED:
-               break;
-       case THREAD_TIMED_OUT:
-               ret = ETIMEDOUT;
-               break;
-       case THREAD_INTERRUPTED:
-       case THREAD_RESTART:
-       default:
-               ret = EINTR;
-               break;
+
+       assert(*retval >= 0);
+
+munge_retval:
+       if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
+               *retval = -ret;
+               ret = 0;
        }
+       return ret;
+}
+
+/*
+ * Must be called with ull_lock held
+ */
+static void
+ulock_wait_cleanup(ull_t *ull, thread_t owner_thread, thread_t old_owner, int32_t *retval)
+{
+       ull_assert_owned(ull);
+
+       thread_t old_lingering_owner = THREAD_NULL;
 
-out:
-       ull_lock(ull);
        *retval = --ull->ull_nwaiters;
        if (ull->ull_nwaiters == 0) {
                /*
@@ -566,11 +620,8 @@ out:
                 * clear out the lingering owner reference before
                 * freeing the ull.
                 */
-               if (ull->ull_owner != THREAD_NULL) {
-                       old_lingering_owner = ull_promote_owner_locked(ull, THREAD_NULL);
-               }
-
-               assert(ull->ull_owner == THREAD_NULL);
+               old_lingering_owner = ull->ull_owner;
+               ull->ull_owner = THREAD_NULL;
 
                ull->ull_key.ulk_pid = 0;
                ull->ull_key.ulk_addr = 0;
@@ -579,6 +630,9 @@ out:
        }
        ull_put(ull);
 
+       /* Needs to be called after dropping the interlock */
+       turnstile_cleanup();
+
        if (owner_thread != THREAD_NULL) {
                thread_deallocate(owner_thread);
        }
@@ -592,17 +646,35 @@ out:
        }
 
        assert(*retval >= 0);
+}
 
-munge_retval:
-       if (workq_callback) {
-               thread_reenable_sched_call(self, workq_callback);
-       }
+__attribute__((noreturn))
+static void
+ulock_wait_continue(void * parameter, wait_result_t wr)
+{
+       thread_t self = current_thread();
+       uthread_t uthread = (uthread_t)get_bsdthread_info(self);
+       int ret = 0;
+
+       ull_t *ull = (ull_t *)parameter;
+       int32_t *retval = uthread->uu_save.uus_ulock_wait_data.retval;
+       uint flags = uthread->uu_save.uus_ulock_wait_data.flags;
+       thread_t owner_thread = uthread->uu_save.uus_ulock_wait_data.owner_thread;
+       thread_t old_owner = uthread->uu_save.uus_ulock_wait_data.old_owner;
+
+       ret = wait_result_to_return_code(wr);
+
+       ull_lock(ull);
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+
+       ulock_wait_cleanup(ull, owner_thread, old_owner, retval);
 
        if ((flags & ULF_NO_ERRNO) && (ret != 0)) {
                *retval = -ret;
                ret = 0;
        }
-       return ret;
+
+       unix_syscall_return(ret);
 }
 
 int
@@ -611,18 +683,12 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
        uint opcode = args->operation & UL_OPCODE_MASK;
        uint flags = args->operation & UL_FLAGS_MASK;
        int ret = 0;
-       int id = thread_tid(current_thread());
        ulk_t key;
 
        /* involved threads - each variable holds +1 ref if not null */
        thread_t wake_thread    = THREAD_NULL;
        thread_t old_owner      = THREAD_NULL;
 
-       if (ull_debug) {
-               kprintf("[%d]%s>ENTER opcode %d addr %llx flags %x\n",
-                       id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags);
-       }
-
        if ((flags & ULF_WAKE_MASK) != flags) {
                ret = EINVAL;
                goto munge_retval;
@@ -662,7 +728,7 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
        key.ulk_pid = p->p_pid;
        key.ulk_addr = args->addr;
 
-       ull_t *ull = ull_get(&key, ULL_MUST_EXIST);
+       ull_t *ull = ull_get(&key, ULL_MUST_EXIST, NULL);
        if (ull == NULL) {
                if (wake_thread != THREAD_NULL) {
                        thread_deallocate(wake_thread);
@@ -681,19 +747,11 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
        case UL_COMPARE_AND_WAIT:
                break;
        default:
-               if (ull_debug) {
-                       kprintf("[%d]%s>EINVAL opcode %d addr 0x%llx flags 0x%x\n",
-                               id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags);
-               }
                ret = EINVAL;
                goto out_locked;
        }
 
        if (opcode != ull->ull_opcode) {
-               if (ull_debug) {
-                       kprintf("[%d]%s>EDOM - opcode mismatch - opcode %d addr 0x%llx flags 0x%x\n",
-                               id, __FUNCTION__, opcode, (unsigned long long)(args->addr), flags);
-               }
                ret = EDOM;
                goto out_locked;
        }
@@ -702,10 +760,16 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
                assert(ull->ull_owner == THREAD_NULL);
        }
 
+       struct turnstile *ts;
+       ts = turnstile_prepare((uintptr_t)ull, &ull->ull_turnstile,
+                              TURNSTILE_NULL, TURNSTILE_ULOCK);
+
        if (flags & ULF_WAKE_ALL) {
-               thread_wakeup(ULOCK_TO_EVENT(ull));
+               waitq_wakeup64_all(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                       THREAD_AWAKENED, 0);
        } else if (flags & ULF_WAKE_THREAD) {
-               kern_return_t kr = thread_wakeup_thread(ULOCK_TO_EVENT(ull), wake_thread);
+               kern_return_t kr = waitq_wakeup64_thread(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                       wake_thread, THREAD_AWAKENED);
                if (kr != KERN_SUCCESS) {
                        assert(kr == KERN_NOT_WAITING);
                        ret = EALREADY;
@@ -718,7 +782,8 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
                 * TODO: 'owner is not current_thread (or null)' likely means we can avoid this wakeup
                 * <rdar://problem/25487001>
                 */
-               thread_wakeup_one_with_pri(ULOCK_TO_EVENT(ull), WAITQ_SELECT_MAX_PRI);
+               waitq_wakeup64_one(&ts->ts_waitq, CAST_EVENT64_T(ULOCK_TO_EVENT(ull)),
+                       THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI);
        }
 
        /*
@@ -732,12 +797,21 @@ ulock_wake(struct proc *p, struct ulock_wake_args *args, __unused int32_t *retva
         */
 
        if (ull->ull_owner == current_thread()) {
-               old_owner = ull_promote_owner_locked(ull, THREAD_NULL);
+               turnstile_update_inheritor(ts, THREAD_NULL,
+                       (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               old_owner = ull->ull_owner;
+               ull->ull_owner = THREAD_NULL;
        }
 
+       turnstile_complete((uintptr_t)ull, &ull->ull_turnstile, NULL);
+
 out_locked:
        ull_put(ull);
 
+       /* Needs to be called after dropping the interlock */
+       turnstile_cleanup();
+
        if (wake_thread != THREAD_NULL) {
                thread_deallocate(wake_thread);
        }
@@ -754,46 +828,6 @@ munge_retval:
        return ret;
 }
 
-/*
- * Change ull_owner to be new_owner, and update it with the properties
- * of the current thread.
- *
- * Records the highest current promotion value in ull_promote_token, and applies that
- * to any new owner.
- *
- * Returns +1 ref to the old ull_owner if it is going away.
- */
-static thread_t
-ull_promote_owner_locked(ull_t*    ull,
-                         thread_t  new_owner)
-{
-       if (new_owner != THREAD_NULL && ull->ull_owner == new_owner) {
-               thread_user_promotion_update(new_owner, current_thread(), &ull->ull_promote_token);
-               return THREAD_NULL;
-       }
-
-       thread_t old_owner = ull->ull_owner;
-       ull->ull_owner = THREAD_NULL;
-
-       if (new_owner != THREAD_NULL) {
-               /* The ull_owner field now owns a +1 ref on thread */
-               thread_reference(new_owner);
-               ull->ull_owner = new_owner;
-
-               thread_user_promotion_add(new_owner, current_thread(), &ull->ull_promote_token);
-       } else {
-               /* No new owner - clear the saturated promotion value */
-               ull->ull_promote_token = PROMOTE_TOKEN_INIT;
-       }
-
-       if (old_owner != THREAD_NULL) {
-               thread_user_promotion_drop(old_owner);
-       }
-
-       /* Return the +1 ref from the ull_owner field */
-       return old_owner;
-}
-
 void
 kdp_ulock_find_owner(__unused struct waitq * waitq, event64_t event, thread_waitinfo_t * waitinfo)
 {
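Taken together, the sys_ulock.c changes replace the single global ull_table_lock with one spin lock per hash bucket (ull_bucket_t), pre-allocate a candidate ull_t before taking the bucket lock (freeing it as unused_ull if the key was already present), and move priority inheritance from the hand-rolled ull_promote_token machinery onto turnstiles. A userspace analogue of just the per-bucket locking part, with pthread mutexes standing in for lck_spin_t and hypothetical names throughout:

/* Userspace analogue of the per-bucket lock scheme: a hash table whose
 * buckets are protected individually instead of by one global lock. */
#include <pthread.h>
#include <stdio.h>
#include <stdint.h>

#define NBUCKETS 64

struct node {
        uint64_t     key;
        struct node *next;
};

struct bucket {
        pthread_mutex_t lock;
        struct node    *head;
};

static struct bucket table[NBUCKETS];

static unsigned bucket_index(uint64_t key)
{
        /* toy hash; the kernel code hashes the ulk_t (pid, addr) key */
        return (unsigned)((key * 0x9E3779B97F4A7C15ULL) >> 58) % NBUCKETS;
}

/* Insert a pre-allocated node: allocation happens before the lock is
 * taken, mirroring ull_alloc() before ull_bucket_lock() in the diff. */
static void table_insert(struct node *n)
{
        struct bucket *b = &table[bucket_index(n->key)];

        pthread_mutex_lock(&b->lock);
        n->next = b->head;
        b->head = n;
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        for (int i = 0; i < NBUCKETS; i++)
                pthread_mutex_init(&table[i].lock, NULL);

        static struct node a = { .key = 0x1000 }, c = { .key = 0x2000 };
        table_insert(&a);
        table_insert(&c);
        printf("bucket(a)=%u bucket(c)=%u\n",
               bucket_index(a.key), bucket_index(c.key));
        return 0;
}

Sharding the lock this way means waits and wakes on unrelated (pid, addr) keys contend only within their own bucket, and allocating the node before the lock keeps the critical section free of allocator calls, which is the same reason the diff allocates new_ull before ull_bucket_lock().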
index e1b8a75c9bfcd9bbef446bb833e2aae1e46793e7..278dd224f9a4a8926ad9e0955b8272e5267c14f3 100644 (file)
 52     AUE_SIGPENDING  ALL     { int sigpending(struct sigvec *osv); } 
 53     AUE_SIGALTSTACK ALL     { int sigaltstack(struct sigaltstack *nss, struct sigaltstack *oss) NO_SYSCALL_STUB ; } 
 54     AUE_IOCTL       ALL     { int ioctl(int fd, u_long com, caddr_t data) NO_SYSCALL_STUB; } 
-55     AUE_REBOOT      ALL     { int reboot(int opt, char *command) NO_SYSCALL_STUB; }
+55     AUE_REBOOT      ALL     { int reboot(int opt, char *msg) NO_SYSCALL_STUB; }
 56     AUE_REVOKE      ALL     { int revoke(char *path); } 
 57     AUE_SYMLINK     ALL     { int symlink(char *path, char *link); } 
 58     AUE_READLINK    ALL     { int readlink(char *path, char *buf, int count); } 
 181    AUE_SETGID      ALL     { int setgid(gid_t gid); } 
 182    AUE_SETEGID     ALL     { int setegid(gid_t egid); } 
 183    AUE_SETEUID     ALL     { int seteuid(uid_t euid); } 
-184    AUE_SIGRETURN   ALL     { int sigreturn(struct ucontext *uctx, int infostyle) NO_SYSCALL_STUB; } 
+184    AUE_SIGRETURN   ALL     { int sigreturn(struct ucontext *uctx, int infostyle, user_addr_t token) NO_SYSCALL_STUB; }
 185    AUE_NULL        ALL     { int enosys(void); } { old chud }
 186    AUE_NULL        ALL     { int thread_selfcounts(int type, user_addr_t buf, user_size_t nbytes); }
 187    AUE_FDATASYNC   ALL     { int fdatasync(int fd); } 
 527    AUE_NULL        ALL     { int ntp_adjtime(struct timex *tp); }
 528    AUE_NULL        ALL     { int ntp_gettime(struct ntptimeval *ntvp); }
 529    AUE_NULL        ALL     { int os_fault_with_payload(uint32_t reason_namespace, uint64_t reason_code, void *payload, uint32_t payload_size, const char *reason_string, uint64_t reason_flags); }
+#if CONFIG_WORKQUEUE
+530    AUE_WORKLOOPCTL ALL     { int kqueue_workloop_ctl(user_addr_t cmd, uint64_t options, user_addr_t addr, size_t sz) NO_SYSCALL_STUB; }
+#else
 530    AUE_NULL        ALL     { int enosys(void); }
+#endif // CONFIG_WORKQUEUE
 531    AUE_NULL        ALL     { int enosys(void); }
index a962e9ab13d0f1ac66ae42c6a8561d78ed8fc87e..2fb45c9965378f7cc5cb5ef6af8989c9dc16a710 100644 (file)
@@ -488,6 +488,7 @@ shmat(struct proc *p, struct shmat_args *uap, user_addr_t *retval)
                         map_size,
                         0,
                         vm_flags,
+                        VM_MAP_KERNEL_FLAGS_NONE,
                         VM_KERN_MEMORY_NONE,
                         IPC_PORT_NULL,
                         0,
index 3065de277974bac5aa054f2e03cf5c64296e2049..2e1361f4fece586232dfae9bd0c7cc109d064fde 100644 (file)
 0x1200024      MACH_IPC_voucher_destroy
 0x1200028      MACH_IPC_kmsg_info
 0x120002c      MACH_IPC_kmsg_link
+0x1200030      MACH_IPC_port_entry_modify
 0x1250008      MACH_RMON_CPUUSAGE_VIOLATED
 0x1250010      MACH_RMON_CPUUSAGE_VIOLATED_K32A
 0x1250014      MACH_RMON_CPUUSAGE_VIOLATED_K32B
 0x130043c      MACH_vm_info5
 0x1300440      MACH_vm_info6
 0x1300444      MACH_vm_info7
+0x1300448      MACH_vm_info8
+0x130044c      MACH_vm_info9
 0x1300480      MACH_vm_upl_page_wait
 0x1300484      MACH_vm_iopl_page_wait
 0x1300488      MACH_vm_page_wait_block
 0x130048C      MACH_vm_page_sleep
 0x1300490      MACH_vm_page_expedite
 0x1300494      MACH_vm_page_expedite_no_memory
+0x1300498      MACH_vm_page_grab
+0x130049c      MACH_vm_page_release
 0x13004c0      MACH_vm_pressure_event
-0x1300500       MACH_vm_data_write
+0x13004c4      MACH_vm_execve
+0x13004c8      MACH_vm_wakeup_compactor_swapper
+0x13004cc      MACH_vm_upl_request
+0x13004d0      MACH_vm_iopl_request
+0x13004d4      MACH_vm_kern_request
+0x1300500      MACH_vm_data_write
+0x1300504      vm_pressure_level_change
 0x1320000      vm_disconnect_all_page_mappings
 0x1320004      vm_disconnect_task_page_mappings
 0x1320008      RealFaultAddressInternal
 0x14000C4      MACH_EXEC_DEMOTE
 0x14000C8      MACH_AMP_SIGNAL_SPILL
 0x14000CC      MACH_AMP_STEAL
+0x14000D0      MACH_SCHED_LOAD_EFFECTIVE
+0x14000D4      MACH_PROMOTED
+0x14000D8      MACH_UNPROMOTED
+0x14000DC      MACH_PROMOTED_UPDATE
+0x14000E0      MACH_QUIESCENT_COUNTER
 0x1500000      MACH_MSGID_INVALID
 0x1600000      MTX_SLEEP
 0x1600004      MTX_SLEEP_DEADLINE
 0x170003c      PMAP_flush_TLBS_TO
 0x1700040      PMAP_flush_EPT
 0x1700044      PMAP_fast_fault
+0x1700048      PMAP_switch
 0x1800000      MACH_CLOCK_EPOCH_CHANGE
 0x1800004      MACH_CLOCK_BRIDGE_RCV_TS
 0x1800008      MACH_CLOCK_BRIDGE_REMOTE_TIME
 0x3110004      OpenThrottleWindow
 0x3110008      CauseIOThrottle
 0x311000C      IO_THROTTLE_DISABLE
+0x3120000      DECMPFS_decmp_file
+0x3120004      DECMPFS_fetch_cmp_header
+0x3120008      DECMPFS_fetch_uncmp_data
+0x3120010      DECMPFS_free_cmp_data
+0x3120014      DECMPFS_file_is_cmp
 0x3CF0000      CP_OFFSET_IO
 0x4010004      proc_exit
 0x4010008      force_exit
 0x402002C      MEMSTAT_dirty_clear
 0x4020030      MEMSTAT_grp_set_properties
 0x4020034      MEMSTAT_do_kill
+0x4020038      MEMSTAT_change_priority
+0x402003C      MEMSTAT_fast_jetsam
 0x4030004      KEVENT_kq_processing_begin
 0x4030008      KEVENT_kq_processing_end
 0x403000c      KEVENT_kqwq_processing_begin
 0x403004c      KEVENT_kqwl_bind
 0x4030050      KEVENT_kqwl_unbind
 0x4030054      KEVENT_knote_enable
+0x4030058      KEVENT_knote_vanished
 0x40e0104      BSC_msync_extended_info
 0x40e0264      BSC_pread_extended_info
 0x40e0268      BSC_pwrite_extended_info
 0x8010800      F_DLIL_Output
 0x8010c00      F_DLIL_IfOut
 0x8040000      USER_STOP
-0x9000084      wq_deallocate_stack
-0x9000088      wq_allocate_stack
-0x9008070      wq_run_item
-0x9008074      wq_clean_thread
-0x9008078      wq_post_done
-0x900807c      wq_stk_cleanup
-0x9008080      wq_tsd_cleanup
-0x9008084      wq_tsd_destructor
-0x9008088      wq_pthread_exit
-0x900808c      wq_workqueue_exit
+0x9010004      wq_pthread_exit
+0x9010008      wq_workqueue_exit
+0x901000c      wq_runthread
+0x9010014      wq_death_call
+0x9010024      wq_thread_block
+0x9010028      wq_thactive_update
+0x901002c      wq_add_timer
+0x9010030      wq_start_add_timer
+0x9010050      wq_override_dispatch
+0x9010054      wq_override_reset
+0x9010074      wq_thread_create_failed
+0x9010078      wq_thread_terminate
+0x901007c      wq_thread_create
+0x9010080      wq_select_threadreq
+0x901008c      wq_creator_select
+0x9010090      wq_creator_yield
+0x9010094      wq_constrained_admission
+0x9010098      wq_wqops_reqthreads
+0x9020004      wq_create
+0x9020008      wq_destroy
+0x902000c      wq_thread_logical_run
+0x9020014      wq_thread_request_initiate
+0x9020018      wq_thread_request_modify
+0x9100004      bsdthread_set_qos_self
 0xa000100      P_CS_Read
 0xa000110      P_CS_Write
 0xa000104      P_CS_ReadDone
 0x2506002c     PERF_KPC_Thd_Sample
 0x25070000     PERF_KDBG_Handler
 0x25080000     PERF_TK_Snap_Sample
-0x25080004     PERF_TK_Snap_Data1
-0x25080008     PERF_TK_Snap_Data2
-0x2508000c     PERF_TK_Snap_Data1_32
-0x25080010     PERF_TK_Snap_Data2_32
+0x25080004     PERF_TK_Snap_Data
+0x25080008     PERF_TK_Snap_Data1_32
+0x2508000c     PERF_TK_Snap_Data2_32
+0x25080010     PERF_TK_Info_Data
+0x25090000     PERF_LZ_MkRunnable
+0x25090040     PERF_LZ_WaitSample
+0x25090080     PERF_LZ_CPUSample
 0x250a0000     PERF_MI_Sample
 0x250a0004     PERF_MI_Data
 0x250a0008     PERF_MI_SysMem_Data
+0x250a000c     PERF_MI_SysMem_Data_2
 0x26100008     imp_assertion_hold
 0x2610000c     imp_assertion_hold_ext
 0x26100020     imp_assertion_externalize
 0x26350028     imp_thread_qos
 0x26360028     imp_thread_qos_override
 0x26380028     imp_thread_qos_and_relprio
+0x263b0028     imp_thread_qos_workq_override
 0x263c0028     imp_thread_qos_promote
 0x263d0028     imp_thread_qos_ipc_override
 0x27000000     PERF_PCEVENT
 0x2a200008     ATM_VALUE_ADDED
 0x2a300004     ATM_VALUE_UNREGISTERED
 0x2a300008     ATM_VALUE_DIFF_MAILBOX
+0x35100004     TURNSTILE_thread_added_to_turnstile_waitq
+0x35100008     TURNSTILE_thread_removed_from_turnstile_waitq
+0x3510000c     TURNSTILE_thread_moved_in_turnstile_waitq
+0x35100010     TURNSTILE_turnstile_added_to_turnstile_heap
+0x35100014     TURNSTILE_turnstile_removed_from_turnstile_heap
+0x35100018     TURNSTILE_turnstile_moved_in_turnstile_heap
+0x3510001c     TURNSTILE_turnstile_added_to_thread_heap
+0x35100020     TURNSTILE_turnstile_removed_from_thread_heap
+0x35100024     TURNSTILE_turnstile_moved_in_thread_heap
+0x35100028     TURNSTILE_update_stopped_by_limit
+0x3510002c     TURNSTILE_thread_not_waiting_on_turnstile
+0x35200004     TURNSTILE_turnstile_priority_change
+0x35200008     TURNSTILE_thread_user_promotion_change
+0x35300004     TURNSTILE_turnstile_prepare
+0x35300008     TURNSTILE_turnstile_complete
 0xff000104     MSG_mach_notify_port_deleted
 0xff000114     MSG_mach_notify_port_destroyed
 0xff000118     MSG_mach_notify_no_senders
index 61d59ee6221215075911665a332696c7606bcc45..ed7e7965820295c04f6079338897c2b896c34e96 100644 (file)
@@ -913,6 +913,20 @@ ttysetpgrphup(struct tty *tp)
 {
        TTY_LOCK_OWNED(tp);     /* debug assert */
        SET(tp->t_state, TS_PGRPHUP);
+       /*
+        * Also wake up sleeping readers which may or may not belong to the
+        * current foreground process group.
+        *
+        * This forces any non-fg readers (which entered read when
+        * that process group was in the fg) to return with EIO (if they're
+        * catching SIGTTIN or with SIGTTIN). The ones which do belong to the fg
+        * process group will promptly go back to sleep and get a SIGHUP shortly
+        * process group will promptly go back to sleep and get a SIGHUP shortly.
+        * This would normally happen as part of the close in revoke, but if
+        * to the close because the sleeping reader holds an iocount on the
+        * vnode of the terminal which is going to get revoked->reclaimed.
+        */
+       wakeup(TSA_HUP_OR_INPUT(tp));
 }
 
 /*
@@ -1454,6 +1468,9 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p)
                 * case.
                 */
                if (ISSET(tp->t_state, TS_PGRPHUP)) {
+                       if (sessp != SESSION_NULL)
+                               session_rele(sessp);
+                       pg_rele(pgrp);
                        error = EPERM;
                        goto out;
                }
@@ -1462,6 +1479,23 @@ ttioctl_locked(struct tty *tp, u_long cmd, caddr_t data, int flag, proc_t p)
                tp->t_pgrp = pgrp;
                sessp->s_ttypgrpid = pgrp->pg_id;
                proc_list_unlock();
+
+               /*
+                * Wakeup readers to recheck if they are still the foreground
+                * process group.
+                *
+        * ttwakeup() isn't called here: the readers aren't being woken up
+        * because there is something to read, but to force a re-evaluation
+        * of their foreground process group status.
+                *
+                * Ordinarily leaving these readers waiting wouldn't be an issue
+                * as launchd would send them a termination signal eventually
+                * (if nobody else does). But if this terminal happens to be
+                * /dev/console, launchd itself could get blocked forever behind
+                * a revoke of /dev/console and leave the system deadlocked.
+                */
+               wakeup(TSA_HUP_OR_INPUT(tp));
+
                /* SAFE: All callers drop the lock on return */
                tty_unlock(tp);
                if (oldpg != PGRP_NULL)
@@ -3333,6 +3367,12 @@ tty_set_knote_hook(struct knote *kn)
                        NULL);
        assert(kr == KERN_SUCCESS);
 
+       /*
+        * Lazy allocate the waitqset to avoid potential allocation under
+        * Lazily initialize the waitq set's link object here to avoid a
+        * potential allocation while a spinlock is held.
+       waitq_set_lazy_init_link(&tmp_wqs);
+
        old_wqs = uth->uu_wqset;
        uth->uu_wqset = &tmp_wqs;
        /*
@@ -3507,8 +3547,6 @@ filt_ttytouch(struct knote *kn, struct kevent_internal_s *kev)
 
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        if (kn->kn_vnode_kqok) {
                res = filt_tty_common(kn, tp);
index 8e81ab6690cec86036d367aa51d7733802589267..893b1d9127dcb13f0d802599e450892d7c48399d 100644 (file)
@@ -597,9 +597,6 @@ ptsd_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new kevent state */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
-               kn->kn_udata = kev->udata;
-       }
 
        /* recapture fired state of knote */
        ret = ptsd_kqops_common(kn, tp);
@@ -832,6 +829,7 @@ ptmx_kqops_common(struct knote *kn, struct ptmx_ioctl *pti, struct tty *tp)
                /* there's data on the TTY and it's not stopped */
                if (tp->t_outq.c_cc && !(tp->t_state & TS_TTSTOP)) {
                        retval = tp->t_outq.c_cc;
+                       kn->kn_data = retval;
                } else if (((pti->pt_flags & PF_PKT) && pti->pt_send) ||
                                ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl)) {
                        retval = 1;
@@ -907,9 +905,6 @@ ptmx_kqops_touch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new kevent state */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0) {
-               kn->kn_udata = kev->udata;
-       }
 
        /* recapture fired state of knote */
        ret = ptmx_kqops_common(kn, pti, tp);
index e05d13c3e64f09411c3a136efb0200fe4a1a7306..6f09debf794c7940d9068cc7e85761ebfb1d6724 100644 (file)
@@ -81,9 +81,10 @@ extern kern_return_t memory_object_pages_resident(memory_object_control_t,
                                                        boolean_t *);
 extern kern_return_t   memory_object_signed(memory_object_control_t control,
                                             boolean_t is_signed);
-extern boolean_t       memory_object_is_slid(memory_object_control_t   control);
 extern boolean_t       memory_object_is_signed(memory_object_control_t);
 
+/* XXX Same for those. */
+
 extern void Debugger(const char *message);
 
 
@@ -112,7 +113,7 @@ static int ubc_msync_internal(vnode_t, off_t, off_t, off_t *, int, int *);
 static void ubc_cs_free(struct ubc_info *uip);
 
 static boolean_t ubc_cs_supports_multilevel_hash(struct cs_blob *blob);
-static void ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob);
+static kern_return_t ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob);
 
 struct zone    *ubc_info_zone;
 static uint32_t        cs_blob_generation_count = 1;
@@ -477,19 +478,18 @@ cs_validate_blob(const CS_GenericBlob *blob, size_t length)
 static int
 cs_validate_csblob(
        const uint8_t *addr,
-       size_t *blob_size_p,
+       const size_t blob_size,
        const CS_CodeDirectory **rcd,
        const CS_GenericBlob **rentitlements)
 {
        const CS_GenericBlob *blob;
        int error;
-       size_t length, blob_size;
+       size_t length;
 
        *rcd = NULL;
        *rentitlements = NULL;
 
        blob = (const CS_GenericBlob *)(const void *)addr;
-       blob_size = *blob_size_p;
 
        length = blob_size;
        error = cs_validate_blob(blob, length);
@@ -617,8 +617,6 @@ cs_validate_csblob(
        if (*rcd == NULL)
                return EBADEXEC;
 
-       *blob_size_p = blob_size;
-
        return 0;
 }
 
@@ -1467,17 +1465,6 @@ ubc_getobject(struct vnode *vp, __unused int flags)
        return (MEMORY_OBJECT_CONTROL_NULL);
 }
 
-boolean_t
-ubc_strict_uncached_IO(struct vnode *vp)
-{
-        boolean_t result = FALSE;
-
-       if (UBCINFOEXISTS(vp)) {
-               result = memory_object_is_slid(vp->v_ubcinfo->ui_control);
-       }
-       return result;
-}
-
 /*
  * ubc_blktooff
  *
@@ -2835,14 +2822,18 @@ ubc_cs_blob_allocate(
        vm_offset_t     *blob_addr_p,
        vm_size_t       *blob_size_p)
 {
-       kern_return_t   kr;
+       kern_return_t   kr = KERN_FAILURE;
 
-       *blob_addr_p = (vm_offset_t) kalloc_tag(*blob_size_p, VM_KERN_MEMORY_SECURITY);
-       if (*blob_addr_p == 0) {
-               kr = KERN_NO_SPACE;
-       } else {
-               kr = KERN_SUCCESS;
+       {
+               *blob_addr_p = (vm_offset_t) kalloc_tag(*blob_size_p, VM_KERN_MEMORY_SECURITY);
+
+               if (*blob_addr_p == 0) {
+                       kr = KERN_NO_SPACE;
+               } else {
+                       kr = KERN_SUCCESS;
+               }
        }
+
        return kr;
 }
 
@@ -2851,7 +2842,14 @@ ubc_cs_blob_deallocate(
        vm_offset_t     blob_addr,
        vm_size_t       blob_size)
 {
-       kfree((void *) blob_addr, blob_size);
+#if PMAP_CS
+       if (blob_size > pmap_cs_blob_limit) {
+               kmem_free(kernel_map, blob_addr, blob_size);
+       } else
+#endif
+       {
+               kfree((void *) blob_addr, blob_size);
+       }
 }
 
 /*
@@ -2871,6 +2869,7 @@ ubc_cs_supports_multilevel_hash(struct cs_blob *blob)
 {
        const CS_CodeDirectory *cd;
 
+       
        /*
         * Only applies to binaries that ship as part of the OS,
         * primarily the shared cache.
@@ -2935,11 +2934,25 @@ ubc_cs_supports_multilevel_hash(struct cs_blob *blob)
 }
 
 /*
- * All state and preconditions were checked before, so this
- * function cannot fail.
+ * Given a cs_blob with an already chosen best code directory, this
+ * function allocates memory and copies into it only the blobs that
+ * will be needed by the kernel, namely the single chosen code
+ * directory (and not any of its alternatives) and the entitlement
+ * blob.
+ *
+ * This saves significant memory with agile signatures, and additional
+ * memory for 3rd Party Code because we also omit the CMS blob.
+ *
+ * To support multilevel and other potential code directory rewriting,
+ * the size of a new code directory can be specified. Since that code
+ * directory will replace the existing code directory,
+ * ubc_cs_reconstitute_code_signature does not copy the original code
+ * directory when a size is given, and the caller must fill it in.
  */
-static void
-ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
+static int
+ubc_cs_reconstitute_code_signature(struct cs_blob const *blob, vm_size_t optional_new_cd_size,
+                                                                  vm_address_t *new_blob_addr_p, vm_size_t *new_blob_size_p,
+                                                                  CS_CodeDirectory **new_cd_p, CS_GenericBlob const **new_entitlements_p)
 {
        const CS_CodeDirectory  *old_cd, *cd;
        CS_CodeDirectory        *new_cd;
@@ -2949,20 +2962,10 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
        vm_size_t       new_cdsize;
        kern_return_t   kr;
        int                             error;
-       size_t          length;
-
-       uint32_t                hashes_per_new_hash_shift = (uint32_t)(PAGE_SHIFT - blob->csb_hash_pageshift);
-
-       if (cs_debug > 1) {
-               printf("CODE SIGNING: Attempting to convert Code Directory for %lu -> %lu page shift\n",
-                          (unsigned long)blob->csb_hash_pageshift, (unsigned long)PAGE_SHIFT);
-       }
 
        old_cd = blob->csb_cd;
 
-       /* Up to the hashes, we can copy all data */
-       new_cdsize  = ntohl(old_cd->hashOffset);
-       new_cdsize += (ntohl(old_cd->nCodeSlots) >> hashes_per_new_hash_shift) * old_cd->hashSize;
+       new_cdsize = optional_new_cd_size != 0 ? optional_new_cd_size : htonl(old_cd->length);
 
        new_blob_size  = sizeof(CS_SuperBlob);
        new_blob_size += sizeof(CS_BlobIndex);
@@ -2980,7 +2983,7 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
                        printf("CODE SIGNING: Failed to allocate memory for new Code Signing Blob: %d\n",
                                   kr);
                }
-               return;
+               return ENOMEM;
        }
 
        CS_SuperBlob            *new_superblob;
@@ -3004,15 +3007,69 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
 
                new_cd = (CS_CodeDirectory *)(new_blob_addr + cd_offset);
        } else {
-               vm_size_t                       cd_offset;
+               // Blob is the code directory, directly.
+               new_cd = (CS_CodeDirectory *)new_blob_addr;
+       }
 
-               cd_offset  = sizeof(CS_SuperBlob) + 1 * sizeof(CS_BlobIndex);
+       if (optional_new_cd_size == 0) {
+               // Copy code directory, and revalidate.
+               memcpy(new_cd, old_cd, new_cdsize);
 
-               new_superblob->count = htonl(1);
-               new_superblob->index[0].type = htonl(CSSLOT_CODEDIRECTORY);
-               new_superblob->index[0].offset = htonl((uint32_t)cd_offset);
+               vm_size_t length = new_blob_size;
 
-               new_cd = (CS_CodeDirectory *)new_blob_addr;
+               error = cs_validate_csblob((const uint8_t *)new_blob_addr, length, &cd, &entitlements);
+
+               if (error) {
+                       printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n",
+                                  error);
+
+                       ubc_cs_blob_deallocate(new_blob_addr, new_blob_size);
+                       return error;
+               }
+               *new_entitlements_p = entitlements;
+       } else {
+               // Caller will fill out and validate code directory.
+               memset(new_cd, 0, new_cdsize);
+               *new_entitlements_p = NULL;
+       }
+
+       *new_blob_addr_p = new_blob_addr;
+       *new_blob_size_p = new_blob_size;
+       *new_cd_p = new_cd;
+
+       return 0;
+}
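
For orientation, here is a hedged sketch (not part of the commit) of how the two calling modes documented above would be used from within this file; the caller, the local variable names, and wanted_cd_size are illustrative stand-ins only.

	/* Mode 1: pass 0 to get a copied, freshly revalidated code directory. */
	vm_address_t new_addr = 0;
	vm_size_t new_size = 0;
	CS_CodeDirectory *new_cd = NULL;
	CS_GenericBlob const *new_ents = NULL;

	if (ubc_cs_reconstitute_code_signature(blob, 0,
	        &new_addr, &new_size, &new_cd, &new_ents) == 0) {
		/* new_addr/new_size hold a validated blob; new_cd and new_ents point into it. */
	}

	/* Mode 2: pass a size to get a zero-filled code directory of that size;
	 * the caller must fill it in and validate the result (new_ents is NULL). */
	if (ubc_cs_reconstitute_code_signature(blob, wanted_cd_size,
	        &new_addr, &new_size, &new_cd, &new_ents) == 0) {
		/* populate new_cd, then run cs_validate_csblob() before swapping it in */
	}
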
+
+static int
+ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
+{
+       const CS_CodeDirectory  *old_cd, *cd;
+       CS_CodeDirectory        *new_cd;
+       const CS_GenericBlob *entitlements;
+       vm_offset_t     new_blob_addr;
+       vm_size_t       new_blob_size;
+       vm_size_t       new_cdsize;
+       int                             error;
+
+       uint32_t                hashes_per_new_hash_shift = (uint32_t)(PAGE_SHIFT - blob->csb_hash_pageshift);
+
+       if (cs_debug > 1) {
+               printf("CODE SIGNING: Attempting to convert Code Directory for %lu -> %lu page shift\n",
+                          (unsigned long)blob->csb_hash_pageshift, (unsigned long)PAGE_SHIFT);
+       }
+
+       old_cd = blob->csb_cd;
+
+       /* Up to the hashes, we can copy all data */
+       new_cdsize  = ntohl(old_cd->hashOffset);
+       new_cdsize += (ntohl(old_cd->nCodeSlots) >> hashes_per_new_hash_shift) * old_cd->hashSize;
+
+       error = ubc_cs_reconstitute_code_signature(blob, new_cdsize,
+                                                                                       &new_blob_addr, &new_blob_size, &new_cd,
+                                                                                       &entitlements);
+       if (error != 0) {
+               printf("CODE SIGNING: Failed to reconstitute code signature: %d\n", error);
+               return error;
        }
 
        memcpy(new_cd, old_cd, ntohl(old_cd->hashOffset));
@@ -3066,21 +3123,17 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
                blob->csb_hashtype->cs_final(dst, &mdctx);
        }
 
-       length = new_blob_size;
-       error = cs_validate_csblob((const uint8_t *)new_blob_addr, &length, &cd, &entitlements);
-       assert(length == new_blob_size);
-       if (error) {
+       error = cs_validate_csblob((const uint8_t *)new_blob_addr, new_blob_size, &cd, &entitlements);
+       if (error != 0) {
 
-               if (cs_debug > 1) {
-                       printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n",
-                                  error);
-               }
+               printf("CODE SIGNING: Failed to validate new Code Signing Blob: %d\n",
+                          error);
 
                ubc_cs_blob_deallocate(new_blob_addr, new_blob_size);
-               return;
+               return error;
        }
 
-       /* New Code Directory is ready for use, swap it out in the blob structure */
+    /* New Code Directory is ready for use, swap it out in the blob structure */
        ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
 
        blob->csb_mem_size = new_blob_size;
@@ -3103,31 +3156,33 @@ ubc_cs_convert_to_multilevel_hash(struct cs_blob *blob)
        } else {
                blob->csb_start_offset = 0;
        }
+
+       return 0;
 }
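
To make the conversion above concrete, with illustrative numbers only: a blob signed with 4 KiB code-signing pages has csb_hash_pageshift = 12, so on a 16 KiB-page kernel (PAGE_SHIFT = 14) hashes_per_new_hash_shift is 2 and every four old page hashes are re-hashed into one new slot. With, say, nCodeSlots = 40, hashSize = 32 (SHA-256) and hashOffset = 152, the rewritten directory needs new_cdsize = 152 + (40 >> 2) * 32 = 472 bytes, which is the size handed to ubc_cs_reconstitute_code_signature above.
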
 
+/*
+ * Validate the code signature blob, create a struct cs_blob wrapper
+ * and return it together with a pointer to the chosen code directory
+ * and entitlements blob.
+ *
+ * Note that this takes ownership of the memory at addr, mainly because
+ * this function can actually replace the passed in blob with another
+ * one, e.g. when performing multilevel hashing optimization.
+ */
 int
-ubc_cs_blob_add(
-       struct vnode    *vp,
-       cpu_type_t      cputype,
-       off_t           base_offset,
-       vm_address_t    *addr,
-       vm_size_t       size,
-       struct image_params *imgp,
-       __unused int    flags,
-       struct cs_blob  **ret_blob)
+cs_blob_create_validated(
+       vm_address_t * const            addr,
+       vm_size_t                       size,
+       struct cs_blob ** const         ret_blob,
+    CS_CodeDirectory const ** const    ret_cd)
 {
-       kern_return_t           kr;
-       struct ubc_info         *uip;
-       struct cs_blob          *blob, *oblob;
-       int                     error;
+       struct cs_blob          *blob;
+       int             error = EINVAL;
        const CS_CodeDirectory *cd;
        const CS_GenericBlob *entitlements;
-       off_t                   blob_start_offset, blob_end_offset;
        union cs_hash_union     mdctx;
-       boolean_t               record_mtime;
        size_t                  length;
 
-       record_mtime = FALSE;
        if (ret_blob)
            *ret_blob = NULL;
 
@@ -3137,8 +3192,6 @@ ubc_cs_blob_add(
        }
 
        /* fill in the new blob */
-       blob->csb_cpu_type = cputype;
-       blob->csb_base_offset = base_offset;
        blob->csb_mem_size = size;
        blob->csb_mem_offset = 0;
        blob->csb_mem_kaddr = *addr;
@@ -3149,7 +3202,8 @@ ubc_cs_blob_add(
        blob->csb_teamid = NULL;
        blob->csb_entitlements_blob = NULL;
        blob->csb_entitlements = NULL;
-       
+       blob->csb_reconstituted = false;
+
        /* Transfer ownership. Even on error, this function will deallocate */
        *addr = 0;
 
@@ -3158,7 +3212,7 @@ ubc_cs_blob_add(
         */
        length = (size_t) size;
        error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr,
-                                  &length, &cd, &entitlements);
+                                                          length, &cd, &entitlements);
        if (error) {
 
                if (cs_debug)
@@ -3173,49 +3227,6 @@ ubc_cs_blob_add(
                uint8_t hash[CS_HASH_MAX_SIZE];
                int md_size;
 
-               size = (vm_size_t) length;
-               assert(size <= blob->csb_mem_size);
-               if (size < blob->csb_mem_size) {
-                       vm_address_t new_blob_addr;
-                       const CS_CodeDirectory *new_cd;
-                       const CS_GenericBlob *new_entitlements;
-
-                       kr = ubc_cs_blob_allocate(&new_blob_addr, &size);
-                       if (kr != KERN_SUCCESS) {
-                               if (cs_debug > 1) {
-                                       printf("CODE SIGNING: failed to "
-                                              "re-allocate blob (size "
-                                              "0x%llx->0x%llx) error 0x%x\n",
-                                              (uint64_t)blob->csb_mem_size,
-                                              (uint64_t)size,
-                                              kr);
-                               }
-                       } else {
-                               memcpy((void *)new_blob_addr, (void *)blob->csb_mem_kaddr, size);
-                               if (cd == NULL) {
-                                       new_cd = NULL;
-                               } else {
-                                       new_cd = (void *)(((uintptr_t)cd
-                                                 - (uintptr_t)blob->csb_mem_kaddr
-                                                 + (uintptr_t)new_blob_addr));
-                               }
-                               if (entitlements == NULL) {
-                                       new_entitlements = NULL;
-                               } else {
-                                       new_entitlements = (void *)(((uintptr_t)entitlements
-                                                           - (uintptr_t)blob->csb_mem_kaddr
-                                                           + (uintptr_t)new_blob_addr));
-                               }
-//                             printf("CODE SIGNING: %s:%d kaddr 0x%llx cd %p ents %p -> blob 0x%llx cd %p ents %p\n", __FUNCTION__, __LINE__, (uint64_t)blob->csb_mem_kaddr, cd, entitlements, (uint64_t)new_blob_addr, new_cd, new_entitlements);
-                               ubc_cs_blob_deallocate(blob->csb_mem_kaddr,
-                                                      blob->csb_mem_size);
-                               blob->csb_mem_kaddr = new_blob_addr;
-                               blob->csb_mem_size = size;
-                               cd = new_cd;
-                               entitlements = new_entitlements;
-                       }
-               }
-
                blob->csb_cd = cd;
                blob->csb_entitlements_blob = entitlements; /* may be NULL, not yet validated */
                blob->csb_hashtype = cs_find_md(cd->hashType);
@@ -3246,7 +3257,81 @@ ubc_cs_blob_add(
                memcpy(blob->csb_cdhash, hash, CS_CDHASH_LEN);
        }
 
-       /* 
+    error = 0;
+
+out:
+    if (error != 0) {
+        cs_blob_free(blob);
+        blob = NULL;
+        cd = NULL;
+    }
+
+    if (ret_blob != NULL) {
+        *ret_blob = blob;
+    }
+    if (ret_cd != NULL) {
+        *ret_cd = cd;
+    }
+
+    return error;
+}
+
+/*
+ * Free a cs_blob previously created by cs_blob_create_validated.
+ */
+void
+cs_blob_free(
+    struct cs_blob * const blob)
+{
+    if (blob != NULL) {
+        if (blob->csb_mem_kaddr) {
+            ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
+            blob->csb_mem_kaddr = 0;
+        }
+        if (blob->csb_entitlements != NULL) {
+            osobject_release(blob->csb_entitlements);
+            blob->csb_entitlements = NULL;
+        }
+        kfree(blob, sizeof (*blob));
+    }
+}
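
A minimal, hypothetical pairing of the two helpers above, to illustrate the ownership rule described before cs_blob_create_validated; the surrounding code and variable names are not from the commit.

	vm_address_t addr = 0;          /* filled from ubc_cs_blob_allocate() plus the signature bytes */
	vm_size_t size = 0;
	struct cs_blob *blob = NULL;
	CS_CodeDirectory const *cd = NULL;

	int error = cs_blob_create_validated(&addr, size, &blob, &cd);
	/* Ownership of the memory moved into the wrapper; addr is now 0 either way. */
	if (error == 0) {
		/* use blob->csb_cd, blob->csb_cdhash, cd, ... */
		cs_blob_free(blob);     /* releases the wrapped blob memory and the wrapper */
	}
	/* On error the memory was already deallocated; nothing is left to free here. */
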
+
+int
+ubc_cs_blob_add(
+       struct vnode    *vp,
+       cpu_type_t      cputype,
+       off_t           base_offset,
+       vm_address_t    *addr,
+       vm_size_t       size,
+       struct image_params *imgp,
+       __unused int    flags,
+       struct cs_blob  **ret_blob)
+{
+       kern_return_t           kr;
+       struct ubc_info         *uip;
+       struct cs_blob          *blob, *oblob;
+       int                     error;
+       CS_CodeDirectory const *cd;
+       off_t                   blob_start_offset, blob_end_offset;
+       boolean_t               record_mtime;
+
+       record_mtime = FALSE;
+       if (ret_blob)
+           *ret_blob = NULL;
+    /* Create the struct cs_blob wrapper that will be attached to the vnode.
+     * Validates the passed in blob in the process. */
+    error = cs_blob_create_validated(addr, size, &blob, &cd);
+
+    if (error != 0) {
+               printf("malformed code signature blob: %d\n", error);
+        return error;
+    }
+
+    blob->csb_cpu_type = cputype;
+       blob->csb_base_offset = base_offset;
+
+       /*
         * Let policy module check whether the blob's signature is accepted.
         */
 #if CONFIG_MACF
@@ -3269,6 +3354,38 @@ ubc_cs_blob_add(
        }
 #endif
 
+#if CONFIG_ENFORCE_SIGNED_CODE
+       /*
+        * Reconstitute code signature
+        */
+       {
+               vm_address_t new_mem_kaddr = 0;
+               vm_size_t new_mem_size = 0;
+
+               CS_CodeDirectory *new_cd = NULL;
+               CS_GenericBlob const *new_entitlements = NULL;
+
+               error = ubc_cs_reconstitute_code_signature(blob, 0,
+                                                                                                  &new_mem_kaddr, &new_mem_size,
+                                                                                                  &new_cd, &new_entitlements);
+
+               if (error != 0) {
+                       printf("failed code signature reconstitution: %d\n", error);
+                       goto out;
+               }
+
+               ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
+
+               blob->csb_mem_kaddr = new_mem_kaddr;
+               blob->csb_mem_size = new_mem_size;
+               blob->csb_cd = new_cd;
+               blob->csb_entitlements_blob = new_entitlements;
+               blob->csb_reconstituted = true;
+       }
+
+#endif
+
+
        if (blob->csb_flags & CS_PLATFORM_BINARY) {
                if (cs_debug > 1)
                        printf("check_signature[pid: %d]: platform binary\n", current_proc()->p_pid);
@@ -3301,7 +3418,12 @@ ubc_cs_blob_add(
        }
 
        if (ubc_cs_supports_multilevel_hash(blob)) {
-               ubc_cs_convert_to_multilevel_hash(blob);
+               error = ubc_cs_convert_to_multilevel_hash(blob);
+               if (error != 0) {
+                       printf("failed multilevel hash conversion: %d\n", error);
+                       goto out;
+               }
+               blob->csb_reconstituted = true;
        }
 
        vnode_lock(vp);
@@ -3378,6 +3500,11 @@ ubc_cs_blob_add(
                                          */
                                         oblob->csb_cpu_type = cputype;
                                 }
+
+                                /* The signature is still accepted, so update the
+                                 * generation count. */
+                                uip->cs_add_gen = cs_blob_generation_count;
+
                                 vnode_unlock(vp);
                                 if (ret_blob)
                                         *ret_blob = oblob;
@@ -3464,19 +3591,7 @@ out:
                if (cs_debug)
                        printf("check_signature[pid: %d]: error = %d\n", current_proc()->p_pid, error);
 
-               /* we failed; release what we allocated */
-               if (blob) {
-                       if (blob->csb_mem_kaddr) {
-                               ubc_cs_blob_deallocate(blob->csb_mem_kaddr, blob->csb_mem_size);
-                               blob->csb_mem_kaddr = 0;
-                       }
-                       if (blob->csb_entitlements != NULL) {
-                               osobject_release(blob->csb_entitlements);
-                               blob->csb_entitlements = NULL;
-                       }
-                       kfree(blob, sizeof (*blob));
-                       blob = NULL;
-               }
+        cs_blob_free(blob);
        }
 
        if (error == EAGAIN) {
@@ -3577,18 +3692,9 @@ ubc_cs_free(
             blob != NULL;
             blob = next_blob) {
                next_blob = blob->csb_next;
-               if (blob->csb_mem_kaddr != 0) {
-                       ubc_cs_blob_deallocate(blob->csb_mem_kaddr,
-                                              blob->csb_mem_size);
-                       blob->csb_mem_kaddr = 0;
-               }
-               if (blob->csb_entitlements != NULL) {
-                       osobject_release(blob->csb_entitlements);
-                       blob->csb_entitlements = NULL;
-               }
                OSAddAtomic(-1, &cs_blob_count);
                OSAddAtomic((SInt32) -blob->csb_mem_size, &cs_blob_size);
-               kfree(blob, sizeof (*blob));
+               cs_blob_free(blob);
        }
 #if CHECK_CS_VALIDATION_BITMAP
        ubc_cs_validation_bitmap_deallocate( uip->ui_vnode );
@@ -3634,17 +3740,45 @@ ubc_cs_blob_revalidate(
 
        size = blob->csb_mem_size;
        error = cs_validate_csblob((const uint8_t *)blob->csb_mem_kaddr,
-                                  &size, &cd, &entitlements);
+                                                          size, &cd, &entitlements);
        if (error) {
                if (cs_debug) {
                        printf("CODESIGNING: csblob invalid: %d\n", error);
                }
                goto out;
        }
-       assert(size == blob->csb_mem_size);
 
     unsigned int cs_flags = (ntohl(cd->flags) & CS_ALLOWED_MACHO) | CS_VALID;
     unsigned int signer_type = CS_SIGNER_TYPE_UNKNOWN;
+
+       if (blob->csb_reconstituted) {
+               /*
+                * Code signatures that have been modified after validation
+                * cannot be revalidated inline from their in-memory blob.
+                *
+                * That's okay, though, because the only path left that relies
+                * on revalidation of existing in-memory blobs is the legacy
+                * detached signature database path, which only exists on macOS
+                * and does not do reconstitution of any kind.
+                */
+               if (cs_debug) {
+                       printf("CODESIGNING: revalidate: not inline revalidating reconstituted signature.\n");
+               }
+
+               /*
+                * EAGAIN tells the caller that they may reread the code
+                * signature and try attaching it again, which is the same
+                * thing they would do if there was no cs_blob yet in the
+                * first place.
+                *
+                * Conveniently, after ubc_cs_blob_add did a successful
+                * validation, it will detect that a matching cs_blob (cdhash,
+                * offset, arch etc.) already exists, and return success
+                * without re-adding a cs_blob to the vnode.
+                */
+               return EAGAIN;
+       }
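
As an aside (not in the commit), the contract described in the comment above could be exercised by a caller roughly as follows; ubc_cs_blob_revalidate is assumed to keep its usual vnode/blob/imgp/flags arguments, and reread_signature_from_vnode is a purely hypothetical helper.

	error = ubc_cs_blob_revalidate(vp, blob, imgp, flags);
	if (error == EAGAIN) {
		/* Reconstituted blob: re-read the signature and attach it again.
		 * ubc_cs_blob_add will find the matching blob already on the vnode
		 * and return success without adding a duplicate. */
		vm_address_t addr = 0;
		vm_size_t size = 0;
		if (reread_signature_from_vnode(vp, &addr, &size) == 0) {       /* hypothetical */
			error = ubc_cs_blob_add(vp, cputype, base_offset,
			    &addr, size, imgp, 0, NULL);
		}
	}
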
+
        /* callout to mac_vnode_check_signature */
 #if CONFIG_MACF
        error = mac_vnode_check_signature(vp, blob, imgp, &cs_flags, &signer_type, flags);
@@ -4150,3 +4284,66 @@ void     ubc_cs_validation_bitmap_deallocate(__unused vnode_t vp){
        return;
 }
 #endif /* CHECK_CS_VALIDATION_BITMAP */
+
+#if PMAP_CS
+kern_return_t
+cs_associate_blob_with_mapping(
+       void                    *pmap,
+       vm_map_offset_t         start,
+       vm_map_size_t           size,
+       vm_object_offset_t      offset,
+       void                    *blobs_p)
+{
+       off_t                   blob_start_offset, blob_end_offset;
+       kern_return_t           kr;
+       struct cs_blob          *blobs, *blob;
+       vm_offset_t             kaddr;
+       struct pmap_cs_code_directory *cd_entry = NULL;
+
+       if (!pmap_cs) {
+               return KERN_NOT_SUPPORTED;
+       }
+       
+       blobs = (struct cs_blob *)blobs_p;
+
+       for (blob = blobs;
+            blob != NULL;
+            blob = blob->csb_next) {
+               blob_start_offset = (blob->csb_base_offset +
+                                    blob->csb_start_offset);
+               blob_end_offset = (blob->csb_base_offset +
+                                  blob->csb_end_offset);
+               if ((off_t) offset < blob_start_offset ||
+                   (off_t) offset >= blob_end_offset ||
+                   (off_t) (offset + size) <= blob_start_offset ||
+                   (off_t) (offset + size) > blob_end_offset) {
+                       continue;
+               }
+               kaddr = blob->csb_mem_kaddr;
+               if (kaddr == 0) {
+                       /* blob data has been released */
+                       continue;
+               }
+               cd_entry = blob->csb_pmap_cs_entry;
+               if (cd_entry == NULL) {
+                       continue;
+               }
+
+               break;
+       }
+
+       if (cd_entry != NULL) {
+               kr = pmap_cs_associate(pmap,
+                                      cd_entry,
+                                      start,
+                                      size);
+       } else {
+               kr = KERN_CODESIGN_ERROR;
+       }
+#if 00
+       printf("FBDP %d[%s] pmap_cs_associate(%p,%p,0x%llx,0x%llx) -> kr=0x%x\n", proc_selfpid(), &(current_proc()->p_comm[0]), pmap, cd_entry, (uint64_t)start, (uint64_t)size, kr);
+       kr = KERN_SUCCESS;
+#endif
+       return kr;
+}
+#endif /* PMAP_CS */
index e03a08e6d5650fa547a0e3f3e18c89fe3d9f5186..4433e81bd4e5ec9275c9a766bb663ee8baf46daa 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
index c0b0519227b824fba6da95f44c0bd212975deb36..f33335a384a3bbcde507f313b0430560432f9f08 100644 (file)
@@ -344,6 +344,7 @@ static uint64_t mb_expand_16kcl_total;
 static boolean_t mbuf_worker_needs_wakeup; /* wait channel for mbuf worker */
 static uint32_t mbuf_worker_run_cnt;
 static uint64_t mbuf_worker_last_runtime;
+static uint64_t mbuf_drain_last_runtime;
 static int mbuf_worker_ready;  /* worker thread is runnable */
 static int ncpu;               /* number of CPUs */
 static ppnum_t *mcl_paddr;     /* Array of cluster physical addresses */
@@ -708,7 +709,7 @@ static char *mbuf_dump_buf;
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
  * Garbage collection is also enabled by default on embedded platforms.
  * mb_drain_maxint controls the amount of time to wait (in seconds) before
- * consecutive calls to m_drain().
+ * consecutive calls to mbuf_drain().
  */
 #if CONFIG_EMBEDDED
 static unsigned int mb_watchdog = 1;
@@ -801,6 +802,16 @@ static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
 static struct mbuf *m_split0(struct mbuf *, int, int, int);
 __private_extern__ void mbuf_report_peak_usage(void);
 static boolean_t mbuf_report_usage(mbuf_class_t);
+#if DEBUG || DEVELOPMENT
+#define mbwdog_logger(fmt, ...)  _mbwdog_logger(__func__, __LINE__, fmt, ## __VA_ARGS__)
+static void _mbwdog_logger(const char *func, const int line, const char *fmt, ...);
+static char *mbwdog_logging;
+const unsigned mbwdog_logging_size = 4096;
+static size_t mbwdog_logging_used;
+#else
+#define mbwdog_logger(fmt, ...)  do { } while (0)
+#endif
+static void mbuf_drain_locked(boolean_t);
 
 /* flags for m_copyback0 */
 #define        M_COPYBACK0_COPYBACK    0x0001  /* copyback from cp */
@@ -1528,6 +1539,7 @@ mbinit(void)
        _CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
        _CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
        _CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
+       _CASSERT(MBUF_SC2TC(MBUF_SC_SIG) == MBUF_TC_VI);
        _CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
        _CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
 
@@ -2258,7 +2270,9 @@ mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
 
        if ((w = mb_waiters) > 0)
                mb_waiters = 0;
-
+       if (w) {
+               mbwdog_logger("waking up all threads");
+       }
        lck_mtx_unlock(mbuf_mlock);
 
        if (w != 0)
@@ -2332,6 +2346,9 @@ mbuf_slab_notify(void *arg, u_int32_t reason)
                m_notified(class)++;
                mb_waiters = 0;
        }
+       if (w) {
+               mbwdog_logger("waking up all threads");
+       }
        lck_mtx_unlock(mbuf_mlock);
 
        if (w != 0)
@@ -2755,6 +2772,9 @@ mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
 
        if ((w = mb_waiters) > 0)
                mb_waiters = 0;
+       if (w) {
+               mbwdog_logger("waking up all threads");
+       }
 
        lck_mtx_unlock(mbuf_mlock);
 
@@ -3144,6 +3164,8 @@ out:
         * pool or if the number of free clusters is less than requested.
         */
        if (i > 0 && mbuf_worker_ready && mbuf_worker_needs_wakeup) {
+               mbwdog_logger("waking up the worker thread to grow %s by %d",
+                   m_cname(class), i);
                wakeup((caddr_t)&mbuf_worker_needs_wakeup);
                mbuf_worker_needs_wakeup = FALSE;
        }
@@ -3317,8 +3339,10 @@ freelist_populate(mbuf_class_t class, unsigned int num, int wait)
 
                if ((i = mb_waiters) > 0)
                        mb_waiters = 0;
-               if (i != 0)
+               if (i != 0) {
+                       mbwdog_logger("waking up all threads");
                        wakeup(mb_waitchan);
+               }
        }
        return (count != 0);
 }
@@ -4808,37 +4832,27 @@ m_freem(struct mbuf *m)
 /*
  * Mbuffer utility routines.
  */
-
 /*
- * Compute the amount of space available before the current start
- * of data in an mbuf.
+ * Set the m_data pointer of a newly allocated mbuf to place an object of the
+ * specified size at the end of the mbuf, longword aligned.
+ *
+ * NB: Historically, we had M_ALIGN(), MH_ALIGN(), and MEXT_ALIGN() as
+ * separate macros, each asserting that it was called at the proper moment.
+ * This required callers to themselves test the storage type and call the
+ * right one.  Rather than require callers to be aware of those layout
+ * decisions, we centralize here.
  */
-int
-m_leadingspace(struct mbuf *m)
+void
+m_align(struct mbuf *m, int len)
 {
-       if (m->m_flags & M_EXT) {
-               if (MCLHASREFERENCE(m))
-                       return (0);
-               return (m->m_data - m->m_ext.ext_buf);
-       }
-       if (m->m_flags & M_PKTHDR)
-               return (m->m_data - m->m_pktdat);
-       return (m->m_data - m->m_dat);
-}
+       int adjust = 0;
 
-/*
- * Compute the amount of space available after the end of data in an mbuf.
- */
-int
-m_trailingspace(struct mbuf *m)
-{
-       if (m->m_flags & M_EXT) {
-               if (MCLHASREFERENCE(m))
-                       return (0);
-               return (m->m_ext.ext_buf + m->m_ext.ext_size -
-                   (m->m_data + m->m_len));
-       }
-       return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
+       /* At this point data must point to start */
+       VERIFY(m->m_data == M_START(m));
+       VERIFY(len >= 0);
+       VERIFY(len <= M_SIZE(m));
+       adjust = M_SIZE(m) - len;
+       m->m_data += adjust &~ (sizeof(long) - 1);
 }
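
A small, hypothetical illustration of the centralized helper above: place a 16-byte address record at the tail of a fresh mbuf so headers can still be prepended later. The sockaddr and the surrounding context are illustrative, not from the commit.

	struct sockaddr_in sin;         /* assume already filled in */
	struct mbuf *m;

	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m != NULL) {
		/* m_data currently equals M_START(m), as m_align() requires */
		m_align(m, sizeof (sin));
		m->m_len = sizeof (sin);
		bcopy(&sin, mtod(m, void *), sizeof (sin));
	}
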
 
 /*
@@ -5321,6 +5335,17 @@ m_pullup(struct mbuf *n, int len)
                    __func__, len);
                goto bad;
        }
+       if (len > MLEN) {
+               os_log_info(OS_LOG_DEFAULT, "%s: failed len %d too big",
+                   __func__, len);
+               goto bad;
+       }
+       if ((n->m_flags & M_EXT) == 0 &&
+           n->m_data >= &n->m_dat[MLEN]) {
+               os_log_info(OS_LOG_DEFAULT, "%s: m_data out of bounds",
+                   __func__);
+               goto bad;
+       }
 
        /*
         * If first mbuf has no cluster, and has room for len bytes
@@ -5328,7 +5353,7 @@ m_pullup(struct mbuf *n, int len)
         * otherwise allocate a new mbuf to prepend to the chain.
         */
        if ((n->m_flags & M_EXT) == 0 &&
-           n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
+           len < &n->m_dat[MLEN] - n->m_data && n->m_next != NULL) {
                if (n->m_len >= len)
                        return (n);
                m = n;
@@ -5355,11 +5380,11 @@ m_pullup(struct mbuf *n, int len)
                m->m_len += count;
                n->m_len -= count;
                space -= count;
-               if (n->m_len)
+               if (n->m_len != 0)
                        n->m_data += count;
                else
                        n = m_free(n);
-       } while (len > 0 && n);
+       } while (len > 0 && n != NULL);
        if (len > 0) {
                (void) m_free(m);
                goto bad;
@@ -5439,18 +5464,52 @@ m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
        struct mbuf *m, *n;
        unsigned len = len0, remain;
 
+       /*
+        * First iterate to the mbuf which contains the first byte of
+        * data at offset len0
+        */
        for (m = m0; m && len > m->m_len; m = m->m_next)
                len -= m->m_len;
        if (m == NULL)
                return (NULL);
+       /*
+        * len is now effectively the offset within the current
+        * mbuf at which we have to perform the split.
+        *
+        * remain becomes the tail length.
+        * Note that len can also be == m->m_len.
+        */
        remain = m->m_len - len;
-       if (copyhdr && (m0->m_flags & M_PKTHDR)) {
+
+       /*
+        * If the split point falls exactly at the end of the current mbuf
+        * (remain == 0), just make the second chain start at the next mbuf
+        * and return after making the necessary header adjustments.
+        */
+       if (copyhdr && (m0->m_flags & M_PKTHDR) && remain == 0) {
+               _MGETHDR(n, wait, m0->m_type);
+               if (n == NULL)
+                       return (NULL);
+               n->m_next = m->m_next;
+               m->m_next = NULL;
+               n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
+               n->m_pkthdr.len = m0->m_pkthdr.len - len0;
+               m0->m_pkthdr.len = len0;
+               return (n);
+       } else if (copyhdr && (m0->m_flags & M_PKTHDR)) {
                _MGETHDR(n, wait, m0->m_type);
                if (n == NULL)
                        return (NULL);
                n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
                n->m_pkthdr.len = m0->m_pkthdr.len - len0;
                m0->m_pkthdr.len = len0;
+
+               /*
+                * If the current mbuf points to external storage,
+                * the cluster can be shared: the last mbuf of the head
+                * chain and the first mbuf of the tail chain simply
+                * point at different offsets into the same data.
+                */
                if (m->m_flags & M_EXT)
                        goto extpacket;
                if (remain > MHLEN) {
@@ -5472,7 +5531,11 @@ m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
                _MGET(n, wait, m->m_type);
                if (n == NULL)
                        return (NULL);
-               M_ALIGN(n, remain);
+
+               if ((m->m_flags & M_EXT) == 0) {
+                       VERIFY(remain <= MLEN);
+                       M_ALIGN(n, remain);
+               }
        }
 extpacket:
        if (m->m_flags & M_EXT) {
@@ -5607,6 +5670,9 @@ m_howmany(int num, size_t bufsize)
        if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
            (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
            (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
+               mbwdog_logger("maxed out nclusters (%u >= %u) or njcl (%u >= %u)",
+                   sumclusters, nclusters,
+                   (m_16kclusters << NCLPJCLSHIFT), njcl);
                return (0);
        }
 
@@ -6550,8 +6616,9 @@ mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
                wakeup((caddr_t)&mbuf_worker_needs_wakeup);
                mbuf_worker_needs_wakeup = FALSE;
        }
-
+       mbwdog_logger("waiting (%d mbufs in class %s)", num, m_cname(class));
        (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
+       mbwdog_logger("woke up (%d mbufs in class %s) ", num, m_cname(class));
 
        /* We are now up; stop getting notified until next round */
        mbuf_waiter_dec(class, (wait & MCR_COMP));
@@ -6576,8 +6643,31 @@ mbuf_worker_thread(void)
 
        while (1) {
                lck_mtx_lock(mbuf_mlock);
+               mbwdog_logger("worker thread running");
                mbuf_worker_run_cnt++;
                mbuf_expand = 0;
+               /*
+                * Allocations are based on page size, so if we have depleted
+                * the reserved spaces, try to free mbufs from the major classes.
+                */
+#if PAGE_SIZE == 4096
+               uint32_t m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
+               uint32_t m_clusters = m_total(MC_CL);
+               uint32_t m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
+               uint32_t sumclusters = m_mbclusters + m_clusters + m_bigclusters;
+               if (sumclusters >= nclusters) {
+                       mbwdog_logger("reclaiming bigcl");
+                       mbuf_drain_locked(TRUE);
+                       m_reclaim(MC_BIGCL, 4, FALSE);
+               }
+#else
+               uint32_t m_16kclusters = m_total(MC_16KCL);
+               if (njcl > 0 && (m_16kclusters << NCLPJCLSHIFT) >= njcl) {
+                       mbwdog_logger("reclaiming 16kcl");
+                       mbuf_drain_locked(TRUE);
+                       m_reclaim(MC_16KCL, 4, FALSE);
+               }
+#endif
                if (m_region_expand(MC_CL) > 0) {
                        int n;
                        mb_expand_cl_cnt++;
@@ -6591,8 +6681,10 @@ mbuf_worker_thread(void)
                        }
                        m_region_expand(MC_CL) = 0;
 
-                       if (n > 0)
+                       if (n > 0) {
+                               mbwdog_logger("expanding MC_CL by %d", n);
                                freelist_populate(MC_CL, n, M_WAIT);
+                       }
                }
                if (m_region_expand(MC_BIGCL) > 0) {
                        int n;
@@ -6607,8 +6699,10 @@ mbuf_worker_thread(void)
                        }
                        m_region_expand(MC_BIGCL) = 0;
 
-                       if (n > 0)
+                       if (n > 0) {
+                               mbwdog_logger("expanding MC_BIGCL by %d", n);
                                freelist_populate(MC_BIGCL, n, M_WAIT);
+                       }
                }
                if (m_region_expand(MC_16KCL) > 0) {
                        int n;
@@ -6623,8 +6717,10 @@ mbuf_worker_thread(void)
                        }
                        m_region_expand(MC_16KCL) = 0;
 
-                       if (n > 0)
+                       if (n > 0) {
+                               mbwdog_logger("expanding MC_16KCL by %d", n);
                                (void) freelist_populate(MC_16KCL, n, M_WAIT);
+                       }
                }
 
                /*
@@ -6633,11 +6729,23 @@ mbuf_worker_thread(void)
                 * mbufs -- otherwise we could have a large number of useless
                 * clusters allocated.
                 */
-               while (m_total(MC_MBUF) <
-                   (m_total(MC_BIGCL) + m_total(MC_CL) + m_total(MC_16KCL))) {
+               mbwdog_logger("totals: MC_MBUF %d MC_BIGCL %d MC_CL %d MC_16KCL %d",
+                   m_total(MC_MBUF), m_total(MC_BIGCL), m_total(MC_CL),
+                   m_total(MC_16KCL));
+               uint32_t total_mbufs = m_total(MC_MBUF);
+               uint32_t total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
+                   m_total(MC_16KCL);
+               if (total_mbufs < total_clusters) {
+                       mbwdog_logger("expanding MC_MBUF by %d",
+                               total_clusters - total_mbufs);
+               }
+               while (total_mbufs < total_clusters) {
                        mb_expand_cnt++;
                        if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
                                break;
+                       total_mbufs = m_total(MC_MBUF);
+                       total_clusters = m_total(MC_BIGCL) + m_total(MC_CL) +
+                           m_total(MC_16KCL);
                }
 
                mbuf_worker_needs_wakeup = TRUE;
@@ -6650,6 +6758,7 @@ mbuf_worker_thread(void)
                mbuf_worker_last_runtime = net_uptime();
                assert_wait((caddr_t)&mbuf_worker_needs_wakeup,
                    THREAD_UNINT);
+               mbwdog_logger("worker thread sleeping");
                lck_mtx_unlock(mbuf_mlock);
                (void) thread_block((thread_continue_t)mbuf_worker_thread);
        }
@@ -7419,6 +7528,7 @@ mbuf_dump(void)
        mleak_trace_stat_t *mltr;
        char *c = mbuf_dump_buf;
        int i, j, k, clen = MBUF_DUMP_BUF_SIZE;
+       bool printed_banner = false;
 
        mbuf_dump_buf[0] = '\0';
 
@@ -7562,14 +7672,29 @@ mbuf_dump(void)
                    net_uptime() - mbuf_worker_last_runtime);
                MBUF_DUMP_BUF_CHK();
        }
+       if (mbuf_drain_last_runtime != 0) {
+               k = snprintf(c, clen, "drain routine last run time: "
+                   "%llu (%llu seconds ago)\n",
+                   mbuf_drain_last_runtime,
+                   net_uptime() - mbuf_drain_last_runtime);
+               MBUF_DUMP_BUF_CHK();
+       }
 
-       k = snprintf(c, clen, "\nlargest allocation failure backtraces:\n");
+#if DEBUG || DEVELOPMENT
+       k = snprintf(c, clen, "\nworker thread log:\n%s\n", mbwdog_logging != NULL ? mbwdog_logging : "");
        MBUF_DUMP_BUF_CHK();
+#endif
 
        for (j = 0; j < MTRACELARGE_NUM_TRACES; j++) {
                struct mtracelarge *trace = &mtracelarge_table[j];
                if (trace->size == 0 || trace->depth == 0)
                        continue;
+               if (printed_banner == false) {
+                       k = snprintf(c, clen,
+                           "\nlargest allocation failure backtraces:\n");
+                       MBUF_DUMP_BUF_CHK();
+                       printed_banner = true;
+               }
                k = snprintf(c, clen, "size %llu: < ", trace->size);
                MBUF_DUMP_BUF_CHK();
                for (i = 0; i < trace->depth; i++) {
@@ -8017,10 +8142,27 @@ mbuf_report_peak_usage(void)
 }
 
 /*
- * Called by the VM when there's memory pressure.
+ * Simple routine to avoid taking the lock when we can't run the
+ * mbuf drain.
  */
-__private_extern__ void
-m_drain(void)
+static int
+mbuf_drain_checks(boolean_t ignore_waiters)
+{
+
+       if (mb_drain_maxint == 0)
+               return 0;
+       if (!ignore_waiters && mb_waiters != 0)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Called by the VM when there's memory pressure or when we exhausted
+ * the 4k/16k reserved space.
+ */
+static void
+mbuf_drain_locked(boolean_t ignore_waiters)
 {
        mbuf_class_t mc;
        mcl_slab_t *sp, *sp_tmp, *nsp;
@@ -8030,11 +8172,11 @@ m_drain(void)
        ppnum_t offset;
        mcache_obj_t *obj;
        unsigned long per;
-       static uint64_t last_drain = 0;
        static unsigned char scratch[32];
        static ppnum_t scratch_pa = 0;
 
-       if (mb_drain_maxint == 0 || mb_waiters)
+       LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+       if (!mbuf_drain_checks(ignore_waiters))
                return;
        if (scratch_pa == 0) {
                bzero(scratch, sizeof(scratch));
@@ -8053,20 +8195,15 @@ m_drain(void)
         * waiting times for mbufs.  Purge caches if we were asked to drain
         * in the last 5 minutes.
         */
-       lck_mtx_lock(mbuf_mlock);
-       if (last_drain == 0) {
-               last_drain = net_uptime();
-               lck_mtx_unlock(mbuf_mlock);
-               return;
-       }
-       interval = net_uptime() - last_drain;
-       if (interval <= mb_drain_maxint) {
-               lck_mtx_unlock(mbuf_mlock);
-               return;
+       if (mbuf_drain_last_runtime != 0) {
+               interval = net_uptime() - mbuf_drain_last_runtime;
+               if (interval <= mb_drain_maxint) {
+                       return;
+               }
+               if (interval <= mb_drain_maxint * 5)
+                       purge_caches = TRUE;
        }
-       if (interval <= mb_drain_maxint * 5)
-               purge_caches = TRUE;
-       last_drain = net_uptime();
+       mbuf_drain_last_runtime = net_uptime();
        /*
         * Don't free any memory if we're using 60% or more.
         */
@@ -8076,7 +8213,6 @@ m_drain(void)
        }
        per = (use_mem * 100) / total_mem;
        if (per >= 60) {
-               lck_mtx_unlock(mbuf_mlock);
                return;
        }
        /*
@@ -8205,9 +8341,20 @@ m_drain(void)
        mbstat.m_mbufs = m_total(MC_MBUF);
        mbuf_stat_sync();
        mbuf_mtypes_sync(TRUE);
+}
+
+__private_extern__ void
+mbuf_drain(boolean_t ignore_waiters)
+{
+       LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_NOTOWNED);
+       if (!mbuf_drain_checks(ignore_waiters))
+               return;
+       lck_mtx_lock(mbuf_mlock);
+       mbuf_drain_locked(ignore_waiters);
        lck_mtx_unlock(mbuf_mlock);
 }
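
The split into a locked worker and an unlocked wrapper mirrors the assertions above; the call sites below are illustrative, not from the commit.

	/* From a path that does not hold mbuf_mlock, e.g. memory pressure: */
	mbuf_drain(FALSE);                      /* honors mb_drain_maxint and waiters */

	/* From code already under mbuf_mlock, e.g. the worker thread when a
	 * cluster class is exhausted: */
	lck_mtx_lock(mbuf_mlock);
	mbuf_drain_locked(TRUE);                /* ignore waiters and reclaim */
	lck_mtx_unlock(mbuf_mlock);
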
 
+
 static int
 m_drain_force_sysctl SYSCTL_HANDLER_ARGS
 {
@@ -8218,16 +8365,60 @@ m_drain_force_sysctl SYSCTL_HANDLER_ARGS
        if (err != 0 || req->newptr == USER_ADDR_NULL)
                return (err);
        if (val) {
-               lck_mtx_lock(mbuf_mlock);
-               printf("%s\n", mbuf_dump());
-               lck_mtx_unlock(mbuf_mlock);
-               m_drain();
+               mbuf_drain(TRUE);
        }
 
        return (err);
 }
 
 #if DEBUG || DEVELOPMENT
+static void
+_mbwdog_logger(const char *func, const int line, const char *fmt, ...)
+{
+       va_list ap;
+       struct timeval now;
+       char str[384], p[256];
+       int len;
+
+       LCK_MTX_ASSERT(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
+       if (mbwdog_logging == NULL) {
+               mbwdog_logging = _MALLOC(mbwdog_logging_size,
+                   M_TEMP, M_ZERO|M_NOWAIT);
+               if (mbwdog_logging == NULL)
+                       return;
+       }
+       va_start(ap, fmt);
+       vsnprintf(p, sizeof(p), fmt, ap);
+       va_end(ap);
+       microuptime(&now);
+       len = snprintf(str, sizeof(str),
+           "\n%ld.%d (%d/%llx) %s:%d %s",
+           now.tv_sec, now.tv_usec,
+           current_proc()->p_pid,
+           (uint64_t)VM_KERNEL_ADDRPERM(current_thread()),
+           func, line, p);
+       if (len < 0)
+               return;
+       if (mbwdog_logging_used + len > mbwdog_logging_size) {
+               mbwdog_logging_used = mbwdog_logging_used / 2;
+               memmove(mbwdog_logging, mbwdog_logging + mbwdog_logging_used,
+                   mbwdog_logging_size - mbwdog_logging_used);
+               mbwdog_logging[mbwdog_logging_used] = 0;
+       }
+       strlcat(mbwdog_logging, str, mbwdog_logging_size);
+       mbwdog_logging_used += len;
+}
+
+static int
+sysctl_mbwdog_log SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+    return SYSCTL_OUT(req, mbwdog_logging, mbwdog_logging_used);
+}
+SYSCTL_DECL(_kern_ipc);
+SYSCTL_PROC(_kern_ipc, OID_AUTO, mbwdog_log,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_LOCKED,
+    0, 0, sysctl_mbwdog_log, "A", "");
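
On DEBUG/DEVELOPMENT kernels this publishes the watchdog log as a read-only string sysctl; a hypothetical userspace reader (assuming the oid resolves to kern.ipc.mbwdog_log) could fetch it like so.

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <stdlib.h>

	int
	main(void)
	{
		size_t len = 0;

		/* First call sizes the buffer, second call copies the log out. */
		if (sysctlbyname("kern.ipc.mbwdog_log", NULL, &len, NULL, 0) != 0 || len == 0)
			return (1);
		char *buf = malloc(len + 1);
		if (buf == NULL ||
		    sysctlbyname("kern.ipc.mbwdog_log", buf, &len, NULL, 0) != 0)
			return (1);
		buf[len] = '\0';
		printf("%s\n", buf);
		free(buf);
		return (0);
	}
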
 
 static int mbtest_val;
 static int mbtest_running;
@@ -8299,8 +8490,7 @@ mbtest SYSCTL_HANDLER_ARGS
 
        return (error);
 }
-#endif
-
+#endif // DEBUG || DEVELOPMENT
 
 static void
 mtracelarge_register(size_t size)
index bd6b3030f6f2fdbfac9726b761ce4024220346b1..348afd442dca895922b04bb2327043c39e5df165 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 
 /* TODO: this should be in a header file somewhere */
 extern char *proc_name_address(void *p);
-extern char *proc_best_name(proc_t);
 
 static u_int32_t       so_cache_hw;    /* High water mark for socache */
 static u_int32_t       so_cache_timeouts;      /* number of timeouts */
@@ -2332,12 +2331,12 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
 
                                        if ((m->m_flags & M_EXT))
                                                mlen = m->m_ext.ext_size -
-                                                   m_leadingspace(m);
+                                                   M_LEADINGSPACE(m);
                                        else if ((m->m_flags & M_PKTHDR))
                                                mlen =
-                                                   MHLEN - m_leadingspace(m);
+                                                   MHLEN - M_LEADINGSPACE(m);
                                        else
-                                               mlen = MLEN - m_leadingspace(m);
+                                               mlen = MLEN - M_LEADINGSPACE(m);
                                        len = imin(mlen, bytes_to_copy);
 
                                        chainlength += len;
@@ -2431,8 +2430,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                 * Content filter processing
                                 */
                                error = cfil_sock_data_out(so, addr, top,
-                                   control, (sendflags & MSG_OOB) ?
-                                   sock_data_filt_flag_oob : 0);
+                                   control, sendflags);
                                if (error) {
                                        if (error == EJUSTRETURN) {
                                                error = 0;
@@ -2500,6 +2498,51 @@ out_locked:
        return (error);
 }
 
+int
+sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
+{
+       struct mbuf *m0, *control_end = NULL;   /* initialize: may never be set below */
+
+       socket_lock_assert_owned(so);
+
+       /*
+        * top must point to the mbuf chain to be sent.
+        * If control is not NULL, top must be a packet header.
+        */
+       VERIFY(top != NULL &&
+                  (control == NULL || top->m_flags & M_PKTHDR));
+
+       /*
+        * If control is not passed in, see if we can get it
+        * from top.
+        */
+       if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
+               // Locate start of control if present and start of data
+               for (m0 = top; m0 != NULL; m0 = m0->m_next) {
+                       if (m0->m_flags & M_PKTHDR) {
+                               top = m0;
+                               break;
+                       } else if (m0->m_type == MT_CONTROL) {
+                               if (control == NULL) {
+                                       // Found start of control
+                                       control = m0;
+                               }
+                               if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
+                                       // Found end of control
+                                       control_end = m0;
+                               }
+                       }
+               }
+               if (control_end != NULL)
+                       control_end->m_next = NULL;
+       }
+
+       int error = (*so->so_proto->pr_usrreqs->pru_send)
+                       (so, sendflags, top, addr, control, current_proc());
+
+       return error;
+}
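
A hypothetical re-injection call, e.g. from a content-filter completion path: the chain in top is assumed to have been held back earlier, and the socket lock must be owned, as socket_lock_assert_owned() above requires.

	socket_lock(so, 1);
	error = sosend_reinject(so, NULL, top, NULL, 0);
	socket_unlock(so, 1);
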
+
 /*
  * Supported only connected sockets (no address) without ancillary data
  * (control mbuf) for atomic protocols
@@ -2684,12 +2727,12 @@ sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
                        for (n = m; n != NULL; n = n->m_next) {
                                if ((m->m_flags & M_EXT))
                                        mlen = m->m_ext.ext_size -
-                                           m_leadingspace(m);
+                                           M_LEADINGSPACE(m);
                                else if ((m->m_flags & M_PKTHDR))
                                        mlen =
-                                           MHLEN - m_leadingspace(m);
+                                           MHLEN - M_LEADINGSPACE(m);
                                else
-                                       mlen = MLEN - m_leadingspace(m);
+                                       mlen = MLEN - M_LEADINGSPACE(m);
                                len = imin(mlen, bytes_to_copy);
 
                                /*
@@ -4798,6 +4841,7 @@ sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_TIMESTAMP_MONOTONIC:
+               case SO_TIMESTAMP_CONTINUOUS:
                case SO_DONTTRUNC:
                case SO_WANTMORE:
                case SO_WANTOOBFLAG:
@@ -5495,6 +5539,7 @@ sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_TIMESTAMP_MONOTONIC:
+               case SO_TIMESTAMP_CONTINUOUS:
                case SO_DONTTRUNC:
                case SO_WANTMORE:
                case SO_WANTOOBFLAG:
@@ -6187,8 +6232,6 @@ filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
        /* save off the new input fflags and data */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* determine if changes result in fired events */
        retval = filt_soread_common(kn, so);
@@ -6341,8 +6384,6 @@ filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
        /*save off the new input fflags and data */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* determine if these changes result in a triggered event */
        ret = filt_sowrite_common(kn, so);
@@ -6547,8 +6588,6 @@ filt_socktouch(
        /* save off the new input fflags and data */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* restrict the current results to the (smaller?) set of new interest */
        /*
@@ -6838,23 +6877,29 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
        if (so->so_flags & SOF_NODEFUNCT) {
                if (noforce) {
                        err = EOPNOTSUPP;
+                       if (p != PROC_NULL) {
+                               SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
+                                   "name %s level %d) so 0x%llx [%d,%d] "
+                                   "is not eligible for defunct "
+                                   "(%d)\n", __func__, proc_selfpid(),
+                                   proc_best_name(current_proc()), proc_pid(p),
+                                   proc_best_name(p), level,
+                                   (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
+                                   SOCK_DOM(so), SOCK_TYPE(so), err);
+                       }
+                       return (err);
+               }
+               so->so_flags &= ~SOF_NODEFUNCT;
+               if (p != PROC_NULL) {
                        SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
                            "name %s level %d) so 0x%llx [%d,%d] "
-                           "is not eligible for defunct "
+                           "defunct by force "
                            "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                            SOCK_DOM(so), SOCK_TYPE(so), err);
-                       return (err);
                }
-               so->so_flags &= ~SOF_NODEFUNCT;
-               SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
-                   "so 0x%llx [%d,%d] defunct by force\n", __func__,
-                   proc_selfpid(), proc_best_name(current_proc()),
-                   proc_pid(p), proc_best_name(p), level,
-                   (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
-                   SOCK_DOM(so), SOCK_TYPE(so));
        } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
                struct inpcb *inp = (struct inpcb *)so->so_pcb;
                struct ifnet *ifp = inp->inp_last_outifp;
@@ -6865,7 +6910,7 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
                } else if (soextbkidlestat.so_xbkidle_time == 0) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
-               } else if (noforce) {
+               } else if (noforce && p != PROC_NULL) {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
 
                        so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
@@ -6875,14 +6920,14 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
                        inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
 
                        err = EOPNOTSUPP;
-                       SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
-                           "level %d) extend bk idle so 0x%llx rcv hw %d "
-                           "cc %d\n",
-                           __func__, proc_selfpid(),
+                       SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
+                           "name %s level %d) so 0x%llx [%d,%d] "
+                           "extend bk idle "
+                           "(%d)\n", __func__, proc_selfpid(),
                            proc_best_name(current_proc()), proc_pid(p),
                            proc_best_name(p), level,
                            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
-                           so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
+                           SOCK_DOM(so), SOCK_TYPE(so), err);
                        return (err);
                } else {
                        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
@@ -6908,13 +6953,16 @@ sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
        }
 
 done:
-       SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
-           "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
-           proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
-           level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
-           SOCK_TYPE(so), defunct ? "is already" : "marked as",
-           (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");
-
+       if (p != PROC_NULL) {
+               SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
+                   "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
+                   proc_selfpid(), proc_best_name(current_proc()),
+                   proc_pid(p), proc_best_name(p), level,
+                   (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
+                   SOCK_TYPE(so), defunct ? "is already" : "marked as",
+                   (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
+                   " extbkidle" : "");
+       }
        return (err);
 }
 
@@ -6938,23 +6986,29 @@ sodefunct(struct proc *p, struct socket *so, int level)
                char d[MAX_IPv6_STR_LEN];
                struct inpcb *inp = sotoinpcb(so);
 
-               SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
-                   "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
-                   "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
-                   __func__, proc_selfpid(), proc_best_name(current_proc()),
-                   proc_pid(p), proc_best_name(p), level,
-                   (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
-                   (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
-                   inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
-                   (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
-                   s, sizeof (s)), ntohs(inp->in6p_lport),
-                   inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
-                   (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
-                   d, sizeof (d)), ntohs(inp->in6p_fport),
-                   (uint32_t)rcv->sb_sel.si_flags,
-                   (uint32_t)snd->sb_sel.si_flags,
-                   rcv->sb_flags, snd->sb_flags);
-       } else {
+               if (p != PROC_NULL) {
+                       SODEFUNCTLOG(
+                           "%s[%d, %s]: (target pid %d name %s level %d) "
+                           "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
+                           "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
+                           " snd_fl 0x%x]\n", __func__,
+                           proc_selfpid(), proc_best_name(current_proc()),
+                           proc_pid(p), proc_best_name(p), level,
+                           (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
+                           (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
+                           inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
+                           (void *)&inp->inp_laddr.s_addr :
+                           (void *)&inp->in6p_laddr),
+                           s, sizeof (s)), ntohs(inp->in6p_lport),
+                           inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
+                           (void *)&inp->inp_faddr.s_addr :
+                           (void *)&inp->in6p_faddr,
+                           d, sizeof (d)), ntohs(inp->in6p_fport),
+                           (uint32_t)rcv->sb_sel.si_flags,
+                           (uint32_t)snd->sb_sel.si_flags,
+                           rcv->sb_flags, snd->sb_flags);
+               }
+       } else if (p != PROC_NULL)  {
                SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
                    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
                    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
index 87d06ad39955b4ada649aaf5a1079655b137eb08..264819a7c28b1f126a638555ccdb5fecea237de1 100644 (file)
 #define        DBG_FNC_SBDROP          NETDBG_CODE(DBG_NETSOCK, 4)
 #define        DBG_FNC_SBAPPEND        NETDBG_CODE(DBG_NETSOCK, 5)
 
-extern char *proc_best_name(proc_t p);
-
 SYSCTL_DECL(_kern_ipc);
 
 __private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0;
@@ -116,8 +114,6 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort,
 
 static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
 static struct socket *sonewconn_internal(struct socket *, int);
-static int sbappendaddr_internal(struct sockbuf *, struct sockaddr *,
-    struct mbuf *, struct mbuf *);
 static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
     struct mbuf *);
 static void soevent_ifdenied(struct socket *);
@@ -1110,23 +1106,20 @@ again:
 }
 
 /*
- * Append address and data, and optionally, control (ancillary) data
- * to the receive queue of a socket.  If present,
- * m0 must include a packet header with total length.
- * Returns 0 if no space in sockbuf or insufficient mbufs.
+ * Concatenate the address (optional), control (optional) and data into a
+ * single mbuf chain.  If a sockbuf *sb is passed in, its space is
+ * checked.
  *
- * Returns:    0                       No space/out of mbufs
- *             1                       Success
+ * Returns:    mbuf chain pointer on success, NULL on failure
  */
-static int
-sbappendaddr_internal(struct sockbuf *sb, struct sockaddr *asa,
-    struct mbuf *m0, struct mbuf *control)
+struct mbuf *
+sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control)
 {
-       struct mbuf *m, *n, *nlast;
-       int space = asa->sa_len;
+       struct mbuf *m = NULL, *n = NULL;
+       int space = 0;
 
        if (m0 && (m0->m_flags & M_PKTHDR) == 0)
-               panic("sbappendaddr");
+               panic("sbconcat_mbufs");
 
        if (m0)
                space += m0->m_pkthdr.len;
@@ -1135,22 +1128,59 @@ sbappendaddr_internal(struct sockbuf *sb, struct sockaddr *asa,
                if (n->m_next == 0)     /* keep pointer to last control buf */
                        break;
        }
-       if (space > sbspace(sb))
-               return (0);
-       if (asa->sa_len > MLEN)
-               return (0);
-       MGET(m, M_DONTWAIT, MT_SONAME);
-       if (m == 0)
-               return (0);
-       m->m_len = asa->sa_len;
-       bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+
+       if (asa != NULL) {
+               if (asa->sa_len > MLEN)
+                       return (NULL);
+               space += asa->sa_len;
+       }
+
+       if (sb != NULL && space > sbspace(sb))
+               return (NULL);
+
        if (n)
                n->m_next = m0;         /* concatenate data to control */
        else
                control = m0;
-       m->m_next = control;
 
-       SBLASTRECORDCHK(sb, "sbappendadddr 1");
+       if (asa != NULL) {
+               MGET(m, M_DONTWAIT, MT_SONAME);
+               if (m == 0) {
+                       if (n) {
+                               /* unchain control and data if necessary */
+                               n->m_next = NULL;
+                       }
+                       return (NULL);
+               }
+               m->m_len = asa->sa_len;
+               bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+
+               m->m_next = control;
+       } else {
+               m = control;
+       }
+
+       return (m);
+}
+
+/*
+ * Queue an mbuf chain on the receive queue of a socket.
+ * Parameter space is the total length of the mbuf chain.
+ * If space is non-zero, the sockbuf space check is performed.
+ *
+ * Returns:    0               No space in sockbuf or invalid mbuf chain
+ *             1               Success
+ */
+int
+sbappendchain(struct sockbuf *sb, struct mbuf *m, int space)
+{
+       struct mbuf *n, *nlast;
+
+       if (m == NULL)
+               return (0);
+
+       if (space != 0 && space > sbspace(sb))
+               return (0);
 
        for (n = m; n->m_next != NULL; n = n->m_next)
                sballoc(sb, n);
@@ -1186,6 +1216,7 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
 {
        int result = 0;
        boolean_t sb_unix = (sb->sb_flags & SB_UNIX);
+       struct mbuf *mbuf_chain = NULL;
 
        if (error_out)
                *error_out = 0;
@@ -1230,7 +1261,9 @@ sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
                m0->m_flags &= ~M_SKIPCFIL;
        }
 
-       result = sbappendaddr_internal(sb, asa, m0, control);
+       mbuf_chain = sbconcat_mbufs(sb, asa, m0, control);
+       SBLASTRECORDCHK(sb, "sbappendaddr 1");
+       result = sbappendchain(sb, mbuf_chain, 0);
        if (result == 0) {
                if (m0)
                        m_freem(m0);
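
A minimal sketch of how the split introduced above is meant to compose (hypothetical caller, not part of this commit): build the record with sbconcat_mbufs(), then queue it with sbappendchain(); when either step fails the caller still owns m0 and control and must free them, mirroring the sbappendaddr() hunk above.

    /* Hypothetical caller illustrating the sbconcat_mbufs()/sbappendchain()
     * split; the error handling follows the sbappendaddr() pattern above. */
    static int
    append_datagram(struct sockbuf *sb, struct sockaddr *asa,
        struct mbuf *m0, struct mbuf *control)
    {
            struct mbuf *chain;

            chain = sbconcat_mbufs(sb, asa, m0, control);
            SBLASTRECORDCHK(sb, "append_datagram 1");
            if (sbappendchain(sb, chain, 0) == 0) {
                    if (m0 != NULL)
                            m_freem(m0);
                    if (control != NULL)
                            m_freem(control);
                    return (0);     /* no space, or the chain could not be built */
            }
            return (1);     /* appended; waking the socket is the caller's job */
    }
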
@@ -1359,6 +1392,9 @@ sbappendmsgstream_rcv(struct sockbuf *sb, struct mbuf *m, uint32_t seqnum,
        int ret = 0;
        struct socket *so = sb->sb_so;
 
+       if (m == NULL)
+               return (0);
+
        VERIFY((m->m_flags & M_PKTHDR) && m_pktlen(m) > 0);
        VERIFY(so->so_msg_state != NULL);
        VERIFY(sb->sb_flags & SB_RECV);
index b8b429c08c4b95f0b3b44020ac576eeefb9c209e..16a044f7f4f2c735be9d142e323495c6761c47d8 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
@@ -180,8 +180,8 @@ static int  unp_listen(struct unpcb *, proc_t);
 static void    unpcb_to_compat(struct unpcb *, struct unpcb_compat *);
 static void     unp_get_locks_in_order(struct socket *so, struct socket *conn_so);
 
-static void 
-unp_get_locks_in_order(struct socket *so, struct socket *conn_so) 
+static void
+unp_get_locks_in_order(struct socket *so, struct socket *conn_so)
 {
        if (so < conn_so) {
                socket_lock(conn_so, 1);
@@ -369,7 +369,7 @@ uipc_rcvd(struct socket *so, __unused int flags)
 #define        snd (&so2->so_snd)
                if (unp->unp_conn == 0)
                        break;
-               
+
                so2 = unp->unp_conn->unp_socket;
                unp_get_locks_in_order(so, so2);
                /*
@@ -485,7 +485,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                        control = NULL;
                }
 
-               if (so != so2) 
+               if (so != so2)
                        socket_unlock(so2, 1);
 
                m = NULL;
@@ -524,7 +524,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                so2 = unp->unp_conn->unp_socket;
                unp_get_locks_in_order(so, so2);
 
-               /* Check socket state again as we might have unlocked the socket 
+               /* Check socket state again as we might have unlocked the socket
                 * while trying to get the locks in order
                 */
 
@@ -532,7 +532,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
                        error = EPIPE;
                        socket_unlock(so2, 1);
                        break;
-               }       
+               }
 
                if (unp->unp_flags & UNP_TRACE_MDNS) {
                        struct mdns_ipc_msg_hdr hdr;
@@ -558,7 +558,7 @@ uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
 
                snd->sb_mbmax -= rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt;
                unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt;
-               if ((int32_t)snd->sb_hiwat >= 
+               if ((int32_t)snd->sb_hiwat >=
                    (int32_t)(rcv->sb_cc - unp->unp_conn->unp_cc)) {
                        snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc;
                } else {
@@ -844,7 +844,7 @@ unp_attach(struct socket *so)
                return (ENOBUFS);
        bzero(unp, sizeof (*unp));
 
-       lck_mtx_init(&unp->unp_mtx, 
+       lck_mtx_init(&unp->unp_mtx,
                unp_mtx_grp, unp_mtx_attr);
 
        lck_rw_lock_exclusive(unp_list_mtx);
@@ -886,7 +886,7 @@ unp_detach(struct unpcb *unp)
 
        lck_rw_lock_exclusive(unp_list_mtx);
        LIST_REMOVE(unp, unp_link);
-       --unp_count; 
+       --unp_count;
        ++unp_gencnt;
        lck_rw_done(unp_list_mtx);
        if (unp->unp_vnode) {
@@ -915,7 +915,7 @@ unp_detach(struct unpcb *unp)
 
                /* This datagram socket is connected to one or more
                 * sockets. In order to avoid a race condition between removing
-                * this reference and closing the connected socket, we need 
+                * this reference and closing the connected socket, we need
                 * to check disconnect_in_progress
                 */
                if (so_locked == 1) {
@@ -935,12 +935,12 @@ unp_detach(struct unpcb *unp)
                        unp2 = unp->unp_refs.lh_first;
                        socket_lock(unp2->unp_socket, 1);
                }
-               
+
                lck_mtx_lock(unp_disconnect_lock);
                disconnect_in_progress = 0;
                wakeup(&disconnect_in_progress);
                lck_mtx_unlock(unp_disconnect_lock);
-                       
+
                if (unp2 != NULL) {
                        /* We already locked this socket and have a reference on it */
                        unp_drop(unp2, ECONNRESET);
@@ -1005,10 +1005,11 @@ unp_bind(
        /*
         * Note: sun_path is not a zero terminated "C" string
         */
-       ASSERT(namelen < SOCK_MAXADDRLEN);
+       if (namelen >= SOCK_MAXADDRLEN)
+               return (EINVAL);
        bcopy(soun->sun_path, buf, namelen);
        buf[namelen] = 0;
-       
+
        socket_unlock(so, 0);
 
        NDINIT(&nd, CREATE, OP_MKFIFO, FOLLOW | LOCKPARENT, UIO_SYSSPACE,
@@ -1119,7 +1120,8 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p)
        /*
         * Note: sun_path is not a zero terminated "C" string
         */
-       ASSERT(len < SOCK_MAXADDRLEN);
+       if (len >= SOCK_MAXADDRLEN)
+               return (EINVAL);
        bcopy(soun->sun_path, buf, len);
        buf[len] = 0;
 
@@ -1298,7 +1300,7 @@ unp_connect(struct socket *so, struct sockaddr *nam, __unused proc_t p)
                        unp2->unp_flags |= UNP_TRACE_MDNS;
                }
        }
-       
+
        error = unp_connect2(so, so2);
 
 decref_out:
@@ -1350,18 +1352,18 @@ unp_connect2(struct socket *so, struct socket *so2)
                return (EINVAL);
 
        unp->unp_conn = unp2;
-       so2->so_usecount++; 
-       
+       so2->so_usecount++;
+
        switch (so->so_type) {
 
        case SOCK_DGRAM:
                LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
 
-               if (so != so2) {        
+               if (so != so2) {
                        /* Avoid lock order reversals due to drop/acquire in soisconnected. */
                        /* Keep an extra reference on so2 that will be dropped
-                        * soon after getting the locks in order 
-                        */ 
+                        * soon after getting the locks in order
+                        */
                        socket_unlock(so2, 0);
                        soisconnected(so);
                        unp_get_locks_in_order(so, so2);
@@ -1461,7 +1463,7 @@ try_again:
                socket_lock(so2, 1);
                waitso = so2;
        } else {
-               if (so_locked == 1) { 
+               if (so_locked == 1) {
                        socket_unlock(so, 0);
                }
                socket_lock(so2, 1);
@@ -1476,18 +1478,18 @@ try_again:
        /* Check for the UNP_DONTDISCONNECT flag, if it
         * is set, release both sockets and go to sleep
         */
-       
+
        if ((((struct unpcb *)waitso->so_pcb)->unp_flags & UNP_DONTDISCONNECT) != 0) {
                if (so != so2) {
                        socket_unlock(so2, 1);
                }
                so_locked = 0;
 
-               (void)msleep(waitso->so_pcb, &unp->unp_mtx, 
+               (void)msleep(waitso->so_pcb, &unp->unp_mtx,
                        PSOCK | PDROP, "unpdisconnect", NULL);
                goto try_again;
        }
-       
+
        if (unp->unp_conn == NULL) {
                panic("unp_conn became NULL after sleep");
        }
@@ -1739,7 +1741,7 @@ unp_pcblist64 SYSCTL_HANDLER_ARGS
        if (req->oldptr == USER_ADDR_NULL) {
                n = unp_count;
                req->oldidx = 2 * sizeof (xug) + (n + n / 8) *
-                   (sizeof (struct xunpcb64)); 
+                   (sizeof (struct xunpcb64));
                lck_rw_done(unp_list_mtx);
                return (0);
        }
@@ -1929,7 +1931,7 @@ unp_externalize(struct mbuf *rights)
         * now change each pointer to an fd in the global table to
         * an integer that is the index to the local fd table entry
         * that we set up to point to the global one we are transferring.
-        * XXX (1) this assumes a pointer and int are the same size, 
+        * XXX (1) this assumes a pointer and int are the same size,
         * XXX     or the mbuf can hold the expansion
         * XXX (2) allocation failures should be non-fatal
         */
@@ -1974,7 +1976,7 @@ unp_externalize(struct mbuf *rights)
                if (fileproc_l[i] != NULL) {
                        VERIFY(fileproc_l[i]->f_fglob != NULL &&
                            (fileproc_l[i]->f_fglob->fg_lflags & FG_RMMSGQ));
-                       VERIFY(fds[i] > 0);
+                       VERIFY(fds[i] >= 0);
                        fg_removeuipc(fileproc_l[i]->f_fglob);
 
                        /* Drop the iocount */
@@ -2079,7 +2081,7 @@ unp_internalize(struct mbuf *control, proc_t p)
        }
        rp = (struct fileglob **)(cm + 1);
 
-       /* On K64 we need to walk backwards because a fileglob * is twice the size of an fd 
+       /* On K64 we need to walk backwards because a fileglob * is twice the size of an fd
         * and doing them in-order would result in stomping over unprocessed fd's
         */
        for (i = (oldfds - 1); i >= 0; i--) {
@@ -2227,7 +2229,7 @@ unp_gc(void)
                         * message buffers. Follow those links and mark them
                         * as accessible too.
                         *
-                        * In case a file is passed onto itself we need to 
+                        * In case a file is passed onto itself we need to
                         * release the file lock.
                         */
                        lck_mtx_unlock(&fg->fg_lock);
@@ -2316,7 +2318,7 @@ unp_gc(void)
                        so = (struct socket *)(tfg->fg_data);
 
                        socket_lock(so, 0);
-                       
+
                        sorflush(so);
 
                        socket_unlock(so, 0);
@@ -2435,7 +2437,7 @@ unp_lock(struct socket *so, int refcount, void * lr)
         if (so->so_pcb) {
                 lck_mtx_lock(&((struct unpcb *)so->so_pcb)->unp_mtx);
         } else  {
-                panic("unp_lock: so=%p NO PCB! lr=%p ref=0x%x\n", 
+                panic("unp_lock: so=%p NO PCB! lr=%p ref=0x%x\n",
                        so, lr_saved, so->so_usecount);
         }
 
@@ -2482,7 +2484,7 @@ unp_unlock(struct socket *so, int refcount, void * lr)
 
                if (unp->unp_addr)
                        FREE(unp->unp_addr, M_SONAME);
-               
+
                lck_mtx_unlock(mutex_held);
 
                lck_mtx_destroy(&unp->unp_mtx, unp_mtx_grp);
@@ -2511,4 +2513,3 @@ unp_getlock(struct socket *so, __unused int flags)
                 return (so->so_proto->pr_domain->dom_mtx);
         }
 }
-
index 6b0060acdef84dd49906efe646ccdc1f1b770d13..5d65346c80138046e166fddeb1befa604e030687 100644 (file)
@@ -7,7 +7,7 @@ include $(MakeInc_cmd)
 include $(MakeInc_def)
 
 KERNELFILES = \
-       libkern.h
+       libkern.h copyio.h
 
 EXPORT_MI_LIST = ${KERNELFILES}
 
diff --git a/bsd/libkern/copyio.h b/bsd/libkern/copyio.h
new file mode 100644 (file)
index 0000000..1bec805
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _LIBKERN_COPYIO_H_
+#define _LIBKERN_COPYIO_H_
+
+#include <kern/debug.h>
+
+__BEGIN_DECLS
+
+int    copyin(const user_addr_t uaddr, void *kaddr, size_t len);
+int    copyout(const void *kaddr, user_addr_t udaddr, size_t len);
+
+#if defined (_FORTIFY_SOURCE) && _FORTIFY_SOURCE == 0
+/* FORTIFY_SOURCE disabled */
+#else
+__attribute__((always_inline)) static inline int
+__copyin_chk(const user_addr_t uaddr, void *kaddr, size_t len, size_t chk_size)
+{
+       if (chk_size < len) {
+               panic("__copyin_chk object size check failed: uaddr %p, kaddr %p, (%zu < %zu)", (void*)uaddr, kaddr, chk_size, len);
+       }
+       return copyin(uaddr, kaddr, len);
+}
+
+__attribute__((always_inline)) static inline int
+__copyout_chk(const void *kaddr, user_addr_t uaddr, size_t len, size_t chk_size)
+{
+       if (chk_size < len) {
+               panic("__copyout_chk object size check failed: uaddr %p, kaddr %p, (%zu < %zu)", (void*)uaddr, kaddr, chk_size, len);
+       }
+       return copyout(kaddr, uaddr, len);
+}
+#define copyin(uaddr, kaddr, len) __copyin_chk(uaddr, kaddr, len, __builtin_object_size(kaddr, 0))
+#define copyout(kaddr, uaddr, len) __copyout_chk(kaddr, uaddr, len, __builtin_object_size(kaddr, 0))
+#endif
+__END_DECLS
+#endif /* _LIBKERN_COPYIO_H_ */
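
A hedged illustration of what the fortified wrappers above provide (hypothetical kernel-side caller, not part of the header): when the compiler can determine the size of the destination object, a copyin() whose length exceeds it panics in __copyin_chk() instead of overflowing the kernel buffer.

    /* Hypothetical caller: the copyin() macro above passes
     * __builtin_object_size(&args, 0) == sizeof(args) as chk_size, so a
     * length larger than sizeof(args) trips the check in __copyin_chk(). */
    struct my_args {
            uint64_t        flags;
            uint64_t        count;
    };

    static int
    fetch_args(user_addr_t uaddr, size_t ulen)
    {
            struct my_args args;

            if (ulen > sizeof(args))
                    return (EINVAL);                /* ordinary bounds check */
            return (copyin(uaddr, &args, ulen));    /* size-checked copy */
    }
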
index fa8317325e493498ece6e0f33e8b9040f546c473..73545298d305961360af39e1b3e25e6c93d5ef70 100644 (file)
@@ -77,6 +77,8 @@
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include <mach/vm_param.h>
+#include <libkern/crc.h>
+#include <libkern/copyio.h>
 
 #if defined(__arm__) || defined(__arm64__)
 #include <arm/arch.h> /* for _ARM_ARCH_* */
@@ -191,8 +193,6 @@ __nosan_crc16(uint16_t crc, const void *bufp, size_t len) { return crc16(crc, bu
 int    copystr(const void *kfaddr, void *kdaddr, size_t len, size_t *done);
 int    copyinstr(const user_addr_t uaddr, void *kaddr, size_t len, size_t *done);
 int    copyoutstr(const void *kaddr, user_addr_t udaddr, size_t len, size_t *done);
-int    copyin(const user_addr_t uaddr, void *kaddr, size_t len);
-int    copyout(const void *kaddr, user_addr_t udaddr, size_t len);
 #if XNU_KERNEL_PRIVATE
 extern int copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes);
 #endif
index 331ca334e24bf5f5e98d124bf4f42e3522d2f6bb..0e724cc86d7326befbe72b61938e7a12f20ed2c2 100644 (file)
@@ -41,6 +41,7 @@ DATAFILES = \
        disconnectx.2           \
        dup.2                   \
        dup2.2                  \
+       errno.2                 \
        execve.2                \
        exchangedata.2          \
        faccessat.2             \
index 10a22ae345bc8b3543c9b56421dea9b1ed56c06e..48d05b6c8f8f0268231e3c85c5b754b0547a89f2 100644 (file)
@@ -73,6 +73,16 @@ Programs that reference the file via an object identifier will continue to
 reference the original file, but now it has the new data.
 .Pp
 .
+WARNING: This system call is largely supported only by the HFS and AFP file
+systems.  Many other file systems, including APFS, do not support it.
+Further, it is not supported on iOS, tvOS, or watchOS.
+It is recommended that callers use
+.Fn rename
+or
+.Fn renamex_np
+instead to conduct safe-save operations.
+.Pp
+.
 .\" path1 and path2 parameters
 .
 The
@@ -115,6 +125,7 @@ is set to indicate the error.
 .Sh COMPATIBILITY
 Not all volumes support 
 .Fn exchangedata .
+In particular, APFS volumes do not.
 You can test whether a volume supports 
 .Fn exchangedata 
 by using 
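
A minimal user-space sketch of the recommended replacement (this assumes the renamex_np() interface and the RENAME_SWAP flag documented in rename(2); it is illustrative only, not part of this change): atomically swap a fully written temporary file with the original instead of calling exchangedata(2).

    #include <stdio.h>

    /* Safe-save: atomically swap the temporary file with the destination. */
    int
    safe_save(const char *tmp_path, const char *dst_path)
    {
            if (renamex_np(tmp_path, dst_path, RENAME_SWAP) == -1) {
                    perror("renamex_np");
                    return (-1);
            }
            return (0);
    }
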
index d9b740f6751d7a6e1299c58b9cfbb14974526764..6407f428e743d66427e51848146d48a92368a4d1 100644 (file)
@@ -1,4 +1,4 @@
-.\" Copyright (c) 2017 Apple Computer, Inc. All rights reserved.
+.\" Copyright (c) 2017-2018 Apple Computer, Inc. All rights reserved.
 .\" 
 .\" The contents of this file constitute Original Code as defined in and
 .\" are subject to the Apple Public Source License Version 1.1 (the
@@ -20,7 +20,7 @@
 .Dt FS_SNAPSHOT_CREATE 2
 .Os Darwin
 .Sh NAME
-.Nm fs_snasphot_create
+.Nm fs_snapshot_create
 .Nd create read only snapshot of a mounted filesystem
 .Sh SYNOPSIS
 .Fd #include <sys/attr.h>
index 58950e7df00f337fc6dc67ecb1664ad41415662d..fd2bd1d3b8cca867540d0379cd25295f1092e4b3 100644 (file)
@@ -63,6 +63,17 @@ On return it contains the actual size of the address
 returned (in bytes).
 .Pp
 The address is truncated if the buffer provided is too small.
+.Pp
+Note: For the UNIX domain, the address length returned is the
+.Fa address_len
+parameter passed to the previous
+.Xr bind 2
+system call and not the
+.Va sa_len
+field of the
+.Fa address
+parameter passed to
+.Xr bind 2 .
 .Sh RETURN VALUES
 .Rv -std getsockname
 .Sh ERRORS
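
A small user-space sketch of the UNIX-domain behaviour noted above (hypothetical, for illustration only): the length reported by getsockname(2) is the address_len that was passed to bind(2).

    #include <stddef.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/un.h>

    /* Returns the address length reported by getsockname(); for an AF_UNIX
     * socket this matches the length previously passed to bind(). */
    int
    bound_name_len(int s, const char *path)
    {
            struct sockaddr_un sun;
            socklen_t len;

            memset(&sun, 0, sizeof(sun));
            sun.sun_family = AF_UNIX;
            strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
            len = (socklen_t)(offsetof(struct sockaddr_un, sun_path) +
                strlen(path) + 1);
            if (bind(s, (struct sockaddr *)&sun, len) == -1)
                    return (-1);
            if (getsockname(s, (struct sockaddr *)&sun, &len) == -1)
                    return (-1);
            return ((int)len);      /* equals the length passed to bind() */
    }
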
index f141a54374db4f85887ae3affdf22d3d9696ba7b..b90765b34826ccea4079929758404b1d56018247 100644 (file)
@@ -16,7 +16,7 @@
 .\" 
 .\"     @(#)searchfs.2
 .
-.Dd October 13, 2008
+.Dd November 16, 2017
 .Dt SEARCHFS 2
 .Os Darwin
 .Sh NAME
@@ -26,7 +26,7 @@
 .Fd #include <sys/attr.h>
 .Fd #include <unistd.h>
 .Ft int
-.Fn searchfs "const char* path" "struct fssearchblock* searchBlock" "unsigned int* numMatches" "unsigned int scriptCode" "unsigned int options" "struct searchstate* state"
+.Fn searchfs "const char* path" "struct fssearchblock* searchBlock" "unsigned long* numMatches" "unsigned int scriptCode" "unsigned int options" "struct searchstate* state"
 .
 .Sh DESCRIPTION
 The
@@ -818,8 +818,8 @@ static int SearchFSDemo(
     SearchAttrBuf   lower;
     SearchAttrBuf   upper;
     static const unsigned char kAllOnes[4] = { 0xFF, 0xFF, 0xFF, 0xFF };
-    unsigned int    matchCount;
-    unsigned int    matchIndex;
+    unsigned long   matchCount;
+    unsigned long   matchIndex;
     unsigned int    options;
     searchstate_t   state;
     ResultAttrBuf * thisEntry;
index 17f3be2432890bc38014da7a5393807db4798c71..acadd27b245e9d12603a4a9c64d6b807b472a021 100644 (file)
@@ -230,6 +230,9 @@ The socket is shut down for writing
 or the socket is connection-mode and is no longer connected.
 In the latter case, and if the socket is of type SOCK_STREAM,
 the SIGPIPE signal is generated to the calling thread.
+.\" ==========
+.It Bq Er EADDRNOTAVAIL
+The specified address is not available or no longer available on this machine.
 .El
 .Pp
 The
index a49ca1068430f28a5dd551a0df8d7a3b1b294f0c..205e8cfdccb4c12a6fa803e1137fe3fb4f79fa46 100644 (file)
@@ -104,7 +104,7 @@ in this way, the effective user ID of a set-user-ID executable
 may be toggled by switching to the real user ID, then re-enabled
 by reverting to the set-user-ID value.
 Similarly, the effective group ID may be set to the value
-of the real group ID or the saved set-user-ID.
+of the real group ID or the saved set-group-ID.
 .Pp
 .Sh RETURN VALUES
 Upon success, these functions return 0;
index 43bc02499180fd3f5be932d7653e5052266002b1..8c7e697436deeef7bed45fcd9e1c46676f37cf34 100644 (file)
@@ -23,20 +23,6 @@ or the current thread.  The policy of the I/O of the given type
 can be get or set for the given
 .Fa scope .
 .Pp
-The I/O type is specified in the argument
-.Fa iotype .
-The only currently supported I/O type is 
-.Dv IOPOL_TYPE_DISK ,
-which can mean either the I/O policy for I/Os to local disks or to
-remote volumes.
-I/Os to local disks are I/Os sent to the media without going through a network,
-including I/Os to internal and external hard drives, optical media in internal
-and external drives, flash drives, floppy disks, ram disks, and mounted disk
-images which reside on these media.
-I/Os to remote volumes are I/Os that require network activity to complete the
-operation.
-This is currently only supported for remote volumes mounted by SMB or AFP.
-.Pp
 The scope that the I/O policy takes effect is specified in the argument
 .Fa scope
 as follows:
@@ -55,8 +41,24 @@ the argument
 .Fa policy
 is an integer which contains the new I/O policy to be set for the given I/O
 type and scope.
-.Fa Policy
-can have the following values:
+.Pp
+The I/O type is specified in the argument
+.Fa iotype .
+The currently supported I/O types are as follows:
+.Bl -tag -width F1
+.It IOPOL_TYPE_DISK
+This can mean either the I/O policy for I/Os to local disks or to
+remote volumes.
+I/Os to local disks are I/Os sent to the media without going through a network,
+including I/Os to internal and external hard drives, optical media in internal
+and external drives, flash drives, floppy disks, ram disks, and mounted disk
+images which reside on these media.
+I/Os to remote volumes are I/Os that require network activity to complete the
+operation.
+This is currently only supported for remote volumes mounted by SMB or AFP.
+.Pp
+IOPOL_TYPE_DISK supports the following values for
+.Fa policy:
 .Bl -tag -width IOPOL_PASSIVEXXX
 .It IOPOL_IMPORTANT
 I/Os with the IMPORTANT policy are unrestricted.  This policy should only be
@@ -102,6 +104,28 @@ broken into smaller requests which are then issued serially.
 The I/O policy of a newly created process is inherited from its parent
 process.  The I/O policy of an I/O request is the lowest priority
 policy of the current thread and the current process.
+.It IOPOL_TYPE_VFS_ATIME_UPDATES
+This
+.Fa iotype
+lets users change the access time update policy for the files accessed
+by the current thread or process.
+.Pp
+IOPOL_TYPE_VFS_ATIME_UPDATES supports the following values for
+.Fa policy:
+.Bl -tag -width IOPOL_ATIME_UPDATES_DEFAULT
+.It IOPOL_ATIME_UPDATES_OFF
+The ATIME_UPDATES_OFF policy turns off access time updates for the files accessed.
+This policy is useful for applications that access a large number of files,
+as it reduces metadata I/O writes.
+.It IOPOL_ATIME_UPDATES_DEFAULT
+This is the default I/O policy for new threads.
+.El
+.El
+.Pp
+As with IOPOL_TYPE_DISK, the I/O policy of a newly created process is
+inherited from its parent process.  Access time updates are turned off if the
+I/O policy is set to IOPOL_ATIME_UPDATES_OFF for the current thread or current
+process.
 .Sh RETURN VALUES
 The
 .Fn getiopolicy_np
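
A short user-space sketch of the new policy type documented above (standard setiopolicy_np() usage with the constants introduced here; illustrative only): turn off access time updates for every file the current process touches.

    #include <stdio.h>
    #include <sys/resource.h>

    /* Disable atime updates for all files accessed by this process. */
    int
    disable_atime_updates(void)
    {
            if (setiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES, IOPOL_SCOPE_PROCESS,
                IOPOL_ATIME_UPDATES_OFF) == -1) {
                    perror("setiopolicy_np");
                    return (-1);
            }
            return (0);
    }
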
index adbc2e78f6ccccde9e0648c311aaf703f97ec90b..6358c5c6b1d61fdadd9b18bb6ce76a4d00cc5573 100644 (file)
@@ -967,7 +967,7 @@ static int
 dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp,
              struct devfsmount *dvm)
 {
-       devdirent_t *   entry_p;
+       devdirent_t *   entry_p = NULL;
        devdirent_t *   newback;
        devdirent_t *   newfront;
        int     error;
@@ -978,10 +978,14 @@ dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp,
         * go get the node made (if we need to)
         * use the back one as a prototype
         */
-       if ((error = dev_add_entry(back->de_name, parent, type,
-                               NULL, dnp,
-                               parent?parent->dn_dvm:dvm, &entry_p)) != 0) {
+       error = dev_add_entry(back->de_name, parent, type, NULL, dnp,
+           parent?parent->dn_dvm:dvm, &entry_p);
+       if (!error && (entry_p == NULL)) {
+               error = ENOMEM; /* Really can't happen, but make static analyzer happy */
+       }
+       if (error != 0) {
                printf("duplicating %s failed\n",back->de_name);
+               goto out;
        }
 
        /*
@@ -1009,6 +1013,7 @@ dev_dup_entry(devnode_t * parent, devdirent_t * back, devdirent_t * *dnm_pp,
                        }
                }
        }
+out:
        *dnm_pp = entry_p;
        return error;
 }
index 389adb7e4b3b34a12a76560c998ab92b5ac8e13f..05e28abc112c429cac00d090d135cb873089b039 100644 (file)
@@ -1035,3 +1035,36 @@ static struct vnodeopv_entry_desc nullfs_vnodeop_entries[] = {
 };
 
 struct vnodeopv_desc nullfs_vnodeop_opv_desc = {&nullfs_vnodeop_p, nullfs_vnodeop_entries};
+
+// NULLFS-specific helper function
+
+int
+nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp)
+{
+       int result = EINVAL;
+
+       if (out_vpp == NULL || in_vp == NULL) {
+               goto end;
+       }
+
+       struct vfsstatfs * sp   = NULL;
+       mount_t mp = vnode_mount(in_vp);
+
+       sp = vfs_statfs(mp);
+       // If this isn't a nullfs vnode, or it is but is a special vnode
+       if (strcmp(sp->f_fstypename, "nullfs") != 0 || nullfs_checkspecialvp(in_vp)) {
+               *out_vpp = NULLVP;
+               result = ENOENT;
+               goto end;
+       }
+
+       vnode_t lvp = NULLVPTOLOWERVP(in_vp);
+       if ((result = vnode_getwithvid(lvp, NULLVPTOLOWERVID(in_vp)))) {
+               goto end;
+       }
+
+       *out_vpp = lvp;
+
+end:
+       return result;
+}
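
A hedged in-kernel sketch (hypothetical caller, not part of this change) of how the new helper is intended to be used: on success the returned vnode carries the iocount taken by vnode_getwithvid() inside nullfs_getbackingvnode() and must be released with vnode_put().

    /* Hypothetical caller: resolve a nullfs vnode to its backing vnode,
     * operate on it, then drop the iocount taken by the helper. */
    static int
    with_backing_vnode(vnode_t vp)
    {
            vnode_t backing = NULLVP;
            int error;

            error = nullfs_getbackingvnode(vp, &backing);
            if (error != 0)
                    return (error);         /* e.g. ENOENT: not a nullfs data vnode */

            /* ... use 'backing' here ... */

            vnode_put(backing);             /* balance vnode_getwithvid() */
            return (0);
    }
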
index e29b9e69615f8aa6a9bc77e514b3137af1733c0b..80d8f174cb641f2e2ef70f8a279467ed8fcf0476 100644 (file)
@@ -142,6 +142,8 @@ int null_getnewvnode(
     struct mount * mp, struct vnode * lowervp, struct vnode * dvp, struct vnode ** vpp, struct componentname * cnp, int root);
 void null_hashrem(struct null_node * xp);
 
+int nullfs_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp);
+
 #define NULLVPTOLOWERVP(vp) (VTONULL(vp)->null_lowervp)
 #define NULLVPTOLOWERVID(vp) (VTONULL(vp)->null_lowervid)
 #define NULLVPTOMYVID(vp) (VTONULL(vp)->null_myvid)
@@ -150,9 +152,6 @@ extern struct vnodeopv_desc nullfs_vnodeop_opv_desc;
 
 extern vop_t * nullfs_vnodeop_p;
 
-// int     nullfs_install_filesys(void);
-// int     nullfs_uninstall_filesys(void);
-
 __END_DECLS
 
 #ifdef NULLFS_DEBUG
index 6e0c09d1cf99dfb8020bb5fa70aa750995039c8b..702787a410b652e109f28b24441cb58ffcf967be 100644 (file)
@@ -1138,7 +1138,7 @@ throttle_timer(struct _throttle_io_info_t *info)
                        ut = (uthread_t)TAILQ_FIRST(&info->throttle_uthlist[wake_level]);
                        TAILQ_REMOVE(&info->throttle_uthlist[wake_level], ut, uu_throttlelist);
                        ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
-                       ut->uu_is_throttled = FALSE;
+                       ut->uu_is_throttled = false;
 
                        wake_address = (caddr_t)&ut->uu_on_throttlelist;
                }
@@ -1156,7 +1156,7 @@ throttle_timer(struct _throttle_io_info_t *info)
 
                        TAILQ_REMOVE(&info->throttle_uthlist[level], ut, uu_throttlelist);
                        ut->uu_on_throttlelist = THROTTLE_LEVEL_NONE;
-                       ut->uu_is_throttled = FALSE;
+                       ut->uu_is_throttled = false;
 
                        wakeup(&ut->uu_on_throttlelist);
                }
@@ -1335,13 +1335,12 @@ throttle_init(void)
 }
 
 void
-sys_override_io_throttle(int flag)
+sys_override_io_throttle(boolean_t enable_override)
 {
-       if (flag == THROTTLE_IO_ENABLE)
-               lowpri_throttle_enabled = 1;
-
-       if (flag == THROTTLE_IO_DISABLE)
+       if (enable_override)
                lowpri_throttle_enabled = 0;
+       else
+               lowpri_throttle_enabled = 1;
 }
 
 int rethrottle_wakeups = 0;
@@ -1382,19 +1381,19 @@ rethrottle_thread(uthread_t ut)
        boolean_t s = ml_set_interrupts_enabled(FALSE);
        lck_spin_lock(&ut->uu_rethrottle_lock);
 
-       if (ut->uu_is_throttled == FALSE)
-               ut->uu_was_rethrottled = TRUE;
+       if (!ut->uu_is_throttled)
+               ut->uu_was_rethrottled = true;
        else {
                int my_new_level = throttle_get_thread_throttle_level(ut);
 
                if (my_new_level != ut->uu_on_throttlelist) {
                        /*
                         * ut is currently blocked (as indicated by
-                        * ut->uu_is_throttled == TRUE)
+                        * ut->uu_is_throttled == true)
                         * and we're changing its throttle level, so
                         * we need to wake it up.
                         */
-                       ut->uu_is_throttled = FALSE;
+                       ut->uu_is_throttled = false;
                        wakeup(&ut->uu_on_throttlelist);
 
                        rethrottle_wakeups++;
@@ -1622,7 +1621,7 @@ throttle_get_thread_throttle_level_internal(uthread_t ut, int io_tier) {
        assert(ut != NULL);
 
        /* Bootcache misses should always be throttled */
-       if (ut->uu_throttle_bc == TRUE)
+       if (ut->uu_throttle_bc)
                thread_throttle_level = THROTTLE_LEVEL_TIER3;
 
        /*
@@ -1781,7 +1780,7 @@ throttle_lowpri_io(int sleep_amount)
        info = ut->uu_throttle_info;
 
        if (info == NULL) {
-               ut->uu_throttle_bc = FALSE;
+               ut->uu_throttle_bc = false;
                ut->uu_lowpri_window = 0;
                return (0);
        }
@@ -1791,12 +1790,12 @@ throttle_lowpri_io(int sleep_amount)
        if (sleep_amount == 0)
                goto done;
 
-       if (sleep_amount == 1 && ut->uu_throttle_bc == FALSE)
+       if (sleep_amount == 1 && !ut->uu_throttle_bc)
                sleep_amount = 0;
 
        throttle_io_period_num = info->throttle_io_period_num;
 
-       ut->uu_was_rethrottled = FALSE;
+       ut->uu_was_rethrottled = false;
 
        while ( (throttle_type = throttle_io_will_be_throttled_internal(info, &mylevel, &throttling_level)) ) {
 
@@ -1836,7 +1835,7 @@ throttle_lowpri_io(int sleep_amount)
                 * this is the critical section w/r to our interaction
                 * with "rethrottle_thread"
                 */
-               if (ut->uu_was_rethrottled == TRUE) {
+               if (ut->uu_was_rethrottled) {
 
                        lck_spin_unlock(&ut->uu_rethrottle_lock);
                        ml_set_interrupts_enabled(s);
@@ -1844,7 +1843,7 @@ throttle_lowpri_io(int sleep_amount)
 
                        KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_FSRW, 103)), thread_tid(ut->uu_thread), ut->uu_on_throttlelist, 0, 0, 0);
 
-                       ut->uu_was_rethrottled = FALSE;
+                       ut->uu_was_rethrottled = false;
                        continue;
                }
                KERNEL_DEBUG_CONSTANT((FSDBG_CODE(DBG_THROTTLE, PROCESS_THROTTLED)) | DBG_FUNC_NONE,
@@ -1859,7 +1858,7 @@ throttle_lowpri_io(int sleep_amount)
 
                assert_wait((caddr_t)&ut->uu_on_throttlelist, THREAD_UNINT);
 
-               ut->uu_is_throttled = TRUE;
+               ut->uu_is_throttled = true;
                lck_spin_unlock(&ut->uu_rethrottle_lock);
                ml_set_interrupts_enabled(s);
 
@@ -1869,8 +1868,8 @@ throttle_lowpri_io(int sleep_amount)
 
                ut->uu_wmesg = NULL;
 
-               ut->uu_is_throttled = FALSE;
-               ut->uu_was_rethrottled = FALSE;
+               ut->uu_is_throttled = false;
+               ut->uu_was_rethrottled = false;
 
                lck_mtx_lock(&info->throttle_lock);
 
@@ -1904,7 +1903,7 @@ done:
        }
 
        ut->uu_throttle_info = NULL;
-       ut->uu_throttle_bc = FALSE;
+       ut->uu_throttle_bc = false;
        ut->uu_lowpri_window = 0;
 
        throttle_info_rel(info);
@@ -1942,7 +1941,7 @@ void throttle_info_reset_window(uthread_t ut)
 
                ut->uu_throttle_info = NULL;
                ut->uu_lowpri_window = 0;
-               ut->uu_throttle_bc = FALSE;
+               ut->uu_throttle_bc = false;
        }
 }
 
@@ -2349,7 +2348,7 @@ spec_strategy(struct vnop_strategy_args *ap)
 
        if (kdebug_enable) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
-                                         buf_kernel_addrperm_addr(bp), bdev, (int)buf_blkno(bp), buf_count(bp), 0);
+                                         buf_kernel_addrperm_addr(bp), bdev, buf_blkno(bp), buf_count(bp), 0);
         }
 
        thread_update_io_stats(current_thread(), buf_count(bp), code);
@@ -2650,7 +2649,7 @@ static void filt_specdetach(struct knote *kn);
 static int filt_specevent(struct knote *kn, long hint);
 static int filt_spectouch(struct knote *kn, struct kevent_internal_s *kev);
 static int filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-static unsigned filt_specpeek(struct knote *kn);
+static int filt_specpeek(struct knote *kn);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) spec_filtops = {
        .f_isfd    = 1,
@@ -2719,6 +2718,14 @@ spec_knote_select_and_link(struct knote *kn)
         */
        old_wqs = uth->uu_wqset;
        uth->uu_wqset = &(knote_get_kq(kn)->kq_wqs);
+
+       /*
+        * Be sure that the waitq set is linked
+        * before calling select to avoid possible
+        * allocation under spinlocks.
+        */
+       waitq_set_lazy_init_link(uth->uu_wqset);
+
        /*
         * Now these are the laws of VNOP_SELECT, as old and as true as the sky,
         * And the device that shall keep it may prosper, but the device that shall
@@ -2877,8 +2884,6 @@ filt_spectouch(struct knote *kn, struct kevent_internal_s *kev)
 {
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        if (kev->flags & EV_ENABLE) {
                return spec_knote_select_and_link(kn);
@@ -2902,8 +2907,6 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in
        ctx = vfs_context_current();
        vp = (vnode_t)kn->kn_fp->f_fglob->fg_data;
 
-       /* FIXME JMM - locking against touches? */
-
        error = vnode_getwithvid(vp, kn->kn_hookid);
        if (error != 0) {
                kn->kn_flags |= (EV_EOF | EV_ONESHOT);
@@ -2930,7 +2933,7 @@ filt_specprocess(struct knote *kn, struct filt_process_s *data, struct kevent_in
        return res;
 }
 
-static unsigned
+static int
 filt_specpeek(struct knote *kn)
 {
        int selres = 0;
@@ -2938,6 +2941,6 @@ filt_specpeek(struct knote *kn)
        selres = spec_knote_select_and_link(kn);
        filt_spec_common(kn, selres);
 
-       return kn->kn_data;
+       return kn->kn_data != 0;
 }
 
index a2e90264fe6c0c25ff64ff021f2b7b8a70f9f78d..60c66a58b95134d3b77afe10a0efa152c06ad8c0 100644 (file)
@@ -69,7 +69,8 @@ PRIVATE_DATAFILES = \
        raw_cb.h \
        route.h \
        net_perf.h \
-       net_kev.h
+       net_kev.h \
+       nat464_utils.h
 
 PRIVATE_KERNELFILES = $(filter-out radix.h,${KERNELFILES}) \
        bpfdesc.h ppp_comp.h \
index 70b69823b3d28d94dc940f8b822bf45470fe8315..860b738483c0876d1d6456c11acbbfb86c1b87a9 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
@@ -62,7 +62,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *      @(#)bpf.c      8.2 (Berkeley) 3/28/94
+ *     @(#)bpf.c       8.2 (Berkeley) 3/28/94
  *
  * $FreeBSD: src/sys/net/bpf.c,v 1.59.2.5 2001/01/05 04:49:09 jdp Exp $
  */
@@ -76,9 +76,9 @@
 #include "bpf.h"
 
 #ifndef __GNUC__
-#define inline
+#define        inline
 #else
-#define inline __inline
+#define        inline __inline
 #endif
 
 #include <sys/param.h>
 #include <net/bpfdesc.h>
 
 #include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #include <netinet/if_ether.h>
+#include <netinet/isakmp.h>
+#include <netinet6/esp.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <net/firewire.h>
 #include <security/mac_framework.h>
 #endif /* MAC_NET */
 
+#include <os/log.h>
+
 extern int tvtohz(struct timeval *);
 
-#define BPF_BUFSIZE 4096
-#define UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio)
+#define        BPF_BUFSIZE 4096
+#define        UIOMOVE(cp, len, code, uio) uiomove(cp, len, uio)
 
+#define        PRINET  26                      /* interruptible */
 
-#define PRINET  26                     /* interruptible */
+#define ISAKMP_HDR_SIZE (sizeof(struct isakmp) + sizeof(struct isakmp_gen))
+#define ESP_HDR_SIZE sizeof(struct newesp)
 
 typedef void (*pktcopyfunc_t)(const void *, void *, size_t);
 
@@ -183,10 +191,11 @@ static struct bpf_if      *bpf_iflist;
  * the bpf_d in a separate table indexed by minor device #.
  *
  * The value stored in bpf_dtab[n] represent three states:
- *  0: device not opened
- *  1: device opening or closing
+ *  NULL: device not opened
+ *  BPF_DEV_RESERVED: device opening or closing
  *  other: device <n> opened with pointer to storage
  */
+#define        BPF_DEV_RESERVED ((struct bpf_d *)(uintptr_t)1)
 static struct bpf_d    **bpf_dtab = NULL;
 static unsigned int bpf_dtab_size = 0;
 static unsigned int    nbpfilter = 0;
@@ -205,9 +214,10 @@ static int bpf_detachd(struct bpf_d *d, int);
 static void    bpf_freed(struct bpf_d *);
 static int     bpf_movein(struct uio *, int,
                    struct mbuf **, struct sockaddr *, int *);
-static int     bpf_setif(struct bpf_d *, ifnet_t ifp);
+static int     bpf_setif(struct bpf_d *, ifnet_t ifp, bool, bool);
 static void    bpf_timed_out(void *, void *);
 static void    bpf_wakeup(struct bpf_d *);
+static u_int   get_pkt_trunc_len(u_char *, u_int);
 static void    catchpacket(struct bpf_d *, struct bpf_packet *, u_int, int);
 static void    reset_d(struct bpf_d *);
 static int     bpf_setf(struct bpf_d *, u_int, user_addr_t, u_long);
@@ -219,7 +229,7 @@ static void bpf_set_packet_service_class(struct mbuf *, int);
 static void    bpf_acquire_d(struct bpf_d *);
 static void    bpf_release_d(struct bpf_d *);
 
-static  int bpf_devsw_installed;
+static int bpf_devsw_installed;
 
 void bpf_init(void *unused);
 static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m);
@@ -235,9 +245,8 @@ static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m);
        ioctl_fcn_t         bpfioctl;
        select_fcn_t        bpfselect;
 
-
 /* Darwin's cdevsw struct differs slightly from BSDs */
-#define CDEV_MAJOR 23
+#define        CDEV_MAJOR 23
 static struct cdevsw bpf_cdevsw = {
        /* open */          bpfopen,
        /* close */         bpfclose,
@@ -249,16 +258,17 @@ static struct cdevsw bpf_cdevsw = {
        /* tty */           NULL,
        /* select */        bpfselect,
        /* mmap */          eno_mmap,
-       /* strategy*/       eno_strat,
+       /* strategy */      eno_strat,
        /* getc */          eno_getc,
        /* putc */          eno_putc,
        /* type */          0
 };
 
-#define SOCKADDR_HDR_LEN          offsetof(struct sockaddr, sa_data)
+#define        SOCKADDR_HDR_LEN           offsetof(struct sockaddr, sa_data)
 
 static int
-bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *sockp, int *datlen)
+bpf_movein(struct uio *uio, int linktype, struct mbuf **mp,
+    struct sockaddr *sockp, int *datlen)
 {
        struct mbuf *m;
        int error;
@@ -267,40 +277,40 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
        int hlen;
 
        switch (linktype) {
-       
+
 #if SLIP
        case DLT_SLIP:
                sa_family = AF_INET;
                hlen = 0;
                break;
 #endif /* SLIP */
-       
+
        case DLT_EN10MB:
                sa_family = AF_UNSPEC;
                /* XXX Would MAXLINKHDR be better? */
                hlen = sizeof(struct ether_header);
                break;
-       
+
 #if FDDI
        case DLT_FDDI:
-       #if defined(__FreeBSD__) || defined(__bsdi__)
+#if defined(__FreeBSD__) || defined(__bsdi__)
                sa_family = AF_IMPLINK;
                hlen = 0;
-       #else
+#else
                sa_family = AF_UNSPEC;
                /* XXX 4(FORMAC)+6(dst)+6(src)+3(LLC)+5(SNAP) */
                hlen = 24;
-       #endif
+#endif
                break;
 #endif /* FDDI */
-       
+
        case DLT_RAW:
        case DLT_NULL:
                sa_family = AF_UNSPEC;
                hlen = 0;
                break;
-       
-       #ifdef __FreeBSD__
+
+#ifdef __FreeBSD__
        case DLT_ATM_RFC1483:
                /*
                 * en atm driver requires 4-byte atm pseudo header.
@@ -308,21 +318,21 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
                 * specified anyway.
                 */
                sa_family = AF_UNSPEC;
-               hlen = 12;      /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
+               hlen = 12;      /* XXX 4(ATM_PH) + 3(LLC) + 5(SNAP) */
                break;
-       #endif
+#endif
 
        case DLT_PPP:
                sa_family = AF_UNSPEC;
                hlen = 4;       /* This should match PPP_HDRLEN */
                break;
-       
+
        case DLT_APPLE_IP_OVER_IEEE1394:
                sa_family = AF_UNSPEC;
                hlen = sizeof(struct firewire_header);
                break;
 
-       case DLT_IEEE802_11:            /* IEEE 802.11 wireless */
+       case DLT_IEEE802_11:            /* IEEE 802.11 wireless */
                sa_family = AF_IEEE80211;
                hlen = 0;
                break;
@@ -365,7 +375,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
                 */
                hlen = 0;
        }
-       
+
        MGETHDR(m, M_WAIT, MT_DATA);
        if (m == 0)
                return (ENOBUFS);
@@ -379,7 +389,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
        m->m_pkthdr.len = m->m_len = len;
        m->m_pkthdr.rcvif = NULL;
        *mp = m;
-       
+
        /*
         * Make room for link header.
         */
@@ -394,24 +404,27 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
        error = UIOMOVE(mtod(m, caddr_t), len - hlen, UIO_WRITE, uio);
        if (error)
                goto bad;
-       
+
        /* Check for multicast destination */
        switch (linktype) {
                case DLT_EN10MB: {
-                       struct ether_header *eh = mtod(m, struct ether_header *);
-                       
+                       struct ether_header *eh;
+
+                       eh = mtod(m, struct ether_header *);
                        if (ETHER_IS_MULTICAST(eh->ether_dhost)) {
-                               if (_ether_cmp(etherbroadcastaddr, eh->ether_dhost) == 0)
+                               if (_ether_cmp(etherbroadcastaddr,
+                                   eh->ether_dhost) == 0) {
                                        m->m_flags |= M_BCAST;
-                               else
+                               } else {
                                        m->m_flags |= M_MCAST;
+                               }
                        }
                        break;
                }
        }
-       
-       return 0;
- bad:
+
+       return (0);
+bad:
        m_freem(m);
        return (error);
 }
@@ -421,7 +434,7 @@ bpf_movein(struct uio *uio, int linktype, struct mbuf **mp, struct sockaddr *soc
 /*
  * The dynamic addition of a new device node must block all processes that
  * are opening the last device so that no process will get an unexpected
- * ENOENT 
+ * ENOENT
  */
 static void
 bpf_make_dev_t(int maj)
@@ -434,32 +447,33 @@ bpf_make_dev_t(int maj)
 
        while (bpf_growing) {
                /* Wait until new device has been created */
-               (void)tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0);
+               (void) tsleep((caddr_t)&bpf_growing, PZERO, "bpf_growing", 0);
        }
        if (nbpfilter > cur_size) {
                /* other thread grew it already */
                return;
        }
        bpf_growing = 1;
-       
+
        /* need to grow bpf_dtab first */
        if (nbpfilter == bpf_dtab_size) {
                int new_dtab_size;
                struct bpf_d **new_dtab = NULL;
                struct bpf_d **old_dtab = NULL;
-               
-               new_dtab_size = bpf_dtab_size + NBPFILTER;      
-               new_dtab = (struct bpf_d **)_MALLOC(sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT);
+
+               new_dtab_size = bpf_dtab_size + NBPFILTER;
+               new_dtab = (struct bpf_d **)_MALLOC(
+                   sizeof(struct bpf_d *) * new_dtab_size, M_DEVBUF, M_WAIT);
                if (new_dtab == 0) {
                        printf("bpf_make_dev_t: malloc bpf_dtab failed\n");
                        goto done;
                }
                if (bpf_dtab) {
-                       bcopy(bpf_dtab, new_dtab, 
-                                 sizeof(struct bpf_d *) * bpf_dtab_size);
+                       bcopy(bpf_dtab, new_dtab,
+                           sizeof(struct bpf_d *) * bpf_dtab_size);
                }
-               bzero(new_dtab + bpf_dtab_size, 
-                         sizeof(struct bpf_d *) * NBPFILTER);
+               bzero(new_dtab + bpf_dtab_size,
+                   sizeof(struct bpf_d *) * NBPFILTER);
                old_dtab = bpf_dtab;
                bpf_dtab = new_dtab;
                bpf_dtab_size = new_dtab_size;
@@ -485,7 +499,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 {
        int first = bp->bif_dlist == NULL;
        int     error = 0;
-       
+
        /*
         * Point d at bp, and add d to the interface's list of listeners.
         * Finally, point the driver's bpf cookie at the interface so
@@ -505,7 +519,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
                /* Find the default bpf entry for this ifp */
                if (bp->bif_ifp->if_bpf == NULL) {
                        struct bpf_if   *tmp, *primary = NULL;
-                       
+
                        for (tmp = bpf_iflist; tmp; tmp = tmp->bif_next) {
                                if (tmp->bif_ifp == bp->bif_ifp) {
                                        primary = tmp;
@@ -516,10 +530,12 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
                }
                /* Only call dlil_set_bpf_tap for primary dlt */
                if (bp->bif_ifp->if_bpf == bp)
-                       dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT, bpf_tap_callback);          
+                       dlil_set_bpf_tap(bp->bif_ifp, BPF_TAP_INPUT_OUTPUT,
+                           bpf_tap_callback);
 
                if (bp->bif_tap != NULL)
-                       error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt, BPF_TAP_INPUT_OUTPUT);
+                       error = bp->bif_tap(bp->bif_ifp, bp->bif_dlt,
+                           BPF_TAP_INPUT_OUTPUT);
        }
 
        /*
@@ -532,7 +548,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
        } else {
                d->bd_flags &= ~BPF_FINALIZE_PKTAP;
        }
-       return error;
+       return (error);
 }
 
 /*
@@ -583,7 +599,7 @@ bpf_detachd(struct bpf_d *d, int closing)
                        dlil_set_bpf_tap(ifp, BPF_TAP_DISABLE, NULL);
                if (bp->bif_tap)
                        bp->bif_tap(ifp, bp->bif_dlt, BPF_TAP_DISABLE);
-               
+
                for (bp = bpf_iflist; bp; bp = bp->bif_next)
                        if (bp->bif_ifp == ifp && bp->bif_dlist != 0)
                                break;
@@ -641,7 +657,6 @@ done:
                return (0);
 }
 
-
 /*
  * Start asynchronous timer, if necessary.
  * Must be called with bpf_mlock held.
@@ -660,7 +675,7 @@ bpf_start_timer(struct bpf_d *d)
                    (uint64_t)tv.tv_sec * USEC_PER_SEC + tv.tv_usec,
                    NSEC_PER_USEC, &deadline);
                /*
-                * The state is BPF_IDLE, so the timer hasn't 
+                * The state is BPF_IDLE, so the timer hasn't
                 * been started yet, and hasn't gone off yet;
                 * there is no thread call scheduled, so this
                 * won't change the schedule.
@@ -684,10 +699,10 @@ bpf_stop_timer(struct bpf_d *d)
         * If the timer has already gone off, this does nothing.
         * Our caller is expected to set d->bd_state to BPF_IDLE,
         * with the bpf_mlock, after we are called. bpf_timed_out()
-        * also grabs bpf_mlock, so, if the timer has gone off and 
+        * also grabs bpf_mlock, so, if the timer has gone off and
         * bpf_timed_out() hasn't finished, it's waiting for the
-        * lock; when this thread releases the lock, it will 
-        * find the state is BPF_IDLE, and just release the 
+        * lock; when this thread releases the lock, it will
+        * find the state is BPF_IDLE, and just release the
         * lock and return.
         */
        return (thread_call_cancel(d->bd_thread_call));
@@ -737,7 +752,7 @@ bpf_release_d(struct bpf_d *d)
 /* ARGSUSED */
 int
 bpfopen(dev_t dev, int flags, __unused int fmt,
-       __unused struct proc *p)
+       struct proc *p)
 {
        struct bpf_d *d;
 
@@ -746,28 +761,30 @@ bpfopen(dev_t dev, int flags, __unused int fmt,
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
-       /* 
-        * New device nodes are created on demand when opening the last one. 
-        * The programming model is for processes to loop on the minor starting at 0 
-        * as long as EBUSY is returned. The loop stops when either the open succeeds or 
-        * an error other that EBUSY is returned. That means that bpf_make_dev_t() must 
-        * block all processes that are opening the last  node. If not all 
-        * processes are blocked, they could unexpectedly get ENOENT and abort their 
-        * opening loop.
+       /*
+        * New device nodes are created on demand when opening the last one.
+        * The programming model is for processes to loop on the minor starting
+        * at 0 as long as EBUSY is returned. The loop stops when either the
+        * open succeeds or an error other than EBUSY is returned. That means
+        * that bpf_make_dev_t() must block all processes that are opening the
+        * last node. If not all processes are blocked, they could unexpectedly
+        * get ENOENT and abort their opening loop.
         */
        if ((unsigned int) minor(dev) == (nbpfilter - 1))
                bpf_make_dev_t(major(dev));
 
        /*
-        * Each minor can be opened by only one process.  If the requested 
+        * Each minor can be opened by only one process.  If the requested
         * minor is in use, return EBUSY.
         *
-        * Important: bpfopen() and bpfclose() have to check and set the status of a device
-        * in the same lockin context otherwise the device may be leaked because the vnode use count 
-        * will be unpextectly greater than 1 when close() is called.
+        * Important: bpfopen() and bpfclose() have to check and set the status
+        * of a device in the same locking context, otherwise the device may be
+        * leaked because the vnode use count will be unexpectedly greater than 1
+        * when close() is called.
         */
-       if (bpf_dtab[minor(dev)] == 0) {
-               bpf_dtab[minor(dev)] = (void *)1;       /* Mark opening */
+       if (bpf_dtab[minor(dev)] == NULL) {
+               /* Reserve while opening */
+               bpf_dtab[minor(dev)] = BPF_DEV_RESERVED;
        } else {
                lck_mtx_unlock(bpf_mlock);
                return (EBUSY);
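
For reference, the open loop described in the comment above is driven entirely from userspace. A minimal sketch follows (hedged: the /dev/bpfN device naming convention and the upper bound on minors are assumptions for illustration, not something this change defines):

    /*
     * Hypothetical userspace helper: probe /dev/bpf0, /dev/bpf1, ...
     * until open() succeeds or fails with something other than EBUSY.
     */
    #include <fcntl.h>
    #include <errno.h>
    #include <stdio.h>

    static int
    open_any_bpf(void)
    {
            char path[32];

            for (int minor = 0; minor < 256; minor++) {
                    int fd;

                    snprintf(path, sizeof(path), "/dev/bpf%d", minor);
                    fd = open(path, O_RDWR);
                    if (fd >= 0)
                            return (fd);    /* free minor found */
                    if (errno != EBUSY)
                            return (-1);    /* real error: stop probing */
                    /* EBUSY: this minor is owned by another process */
            }
            return (-1);
    }
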
@@ -779,7 +796,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt,
                printf("bpfopen: malloc bpf_d failed\n");
                bpf_dtab[minor(dev)] = NULL;
                lck_mtx_unlock(bpf_mlock);
-               return ENOMEM;
+               return (ENOMEM);
        }
 
        /* Mark "in use" and do most initialization. */
@@ -804,11 +821,14 @@ bpfopen(dev_t dev, int flags, __unused int fmt,
 
                return (ENOMEM);
        }
+       d->bd_opened_by = p;
+       uuid_generate(d->bd_uuid);
+
 #if CONFIG_MACF_NET
        mac_bpfdesc_label_init(d);
        mac_bpfdesc_label_associate(kauth_cred_get(), d);
 #endif
-       bpf_dtab[minor(dev)] = d;                               /* Mark opened */
+       bpf_dtab[minor(dev)] = d; /* Mark opened */
        lck_mtx_unlock(bpf_mlock);
 
        return (0);
@@ -821,7 +841,7 @@ bpfopen(dev_t dev, int flags, __unused int fmt,
 /* ARGSUSED */
 int
 bpfclose(dev_t dev, __unused int flags, __unused int fmt,
-        __unused struct proc *p)
+    __unused struct proc *p)
 {
        struct bpf_d *d;
 
@@ -829,7 +849,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
-       if (d == 0 || d == (void *)1) {
+       if (d == NULL || d == BPF_DEV_RESERVED) {
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
@@ -843,7 +863,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
                printf("%s: %llx\n",
                    __func__, (uint64_t)VM_KERNEL_ADDRPERM(d));
 
-       bpf_dtab[minor(dev)] = (void *)1;               /* Mark closing */
+       bpf_dtab[minor(dev)] = BPF_DEV_RESERVED; /* Reserve while closing */
 
        /*
         * Deal with any in-progress timeouts.
@@ -866,9 +886,9 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
                         */
                        if (!bpf_stop_timer(d)) {
                                /*
-                                * There was no pending call, so the call must 
+                                * There was no pending call, so the call must
                                 * have been in progress. Wait for the call to
-                                * complete; we have to drop the lock while 
+                                * complete; we have to drop the lock while
                                 * waiting, to let the in-progress call complete.
                                 */
                                d->bd_state = BPF_DRAINING;
@@ -891,8 +911,8 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
                         * Another thread is blocked on a close waiting for
                         * a timeout to finish.
                         * This "shouldn't happen", as the first thread to enter
-                        * bpfclose() will set bpf_dtab[minor(dev)] to 1, and 
-                        * all subsequent threads should see that and fail with 
+                        * bpfclose() will set bpf_dtab[minor(dev)] to BPF_DEV_RESERVED, and
+                        * all subsequent threads should see that and fail with
                         * ENXIO.
                         */
                        panic("Two threads blocked in a BPF close");
@@ -907,7 +927,7 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
 #endif
        thread_call_free(d->bd_thread_call);
 
-       while (d->bd_hbuf_read)
+       while (d->bd_hbuf_read != 0)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        bpf_freed(d);
@@ -922,18 +942,38 @@ bpfclose(dev_t dev, __unused int flags, __unused int fmt,
        return (0);
 }
 
-
-#define BPF_SLEEP bpf_sleep
+#define        BPF_SLEEP bpf_sleep
 
 static int
 bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo)
 {
        u_int64_t abstime = 0;
 
-       if(timo)
+       if (timo != 0)
                clock_interval_to_deadline(timo, NSEC_PER_SEC / hz, &abstime);
-       
-       return msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime);
+
+       return (msleep1((caddr_t)d, bpf_mlock, pri, wmesg, abstime));
+}
+
+static void
+bpf_finalize_pktap(struct bpf_hdr *hp, struct pktap_header *pktaphdr)
+{
+       if (pktaphdr->pth_flags & PTH_FLAG_V2_HDR) {
+               struct pktap_v2_hdr *pktap_v2_hdr;
+
+               pktap_v2_hdr = (struct pktap_v2_hdr *)pktaphdr;
+
+               if (pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP)
+                       pktap_v2_finalize_proc_info(pktap_v2_hdr);
+       } else {
+               if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP)
+                       pktap_finalize_proc_info(pktaphdr);
+
+               if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) {
+                       hp->bh_tstamp.tv_sec = pktaphdr->pth_tstamp.tv_sec;
+                       hp->bh_tstamp.tv_usec = pktaphdr->pth_tstamp.tv_usec;
+               }
+       }
 }
 
 /*
@@ -941,8 +981,8 @@ bpf_sleep(struct bpf_d *d, int pri, const char *wmesg, int timo)
  * into the hold slot, and the free buffer into the store slot.
  * Zero the length of the new store buffer.
  */
-#define ROTATE_BUFFERS(d) \
-       if (d->bd_hbuf_read) \
+#define        ROTATE_BUFFERS(d) \
+       if (d->bd_hbuf_read != 0) \
                panic("rotating bpf buffers during read"); \
        (d)->bd_hbuf = (d)->bd_sbuf; \
        (d)->bd_hlen = (d)->bd_slen; \
@@ -958,7 +998,7 @@ int
 bpfread(dev_t dev, struct uio *uio, int ioflag)
 {
        struct bpf_d *d;
-       caddr_t hbuf; 
+       caddr_t hbuf;
        int timed_out, hbuf_len;
        int error;
        int flags;
@@ -966,7 +1006,8 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
-       if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) {
+       if (d == NULL || d == BPF_DEV_RESERVED ||
+           (d->bd_flags & BPF_CLOSING) != 0) {
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
@@ -982,14 +1023,14 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                lck_mtx_unlock(bpf_mlock);
                return (EINVAL);
        }
-       
-       if (d->bd_state == BPF_WAITING)
+
+       if (d->bd_state == BPF_WAITING)
                bpf_stop_timer(d);
-       
+
        timed_out = (d->bd_state == BPF_TIMED_OUT);
        d->bd_state = BPF_IDLE;
 
-       while (d->bd_hbuf_read)
+       while (d->bd_hbuf_read != 0)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        if ((d->bd_flags & BPF_CLOSING) != 0) {
@@ -1003,8 +1044,8 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
         * have arrived to fill the store buffer.
         */
        while (d->bd_hbuf == 0) {
-               if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) 
-                       && d->bd_slen != 0) {
+               if ((d->bd_immediate || timed_out || (ioflag & IO_NDELAY)) &&
+                   d->bd_slen != 0) {
                        /*
                         * We're in immediate mode, or are reading
                         * in non-blocking mode, or a timer was
@@ -1034,8 +1075,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                        lck_mtx_unlock(bpf_mlock);
                        return (EWOULDBLOCK);
                }
-               error = BPF_SLEEP(d, PRINET|PCATCH, "bpf",
-                                 d->bd_rtout);
+               error = BPF_SLEEP(d, PRINET|PCATCH, "bpf", d->bd_rtout);
                /*
                 * Make sure device is still opened
                 */
@@ -1045,8 +1085,9 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                        return (ENXIO);
                }
 
-               while (d->bd_hbuf_read)
-                       msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
+               while (d->bd_hbuf_read != 0)
+                       msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
+                           NULL);
 
                if ((d->bd_flags & BPF_CLOSING) != 0) {
                        bpf_release_d(d);
@@ -1111,7 +1152,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
         */
 
        /*
-        * Set the hold buffer read. So we do not 
+        * Set the hold buffer read. So we do not
         * rotate the buffers until the hold buffer
         * read is complete. Also to avoid issues resulting
         * from page faults during disk sleep (<rdar://problem/13436396>).
@@ -1138,7 +1179,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                        int found = 0;
 
                        ehp = (struct bpf_hdr_ext *)(void *)p;
-                       if ((flowid = ehp->bh_flowid)) {
+                       if ((flowid = ehp->bh_flowid) != 0) {
                                if (ehp->bh_proto == IPPROTO_TCP)
                                        found = inp_findinpcb_procinfo(&tcbinfo,
                                            flowid, &soprocinfo);
@@ -1147,26 +1188,20 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                                            flowid, &soprocinfo);
                                if (found == 1) {
                                        ehp->bh_pid = soprocinfo.spi_pid;
-                                       proc_name(ehp->bh_pid, ehp->bh_comm, MAXCOMLEN);
+                                       proc_name(ehp->bh_pid, ehp->bh_comm,
+                                           MAXCOMLEN);
                                }
                                ehp->bh_flowid = 0;
                        }
 
                        if (flags & BPF_FINALIZE_PKTAP) {
                                struct pktap_header *pktaphdr;
-                               
+
                                pktaphdr = (struct pktap_header *)(void *)
                                    (p + BPF_WORDALIGN(ehp->bh_hdrlen));
 
-                               if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP)
-                                       pktap_finalize_proc_info(pktaphdr);
-
-                               if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) {
-                                       ehp->bh_tstamp.tv_sec =
-                                               pktaphdr->pth_tstamp.tv_sec;
-                                       ehp->bh_tstamp.tv_usec =
-                                               pktaphdr->pth_tstamp.tv_usec;
-                               }
+                               bpf_finalize_pktap((struct bpf_hdr *) ehp,
+                                   pktaphdr);
                        }
                        p += BPF_WORDALIGN(ehp->bh_hdrlen + ehp->bh_caplen);
                }
@@ -1177,20 +1212,12 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                while (p < hbuf + hbuf_len) {
                        struct bpf_hdr *hp;
                        struct pktap_header *pktaphdr;
-                       
+
                        hp = (struct bpf_hdr *)(void *)p;
                        pktaphdr = (struct pktap_header *)(void *)
                            (p + BPF_WORDALIGN(hp->bh_hdrlen));
 
-                       if (pktaphdr->pth_flags & PTH_FLAG_DELAY_PKTAP)
-                               pktap_finalize_proc_info(pktaphdr);
-
-                       if (pktaphdr->pth_flags & PTH_FLAG_TSTAMP) {
-                               hp->bh_tstamp.tv_sec =
-                                       pktaphdr->pth_tstamp.tv_sec;
-                               hp->bh_tstamp.tv_usec =
-                                       pktaphdr->pth_tstamp.tv_usec;
-                       }
+                       bpf_finalize_pktap(hp, pktaphdr);
 
                        p += BPF_WORDALIGN(hp->bh_hdrlen + hp->bh_caplen);
                }
@@ -1203,7 +1230,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
         * we checked above that the read buffer is bpf_bufsize bytes.
         */
        error = UIOMOVE(hbuf, hbuf_len, UIO_READ, uio);
-       
+
        lck_mtx_lock(bpf_mlock);
        /*
         * Make sure device is still opened
@@ -1213,7 +1240,7 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
-       
+
        d->bd_hbuf_read = 0;
        d->bd_fbuf = d->bd_hbuf;
        d->bd_hbuf = NULL;
@@ -1227,7 +1254,6 @@ bpfread(dev_t dev, struct uio *uio, int ioflag)
 
 }
 
-
 /*
  * If there are processes sleeping on this descriptor, wake them up.
  */
@@ -1247,7 +1273,6 @@ bpf_wakeup(struct bpf_d *d)
                KNOTE(&d->bd_sel.si_note, 1);
 }
 
-
 static void
 bpf_timed_out(void *arg, __unused void *dummy)
 {
@@ -1256,7 +1281,7 @@ bpf_timed_out(void *arg, __unused void *dummy)
        lck_mtx_lock(bpf_mlock);
        if (d->bd_state == BPF_WAITING) {
                /*
-                * There's a select or kqueue waiting for this; if there's 
+                * There's a select or kqueue waiting for this; if there's
                 * now stuff to read, wake it up.
                 */
                d->bd_state = BPF_TIMED_OUT;
@@ -1272,13 +1297,9 @@ bpf_timed_out(void *arg, __unused void *dummy)
        }
        lck_mtx_unlock(bpf_mlock);
 }
-       
-
-
-
 
 /* keep in sync with bpf_movein above: */
-#define MAX_DATALINK_HDR_LEN   (sizeof(struct firewire_header))
+#define        MAX_DATALINK_HDR_LEN    (sizeof(struct firewire_header))
 
 int
 bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
@@ -1287,7 +1308,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
        struct ifnet *ifp;
        struct mbuf *m = NULL;
        int error;
-       char              dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN];
+       char              dst_buf[SOCKADDR_HDR_LEN + MAX_DATALINK_HDR_LEN];
        int datlen = 0;
        int bif_dlt;
        int bd_hdrcmplt;
@@ -1295,7 +1316,8 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
-       if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) {
+       if (d == NULL || d == BPF_DEV_RESERVED ||
+           (d->bd_flags & BPF_CLOSING) != 0) {
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
@@ -1332,7 +1354,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
        /* bpf_movein allocating mbufs; drop lock */
        lck_mtx_unlock(bpf_mlock);
 
-       error = bpf_movein(uio, bif_dlt, &m, 
+       error = bpf_movein(uio, bif_dlt, &m,
        bd_hdrcmplt ? NULL : (struct sockaddr *)dst_buf,
        &datlen);
 
@@ -1366,7 +1388,6 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
                return (EMSGSIZE);
        }
 
-
 #if CONFIG_MACF_NET
        mac_mbuf_label_associate_bpfdesc(d, m);
 #endif
@@ -1402,7 +1423,7 @@ bpfwrite(dev_t dev, struct uio *uio, __unused int ioflag)
 static void
 reset_d(struct bpf_d *d)
 {
-       if (d->bd_hbuf_read)
+       if (d->bd_hbuf_read != 0)
                panic("resetting buffers during read");
 
        if (d->bd_hbuf) {
@@ -1418,6 +1439,146 @@ reset_d(struct bpf_d *d)
        d->bd_dcount = 0;
 }
 
+static struct bpf_d *
+bpf_get_device_from_uuid(uuid_t uuid)
+{
+       unsigned int i;
+
+       for (i = 0; i < nbpfilter; i++) {
+               struct bpf_d *d = bpf_dtab[i];
+
+               if (d == NULL || d == BPF_DEV_RESERVED ||
+                   (d->bd_flags & BPF_CLOSING) != 0)
+                       continue;
+               if (uuid_compare(uuid, d->bd_uuid) == 0)
+                       return (d);
+       }
+
+       return (NULL);
+}
+
+/*
+ * The BIOCSETUP command "atomically" attaches to the interface and
+ * copies the buffers from another BPF descriptor. This minimizes the
+ * risk of missing packets because this is done while holding
+ * the BPF global lock.
+ */
+static int
+bpf_setup(struct bpf_d *d_to, uuid_t uuid_from, ifnet_t ifp)
+{
+       struct bpf_d *d_from;
+       int error = 0;
+
+       LCK_MTX_ASSERT(bpf_mlock, LCK_MTX_ASSERT_OWNED);
+
+       /*
+        * Sanity checks
+        */
+       d_from = bpf_get_device_from_uuid(uuid_from);
+       if (d_from == NULL) {
+               error = ENOENT;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: uuids not found error %d",
+                   __func__, error);
+               return (error);
+       }
+       if (d_from->bd_opened_by != d_to->bd_opened_by) {
+               error = EACCES;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: processes not matching error %d",
+                   __func__, error);
+               return (error);
+       }
+
+       /*
+        * Prevent any read while copying
+        */
+       while (d_to->bd_hbuf_read != 0)
+               msleep((caddr_t)d_to, bpf_mlock, PRINET, __func__, NULL);
+       d_to->bd_hbuf_read = 1;
+
+       while (d_from->bd_hbuf_read != 0)
+               msleep((caddr_t)d_from, bpf_mlock, PRINET, __func__, NULL);
+       d_from->bd_hbuf_read = 1;
+
+       /*
+        * Verify the devices have not been closed
+        */
+       if (d_to->bd_flags & BPF_CLOSING) {
+               error = ENXIO;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: d_to is closing error %d",
+                   __func__, error);
+               goto done;
+       }
+       if (d_from->bd_flags & BPF_CLOSING) {
+               error = ENXIO;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: d_from is closing error %d",
+                   __func__, error);
+               goto done;
+       }
+
+       /*
+        * For now require the same buffer size
+        */
+       if (d_from->bd_bufsize != d_to->bd_bufsize) {
+               error = EINVAL;
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: bufsizes not matching error %d",
+                   __func__, error);
+               goto done;
+       }
+
+       /*
+        * Attach to the interface
+        */
+       error = bpf_setif(d_to, ifp, false, true);
+       if (error != 0) {
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: bpf_setif() failed error %d",
+                   __func__, error);
+               goto done;
+       }
+
+       /*
+        * Make sure the buffers are setup as expected by bpf_setif()
+        */
+       ASSERT(d_to->bd_hbuf == NULL);
+       ASSERT(d_to->bd_sbuf != NULL);
+       ASSERT(d_to->bd_fbuf != NULL);
+
+       /*
+        * Copy the buffers and update the pointers and counts
+        */
+       memcpy(d_to->bd_sbuf, d_from->bd_sbuf, d_from->bd_slen);
+       d_to->bd_slen = d_from->bd_slen;
+       d_to->bd_scnt = d_from->bd_scnt;
+
+       if (d_from->bd_hbuf != NULL) {
+               d_to->bd_hbuf = d_to->bd_fbuf;
+               d_to->bd_fbuf = NULL;
+               memcpy(d_to->bd_hbuf, d_from->bd_hbuf, d_from->bd_hlen);
+       }
+       d_to->bd_hlen = d_from->bd_hlen;
+       d_to->bd_hcnt = d_from->bd_hcnt;
+
+       if (bpf_debug > 0) {
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: done slen %u scnt %u hlen %u hcnt %u",
+                   __func__, d_to->bd_slen, d_to->bd_scnt,
+                   d_to->bd_hlen, d_to->bd_hcnt);
+       }
+done:
+       d_from->bd_hbuf_read = 0;
+       wakeup((caddr_t)d_from);
+
+       d_to->bd_hbuf_read = 0;
+       wakeup((caddr_t)d_to);
+
+       return (error);
+}
+
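
Taken together with the BIOCGETUUID and BIOCSETUP ioctl cases added further down, bpf_setup() enables a handoff pattern between two descriptors owned by the same process. A hedged userspace sketch (struct bpf_setup_args and its bsa_uuid/bsa_ifname fields come from this change; the helper name and the minimal error handling are illustrative):

    #include <sys/ioctl.h>
    #include <net/if.h>
    #include <net/bpf.h>
    #include <string.h>

    /*
     * Copy the capture state of fd_from onto fd_to and attach fd_to to
     * ifname. Both descriptors must have been opened by the same process
     * (EACCES otherwise) and must use the same buffer size (EINVAL).
     */
    static int
    clone_capture(int fd_from, int fd_to, const char *ifname)
    {
            struct bpf_setup_args bsa;

            memset(&bsa, 0, sizeof(bsa));
            if (ioctl(fd_from, BIOCGETUUID, &bsa.bsa_uuid) == -1)
                    return (-1);
            strlcpy(bsa.bsa_ifname, ifname, sizeof(bsa.bsa_ifname));

            /* Attach fd_to and inherit fd_from's buffers atomically. */
            return (ioctl(fd_to, BIOCSETUP, &bsa));
    }
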
 /*
  *  FIONREAD           Check for read packet available.
  *  SIOCGIFADDR                Get interface address - convenient hook to driver.
@@ -1456,7 +1617,8 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
-       if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) {
+       if (d == NULL || d == BPF_DEV_RESERVED ||
+           (d->bd_flags & BPF_CLOSING) != 0) {
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
@@ -1511,30 +1673,48 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
        /*
         * Set buffer length.
         */
-       case BIOCSBLEN:                 /* u_int */
-               if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING))
-                       error = EINVAL;
-               else {
-                       u_int size;
+       case BIOCSBLEN: {               /* u_int */
+               u_int size;
+               unsigned int maxbufsize = bpf_maxbufsize;
 
-                       bcopy(addr, &size, sizeof (size));
+               /*
+                * Allow a larger buffer in head drop mode, with the
+                * assumption that the reading process may be low priority
+                * but is interested in the most recent traffic.
+                */
+               if (d->bd_headdrop != 0) {
+                       maxbufsize = 2 * bpf_maxbufsize;
+               }
 
+               if (d->bd_bif != 0 || (d->bd_flags & BPF_DETACHING)) {
                        /*
-                        * Allow larger buffer in head drop mode with the
-                        * assumption the capture is in standby mode to
-                        * keep a cache of recent traffic
+                        * Interface already attached, unable to change buffers
                         */
-                       if (d->bd_headdrop != 0 && size > 2 * bpf_maxbufsize)
-                               size = 2 * bpf_maxbufsize;
-                       else if (size > bpf_maxbufsize)
-                               size = bpf_maxbufsize;
-                       else if (size < BPF_MINBUFSIZE)
-                               size = BPF_MINBUFSIZE;
-                       bcopy(&size, addr, sizeof (size));
+                       error = EINVAL;
+                       break;
+               }
+               bcopy(addr, &size, sizeof (size));
+
+               if (size > maxbufsize) {
+                       d->bd_bufsize = maxbufsize;
+
+                       os_log_info(OS_LOG_DEFAULT,
+                           "%s bufsize capped to %u from %u",
+                           __func__, d->bd_bufsize, size);
+               } else if (size < BPF_MINBUFSIZE) {
+                       d->bd_bufsize = BPF_MINBUFSIZE;
+
+                       os_log_info(OS_LOG_DEFAULT,
+                           "%s bufsize bumped to %u from %u",
+                           __func__, d->bd_bufsize, size);
+               } else {
                        d->bd_bufsize = size;
                }
-               break;
 
+               /* It's a read/write ioctl */
+               bcopy(&d->bd_bufsize, addr, sizeof (u_int));
+               break;
+       }
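
The interaction between head-drop mode and the cap applied here means a process that wants the doubled limit has to enable head drop before issuing BIOCSBLEN, and because BIOCSBLEN is a read/write ioctl it can read back the value the kernel actually accepted. A hedged sketch (BIOCSHEADDROP is assumed to be the setter counterpart of the BIOCGHEADDROP case shown below; the helper itself is illustrative):

    #include <sys/types.h>
    #include <sys/ioctl.h>
    #include <net/bpf.h>

    /*
     * Request a large capture buffer. Must be called before BIOCSETIF,
     * since BIOCSBLEN fails with EINVAL once an interface is attached.
     * Returns the size the kernel actually accepted, or 0 on error.
     */
    static u_int
    request_headdrop_buffer(int fd, u_int size)
    {
            int on = 1;

            if (ioctl(fd, BIOCSHEADDROP, &on) == -1)
                    return (0);
            if (ioctl(fd, BIOCSBLEN, &size) == -1)
                    return (0);
            /* size was copied back, possibly clamped by the kernel */
            return (size);
    }
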
        /*
         * Set link layer read filter.
         */
@@ -1561,8 +1741,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
         * Flush read packet buffer.
         */
        case BIOCFLUSH:
-               while (d->bd_hbuf_read) {
-                       msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
+               while (d->bd_hbuf_read != 0) {
+                       msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading",
+                           NULL);
                }
                if ((d->bd_flags & BPF_CLOSING) != 0) {
                        error = ENXIO;
@@ -1622,9 +1803,9 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
                        u_int dlt;
 
                        bcopy(addr, &dlt, sizeof (dlt));
-                       
-                       if (dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
-                               printf("BIOCSDLT downgrade DLT_PKTAP to DLT_RAW\n");
+
+                       if (dlt == DLT_PKTAP &&
+                           !(d->bd_flags & BPF_WANT_PKTAP)) {
                                dlt = DLT_RAW;
                        }
                        error = bpf_setdlt(d, dlt);
@@ -1657,7 +1838,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
                if (ifp == NULL)
                        error = ENXIO;
                else
-                       error = bpf_setif(d, ifp);
+                       error = bpf_setif(d, ifp, true, false);
                break;
        }
 
@@ -1846,7 +2027,7 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
                        d->bd_flags &= ~BPF_EXTENDED_HDR;
                break;
 
-       case BIOCGIFATTACHCOUNT: {              /* struct ifreq */
+       case BIOCGIFATTACHCOUNT: {              /* struct ifreq */
                ifnet_t ifp;
                struct bpf_if *bp;
 
@@ -1860,10 +2041,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
                ifr.ifr_intval = 0;
                for (bp = bpf_iflist; bp != 0; bp = bp->bif_next) {
                        struct bpf_d *bpf_d;
-                       
+
                        if (bp->bif_ifp == NULL || bp->bif_ifp != ifp)
                                continue;
-                       for (bpf_d = bp->bif_dlist; bpf_d; bpf_d = bpf_d->bd_next) {
+                       for (bpf_d = bp->bif_dlist; bpf_d;
+                           bpf_d = bpf_d->bd_next) {
                                ifr.ifr_intval += 1;
                        }
                }
@@ -1876,11 +2058,11 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
                break;
 
        case BIOCSWANTPKTAP:                    /* u_int */
-                bcopy(addr, &int_arg, sizeof (int_arg));
-                if (int_arg)
-                        d->bd_flags |= BPF_WANT_PKTAP;
-                else
-                        d->bd_flags &= ~BPF_WANT_PKTAP;
+               bcopy(addr, &int_arg, sizeof (int_arg));
+               if (int_arg)
+                       d->bd_flags |= BPF_WANT_PKTAP;
+               else
+                       d->bd_flags &= ~BPF_WANT_PKTAP;
                break;
 #endif
 
@@ -1892,6 +2074,49 @@ bpfioctl(dev_t dev, u_long cmd, caddr_t addr, __unused int flags,
        case BIOCGHEADDROP:
                bcopy(&d->bd_headdrop, addr, sizeof (int));
                break;
+
+       case BIOCSTRUNCATE:
+               bcopy(addr, &int_arg, sizeof(int_arg));
+               if (int_arg)
+                       d->bd_flags |= BPF_TRUNCATE;
+               else
+                       d->bd_flags &= ~BPF_TRUNCATE;
+               break;
+
+       case BIOCGETUUID:
+               bcopy(&d->bd_uuid, addr, sizeof (uuid_t));
+               break;
+
+       case BIOCSETUP: {
+               struct bpf_setup_args bsa;
+               ifnet_t ifp;
+
+               bcopy(addr, &bsa, sizeof (struct bpf_setup_args));
+               bsa.bsa_ifname[IFNAMSIZ - 1] = 0;
+               ifp = ifunit(bsa.bsa_ifname);
+               if (ifp == NULL) {
+                       error = ENXIO;
+                       os_log_info(OS_LOG_DEFAULT,
+                           "%s: ifnet not found for %s error %d",
+                           __func__, bsa.bsa_ifname, error);
+                       break;
+               }
+
+               error = bpf_setup(d, bsa.bsa_uuid, ifp);
+               break;
+       }
+       case BIOCSPKTHDRV2:
+               bcopy(addr, &int_arg, sizeof(int_arg));
+               if (int_arg != 0)
+                       d->bd_flags |= BPF_PKTHDRV2;
+               else
+                       d->bd_flags &= ~BPF_PKTHDRV2;
+               break;
+
+       case BIOCGPKTHDRV2:
+               int_arg = d->bd_flags & BPF_PKTHDRV2 ? 1 : 0;
+               bcopy(&int_arg, addr, sizeof (int));
+               break;
        }
 
        bpf_release_d(d);
@@ -1911,12 +2136,12 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns,
        struct bpf_insn *fcode, *old;
        u_int flen, size;
 
-       while (d->bd_hbuf_read)
+       while (d->bd_hbuf_read != 0)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        if ((d->bd_flags & BPF_CLOSING) != 0)
                return (ENXIO);
-       
+
        old = d->bd_filter;
        if (bf_insns == USER_ADDR_NULL) {
                if (bf_len != 0)
@@ -1940,10 +2165,10 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns,
        if (copyin(bf_insns, (caddr_t)fcode, size) == 0 &&
            bpf_validate(fcode, (int)flen)) {
                d->bd_filter = fcode;
-       
+
                if (cmd == BIOCSETF32 || cmd == BIOCSETF64)
                        reset_d(d);
-       
+
                if (old != 0)
                        FREE((caddr_t)old, M_DEVBUF);
 
@@ -1959,12 +2184,12 @@ bpf_setf(struct bpf_d *d, u_int bf_len, user_addr_t bf_insns,
  * Return an errno or 0.
  */
 static int
-bpf_setif(struct bpf_d *d, ifnet_t theywant)
+bpf_setif(struct bpf_d *d, ifnet_t theywant, bool do_reset, bool has_hbuf_read)
 {
        struct bpf_if *bp;
        int error;
 
-       while (d->bd_hbuf_read)
+       while (d->bd_hbuf_read != 0 && !has_hbuf_read)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        if ((d->bd_flags & BPF_CLOSING) != 0)
@@ -2006,15 +2231,15 @@ bpf_setif(struct bpf_d *d, ifnet_t theywant)
                        if (bpf_attachd(d, bp) != 0)
                                return (ENXIO);
                }
+               if (do_reset) {
                reset_d(d);
+               }
                return (0);
        }
        /* Not found. */
        return (ENXIO);
 }
 
-
-
 /*
  * Get a list of available data link type of the interface.
  */
@@ -2042,7 +2267,7 @@ bpf_getdltlist(struct bpf_d *d, caddr_t addr, struct proc *p)
        for (bp = bpf_iflist; bp; bp = bp->bif_next) {
                if (bp->bif_ifp != ifp)
                        continue;
-               /* 
+               /*
                 * Do not use DLT_PKTAP, unless requested explicitly
                 */
                if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP))
@@ -2074,11 +2299,11 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt)
        int error, opromisc;
        struct ifnet *ifp;
        struct bpf_if *bp;
-       
+
        if (d->bd_bif->bif_dlt == dlt)
                return (0);
-       
-       while (d->bd_hbuf_read)
+
+       while (d->bd_hbuf_read != 0)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        if ((d->bd_flags & BPF_CLOSING) != 0)
@@ -2090,7 +2315,8 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt)
                        /*
                         * Do not use DLT_PKTAP, unless requested explicitly
                         */
-                       if (bp->bif_dlt == DLT_PKTAP && !(d->bd_flags & BPF_WANT_PKTAP)) {
+                       if (bp->bif_dlt == DLT_PKTAP &&
+                           !(d->bd_flags & BPF_WANT_PKTAP)) {
                                continue;
                        }
                        break;
@@ -2103,8 +2329,9 @@ bpf_setdlt(struct bpf_d *d, uint32_t dlt)
                error = bpf_attachd(d, bp);
                if (error) {
                        printf("bpf_setdlt: bpf_attachd %s%d failed (%d)\n",
-                               ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp), error);
-                       return error;
+                           ifnet_name(bp->bif_ifp), ifnet_unit(bp->bif_ifp),
+                           error);
+                       return (error);
                }
                reset_d(d);
                if (opromisc) {
@@ -2161,7 +2388,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
-       if (d == 0 || d == (void *)1 || (d->bd_flags & BPF_CLOSING) != 0) {
+       if (d == NULL || d == BPF_DEV_RESERVED ||
+           (d->bd_flags & BPF_CLOSING) != 0) {
                lck_mtx_unlock(bpf_mlock);
                return (ENXIO);
        }
@@ -2174,7 +2402,7 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
                return (ENXIO);
        }
 
-       while (d->bd_hbuf_read)
+       while (d->bd_hbuf_read != 0)
                msleep((caddr_t)d, bpf_mlock, PRINET, "bpf_reading", NULL);
 
        if ((d->bd_flags & BPF_CLOSING) != 0) {
@@ -2186,8 +2414,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
        switch (which) {
                case FREAD:
                        if (d->bd_hlen != 0 ||
-                                       ((d->bd_immediate || d->bd_state == BPF_TIMED_OUT) &&
-                                        d->bd_slen != 0))
+                           ((d->bd_immediate ||
+                           d->bd_state == BPF_TIMED_OUT) && d->bd_slen != 0))
                                ret = 1; /* read has data to return */
                        else {
                                /*
@@ -2201,7 +2429,8 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
                        break;
 
                case FWRITE:
-                       ret = 1; /* can't determine whether a write would block */
+                       /* can't determine whether a write would block */
+                       ret = 1;
                        break;
        }
 
@@ -2211,7 +2440,6 @@ bpfselect(dev_t dev, int which, void * wql, struct proc *p)
        return (ret);
 }
 
-
 /*
  * Support for kevent() system call.  Register EVFILT_READ filters and
  * reject all others.
@@ -2220,10 +2448,11 @@ int bpfkqfilter(dev_t dev, struct knote *kn);
 static void filt_bpfdetach(struct knote *);
 static int filt_bpfread(struct knote *, long);
 static int filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev);
-static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
+static int filt_bpfprocess(struct knote *kn, struct filt_process_s *data,
+    struct kevent_internal_s *kev);
 
 SECURITY_READ_ONLY_EARLY(struct filterops) bpfread_filtops = {
-       .f_isfd = 1, 
+       .f_isfd = 1,
        .f_detach = filt_bpfdetach,
        .f_event = filt_bpfread,
        .f_touch = filt_bpftouch,
@@ -2237,24 +2466,23 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d)
 
        if (d->bd_immediate) {
                /*
-                * If there's data in the hold buffer, it's the 
+                * If there's data in the hold buffer, it's the
                 * amount of data a read will return.
                 *
                 * If there's no data in the hold buffer, but
                 * there's data in the store buffer, a read will
-                * immediately rotate the store buffer to the 
+                * immediately rotate the store buffer to the
                 * hold buffer, the amount of data in the store
-                * buffer is the amount of data a read will 
+                * buffer is the amount of data a read will
                 * return.
                 *
-                * If there's no data in either buffer, we're not 
+                * If there's no data in either buffer, we're not
                 * ready to read.
                 */
-               kn->kn_data = ((d->bd_hlen == 0  || d->bd_hbuf_read) 
-                   ? d->bd_slen : d->bd_hlen);
+               kn->kn_data = (d->bd_hlen == 0 || d->bd_hbuf_read != 0 ?
+                   d->bd_slen : d->bd_hlen);
                int64_t lowwat = 1;
-               if (kn->kn_sfflags & NOTE_LOWAT)
-               {
+               if (kn->kn_sfflags & NOTE_LOWAT) {
                        if (kn->kn_sdata > d->bd_bufsize)
                                lowwat = d->bd_bufsize;
                        else if (kn->kn_sdata > lowwat)
@@ -2263,22 +2491,22 @@ filt_bpfread_common(struct knote *kn, struct bpf_d *d)
                ready = (kn->kn_data >= lowwat);
        } else {
                /*
-                * If there's data in the hold buffer, it's the 
+                * If there's data in the hold buffer, it's the
                 * amount of data a read will return.
                 *
-                * If there's no data in the hold buffer, but 
-                * there's data in the store buffer, if the 
+                * If there's no data in the hold buffer, but
+                * there's data in the store buffer, if the
                 * timer has expired a read will immediately
                 * rotate the store buffer to the hold buffer,
-                * so the amount of data in the store buffer is 
+                * so the amount of data in the store buffer is
                 * the amount of data a read will return.
                 *
-                * If there's no data in either buffer, or there's 
-                * no data in the hold buffer and the timer hasn't 
+                * If there's no data in either buffer, or there's
+                * no data in the hold buffer and the timer hasn't
                 * expired, we're not ready to read.
                 */
-               kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read) && d->bd_state == BPF_TIMED_OUT ? 
-                               d->bd_slen : d->bd_hlen);
+               kn->kn_data = ((d->bd_hlen == 0 || d->bd_hbuf_read != 0) &&
+                   d->bd_state == BPF_TIMED_OUT ? d->bd_slen : d->bd_hlen);
                ready = (kn->kn_data > 0);
        }
        if (!ready)
@@ -2300,21 +2528,20 @@ bpfkqfilter(dev_t dev, struct knote *kn)
            kn->kn_filter != EVFILT_READ) {
                kn->kn_flags = EV_ERROR;
                kn->kn_data = EINVAL;
-               return 0;
+               return (0);
        }
 
        lck_mtx_lock(bpf_mlock);
 
        d = bpf_dtab[minor(dev)];
 
-       if (d == 0 ||
-           d == (void *)1 ||
-           d->bd_bif == NULL ||
-           (d->bd_flags & BPF_CLOSING) != 0) {
+       if (d == NULL || d == BPF_DEV_RESERVED ||
+           (d->bd_flags & BPF_CLOSING) != 0 ||
+           d->bd_bif == NULL) {
                lck_mtx_unlock(bpf_mlock);
                kn->kn_flags = EV_ERROR;
                kn->kn_data = ENXIO;
-               return 0;
+               return (0);
        }
 
        kn->kn_hook = d;
@@ -2349,7 +2576,7 @@ filt_bpfread(struct knote *kn, long hint)
 #pragma unused(hint)
        struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
 
-       return filt_bpfread_common(kn, d);
+       return (filt_bpfread_common(kn, d));
 }
 
 static int
@@ -2363,19 +2590,18 @@ filt_bpftouch(struct knote *kn, struct kevent_internal_s *kev)
        /* save off the lowat threshold and flag */
        kn->kn_sdata = kev->data;
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* output data will be re-generated here */
        res = filt_bpfread_common(kn, d);
 
        lck_mtx_unlock(bpf_mlock);
 
-       return res;
+       return (res);
 }
 
 static int
-filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
+filt_bpfprocess(struct knote *kn, struct filt_process_s *data,
+    struct kevent_internal_s *kev)
 {
 #pragma unused(data)
        struct bpf_d *d = (struct bpf_d *)kn->kn_hook;
@@ -2388,11 +2614,11 @@ filt_bpfprocess(struct knote *kn, struct filt_process_s *data, struct kevent_int
        }
        lck_mtx_unlock(bpf_mlock);
 
-       return res;
+       return (res);
 }
 
 /*
- * Copy data from an mbuf chain into a buffer.  This code is derived
+ * Copy data from an mbuf chain into a buffer. This code is derived
  * from m_copydata in kern/uipc_mbuf.c.
  */
 static void
@@ -2456,11 +2682,44 @@ bpf_tap_imp(
                goto done;
        }
        for (d = bp->bif_dlist; d; d = d->bd_next) {
+               struct bpf_packet *bpf_pkt_saved = bpf_pkt;
+               struct bpf_packet bpf_pkt_tmp;
+               struct pktap_header_buffer bpfp_header_tmp;
+
                if (outbound && !d->bd_seesent)
                        continue;
+
                ++d->bd_rcount;
                slen = bpf_filter(d->bd_filter, (u_char *)bpf_pkt,
-                                 bpf_pkt->bpfp_total_length, 0);
+                   bpf_pkt->bpfp_total_length, 0);
+               if (bp->bif_ifp->if_type == IFT_PKTAP &&
+                   bp->bif_dlt == DLT_PKTAP) {
+                       /*
+                        * Need to copy the bpf_pkt because the conversion
+                        * to v2 pktap header modifies the content of the
+                        * bpfp_header
+                        */
+                       if ((d->bd_flags & BPF_PKTHDRV2) &&
+                           bpf_pkt->bpfp_header_length <= sizeof(bpfp_header_tmp)) {
+                               bpf_pkt_tmp = *bpf_pkt;
+
+                               bpf_pkt = &bpf_pkt_tmp;
+
+                               memcpy(&bpfp_header_tmp, bpf_pkt->bpfp_header,
+                                   bpf_pkt->bpfp_header_length);
+
+                               bpf_pkt->bpfp_header = &bpfp_header_tmp;
+
+                               convert_to_pktap_header_to_v2(bpf_pkt,
+                                   !!(d->bd_flags & BPF_TRUNCATE));
+                       }
+
+                       if (d->bd_flags & BPF_TRUNCATE) {
+                               slen = min(slen,
+                                   get_pkt_trunc_len((u_char *)bpf_pkt,
+                                   bpf_pkt->bpfp_total_length));
+                       }
+               }
                if (slen != 0) {
 #if CONFIG_MACF_NET
                        if (mac_bpfdesc_check_receive(d, bp->bif_ifp) != 0)
@@ -2468,9 +2727,10 @@ bpf_tap_imp(
 #endif
                        catchpacket(d, bpf_pkt, slen, outbound);
                }
+               bpf_pkt = bpf_pkt_saved;
        }
 
- done:
+done:
        lck_mtx_unlock(bpf_mlock);
 }
 
@@ -2531,11 +2791,24 @@ bpf_tap_in(
 static int bpf_tap_callback(struct ifnet *ifp, struct mbuf *m)
 {
        bpf_tap_mbuf(ifp, 0, m, NULL, 0, mbuf_pkthdr_rcvif(m) == NULL);
-       
-       return 0;
+
+       return (0);
 }
 
 
+static errno_t
+bpf_copydata(struct bpf_packet *pkt, size_t off, size_t len, void* out_data)
+{
+       errno_t err = 0;
+       if (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF) {
+               err = mbuf_copydata(pkt->bpfp_mbuf, off, len, out_data);
+       } else {
+               err = EINVAL;
+       }
+
+       return (err);
+}
+
 static void
 copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len)
 {
@@ -2560,6 +2833,347 @@ copy_bpf_packet(struct bpf_packet * pkt, void * dst, size_t len)
        }
 }
 
+static uint16_t
+get_esp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
+    const uint16_t remaining_caplen)
+{
+       /*
+        * For some reason tcpdump expects to have one byte beyond the ESP header
+        */
+       uint16_t trunc_len = ESP_HDR_SIZE + 1;
+
+       if (trunc_len > remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+}
+
+static uint16_t
+get_isakmp_trunc_len(__unused struct bpf_packet *pkt, __unused uint16_t off,
+    const uint16_t remaining_caplen)
+{
+       /*
+        * Include the payload generic header
+        */
+       uint16_t trunc_len = ISAKMP_HDR_SIZE;
+
+       if (trunc_len > remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+}
+
+static uint16_t
+get_isakmp_natt_trunc_len(struct bpf_packet *pkt, uint16_t off,
+    const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t trunc_len = 0;
+       char payload[remaining_caplen];
+
+       err = bpf_copydata(pkt, off, remaining_caplen, payload);
+       if (err != 0)
+               return (remaining_caplen);
+       /*
+        * There are three cases:
+        * - IKE: the payload starts with a 4-byte header set to zero before the ISAKMP header
+        * - keep-alive: 1-byte payload
+        * - otherwise it is ESP
+        */
+       if (remaining_caplen >= 4 &&
+               payload[0] == 0 && payload[1] == 0 &&
+               payload[2] == 0 && payload[3] == 0) {
+               trunc_len = 4 + get_isakmp_trunc_len(pkt, off + 4, remaining_caplen - 4);
+       } else if (remaining_caplen == 1) {
+               trunc_len = 1;
+       } else {
+               trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
+       }
+
+       if (trunc_len > remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+
+}
+
+static uint16_t
+get_udp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t trunc_len = sizeof(struct udphdr); /* By default no UDP payload */
+
+       if (trunc_len >= remaining_caplen)
+               return (remaining_caplen);
+
+       struct udphdr udphdr;
+       err = bpf_copydata(pkt, off, sizeof(struct udphdr), &udphdr);
+       if (err != 0)
+               return (remaining_caplen);
+
+       u_short sport, dport;
+
+       sport = EXTRACT_SHORT(&udphdr.uh_sport);
+       dport = EXTRACT_SHORT(&udphdr.uh_dport);
+
+       if (dport == PORT_DNS || sport == PORT_DNS) {
+               /*
+                * Full UDP payload for DNS
+                */
+               trunc_len = remaining_caplen;
+       } else if ((sport == PORT_BOOTPS && dport == PORT_BOOTPC) ||
+               (sport == PORT_BOOTPC && dport == PORT_BOOTPS)) {
+               /*
+                * Full UDP payload for BOOTP and DHCP
+                */
+               trunc_len = remaining_caplen;
+       } else if (dport == PORT_ISAKMP && sport == PORT_ISAKMP) {
+               /*
+                * Return the ISAKMP header
+                */
+               trunc_len += get_isakmp_trunc_len(pkt, off + sizeof(struct udphdr),
+                   remaining_caplen - sizeof(struct udphdr));
+       } else if (dport == PORT_ISAKMP_NATT && sport == PORT_ISAKMP_NATT) {
+               trunc_len += get_isakmp_natt_trunc_len(pkt, off + sizeof(struct udphdr),
+                   remaining_caplen - sizeof(struct udphdr));
+       }
+       if (trunc_len >= remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+}
+
+static uint16_t
+get_tcp_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t trunc_len = sizeof(struct tcphdr); /* By default no TCP payload */
+       if (trunc_len >= remaining_caplen)
+               return (remaining_caplen);
+
+       struct tcphdr tcphdr;
+       err = bpf_copydata(pkt, off, sizeof(struct tcphdr), &tcphdr);
+       if (err != 0)
+               return (remaining_caplen);
+
+       u_short sport, dport;
+       sport = EXTRACT_SHORT(&tcphdr.th_sport);
+       dport = EXTRACT_SHORT(&tcphdr.th_dport);
+
+       if (dport == PORT_DNS || sport == PORT_DNS) {
+               /*
+                * Full TCP payload for DNS
+                */
+               trunc_len = remaining_caplen;
+       } else {
+               trunc_len = tcphdr.th_off << 2;
+       }
+       if (trunc_len >= remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+}
+
+static uint16_t
+get_proto_trunc_len(uint8_t proto, struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
+{
+       uint16_t trunc_len;
+
+       switch (proto) {
+       case IPPROTO_ICMP: {
+               /*
+                * Full ICMP payload
+                */
+               trunc_len = remaining_caplen;
+               break;
+       }
+       case IPPROTO_ICMPV6: {
+               /*
+                * Full ICMPv6 payload
+                */
+               trunc_len = remaining_caplen;
+               break;
+       }
+       case IPPROTO_IGMP: {
+               /*
+                * Full IGMP payload
+                */
+               trunc_len = remaining_caplen;
+               break;
+       }
+       case IPPROTO_UDP: {
+               trunc_len = get_udp_trunc_len(pkt, off, remaining_caplen);
+               break;
+       }
+       case IPPROTO_TCP: {
+               trunc_len = get_tcp_trunc_len(pkt, off, remaining_caplen);
+               break;
+       }
+       case IPPROTO_ESP: {
+               trunc_len = get_esp_trunc_len(pkt, off, remaining_caplen);
+               break;
+       }
+       default: {
+               /*
+                * By default we only include the IP header
+                */
+               trunc_len = 0;
+               break;
+       }
+       }
+       if (trunc_len >= remaining_caplen)
+               return (remaining_caplen);
+
+       return (trunc_len);
+}
+
+static uint16_t
+get_ip_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t iplen = sizeof(struct ip);
+       if (iplen >= remaining_caplen)
+               return (remaining_caplen);
+
+       struct ip iphdr;
+       err =  bpf_copydata(pkt, off, sizeof(struct ip), &iphdr);
+       if (err != 0)
+               return (remaining_caplen);
+
+       uint8_t proto = 0;
+
+       iplen = iphdr.ip_hl << 2;
+       if (iplen >= remaining_caplen)
+               return (remaining_caplen);
+
+       proto = iphdr.ip_p;
+       iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
+
+       if (iplen >= remaining_caplen)
+               return (remaining_caplen);
+
+       return (iplen);
+}
+
+static uint16_t
+get_ip6_trunc_len(struct bpf_packet *pkt, uint16_t off, const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t iplen = sizeof(struct ip6_hdr);
+       if (iplen >= remaining_caplen)
+               return (remaining_caplen);
+
+       struct ip6_hdr ip6hdr;
+       err = bpf_copydata(pkt, off, sizeof(struct ip6_hdr), &ip6hdr);
+       if (err != 0)
+               return (remaining_caplen);
+
+       uint8_t proto = 0;
+
+       /*
+        * TBD: process the extension headers
+        */
+       proto = ip6hdr.ip6_nxt;
+       iplen += get_proto_trunc_len(proto, pkt, off + iplen, remaining_caplen - iplen);
+
+       if (iplen >= remaining_caplen)
+               return (remaining_caplen);
+
+       return (iplen);
+}
+
+static uint16_t
+get_ether_trunc_len(struct bpf_packet *pkt, int off, const uint16_t remaining_caplen)
+{
+       int err = 0;
+       uint16_t ethlen = sizeof(struct ether_header);
+       if (ethlen >= remaining_caplen)
+               return (remaining_caplen);
+
+       struct ether_header eh;
+       u_short type;
+       err = bpf_copydata(pkt, off, sizeof(struct ether_header), &eh);
+       if (err != 0)
+               return (remaining_caplen);
+
+       type = EXTRACT_SHORT(&eh.ether_type);
+       /* Include full ARP */
+       if (type == ETHERTYPE_ARP) {
+               ethlen = remaining_caplen;
+       } else if (type != ETHERTYPE_IP && type != ETHERTYPE_IPV6) {
+               ethlen = min(BPF_MIN_PKT_SIZE, remaining_caplen);
+       } else {
+               if (type == ETHERTYPE_IP) {
+                       ethlen += get_ip_trunc_len(pkt, sizeof(struct ether_header),
+                           remaining_caplen);
+               } else if (type == ETHERTYPE_IPV6) {
+                       ethlen += get_ip6_trunc_len(pkt, sizeof(struct ether_header),
+                           remaining_caplen);
+               }
+       }
+       return (ethlen);
+}
+
+static uint32_t
+get_pkt_trunc_len(u_char *p, u_int len)
+{
+       struct bpf_packet *pkt = (struct bpf_packet *)(void *) p;
+       struct pktap_header *pktap = (struct pktap_header *) (pkt->bpfp_header);
+       uint32_t out_pkt_len = 0, tlen = 0;
+       /*
+        * pktap->pth_frame_pre_length is L2 header length and accounts
+        * for both pre and pre_adjust.
+        * pktap->pth_length is sizeof(pktap_header) (excl the pre/pre_adjust)
+        * pkt->bpfp_header_length is (pktap->pth_length + pre_adjust)
+        * pre is the offset to the L3 header after the bpfp_header, or length
+        * of L2 header after bpfp_header, if present.
+        */
+       uint32_t pre = pktap->pth_frame_pre_length -
+           (pkt->bpfp_header_length - pktap->pth_length);
+
+       /* Length of the input packet starting from the L3 header */
+       uint32_t in_pkt_len = len - pkt->bpfp_header_length - pre;
+       if (pktap->pth_protocol_family == AF_INET ||
+           pktap->pth_protocol_family == AF_INET6) {
+               /* Contains L2 header */
+               if (pre > 0) {
+                       if (pre < sizeof(struct ether_header))
+                               goto too_short;
+
+                       out_pkt_len = get_ether_trunc_len(pkt, 0, in_pkt_len);
+               } else if (pre == 0) {
+                       if (pktap->pth_protocol_family == AF_INET) {
+                               out_pkt_len = get_ip_trunc_len(pkt, pre, in_pkt_len);
+                       } else if (pktap->pth_protocol_family == AF_INET6) {
+                               out_pkt_len = get_ip6_trunc_len(pkt, pre, in_pkt_len);
+                       }
+               } else {
+                       /* Ideally pre should be >= 0. This is an exception */
+                       out_pkt_len = min(BPF_MIN_PKT_SIZE, in_pkt_len);
+               }
+       } else {
+               if (pktap->pth_iftype == IFT_ETHER) {
+                       if (in_pkt_len < sizeof(struct ether_header)) {
+                               goto too_short;
+                       }
+                       /* At most include the Ethernet header and 16 bytes */
+                       out_pkt_len = MIN(sizeof(struct ether_header) + 16,
+                           in_pkt_len);
+               } else {
+                       /*
+                        * For unknown protocols include at most 16 bytes
+                        */
+                       out_pkt_len = MIN(16, in_pkt_len);
+               }
+       }
+done:
+       tlen = pkt->bpfp_header_length + out_pkt_len + pre;
+       return (tlen);
+too_short:
+       out_pkt_len = in_pkt_len;
+       goto done;
+}
+
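
A worked example of the offset arithmetic above (numbers purely illustrative): if pth_length is 108 and bpfp_header_length is 112 (a pre_adjust of 4), and pth_frame_pre_length is 18 (a 14-byte Ethernet header plus the same 4-byte adjust), then pre = 18 - (112 - 108) = 14, i.e. exactly one Ethernet header follows the bpf header. For a captured length len of 240 bytes, the L3 portion considered for truncation is in_pkt_len = 240 - 112 - 14 = 114 bytes, and the returned value is tlen = bpfp_header_length + out_pkt_len + pre.
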
 /*
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  Return 1 if it's time to wakeup a listener (buffer full),
@@ -2605,11 +3219,11 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
                 * We cannot rotate buffers if a read is in progress
                 * so drop the packet
                 */
-               if (d->bd_hbuf_read) {
+               if (d->bd_hbuf_read != 0) {
                        ++d->bd_dcount;
                        return;
                }
-               
+
                if (d->bd_fbuf == NULL) {
                        if (d->bd_headdrop == 0) {
                                /*
@@ -2630,11 +3244,10 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
                }
                do_wakeup = 1;
                curlen = 0;
-       }
-       else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
+       } else if (d->bd_immediate || d->bd_state == BPF_TIMED_OUT)
                /*
-                * Immediate mode is set, or the read timeout has 
-                * already expired during a select call. A packet 
+                * Immediate mode is set, or the read timeout has
+                * already expired during a select call. A packet
                 * arrived, so the reader should be woken up.
                 */
                do_wakeup = 1;
@@ -2643,18 +3256,18 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
         * Append the bpf header.
         */
        microtime(&tv);
-       if (d->bd_flags & BPF_EXTENDED_HDR) {
+       if (d->bd_flags & BPF_EXTENDED_HDR) {
                struct mbuf *m;
 
                m = (pkt->bpfp_type == BPF_PACKET_TYPE_MBUF)
                        ? pkt->bpfp_mbuf : NULL;
-               ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen);
-               memset(ehp, 0, sizeof(*ehp));
-               ehp->bh_tstamp.tv_sec = tv.tv_sec;
-               ehp->bh_tstamp.tv_usec = tv.tv_usec;
+               ehp = (struct bpf_hdr_ext *)(void *)(d->bd_sbuf + curlen);
+               memset(ehp, 0, sizeof(*ehp));
+               ehp->bh_tstamp.tv_sec = tv.tv_sec;
+               ehp->bh_tstamp.tv_usec = tv.tv_usec;
 
                ehp->bh_datalen = pkt->bpfp_total_length;
-               ehp->bh_hdrlen = hdrlen;
+               ehp->bh_hdrlen = hdrlen;
                caplen = ehp->bh_caplen = totlen - hdrlen;
                if (m == NULL) {
                        if (outbound) {
@@ -2688,16 +3301,16 @@ catchpacket(struct bpf_d *d, struct bpf_packet * pkt,
                        }
                } else
                        ehp->bh_flags |= BPF_HDR_EXT_FLAGS_DIR_IN;
-               payload = (u_char *)ehp + hdrlen;
-       } else {
-               hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen);
-               hp->bh_tstamp.tv_sec = tv.tv_sec;
-               hp->bh_tstamp.tv_usec = tv.tv_usec;
+               payload = (u_char *)ehp + hdrlen;
+       } else {
+               hp = (struct bpf_hdr *)(void *)(d->bd_sbuf + curlen);
+               hp->bh_tstamp.tv_sec = tv.tv_sec;
+               hp->bh_tstamp.tv_usec = tv.tv_usec;
                hp->bh_datalen = pkt->bpfp_total_length;
-               hp->bh_hdrlen = hdrlen;
+               hp->bh_hdrlen = hdrlen;
                caplen = hp->bh_caplen = totlen - hdrlen;
-               payload = (u_char *)hp + hdrlen;
-       }
+               payload = (u_char *)hp + hdrlen;
+       }
        /*
         * Copy the packet data into the store buffer and update its length.
         */
@@ -2757,12 +3370,12 @@ bpf_freed(struct bpf_d *d)
         * been detached from its interface and it yet hasn't been marked
         * free.
         */
-       if (d->bd_hbuf_read)
+       if (d->bd_hbuf_read != 0)
                panic("bpf buffer freed during read");
 
        if (d->bd_sbuf != 0) {
                FREE(d->bd_sbuf, M_DEVBUF);
-               if (d->bd_hbuf != 0) 
+               if (d->bd_hbuf != 0)
                        FREE(d->bd_hbuf, M_DEVBUF);
                if (d->bd_fbuf != 0)
                        FREE(d->bd_fbuf, M_DEVBUF);
@@ -2772,7 +3385,7 @@ bpf_freed(struct bpf_d *d)
 }
 
 /*
- * Attach an interface to bpf.  driverp is a pointer to a (struct bpf_if *)
+ * Attach an interface to bpf. driverp is a pointer to a (struct bpf_if *)
  * in the driver's softc; dlt is the link layer type; hdrlen is the fixed
  * size of the link header (variable length headers not yet supported).
  */
@@ -2833,20 +3446,19 @@ bpf_attach(
                printf("bpfattach - %s with dlt %d is already attached\n",
                        if_name(ifp), dlt);
                FREE(bp_new, M_DEVBUF);
-               return EEXIST;
+               return (EEXIST);
        }
-       
+
        bp_new->bif_ifp = ifp;
        bp_new->bif_dlt = dlt;
        bp_new->bif_send = send;
        bp_new->bif_tap = tap;
-       
+
        if (bp_first == NULL) {
                /* No other entries for this ifp */
                bp_new->bif_next = bpf_iflist;
                bpf_iflist = bp_new;
-       }
-       else {
+       } else {
                if (ifnet_type(ifp) == IFT_ETHER && dlt == DLT_EN10MB) {
                        /* Make this the first entry for this interface */
                        if (bp_before_first != NULL) {
@@ -2863,7 +3475,7 @@ bpf_attach(
                        bp_last->bif_next = bp_new;
                }
        }
-       
+
        /*
         * Compute the length of the bpf header.  This is not necessarily
         * equal to SIZEOF_BPF_HDR because we want to insert spacing such
@@ -2873,7 +3485,7 @@ bpf_attach(
        bp_new->bif_hdrlen = BPF_WORDALIGN(hdrlen + SIZEOF_BPF_HDR) - hdrlen;
        bp_new->bif_exthdrlen = BPF_WORDALIGN(hdrlen +
            sizeof(struct bpf_hdr_ext)) - hdrlen;
-       
+
        /* Take a reference on the interface */
        ifnet_reference(ifp);
 
@@ -2884,7 +3496,7 @@ bpf_attach(
                printf("bpf: %s attached\n", if_name(ifp));
 #endif
 
-       return 0;
+       return (0);
 }
 
 /*
@@ -2945,7 +3557,7 @@ void
 bpf_init(__unused void *unused)
 {
 #ifdef __APPLE__
-       int     i;
+       int     i;
        int     maj;
 
        if (bpf_devsw_installed == 0) {
@@ -2962,17 +3574,17 @@ bpf_init(__unused void *unused)
                                lck_grp_free(bpf_mlock_grp);
                        if (bpf_mlock_grp_attr)
                                lck_grp_attr_free(bpf_mlock_grp_attr);
-                       
+
                        bpf_mlock = NULL;
                        bpf_mlock_attr = NULL;
                        bpf_mlock_grp = NULL;
                        bpf_mlock_grp_attr = NULL;
                        bpf_devsw_installed = 0;
-                       printf("bpf_init: failed to allocate a major number!\n");
+                       printf("bpf_init: failed to allocate a major number\n");
                        return;
                }
 
-               for (i = 0 ; i < NBPFILTER; i++)
+               for (i = 0; i < NBPFILTER; i++)
                        bpf_make_dev_t(maj);
        }
 #else
@@ -2981,7 +3593,7 @@ bpf_init(__unused void *unused)
 }
 
 #ifndef __APPLE__
-SYSINIT(bpfdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,bpf_drvinit,NULL)
+SYSINIT(bpfdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR, bpf_drvinit, NULL)
 #endif
 
 #if CONFIG_MACF_NET
index ff4eb1bfff8b12defc4bfa6f4564615d59cc7c68..0457a93abd679efa2a11bee6fe5d2d9bee7e5da9 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/cdefs.h>
 #include <stdint.h>
 
+#ifdef PRIVATE
+#include <net/if_var.h>
+#include <uuid/uuid.h>
+
+struct bpf_setup_args {
+       uuid_t  bsa_uuid;
+       char    bsa_ifname[IFNAMSIZ];
+};
+#endif /* PRIVATE */
+
 #ifdef KERNEL
 #include <sys/kernel_types.h>
+
+#if !defined(__i386__) && !defined(__x86_64__)
+#define BPF_ALIGN 1
+#else /* defined(__i386__) || defined(__x86_64__) */
+#define BPF_ALIGN 0
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+#if !BPF_ALIGN
+#define EXTRACT_SHORT(p)        ((u_int16_t)ntohs(*(u_int16_t *)(void *)p))
+#define EXTRACT_LONG(p)         (ntohl(*(u_int32_t *)(void *)p))
+#else
+#define EXTRACT_SHORT(p)\
+        ((u_int16_t)\
+                ((u_int16_t)*((u_char *)p+0)<<8|\
+                 (u_int16_t)*((u_char *)p+1)<<0))
+#define EXTRACT_LONG(p)\
+                ((u_int32_t)*((u_char *)p+0)<<24|\
+                 (u_int32_t)*((u_char *)p+1)<<16|\
+                 (u_int32_t)*((u_char *)p+2)<<8|\
+                 (u_int32_t)*((u_char *)p+3)<<0)
 #endif
 
+#endif /* KERNEL */
+
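
The byte-wise forms exist for the BPF_ALIGN case, where an unaligned 16- or 32-bit load could fault. A standalone check (ordinary userland C, not kernel code) that the shift-and-or macros agree with ntohs/ntohl applied to an aligned copy:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Same shape as the BPF_ALIGN == 1 macros above */
#define EXTRACT_SHORT(p) \
        ((uint16_t)((uint16_t)*((unsigned char *)(p)+0)<<8 | \
                    (uint16_t)*((unsigned char *)(p)+1)<<0))
#define EXTRACT_LONG(p) \
        ((uint32_t)*((unsigned char *)(p)+0)<<24 | \
         (uint32_t)*((unsigned char *)(p)+1)<<16 | \
         (uint32_t)*((unsigned char *)(p)+2)<<8  | \
         (uint32_t)*((unsigned char *)(p)+3)<<0)

int main(void)
{
        unsigned char buf[6] = { 0xde, 0xad, 0xbe, 0xef, 0x12, 0x34 };
        uint16_t s;
        uint32_t l;

        /* The byte-wise macros never perform an unaligned load... */
        printf("short %#x long %#x\n", EXTRACT_SHORT(buf + 1), EXTRACT_LONG(buf + 1));

        /* ...but yield the same big-endian value as ntohs/ntohl on an aligned copy. */
        memcpy(&s, buf + 1, sizeof(s));
        memcpy(&l, buf + 1, sizeof(l));
        printf("short %#x long %#x\n", ntohs(s), ntohl(l));
        return (0);
}
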
 /* BSD style release date */
 #define        BPF_RELEASE 199606
 
@@ -113,7 +145,8 @@ struct bpf_program {
 };
 
 #ifdef KERNEL_PRIVATE
-/* LP64 version of bpf_program.  all pointers 
+/*
+ * LP64 version of bpf_program.  all pointers
  * grow when we're dealing with a 64-bit process.
  * WARNING - keep in sync with bpf_program
  */
@@ -211,6 +244,11 @@ struct bpf_version {
 #define        BIOCSWANTPKTAP  _IOWR('B', 127, u_int)
 #define BIOCSHEADDROP   _IOW('B', 128, int)
 #define BIOCGHEADDROP   _IOR('B', 128, int)
+#define BIOCSTRUNCATE  _IOW('B', 129, u_int)
+#define        BIOCGETUUID     _IOR('B', 130, uuid_t)
+#define        BIOCSETUP       _IOW('B', 131, struct bpf_setup_args)
+#define        BIOCSPKTHDRV2   _IOW('B', 132, int)
+#define        BIOCGPKTHDRV2   _IOW('B', 133, int)
 #endif /* PRIVATE */
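
Illustrative only: a userland sketch of exercising the new BIOCSTRUNCATE ioctl, assuming access to the PRIVATE definitions above (the macro is re-declared locally with the same encoding, and /dev/bpf0 is just an example device node):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Private ioctl from <net/bpf.h>; re-declared here with the same encoding. */
#ifndef BIOCSTRUNCATE
#define BIOCSTRUNCATE _IOW('B', 129, u_int)
#endif

int main(void)
{
        /* Requires root; /dev/bpf0 may be busy, real code would scan /dev/bpf[0-9]*. */
        int fd = open("/dev/bpf0", O_RDWR);
        if (fd < 0) {
                perror("open");
                return (1);
        }

        u_int on = 1;   /* ask for truncated packet payloads on this descriptor */
        if (ioctl(fd, BIOCSTRUNCATE, &on) < 0)
                perror("BIOCSTRUNCATE");   /* fails on kernels without this private ioctl */

        close(fd);
        return (0);
}
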
 /*
  * Structure prepended to each packet.
@@ -268,6 +306,7 @@ struct bpf_mtag {
 #define        BPF_MTAG_DIR_IN         0
 #define        BPF_MTAG_DIR_OUT        1
 };
+
 #endif /* PRIVATE */
 
 /*
@@ -1299,6 +1338,13 @@ struct bpf_dltlist {
 #pragma pack()
 
 #ifdef KERNEL_PRIVATE
+#define BPF_MIN_PKT_SIZE 40
+#define PORT_DNS 53
+#define PORT_BOOTPS 67
+#define PORT_BOOTPC 68
+#define PORT_ISAKMP 500
+#define PORT_ISAKMP_NATT 4500  /* rfc3948 */
+
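
BPF_MIN_PKT_SIZE is the truncation floor (40 bytes covers an IPv4 plus TCP header with no options), and the PORT_* constants name traffic that the truncation code added to bpf.c appears to treat specially. A purely hypothetical port check of that shape (is_special_port is made up, not a kernel function):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PORT_DNS         53
#define PORT_BOOTPS      67
#define PORT_BOOTPC      68
#define PORT_ISAKMP      500
#define PORT_ISAKMP_NATT 4500

/* Hypothetical helper: is either endpoint one of the special-cased ports? */
static bool is_special_port(uint16_t lport, uint16_t fport)
{
        const uint16_t ports[] = { PORT_DNS, PORT_BOOTPS, PORT_BOOTPC,
                                   PORT_ISAKMP, PORT_ISAKMP_NATT };
        for (size_t i = 0; i < sizeof(ports) / sizeof(ports[0]); i++)
                if (lport == ports[i] || fport == ports[i])
                        return (true);
        return (false);
}

int main(void)
{
        printf("%d %d\n", is_special_port(53, 49152), is_special_port(443, 49152));
        return (0);
}
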
 /* Forward declarations */
 struct ifnet;
 struct mbuf;
index 80e31cd0648dd4eea2aa4e2fdac0513e042830ef..7fbafb3c0a98730be0e9c244dbd90003adb8305c 100644 (file)
 #include <netinet/in.h>
 #endif
 
-#if !defined(__i386__) && !defined(__x86_64__)
-#define BPF_ALIGN 1
-#else /* defined(__i386__) || defined(__x86_64__) */
-#define BPF_ALIGN 0
-#endif /* defined(__i386__) || defined(__x86_64__) */
-
-#if !BPF_ALIGN
-#define EXTRACT_SHORT(p)       ((u_int16_t)ntohs(*(u_int16_t *)(void *)p))
-#define EXTRACT_LONG(p)                (ntohl(*(u_int32_t *)(void *)p))
-#else
-#define EXTRACT_SHORT(p)\
-       ((u_int16_t)\
-               ((u_int16_t)*((u_char *)p+0)<<8|\
-                (u_int16_t)*((u_char *)p+1)<<0))
-#define EXTRACT_LONG(p)\
-               ((u_int32_t)*((u_char *)p+0)<<24|\
-                (u_int32_t)*((u_char *)p+1)<<16|\
-                (u_int32_t)*((u_char *)p+2)<<8|\
-                (u_int32_t)*((u_char *)p+3)<<0)
-#endif
-
 #ifdef KERNEL
 #include <sys/mbuf.h>
 #endif
index dcb9ac0afdbfb2420386e70309dd89a7dc717ff7..8e18cb9379e90cabbc3ce32c1f145166825a715b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
  * The items in this header file should be wrapped in #ifdef KERNEL.
  */
 
+#include <sys/proc.h>
 #include <sys/select.h>
 #include <kern/thread_call.h>
+#include <uuid/uuid.h>
 
 /*
  * Descriptor associated with each open bpf file.
@@ -145,6 +147,9 @@ struct bpf_d {
        void            *bd_unref_lr[BPF_REF_HIST];
        int             bd_next_ref_lr;
        int             bd_next_unref_lr;
+
+       struct proc     *bd_opened_by;
+       uuid_t          bd_uuid;
 };
 
 /* Values for bd_state */
@@ -159,13 +164,15 @@ struct bpf_d {
                          (bd)->bd_slen != 0))
 
 /* Values for bd_flags */
-#define        BPF_EXTENDED_HDR        0x01    /* process req. the extended header */
-#define        BPF_WANT_PKTAP          0x02    /* knows how to handle DLT_PKTAP */
-#define        BPF_FINALIZE_PKTAP      0x04    /* finalize pktap header on read */
-#define        BPF_KNOTE               0x08    /* kernel note attached */
-#define        BPF_DETACHING           0x10    /* bpf_d is being detached */
-#define        BPF_DETACHED            0x20    /* bpf_d is detached */
-#define        BPF_CLOSING             0x40    /* bpf_d is being closed */
+#define        BPF_EXTENDED_HDR        0x0001  /* process req. the extended header */
+#define        BPF_WANT_PKTAP          0x0002  /* knows how to handle DLT_PKTAP */
+#define        BPF_FINALIZE_PKTAP      0x0004  /* finalize pktap header on read */
+#define        BPF_KNOTE               0x0008  /* kernel note attached */
+#define        BPF_DETACHING           0x0010  /* bpf_d is being detached */
+#define        BPF_DETACHED            0x0020  /* bpf_d is detached */
+#define        BPF_CLOSING             0x0040  /* bpf_d is being closed */
+#define        BPF_TRUNCATE            0x0080  /* truncate the packet payload */
+#define        BPF_PKTHDRV2            0x0100  /* pktap header version 2 */
 
 /*
  * Descriptor associated with each attached hardware interface.
index 6db259bb7845509109fbcaa119298c739e6456a9..7f9411802ac32d471a6e930c5404327457613762 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
index 4a685a80a9105a4b60a936d61b1baff1e5c855f4..ae7ff13ab4deef332194ab42446635c3dcea8e4a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/param.h>
+#include <sys/mbuf.h>
 
 #include <kern/locks.h>
 #include <kern/zalloc.h>
 #include <netinet/in_pcb.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
 
 #include <string.h>
 #include <libkern/libkern.h>
-
+#include <kern/sched_prim.h>
 
 #define        MAX_CONTENT_FILTER 2
 
@@ -344,6 +349,7 @@ struct content_filter {
 struct content_filter **content_filters = NULL;
 uint32_t cfil_active_count = 0;        /* Number of active content filters */
 uint32_t cfil_sock_attached_count = 0;        /* Number of socket attachments */
+uint32_t cfil_sock_udp_attached_count = 0;    /* Number of UDP socket attachments */
 uint32_t cfil_close_wait_timeout = 1000; /* in milliseconds */
 
 static kern_ctl_ref cfil_kctlref = NULL;
@@ -430,6 +436,8 @@ struct cfil_entry {
                        (cfil)->cfi_op_list_ctr ++;                                                                             \
                }
 
+struct cfil_hash_entry;
+
 /*
  * struct cfil_info
  *
@@ -454,7 +462,9 @@ struct cfil_info {
                 */
                uint64_t                cfi_pending_first;
                uint64_t                cfi_pending_last;
-               int                     cfi_pending_mbcnt;
+               uint32_t                cfi_pending_mbcnt;
+               uint32_t                cfi_pending_mbnum;
+               uint32_t                cfi_tail_drop_cnt;
                /*
                 * cfi_pass_offset is the minimum of all the filters
                 */
@@ -468,6 +478,7 @@ struct cfil_info {
        } cfi_snd, cfi_rcv;
 
        struct cfil_entry       cfi_entries[MAX_CONTENT_FILTER];
+       struct cfil_hash_entry *cfi_hash_entry;
 } __attribute__((aligned(8)));
 
 #define        CFIF_DROP               0x0001  /* drop action applied */
@@ -488,6 +499,98 @@ TAILQ_HEAD(cfil_sock_head, cfil_info) cfil_sock_head;
 #define        CFIL_QUEUE_VERIFY(x) if (cfil_debug) cfil_queue_verify(x)
 #define        CFIL_INFO_VERIFY(x) if (cfil_debug) cfil_info_verify(x)
 
+/*
+ * UDP Socket Support
+ */
+LIST_HEAD(cfilhashhead, cfil_hash_entry);
+#define CFILHASHSIZE 16
+#define CFIL_HASH(laddr, faddr, lport, fport) ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))
+#define IS_UDP(so) (so && so->so_proto->pr_type == SOCK_DGRAM && so->so_proto->pr_protocol == IPPROTO_UDP)
+#define UNCONNECTED(inp) (inp && (((inp->inp_vflag & INP_IPV4) && (inp->inp_faddr.s_addr == INADDR_ANY)) || \
+                                                                 ((inp->inp_vflag & INP_IPV6) && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))))
+#define IS_ENTRY_ATTACHED(cfil_info, kcunit) (cfil_info != NULL && (kcunit <= MAX_CONTENT_FILTER) && \
+                                                                                         cfil_info->cfi_entries[kcunit - 1].cfe_filter != NULL)
+#define IS_DNS(local, remote) (check_port(local, 53) || check_port(remote, 53) || check_port(local, 5353) || check_port(remote, 5353))
+
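
CFIL_HASH folds the flow 4-tuple into a single word; with CFILHASHSIZE buckets, only the low bits survive the mask (presumably cfdb_hashmask, i.e. CFILHASHSIZE - 1, as produced by hashinit). A small example for an arbitrary IPv4 flow:

#include <stdint.h>
#include <stdio.h>

#define CFILHASHSIZE 16
#define CFIL_HASH(laddr, faddr, lport, fport) \
        ((faddr) ^ ((laddr) >> 16) ^ (fport) ^ (lport))

int main(void)
{
        uint32_t laddr = 0xc0a80102;    /* 192.168.1.2 */
        uint32_t faddr = 0x08080808;    /* 8.8.8.8 */
        uint16_t lport = 54321, fport = 53;

        uint32_t h = CFIL_HASH(laddr, faddr, lport, fport);
        /* Bucket index, assuming a power-of-two mask of CFILHASHSIZE - 1 */
        printf("hash %#x bucket %u\n", h, h & (CFILHASHSIZE - 1));
        return (0);
}
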
+/*
+ * UDP Garbage Collection:
+ */
+static struct thread *cfil_udp_gc_thread;
+#define UDP_FLOW_GC_IDLE_TO          30  // Flow Idle Timeout in seconds
+#define UDP_FLOW_GC_ACTION_TO        10  // Flow Action Timeout (no action from user space) in seconds
+#define UDP_FLOW_GC_MAX_COUNT        100 // Max UDP flows to be handled per run
+#define UDP_FLOW_GC_RUN_INTERVAL_NSEC  (10 * NSEC_PER_SEC)  // GC wakes up every 10 seconds
+
+/*
+ * UDP flow queue thresholds
+ */
+#define UDP_FLOW_GC_MBUF_CNT_MAX  (2 << MBSHIFT) // Max mbuf byte count in flow queue (2MB)
+#define UDP_FLOW_GC_MBUF_NUM_MAX  (UDP_FLOW_GC_MBUF_CNT_MAX >> MCLSHIFT) // Max mbuf count in flow queue (1K)
+#define UDP_FLOW_GC_MBUF_SHIFT    5             // Shift to get 1/32 of platform limits
+/*
+ * UDP flow queue threshold globals:
+ */
+static unsigned int cfil_udp_gc_mbuf_num_max = UDP_FLOW_GC_MBUF_NUM_MAX;
+static unsigned int cfil_udp_gc_mbuf_cnt_max = UDP_FLOW_GC_MBUF_CNT_MAX;
+
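
The defaults reduce to simple shifts. Assuming the usual BSD-derived values MBSHIFT = 20 (1 MB) and MCLSHIFT = 11 (2 KB clusters), the "2MB" and "1K" annotations above check out:

#include <stdio.h>

/* Assumed platform constants: 1 MB shift and 2 KB mbuf clusters. */
#define MBSHIFT  20
#define MCLSHIFT 11

#define UDP_FLOW_GC_MBUF_CNT_MAX (2 << MBSHIFT)                         /* 2 MB */
#define UDP_FLOW_GC_MBUF_NUM_MAX (UDP_FLOW_GC_MBUF_CNT_MAX >> MCLSHIFT) /* 1024 */

int main(void)
{
        printf("byte cap %d  mbuf cap %d\n",
            UDP_FLOW_GC_MBUF_CNT_MAX, UDP_FLOW_GC_MBUF_NUM_MAX);
        /* prints: byte cap 2097152  mbuf cap 1024 */
        return (0);
}
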
+/*
+ * struct cfil_hash_entry
+ *
+ * Hash entry for cfil_info
+ */
+struct cfil_hash_entry {
+    LIST_ENTRY(cfil_hash_entry)    cfentry_link;
+    struct cfil_info               *cfentry_cfil;
+    u_short cfentry_fport;
+    u_short cfentry_lport;
+    sa_family_t                    cfentry_family;
+    u_int32_t                      cfentry_flowhash;
+    u_int32_t                      cfentry_lastused;
+    union {
+        /* foreign host table entry */
+        struct in_addr_4in6 addr46;
+        struct in6_addr addr6;
+    } cfentry_faddr;
+    union {
+        /* local host table entry */
+        struct in_addr_4in6 addr46;
+        struct in6_addr addr6;
+    } cfentry_laddr;
+};
+
+/*
+ * struct cfil_db
+ *
+ * For each UDP socket, this is a hash table maintaining all cfil_info structs
+ * keyed by the flow 4-tuples <lport,fport,laddr,faddr>.
+ */
+struct cfil_db {
+    struct socket       *cfdb_so;
+    uint32_t            cfdb_count;           /* Number of total content filters */
+    struct cfilhashhead *cfdb_hashbase;
+    u_long              cfdb_hashmask;
+       struct cfil_hash_entry *cfdb_only_entry;  /* Optimization for connected UDP */
+};
+
+/*
+ * CFIL specific mbuf tag:
+ * Save state of socket at the point of data entry into cfil.
+ * Use saved state for reinjection at protocol layer.
+ */
+struct cfil_tag {
+       union sockaddr_in_4_6 cfil_faddr;
+       uint32_t cfil_so_state_change_cnt;
+       short cfil_so_options;
+};
+
+#define    CFIL_HASH_ENTRY_ZONE_NAME    "cfil_entry_hash"
+#define    CFIL_HASH_ENTRY_ZONE_MAX     1024
+static struct zone *cfil_hash_entry_zone = NULL;
+
+#define    CFIL_DB_ZONE_NAME       "cfil_db"
+#define    CFIL_DB_ZONE_MAX        1024
+static struct zone *cfil_db_zone = NULL;
+
 /*
  * Statistics
  */
@@ -500,6 +603,15 @@ struct cfil_stats cfil_stats;
 int cfil_log_level = LOG_ERR;
 int cfil_debug = 1;
 
+// Debug controls added for selective debugging.
+// Disabled for production.  If enabled,
+// these will have a performance impact.
+#define LIFECYCLE_DEBUG 0
+#define VERDICT_DEBUG 0
+#define DATA_DEBUG 0
+#define SHOW_DEBUG 0
+#define GC_DEBUG 0
+
 /*
  * Sysctls for logs and statistics
  */
@@ -541,31 +653,32 @@ SYSCTL_STRUCT(_net_cfil, OID_AUTO, stats, CTLFLAG_RD|CTLFLAG_LOCKED,
 /*
  * Forward declaration to appease the compiler
  */
-static int cfil_action_data_pass(struct socket *, uint32_t, int,
+static int cfil_action_data_pass(struct socket *, struct cfil_info *, uint32_t, int,
        uint64_t, uint64_t);
-static int cfil_action_drop(struct socket *, uint32_t);
+static int cfil_action_drop(struct socket *, struct cfil_info *, uint32_t);
 static int cfil_action_bless_client(uint32_t, struct cfil_msg_hdr *);
-static int cfil_dispatch_closed_event(struct socket *, int);
-static int cfil_data_common(struct socket *, int, struct sockaddr *,
+static int cfil_dispatch_closed_event(struct socket *, struct cfil_info *, int);
+static int cfil_data_common(struct socket *, struct cfil_info *, int, struct sockaddr *,
        struct mbuf *, struct mbuf *, uint32_t);
-static int cfil_data_filter(struct socket *, uint32_t, int,
+static int cfil_data_filter(struct socket *, struct cfil_info *, uint32_t, int,
        struct mbuf *, uint64_t);
 static void fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *,
        struct in_addr, u_int16_t);
 static void fill_ip6_sockaddr_4_6(union sockaddr_in_4_6 *,
        struct in6_addr *, u_int16_t);
-static int cfil_dispatch_attach_event(struct socket *, uint32_t);
-static void cfil_info_free(struct socket *, struct cfil_info *);
-static struct cfil_info * cfil_info_alloc(struct socket *);
-static int cfil_info_attach_unit(struct socket *, uint32_t);
-static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t);
-static struct socket *cfil_socket_from_client_uuid(uuid_t, bool *);
-static int cfil_service_pending_queue(struct socket *, uint32_t, int);
-static int cfil_data_service_ctl_q(struct socket *, uint32_t, int);
+;
+static int cfil_dispatch_attach_event(struct socket *, struct cfil_info *, uint32_t);
+static void cfil_info_free(struct cfil_info *);
+static struct cfil_info * cfil_info_alloc(struct socket *, struct cfil_hash_entry *);
+static int cfil_info_attach_unit(struct socket *, uint32_t, struct cfil_info *);
+static struct socket * cfil_socket_from_sock_id(cfil_sock_id_t, bool);
+static struct socket * cfil_socket_from_client_uuid(uuid_t, bool *);
+static int cfil_service_pending_queue(struct socket *, struct cfil_info *, uint32_t, int);
+static int cfil_data_service_ctl_q(struct socket *, struct cfil_info *, uint32_t, int);
 static void cfil_info_verify(struct cfil_info *);
-static int cfil_update_data_offsets(struct socket *, uint32_t, int,
+static int cfil_update_data_offsets(struct socket *, struct cfil_info *, uint32_t, int,
        uint64_t, uint64_t);
-static int cfil_acquire_sockbuf(struct socket *, int);
+static int cfil_acquire_sockbuf(struct socket *, struct cfil_info *, int);
 static void cfil_release_sockbuf(struct socket *, int);
 static int cfil_filters_attached(struct socket *);
 
@@ -576,7 +689,41 @@ static void cfil_rw_unlock_shared(lck_rw_t *);
 static boolean_t cfil_rw_lock_shared_to_exclusive(lck_rw_t *);
 static void cfil_rw_lock_exclusive_to_shared(lck_rw_t *);
 
-static unsigned int cfil_data_length(struct mbuf *, int *);
+static unsigned int cfil_data_length(struct mbuf *, int *, int *);
+static errno_t cfil_db_init(struct socket *);
+static void cfil_db_free(struct socket *so);
+struct cfil_hash_entry *cfil_db_lookup_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+struct cfil_hash_entry *cfil_db_lookup_entry_with_sockid(struct cfil_db *, u_int64_t);
+struct cfil_hash_entry *cfil_db_add_entry(struct cfil_db *, struct sockaddr *, struct sockaddr *);
+void cfil_db_delete_entry(struct cfil_db *, struct cfil_hash_entry *);
+struct cfil_hash_entry *cfil_sock_udp_get_flow(struct socket *, uint32_t, bool, struct sockaddr *, struct sockaddr *);
+struct cfil_info *cfil_db_get_cfil_info(struct cfil_db *, cfil_sock_id_t);
+static errno_t cfil_sock_udp_handle_data(bool, struct socket *, struct sockaddr *, struct sockaddr *,
+                                                                                struct mbuf *, struct mbuf *, uint32_t);
+static int32_t cfil_sock_udp_data_pending(struct sockbuf *, bool);
+static void cfil_sock_udp_is_closed(struct socket *);
+static int cfil_sock_udp_notify_shutdown(struct socket *, int , int, int);
+static int cfil_sock_udp_shutdown(struct socket *, int *);
+static void cfil_sock_udp_close_wait(struct socket *);
+static void cfil_sock_udp_buf_update(struct sockbuf *);
+static int cfil_filters_udp_attached(struct socket *, bool);
+static void cfil_get_flow_address_v6(struct cfil_hash_entry *, struct inpcb *,
+                                                                        struct in6_addr **, struct in6_addr **,
+                                                                        u_int16_t *, u_int16_t *);
+static void cfil_get_flow_address(struct cfil_hash_entry *, struct inpcb *,
+                                                                 struct in_addr *, struct in_addr *,
+                                                                 u_int16_t *, u_int16_t *);
+static void cfil_info_log(int, struct cfil_info *, const char *);
+void cfil_filter_show(u_int32_t);
+void cfil_info_show(void);
+bool cfil_info_idle_timed_out(struct cfil_info *, int, u_int32_t);
+bool cfil_info_action_timed_out(struct cfil_info *, int);
+bool cfil_info_buffer_threshold_exceeded(struct cfil_info *);
+struct m_tag *cfil_udp_save_socket_state(struct cfil_info *, struct mbuf *);
+static void cfil_udp_gc_thread_func(void *, wait_result_t);
+static void cfil_info_udp_expire(void *, wait_result_t);
+
+bool check_port(struct sockaddr *, u_short);
 
 /*
  * Content filter global read write lock
@@ -676,29 +823,65 @@ cfil_rw_lock_assert_held(lck_rw_t *lck, int exclusive)
 /*
  * Return the number of bytes in the mbuf chain using the same
  * method as m_length() or sballoc()
+ *
+ * Returns data length, starting from the packet (M_PKTHDR) start
+ * - retmbcnt - optional param to get total mbuf bytes in chain
+ * - retmbnum - optional param to get number of mbufs in chain
  */
 static unsigned int
-cfil_data_length(struct mbuf *m, int *retmbcnt)
+cfil_data_length(struct mbuf *m, int *retmbcnt, int *retmbnum)
 {
        struct mbuf *m0;
-       unsigned int pktlen;
+       unsigned int pktlen = 0;
        int mbcnt;
+       int mbnum;
+
+       // Locate the start of data
+       for (m0 = m; m0 != NULL; m0 = m0->m_next) {
+               if (m0->m_flags & M_PKTHDR)
+                       break;
+       }
+       if (m0 == NULL) {
+               CFIL_LOG(LOG_ERR, "cfil_data_length: no M_PKTHDR");
+               return (0);
+       }
+       m = m0;
 
-       if (retmbcnt == NULL)
+       if (retmbcnt == NULL && retmbnum == NULL)
                return (m_length(m));
 
        pktlen = 0;
        mbcnt = 0;
+       mbnum = 0;
        for (m0 = m; m0 != NULL; m0 = m0->m_next) {
                pktlen += m0->m_len;
+               mbnum++;
                mbcnt += MSIZE;
                if (m0->m_flags & M_EXT)
                        mbcnt += m0->m_ext.ext_size;
        }
-       *retmbcnt = mbcnt;
+       if (retmbcnt) {
+               *retmbcnt = mbcnt;
+       }
+       if (retmbnum) {
+               *retmbnum = mbnum;
+       }
        return (pktlen);
 }
 
+static struct mbuf *
+cfil_data_start(struct mbuf *m)
+{
+       struct mbuf *m0;
+
+       // Locate the start of data
+       for (m0 = m; m0 != NULL; m0 = m0->m_next) {
+               if (m0->m_flags & M_PKTHDR)
+                       break;
+       }
+       return m0;
+}
+
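
The reworked walk skips any leading address/control mbufs and only counts from the first packet-header mbuf onward, totalling data length, mbuf count and mbuf storage. A toy user-space model of that traversal over a hand-built chain (struct fake_mbuf and its constants are stand-ins, not the kernel mbuf):

#include <stdio.h>

#define FAKE_MSIZE   256        /* stand-in for MSIZE */
#define F_PKTHDR     0x1        /* stand-in for M_PKTHDR */
#define F_EXT        0x2        /* stand-in for M_EXT */

struct fake_mbuf {
        struct fake_mbuf *m_next;
        int               m_flags;
        unsigned int      m_len;
        unsigned int      m_ext_size;   /* only meaningful when F_EXT is set */
};

/* Same walk as cfil_data_length(): start counting at the first packet-header mbuf. */
static unsigned int
chain_length(struct fake_mbuf *m, int *retmbcnt, int *retmbnum)
{
        struct fake_mbuf *m0;
        unsigned int pktlen = 0;
        int mbcnt = 0, mbnum = 0;

        for (m0 = m; m0 != NULL; m0 = m0->m_next)
                if (m0->m_flags & F_PKTHDR)
                        break;
        if (m0 == NULL)
                return (0);             /* no packet header: nothing to count */

        for (; m0 != NULL; m0 = m0->m_next) {
                pktlen += m0->m_len;
                mbnum++;
                mbcnt += FAKE_MSIZE;
                if (m0->m_flags & F_EXT)
                        mbcnt += m0->m_ext_size;
        }
        if (retmbcnt)
                *retmbcnt = mbcnt;
        if (retmbnum)
                *retmbnum = mbnum;
        return (pktlen);
}

int main(void)
{
        /* One addr mbuf (skipped), then a 2-mbuf data chain with one cluster. */
        struct fake_mbuf data2 = { NULL,   F_EXT,    1200, 2048 };
        struct fake_mbuf data1 = { &data2, F_PKTHDR, 300,  0 };
        struct fake_mbuf addr  = { &data1, 0,        16,   0 };

        int mbcnt, mbnum;
        unsigned int len = chain_length(&addr, &mbcnt, &mbnum);
        printf("len %u mbcnt %d mbnum %d\n", len, mbcnt, mbnum);   /* 1500 2560 2 */
        return (0);
}
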
 /*
  * Common mbuf queue utilities
  */
@@ -754,6 +937,7 @@ cfil_queue_len(struct cfil_queue *cfq)
 static void
 cfil_queue_verify(struct cfil_queue *cfq)
 {
+       mbuf_t chain;
        mbuf_t m;
        mbuf_t n;
        uint64_t queuesize = 0;
@@ -769,11 +953,15 @@ cfil_queue_verify(struct cfil_queue *cfq)
                (!MBUFQ_EMPTY(&cfq->q_mq) &&
                cfq->q_start != cfq->q_end));
 
-       MBUFQ_FOREACH(m, &cfq->q_mq) {
+       MBUFQ_FOREACH(chain, &cfq->q_mq) {
                size_t chainsize = 0;
-               unsigned int mlen = m_length(m);
+               m = chain;
+               unsigned int mlen = cfil_data_length(m, NULL, NULL);
+               // skip the addr and control stuff if present
+               m = cfil_data_start(m);
 
-               if (m == (void *)M_TAG_FREE_PATTERN ||
+               if (m == NULL ||
+                       m == (void *)M_TAG_FREE_PATTERN ||
                        m->m_next == (void *)M_TAG_FREE_PATTERN ||
                        m->m_nextpkt == (void *)M_TAG_FREE_PATTERN)
                        panic("%s - mq %p is free at %p", __func__,
@@ -812,7 +1000,7 @@ cfil_queue_remove(struct cfil_queue *cfq, mbuf_t m, size_t len)
 {
        CFIL_QUEUE_VERIFY(cfq);
 
-       VERIFY(m_length(m) == len);
+       VERIFY(cfil_data_length(m, NULL, NULL) == len);
 
        MBUFQ_REMOVE(&cfq->q_mq, m);
        MBUFQ_NEXT(m) = NULL;
@@ -984,6 +1172,7 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
        errno_t error = 0;
        struct content_filter *cfc;
        struct cfil_entry *entry;
+       uint64_t sock_flow_id = 0;
 
        CFIL_LOG(LOG_NOTICE, "");
 
@@ -1028,6 +1217,7 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
                if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
                        struct cfil_info *cfil_info = entry->cfe_cfil_info;
                        struct socket *so = cfil_info->cfi_so;
+                       sock_flow_id = cfil_info->cfi_sock_id;
 
                        /* Need to let data flow immediately */
                        entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED |
@@ -1044,37 +1234,43 @@ cfil_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo)
                         * When cfe_filter is NULL the filter is detached
                         * and the entry has been removed from cf_sock_entries
                         */
-                       if (so->so_cfil == NULL || entry->cfe_filter == NULL) {
+                       if ((so->so_cfil == NULL && so->so_cfil_db == NULL) || entry->cfe_filter == NULL) {
                                cfil_rw_lock_exclusive(&cfil_lck_rw);
                                goto release;
                        }
-                       (void) cfil_action_data_pass(so, kcunit, 1,
+
+                       (void) cfil_action_data_pass(so, cfil_info, kcunit, 1,
                                        CFM_MAX_OFFSET,
                                        CFM_MAX_OFFSET);
 
-                       (void) cfil_action_data_pass(so, kcunit, 0,
+                       (void) cfil_action_data_pass(so, cfil_info, kcunit, 0,
                                        CFM_MAX_OFFSET,
                                        CFM_MAX_OFFSET);
 
                        cfil_rw_lock_exclusive(&cfil_lck_rw);
 
                        /*
-                        * Check again as the socket may have been unlocked
-                        * when when calling cfil_acquire_sockbuf()
+                        * Check again to make sure the cfil_info is still valid,
+                        * as the socket may have been unlocked when calling
+                        * cfil_acquire_sockbuf()
                         */
-                       if (so->so_cfil == NULL || entry->cfe_filter == NULL)
+                       if (entry->cfe_filter == NULL ||
+                               (so->so_cfil == NULL && cfil_db_get_cfil_info(so->so_cfil_db, sock_flow_id) == NULL)) {
                                goto release;
+                       }
 
                        /* The filter is now detached */
                        entry->cfe_flags |= CFEF_CFIL_DETACHED;
+#if LIFECYCLE_DEBUG
+                       cfil_info_log(LOG_DEBUG, cfil_info, "CFIL: LIFECYCLE: - FILTER DISCONNECTED");
+#endif
                        CFIL_LOG(LOG_NOTICE, "so %llx detached %u",
                                (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
-
-                       if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
+                       if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
                            cfil_filters_attached(so) == 0) {
                                CFIL_LOG(LOG_NOTICE, "so %llx waking",
                                        (uint64_t)VM_KERNEL_ADDRPERM(so));
-                               wakeup((caddr_t)&so->so_cfil);
+                               wakeup((caddr_t)cfil_info);
                        }
 
                        /*
@@ -1126,7 +1322,7 @@ done:
  * sblock(), sbunlock() or sodefunct()
  */
 static int
-cfil_acquire_sockbuf(struct socket *so, int outgoing)
+cfil_acquire_sockbuf(struct socket *so, struct cfil_info *cfil_info, int outgoing)
 {
        thread_t tp = current_thread();
        struct sockbuf *sb = outgoing ? &so->so_snd : &so->so_rcv;
@@ -1168,11 +1364,11 @@ cfil_acquire_sockbuf(struct socket *so, int outgoing)
        sb->sb_cfil_refs++;
 
        /* We acquire the socket buffer when we need to cleanup */
-       if (so->so_cfil == NULL) {
+       if (cfil_info == NULL) {
                CFIL_LOG(LOG_ERR, "so %llx cfil detached",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = 0;
-       } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
+       } else if (cfil_info->cfi_flags & CFIF_DROP) {
                CFIL_LOG(LOG_ERR, "so %llx drop set",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = EPIPE;
@@ -1221,15 +1417,36 @@ cfil_sock_id_from_socket(struct socket *so)
                return (CFIL_SOCK_ID_NONE);
 }
 
+static bool
+cfil_socket_safe_lock(struct inpcb *inp)
+{
+    if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
+        socket_lock(inp->inp_socket, 1);
+        if (in_pcb_checkstate(inp, WNT_RELEASE, 1) != WNT_STOPUSING) {
+            return true;
+        }
+        socket_unlock(inp->inp_socket, 1);
+    }
+    return false;
+}
+
 static struct socket *
-cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id)
+cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id, bool udp_only)
 {
        struct socket *so = NULL;
        u_int64_t gencnt = cfil_sock_id >> 32;
        u_int32_t flowhash = (u_int32_t)(cfil_sock_id & 0x0ffffffff);
        struct inpcb *inp = NULL;
-       struct inpcbinfo *pcbinfo = &tcbinfo;
+       struct inpcbinfo *pcbinfo = NULL;
+
+#if VERDICT_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: VERDICT: search for socket: id %llu gencnt %llx flowhash %x", cfil_sock_id, gencnt, flowhash);
+#endif
+
+       if (udp_only)
+               goto find_udp;
 
+       pcbinfo = &tcbinfo;
        lck_rw_lock_shared(pcbinfo->ipi_lock);
        LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
                if (inp->inp_state != INPCB_STATE_DEAD &&
@@ -1237,12 +1454,33 @@ cfil_socket_from_sock_id(cfil_sock_id_t cfil_sock_id)
                        inp->inp_flowhash == flowhash &&
                        (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt &&
                        inp->inp_socket->so_cfil != NULL) {
-                       so = inp->inp_socket;
+                       if (cfil_socket_safe_lock(inp))
+                               so = inp->inp_socket;
+                       break;
+               }
+       }
+       lck_rw_done(pcbinfo->ipi_lock);
+       if (so != NULL) {
+               goto done;
+       }
+
+find_udp:
+
+       pcbinfo = &udbinfo;
+       lck_rw_lock_shared(pcbinfo->ipi_lock);
+       LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+               if (inp->inp_state != INPCB_STATE_DEAD &&
+                       inp->inp_socket != NULL &&
+                       inp->inp_socket->so_cfil_db != NULL &&
+                       (inp->inp_socket->so_gencnt & 0x0ffffffff) == gencnt) {
+                       if (cfil_socket_safe_lock(inp))
+                               so = inp->inp_socket;
                        break;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
 
+done:
        if (so == NULL) {
                OSIncrementAtomic(&cfil_stats.cfs_sock_id_not_found);
                CFIL_LOG(LOG_DEBUG,
@@ -1266,12 +1504,31 @@ cfil_socket_from_client_uuid(uuid_t necp_client_uuid, bool *cfil_attached)
                        inp->inp_socket != NULL &&
                        uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
                        *cfil_attached = (inp->inp_socket->so_cfil != NULL);
-                       so = inp->inp_socket;
+                       if (cfil_socket_safe_lock(inp))
+                               so = inp->inp_socket;
+                       break;
+               }
+       }
+       lck_rw_done(pcbinfo->ipi_lock);
+       if (so != NULL) {
+               goto done;
+       }
+
+       pcbinfo = &udbinfo;
+       lck_rw_lock_shared(pcbinfo->ipi_lock);
+       LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
+               if (inp->inp_state != INPCB_STATE_DEAD &&
+                       inp->inp_socket != NULL &&
+                       uuid_compare(inp->necp_client_uuid, necp_client_uuid) == 0) {
+                       *cfil_attached = (inp->inp_socket->so_cfil_db != NULL);
+                       if (cfil_socket_safe_lock(inp))
+                               so = inp->inp_socket;
                        break;
                }
        }
        lck_rw_done(pcbinfo->ipi_lock);
 
+done:
        return (so);
 }
 
@@ -1286,6 +1543,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
        struct socket *so;
        struct cfil_msg_action *action_msg;
        struct cfil_entry *entry;
+       struct cfil_info *cfil_info = NULL;
 
        CFIL_LOG(LOG_INFO, "");
 
@@ -1359,31 +1617,32 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                cfil_rw_unlock_shared(&cfil_lck_rw);
                goto done;
        }
+       cfil_rw_unlock_shared(&cfil_lck_rw);
 
-       so = cfil_socket_from_sock_id(msghdr->cfm_sock_id);
+       // Search for socket (TCP+UDP and lock so)
+       so = cfil_socket_from_sock_id(msghdr->cfm_sock_id, false);
        if (so == NULL) {
                CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
                        msghdr->cfm_sock_id);
                error = EINVAL;
-               cfil_rw_unlock_shared(&cfil_lck_rw);
                goto done;
        }
-       cfil_rw_unlock_shared(&cfil_lck_rw);
 
-       socket_lock(so, 1);
+       cfil_info = so->so_cfil_db != NULL ?
+               cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil;
 
-       if (so->so_cfil == NULL) {
-               CFIL_LOG(LOG_NOTICE, "so %llx not attached",
-                       (uint64_t)VM_KERNEL_ADDRPERM(so));
+       if (cfil_info == NULL) {
+               CFIL_LOG(LOG_NOTICE, "so %llx <id %llu> not attached",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so), msghdr->cfm_sock_id);
                error = EINVAL;
                goto unlock;
-       } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
+       } else if (cfil_info->cfi_flags & CFIF_DROP) {
                CFIL_LOG(LOG_NOTICE, "so %llx drop set",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = EINVAL;
                goto unlock;
        }
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (entry->cfe_filter == NULL) {
                CFIL_LOG(LOG_NOTICE, "so %llx no filter",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
@@ -1402,15 +1661,22 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
        }
 
        microuptime(&entry->cfe_last_action);
-       CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_action, &so->so_cfil->cfi_first_event, msghdr->cfm_op);
+       CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_action, &cfil_info->cfi_first_event, msghdr->cfm_op);
 
        action_msg = (struct cfil_msg_action *)msghdr;
 
        switch (msghdr->cfm_op) {
                case CFM_OP_DATA_UPDATE:
+#if VERDICT_DEBUG
+                       CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: <so %llx sockID %llu> <IN peek:%llu pass:%llu, OUT peek:%llu pass:%llu>",
+                                        (uint64_t)VM_KERNEL_ADDRPERM(so),
+                                        cfil_info->cfi_sock_id,
+                                        action_msg->cfa_in_peek_offset, action_msg->cfa_in_pass_offset,
+                                        action_msg->cfa_out_peek_offset, action_msg->cfa_out_pass_offset);
+#endif
                        if (action_msg->cfa_out_peek_offset != 0 ||
                                action_msg->cfa_out_pass_offset != 0)
-                               error = cfil_action_data_pass(so, kcunit, 1,
+                               error = cfil_action_data_pass(so, cfil_info, kcunit, 1,
                                        action_msg->cfa_out_pass_offset,
                                        action_msg->cfa_out_peek_offset);
                        if (error == EJUSTRETURN)
@@ -1419,7 +1685,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                                break;
                        if (action_msg->cfa_in_peek_offset != 0 ||
                                action_msg->cfa_in_pass_offset != 0)
-                               error = cfil_action_data_pass(so, kcunit, 0,
+                               error = cfil_action_data_pass(so, cfil_info, kcunit, 0,
                                        action_msg->cfa_in_pass_offset,
                                        action_msg->cfa_in_peek_offset);
                        if (error == EJUSTRETURN)
@@ -1427,7 +1693,7 @@ cfil_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, mbuf_t m,
                        break;
 
                case CFM_OP_DROP:
-                       error = cfil_action_drop(so, kcunit);
+                       error = cfil_action_drop(so, cfil_info, kcunit);
                        break;
 
                default:
@@ -1452,6 +1718,7 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
                int opt, void *data, size_t *len)
 {
 #pragma unused(kctlref, opt)
+       struct cfil_info *cfil_info = NULL;
        errno_t error = 0;
        struct content_filter *cfc = (struct content_filter *)unitinfo;
 
@@ -1501,14 +1768,6 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
 
                        struct cfil_opt_sock_info *sock_info = 
                                                                                        (struct cfil_opt_sock_info *) data;
-                       struct socket *sock = 
-                                                       cfil_socket_from_sock_id(sock_info->cfs_sock_id);
-                       if (sock == NULL) {
-                               CFIL_LOG(LOG_NOTICE, "bad sock_id %llx",
-                                       sock_info->cfs_sock_id);
-                               error = ENOENT;
-                               goto done;
-                       }
 
                        // Unlock here so that we never hold both cfil_lck_rw and the
                        // socket_lock at the same time. Otherwise, this can deadlock 
@@ -1521,11 +1780,26 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
                        //     goto return_already_unlocked from this branch. 
                        cfil_rw_unlock_shared(&cfil_lck_rw);
 
-                       socket_lock(sock, 1);
+                       // Search (TCP+UDP) and lock socket
+                       struct socket *sock =
+                               cfil_socket_from_sock_id(sock_info->cfs_sock_id, false);
+                       if (sock == NULL) {
+#if LIFECYCLE_DEBUG
+                               CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: bad sock_id %llu",
+                                                sock_info->cfs_sock_id);
+#endif
+                               error = ENOENT;
+                               goto return_already_unlocked;
+                       }
+
+                       cfil_info = (sock->so_cfil_db != NULL) ?
+                               cfil_db_get_cfil_info(sock->so_cfil_db, sock_info->cfs_sock_id) : sock->so_cfil;
 
-                       if (sock->so_cfil == NULL) {
-                               CFIL_LOG(LOG_NOTICE, "so %llx not attached, cannot fetch info", 
+                       if (cfil_info == NULL) {
+#if LIFECYCLE_DEBUG
+                               CFIL_LOG(LOG_ERR, "CFIL: GET_SOCKET_INFO failed: so %llx not attached, cannot fetch info",
                                        (uint64_t)VM_KERNEL_ADDRPERM(sock));
+#endif
                                error = EINVAL;
                                socket_unlock(sock, 1);
                                goto return_already_unlocked;
@@ -1539,15 +1813,21 @@ cfil_ctl_getopt(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
                        // Source and destination addresses
                        struct inpcb *inp = sotoinpcb(sock);
                        if (inp->inp_vflag & INP_IPV6) {
-                               fill_ip6_sockaddr_4_6(&sock_info->cfs_local, 
-                                       &inp->in6p_laddr, inp->inp_lport);
-                               fill_ip6_sockaddr_4_6(&sock_info->cfs_remote,
-                                       &inp->in6p_faddr, inp->inp_fport);
+                               struct in6_addr *laddr = NULL, *faddr = NULL;
+                               u_int16_t lport = 0, fport = 0;
+
+                               cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
+                                                                                &laddr, &faddr, &lport, &fport);
+                               fill_ip6_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
+                               fill_ip6_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
                        } else if (inp->inp_vflag & INP_IPV4) {
-                               fill_ip_sockaddr_4_6(&sock_info->cfs_local,
-                                       inp->inp_laddr, inp->inp_lport);
-                               fill_ip_sockaddr_4_6(&sock_info->cfs_remote,
-                                       inp->inp_faddr, inp->inp_fport);
+                               struct in_addr laddr = {0}, faddr = {0};
+                               u_int16_t lport = 0, fport = 0;
+
+                               cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
+                                                                         &laddr, &faddr, &lport, &fport);
+                               fill_ip_sockaddr_4_6(&sock_info->cfs_local, laddr, lport);
+                               fill_ip_sockaddr_4_6(&sock_info->cfs_remote, faddr, fport);
                        }
 
                        // Set the pid info 
@@ -1644,6 +1924,7 @@ cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
        struct socket *so = NULL;
        int error;
        struct cfil_entry *entry;
+       struct cfil_info *cfil_info = NULL;
 
        CFIL_LOG(LOG_INFO, "");
 
@@ -1697,22 +1978,23 @@ cfil_ctl_rcvd(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo, int flags)
 
                OSIncrementAtomic(&cfil_stats.cfs_ctl_rcvd_flow_lift);
 
-               so = entry->cfe_cfil_info->cfi_so;
+               cfil_info = entry->cfe_cfil_info;
+               so = cfil_info->cfi_so;
 
                cfil_rw_unlock_shared(&cfil_lck_rw);
                socket_lock(so, 1);
 
                do {
-                       error = cfil_acquire_sockbuf(so, 1);
+                       error = cfil_acquire_sockbuf(so, cfil_info, 1);
                        if (error == 0)
-                               error = cfil_data_service_ctl_q(so, kcunit, 1);
+                               error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 1);
                        cfil_release_sockbuf(so, 1);
                        if (error != 0)
                                break;
 
-                       error = cfil_acquire_sockbuf(so, 0);
+                       error = cfil_acquire_sockbuf(so, cfil_info, 0);
                        if (error == 0)
-                               error = cfil_data_service_ctl_q(so, kcunit, 0);
+                               error = cfil_data_service_ctl_q(so, cfil_info, kcunit, 0);
                        cfil_release_sockbuf(so, 0);
                } while (0);
 
@@ -1731,7 +2013,10 @@ cfil_init(void)
        struct kern_ctl_reg kern_ctl;
        errno_t error = 0;
        vm_size_t content_filter_size = 0;      /* size of content_filter */
-       vm_size_t cfil_info_size = 0;   /* size of cfil_info */
+    vm_size_t cfil_info_size = 0;      /* size of cfil_info */
+    vm_size_t cfil_hash_entry_size = 0;    /* size of cfil_hash_entry */
+    vm_size_t cfil_db_size = 0;    /* size of cfil_db */
+    unsigned int mbuf_limit = 0;
 
        CFIL_LOG(LOG_NOTICE, "");
 
@@ -1800,6 +2085,33 @@ cfil_init(void)
        zone_change(cfil_info_zone, Z_CALLERACCT, FALSE);
        zone_change(cfil_info_zone, Z_EXPAND, TRUE);
 
+    /*
+     * Zone for content filters cfil hash entries and db
+     */
+    cfil_hash_entry_size = sizeof(struct cfil_hash_entry);
+    cfil_hash_entry_zone = zinit(cfil_hash_entry_size,
+                                CFIL_HASH_ENTRY_ZONE_MAX * cfil_hash_entry_size,
+                                0,
+                                CFIL_HASH_ENTRY_ZONE_NAME);
+    if (cfil_hash_entry_zone == NULL) {
+        panic("%s: zinit(%s) failed", __func__, CFIL_HASH_ENTRY_ZONE_NAME);
+        /* NOTREACHED */
+    }
+    zone_change(cfil_hash_entry_zone, Z_CALLERACCT, FALSE);
+    zone_change(cfil_hash_entry_zone, Z_EXPAND, TRUE);
+    
+    cfil_db_size = sizeof(struct cfil_db);
+    cfil_db_zone = zinit(cfil_db_size,
+                         CFIL_DB_ZONE_MAX * cfil_db_size,
+                         0,
+                         CFIL_DB_ZONE_NAME);
+    if (cfil_db_zone == NULL) {
+        panic("%s: zinit(%s) failed", __func__, CFIL_DB_ZONE_NAME);
+        /* NOTREACHED */
+    }
+    zone_change(cfil_db_zone, Z_CALLERACCT, FALSE);
+    zone_change(cfil_db_zone, Z_EXPAND, TRUE);
+    
        /*
         * Allocate locks
         */
@@ -1843,10 +2155,24 @@ cfil_init(void)
                CFIL_LOG(LOG_ERR, "ctl_register failed: %d", error);
                return;
        }
+
+       // Spawn thread for garbage collection
+       if (kernel_thread_start(cfil_udp_gc_thread_func, NULL,
+                                                       &cfil_udp_gc_thread) != KERN_SUCCESS) {
+               panic_plain("%s: Can't create UDP GC thread", __func__);
+               /* NOTREACHED */
+       }
+       /* this must not fail */
+       VERIFY(cfil_udp_gc_thread != NULL);
+
+       // Set UDP per-flow mbuf thresholds to 1/32 of platform max
+       mbuf_limit = MAX(UDP_FLOW_GC_MBUF_CNT_MAX, (nmbclusters << MCLSHIFT) >> UDP_FLOW_GC_MBUF_SHIFT);
+       cfil_udp_gc_mbuf_num_max = (mbuf_limit >> MCLSHIFT);
+       cfil_udp_gc_mbuf_cnt_max = mbuf_limit;
 }
 
 struct cfil_info *
-cfil_info_alloc(struct socket *so)
+cfil_info_alloc(struct socket *so, struct cfil_hash_entry *hash_entry)
 {
        int kcunit;
        struct cfil_info *cfil_info = NULL;
@@ -1880,6 +2206,11 @@ cfil_info_alloc(struct socket *so)
                entry->cfe_rcv.cfe_pass_offset = 0;
                entry->cfe_rcv.cfe_peek_offset = 0;
                entry->cfe_rcv.cfe_peeked = 0;
+               /*
+                * Timestamp the last action to avoid prematurely
+                * triggering garbage collection
+                */
+               microuptime(&entry->cfe_last_action);
 
                cfil_queue_init(&entry->cfe_snd.cfe_pending_q);
                cfil_queue_init(&entry->cfe_rcv.cfe_pending_q);
@@ -1888,16 +2219,36 @@ cfil_info_alloc(struct socket *so)
        }
 
        cfil_rw_lock_exclusive(&cfil_lck_rw);
-
-       so->so_cfil = cfil_info;
-       cfil_info->cfi_so = so;
+    
        /*
         * Create a cfi_sock_id that's not the socket pointer!
         */
-       if (inp->inp_flowhash == 0)
-               inp->inp_flowhash = inp_calc_flowhash(inp);
-       cfil_info->cfi_sock_id =
-               ((so->so_gencnt << 32) | inp->inp_flowhash);
+    
+    if (hash_entry == NULL) {
+               // This is the TCP case, cfil_info is tracked per socket
+        if (inp->inp_flowhash == 0)
+            inp->inp_flowhash = inp_calc_flowhash(inp);
+        
+        so->so_cfil = cfil_info;
+        cfil_info->cfi_so = so;
+        cfil_info->cfi_sock_id =
+        ((so->so_gencnt << 32) | inp->inp_flowhash);
+    } else {
+        // This is the UDP case, cfil_info is tracked in a per-socket hash
+               cfil_info->cfi_so = so;
+        hash_entry->cfentry_cfil = cfil_info;
+               cfil_info->cfi_hash_entry = hash_entry;
+        cfil_info->cfi_sock_id = ((so->so_gencnt << 32) | (hash_entry->cfentry_flowhash & 0xffffffff));
+               CFIL_LOG(LOG_DEBUG, "CFIL: UDP inp_flowhash %x so_gencnt %llx entry flowhash %x sockID %llx",
+                 inp->inp_flowhash, so->so_gencnt, hash_entry->cfentry_flowhash, cfil_info->cfi_sock_id);
+
+               // Wake up gc thread if this is the first flow added
+               if (cfil_sock_udp_attached_count == 0) {
+                       thread_wakeup((caddr_t)&cfil_sock_udp_attached_count);
+               }
+
+               cfil_sock_udp_attached_count++;
+    }
 
        TAILQ_INSERT_TAIL(&cfil_sock_head, cfil_info, cfi_link);
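
The 64-bit cfi_sock_id built above is just the socket generation count in the top half and the flow hash in the bottom half, which is what cfil_socket_from_sock_id() splits apart again when a filter sends a verdict. A self-contained illustration with arbitrary values:

#include <stdint.h>
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
        uint64_t so_gencnt = 0x1a2b;         /* arbitrary so_gencnt */
        uint32_t flowhash  = 0xdeadbeef;     /* arbitrary inp_flowhash / entry flowhash */

        /* Packing, as in cfil_info_alloc() */
        uint64_t sock_id = (so_gencnt << 32) | flowhash;

        /* Unpacking, as in cfil_socket_from_sock_id() */
        uint64_t gencnt = sock_id >> 32;
        uint32_t hash   = (uint32_t)(sock_id & 0x0ffffffff);

        printf("sock_id %#" PRIx64 " -> gencnt %#" PRIx64 " hash %#x\n",
            sock_id, gencnt, hash);
        return (0);
}
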
 
@@ -1915,10 +2266,9 @@ done:
 }
 
 int
-cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit)
+cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit, struct cfil_info *cfil_info)
 {
        int kcunit;
-       struct cfil_info *cfil_info = so->so_cfil;
        int attached = 0;
 
        CFIL_LOG(LOG_INFO, "");
@@ -1956,19 +2306,12 @@ cfil_info_attach_unit(struct socket *so, uint32_t filter_control_unit)
 }
 
 static void
-cfil_info_free(struct socket *so, struct cfil_info *cfil_info)
+cfil_info_free(struct cfil_info *cfil_info)
 {
        int kcunit;
        uint64_t in_drain = 0;
        uint64_t out_drained = 0;
 
-       so->so_cfil = NULL;
-
-       if (so->so_flags & SOF_CONTENT_FILTER) {
-               so->so_flags &= ~SOF_CONTENT_FILTER;
-               VERIFY(so->so_usecount > 0);
-               so->so_usecount--;
-       }
        if (cfil_info == NULL)
                return;
 
@@ -1999,6 +2342,8 @@ cfil_info_free(struct socket *so, struct cfil_info *cfil_info)
 
                verify_content_filter(cfc);
        }
+       if (cfil_info->cfi_hash_entry != NULL)
+               cfil_sock_udp_attached_count--;
        cfil_sock_attached_count--;
        TAILQ_REMOVE(&cfil_sock_head, cfil_info, cfi_link);
 
@@ -2061,20 +2406,20 @@ cfil_sock_attach(struct socket *so)
                OSIncrementAtomic(&cfil_stats.cfs_sock_attach_already);
                CFIL_LOG(LOG_ERR, "already attached");
        } else {
-               cfil_info_alloc(so);
+               cfil_info_alloc(so, NULL);
                if (so->so_cfil == NULL) {
                        error = ENOMEM;
                        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
                        goto done;
                }
        }
-       if (cfil_info_attach_unit(so, filter_control_unit) == 0) {
+       if (cfil_info_attach_unit(so, filter_control_unit, so->so_cfil) == 0) {
                CFIL_LOG(LOG_ERR, "cfil_info_attach_unit(%u) failed",
                        filter_control_unit);
                OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
                goto done;
        }
-       CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockid %llx",
+       CFIL_LOG(LOG_INFO, "so %llx filter_control_unit %u sockID %llx",
                (uint64_t)VM_KERNEL_ADDRPERM(so),
                filter_control_unit, so->so_cfil->cfi_sock_id);
 
@@ -2084,7 +2429,7 @@ cfil_sock_attach(struct socket *so)
        /* Hold a reference on the socket */
        so->so_usecount++;
 
-       error = cfil_dispatch_attach_event(so, filter_control_unit);
+       error = cfil_dispatch_attach_event(so, so->so_cfil, filter_control_unit);
        /* We can recover from flow control or out of memory errors */
        if (error == ENOBUFS || error == ENOMEM)
                error = 0;
@@ -2103,15 +2448,26 @@ done:
 errno_t
 cfil_sock_detach(struct socket *so)
 {
+       if (IS_UDP(so)) {
+               cfil_db_free(so);
+               return (0);
+       }
+
        if (so->so_cfil) {
-               cfil_info_free(so, so->so_cfil);
+               if (so->so_flags & SOF_CONTENT_FILTER) {
+                       so->so_flags &= ~SOF_CONTENT_FILTER;
+                       VERIFY(so->so_usecount > 0);
+                       so->so_usecount--;
+               }
+               cfil_info_free(so->so_cfil);
+               so->so_cfil = NULL;
                OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
        }
        return (0);
 }
 
 static int
-cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
+cfil_dispatch_attach_event(struct socket *so, struct cfil_info *cfil_info, uint32_t filter_control_unit)
 {
        errno_t error = 0;
        struct cfil_entry *entry = NULL;
@@ -2137,7 +2493,7 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
                        continue;
                if (cfc->cf_necp_control_unit != filter_control_unit)
                        continue;
-               entry = &so->so_cfil->cfi_entries[kcunit - 1];
+               entry = &cfil_info->cfi_entries[kcunit - 1];
                if (entry->cfe_filter == NULL)
                        continue;
 
@@ -2180,6 +2536,12 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
                msg_attached.cfs_e_pid = so->last_pid;
                memcpy(msg_attached.cfs_e_uuid, so->last_uuid, sizeof(uuid_t));
        }
+
+#if LIFECYCLE_DEBUG
+       CFIL_LOG(LOG_DEBUG, "CFIL: LIFECYCLE: SENDING ATTACH UP <sockID %llu> ",
+                        entry->cfe_cfil_info->cfi_sock_id);
+#endif
+
        error = ctl_enqueuedata(entry->cfe_filter->cf_kcref,
                                entry->cfe_filter->cf_kcunit,
                                &msg_attached,
@@ -2190,8 +2552,8 @@ cfil_dispatch_attach_event(struct socket *so, uint32_t filter_control_unit)
                goto done;
        }
        microuptime(&entry->cfe_last_event);
-       so->so_cfil->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec;
-       so->so_cfil->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec;
+       cfil_info->cfi_first_event.tv_sec = entry->cfe_last_event.tv_sec;
+       cfil_info->cfi_first_event.tv_usec = entry->cfe_last_event.tv_usec;
 
        entry->cfe_flags |= CFEF_SENT_SOCK_ATTACHED;
        OSIncrementAtomic(&cfil_stats.cfs_attach_event_ok);
@@ -2218,7 +2580,7 @@ done:
 }
 
 static int
-cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
+cfil_dispatch_disconnect_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
 {
        errno_t error = 0;
        struct mbuf *msg = NULL;
@@ -2231,7 +2593,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
 
        cfil_rw_lock_shared(&cfil_lck_rw);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -2242,7 +2604,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
                goto done;
 
        CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
-               (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
 
        /*
         * Send the disconnection event once
@@ -2270,6 +2632,12 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
                goto done;
        }
 
+#if LIFECYCLE_DEBUG
+       cfil_info_log(LOG_ERR, cfil_info, outgoing ?
+                                "CFIL: LIFECYCLE: OUT - SENDING DISCONNECT UP":
+                                "CFIL: LIFECYCLE: IN - SENDING DISCONNECT UP");
+#endif
+
        bzero(&msg_disconnected, sizeof(struct cfil_msg_hdr));
        msg_disconnected.cfm_len = sizeof(struct cfil_msg_hdr);
        msg_disconnected.cfm_version = CFM_VERSION_CURRENT;
@@ -2288,7 +2656,7 @@ cfil_dispatch_disconnect_event(struct socket *so, uint32_t kcunit, int outgoing)
                goto done;
        }
        microuptime(&entry->cfe_last_event);
-       CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, msg_disconnected.cfm_op);
+       CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, msg_disconnected.cfm_op);
 
        /* Remember we have sent the disconnection message */
        if (outgoing) {
@@ -2321,7 +2689,7 @@ done:
 }
 
 int
-cfil_dispatch_closed_event(struct socket *so, int kcunit)
+cfil_dispatch_closed_event(struct socket *so, struct cfil_info *cfil_info, int kcunit)
 {
        struct cfil_entry *entry;
        struct cfil_msg_sock_closed msg_closed;
@@ -2332,13 +2700,13 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit)
 
        cfil_rw_lock_shared(&cfil_lck_rw);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        cfc = entry->cfe_filter;
        if (cfc == NULL)
                goto done;
 
        CFIL_LOG(LOG_INFO, "so %llx kcunit %d",
-               (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
 
        /* Would be wasteful to try when flow controlled */
        if (cfc->cf_flags & CFF_FLOW_CONTROLLED) {
@@ -2354,7 +2722,7 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit)
                goto done;
 
        microuptime(&entry->cfe_last_event);
-       CFI_ADD_TIME_LOG(so->so_cfil, &entry->cfe_last_event, &so->so_cfil->cfi_first_event, CFM_OP_SOCKET_CLOSED);
+       CFI_ADD_TIME_LOG(cfil_info, &entry->cfe_last_event, &cfil_info->cfi_first_event, CFM_OP_SOCKET_CLOSED);
 
        bzero(&msg_closed, sizeof(struct cfil_msg_sock_closed));
        msg_closed.cfc_msghdr.cfm_len = sizeof(struct cfil_msg_sock_closed);
@@ -2362,13 +2730,15 @@ cfil_dispatch_closed_event(struct socket *so, int kcunit)
        msg_closed.cfc_msghdr.cfm_type = CFM_TYPE_EVENT;
        msg_closed.cfc_msghdr.cfm_op = CFM_OP_SOCKET_CLOSED;
        msg_closed.cfc_msghdr.cfm_sock_id = entry->cfe_cfil_info->cfi_sock_id;
-       msg_closed.cfc_first_event.tv_sec = so->so_cfil->cfi_first_event.tv_sec;
-       msg_closed.cfc_first_event.tv_usec = so->so_cfil->cfi_first_event.tv_usec;
-       memcpy(msg_closed.cfc_op_time, so->so_cfil->cfi_op_time, sizeof(uint32_t)*CFI_MAX_TIME_LOG_ENTRY);
-       memcpy(msg_closed.cfc_op_list, so->so_cfil->cfi_op_list, sizeof(unsigned char)*CFI_MAX_TIME_LOG_ENTRY);
-       msg_closed.cfc_op_list_ctr = so->so_cfil->cfi_op_list_ctr;
-
-       CFIL_LOG(LOG_INFO, "sock id %llu, op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, so->so_cfil->cfi_op_list_ctr, so->so_cfil->cfi_first_event.tv_sec, so->so_cfil->cfi_first_event.tv_usec);
+       msg_closed.cfc_first_event.tv_sec = cfil_info->cfi_first_event.tv_sec;
+       msg_closed.cfc_first_event.tv_usec = cfil_info->cfi_first_event.tv_usec;
+       memcpy(msg_closed.cfc_op_time, cfil_info->cfi_op_time, sizeof(uint32_t)*CFI_MAX_TIME_LOG_ENTRY);
+       memcpy(msg_closed.cfc_op_list, cfil_info->cfi_op_list, sizeof(unsigned char)*CFI_MAX_TIME_LOG_ENTRY);
+       msg_closed.cfc_op_list_ctr = cfil_info->cfi_op_list_ctr;
+
+#if LIFECYCLE_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: SENDING CLOSED UP: <sock id %llu> op ctr %d, start time %llu.%llu", msg_closed.cfc_msghdr.cfm_sock_id, cfil_info->cfi_op_list_ctr, cfil_info->cfi_first_event.tv_sec, cfil_info->cfi_first_event.tv_usec);
+#endif
        /* for debugging
        if (msg_closed.cfc_op_list_ctr > CFI_MAX_TIME_LOG_ENTRY) {
                msg_closed.cfc_op_list_ctr  = CFI_MAX_TIME_LOG_ENTRY;       // just in case
@@ -2441,9 +2811,45 @@ fill_ip_sockaddr_4_6(union sockaddr_in_4_6 *sin46,
        sin->sin_addr.s_addr = ip.s_addr;
 }
 
+static void
+cfil_get_flow_address_v6(struct cfil_hash_entry *entry, struct inpcb *inp,
+                                                struct in6_addr **laddr, struct in6_addr **faddr,
+                                                u_int16_t *lport, u_int16_t *fport)
+{
+       if (entry != NULL) {
+               *laddr = &entry->cfentry_laddr.addr6;
+               *faddr = &entry->cfentry_faddr.addr6;
+               *lport = entry->cfentry_lport;
+               *fport = entry->cfentry_fport;
+       } else {
+               *laddr = &inp->in6p_laddr;
+               *faddr = &inp->in6p_faddr;
+               *lport = inp->inp_lport;
+               *fport = inp->inp_fport;
+       }
+}
+
+static void
+cfil_get_flow_address(struct cfil_hash_entry *entry, struct inpcb *inp,
+                                         struct in_addr *laddr, struct in_addr *faddr,
+                                         u_int16_t *lport, u_int16_t *fport)
+{
+       if (entry != NULL) {
+               *laddr = entry->cfentry_laddr.addr46.ia46_addr4;
+               *faddr = entry->cfentry_faddr.addr46.ia46_addr4;
+               *lport = entry->cfentry_lport;
+               *fport = entry->cfentry_fport;
+       } else {
+               *laddr = inp->inp_laddr;
+               *faddr = inp->inp_faddr;
+               *lport = inp->inp_lport;
+               *fport = inp->inp_fport;
+       }
+}
+
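Note on the two helpers added above: the data-event path now takes addresses and ports from the per-flow UDP hash entry when one exists, and falls back to the inpcb fields for plain connected sockets. The sketch below is a minimal stand-alone model of that fallback only; the flow_entry/pcb types and get_flow_address() name are hypothetical and not part of this commit.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified stand-ins for struct cfil_hash_entry and struct inpcb. */
struct flow_entry { uint32_t laddr, faddr; uint16_t lport, fport; };
struct pcb        { uint32_t laddr, faddr; uint16_t lport, fport; };

/* Prefer the per-flow entry (UDP flows) and fall back to the pcb (connected
 * sockets), mirroring the shape of cfil_get_flow_address() above. */
static void
get_flow_address(const struct flow_entry *entry, const struct pcb *inp,
    uint32_t *laddr, uint32_t *faddr, uint16_t *lport, uint16_t *fport)
{
        if (entry != NULL) {
                *laddr = entry->laddr; *faddr = entry->faddr;
                *lport = entry->lport; *fport = entry->fport;
        } else {
                *laddr = inp->laddr; *faddr = inp->faddr;
                *lport = inp->lport; *fport = inp->fport;
        }
}

int
main(void)
{
        struct pcb inp = { 0x7f000001, 0x7f000002, 1234, 80 };
        uint32_t l, f;
        uint16_t lp, fp;

        get_flow_address(NULL, &inp, &l, &f, &lp, &fp); /* no hash entry: pcb fallback */
        printf("laddr %08x -> faddr %08x (%u -> %u)\n",
            (unsigned)l, (unsigned)f, (unsigned)lp, (unsigned)fp);
        return 0;
}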
 static int
-cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
-       struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
+cfil_dispatch_data_event(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
+                                                struct mbuf *data, unsigned int copyoffset, unsigned int copylen)
 {
        errno_t error = 0;
        struct mbuf *copy = NULL;
@@ -2459,7 +2865,7 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
 
        cfil_rw_lock_shared(&cfil_lck_rw);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -2469,6 +2875,12 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
        if (cfc == NULL)
                goto done;
 
+       data = cfil_data_start(data);
+       if (data == NULL || (data->m_flags & M_PKTHDR) == 0) {
+               CFIL_LOG(LOG_ERR, "NOT PKTHDR");
+               goto done;
+       }
+
        CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
                (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit, outgoing);
 
@@ -2522,33 +2934,36 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
         * parameters
         */
        if (inp->inp_vflag & INP_IPV6) {
+               struct in6_addr *laddr = NULL, *faddr = NULL;
+               u_int16_t lport = 0, fport = 0;
+
+               cfil_get_flow_address_v6(cfil_info->cfi_hash_entry, inp,
+                                                                &laddr, &faddr, &lport, &fport);
                if (outgoing) {
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_src,
-                               &inp->in6p_laddr, inp->inp_lport);
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
-                               &inp->in6p_faddr, inp->inp_fport);
+                       fill_ip6_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
+                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
                } else {
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_src,
-                               &inp->in6p_faddr, inp->inp_fport);
-                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst,
-                               &inp->in6p_laddr, inp->inp_lport);
+                       fill_ip6_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
+                       fill_ip6_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
                }
        } else if (inp->inp_vflag & INP_IPV4) {
+               struct in_addr laddr = {0}, faddr = {0};
+               u_int16_t lport = 0, fport = 0;
+
+               cfil_get_flow_address(cfil_info->cfi_hash_entry, inp,
+                                                         &laddr, &faddr, &lport, &fport);
+
                if (outgoing) {
-                       fill_ip_sockaddr_4_6(&data_req->cfc_src,
-                               inp->inp_laddr, inp->inp_lport);
-                       fill_ip_sockaddr_4_6(&data_req->cfc_dst,
-                               inp->inp_faddr, inp->inp_fport);
+                       fill_ip_sockaddr_4_6(&data_req->cfc_src, laddr, lport);
+                       fill_ip_sockaddr_4_6(&data_req->cfc_dst, faddr, fport);
                } else {
-                       fill_ip_sockaddr_4_6(&data_req->cfc_src,
-                               inp->inp_faddr, inp->inp_fport);
-                       fill_ip_sockaddr_4_6(&data_req->cfc_dst,
-                               inp->inp_laddr, inp->inp_lport);
+                       fill_ip_sockaddr_4_6(&data_req->cfc_src, faddr, fport);
+                       fill_ip_sockaddr_4_6(&data_req->cfc_dst, laddr, lport);
                }
        }
 
        microuptime(&tv);
-       CFI_ADD_TIME_LOG(so->so_cfil, &tv, &so->so_cfil->cfi_first_event, data_req->cfd_msghdr.cfm_op);
+       CFI_ADD_TIME_LOG(cfil_info, &tv, &cfil_info->cfi_first_event, data_req->cfd_msghdr.cfm_op);
 
        /* Pass the message to the content filter */
        error = ctl_enqueuembuf(entry->cfe_filter->cf_kcref,
@@ -2561,6 +2976,12 @@ cfil_dispatch_data_event(struct socket *so, uint32_t kcunit, int outgoing,
        }
        entry->cfe_flags &= ~CFEF_FLOW_CONTROLLED;
        OSIncrementAtomic(&cfil_stats.cfs_data_event_ok);
+
+#if VERDICT_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: VERDICT ACTION: so %llx sockID %llu outgoing %d: mbuf %llx copyoffset %u copylen %u",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing, (uint64_t)VM_KERNEL_ADDRPERM(data), copyoffset, copylen);
+#endif
+
 done:
        if (error == ENOBUFS) {
                entry->cfe_flags |= CFEF_FLOW_CONTROLLED;
@@ -2586,7 +3007,7 @@ done:
  * Process the queue of data waiting to be delivered to content filter
  */
 static int
-cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
+cfil_data_service_ctl_q(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
 {
        errno_t error = 0;
        struct mbuf *data, *tmp = NULL;
@@ -2595,7 +3016,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
        struct cfe_buf *entrybuf;
        uint64_t currentoffset = 0;
 
-       if (so->so_cfil == NULL)
+       if (cfil_info == NULL)
                return (0);
 
        CFIL_LOG(LOG_INFO, "so %llx kcunit %u outgoing %d",
@@ -2603,7 +3024,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
 
        socket_lock_assert_owned(so);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -2611,7 +3032,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
 
        /* Send attached message if not yet done */
        if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0) {
-               error = cfil_dispatch_attach_event(so, kcunit);
+               error = cfil_dispatch_attach_event(so, cfil_info, kcunit);
                if (error != 0) {
                        /* We can recover from flow control */
                        if (error == ENOBUFS || error == ENOMEM)
@@ -2622,15 +3043,18 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                OSIncrementAtomic(&cfil_stats.cfs_ctl_q_not_started);
                goto done;
        }
-       CFIL_LOG(LOG_DEBUG, "pass_offset %llu peeked %llu peek_offset %llu",
+
+#if DATA_DEBUG
+       CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE CTL-Q: pass_offset %llu peeked %llu peek_offset %llu",
                entrybuf->cfe_pass_offset,
                entrybuf->cfe_peeked,
                entrybuf->cfe_peek_offset);
+#endif
 
        /* Move all data that can pass */
        while ((data = cfil_queue_first(&entrybuf->cfe_ctl_q)) != NULL &&
                entrybuf->cfe_ctl_q.q_start < entrybuf->cfe_pass_offset) {
-               datalen = cfil_data_length(data, NULL);
+               datalen = cfil_data_length(data, NULL, NULL);
                tmp = data;
 
                if (entrybuf->cfe_ctl_q.q_start + datalen <=
@@ -2648,15 +3072,17 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                }
                VERIFY(copylen <= datalen);
 
+#if DATA_DEBUG
                CFIL_LOG(LOG_DEBUG,
-                       "%llx first %llu peeked %llu pass %llu peek %llu"
-                       "datalen %u copylen %u",
-                       (uint64_t)VM_KERNEL_ADDRPERM(tmp),
-                       entrybuf->cfe_ctl_q.q_start,
-                       entrybuf->cfe_peeked,
-                       entrybuf->cfe_pass_offset,
-                       entrybuf->cfe_peek_offset,
-                       datalen, copylen);
+                                "CFIL: SERVICE CTL-Q PASSING: %llx first %llu peeked %llu pass %llu peek %llu "
+                                "datalen %u copylen %u",
+                                (uint64_t)VM_KERNEL_ADDRPERM(tmp),
+                                entrybuf->cfe_ctl_q.q_start,
+                                entrybuf->cfe_peeked,
+                                entrybuf->cfe_pass_offset,
+                                entrybuf->cfe_peek_offset,
+                                datalen, copylen);
+#endif
 
                /*
                 * Data that passes has been peeked at explicitly or
@@ -2683,7 +3109,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                        OSAddAtomic64(datalen,
                                &cfil_stats.cfs_pending_q_in_enqueued);
        }
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        if (tmp != NULL)
                CFIL_LOG(LOG_DEBUG,
                        "%llx first %llu peeked %llu pass %llu peek %llu"
@@ -2702,7 +3128,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                data != NULL && currentoffset < entrybuf->cfe_peek_offset;
                data = cfil_queue_next(&entrybuf->cfe_ctl_q, data),
                currentoffset += datalen) {
-               datalen = cfil_data_length(data, NULL);
+               datalen = cfil_data_length(data, NULL, NULL);
                tmp = data;
 
                /* We've already peeked at this mbuf */
@@ -2725,15 +3151,17 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                                (currentoffset + copyoffset);
                }
 
+#if DATA_DEBUG
                CFIL_LOG(LOG_DEBUG,
-                       "%llx current %llu peeked %llu pass %llu peek %llu"
-                       "datalen %u copylen %u copyoffset %u",
-                       (uint64_t)VM_KERNEL_ADDRPERM(tmp),
-                       currentoffset,
-                       entrybuf->cfe_peeked,
-                       entrybuf->cfe_pass_offset,
-                       entrybuf->cfe_peek_offset,
-                       datalen, copylen, copyoffset);
+                                "CFIL: SERVICE CTL-Q PEEKING: %llx current %llu peeked %llu pass %llu peek %llu "
+                                "datalen %u copylen %u copyoffset %u",
+                                (uint64_t)VM_KERNEL_ADDRPERM(tmp),
+                                currentoffset,
+                                entrybuf->cfe_peeked,
+                                entrybuf->cfe_pass_offset,
+                                entrybuf->cfe_peek_offset,
+                                datalen, copylen, copyoffset);
+#endif
 
                /*
                 * Stop if there is nothing more to peek at
@@ -2743,7 +3171,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                /*
                 * Let the filter get a peek at this span of data
                 */
-               error = cfil_dispatch_data_event(so, kcunit,
+               error = cfil_dispatch_data_event(so, cfil_info, kcunit,
                        outgoing, data, copyoffset, copylen);
                if (error != 0) {
                        /* On error, leave data in ctl_q */
@@ -2761,7 +3189,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
                if (copylen + copyoffset < datalen)
                        break;
        }
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        if (tmp != NULL)
                CFIL_LOG(LOG_DEBUG,
                        "%llx first %llu peeked %llu pass %llu peek %llu"
@@ -2776,7 +3204,7 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
        /*
         * Process data that has passed the filter
         */
-       error = cfil_service_pending_queue(so, kcunit, outgoing);
+       error = cfil_service_pending_queue(so, cfil_info, kcunit, outgoing);
        if (error != 0) {
                CFIL_LOG(LOG_ERR, "cfil_service_pending_queue() error %d",
                        error);
@@ -2786,16 +3214,16 @@ cfil_data_service_ctl_q(struct socket *so, uint32_t kcunit, int outgoing)
        /*
         * Dispatch disconnect events that could not be sent
         */
-       if (so->so_cfil == NULL)
+       if (cfil_info == NULL)
                goto done;
        else if (outgoing) {
-               if ((so->so_cfil->cfi_flags & CFIF_SHUT_WR) &&
+               if ((cfil_info->cfi_flags & CFIF_SHUT_WR) &&
                    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_OUT))
-                       cfil_dispatch_disconnect_event(so, kcunit, 1);
+                       cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
        } else {
-               if ((so->so_cfil->cfi_flags & CFIF_SHUT_RD) &&
+               if ((cfil_info->cfi_flags & CFIF_SHUT_RD) &&
                    !(entry->cfe_flags & CFEF_SENT_DISCONNECT_IN))
-                       cfil_dispatch_disconnect_event(so, kcunit, 0);
+                       cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
        }
 
 done:
@@ -2806,7 +3234,7 @@ done:
                entrybuf->cfe_pass_offset,
                entrybuf->cfe_peek_offset);
 
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        return (error);
 }
 
@@ -2816,7 +3244,7 @@ done:
  * Process data for a content filter installed on a socket
  */
 int
-cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
+cfil_data_filter(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
        struct mbuf *data, uint64_t datalen)
 {
        errno_t error = 0;
@@ -2828,7 +3256,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
 
        socket_lock_assert_owned(so);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -2849,7 +3277,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
                OSAddAtomic64(datalen,
                        &cfil_stats.cfs_ctl_q_in_enqueued);
 
-       error = cfil_data_service_ctl_q(so, kcunit, outgoing);
+       error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
        if (error != 0) {
                CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
                        error);
@@ -2860,7 +3288,7 @@ cfil_data_filter(struct socket *so, uint32_t kcunit, int outgoing,
         */
        error = EJUSTRETURN;
 done:
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
 
        CFIL_LOG(LOG_INFO, "return %d", error);
        return (error);
@@ -2871,103 +3299,84 @@ done:
  * content filters
  */
 static int
-cfil_service_inject_queue(struct socket *so, int outgoing)
+cfil_service_inject_queue(struct socket *so, struct cfil_info *cfil_info, int outgoing)
 {
        mbuf_t data;
        unsigned int datalen;
-       int mbcnt;
-       unsigned int copylen;
+       int mbcnt = 0;
+       int mbnum = 0;
        errno_t error = 0;
-       struct mbuf *copy = NULL;
        struct cfi_buf *cfi_buf;
        struct cfil_queue *inject_q;
        int need_rwakeup = 0;
+       int count = 0;
 
-       if (so->so_cfil == NULL)
+       if (cfil_info == NULL)
                return (0);
 
-       CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
-               (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
-
        socket_lock_assert_owned(so);
 
        if (outgoing) {
-               cfi_buf = &so->so_cfil->cfi_snd;
-               so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
+               cfi_buf = &cfil_info->cfi_snd;
+               cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_OUT;
        } else {
-               cfi_buf = &so->so_cfil->cfi_rcv;
-               so->so_cfil->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
+               cfi_buf = &cfil_info->cfi_rcv;
+               cfil_info->cfi_flags &= ~CFIF_RETRY_INJECT_IN;
        }
        inject_q = &cfi_buf->cfi_inject_q;
 
-       while ((data = cfil_queue_first(inject_q)) != NULL) {
-               datalen = cfil_data_length(data, &mbcnt);
-
-               CFIL_LOG(LOG_INFO, "data %llx datalen %u",
-                       (uint64_t)VM_KERNEL_ADDRPERM(data), datalen);
-
-               /* Make a copy in case of injection error */
-               copy = m_copym_mode(data, 0, M_COPYALL, M_DONTWAIT,
-                       M_COPYM_COPY_HDR);
-               if (copy == NULL) {
-                       CFIL_LOG(LOG_ERR, "m_copym_mode() failed");
-                       error = ENOMEM;
-                       break;
-               }
+       if (cfil_queue_empty(inject_q))
+               return (0);
 
-               if ((copylen = m_length(copy)) != datalen)
-                       panic("%s so %p copylen %d != datalen %d",
-                               __func__, so, copylen, datalen);
+#if DATA_DEBUG | VERDICT_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> outgoing %d queue len %llu",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, cfil_queue_len(inject_q));
+#endif
 
-               if (outgoing) {
-                       socket_unlock(so, 0);
+       while ((data = cfil_queue_first(inject_q)) != NULL) {
+               datalen = cfil_data_length(data, &mbcnt, &mbnum);
 
-                       /*
-                        * Set both DONTWAIT and NBIO flags are we really
-                        * do not want to block
-                        */
-                       error = sosend(so, NULL, NULL,
-                                       copy, NULL,
-                                       MSG_SKIPCFIL | MSG_DONTWAIT | MSG_NBIO);
+#if DATA_DEBUG
+               CFIL_LOG(LOG_DEBUG, "CFIL: SERVICE INJECT-Q: <%s>: <so %llx> data %llx datalen %u (mbcnt %u)",
+                                remote_addr_ptr ? "UNCONNECTED" : "CONNECTED",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so), (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, mbcnt);
+#endif
 
-                       socket_lock(so, 0);
+               /* Remove data from queue and adjust stats */
+               cfil_queue_remove(inject_q, data, datalen);
+               cfi_buf->cfi_pending_first += datalen;
+               cfi_buf->cfi_pending_mbcnt -= mbcnt;
+               cfi_buf->cfi_pending_mbnum -= mbnum;
+               cfil_info_buf_verify(cfi_buf);
 
+               if (outgoing) {
+                       error = sosend_reinject(so, NULL, data, NULL, 0);
                        if (error != 0) {
-                               CFIL_LOG(LOG_ERR, "sosend() failed %d",
-                                       error);
+#if DATA_DEBUG
+                               cfil_info_log(LOG_ERR, cfil_info, "CFIL: Error: sosend_reinject() failed");
+                               CFIL_LOG(LOG_ERR, "### sosend() failed %d", error);
+#endif
+                               break;
                        }
+                       // At least one injection succeeded, need to wake up pending threads.
+                       need_rwakeup = 1;
                } else {
-                       copy->m_flags |= M_SKIPCFIL;
+                       data->m_flags |= M_SKIPCFIL;
 
                        /*
-                        * NOTE:
-                        * This work only because we support plain TCP
-                        * For UDP, RAWIP, MPTCP and message TCP we'll
+                        * NOTE: We currently only support TCP and UDP.
+                        * For RAWIP, MPTCP and message TCP we'll
                         * need to call the appropriate sbappendxxx()
                         * or fix sock_inject_data_in()
                         */
-                       if (sbappendstream(&so->so_rcv, copy))
-                               need_rwakeup = 1;
-               }
-
-               /* Need to reassess if filter is still attached after unlock */
-               if (so->so_cfil == NULL) {
-                       CFIL_LOG(LOG_ERR, "so %llx cfil detached",
-                               (uint64_t)VM_KERNEL_ADDRPERM(so));
-                       OSIncrementAtomic(&cfil_stats.cfs_inject_q_detached);
-                       error = 0;
-                       break;
+                       if (IS_UDP(so) == TRUE) {
+                               if (sbappendchain(&so->so_rcv, data, 0))
+                                       need_rwakeup = 1;
+                       } else {
+                               if (sbappendstream(&so->so_rcv, data))
+                                       need_rwakeup = 1;
+                       }
                }
-               if (error != 0)
-                       break;
-
-               /* Injection successful */
-               cfil_queue_remove(inject_q, data, datalen);
-               mbuf_freem(data);
-
-               cfi_buf->cfi_pending_first += datalen;
-               cfi_buf->cfi_pending_mbcnt -= mbcnt;
-               cfil_info_buf_verify(cfi_buf);
 
                if (outgoing)
                        OSAddAtomic64(datalen,
@@ -2975,23 +3384,34 @@ cfil_service_inject_queue(struct socket *so, int outgoing)
                else
                        OSAddAtomic64(datalen,
                                &cfil_stats.cfs_inject_q_in_passed);
+
+               count++;
        }
 
+#if DATA_DEBUG | VERDICT_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: SERVICE INJECT-Q: <so %llx> injected %d",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), count);
+#endif
+
        /* A single wakeup for several packets is more efficient */
-       if (need_rwakeup)
-               sorwakeup(so);
+       if (need_rwakeup) {
+               if (outgoing == TRUE)
+                       sowwakeup(so);
+               else
+                       sorwakeup(so);
+       }
 
-       if (error != 0 && so->so_cfil) {
+       if (error != 0 && cfil_info) {
                if (error == ENOBUFS)
                        OSIncrementAtomic(&cfil_stats.cfs_inject_q_nobufs);
                if (error == ENOMEM)
                        OSIncrementAtomic(&cfil_stats.cfs_inject_q_nomem);
 
                if (outgoing) {
-                       so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_OUT;
+                       cfil_info->cfi_flags |= CFIF_RETRY_INJECT_OUT;
                        OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_fail);
                } else {
-                       so->so_cfil->cfi_flags |= CFIF_RETRY_INJECT_IN;
+                       cfil_info->cfi_flags |= CFIF_RETRY_INJECT_IN;
                        OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_fail);
                }
        }
@@ -2999,26 +3419,26 @@ cfil_service_inject_queue(struct socket *so, int outgoing)
        /*
         * Notify
         */
-       if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_SHUT_WR)) {
+       if (cfil_info && (cfil_info->cfi_flags & CFIF_SHUT_WR)) {
                cfil_sock_notify_shutdown(so, SHUT_WR);
                if (cfil_sock_data_pending(&so->so_snd) == 0)
                        soshutdownlock_final(so, SHUT_WR);
        }
-       if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
+       if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
                if (cfil_filters_attached(so) == 0) {
                        CFIL_LOG(LOG_INFO, "so %llx waking",
                                (uint64_t)VM_KERNEL_ADDRPERM(so));
-                       wakeup((caddr_t)&so->so_cfil);
+                       wakeup((caddr_t)cfil_info);
                }
        }
 
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
 
        return (error);
 }
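The rewritten cfil_service_inject_queue() above removes each chain from the inject queue before reinjecting it (sosend_reinject() for outgoing data, sbappendchain()/sbappendstream() for incoming) and defers the wakeup so one sowwakeup()/sorwakeup() covers the whole batch. Below is a minimal stand-alone model of that drain-then-wake-once pattern; the pkt/queue types are made up and reinject() is only a stub for the real reinjection calls.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical packet queue, for illustration only. */
struct pkt { int len; struct pkt *next; };
struct queue { struct pkt *head; };

static struct pkt *queue_first(struct queue *q) { return q->head; }
static void queue_remove(struct queue *q) { q->head = q->head->next; }

/* Stand-in for sosend_reinject()/sbappendstream(); nonzero models a failure. */
static int reinject(struct pkt *p) { (void)p; return 0; }

static void wakeup_once(void) { printf("single wakeup for the whole batch\n"); }

static int
service_inject_queue(struct queue *q)
{
        struct pkt *p;
        bool need_wakeup = false;
        int error = 0;

        while ((p = queue_first(q)) != NULL) {
                queue_remove(q);          /* dequeue before attempting reinjection */
                error = reinject(p);
                if (error != 0)
                        break;            /* later packets stay queued for a retry */
                need_wakeup = true;       /* at least one packet was delivered */
        }
        if (need_wakeup)
                wakeup_once();            /* one wakeup covers several packets */
        return error;
}

int
main(void)
{
        struct pkt c = { 300, NULL }, b = { 200, &c }, a = { 100, &b };
        struct queue q = { &a };
        return service_inject_queue(&q);
}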
 
 static int
-cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
+cfil_service_pending_queue(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing)
 {
        uint64_t passlen, curlen;
        mbuf_t data;
@@ -3033,7 +3453,7 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
 
        socket_lock_assert_owned(so);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -3049,12 +3469,14 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
         */
        curlen = 0;
        while ((data = cfil_queue_first(pending_q)) != NULL) {
-               datalen = cfil_data_length(data, NULL);
+               datalen = cfil_data_length(data, NULL, NULL);
 
-               CFIL_LOG(LOG_INFO,
-                       "data %llx datalen %u passlen %llu curlen %llu",
+#if DATA_DEBUG
+               CFIL_LOG(LOG_DEBUG,
+                                "CFIL: SERVICE PENDING-Q: data %llx datalen %u passlen %llu curlen %llu",
                        (uint64_t)VM_KERNEL_ADDRPERM(data), datalen,
                        passlen, curlen);
+#endif
 
                if (curlen + datalen > passlen)
                        break;
@@ -3066,7 +3488,7 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
                for (kcunit += 1;
                        kcunit <= MAX_CONTENT_FILTER;
                        kcunit++) {
-                       error = cfil_data_filter(so, kcunit, outgoing,
+                       error = cfil_data_filter(so, cfil_info, kcunit, outgoing,
                                data, datalen);
                        /* 0 means passed so we can continue */
                        if (error != 0)
@@ -3076,13 +3498,13 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
                if (error == 0) {
                        if (outgoing) {
                                cfil_queue_enqueue(
-                                       &so->so_cfil->cfi_snd.cfi_inject_q,
+                                       &cfil_info->cfi_snd.cfi_inject_q,
                                        data, datalen);
                                OSAddAtomic64(datalen,
                                        &cfil_stats.cfs_inject_q_out_enqueued);
                        } else {
                                cfil_queue_enqueue(
-                                       &so->so_cfil->cfi_rcv.cfi_inject_q,
+                                       &cfil_info->cfi_rcv.cfi_inject_q,
                                        data, datalen);
                                OSAddAtomic64(datalen,
                                        &cfil_stats.cfs_inject_q_in_enqueued);
@@ -3090,13 +3512,13 @@ cfil_service_pending_queue(struct socket *so, uint32_t kcunit, int outgoing)
                }
        }
 
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
 
        return (error);
 }
 
 int
-cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
+cfil_update_data_offsets(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
        uint64_t pass_offset, uint64_t peek_offset)
 {
        errno_t error = 0;
@@ -3108,19 +3530,19 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
 
        socket_lock_assert_owned(so);
 
-       if (so->so_cfil == NULL) {
+       if (cfil_info == NULL) {
                CFIL_LOG(LOG_ERR, "so %llx cfil detached",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = 0;
                goto done;
-       } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
+       } else if (cfil_info->cfi_flags & CFIF_DROP) {
                CFIL_LOG(LOG_ERR, "so %llx drop set",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = EPIPE;
                goto done;
        }
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
        if (outgoing)
                entrybuf = &entry->cfe_snd;
        else
@@ -3148,7 +3570,7 @@ cfil_update_data_offsets(struct socket *so, uint32_t kcunit, int outgoing,
                goto done;
 
        /* Move data held in control queue to pending queue if needed */
-       error = cfil_data_service_ctl_q(so, kcunit, outgoing);
+       error = cfil_data_service_ctl_q(so, cfil_info, kcunit, outgoing);
        if (error != 0) {
                CFIL_LOG(LOG_ERR, "cfil_data_service_ctl_q() error %d",
                        error);
@@ -3165,20 +3587,28 @@ done:
        if (entry != NULL &&
            ((entry->cfe_snd.cfe_pass_offset == CFM_MAX_OFFSET &&
            entry->cfe_rcv.cfe_pass_offset == CFM_MAX_OFFSET) ||
-           ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
+           ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
            cfil_queue_empty(&entry->cfe_snd.cfe_ctl_q) &&
            cfil_queue_empty(&entry->cfe_rcv.cfe_ctl_q)))) {
                entry->cfe_flags |= CFEF_CFIL_DETACHED;
+#if LIFECYCLE_DEBUG
+               cfil_info_log(LOG_ERR, cfil_info, outgoing ?
+                                        "CFIL: LIFECYCLE: OUT - PASSED ALL - DETACH":
+                                        "CFIL: LIFECYCLE: IN - PASSED ALL - DETACH");
+#endif
                CFIL_LOG(LOG_INFO, "so %llx detached %u",
                        (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
-               if ((so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT) &&
+               if ((cfil_info->cfi_flags & CFIF_CLOSE_WAIT) &&
                    cfil_filters_attached(so) == 0) {
+#if LIFECYCLE_DEBUG
+                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAKING");
+#endif
                        CFIL_LOG(LOG_INFO, "so %llx waking",
                                (uint64_t)VM_KERNEL_ADDRPERM(so));
-                       wakeup((caddr_t)&so->so_cfil);
+                       wakeup((caddr_t)cfil_info);
                }
        }
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        CFIL_LOG(LOG_INFO, "return %d", error);
        return (error);
 }
@@ -3187,7 +3617,7 @@ done:
  * Update pass offset for socket when no data is pending
  */
 static int
-cfil_set_socket_pass_offset(struct socket *so, int outgoing)
+cfil_set_socket_pass_offset(struct socket *so, struct cfil_info *cfil_info, int outgoing)
 {
        struct cfi_buf *cfi_buf;
        struct cfil_entry *entry;
@@ -3195,7 +3625,7 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing)
        uint32_t kcunit;
        uint64_t pass_offset = 0;
 
-       if (so->so_cfil == NULL)
+       if (cfil_info == NULL)
                return (0);
 
        CFIL_LOG(LOG_INFO, "so %llx outgoing %d",
@@ -3204,13 +3634,17 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing)
        socket_lock_assert_owned(so);
 
        if (outgoing)
-               cfi_buf = &so->so_cfil->cfi_snd;
+               cfi_buf = &cfil_info->cfi_snd;
        else
-               cfi_buf = &so->so_cfil->cfi_rcv;
+               cfi_buf = &cfil_info->cfi_rcv;
+
+       CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx, sockID %llu> outgoing %d cfi_pending_first %llu cfi_pending_last %llu",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, outgoing,
+                        cfi_buf->cfi_pending_first, cfi_buf->cfi_pending_last);
 
        if (cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first == 0) {
                for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-                       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+                       entry = &cfil_info->cfi_entries[kcunit - 1];
 
                        /* Are we attached to a filter? */
                        if (entry->cfe_filter == NULL)
@@ -3228,11 +3662,14 @@ cfil_set_socket_pass_offset(struct socket *so, int outgoing)
                cfi_buf->cfi_pass_offset = pass_offset;
        }
 
+       CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx, sockID %llu>, cfi_pass_offset %llu",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), cfil_info->cfi_sock_id, cfi_buf->cfi_pass_offset);
+
        return (0);
 }
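cfil_set_socket_pass_offset() only recomputes cfi_pass_offset when nothing is pending, and in the unchanged loop body elided from this hunk the value it settles on is effectively the most conservative pass offset among the filters still attached, so the slowest filter gates the fast path. A small self-contained model of that minimum-across-entries computation follows; the entry array is hypothetical, not the kernel structures.

#include <stdint.h>
#include <stdio.h>

#define MAX_FILTERS 8

/* Hypothetical per-filter entry: attached flag plus its current pass offset. */
struct entry { int attached; uint64_t pass_offset; };

/* The socket-wide pass offset is the smallest pass offset over all attached
 * filters (0 when none is attached). */
static uint64_t
socket_pass_offset(const struct entry entries[MAX_FILTERS])
{
        uint64_t pass_offset = 0;
        int i;

        for (i = 0; i < MAX_FILTERS; i++) {
                if (!entries[i].attached)
                        continue;
                if (pass_offset == 0 || entries[i].pass_offset < pass_offset)
                        pass_offset = entries[i].pass_offset;
        }
        return pass_offset;
}

int
main(void)
{
        struct entry entries[MAX_FILTERS] = {
                { 1, 4096 }, { 1, 1500 }   /* remaining slots: not attached */
        };

        printf("socket pass offset = %llu\n",
            (unsigned long long)socket_pass_offset(entries));
        return 0;
}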
 
 int
-cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
+cfil_action_data_pass(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit, int outgoing,
        uint64_t pass_offset, uint64_t peek_offset)
 {
        errno_t error = 0;
@@ -3241,7 +3678,7 @@ cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
 
        socket_lock_assert_owned(so);
 
-       error = cfil_acquire_sockbuf(so, outgoing);
+       error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
        if (error != 0) {
                CFIL_LOG(LOG_INFO, "so %llx %s dropped",
                        (uint64_t)VM_KERNEL_ADDRPERM(so),
@@ -3249,14 +3686,14 @@ cfil_action_data_pass(struct socket *so, uint32_t kcunit, int outgoing,
                goto release;
        }
 
-       error = cfil_update_data_offsets(so, kcunit, outgoing,
+       error = cfil_update_data_offsets(so, cfil_info, kcunit, outgoing,
                pass_offset, peek_offset);
 
-       cfil_service_inject_queue(so, outgoing);
+       cfil_service_inject_queue(so, cfil_info, outgoing);
 
-       cfil_set_socket_pass_offset(so, outgoing);
+       cfil_set_socket_pass_offset(so, cfil_info, outgoing);
 release:
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        cfil_release_sockbuf(so, outgoing);
 
        return (error);
@@ -3264,13 +3701,13 @@ release:
 
 
 static void
-cfil_flush_queues(struct socket *so)
+cfil_flush_queues(struct socket *so, struct cfil_info *cfil_info)
 {
        struct cfil_entry *entry;
        int kcunit;
        uint64_t drained;
 
-       if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
+       if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL)
                goto done;
 
        socket_lock_assert_owned(so);
@@ -3279,19 +3716,19 @@ cfil_flush_queues(struct socket *so)
         * Flush the output queues and ignore errors as long as
         * we are attached
         */
-       (void) cfil_acquire_sockbuf(so, 1);
-       if (so->so_cfil != NULL) {
+       (void) cfil_acquire_sockbuf(so, cfil_info, 1);
+       if (cfil_info != NULL) {
                drained = 0;
                for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-                       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+                       entry = &cfil_info->cfi_entries[kcunit - 1];
 
                        drained += cfil_queue_drain(&entry->cfe_snd.cfe_ctl_q);
-                       drained += cfil_queue_drain(
-                           &entry->cfe_snd.cfe_pending_q);
+                       drained += cfil_queue_drain(&entry->cfe_snd.cfe_pending_q);
                }
-               drained += cfil_queue_drain(&so->so_cfil->cfi_snd.cfi_inject_q);
+               drained += cfil_queue_drain(&cfil_info->cfi_snd.cfi_inject_q);
+
                if (drained) {
-                       if (so->so_cfil->cfi_flags & CFIF_DROP)
+                       if (cfil_info->cfi_flags & CFIF_DROP)
                                OSIncrementAtomic(
                                        &cfil_stats.cfs_flush_out_drop);
                        else
@@ -3304,20 +3741,21 @@ cfil_flush_queues(struct socket *so)
        /*
         * Flush the input queues
         */
-       (void) cfil_acquire_sockbuf(so, 0);
-       if (so->so_cfil != NULL) {
+       (void) cfil_acquire_sockbuf(so, cfil_info, 0);
+       if (cfil_info != NULL) {
                drained = 0;
                for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-                       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+                       entry = &cfil_info->cfi_entries[kcunit - 1];
 
                                drained += cfil_queue_drain(
                                        &entry->cfe_rcv.cfe_ctl_q);
                                drained += cfil_queue_drain(
                                        &entry->cfe_rcv.cfe_pending_q);
                }
-               drained += cfil_queue_drain(&so->so_cfil->cfi_rcv.cfi_inject_q);
+               drained += cfil_queue_drain(&cfil_info->cfi_rcv.cfi_inject_q);
+
                if (drained) {
-                       if (so->so_cfil->cfi_flags & CFIF_DROP)
+                       if (cfil_info->cfi_flags & CFIF_DROP)
                                OSIncrementAtomic(
                                        &cfil_stats.cfs_flush_in_drop);
                        else
@@ -3327,28 +3765,28 @@ cfil_flush_queues(struct socket *so)
        }
        cfil_release_sockbuf(so, 0);
 done:
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
 }
 
 int
-cfil_action_drop(struct socket *so, uint32_t kcunit)
+cfil_action_drop(struct socket *so, struct cfil_info *cfil_info, uint32_t kcunit)
 {
        errno_t error = 0;
        struct cfil_entry *entry;
        struct proc *p;
 
-       if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
+       if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || cfil_info == NULL)
                goto done;
 
        socket_lock_assert_owned(so);
 
-       entry = &so->so_cfil->cfi_entries[kcunit - 1];
+       entry = &cfil_info->cfi_entries[kcunit - 1];
 
        /* Are we attached to the filter? */
        if (entry->cfe_filter == NULL)
                goto done;
 
-       so->so_cfil->cfi_flags |= CFIF_DROP;
+       cfil_info->cfi_flags |= CFIF_DROP;
 
        p = current_proc();
 
@@ -3356,28 +3794,33 @@ cfil_action_drop(struct socket *so, uint32_t kcunit)
         * Force the socket to be marked defunct
         * (forcing fixed along with rdar://19391339)
         */
-       error = sosetdefunct(p, so,
-           SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
-           FALSE);
+       if (so->so_cfil_db == NULL) {
+               error = sosetdefunct(p, so,
+                                                        SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL,
+                                                        FALSE);
 
-       /* Flush the socket buffer and disconnect */
-       if (error == 0)
-               error = sodefunct(p, so,
-                   SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
+               /* Flush the socket buffer and disconnect */
+               if (error == 0)
+                       error = sodefunct(p, so,
+                                                         SHUTDOWN_SOCKET_LEVEL_CONTENT_FILTER | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
+       }
 
        /* The filter is done, mark as detached */
        entry->cfe_flags |= CFEF_CFIL_DETACHED;
+#if LIFECYCLE_DEBUG
+       cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: DROP - DETACH");
+#endif
        CFIL_LOG(LOG_INFO, "so %llx detached %u",
                (uint64_t)VM_KERNEL_ADDRPERM(so), kcunit);
 
        /* Pending data needs to go */
-       cfil_flush_queues(so);
+       cfil_flush_queues(so, cfil_info);
 
-       if (so->so_cfil && (so->so_cfil->cfi_flags & CFIF_CLOSE_WAIT)) {
+       if (cfil_info && (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)) {
                if (cfil_filters_attached(so) == 0) {
                        CFIL_LOG(LOG_INFO, "so %llx waking",
                                (uint64_t)VM_KERNEL_ADDRPERM(so));
-                       wakeup((caddr_t)&so->so_cfil);
+                       wakeup((caddr_t)cfil_info);
                }
        }
 done:
@@ -3388,33 +3831,42 @@ int
 cfil_action_bless_client(uint32_t kcunit, struct cfil_msg_hdr *msghdr)
 {
        errno_t error = 0;
-
-       cfil_rw_lock_exclusive(&cfil_lck_rw);
+       struct cfil_info *cfil_info = NULL;
 
        bool cfil_attached = false;
        struct cfil_msg_bless_client *blessmsg = (struct cfil_msg_bless_client *)msghdr;
+
+       // Search and lock socket
        struct socket *so = cfil_socket_from_client_uuid(blessmsg->cfb_client_uuid, &cfil_attached);
        if (so == NULL) {
                error = ENOENT;
        } else {
                // The client gets a pass automatically
-               socket_lock(so, 1);
+               cfil_info = (so->so_cfil_db != NULL) ?
+                       cfil_db_get_cfil_info(so->so_cfil_db, msghdr->cfm_sock_id) : so->so_cfil;
+
                if (cfil_attached) {
-                       (void)cfil_action_data_pass(so, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
-                       (void)cfil_action_data_pass(so, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
+#if VERDICT_DEBUG
+                       if (cfil_info != NULL) {
+                               CFIL_LOG(LOG_ERR, "CFIL: VERDICT RECEIVED: BLESS %s <so %llx sockID %llu>",
+                                                cfil_info->cfi_hash_entry ? "UDP" : "TCP",
+                                                (uint64_t)VM_KERNEL_ADDRPERM(so),
+                                                cfil_info->cfi_sock_id);
+                       }
+#endif
+                       (void)cfil_action_data_pass(so, cfil_info, kcunit, 1, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
+                       (void)cfil_action_data_pass(so, cfil_info, kcunit, 0, CFM_MAX_OFFSET, CFM_MAX_OFFSET);
                } else {
                        so->so_flags1 |= SOF1_CONTENT_FILTER_SKIP;
                }
                socket_unlock(so, 1);
        }
 
-       cfil_rw_unlock_exclusive(&cfil_lck_rw);
-
        return (error);
 }
 
 static int
-cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
+cfil_update_entry_offsets(struct socket *so, struct cfil_info *cfil_info, int outgoing, unsigned int datalen)
 {
        struct cfil_entry *entry;
        struct cfe_buf *entrybuf;
@@ -3424,7 +3876,7 @@ cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
                (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing, datalen);
 
        for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-               entry = &so->so_cfil->cfi_entries[kcunit - 1];
+               entry = &cfil_info->cfi_entries[kcunit - 1];
 
                /* Are we attached to the filter? */
                if (entry->cfe_filter == NULL)
@@ -3446,62 +3898,94 @@ cfil_update_entry_offsets(struct socket *so, int outgoing, unsigned int datalen)
                entrybuf->cfe_pending_q.q_start += datalen;
                entrybuf->cfe_pending_q.q_end += datalen;
        }
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
        return (0);
 }
 
 int
-cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to,
+cfil_data_common(struct socket *so, struct cfil_info *cfil_info, int outgoing, struct sockaddr *to,
                struct mbuf *data, struct mbuf *control, uint32_t flags)
 {
 #pragma unused(to, control, flags)
        errno_t error = 0;
        unsigned int datalen;
-       int mbcnt;
+       int mbcnt = 0;
+       int mbnum = 0;
        int kcunit;
        struct cfi_buf *cfi_buf;
+       struct mbuf *chain = NULL;
 
-       if (so->so_cfil == NULL) {
+       if (cfil_info == NULL) {
                CFIL_LOG(LOG_ERR, "so %llx cfil detached",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = 0;
                goto done;
-       } else if (so->so_cfil->cfi_flags & CFIF_DROP) {
+       } else if (cfil_info->cfi_flags & CFIF_DROP) {
                CFIL_LOG(LOG_ERR, "so %llx drop set",
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                error = EPIPE;
                goto done;
        }
 
-       datalen = cfil_data_length(data, &mbcnt);
-
-       CFIL_LOG(LOG_INFO, "so %llx %s m %llx len %u flags 0x%x nextpkt %llx",
-               (uint64_t)VM_KERNEL_ADDRPERM(so),
-               outgoing ? "out" : "in",
-               (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
-               (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt));
+       datalen = cfil_data_length(data, &mbcnt, &mbnum);
 
        if (outgoing)
-               cfi_buf = &so->so_cfil->cfi_snd;
+               cfi_buf = &cfil_info->cfi_snd;
        else
-               cfi_buf = &so->so_cfil->cfi_rcv;
+               cfi_buf = &cfil_info->cfi_rcv;
 
        cfi_buf->cfi_pending_last += datalen;
        cfi_buf->cfi_pending_mbcnt += mbcnt;
+       cfi_buf->cfi_pending_mbnum += mbnum;
+
+       if (IS_UDP(so)) {
+               if (cfi_buf->cfi_pending_mbnum > cfil_udp_gc_mbuf_num_max ||
+                       cfi_buf->cfi_pending_mbcnt > cfil_udp_gc_mbuf_cnt_max) {
+                       cfi_buf->cfi_tail_drop_cnt++;
+                       cfi_buf->cfi_pending_mbcnt -= mbcnt;
+                       cfi_buf->cfi_pending_mbnum -= mbnum;
+                       return (EPIPE);
+               }
+       }
+
        cfil_info_buf_verify(cfi_buf);
 
-       CFIL_LOG(LOG_INFO, "so %llx cfi_pending_last %llu cfi_pass_offset %llu",
-               (uint64_t)VM_KERNEL_ADDRPERM(so),
-               cfi_buf->cfi_pending_last,
-               cfi_buf->cfi_pass_offset);
+#if DATA_DEBUG
+       CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: <so %llx> %s: data %llx len %u flags 0x%x nextpkt %llx - cfi_pending_last %llu cfi_pending_mbcnt %u   cfi_pass_offset %llu",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so),
+                        outgoing ? "OUT" : "IN",
+                        (uint64_t)VM_KERNEL_ADDRPERM(data), datalen, data->m_flags,
+                        (uint64_t)VM_KERNEL_ADDRPERM(data->m_nextpkt),
+                        cfi_buf->cfi_pending_last,
+                        cfi_buf->cfi_pending_mbcnt,
+                        cfi_buf->cfi_pass_offset);
+#endif
 
        /* Fast path when below pass offset */
        if (cfi_buf->cfi_pending_last <= cfi_buf->cfi_pass_offset) {
-               cfil_update_entry_offsets(so, outgoing, datalen);
+               cfil_update_entry_offsets(so, cfil_info, outgoing, datalen);
+#if DATA_DEBUG
+               CFIL_LOG(LOG_DEBUG, "CFIL: QUEUEING DATA: FAST PATH");
+#endif
        } else {
                for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
-                       error = cfil_data_filter(so, kcunit, outgoing, data,
-                               datalen);
+                       // Is cfil attached to this filter?
+                       if (IS_ENTRY_ATTACHED(cfil_info, kcunit)) {
+                               if (IS_UDP(so)) {
+                                       /* UDP only:
+                                        * Chain addr (incoming only, TBD), control (optional) and data into one chain.
+                                        * This full chain will be reinjected into the socket after receiving the verdict.
+                                        */
+                                       (void) cfil_udp_save_socket_state(cfil_info, data);
+                                       chain = sbconcat_mbufs(NULL, outgoing ? NULL : to, data, control);
+                                       if (chain == NULL) {
+                                               return (ENOBUFS);
+                                       }
+                                       data = chain;
+                               }
+                               error = cfil_data_filter(so, cfil_info, kcunit, outgoing, data,
+                                                                                datalen);
+                       }
                        /* 0 means passed so continue with next filter */
                        if (error != 0)
                                break;
@@ -3512,10 +3996,11 @@ cfil_data_common(struct socket *so, int outgoing, struct sockaddr *to,
        if (error == 0) {
                cfi_buf->cfi_pending_first += datalen;
                cfi_buf->cfi_pending_mbcnt -= mbcnt;
+               cfi_buf->cfi_pending_mbnum -= mbnum;
                cfil_info_buf_verify(cfi_buf);
        }
 done:
-       CFIL_INFO_VERIFY(so->so_cfil);
+       CFIL_INFO_VERIFY(cfil_info);
 
        return (error);
 }
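cfil_data_common() above now maintains three per-direction counters: cfi_pending_first/cfi_pending_last are running byte offsets, while cfi_pending_mbcnt and the new cfi_pending_mbnum track mbuf storage and chain length so a UDP flow can be tail-dropped once it crosses the cfil_udp_gc_mbuf_cnt_max/cfil_udp_gc_mbuf_num_max thresholds; data that stays below the pass offset takes the fast path. A rough stand-alone model of that bookkeeping is sketched below, with made-up limits and types rather than the kernel's cfi_buf and tunables.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical per-direction buffer state, loosely modeled on struct cfi_buf. */
struct buf_state {
        uint64_t pending_first;   /* bytes already released to the socket        */
        uint64_t pending_last;    /* total bytes seen so far in this direction   */
        uint64_t pass_offset;     /* everything below this offset already passed */
        uint32_t pending_mbcnt;   /* modeled mbuf storage accounting             */
        uint32_t pending_mbnum;   /* modeled mbuf (packet) count                 */
};

/* Made-up limits; the kernel uses the tunable cfil_udp_gc_* thresholds. */
#define MBNUM_MAX 4096
#define MBCNT_MAX (4096 * 1024)

/* Returns 0 for the fast path (below the pass offset), 1 when the data must be
 * queued for the filters, and -1 when a UDP flow would be tail-dropped. */
static int
account_data(struct buf_state *b, int is_udp,
    uint32_t datalen, uint32_t mbcnt, uint32_t mbnum)
{
        b->pending_last += datalen;
        b->pending_mbcnt += mbcnt;
        b->pending_mbnum += mbnum;

        if (is_udp &&
            (b->pending_mbnum > MBNUM_MAX || b->pending_mbcnt > MBCNT_MAX)) {
                /* Tail drop: undo the mbuf accounting and reject the datagram. */
                b->pending_mbcnt -= mbcnt;
                b->pending_mbnum -= mbnum;
                return -1;
        }
        return (b->pending_last <= b->pass_offset) ? 0 : 1;
}

int
main(void)
{
        struct buf_state b = { 0, 0, 1 << 20, 0, 0 };

        printf("verdict %d\n", account_data(&b, 1, 1400, 2048, 1)); /* fast path: 0 */
        return 0;
}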
@@ -3528,6 +4013,10 @@ cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
                struct mbuf *data, struct mbuf *control, uint32_t flags)
 {
        int error = 0;
+
+       if (IS_UDP(so)) {
+               return (cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags));
+       }
 
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return (0);
@@ -3556,7 +4045,7 @@ cfil_sock_data_out(struct socket *so, struct sockaddr  *to,
                panic("%s sb_cfil_thread %p not NULL", __func__,
                        so->so_snd.sb_cfil_thread);
 
-       error = cfil_data_common(so, 1, to, data, control, flags);
+       error = cfil_data_common(so, so->so_cfil, 1, to, data, control, flags);
 
        return (error);
 }
@@ -3570,6 +4059,10 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from,
 {
        int error = 0;
 
+       if (IS_UDP(so)) {
+               return (cfil_sock_udp_handle_data(FALSE, so, NULL, from, data, control, flags));
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return (0);
 
@@ -3590,7 +4083,7 @@ cfil_sock_data_in(struct socket *so, struct sockaddr *from,
                        (uint64_t)VM_KERNEL_ADDRPERM(so));
                OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
        }
-       error = cfil_data_common(so, 0, from, data, control, flags);
+       error = cfil_data_common(so, so->so_cfil, 0, from, data, control, flags);
 
        return (error);
 }
@@ -3608,6 +4101,10 @@ cfil_sock_shutdown(struct socket *so, int *how)
 {
        int error = 0;
 
+       if (IS_UDP(so)) {
+               return (cfil_sock_udp_shutdown(so, how));
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                goto done;
 
@@ -3689,6 +4186,11 @@ cfil_sock_is_closed(struct socket *so)
        errno_t error = 0;
        int kcunit;
 
+       if (IS_UDP(so)) {
+               cfil_sock_udp_is_closed(so);
+               return;
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return;
 
@@ -3698,19 +4200,19 @@ cfil_sock_is_closed(struct socket *so)
 
        for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
                /* Let the filters know of the closing */
-               error = cfil_dispatch_closed_event(so, kcunit);
+               error = cfil_dispatch_closed_event(so, so->so_cfil, kcunit);
        }
 
        /* Last chance to push passed data out */
-       error = cfil_acquire_sockbuf(so, 1);
+       error = cfil_acquire_sockbuf(so, so->so_cfil, 1);
        if (error == 0)
-               cfil_service_inject_queue(so, 1);
+               cfil_service_inject_queue(so, so->so_cfil, 1);
        cfil_release_sockbuf(so, 1);
 
        so->so_cfil->cfi_flags |= CFIF_SOCK_CLOSED;
 
        /* Pending data needs to go */
-       cfil_flush_queues(so);
+       cfil_flush_queues(so, so->so_cfil);
 
        CFIL_INFO_VERIFY(so->so_cfil);
 }
@@ -3727,6 +4229,11 @@ cfil_sock_notify_shutdown(struct socket *so, int how)
        errno_t error = 0;
        int kcunit;
 
+       if (IS_UDP(so)) {
+               cfil_sock_udp_notify_shutdown(so, how, 0, 0);
+               return;
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return;
 
@@ -3738,10 +4245,10 @@ cfil_sock_notify_shutdown(struct socket *so, int how)
        for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
                /* Disconnect incoming side */
                if (how != SHUT_WR)
-                       error = cfil_dispatch_disconnect_event(so, kcunit, 0);
+                       error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 0);
                /* Disconnect outgoing side */
                if (how != SHUT_RD)
-                       error = cfil_dispatch_disconnect_event(so, kcunit, 1);
+                       error = cfil_dispatch_disconnect_event(so, so->so_cfil, kcunit, 1);
        }
 }
 
@@ -3752,6 +4259,10 @@ cfil_filters_attached(struct socket *so)
        uint32_t kcunit;
        int attached = 0;
 
+       if (IS_UDP(so)) {
+               return cfil_filters_udp_attached(so, FALSE);
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return (0);
 
@@ -3785,6 +4296,11 @@ cfil_sock_close_wait(struct socket *so)
        struct timespec ts;
        int error;
 
+       if (IS_UDP(so)) {
+               cfil_sock_udp_close_wait(so);
+               return;
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return;
 
@@ -3818,7 +4334,7 @@ cfil_sock_close_wait(struct socket *so)
 
                OSIncrementAtomic(&cfil_stats.cfs_close_wait);
                so->so_cfil->cfi_flags |= CFIF_CLOSE_WAIT;
-               error = msleep((caddr_t)&so->so_cfil, mutex_held,
+               error = msleep((caddr_t)so->so_cfil, mutex_held,
                        PSOCK | PCATCH, "cfil_sock_close_wait", &ts);
                so->so_cfil->cfi_flags &= ~CFIF_CLOSE_WAIT;
 
@@ -3845,6 +4361,10 @@ cfil_sock_data_pending(struct sockbuf *sb)
        struct socket *so = sb->sb_so;
        uint64_t pending = 0;
 
+       if (IS_UDP(so)) {
+               return (cfil_sock_udp_data_pending(sb, FALSE));
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL) {
                struct cfi_buf *cfi_buf;
 
@@ -3881,6 +4401,10 @@ cfil_sock_data_space(struct sockbuf *sb)
        struct socket *so = sb->sb_so;
        uint64_t pending = 0;
 
+       if (IS_UDP(so)) {
+               return (cfil_sock_udp_data_pending(sb, TRUE));
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil != NULL &&
                so->so_snd.sb_cfil_thread != current_thread()) {
                struct cfi_buf *cfi_buf;
@@ -3920,6 +4444,11 @@ cfil_sock_buf_update(struct sockbuf *sb)
        int error;
        struct socket *so = sb->sb_so;
 
+       if (IS_UDP(so)) {
+               cfil_sock_udp_buf_update(sb);
+               return;
+       }
+
        if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || so->so_cfil == NULL)
                return;
 
@@ -3943,9 +4472,9 @@ cfil_sock_buf_update(struct sockbuf *sb)
        CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
                (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
 
-       error = cfil_acquire_sockbuf(so, outgoing);
+       error = cfil_acquire_sockbuf(so, so->so_cfil, outgoing);
        if (error == 0)
-               cfil_service_inject_queue(so, outgoing);
+               cfil_service_inject_queue(so, so->so_cfil, outgoing);
        cfil_release_sockbuf(so, outgoing);
 }
 
@@ -3995,6 +4524,14 @@ sysctl_cfil_filter_list(struct sysctl_oid *oidp, void *arg1, int arg2,
 
        cfil_rw_unlock_shared(&cfil_lck_rw);
 
+#if SHOW_DEBUG
+       if (req->oldptr != USER_ADDR_NULL) {
+               for (i = 1; content_filters != NULL && i <= MAX_CONTENT_FILTER; i++) {
+                       cfil_filter_show(i);
+               }
+       }
+#endif
+
        return (error);
 }
 
@@ -4046,6 +4583,10 @@ static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
                                memcpy(stat.cfs_e_uuid, so->last_uuid,
                                        sizeof(uuid_t));
                        }
+
+                       stat.cfs_sock_family = so->so_proto->pr_domain->dom_family;
+                       stat.cfs_sock_type = so->so_proto->pr_type;
+                       stat.cfs_sock_protocol = so->so_proto->pr_protocol;
                }
 
                stat.cfs_snd.cbs_pending_first =
@@ -4128,5 +4669,1388 @@ static int sysctl_cfil_sock_list(struct sysctl_oid *oidp, void *arg1, int arg2,
 done:
        cfil_rw_unlock_shared(&cfil_lck_rw);
 
+#if SHOW_DEBUG
+       if (req->oldptr != USER_ADDR_NULL) {
+               cfil_info_show();
+       }
+#endif
+
+       return (error);
+}
+
+/*
+ * UDP Socket Support
+ */
+static void
+cfil_hash_entry_log(int level, struct socket *so, struct cfil_hash_entry *entry, uint64_t sockId, const char* msg)
+{
+    char local[MAX_IPv6_STR_LEN+6];
+    char remote[MAX_IPv6_STR_LEN+6];
+    const void  *addr;
+
+       // No socket or no entry, no-op
+    if (so == NULL || entry == NULL) {
+        return;
+    }
+
+    local[0] = remote[0] = 0x0;
+
+    switch (entry->cfentry_family) {
+        case AF_INET6:
+            addr = &entry->cfentry_laddr.addr6;
+            inet_ntop(AF_INET6, addr, local, sizeof(local));
+            addr = &entry->cfentry_faddr.addr6;
+            inet_ntop(AF_INET6, addr, remote, sizeof(remote));
+            break;
+        case AF_INET:
+            addr = &entry->cfentry_laddr.addr46.ia46_addr4.s_addr;
+            inet_ntop(AF_INET, addr, local, sizeof(local));
+            addr = &entry->cfentry_faddr.addr46.ia46_addr4.s_addr;
+            inet_ntop(AF_INET, addr, remote, sizeof(remote));
+            break;
+        default:
+            return;
+    }
+    
+       CFIL_LOG(level, "<%s>: <UDP so %llx, entry %p, sockID %llu> lport %d fport %d laddr %s faddr %s",
+                        msg,
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), entry, sockId,
+                        ntohs(entry->cfentry_lport), ntohs(entry->cfentry_fport), local, remote);
+}
+
+static void
+cfil_inp_log(int level, struct socket *so, const char* msg)
+{
+    struct inpcb *inp = NULL;
+    char local[MAX_IPv6_STR_LEN+6];
+    char remote[MAX_IPv6_STR_LEN+6];
+    const void  *addr;
+
+    if (so == NULL) {
+        return;
+    }
+    
+    inp = sotoinpcb(so);
+    if (inp == NULL) {
+        return;
+    }
+    
+    local[0] = remote[0] = 0x0;
+
+#if INET6
+    if (inp->inp_vflag & INP_IPV6) {
+        addr = &inp->in6p_laddr.s6_addr32;
+        inet_ntop(AF_INET6, addr, local, sizeof(local));
+        addr = &inp->in6p_faddr.s6_addr32;
+        inet_ntop(AF_INET6, addr, remote, sizeof(remote));
+    } else
+#endif /* INET6 */
+    {
+        addr = &inp->inp_laddr.s_addr;
+        inet_ntop(AF_INET, addr, local, sizeof(local));
+        addr = &inp->inp_faddr.s_addr;
+        inet_ntop(AF_INET, addr, remote, sizeof(remote));
+    }
+
+       if (so->so_cfil != NULL)
+               CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x, sockID %llu> lport %d fport %d laddr %s faddr %s",
+                                msg, IS_UDP(so) ? "UDP" : "TCP",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags, so->so_cfil->cfi_sock_id,
+                                ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote);
+       else
+               CFIL_LOG(level, "<%s>: <%s so %llx - flags 0x%x 0x%x> lport %d fport %d laddr %s faddr %s",
+                                msg, IS_UDP(so) ? "UDP" : "TCP",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so), inp->inp_flags, inp->inp_socket->so_flags,
+                                ntohs(inp->inp_lport), ntohs(inp->inp_fport), local, remote);
+}
+
+static void
+cfil_info_log(int level, struct cfil_info *cfil_info, const char* msg)
+{
+       if (cfil_info == NULL)
+               return;
+
+       if (cfil_info->cfi_hash_entry != NULL)
+               cfil_hash_entry_log(level, cfil_info->cfi_so, cfil_info->cfi_hash_entry, cfil_info->cfi_sock_id, msg);
+       else
+               cfil_inp_log(level, cfil_info->cfi_so, msg);
+}
+
+errno_t
+cfil_db_init(struct socket *so)
+{
+    errno_t error = 0;
+    struct cfil_db *db = NULL;
+    
+    CFIL_LOG(LOG_INFO, "");
+    
+    db = zalloc(cfil_db_zone);
+    if (db == NULL) {
+        error = ENOMEM;
+        goto done;
+    }
+    bzero(db, sizeof(struct cfil_db));
+    db->cfdb_so = so;
+    db->cfdb_hashbase = hashinit(CFILHASHSIZE, M_CFIL, &db->cfdb_hashmask);
+    if (db->cfdb_hashbase == NULL) {
+        zfree(cfil_db_zone, db);
+        db = NULL;
+        error = ENOMEM;
+        goto done;
+    }
+
+    so->so_cfil_db = db;
+
+done:
+    return (error);
+}
+
+void
+cfil_db_free(struct socket *so)
+{
+    struct cfil_hash_entry *entry = NULL;
+    struct cfil_hash_entry *temp_entry = NULL;
+    struct cfilhashhead *cfilhash = NULL;
+    struct cfil_db *db = NULL;
+
+    CFIL_LOG(LOG_INFO, "");
+    
+    if (so == NULL || so->so_cfil_db == NULL) {
+        return;
+    }
+    db = so->so_cfil_db;
+
+#if LIFECYCLE_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: <so %llx, db %p> freeing db (count == %d)",
+             (uint64_t)VM_KERNEL_ADDRPERM(so), db, db->cfdb_count);
+#endif
+
+    for (int i = 0; i < CFILHASHSIZE; i++) {
+        cfilhash = &db->cfdb_hashbase[i];
+        LIST_FOREACH_SAFE(entry, cfilhash, cfentry_link, temp_entry) {
+            if (entry->cfentry_cfil != NULL) {
+#if LIFECYCLE_DEBUG
+                               cfil_info_log(LOG_ERR, entry->cfentry_cfil, "CFIL: LIFECYCLE: DB FREE CLEAN UP");
+#endif
+                cfil_info_free(entry->cfentry_cfil);
+                OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
+                entry->cfentry_cfil = NULL;
+            }
+
+            cfil_db_delete_entry(db, entry);
+            if (so->so_flags & SOF_CONTENT_FILTER) {
+                if (db->cfdb_count == 0)
+                    so->so_flags &= ~SOF_CONTENT_FILTER;
+                VERIFY(so->so_usecount > 0);
+                so->so_usecount--;
+            }
+        }
+    }
+
+    // Make sure all entries are cleaned up!
+    VERIFY(db->cfdb_count == 0);
+#if LIFECYCLE_DEBUG
+    CFIL_LOG(LOG_ERR, "CFIL: LIFECYCLE: so usecount %d", so->so_usecount);
+#endif
+
+    FREE(db->cfdb_hashbase, M_CFIL);
+    zfree(cfil_db_zone, db);
+    so->so_cfil_db = NULL;
+}
+
+static bool
+fill_cfil_hash_entry_from_address(struct cfil_hash_entry *entry, bool isLocal, struct sockaddr *addr)
+{
+    struct sockaddr_in *sin = NULL;
+    struct sockaddr_in6 *sin6 = NULL;
+    
+    if (entry == NULL || addr == NULL) {
+        return FALSE;
+    }
+    
+    switch (addr->sa_family) {
+        case AF_INET:
+            sin = satosin(addr);
+            if (sin->sin_len != sizeof(*sin)) {
+                return FALSE;
+            }
+            if (isLocal == TRUE) {
+                entry->cfentry_lport = sin->sin_port;
+                entry->cfentry_laddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr;
+            } else {
+                entry->cfentry_fport = sin->sin_port;
+                entry->cfentry_faddr.addr46.ia46_addr4.s_addr = sin->sin_addr.s_addr;
+            }
+            entry->cfentry_family = AF_INET;
+            return TRUE;
+        case AF_INET6:
+            sin6 = satosin6(addr);
+            if (sin6->sin6_len != sizeof(*sin6)) {
+                return FALSE;
+            }
+            if (isLocal == TRUE) {
+                entry->cfentry_lport = sin6->sin6_port;
+                entry->cfentry_laddr.addr6 = sin6->sin6_addr;
+            } else {
+                entry->cfentry_fport = sin6->sin6_port;
+                entry->cfentry_faddr.addr6 = sin6->sin6_addr;
+            }
+            entry->cfentry_family = AF_INET6;
+            return TRUE;
+        default:
+            return FALSE;
+    }
+}
+
+static bool
+fill_cfil_hash_entry_from_inp(struct cfil_hash_entry *entry, bool isLocal, struct inpcb *inp)
+{
+    if (entry == NULL || inp == NULL) {
+        return FALSE;
+    }
+    
+    if (inp->inp_vflag & INP_IPV4) {
+        if (isLocal == TRUE) {
+            entry->cfentry_lport = inp->inp_lport;
+            entry->cfentry_laddr.addr46.ia46_addr4.s_addr = inp->inp_laddr.s_addr;
+        } else {
+            entry->cfentry_fport = inp->inp_fport;
+            entry->cfentry_faddr.addr46.ia46_addr4.s_addr = inp->inp_faddr.s_addr;
+        }
+        entry->cfentry_family = AF_INET;
+        return TRUE;
+    } else if (inp->inp_vflag & INP_IPV6) {
+        if (isLocal == TRUE) {
+            entry->cfentry_lport = inp->inp_lport;
+            entry->cfentry_laddr.addr6 = inp->in6p_laddr;
+        } else {
+            entry->cfentry_fport = inp->inp_fport;
+            entry->cfentry_faddr.addr6 = inp->in6p_faddr;
+        }
+        entry->cfentry_family = AF_INET6;
+        return TRUE;
+    }
+    return FALSE;
+}
+
+bool
+check_port(struct sockaddr *addr, u_short port)
+{
+       struct sockaddr_in *sin = NULL;
+       struct sockaddr_in6 *sin6 = NULL;
+
+       if (addr == NULL || port == 0) {
+               return FALSE;
+       }
+
+       switch (addr->sa_family) {
+               case AF_INET:
+                       sin = satosin(addr);
+                       if (sin->sin_len != sizeof(*sin)) {
+                               return FALSE;
+                       }
+                       if (port == ntohs(sin->sin_port)) {
+                               return TRUE;
+                       }
+                       break;
+               case AF_INET6:
+                       sin6 = satosin6(addr);
+                       if (sin6->sin6_len != sizeof(*sin6)) {
+                               return FALSE;
+                       }
+                       if (port == ntohs(sin6->sin6_port)) {
+                               return TRUE;
+                       }
+                       break;
+               default:
+                       break;
+       }
+       return FALSE;
+}
+
+struct cfil_hash_entry *
+cfil_db_lookup_entry_with_sockid(struct cfil_db *db, u_int64_t sock_id)
+{
+       struct cfilhashhead *cfilhash = NULL;
+       u_int32_t flowhash = (u_int32_t)(sock_id & 0x0ffffffff);
+       struct cfil_hash_entry *nextentry;
+
+       if (db == NULL || db->cfdb_hashbase == NULL || sock_id == 0) {
+               return NULL;
+       }
+
+       flowhash &= db->cfdb_hashmask;
+       cfilhash = &db->cfdb_hashbase[flowhash];
+
+       LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
+               if (nextentry->cfentry_cfil != NULL &&
+                       nextentry->cfentry_cfil->cfi_sock_id == sock_id) {
+                       CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> matched <id %llu, hash %u>",
+                                        (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), nextentry->cfentry_cfil->cfi_sock_id, flowhash);
+                       cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, nextentry, 0, "CFIL: UDP found entry");
+                       return nextentry;
+               }
+       }
+
+       CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> NOT matched <id %llu, hash %u>",
+                        (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), sock_id, flowhash);
+       return NULL;
+}
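The lookup above derives the bucket from the low 32 bits of the socket ID alone, which suggests the flow hash is packed into the low word of cfi_sock_id when the ID is generated. A minimal sketch of that mapping, under exactly that assumption; the helper name is hypothetical, and hashmask is the value returned by hashinit() in cfil_db_init():

static inline u_int32_t
cfil_sockid_to_bucket(u_int64_t sock_id, u_long hashmask)
{
	/* Low 32 bits carry the CFIL_HASH flow hash (assumption implied by the mask above) */
	u_int32_t flowhash = (u_int32_t)(sock_id & 0x0ffffffff);
	/* hashmask is CFILHASHSIZE - 1, as produced by hashinit() */
	return (flowhash & (u_int32_t)hashmask);
}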
+
+struct cfil_hash_entry *
+cfil_db_lookup_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
+{
+    struct cfil_hash_entry matchentry;
+    struct cfil_hash_entry *nextentry = NULL;
+    struct inpcb *inp = sotoinpcb(db->cfdb_so);
+    u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
+    int inp_hash_element = 0;
+    struct cfilhashhead *cfilhash = NULL;
+    
+    CFIL_LOG(LOG_INFO, "");
+    
+    if (inp == NULL) {
+        goto done;
+    }
+    
+    if (local != NULL) {
+        fill_cfil_hash_entry_from_address(&matchentry, TRUE, local);
+    } else {
+        fill_cfil_hash_entry_from_inp(&matchentry, TRUE, inp);
+    }
+    if (remote != NULL) {
+        fill_cfil_hash_entry_from_address(&matchentry, FALSE, remote);
+    } else {
+        fill_cfil_hash_entry_from_inp(&matchentry, FALSE, inp);
+    }
+    
+#if INET6
+    if (inp->inp_vflag & INP_IPV6) {
+        hashkey_faddr = matchentry.cfentry_faddr.addr6.s6_addr32[3];
+        hashkey_laddr = matchentry.cfentry_laddr.addr6.s6_addr32[3];
+    } else
+#endif /* INET6 */
+    {
+        hashkey_faddr = matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr;
+        hashkey_laddr = matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr;
+    }
+
+    inp_hash_element = CFIL_HASH(hashkey_laddr, hashkey_faddr,
+                                 matchentry.cfentry_lport, matchentry.cfentry_fport);
+    inp_hash_element &= db->cfdb_hashmask;
+
+    cfilhash = &db->cfdb_hashbase[inp_hash_element];
+    
+    LIST_FOREACH(nextentry, cfilhash, cfentry_link) {
+        
+#if INET6
+        if ((inp->inp_vflag & INP_IPV6) &&
+            nextentry->cfentry_lport == matchentry.cfentry_lport &&
+            nextentry->cfentry_fport == matchentry.cfentry_fport &&
+            IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_laddr.addr6, &matchentry.cfentry_laddr.addr6) &&
+            IN6_ARE_ADDR_EQUAL(&nextentry->cfentry_faddr.addr6, &matchentry.cfentry_faddr.addr6)) {
+#if DATA_DEBUG
+            cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V6 found entry");
+#endif
+            return nextentry;
+        } else
+#endif /* INET6 */
+        if (nextentry->cfentry_lport == matchentry.cfentry_lport &&
+            nextentry->cfentry_fport == matchentry.cfentry_fport &&
+            nextentry->cfentry_laddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_laddr.addr46.ia46_addr4.s_addr &&
+            nextentry->cfentry_faddr.addr46.ia46_addr4.s_addr == matchentry.cfentry_faddr.addr46.ia46_addr4.s_addr) {
+#if DATA_DEBUG
+            cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP V4 found entry");
+#endif
+            return nextentry;
+        }
+    }
+    
+done:
+#if DATA_DEBUG
+    cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, &matchentry, 0, "CFIL LOOKUP ENTRY: UDP no entry found");
+#endif
+    return NULL;
+}
+
+void
+cfil_db_delete_entry(struct cfil_db *db, struct cfil_hash_entry *hash_entry)
+{
+    if (hash_entry == NULL)
+        return;
+
+    LIST_REMOVE(hash_entry, cfentry_link);
+    zfree(cfil_hash_entry_zone, hash_entry);
+    db->cfdb_count--;
+    if (db->cfdb_only_entry == hash_entry)
+        db->cfdb_only_entry = NULL;
+}
+
+struct cfil_hash_entry *
+cfil_db_add_entry(struct cfil_db *db, struct sockaddr *local, struct sockaddr *remote)
+{
+    struct cfil_hash_entry *entry = NULL;
+    struct inpcb *inp = sotoinpcb(db->cfdb_so);
+    u_int32_t hashkey_faddr = 0, hashkey_laddr = 0;
+    int inp_hash_element = 0;
+    struct cfilhashhead *cfilhash = NULL;
+    
+    CFIL_LOG(LOG_INFO, "");
+    
+    if (inp == NULL) {
+        goto done;
+    }
+    
+    entry = zalloc(cfil_hash_entry_zone);
+    if (entry == NULL) {
+        goto done;
+    }
+    bzero(entry, sizeof(struct cfil_hash_entry));
+    
+    if (local != NULL) {
+        fill_cfil_hash_entry_from_address(entry, TRUE, local);
+    } else {
+        fill_cfil_hash_entry_from_inp(entry, TRUE, inp);
+    }
+    if (remote != NULL) {
+        fill_cfil_hash_entry_from_address(entry, FALSE, remote);
+    } else {
+        fill_cfil_hash_entry_from_inp(entry, FALSE, inp);
+    }
+    entry->cfentry_lastused = net_uptime();
+
+#if INET6
+    if (inp->inp_vflag & INP_IPV6) {
+        hashkey_faddr = entry->cfentry_faddr.addr6.s6_addr32[3];
+        hashkey_laddr = entry->cfentry_laddr.addr6.s6_addr32[3];
+    } else
+#endif /* INET6 */
+    {
+        hashkey_faddr = entry->cfentry_faddr.addr46.ia46_addr4.s_addr;
+        hashkey_laddr = entry->cfentry_laddr.addr46.ia46_addr4.s_addr;
+    }
+    entry->cfentry_flowhash = CFIL_HASH(hashkey_laddr, hashkey_faddr,
+                                        entry->cfentry_lport, entry->cfentry_fport);
+    inp_hash_element = entry->cfentry_flowhash & db->cfdb_hashmask;
+
+    cfilhash = &db->cfdb_hashbase[inp_hash_element];
+    
+    LIST_INSERT_HEAD(cfilhash, entry, cfentry_link);
+    db->cfdb_count++;
+       db->cfdb_only_entry = entry;
+       cfil_hash_entry_log(LOG_DEBUG, db->cfdb_so, entry, 0, "CFIL: cfil_db_add_entry: ADDED");
+    
+done:
+    CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> total count %d", (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), db->cfdb_count);
+    return entry;
+}
+
+struct cfil_info *
+cfil_db_get_cfil_info(struct cfil_db *db, cfil_sock_id_t id)
+{
+    struct cfil_hash_entry *hash_entry = NULL;
+
+    CFIL_LOG(LOG_INFO, "");
+
+    if (db == NULL || id == 0) {
+        CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> NULL DB <id %llu>",
+                 (uint64_t)VM_KERNEL_ADDRPERM(db->cfdb_so), id);
+        return NULL;
+    }
+
+       // Optimization for a connected UDP socket, which has only one flow:
+       // no need to do the hash lookup.
+       if (db->cfdb_count == 1) {
+               if (db->cfdb_only_entry && db->cfdb_only_entry->cfentry_cfil &&
+                       db->cfdb_only_entry->cfentry_cfil->cfi_sock_id == id) {
+                       return (db->cfdb_only_entry->cfentry_cfil);
+               }
+       }
+
+       hash_entry = cfil_db_lookup_entry_with_sockid(db, id);
+       return (hash_entry != NULL ? hash_entry->cfentry_cfil : NULL);
+}
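cfil_db_get_cfil_info() is the UDP counterpart of the TCP path's so->so_cfil: given a flow's sock ID (for example, one carried in a filter verdict), it returns the matching per-flow cfil_info, short-circuiting the hash walk when the database holds a single connected flow. A hedged usage sketch; the wrapper name is illustrative, and the socket is assumed to be locked, as the surrounding code asserts:

/* Illustrative only: map a verdict's sock ID back to its UDP flow state. */
static struct cfil_info *
example_resolve_udp_flow(struct socket *so, cfil_sock_id_t sock_id)
{
	if (so == NULL || so->so_cfil_db == NULL)
		return (NULL);
	/* Caller holds the socket lock; the db is protected by it. */
	return (cfil_db_get_cfil_info(so->so_cfil_db, sock_id));
}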
+
+struct cfil_hash_entry *
+cfil_sock_udp_get_flow(struct socket *so, uint32_t filter_control_unit, bool outgoing, struct sockaddr *local, struct sockaddr *remote)
+{
+#pragma unused(so, filter_control_unit, outgoing, local, remote)
+       struct cfil_hash_entry *hash_entry = NULL;
+
+       errno_t error = 0;
+    socket_lock_assert_owned(so);
+
+       // If new socket, allocate cfil db
+       if (so->so_cfil_db == NULL) {
+               if (cfil_db_init(so) != 0) {
+                       return (NULL);
+               }
+       }
+
+    // See if flow already exists.
+    hash_entry = cfil_db_lookup_entry(so->so_cfil_db, local, remote);
+    if (hash_entry != NULL) {
+               return (hash_entry);
+    }
+
+    hash_entry = cfil_db_add_entry(so->so_cfil_db, local, remote);
+    if (hash_entry == NULL) {
+        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
+        CFIL_LOG(LOG_ERR, "CFIL: UDP failed to add entry");
+               return (NULL);
+    }
+
+    if (cfil_info_alloc(so, hash_entry) == NULL ||
+        hash_entry->cfentry_cfil == NULL) {
+        cfil_db_delete_entry(so->so_cfil_db, hash_entry);
+        CFIL_LOG(LOG_ERR, "CFIL: UDP failed to alloc cfil_info");
+        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_no_mem);
+        return (NULL);
+    }
+
+#if LIFECYCLE_DEBUG
+       cfil_info_log(LOG_ERR, hash_entry->cfentry_cfil, "CFIL: LIFECYCLE: ADDED");
+#endif
+
+    if (cfil_info_attach_unit(so, filter_control_unit, hash_entry->cfentry_cfil) == 0) {
+               CFIL_LOG(LOG_ERR, "CFIL: UDP cfil_info_attach_unit(%u) failed",
+                 filter_control_unit);
+        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_failed);
+               return (NULL);
+    }
+    CFIL_LOG(LOG_DEBUG, "CFIL: UDP <so %llx> filter_control_unit %u sockID %llu attached",
+             (uint64_t)VM_KERNEL_ADDRPERM(so),
+             filter_control_unit, hash_entry->cfentry_cfil->cfi_sock_id);
+    
+    so->so_flags |= SOF_CONTENT_FILTER;
+    OSIncrementAtomic(&cfil_stats.cfs_sock_attached);
+
+    /* Hold a reference on the socket for each flow */
+    so->so_usecount++;
+    
+    error = cfil_dispatch_attach_event(so, hash_entry->cfentry_cfil, filter_control_unit);
+    /* We can recover from flow control or out of memory errors */
+    if (error != 0 && error != ENOBUFS && error != ENOMEM)
+               return (NULL);
+
+    CFIL_INFO_VERIFY(hash_entry->cfentry_cfil);
+       return (hash_entry);
+}
+
+errno_t
+cfil_sock_udp_handle_data(bool outgoing, struct socket *so,
+                          struct sockaddr *local, struct sockaddr *remote,
+                          struct mbuf *data, struct mbuf *control, uint32_t flags)
+{
+#pragma unused(outgoing, so, local, remote, data, control, flags)
+    errno_t error = 0;
+    uint32_t filter_control_unit;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_info *cfil_info = NULL;
+
+    socket_lock_assert_owned(so);
+
+    if (cfil_active_count == 0) {
+        CFIL_LOG(LOG_DEBUG, "CFIL: UDP no active filter");
+        OSIncrementAtomic(&cfil_stats.cfs_sock_attach_in_vain);
+        return (error);
+    }
+    
+    filter_control_unit = necp_socket_get_content_filter_control_unit(so);
+    if (filter_control_unit == 0) {
+        CFIL_LOG(LOG_DEBUG, "CFIL: UDP failed to get control unit");
+        return (error);
+    }
+
+    if ((filter_control_unit & NECP_MASK_USERSPACE_ONLY) != 0) {
+        CFIL_LOG(LOG_DEBUG, "CFIL: UDP user space only");
+        OSIncrementAtomic(&cfil_stats.cfs_sock_userspace_only);
+        return (error);
+    }
+       
+    hash_entry = cfil_sock_udp_get_flow(so, filter_control_unit, outgoing, local, remote);
+    if (hash_entry == NULL || hash_entry->cfentry_cfil == NULL) {
+               CFIL_LOG(LOG_ERR, "CFIL: Failed to create UDP flow");
+        return (EPIPE);
+    }
+       // Update last-used timestamp; this feeds the flow idle timeout
+       hash_entry->cfentry_lastused = net_uptime();
+       cfil_info = hash_entry->cfentry_cfil;
+
+       if (cfil_info->cfi_flags & CFIF_DROP) {
+#if DATA_DEBUG
+               cfil_hash_entry_log(LOG_DEBUG, so, hash_entry, 0, "CFIL: UDP DROP");
+#endif
+               return (EPIPE);
+       }
+       if (control != NULL) {
+               OSIncrementAtomic(&cfil_stats.cfs_data_in_control);
+       }
+       if (data->m_type == MT_OOBDATA) {
+               CFIL_LOG(LOG_ERR, "so %llx MSG_OOB",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so));
+               OSIncrementAtomic(&cfil_stats.cfs_data_in_oob);
+       }
+
+       error = cfil_data_common(so, cfil_info, outgoing, remote, data, control, flags);
+
+       return (error);
+}
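Together with the IS_UDP() branches added to cfil_sock_data_out() and cfil_sock_data_in() above, this gives the UDP data path: look up or create the per-flow hash entry, refresh its last-used timestamp, then feed the mbuf chain into the same cfil_data_common() engine the TCP path uses. A compressed sketch of the outgoing call, mirroring cfil_sock_data_out(); the wrapper itself is illustrative:

/* Illustrative only: how an outgoing UDP datagram enters the content filter. */
static errno_t
example_udp_data_out(struct socket *so, struct sockaddr *to,
    struct mbuf *data, struct mbuf *control, uint32_t flags)
{
	if (!IS_UDP(so))
		return (0);	/* TCP sockets keep using so->so_cfil */
	/* Local address is NULL: it is filled in from the inpcb when the flow is created. */
	return (cfil_sock_udp_handle_data(TRUE, so, NULL, to, data, control, flags));
}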
+
+/*
+ * Go through all UDP flows for the specified socket and return TRUE if
+ * any flow is still attached.  If need_wait is TRUE, wait on the first
+ * attached flow.
+ */
+static int
+cfil_filters_udp_attached(struct socket *so, bool need_wait)
+{
+       struct timespec ts;
+       lck_mtx_t *mutex_held;
+       struct cfilhashhead *cfilhash = NULL;
+       struct cfil_db *db = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_hash_entry *temp_hash_entry = NULL;
+       struct cfil_info *cfil_info = NULL;
+       struct cfil_entry *entry = NULL;
+       errno_t error = 0;
+       int kcunit;
+       int attached = 0;
+
+       socket_lock_assert_owned(so);
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
+
+               if (so->so_proto->pr_getlock != NULL)
+                       mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
+               else
+                       mutex_held = so->so_proto->pr_domain->dom_mtx;
+               LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
+
+               db = so->so_cfil_db;
+
+               for (int i = 0; i < CFILHASHSIZE; i++) {
+                       cfilhash = &db->cfdb_hashbase[i];
+
+                       LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
+
+                               if (hash_entry->cfentry_cfil != NULL) {
+
+                                       cfil_info = hash_entry->cfentry_cfil;
+                                       for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+                                               entry = &cfil_info->cfi_entries[kcunit - 1];
+
+                                               /* Are we attached to the filter? */
+                                               if (entry->cfe_filter == NULL) {
+                                                       continue;
+                                               }
+
+                                               if ((entry->cfe_flags & CFEF_SENT_SOCK_ATTACHED) == 0)
+                                                       continue;
+                                               if ((entry->cfe_flags & CFEF_CFIL_DETACHED) != 0)
+                                                       continue;
+
+                                               attached = 1;
+
+                                               if (need_wait == TRUE) {
+#if LIFECYCLE_DEBUG
+                                                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TO FINISH");
+#endif
+
+                                                       ts.tv_sec = cfil_close_wait_timeout / 1000;
+                                                       ts.tv_nsec = (cfil_close_wait_timeout % 1000) *
+                                                       NSEC_PER_USEC * 1000;
+
+                                                       OSIncrementAtomic(&cfil_stats.cfs_close_wait);
+                                                       cfil_info->cfi_flags |= CFIF_CLOSE_WAIT;
+                                                       error = msleep((caddr_t)cfil_info, mutex_held,
+                                                                                  PSOCK | PCATCH, "cfil_filters_udp_attached", &ts);
+                                                       cfil_info->cfi_flags &= ~CFIF_CLOSE_WAIT;
+
+#if LIFECYCLE_DEBUG
+                                                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW DONE");
+#endif
+
+                                                       /*
+                                                        * Force close in case of timeout
+                                                        */
+                                                       if (error != 0) {
+                                                               OSIncrementAtomic(&cfil_stats.cfs_close_wait_timeout);
+#if LIFECYCLE_DEBUG
+                                                               cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: WAIT FOR FLOW TIMED OUT, FORCE DETACH");
+#endif
+                                                               entry->cfe_flags |= CFEF_CFIL_DETACHED;
+                                                               break;
+                                                       }
+                                               }
+                                               goto done;
+                                       }
+                               }
+                       }
+               }
+       }
+
+done:
+       return (attached);
+}
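The close-wait sleep above converts cfil_close_wait_timeout, which the division by 1000 suggests is expressed in milliseconds, into a timespec before msleep(). The same conversion as a standalone sketch, with a worked value: 1500 ms becomes tv_sec = 1 and tv_nsec = 500,000,000; the helper name is hypothetical:

/* Sketch: millisecond timeout to struct timespec, as done in cfil_filters_udp_attached(). */
static struct timespec
example_ms_to_timespec(uint32_t timeout_ms)
{
	struct timespec ts;
	ts.tv_sec = timeout_ms / 1000;
	ts.tv_nsec = (timeout_ms % 1000) * NSEC_PER_USEC * 1000;	/* remainder in nanoseconds */
	return (ts);
}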
+
+int32_t
+cfil_sock_udp_data_pending(struct sockbuf *sb, bool check_thread)
+{
+       struct socket *so = sb->sb_so;
+       struct cfi_buf *cfi_buf;
+       uint64_t pending = 0;
+       uint64_t total_pending = 0;
+       struct cfilhashhead *cfilhash = NULL;
+       struct cfil_db *db = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_hash_entry *temp_hash_entry = NULL;
+
+       socket_lock_assert_owned(so);
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL &&
+               (check_thread == FALSE || so->so_snd.sb_cfil_thread != current_thread())) {
+
+               db = so->so_cfil_db;
+
+               for (int i = 0; i < CFILHASHSIZE; i++) {
+                       cfilhash = &db->cfdb_hashbase[i];
+
+                       LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
+
+                               if (hash_entry->cfentry_cfil != NULL) {
+                                       if ((sb->sb_flags & SB_RECV) == 0)
+                                               cfi_buf = &hash_entry->cfentry_cfil->cfi_snd;
+                                       else
+                                               cfi_buf = &hash_entry->cfentry_cfil->cfi_rcv;
+
+                                       pending = cfi_buf->cfi_pending_last - cfi_buf->cfi_pending_first;
+                                       /*
+                                        * If we are limited by the "chars of mbufs used", adjust
+                                        * roughly so we won't overcommit
+                                        */
+                                       if ((uint64_t)cfi_buf->cfi_pending_mbcnt > pending)
+                                               pending = cfi_buf->cfi_pending_mbcnt;
+
+                                       total_pending += pending;
+                               }
+                       }
+               }
+
+               VERIFY(total_pending < INT32_MAX);
+#if DATA_DEBUG
+               CFIL_LOG(LOG_DEBUG, "CFIL: <so %llx> total pending %llu <check_thread %d>",
+                                (uint64_t)VM_KERNEL_ADDRPERM(so),
+                                total_pending, check_thread);
+#endif
+       }
+
+       return (int32_t)(total_pending);
+}
+
+int
+cfil_sock_udp_notify_shutdown(struct socket *so, int how, int drop_flag, int shut_flag)
+{
+       struct cfil_info *cfil_info = NULL;
+       struct cfilhashhead *cfilhash = NULL;
+       struct cfil_db *db = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_hash_entry *temp_hash_entry = NULL;
+       errno_t error = 0;
+       int done_count = 0;
+       int kcunit;
+
+       socket_lock_assert_owned(so);
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
+
+               db = so->so_cfil_db;
+
+               for (int i = 0; i < CFILHASHSIZE; i++) {
+                       cfilhash = &db->cfdb_hashbase[i];
+
+                       LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
+
+                               if (hash_entry->cfentry_cfil != NULL) {
+                                       cfil_info = hash_entry->cfentry_cfil;
+
+                                       // This flow is marked as DROP
+                                       if (cfil_info->cfi_flags & drop_flag) {
+                                               done_count++;
+                                               continue;
+                                       }
+
+                                       // This flow has been shut already, skip
+                                       if (cfil_info->cfi_flags & shut_flag) {
+                                               continue;
+                                       }
+                                       // Mark flow as shut
+                                       cfil_info->cfi_flags |= shut_flag;
+                                       done_count++;
+
+                                       for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+                                               /* Disconnect incoming side */
+                                               if (how != SHUT_WR) {
+                                                       error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 0);
+                                               }
+                                               /* Disconnect outgoing side */
+                                               if (how != SHUT_RD) {
+                                                       error = cfil_dispatch_disconnect_event(so, cfil_info, kcunit, 1);
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       if (done_count == 0) {
+               error = ENOTCONN;
+       }
        return (error);
 }
+
+int
+cfil_sock_udp_shutdown(struct socket *so, int *how)
+{
+       int error = 0;
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) == 0 || (so->so_cfil_db == NULL))
+               goto done;
+
+       socket_lock_assert_owned(so);
+
+       CFIL_LOG(LOG_INFO, "so %llx how %d",
+                        (uint64_t)VM_KERNEL_ADDRPERM(so), *how);
+
+       /*
+        * Check the state of the socket before the content filter
+        */
+       if (*how != SHUT_WR && (so->so_state & SS_CANTRCVMORE) != 0) {
+               /* read already shut down */
+               error = ENOTCONN;
+               goto done;
+       }
+       if (*how != SHUT_RD && (so->so_state & SS_CANTSENDMORE) != 0) {
+               /* write already shut down */
+               error = ENOTCONN;
+               goto done;
+       }
+
+       /*
+        * shutdown read: SHUT_RD or SHUT_RDWR
+        */
+       if (*how != SHUT_WR) {
+               error = cfil_sock_udp_notify_shutdown(so, SHUT_RD, CFIF_DROP, CFIF_SHUT_RD);
+               if (error != 0)
+                       goto done;
+       }
+       /*
+        * shutdown write: SHUT_WR or SHUT_RDWR
+        */
+       if (*how != SHUT_RD) {
+               error = cfil_sock_udp_notify_shutdown(so, SHUT_WR, CFIF_DROP, CFIF_SHUT_WR);
+               if (error != 0)
+                       goto done;
+
+               /*
+                * When outgoing data is pending, we delay the shutdown at the
+                * protocol level until the content filters give the final
+                * verdict on the pending data.
+                */
+               if (cfil_sock_data_pending(&so->so_snd) != 0) {
+                       /*
+                        * When shutting down the read and write sides at once
+                        * we can proceed to the final shutdown of the read
+                        * side. Otherwise, we just return.
+                        */
+                       if (*how == SHUT_WR) {
+                               error = EJUSTRETURN;
+                       } else if (*how == SHUT_RDWR) {
+                               *how = SHUT_RD;
+                       }
+               }
+       }
+done:
+       return (error);
+}
+
+void
+cfil_sock_udp_close_wait(struct socket *so)
+{
+       socket_lock_assert_owned(so);
+
+       while (cfil_filters_udp_attached(so, FALSE)) {
+               /*
+                * Notify the filters we are going away so they can detach
+                */
+               cfil_sock_udp_notify_shutdown(so, SHUT_RDWR, 0, 0);
+
+               /*
+                * Make sure we still need to wait after the filters are notified
+                * of the disconnection
+                */
+               if (cfil_filters_udp_attached(so, TRUE) == 0)
+                       break;
+       }
+}
+
+void
+cfil_sock_udp_is_closed(struct socket *so)
+{
+       struct cfil_info *cfil_info = NULL;
+       struct cfilhashhead *cfilhash = NULL;
+       struct cfil_db *db = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_hash_entry *temp_hash_entry = NULL;
+       errno_t error = 0;
+       int kcunit;
+
+       socket_lock_assert_owned(so);
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
+
+               db = so->so_cfil_db;
+
+               for (int i = 0; i < CFILHASHSIZE; i++) {
+                       cfilhash = &db->cfdb_hashbase[i];
+
+                       LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
+                               if (hash_entry->cfentry_cfil != NULL) {
+
+                                       cfil_info = hash_entry->cfentry_cfil;
+
+                                       for (kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+                                               /* Let the filters know of the closing */
+                                               error = cfil_dispatch_closed_event(so, cfil_info, kcunit);
+                                       }
+
+                                       /* Last chance to push passed data out */
+                                       error = cfil_acquire_sockbuf(so, cfil_info, 1);
+                                       if (error == 0)
+                                               cfil_service_inject_queue(so, cfil_info, 1);
+                                       cfil_release_sockbuf(so, 1);
+
+                                       cfil_info->cfi_flags |= CFIF_SOCK_CLOSED;
+
+                                       /* Pending data needs to go */
+                                       cfil_flush_queues(so, cfil_info);
+
+                                       CFIL_INFO_VERIFY(cfil_info);
+                               }
+                       }
+               }
+       }
+}
+
+void
+cfil_sock_udp_buf_update(struct sockbuf *sb)
+{
+       struct cfil_info *cfil_info = NULL;
+       struct cfilhashhead *cfilhash = NULL;
+       struct cfil_db *db = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+       struct cfil_hash_entry *temp_hash_entry = NULL;
+       errno_t error = 0;
+       int outgoing;
+       struct socket *so = sb->sb_so;
+
+       socket_lock_assert_owned(so);
+
+       if ((so->so_flags & SOF_CONTENT_FILTER) != 0 && so->so_cfil_db != NULL) {
+
+               if (!cfil_sbtrim)
+                       return;
+
+               db = so->so_cfil_db;
+
+               for (int i = 0; i < CFILHASHSIZE; i++) {
+                       cfilhash = &db->cfdb_hashbase[i];
+
+                       LIST_FOREACH_SAFE(hash_entry, cfilhash, cfentry_link, temp_hash_entry) {
+                               if (hash_entry->cfentry_cfil != NULL) {
+
+                                       cfil_info = hash_entry->cfentry_cfil;
+
+                                       if ((sb->sb_flags & SB_RECV) == 0) {
+                                               if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT) == 0)
+                                                       return;
+                                               outgoing = 1;
+                                               OSIncrementAtomic(&cfil_stats.cfs_inject_q_out_retry);
+                                       } else {
+                                               if ((cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN) == 0)
+                                                       return;
+                                               outgoing = 0;
+                                               OSIncrementAtomic(&cfil_stats.cfs_inject_q_in_retry);
+                                       }
+
+                                       CFIL_LOG(LOG_NOTICE, "so %llx outgoing %d",
+                                                        (uint64_t)VM_KERNEL_ADDRPERM(so), outgoing);
+
+                                       error = cfil_acquire_sockbuf(so, cfil_info, outgoing);
+                                       if (error == 0)
+                                               cfil_service_inject_queue(so, cfil_info, outgoing);
+                                       cfil_release_sockbuf(so, outgoing);
+                               }
+                       }
+               }
+       }
+}
+
+void
+cfil_filter_show(u_int32_t kcunit)
+{
+       struct content_filter *cfc = NULL;
+       struct cfil_entry *entry;
+       int count = 0;
+
+       if (content_filters == NULL) {
+               return;
+       }
+       if (kcunit > MAX_CONTENT_FILTER) {
+               return;
+       }
+
+       cfil_rw_lock_shared(&cfil_lck_rw);
+
+       if (content_filters[kcunit - 1] == NULL) {
+               cfil_rw_unlock_shared(&cfil_lck_rw);
+               return;
+       }
+       cfc = content_filters[kcunit - 1];
+
+       CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter <unit %d, entry count %d> flags <%lx>:",
+                        kcunit, cfc->cf_sock_count, (unsigned long)cfc->cf_flags);
+       if (cfc->cf_flags & CFF_DETACHING)
+               CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - DETACHING");
+       if (cfc->cf_flags & CFF_ACTIVE)
+               CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - ACTIVE");
+       if (cfc->cf_flags & CFF_FLOW_CONTROLLED)
+               CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: - FLOW CONTROLLED");
+
+       TAILQ_FOREACH(entry, &cfc->cf_sock_entries, cfe_link) {
+
+               if (entry->cfe_cfil_info && entry->cfe_cfil_info->cfi_so) {
+                       struct cfil_info *cfil_info = entry->cfe_cfil_info;
+
+                       count++;
+
+                       if (entry->cfe_flags & CFEF_CFIL_DETACHED)
+                               cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - DETACHED");
+                       else
+                               cfil_info_log(LOG_ERR, cfil_info, "CFIL: FILTER SHOW: - ATTACHED");
+               }
+       }
+
+       CFIL_LOG(LOG_ERR, "CFIL: FILTER SHOW: Filter - total entries shown: %d", count);
+
+       cfil_rw_unlock_shared(&cfil_lck_rw);
+
+}
+
+void
+cfil_info_show(void)
+{
+       struct cfil_info *cfil_info;
+       int count = 0;
+
+       cfil_rw_lock_shared(&cfil_lck_rw);
+
+       CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: count %d", cfil_sock_attached_count);
+
+       TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) {
+
+               count++;
+
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: INFO SHOW");
+
+               if (cfil_info->cfi_flags & CFIF_DROP)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - DROP");
+               if (cfil_info->cfi_flags & CFIF_CLOSE_WAIT)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - CLOSE_WAIT");
+               if (cfil_info->cfi_flags & CFIF_SOCK_CLOSED)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SOCK_CLOSED");
+               if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_IN)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_IN");
+               if (cfil_info->cfi_flags & CFIF_RETRY_INJECT_OUT)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - RETRY_INJECT_OUT");
+               if (cfil_info->cfi_flags & CFIF_SHUT_WR)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_WR");
+               if (cfil_info->cfi_flags & CFIF_SHUT_RD)
+                       CFIL_LOG(LOG_ERR, "CFIL: INFO FLAG - SHUT_RD");
+       }
+
+       CFIL_LOG(LOG_ERR, "CFIL: INFO SHOW: total cfil_info shown: %d", count);
+
+       cfil_rw_unlock_shared(&cfil_lck_rw);
+}
+
+bool
+cfil_info_idle_timed_out(struct cfil_info *cfil_info, int timeout, u_int32_t current_time)
+{
+       if (cfil_info && cfil_info->cfi_hash_entry &&
+               (current_time - cfil_info->cfi_hash_entry->cfentry_lastused >= (u_int32_t)timeout)) {
+#if GC_DEBUG
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow IDLE timeout expired");
+#endif
+               return true;
+       }
+       return false;
+}
+
+bool
+cfil_info_action_timed_out(struct cfil_info *cfil_info, int timeout)
+{
+       struct cfil_entry *entry;
+       struct timeval current_tv;
+       struct timeval diff_time;
+
+       if (cfil_info == NULL)
+               return false;
+
+       /*
+        * If we have queued up more data than passed offset and we haven't received
+        * an action from user space for a while (the user space filter might have crashed),
+        * return action timed out.
+        */
+       if (cfil_info->cfi_snd.cfi_pending_last > cfil_info->cfi_snd.cfi_pass_offset ||
+               cfil_info->cfi_rcv.cfi_pending_last > cfil_info->cfi_rcv.cfi_pass_offset) {
+
+               microuptime(&current_tv);
+
+               for (int kcunit = 1; kcunit <= MAX_CONTENT_FILTER; kcunit++) {
+                       entry = &cfil_info->cfi_entries[kcunit - 1];
+
+                       if (entry->cfe_filter == NULL)
+                               continue;
+
+                       if (cfil_info->cfi_snd.cfi_pending_last > entry->cfe_snd.cfe_pass_offset ||
+                               cfil_info->cfi_rcv.cfi_pending_last > entry->cfe_rcv.cfe_pass_offset) {
+                               // haven't gotten an action from this filter, check timeout
+                               timersub(&current_tv, &entry->cfe_last_action, &diff_time);
+                               if (diff_time.tv_sec >= timeout) {
+#if GC_DEBUG
+                                       cfil_info_log(LOG_ERR, cfil_info, "CFIL: flow ACTION timeout expired");
+#endif
+                                       return true;
+                               }
+                       }
+               }
+       }
+       return false;
+}
+
+bool
+cfil_info_buffer_threshold_exceeded(struct cfil_info *cfil_info)
+{
+       if (cfil_info == NULL)
+               return false;
+
+       /*
+        * Clean up flow if it exceeded queue thresholds
+        */
+       if (cfil_info->cfi_snd.cfi_tail_drop_cnt ||
+               cfil_info->cfi_rcv.cfi_tail_drop_cnt) {
+#if GC_DEBUG
+               CFIL_LOG(LOG_ERR, "CFIL: queue threshold exceeded: mbuf max <count: %d bytes: %d> tail drop count <OUT: %d IN: %d>",
+                                cfil_udp_gc_mbuf_num_max,
+                                cfil_udp_gc_mbuf_cnt_max,
+                                cfil_info->cfi_snd.cfi_tail_drop_cnt,
+                                cfil_info->cfi_rcv.cfi_tail_drop_cnt);
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: queue threshold exceeded");
+#endif
+               return true;
+       }
+
+       return false;
+}
+
+static void
+cfil_udp_gc_thread_sleep(bool forever)
+{
+       if (forever) {
+               (void) assert_wait((event_t) &cfil_sock_udp_attached_count,
+                                                  THREAD_INTERRUPTIBLE);
+       } else {
+               uint64_t deadline = 0;
+               nanoseconds_to_absolutetime(UDP_FLOW_GC_RUN_INTERVAL_NSEC, &deadline);
+               clock_absolutetime_interval_to_deadline(deadline, &deadline);
+
+               (void) assert_wait_deadline(&cfil_sock_udp_attached_count,
+                                                                       THREAD_INTERRUPTIBLE, deadline);
+       }
+}
+
+static void
+cfil_udp_gc_thread_func(void *v, wait_result_t w)
+{
+#pragma unused(v, w)
+
+       ASSERT(cfil_udp_gc_thread == current_thread());
+       thread_set_thread_name(current_thread(), "CFIL_UDP_GC");
+
+       // Kick off gc shortly
+       cfil_udp_gc_thread_sleep(false);
+       thread_block_parameter((thread_continue_t) cfil_info_udp_expire, NULL);
+       /* NOTREACHED */
+}
+
+static void
+cfil_info_udp_expire(void *v, wait_result_t w)
+{
+#pragma unused(v, w)
+
+       static uint64_t expired_array[UDP_FLOW_GC_MAX_COUNT];
+       static uint32_t expired_count = 0;
+
+       struct cfil_info *cfil_info;
+       struct cfil_hash_entry *hash_entry;
+       struct cfil_db *db;
+       struct socket *so;
+       u_int32_t current_time = 0;
+
+       current_time = net_uptime();
+
+       // Get all expired UDP flow ids
+       cfil_rw_lock_shared(&cfil_lck_rw);
+
+       if (cfil_sock_udp_attached_count == 0) {
+               cfil_rw_unlock_shared(&cfil_lck_rw);
+               goto go_sleep;
+       }
+
+       TAILQ_FOREACH(cfil_info, &cfil_sock_head, cfi_link) {
+               if (expired_count >= UDP_FLOW_GC_MAX_COUNT)
+                       break;
+
+               if (IS_UDP(cfil_info->cfi_so)) {
+                       if (cfil_info_idle_timed_out(cfil_info, UDP_FLOW_GC_IDLE_TO, current_time) ||
+                               cfil_info_action_timed_out(cfil_info, UDP_FLOW_GC_ACTION_TO) ||
+                               cfil_info_buffer_threshold_exceeded(cfil_info)) {
+                               expired_array[expired_count] = cfil_info->cfi_sock_id;
+                               expired_count++;
+                       }
+               }
+       }
+       cfil_rw_unlock_shared(&cfil_lck_rw);
+
+       if (expired_count == 0)
+               goto go_sleep;
+
+       for (uint32_t i = 0; i < expired_count; i++) {
+
+               // Search for socket (UDP only and lock so)
+               so = cfil_socket_from_sock_id(expired_array[i], true);
+               if (so == NULL) {
+                       continue;
+               }
+
+               cfil_info = cfil_db_get_cfil_info(so->so_cfil_db, expired_array[i]);
+               if (cfil_info == NULL) {
+                       goto unlock;
+               }
+
+               db = so->so_cfil_db;
+               hash_entry = cfil_info->cfi_hash_entry;
+
+               if (db == NULL || hash_entry == NULL) {
+                       goto unlock;
+               }
+
+#if GC_DEBUG || LIFECYCLE_DEBUG
+               cfil_info_log(LOG_ERR, cfil_info, "CFIL: LIFECYCLE: GC CLEAN UP");
+#endif
+
+               cfil_db_delete_entry(db, hash_entry);
+               cfil_info_free(cfil_info);
+               OSIncrementAtomic(&cfil_stats.cfs_sock_detached);
+
+               if (so->so_flags & SOF_CONTENT_FILTER) {
+                       if (db->cfdb_count == 0)
+                               so->so_flags &= ~SOF_CONTENT_FILTER;
+                       VERIFY(so->so_usecount > 0);
+                       so->so_usecount--;
+               }
+unlock:
+               socket_unlock(so, 1);
+       }
+
+#if GC_DEBUG
+       CFIL_LOG(LOG_ERR, "CFIL: UDP flow idle timeout check: expired %d idle flows", expired_count);
+#endif
+       expired_count = 0;
+
+go_sleep:
+
+       // Sleep forever (until woken up) if there are no more UDP flows to clean
+       cfil_rw_lock_shared(&cfil_lck_rw);
+       cfil_udp_gc_thread_sleep(cfil_sock_udp_attached_count == 0 ? true : false);
+       cfil_rw_unlock_shared(&cfil_lck_rw);
+       thread_block_parameter((thread_continue_t)cfil_info_udp_expire, NULL);
+       /* NOTREACHED */
+}
+
+struct m_tag *
+cfil_udp_save_socket_state(struct cfil_info *cfil_info, struct mbuf *m)
+{
+       struct m_tag *tag = NULL;
+       struct cfil_tag *ctag = NULL;
+       struct cfil_hash_entry *hash_entry = NULL;
+
+       if (cfil_info == NULL || cfil_info->cfi_so == NULL ||
+               cfil_info->cfi_hash_entry == NULL || m == NULL || !(m->m_flags & M_PKTHDR)) {
+               return NULL;
+       }
+
+       /* Allocate a tag */
+       tag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP,
+                                          sizeof(struct cfil_tag), M_DONTWAIT, m);
+
+       if (tag) {
+               ctag = (struct cfil_tag*)(tag + 1);
+               ctag->cfil_so_state_change_cnt = cfil_info->cfi_so->so_state_change_cnt;
+               ctag->cfil_so_options = cfil_info->cfi_so->so_options;
+
+               hash_entry = cfil_info->cfi_hash_entry;
+               if (hash_entry->cfentry_family == AF_INET6) {
+                       fill_ip6_sockaddr_4_6(&ctag->cfil_faddr,
+                                                                 &hash_entry->cfentry_faddr.addr6,
+                                                                 hash_entry->cfentry_fport);
+               } else if (hash_entry->cfentry_family == AF_INET) {
+                       fill_ip_sockaddr_4_6(&ctag->cfil_faddr,
+                                                                hash_entry->cfentry_faddr.addr46.ia46_addr4,
+                                                                hash_entry->cfentry_fport);
+               }
+               m_tag_prepend(m, tag);
+               return (tag);
+       }
+       return NULL;
+}
+
+struct m_tag *
+cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt, short *options,
+                                                 struct sockaddr **faddr)
+{
+       struct m_tag *tag = NULL;
+       struct cfil_tag *ctag = NULL;
+
+       tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_CFIL_UDP, NULL);
+       if (tag) {
+               ctag = (struct cfil_tag *)(tag + 1);
+               if (state_change_cnt)
+                       *state_change_cnt = ctag->cfil_so_state_change_cnt;
+               if (options)
+                       *options = ctag->cfil_so_options;
+               if (faddr)
+                       *faddr = (struct sockaddr *) &ctag->cfil_faddr;
+
+               /*
+                * Unlink tag and hand it over to caller.
+                * Note that the caller is responsible for freeing it.
+                */
+               m_tag_unlink(m, tag);
+               return tag;
+       }
+       return NULL;
+}
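cfil_udp_save_socket_state() and cfil_udp_get_socket_state() form a pair: the first snapshots the socket's state-change count, options and faddr into an mbuf tag on an outbound datagram, and the second unlinks that tag and hands ownership to the caller. A hypothetical consumer sketch (kernel context, not the actual call site); it assumes the standard m_tag_free() mbuf-tag KPI and that the caller frees the unlinked tag once done with it:

        uint32_t state_cnt = 0;
        short so_options = 0;
        struct sockaddr *faddr = NULL;
        struct m_tag *tag;

        tag = cfil_udp_get_socket_state(m, &state_cnt, &so_options, &faddr);
        if (tag != NULL) {
                /* ... use state_cnt, so_options and faddr for this send ... */
                m_tag_free(tag);        /* caller owns the unlinked tag */
        }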
+
+
index e4d1ce5d4fd48d68d1948d3a915d469af3aee2dd..55249920b5d6339b8f5656355f39e82e7e4c51b5 100644 (file)
@@ -422,6 +422,8 @@ extern void cfil_sock_buf_update(struct sockbuf *sb);
 
 extern cfil_sock_id_t cfil_sock_id_from_socket(struct socket *so);
 
+extern struct m_tag *cfil_udp_get_socket_state(struct mbuf *m, uint32_t *state_change_cnt,
+                                                                                          short *options, struct sockaddr **faddr);
 #endif /* BSD_KERNEL_PRIVATE */
 
 __END_DECLS
index 0438c30a18812964a0d74dbeedae708abf86eab5..cd4d8d963dd187a14b7815c5894be55a9b8aa052 100644 (file)
@@ -78,7 +78,7 @@
 #include <net/if_llatbl.h>
 #include <net/net_api_stats.h>
 #include <net/if_ports_used.h>
-
+#include <netinet/in.h>
 #if INET
 #include <netinet/in_var.h>
 #include <netinet/igmp_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/in_pcb.h>
 #include <netinet/in_tclass.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
 #endif /* INET */
 
 #if INET6
+#include <net/nat464_utils.h>
 #include <netinet6/in6_var.h>
 #include <netinet6/nd6.h>
 #include <netinet6/mld6_var.h>
 #include <netinet6/scope6_var.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
 #endif /* INET6 */
-
+#include <net/pf_pbuf.h>
 #include <libkern/OSAtomic.h>
 #include <libkern/tree.h>
 
@@ -278,7 +284,7 @@ static unsigned int dlif_size;              /* size of dlil_ifnet to allocate */
 static unsigned int dlif_bufsize;      /* size of dlif_size + headroom */
 static struct zone *dlif_zone;         /* zone for dlil_ifnet */
 
-#define        DLIF_ZONE_MAX           64              /* maximum elements in zone */
+#define        DLIF_ZONE_MAX           IFNETS_MAX      /* maximum elements in zone */
 #define        DLIF_ZONE_NAME          "ifnet"         /* zone name */
 
 static unsigned int dlif_filt_size;    /* size of ifnet_filter */
@@ -397,7 +403,9 @@ static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
     u_int32_t, ifnet_model_t, boolean_t);
 static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
     const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
-
+static int dlil_is_clat_needed(protocol_family_t, mbuf_t);
+static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
+static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
 #if DEBUG || DEVELOPMENT
 static void dlil_verify_sum16(void);
 #endif /* DEBUG || DEVELOPMENT */
@@ -1718,6 +1726,9 @@ dlil_init(void)
        /* Initialize the interface port list */
        if_ports_used_init();
 
+       /* Initialize the interface low power mode event handler */
+       if_low_power_evhdlr_init();
+
 #if DEBUG || DEVELOPMENT
        /* Run self-tests */
        dlil_verify_sum16();
@@ -3816,15 +3827,15 @@ static void
 dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
     u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
 {
-       int                             error = 0;
-       protocol_family_t               protocol_family;
-       mbuf_t                          next_packet;
-       ifnet_t                         ifp = ifp_param;
-       char *                          frame_header;
-       struct if_proto *               last_ifproto = NULL;
-       mbuf_t                          pkt_first = NULL;
-       mbuf_t *                        pkt_next = NULL;
-       u_int32_t                       poll_thresh = 0, poll_ival = 0;
+       int error = 0;
+       protocol_family_t protocol_family;
+       mbuf_t next_packet;
+       ifnet_t ifp = ifp_param;
+       char *frame_header = NULL;
+       struct if_proto *last_ifproto = NULL;
+       mbuf_t pkt_first = NULL;
+       mbuf_t *pkt_next = NULL;
+       u_int32_t poll_thresh = 0, poll_ival = 0;
 
        KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
@@ -3892,6 +3903,69 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
                        protocol_family = 0;
                }
 
+               pktap_input(ifp, protocol_family, m, frame_header);
+
+               /* Drop v4 packets received on CLAT46 enabled interface */
+               if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp)) {
+                       m_freem(m);
+                       ip6stat.ip6s_clat464_in_v4_drop++;
+                       goto next;
+               }
+
+               /* Translate the packet if it is received on CLAT interface */
+               if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
+                   && dlil_is_clat_needed(protocol_family, m)) {
+                       char *data = NULL;
+                       struct ether_header eh;
+                       struct ether_header *ehp = NULL;
+
+                       if (ifp->if_type == IFT_ETHER) {
+                               ehp = (struct ether_header *)(void *)frame_header;
+                               /* Skip RX Ethernet packets if they are not IPV6 */
+                               if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6)
+                                       goto skip_clat;
+
+                               /* Keep a copy of frame_header for Ethernet packets */
+                               bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
+                       }
+                       error = dlil_clat64(ifp, &protocol_family, &m);
+                       data = (char *) mbuf_data(m);
+                       if (error != 0) {
+                               m_freem(m);
+                               ip6stat.ip6s_clat464_in_drop++;
+                               goto next;
+                       }
+                       /* Native v6 should be a no-op */
+                       if (protocol_family != PF_INET)
+                               goto skip_clat;
+
+                       /* Do this only for translated v4 packets. */
+                       switch (ifp->if_type) {
+                       case IFT_CELLULAR:
+                               frame_header = data;
+                               break;
+                       case IFT_ETHER:
+                               /*
+                                * Drop if the mbuf doesn't have enough
+                                * space for Ethernet header
+                                */
+                               if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
+                                       m_free(m);
+                                       ip6stat.ip6s_clat464_in_drop++;
+                                       goto next;
+                               }
+                               /*
+                                * Set frame_header to point ETHER_HDR_LEN bytes
+                                * preceding the data pointer. Change
+                                * the ether_type too.
+                                */
+                               frame_header = data - ETHER_HDR_LEN;
+                               eh.ether_type = htons(ETHERTYPE_IP);
+                               bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
+                               break;
+                       }
+               }
+skip_clat:
                if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
                    !(m->m_pkthdr.pkt_flags & PKTF_LOOP))
                        dlil_input_cksum_dbg(ifp, m, frame_header,
@@ -3912,7 +3986,6 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
                    (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
                    (CSUM_DATA_VALID | CSUM_PARTIAL)) {
                        int adj;
-
                        if (frame_header == NULL ||
                            frame_header < (char *)mbuf_datastart(m) ||
                            frame_header > (char *)m->m_data ||
@@ -3926,7 +3999,8 @@ dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
                        }
                }
 
-               pktap_input(ifp, protocol_family, m, frame_header);
+               if (clat_debug)
+                       pktap_input(ifp, protocol_family, m, frame_header);
 
                if (m->m_flags & (M_BCAST|M_MCAST))
                        atomic_add_64(&ifp->if_imcasts, 1);
@@ -4288,7 +4362,7 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
        char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
        char dst_linkaddr_buffer[MAX_LINKADDR * 4];
        struct if_proto *proto = NULL;
-       mbuf_t  m;
+       mbuf_t  m = NULL;
        mbuf_t  send_head = NULL;
        mbuf_t  *send_tail = &send_head;
        int iorefcnt = 0;
@@ -4297,6 +4371,9 @@ dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
        int32_t flen = 0;
        struct timespec now;
        u_int64_t now_nsec;
+       boolean_t did_clat46 = FALSE;
+       protocol_family_t old_proto_family = proto_family;
+       struct rtentry *rt = NULL;
 
        KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
@@ -4339,6 +4416,85 @@ preout_again:
        packetlist = packetlist->m_nextpkt;
        m->m_nextpkt = NULL;
 
+       /*
+        * Perform address family translation on the first
+        * packet outside the loop so that the address lookup
+        * can be done for the translated proto family.
+        */
+       if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
+           (ifp->if_type == IFT_CELLULAR ||
+            dlil_is_clat_needed(proto_family, m))) {
+               retval = dlil_clat46(ifp, &proto_family, &m);
+               /*
+                * Go to the next packet if translation fails
+                */
+               if (retval != 0) {
+                       m_freem(m);
+                       m = NULL;
+                       ip6stat.ip6s_clat464_out_drop++;
+                       /* Make sure that the proto family is PF_INET */
+                       ASSERT(proto_family == PF_INET);
+                       goto preout_again;
+               }
+               /*
+                * Free the old proto reference and point proto at the IPv6
+                * protocol structure.
+                *
+                * Do this only the first time address family translation
+                * succeeds.
+                */
+               if (!did_clat46 && proto_family == PF_INET6) {
+                       struct sockaddr_in6 dest6;
+                       did_clat46 = TRUE;
+
+                       if (proto != NULL)
+                               if_proto_free(proto);
+                       ifnet_lock_shared(ifp);
+                       /* callee holds a proto refcnt upon success */
+                       proto = find_attached_proto(ifp, proto_family);
+                       if (proto == NULL) {
+                               ifnet_lock_done(ifp);
+                               retval = ENXIO;
+                               m_freem(m);
+                               m = NULL;
+                               goto cleanup;
+                       }
+                       ifnet_lock_done(ifp);
+                       if (ifp->if_type == IFT_ETHER) {
+                               /* Update the dest to translated v6 address */
+                               dest6.sin6_len = sizeof(struct sockaddr_in6);
+                               dest6.sin6_family = AF_INET6;
+                               dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
+                               dest = (const struct sockaddr *)&dest6;
+
+                               /*
+                                * Look up the route to the translated destination.
+                                * Free this route ref during cleanup.
+                                */
+                               rt = rtalloc1_scoped((struct sockaddr *)&dest6,
+                                   0, 0, ifp->if_index);
+
+                               route = rt;
+                       }
+               }
+       }
+
+       /*
+        * This path handles a packet chain going to the same destination.
+        * The pre-output routine is used either to trigger resolution of
+        * the next hop or to retrieve the next hop's link-layer addressing,
+        * for example the ether_inet(6)_pre_output routines.
+        *
+        * If the routine returns EJUSTRETURN, it implies that the packet
+        * has been queued, and therefore we have to call preout_again for
+        * the following packet in the chain.
+        *
+        * For errors other than EJUSTRETURN, the current packet is freed
+        * and the rest of the chain (pointed to by packetlist) is freed as
+        * part of cleanup.
+        *
+        * Otherwise, the retrieved information is used for all the packets
+        * in the chain.
+        */
        if (raw == 0) {
                proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
                    proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
@@ -4351,6 +4507,7 @@ preout_again:
                                if (retval == EJUSTRETURN)
                                        goto preout_again;
                                m_freem(m);
+                               m = NULL;
                                goto cleanup;
                        }
                }
@@ -4366,6 +4523,30 @@ preout_again:
 #endif
 
        do {
+               /*
+                * Perform address family translation if needed.
+                * For now we only support stateless 4 to 6 translation
+                * on the out path.
+                *
+                * The routine below translates the IP header, updates the
+                * protocol checksum and also translates ICMP.
+                *
+                * We skip the first packet as it is already translated and
+                * the proto family is set to PF_INET6.
+                */
+               if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
+                   (ifp->if_type == IFT_CELLULAR ||
+                    dlil_is_clat_needed(proto_family, m))) {
+                       retval = dlil_clat46(ifp, &proto_family, &m);
+                       /* Go to the next packet if the translation fails */
+                       if (retval != 0) {
+                               m_freem(m);
+                               m = NULL;
+                               ip6stat.ip6s_clat464_out_drop++;
+                               goto next;
+                       }
+               }
+
 #if CONFIG_DTRACE
                if (!raw && proto_family == PF_INET) {
                        struct ip *ip = mtod(m, struct ip *);
@@ -4557,6 +4738,9 @@ next:
                        packetlist = packetlist->m_nextpkt;
                        m->m_nextpkt = NULL;
                }
+               /* Reset the proto family to old proto family for CLAT */
+               if (did_clat46)
+                       proto_family = old_proto_family;
        } while (m != NULL);
 
        if (send_head != NULL) {
@@ -4631,10 +4815,323 @@ cleanup:
                retval = 0;
        if (iorefcnt == 1)
                ifnet_decr_iorefcnt(ifp);
+       if (rt != NULL) {
+               rtfree(rt);
+               rt = NULL;
+       }
 
        return (retval);
 }
 
+/*
+ * This routine returns 1 when address family translation is needed, i.e. when
+ * the destination address is not a loopback, link-local, multicast or
+ * broadcast address.
+ */
+static int
+dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
+{
+       int ret = 0;
+       switch(proto_family) {
+       case PF_INET: {
+               struct ip *iph = mtod(m, struct ip *);
+               if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr)))
+                       ret = 1;
+               break;
+       }
+       case PF_INET6: {
+               struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
+               if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
+                   CLAT64_NEEDED(&ip6h->ip6_dst))
+                       ret = 1;
+               break;
+       }
+       }
+
+       return (ret);
+}
+/*
+ * @brief This routine translates an IPv4 packet to an IPv6 packet,
+ *     updates the protocol checksum and also translates ICMP,
+ *     including the inner header that it carries.
+ *
+ * @param ifp Pointer to the interface.
+ * @param proto_family Pointer to the protocol family. It is updated if the
+ *     function performs the translation successfully.
+ * @param m Pointer to the mbuf pointer. Needed because this routine can
+ *     end up replacing the mbuf with a different one.
+ *
+ * @return 0 on success or else a negative value.
+ */
+static errno_t
+dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
+{
+       VERIFY(*proto_family == PF_INET);
+       VERIFY(IS_INTF_CLAT46(ifp));
+
+       pbuf_t pbuf_store, *pbuf = NULL;
+       struct ip *iph = NULL;
+       struct in_addr osrc, odst;
+       uint8_t proto = 0;
+       struct in6_ifaddr *ia6_clat_src = NULL;
+       struct in6_addr *src = NULL;
+       struct in6_addr dst;
+       int error = 0;
+       uint32_t off = 0;
+       uint64_t tot_len = 0;
+       uint16_t ip_id_val = 0;
+       uint16_t ip_frag_off = 0;
+
+       boolean_t is_frag = FALSE;
+       boolean_t is_first_frag = TRUE;
+       boolean_t is_last_frag = TRUE;
+
+       pbuf_init_mbuf(&pbuf_store, *m, ifp);
+       pbuf = &pbuf_store;
+       iph = pbuf->pb_data;
+
+       osrc = iph->ip_src;
+       odst = iph->ip_dst;
+       proto = iph->ip_p;
+       off = iph->ip_hl << 2;
+       ip_id_val = iph->ip_id;
+       ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
+
+       tot_len = ntohs(iph->ip_len);
+
+       /*
+        * For packets that are not first fragments
+        * we only need to adjust the checksum.
+        * For 4-to-6 translation, the fragmentation header
+        * gets appended after protocol translation.
+        */
+       if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
+               is_frag = TRUE;
+
+               /* If the offset is not zero, it is not first frag */
+               if (ip_frag_off != 0)
+                       is_first_frag = FALSE;
+
+               /* If IP_MF is set, then it is not last frag */
+               if (ntohs(iph->ip_off) & IP_MF)
+                       is_last_frag = FALSE;
+       }
+
+       /*
+        * Retrieve the local IPv6 CLAT46 address reserved for stateless
+        * translation.
+        */
+       ia6_clat_src = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
+       if (ia6_clat_src == NULL) {
+               ip6stat.ip6s_clat464_out_nov6addr_drop++;
+               error = -1;
+               goto cleanup;
+       }
+
+       src = &ia6_clat_src->ia_addr.sin6_addr;
+
+       /*
+        * Translate IPv4 destination to IPv6 destination by using the
+        * prefixes learned through prior PLAT discovery.
+        */
+       if ((error = nat464_synthesize_ipv6(ifp, &odst, &dst)) != 0) {
+               ip6stat.ip6s_clat464_out_v6synthfail_drop++;
+               goto cleanup;
+       }
+
+       /* Translate the IP header part first */
+       error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
+           iph->ip_ttl, *src, dst, tot_len) == NT_NAT64) ? 0 : -1;
+
+       iph = NULL;     /* Invalidate iph as pbuf has been modified */
+
+       if (error != 0) {
+               ip6stat.ip6s_clat464_out_46transfail_drop++;
+               goto cleanup;
+       }
+
+       /*
+        * Translate protocol header, update checksum, checksum flags
+        * and related fields.
+        */
+       error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
+           proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
+
+       if (error != 0) {
+               ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
+               goto cleanup;
+       }
+
+       /* Now insert the IPv6 fragment header */
+       if (is_frag) {
+               error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
+
+               if (error != 0) {
+                       ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
+                       goto cleanup;
+               }
+       }
+
+cleanup:
+       if (ia6_clat_src != NULL)
+               IFA_REMREF(&ia6_clat_src->ia_ifa);
+
+       if (pbuf_is_valid(pbuf)) {
+               *m = pbuf->pb_mbuf;
+               pbuf->pb_mbuf = NULL;
+               pbuf_destroy(pbuf);
+       } else {
+               error = -1;
+               ip6stat.ip6s_clat464_out_invalpbuf_drop++;
+       }
+
+       if (error == 0) {
+               *proto_family = PF_INET6;
+               ip6stat.ip6s_clat464_out_success++;
+       }
+
+       return (error);
+}
+
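nat464_synthesize_ipv6() above builds the IPv6 destination by embedding the IPv4 destination into a NAT64/PLAT prefix (RFC 6052), the prefix having been configured through ifnet_set_nat64prefix() later in this change. A minimal user-space sketch of the common 96-bit-prefix case only; the kernel routine also handles the 32/40/48/56/64-bit prefix lengths, whose byte layouts differ:

#include <netinet/in.h>
#include <string.h>

/*
 * Embed an IPv4 address into a /96 NAT64 prefix (RFC 6052), sketch only.
 * Example: 64:ff9b::/96 + 198.51.100.1 yields 64:ff9b::c633:6401.
 */
static void
synthesize_v6_from_v4_96(const struct in6_addr *prefix96, struct in_addr v4,
    struct in6_addr *out)
{
        *out = *prefix96;
        /* For a 96-bit prefix the IPv4 address occupies the last 4 bytes. */
        memcpy(&out->s6_addr[12], &v4.s_addr, sizeof(v4.s_addr));
}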
+/*
+ * @brief This routine translates an incoming IPv6 packet to IPv4,
+ *     updates the protocol checksum and also translates the outer
+ *     and inner ICMPv6 headers.
+ *
+ * @return 0 on success or else a negative value.
+ */
+static errno_t
+dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
+{
+       VERIFY(*proto_family == PF_INET6);
+       VERIFY(IS_INTF_CLAT46(ifp));
+
+       struct ip6_hdr *ip6h = NULL;
+       struct in6_addr osrc, odst;
+       uint8_t proto = 0;
+       struct in6_ifaddr *ia6_clat_dst = NULL;
+       struct in_ifaddr *ia4_clat_dst = NULL;
+       struct in_addr *dst = NULL;
+       struct in_addr src;
+       int error = 0;
+       uint32_t off = 0;
+       u_int64_t tot_len = 0;
+       uint8_t tos = 0;
+       boolean_t is_first_frag = TRUE;
+
+       /* Incoming mbuf does not contain valid IP6 header */
+       if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
+           ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
+           (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
+               ip6stat.ip6s_clat464_in_tooshort_drop++;
+               return (-1);
+       }
+
+       ip6h = mtod(*m, struct ip6_hdr *);
+       /* Validate that the mbuf contains an IP payload of at least ip6_plen */
+       if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
+               ip6stat.ip6s_clat464_in_tooshort_drop++;
+               return (-1);
+       }
+
+       osrc = ip6h->ip6_src;
+       odst = ip6h->ip6_dst;
+
+       /*
+        * Retrieve the local CLAT46 reserved IPv6 address.
+        * Let the packet pass if we don't find one, as the flag
+        * may get set before IPv6 configuration has taken place.
+        */
+       ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
+       if (ia6_clat_dst == NULL)
+               goto done;
+
+       /*
+        * Check if the original destination in the packet is the same as the
+        * reserved CLAT46 IPv6 address
+        */
+       if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
+               pbuf_t pbuf_store, *pbuf = NULL;
+               pbuf_init_mbuf(&pbuf_store, *m, ifp);
+               pbuf = &pbuf_store;
+
+               /*
+                * Retrieve the local CLAT46 IPv4 address reserved for stateless
+                * translation.
+                */
+               ia4_clat_dst = inifa_ifpclatv4(ifp);
+               if (ia4_clat_dst == NULL) {
+                       IFA_REMREF(&ia6_clat_dst->ia_ifa);
+                       ip6stat.ip6s_clat464_in_nov4addr_drop++;
+                       error = -1;
+                       goto cleanup;
+               }
+               IFA_REMREF(&ia6_clat_dst->ia_ifa);
+
+               /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
+               dst = &ia4_clat_dst->ia_addr.sin_addr;
+               if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
+                       ip6stat.ip6s_clat464_in_v4synthfail_drop++;
+                       error = -1;
+                       goto cleanup;
+               }
+
+               ip6h = pbuf->pb_data;
+               off = sizeof(struct ip6_hdr);
+               proto = ip6h->ip6_nxt;
+               tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
+               tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
+
+               /*
+                * Translate the IP header and update the fragmentation
+                * header if needed
+                */
+               error = (nat464_translate_64(pbuf, off, tos, &proto,
+                   ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
+                   0 : -1;
+
+               ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
+
+               if (error != 0) {
+                       ip6stat.ip6s_clat464_in_64transfail_drop++;
+                       goto cleanup;
+               }
+
+               /*
+                * Translate protocol header, update checksum, checksum flags
+                * and related fields.
+                */
+               error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
+                   (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
+                   NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
+
+               if (error != 0) {
+                       ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
+                       goto cleanup;
+               }
+
+cleanup:
+               if (ia4_clat_dst != NULL)
+                       IFA_REMREF(&ia4_clat_dst->ia_ifa);
+
+               if (pbuf_is_valid(pbuf)) {
+                       *m = pbuf->pb_mbuf;
+                       pbuf->pb_mbuf = NULL;
+                       pbuf_destroy(pbuf);
+               } else {
+                       error = -1;
+                       ip6stat.ip6s_clat464_in_invalpbuf_drop++;
+               }
+
+               if (error == 0) {
+                       *proto_family = PF_INET;
+                       ip6stat.ip6s_clat464_in_success++;
+               }
+       } /* CLAT traffic */
+
+done:
+       return (error);
+}
+
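nat464_synthesize_ipv4(), used above to recover the IPv4 source, is the inverse operation: it strips the NAT64 prefix from the IPv6 source address. Again a user-space sketch of the /96 case only, under the same assumptions as the 4-to-6 example:

#include <netinet/in.h>
#include <string.h>

/* Extract the embedded IPv4 address from a /96 NAT64-mapped IPv6 address. */
static void
extract_v4_from_v6_96(const struct in6_addr *v6, struct in_addr *out)
{
        /* The IPv4 address is carried in the last 4 bytes of the IPv6 address. */
        memcpy(&out->s_addr, &v6->s6_addr[12], sizeof(out->s_addr));
}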
 errno_t
 ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
     void *ioctl_arg)
@@ -6169,6 +6666,9 @@ ifnet_detach(ifnet_t ifp)
        ifp->if_eflags &= ~IFEF_ECN_DISABLE;
        ifp->if_eflags &= ~IFEF_ECN_ENABLE;
 
+       /* Reset CLAT46 flag */
+       ifp->if_eflags &= ~IFEF_CLAT46;
+
        /*
         * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
         * no longer be visible during lookups from this point.
@@ -8246,6 +8746,9 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
                    &prefixes[i].ipv6_prefix;
 
                if (prefix_len == 0) {
+                       clat_log0((LOG_DEBUG,
+                           "NAT64 prefixes purged from Interface %s\n",
+                           if_name(ifp)));
                        /* Allow clearing the signature */
                        IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
                        bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
@@ -8258,11 +8761,15 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
                           prefix_len != NAT64_PREFIX_LEN_56 &&
                           prefix_len != NAT64_PREFIX_LEN_64 &&
                           prefix_len != NAT64_PREFIX_LEN_96) {
+                       clat_log0((LOG_DEBUG,
+                           "NAT64 prefixlen is incorrect %d\n", prefix_len));
                        error = EINVAL;
                        goto out;
                }
 
                if (IN6_IS_SCOPE_EMBED(prefix)) {
+                       clat_log0((LOG_DEBUG,
+                           "NAT64 prefix has interface/link local scope.\n"));
                        error = EINVAL;
                        goto out;
                }
@@ -8270,6 +8777,9 @@ ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
                IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
                bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
                    sizeof(struct in6_addr));
+               clat_log0((LOG_DEBUG,
+                   "NAT64 prefix set to %s with prefixlen: %d\n",
+                   ip6_sprintf(prefix), prefix_len));
                one_set = 1;
        }
 
@@ -8643,7 +9153,8 @@ dlil_verify_sum16(void)
        kprintf("DLIL: running SUM16 self-tests ... ");
 
        m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
-       MH_ALIGN(m, sizeof (uint32_t));         /* 32-bit starting alignment */
+       m_align(m, sizeof(sumdata) + (sizeof (uint64_t) * 2));
+
        buf = mtod(m, uint8_t *);               /* base address */
 
        for (n = 0; n < SUMTBL_MAX; n++) {
index 3f61bc94fc38d10361070bdf4759d86ee2ceec56..23719456b1d00f396caea0f4f7bd4d6cbba3c23b 100644 (file)
 /*
  * Structure of a 10Mb/s Ethernet header.
  */
-struct ether_header {
+typedef struct ether_header {
        u_char  ether_dhost[ETHER_ADDR_LEN];
        u_char  ether_shost[ETHER_ADDR_LEN];
        u_short ether_type;
-};
+} ether_header_t;
 
 /*
  * Structure of a 48-bit Ethernet address.
  */
-struct ether_addr {
+typedef struct ether_addr {
        u_char octet[ETHER_ADDR_LEN];
-};
+} ether_addr_t;
 
 #define ether_addr_octet octet
 
index 0fe5872eaf537bf4058971028b9dd61f5baa8941..7bf8dc6041b5fa3bc2e03e306e4111c524e5cbc0 100644 (file)
 #include <netinet/in_var.h>
 #include <netinet/in_tclass.h>
 #include <netinet/ip_var.h>
+#include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp.h>
@@ -502,9 +503,12 @@ if_clone_create(char *name, int len, void *params)
        if (unit > ifc->ifc_maxunit)
                return (ENXIO);
 
+       lck_mtx_lock(&ifc->ifc_mutex);
        err = (*ifc->ifc_create)(ifc, unit, params);
-       if (err != 0)
+       if (err != 0) {
+               lck_mtx_unlock(&ifc->ifc_mutex);
                return (err);
+       }
 
        if (!wildcard) {
                bytoff = unit >> 3;
@@ -533,6 +537,7 @@ if_clone_create(char *name, int len, void *params)
                }
 
        }
+       lck_mtx_unlock(&ifc->ifc_mutex);
 
        return (0);
 }
@@ -543,36 +548,55 @@ if_clone_create(char *name, int len, void *params)
 static int
 if_clone_destroy(const char *name)
 {
-       struct if_clone *ifc;
-       struct ifnet *ifp;
+       struct if_clone *ifc = NULL;
+       struct ifnet *ifp = NULL;
        int bytoff, bitoff;
        u_int32_t unit;
+       int error = 0;
 
        ifc = if_clone_lookup(name, &unit);
-       if (ifc == NULL)
-               return (EINVAL);
 
-       if (unit < ifc->ifc_minifs)
-               return (EINVAL);
+       if (ifc == NULL) {
+               error = EINVAL;
+               goto done;
+       }
 
-       ifp = ifunit(name);
-       if (ifp == NULL)
-               return (ENXIO);
+       if (unit < ifc->ifc_minifs) {
+               error = EINVAL;
+               goto done;
+       }
 
-       if (ifc->ifc_destroy == NULL)
-               return (EOPNOTSUPP);
+       ifp = ifunit_ref(name);
+       if (ifp == NULL) {
+               error = ENXIO;
+               goto done;
+       }
+
+       if (ifc->ifc_destroy == NULL) {
+               error = EOPNOTSUPP;
+               goto done;
+       }
 
-       (*ifc->ifc_destroy)(ifp);
+       lck_mtx_lock(&ifc->ifc_mutex);
+       error = (*ifc->ifc_destroy)(ifp);
 
-       /*
-        * Compute offset in the bitmap and deallocate the unit.
-        */
+       if (error) {
+               lck_mtx_unlock(&ifc->ifc_mutex);
+               goto done;
+       }
+
+       /* Compute offset in the bitmap and deallocate the unit. */
        bytoff = unit >> 3;
        bitoff = unit - (bytoff << 3);
        KASSERT((ifc->ifc_units[bytoff] & (1 << bitoff)) != 0,
            ("%s: bit is already cleared", __func__));
        ifc->ifc_units[bytoff] &= ~(1 << bitoff);
-       return (0);
+       lck_mtx_unlock(&ifc->ifc_mutex);
+
+done:
+       if (ifp != NULL)
+               ifnet_decr_iorefcnt(ifp);
+       return (error);
 }
 
 /*
@@ -617,6 +641,28 @@ found_name:
        return (ifc);
 }
 
+void *
+if_clone_softc_allocate(const struct if_clone *ifc)
+{
+       void *p_clone = NULL;
+
+       VERIFY(ifc != NULL);
+
+       p_clone = zalloc(ifc->ifc_zone);
+       if (p_clone != NULL)
+               bzero(p_clone, ifc->ifc_softc_size);
+
+       return (p_clone);
+}
+
+void
+if_clone_softc_deallocate(const struct if_clone *ifc, void *p_softc)
+{
+       VERIFY(ifc != NULL && p_softc != NULL);
+       bzero(p_softc, ifc->ifc_softc_size);
+       zfree(ifc->ifc_zone, p_softc);
+}
+
 /*
  * Register a network interface cloner.
  */
@@ -643,6 +689,18 @@ if_clone_attach(struct if_clone *ifc)
        if (ifc->ifc_units == NULL)
                return (ENOBUFS);
        ifc->ifc_bmlen = len;
+       lck_mtx_init(&ifc->ifc_mutex, ifnet_lock_group, ifnet_lock_attr);
+
+       if (ifc->ifc_softc_size != 0) {
+               ifc->ifc_zone = zinit(ifc->ifc_softc_size, 
+                   ifc->ifc_zone_max_elem * ifc->ifc_softc_size, 0, ifc->ifc_name);
+               if (ifc->ifc_zone == NULL) {
+                       FREE(ifc->ifc_units, M_CLONE);
+                       return (ENOBUFS);
+               }
+               zone_change(ifc->ifc_zone, Z_EXPAND, TRUE);
+               zone_change(ifc->ifc_zone, Z_CALLERACCT, FALSE);
+       }
 
        LIST_INSERT_HEAD(&if_cloners, ifc, ifc_list);
        if_cloners_count++;
@@ -670,6 +728,10 @@ if_clone_detach(struct if_clone *ifc)
 {
        LIST_REMOVE(ifc, ifc_list);
        FREE(ifc->ifc_units, M_CLONE);
+       if (ifc->ifc_softc_size != 0)
+               zdestroy(ifc->ifc_zone);
+
+       lck_mtx_destroy(&ifc->ifc_mutex, ifnet_lock_group);
        if_cloners_count--;
 }
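Taken together, if_clone_attach() now sets up a per-cloner mutex and, when a softc size is given, a backing zone sized by the two new IF_CLONE_INITIALIZER arguments, while if_clone_softc_allocate()/if_clone_softc_deallocate() replace the per-driver _MALLOC/FREE of the softc. A hypothetical cloner wired up the way bond, bridge, fake and gif are converted later in this change ("foo" and struct foo_softc are made up for illustration; kernel context, not compilable on its own):

#define FOO_NAME                "foo"
#define FOO_MAXUNIT             IF_MAXUNIT
#define FOO_ZONE_MAX_ELEM       MIN(IFNETS_MAX, FOO_MAXUNIT)

struct foo_softc {
        ifnet_t foo_ifp;
        /* ... driver state ... */
};

static int foo_clone_create(struct if_clone *, u_int32_t, void *);
static int foo_clone_destroy(struct ifnet *);

static struct if_clone foo_cloner =
    IF_CLONE_INITIALIZER(FOO_NAME, foo_clone_create, foo_clone_destroy,
        0, FOO_MAXUNIT, FOO_ZONE_MAX_ELEM, sizeof(struct foo_softc));

static int
foo_clone_create(struct if_clone *ifc, u_int32_t unit, void *params)
{
#pragma unused(ifc, unit, params)
        struct foo_softc *sc;

        /* Zone-backed and zeroed by if_clone_softc_allocate() */
        sc = if_clone_softc_allocate(&foo_cloner);
        if (sc == NULL)
                return (ENOMEM);
        /* ... allocate and attach the ifnet, storing it in sc->foo_ifp ... */
        return (0);
}

static int
foo_clone_destroy(struct ifnet *ifp)
{
        struct foo_softc *sc = ifnet_softc(ifp);

        /* ... detach and release the ifnet ... */
        if_clone_softc_deallocate(&foo_cloner, sc);
        return (0);
}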
 
@@ -728,6 +790,8 @@ if_functional_type(struct ifnet *ifp, bool exclude_delegate)
                        ret = IFRTYPE_FUNCTIONAL_INTCOPROC;
                } else if ((exclude_delegate &&
                    (ifp->if_family == IFNET_FAMILY_ETHERNET ||
+                   ifp->if_family == IFNET_FAMILY_BOND ||
+                   ifp->if_family == IFNET_FAMILY_VLAN ||
                    ifp->if_family == IFNET_FAMILY_FIREWIRE)) ||
                    (!exclude_delegate && IFNET_IS_WIRED(ifp))) {
                        ret = IFRTYPE_FUNCTIONAL_WIRED;
@@ -2235,50 +2299,6 @@ ifioctl_iforder(u_long cmd, caddr_t data)
                break;
        }
 
-       case SIOCGIFORDER: {            /* struct if_order */
-               struct if_order *ifo = (struct if_order *)(void *)data;
-               u_int32_t ordered_count = *((volatile u_int32_t *)&if_ordered_count);
-
-               if (ifo->ifo_count == 0 ||
-                       ordered_count == 0) {
-                       ifo->ifo_count = 0;
-               } else if (ifo->ifo_ordered_indices != USER_ADDR_NULL) {
-                       u_int32_t count_to_copy =
-                           MIN(ordered_count, ifo->ifo_count);
-                       size_t length = (count_to_copy * sizeof(u_int32_t));
-                       struct ifnet *ifp = NULL;
-                       u_int32_t cursor = 0;
-
-                       ordered_indices = _MALLOC(length, M_NECP, M_WAITOK | M_ZERO);
-                       if (ordered_indices == NULL) {
-                               error = ENOMEM;
-                               break;
-                       }
-
-                       ifnet_head_lock_shared();
-                       TAILQ_FOREACH(ifp, &ifnet_ordered_head, if_ordered_link) {
-                               if (cursor >= count_to_copy ||
-                                   cursor >= if_ordered_count) {
-                                       break;
-                               }
-                               ordered_indices[cursor] = ifp->if_index;
-                               cursor++;
-                       }
-                       ifnet_head_done();
-
-                       /* We might have parsed less than the original length
-                        * because the list could have changed.
-                        */
-                       length = cursor * sizeof(u_int32_t);
-                       ifo->ifo_count = cursor;
-                       error = copyout(ordered_indices,
-                           ifo->ifo_ordered_indices, length);
-               } else {
-                       error = EINVAL;
-               }
-               break;
-       }
-
        default: {
                VERIFY(0);
                /* NOTREACHED */
@@ -2342,10 +2362,14 @@ ifioctl_nat64prefix(struct ifnet *ifp, u_long cmd, caddr_t data)
        switch (cmd) {
        case SIOCSIFNAT64PREFIX:                /* struct if_nat64req */
                error = ifnet_set_nat64prefix(ifp, ifnat64->ifnat64_prefixes);
+               if (error != 0)
+                       ip6stat.ip6s_clat464_plat64_pfx_setfail++;
                break;
 
        case SIOCGIFNAT64PREFIX:                /* struct if_nat64req */
                error = ifnet_get_nat64prefix(ifp, ifnat64->ifnat64_prefixes);
+               if (error != 0)
+                       ip6stat.ip6s_clat464_plat64_pfx_getfail++;
                break;
 
        default:
@@ -2355,6 +2379,36 @@ ifioctl_nat64prefix(struct ifnet *ifp, u_long cmd, caddr_t data)
 
        return (error);
 }
+
+static __attribute__((noinline)) int
+ifioctl_clat46addr(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+       struct if_clat46req *ifclat46 = (struct if_clat46req *)(void *)data;
+       struct in6_ifaddr *ia6_clat = NULL;
+       int error = 0;
+
+       VERIFY(ifp != NULL);
+
+       switch (cmd) {
+               case SIOCGIFCLAT46ADDR:
+                       ia6_clat = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
+                       if (ia6_clat == NULL) {
+                               error = ENOENT;
+                               break;
+                       }
+
+                       bcopy(&ia6_clat->ia_addr.sin6_addr, &ifclat46->ifclat46_addr.v6_address,
+                           sizeof(ifclat46->ifclat46_addr.v6_address));
+                       ifclat46->ifclat46_addr.v6_prefixlen = ia6_clat->ia_plen;
+                       IFA_REMREF(&ia6_clat->ia_ifa);
+                       break;
+               default:
+                       VERIFY(0);
+                       /* NOTREACHED */
+       }
+
+       return (error);
+}
 #endif
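With ifioctl_clat46addr() handling SIOCGIFCLAT46ADDR, user space can query the IPv6 address reserved for stateless translation on an interface. A hypothetical sketch; SIOCGIFCLAT46ADDR and struct if_clat46req (defined later in this change) are private interfaces, so the headers exposing them are an assumption here:

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int
print_clat46_addr(const char *ifname)
{
        struct if_clat46req req;
        char buf[INET6_ADDRSTRLEN];
        int s = socket(AF_INET6, SOCK_DGRAM, 0);

        if (s < 0)
                return (-1);
        memset(&req, 0, sizeof(req));
        strlcpy(req.ifclat46_name, ifname, sizeof(req.ifclat46_name));
        /* Fails with ENOENT while no CLAT46 address has been configured yet */
        if (ioctl(s, SIOCGIFCLAT46ADDR, &req) < 0) {
                close(s);
                return (-1);
        }
        inet_ntop(AF_INET6, &req.ifclat46_addr.v6_address, buf, sizeof(buf));
        printf("%s: CLAT46 address %s/%u\n", ifname, buf,
            req.ifclat46_addr.v6_prefixlen);
        close(s);
        return (0);
}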
 
 
@@ -2380,7 +2434,7 @@ ifioctl_get_protolist(struct ifnet *ifp, u_int32_t * ret_count,
        if (count == 0) {
                goto done;
        }
-       list = _MALLOC(count * sizeof(*list), M_TEMP, M_WAITOK);
+       list = _MALLOC(count * sizeof(*list), M_TEMP, M_WAITOK | M_ZERO);
        if (list == NULL) {
                error = ENOMEM;
                goto done;
@@ -2567,7 +2621,6 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
                goto done;
 
        case SIOCSIFORDER:                      /* struct if_order */
-       case SIOCGIFORDER:              /* struct if_order */
                error = ifioctl_iforder(cmd, data);
                goto done;
 
@@ -2652,6 +2705,8 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
        case SIOCGQOSMARKINGENABLED:            /* struct ifreq */
        case SIOCSIFLOWINTERNET:                /* struct ifreq */
        case SIOCGIFLOWINTERNET:                /* struct ifreq */
+       case SIOCGIFLOWPOWER:                   /* struct ifreq */
+       case SIOCSIFLOWPOWER:                   /* struct ifreq */
        {                       /* struct ifreq */
                struct ifreq ifr;
                bcopy(data, &ifr, sizeof (ifr));
@@ -2851,10 +2906,14 @@ ifioctl(struct socket *so, u_long cmd, caddr_t data, struct proc *p)
                break;
 
 #if INET6
-       case SIOCSIFNAT64PREFIX:                /* struct if_nsreq */
-       case SIOCGIFNAT64PREFIX:                /* struct if_nsreq */
+       case SIOCSIFNAT64PREFIX:                /* struct if_nat64req */
+       case SIOCGIFNAT64PREFIX:                /* struct if_nat64req */
                error = ifioctl_nat64prefix(ifp, cmd, data);
                break;
+
+       case SIOCGIFCLAT46ADDR:                 /* struct if_clat46req */
+               error = ifioctl_clat46addr(ifp, cmd, data);
+               break;
 #endif
 
        case SIOCGIFPROTOLIST32:                /* struct if_protolistreq32 */
@@ -3582,6 +3641,17 @@ ifioctl_ifreq(struct socket *so, u_long cmd, struct ifreq *ifr, struct proc *p)
                            IFRTYPE_LOW_INTERNET_ENABLE_DL;
                ifnet_lock_done(ifp);
                break;
+       case SIOCGIFLOWPOWER:
+               ifr->ifr_low_power_mode =
+                   !!(ifp->if_xflags & IFXF_LOW_POWER);
+               break;
+       case SIOCSIFLOWPOWER:
+#if (DEVELOPMENT || DEBUG)
+               error = if_set_low_power(ifp, !!(ifr->ifr_low_power_mode));
+#else /* DEVELOPMENT || DEBUG */
+               error = EOPNOTSUPP;
+#endif /* DEVELOPMENT || DEBUG */
+               break;
        default:
                VERIFY(0);
                /* NOTREACHED */
@@ -4603,8 +4673,15 @@ if_rtmtu(struct radix_node *rn, void *arg)
                 * has not been locked (RTV_MTU is not set) and
                 * if it was non-zero to begin with.
                 */
-               if (!(rt->rt_rmx.rmx_locks & RTV_MTU) && rt->rt_rmx.rmx_mtu)
+               if (!(rt->rt_rmx.rmx_locks & RTV_MTU) && rt->rt_rmx.rmx_mtu) {
                        rt->rt_rmx.rmx_mtu = ifp->if_mtu;
+                       if (rt_key(rt)->sa_family == AF_INET &&
+                           INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+                               rt->rt_rmx.rmx_mtu = IN6_LINKMTU(ifp);
+                               /* Further adjust the size for CLAT46 expansion */
+                               rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+                       }
+               }
        }
        RT_UNLOCK(rt);
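As a worked example of the adjustment above: on a CLAT46-enabled cellular interface whose IPv6 link MTU is 1500, the IPv4 route MTU is first set to IN6_LINKMTU() (1500) and then reduced by CLAT46_HDR_EXPANSION_OVERHD; assuming that constant accounts for the 20-byte growth from the 20-byte IPv4 header to the 40-byte IPv6 header, the route ends up with an MTU of 1480, so a full-size IPv4 packet still fits on the link after 4-to-6 translation.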
 
@@ -5134,9 +5211,6 @@ ifioctl_cassert(void)
        case SIOCGIFAGENTIDS64:
        case SIOCGIFAGENTDATA32:
        case SIOCGIFAGENTDATA64:
-       case SIOCGIFAGENTLIST32:
-       case SIOCGIFAGENTLIST64:
-
 
        case SIOCSIFINTERFACESTATE:
        case SIOCGIFINTERFACESTATE:
@@ -5150,13 +5224,37 @@ ifioctl_cassert(void)
        case SIOCGECNMODE:
        case SIOCSECNMODE:
 
+       case SIOCSIFORDER:
+
        case SIOCSQOSMARKINGMODE:
        case SIOCSQOSMARKINGENABLED:
        case SIOCGQOSMARKINGMODE:
        case SIOCGQOSMARKINGENABLED:
 
+       case SIOCSIFTIMESTAMPENABLE:
+       case SIOCSIFTIMESTAMPDISABLE:
+       case SIOCGIFTIMESTAMPENABLED:
+
+       case SIOCSIFDISABLEOUTPUT:
+
+       case SIOCGIFAGENTLIST32:
+       case SIOCGIFAGENTLIST64:
+
+       case SIOCSIFLOWINTERNET:
+       case SIOCGIFLOWINTERNET:
+
+#if INET6
+       case SIOCGIFNAT64PREFIX:
+       case SIOCSIFNAT64PREFIX:
+
+       case SIOCGIFCLAT46ADDR:
+#endif /* INET6 */
+
        case SIOCGIFPROTOLIST32:
        case SIOCGIFPROTOLIST64:
+
+       case SIOCGIFLOWPOWER:
+       case SIOCSIFLOWPOWER:
                ;
        }
 }
index 2583ef5e72172b8054d2191d150328532a3a7763..1cf0a5f147a0be290ad21471fdf4ffd5eb946a7c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -137,6 +137,13 @@ struct if_clonereq32 {
 #define        IFEF_VLAN               0x00000200      /* interface has one or more vlans */
 #define        IFEF_BOND               0x00000400      /* interface is part of bond */
 #define        IFEF_ARPLL              0x00000800      /* ARP for IPv4LL addresses */
+#define        IFEF_CLAT46             0x00001000      /* CLAT46 RFC 6877 */
+
+#define        IS_INTF_CLAT46(ifp)     ((ifp) != NULL && ((ifp)->if_eflags & IFEF_CLAT46))
+#define        INTF_ADJUST_MTU_FOR_CLAT46(intf)                \
+    (IS_INTF_CLAT46((intf)) ||                         \
+     IS_INTF_CLAT46((intf)->if_delegated.ifp))
+
 /*
  * XXX IFEF_NOAUTOIPV6LL is deprecated and should be done away with.
  * Configd pretty much manages the interface configuration.
@@ -175,6 +182,7 @@ struct if_clonereq32 {
 #define        IFXF_LOW_INTERNET_UL            0x00000010 /* Uplink Low Internet is confirmed */
 #define        IFXF_LOW_INTERNET_DL            0x00000020 /* Downlink Low Internet is confirmed */
 #define        IFXF_ALLOC_KPI                  0x00000040 /* Allocated via the ifnet_alloc KPI */
+#define        IFXF_LOW_POWER                  0x00000080 /* Low Power Mode */
 
 /*
  * Current requirements for an AWDL interface.  Setting/clearing IFEF_AWDL
@@ -499,7 +507,7 @@ struct      ifreq {
 #define        IFRTYPE_LOW_INTERNET_DISABLE_UL_DL      0x0000
 #define        IFRTYPE_LOW_INTERNET_ENABLE_UL          0x0001
 #define        IFRTYPE_LOW_INTERNET_ENABLE_DL          0x0002
-
+               int ifru_low_power_mode;
 #endif /* PRIVATE */
        } ifr_ifru;
 #define        ifr_addr        ifr_ifru.ifru_addr      /* address */
@@ -549,6 +557,7 @@ struct      ifreq {
 #define        ifr_fastlane_enabled    ifr_qosmarking_enabled
 #define        ifr_disable_output      ifr_ifru.ifru_disable_output
 #define        ifr_low_internet        ifr_ifru.ifru_low_internet
+#define        ifr_low_power_mode      ifr_ifru.ifru_low_power_mode
 
 #endif /* PRIVATE */
 };
@@ -928,6 +937,14 @@ struct kev_dl_rrc_state {
        u_int32_t               rrc_state;
 };
 
+/*
+ * KEV_DL_LOW_POWER_MODE_CHANGED
+ */
+struct kev_dl_low_power_mode {
+       struct net_event_data   link_data;
+       int                     low_power_event;
+};
+
 /*
  * Length of network signature/fingerprint blob.
  */
@@ -960,14 +977,23 @@ struct ipv6_prefix {
        uint32_t        prefix_len;
 };
 
-/*
- * Structure for SIOC[S/G]IFNAT64PREFIX
- */
+struct if_ipv6_address {
+       struct in6_addr v6_address;
+       uint32_t        v6_prefixlen;
+};
+
+/* Structure for SIOC[S/G]IFNAT64PREFIX */
 struct if_nat64req {
        char                    ifnat64_name[IFNAMSIZ];
        struct ipv6_prefix      ifnat64_prefixes[NAT64_MAX_NUM_PREFIXES];
 };
 
+/* Structure for SIOCGIFCLAT46ADDR */
+struct if_clat46req {
+       char                    ifclat46_name[IFNAMSIZ];
+       struct if_ipv6_address  ifclat46_addr;
+};
+
 /*
  * Structure for SIOC[S/G]IFORDER
  *
index 204a04c0e9a92d2d7a6490174096ca120551848d..8682a9fe573bd3558e85138e403e8bce711cd81b 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -49,7 +49,6 @@
 #include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/kern_event.h>
-
 #include <net/bpf.h>
 #include <net/ethernet.h>
 #include <net/if.h>
@@ -68,6 +67,7 @@
 #include <net/kpi_protocol.h>
 
 #include <kern/locks.h>
+#include <kern/zalloc.h>
 #include <libkern/OSAtomic.h>
 
 #include <netinet/in.h>
@@ -83,9 +83,14 @@ static struct ether_addr slow_proto_multicast = {
     IEEE8023AD_SLOW_PROTO_MULTICAST
 };
 
+typedef struct ifbond_s ifbond, * ifbond_ref;
+typedef struct bondport_s bondport, * bondport_ref;
+
 #define        BOND_MAXUNIT            128
-#define BONDNAME               "bond"
-#define M_BOND                 M_DEVBUF
+#define        BOND_ZONE_MAX_ELEM      MIN(IFNETS_MAX, BOND_MAXUNIT)
+#define        BONDNAME                "bond"
+
+#define M_BOND                 M_DEVBUF
 
 #define EA_FORMAT      "%x:%x:%x:%x:%x:%x"
 #define EA_CH(e, i)    ((u_char)((u_char *)(e))[(i)])
@@ -617,24 +622,26 @@ bondport_collecting(bondport_ref p)
 static int bond_clone_create(struct if_clone *, u_int32_t, void *);
 static int bond_clone_destroy(struct ifnet *);
 static int bond_input(ifnet_t ifp, protocol_family_t protocol, mbuf_t m,
-                                         char *frame_header);
+    char *frame_header);
 static int bond_output(struct ifnet *ifp, struct mbuf *m);
 static int bond_ioctl(struct ifnet *ifp, u_long cmd, void * addr);
 static int bond_set_bpf_tap(struct ifnet * ifp, bpf_tap_mode mode,
-                           bpf_packet_func func);
+    bpf_packet_func func);
 static int bond_attach_protocol(struct ifnet *ifp);
 static int bond_detach_protocol(struct ifnet *ifp);
 static int bond_setmulti(struct ifnet *ifp);
 static int bond_add_interface(struct ifnet * ifp, struct ifnet * port_ifp);
 static int bond_remove_interface(ifbond_ref ifb, struct ifnet * port_ifp);
 static void bond_if_free(struct ifnet * ifp);
+static  void interface_link_event(struct ifnet * ifp, u_int32_t event_code);
 
 static struct if_clone bond_cloner = IF_CLONE_INITIALIZER(BONDNAME,
-                                                         bond_clone_create, 
-                                                         bond_clone_destroy, 
-                                                         0,
-                                                         BOND_MAXUNIT);
-static void interface_link_event(struct ifnet * ifp, u_int32_t event_code);
+    bond_clone_create,
+    bond_clone_destroy,
+    0,
+    BOND_MAXUNIT,
+    BOND_ZONE_MAX_ELEM,
+    sizeof(ifbond));
 
 static int
 siocsifmtu(struct ifnet * ifp, int mtu)
@@ -699,7 +706,7 @@ ifbond_release(ifbond_ref ifb)
        if (ifb->ifb_distributing_array != NULL) {
            FREE(ifb->ifb_distributing_array, M_BOND);
        }
-       FREE(ifb, M_BOND);
+       if_clone_softc_deallocate(&bond_cloner, ifb);
        break;
     default:
        break;
@@ -1092,7 +1099,7 @@ bond_clone_create(struct if_clone * ifc, u_int32_t unit, __unused void *params)
                return (error);
        }
        
-       ifb = _MALLOC(sizeof(ifbond), M_BOND, M_WAITOK | M_ZERO);
+       ifb = if_clone_softc_allocate(&bond_cloner);
        if (ifb == NULL) {
                return (ENOMEM);
        }
index 133d9af30dffa5938a2432d7cfb4f090bddfe940..33ae35c66f23266b1e6abe01a47427c3de3ffc10 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -888,7 +888,13 @@ static LIST_HEAD(, bridge_softc) bridge_list =
 static lck_grp_t *bridge_lock_grp = NULL;
 static lck_attr_t *bridge_lock_attr = NULL;
 
-static if_clone_t bridge_cloner = NULL;
+#define        BRIDGENAME      "bridge"
+#define        BRIDGES_MAX     IF_MAXUNIT
+#define        BRIDGE_ZONE_MAX_ELEM    MIN(IFNETS_MAX, BRIDGES_MAX)
+
+static struct if_clone bridge_cloner =
+    IF_CLONE_INITIALIZER(BRIDGENAME, bridge_clone_create, bridge_clone_destroy,
+        0, BRIDGES_MAX, BRIDGE_ZONE_MAX_ELEM, sizeof(struct bridge_softc));
 
 static int if_bridge_txstart = 0;
 SYSCTL_INT(_net_link_bridge, OID_AUTO, txstart, CTLFLAG_RW | CTLFLAG_LOCKED,
@@ -1126,7 +1132,6 @@ bridgeattach(int n)
 #pragma unused(n)
        int error;
        lck_grp_attr_t *lck_grp_attr = NULL;
-       struct ifnet_clone_params ifnet_clone_params;
 
        bridge_rtnode_pool = zinit(sizeof (struct bridge_rtnode),
            1024 * sizeof (struct bridge_rtnode), 0, "bridge_rtnode");
@@ -1153,11 +1158,7 @@ bridgeattach(int n)
        bstp_sys_init();
 #endif /* BRIDGESTP */
 
-       ifnet_clone_params.ifc_name = "bridge";
-       ifnet_clone_params.ifc_create = bridge_clone_create;
-       ifnet_clone_params.ifc_destroy = bridge_clone_destroy;
-
-       error = ifnet_clone_attach(&ifnet_clone_params, &bridge_cloner);
+       error = if_clone_attach(&bridge_cloner);
        if (error != 0)
                printf("%s: ifnet_clone_attach failed %d\n", __func__, error);
 
@@ -1243,13 +1244,18 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
 {
 #pragma unused(params)
        struct ifnet *ifp = NULL;
-       struct bridge_softc *sc, *sc2;
+       struct bridge_softc *sc = NULL;
+       struct bridge_softc *sc2 = NULL;
        struct ifnet_init_eparams init_params;
        errno_t error = 0;
        uint8_t eth_hostid[ETHER_ADDR_LEN];
        int fb, retry, has_hostid;
 
-       sc = _MALLOC(sizeof (*sc), M_DEVBUF, M_WAITOK | M_ZERO);
+       sc = if_clone_softc_allocate(&bridge_cloner);
+       if (sc == NULL) {
+               error = ENOMEM;
+               goto done;
+       }
 
        lck_mtx_init(&sc->sc_mtx, bridge_lock_grp, bridge_lock_attr);
        sc->sc_brtmax = BRIDGE_RTABLE_MAX;
@@ -1422,7 +1428,7 @@ bridge_clone_create(struct if_clone *ifc, uint32_t unit, void *params)
 done:
        if (error != 0) {
                printf("%s failed error %d\n", __func__, error);
-               /* Cleanup TBD */
+               /* TBD: Clean up: sc, sc_rthash etc */
        }
 
        return (error);
@@ -6033,8 +6039,7 @@ bridge_detach(ifnet_t ifp)
        ifnet_release(ifp);
 
        lck_mtx_destroy(&sc->sc_mtx, bridge_lock_grp);
-
-       _FREE(sc, M_DEVBUF);
+       if_clone_softc_deallocate(&bridge_cloner, sc);
 }
 
 /*
index 96f8a3e43cf516dc4f9ebc0dc06d654a7874b513..543a0cb81df7557012833c7d0a8428c6294ee23c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -71,6 +71,7 @@
 #include <net/kpi_protocol.h>
 
 #include <kern/locks.h>
+#include <kern/zalloc.h>
 
 #ifdef INET
 #include <netinet/in.h>
@@ -106,14 +107,24 @@ static int if_fake_debug = 0;
 SYSCTL_INT(_net_link_fake, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
        &if_fake_debug, 0, "Fake interface debug logs");
 
+static int if_fake_wmm_mode = 0;
+SYSCTL_INT(_net_link_fake, OID_AUTO, wmm_mode, CTLFLAG_RW | CTLFLAG_LOCKED,
+       &if_fake_wmm_mode, 0, "Fake interface in 802.11 WMM mode");
+
 /**
  ** virtual ethernet structures, types
  **/
 
+#define        IFF_NUM_TX_RINGS_WMM_MODE       4
+#define        IFF_NUM_RX_RINGS_WMM_MODE       1
+#define        IFF_MAX_TX_RINGS        IFF_NUM_TX_RINGS_WMM_MODE
+#define        IFF_MAX_RX_RINGS        IFF_NUM_RX_RINGS_WMM_MODE
+
 typedef uint16_t       iff_flags_t;
 #define IFF_FLAGS_HWCSUM               0x0001
 #define IFF_FLAGS_BSD_MODE             0x0002
 #define IFF_FLAGS_DETACHING            0x0004
+#define        IFF_FLAGS_WMM_MODE              0x0008
 
 
 struct if_fake {
@@ -169,6 +180,9 @@ feth_enable_dequeue_stall(ifnet_t ifp, uint32_t enable)
        return (error);
 }
 
+
+#define        FETH_MAXUNIT    IF_MAXUNIT
+#define        FETH_ZONE_MAX_ELEM      MIN(IFNETS_MAX, FETH_MAXUNIT)
 #define M_FAKE                 M_DEVBUF
 
 static int feth_clone_create(struct if_clone *, u_int32_t, void *);
@@ -183,10 +197,12 @@ static    void feth_free(if_fake_ref fakeif);
 
 static struct if_clone
 feth_cloner = IF_CLONE_INITIALIZER(FAKE_ETHER_NAME,
-                                  feth_clone_create, 
-                                  feth_clone_destroy, 
-                                  0, 
-                                  IF_MAXUNIT);
+    feth_clone_create,
+    feth_clone_destroy,
+    0,
+    FETH_MAXUNIT,
+    FETH_ZONE_MAX_ELEM,
+    sizeof(struct if_fake));
 static void interface_link_event(ifnet_t ifp, u_int32_t event_code);
 
 /* some media words to pretend to be ethernet */
@@ -280,7 +296,7 @@ feth_free(if_fake_ref fakeif)
        }
 
        FETH_DPRINTF("%s\n", fakeif->iff_name);
-       FREE(fakeif, M_FAKE);
+       if_clone_softc_deallocate(&feth_cloner, fakeif);
 }
 
 static void
@@ -363,7 +379,7 @@ feth_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        ifnet_t                         ifp;
        uint8_t                         mac_address[ETHER_ADDR_LEN];
 
-       fakeif = _MALLOC(sizeof(struct if_fake), M_FAKE, M_WAITOK | M_ZERO);
+       fakeif = if_clone_softc_allocate(&feth_cloner);
        if (fakeif == NULL) {
                return ENOBUFS;
        }
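
The wmm_mode sysctl and ring-count definitions added above size the fake interface for 802.11 WMM operation: four transmit rings (presumably one per WMM access category) and a single receive ring. A small userspace illustration of flipping that switch through the sysctl name derived from SYSCTL_INT(_net_link_fake, OID_AUTO, wmm_mode, ...); the program is not part of the commit and assumes the caller is privileged to set kernel sysctls:

        /* Illustration only: enable net.link.fake.wmm_mode (requires privilege). */
        #include <sys/types.h>
        #include <sys/sysctl.h>
        #include <stdio.h>

        int
        main(void)
        {
                int on = 1;

                if (sysctlbyname("net.link.fake.wmm_mode", NULL, NULL, &on, sizeof(on)) != 0) {
                        perror("sysctlbyname");
                        return 1;
                }
                return 0;
        }
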
index bb9d156d07e0e21dff35ce0408f150c1bc5db278..e0caf004ba54443dd2b13d82445aae0ea4076c5a 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -75,6 +75,7 @@
 #include <sys/syslog.h>
 #include <sys/protosw.h>
 #include <kern/cpu_number.h>
+#include <kern/zalloc.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 
 #define        GIFNAME         "gif"
 #define        GIFDEV          "if_gif"
-#define        GIF_MAXUNIT     0x7fff  /* ifp->if_unit is only 15 bits */
+
+#define        GIF_MAXUNIT     IF_MAXUNIT
+#define        GIF_ZONE_MAX_ELEM       MIN(IFNETS_MAX, GIF_MAXUNIT)
 
 /* gif lock variables */
 static lck_grp_t       *gif_mtx_grp;
@@ -155,12 +158,15 @@ static struct ip6protosw in6_gif_protosw =
 };
 #endif
 
-static if_clone_t gif_cloner = NULL;
+static int gif_remove(struct ifnet *);
 static int gif_clone_create(struct if_clone *, uint32_t, void *);
 static int gif_clone_destroy(struct ifnet *);
 static void gif_delete_tunnel(struct gif_softc *);
 static void gif_detach(struct ifnet *);
 
+static struct if_clone gif_cloner =
+    IF_CLONE_INITIALIZER(GIFNAME, gif_clone_create, gif_clone_destroy,
+        0, GIF_MAXUNIT, GIF_ZONE_MAX_ELEM, sizeof(struct gif_softc));
 /*
  * Theory of operation: initially, one gif interface is created.
  * Any time a gif interface is configured, if there are no other
@@ -251,8 +257,6 @@ void
 gif_init(void)
 {
        errno_t result;
-       struct ifnet_clone_params ifnet_clone_params;
-       struct if_clone *ifc = NULL;
 
        /* Initialize the list of interfaces */
        TAILQ_INIT(&gifs);
@@ -276,17 +280,11 @@ gif_init(void)
                printf("proto_register_plumber failed for AF_INET6 error=%d\n",
                    result);
 
-       ifnet_clone_params.ifc_name = "gif";
-       ifnet_clone_params.ifc_create = gif_clone_create;
-       ifnet_clone_params.ifc_destroy = gif_clone_destroy;
-
-       result = ifnet_clone_attach(&ifnet_clone_params, &gif_cloner);
+       result = if_clone_attach(&gif_cloner);
        if (result != 0)
-               printf("gifattach: ifnet_clone_attach failed %d\n", result);
+               panic("%s: if_clone_attach() failed, error %d\n", __func__, result);
 
-       /* Create first device */
-       ifc = if_clone_lookup("gif", NULL);
-       gif_clone_create(ifc, 0, NULL);
+       gif_clone_create(&gif_cloner, 0, NULL);
 }
 
 static errno_t
@@ -310,7 +308,7 @@ gif_detach(struct ifnet *ifp)
 {
        struct gif_softc *sc = ifp->if_softc;
        lck_mtx_destroy(&sc->gif_lock, gif_mtx_grp);
-       _FREE(ifp->if_softc, M_DEVBUF);
+       if_clone_softc_deallocate(&gif_cloner, sc);
        ifp->if_softc = NULL;
        (void) ifnet_release(ifp);
 }
@@ -330,7 +328,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
                goto done;
        }
 
-       sc = _MALLOC(sizeof (struct gif_softc), M_DEVBUF, M_WAITOK | M_ZERO);
+       sc = if_clone_softc_allocate(&gif_cloner);
        if (sc == NULL) {
                log(LOG_ERR, "gif_clone_create: failed to allocate gif%d\n",
                    unit);
@@ -366,7 +364,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
        error = ifnet_allocate_extended(&gif_init_params, &sc->gif_if);
        if (error != 0) {
                printf("gif_clone_create, ifnet_allocate failed - %d\n", error);
-               _FREE(sc, M_DEVBUF);
+               if_clone_softc_deallocate(&gif_cloner, sc);
                error = ENOBUFS;
                goto done;
        }
@@ -378,7 +376,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
        if (sc->encap_cookie4 == NULL) {
                printf("%s: unable to attach encap4\n", if_name(sc->gif_if));
                ifnet_release(sc->gif_if);
-               FREE(sc, M_DEVBUF);
+               if_clone_softc_deallocate(&gif_cloner, sc);
                error = ENOBUFS;
                goto done;
        }
@@ -393,7 +391,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
                }
                printf("%s: unable to attach encap6\n", if_name(sc->gif_if));
                ifnet_release(sc->gif_if);
-               FREE(sc, M_DEVBUF);
+               if_clone_softc_deallocate(&gif_cloner, sc);
                error = ENOBUFS;
                goto done;
        }
@@ -405,6 +403,7 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
        /* turn off ingress filter */
        sc->gif_if.if_flags  |= IFF_LINK2;
 #endif
+       sc->gif_flags |= IFGIF_DETACHING;
        error = ifnet_attach(sc->gif_if, NULL);
        if (error != 0) {
                printf("gif_clone_create - ifnet_attach failed - %d\n", error);
@@ -417,13 +416,14 @@ gif_clone_create(struct if_clone *ifc, uint32_t unit, __unused void *params)
                        encap_detach(sc->encap_cookie6);
                        sc->encap_cookie6 = NULL;
                }
-               FREE(sc, M_DEVBUF);
+               if_clone_softc_deallocate(&gif_cloner, sc);
                goto done;
        }
 #if CONFIG_MACF_NET
        mac_ifnet_label_init(&sc->gif_if);
 #endif
        bpfattach(sc->gif_if, DLT_NULL, sizeof (u_int));
+       sc->gif_flags &= ~IFGIF_DETACHING;
        TAILQ_INSERT_TAIL(&gifs, sc, gif_link);
        ngif++;
 done:
@@ -433,33 +433,63 @@ done:
 }
 
 static int
-gif_clone_destroy(struct ifnet *ifp)
+gif_remove(struct ifnet *ifp)
 {
-#if defined(INET) || defined(INET6)
        int error = 0;
-#endif
-       struct gif_softc *sc = ifp->if_softc;
+       struct gif_softc *sc = NULL;
 
        lck_mtx_lock(gif_mtx);
+       sc = ifp->if_softc;
+
+       if (sc == NULL) {
+               error = EINVAL;
+               goto done;
+       }
+
+       GIF_LOCK(sc);
+       if (sc->gif_flags & IFGIF_DETACHING) {
+               error = EINVAL;
+               goto done;
+       }
+
+       sc->gif_flags |= IFGIF_DETACHING;
        TAILQ_REMOVE(&gifs, sc, gif_link);
        ngif--;
 
-       GIF_LOCK(sc);
        gif_delete_tunnel(sc);
 #ifdef INET6
        if (sc->encap_cookie6 != NULL) {
                error = encap_detach(sc->encap_cookie6);
-               KASSERT(error == 0, ("gif_clone_destroy: Unexpected     \
-                   error detaching encap_cookie6"));
+               KASSERT(error == 0, ("gif_clone_destroy: Unexpected "
+                   "error detaching encap_cookie6"));
        }
 #endif
 #ifdef INET
        if (sc->encap_cookie4 != NULL) {
                error = encap_detach(sc->encap_cookie4);
-               KASSERT(error == 0, ("gif_clone_destroy: Unexpected     \
-                   error detaching encap_cookie4"));
+               KASSERT(error == 0, ("gif_clone_destroy: Unexpected "
+                   "error detaching encap_cookie4"));
        }
 #endif
+done:
+       if (sc != NULL)
+               GIF_UNLOCK(sc);
+       lck_mtx_unlock(gif_mtx);
+
+       return (error);
+}
+
+static int
+gif_clone_destroy(struct ifnet *ifp)
+{
+       int error = 0;
+
+       error = gif_remove(ifp);
+       if (error != 0) {
+               printf("gif_clone_destroy: gif remove failed %d\n", error);
+               return (error);
+       }
+
        error = ifnet_set_flags(ifp, 0, IFF_UP);
        if (error != 0) {
                printf("gif_clone_destroy: ifnet_set_flags failed %d\n", error);
@@ -469,10 +499,6 @@ gif_clone_destroy(struct ifnet *ifp)
        if (error != 0)
                panic("gif_clone_destroy: ifnet_detach(%p) failed %d\n", ifp,
                    error);
-
-       GIF_UNLOCK(sc);
-       lck_mtx_unlock(gif_mtx);
-
        return (0);
 }
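
gif_clone_destroy() is now split so that gif_remove() marks the softc with IFGIF_DETACHING under gif_mtx and the per-softc lock before unlinking it; a second or concurrent destroy finds the flag already set and returns EINVAL instead of racing the teardown (pflog_remove() further down applies the same idea with IFPFLF_DETACHING). A minimal userspace sketch of that guard, with pthread_mutex standing in for the kernel locks and every name hypothetical:

        /* Sketch of the detach-guard pattern; not taken from the commit. */
        #include <errno.h>
        #include <pthread.h>

        #define MY_DETACHING    0x1     /* plays the role of IFGIF_DETACHING */

        struct my_softc {
                pthread_mutex_t my_mtx;         /* assumed initialized elsewhere; stands in for gif_mtx/GIF_LOCK */
                unsigned int    my_flags;
        };

        /* First caller wins and tears down; later callers get EINVAL. */
        static int
        my_remove(struct my_softc *sc)
        {
                int error = 0;

                pthread_mutex_lock(&sc->my_mtx);
                if (sc->my_flags & MY_DETACHING) {
                        error = EINVAL;         /* teardown already in progress */
                } else {
                        sc->my_flags |= MY_DETACHING;
                        /* ... unlink from the global list, detach encap hooks, etc. ... */
                }
                pthread_mutex_unlock(&sc->my_mtx);
                return (error);
        }
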
 
index 619653bee71d4394edc81533fd0eae9fae559c8c..296cdbf21d66f973c9ddb7867086d8acd7b1acdf 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -87,6 +87,7 @@ struct gif_softc {
 #endif
        } gifsc_gifscr;
        int             gif_flags;
+#define        IFGIF_DETACHING 0x1
        int             gif_called;
        const struct encaptab *encap_cookie4;
        const struct encaptab *encap_cookie6;
index 098420596589e2bbce0b25c30723a6d816219835..22a1441ddba747f442e58e8c6d02bd97ee3dd5b8 100644
@@ -26,6 +26,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
+
 #include <sys/systm.h>
 #include <sys/kern_control.h>
 #include <net/kpi_protocol.h>
@@ -370,6 +371,7 @@ ipsec_interface_isvalid (ifnet_t interface)
     return 1;
 }
 
+#if IPSEC_NEXUS
 boolean_t
 ipsec_interface_needs_netagent(ifnet_t interface)
 {
@@ -387,6 +389,7 @@ ipsec_interface_needs_netagent(ifnet_t interface)
 
        return (pcb->ipsec_needs_netagent == true);
 }
+#endif // IPSEC_NEXUS
 
 static errno_t
 ipsec_ifnet_set_attrs(ifnet_t ifp)
@@ -2072,6 +2075,12 @@ ipsec_enable_channel(struct ipsec_pcb *pcb, struct proc *proc)
        struct kern_pbufpool_init pp_init;
        errno_t result;
 
+       kauth_cred_t cred = kauth_cred_get();
+       result = priv_check_cred(cred, PRIV_SKYWALK_REGISTER_KERNEL_PIPE, 0);
+       if (result) {
+               return result;
+       }
+
        result = ipsec_register_kernel_pipe_nexus();
        if (result) {
                return result;
@@ -2703,7 +2712,7 @@ ipsec_ctl_setopt(__unused kern_ctl_ref    kctlref,
                        if (result == 0) {
                                printf("%s IPSEC_OPT_SET_DELEGATE_INTERFACE %s to %s\n",
                                        __func__, pcb->ipsec_ifp->if_xname, 
-                                       del_ifp->if_xname);
+                                       del_ifp ? del_ifp->if_xname : "NULL");
 
                                result = ifnet_set_delegate(pcb->ipsec_ifp, del_ifp);
                                if (del_ifp)
@@ -3317,12 +3326,14 @@ ipsec_ioctl(ifnet_t interface,
                        u_long command,
                        void *data)
 {
+#if IPSEC_NEXUS
+       struct ipsec_pcb *pcb = ifnet_softc(interface);
+#endif
        errno_t result = 0;
        
        switch(command) {
                case SIOCSIFMTU: {
 #if IPSEC_NEXUS
-                       struct ipsec_pcb *pcb = ifnet_softc(interface);
                        if (pcb->ipsec_use_netif) {
                                // Make sure we can fit packets in the channel buffers
                                if (((uint64_t)((struct ifreq*)data)->ifr_mtu) > pcb->ipsec_slot_size) {
@@ -3430,7 +3441,7 @@ ipsec_attach_proto(ifnet_t                                interface,
 
 errno_t
 ipsec_inject_inbound_packet(ifnet_t    interface,
-                                                       mbuf_t packet)
+                           mbuf_t      packet)
 {
 #if IPSEC_NEXUS
        struct ipsec_pcb *pcb = ifnet_softc(interface);
diff --git a/bsd/net/if_low_power_mode.c b/bsd/net/if_low_power_mode.c
new file mode 100644
index 0000000..aac91d5
--- /dev/null
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/eventhandler.h>
+
+#include <net/dlil.h>
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/nwk_wq.h>
+
+#include <os/log.h>
+
+typedef enum {
+       IF_LOW_POWER_EVENT_OFF = 0,
+       IF_LOW_POWER_EVENT_ON = 1
+} if_low_power_ev_code_t;
+
+struct if_low_power_ev_args {
+       struct ifnet *ifp;
+       if_low_power_ev_code_t event_code;
+};
+
+struct if_low_power_ev_nwk_wq_entry {
+       struct nwk_wq_entry nwk_wqe;
+       struct if_low_power_ev_args ev_args;
+};
+
+
+typedef void (*if_low_power_event_fn) (struct eventhandler_entry_arg,
+    struct ifnet *, if_low_power_ev_code_t);
+EVENTHANDLER_DECLARE(if_low_power_event, if_low_power_event_fn);
+
+struct eventhandler_lists_ctxt if_low_power_evhdlr_ctx;
+
+static void if_low_power_evhdlr_callback(__unused struct eventhandler_entry_arg arg,
+    struct ifnet *ifp, if_low_power_ev_code_t event_code);
+
+#if 0
+static void if_low_power_nwk_ev_callback(void *arg);
+static void if_low_power_event_enqueue_nwk_wq_entry(struct ifnet *ifp,
+    if_low_power_ev_code_t event_code);
+#endif
+
+extern void shutdown_sockets_on_interface(struct ifnet *ifp);
+
+SYSCTL_DECL(_net_link_generic_system);
+SYSCTL_NODE(_net_link_generic_system, OID_AUTO, low_power,
+    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "low power mode");
+
+int if_low_power_verbose = 0;
+int if_low_power_restricted = 1;
+
+#if (DEVELOPMENT || DEBUG)
+SYSCTL_INT(_net_link_generic_system_low_power, OID_AUTO, verbose,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &if_low_power_verbose, 0, "");
+SYSCTL_INT(_net_link_generic_system_low_power, OID_AUTO, restricted,
+    CTLFLAG_RW | CTLFLAG_LOCKED,
+    &if_low_power_restricted, 0, "");
+#endif /* (DEVELOPMENT || DEBUG) */
+
+
+static void
+if_low_power_evhdlr_callback(__unused struct eventhandler_entry_arg arg,
+    struct ifnet *ifp, if_low_power_ev_code_t event_code)
+{
+       struct kev_dl_low_power_mode kev;
+
+       if (!IF_FULLY_ATTACHED(ifp))
+               return;
+
+       if (if_low_power_verbose > 0) {
+               os_log_info(OS_LOG_DEFAULT,
+                   "%s: ifp %s event_code %d", __func__,
+                   if_name(ifp), event_code);
+       }
+
+       ifnet_lock_exclusive(ifp);
+       if (event_code == IF_LOW_POWER_EVENT_OFF) {
+               ifp->if_xflags &= ~IFXF_LOW_POWER;
+       } else {
+               ifp->if_xflags |= IFXF_LOW_POWER;
+       }
+       ifnet_lock_done(ifp);
+
+       if (event_code == IF_LOW_POWER_EVENT_ON) {
+               atomic_add_32(&ifp->if_low_power_gencnt, 1);
+
+               if (if_low_power_restricted != 0) {
+                       shutdown_sockets_on_interface(ifp);
+                       intf_event_enqueue_nwk_wq_entry(ifp, NULL,
+                           INTF_EVENT_CODE_LOW_POWER_UPDATE);
+               }
+       }
+
+       bzero(&kev, sizeof(struct kev_dl_low_power_mode));
+       kev.low_power_event = event_code;
+       dlil_post_msg(ifp,
+           KEV_DL_SUBCLASS,
+           KEV_DL_LOW_POWER_MODE_CHANGED,
+           (struct net_event_data *)&kev,
+           sizeof(struct kev_dl_low_power_mode));
+}
+
+void
+if_low_power_evhdlr_init(void)
+{
+       eventhandler_lists_ctxt_init(&if_low_power_evhdlr_ctx);
+
+       (void) EVENTHANDLER_REGISTER(&if_low_power_evhdlr_ctx,
+           if_low_power_event,
+           if_low_power_evhdlr_callback, 
+           eventhandler_entry_dummy_arg,
+           EVENTHANDLER_PRI_ANY);
+}
+
+#if 0
+static void
+if_low_power_nwk_ev_callback(void *arg)
+{
+       struct if_low_power_ev_args *if_low_power_ev_args =
+           (struct if_low_power_ev_args *)arg;
+       
+       EVENTHANDLER_INVOKE(&if_low_power_evhdlr_ctx,
+           if_low_power_event,
+           if_low_power_ev_args->ifp,
+           if_low_power_ev_args->event_code);
+}
+
+static void
+if_low_power_event_enqueue_nwk_wq_entry(struct ifnet *ifp,
+    if_low_power_ev_code_t event_code)
+{
+       struct if_low_power_ev_nwk_wq_entry *event_nwk_wq_entry = NULL;
+
+       MALLOC(event_nwk_wq_entry, struct if_low_power_ev_nwk_wq_entry *,
+           sizeof(struct if_low_power_ev_nwk_wq_entry),
+           M_NWKWQ, M_WAITOK | M_ZERO);
+
+       event_nwk_wq_entry->ev_args.ifp = ifp;
+       event_nwk_wq_entry->ev_args.event_code = event_code;
+
+       event_nwk_wq_entry->nwk_wqe.func = if_low_power_nwk_ev_callback;
+       event_nwk_wq_entry->nwk_wqe.is_arg_managed = TRUE;
+       event_nwk_wq_entry->nwk_wqe.arg = &event_nwk_wq_entry->ev_args;
+
+       nwk_wq_enqueue((struct nwk_wq_entry*)event_nwk_wq_entry);
+}
+#endif
+
+int
+if_set_low_power(ifnet_t ifp, bool on)
+{
+       int error = 0;
+
+       if (ifp == NULL)
+               return (EINVAL);
+
+       os_log(OS_LOG_DEFAULT,
+           "%s: ifp %s low_power mode %d", __func__, if_name(ifp), on);
+
+       ifnet_lock_exclusive(ifp);
+       ifp->if_xflags = on ? (ifp->if_xflags | IFXF_LOW_POWER) :
+           (ifp->if_xflags & ~IFXF_LOW_POWER);
+       ifnet_lock_done(ifp);
+
+       return (error);
+}
+
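
In the new file above, if_low_power_evhdlr_callback() is what actually reacts to a low-power event: it toggles IFXF_LOW_POWER under the ifnet lock, bumps if_low_power_gencnt, shuts down sockets on the interface when if_low_power_restricted is set, and posts KEV_DL_LOW_POWER_MODE_CHANGED. The exported if_set_low_power() (and the ifnet_set_low_power_mode() KPI added in kpi_interface.c further down) only flips the flag. A kernel-context sketch of a driver using those KPIs; my_driver_set_low_power and my_ifp are hypothetical, and only the two ifnet_*_low_power_mode() calls come from this commit:

        #include <net/kpi_interface.h>

        static errno_t
        my_driver_set_low_power(ifnet_t my_ifp, boolean_t on)
        {
                errno_t         error;
                boolean_t       now = FALSE;

                error = ifnet_set_low_power_mode(my_ifp, on);   /* sets or clears IFXF_LOW_POWER */
                if (error != 0)
                        return (error);

                error = ifnet_get_low_power_mode(my_ifp, &now); /* EINVAL only for NULL arguments */
                return (error);
        }
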
index 1c9113d70647976823a6c662d8e8c619dfe02719..cbed433abe4f159f7dd0d562005d8677ac9bf1c0 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -70,6 +70,8 @@
 #include <sys/ioctl.h>
 #include <sys/mcache.h>
 
+#include <kern/zalloc.h>
+
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_types.h>
 #define DPRINTF(x)
 #endif
 
+static int pflog_remove(struct ifnet *);
 static int pflog_clone_create(struct if_clone *, u_int32_t, void *);
 static int pflog_clone_destroy(struct ifnet *);
 static errno_t pflogoutput(struct ifnet *, struct mbuf *);
@@ -116,7 +119,7 @@ static void pflogfree(struct ifnet *);
 static LIST_HEAD(, pflog_softc)        pflogif_list;
 static struct if_clone pflog_cloner =
     IF_CLONE_INITIALIZER(PFLOGNAME, pflog_clone_create, pflog_clone_destroy,
-        0, (PFLOGIFS_MAX - 1));
+        0, (PFLOGIFS_MAX - 1), PFLOGIF_ZONE_MAX_ELEM, sizeof(struct pflog_softc));
 
 struct ifnet *pflogifs[PFLOGIFS_MAX];  /* for fast access */
 
@@ -146,8 +149,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
                /* NOTREACHED */
        }
 
-       if ((pflogif = _MALLOC(sizeof (*pflogif),
-           M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
+       if ((pflogif = if_clone_softc_allocate(&pflog_cloner)) == NULL) {
                error = ENOMEM;
                goto done;
        }
@@ -170,11 +172,12 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
 
        bzero(pflogif, sizeof (*pflogif));
        pflogif->sc_unit = unit;
+       pflogif->sc_flags |= IFPFLF_DETACHING;
 
        error = ifnet_allocate_extended(&pf_init, &pflogif->sc_if);
        if (error != 0) {
                printf("%s: ifnet_allocate failed - %d\n", __func__, error);
-               _FREE(pflogif, M_DEVBUF);
+               if_clone_softc_deallocate(&pflog_cloner, pflogif);
                goto done;
        }
 
@@ -185,7 +188,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        if (error != 0) {
                printf("%s: ifnet_attach failed - %d\n", __func__, error);
                ifnet_release(pflogif->sc_if);
-               _FREE(pflogif, M_DEVBUF);
+               if_clone_softc_deallocate(&pflog_cloner, pflogif);
                goto done;
        }
 
@@ -197,6 +200,7 @@ pflog_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        lck_mtx_lock(pf_lock);
        LIST_INSERT_HEAD(&pflogif_list, pflogif, sc_list);
        pflogifs[unit] = pflogif->sc_if;
+       pflogif->sc_flags &= ~IFPFLF_DETACHING;
        lck_mtx_unlock(pf_lock);
        lck_rw_done(pf_perim_lock);
 
@@ -205,21 +209,40 @@ done:
 }
 
 static int
-pflog_clone_destroy(struct ifnet *ifp)
+pflog_remove(struct ifnet *ifp)
 {
-       struct pflog_softc *pflogif = ifp->if_softc;
+       int error = 0;
+       struct pflog_softc *pflogif = NULL;
 
        lck_rw_lock_shared(pf_perim_lock);
        lck_mtx_lock(pf_lock);
-       pflogifs[pflogif->sc_unit] = NULL;
+       pflogif = ifp->if_softc;
+
+       if (pflogif == NULL ||
+           (pflogif->sc_flags & IFPFLF_DETACHING) != 0) {
+               error = EINVAL;
+               goto done;
+       }
+
+       pflogif->sc_flags |= IFPFLF_DETACHING;
        LIST_REMOVE(pflogif, sc_list);
+done:
        lck_mtx_unlock(pf_lock);
        lck_rw_done(pf_perim_lock);
+       return error;
+}
 
-       /* bpfdetach() is taken care of as part of interface detach */
-       (void) ifnet_detach(ifp);
+static int
+pflog_clone_destroy(struct ifnet *ifp)
+{
+       int error = 0;
 
-       return 0;
+       if ((error = pflog_remove(ifp)) != 0)
+               goto done;
+       /* bpfdetach() is taken care of as part of interface detach */
+       (void)ifnet_detach(ifp);
+done:
+       return (error);
 }
 
 static errno_t
@@ -278,7 +301,7 @@ pflogdelproto(struct ifnet *ifp, protocol_family_t pf)
 static void
 pflogfree(struct ifnet *ifp)
 {
-       _FREE(ifp->if_softc, M_DEVBUF);
+       if_clone_softc_deallocate(&pflog_cloner, ifp->if_softc);
        ifp->if_softc = NULL;
        (void) ifnet_release(ifp);
 }
index 77f6fa5dfa6e9d3974ad69aaa1fb5d80dd01e809..1ebfb6bb69378972c28f357c293c6b36f4da6a40 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 extern "C" {
 #endif
 
-#define        PFLOGIFS_MAX    16
+#define        PFLOGIFS_MAX            16
+#define        PFLOGIF_ZONE_MAX_ELEM           MIN(IFNETS_MAX, PFLOGIFS_MAX)
 
 #if KERNEL_PRIVATE
 struct pflog_softc {
        struct ifnet            *sc_if;         /* back ptr to interface */
+       u_int32_t               sc_flags;
+#define        IFPFLF_DETACHING        0x1
        int                     sc_unit;
        LIST_ENTRY(pflog_softc) sc_list;
 };
index f5f7f9c11d32198e0a1576b0680b31d9e362f810..643df3b2d5de1787e424c8b716c6393b2778ca66 100644
 #include <sys/kauth.h>
 #include <sys/bitstring.h>
 #include <sys/priv.h>
+#include <sys/protosw.h>
 #include <sys/socket.h>
 
 #include <kern/locks.h>
 #include <kern/zalloc.h>
 
 #include <libkern/libkern.h>
-#include <mach/branch_predicates.h>
 
 #include <net/kpi_interface.h>
 #include <net/if_var.h>
@@ -461,13 +461,13 @@ sysctl_wakeuuid_not_set_last_time SYSCTL_HANDLER_ARGS
 #pragma unused(oidp, arg1, arg2)
 
        if (proc_is64bit(req->p)) {
-               struct user64_timeval tv;
+               struct user64_timeval tv = {};
 
                tv.tv_sec = wakeuuid_not_set_last_time.tv_sec;
                tv.tv_usec = wakeuuid_not_set_last_time.tv_usec;
                return SYSCTL_OUT(req, &tv, sizeof(tv));
        } else {
-               struct user32_timeval tv;
+               struct user32_timeval tv = {};
 
                tv.tv_sec = wakeuuid_not_set_last_time.tv_sec;
                tv.tv_usec = wakeuuid_not_set_last_time.tv_usec;
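
The two "= {}" initializers above zero the user64_timeval/user32_timeval structures on the stack before SYSCTL_OUT() copies them to the caller; without that, struct padding or any unassigned bytes would leak kernel stack contents to userspace. A small userspace illustration of the same idiom, with hypothetical names and memcpy standing in for SYSCTL_OUT:

        #include <string.h>

        struct wire_timeval {
                long long       tv_sec;
                int             tv_usec;        /* leaves trailing padding on LP64 */
        };

        /* Fill a wire struct without handing out stale stack bytes. */
        static void
        fill_wire_timeval(struct wire_timeval *out, long long sec, int usec)
        {
                struct wire_timeval tv = {0};   /* zero the struct up front, mirroring tv = {} above */

                tv.tv_sec = sec;
                tv.tv_usec = usec;
                memcpy(out, &tv, sizeof(tv));   /* the kernel does SYSCTL_OUT(req, &tv, sizeof(tv)) */
        }
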
index c4fce0074db32d643b705605b73096c021a80760..2d5f2e90bd0b3ff94d0fa27d050220ef6e0a5813 100644
@@ -317,7 +317,7 @@ stfattach(void)
        stfinit();
 
        error = proto_register_plumber(PF_INET6, APPLE_IF_FAM_STF,
-                                                                  stf_attach_inet6, NULL);
+           stf_attach_inet6, NULL);
        if (error != 0)
                printf("proto_register_plumber failed for AF_INET6 error=%d\n", error);
 
index 225d9c21cc4481a2e2d9c4de8f7005a0a44246c6..b0e7116941a6fa2da480d303d4861a30b1b99bd4 100644
@@ -1285,6 +1285,12 @@ utun_enable_channel(struct utun_pcb *pcb, struct proc *proc)
        struct kern_pbufpool_init pp_init;
        errno_t result;
 
+       kauth_cred_t cred = kauth_cred_get();
+       result = priv_check_cred(cred, PRIV_SKYWALK_REGISTER_KERNEL_PIPE, 0);
+       if (result) {
+               return result;
+       }
+
        result = utun_register_kernel_pipe_nexus();
        if (result) {
                return result;
@@ -2488,6 +2494,12 @@ utun_demux(__unused ifnet_t interface,
                   __unused char *frame_header,
                   protocol_family_t *protocol)
 {
+#if UTUN_NEXUS
+       struct utun_pcb *pcb = ifnet_softc(interface);
+       struct ip *ip;
+       u_int ip_version;
+#endif
+
        while (data != NULL && mbuf_len(data) < 1) {
                data = mbuf_next(data);
        }
@@ -2497,10 +2509,6 @@ utun_demux(__unused ifnet_t interface,
        }
 
 #if UTUN_NEXUS
-       struct utun_pcb *pcb = ifnet_softc(interface);
-       struct ip *ip;
-       u_int ip_version;
-
        if (pcb->utun_use_netif) {
                ip = mtod(data, struct ip *);
                ip_version = ip->ip_v;
@@ -2538,14 +2546,14 @@ utun_framer(ifnet_t interface,
        VERIFY(interface == pcb->utun_ifp);
 
        u_int32_t header_length = UTUN_HEADER_SIZE(pcb);
-    if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) {
+       if (mbuf_prepend(packet, header_length, MBUF_DONTWAIT) != 0) {
                printf("utun_framer - ifnet_output prepend failed\n");
 
                ifnet_stat_increment_out(interface, 0, 0, 1);
 
                // just return, because the buffer was freed in mbuf_prepend
-        return EJUSTRETURN;    
-    }
+               return EJUSTRETURN;     
+       }
        if (prepend_len != NULL) {
                *prepend_len = header_length;
        }
@@ -2553,8 +2561,8 @@ utun_framer(ifnet_t interface,
                *postpend_len = 0;
        }
        
-    // place protocol number at the beginning of the mbuf
-    *(protocol_family_t *)mbuf_data(*packet) = *(protocol_family_t *)(uintptr_t)(size_t)frame_type;
+       // place protocol number at the beginning of the mbuf
+       *(protocol_family_t *)mbuf_data(*packet) = *(protocol_family_t *)(uintptr_t)(size_t)frame_type;
 
 
     return 0;
@@ -2590,12 +2598,14 @@ utun_ioctl(ifnet_t interface,
                   u_long command,
                   void *data)
 {
+#if UTUN_NEXUS
+       struct utun_pcb *pcb = ifnet_softc(interface);
+#endif
        errno_t result = 0;
-
+       
        switch(command) {
                case SIOCSIFMTU: {
 #if UTUN_NEXUS
-                       struct utun_pcb *pcb = ifnet_softc(interface);
                        if (pcb->utun_use_netif) {
                                // Make sure we can fit packets in the channel buffers
                                // Allow for the headroom in the slot
index 379b792ad42a72846689b3db900e0f05d4bf23e0..1c69489b5a05dafdc93d163072f667686b5fe4d7 100644
@@ -75,7 +75,7 @@
 #ifdef PRIVATE
 #include <net/route.h>
 #endif
-#ifdef BSD_KERN_PRIVATE
+#ifdef BSD_KERNEL_PRIVATE
 #include <sys/eventhandler.h>
 #endif
 
@@ -649,6 +649,8 @@ struct      ifqueue {
 };
 
 #ifdef BSD_KERNEL_PRIVATE
+#define        IFNETS_MAX      64
+
 /*
  * Internal storage of if_data. This is bound to change. Various places in the
  * stack will translate this data structure in to the externally visible
@@ -996,6 +998,8 @@ struct ifnet {
        uuid_t                  *if_agentids;   /* network agents attached to interface */
        u_int32_t               if_agentcount;
 
+       volatile uint32_t       if_low_power_gencnt;
+
        u_int32_t               if_generation;  /* generation to use with NECP clients */
        u_int32_t               if_fg_sendts;   /* last send on a fg socket in seconds */
 
@@ -1036,6 +1040,7 @@ typedef enum {
        INTF_EVENT_CODE_IPADDR_DETACHED,
        INTF_EVENT_CODE_LLADDR_UPDATE,
        INTF_EVENT_CODE_MTU_CHANGED,
+       INTF_EVENT_CODE_LOW_POWER_UPDATE,
 } intf_event_code_t;
 
 typedef void (*ifnet_event_fn)(struct eventhandler_entry_arg, struct ifnet *, struct sockaddr *, intf_event_code_t);
@@ -1072,20 +1077,34 @@ EVENTHANDLER_DECLARE(ifnet_event, ifnet_event_fn);
  */
 struct if_clone {
        LIST_ENTRY(if_clone) ifc_list;  /* on list of cloners */
+       decl_lck_mtx_data(, ifc_mutex); /* To serialize clone create/delete */
        const char      *ifc_name;      /* name of device, e.g. `vlan' */
        size_t          ifc_namelen;    /* length of name */
        u_int32_t       ifc_minifs;     /* minimum number of interfaces */
        u_int32_t       ifc_maxunit;    /* maximum unit number */
        unsigned char   *ifc_units;     /* bitmap to handle units */
        u_int32_t       ifc_bmlen;      /* bitmap length */
-
+       u_int32_t       ifc_zone_max_elem;      /* Max elements for this zone type */
+       u_int32_t       ifc_softc_size; /* size of softc for the device */
+       struct zone     *ifc_zone;      /* if_clone allocation zone */
        int             (*ifc_create)(struct if_clone *, u_int32_t, void *);
        int             (*ifc_destroy)(struct ifnet *);
 };
 
-#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit) {       \
-       { NULL, NULL }, name, (sizeof (name) - 1), minifs, maxunit, NULL, 0,  \
-       create, destroy                                                       \
+#define IF_CLONE_INITIALIZER(name, create, destroy, minifs, maxunit, zone_max_elem, softc_size) {      \
+       .ifc_list = { NULL, NULL },                                                                     \
+       .ifc_mutex = {},                                                                                \
+       .ifc_name = name,                                                                               \
+       .ifc_namelen =  (sizeof (name) - 1),                                                            \
+       .ifc_minifs = minifs,                                                                           \
+       .ifc_maxunit = maxunit,                                                                         \
+       .ifc_units = NULL,                                                                              \
+       .ifc_bmlen = 0,                                                                                 \
+       .ifc_zone_max_elem = zone_max_elem,                                                             \
+       .ifc_softc_size = softc_size,                                                                   \
+       .ifc_zone = NULL,                                                                               \
+       .ifc_create = create,                                                                           \
+       .ifc_destroy = destroy                                                                          \
 }
 
 #define M_CLONE         M_IFADDR
@@ -1381,6 +1400,12 @@ struct ifmultiaddr {
        ((_ifp)->if_eflags & IFEF_EXPENSIVE ||                          \
        (_ifp)->if_delegated.expensive)
 
+#define        IFNET_IS_LOW_POWER(_ifp)                                        \
+       (if_low_power_restricted != 0 &&                                \
+       ((_ifp)->if_xflags & IFXF_LOW_POWER) ||                         \
+       ((_ifp)->if_delegated.ifp != NULL &&                            \
+       ((_ifp)->if_delegated.ifp->if_xflags & IFXF_LOW_POWER)))
+
 /*
  * We don't support AWDL interface delegation.
  */
@@ -1430,7 +1455,8 @@ extern void if_qflush_sc(struct ifnet *, mbuf_svc_class_t, u_int32_t,
 extern struct if_clone *if_clone_lookup(const char *, u_int32_t *);
 extern int if_clone_attach(struct if_clone *);
 extern void if_clone_detach(struct if_clone *);
-
+extern void *if_clone_softc_allocate(const struct if_clone *);
+extern void if_clone_softc_deallocate(const struct if_clone *, void *);
 extern u_int32_t if_functional_type(struct ifnet *, bool);
 
 extern errno_t if_mcasts_update(struct ifnet *);
@@ -1794,6 +1820,12 @@ __private_extern__ void ifnet_enqueue_multi_setup(struct ifnet *, uint16_t,
     uint16_t);
 __private_extern__ errno_t ifnet_enqueue_mbuf(struct ifnet *, struct mbuf *,
     boolean_t, boolean_t *);
+
+extern int if_low_power_verbose;
+extern int if_low_power_restricted;
+extern void if_low_power_evhdlr_init(void);
+extern int if_set_low_power(struct ifnet *, bool);
+
 #endif /* BSD_KERNEL_PRIVATE */
 #ifdef XNU_KERNEL_PRIVATE
 /* for uuid.c */
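
struct if_clone now records ifc_softc_size, ifc_zone_max_elem and a per-cloner ifc_zone, and IF_CLONE_INITIALIZER grows two matching arguments; that is why every cloner touched by this commit (bridge, feth, gif, pflog, vlan, iptap) switches from _MALLOC/_FREE to if_clone_softc_allocate()/if_clone_softc_deallocate(). A hedged, kernel-context sketch of a cloner adopting the new signature; the "myif" names are hypothetical, and the assumption that the allocator hands back zero-filled memory (like the M_ZERO allocations it replaces) is not visible in this excerpt:

        /* Hypothetical cloner using the seven-argument IF_CLONE_INITIALIZER. */
        /* Kernel context: <sys/param.h> and <net/if_var.h> assumed included. */
        #define MYIF_NAME               "myif"
        #define MYIF_MAXUNIT            IF_MAXUNIT
        #define MYIF_ZONE_MAX_ELEM      MIN(IFNETS_MAX, MYIF_MAXUNIT)

        struct myif_softc {
                struct ifnet    *myif_ifp;
        };

        static int myif_clone_create(struct if_clone *, u_int32_t, void *);
        static int myif_clone_destroy(struct ifnet *);

        static struct if_clone myif_cloner = IF_CLONE_INITIALIZER(MYIF_NAME,
            myif_clone_create,
            myif_clone_destroy,
            0,                          /* minimum number of interfaces */
            MYIF_MAXUNIT,
            MYIF_ZONE_MAX_ELEM,         /* sizes the backing zone */
            sizeof(struct myif_softc)); /* element size for that zone */

        static int
        myif_clone_create(struct if_clone *ifc, u_int32_t unit, void *params)
        {
        #pragma unused(ifc, unit, params)
                struct myif_softc *sc;

                sc = if_clone_softc_allocate(&myif_cloner);     /* zone-backed softc */
                if (sc == NULL)
                        return (ENOBUFS);
                /* ... ifnet_allocate_extended()/ifnet_attach() as in the drivers above ... */
                return (0);
        }

        static int
        myif_clone_destroy(struct ifnet *ifp)
        {
                if_clone_softc_deallocate(&myif_cloner, ifp->if_softc);
                ifp->if_softc = NULL;
                return (0);
        }
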
index 5c3535e68fc79838ae0de3ebd076f832550c2c34..8d26fab65144c198fa0a035d2a5a0b9fdcc96c50 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -97,6 +97,7 @@
 #include <net/kpi_protocol.h>
 
 #include <kern/locks.h>
+#include <kern/zalloc.h>
 
 #ifdef INET
 #include <netinet/in.h>
@@ -369,6 +370,8 @@ SYSCTL_NODE(_net_link, IFT_L2VLAN, vlan, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "IEEE 802
 SYSCTL_NODE(_net_link_vlan, PF_LINK, link, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "for consistency");
 #endif
 
+#define        VLAN_UNITMAX    IF_MAXUNIT
+#define        VLAN_ZONE_MAX_ELEM      MIN(IFNETS_MAX, VLAN_UNITMAX)
 #define M_VLAN                 M_DEVBUF
 
 static int vlan_clone_create(struct if_clone *, u_int32_t, void *);
@@ -386,10 +389,12 @@ static    void vlan_if_free(struct ifnet * ifp);
 static         int vlan_remove(ifvlan_ref ifv, int need_to_wait);
 
 static struct if_clone vlan_cloner = IF_CLONE_INITIALIZER(VLANNAME,
-                                                         vlan_clone_create, 
-                                                         vlan_clone_destroy, 
-                                                         0, 
-                                                         IF_MAXUNIT);
+    vlan_clone_create,
+    vlan_clone_destroy,
+    0,
+    VLAN_UNITMAX,
+    VLAN_ZONE_MAX_ELEM,
+    sizeof(struct ifvlan));
 static void interface_link_event(struct ifnet * ifp, u_int32_t event_code);
 static void vlan_parent_link_event(struct ifnet * p,
                                    u_int32_t event_code);
@@ -429,7 +434,7 @@ ifvlan_release(ifvlan_ref ifv)
            printf("ifvlan_release(%s)\n", ifv->ifv_name);
        }
        ifv->ifv_signature = 0;
-       FREE(ifv, M_VLAN);
+       if_clone_softc_deallocate(&vlan_cloner, ifv);
        break;
     default:
        break;
@@ -937,7 +942,7 @@ vlan_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        if (error != 0) {
                return (error);
        }
-       ifv = _MALLOC(sizeof(struct ifvlan), M_VLAN, M_WAITOK | M_ZERO);
+       ifv = if_clone_softc_allocate(&vlan_cloner);
        if (ifv == NULL)
                return ENOBUFS;
        ifv->ifv_retain_count = 1;
index a4c2cabdbde6d303ae2cdb91483a877709c5815d..06fed0e3a05894298b236dddc0d6c9f584b4273f 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -27,6 +27,7 @@
  */
 
 #include <kern/locks.h>
+#include <kern/zalloc.h>
 
 #include <sys/types.h>
 #include <sys/kernel_types.h>
@@ -107,12 +108,17 @@ static ipfilter_t iptap_ipf4, iptap_ipf6;
 
 void iptap_bpf_tap(struct mbuf *m, u_int32_t proto,  int outgoing);
 
+#define        IPTAP_MAXUNIT   IF_MAXUNIT
+#define        IPTAP_ZONE_MAX_ELEM     MIN(IFNETS_MAX, IPTAP_MAXUNIT)
+
 static struct if_clone iptap_cloner = 
        IF_CLONE_INITIALIZER(IPTAP_IFNAME, 
                iptap_clone_create, 
                iptap_clone_destroy,
                0, 
-               IF_MAXUNIT);
+               IPTAP_MAXUNIT,
+               IPTAP_ZONE_MAX_ELEM,
+               sizeof(struct iptap_softc));
 
 SYSCTL_DECL(_net_link);
 SYSCTL_NODE(_net_link, OID_AUTO, iptap, CTLFLAG_RW|CTLFLAG_LOCKED, 0,
@@ -189,7 +195,7 @@ iptap_clone_create(struct if_clone *ifc, u_int32_t unit, void *params)
        struct iptap_softc *iptap = NULL;
        struct ifnet_init_eparams if_init;
        
-       iptap = _MALLOC(sizeof(struct iptap_softc), M_DEVBUF, M_WAITOK | M_ZERO);
+       iptap = if_clone_softc_allocate(&iptap_cloner);
        if (iptap == NULL) {
                printf("%s: _MALLOC failed\n", __func__);
                error = ENOMEM;
@@ -253,7 +259,7 @@ iptap_clone_create(struct if_clone *ifc, u_int32_t unit, void *params)
 done:
        if (error != 0) {
                if (iptap != NULL)
-                       _FREE(iptap, M_DEVBUF);
+                       if_clone_softc_deallocate(&iptap_cloner, iptap);
        }
        return (error);
 }
@@ -445,7 +451,7 @@ done:
 __private_extern__ void
 iptap_detach(ifnet_t ifp)
 {
-       struct iptap_softc *iptap;
+       struct iptap_softc *iptap = NULL;
        
        iptap_lock_exclusive();
 
@@ -460,8 +466,7 @@ iptap_detach(ifnet_t ifp)
 
        /* Drop reference as it's no more on the global list */
        ifnet_release(ifp);
-       
-       _FREE(iptap, M_DEVBUF);
+       if_clone_softc_deallocate(&iptap_cloner, iptap);
 
        /* This is for the reference taken by ifnet_attach() */
        (void) ifnet_release(ifp);
index 2109e94cbe7beefa30665dbba65c0d3e2b3d5301..b0c1e35319c797aa3318caef91e70c34cb4ca1d0 100644
@@ -2861,6 +2861,10 @@ ifnet_get_keepalive_offload_frames(ifnet_t ifp,
        if (frames_array_count == 0)
                return (0);
 
+       /* Keep-alive offload not required for CLAT interface */
+       if (IS_INTF_CLAT46(ifp))
+               return (0);
+
        for (i = 0; i < frames_array_count; i++) {
                struct ifnet_keepalive_offload_frame *frame = frames_array + i;
 
@@ -3128,3 +3132,24 @@ ifnet_normalise_unsent_data(void)
        }
        ifnet_head_done();
 }
+
+errno_t
+ifnet_set_low_power_mode(ifnet_t ifp, boolean_t on)
+{
+       errno_t error;
+
+       error = if_set_low_power(ifp, on);
+
+       return (error);
+}
+
+errno_t
+ifnet_get_low_power_mode(ifnet_t ifp, boolean_t *on)
+{
+       if (ifp == NULL || on == NULL)
+               return (EINVAL);
+
+       *on  = !!(ifp->if_xflags & IFXF_LOW_POWER);
+
+       return (0);
+}
index 0dd25f44cc7550888b819b4e922c8c072f879d98..ba4736d593c25438fa65c25a9ca1480720c864b6 100644
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2004-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2004-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -3133,7 +3133,7 @@ typedef errno_t (*ifnet_clone_destroy_func)(ifnet_t interface);
        @field ifc_destroy The function to destroy an interface.
 */
 struct ifnet_clone_params {
-       const char                                      *ifc_name;
+       const char                      *ifc_name;
        ifnet_clone_create_func         ifc_create;
        ifnet_clone_destroy_func        ifc_destroy;
 };
@@ -3556,6 +3556,27 @@ extern errno_t ifnet_get_buffer_status(const ifnet_t interface,
  */
 extern void ifnet_normalise_unsent_data(void);
 
+/*************************************************************************/
+/* Low Power Mode                                                        */
+/*************************************************************************/
+
+/*!
+       @function ifnet_set_low_power_mode
+       @param interface The interface.
+       @param on Set the truth value that the interface is in low power mode.
+       @result Returns 0 on success, error number otherwise.
+ */
+extern errno_t ifnet_set_low_power_mode(ifnet_t interface, boolean_t on);
+
+/*!
+       @function ifnet_get_low_power_mode
+       @param interface The interface.
+       @param on On output contains the truth value that the interface
+               is in low power mode.
+       @result Returns 0 on success, error number otherwise.
+ */
+extern errno_t ifnet_get_low_power_mode(ifnet_t interface, boolean_t *on);
+
 /*!
  @function ifnet_touch_lastupdown
  @discussion Updates the lastupdown value to now.
index c6314269d24b257b5de835cd62a0c258e57fcaab..6265a4b4bb53c58629a69fbee8d5b5582ea70b42 100644
@@ -266,6 +266,9 @@ proto_input(protocol_family_t protocol, mbuf_t packet_list)
                        break;
        }
 
+       if (entry == NULL)
+               return (-1);
+
        if (entry->domain && !(entry->domain->dom_flags & DOM_REENTRANT)) {
                lck_mtx_lock(entry->domain->dom_mtx);
                locked = 1;
diff --git a/bsd/net/nat464_utils.c b/bsd/net/nat464_utils.c
new file mode 100644
index 0000000..495ac6c
--- /dev/null
@@ -0,0 +1,1212 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2013 Henning Brauer
+ * NAT64 - Copyright (c) 2010 Viagenie Inc. (http://www.viagenie.ca)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/mbuf.h>
+
+#include <net/if.h>
+#include <net/if_types.h>
+#include <net/dlil.h>
+#include <net/nat464_utils.h>
+#include <net/nwk_wq.h>
+
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/ip_var.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/in_pcb.h>
+#include <netinet/icmp_var.h>
+#include <netinet/icmp6.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <os/log.h>
+
+int clat_debug = 0;
+
+os_log_t nat_log_handle;
+
+static void
+nat464_addr_cksum_fixup(uint16_t *, struct nat464_addr *, struct nat464_addr *,
+    protocol_family_t, protocol_family_t, uint8_t, boolean_t);
+
+/* Synthesize ipv6 from ipv4 */
+int
+nat464_synthesize_ipv6(ifnet_t ifp, const struct in_addr *addrv4, struct in6_addr *addr)
+{
+       static const struct in6_addr well_known_prefix = {
+               .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
+                                        0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                        0x00, 0x00, 0x00, 0x00},
+       };
+
+       struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
+       int error = 0, i = 0;
+       /* Below call is not optimized as it creates a copy of prefixes */
+       if ((error = ifnet_get_nat64prefix(ifp, nat64prefixes)) != 0)
+               return (error);
+
+       for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
+               if (nat64prefixes[i].prefix_len != 0)
+                       break;
+       }
+
+       VERIFY (i < NAT64_MAX_NUM_PREFIXES);
+
+       struct in6_addr prefix = nat64prefixes[i].ipv6_prefix;
+       int prefix_len = nat64prefixes[i].prefix_len;
+
+       char *ptrv4 = __DECONST(char *, addrv4);
+       char *ptr = __DECONST(char *, addr);
+
+       if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
+           IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
+           IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
+           IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
+           IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
+           IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
+           INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
+               return (-1);
+       }
+
+       /* Check for the well-known prefix */
+       if (prefix_len == NAT64_PREFIX_LEN_96 &&
+           IN6_ARE_ADDR_EQUAL(&prefix, &well_known_prefix)) { // https://tools.ietf.org/html/rfc6052#section-3.1
+               if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
+                   IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space
+                       return (-1);
+       }
+
+       memcpy(ptr, (char *)&prefix, prefix_len);
+
+       switch (prefix_len) {
+               case NAT64_PREFIX_LEN_96:
+                       memcpy(ptr + 12, ptrv4, 4);
+                       break;
+               case NAT64_PREFIX_LEN_64:
+                       memcpy(ptr + 9, ptrv4, 4);
+                       break;
+               case NAT64_PREFIX_LEN_56:
+                       memcpy(ptr + 7, ptrv4, 1);
+                       memcpy(ptr + 9, ptrv4 + 1, 3);
+                       break;
+               case NAT64_PREFIX_LEN_48:
+                       memcpy(ptr + 6, ptrv4, 2);
+                       memcpy(ptr + 9, ptrv4 + 2, 2);
+                       break;
+               case NAT64_PREFIX_LEN_40:
+                       memcpy(ptr + 5, ptrv4, 3);
+                       memcpy(ptr + 9, ptrv4 + 3, 1);
+                       break;
+               case NAT64_PREFIX_LEN_32:
+                       memcpy(ptr + 4, ptrv4, 4);
+                       break;
+               default:
+                       panic("NAT64-prefix len is wrong: %u\n", prefix_len);
+       }
+
+       if (clat_debug) {
+               char buf[MAX_IPv6_STR_LEN];
+               clat_log2((LOG_DEBUG, "%s synthesized  %s\n", __func__,
+                   inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf))));
+       }
+
+       return (error);
+}
+
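
For the common /96 case (including the RFC 6052 well-known prefix 64:ff9b::/96 that the function checks for), the switch above simply drops the four IPv4 bytes into the last four bytes of the IPv6 address; the other prefix lengths split them around byte 8. A runnable userspace illustration of the /96 embedding, not part of the commit:

        /* Embed an IPv4 address into a /96 NAT64 prefix (RFC 6052). */
        #include <arpa/inet.h>
        #include <netinet/in.h>
        #include <stdio.h>
        #include <string.h>

        int
        main(void)
        {
                struct in_addr  v4;
                struct in6_addr v6;
                char            buf[INET6_ADDRSTRLEN];

                inet_pton(AF_INET, "192.0.2.33", &v4);
                inet_pton(AF_INET6, "64:ff9b::", &v6);  /* well-known NAT64 prefix */

                /* /96 case: the IPv4 address occupies the last four bytes. */
                memcpy(&v6.s6_addr[12], &v4, 4);

                inet_ntop(AF_INET6, &v6, buf, sizeof(buf));
                printf("%s\n", buf);    /* prints 64:ff9b::c000:221 */
                return 0;
        }
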
+/* Synthesize ipv4 from ipv6 */
+int
+nat464_synthesize_ipv4(ifnet_t ifp, const struct in6_addr *addr, struct in_addr *addrv4)
+{
+       struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
+       int error = 0, i = 0;
+
+       /* Below call is not optimized as it creates a copy of prefixes */
+       if ((error = ifnet_get_nat64prefix(ifp, nat64prefixes)) != 0)
+               return error;
+
+       for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
+               if (nat64prefixes[i].prefix_len != 0)
+                       break;
+       }
+
+       VERIFY (i < NAT64_MAX_NUM_PREFIXES);
+
+       struct in6_addr prefix = nat64prefixes[i].ipv6_prefix;
+       int prefix_len = nat64prefixes[i].prefix_len;
+
+       char *ptrv4 = __DECONST(void *, addrv4);
+       char *ptr = __DECONST(void *, addr);
+
+       if (memcmp(addr, &prefix, prefix_len) != 0)
+               return (-1);
+
+       switch (prefix_len) {
+               case NAT64_PREFIX_LEN_96:
+                       memcpy(ptrv4, ptr + 12, 4);
+                       break;
+               case NAT64_PREFIX_LEN_64:
+                       memcpy(ptrv4, ptr + 9, 4);
+                       break;
+               case NAT64_PREFIX_LEN_56:
+                       memcpy(ptrv4, ptr + 7, 1);
+                       memcpy(ptrv4 + 1, ptr + 9, 3);
+                       break;
+               case NAT64_PREFIX_LEN_48:
+                       memcpy(ptrv4, ptr + 6, 2);
+                       memcpy(ptrv4 + 2, ptr + 9, 2);
+                       break;
+               case NAT64_PREFIX_LEN_40:
+                       memcpy(ptrv4, ptr + 5, 3);
+                       memcpy(ptrv4 + 3, ptr + 9, 1);
+                       break;
+               case NAT64_PREFIX_LEN_32:
+                       memcpy(ptrv4, ptr + 4, 4);
+                       break;
+               default:
+                       panic("NAT64-prefix len is wrong: %u\n",
+                             prefix_len);
+       }
+
+       if(clat_debug) {
+               char buf[MAX_IPv4_STR_LEN];
+               clat_log2((LOG_DEBUG, "%s desynthesized to %s\n", __func__,
+                   inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf))));
+       }
+       return (error);
+}
+
+#define PTR_IP(field)  ((int32_t)offsetof(struct ip, field))
+#define PTR_IP6(field) ((int32_t)offsetof(struct ip6_hdr, field))
+
+/*
+ Translate the ICMP header
+*/
+int
+nat464_translate_icmp(int naf, void *arg)
+{
+       struct icmp             *icmp4;
+       struct icmp6_hdr        *icmp6;
+       uint32_t                 mtu;
+       int32_t                  ptr = -1;
+       uint8_t          type;
+       uint8_t          code;
+
+       switch (naf) {
+       case AF_INET:
+               icmp6 = arg;
+               type  = icmp6->icmp6_type;
+               code  = icmp6->icmp6_code;
+               mtu   = ntohl(icmp6->icmp6_mtu);
+
+               switch (type) {
+               case ICMP6_ECHO_REQUEST:
+                       type = ICMP_ECHO;
+                       break;
+               case ICMP6_ECHO_REPLY:
+                       type = ICMP_ECHOREPLY;
+                       break;
+               case ICMP6_DST_UNREACH:
+                       type = ICMP_UNREACH;
+                       switch (code) {
+                       case ICMP6_DST_UNREACH_NOROUTE:
+                       case ICMP6_DST_UNREACH_BEYONDSCOPE:
+                       case ICMP6_DST_UNREACH_ADDR:
+                               code = ICMP_UNREACH_HOST;
+                               break;
+                       case ICMP6_DST_UNREACH_ADMIN:
+                               code = ICMP_UNREACH_HOST_PROHIB;
+                               break;
+                       case ICMP6_DST_UNREACH_NOPORT:
+                               code = ICMP_UNREACH_PORT;
+                               break;
+                       default:
+                               return (-1);
+                       }
+                       break;
+               case ICMP6_PACKET_TOO_BIG:
+                       type = ICMP_UNREACH;
+                       code = ICMP_UNREACH_NEEDFRAG;
+                       mtu -= 20;
+                       break;
+               case ICMP6_TIME_EXCEEDED:
+                       type = ICMP_TIMXCEED;
+                       break;
+               case ICMP6_PARAM_PROB:
+                       switch (code) {
+                       case ICMP6_PARAMPROB_HEADER:
+                               type = ICMP_PARAMPROB;
+                               code = ICMP_PARAMPROB_ERRATPTR;
+                               ptr  = ntohl(icmp6->icmp6_pptr);
+
+                               if (ptr == PTR_IP6(ip6_vfc))
+                                       ; /* preserve */
+                               else if (ptr == PTR_IP6(ip6_vfc) + 1)
+                                       ptr = PTR_IP(ip_tos);
+                               else if (ptr == PTR_IP6(ip6_plen) ||
+                                   ptr == PTR_IP6(ip6_plen) + 1)
+                                       ptr = PTR_IP(ip_len);
+                               else if (ptr == PTR_IP6(ip6_nxt))
+                                       ptr = PTR_IP(ip_p);
+                               else if (ptr == PTR_IP6(ip6_hlim))
+                                       ptr = PTR_IP(ip_ttl);
+                               else if (ptr >= PTR_IP6(ip6_src) &&
+                                   ptr < PTR_IP6(ip6_dst))
+                                       ptr = PTR_IP(ip_src);
+                               else if (ptr >= PTR_IP6(ip6_dst) &&
+                                   ptr < (int32_t)sizeof(struct ip6_hdr))
+                                       ptr = PTR_IP(ip_dst);
+                               else {
+                                       return (-1);
+                               }
+                               break;
+                       case ICMP6_PARAMPROB_NEXTHEADER:
+                               type = ICMP_UNREACH;
+                               code = ICMP_UNREACH_PROTOCOL;
+                               break;
+                       default:
+                               return (-1);
+                       }
+                       break;
+               default:
+                       return (-1);
+               }
+               icmp6->icmp6_type = type;
+               icmp6->icmp6_code = code;
+               /* aligns well with a icmpv4 nextmtu */
+               icmp6->icmp6_mtu = htonl(mtu);
+               /* icmpv4 pptr is a one most significant byte */
+               if (ptr >= 0)
+                       icmp6->icmp6_pptr = htonl(ptr << 24);
+               break;
+
+       case AF_INET6:
+               icmp4 = arg;
+               type  = icmp4->icmp_type;
+               code  = icmp4->icmp_code;
+               mtu   = ntohs(icmp4->icmp_nextmtu);
+
+               switch (type) {
+               case ICMP_ECHO:
+                       type = ICMP6_ECHO_REQUEST;
+                       break;
+               case ICMP_ECHOREPLY:
+                       type = ICMP6_ECHO_REPLY;
+                       break;
+               case ICMP_UNREACH:
+                       type = ICMP6_DST_UNREACH;
+                       switch (code) {
+                       case ICMP_UNREACH_NET:
+                       case ICMP_UNREACH_HOST:
+                       case ICMP_UNREACH_NET_UNKNOWN:
+                       case ICMP_UNREACH_HOST_UNKNOWN:
+                       case ICMP_UNREACH_ISOLATED:
+                       case ICMP_UNREACH_TOSNET:
+                       case ICMP_UNREACH_TOSHOST:
+                               code = ICMP6_DST_UNREACH_NOROUTE;
+                               break;
+                       case ICMP_UNREACH_PORT:
+                               code = ICMP6_DST_UNREACH_NOPORT;
+                               break;
+                       case ICMP_UNREACH_NET_PROHIB:
+                       case ICMP_UNREACH_HOST_PROHIB:
+                       case ICMP_UNREACH_FILTER_PROHIB:
+                       case ICMP_UNREACH_PRECEDENCE_CUTOFF:
+                               code = ICMP6_DST_UNREACH_ADMIN;
+                               break;
+                       case ICMP_UNREACH_PROTOCOL:
+                               type = ICMP6_PARAM_PROB;
+                               code = ICMP6_PARAMPROB_NEXTHEADER;
+                               ptr  = offsetof(struct ip6_hdr, ip6_nxt);
+                               break;
+                       case ICMP_UNREACH_NEEDFRAG:
+                               type = ICMP6_PACKET_TOO_BIG;
+                               code = 0;
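+                               /*
+                                * Adjust the advertised MTU for the 20-byte difference
+                                * between the IPv4 and IPv6 headers (RFC 7915).
+                                */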
+                               mtu += 20;
+                               break;
+                       default:
+                               return (-1);
+                       }
+                       break;
+               case ICMP_TIMXCEED:
+                       type = ICMP6_TIME_EXCEEDED;
+                       break;
+               case ICMP_PARAMPROB:
+                       type = ICMP6_PARAM_PROB;
+                       switch (code) {
+                       case ICMP_PARAMPROB_ERRATPTR:
+                               code = ICMP6_PARAMPROB_HEADER;
+                               break;
+                       case ICMP_PARAMPROB_LENGTH:
+                               code = ICMP6_PARAMPROB_HEADER;
+                               break;
+                       default:
+                               return (-1);
+                       }
+
+                       ptr = icmp4->icmp_pptr;
+                       if (ptr == 0 || ptr == PTR_IP(ip_tos))
+                               ; /* preserve */
+                       else if (ptr == PTR_IP(ip_len) ||
+                           ptr == PTR_IP(ip_len) + 1)
+                               ptr = PTR_IP6(ip6_plen);
+                       else if (ptr == PTR_IP(ip_ttl))
+                               ptr = PTR_IP6(ip6_hlim);
+                       else if (ptr == PTR_IP(ip_p))
+                               ptr = PTR_IP6(ip6_nxt);
+                       else if (ptr >= PTR_IP(ip_src) &&
+                           ptr < PTR_IP(ip_dst))
+                               ptr = PTR_IP6(ip6_src);
+                       else if (ptr >= PTR_IP(ip_dst) &&
+                           ptr < (int32_t)sizeof(struct ip))
+                               ptr = PTR_IP6(ip6_dst);
+                       else {
+                               return (-1);
+                       }
+                       break;
+               default:
+                       return (-1);
+               }
+               icmp4->icmp_type = type;
+               icmp4->icmp_code = code;
+               icmp4->icmp_nextmtu = htons(mtu);
+               if (ptr >= 0)
+                       icmp4->icmp_void = htonl(ptr);
+               break;
+       }
+
+       return (0);
+}
+
+/*
+ * @brief This routine performs address family translation on the inner IP
+ *     header (carried as payload) of an ICMP(v4/v6) error response.
+ *
+ * @param pbuf Pointer to packet buffer
+ * @param off Points to end of ICMP header
+ * @param tot_len Pointer to total length of the outer IP packet
+ *     (updated if the inner header size changes)
+ * @param off2 Pointer to the offset of the end of the inner IP header
+ *     (updated if the inner header size changes)
+ * @param proto2 Inner IP proto field
+ * @param ttl2 Inner IP ttl field
+ * @param tot_len2 Inner IP total length
+ * @param src Pointer to the generic v4/v6 src address
+ * @param dst Pointer to the generic v4/v6 dst address
+ * @param af Old protocol family
+ * @param naf New protocol family
+ *
+ * @return -1 on error and 0 on success
+ */
+int
+nat464_translate_icmp_ip(pbuf_t *pbuf, uint32_t off, uint64_t *tot_len, uint32_t *off2,
+       uint8_t proto2, uint8_t ttl2, uint64_t tot_len2, struct nat464_addr *src,
+       struct nat464_addr *dst, protocol_family_t af, protocol_family_t naf)
+{
+       struct ip *ip4 = NULL;
+       struct ip6_hdr *ip6 = NULL;
+       void *hdr = NULL;
+       int hlen = 0, olen = 0;
+
+       if (af == naf || (af != AF_INET && af != AF_INET6) ||
+           (naf != AF_INET && naf != AF_INET6))
+               return (-1);
+
+       /* old header */
+       olen = *off2 - off;
+       /* new header */
+       hlen = naf == PF_INET ? sizeof(*ip4) : sizeof(*ip6);
+
+       /* Modify the pbuf to accommodate the new header */
+       hdr = pbuf_resize_segment(pbuf, off, olen, hlen);
+       if (hdr == NULL)
+               return (-1);
+
+       /* translate inner ip/ip6 header */
+       switch (naf) {
+       case AF_INET:
+               ip4 = hdr;
+               bzero(ip4, sizeof(*ip4));
+               ip4->ip_v = IPVERSION;
+               ip4->ip_hl = sizeof(*ip4) >> 2;
+               ip4->ip_len = htons(sizeof(*ip4) + tot_len2 - olen);
+               ip4->ip_id = rfc6864 ? 0 : htons(ip_randomid());
+               ip4->ip_off = htons(IP_DF);
+               ip4->ip_ttl = ttl2;
+               if (proto2 == IPPROTO_ICMPV6)
+                       ip4->ip_p = IPPROTO_ICMP;
+               else
+                       ip4->ip_p = proto2;
+               ip4->ip_src = src->natv4addr;
+               ip4->ip_dst = dst->natv4addr;
+               ip4->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, ip4->ip_hl << 2);
+
+               if (clat_debug) {
+                       char buf[MAX_IPv4_STR_LEN];
+                       clat_log2((LOG_DEBUG, "%s translated to IPv4 (inner) "
+                           "ip_len: %#x ip_p: %d ip_sum: %#x ip_src: %s ip_dst: %s \n",
+                           __func__, ntohs(ip4->ip_len), ip4->ip_p, ntohs(ip4->ip_sum),
+                           inet_ntop(AF_INET, (void *)&ip4->ip_src, buf, sizeof(buf)),
+                           inet_ntop(AF_INET, (void *)&ip4->ip_dst, buf, sizeof(buf))));
+               }
+               break;
+       case AF_INET6:
+               ip6 = hdr;
+               bzero(ip6, sizeof(*ip6));
+               ip6->ip6_vfc  = IPV6_VERSION;
+               ip6->ip6_plen = htons(tot_len2 - olen);
+               if (proto2 == IPPROTO_ICMP)
+                       ip6->ip6_nxt = IPPROTO_ICMPV6;
+               else
+                       ip6->ip6_nxt = proto2;
+               if (!ttl2 || ttl2 > IPV6_DEFHLIM)
+                       ip6->ip6_hlim = IPV6_DEFHLIM;
+               else
+                       ip6->ip6_hlim = ttl2;
+               ip6->ip6_src  = src->natv6addr;
+               ip6->ip6_dst  = dst->natv6addr;
+
+               if (clat_debug) {
+                       char buf2[MAX_IPv6_STR_LEN];
+                       clat_log2((LOG_DEBUG, "%s translated to IPv6 (inner) "
+                           "ip6_plen: %#x ip6_nxt: %d ip6_src: %s ip6_dst: %s \n",
+                           __func__, ntohs(ip6->ip6_plen), ip6->ip6_nxt,
+                           inet_ntop(AF_INET6, (void *)&ip6->ip6_src, buf2, sizeof(buf2)),
+                           inet_ntop(AF_INET6, (void *)&ip6->ip6_dst, buf2, sizeof(buf2))));
+               }
+               break;
+       }
+
+       /* adjust payload offset and total packet length */
+       *off2 += hlen - olen;
+       *tot_len += hlen - olen;
+
+       return (0);
+}
+/*
+ * @brief The function inserts an IPv6 fragmentation header
+ *     and populates it with the passed parameters.
+ *
+ * @param pbuf Pointer to the packet buffer
+ * @param ip_id_val IP identifier (in network byte order)
+ * @param frag_offset Fragment offset in 8-byte units (host byte order;
+ *     converted to network byte order by this routine)
+ * @param is_last_frag Boolean indicating whether this is the last fragment
+ *
+ * @return -1 on error and 0 on success.
+ */
+int
+nat464_insert_frag46(pbuf_t *pbuf, uint16_t ip_id_val, uint16_t frag_offset,
+    boolean_t is_last_frag)
+{
+       struct ip6_frag *p_ip6_frag = NULL;
+       struct ip6_hdr *p_ip6h = NULL;
+
+       /* Insert IPv6 fragmentation header */
+       if (pbuf_resize_segment(pbuf, sizeof(struct ip6_hdr), 0,
+           sizeof(struct ip6_frag)) == NULL)
+               return (-1);
+
+       p_ip6h = mtod(pbuf->pb_mbuf, struct ip6_hdr *);
+       p_ip6_frag = (struct ip6_frag *)pbuf_contig_segment(pbuf,
+           sizeof(struct ip6_hdr), sizeof(struct ip6_frag));
+
+       if (p_ip6_frag == NULL)
+               return (-1);
+
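+       /*
+        * Resulting layout: [IPv6 header][Fragment header][payload], with
+        * ip6_nxt rewritten to IPPROTO_FRAGMENT and ip6f_nxt carrying the
+        * original next-header value.
+        */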
+       /* Populate IPv6 fragmentation header */
+       p_ip6_frag->ip6f_nxt = p_ip6h->ip6_nxt;
+       p_ip6_frag->ip6f_reserved = 0;
+       p_ip6_frag->ip6f_offlg = (frag_offset) << 3;
+       if (!is_last_frag)
+               p_ip6_frag->ip6f_offlg |= 0x1; /* M (more fragments) flag */
+       p_ip6_frag->ip6f_offlg = htons(p_ip6_frag->ip6f_offlg);
+       p_ip6_frag->ip6f_ident = ip_id_val;
+
+       /* Update IPv6 header */
+       p_ip6h->ip6_nxt = IPPROTO_FRAGMENT;
+       p_ip6h->ip6_plen = htons(ntohs(p_ip6h->ip6_plen) +
+           sizeof(struct ip6_frag));
+
+       return (0);
+}
+
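+/*
+ * @brief The routine translates the IPv6 header (and consumes a fragment
+ *     header, if present) into an IPv4 header.
+ *
+ * @param pbuf Pointer to the generic packet buffer
+ * @param off Offset to the end of the IPv6 header (advanced past the
+ *     fragment header, if one is present)
+ * @param tos Type of service
+ * @param proto Pointer to the protocol running over IP (updated when a
+ *     fragment header is consumed)
+ * @param ttl Time to live
+ * @param src_v4 Source IPv4 address
+ * @param dst_v4 Destination IPv4 address
+ * @param tot_len Total packet length
+ * @param p_is_first_frag Set to FALSE if the packet is a non-first fragment
+ *
+ * @return NT_NAT64 if IP header translation is successful, else NT_DROP
+ */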
+int
+nat464_translate_64(pbuf_t *pbuf, int off, uint8_t tos,
+    uint8_t *proto, uint8_t ttl, struct in_addr src_v4,
+    struct in_addr dst_v4, uint64_t tot_len, boolean_t *p_is_first_frag)
+{
+       struct ip *ip4;
+       struct ip6_frag *p_frag6 = NULL;
+       struct ip6_frag frag6 = {};
+       boolean_t is_frag = FALSE;
+       uint16_t ip_frag_off = 0;
+
+       /*
+        * ip_input asserts that rcvif is not NULL.
+        * That may not hold in two corner cases:
+        * 1. A local app sends a DNS AAAA query to
+        *    the local host.
+        * 2. The in-kernel IPv6 stack generates a message
+        *    destined for a synthesized IPv6 end-point.
+        */
+       if (pbuf->pb_ifp == NULL)
+               return (NT_DROP);
+
+       if (*proto == IPPROTO_FRAGMENT) {
+               p_frag6 = (struct ip6_frag *)pbuf_contig_segment(pbuf,
+                   sizeof(struct ip6_hdr), sizeof(struct ip6_frag));
+               if (p_frag6 == NULL) {
+                       ip6stat.ip6s_clat464_in_64frag_transfail_drop++;
+                       return (NT_DROP);
+               }
+
+               frag6 = *p_frag6;
+               p_frag6 = NULL;
+               *proto = frag6.ip6f_nxt;
+               off += sizeof(struct ip6_frag);
+               is_frag = TRUE;
+               ip_frag_off = (ntohs(frag6.ip6f_offlg & IP6F_OFF_MASK)) >> 3;
+               if (ip_frag_off != 0) {
+                       *p_is_first_frag = FALSE;
+               }
+       }
+
+       ip4 = (struct ip *)pbuf_resize_segment(pbuf, 0, off, sizeof(*ip4));
+       if (ip4 == NULL)
+               return (NT_DROP);
+       ip4->ip_v   = 4;
+       ip4->ip_hl  = 5;
+       ip4->ip_tos = tos;
+       ip4->ip_len = htons(sizeof(*ip4) + (tot_len - off));
+       ip4->ip_id  = 0;
+       ip4->ip_off = 0;
+       ip4->ip_ttl = ttl;
+       ip4->ip_p   = *proto;
+       ip4->ip_sum = 0;
+       ip4->ip_src = src_v4;
+       ip4->ip_dst = dst_v4;
+       if (is_frag) {
+               /*
+                * https://tools.ietf.org/html/rfc7915#section-5.1.1
+                * Identification:  Copied from the low-order 16 bits in the
+                * Identification field in the Fragment Header.
+                */
+               ip4->ip_id = ntohl(frag6.ip6f_ident) & 0xffff;
+               ip4->ip_id = htons(ip4->ip_id);
+               if (frag6.ip6f_offlg & IP6F_MORE_FRAG)
+                       ip_frag_off |= IP_MF;
+               ip4->ip_off = htons(ip_frag_off);
+       } else {
+               ip4->ip_off |= htons(IP_DF);
+       }
+
+       /*
+        * Defer calculating ip_sum for ICMPv6 as we do it
+        * later in Protocol translation
+        */
+       if (*proto != IPPROTO_ICMPV6)
+               ip4->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, ip4->ip_hl << 2);
+
+       if (clat_debug) {
+               char buf1[MAX_IPv4_STR_LEN], buf2[MAX_IPv4_STR_LEN];
+               clat_log2((LOG_DEBUG, "%s translated to IPv4 ip_len: %#x "
+                   "ip_p: %d ip_sum: %#x ip_src: %s ip_dst: %s \n", __func__,
+                   ntohs(ip4->ip_len), ip4->ip_p, ntohs(ip4->ip_sum),
+                   inet_ntop(AF_INET, (void *)&ip4->ip_src, buf1, sizeof(buf1)),
+                   inet_ntop(AF_INET, (void *)&ip4->ip_dst, buf2, sizeof(buf2))));
+       }
+       return (NT_NAT64);
+}
+/*
+ * @brief The routine translates the IPv4 header to an IPv6 header.
+ *
+ * @param pbuf Pointer to the generic packet buffer
+ * @param off Offset to the end of IP header
+ * @param tos Type of service
+ * @param proto Protocol running over IP
+ * @param ttl Time to live
+ * @param src_v6 Source IPv6 address
+ * @param dst_v6 Destination IPv6 address
+ * @param tot_len Total length of the IPv4 packet
+ *
+ * @return NT_NAT64 if IP header translation is successful, else error
+ */ 
+int
+nat464_translate_46(pbuf_t *pbuf, int off, uint8_t tos,
+    uint8_t proto, uint8_t ttl, struct in6_addr src_v6,
+    struct in6_addr dst_v6, uint64_t tot_len)
+{
+       struct ip6_hdr *ip6;
+
+       if (pbuf->pb_ifp == NULL)
+               return (NT_DROP);
+
+       /*
+        * Trim 'off' bytes (the size of the IPv4 header) from the head of the
+        * buffer and prepend an IPv6 header.
+        */
+       ip6 = (struct ip6_hdr *)pbuf_resize_segment(pbuf, 0, off, sizeof(*ip6));
+       if (ip6 == NULL)
+               return (NT_DROP);
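+       /*
+        * Flow word: version (6) in the top 4 bits, traffic class in the
+        * next 8 bits; the 20-bit flow label is left as zero.
+        */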
+       ip6->ip6_flow = htonl((6 << 28) | (tos << 20));
+       ip6->ip6_plen = htons(tot_len - off);
+       ip6->ip6_nxt  = proto;
+       ip6->ip6_hlim = ttl;
+       ip6->ip6_src = src_v6;
+       ip6->ip6_dst = dst_v6;
+
+       if (clat_debug) {
+               char buf1[MAX_IPv6_STR_LEN], buf2[MAX_IPv6_STR_LEN];
+               clat_log2((LOG_DEBUG, "%s translated to IPv6 ip6_plen: %#x "
+                   " ip6_nxt: %d ip6_src: %s ip6_dst: %s \n", __func__,
+                   ntohs(ip6->ip6_plen), ip6->ip6_nxt,
+                   inet_ntop(AF_INET6, (void *)&ip6->ip6_src, buf1, sizeof(buf1)),
+                   inet_ntop(AF_INET6, (void *)&ip6->ip6_dst, buf2, sizeof(buf2))));
+       }
+       return (NT_NAT64);
+}
+
+/*
+ * @brief This routine translates the protocol running over IP and updates
+ *     the protocol checksum for the IP header translation. It also updates
+ *     pbuf checksum flags and related fields.
+ *
+ * @param pbuf Pointer to packet buffer
+ * @param osrc Old source address
+ * @param odst Old destination address
+ * @param oproto Old protocol running over IP
+ * @param af Old family
+ * @param naf New family
+ * @param direction NT_IN or NT_OUT
+ * @param only_csum If true, only rewrite the IP protocol field and adjust
+ *     checksums/flags (used for fragments that do not carry the protocol
+ *     header)
+ *
+ * @return NT_NAT64 on success, NT_DROP on error
+ */
+int
+nat464_translate_proto(pbuf_t *pbuf, struct nat464_addr *osrc,
+    struct nat464_addr *odst, uint8_t oproto, protocol_family_t af,
+    protocol_family_t naf, int direction, boolean_t only_csum)
+{
+       struct ip *iph = NULL;
+       struct ip6_hdr *ip6h = NULL;
+       uint32_t hlen = 0, plen = 0;
+       uint64_t tot_len = 0;
+       void *nsrc = NULL, *ndst = NULL;
+       uint8_t *proto = 0;
+       uint16_t *psum = NULL;
+       boolean_t do_ones_complement = FALSE;
+
+       /* For now these routines only support 464 translations */
+       VERIFY(af != naf);
+       VERIFY(af == PF_INET || af == PF_INET6);
+
+       /*
+        * For now, output must be a v4-to-v6 translation
+        * and input must be a v6-to-v4 translation.
+        */
+       switch (naf) {
+       case PF_INET: {
+               iph = pbuf->pb_data;
+               hlen = iph->ip_hl << 2;
+               plen = ntohs(iph->ip_len) - hlen;
+               tot_len = ntohs(iph->ip_len);
+               nsrc = &iph->ip_src;
+               ndst = &iph->ip_dst;
+               proto = &iph->ip_p;
+               break;
+       }
+       case PF_INET6: {
+               ip6h = pbuf->pb_data;
+               hlen = sizeof(*ip6h);
+               plen = ntohs(ip6h->ip6_plen);
+               tot_len = hlen + plen;
+               nsrc = &ip6h->ip6_src;
+               ndst = &ip6h->ip6_dst;
+               proto = &ip6h->ip6_nxt;
+               break;
+       }
+       }
+
+       VERIFY(*proto == oproto);
+
+       /*
+        * We may want to manipulate csum flags in some cases
+        * and not act on the protocol header as it may not
+        * carry protocol checksums.
+        * For example, fragments other than the first one would
+        * not carry protocol headers.
+        */
+       if (only_csum) {
+               /*
+                * Only translate ICMP proto in the header
+                * and adjust checksums
+                */
+               if (*proto == IPPROTO_ICMP) {
+                       if (naf != PF_INET6)
+                               return (NT_DROP);
+
+                       *proto = IPPROTO_ICMPV6;
+               } else if (*proto == IPPROTO_ICMPV6) {
+                       if (naf != PF_INET)
+                               return (NT_DROP);
+
+                       *proto = IPPROTO_ICMP;
+                       /* Recalculate IP checksum as proto field has changed */
+                       iph->ip_sum = 0;
+                       iph->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, hlen);
+               }
+               goto done;
+       }
+
+       switch (*proto) {
+       case IPPROTO_UDP: {
+               struct udphdr *uh = (struct udphdr *)pbuf_contig_segment(pbuf, hlen,
+                   sizeof(*uh));
+
+               if (uh == NULL)
+                       return (NT_DROP);
+
+               if (!(*pbuf->pb_csum_flags & (CSUM_UDP | CSUM_PARTIAL)) &&
+                   uh->uh_sum == 0 && af == PF_INET && naf == PF_INET6) {
+                       uh->uh_sum = pbuf_inet6_cksum(pbuf, IPPROTO_UDP,
+                           hlen, ntohs(ip6h->ip6_plen));
+                       if (uh->uh_sum == 0)
+                               uh->uh_sum = 0xffff;
+                       goto done;
+               }
+
+               psum = &uh->uh_sum;
+               break;
+       }
+       case IPPROTO_TCP: {
+               struct tcphdr *th = (struct tcphdr *)pbuf_contig_segment(pbuf, hlen,
+                   sizeof(*th));
+
+               if (th == NULL)
+                       return (NT_DROP);
+
+               psum = &th->th_sum;
+               break;
+       }
+       }
+
+       /* 
+        * Translate the protocol header, update IP header if needed,
+        * calculate checksums and update the checksum flags.
+        */
+       switch (*proto) {
+       case IPPROTO_UDP:
+               /* Fall through */
+       case IPPROTO_TCP:
+       {
+               /*
+                * If the packet is locally generated and has CSUM flags set
+                * for TCP or UDP, it carries a pseudo-header checksum that
+                * has not yet been one's complemented.
+                */
+               if (direction == NT_OUT &&
+                   (*pbuf->pb_csum_flags & CSUM_DELAY_DATA))
+                       do_ones_complement = TRUE;
+
+               nat464_addr_cksum_fixup(psum, osrc, (struct nat464_addr *)nsrc,
+                   af, naf, (*proto == IPPROTO_UDP) ? 1 : 0, do_ones_complement);
+               nat464_addr_cksum_fixup(psum, odst, (struct nat464_addr *)ndst,
+                   af, naf, (*proto == IPPROTO_UDP) ? 1 : 0, do_ones_complement);
+
+               break;
+       }
+       case IPPROTO_ICMP: {
+               if (naf != PF_INET6)    /* allow only v6 as naf for ICMP */
+                       return (NT_DROP);
+
+               struct icmp *icmph = NULL;
+               struct icmp6_hdr *icmp6h = NULL;
+               uint32_t ip2off = 0, hlen2 = 0, tot_len2 = 0;
+
+               icmph = (struct icmp*) pbuf_contig_segment(pbuf, hlen,
+                   ICMP_MINLEN);
+               if (icmph == NULL)
+                       return (NT_DROP);
+
+               /* Translate the ICMP header */
+               if (nat464_translate_icmp(PF_INET6, icmph) != 0)
+                       return (NT_DROP);
+
+               *proto = IPPROTO_ICMPV6;
+               icmp6h = (struct icmp6_hdr *)(uintptr_t)icmph;
+               pbuf_copy_back(pbuf, hlen, sizeof(struct icmp6_hdr),
+                   icmp6h);
+
+               /* Translate the inner IP header only for error messages */
+               if (ICMP6_ERRORTYPE(icmp6h->icmp6_type)) {
+                       ip2off = hlen + sizeof(*icmp6h);
+                       struct ip *iph2;
+                       iph2 = (struct ip*) pbuf_contig_segment(pbuf, ip2off,
+                           sizeof (*iph2));
+                       if (iph2 == NULL)
+                               return (NT_DROP);
+
+                       hlen2 = ip2off + (iph2->ip_hl << 2);
+                       tot_len2 = ntohs(iph2->ip_len);
+
+                       /* Destination in outer IP should be Source in inner IP */
+                       VERIFY(IN_ARE_ADDR_EQUAL(&odst->natv4addr, &iph2->ip_src));
+                       if (nat464_translate_icmp_ip(pbuf, ip2off, &tot_len,
+                           &hlen2, iph2->ip_p, iph2->ip_ttl, tot_len2,
+                           (struct nat464_addr *)ndst, (struct nat464_addr *)nsrc,
+                           PF_INET, PF_INET6) != 0)
+                               return (NT_DROP);
+                       /* Update total length/payload length for outer header */
+                       switch (naf) {
+                       case PF_INET:
+                               iph->ip_len = htons(tot_len);
+                               break;
+                       case PF_INET6:
+                               ip6h->ip6_plen = htons(tot_len - hlen);
+                               break;
+                       }
+                       iph2 = NULL;
+               }
+
+               icmp6h->icmp6_cksum = 0;
+               icmp6h->icmp6_cksum = pbuf_inet6_cksum(pbuf, IPPROTO_ICMPV6, hlen,
+                   ntohs(ip6h->ip6_plen));
+
+               clat_log2((LOG_DEBUG, "%s translated to ICMPV6 type: %d "
+                   "code: %d checksum: %#x \n", __func__, icmp6h->icmp6_type,
+                   icmp6h->icmp6_code, icmp6h->icmp6_cksum));
+
+               icmph = NULL;
+               icmp6h = NULL;
+               break;
+        }
+       case IPPROTO_ICMPV6:
+       {
+               if (naf != PF_INET)     /* allow only v4 as naf for ICMPV6 */
+                       return (NT_DROP);
+
+               struct icmp6_hdr *icmp6h = NULL;
+               struct icmp *icmph = NULL;
+               uint32_t ip2off = 0, hlen2 = 0, tot_len2 = 0;
+
+               icmp6h = (struct icmp6_hdr*) pbuf_contig_segment(pbuf, hlen,
+                   sizeof(*icmp6h));
+               if (icmp6h == NULL)
+                       return (NT_DROP);
+
+               /* Translate the ICMP header */
+               if (nat464_translate_icmp(PF_INET, icmp6h) != 0)
+                       return (NT_DROP);
+
+               *proto = IPPROTO_ICMP;
+               icmph = (struct icmp *)(uintptr_t)icmp6h;
+               pbuf_copy_back(pbuf, hlen, ICMP_MINLEN,
+                   icmph);
+
+               /* Translate the inner IP header only for error messages */
+               if (ICMP_ERRORTYPE(icmph->icmp_type)) {
+                       ip2off = hlen + ICMP_MINLEN;
+                       struct ip6_hdr *iph2;
+                       iph2 = (struct ip6_hdr*) pbuf_contig_segment(pbuf, ip2off,
+                                       sizeof (*iph2));
+                       if (iph2 == NULL)
+                               return (NT_DROP);
+
+                       /* hlen2 points to end of inner IP header from the beginning */
+                       hlen2 = ip2off + sizeof(struct ip6_hdr);
+                       tot_len2 = ntohs(iph2->ip6_plen) + sizeof(struct ip6_hdr);
+
+                       if (nat464_translate_icmp_ip(pbuf, ip2off, &tot_len,
+                           &hlen2, iph2->ip6_nxt, iph2->ip6_hlim, tot_len2,
+                           (struct nat464_addr *)ndst, (struct nat464_addr *)nsrc,
+                           PF_INET6, PF_INET) != 0)
+                               return (NT_DROP);
+
+                       /* Update total length for outer header */
+                       switch (naf) {
+                       case PF_INET:
+                               iph->ip_len = htons(tot_len);
+                               break;
+                       case PF_INET6:
+                               ip6h->ip6_plen = htons(tot_len - hlen);
+                               break;
+                       }
+                       iph2 = NULL;
+               }
+               /* Recalculate IP checksum as some IP fields might have changed */
+               iph->ip_sum = 0;
+               iph->ip_sum = pbuf_inet_cksum(pbuf, 0, 0, iph->ip_hl << 2);
+               icmph->icmp_cksum = 0;
+               icmph->icmp_cksum = pbuf_inet_cksum(pbuf, 0, hlen,
+                   ntohs(iph->ip_len) - hlen);
+
+               clat_log2((LOG_DEBUG, "%s translated to ICMP type: %d "
+                   "code: %d checksum: %#x \n", __func__, icmph->icmp_type,
+                   icmph->icmp_code, icmph->icmp_cksum));
+
+               icmp6h = NULL;
+               icmph = NULL;
+               break;
+       }
+
+       /*
+        * https://tools.ietf.org/html/rfc7915#section-5.1.1
+        * If the Next Header field of the Fragment Header is an
+        * extension header (except ESP, but including the Authentication
+        * Header (AH)), then the packet SHOULD be dropped and logged.
+        */
+       case IPPROTO_HOPOPTS:
+       case IPPROTO_ROUTING:
+       case IPPROTO_DSTOPTS:
+       case IPPROTO_AH:
+               return (NT_DROP);
+
+       case IPPROTO_FRAGMENT:
+               /*
+                * The fragment header is appended after or removed before
+                * calling into this routine.
+                */
+               VERIFY(FALSE);
+       case IPPROTO_ESP:
+               break;
+
+       default:
+               return (NT_DROP);
+       }
+
+done:
+       /* Update checksum flags and offsets based on direction */
+       if (direction == NT_OUT) {
+               if ((*pbuf->pb_csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
+                   (CSUM_DATA_VALID | CSUM_PARTIAL)) {
+                       (pbuf->pb_mbuf)->m_pkthdr.csum_tx_start += CLAT46_HDR_EXPANSION_OVERHD;
+                       (pbuf->pb_mbuf)->m_pkthdr.csum_tx_stuff += CLAT46_HDR_EXPANSION_OVERHD;
+               }
+
+               if (*pbuf->pb_csum_flags & CSUM_TCP)
+                       *pbuf->pb_csum_flags |= CSUM_TCPIPV6;
+               if (*pbuf->pb_csum_flags & CSUM_UDP)
+                       *pbuf->pb_csum_flags |= CSUM_UDPIPV6;
+               if (*pbuf->pb_csum_flags & CSUM_FRAGMENT)
+                       *pbuf->pb_csum_flags |= CSUM_FRAGMENT_IPV6;
+
+               /* Clear IPv4 checksum flags */
+               *pbuf->pb_csum_flags &= ~(CSUM_IP | CSUM_IP_FRAGS | CSUM_DELAY_DATA | CSUM_FRAGMENT);
+       } else if (direction == NT_IN) {
+               /* XXX On input just reset csum flags */
+               *pbuf->pb_csum_flags = 0; /* Reset all flags for now */
+#if 0
+               /* Update csum flags and offsets for rx */
+               if (*pbuf->pb_csum_flags & CSUM_PARTIAL) {
+                       (pbuf->pb_mbuf)->m_pkthdr.csum_rx_start -= CLAT46_HDR_EXPANSION_OVERHD;
+               }
+#endif
+       }
+       return (NT_NAT64);
+}
+
+/* Fix the proto checksum for address change */
+static void
+nat464_addr_cksum_fixup(uint16_t *pc, struct nat464_addr *ao, struct nat464_addr *an,
+    protocol_family_t af, protocol_family_t naf, uint8_t u, boolean_t do_ones_complement)
+{
+       /* Currently we only support v4 to v6 and vice versa */
+       VERIFY(af != naf);
+
+       switch (af) {
+       case PF_INET:
+               switch (naf) {
+               case PF_INET6:
+                       if (do_ones_complement) {
+                               *pc = ~nat464_cksum_fixup(nat464_cksum_fixup(
+                                   nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(
+                                   nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(~*pc,
+                                   ao->nataddr16[0], an->nataddr16[0], u),
+                                   ao->nataddr16[1], an->nataddr16[1], u),
+                                   0,               an->nataddr16[2], u),
+                                   0,               an->nataddr16[3], u),
+                                   0,               an->nataddr16[4], u),
+                                   0,               an->nataddr16[5], u),
+                                   0,               an->nataddr16[6], u),
+                                   0,               an->nataddr16[7], u);
+                       } else {
+                               *pc = nat464_cksum_fixup(nat464_cksum_fixup(
+                                   nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(
+                                   nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(*pc,
+                                   ao->nataddr16[0], an->nataddr16[0], u),
+                                   ao->nataddr16[1], an->nataddr16[1], u),
+                                   0,               an->nataddr16[2], u),
+                                   0,               an->nataddr16[3], u),
+                                   0,               an->nataddr16[4], u),
+                                   0,               an->nataddr16[5], u),
+                                   0,               an->nataddr16[6], u),
+                                   0,               an->nataddr16[7], u);
+                       }
+                       break;
+               }
+               break;
+       case PF_INET6:
+               /*
+        * XXX For NAT464 this only applies to the incoming path, where the
+        * checksum is already one's complemented, so we just perform a
+        * normal fixup.
+                */
+               switch (naf) {
+               case PF_INET:
+                       *pc = nat464_cksum_fixup(nat464_cksum_fixup(
+                           nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(
+                           nat464_cksum_fixup(nat464_cksum_fixup(nat464_cksum_fixup(*pc,
+                           ao->nataddr16[0], an->nataddr16[0], u),
+                           ao->nataddr16[1], an->nataddr16[1], u),
+                           ao->nataddr16[2], 0,                       u),
+                           ao->nataddr16[3], 0,                       u),
+                           ao->nataddr16[4], 0,                       u),
+                           ao->nataddr16[5], 0,                       u),
+                           ao->nataddr16[6], 0,                       u),
+                           ao->nataddr16[7], 0,                       u);
+                       break;
+               }
+               break;
+       }
+}
+
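+/*
+ * Incrementally update a 16-bit Internet checksum when a 16-bit quantity
+ * changes from 'old' to 'new', e.g. if a field changes from 0x1234 to
+ * 0xabcd the new checksum is (cksum + 0x1234 - 0xabcd) folded back into
+ * 16 bits. For UDP (udp != 0) a checksum of zero means "no checksum" and
+ * is preserved, and a computed zero is encoded as 0xffff.
+ */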
+uint16_t
+nat464_cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
+{
+       uint32_t l;
+
+       if (udp && !cksum)
+               return (0);
+       l = cksum + old - new;
+       l = (l >> 16) + (l & 0xffff);
+       l = l & 0xffff;
+       if (udp && !l)
+               return (0xffff);
+       return (l);
+}
+
+/* CLAT46 event handlers */
+void
+in6_clat46_eventhdlr_callback(struct eventhandler_entry_arg arg0 __unused,
+    in6_clat46_evhdlr_code_t in6_clat46_ev_code, pid_t epid, uuid_t euuid)
+{
+        struct kev_msg ev_msg;
+        struct kev_netevent_clat46_data clat46_event_data;
+
+        bzero(&ev_msg, sizeof(ev_msg));
+        bzero(&clat46_event_data, sizeof(clat46_event_data));
+
+        ev_msg.vendor_code      = KEV_VENDOR_APPLE;
+        ev_msg.kev_class        = KEV_NETWORK_CLASS;
+        ev_msg.kev_subclass     = KEV_NETEVENT_SUBCLASS;
+        ev_msg.event_code       = KEV_NETEVENT_CLAT46_EVENT;
+
+        clat46_event_data.clat46_event_code = in6_clat46_ev_code;
+        clat46_event_data.epid = epid;
+        uuid_copy(clat46_event_data.euuid, euuid);
+
+        ev_msg.dv[0].data_ptr = &clat46_event_data;
+        ev_msg.dv[0].data_length = sizeof(clat46_event_data);
+
+        kev_post_msg(&ev_msg);
+}
+
+static void
+in6_clat46_event_callback(void *arg)
+{
+        struct kev_netevent_clat46_data *p_in6_clat46_ev =
+            (struct kev_netevent_clat46_data *)arg;
+
+        EVENTHANDLER_INVOKE(&in6_clat46_evhdlr_ctxt, in6_clat46_event,
+            p_in6_clat46_ev->clat46_event_code, p_in6_clat46_ev->epid,
+            p_in6_clat46_ev->euuid);
+}
+
+struct in6_clat46_event_nwk_wq_entry
+{
+        struct nwk_wq_entry nwk_wqe;
+        struct kev_netevent_clat46_data in6_clat46_ev_arg;
+};
+
+void
+in6_clat46_event_enqueue_nwk_wq_entry(in6_clat46_evhdlr_code_t in6_clat46_event_code,
+    pid_t epid, uuid_t euuid)
+{
+        struct in6_clat46_event_nwk_wq_entry *p_ev = NULL;
+
+        MALLOC(p_ev, struct in6_clat46_event_nwk_wq_entry *,
+            sizeof(struct in6_clat46_event_nwk_wq_entry),
+            M_NWKWQ, M_WAITOK | M_ZERO);
+
+        p_ev->nwk_wqe.func = in6_clat46_event_callback;
+        p_ev->nwk_wqe.is_arg_managed = TRUE;
+        p_ev->nwk_wqe.arg = &p_ev->in6_clat46_ev_arg;
+
+        p_ev->in6_clat46_ev_arg.clat46_event_code = in6_clat46_event_code;
+        p_ev->in6_clat46_ev_arg.epid = epid;
+        uuid_copy(p_ev->in6_clat46_ev_arg.euuid, euuid);
+
+        nwk_wq_enqueue((struct nwk_wq_entry*)p_ev);
+}
diff --git a/bsd/net/nat464_utils.h b/bsd/net/nat464_utils.h
new file mode 100644 (file)
index 0000000..be938d2
--- /dev/null
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Copyright (c) 2001 Daniel Hartmeier
+ * Copyright (c) 2002 - 2013 Henning Brauer
+ * NAT64 - Copyright (c) 2010 Viagenie Inc. (http://www.viagenie.ca)
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *    - Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *    - Redistributions in binary form must reproduce the above
+ *     copyright notice, this list of conditions and the following
+ *     disclaimer in the documentation and/or other materials provided
+ *     with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * Effort sponsored in part by the Defense Advanced Research Projects
+ * Agency (DARPA) and Air Force Research Laboratory, Air Force
+ * Materiel Command, USAF, under agreement number F30602-01-2-0537.
+ *
+ */
+#ifndef        _NET_NAT464_UTILS_H_
+#define        _NET_NAT464_UTILS_H_
+#include <netinet/in.h>
+#include <net/pf_pbuf.h>
+
+#define        clat_log0(x)    do { log x; } while (0)
+#define        clat_log1(x)    do { if (clat_debug >= 1) log x; } while (0)
+#define        clat_log2(x)    do { if (clat_debug >= 2) log x; } while (0)
+
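+/*
+ * Addresses that must not be translated: loopback, link-local, multicast,
+ * and (for IPv4) the limited broadcast address.
+ */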
+#define CLAT46_NEEDED(x)                                                       \
+       (!IN_LOOPBACK(x) && !IN_LINKLOCAL(x) && !IN_MULTICAST(x) &&             \
+       INADDR_BROADCAST != x)
+
+#define CLAT64_NEEDED(x)                                                       \
+       (!IN6_IS_ADDR_LOOPBACK(x) && !IN6_IS_ADDR_LINKLOCAL(x) &&               \
+       !IN6_IS_ADDR_MULTICAST(x))
+
+extern int clat_debug;
+
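+/* Translation verdicts (NT_DROP/NT_NAT64) and packet direction (NT_IN/NT_OUT) */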
+enum   { NT_DROP, NT_NAT64 };
+enum   { NT_IN, NT_OUT };
+struct nat464_addr {
+       union {
+               struct in_addr          _v4addr;
+               struct in6_addr         _v6addr;
+               uint8_t         _addr8[16];
+               uint16_t                _addr16[8];
+               uint32_t                _addr32[4];
+       } nat464a;                  /* 128-bit address */
+#define natv4addr      nat464a._v4addr
+#define natv6addr      nat464a._v6addr
+#define nataddr8       nat464a._addr8
+#define nataddr16      nat464a._addr16
+#define nataddr32      nat464a._addr32
+};
+
+int
+nat464_translate_icmp(int, void *);
+
+int
+nat464_translate_icmp_ip(pbuf_t *, uint32_t, uint64_t *, uint32_t *,
+    uint8_t, uint8_t, uint64_t, struct nat464_addr *,
+    struct nat464_addr *, protocol_family_t, protocol_family_t);
+
+int
+nat464_synthesize_ipv6(ifnet_t, const struct in_addr *, struct in6_addr *);
+
+int
+nat464_synthesize_ipv4(ifnet_t, const struct in6_addr *, struct in_addr *);
+
+int
+nat464_translate_64(pbuf_t *, int, uint8_t, uint8_t *, uint8_t, struct in_addr,
+    struct in_addr, uint64_t, boolean_t *);
+
+int
+nat464_translate_46(pbuf_t *, int, uint8_t, uint8_t, uint8_t, struct in6_addr,
+    struct in6_addr, uint64_t);
+
+int
+nat464_translate_proto(pbuf_t *, struct nat464_addr *, struct nat464_addr *,
+    uint8_t, protocol_family_t, protocol_family_t, int, boolean_t);
+
+int
+nat464_insert_frag46(pbuf_t *, uint16_t, uint16_t, boolean_t);
+
+int
+nat464_remove_frag64(pbuf_t *, uint32_t, uint16_t, boolean_t);
+
+uint16_t
+nat464_cksum_fixup(uint16_t, uint16_t, uint16_t, uint8_t);
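+
+/*
+ * Illustrative CLAT46 (IPv4 -> IPv6) call sequence. The names below (ifp,
+ * pbuf, ip, hlen, osrc, odst) are placeholders for the caller's state, not
+ * identifiers defined in this header:
+ *
+ *     uint8_t tos = ip->ip_tos, proto = ip->ip_p, ttl = ip->ip_ttl;
+ *     uint16_t len = ntohs(ip->ip_len);
+ *     struct in6_addr src6, dst6;
+ *
+ *     if (nat464_synthesize_ipv6(ifp, &ip->ip_src, &src6) != 0 ||
+ *         nat464_synthesize_ipv6(ifp, &ip->ip_dst, &dst6) != 0)
+ *             return (NT_DROP);
+ *     if (nat464_translate_46(pbuf, hlen, tos, proto, ttl, src6, dst6,
+ *         len) != NT_NAT64)
+ *             return (NT_DROP);
+ *     if (nat464_translate_proto(pbuf, &osrc, &odst, proto, PF_INET,
+ *         PF_INET6, NT_OUT, FALSE) == NT_DROP)
+ *             return (NT_DROP);
+ */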
+#endif /* !_NET_NAT464_UTILS_H_ */
index b4b9ff3406f912c3d96fc09954a03487e0f9fa33..17d00fd4f7352850f63ff7d177034e0ca3e95704 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2013-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -182,26 +182,27 @@ u_int32_t necp_session_count = 0;
 
 #define        IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(x)     ((x) == NECP_ROUTE_RULE_DENY_INTERFACE || (x) == NECP_ROUTE_RULE_ALLOW_INTERFACE)
 
-#define        NECP_KERNEL_CONDITION_ALL_INTERFACES            0x00001
-#define        NECP_KERNEL_CONDITION_BOUND_INTERFACE           0x00002
-#define        NECP_KERNEL_CONDITION_PROTOCOL                          0x00004
-#define        NECP_KERNEL_CONDITION_LOCAL_START                       0x00008
-#define        NECP_KERNEL_CONDITION_LOCAL_END                         0x00010
-#define        NECP_KERNEL_CONDITION_LOCAL_PREFIX                      0x00020
-#define        NECP_KERNEL_CONDITION_REMOTE_START                      0x00040
-#define        NECP_KERNEL_CONDITION_REMOTE_END                        0x00080
-#define        NECP_KERNEL_CONDITION_REMOTE_PREFIX                     0x00100
-#define        NECP_KERNEL_CONDITION_APP_ID                            0x00200
-#define        NECP_KERNEL_CONDITION_REAL_APP_ID                       0x00400
-#define        NECP_KERNEL_CONDITION_DOMAIN                            0x00800
-#define        NECP_KERNEL_CONDITION_ACCOUNT_ID                        0x01000
-#define        NECP_KERNEL_CONDITION_POLICY_ID                         0x02000
-#define        NECP_KERNEL_CONDITION_PID                                       0x04000
-#define        NECP_KERNEL_CONDITION_UID                                       0x08000
-#define        NECP_KERNEL_CONDITION_LAST_INTERFACE            0x10000                 // Only set from packets looping between interfaces
-#define        NECP_KERNEL_CONDITION_TRAFFIC_CLASS                     0x20000
-#define        NECP_KERNEL_CONDITION_ENTITLEMENT                       0x40000
-#define        NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT        0x80000
+#define        NECP_KERNEL_CONDITION_ALL_INTERFACES            0x000001
+#define        NECP_KERNEL_CONDITION_BOUND_INTERFACE           0x000002
+#define        NECP_KERNEL_CONDITION_PROTOCOL                          0x000004
+#define        NECP_KERNEL_CONDITION_LOCAL_START                       0x000008
+#define        NECP_KERNEL_CONDITION_LOCAL_END                         0x000010
+#define        NECP_KERNEL_CONDITION_LOCAL_PREFIX                      0x000020
+#define        NECP_KERNEL_CONDITION_REMOTE_START                      0x000040
+#define        NECP_KERNEL_CONDITION_REMOTE_END                        0x000080
+#define        NECP_KERNEL_CONDITION_REMOTE_PREFIX                     0x000100
+#define        NECP_KERNEL_CONDITION_APP_ID                            0x000200
+#define        NECP_KERNEL_CONDITION_REAL_APP_ID                       0x000400
+#define        NECP_KERNEL_CONDITION_DOMAIN                            0x000800
+#define        NECP_KERNEL_CONDITION_ACCOUNT_ID                        0x001000
+#define        NECP_KERNEL_CONDITION_POLICY_ID                         0x002000
+#define        NECP_KERNEL_CONDITION_PID                                       0x004000
+#define        NECP_KERNEL_CONDITION_UID                                       0x008000
+#define        NECP_KERNEL_CONDITION_LAST_INTERFACE            0x010000                        // Only set from packets looping between interfaces
+#define        NECP_KERNEL_CONDITION_TRAFFIC_CLASS                     0x020000
+#define        NECP_KERNEL_CONDITION_ENTITLEMENT                       0x040000
+#define        NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT        0x080000
+#define        NECP_KERNEL_CONDITION_AGENT_TYPE                        0x100000
 
 #define NECP_MAX_POLICY_RESULT_SIZE                                    512
 #define NECP_MAX_ROUTE_RULES_ARRAY_SIZE                                1024
@@ -223,6 +224,8 @@ struct necp_session {
        u_int32_t                                       session_priority; // Descriptive priority rating
        u_int32_t                                       session_order;
 
+       necp_policy_id                          last_policy_id;
+
        decl_lck_mtx_data(, lock);
 
        bool                                            proc_locked; // Messages must come from proc_uuid
@@ -270,13 +273,6 @@ static     lck_attr_t              *necp_route_rule_mtx_attr       = NULL;
 static lck_grp_t               *necp_route_rule_mtx_grp        = NULL;
 decl_lck_rw_data(static, necp_route_rule_lock);
 
-static necp_policy_id necp_last_policy_id = 0;
-static necp_kernel_policy_id necp_last_kernel_policy_id = 0;
-static u_int32_t necp_last_uuid_id = 0;
-static u_int32_t necp_last_string_id = 0;
-static u_int32_t necp_last_route_rule_id = 0;
-static u_int32_t necp_last_aggregate_route_rule_id = 0;
-
 /*
  * On modification, invalidate cached lookups by bumping the generation count.
  * Other calls will need to take the slowpath of taking
@@ -342,13 +338,13 @@ static bool necp_policy_mark_all_for_deletion(struct necp_session *session);
 static bool necp_policy_delete(struct necp_session *session, struct necp_session_policy *policy);
 static void necp_policy_apply_all(struct necp_session *session);
 
-static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
+static necp_kernel_policy_id necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
 static bool necp_kernel_socket_policy_delete(necp_kernel_policy_id policy_id);
 static bool necp_kernel_socket_policies_reprocess(void);
 static bool necp_kernel_socket_policies_update_uuid_table(void);
-static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc);
+static inline struct necp_kernel_socket_policy *necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id);
 
-static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
+static necp_kernel_policy_id necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter);
 static bool necp_kernel_ip_output_policy_delete(necp_kernel_policy_id policy_id);
 static bool necp_kernel_ip_output_policies_reprocess(void);
 
@@ -630,6 +626,12 @@ necp_session_find_from_fd(int fd, struct necp_session **session)
        }
        *session = (struct necp_session *)fp->f_fglob->fg_data;
 
+       if ((*session)->necp_fd_type != necp_fd_type_session) {
+               // Not a client fd, ignore
+               error = EINVAL;
+               goto done;
+       }
+
 done:
        proc_fdunlock(p);
        return (error);
@@ -839,7 +841,7 @@ necp_session_list_all(struct necp_session *session, struct necp_session_action_a
        u_int8_t *cursor = response;
        LIST_FOREACH(policy, &session->policies, chain) {
                if (!policy->pending_deletion && cur_policy_index < num_policies) {
-                       cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->id, response, response_size);
+                       cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->local_id, response, response_size);
                        cur_policy_index++;
                }
        }
@@ -1208,13 +1210,6 @@ necp_init(void)
        necp_kernel_ip_output_policies_count = 0;
        necp_kernel_ip_output_policies_non_id_count = 0;
 
-       necp_last_policy_id = 0;
-       necp_last_kernel_policy_id = 0;
-       necp_last_uuid_id = 0;
-       necp_last_string_id = 0;
-       necp_last_route_rule_id = 0;
-       necp_last_aggregate_route_rule_id = 0;
-
        necp_kernel_socket_policies_gencount = 1;
 
        memset(&necp_kernel_socket_policies_map, 0, sizeof(necp_kernel_socket_policies_map));
@@ -1982,53 +1977,39 @@ necp_policy_result_is_valid(u_int8_t *buffer, u_int32_t length)
        u_int8_t type = necp_policy_result_get_type_from_buffer(buffer, length);
        u_int32_t parameter_length = necp_policy_result_get_parameter_length_from_buffer(buffer, length);
        switch (type) {
-               case NECP_POLICY_RESULT_PASS: {
-                       validated = TRUE;
-                       break;
-               }
-               case NECP_POLICY_RESULT_SKIP: {
-                       if (parameter_length >= sizeof(u_int32_t)) {
-                               validated = TRUE;
-                       }
-                       break;
-               }
-               case NECP_POLICY_RESULT_DROP: {
+               case NECP_POLICY_RESULT_PASS:
+               case NECP_POLICY_RESULT_DROP:
+               case NECP_POLICY_RESULT_ROUTE_RULES:
+               case NECP_POLICY_RESULT_SCOPED_DIRECT: {
                        validated = TRUE;
                        break;
                }
-               case NECP_POLICY_RESULT_SOCKET_DIVERT: {
+               case NECP_POLICY_RESULT_SKIP:
+               case NECP_POLICY_RESULT_SOCKET_DIVERT:
+               case NECP_POLICY_RESULT_SOCKET_FILTER: {
                        if (parameter_length >= sizeof(u_int32_t)) {
                                validated = TRUE;
                        }
                        break;
                }
-               case NECP_POLICY_RESULT_SOCKET_SCOPED: {
-                       if (parameter_length > 0) {
-                               validated = TRUE;
-                       }
-                       break;
-               }
                case NECP_POLICY_RESULT_IP_TUNNEL: {
                        if (parameter_length > sizeof(u_int32_t)) {
                                validated = TRUE;
                        }
                        break;
                }
-               case NECP_POLICY_RESULT_SOCKET_FILTER: {
-                       if (parameter_length >= sizeof(u_int32_t)) {
+               case NECP_POLICY_RESULT_SOCKET_SCOPED: {
+                       if (parameter_length > 0) {
                                validated = TRUE;
                        }
                        break;
                }
-               case NECP_POLICY_RESULT_ROUTE_RULES: {
-                       validated = TRUE;
-                       break;
-               }
                case NECP_POLICY_RESULT_TRIGGER:
                case NECP_POLICY_RESULT_TRIGGER_IF_NEEDED:
                case NECP_POLICY_RESULT_TRIGGER_SCOPED:
                case NECP_POLICY_RESULT_NO_TRIGGER_SCOPED:
-               case NECP_POLICY_RESULT_USE_NETAGENT: {
+               case NECP_POLICY_RESULT_USE_NETAGENT:
+               case NECP_POLICY_RESULT_NETAGENT_SCOPED:{
                        if (parameter_length >= sizeof(uuid_t)) {
                                validated = TRUE;
                        }
@@ -2115,7 +2096,9 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
                                                                                policy_result_type == NECP_POLICY_RESULT_NO_TRIGGER_SCOPED ||
                                                                                policy_result_type == NECP_POLICY_RESULT_SOCKET_SCOPED ||
                                                                                policy_result_type == NECP_POLICY_RESULT_ROUTE_RULES ||
-                                                                               policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT) ? TRUE : FALSE;
+                                                                               policy_result_type == NECP_POLICY_RESULT_USE_NETAGENT ||
+                                                                               policy_result_type == NECP_POLICY_RESULT_NETAGENT_SCOPED ||
+                                                                               policy_result_type == NECP_POLICY_RESULT_SCOPED_DIRECT) ? TRUE : FALSE;
        u_int32_t condition_length = necp_policy_condition_get_value_length_from_buffer(buffer, length);
        u_int8_t *condition_value = necp_policy_condition_get_value_pointer_from_buffer(buffer, length);
        u_int8_t type = necp_policy_condition_get_type_from_buffer(buffer, length);
@@ -2190,6 +2173,13 @@ necp_policy_condition_is_valid(u_int8_t *buffer, u_int32_t length, u_int8_t poli
                        }
                        break;
                }
+               case NECP_POLICY_CONDITION_AGENT_TYPE: {
+                       if (!(flags & NECP_POLICY_CONDITION_FLAGS_NEGATIVE) &&
+                               condition_length >= sizeof(struct necp_policy_condition_agent_type)) {
+                               validated = TRUE;
+                       }
+                       break;
+               }
                default: {
                        validated = FALSE;
                        break;
@@ -2228,6 +2218,10 @@ necp_policy_route_rule_is_valid(u_int8_t *buffer, u_int32_t length)
                        validated = TRUE;
                        break;
                }
+               case NECP_ROUTE_RULE_DENY_LQM_ABORT: {
+                       validated = TRUE;
+                       break;
+               }
                default: {
                        validated = FALSE;
                        break;
@@ -2658,9 +2652,9 @@ necp_handle_policy_add(struct necp_session *session, u_int32_t message_id, mbuf_
        }
 
        if (packet != NULL) {
-               necp_send_policy_id_response(session, NECP_PACKET_TYPE_POLICY_ADD, message_id, policy->id);
+               necp_send_policy_id_response(session, NECP_PACKET_TYPE_POLICY_ADD, message_id, policy->local_id);
        }
-       return (policy->id);
+       return (policy->local_id);
 
 fail:
        if (policy_result != NULL) {
@@ -2815,7 +2809,7 @@ necp_handle_policy_list_all(struct necp_session *session, u_int32_t message_id,
 
        LIST_FOREACH(policy, &session->policies, chain) {
                if (!policy->pending_deletion && cur_policy_index < num_policies) {
-                       cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->id, response, response_size);
+                       cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_ID, sizeof(u_int32_t), &policy->local_id, response, response_size);
                        cur_policy_index++;
                }
        }
@@ -2836,22 +2830,17 @@ necp_handle_policy_delete_all(struct necp_session *session, u_int32_t message_id
 }
 
 static necp_policy_id
-necp_policy_get_new_id(void)
+necp_policy_get_new_id(struct necp_session *session)
 {
-       necp_policy_id newid = 0;
-
-       lck_rw_lock_exclusive(&necp_kernel_policy_lock);
-
-       necp_last_policy_id++;
-       if (necp_last_policy_id < 1) {
-               necp_last_policy_id = 1;
+       session->last_policy_id++;
+       if (session->last_policy_id < 1) {
+               session->last_policy_id = 1;
        }
 
-       newid = necp_last_policy_id;
-       lck_rw_done(&necp_kernel_policy_lock);
+       necp_policy_id newid = session->last_policy_id;
 
        if (newid == 0) {
-               NECPLOG0(LOG_DEBUG, "Allocate policy id failed.\n");
+               NECPLOG0(LOG_ERR, "Allocate policy id failed.\n");
                return (0);
        }
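With policy IDs now scoped to the session, the allocator above no longer needs to take necp_kernel_policy_lock; the only state it touches is the caller's session. A minimal sketch of the same pattern, under hypothetical names (struct example_session, example_session_get_new_id) rather than the NECP types:

        struct example_session {
                u_int32_t last_id;      /* last ID handed out for this session */
        };

        /* Return the next non-zero ID for this session, wrapping back to 1 on
         * overflow. The caller is assumed to already hold the session's lock. */
        static u_int32_t
        example_session_get_new_id(struct example_session *session)
        {
                session->last_id++;
                if (session->last_id < 1) {
                        session->last_id = 1;   /* callers treat 0 as allocation failure */
                }
                return (session->last_id);
        }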
 
@@ -3112,6 +3101,10 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                }
                                num_conditions++;
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) {
+                               condition_tlv_length += sizeof(struct necp_policy_condition_agent_type);
+                               num_conditions++;
+                       }
                }
 
                condition_tlv_length += num_conditions * (sizeof(u_int8_t) + sizeof(u_int32_t)); // These are for the condition TLVs. The space for "value" is already accounted for above.
@@ -3236,6 +3229,11 @@ necp_handle_policy_dump_all(struct necp_session *session, u_int32_t message_id,
                                                                                                                        cond_buf, condition_tlv_length);
                                }
                        }
+                       if (condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) {
+                               cond_buf_cursor = necp_buffer_write_tlv(cond_buf_cursor, NECP_POLICY_CONDITION_AGENT_TYPE,
+                                                                                                               sizeof(policy->cond_agent_type), &policy->cond_agent_type,
+                                                                                                               cond_buf, condition_tlv_length);
+                       }
                }
 
                cursor = necp_buffer_write_tlv(cursor, NECP_TLV_POLICY_CONDITION, cond_buf_cursor - cond_buf, cond_buf, tlv_buffer, total_allocated_bytes);
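Each condition emitted above is a TLV whose header is a one-byte type followed by a four-byte length, which is what the sizeof(u_int8_t) + sizeof(u_int32_t) accounting earlier in this hunk reserves per condition. A hand-rolled sketch of that wire layout follows; it is illustrative only and is not the kernel's necp_buffer_write_tlv(), which also receives the destination buffer and its size:

        /* Sketch of the TLV layout implied by the size accounting above:
         * [u_int8_t type][u_int32_t length][length bytes of value].
         * Unlike the real helper, this does no bounds checking. */
        static u_int8_t *
        sketch_write_tlv(u_int8_t *cursor, u_int8_t type, u_int32_t length, const void *value)
        {
                memcpy(cursor, &type, sizeof(type));
                cursor += sizeof(type);
                memcpy(cursor, &length, sizeof(length));
                cursor += sizeof(length);
                memcpy(cursor, value, length);
                return (cursor + length);
        }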
@@ -3377,7 +3375,7 @@ necp_policy_create(struct necp_session *session, necp_policy_order order, u_int8
        new_policy->route_rules_size = route_rules_array_size;
        new_policy->result = result;
        new_policy->result_size = result_size;
-       new_policy->id = necp_policy_get_new_id();
+       new_policy->local_id = necp_policy_get_new_id(session);
 
        LIST_INSERT_SORTED_ASCENDING(&session->policies, new_policy, chain, order, tmp_policy);
 
@@ -3399,7 +3397,7 @@ necp_policy_find(struct necp_session *session, necp_policy_id policy_id)
        }
 
        LIST_FOREACH(policy, &session->policies, chain) {
-               if (policy->id == policy_id) {
+               if (policy->local_id == policy_id) {
                        return (policy);
                }
        }
@@ -3610,6 +3608,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        u_int32_t offset = 0;
        u_int8_t ultimate_result = 0;
        u_int32_t secondary_result = 0;
+       struct necp_policy_condition_agent_type cond_agent_type = {};
        necp_kernel_policy_result_parameter secondary_result_parameter;
        memset(&secondary_result_parameter, 0, sizeof(secondary_result_parameter));
        u_int32_t cond_last_interface_index = 0;
@@ -3862,6 +3861,14 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                                socket_ip_conditions = TRUE;
                                break;
                        }
+                       case NECP_POLICY_CONDITION_AGENT_TYPE: {
+                               if (condition_length >= sizeof(cond_agent_type)) {
+                                       master_condition_mask |= NECP_KERNEL_CONDITION_AGENT_TYPE;
+                                       memcpy(&cond_agent_type, condition_value, sizeof(cond_agent_type));
+                                       socket_only_conditions = TRUE;
+                               }
+                               break;
+                       }
                        default: {
                                break;
                        }
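The agent-type condition is taken by value: validation earlier in this diff rejects a negated NECP_POLICY_CONDITION_AGENT_TYPE and any payload shorter than the structure, and only sizeof(cond_agent_type) bytes are copied here. A rough sketch of populating such a payload; the agent_domain/agent_type field names come from the matching code added later in this diff, while the example string is purely hypothetical:

        /* Sketch only: the authoritative struct definition lives in necp.h. */
        struct necp_policy_condition_agent_type cond_value = {};

        /* Leaving agent_domain zeroed makes it a wildcard during matching
         * (see the check added to necp_socket_check_policy below). */
        strlcpy(cond_value.agent_type, "example-agent-type", sizeof(cond_value.agent_type));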
@@ -3978,7 +3985,8 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        }
                        break;
                }
-               case NECP_POLICY_RESULT_USE_NETAGENT: {
+               case NECP_POLICY_RESULT_USE_NETAGENT:
+               case NECP_POLICY_RESULT_NETAGENT_SCOPED: {
                        uuid_t netagent_uuid;
                        if (necp_policy_get_result_parameter(policy, (u_int8_t *)&netagent_uuid, sizeof(netagent_uuid))) {
                                ultimate_result_parameter.netagent_id = necp_create_uuid_service_id_mapping(netagent_uuid);
@@ -4004,6 +4012,10 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                        }
                        break;
                }
+               case NECP_POLICY_RESULT_SCOPED_DIRECT: {
+                       socket_layer_non_id_conditions = TRUE;
+                       break;
+               }
                case NECP_POLICY_RESULT_ROUTE_RULES: {
                        if (policy->route_rules != NULL && policy->route_rules_size > 0) {
                                u_int32_t route_rule_id = necp_create_route_rule(&necp_route_rules, policy->route_rules, policy->route_rules_size);
@@ -4021,7 +4033,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        }
 
        if (socket_layer_non_id_conditions) {
-               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->id, policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_socket_policy_add(policy->order, session->session_order, session->proc_pid, master_condition_mask, master_condition_negated_mask, cond_app_id, cond_real_app_id, cond_custom_entitlement, cond_account_id, cond_domain, cond_pid, cond_uid, cond_bound_interface, cond_traffic_class, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, &cond_agent_type, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying socket kernel policy");
@@ -4037,7 +4049,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
                if (ip_output_layer_non_id_only) {
                        condition_mask |= NECP_KERNEL_CONDITION_POLICY_ID;
                }
-               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS, session->session_order, session->proc_pid, condition_mask, master_condition_negated_mask, NECP_KERNEL_POLICY_ID_NONE, cond_bound_interface, 0, cond_protocol, &cond_local_start, &cond_local_end, cond_local_prefix, &cond_remote_start, &cond_remote_end, cond_remote_prefix, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy");
@@ -4048,7 +4060,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        }
 
        if (ip_output_layer_id_condition) {
-               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, cond_ip_output_layer_id, NULL, 0, 0, NULL, NULL, 0, NULL, NULL, 0, ultimate_result, ultimate_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, cond_ip_output_layer_id, NULL, 0, 0, NULL, NULL, 0, NULL, NULL, 0, ultimate_result, ultimate_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy");
@@ -4060,7 +4072,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
 
        // Extra policies for IP Output tunnels for when packets loop back
        if (ip_output_layer_tunnel_condition_from_id) {
-               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_NON_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_NON_ID_CONDITIONS], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy");
@@ -4071,7 +4083,7 @@ necp_policy_apply(struct necp_session *session, struct necp_session_policy *poli
        }
 
        if (ip_output_layer_tunnel_condition_from_id) {
-               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->id, policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter);
+               necp_kernel_policy_id policy_id = necp_kernel_ip_output_policy_add(policy->order, NECP_KERNEL_POLICY_SUBORDER_ID_TUNNEL_CONDITION, session->session_order, session->proc_pid, NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE | NECP_KERNEL_CONDITION_ALL_INTERFACES, 0, policy->kernel_ip_output_policies[NECP_KERNEL_POLICY_SUBORDER_ID_CONDITION], NULL, cond_last_interface_index, 0, NULL, NULL, 0, NULL, NULL, 0, secondary_result, secondary_result_parameter);
 
                if (policy_id == 0) {
                        NECPLOG0(LOG_DEBUG, "Error applying IP output kernel policy");
@@ -4189,9 +4201,9 @@ necp_kernel_policy_get_new_id(bool socket_level)
        return (newid);
 }
 
-#define        NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT)
+#define        NECP_KERNEL_VALID_SOCKET_CONDITIONS (NECP_KERNEL_CONDITION_APP_ID | NECP_KERNEL_CONDITION_REAL_APP_ID | NECP_KERNEL_CONDITION_DOMAIN | NECP_KERNEL_CONDITION_ACCOUNT_ID | NECP_KERNEL_CONDITION_PID | NECP_KERNEL_CONDITION_UID | NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_TRAFFIC_CLASS | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_ENTITLEMENT | NECP_KERNEL_CONDITION_CUSTOM_ENTITLEMENT | NECP_KERNEL_CONDITION_AGENT_TYPE)
 static necp_kernel_policy_id
-necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
+necp_kernel_socket_policy_add(necp_policy_order order, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_app_id cond_app_id, necp_app_id cond_real_app_id, char *cond_custom_entitlement, u_int32_t cond_account_id, char *cond_domain, pid_t cond_pid, uid_t cond_uid, ifnet_t cond_bound_interface, struct necp_policy_condition_tc_range cond_traffic_class, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, struct necp_policy_condition_agent_type *cond_agent_type, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
 {
        struct necp_kernel_socket_policy *new_kernel_policy = NULL;
        struct necp_kernel_socket_policy *tmp_kernel_policy = NULL;
@@ -4202,7 +4214,6 @@ necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order
        }
 
        memset(new_kernel_policy, 0, sizeof(*new_kernel_policy)); // M_ZERO is not supported for MALLOC_ZONE
-       new_kernel_policy->parent_policy_id = parent_policy_id;
        new_kernel_policy->id = necp_kernel_policy_get_new_id(true);
        new_kernel_policy->order = order;
        new_kernel_policy->session_order = session_order;
@@ -4281,6 +4292,9 @@ necp_kernel_socket_policy_add(necp_policy_id parent_policy_id, necp_policy_order
        if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_REMOTE_PREFIX) {
                new_kernel_policy->cond_remote_prefix = cond_remote_prefix;
        }
+       if (new_kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) {
+               memcpy(&new_kernel_policy->cond_agent_type, cond_agent_type, sizeof(*cond_agent_type));
+       }
 
        new_kernel_policy->result = result;
        memcpy(&new_kernel_policy->result_parameter, &result_parameter, sizeof(result_parameter));
@@ -4388,6 +4402,10 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                        snprintf(result_string, MAX_RESULT_STRING_LEN, "SocketScoped (%s%d)", ifnet_name(interface), ifnet_unit(interface));
                        break;
                }
+               case NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT: {
+                       snprintf(result_string, MAX_RESULT_STRING_LEN, "ScopedDirect");
+                       break;
+               }
                case NECP_KERNEL_POLICY_RESULT_ROUTE_RULES: {
                        int index = 0;
                        char interface_names[IFXNAMSIZ][MAX_ROUTE_RULE_INTERFACES];
@@ -4498,6 +4516,16 @@ necp_get_result_description(char *result_string, necp_kernel_policy_result resul
                        snprintf(result_string, MAX_RESULT_STRING_LEN, "UseNetAgent (%s)", found_mapping ? uuid_string : "Unknown");
                        break;
                }
+               case NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED: {
+                       bool found_mapping = FALSE;
+                       struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.netagent_id);
+                       if (mapping != NULL) {
+                               uuid_unparse(mapping->uuid, uuid_string);
+                               found_mapping = TRUE;
+                       }
+                       snprintf(result_string, MAX_RESULT_STRING_LEN, "NetAgentScoped (%s)", found_mapping ? uuid_string : "Unknown");
+                       break;
+               }
                case NECP_POLICY_RESULT_TRIGGER: {
                        bool found_mapping = FALSE;
                        struct necp_uuid_id_mapping *mapping = necp_uuid_lookup_uuid_with_service_id_locked(result_parameter.service.identifier);
@@ -4597,7 +4625,8 @@ necp_kernel_socket_policy_results_overlap(struct necp_kernel_socket_policy *uppe
                return (TRUE);
        } else if (upper_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER ||
                           upper_policy->result == NECP_KERNEL_POLICY_RESULT_ROUTE_RULES ||
-                          upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT) {
+                          upper_policy->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT ||
+                          upper_policy->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) {
                // Filters and route rules never cancel out lower policies
                return (FALSE);
        } else if (necp_kernel_socket_result_is_trigger_service_type(upper_policy)) {
@@ -4766,6 +4795,11 @@ necp_kernel_socket_policy_is_unnecessary(struct necp_kernel_socket_policy *polic
                        }
                }
 
+               if (compared_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE &&
+                       memcmp(&compared_policy->cond_agent_type, &policy->cond_agent_type, sizeof(policy->cond_agent_type)) == 0) {
+                       continue;
+               }
+
                return (TRUE);
        }
 
@@ -4813,6 +4847,11 @@ necp_kernel_socket_policies_reprocess(void)
                necp_kernel_application_policies_count++;
                app_layer_allocation_count++;
 
+               if ((kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE)) {
+                       // Agent type conditions only apply to app layer
+                       continue;
+               }
+
                // Update socket layer bucket mask/counts
                necp_kernel_socket_policies_condition_mask |= kernel_policy->condition_mask;
                necp_kernel_socket_policies_count++;
@@ -4850,7 +4889,19 @@ necp_kernel_socket_policies_reprocess(void)
 
        // Fill out maps
        LIST_FOREACH(kernel_policy, &necp_kernel_socket_policies, chain) {
-               // Insert pointers into map
+               // Add app layer policies
+               if (!necp_kernel_socket_policy_is_unnecessary(kernel_policy, necp_kernel_socket_policies_app_layer_map, app_layer_current_free_index)) {
+                       necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = kernel_policy;
+                       app_layer_current_free_index++;
+                       necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = NULL;
+               }
+
+               if ((kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE)) {
+                       // Agent type conditions only apply to app layer
+                       continue;
+               }
+
+               // Add socket policies
                if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_APP_ID) ||
                        kernel_policy->condition_negated_mask & NECP_KERNEL_CONDITION_APP_ID) {
                        for (app_i = 0; app_i < NECP_KERNEL_SOCKET_POLICIES_MAP_NUM_APP_ID_BUCKETS; app_i++) {
@@ -4868,12 +4919,6 @@ necp_kernel_socket_policies_reprocess(void)
                                (necp_kernel_socket_policies_map[app_i])[(bucket_current_free_index[app_i])] = NULL;
                        }
                }
-
-               if (!necp_kernel_socket_policy_is_unnecessary(kernel_policy, necp_kernel_socket_policies_app_layer_map, app_layer_current_free_index)) {
-                       necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = kernel_policy;
-                       app_layer_current_free_index++;
-                       necp_kernel_socket_policies_app_layer_map[app_layer_current_free_index] = NULL;
-               }
        }
        necp_kernel_socket_policies_dump_all();
        BUMP_KERNEL_SOCKET_POLICIES_GENERATION_COUNT();
@@ -4902,18 +4947,29 @@ fail:
 static u_int32_t
 necp_get_new_string_id(void)
 {
+       static u_int32_t necp_last_string_id = 0;
+
        u_int32_t newid = 0;
 
        LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
-       necp_last_string_id++;
-       if (necp_last_string_id < 1) {
-               necp_last_string_id = 1;
-       }
+       bool wrapped = FALSE;
+       do {
+               necp_last_string_id++;
+               if (necp_last_string_id < 1) {
+                       if (wrapped) {
+                               // Already wrapped, give up
+                               NECPLOG0(LOG_ERR, "Failed to find a free string id.\n");
+                               return (0);
+                       }
+                       necp_last_string_id = 1;
+                       wrapped = TRUE;
+               }
+               newid = necp_last_string_id;
+       } while (necp_lookup_string_with_id_locked(&necp_account_id_list, newid) != NULL); // If already used, keep trying
 
-       newid = necp_last_string_id;
        if (newid == 0) {
-               NECPLOG0(LOG_DEBUG, "Allocate string id failed.\n");
+               NECPLOG0(LOG_ERR, "Allocate string id failed.\n");
                return (0);
        }
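The string-ID allocator above, and the route-rule and UUID allocators that follow, all adopt the same wrap-and-probe shape: bump a static counter, fold it back into its valid range at most once, and skip values that are still in use. A generic sketch of that shape, where is_id_in_use() is a hypothetical stand-in for lookups such as necp_lookup_string_with_id_locked():

        /* Generic wrap-and-probe ID allocation. Returns 0 if the whole range
         * [first_valid, max_valid] has been probed without finding a free ID. */
        static u_int32_t
        allocate_id(u_int32_t *last_id, u_int32_t first_valid, u_int32_t max_valid,
            bool (*is_id_in_use)(u_int32_t))
        {
                u_int32_t newid = 0;
                bool wrapped = FALSE;

                do {
                        (*last_id)++;
                        if (*last_id < first_valid || *last_id > max_valid) {
                                if (wrapped) {
                                        return (0);     /* wrapped once already, give up */
                                }
                                *last_id = first_valid;
                                wrapped = TRUE;
                        }
                        newid = *last_id;
                } while (is_id_in_use(newid));

                return (newid);
        }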
 
@@ -5007,42 +5063,57 @@ necp_remove_string_to_id_mapping(struct necp_string_id_mapping_list *list, char
        return (FALSE);
 }
 
+#define NECP_FIRST_VALID_ROUTE_RULE_ID 1
+#define NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID UINT16_MAX
 static u_int32_t
-necp_get_new_route_rule_id(void)
+necp_get_new_route_rule_id(bool aggregate)
 {
-       u_int32_t newid = 0;
+       static u_int32_t necp_last_route_rule_id = 0;
+       static u_int32_t necp_last_aggregate_route_rule_id = 0;
 
-       LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
-
-       necp_last_route_rule_id++;
-       if (necp_last_route_rule_id < 1 || necp_last_route_rule_id > UINT16_MAX) {
-               necp_last_route_rule_id = 1;
-       }
-
-       newid = necp_last_route_rule_id;
-       if (newid == 0) {
-               NECPLOG0(LOG_DEBUG, "Allocate route rule id failed.\n");
-               return (0);
-       }
-
-       return (newid);
-}
-
-static u_int32_t
-necp_get_new_aggregate_route_rule_id(void)
-{
        u_int32_t newid = 0;
 
-       LCK_RW_ASSERT(&necp_route_rule_lock, LCK_RW_ASSERT_EXCLUSIVE);
+       if (!aggregate) {
+               // Main necp_kernel_policy_lock protects non-aggregate rule IDs
+               LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
-       necp_last_aggregate_route_rule_id++;
-       if (necp_last_aggregate_route_rule_id <= UINT16_MAX) {
-               necp_last_aggregate_route_rule_id = UINT16_MAX + 1;
+               bool wrapped = FALSE;
+               do {
+                       necp_last_route_rule_id++;
+                       if (necp_last_route_rule_id < NECP_FIRST_VALID_ROUTE_RULE_ID ||
+                               necp_last_route_rule_id >= NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID) {
+                               if (wrapped) {
+                                       // Already wrapped, give up
+                                       NECPLOG0(LOG_ERR, "Failed to find a free route rule id.\n");
+                                       return (0);
+                               }
+                               necp_last_route_rule_id = NECP_FIRST_VALID_ROUTE_RULE_ID;
+                               wrapped = TRUE;
+                       }
+                       newid = necp_last_route_rule_id;
+               } while (necp_lookup_route_rule_locked(&necp_route_rules, newid) != NULL); // If already used, keep trying
+       } else {
+               // necp_route_rule_lock protects aggregate rule IDs
+               LCK_RW_ASSERT(&necp_route_rule_lock, LCK_RW_ASSERT_EXCLUSIVE);
+
+               bool wrapped = FALSE;
+               do {
+                       necp_last_aggregate_route_rule_id++;
+                       if (necp_last_aggregate_route_rule_id < NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID) {
+                               if (wrapped) {
+                                       // Already wrapped, give up
+                                       NECPLOG0(LOG_ERR, "Failed to find a free aggregate route rule id.\n");
+                                       return (0);
+                               }
+                               necp_last_aggregate_route_rule_id = NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID;
+                               wrapped = TRUE;
+                       }
+                       newid = necp_last_aggregate_route_rule_id;
+               } while (necp_lookup_route_rule_locked(&necp_route_rules, newid) != NULL); // If already used, keep trying
        }
 
-       newid = necp_last_aggregate_route_rule_id;
        if (newid == 0) {
-               NECPLOG0(LOG_DEBUG, "Allocate aggregate route rule id failed.\n");
+               NECPLOG0(LOG_ERR, "Allocate route rule ID failed.\n");
                return (0);
        }
 
@@ -5202,7 +5273,7 @@ necp_create_route_rule(struct necp_route_rule_list *list, u_int8_t *route_rules_
                MALLOC(new_rule, struct necp_route_rule *, sizeof(struct necp_route_rule), M_NECP, M_WAITOK);
                if (new_rule != NULL) {
                        memset(new_rule, 0, sizeof(struct necp_route_rule));
-                       route_rule_id = new_rule->id = necp_get_new_route_rule_id();
+                       route_rule_id = new_rule->id = necp_get_new_route_rule_id(false);
                        new_rule->default_action = default_action;
                        new_rule->cellular_action = cellular_action;
                        new_rule->wifi_action = wifi_action;
@@ -5308,7 +5379,7 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids)
        MALLOC(new_rule, struct necp_aggregate_route_rule *, sizeof(struct necp_aggregate_route_rule), M_NECP, M_WAITOK);
        if (new_rule != NULL) {
                memset(new_rule, 0, sizeof(struct necp_aggregate_route_rule));
-               aggregate_route_rule_id = new_rule->id = necp_get_new_aggregate_route_rule_id();
+               aggregate_route_rule_id = new_rule->id = necp_get_new_route_rule_id(true);
                new_rule->id = aggregate_route_rule_id;
                memcpy(new_rule->rule_ids, rule_ids, (sizeof(u_int32_t) * MAX_AGGREGATE_ROUTE_RULES));
                LIST_INSERT_HEAD(&necp_aggregate_route_rules, new_rule, chain);
@@ -5319,22 +5390,54 @@ necp_create_aggregate_route_rule(u_int32_t *rule_ids)
 }
 
 #define NECP_NULL_SERVICE_ID 1
+#define NECP_FIRST_VALID_SERVICE_ID  2
+#define NECP_FIRST_VALID_APP_ID  UINT16_MAX
 static u_int32_t
-necp_get_new_uuid_id(void)
+necp_get_new_uuid_id(bool service)
 {
+       static u_int32_t necp_last_service_uuid_id = 0;
+       static u_int32_t necp_last_app_uuid_id = 0;
+
        u_int32_t newid = 0;
 
        LCK_RW_ASSERT(&necp_kernel_policy_lock, LCK_RW_ASSERT_EXCLUSIVE);
 
-       necp_last_uuid_id++;
-       if (necp_last_uuid_id < (NECP_NULL_SERVICE_ID + 1)) {
-               necp_last_uuid_id = (NECP_NULL_SERVICE_ID + 1);
+       if (service) {
+               bool wrapped = FALSE;
+               do {
+                       necp_last_service_uuid_id++;
+                       if (necp_last_service_uuid_id < NECP_FIRST_VALID_SERVICE_ID ||
+                               necp_last_service_uuid_id >= NECP_FIRST_VALID_APP_ID) {
+                               if (wrapped) {
+                                       // Already wrapped, give up
+                                       NECPLOG0(LOG_ERR, "Failed to find a free service UUID.\n");
+                                       return (NECP_NULL_SERVICE_ID);
+                               }
+                               necp_last_service_uuid_id = NECP_FIRST_VALID_SERVICE_ID;
+                               wrapped = TRUE;
+                       }
+                       newid = necp_last_service_uuid_id;
+               } while (necp_uuid_lookup_uuid_with_service_id_locked(newid) != NULL); // If already used, keep trying
+       } else {
+               bool wrapped = FALSE;
+               do {
+                       necp_last_app_uuid_id++;
+                       if (necp_last_app_uuid_id < NECP_FIRST_VALID_APP_ID) {
+                               if (wrapped) {
+                                       // Already wrapped, give up
+                                       NECPLOG0(LOG_ERR, "Failed to find a free app UUID.\n");
+                                       return (NECP_NULL_SERVICE_ID);
+                               }
+                               necp_last_app_uuid_id = NECP_FIRST_VALID_APP_ID;
+                               wrapped = TRUE;
+                       }
+                       newid = necp_last_app_uuid_id;
+               } while (necp_uuid_lookup_uuid_with_app_id_locked(newid) != NULL); // If already used, keep trying
        }
 
-       newid = necp_last_uuid_id;
-       if (newid == 0) {
-               NECPLOG0(LOG_DEBUG, "Allocate uuid id failed.\n");
-               return (0);
+       if (newid == NECP_NULL_SERVICE_ID) {
+               NECPLOG0(LOG_ERR, "Allocate uuid ID failed.\n");
+               return (NECP_NULL_SERVICE_ID);
        }
 
        return (newid);
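The constants above carve up a single 32-bit ID space: 1 is reserved as NECP_NULL_SERVICE_ID, service UUID mappings draw IDs from [NECP_FIRST_VALID_SERVICE_ID, NECP_FIRST_VALID_APP_ID), and app UUID mappings start at NECP_FIRST_VALID_APP_ID, mirroring the route-rule split at NECP_FIRST_VALID_AGGREGATE_ROUTE_RULE_ID. A small classifier sketch (the function itself is hypothetical) that makes the ranges explicit:

        /* Hypothetical helper: name the range a UUID mapping ID falls into. */
        static const char *
        sketch_uuid_id_kind(u_int32_t id)
        {
                if (id == NECP_NULL_SERVICE_ID) {
                        return ("null service");
                } else if (id >= NECP_FIRST_VALID_SERVICE_ID && id < NECP_FIRST_VALID_APP_ID) {
                        return ("service UUID id");
                } else if (id >= NECP_FIRST_VALID_APP_ID) {
                        return ("app UUID id");
                }
                return ("invalid");     /* 0 is never handed out */
        }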
@@ -5399,7 +5502,7 @@ necp_create_uuid_app_id_mapping(uuid_t uuid, bool *allocated_mapping, bool uuid_
                MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK);
                if (new_mapping != NULL) {
                        uuid_copy(new_mapping->uuid, uuid);
-                       new_mapping->id = necp_get_new_uuid_id();
+                       new_mapping->id = necp_get_new_uuid_id(false);
                        new_mapping->refcount = 1;
                        if (uuid_policy_table) {
                                new_mapping->table_refcount = 1;
@@ -5520,7 +5623,7 @@ necp_create_uuid_service_id_mapping(uuid_t uuid)
                MALLOC(new_mapping, struct necp_uuid_id_mapping *, sizeof(*new_mapping), M_NECP, M_WAITOK);
                if (new_mapping != NULL) {
                        uuid_copy(new_mapping->uuid, uuid);
-                       new_mapping->id = necp_get_new_uuid_id();
+                       new_mapping->id = necp_get_new_uuid_id(true);
                        new_mapping->refcount = 1;
 
                        LIST_INSERT_HEAD(&necp_uuid_service_id_list, new_mapping, chain);
@@ -5588,7 +5691,7 @@ necp_kernel_socket_policies_update_uuid_table(void)
 
 #define        NECP_KERNEL_VALID_IP_OUTPUT_CONDITIONS (NECP_KERNEL_CONDITION_ALL_INTERFACES | NECP_KERNEL_CONDITION_BOUND_INTERFACE | NECP_KERNEL_CONDITION_PROTOCOL | NECP_KERNEL_CONDITION_LOCAL_START | NECP_KERNEL_CONDITION_LOCAL_END | NECP_KERNEL_CONDITION_LOCAL_PREFIX | NECP_KERNEL_CONDITION_REMOTE_START | NECP_KERNEL_CONDITION_REMOTE_END | NECP_KERNEL_CONDITION_REMOTE_PREFIX | NECP_KERNEL_CONDITION_POLICY_ID | NECP_KERNEL_CONDITION_LAST_INTERFACE)
 static necp_kernel_policy_id
-necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
+necp_kernel_ip_output_policy_add(necp_policy_order order, necp_policy_order suborder, u_int32_t session_order, int session_pid, u_int32_t condition_mask, u_int32_t condition_negated_mask, necp_kernel_policy_id cond_policy_id, ifnet_t cond_bound_interface, u_int32_t cond_last_interface_index, u_int16_t cond_protocol, union necp_sockaddr_union *cond_local_start, union necp_sockaddr_union *cond_local_end, u_int8_t cond_local_prefix, union necp_sockaddr_union *cond_remote_start, union necp_sockaddr_union *cond_remote_end, u_int8_t cond_remote_prefix, necp_kernel_policy_result result, necp_kernel_policy_result_parameter result_parameter)
 {
        struct necp_kernel_ip_output_policy *new_kernel_policy = NULL;
        struct necp_kernel_ip_output_policy *tmp_kernel_policy = NULL;
@@ -5599,7 +5702,6 @@ necp_kernel_ip_output_policy_add(necp_policy_id parent_policy_id, necp_policy_or
        }
 
        memset(new_kernel_policy, 0, sizeof(*new_kernel_policy)); // M_ZERO is not supported for MALLOC_ZONE
-       new_kernel_policy->parent_policy_id = parent_policy_id;
        new_kernel_policy->id = necp_kernel_policy_get_new_id(false);
        new_kernel_policy->suborder = suborder;
        new_kernel_policy->order = order;
@@ -5894,12 +5996,17 @@ necp_kernel_ip_output_policies_reprocess(void)
                necp_kernel_ip_output_policies_condition_mask |= kernel_policy->condition_mask;
                necp_kernel_ip_output_policies_count++;
 
-               // Update bucket counts
-               if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) {
-                       necp_kernel_ip_output_policies_non_id_count++;
+               /* Update bucket counts:
+                * Non-id and SKIP policies will be added to all buckets
+                */
+               if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) ||
+                       kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                        for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) {
                                bucket_allocation_counts[i]++;
                        }
+               }
+               if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) {
+                       necp_kernel_ip_output_policies_non_id_count++;
                } else {
                        bucket_allocation_counts[NECP_IP_OUTPUT_MAP_ID_TO_BUCKET(kernel_policy->cond_policy_id)]++;
                }
@@ -5921,7 +6028,8 @@ necp_kernel_ip_output_policies_reprocess(void)
 
        LIST_FOREACH(kernel_policy, &necp_kernel_ip_output_policies, chain) {
                // Insert pointers into map
-               if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID)) {
+               if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) ||
+                       kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                        for (i = 0; i < NECP_KERNEL_IP_OUTPUT_POLICIES_MAP_NUM_ID_BUCKETS; i++) {
                                if (!necp_kernel_ip_output_policy_is_unnecessary(kernel_policy, necp_kernel_ip_output_policies_map[i], bucket_current_free_index[i])) {
                                        (necp_kernel_ip_output_policies_map[i])[(bucket_current_free_index[i])] = kernel_policy;
@@ -6074,6 +6182,27 @@ necp_copy_string(char *string, size_t length)
        return (copied_string);
 }
 
+static u_int32_t
+necp_get_primary_direct_interface_index(void)
+{
+       u_int32_t interface_index = IFSCOPE_NONE;
+
+       ifnet_head_lock_shared();
+       struct ifnet *ordered_interface = NULL;
+       TAILQ_FOREACH(ordered_interface, &ifnet_ordered_head, if_ordered_link) {
+               const u_int8_t functional_type = if_functional_type(ordered_interface, TRUE);
+               if (functional_type != IFRTYPE_FUNCTIONAL_UNKNOWN &&
+                       functional_type != IFRTYPE_FUNCTIONAL_LOOPBACK) {
+                       // All known, non-loopback functional types represent direct physical interfaces (Wi-Fi, Cellular, Wired)
+                       interface_index = ordered_interface->if_index;
+                       break;
+               }
+       }
+       ifnet_head_done();
+
+       return interface_index;
+}
+
 static inline void
 necp_get_parent_cred_result(proc_t proc, struct necp_socket_info *info)
 {
@@ -6244,8 +6373,15 @@ necp_application_find_policy_match_internal(proc_t proc,
        char *domain = NULL;
        char *account = NULL;
 
+#define NECP_MAX_REQUIRED_AGENTS 16
+       u_int32_t num_required_agent_types = 0;
+       struct necp_client_parameter_netagent_type required_agent_types[NECP_MAX_REQUIRED_AGENTS];
+       memset(&required_agent_types, 0, sizeof(required_agent_types));
+
        u_int32_t netagent_ids[NECP_MAX_NETAGENTS];
+       u_int32_t netagent_use_flags[NECP_MAX_NETAGENTS];
        memset(&netagent_ids, 0, sizeof(netagent_ids));
+       memset(&netagent_use_flags, 0, sizeof(netagent_use_flags));
        int netagent_cursor;
 
        bool has_checked_delegation_entitlement = FALSE;
@@ -6398,6 +6534,17 @@ necp_application_find_policy_match_internal(proc_t proc,
                                                if (length >= sizeof(client_flags)) {
                                                        memcpy(&client_flags, value, sizeof(client_flags));
                                                }
+                                               break;
+                                       }
+                                       case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: {
+                                               if (num_required_agent_types >= NECP_MAX_REQUIRED_AGENTS) {
+                                                       break;
+                                               }
+                                               if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
+                                                       memcpy(&required_agent_types[num_required_agent_types], value, sizeof(struct necp_client_parameter_netagent_type));
+                                                       num_required_agent_types++;
+                                               }
+                                               break;
                                        }
                                        default: {
                                                break;
@@ -6413,7 +6560,7 @@ necp_application_find_policy_match_internal(proc_t proc,
        lck_rw_lock_shared(&necp_kernel_policy_lock);
 
        necp_application_fillout_info_locked(application_uuid, real_application_uuid, account, domain, pid, uid, protocol, bound_interface_index, traffic_class, &local_addr, &remote_addr, proc, &info);
-       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, proc);
+       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_app_layer_map, &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, netagent_use_flags, NECP_MAX_NETAGENTS, required_agent_types, num_required_agent_types, proc, NULL);
        if (matched_policy) {
                returned_result->policy_id = matched_policy->id;
                returned_result->routing_result = matched_policy->result;
@@ -6460,7 +6607,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                mapping = necp_uuid_lookup_uuid_with_service_id_locked(netagent_id);
                if (mapping != NULL) {
                        uuid_copy(returned_result->netagents[netagent_cursor], mapping->uuid);
-                       returned_result->netagent_flags[netagent_cursor] = netagent_get_flags(mapping->uuid);
+                       returned_result->netagent_use_flags[netagent_cursor] = netagent_use_flags[netagent_cursor];
                }
        }
 
@@ -6470,6 +6617,14 @@ necp_application_find_policy_match_internal(proc_t proc,
                output_bound_interface = returned_result->routing_result_parameter.scoped_interface_index;
        } else if (returned_result->routing_result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
                output_bound_interface = returned_result->routing_result_parameter.tunnel_interface_index;
+       } else if (returned_result->routing_result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT) {
+               output_bound_interface = necp_get_primary_direct_interface_index();
+               if (output_bound_interface == IFSCOPE_NONE) {
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_DROP;
+               } else {
+                       returned_result->routing_result = NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED;
+                       returned_result->routing_result_parameter.scoped_interface_index = output_bound_interface;
+               }
        }
 
        if (local_addr.sa.sa_len == 0 ||
@@ -6508,6 +6663,13 @@ necp_application_find_policy_match_internal(proc_t proc,
                rt = rtalloc1_scoped((struct sockaddr *)&remote_addr, 0, 0,
                        output_bound_interface);
 
+               if (remote_addr.sa.sa_family == AF_INET && rt != NULL &&
+                   IS_INTF_CLAT46(rt->rt_ifp)) {
+                       rtfree(rt);
+                       rt = NULL;
+                       returned_result->routed_interface_index = 0;
+               }
+
                if (no_remote_addr && remote_family == 0 &&
                        (rt == NULL || rt->rt_ifp == NULL)) {
                        // Route lookup for default IPv4 failed, try IPv6
@@ -6674,6 +6836,10 @@ necp_application_find_policy_match_internal(proc_t proc,
                                if (necp_update_qos_marking(rt->rt_ifp, route_rule_id)) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING;
                                }
+
+                               if (IFNET_IS_LOW_POWER(rt->rt_ifp)) {
+                                       *flags |= NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER;
+                               }
                        }
                }
 
@@ -6697,7 +6863,7 @@ necp_application_find_policy_match_internal(proc_t proc,
                            returned_result->routed_interface_index);
 
                        if (v4Route != NULL) {
-                               if (v4Route->rt_ifp != NULL) {
+                               if (v4Route->rt_ifp != NULL && !IS_INTF_CLAT46(v4Route->rt_ifp)) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV4;
                                }
                                rtfree(v4Route);
@@ -6707,6 +6873,10 @@ necp_application_find_policy_match_internal(proc_t proc,
                        if (v6Route != NULL) {
                                if (v6Route->rt_ifp != NULL) {
                                        *flags |= NECP_CLIENT_RESULT_FLAG_HAS_IPV6;
+
+                                       if (ifnet_get_nat64prefix(v6Route->rt_ifp, NULL) == 0) {
+                                               *flags |= NECP_CLIENT_RESULT_FLAG_HAS_NAT64;
+                                       }
                                }
                                rtfree(v6Route);
                                v6Route = NULL;
@@ -6741,7 +6911,7 @@ necp_application_find_policy_match_internal(proc_t proc,
 }
 
 static bool
-necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, proc_t proc)
+necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_app_id app_id, necp_app_id real_app_id, errno_t cred_result, u_int32_t account_id, struct substring domain, u_int8_t domain_dot_count, pid_t pid, uid_t uid, u_int32_t bound_interface_index, u_int32_t traffic_class, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote, struct necp_client_parameter_netagent_type *required_agent_types, u_int32_t num_required_agent_types, proc_t proc)
 {
        if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
@@ -6914,6 +7084,24 @@ necp_socket_check_policy(struct necp_kernel_socket_policy *kernel_policy, necp_a
                }
        }
 
+       if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_AGENT_TYPE) {
+               bool matches_agent_type = FALSE;
+               for (u_int32_t i = 0; i < num_required_agent_types; i++) {
+                       struct necp_client_parameter_netagent_type *required_agent_type = &required_agent_types[i];
+                       if ((strlen(kernel_policy->cond_agent_type.agent_domain) == 0 ||
+                                strncmp(required_agent_type->netagent_domain, kernel_policy->cond_agent_type.agent_domain, NETAGENT_DOMAINSIZE) == 0) &&
+                               (strlen(kernel_policy->cond_agent_type.agent_type) == 0 ||
+                                strncmp(required_agent_type->netagent_type, kernel_policy->cond_agent_type.agent_type, NETAGENT_TYPESIZE) == 0)) {
+                                       // Found a required agent that matches
+                                       matches_agent_type = TRUE;
+                                       break;
+                               }
+               }
+               if (!matches_agent_type) {
+                       return (FALSE);
+               }
+       }
+
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_START) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_LOCAL_END) {
                        bool inRange = necp_is_addr_in_range((struct sockaddr *)local, (struct sockaddr *)&kernel_policy->cond_local_start, (struct sockaddr *)&kernel_policy->cond_local_end);
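In the agent-type block added above, an empty agent_domain or agent_type on the policy side acts as a wildcard, and a single required agent type that satisfies both fields is enough for the condition to match. A condensed restatement of that predicate as a standalone helper (the function itself is hypothetical; the field names and the NETAGENT_DOMAINSIZE/NETAGENT_TYPESIZE bounds are taken from the code above):

        /* Condensed restatement of the agent-type check: empty policy fields are
         * wildcards, and one matching required agent type is sufficient. */
        static bool
        agent_type_condition_matches(const struct necp_policy_condition_agent_type *cond,
            const struct necp_client_parameter_netagent_type *required, u_int32_t count)
        {
                for (u_int32_t i = 0; i < count; i++) {
                        bool domain_ok = (strlen(cond->agent_domain) == 0 ||
                            strncmp(required[i].netagent_domain, cond->agent_domain, NETAGENT_DOMAINSIZE) == 0);
                        bool type_ok = (strlen(cond->agent_type) == 0 ||
                            strncmp(required[i].netagent_type, cond->agent_type, NETAGENT_TYPESIZE) == 0);
                        if (domain_ok && type_ok) {
                                return (TRUE);
                        }
                }
                return (FALSE);
        }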
@@ -7097,7 +7285,12 @@ necp_socket_fillout_info_locked(struct inpcb *inp, struct sockaddr *override_loc
 }
 
 static inline struct necp_kernel_socket_policy *
-necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info, necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id, necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service, u_int32_t *return_netagent_array, size_t netagent_array_count, proc_t proc)
+necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy **policy_search_array, struct necp_socket_info *info,
+                                                                                          necp_kernel_policy_filter *return_filter, u_int32_t *return_route_rule_id,
+                                                                                          necp_kernel_policy_result *return_service_action, necp_kernel_policy_service *return_service,
+                                                                                          u_int32_t *return_netagent_array, u_int32_t *return_netagent_use_flags_array, size_t netagent_array_count,
+                                                                                          struct necp_client_parameter_netagent_type *required_agent_types,
+                                                                                          u_int32_t num_required_agent_types, proc_t proc, necp_kernel_policy_id *skip_policy_id)
 {
        struct necp_kernel_socket_policy *matched_policy = NULL;
        u_int32_t skip_order = 0;
@@ -7152,7 +7345,7 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                // Skip this policy
                                continue;
                        }
-                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, proc)) {
+                       if (necp_socket_check_policy(policy_search_array[i], info->application_id, info->real_application_id, info->cred_result, info->account_id, domain_substring, domain_dot_count, info->pid, info->uid, info->bound_interface_index, info->traffic_class, info->protocol, &info->local_addr, &info->remote_addr, required_agent_types, num_required_agent_types, proc)) {
                                if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SOCKET_FILTER) {
                                        if (return_filter && *return_filter == 0) {
                                                *return_filter = policy_search_array[i]->result_parameter.filter_control_unit;
@@ -7184,13 +7377,21 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                                }
                                        }
                                        continue;
-                               } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT) {
+                               } else if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT ||
+                                                  policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) {
                                        if (return_netagent_array != NULL &&
                                                netagent_cursor < netagent_array_count) {
                                                return_netagent_array[netagent_cursor] = policy_search_array[i]->result_parameter.netagent_id;
+                                               if (return_netagent_use_flags_array != NULL &&
+                                                       policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED) {
+                                                       return_netagent_use_flags_array[netagent_cursor] |= NECP_AGENT_USE_FLAG_SCOPE;
+                                               }
                                                netagent_cursor++;
                                                if (necp_debug > 1) {
-                                                       NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) Use Netagent %d", info->application_id, info->real_application_id, info->bound_interface_index, info->protocol, policy_search_array[i]->result_parameter.netagent_id);
+                                                       NECPLOG(LOG_DEBUG, "Socket Policy: (Application %d Real Application %d BoundInterface %d Proto %d) %s Netagent %d",
+                                                                       info->application_id, info->real_application_id, info->bound_interface_index, info->protocol,
+                                                                       policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_USE_NETAGENT ? "Use" : "Scope",
+                                                                       policy_search_array[i]->result_parameter.netagent_id);
                                                }
                                        }
                                        continue;
@@ -7200,6 +7401,9 @@ necp_socket_find_policy_match_with_info_locked(struct necp_kernel_socket_policy
                                if (policy_search_array[i]->result == NECP_KERNEL_POLICY_RESULT_SKIP) {
                                        skip_order = policy_search_array[i]->result_parameter.skip_policy_order;
                                        skip_session_order = policy_search_array[i]->session_order + 1;
+                                       if (skip_policy_id) {
+                                               *skip_policy_id = policy_search_array[i]->id;
+                                       }
                                        continue;
                                }
 
@@ -7324,6 +7528,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
                (!(inp->inp_flags2 & INP2_WANT_APP_POLICY) && necp_kernel_socket_policies_non_app_count == 0)) {
                if (necp_drop_all_order > 0) {
                        inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        inp->inp_policyresult.policy_gencount = 0;
                        inp->inp_policyresult.app_id = 0;
                        inp->inp_policyresult.flowhash = 0;
@@ -7342,6 +7547,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        if (necp_socket_bypass(override_local_addr, override_remote_addr, inp)) {
                // Mark socket as a pass
                inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+               inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                inp->inp_policyresult.policy_gencount = 0;
                inp->inp_policyresult.app_id = 0;
                inp->inp_policyresult.flowhash = 0;
@@ -7371,7 +7577,8 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        }
 
        // Match socket to policy
-       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc());
+       necp_kernel_policy_id skip_policy_id;
+       matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, &filter_control_unit, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), &skip_policy_id);
        // If the socket matched a scoped service policy, mark as Drop if not registered.
        // This covers the cases in which a service is required (on demand) but hasn't started yet.
        if ((service_action == NECP_KERNEL_POLICY_RESULT_TRIGGER_SCOPED ||
@@ -7389,6 +7596,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
                if (!service_is_registered) {
                        // Mark socket as a drop if service is not registered
                        inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                       inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                        inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
                        inp->inp_policyresult.flowhash = flowhash;
                        inp->inp_policyresult.results.filter_control_unit = 0;
@@ -7429,6 +7637,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 
                                        // Mark socket as a drop if required agent is not active
                                        inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+                                       inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                                        inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
                                        inp->inp_policyresult.flowhash = flowhash;
                                        inp->inp_policyresult.results.filter_control_unit = 0;
@@ -7449,6 +7658,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                inp->inp_policyresult.policy_id = matched_policy->id;
+               inp->inp_policyresult.skip_policy_id = skip_policy_id;
                inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
                inp->inp_policyresult.flowhash = flowhash;
                inp->inp_policyresult.results.filter_control_unit = filter_control_unit;
@@ -7476,6 +7686,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        } else if (necp_drop_all_order > 0) {
                // Mark socket as a drop if set
                inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+               inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
                inp->inp_policyresult.flowhash = flowhash;
                inp->inp_policyresult.results.filter_control_unit = 0;
@@ -7484,6 +7695,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
        } else {
                // Mark non-matching socket so we don't re-check it
                inp->inp_policyresult.policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
+               inp->inp_policyresult.skip_policy_id = NECP_KERNEL_POLICY_ID_NO_MATCH;
                inp->inp_policyresult.policy_gencount = necp_kernel_socket_policies_gencount;
                inp->inp_policyresult.flowhash = flowhash;
                inp->inp_policyresult.results.filter_control_unit = filter_control_unit; // We may have matched a filter, so mark it!
@@ -7498,7 +7710,7 @@ necp_socket_find_policy_match(struct inpcb *inp, struct sockaddr *override_local
 }
 
 static bool
-necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote)
+necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy, necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local, union necp_sockaddr_union *remote)
 {
        if (!(kernel_policy->condition_mask & NECP_KERNEL_CONDITION_ALL_INTERFACES)) {
                if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_BOUND_INTERFACE) {
@@ -7527,7 +7739,9 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy,
        }
 
        if (kernel_policy->condition_mask & NECP_KERNEL_CONDITION_POLICY_ID) {
-               if (socket_policy_id != kernel_policy->cond_policy_id) {
+               necp_kernel_policy_id matched_policy_id =
+                       kernel_policy->result == NECP_KERNEL_POLICY_RESULT_SKIP ? socket_skip_policy_id : socket_policy_id;
+               if (matched_policy_id != kernel_policy->cond_policy_id) {
                        // No match, does not match required id
                        return (FALSE);
                }
@@ -7609,7 +7823,7 @@ necp_ip_output_check_policy(struct necp_kernel_ip_output_policy *kernel_policy,
 }
 
 static inline struct necp_kernel_ip_output_policy *
-necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr)
+necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id, necp_kernel_policy_id socket_skip_policy_id, u_int32_t bound_interface_index, u_int32_t last_interface_index, u_int16_t protocol, union necp_sockaddr_union *local_addr, union necp_sockaddr_union *remote_addr)
 {
        u_int32_t skip_order = 0;
        u_int32_t skip_session_order = 0;
@@ -7640,7 +7854,7 @@ necp_ip_output_find_policy_match_locked(necp_kernel_policy_id socket_policy_id,
                                // Skip this policy
                                continue;
                        }
-                       if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) {
+                       if (necp_ip_output_check_policy(policy_search_array[i], socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, local_addr, remote_addr)) {
                                // Passed all tests, found a match
                                matched_policy = policy_search_array[i];
 
@@ -7679,6 +7893,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
        struct ip *ip = NULL;
        int hlen = sizeof(struct ip);
        necp_kernel_policy_id socket_policy_id = NECP_KERNEL_POLICY_ID_NONE;
+       necp_kernel_policy_id socket_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        struct necp_kernel_ip_output_policy *matched_policy = NULL;
        u_int16_t protocol = 0;
@@ -7700,6 +7915,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
        }
 
        socket_policy_id = necp_get_policy_id_from_packet(packet);
+       socket_skip_policy_id = necp_get_skip_policy_id_from_packet(packet);
 
        // Exit early for an empty list
        // Don't lock. Possible race condition, but we don't want the performance hit.
@@ -7782,7 +7998,7 @@ necp_ip_output_find_policy_match(struct mbuf *packet, int flags, struct ip_out_a
 
        // Match packet to policy
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
+       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                if (result) {
@@ -7815,6 +8031,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
        int next = -1;
        int offset = 0;
        necp_kernel_policy_id socket_policy_id = NECP_KERNEL_POLICY_ID_NONE;
+       necp_kernel_policy_id socket_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        necp_kernel_policy_id matched_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        struct necp_kernel_ip_output_policy *matched_policy = NULL;
        u_int16_t protocol = 0;
@@ -7836,6 +8053,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
        }
 
        socket_policy_id = necp_get_policy_id_from_packet(packet);
+       socket_skip_policy_id = necp_get_skip_policy_id_from_packet(packet);
 
        // Exit early for an empty list
        // Don't lock. Possible race condition, but we don't want the performance hit.
@@ -7915,7 +8133,7 @@ necp_ip6_output_find_policy_match(struct mbuf *packet, int flags, struct ip6_out
 
        // Match packet to policy
        lck_rw_lock_shared(&necp_kernel_policy_lock);
-       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
+       matched_policy = necp_ip_output_find_policy_match_locked(socket_policy_id, socket_skip_policy_id, bound_interface_index, last_interface_index, protocol, &local_addr, &remote_addr);
        if (matched_policy) {
                matched_policy_id = matched_policy->id;
                if (result) {
@@ -8266,6 +8484,22 @@ done:
        }
 }
 
+static bool
+necp_route_is_lqm_abort(struct ifnet *ifp, struct ifnet *delegated_ifp)
+{
+       if (ifp != NULL &&
+               (ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID) &&
+               ifp->if_interface_state.lqm_state == IFNET_LQM_THRESH_ABORT) {
+               return true;
+       }
+       if (delegated_ifp != NULL &&
+               (delegated_ifp->if_interface_state.valid_bitmask & IF_INTERFACE_STATE_LQM_STATE_VALID) &&
+               delegated_ifp->if_interface_state.lqm_state == IFNET_LQM_THRESH_ABORT) {
+               return true;
+       }
+       return false;
+}
+
 static bool
 necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t route_rule_id, u_int32_t *interface_type_denied)
 {
@@ -8296,65 +8530,104 @@ necp_route_is_allowed_inner(struct rtentry *route, struct ifnet *ifp, u_int32_t
                if (route_rule->exception_if_indices[exception_index] == 0) {
                        break;
                }
-               if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->exception_if_actions[exception_index]) == FALSE) {
-                       continue;
-               }
                if (route_rule->exception_if_indices[exception_index] == ifp->if_index ||
                        (delegated_ifp != NULL && route_rule->exception_if_indices[exception_index] == delegated_ifp->if_index)) {
-                       if (necp_debug > 1) {
-                               NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Allowed %d", route_rule->exception_if_indices[exception_index], route_rule_id, ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE));
+                       if (route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                               const bool lqm_abort = necp_route_is_lqm_abort(ifp, delegated_ifp);
+                               if (necp_debug > 1 && lqm_abort) {
+                                       NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Deny LQM Abort",
+                                                       route_rule->exception_if_indices[exception_index], route_rule_id);
+                               }
+                               return false;
+                       } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->exception_if_actions[exception_index])) {
+                               if (necp_debug > 1) {
+                                       NECPLOG(LOG_DEBUG, "Route Allowed: Interface match %d for Rule %d Allowed %d", route_rule->exception_if_indices[exception_index], route_rule_id, ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE));
+                               }
+                               return ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE);
                        }
-                       return ((route_rule->exception_if_actions[exception_index] == NECP_ROUTE_RULE_DENY_INTERFACE) ? FALSE : TRUE);
                }
        }
 
-       if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->cellular_action) &&
-               IFNET_IS_CELLULAR(ifp)) {
-               if (interface_type_denied != NULL) {
-                       *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR;
+       if (IFNET_IS_CELLULAR(ifp)) {
+               if (route_rule->cellular_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                       if (necp_route_is_lqm_abort(ifp, delegated_ifp)) {
+                               if (interface_type_denied != NULL) {
+                                       *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR;
+                               }
+                               // Mark aggregate action as deny
+                               type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE;
+                       }
+               } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->cellular_action)) {
+                       if (interface_type_denied != NULL) {
+                               *interface_type_denied = IFRTYPE_FUNCTIONAL_CELLULAR;
+                       }
+                       if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
+                               (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
+                                route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
+                                       // Deny wins if there is a conflict
+                                       type_aggregate_action = route_rule->cellular_action;
+                               }
                }
-               if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
-                       (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
-                        route_rule->cellular_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
-                               // Deny wins if there is a conflict
-                               type_aggregate_action = route_rule->cellular_action;
-                       }
        }
 
-       if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wifi_action) &&
-               IFNET_IS_WIFI(ifp)) {
-               if (interface_type_denied != NULL) {
-                       *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA;
+       if (IFNET_IS_WIFI(ifp)) {
+               if (route_rule->wifi_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                       if (necp_route_is_lqm_abort(ifp, delegated_ifp)) {
+                               if (interface_type_denied != NULL) {
+                                       *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA;
+                               }
+                               // Mark aggregate action as deny
+                               type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE;
+                       }
+               } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wifi_action)) {
+                       if (interface_type_denied != NULL) {
+                               *interface_type_denied = IFRTYPE_FUNCTIONAL_WIFI_INFRA;
+                       }
+                       if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
+                               (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
+                                route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
+                                       // Deny wins if there is a conflict
+                                       type_aggregate_action = route_rule->wifi_action;
+                               }
                }
-               if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
-                       (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
-                        route_rule->wifi_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
-                               // Deny wins if there is a conflict
-                               type_aggregate_action = route_rule->wifi_action;
-                       }
        }
 
-       if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wired_action) &&
-               IFNET_IS_WIRED(ifp)) {
-               if (interface_type_denied != NULL) {
-                       *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED;
+       if (IFNET_IS_WIRED(ifp)) {
+               if (route_rule->wired_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                       if (necp_route_is_lqm_abort(ifp, delegated_ifp)) {
+                               if (interface_type_denied != NULL) {
+                                       *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED;
+                               }
+                               // Mark aggregate action as deny
+                               type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE;
+                       }
+               } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->wired_action)) {
+                       if (interface_type_denied != NULL) {
+                               *interface_type_denied = IFRTYPE_FUNCTIONAL_WIRED;
+                       }
+                       if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
+                               (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
+                                route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
+                                       // Deny wins if there is a conflict
+                                       type_aggregate_action = route_rule->wired_action;
+                               }
                }
-               if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
-                       (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
-                        route_rule->wired_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
-                               // Deny wins if there is a conflict
-                               type_aggregate_action = route_rule->wired_action;
-                       }
        }
 
-       if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->expensive_action) &&
-               IFNET_IS_EXPENSIVE(ifp)) {
-               if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
-                       (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
-                        route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
-                               // Deny wins if there is a conflict
-                               type_aggregate_action = route_rule->expensive_action;
+       if (IFNET_IS_EXPENSIVE(ifp)) {
+               if (route_rule->expensive_action == NECP_ROUTE_RULE_DENY_LQM_ABORT) {
+                       if (necp_route_is_lqm_abort(ifp, delegated_ifp)) {
+                               // Mark aggregate action as deny
+                               type_aggregate_action = NECP_ROUTE_RULE_DENY_INTERFACE;
                        }
+               } else if (IS_NECP_ROUTE_RULE_ALLOW_OR_DENY(route_rule->expensive_action)) {
+                       if (type_aggregate_action == NECP_ROUTE_RULE_NONE ||
+                               (type_aggregate_action == NECP_ROUTE_RULE_ALLOW_INTERFACE &&
+                                route_rule->expensive_action == NECP_ROUTE_RULE_DENY_INTERFACE)) {
+                                       // Deny wins if there is a conflict
+                                       type_aggregate_action = route_rule->expensive_action;
+                               }
+               }
        }
 
        if (type_aggregate_action != NECP_ROUTE_RULE_NONE) {
@@ -8442,7 +8715,7 @@ necp_netagents_allow_traffic(u_int32_t *netagent_ids, size_t netagent_id_count)
 }
 
 static bool
-necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id)
+necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id)
 {
        u_int32_t verifyifindex = interface ? interface->if_index : 0;
        bool allowed_to_receive = TRUE;
@@ -8460,6 +8733,9 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
        if (return_policy_id) {
                *return_policy_id = NECP_KERNEL_POLICY_ID_NONE;
        }
+       if (return_skip_policy_id) {
+               *return_skip_policy_id = NECP_KERNEL_POLICY_ID_NONE;
+       }
        if (return_route_rule_id) {
                *return_route_rule_id = 0;
        }
@@ -8511,6 +8787,9 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                                        if (return_policy_id) {
                                                *return_policy_id = inp->inp_policyresult.policy_id;
                                        }
+                                       if (return_skip_policy_id) {
+                                               *return_skip_policy_id = inp->inp_policyresult.skip_policy_id;
+                                       }
                                        if (return_route_rule_id) {
                                                *return_route_rule_id = inp->inp_policyresult.results.route_rule_id;
                                        }
@@ -8552,7 +8831,7 @@ necp_socket_is_allowed_to_send_recv_internal(struct inpcb *inp, struct sockaddr
                goto done;
        }
 
-       struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NECP_MAX_NETAGENTS, current_proc());
+       struct necp_kernel_socket_policy *matched_policy = necp_socket_find_policy_match_with_info_locked(necp_kernel_socket_policies_map[NECP_SOCKET_MAP_APP_ID_TO_BUCKET(info.application_id)], &info, NULL, &route_rule_id, &service_action, &service, netagent_ids, NULL, NECP_MAX_NETAGENTS, NULL, 0, current_proc(), return_skip_policy_id);
        if (matched_policy != NULL) {
                if (matched_policy->result == NECP_KERNEL_POLICY_RESULT_DROP ||
                        matched_policy->result == NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT ||
@@ -8601,10 +8880,10 @@ done:
 }
 
 bool
-necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id)
+necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in_addr *local_addr, struct in_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id)
 {
-       struct sockaddr_in local;
-       struct sockaddr_in remote;
+       struct sockaddr_in local = {};
+       struct sockaddr_in remote = {};
        local.sin_family = remote.sin_family = AF_INET;
        local.sin_len = remote.sin_len = sizeof(struct sockaddr_in);
        local.sin_port = local_port;
@@ -8612,14 +8891,15 @@ necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port,
        memcpy(&local.sin_addr, local_addr, sizeof(local.sin_addr));
        memcpy(&remote.sin_addr, remote_addr, sizeof(remote.sin_addr));
 
-       return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, return_policy_id, return_route_rule_id));
+       return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface,
+                                                                                                                return_policy_id, return_route_rule_id, return_skip_policy_id));
 }
 
 bool
-necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id)
+necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port, u_int16_t remote_port, struct in6_addr *local_addr, struct in6_addr *remote_addr, ifnet_t interface, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id, necp_kernel_policy_id *return_skip_policy_id)
 {
-       struct sockaddr_in6 local;
-       struct sockaddr_in6 remote;
+       struct sockaddr_in6 local = {};
+       struct sockaddr_in6 remote = {};
        local.sin6_family = remote.sin6_family = AF_INET6;
        local.sin6_len = remote.sin6_len = sizeof(struct sockaddr_in6);
        local.sin6_port = local_port;
@@ -8627,17 +8907,20 @@ necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port,
        memcpy(&local.sin6_addr, local_addr, sizeof(local.sin6_addr));
        memcpy(&remote.sin6_addr, remote_addr, sizeof(remote.sin6_addr));
 
-       return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface, return_policy_id, return_route_rule_id));
+       return (necp_socket_is_allowed_to_send_recv_internal(inp, (struct sockaddr *)&local, (struct sockaddr *)&remote, interface,
+                                                                                                                return_policy_id, return_route_rule_id, return_skip_policy_id));
 }
 
 bool
-necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id)
+necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
+                                                                       necp_kernel_policy_id *return_skip_policy_id)
 {
-       return (necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id));
+       return (necp_socket_is_allowed_to_send_recv_internal(inp, NULL, NULL, NULL, return_policy_id, return_route_rule_id, return_skip_policy_id));
 }
 
 int
-necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id)
+necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id, u_int32_t route_rule_id,
+                                                        necp_kernel_policy_id skip_policy_id)
 {
        if (packet == NULL || inp == NULL || !(packet->m_flags & M_PKTHDR)) {
                return (EINVAL);
@@ -8660,6 +8943,10 @@ necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel
        }
        packet->m_pkthdr.necp_mtag.necp_app_id = inp->inp_policyresult.app_id;
 
+       if (skip_policy_id != NECP_KERNEL_POLICY_ID_NONE) {
+               packet->m_pkthdr.necp_mtag.necp_skip_policy_id = skip_policy_id;
+       }
+
        return (0);
 }
 
@@ -8721,6 +9008,16 @@ necp_get_policy_id_from_packet(struct mbuf *packet)
        return (packet->m_pkthdr.necp_mtag.necp_policy_id);
 }
 
+necp_kernel_policy_id
+necp_get_skip_policy_id_from_packet(struct mbuf *packet)
+{
+       if (packet == NULL || !(packet->m_flags & M_PKTHDR)) {
+               return (NECP_KERNEL_POLICY_ID_NONE);
+       }
+
+       return (packet->m_pkthdr.necp_mtag.necp_skip_policy_id);
+}
+
 u_int32_t
 necp_get_last_interface_index_from_packet(struct mbuf *packet)
 {
@@ -8817,7 +9114,8 @@ necp_socket_should_rescope(struct inpcb *inp)
                return (FALSE);
        }
 
-       return (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED);
+       return (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED ||
+                       inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT);
 }
 
 u_int
@@ -8829,6 +9127,8 @@ necp_socket_get_rescope_if_index(struct inpcb *inp)
 
        if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED) {
                return (inp->inp_policyresult.results.result_parameter.scoped_interface_index);
+       } else if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT) {
+               return (necp_get_primary_direct_interface_index());
        }
 
        return (0);
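
For context, a minimal caller sketch follows (hypothetical; not part of this commit, and example_necp_mark_tcp_packet is an invented name). It shows how the skip-policy plumbing added above fits together: the SKIP policy id reported by the send/recv check is stamped onto the outgoing mbuf next to the matched policy id, so that an IP-output policy with a SKIP result can later be matched against the socket-level skip policy id (read back via necp_get_skip_policy_id_from_packet()) rather than the final matched id.

/*
 * Hypothetical caller, assuming only the signatures introduced above.
 */
static int
example_necp_mark_tcp_packet(struct inpcb *inp, struct mbuf *packet)
{
	necp_kernel_policy_id policy_id = NECP_KERNEL_POLICY_ID_NONE;
	necp_kernel_policy_id skip_policy_id = NECP_KERNEL_POLICY_ID_NONE;
	u_int32_t route_rule_id = 0;

	/* The policy check now also reports the matched SKIP policy, if any. */
	if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id,
	        &route_rule_id, &skip_policy_id)) {
		return (EHOSTUNREACH);
	}

	/* Both ids travel in the packet's NECP mbuf tag; the skip id is read
	 * back by necp_get_skip_policy_id_from_packet() at IP-output time. */
	return (necp_mark_packet_from_socket(packet, inp, policy_id,
	        route_rule_id, skip_policy_id));
}
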
index d4f42f38631cedb4c2340a6869b473954c118028..8eb159c17762e6ad41d360f8d623ed482acb80f5 100644 (file)
@@ -135,6 +135,7 @@ struct necp_packet_header {
 #define        NECP_POLICY_CONDITION_REMOTE_ADDR               13      // necp_policy_condition_addr
 #define        NECP_POLICY_CONDITION_LOCAL_ADDR_RANGE  14      // necp_policy_condition_addr_range
 #define        NECP_POLICY_CONDITION_REMOTE_ADDR_RANGE 15      // necp_policy_condition_addr_range
+#define        NECP_POLICY_CONDITION_AGENT_TYPE                16      // struct necp_policy_condition_agent_type
 
 /*
  * Results
@@ -153,8 +154,10 @@ struct necp_packet_header {
 #define        NECP_POLICY_RESULT_SOCKET_SCOPED                12      // String, interface name
 #define        NECP_POLICY_RESULT_ROUTE_RULES                  13      // N/A, must have route rules defined
 #define        NECP_POLICY_RESULT_USE_NETAGENT                 14      // netagent uuid_t
+#define        NECP_POLICY_RESULT_NETAGENT_SCOPED              15      // netagent uuid_t
+#define        NECP_POLICY_RESULT_SCOPED_DIRECT                16      // N/A, scopes to primary physical interface
 
-#define        NECP_POLICY_RESULT_MAX                                  NECP_POLICY_RESULT_USE_NETAGENT
+#define        NECP_POLICY_RESULT_MAX                                  NECP_POLICY_RESULT_SCOPED_DIRECT
 
 /*
  * Route Rules
@@ -163,7 +166,8 @@ struct necp_packet_header {
 #define        NECP_ROUTE_RULE_NONE                                    0       // N/A
 #define        NECP_ROUTE_RULE_DENY_INTERFACE                  1       // String, or empty to match all
 #define        NECP_ROUTE_RULE_ALLOW_INTERFACE                 2       // String, or empty to match all
-#define        NECP_ROUTE_RULE_QOS_MARKING             3       // String, or empty to match all
+#define        NECP_ROUTE_RULE_QOS_MARKING                             3       // String, or empty to match all
+#define        NECP_ROUTE_RULE_DENY_LQM_ABORT                  4       // String, or empty to match all
 
 #define        NECP_ROUTE_RULE_FLAG_CELLULAR                   0x01
 #define        NECP_ROUTE_RULE_FLAG_WIFI                               0x02
@@ -212,6 +216,11 @@ struct necp_policy_condition_addr_range {
        } end_address;
 } __attribute__((__packed__));
 
+struct necp_policy_condition_agent_type {
+       char agent_domain[32];
+       char agent_type[32];
+} __attribute__((__packed__));
+
 #define        NECP_SESSION_PRIORITY_UNKNOWN                   0
 #define        NECP_SESSION_PRIORITY_CONTROL                   1
 #define        NECP_SESSION_PRIORITY_PRIVILEGED_TUNNEL 2
@@ -238,6 +247,8 @@ typedef union {
 #define        NECP_SERVICE_FLAGS_REGISTERED                   0x01
 #define        NECP_MAX_NETAGENTS                                              8
 
+#define NECP_AGENT_USE_FLAG_SCOPE                              0x01
+
 #define NECP_TFO_COOKIE_LEN_MAX      16
 struct necp_aggregate_result {
        necp_kernel_policy_result                       routing_result;
@@ -250,7 +261,7 @@ struct necp_aggregate_result {
        u_int                                                           routed_interface_index;
        u_int32_t                                                       policy_id;
        uuid_t                                                          netagents[NECP_MAX_NETAGENTS];
-       u_int32_t                                                       netagent_flags[NECP_MAX_NETAGENTS];
+       u_int32_t                                                       netagent_use_flags[NECP_MAX_NETAGENTS];
        u_int8_t                                                        mss_recommended;
 };
 
@@ -438,6 +449,9 @@ typedef struct necp_cache_buffer {
 #define        NECP_CLIENT_ACTION_UPDATE_CACHE                                 14 // Update heuristics and cache
 #define        NECP_CLIENT_ACTION_COPY_CLIENT_UPDATE                   15 // Fetch an updated client for push-mode observer. Output: Client id, struct necp_client_observer_update in buffer
 #define        NECP_CLIENT_ACTION_COPY_UPDATED_RESULT                  16 // Copy client result only if changed. Input: client_id; Output: result in buffer
+#define        NECP_CLIENT_ACTION_ADD_FLOW                                             17 // Add a flow. Input: client_id; Output: struct necp_client_add_flow
+#define        NECP_CLIENT_ACTION_REMOVE_FLOW                                  18 // Remove a flow. Input: flow_id, optional struct ifnet_stats_per_flow
+
 
 #define        NECP_CLIENT_PARAMETER_APPLICATION                               NECP_POLICY_CONDITION_APPLICATION               // Requires entitlement
 #define        NECP_CLIENT_PARAMETER_REAL_APPLICATION                  NECP_POLICY_CONDITION_REAL_APPLICATION  // Requires entitlement
@@ -463,10 +477,14 @@ typedef struct necp_cache_buffer {
 #define        NECP_CLIENT_PARAMETER_REQUIRE_AGENT                             112             // uuid_t, network agent UUID
 #define        NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE                113             // struct necp_client_parameter_netagent_type
 
-// "Prefer" will choose an interface with that property, or best otherwise if not found
+// "Prefer" will choose an interface with an agent, or best otherwise if not found
 #define        NECP_CLIENT_PARAMETER_PREFER_AGENT                              122             // uuid_t, network agent UUID
 #define        NECP_CLIENT_PARAMETER_PREFER_AGENT_TYPE                 123             // struct necp_client_parameter_netagent_type
 
+// "Avoid" will choose an interface without an agent, or best otherwise if unavoidable
+#define        NECP_CLIENT_PARAMETER_AVOID_AGENT                               124             // uuid_t, network agent UUID
+#define        NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE                  125             // struct necp_client_parameter_netagent_type
+
 // Use actions with NECP_CLIENT_ACTION_AGENT
 #define        NECP_CLIENT_PARAMETER_TRIGGER_AGENT                             130             // uuid_t, network agent UUID
 #define        NECP_CLIENT_PARAMETER_ASSERT_AGENT                              131             // uuid_t, network agent UUID
@@ -486,6 +504,8 @@ typedef struct necp_cache_buffer {
 #define        NECP_CLIENT_PARAMETER_FLAG_ECN_ENABLE                   0x0020  // Client is requesting to enable ECN
 #define        NECP_CLIENT_PARAMETER_FLAG_ECN_DISABLE                  0x0040  // Client is requesting to disable ECN
 #define        NECP_CLIENT_PARAMETER_FLAG_TFO_ENABLE                   0x0080  // Client is requesting to enable TFO
+#define        NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE 0x0100    // Interpret NECP_CLIENT_PARAMETER_REQUIRE_IF_TYPE only for primary
+                                                                                                                                       // interface, and allow exceptions for multipath or listeners
 
 #define        NECP_CLIENT_RESULT_CLIENT_ID                                    1               // uuid_t
 #define        NECP_CLIENT_RESULT_POLICY_RESULT                                2               // u_int32_t
@@ -495,19 +515,21 @@ typedef struct necp_cache_buffer {
 #define        NECP_CLIENT_RESULT_NETAGENT                                             6               // struct necp_client_result_netagent
 #define        NECP_CLIENT_RESULT_FLAGS                                                7               // u_int32_t, see NECP_CLIENT_RESULT_FLAG_* values
 #define        NECP_CLIENT_RESULT_INTERFACE                                    8               // struct necp_client_result_interface
-#define        NECP_CLIENT_RESULT_MULTIPATH_INTERFACE                  9               // struct necp_client_result_interface
+#define        NECP_CLIENT_RESULT_INTERFACE_OPTION                             9               // struct necp_client_interface_option
 #define        NECP_CLIENT_RESULT_EFFECTIVE_MTU                                10              // u_int32_t
 #define        NECP_CLIENT_RESULT_FLOW                                                 11              // TLV array of a single flow's state
 #define        NECP_CLIENT_RESULT_PROTO_CTL_EVENT                              12
 #define        NECP_CLIENT_RESULT_TFO_COOKIE                                   13              // NECP_TFO_COOKIE_LEN_MAX
 #define        NECP_CLIENT_RESULT_TFO_FLAGS                                    14              // u_int8_t
 #define        NECP_CLIENT_RESULT_RECOMMENDED_MSS                              15              // u_int8_t
+#define        NECP_CLIENT_RESULT_FLOW_ID                                              16              // uuid_t
 #define        NECP_CLIENT_RESULT_INTERFACE_TIME_DELTA                 17              // u_int32_t, seconds since interface up/down
 
 #define        NECP_CLIENT_RESULT_NEXUS_INSTANCE                               100             // uuid_t
 #define        NECP_CLIENT_RESULT_NEXUS_PORT                                   101             // u_int16_t
 #define        NECP_CLIENT_RESULT_NEXUS_KEY                                    102             // uuid_t
 #define        NECP_CLIENT_RESULT_NEXUS_PORT_FLOW_INDEX                103             // u_int32_t
+#define        NECP_CLIENT_RESULT_NEXUS_FLOW_STATS                             104             // struct sk_stats_flow *
 
 #define        NECP_CLIENT_RESULT_LOCAL_ENDPOINT                               200             // struct necp_client_endpoint
 #define        NECP_CLIENT_RESULT_REMOTE_ENDPOINT                              201             // struct necp_client_endpoint
@@ -528,6 +550,10 @@ typedef struct necp_cache_buffer {
 #define        NECP_CLIENT_RESULT_FLAG_FAST_OPEN_BLOCKED               0x0400  // Fast open should not be used
 #define        NECP_CLIENT_RESULT_FLAG_LINK_QUALITY_ABORT              0x0800  // Link quality is very bad, recommend close connections
 #define        NECP_CLIENT_RESULT_FLAG_ALLOW_QOS_MARKING               0x1000  // QoS marking is allowed
+#define        NECP_CLIENT_RESULT_FLAG_HAS_NAT64                       0x2000  // Has NAT64 prefix
+#define        NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER             0x4000  // Interface is in low-power mode
+
+#define NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6 | NECP_CLIENT_RESULT_FLAG_HAS_NAT64 | NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER)
 
 #define        NECP_CLIENT_RESULT_FAST_OPEN_SND_PROBE                  0x01    // DEPRECATED - Fast open send probe
 #define        NECP_CLIENT_RESULT_FAST_OPEN_RCV_PROBE                  0x02    // DEPRECATED - Fast open receive probe
@@ -556,6 +582,8 @@ struct necp_interface_details {
 #define        NECP_INTERFACE_FLAG_EXPENSIVE                                   0x0001
 #define        NECP_INTERFACE_FLAG_TXSTART                                     0X0002
 #define        NECP_INTERFACE_FLAG_NOACKPRI                                    0x0004
+#define        NECP_INTERFACE_FLAG_3CARRIERAGG                                 0x0008
+#define        NECP_INTERFACE_FLAG_IS_LOW_POWER                                0x0010
 
 struct necp_client_parameter_netagent_type {
        char netagent_domain[32];
@@ -572,6 +600,12 @@ struct necp_client_result_interface {
        u_int32_t index;
 };
 
+struct necp_client_interface_option {
+       u_int32_t interface_index;
+       u_int32_t interface_generation;
+       uuid_t nexus_agent;
+};
+
 struct necp_client_endpoint {
        union {
                struct sockaddr sa;
@@ -596,6 +630,24 @@ struct kev_necp_policies_changed_data {
        u_int32_t               changed_count;  // Defaults to 0.
 };
 
+#define        NECP_CLIENT_FLOW_FLAGS_ALLOW_NEXUS                      0x01    // Request a nexus instance upon adding a flow
+#define        NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID            0x02    // Register the client ID rather than the flow registration ID with network agents
+
+struct necp_client_flow_stats {
+       u_int32_t stats_type; // NECP_CLIENT_STATISTICS_TYPE_*
+       u_int32_t stats_version; // NECP_CLIENT_STATISTICS_TYPE_*_VER
+       u_int32_t stats_size;
+       mach_vm_address_t stats_addr;
+};
+
+struct necp_client_add_flow {
+       uuid_t agent_uuid;
+       uuid_t registration_id;
+       u_int16_t flags; // NECP_CLIENT_FLOW_FLAGS_*
+       u_int16_t stats_request_count;
+       struct necp_client_flow_stats stats_requests[0];
+} __attribute__((__packed__));
+
 struct necp_agent_use_parameters {
        uuid_t agent_uuid;
        uint64_t out_use_count;
@@ -622,7 +674,6 @@ struct necp_client_observer_update {
 #include <sys/socketvar.h>
 #include <sys/kern_control.h>
 #include <netinet/ip_var.h>
-#include <netinet/mp_pcb.h>
 #include <netinet6/ip6_var.h>
 #include <net/if_var.h>
 #include <sys/syslog.h>
@@ -739,6 +790,8 @@ typedef u_int32_t necp_app_id;
 #define        NECP_KERNEL_POLICY_RESULT_SOCKET_SCOPED                 NECP_POLICY_RESULT_SOCKET_SCOPED
 #define        NECP_KERNEL_POLICY_RESULT_ROUTE_RULES                   NECP_POLICY_RESULT_ROUTE_RULES
 #define        NECP_KERNEL_POLICY_RESULT_USE_NETAGENT                  NECP_POLICY_RESULT_USE_NETAGENT
+#define        NECP_KERNEL_POLICY_RESULT_NETAGENT_SCOPED               NECP_POLICY_RESULT_NETAGENT_SCOPED
+#define        NECP_KERNEL_POLICY_RESULT_SCOPED_DIRECT                 NECP_POLICY_RESULT_SCOPED_DIRECT
 
 typedef struct {
        u_int32_t identifier;
@@ -764,7 +817,6 @@ enum necp_boolean_state {
 
 struct necp_kernel_socket_policy {
        LIST_ENTRY(necp_kernel_socket_policy)   chain;
-       necp_policy_id                          parent_policy_id;
        necp_kernel_policy_id           id;
        necp_policy_order                       order;
        u_int32_t                                       session_order;
@@ -791,6 +843,7 @@ struct necp_kernel_socket_policy {
        union necp_sockaddr_union       cond_remote_start;                              // Matches remote IP address (or start)
        union necp_sockaddr_union       cond_remote_end;                                // Matches IP address range
        u_int8_t                                        cond_remote_prefix;                             // Defines subnet
+       struct necp_policy_condition_agent_type cond_agent_type;
 
        necp_kernel_policy_result       result;
        necp_kernel_policy_result_parameter     result_parameter;
@@ -798,7 +851,6 @@ struct necp_kernel_socket_policy {
 
 struct necp_kernel_ip_output_policy {
        LIST_ENTRY(necp_kernel_ip_output_policy)        chain;
-       necp_policy_id                          parent_policy_id;
        necp_kernel_policy_id           id;
        necp_policy_order                       suborder;
        necp_policy_order                       order;
@@ -829,7 +881,7 @@ struct necp_session_policy {
        bool                            applied;                        // Applied into the kernel table
        bool                            pending_deletion;       // Waiting to be removed from kernel table
        bool                            pending_update;         // Policy has been modified since creation/last application
-       necp_policy_id          id;
+       necp_policy_id          local_id;
        necp_policy_order       order;
        u_int8_t                        *result;
        u_int32_t                       result_size;
@@ -861,6 +913,7 @@ struct necp_aggregate_socket_result {
 struct necp_inpcb_result {
        u_int32_t                                                       app_id;
        necp_kernel_policy_id                           policy_id;
+       necp_kernel_policy_id                           skip_policy_id;
        int32_t                                                         policy_gencount;
        u_int32_t                                                       flowhash;
        struct necp_aggregate_socket_result     results;
@@ -872,7 +925,6 @@ extern errno_t necp_set_socket_attributes(struct socket *so, struct sockopt *sop
 extern errno_t necp_get_socket_attributes(struct socket *so, struct sockopt *sopt);
 extern void necp_inpcb_remove_cb(struct inpcb *inp);
 extern void necp_inpcb_dispose(struct inpcb *inp);
-extern void necp_mppcb_dispose(struct mppcb *mpp);
 
 extern u_int32_t necp_socket_get_content_filter_control_unit(struct socket *so);
 
@@ -884,19 +936,23 @@ extern u_int necp_socket_get_rescope_if_index(struct inpcb *inp);
 extern u_int32_t necp_socket_get_effective_mtu(struct inpcb *inp, u_int32_t current_mtu);
 
 extern bool necp_socket_is_allowed_to_send_recv(struct inpcb *inp, necp_kernel_policy_id *return_policy_id,
-                                                                                               u_int32_t *return_route_rule_id);
+                                                                                               u_int32_t *return_route_rule_id,
+                                                                                               necp_kernel_policy_id *return_skip_policy_id);
 extern bool necp_socket_is_allowed_to_send_recv_v4(struct inpcb *inp, u_int16_t local_port,
                                                                                                   u_int16_t remote_port, struct in_addr *local_addr,
                                                                                                   struct in_addr *remote_addr, ifnet_t interface,
-                                                                                                  necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id);
+                                                                                                  necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
+                                                                                                  necp_kernel_policy_id *return_skip_policy_id);
 extern bool necp_socket_is_allowed_to_send_recv_v6(struct inpcb *inp, u_int16_t local_port,
                                                                                                   u_int16_t remote_port, struct in6_addr *local_addr,
                                                                                                   struct in6_addr *remote_addr, ifnet_t interface,
-                                                                                                  necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id);
+                                                                                                  necp_kernel_policy_id *return_policy_id, u_int32_t *return_route_rule_id,
+                                                                                                  necp_kernel_policy_id *return_skip_policy_id);
 extern void necp_socket_update_qos_marking(struct inpcb *inp, struct rtentry *route, struct ifnet *interface, u_int32_t route_rule_id);
 extern int necp_mark_packet_from_socket(struct mbuf *packet, struct inpcb *inp, necp_kernel_policy_id policy_id,
-                                                                               u_int32_t route_rule_id);
+                                                                               u_int32_t route_rule_id, necp_kernel_policy_id skip_policy_id);
 extern necp_kernel_policy_id necp_get_policy_id_from_packet(struct mbuf *packet);
+extern necp_kernel_policy_id necp_get_skip_policy_id_from_packet(struct mbuf *packet);
 extern u_int32_t necp_get_last_interface_index_from_packet(struct mbuf *packet);
 extern u_int32_t necp_get_route_rule_id_from_packet(struct mbuf *packet);
 extern int necp_get_app_uuid_from_packet(struct mbuf *packet,
@@ -924,9 +980,7 @@ extern bool necp_get_is_keepalive_from_packet(struct mbuf *packet);
 
 extern void necp_update_all_clients(void); // Handle general re-evaluate event
 
-extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid); // Cause a single client to get an update event
-
-extern void necp_client_early_close(uuid_t client_id); // Cause a single client to close stats, etc
+extern void necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation); // Cause a single client to get an update event
 
 extern void necp_set_client_as_background(proc_t proc, struct fileproc *fp, bool background); // Set all clients for an fp as background or not
 
@@ -936,7 +990,7 @@ extern void necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd); // Set
 
 extern int necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp);
 
-extern int necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp);
+extern int necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert);
 
 extern int necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp);
 
@@ -956,7 +1010,7 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id,
 #define        NECP_FLOWADV_IDX_INVALID        UINT32_MAX
 extern void *necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length,
                                                                                          struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint,
-                                                                                         u_int32_t flow_adv_index, size_t *message_length);
+                                                                                         u_int32_t flow_adv_index, void *flow_stats, size_t *message_length);
 
 struct necp_client_nexus_parameters {
        pid_t pid;
@@ -971,41 +1025,21 @@ struct necp_client_nexus_parameters {
        unsigned allow_qos_marking:1;
 };
 
-extern int necp_client_copy_parameters(uuid_t client_uuid, struct necp_client_nexus_parameters *parameters);
-
 #define        NECP_CLIENT_CBACTION_NONVIABLE  1
 #define        NECP_CLIENT_CBACTION_VIABLE     2
 #define        NECP_CLIENT_CBACTION_INITIAL    3
 
-struct necp_client_flow {
-       LIST_ENTRY(necp_client_flow) flow_chain;
-       unsigned invalid : 1;
-       unsigned nexus : 1; // If true, flow is a nexus; if false, flow is attached to socket
-       unsigned socket : 1;
-       unsigned viable : 1;
-       unsigned requested_nexus : 1;
-       unsigned assigned : 1;
-       unsigned has_protoctl_event : 1;
-       unsigned check_tcp_heuristics : 1;
-       union {
-               uuid_t nexus_agent;
-               struct {
-                       void *socket_handle;
-                       void (*cb)(void *handle, int action, struct necp_client_flow *flow);
-               };
-       } u;
-       uint32_t interface_index;
-       uint16_t interface_flags;
-       uint32_t necp_flow_flags;
-       struct necp_client_flow_protoctl_event protoctl_event;
-       union necp_sockaddr_union local_addr;
-       union necp_sockaddr_union remote_addr;
+struct necp_client_add_flow_default {
+       uuid_t agent_uuid;
+       uuid_t registration_id;
+       u_int16_t flags; // NECP_CLIENT_FLOW_FLAGS_*
+       u_int16_t stats_request_count;
+       struct necp_client_flow_stats stats_requests[1];
+} __attribute__((__packed__));
 
-       size_t assigned_results_length;
-       u_int8_t *assigned_results;
-};
+typedef void (*necp_client_flow_cb)(void *handle, int action, uint32_t interface_index, uint32_t necp_flags, bool *viable);
 
-extern void necp_client_reap_caches(boolean_t);
+extern void necp_client_reap_caches(boolean_t purge);
 
 
 #endif /* BSD_KERNEL_PRIVATE */
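
A minimal sketch, assuming only the struct layout above (the helper name example_fill_agent_type_condition is illustrative, not part of this header): how a NECP_POLICY_CONDITION_AGENT_TYPE payload might be populated, with strlcpy bounds taken directly from the fixed 32-byte fields.

/* Illustrative only; relies solely on struct necp_policy_condition_agent_type. */
static void
example_fill_agent_type_condition(struct necp_policy_condition_agent_type *cond,
    const char *domain, const char *type)
{
	memset(cond, 0, sizeof(*cond)); /* zero-fill so unused bytes are deterministic */
	strlcpy(cond->agent_domain, domain, sizeof(cond->agent_domain));
	strlcpy(cond->agent_type, type, sizeof(cond->agent_type));
}
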
index 41e6efaa84524fbbe6154257d1a38f4c13065832..814f2f0be88b4348b6d5e94beeb1270ed70de168 100644 (file)
 
 extern u_int32_t necp_debug;
 
-// proc_best_name() is declared here in advance of it landing in a header file.
-// See comment in kern_proc.c
-extern char *proc_best_name(proc_t p);
-
 static int noop_read(struct fileproc *, struct uio *, int, vfs_context_t);
 static int noop_write(struct fileproc *, struct uio *, int, vfs_context_t);
 static int noop_ioctl(struct fileproc *, unsigned long, caddr_t,
@@ -192,15 +188,17 @@ extern unsigned int get_maxmtu(struct rtentry *);
 #define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT                    0x00040
 #define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT          0x00080
 #define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT           0x00100
-#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE       0x00200
-#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE     0x00400
-#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE      0x00800
-#define NECP_PARSED_PARAMETERS_FIELD_FLAGS                                     0x01000
-#define NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL                       0x02000
-#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_PID                     0x04000
-#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID                    0x08000
-#define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS                     0x10000
-#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT                                0x20000
+#define NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT                     0x00200
+#define NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE       0x00400
+#define NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE     0x00800
+#define NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE      0x01000
+#define NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE                0x02000
+#define NECP_PARSED_PARAMETERS_FIELD_FLAGS                                     0x04000
+#define NECP_PARSED_PARAMETERS_FIELD_IP_PROTOCOL                       0x08000
+#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_PID                     0x10000
+#define NECP_PARSED_PARAMETERS_FIELD_EFFECTIVE_UUID                    0x20000
+#define NECP_PARSED_PARAMETERS_FIELD_TRAFFIC_CLASS                     0x40000
+#define NECP_PARSED_PARAMETERS_FIELD_LOCAL_PORT                                0x80000
 
 #define NECP_MAX_PARSED_PARAMETERS 16
 struct necp_client_parsed_parameters {
@@ -215,9 +213,11 @@ struct necp_client_parsed_parameters {
        struct necp_client_parameter_netagent_type required_netagent_types[NECP_MAX_PARSED_PARAMETERS];
        struct necp_client_parameter_netagent_type prohibited_netagent_types[NECP_MAX_PARSED_PARAMETERS];
        struct necp_client_parameter_netagent_type preferred_netagent_types[NECP_MAX_PARSED_PARAMETERS];
+       struct necp_client_parameter_netagent_type avoided_netagent_types[NECP_MAX_PARSED_PARAMETERS];
        uuid_t required_netagents[NECP_MAX_PARSED_PARAMETERS];
        uuid_t prohibited_netagents[NECP_MAX_PARSED_PARAMETERS];
        uuid_t preferred_netagents[NECP_MAX_PARSED_PARAMETERS];
+       uuid_t avoided_netagents[NECP_MAX_PARSED_PARAMETERS];
        u_int16_t ip_protocol;
        pid_t effective_pid;
        uuid_t effective_uuid;
@@ -226,7 +226,7 @@ struct necp_client_parsed_parameters {
 
 static bool
 necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters,
-                                                                  u_int *return_ifindex);
+                                                                  u_int *return_ifindex, bool *validate_agents);
 
 static bool
 necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa);
@@ -234,7 +234,8 @@ necp_ifnet_matches_local_address(struct ifnet *ifp, struct sockaddr *sa);
 static bool
 necp_ifnet_matches_parameters(struct ifnet *ifp,
                                                          struct necp_client_parsed_parameters *parsed_parameters,
-                                                         u_int32_t *preferred_count, bool ignore_require_if);
+                                                         u_int32_t *preferred_count,
+                                                         bool secondary_interface);
 
 static const struct fileops necp_fd_ops = {
        .fo_type = DTYPE_NETPOLICY,
@@ -254,6 +255,8 @@ struct necp_client_assertion {
 
 struct necp_client_flow_header {
        struct necp_tlv_header outer_header;
+       struct necp_tlv_header flow_id_tlv_header;
+       uuid_t flow_id;
        struct necp_tlv_header flags_tlv_header;
        u_int32_t flags_value;
        struct necp_tlv_header interface_tlv_header;
@@ -274,10 +277,64 @@ struct necp_client_nexus_flow_header {
 } __attribute__((__packed__));
 
 
+struct necp_client_flow {
+       LIST_ENTRY(necp_client_flow) flow_chain;
+       unsigned invalid : 1;
+       unsigned nexus : 1; // If true, flow is bound to a nexus agent; socket and plain interface flows leave this clear
+       unsigned socket : 1;
+       unsigned viable : 1;
+       unsigned assigned : 1;
+       unsigned has_protoctl_event : 1;
+       unsigned check_tcp_heuristics : 1;
+       unsigned _reserved : 1;
+       union {
+               uuid_t nexus_agent;
+               struct {
+                       void *socket_handle;
+                       necp_client_flow_cb cb;
+               };
+       } u;
+       uint32_t interface_index;
+       uint16_t interface_flags;
+       uint32_t necp_flow_flags;
+       struct necp_client_flow_protoctl_event protoctl_event;
+       union necp_sockaddr_union local_addr;
+       union necp_sockaddr_union remote_addr;
+
+       size_t assigned_results_length;
+       u_int8_t *assigned_results;
+};
+
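A minimal illustrative sketch, not part of this change: the nexus and socket bits act as the discriminant for the union u above, so u.nexus_agent is only meaningful for nexus flows, while socket flows and plain interface flows use u.socket_handle and u.cb; the helper name is hypothetical.

	// Hypothetical: classify a flow by its discriminant bits before touching the union.
	static const char *
	necp_flow_kind(const struct necp_client_flow *flow)
	{
		if (flow->nexus) {
			return "nexus";      // u.nexus_agent holds the nexus agent UUID
		}
		if (flow->socket) {
			return "socket";     // u.socket_handle is the owning inpcb, u.cb its callback
		}
		return "interface";          // handle/cb are copied from the flow registration
	}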
+struct necp_client_flow_registration {
+       RB_ENTRY(necp_client_flow_registration) fd_link;
+       RB_ENTRY(necp_client_flow_registration) global_link;
+       RB_ENTRY(necp_client_flow_registration) client_link;
+       LIST_ENTRY(necp_client_flow_registration) collect_stats_chain;
+       uuid_t registration_id;
+       u_int32_t flags;
+       unsigned flow_result_read : 1;
+       unsigned defunct : 1;
+       void *interface_handle;
+       necp_client_flow_cb interface_cb;
+       struct necp_client *client;
+       LIST_HEAD(_necp_registration_flow_list, necp_client_flow) flow_list;
+       u_int64_t last_interface_details __attribute__((aligned(sizeof(u_int64_t))));
+};
+
+static int necp_client_flow_id_cmp(struct necp_client_flow_registration *flow0, struct necp_client_flow_registration *flow1);
+
+RB_HEAD(_necp_client_flow_tree, necp_client_flow_registration);
+RB_PROTOTYPE_PREV(_necp_client_flow_tree, necp_client_flow_registration, client_link, necp_client_flow_id_cmp);
+RB_GENERATE_PREV(_necp_client_flow_tree, necp_client_flow_registration, client_link, necp_client_flow_id_cmp);
+
+#define NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT 4
+#define NECP_CLIENT_MAX_INTERFACE_OPTIONS 16
+
+#define NECP_CLIENT_INTERFACE_OPTION_EXTRA_COUNT (NECP_CLIENT_MAX_INTERFACE_OPTIONS - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT)
+
 struct necp_client {
        RB_ENTRY(necp_client) link;
        RB_ENTRY(necp_client) global_link;
-       LIST_ENTRY(necp_client) collect_stats_chain;
 
        decl_lck_mtx_data(, lock);
        decl_lck_mtx_data(, route_lock);
@@ -285,10 +342,9 @@ struct necp_client {
 
        uuid_t client_id;
        unsigned result_read : 1;
-       unsigned flow_result_read : 1;
        unsigned allow_multiple_flows : 1;
+       unsigned legacy_client_is_flow : 1;
 
-       unsigned defunct : 1;
        unsigned background : 1;
        unsigned background_update : 1;
        unsigned platform_binary : 1;
@@ -301,13 +357,18 @@ struct necp_client {
        u_int16_t ip_protocol;
        int proc_pid;
 
-       LIST_HEAD(_necp_client_flow_list, necp_client_flow) flow_list;
+       struct _necp_client_flow_tree flow_registrations;
        LIST_HEAD(_necp_client_assertion_list, necp_client_assertion) assertion_list;
 
        struct rtentry *current_route;
 
-       void *interface_handle;
-       void (*interface_cb)(void *handle, int action, struct necp_client_flow *flow);
+       struct necp_client_interface_option interface_options[NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
+       struct necp_client_interface_option *extra_interface_options;
+       u_int8_t interface_option_count; // Number in interface_options + extra_interface_options
+
+       struct necp_client_result_netagent failed_trigger_agent;
+
+       void *agent_handle;
 
        size_t parameters_length;
        u_int8_t parameters[0];
@@ -331,18 +392,19 @@ necp_client_add_assertion(struct necp_client *client, uuid_t netagent_uuid);
 static bool
 necp_client_remove_assertion(struct necp_client *client, uuid_t netagent_uuid);
 
-LIST_HEAD(_necp_client_list, necp_client);
-static struct _necp_client_list necp_collect_stats_client_list;
+LIST_HEAD(_necp_flow_registration_list, necp_client_flow_registration);
+static struct _necp_flow_registration_list necp_collect_stats_flow_list;
 
-struct necp_client_defunct {
-       LIST_ENTRY(necp_client_defunct) chain;
+struct necp_flow_defunct {
+       LIST_ENTRY(necp_flow_defunct) chain;
 
-       uuid_t client_id;
+       uuid_t flow_id;
        uuid_t nexus_agent;
+       void *agent_handle;
        int proc_pid;
 };
 
-LIST_HEAD(_necp_client_defunct_list, necp_client_defunct);
+LIST_HEAD(_necp_flow_defunct_list, necp_flow_defunct);
 
 static int necp_client_id_cmp(struct necp_client *client0, struct necp_client *client1);
 
@@ -354,7 +416,16 @@ RB_HEAD(_necp_client_global_tree, necp_client);
 RB_PROTOTYPE_PREV(_necp_client_global_tree, necp_client, global_link, necp_client_id_cmp);
 RB_GENERATE_PREV(_necp_client_global_tree, necp_client, global_link, necp_client_id_cmp);
 
+RB_HEAD(_necp_fd_flow_tree, necp_client_flow_registration);
+RB_PROTOTYPE_PREV(_necp_fd_flow_tree, necp_client_flow_registration, fd_link, necp_client_flow_id_cmp);
+RB_GENERATE_PREV(_necp_fd_flow_tree, necp_client_flow_registration, fd_link, necp_client_flow_id_cmp);
+
+RB_HEAD(_necp_client_flow_global_tree, necp_client_flow_registration);
+RB_PROTOTYPE_PREV(_necp_client_flow_global_tree, necp_client_flow_registration, global_link, necp_client_flow_id_cmp);
+RB_GENERATE_PREV(_necp_client_flow_global_tree, necp_client_flow_registration, global_link, necp_client_flow_id_cmp);
+
 static struct _necp_client_global_tree necp_client_global_tree;
+static struct _necp_client_flow_global_tree necp_client_flow_global_tree;
 
 struct necp_client_update {
        TAILQ_ENTRY(necp_client_update) chain;
@@ -366,10 +437,15 @@ struct necp_client_update {
 };
 
 
+#define        NAIF_ATTACHED   0x1     // arena is attached to list
+#define        NAIF_REDIRECT   0x2     // arena mmap has been redirected
+#define        NAIF_DEFUNCT    0x4     // arena is now defunct
+
 struct necp_fd_data {
        u_int8_t necp_fd_type;
        LIST_ENTRY(necp_fd_data) chain;
        struct _necp_client_tree clients;
+       struct _necp_fd_flow_tree flows;
        TAILQ_HEAD(_necp_client_update_list, necp_client_update) update_list;
        int update_count;
        int flags;
@@ -392,11 +468,17 @@ static LIST_HEAD(_necp_fd_observer_list, necp_fd_data) necp_fd_observer_list;
 static unsigned int necp_client_fd_size;       /* size of zone element */
 static struct zone *necp_client_fd_zone;       /* zone for necp_fd_data */
 
-#define        NECP_FLOW_ZONE_MAX                      512
-#define        NECP_FLOW_ZONE_NAME                     "necp.flow"
+#define        NECP_FLOW_ZONE_NAME                                     "necp.flow"
+#define        NECP_FLOW_REGISTRATION_ZONE_NAME        "necp.flowregistration"
 
 static unsigned int necp_flow_size;            /* size of necp_client_flow */
-static struct mcache *necp_flow_cache;         /* cache for necp_client_flow */
+static struct mcache *necp_flow_cache; /* cache for necp_client_flow */
+
+static unsigned int necp_flow_registration_size;       /* size of necp_client_flow_registration */
+static struct mcache *necp_flow_registration_cache;    /* cache for necp_client_flow_registration */
+
+#define        NECP_ARENA_INFO_ZONE_MAX                128
+#define        NECP_ARENA_INFO_ZONE_NAME               "necp.arenainfo"
 
 
 static lck_grp_attr_t  *necp_fd_grp_attr       = NULL;
@@ -406,6 +488,7 @@ static      lck_grp_t               *necp_fd_mtx_grp        = NULL;
 decl_lck_rw_data(static, necp_fd_lock);
 decl_lck_rw_data(static, necp_observer_lock);
 decl_lck_rw_data(static, necp_client_tree_lock);
+decl_lck_rw_data(static, necp_flow_tree_lock);
 decl_lck_rw_data(static, necp_collect_stats_list_lock);
 
 #define NECP_STATS_LIST_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_collect_stats_list_lock)
@@ -415,6 +498,12 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock);
 #define NECP_CLIENT_TREE_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_client_tree_lock)
 #define NECP_CLIENT_TREE_LOCK_SHARED() lck_rw_lock_shared(&necp_client_tree_lock)
 #define NECP_CLIENT_TREE_UNLOCK() lck_rw_done(&necp_client_tree_lock)
+#define NECP_CLIENT_TREE_ASSERT_LOCKED() LCK_RW_ASSERT(&necp_client_tree_lock, LCK_RW_ASSERT_HELD)
+
+#define NECP_FLOW_TREE_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_flow_tree_lock)
+#define NECP_FLOW_TREE_LOCK_SHARED() lck_rw_lock_shared(&necp_flow_tree_lock)
+#define NECP_FLOW_TREE_UNLOCK() lck_rw_done(&necp_flow_tree_lock)
+#define NECP_FLOW_TREE_ASSERT_LOCKED() LCK_RW_ASSERT(&necp_flow_tree_lock, LCK_RW_ASSERT_HELD)
 
 #define NECP_FD_LIST_LOCK_EXCLUSIVE() lck_rw_lock_exclusive(&necp_fd_lock)
 #define NECP_FD_LIST_LOCK_SHARED() lck_rw_lock_shared(&necp_fd_lock)
@@ -428,7 +517,8 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock);
 
 // Take NECP_FD_LIST_LOCK when accessing or modifying the necp_fd_list
 // Take NECP_CLIENT_TREE_LOCK when accessing or modifying the necp_client_global_tree
-// Take NECP_STATS_LIST_LOCK when accessing or modifying the necp_collect_stats_client_list
+// Take NECP_FLOW_TREE_LOCK when accessing or modifying the necp_client_flow_global_tree
+// Take NECP_STATS_LIST_LOCK when accessing or modifying the necp_collect_stats_flow_list
 // Take NECP_FD_LOCK when accessing or modifying an necp_fd_data entry
 // Take NECP_CLIENT_LOCK when accessing or modifying a single necp_client
 // Take NECP_CLIENT_ROUTE_LOCK when accessing or modifying a client's route
@@ -438,8 +528,9 @@ decl_lck_rw_data(static, necp_collect_stats_list_lock);
 // 2. NECP_FD_LOCK (any)
 // 3. NECP_CLIENT_TREE_LOCK
 // 4. NECP_CLIENT_LOCK (any)
-// 5. NECP_STATS_LIST_LOCK
-// 6. NECP_CLIENT_ROUTE_LOCK (any)
+// 5. NECP_FLOW_TREE_LOCK
+// 6. NECP_STATS_LIST_LOCK
+// 7. NECP_CLIENT_ROUTE_LOCK (any)
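A minimal illustrative sketch of the ordering above, assuming fd_data and client are already in hand and using the lock macros defined in this file; a path that needs both a client and the global flow tree takes the locks in the documented order and releases them in reverse:

	NECP_FD_LOCK(fd_data);                  // level 2
	NECP_CLIENT_TREE_LOCK_SHARED();         // level 3
	NECP_CLIENT_LOCK(client);               // level 4
	NECP_FLOW_TREE_LOCK_SHARED();           // level 5
	/* ... inspect client->flow_registrations / necp_client_flow_global_tree ... */
	NECP_FLOW_TREE_UNLOCK();
	NECP_CLIENT_UNLOCK(client);
	NECP_CLIENT_TREE_UNLOCK();
	NECP_FD_UNLOCK(fd_data);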
 
 static thread_call_t necp_client_update_tcall;
 
@@ -489,6 +580,19 @@ necp_fd_notify(struct necp_fd_data *fd_data, bool locked)
        }
 }
 
+static inline bool
+necp_client_has_unread_flows(struct necp_client *client)
+{
+       NECP_CLIENT_ASSERT_LOCKED(client);
+       struct necp_client_flow_registration *flow_registration = NULL;
+       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+               if (!flow_registration->flow_result_read) {
+                       return true;
+               }
+       }
+       return false;
+}
+
 static int
 necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p, int is_kevent)
 {
@@ -508,7 +612,7 @@ necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p
                        bool has_unread_clients = FALSE;
                        RB_FOREACH(client, _necp_client_tree, &fd_data->clients) {
                                NECP_CLIENT_LOCK(client);
-                               if (!client->result_read || !client->flow_result_read) {
+                               if (!client->result_read || necp_client_has_unread_flows(client)) {
                                        has_unread_clients = TRUE;
                                }
                                NECP_CLIENT_UNLOCK(client);
@@ -526,14 +630,96 @@ necp_fd_poll(struct necp_fd_data *fd_data, int events, void *wql, struct proc *p
        return (revents);
 }
 
+static inline void
+necp_generate_client_id(uuid_t client_id, bool is_flow)
+{
+       uuid_generate_random(client_id);
+
+       if (is_flow) {
+               client_id[9] |= 0x01;
+       } else {
+               client_id[9] &= ~0x01;
+       }
+}
+
+static inline bool
+necp_client_id_is_flow(uuid_t client_id)
+{
+       return (client_id[9] & 0x01);
+}
+
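A minimal usage sketch, not part of this change: bit 0 of byte 9 of the UUID tags an ID as a flow registration, which is what lets lookup code choose between the client tree and the flow tree; variable names are illustrative.

	uuid_t registration_id;
	necp_generate_client_id(registration_id, true);    // tagged as a flow ID
	assert(necp_client_id_is_flow(registration_id));

	uuid_t plain_client_id;
	necp_generate_client_id(plain_client_id, false);   // plain client ID
	assert(!necp_client_id_is_flow(plain_client_id));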
 static struct necp_client *
-necp_client_fd_find_client_and_lock(struct necp_fd_data *client_fd, uuid_t client_id)
+necp_find_client_and_lock(uuid_t client_id)
+{
+       NECP_CLIENT_TREE_ASSERT_LOCKED();
+
+       struct necp_client *client = NULL;
+
+       if (necp_client_id_is_flow(client_id)) {
+               NECP_FLOW_TREE_LOCK_SHARED();
+               struct necp_client_flow_registration find;
+               uuid_copy(find.registration_id, client_id);
+               struct necp_client_flow_registration *flow = RB_FIND(_necp_client_flow_global_tree, &necp_client_flow_global_tree, &find);
+               if (flow != NULL) {
+                       client = flow->client;
+               }
+               NECP_FLOW_TREE_UNLOCK();
+       } else {
+               struct necp_client find;
+               uuid_copy(find.client_id, client_id);
+               client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
+       }
+
+       if (client != NULL) {
+               NECP_CLIENT_LOCK(client);
+       }
+
+       return (client);
+}
+
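A minimal caller-side sketch, assuming the client tree lock is held as the assertion above requires; the client, if found, is returned locked:

	NECP_CLIENT_TREE_LOCK_SHARED();
	struct necp_client *client = necp_find_client_and_lock(client_id);
	if (client != NULL) {
		/* ... operate on the locked client ... */
		NECP_CLIENT_UNLOCK(client);
	}
	NECP_CLIENT_TREE_UNLOCK();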
+static struct necp_client_flow_registration *
+necp_client_find_flow(struct necp_client *client, uuid_t flow_id)
+{
+       NECP_CLIENT_ASSERT_LOCKED(client);
+       struct necp_client_flow_registration *flow = NULL;
+
+       if (necp_client_id_is_flow(flow_id)) {
+               struct necp_client_flow_registration find;
+               uuid_copy(find.registration_id, flow_id);
+               flow = RB_FIND(_necp_client_flow_tree, &client->flow_registrations, &find);
+       } else {
+               flow = RB_ROOT(&client->flow_registrations);
+       }
+
+       return (flow);
+}
+
+static struct necp_client *
+necp_client_fd_find_client_unlocked(struct necp_fd_data *client_fd, uuid_t client_id)
 {
-       struct necp_client find;
        NECP_FD_ASSERT_LOCKED(client_fd);
-       uuid_copy(find.client_id, client_id);
-       struct necp_client *client = RB_FIND(_necp_client_tree, &client_fd->clients, &find);
+       struct necp_client *client = NULL;
+
+       if (necp_client_id_is_flow(client_id)) {
+               struct necp_client_flow_registration find;
+               uuid_copy(find.registration_id, client_id);
+               struct necp_client_flow_registration *flow = RB_FIND(_necp_fd_flow_tree, &client_fd->flows, &find);
+               if (flow != NULL) {
+                       client = flow->client;
+               }
+       } else {
+               struct necp_client find;
+               uuid_copy(find.client_id, client_id);
+               client = RB_FIND(_necp_client_tree, &client_fd->clients, &find);
+       }
+
+       return (client);
+}
 
+static struct necp_client *
+necp_client_fd_find_client_and_lock(struct necp_fd_data *client_fd, uuid_t client_id)
+{
+       struct necp_client *client = necp_client_fd_find_client_unlocked(client_fd, client_id);
        if (client != NULL) {
                NECP_CLIENT_LOCK(client);
        }
@@ -547,6 +733,12 @@ necp_client_id_cmp(struct necp_client *client0, struct necp_client *client1)
        return (uuid_compare(client0->client_id, client1->client_id));
 }
 
+static inline int
+necp_client_flow_id_cmp(struct necp_client_flow_registration *flow0, struct necp_client_flow_registration *flow1)
+{
+       return (uuid_compare(flow0->registration_id, flow1->registration_id));
+}
+
 static int
 necpop_select(struct fileproc *fp, int which, void *wql, vfs_context_t ctx)
 {
@@ -630,8 +822,6 @@ necp_fd_knrtouch(struct knote *kn, struct kevent_internal_s *kev)
        fd_data = (struct necp_fd_data *)kn->kn_hook;
 
        NECP_FD_LOCK(fd_data);
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
        revents = necp_fd_poll(fd_data, POLLIN, NULL, current_proc(), 1);
        NECP_FD_UNLOCK(fd_data);
 
@@ -681,58 +871,47 @@ necpop_kqfilter(struct fileproc *fp, struct knote *kn,
        return ((revents & POLLIN) != 0);
 }
 
+#define INTERFACE_FLAGS_SHIFT   32
+#define INTERFACE_FLAGS_MASK    0xffff
+#define INTERFACE_INDEX_SHIFT   0
+#define INTERFACE_INDEX_MASK    0xffffffff
 
-static bool
-necp_set_client_defunct(struct necp_client *client)
+static uint64_t
+combine_interface_details(uint32_t interface_index, uint16_t interface_flags)
 {
-       bool updated = FALSE;
-       u_int32_t flags = 0;
-       u_int32_t value_size = 0;
-
-       client->defunct = TRUE;
-
-       u_int8_t *flags_pointer = necp_buffer_get_tlv_value(client->result, 0, &value_size);
-       if (flags_pointer && value_size == sizeof(flags)) {
-               memcpy(&flags, flags_pointer, value_size);
-
-               flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT;
-
-               (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS,
-                                                                                                sizeof(flags), &flags, &updated, client->result, sizeof(client->result));
-
-        if (updated) {
-            client->result_read = FALSE;
-        }
-       }
-
-       return (updated);
+    return (((uint64_t)interface_flags & INTERFACE_FLAGS_MASK) << INTERFACE_FLAGS_SHIFT |
+            ((uint64_t)interface_index & INTERFACE_INDEX_MASK) << INTERFACE_INDEX_SHIFT);
 }
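For illustration only, the inverse of the packing above recovers the index and flags from the combined 64-bit value; the helper below is hypothetical and not part of this change:

	// Hypothetical inverse of combine_interface_details().
	static void
	split_interface_details(uint64_t combined, uint32_t *interface_index, uint16_t *interface_flags)
	{
		*interface_index = (uint32_t)((combined >> INTERFACE_INDEX_SHIFT) & INTERFACE_INDEX_MASK);
		*interface_flags = (uint16_t)((combined >> INTERFACE_FLAGS_SHIFT) & INTERFACE_FLAGS_MASK);
	}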
 
+
 static void
-necp_defunct_client_for_policy(struct necp_client *client,
-                                                          struct _necp_client_defunct_list *defunct_list)
+necp_defunct_flow_registration(struct necp_client *client,
+                                                          struct necp_client_flow_registration *flow_registration,
+                                                          struct _necp_flow_defunct_list *defunct_list)
 {
        NECP_CLIENT_ASSERT_LOCKED(client);
-       
-       if (!client->defunct) {
+
+       if (!flow_registration->defunct) {
                bool needs_defunct = false;
                struct necp_client_flow *search_flow = NULL;
-               LIST_FOREACH(search_flow, &client->flow_list, flow_chain) {
+               LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
                        if (search_flow->nexus &&
-                               !uuid_is_null(search_flow->u.nexus_agent) &&
-                               search_flow->requested_nexus) {
+                               !uuid_is_null(search_flow->u.nexus_agent)) {
 
                                // Save defunct values for the nexus
                                if (defunct_list != NULL) {
                                        // Sleeping alloc won't fail; copy only what's necessary
-                                       struct necp_client_defunct *client_defunct = _MALLOC(sizeof (struct necp_client_defunct),
-                                                                                                                                                M_NECP, M_WAITOK | M_ZERO);
-                                       uuid_copy(client_defunct->nexus_agent, search_flow->u.nexus_agent);
-                                       uuid_copy(client_defunct->client_id, client->client_id);
-                                       client_defunct->proc_pid = client->proc_pid;
+                                       struct necp_flow_defunct *flow_defunct = _MALLOC(sizeof (struct necp_flow_defunct),
+                                                                                                                                        M_NECP, M_WAITOK | M_ZERO);
+                                       uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent);
+                                       uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ?
+                                                                                                         client->client_id :
+                                                                                                         flow_registration->registration_id));
+                                       flow_defunct->proc_pid = client->proc_pid;
+                                       flow_defunct->agent_handle = client->agent_handle;
 
                                        // Add to the list provided by caller
-                                       LIST_INSERT_HEAD(defunct_list, client_defunct, chain);
+                                       LIST_INSERT_HEAD(defunct_list, flow_defunct, chain);
                                }
 
                                needs_defunct = true;
@@ -740,12 +919,25 @@ necp_defunct_client_for_policy(struct necp_client *client,
                }
 
                if (needs_defunct) {
+
                        // Only set defunct if there was some assigned flow
-                       client->defunct = true;
+                       flow_registration->defunct = true;
                }
        }
 }
 
+static void
+necp_defunct_client_for_policy(struct necp_client *client,
+                                                          struct _necp_flow_defunct_list *defunct_list)
+{
+       NECP_CLIENT_ASSERT_LOCKED(client);
+
+       struct necp_client_flow_registration *flow_registration = NULL;
+       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+               necp_defunct_flow_registration(client, flow_registration, defunct_list);
+       }
+}
+
 static void
 necp_client_free(struct necp_client *client)
 {
@@ -753,6 +945,9 @@ necp_client_free(struct necp_client *client)
 
        NECP_CLIENT_UNLOCK(client);
 
+       FREE(client->extra_interface_options, M_NECP);
+       client->extra_interface_options = NULL;
+
        lck_mtx_destroy(&client->route_lock, necp_fd_mtx_grp);
        lck_mtx_destroy(&client->lock, necp_fd_mtx_grp);
 
@@ -927,32 +1122,26 @@ necp_client_update_observer_remove(struct necp_client *client)
 }
 
 static void
-necp_destroy_client(struct necp_client *client, pid_t pid, bool abort)
+necp_destroy_client_flow_registration(struct necp_client *client,
+                                                                         struct necp_client_flow_registration *flow_registration,
+                                                                         pid_t pid, bool abort)
 {
-       NECP_CLIENT_ASSERT_UNLOCKED(client);
-
-       necp_client_update_observer_remove(client);
-
-       NECP_CLIENT_LOCK(client);
+       NECP_CLIENT_ASSERT_LOCKED(client);
 
-       // Free route
-       NECP_CLIENT_ROUTE_LOCK(client);
-       if (client->current_route != NULL) {
-               rtfree(client->current_route);
-               client->current_route = NULL;
-       }
-       NECP_CLIENT_ROUTE_UNLOCK(client);
 
-       // Remove flow assignments
        struct necp_client_flow *search_flow = NULL;
        struct necp_client_flow *temp_flow = NULL;
-       LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) {
+       LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) {
                if (search_flow->nexus &&
-                       !uuid_is_null(search_flow->u.nexus_agent) &&
-                       search_flow->requested_nexus) {
+                       !uuid_is_null(search_flow->u.nexus_agent)) {
                        // Note that if we had defuncted the client earlier, this would result in a harmless ENOENT
-                       int netagent_error = netagent_client_message(search_flow->u.nexus_agent, client->client_id, pid,
-                           abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS : NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS);
+                       int netagent_error = netagent_client_message(search_flow->u.nexus_agent,
+                                                                                                                ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ?
+                                                                                                                 client->client_id :
+                                                                                                                 flow_registration->registration_id),
+                                                                                                                pid, client->agent_handle,
+                                                                                                                (abort ? NETAGENT_MESSAGE_TYPE_ABORT_NEXUS :
+                                                                                                                 NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS));
                        if (netagent_error != 0 && netagent_error != ENOENT) {
                                NECPLOG(LOG_ERR, "necp_client_remove close nexus error (%d)", netagent_error);
                        }
@@ -971,11 +1160,42 @@ necp_destroy_client(struct necp_client *client, pid_t pid, bool abort)
                mcache_free(necp_flow_cache, search_flow);
        }
 
+       RB_REMOVE(_necp_client_flow_tree, &client->flow_registrations, flow_registration);
+       flow_registration->client = NULL;
+
+       mcache_free(necp_flow_registration_cache, flow_registration);
+}
+
+static void
+necp_destroy_client(struct necp_client *client, pid_t pid, bool abort)
+{
+       NECP_CLIENT_ASSERT_UNLOCKED(client);
+
+       necp_client_update_observer_remove(client);
+
+       NECP_CLIENT_LOCK(client);
+
+       // Free route
+       NECP_CLIENT_ROUTE_LOCK(client);
+       if (client->current_route != NULL) {
+               rtfree(client->current_route);
+               client->current_route = NULL;
+       }
+       NECP_CLIENT_ROUTE_UNLOCK(client);
+
+       // Remove flow assignments
+       struct necp_client_flow_registration *flow_registration = NULL;
+       struct necp_client_flow_registration *temp_flow_registration = NULL;
+       RB_FOREACH_SAFE(flow_registration, _necp_client_flow_tree, &client->flow_registrations, temp_flow_registration) {
+               necp_destroy_client_flow_registration(client, flow_registration, pid, abort);
+       }
+
        // Remove agent assertions
        struct necp_client_assertion *search_assertion = NULL;
        struct necp_client_assertion *temp_assertion = NULL;
        LIST_FOREACH_SAFE(search_assertion, &client->assertion_list, assertion_chain, temp_assertion) {
-               int netagent_error = netagent_client_message(search_assertion->asserted_netagent, client->client_id, pid, NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT);
+               int netagent_error = netagent_client_message(search_assertion->asserted_netagent, client->client_id, pid,
+                                                                                                        client->agent_handle, NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT);
                if (netagent_error != 0) {
                        NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR),
                                        "necp_client_remove unassert agent error (%d)", netagent_error);
@@ -1018,6 +1238,16 @@ necpop_close(struct fileglob *fg, vfs_context_t ctx)
 
                NECP_FD_LOCK(fd_data);
                pid_t pid = fd_data->proc_pid;
+
+               struct necp_client_flow_registration *flow_registration = NULL;
+               struct necp_client_flow_registration *temp_flow_registration = NULL;
+               RB_FOREACH_SAFE(flow_registration, _necp_fd_flow_tree, &fd_data->flows, temp_flow_registration) {
+                       NECP_FLOW_TREE_LOCK_EXCLUSIVE();
+                       RB_REMOVE(_necp_client_flow_global_tree, &necp_client_flow_global_tree, flow_registration);
+                       NECP_FLOW_TREE_UNLOCK();
+                       RB_REMOVE(_necp_fd_flow_tree, &fd_data->flows, flow_registration);
+               }
+
                struct necp_client *client = NULL;
                struct necp_client *temp_client = NULL;
                RB_FOREACH_SAFE(client, _necp_client_tree, &fd_data->clients, temp_client) {
@@ -1089,75 +1319,143 @@ necp_find_fd_data(int fd, struct necp_fd_data **fd_data)
        }
        *fd_data = (struct necp_fd_data *)fp->f_fglob->fg_data;
 
+       if ((*fd_data)->necp_fd_type != necp_fd_type_client) {
+               // Not a client fd, ignore
+               error = EINVAL;
+               goto done;
+       }
+
 done:
        proc_fdunlock(p);
        return (error);
 }
 
 
-static void
-necp_client_add_socket_flow(struct necp_client *client, struct inpcb *inp)
-{
-       struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP);
-       if (new_flow == NULL) {
-               NECPLOG0(LOG_ERR, "Failed to allocate socket flow");
-               return;
-       }
-
-       memset(new_flow, 0, sizeof(*new_flow));
-
-       new_flow->socket = TRUE;
-       new_flow->u.socket_handle = inp;
-       new_flow->u.cb = inp->necp_cb;
-
-       OSIncrementAtomic(&necp_socket_flow_count);
-
-       LIST_INSERT_HEAD(&client->flow_list, new_flow, flow_chain);
-}
-
-static void
-necp_client_add_interface_flow(struct necp_client *client, uint32_t interface_index)
+static struct necp_client_flow *
+necp_client_add_interface_flow(struct necp_client_flow_registration *flow_registration,
+                                                          uint32_t interface_index)
 {
        struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP);
        if (new_flow == NULL) {
                NECPLOG0(LOG_ERR, "Failed to allocate interface flow");
-               return;
+               return NULL;
        }
 
        memset(new_flow, 0, sizeof(*new_flow));
 
        // Neither nexus nor socket
        new_flow->interface_index = interface_index;
-       new_flow->u.socket_handle = client->interface_handle;
-       new_flow->u.cb = client->interface_cb;
+       new_flow->u.socket_handle = flow_registration->interface_handle;
+       new_flow->u.cb = flow_registration->interface_cb;
 
        OSIncrementAtomic(&necp_if_flow_count);
 
-       LIST_INSERT_HEAD(&client->flow_list, new_flow, flow_chain);
+       LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain);
+
+       return new_flow;
 }
 
-static void
-necp_client_add_interface_flow_if_needed(struct necp_client *client, uint32_t interface_index)
+static struct necp_client_flow *
+necp_client_add_interface_flow_if_needed(struct necp_client *client,
+                                                                                struct necp_client_flow_registration *flow_registration,
+                                                                                uint32_t interface_index)
 {
        if (!client->allow_multiple_flows ||
                interface_index == IFSCOPE_NONE) {
                // Interface not set, or client not allowed to use this mode
-               return;
+               return NULL;
        }
 
        struct necp_client_flow *flow = NULL;
-       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
+       LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
                if (!flow->nexus && !flow->socket && flow->interface_index == interface_index) {
                        // Already have the flow
                        flow->invalid = FALSE;
+                       flow->u.socket_handle = flow_registration->interface_handle;
+                       flow->u.cb = flow_registration->interface_cb;
+                       return NULL;
+               }
+       }
+       return necp_client_add_interface_flow(flow_registration, interface_index);
+}
+
+static void
+necp_client_add_interface_option_if_needed(struct necp_client *client,
+                                                                                  uint32_t interface_index,
+                                                                                  uint32_t interface_generation,
+                                                                                  uuid_t *nexus_agent)
+{
+       if (interface_index == IFSCOPE_NONE ||
+               (client->interface_option_count != 0 && !client->allow_multiple_flows)) {
+               // Interface not set, or client not allowed to use this mode
+               return;
+       }
+
+       if (client->interface_option_count >= NECP_CLIENT_MAX_INTERFACE_OPTIONS) {
+               // Cannot take any more interface options
+               return;
+       }
 
-                       flow->u.socket_handle = client->interface_handle;
-                       flow->u.cb = client->interface_cb;
-                       return;
+       // Check if already present
+       for (u_int32_t option_i = 0; option_i < client->interface_option_count; option_i++) {
+               if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) {
+                       struct necp_client_interface_option *option = &client->interface_options[option_i];
+                       if (option->interface_index == interface_index) {
+                               if (nexus_agent == NULL) {
+                                       return;
+                               }
+                               if (uuid_compare(option->nexus_agent, *nexus_agent) == 0) {
+                                       return;
+                               }
+                               if (uuid_is_null(option->nexus_agent)) {
+                                       uuid_copy(option->nexus_agent, *nexus_agent);
+                                       return;
+                               }
+                               // If we get to this point, this is a new nexus flow
+                       }
+               } else {
+                       struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
+                       if (option->interface_index == interface_index) {
+                               if (nexus_agent == NULL) {
+                                       return;
+                               }
+                               if (uuid_compare(option->nexus_agent, *nexus_agent) == 0) {
+                                       return;
+                               }
+                               if (uuid_is_null(option->nexus_agent)) {
+                                       uuid_copy(option->nexus_agent, *nexus_agent);
+                                       return;
+                               }
+                               // If we get to this point, this is a new nexus flow
+                       }
                }
        }
 
-       necp_client_add_interface_flow(client, interface_index);
+       // Add a new entry
+       if (client->interface_option_count < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) {
+               // Add to static
+               struct necp_client_interface_option *option = &client->interface_options[client->interface_option_count];
+               option->interface_index = interface_index;
+               option->interface_generation = interface_generation;
+               if (nexus_agent != NULL) {
+                       uuid_copy(option->nexus_agent, *nexus_agent);
+               }
+               client->interface_option_count++;
+       } else {
+               // Add to extra
+               if (client->extra_interface_options == NULL) {
+                       client->extra_interface_options = _MALLOC(sizeof(struct necp_client_interface_option) * NECP_CLIENT_INTERFACE_OPTION_EXTRA_COUNT, M_NECP, M_WAITOK | M_ZERO);
+               }
+               if (client->extra_interface_options != NULL) {
+                       struct necp_client_interface_option *option = &client->extra_interface_options[client->interface_option_count - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
+                       option->interface_index = interface_index;
+                       option->interface_generation = interface_generation;
+                       if (nexus_agent != NULL) {
+                               uuid_copy(option->nexus_agent, *nexus_agent);
+                       }
+                       client->interface_option_count++;
+               }
+       }
 }
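A minimal illustrative accessor, not part of this change, showing how the hybrid storage above is addressed: the first NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT options live inline in the client, and any overflow spills into the lazily allocated extra_interface_options array.

	// Hypothetical: map a flat option index onto static vs. extra storage.
	static struct necp_client_interface_option *
	necp_client_interface_option_at(struct necp_client *client, u_int32_t option_i)
	{
		if (option_i >= client->interface_option_count) {
			return NULL;
		}
		if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) {
			return &client->interface_options[option_i];
		}
		if (client->extra_interface_options == NULL) {
			return NULL;
		}
		return &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
	}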
 
 static bool
@@ -1179,66 +1477,110 @@ necp_client_flow_is_viable(proc_t proc, struct necp_client *client,
                        result.routing_result != NECP_KERNEL_POLICY_RESULT_DROP);
 }
 
+static void
+necp_flow_add_interface_flows(proc_t proc,
+                                                         struct necp_client *client,
+                                                         struct necp_client_flow_registration *flow_registration,
+                                                         bool send_initial)
+{
+       // Traverse all interfaces and add a tracking flow if needed
+       for (u_int32_t option_i = 0; option_i < client->interface_option_count; option_i++) {
+               if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) {
+                       struct necp_client_interface_option *option = &client->interface_options[option_i];
+                       struct necp_client_flow *flow = necp_client_add_interface_flow_if_needed(client, flow_registration, option->interface_index);
+                       if (flow != NULL && send_initial) {
+                               flow->viable = necp_client_flow_is_viable(proc, client, flow);
+                               if (flow->viable && flow->u.cb) {
+                                       bool viable = flow->viable;
+                                       flow->u.cb(flow_registration->interface_handle, NECP_CLIENT_CBACTION_INITIAL, flow->interface_index, flow->necp_flow_flags, &viable);
+                                       flow->viable = viable;
+                               }
+                       }
+               } else {
+                       struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
+                       struct necp_client_flow *flow = necp_client_add_interface_flow_if_needed(client, flow_registration, option->interface_index);
+                       if (flow != NULL && send_initial) {
+                               flow->viable = necp_client_flow_is_viable(proc, client, flow);
+                               if (flow->viable && flow->u.cb) {
+                                       bool viable = flow->viable;
+                                       flow->u.cb(flow_registration->interface_handle, NECP_CLIENT_CBACTION_INITIAL, flow->interface_index, flow->necp_flow_flags, &viable);
+                                       flow->viable = viable;
+                               }
+                       }
+               }
+       }
+}
+
 static bool
 necp_client_update_flows(proc_t proc,
                                                 struct necp_client *client,
-                                                struct _necp_client_defunct_list *defunct_list,
-                                                bool *defuncted_by_flow)
+                                                struct _necp_flow_defunct_list *defunct_list)
 {
        NECP_CLIENT_ASSERT_LOCKED(client);
 
        bool client_updated = FALSE;
        struct necp_client_flow *flow = NULL;
        struct necp_client_flow *temp_flow = NULL;
-       LIST_FOREACH_SAFE(flow, &client->flow_list, flow_chain, temp_flow) {
-               // Check policy result for flow
-               int old_flags = flow->necp_flow_flags;
-               bool viable = necp_client_flow_is_viable(proc, client, flow);
-
-               // TODO: Defunct nexus flows that are blocked by policy
-
-               if (flow->viable != viable) {
-                       flow->viable = viable;
-                       client_updated = TRUE;
+       struct necp_client_flow_registration *flow_registration = NULL;
+       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+               if (flow_registration->interface_cb != NULL) {
+                       // Add any interface flows that are not already tracked
+                       necp_flow_add_interface_flows(proc, client, flow_registration, false);
                }
 
-               if ((old_flags & (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6)) !=
-                   (flow->necp_flow_flags & (NECP_CLIENT_RESULT_FLAG_HAS_IPV4 | NECP_CLIENT_RESULT_FLAG_HAS_IPV6))) {
-                       client_updated = TRUE;
-               }
+               LIST_FOREACH_SAFE(flow, &flow_registration->flow_list, flow_chain, temp_flow) {
+                       // Check policy result for flow
+                       int old_flags = flow->necp_flow_flags;
+                       bool viable = necp_client_flow_is_viable(proc, client, flow);
 
-               if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
-                       flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow);
-               }
+                       // TODO: Defunct nexus flows that are blocked by policy
 
-               if (!flow->viable || flow->invalid) {
-                       if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
-                               flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow);
+                       if (flow->viable != viable) {
+                               flow->viable = viable;
+                               client_updated = TRUE;
+                       }
+
+                       if ((old_flags & NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE) !=
+                               (flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_FORCE_UPDATE)) {
+                               client_updated = TRUE;
                        }
-                       // The callback might change the viable-flag of the
-                       // flow depending on its policy. Thus, we need to
-                       // check again the flags after the callback.
-               }
 
-               (void)defunct_list;
-               (void)defuncted_by_flow;
+                       if (flow->viable && client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
+                               bool flow_viable = flow->viable;
+                               flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_VIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable);
+                               flow->viable = flow_viable;
+                       }
 
-               // Handle flows that no longer match
-               if (!flow->viable || flow->invalid) {
-                       // Drop them as long as they aren't assigned data
-                       if (!flow->requested_nexus && !flow->assigned) {
-                               if (flow->assigned_results != NULL) {
-                                       FREE(flow->assigned_results, M_NETAGENT);
-                                       flow->assigned_results = NULL;
-                                       client_updated = TRUE;
+                       if (!flow->viable || flow->invalid) {
+                               if (client_updated && (flow->socket || (!flow->socket && !flow->nexus)) && flow->u.cb) {
+                                       bool flow_viable = flow->viable;
+                                       flow->u.cb(flow->u.socket_handle, NECP_CLIENT_CBACTION_NONVIABLE, flow->interface_index, flow->necp_flow_flags, &flow_viable);
+                                       flow->viable = flow_viable;
                                }
-                               LIST_REMOVE(flow, flow_chain);
-                               if (flow->socket) {
-                                       OSDecrementAtomic(&necp_socket_flow_count);
-                               } else {
-                                       OSDecrementAtomic(&necp_if_flow_count);
+                               // The callback might change the viable-flag of the
+                               // flow depending on its policy. Thus, we need to
+                               // check the flags again after the callback.
+                       }
+
+                       (void)defunct_list;
+
+                       // Handle flows that no longer match
+                       if (!flow->viable || flow->invalid) {
+                               // Drop them as long as they aren't assigned data
+                               if (!flow->nexus && !flow->assigned) {
+                                       if (flow->assigned_results != NULL) {
+                                               FREE(flow->assigned_results, M_NETAGENT);
+                                               flow->assigned_results = NULL;
+                                               client_updated = TRUE;
+                                       }
+                                       LIST_REMOVE(flow, flow_chain);
+                                       if (flow->socket) {
+                                               OSDecrementAtomic(&necp_socket_flow_count);
+                                       } else {
+                                               OSDecrementAtomic(&necp_if_flow_count);
+                                       }
+                                       mcache_free(necp_flow_cache, flow);
                                }
-                               mcache_free(necp_flow_cache, flow);
                        }
                }
        }
@@ -1249,23 +1591,29 @@ necp_client_update_flows(proc_t proc,
 static void
 necp_client_mark_all_nonsocket_flows_as_invalid(struct necp_client *client)
 {
+       struct necp_client_flow_registration *flow_registration = NULL;
        struct necp_client_flow *flow = NULL;
-       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-               if (!flow->socket) { // Socket flows are not marked as invalid
-                       flow->invalid = TRUE;
+       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+               LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
+                       if (!flow->socket) { // Socket flows are not marked as invalid
+                               flow->invalid = TRUE;
+                       }
                }
        }
+
+       // Reset option count every update
+       client->interface_option_count = 0;
 }
 
 static bool
-necp_netagent_applies_to_client(__unused struct necp_client *client,
+necp_netagent_applies_to_client(struct necp_client *client,
                                                                const struct necp_client_parsed_parameters *parameters,
-                                                               uuid_t netagent_uuid, bool allow_nexus,
-                                                               uint32_t interface_index, u_int16_t interface_flags)
+                                                               uuid_t *netagent_uuid, bool allow_nexus,
+                                                               uint32_t interface_index, uint32_t interface_generation)
 {
-#pragma unused(interface_index, interface_flags)
+#pragma unused(interface_index, interface_generation)
        bool applies = FALSE;
-       u_int32_t flags = netagent_get_flags(netagent_uuid);
+       u_int32_t flags = netagent_get_flags(*netagent_uuid);
        if (!(flags & NETAGENT_FLAG_REGISTERED)) {
                // Unregistered agents never apply
                return (applies);
@@ -1279,6 +1627,17 @@ necp_netagent_applies_to_client(__unused struct necp_client *client,
                return (applies);
        }
 
+       if (uuid_compare(client->failed_trigger_agent.netagent_uuid, *netagent_uuid) == 0) {
+               if (client->failed_trigger_agent.generation == netagent_get_generation(*netagent_uuid)) {
+                       // If this agent was triggered, and failed, and hasn't changed, keep hiding it
+                       return (applies);
+               } else {
+                       // Mismatch generation, clear out old trigger
+                       uuid_clear(client->failed_trigger_agent.netagent_uuid);
+                       client->failed_trigger_agent.generation = 0;
+               }
+       }
+
        if (flags & NETAGENT_FLAG_SPECIFIC_USE_ONLY) {
                // Specific use agents only apply when required
                bool required = FALSE;
@@ -1288,7 +1647,7 @@ necp_netagent_applies_to_client(__unused struct necp_client *client,
                                if (uuid_is_null(parameters->required_netagents[i])) {
                                        break;
                                }
-                               if (uuid_compare(parameters->required_netagents[i], netagent_uuid) == 0) {
+                               if (uuid_compare(parameters->required_netagents[i], *netagent_uuid) == 0) {
                                        required = TRUE;
                                        break;
                                }
@@ -1309,7 +1668,7 @@ necp_netagent_applies_to_client(__unused struct necp_client *client,
                                        }
 
                                        if (!fetched_type) {
-                                               if (netagent_get_agent_domain_and_type(netagent_uuid, netagent_domain, netagent_type)) {
+                                               if (netagent_get_agent_domain_and_type(*netagent_uuid, netagent_domain, netagent_type)) {
                                                        fetched_type = TRUE;
                                                } else {
                                                        break;
@@ -1337,18 +1696,18 @@ necp_netagent_applies_to_client(__unused struct necp_client *client,
 }
 
 static void
-necp_client_add_agent_flows_for_interface(struct necp_client *client,
-                                                                                 const struct necp_client_parsed_parameters *parsed_parameters,
-                                                                                 ifnet_t ifp)
+necp_client_add_agent_interface_options(struct necp_client *client,
+                                                                               const struct necp_client_parsed_parameters *parsed_parameters,
+                                                                               ifnet_t ifp)
 {
        if (ifp != NULL && ifp->if_agentids != NULL) {
                for (u_int32_t i = 0; i < ifp->if_agentcount; i++) {
                        if (uuid_is_null(ifp->if_agentids[i])) {
                                continue;
                        }
-                       u_int16_t if_flags = nstat_ifnet_to_flags(ifp);
                        // Relies on the side effect that nexus agents that apply will create flows
-                       (void)necp_netagent_applies_to_client(client, parsed_parameters, ifp->if_agentids[i], TRUE, ifp->if_index, if_flags);
+                       (void)necp_netagent_applies_to_client(client, parsed_parameters, &ifp->if_agentids[i], TRUE,
+                                                                                                 ifp->if_index, ifnet_get_generation(ifp));
                }
        }
 }
@@ -1378,9 +1737,11 @@ necp_client_parse_parameters(u_int8_t *parameters,
        u_int32_t num_required_agents = 0;
        u_int32_t num_prohibited_agents = 0;
        u_int32_t num_preferred_agents = 0;
+       u_int32_t num_avoided_agents = 0;
        u_int32_t num_required_agent_types = 0;
        u_int32_t num_prohibited_agent_types = 0;
        u_int32_t num_preferred_agent_types = 0;
+       u_int32_t num_avoided_agent_types = 0;
 
        if (parsed_parameters == NULL) {
                return (EINVAL);
@@ -1536,6 +1897,17 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                                }
                                                break;
                                        }
+                                       case NECP_CLIENT_PARAMETER_AVOID_AGENT: {
+                                               if (num_avoided_agents >= NECP_MAX_PARSED_PARAMETERS) {
+                                                       break;
+                                               }
+                                               if (length >= sizeof(uuid_t)) {
+                                                       memcpy(&parsed_parameters->avoided_netagents[num_avoided_agents], value, sizeof(uuid_t));
+                                                       num_avoided_agents++;
+                                                       parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT;
+                                               }
+                                               break;
+                                       }
                                        case NECP_CLIENT_PARAMETER_REQUIRE_AGENT_TYPE: {
                                                if (num_required_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
                                                        break;
@@ -1569,9 +1941,20 @@ necp_client_parse_parameters(u_int8_t *parameters,
                                                }
                                                break;
                                        }
-                                       case NECP_CLIENT_PARAMETER_FLAGS: {
-                                               if (length >= sizeof(u_int32_t)) {
-                                                       memcpy(&parsed_parameters->flags, value, sizeof(parsed_parameters->flags));
+                                       case NECP_CLIENT_PARAMETER_AVOID_AGENT_TYPE: {
+                                               if (num_avoided_agent_types >= NECP_MAX_PARSED_PARAMETERS) {
+                                                       break;
+                                               }
+                                               if (length >= sizeof(struct necp_client_parameter_netagent_type)) {
+                                                       memcpy(&parsed_parameters->avoided_netagent_types[num_avoided_agent_types], value, sizeof(struct necp_client_parameter_netagent_type));
+                                                       num_avoided_agent_types++;
+                                                       parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE;
+                                               }
+                                               break;
+                                       }
+                                       case NECP_CLIENT_PARAMETER_FLAGS: {
+                                               if (length >= sizeof(u_int32_t)) {
+                                                       memcpy(&parsed_parameters->flags, value, sizeof(parsed_parameters->flags));
                                                        parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_FLAGS;
                                                }
                                                break;
@@ -1621,8 +2004,10 @@ static int
 necp_client_parse_result(u_int8_t *result,
                                                 u_int32_t result_size,
                                                 union necp_sockaddr_union *local_address,
-                                                union necp_sockaddr_union *remote_address)
+                                                union necp_sockaddr_union *remote_address,
+                                                void **flow_stats)
 {
+#pragma unused(flow_stats)
        int error = 0;
        size_t offset = 0;
 
@@ -1665,37 +2050,109 @@ necp_client_parse_result(u_int8_t *result,
        return (error);
 }
 
+static struct necp_client_flow_registration *
+necp_client_create_flow_registration(struct necp_fd_data *fd_data, struct necp_client *client)
+{
+       NECP_FD_ASSERT_LOCKED(fd_data);
+       NECP_CLIENT_ASSERT_LOCKED(client);
+
+       struct necp_client_flow_registration *new_registration = mcache_alloc(necp_flow_registration_cache, MCR_SLEEP);
+       if (new_registration == NULL) {
+               return NULL;
+       }
+
+       memset(new_registration, 0, sizeof(*new_registration));
+
+       new_registration->last_interface_details = combine_interface_details(IFSCOPE_NONE, NSTAT_IFNET_IS_UNKNOWN_TYPE);
+
+       necp_generate_client_id(new_registration->registration_id, true);
+       LIST_INIT(&new_registration->flow_list);
+
+       // Add registration to client list
+       RB_INSERT(_necp_client_flow_tree, &client->flow_registrations, new_registration);
+
+       // Add registration to fd list
+       RB_INSERT(_necp_fd_flow_tree, &fd_data->flows, new_registration);
+
+       // Add registration to global tree for lookup
+       NECP_FLOW_TREE_LOCK_EXCLUSIVE();
+       RB_INSERT(_necp_client_flow_global_tree, &necp_client_flow_global_tree, new_registration);
+       NECP_FLOW_TREE_UNLOCK();
+
+       new_registration->client = client;
+
+       // Start out assuming there is nothing to read from the flow
+       new_registration->flow_result_read = true;
+
+       return new_registration;
+}
+
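/*
 * Editor's sketch (illustrative, not part of this commit):
 * necp_client_create_flow_registration() above links a single registration
 * into three indexes -- the owning client's tree, the owning fd's tree, and
 * a global tree used for lookups by registration id -- then sets the client
 * back-pointer and marks the registration as having nothing to read yet.
 * A simplified user-space model with singly linked lists standing in for the
 * RB trees; all sketch_* names are hypothetical.
 */
#include <stdbool.h>
#include <stdlib.h>

struct sketch_registration {
        struct sketch_client        *client;
        bool                         flow_result_read;
        struct sketch_registration  *next_in_client;
        struct sketch_registration  *next_in_fd;
        struct sketch_registration  *next_global;
};

struct sketch_client { struct sketch_registration *registrations; };
struct sketch_fd     { struct sketch_registration *flows; };

static struct sketch_registration *sketch_global_list;

static struct sketch_registration *
sketch_create_registration(struct sketch_fd *fd, struct sketch_client *client)
{
        struct sketch_registration *reg = calloc(1, sizeof(*reg));
        if (reg == NULL) {
                return NULL;
        }
        /* Index by client, by fd, and globally, mirroring the three RB_INSERTs. */
        reg->next_in_client = client->registrations; client->registrations = reg;
        reg->next_in_fd     = fd->flows;             fd->flows = reg;
        reg->next_global    = sketch_global_list;    sketch_global_list = reg;

        reg->client = client;
        reg->flow_result_read = true;   /* nothing to read until a result is assigned */
        return reg;
}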
+static void
+necp_client_add_socket_flow(struct necp_client_flow_registration *flow_registration,
+                                                       struct inpcb *inp)
+{
+       struct necp_client_flow *new_flow = mcache_alloc(necp_flow_cache, MCR_SLEEP);
+       if (new_flow == NULL) {
+               NECPLOG0(LOG_ERR, "Failed to allocate socket flow");
+               return;
+       }
+
+       memset(new_flow, 0, sizeof(*new_flow));
+
+       new_flow->socket = TRUE;
+       new_flow->u.socket_handle = inp;
+       new_flow->u.cb = inp->necp_cb;
+
+       OSIncrementAtomic(&necp_socket_flow_count);
+
+       LIST_INSERT_HEAD(&flow_registration->flow_list, new_flow, flow_chain);
+}
+
 int
 necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
 {
        int error = 0;
+       struct necp_fd_data *client_fd = NULL;
        bool found_client = FALSE;
 
-       NECP_CLIENT_TREE_LOCK_SHARED();
-
-       struct necp_client find;
-       uuid_copy(find.client_id, client_id);
-       struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
-       if (client != NULL) {
-               NECP_CLIENT_LOCK(client);
+       NECP_FD_LIST_LOCK_SHARED();
+       LIST_FOREACH(client_fd, &necp_fd_list, chain) {
+               NECP_FD_LOCK(client_fd);
+               struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
+               if (client != NULL) {
+                       if (!pid || client->proc_pid == pid) {
+                               struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                               if (flow_registration != NULL) {
+                                       // Found the right client and flow registration, add a new flow
+                                       found_client = TRUE;
+                                       necp_client_add_socket_flow(flow_registration, inp);
+                               } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) {
+                                       // No flows yet on this client, add a new registration
+                                       flow_registration = necp_client_create_flow_registration(client_fd, client);
+                                       if (flow_registration == NULL) {
+                                               error = ENOMEM;
+                                       } else {
+                                               // Add a new flow
+                                               found_client = TRUE;
+                                               necp_client_add_socket_flow(flow_registration, inp);
+                                       }
+                               }
+                       }
 
-               if (!pid || client->proc_pid == pid) {
-                       // Found the right client!
-                       found_client = TRUE;
-                       necp_client_add_socket_flow(client, inp);
+                       NECP_CLIENT_UNLOCK(client);
                }
+               NECP_FD_UNLOCK(client_fd);
 
-               NECP_CLIENT_UNLOCK(client);
+               if (found_client) {
+                       break;
+               }
        }
-
-       NECP_CLIENT_TREE_UNLOCK();
+       NECP_FD_LIST_UNLOCK();
 
        if (!found_client) {
                error = ENOENT;
        } else {
-               /*
-                * Count the sockets that have the NECP client UUID set
-                */
+               // Count the sockets that have the NECP client UUID set
                struct socket *so = inp->inp_socket;
                if (!(so->so_flags1 & SOF1_HAS_NECP_CLIENT_UUID)) {
                        so->so_flags1 |= SOF1_HAS_NECP_CLIENT_UUID;
@@ -1707,62 +2164,192 @@ necp_client_register_socket_flow(pid_t pid, uuid_t client_id, struct inpcb *inp)
 }
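/*
 * Editor's sketch (illustrative, not part of this commit): both
 * necp_client_register_socket_flow() above and
 * necp_client_register_multipath_cb() below now walk the per-fd client list
 * rather than the global client tree: find the client by UUID under the fd
 * lock, require a pid match when a pid is supplied, then reuse the flow
 * registration addressed by client_id or create the client's first
 * registration when it has none and client_id names the client itself.
 * A condensed user-space model of that control flow (locking and the ENOMEM
 * path elided); sk_* names are hypothetical.
 */
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

typedef unsigned char sk_uuid_t[16];

struct sk_client {
        sk_uuid_t client_id;
        int       proc_pid;
        int       num_registrations;    /* stands in for the flow_registrations tree */
};

static int
sk_register_flow(struct sk_client *clients, size_t count,
    const sk_uuid_t client_id, int pid, bool id_names_a_flow)
{
        for (size_t i = 0; i < count; i++) {
                struct sk_client *client = &clients[i];
                if (memcmp(client->client_id, client_id, sizeof(sk_uuid_t)) != 0 ||
                    (pid && client->proc_pid != pid)) {
                        continue;
                }
                if (client->num_registrations > 0 || id_names_a_flow) {
                        /* Reuse the registration addressed by client_id (lookup elided). */
                        return (client->num_registrations > 0) ? 0 : ENOENT;
                }
                /* First flow on this client: create its initial registration. */
                client->num_registrations = 1;
                return 0;
        }
        return ENOENT;
}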
 
 static void
-necp_client_add_multipath_cb(struct necp_client *client, struct mppcb *mpp)
+necp_client_add_multipath_interface_flows(struct necp_client_flow_registration *flow_registration,
+                                                                                 struct necp_client *client,
+                                                                                 struct mppcb *mpp)
 {
-       struct necp_client_flow *flow = NULL;
+       flow_registration->interface_handle = mpp;
+       flow_registration->interface_cb = mpp->necp_cb;
 
-       client->interface_handle = mpp;
-       client->interface_cb = mpp->necp_cb;
+       proc_t proc = proc_find(client->proc_pid);
+       if (proc == PROC_NULL) {
+               return;
+       }
 
-       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-               if (flow->nexus || flow->socket) {
+       // Traverse all interfaces and add a tracking flow if needed
+       necp_flow_add_interface_flows(proc, client, flow_registration, true);
+
+       proc_rele(proc);
+       proc = PROC_NULL;
+}
+
+int
+necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp)
+{
+       int error = 0;
+       struct necp_fd_data *client_fd = NULL;
+       bool found_client = FALSE;
+
+       NECP_FD_LIST_LOCK_SHARED();
+       LIST_FOREACH(client_fd, &necp_fd_list, chain) {
+               NECP_FD_LOCK(client_fd);
+               struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
+               if (client != NULL) {
+                       if (!pid || client->proc_pid == pid) {
+                               struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                               if (flow_registration != NULL) {
+                                       // Found the right client and flow registration, add a new flow
+                                       found_client = TRUE;
+                                       necp_client_add_multipath_interface_flows(flow_registration, client, mpp);
+                               } else if (RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) {
+                                       // No flows yet on this client, add a new registration
+                                       flow_registration = necp_client_create_flow_registration(client_fd, client);
+                                       if (flow_registration == NULL) {
+                                               error = ENOMEM;
+                                       } else {
+                                               // Add a new flow
+                                               found_client = TRUE;
+                                               necp_client_add_multipath_interface_flows(flow_registration, client, mpp);
+                                       }
+                               }
+                       }
+
+                       NECP_CLIENT_UNLOCK(client);
+               }
+               NECP_FD_UNLOCK(client_fd);
+
+               if (found_client) {
+                       break;
+               }
+       }
+       NECP_FD_LIST_UNLOCK();
+
+       if (!found_client && error == 0) {
+               error = ENOENT;
+       }
+
+       return (error);
+}
+
+#define        NETAGENT_DOMAIN_RADIO_MANAGER   "WirelessRadioManager"
+#define        NETAGENT_TYPE_RADIO_MANAGER     "WirelessRadioManager:BB Manager"
+
+static int
+necp_client_lookup_bb_radio_manager(struct necp_client *client,
+                                   uuid_t netagent_uuid)
+{
+       char netagent_domain[NETAGENT_DOMAINSIZE];
+       char netagent_type[NETAGENT_TYPESIZE];
+       struct necp_aggregate_result result;
+       proc_t proc;
+       int error;
+
+       proc = proc_find(client->proc_pid);
+       if (proc == PROC_NULL) {
+               return ESRCH;
+       }
+
+       error = necp_application_find_policy_match_internal(proc, client->parameters, (u_int32_t)client->parameters_length,
+                               &result, NULL, 0, NULL, NULL, NULL, true);
+
+       proc_rele(proc);
+       proc = PROC_NULL;
+
+       if (error) {
+               return error;
+       }
+
+       for (int i = 0; i < NECP_MAX_NETAGENTS; i++) {
+               if (uuid_is_null(result.netagents[i])) {
+                       // Passed end of valid agents
+                       break;
+               }
+
+               memset(&netagent_domain, 0, NETAGENT_DOMAINSIZE);
+               memset(&netagent_type, 0, NETAGENT_TYPESIZE);
+               if (netagent_get_agent_domain_and_type(result.netagents[i], netagent_domain, netagent_type) == FALSE) {
+                       continue;
+               }
+
+               if (strncmp(netagent_domain, NETAGENT_DOMAIN_RADIO_MANAGER, NETAGENT_DOMAINSIZE) != 0) {
                        continue;
                }
 
-               flow->u.socket_handle = mpp;
-               flow->u.cb = mpp->necp_cb;
+               if (strncmp(netagent_type, NETAGENT_TYPE_RADIO_MANAGER, NETAGENT_TYPESIZE) != 0) {
+                       continue;
+               }
+
+               uuid_copy(netagent_uuid, result.netagents[i]);
 
-               if (flow->viable && flow->u.cb) {
-                       flow->u.cb(mpp, NECP_CLIENT_CBACTION_INITIAL, flow);
+               break;
+       }
+
+       return 0;
+}
+
+static int
+necp_client_assert_bb_radio_manager_common(struct necp_client *client, bool assert)
+{
+       uuid_t netagent_uuid;
+       uint8_t assert_type;
+       int error;
+
+       error = necp_client_lookup_bb_radio_manager(client, netagent_uuid);
+       if (error) {
+               NECPLOG0(LOG_ERR, "BB radio manager agent not found");
+               return error;
+       }
+
+       // Before unasserting, verify that the assertion was already taken
+       if (assert == FALSE) {
+               assert_type = NETAGENT_MESSAGE_TYPE_CLIENT_UNASSERT;
+
+               if (!necp_client_remove_assertion(client, netagent_uuid)) {
+                       return EINVAL;
                }
+       } else {
+               assert_type = NETAGENT_MESSAGE_TYPE_CLIENT_ASSERT;
+       }
+
+       error = netagent_client_message(netagent_uuid, client->client_id, client->proc_pid, client->agent_handle, assert_type);
+       if (error) {
+               NECPLOG0(LOG_ERR, "netagent_client_message failed");
+               return error;
        }
+
+       // Only save the assertion if the action succeeded
+       if (assert == TRUE) {
+               necp_client_add_assertion(client, netagent_uuid);
+       }
+
+       return 0;
 }
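/*
 * Editor's sketch (illustrative, not part of this commit):
 * necp_client_assert_bb_radio_manager_common() above orders its bookkeeping
 * around the netagent message -- an unassert is refused unless a matching
 * assertion is already recorded (and the record is dropped up front), while
 * an assert is only recorded after the agent accepted the message. A minimal
 * model of that ordering; sk2_* names are hypothetical and
 * sk2_send_agent_message() stands in for netagent_client_message().
 */
#include <errno.h>
#include <stdbool.h>

static bool sk2_assertion_recorded;     /* stands in for the client's assertion list */

static int
sk2_send_agent_message(bool assert)
{
        (void)assert;
        return 0;                       /* hypothetical: always succeeds */
}

static int
sk2_assert_radio_manager(bool assert)
{
        if (!assert) {
                if (!sk2_assertion_recorded) {
                        return EINVAL;  /* nothing to unassert */
                }
                sk2_assertion_recorded = false; /* drop the record before messaging */
        }
        int error = sk2_send_agent_message(assert);
        if (error != 0) {
                return error;           /* a failed assert is never recorded */
        }
        if (assert) {
                sk2_assertion_recorded = true;  /* record only after success */
        }
        return 0;
}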
 
 int
-necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp)
+necp_client_assert_bb_radio_manager(uuid_t client_id, bool assert)
 {
+       struct necp_client *client;
        int error = 0;
-       bool found_client = FALSE;
 
        NECP_CLIENT_TREE_LOCK_SHARED();
 
-       struct necp_client find;
-       uuid_copy(find.client_id, client_id);
-       struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
-       if (client != NULL) {
-               NECP_CLIENT_LOCK(client);
+       client = necp_find_client_and_lock(client_id);
 
-               if (!pid || client->proc_pid == pid) {
-                       // Found the right client!
-                       found_client = TRUE;
-                       necp_client_add_multipath_cb(client, mpp);
-               }
+       if (client) {
+               // Found the right client!
+               error = necp_client_assert_bb_radio_manager_common(client, assert);
 
                NECP_CLIENT_UNLOCK(client);
+       } else {
+               NECPLOG0(LOG_ERR, "Couldn't find client");
+               error = ENOENT;
        }
 
        NECP_CLIENT_TREE_UNLOCK();
 
-       if (!found_client) {
-               error = ENOENT;
-       }
-
        return (error);
 }
 
-#define        NETAGENT_DOMAIN_NETEXT  "NetworkExtension"
-#define        NETAGENT_TYPE_PATHCTRL  "PathController"
-
 static int
 necp_client_unregister_socket_flow(uuid_t client_id, void *handle)
 {
@@ -1777,22 +2364,26 @@ necp_client_unregister_socket_flow(uuid_t client_id, void *handle)
 
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
-                       // Found the right client!
-                       found_client = TRUE;
+                       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                       if (flow_registration != NULL) {
+                               // Found the right client and flow!
+                               found_client = TRUE;
 
-                       // Remove flow assignment
-                       struct necp_client_flow *search_flow = NULL;
-                       struct necp_client_flow *temp_flow = NULL;
-                       LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) {
-                               if (search_flow->socket && search_flow->u.socket_handle == handle) {
-                                       if (search_flow->assigned_results != NULL) {
-                                               FREE(search_flow->assigned_results, M_NETAGENT);
-                                               search_flow->assigned_results = NULL;
+                               // Remove flow assignment
+                               struct necp_client_flow *search_flow = NULL;
+                               struct necp_client_flow *temp_flow = NULL;
+                               LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) {
+                                       if (search_flow->socket && search_flow->u.socket_handle == handle) {
+                                               if (search_flow->assigned_results != NULL) {
+                                                       FREE(search_flow->assigned_results, M_NETAGENT);
+                                                       search_flow->assigned_results = NULL;
+                                               }
+                                               client_updated = TRUE;
+                                               flow_registration->flow_result_read = FALSE;
+                                               LIST_REMOVE(search_flow, flow_chain);
+                                               OSDecrementAtomic(&necp_socket_flow_count);
+                                               mcache_free(necp_flow_cache, search_flow);
                                        }
-                                       client_updated = TRUE;
-                                       LIST_REMOVE(search_flow, flow_chain);
-                                       OSDecrementAtomic(&necp_socket_flow_count);
-                                       mcache_free(necp_flow_cache, search_flow);
                                }
                        }
 
@@ -1800,7 +2391,6 @@ necp_client_unregister_socket_flow(uuid_t client_id, void *handle)
                }
 
                if (client_updated) {
-                       client->flow_result_read = FALSE;
                        necp_fd_notify(client_fd, true);
                }
                NECP_FD_UNLOCK(client_fd);
@@ -1826,28 +2416,27 @@ necp_client_unregister_multipath_cb(uuid_t client_id, void *handle)
 
        NECP_CLIENT_TREE_LOCK_SHARED();
 
-       struct necp_client find;
-       uuid_copy(find.client_id, client_id);
-       struct necp_client *client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
+       struct necp_client *client = necp_find_client_and_lock(client_id);
        if (client != NULL) {
-               NECP_CLIENT_LOCK(client);
-
-               // Found the right client!
-               found_client = TRUE;
+               struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+               if (flow_registration != NULL) {
+                       // Found the right client and flow!
+                       found_client = TRUE;
 
-               // Remove flow assignment
-               struct necp_client_flow *search_flow = NULL;
-               struct necp_client_flow *temp_flow = NULL;
-               LIST_FOREACH_SAFE(search_flow, &client->flow_list, flow_chain, temp_flow) {
-                       if (!search_flow->socket && !search_flow->nexus &&
-                               search_flow->u.socket_handle == handle) {
-                               search_flow->u.socket_handle = NULL;
-                               search_flow->u.cb = NULL;
+                       // Remove flow assignment
+                       struct necp_client_flow *search_flow = NULL;
+                       struct necp_client_flow *temp_flow = NULL;
+                       LIST_FOREACH_SAFE(search_flow, &flow_registration->flow_list, flow_chain, temp_flow) {
+                               if (!search_flow->socket && !search_flow->nexus &&
+                                       search_flow->u.socket_handle == handle) {
+                                       search_flow->u.socket_handle = NULL;
+                                       search_flow->u.cb = NULL;
+                               }
                        }
-               }
 
-               client->interface_handle = NULL;
-               client->interface_cb = NULL;
+                       flow_registration->interface_handle = NULL;
+                       flow_registration->interface_cb = NULL;
+               }
 
                NECP_CLIENT_UNLOCK(client);
        }
@@ -1884,58 +2473,68 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp)
 
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
-                       // Found the right client!
-                       found_client = TRUE;
-
-                       struct necp_client_flow *flow = NULL;
-                       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-                               if (flow->socket && flow->u.socket_handle == inp) {
-                                       // Release prior results and route
-                                       if (flow->assigned_results != NULL) {
-                                               FREE(flow->assigned_results, M_NETAGENT);
-                                               flow->assigned_results = NULL;
-                                       }
+                       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                       if (flow_registration == NULL && RB_EMPTY(&client->flow_registrations) && !necp_client_id_is_flow(client_id)) {
+                               // No flows yet on this client, add a new registration
+                               flow_registration = necp_client_create_flow_registration(client_fd, client);
+                               if (flow_registration == NULL) {
+                                       error = ENOMEM;
+                               }
+                       }
+                       if (flow_registration != NULL) {
+                               // Found the right client and flow!
+                               found_client = TRUE;
 
-                                       ifnet_t ifp = NULL;
-                                       if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp) {
-                                               ifp = inp->inp_boundifp;
-                                       } else {
-                                               ifp = inp->inp_last_outifp;
-                                       }
+                               struct necp_client_flow *flow = NULL;
+                               LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
+                                       if (flow->socket && flow->u.socket_handle == inp) {
+                                               // Release prior results and route
+                                               if (flow->assigned_results != NULL) {
+                                                       FREE(flow->assigned_results, M_NETAGENT);
+                                                       flow->assigned_results = NULL;
+                                               }
 
-                                       if (ifp != NULL) {
-                                               flow->interface_index = ifp->if_index;
-                                       } else {
-                                               flow->interface_index = IFSCOPE_NONE;
-                                       }
+                                               ifnet_t ifp = NULL;
+                                               if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp) {
+                                                       ifp = inp->inp_boundifp;
+                                               } else {
+                                                       ifp = inp->inp_last_outifp;
+                                               }
 
-                                       if (inp->inp_vflag & INP_IPV4) {
-                                               flow->local_addr.sin.sin_family = AF_INET;
-                                               flow->local_addr.sin.sin_len = sizeof(struct sockaddr_in);
-                                               flow->local_addr.sin.sin_port = inp->inp_lport;
-                                               memcpy(&flow->local_addr.sin.sin_addr, &inp->inp_laddr, sizeof(struct in_addr));
-
-                                               flow->remote_addr.sin.sin_family = AF_INET;
-                                               flow->remote_addr.sin.sin_len = sizeof(struct sockaddr_in);
-                                               flow->remote_addr.sin.sin_port = inp->inp_fport;
-                                               memcpy(&flow->remote_addr.sin.sin_addr, &inp->inp_faddr, sizeof(struct in_addr));
-                                       } else if (inp->inp_vflag & INP_IPV6) {
-                                               in6_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, &flow->local_addr.sin6, sizeof(flow->local_addr));
-                                               in6_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, &flow->remote_addr.sin6, sizeof(flow->remote_addr));
-                                       }
+                                               if (ifp != NULL) {
+                                                       flow->interface_index = ifp->if_index;
+                                               } else {
+                                                       flow->interface_index = IFSCOPE_NONE;
+                                               }
 
-                                       flow->viable = necp_client_flow_is_viable(proc, client, flow);
+                                               if (inp->inp_vflag & INP_IPV4) {
+                                                       flow->local_addr.sin.sin_family = AF_INET;
+                                                       flow->local_addr.sin.sin_len = sizeof(struct sockaddr_in);
+                                                       flow->local_addr.sin.sin_port = inp->inp_lport;
+                                                       memcpy(&flow->local_addr.sin.sin_addr, &inp->inp_laddr, sizeof(struct in_addr));
+
+                                                       flow->remote_addr.sin.sin_family = AF_INET;
+                                                       flow->remote_addr.sin.sin_len = sizeof(struct sockaddr_in);
+                                                       flow->remote_addr.sin.sin_port = inp->inp_fport;
+                                                       memcpy(&flow->remote_addr.sin.sin_addr, &inp->inp_faddr, sizeof(struct in_addr));
+                                               } else if (inp->inp_vflag & INP_IPV6) {
+                                                       in6_ip6_to_sockaddr(&inp->in6p_laddr, inp->inp_lport, &flow->local_addr.sin6, sizeof(flow->local_addr));
+                                                       in6_ip6_to_sockaddr(&inp->in6p_faddr, inp->inp_fport, &flow->remote_addr.sin6, sizeof(flow->remote_addr));
+                                               }
 
-                                       uuid_t empty_uuid;
-                                       uuid_clear(empty_uuid);
-                                       flow->assigned = TRUE;
-                                       flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0,
-                                                                                                                                                         (struct necp_client_endpoint *)&flow->local_addr,
-                                                                                                                                                         (struct necp_client_endpoint *)&flow->remote_addr,
-                                                                                                                                                         0, &flow->assigned_results_length);
-                                       client->flow_result_read = FALSE;
-                                       client_updated = TRUE;
-                                       break;
+                                               flow->viable = necp_client_flow_is_viable(proc, client, flow);
+
+                                               uuid_t empty_uuid;
+                                               uuid_clear(empty_uuid);
+                                               flow->assigned = TRUE;
+                                               flow->assigned_results = necp_create_nexus_assign_message(empty_uuid, 0, NULL, 0,
+                                                                                                                                                                 (struct necp_client_endpoint *)&flow->local_addr,
+                                                                                                                                                                 (struct necp_client_endpoint *)&flow->remote_addr,
+                                                                                                                                                                 0, NULL, &flow->assigned_results_length);
+                                               flow_registration->flow_result_read = FALSE;
+                                               client_updated = TRUE;
+                                               break;
+                                       }
                                }
                        }
 
@@ -1955,10 +2554,12 @@ necp_client_assign_from_socket(pid_t pid, uuid_t client_id, struct inpcb *inp)
        }
        NECP_FD_LIST_UNLOCK();
 
-       if (!found_client) {
-               error = ENOENT;
-       } else if (!client_updated) {
-               error = EINVAL;
+       if (error == 0) {
+               if (!found_client) {
+                       error = ENOENT;
+               } else if (!client_updated) {
+                       error = EINVAL;
+               }
        }
 
        return (error);
@@ -1985,22 +2586,25 @@ necp_update_flow_protoctl_event(uuid_t netagent_uuid, uuid_t client_id,
 
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
-                       /* Found the right client! */
-                       found_client = TRUE;
+                       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                       if (flow_registration != NULL) {
+                               // Found the right client and flow!
+                               found_client = TRUE;
 
-                       struct necp_client_flow *flow = NULL;
-                       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-                               // Verify that the client nexus agent matches
-                               if (flow->nexus &&
-                                   uuid_compare(flow->u.nexus_agent,
-                                   netagent_uuid) == 0) {
-                                       flow->has_protoctl_event = TRUE;
-                                       flow->protoctl_event.protoctl_event_code = protoctl_event_code;
-                                       flow->protoctl_event.protoctl_event_val = protoctl_event_val;
-                                       flow->protoctl_event.protoctl_event_tcp_seq_num = protoctl_event_tcp_seq_number;
-                                       client->flow_result_read = FALSE;
-                                       client_updated = TRUE;
-                                       break;
+                               struct necp_client_flow *flow = NULL;
+                               LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
+                                       // Verify that the client nexus agent matches
+                                       if (flow->nexus &&
+                                               uuid_compare(flow->u.nexus_agent,
+                                               netagent_uuid) == 0) {
+                                               flow->has_protoctl_event = TRUE;
+                                               flow->protoctl_event.protoctl_event_code = protoctl_event_code;
+                                               flow->protoctl_event.protoctl_event_val = protoctl_event_val;
+                                               flow->protoctl_event.protoctl_event_tcp_seq_num = protoctl_event_tcp_seq_number;
+                                               flow_registration->flow_result_read = FALSE;
+                                               client_updated = TRUE;
+                                               break;
+                                       }
                                }
                        }
 
@@ -2033,6 +2637,7 @@ static bool
 necp_assign_client_result_locked(struct proc *proc,
                                                                 struct necp_fd_data *client_fd,
                                                                 struct necp_client *client,
+                                                                struct necp_client_flow_registration *flow_registration,
                                                                 uuid_t netagent_uuid,
                                                                 u_int8_t *assigned_results,
                                                                 size_t assigned_results_length,
@@ -2044,7 +2649,7 @@ necp_assign_client_result_locked(struct proc *proc,
        NECP_CLIENT_ASSERT_LOCKED(client);
 
        struct necp_client_flow *flow = NULL;
-       LIST_FOREACH(flow, &client->flow_list, flow_chain) {
+       LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
                // Verify that the client nexus agent matches
                if (flow->nexus &&
                        uuid_compare(flow->u.nexus_agent, netagent_uuid) == 0) {
@@ -2054,9 +2659,10 @@ necp_assign_client_result_locked(struct proc *proc,
                                flow->assigned_results = NULL;
                        }
 
+                       void *nexus_stats = NULL;
                        if (assigned_results != NULL && assigned_results_length > 0) {
                                int error = necp_client_parse_result(assigned_results, (u_int32_t)assigned_results_length,
-                                                                                                &flow->local_addr, &flow->remote_addr);
+                                                                                                &flow->local_addr, &flow->remote_addr, &nexus_stats);
                                VERIFY(error == 0);
                        }
 
@@ -2065,7 +2671,7 @@ necp_assign_client_result_locked(struct proc *proc,
                        flow->assigned = TRUE;
                        flow->assigned_results = assigned_results;
                        flow->assigned_results_length = assigned_results_length;
-                       client->flow_result_read = FALSE;
+                       flow_registration->flow_result_read = FALSE;
                        client_updated = TRUE;
                        break;
                }
@@ -2099,12 +2705,14 @@ necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id,
                NECP_FD_LOCK(client_fd);
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
-                       // Found the right client!
-                       found_client = TRUE;
-
-                       if (necp_assign_client_result_locked(proc, client_fd, client, netagent_uuid,
-                                                                                                assigned_results, assigned_results_length, true)) {
-                               client_updated = TRUE;
+                       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+                       if (flow_registration != NULL) {
+                               // Found the right client and flow!
+                               found_client = TRUE;
+                               if (necp_assign_client_result_locked(proc, client_fd, client, flow_registration, netagent_uuid,
+                                                                                                        assigned_results, assigned_results_length, true)) {
+                                       client_updated = TRUE;
+                               }
                        }
 
                        NECP_CLIENT_UNLOCK(client);
@@ -2114,36 +2722,251 @@ necp_assign_client_result(uuid_t netagent_uuid, uuid_t client_id,
                proc_rele(proc);
                proc = PROC_NULL;
 
-               if (found_client) {
-                       break;
+               if (found_client) {
+                       break;
+               }
+       }
+
+       NECP_FD_LIST_UNLOCK();
+
+       // Upon error, the caller must free assigned_results
+       if (!found_client) {
+               error = ENOENT;
+       } else if (!client_updated) {
+               error = EINVAL;
+       }
+
+       return (error);
+}
+
+/// Client updating
+
+static bool
+necp_update_parsed_parameters(struct necp_client_parsed_parameters *parsed_parameters,
+                                                         struct necp_aggregate_result *result)
+{
+       if (parsed_parameters == NULL ||
+               result == NULL) {
+               return (false);
+       }
+
+       bool updated = false;
+       for (int i = 0; i < NECP_MAX_NETAGENTS; i++) {
+               if (uuid_is_null(result->netagents[i])) {
+                       // Passed end of valid agents
+                       break;
+               }
+
+               if (!(result->netagent_use_flags[i] & NECP_AGENT_USE_FLAG_SCOPE)) {
+                       // Not a scoped agent, ignore
+                       continue;
+               }
+
+               // This is a scoped agent. Add it to the required agents.
+               if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) {
+                       // Already some required agents, add this at the end
+                       for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) {
+                               if (uuid_compare(parsed_parameters->required_netagents[j], result->netagents[i]) == 0) {
+                                       // Already required, break
+                                       break;
+                               }
+                               if (uuid_is_null(parsed_parameters->required_netagents[j])) {
+                                       // Add here
+                                       memcpy(&parsed_parameters->required_netagents[j], result->netagents[i], sizeof(uuid_t));
+                                       updated = true;
+                                       break;
+                               }
+                       }
+               } else {
+                       // No required agents yet, add this one
+                       parsed_parameters->valid_fields |= NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT;
+                       memcpy(&parsed_parameters->required_netagents[0], result->netagents[i], sizeof(uuid_t));
+                       updated = true;
+               }
+
+               // Remove requirements for agents of the same type
+               if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) {
+                       char remove_agent_domain[NETAGENT_DOMAINSIZE] = { 0 };
+                       char remove_agent_type[NETAGENT_TYPESIZE] = { 0 };
+                       if (netagent_get_agent_domain_and_type(result->netagents[i], remove_agent_domain, remove_agent_type)) {
+                               for (int j = 0; j < NECP_MAX_PARSED_PARAMETERS; j++) {
+                                       if (strlen(parsed_parameters->required_netagent_types[j].netagent_domain) == 0 &&
+                                               strlen(parsed_parameters->required_netagent_types[j].netagent_type) == 0) {
+                                               break;
+                                       }
+
+                                       if (strncmp(parsed_parameters->required_netagent_types[j].netagent_domain, remove_agent_domain, NETAGENT_DOMAINSIZE) == 0 &&
+                                               strncmp(parsed_parameters->required_netagent_types[j].netagent_type, remove_agent_type, NETAGENT_TYPESIZE) == 0) {
+
+                                               updated = true;
+
+                                               if (j == NECP_MAX_PARSED_PARAMETERS - 1) {
+                                                       // Last field, just clear and break
+                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
+                                                       break;
+                                               } else {
+                                                       // Move the parameters down, clear the last entry
+                                                       memmove(&parsed_parameters->required_netagent_types[j],
+                                                                       &parsed_parameters->required_netagent_types[j + 1],
+                                                                       sizeof(struct necp_client_parameter_netagent_type) * (NECP_MAX_PARSED_PARAMETERS - (j + 1)));
+                                                       memset(&parsed_parameters->required_netagent_types[NECP_MAX_PARSED_PARAMETERS - 1], 0, sizeof(struct necp_client_parameter_netagent_type));
+                                                       // Continue, don't increment but look at the new shifted item instead
+                                                       continue;
+                                               }
+                                       }
+
+                                       // Increment j to look at the next agent type parameter
+                                       j++;
+                               }
+                       }
+               }
+       }
+
+       if (updated &&
+               parsed_parameters->required_interface_index != IFSCOPE_NONE &&
+               (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IF) == 0) {
+               // A required interface index was added after the fact. Clear it.
+               parsed_parameters->required_interface_index = IFSCOPE_NONE;
+       }
+
+       return (updated);
+}
+
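/*
 * Editor's sketch (illustrative, not part of this commit): when
 * necp_update_parsed_parameters() above promotes a scoped agent into the
 * required-agent list, it removes any required agent-*type* entry of the same
 * domain/type by sliding the fixed-size array down one slot and clearing the
 * last entry. A standalone model of that removal step; sk3_* names and sizes
 * are hypothetical.
 */
#include <stddef.h>
#include <string.h>

#define SK3_MAX 16

struct sk3_agent_type { char domain[32]; char type[32]; };

static void
sk3_remove_entry(struct sk3_agent_type list[SK3_MAX], size_t index)
{
        if (index >= SK3_MAX) {
                return;
        }
        if (index < SK3_MAX - 1) {
                /* Slide the tail of the array down over the removed slot... */
                memmove(&list[index], &list[index + 1],
                    sizeof(struct sk3_agent_type) * (SK3_MAX - (index + 1)));
        }
        /* ...and clear the now-duplicated last entry. */
        memset(&list[SK3_MAX - 1], 0, sizeof(struct sk3_agent_type));
}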
+static inline bool
+necp_agent_types_match(const char *agent_domain1, const char *agent_type1,
+                                          const char *agent_domain2, const char *agent_type2)
+{
+       return ((strlen(agent_domain1) == 0 ||
+                        strncmp(agent_domain2, agent_domain1, NETAGENT_DOMAINSIZE) == 0) &&
+                       (strlen(agent_type1) == 0 ||
+                        strncmp(agent_type2, agent_type1, NETAGENT_TYPESIZE) == 0));
+}
+
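/*
 * Editor's note (illustrative, not part of this commit): in
 * necp_agent_types_match() above an empty domain or type string acts as a
 * wildcard, so a requirement may pin the domain, the type, both, or neither.
 * A user-space restatement of the predicate with a few example checks;
 * sk4_* names are hypothetical and the sizes stand in for
 * NETAGENT_DOMAINSIZE/NETAGENT_TYPESIZE.
 */
#include <assert.h>
#include <stdbool.h>
#include <string.h>

#define SK4_DOMAINSIZE 32
#define SK4_TYPESIZE   32

static bool
sk4_types_match(const char *want_domain, const char *want_type,
    const char *have_domain, const char *have_type)
{
        return ((strlen(want_domain) == 0 ||
                 strncmp(have_domain, want_domain, SK4_DOMAINSIZE) == 0) &&
                (strlen(want_type) == 0 ||
                 strncmp(have_type, want_type, SK4_TYPESIZE) == 0));
}

int
main(void)
{
        assert(sk4_types_match("", "", "WirelessRadioManager", "anything"));
        assert(sk4_types_match("WirelessRadioManager", "", "WirelessRadioManager", "BB"));
        assert(!sk4_types_match("WirelessRadioManager", "BB", "WirelessRadioManager", "Other"));
        return 0;
}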
+static inline bool
+necp_calculate_client_result(proc_t proc,
+                                                        struct necp_client *client,
+                                                        struct necp_client_parsed_parameters *parsed_parameters,
+                                                        struct necp_aggregate_result *result,
+                                                        u_int32_t *flags)
+{
+       struct rtentry *route = NULL;
+
+       // Check parameters to find best interface
+       bool validate_agents = false;
+       u_int matching_if_index = 0;
+       if (necp_find_matching_interface_index(parsed_parameters, &matching_if_index, &validate_agents)) {
+               if (matching_if_index != 0) {
+                       parsed_parameters->required_interface_index = matching_if_index;
+               }
+               // Interface found or not needed, match policy.
+               memset(result, 0, sizeof(*result));
+               int error = necp_application_find_policy_match_internal(proc, client->parameters,
+                                                                                                                               (u_int32_t)client->parameters_length,
+                                                                                                                               result, flags, matching_if_index,
+                                                                                                                               NULL, NULL, &route, false);
+               if (error != 0) {
+                       if (route != NULL) {
+                               rtfree(route);
+                       }
+                       return (FALSE);
+               }
+
+               if (validate_agents) {
+                       bool requirement_failed = FALSE;
+                       if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT) {
+                               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                                       if (uuid_is_null(parsed_parameters->required_netagents[i])) {
+                                               break;
+                                       }
+
+                                       bool requirement_found = FALSE;
+                                       for (int j = 0; j < NECP_MAX_NETAGENTS; j++) {
+                                               if (uuid_is_null(result->netagents[j])) {
+                                                       break;
+                                               }
+
+                                               if (uuid_compare(parsed_parameters->required_netagents[i], result->netagents[j]) == 0) {
+                                                       requirement_found = TRUE;
+                                                       break;
+                                               }
+                                       }
+
+                                       if (!requirement_found) {
+                                               requirement_failed = TRUE;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if (!requirement_failed && parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE) {
+                               for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                                       if (strlen(parsed_parameters->required_netagent_types[i].netagent_domain) == 0 &&
+                                               strlen(parsed_parameters->required_netagent_types[i].netagent_type) == 0) {
+                                               break;
+                                       }
+
+                                       bool requirement_found = FALSE;
+                                       for (int j = 0; j < NECP_MAX_NETAGENTS; j++) {
+                                               if (uuid_is_null(result->netagents[j])) {
+                                                       break;
+                                               }
+
+                                               char policy_agent_domain[NETAGENT_DOMAINSIZE] = { 0 };
+                                               char policy_agent_type[NETAGENT_TYPESIZE] = { 0 };
+
+                                               if (netagent_get_agent_domain_and_type(result->netagents[j], policy_agent_domain, policy_agent_type)) {
+                                                       if (necp_agent_types_match(parsed_parameters->required_netagent_types[i].netagent_domain,
+                                                                                                          parsed_parameters->required_netagent_types[i].netagent_type,
+                                                                                                          policy_agent_domain, policy_agent_type)) {
+                                                               requirement_found = TRUE;
+                                                               break;
+                                                       }
+                                               }
+                                       }
+
+                                       if (!requirement_found) {
+                                               requirement_failed = TRUE;
+                                               break;
+                                       }
+                               }
+                       }
+
+                       if (requirement_failed) {
+                               // Agent requirement failed. Clear out the whole result, make everything fail.
+                               memset(result, 0, sizeof(*result));
+                               if (route != NULL) {
+                                       rtfree(route);
+                               }
+                               return (TRUE);
+                       }
                }
-       }
-
-       NECP_FD_LIST_UNLOCK();
 
-       // upon error, client must free assigned_results
-       if (!found_client) {
-               error = ENOENT;
-       } else if (!client_updated) {
-               error = EINVAL;
+               // Reset current route
+               NECP_CLIENT_ROUTE_LOCK(client);
+               if (client->current_route != NULL) {
+                       rtfree(client->current_route);
+               }
+               client->current_route = route;
+               NECP_CLIENT_ROUTE_UNLOCK(client);
+       } else {
+               // Interface not found. Clear out the whole result, make everything fail.
+               memset(result, 0, sizeof(*result));
        }
 
-       return (error);
+       return (TRUE);
 }
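/*
 * Editor's sketch (illustrative, not part of this commit): with
 * necp_calculate_client_result() factored out above, necp_update_client_result()
 * below runs the policy match at most twice -- once with the client's parsed
 * parameters, and once more only if necp_update_parsed_parameters() folded a
 * scoped agent from the first result back into the requirements. A condensed
 * control-flow model; sk5_* names are hypothetical stand-ins for those helpers.
 */
#include <stdbool.h>

struct sk5_params { int unused; };
struct sk5_result { int unused; };

static bool sk5_calculate(struct sk5_params *p, struct sk5_result *r) { (void)p; (void)r; return true; }
static bool sk5_fold_scoped_agents(struct sk5_params *p, struct sk5_result *r) { (void)p; (void)r; return false; }

static bool
sk5_update_result(struct sk5_params *params, struct sk5_result *result)
{
        if (!sk5_calculate(params, result)) {
                return false;
        }
        if (sk5_fold_scoped_agents(params, result)) {
                /* Parameters changed based on the first result: recalculate once. */
                if (!sk5_calculate(params, result)) {
                        return false;
                }
        }
        return true;
}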
 
-/// Client updating
-
 static bool
 necp_update_client_result(proc_t proc,
                                                  struct necp_fd_data *client_fd,
                                                  struct necp_client *client,
-                                                 struct _necp_client_defunct_list *defunct_list)
+                                                 struct _necp_flow_defunct_list *defunct_list)
 {
        struct necp_client_result_netagent netagent;
        struct necp_aggregate_result result;
        struct necp_client_parsed_parameters *parsed_parameters = NULL;
        u_int32_t flags = 0;
-       struct rtentry *route = NULL;
 
        NECP_CLIENT_ASSERT_LOCKED(client);
 
@@ -2165,35 +2988,18 @@ necp_update_client_result(proc_t proc,
        // Update saved IP protocol
        client->ip_protocol = parsed_parameters->ip_protocol;
 
-       // Check parameters to find best interface
-       u_int matching_if_index = 0;
-       if (necp_find_matching_interface_index(parsed_parameters, &matching_if_index)) {
-               if (matching_if_index != 0) {
-                       parsed_parameters->required_interface_index = matching_if_index;
-               }
-               // Interface found or not needed, match policy.
-               error = necp_application_find_policy_match_internal(proc, client->parameters,
-                                                                                                                       (u_int32_t)client->parameters_length,
-                                                                                                                       &result, &flags, matching_if_index,
-                                                                                                                       NULL, NULL, &route, false);
-               if (error != 0) {
-                       if (route != NULL) {
-                               rtfree(route);
-                       }
+       // Calculate the policy result
+       if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) {
+               FREE(parsed_parameters, M_NECP);
+               return (FALSE);
+       }
+
+       if (necp_update_parsed_parameters(parsed_parameters, &result)) {
+               // Changed the parameters based on result, try again (only once)
+               if (!necp_calculate_client_result(proc, client, parsed_parameters, &result, &flags)) {
                        FREE(parsed_parameters, M_NECP);
                        return (FALSE);
                }
-
-               // Reset current route
-               NECP_CLIENT_ROUTE_LOCK(client);
-               if (client->current_route != NULL) {
-                       rtfree(client->current_route);
-               }
-               client->current_route = route;
-               NECP_CLIENT_ROUTE_UNLOCK(client);
-       } else {
-               // Interface not found. Clear out the whole result, make everything fail.
-               memset(&result, 0, sizeof(result));
        }
 
        // Save the last policy id on the client
@@ -2223,9 +3029,6 @@ necp_update_client_result(proc_t proc,
        }
 
        // Recalculate flags
-       if (client->defunct) {
-               flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT;
-       }
        if (parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) {
                // Listeners are valid as long as they aren't dropped
                if (result.routing_result != NECP_KERNEL_POLICY_RESULT_DROP) {
@@ -2303,7 +3106,7 @@ necp_update_client_result(proc_t proc,
                }
                uuid_copy(netagent.netagent_uuid, result.netagents[i]);
                netagent.generation = netagent_get_generation(netagent.netagent_uuid);
-               if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, TRUE, 0, 0)) {
+               if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, TRUE, 0, 0)) {
                        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated,
                                                                                                                client->result, sizeof(client->result));
                }
@@ -2374,10 +3177,11 @@ necp_update_client_result(proc_t proc,
                TAILQ_FOREACH(multi_interface, &ifnet_ordered_head, if_ordered_link) {
                        if (necp_ifnet_matches_parameters(multi_interface, parsed_parameters, NULL, true)) {
                                // Add multipath interface flows for kernel MPTCP
-                               necp_client_add_interface_flow_if_needed(client, multi_interface->if_index);
+                               necp_client_add_interface_option_if_needed(client, multi_interface->if_index,
+                                                                                                                  ifnet_get_generation(multi_interface), NULL);
 
                                // Add nexus agents for multipath
-                               necp_client_add_agent_flows_for_interface(client, parsed_parameters, multi_interface);
+                               necp_client_add_agent_interface_options(client, parsed_parameters, multi_interface);
                        }
                }
        } else if ((parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_LISTENER) &&
@@ -2385,9 +3189,9 @@ necp_update_client_result(proc_t proc,
                // Get listener interface options from global list
                struct ifnet *listen_interface = NULL;
                TAILQ_FOREACH(listen_interface, &ifnet_head, if_link) {
-                       if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, false)) {
+                       if (necp_ifnet_matches_parameters(listen_interface, parsed_parameters, NULL, true)) {
                                // Add nexus agents for listeners
-                               necp_client_add_agent_flows_for_interface(client, parsed_parameters, listen_interface);
+                               necp_client_add_agent_interface_options(client, parsed_parameters, listen_interface);
                        }
                }
        }
@@ -2400,10 +3204,10 @@ necp_update_client_result(proc_t proc,
                                if (uuid_is_null(original_scoped_interface->if_agentids[i])) {
                                        continue;
                                }
-                               u_int16_t if_flags = nstat_ifnet_to_flags(original_scoped_interface);
                                uuid_copy(netagent.netagent_uuid, original_scoped_interface->if_agentids[i]);
                                netagent.generation = netagent_get_generation(netagent.netagent_uuid);
-                               if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, FALSE, original_scoped_interface->if_index, if_flags)) {
+                               if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, FALSE,
+                                                                                                       original_scoped_interface->if_index, ifnet_get_generation(original_scoped_interface))) {
                                        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated,
                                                                                                                                client->result, sizeof(client->result));
                                }
@@ -2418,10 +3222,10 @@ necp_update_client_result(proc_t proc,
                                if (uuid_is_null(direct_interface->if_agentids[i])) {
                                        continue;
                                }
-                               u_int16_t if_flags = nstat_ifnet_to_flags(direct_interface);
                                uuid_copy(netagent.netagent_uuid, direct_interface->if_agentids[i]);
                                netagent.generation = netagent_get_generation(netagent.netagent_uuid);
-                               if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, TRUE, direct_interface->if_index, if_flags)) {
+                               if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, TRUE,
+                                                                                                       direct_interface->if_index, ifnet_get_generation(direct_interface))) {
                                        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated,
                                                                                                                                client->result, sizeof(client->result));
                                }
@@ -2436,10 +3240,10 @@ necp_update_client_result(proc_t proc,
                                if (uuid_is_null(delegate_interface->if_agentids[i])) {
                                        continue;
                                }
-                               u_int16_t if_flags = nstat_ifnet_to_flags(delegate_interface);
                                uuid_copy(netagent.netagent_uuid, delegate_interface->if_agentids[i]);
                                netagent.generation = netagent_get_generation(netagent.netagent_uuid);
-                               if (necp_netagent_applies_to_client(client, parsed_parameters, netagent.netagent_uuid, FALSE, delegate_interface->if_index, if_flags)) {
+                               if (necp_netagent_applies_to_client(client, parsed_parameters, &netagent.netagent_uuid, FALSE,
+                                                                                                       delegate_interface->if_index, ifnet_get_generation(delegate_interface))) {
                                        cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_NETAGENT, sizeof(netagent), &netagent, &updated,
                                                                                                                                client->result, sizeof(client->result));
                                }
@@ -2449,6 +3253,19 @@ necp_update_client_result(proc_t proc,
        }
        ifnet_head_done();
 
+       // Add interface options
+       for (u_int32_t option_i = 0; option_i < client->interface_option_count; option_i++) {
+               if (option_i < NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT) {
+                       struct necp_client_interface_option *option = &client->interface_options[option_i];
+                       cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_INTERFACE_OPTION, sizeof(*option), option, &updated,
+                                                                                                               client->result, sizeof(client->result));
+               } else {
+                       struct necp_client_interface_option *option = &client->extra_interface_options[option_i - NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT];
+                       cursor = necp_buffer_write_tlv_if_different(cursor, NECP_CLIENT_RESULT_INTERFACE_OPTION, sizeof(*option), option, &updated,
+                                                                                                               client->result, sizeof(client->result));
+               }
+       }
+
        size_t new_result_length = (cursor - client->result);
        if (new_result_length != client->result_length) {
                client->result_length = new_result_length;
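The added block in this hunk emits one NECP_CLIENT_RESULT_INTERFACE_OPTION TLV per stored option, drawing the first NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT entries from an inline array on the client and any overflow from a separate extra_interface_options array. A minimal userland sketch of that split-storage lookup, with a hypothetical option type and a stand-in static count (neither is the kernel definition):

#include <stddef.h>

#define OPTION_STATIC_COUNT 4           /* stand-in for NECP_CLIENT_INTERFACE_OPTION_STATIC_COUNT */

struct iface_option_sketch {            /* hypothetical payload, not the NECP struct */
	unsigned int if_index;
	unsigned int if_generation;
};

/* Return the option at logical index i, spanning the inline and overflow arrays. */
static struct iface_option_sketch *
option_at(struct iface_option_sketch *inline_opts,
          struct iface_option_sketch *extra_opts,
          unsigned int count, unsigned int i)
{
	if (i >= count) {
		return NULL;                                    /* past the last stored option */
	}
	if (i < OPTION_STATIC_COUNT) {
		return &inline_opts[i];                         /* lives inside the client */
	}
	return &extra_opts[i - OPTION_STATIC_COUNT];        /* spilled into the extra array */
}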
@@ -2456,14 +3273,8 @@ necp_update_client_result(proc_t proc,
        }
 
        // Update flow viability/flags
-       bool defuncted_by_flow = FALSE;
-       if (necp_client_update_flows(proc, client, defunct_list, &defuncted_by_flow)) {
+       if (necp_client_update_flows(proc, client, defunct_list)) {
                updated = TRUE;
-               if (defuncted_by_flow && client->defunct) {
-                       // Reset initial TLV
-                       flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT;
-                       (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS, sizeof(flags), &flags, &updated, client->result, sizeof(client->result));
-               }
        }
 
        if (updated) {
@@ -2476,7 +3287,7 @@ necp_update_client_result(proc_t proc,
 }
 
 static inline void
-necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_client_defunct_list *defunct_list, struct proc *proc)
+necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_flow_defunct_list *defunct_list, struct proc *proc)
 {
 #pragma unused(proc)
        bool updated_result = FALSE;
@@ -2485,27 +3296,34 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_clien
        NECP_FD_ASSERT_LOCKED(client_fd);
 
        RB_FOREACH(client, _necp_client_tree, &client_fd->clients) {
+               struct necp_client_flow_registration *flow_registration = NULL;
+
                NECP_CLIENT_LOCK(client);
-               if (!client->defunct) {
-                       updated_result = necp_set_client_defunct(client);
 
-                       // Prepare close events to be sent to the nexus to effectively remove the flows
-                       struct necp_client_flow *search_flow = NULL;
-                       LIST_FOREACH(search_flow, &client->flow_list, flow_chain) {
+               // Prepare close events to be sent to the nexus to effectively remove the flows
+               struct necp_client_flow *search_flow = NULL;
+               RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+                       LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
                                if (search_flow->nexus &&
-                                       !uuid_is_null(search_flow->u.nexus_agent) &&
-                                       search_flow->requested_nexus) {
+                                       !uuid_is_null(search_flow->u.nexus_agent)) {
 
-                                       struct necp_client_defunct *client_defunct;
+                                       struct necp_flow_defunct *flow_defunct;
 
                                        // Sleeping alloc won't fail; copy only what's necessary
-                                       client_defunct = _MALLOC(sizeof (struct necp_client_defunct), M_NECP, M_WAITOK | M_ZERO);
-                                       uuid_copy(client_defunct->nexus_agent, search_flow->u.nexus_agent);
-                                       uuid_copy(client_defunct->client_id, client->client_id);
-                                       client_defunct->proc_pid = client->proc_pid;
+                                       flow_defunct = _MALLOC(sizeof (struct necp_flow_defunct), M_NECP, M_WAITOK | M_ZERO);
+                                       uuid_copy(flow_defunct->nexus_agent, search_flow->u.nexus_agent);
+                                       uuid_copy(flow_defunct->flow_id, ((flow_registration->flags & NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID) ?
+                                                                                                         client->client_id :
+                                                                                                         flow_registration->registration_id));
+                                       flow_defunct->proc_pid = client->proc_pid;
+                                       flow_defunct->agent_handle = client->agent_handle;
 
                                        // Add to the list provided by caller
-                                       LIST_INSERT_HEAD(defunct_list, client_defunct, chain);
+                                       LIST_INSERT_HEAD(defunct_list, flow_defunct, chain);
+
+                                       flow_registration->defunct = true;
+                                       flow_registration->flow_result_read = false;
+                                       updated_result = true;
                                }
                        }
                }
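This rewrite tracks defunct state per flow registration instead of per client: for every nexus-backed flow it allocates a necp_flow_defunct record (M_WAITOK | M_ZERO, so the allocation cannot fail and arrives zeroed) and queues it on the caller-provided list, copying either the client_id or the registration_id depending on NECP_CLIENT_FLOW_FLAGS_USE_CLIENT_ID. A rough sketch of the fields such a record plausibly carries, inferred only from what the code above fills in (the authoritative definition lives in the NECP headers):

#include <sys/types.h>
#include <sys/queue.h>
#include <uuid/uuid.h>

struct flow_defunct_sketch {
	LIST_ENTRY(flow_defunct_sketch) chain;   /* linkage on the caller-provided defunct list */
	uuid_t   nexus_agent;                    /* nexus agent that owns the flow */
	uuid_t   flow_id;                        /* client_id or registration_id, per the flag above */
	pid_t    proc_pid;                       /* owning process, for messaging and logging */
	void    *agent_handle;                   /* opaque handle handed back to the agent */
};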
@@ -2521,7 +3339,7 @@ necp_defunct_client_fd_locked(struct necp_fd_data *client_fd, struct _necp_clien
 static inline void
 necp_update_client_fd_locked(struct necp_fd_data *client_fd,
                                                         proc_t proc,
-                                                        struct _necp_client_defunct_list *defunct_list)
+                                                        struct _necp_flow_defunct_list *defunct_list)
 {
        struct necp_client *client = NULL;
        bool updated_result = FALSE;
@@ -2545,7 +3363,7 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy,
 {
        struct necp_fd_data *client_fd = NULL;
 
-       struct _necp_client_defunct_list defunct_list;
+       struct _necp_flow_defunct_list defunct_list;
        LIST_INIT(&defunct_list);
 
        NECP_FD_LIST_LOCK_SHARED();
@@ -2569,25 +3387,26 @@ necp_update_all_clients_callout(__unused thread_call_param_t dummy,
 
        // Handle the case in which some clients became newly defunct
        if (!LIST_EMPTY(&defunct_list)) {
-               struct necp_client_defunct *client_defunct = NULL;
-               struct necp_client_defunct *temp_client_defunct = NULL;
+               struct necp_flow_defunct *flow_defunct = NULL;
+               struct necp_flow_defunct *temp_flow_defunct = NULL;
 
                // For each newly defunct client, send a message to the nexus to remove the flow
-               LIST_FOREACH_SAFE(client_defunct, &defunct_list, chain, temp_client_defunct) {
-                       if (!uuid_is_null(client_defunct->nexus_agent)) {
-                               int netagent_error = netagent_client_message(client_defunct->nexus_agent,
-                                                                                                                        client_defunct->client_id,
-                                                                                                                        client_defunct->proc_pid,
+               LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) {
+                       if (!uuid_is_null(flow_defunct->nexus_agent)) {
+                               int netagent_error = netagent_client_message(flow_defunct->nexus_agent,
+                                                                                                                        flow_defunct->flow_id,
+                                                                                                                        flow_defunct->proc_pid,
+                                                                                                                        flow_defunct->agent_handle,
                                                                                                                         NETAGENT_MESSAGE_TYPE_ABORT_NEXUS);
                                if (netagent_error != 0) {
                                        char namebuf[MAXCOMLEN+1];
                                        (void) strlcpy(namebuf, "unknown", sizeof (namebuf));
-                                       proc_name(client_defunct->proc_pid, namebuf, sizeof (namebuf));
-                                       NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_update_client abort nexus error (%d) for pid %d %s", netagent_error, client_defunct->proc_pid, namebuf);
+                                       proc_name(flow_defunct->proc_pid, namebuf, sizeof (namebuf));
+                                       NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_update_client abort nexus error (%d) for pid %d %s", netagent_error, flow_defunct->proc_pid, namebuf);
                                }
                        }
-                       LIST_REMOVE(client_defunct, chain);
-                       FREE(client_defunct, M_NECP);
+                       LIST_REMOVE(flow_defunct, chain);
+                       FREE(flow_defunct, M_NECP);
                }
        }
        ASSERT(LIST_EMPTY(&defunct_list));
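The drain loop above walks the list with the BSD queue(3) LIST_FOREACH_SAFE macro so each entry can be unlinked and freed during iteration, sending NETAGENT_MESSAGE_TYPE_ABORT_NEXUS for every flow that still names a nexus agent. A self-contained userland sketch of the same drain pattern (the entry type and free() stand in for the kernel's necp_flow_defunct and FREE):

#include <stdlib.h>
#include <sys/queue.h>

struct entry {
	LIST_ENTRY(entry) chain;
	int payload;
};
LIST_HEAD(entry_list, entry);

/* Unlink and free every element; the _SAFE form keeps a lookahead pointer
 * so LIST_REMOVE() on the current element does not break the iteration. */
static void
drain(struct entry_list *list)
{
	struct entry *e = NULL, *tmp = NULL;
	LIST_FOREACH_SAFE(e, list, chain, tmp) {
		LIST_REMOVE(e, chain);
		free(e);
	}
}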
@@ -2646,11 +3465,14 @@ necp_set_client_as_background(proc_t proc,
                NECP_CLIENT_LOCK(client);
 
                bool has_assigned_flow = FALSE;
+               struct necp_client_flow_registration *flow_registration = NULL;
                struct necp_client_flow *search_flow = NULL;
-               LIST_FOREACH(search_flow, &client->flow_list, flow_chain) {
-                       if (search_flow->assigned) {
-                               has_assigned_flow = TRUE;
-                               break;
+               RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+                       LIST_FOREACH(search_flow, &flow_registration->flow_list, flow_chain) {
+                               if (search_flow->assigned) {
+                                       has_assigned_flow = TRUE;
+                                       break;
+                               }
                        }
                }
 
@@ -2683,7 +3505,7 @@ necp_fd_memstatus(proc_t proc, uint32_t status,
 void
 necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd)
 {
-       struct _necp_client_defunct_list defunct_list;
+       struct _necp_flow_defunct_list defunct_list;
 
        ASSERT(proc != PROC_NULL);
        ASSERT(client_fd != NULL);
@@ -2702,22 +3524,23 @@ necp_fd_defunct(proc_t proc, struct necp_fd_data *client_fd)
        NECP_FD_UNLOCK(client_fd);
 
        if (!LIST_EMPTY(&defunct_list)) {
-               struct necp_client_defunct *client_defunct = NULL;
-               struct necp_client_defunct *temp_client_defunct = NULL;
+               struct necp_flow_defunct *flow_defunct = NULL;
+               struct necp_flow_defunct *temp_flow_defunct = NULL;
 
                // For each defunct client, remove flow from the nexus
-               LIST_FOREACH_SAFE(client_defunct, &defunct_list, chain, temp_client_defunct) {
-                       if (!uuid_is_null(client_defunct->nexus_agent)) {
-                               int netagent_error = netagent_client_message(client_defunct->nexus_agent,
-                                                                                                                        client_defunct->client_id,
-                                                                                                                        client_defunct->proc_pid,
+               LIST_FOREACH_SAFE(flow_defunct, &defunct_list, chain, temp_flow_defunct) {
+                       if (!uuid_is_null(flow_defunct->nexus_agent)) {
+                               int netagent_error = netagent_client_message(flow_defunct->nexus_agent,
+                                                                                                                        flow_defunct->flow_id,
+                                                                                                                        flow_defunct->proc_pid,
+                                                                                                                        flow_defunct->agent_handle,
                                                                                                                         NETAGENT_MESSAGE_TYPE_ABORT_NEXUS);
                                if (netagent_error != 0) {
                                        NECPLOG((netagent_error == ENOENT ? LOG_DEBUG : LOG_ERR), "necp_defunct_client abort nexus error (%d)", netagent_error);
                                }
                        }
-                       LIST_REMOVE(client_defunct, chain);
-                       FREE(client_defunct, M_NECP);
+                       LIST_REMOVE(flow_defunct, chain);
+                       FREE(flow_defunct, M_NECP);
                }
        }
        ASSERT(LIST_EMPTY(&defunct_list));
@@ -2757,7 +3580,7 @@ necp_client_remove_agent_from_result(struct necp_client *client, uuid_t netagent
 }
 
 void
-necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid)
+necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid, u_int32_t agent_generation)
 {
        struct necp_fd_data *client_fd = NULL;
 
@@ -2768,10 +3591,12 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid)
                NECP_FD_LOCK(client_fd);
                struct necp_client *client = necp_client_fd_find_client_and_lock(client_fd, client_id);
                if (client != NULL) {
+                       client->failed_trigger_agent.generation = agent_generation;
+                       uuid_copy(client->failed_trigger_agent.netagent_uuid, remove_netagent_uuid);
                        if (!uuid_is_null(remove_netagent_uuid)) {
                                necp_client_remove_agent_from_result(client, remove_netagent_uuid);
                        }
-                       client->flow_result_read = FALSE;
+                       client->result_read = FALSE;
                        // Found the client, break
                        updated_result = TRUE;
                        NECP_CLIENT_UNLOCK(client);
@@ -2799,19 +3624,28 @@ necp_force_update_client(uuid_t client_id, uuid_t remove_netagent_uuid)
                                                                                                                 NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT |                  \
                                                                                                                 NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT |                \
                                                                                                                 NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT |                 \
+                                                                                                                NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT |                   \
                                                                                                                 NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE |             \
                                                                                                                 NECP_PARSED_PARAMETERS_FIELD_PROHIBITED_AGENT_TYPE |   \
-                                                                                                                NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE)
-
-#define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR |                  \
-                                                                                                       NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE |          \
-                                                                                                       NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT |           \
-                                                                                                       NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT |          \
-                                                                                                       NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE |      \
-                                                                                                       NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE)
-
-#define NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT | \
-                                                                                                          NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE)
+                                                                                                                NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE |    \
+                                                                                                                NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE)
+
+#define NECP_PARSED_PARAMETERS_SCOPED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR |                        \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE |                \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT |         \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT |                \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT |          \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_REQUIRED_AGENT_TYPE |    \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE |   \
+                                                                                         NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE)
+
+#define NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS (NECP_PARSED_PARAMETERS_FIELD_LOCAL_ADDR |          \
+                                                                                                       NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)
+
+#define NECP_PARSED_PARAMETERS_PREFERRED_FIELDS (NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT |                \
+                                                                                                NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT |                   \
+                                                                                                NECP_PARSED_PARAMETERS_FIELD_PREFERRED_AGENT_TYPE |    \
+                                                                                                NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE)
 
 static bool
 necp_ifnet_matches_type(struct ifnet *ifp, u_int8_t interface_type, bool check_delegates)
@@ -2872,7 +3706,7 @@ necp_ifnet_matches_agent(struct ifnet *ifp, uuid_t *agent_uuid, bool check_deleg
 }
 
 static bool
-necp_necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain, const char *agent_type, bool check_delegates)
+necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain, const char *agent_type, bool check_delegates)
 {
        struct ifnet *check_ifp = ifp;
 
@@ -2888,13 +3722,10 @@ necp_necp_ifnet_matches_agent_type(struct ifnet *ifp, const char *agent_domain,
                                char if_agent_type[NETAGENT_TYPESIZE] = { 0 };
 
                                if (netagent_get_agent_domain_and_type(check_ifp->if_agentids[index], if_agent_domain, if_agent_type)) {
-                                       if ((strlen(agent_domain) == 0 ||
-                                                strncmp(if_agent_domain, agent_domain, NETAGENT_DOMAINSIZE) == 0) &&
-                                               (strlen(agent_type) == 0 ||
-                                                strncmp(if_agent_type, agent_type, NETAGENT_TYPESIZE) == 0)) {
-                                                       ifnet_lock_done(check_ifp);
-                                                       return (TRUE);
-                                               }
+                                       if (necp_agent_types_match(agent_domain, agent_type, if_agent_domain, if_agent_type)) {
+                                               ifnet_lock_done(check_ifp);
+                                               return (TRUE);
+                                       }
                                }
                        }
                }
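The renamed necp_ifnet_matches_agent_type (previously the doubled-up necp_necp_ifnet_matches_agent_type) now delegates the string comparison to necp_agent_types_match. Judging from the inline checks it replaces, an empty requested domain or type behaves as a wildcard; a standalone sketch of that comparison, with illustrative size constants in place of NETAGENT_DOMAINSIZE and NETAGENT_TYPESIZE:

#include <stdbool.h>
#include <string.h>

#define DOMAINSIZE_SKETCH 32    /* stand-in for NETAGENT_DOMAINSIZE */
#define TYPESIZE_SKETCH   32    /* stand-in for NETAGENT_TYPESIZE */

/* An empty requested domain or type matches anything; otherwise the
 * bounded string comparison must succeed for both fields. */
static bool
agent_types_match_sketch(const char *want_domain, const char *want_type,
                         const char *have_domain, const char *have_type)
{
	return ((want_domain[0] == '\0' ||
	         strncmp(have_domain, want_domain, DOMAINSIZE_SKETCH) == 0) &&
	        (want_type[0] == '\0' ||
	         strncmp(have_type, want_type, TYPESIZE_SKETCH) == 0));
}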
@@ -2952,10 +3783,13 @@ necp_interface_type_is_primary_eligible(u_int8_t interface_type)
 
 #define NECP_IFP_IS_ON_ORDERED_LIST(_ifp) ((_ifp)->if_ordered_link.tqe_next != NULL || (_ifp)->if_ordered_link.tqe_prev != NULL)
 
+// Secondary interface flag indicates that the interface is being
+// used for multipath or a listener as an extra path
 static bool
 necp_ifnet_matches_parameters(struct ifnet *ifp,
                                                          struct necp_client_parsed_parameters *parsed_parameters,
-                                                         u_int32_t *preferred_count, bool ignore_require_if)
+                                                         u_int32_t *preferred_count,
+                                                         bool secondary_interface)
 {
        if (preferred_count) {
                *preferred_count = 0;
@@ -2974,7 +3808,9 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                }
        }
 
-       if (!ignore_require_if &&
+       if ((!secondary_interface || // Enforce interface type if this is the primary interface
+                !(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_FLAGS) || // or if there are no flags
+                !(parsed_parameters->flags & NECP_CLIENT_PARAMETER_FLAG_ONLY_PRIMARY_REQUIRES_TYPE)) && // or if the flags don't give an exception
            (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE) &&
                !necp_ifnet_matches_type(ifp, parsed_parameters->required_interface_type, FALSE)) {
                return (FALSE);
@@ -3035,7 +3871,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                                break;
                        }
 
-                       if (!necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) {
+                       if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->required_netagent_types[i].netagent_domain, parsed_parameters->required_netagent_types[i].netagent_type, FALSE)) {
                                return (FALSE);
                        }
                }
@@ -3048,7 +3884,7 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                                break;
                        }
 
-                       if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->prohibited_netagent_types[i].netagent_domain, parsed_parameters->prohibited_netagent_types[i].netagent_type, TRUE)) {
+                       if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->prohibited_netagent_types[i].netagent_domain, parsed_parameters->prohibited_netagent_types[i].netagent_type, TRUE)) {
                                return (FALSE);
                        }
                }
@@ -3075,7 +3911,33 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
                                        break;
                                }
 
-                               if (necp_necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) {
+                               if (necp_ifnet_matches_agent_type(ifp, parsed_parameters->preferred_netagent_types[i].netagent_domain, parsed_parameters->preferred_netagent_types[i].netagent_type, TRUE)) {
+                                       (*preferred_count)++;
+                               }
+                       }
+               }
+
+               if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT) {
+                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                               if (uuid_is_null(parsed_parameters->avoided_netagents[i])) {
+                                       break;
+                               }
+
+                               if (!necp_ifnet_matches_agent(ifp, &parsed_parameters->avoided_netagents[i], TRUE)) {
+                                       (*preferred_count)++;
+                               }
+                       }
+               }
+
+               if (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_AVOIDED_AGENT_TYPE) {
+                       for (int i = 0; i < NECP_MAX_PARSED_PARAMETERS; i++) {
+                               if (strlen(parsed_parameters->avoided_netagent_types[i].netagent_domain) == 0 &&
+                                       strlen(parsed_parameters->avoided_netagent_types[i].netagent_type) == 0) {
+                                       break;
+                               }
+
+                               if (!necp_ifnet_matches_agent_type(ifp, parsed_parameters->avoided_netagent_types[i].netagent_domain,
+                                                                                                               parsed_parameters->avoided_netagent_types[i].netagent_type, TRUE)) {
                                        (*preferred_count)++;
                                }
                        }
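The new avoided-agent checks feed the same preferred_count as the preferred ones, just inverted: an interface scores a point when a preferred agent is present and a point when an avoided agent is absent, so the caller can keep choosing the interface with the highest count. A toy illustration of that scoring (the predicate values would come from the necp_ifnet_matches_agent* checks above; names are illustrative):

#include <stdbool.h>

/* One point per satisfied preference: presence of a preferred agent,
 * or absence of an avoided one. */
static unsigned int
preference_score(bool has_preferred_agent, bool has_avoided_agent)
{
	unsigned int score = 0;
	if (has_preferred_agent) {
		score++;        /* preferred agent found on the interface */
	}
	if (!has_avoided_agent) {
		score++;        /* avoided agent not found on the interface */
	}
	return score;
}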
@@ -3086,7 +3948,8 @@ necp_ifnet_matches_parameters(struct ifnet *ifp,
 }
 
 static bool
-necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters, u_int *return_ifindex)
+necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_parameters,
+                                                                  u_int *return_ifindex, bool *validate_agents)
 {
        struct ifnet *ifp = NULL;
        u_int32_t best_preferred_count = 0;
@@ -3102,12 +3965,12 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
                return (TRUE);
        }
 
-       has_preferred_fields = (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS);
+       has_preferred_fields = (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_FIELDS);
 
        // We have interesting parameters to parse and find a matching interface
        ifnet_head_lock_shared();
 
-       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS)) {
+       if (!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS)) {
                // We do have fields to match, but they are only prohibitory
                // If the first interface in the list matches, or there are no ordered interfaces, we don't need to scope
                ifp = TAILQ_FIRST(&ifnet_ordered_head);
@@ -3137,7 +4000,7 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
        }
 
        // Then check the remaining interfaces
-       if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS) &&
+       if ((parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_FIELDS) &&
                ((!(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_FIELD_REQUIRED_IFTYPE)) ||
                 !necp_interface_type_is_primary_eligible(parsed_parameters->required_interface_type)) &&
                *return_ifindex == 0) {
@@ -3165,13 +4028,21 @@ necp_find_matching_interface_index(struct necp_client_parsed_parameters *parsed_
 
        ifnet_head_done();
 
-       if ((parsed_parameters->valid_fields == (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_IFNET_FIELDS)) &&
+       if ((parsed_parameters->valid_fields == (parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_PREFERRED_FIELDS)) &&
                best_preferred_count == 0) {
                // If only has preferred fields, and nothing was found, clear the interface index and return TRUE
                *return_ifindex = 0;
                return (TRUE);
        }
 
+       if (*return_ifindex == 0 &&
+               !(parsed_parameters->valid_fields & NECP_PARSED_PARAMETERS_SCOPED_IFNET_FIELDS)) {
+               // Has required fields, but not including specific interface fields. Pass for now, and check
+               // to see if agents are satisfied by policy.
+               *validate_agents = TRUE;
+               return (TRUE);
+       }
+
        return (*return_ifindex != 0);
 }
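necp_find_matching_interface_index grows a third out-parameter: when the request carries agent requirements but no interface-scoping fields, it now returns TRUE with *return_ifindex left at 0 and *validate_agents set, deferring the agent check to policy evaluation in the caller. A non-authoritative sketch of the three-way outcome implied by the (found, ifindex, validate_agents) triple, with illustrative names:

#include <stdbool.h>

enum scope_result_sketch {
	SCOPE_NO_MATCH,         /* parameters cannot be satisfied by any interface */
	SCOPE_TO_INTERFACE,     /* a concrete interface index was chosen */
	SCOPE_CHECK_AGENTS,     /* no interface, but required agents must be validated by policy */
	SCOPE_UNSCOPED          /* only preferences were given and none matched; proceed unscoped */
};

static enum scope_result_sketch
classify(bool found, unsigned int ifindex, bool validate_agents)
{
	if (!found) {
		return SCOPE_NO_MATCH;
	}
	if (ifindex != 0) {
		return SCOPE_TO_INTERFACE;
	}
	if (validate_agents) {
		return SCOPE_CHECK_AGENTS;
	}
	return SCOPE_UNSCOPED;
}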
 
@@ -3194,7 +4065,8 @@ necp_open(struct proc *p, struct necp_open_args *uap, int *retval)
        struct fileproc *fp = NULL;
        int fd = -1;
 
-       if (uap->flags & NECP_OPEN_FLAG_OBSERVER) {
+       if (uap->flags & NECP_OPEN_FLAG_OBSERVER ||
+               uap->flags & NECP_OPEN_FLAG_PUSH_OBSERVER) {
                if (necp_skywalk_priv_check_cred(p, kauth_cred_get()) != 0 &&
                        priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0) {
                        NECPLOG0(LOG_ERR, "Client does not hold necessary entitlement to observe other NECP clients");
@@ -3218,6 +4090,7 @@ necp_open(struct proc *p, struct necp_open_args *uap, int *retval)
        fd_data->necp_fd_type = necp_fd_type_client;
        fd_data->flags = uap->flags;
        RB_INIT(&fd_data->clients);
+       RB_INIT(&fd_data->flows);
        TAILQ_INIT(&fd_data->update_list);
        lck_mtx_init(&fd_data->fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
        klist_init(&fd_data->si.si_note);
@@ -3309,11 +4182,12 @@ necp_client_add(struct proc *p, struct necp_fd_data *fd_data, struct necp_client
 
        client->parameters_length = uap->buffer_size;
        client->proc_pid = fd_data->proc_pid; // Save off proc pid in case the client will persist past fd
+       client->agent_handle = (void *)fd_data;
        client->platform_binary = ((csproc_get_platform_binary(p) == 0) ? 0 : 1);
 
-       uuid_generate_random(client->client_id);
+       necp_generate_client_id(client->client_id, false);
        LIST_INIT(&client->assertion_list);
-       LIST_INIT(&client->flow_list);
+       RB_INIT(&client->flow_registrations);
 
        error = copyout(client->client_id, uap->client_id, sizeof(uuid_t));
        if (error) {
@@ -3351,8 +4225,6 @@ static int
 necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
        int error = 0;
-       struct necp_client *client = NULL;
-       struct necp_client find = {};
        uuid_t client_id = {};
        struct ifnet_stats_per_flow flow_ifnet_stats = {};
 
@@ -3382,15 +4254,27 @@ necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args
        NECP_FD_LOCK(fd_data);
 
        pid_t pid = fd_data->proc_pid;
-       uuid_copy(find.client_id, client_id);
-       client = RB_FIND(_necp_client_tree, &fd_data->clients, &find);
+       struct necp_client *client = necp_client_fd_find_client_unlocked(fd_data, client_id);
        if (client != NULL) {
+               // Remove any flow registrations that match
+               struct necp_client_flow_registration *flow_registration = NULL;
+               struct necp_client_flow_registration *temp_flow_registration = NULL;
+               RB_FOREACH_SAFE(flow_registration, _necp_fd_flow_tree, &fd_data->flows, temp_flow_registration) {
+                       if (flow_registration->client == client) {
+                               NECP_FLOW_TREE_LOCK_EXCLUSIVE();
+                               RB_REMOVE(_necp_client_flow_global_tree, &necp_client_flow_global_tree, flow_registration);
+                               NECP_FLOW_TREE_UNLOCK();
+                               RB_REMOVE(_necp_fd_flow_tree, &fd_data->flows, flow_registration);
+                       }
+               }
+               // Remove client from lists
                NECP_CLIENT_TREE_LOCK_EXCLUSIVE();
                RB_REMOVE(_necp_client_global_tree, &necp_client_global_tree, client);
                NECP_CLIENT_TREE_UNLOCK();
                RB_REMOVE(_necp_client_tree, &fd_data->clients, client);
        }
 
+
        NECP_FD_UNLOCK(fd_data);
 
        if (client != NULL) {
@@ -3400,13 +4284,13 @@ necp_client_remove(struct necp_fd_data *fd_data, struct necp_client_action_args
                error = ENOENT;
                NECPLOG(LOG_ERR, "necp_client_remove invalid client_id (%d)", error);
        }
-
 done:
        *retval = error;
 
        return (error);
 }
 
+
 static int
 necp_client_check_tcp_heuristics(struct necp_client *client, struct necp_client_flow *flow, u_int32_t *flags, u_int8_t *tfo_cookie, u_int8_t *tfo_cookie_len)
 {
@@ -3492,188 +4376,278 @@ do_unlock:
        return (error);
 }
 
+static size_t
+necp_client_calculate_flow_tlv_size(struct necp_client_flow_registration *flow_registration)
+{
+       size_t assigned_results_size = 0;
+       struct necp_client_flow *flow = NULL;
+       LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
+               if (flow->assigned) {
+                       size_t header_length = 0;
+                       if (flow->nexus) {
+                               header_length = sizeof(struct necp_client_nexus_flow_header);
+                       } else {
+                               header_length = sizeof(struct necp_client_flow_header);
+                       }
+                       assigned_results_size += (header_length + flow->assigned_results_length);
+
+                       if (flow->has_protoctl_event) {
+                               assigned_results_size += sizeof(struct necp_client_flow_protoctl_event_header);
+                       }
+               }
+       }
+       return assigned_results_size;
+}
+
+static int
+necp_client_fillout_flow_tlvs(struct necp_client *client,
+                                                         bool client_is_observed,
+                                                         struct necp_client_flow_registration *flow_registration,
+                                                         struct necp_client_action_args *uap,
+                                                         size_t *assigned_results_cursor)
+{
+       int error = 0;
+       struct necp_client_flow *flow = NULL;
+       LIST_FOREACH(flow, &flow_registration->flow_list, flow_chain) {
+               if (flow->assigned) {
+                       // Write TLV headers
+                       struct necp_client_nexus_flow_header header = {};
+                       u_int32_t length = 0;
+                       u_int32_t flags = 0;
+                       u_int8_t tfo_cookie_len = 0;
+                       u_int8_t type = 0;
+
+                       type = NECP_CLIENT_RESULT_FLOW_ID;
+                       length = sizeof(header.flow_header.flow_id);
+                       memcpy(&header.flow_header.flow_id_tlv_header.type, &type, sizeof(type));
+                       memcpy(&header.flow_header.flow_id_tlv_header.length, &length, sizeof(length));
+                       uuid_copy(header.flow_header.flow_id, flow_registration->registration_id);
+
+                       if (flow->nexus) {
+                               if (flow->check_tcp_heuristics) {
+                                       u_int8_t tfo_cookie[NECP_TFO_COOKIE_LEN_MAX];
+                                       tfo_cookie_len = NECP_TFO_COOKIE_LEN_MAX;
+
+                                       if (necp_client_check_tcp_heuristics(client, flow, &flags,
+                                                                                                                tfo_cookie, &tfo_cookie_len) != 0) {
+                                               tfo_cookie_len = 0;
+                                       } else {
+                                               flow->check_tcp_heuristics = FALSE;
+
+                                               if (tfo_cookie_len != 0) {
+                                                       type = NECP_CLIENT_RESULT_TFO_COOKIE;
+                                                       length = tfo_cookie_len;
+                                                       memcpy(&header.tfo_cookie_tlv_header.type, &type, sizeof(type));
+                                                       memcpy(&header.tfo_cookie_tlv_header.length, &length, sizeof(length));
+                                                       memcpy(&header.tfo_cookie_value, tfo_cookie, tfo_cookie_len);
+                                               }
+                                       }
+                               }
+                       }
+
+                       size_t header_length = 0;
+                       if (flow->nexus) {
+                               if (tfo_cookie_len != 0) {
+                                       header_length = sizeof(struct necp_client_nexus_flow_header) - (NECP_TFO_COOKIE_LEN_MAX - tfo_cookie_len);
+                               } else {
+                                       header_length = sizeof(struct necp_client_nexus_flow_header) - sizeof(struct necp_tlv_header) - NECP_TFO_COOKIE_LEN_MAX;
+                               }
+                       } else {
+                               header_length = sizeof(struct necp_client_flow_header);
+                       }
+
+                       type = NECP_CLIENT_RESULT_FLAGS;
+                       length = sizeof(header.flow_header.flags_value);
+                       memcpy(&header.flow_header.flags_tlv_header.type, &type, sizeof(type));
+                       memcpy(&header.flow_header.flags_tlv_header.length, &length, sizeof(length));
+                       if (flow->assigned) {
+                               flags |= NECP_CLIENT_RESULT_FLAG_FLOW_ASSIGNED;
+                       }
+                       if (flow->viable) {
+                               flags |= NECP_CLIENT_RESULT_FLAG_FLOW_VIABLE;
+                       }
+                       if (flow_registration->defunct) {
+                               flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT;
+                       }
+                       flags |= flow->necp_flow_flags;
+                       memcpy(&header.flow_header.flags_value, &flags, sizeof(flags));
+
+                       type = NECP_CLIENT_RESULT_INTERFACE;
+                       length = sizeof(header.flow_header.interface_value);
+                       memcpy(&header.flow_header.interface_tlv_header.type, &type, sizeof(type));
+                       memcpy(&header.flow_header.interface_tlv_header.length, &length, sizeof(length));
+
+                       struct necp_client_result_interface interface_struct;
+                       interface_struct.generation = 0;
+                       interface_struct.index = flow->interface_index;
+
+                       memcpy(&header.flow_header.interface_value, &interface_struct, sizeof(interface_struct));
+                       if (flow->nexus) {
+                               type = NECP_CLIENT_RESULT_NETAGENT;
+                               length = sizeof(header.agent_value);
+                               memcpy(&header.agent_tlv_header.type, &type, sizeof(type));
+                               memcpy(&header.agent_tlv_header.length, &length, sizeof(length));
+
+                               struct necp_client_result_netagent agent_struct;
+                               agent_struct.generation = 0;
+                               uuid_copy(agent_struct.netagent_uuid, flow->u.nexus_agent);
+
+                               memcpy(&header.agent_value, &agent_struct, sizeof(agent_struct));
+                       }
+
+                       // Don't include outer TLV header in length field
+                       type = NECP_CLIENT_RESULT_FLOW;
+                       length = (header_length - sizeof(struct necp_tlv_header) + flow->assigned_results_length);
+                       if (flow->has_protoctl_event) {
+                               length += sizeof(struct necp_client_flow_protoctl_event_header);
+                       }
+                       memcpy(&header.flow_header.outer_header.type, &type, sizeof(type));
+                       memcpy(&header.flow_header.outer_header.length, &length, sizeof(length));
+
+                       error = copyout(&header, uap->buffer + client->result_length + *assigned_results_cursor, header_length);
+                       if (error) {
+                               NECPLOG(LOG_ERR, "necp_client_copy assigned results tlv_header copyout error (%d)", error);
+                               return (error);
+                       }
+                       *assigned_results_cursor += header_length;
+
+                       if (flow->assigned_results && flow->assigned_results_length) {
+                               // Write inner TLVs
+                               error = copyout(flow->assigned_results, uap->buffer + client->result_length + *assigned_results_cursor,
+                                                               flow->assigned_results_length);
+                               if (error) {
+                                       NECPLOG(LOG_ERR, "necp_client_copy assigned results copyout error (%d)", error);
+                                       return (error);
+                               }
+                       }
+                       *assigned_results_cursor += flow->assigned_results_length;
+
+                       /* Read the protocol event and reset it */
+                       if (flow->has_protoctl_event) {
+                               struct necp_client_flow_protoctl_event_header protoctl_event_header = {};
+
+                               type = NECP_CLIENT_RESULT_PROTO_CTL_EVENT;
+                               length = sizeof(protoctl_event_header.protoctl_event);
+
+                               memcpy(&protoctl_event_header.protoctl_tlv_header.type, &type, sizeof(type));
+                               memcpy(&protoctl_event_header.protoctl_tlv_header.length, &length, sizeof(length));
+                               memcpy(&protoctl_event_header.protoctl_event, &flow->protoctl_event,
+                                          sizeof(flow->protoctl_event));
+
+                               error = copyout(&protoctl_event_header, uap->buffer + client->result_length + *assigned_results_cursor,
+                                                               sizeof(protoctl_event_header));
+
+                               if (error) {
+                                       NECPLOG(LOG_ERR, "necp_client_copy protocol control event results"
+                                                       " tlv_header copyout error (%d)", error);
+                                       return (error);
+                               }
+                               *assigned_results_cursor += sizeof(protoctl_event_header);
+                               flow->has_protoctl_event = FALSE;
+                               flow->protoctl_event.protoctl_event_code = 0;
+                               flow->protoctl_event.protoctl_event_val = 0;
+                               flow->protoctl_event.protoctl_event_tcp_seq_num = 0;
+                       }
+               }
+       }
+       if (!client_is_observed) {
+               flow_registration->flow_result_read = TRUE;
+       }
+       return (0);
+}
+
 static int
-necp_client_copy_internal(struct necp_client *client, bool client_is_observed, struct necp_client_action_args *uap, int *retval)
+necp_client_copy_internal(struct necp_client *client, uuid_t client_id, bool client_is_observed, struct necp_client_action_args *uap, int *retval)
 {
+       NECP_CLIENT_ASSERT_LOCKED(client);
        int error = 0;
        // Copy results out
        if (uap->action == NECP_CLIENT_ACTION_COPY_PARAMETERS) {
                if (uap->buffer_size < client->parameters_length) {
-                       error = EINVAL;
-                       goto done;
+                       return (EINVAL);
                }
                error = copyout(client->parameters, uap->buffer, client->parameters_length);
                if (error) {
                        NECPLOG(LOG_ERR, "necp_client_copy parameters copyout error (%d)", error);
-                       goto done;
+                       return (error);
                }
                *retval = client->parameters_length;
        } else if (uap->action == NECP_CLIENT_ACTION_COPY_UPDATED_RESULT &&
-                          client->result_read && client->flow_result_read) {
+                          client->result_read && !necp_client_has_unread_flows(client)) {
                // Copy updates only, but nothing to read
                // Just return 0 for bytes read
                *retval = 0;
        } else if (uap->action == NECP_CLIENT_ACTION_COPY_RESULT ||
                           uap->action == NECP_CLIENT_ACTION_COPY_UPDATED_RESULT) {
                size_t assigned_results_size = 0;
-               struct necp_client_flow *flow = NULL;
-               LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-                       if (flow->nexus || (flow->socket && flow->assigned)) {
-                               size_t header_length = 0;
-                               if (flow->nexus) {
-                                       header_length = sizeof(struct necp_client_nexus_flow_header);
-                               } else {
-                                       header_length = sizeof(struct necp_client_flow_header);
-                               }
-                               assigned_results_size += (header_length + flow->assigned_results_length);
 
-                               if (flow->has_protoctl_event) {
-                                       assigned_results_size += sizeof(struct necp_client_flow_protoctl_event_header);
+               bool some_flow_is_defunct = false;
+               struct necp_client_flow_registration *single_flow_registration = NULL;
+               if (necp_client_id_is_flow(client_id)) {
+                       single_flow_registration = necp_client_find_flow(client, client_id);
+                       if (single_flow_registration != NULL) {
+                               assigned_results_size += necp_client_calculate_flow_tlv_size(single_flow_registration);
+                       }
+               } else {
+                       // This request is for the client, so copy everything
+                       struct necp_client_flow_registration *flow_registration = NULL;
+                       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+                               if (flow_registration->defunct) {
+                                       some_flow_is_defunct = true;
                                }
+                               assigned_results_size += necp_client_calculate_flow_tlv_size(flow_registration);
                        }
                }
                if (uap->buffer_size < (client->result_length + assigned_results_size)) {
-                       error = EINVAL;
-                       goto done;
+                       return (EINVAL);
+               }
+
+               u_int32_t original_flags = 0;
+               bool flags_updated = false;
+               if (some_flow_is_defunct && client->legacy_client_is_flow) {
+                       // If our client expects the defunct flag in the client, add it now
+                       u_int32_t client_flags = 0;
+                       u_int32_t value_size = 0;
+                       u_int8_t *flags_pointer = necp_buffer_get_tlv_value(client->result, 0, &value_size);
+                       if (flags_pointer != NULL && value_size == sizeof(client_flags)) {
+                               memcpy(&client_flags, flags_pointer, value_size);
+                               original_flags = client_flags;
+                               client_flags |= NECP_CLIENT_RESULT_FLAG_DEFUNCT;
+                               (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS,
+                                                                                                                sizeof(client_flags), &client_flags, &flags_updated,
+                                                                                                                client->result, sizeof(client->result));
+                       }
                }
+
                error = copyout(client->result, uap->buffer, client->result_length);
+
+               if (flags_updated) {
+                       // Revert stored flags
+                       (void)necp_buffer_write_tlv_if_different(client->result, NECP_CLIENT_RESULT_FLAGS,
+                                                                                                        sizeof(original_flags), &original_flags, &flags_updated,
+                                                                                                        client->result, sizeof(client->result));
+               }
+
                if (error) {
                        NECPLOG(LOG_ERR, "necp_client_copy result copyout error (%d)", error);
-                       goto done;
+                       return (error);
                }
 
                size_t assigned_results_cursor = 0;
-
-               flow = NULL;
-               LIST_FOREACH(flow, &client->flow_list, flow_chain) {
-                       if (flow->nexus || (flow->socket && flow->assigned)) {
-                               // Write TLV headers
-                               struct necp_client_nexus_flow_header header = {};
-                               u_int32_t length = 0;
-                               u_int32_t flags = 0;
-                               u_int8_t tfo_cookie_len = 0;
-                               u_int8_t type = 0;
-
-                               if (flow->nexus) {
-                                       if (flow->check_tcp_heuristics) {
-                                               u_int8_t tfo_cookie[NECP_TFO_COOKIE_LEN_MAX];
-                                               tfo_cookie_len = NECP_TFO_COOKIE_LEN_MAX;
-
-                                               if (necp_client_check_tcp_heuristics(client, flow, &flags,
-                                                                                   tfo_cookie, &tfo_cookie_len) != 0) {
-                                                       tfo_cookie_len = 0;
-                                               } else {
-                                                       flow->check_tcp_heuristics = FALSE;
-
-                                                       if (tfo_cookie_len != 0) {
-                                                               type = NECP_CLIENT_RESULT_TFO_COOKIE;
-                                                               length = tfo_cookie_len;
-                                                               memcpy(&header.tfo_cookie_tlv_header.type, &type, sizeof(type));
-                                                               memcpy(&header.tfo_cookie_tlv_header.length, &length, sizeof(length));
-                                                               memcpy(&header.tfo_cookie_value, tfo_cookie, tfo_cookie_len);
-                                                       }
-                                               }
-                                       }
-                               }
-
-                               size_t header_length = 0;
-                               if (flow->nexus) {
-                                       if (tfo_cookie_len != 0) {
-                                               header_length = sizeof(struct necp_client_nexus_flow_header) - (NECP_TFO_COOKIE_LEN_MAX - tfo_cookie_len);
-                                       } else {
-                                               header_length = sizeof(struct necp_client_nexus_flow_header) - sizeof(struct necp_tlv_header) - NECP_TFO_COOKIE_LEN_MAX;
-                                       }
-                               } else {
-                                       header_length = sizeof(struct necp_client_flow_header);
-                               }
-
-                               type = NECP_CLIENT_RESULT_FLAGS;
-                               length = sizeof(header.flow_header.flags_value);
-                               memcpy(&header.flow_header.flags_tlv_header.type, &type, sizeof(type));
-                               memcpy(&header.flow_header.flags_tlv_header.length, &length, sizeof(length));
-                               if (flow->assigned) {
-                                       flags |= NECP_CLIENT_RESULT_FLAG_FLOW_ASSIGNED;
-                               }
-                               if (flow->viable) {
-                                       flags |= NECP_CLIENT_RESULT_FLAG_FLOW_VIABLE;
-                               }
-                               memcpy(&header.flow_header.flags_value, &flags, sizeof(flags));
-
-                               type = NECP_CLIENT_RESULT_INTERFACE;
-                               length = sizeof(header.flow_header.interface_value);
-                               memcpy(&header.flow_header.interface_tlv_header.type, &type, sizeof(type));
-                               memcpy(&header.flow_header.interface_tlv_header.length, &length, sizeof(length));
-
-                               struct necp_client_result_interface interface_struct;
-                               interface_struct.generation = 0;
-                               interface_struct.index = flow->interface_index;
-
-                               memcpy(&header.flow_header.interface_value, &interface_struct, sizeof(interface_struct));
-                               if (flow->nexus) {
-                                       type = NECP_CLIENT_RESULT_NETAGENT;
-                                       length = sizeof(header.agent_value);
-                                       memcpy(&header.agent_tlv_header.type, &type, sizeof(type));
-                                       memcpy(&header.agent_tlv_header.length, &length, sizeof(length));
-
-                                       struct necp_client_result_netagent agent_struct;
-                                       agent_struct.generation = 0;
-                                       uuid_copy(agent_struct.netagent_uuid, flow->u.nexus_agent);
-
-                                       memcpy(&header.agent_value, &agent_struct, sizeof(agent_struct));
+               if (necp_client_id_is_flow(client_id)) {
+                       if (single_flow_registration != NULL) {
+                               error = necp_client_fillout_flow_tlvs(client, client_is_observed, single_flow_registration, uap, &assigned_results_cursor);
+                               if (error != 0) {
+                                       return (error);
                                }
-
-                               // Don't include outer TLV header in length field
-                               type = NECP_CLIENT_RESULT_FLOW;
-                               length = (header_length - sizeof(struct necp_tlv_header) + flow->assigned_results_length);
-                               if (flow->has_protoctl_event) {
-                                       length += sizeof(struct necp_client_flow_protoctl_event_header);
-                               }
-                               memcpy(&header.flow_header.outer_header.type, &type, sizeof(type));
-                               memcpy(&header.flow_header.outer_header.length, &length, sizeof(length));
-
-                               error = copyout(&header, uap->buffer + client->result_length + assigned_results_cursor, header_length);
-                               if (error) {
-                                       NECPLOG(LOG_ERR, "necp_client_copy assigned results tlv_header copyout error (%d)", error);
-                                       goto done;
-                               }
-                               assigned_results_cursor += header_length;
-
-                               if (flow->assigned_results && flow->assigned_results_length) {
-                                       // Write inner TLVs
-                                       error = copyout(flow->assigned_results, uap->buffer + client->result_length + assigned_results_cursor,
-                                                                       flow->assigned_results_length);
-                                       if (error) {
-                                               NECPLOG(LOG_ERR, "necp_client_copy assigned results copyout error (%d)", error);
-                                               goto done;
-                                       }
-                               }
-                               assigned_results_cursor += flow->assigned_results_length;
-
-                               /* Read the protocol event and reset it */
-                               if (flow->has_protoctl_event) {
-                                       struct necp_client_flow_protoctl_event_header protoctl_event_header = {};
-
-                                       type = NECP_CLIENT_RESULT_PROTO_CTL_EVENT;
-                                       length = sizeof(protoctl_event_header.protoctl_event);
-
-                                       memcpy(&protoctl_event_header.protoctl_tlv_header.type, &type, sizeof(type));
-                                       memcpy(&protoctl_event_header.protoctl_tlv_header.length, &length, sizeof(length));
-                                       memcpy(&protoctl_event_header.protoctl_event, &flow->protoctl_event,
-                                           sizeof(flow->protoctl_event));
-
-                                       error = copyout(&protoctl_event_header, uap->buffer + client->result_length + assigned_results_cursor,
-                                           sizeof(protoctl_event_header));
-
-                                       if (error) {
-                                               NECPLOG(LOG_ERR, "necp_client_copy protocol control event results"
-                                                   " tlv_header copyout error (%d)", error);
-                                               goto done;
-                                       }
-                                       assigned_results_cursor += sizeof(protoctl_event_header);
-                                       flow->has_protoctl_event = FALSE;
-                                       flow->protoctl_event.protoctl_event_code = 0;
-                                       flow->protoctl_event.protoctl_event_val = 0;
-                                       flow->protoctl_event.protoctl_event_tcp_seq_num = 0;
+                       }
+               } else {
+                       // This request is for the client, so copy everything
+                       struct necp_client_flow_registration *flow_registration = NULL;
+                       RB_FOREACH(flow_registration, _necp_client_flow_tree, &client->flow_registrations) {
+                               error = necp_client_fillout_flow_tlvs(client, client_is_observed, flow_registration, uap, &assigned_results_cursor);
+                               if (error != 0) {
+                                       return (error);
                                }
                        }
                }
@@ -3682,12 +4656,10 @@ necp_client_copy_internal(struct necp_client *client, bool client_is_observed, s
 
                if (!client_is_observed) {
                        client->result_read = TRUE;
-                       client->flow_result_read = TRUE;
                }
        }
 
-done:
-       return (error);
+       return (0);
 }
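The block removed above repeats one pattern per flow: write a small type/length header, write the value, and advance assigned_results_cursor (now factored into necp_client_fillout_flow_tlvs()). A minimal user-space sketch of that TLV layout, assuming a packed 1-byte type / 4-byte length header as the memcpy sizes in the removed code imply; the kernel copies these out with copyout() into uap->buffer, whereas this simply appends into a flat buffer.

#include <stdint.h>
#include <string.h>

struct tlv_header {
        uint8_t  type;
        uint32_t length;
} __attribute__((packed));

/* Append one TLV at `cursor`; returns the new cursor, unchanged if it won't fit. */
static size_t
tlv_append(uint8_t *buf, size_t buf_size, size_t cursor,
    uint8_t type, const void *value, uint32_t value_len)
{
        struct tlv_header hdr;

        if (cursor + sizeof(hdr) + value_len > buf_size)
                return (cursor);
        hdr.type = type;
        hdr.length = value_len;
        memcpy(buf + cursor, &hdr, sizeof(hdr));
        memcpy(buf + cursor + sizeof(hdr), value, value_len);
        return (cursor + sizeof(hdr) + value_len);
}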
 
 static int
@@ -3701,28 +4673,25 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
        *retval = 0;
 
        if (uap->buffer_size == 0 || uap->buffer == 0) {
-               error = EINVAL;
-               goto done;
+               return (EINVAL);
        }
 
        if (uap->action != NECP_CLIENT_ACTION_COPY_PARAMETERS &&
                uap->action != NECP_CLIENT_ACTION_COPY_RESULT &&
                uap->action != NECP_CLIENT_ACTION_COPY_UPDATED_RESULT) {
-               error = EINVAL;
-               goto done;
+               return (EINVAL);
        }
 
        if (uap->client_id) {
                if (uap->client_id_len != sizeof(uuid_t)) {
                        NECPLOG(LOG_ERR, "Incorrect length (got %d, expected %d)", uap->client_id_len, sizeof(uuid_t));
-                       error = ERANGE;
-                       goto done;
+                       return (ERANGE);
                }
 
                error = copyin(uap->client_id, client_id, sizeof(uuid_t));
                if (error) {
                        NECPLOG(LOG_ERR, "necp_client_copy client_id copyin error (%d)", error);
-                       goto done;
+                       return (error);
                }
        }
 
@@ -3735,7 +4704,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
                        struct necp_client *find_client = NULL;
                        RB_FOREACH(find_client, _necp_client_tree, &fd_data->clients) {
                                NECP_CLIENT_LOCK(find_client);
-                               if (!find_client->result_read || !find_client->flow_result_read) {
+                               if (!find_client->result_read || necp_client_has_unread_flows(find_client)) {
                                        client = find_client;
                                        // Leave the client locked, and break
                                        break;
@@ -3749,7 +4718,7 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
 
        if (client != NULL) {
                // If client is set, it is locked
-               error = necp_client_copy_internal(client, FALSE, uap, retval);
+               error = necp_client_copy_internal(client, client_id, FALSE, uap, retval);
                NECP_CLIENT_UNLOCK(client);
        }
 
@@ -3765,16 +4734,11 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
 
                        bool found_client = FALSE;
 
-                       struct necp_client find;
-                       uuid_copy(find.client_id, client_id);
-                       client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
+                       client = necp_find_client_and_lock(client_id);
                        if (client != NULL) {
-                               NECP_CLIENT_LOCK(client);
-
                                // Matched, copy out data
                                found_client = TRUE;
-                               error = necp_client_copy_internal(client, TRUE, uap, retval);
-
+                               error = necp_client_copy_internal(client, client_id, TRUE, uap, retval);
                                NECP_CLIENT_UNLOCK(client);
                        }
 
@@ -3783,17 +4747,14 @@ necp_client_copy(struct necp_fd_data *fd_data, struct necp_client_action_args *u
 
                        // No client found, fail
                        if (!found_client) {
-                               error = ENOENT;
-                               goto done;
+                               return (ENOENT);
                        }
                } else {
                        // No client found, and not allowed to search other fds, fail
-                       error = ENOENT;
-                       goto done;
+                       return (ENOENT);
                }
        }
 
-done:
        return (error);
 }
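necp_find_client_and_lock() replaces the open-coded lookup deleted above (a throwaway `find` key on the stack, uuid_copy(), RB_FIND() against the global tree); the same <sys/tree.h> machinery also drives the new RB_FOREACH over flow_registrations earlier in this change. A self-contained sketch of just that lookup pattern, with the client/client_tree names standing in for the NECP structures (illustrative, not the kernel types):

#include <sys/tree.h>
#include <uuid/uuid.h>

struct client {
        RB_ENTRY(client) link;
        uuid_t client_id;
};

static int
client_cmp(struct client *a, struct client *b)
{
        return (uuid_compare(a->client_id, b->client_id));
}

RB_HEAD(client_tree, client);
RB_GENERATE(client_tree, client, link, client_cmp)

static struct client_tree clients = RB_INITIALIZER(&clients);

/* Look up a client by UUID using a temporary key on the stack. */
static struct client *
client_find(const uuid_t id)
{
        struct client find;

        uuid_copy(find.client_id, id);
        return (RB_FIND(client_tree, &clients, &find));
}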
 
@@ -3856,7 +4817,8 @@ necp_client_copy_client_update(struct necp_fd_data *fd_data, struct necp_client_
 }
 
 static int
-necp_client_copy_parameters_locked(struct necp_client *client, struct necp_client_nexus_parameters *parameters)
+necp_client_copy_parameters_locked(struct necp_client *client,
+                                                                  struct necp_client_nexus_parameters *parameters)
 {
        VERIFY(parameters != NULL);
 
@@ -3890,44 +4852,6 @@ necp_client_copy_parameters_locked(struct necp_client *client, struct necp_clien
        return (error);
 }
 
-int
-necp_client_copy_parameters(uuid_t client_id, struct necp_client_nexus_parameters *parameters)
-{
-       int error = 0;
-       struct necp_client *client = NULL;
-
-       if (parameters == NULL) {
-               return EINVAL;
-       }
-
-       // Lock tree
-       NECP_CLIENT_TREE_LOCK_SHARED();
-
-       bool found_client = FALSE;
-       struct necp_client find;
-       uuid_copy(find.client_id, client_id);
-       client = RB_FIND(_necp_client_global_tree, &necp_client_global_tree, &find);
-       if (client != NULL) {
-               NECP_CLIENT_LOCK(client);
-
-               // Matched, parse parameters
-               found_client = TRUE;
-               error = necp_client_copy_parameters_locked(client, parameters);
-
-               NECP_CLIENT_UNLOCK(client);
-       }
-
-       // Unlock tree
-       NECP_CLIENT_TREE_UNLOCK();
-
-       // No client found, fail
-       if (!found_client) {
-               return ENOENT;
-       }
-
-       return error;
-}
-
 static int
 necp_client_list(struct necp_fd_data *fd_data, struct necp_client_action_args *uap, int *retval)
 {
@@ -4145,6 +5069,7 @@ necp_client_agent_action(struct necp_fd_data *fd_data, struct necp_client_action
                                                error = netagent_client_message_with_params(agent_uuid,
                                                                                                                                        client_id,
                                                                                                                                        fd_data->proc_pid,
+                                                                                                                                       client->agent_handle,
                                                                                                                                        netagent_message_type,
                                                                                                                                        &parsed_parameters,
                                                                                                                                        NULL, NULL);
@@ -4316,6 +5241,12 @@ necp_client_copy_interface(__unused struct necp_fd_data *fd_data, struct necp_cl
                if ((interface->if_eflags & IFEF_NOACKPRI) == IFEF_NOACKPRI) {
                        interface_details.flags |= NECP_INTERFACE_FLAG_NOACKPRI;
                }
+               if ((interface->if_eflags & IFEF_3CA) == IFEF_3CA) {
+                       interface_details.flags |= NECP_INTERFACE_FLAG_3CARRIERAGG;
+               }
+               if (IFNET_IS_LOW_POWER(interface)) {
+                       interface_details.flags |= NECP_INTERFACE_FLAG_IS_LOW_POWER;
+               }
                interface_details.mtu = interface->if_mtu;
 
                u_int8_t ipv4_signature_len = sizeof(interface_details.ipv4_signature.signature);
@@ -4439,9 +5370,17 @@ necp_client_update_cache(struct necp_fd_data *fd_data, struct necp_client_action
                goto done;
        }
 
+       struct necp_client_flow_registration *flow_registration = necp_client_find_flow(client, client_id);
+       if (flow_registration == NULL) {
+               NECP_CLIENT_UNLOCK(client);
+               NECP_FD_UNLOCK(fd_data);
+               error = ENOENT;
+               goto done;
+       }
+
        NECP_CLIENT_ROUTE_LOCK(client);
        // This needs to be changed when TFO/ECN is supported by multiple flows
-       struct necp_client_flow *flow = LIST_FIRST(&client->flow_list);
+       struct necp_client_flow *flow = LIST_FIRST(&flow_registration->flow_list);
        if (flow == NULL ||
                (flow->remote_addr.sa.sa_family != AF_INET &&
                 flow->remote_addr.sa.sa_family != AF_INET6) ||
@@ -4608,7 +5547,7 @@ necp_match_policy(struct proc *p, struct necp_match_policy_args *uap, int32_t *r
 {
 #pragma unused(retval)
        u_int8_t *parameters = NULL;
-       struct necp_aggregate_result returned_result = {};
+       struct necp_aggregate_result returned_result;
        int error = 0;
 
        if (uap == NULL) {
@@ -4821,7 +5760,7 @@ done:
 void *
 necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, void *key, uint32_t key_length,
                                                                 struct necp_client_endpoint *local_endpoint, struct necp_client_endpoint *remote_endpoint,
-                                                                u_int32_t flow_adv_index, size_t *message_length)
+                                                                u_int32_t flow_adv_index, void *flow_stats, size_t *message_length)
 {
        u_int8_t *buffer = NULL;
        u_int8_t *cursor = NULL;
@@ -4846,6 +5785,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo
        if (remote_endpoint != NULL) {
                valsize += sizeof(struct necp_tlv_header) + sizeof(struct necp_client_endpoint);
        }
+       if (flow_stats != NULL) {
+               valsize += sizeof(struct necp_tlv_header) + sizeof(void *);
+       }
        if (valsize == 0) {
                return (NULL);
        }
@@ -4872,6 +5814,9 @@ necp_create_nexus_assign_message(uuid_t nexus_instance, u_int32_t nexus_port, vo
        if (remote_endpoint != NULL) {
                cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_REMOTE_ENDPOINT, sizeof(struct necp_client_endpoint), remote_endpoint, buffer, valsize);
        }
+       if (flow_stats != NULL) {
+               cursor = necp_buffer_write_tlv(cursor, NECP_CLIENT_RESULT_NEXUS_FLOW_STATS, sizeof(void *), &flow_stats, buffer, valsize);
+       }
 
        *message_length = valsize;
 
@@ -4949,6 +5894,13 @@ necp_client_init(void)
                /* NOTREACHED */
        }
 
+       necp_flow_registration_size = sizeof(struct necp_client_flow_registration);
+       necp_flow_registration_cache = mcache_create(NECP_FLOW_REGISTRATION_ZONE_NAME, necp_flow_registration_size, sizeof (uint64_t), 0, MCR_SLEEP);
+       if (necp_flow_registration_cache == NULL) {
+               panic("mcache_create(necp_client_flow_registration) failed\n");
+               /* NOTREACHED */
+       }
+
        necp_client_update_tcall = thread_call_allocate_with_options(necp_update_all_clients_callout, NULL,
                                                                                                                                 THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
        VERIFY(necp_client_update_tcall != NULL);
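The new per-flow-registration cache mirrors the existing necp_flow_cache: fixed-size objects come from a dedicated cache and are recycled, and necp_client_reap_caches() hands memory back under pressure. A rough user-space analogue of that idea, assuming nothing about the real mcache API beyond what is visible here (mcache_create()/mcache_reap_now()); the names and free-list scheme are illustrative only.

#include <stdlib.h>

/* A trivial fixed-size object cache: obj_size must be >= sizeof(void *). */
struct obj_cache {
        size_t  obj_size;
        void   *free_list;      /* freed objects, linked through their first word */
};

static void *
obj_cache_alloc(struct obj_cache *c)
{
        void *obj = c->free_list;

        if (obj != NULL) {
                c->free_list = *(void **)obj;   /* pop a recycled object */
                return (obj);
        }
        return (malloc(c->obj_size));           /* cold path: hit the allocator */
}

static void
obj_cache_free(struct obj_cache *c, void *obj)
{
        *(void **)obj = c->free_list;           /* push for reuse */
        c->free_list = obj;
}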
@@ -4956,13 +5908,15 @@ necp_client_init(void)
        lck_rw_init(&necp_fd_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
        lck_rw_init(&necp_observer_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
        lck_rw_init(&necp_client_tree_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
+    lck_rw_init(&necp_flow_tree_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
        lck_rw_init(&necp_collect_stats_list_lock, necp_fd_mtx_grp, necp_fd_mtx_attr);
 
        LIST_INIT(&necp_fd_list);
        LIST_INIT(&necp_fd_observer_list);
-       LIST_INIT(&necp_collect_stats_client_list);
+       LIST_INIT(&necp_collect_stats_flow_list);
 
        RB_INIT(&necp_client_global_tree);
+       RB_INIT(&necp_client_flow_global_tree);
 
        return (0);
 }
@@ -4971,5 +5925,6 @@ void
 necp_client_reap_caches(boolean_t purge)
 {
        mcache_reap_now(necp_flow_cache, purge);
+       mcache_reap_now(necp_flow_registration_cache, purge);
 }
 
index 366b801a3d4cd641004f3cdba0292c5a76453562..f7fd5a699e7fabab0a15de6ea19a6edd087e4ad4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -81,6 +81,7 @@
 #define        KEV_DL_AWDL_UNRESTRICTED                27
 #define        KEV_DL_RRC_STATE_CHANGED                28
 #define        KEV_DL_QOS_MODE_CHANGED                 29
+#define        KEV_DL_LOW_POWER_MODE_CHANGED           30
 
 #ifdef PRIVATE
 #define        KEV_NETPOLICY_SUBCLASS  3       /* Network policy subclass */
 #define        KEV_NETEVENT_SUBCLASS   11      /* Generic Net events subclass */
 /* KEV_NETEVENT_SUBCLASS event codes */
 #define        KEV_NETEVENT_APNFALLBACK                1
+#define        KEV_NETEVENT_CLAT46_EVENT               2
 
 #define        KEV_MPTCP_SUBCLASS      12      /* Global MPTCP events subclass */
 /* KEV_MPTCP_SUBCLASS event codes */
index 36385a019ed69e189d6c641469bc426af46b4416..24bc5426fd2f18590cdf4ccb8a38f080410dd9c2 100644 (file)
@@ -28,6 +28,8 @@
 
 #include <kern/debug.h>
 
+#if !NETWORKING
+
 #define        STUB(name)                                                      \
        int name(void);                                                 \
        int name(void)                                                  \
@@ -36,8 +38,6 @@
                return (0);                                             \
        }
 
-#if !NETWORKING
-
 STUB(bpf_attach);
 STUB(bpf_tap_in);
 STUB(bpf_tap_out);
@@ -350,6 +350,7 @@ STUB(ifnet_get_fastlane_capable);
 STUB(ifnet_get_unsent_bytes);
 STUB(ifnet_get_buffer_status);
 STUB(ifnet_normalise_unsent_data);
+STUB(ifnet_set_low_power_mode);
 STUB(in6_localaddr);
 STUB(in_localaddr);
 STUB(in6addr_local);
@@ -365,7 +366,6 @@ STUB(m_mtod);
 STUB(m_prepend_2);
 STUB(m_pullup);
 STUB(m_split);
-STUB(m_trailingspace);
 STUB(mbuf_get_driver_scratch);
 STUB(mbuf_get_unsent_data_bytes);
 STUB(mbuf_get_buffer_status);
@@ -461,13 +461,10 @@ STUB(sock_socket_internal);
 /*
  * Called from vm_pageout.c. Nothing to be done when there's no networking.
  */
-void m_drain(void);
-void m_drain(void)
+void mbuf_drain(boolean_t);
+void mbuf_drain(boolean_t)
 {
        return;
 }
 
-#else /* NETWORKING */
-
-
 #endif /* !NETWORKING */
index 392665f18c2d5b04a0eab98dd3cfbd9ba1acd18d..a52cd6506941c902af2a0da56c3a6e00d05f4f40 100644 (file)
@@ -417,6 +417,12 @@ netagent_send_error_response(struct netagent_session *session, u_int8_t message_
        int error = 0;
        u_int8_t *response = NULL;
        size_t response_size = sizeof(struct netagent_message_header);
+
+       if (session == NULL) {
+               NETAGENTLOG0(LOG_ERR, "Got a NULL session");
+               return (EINVAL);
+       }
+
        MALLOC(response, u_int8_t *, response_size, M_NETAGENT, M_WAITOK);
        if (response == NULL) {
                return (ENOMEM);
@@ -1038,7 +1044,7 @@ netagent_handle_update_inner(struct netagent_session *session, struct netagent_w
                search_client = NULL;
                temp_client = NULL;
                LIST_FOREACH_SAFE(search_client, &pending_triggers_list_copy, client_chain, temp_client) {
-                       necp_force_update_client(search_client->client_id, session->wrapper->netagent.netagent_uuid);
+                       necp_force_update_client(search_client->client_id, session->wrapper->netagent.netagent_uuid, session->wrapper->generation);
                        netagent_send_cellular_failed_event(new_wrapper, search_client->client_pid, search_client->client_proc_uuid);
                        LIST_REMOVE(search_client, client_chain);
                        FREE(search_client, M_NETAGENT);
@@ -1826,7 +1832,7 @@ netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *type)
                memcpy(domain, wrapper->netagent.netagent_domain, NETAGENT_DOMAINSIZE);
                memcpy(type, wrapper->netagent.netagent_type, NETAGENT_TYPESIZE);
        } else {
-               NETAGENTLOG0(LOG_DEBUG, "Type requested for invalid netagent");
+               NETAGENTLOG0(LOG_ERR, "Type requested for invalid netagent");
        }
        lck_rw_done(&netagent_lock);
 
@@ -1871,6 +1877,7 @@ int
 netagent_client_message_with_params(uuid_t agent_uuid,
                                                                        uuid_t necp_client_uuid,
                                                                        pid_t pid,
+                                                                       void *handle,
                                                                        u_int8_t message_type,
                                                                        struct necp_client_nexus_parameters *parameters,
                                                                        void **assigned_results,
@@ -1938,13 +1945,16 @@ netagent_client_message_with_params(uuid_t agent_uuid,
        }
 
        if (wrapper->control_unit == 0) {
-               should_unlock = FALSE;
-               lck_rw_done(&netagent_lock);
                if (wrapper->event_handler == NULL) {
                        // No event handler registered for kernel agent
                        error = EINVAL;
                } else {
-                       error = wrapper->event_handler(message_type, necp_client_uuid, pid, wrapper->event_context, parameters, assigned_results, assigned_results_length);
+                       // We hold the shared lock during the event handler callout, so it is expected
+                       // that the event handler will not lead to any registrations or unregistrations
+                       // of network agents.
+                       error = wrapper->event_handler(message_type, necp_client_uuid, pid, handle,
+                                                                                  wrapper->event_context, parameters,
+                                                                                  assigned_results, assigned_results_length);
                        if (error != 0) {
                                VERIFY(assigned_results == NULL || *assigned_results == NULL);
                                VERIFY(assigned_results_length == NULL || *assigned_results_length == 0);
@@ -1998,9 +2008,9 @@ done:
 }
 
 int
-netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, u_int8_t message_type)
+netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, void *handle, u_int8_t message_type)
 {
-       return (netagent_client_message_with_params(agent_uuid, necp_client_uuid, pid, message_type, NULL, NULL, NULL));
+       return (netagent_client_message_with_params(agent_uuid, necp_client_uuid, pid, handle, message_type, NULL, NULL, NULL));
 }
 
 int
index 3e2c864174455e6f1dcd7756a916f6f1fb1212b2..0eddfa2aaf4b02bb61e1bc0ce777f669c79d3340 100644 (file)
@@ -219,11 +219,12 @@ extern bool netagent_get_agent_domain_and_type(uuid_t uuid, char *domain, char *
 
 extern int netagent_kernel_trigger(uuid_t uuid);
 
-extern int netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, u_int8_t message_type);
+extern int netagent_client_message(uuid_t agent_uuid, uuid_t necp_client_uuid, pid_t pid, void *handle, u_int8_t message_type);
 
 extern int netagent_client_message_with_params(uuid_t agent_uuid,
                                                                                           uuid_t necp_client_uuid,
                                                                                           pid_t pid,
+                                                                                          void *handle,
                                                                                           u_int8_t message_type,
                                                                                           struct necp_client_nexus_parameters *parameters,
                                                                                           void **assigned_results,
@@ -248,7 +249,7 @@ struct netagent_nexus_agent {
 #define        NETAGENT_EVENT_NEXUS_FLOW_REMOVE                        NETAGENT_MESSAGE_TYPE_CLOSE_NEXUS
 #define        NETAGENT_EVENT_NEXUS_FLOW_ABORT                         NETAGENT_MESSAGE_TYPE_ABORT_NEXUS
 
-typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length);
+typedef errno_t (*netagent_event_f)(u_int8_t event, uuid_t necp_client_uuid, pid_t pid, void *necp_handle, void *context, struct necp_client_nexus_parameters *parameters, void **assigned_results, size_t *assigned_results_length);
 
 extern netagent_session_t netagent_create(netagent_event_f event_handler, void *handle);
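A sketch of a kernel nexus agent adopting the widened callback, based only on the netagent_event_f typedef and netagent_create() prototype shown above; the body and registration helper are illustrative. The extra necp_handle argument is the per-client opaque handle that netagent_client_message_with_params() now threads through to the handler.

static errno_t
my_agent_event(u_int8_t event, uuid_t necp_client_uuid, pid_t pid,
    void *necp_handle, void *context,
    struct necp_client_nexus_parameters *parameters,
    void **assigned_results, size_t *assigned_results_length)
{
#pragma unused(necp_client_uuid, pid, necp_handle, context, parameters)
        /* On failure the caller VERIFYs that no results were assigned. */
        if (assigned_results != NULL) {
                *assigned_results = NULL;
        }
        if (assigned_results_length != NULL) {
                *assigned_results_length = 0;
        }
        if (event == NETAGENT_EVENT_NEXUS_FLOW_REMOVE ||
            event == NETAGENT_EVENT_NEXUS_FLOW_ABORT) {
                return (0);     /* tear-down events need no results */
        }
        return (EINVAL);
}

static netagent_session_t my_agent_session;

static void
my_agent_register(void)
{
        my_agent_session = netagent_create(my_agent_event, NULL /* context */);
}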
 
index e370d7c937fcc5cdca5574a0794bee98dd0fe07e..eefb69aaf481e0d6d9c8bc299a25a34f624907ba 100644 (file)
@@ -3044,6 +3044,9 @@ nstat_sysinfo_send_data_internal(
                        nstat_set_keyval_scalar(&kv[i++],
                            NSTAT_SYSINFO_MPTCP_CELL_PROXY,
                            data->u.tcp_stats.mptcp_cell_proxy);
+                       nstat_set_keyval_scalar(&kv[i++],
+                           NSTAT_SYSINFO_MPTCP_TRIGGERED_CELL,
+                           data->u.tcp_stats.mptcp_triggered_cell);
                        VERIFY(i == nkeyvals);
                        break;
                }
@@ -5106,3 +5109,97 @@ nstat_control_send(
 }
 
 
+static int
+tcp_progress_indicators_for_interface(unsigned int ifindex, uint64_t recentflow_maxduration, struct xtcpprogress_indicators *indicators)
+{
+       int error = 0;
+       struct inpcb *inp;
+       uint64_t min_recent_start_time;
+
+       min_recent_start_time = mach_continuous_time() - recentflow_maxduration;
+       bzero(indicators, sizeof(*indicators));
+
+       lck_rw_lock_shared(tcbinfo.ipi_lock);
+       /*
+        * For progress indicators we don't need to special case TCP to collect time wait connections
+        */
+       LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list)
+       {
+               struct tcpcb  *tp = intotcpcb(inp);
+               if (tp && inp->inp_last_outifp &&
+                       inp->inp_last_outifp->if_index == ifindex &&
+                       inp->inp_state != INPCB_STATE_DEAD &&
+                       !(tp->t_flags & TF_LOCAL))
+               {
+                       struct tcp_conn_status connstatus;
+                       indicators->xp_numflows++;
+                       tcp_get_connectivity_status(tp, &connstatus);
+                       if (connstatus.write_probe_failed)
+                               indicators->xp_write_probe_fails++;
+                       if (connstatus.read_probe_failed)
+                               indicators->xp_read_probe_fails++;
+                       if (connstatus.conn_probe_failed)
+                               indicators->xp_conn_probe_fails++;
+                       if (inp->inp_start_timestamp > min_recent_start_time)
+                       {
+                               uint64_t flow_count;
+
+                               indicators->xp_recentflows++;
+                               atomic_get_64(flow_count, &inp->inp_stat->rxbytes);
+                               indicators->xp_recentflows_rxbytes += flow_count;
+                               atomic_get_64(flow_count, &inp->inp_stat->txbytes);
+                               indicators->xp_recentflows_txbytes += flow_count;
+
+                               indicators->xp_recentflows_rxooo += tp->t_stat.rxoutoforderbytes;
+                               indicators->xp_recentflows_rxdup += tp->t_stat.rxduplicatebytes;
+                               indicators->xp_recentflows_retx += tp->t_stat.txretransmitbytes;
+                               if (tp->snd_max - tp->snd_una)
+                               {
+                                       indicators->xp_recentflows_unacked++;
+                               }
+                       }
+               }
+       }
+       lck_rw_done(tcbinfo.ipi_lock);
+
+       return (error);
+}
+
+
+__private_extern__ int
+ntstat_tcp_progress_indicators(struct sysctl_req *req)
+{
+       struct xtcpprogress_indicators indicators = {};
+       int error = 0;
+       struct tcpprogressreq requested;
+
+       if (priv_check_cred(kauth_cred_get(), PRIV_NET_PRIVILEGED_NETWORK_STATISTICS, 0) != 0)
+       {
+               return EACCES;
+       }
+       if (req->newptr == USER_ADDR_NULL)
+       {
+               return EINVAL;
+       }
+       if (req->newlen < sizeof(requested))
+       {
+               return EINVAL;
+       }
+       error = SYSCTL_IN(req, &requested, sizeof(requested));
+       if (error != 0)
+       {
+               return error;
+       }
+       error = tcp_progress_indicators_for_interface(requested.ifindex, requested.recentflow_maxduration, &indicators);
+       if (error != 0)
+       {
+               return error;
+       }
+       error = SYSCTL_OUT(req, &indicators, sizeof(indicators));
+
+       return (error);
+}
+
+
+
+
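A user-space sketch of preparing the request that ntstat_tcp_progress_indicators() parses. Only the two fields read above (ifindex, recentflow_maxduration) are assumed; the real struct tcpprogressreq and the sysctl that reaches this handler live in private headers, so the struct here is a stand-in. Because the cutoff is compared against inp_start_timestamp, the duration is expressed in mach continuous time units.

#include <stdint.h>
#include <mach/mach_time.h>
#include <net/if.h>

struct tcp_progress_request {                   /* stand-in for struct tcpprogressreq */
        uint32_t ifindex;
        uint64_t recentflow_maxduration;        /* mach continuous time units */
};

static void
fill_progress_request(struct tcp_progress_request *req,
    const char *ifname, uint64_t window_seconds)
{
        mach_timebase_info_data_t tb;

        mach_timebase_info(&tb);
        req->ifindex = if_nametoindex(ifname);
        /* seconds -> nanoseconds -> mach ticks */
        req->recentflow_maxduration =
            (window_seconds * 1000000000ULL) * tb.denom / tb.numer;
}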
index af474f67a1c7203176cee9a7332fc00b2cd3e006..82577499f441e2f766fe5e1e665320eabacfe992 100644 (file)
@@ -316,9 +316,10 @@ enum
        ,NSTAT_SYSINFO_MPTCP_WIFI_PROXY = 184
        ,NSTAT_SYSINFO_MPTCP_CELL_PROXY = 185
        ,NSTAT_SYSINFO_ECN_IFNET_FALLBACK_SYNRST = 186
+       ,NSTAT_SYSINFO_MPTCP_TRIGGERED_CELL = 187
 
 // NSTAT_SYSINFO_ENUM_VERSION must be updated any time a value is added
-#define        NSTAT_SYSINFO_ENUM_VERSION      20170623
+#define        NSTAT_SYSINFO_ENUM_VERSION      20180416
 };
 
 #define        NSTAT_SYSINFO_API_FIRST NSTAT_SYSINFO_API_IF_FLTR_ATTACH
@@ -1058,9 +1059,11 @@ typedef struct nstat_sysinfo_tcp_stats
        u_int64_t               mptcp_aggregate_all_bytes;
        u_int32_t               mptcp_wifi_proxy;               /* Total number of new subflows that fell back to regular TCP on cell */
        u_int32_t               mptcp_cell_proxy;               /* Total number of new subflows that fell back to regular TCP on WiFi */
+       u_int32_t               mptcp_triggered_cell;           /* Total number of times an MPTCP-connection triggered cell bringup */
+       u_int32_t               _padding;
        /* When adding/removing here, also adjust NSTAT_SYSINFO_TCP_STATS_COUNT */
 } nstat_sysinfo_tcp_stats;
-#define NSTAT_SYSINFO_TCP_STATS_COUNT  70
+#define NSTAT_SYSINFO_TCP_STATS_COUNT  71
 
 enum {
        NSTAT_IFNET_ECN_PROTO_IPV4 = 1
@@ -1158,6 +1161,8 @@ void nstat_ifnet_threshold_reached(unsigned int ifindex);
 
 void nstat_sysinfo_send_data(struct nstat_sysinfo_data *);
 
+int ntstat_tcp_progress_indicators(struct sysctl_req *req);
+
 
 // Utilities for userland stats reporting
 u_int16_t nstat_ifnet_to_flags(struct ifnet *ifp);
index a09e7c74c2321ba4185d649975f8d0f89842a8c4..24d18870ab4c5f336ace558172d2c9f9bb13c377 100644 (file)
@@ -1068,6 +1068,7 @@ static void chksm_update(mbuf_t data)
        u_int16_t ip_sum;
        u_int16_t tsum;
        struct tcphdr *tcp;
+       errno_t err;
 
        unsigned char *ptr = (unsigned char *)mbuf_data(data);
        struct ip *ip = (struct ip *)(void *)ptr;
@@ -1076,16 +1077,17 @@ static void chksm_update(mbuf_t data)
        }
 
        ip->ip_sum = 0;
-       mbuf_inet_cksum(data, 0, 0, ip->ip_hl << 2, &ip_sum); // ip sum
-
-       ip->ip_sum = ip_sum;
+       err = mbuf_inet_cksum(data, 0, 0, ip->ip_hl << 2, &ip_sum); // ip sum
+       if (err == 0)
+               ip->ip_sum = ip_sum;
        switch (ip->ip_p) {
                case IPPROTO_TCP:
                        tcp = (struct tcphdr *)(void *)(ptr + (ip->ip_hl << 2));
                        tcp->th_sum = 0;
-                       mbuf_inet_cksum(data, IPPROTO_TCP, ip->ip_hl << 2,
+                       err = mbuf_inet_cksum(data, IPPROTO_TCP, ip->ip_hl << 2,
                            ntohs(ip->ip_len) - (ip->ip_hl << 2), &tsum);
-                       tcp->th_sum = tsum;
+                       if (err == 0)
+                               tcp->th_sum = tsum;
                        break;
                case IPPROTO_UDP:
                        /* Don't handle UDP */
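For reference, the one's-complement checksum that mbuf_inet_cksum() produces over the IP header and the TCP segment above, written out as a plain user-space routine (RFC 1071 style); the result is in host order and would be stored into the header with htons().

#include <stddef.h>
#include <stdint.h>

static uint16_t
inet_cksum(const void *data, size_t len)
{
        const uint8_t *p = data;
        uint32_t sum = 0;

        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint32_t)p[0] << 8;     /* pad the odd byte with zero */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);     /* fold carries */
        return ((uint16_t)~sum);
}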
index 0ddbf167a68358c2915779bab18ce0f1f3862cc0..70f1f906de53788de06356c0e6881cf165d4c2e0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2007-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2007-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -82,7 +82,6 @@
 #include <libkern/libkern.h>
 
 #include <mach/thread_act.h>
-#include <mach/branch_predicates.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
 #include <net/if_ether.h>
 #include <net/ethernet.h>
 #include <net/flowhash.h>
+#include <net/nat464_utils.h>
 #include <net/pfvar.h>
 #include <net/if_pflog.h>
 
@@ -2061,16 +2061,7 @@ pf_addr_wrap_neq(struct pf_addr_wrap *aw1, struct pf_addr_wrap *aw2)
 u_int16_t
 pf_cksum_fixup(u_int16_t cksum, u_int16_t old, u_int16_t new, u_int8_t udp)
 {
-       u_int32_t       l;
-
-       if (udp && !cksum)
-               return (0);
-       l = cksum + old - new;
-       l = (l >> 16) + (l & 0xffff);
-       l = l & 0xffff;
-       if (udp && !l)
-               return (0xffff);
-       return (l);
+       return (nat464_cksum_fixup(cksum, old, new, udp));
 }
 
 /*
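The body deleted above is what nat464_cksum_fixup() is expected to keep doing now that PF and the CLAT46 code share one helper: incrementally patch a one's-complement checksum when a 16-bit field changes from old to new, preserving the UDP convention that 0 means "no checksum". Kept here, lightly cleaned up, as a reference:

#include <stdint.h>

static uint16_t
cksum_fixup(uint16_t cksum, uint16_t old, uint16_t new, uint8_t udp)
{
        uint32_t l;

        if (udp && !cksum)
                return (0);                     /* UDP: zero means "no checksum" */
        l = cksum + old - new;
        l = (l >> 16) + (l & 0xffff);           /* fold the carry back in */
        l &= 0xffff;
        if (udp && !l)
                return (0xffff);                /* UDP: never produce zero */
        return ((uint16_t)l);
}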
@@ -2111,17 +2102,23 @@ pf_change_ap(int dir, pbuf_t *pbuf, struct pf_addr *a, u_int16_t *p,
                        ao.addr16[0], an->addr16[0], 0),
                        ao.addr16[1], an->addr16[1], 0);
                        *p = pn;
-               /*
-                * If the packet is originated from an ALG on the NAT gateway
-                * (source address is loopback or local), in which case the
-                * TCP/UDP checksum field contains the pseudo header checksum
-                * that's not yet complemented. A packet generated locally
-                * will have UDP/TCP CSUM flag set (gets set in protocol
-                * output).
-                */
+                       /*
+                        * If the packet is originated from an ALG on the NAT gateway
+                        * (source address is loopback or local), in which case the
+                        * TCP/UDP checksum field contains the pseudo header checksum
+                        * that's not yet complemented.
+                        * In that case we do not need to fixup the checksum for port
+                        * translation as the pseudo header checksum doesn't include ports.
+                        *
+                        * A packet generated locally will have UDP/TCP CSUM flag
+                        * set (gets set in protocol output).
+                        *
+                        * It should be noted that the fixup doesn't do anything if the
+                        * checksum is 0.
+                        */
                        if (dir == PF_OUT && pbuf != NULL &&
-                       (*pbuf->pb_csum_flags & (CSUM_TCP | CSUM_UDP))) {
-                       /* Pseudo-header checksum does not include ports */
+                           (*pbuf->pb_csum_flags & (CSUM_TCP | CSUM_UDP))) {
+                               /* Pseudo-header checksum does not include ports */
                                *pc = ~pf_cksum_fixup(pf_cksum_fixup(~*pc,
                                ao.addr16[0], an->addr16[0], u),
                                ao.addr16[1], an->addr16[1], u);
@@ -4062,7 +4059,16 @@ pf_calc_mss(struct pf_addr *addr, sa_family_t af, u_int16_t offer)
        }
 
        if (rt && rt->rt_ifp) {
-               mss = rt->rt_ifp->if_mtu - hlen - sizeof (struct tcphdr);
+                /* This is relevant only for PF SYN Proxy */
+               int interface_mtu = rt->rt_ifp->if_mtu;
+
+               if (af == AF_INET &&
+                   INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
+                       interface_mtu = IN6_LINKMTU(rt->rt_ifp);
+                       /* Further adjust the size for CLAT46 expansion */
+                       interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+               }
+               mss = interface_mtu - hlen - sizeof (struct tcphdr);
                mss = max(tcp_mssdflt, mss);
                rtfree(rt);
        }
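A worked version of the MSS adjustment, assuming CLAT46_HDR_EXPANSION_OVERHD is the IPv6-minus-IPv4 header size difference (40 - 20 = 20 bytes; the exact constant comes from net/nat464_utils.h). On a CLAT46 interface the SYN-proxy MSS has to leave room for the later 4-to-6 translation.

static int
clat46_syn_proxy_mss(int ipv6_link_mtu, int ip_hlen /* usually 20 */)
{
        int tcp_hlen = 20;                      /* sizeof(struct tcphdr) */
        int expansion = 40 - 20;                /* IPv6 header minus IPv4 header */
        int interface_mtu = ipv6_link_mtu - expansion;

        return (interface_mtu - ip_hlen - tcp_hlen);
        /* e.g. 1500 - 20 - 20 - 20 = 1440 */
}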
@@ -4483,10 +4489,10 @@ pf_nat64_ipv6(pbuf_t *pbuf, int off, struct pf_pdesc *pd)
        ip4->ip_hl  = 5;
        ip4->ip_tos = pd->tos & htonl(0x0ff00000);
        ip4->ip_len = htons(sizeof(*ip4) + (pd->tot_len - off));
-        ip4->ip_id  = 0;
-        ip4->ip_off = htons(IP_DF);
-        ip4->ip_ttl = pd->ttl;
-        ip4->ip_p   = pd->proto;
+       ip4->ip_id  = 0;
+       ip4->ip_off = htons(IP_DF);
+       ip4->ip_ttl = pd->ttl;
+       ip4->ip_p   = pd->proto;
        ip4->ip_sum = 0;
        ip4->ip_src = pd->naddr.v4addr;
        ip4->ip_dst = pd->ndaddr.v4addr;
@@ -4500,7 +4506,7 @@ pf_nat64_ipv6(pbuf_t *pbuf, int off, struct pf_pdesc *pd)
                icmp = (struct icmp *)pbuf_contig_segment(pbuf, hlen,
                    ICMP_MINLEN);
                if (icmp == NULL)
-                       return (PF_NAT64);
+                       return (PF_DROP);
 
                icmp->icmp_cksum = 0;
                icmp->icmp_cksum = pbuf_inet_cksum(pbuf, 0, hlen,
@@ -4628,11 +4634,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
                icmptype = pd->hdr.icmp->icmp_type;
                icmpcode = pd->hdr.icmp->icmp_code;
 
-               if (icmptype == ICMP_UNREACH ||
-                   icmptype == ICMP_SOURCEQUENCH ||
-                   icmptype == ICMP_REDIRECT ||
-                   icmptype == ICMP_TIMXCEED ||
-                   icmptype == ICMP_PARAMPROB)
+               if (ICMP_ERRORTYPE(icmptype))
                        state_icmp++;
                break;
 #endif /* INET */
@@ -4645,10 +4647,7 @@ pf_test_rule(struct pf_rule **rm, struct pf_state **sm, int direction,
                icmptype = pd->hdr.icmp6->icmp6_type;
                icmpcode = pd->hdr.icmp6->icmp6_code;
 
-               if (icmptype == ICMP6_DST_UNREACH ||
-                   icmptype == ICMP6_PACKET_TOO_BIG ||
-                   icmptype == ICMP6_TIME_EXCEEDED ||
-                   icmptype == ICMP6_PARAM_PROB)
+               if (ICMP6_ERRORTYPE(icmptype))
                        state_icmp++;
                break;
 #endif /* INET6 */
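The two macros substituted above are presumably just the open-coded tests they replace, i.e. "is this an ICMP/ICMPv6 error (rather than informational) message"; the real definitions live in the ICMP headers, but the equivalent is:

#define ICMP_ERRORTYPE(type) \
        ((type) == ICMP_UNREACH || (type) == ICMP_SOURCEQUENCH || \
         (type) == ICMP_REDIRECT || (type) == ICMP_TIMXCEED || \
         (type) == ICMP_PARAMPROB)

#define ICMP6_ERRORTYPE(type) \
        ((type) == ICMP6_DST_UNREACH || (type) == ICMP6_PACKET_TOO_BIG || \
         (type) == ICMP6_TIME_EXCEEDED || (type) == ICMP6_PARAM_PROB)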
@@ -7374,11 +7373,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
                icmpid = pd->hdr.icmp->icmp_id;
                icmpsum = &pd->hdr.icmp->icmp_cksum;
 
-               if (icmptype == ICMP_UNREACH ||
-                   icmptype == ICMP_SOURCEQUENCH ||
-                   icmptype == ICMP_REDIRECT ||
-                   icmptype == ICMP_TIMXCEED ||
-                   icmptype == ICMP_PARAMPROB)
+               if (ICMP_ERRORTYPE(icmptype))
                        state_icmp++;
                break;
 #endif /* INET */
@@ -7388,10 +7383,7 @@ pf_test_state_icmp(struct pf_state **state, int direction, struct pfi_kif *kif,
                icmpid = pd->hdr.icmp6->icmp6_id;
                icmpsum = &pd->hdr.icmp6->icmp6_cksum;
 
-               if (icmptype == ICMP6_DST_UNREACH ||
-                   icmptype == ICMP6_PACKET_TOO_BIG ||
-                   icmptype == ICMP6_TIME_EXCEEDED ||
-                   icmptype == ICMP6_PARAM_PROB)
+               if (ICMP6_ERRORTYPE(icmptype))
                        state_icmp++;
                break;
 #endif /* INET6 */
@@ -8735,7 +8727,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
        struct pf_src_node      *sn = NULL;
        int                      error = 0;
        uint32_t                 sw_csum;
-
+       int                      interface_mtu = 0;
        bzero(&iproute, sizeof (iproute));
 
        if (pbufp == NULL || !pbuf_is_valid(*pbufp) || r == NULL ||
@@ -8837,7 +8829,15 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
        ip_output_checksum(ifp, m0, ((ip->ip_hl) << 2), ntohs(ip->ip_len),
            &sw_csum);
 
-       if (ntohs(ip->ip_len) <= ifp->if_mtu || TSO_IPV4_OK(ifp, m0) ||
+       interface_mtu = ifp->if_mtu;
+
+       if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+               interface_mtu = IN6_LINKMTU(ifp);
+               /* Further adjust the size for CLAT46 expansion */
+               interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+       }
+
+       if (ntohs(ip->ip_len) <= interface_mtu || TSO_IPV4_OK(ifp, m0) ||
            (!(ip->ip_off & htons(IP_DF)) &&
            (ifp->if_hwassist & CSUM_FRAGMENT))) {
                ip->ip_sum = 0;
@@ -8860,7 +8860,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
                ipstat.ips_cantfrag++;
                if (r->rt != PF_DUPTO) {
                        icmp_error(m0, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0,
-                           ifp->if_mtu);
+                           interface_mtu);
                        goto done;
                } else
                        goto bad;
@@ -8873,7 +8873,7 @@ pf_route(pbuf_t **pbufp, struct pf_rule *r, int dir, struct ifnet *oifp,
        NTOHS(ip->ip_off);
        NTOHS(ip->ip_len);
 #endif
-       error = ip_fragment(m0, ifp, ifp->if_mtu, sw_csum);
+       error = ip_fragment(m0, ifp, interface_mtu, sw_csum);
 
        if (error) {
                m0 = NULL;
index 977751814169994df197c18fd2d9c2428bdbc74f..395568c48b1d09b35f62f8d8fd548b4941247e29 100644 (file)
@@ -459,6 +459,7 @@ pfinit(void)
        _CASSERT((SC_AV & SCIDX_MASK) == SCIDX_AV);
        _CASSERT((SC_RV & SCIDX_MASK) == SCIDX_RV);
        _CASSERT((SC_VI & SCIDX_MASK) == SCIDX_VI);
+       _CASSERT((SC_SIG & SCIDX_MASK) == SCIDX_SIG);
        _CASSERT((SC_VO & SCIDX_MASK) == SCIDX_VO);
        _CASSERT((SC_CTL & SCIDX_MASK) == SCIDX_CTL);
 
index a5b69b22617e0650d279a8493763959af0840941..86cc47c3bcfe80262f6b5842a61555898d9ad9a1 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -257,8 +257,7 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen)
                }
 
                pbuf_sync(pbuf);
-       } else
-       if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
+       } else if (pbuf->pb_type == PBUF_TYPE_MEMORY) {
                struct pbuf_memory *nm = &pbuf->pb_memory;
                u_int true_offset, move_len;
                int delta_len;
@@ -280,9 +279,9 @@ pbuf_resize_segment(pbuf_t *pbuf, int off, int olen, int nlen)
                VERIFY((nm->pm_len + nm->pm_offset) <= nm->pm_buffer_len);
 
                pbuf_sync(pbuf);
-       } else
+       } else {
                panic("pbuf_csum_flags_get: bad pb_type: %d", pbuf->pb_type);
-
+       }
        return (rv);
 }
 
@@ -293,7 +292,7 @@ pbuf_contig_segment(pbuf_t *pbuf, int off, int len)
 
        VERIFY(off >= 0);
        VERIFY(len >= 0);
-       VERIFY((u_int)(off + len) < pbuf->pb_packet_len);
+       VERIFY((u_int)(off + len) <= pbuf->pb_packet_len);
 
        /*
         * Note: If this fails, then the pbuf is destroyed. This is a
@@ -301,7 +300,6 @@ pbuf_contig_segment(pbuf_t *pbuf, int off, int len)
         *
         * PF expects this behaviour so it's not a real problem.
         */
-
        if (pbuf->pb_type == PBUF_TYPE_MBUF) {
                struct mbuf *n;
                int moff;
index 55c7f0aa8c624e43e9b237705f753b883387150f..ec6d0333aabb3068975eb739d27c110069fd4748 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
@@ -68,7 +68,7 @@ typedef struct pbuf {
        uint32_t        pb_packet_len;
        uint32_t        pb_contig_len;
        uint32_t        *pb_csum_flags;
-       uint32_t        *pb_csum_data;
+       uint32_t        *pb_csum_data;    /* data field used by csum routines */
        uint8_t         *pb_proto;
        uint8_t         *pb_flowsrc;
        uint32_t        *pb_flowid;
@@ -76,6 +76,7 @@ typedef struct pbuf {
        struct pf_mtag  *pb_pftag;
        struct ifnet    *pb_ifp;
        struct pbuf     *pb_next;
+
 } pbuf_t;
 
 #define pbuf_is_valid(pb) (!((pb) == NULL || (pb)->pb_type == PBUF_TYPE_ZOMBIE))
index d7ea4c6d1b1eb5efab7155c0989c6471a54f427d..8b6f61cedfaf8127723e549ab2eccf70068640a8 100644 (file)
@@ -780,6 +780,7 @@ struct pf_rule {
 #define        SC_AV                   0x15
 #define        SC_RV                   0x16
 #define        SC_VI                   0x17
+#define        SC_SIG                  0x17
 #define        SC_VO                   0x18
 #define        SC_CTL                  0x19
 
index 66616a73f11b636c55daeb0c5f4655a13bdf4f9c..41da2471dc192cc2744a8c9247021762c0934a72 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -27,6 +27,7 @@
  */
 
 #include <kern/locks.h>
+#include <kern/zalloc.h>
 
 #include <sys/types.h>
 #include <sys/kernel_types.h>
@@ -127,12 +128,17 @@ static LIST_HEAD(pktap_list, pktap_softc) pktap_list =
 int pktap_clone_create(struct if_clone *, u_int32_t, void *);
 int pktap_clone_destroy(struct ifnet *);
 
+#define        PKTAP_MAXUNIT   IF_MAXUNIT
+#define        PKTAP_ZONE_MAX_ELEM     MIN(IFNETS_MAX, PKTAP_MAXUNIT)
+
 static struct if_clone pktap_cloner =
        IF_CLONE_INITIALIZER(PKTAP_IFNAME,
                pktap_clone_create,
                pktap_clone_destroy,
                0,
-               IF_MAXUNIT);
+               PKTAP_MAXUNIT,
+               PKTAP_ZONE_MAX_ELEM,
+               sizeof(struct pktap_softc));
 
 errno_t pktap_if_output(ifnet_t, mbuf_t);
 errno_t pktap_demux(ifnet_t, mbuf_t, char *, protocol_family_t *);
@@ -175,12 +181,17 @@ pktap_hexdump(int mask, void *addr, size_t len)
                printf("\n");
 }
 
+#define _CASSERT_OFFFSETOF_FIELD(s1, s2, f) \
+       _CASSERT(offsetof(struct s1, f) == offsetof(struct s2, f))
+
 __private_extern__ void
 pktap_init(void)
 {
        int error = 0;
        lck_grp_attr_t *lck_grp_attr = NULL;
 
+       _CASSERT_OFFFSETOF_FIELD(pktap_header, pktap_v2_hdr, pth_flags);
+
        /* Make sure we're called only once */
        VERIFY(pktap_inited == 0);
 
@@ -212,8 +223,7 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
 
        PKTAP_LOG(PKTP_LOG_FUNC, "unit %u\n", unit);
 
-       pktap = _MALLOC(sizeof(struct pktap_softc), M_DEVBUF,
-           M_WAITOK | M_ZERO);
+       pktap = if_clone_softc_allocate(&pktap_cloner);
        if (pktap == NULL) {
                printf("%s: _MALLOC failed\n", __func__);
                error = ENOMEM;
@@ -291,10 +301,8 @@ pktap_clone_create(struct if_clone *ifc, u_int32_t unit, __unused void *params)
        LIST_INSERT_HEAD(&pktap_list, pktap, pktp_link);
        lck_rw_done(pktap_lck_rw);
 done:
-       if (error != 0) {
-               if (pktap != NULL)
-                       _FREE(pktap, M_DEVBUF);
-       }
+       if (error != 0 && pktap != NULL)
+               if_clone_softc_deallocate(&pktap_cloner, pktap);
        return (error);
 }
 
@@ -682,8 +690,7 @@ pktap_detach(ifnet_t ifp)
        /* Drop reference as it's no more on the global list */
        ifnet_release(ifp);
 
-       _FREE(pktap, M_DEVBUF);
-
+       if_clone_softc_deallocate(&pktap_cloner, pktap);
        /* This is for the reference taken by ifnet_attach() */
        (void) ifnet_release(ifp);
 }
@@ -766,16 +773,15 @@ static void
 pktap_set_procinfo(struct pktap_header *hdr, struct so_procinfo *soprocinfo)
 {
        hdr->pth_pid = soprocinfo->spi_pid;
-       proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN);
+       if (hdr->pth_comm[0] == 0)
+               proc_name(soprocinfo->spi_pid, hdr->pth_comm, MAXCOMLEN);
        if (soprocinfo->spi_pid != 0)
                uuid_copy(hdr->pth_uuid, soprocinfo->spi_uuid);
 
-       /*
-        * When not delegated, the effective pid is the same as the real pid
-        */
        if (soprocinfo->spi_delegated != 0) {
                hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED;
                hdr->pth_epid = soprocinfo->spi_epid;
+               if (hdr->pth_ecomm[0] == 0)
                proc_name(soprocinfo->spi_epid, hdr->pth_ecomm, MAXCOMLEN);
                uuid_copy(hdr->pth_euuid, soprocinfo->spi_euuid);
        }
@@ -790,11 +796,6 @@ pktap_finalize_proc_info(struct pktap_header *hdr)
        if (!(hdr->pth_flags & PTH_FLAG_DELAY_PKTAP))
                return;
 
-       /*
-        * Clear the flag as it's internal
-        */
-       hdr->pth_flags &= ~PTH_FLAG_DELAY_PKTAP;
-
        if (hdr->pth_ipproto == IPPROTO_TCP)
                found = inp_findinpcb_procinfo(&tcbinfo, hdr->pth_flowid,
                    &soprocinfo);
@@ -809,13 +810,83 @@ pktap_finalize_proc_info(struct pktap_header *hdr)
                pktap_set_procinfo(hdr, &soprocinfo);
 }
 
+static void
+pktap_v2_set_procinfo(struct pktap_v2_hdr *pktap_v2_hdr,
+    struct so_procinfo *soprocinfo)
+{
+       pktap_v2_hdr->pth_pid = soprocinfo->spi_pid;
+
+       if (soprocinfo->spi_pid != 0 && soprocinfo->spi_pid != -1) {
+               if (pktap_v2_hdr->pth_comm_offset != 0) {
+                       char *ptr = ((char *)pktap_v2_hdr) +
+                           pktap_v2_hdr->pth_comm_offset;
+
+                       proc_name(soprocinfo->spi_pid,
+                           ptr, PKTAP_MAX_COMM_SIZE);
+               }
+               if (pktap_v2_hdr->pth_uuid_offset != 0) {
+                       uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) +
+                           pktap_v2_hdr->pth_uuid_offset);
+
+                       uuid_copy(*ptr, soprocinfo->spi_uuid);
+               }
+       }
+
+       if (!(pktap_v2_hdr->pth_flags & PTH_FLAG_PROC_DELEGATED))
+               return;
+
+       /*
+        * The effective UUID may be set independently from the effective pid
+        */
+       if (soprocinfo->spi_delegated != 0) {
+               pktap_v2_hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED;
+               pktap_v2_hdr->pth_e_pid = soprocinfo->spi_epid;
+
+               if (soprocinfo->spi_pid != 0 && soprocinfo->spi_pid != -1 &&
+                   pktap_v2_hdr->pth_e_comm_offset != 0) {
+                       char *ptr = ((char *)pktap_v2_hdr) +
+                           pktap_v2_hdr->pth_e_comm_offset;
+
+                       proc_name(soprocinfo->spi_epid,
+                           ptr, PKTAP_MAX_COMM_SIZE);
+               }
+               if (pktap_v2_hdr->pth_e_uuid_offset != 0) {
+                       uuid_t *ptr = (uuid_t *) (((char *)pktap_v2_hdr) +
+                           pktap_v2_hdr->pth_e_uuid_offset);
+
+                       uuid_copy(*ptr, soprocinfo->spi_euuid);
+               }
+       }
+}
+
 __private_extern__ void
-pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto,
-       struct mbuf *m, u_int32_t pre, int outgoing, struct ifnet *ifp)
+pktap_v2_finalize_proc_info(struct pktap_v2_hdr *pktap_v2_hdr)
 {
-       int found = 0;
+       int found;
        struct so_procinfo soprocinfo;
 
+       if (!(pktap_v2_hdr->pth_flags & PTH_FLAG_DELAY_PKTAP))
+               return;
+
+       if (pktap_v2_hdr->pth_ipproto == IPPROTO_TCP) {
+               found = inp_findinpcb_procinfo(&tcbinfo,
+                   pktap_v2_hdr->pth_flowid, &soprocinfo);
+       } else if (pktap_v2_hdr->pth_ipproto == IPPROTO_UDP) {
+               found = inp_findinpcb_procinfo(&udbinfo,
+                   pktap_v2_hdr->pth_flowid, &soprocinfo);
+       } else {
+               found = inp_findinpcb_procinfo(&ripcbinfo,
+                   pktap_v2_hdr->pth_flowid, &soprocinfo);
+       }
+       if (found == 1) {
+               pktap_v2_set_procinfo(pktap_v2_hdr, &soprocinfo);
+       }
+}
+
+__private_extern__ void
+pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto,
+       struct mbuf *m, u_int32_t pre, int outgoing, struct ifnet *ifp)
+{
        /*
         * Getting the pid and procname is expensive
         * For outgoing, do the lookup only if there's an
@@ -823,22 +894,54 @@ pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto,
         */
        if (outgoing != 0 && m->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
                /*
-                * To avoid lock ordering issues we delay the process lookup
+                * To avoid lock ordering issues we delay the proc UUID lookup
                 * to the BPF read as we cannot
                 * assume the socket lock is unlocked on output
                 */
-               found = 0;
                hdr->pth_flags |= PTH_FLAG_DELAY_PKTAP;
+               hdr->pth_flags |= PTH_FLAG_SOCKET;
                hdr->pth_flowid = m->m_pkthdr.pkt_flowid;
-               if (m->m_pkthdr.pkt_flags & PKTF_FLOW_RAWSOCK)
+
+               if (m->m_pkthdr.pkt_flags & PKTF_FLOW_RAWSOCK) {
                        hdr->pth_ipproto = IPPROTO_RAW;
-               else            
+               } else {
                        hdr->pth_ipproto = m->m_pkthdr.pkt_proto;
-               if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW)
+               }
+
+               if (hdr->pth_ipproto == IPPROTO_TCP) {
+                       hdr->pth_pid = m->m_pkthdr.tx_tcp_pid;
+                       hdr->pth_epid = m->m_pkthdr.tx_tcp_e_pid;
+               } else if (hdr->pth_ipproto == IPPROTO_UDP) {
+                       hdr->pth_pid = m->m_pkthdr.tx_udp_pid;
+                       hdr->pth_epid = m->m_pkthdr.tx_udp_e_pid;
+               } else if (hdr->pth_ipproto == IPPROTO_RAW) {
+                       hdr->pth_pid = m->m_pkthdr.tx_rawip_pid;
+                       hdr->pth_epid = m->m_pkthdr.tx_rawip_e_pid;
+               }
+
+               if (hdr->pth_pid != 0 && hdr->pth_pid != -1) {
+                       proc_name(hdr->pth_pid, hdr->pth_comm, MAXCOMLEN);
+               } else {
+                       hdr->pth_pid = -1;
+               }
+
+               if (hdr->pth_epid != 0 && hdr->pth_epid != -1) {
+                       hdr->pth_flags |= PTH_FLAG_PROC_DELEGATED;
+                       proc_name(hdr->pth_epid, hdr->pth_ecomm, MAXCOMLEN);
+               } else {
+                       hdr->pth_epid = -1;
+               }
+
+               if (m->m_pkthdr.pkt_flags & PKTF_NEW_FLOW) {
                        hdr->pth_flags |= PTH_FLAG_NEW_FLOW;
+               }
        } else if (outgoing == 0) {
+               int found = 0;
+               struct so_procinfo soprocinfo;
                struct inpcb *inp = NULL;
 
+               memset(&soprocinfo, 0, sizeof(struct so_procinfo));
+
                if (proto == PF_INET) {
                        struct ip ip;
                        errno_t error;
@@ -969,22 +1072,24 @@ pktap_fill_proc_info(struct pktap_header *hdr, protocol_family_t proto,
                        }
                }
                if (inp != NULL) {
+                       hdr->pth_flags |= PTH_FLAG_SOCKET;
                        if (inp->inp_state != INPCB_STATE_DEAD && inp->inp_socket != NULL) {
                                found = 1;
                                inp_get_soprocinfo(inp, &soprocinfo);
                        }
                        in_pcb_checkstate(inp, WNT_RELEASE, 0);
                }
-       }
 done:
-       /*
-        * -1 means PID not found
-        */
-       hdr->pth_pid = -1;
-       hdr->pth_epid = -1;
+               /*
+                * -1 means PID not found
+                */
+               hdr->pth_pid = -1;
+               hdr->pth_epid = -1;
+
        if (found != 0)
                pktap_set_procinfo(hdr, &soprocinfo);
 }
+}
 
 __private_extern__ void
 pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
@@ -994,7 +1099,6 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
        void (*bpf_tap_func)(ifnet_t, u_int32_t, mbuf_t, void *, size_t) =
                outgoing ? bpf_tap_out : bpf_tap_in;
 
-
        /*
         * Skip the coprocessor interface
         */
@@ -1084,7 +1188,8 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
                                        hdr->pth_dlt = DLT_APPLE_IP_OVER_IEEE1394;
                                        break;
                                case IFT_OTHER:
-                                       if (strncmp(ifp->if_name, "utun", strlen("utun")) == 0) {
+                                       if (ifp->if_subfamily == IFNET_SUBFAMILY_IPSEC ||
+                                           ifp->if_subfamily == IFNET_SUBFAMILY_UTUN) {
                                                /*
                                                 * For utun:
                                                 * - incoming packets do not have the prefix set to four
@@ -1141,6 +1246,11 @@ pktap_bpf_tap(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
                                hdr->pth_iftype = ifp->if_type;
                                hdr->pth_ifunit = ifp->if_unit;
 
+                               if (m->m_pkthdr.pkt_flags & PKTF_KEEPALIVE)
+                                       hdr->pth_flags |= PTH_FLAG_KEEP_ALIVE;
+                               if (m->m_pkthdr.pkt_flags & PKTF_TCP_REXMT)
+                                       hdr->pth_flags |= PTH_FLAG_REXMIT;
+
                                pktap_fill_proc_info(hdr, proto, m, pre, outgoing, ifp);
 
                                hdr->pth_svc = so_svc2tc(m->m_pkthdr.pkt_svc);
@@ -1212,3 +1322,163 @@ pktap_output(struct ifnet *ifp, protocol_family_t proto, struct mbuf *m,
        pktap_bpf_tap(ifp, proto, m, pre, post, 1);
 }
 
+
+void
+convert_to_pktap_header_to_v2(struct bpf_packet *bpf_pkt, bool truncate)
+{
+       struct pktap_header *pktap_header;
+       size_t extra_src_size;
+       struct pktap_buffer_v2_hdr_extra pktap_buffer_v2_hdr_extra;
+       struct pktap_v2_hdr_space *pktap_v2_hdr_space;
+       struct pktap_v2_hdr *pktap_v2_hdr;
+       uint8_t *ptr;
+
+       pktap_header = (struct pktap_header *)bpf_pkt->bpfp_header;
+
+       if (pktap_header->pth_type_next != PTH_TYPE_PACKET) {
+               return;
+       }
+
+       VERIFY(bpf_pkt->bpfp_header_length >= sizeof(struct pktap_header));
+
+       /*
+        * extra_src_size is the length of the optional link layer header
+        */
+       extra_src_size = bpf_pkt->bpfp_header_length -
+           sizeof(struct pktap_header);
+
+       VERIFY(extra_src_size <= sizeof(union pktap_header_extra));
+
+       pktap_v2_hdr_space = &pktap_buffer_v2_hdr_extra.hdr_space;
+       pktap_v2_hdr = &pktap_v2_hdr_space->pth_hdr;
+       ptr = (uint8_t *) (pktap_v2_hdr + 1);
+
+       COPY_PKTAP_COMMON_FIELDS_TO_V2(pktap_v2_hdr, pktap_header);
+
+       /*
+        * When truncating don't bother with the process UUIDs
+        */
+       if (!truncate) {
+               if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) {
+                       pktap_v2_hdr->pth_uuid_offset = pktap_v2_hdr->pth_length;
+                       pktap_v2_hdr->pth_length += sizeof(uuid_t);
+                       uuid_clear(*(uuid_t *)ptr);
+                       ptr += sizeof(uuid_t);
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               } else if (!uuid_is_null(pktap_header->pth_uuid)) {
+                       pktap_v2_hdr->pth_uuid_offset = pktap_v2_hdr->pth_length;
+                       uuid_copy(*(uuid_t *)ptr, pktap_header->pth_uuid);
+                       pktap_v2_hdr->pth_length += sizeof(uuid_t);
+                       ptr += sizeof(uuid_t);
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               }
+
+               if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) {
+                       if (pktap_header->pth_flags & PTH_FLAG_PROC_DELEGATED) {
+                               pktap_v2_hdr->pth_e_uuid_offset = pktap_v2_hdr->pth_length;
+                               uuid_clear(*(uuid_t *)ptr);
+                               pktap_v2_hdr->pth_length += sizeof(uuid_t);
+                               ptr += sizeof(uuid_t);
+                               VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+                       }
+               } else if (!uuid_is_null(pktap_header->pth_euuid)) {
+                       pktap_v2_hdr->pth_e_uuid_offset = pktap_v2_hdr->pth_length;
+                       uuid_copy(*(uuid_t *)ptr, pktap_header->pth_euuid);
+                       pktap_v2_hdr->pth_length += sizeof(uuid_t);
+                       ptr += sizeof(uuid_t);
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               }
+       }
+
+       if (pktap_header->pth_ifname[0] != 0) {
+               size_t strsize;
+
+               pktap_v2_hdr->pth_ifname_offset = pktap_v2_hdr->pth_length;
+
+               /*
+                * Note: strlcpy() returns the length of the source string, so we
+                * add one for the terminating NUL
+                */
+               strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_ifname,
+                   sizeof(pktap_v2_hdr_space->pth_ifname));
+               pktap_v2_hdr->pth_length += strsize;
+               ptr += strsize;
+               VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+       }
+
+       /*
+        * Do not waste space with the process name if we do not have a pid
+        */
+       if (pktap_header->pth_pid != 0 && pktap_header->pth_pid != -1) {
+               if (pktap_header->pth_comm[0] != 0) {
+                       size_t strsize;
+
+                       pktap_v2_hdr->pth_comm_offset = pktap_v2_hdr->pth_length;
+
+                       strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_comm,
+                           sizeof(pktap_v2_hdr_space->pth_comm));
+                       pktap_v2_hdr->pth_length += strsize;
+                       ptr += strsize;
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               } else if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) {
+                       size_t strsize = sizeof(pktap_v2_hdr_space->pth_comm);
+
+                       pktap_v2_hdr->pth_comm_offset = pktap_v2_hdr->pth_length;
+
+                       *ptr = 0;       /* empty string by default */
+                       pktap_v2_hdr->pth_length += strsize;
+                       ptr += strsize;
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               }
+       }
+
+       /*
+        * Do not waste space with the effective process name if we do not have
+        * an effective pid or it's the same as the pid
+        */
+       if (pktap_header->pth_epid != 0 && pktap_header->pth_epid != -1 &&
+           pktap_header->pth_epid != pktap_header->pth_pid) {
+               if (pktap_header->pth_ecomm[0] != 0) {
+                       size_t strsize;
+
+                       pktap_v2_hdr->pth_e_comm_offset = pktap_v2_hdr->pth_length;
+
+                       strsize = 1 + strlcpy((char *)ptr, pktap_header->pth_ecomm,
+                           sizeof(pktap_v2_hdr_space->pth_e_comm));
+                       pktap_v2_hdr->pth_length += strsize;
+                       ptr += strsize;
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               } else if ((pktap_header->pth_flags & PTH_FLAG_DELAY_PKTAP)) {
+                       size_t strsize = sizeof(pktap_v2_hdr_space->pth_e_comm);
+
+                       pktap_v2_hdr->pth_e_comm_offset = pktap_v2_hdr->pth_length;
+                       *ptr = 0;       /* empty string by default */
+                       pktap_v2_hdr->pth_length += strsize;
+                       ptr += strsize;
+                       VERIFY((void *)ptr < (void *)(pktap_v2_hdr_space + 1));
+               }
+       }
+
+       if (extra_src_size > 0) {
+               char *extra_src_ptr = (char *)(pktap_header + 1);
+               char *extra_dst_ptr = ((char *)pktap_v2_hdr) +
+                   pktap_v2_hdr->pth_length;
+
+               VERIFY(pktap_v2_hdr->pth_length + extra_src_size <=
+                   sizeof(struct pktap_buffer_v2_hdr_extra));
+
+               memcpy(extra_dst_ptr, extra_src_ptr, extra_src_size);
+       }
+
+       VERIFY(pktap_v2_hdr->pth_length + extra_src_size <=
+           bpf_pkt->bpfp_header_length);
+
+       memcpy(bpf_pkt->bpfp_header, pktap_v2_hdr,
+           pktap_v2_hdr->pth_length + extra_src_size);
+
+       bpf_pkt->bpfp_total_length += pktap_v2_hdr->pth_length -
+           sizeof(struct pktap_header);
+       bpf_pkt->bpfp_header_length += pktap_v2_hdr->pth_length -
+           sizeof(struct pktap_header);
+}
+
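Each optional field appended by convert_to_pktap_header_to_v2() follows the same pattern: record the current pth_length as the field's offset, copy the field into the trailer, then advance pth_length and the write pointer by the field's size (for strings, the copied length plus one for the terminating NUL). The sketch below illustrates that append step for a string field; the function name is hypothetical and it assumes the BSD/Darwin strlcpy() and a source string that fits within maxlen, as the kernel guarantees by sizing the trailer fields.

    #include <stdint.h>
    #include <string.h>

    /*
     * Append a NUL-terminated string to the trailer of a growing header.
     * 'base' points at the start of the header, '*lenp' is the running
     * pth_length; the return value is the offset to record in the header.
     */
    static uint8_t
    append_string_field(uint8_t *base, uint8_t *lenp, const char *src, size_t maxlen)
    {
            uint8_t offset = *lenp;
            /* Assumes src fits in maxlen, so 'copied' bytes plus a NUL were written. */
            size_t copied = strlcpy((char *)(base + offset), src, maxlen);

            /* strlcpy() returns the source length, so add one for the NUL. */
            *lenp += (uint8_t)(copied + 1);
            return offset;
    }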
index 74b0b5bd13bd26faf2dc30c5e23a871bbe6bccb0..25ed642fdd3a8532d97dfb540e658c473062a656 100644 (file)
@@ -33,6 +33,7 @@
 #include <stdint.h>
 #include <net/if.h>
 #include <uuid/uuid.h>
+#include <string.h>
 
 #ifdef PRIVATE
 
@@ -123,23 +124,113 @@ struct pktap_header {
 };
 
 /*
- *
+ * The original version 1 of the pktap_header structure always had the field
+ * pth_type_next set to PTH_TYPE_PACKET
  */
 #define PTH_TYPE_NONE  0               /* No more data following */
 #define PTH_TYPE_PACKET        1               /* Actual captured packet data */
 
-#define PTH_FLAG_DIR_IN                        0x0001  /* Outgoing packet */
-#define PTH_FLAG_DIR_OUT               0x0002  /* Incoming packet */
-#define PTH_FLAG_PROC_DELEGATED                0x0004  /* Process delegated */
-#define PTH_FLAG_IF_DELEGATED          0x0008  /* Interface delegated */
+/*
+ * Size of a buffer that can contain any pktap header
+ * followed by the optional 4-byte protocol field
+ * or 16-byte link-layer header
+ */
+union pktap_header_extra {
+               uint8_t         llhdr[16];
+               uint32_t        proto;
+};
+
+/*
+ * Version 2 of the header
+ *
+ * The field pth_flags is at the same offset as in the original pktap_header and
+ * the flag PTH_FLAG_V2_HDR makes it possible to differentiate the header version.
+ */
+
+#define PKTAP_MAX_COMM_SIZE (MAXCOMLEN + 1)
+
+struct pktap_v2_hdr {
+       uint8_t                 pth_length;                     /* length of this header */
+       uint8_t                 pth_uuid_offset;                /* max size: sizeof(uuid_t) */
+       uint8_t                 pth_e_uuid_offset;              /* max size: sizeof(uuid_t) */
+       uint8_t                 pth_ifname_offset;              /* max size: PKTAP_IFXNAMESIZE */
+       uint8_t                 pth_comm_offset;                /* max size: PKTAP_MAX_COMM_SIZE */
+       uint8_t                 pth_e_comm_offset;              /* max size: PKTAP_MAX_COMM_SIZE */
+       uint16_t                pth_dlt;                        /* DLT of packet */
+       uint16_t                pth_frame_pre_length;
+       uint16_t                pth_frame_post_length;
+       uint16_t                pth_iftype;
+       uint16_t                pth_ipproto;
+       uint32_t                pth_protocol_family;
+       uint32_t                pth_svc;                        /* service class */
+       uint32_t                pth_flowid;
+       pid_t                   pth_pid;                        /* process ID */
+       pid_t                   pth_e_pid;                      /* effective process ID */
+       uint32_t                pth_flags;                      /* flags */
+};
+
+struct pktap_v2_hdr_space {
+       struct pktap_v2_hdr pth_hdr;
+       uint8_t pth_uuid[sizeof(uuid_t)];
+       uint8_t pth_e_uuid[sizeof(uuid_t)];
+       uint8_t pth_ifname[PKTAP_IFXNAMESIZE];
+       uint8_t pth_comm[PKTAP_MAX_COMM_SIZE];
+       uint8_t pth_e_comm[PKTAP_MAX_COMM_SIZE];
+};
+
+struct pktap_buffer_v2_hdr_extra {
+       struct pktap_v2_hdr_space hdr_space;
+       union pktap_header_extra extra;
+};
+
+#define COPY_PKTAP_COMMON_FIELDS_TO_V2(pktap_v2_hdr_dst, pktap_header_src) { \
+       (pktap_v2_hdr_dst)->pth_length = sizeof(struct pktap_v2_hdr); \
+       (pktap_v2_hdr_dst)->pth_uuid_offset = 0; \
+       (pktap_v2_hdr_dst)->pth_e_uuid_offset = 0; \
+       (pktap_v2_hdr_dst)->pth_ifname_offset = 0; \
+       (pktap_v2_hdr_dst)->pth_comm_offset = 0; \
+       (pktap_v2_hdr_dst)->pth_e_comm_offset = 0; \
+       (pktap_v2_hdr_dst)->pth_dlt = (pktap_header_src)->pth_dlt; \
+       (pktap_v2_hdr_dst)->pth_frame_pre_length = (pktap_header_src)->pth_frame_pre_length; \
+       (pktap_v2_hdr_dst)->pth_frame_post_length = (pktap_header_src)->pth_frame_post_length; \
+       (pktap_v2_hdr_dst)->pth_iftype = (pktap_header_src)->pth_iftype; \
+       (pktap_v2_hdr_dst)->pth_ipproto = (pktap_header_src)->pth_ipproto; \
+       (pktap_v2_hdr_dst)->pth_protocol_family = (pktap_header_src)->pth_protocol_family; \
+       (pktap_v2_hdr_dst)->pth_svc = (pktap_header_src)->pth_svc; \
+       (pktap_v2_hdr_dst)->pth_flowid = (pktap_header_src)->pth_flowid; \
+       (pktap_v2_hdr_dst)->pth_pid = (pktap_header_src)->pth_pid; \
+       (pktap_v2_hdr_dst)->pth_e_pid = (pktap_header_src)->pth_epid; \
+       (pktap_v2_hdr_dst)->pth_flags = (pktap_header_src)->pth_flags; \
+       (pktap_v2_hdr_dst)->pth_flags |= PTH_FLAG_V2_HDR; \
+}
+
+/*
+ * Values for field pth_flags
+ */
+#define        PTH_FLAG_DIR_IN         0x00000001 /* Incoming packet */
+#define        PTH_FLAG_DIR_OUT        0x00000002 /* Outgoing packet */
+#define        PTH_FLAG_PROC_DELEGATED 0x00000004 /* Process delegated */
+#define        PTH_FLAG_IF_DELEGATED   0x00000008 /* Interface delegated */
 #ifdef BSD_KERNEL_PRIVATE
-#define PTH_FLAG_DELAY_PKTAP           0x1000  /* Finalize pktap header on read */
+#define        PTH_FLAG_DELAY_PKTAP    0x00001000 /* Finalize pktap header on read */
 #endif /* BSD_KERNEL_PRIVATE */
-#define PTH_FLAG_TSTAMP                        0x2000  /* Has time stamp */
-#define        PTH_FLAG_NEW_FLOW               0x4000  /* Packet from a new flow */
-#define        PTH_FLAG_MSFSW                  0x8000  /* Multi stack flow switch */
+#define        PTH_FLAG_TSTAMP         0x00002000 /* Has time stamp */
+#define        PTH_FLAG_NEW_FLOW       0x00004000 /* Packet from a new flow */
+#define        PTH_FLAG_REXMIT         0x00008000 /* Packet is a retransmission */
+#define        PTH_FLAG_KEEP_ALIVE     0x00010000 /* Is keep alive packet */
+#define        PTH_FLAG_SOCKET         0x00020000 /* Packet on a Socket */
+#define        PTH_FLAG_NEXUS_CHAN     0x00040000 /* Packet on a nexus channel */
+#define        PTH_FLAG_V2_HDR         0x00080000 /* Version 2 of pktap */
 
 #ifdef BSD_KERNEL_PRIVATE
+
+#include <net/bpf.h>
+
+struct pktap_header_buffer {
+       struct pktap_header             pkth;
+       union pktap_header_extra        extra;
+};
+
 extern uint32_t pktap_total_tap_count;
 
 extern void pktap_init(void);
@@ -149,7 +240,8 @@ extern void pktap_output(struct ifnet *, protocol_family_t, struct mbuf *,
 extern void pktap_fill_proc_info(struct pktap_header *, protocol_family_t , 
        struct mbuf *, u_int32_t , int , struct ifnet *);
 extern void pktap_finalize_proc_info(struct pktap_header *);
-
+extern void pktap_v2_finalize_proc_info(struct pktap_v2_hdr *);
+extern void convert_to_pktap_header_to_v2(struct bpf_packet *bpf_pkt, bool truncate);
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* PRIVATE */
 
index c30dc2eb16f2f4f9bd04e8bb2c4d44c07a2ae04e..425173a5fa8c7de59fc27be4f9e33a18fc22e5bb 100644 (file)
@@ -206,6 +206,7 @@ fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
                case MBUF_SC_AV:
                case MBUF_SC_RV:
                case MBUF_SC_VI:
+               case MBUF_SC_SIG:
                        pri = FQ_IF_VI_INDEX;
                        break;
                case MBUF_SC_VO:
@@ -245,6 +246,9 @@ fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
        case MBUF_SC_VI:
                pri = FQ_IF_VI_INDEX;
                break;
+       case MBUF_SC_SIG:
+               pri = FQ_IF_SIG_INDEX;
+               break;
        case MBUF_SC_VO:
                pri = FQ_IF_VO_INDEX;
                break;
@@ -827,6 +831,10 @@ fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
                fq_if_classq_init(fqs, FQ_IF_VO_INDEX, 600,
                    8, MBUF_SC_VO);
        } else {
+               /* SIG shares the same index as VI */
+               _CASSERT(SCIDX_SIG == SCIDX_VI);
+               _CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);
+
                fq_if_classq_init(fqs, FQ_IF_BK_SYS_INDEX, 1500,
                    2, MBUF_SC_BK_SYS);
                fq_if_classq_init(fqs, FQ_IF_BK_INDEX, 1500,
index be7629a7198a743046fdf850704cd169ca1ec4ac..0929882a6a4eb2fe95b43c895924b00f4e6b1e23 100644 (file)
@@ -105,6 +105,7 @@ enum fq_if_state {
 #define        FQ_IF_AV_INDEX  4
 #define        FQ_IF_RV_INDEX  3
 #define        FQ_IF_VI_INDEX  2
+#define        FQ_IF_SIG_INDEX 2
 #define        FQ_IF_VO_INDEX  1
 #define        FQ_IF_CTL_INDEX 0
 
index 1eda0fa301556248ab3406566025f38eec9bdda6..986c2f6a2157c95e5a093d5c639911895f46b8c0 100644 (file)
@@ -92,6 +92,7 @@
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
+#include <netinet/ip.h>
 #include <netinet/ip6.h>
 #include <netinet/in_arp.h>
 
@@ -1687,6 +1688,16 @@ ifa_ifwithroute_common_locked(int flags, const struct sockaddr *dst,
                ifa = NULL;
        }
 
+       /*
+        * ifa's address family must match destination's address family
+        * after all is said and done.
+        */
+       if (ifa != NULL &&
+           ifa->ifa_addr->sa_family != dst->sa_family) {
+               IFA_REMREF(ifa);
+               ifa = NULL;
+       }
+
        return (ifa);
 }
 
@@ -3464,8 +3475,15 @@ rtinit_locked(struct ifaddr *ifa, int cmd, int flags)
                         * If rmx_mtu is not locked, update it
                         * to the MTU used by the new interface.
                         */
-                       if (!(rt->rt_rmx.rmx_locks & RTV_MTU))
+                       if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) {
                                rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+                               if (dst->sa_family == AF_INET &&
+                                   INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
+                                       rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
+                                       /* Further adjust the size for CLAT46 expansion */
+                                       rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+                               }
+                       }
 
                        /*
                         * Now ask the protocol to check if it needs
index dff054212f241bf07012750e647ccae0252afb12..8ae08e20676b4224fe8b0efc6e8f9a52926f91c4 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,6 +84,8 @@
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_arp.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
 #include <netinet6/nd6.h>
 
 extern struct rtstat rtstat;
@@ -552,12 +554,7 @@ route_output(struct mbuf *m, struct socket *so)
                        struct ifaddr *ifa2;
 report:
                        cred = kauth_cred_proc_ref(current_proc());
-
-                       if (rt->rt_ifp == lo_ifp ||
-                           route_op_entitlement_check(so, NULL, ROUTE_OP_READ, TRUE) != 0)
-                               credp = &cred;
-                       else
-                               credp = NULL;
+                       credp = &cred;
 
                        ifa2 = NULL;
                        RT_LOCK_ASSERT_HELD(rt);
@@ -961,8 +958,15 @@ rt_setif(struct rtentry *rt, struct sockaddr *Ifpaddr, struct sockaddr *Ifaaddr,
                         * If rmx_mtu is not locked, update it
                         * to the MTU used by the new interface.
                         */
-                       if (!(rt->rt_rmx.rmx_locks & RTV_MTU))
+                       if (!(rt->rt_rmx.rmx_locks & RTV_MTU)) {
                                rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+                               if (rt_key(rt)->sa_family == AF_INET &&
+                                   INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+                                       rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
+                                       /* Further adjust the size for CLAT46 expansion */
+                                       rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+                               }
+                       }
 
                        if (rt->rt_ifa != NULL) {
                                IFA_LOCK_SPIN(rt->rt_ifa);
@@ -1522,15 +1526,25 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
        kauth_cred_t *credp;
 
        cred = kauth_cred_proc_ref(current_proc());
-       if (rt->rt_ifp == lo_ifp ||
-           route_op_entitlement_check(NULL, cred, ROUTE_OP_READ, TRUE) != 0)
-               credp = &cred;
-       else
-               credp = NULL;
+       credp = &cred;
 
        RT_LOCK(rt);
-       if (w->w_op == NET_RT_FLAGS && !(rt->rt_flags & w->w_arg))
+       if ((w->w_op == NET_RT_FLAGS || w->w_op == NET_RT_FLAGS_PRIV) &&
+           !(rt->rt_flags & w->w_arg))
                goto done;
+
+       /*
+        * If the matching route has RTF_LLINFO set, we can skip scrubbing the MAC
+        * address only when the outgoing interface is not loopback and the process
+        * holds the neighbor-cache read entitlement.
+        */
+       if (w->w_op == NET_RT_FLAGS_PRIV && (rt->rt_flags & RTF_LLINFO)) {
+               if (rt->rt_ifp != lo_ifp &&
+                   (route_op_entitlement_check(NULL, cred, ROUTE_OP_READ, TRUE) == 0)) {
+                       credp = NULL;
+               }
+       }
+
        bzero((caddr_t)&info, sizeof (info));
        info.rti_info[RTAX_DST] = rt_key(rt);
        info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
@@ -1720,6 +1734,12 @@ sysctl_iflist(int af, struct walkarg *w)
                                        IFA_UNLOCK(ifa);
                                        continue;
                                }
+                               if (ifa->ifa_addr->sa_family == AF_INET6 &&
+                                   (((struct in6_ifaddr *)ifa)->ia6_flags &
+                                    IN6_IFF_CLAT46) != 0) {
+                                       IFA_UNLOCK(ifa);
+                                       continue;
+                               }
                                info.rti_info[RTAX_IFA] = ifa->ifa_addr;
                                info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
                                info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
@@ -1877,6 +1897,13 @@ sysctl_iflist2(int af, struct walkarg *w)
                                        IFA_UNLOCK(ifa);
                                        continue;
                                }
+                               if (ifa->ifa_addr->sa_family == AF_INET6 &&
+                                   (((struct in6_ifaddr *)ifa)->ia6_flags &
+                                    IN6_IFF_CLAT46) != 0) {
+                                       IFA_UNLOCK(ifa);
+                                       continue;
+                               }
+
                                info.rti_info[RTAX_IFA] = ifa->ifa_addr;
                                info.rti_info[RTAX_NETMASK] = ifa->ifa_netmask;
                                info.rti_info[RTAX_BRD] = ifa->ifa_dstaddr;
@@ -2051,6 +2078,7 @@ sysctl_rtsock SYSCTL_HANDLER_ARGS
        case NET_RT_DUMP:
        case NET_RT_DUMP2:
        case NET_RT_FLAGS:
+       case NET_RT_FLAGS_PRIV:
                lck_mtx_lock(rnh_lock);
                for (i = 1; i <= AF_MAX; i++)
                        if ((rnh = rt_tables[i]) && (af == 0 || af == i) &&
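With this change sysctl_dumpentry() starts from the scrubbed view (credp = &cred) and only reveals link-layer details for the new NET_RT_FLAGS_PRIV op, and then only for RTF_LLINFO entries whose interface is not loopback and whose caller passes the neighbor-cache read entitlement check. The predicate below is a hypothetical restatement of that decision for illustration only; it assumes the NET_RT_FLAGS_PRIV and RTF_LLINFO kernel constants are in scope.

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Returns true when the neighbor-cache (link-layer) details of a route
     * may be reported unscrubbed to the requesting process.
     */
    static bool
    may_report_llinfo_unscrubbed(int w_op, uint32_t rt_flags,
        bool ifp_is_loopback, bool has_read_entitlement)
    {
            if (w_op != NET_RT_FLAGS_PRIV)          /* regular dumps stay scrubbed */
                    return false;
            if (!(rt_flags & RTF_LLINFO))           /* only neighbor-cache entries qualify */
                    return false;
            return !ifp_is_loopback && has_read_entitlement;
    }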
index 2176a02992a2723d37f9c5fa60176801af6bd6e0..7e19edab7ec14566af23340e81daa7530580abfc 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -229,6 +229,10 @@ struct mld_hdr {
 #define mld_v2_reserved        mld_icmp6_hdr.icmp6_data16[0]
 #define mld_v2_numrecs mld_icmp6_hdr.icmp6_data16[1]
 
+
+#define ICMP6_ERRORTYPE(type) \
+       ((type) == ICMP6_DST_UNREACH || (type) == ICMP6_PACKET_TOO_BIG || \
+       (type) == ICMP6_TIME_EXCEEDED || (type) == ICMP6_PARAM_PROB)
 /*
  * Neighbor Discovery
  */
index cbb7f8cb64445a163d00e5474216afc675a64b61..61de1526df04ec9f5e95c7dc0220f06bcc387112 100644 (file)
@@ -2603,3 +2603,57 @@ in_lltattach(struct ifnet *ifp)
 
        return (llt);
 }
+
+struct in_ifaddr*
+inifa_ifpwithflag(struct ifnet * ifp, uint32_t flag)
+{
+       struct ifaddr *ifa;
+
+       ifnet_lock_shared(ifp);
+       TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_link)
+       {
+               IFA_LOCK_SPIN(ifa);
+               if (ifa->ifa_addr->sa_family != AF_INET) {
+                       IFA_UNLOCK(ifa);
+                       continue;
+               }
+               if ((((struct in_ifaddr *)ifa)->ia_flags & flag) == flag) {
+                       IFA_ADDREF_LOCKED(ifa);
+                       IFA_UNLOCK(ifa);
+                       break;
+               }
+               IFA_UNLOCK(ifa);
+       }
+       ifnet_lock_done(ifp);
+
+       return ((struct in_ifaddr *)ifa);
+}
+
+struct in_ifaddr *
+inifa_ifpclatv4(struct ifnet * ifp)
+{
+       struct ifaddr *ifa;
+
+       ifnet_lock_shared(ifp);
+       TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_link)
+       {
+               uint32_t addr = 0;
+               IFA_LOCK_SPIN(ifa);
+               if (ifa->ifa_addr->sa_family != AF_INET) {
+                       IFA_UNLOCK(ifa);
+                       continue;
+               }
+
+               addr = ntohl(SIN(ifa->ifa_addr)->sin_addr.s_addr);
+               if (!IN_LINKLOCAL(addr) &&
+                   !IN_LOOPBACK(addr)) {
+                       IFA_ADDREF_LOCKED(ifa);
+                       IFA_UNLOCK(ifa);
+                       break;
+               }
+               IFA_UNLOCK(ifa);
+       }
+       ifnet_lock_done(ifp);
+
+       return ((struct in_ifaddr *)ifa);
+}
index 07732679ad3672ed3000fd921ecdb171f6e98687..5a8400e22268a82659ba4a5da346680c879f4307 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -392,6 +392,10 @@ struct sockaddr_in {
        char            sin_zero[8];
 };
 
+#define IN_ARE_ADDR_EQUAL(a, b) \
+    (bcmp(&(a)->s_addr, &(b)->s_addr, \
+        sizeof (struct in_addr)) == 0)
+
 #ifdef PRIVATE
 /*
  * sockaddr_in with scope ID field; this is used internally to keep
@@ -811,6 +815,8 @@ union sockaddr_in_4_6 {
        struct sockaddr_in6     sin6;
 };
 
+#define        CLAT46_HDR_EXPANSION_OVERHD     (sizeof(struct ip6_hdr) - sizeof(struct ip))
+
 /*
  * Recommended DiffServ Code Point values
  */
@@ -880,6 +886,8 @@ extern uint32_t in_cksum_mbuf_ref(struct mbuf *, int, int, uint32_t);
 extern int in_getconninfo(struct socket *, sae_connid_t, uint32_t *,
     uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
     uint32_t *, user_addr_t, uint32_t *);
+extern struct in_ifaddr * inifa_ifpwithflag(struct ifnet *, uint32_t);
+extern struct in_ifaddr * inifa_ifpclatv4(struct ifnet *);
 
 #define        in_cksum(_m, _l)                        \
        inet_cksum(_m, 0, 0, _l)
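CLAT46_HDR_EXPANSION_OVERHD, defined above as sizeof(struct ip6_hdr) - sizeof(struct ip), is the 20-byte growth a datagram suffers when its option-less 20-byte IPv4 header is rewritten into a 40-byte IPv6 header. The routing changes in this commit therefore advertise an IPv4 route MTU of IN6_LINKMTU(ifp) minus that overhead on CLAT46-capable interfaces, so the translated packet still fits the IPv6 link. A small worked sketch follows; the 1500-byte link MTU is only an illustrative assumption.

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors CLAT46_HDR_EXPANSION_OVERHD: 40-byte IPv6 header minus 20-byte IPv4 header. */
    #define EXAMPLE_CLAT46_OVERHEAD (40 - 20)

    int
    main(void)
    {
            uint32_t ipv6_link_mtu = 1500;  /* assumed value of IN6_LINKMTU(ifp) */
            uint32_t ipv4_route_mtu = ipv6_link_mtu - EXAMPLE_CLAT46_OVERHEAD;

            /* Prints 1480: the largest IPv4 datagram that survives CLAT46 expansion. */
            printf("%u\n", ipv4_route_mtu);
            return 0;
    }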
index 674da52bf91ad700f29c42729985ad1c85b9dfad..2b717a5d9d1c93ca77da6a79e71566ff793c4d63 100644 (file)
@@ -83,6 +83,8 @@
 
 #include <netinet/if_ether.h>
 #include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
 #include <kern/zalloc.h>
 
 #include <kern/thread.h>
@@ -1955,8 +1957,14 @@ match:
                         * If rmx_mtu is not locked, update it
                         * to the MTU used by the new interface.
                         */
-                       if (!(route->rt_rmx.rmx_locks & RTV_MTU))
+                       if (!(route->rt_rmx.rmx_locks & RTV_MTU)) {
                                route->rt_rmx.rmx_mtu = route->rt_ifp->if_mtu;
+                               if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+                                       route->rt_rmx.rmx_mtu = IN6_LINKMTU(route->rt_ifp);
+                                       /* Further adjust the size for CLAT46 expansion */
+                                       route->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+                               }
+                       }
 
                        rtsetifa(route, &best_ia->ia_ifa);
                        gateway->sdl_index = ifp->if_index;
index 29594c123fb9f58eefb93cf3224e40e58b86fd99..538794d8dd2f3f1ff82e25d78e4ee8a65504402c 100644 (file)
@@ -305,7 +305,6 @@ inet_cksum_buffer(const void *buffer, uint32_t nxt, uint32_t off,
 }
 
 #if DEBUG || DEVELOPMENT
-#include <mach/branch_predicates.h>
 #include <pexpert/pexpert.h>
 
 #define        CKSUM_ERR kprintf
index b74b0af2f27861df9513c0a72299045396e21843..3d2e8c91df96fe15e3d041cccf3667849a03cdd0 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -92,6 +92,7 @@
 #include <net/route.h>
 #include <net/flowhash.h>
 #include <net/flowadv.h>
+#include <net/nat464_utils.h>
 #include <net/ntstat.h>
 
 #include <netinet/in.h>
@@ -132,8 +133,6 @@ static boolean_t inpcb_garbage_collecting = FALSE; /* gc timer is scheduled */
 static boolean_t inpcb_ticking = FALSE;                /* "slow" timer is scheduled */
 static boolean_t inpcb_fast_timer_on = FALSE;
 
-extern char *proc_best_name(proc_t);
-
 #define        INPCB_GCREQ_THRESHOLD   50000
 
 static thread_call_t inpcb_thread_call, inpcb_fast_thread_call;
@@ -219,6 +218,8 @@ static boolean_t apn_fallbk_enabled = TRUE;
 
 SYSCTL_DECL(_net_inet);
 SYSCTL_NODE(_net_inet, OID_AUTO, apn_fallback, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "APN Fallback");
+SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &apn_fallbk_enabled, 0, "APN fallback enable");
 SYSCTL_UINT(_net_inet_apn_fallback, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
     &apn_fallbk_debug, 0, "APN fallback debug enable");
 #else
@@ -806,7 +807,8 @@ in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                        uid_t u;
 
 #if !CONFIG_EMBEDDED
-                       if (ntohs(lport) < IPPORT_RESERVED) {
+                       if (ntohs(lport) < IPPORT_RESERVED &&
+                               SIN(nam)->sin_addr.s_addr != 0) {
                                cred = kauth_cred_proc_ref(p);
                                error = priv_check_cred(cred,
                                    PRIV_NETINET_RESERVEDPORT, 0);
@@ -1148,7 +1150,7 @@ apn_fallback_required (proc_t proc, struct socket *so, struct sockaddr_in *p_dst
 }
 
 static void
-apn_fallback_trigger(proc_t proc)
+apn_fallback_trigger(proc_t proc, struct socket *so)
 {
        pid_t pid = 0;
        struct kev_msg ev_msg;
@@ -1168,8 +1170,14 @@ apn_fallback_trigger(proc_t proc)
        ev_msg.event_code       = KEV_NETEVENT_APNFALLBACK;
 
        bzero(&apnfallbk_data, sizeof(apnfallbk_data));
-       apnfallbk_data.epid = pid;
-       uuid_copy(apnfallbk_data.euuid, application_uuid);
+
+       if (so->so_flags & SOF_DELEGATED) {
+               apnfallbk_data.epid = so->e_pid;
+               uuid_copy(apnfallbk_data.euuid, so->e_uuid);
+       } else {
+               apnfallbk_data.epid = so->last_pid;
+               uuid_copy(apnfallbk_data.euuid, so->last_uuid);
+       }
 
        ev_msg.dv[0].data_ptr   = &apnfallbk_data;
        ev_msg.dv[0].data_length = sizeof(apnfallbk_data);
@@ -1306,7 +1314,7 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
 
                if (apn_fallback_required(proc, inp->inp_socket,
                    (void *)nam))
-                       apn_fallback_trigger(proc);
+                       apn_fallback_trigger(proc, inp->inp_socket);
 
                goto done;
        }
@@ -1333,6 +1341,20 @@ in_pcbladdr(struct inpcb *inp, struct sockaddr *nam, struct in_addr *laddr,
                        RT_CONVERT_LOCK(ro->ro_rt);
                        ia = ifatoia(ro->ro_rt->rt_ifa);
                        IFA_ADDREF(&ia->ia_ifa);
+
+                       /*
+                        * Mark the control block for notification of
+                        * a possible flow that might undergo CLAT46
+                        * translation.
+                        *
+                        * We defer the decision to a later point when
+                        * the inpcb is being disposed of, because we only
+                        * want to send the notification if the flow was
+                        * ever used to send data.
+                        */
+                       if (IS_INTF_CLAT46(ro->ro_rt->rt_ifp))
+                               inp->inp_flags2 |= INP2_CLAT46_FLOW;
+
                        RT_UNLOCK(ro->ro_rt);
                        error = 0;
                }
@@ -1464,6 +1486,11 @@ in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p,
        int error;
        struct socket *so = inp->inp_socket;
 
+#if CONTENT_FILTER
+       if (so)
+               so->so_state_change_cnt++;
+#endif
+
        /*
         *   Call inner routine, to assign local interface address.
         */
@@ -1548,6 +1575,11 @@ in_pcbdisconnect(struct inpcb *inp)
        inp->inp_faddr.s_addr = INADDR_ANY;
        inp->inp_fport = 0;
 
+#if CONTENT_FILTER
+       if (so)
+               so->so_state_change_cnt++;
+#endif
+
        if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
                /* lock inversion issue, mostly with udp multicast packets */
                socket_unlock(so, 0);
@@ -1624,6 +1656,35 @@ in_pcbdetach(struct inpcb *inp)
                inp->inp_moptions = NULL;
                sofreelastref(so, 0);
                inp->inp_state = INPCB_STATE_DEAD;
+
+               /*
+                * Enqueue an event to send a kernel event notification
+                * if the flow had to undergo CLAT46 translation for data packets
+                */
+               if (inp->inp_flags2 & INP2_CLAT46_FLOW) {
+                       /*
+                        * If any data bytes have ever been exchanged over
+                        * this flow, schedule a notification to report
+                        * that the flow is using client-side (CLAT46)
+                        * translation.
+                        */
+                       if (inp->inp_stat != NULL &&
+                           (inp->inp_stat->txbytes != 0 ||
+                            inp->inp_stat->rxbytes != 0)) {
+                               if (so->so_flags & SOF_DELEGATED) {
+                                       in6_clat46_event_enqueue_nwk_wq_entry(
+                                           IN6_CLAT46_EVENT_V4_FLOW,
+                                           so->e_pid,
+                                           so->e_uuid);
+                               } else {
+                                       in6_clat46_event_enqueue_nwk_wq_entry(
+                                            IN6_CLAT46_EVENT_V4_FLOW,
+                                            so->last_pid,
+                                            so->last_uuid);
+                               }
+                       }
+               }
+
                /* makes sure we're not called twice from so_close */
                so->so_flags |= SOF_PCBCLEARING;
 
index 588a4d0548b45d57866bc8e40faad89f27fab32e..e1a7c9941e3d57af025703ef8a853e6a41398788 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -221,7 +221,7 @@ struct inpcb {
        } inp_necp_attributes;
        struct necp_inpcb_result inp_policyresult;
        uuid_t necp_client_uuid;
-       void    (*necp_cb)(void *, int, struct necp_client_flow *);
+       necp_client_flow_cb necp_cb;
 #endif
        u_char *inp_keepalive_data;     /* for keepalive offload */
        u_int8_t inp_keepalive_datalen; /* keepalive data length */
@@ -692,7 +692,7 @@ struct inpcbinfo {
        IN6P_RTHDR|IN6P_RTHDRDSTOPTS|IN6P_TCLASS|IN6P_RFC2292|IN6P_MTU)
 
 #define        INP_UNMAPPABLEOPTS \
-       (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR| IN6P_TCLASS|IN6P_AUTOFLOWLABEL)
+       (IN6P_HOPOPTS|IN6P_DSTOPTS|IN6P_RTHDR|IN6P_AUTOFLOWLABEL)
 
 /*
  * Flags for inp_flags2.
@@ -706,8 +706,9 @@ struct inpcbinfo {
 #define        INP2_INHASHLIST         0x00000010 /* pcb is in inp_hash list */
 #define        INP2_AWDL_UNRESTRICTED  0x00000020 /* AWDL restricted mode allowed */
 #define        INP2_KEEPALIVE_OFFLOAD  0x00000040 /* Enable UDP or TCP keepalive offload */
-#define INP2_INTCOPROC_ALLOWED 0x00000080 /* Allow communication via internal co-processor interfaces */
-#define INP2_CONNECT_IN_PROGRESS       0x00000100 /* A connect call is in progress, so binds are intermediate steps */
+#define        INP2_INTCOPROC_ALLOWED  0x00000080 /* Allow communication via internal co-processor interfaces */
+#define        INP2_CONNECT_IN_PROGRESS        0x00000100 /* A connect call is in progress, so binds are intermediate steps */
+#define        INP2_CLAT46_FLOW        0x00000200 /* The flow is going to use CLAT46 path */
 
 /*
  * Flags passed to in_pcblookup*() functions.
index 266754acc6814273a0dfc2677bae490743de3da8..7865d6a1e4f75dde5abda6025ddb00967bdc4e12 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -64,6 +64,8 @@
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
+#include <sys/filedesc.h>
+#include <sys/file_internal.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/dtrace.h>
@@ -89,6 +91,8 @@
 #include <netinet/tcp_var.h>
 #include <netinet6/in6_var.h>
 
+#include <os/log.h>
+
 #ifndef ROUNDUP64
 #define        ROUNDUP64(x) P2ROUNDUP((x), sizeof (u_int64_t))
 #endif
 
 static void inpcb_to_xinpcb_n(struct inpcb *, struct xinpcb_n *);
 static void tcpcb_to_xtcpcb_n(struct tcpcb *, struct xtcpcb_n *);
+void shutdown_sockets_on_interface(struct ifnet *ifp);
+
 
 __private_extern__ void
 sotoxsocket_n(struct socket *so, struct xsocket_n *xso)
@@ -442,10 +448,39 @@ inpcb_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
                    (so->so_state & SS_ISDISCONNECTED))
                        continue;
 
-               if (!(protocol == PF_UNSPEC ||
-                   (protocol == PF_INET && (inp->inp_vflag & INP_IPV4)) ||
-                   (protocol == PF_INET6 && (inp->inp_vflag & INP_IPV6))))
-                       continue;
+               /*
+                * If a protocol is specified, filter out inpcbs that
+                * are not relevant to the protocol family of interest.
+                */
+               if (protocol != PF_UNSPEC) {
+                       if (protocol == PF_INET) {
+                               /*
+                                * If the protocol of interest is IPv4, skip
+                                * the inpcb if the family is not IPv4, or if
+                                * the family is IPv4 but the flow is being
+                                * CLAT46 translated (it is reported as IPv6
+                                * instead).
+                                */
+                               if ((inp->inp_vflag & INP_IPV4) == 0 ||
+                                   (inp->inp_flags2 & INP2_CLAT46_FLOW) != 0) {
+                                       continue;
+                               }
+                       } else if (protocol == PF_INET6) {
+                               /*
+                                * If the protocol of interest is IPv6, skip
+                                * the inpcb only if the family is not IPv6
+                                * and the flow is not CLAT46 translated
+                                * (CLAT46 flows count as IPv6 here).
+                                */
+                               if ((inp->inp_vflag & INP_IPV6) == 0 &&
+                                   (inp->inp_flags2 & INP2_CLAT46_FLOW) == 0) {
+                                       continue;
+                               }
+                       } else {
+                               /* Protocol family not supported */
+                               continue;
+                       }
+               }
 
                if (SOCK_PROTO(inp->inp_socket) != IPPROTO_UDP &&
                    SOCK_PROTO(inp->inp_socket) != IPPROTO_TCP)
@@ -631,3 +666,87 @@ inpcb_find_anypcb_byaddr(struct ifaddr *ifa, struct inpcbinfo *pcbinfo)
        lck_rw_done(pcbinfo->ipi_lock);
        return (0);
 }
+
+static int
+shutdown_sockets_on_interface_proc_callout(proc_t p, void *arg)
+{
+       struct filedesc *fdp;
+       int i;
+       struct ifnet *ifp = (struct ifnet *)arg;
+
+       if (ifp == NULL)
+               return (PROC_RETURNED);
+
+       proc_fdlock(p);
+       fdp = p->p_fd;
+       for (i = 0; i < fdp->fd_nfiles; i++) {
+               struct fileproc *fp = fdp->fd_ofiles[i];
+               struct fileglob *fg;
+               struct socket *so;
+               struct inpcb *inp;
+               struct ifnet *inp_ifp;
+               int error;
+
+               if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0) {
+                       continue;
+               }
+
+               fg = fp->f_fglob;
+               if (FILEGLOB_DTYPE(fg) != DTYPE_SOCKET)
+                       continue;
+
+               so = (struct socket *)fp->f_fglob->fg_data;
+               if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
+                       continue;
+
+               inp = (struct inpcb *)so->so_pcb;
+
+               if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
+                       continue;
+
+               socket_lock(so, 1);
+
+               if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
+                       socket_unlock(so, 1);
+                       continue;
+               }
+
+               if (inp->inp_boundifp != NULL) {
+                       inp_ifp = inp->inp_boundifp;
+               } else if (inp->inp_last_outifp != NULL) {
+                       inp_ifp = inp->inp_last_outifp;
+               } else {
+                       socket_unlock(so, 1);
+                       continue;
+               }
+
+               if (inp_ifp != ifp && inp_ifp->if_delegated.ifp != ifp) {
+                       socket_unlock(so, 1);
+                       continue;
+               }
+               error = sosetdefunct(p, so, 0, TRUE);
+               if (error != 0) {
+                       log(LOG_ERR, "%s: sosetdefunct() error %d",
+                           __func__, error);
+               } else {
+                       error = sodefunct(p, so, 0);
+                       if (error != 0) {
+                               log(LOG_ERR, "%s: sodefunct() error %d",
+                                   __func__, error);
+                       }
+               }
+
+               socket_unlock(so, 1);
+       }
+       proc_fdunlock(p);
+
+       return (PROC_RETURNED);
+}
+
+void
+shutdown_sockets_on_interface(struct ifnet *ifp)
+{
+       proc_iterate(PROC_ALLPROCLIST,
+               shutdown_sockets_on_interface_proc_callout,
+               ifp, NULL, NULL);
+}
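The protocol filtering added to inpcb_get_ports_used() earlier in this file treats a CLAT46-translated flow as IPv6 for reporting purposes: it is excluded from a PF_INET query even though the pcb carries IPv4 state, and it is accepted for a PF_INET6 query even when the IPv6 flag is not set. The helper below is a hypothetical restatement of that skip rule, written for illustration only.

    #include <stdbool.h>
    #include <sys/socket.h>         /* PF_UNSPEC, PF_INET, PF_INET6 */

    /* Returns true when an inpcb should be skipped for the given query protocol. */
    static bool
    ports_used_should_skip(int protocol, bool is_ipv4, bool is_ipv6,
        bool is_clat46_flow)
    {
            if (protocol == PF_UNSPEC)
                    return false;                           /* no protocol filter */
            if (protocol == PF_INET)
                    return !is_ipv4 || is_clat46_flow;      /* CLAT46 flows are not IPv4 */
            if (protocol == PF_INET6)
                    return !is_ipv6 && !is_clat46_flow;     /* CLAT46 flows count as IPv6 */
            return true;                                    /* unsupported protocol family */
    }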
index 1c0fc36965e2a2f66fafde8534146f9b6efb7e03..4aa6864024183d4fe0dc4063d9cca95c0f3cb048 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -84,6 +84,9 @@
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/in_arp.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet6/nd6.h>
 
 extern int tvtohz(struct timeval *);
 
@@ -163,8 +166,14 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
        }
 
        if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
-           rt->rt_ifp)
+           rt->rt_ifp) {
                rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
+               if (INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
+                       rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
+                       /* Further adjust the size for CLAT46 expansion */
+                       rt->rt_rmx.rmx_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+               }
+       }
 
        ret = rn_addroute(v_arg, n_arg, head, treenodes);
        if (ret == NULL && (rt->rt_flags & RTF_HOST)) {
index ff40a2872c31679489c2063c86c82180300ba20a..5301613458044b9307b30a0b03f8d2d292979919 100644 (file)
@@ -260,8 +260,6 @@ mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];
 
 #if (DEVELOPMENT || DEBUG)
 
-extern char *proc_best_name(proc_t p);
-
 static int tfp_count = 0;
 
 static TAILQ_HEAD(, tclass_for_proc) tfp_head =
@@ -1308,6 +1306,9 @@ so_tc2msc(int tc)
        case _SO_TC_VI:
                msc = MBUF_SC_VI;
                break;
+       case SO_TC_NETSVC_SIG:
+               msc = MBUF_SC_SIG;
+               break;
        case SO_TC_VO:
        case _SO_TC_VO:
                msc = MBUF_SC_VO;
@@ -1344,6 +1345,8 @@ so_svc2tc(mbuf_svc_class_t svc)
                return (SO_TC_RV);
        case MBUF_SC_VI:
                return (SO_TC_VI);
+       case MBUF_SC_SIG:
+               return (SO_TC_NETSVC_SIG);
        case MBUF_SC_VO:
                return (SO_TC_VO);
        case MBUF_SC_CTL:
index 8088cb4cb954c34ae015e7f4f6e7bbb84892216f..7d181e66c11d18e83052f79bb488564be4f394ed 100644 (file)
@@ -132,7 +132,7 @@ struct ip6_hdr {
 #if BYTE_ORDER == LITTLE_ENDIAN
 #define IPV6_FLOWINFO_MASK     0xffffff0f      /* flow info (28 bits) */
 #define IPV6_FLOWLABEL_MASK    0xffff0f00      /* flow label (20 bits) */
-#define IPV6_FLOW_ECN_MASK     0x00000300      /* the 2 ECN bits */
+#define IPV6_FLOW_ECN_MASK     0x00003000      /* the 2 ECN bits */
 #endif /* LITTLE_ENDIAN */
 #endif
 #if 1
@@ -141,8 +141,6 @@ struct ip6_hdr {
 #define IP6TOS_ECT             0x02    /* ECN-capable transport */
 #endif
 
-#define        IP6FLOW_ECN_MASK        0x00300000
-
 /*
  * To access the 6 bits of the DSCP value in the 32 bits ip6_flow field
  */
index 122698fa4849022370e059b08d89029b077b6128..c9f5668220d15bf0ec9e2dc6ab134aa1cd11b4e6 100644 (file)
@@ -2421,7 +2421,7 @@ dummynet_get(struct sockopt *sopt)
        for (i = 0; i < 10; i++) {
                size = dn_calc_size(is64user);
                lck_mtx_unlock(dn_mutex);
-               buf = _MALLOC(size, M_TEMP, M_WAITOK);
+               buf = _MALLOC(size, M_TEMP, M_WAITOK | M_ZERO);
                if (buf == NULL)
                        return(ENOBUFS);
                lck_mtx_lock(dn_mutex);
index 3d69b2029d5edbb290ee9443d24fdba3f9ecd732..9b365b2c075b7c0681f36ebc282ac8c1846177f0 100644 (file)
@@ -3646,7 +3646,7 @@ ipfw_ctl(struct sockopt *sopt)
                        struct ip_old_fw        *buf2, *rule_vers0;
                        
                        lck_mtx_lock(ipfw_mutex);
-                       buf2 = _MALLOC(static_count * sizeof(struct ip_old_fw), M_TEMP, M_WAITOK);
+                       buf2 = _MALLOC(static_count * sizeof(struct ip_old_fw), M_TEMP, M_WAITOK | M_ZERO);
                        if (buf2 == 0) {
                                lck_mtx_unlock(ipfw_mutex);
                                error = ENOBUFS;
@@ -3687,7 +3687,7 @@ ipfw_ctl(struct sockopt *sopt)
                        buf_size = static_count * ipfwcompsize + 
                                                dyn_count * ipfwdyncompsize;
                                                
-                       buf2 = _MALLOC(buf_size, M_TEMP, M_WAITOK);
+                       buf2 = _MALLOC(buf_size, M_TEMP, M_WAITOK | M_ZERO);
                        if (buf2 == 0) {
                                lck_mtx_unlock(ipfw_mutex);
                                error = ENOBUFS;
index e67b329cabd5902d00d05268cfa629b0df3690b8..260449c30778ede4811e24cffb219073746e864a 100644 (file)
@@ -207,58 +207,84 @@ icmp_error(
        u_int32_t dest,
        u_int32_t nextmtu)
 {
-       struct ip *oip, *nip;
-       struct icmp *icp;
-       struct mbuf *m;
-       u_int32_t oiphlen, icmplen, icmpelen, nlen;
-
+       struct ip *oip = NULL;
+       struct ip *nip = NULL;
+       struct icmp *icp = NULL;
+       struct mbuf *m = NULL;
+       u_int32_t oiphlen = 0;
+       u_int32_t icmplen = 0;
+       u_int32_t icmpelen = 0;
+       u_int32_t nlen = 0;
+
+       VERIFY((u_int)type <= ICMP_MAXTYPE);
        /* Expect 32-bit aligned data pointer on strict-align platforms */
        MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(n);
 
+       if (type != ICMP_REDIRECT)
+               icmpstat.icps_error++;
+       /*
+        * Don't send an error:
+        *   if it is not the first fragment of the message
+        *   if the original packet was a multicast or broadcast packet
+        *   if the old packet was itself an ICMP error message
+        *     (only known informational types may trigger an error)
+        */
+       if (n->m_flags & (M_BCAST|M_MCAST))
+               goto freeit;
+
+       /*
+        * Drop if IP header plus ICMP_MINLEN bytes are not contiguous
+        * in first mbuf.
+        */
+       if (n->m_len < sizeof(struct ip) + ICMP_MINLEN)
+               goto freeit;
+
        oip = mtod(n, struct ip *);
        oiphlen = IP_VHL_HL(oip->ip_vhl) << 2;
+       if (n->m_len < oiphlen + ICMP_MINLEN)
+               goto freeit;
 
 #if (DEBUG | DEVELOPMENT)
        if (icmpprintfs > 1)
                printf("icmp_error(0x%llx, %x, %d)\n",
                    (uint64_t)VM_KERNEL_ADDRPERM(oip), type, code);
 #endif
-       if (type != ICMP_REDIRECT)
-               icmpstat.icps_error++;
-       /*
-        * Don't send error if not the first fragment of message.
-        * Don't error if the old packet protocol was ICMP
-        * error message, only known informational types.
-        */
+
        if (oip->ip_off & ~(IP_MF|IP_DF))
                goto freeit;
 
        if (oip->ip_p == IPPROTO_ICMP && type != ICMP_REDIRECT &&
-         n->m_len >= oiphlen + ICMP_MINLEN &&
-         !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))->
-         icmp_type)) {
+           n->m_len >= oiphlen + ICMP_MINLEN &&
+           !ICMP_INFOTYPE(((struct icmp *)(void *)((caddr_t)oip + oiphlen))->
+               icmp_type)) {
                icmpstat.icps_oldicmp++;
                goto freeit;
        }
-       /*
-        * Don't send error in response to a multicast or
-        * broadcast packet
-        */
-       if (n->m_flags & (M_BCAST|M_MCAST))
-               goto freeit;
 
        /*
         * Calculate the length to quote from original packet and prevent
         * the ICMP mbuf from overflowing.
+        * Unfortunately this is non-trivial since ip_forward()
+        * sends us truncated packets.
         */
        nlen = m_length(n);
        if (oip->ip_p == IPPROTO_TCP) {
-               struct tcphdr *th;
-               u_int16_t tcphlen;
+               struct tcphdr *th = NULL;
+               u_int16_t tcphlen = 0;
 
+               /*
+                * If the packet got truncated and the TCP header
+                * is not contained in the packet, send out a
+                * standard reply with only the IP header as payload
+                */
                if (oiphlen + sizeof(struct tcphdr) > n->m_len &&
                    n->m_next == NULL)
                        goto stdreply;
+
+               /*
+                * Otherwise, pull up to get IP and TCP headers
+                * together
+                */
                if (n->m_len < (oiphlen + sizeof(struct tcphdr)) &&
                    (n = m_pullup(n, (oiphlen + sizeof(struct tcphdr)))) == NULL)
                        goto freeit;
@@ -274,6 +300,8 @@ icmp_error(
                    sizeof(u_int32_t))))
                        goto freeit;
                tcphlen = th->th_off << 2;
+
+               /* Sanity checks */
                if (tcphlen < sizeof(struct tcphdr))
                        goto freeit;
                if (oip->ip_len < (oiphlen + tcphlen))
@@ -297,11 +325,14 @@ icmp_error(
 stdreply:      icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
                    (oip->ip_len - oiphlen)));
 
-       icmplen = min(oiphlen + icmpelen, min(nlen, oip->ip_len));
+       icmplen = min(oiphlen + icmpelen, nlen);
        if (icmplen < sizeof(struct ip))
                goto freeit;
+
        /*
         * First, formulate icmp message
+        * Allocate enough space for the IP header, ICMP header
+        * and the payload (part of the original message to be sent back).
         */
        if (MHLEN > (sizeof(struct ip) + ICMP_MINLEN + icmplen))
                m = m_gethdr(M_DONTWAIT, MT_HEADER);    /* MAC-OK */
@@ -311,24 +342,20 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
        if (m == NULL)
                goto freeit;
 
-       if (n->m_flags & M_SKIP_FIREWALL) {
-               /*
-                * set M_SKIP_FIREWALL to skip firewall check, since
-                * we're called from firewall
-                */
-               m->m_flags |= M_SKIP_FIREWALL;
-       }
-
 #if CONFIG_MACF_NET
        mac_mbuf_label_associate_netlayer(n, m);
 #endif
-       m->m_len = icmplen + ICMP_MINLEN; /* for ICMP header and data */
-       MH_ALIGN(m, m->m_len);
+       /*
+        * Further refine the payload length to the space
+        * remaining in mbuf after including the IP header and ICMP
+        * header.
+        */
+       icmplen = min(icmplen, M_TRAILINGSPACE(m) -
+           sizeof(struct ip) - ICMP_MINLEN);
+       m_align(m, ICMP_MINLEN + icmplen);
+       m->m_len = ICMP_MINLEN + icmplen; /* for ICMP header and data */
+
        icp = mtod(m, struct icmp *);
-       if ((u_int)type > ICMP_MAXTYPE) {
-               m_freem(m);
-               goto freeit;
-       }
        icmpstat.icps_outhist[type]++;
        icp->icmp_type = type;
        if (type == ICMP_REDIRECT)
@@ -349,6 +376,11 @@ stdreply:  icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
        }
 
        icp->icmp_code = code;
+
+       /*
+        * Copy icmplen worth of content from original
+        * mbuf (n) to the new packet after ICMP header.
+        */
        m_copydata(n, 0, icmplen, (caddr_t)&icp->icmp_ip);
        nip = &icp->icmp_ip;
 
@@ -360,13 +392,12 @@ stdreply: icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
        HTONS(nip->ip_off);
 #endif
        /*
-        * Now, copy old ip header (without options)
-        * in front of icmp message.
-        */
-       if (m->m_data - sizeof(struct ip) < m->m_pktdat) {
-               m_freem(m);
-               goto freeit;
-       }
+        * Set up the ICMP message mbuf and copy the old IP header (without
+        * options) in front of the ICMP message.
+        * If the original mbuf was meant to bypass the firewall, the error
+        * reply should bypass as well.
+        */
+       m->m_flags |= n->m_flags & M_SKIP_FIREWALL;
        m->m_data -= sizeof(struct ip);
        m->m_len += sizeof(struct ip);
        m->m_pkthdr.len = m->m_len;
@@ -379,7 +410,6 @@ stdreply:   icmpelen = max(ICMP_MINLEN, min(icmp_datalen,
        nip->ip_tos = 0;
        nip->ip_off = 0;
        icmp_reflect(m);
-
 freeit:
        m_freem(n);
 }
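The rewritten icmp_error() bounds how much of the offending packet gets quoted in three steps: clamp to the icmp_datalen limit, clamp to what the mbuf chain actually holds, then clamp to the space left in the reply mbuf behind its own IP and ICMP headers. A minimal user-space sketch of that arithmetic follows; the helper names and sample values are placeholders that only mirror the kernel code, not the kernel code itself.

#include <stdint.h>
#include <stdio.h>

#define ICMP_MINLEN 8	/* minimal ICMP header, as in <netinet/ip_icmp.h> */

static unsigned int umin(unsigned int a, unsigned int b) { return a < b ? a : b; }
static unsigned int umax(unsigned int a, unsigned int b) { return a > b ? a : b; }

/*
 * Illustrative only: the length arithmetic the revised icmp_error() performs.
 *   oiphlen  - IP header length of the offending packet
 *   datalen  - icmp_datalen-style limit on quoted payload
 *   oip_len  - ip_len of the offending packet
 *   nlen     - m_length() of the offending mbuf chain
 *   trailing - M_TRAILINGSPACE() of the freshly allocated reply mbuf
 *   iphdrlen - size of the reply's own IP header
 */
static unsigned int
quoted_len(unsigned int oiphlen, unsigned int datalen, unsigned int oip_len,
    unsigned int nlen, unsigned int trailing, unsigned int iphdrlen)
{
	unsigned int icmpelen, icmplen;

	/* stdreply path: at least ICMP_MINLEN, at most the configured limit */
	icmpelen = umax(ICMP_MINLEN, umin(datalen, oip_len - oiphlen));

	/* never quote more than the mbuf chain actually holds */
	icmplen = umin(oiphlen + icmpelen, nlen);

	/* refine to the space left after the reply's IP and ICMP headers */
	return umin(icmplen, trailing - iphdrlen - ICMP_MINLEN);
}

int
main(void)
{
	/* offending packet claims 1500 bytes but only 68 reached us */
	printf("quote %u bytes\n", quoted_len(20, 8, 1500, 68, 200, 20));
	return 0;
}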
index 3438a1bdb7ac3019ef73b6edaaab43fd426a819c..2de986d94e22d161d2f604d2301d3332ee93bfc3 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2013 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -227,6 +227,11 @@ struct icmp {
        (type) == ICMP_IREQ || (type) == ICMP_IREQREPLY || \
        (type) == ICMP_MASKREQ || (type) == ICMP_MASKREPLY)
 
+#define ICMP_ERRORTYPE(type) \
+       ((type) == ICMP_UNREACH || (type) == ICMP_SOURCEQUENCH || \
+       (type) == ICMP_REDIRECT || (type) == ICMP_TIMXCEED || \
+       (type) == ICMP_PARAMPROB)
+
 #ifdef BSD_KERNEL_PRIVATE
 void   icmp_error(struct mbuf *, int, int, n_long, u_int32_t);
 void   icmp_input(struct mbuf *, int);
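The new ICMP_ERRORTYPE() is the counterpart of the existing ICMP_INFOTYPE() classifier: it matches the five ICMP error classes listed in the macro body. A small user-space sketch exercising an equivalent macro; the local redefinition is only so the example builds against headers that predate this change.

#include <stdio.h>
#include <stddef.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>

/* Same shape as the macro added above, redefined locally for portability. */
#define ICMP_ERRORTYPE_SKETCH(type) \
	((type) == ICMP_UNREACH || (type) == ICMP_SOURCEQUENCH || \
	 (type) == ICMP_REDIRECT || (type) == ICMP_TIMXCEED || \
	 (type) == ICMP_PARAMPROB)

int
main(void)
{
	int types[] = { ICMP_ECHO, ICMP_ECHOREPLY, ICMP_UNREACH, ICMP_TIMXCEED };
	size_t i;

	for (i = 0; i < sizeof(types) / sizeof(types[0]); i++)
		printf("type %d is %san error\n", types[i],
		    ICMP_ERRORTYPE_SKETCH(types[i]) ? "" : "not ");
	return 0;
}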
index 97750ef575f648f2736c2d774626e66497b592ff..70ee5fac6b9375a3470f733537d5676541e359a9 100644 (file)
@@ -4197,6 +4197,16 @@ ip_savecontrol(struct inpcb *inp, struct mbuf **mp, struct ip *ip,
                        goto no_mbufs;
                }
        }
+       if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) {
+               uint64_t time;
+
+               time = mach_continuous_time();
+               mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time),
+                       SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp);
+               if (*mp == NULL) {
+                       goto no_mbufs;
+               }
+       }
        if (inp->inp_flags & INP_RECVDSTADDR) {
                mp = sbcreatecontrol_mbuf((caddr_t)&ip->ip_dst,
                    sizeof (struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, mp);
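ip_savecontrol() now emits an SCM_TIMESTAMP_CONTINUOUS control message carrying mach_continuous_time() when SO_TIMESTAMP_CONTINUOUS is set; both names are private options. The receive side uses the same cmsg walk as the long-standing public SO_TIMESTAMP/SCM_TIMESTAMP pair, sketched below with the public names only (illustrative, not a description of the private interface).

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <sys/time.h>

/*
 * Receive one datagram and print its kernel timestamp.  Sketch only: the
 * continuous variant added above follows the same pattern but delivers a
 * uint64_t mach_continuous_time() value and is not a public interface.
 */
static void
recv_with_timestamp(int fd)
{
	char data[2048], ctrl[256];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
	};
	struct cmsghdr *cm;
	int on = 1;

	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMP, &on, sizeof(on));
	if (recvmsg(fd, &msg, 0) < 0)
		return;

	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_TIMESTAMP) {
			struct timeval tv;

			memcpy(&tv, CMSG_DATA(cm), sizeof(tv));
			printf("received at %ld.%06d\n",
			    (long)tv.tv_sec, (int)tv.tv_usec);
		}
	}
}

int
main(void)
{
	recv_with_timestamp(0);		/* placeholder descriptor for the sketch */
	return 0;
}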
index f5b51ac5251a76661d22467733c9cd2a2218578e..35f778d25ca26a2e49698780e2e813e9b97f4f28 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <netinet/ip_var.h>
 #include <netinet/kpi_ipfilter_var.h>
 #include <netinet/in_tclass.h>
+#include <netinet/udp.h>
+
+#include <netinet6/nd6.h>
 
 #if CONFIG_MACF_NET
 #include <security/mac_framework.h>
@@ -350,6 +353,8 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
                uint32_t raw;
        } ipobf = { .raw = 0 };
 
+       int interface_mtu = 0;
+
 /*
  * Here we check for restrictions when sending frames.
  * N.B.: IPv4 over internal co-processor interfaces is not allowed.
@@ -357,7 +362,7 @@ ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
 #define        IP_CHECK_RESTRICTIONS(_ifp, _ipobf)                             \
        (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) ||                \
         ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) ||          \
-          (IFNET_IS_INTCOPROC(_ifp)) ||                                        \
+         (IFNET_IS_INTCOPROC(_ifp)) ||                                 \
         (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
 
        if (ip_output_measure)
@@ -1822,11 +1827,19 @@ pass:
        ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
            ip->ip_len, &sw_csum);
 
+       interface_mtu = ifp->if_mtu;
+
+       if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+               interface_mtu = IN6_LINKMTU(ifp);
+               /* Further adjust the size for CLAT46 expansion */
+               interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+       }
+
        /*
         * If small enough for interface, or the interface will take
         * care of the fragmentation for us, can just send directly.
         */
-       if ((u_short)ip->ip_len <= ifp->if_mtu || TSO_IPV4_OK(ifp, m) ||
+       if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
            (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
 #if BYTE_ORDER != BIG_ENDIAN
                HTONS(ip->ip_len);
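The new interface_mtu computation shrinks the usable IPv4 MTU on CLAT46 (464XLAT) interfaces so that, once translation grows the header, the packet still fits the IPv6 link MTU. A hedged stand-alone sketch of that adjustment; the constant and field names below are placeholders for INTF_ADJUST_MTU_FOR_CLAT46(), IN6_LINKMTU() and CLAT46_HDR_EXPANSION_OVERHD, not the kernel macros, and the 20-byte overhead is an assumption (IPv6 header minus IPv4 header).

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed expansion: a 40-byte IPv6 header replaces a 20-byte IPv4 one. */
#define CLAT46_EXPANSION_SKETCH 20

struct ifinfo_sketch {
	bool     clat46_active;		/* stands in for INTF_ADJUST_MTU_FOR_CLAT46() */
	uint32_t if_mtu;		/* IPv4 interface MTU */
	uint32_t in6_linkmtu;		/* stands in for IN6_LINKMTU() */
};

static uint32_t
effective_ipv4_mtu(const struct ifinfo_sketch *ifi)
{
	uint32_t mtu = ifi->if_mtu;

	if (ifi->clat46_active) {
		/* Start from the IPv6 link MTU and leave headroom for the
		 * header growth that the 4-to-6 translation will add. */
		mtu = ifi->in6_linkmtu - CLAT46_EXPANSION_SKETCH;
	}
	return mtu;
}

int
main(void)
{
	struct ifinfo_sketch wifi = { .clat46_active = true,
	    .if_mtu = 1500, .in6_linkmtu = 1500 };

	printf("effective IPv4 MTU: %u\n", effective_ipv4_mtu(&wifi));	/* 1480 */
	return 0;
}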
@@ -1899,6 +1912,8 @@ sendchain:
                        goto loopit;
                }
        }
+
+       VERIFY(interface_mtu != 0);
        /*
         * Too large for interface; fragment if possible.
         * Must be able to put at least 8 bytes per fragment.
@@ -1918,8 +1933,8 @@ sendchain:
                        RT_LOCK_SPIN(ro->ro_rt);
                        if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
                            !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
-                           (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
-                               ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
+                           (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
+                               ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
                        }
                        RT_UNLOCK(ro->ro_rt);
                }
@@ -1930,7 +1945,46 @@ sendchain:
                goto bad;
        }
 
-       error = ip_fragment(m, ifp, ifp->if_mtu, sw_csum);
+       /*
+        * XXX Only TCP seems to be passing a list of packets here.
+        * The following issue is limited to UDP datagrams with 0 checksum.
+        * For now, limit it to the case when a single packet is passed down.
+        */
+       if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
+               /*
+                * If it is a UDP packet that has checksum set to 0
+                * and is also not being offloaded, compute a full checksum
+                * and update the UDP checksum.
+                */
+               if (ip->ip_p == IPPROTO_UDP &&
+                   !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
+                       struct udphdr *uh = NULL;
+
+                       if (m->m_len < hlen + sizeof (struct udphdr)) {
+                               m = m_pullup(m, hlen + sizeof (struct udphdr));
+                               if (m == NULL) {
+                                       error = ENOBUFS;
+                                       m0 = m;
+                                       goto bad;
+                               }
+                               m0 = m;
+                               ip = mtod(m, struct ip *);
+                       }
+                       /*
+                        * Get the UDP header and, if its checksum is 0, compute
+                        * the full checksum.
+                        */
+                       uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
+                       if (uh->uh_sum == 0) {
+                               uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
+                                   ip->ip_len - hlen);
+                               if (uh->uh_sum == 0)
+                                       uh->uh_sum = 0xffff;
+                       }
+               }
+       }
+
+       error = ip_fragment(m, ifp, interface_mtu, sw_csum);
        if (error != 0) {
                m0 = m = NULL;
                goto bad;
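The 0 to 0xffff substitution above follows the standard UDP rule: a transmitted checksum of zero means "no checksum", which the later 4-to-6 translation cannot tolerate because UDP checksums are mandatory over IPv6, so a computed value of zero is sent as all-ones instead. A small illustrative sketch of that final fold-and-substitute step; the real path uses inet_cksum() over the mbuf chain.

#include <stdint.h>
#include <stdio.h>

/*
 * Illustrative only: fold a 32-bit one's-complement sum to 16 bits and
 * apply the UDP convention that a computed checksum of 0 goes out as
 * 0xffff, because 0 on the wire means "no checksum".
 */
static uint16_t
udp_finish_cksum(uint32_t sum)
{
	uint16_t cksum;

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);

	cksum = (uint16_t)~sum;
	if (cksum == 0)
		cksum = 0xffff;
	return cksum;
}

int
main(void)
{
	/* a sum that folds to all-ones yields 0, which must go out as 0xffff */
	printf("0x%04x\n", udp_finish_cksum(0xffffffffu));
	return 0;
}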
@@ -2029,6 +2083,16 @@ ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
        hlen = ip->ip_hl << 2;
 #endif /* !_IP_VHL */
 
+#ifdef INET6
+       /*
+        * We need to adjust the fragment sizes to account for the
+        * IPv6 fragment header if the packet needs to be translated
+        * from IPv4 to IPv6.
+        */
+       if (IS_INTF_CLAT46(ifp))
+               mtu -= sizeof(struct ip6_frag);
+
+#endif
        firstlen = len = (mtu - hlen) &~ 7;
        if (len < 8) {
                m_freem(m);
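Fragment offsets are expressed in 8-byte units, which is why the per-fragment payload is rounded down with "&~ 7"; on CLAT46 interfaces the MTU is first reduced by sizeof(struct ip6_frag) to leave room for the fragment header the translation will insert. A tiny worked example with placeholder numbers:

#include <stdio.h>

int
main(void)
{
	unsigned int mtu = 1500, hlen = 20;	/* plain IPv4 interface */
	unsigned int len = (mtu - hlen) & ~7u;

	printf("payload per fragment: %u\n", len);		/* 1480 */

	mtu -= 8;			/* minus sizeof(struct ip6_frag) on CLAT46 */
	len = (mtu - hlen) & ~7u;
	printf("with CLAT46 frag header: %u\n", len);		/* 1472 */
	return 0;
}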
@@ -3435,6 +3499,19 @@ in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
        return (ifa);
 }
 
+/*
+ * @brief      Given the outgoing interface, determines which checksums need
+ *     to be computed in software and which can be offloaded to the
+ *     interface.
+ *
+ * @param      ifp Pointer to the outgoing interface
+ * @param      m Pointer to the packet
+ * @param      hlen IP header length
+ * @param      ip_len Total packet size i.e. headers + data payload
+ * @param      sw_csum Pointer to a software checksum flag set
+ *
+ * @return     void
+ */
 void
 ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
     uint32_t *sw_csum)
@@ -3458,6 +3535,14 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
                *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
                    m->m_pkthdr.csum_flags);
        } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
+               int interface_mtu = ifp->if_mtu;
+
+               if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
+                       interface_mtu = IN6_LINKMTU(ifp);
+                       /* Further adjust the size for CLAT46 expansion */
+                       interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+               }
+
                /*
                 * Partial checksum offload, if non-IP fragment, and TCP only
                 * (no UDP support, as the hardware may not be able to convert
@@ -3468,7 +3553,7 @@ ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
                    ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
                    ((hwcap & CSUM_ZERO_INVERT) &&
                    (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
-                   ip_len <= ifp->if_mtu) {
+                   ip_len <= interface_mtu) {
                        uint16_t start = sizeof (struct ip);
                        uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
                        m->m_pkthdr.csum_flags |=
diff --git a/bsd/netinet/isakmp.h b/bsd/netinet/isakmp.h
new file mode 100644 (file)
index 0000000..299e90a
--- /dev/null
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * RFC 2408 Internet Security Association and Key Management Protocol
+ */
+
+#ifndef _NETINET_ISAKMP_H_
+#define _NETINET_ISAKMP_H_
+
+typedef u_char cookie_t[8];
+typedef u_char msgid_t[4];
+
+/* 3.1 ISAKMP Header Format (IKEv1 and IKEv2)
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ !                          Initiator                            !
+ !                            Cookie                             !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ !                          Responder                            !
+ !                            Cookie                             !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ !  Next Payload ! MjVer ! MnVer ! Exchange Type !     Flags     !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ !                          Message ID                           !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ !                            Length                             !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct isakmp {
+       cookie_t i_ck;          /* Initiator Cookie */
+       cookie_t r_ck;          /* Responder Cookie */
+       uint8_t np;             /* Next Payload Type */
+       uint8_t vers;
+#define ISAKMP_VERS_MAJOR      0xf0
+#define ISAKMP_VERS_MAJOR_SHIFT        4
+#define ISAKMP_VERS_MINOR      0x0f
+#define ISAKMP_VERS_MINOR_SHIFT        0
+       uint8_t etype;          /* Exchange Type */
+       uint8_t flags;          /* Flags */
+       msgid_t msgid;
+       uint32_t len;           /* Length */
+};
+
+/* 3.2 Payload Generic Header
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ ! Next Payload  !   RESERVED    !         Payload Length        !
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+struct isakmp_gen {
+       uint8_t  np;       /* Next Payload */
+       uint8_t  critical; /* bit 7 - critical, rest is RESERVED */
+       uint16_t len;      /* Payload Length */
+};
+
+#endif /* _NETINET_ISAKMP_H_ */
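The new isakmp.h only declares wire formats, so a consumer has to split the version octet with the MjVer/MnVer masks and byte-swap the length itself. A hedged user-space sketch of parsing such a header from a UDP payload (port 500/4500 traffic); the structure is mirrored locally as isakmp_sketch so the example builds without the kernel-private header, and the sample field values are arbitrary.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

/* Local mirror of the layout declared in isakmp.h above. */
struct isakmp_sketch {
	uint8_t  i_ck[8];	/* Initiator Cookie */
	uint8_t  r_ck[8];	/* Responder Cookie */
	uint8_t  np;		/* Next Payload Type */
	uint8_t  vers;		/* MjVer (high nibble) / MnVer (low nibble) */
	uint8_t  etype;		/* Exchange Type */
	uint8_t  flags;
	uint8_t  msgid[4];
	uint32_t len;		/* Length, network byte order on the wire */
};

#define VERS_MAJOR(v)	(((v) & 0xf0) >> 4)	/* cf. ISAKMP_VERS_MAJOR */
#define VERS_MINOR(v)	((v) & 0x0f)		/* cf. ISAKMP_VERS_MINOR */

/* Parse a raw ISAKMP header out of a UDP payload. */
static int
parse_isakmp(const uint8_t *buf, size_t buflen)
{
	struct isakmp_sketch hdr;

	if (buflen < sizeof(hdr))
		return -1;
	memcpy(&hdr, buf, sizeof(hdr));

	printf("IKEv%u.%u, exchange %u, next payload %u, length %u\n",
	    (unsigned)VERS_MAJOR(hdr.vers), (unsigned)VERS_MINOR(hdr.vers),
	    (unsigned)hdr.etype, (unsigned)hdr.np, (unsigned)ntohl(hdr.len));
	return 0;
}

int
main(void)
{
	struct isakmp_sketch hdr = { .np = 1, .vers = 0x10, .etype = 2 };

	hdr.len = htonl(sizeof(hdr));	/* fabricated header, values are arbitrary */
	return parse_isakmp((const uint8_t *)&hdr, sizeof(hdr));
}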
index ea54abae4069b899d0396c033c9f6fc02c379669..ad5e5f6644ef13aa03f083660d5193dd2d005a2a 100644 (file)
@@ -278,10 +278,15 @@ ipf_inject_input(
        struct mbuf *m = (struct mbuf *)data;
        struct m_tag *mtag = 0;
        struct ip *ip = mtod(m, struct ip *);
+       struct ip6_hdr *ip6;
        u_int8_t        vers;
        int hlen;
        errno_t error = 0;
        protocol_family_t proto;
+       struct in_ifaddr *ia = NULL;
+       struct in_addr *pkt_dst = NULL;
+       struct in6_ifaddr *ia6 = NULL;
+       struct sockaddr_in6 pkt_dst6;
 
        vers = IP_VHL_V(ip->ip_vhl);
 
@@ -298,7 +303,46 @@ ipf_inject_input(
        }
 
        if (filter_ref == 0 && m->m_pkthdr.rcvif == 0) {
-               m->m_pkthdr.rcvif = lo_ifp;
+               /*
+                * Search for the interface that owns the packet's destination address
+                */
+               switch (proto) {
+                       case PF_INET:
+                               pkt_dst = &ip->ip_dst;
+                               lck_rw_lock_shared(in_ifaddr_rwlock);
+                               TAILQ_FOREACH(ia, INADDR_HASH(pkt_dst->s_addr), ia_hash) {
+                                       if (IA_SIN(ia)->sin_addr.s_addr == pkt_dst->s_addr) {
+                                               m->m_pkthdr.rcvif = ia->ia_ifp;
+                                               break;
+                                       }
+                               }
+                               lck_rw_done(in_ifaddr_rwlock);
+                               break;
+
+                       case PF_INET6:
+                               ip6 = mtod(m, struct ip6_hdr *);
+                               pkt_dst6.sin6_addr = ip6->ip6_dst;
+                               lck_rw_lock_shared(&in6_ifaddr_rwlock);
+                               for (ia6 = in6_ifaddrs; ia6 != NULL; ia6 = ia6->ia_next) {
+                                       if (IN6_ARE_ADDR_EQUAL(&ia6->ia_addr.sin6_addr, &pkt_dst6.sin6_addr)) {
+                                               m->m_pkthdr.rcvif = ia6->ia_ifp;
+                                               break;
+                                       }
+                               }
+                               lck_rw_done(&in6_ifaddr_rwlock);
+                               break;
+
+                       default:
+                               break;
+               }
+
+               /*
+                * If none is found, fall back to the loopback interface
+                */
+               if (m->m_pkthdr.rcvif == NULL) {
+                       m->m_pkthdr.rcvif = lo_ifp;
+               }
+
                m->m_pkthdr.csum_data = 0;
                m->m_pkthdr.csum_flags = 0;
                if (vers == 4) {
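ipf_inject_input() now selects the receive interface by matching the injected packet's destination against locally owned IPv4 addresses (via INADDR_HASH) or the in6_ifaddrs list, and only falls back to lo_ifp when nothing matches. Below is a simplified, self-contained sketch of that "match destination, else loopback" selection over a plain array; the kernel's hash table and locking are intentionally omitted and all names are invented.

#include <stdint.h>
#include <stddef.h>

struct addr_if_sketch {
	uint32_t addr;		/* a locally owned address (opaque in this sketch) */
	int      ifindex;	/* interface that owns it */
};

#define LOOPBACK_IFINDEX 1	/* stand-in for lo_ifp */

/* Return the interface owning 'dst', or loopback if none does. */
static int
pick_rcvif(const struct addr_if_sketch *tbl, size_t n, uint32_t dst)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (tbl[i].addr == dst)
			return tbl[i].ifindex;
	}
	return LOOPBACK_IFINDEX;
}

int
main(void)
{
	struct addr_if_sketch tbl[] = {
		{ .addr = 0x0a000001, .ifindex = 4 },
		{ .addr = 0xc0a80101, .ifindex = 7 },
	};

	/* destination owned locally -> that interface; otherwise loopback */
	return pick_rcvif(tbl, 2, 0xc0a80101) == 7 ? 0 : 1;
}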
index 392d0650b739ea97512df86dfde6bcb5df999863..ae9adf13db5227de83edb9a27813dfc5fa406b04 100644 (file)
@@ -50,14 +50,14 @@ struct ipf_pktopts {
        int                             ippo_mcast_loop;
        u_int8_t                        ippo_mcast_ttl;
 };
-#define IPPOF_MCAST_OPTS       0x1
+#define        IPPOF_MCAST_OPTS        0x1
 #ifdef PRIVATE
-#define IPPOF_BOUND_IF         0x2
-#define IPPOF_NO_IFT_CELLULAR  0x4
-#define IPPOF_SELECT_SRCIF     0x8
-#define IPPOF_BOUND_SRCADDR    0x10
-#define IPPOF_SHIFT_IFSCOPE    16
-#define IPPOF_NO_IFF_EXPENSIVE 0x20
+#define        IPPOF_BOUND_IF          0x2
+#define        IPPOF_NO_IFT_CELLULAR   0x4
+#define        IPPOF_SELECT_SRCIF      0x8
+#define        IPPOF_BOUND_SRCADDR     0x10
+#define        IPPOF_SHIFT_IFSCOPE     16
+#define        IPPOF_NO_IFF_EXPENSIVE  0x20
 #endif /* PRIVATE */
 
 typedef struct ipf_pktopts *ipf_pktopts_t;
index f8fb188c0848dff5222d4a3fb765a9d0055aaa8c..5d1cd3ef03887e3ce4c6a39b969c444a645d21f8 100644 (file)
@@ -43,10 +43,6 @@ typedef enum mppcb_state {
        MPPCB_STATE_DEAD        = 2,
 } mppcb_state_t;
 
-
-/* net/necp.h already includes mp_pcb.h - so we have to forward-declare */
-struct necp_client_flow;
-
 /*
  * Multipath Protocol Control Block
  */
@@ -61,7 +57,7 @@ struct mppcb {
 
 #if NECP
        uuid_t necp_client_uuid;
-       void    (*necp_cb)(void *, int, struct necp_client_flow *);
+       void (*necp_cb)(void *, int, uint32_t, uint32_t, bool *);
 #endif
 };
 
@@ -120,6 +116,10 @@ extern void mptcp_timer_sched(void);
 extern void mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag);
 extern int mp_getsockaddr(struct socket *mp_so, struct sockaddr **nam);
 extern int mp_getpeeraddr(struct socket *mp_so, struct sockaddr **nam);
+#if NECP
+extern int necp_client_register_multipath_cb(pid_t pid, uuid_t client_id, struct mppcb *mpp);
+extern void necp_mppcb_dispose(struct mppcb *mpp);
+#endif
 __END_DECLS
 
 #endif /* BSD_KERNEL_PRIVATE */
index 55829e0a16febfdebe53ca5f8759a8be9a9f8b54..80db1552d4ef13aab0f57c7fd2ef01506a10741a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -499,13 +499,6 @@ fallback:
                        m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
                }
 
-               if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
-                   !LIST_EMPTY(&mp_tp->mpt_segq)) {
-                       mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
-
-                       goto next;
-               }
-               mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
 
                if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
                        if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
@@ -531,6 +524,14 @@ fallback:
                            MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
                }
 
+               if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
+                   !LIST_EMPTY(&mp_tp->mpt_segq)) {
+                       mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);
+
+                       goto next;
+               }
+               mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
+
                mptcp_sbrcv_grow(mp_tp);
 
                if (sbappendstream_rcvdemux(mp_so, m, 0, 0))
@@ -885,9 +886,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                /*
                 * Only handover if Symptoms tells us to do so.
                 */
-               if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
-                   mptcp_is_wifi_unusable() &&
-                   besttp->t_rxtshift >= mptcp_fail_thresh)
+               if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+                   mptcp_is_wifi_unusable(mpte) != 0 && mptcp_subflow_is_bad(mpte, best))
                        return (mptcp_return_subflow(second_best));
 
                return (mptcp_return_subflow(best));
@@ -896,8 +896,8 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                int rto_thresh = mptcp_rtothresh;
 
                /* Adjust with symptoms information */
-               if (IFNET_IS_WIFI(bestinp->inp_last_outifp) &&
-                   mptcp_is_wifi_unusable()) {
+               if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
+                   mptcp_is_wifi_unusable(mpte) != 0) {
                        rtt_thresh /= 2;
                        rto_thresh /= 2;
                }
@@ -914,7 +914,7 @@ mptcp_get_subflow(struct mptses *mpte, struct mptsub *ignore, struct mptsub **pr
                        return (mptcp_return_subflow(second_best));
                }
 
-               if (besttp->t_rxtshift >= mptcp_fail_thresh &&
+               if (mptcp_subflow_is_bad(mpte, best) &&
                    secondtp->t_rxtshift == 0) {
                        return (mptcp_return_subflow(second_best));
                }
@@ -1136,8 +1136,8 @@ mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
                return;
        }
        mptcplog((LOG_DEBUG,
-           "%s: seqn = %x len = %x full = %llx rcvnxt = %llu \n", __func__,
-           seqn, mdss_data_len, full_dsn, mp_tp->mpt_rcvnxt),
+           "%s: seqn = %u len = %u full = %u rcvnxt = %u \n", __func__,
+           seqn, mdss_data_len, (uint32_t)full_dsn, (uint32_t)mp_tp->mpt_rcvnxt),
            MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
 
        mptcp_notify_mpready(tp->t_inpcb->inp_socket);
@@ -1356,11 +1356,17 @@ mptcp_reset_itfinfo(struct mpt_itf_info *info)
        info->ifindex = 0;
        info->has_v4_conn = 0;
        info->has_v6_conn = 0;
+       info->has_nat64_conn = 0;
 }
 
 void
-mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
+mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
+                     uint32_t necp_flags, __unused bool *viable)
 {
+       boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
+       boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
+       boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
+       boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
        struct mppcb *mp = (struct mppcb *)handle;
        struct mptses *mpte = mptompte(mp);
        struct socket *mp_so;
@@ -1368,7 +1374,7 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
        int locked = 0;
        uint32_t i, ifindex;
 
-       ifindex = flow->interface_index;
+       ifindex = interface_index;
        VERIFY(ifindex != IFSCOPE_NONE);
 
        /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
@@ -1389,15 +1395,26 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
        mp_tp = mpte->mpte_mptcb;
        mp_so = mptetoso(mpte);
 
-       os_log_debug(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u\n",
-                    __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state);
+       os_log_info(mptcp_log_handle, "%s, action: %u ifindex %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
+                    __func__, action, ifindex, mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
+                    has_v4, has_v6, has_nat64, low_power);
 
        /* No need on fallen back sockets */
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
                goto out;
 
+       /*
+        * When the interface goes into low-power mode, we don't want to establish
+        * new subflows on it. Thus, mark it internally as non-viable.
+        */
+       if (low_power)
+               action = NECP_CLIENT_CBACTION_NONVIABLE;
+
        if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
                for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
+                       if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE)
+                               continue;
+
                        if (mpte->mpte_itfinfo[i].ifindex == ifindex)
                                mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
                }
@@ -1406,8 +1423,6 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
        } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
                   action == NECP_CLIENT_CBACTION_INITIAL) {
                int found_slot = 0, slot_index = -1;
-               boolean_t has_v4 = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
-               boolean_t has_v6 = !!(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
                struct ifnet *ifp;
 
                ifnet_head_lock_shared();
@@ -1425,6 +1440,9 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR))
                        goto out;
 
+               if (IS_INTF_CLAT46(ifp))
+                       has_v4 = FALSE;
+
                /* Look for the slot on where to store/update the interface-info. */
                for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
                        /* Found a potential empty slot where we can put it */
@@ -1439,7 +1457,8 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
                         */
                        if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
                            (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
-                            mpte->mpte_itfinfo[i].has_v6_conn != has_v6)) {
+                            mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
+                            mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
                                found_slot = 1;
                                slot_index = i;
                                break;
@@ -1455,8 +1474,12 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
                }
 
                if ((mpte->mpte_dst.sa_family == AF_INET || mpte->mpte_dst.sa_family == 0) &&
-                   !(flow->necp_flow_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4) &&
-                   ifnet_get_nat64prefix(ifp, NULL) == ENOENT) {
+                   !has_nat64 && !has_v4) {
+                       if (found_slot) {
+                               mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
+                               mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+                               mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
+                       }
                        mptcp_ask_for_nat64(ifp);
                        goto out;
                }
@@ -1466,8 +1489,8 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
                        struct mpt_itf_info *info = _MALLOC(sizeof(*info) * new_size, M_TEMP, M_ZERO);
 
                        if (info == NULL) {
-                               mptcplog((LOG_ERR, "%s malloc failed for %u\n", __func__, new_size),
-                                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                               os_log_error(mptcp_log_handle, "%s malloc failed for %u\n",
+                                            __func__, new_size);
                                goto out;
                        }
 
@@ -1481,15 +1504,13 @@ mptcp_session_necp_cb(void *handle, int action, struct necp_client_flow *flow)
 
                        mpte->mpte_itfinfo = info;
                        mpte->mpte_itfinfo_size = new_size;
-
-                       mptcplog((LOG_DEBUG, "%s Needed to realloc to %u\n", __func__, new_size),
-                           MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
                }
 
                VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
                mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
                mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
+               mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
 
                mptcp_sched_create_subflows(mpte);
        }
@@ -1518,6 +1539,8 @@ mptcp_set_restrictions(struct socket *mp_so)
                        continue;
 
                ifp = ifindex2ifnet[ifindex];
+               if (ifp == NULL)
+                       continue;
 
                if (IFNET_IS_EXPENSIVE(ifp) &&
                    (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
index 6da8235e881cbc45b21deb24fcebef86f58020cf..13a2055865e12c45769ca1b90dd96d90e012d769 100644 (file)
@@ -1080,39 +1080,31 @@ mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack)
 }
 
 void
-mptcp_update_window_fallback(struct tcpcb *tp)
+mptcp_update_window_wakeup(struct tcpcb *tp)
 {
        struct mptcb *mp_tp = tptomptp(tp);
 
        mpte_lock_assert_held(mp_tp->mpt_mpte);
 
-       if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
-               return;
-
-       mptcplog((LOG_DEBUG, "%s: update window to %u\n", __func__, tp->snd_wnd),
-                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-
-       mp_tp->mpt_sndwnd = tp->snd_wnd;
-       mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
-       mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
+       if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
+               mp_tp->mpt_sndwnd = tp->snd_wnd;
+               mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
+               mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
+       }
 
        sowwakeup(tp->t_inpcb->inp_socket);
 }
 
 static void
-mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq,
-    u_int32_t tiwin)
+mptcp_update_window(struct mptcb *mp_tp, u_int64_t ack, u_int64_t seq, u_int32_t tiwin)
 {
-       /* Don't look at the window if there is no ACK flag */
-       if ((SEQ_LT(mp_tp->mpt_sndwl1, seq) ||
-           (mp_tp->mpt_sndwl1 == seq && (SEQ_LT(mp_tp->mpt_sndwl2, ack) ||
-           (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd))))) {
+       if (SEQ_LT(mp_tp->mpt_sndwl1, seq) ||
+           (mp_tp->mpt_sndwl1 == seq &&
+            (SEQ_LT(mp_tp->mpt_sndwl2, ack) ||
+             (mp_tp->mpt_sndwl2 == ack && tiwin > mp_tp->mpt_sndwnd)))) {
                mp_tp->mpt_sndwnd = tiwin;
                mp_tp->mpt_sndwl1 = seq;
                mp_tp->mpt_sndwl2 = ack;
-
-               mptcplog((LOG_DEBUG, "%s: Updating window to %u\n", __func__,
-                         mp_tp->mpt_sndwnd), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
        }
 }
 
@@ -1138,11 +1130,11 @@ mptcp_do_dss_opt_ack_meat(u_int64_t full_dack, u_int64_t full_dsn,
                if (close_notify)
                        mptcp_notify_close(tp->t_inpcb->inp_socket);
        } else {
-               mptcplog((LOG_ERR,"%s: unexpected dack %u snduna %u sndmax %u\n", __func__,
-                   (u_int32_t)full_dack, (u_int32_t)mp_tp->mpt_snduna,
-                   (u_int32_t)mp_tp->mpt_sndmax),
-                   (MPTCP_SOCKET_DBG|MPTCP_RECEIVER_DBG),
-                   MPTCP_LOGLVL_LOG);
+               os_log_error(mptcp_log_handle,
+                            "%s: unexpected dack %u snduna %u sndmax %u\n",
+                            __func__, (u_int32_t)full_dack,
+                            (u_int32_t)mp_tp->mpt_snduna,
+                            (u_int32_t)mp_tp->mpt_sndmax);
        }
 
        mptcp_update_window(mp_tp, full_dack, full_dsn, tiwin);
index 785e1a9985e65652554948aefb9bf56c9707ae5d..f00653f08413a550ffa27799f55adfe958444072 100644 (file)
@@ -42,7 +42,7 @@
 
 __BEGIN_DECLS
 extern void mptcp_data_ack_rcvd(struct mptcb *mp_tp, struct tcpcb *tp, u_int64_t full_dack);
-extern void mptcp_update_window_fallback(struct tcpcb *tp);
+extern void mptcp_update_window_wakeup(struct tcpcb *tp);
 extern void tcp_do_mptcp_options(struct tcpcb *, u_char *, struct tcphdr *,
     struct tcpopt *, int);
 extern unsigned mptcp_setup_syn_opts(struct socket *, u_char*, unsigned);
index a2a656883cef2fd7bf59eae5819a043bde553017..1606cdb62839934c239cc6de0c2e6d05846d0ffd 100644 (file)
@@ -652,6 +652,9 @@ mptcpstats_session_wrapup(struct mptses *mpte)
 
        if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
                tcpstat.tcps_mptcp_back_to_wifi++;
+
+       if (mpte->mpte_triggered_cell)
+               tcpstat.tcps_mptcp_triggered_cell++;
 }
 
 /*
@@ -695,7 +698,7 @@ static boolean_t
 mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
 {
        return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
-               mp_tp->mpt_state < MPTCPS_TIME_WAIT &&
+               mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
                !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
 }
 
@@ -711,12 +714,12 @@ mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr
        char *ptrv4 = (char *)addrv4;
        char *ptr = (char *)addr;
 
-       if (IN_ZERONET(addrv4->s_addr) || // 0.0.0.0/8 Source hosts on local network
-           IN_LOOPBACK(addrv4->s_addr) || // 127.0.0.0/8 Loopback
-           IN_LINKLOCAL(addrv4->s_addr) || // 169.254.0.0/16 Link Local
-           IN_DS_LITE(addrv4->s_addr) || // 192.0.0.0/29 DS-Lite
-           IN_6TO4_RELAY_ANYCAST(addrv4->s_addr) || // 192.88.99.0/24 6to4 Relay Anycast
-           IN_MULTICAST(addrv4->s_addr) || // 224.0.0.0/4 Multicast
+       if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
+           IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
+           IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
+           IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
+           IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
+           IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
            INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
                return (-1);
        }
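The ntohl() wrappers added in this function are needed because IN_ZERONET(), IN_LOOPBACK(), IN_PRIVATE() and the related classification macros compare against host-byte-order constants, while in_addr values taken off the wire are in network order. A quick user-space demonstration of the difference on a little-endian machine:

#include <stdio.h>
#include <arpa/inet.h>
#include <netinet/in.h>

int
main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "127.0.0.1", &a);	/* a.s_addr is network order */

	/* IN_LOOPBACK() compares against host-order constants, so the raw
	 * network-order value misclassifies on little-endian machines. */
	printf("without ntohl: %d\n", IN_LOOPBACK(a.s_addr) ? 1 : 0);
	printf("with ntohl:    %d\n", IN_LOOPBACK(ntohl(a.s_addr)) ? 1 : 0);
	return 0;
}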
@@ -724,8 +727,8 @@ mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr
        /* Check for the well-known prefix */
        if (len == NAT64_PREFIX_LEN_96 &&
            IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
-               if (IN_PRIVATE(addrv4->s_addr) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
-                   IN_SHARED_ADDRESS_SPACE(addrv4->s_addr)) // 100.64.0.0/10 Shared Address Space
+               if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
+                   IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) // 100.64.0.0/10 Shared Address Space
                        return (-1);
        }
 
@@ -762,10 +765,36 @@ mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addr
        return (0);
 }
 
+static void
+mptcp_trigger_cell_bringup(struct mptses *mpte)
+{
+       struct socket *mp_so = mptetoso(mpte);
+
+       if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
+               uuid_string_t uuidstr;
+               int err;
+
+               err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
+                                                         TRUE);
+
+               if (err == 0)
+                       mpte->mpte_triggered_cell = 1;
+
+               uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
+               os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
+                           __func__, uuidstr, err);
+       } else {
+               os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
+       }
+}
+
+
 void
 mptcp_check_subflows_and_add(struct mptses *mpte)
 {
        struct mptcb *mp_tp = mpte->mpte_mptcb;
+       boolean_t cellular_viable = FALSE;
+       boolean_t want_cellular = TRUE;
        uint32_t i;
 
        if (!mptcp_ok_to_create_subflows(mp_tp))
@@ -774,6 +803,7 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
        for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
                struct mpt_itf_info *info;
                struct mptsub *mpts;
+               struct ifnet *ifp;
                uint32_t ifindex;
                int found = 0;
 
@@ -786,23 +816,22 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                if (ifindex == IFSCOPE_NONE)
                        continue;
 
+               ifnet_head_lock_shared();
+               ifp = ifindex2ifnet[ifindex];
+               ifnet_head_done();
+
+               if (ifp == NULL)
+                       continue;
+
+               if (IFNET_IS_CELLULAR(ifp))
+                       cellular_viable = TRUE;
+
                TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
-                       const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
+                       const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
 
-                       if (ifp == NULL)
+                       if (subifp == NULL)
                                continue;
 
-                       if (ifp->if_index == ifindex &&
-                           !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
-                           sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
-                               /*
-                                * We found a subflow on this interface.
-                                * No need to create a new one.
-                                */
-                               found = 1;
-                               break;
-                       }
-
                        /*
                         * In Handover mode, only create cell subflow if
                         * 1. Wi-Fi Assist is active
@@ -821,15 +850,37 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                         *    good performance.
                         */
                        if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
-                           !IFNET_IS_CELLULAR(ifp) &&
+                           !IFNET_IS_CELLULAR(subifp) &&
                            !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
-                           (!mptcp_is_wifi_unusable() ||
-                            (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh &&
-                             mptetoso(mpte)->so_snd.sb_cc))) {
-                               mptcplog((LOG_DEBUG, "%s handover, wifi state %u rxt %u ifindex %u this %u\n",
-                                         __func__, mptcp_is_wifi_unusable(), sototcpcb(mpts->mpts_socket)->t_rxtshift, ifindex,
-                                         ifp->if_index),
-                                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
+                           (mptcp_is_wifi_unusable(mpte) == 0 ||
+                            (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
+                             ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
+                               os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
+                                            __func__, mptcp_is_wifi_unusable(mpte),
+                                            sototcpcb(mpts->mpts_socket)->t_rxtshift,
+                                            !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
+                                            mptetoso(mpte)->so_snd.sb_cc,
+                                            ifindex, subifp->if_index);
+                               found = 1;
+
+                               /* We found a proper subflow on WiFi - no need for cell */
+                               want_cellular = FALSE;
+                               break;
+                       } else {
+                               os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
+                                            __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
+                                            mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
+                                            !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);
+
+                       }
+
+                       if (subifp->if_index == ifindex &&
+                           !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
+                           sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
+                               /*
+                                * We found a subflow on this interface.
+                                * No need to create a new one.
+                                */
                                found = 1;
                                break;
                        }
@@ -847,22 +898,16 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                        struct sockaddr_in6 nat64pre;
 
                        if (mpte->mpte_dst.sa_family == AF_INET &&
-                           !info->has_v4_conn && info->has_v6_conn) {
+                           !info->has_v4_conn && info->has_nat64_conn) {
                                struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
-                               struct ifnet *ifp;
                                int error, j;
 
                                bzero(&nat64pre, sizeof(struct sockaddr_in6));
 
-                               ifnet_head_lock_shared();
-                               ifp = ifindex2ifnet[ifindex];
-                               ifnet_head_done();
-
                                error = ifnet_get_nat64prefix(ifp, nat64prefixes);
                                if (error) {
-                                       mptcplog((LOG_ERR, "%s: no NAT64-prefix on itf %s, error %d\n",
-                                                 __func__, ifp->if_name, error),
-                                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+                                       os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
+                                                    __func__, ifp->if_name, error);
                                        continue;
                                }
 
@@ -877,8 +922,8 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                                                               nat64prefixes[j].prefix_len,
                                                               &mpte->__mpte_dst_v4.sin_addr);
                                if (error != 0) {
-                                       mptcplog((LOG_INFO, "%s: cannot synthesize this addr\n", __func__),
-                                                MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
+                                       os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
+                                                   __func__);
                                        continue;
                                }
 
@@ -908,6 +953,11 @@ mptcp_check_subflows_and_add(struct mptses *mpte)
                        mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
                }
        }
+
+       if (!cellular_viable && want_cellular) {
+               /* Trigger Cell Bringup */
+               mptcp_trigger_cell_bringup(mpte);
+       }
 }
 
 /*
@@ -919,7 +969,7 @@ mptcp_check_subflows_and_remove(struct mptses *mpte)
 {
        struct mptsub *mpts, *tmpts;
        int found_working_subflow = 0, removed_some = 0;
-       int wifi_unusable = mptcp_is_wifi_unusable();
+       int wifi_unusable = mptcp_is_wifi_unusable(mpte);
 
        if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
                return;
@@ -943,8 +993,8 @@ mptcp_check_subflows_and_remove(struct mptses *mpte)
                    tp->t_state != TCPS_ESTABLISHED)
                        continue;
 
-               /* Either this subflow is in good condition while we try to send */
-               if (tp->t_rxtshift == 0 && mptetoso(mpte)->so_snd.sb_cc)
+               /* Is this subflow in good condition? */
+               if (tp->t_rxtshift == 0)
                        found_working_subflow = 1;
 
                /* Or WiFi is fine */
@@ -1225,13 +1275,18 @@ mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so
 
 static void
 mptcp_subflow_necp_cb(void *handle, __unused int action,
-                     __unused struct necp_client_flow *flow)
+                     __unused uint32_t interface_index,
+                     uint32_t necp_flags, bool *viable)
 {
+       boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
        struct inpcb *inp = (struct inpcb *)handle;
        struct socket *so = inp->inp_socket;
        struct mptsub *mpts;
        struct mptses *mpte;
 
+       if (low_power)
+               action = NECP_CLIENT_CBACTION_NONVIABLE;
+
        if (action != NECP_CLIENT_CBACTION_NONVIABLE)
                return;
 
@@ -1251,15 +1306,15 @@ mptcp_subflow_necp_cb(void *handle, __unused int action,
        mpte = tptomptp(sototcpcb(so))->mpt_mpte;
        mpts = sototcpcb(so)->t_mpsub;
 
-       mptcplog((LOG_DEBUG, "%s: Subflow became non-viable", __func__),
-                MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
+       os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
+                    __func__, mpts->mpts_ifscope, low_power);
 
        mpts->mpts_flags |= MPTSF_CLOSE_REQD;
 
        mptcp_sched_create_subflows(mpte);
 
-       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
-               flow->viable = 1;
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL)
+               *viable = 1;
 
 out:
        socket_unlock(so, 1);
@@ -1797,8 +1852,8 @@ mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
                 * Check if the full mapping is now present
                 */
                if ((int)so->so_rcv.sb_cc < dlen - dfin) {
-                       mptcplog((LOG_INFO, "%s not enough data (%u) need %u\n",
-                                 __func__, so->so_rcv.sb_cc, dlen),
+                       mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
+                                 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
                                 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
 
                        if (*mp0 == NULL)
@@ -3751,9 +3806,9 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
                if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
                        goto done;
                mpts->mpts_flags |= MPTSF_MP_DEGRADED;
-       }
-       else
+       } else {
                mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
+       }
 
        if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
                mpts->mpts_flags |= MPTSF_MP_READY;
@@ -3768,6 +3823,9 @@ mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
        if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
                VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
                ret = MPTS_EVRET_DISCONNECT_FALLBACK;
+
+               m_freem_list(mpte->mpte_reinjectq);
+               mpte->mpte_reinjectq = NULL;
        } else if (mpts->mpts_flags & MPTSF_MP_READY) {
                mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
                ret = MPTS_EVRET_CONNECT_PENDING;
@@ -3955,10 +4013,12 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *
        if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
            mpo->mpo_level == SOL_SOCKET &&
            mpo->mpo_name == SO_MARK_CELLFALLBACK) {
-               mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %u lastcell? %d boundcell? %d\n",
-                         __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(),
+               struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
+
+               mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
+                         __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
                          sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
-                         mpts->mpts_ifscope != IFSCOPE_NONE ? IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]) : -1),
+                         mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
                         MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
 
                /*
@@ -3980,8 +4040,8 @@ mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *
                 * interface, then it definitely is not a cell-fallback
                 * connection.
                 */
-               if (mpts->mpts_ifscope == IFSCOPE_NONE ||
-                   !IFNET_IS_CELLULAR(ifindex2ifnet[mpts->mpts_ifscope]))
+               if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
+                   !IFNET_IS_CELLULAR(ifp))
                        return (0);
        }
 
@@ -5667,13 +5727,12 @@ symptoms_advisory_t mptcp_advisory;
 
 static errno_t
 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
-       void **unitinfo)
+                          void **unitinfo)
 {
 #pragma unused(kctlref, sac, unitinfo)
 
        if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
-               mptcplog((LOG_ERR, "%s MPTCP kernel-control socket already open!", __func__),
-                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);
 
        mptcp_kern_skt_unit = sac->sc_unit;
 
@@ -5760,8 +5819,7 @@ mptcp_ask_symptoms(struct mptses *mpte)
        int pid, prio, err;
 
        if (mptcp_kern_skt_unit == 0) {
-               mptcplog((LOG_ERR, "%s skt_unit is still 0\n", __func__),
-                         MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
                return;
        }
 
@@ -5774,8 +5832,7 @@ mptcp_ask_symptoms(struct mptses *mpte)
 
        p = proc_find(pid);
        if (p == PROC_NULL) {
-               mptcplog((LOG_ERR, "%s Couldn't find proc for pid %u\n", __func__,
-                         pid), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
                return;
        }
 
@@ -5795,14 +5852,12 @@ mptcp_ask_symptoms(struct mptses *mpte)
        else
                ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
 
-       mptcplog((LOG_DEBUG, "%s ask symptoms about pid %u, prio %u\n", __func__,
-                 pid, ask.priority), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
-
        err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
                              &ask, sizeof(ask), CTL_DATA_EOR);
-       if (err)
-               mptcplog((LOG_ERR, "%s ctl_enqueuedata failed %d\n", __func__, err),
-                         MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+
+       os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
+                    __func__, pid, ask.priority, err);
+
 
        proc_rele(p);
 }
@@ -5826,19 +5881,20 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
        symptoms_advisory_t     *sa = NULL;
 
        if (kcunit != mptcp_kern_skt_unit)
-               mptcplog((LOG_ERR, "%s kcunit %u is different from expected one %u\n",
-                         __func__, kcunit, mptcp_kern_skt_unit),
-                        MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
+               os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
+                            __func__, kcunit, mptcp_kern_skt_unit);
 
        if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
                mbuf_freem(m);
                return (EINVAL);
        }
 
-       if (mbuf_len(m) >= sizeof(*sa))
-               sa = mbuf_data(m);
-       else
+       if (mbuf_len(m) < sizeof(*sa)) {
+               mbuf_freem(m);
                return (EINVAL);
+       }
+
+       sa = mbuf_data(m);
 
        if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
            sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
@@ -5870,6 +5926,7 @@ mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
                mptcp_allow_uuid(uuid);
        }
 
+       mbuf_freem(m);
        return (0);
 }
 
@@ -5890,11 +5947,40 @@ mptcp_control_register(void)
        (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
 }
 
+/*
+ * Three return-values:
+ * 1  : WiFi is bad
+ * 0  : WiFi is good
+ * -1 : WiFi-state is unknown, use subflow-only heuristics
+ */
 int
-mptcp_is_wifi_unusable(void)
+mptcp_is_wifi_unusable(struct mptses *mpte)
 {
-       /* a false return val indicates there is no info or wifi is ok */
-       return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
+       if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
+               if (mptcp_advisory.sa_wifi_status)
+                       return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
+
+               /*
+                * If it's a first-party app and we don't have any info
+                * about the Wi-Fi state, let's be pessimistic.
+                */
+               return (-1);
+       }
+
+       return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
+}
+
+boolean_t
+mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
+{
+       struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
+       int fail_thresh = mptcp_fail_thresh;
+
+       if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
+               fail_thresh *= 2;
+
+       return (tp->t_rxtshift >= fail_thresh &&
+               (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq));
 }
 
 /* If TFO data is successfully acked, it must be dropped from the mptcp so */
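Callers of the reworked mptcp_is_wifi_unusable() compare against 0 rather than against 1 (see the handover checks earlier in this diff), so the new "unknown" state (-1) is treated like "bad" and the per-subflow heuristics get the final say. A hedged sketch of that decision logic, using an invented helper name:

#include <stdio.h>

/*
 * Sketch of the tri-state handling: 1 = Wi-Fi bad, 0 = Wi-Fi good,
 * -1 = unknown.  Comparing against 0 makes "unknown" behave like "bad".
 */
static int
should_prefer_cell(int wifi_unusable, int subflow_is_bad)
{
	if (wifi_unusable == 0)		/* Wi-Fi known-good: stay on it */
		return 0;
	return subflow_is_bad;		/* bad or unknown: defer to subflow state */
}

int
main(void)
{
	int states[] = { 1, 0, -1 };
	int i;

	for (i = 0; i < 3; i++)
		printf("wifi=%2d, bad subflow -> prefer cell: %d\n",
		    states[i], should_prefer_cell(states[i], 1));
	return 0;
}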
index d09642033b2094551a4b28bc45dc8b13347cc982..1a85f2e3e345acc2541dafa3d1cdbc54a45a00ac 100644 (file)
@@ -47,6 +47,7 @@ struct mpt_itf_info {
        uint32_t ifindex;
        uint32_t has_v4_conn:1,
                 has_v6_conn:1,
+                has_nat64_conn:1,
                 no_mptcp_support:1;
 };
 
@@ -106,6 +107,7 @@ struct mptses {
        uint32_t        mpte_used_cell:1,
                        mpte_used_wifi:1,
                        mpte_initial_cell:1,
+                       mpte_triggered_cell,
                        mpte_handshake_success:1;
 
        struct mptcp_itf_stats  mpte_itfstats[MPTCP_ITFSTATS_SIZE];
@@ -652,9 +654,10 @@ extern u_int32_t mptcp_get_notsent_lowat(struct mptses *mpte);
 extern int mptcp_notsent_lowat_check(struct socket *so);
 extern void mptcp_ask_symptoms(struct mptses *mpte);
 extern void mptcp_control_register(void);
-extern int mptcp_is_wifi_unusable(void);
+extern int mptcp_is_wifi_unusable(struct mptses *mpte);
+extern boolean_t mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts);
 extern void mptcp_ask_for_nat64(struct ifnet *ifp);
-extern void mptcp_session_necp_cb(void *, int, struct necp_client_flow *);
+extern void mptcp_session_necp_cb(void *, int, uint32_t, uint32_t, bool *);
 extern void mptcp_set_restrictions(struct socket *mp_so);
 extern int mptcp_freeq(struct mptcb *);
 extern void mptcp_set_cellicon(struct mptses *mpte);
index 30c5e8e338dfd21f96cd76847c4265fd0fa5b212..65f2d2a41fd33b2c9a3270937d2b24ca33b26f4c 100644 (file)
@@ -235,7 +235,7 @@ rip_input(struct mbuf *m, int iphlen)
 
 #if NECP
                        if (n && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0,
-                               &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) {
+                               &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) {
                                m_freem(n);
                                /* do not inject data to pcb */
                                skipit = 1;
@@ -254,7 +254,8 @@ rip_input(struct mbuf *m, int iphlen)
                                int error = 0;
                                if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
                                    (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
-                                   (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                                   (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                                       (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                                        ret = ip_savecontrol(last, &opts, ip, n);
                                        if (ret != 0) {
                                                m_freem(n);
@@ -288,7 +289,7 @@ rip_input(struct mbuf *m, int iphlen)
        skipit = 0;
 #if NECP
        if (last && !necp_socket_is_allowed_to_send_recv_v4(last, 0, 0,
-               &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) {
+               &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) {
                m_freem(m);
                OSAddAtomic(1, &ipstat.ips_delivered);
                /* do not inject data to pcb */
@@ -307,7 +308,8 @@ rip_input(struct mbuf *m, int iphlen)
                if (last) {
                        if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
                                (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
-                               (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                               (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                               (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                                ret = ip_savecontrol(last, &opts, ip, m);
                                if (ret != 0) {
                                        m_freem(m);
@@ -455,6 +457,7 @@ rip_output(
 #if NECP
        {
                necp_kernel_policy_id policy_id;
+               necp_kernel_policy_id skip_policy_id;
                u_int32_t route_rule_id;
 
                /*
@@ -492,12 +495,12 @@ rip_output(
                }
 
                if (!necp_socket_is_allowed_to_send_recv_v4(inp, 0, 0,
-                       &ip->ip_src, &ip->ip_dst, NULL, &policy_id, &route_rule_id)) {
+                       &ip->ip_src, &ip->ip_dst, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
                        m_freem(m);
                        return(EHOSTUNREACH);
                }
 
-               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id);
+               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id);
 
                if (net_qos_policy_restricted != 0) {
                        struct ifnet *rt_ifp = NULL;
@@ -529,6 +532,12 @@ rip_output(
        m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC |
            PKTF_FLOW_RAWSOCK);
        m->m_pkthdr.pkt_proto = inp->inp_ip_p;
+       m->m_pkthdr.tx_rawip_pid = so->last_pid;
+       m->m_pkthdr.tx_rawip_e_pid = so->e_pid;
+       if (so->so_flags & SOF_DELEGATED)
+               m->m_pkthdr.tx_rawip_e_pid = so->e_pid;
+       else
+               m->m_pkthdr.tx_rawip_e_pid = 0;
 
 #if CONFIG_MACF_NET
        mac_mbuf_label_associate_inpcb(inp, m);
index a512944680c4786b8433c8bab25008d81f14cbc7..b64798dd270743871365342f82e6f71ea1c252e7 100644 (file)
@@ -102,6 +102,7 @@ struct tcphdr {
 #define        TH_ECE  0x40
 #define        TH_CWR  0x80
 #define        TH_FLAGS        (TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG|TH_ECE|TH_CWR)
+#define        TH_ACCEPT       (TH_FIN|TH_SYN|TH_RST|TH_ACK)
 
        unsigned short  th_win;         /* window */
        unsigned short  th_sum;         /* checksum */
index 216814cff767947b72862b23112da076f77618bf..c18f014e104d51dcb0e4db0602da25db2c717423 100644 (file)
@@ -237,13 +237,6 @@ SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax,
     CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autorcvbuf_max, 512 * 1024,
     "Maximum receive socket buffer size");
 
-u_int32_t tcp_autorcvbuf_max_ca = 512 * 1024;
-#if (DEBUG || DEVELOPMENT)
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, autorcvbufmaxca,
-    CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_autorcvbuf_max_ca, 0,
-    "Maximum receive socket buffer size");
-#endif /* (DEBUG || DEVELOPMENT) */
-
 #if CONFIG_EMBEDDED
 int sw_lro = 1;
 #else
@@ -290,6 +283,13 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats,
     CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0,
     "Disable access to tcpstat");
 
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit,
+    CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10,
+    "Maximum number of challenge ACKs per connection per second");
+
+SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961,
+    CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1,
+    "Enable/Disable full RFC 5961 compliance");
 
 extern int tcp_TCPTV_MIN;
 extern int tcp_acc_iaj_high;
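
The two new knobs land under net.inet.tcp as challengeack_limit and do_rfc5961. A small user-space sketch for reading them via sysctlbyname(3) — a usage illustration that assumes the generated OID names follow the declarations above:

    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
            int do_rfc5961 = 0, limit = 0;
            size_t len = sizeof(do_rfc5961);

            if (sysctlbyname("net.inet.tcp.do_rfc5961", &do_rfc5961, &len, NULL, 0) == 0)
                    printf("do_rfc5961: %d\n", do_rfc5961);

            len = sizeof(limit);
            if (sysctlbyname("net.inet.tcp.challengeack_limit", &limit, &len, NULL, 0) == 0)
                    printf("challengeack_limit: %d\n", limit);

            return (0);
    }
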
@@ -551,6 +551,40 @@ void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
 }
 #endif /* TRAFFIC_MGT */
 
+/*
+ * Perform rate limit check per connection per second
+ * tp->t_challengeack_last is the last_time diff was greater than 1sec
+ * tp->t_challengeack_count is the number of ACKs sent (within 1sec)
+ * Return TRUE if we shouldn't send the ACK due to rate limitation
+ * Return FALSE if it is still ok to send challenge ACK
+ */
+static boolean_t
+tcp_is_ack_ratelimited(struct tcpcb *tp)
+{
+       boolean_t ret = TRUE;
+       uint32_t now = tcp_now;
+       int32_t diff = 0;
+
+       diff = timer_diff(now, 0, tp->t_challengeack_last, 0);
+       /* If it is first time or diff > 1000ms,
+        * update the challengeack_last and reset the
+        * current count of ACKs
+        */
+       if (tp->t_challengeack_last == 0 || diff >= 1000) {
+               tp->t_challengeack_last = now;
+               tp->t_challengeack_count = 0;
+               ret = FALSE;
+       } else if (tp->t_challengeack_count < tcp_challengeack_limit) {
+               ret = FALSE;
+       }
+
+       /* Careful about wrap-around */
+       if (ret == FALSE && (tp->t_challengeack_count + 1 > 0))
+               tp->t_challengeack_count++;
+
+       return (ret);
+}
+
 /* Check if enough amount of data has been acknowledged since
  * bw measurement was started
  */
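
tcp_is_ack_ratelimited() keeps, per connection, a one-second window starting at t_challengeack_last and a counter t_challengeack_count capped at tcp_challengeack_limit (10 by default, per the sysctl above). A standalone sketch of the same bookkeeping, using assumed names and plain milliseconds in place of tcp_now ticks:

    #include <stdbool.h>
    #include <stdint.h>

    struct ack_limiter {
            uint32_t window_start_ms;   /* start of the current one-second window */
            uint32_t count;             /* challenge ACKs sent inside that window */
    };

    /* Returns true when a challenge ACK should be suppressed (illustration only). */
    static bool
    ack_ratelimited(struct ack_limiter *al, uint32_t now_ms, uint32_t limit)
    {
            if (al->window_start_ms == 0 || now_ms - al->window_start_ms >= 1000) {
                    al->window_start_ms = now_ms;   /* open a fresh window */
                    al->count = 0;
            } else if (al->count >= limit) {
                    return (true);                  /* over budget: drop silently */
            }
            al->count++;
            return (false);                         /* ok to send the challenge ACK */
    }
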
@@ -1815,7 +1849,7 @@ tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
                        tp->max_sndwnd = tp->snd_wnd;
 
                if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW)
-                       mptcp_update_window_fallback(tp);
+                       mptcp_update_window_wakeup(tp);
                return (true);
        }
        return (false);
@@ -2247,7 +2281,7 @@ findpcb:
        if (so->so_state & SS_ISCONNECTED) {
                // Connected TCP sockets have a fully-bound local and remote,
                // so the policy check doesn't need to override addresses
-               if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL)) {
+               if (!necp_socket_is_allowed_to_send_recv(inp, NULL, NULL, NULL)) {
                        IF_TCP_STATINC(ifp, badformat);
                        goto drop;
                }
@@ -2256,7 +2290,7 @@ findpcb:
                if (isipv6) {
                        if (!necp_socket_is_allowed_to_send_recv_v6(inp,
                                th->th_dport, th->th_sport, &ip6->ip6_dst,
-                               &ip6->ip6_src, ifp, NULL, NULL)) {
+                               &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
                                IF_TCP_STATINC(ifp, badformat);
                                goto drop;
                        }
@@ -2265,7 +2299,7 @@ findpcb:
                {
                        if (!necp_socket_is_allowed_to_send_recv_v4(inp,
                                th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
-                               ifp, NULL, NULL)) {
+                               ifp, NULL, NULL, NULL)) {
                                IF_TCP_STATINC(ifp, badformat);
                                goto drop;
                        }
@@ -2282,6 +2316,10 @@ findpcb:
        if (tp->t_state == TCPS_CLOSED)
                goto drop;
 
+       /* If none of the FIN|SYN|RST|ACK flag is set, drop */
+       if (tcp_do_rfc5961 && (thflags & TH_ACCEPT) == 0)
+               goto drop;
+
        /* Unscale the window into a 32-bit value. */
        if ((thflags & TH_SYN) == 0)
                tiwin = th->th_win << tp->snd_scale;
@@ -2603,7 +2641,7 @@ findpcb:
                        /* now drop the reference on the listener */
                        socket_unlock(oso, 1);
 
-                       tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(ifp));
+                       tcp_set_max_rwinscale(tp, so, ifp);
 
                        KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END,0,0,0,0,0);
                }
@@ -3212,6 +3250,7 @@ findpcb:
                 *   initialize CCsend and CCrecv.
                 */
                tp->snd_wnd = tiwin;    /* initial send-window */
+               tp->max_sndwnd = tp->snd_wnd;
                tp->t_flags |= TF_ACKNOW;
                tp->t_unacksegs = 0;
                DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
@@ -3317,6 +3356,7 @@ findpcb:
                if ((thflags & TH_SYN) == 0)
                        goto drop;
                tp->snd_wnd = th->th_win;       /* initial send window */
+               tp->max_sndwnd = tp->snd_wnd;
 
                tp->irs = th->th_seq;
                tcp_rcvseqinit(tp);
@@ -3524,11 +3564,20 @@ trimthenstep6:
        /* Received a SYN while connection is already established.
         * This is a "half open connection and other anomalies" described
         * in RFC793 page 34, send an ACK so the remote reset the connection
-        * or recovers by adjusting its sequence numberering
+        * or recovers by adjusting its sequence numbering. Sending an ACK is
+        * in accordance with RFC 5961 Section 4.2
         */
        case TCPS_ESTABLISHED:
-               if (thflags & TH_SYN)
-                       goto dropafterack;
+               if (thflags & TH_SYN) {
+                       /* Drop the packet silently if we have reached the limit */
+                       if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) {
+                               goto drop;
+                       } else {
+                               /* Send challenge ACK */
+                               tcpstat.tcps_synchallenge++;
+                               goto dropafterack;
+                       }
+               }
                break;
        }
 
@@ -3566,6 +3615,11 @@ trimthenstep6:
         *   only accepting RSTs where the sequence number is equal to
         *   last_ack_sent.  In all other states (the states in which a
         *   RST is more likely), the more permissive check is used.
+        * RFC 5961 Section 3.2: if the RST bit is set, sequence # is
+        *    within the receive window and last_ack_sent == seq,
+        *    then reset the connection. Otherwise if the seq doesn't
+        *    match last_ack_sent, TCP must send challenge ACK. Perform
+        *    rate limitation when sending the challenge ACK.
         * If we have multiple segments in flight, the initial reset
         * segment sequence numbers will be to the left of last_ack_sent,
         * but they will eventually catch up.
@@ -3606,52 +3660,64 @@ trimthenstep6:
                    (tp->rcv_wnd == 0 &&
                    ((tp->last_ack_sent == th->th_seq) ||
                    ((tp->last_ack_sent -1) == th->th_seq)))) {
-                       switch (tp->t_state) {
+                       if (tcp_do_rfc5961 == 0 || tp->last_ack_sent == th->th_seq) {
+                               switch (tp->t_state) {
 
-                       case TCPS_SYN_RECEIVED:
-                               IF_TCP_STATINC(ifp, rstinsynrcv);
-                               so->so_error = ECONNREFUSED;
-                               goto close;
+                               case TCPS_SYN_RECEIVED:
+                                       IF_TCP_STATINC(ifp, rstinsynrcv);
+                                       so->so_error = ECONNREFUSED;
+                                       goto close;
 
-                       case TCPS_ESTABLISHED:
-                               if (tp->last_ack_sent != th->th_seq) {
-                                       tcpstat.tcps_badrst++;
-                                       goto drop;
-                               }
-                               if (TCP_ECN_ENABLED(tp) &&
-                                   tp->snd_una == tp->iss + 1 &&
-                                   SEQ_GT(tp->snd_max, tp->snd_una)) {
+                               case TCPS_ESTABLISHED:
+                                       if (tcp_do_rfc5961 == 0 && tp->last_ack_sent != th->th_seq) {
+                                               tcpstat.tcps_badrst++;
+                                               goto drop;
+                                       }
+                                       if (TCP_ECN_ENABLED(tp) &&
+                                           tp->snd_una == tp->iss + 1 &&
+                                           SEQ_GT(tp->snd_max, tp->snd_una)) {
+                                               /*
+                                                * If the first data packet on an
+                                                * ECN connection, receives a RST
+                                                * increment the heuristic
+                                                */
+                                               tcp_heuristic_ecn_droprst(tp);
+                                       }
+                               case TCPS_FIN_WAIT_1:
+                               case TCPS_CLOSE_WAIT:
                                        /*
-                                        * If the first data packet on an
-                                        * ECN connection, receives a RST
-                                        * increment the heuristic
-                                        */
-                                       tcp_heuristic_ecn_droprst(tp);
-                               }
-                       case TCPS_FIN_WAIT_1:
-                       case TCPS_CLOSE_WAIT:
-                               /*
-                                 Drop through ...
-                               */
-                       case TCPS_FIN_WAIT_2:
-                               so->so_error = ECONNRESET;
-                       close:
-                               postevent(so, 0, EV_RESET);
-                               soevent(so,
-                                   (SO_FILT_HINT_LOCKED |
-                                   SO_FILT_HINT_CONNRESET));
-
-                               tcpstat.tcps_drops++;
-                               tp = tcp_close(tp);
-                               break;
+                                         Drop through ...
+                                       */
+                               case TCPS_FIN_WAIT_2:
+                                       so->so_error = ECONNRESET;
+                               close:
+                                       postevent(so, 0, EV_RESET);
+                                       soevent(so,
+                                           (SO_FILT_HINT_LOCKED |
+                                           SO_FILT_HINT_CONNRESET));
+
+                                       tcpstat.tcps_drops++;
+                                       tp = tcp_close(tp);
+                                       break;
 
-                       case TCPS_CLOSING:
-                       case TCPS_LAST_ACK:
-                               tp = tcp_close(tp);
-                               break;
+                               case TCPS_CLOSING:
+                               case TCPS_LAST_ACK:
+                                       tp = tcp_close(tp);
+                                       break;
 
-                       case TCPS_TIME_WAIT:
-                               break;
+                               case TCPS_TIME_WAIT:
+                                       break;
+                               }
+                       } else if (tcp_do_rfc5961) {
+                               tcpstat.tcps_badrst++;
+                               /* Drop if we have reached the ACK limit */
+                               if (tcp_is_ack_ratelimited(tp)) {
+                                       goto drop;
+                               } else {
+                                       /* Send challenge ACK */
+                                       tcpstat.tcps_rstchallenge++;
+                                       goto dropafterack;
+                               }
                        }
                }
                goto drop;
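
The restructured block above implements RFC 5961 section 3.2 for RSTs: an exact match of th_seq against last_ack_sent still tears the connection down, while any other in-window RST is answered with a rate-limited challenge ACK instead of being acted upon. A condensed sketch of that decision, as an illustration only (SEQ_* wraparound handling and the per-state close paths are omitted; the helper name is an assumption):

    #include <stdbool.h>
    #include <stdint.h>

    enum rst_action { RST_CLOSE, RST_DROP, RST_CHALLENGE_ACK };

    /* Assumes the segment was already found to be within the receive window. */
    static enum rst_action
    rfc5961_rst_action(uint32_t seq, uint32_t last_ack_sent, bool ratelimited)
    {
            if (seq == last_ack_sent)
                    return (RST_CLOSE);         /* exact match: reset the connection */
            if (ratelimited)
                    return (RST_DROP);          /* in-window but over the ACK budget */
            return (RST_CHALLENGE_ACK);         /* in-window: send a challenge ACK */
    }
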
@@ -3728,9 +3794,16 @@ trimthenstep6:
                goto dropwithreset;
        }
 
+       /*
+        * Check if there is old data at the beginning of the window
+        * i.e. the sequence number is before rcv_nxt
+        */
        todrop = tp->rcv_nxt - th->th_seq;
        if (todrop > 0) {
+               boolean_t is_syn_set = FALSE;
+
                if (thflags & TH_SYN) {
+                       is_syn_set = TRUE;
                        thflags &= ~TH_SYN;
                        th->th_seq++;
                        if (th->th_urp > 1)
@@ -3741,6 +3814,8 @@ trimthenstep6:
                }
                /*
                 * Following if statement from Stevens, vol. 2, p. 960.
+                * The amount of duplicate data is greater than or equal
+                * to the size of the segment - entire segment is duplicate
                 */
                if (todrop > tlen
                    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
@@ -3754,8 +3829,19 @@ trimthenstep6:
                        /*
                         * Send an ACK to resynchronize and drop any data.
                         * But keep on processing for RST or ACK.
+                        *
+                        * If the SYN bit was originally set, then only send
+                        * an ACK if we are not rate-limiting this connection.
                         */
-                       tp->t_flags |= TF_ACKNOW;
+                       if (tcp_do_rfc5961 && is_syn_set) {
+                               if (!tcp_is_ack_ratelimited(tp)) {
+                                       tcpstat.tcps_synchallenge++;
+                                       tp->t_flags |= TF_ACKNOW;
+                               }
+                       } else {
+                               tp->t_flags |= TF_ACKNOW;
+                       }
+
                        if (todrop == 1) {
                                /* This could be a keepalive */
                                soevent(so, SO_FILT_HINT_LOCKED |
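
A worked example for the trimming above: with tp->rcv_nxt = 1000, a segment arriving with th_seq = 980 and tlen = 50 gives todrop = 20, so the 20 duplicate bytes are discarded and the remaining 30 are processed normally. If tlen were only 15, the whole segment would be duplicate and the receiver just schedules an ACK — and when the duplicate segment also carried a SYN, that ACK is now counted against the RFC 5961 challenge-ACK budget and may be suppressed.
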
@@ -3898,15 +3984,31 @@ trimthenstep6:
        }
 
        /*
-        * If a SYN is in the window, then this is an
+        * Stevens: If a SYN is in the window, then this is an
         * error and we send an RST and drop the connection.
+        *
+        * RFC 5961 Section 4.2
+        * Send challenge ACK for any SYN in synchronized state
+        * Perform rate limitation in doing so.
         */
        if (thflags & TH_SYN) {
-               tp = tcp_drop(tp, ECONNRESET);
-               rstreason = BANDLIM_UNLIMITED;
-               postevent(so, 0, EV_RESET);
-               IF_TCP_STATINC(ifp, synwindow);
-               goto dropwithreset;
+               if (tcp_do_rfc5961) {
+                       tcpstat.tcps_badsyn++;
+                       /* Drop if we have reached ACK limit */
+                       if (tcp_is_ack_ratelimited(tp)) {
+                               goto drop;
+                       } else {
+                               /* Send challenge ACK */
+                               tcpstat.tcps_synchallenge++;
+                               goto dropafterack;
+                       }
+               } else {
+                       tp = tcp_drop(tp, ECONNRESET);
+                       rstreason = BANDLIM_UNLIMITED;
+                       postevent(so, 0, EV_RESET);
+                       IF_TCP_STATINC(ifp, synwindow);
+                       goto dropwithreset;
+               }
        }
 
        /*
@@ -3969,6 +4071,7 @@ trimthenstep6:
                        tp->snd_scale = tp->requested_s_scale;
                        tp->rcv_scale = tp->request_r_scale;
                        tp->snd_wnd = th->th_win << tp->snd_scale;
+                       tp->max_sndwnd = tp->snd_wnd;
                        tiwin = tp->snd_wnd;
                }
                /*
@@ -4088,7 +4191,18 @@ trimthenstep6:
        case TCPS_TIME_WAIT:
                if (SEQ_GT(th->th_ack, tp->snd_max)) {
                        tcpstat.tcps_rcvacktoomuch++;
-                       goto dropafterack;
+                       if (tcp_do_rfc5961 && tcp_is_ack_ratelimited(tp)) {
+                               goto drop;
+                       } else {
+                               goto dropafterack;
+                       }
+               }
+               if (tcp_do_rfc5961 && SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) {
+                       if (tcp_is_ack_ratelimited(tp)) {
+                               goto drop;
+                       } else {
+                               goto dropafterack;
+                       }
                }
                if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
                        recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
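
The second test added above is the RFC 5961 section 5.2 check for ACKs that lie too far in the past: anything acknowledging data more than one maximum send window behind snd_una is treated as suspect. For example, with tp->snd_una = 1,000,000 and tp->max_sndwnd = 65,535, an ACK of 900,000 falls below 934,465 and is answered with a rate-limited challenge ACK — or dropped silently once the per-second budget is exhausted — rather than being processed.
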
@@ -5607,12 +5721,22 @@ static inline unsigned int
 tcp_maxmtu(struct rtentry *rt)
 {
        unsigned int maxmtu;
+       int interface_mtu = 0;
 
        RT_LOCK_ASSERT_HELD(rt);
+       interface_mtu = rt->rt_ifp->if_mtu;
+
+       if (rt_key(rt)->sa_family == AF_INET &&
+           INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
+               interface_mtu = IN6_LINKMTU(rt->rt_ifp);
+               /* Further adjust the size for CLAT46 expansion */
+               interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
+       }
+
        if (rt->rt_rmx.rmx_mtu == 0)
-               maxmtu = rt->rt_ifp->if_mtu;
+               maxmtu = interface_mtu;
        else
-               maxmtu = MIN(rt->rt_rmx.rmx_mtu, rt->rt_ifp->if_mtu);
+               maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu);
 
        return (maxmtu);
 }
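
For the CLAT46 branch above — assuming CLAT46_HDR_EXPANSION_OVERHD accounts for the 20-byte difference between an IPv6 and an IPv4 header — an IPv4 flow on a CLAT46-enabled interface with an IPv6 link MTU of 1500 ends up with interface_mtu = 1480, so the segment still fits in 1500 bytes after the translator swaps the 20-byte IPv4 header for a 40-byte IPv6 header.
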
@@ -6564,6 +6688,7 @@ tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
        return (0);
 }
 
+
 SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
     "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
index 52884d35708f59ed900d89611ad0db170b9b25a3..75e8634c0bf63d96395584e7f4e4022f0840fe08 100644 (file)
@@ -2287,13 +2287,13 @@ timer:
                 *
                 * Every time new data is sent PTO will get reset.
                 */
-               if (tcp_enable_tlp && tp->t_state == TCPS_ESTABLISHED &&
-                   SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp)
-                   && tp->snd_nxt == tp->snd_max
-                   && SEQ_GT(tp->snd_nxt, tp->snd_una)
-                   && tp->t_rxtshift == 0
-                   && (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) {
-                       u_int32_t pto, srtt, new_rto = 0;
+               if (tcp_enable_tlp && len != 0 && tp->t_state == TCPS_ESTABLISHED &&
+                   SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
+                   tp->snd_nxt == tp->snd_max &&
+                   SEQ_GT(tp->snd_nxt, tp->snd_una) &&
+                   tp->t_rxtshift == 0 &&
+                   (tp->t_flagsext & (TF_SENT_TLPROBE|TF_PKTS_REORDERED)) == 0) {
+                       u_int32_t pto, srtt;
 
                        /*
                         * Using SRTT alone to set PTO can cause spurious
@@ -2311,21 +2311,9 @@ timer:
                                pto = max(10, pto);
 
                        /* if RTO is less than PTO, choose RTO instead */
-                       if (tp->t_rxtcur < pto) {
-                               /*
-                                * Schedule PTO instead of RTO in favor of
-                                * fast recovery.
-                                */
+                       if (tp->t_rxtcur < pto)
                                pto = tp->t_rxtcur;
 
-                               /* Reset the next RTO to be after PTO. */
-                               TCPT_RANGESET(new_rto,
-                                   (pto + TCP_REXMTVAL(tp)),
-                                   max(tp->t_rttmin, tp->t_rttcur + 2),
-                                   TCPTV_REXMTMAX, 0);
-                               tp->t_timer[TCPT_REXMT] =
-                                   OFFSET_FROM_START(tp, new_rto);
-                       }
                        tp->t_timer[TCPT_PTO] = OFFSET_FROM_START(tp, pto);
                }
        } else {
@@ -2412,13 +2400,14 @@ timer:
 #if NECP
        {
                necp_kernel_policy_id policy_id;
+               necp_kernel_policy_id skip_policy_id;
                u_int32_t route_rule_id;
-               if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id)) {
+               if (!necp_socket_is_allowed_to_send_recv(inp, &policy_id, &route_rule_id, &skip_policy_id)) {
                        m_freem(m);
                        error = EHOSTUNREACH;
                        goto out;
                }
-               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id);
+               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id);
 
                if (net_qos_policy_restricted != 0) {
                        necp_socket_update_qos_marking(inp, inp->inp_route.ro_rt,
@@ -2445,6 +2434,11 @@ timer:
        m->m_pkthdr.pkt_flowid = inp->inp_flowhash;
        m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
        m->m_pkthdr.pkt_proto = IPPROTO_TCP;
+       m->m_pkthdr.tx_tcp_pid = so->last_pid;
+       if (so->so_flags & SOF_DELEGATED)
+               m->m_pkthdr.tx_tcp_e_pid = so->e_pid;
+       else
+               m->m_pkthdr.tx_tcp_e_pid = 0;
 
        m->m_nextpkt = NULL;
 
index 363dea99f8e46a9333ab2a3acac05999c9315914..1c9b36da3c9fe1f865af4187177cae0803389f0c 100644 (file)
@@ -90,6 +90,7 @@
 #include <net/route.h>
 #include <net/if.h>
 #include <net/content_filter.h>
+#include <net/ntstat.h>
 
 #define        tcp_minmssoverload fring
 #define        _IP_VHL
@@ -638,21 +639,11 @@ tcp_init(struct protosw *pp, struct domain *dp)
         * maximum allowed receive and send socket buffer size.
         */
        if (nmbclusters > 30720) {
-               #if CONFIG_EMBEDDED
-                       tcp_autorcvbuf_max = 2 * 1024 * 1024;
-                       tcp_autosndbuf_max = 2 * 1024 * 1024;
-               #else
-                       tcp_autorcvbuf_max = 1024 * 1024;
-                       tcp_autosndbuf_max = 1024 * 1024;
-               #endif /* CONFIG_EMBEDDED */
+               tcp_autorcvbuf_max = 2 * 1024 * 1024;
+               tcp_autosndbuf_max = 2 * 1024 * 1024;
+
                SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
                SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
-
-               /*
-                * Receive buffer max for cellular interfaces supporting
-                * Carrier Aggregation is higher
-                */
-               tcp_autorcvbuf_max_ca = 2 * 1024 * 1024;
        }
 }
 
@@ -925,7 +916,7 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 #endif
 
 #if NECP
-       necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0);
+       necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0, 0);
 #endif /* NECP */
 
 #if IPSEC
@@ -950,6 +941,8 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
                m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
                m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
                m->m_pkthdr.pkt_proto = IPPROTO_TCP;
+               m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
+               m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
        }
 
 #if INET6
@@ -2138,17 +2131,29 @@ tcp_pcblist_n SYSCTL_HANDLER_ARGS
 
 
 SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
-           CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
-           tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
+       CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
+       tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
+
+static int
+tcp_progress_indicators SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+
+       return (ntstat_tcp_progress_indicators(req));
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress,
+       CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
+       tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link");
 
 
 __private_extern__ void
 tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
     bitstr_t *bitfield)
 {
-               inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
-                   &tcbinfo);
-       }
+       inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
+           &tcbinfo);
+}
 
 __private_extern__ uint32_t
 tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
@@ -2409,7 +2414,7 @@ tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
        }
 
        if (m == NULL ||
-           (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_seq))))
+           (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_ack))))
                return;
 
        th = (struct tcphdr *)(void *)mtodo(m, off);
@@ -2873,15 +2878,15 @@ tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
                if (inp->inp_last_outifp == NULL) {
                        inp->inp_last_outifp = rt->rt_ifp;
                }
-       }
 
-       /* Note if the peer is local */
-       if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
-               (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
-               IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
-               rt->rt_gateway->sa_family == AF_LINK ||
-               in6_localaddr(&inp->in6p_faddr))) {
-               tp->t_flags |= TF_LOCAL;
+               /* Note if the peer is local */
+               if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
+                    (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
+                    IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
+                    rt->rt_gateway->sa_family == AF_LINK ||
+                    in6_localaddr(&inp->in6p_faddr))) {
+                       tp->t_flags |= TF_LOCAL;
+               }
        }
 
        /*
@@ -3311,15 +3316,25 @@ calculate_tcp_clock(void)
  * defined by the constant tcp_autorcvbuf_max.
  */
 void
-tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so,
-    u_int32_t rcvbuf_max)
+tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, struct ifnet *ifp)
 {
-       u_int32_t maxsockbufsize;
+       uint32_t maxsockbufsize;
+       uint32_t rcvbuf_max;
+
        if (!tcp_do_rfc1323) {
                tp->request_r_scale = 0;
                return;
        }
 
+       /*
+        * When we start a connection and don't know about the interface, set
+        * the scaling factor simply to the max - we can always announce less.
+        */
+       if (!ifp || (IFNET_IS_CELLULAR(ifp) && (ifp->if_eflags & IFEF_3CA)))
+               rcvbuf_max = (tcp_autorcvbuf_max << 1);
+       else
+               rcvbuf_max = tcp_autorcvbuf_max;
+
        tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
        maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
                so->so_rcv.sb_hiwat : rcvbuf_max;
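
In effect, tcp_set_max_rwinscale() now picks the buffer ceiling itself: with tcp_autorcvbuf_max raised to 2 MB in tcp_init() (see the tcp_subr.c hunk below), an unknown interface or a cellular interface with carrier aggregation (IFEF_3CA) is sized against rcvbuf_max = 4 MB. Each window-scale step doubles the largest window that fits in the 16-bit field, so doubling the ceiling costs at most one extra step of request_r_scale.
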
@@ -3332,14 +3347,20 @@ tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so,
 }
 
 int
-tcp_notsent_lowat_check(struct socket *so) {
+tcp_notsent_lowat_check(struct socket *so)
+{
        struct inpcb *inp = sotoinpcb(so);
        struct tcpcb *tp = NULL;
        int notsent = 0;
+
        if (inp != NULL) {
                tp = intotcpcb(inp);
        }
 
+       if (tp == NULL) {
+               return (0);
+       }
+
        notsent = so->so_snd.sb_cc -
                (tp->snd_nxt - tp->snd_una);
 
index 417b61b89b9a438e6c4f8c92796272d30442cbee..1896035494b955c1f9c0431fe69d6f4cd6225fe9 100644 (file)
@@ -375,6 +375,7 @@ struct tcp_last_report_stats {
        u_int32_t       tcps_mptcp_back_to_wifi;
        u_int32_t       tcps_mptcp_wifi_proxy;
        u_int32_t       tcps_mptcp_cell_proxy;
+       u_int32_t       tcps_mptcp_triggered_cell;
 };
 
 
@@ -992,11 +993,9 @@ retransmit_packet:
 #if MPTCP
                if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
                    (tp->t_state == TCPS_ESTABLISHED) &&
-                   (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
+                   (tp->t_mpflags & TMPF_MPTCP_TRUE))
                        mptcp_act_on_txfail(so);
 
-               }
-
                if (so->so_flags & SOF_MP_SUBFLOW) {
                        struct mptses *mpte = tptomptp(tp)->mpt_mpte;
 
@@ -1126,7 +1125,7 @@ retransmit_packet:
                                if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
                                        tp->t_maxopd = tcp_pmtud_black_hole_mss;
                                } else {
-                                       tp->t_maxopd =  /* use the default MSS */
+                                       tp->t_maxopd = /* use the default MSS */
 #if INET6
                                                isipv6 ? tcp_v6mssdflt :
 #endif /* INET6 */
@@ -1135,9 +1134,9 @@ retransmit_packet:
                                tp->t_maxseg = tp->t_maxopd - optlen;
 
                                /*
-                                * Reset the slow-start flight size
+                                * Reset the slow-start flight size
                                 * as it may depend on the new MSS
-                                */
+                                */
                                if (CC_ALGO(tp)->cwnd_init != NULL)
                                        CC_ALGO(tp)->cwnd_init(tp);
                                tp->snd_cwnd = tp->t_maxseg;
@@ -1300,7 +1299,7 @@ fc_output:
                    (tp->t_flagsext & TF_DETECT_READSTALL) ||
                    (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
                    (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
-                       if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
+                       if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
                                goto dropit;
                        /*
                         * Send a packet designed to force a response
@@ -1489,9 +1488,10 @@ fc_output:
                 * send a probe
                 */
                if (tp->t_state != TCPS_ESTABLISHED ||
-                   (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING))
-                   || tp->snd_max == tp->snd_una ||
-                   !SACK_ENABLED(tp) || !TAILQ_EMPTY(&tp->snd_holes) ||
+                   (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) ||
+                   tp->snd_max == tp->snd_una ||
+                   !SACK_ENABLED(tp) ||
+                   !TAILQ_EMPTY(&tp->snd_holes) ||
                    IN_FASTRECOVERY(tp))
                        break;
 
@@ -1522,6 +1522,15 @@ fc_output:
                tp->t_tlpstart = tcp_now;
 
                tp->snd_cwnd += tp->t_maxseg;
+
+               /*
+                * When tail-loss-probe fires, we reset the RTO timer, because
+                * a probe just got sent, so we are good to push out the timer.
+                *
+                * Set to 0 to ensure that tcp_output() will reschedule it
+                */
+               tp->t_timer[TCPT_REXMT] = 0;
+
                (void )tcp_output(tp);
                tp->snd_cwnd -= tp->t_maxseg;
 
@@ -2388,7 +2397,8 @@ tcp_report_stats(void)
            &prev.tcps_mptcp_wifi_proxy , &stat.mptcp_wifi_proxy);
        tcp_cumulative_stat(tcpstat.tcps_mptcp_cell_proxy,
            &prev.tcps_mptcp_cell_proxy , &stat.mptcp_cell_proxy);
-
+       tcp_cumulative_stat(tcpstat.tcps_mptcp_triggered_cell,
+           &prev.tcps_mptcp_triggered_cell, &stat.mptcp_triggered_cell);
 
        nstat_sysinfo_send_data(&data);
 
index 1af338ade1997928b275e7d7fab1c09155ca0890..bea1e4c0da97be56e21d0bef1ff1183568fec576 100644 (file)
@@ -414,12 +414,13 @@ tcp_connect_complete(struct socket *so)
 
        /* TFO delays the tcp_output until later, when the app calls write() */
        if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
-               if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL))
+               if (!necp_socket_is_allowed_to_send_recv(sotoinpcb(so), NULL, NULL, NULL))
                        return (EHOSTUNREACH);
 
                /* Initialize enough state so that we can actually send data */
                tcp_mss(tp, -1, IFSCOPE_NONE);
                tp->snd_wnd = tp->t_maxseg;
+               tp->max_sndwnd = tp->snd_wnd;
        } else {
                error = tcp_output(tp);
        }
@@ -1068,6 +1069,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                        if (error)
                                goto out;
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
+                       tp->max_sndwnd = tp->snd_wnd;
                        tcp_mss(tp, -1, IFSCOPE_NONE);
                }
 
@@ -1119,6 +1121,7 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
                        if (error)
                                goto out;
                        tp->snd_wnd = TTCP_CLIENT_SND_WND;
+                       tp->max_sndwnd = tp->snd_wnd;
                        tcp_mss(tp, -1, IFSCOPE_NONE);
                }
                tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
@@ -1380,7 +1383,7 @@ skip_oinp:
        if (inp->inp_flowhash == 0)
                inp->inp_flowhash = inp_calc_flowhash(inp);
 
-       tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif));
+       tcp_set_max_rwinscale(tp, so, outif);
 
        soisconnecting(so);
        tcpstat.tcps_connattempt++;
@@ -1474,7 +1477,7 @@ tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct proc *p)
                    (htonl(inp->inp_flowhash) & IPV6_FLOWLABEL_MASK);
        }
 
-       tcp_set_max_rwinscale(tp, so, TCP_AUTORCVBUF_MAX(outif));
+       tcp_set_max_rwinscale(tp, so, outif);
 
        soisconnecting(so);
        tcpstat.tcps_connattempt++;
index 4fde35c904f95d0be2d3bfbb0078c7ee6425a01b..49e3f973117d3a4b81a1f68752ef620b512455a4 100644 (file)
@@ -620,6 +620,8 @@ struct tcpcb {
        SLIST_HEAD(,tcp_notify_ack_marker) t_notify_ack; /* state for notifying data acknowledgements */
        u_int32_t       t_recv_throttle_ts;     /* TS for start of recv throttle */
        u_int32_t       t_rxt_minimum_timeout;  /* minimum retransmit timeout in ms */
+       uint32_t        t_challengeack_last;    /* last time challenge ACK was sent per sec */
+       uint32_t        t_challengeack_count;   /* # of challenge ACKs already sent per sec */
 };
 
 #define IN_FASTRECOVERY(tp)    (tp->t_flags & TF_FASTRECOVERY)
@@ -718,9 +720,8 @@ extern int tcprexmtthresh;
        mptcp_reset_rexmit_state((_tp_)); \
 } while(0);
 
-#define        TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && \
-       ((_ifp_)->if_eflags & IFEF_3CA)) ? tcp_autorcvbuf_max_ca : \
-       tcp_autorcvbuf_max)
+#define        TCP_AUTORCVBUF_MAX(_ifp_) (((_ifp_) != NULL && (IFNET_IS_CELLULAR((_ifp_))) && ((_ifp_)->if_eflags & IFEF_3CA)) ? \
+               (tcp_autorcvbuf_max << 1) : tcp_autorcvbuf_max)
 
 enum tcp_cc_event {
        TCP_CC_CWND_INIT,       /* 0 */
@@ -1003,9 +1004,12 @@ struct   tcpstat {
        u_int32_t       tcps_badsyn;            /* bogus SYN, e.g. premature ACK */
        u_int32_t       tcps_mturesent;         /* resends due to MTU discovery */
        u_int32_t       tcps_listendrop;        /* listen queue overflows */
+       u_int32_t       tcps_synchallenge;      /* challenge ACK due to bad SYN */
+       u_int32_t       tcps_rstchallenge;      /* challenge ACK due to bad RST */
 
        /* new stats from FreeBSD 5.4 sync up */
        u_int32_t       tcps_minmssdrops;       /* average minmss too low drops */
+
        u_int32_t       tcps_sndrexmitbad;      /* unnecessary packet retransmissions */
        u_int32_t       tcps_badrst;            /* ignored RSTs in the window */
 
@@ -1202,6 +1206,7 @@ struct    tcpstat {
        u_int32_t       tcps_mptcp_back_to_wifi;        /* Total number of connections that succeed to move traffic away from cell (when starting on cell) */
        u_int32_t       tcps_mptcp_wifi_proxy;          /* Total number of new subflows that fell back to regular TCP on cell */
        u_int32_t       tcps_mptcp_cell_proxy;          /* Total number of new subflows that fell back to regular TCP on WiFi */
+       u_int32_t       tcps_mptcp_triggered_cell;      /* Total number of times an MPTCP-connection triggered cell bringup */
 };
 
 
@@ -1422,7 +1427,37 @@ struct  xtcpcb_n {
 #define        TCP_RTTVAR_SCALE        16      /* multiplier for rttvar; 4 bits */
 #define        TCP_RTTVAR_SHIFT        4       /* shift for rttvar; 4 bits */
 #define        TCP_DELTA_SHIFT         2       /* see tcp_input.c */
-       
+
+
+/*
+ * TCP structure with information that gives insight into forward progress on an interface,
+ * exported to user-land via sysctl(3).
+ */
+struct  xtcpprogress_indicators {
+       u_int32_t       xp_numflows;            /* Total number of flows */
+       u_int32_t       xp_conn_probe_fails;    /* Count of connection failures */
+       u_int32_t       xp_read_probe_fails;    /* Count of read probe failures */
+       u_int32_t       xp_write_probe_fails;   /* Count of write failures */
+       u_int32_t       xp_recentflows;         /* Total of "recent" flows */
+       u_int32_t       xp_recentflows_unacked; /* Total of "recent" flows with unacknowledged data */
+       u_int64_t       xp_recentflows_rxbytes; /* Total of "recent" flows received bytes */
+       u_int64_t       xp_recentflows_txbytes; /* Total of "recent" flows transmitted bytes */
+       u_int64_t       xp_recentflows_rxooo;   /* Total of "recent" flows received out of order bytes */
+       u_int64_t       xp_recentflows_rxdup;   /* Total of "recent" flows received duplicate bytes */
+       u_int64_t       xp_recentflows_retx;    /* Total of "recent" flows retransmitted bytes */
+       u_int64_t       xp_reserved1;                   /* Expansion */
+       u_int64_t       xp_reserved2;                   /* Expansion */
+       u_int64_t       xp_reserved3;                   /* Expansion */
+       u_int64_t       xp_reserved4;                   /* Expansion */
+};
+
+struct tcpprogressreq {
+       u_int64_t       ifindex;                                /* Interface index for progress indicators */
+       u_int64_t       recentflow_maxduration; /* In mach_absolute_time, max duration for flow to be counted as "recent" */
+       u_int64_t       xp_reserved1;                   /* Expansion */
+       u_int64_t       xp_reserved2;                   /* Expansion */
+};
+
 #endif /* PRIVATE */
 
 #pragma pack()
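
A hedged user-space sketch of querying the new net.inet.tcp.progress node declared in the tcp_subr.c hunk above. The convention assumed here — a struct tcpprogressreq passed as the "new" value and a struct xtcpprogress_indicators returned as the "old" value — is inferred from the shapes of the two PRIVATE structures just defined, and the example requires those private definitions to be visible when building:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            struct tcpprogressreq req;
            struct xtcpprogress_indicators ind;
            size_t len = sizeof(ind);

            memset(&req, 0, sizeof(req));
            req.ifindex = 0;                    /* assumed: 0 means "any interface" */
            req.recentflow_maxduration = 0;     /* assumed: no "recent" cutoff */

            if (sysctlbyname("net.inet.tcp.progress", &ind, &len, &req, sizeof(req)) != 0) {
                    perror("sysctlbyname");
                    return (1);
            }
            printf("flows=%u conn_fails=%u read_fails=%u write_fails=%u\n",
                ind.xp_numflows, ind.xp_conn_probe_fails,
                ind.xp_read_probe_fails, ind.xp_write_probe_fails);
            return (0);
    }
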
@@ -1505,7 +1540,6 @@ extern int tcp_ecn_outbound;
 extern int tcp_ecn_inbound;
 extern u_int32_t tcp_do_autorcvbuf;
 extern u_int32_t tcp_autorcvbuf_max;
-extern u_int32_t tcp_autorcvbuf_max_ca;
 extern u_int32_t tcp_autorcvbuf_inc_shift;
 extern int tcp_recv_bg;
 
@@ -1575,8 +1609,7 @@ void       tcp_reset_stretch_ack(struct tcpcb *tp);
 extern void tcp_get_ports_used(u_int32_t, int, u_int32_t, bitstr_t *);
 uint32_t tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags);
 uint32_t tcp_find_anypcb_byaddr(struct ifaddr *ifa);
-void    tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so,
-    u_int32_t maxrcvbuf);
+void tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, struct ifnet *ifp);
 struct bwmeas* tcp_bwmeas_alloc(struct tcpcb *tp);
 void tcp_bwmeas_free(struct tcpcb *tp);
 extern int32_t timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2);
index a04bfcca4591e3bc407b853c0a0a7a8d05473154..f11f5a4a5b2b9a9c0683b183ff6cd888ca917251 100644 (file)
@@ -119,6 +119,10 @@ extern int esp_udp_encap_port;
 #include <netinet/flow_divert.h>
 #endif /* FLOW_DIVERT */
 
+#if CONTENT_FILTER
+#include <net/content_filter.h>
+#endif /* CONTENT_FILTER */
+
 #define        DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETUDP, 0)
 #define        DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETUDP, 2)
 #define        DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETUDP, 1)
@@ -258,7 +262,11 @@ udp_init(struct protosw *pp, struct domain *dp)
        if (udp_initialized)
                return;
        udp_initialized = 1;
-
+       uint32_t pool_size = (nmbclusters << MCLSHIFT) >> MBSHIFT;
+       if (pool_size >= 96) {
+               /* Improves 10GbE UDP performance. */
+               udp_recvspace = 786896;
+       }
        LIST_INIT(&udb);
        udbinfo.ipi_listhead = &udb;
        udbinfo.ipi_hashbase = hashinit(UDBHASHSIZE, M_PCB,
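
The pool_size expression above converts the mbuf cluster pool to megabytes ((nmbclusters << MCLSHIFT) >> MBSHIFT). Assuming 2 KB clusters (MCLSHIFT = 11), the 96 MB threshold corresponds to nmbclusters >= 49152, i.e. the larger ~768 KB udp_recvspace only kicks in on machines with a sizeable cluster pool.
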
@@ -516,7 +524,7 @@ udp_input(struct mbuf *m, int iphlen)
                        skipit = 0;
                        if (!necp_socket_is_allowed_to_send_recv_v4(inp,
                            uh->uh_dport, uh->uh_sport, &ip->ip_dst,
-                           &ip->ip_src, ifp, NULL, NULL)) {
+                           &ip->ip_src, ifp, NULL, NULL, NULL)) {
                                /* do not inject data to pcb */
                                skipit = 1;
                        }
@@ -691,7 +699,7 @@ udp_input(struct mbuf *m, int iphlen)
        }
 #if NECP
        if (!necp_socket_is_allowed_to_send_recv_v4(inp, uh->uh_dport,
-           uh->uh_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL)) {
+           uh->uh_sport, &ip->ip_dst, &ip->ip_src, ifp, NULL, NULL, NULL)) {
                udp_unlock(inp->inp_socket, 1, 0);
                IF_UDP_STATINC(ifp, badipsec);
                goto bad;
@@ -706,7 +714,8 @@ udp_input(struct mbuf *m, int iphlen)
        udp_in.sin_addr = ip->ip_src;
        if ((inp->inp_flags & INP_CONTROLOPTS) != 0 ||
            (inp->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
-           (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+           (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+               (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
 #if INET6
                if (inp->inp_vflag & INP_IPV6) {
                        int savedflags;
@@ -811,7 +820,8 @@ udp_append(struct inpcb *last, struct ip *ip, struct mbuf *n, int off,
 #endif /* CONFIG_MACF_NET */
        if ((last->inp_flags & INP_CONTROLOPTS) != 0 ||
            (last->inp_socket->so_options & SO_TIMESTAMP) != 0 ||
-           (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+           (last->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+               (last->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
 #if INET6
                if (last->inp_vflag & INP_IPV6) {
                        int savedflags;
@@ -1309,9 +1319,9 @@ __private_extern__ void
 udp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
     bitstr_t *bitfield)
 {
-               inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
-                   &udbinfo);
-       }
+       inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
+           &udbinfo);
+}
 
 __private_extern__ uint32_t
 udp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
@@ -1415,6 +1425,13 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        struct ip_moptions *mopts;
        struct route ro;
        struct ip_out_args ipoa;
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+       bool cfil_faddr_use = false;
+       uint32_t cfil_so_state_change_cnt = 0;
+       short cfil_so_options = 0;
+       struct sockaddr *cfil_faddr = NULL;
+#endif
 
        bzero(&ipoa, sizeof(ipoa));
        ipoa.ipoa_boundif = IFSCOPE_NONE;
@@ -1434,6 +1451,35 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        KERNEL_DEBUG(DBG_FNC_UDP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
 
        socket_lock_assert_owned(so);
+
+#if CONTENT_FILTER
+       /*
+        * If socket is subject to UDP Content Filter and no addr is passed in,
+        * retrieve CFIL saved state from mbuf and use it if necessary.
+        */
+       if (so->so_cfil_db && !addr) {
+               cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, &cfil_so_options, &cfil_faddr);
+               if (cfil_tag) {
+                       sin = (struct sockaddr_in *)(void *)cfil_faddr;
+                       if (inp && inp->inp_faddr.s_addr == INADDR_ANY) {
+                               /*
+                                * Socket is unconnected, simply use the saved faddr as 'addr' to go through
+                                * the connect/disconnect logic.
+                                */
+                               addr = (struct sockaddr *)cfil_faddr;
+                       } else if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+                                          (inp->inp_fport != sin->sin_port ||
+                                               inp->inp_faddr.s_addr != sin->sin_addr.s_addr)) {
+                               /*
+                                * Socket is connected but socket state and dest addr/port changed.
+                                * We need to use the saved faddr info.
+                                */
+                               cfil_faddr_use = true;
+                       }
+               }
+       }
+#endif
+
        if (control != NULL) {
                sotc = so_tc_from_control(control, &netsvctype);
                VERIFY(outif == NULL);
@@ -1496,8 +1542,15 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
         * If there was a routing change, discard cached route and check
         * that we have a valid source address.  Reacquire a new source
         * address if INADDR_ANY was specified.
+        *
+        * If we are using cfil saved state, go through this cache cleanup
+        * so that we can get a new route.
         */
-       if (ROUTE_UNUSABLE(&inp->inp_route)) {
+       if (ROUTE_UNUSABLE(&inp->inp_route)
+#if CONTENT_FILTER
+               || cfil_faddr_use
+#endif
+               ) {
                struct in_ifaddr *ia = NULL;
 
                ROUTE_RELEASE(&inp->inp_route);
@@ -1551,6 +1604,14 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        lport = inp->inp_lport;
        fport = inp->inp_fport;
 
+#if CONTENT_FILTER
+       if (cfil_faddr_use)
+       {
+               faddr = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_addr;
+               fport = ((struct sockaddr_in *)(void *)cfil_faddr)->sin_port;
+       }
+#endif
+
        if (addr) {
                sin = (struct sockaddr_in *)(void *)addr;
                if (faddr.s_addr != INADDR_ANY) {
@@ -1659,9 +1720,26 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        ui->ui_ulen = htons((u_short)len + sizeof (struct udphdr));
 
        /*
-        * Set up checksum and output datagram.
+        * Set up checksum to pseudo header checksum and output datagram.
+        *
+        * Treat flows to be CLAT46'd as IPv6 flow and compute checksum
+        * no matter what, as IPv6 mandates checksum for UDP.
+        *
+        * Here we only compute the one's complement sum of the pseudo header.
+        * The payload computation and final complement is delayed to much later
+        * in IP processing to decide if remaining computation needs to be done
+        * through offload.
+        *
+        * That is communicated by setting CSUM_UDP in csum_flags.
+        * The offset of checksum from the start of ULP header is communicated
+        * through csum_data.
+        *
+        * Note since this already contains the pseudo checksum header, any
+        * later operation at IP layer that modify the values used here must
+        * update the checksum as well (for example NAT etc).
         */
-       if (udpcksum && !(inp->inp_flags & INP_UDP_NOCKSUM)) {
+       if ((inp->inp_flags2 & INP2_CLAT46_FLOW) ||
+           (udpcksum && !(inp->inp_flags & INP_UDP_NOCKSUM))) {
                ui->ui_sum = in_pseudo(ui->ui_src.s_addr, ui->ui_dst.s_addr,
                    htons((u_short)len + sizeof (struct udphdr) + IPPROTO_UDP));
                m->m_pkthdr.csum_flags = (CSUM_UDP|CSUM_ZERO_INVERT);
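
The comment above describes splitting the UDP checksum: only the one's-complement sum of the pseudo header is computed here (via in_pseudo()) and stashed in ui_sum, while the payload sum and the final inversion are left to the IP layer or the NIC offload. A rough user-space model of that partial sum — an illustration, not the kernel's in_pseudo() implementation:

    #include <stdint.h>

    /* Fold the 32-bit pseudo-header fields into a 16-bit one's-complement sum. */
    static uint16_t
    pseudo_hdr_sum(uint32_t src, uint32_t dst, uint32_t len_and_proto)
    {
            uint64_t sum = (uint64_t)(src >> 16) + (src & 0xffff) +
                (dst >> 16) + (dst & 0xffff) +
                (len_and_proto >> 16) + (len_and_proto & 0xffff);

            while (sum >> 16)                   /* fold the carries back in */
                    sum = (sum & 0xffff) + (sum >> 16);

            return ((uint16_t)sum);             /* not inverted yet: the complement happens later */
    }
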
@@ -1680,6 +1758,7 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
 #if NECP
        {
                necp_kernel_policy_id policy_id;
+               necp_kernel_policy_id skip_policy_id;
                u_int32_t route_rule_id;
 
                /*
@@ -1715,12 +1794,12 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
                }
 
                if (!necp_socket_is_allowed_to_send_recv_v4(inp, lport, fport,
-                   &laddr, &faddr, NULL, &policy_id, &route_rule_id)) {
+                   &laddr, &faddr, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
                        error = EHOSTUNREACH;
                        goto abort;
                }
 
-               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id);
+               necp_mark_packet_from_socket(m, inp, policy_id, route_rule_id, skip_policy_id);
 
                if (net_qos_policy_restricted != 0) {
                        necp_socket_update_qos_marking(inp,
@@ -1739,7 +1818,13 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
 #endif /* IPSEC */
 
        inpopts = inp->inp_options;
-       soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST));
+#if CONTENT_FILTER
+       if (cfil_tag && (inp->inp_socket->so_options != cfil_so_options))
+               soopts |= (cfil_so_options & (SO_DONTROUTE | SO_BROADCAST));
+       else
+#endif
+               soopts |= (inp->inp_socket->so_options & (SO_DONTROUTE | SO_BROADCAST));
+
        mopts = inp->inp_moptions;
        if (mopts != NULL) {
                IMO_LOCK(mopts);
@@ -1763,6 +1848,11 @@ udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr,
        m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC);
        if (flowadv)
                m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV;
+       m->m_pkthdr.tx_udp_pid = so->last_pid;
+       if (so->so_flags & SOF_DELEGATED)
+               m->m_pkthdr.tx_udp_e_pid = so->e_pid;
+       else
+               m->m_pkthdr.tx_udp_e_pid = 0;
 
        if (ipoa.ipoa_boundif != IFSCOPE_NONE)
                ipoa.ipoa_flags |= IPOAF_BOUND_IF;
@@ -1826,6 +1916,15 @@ abort:
 
                if (rt->rt_flags & (RTF_MULTICAST|RTF_BROADCAST))
                        rt = NULL;      /* unusable */
+
+#if CONTENT_FILTER
+               /*
+                * Discard temporary route for cfil case
+                */
+               if (cfil_faddr_use)
+                       rt = NULL;      /* unusable */
+#endif
+
                /*
                 * Always discard if it is a multicast or broadcast route.
                 */
@@ -1868,6 +1967,11 @@ release:
        if (outif != NULL)
                ifnet_release(outif);
 
+#if CONTENT_FILTER
+       if (cfil_tag)
+               m_tag_free(cfil_tag);
+#endif
+
        return (error);
 }
 
index 0970f698319e1b26e7aaf84dca73ef7ebcc68179..2c68e6f078ca97c577274efe6817691041cfe8e1 100644 (file)
@@ -79,11 +79,12 @@ typedef struct _esp_chachapoly_ctx {
                }                                                                                                                                               \
        } while (0)
 
-#define ESP_CHECK_ARG(_arg) ESP_ASSERT(_arg != NULL, #_arg "is NULL")
+#define ESP_CHECK_ARG(_arg) ESP_ASSERT(_arg != NULL, #_arg " is NULL")
 
 #define _esp_log(_level, _format, ...)  \
        log(_level, "%s:%d " _format, __FUNCTION__, __LINE__, ##__VA_ARGS__)
 #define esp_log_err(_format, ...) _esp_log(LOG_ERR, _format, ##__VA_ARGS__)
+#define esp_log_default(_format, ...) _esp_log(LOG_NOTICE, _format, ##__VA_ARGS__)
 
 #define _esp_packet_log(_level, _format, ...)  \
        ipseclog((_level, "%s:%d " _format, __FUNCTION__, __LINE__, ##__VA_ARGS__))
@@ -97,38 +98,47 @@ esp_chachapoly_mature(struct secasvar *sav)
        ESP_CHECK_ARG(sav);
 
        if ((sav->flags & SADB_X_EXT_OLD) != 0) {
-               esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_OLD");
+               esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_OLD, SPI 0x%08x",
+                                       ntohl(sav->spi));
                return 1;
        }
        if ((sav->flags & SADB_X_EXT_DERIV) != 0) {
-               esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_DERIV");
+               esp_log_err("ChaChaPoly is incompatible with SADB_X_EXT_DERIV, SPI 0x%08x",
+                                       ntohl(sav->spi));
                return 1;
        }
 
        if (sav->alg_enc != SADB_X_EALG_CHACHA20POLY1305) {
-               esp_log_err("ChaChaPoly unsupported algorithm %d",
-                                       sav->alg_enc);
+               esp_log_err("ChaChaPoly unsupported algorithm %d, SPI 0x%08x",
+                                       sav->alg_enc, ntohl(sav->spi));
                return 1;
        }
 
        if (sav->key_enc == NULL) {
-               esp_log_err("ChaChaPoly key is missing");
+               esp_log_err("ChaChaPoly key is missing, SPI 0x%08x",
+                                       ntohl(sav->spi));
                return 1;
        }
 
        algo = esp_algorithm_lookup(sav->alg_enc);
        if (algo == NULL) {
-               esp_log_err("ChaChaPoly lookup failed for algorithm %d",
-                                       sav->alg_enc);
+               esp_log_err("ChaChaPoly lookup failed for algorithm %d, SPI 0x%08x",
+                                       sav->alg_enc, ntohl(sav->spi));
                return 1;
        }
 
        if (sav->key_enc->sadb_key_bits != ESP_CHACHAPOLY_KEYBITS_WITH_SALT) {
-               esp_log_err("ChaChaPoly invalid key length %d bits",
-                                       sav->key_enc->sadb_key_bits);
+               esp_log_err("ChaChaPoly invalid key length %d bits, SPI 0x%08x",
+                                       sav->key_enc->sadb_key_bits, ntohl(sav->spi));
                return 1;
        }
 
+       esp_log_default("ChaChaPoly Mature SPI 0x%08x%s %s dir %u state %u mode %u",
+                                       ntohl(sav->spi),
+                                       (((sav->flags & SADB_X_EXT_IIV) != 0) ? " IIV" : ""),
+                                       ((sav->sah->ipsec_if != NULL) ? if_name(sav->sah->ipsec_if) : "NONE"),
+                                       sav->sah->dir, sav->sah->state, sav->sah->saidx.mode);
+
        return 0;
 }
 
@@ -146,22 +156,27 @@ esp_chachapoly_schedule(__unused const struct esp_algorithm *algo,
        int rc = 0;
 
        ESP_CHECK_ARG(sav);
-       if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) {
-               esp_log_err("Invalid ivlen %u", sav->ivlen);
-               return EINVAL;
-       }
        if (_KEYLEN(sav->key_enc) != ESP_CHACHAPOLY_KEY_LEN + ESP_CHACHAPOLY_SALT_LEN) {
-               esp_log_err("Invalid key len %u", _KEYLEN(sav->key_enc));
+               esp_log_err("ChaChaPoly Invalid key len %u, SPI 0x%08x",
+                                       _KEYLEN(sav->key_enc), ntohl(sav->spi));
                return EINVAL;
        }
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_OWNED);
 
        esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
+       esp_ccp_ctx->ccp_implicit_iv = ((sav->flags & SADB_X_EXT_IIV) != 0);
+
+       if (sav->ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 0 : ESP_CHACHAPOLY_IV_LEN)) {
+               esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x",
+                                       sav->ivlen, ntohl(sav->spi));
+               return EINVAL;
+       }
 
        rc = chacha20poly1305_init(&esp_ccp_ctx->ccp_ctx,
                                                           (const uint8_t *)_KEYBUF(sav->key_enc));
        if (rc != 0) {
-               esp_log_err("chacha20poly1305_init returned %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_init failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
@@ -169,11 +184,30 @@ esp_chachapoly_schedule(__unused const struct esp_algorithm *algo,
                   (const uint8_t *)_KEYBUF(sav->key_enc) + ESP_CHACHAPOLY_KEY_LEN,
                   sizeof(esp_ccp_ctx->ccp_salt));
 
-       esp_ccp_ctx->ccp_implicit_iv = ((sav->flags & SADB_X_EXT_IIV) != 0);
+
+       esp_log_default("ChaChaPoly Schedule SPI 0x%08x%s %s dir %u state %u mode %u",
+                                       ntohl(sav->spi), (esp_ccp_ctx->ccp_implicit_iv ? " IIV" : ""),
+                                       ((sav->sah->ipsec_if != NULL) ? if_name(sav->sah->ipsec_if) : "NONE"),
+                                       sav->sah->dir, sav->sah->state, sav->sah->saidx.mode);
 
        return 0;
 }
 
+int
+esp_chachapoly_ivlen(const struct esp_algorithm *algo,
+                                        struct secasvar *sav)
+{
+       ESP_CHECK_ARG(algo);
+
+       if (sav != NULL &&
+               ((sav->sched != NULL && ((esp_chachapoly_ctx_t)sav->sched)->ccp_implicit_iv) ||
+                ((sav->flags & SADB_X_EXT_IIV) != 0))) {
+               return 0;
+       } else {
+               return algo->ivlenval;
+       }
+}
+
 int
 esp_chachapoly_encrypt_finalize(struct secasvar *sav,
                                                                unsigned char *tag,
@@ -185,14 +219,16 @@ esp_chachapoly_encrypt_finalize(struct secasvar *sav,
        ESP_CHECK_ARG(sav);
        ESP_CHECK_ARG(tag);
        if (tag_bytes != ESP_CHACHAPOLY_ICV_LEN) {
-               esp_log_err("Invalid tag_bytes %u", tag_bytes);
+               esp_log_err("ChaChaPoly Invalid tag_bytes %u, SPI 0x%08x",
+                                       tag_bytes, ntohl(sav->spi));
                return EINVAL;
        }
 
        esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
        rc = chacha20poly1305_finalize(&esp_ccp_ctx->ccp_ctx, tag);
        if (rc != 0) {
-               esp_log_err("chacha20poly1305_finalize returned %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_finalize failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
        return 0;
@@ -209,14 +245,16 @@ esp_chachapoly_decrypt_finalize(struct secasvar *sav,
        ESP_CHECK_ARG(sav);
        ESP_CHECK_ARG(tag);
        if (tag_bytes != ESP_CHACHAPOLY_ICV_LEN) {
-               esp_log_err("Invalid tag_bytes %u", tag_bytes);
+               esp_log_err("ChaChaPoly Invalid tag_bytes %u, SPI 0x%08x",
+                                       tag_bytes, ntohl(sav->spi));
                return EINVAL;
        }
 
        esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
        rc = chacha20poly1305_verify(&esp_ccp_ctx->ccp_ctx, tag);
        if (rc != 0) {
-               esp_log_err("chacha20poly1305_finalize returned %d", rc);
+               esp_packet_log_err("ChaChaPoly chacha20poly1305_verify failed %d, SPI 0x%08x",
+                                                  rc, ntohl(sav->spi));
                return rc;
        }
        return 0;
@@ -236,35 +274,36 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain
        uint8_t *sp; // buffer of a given encryption round
        size_t len; // length of a given encryption round
        const int32_t ivoff = (int32_t)off + (int32_t)sizeof(struct newesp); // IV offset
-       int32_t bodyoff; // body offset
+       const int32_t bodyoff = ivoff + ivlen; // body offset
        int rc = 0; // return code of corecrypto operations
        struct newesp esp_hdr; // ESP header for AAD
        _Static_assert(sizeof(esp_hdr) == 8, "Bad size");
-       uint8_t nonce[ESP_CHACHAPOLY_NONCE_LEN];
+       uint32_t nonce[ESP_CHACHAPOLY_NONCE_LEN / 4]; // ensure 32bit alignment
+       _Static_assert(sizeof(nonce) == ESP_CHACHAPOLY_NONCE_LEN, "Bad nonce length");
        esp_chachapoly_ctx_t esp_ccp_ctx;
 
        ESP_CHECK_ARG(m);
        ESP_CHECK_ARG(sav);
-       if (ivlen != ESP_CHACHAPOLY_IV_LEN) {
+
+       esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
+
+       if (ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 0 : ESP_CHACHAPOLY_IV_LEN)) {
                m_freem(m);
-               esp_log_err("Invalid ivlen %u", ivlen);
+               esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x",
+                                       ivlen, ntohl(sav->spi));
                return EINVAL;
        }
-       if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) {
+       if (sav->ivlen != ivlen) {
                m_freem(m);
-               esp_log_err("Invalid sav->ivlen %u", sav->ivlen);
+               esp_log_err("ChaChaPoly Invalid sav->ivlen %u, SPI 0x%08x",
+                                       sav->ivlen, ntohl(sav->spi));
                return EINVAL;
        }
 
-       esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
-       if (esp_ccp_ctx->ccp_implicit_iv) {
-               bodyoff = ivoff;
-       } else {
-               bodyoff = ivoff + ivlen;
-       }
        // check if total packet length is enough to contain ESP + IV
        if (m->m_pkthdr.len < bodyoff) {
-               esp_log_err("Packet too short %d < %zu", m->m_pkthdr.len, bodyoff);
+               esp_log_err("ChaChaPoly Packet too short %d < %d, SPI 0x%08x",
+                                       m->m_pkthdr.len, bodyoff, ntohl(sav->spi));
                m_freem(m);
                return EINVAL;
        }
@@ -272,45 +311,52 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain
        rc = chacha20poly1305_reset(&esp_ccp_ctx->ccp_ctx);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_reset failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_reset failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
+       // esp_hdr is used for nonce and AAD
+       m_copydata(m, (int)off, sizeof(esp_hdr), (void *)&esp_hdr);
+
        // RFC 7634 dictates that the 12 byte nonce must be
        // the 4 byte salt followed by the 8 byte IV.
        // The IV MUST be non-repeating but does not need to be unpredictable,
        // so we use 4 bytes of 0 followed by the 4 byte ESP sequence number.
-       // this allows us to use implicit IV -- draft-mglt-ipsecme-implicit-iv
-       memset(sav->iv, 0, 4);
-       memcpy(sav->iv + 4, &sav->seq, sizeof(sav->seq));
-       _Static_assert(4 + sizeof(sav->seq) == ESP_CHACHAPOLY_IV_LEN,
-                                  "Bad IV length");
+       // This allows us to use an implicit IV -- draft-ietf-ipsecme-implicit-iv.
+       // Note that sav->seq is zero here, so we must get esp_seq from esp_hdr.
        memcpy(nonce, esp_ccp_ctx->ccp_salt, ESP_CHACHAPOLY_SALT_LEN);
-       memcpy(nonce + ESP_CHACHAPOLY_SALT_LEN, sav->iv, ESP_CHACHAPOLY_IV_LEN);
+       memset(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, 0, 4);
+       memcpy(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN + 4,
+                  &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq));
+
+       _Static_assert(4 + sizeof(esp_hdr.esp_seq) == ESP_CHACHAPOLY_IV_LEN,
+                                  "Bad IV length");
        _Static_assert(ESP_CHACHAPOLY_SALT_LEN + ESP_CHACHAPOLY_IV_LEN == sizeof(nonce),
                                   "Bad nonce length");
 
-       rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, nonce);
+       rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, (uint8_t *)nonce);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_setnonce failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_setnonce failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
        if (!esp_ccp_ctx->ccp_implicit_iv) {
+               memcpy(sav->iv, ((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, ESP_CHACHAPOLY_IV_LEN);
                m_copyback(m, ivoff, ivlen, sav->iv);
        }
        cc_clear(sizeof(nonce), nonce);
 
        // Set Additional Authentication Data (AAD)
-       m_copydata(m, (int)off, sizeof(esp_hdr), (void *)&esp_hdr);
-
        rc = chacha20poly1305_aad(&esp_ccp_ctx->ccp_ctx,
                                                          sizeof(esp_hdr),
                                                          (void *)&esp_hdr);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_aad failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_aad failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
@@ -337,7 +383,8 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain
                                                                          len, sp, sp);
                if (rc != 0) {
                        m_freem(m);
-                       esp_log_err("chacha20poly1305_encrypt failed %d", rc);
+                       esp_log_err("ChaChaPoly chacha20poly1305_encrypt failed %d, SPI 0x%08x",
+                                               rc, ntohl(sav->spi));
                        return rc;
                }
 
@@ -347,7 +394,8 @@ esp_chachapoly_encrypt(struct mbuf *m, // head of mbuf chain
        }
        if (s == NULL && soff != m->m_pkthdr.len) {
                m_freem(m);
-               esp_log_err("not enough mbufs %d %d", soff, m->m_pkthdr.len);
+               esp_log_err("ChaChaPoly not enough mbufs %d %d, SPI 0x%08x",
+                                       soff, m->m_pkthdr.len, ntohl(sav->spi));
                return EFBIG;
        }
        return 0;
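
For reference, an illustrative sketch (hypothetical names, not kernel code) of the RFC 7634 nonce layout assembled in esp_chachapoly_encrypt() above and mirrored in esp_chachapoly_decrypt() below: a 4-byte salt followed by an 8-byte IV, where the IV is 4 zero bytes plus the 32-bit ESP sequence number, which is what makes the implicit-IV mode of draft-ietf-ipsecme-implicit-iv possible.

#include <stdint.h>
#include <string.h>

#define CCP_SALT_LEN   4
#define CCP_IV_LEN     8
#define CCP_NONCE_LEN  (CCP_SALT_LEN + CCP_IV_LEN)   /* 12 bytes, per RFC 7634 */

/* Assemble salt || 0x00000000 || esp_seq into the 12-byte nonce. */
static void
ccp_build_nonce(uint8_t nonce[CCP_NONCE_LEN],
    const uint8_t salt[CCP_SALT_LEN], uint32_t esp_seq_net)
{
	memcpy(nonce, salt, CCP_SALT_LEN);               /* salt from key material */
	memset(nonce + CCP_SALT_LEN, 0, 4);              /* fixed zero half of the IV */
	memcpy(nonce + CCP_SALT_LEN + 4,                 /* per-packet ESP sequence number */
	    &esp_seq_net, sizeof(esp_seq_net));
}
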
@@ -366,35 +414,36 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
        uint8_t *sp; // buffer of a given encryption round
        size_t len; // length of a given encryption round
        const int32_t ivoff = (int32_t)off + (int32_t)sizeof(struct newesp); // IV offset
-       int32_t bodyoff; // body offset
+       const int32_t bodyoff = ivoff + ivlen; // body offset
        int rc = 0; // return code of corecrypto operations
        struct newesp esp_hdr; // ESP header for AAD
        _Static_assert(sizeof(esp_hdr) == 8, "Bad size");
-       uint8_t nonce[ESP_CHACHAPOLY_NONCE_LEN];
+       uint32_t nonce[ESP_CHACHAPOLY_NONCE_LEN / 4]; // ensure 32bit alignment
+       _Static_assert(sizeof(nonce) == ESP_CHACHAPOLY_NONCE_LEN, "Bad nonce length");
        esp_chachapoly_ctx_t esp_ccp_ctx;
 
        ESP_CHECK_ARG(m);
        ESP_CHECK_ARG(sav);
-       if (ivlen != ESP_CHACHAPOLY_IV_LEN) {
+
+       esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
+
+       if (ivlen != (esp_ccp_ctx->ccp_implicit_iv ? 0 : ESP_CHACHAPOLY_IV_LEN)) {
                m_freem(m);
-               esp_log_err("Invalid ivlen %u", ivlen);
+               esp_log_err("ChaChaPoly Invalid ivlen %u, SPI 0x%08x",
+                                       ivlen, ntohl(sav->spi));
                return EINVAL;
        }
-       if (sav->ivlen != ESP_CHACHAPOLY_IV_LEN) {
+       if (sav->ivlen != ivlen) {
                m_freem(m);
-               esp_log_err("Invalid sav->ivlen %u", sav->ivlen);
+               esp_log_err("ChaChaPoly Invalid sav->ivlen %u, SPI 0x%08x",
+                                       sav->ivlen, ntohl(sav->spi));
                return EINVAL;
        }
 
-       esp_ccp_ctx = (esp_chachapoly_ctx_t)sav->sched;
-       if (esp_ccp_ctx->ccp_implicit_iv) {
-               bodyoff = ivoff;
-       } else {
-               bodyoff = ivoff + ivlen;
-       }
        // check if total packet length is enough to contain ESP + IV
        if (m->m_pkthdr.len < bodyoff) {
-               esp_packet_log_err("Packet too short %d < %zu", m->m_pkthdr.len, bodyoff);
+               esp_packet_log_err("ChaChaPoly Packet too short %d < %d, SPI 0x%08x",
+                                                  m->m_pkthdr.len, bodyoff, ntohl(sav->spi));
                m_freem(m);
                return EINVAL;
        }
@@ -402,7 +451,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
        rc = chacha20poly1305_reset(&esp_ccp_ctx->ccp_ctx);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_reset failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_reset failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
@@ -413,20 +463,22 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
        memcpy(nonce, esp_ccp_ctx->ccp_salt, ESP_CHACHAPOLY_SALT_LEN);
        if (esp_ccp_ctx->ccp_implicit_iv) {
                // IV is implicit (4 zero bytes followed by the ESP sequence number)
-               memset(nonce + ESP_CHACHAPOLY_SALT_LEN, 0, 4);
-               memcpy(nonce + ESP_CHACHAPOLY_SALT_LEN + 4, &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq));
+               memset(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN, 0, 4);
+               memcpy(((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN + 4,
+                          &esp_hdr.esp_seq, sizeof(esp_hdr.esp_seq));
                _Static_assert(4 + sizeof(esp_hdr.esp_seq) == ESP_CHACHAPOLY_IV_LEN, "Bad IV length");
        } else {
                // copy IV from packet
-               m_copydata(m, ivoff, ESP_CHACHAPOLY_IV_LEN, nonce + ESP_CHACHAPOLY_SALT_LEN);
+               m_copydata(m, ivoff, ESP_CHACHAPOLY_IV_LEN, ((uint8_t *)nonce) + ESP_CHACHAPOLY_SALT_LEN);
        }
        _Static_assert(ESP_CHACHAPOLY_SALT_LEN + ESP_CHACHAPOLY_IV_LEN == sizeof(nonce),
                                   "Bad nonce length");
 
-       rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, nonce);
+       rc = chacha20poly1305_setnonce(&esp_ccp_ctx->ccp_ctx, (uint8_t *)nonce);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_setnonce failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_setnonce failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
        cc_clear(sizeof(nonce), nonce);
@@ -437,7 +489,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
                                                          (void *)&esp_hdr);
        if (rc != 0) {
                m_freem(m);
-               esp_log_err("chacha20poly1305_aad failed %d", rc);
+               esp_log_err("ChaChaPoly chacha20poly1305_aad failed %d, SPI 0x%08x",
+                                       rc, ntohl(sav->spi));
                return rc;
        }
 
@@ -464,7 +517,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
                                                                          len, sp, sp);
                if (rc != 0) {
                        m_freem(m);
-                       esp_packet_log_err("chacha20poly1305_decrypt failed %d", rc);
+                       esp_packet_log_err("chacha20poly1305_decrypt failed %d, SPI 0x%08x",
+                                                          rc, ntohl(sav->spi));
                        return rc;
                }
 
@@ -474,7 +528,8 @@ esp_chachapoly_decrypt(struct mbuf *m, // head of mbuf chain
        }
        if (s == NULL && soff != m->m_pkthdr.len) {
                m_freem(m);
-               esp_packet_log_err("not enough mbufs %d %d", soff, m->m_pkthdr.len);
+               esp_packet_log_err("not enough mbufs %d %d, SPI 0x%08x",
+                                                  soff, m->m_pkthdr.len, ntohl(sav->spi));
                return EFBIG;
        }
        return 0;
index b98b77a405f8a21b2a4d02cca11d80d25ace5e17..8e3c58e4dd843ae12e22e8c69c717b2a3e23a929 100644 (file)
@@ -48,6 +48,7 @@ int esp_chachapoly_decrypt(struct mbuf *, size_t, struct secasvar *,
 int esp_chachapoly_encrypt_finalize(struct secasvar *, unsigned char *, unsigned int);
 int esp_chachapoly_decrypt_finalize(struct secasvar *, unsigned char *, unsigned int);
 int esp_chachapoly_mature(struct secasvar *);
+int esp_chachapoly_ivlen(const struct esp_algorithm *, struct secasvar *);
 
 #endif /* _ESP_CHACHA_POLY_H_ */
 #endif /* BSD_KERNEL_PRIVATE */
index a26873e4500dc6675b5bed34eca6a5ba36552606..03fdcd7f1e7526f0e049694e04fc608b2aeacdc9 100644 (file)
@@ -188,7 +188,7 @@ static const struct esp_algorithm chacha_poly =
        { ESP_CHACHAPOLY_PAD_BOUND, ESP_CHACHAPOLY_IV_LEN,
                esp_chachapoly_mature, ESP_CHACHAPOLY_KEYBITS_WITH_SALT,
                ESP_CHACHAPOLY_KEYBITS_WITH_SALT, esp_chachapoly_schedlen,
-               "chacha-poly", esp_common_ivlen, esp_chachapoly_decrypt,
+               "chacha-poly", esp_chachapoly_ivlen, esp_chachapoly_decrypt,
                esp_chachapoly_encrypt, esp_chachapoly_schedule,
                NULL, NULL, ESP_CHACHAPOLY_ICV_LEN,
                esp_chachapoly_decrypt_finalize, esp_chachapoly_encrypt_finalize};
@@ -268,6 +268,7 @@ esp_schedule(const struct esp_algorithm *algo, struct secasvar *sav)
                ipseclog((LOG_ERR,
                    "esp_schedule %s: implicit IV not allowed\n",
                        algo->name));
+               lck_mtx_unlock(sadb_mutex);
                return EINVAL;
        }
 
index 853d97702f1ec23b3c7d26fac5702cd47488ca9d..58917f864b49d49917d690d96059aeb6c33d8ce4 100644 (file)
@@ -2068,7 +2068,8 @@ icmp6_rip6_input(struct mbuf **mp, int off)
                        if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
                                if ((last->in6p_flags & INP_CONTROLOPTS) != 0 ||
                                    (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-                                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                                       (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                                        ret = ip6_savecontrol(last, n, &opts);
                                        if (ret != 0) {
                                                m_freem(n);
@@ -2093,7 +2094,8 @@ icmp6_rip6_input(struct mbuf **mp, int off)
        if (last) {
                if ((last->in6p_flags & INP_CONTROLOPTS) != 0 ||
                    (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                       (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                        ret = ip6_savecontrol(last, m, &opts);
                        if (ret != 0) {
                                goto error;
@@ -2232,7 +2234,7 @@ icmp6_reflect(struct mbuf *m, size_t off)
        for (ia = in6_ifaddrs; ia; ia = ia->ia_next) {
                IFA_LOCK(&ia->ia_ifa);
                if (IN6_ARE_ADDR_EQUAL(&t, &ia->ia_addr.sin6_addr) &&
-                   (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY)) == 0) {
+                   (ia->ia6_flags & (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|IN6_IFF_CLAT46)) == 0) {
                        IFA_UNLOCK(&ia->ia_ifa);
                        src = &t;
                        break;
@@ -2651,8 +2653,8 @@ icmp6_redirect_output(struct mbuf *m0, struct rtentry *rt)
                /* get ip6 linklocal address for ifp(my outgoing interface). */
                struct in6_ifaddr *ia;
                if ((ia = in6ifa_ifpforlinklocal(ifp,
-                                                IN6_IFF_NOTREADY|
-                                                IN6_IFF_ANYCAST)) == NULL)
+                   IN6_IFF_NOTREADY|
+                   IN6_IFF_ANYCAST)) == NULL)
                        goto fail;
                IFA_LOCK(&ia->ia_ifa);
                ifp_ll6 = ia->ia_addr.sin6_addr;
index a76b74157805e541eb98131be5e2c3a4ef1d0d5b..4f34af6bac58587a0ae9bfe6750423b2844f6f67 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -262,7 +262,7 @@ static struct zone *in6ifa_zone;            /* zone for in6_ifaddr */
 #define        IN6IFA_ZONE_NAME        "in6_ifaddr"    /* zone name */
 
 struct eventhandler_lists_ctxt in6_evhdlr_ctxt;
-
+struct eventhandler_lists_ctxt in6_clat46_evhdlr_ctxt;
 /*
  * Subroutine for in6_ifaddloop() and in6_ifremloop().
  * This routine does actual work.
@@ -934,7 +934,7 @@ in6ctl_gifstat(struct ifnet *ifp, u_long cmd, struct in6_ifreq *ifr)
                /* N.B.: if_inet6data is never freed once set. */
                if (IN6_IFEXTRA(ifp) == NULL) {
                        /* return (EAFNOSUPPORT)? */
-                       bzero(&ifr->ifr_ifru.ifru_stat,
+                       bzero(&ifr->ifr_ifru.ifru_icmp6stat,
                            sizeof (ifr->ifr_ifru.ifru_icmp6stat));
                } else {
                        bcopy(&IN6_IFEXTRA(ifp)->icmp6_ifstat,
@@ -1070,6 +1070,88 @@ in6ctl_alifetime(struct in6_ifaddr *ia, u_long cmd, struct in6_ifreq *ifr,
        return (error);
 }
 
+static int
+in6ctl_clat46start(struct ifnet *ifp)
+{
+       struct nd_prefix *pr = NULL;
+       struct nd_prefix *next = NULL;
+       struct in6_ifaddr *ia6 = NULL;
+       int error = 0;
+
+       if (ifp == lo_ifp)
+               return (EINVAL);
+       /*
+        * Traverse the list of prefixes and find the first non-linklocal
+        * prefix on the interface.
+        * For the first eligible prefix found, configure a CLAT46 reserved address (sketched below).
+        */
+       lck_mtx_lock(nd6_mutex);
+       for (pr = nd_prefix.lh_first; pr; pr = next) {
+               next = pr->ndpr_next;
+
+               NDPR_LOCK(pr);
+               if (pr->ndpr_ifp != ifp) {
+                       NDPR_UNLOCK(pr);
+                       continue;
+               }
+
+               if (IN6_IS_ADDR_LINKLOCAL(&pr->ndpr_prefix.sin6_addr)) {
+                       NDPR_UNLOCK(pr);
+                       continue; /* XXX */
+               }
+
+               if (pr->ndpr_raf_auto == 0) {
+                       NDPR_UNLOCK(pr);
+                       continue;
+               }
+
+               if (pr->ndpr_stateflags & NDPRF_DEFUNCT) {
+                       NDPR_UNLOCK(pr);
+                       continue;
+               }
+
+               if ((pr->ndpr_stateflags & NDPRF_CLAT46) == 0
+                   && pr->ndpr_vltime != 0) {
+                       NDPR_ADDREF_LOCKED(pr); /* Take reference for rest of the processing */
+                       NDPR_UNLOCK(pr);
+                       break;
+               } else {
+                       NDPR_UNLOCK(pr);
+                       continue;
+               }
+       }
+       lck_mtx_unlock(nd6_mutex);
+
+       if (pr != NULL) {
+               if ((ia6 = in6_pfx_newpersistaddr(pr, FALSE, &error, TRUE)) == NULL) {
+                       nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface "
+                           "%s.\n", ifp->if_xname));
+               } else {
+                       IFA_LOCK(&ia6->ia_ifa);
+                       NDPR_LOCK(pr);
+                       ia6->ia6_ndpr = pr;
+                       NDPR_ADDREF_LOCKED(pr); /* for addr reference */
+                       pr->ndpr_stateflags |= NDPRF_CLAT46;
+                       pr->ndpr_addrcnt++;
+                       VERIFY(pr->ndpr_addrcnt != 0);
+                       NDPR_UNLOCK(pr);
+                       IFA_UNLOCK(&ia6->ia_ifa);
+                       IFA_REMREF(&ia6->ia_ifa);
+                       ia6 = NULL;
+                       /*
+                        * A newly added address might affect the status
+                        * of other addresses, so we check and update it.
+                        * XXX: what if address duplication happens?
+                        */
+                       lck_mtx_lock(nd6_mutex);
+                       pfxlist_onlink_check();
+                       lck_mtx_unlock(nd6_mutex);
+               }
+               NDPR_REMREF(pr);
+       }
+       return (error);
+}
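
For reference, a condensed sketch of the prefix eligibility test applied in the loop above. The structure below is a simplified, hypothetical view, not the kernel's struct nd_prefix.

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, flattened view of the fields consulted above. */
struct pfx_view {
	bool     on_target_ifp;   /* prefix belongs to the interface being started */
	bool     is_linklocal;    /* link-local prefixes are skipped */
	bool     raf_auto;        /* RA "autonomous" flag must be set */
	bool     defunct;         /* NDPRF_DEFUNCT prefixes are skipped */
	bool     already_clat46;  /* NDPRF_CLAT46 already configured */
	uint32_t vltime;          /* valid lifetime; 0 means unusable */
};

static bool
clat46_prefix_eligible(const struct pfx_view *p)
{
	return p->on_target_ifp && !p->is_linklocal && p->raf_auto &&
	    !p->defunct && !p->already_clat46 && p->vltime != 0;
}
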
+
 #define        ifa2ia6(ifa)    ((struct in6_ifaddr *)(void *)(ifa))
 
 /*
@@ -1191,6 +1273,30 @@ in6_control(struct socket *so, u_long cmd, caddr_t data, struct ifnet *ifp,
                error = in6ctl_llstop(ifp);
                goto done;
 
+       case SIOCCLAT46_START:          /* struct in6_ifreq */
+               if (!privileged) {
+                       error = EPERM;
+                       goto done;
+               }
+               error = in6ctl_clat46start(ifp);
+               if (error == 0)
+                       ifp->if_eflags |= IFEF_CLAT46;
+               goto done;
+
+       case SIOCCLAT46_STOP:           /* struct in6_ifreq */
+               if (!privileged) {
+                       error = EPERM;
+                       goto done;
+               }
+
+               /*
+                * Not much to be done here, and it might not be needed;
+                * it would usually be handled when the IPv6 configuration
+                * is being flushed.
+                * XXX A STOP equivalent is probably not needed here.
+                */
+               ifp->if_eflags &= ~IFEF_CLAT46;
+               goto done;
        case SIOCSETROUTERMODE_IN6:     /* struct in6_ifreq */
                if (!privileged) {
                        error = EPERM;
@@ -2500,6 +2606,8 @@ in6_unlink_ifa(struct in6_ifaddr *ia, struct ifnet *ifp)
                        NDPR_LOCK(pr);
                        VERIFY(pr->ndpr_addrcnt != 0);
                        pr->ndpr_addrcnt--;
+                       if (oia->ia6_flags & IN6_IFF_CLAT46)
+                               pr->ndpr_stateflags &= ~NDPRF_CLAT46;
                        NDPR_UNLOCK(pr);
                        NDPR_REMREF(pr);        /* release addr reference */
                }
@@ -2665,6 +2773,31 @@ in6ifa_ifpforlinklocal(struct ifnet *ifp, int ignoreflags)
        return ((struct in6_ifaddr *)ifa);
 }
 
+struct in6_ifaddr *
+in6ifa_ifpwithflag(struct ifnet * ifp, int flag)
+{
+       struct ifaddr *ifa;
+
+       ifnet_lock_shared(ifp);
+       TAILQ_FOREACH(ifa, &ifp->if_addrlist, ifa_list)
+       {
+               IFA_LOCK_SPIN(ifa);
+               if (ifa->ifa_addr->sa_family != AF_INET6 ) {
+                       IFA_UNLOCK(ifa);
+                       continue;
+               }
+               if ((((struct in6_ifaddr *)ifa)->ia6_flags & flag) == flag) {
+                       IFA_ADDREF_LOCKED(ifa);
+                       IFA_UNLOCK(ifa);
+                       break;
+               }
+               IFA_UNLOCK(ifa);
+       }
+       ifnet_lock_done(ifp);
+
+       return ((struct in6_ifaddr *)ifa);
+}
+
 /*
  * find the internet address corresponding to a given interface and address.
  */
@@ -3010,7 +3143,7 @@ in6_ifawithscope(struct ifnet *oifp, struct in6_addr *dst)
                         * nor a duplicated address.
                         */
                        if (((struct in6_ifaddr *)ifa)->ia6_flags &
-                           IN6_IFF_NOTREADY) {
+                           (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) {
                                IFA_UNLOCK(ifa);
                                continue;
                        }
@@ -3294,7 +3427,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
                        IFA_UNLOCK(ifa);
                        continue; /* XXX: is there any case to allow anycast? */
                }
-               if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) {
+               if (ifa2ia6(ifa)->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) {
                        IFA_UNLOCK(ifa);
                        continue; /* don't use this interface */
                }
@@ -3364,7 +3497,7 @@ in6_ifawithifp(struct ifnet *ifp, struct in6_addr *dst)
                        IFA_UNLOCK(ifa);
                        continue; /* XXX: is there any case to allow anycast? */
                }
-               if (ifa2ia6(ifa)->ia6_flags & IN6_IFF_NOTREADY) {
+               if (ifa2ia6(ifa)->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) {
                        IFA_UNLOCK(ifa);
                        continue; /* don't use this interface */
                }
index 3fe1484e2ff1bf7df631a7cf393ae3fd4a39d35b..e057fd9ebd52daef4c81e7bdaf91398851da69f6 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -99,7 +99,6 @@
 #define        _NETINET6_IN6_H_
 #include <sys/appleapiopts.h>
 #include <sys/_types.h>
-
 #include <sys/_types/_sa_family_t.h>
 
 /*
 /*
  * IPv6 address
  */
-struct in6_addr {
+typedef struct in6_addr {
        union {
                __uint8_t   __u6_addr8[16];
                __uint16_t  __u6_addr16[8];
                __uint32_t  __u6_addr32[4];
        } __u6_addr;                    /* 128-bit IP6 address */
-};
+} in6_addr_t;
 
 #define        s6_addr   __u6_addr.__u6_addr8
 #ifdef KERNEL  /* XXX nonstandard */
@@ -887,7 +886,6 @@ extern uint32_t in6_finalize_cksum(struct mbuf *, uint32_t, int32_t,
 
 /* IPv6 protocol events */
 extern struct eventhandler_lists_ctxt in6_evhdlr_ctxt;
-
 /*
  * XXX Avoid reordering the enum values below.
  * If the order is changed, please make sure
@@ -923,7 +921,6 @@ struct in6_event2kev {
        const char              *in6_event_str;
 };
 extern struct in6_event2kev in6_event2kev_array[];
-
 extern void in6_eventhdlr_callback(struct eventhandler_entry_arg, in6_evhdlr_code_t,
     struct ifnet *, struct in6_addr *, uint32_t);
 extern void in6_event_enqueue_nwk_wq_entry(in6_evhdlr_code_t,
@@ -934,6 +931,33 @@ typedef void (*in6_event_fn) (struct eventhandler_entry_arg, in6_evhdlr_code_t,
 EVENTHANDLER_DECLARE(in6_event, in6_event_fn);
 #endif /* BSD_KERNEL_PRIVATE */
 
+#ifdef PRIVATE
+/* CLAT46 events */
+typedef enum in6_clat46_evhdlr_code_t {
+       IN6_CLAT46_EVENT_V4_FLOW,
+       IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL,
+} in6_clat46_evhdlr_code_t;
+
+struct kev_netevent_clat46_data {
+       in6_clat46_evhdlr_code_t clat46_event_code;
+       pid_t epid;
+       uuid_t euuid;
+};
+#endif /* PRIVATE */
+
+#ifdef BSD_KERNEL_PRIVATE
+/* CLAT46 events */
+extern struct eventhandler_lists_ctxt in6_clat46_evhdlr_ctxt;
+extern void in6_clat46_eventhdlr_callback(struct eventhandler_entry_arg,
+    in6_clat46_evhdlr_code_t, pid_t, uuid_t);
+extern void in6_clat46_event_enqueue_nwk_wq_entry(in6_clat46_evhdlr_code_t,
+    pid_t, uuid_t);
+
+typedef void (*in6_clat46_event_fn) (struct eventhandler_entry_arg, in6_clat46_evhdlr_code_t,
+    pid_t, uuid_t);
+EVENTHANDLER_DECLARE(in6_clat46_event, in6_clat46_event_fn);
+#endif /* BSD_KERNEL_PRIVATE */
+
 #ifdef KERNEL_PRIVATE
 /* exported for ApplicationFirewall */
 extern int in6_localaddr(struct in6_addr *);
index 86759c2020138f7f1dfe628695a7b7fac261f797..f19872e563c4a2b89dfeb1a5eae2ce8a0c6ef77a 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -746,10 +746,6 @@ skipmcast:
                    sizeof(IN6_IFEXTRA(ifp)->icmp6_ifstat));
                bzero(&IN6_IFEXTRA(ifp)->in6_ifstat,
                    sizeof(IN6_IFEXTRA(ifp)->in6_ifstat));
-               IN6_IFEXTRA(ifp)->netsig_len = 0;
-               bzero(&IN6_IFEXTRA(ifp)->netsig,
-                   sizeof(IN6_IFEXTRA(ifp)->netsig));
-               bzero(IN6_IFEXTRA(ifp)->nat64_prefixes, sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
                /* XXX TBD Purge the layer two table */
                /*
                 * XXX When recycling, nd_ifinfo gets initialized, other
@@ -758,7 +754,7 @@ skipmcast:
        }
 
        /*
-        * XXX Only initialize NDP ifinfo for the interface
+        * XXX Only initialize IPv6 configuration for the interface
         * if the interface has not yet been configured with
         * a link-local IPv6 address.
         * Could possibly be optimized with an interface flag if need
@@ -766,6 +762,11 @@ skipmcast:
         */
        ia6 = in6ifa_ifpforlinklocal(ifp, 0);
        if (ia6 == NULL) {
+               IN6_IFEXTRA(ifp)->netsig_len = 0;
+               bzero(&IN6_IFEXTRA(ifp)->netsig,
+                   sizeof(IN6_IFEXTRA(ifp)->netsig));
+               bzero(IN6_IFEXTRA(ifp)->nat64_prefixes,
+                   sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
                /* initialize NDP variables */
                nd6_ifattach(ifp);
        } else {
index 3bd00c8a2162e663914534926b938541eff2e8eb..467b3a1640896def6e6c0563fdb0c38fdad2fda4 100644 (file)
@@ -2671,7 +2671,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
                if (error)
                        return (error);
                /* we never use msfr.msfr_srcs; */
-               memcpy(&msfr, &msfr64, sizeof(msfr));
+               memcpy(&msfr, &msfr64, sizeof(msfr64));
        } else {
                error = sooptcopyin(sopt, &msfr32,
                    sizeof(struct __msfilterreq32),
@@ -2679,7 +2679,7 @@ in6p_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
                if (error)
                        return (error);
                /* we never use msfr.msfr_srcs; */
-               memcpy(&msfr, &msfr32, sizeof(msfr));
+               memcpy(&msfr, &msfr32, sizeof(msfr32));
        }
 
        if ((size_t) msfr.msfr_nsrcs >
index 3118b7bae193af127f2095e16c51ab55ce17296d..db72c5c35c402b26f016de7469c3ca00772948ab 100644 (file)
@@ -270,8 +270,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                                 */
                                IFA_LOCK_SPIN(ifa);
                                if (((struct in6_ifaddr *)ifa)->ia6_flags &
-                                   (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
-                                   IN6_IFF_DETACHED)) {
+                                   (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY|
+                                   IN6_IFF_DETACHED | IN6_IFF_CLAT46)) {
                                        IFA_UNLOCK(ifa);
                                        IFA_REMREF(ifa);
                                        lck_rw_done(pcbinfo->ipi_lock);
@@ -295,9 +295,9 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
                        struct inpcb *t;
                        uid_t u;
 
-                       /* GROSS */
 #if !CONFIG_EMBEDDED
-                       if (ntohs(lport) < IPV6PORT_RESERVED) {
+                       if (ntohs(lport) < IPV6PORT_RESERVED &&
+                               !IN6_IS_ADDR_UNSPECIFIED(&sin6.sin6_addr)) {
                                cred = kauth_cred_proc_ref(p);
                                error = priv_check_cred(cred,
                                    PRIV_NETINET_RESERVEDPORT, 0);
@@ -533,6 +533,11 @@ in6_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct proc *p)
        struct ifnet *outif = NULL;
        struct socket *so = inp->inp_socket;
 
+#if CONTENT_FILTER
+       if (so)
+               so->so_state_change_cnt++;
+#endif
+
        if (so->so_proto->pr_protocol == IPPROTO_UDP &&
            sin6->sin6_port == htons(53) && !(so->so_flags1 & SOF1_DNS_COUNTED)) {
                so->so_flags1 |= SOF1_DNS_COUNTED;
@@ -598,6 +603,11 @@ in6_pcbdisconnect(struct inpcb *inp)
 {
        struct socket *so = inp->inp_socket;
 
+#if CONTENT_FILTER
+       if (so)
+               so->so_state_change_cnt++;
+#endif
+
        if (!lck_rw_try_lock_exclusive(inp->inp_pcbinfo->ipi_lock)) {
                /* lock inversion issue, mostly with udp multicast packets */
                socket_unlock(so, 0);
index e71714e69a380949c96b53b158c6572c5849fa6e..cd8777c0a734c00de4f154a30a9b925edee97273 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2008-2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2008-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #include <net/if.h>
 #include <net/radix.h>
 #include <net/route.h>
+#include <net/nat464_utils.h>
 
 #include <netinet/in.h>
 #include <netinet/in_systm.h>
@@ -630,6 +631,8 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXDYNROUTES,
 SYSCTL_INT(_net_inet6_ip6, OID_AUTO,
        only_allow_rfc4193_prefixes, CTLFLAG_RW | CTLFLAG_LOCKED,
        &ip6_only_allow_rfc4193_prefix, 0, "");
+SYSCTL_INT(_net_inet6_ip6, OID_AUTO,
+       clat_debug, CTLFLAG_RW | CTLFLAG_LOCKED,        &clat_debug,            0, "");
 
 /* net.inet6.icmp6 */
 SYSCTL_INT(_net_inet6_icmp6, ICMPV6CTL_REDIRACCEPT,
index ab987d2dbb86876af79ba910e8cf64a7a5bc3311..8af4dc7b1060f5a19799fd1ab944ef8faa23f622 100644 (file)
@@ -326,6 +326,15 @@ in6_selectsrc_core(struct sockaddr_in6 *dstsock, uint32_t hint_mask,
 
                IFA_LOCK(&ia->ia_ifa);
 
+               /*
+                * Simply skip addresses reserved for CLAT46
+                */
+               if (ia->ia6_flags & IN6_IFF_CLAT46) {
+                       SASEL_LOG("NEXT ia %s address on ifp1 %s skipped as it is "
+                           "reserved for CLAT46", s_src, ifp1->if_xname);
+                       goto next;
+               }
+
                /*
                 * XXX By default we are strong end system and will
                 * limit candidate set of source address to the ones
@@ -687,7 +696,7 @@ in6_selectsrc(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
                        goto done;
                }
                IFA_LOCK_SPIN(&ia6->ia_ifa);
-               if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY)) ||
+               if ((ia6->ia6_flags & (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) ||
                    (inp && inp_restricted_send(inp, ia6->ia_ifa.ifa_ifp))) {
                        IFA_UNLOCK(&ia6->ia_ifa);
                        IFA_REMREF(&ia6->ia_ifa);
@@ -1429,8 +1438,7 @@ in6_pcbsetport(struct in6_addr *laddr, struct inpcb *inp, struct proc *p,
        bool found;
        struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
        kauth_cred_t cred;
-
-       (void)laddr;
+#pragma unused(laddr)
        if (!locked) { /* Make sure we don't run into a deadlock: 4052373 */
                if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
                        socket_unlock(inp->inp_socket, 0);
index 8a08baa857257a9ef59cdbbc67068f7ba1d6ba0e..50cd3e9b9b43c851db951817c75f6f90dd50e668 100644 (file)
@@ -723,9 +723,12 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac);
 #define        SIOCLL_CGASTART_32      _IOW('i', 160, struct in6_cgareq_32)
 #define        SIOCLL_CGASTART_64      _IOW('i', 160, struct in6_cgareq_64)
 #endif
+
 #define        SIOCGIFCGAPREP_IN6      _IOWR('i', 187, struct in6_cgareq)
 #define        SIOCSIFCGAPREP_IN6      _IOWR('i', 188, struct in6_cgareq)
 
+#define        SIOCCLAT46_START        _IOWR('i', 189, struct in6_ifreq)
+#define        SIOCCLAT46_STOP         _IOWR('i', 190, struct in6_ifreq)
 #endif /* PRIVATE */
 
 #ifdef BSD_KERNEL_PRIVATE
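
For reference, a hedged userspace sketch of driving the new ioctl. struct in6_ifreq comes from the (private) header being patched here, SIOCCLAT46_START is the value defined above, and the usage shown is an assumption based on this diff rather than documented API.

#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet6/in6_var.h>   /* provides struct in6_ifreq */

#ifndef SIOCCLAT46_START         /* private ioctl, value taken from the diff above */
#define SIOCCLAT46_START _IOWR('i', 189, struct in6_ifreq)
#endif

/* Ask the kernel to configure a CLAT46 reserved address on ifname. */
static int
clat46_start(const char *ifname)
{
	struct in6_ifreq ifr;
	int s, rc;

	s = socket(AF_INET6, SOCK_DGRAM, 0);
	if (s < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));

	rc = ioctl(s, SIOCCLAT46_START, &ifr);   /* needs privilege (EPERM otherwise) */
	close(s);
	return rc;
}
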
@@ -754,6 +757,7 @@ void in6_post_msg(struct ifnet *, u_int32_t, struct in6_ifaddr *, uint8_t *mac);
 #ifdef PRIVATE
 #define        IN6_IFF_SWIFTDAD        0x0800  /* DAD with no delay */
 #endif
+#define        IN6_IFF_CLAT46          0x1000  /* Address reserved for CLAT46 */
 #define        IN6_IFF_NOPFX           0x8000  /* Deprecated. Don't use. */
 
 /* Duplicate Address Detection [DAD] in progress. */
@@ -1114,6 +1118,7 @@ extern void in6_setmaxmtu(void);
 extern void in6_restoremkludge(struct in6_ifaddr *, struct ifnet *);
 extern void in6_purgemkludge(struct ifnet *);
 extern struct in6_ifaddr *in6ifa_ifpforlinklocal(struct ifnet *, int);
+extern struct in6_ifaddr *in6ifa_ifpwithflag(struct ifnet *, int);
 extern struct in6_ifaddr *in6ifa_ifpwithaddr(struct ifnet *, struct in6_addr *);
 extern struct in6_ifaddr *in6ifa_prproxyaddr(struct in6_addr *);
 extern void in6ifa_getlifetime(struct in6_ifaddr *,
index 6ca8bba66dfb6022a7ab740e870af74efcb1462f..2e8eea64bd68f359e1e9b708d8e52f0fbfe62095 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2003-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -315,6 +315,11 @@ ip6_init(struct ip6protosw *pp, struct domain *dp)
            in6_eventhdlr_callback, eventhandler_entry_dummy_arg,
            EVENTHANDLER_PRI_ANY);
 
+       eventhandler_lists_ctxt_init(&in6_clat46_evhdlr_ctxt);
+       (void)EVENTHANDLER_REGISTER(&in6_clat46_evhdlr_ctxt, in6_clat46_event,
+           in6_clat46_eventhdlr_callback, eventhandler_entry_dummy_arg,
+           EVENTHANDLER_PRI_ANY);
+
        for (i = 0; i < IN6_EVENT_MAX; i++)
                VERIFY(in6_event2kev_array[i].in6_event_code == i);
 
@@ -895,7 +900,7 @@ check_with_pf:
                 * a lot of things in the address are set once and never
                 * changed (e.g. ia_ifp.)
                 */
-               if (!(ia6->ia6_flags & IN6_IFF_NOTREADY)) {
+               if (!(ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
                        /* this address is ready */
                        ours = 1;
                        deliverifp = ia6->ia_ifp;
@@ -1613,6 +1618,15 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
                if (*mp == NULL)
                        return (NULL);
        }
+       if ((inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
+               uint64_t time;
+
+               time = mach_continuous_time();
+               mp = sbcreatecontrol_mbuf((caddr_t)&time, sizeof (time),
+                       SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, mp);
+               if (*mp == NULL)
+                       return (NULL);
+       }
        if ((inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) != 0) {
                int tc = m_get_traffic_class(m);
 
@@ -1622,13 +1636,43 @@ ip6_savecontrol_v4(struct inpcb *inp, struct mbuf *m, struct mbuf **mp,
                        return (NULL);
        }
 
+#define        IS2292(inp, x, y)       (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
        if ((ip6->ip6_vfc & IPV6_VERSION_MASK) != IPV6_VERSION) {
-               if (v4only != NULL)
+               if (v4only != NULL) {
                        *v4only = 1;
+               }
+
+               // Send ECN flags for v4-mapped addresses
+               if ((inp->inp_flags & IN6P_TCLASS) != 0) {
+                       struct ip *ip_header = mtod(m, struct ip *);
+                       u_int8_t tos = (ip_header->ip_tos & IPTOS_ECN_MASK);
+
+                       mp = sbcreatecontrol_mbuf((caddr_t)&tos, sizeof(tos),
+                                                                         IPV6_TCLASS, IPPROTO_IPV6, mp);
+                       if (*mp == NULL)
+                               return (NULL);
+               }
+
+               // Send IN6P_PKTINFO for v4-mapped address
+               if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
+                       struct in6_pktinfo pi6 = {
+                               .ipi6_addr = IN6ADDR_V4MAPPED_INIT,
+                               .ipi6_ifindex = (m && m->m_pkthdr.rcvif) ? m->m_pkthdr.rcvif->if_index : 0,
+                       };
+
+                       struct ip *ip_header = mtod(m, struct ip *);
+                       bcopy(&ip_header->ip_dst, &pi6.ipi6_addr.s6_addr32[3], sizeof(struct in_addr));
+
+                       mp = sbcreatecontrol_mbuf((caddr_t)&pi6,
+                                                                         sizeof (struct in6_pktinfo),
+                                                                         IS2292(inp, IPV6_2292PKTINFO, IPV6_PKTINFO),
+                                                                         IPPROTO_IPV6, mp);
+                       if (*mp == NULL)
+                               return (NULL);
+               }
                return (mp);
        }
 
-#define        IS2292(inp, x, y)       (((inp)->inp_flags & IN6P_RFC2292) ? (x) : (y))
        /* RFC 2292 sec. 5 */
        if ((inp->inp_flags & IN6P_PKTINFO) != 0) {
                struct in6_pktinfo pi6;
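
For reference, a small userspace sketch (not kernel code) of the v4-mapped construction used above: the IPv4 destination is placed in the low 32 bits of a ::ffff:0:0/96 address.

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int
main(void)
{
	struct in_addr  v4;
	struct in6_addr mapped = IN6ADDR_ANY_INIT;    /* start from all zeroes */
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.33", &v4);

	mapped.s6_addr[10] = 0xff;                    /* ::ffff:0:0/96 prefix */
	mapped.s6_addr[11] = 0xff;
	memcpy(&mapped.s6_addr[12], &v4, sizeof(v4)); /* IPv4 address in the low 32 bits */

	printf("%s\n", inet_ntop(AF_INET6, &mapped, buf, sizeof(buf)));
	/* prints ::ffff:192.0.2.33 */
	return 0;
}
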
index 73a66159e4109fa15b81ef42626f1351ede9a37f..0720d6809f59992495cfab7ec091fa4aaba186b1 100644 (file)
 #include <net/net_osdep.h>
 #include <net/net_perf.h>
 
+#include <netinet/ip.h>
 #include <netinet/in.h>
 #include <netinet/in_var.h>
 #include <netinet/ip_var.h>
@@ -2555,6 +2556,13 @@ ip6_ctloutput(struct socket *so, struct sockopt *sopt)
                                optp = &in6p->in6p_outputopts;
                                error = ip6_pcbopt(optname, (u_char *)&optval,
                                    sizeof (optval), optp, uproto);
+
+                               if (optname == IPV6_TCLASS) {
+                                       // Add in the ECN flags
+                                       u_int8_t tos = (in6p->inp_ip_tos & ~IPTOS_ECN_MASK);
+                                       u_int8_t ecn = optval & IPTOS_ECN_MASK;
+                                       in6p->inp_ip_tos = tos | ecn;
+                               }
                                break;
                        }
 
index 67fcd97bc1cd817dddc5c944a371549faf059088..23c5107107b1302d3a629316a957e733d294d536 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -339,6 +339,32 @@ struct     ip6stat {
 
        /* NECP policy related drop */
        u_quad_t ip6s_necp_policy_drop;
+
+       /* CLAT46 stats */
+       u_quad_t ip6s_clat464_in_tooshort_drop;
+       u_quad_t ip6s_clat464_in_nov6addr_drop;
+       u_quad_t ip6s_clat464_in_nov4addr_drop;
+       u_quad_t ip6s_clat464_in_v4synthfail_drop;
+       u_quad_t ip6s_clat464_in_64transfail_drop;
+       u_quad_t ip6s_clat464_in_64proto_transfail_drop;
+       u_quad_t ip6s_clat464_in_64frag_transfail_drop;
+       u_quad_t ip6s_clat464_in_invalpbuf_drop;
+       u_quad_t ip6s_clat464_in_success;
+       u_quad_t ip6s_clat464_in_drop;
+       u_quad_t ip6s_clat464_in_v4_drop;
+
+       u_quad_t ip6s_clat464_out_nov6addr_drop;
+       u_quad_t ip6s_clat464_out_v6synthfail_drop;
+       u_quad_t ip6s_clat464_out_46transfail_drop;
+       u_quad_t ip6s_clat464_out_46proto_transfail_drop;
+       u_quad_t ip6s_clat464_out_46frag_transfail_drop;
+       u_quad_t ip6s_clat464_out_invalpbuf_drop;
+       u_quad_t ip6s_clat464_out_success;
+       u_quad_t ip6s_clat464_out_drop;
+
+       u_quad_t ip6s_clat464_v6addr_conffail;
+       u_quad_t ip6s_clat464_plat64_pfx_setfail;
+       u_quad_t ip6s_clat464_plat64_pfx_getfail;
 };
 
 enum ip6s_sources_rule_index {
@@ -421,6 +447,7 @@ struct ip6_out_args {
 #define        IP6OAF_AWDL_UNRESTRICTED 0x00000040     /* privileged AWDL */
 #define        IP6OAF_QOSMARKING_ALLOWED 0x00000080    /* policy allows Fastlane DSCP marking */
 #define IP6OAF_INTCOPROC_ALLOWED 0x00000100    /* access to internal coproc interfaces */
+#define        IP6OAF_NO_LOW_POWER     0x00000200      /* skip low power */
        u_int32_t       ip6oa_retflags; /* IP6OARF return flags (see below) */
 #define        IP6OARF_IFDENIED        0x00000001      /* denied access to interface */
        int             ip6oa_sotc;             /* traffic class for Fastlane DSCP mapping */
index 283158d5843843c2efb2517025f710c7ac5b2df9..5442bf7e837c45668589eaa289b1d81dbc1d0155 100644 (file)
@@ -2873,8 +2873,10 @@ ipsec_updatereplay(u_int32_t seq, struct secasvar *sav)
        wsizeb = replay->wsize << 3;
 
        /* sequence number of 0 is invalid */
-       if (seq == 0)
-               return 1;
+       if (seq == 0) {
+               lck_mtx_unlock(sadb_mutex);
+               return 1;
+       }
 
        /* first time */
        if (replay->count == 0) {
@@ -3274,14 +3276,31 @@ ipsec4_interface_output(struct ipsec_output_state *state, ifnet_t interface)
        
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED);
        
-       if (!state)
+       if (state == NULL) {
                panic("state == NULL in ipsec4_output");
-       if (!state->m)
+       }
+       if (state->m == NULL) {
                panic("state->m == NULL in ipsec4_output");
-       if (!state->dst)
+       }
+       if (state->dst == NULL) {
                panic("state->dst == NULL in ipsec4_output");
+       }
+
+       struct ip *ip = mtod(state->m, struct ip *);
+
+       struct sockaddr_in src = {};
+       src.sin_family = AF_INET;
+       src.sin_len = sizeof(src);
+       memcpy(&src.sin_addr, &ip->ip_src, sizeof(src.sin_addr));
+
+       struct sockaddr_in dst = {};
+       dst.sin_family = AF_INET;
+       dst.sin_len = sizeof(dst);
+       memcpy(&dst.sin_addr, &ip->ip_dst, sizeof(dst.sin_addr));
        
-       sav = key_alloc_outbound_sav_for_interface(interface, AF_INET);
+       sav = key_alloc_outbound_sav_for_interface(interface, AF_INET,
+                                                                                          (struct sockaddr *)&src,
+                                                                                          (struct sockaddr *)&dst);
        if (sav == NULL) {
                goto bad;
        }
@@ -3291,13 +3310,15 @@ ipsec4_interface_output(struct ipsec_output_state *state, ifnet_t interface)
        }
        
        KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, 0,0,0,0,0);
-       if (sav)
+       if (sav) {
                key_freesav(sav, KEY_SADB_UNLOCKED);
+       }
        return 0;
        
 bad:
-       if (sav)
+       if (sav) {
                key_freesav(sav, KEY_SADB_UNLOCKED);
+       }
        m_freem(state->m);
        state->m = NULL;
        KERNEL_DEBUG(DBG_FNC_IPSEC_OUT | DBG_FUNC_END, error,0,0,0,0);
@@ -4058,16 +4079,34 @@ ipsec6_interface_output(struct ipsec_output_state *state, ifnet_t interface, u_c
        
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED);
        
-       if (!state)
+       if (state == NULL) {
                panic("state == NULL in ipsec6_output");
-       if (!state->m)
+       }
+       if (state->m == NULL) {
                panic("state->m == NULL in ipsec6_output");
-       if (!nexthdrp)
+       }
+       if (nexthdrp == NULL) {
                panic("nexthdrp == NULL in ipsec6_output");
-       if (!mprev)
+       }
+       if (mprev == NULL) {
                panic("mprev == NULL in ipsec6_output");
-       
-       sav = key_alloc_outbound_sav_for_interface(interface, AF_INET6);
+       }
+
+       struct ip6_hdr *ip6 = mtod(state->m, struct ip6_hdr *);
+
+       struct sockaddr_in6 src = {};
+       src.sin6_family = AF_INET6;
+       src.sin6_len = sizeof(src);
+       memcpy(&src.sin6_addr, &ip6->ip6_src, sizeof(src.sin6_addr));
+
+       struct sockaddr_in6 dst = {};
+       dst.sin6_family = AF_INET6;
+       dst.sin6_len = sizeof(dst);
+       memcpy(&dst.sin6_addr, &ip6->ip6_dst, sizeof(dst.sin6_addr));
+
+       sav = key_alloc_outbound_sav_for_interface(interface, AF_INET6,
+                                                                                          (struct sockaddr *)&src,
+                                                                                          (struct sockaddr *)&dst);
        if (sav == NULL) {
                goto bad;
        }
@@ -4083,13 +4122,15 @@ ipsec6_interface_output(struct ipsec_output_state *state, ifnet_t interface, u_c
                }
        }
        
-       if (sav)
+       if (sav) {
                key_freesav(sav, KEY_SADB_UNLOCKED);
+       }
        return 0;
        
 bad:
-       if (sav)
+       if (sav) {
                key_freesav(sav, KEY_SADB_UNLOCKED);
+       }
        m_freem(state->m);
        state->m = NULL;
        return error;
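
The two hunks above make ipsec4_interface_output() and ipsec6_interface_output() derive source/destination sockaddrs from the outgoing packet's own header and hand them to the SA lookup, so a transport-mode SA on an ipsec interface can be matched against the actual flow instead of being picked blindly. A minimal sketch of the IPv4 half of that pattern, assuming only the BSD sockaddr_in layout (this is not the kernel code itself):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/* Build the sockaddr_in pair handed to the SA lookup from the packet's
 * addresses.  sin_len is the BSD-specific length field also set in the diff. */
static void
fill_sa_lookup_addrs(struct in_addr pkt_src, struct in_addr pkt_dst,
    struct sockaddr_in *src, struct sockaddr_in *dst)
{
	memset(src, 0, sizeof(*src));
	src->sin_family = AF_INET;
	src->sin_len = sizeof(*src);
	memcpy(&src->sin_addr, &pkt_src, sizeof(src->sin_addr));

	memset(dst, 0, sizeof(*dst));
	dst->sin_family = AF_INET;
	dst->sin_len = sizeof(*dst);
	memcpy(&dst->sin_addr, &pkt_dst, sizeof(dst->sin_addr));
}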
index 0b150814ae62e491db59bc67552901d4aa6a5b7f..c0c9d9a56640ea7850202f44dbaf4e06d1c0807c 100644 (file)
@@ -531,6 +531,10 @@ nd6_ifattach(struct ifnet *ifp)
        nd6_ifreset(ifp);
        lck_mtx_unlock(&ndi->lock);
        nd6_setmtu(ifp);
+
+       nd6log0((LOG_INFO,
+           "Reinit'd ND information for interface %s\n",
+           if_name(ifp)));
        return;
 }
 
@@ -1390,7 +1394,7 @@ addrloop:
                if (pr->ndpr_expire != 0 && pr->ndpr_expire < timenow) {
                        /*
                         * address expiration and prefix expiration are
-                        * separate.  NEVER perform in6_purgeaddr here.
+                        * separate. NEVER perform in6_purgeaddr here.
                         */
                        pr->ndpr_stateflags |= NDPRF_PROCESSED_SERVICE;
                        NDPR_ADDREF_LOCKED(pr);
index 13bb3e96303d11602d658cc87505134ba149593d..04c90e17a8da1a7265b1881f05108bd222536bfb 100644 (file)
@@ -458,6 +458,7 @@ struct      in6_ndifreq_64 {
 #define        NDPRF_PROCESSED_ONLINK  0x08000
 #define        NDPRF_PROCESSED_SERVICE 0x10000
 #define        NDPRF_DEFUNCT           0x20000
+#define        NDPRF_CLAT46            0x40000
 #endif
 
 /* protocol constants */
@@ -871,6 +872,8 @@ extern void nd6_alt_node_present(struct ifnet *, struct sockaddr_in6 *,
 extern void nd6_alt_node_absent(struct ifnet *, struct sockaddr_in6 *);
 
 /* nd6_rtr.c */
+extern struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int,
+    int *, boolean_t);
 extern void nd6_rtr_init(void);
 extern void nd6_rs_input(struct mbuf *, int, int);
 extern void nd6_ra_input(struct mbuf *, int, int);
index f54c20f547f9609aeea6598d654fdb1b2b38a17a..453eec269259b7abd7f3a66dcb4f4c64078afc6d 100644 (file)
@@ -100,9 +100,6 @@ static struct nd_defrouter *defrtrlist_update_common(struct nd_defrouter *,
     boolean_t);
 static struct nd_defrouter *defrtrlist_update(struct nd_defrouter *);
 
-static struct in6_ifaddr *in6_pfx_newpersistaddr(struct nd_prefix *, int,
-    int *);
-
 static struct nd_pfxrouter *pfxrtr_lookup(struct nd_prefix *,
        struct nd_defrouter *);
 static void pfxrtr_add(struct nd_prefix *, struct nd_defrouter *);
@@ -2362,8 +2359,7 @@ prelist_update(
                 * No address matched and the valid lifetime is non-zero.
                 * Create a new address.
                 */
-
-               if ((ia6 = in6_pfx_newpersistaddr(new, mcast, &error))
+               if ((ia6 = in6_pfx_newpersistaddr(new, mcast, &error, FALSE))
                    != NULL) {
                        /*
                         * note that we should use pr (not new) for reference.
@@ -2401,6 +2397,46 @@ prelist_update(
                        IFA_REMREF(&ia6->ia_ifa);
                        ia6 = NULL;
 
+                       /*
+                        * If the interface is marked for CLAT46 configuration,
+                        * try to configure the reserved IPv6 address for
+                        * stateless translation.
+                        */
+                       if (IS_INTF_CLAT46(ifp)) {
+                               if ((ia6 = in6_pfx_newpersistaddr(new, mcast, &error, TRUE)) != NULL) {
+                                       IFA_LOCK(&ia6->ia_ifa);
+                                       NDPR_LOCK(pr);
+                                       ia6->ia6_ndpr = pr;
+                                       NDPR_ADDREF_LOCKED(pr); /* for addr reference */
+                                       pr->ndpr_addrcnt++;
+                                       VERIFY(pr->ndpr_addrcnt != 0);
+                                       pr->ndpr_stateflags |= NDPRF_CLAT46;
+                                       NDPR_UNLOCK(pr);
+                                       IFA_UNLOCK(&ia6->ia_ifa);
+                                       IFA_REMREF(&ia6->ia_ifa);
+                                       ia6 = NULL;
+                               } else if (error != EEXIST) {
+                                       uuid_t tmp_uuid = {};
+                                       /*
+                                        * Only report the error if it is not
+                                        * EEXIST.
+                                        */
+                                       ip6stat.ip6s_clat464_v6addr_conffail++;
+                                       in6_clat46_event_enqueue_nwk_wq_entry(
+                                           IN6_CLAT46_EVENT_V6_ADDR_CONFFAIL,
+                                           0,
+                                           tmp_uuid);
+                                       nd6log0((LOG_ERR, "Could not configure CLAT46 address on interface "
+                                           "%s.\n", ifp->if_xname));
+                               }
+                               /*
+                                * Reset the error, as we do not want to treat
+                                * a CLAT46 address configuration failure as a
+                                * complete failure of the prelist update path.
+                                */
+                               error = 0;
+                       }
+
                        /*
                         * A newly added address might affect the status
                         * of other addresses, so we check and update it.
@@ -2411,7 +2447,6 @@ prelist_update(
                        lck_mtx_unlock(nd6_mutex);
                }
        }
-
 end:
        if (pr != NULL)
                NDPR_REMREF(pr);
@@ -3543,8 +3578,8 @@ nd6_prefix_offlink(struct nd_prefix *pr)
        return (error);
 }
 
-static struct in6_ifaddr *
-in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp)
+struct in6_ifaddr *
+in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp, boolean_t is_clat46)
 {
        struct in6_ifaddr *ia6 = NULL;
        struct ifnet *ifp = NULL;
@@ -3619,7 +3654,7 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp)
        lck_mtx_unlock(&ndi->lock);
        NDPR_UNLOCK(pr);
 
-       if (notcga) {
+       if (notcga && !is_clat46) {
                ia6 = in6ifa_ifpforlinklocal(ifp, 0);
                if (ia6 == NULL) {
                        error = EADDRNOTAVAIL;
@@ -3644,22 +3679,43 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp)
                in6_cga_node_lock();
                struct in6_cga_prepare local_cga_prepare;
 
+               /*
+                * XXX For now the collision count is not used in the classical
+                * way for secure addresses. Use a different collision count
+                * value to generate the reserved address for stateless CLAT46.
+                */
                if (ndi->cga_initialized) {
                        bcopy(&(ndi->local_cga_modifier),
                            &(local_cga_prepare.cga_modifier),
                            sizeof(local_cga_prepare.cga_modifier));
-                       error = in6_cga_generate(&local_cga_prepare, 0,
-                           &ifra.ifra_addr.sin6_addr);
+                       if (!is_clat46) {
+                               error = in6_cga_generate(&local_cga_prepare, 0,
+                                   &ifra.ifra_addr.sin6_addr);
+                       } else {
+                               error = in6_cga_generate(&local_cga_prepare, 1,
+                                   &ifra.ifra_addr.sin6_addr);
+                       }
                } else {
-                       error = in6_cga_generate(NULL, 0,
-                           &ifra.ifra_addr.sin6_addr);
+                       if (!is_clat46)
+                               error = in6_cga_generate(NULL, 0,
+                                   &ifra.ifra_addr.sin6_addr);
+                       else
+                               error = in6_cga_generate(NULL, 1,
+                                   &ifra.ifra_addr.sin6_addr);
                }
                in6_cga_node_unlock();
-               if (error == 0)
+               if (error == 0) {
                        ifra.ifra_flags |= IN6_IFF_SECURED;
-               else {
-                       nd6log((LOG_ERR, "%s: no CGA available (%s)\n",
-                           __func__, if_name(ifp)));
+                       if (is_clat46)
+                               ifra.ifra_flags |= IN6_IFF_CLAT46;
+               } else {
+                       if (!is_clat46)
+                               nd6log((LOG_ERR, "%s: no CGA available (%s)\n",
+                                   __func__, if_name(ifp)));
+                       else
+                               nd6log((LOG_ERR, "%s: no CLAT46 available (%s)\n",
+                                    __func__, if_name(ifp)));
                        goto done;
                }
        }
@@ -3686,7 +3742,7 @@ in6_pfx_newpersistaddr(struct nd_prefix *pr, int mcast, int *errorp)
         */
        if ((ia6 = in6ifa_ifpwithaddr(ifp, &ifra.ifra_addr.sin6_addr))
            != NULL) {
-               error = EADDRNOTAVAIL;
+               error = EEXIST;
                IFA_REMREF(&ia6->ia_ifa);
                ia6 = NULL;
 
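
The CLAT46 changes above reuse in6_pfx_newpersistaddr() for two different addresses per prefix: the regular secured (CGA) address and a reserved address for stateless CLAT46 translation, distinguished only by the collision-count argument passed to in6_cga_generate() and by the IN6_IFF_CLAT46 flag. A sketch of that selection, assuming the in-kernel types and signatures shown in the diff:

/* Sketch only: collision count 0 yields the normal secured address,
 * collision count 1 yields the reserved CLAT46 address (per the diff). */
static int
generate_prefix_addr(struct in6_cga_prepare *prep, boolean_t is_clat46,
    struct in6_addr *addr, u_int32_t *ifa_flags)
{
	int error = in6_cga_generate(prep, is_clat46 ? 1 : 0, addr);
	if (error == 0) {
		*ifa_flags |= IN6_IFF_SECURED;
		if (is_clat46)
			*ifa_flags |= IN6_IFF_CLAT46;
	}
	return error;
}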
index cc7d35a5ffd8e1b594a48a1483192a2f521c449d..18a0b96e45466d4b59f416d0e4f5059ad70e16b7 100644 (file)
@@ -116,7 +116,7 @@ sysctl_cga_parameters SYSCTL_HANDLER_ARGS
 #endif
 
        MALLOC(buffer, char *, SYSCTL_CGA_PARAMETERS_BUFFER_SIZE, M_IP6CGA,
-           M_WAITOK);
+           M_WAITOK | M_ZERO);
        if (buffer == NULL) {
                log(LOG_ERR, "%s: could not allocate marshaling buffer.\n",
                    __func__);
index ec7f823fb1809d78d737304ee349e35515d59d3a..92ec475f4b173e6a699c4bea0afac1b6ce6e4c15 100644 (file)
@@ -197,7 +197,7 @@ rip6_input(
 
 #if NECP
                        if (n && !necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0,
-                               &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) {
+                               &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
                                m_freem(n);
                                /* do not inject data into pcb */
                        } else
@@ -205,7 +205,8 @@ rip6_input(
                        if (n) {
                                if ((last->in6p_flags & INP_CONTROLOPTS) != 0 ||
                                    (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-                                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                                       (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                                        ret = ip6_savecontrol(last, n, &opts);
                                        if (ret != 0) {
                                                m_freem(n);
@@ -231,7 +232,7 @@ rip6_input(
 
 #if NECP
        if (last && !necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0,
-               &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) {
+               &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
                m_freem(m);
                ip6stat.ip6s_delivered--;
                /* do not inject data into pcb */
@@ -240,7 +241,8 @@ rip6_input(
        if (last) {
                if ((last->in6p_flags & INP_CONTROLOPTS) != 0 ||
                    (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+                   (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+                       (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                        ret = ip6_savecontrol(last, m, &opts);
                        if (ret != 0) {
                                m_freem(m);
@@ -568,6 +570,7 @@ rip6_output(
 #if NECP
        {
                necp_kernel_policy_id policy_id;
+               necp_kernel_policy_id skip_policy_id;
                u_int32_t route_rule_id;
 
                /*
@@ -603,12 +606,12 @@ rip6_output(
                }
 
                if (!necp_socket_is_allowed_to_send_recv_v6(in6p, 0, 0,
-                       &ip6->ip6_src, &ip6->ip6_dst, NULL, &policy_id, &route_rule_id)) {
+                       &ip6->ip6_src, &ip6->ip6_dst, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
                        error = EHOSTUNREACH;
                        goto bad;
                }
 
-               necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id);
+               necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id, skip_policy_id);
 
                if (net_qos_policy_restricted != 0) {
                        necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt,
@@ -640,6 +643,11 @@ rip6_output(
        m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC |
            PKTF_FLOW_RAWSOCK);
        m->m_pkthdr.pkt_proto = in6p->in6p_ip6_nxt;
+       m->m_pkthdr.tx_rawip_pid = so->last_pid;
+       if (so->so_flags & SOF_DELEGATED)
+               m->m_pkthdr.tx_rawip_e_pid = so->e_pid;
+       else
+               m->m_pkthdr.tx_rawip_e_pid = 0;
 
        if (im6o != NULL)
                IM6O_ADDREF(im6o);
@@ -880,8 +888,8 @@ rip6_bind(struct socket *so, struct sockaddr *nam, struct proc *p)
        if (ifa != NULL) {
                IFA_LOCK(ifa);
                if (((struct in6_ifaddr *)ifa)->ia6_flags &
-                   (IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
-                    IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
+                   (IN6_IFF_ANYCAST | IN6_IFF_NOTREADY | IN6_IFF_CLAT46 |
+                    IN6_IFF_DETACHED | IN6_IFF_DEPRECATED)) {
                        IFA_UNLOCK(ifa);
                        IFA_REMREF(ifa);
                        return (EADDRNOTAVAIL);
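
The receive-path hunks above extend the existing SO_TIMESTAMP / SO_TIMESTAMP_MONOTONIC test with SO_TIMESTAMP_CONTINUOUS, and the send path now records the sending (and delegated) PID in the mbuf header. Because the socket-option test is now repeated in several places, it reads more clearly as a single predicate; the helper below is hypothetical and not part of the diff:

/* Hypothetical helper: true when any timestamp option asks for per-packet
 * timestamps (the callers additionally test INP_CONTROLOPTS). */
static inline int
so_wants_timestamps(const struct socket *so)
{
	return (so->so_options &
	    (SO_TIMESTAMP | SO_TIMESTAMP_MONOTONIC | SO_TIMESTAMP_CONTINUOUS)) != 0;
}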
index fdda7e512c838c4eb81bd612c41b0d3528e66d87..2e674c328662a35f5d17fbaaebe90f274a8a3302 100644 (file)
 
 #include <net/net_osdep.h>
 
+#if CONTENT_FILTER
+#include <net/content_filter.h>
+#endif /* CONTENT_FILTER */
+
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
@@ -166,6 +170,13 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
        struct socket *so = in6p->in6p_socket;
        struct route_in6 ro;
        int flowadv = 0;
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+       bool cfil_faddr_use = false;
+       uint32_t cfil_so_state_change_cnt = 0;
+       struct sockaddr *cfil_faddr = NULL;
+       struct sockaddr_in6 *cfil_sin6 = NULL;
+#endif
 
        bzero(&ip6oa, sizeof(ip6oa));
        ip6oa.ip6oa_boundif = IFSCOPE_NONE;
@@ -192,6 +203,28 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
        if (INP_INTCOPROC_ALLOWED(in6p))
                ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
 
+#if CONTENT_FILTER
+       /*
+        * If socket is subject to UDP Content Filter and no addr is passed in,
+        * retrieve CFIL saved state from mbuf and use it if necessary.
+        */
+       if (so->so_cfil_db && !addr6) {
+               cfil_tag = cfil_udp_get_socket_state(m, &cfil_so_state_change_cnt, NULL, &cfil_faddr);
+               if (cfil_tag) {
+                       cfil_sin6 = (struct sockaddr_in6 *)(void *)cfil_faddr;
+                       if ((so->so_state_change_cnt != cfil_so_state_change_cnt) &&
+                               (in6p->in6p_fport != cfil_sin6->sin6_port ||
+                                !IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &cfil_sin6->sin6_addr))) {
+                               /*
+                                * Socket is connected but socket state and dest addr/port changed.
+                                * We need to use the saved faddr info.
+                                */
+                               cfil_faddr_use = true;
+                       }
+               }
+       }
+#endif
+
        if (control) {
                sotc = so_tc_from_control(control, &netsvctype);
                if ((error = ip6_setpktopts(control, &opt,
@@ -284,7 +317,20 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                        error = ENOTCONN;
                        goto release;
                }
-               if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
+               laddr = &in6p->in6p_laddr;
+               faddr = &in6p->in6p_faddr;
+               fport = in6p->in6p_fport;
+#if CONTENT_FILTER
+               if (cfil_faddr_use)
+               {
+                       faddr = &((struct sockaddr_in6 *)(void *)cfil_faddr)->sin6_addr;
+                       fport = ((struct sockaddr_in6 *)(void *)cfil_faddr)->sin6_port;
+
+                       /* Do not use cached route */
+                       ROUTE_RELEASE(&in6p->in6p_route);
+               }
+#endif
+               if (IN6_IS_ADDR_V4MAPPED(faddr)) {
                        if ((in6p->in6p_flags & IN6P_IPV6_V6ONLY)) {
                                /*
                                 * XXX: this case would happen when the
@@ -300,9 +346,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                        } else
                                af = AF_INET;
                }
-               laddr = &in6p->in6p_laddr;
-               faddr = &in6p->in6p_faddr;
-               fport = in6p->in6p_fport;
+
        }
 
        if (in6p->inp_flowhash == 0)
@@ -374,6 +418,7 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
 #if NECP
                {
                        necp_kernel_policy_id policy_id;
+                       necp_kernel_policy_id skip_policy_id;
                        u_int32_t route_rule_id;
 
                        /*
@@ -408,12 +453,12 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                                in6p->inp_policyresult.results.qos_marking_gencount = 0;
                        }
 
-                       if (!necp_socket_is_allowed_to_send_recv_v6(in6p, in6p->in6p_lport, fport, laddr, faddr, NULL, &policy_id, &route_rule_id)) {
+                       if (!necp_socket_is_allowed_to_send_recv_v6(in6p, in6p->in6p_lport, fport, laddr, faddr, NULL, &policy_id, &route_rule_id, &skip_policy_id)) {
                                error = EHOSTUNREACH;
                                goto release;
                        }
 
-                       necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id);
+                       necp_mark_packet_from_socket(m, in6p, policy_id, route_rule_id, skip_policy_id);
 
                        if (net_qos_policy_restricted != 0) {
                                necp_socket_update_qos_marking(in6p, in6p->in6p_route.ro_rt,
@@ -447,6 +492,11 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC);
                if (flowadv)
                        m->m_pkthdr.pkt_flags |= PKTF_FLOW_ADV;
+               m->m_pkthdr.tx_udp_pid = so->last_pid;
+               if (so->so_flags & SOF_DELEGATED)
+                       m->m_pkthdr.tx_udp_e_pid = so->e_pid;
+               else
+                       m->m_pkthdr.tx_udp_e_pid = 0;
 
                im6o = in6p->in6p_moptions;
                if (im6o != NULL) {
@@ -523,6 +573,14 @@ udp6_output(struct in6pcb *in6p, struct mbuf *m, struct sockaddr *addr6,
                        if (rt->rt_flags & RTF_MULTICAST)
                                rt = NULL;      /* unusable */
 
+#if CONTENT_FILTER
+                       /*
+                        * Discard temporary route for cfil case
+                        */
+                       if (cfil_faddr_use)
+                               rt = NULL;      /* unusable */
+#endif
+                       
                        /*
                         * Always discard the cached route for unconnected
                         * socket or if it is a multicast route.
@@ -574,5 +632,9 @@ releaseopt:
                        ip6_clearpktopts(optp, -1);
                m_freem(control);
        }
+#if CONTENT_FILTER
+       if (cfil_tag)
+               m_tag_free(cfil_tag);
+#endif
        return (error);
 }
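
For content-filtered UDP sockets, udp6_output() now prefers the destination saved in the mbuf's CFIL tag when the socket has been reconnected since the packet was filtered, and drops the cached route in that case. The decision reduces to the comparison sketched below, under the assumption that both the current and the saved destinations are expressed as sockaddr_in6:

#include <netinet/in.h>
#include <stdbool.h>
#include <stdint.h>

/* Use the saved destination only if the socket state changed since filtering
 * AND it now points at a different address or port (sketch of the diff's test). */
static bool
should_use_saved_faddr(uint32_t so_state_cnt, uint32_t tag_state_cnt,
    const struct sockaddr_in6 *current, const struct sockaddr_in6 *saved)
{
	if (so_state_cnt == tag_state_cnt)
		return false;
	return current->sin6_port != saved->sin6_port ||
	    !IN6_ARE_ADDR_EQUAL(&current->sin6_addr, &saved->sin6_addr);
}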
index 8680560fcf3223d4809a3ce678bca016e3ff2674..325e3773d66d375cfb8e4c80f50153d3a32f7dfa 100644 (file)
@@ -145,6 +145,10 @@ extern int esp_udp_encap_port;
 #include <netinet/flow_divert.h>
 #endif /* FLOW_DIVERT */
 
+#if CONTENT_FILTER
+#include <net/content_filter.h>
+#endif /* CONTENT_FILTER */
+
 /*
  * UDP protocol implementation.
  * Per RFC 768, August, 1980.
@@ -206,7 +210,8 @@ udp6_append(struct inpcb *last, struct ip6_hdr *ip6,
 #endif /* CONFIG_MACF_NET */
        if ((last->in6p_flags & INP_CONTROLOPTS) != 0 ||
            (last->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-           (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+           (last->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+               (last->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                ret = ip6_savecontrol(last, n, &opts);
                if (ret != 0) {
                        m_freem(n);
@@ -400,7 +405,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto)
                        skipit = 0;
                        if (!necp_socket_is_allowed_to_send_recv_v6(in6p,
                            uh->uh_dport, uh->uh_sport, &ip6->ip6_dst,
-                           &ip6->ip6_src, ifp, NULL, NULL)) {
+                           &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
                                /* do not inject data to pcb */
                                skipit = 1;
                        }
@@ -548,7 +553,7 @@ udp6_input(struct mbuf **mp, int *offp, int proto)
        }
 #if NECP
        if (!necp_socket_is_allowed_to_send_recv_v6(in6p, uh->uh_dport,
-           uh->uh_sport, &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL)) {
+           uh->uh_sport, &ip6->ip6_dst, &ip6->ip6_src, ifp, NULL, NULL, NULL)) {
                in_pcb_checkstate(in6p, WNT_RELEASE, 0);
                IF_UDP_STATINC(ifp, badipsec);
                goto bad;
@@ -571,7 +576,8 @@ udp6_input(struct mbuf **mp, int *offp, int proto)
        udp_in6.sin6_port = uh->uh_sport;
        if ((in6p->in6p_flags & INP_CONTROLOPTS) != 0 ||
            (in6p->in6p_socket->so_options & SO_TIMESTAMP) != 0 ||
-           (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0) {
+           (in6p->in6p_socket->so_options & SO_TIMESTAMP_MONOTONIC) != 0 ||
+               (in6p->in6p_socket->so_options & SO_TIMESTAMP_CONTINUOUS) != 0) {
                ret = ip6_savecontrol(in6p, m, &opts);
                if (ret != 0) {
                        udp_unlock(in6p->in6p_socket, 1, 0);
@@ -943,6 +949,10 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 #if defined(NECP) && defined(FLOW_DIVERT)
        int should_use_flow_divert = 0;
 #endif /* defined(NECP) && defined(FLOW_DIVERT) */
+#if CONTENT_FILTER
+       struct m_tag *cfil_tag = NULL;
+       struct sockaddr *cfil_faddr = NULL;
+#endif
 
        inp = sotoinpcb(so);
        if (inp == NULL) {
@@ -950,6 +960,16 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
                goto bad;
        }
 
+#if CONTENT_FILTER
+       // If the socket is subject to UDP Content Filter and unconnected, get the addr from the tag.
+       if (so->so_cfil_db && !addr && IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
+               cfil_tag = cfil_udp_get_socket_state(m, NULL, NULL, &cfil_faddr);
+               if (cfil_tag) {
+                       addr = (struct sockaddr *)cfil_faddr;
+               }
+       }
+#endif
+
 #if defined(NECP) && defined(FLOW_DIVERT)
        should_use_flow_divert = necp_socket_should_use_flow_divert(inp);
 #endif /* defined(NECP) && defined(FLOW_DIVERT) */
@@ -989,6 +1009,10 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
                        pru = ip_protox[IPPROTO_UDP]->pr_usrreqs;
                        error = ((*pru->pru_send)(so, flags, m, addr,
                            control, p));
+#if CONTENT_FILTER
+                       if (cfil_tag)
+                               m_tag_free(cfil_tag);
+#endif
                        /* addr will just be freed in sendit(). */
                        return (error);
                }
@@ -998,11 +1022,21 @@ udp6_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *addr,
 do_flow_divert:
        if (should_use_flow_divert) {
                /* Implicit connect */
-               return (flow_divert_implicit_data_out(so, flags, m, addr, control, p));
+               error = flow_divert_implicit_data_out(so, flags, m, addr, control, p);
+#if CONTENT_FILTER
+               if (cfil_tag)
+                       m_tag_free(cfil_tag);
+#endif
+               return error;
        }
 #endif /* defined(NECP) && defined(FLOW_DIVERT) */
 
-       return (udp6_output(inp, m, addr, control, p));
+       error = udp6_output(inp, m, addr, control, p);
+#if CONTENT_FILTER
+       if (cfil_tag)
+               m_tag_free(cfil_tag);
+#endif
+       return error;
 
 bad:
        VERIFY(error != 0);
@@ -1011,7 +1045,10 @@ bad:
                m_freem(m);
        if (control != NULL)
                m_freem(control);
-
+#if CONTENT_FILTER
+       if (cfil_tag)
+               m_tag_free(cfil_tag);
+#endif
        return (error);
 }
 
index 5272fa8df12f28854c1671221d9ebc148d9ddc4f..f373441d06af49b7a008b0a933660f71d06041a3 100644 (file)
@@ -838,7 +838,9 @@ found:
        return sp;
 }
 
-struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family)
+struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family,
+                                                                                                         struct sockaddr *src,
+                                                                                                         struct sockaddr *dst)
 {
        struct secashead *sah;
        struct secasvar *sav;
@@ -848,47 +850,75 @@ struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int fam
        int arraysize;
        struct sockaddr_in *sin;
        u_int16_t dstport;
+       bool strict = true;
     
-       if (interface == NULL)
+       if (interface == NULL) {
         return NULL;
+       }
        
        LCK_MTX_ASSERT(sadb_mutex, LCK_MTX_ASSERT_NOTOWNED);
        
        lck_mtx_lock(sadb_mutex);
-       
-       LIST_FOREACH(sah, &sahtree, chain) {
-               if (sah->state == SADB_SASTATE_DEAD) {
-                       continue;
-               }
-               if (sah->ipsec_if == interface &&
-                       (family == AF_INET6 || family == AF_INET) &&
-                       sah->dir == IPSEC_DIR_OUTBOUND) {
-                       /* This SAH is linked to the IPSec interface, and the right family. We found it! */
-                       if (key_preferred_oldsa) {
-                               saorder_state_valid = saorder_state_valid_prefer_old;
-                               arraysize = _ARRAYLEN(saorder_state_valid_prefer_old);
-                       } else {
-                               saorder_state_valid = saorder_state_valid_prefer_new;
-                               arraysize = _ARRAYLEN(saorder_state_valid_prefer_new);
+
+       do {
+               LIST_FOREACH(sah, &sahtree, chain) {
+                       if (sah->state == SADB_SASTATE_DEAD) {
+                               continue;
                        }
-                       
-                       sin = (struct sockaddr_in *)&sah->saidx.dst;
-                       dstport = sin->sin_port;
-                       if (sah->saidx.mode == IPSEC_MODE_TRANSPORT)
-                               sin->sin_port = IPSEC_PORT_ANY;
-                       
-                       for (stateidx = 0; stateidx < arraysize; stateidx++) {
-                               state = saorder_state_valid[stateidx];
-                               sav = key_do_allocsa_policy(sah, state, dstport);
-                               if (sav != NULL) {
-                                       lck_mtx_unlock(sadb_mutex);
-                                       return sav;
+                       if (sah->ipsec_if == interface &&
+                               (family == AF_INET6 || family == AF_INET) &&
+                               sah->dir == IPSEC_DIR_OUTBOUND) {
+
+                               if (strict &&
+                                       sah->saidx.mode == IPSEC_MODE_TRANSPORT &&
+                                       src != NULL && dst != NULL) {
+                                       // Validate addresses for transport mode
+                                       if (key_sockaddrcmp((struct sockaddr *)&sah->saidx.src, src, 0) != 0) {
+                                               // Source doesn't match
+                                               continue;
+                                       }
+
+                                       if (key_sockaddrcmp((struct sockaddr *)&sah->saidx.dst, dst, 0) != 0) {
+                                               // Destination doesn't match
+                                               continue;
+                                       }
                                }
+
+                               /* This SAH is linked to the IPSec interface, and the right family. We found it! */
+                               if (key_preferred_oldsa) {
+                                       saorder_state_valid = saorder_state_valid_prefer_old;
+                                       arraysize = _ARRAYLEN(saorder_state_valid_prefer_old);
+                               } else {
+                                       saorder_state_valid = saorder_state_valid_prefer_new;
+                                       arraysize = _ARRAYLEN(saorder_state_valid_prefer_new);
+                               }
+
+                               sin = (struct sockaddr_in *)&sah->saidx.dst;
+                               dstport = sin->sin_port;
+                               if (sah->saidx.mode == IPSEC_MODE_TRANSPORT) {
+                                       sin->sin_port = IPSEC_PORT_ANY;
+                               }
+
+                               for (stateidx = 0; stateidx < arraysize; stateidx++) {
+                                       state = saorder_state_valid[stateidx];
+                                       sav = key_do_allocsa_policy(sah, state, dstport);
+                                       if (sav != NULL) {
+                                               lck_mtx_unlock(sadb_mutex);
+                                               return sav;
+                                       }
+                               }
+
+                               break;
                        }
-                       
+               }
+               if (strict) {
+                       // If we didn't find anything, try again without strict
+                       strict = false;
+               } else {
+                       // We already were on the second try, bail
                        break;
                }
-       }
+       } while (true);
        
        lck_mtx_unlock(sadb_mutex);
        return NULL;
@@ -9232,7 +9262,7 @@ key_promisc(
        }
 }
 
-static int (*key_typesw[])(struct socket *, struct mbuf *,
+static int (*const key_typesw[])(struct socket *, struct mbuf *,
                                                   const struct sadb_msghdr *) = {
        NULL,           /* SADB_RESERVED */
        key_getspi,     /* SADB_GETSPI */
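
key_alloc_outbound_sav_for_interface() now runs its SAH scan twice: a strict pass that, for transport mode, also requires the SA's src/dst to match the packet's addresses, and a relaxed pass that falls back to the old interface-and-family-only match when the strict pass finds nothing. The control flow is easier to see stripped of the SADB details; a self-contained sketch of the same two-pass pattern with hypothetical entry/key types:

#include <sys/queue.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical stand-ins for the SAH list and its match criteria. */
struct entry {
	LIST_ENTRY(entry) link;
	int iface;      /* stands in for the interface/family/direction check */
	int addr;       /* stands in for the transport-mode src/dst check */
};
LIST_HEAD(entry_list, entry);

static struct entry *
find_best(struct entry_list *list, int iface, int addr)
{
	bool strict = true;

	do {
		struct entry *e;
		LIST_FOREACH(e, list, link) {
			if (e->iface != iface)
				continue;               /* never acceptable */
			if (strict && e->addr != addr)
				continue;               /* strict pass: addresses must match */
			return e;                       /* first acceptable entry wins */
		}
		if (!strict)
			break;                          /* relaxed pass failed too */
		strict = false;                         /* retry without the address check */
	} while (true);

	return NULL;
}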
index c13c36947c151f34fec123824b125866e20016b7..c61f04f2218d97f89d385088876fea8ffa7e61fb 100644 (file)
@@ -57,7 +57,9 @@ extern struct secpolicy *key_allocsp(struct secpolicyindex *, u_int);
 extern struct secasvar *key_allocsa_policy(struct secasindex *);
 extern struct secpolicy *key_gettunnel(struct sockaddr *,
        struct sockaddr *, struct sockaddr *, struct sockaddr *);
-extern struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t, int);
+extern struct secasvar *key_alloc_outbound_sav_for_interface(ifnet_t interface, int family,
+                                                                                                                        struct sockaddr *src,
+                                                                                                                        struct sockaddr *dst);
 extern int key_checkrequest(struct ipsecrequest *isr, struct secasindex *,
        struct secasvar **sav);
 extern struct secasvar *key_allocsa(u_int, caddr_t, caddr_t,
index a9c78f1932f05bd9a7c2911d74c2399e24bd9401..02c1212897c9a3db67ca9d19914902b107bf8aa0 100644 (file)
@@ -1598,13 +1598,15 @@ nfs_gss_clnt_ctx_callserver(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp)
                FREE(cp->gss_clnt_handle, M_TEMP);
                cp->gss_clnt_handle = NULL;
        }
-       if (cp->gss_clnt_handle_len > 0) {
+       if (cp->gss_clnt_handle_len > 0 && cp->gss_clnt_handle_len < GSS_MAX_CTX_HANDLE_LEN) {
                MALLOC(cp->gss_clnt_handle, u_char *, cp->gss_clnt_handle_len, M_TEMP, M_WAITOK);
                if (cp->gss_clnt_handle == NULL) {
                        error = ENOMEM;
                        goto nfsmout;
                }
                nfsm_chain_get_opaque(error, &nmrep, cp->gss_clnt_handle_len, cp->gss_clnt_handle);
+       } else {
+               error = EBADRPC;
        }
        nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_major);
        nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_minor);
@@ -1612,13 +1614,15 @@ nfs_gss_clnt_ctx_callserver(struct nfsreq *req, struct nfs_gss_clnt_ctx *cp)
        nfsm_chain_get_32(error, &nmrep, cp->gss_clnt_tokenlen);
        if (error)
                goto nfsmout;
-       if (cp->gss_clnt_tokenlen > 0) {
+       if (cp->gss_clnt_tokenlen > 0 && cp->gss_clnt_tokenlen < GSS_MAX_TOKEN_LEN) {
                MALLOC(cp->gss_clnt_token, u_char *, cp->gss_clnt_tokenlen, M_TEMP, M_WAITOK);
                if (cp->gss_clnt_token == NULL) {
                        error = ENOMEM;
                        goto nfsmout;
                }
                nfsm_chain_get_opaque(error, &nmrep, cp->gss_clnt_tokenlen, cp->gss_clnt_token);
+       } else {
+               error = EBADRPC;
        }
 
        /*
@@ -3065,7 +3069,9 @@ nfs_gss_svc_cred_get(struct nfsrv_descript *nd, struct nfsm_chain *nmc)
                        nmc_tmp = *nmc;
                        nfsm_chain_adv(error, &nmc_tmp, arglen);
                        nfsm_chain_get_32(error, &nmc_tmp, cksum.length);
-                       MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK);
+                       cksum.value = NULL;
+                       if (cksum.length > 0 && cksum.length < GSS_MAX_MIC_LEN)
+                               MALLOC(cksum.value, void *, cksum.length, M_TEMP, M_WAITOK);
 
                        if (cksum.value == NULL) {
                                error = EBADRPC;
@@ -3354,11 +3360,9 @@ nfs_gss_svc_ctx_init(struct nfsrv_descript *nd, struct nfsrv_sock *slp, mbuf_t *
        case RPCSEC_GSS_CONTINUE_INIT:
                /* Get the token from the request */
                nfsm_chain_get_32(error, nmreq, cp->gss_svc_tokenlen);
-               if (cp->gss_svc_tokenlen == 0) {
-                       autherr = RPCSEC_GSS_CREDPROBLEM;
-                       break;
-               }
-               MALLOC(cp->gss_svc_token, u_char *, cp->gss_svc_tokenlen, M_TEMP, M_WAITOK);
+               cp->gss_svc_token = NULL;
+               if (cp->gss_svc_tokenlen > 0 && cp->gss_svc_tokenlen < GSS_MAX_TOKEN_LEN)
+                       MALLOC(cp->gss_svc_token, u_char *, cp->gss_svc_tokenlen, M_TEMP, M_WAITOK);
                if (cp->gss_svc_token == NULL) {
                        autherr = RPCSEC_GSS_CREDPROBLEM;
                        break;
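
All three hunks above apply the same hardening: a length read off the wire is checked against an upper bound (GSS_MAX_CTX_HANDLE_LEN, GSS_MAX_TOKEN_LEN, GSS_MAX_MIC_LEN, defined in the next file) before it reaches MALLOC, and an out-of-range value becomes an RPC error instead of an allocation of attacker-chosen size. A small user-space sketch of the same idea, with EBADMSG standing in for EBADRPC:

#include <errno.h>
#include <stdint.h>
#include <stdlib.h>

#define MAX_TOKEN_LEN (64 * 1024)       /* mirrors GSS_MAX_TOKEN_LEN */

/* Reject zero and oversized peer-supplied lengths before allocating. */
static void *
alloc_wire_buffer(uint32_t wire_len, int *errp)
{
	if (wire_len == 0 || wire_len >= MAX_TOKEN_LEN) {
		*errp = EBADMSG;        /* the kernel code sets EBADRPC here */
		return NULL;
	}
	void *p = malloc(wire_len);
	if (p == NULL)
		*errp = ENOMEM;
	return p;
}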
index fe3db18936cd31f4b87efa77527ce9667a4f711d..5b6887f9bd691ef33c333467d473ef4948e4efe9 100644 (file)
@@ -54,6 +54,22 @@ enum rpcsec_gss_service {
 extern u_char krb5_mech_oid[11];
 
 
+/*
+ * RFC 2203 and friends don't define maximums for token lengths
+ * and context handles. We try to pick reasonable values here.
+ *
+ * N.B. Kerberos mech tokens can be quite large from the output
+ * of a gss_init_sec_context if it includes a large PAC.
+ */
+
+#define GSS_MAX_CTX_HANDLE_LEN         256
+#define GSS_MAX_TOKEN_LEN              64*1024
+
+/*
+ * Put a "reasonable" bound on MIC lengths
+ */
+#define GSS_MAX_MIC_LEN                        2048
+
 #define GSS_MAXSEQ                     0x80000000      // The biggest sequence number
 #define GSS_SVC_MAXCONTEXTS            500000          // Max contexts supported
 #define GSS_SVC_SEQWINDOW              256             // Server's sequence window
index 9920f3c89ff6cf5e34d00219e79fcd824ac139b1..2514489c17215564331078f7b1177f32b5e6e960 100644 (file)
@@ -679,8 +679,8 @@ wait_for_granted:
                         * higher levels can resend the request.
                         */
                        msg->lm_flags &= ~LOCKD_MSG_CANCEL;
-                       nfs_lockdmsg_dequeue(msgreq);
                        error = NFSERR_DENIED;
+                       /* Will dequeue msgreq after the following break at the end of this routine */
                        break;
                }
 
index 027d7a5d8406f9fc751b9102d48d3c55364d7cbb..06c11bb7223679c429f3118b9b060899a9e82ab3 100644 (file)
@@ -440,8 +440,26 @@ nfsrv_getattr(
        error = nfsrv_credcheck(nd, ctx, nx, nxo);
        nfsmerr_if(error);
 
+#if CONFIG_MACF
+       if (mac_vnode_check_open(ctx, vp, FREAD))
+               error = ESTALE;
+       nfsmerr_if(error);
+#endif
+
        nfsm_srv_vattr_init(&vattr, nd->nd_vers);
        error = vnode_getattr(vp, &vattr, ctx);
+
+#if CONFIG_MACF
+       /* XXXab: The comment in the VFS code makes it sound like
+        *        some arguments can be filtered out, but it is not
+        *        clear what that actually means. Hopefully it does
+        *        not mean mtime gets reset to 0 or something. For
+        *        now, trust that there are no shenanigans here.
+        */
+       error = mac_vnode_check_getattr(ctx, NOCRED, vp, &vattr);
+       nfsmerr_if(error);
+#endif
+
        vnode_put(vp);
        vp = NULL;
 
@@ -556,6 +574,9 @@ nfsrv_setattr(
                error = nfsrv_authorize(vp, NULL, action, ctx, nxo, 0);
 
 #if CONFIG_MACF
+       if (!error && mac_vnode_check_open(ctx, vp, FREAD|FWRITE))
+               error = ESTALE;
+
        if (!error) {
                /* chown case */
                if (VATTR_IS_ACTIVE(vap, va_uid) || VATTR_IS_ACTIVE(vap, va_gid)) {
@@ -666,6 +687,18 @@ nfsrv_lookup(
                        /* update active user stats */
                        nfsrv_update_user_stat(nx, nd, saved_uid, 1, 0, 0);
                }
+               if (!error && mac_vnode_check_open(ctx, ni.ni_vp, FREAD)) {
+                       error = EACCES;
+                       if (dirp) {
+                               vnode_put(dirp);
+                               dirp = NULL;
+                       }
+
+                       if (ni.ni_vp) {
+                               vnode_put(ni.ni_vp);
+                               ni.ni_vp = NULL;
+                       }
+               }
        }
 
        if (dirp) {
@@ -788,6 +821,13 @@ nfsrv_readlink(
 
        if (!error)
                error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx, nxo, 0);
+#if CONFIG_MACF
+       if (mac_vnode_check_open(ctx, vp, FREAD))
+               error = ESTALE;
+       nfsmerr_if(error);
+       if (!error)
+               error = mac_vnode_check_readlink(ctx, vp);
+#endif
        if (!error)
                error = VNOP_READLINK(vp, auio, ctx);
        if (vp) {
@@ -906,6 +946,21 @@ nfsrv_read(
            if ((error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_READ_DATA, ctx, nxo, 1)))
                error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_EXECUTE, ctx, nxo, 1);
        }
+#if CONFIG_MACF
+       if (!error) {
+               error = mac_vnode_check_open(ctx, vp, FREAD);
+               if (error) {
+                       error = EACCES;
+               } else {
+                       /* XXXab: Do we need to do this?! */
+                       error = mac_vnode_check_read(ctx, vfs_context_ucred(ctx), vp);
+                       if (error)
+                               error = EACCES;
+                       /* mac_vnode_check_exec() can't be done here. */
+               }
+       }
+       nfsmerr_if(error);
+#endif
        nfsm_srv_vattr_init(vap, nd->nd_vers);
        attrerr = vnode_getattr(vp, vap, ctx);
        if (!error)
@@ -4073,6 +4128,15 @@ nfsrv_readdir(
        }
        if (!error)
                error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, ctx, nxo, 0);
+#if CONFIG_MACF
+       if (!error) {
+               if (!error && mac_vnode_check_open(ctx, vp, FREAD))
+                       error = EACCES;
+
+               if (!error)
+                       error = mac_vnode_check_readdir(ctx, vp);
+       }
+#endif
        nfsmerr_if(error);
 
        MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
@@ -4294,6 +4358,15 @@ nfsrv_readdirplus(
                error = NFSERR_BAD_COOKIE;
        if (!error)
                error = nfsrv_authorize(vp, NULL, KAUTH_VNODE_LIST_DIRECTORY, ctx, nxo, 0);
+#if CONFIG_MACF
+       if (!error) {
+               if (!error && mac_vnode_check_open(ctx, vp, FREAD))
+                       error = EACCES;
+
+               if (!error)
+                       error = mac_vnode_check_readdir(ctx, vp);
+       }
+#endif
        nfsmerr_if(error);
 
        MALLOC(rbuf, caddr_t, siz, M_TEMP, M_WAITOK);
index f1123c3285501a8be7f1926dabc2b245acc43f86..8e3562c6405248b2453784bb31743e76aa88356b 100644 (file)
@@ -4749,6 +4749,7 @@ nfs_request_timer(__unused void *param0, __unused void *param1)
        int timeo, maxtime, finish_asyncio, error;
        struct timeval now;
        TAILQ_HEAD(nfs_mount_pokeq, nfsmount) nfs_mount_poke_queue;
+       TAILQ_INIT(&nfs_mount_poke_queue);
 
 restart:
        lck_mtx_lock(nfs_request_mutex);
@@ -4760,7 +4761,6 @@ restart:
        }
 
        nfs_reqbusy(req);
-       TAILQ_INIT(&nfs_mount_poke_queue);
 
        microuptime(&now);
        for ( ; req != NULL ; req = nfs_reqnext(req)) {
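
The only change above is hoisting TAILQ_INIT() of the local poke queue from after the restart: label up to the declaration, so it is initialized exactly once, before the restartable section, rather than on every pass that jumps back to restart. The shape of the fix in isolation, as a self-contained sketch:

#include <sys/queue.h>

struct item {
	TAILQ_ENTRY(item) link;
};
TAILQ_HEAD(itemq, item);

/* Initialize the local queue exactly once, before any restartable section. */
static void
collect_with_restart(int passes)
{
	struct itemq pending;
	TAILQ_INIT(&pending);                   /* hoisted: runs once */

restart:
	/* ... scan and TAILQ_INSERT_TAIL(&pending, it, link) as needed ... */
	if (--passes > 0)
		goto restart;                   /* must not reset 'pending' here */

	/* ... drain 'pending' ... */
}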
index ff5ae70b6fbf4faf263f939264e2039b90c111f8..0702fbed92433ea410d462d5beda04b020168ec8 100644 (file)
@@ -1688,8 +1688,15 @@ nfs_getattrcache(nfsnode_t np, struct nfs_vattr *nvaper, int flags)
         * and return cached attributes.
         */
        if (!nfs_use_cache(nmp)) {
-               timeo = nfs_attrcachetimeout(np);
                microuptime(&nowup);
+               if (np->n_attrstamp > nowup.tv_sec) {
+                       printf("NFS: Attribute time stamp is in the future by %ld seconds. Invalidating cache\n",
+                              np->n_attrstamp - nowup.tv_sec);
+                       NATTRINVALIDATE(np);
+                       NACCESSINVALIDATE(np);
+                       return (ENOENT);
+               }
+               timeo = nfs_attrcachetimeout(np);
                if ((nowup.tv_sec - np->n_attrstamp) >= timeo) {
                        FSDBG(528, np, 0, 0xffffff02, ENOENT);
                        OSAddAtomic64(1, &nfsstats.attrcache_misses);
index a5fc908b504c5aa94bec4e59bea17ca97d99c55a..17c51b7dac3c631473e0723a49bae3fa6d4c48cf 100644 (file)
@@ -2814,8 +2814,9 @@ mountnfs(
        xb_get_32(error, &xb, val); /* version */
        xb_get_32(error, &xb, argslength); /* args length */
        xb_get_32(error, &xb, val); /* XDR args version */
-       if (val != NFS_XDRARGS_VERSION_0)
+       if (val != NFS_XDRARGS_VERSION_0 || argslength < ((4 + NFS_MATTR_BITMAP_LEN + 1) * XDRWORD)) {
                error = EINVAL;
+       }
        len = NFS_MATTR_BITMAP_LEN;
        xb_get_bitmap(error, &xb, mattrs, len); /* mount attribute bitmap */
        attrslength = 0;
@@ -4523,6 +4524,8 @@ nfs_mount_zombie(struct nfsmount *nmp, int nm_state_flags)
        /* Since we've drop the request mutex we can now safely unreference the request */
        TAILQ_FOREACH_SAFE(req, &resendq, r_rchain, treq) {
                TAILQ_REMOVE(&resendq, req, r_rchain);
+               /* Make sure we don't try to remove it again in nfs_request_destroy */
+               req->r_rchain.tqe_next = NFSREQNOLIST;
                nfs_request_rele(req);
        }
 
index 0ca66d6e69e8e203b054bbbf1518331998e53647..5753b6ea825285b9e9904055a05501a6e5e26061 100644 (file)
@@ -6925,7 +6925,7 @@ nfs_vnop_ioctl(
        vfs_context_t ctx = ap->a_context;
        vnode_t vp = ap->a_vp;
        struct nfsmount *mp = VTONMP(vp);
-       struct user_nfs_gss_principal gprinc;
+       struct user_nfs_gss_principal gprinc = {};
        uint32_t len;
        int error = ENOTTY;
 
index ec24bfffecb786d02e2bf99a661ede5d8f6f2595..4c115151b8622f8ffa91c654864e83101a19a61e 100644 (file)
@@ -113,7 +113,7 @@ kern_return_t do_pgo_reset_counters()
 static kern_return_t
 kextpgo_trap()
 {
-    return DebuggerTrapWithState(DBOP_RESET_PGO_COUNTERS, NULL, NULL, NULL, 0, FALSE, 0);
+    return DebuggerTrapWithState(DBOP_RESET_PGO_COUNTERS, NULL, NULL, NULL, 0, NULL, FALSE, 0);
 }
 
 static kern_return_t
diff --git a/bsd/pthread/Makefile b/bsd/pthread/Makefile
new file mode 100644 (file)
index 0000000..ef0643f
--- /dev/null
@@ -0,0 +1,48 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+DATAFILES = \
+       bsdthread_private.h \
+       priority_private.h \
+       workqueue_syscalls.h
+
+PRIVATE_DATAFILES = \
+       bsdthread_private.h \
+       priority_private.h \
+       workqueue_syscalls.h
+
+KERNELFILES = \
+
+PRIVATE_KERNELFILES = \
+
+INTERNAL_KERNELFILES = \
+       bsdthread_private.h \
+       priority_private.h \
+       workqueue_internal.h \
+       workqueue_syscalls.h \
+       workqueue_trace.h
+
+INSTALL_MI_DIR = pthread
+
+# /usr/local/include without PRIVATE stuff
+# /System/Library/Frameworks/System.framework/PrivateHeaders
+INCDIR = /usr/local/include
+INSTALL_MI_LIST = ${DATAFILES}
+INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
+
+# /System/Library/Frameworks/Kernel.framework/Headers
+# /System/Library/Frameworks/Kernel.framework/PrivateHeaders
+INSTALL_KF_MI_LIST = $(sort ${KERNELFILES})
+INSTALL_KF_MI_LCL_LIST = $(sort ${KERNELFILES} ${PRIVATE_KERNELFILES})
+
+EXPORT_MI_LIST = $(sort ${KERNELFILES} ${PRIVATE_KERNELFILES} ${INTERNAL_KERNELFILES})
+
+EXPORT_MI_DIR = ${INSTALL_MI_DIR}
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/bsd/pthread/bsdthread_private.h b/bsd/pthread/bsdthread_private.h
new file mode 100644 (file)
index 0000000..af854fe
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 Apple, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _PTHREAD_BSDTHREAD_PRIVATE_H_
+#define _PTHREAD_BSDTHREAD_PRIVATE_H_
+
+#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__)
+#define __PTHREAD_EXPOSE_INTERNALS__ 1
+#endif // XNU_KERNEL_PRIVATE
+
+#ifdef __PTHREAD_EXPOSE_INTERNALS__
+
+/* pthread bsdthread_ctl sysctl commands */
+/* bsdthread_ctl(BSDTHREAD_CTL_SET_QOS, thread_port, tsd_entry_addr, 0) */
+#define BSDTHREAD_CTL_SET_QOS                          0x10
+/* bsdthread_ctl(BSDTHREAD_CTL_GET_QOS, thread_port, 0, 0) */
+#define BSDTHREAD_CTL_GET_QOS                          0x20
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_START, thread_port, priority, 0) */
+#define BSDTHREAD_CTL_QOS_OVERRIDE_START       0x40
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_END, thread_port, 0, 0) */
+#define BSDTHREAD_CTL_QOS_OVERRIDE_END         0x80
+/* bsdthread_ctl(BSDTHREAD_CTL_SET_SELF, priority, voucher, flags) */
+#define BSDTHREAD_CTL_SET_SELF                         0x100
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_RESET, 0, 0, 0) */
+#define BSDTHREAD_CTL_QOS_OVERRIDE_RESET       0x200
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH, thread_port, priority, 0) */
+#define BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH    0x400
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD, thread_port, priority, resource) */
+#define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD           0x401
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET, 0|1 (?reset_all), resource, 0) */
+#define BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET         0x402
+/* bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM, priority, flags, 0) */
+#define BSDTHREAD_CTL_QOS_MAX_PARALLELISM      0x800
+
+#define _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL 0x1
+#define _PTHREAD_QOS_PARALLELISM_REALTIME 0x2
+
+#endif // __PTHREAD_EXPOSE_INTERNALS__
+#endif // _PTHREAD_BSDTHREAD_PRIVATE_H_
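
The header above documents each BSDTHREAD_CTL_* command together with the argument shape it expects, e.g. bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM, priority, flags, 0). For illustration, a user-space sketch of issuing that command; the __bsdthread_ctl prototype below is an assumption made for the sketch (only libpthread is expected to call this trap), not a documented interface:

#include <stdint.h>

/* Assumed trap wrapper for the sketch; the real caller is libpthread. */
extern int __bsdthread_ctl(uintptr_t cmd, uintptr_t arg1, uintptr_t arg2,
    uintptr_t arg3);

#define BSDTHREAD_CTL_QOS_MAX_PARALLELISM       0x800   /* from the header above */
#define _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL  0x1

/* Ask how many threads of the given QoS class the system would run in
 * parallel, counting logical CPUs. */
static int
qos_max_parallelism(int qos_class)
{
	return __bsdthread_ctl(BSDTHREAD_CTL_QOS_MAX_PARALLELISM,
	    (uintptr_t)qos_class, _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, 0);
}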
diff --git a/bsd/pthread/priority_private.h b/bsd/pthread/priority_private.h
new file mode 100644 (file)
index 0000000..5d20e08
--- /dev/null
@@ -0,0 +1,236 @@
+/*
+ * Copyright (c) 2000-2017 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _PTHREAD_PRIORITY_PRIVATE_H_
+#define _PTHREAD_PRIORITY_PRIVATE_H_
+
+/*!
+ * @typedef pthread_priority_t
+ *
+ * @abstract
+ * pthread_priority_t is an opaque integer that is guaranteed to be ordered
+ * such that combinations of QoS classes and relative priorities are ordered
+ * numerically, according to their combined priority.
+ *
+ * <b>xnu, pthread & libdispatch flags</b>
+ *
+ * @const _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
+ * The thread this priority is applied to is overcommit (affects the workqueue
+ * creation policy for this priority).
+ *
+ * @const _PTHREAD_PRIORITY_FALLBACK_FLAG
+ * Indicates that this priority is used only when incoming events have no
+ * priority at all. It is merely used as a fallback (hence the name) instead of
+ * a floor.
+ *
+ * This is usually used with QOS_CLASS_DEFAULT and a 0 relative priority.
+ *
+ * @const _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
+ * The event manager flag indicates that this thread/request is for an event
+ * manager thread.  There can only ever be one event manager thread at a time
+ * and it is brought up at the highest of all event manager priorities pthread
+ * knows about.
+ *
+ * <b>pthread & dispatch only flags</b>
+ *
+ * @const _PTHREAD_PRIORITY_SCHED_PRI_FLAG
+ * @const _PTHREAD_PRIORITY_SCHED_PRI_MASK
+ * This flag indicates that the bits extracted using
+ * _PTHREAD_PRIORITY_SCHED_PRI_MASK represent a scheduler priority instead of
+ * a {qos, relative priority} pair.
+ *
+ * This flag is only used by the pthread kext to indicate to libdispatch that
+ * the event manager queue priority is a scheduling priority and not a QoS.
+ * This flag is never used as an input by anything else, which is why it can
+ * do double duty with _PTHREAD_PRIORITY_ROOTQUEUE_FLAG.
+ *
+ * @const _PTHREAD_PRIORITY_NEEDS_UNBIND_FLAG
+ * This flag is used for the priority of event delivery threads to indicate
+ * to libdispatch that this thread is bound to a kqueue.
+ *
+ * <b>dispatch only flags</b>
+ *
+ * @const _PTHREAD_PRIORITY_INHERIT_FLAG
+ * This flag is meaningful to libdispatch only and has no meaning for the
+ * kernel and/or pthread.
+ *
+ * @const _PTHREAD_PRIORITY_ROOTQUEUE_FLAG
+ * This flag is meaningful to libdispatch only and has no meaning for the
+ * kernel and/or pthread.
+ *
+ * @const _PTHREAD_PRIORITY_ENFORCE_FLAG
+ * This flag is used to indicate that this priority should be preferred for work
+ * submitted asynchronously over the intrinsic priority of the queue/thread the
+ * work is submitted to.
+ *
+ * @const _PTHREAD_PRIORITY_OVERRIDE_FLAG
+ * No longer used
+ */
+typedef unsigned long pthread_priority_t;
+
+#define _PTHREAD_PRIORITY_FLAGS_MASK                   0xff000000
+#define _PTHREAD_PRIORITY_FLAGS_SHIFT                  (24ull)
+
+#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG              0x80000000
+#define _PTHREAD_PRIORITY_INHERIT_FLAG                 0x40000000 /* dispatch only */
+#define _PTHREAD_PRIORITY_ROOTQUEUE_FLAG               0x20000000 /* dispatch only */
+#define _PTHREAD_PRIORITY_SCHED_PRI_FLAG               0x20000000
+#define _PTHREAD_PRIORITY_SCHED_PRI_MASK               0x0000ffff
+#define _PTHREAD_PRIORITY_ENFORCE_FLAG                 0x10000000 /* dispatch only */
+#define _PTHREAD_PRIORITY_OVERRIDE_FLAG                        0x08000000 /* unused */
+#define _PTHREAD_PRIORITY_FALLBACK_FLAG                        0x04000000
+#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG   0x02000000
+#define _PTHREAD_PRIORITY_NEEDS_UNBIND_FLAG            0x01000000
+#define _PTHREAD_PRIORITY_DEFAULTQUEUE_FLAG            _PTHREAD_PRIORITY_FALLBACK_FLAG // compat
+
+#define _PTHREAD_PRIORITY_ENCODING_MASK                        0x00a00000
+#define _PTHREAD_PRIORITY_ENCODING_SHIFT               (22ull)
+#define _PTHREAD_PRIORITY_ENCODING_V0                  0x00000000
+#define _PTHREAD_PRIORITY_ENCODING_V1                  0x00400000 /* unused */
+#define _PTHREAD_PRIORITY_ENCODING_V2                  0x00800000 /* unused */
+#define _PTHREAD_PRIORITY_ENCODING_V3                  0x00a00000 /* unused */
+
+#define _PTHREAD_PRIORITY_QOS_CLASS_MASK               0x003fff00
+#define _PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK 0x00003f00
+#define _PTHREAD_PRIORITY_QOS_CLASS_SHIFT              (8ull)
+
+#define _PTHREAD_PRIORITY_PRIORITY_MASK                        0x000000ff
+#define _PTHREAD_PRIORITY_PRIORITY_SHIFT               (0)
+
+#if PRIVATE
+#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__)
+#define __PTHREAD_EXPOSE_INTERNALS__ 1
+#endif // XNU_KERNEL_PRIVATE
+#ifdef __PTHREAD_EXPOSE_INTERNALS__
+/*
+ * This exposes the encoding used for pthread_priority_t
+ * and is meant to be used by pthread and XNU only
+ */
+#include <mach/thread_policy.h> // THREAD_QOS_*
+#include <stdbool.h>
+
+__attribute__((always_inline, const))
+static inline bool
+_pthread_priority_has_qos(pthread_priority_t pp)
+{
+       return (pp & (_PTHREAD_PRIORITY_SCHED_PRI_FLAG |
+                                       _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG)) == 0 &&
+                       (pp & (_PTHREAD_PRIORITY_QOS_CLASS_MASK &
+                                       ~_PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK)) == 0 &&
+                       (pp & _PTHREAD_PRIORITY_VALID_QOS_CLASS_MASK) != 0;
+}
+
+__attribute__((always_inline, const))
+static inline pthread_priority_t
+_pthread_priority_make_from_thread_qos(thread_qos_t qos, int relpri,
+               unsigned long flags)
+{
+       pthread_priority_t pp = (flags & _PTHREAD_PRIORITY_FLAGS_MASK);
+       if (qos && qos < THREAD_QOS_LAST) {
+               pp |= (1 << (_PTHREAD_PRIORITY_QOS_CLASS_SHIFT + qos - 1));
+               pp |= ((uint8_t)relpri - 1) & _PTHREAD_PRIORITY_PRIORITY_MASK;
+       }
+       return pp;
+}
+
+__attribute__((always_inline, const))
+static inline pthread_priority_t
+_pthread_event_manager_priority(void)
+{
+       return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
+}
+
+__attribute__((always_inline, const))
+static inline pthread_priority_t
+_pthread_unspecified_priority(void)
+{
+       return _pthread_priority_make_from_thread_qos(THREAD_QOS_UNSPECIFIED, 0, 0);
+}
+
+__attribute__((always_inline, const))
+static inline pthread_priority_t
+_pthread_default_priority(unsigned long flags)
+{
+       return _pthread_priority_make_from_thread_qos(THREAD_QOS_LEGACY, 0, flags);
+}
+
+__attribute__((always_inline, const))
+static inline thread_qos_t
+_pthread_priority_thread_qos(pthread_priority_t pp)
+{
+       if (_pthread_priority_has_qos(pp)) {
+               pp &= _PTHREAD_PRIORITY_QOS_CLASS_MASK;
+               pp >>= _PTHREAD_PRIORITY_QOS_CLASS_SHIFT;
+               return (thread_qos_t)__builtin_ffs((int)pp);
+       }
+       return THREAD_QOS_UNSPECIFIED;
+}
+
+__attribute__((always_inline, const))
+static inline int
+_pthread_priority_relpri(pthread_priority_t pp)
+{
+       if (_pthread_priority_has_qos(pp)) {
+               pp &= _PTHREAD_PRIORITY_PRIORITY_MASK;
+               pp >>= _PTHREAD_PRIORITY_PRIORITY_SHIFT;
+               return (int8_t)pp + 1;
+       }
+       return 0;
+}
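
Illustrative sketch (editorial, not part of this change): a worked example of the
encoding implemented by the helpers above, assuming the THREAD_QOS_* values from
<mach/thread_policy.h> (THREAD_QOS_UTILITY == 3, THREAD_QOS_LAST == 7).

    // Encode THREAD_QOS_UTILITY with relative priority -4 and the overcommit flag.
    pthread_priority_t pp = _pthread_priority_make_from_thread_qos(
                    THREAD_QOS_UTILITY, -4, _PTHREAD_PRIORITY_OVERCOMMIT_FLAG);
    // QoS bit : 1 << (8 + 3 - 1)         == 0x00000400
    // relpri  : ((uint8_t)-4 - 1) & 0xff == 0xfb
    // flags   : overcommit               == 0x80000000
    // pp      : 0x800004fb
    // Decoding with the helpers above recovers the inputs:
    //   _pthread_priority_thread_qos(pp) == 3   (__builtin_ffs(0x400 >> 8))
    //   _pthread_priority_relpri(pp)     == -4  ((int8_t)0xfb + 1)
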
+
+#if KERNEL
+// Interfaces only used by the kernel and not implemented in userspace.
+
+/*
+ * Keep managerness, overcommitness and fallback; discard other flags.
+ * Normalize and validate QoS/relpri
+ */
+__attribute__((const))
+pthread_priority_t
+_pthread_priority_normalize(pthread_priority_t pp);
+
+/*
+ * Keep managerness, discard other flags.
+ * Normalize and validate QoS/relpri
+ */
+__attribute__((const))
+pthread_priority_t
+_pthread_priority_normalize_for_ipc(pthread_priority_t pp);
+
+/*
+ * Keep the flags from base_pp and return the priority with the maximum priority
+ * of base_pp and _pthread_priority_make_from_thread_qos(qos, 0, 0)
+ */
+__attribute__((const))
+pthread_priority_t
+_pthread_priority_combine(pthread_priority_t base_pp, thread_qos_t qos);
+
+#endif // KERNEL
+#endif // __PTHREAD_EXPOSE_INTERNALS__
+#endif // PRIVATE
+#endif // _PTHREAD_PRIORITY_PRIVATE_H_
diff --git a/bsd/pthread/pthread_priority.c b/bsd/pthread/pthread_priority.c
new file mode 100644 (file)
index 0000000..53cda95
--- /dev/null
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ *
+ */
+
+#include <pthread/priority_private.h>
+
+#ifndef QOS_MIN_RELATIVE_PRIORITY // from <sys/qos.h> in userspace
+#define QOS_MIN_RELATIVE_PRIORITY -15
+#endif
+
+pthread_priority_t
+_pthread_priority_normalize(pthread_priority_t pp)
+{
+       if (pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) {
+               return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
+       }
+       if (_pthread_priority_has_qos(pp)) {
+               int relpri = _pthread_priority_relpri(pp);
+               if (relpri > 0 || relpri < QOS_MIN_RELATIVE_PRIORITY) {
+                       pp |= _PTHREAD_PRIORITY_PRIORITY_MASK;
+               }
+               return pp & (_PTHREAD_PRIORITY_OVERCOMMIT_FLAG |
+                               _PTHREAD_PRIORITY_FALLBACK_FLAG |
+                               _PTHREAD_PRIORITY_QOS_CLASS_MASK |
+                               _PTHREAD_PRIORITY_PRIORITY_MASK);
+       }
+       return _pthread_unspecified_priority();
+}
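
A quick illustration of the normalization above (editorial sketch, relying only on
the flag values defined in priority_private.h earlier in this diff):

    // An overcommit UTILITY priority that also carries the dispatch-only
    // INHERIT flag:
    pthread_priority_t pp = _pthread_priority_make_from_thread_qos(
                    THREAD_QOS_UTILITY, 0,
                    _PTHREAD_PRIORITY_OVERCOMMIT_FLAG | _PTHREAD_PRIORITY_INHERIT_FLAG);
    // _pthread_priority_normalize(pp) keeps the overcommit flag and the
    // QoS/relpri bits, and drops INHERIT; an event-manager priority collapses
    // to just _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG.
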
+
+pthread_priority_t
+_pthread_priority_normalize_for_ipc(pthread_priority_t pp)
+{
+       if (_pthread_priority_has_qos(pp)) {
+               int relpri = _pthread_priority_relpri(pp);
+               if (relpri > 0 || relpri < QOS_MIN_RELATIVE_PRIORITY) {
+                       pp |= _PTHREAD_PRIORITY_PRIORITY_MASK;
+               }
+               return pp & (_PTHREAD_PRIORITY_QOS_CLASS_MASK |
+                               _PTHREAD_PRIORITY_PRIORITY_MASK);
+       }
+       return _pthread_unspecified_priority();
+}
+
+pthread_priority_t
+_pthread_priority_combine(pthread_priority_t base_pp, thread_qos_t qos)
+{
+       if (base_pp & _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG) {
+               return _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG;
+       }
+
+       if (base_pp & _PTHREAD_PRIORITY_FALLBACK_FLAG) {
+               if (!qos) {
+                       return base_pp;
+               }
+       } else if (qos < _pthread_priority_thread_qos(base_pp)) {
+               return base_pp;
+       }
+
+       return _pthread_priority_make_from_thread_qos(qos, 0,
+                       base_pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG);
+}
diff --git a/bsd/pthread/pthread_shims.c b/bsd/pthread/pthread_shims.c
new file mode 100644 (file)
index 0000000..b23487e
--- /dev/null
@@ -0,0 +1,599 @@
+/*
+ * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#define PTHREAD_INTERNAL 1
+
+#include <stdatomic.h>
+#include <kern/debug.h>
+#include <kern/mach_param.h>
+#include <kern/sched_prim.h>
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/affinity.h>
+#include <kern/zalloc.h>
+#include <kern/policy_internal.h>
+
+#include <machine/machine_routines.h>
+#include <mach/task.h>
+#include <mach/thread_act.h>
+#include <sys/param.h>
+#include <sys/eventvar.h>
+#include <sys/pthread_shims.h>
+#include <pthread/workqueue_internal.h>
+#include <sys/cdefs.h>
+#include <sys/proc_info.h>
+#include <sys/proc_internal.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <vm/vm_map.h>
+#include <vm/vm_protos.h>
+#include <kern/kcdata.h>
+
+/* version number of the in-kernel shims given to pthread.kext */
+#define PTHREAD_SHIMS_VERSION 1
+
+/* on arm, the callbacks structure has two #ifdef arm pointers */
+#if defined(__arm__)
+#define PTHREAD_CALLBACK_MEMBER __unused_was_map_is_1gb
+#else
+#define PTHREAD_CALLBACK_MEMBER __unused_was_ml_get_max_cpus
+#endif
+
+/* compile time asserts to check the length of structures in pthread_shims.h */
+static_assert((sizeof(struct pthread_functions_s) - offsetof(struct pthread_functions_s, psynch_rw_yieldwrlock) - sizeof(void*)) == (sizeof(void*) * 100));
+static_assert((sizeof(struct pthread_callbacks_s) - offsetof(struct pthread_callbacks_s, PTHREAD_CALLBACK_MEMBER) - sizeof(void*)) == (sizeof(void*) * 100));
+
+/* old pthread code had definitions for these as they don't exist in headers */
+extern kern_return_t mach_port_deallocate(ipc_space_t, mach_port_name_t);
+extern kern_return_t semaphore_signal_internal_trap(mach_port_name_t);
+extern void thread_deallocate_safe(thread_t thread);
+
+#define PTHREAD_STRUCT_ACCESSOR(get, set, rettype, structtype, member) \
+       static rettype \
+       get(structtype x) { \
+               return (x)->member; \
+       } \
+       static void \
+       set(structtype x, rettype y) { \
+               (x)->member = y; \
+       }
+
+PTHREAD_STRUCT_ACCESSOR(proc_get_threadstart, proc_set_threadstart, user_addr_t, struct proc*, p_threadstart);
+PTHREAD_STRUCT_ACCESSOR(proc_get_pthsize, proc_set_pthsize, int, struct proc*, p_pthsize);
+PTHREAD_STRUCT_ACCESSOR(proc_get_wqthread, proc_set_wqthread, user_addr_t, struct proc*, p_wqthread);
+PTHREAD_STRUCT_ACCESSOR(proc_get_stack_addr_hint, proc_set_stack_addr_hint, user_addr_t, struct proc *, p_stack_addr_hint);
+PTHREAD_STRUCT_ACCESSOR(proc_get_pthread_tsd_offset, proc_set_pthread_tsd_offset, uint32_t, struct proc *, p_pth_tsd_offset);
+PTHREAD_STRUCT_ACCESSOR(proc_get_mach_thread_self_tsd_offset, proc_set_mach_thread_self_tsd_offset, uint64_t, struct proc *, p_mach_thread_self_offset);
+PTHREAD_STRUCT_ACCESSOR(proc_get_pthhash, proc_set_pthhash, void*, struct proc*, p_pthhash);
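
For readability, a sketch of what the first accessor invocation above expands to
(illustrative expansion of the macro, not additional code in this diff):

    static user_addr_t
    proc_get_threadstart(struct proc *x) {
            return (x)->p_threadstart;
    }
    static void
    proc_set_threadstart(struct proc *x, user_addr_t y) {
            (x)->p_threadstart = y;
    }
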
+
+#define WQPTR_IS_INITING_VALUE ((void *)~(uintptr_t)0)
+
+static void
+proc_set_dispatchqueue_offset(struct proc *p, uint64_t offset)
+{
+       p->p_dispatchqueue_offset = offset;
+}
+
+static void
+proc_set_return_to_kernel_offset(struct proc *p, uint64_t offset)
+{
+       p->p_return_to_kernel_offset = offset;
+}
+
+static user_addr_t
+proc_get_user_stack(struct proc *p)
+{
+       return p->user_stack;
+}
+
+static void
+uthread_set_returnval(struct uthread *uth, int retval)
+{
+       uth->uu_rval[0] = retval;
+}
+
+__attribute__((noreturn))
+static void
+pthread_returning_to_userspace(void)
+{
+       thread_exception_return();
+}
+
+__attribute__((noreturn))
+static void
+pthread_bootstrap_return(void)
+{
+       thread_bootstrap_return();
+}
+
+static uint32_t
+get_task_threadmax(void) {
+       return task_threadmax;
+}
+
+static uint64_t
+proc_get_register(struct proc *p) {
+       return (p->p_lflag & P_LREGISTER);
+}
+
+static void
+proc_set_register(struct proc *p) {
+       proc_setregister(p);
+}
+
+static void*
+uthread_get_uukwe(struct uthread *t)
+{
+       return &t->uu_save.uus_kwe;
+}
+
+static int
+uthread_is_cancelled(struct uthread *t)
+{
+       return (t->uu_flag & (UT_CANCELDISABLE | UT_CANCEL | UT_CANCELED)) == UT_CANCEL;
+}
+
+static vm_map_t
+_current_map(void)
+{
+       return current_map();
+}
+
+static boolean_t
+qos_main_thread_active(void)
+{
+       return TRUE;
+}
+
+static int proc_usynch_get_requested_thread_qos(struct uthread *uth)
+{
+       thread_t        thread = uth ? uth->uu_thread : current_thread();
+       int                     requested_qos;
+
+       requested_qos = proc_get_thread_policy(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS);
+
+       /*
+        * For the purposes of userspace synchronization, it doesn't make sense to
+        * place an override of UNSPECIFIED on another thread, if the current thread
+        * doesn't have any QoS set. In these cases, upgrade to
+        * THREAD_QOS_USER_INTERACTIVE.
+        */
+       if (requested_qos == THREAD_QOS_UNSPECIFIED) {
+               requested_qos = THREAD_QOS_USER_INTERACTIVE;
+       }
+
+       return requested_qos;
+}
+
+static boolean_t
+proc_usynch_thread_qos_add_override_for_resource(task_t task, struct uthread *uth,
+               uint64_t tid, int override_qos, boolean_t first_override_for_resource,
+               user_addr_t resource, int resource_type)
+{
+       thread_t thread = uth ? uth->uu_thread : THREAD_NULL;
+
+       return proc_thread_qos_add_override(task, thread, tid, override_qos,
+                       first_override_for_resource, resource, resource_type) == 0;
+}
+
+static boolean_t
+proc_usynch_thread_qos_remove_override_for_resource(task_t task,
+               struct uthread *uth, uint64_t tid, user_addr_t resource, int resource_type)
+{
+       thread_t thread = uth ? uth->uu_thread : THREAD_NULL;
+
+       return proc_thread_qos_remove_override(task, thread, tid, resource,
+                       resource_type) == 0;
+}
+
+
+static wait_result_t
+psynch_wait_prepare(uintptr_t kwq, struct turnstile **tstore,
+               thread_t owner, block_hint_t block_hint, uint64_t deadline)
+{
+       struct turnstile *ts;
+       wait_result_t wr;
+
+       if (tstore) {
+               ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL,
+                               TURNSTILE_PTHREAD_MUTEX);
+
+               turnstile_update_inheritor(ts, owner,
+                               (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+               thread_set_pending_block_hint(current_thread(), block_hint);
+
+               wr = waitq_assert_wait64_leeway(&ts->ts_waitq, (event64_t)kwq,
+                               THREAD_ABORTSAFE, TIMEOUT_URGENCY_USER_NORMAL, deadline, 0);
+       } else {
+               thread_set_pending_block_hint(current_thread(), block_hint);
+
+               wr = assert_wait_deadline_with_leeway((event_t)kwq, THREAD_ABORTSAFE,
+                               TIMEOUT_URGENCY_USER_NORMAL, deadline, 0);
+       }
+
+       return wr;
+}
+
+static void
+psynch_wait_update_complete(struct turnstile *ts)
+{
+       assert(ts);
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+}
+
+static void
+psynch_wait_complete(uintptr_t kwq, struct turnstile **tstore)
+{
+       assert(tstore);
+       turnstile_complete(kwq, tstore, NULL);
+}
+
+static void
+psynch_wait_update_owner(uintptr_t kwq, thread_t owner,
+               struct turnstile **tstore)
+{
+       struct turnstile *ts;
+
+       ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL,
+                       TURNSTILE_PTHREAD_MUTEX);
+
+       turnstile_update_inheritor(ts, owner,
+                       (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+       turnstile_complete(kwq, tstore, NULL);
+}
+
+static void
+psynch_wait_cleanup(void)
+{
+       turnstile_cleanup();
+}
+
+static kern_return_t
+psynch_wait_wakeup(uintptr_t kwq, struct ksyn_waitq_element *kwe,
+               struct turnstile **tstore)
+{
+       struct uthread *uth;
+       struct turnstile *ts;
+       kern_return_t kr;
+
+       uth = __container_of(kwe, struct uthread, uu_save.uus_kwe);
+       assert(uth);
+
+       if (tstore) {
+               ts = turnstile_prepare(kwq, tstore, TURNSTILE_NULL,
+                               TURNSTILE_PTHREAD_MUTEX);
+               turnstile_update_inheritor(ts, uth->uu_thread,
+                               (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+               kr = waitq_wakeup64_thread(&ts->ts_waitq, (event64_t)kwq,
+                               uth->uu_thread, THREAD_AWAKENED);
+
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+               turnstile_complete(kwq, tstore, NULL);
+       } else {
+               kr = thread_wakeup_thread((event_t)kwq, uth->uu_thread);
+       }
+
+       return kr;
+}
+
+/* kernel (core) to kext shims */
+
+void
+pthread_init(void)
+{
+       if (!pthread_functions) {
+               panic("pthread kernel extension not loaded (function table is NULL).");
+       }
+       pthread_functions->pthread_init();
+}
+
+void
+pth_proc_hashinit(proc_t p)
+{
+       pthread_functions->pth_proc_hashinit(p);
+}
+
+void
+pth_proc_hashdelete(proc_t p)
+{
+       pthread_functions->pth_proc_hashdelete(p);
+}
+
+/* syscall shims */
+int
+bsdthread_create(struct proc *p, struct bsdthread_create_args *uap, user_addr_t *retval)
+{
+       return pthread_functions->bsdthread_create(p, uap->func, uap->func_arg, uap->stack, uap->pthread, uap->flags, retval);
+}
+
+int
+bsdthread_register(struct proc *p, struct bsdthread_register_args *uap, __unused int32_t *retval)
+{
+       kern_return_t kr;
+       static_assert(offsetof(struct bsdthread_register_args, threadstart) + sizeof(user_addr_t) ==
+                       offsetof(struct bsdthread_register_args, wqthread));
+       kr = machine_thread_function_pointers_convert_from_user(current_thread(), &uap->threadstart, 2);
+       assert(kr == KERN_SUCCESS);
+
+       if (pthread_functions->version >= 1) {
+               return pthread_functions->bsdthread_register2(p, uap->threadstart,
+                               uap->wqthread, uap->flags, uap->stack_addr_hint,
+                               uap->targetconc_ptr, uap->dispatchqueue_offset,
+                               uap->tsd_offset, retval);
+       } else {
+               return pthread_functions->bsdthread_register(p, uap->threadstart,
+                               uap->wqthread, uap->flags, uap->stack_addr_hint,
+                               uap->targetconc_ptr, uap->dispatchqueue_offset,
+                               retval);
+       }
+}
+
+int
+bsdthread_terminate(struct proc *p, struct bsdthread_terminate_args *uap, int32_t *retval)
+{
+       thread_t th = current_thread();
+       if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
+               workq_thread_terminate(p, get_bsdthread_info(th));
+       }
+       return pthread_functions->bsdthread_terminate(p, uap->stackaddr, uap->freesize, uap->port, uap->sem, retval);
+}
+
+int
+thread_selfid(struct proc *p, __unused struct thread_selfid_args *uap, uint64_t *retval)
+{
+       return pthread_functions->thread_selfid(p, retval);
+}
+
+/* pthread synchroniser syscalls */
+
+int
+psynch_mutexwait(proc_t p, struct psynch_mutexwait_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_mutexwait(p, uap->mutex, uap->mgen, uap->ugen, uap->tid, uap->flags, retval);
+}
+
+int
+psynch_mutexdrop(proc_t p, struct psynch_mutexdrop_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_mutexdrop(p, uap->mutex, uap->mgen, uap->ugen, uap->tid, uap->flags, retval);
+}
+
+int
+psynch_cvbroad(proc_t p, struct psynch_cvbroad_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_cvbroad(p, uap->cv, uap->cvlsgen, uap->cvudgen, uap->flags, uap->mutex, uap->mugen, uap->tid, retval);
+}
+
+int
+psynch_cvsignal(proc_t p, struct psynch_cvsignal_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_cvsignal(p, uap->cv, uap->cvlsgen, uap->cvugen, uap->thread_port, uap->mutex, uap->mugen, uap->tid, uap->flags, retval);
+}
+
+int
+psynch_cvwait(proc_t p, struct psynch_cvwait_args * uap, uint32_t * retval)
+{
+       return pthread_functions->psynch_cvwait(p, uap->cv, uap->cvlsgen, uap->cvugen, uap->mutex, uap->mugen, uap->flags, uap->sec, uap->nsec, retval);
+}
+
+int
+psynch_cvclrprepost(proc_t p, struct psynch_cvclrprepost_args * uap, int *retval)
+{
+       return pthread_functions->psynch_cvclrprepost(p, uap->cv, uap->cvgen, uap->cvugen, uap->cvsgen, uap->prepocnt, uap->preposeq, uap->flags, retval);
+}
+
+int
+psynch_rw_longrdlock(proc_t p, struct psynch_rw_longrdlock_args * uap,  uint32_t *retval)
+{
+       return pthread_functions->psynch_rw_longrdlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
+}
+
+int
+psynch_rw_rdlock(proc_t p, struct psynch_rw_rdlock_args * uap, uint32_t * retval)
+{
+       return pthread_functions->psynch_rw_rdlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
+}
+
+int
+psynch_rw_unlock(proc_t p, struct psynch_rw_unlock_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_rw_unlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
+}
+
+int
+psynch_rw_unlock2(__unused proc_t p, __unused struct psynch_rw_unlock2_args *uap, __unused uint32_t *retval)
+{
+       return ENOTSUP;
+}
+
+int
+psynch_rw_wrlock(proc_t p, struct psynch_rw_wrlock_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_rw_wrlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
+}
+
+int
+psynch_rw_yieldwrlock(proc_t p, struct psynch_rw_yieldwrlock_args *uap, uint32_t *retval)
+{
+       return pthread_functions->psynch_rw_yieldwrlock(p, uap->rwlock, uap->lgenval, uap->ugenval, uap->rw_wc, uap->flags, retval);
+}
+
+int
+psynch_rw_upgrade(__unused proc_t p, __unused struct psynch_rw_upgrade_args * uap, __unused uint32_t *retval)
+{
+       return 0;
+}
+
+int
+psynch_rw_downgrade(__unused proc_t p, __unused struct psynch_rw_downgrade_args * uap, __unused int *retval)
+{
+       return 0;
+}
+
+void
+kdp_pthread_find_owner(thread_t thread, struct stackshot_thread_waitinfo *waitinfo)
+{
+       if (pthread_functions->pthread_find_owner)
+               pthread_functions->pthread_find_owner(thread, waitinfo);
+}
+
+void *
+kdp_pthread_get_thread_kwq(thread_t thread)
+{
+       if (pthread_functions->pthread_get_thread_kwq)
+               return pthread_functions->pthread_get_thread_kwq(thread);
+
+       return NULL;
+}
+
+void
+thread_will_park_or_terminate(thread_t thread)
+{
+       if (thread_owned_workloops_count(thread)) {
+               (void)kevent_exit_on_workloop_ownership_leak(thread);
+       }
+}
+
+/*
+ * The callbacks structure (defined in pthread_shims.h) contains a collection
+ * of kernel functions that were not deemed sensible to expose as a KPI to all
+ * kernel extensions. So the kext is given them in the form of a structure of
+ * function pointers.
+ */
+static const struct pthread_callbacks_s pthread_callbacks = {
+       .version = PTHREAD_SHIMS_VERSION,
+       .config_thread_max = CONFIG_THREAD_MAX,
+       .get_task_threadmax = get_task_threadmax,
+
+       .proc_get_threadstart = proc_get_threadstart,
+       .proc_set_threadstart = proc_set_threadstart,
+       .proc_get_pthsize = proc_get_pthsize,
+       .proc_set_pthsize = proc_set_pthsize,
+       .proc_get_wqthread = proc_get_wqthread,
+       .proc_set_wqthread = proc_set_wqthread,
+       .proc_set_dispatchqueue_offset = proc_set_dispatchqueue_offset,
+       .proc_get_pthhash = proc_get_pthhash,
+       .proc_set_pthhash = proc_set_pthhash,
+       .proc_get_register = proc_get_register,
+       .proc_set_register = proc_set_register,
+
+       /* kernel IPC interfaces */
+       .ipc_port_copyout_send = ipc_port_copyout_send,
+       .task_get_ipcspace = get_task_ipcspace,
+       .vm_map_page_info = vm_map_page_info,
+       .thread_set_wq_state32 = thread_set_wq_state32,
+#if !defined(__arm__)
+       .thread_set_wq_state64 = thread_set_wq_state64,
+#endif
+
+       .uthread_get_uukwe = uthread_get_uukwe,
+       .uthread_set_returnval = uthread_set_returnval,
+       .uthread_is_cancelled = uthread_is_cancelled,
+
+       .thread_exception_return = pthread_returning_to_userspace,
+       .thread_bootstrap_return = pthread_bootstrap_return,
+       .unix_syscall_return = unix_syscall_return,
+
+       .get_bsdthread_info = (void*)get_bsdthread_info,
+       .thread_policy_set_internal = thread_policy_set_internal,
+       .thread_policy_get = thread_policy_get,
+
+       .__pthread_testcancel = __pthread_testcancel,
+
+       .mach_port_deallocate = mach_port_deallocate,
+       .semaphore_signal_internal_trap = semaphore_signal_internal_trap,
+       .current_map = _current_map,
+       .thread_create = thread_create,
+       .thread_resume = thread_resume,
+
+       .convert_thread_to_port = convert_thread_to_port,
+
+       .proc_get_stack_addr_hint = proc_get_stack_addr_hint,
+       .proc_set_stack_addr_hint = proc_set_stack_addr_hint,
+       .proc_get_pthread_tsd_offset = proc_get_pthread_tsd_offset,
+       .proc_set_pthread_tsd_offset = proc_set_pthread_tsd_offset,
+       .proc_get_mach_thread_self_tsd_offset = proc_get_mach_thread_self_tsd_offset,
+       .proc_set_mach_thread_self_tsd_offset = proc_set_mach_thread_self_tsd_offset,
+
+       .thread_set_tsd_base = thread_set_tsd_base,
+
+       .proc_usynch_get_requested_thread_qos = proc_usynch_get_requested_thread_qos,
+
+       .qos_main_thread_active = qos_main_thread_active,
+       .thread_set_voucher_name = thread_set_voucher_name,
+
+       .proc_usynch_thread_qos_add_override_for_resource = proc_usynch_thread_qos_add_override_for_resource,
+       .proc_usynch_thread_qos_remove_override_for_resource = proc_usynch_thread_qos_remove_override_for_resource,
+
+       .thread_set_tag = thread_set_tag,
+       .thread_get_tag = thread_get_tag,
+
+       .proc_set_return_to_kernel_offset = proc_set_return_to_kernel_offset,
+       .thread_will_park_or_terminate = thread_will_park_or_terminate,
+
+       .proc_get_user_stack = proc_get_user_stack,
+       .task_findtid = task_findtid,
+       .thread_deallocate_safe = thread_deallocate_safe,
+
+       .psynch_wait_prepare = psynch_wait_prepare,
+       .psynch_wait_update_complete = psynch_wait_update_complete,
+       .psynch_wait_complete = psynch_wait_complete,
+       .psynch_wait_cleanup = psynch_wait_cleanup,
+       .psynch_wait_wakeup = psynch_wait_wakeup,
+       .psynch_wait_update_owner = psynch_wait_update_owner,
+};
+
+pthread_callbacks_t pthread_kern = &pthread_callbacks;
+pthread_functions_t pthread_functions = NULL;
+
+/*
+ * pthread_kext_register is called by pthread.kext upon load; it has to provide
+ * us with a function pointer table of pthread internal calls. In return, this
+ * file provides it with a table of function pointers it needs.
+ */
+
+void
+pthread_kext_register(pthread_functions_t fns, pthread_callbacks_t *callbacks)
+{
+       if (pthread_functions != NULL) {
+               panic("Re-initialisation of pthread kext callbacks.");
+       }
+
+       if (callbacks != NULL) {
+               *callbacks = &pthread_callbacks;
+       } else {
+               panic("pthread_kext_register called without callbacks pointer.");
+       }
+
+       if (fns) {
+               pthread_functions = fns;
+       }
+}
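
To illustrate the other side of this handshake, a hypothetical pthread.kext start
routine could register itself as sketched below. The names _pthread_internal_functions,
_pthread_init and pthread_start are invented for this sketch; the real kext entry
point is not part of this diff.

    /* Hypothetical kext-side registration. */
    static struct pthread_functions_s _pthread_internal_functions = {
            .version = 1,                           /* see the version check above */
            .pthread_init = _pthread_init,          /* kext-internal implementation */
            /* ... bsdthread_*, psynch_*, workq_* entry points ... */
    };

    pthread_callbacks_t pthread_kern;               /* xnu services used by the kext */

    kern_return_t
    pthread_start(kmod_info_t *ki __unused, void *d __unused)
    {
            /* Hand xnu our function table; receive its callback table in return. */
            pthread_kext_register(&_pthread_internal_functions, &pthread_kern);
            return KERN_SUCCESS;
    }
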
diff --git a/bsd/pthread/pthread_workqueue.c b/bsd/pthread/pthread_workqueue.c
new file mode 100644 (file)
index 0000000..0e8aee8
--- /dev/null
@@ -0,0 +1,3467 @@
+/*
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/* Copyright (c) 1995-2018 Apple, Inc. All Rights Reserved */
+
+#include <sys/cdefs.h>
+
+// <rdar://problem/26158937> panic() should be marked noreturn
+extern void panic(const char *string, ...) __printflike(1,2) __dead2;
+
+#include <kern/assert.h>
+#include <kern/ast.h>
+#include <kern/clock.h>
+#include <kern/cpu_data.h>
+#include <kern/kern_types.h>
+#include <kern/policy_internal.h>
+#include <kern/processor.h>
+#include <kern/sched_prim.h>   /* for thread_exception_return */
+#include <kern/task.h>
+#include <kern/thread.h>
+#include <kern/zalloc.h>
+#include <mach/kern_return.h>
+#include <mach/mach_param.h>
+#include <mach/mach_port.h>
+#include <mach/mach_types.h>
+#include <mach/mach_vm.h>
+#include <mach/sync_policy.h>
+#include <mach/task.h>
+#include <mach/thread_act.h> /* for thread_resume */
+#include <mach/thread_policy.h>
+#include <mach/thread_status.h>
+#include <mach/vm_prot.h>
+#include <mach/vm_statistics.h>
+#include <machine/atomic.h>
+#include <machine/machine_routines.h>
+#include <vm/vm_map.h>
+#include <vm/vm_protos.h>
+
+#include <sys/eventvar.h>
+#include <sys/kdebug.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/param.h>
+#include <sys/proc_info.h>     /* for fill_procworkqueue */
+#include <sys/proc_internal.h>
+#include <sys/pthread_shims.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/sysproto.h>
+#include <sys/systm.h>
+#include <sys/ulock.h> /* for ulock_owner_value_to_port_name */
+
+#include <pthread/bsdthread_private.h>
+#include <pthread/workqueue_syscalls.h>
+#include <pthread/workqueue_internal.h>
+#include <pthread/workqueue_trace.h>
+
+#include <os/log.h>
+
+extern thread_t        port_name_to_thread(mach_port_name_t port_name); /* osfmk/kern/ipc_tt.h   */
+
+static void workq_unpark_continue(void *uth, wait_result_t wr) __dead2;
+static void workq_schedule_creator(proc_t p, struct workqueue *wq, int flags);
+
+static bool workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
+               workq_threadreq_t req);
+
+static uint32_t workq_constrained_allowance(struct workqueue *wq,
+               thread_qos_t at_qos, struct uthread *uth, bool may_start_timer);
+
+static bool workq_thread_is_busy(uint64_t cur_ts,
+               _Atomic uint64_t *lastblocked_tsp);
+
+static int workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS;
+
+#pragma mark globals
+
+struct workq_usec_var {
+       uint32_t usecs;
+       uint64_t abstime;
+};
+
+#define WORKQ_SYSCTL_USECS(var, init) \
+               static struct workq_usec_var var = { .usecs = init }; \
+               SYSCTL_OID(_kern, OID_AUTO, var##_usecs, \
+                               CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &var, 0, \
+                               workq_sysctl_handle_usecs, "I", "")
+
+static lck_grp_t      *workq_lck_grp;
+static lck_attr_t     *workq_lck_attr;
+static lck_grp_attr_t *workq_lck_grp_attr;
+os_refgrp_decl(static, workq_refgrp, "workq", NULL);
+
+static zone_t workq_zone_workqueue;
+static zone_t workq_zone_threadreq;
+
+WORKQ_SYSCTL_USECS(wq_stalled_window,     WQ_STALLED_WINDOW_USECS);
+WORKQ_SYSCTL_USECS(wq_reduce_pool_window, WQ_REDUCE_POOL_WINDOW_USECS);
+WORKQ_SYSCTL_USECS(wq_max_timer_interval, WQ_MAX_TIMER_INTERVAL_USECS);
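
For reference, the first invocation above expands roughly as follows, so each window
is tunable at run time as kern.wq_stalled_window_usecs (and likewise for the other
two), with workq_sysctl_handle_usecs keeping the cached absolute-time value in sync:

    static struct workq_usec_var wq_stalled_window = {
            .usecs = WQ_STALLED_WINDOW_USECS,
    };
    SYSCTL_OID(_kern, OID_AUTO, wq_stalled_window_usecs,
                    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &wq_stalled_window, 0,
                    workq_sysctl_handle_usecs, "I", "");
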
+static uint32_t wq_max_threads              = WORKQUEUE_MAXTHREADS;
+static uint32_t wq_max_constrained_threads  = WORKQUEUE_MAXTHREADS / 8;
+static uint32_t wq_init_constrained_limit   = 1;
+static uint16_t wq_death_max_load;
+static uint32_t wq_max_parallelism[WORKQ_NUM_QOS_BUCKETS];
+
+#pragma mark sysctls
+
+static int
+workq_sysctl_handle_usecs SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg2)
+       struct workq_usec_var *v = arg1;
+       int error = sysctl_handle_int(oidp, &v->usecs, 0, req);
+       if (error || !req->newptr)
+               return error;
+       clock_interval_to_absolutetime_interval(v->usecs, NSEC_PER_USEC,
+                       &v->abstime);
+       return 0;
+}
+
+SYSCTL_INT(_kern, OID_AUTO, wq_max_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
+               &wq_max_threads, 0, "");
+
+SYSCTL_INT(_kern, OID_AUTO, wq_max_constrained_threads, CTLFLAG_RW | CTLFLAG_LOCKED,
+               &wq_max_constrained_threads, 0, "");
+
+#pragma mark p_wqptr
+
+#define WQPTR_IS_INITING_VALUE ((struct workqueue *)~(uintptr_t)0)
+
+static struct workqueue *
+proc_get_wqptr_fast(struct proc *p)
+{
+       return os_atomic_load(&p->p_wqptr, relaxed);
+}
+
+static struct workqueue *
+proc_get_wqptr(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+       return wq == WQPTR_IS_INITING_VALUE ? NULL : wq;
+}
+
+static void
+proc_set_wqptr(struct proc *p, struct workqueue *wq)
+{
+       wq = os_atomic_xchg(&p->p_wqptr, wq, release);
+       if (wq == WQPTR_IS_INITING_VALUE) {
+               proc_lock(p);
+               thread_wakeup(&p->p_wqptr);
+               proc_unlock(p);
+       }
+}
+
+static bool
+proc_init_wqptr_or_wait(struct proc *p)
+{
+       struct workqueue *wq;
+
+       proc_lock(p);
+       wq = p->p_wqptr;
+
+       if (wq == NULL) {
+               p->p_wqptr = WQPTR_IS_INITING_VALUE;
+               proc_unlock(p);
+               return true;
+       }
+
+       if (wq == WQPTR_IS_INITING_VALUE) {
+               assert_wait(&p->p_wqptr, THREAD_UNINT);
+               proc_unlock(p);
+               thread_block(THREAD_CONTINUE_NULL);
+       } else {
+               proc_unlock(p);
+       }
+       return false;
+}
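
A hypothetical caller pattern for the initialization protocol above (the actual
caller lives elsewhere in this file and is not part of this hunk; error handling
is elided):

    struct workqueue *wq;
    for (;;) {
            wq = proc_get_wqptr(p);
            if (wq != NULL) {
                    break;                          /* already initialized */
            }
            if (proc_init_wqptr_or_wait(p)) {
                    /* We won the race: p_wqptr now holds WQPTR_IS_INITING_VALUE. */
                    wq = zalloc(workq_zone_workqueue);
                    /* ... initialize wq ... */
                    proc_set_wqptr(p, wq);          /* publishes wq and wakes waiters */
                    break;
            }
            /* Lost the race and blocked; loop and re-check. */
    }
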
+
+static inline event_t
+workq_parked_wait_event(struct uthread *uth)
+{
+       return (event_t)&uth->uu_workq_stackaddr;
+}
+
+static inline void
+workq_thread_wakeup(struct uthread *uth)
+{
+       if ((uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) == 0) {
+               thread_wakeup_thread(workq_parked_wait_event(uth), uth->uu_thread);
+       }
+}
+
+#pragma mark wq_thactive
+
+#if defined(__LP64__)
+// Layout is:
+//   127 - 115 : 13 bits of zeroes
+//   114 - 112 : best QoS among all pending constrained requests
+//   111 -   0 : MGR, AUI, UI, IN, DF, UT, BG+MT buckets every 16 bits
+#define WQ_THACTIVE_BUCKET_WIDTH 16
+#define WQ_THACTIVE_QOS_SHIFT    (7 * WQ_THACTIVE_BUCKET_WIDTH)
+#else
+// Layout is:
+//   63 - 61 : best QoS among all pending constrained requests
+//   60      : Manager bucket (0 or 1)
+//   59 -  0 : AUI, UI, IN, DF, UT, BG+MT buckets every 10 bits
+#define WQ_THACTIVE_BUCKET_WIDTH 10
+#define WQ_THACTIVE_QOS_SHIFT    (6 * WQ_THACTIVE_BUCKET_WIDTH + 1)
+#endif
+#define WQ_THACTIVE_BUCKET_MASK  ((1U << WQ_THACTIVE_BUCKET_WIDTH) - 1)
+#define WQ_THACTIVE_BUCKET_HALF  (1U << (WQ_THACTIVE_BUCKET_WIDTH - 1))
+
+static_assert(sizeof(wq_thactive_t) * CHAR_BIT - WQ_THACTIVE_QOS_SHIFT >= 3,
+               "Make sure we have space to encode a QoS");
+
+static inline wq_thactive_t
+_wq_thactive(struct workqueue *wq)
+{
+       return os_atomic_load(&wq->wq_thactive, relaxed);
+}
+
+static inline int
+_wq_bucket(thread_qos_t qos)
+{
+       // Map both BG and MT to the same bucket by over-shifting down and
+       // clamping MT and BG together.
+       switch (qos) {
+       case THREAD_QOS_MAINTENANCE:
+               return 0;
+       default:
+               return qos - 2;
+       }
+}
+
+#define WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(tha) \
+               ((tha) >> WQ_THACTIVE_QOS_SHIFT)
+
+static inline thread_qos_t
+_wq_thactive_best_constrained_req_qos(struct workqueue *wq)
+{
+       // Avoid expensive atomic operations: the three bits we're loading are in
+       // a single byte, and always updated under the workqueue lock
+       wq_thactive_t v = *(wq_thactive_t *)&wq->wq_thactive;
+       return WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(v);
+}
+
+static void
+_wq_thactive_refresh_best_constrained_req_qos(struct workqueue *wq)
+{
+       thread_qos_t old_qos, new_qos;
+       workq_threadreq_t req;
+
+       req = priority_queue_max(&wq->wq_constrained_queue,
+                       struct workq_threadreq_s, tr_entry);
+       new_qos = req ? req->tr_qos : THREAD_QOS_UNSPECIFIED;
+       old_qos = _wq_thactive_best_constrained_req_qos(wq);
+       if (old_qos != new_qos) {
+               long delta = (long)new_qos - (long)old_qos;
+               wq_thactive_t v = (wq_thactive_t)delta << WQ_THACTIVE_QOS_SHIFT;
+               /*
+                * We can do an atomic add relative to the initial load because updates
+                * to this qos are always serialized under the workqueue lock.
+                */
+               v = os_atomic_add(&wq->wq_thactive, v, relaxed);
+#ifdef __LP64__
+               WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, (uint64_t)v,
+                               (uint64_t)(v >> 64), 0, 0);
+#else
+               WQ_TRACE_WQ(TRACE_wq_thactive_update, wq, v, 0, 0, 0);
+#endif
+       }
+}
+
+static inline wq_thactive_t
+_wq_thactive_offset_for_qos(thread_qos_t qos)
+{
+       return (wq_thactive_t)1 << (_wq_bucket(qos) * WQ_THACTIVE_BUCKET_WIDTH);
+}
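
A worked example of the packing described in the layout comment above, assuming the
LP64 layout and THREAD_QOS_UTILITY == 3, THREAD_QOS_USER_INTERACTIVE == 6:

    // _wq_bucket(THREAD_QOS_UTILITY)           == 3 - 2 == 1
    // _wq_thactive_offset_for_qos(UTILITY)     == (wq_thactive_t)1 << (1 * 16)
    //   -> _wq_thactive_inc() bumps the UT counter held in bits 16..31.
    // _wq_bucket(THREAD_QOS_USER_INTERACTIVE)  == 6 - 2 == 4
    //   -> the UI counter lives in bits 64..79,
    // while bits 112..114 always hold the best pending constrained-request QoS.
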
+
+static inline wq_thactive_t
+_wq_thactive_inc(struct workqueue *wq, thread_qos_t qos)
+{
+       wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
+       return os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
+}
+
+static inline wq_thactive_t
+_wq_thactive_dec(struct workqueue *wq, thread_qos_t qos)
+{
+       wq_thactive_t v = _wq_thactive_offset_for_qos(qos);
+       return os_atomic_sub_orig(&wq->wq_thactive, v, relaxed);
+}
+
+static inline void
+_wq_thactive_move(struct workqueue *wq,
+               thread_qos_t old_qos, thread_qos_t new_qos)
+{
+       wq_thactive_t v = _wq_thactive_offset_for_qos(new_qos) -
+                       _wq_thactive_offset_for_qos(old_qos);
+       os_atomic_add_orig(&wq->wq_thactive, v, relaxed);
+       wq->wq_thscheduled_count[_wq_bucket(old_qos)]--;
+       wq->wq_thscheduled_count[_wq_bucket(new_qos)]++;
+}
+
+static inline uint32_t
+_wq_thactive_aggregate_downto_qos(struct workqueue *wq, wq_thactive_t v,
+               thread_qos_t qos, uint32_t *busycount, uint32_t *max_busycount)
+{
+       uint32_t count = 0, active;
+       uint64_t curtime;
+
+       assert(WORKQ_THREAD_QOS_MIN <= qos && qos <= WORKQ_THREAD_QOS_MAX);
+
+       if (busycount) {
+               curtime = mach_absolute_time();
+               *busycount = 0;
+       }
+       if (max_busycount) {
+               *max_busycount = THREAD_QOS_LAST - qos;
+       }
+
+       int i = _wq_bucket(qos);
+       v >>= i * WQ_THACTIVE_BUCKET_WIDTH;
+       for (; i < WORKQ_NUM_QOS_BUCKETS; i++, v >>= WQ_THACTIVE_BUCKET_WIDTH) {
+               active = v & WQ_THACTIVE_BUCKET_MASK;
+               count += active;
+
+               if (busycount && wq->wq_thscheduled_count[i] > active) {
+                       if (workq_thread_is_busy(curtime, &wq->wq_lastblocked_ts[i])) {
+                               /*
+                                * We only consider the last blocked thread for a given bucket
+                                * as busy because we don't want to take the list lock in each
+                                * sched callback. However this is an approximation that could
+                                * contribute to thread creation storms.
+                                */
+                               (*busycount)++;
+                       }
+               }
+       }
+
+       return count;
+}
+
+#pragma mark wq_flags
+
+static inline uint32_t
+_wq_flags(struct workqueue *wq)
+{
+       return os_atomic_load(&wq->wq_flags, relaxed);
+}
+
+static inline bool
+_wq_exiting(struct workqueue *wq)
+{
+       return _wq_flags(wq) & WQ_EXITING;
+}
+
+bool
+workq_is_exiting(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+       return !wq || _wq_exiting(wq);
+}
+
+struct turnstile *
+workq_turnstile(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+       return wq ? wq->wq_turnstile : TURNSTILE_NULL;
+}
+
+#pragma mark workqueue lock
+
+static bool
+workq_lock_spin_is_acquired_kdp(struct workqueue *wq)
+{
+       return kdp_lck_spin_is_acquired(&wq->wq_lock);
+}
+
+static inline void
+workq_lock_spin(struct workqueue *wq)
+{
+       lck_spin_lock(&wq->wq_lock);
+}
+
+static inline void
+workq_lock_held(__assert_only struct workqueue *wq)
+{
+       LCK_SPIN_ASSERT(&wq->wq_lock, LCK_ASSERT_OWNED);
+}
+
+static inline bool
+workq_lock_try(struct workqueue *wq)
+{
+       return lck_spin_try_lock(&wq->wq_lock);
+}
+
+static inline void
+workq_unlock(struct workqueue *wq)
+{
+       lck_spin_unlock(&wq->wq_lock);
+}
+
+#pragma mark idle thread lists
+
+#define WORKQ_POLICY_INIT(qos) \
+               (struct uu_workq_policy){ .qos_req = qos, .qos_bucket = qos }
+
+static inline thread_qos_t
+workq_pri_bucket(struct uu_workq_policy req)
+{
+       return MAX(MAX(req.qos_req, req.qos_max), req.qos_override);
+}
+
+static inline thread_qos_t
+workq_pri_override(struct uu_workq_policy req)
+{
+       return MAX(workq_pri_bucket(req), req.qos_bucket);
+}
+
+static inline bool
+workq_thread_needs_params_change(workq_threadreq_t req, struct uthread *uth)
+{
+       workq_threadreq_param_t cur_trp, req_trp = { };
+
+       cur_trp.trp_value = uth->uu_save.uus_workq_park_data.workloop_params;
+       if (req->tr_flags & TR_FLAG_WL_PARAMS) {
+               req_trp = kqueue_threadreq_workloop_param(req);
+       }
+
+       /*
+        * CPU percent flags are handled separately to policy changes, so ignore
+        * them for all of these checks.
+        */
+       uint16_t cur_flags = (cur_trp.trp_flags & ~TRP_CPUPERCENT);
+       uint16_t req_flags = (req_trp.trp_flags & ~TRP_CPUPERCENT);
+
+       if (!req_flags && !cur_flags) {
+               return false;
+       }
+
+       if (req_flags != cur_flags) {
+               return true;
+       }
+
+       if ((req_flags & TRP_PRIORITY) && req_trp.trp_pri != cur_trp.trp_pri) {
+               return true;
+       }
+
+       if ((req_flags & TRP_POLICY) && req_trp.trp_pol != cur_trp.trp_pol) {
+               return true;
+       }
+
+       return false;
+}
+
+static inline bool
+workq_thread_needs_priority_change(workq_threadreq_t req, struct uthread *uth)
+{
+       if (workq_thread_needs_params_change(req, uth)) {
+               return true;
+       }
+
+       return req->tr_qos != workq_pri_override(uth->uu_workq_pri);
+}
+
+static void
+workq_thread_update_bucket(proc_t p, struct workqueue *wq, struct uthread *uth,
+               struct uu_workq_policy old_pri, struct uu_workq_policy new_pri,
+               bool force_run)
+{
+       thread_qos_t old_bucket = old_pri.qos_bucket;
+       thread_qos_t new_bucket = workq_pri_bucket(new_pri);
+
+       if (old_bucket != new_bucket) {
+               _wq_thactive_move(wq, old_bucket, new_bucket);
+       }
+
+       new_pri.qos_bucket = new_bucket;
+       uth->uu_workq_pri = new_pri;
+
+       if (workq_pri_override(old_pri) != new_bucket) {
+               thread_set_workq_override(uth->uu_thread, new_bucket);
+       }
+
+       if (wq->wq_reqcount && (old_bucket > new_bucket || force_run)) {
+               int flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
+               if (old_bucket > new_bucket) {
+                       /*
+                        * When lowering our bucket, we may unblock a thread request,
+                        * but we can't drop our priority before we have evaluated
+                        * whether this is the case, and if we ever drop the workqueue lock
+                        * that would cause a priority inversion.
+                        *
+                        * We hence have to disallow thread creation in that case.
+                        */
+                       flags = 0;
+               }
+               workq_schedule_creator(p, wq, flags);
+       }
+}
+
+/*
+ * Sets/resets the cpu percent limits on the current thread. We can't set
+ * these limits from outside of the current thread, so this function needs
+ * to be called when we're executing on the intended thread.
+ */
+static void
+workq_thread_reset_cpupercent(workq_threadreq_t req, struct uthread *uth)
+{
+       assert(uth == current_uthread());
+       workq_threadreq_param_t trp = { };
+
+       if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) {
+               trp = kqueue_threadreq_workloop_param(req);
+       }
+
+       if (uth->uu_workq_flags & UT_WORKQ_CPUPERCENT) {
+               /*
+                * Going through disable when we have an existing CPU percent limit
+                * set will force the ledger to refill the token bucket of the current
+                * thread, removing any penalty applied by previous thread use.
+                */
+               thread_set_cpulimit(THREAD_CPULIMIT_DISABLE, 0, 0);
+               uth->uu_workq_flags &= ~UT_WORKQ_CPUPERCENT;
+       }
+
+       if (trp.trp_flags & TRP_CPUPERCENT) {
+               thread_set_cpulimit(THREAD_CPULIMIT_BLOCK, trp.trp_cpupercent,
+                               (uint64_t)trp.trp_refillms * NSEC_PER_SEC);
+               uth->uu_workq_flags |= UT_WORKQ_CPUPERCENT;
+       }
+}
+
+static void
+workq_thread_reset_pri(struct workqueue *wq, struct uthread *uth,
+               workq_threadreq_t req)
+{
+       thread_t th = uth->uu_thread;
+       thread_qos_t qos = req ? req->tr_qos : WORKQ_THREAD_QOS_CLEANUP;
+       workq_threadreq_param_t trp = { };
+       int priority = 31;
+       int policy = POLICY_TIMESHARE;
+
+       if (req && (req->tr_flags & TR_FLAG_WL_PARAMS)) {
+               trp = kqueue_threadreq_workloop_param(req);
+       }
+
+       uth->uu_workq_pri = WORKQ_POLICY_INIT(qos);
+       uth->uu_workq_flags &= ~UT_WORKQ_OUTSIDE_QOS;
+       uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
+
+       // qos sent out to userspace (may differ from uu_workq_pri on param threads)
+       uth->uu_save.uus_workq_park_data.qos = qos;
+
+       if (qos == WORKQ_THREAD_QOS_MANAGER) {
+               uint32_t mgr_pri = wq->wq_event_manager_priority;
+               assert(trp.trp_value == 0); // manager qos and thread policy don't mix
+
+               if (mgr_pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
+                       mgr_pri &= _PTHREAD_PRIORITY_SCHED_PRI_MASK;
+                       thread_set_workq_pri(th, THREAD_QOS_UNSPECIFIED, mgr_pri,
+                                       POLICY_TIMESHARE);
+                       return;
+               }
+
+               qos = _pthread_priority_thread_qos(mgr_pri);
+       } else {
+               if (trp.trp_flags & TRP_PRIORITY) {
+                       qos = THREAD_QOS_UNSPECIFIED;
+                       priority = trp.trp_pri;
+                       uth->uu_workq_flags |= UT_WORKQ_OUTSIDE_QOS;
+               }
+
+               if (trp.trp_flags & TRP_POLICY) {
+                       policy = trp.trp_pol;
+               }
+       }
+
+       thread_set_workq_pri(th, qos, priority, policy);
+}
+
+/*
+ * Called by kevent with the NOTE_WL_THREAD_REQUEST knote lock held,
+ * every time a servicer is being told about a new max QoS.
+ */
+void
+workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr)
+{
+       struct uu_workq_policy old_pri, new_pri;
+       struct uthread *uth = get_bsdthread_info(kqr->kqr_thread);
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+       thread_qos_t qos = kqr->kqr_qos_index;
+
+       if (uth->uu_workq_pri.qos_max == qos)
+               return;
+
+       workq_lock_spin(wq);
+       old_pri = new_pri = uth->uu_workq_pri;
+       new_pri.qos_max = qos;
+       workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
+       workq_unlock(wq);
+}
+
+#pragma mark idle threads accounting and handling
+
+static inline struct uthread *
+workq_oldest_killable_idle_thread(struct workqueue *wq)
+{
+       struct uthread *uth = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
+
+       if (uth && !uth->uu_save.uus_workq_park_data.has_stack) {
+               uth = TAILQ_PREV(uth, workq_uthread_head, uu_workq_entry);
+               if (uth) {
+                       assert(uth->uu_save.uus_workq_park_data.has_stack);
+               }
+       }
+       return uth;
+}
+
+static inline uint64_t
+workq_kill_delay_for_idle_thread(struct workqueue *wq)
+{
+       uint64_t delay = wq_reduce_pool_window.abstime;
+       uint16_t idle = wq->wq_thidlecount;
+
+       /*
+        * If we have less than wq_death_max_load threads, have a 5s timer.
+        *
+        * For the next wq_max_constrained_threads ones, decay linearly from
+        * 5s down to 50ms.
+        */
+       if (idle <= wq_death_max_load) {
+               return delay;
+       }
+
+       if (wq_max_constrained_threads > idle - wq_death_max_load) {
+               delay *= (wq_max_constrained_threads - (idle - wq_death_max_load));
+       }
+       return delay / wq_max_constrained_threads;
+}
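
Worked numbers for the decay above, assuming the default 5s wq_reduce_pool_window
and, for illustration, wq_max_constrained_threads == 64:

    // idle <= wq_death_max_load              -> delay == 5s
    // idle == wq_death_max_load + 16         -> 5s * (64 - 16) / 64 == 3.75s
    // idle == wq_death_max_load + 48         -> 5s * (64 - 48) / 64 == 1.25s
    // idle >= wq_death_max_load + 64         -> 5s / 64             ~= 78ms
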
+
+static inline bool
+workq_should_kill_idle_thread(struct workqueue *wq, struct uthread *uth,
+               uint64_t now)
+{
+       uint64_t delay = workq_kill_delay_for_idle_thread(wq);
+       return now - uth->uu_save.uus_workq_park_data.idle_stamp > delay;
+}
+
+static void
+workq_death_call_schedule(struct workqueue *wq, uint64_t deadline)
+{
+       uint32_t wq_flags = os_atomic_load(&wq->wq_flags, relaxed);
+
+       if (wq_flags & (WQ_EXITING | WQ_DEATH_CALL_SCHEDULED)) {
+               return;
+       }
+       os_atomic_or(&wq->wq_flags, WQ_DEATH_CALL_SCHEDULED, relaxed);
+
+       WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
+
+       /*
+        * <rdar://problem/13139182> Due to how long term timers work, the leeway
+        * can't be too short, so use 500ms which is long enough that we will not
+        * wake up the CPU for killing threads, but short enough that it doesn't
+        * fall into long-term timer list shenanigans.
+        */
+       thread_call_enter_delayed_with_leeway(wq->wq_death_call, NULL, deadline,
+                       wq_reduce_pool_window.abstime / 10,
+                       THREAD_CALL_DELAY_LEEWAY | THREAD_CALL_DELAY_USER_BACKGROUND);
+}
+
+/*
+ * `decrement` is set to the number of threads that are no longer dying:
+ * - because they have been resuscitated just in time (workq_pop_idle_thread)
+ * - or have been killed (workq_thread_terminate).
+ */
+static void
+workq_death_policy_evaluate(struct workqueue *wq, uint16_t decrement)
+{
+       struct uthread *uth;
+
+       assert(wq->wq_thdying_count >= decrement);
+       if ((wq->wq_thdying_count -= decrement) > 0)
+               return;
+
+       if (wq->wq_thidlecount <= 1)
+               return;
+
+       if ((uth = workq_oldest_killable_idle_thread(wq)) == NULL)
+               return;
+
+       uint64_t now = mach_absolute_time();
+       uint64_t delay = workq_kill_delay_for_idle_thread(wq);
+
+       if (now - uth->uu_save.uus_workq_park_data.idle_stamp > delay) {
+               WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
+                               wq, wq->wq_thidlecount, 0, 0, 0);
+               wq->wq_thdying_count++;
+               uth->uu_workq_flags |= UT_WORKQ_DYING;
+               workq_thread_wakeup(uth);
+               return;
+       }
+
+       workq_death_call_schedule(wq,
+                       uth->uu_save.uus_workq_park_data.idle_stamp + delay);
+}
+
+void
+workq_thread_terminate(struct proc *p, struct uthread *uth)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+
+       workq_lock_spin(wq);
+       TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
+       if (uth->uu_workq_flags & UT_WORKQ_DYING) {
+               WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_END,
+                               wq, wq->wq_thidlecount, 0, 0, 0);
+               workq_death_policy_evaluate(wq, 1);
+       }
+       if (wq->wq_nthreads-- == wq_max_threads) {
+               /*
+                * We got under the thread limit again, which may have prevented
+                * thread creation from happening; redrive if there are pending requests.
+                */
+               if (wq->wq_reqcount) {
+                       workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
+               }
+       }
+       workq_unlock(wq);
+
+       thread_deallocate(uth->uu_thread);
+}
+
+static void
+workq_kill_old_threads_call(void *param0, void *param1 __unused)
+{
+       struct workqueue *wq = param0;
+
+       workq_lock_spin(wq);
+       WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_START, wq, 0, 0, 0, 0);
+       os_atomic_and(&wq->wq_flags, ~WQ_DEATH_CALL_SCHEDULED, relaxed);
+       workq_death_policy_evaluate(wq, 0);
+       WQ_TRACE_WQ(TRACE_wq_death_call | DBG_FUNC_END, wq, 0, 0, 0, 0);
+       workq_unlock(wq);
+}
+
+static struct uthread *
+workq_pop_idle_thread(struct workqueue *wq)
+{
+       struct uthread *uth;
+
+       if ((uth = TAILQ_FIRST(&wq->wq_thidlelist))) {
+               TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
+       } else {
+               uth = TAILQ_FIRST(&wq->wq_thnewlist);
+               TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
+       }
+       TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
+
+       assert((uth->uu_workq_flags & UT_WORKQ_RUNNING) == 0);
+       uth->uu_workq_flags |= UT_WORKQ_RUNNING | UT_WORKQ_OVERCOMMIT;
+       wq->wq_threads_scheduled++;
+       wq->wq_thidlecount--;
+
+       if (__improbable(uth->uu_workq_flags & UT_WORKQ_DYING)) {
+               uth->uu_workq_flags ^= UT_WORKQ_DYING;
+               workq_death_policy_evaluate(wq, 1);
+       }
+       return uth;
+}
+
+/*
+ * Called by thread_create_workq_waiting() during thread initialization, before
+ * assert_wait, before the thread has been started.
+ */
+event_t
+workq_thread_init_and_wq_lock(task_t task, thread_t th)
+{
+       struct uthread *uth = get_bsdthread_info(th);
+
+       uth->uu_workq_flags = UT_WORKQ_NEW;
+       uth->uu_workq_pri = WORKQ_POLICY_INIT(THREAD_QOS_LEGACY);
+       uth->uu_workq_thport = MACH_PORT_NULL;
+       uth->uu_workq_stackaddr = 0;
+
+       thread_set_tag(th, THREAD_TAG_PTHREAD | THREAD_TAG_WORKQUEUE);
+       thread_reset_workq_qos(th, THREAD_QOS_LEGACY);
+
+       workq_lock_spin(proc_get_wqptr_fast(get_bsdtask_info(task)));
+       return workq_parked_wait_event(uth);
+}
+
+/**
+ * Try to add a new workqueue thread.
+ *
+ * - called with workq lock held
+ * - dropped and retaken around thread creation
+ * - return with workq lock held
+ */
+static bool
+workq_add_new_idle_thread(proc_t p, struct workqueue *wq)
+{
+       mach_vm_offset_t th_stackaddr;
+       kern_return_t kret;
+       thread_t th;
+
+       wq->wq_nthreads++;
+
+       workq_unlock(wq);
+
+       vm_map_t vmap = get_task_map(p->task);
+
+       kret = pthread_functions->workq_create_threadstack(p, vmap, &th_stackaddr);
+       if (kret != KERN_SUCCESS) {
+               WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
+                               kret, 1, 0, 0);
+               goto out;
+       }
+
+       kret = thread_create_workq_waiting(p->task, workq_unpark_continue, &th);
+       if (kret != KERN_SUCCESS) {
+               WQ_TRACE_WQ(TRACE_wq_thread_create_failed | DBG_FUNC_NONE, wq,
+                               kret, 0, 0, 0);
+               pthread_functions->workq_destroy_threadstack(p, vmap, th_stackaddr);
+               goto out;
+       }
+
+       // thread_create_workq_waiting() will return with the wq lock held
+       // on success, because it calls workq_thread_init_and_wq_lock() above
+
+       struct uthread *uth = get_bsdthread_info(th);
+
+       wq->wq_creations++;
+       wq->wq_thidlecount++;
+       uth->uu_workq_stackaddr = th_stackaddr;
+       TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
+
+       WQ_TRACE_WQ(TRACE_wq_thread_create | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
+       return true;
+
+out:
+       workq_lock_spin(wq);
+       /*
+        * Do not redrive here if we went under wq_max_threads again;
+        * it is the responsibility of the callers of this function
+        * to do so when it fails.
+        */
+       wq->wq_nthreads--;
+       return false;
+}
+
+#define WORKQ_UNPARK_FOR_DEATH_WAS_IDLE 0x1
+
+__attribute__((noreturn, noinline))
+static void
+workq_unpark_for_death_and_unlock(proc_t p, struct workqueue *wq,
+               struct uthread *uth, uint32_t death_flags)
+{
+       thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
+       bool first_use = uth->uu_workq_flags & UT_WORKQ_NEW;
+
+       if (qos > WORKQ_THREAD_QOS_CLEANUP) {
+               workq_thread_reset_pri(wq, uth, NULL);
+               qos = WORKQ_THREAD_QOS_CLEANUP;
+       }
+
+       workq_thread_reset_cpupercent(NULL, uth);
+
+       if (death_flags & WORKQ_UNPARK_FOR_DEATH_WAS_IDLE) {
+               wq->wq_thidlecount--;
+               if (first_use) {
+                       TAILQ_REMOVE(&wq->wq_thnewlist, uth, uu_workq_entry);
+               } else {
+                       TAILQ_REMOVE(&wq->wq_thidlelist, uth, uu_workq_entry);
+               }
+       }
+       TAILQ_INSERT_TAIL(&wq->wq_thrunlist, uth, uu_workq_entry);
+
+       workq_unlock(wq);
+
+       uint32_t flags = WQ_FLAG_THREAD_NEWSPI | qos | WQ_FLAG_THREAD_PRIO_QOS;
+       uint32_t setup_flags = WQ_SETUP_EXIT_THREAD;
+       thread_t th = uth->uu_thread;
+       vm_map_t vmap = get_task_map(p->task);
+
+       if (!first_use) flags |= WQ_FLAG_THREAD_REUSE;
+
+       pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
+                       uth->uu_workq_thport, 0, setup_flags, flags);
+       __builtin_unreachable();
+}
+
+bool
+workq_is_current_thread_updating_turnstile(struct workqueue *wq)
+{
+       return wq->wq_turnstile_updater == current_thread();
+}
+
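+/*
+ * Run a turnstile operation with the workq lock held, recording the calling
+ * thread in wq_turnstile_updater so that
+ * workq_is_current_thread_updating_turnstile() can detect it.
+ */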
+__attribute__((always_inline))
+static inline void
+workq_perform_turnstile_operation_locked(struct workqueue *wq,
+               void (^operation)(void))
+{
+       workq_lock_held(wq);
+       wq->wq_turnstile_updater = current_thread();
+       operation();
+       wq->wq_turnstile_updater = THREAD_NULL;
+}
+
+static void
+workq_turnstile_update_inheritor(struct workqueue *wq,
+               turnstile_inheritor_t inheritor,
+               turnstile_update_flags_t flags)
+{
+       workq_perform_turnstile_operation_locked(wq, ^{
+               turnstile_update_inheritor(wq->wq_turnstile, inheritor,
+                               flags | TURNSTILE_IMMEDIATE_UPDATE);
+               turnstile_update_inheritor_complete(wq->wq_turnstile,
+                               TURNSTILE_INTERLOCK_HELD);
+       });
+}
+
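+/*
+ * Park a thread that stopped running work: take it off the run list and either
+ * kill it right away (when there are too many idle threads, or another idle
+ * thread is already overdue for death), or put it back on the idle/new list and
+ * arm the death call if it is the first thread on the idle list.
+ */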
+static void
+workq_push_idle_thread(proc_t p, struct workqueue *wq, struct uthread *uth)
+{
+       uint64_t now = mach_absolute_time();
+
+       uth->uu_workq_flags &= ~UT_WORKQ_RUNNING;
+       if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
+               wq->wq_constrained_threads_scheduled--;
+       }
+       TAILQ_REMOVE(&wq->wq_thrunlist, uth, uu_workq_entry);
+       wq->wq_threads_scheduled--;
+
+       if (wq->wq_creator == uth) {
+               WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 3, 0,
+                               uth->uu_save.uus_workq_park_data.yields, 0);
+               wq->wq_creator = NULL;
+               if (wq->wq_reqcount) {
+                       workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
+               } else {
+                       workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
+               }
+               if (uth->uu_workq_flags & UT_WORKQ_NEW) {
+                       TAILQ_INSERT_TAIL(&wq->wq_thnewlist, uth, uu_workq_entry);
+                       wq->wq_thidlecount++;
+                       return;
+               }
+       } else {
+               _wq_thactive_dec(wq, uth->uu_workq_pri.qos_bucket);
+               wq->wq_thscheduled_count[_wq_bucket(uth->uu_workq_pri.qos_bucket)]--;
+               assert(!(uth->uu_workq_flags & UT_WORKQ_NEW));
+               uth->uu_workq_flags |= UT_WORKQ_IDLE_CLEANUP;
+       }
+
+       uth->uu_save.uus_workq_park_data.idle_stamp = now;
+
+       struct uthread *oldest = workq_oldest_killable_idle_thread(wq);
+       uint16_t cur_idle = wq->wq_thidlecount;
+
+       if (cur_idle >= wq_max_constrained_threads ||
+                       (wq->wq_thdying_count == 0 && oldest &&
+                       workq_should_kill_idle_thread(wq, oldest, now))) {
+               /*
+                * Immediately kill threads if we have too many of them.
+                *
+                * And swap "place" with the oldest one we'd have woken up.
+                * This is a relatively desperate situation where we really
+                * need to kill threads quickly and it's better to kill
+                * the one that's currently on core than to context switch.
+                */
+               if (oldest) {
+                       oldest->uu_save.uus_workq_park_data.idle_stamp = now;
+                       TAILQ_REMOVE(&wq->wq_thidlelist, oldest, uu_workq_entry);
+                       TAILQ_INSERT_HEAD(&wq->wq_thidlelist, oldest, uu_workq_entry);
+               }
+
+               WQ_TRACE_WQ(TRACE_wq_thread_terminate | DBG_FUNC_START,
+                               wq, cur_idle, 0, 0, 0);
+               wq->wq_thdying_count++;
+               uth->uu_workq_flags |= UT_WORKQ_DYING;
+               uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
+               workq_unpark_for_death_and_unlock(p, wq, uth, 0);
+               __builtin_unreachable();
+       }
+
+       struct uthread *tail = TAILQ_LAST(&wq->wq_thidlelist, workq_uthread_head);
+
+       cur_idle += 1;
+       wq->wq_thidlecount = cur_idle;
+
+       if (cur_idle >= wq_death_max_load && tail &&
+                       tail->uu_save.uus_workq_park_data.has_stack) {
+               uth->uu_save.uus_workq_park_data.has_stack = false;
+               TAILQ_INSERT_TAIL(&wq->wq_thidlelist, uth, uu_workq_entry);
+       } else {
+               uth->uu_save.uus_workq_park_data.has_stack = true;
+               TAILQ_INSERT_HEAD(&wq->wq_thidlelist, uth, uu_workq_entry);
+       }
+
+       if (!tail) {
+               uint64_t delay = workq_kill_delay_for_idle_thread(wq);
+               workq_death_call_schedule(wq, now + delay);
+       }
+}
+
+#pragma mark thread requests
+
+static inline int
+workq_priority_for_req(workq_threadreq_t req)
+{
+       thread_qos_t qos = req->tr_qos;
+
+       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+               workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
+               assert(trp.trp_flags & TRP_PRIORITY);
+               return trp.trp_pri;
+       }
+       return thread_workq_pri_for_qos(qos);
+}
+
+static inline struct priority_queue *
+workq_priority_queue_for_req(struct workqueue *wq, workq_threadreq_t req)
+{
+       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+               return &wq->wq_special_queue;
+       } else if (req->tr_flags & TR_FLAG_OVERCOMMIT) {
+               return &wq->wq_overcommit_queue;
+       } else {
+               return &wq->wq_constrained_queue;
+       }
+}
+
+/*
+ * returns true if the enqueued request is the highest priority item
+ * in its priority queue.
+ */
+static bool
+workq_threadreq_enqueue(struct workqueue *wq, workq_threadreq_t req)
+{
+       assert(req->tr_state == TR_STATE_NEW);
+
+       req->tr_state = TR_STATE_QUEUED;
+       wq->wq_reqcount += req->tr_count;
+
+       if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
+               assert(wq->wq_event_manager_threadreq == NULL);
+               assert(req->tr_flags & TR_FLAG_KEVENT);
+               assert(req->tr_count == 1);
+               wq->wq_event_manager_threadreq = req;
+               return true;
+       }
+       if (priority_queue_insert(workq_priority_queue_for_req(wq, req),
+                       &req->tr_entry, workq_priority_for_req(req),
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                       _wq_thactive_refresh_best_constrained_req_qos(wq);
+               }
+               return true;
+       }
+       return false;
+}
+
+/*
+ * returns true if the dequeued request was the highest priority item
+ * in its priority queue.
+ */
+static bool
+workq_threadreq_dequeue(struct workqueue *wq, workq_threadreq_t req)
+{
+       wq->wq_reqcount--;
+
+       if (--req->tr_count == 0) {
+               if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
+                       assert(wq->wq_event_manager_threadreq == req);
+                       assert(req->tr_count == 0);
+                       wq->wq_event_manager_threadreq = NULL;
+                       return true;
+               }
+               if (priority_queue_remove(workq_priority_queue_for_req(wq, req),
+                               &req->tr_entry, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                               _wq_thactive_refresh_best_constrained_req_qos(wq);
+                       }
+                       return true;
+               }
+       }
+       return false;
+}
+
+static void
+workq_threadreq_destroy(proc_t p, workq_threadreq_t req)
+{
+       req->tr_state = TR_STATE_IDLE;
+       if (req->tr_flags & (TR_FLAG_WORKLOOP | TR_FLAG_KEVENT)) {
+               kqueue_threadreq_cancel(p, req);
+       } else {
+               zfree(workq_zone_threadreq, req);
+       }
+}
+
+/*
+ * Mark a thread request as complete.  At this point, it is treated as owned by
+ * the submitting subsystem and you should assume it could be freed.
+ *
+ * Called with the workqueue lock held.
+ */
+static void
+workq_threadreq_bind_and_unlock(proc_t p, struct workqueue *wq,
+               workq_threadreq_t req, struct uthread *uth)
+{
+       uint8_t tr_flags = req->tr_flags;
+       bool needs_commit = false;
+       int creator_flags = 0;
+
+       wq->wq_fulfilled++;
+
+       if (req->tr_state == TR_STATE_QUEUED) {
+               workq_threadreq_dequeue(wq, req);
+               creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS;
+       }
+
+       if (wq->wq_creator == uth) {
+               WQ_TRACE_WQ(TRACE_wq_creator_select, wq, 4, 0,
+                               uth->uu_save.uus_workq_park_data.yields, 0);
+               creator_flags = WORKQ_THREADREQ_CAN_CREATE_THREADS |
+                               WORKQ_THREADREQ_CREATOR_TRANSFER;
+               wq->wq_creator = NULL;
+               _wq_thactive_inc(wq, req->tr_qos);
+               wq->wq_thscheduled_count[_wq_bucket(req->tr_qos)]++;
+       } else if (uth->uu_workq_pri.qos_bucket != req->tr_qos) {
+               _wq_thactive_move(wq, uth->uu_workq_pri.qos_bucket, req->tr_qos);
+       }
+       workq_thread_reset_pri(wq, uth, req);
+
+       if (tr_flags & TR_FLAG_OVERCOMMIT) {
+               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
+                       uth->uu_workq_flags |= UT_WORKQ_OVERCOMMIT;
+                       wq->wq_constrained_threads_scheduled--;
+               }
+       } else {
+               if ((uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) != 0) {
+                       uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
+                       wq->wq_constrained_threads_scheduled++;
+               }
+       }
+
+       if (tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP)) {
+               if (req->tr_state == TR_STATE_NEW) {
+                       /*
+                        * We're called from workq_kern_threadreq_initiate()
+                        * due to an unbind, with the kq req held.
+                        */
+                       assert(!creator_flags);
+                       req->tr_state = TR_STATE_IDLE;
+                       kqueue_threadreq_bind(p, req, uth->uu_thread, 0);
+               } else {
+                       assert(req->tr_count == 0);
+                       workq_perform_turnstile_operation_locked(wq, ^{
+                               kqueue_threadreq_bind_prepost(p, req, uth->uu_thread);
+                       });
+                       needs_commit = true;
+               }
+               req = NULL;
+       } else if (req->tr_count > 0) {
+               req = NULL;
+       }
+
+       if (creator_flags) {
+               /* This can drop the workqueue lock, and take it again */
+               workq_schedule_creator(p, wq, creator_flags);
+       }
+
+       workq_unlock(wq);
+
+       if (req) {
+               zfree(workq_zone_threadreq, req);
+       }
+       if (needs_commit) {
+               kqueue_threadreq_bind_commit(p, uth->uu_thread);
+       }
+
+       /*
+        * Run Thread, Run!
+        */
+       uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI;
+       if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+               upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
+       } else if (tr_flags & TR_FLAG_OVERCOMMIT) {
+               upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
+       }
+       if (tr_flags & TR_FLAG_KEVENT) {
+               upcall_flags |= WQ_FLAG_THREAD_KEVENT;
+       }
+       if (tr_flags & TR_FLAG_WORKLOOP) {
+               upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
+       }
+       uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
+}
+
+#pragma mark workqueue thread creation thread calls
+
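+/*
+ * Atomically transition wq_flags before arming a thread call: fail if the
+ * workqueue is exiting or the call is already scheduled/pended, and record the
+ * call as pending instead of scheduled while the process is suspended.
+ * Returns true only when the call was actually marked scheduled.
+ */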
+static inline bool
+workq_thread_call_prepost(struct workqueue *wq, uint32_t sched, uint32_t pend,
+               uint32_t fail_mask)
+{
+       uint32_t old_flags, new_flags;
+
+       os_atomic_rmw_loop(&wq->wq_flags, old_flags, new_flags, acquire, {
+               if (__improbable(old_flags & (WQ_EXITING | sched | pend | fail_mask))) {
+                       os_atomic_rmw_loop_give_up(return false);
+               }
+               if (__improbable(old_flags & WQ_PROC_SUSPENDED)) {
+                       new_flags = old_flags | pend;
+               } else {
+                       new_flags = old_flags | sched;
+               }
+       });
+
+       return (old_flags & WQ_PROC_SUSPENDED) == 0;
+}
+
+#define WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART 0x1
+
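+/*
+ * Arm wq_delayed_call to create threads later.  The delay adapts: it doubles
+ * (up to wq_max_timer_interval) when the previous call ran within the current
+ * interval, and halves (down to wq_stalled_window) when the previous call ran
+ * more than twice the interval ago, unless the RESTART flag asks to keep it.
+ */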
+static bool
+workq_schedule_delayed_thread_creation(struct workqueue *wq, int flags)
+{
+       assert(!preemption_enabled());
+
+       if (!workq_thread_call_prepost(wq, WQ_DELAYED_CALL_SCHEDULED,
+                       WQ_DELAYED_CALL_PENDED, WQ_IMMEDIATE_CALL_PENDED |
+                       WQ_IMMEDIATE_CALL_SCHEDULED)) {
+               return false;
+       }
+
+       uint64_t now = mach_absolute_time();
+
+       if (flags & WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART) {
+               /* do not change the window */
+       } else if (now - wq->wq_thread_call_last_run <= wq->wq_timer_interval) {
+               wq->wq_timer_interval *= 2;
+               if (wq->wq_timer_interval > wq_max_timer_interval.abstime) {
+                       wq->wq_timer_interval = wq_max_timer_interval.abstime;
+               }
+       } else if (now - wq->wq_thread_call_last_run > 2 * wq->wq_timer_interval) {
+               wq->wq_timer_interval /= 2;
+               if (wq->wq_timer_interval < wq_stalled_window.abstime) {
+                       wq->wq_timer_interval = wq_stalled_window.abstime;
+               }
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
+                       _wq_flags(wq), wq->wq_timer_interval, 0);
+
+       thread_call_t call = wq->wq_delayed_call;
+       uintptr_t arg = WQ_DELAYED_CALL_SCHEDULED;
+       uint64_t deadline = now + wq->wq_timer_interval;
+       if (thread_call_enter1_delayed(call, (void *)arg, deadline)) {
+               panic("delayed_call was already enqueued");
+       }
+       return true;
+}
+
+static void
+workq_schedule_immediate_thread_creation(struct workqueue *wq)
+{
+       assert(!preemption_enabled());
+
+       if (workq_thread_call_prepost(wq, WQ_IMMEDIATE_CALL_SCHEDULED,
+                       WQ_IMMEDIATE_CALL_PENDED, 0)) {
+               WQ_TRACE_WQ(TRACE_wq_start_add_timer, wq, wq->wq_reqcount,
+                               _wq_flags(wq), 0, 0);
+
+               uintptr_t arg = WQ_IMMEDIATE_CALL_SCHEDULED;
+               if (thread_call_enter1(wq->wq_immediate_call, (void *)arg)) {
+                       panic("immediate_call was already enqueued");
+               }
+       }
+}
+
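+/*
+ * While a process is suspended, thread-call scheduling is deferred: the calls
+ * are recorded as pending and re-issued from workq_proc_resumed().
+ */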
+void
+workq_proc_suspended(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+
+       if (wq) os_atomic_or(&wq->wq_flags, WQ_PROC_SUSPENDED, relaxed);
+}
+
+void
+workq_proc_resumed(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+       uint32_t wq_flags;
+
+       if (!wq) return;
+
+       wq_flags = os_atomic_and_orig(&wq->wq_flags, ~(WQ_PROC_SUSPENDED |
+                       WQ_DELAYED_CALL_PENDED | WQ_IMMEDIATE_CALL_PENDED), relaxed);
+       if ((wq_flags & WQ_EXITING) == 0) {
+               disable_preemption();
+               if (wq_flags & WQ_IMMEDIATE_CALL_PENDED) {
+                       workq_schedule_immediate_thread_creation(wq);
+               } else if (wq_flags & WQ_DELAYED_CALL_PENDED) {
+                       workq_schedule_delayed_thread_creation(wq,
+                                       WORKQ_SCHEDULE_DELAYED_THREAD_CREATION_RESTART);
+               }
+               enable_preemption();
+       }
+}
+
+/**
+ * returns whether lastblocked_tsp is within wq_stalled_window usecs of now
+ */
+static bool
+workq_thread_is_busy(uint64_t now, _Atomic uint64_t *lastblocked_tsp)
+{
+       uint64_t lastblocked_ts = os_atomic_load(lastblocked_tsp, relaxed);
+       if (now <= lastblocked_ts) {
+               /*
+                * Because the update of the timestamp when a thread blocks
+                * isn't serialized against us looking at it (i.e. we don't hold
+                * the workq lock), it's possible to have a timestamp that matches
+                * the current time or that even looks to be in the future relative
+                * to when we grabbed the current time...
+                *
+                * Just treat this as a busy thread since it must have just blocked.
+                */
+               return true;
+       }
+       return (now - lastblocked_ts) < wq_stalled_window.abstime;
+}
+
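+/*
+ * Shared handler for wq_delayed_call and wq_immediate_call: records the run
+ * time, clears the corresponding scheduled flag, and redrives the creator.
+ */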
+static void
+workq_add_new_threads_call(void *_p, void *flags)
+{
+       proc_t p = _p;
+       struct workqueue *wq = proc_get_wqptr(p);
+       uint32_t my_flag = (uint32_t)(uintptr_t)flags;
+
+       /*
+        * workq_exit() will set the workqueue to NULL before
+        * it cancels thread calls.
+        */
+       if (!wq) return;
+
+       assert((my_flag == WQ_DELAYED_CALL_SCHEDULED) ||
+                       (my_flag == WQ_IMMEDIATE_CALL_SCHEDULED));
+
+       WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_START, wq, _wq_flags(wq),
+                       wq->wq_nthreads, wq->wq_thidlecount, 0);
+
+       workq_lock_spin(wq);
+
+       wq->wq_thread_call_last_run = mach_absolute_time();
+       os_atomic_and(&wq->wq_flags, ~my_flag, release);
+
+       /* This can drop the workqueue lock, and take it again */
+       workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
+
+       workq_unlock(wq);
+
+       WQ_TRACE_WQ(TRACE_wq_add_timer | DBG_FUNC_END, wq, 0,
+                       wq->wq_nthreads, wq->wq_thidlecount, 0);
+}
+
+#pragma mark thread state tracking
+
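+/*
+ * Scheduler callback invoked when a workqueue thread blocks or unblocks;
+ * updates the active-thread counts and may redrive thread creation when a
+ * blocking thread could have been holding back a constrained request.
+ */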
+static void
+workq_sched_callback(int type, thread_t thread)
+{
+       struct uthread *uth = get_bsdthread_info(thread);
+       proc_t proc = get_bsdtask_info(get_threadtask(thread));
+       struct workqueue *wq = proc_get_wqptr(proc);
+       thread_qos_t req_qos, qos = uth->uu_workq_pri.qos_bucket;
+       wq_thactive_t old_thactive;
+       bool start_timer = false;
+
+       if (qos == WORKQ_THREAD_QOS_MANAGER) {
+               return;
+       }
+
+       switch (type) {
+       case SCHED_CALL_BLOCK:
+               old_thactive = _wq_thactive_dec(wq, qos);
+               req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
+
+               /*
+                * Remember the timestamp of the last thread that blocked in this
+                * bucket; it is used by admission checks to ignore one thread
+                * being inactive if this timestamp is recent enough.
+                *
+                * If we collide with another thread trying to update the
+                * last_blocked (really unlikely since another thread would have to
+                * get scheduled and then block after we start down this path), it's
+                * not a problem.  Either timestamp is adequate, so there is no need to retry.
+                */
+               os_atomic_store(&wq->wq_lastblocked_ts[_wq_bucket(qos)],
+                               thread_last_run_time(thread), relaxed);
+
+               if (req_qos == THREAD_QOS_UNSPECIFIED) {
+                       /*
+                        * No pending request at the moment we could unblock, move on.
+                        */
+               } else if (qos < req_qos) {
+                       /*
+                        * The blocking thread is at a lower QoS than the highest currently
+                        * pending constrained request, nothing has to be redriven
+                        */
+               } else {
+                       uint32_t max_busycount, old_req_count;
+                       old_req_count = _wq_thactive_aggregate_downto_qos(wq, old_thactive,
+                                       req_qos, NULL, &max_busycount);
+                       /*
+                        * If it is possible that may_start_constrained_thread had refused
+                        * admission due to being over the max concurrency, we may need to
+                        * spin up a new thread.
+                        *
+                        * We take into account the maximum number of busy threads
+                        * that can affect may_start_constrained_thread, since looking at the
+                        * actual number may_start_constrained_thread will see is racy.
+                        *
+                        * IOW at NCPU = 4, for IN (req_qos = 1), if the old req count is
+                        * between NCPU (4) and NCPU - 2 (2) we need to redrive.
+                        */
+                       uint32_t conc = wq_max_parallelism[_wq_bucket(qos)];
+                       if (old_req_count <= conc && conc <= old_req_count + max_busycount) {
+                               start_timer = workq_schedule_delayed_thread_creation(wq, 0);
+                       }
+               }
+               if (__improbable(kdebug_enable)) {
+                       __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
+                                       old_thactive, qos, NULL, NULL);
+                       WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_START, wq,
+                                       old - 1, qos | (req_qos << 8),
+                                       wq->wq_reqcount << 1 | start_timer, 0);
+               }
+               break;
+
+       case SCHED_CALL_UNBLOCK:
+               /*
+                * we cannot take the workqueue_lock here...
+                * an UNBLOCK can occur from a timer event which
+                * is run from an interrupt context... if the workqueue_lock
+                * is already held by this processor, we'll deadlock...
+                * the thread lock for the thread being UNBLOCKED
+                * is also held
+                */
+               old_thactive = _wq_thactive_inc(wq, qos);
+               if (__improbable(kdebug_enable)) {
+                       __unused uint32_t old = _wq_thactive_aggregate_downto_qos(wq,
+                                       old_thactive, qos, NULL, NULL);
+                       req_qos = WQ_THACTIVE_BEST_CONSTRAINED_REQ_QOS(old_thactive);
+                       WQ_TRACE_WQ(TRACE_wq_thread_block | DBG_FUNC_END, wq,
+                                       old + 1, qos | (req_qos << 8),
+                                       wq->wq_threads_scheduled, 0);
+               }
+               break;
+       }
+}
+
+#pragma mark workq lifecycle
+
+void
+workq_reference(struct workqueue *wq)
+{
+       os_ref_retain(&wq->wq_refcnt);
+}
+
+void
+workq_destroy(struct workqueue *wq)
+{
+       struct turnstile *ts;
+
+       turnstile_complete((uintptr_t)wq, &wq->wq_turnstile, &ts);
+       assert(ts);
+       turnstile_cleanup();
+       turnstile_deallocate(ts);
+
+       lck_spin_destroy(&wq->wq_lock, workq_lck_grp);
+       zfree(workq_zone_workqueue, wq);
+}
+
+static void
+workq_deallocate(struct workqueue *wq)
+{
+       if (os_ref_release_relaxed(&wq->wq_refcnt) == 0) {
+               workq_destroy(wq);
+       }
+}
+
+void
+workq_deallocate_safe(struct workqueue *wq)
+{
+       if (__improbable(os_ref_release_relaxed(&wq->wq_refcnt) == 0)) {
+               workq_deallocate_enqueue(wq);
+       }
+}
+
+/**
+ * Setup per-process state for the workqueue.
+ */
+int
+workq_open(struct proc *p, __unused struct workq_open_args *uap,
+               __unused int32_t *retval)
+{
+       struct workqueue *wq;
+       int error = 0;
+
+       if ((p->p_lflag & P_LREGISTER) == 0) {
+               return EINVAL;
+       }
+
+       if (wq_init_constrained_limit) {
+               uint32_t limit, num_cpus = ml_get_max_cpus();
+
+               /*
+                * set up the limit for the constrained pool;
+                * this is a virtual pool in that we don't
+                * maintain it on a separate idle and run list.
+                */
+               limit = num_cpus * WORKQUEUE_CONSTRAINED_FACTOR;
+
+               if (limit > wq_max_constrained_threads)
+                       wq_max_constrained_threads = limit;
+
+               if (wq_max_threads > WQ_THACTIVE_BUCKET_HALF) {
+                       wq_max_threads = WQ_THACTIVE_BUCKET_HALF;
+               }
+               if (wq_max_threads > CONFIG_THREAD_MAX - 20) {
+                       wq_max_threads = CONFIG_THREAD_MAX - 20;
+               }
+
+               wq_death_max_load = (uint16_t)fls(num_cpus) + 1;
+
+               for (thread_qos_t qos = WORKQ_THREAD_QOS_MIN; qos <= WORKQ_THREAD_QOS_MAX; qos++) {
+                       wq_max_parallelism[_wq_bucket(qos)] =
+                                       qos_max_parallelism(qos, QOS_PARALLELISM_COUNT_LOGICAL);
+               }
+
+               wq_init_constrained_limit = 0;
+       }
+
+       if (proc_get_wqptr(p) == NULL) {
+               if (proc_init_wqptr_or_wait(p) == FALSE) {
+                       assert(proc_get_wqptr(p) != NULL);
+                       goto out;
+               }
+
+               wq = (struct workqueue *)zalloc(workq_zone_workqueue);
+               bzero(wq, sizeof(struct workqueue));
+
+               os_ref_init_count(&wq->wq_refcnt, &workq_refgrp, 1);
+
+               // Start the event manager at the priority hinted at by the policy engine
+               thread_qos_t mgr_priority_hint = task_get_default_manager_qos(current_task());
+               pthread_priority_t pp = _pthread_priority_make_from_thread_qos(mgr_priority_hint, 0, 0);
+               wq->wq_event_manager_priority = (uint32_t)pp;
+               wq->wq_timer_interval = wq_stalled_window.abstime;
+               wq->wq_proc = p;
+               turnstile_prepare((uintptr_t)wq, &wq->wq_turnstile, turnstile_alloc(),
+                               TURNSTILE_WORKQS);
+
+               TAILQ_INIT(&wq->wq_thrunlist);
+               TAILQ_INIT(&wq->wq_thnewlist);
+               TAILQ_INIT(&wq->wq_thidlelist);
+               priority_queue_init(&wq->wq_overcommit_queue,
+                               PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+               priority_queue_init(&wq->wq_constrained_queue,
+                               PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+               priority_queue_init(&wq->wq_special_queue,
+                               PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+
+               wq->wq_delayed_call = thread_call_allocate_with_options(
+                               workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
+                               THREAD_CALL_OPTIONS_ONCE);
+               wq->wq_immediate_call = thread_call_allocate_with_options(
+                               workq_add_new_threads_call, p, THREAD_CALL_PRIORITY_KERNEL,
+                               THREAD_CALL_OPTIONS_ONCE);
+               wq->wq_death_call = thread_call_allocate_with_options(
+                               workq_kill_old_threads_call, wq,
+                               THREAD_CALL_PRIORITY_USER, THREAD_CALL_OPTIONS_ONCE);
+
+               lck_spin_init(&wq->wq_lock, workq_lck_grp, workq_lck_attr);
+
+               WQ_TRACE_WQ(TRACE_wq_create | DBG_FUNC_NONE, wq,
+                               VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
+               proc_set_wqptr(p, wq);
+       }
+out:
+
+       return error;
+}
+
+/*
+ * Routine:    workq_mark_exiting
+ *
+ * Function:   Mark the work queue such that new threads will not be added to the
+ *             work queue after we return.
+ *
+ * Conditions: Called against the current process.
+ */
+void
+workq_mark_exiting(struct proc *p)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+       uint32_t wq_flags;
+       workq_threadreq_t mgr_req;
+
+       if (!wq) return;
+
+       WQ_TRACE_WQ(TRACE_wq_pthread_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
+
+       workq_lock_spin(wq);
+
+       wq_flags = os_atomic_or_orig(&wq->wq_flags, WQ_EXITING, relaxed);
+       if (__improbable(wq_flags & WQ_EXITING)) {
+               panic("workq_mark_exiting called twice");
+       }
+
+       /*
+        * Opportunistically try to cancel thread calls that are likely in flight.
+        * workq_exit() will do the proper cleanup.
+        */
+       if (wq_flags & WQ_IMMEDIATE_CALL_SCHEDULED) {
+               thread_call_cancel(wq->wq_immediate_call);
+       }
+       if (wq_flags & WQ_DELAYED_CALL_SCHEDULED) {
+               thread_call_cancel(wq->wq_delayed_call);
+       }
+       if (wq_flags & WQ_DEATH_CALL_SCHEDULED) {
+               thread_call_cancel(wq->wq_death_call);
+       }
+
+       mgr_req = wq->wq_event_manager_threadreq;
+       wq->wq_event_manager_threadreq = NULL;
+       wq->wq_reqcount = 0; /* workq_schedule_creator must not look at queues */
+       workq_turnstile_update_inheritor(wq, NULL, 0);
+
+       workq_unlock(wq);
+
+       if (mgr_req) {
+               kqueue_threadreq_cancel(p, mgr_req);
+       }
+       /*
+        * No one touches the priority queues once WQ_EXITING is set.
+        * It is therefore safe to do the teardown without holding any lock.
+        */
+       priority_queue_destroy(&wq->wq_overcommit_queue,
+                       struct workq_threadreq_s, tr_entry, ^(void *e){
+               workq_threadreq_destroy(p, e);
+       });
+       priority_queue_destroy(&wq->wq_constrained_queue,
+                       struct workq_threadreq_s, tr_entry, ^(void *e){
+               workq_threadreq_destroy(p, e);
+       });
+       priority_queue_destroy(&wq->wq_special_queue,
+                       struct workq_threadreq_s, tr_entry, ^(void *e){
+               workq_threadreq_destroy(p, e);
+       });
+
+       WQ_TRACE(TRACE_wq_pthread_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Routine:    workq_exit
+ *
+ * Function:   clean up the work queue structure(s) now that there are no threads
+ *             left running inside the work queue (except possibly current_thread).
+ *
+ * Conditions: Called by the last thread in the process.
+ *             Called against current process.
+ */
+void
+workq_exit(struct proc *p)
+{
+       struct workqueue *wq;
+       struct uthread *uth, *tmp;
+
+       wq = os_atomic_xchg(&p->p_wqptr, NULL, relaxed);
+       if (wq != NULL) {
+               thread_t th = current_thread();
+
+               WQ_TRACE_WQ(TRACE_wq_workqueue_exit|DBG_FUNC_START, wq, 0, 0, 0, 0);
+
+               if (thread_get_tag(th) & THREAD_TAG_WORKQUEUE) {
+                       /*
+                        * <rdar://problem/40111515> Make sure we will no longer call the
+                        * sched call, if we ever block this thread, which the cancel_wait
+                        * below can do.
+                        */
+                       thread_sched_call(th, NULL);
+               }
+
+               /*
+                * Thread calls are always scheduled by the proc itself or under the
+                * workqueue spinlock if WQ_EXITING is not yet set.
+                *
+                * Either way, when this runs, the proc has no threads left besides
+                * the one running this very code, so we know no thread call can be
+                * dispatched anymore.
+                */
+               thread_call_cancel_wait(wq->wq_delayed_call);
+               thread_call_cancel_wait(wq->wq_immediate_call);
+               thread_call_cancel_wait(wq->wq_death_call);
+               thread_call_free(wq->wq_delayed_call);
+               thread_call_free(wq->wq_immediate_call);
+               thread_call_free(wq->wq_death_call);
+
+               /*
+                * Clean up workqueue data structures for threads that exited and
+                * didn't get a chance to clean up after themselves.
+                *
+                * Idle/new threads should have been interrupted and died on their own.
+                */
+               TAILQ_FOREACH_SAFE(uth, &wq->wq_thrunlist, uu_workq_entry, tmp) {
+                       thread_sched_call(uth->uu_thread, NULL);
+                       thread_deallocate(uth->uu_thread);
+               }
+               assert(TAILQ_EMPTY(&wq->wq_thnewlist));
+               assert(TAILQ_EMPTY(&wq->wq_thidlelist));
+
+               WQ_TRACE_WQ(TRACE_wq_destroy | DBG_FUNC_END, wq,
+                               VM_KERNEL_ADDRHIDE(wq), 0, 0, 0);
+
+               workq_deallocate(wq);
+
+               WQ_TRACE(TRACE_wq_workqueue_exit|DBG_FUNC_END, 0, 0, 0, 0, 0);
+       }
+}
+
+
+#pragma mark bsd thread control
+
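+/*
+ * Convert a pthread_priority_t into THREAD_QOS_POLICY data; reject unspecified
+ * QoS or a relative priority outside [THREAD_QOS_MIN_TIER_IMPORTANCE, 0].
+ */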
+static bool
+_pthread_priority_to_policy(pthread_priority_t priority,
+               thread_qos_policy_data_t *data)
+{
+       data->qos_tier = _pthread_priority_thread_qos(priority);
+       data->tier_importance = _pthread_priority_relpri(priority);
+       if (data->qos_tier == THREAD_QOS_UNSPECIFIED || data->tier_importance > 0 ||
+                       data->tier_importance < THREAD_QOS_MIN_TIER_IMPORTANCE) {
+               return false;
+       }
+       return true;
+}
+
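+/*
+ * Implementation of BSDTHREAD_CTL_SET_SELF: optionally unbind from a kevent,
+ * change QoS, adopt a voucher, and switch between fixed-priority and timeshare,
+ * collecting a per-stage error and reporting the most relevant one.
+ */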
+static int
+bsdthread_set_self(proc_t p, thread_t th, pthread_priority_t priority,
+               mach_port_name_t voucher, enum workq_set_self_flags flags)
+{
+       struct uthread *uth = get_bsdthread_info(th);
+       struct workqueue *wq = proc_get_wqptr(p);
+
+       kern_return_t kr;
+       int unbind_rv = 0, qos_rv = 0, voucher_rv = 0, fixedpri_rv = 0;
+       bool is_wq_thread = (thread_get_tag(th) & THREAD_TAG_WORKQUEUE);
+
+       if (flags & WORKQ_SET_SELF_WQ_KEVENT_UNBIND) {
+               if (!is_wq_thread) {
+                       unbind_rv = EINVAL;
+                       goto qos;
+               }
+
+               if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+                       unbind_rv = EINVAL;
+                       goto qos;
+               }
+
+               struct kqrequest *kqr = uth->uu_kqr_bound;
+               if (kqr == NULL) {
+                       unbind_rv = EALREADY;
+                       goto qos;
+               }
+
+               if (kqr->kqr_state & KQR_WORKLOOP) {
+                       unbind_rv = EINVAL;
+                       goto qos;
+               }
+
+               kqueue_threadreq_unbind(p, uth->uu_kqr_bound);
+       }
+
+qos:
+       if (flags & WORKQ_SET_SELF_QOS_FLAG) {
+               thread_qos_policy_data_t new_policy;
+
+               if (!_pthread_priority_to_policy(priority, &new_policy)) {
+                       qos_rv = EINVAL;
+                       goto voucher;
+               }
+
+               if (!is_wq_thread) {
+                       /*
+                        * Threads opted out of QoS can't change QoS
+                        */
+                       if (!thread_has_qos_policy(th)) {
+                               qos_rv = EPERM;
+                               goto voucher;
+                       }
+               } else if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+                       /*
+                        * Workqueue manager threads can't change QoS
+                        */
+                       qos_rv = EINVAL;
+                       goto voucher;
+               } else {
+                       /*
+                        * For workqueue threads, possibly adjust buckets and redrive thread
+                        * requests.
+                        */
+                       bool old_overcommit = uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT;
+                       bool new_overcommit = priority & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
+                       struct uu_workq_policy old_pri, new_pri;
+                       bool force_run = false;
+
+                       workq_lock_spin(wq);
+
+                       if (old_overcommit != new_overcommit) {
+                               uth->uu_workq_flags ^= UT_WORKQ_OVERCOMMIT;
+                               if (old_overcommit) {
+                                       wq->wq_constrained_threads_scheduled++;
+                               } else if (wq->wq_constrained_threads_scheduled-- ==
+                                               wq_max_constrained_threads) {
+                                       force_run = true;
+                               }
+                       }
+
+                       old_pri = new_pri = uth->uu_workq_pri;
+                       new_pri.qos_req = new_policy.qos_tier;
+                       workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, force_run);
+                       workq_unlock(wq);
+               }
+
+               kr = thread_policy_set_internal(th, THREAD_QOS_POLICY,
+                               (thread_policy_t)&new_policy, THREAD_QOS_POLICY_COUNT);
+               if (kr != KERN_SUCCESS) {
+                       qos_rv = EINVAL;
+               }
+       }
+
+voucher:
+       if (flags & WORKQ_SET_SELF_VOUCHER_FLAG) {
+               kr = thread_set_voucher_name(voucher);
+               if (kr != KERN_SUCCESS) {
+                       voucher_rv = ENOENT;
+                       goto fixedpri;
+               }
+       }
+
+fixedpri:
+       if (qos_rv) goto done;
+       if (flags & WORKQ_SET_SELF_FIXEDPRIORITY_FLAG) {
+               thread_extended_policy_data_t extpol = {.timeshare = 0};
+
+               if (is_wq_thread) {
+                       /* Not allowed on workqueue threads */
+                       fixedpri_rv = ENOTSUP;
+                       goto done;
+               }
+
+               kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
+                               (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
+               if (kr != KERN_SUCCESS) {
+                       fixedpri_rv = EINVAL;
+                       goto done;
+               }
+       } else if (flags & WORKQ_SET_SELF_TIMESHARE_FLAG) {
+               thread_extended_policy_data_t extpol = {.timeshare = 1};
+
+               if (is_wq_thread) {
+                       /* Not allowed on workqueue threads */
+                       fixedpri_rv = ENOTSUP;
+                       goto done;
+               }
+
+               kr = thread_policy_set_internal(th, THREAD_EXTENDED_POLICY,
+                               (thread_policy_t)&extpol, THREAD_EXTENDED_POLICY_COUNT);
+               if (kr != KERN_SUCCESS) {
+                       fixedpri_rv = EINVAL;
+                       goto done;
+               }
+       }
+
+done:
+       if (qos_rv && voucher_rv) {
+               /* Both failed, give that a unique error. */
+               return EBADMSG;
+       }
+
+       if (unbind_rv) {
+               return unbind_rv;
+       }
+
+       if (qos_rv) {
+               return qos_rv;
+       }
+
+       if (voucher_rv) {
+               return voucher_rv;
+       }
+
+       if (fixedpri_rv) {
+               return fixedpri_rv;
+       }
+
+       return 0;
+}
+
+static int
+bsdthread_add_explicit_override(proc_t p, mach_port_name_t kport,
+               pthread_priority_t pp, user_addr_t resource)
+{
+       thread_qos_t qos = _pthread_priority_thread_qos(pp);
+       if (qos == THREAD_QOS_UNSPECIFIED) {
+               return EINVAL;
+       }
+
+       thread_t th = port_name_to_thread(kport);
+       if (th == THREAD_NULL) {
+               return ESRCH;
+       }
+
+       int rv = proc_thread_qos_add_override(p->task, th, 0, qos, TRUE,
+                       resource, THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
+
+       thread_deallocate(th);
+       return rv;
+}
+
+static int
+bsdthread_remove_explicit_override(proc_t p, mach_port_name_t kport,
+               user_addr_t resource)
+{
+       thread_t th = port_name_to_thread(kport);
+       if (th == THREAD_NULL) {
+               return ESRCH;
+       }
+
+       int rv = proc_thread_qos_remove_override(p->task, th, 0, resource,
+                       THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE);
+
+       thread_deallocate(th);
+       return rv;
+}
+
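+/*
+ * Apply a dispatch QoS override to a workqueue thread named by its port.  When
+ * ulock_addr is provided, only apply the override if that ulock still appears
+ * to be owned by the target thread (best-effort no-fault copyin, done by
+ * disabling preemption around the copy).
+ */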
+static int
+workq_thread_add_dispatch_override(proc_t p, mach_port_name_t kport,
+               pthread_priority_t pp, user_addr_t ulock_addr)
+{
+       struct uu_workq_policy old_pri, new_pri;
+       struct workqueue *wq = proc_get_wqptr(p);
+
+       thread_qos_t qos_override = _pthread_priority_thread_qos(pp);
+       if (qos_override == THREAD_QOS_UNSPECIFIED) {
+               return EINVAL;
+       }
+
+       thread_t thread = port_name_to_thread(kport);
+       if (thread == THREAD_NULL) {
+               return ESRCH;
+       }
+
+       struct uthread *uth = get_bsdthread_info(thread);
+       if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
+               thread_deallocate(thread);
+               return EPERM;
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_override_dispatch | DBG_FUNC_NONE,
+                       wq, thread_tid(thread), 1, pp, 0);
+
+       thread_mtx_lock(thread);
+
+       if (ulock_addr) {
+               uint64_t val;
+               int rc;
+               /*
+                * Workaround lack of explicit support for 'no-fault copyin'
+                * <rdar://problem/24999882>, as disabling preemption prevents paging in
+                */
+               disable_preemption();
+               rc = copyin_word(ulock_addr, &val, sizeof(kport));
+               enable_preemption();
+               if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != kport) {
+                       goto out;
+               }
+       }
+
+       workq_lock_spin(wq);
+
+       old_pri = uth->uu_workq_pri;
+       if (old_pri.qos_override >= qos_override) {
+               /* Nothing to do */
+       } else if (thread == current_thread()) {
+               new_pri = old_pri;
+               new_pri.qos_override = qos_override;
+               workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
+       } else {
+               uth->uu_workq_pri.qos_override = qos_override;
+               if (qos_override > workq_pri_override(old_pri)) {
+                       thread_set_workq_override(thread, qos_override);
+               }
+       }
+
+       workq_unlock(wq);
+
+out:
+       thread_mtx_unlock(thread);
+       thread_deallocate(thread);
+       return 0;
+}
+
+static int
+workq_thread_reset_dispatch_override(proc_t p, thread_t thread)
+{
+       struct uu_workq_policy old_pri, new_pri;
+       struct workqueue *wq = proc_get_wqptr(p);
+       struct uthread *uth = get_bsdthread_info(thread);
+
+       if ((thread_get_tag(thread) & THREAD_TAG_WORKQUEUE) == 0) {
+               return EPERM;
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_override_reset | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
+
+       workq_lock_spin(wq);
+       old_pri = new_pri = uth->uu_workq_pri;
+       new_pri.qos_override = THREAD_QOS_UNSPECIFIED;
+       workq_thread_update_bucket(p, wq, uth, old_pri, new_pri, false);
+       workq_unlock(wq);
+       return 0;
+}
+
+static int
+bsdthread_get_max_parallelism(thread_qos_t qos, unsigned long flags,
+               int *retval)
+{
+       static_assert(QOS_PARALLELISM_COUNT_LOGICAL ==
+                       _PTHREAD_QOS_PARALLELISM_COUNT_LOGICAL, "logical");
+       static_assert(QOS_PARALLELISM_REALTIME ==
+                       _PTHREAD_QOS_PARALLELISM_REALTIME, "realtime");
+
+       if (flags & ~(QOS_PARALLELISM_REALTIME | QOS_PARALLELISM_COUNT_LOGICAL)) {
+               return EINVAL;
+       }
+
+       if (flags & QOS_PARALLELISM_REALTIME) {
+               if (qos) {
+                       return EINVAL;
+               }
+       } else if (qos == THREAD_QOS_UNSPECIFIED || qos >= THREAD_QOS_LAST) {
+               return EINVAL;
+       }
+
+       *retval = qos_max_parallelism(qos, flags);
+       return 0;
+}
+
+#define ENSURE_UNUSED(arg) \
+               ({ if ((arg) != 0) { return EINVAL; } })
+
+int
+bsdthread_ctl(struct proc *p, struct bsdthread_ctl_args *uap, int *retval)
+{
+       switch (uap->cmd) {
+       case BSDTHREAD_CTL_QOS_OVERRIDE_START:
+               return bsdthread_add_explicit_override(p, (mach_port_name_t)uap->arg1,
+                               (pthread_priority_t)uap->arg2, uap->arg3);
+       case BSDTHREAD_CTL_QOS_OVERRIDE_END:
+               ENSURE_UNUSED(uap->arg3);
+               return bsdthread_remove_explicit_override(p, (mach_port_name_t)uap->arg1,
+                               (user_addr_t)uap->arg2);
+
+       case BSDTHREAD_CTL_QOS_OVERRIDE_DISPATCH:
+               return workq_thread_add_dispatch_override(p, (mach_port_name_t)uap->arg1,
+                               (pthread_priority_t)uap->arg2, uap->arg3);
+       case BSDTHREAD_CTL_QOS_OVERRIDE_RESET:
+               return workq_thread_reset_dispatch_override(p, current_thread());
+
+       case BSDTHREAD_CTL_SET_SELF:
+               return bsdthread_set_self(p, current_thread(),
+                               (pthread_priority_t)uap->arg1, (mach_port_name_t)uap->arg2,
+                               (enum workq_set_self_flags)uap->arg3);
+
+       case BSDTHREAD_CTL_QOS_MAX_PARALLELISM:
+               ENSURE_UNUSED(uap->arg3);
+               return bsdthread_get_max_parallelism((thread_qos_t)uap->arg1,
+                               (unsigned long)uap->arg2, retval);
+
+       case BSDTHREAD_CTL_SET_QOS:
+       case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_ADD:
+       case BSDTHREAD_CTL_QOS_DISPATCH_ASYNCHRONOUS_OVERRIDE_RESET:
+               /* no longer supported */
+               return ENOTSUP;
+
+       default:
+               return EINVAL;
+       }
+}
+
+#pragma mark workqueue thread manipulation
+
+static void __dead2
+workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
+               struct uthread *uth);
+
+static void workq_setup_and_run(proc_t p, struct uthread *uth, int flags) __dead2;
+
+#if KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD
+static inline uint64_t
+workq_trace_req_id(workq_threadreq_t req)
+{
+       struct kqworkloop *kqwl;
+       if (req->tr_flags & TR_FLAG_WORKLOOP) {
+               kqwl = __container_of(req, struct kqworkloop, kqwl_request.kqr_req);
+               return kqwl->kqwl_dynamicid;
+       }
+
+       return VM_KERNEL_ADDRHIDE(req);
+}
+#endif
+
+/**
+ * Entry point for libdispatch to ask for threads
+ */
+static int
+workq_reqthreads(struct proc *p, uint32_t reqcount, pthread_priority_t pp)
+{
+       thread_qos_t qos = _pthread_priority_thread_qos(pp);
+       struct workqueue *wq = proc_get_wqptr(p);
+       uint32_t unpaced, upcall_flags = WQ_FLAG_THREAD_NEWSPI;
+
+       if (wq == NULL || reqcount <= 0 || reqcount > UINT16_MAX ||
+                       qos == THREAD_QOS_UNSPECIFIED) {
+               return EINVAL;
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_wqops_reqthreads | DBG_FUNC_NONE,
+                       wq, reqcount, pp, 0, 0);
+
+       workq_threadreq_t req = zalloc(workq_zone_threadreq);
+       priority_queue_entry_init(&req->tr_entry);
+       req->tr_state = TR_STATE_NEW;
+       req->tr_flags = 0;
+       req->tr_qos   = qos;
+
+       if (pp & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG) {
+               req->tr_flags |= TR_FLAG_OVERCOMMIT;
+               upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE,
+                       wq, workq_trace_req_id(req), req->tr_qos, reqcount, 0);
+
+       workq_lock_spin(wq);
+       do {
+               if (_wq_exiting(wq)) {
+                       goto exiting;
+               }
+
+               /*
+                * When userspace is asking for parallelism, wake up to (reqcount - 1)
+                * threads without pacing, to inform the scheduler of that workload.
+                *
+                * The last requests, or the ones that failed the admission checks, are
+                * enqueued and go through the regular creator codepath.
+                *
+                * If there aren't enough threads, add one, but re-evaluate everything
+                * as conditions may now have changed.
+                */
+               if (reqcount > 1 && (req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                       unpaced = workq_constrained_allowance(wq, qos, NULL, false);
+                       if (unpaced >= reqcount - 1) {
+                               unpaced = reqcount - 1;
+                       }
+               } else {
+                       unpaced = reqcount - 1;
+               }
+
+               /*
+                * This path does not currently handle custom workloop parameters
+                * when creating threads for parallelism.
+                */
+               assert(!(req->tr_flags & TR_FLAG_WL_PARAMS));
+
+               /*
+                * This is a trimmed down version of workq_threadreq_bind_and_unlock()
+                */
+               while (unpaced > 0 && wq->wq_thidlecount) {
+                       struct uthread *uth = workq_pop_idle_thread(wq);
+
+                       _wq_thactive_inc(wq, qos);
+                       wq->wq_thscheduled_count[_wq_bucket(qos)]++;
+                       workq_thread_reset_pri(wq, uth, req);
+                       wq->wq_fulfilled++;
+
+                       uth->uu_workq_flags |= UT_WORKQ_EARLY_BOUND;
+                       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                               uth->uu_workq_flags &= ~UT_WORKQ_OVERCOMMIT;
+                               wq->wq_constrained_threads_scheduled++;
+                       }
+                       uth->uu_save.uus_workq_park_data.upcall_flags = upcall_flags;
+                       uth->uu_save.uus_workq_park_data.thread_request = req;
+                       workq_thread_wakeup(uth);
+                       unpaced--;
+                       reqcount--;
+               }
+       } while (unpaced && wq->wq_nthreads < wq_max_threads &&
+                       workq_add_new_idle_thread(p, wq));
+
+       if (_wq_exiting(wq)) {
+               goto exiting;
+       }
+
+       req->tr_count = reqcount;
+       if (workq_threadreq_enqueue(wq, req)) {
+               /* This can drop the workqueue lock, and take it again */
+               workq_schedule_creator(p, wq, WORKQ_THREADREQ_CAN_CREATE_THREADS);
+       }
+       workq_unlock(wq);
+       return 0;
+
+exiting:
+       workq_unlock(wq);
+       zfree(workq_zone_threadreq, req);
+       return ECANCELED;
+}
+
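+/*
+ * Issue a thread request on behalf of the kqueue subsystem (kevent or workloop).
+ * May bind the calling thread directly when WORKQ_THREADREQ_ATTEMPT_REBIND is
+ * set and the request is admissible; otherwise enqueue it and wake the creator.
+ */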
+bool
+workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
+               struct turnstile *workloop_ts, thread_qos_t qos, int flags)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+       workq_threadreq_t req = &kqr->kqr_req;
+       struct uthread *uth = NULL;
+       uint8_t tr_flags = 0;
+
+       if (kqr->kqr_state & KQR_WORKLOOP) {
+               tr_flags = TR_FLAG_WORKLOOP;
+
+               workq_threadreq_param_t trp = kqueue_threadreq_workloop_param(req);
+               if (trp.trp_flags & TRP_PRIORITY) {
+                       tr_flags |= TR_FLAG_WL_OUTSIDE_QOS;
+                       qos = thread_workq_qos_for_pri(trp.trp_pri);
+                       if (qos == THREAD_QOS_UNSPECIFIED) {
+                               qos = WORKQ_THREAD_QOS_ABOVEUI;
+                       }
+               }
+               if (trp.trp_flags) {
+                       tr_flags |= TR_FLAG_WL_PARAMS;
+               }
+       } else {
+               tr_flags = TR_FLAG_KEVENT;
+       }
+       if (qos != WORKQ_THREAD_QOS_MANAGER &&
+                       (kqr->kqr_state & KQR_THOVERCOMMIT)) {
+               tr_flags |= TR_FLAG_OVERCOMMIT;
+       }
+
+       assert(req->tr_state == TR_STATE_IDLE);
+       priority_queue_entry_init(&req->tr_entry);
+       req->tr_count = 1;
+       req->tr_state = TR_STATE_NEW;
+       req->tr_flags = tr_flags;
+       req->tr_qos   = qos;
+
+       WQ_TRACE_WQ(TRACE_wq_thread_request_initiate | DBG_FUNC_NONE, wq,
+                       workq_trace_req_id(req), qos, 1, 0);
+
+       if (flags & WORKQ_THREADREQ_ATTEMPT_REBIND) {
+               /*
+                * we're called back synchronously from the context of
+                * kqueue_threadreq_unbind() from within workq_thread_return(),
+                * so we can try to match up this thread with this request!
+                */
+               uth = current_uthread();
+               assert(uth->uu_kqr_bound == NULL);
+       }
+
+       workq_lock_spin(wq);
+       if (_wq_exiting(wq)) {
+               workq_unlock(wq);
+               return false;
+       }
+
+       if (uth && workq_threadreq_admissible(wq, uth, req)) {
+               assert(uth != wq->wq_creator);
+               workq_threadreq_bind_and_unlock(p, wq, req, uth);
+       } else {
+               if (workloop_ts) {
+                       workq_perform_turnstile_operation_locked(wq, ^{
+                               turnstile_update_inheritor(workloop_ts, wq->wq_turnstile,
+                                               TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
+                               turnstile_update_inheritor_complete(workloop_ts,
+                                               TURNSTILE_INTERLOCK_HELD);
+                       });
+               }
+               if (workq_threadreq_enqueue(wq, req)) {
+                       workq_schedule_creator(p, wq, flags);
+               }
+               workq_unlock(wq);
+       }
+
+       return true;
+}
+
+void
+workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
+               thread_qos_t qos, int flags)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+       workq_threadreq_t req = &kqr->kqr_req;
+       bool change_overcommit = false;
+
+       if (req->tr_flags & TR_FLAG_WL_OUTSIDE_QOS) {
+               /* Requests outside-of-QoS shouldn't accept modify operations */
+               return;
+       }
+
+       workq_lock_spin(wq);
+
+       assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
+       assert(req->tr_flags & (TR_FLAG_KEVENT | TR_FLAG_WORKLOOP));
+
+       if (req->tr_state == TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, req, req->tr_binding_thread, 0);
+               workq_unlock(wq);
+               return;
+       }
+
+       change_overcommit = (bool)(kqr->kqr_state & KQR_THOVERCOMMIT) !=
+                       (bool)(req->tr_flags & TR_FLAG_OVERCOMMIT);
+
+       if (_wq_exiting(wq) || (req->tr_qos == qos && !change_overcommit)) {
+               workq_unlock(wq);
+               return;
+       }
+
+       assert(req->tr_count == 1);
+       if (req->tr_state != TR_STATE_QUEUED) {
+               panic("Invalid thread request (%p) state %d", req, req->tr_state);
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_thread_request_modify | DBG_FUNC_NONE, wq,
+                       workq_trace_req_id(req), qos, 0, 0);
+
+       struct priority_queue *pq = workq_priority_queue_for_req(wq, req);
+       workq_threadreq_t req_max;
+
+       /*
+        * Stage 1: Dequeue the request from its priority queue.
+        *
+        * If we dequeue the root item of the constrained priority queue,
+        * maintain the best constrained request qos invariant.
+        */
+       if (priority_queue_remove(pq, &req->tr_entry,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+                       _wq_thactive_refresh_best_constrained_req_qos(wq);
+               }
+       }
+
+       /*
+        * Stage 2: Apply changes to the thread request
+        *
+        * If the item will not become the root of the priority queue it belongs to,
+        * then it has to wait in line: just enqueue it and return quickly.
+        */
+       if (__improbable(change_overcommit)) {
+               req->tr_flags ^= TR_FLAG_OVERCOMMIT;
+               pq = workq_priority_queue_for_req(wq, req);
+       }
+       req->tr_qos = qos;
+
+       req_max = priority_queue_max(pq, struct workq_threadreq_s, tr_entry);
+       if (req_max && req_max->tr_qos >= qos) {
+               priority_queue_insert(pq, &req->tr_entry, workq_priority_for_req(req),
+                               PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               workq_unlock(wq);
+               return;
+       }
+
+       /*
+        * Stage 3: Reevaluate whether we should run the thread request.
+        *
+        * Pretend the thread request is new again:
+        * - adjust wq_reqcount to not count it anymore.
+        * - make its state TR_STATE_NEW (so that workq_threadreq_bind_and_unlock
+        *   properly attempts a synchronous bind)
+        */
+       wq->wq_reqcount--;
+       req->tr_state = TR_STATE_NEW;
+       if (workq_threadreq_enqueue(wq, req)) {
+               workq_schedule_creator(p, wq, flags);
+       }
+       workq_unlock(wq);
+}
+
+void
+workq_kern_threadreq_lock(struct proc *p)
+{
+       workq_lock_spin(proc_get_wqptr_fast(p));
+}
+
+void
+workq_kern_threadreq_unlock(struct proc *p)
+{
+       workq_unlock(proc_get_wqptr_fast(p));
+}
+
+void
+workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
+               thread_t owner, struct turnstile *wl_ts,
+               turnstile_update_flags_t flags)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+       workq_threadreq_t req = &kqr->kqr_req;
+       turnstile_inheritor_t inheritor;
+
+       assert(req->tr_qos != WORKQ_THREAD_QOS_MANAGER);
+       assert(req->tr_flags & TR_FLAG_WORKLOOP);
+       workq_lock_held(wq);
+
+       if (req->tr_state == TR_STATE_BINDING) {
+               kqueue_threadreq_bind(p, req, req->tr_binding_thread,
+                               KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE);
+               return;
+       }
+
+       if (_wq_exiting(wq)) {
+               inheritor = TURNSTILE_INHERITOR_NULL;
+       } else {
+               if (req->tr_state != TR_STATE_QUEUED) {
+                       panic("Invalid thread request (%p) state %d", req, req->tr_state);
+               }
+
+               if (owner) {
+                       inheritor = owner;
+                       flags |= TURNSTILE_INHERITOR_THREAD;
+               } else {
+                       inheritor = wq->wq_turnstile;
+                       flags |= TURNSTILE_INHERITOR_TURNSTILE;
+               }
+       }
+
+       workq_perform_turnstile_operation_locked(wq, ^{
+               turnstile_update_inheritor(wl_ts, inheritor, flags);
+       });
+}
+
+void
+workq_kern_threadreq_redrive(struct proc *p, int flags)
+{
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+
+       workq_lock_spin(wq);
+       workq_schedule_creator(p, wq, flags);
+       workq_unlock(wq);
+}
+
+void
+workq_schedule_creator_turnstile_redrive(struct workqueue *wq, bool locked)
+{
+       if (!locked) workq_lock_spin(wq);
+       workq_schedule_creator(NULL, wq, WORKQ_THREADREQ_CREATOR_SYNC_UPDATE);
+       if (!locked) workq_unlock(wq);
+}
+
+static int
+workq_thread_return(struct proc *p, struct workq_kernreturn_args *uap,
+               struct workqueue *wq)
+{
+       thread_t th = current_thread();
+       struct uthread *uth = get_bsdthread_info(th);
+       struct kqrequest *kqr = uth->uu_kqr_bound;
+       workq_threadreq_param_t trp = { };
+       int nevents = uap->affinity, error;
+       user_addr_t eventlist = uap->item;
+
+       if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
+                       (uth->uu_workq_flags & UT_WORKQ_DYING)) {
+               return EINVAL;
+       }
+
+       if (eventlist && nevents && kqr == NULL) {
+               return EINVAL;
+       }
+
+       /* reset signal mask on the workqueue thread to default state */
+       if (uth->uu_sigmask != (sigset_t)(~workq_threadmask)) {
+               proc_lock(p);
+               uth->uu_sigmask = ~workq_threadmask;
+               proc_unlock(p);
+       }
+
+       if (kqr && kqr->kqr_req.tr_flags & TR_FLAG_WL_PARAMS) {
+               /*
+                * Ensure we store the threadreq param before unbinding
+                * the kqr from this thread.
+                */
+               trp = kqueue_threadreq_workloop_param(&kqr->kqr_req);
+       }
+
+       if (kqr) {
+               uint32_t upcall_flags = WQ_FLAG_THREAD_NEWSPI | WQ_FLAG_THREAD_REUSE;
+               if (kqr->kqr_state & KQR_WORKLOOP) {
+                       upcall_flags |= WQ_FLAG_THREAD_WORKLOOP | WQ_FLAG_THREAD_KEVENT;
+               } else {
+                       upcall_flags |= WQ_FLAG_THREAD_KEVENT;
+               }
+               if (uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER) {
+                       upcall_flags |= WQ_FLAG_THREAD_EVENT_MANAGER;
+               } else {
+                       if (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) {
+                               upcall_flags |= WQ_FLAG_THREAD_OVERCOMMIT;
+                       }
+                       if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
+                               upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
+                       } else {
+                               upcall_flags |= uth->uu_workq_pri.qos_req |
+                                               WQ_FLAG_THREAD_PRIO_QOS;
+                       }
+               }
+
+               error = pthread_functions->workq_handle_stack_events(p, th,
+                               get_task_map(p->task), uth->uu_workq_stackaddr,
+                               uth->uu_workq_thport, eventlist, nevents, upcall_flags);
+               if (error) return error;
+
+               // pthread is supposed to pass KEVENT_FLAG_PARKING here
+               // which should cause the above call to either:
+               // - not return
+               // - return an error
+               // - return 0 and have unbound properly
+               assert(uth->uu_kqr_bound == NULL);
+       }
+
+       WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_END, wq, uap->options, 0, 0, 0);
+
+       thread_sched_call(th, NULL);
+       thread_will_park_or_terminate(th);
+#if CONFIG_WORKLOOP_DEBUG
+       UU_KEVENT_HISTORY_WRITE_ENTRY(uth, { .uu_error = -1, });
+#endif
+
+       workq_lock_spin(wq);
+       WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
+       uth->uu_save.uus_workq_park_data.workloop_params = trp.trp_value;
+       workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+       __builtin_unreachable();
+}
+
+/**
+ * Multiplexed call to interact with the workqueue mechanism
+ */
+int
+workq_kernreturn(struct proc *p, struct workq_kernreturn_args *uap, int32_t *retval)
+{
+       int options = uap->options;
+       int arg2 = uap->affinity;
+       int arg3 = uap->prio;
+       struct workqueue *wq = proc_get_wqptr(p);
+       int error = 0;
+
+       if ((p->p_lflag & P_LREGISTER) == 0) {
+               return EINVAL;
+       }
+
+       switch (options) {
+       case WQOPS_QUEUE_NEWSPISUPP: {
+               /*
+                * arg2 = offset of serialno into dispatch queue
+                * arg3 = kevent support
+                */
+               int offset = arg2;
+               if (arg3 & 0x01) {
+                       // If we get here, then userspace has indicated support for kevent delivery.
+               }
+
+               p->p_dispatchqueue_serialno_offset = (uint64_t)offset;
+               break;
+       }
+       case WQOPS_QUEUE_REQTHREADS: {
+               /*
+                * arg2 = number of threads to start
+                * arg3 = priority
+                */
+               error = workq_reqthreads(p, arg2, arg3);
+               break;
+       }
+       case WQOPS_SET_EVENT_MANAGER_PRIORITY: {
+               /*
+                * arg2 = priority for the manager thread
+                *
+                * if _PTHREAD_PRIORITY_SCHED_PRI_FLAG is set,
+                * the low bits of the value contain a scheduling priority
+                * instead of a QOS value
+                */
+               pthread_priority_t pri = arg2;
+
+               if (wq == NULL) {
+                       error = EINVAL;
+                       break;
+               }
+
+               /*
+                * Normalize the incoming priority so that it is ordered numerically.
+                */
+               if (pri & _PTHREAD_PRIORITY_SCHED_PRI_FLAG) {
+                       pri &= (_PTHREAD_PRIORITY_SCHED_PRI_MASK |
+                                       _PTHREAD_PRIORITY_SCHED_PRI_FLAG);
+               } else {
+                       thread_qos_t qos = _pthread_priority_thread_qos(pri);
+                       int relpri = _pthread_priority_relpri(pri);
+                       if (relpri > 0 || relpri < THREAD_QOS_MIN_TIER_IMPORTANCE ||
+                                       qos == THREAD_QOS_UNSPECIFIED) {
+                               error = EINVAL;
+                               break;
+                       }
+                       pri &= ~_PTHREAD_PRIORITY_FLAGS_MASK;
+               }
+
+               /*
+                * If userspace passes a scheduling priority, that wins over any QoS.
+        * Userspace should take care not to lower the priority this way.
+                */
+               workq_lock_spin(wq);
+               if (wq->wq_event_manager_priority < (uint32_t)pri) {
+                       wq->wq_event_manager_priority = (uint32_t)pri;
+               }
+               workq_unlock(wq);
+               break;
+       }
+       case WQOPS_THREAD_KEVENT_RETURN:
+       case WQOPS_THREAD_WORKLOOP_RETURN:
+       case WQOPS_THREAD_RETURN: {
+               error = workq_thread_return(p, uap, wq);
+               break;
+       }
+
+       case WQOPS_SHOULD_NARROW: {
+               /*
+                * arg2 = priority to test
+                * arg3 = unused
+                */
+               thread_t th = current_thread();
+               struct uthread *uth = get_bsdthread_info(th);
+               if (((thread_get_tag(th) & THREAD_TAG_WORKQUEUE) == 0) ||
+                               (uth->uu_workq_flags & (UT_WORKQ_DYING|UT_WORKQ_OVERCOMMIT))) {
+                       error = EINVAL;
+                       break;
+               }
+
+               thread_qos_t qos = _pthread_priority_thread_qos(arg2);
+               if (qos == THREAD_QOS_UNSPECIFIED) {
+                       error = EINVAL;
+                       break;
+               }
+               workq_lock_spin(wq);
+               bool should_narrow = !workq_constrained_allowance(wq, qos, uth, false);
+               workq_unlock(wq);
+
+               *retval = should_narrow;
+               break;
+       }
+       default:
+               error = EINVAL;
+               break;
+       }
+
+       return (error);
+}
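
The WQOPS_* handlers above are reached from userspace through the __workq_kernreturn() trap declared in bsd/pthread/workqueue_syscalls.h further down in this commit. A minimal caller-side sketch follows; it assumes the arguments in that declaration map positionally onto the handler's affinity and prio fields (as the case comments above suggest), that the priority is an already-encoded pthread priority value, and that WQOPS_SHOULD_NARROW is only meaningful from a workqueue worker thread. Illustrative only, not code from this change:

/* Illustrative sketch only: ask the kernel for four more worker threads at a
 * given pthread priority, then ask whether constrained concurrency should
 * narrow at that priority (the latter only succeeds on a workqueue thread). */
static int
request_threads_example(int encoded_pthread_priority)
{
        /* WQOPS_QUEUE_REQTHREADS: arg3 = number of threads, arg4 = priority */
        if (__workq_kernreturn(WQOPS_QUEUE_REQTHREADS, NULL, 4,
                        encoded_pthread_priority) != 0) {
                return -1;      /* errno carries EINVAL, ECANCELED, ... */
        }
        /* WQOPS_SHOULD_NARROW: arg3 = priority to test; returns 0 or 1 */
        return __workq_kernreturn(WQOPS_SHOULD_NARROW, NULL,
                        encoded_pthread_priority, 0);
}
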
+
+/*
+ * We have no work to do; park ourselves on the idle list.
+ *
+ * Consumes the workqueue lock and does not return.
+ */
+__attribute__((noreturn, noinline))
+static void
+workq_park_and_unlock(proc_t p, struct workqueue *wq, struct uthread *uth)
+{
+       assert(uth == current_uthread());
+       assert(uth->uu_kqr_bound == NULL);
+       workq_push_idle_thread(p, wq, uth); // may not return
+
+       workq_thread_reset_cpupercent(NULL, uth);
+
+       if (uth->uu_workq_flags & UT_WORKQ_IDLE_CLEANUP) {
+               workq_unlock(wq);
+
+               /*
+                * workq_push_idle_thread() will unset `has_stack`
+                * if it wants us to free the stack before parking.
+                */
+               if (!uth->uu_save.uus_workq_park_data.has_stack) {
+                       pthread_functions->workq_markfree_threadstack(p, uth->uu_thread,
+                                       get_task_map(p->task), uth->uu_workq_stackaddr);
+               }
+
+               /*
+                * When we remove the voucher from the thread, we may lose our importance
+                * causing us to get preempted, so we do this after putting the thread on
+                * the idle list.  Then, when we get our importance back we'll be able to
+                * use this thread from e.g. the kevent call out to deliver a boosting
+                * message.
+                */
+               __assert_only kern_return_t kr;
+               kr = thread_set_voucher_name(MACH_PORT_NULL);
+               assert(kr == KERN_SUCCESS);
+
+               workq_lock_spin(wq);
+               uth->uu_workq_flags &= ~UT_WORKQ_IDLE_CLEANUP;
+       }
+
+       if (uth->uu_workq_flags & UT_WORKQ_RUNNING) {
+               /*
+                * While we had dropped the lock to unset our voucher, someone came
+                * around and made us runnable.  But because we weren't waiting on the
+                * event, their thread_wakeup() was ineffectual.  To correct for that,
+                * we just run the continuation ourselves.
+                */
+               WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
+               workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+               __builtin_unreachable();
+       }
+
+       if (uth->uu_workq_flags & UT_WORKQ_DYING) {
+               workq_unpark_for_death_and_unlock(p, wq, uth,
+                               WORKQ_UNPARK_FOR_DEATH_WAS_IDLE);
+               __builtin_unreachable();
+       }
+
+       thread_set_pending_block_hint(uth->uu_thread, kThreadWaitParkedWorkQueue);
+       assert_wait(workq_parked_wait_event(uth), THREAD_INTERRUPTIBLE);
+       workq_unlock(wq);
+       WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_END, wq, 0, 0, 0, 0);
+       thread_block(workq_unpark_continue);
+       __builtin_unreachable();
+}
+
+static inline bool
+workq_may_start_event_mgr_thread(struct workqueue *wq, struct uthread *uth)
+{
+       /*
+        * There's an event manager request and either:
+        * - no event manager currently running
+        * - we are re-using the event manager
+        */
+       return wq->wq_thscheduled_count[_wq_bucket(WORKQ_THREAD_QOS_MANAGER)] == 0 ||
+                       (uth && uth->uu_workq_pri.qos_bucket == WORKQ_THREAD_QOS_MANAGER);
+}
+
+static uint32_t
+workq_constrained_allowance(struct workqueue *wq, thread_qos_t at_qos,
+               struct uthread *uth, bool may_start_timer)
+{
+       assert(at_qos != WORKQ_THREAD_QOS_MANAGER);
+       uint32_t count = 0;
+
+       uint32_t max_count = wq->wq_constrained_threads_scheduled;
+       if (uth && (uth->uu_workq_flags & UT_WORKQ_OVERCOMMIT) == 0) {
+               /*
+                * don't count the current thread as scheduled
+                */
+               assert(max_count > 0);
+               max_count--;
+       }
+       if (max_count >= wq_max_constrained_threads) {
+               WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 1,
+                               wq->wq_constrained_threads_scheduled,
+                               wq_max_constrained_threads, 0);
+               /*
+                * we need 1 or more constrained threads to return to the kernel before
+                * we can dispatch additional work
+                */
+               return 0;
+       }
+       /* remaining constrained-thread slots before we hit the limit */
+       max_count = wq_max_constrained_threads - max_count;
+
+       /*
+        * Compute a metric for how many threads are active.  We find the
+        * highest priority request outstanding and then add up the number of
+        * active threads in that and all higher-priority buckets.  We'll also add
+        * any "busy" threads which are not active but blocked recently enough that
+        * we can't be sure they've gone idle yet.  We'll then compare this metric
+        * to our max concurrency to decide whether to add a new thread.
+        */
+
+       uint32_t busycount, thactive_count;
+
+       thactive_count = _wq_thactive_aggregate_downto_qos(wq, _wq_thactive(wq),
+                       at_qos, &busycount, NULL);
+
+       if (uth && uth->uu_workq_pri.qos_bucket != WORKQ_THREAD_QOS_MANAGER &&
+                       at_qos <= uth->uu_workq_pri.qos_bucket) {
+               /*
+                * Don't count this thread as currently active, but only if it's not
+                * a manager thread, as _wq_thactive_aggregate_downto_qos ignores active
+                * managers.
+                */
+               assert(thactive_count > 0);
+               thactive_count--;
+       }
+
+       count = wq_max_parallelism[_wq_bucket(at_qos)];
+       if (count > thactive_count + busycount) {
+               count -= thactive_count + busycount;
+               WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 2,
+                               thactive_count, busycount, 0);
+               return MIN(count, max_count);
+       } else {
+               WQ_TRACE_WQ(TRACE_wq_constrained_admission | DBG_FUNC_NONE, wq, 3,
+                               thactive_count, busycount, 0);
+       }
+
+       if (busycount && may_start_timer) {
+               /*
+                * If this is called from the add timer, we won't have another timer
+                * fire when the thread exits the "busy" state, so rearm the timer.
+                */
+               workq_schedule_delayed_thread_creation(wq, 0);
+       }
+
+       return 0;
+}
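
To make the admission math above concrete, here is a self-contained sketch with illustrative numbers (the values are assumptions, not from this change): with a bucket parallelism of 8, 3 active threads, 2 recently-busy threads, and 10 remaining slots under the constrained-thread limit, 3 more constrained threads would be admitted.

/* Illustrative only: mirrors the thactive/busycount accounting above. */
static uint32_t
constrained_allowance_example(void)
{
        uint32_t max_parallelism = 8;   /* assumed wq_max_parallelism[bucket]          */
        uint32_t thactive_count  = 3;   /* threads active at or above this QoS         */
        uint32_t busycount       = 2;   /* blocked recently, may not be idle yet       */
        uint32_t max_count       = 10;  /* slots left under wq_max_constrained_threads */

        if (max_parallelism <= thactive_count + busycount) {
                return 0;               /* the pool already keeps the cores busy */
        }
        uint32_t count = max_parallelism - (thactive_count + busycount);  /* == 3 */
        return count < max_count ? count : max_count;                     /* == 3 */
}
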
+
+static bool
+workq_threadreq_admissible(struct workqueue *wq, struct uthread *uth,
+               workq_threadreq_t req)
+{
+       if (req->tr_qos == WORKQ_THREAD_QOS_MANAGER) {
+               return workq_may_start_event_mgr_thread(wq, uth);
+       }
+       if ((req->tr_flags & TR_FLAG_OVERCOMMIT) == 0) {
+               return workq_constrained_allowance(wq, req->tr_qos, uth, true);
+       }
+       return true;
+}
+
+static workq_threadreq_t
+workq_threadreq_select_for_creator(struct workqueue *wq)
+{
+       workq_threadreq_t req_qos, req_pri, req_tmp;
+       thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
+       uint8_t pri = 0;
+
+       req_tmp = wq->wq_event_manager_threadreq;
+       if (req_tmp && workq_may_start_event_mgr_thread(wq, NULL)) {
+               return req_tmp;
+       }
+
+       /*
+        * Compute the best priority request, and ignore the turnstile for now
+        */
+
+       req_pri = priority_queue_max(&wq->wq_special_queue,
+                       struct workq_threadreq_s, tr_entry);
+       if (req_pri) {
+               pri = priority_queue_entry_key(&wq->wq_special_queue, &req_pri->tr_entry);
+       }
+
+       /*
+        * Compute the best QoS Request, and check whether it beats the "pri" one
+        */
+
+       req_qos = priority_queue_max(&wq->wq_overcommit_queue,
+                       struct workq_threadreq_s, tr_entry);
+       if (req_qos) {
+               qos = req_qos->tr_qos;
+       }
+
+       req_tmp = priority_queue_max(&wq->wq_constrained_queue,
+                       struct workq_threadreq_s, tr_entry);
+
+       if (req_tmp && qos < req_tmp->tr_qos) {
+               if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
+                       return req_pri;
+               }
+
+               if (workq_constrained_allowance(wq, req_tmp->tr_qos, NULL, true)) {
+                       /*
+                        * If the constrained thread request is the best one and passes
+                        * the admission check, pick it.
+                        */
+                       return req_tmp;
+               }
+       }
+
+       if (pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
+               return req_pri;
+       }
+
+       if (req_qos) {
+               return req_qos;
+       }
+
+       /*
+        * If we had no eligible request but we have a turnstile push,
+        * it must be a non-overcommit thread request that failed
+        * the admission check.
+        *
+        * Just fake a BG thread request so that if the push stops, the creator
+        * priority simply drops to 4.
+        */
+       if (turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile, NULL)) {
+               static struct workq_threadreq_s workq_sync_push_fake_req = {
+                       .tr_qos = THREAD_QOS_BACKGROUND,
+               };
+
+               return &workq_sync_push_fake_req;
+       }
+
+       return NULL;
+}
+
+static workq_threadreq_t
+workq_threadreq_select(struct workqueue *wq, struct uthread *uth)
+{
+       workq_threadreq_t req_qos, req_pri, req_tmp;
+       uintptr_t proprietor;
+       thread_qos_t qos = THREAD_QOS_UNSPECIFIED;
+       uint8_t pri = 0;
+
+       if (uth == wq->wq_creator) uth = NULL;
+
+       req_tmp = wq->wq_event_manager_threadreq;
+       if (req_tmp && workq_may_start_event_mgr_thread(wq, uth)) {
+               return req_tmp;
+       }
+
+       /*
+        * Compute the best priority request (special or turnstile)
+        */
+
+       pri = turnstile_workq_proprietor_of_max_turnstile(wq->wq_turnstile,
+                       &proprietor);
+       if (pri) {
+               struct kqworkloop *kqwl = (struct kqworkloop *)proprietor;
+               req_pri = &kqwl->kqwl_request.kqr_req;
+               if (req_pri->tr_state != TR_STATE_QUEUED) {
+                       panic("Invalid thread request (%p) state %d",
+                                       req_pri, req_pri->tr_state);
+               }
+       } else {
+               req_pri = NULL;
+       }
+
+       req_tmp = priority_queue_max(&wq->wq_special_queue,
+                       struct workq_threadreq_s, tr_entry);
+       if (req_tmp && pri < priority_queue_entry_key(&wq->wq_special_queue,
+                       &req_tmp->tr_entry)) {
+               req_pri = req_tmp;
+               pri = priority_queue_entry_key(&wq->wq_special_queue, &req_tmp->tr_entry);
+       }
+
+       /*
+        * Compute the best QoS Request, and check whether it beats the "pri" one
+        */
+
+       req_qos = priority_queue_max(&wq->wq_overcommit_queue,
+                       struct workq_threadreq_s, tr_entry);
+       if (req_qos) {
+               qos = req_qos->tr_qos;
+       }
+
+       req_tmp = priority_queue_max(&wq->wq_constrained_queue,
+                       struct workq_threadreq_s, tr_entry);
+
+       if (req_tmp && qos < req_tmp->tr_qos) {
+               if (pri && pri >= thread_workq_pri_for_qos(req_tmp->tr_qos)) {
+                       return req_pri;
+               }
+
+               if (workq_constrained_allowance(wq, req_tmp->tr_qos, uth, true)) {
+                       /*
+                        * If the constrained thread request is the best one and passes
+                        * the admission check, pick it.
+                        */
+                       return req_tmp;
+               }
+       }
+
+       if (req_pri && (!qos || pri >= thread_workq_pri_for_qos(qos))) {
+               return req_pri;
+       }
+
+       return req_qos;
+}
+
+/*
+ * The creator is an anonymous thread that is counted as scheduled, but
+ * otherwise has no scheduler callback set and is not tracked as active;
+ * it is used to make other threads.
+ *
+ * When more requests are added or an existing one is hurried along,
+ * a creator is elected and set up, or the existing one is overridden accordingly.
+ *
+ * While this creator is in flight, no request has been dequeued yet, so
+ * already-running threads have a chance to steal thread requests, avoiding
+ * useless context switches, and the creator, once scheduled, may not find any
+ * work to do and will then just park again.
+ *
+ * The creator serves the dual purpose of informing the scheduler of work that
+ * hasn't been materialized as threads yet, and of acting as a natural pacing
+ * mechanism for thread creation.
+ *
+ * By being anonymous (and not bound to anything), thread requests can be
+ * stolen from this creator by threads already on core, yielding more
+ * efficient scheduling and fewer context switches.
+ */
+static void
+workq_schedule_creator(proc_t p, struct workqueue *wq, int flags)
+{
+       workq_threadreq_t req;
+       struct uthread *uth;
+
+       workq_lock_held(wq);
+       assert(p || (flags & WORKQ_THREADREQ_CAN_CREATE_THREADS) == 0);
+
+again:
+       uth = wq->wq_creator;
+
+       if (!wq->wq_reqcount) {
+               if (uth == NULL) {
+                       workq_turnstile_update_inheritor(wq, TURNSTILE_INHERITOR_NULL, 0);
+               }
+               return;
+       }
+
+       req = workq_threadreq_select_for_creator(wq);
+       if (req == NULL) {
+               if (flags & WORKQ_THREADREQ_CREATOR_SYNC_UPDATE) {
+                       assert((flags & WORKQ_THREADREQ_CREATOR_TRANSFER) == 0);
+                       /*
+                        * turnstile propagation code is reaching out to us,
+                        * and we still don't want to do anything, do not recurse.
+                        */
+               } else {
+                       workq_turnstile_update_inheritor(wq, wq, TURNSTILE_INHERITOR_WORKQ);
+               }
+               return;
+       }
+
+       if (uth) {
+               /*
+                * We may need to override the creator we already have
+                */
+               if (workq_thread_needs_priority_change(req, uth)) {
+                       WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
+                                       wq, 1, thread_tid(uth->uu_thread), req->tr_qos, 0);
+                       workq_thread_reset_pri(wq, uth, req);
+               }
+       } else if (wq->wq_thidlecount) {
+               /*
+                * We need to unpark a creator thread
+                */
+               wq->wq_creator = uth = workq_pop_idle_thread(wq);
+               if (workq_thread_needs_priority_change(req, uth)) {
+                       workq_thread_reset_pri(wq, uth, req);
+               }
+               workq_turnstile_update_inheritor(wq, uth->uu_thread,
+                               TURNSTILE_INHERITOR_THREAD);
+               WQ_TRACE_WQ(TRACE_wq_creator_select | DBG_FUNC_NONE,
+                               wq, 2, thread_tid(uth->uu_thread), req->tr_qos, 0);
+               uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
+               uth->uu_save.uus_workq_park_data.yields = 0;
+               workq_thread_wakeup(uth);
+       } else {
+               /*
+                * We need to allocate a thread...
+                */
+               if (__improbable(wq->wq_nthreads >= wq_max_threads)) {
+                       /* out of threads, just go away */
+               } else if (flags & WORKQ_THREADREQ_SET_AST_ON_FAILURE) {
+                       act_set_astkevent(current_thread(), AST_KEVENT_REDRIVE_THREADREQ);
+               } else if (!(flags & WORKQ_THREADREQ_CAN_CREATE_THREADS)) {
+                       /* This can drop the workqueue lock, and take it again */
+                       workq_schedule_immediate_thread_creation(wq);
+               } else if (workq_add_new_idle_thread(p, wq)) {
+                       goto again;
+               } else {
+                       workq_schedule_delayed_thread_creation(wq, 0);
+               }
+
+               if (flags & WORKQ_THREADREQ_CREATOR_TRANSFER) {
+                       /*
+                        * workq_schedule_creator() failed at creating a thread,
+                        * and the responsibility of redriving is now with a thread-call.
+                        *
+                        * We still need to tell the turnstile the previous creator is gone.
+                        */
+                       workq_turnstile_update_inheritor(wq, NULL, 0);
+               }
+       }
+}
+
+/**
+ * Runs a thread request on a thread
+ *
+ * - if thread is THREAD_NULL, will find a thread and run the request there.
+ *   Otherwise, the thread must be the current thread.
+ *
+ * - if req is NULL, will find the highest priority request and run that.  If
+ *   it is not NULL, it must be a threadreq object in state NEW.  If it cannot
+ *   be run immediately, it will be enqueued and moved to state QUEUED.
+ *
+ *   Either way, the thread request object serviced will be moved to state
+ *   BINDING and attached to the uthread.
+ *
+ *   Should be called with the workqueue lock held.  Will drop it.
+ */
+__attribute__((noreturn, noinline))
+static void
+workq_select_threadreq_or_park_and_unlock(proc_t p, struct workqueue *wq,
+               struct uthread *uth)
+{
+       uint32_t setup_flags = 0;
+       workq_threadreq_t req;
+
+       if (uth->uu_workq_flags & UT_WORKQ_EARLY_BOUND) {
+               if (uth->uu_workq_flags & UT_WORKQ_NEW) {
+                       setup_flags |= WQ_SETUP_FIRST_USE;
+               }
+               uth->uu_workq_flags &= ~(UT_WORKQ_NEW | UT_WORKQ_EARLY_BOUND);
+               /*
+                * This pointer is possibly freed and only used for tracing purposes.
+                */
+               req = uth->uu_save.uus_workq_park_data.thread_request;
+               workq_unlock(wq);
+               WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
+                               VM_KERNEL_ADDRHIDE(req), 0, 0, 0);
+               goto run;
+       } else if (_wq_exiting(wq)) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 0, 0, 0, 0);
+       } else if (wq->wq_reqcount == 0) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 1, 0, 0, 0);
+       } else if ((req = workq_threadreq_select(wq, uth)) == NULL) {
+               WQ_TRACE_WQ(TRACE_wq_select_threadreq | DBG_FUNC_NONE, wq, 2, 0, 0, 0);
+       } else {
+               WQ_TRACE_WQ(TRACE_wq_thread_logical_run | DBG_FUNC_START, wq,
+                               workq_trace_req_id(req), 0, 0, 0);
+               if (uth->uu_workq_flags & UT_WORKQ_NEW) {
+                       uth->uu_workq_flags ^= UT_WORKQ_NEW;
+                       setup_flags |= WQ_SETUP_FIRST_USE;
+               }
+               workq_thread_reset_cpupercent(req, uth);
+               workq_threadreq_bind_and_unlock(p, wq, req, uth);
+run:
+               workq_setup_and_run(p, uth, setup_flags);
+               __builtin_unreachable();
+       }
+
+       workq_park_and_unlock(p, wq, uth);
+       __builtin_unreachable();
+}
+
+static bool
+workq_creator_should_yield(struct workqueue *wq, struct uthread *uth)
+{
+       thread_qos_t qos = workq_pri_override(uth->uu_workq_pri);
+
+       if (qos >= THREAD_QOS_USER_INTERACTIVE) {
+               return false;
+       }
+
+       uint32_t snapshot = uth->uu_save.uus_workq_park_data.fulfilled_snapshot;
+       if (wq->wq_fulfilled == snapshot) {
+               return false;
+       }
+
+       uint32_t cnt = 0, conc = wq_max_parallelism[_wq_bucket(qos)];
+       if (wq->wq_fulfilled - snapshot > conc) {
+               /* we fulfilled more than NCPU requests since being dispatched */
+               WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 1,
+                               wq->wq_fulfilled, snapshot, 0);
+               return true;
+       }
+
+       for (int i = _wq_bucket(qos); i < WORKQ_NUM_QOS_BUCKETS; i++) {
+               cnt += wq->wq_thscheduled_count[i];
+       }
+       if (conc <= cnt) {
+               /* We fulfilled requests and have more than NCPU scheduled threads */
+               WQ_TRACE_WQ(TRACE_wq_creator_yield, wq, 2,
+                               wq->wq_fulfilled, snapshot, 0);
+               return true;
+       }
+
+       return false;
+}
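
As a concrete reading of the two checks above (numbers illustrative): if conc = wq_max_parallelism[bucket] were 8, a creator that wakes to find wq_fulfilled advanced by more than 8 since its snapshot yields, and one that finds any requests fulfilled while 8 or more threads are already scheduled across its bucket and the higher ones yields too; in both cases the existing pool is keeping up, so the creator goes back to sleep instead of going out to userspace.
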
+
+/**
+ * parked thread wakes up
+ */
+__attribute__((noreturn, noinline))
+static void
+workq_unpark_continue(void *parameter __unused, wait_result_t wr __unused)
+{
+       struct uthread *uth = current_uthread();
+       proc_t p = current_proc();
+       struct workqueue *wq = proc_get_wqptr_fast(p);
+
+       workq_lock_spin(wq);
+
+       if (wq->wq_creator == uth && workq_creator_should_yield(wq, uth)) {
+               /*
+                * If the number of threads we have out is able to keep up with the
+                * demand, then we should avoid sending this creator thread to
+                * userspace.
+                */
+               uth->uu_save.uus_workq_park_data.fulfilled_snapshot = wq->wq_fulfilled;
+               uth->uu_save.uus_workq_park_data.yields++;
+               workq_unlock(wq);
+               thread_yield_with_continuation(workq_unpark_continue, NULL);
+               __builtin_unreachable();
+       }
+
+       if (__probable(uth->uu_workq_flags & UT_WORKQ_RUNNING)) {
+               workq_select_threadreq_or_park_and_unlock(p, wq, uth);
+               __builtin_unreachable();
+       }
+
+       if (__probable(wr == THREAD_AWAKENED)) {
+               /*
+                * We were set running, but for the purposes of dying.
+                */
+               assert(uth->uu_workq_flags & UT_WORKQ_DYING);
+               assert((uth->uu_workq_flags & UT_WORKQ_NEW) == 0);
+       } else {
+               /*
+                * Workaround for <rdar://problem/38647347>:
+                * in case we do hit userspace, make sure that calling
+                * workq_thread_terminate() does the right thing here,
+                * and that, if we never call it, workq_exit() will too, because it
+                * sees this thread on the runlist.
+                */
+               assert(wr == THREAD_INTERRUPTED);
+               wq->wq_thdying_count++;
+               uth->uu_workq_flags |= UT_WORKQ_DYING;
+       }
+
+       workq_unpark_for_death_and_unlock(p, wq, uth,
+                       WORKQ_UNPARK_FOR_DEATH_WAS_IDLE);
+       __builtin_unreachable();
+}
+
+__attribute__((noreturn, noinline))
+static void
+workq_setup_and_run(proc_t p, struct uthread *uth, int setup_flags)
+{
+       thread_t th = uth->uu_thread;
+       vm_map_t vmap = get_task_map(p->task);
+
+       if (setup_flags & WQ_SETUP_CLEAR_VOUCHER) {
+               /*
+                * For preemption reasons, we want to reset the voucher as late as
+                * possible, so we do it in two places:
+                *   - Just before parking (i.e. in workq_park_and_unlock())
+                *   - Prior to doing the setup for the next workitem (i.e. here)
+                *
+                * Those two places are sufficient to ensure we always reset it before
+                * it goes back out to user space, but be careful to not break that
+                * guarantee.
+                */
+               __assert_only kern_return_t kr;
+               kr = thread_set_voucher_name(MACH_PORT_NULL);
+               assert(kr == KERN_SUCCESS);
+       }
+
+       uint32_t upcall_flags = uth->uu_save.uus_workq_park_data.upcall_flags;
+       if (!(setup_flags & WQ_SETUP_FIRST_USE)) {
+               upcall_flags |= WQ_FLAG_THREAD_REUSE;
+       }
+
+       if (uth->uu_workq_flags & UT_WORKQ_OUTSIDE_QOS) {
+               /*
+                * For threads that have an outside-of-QoS thread priority, indicate
+                * to userspace that setting QoS should only affect the TSD and not
+                * change QOS in the kernel.
+                */
+               upcall_flags |= WQ_FLAG_THREAD_OUTSIDEQOS;
+       } else {
+               /*
+                * Put the QoS class value into the lower bits of the reuse_thread
+                * register; this is where the thread priority used to be stored
+                * anyway.
+                */
+               upcall_flags |= uth->uu_save.uus_workq_park_data.qos |
+                               WQ_FLAG_THREAD_PRIO_QOS;
+       }
+
+       if (uth->uu_workq_thport == MACH_PORT_NULL) {
+               /* convert_thread_to_port() consumes a reference */
+               thread_reference(th);
+               ipc_port_t port = convert_thread_to_port(th);
+               uth->uu_workq_thport = ipc_port_copyout_send(port, get_task_ipcspace(p->task));
+       }
+
+       /*
+        * Call out to pthread; this sets up the thread, pulls in kevent structs
+        * onto the stack, sets up the thread state and then returns to userspace.
+        */
+       WQ_TRACE_WQ(TRACE_wq_runthread | DBG_FUNC_START,
+                       proc_get_wqptr_fast(p), 0, 0, 0, 0);
+       thread_sched_call(th, workq_sched_callback);
+       pthread_functions->workq_setup_thread(p, th, vmap, uth->uu_workq_stackaddr,
+                       uth->uu_workq_thport, 0, setup_flags, upcall_flags);
+
+       __builtin_unreachable();
+}
+
+#pragma mark misc
+
+int
+fill_procworkqueue(proc_t p, struct proc_workqueueinfo * pwqinfo)
+{
+       struct workqueue *wq = proc_get_wqptr(p);
+       int error = 0;
+       int     activecount;
+
+       if (wq == NULL) {
+               return EINVAL;
+       }
+
+       /*
+        * This is sometimes called from interrupt context by the kperf sampler.
+        * In that case, it's not safe to spin trying to take the lock since we
+        * might already hold it.  So, we just try-lock it and error out if it's
+        * already held.  Since this is just a debugging aid, and all our callers
+        * are able to handle an error, that's fine.
+        */
+       bool locked = workq_lock_try(wq);
+       if (!locked) {
+               return EBUSY;
+       }
+
+       wq_thactive_t act = _wq_thactive(wq);
+       activecount = _wq_thactive_aggregate_downto_qos(wq, act,
+                       WORKQ_THREAD_QOS_MIN, NULL, NULL);
+       if (act & _wq_thactive_offset_for_qos(WORKQ_THREAD_QOS_MANAGER)) {
+               activecount++;
+       }
+       pwqinfo->pwq_nthreads = wq->wq_nthreads;
+       pwqinfo->pwq_runthreads = activecount;
+       pwqinfo->pwq_blockedthreads = wq->wq_threads_scheduled - activecount;
+       pwqinfo->pwq_state = 0;
+
+       if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
+               pwqinfo->pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+       }
+
+       if (wq->wq_nthreads >= wq_max_threads) {
+               pwqinfo->pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
+       }
+
+       workq_unlock(wq);
+       return error;
+}
+
+boolean_t
+workqueue_get_pwq_exceeded(void *v, boolean_t *exceeded_total,
+               boolean_t *exceeded_constrained)
+{
+       proc_t p = v;
+       struct proc_workqueueinfo pwqinfo;
+       int err;
+
+       assert(p != NULL);
+       assert(exceeded_total != NULL);
+       assert(exceeded_constrained != NULL);
+
+       err = fill_procworkqueue(p, &pwqinfo);
+       if (err) {
+               return FALSE;
+       }
+       if (!(pwqinfo.pwq_state & WQ_FLAGS_AVAILABLE)) {
+               return FALSE;
+       }
+
+       *exceeded_total = (pwqinfo.pwq_state & WQ_EXCEEDED_TOTAL_THREAD_LIMIT);
+       *exceeded_constrained = (pwqinfo.pwq_state & WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT);
+
+       return TRUE;
+}
+
+uint32_t
+workqueue_get_pwq_state_kdp(void * v)
+{
+       static_assert((WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT << 17) ==
+                       kTaskWqExceededConstrainedThreadLimit);
+       static_assert((WQ_EXCEEDED_TOTAL_THREAD_LIMIT << 17) ==
+                       kTaskWqExceededTotalThreadLimit);
+       static_assert((WQ_FLAGS_AVAILABLE << 17) == kTaskWqFlagsAvailable);
+       static_assert((WQ_FLAGS_AVAILABLE | WQ_EXCEEDED_TOTAL_THREAD_LIMIT |
+                               WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT) == 0x7);
+
+       if (v == NULL) {
+               return 0;
+       }
+
+       proc_t p = v;
+       struct workqueue *wq = proc_get_wqptr(p);
+
+       if (wq == NULL || workq_lock_spin_is_acquired_kdp(wq)) {
+               return 0;
+       }
+
+       uint32_t pwq_state = WQ_FLAGS_AVAILABLE;
+
+       if (wq->wq_constrained_threads_scheduled >= wq_max_constrained_threads) {
+               pwq_state |= WQ_EXCEEDED_CONSTRAINED_THREAD_LIMIT;
+       }
+
+       if (wq->wq_nthreads >= wq_max_threads) {
+               pwq_state |= WQ_EXCEEDED_TOTAL_THREAD_LIMIT;
+       }
+
+       return pwq_state;
+}
+
+void
+workq_init(void)
+{
+       workq_lck_grp_attr = lck_grp_attr_alloc_init();
+       workq_lck_attr = lck_attr_alloc_init();
+       workq_lck_grp = lck_grp_alloc_init("workq", workq_lck_grp_attr);
+
+       workq_zone_workqueue = zinit(sizeof(struct workqueue),
+                       1024 * sizeof(struct workqueue), 8192, "workq.wq");
+       workq_zone_threadreq = zinit(sizeof(struct workq_threadreq_s),
+                       1024 * sizeof(struct workq_threadreq_s), 8192, "workq.threadreq");
+
+       clock_interval_to_absolutetime_interval(wq_stalled_window.usecs,
+                       NSEC_PER_USEC, &wq_stalled_window.abstime);
+       clock_interval_to_absolutetime_interval(wq_reduce_pool_window.usecs,
+                       NSEC_PER_USEC, &wq_reduce_pool_window.abstime);
+       clock_interval_to_absolutetime_interval(wq_max_timer_interval.usecs,
+                       NSEC_PER_USEC, &wq_max_timer_interval.abstime);
+}
diff --git a/bsd/pthread/workqueue_internal.h b/bsd/pthread/workqueue_internal.h
new file mode 100644 (file)
index 0000000..a072d35
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2014 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _WORKQUEUE_INTERNAL_H_
+#define _WORKQUEUE_INTERNAL_H_
+
+// Sometimes something gets passed a bucket number and we need a way to express
+// that it's actually the event manager.  Use the (0)th bucket for that.
+#define WORKQ_THREAD_QOS_MIN        (THREAD_QOS_MAINTENANCE)
+#define WORKQ_THREAD_QOS_MAX        (THREAD_QOS_LAST)
+#define WORKQ_THREAD_QOS_CLEANUP    (THREAD_QOS_LEGACY)
+#define WORKQ_THREAD_QOS_ABOVEUI    (THREAD_QOS_LAST)
+#define WORKQ_THREAD_QOS_MANAGER    (THREAD_QOS_LAST + 1) // outside of MIN/MAX
+
+#define WORKQ_NUM_QOS_BUCKETS       (WORKQ_THREAD_QOS_MAX - 1)  // MT/BG shared
+#define WORKQ_NUM_BUCKETS           (WORKQ_NUM_QOS_BUCKETS + 1) // + mgr
+
+/* These definitions are only available to the kext, to avoid bleeding
+ * constants and types across the boundary to the userspace library.
+ */
+#ifdef KERNEL
+#pragma mark wq structs
+
+/* These defines come from kern/thread.h but are XNU_KERNEL_PRIVATE so do not get
+ * exported to kernel extensions.
+ */
+#define SCHED_CALL_BLOCK 0x1
+#define SCHED_CALL_UNBLOCK 0x2
+
+/* old workq priority scheme */
+
+#define WORKQUEUE_HIGH_PRIOQUEUE    0       /* high priority queue */
+#define WORKQUEUE_DEFAULT_PRIOQUEUE 1       /* default priority queue */
+#define WORKQUEUE_LOW_PRIOQUEUE     2       /* low priority queue */
+#define WORKQUEUE_BG_PRIOQUEUE      3       /* background priority queue */
+
+/* wq_max_constrained_threads = max(64, N_CPU * WORKQUEUE_CONSTRAINED_FACTOR)
+ * This used to be WORKQ_NUM_BUCKETS + 1 when NUM_BUCKETS was 4, yielding
+ * N_CPU * 5. When NUM_BUCKETS changed, we decided that the limit should
+ * not change. So the factor is now always 5.
+ */
+#define WORKQUEUE_CONSTRAINED_FACTOR 5
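+
+/* Worked through the formula above: an 8-CPU machine gets
+ * max(64, 8 * 5) = 64 constrained threads, a 24-CPU machine gets
+ * max(64, 24 * 5) = 120.
+ */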
+
+#if BSD_KERNEL_PRIVATE
+#include <kern/priority_queue.h>
+#include <kern/thread_call.h>
+#include <kern/turnstile.h>
+#include <mach/kern_return.h>
+#include <sys/queue.h>
+#include <sys/kernel_types.h>
+
+/* struct uthread::uu_workq_flags */
+#define UT_WORKQ_NEW                   0x01 /* First return to userspace */
+#define UT_WORKQ_RUNNING               0x02 /* On thrunlist, not parked. */
+#define UT_WORKQ_DYING                 0x04 /* Thread is being killed */
+#define UT_WORKQ_OVERCOMMIT            0x08 /* Overcommit thread. */
+#define UT_WORKQ_OUTSIDE_QOS           0x10 /* Thread should avoid sending QoS changes to the kernel */
+#define UT_WORKQ_IDLE_CLEANUP          0x20 /* Thread is removing its voucher or stack */
+#define UT_WORKQ_EARLY_BOUND           0x40 /* Thread has been bound early */
+#define UT_WORKQ_CPUPERCENT            0x80 /* Thread has CPU percent policy active */
+
+typedef union workq_threadreq_param_s {
+       struct {
+               uint16_t trp_flags;
+               uint8_t trp_pri;
+               uint8_t trp_pol;
+               uint32_t trp_cpupercent: 8,
+                               trp_refillms: 24;
+       };
+       uint64_t trp_value;
+} workq_threadreq_param_t;
+
+#define TRP_PRIORITY           0x1
+#define TRP_POLICY                     0x2
+#define TRP_CPUPERCENT         0x4
+#define TRP_RELEASED           0x8000
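
A small sketch of how the union above travels as a single 64-bit trp_value (illustrative only; the helper name is not part of this change):

/* Illustrative only: encode a fixed scheduling priority + policy request. */
static inline uint64_t
trp_encode_example(uint8_t pri, uint8_t pol)
{
        workq_threadreq_param_t trp = { };
        trp.trp_flags = TRP_PRIORITY | TRP_POLICY;
        trp.trp_pri   = pri;    /* raw kernel scheduling priority         */
        trp.trp_pol   = pol;    /* scheduling policy selector             */
        return trp.trp_value;   /* flags, pri, pol packed in one uint64_t */
}
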
+
+typedef struct workq_threadreq_s {
+       union {
+               struct priority_queue_entry tr_entry;
+               thread_t tr_binding_thread;
+       };
+       uint32_t     tr_flags;
+       uint8_t      tr_state;
+       thread_qos_t tr_qos;
+       uint16_t     tr_count;
+} *workq_threadreq_t;
+
+TAILQ_HEAD(threadreq_head, workq_threadreq_s);
+
+#define TR_STATE_IDLE          0  /* request isn't in flight       */
+#define TR_STATE_NEW           1  /* request is being initiated    */
+#define TR_STATE_QUEUED                2  /* request is being queued       */
+#define TR_STATE_BINDING       4  /* request is preposted for bind */
+
+#define TR_FLAG_KEVENT                 0x01
+#define TR_FLAG_WORKLOOP               0x02
+#define TR_FLAG_OVERCOMMIT             0x04
+#define TR_FLAG_WL_PARAMS              0x08
+#define TR_FLAG_WL_OUTSIDE_QOS 0x10
+
+#if defined(__LP64__)
+typedef unsigned __int128 wq_thactive_t;
+#else
+typedef uint64_t wq_thactive_t;
+#endif
+
+typedef enum {
+       WQ_EXITING                  = 0x0001,
+       WQ_PROC_SUSPENDED           = 0x0002,
+       WQ_DEATH_CALL_SCHEDULED     = 0x0004,
+
+       WQ_DELAYED_CALL_SCHEDULED   = 0x0010,
+       WQ_DELAYED_CALL_PENDED      = 0x0020,
+       WQ_IMMEDIATE_CALL_SCHEDULED = 0x0040,
+       WQ_IMMEDIATE_CALL_PENDED    = 0x0080,
+} workq_state_flags_t;
+
+TAILQ_HEAD(workq_uthread_head, uthread);
+
+struct workqueue {
+       thread_call_t   wq_delayed_call;
+       thread_call_t   wq_immediate_call;
+       thread_call_t   wq_death_call;
+       struct turnstile *wq_turnstile;
+
+       lck_spin_t      wq_lock;
+
+       uint64_t        wq_thread_call_last_run;
+       struct os_refcnt wq_refcnt;
+       workq_state_flags_t _Atomic wq_flags;
+       uint32_t        wq_fulfilled;
+       uint32_t        wq_creations;
+       uint32_t        wq_timer_interval;
+       uint32_t        wq_event_manager_priority;
+       uint32_t        wq_reqcount;  /* number of elements on the wq_*_reqlists */
+       uint16_t        wq_thdying_count;
+       uint16_t        wq_threads_scheduled;
+       uint16_t        wq_constrained_threads_scheduled;
+       uint16_t        wq_nthreads;
+       uint16_t        wq_thidlecount;
+       uint16_t        wq_thscheduled_count[WORKQ_NUM_BUCKETS]; // incl. manager
+
+       _Atomic wq_thactive_t wq_thactive;
+       _Atomic uint64_t wq_lastblocked_ts[WORKQ_NUM_QOS_BUCKETS];
+
+       struct proc    *wq_proc;
+       struct uthread *wq_creator;
+       thread_t wq_turnstile_updater; // thread doing a turnstile_update_inheritor
+       struct workq_uthread_head wq_thrunlist;
+       struct workq_uthread_head wq_thnewlist;
+       struct workq_uthread_head wq_thidlelist;
+
+       struct priority_queue wq_overcommit_queue;
+       struct priority_queue wq_constrained_queue;
+       struct priority_queue wq_special_queue;
+       workq_threadreq_t wq_event_manager_threadreq;
+};
+
+static_assert(offsetof(struct workqueue, wq_lock) >= sizeof(struct queue_entry),
+               "Make sure workq_deallocate_enqueue can cast the workqueue");
+
+#define WORKQUEUE_MAXTHREADS           512
+#define WQ_STALLED_WINDOW_USECS                200
+#define WQ_REDUCE_POOL_WINDOW_USECS    5000000
+#define WQ_MAX_TIMER_INTERVAL_USECS    50000
+
+#pragma mark definitions
+
+struct kqrequest;
+uint32_t _get_pwq_state_kdp(proc_t p);
+
+void workq_exit(struct proc *p);
+void workq_mark_exiting(struct proc *p);
+
+bool workq_is_exiting(struct proc *p);
+
+struct turnstile *workq_turnstile(struct proc *p);
+
+void workq_thread_set_max_qos(struct proc *p, struct kqrequest *kqr);
+
+void workq_thread_terminate(struct proc *p, struct uthread *uth);
+
+#define WORKQ_THREADREQ_SET_AST_ON_FAILURE  0x01
+#define WORKQ_THREADREQ_ATTEMPT_REBIND      0x02
+#define WORKQ_THREADREQ_CAN_CREATE_THREADS  0x04
+#define WORKQ_THREADREQ_CREATOR_TRANSFER    0x08
+#define WORKQ_THREADREQ_CREATOR_SYNC_UPDATE 0x10
+
+// called with the kq req lock held
+bool workq_kern_threadreq_initiate(struct proc *p, struct kqrequest *kqr,
+               struct turnstile *ts, thread_qos_t qos, int flags);
+
+// called with the kq req lock held
+void workq_kern_threadreq_modify(struct proc *p, struct kqrequest *kqr,
+               thread_qos_t qos, int flags);
+
+// called with the kq req lock held
+void workq_kern_threadreq_update_inheritor(struct proc *p, struct kqrequest *kqr,
+               thread_t owner, struct turnstile *ts, turnstile_update_flags_t flags);
+
+void workq_kern_threadreq_lock(struct proc *p);
+void workq_kern_threadreq_unlock(struct proc *p);
+
+void workq_kern_threadreq_redrive(struct proc *p, int flags);
+
+enum workq_set_self_flags {
+       WORKQ_SET_SELF_QOS_FLAG = 0x1,
+       WORKQ_SET_SELF_VOUCHER_FLAG = 0x2,
+       WORKQ_SET_SELF_FIXEDPRIORITY_FLAG = 0x4,
+       WORKQ_SET_SELF_TIMESHARE_FLAG = 0x8,
+       WORKQ_SET_SELF_WQ_KEVENT_UNBIND = 0x10,
+};
+
+void workq_proc_suspended(struct proc *p);
+void workq_proc_resumed(struct proc *p);
+
+#endif // BSD_KERNEL_PRIVATE
+
+void workq_init(void);
+
+#endif // KERNEL
+
+#endif // _WORKQUEUE_INTERNAL_H_
diff --git a/bsd/pthread/workqueue_syscalls.h b/bsd/pthread/workqueue_syscalls.h
new file mode 100644 (file)
index 0000000..e860419
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2017 Apple, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _PTHREAD_WORKQUEUE_PRIVATE_H_
+#define _PTHREAD_WORKQUEUE_PRIVATE_H_
+
+#if XNU_KERNEL_PRIVATE && !defined(__PTHREAD_EXPOSE_INTERNALS__)
+#define __PTHREAD_EXPOSE_INTERNALS__ 1
+#endif // XNU_KERNEL_PRIVATE
+
+#ifdef __PTHREAD_EXPOSE_INTERNALS__
+/* workq_kernreturn commands */
+#define WQOPS_THREAD_RETURN        0x04        /* parks the thread back into the kernel */
+#define WQOPS_QUEUE_NEWSPISUPP     0x10        /* this is to check for newer SPI support */
+#define WQOPS_QUEUE_REQTHREADS     0x20        /* request number of threads of a prio */
+#define WQOPS_QUEUE_REQTHREADS2    0x30        /* request a number of threads in a given priority bucket */
+#define WQOPS_THREAD_KEVENT_RETURN 0x40        /* parks the thread after delivering the passed kevent array */
+#define WQOPS_SET_EVENT_MANAGER_PRIORITY 0x80  /* max() the provided priority into the priority of the event manager */
+#define WQOPS_THREAD_WORKLOOP_RETURN 0x100     /* parks the thread after delivering the passed kevent array */
+#define WQOPS_SHOULD_NARROW 0x200      /* checks whether we should narrow our concurrency */
+
+/* flag values for upcall flags field, only 8 bits per struct threadlist */
+#define WQ_FLAG_THREAD_PRIO_SCHED              0x00008000
+#define WQ_FLAG_THREAD_PRIO_QOS                        0x00004000
+#define WQ_FLAG_THREAD_PRIO_MASK               0x00000fff
+
+#define WQ_FLAG_THREAD_OVERCOMMIT              0x00010000  /* thread is with overcommit prio */
+#define WQ_FLAG_THREAD_REUSE                   0x00020000  /* thread is being reused */
+#define WQ_FLAG_THREAD_NEWSPI                  0x00040000  /* the call is with new SPIs */
+#define WQ_FLAG_THREAD_KEVENT                  0x00080000  /* thread is response to kevent req */
+#define WQ_FLAG_THREAD_EVENT_MANAGER   0x00100000  /* event manager thread */
+#define WQ_FLAG_THREAD_TSD_BASE_SET            0x00200000  /* tsd base has already been set */
+#define WQ_FLAG_THREAD_WORKLOOP                        0x00400000  /* workloop thread */
+#define WQ_FLAG_THREAD_OUTSIDEQOS              0x00800000  /* thread qos changes should not be sent to kernel */
+
+#define WQ_KEVENT_LIST_LEN  16 // WORKQ_KEVENT_EVENT_BUFFER_LEN
+#define WQ_KEVENT_DATA_SIZE (32 * 1024)
+
+/* kqueue_workloop_ctl commands */
+#define KQ_WORKLOOP_CREATE                             0x01
+#define KQ_WORKLOOP_DESTROY                            0x02
+
+/* indicate which fields of kq_workloop_create params are valid */
+#define KQ_WORKLOOP_CREATE_SCHED_PRI   0x01
+#define KQ_WORKLOOP_CREATE_SCHED_POL   0x02
+#define KQ_WORKLOOP_CREATE_CPU_PERCENT 0x04
+
+struct kqueue_workloop_params {
+       int kqwlp_version;
+       int kqwlp_flags;
+       uint64_t kqwlp_id;
+       int kqwlp_sched_pri;
+       int kqwlp_sched_pol;
+       int kqwlp_cpu_percent;
+       int kqwlp_cpu_refillms;
+} __attribute__((packed));
+
+_Static_assert(offsetof(struct kqueue_workloop_params, kqwlp_version) == 0,
+               "kqwlp_version should be first");
+
+int
+__workq_open(void);
+
+int
+__workq_kernreturn(int op, void *arg2, int arg3, int arg4);
+
+int
+__kqueue_workloop_ctl(uintptr_t cmd, uint64_t options, void *addr, size_t sz);
+
+/* SPI flags between WQ and workq_setup_thread in pthread.kext */
+#define WQ_SETUP_FIRST_USE      1
+#define WQ_SETUP_CLEAR_VOUCHER  2
+// was  WQ_SETUP_SET_SCHED_CALL 4
+#define WQ_SETUP_EXIT_THREAD    8
+
+#endif // __PTHREAD_EXPOSE_INTERNALS__
+#endif // _PTHREAD_WORKQUEUE_PRIVATE_H_
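
Editor's note: the header above exposes the packed parameter block consumed by the private __kqueue_workloop_ctl() trap. A minimal userspace sketch of that layout follows; the kqwlp_version value of 1 is an assumption not shown in this diff, the structure is mirrored locally rather than taken from an SDK header, and applications are not expected to call this SPI directly (libpthread/libdispatch are the real callers).

/*
 * Illustration only: mirrors the packed parameter block above so its layout
 * can be inspected from a normal userspace build.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define KQ_WORKLOOP_CREATE             0x01
#define KQ_WORKLOOP_CREATE_SCHED_PRI   0x01

struct kqueue_workloop_params {
        int      kqwlp_version;
        int      kqwlp_flags;
        uint64_t kqwlp_id;
        int      kqwlp_sched_pri;
        int      kqwlp_sched_pol;
        int      kqwlp_cpu_percent;
        int      kqwlp_cpu_refillms;
} __attribute__((packed));

int main(void)
{
        struct kqueue_workloop_params p = {
                .kqwlp_version   = 1,      /* assumed version, not shown in the diff */
                .kqwlp_flags     = KQ_WORKLOOP_CREATE_SCHED_PRI,
                .kqwlp_id        = 0x1234, /* arbitrary workloop identifier */
                .kqwlp_sched_pri = 47,     /* fixed scheduler priority */
        };

        /* The kernel-side static assert requires the version field at offset 0. */
        printf("offsetof(kqwlp_version) = %zu, sizeof(params) = %zu\n",
            offsetof(struct kqueue_workloop_params, kqwlp_version), sizeof(p));
        printf("cmd = KQ_WORKLOOP_CREATE (0x%x), flags = 0x%x\n",
            KQ_WORKLOOP_CREATE, p.kqwlp_flags);
        return 0;
}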
diff --git a/bsd/pthread/workqueue_trace.h b/bsd/pthread/workqueue_trace.h
new file mode 100644 (file)
index 0000000..6625798
--- /dev/null
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2017 Apple, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _WORKQUEUE_TRACE_H_
+#define _WORKQUEUE_TRACE_H_
+
+// General workqueue tracepoints, mostly for debugging
+#define WQ_TRACE_WORKQUEUE_SUBCLASS 1
+// Workqueue request scheduling tracepoints
+#define WQ_TRACE_REQUESTS_SUBCLASS 2
+// Generic pthread tracepoints
+#define WQ_TRACE_BSDTHREAD_SUBCLASS 16
+
+#define TRACE_wq_pthread_exit \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x01)
+#define TRACE_wq_workqueue_exit \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x02)
+#define TRACE_wq_runthread \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x03)
+#define TRACE_wq_death_call \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x05)
+#define TRACE_wq_thread_block \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x09)
+#define TRACE_wq_thactive_update \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0a)
+#define TRACE_wq_add_timer \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0b)
+#define TRACE_wq_start_add_timer \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x0c)
+#define TRACE_wq_override_dispatch \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x14)
+#define TRACE_wq_override_reset \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x15)
+#define TRACE_wq_thread_create_failed \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1d)
+#define TRACE_wq_thread_terminate \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1e)
+#define TRACE_wq_thread_create \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1f)
+#define TRACE_wq_select_threadreq \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x20)
+#define TRACE_wq_creator_select \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x23)
+#define TRACE_wq_creator_yield \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x24)
+#define TRACE_wq_constrained_admission \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x25)
+#define TRACE_wq_wqops_reqthreads \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x26)
+
+#define TRACE_wq_create \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x01)
+#define TRACE_wq_destroy \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x02)
+#define TRACE_wq_thread_logical_run \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x03)
+#define TRACE_wq_thread_request_initiate \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x05)
+#define TRACE_wq_thread_request_modify \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x06)
+#define TRACE_wq_thread_request_fulfill \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x08)
+
+#define TRACE_bsdthread_set_qos_self \
+               KDBG_CODE(DBG_PTHREAD, WQ_TRACE_BSDTHREAD_SUBCLASS, 0x1)
+
+#define WQ_TRACE(x,a,b,c,d,e) \
+               ({ KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e); })
+#define WQ_TRACE_WQ(x,wq,b,c,d,e) \
+               ({ KERNEL_DEBUG_CONSTANT(x, (wq)->wq_proc->p_pid, b, c, d, e); })
+
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+#define __wq_trace_only
+#else // (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+#define __wq_trace_only __unused
+#endif // (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
+
+#endif // _WORKQUEUE_TRACE_H_
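
Editor's note: the trace codes above are composed with KDBG_CODE(DBG_PTHREAD, subclass, code). A standalone sketch of that composition follows; the KDBG_CODE bit layout and the DBG_PTHREAD class value (9) are assumptions about <sys/kdebug.h>, while the subclass and code values are taken from this header.

/*
 * Standalone sketch of the debugid composition used by the tracepoints above.
 */
#include <stdint.h>
#include <stdio.h>

#define KDBG_CODE(cls, subcls, code) \
        ((((uint32_t)(cls) & 0xff) << 24) | (((uint32_t)(subcls) & 0xff) << 16) | \
         (((uint32_t)(code) & 0x3fff) << 2))

#define DBG_PTHREAD                  9   /* assumed class value */
#define WQ_TRACE_WORKQUEUE_SUBCLASS  1
#define WQ_TRACE_REQUESTS_SUBCLASS   2

#define TRACE_wq_thread_create \
        KDBG_CODE(DBG_PTHREAD, WQ_TRACE_WORKQUEUE_SUBCLASS, 0x1f)
#define TRACE_wq_thread_request_initiate \
        KDBG_CODE(DBG_PTHREAD, WQ_TRACE_REQUESTS_SUBCLASS, 0x05)

int main(void)
{
        /* Each tracepoint gets a unique debugid; the low two bits stay free
         * for the DBG_FUNC_START/DBG_FUNC_END qualifiers. */
        printf("TRACE_wq_thread_create           = 0x%08x\n", TRACE_wq_thread_create);
        printf("TRACE_wq_thread_request_initiate = 0x%08x\n", TRACE_wq_thread_request_initiate);
        return 0;
}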
index 830c1fc338dc53bf9db2986516d0a37d03011517..e91adedf42a92ad2ddfe19be7995f68573d60c06 100644 (file)
@@ -100,6 +100,13 @@ int                        audit_suspended;
 int                    audit_syscalls;
 au_class_t             audit_kevent_mask;
 
+/*
+ * The audit control mode is used to ensure configuration settings are only
+ * accepted from appropriate sources based on the current mode.
+ */
+au_ctlmode_t audit_ctl_mode;
+au_expire_after_t audit_expire_after;
+
 /*
  * Flags controlling behavior in low storage situations.  Should we panic if
  * a write fails?  Should we fail stop if we're out of disk space?
@@ -274,6 +281,7 @@ audit_record_dtor(struct kaudit_record *ar)
                free(ar->k_ar.ar_arg_argv, M_AUDITTEXT);
        if (ar->k_ar.ar_arg_envv != NULL)
                free(ar->k_ar.ar_arg_envv, M_AUDITTEXT);
+       audit_identity_info_destruct(&ar->k_ar.ar_arg_identity);
 }
 
 /*
@@ -294,6 +302,10 @@ audit_init(void)
        audit_in_failure = 0;
        audit_argv = 0;
        audit_arge = 0;
+       audit_ctl_mode = AUDIT_CTLMODE_NORMAL;
+       audit_expire_after.age = 0;
+       audit_expire_after.size = 0;
+       audit_expire_after.op_type = AUDIT_EXPIRE_OP_AND;
 
        audit_fstat.af_filesz = 0;      /* '0' means unset, unbounded. */
        audit_fstat.af_currsz = 0;
@@ -610,7 +622,7 @@ audit_syscall_enter(unsigned int code, proc_t proc, struct uthread *uthread)
         * the syscall table(s).  This table is generated by makesyscalls.sh
         * from syscalls.master and stored in audit_kevents.c.
         */
-       if (code > nsysent)
+       if (code >= nsysent)
                return;
        event = sys_au_event[code];
        if (event == AUE_NULL)
@@ -668,6 +680,14 @@ audit_syscall_enter(unsigned int code, proc_t proc, struct uthread *uthread)
                        uthread->uu_ar = audit_new(event, proc, uthread);
        } 
 
+       /*
+        * All audited events will contain an identity token.
+        *
+        * Note: The identity must be captured before the syscall implementation
+        * runs, to handle cases like execve(2) where the process identity changes.
+        */
+       AUDIT_ARG(identity);
+
 out:
        kauth_cred_unref(&cred);
 }
index 61e818f6089b4668cadc99fd8f503507d798629d..6a60b36c6fabca79f5114a682ac55dd4a2a5036a 100644 (file)
@@ -130,6 +130,7 @@ extern int  audit_syscalls;
 #define        ARG_DATA                0x0010000000000000ULL   /* darwin-only */
 #define        ARG_ADDR64              0x0020000000000000ULL   /* darwin-only */
 #define        ARG_FD2                 0x0040000000000000ULL   /* darwin-only */
+#define        ARG_IDENTITY            0x0080000000000000ULL   /* darwin-only */
 #define        ARG_NONE                0x0000000000000000ULL
 #define        ARG_ALL                 0xFFFFFFFFFFFFFFFFULL
 
@@ -242,6 +243,7 @@ void         audit_arg_argv(struct kaudit_record *ar, char *argv, int argc,
                int length); 
 void    audit_arg_envv(struct kaudit_record *ar, char *envv, int envc,
                int length);
+void   audit_arg_identity(struct kaudit_record *ar);
 
 void    audit_arg_mach_port1(struct kaudit_record *ar, mach_port_name_t port);
 void    audit_arg_mach_port2(struct kaudit_record *ar, mach_port_name_t port);
@@ -323,7 +325,7 @@ extern au_event_t sys_au_event[];
        if (AUDIT_SYSCALLS()) {                                         \
                struct kaudit_record *__ar = AUDIT_RECORD();            \
                if (AUDIT_AUDITING(__ar))                               \
-                       audit_arg_ ## op (__ar, args);                  \
+                       audit_arg_ ## op (__ar, ## args);               \
        }                                                               \
 } while (0)
 
index 7e338fd2ba77ba029dab393f4e64cc0502bdfb4b..950d1f49ff4d50140b42a83ba49c0ce93ea2f257 100644 (file)
@@ -59,6 +59,8 @@
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/socketvar.h>
+#include <sys/codesign.h>
+#include <sys/ubc.h>
 
 #include <bsm/audit.h>
 #include <bsm/audit_internal.h>
@@ -900,4 +902,91 @@ audit_sysclose(struct kaudit_record *ar, proc_t p, int fd)
        fp_drop(p, fd, fp, 0);
 }
 
+void
+audit_identity_info_destruct(struct au_identity_info *id_info)
+{
+       if (!id_info) {
+               return;
+       }
+
+       if (id_info->signing_id != NULL) {
+               free(id_info->signing_id, M_AUDITTEXT);
+               id_info->signing_id = NULL;
+       }
+
+       if (id_info->team_id != NULL) {
+               free(id_info->team_id, M_AUDITTEXT);
+               id_info->team_id = NULL;
+       }
+
+       if (id_info->cdhash != NULL) {
+               free(id_info->cdhash, M_AUDITDATA);
+               id_info->cdhash = NULL;
+       }
+}
+
+void
+audit_identity_info_construct(struct au_identity_info *id_info)
+{
+       struct proc *p;
+       struct cs_blob *blob;
+       unsigned int signer_type = 0;
+       const char *signing_id = NULL;
+       const char* team_id = NULL;
+       const uint8_t *cdhash = NULL;
+       size_t src_len = 0;
+
+       p = current_proc();
+       blob = csproc_get_blob(p);
+       if (blob) {
+               signing_id = csblob_get_identity(blob);
+               cdhash = csblob_get_cdhash(blob);
+               team_id = csblob_get_teamid(blob);
+               signer_type = csblob_get_platform_binary(blob) ? 1 : 0;
+       }
+
+       id_info->signer_type = signer_type;
+
+       if (id_info->signing_id == NULL && signing_id != NULL) {
+               id_info->signing_id = malloc( MAX_AU_IDENTITY_SIGNING_ID_LENGTH,
+                       M_AUDITTEXT, M_WAITOK);
+               if (id_info->signing_id != NULL) {
+                       src_len = strlcpy(id_info->signing_id,
+                               signing_id, MAX_AU_IDENTITY_SIGNING_ID_LENGTH);
+
+                       if (src_len >= MAX_AU_IDENTITY_SIGNING_ID_LENGTH) {
+                               id_info->signing_id_trunc = 1;
+                       }
+               }
+       }
+
+       if (id_info->team_id == NULL && team_id != NULL) {
+               id_info->team_id = malloc(MAX_AU_IDENTITY_TEAM_ID_LENGTH,
+                       M_AUDITTEXT, M_WAITOK);
+               if (id_info->team_id != NULL) {
+                       src_len = strlcpy(id_info->team_id, team_id,
+                               MAX_AU_IDENTITY_TEAM_ID_LENGTH);
+
+                       if (src_len >= MAX_AU_IDENTITY_TEAM_ID_LENGTH) {
+                               id_info->team_id_trunc = 1;
+                       }
+               }
+       }
+
+       if (id_info->cdhash == NULL && cdhash != NULL) {
+               id_info->cdhash = malloc(CS_CDHASH_LEN, M_AUDITDATA, M_WAITOK);
+               if (id_info->cdhash != NULL) {
+                       memcpy(id_info->cdhash, cdhash, CS_CDHASH_LEN);
+                       id_info->cdhash_len = CS_CDHASH_LEN;
+               }
+       }
+}
+
+void
+audit_arg_identity(struct kaudit_record *ar)
+{
+       audit_identity_info_construct(&ar->k_ar.ar_arg_identity);
+       ARG_SET_VALID(ar, ARG_IDENTITY);
+}
+
 #endif /* CONFIG_AUDIT */
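
Editor's note: audit_identity_info_construct() above detects truncation of the signing and team identifiers by comparing strlcpy()'s return value (the length of the source string) against the fixed destination size. A minimal standalone sketch of that idiom, with a shrunken buffer standing in for MAX_AU_IDENTITY_SIGNING_ID_LENGTH:

/*
 * strlcpy() returns the source string length, so a value >= the destination
 * size means the identifier was cut short.
 */
#include <stdio.h>
#include <string.h>

#define ID_BUF_LEN 16   /* stand-in for MAX_AU_IDENTITY_SIGNING_ID_LENGTH */

int main(void)
{
        char buf[ID_BUF_LEN];
        const char *signing_id = "com.example.a.very.long.signing.identifier";
        unsigned char truncated = 0;

        size_t src_len = strlcpy(buf, signing_id, sizeof(buf));
        if (src_len >= sizeof(buf)) {
                truncated = 1;   /* recorded in the token as signing_id_trunc */
        }

        printf("stored \"%s\", truncated=%u\n", buf, truncated);
        return 0;
}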
index 64b4e9f05a8e0b59ceff4ada57e30aa30d62e269..5a6ea37507c10fe13c35eef449107753de660913 100644 (file)
@@ -234,8 +234,9 @@ _audit_free(void *addr, __unused au_malloc_type_t *type)
                return;
        hdr = addr; hdr--;
 
-       KASSERT(hdr->mh_magic == AUDIT_MHMAGIC,
-           ("_audit_free(): hdr->mh_magic != AUDIT_MHMAGIC"));
+       if (hdr->mh_magic != AUDIT_MHMAGIC) {
+           panic("_audit_free(): hdr->mh_magic (%lx) != AUDIT_MHMAGIC", hdr->mh_magic);
+       }
 
 #if AUDIT_MALLOC_DEBUG
        if (type != NULL) {
index edebfd61bef521369ee8c08e81c429ae3f21cde7..60c8dbf149a3453a746cd819161c69c6e16dbf33 100644 (file)
@@ -263,6 +263,18 @@ kau_free(struct au_record *rec)
        }                                                               \
 } while (0)
 
+#define        VNODE2_PATH_TOKENS do {                                 \
+       if (ARG_IS_VALID(kar, ARG_KPATH2)) {                            \
+               tok = au_to_path(ar->ar_arg_kpath2);                    \
+               kau_write(rec, tok);                                    \
+       }                                                               \
+       if (ARG_IS_VALID(kar, ARG_VNODE2)) {                            \
+               tok = au_to_attr32(&ar->ar_arg_vnode2);                 \
+               kau_write(rec, tok);                                    \
+               MAC_VNODE2_LABEL_TOKEN;                                 \
+       }                                                               \
+} while (0)
+
 #define        FD_VNODE1_TOKENS do {                                           \
        if (ARG_IS_VALID(kar, ARG_VNODE1)) {                            \
                if (ARG_IS_VALID(kar, ARG_KPATH1)) {                    \
@@ -983,6 +995,12 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
                        kau_write(rec, tok);
                }
                UPATH1_VNODE1_TOKENS;
+               VNODE2_PATH_TOKENS;
+               if (ARG_IS_VALID(kar, ARG_DATA)) {
+                       tok = au_to_data(AUP_HEX, ar->ar_arg_data_type,
+                           ar->ar_arg_data_count, ar->ar_arg_data);
+                       kau_write(rec, tok);
+               }
                break;
 
        case AUE_FCHMOD_EXTENDED:
@@ -2020,6 +2038,14 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
        tok = au_to_return32(au_errno_to_bsm(ar->ar_errno), ar->ar_retval);
        kau_write(rec, tok);  /* Every record gets a return token */
 
+       if (ARG_IS_VALID(kar, ARG_IDENTITY)) {
+               struct au_identity_info *id = &ar->ar_arg_identity;
+               tok = au_to_identity(id->signer_type, id->signing_id,
+                       id->signing_id_trunc, id->team_id, id->team_id_trunc,
+                       id->cdhash, id->cdhash_len);
+               kau_write(rec, tok);
+       }
+
        kau_close(rec, &ar->ar_endtime, ar->ar_event);
 
        *pau = rec;
@@ -2027,25 +2053,47 @@ kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau)
 }
 
 /*
- * Verify that a record is a valid BSM record. This verification is simple
- * now, but may be expanded on sometime in the future.  Return 1 if the
+ * Verify that a record is a valid BSM record. Return 1 if the
  * record is good, 0 otherwise.
  */
 int
-bsm_rec_verify(void *rec)
+bsm_rec_verify(void *rec, int length)
 {
-       char c = *(char *)rec;
+       /* Used to partially deserialize the buffer */
+       struct hdr_tok_partial *hdr;
+       struct trl_tok_partial *trl;
 
-       /*
-        * Check the token ID of the first token; it has to be a header
-        * token.
-        *
-        * XXXAUDIT There needs to be a token structure to map a token.
-        * XXXAUDIT 'Shouldn't be simply looking at the first char.
-        */
-       if ((c != AUT_HEADER32) && (c != AUT_HEADER32_EX) &&
-           (c != AUT_HEADER64) && (c != AUT_HEADER64_EX))
+       /* A record requires a complete header and trailer token */
+       if (length < (AUDIT_HEADER_SIZE + AUDIT_TRAILER_SIZE)) {
+               return (0);
+       }
+
+       hdr = (struct hdr_tok_partial*)rec;
+
+       /* Ensure the provided length matches what the record shows */
+       if ((uint32_t)length != ntohl(hdr->len)) {
+               return (0);
+       }
+
+       trl = (struct trl_tok_partial*)(rec + (length - AUDIT_TRAILER_SIZE));
+
+       /* Ensure the buffer contains what looks like header and trailer tokens */
+       if (((hdr->type != AUT_HEADER32) && (hdr->type != AUT_HEADER32_EX) &&
+           (hdr->type != AUT_HEADER64) && (hdr->type != AUT_HEADER64_EX)) ||
+           (trl->type != AUT_TRAILER)) {
                return (0);
+       }
+
+       /* Ensure the header and trailer agree on the length */
+       if (hdr->len != trl->len) {
+               return (0);
+       }
+
+       /* Ensure the trailer token has a proper magic value */
+       if (ntohs(trl->magic) != AUT_TRAILER_MAGIC) {
+               return (0);
+       }
+
        return (1);
 }
 #endif /* CONFIG_AUDIT */
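
Editor's note: the rewritten bsm_rec_verify() no longer trusts the first byte alone; the caller-supplied length must cover a complete header and trailer token and must match the lengths recorded in both. A hedged, userspace-runnable sketch of the same checks follows. The token IDs, trailer magic, and the 18/7-byte header/trailer sizes are assumed values for <bsm/audit_record.h>, and only the 32-bit header type is checked for brevity.

/*
 * Userspace sketch of the validation above; the partial token layouts mirror
 * audit_private.h.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define AUT_HEADER32        0x14     /* assumed token ID */
#define AUT_TRAILER         0x13     /* assumed token ID */
#define AUT_TRAILER_MAGIC   0xb105   /* assumed magic */
#define AUDIT_HEADER_SIZE   18       /* assumed header token size */
#define AUDIT_TRAILER_SIZE  7

struct __attribute__((__packed__)) hdr_tok_partial { uint8_t type; uint32_t len; };
struct __attribute__((__packed__)) trl_tok_partial { uint8_t type; uint16_t magic; uint32_t len; };

static int rec_verify(const uint8_t *rec, int length)
{
        struct hdr_tok_partial hdr;
        struct trl_tok_partial trl;

        if (length < AUDIT_HEADER_SIZE + AUDIT_TRAILER_SIZE)
                return 0;
        memcpy(&hdr, rec, sizeof(hdr));
        memcpy(&trl, rec + length - AUDIT_TRAILER_SIZE, sizeof(trl));

        if ((uint32_t)length != ntohl(hdr.len))   /* caller length vs. header */
                return 0;
        if (hdr.type != AUT_HEADER32 || trl.type != AUT_TRAILER)
                return 0;
        if (hdr.len != trl.len)                   /* header vs. trailer (raw) */
                return 0;
        if (ntohs(trl.magic) != AUT_TRAILER_MAGIC)
                return 0;
        return 1;
}

int main(void)
{
        uint8_t bogus[32] = { 0 };
        printf("bogus record verifies: %d\n", rec_verify(bogus, (int)sizeof(bogus)));
        return 0;
}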
index 4a8187f4450bc478f3c975831d46e41de5f0837e..30787a14c5cdc065afd611c759a5cdd3402aa60b 100644 (file)
@@ -53,6 +53,7 @@
 #include <security/audit/audit.h>
 #include <security/audit/audit_bsd.h>
 #include <security/audit/audit_private.h>
+#include <IOKit/IOBSD.h>
 
 #if CONFIG_AUDIT
 /*
@@ -103,10 +104,48 @@ out:
        return (class);
 }
 
+/*
+ * Return a new class mask that allows changing the reserved class bit
+ * only if the current task is entitled to do so or if this is being done
+ * from the kernel task. If the current task is not allowed to make the
+ * change, the reserved bit is reverted to its previous state and the rest
+ * of the mask is left intact.
+ */
+static au_class_t
+au_class_protect(au_class_t old_class, au_class_t new_class)
+{
+       au_class_t result = new_class;
+
+       /* Check if the reserved class bit has been flipped */
+       if ((old_class & AU_CLASS_MASK_RESERVED) !=
+               (new_class & AU_CLASS_MASK_RESERVED)) {
+
+               task_t task = current_task();
+               if (task != kernel_task &&
+                       !IOTaskHasEntitlement(task, AU_CLASS_RESERVED_ENTITLEMENT)) {
+                       /*
+                        * If the caller isn't entitled, revert the class bit:
+                        * - First remove the reserved bit from the new_class mask
+                        * - Next get the state of the old_class mask's reserved bit
+                        * - Finally, OR the result from the first two operations
+                        */
+                       result = (new_class & ~AU_CLASS_MASK_RESERVED) |
+                               (old_class & AU_CLASS_MASK_RESERVED);
+               }
+       }
+
+       return result;
+}
+
 /*
  * Insert a event to class mapping. If the event already exists in the
  * mapping, then replace the mapping with the new one.
  *
+ * IMPORTANT: This function should only be called from the kernel during
+ * initialization (e.g. during au_evclassmap_init). Calling afterwards can
+ * have adverse effects on other system components that rely on event/class
+ * map state.
+ *
  * XXX There is currently no constraints placed on the number of mappings.
  * May want to either limit to a number, or in terms of memory usage.
  */
@@ -135,7 +174,7 @@ au_evclassmap_insert(au_event_t event, au_class_t class)
        evcl = &evclass_hash[event % EVCLASSMAP_HASH_TABLE_SIZE];
        LIST_FOREACH(evc, &evcl->head, entry) {
                if (evc->event == event) {
-                       evc->class = class;
+                       evc->class = au_class_protect(evc->class, class);
                        EVCLASS_WUNLOCK();
                        free(evc_new, M_AUDITEVCLASS);
                        return;
@@ -143,7 +182,11 @@ au_evclassmap_insert(au_event_t event, au_class_t class)
        }
        evc = evc_new;
        evc->event = event;
-       evc->class = class;
+       /*
+        * Mappings that require a new element must use 0 as the "old_class" since
+        * there is no previous state.
+        */
+       evc->class = au_class_protect(0, class);
        LIST_INSERT_HEAD(&evcl->head, evc, entry);
        EVCLASS_WUNLOCK();
 }
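
Editor's note: au_class_protect() lets an unentitled caller change every bit of an event's class mask except the reserved bit, which is reverted to its previous state. A standalone illustration of that mask arithmetic follows; the reserved-bit value used here is a placeholder, not the real AU_CLASS_MASK_RESERVED definition.

/*
 * Standalone illustration of the reserved-bit preservation logic.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint32_t au_class_t;
#define AU_CLASS_MASK_RESERVED 0x10000000u   /* hypothetical value */

static au_class_t
class_protect(au_class_t old_class, au_class_t new_class, int entitled)
{
        au_class_t result = new_class;

        /* Revert the reserved bit only when it changed and the caller may not. */
        if (((old_class ^ new_class) & AU_CLASS_MASK_RESERVED) && !entitled) {
                result = (new_class & ~AU_CLASS_MASK_RESERVED) |
                    (old_class & AU_CLASS_MASK_RESERVED);
        }
        return result;
}

int main(void)
{
        au_class_t old_class = 0x10000003;   /* reserved bit currently set */
        au_class_t new_class = 0x00000007;   /* attempts to clear it */

        printf("unentitled: 0x%08x\n", class_protect(old_class, new_class, 0));
        printf("entitled:   0x%08x\n", class_protect(old_class, new_class, 1));
        return 0;
}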
index cf0b781a8e070b76bcf1caf7de95fc7a2763fda4..4873495958e7a765c02a0bb59616ca8a04b0716c 100644 (file)
@@ -1158,7 +1158,6 @@ au_to_exec_strings(const char *strs, int count, u_char type)
 token_t *
 au_to_exec_args(char *args, int argc)
 {
-
        return (au_to_exec_strings(args, argc, AUT_EXEC_ARGS));
 }
 
@@ -1170,9 +1169,30 @@ au_to_exec_args(char *args, int argc)
 token_t *
 au_to_exec_env(char *envs, int envc)
 {
-
        return (au_to_exec_strings(envs, envc, AUT_EXEC_ENV));
 }
+
+/*
+ * token ID         1 byte
+ * count            4 bytes
+ * text             count null-terminated strings
+ */
+token_t *
+au_to_certificate_hash(char *hashes, int hashc)
+{
+       return (au_to_exec_strings(hashes, hashc, AUT_CERT_HASH));
+}
+
+/*
+ * token ID         1 byte
+ * count            4 bytes
+ * text             count null-terminated strings
+ */
+token_t *
+au_to_krb5_principal(char *principals, int princ)
+{
+       return (au_to_exec_strings(principals, princ, AUT_KRB5_PRINCIPAL));
+}
 #else
 /*
  * token ID        1 byte
@@ -1273,6 +1293,69 @@ au_to_exec_env(char **envp)
 }
 #endif  /* !(defined(_KERNEL) || defined(KERNEL)) */
 
+/*
+ * token ID             1 byte
+ * signer type          4 bytes
+ * signer id length     2 bytes
+ * signer id            n bytes
+ * signer id truncated  1 byte
+ * team id length       2 bytes
+ * team id              n bytes
+ * team id truncated    1 byte
+ * cdhash length        2 bytes
+ * cdhash               n bytes
+ */
+token_t*
+au_to_identity(uint32_t signer_type, const char* signing_id,
+       u_char signing_id_trunc, const char* team_id, u_char team_id_trunc,
+       uint8_t* cdhash, uint16_t cdhash_len)
+{
+       token_t *t = NULL;
+       u_char *dptr = NULL;
+       size_t signing_id_len = 0;
+       size_t team_id_len = 0;
+       size_t totlen = 0;
+
+       if (signing_id) {
+               signing_id_len = strlen(signing_id);
+       }
+
+       if (team_id) {
+               team_id_len = strlen(team_id);
+       }
+
+       totlen =
+               sizeof(u_char) +    // token id
+               sizeof(uint32_t) +  // signer type
+               sizeof(uint16_t) +  // signing id length
+               signing_id_len +    // length of signing id to copy
+               sizeof(u_char) +    // null terminator for signing id
+               sizeof(u_char) +    // if signing id truncated
+               sizeof(uint16_t) +  // team id length
+               team_id_len +       // length of team id to copy
+               sizeof(u_char) +    // null terminator for team id
+               sizeof(u_char) +    // if team id truncated
+               sizeof(uint16_t) +  // cdhash length
+               cdhash_len;         // cdhash buffer
+
+       GET_TOKEN_AREA(t, dptr, totlen);
+
+       ADD_U_CHAR(dptr, AUT_IDENTITY);                // token id
+       ADD_U_INT32(dptr, signer_type);                // signer type
+       ADD_U_INT16(dptr, signing_id_len + 1);         // signing id length+null
+       ADD_STRING(dptr, signing_id, signing_id_len);  // truncated signing id
+       ADD_U_CHAR(dptr, 0);                           // null terminator byte
+       ADD_U_CHAR(dptr, signing_id_trunc);            // if signing id is trunc
+       ADD_U_INT16(dptr, team_id_len + 1);            // team id length+null
+       ADD_STRING(dptr, team_id, team_id_len);        // truncated team id
+       ADD_U_CHAR(dptr, 0);                           // null terminator byte
+       ADD_U_CHAR(dptr, team_id_trunc);               // if team id is trunc
+       ADD_U_INT16(dptr, cdhash_len);                 // cdhash length
+       ADD_MEM(dptr, cdhash, cdhash_len);             // cdhash
+
+       return (t);
+}
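
Editor's note: for consumers of the new token, a hedged sketch of serializing the AUT_IDENTITY layout documented in the comment above; the token ID byte is a placeholder (the real AUT_IDENTITY value is not shown in this diff) and multi-byte fields are written big-endian as in other BSM tokens.

/*
 * Userspace sketch of the identity token wire layout.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FAKE_AUT_IDENTITY 0xed  /* placeholder token ID */

static size_t emit_identity(uint8_t *out, uint32_t signer_type,
    const char *signing_id, const char *team_id,
    const uint8_t *cdhash, uint16_t cdhash_len)
{
        size_t id_len = strlen(signing_id), team_len = strlen(team_id);
        uint8_t *p = out;
        uint32_t st = htonl(signer_type);
        uint16_t l;

        *p++ = FAKE_AUT_IDENTITY;                       /* token ID */
        memcpy(p, &st, 4); p += 4;                      /* signer type */
        l = htons((uint16_t)(id_len + 1));
        memcpy(p, &l, 2); p += 2;                       /* signing id length+null */
        memcpy(p, signing_id, id_len); p += id_len;     /* signing id */
        *p++ = 0;                                       /* null terminator byte */
        *p++ = 0;                                       /* signing id truncated flag */
        l = htons((uint16_t)(team_len + 1));
        memcpy(p, &l, 2); p += 2;                       /* team id length+null */
        memcpy(p, team_id, team_len); p += team_len;    /* team id */
        *p++ = 0;                                       /* null terminator byte */
        *p++ = 0;                                       /* team id truncated flag */
        l = htons(cdhash_len);
        memcpy(p, &l, 2); p += 2;                       /* cdhash length */
        memcpy(p, cdhash, cdhash_len); p += cdhash_len; /* cdhash bytes */
        return (size_t)(p - out);
}

int main(void)
{
        uint8_t buf[256];
        const uint8_t cdhash[4] = { 0xde, 0xad, 0xbe, 0xef };
        size_t n = emit_identity(buf, 1, "com.example.tool", "TEAMID1234",
            cdhash, sizeof(cdhash));

        printf("identity token is %zu bytes\n", n);
        return 0;
}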
+
 /*
  * token ID                1 byte
  * record byte count       4 bytes
index 8a5a556d8df98db0ca69725a71279b4b004b503d..8b58b79c1ad01cfa0527b8824d83948c3ea280e2 100644 (file)
@@ -71,6 +71,8 @@ extern int                    audit_panic_on_write_fail;
 extern int                     audit_fail_stop;
 extern int                     audit_argv;
 extern int                     audit_arge;
+extern au_ctlmode_t    audit_ctl_mode;
+extern au_expire_after_t       audit_expire_after;
 
 /*
  * Kernel mask that is used to check to see if system calls need to be audited.
@@ -182,6 +184,8 @@ union auditon_udata {
        au_stat_t               au_stat;
        au_fstat_t              au_fstat;
        auditinfo_addr_t        au_kau_info;
+       au_ctlmode_t    au_ctl_mode;
+       au_expire_after_t       au_expire_after;
 };
 
 struct posix_ipc_perm {
@@ -190,6 +194,16 @@ struct posix_ipc_perm {
        mode_t  pipc_mode;
 };
 
+struct au_identity_info {
+       u_int32_t       signer_type;
+       char            *signing_id;
+       u_char          signing_id_trunc;
+       char            *team_id;
+       u_char          team_id_trunc;
+       u_int8_t        *cdhash;
+       u_int16_t       cdhash_len;
+};
+
 struct audit_record {
        /* Audit record header. */
        u_int32_t               ar_magic;
@@ -285,6 +299,7 @@ struct audit_record {
        LIST_HEAD(mac_audit_record_list_t, mac_audit_record)    *ar_mac_records;
        int                     ar_forced_by_mac;
 #endif
+       struct au_identity_info ar_arg_identity;
 };
 
 /*
@@ -333,7 +348,7 @@ struct kaudit_record        *audit_new(int event, proc_t p, struct uthread *td);
  */
 struct au_record;
 int     kaudit_to_bsm(struct kaudit_record *kar, struct au_record **pau);
-int     bsm_rec_verify(void *rec);
+int     bsm_rec_verify(void *rec, int length);
 
 /*
  * Kernel versions of the libbsm audit record functions.
@@ -421,6 +436,10 @@ void                        audit_free(struct kaudit_record *ar);
 void                    audit_rotate_vnode(struct ucred *cred,
                            struct vnode *vp);
 void                    audit_worker_init(void);
+void                    audit_identity_info_construct(
+                           struct au_identity_info *id_info);
+void                    audit_identity_info_destruct(
+                           struct au_identity_info *id_info);
 
 /*
  * Audit pipe functions.
@@ -459,6 +478,36 @@ int        audit_session_lookup(au_asid_t asid, auditinfo_addr_t *ret_aia);
 #define        ASSIGNED_ASID_MIN       (PID_MAX + 1)
 #define        ASSIGNED_ASID_MAX       (0xFFFFFFFF - 1)
 
+/*
+ * Entitlement required to control various audit subsystem settings
+ */
+#define AU_CLASS_RESERVED_ENTITLEMENT "com.apple.private.dz.audit"
+
+/*
+ * Entitlement required to control auditctl sys call
+ */
+#define AU_AUDITCTL_RESERVED_ENTITLEMENT "com.apple.private.protected-audit-control"
+
+/*
+ * Max sizes used by the kernel for signing id and team id values of the
+ * identity tokens. These lengths include space for the null terminator.
+ */
+#define MAX_AU_IDENTITY_SIGNING_ID_LENGTH 129
+#define MAX_AU_IDENTITY_TEAM_ID_LENGTH 17
+
+struct __attribute__((__packed__)) hdr_tok_partial {
+       u_char type;
+       uint32_t len;
+};
+static_assert(sizeof(struct hdr_tok_partial) == 5);
+
+struct __attribute__((__packed__)) trl_tok_partial {
+       u_char type;
+       uint16_t magic;
+       uint32_t len;
+};
+static_assert(sizeof(struct trl_tok_partial) == 7);
+
 #endif /* defined(KERNEL) || defined(_KERNEL) */
 
 #endif /* ! _SECURITY_AUDIT_PRIVATE_H_ */
index 2a46a579d4898fbeea6372c53338792f426c0204..191596b5f9fab031d3c60352f986bd1e7364bf14 100644 (file)
@@ -60,6 +60,7 @@
 #include <sys/socketvar.h>
 
 #include <bsm/audit.h>
+#include <bsm/audit_internal.h>
 #include <bsm/audit_kevents.h>
 
 #include <security/audit/audit.h>
@@ -87,6 +88,8 @@
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
 
+#include <IOKit/IOBSD.h>
+
 #if CONFIG_AUDIT
 
 #define        IS_NOT_VALID_PID(p)     ((p) < 1 || (p) > PID_MAX)
 int
 audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
 {
-       int error;
-       void * rec;
-       struct kaudit_record *ar;
-       struct uthread *uthr;
+       int error = 0;
+       void * rec = NULL;
+       void * full_rec = NULL;
+       struct kaudit_record *ar = NULL;
+       struct uthread *uthr = NULL;
+       int add_identity_token = 1;
+       int max_record_length = MAX_AUDIT_RECORD_SIZE;
+       void *udata = NULL;
+       u_int ulen = 0;
+       struct au_identity_info id_info = {0, NULL, 0, NULL, 0, NULL, 0};
+       token_t *id_tok = NULL;
 
        error = suser(kauth_cred_get(), &p->p_acflag);
-       if (error)
-               return (error);
+       if (error) {
+               goto free_out;
+       }
 
        mtx_lock(&audit_mtx);
-       if ((uap->length <= 0) || (uap->length > (int)audit_qctrl.aq_bufsz)) {
-               mtx_unlock(&audit_mtx);
-               return (EINVAL);
-       }
+       max_record_length = MIN(audit_qctrl.aq_bufsz, MAX_AUDIT_RECORD_SIZE);
        mtx_unlock(&audit_mtx);
 
+       if (IOTaskHasEntitlement(current_task(),
+               AU_CLASS_RESERVED_ENTITLEMENT)) {
+               /* Entitled tasks are trusted to add appropriate identity info */
+               add_identity_token = 0;
+       } else {
+               /*
+                * If the caller is unentitled, an identity token will be added and
+                * the space must be accounted for
+                */
+               max_record_length -= MAX_AUDIT_IDENTITY_SIZE;
+       }
+
+       if ((uap->length <= 0) || (uap->length > max_record_length)) {
+               error = EINVAL;
+               goto free_out;
+       }
+
        ar = currecord();
 
        /*
@@ -171,8 +196,11 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
         */
        if (ar == NULL) {
                uthr = curthread();
-               if (uthr == NULL)       /* can this happen? */
-                       return (ENOTSUP);
+               if (uthr == NULL) {
+                       /* can this happen? */
+                       error = ENOTSUP;
+                       goto free_out;
+               }
 
                /*
                 * This is not very efficient; we're required to allocate a
@@ -180,32 +208,88 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
                 * tag along.
                 */
                uthr->uu_ar = audit_new(AUE_NULL, p, uthr);
-               if (uthr->uu_ar == NULL)
-                       return (ENOTSUP);
+               if (uthr->uu_ar == NULL) {
+                       error = ENOTSUP;
+                       goto free_out;
+               }
                ar = uthr->uu_ar;
        }
 
-       if (uap->length > MAX_AUDIT_RECORD_SIZE)
-               return (EINVAL);
-
        rec = malloc(uap->length, M_AUDITDATA, M_WAITOK);
+       if (!rec) {
+               error = ENOMEM;
+               goto free_out;
+       }
 
        error = copyin(uap->record, rec, uap->length);
-       if (error)
+       if (error) {
                goto free_out;
+       }
 
 #if CONFIG_MACF
        error = mac_system_check_audit(kauth_cred_get(), rec, uap->length);
-       if (error)
+       if (error) {
                goto free_out;
+       }
 #endif
 
        /* Verify the record. */
-       if (bsm_rec_verify(rec) == 0) {
+       if (bsm_rec_verify(rec, uap->length) == 0) {
                error = EINVAL;
                goto free_out;
        }
 
+       if (add_identity_token) {
+               struct hdr_tok_partial *hdr;
+               struct trl_tok_partial *trl;
+               int bytes_copied = 0;
+
+               /* Create a new identity token for this buffer */
+               audit_identity_info_construct(&id_info);
+               id_tok = au_to_identity(id_info.signer_type, id_info.signing_id,
+                       id_info.signing_id_trunc, id_info.team_id, id_info.team_id_trunc,
+                       id_info.cdhash, id_info.cdhash_len);
+               if (!id_tok) {
+                       error = ENOMEM;
+                       goto free_out;
+               }
+
+               /* Splice the record together using a new buffer */
+               full_rec = malloc(uap->length + id_tok->len, M_AUDITDATA, M_WAITOK);
+               if (!full_rec) {
+                       error = ENOMEM;
+                       goto free_out;
+               }
+
+               /* Copy the original buffer up to but not including the trailer */
+               memcpy(full_rec, rec, uap->length - AUDIT_TRAILER_SIZE);
+               bytes_copied = uap->length - AUDIT_TRAILER_SIZE;
+
+               /* Copy the identity token */
+               memcpy(full_rec + bytes_copied, id_tok->t_data, id_tok->len);
+               bytes_copied += id_tok->len;
+
+               /* Copy the old trailer */
+               memcpy(full_rec + bytes_copied,
+                       rec + (uap->length - AUDIT_TRAILER_SIZE), AUDIT_TRAILER_SIZE);
+               bytes_copied += AUDIT_TRAILER_SIZE;
+
+               /* Fix the record size stored in the header token */
+               hdr = (struct hdr_tok_partial*)full_rec;
+               hdr->len = htonl(bytes_copied);
+
+               /* Fix the record size stored in the trailer token */
+               trl = (struct trl_tok_partial*)
+                       (full_rec + bytes_copied - AUDIT_TRAILER_SIZE);
+               trl->len = htonl(bytes_copied);
+
+               udata = full_rec;
+               ulen = bytes_copied;
+       } else {
+               udata = rec;
+               ulen = uap->length;
+       }
+
        /*
         * Attach the user audit record to the kernel audit record.  Because
         * this system call is an auditable event, we will write the user
@@ -214,8 +298,8 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
         * XXXAUDIT: KASSERT appropriate starting values of k_udata, k_ulen,
         * k_ar_commit & AR_COMMIT_USER?
         */
-       ar->k_udata = rec;
-       ar->k_ulen  = uap->length;
+       ar->k_udata = udata;
+       ar->k_ulen  = ulen;
        ar->k_ar_commit |= AR_COMMIT_USER;
 
        /*
@@ -225,14 +309,30 @@ audit(proc_t p, struct audit_args *uap, __unused int32_t *retval)
         * want to setup kernel based preselection.
         */
        ar->k_ar_commit |= (AR_PRESELECT_USER_TRAIL | AR_PRESELECT_USER_PIPE);
-       return (0);
 
 free_out:
        /*
-        * audit_syscall_exit() will free the audit record on the thread even
-        * if we allocated it above.
+        * If rec was allocated, it must be freed if an identity token was added
+        * (since full_rec will be used) OR there was an error (since nothing
+        * will be attached to the kernel structure).
         */
-       free(rec, M_AUDITDATA);
+       if (rec && (add_identity_token || error)) {
+               free(rec, M_AUDITDATA);
+       }
+
+       /* Only free full_rec if an error occurred */
+       if (full_rec && error) {
+               free(full_rec, M_AUDITDATA);
+       }
+
+       audit_identity_info_destruct(&id_info);
+       if (id_tok) {
+               if (id_tok->t_data) {
+                       free(id_tok->t_data, M_AUDITBSM);
+               }
+               free(id_tok, M_AUDITBSM);
+       }
+
        return (error);
 }
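
Editor's note: when the caller is unentitled, audit() above splices a freshly built identity token between the record body and its trailer, then rewrites the 32-bit length stored in both the header and the trailer token. A standalone sketch of that splice over plain byte buffers follows, assuming the 7-byte trailer and the length-field offsets implied by the partial token layouts in audit_private.h.

/*
 * Insert an extra token before the trailer and patch both length fields.
 */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define AUDIT_TRAILER_SIZE 7

static uint8_t *
splice_token(const uint8_t *rec, uint32_t rec_len,
    const uint8_t *tok, uint32_t tok_len, uint32_t *out_len)
{
        uint32_t body = rec_len - AUDIT_TRAILER_SIZE;
        uint32_t total = rec_len + tok_len;
        uint8_t *full = malloc(total);
        uint32_t be;

        if (full == NULL)
                return NULL;
        memcpy(full, rec, body);                        /* body up to trailer */
        memcpy(full + body, tok, tok_len);              /* spliced-in token */
        memcpy(full + body + tok_len, rec + body, AUDIT_TRAILER_SIZE);

        be = htonl(total);
        memcpy(full + 1, &be, sizeof(be));              /* header length (offset 1) */
        memcpy(full + total - 4, &be, sizeof(be));      /* trailer length (last 4 bytes) */

        *out_len = total;
        return full;
}

int main(void)
{
        uint8_t rec[25] = { 0x14 };  /* fake 18-byte header + 7-byte trailer */
        uint8_t tok[5]  = { 0xed };  /* fake identity token */
        uint32_t out_len = 0;
        uint8_t *full = splice_token(rec, sizeof(rec), tok, sizeof(tok), &out_len);

        if (full != NULL) {
                printf("spliced record is %u bytes\n", out_len);  /* 30 */
                free(full);
        }
        return 0;
}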
 
@@ -288,6 +388,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
        case A_GETSINFO_ADDR:
        case A_GETSFLAGS:
        case A_SETSFLAGS:
+       case A_SETCTLMODE:
+       case A_SETEXPAFTER:
                error = copyin(uap->data, (void *)&udata, uap->length);
                if (error)
                        return (error);
@@ -319,6 +421,13 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
                 * control implemented in audit_session_setaia().
                 */
                break;
+       case A_SETCTLMODE:
+       case A_SETEXPAFTER:
+               if (!IOTaskHasEntitlement(current_task(),
+                       AU_CLASS_RESERVED_ENTITLEMENT)) {
+                       error = EPERM;
+               }
+               break;
        default:
                error = suser(kauth_cred_get(), &p->p_acflag);
                break;
@@ -326,6 +435,26 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
        if (error)
                return (error);
 
+       /*
+        * If the audit subsystem is in external control mode, additional
+        * privilege checks are required for a subset of auditon commands.
+        */
+       if (audit_ctl_mode == AUDIT_CTLMODE_EXTERNAL) {
+               switch (uap->cmd) {
+               case A_SETCOND:
+               case A_SETFSIZE:
+               case A_SETPOLICY:
+               case A_SETQCTRL:
+                       if (!IOTaskHasEntitlement(current_task(),
+                               AU_CLASS_RESERVED_ENTITLEMENT)) {
+                               error = EPERM;
+                       }
+                       break;
+               }
+               if (error)
+                       return (error);
+       }
+
        /*
         * XXX Need to implement these commands by accessing the global
         * values associated with the commands.
@@ -698,6 +827,56 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
                        return (error);
                break;
 
+       case A_GETCTLMODE:
+               if (sizeof(udata.au_ctl_mode) != uap->length) {
+                       return (EINVAL);
+               }
+               mtx_lock(&audit_mtx);
+               udata.au_ctl_mode = audit_ctl_mode;
+               mtx_unlock(&audit_mtx);
+               break;
+
+       case A_SETCTLMODE:
+               if (sizeof(udata.au_ctl_mode) != uap->length) {
+                       return (EINVAL);
+               }
+
+               mtx_lock(&audit_mtx);
+
+               if (udata.au_ctl_mode == AUDIT_CTLMODE_NORMAL) {
+                       audit_ctl_mode = AUDIT_CTLMODE_NORMAL;
+               } else if (udata.au_ctl_mode == AUDIT_CTLMODE_EXTERNAL) {
+                       audit_ctl_mode = AUDIT_CTLMODE_EXTERNAL;
+               } else {
+                       mtx_unlock(&audit_mtx);
+                       return (EINVAL);
+               }
+
+               mtx_unlock(&audit_mtx);
+               break;
+
+       case A_GETEXPAFTER:
+               if (sizeof(udata.au_expire_after) != uap->length) {
+                       return (EINVAL);
+               }
+               mtx_lock(&audit_mtx);
+               udata.au_expire_after.age = audit_expire_after.age;
+               udata.au_expire_after.size = audit_expire_after.size;
+               udata.au_expire_after.op_type = audit_expire_after.op_type;
+               mtx_unlock(&audit_mtx);
+               break;
+
+       case A_SETEXPAFTER:
+               if (sizeof(udata.au_expire_after) != uap->length) {
+                       return (EINVAL);
+               }
+               mtx_lock(&audit_mtx);
+               audit_expire_after.age = udata.au_expire_after.age;
+               audit_expire_after.size = udata.au_expire_after.size;
+               audit_expire_after.op_type = udata.au_expire_after.op_type;
+               mtx_unlock(&audit_mtx);
+               break;
+
        default:
                return (EINVAL);
        }
@@ -723,6 +902,8 @@ auditon(proc_t p, struct auditon_args *uap, __unused int32_t *retval)
        case A_GETKAUDIT:
        case A_GETSINFO_ADDR:
        case A_GETSFLAGS:
+       case A_GETCTLMODE:
+       case A_GETEXPAFTER:
                error = copyout((void *)&udata, uap->data, uap->length);
                if (error)
                        return (ENOSYS);
@@ -906,11 +1087,22 @@ auditctl(proc_t p, struct auditctl_args *uap, __unused int32_t *retval)
        kauth_cred_t cred;
        struct vnode *vp;
        int error = 0;
+       au_ctlmode_t ctlmode;
 
        error = suser(kauth_cred_get(), &p->p_acflag);
        if (error)
                return (error);
 
+       ctlmode = audit_ctl_mode;
+
+       /*
+        * Do not allow setting of a path when auditing is in external control
+        * mode, unless the task holds the reserved audit-control entitlement.
+        */
+       if (ctlmode == AUDIT_CTLMODE_EXTERNAL &&
+               !IOTaskHasEntitlement(current_task(), AU_AUDITCTL_RESERVED_ENTITLEMENT)) {
+               return (EPERM);
+       }
+
        vp = NULL;
        cred = NULL;
 
index 88c3a51c4acbbceec2983998161940ad11d00267..e2ad05581027ae82b4e50fc0f400a2b3a9ec20ba 100644 (file)
@@ -169,6 +169,25 @@ PRIVATE_KERNELFILES = \
        fsevents.h \
        work_interval.h \
 
+XNU_ONLY_EXPORTS = \
+       bsdtask_info.h \
+       file_internal.h \
+       filedesc.h \
+       guarded.h \
+       linker_set.h \
+       mount_internal.h \
+       munge.h \
+       pipe.h \
+       proc_internal.h \
+       pthread_internal.h \
+       resourcevar.h \
+       semaphore.h \
+       tree.h \
+       uio_internal.h \
+       ulock.h \
+       ux_exception.h \
+       vnode_internal.h
+
 # /usr/include
 INSTALL_MI_LIST        = ${DATAFILES}
 
@@ -176,9 +195,7 @@ INSTALL_MI_GEN_LIST = syscall.h _posix_availability.h _symbol_aliasing.h
 
 INSTALL_MI_DIR = sys
 
-EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} linker_set.h bsdtask_info.h pthread_internal.h filedesc.h pipe.h resourcevar.h semaphore.h \
-                                                               vnode_internal.h proc_internal.h file_internal.h mount_internal.h \
-                                                               uio_internal.h tree.h munge.h guarded.h ulock.h
+EXPORT_MI_LIST = ${KERNELFILES} ${PRIVATE_KERNELFILES} ${XNU_ONLY_EXPORTS}
 
 EXPORT_MI_GEN_LIST = syscall.h sysproto.h kdebugevents.h
 
index 2547592f47897167690e68fce281035ca6c6f463..eb1422e5ea7ce25f3e2c9ea61e3a2423bacc1cff 100644 (file)
 #define _STRUCT_USER64_TIMEX   struct user64_timex
 _STRUCT_USER64_TIMEX
 {
-       u_int64_t modes;
+       u_int32_t modes;
        user64_long_t   offset;
        user64_long_t   freq;
        user64_long_t   maxerror;
        user64_long_t   esterror;
-       __int64_t       status;
+       __int32_t       status;
        user64_long_t   constant;
        user64_long_t   precision;
        user64_long_t   tolerance;
 
        user64_long_t   ppsfreq;
        user64_long_t   jitter;
-       __int64_t       shift;
+       __int32_t       shift;
        user64_long_t   stabil;
        user64_long_t   jitcnt;
        user64_long_t   calcnt;
index 74c747bfee85011aafc6503df102e27a472c830d..7f2edccd20000503b949d5ff250d8e2e2e105c4d 100644 (file)
@@ -111,8 +111,8 @@ extern struct vnode *vnode_mountdevvp(struct vnode *);
 extern int fill_procregioninfo(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid);
 extern int fill_procregioninfo_onlymappedvnodes(task_t t, uint64_t arg, struct proc_regioninfo_internal *pinfo, uintptr_t *vp, uint32_t *vid);
 void fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo);
-int fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *);
-int fill_taskthreadlist(task_t task, void * buffer, int thcount);
+int fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void *, int *);
+int fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid);
 int get_numthreads(task_t);
 boolean_t bsd_hasthreadname(void *uth);
 void bsd_getthreadname(void *uth, char* buffer);
index 4339caf7bdc81f752bf4ed4e3e39af466fbd2da4..a7f6639e07db846c01eb317f792679ae65e6dd51 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #define        __DEQUALIFY(type, var)  __CAST_AWAY_QUALIFIER(var, const volatile, type)
 #endif
 
+/*
+ * __alloc_size can be used to label function arguments that represent the
+ * size of memory that the function allocates and returns. The one-argument
+ * form labels a single argument that gives the allocation size (where the
+ * arguments are numbered from 1):
+ *
+ * void        *malloc(size_t __size) __alloc_size(1);
+ *
+ * The two-argument form handles the case where the size is calculated as the
+ * product of two arguments:
+ *
+ * void        *calloc(size_t __count, size_t __size) __alloc_size(1,2);
+ */
+#ifndef __alloc_size
+#if __has_attribute(alloc_size)
+#define __alloc_size(...) __attribute__((alloc_size(__VA_ARGS__)))
+#else
+#define __alloc_size(...)
+#endif
+#endif // __alloc_size
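
Editor's note: a hedged illustration of the new __alloc_size annotation applied to a hypothetical wrapper allocator; with clang, the attribute lets __builtin_object_size() (and fortified string functions) see the allocation size at the call site when the compiler can track it.

/*
 * Hypothetical allocator wrappers annotated with __alloc_size.  The printed
 * sizes may be (size_t)-1 when the compiler cannot prove the allocation size
 * (e.g. at -O0 with some toolchains).
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifndef __has_attribute
#define __has_attribute(x) 0
#endif
#ifndef __alloc_size
#if __has_attribute(alloc_size)
#define __alloc_size(...) __attribute__((alloc_size(__VA_ARGS__)))
#else
#define __alloc_size(...)
#endif
#endif

static void *my_alloc(size_t size) __alloc_size(1);
static void *my_calloc(size_t count, size_t size) __alloc_size(1, 2);

static void *my_alloc(size_t size)                { return malloc(size); }
static void *my_calloc(size_t count, size_t size) { return calloc(count, size); }

int main(void)
{
        char *buf = my_alloc(16);
        int  *arr = my_calloc(4, sizeof(int));

        if (buf == NULL || arr == NULL)
                return 1;
        printf("tracked sizes: buf=%zu arr=%zu\n",
            (size_t)__builtin_object_size(buf, 0),
            (size_t)__builtin_object_size(arr, 0));
        snprintf(buf, 16, "hello");
        printf("%s %d\n", buf, arr[0]);
        free(buf);
        free(arr);
        return 0;
}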
+
 /*
  * COMPILATION ENVIRONMENTS -- see compat(5) for additional detail
  *
                                _Pragma("clang diagnostic pop")
 #endif
 
+#if defined(PRIVATE) || defined(KERNEL)
+/*
+ * Check if __probable and __improbable have already been defined elsewhere.
+ * These macros inform the compiler (and humans) about which branches are likely
+ * to be taken.
+ */
+#if !defined(__probable) && !defined(__improbable)
+#define        __probable(x)   __builtin_expect(!!(x), 1)
+#define        __improbable(x) __builtin_expect(!!(x), 0)
+#endif /* !defined(__probable) && !defined(__improbable) */
+
+#define __container_of(ptr, type, field) ({ \
+               const typeof(((type *)0)->field) *__ptr = (ptr); \
+               (type *)((uintptr_t)__ptr - offsetof(type, field)); \
+       })
+
+#endif /* KERNEL || PRIVATE */
+
+#define __compiler_barrier() __asm__ __volatile__("" ::: "memory")
+
 #endif /* !_CDEFS_H_ */
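
Editor's note: a small standalone example of the __container_of() and __probable() helpers added above (both rely on GNU C extensions: typeof, statement expressions, and __builtin_expect); the node/item structures are purely illustrative.

/*
 * __container_of() recovers a pointer to the enclosing structure from a
 * pointer to one of its members.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#ifndef __probable
#define __probable(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef __container_of
#define __container_of(ptr, type, field) ({ \
                const typeof(((type *)0)->field) *__ptr = (ptr); \
                (type *)((uintptr_t)__ptr - offsetof(type, field)); \
        })
#endif

struct node { struct node *next; };            /* intrusive list link */
struct item { int value; struct node link; };  /* embeds the link */

int main(void)
{
        struct item it = { .value = 42, .link = { NULL } };
        struct node *n = &it.link;              /* what a list would hand back */
        struct item *recovered = __container_of(n, struct item, link);

        if (__probable(recovered == &it))
                printf("recovered value = %d\n", recovered->value);
        return 0;
}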
index f56878e73fe735013b007eb93a20bbafba5a5096..069725c7e41e83df91507bf9c6f88f476bf02a7a 100644 (file)
@@ -55,6 +55,7 @@
 #define CS_OPS_IDENTITY                11      /* get codesign identity */
 #define CS_OPS_CLEARINSTALLER  12      /* clear INSTALLER flag */
 #define CS_OPS_CLEARPLATFORM 13 /* clear platform binary status (DEVELOPMENT-only) */
+#define CS_OPS_TEAMID       14  /* get team id */
 
 #define CS_MAX_TEAMID_LEN      64
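
Editor's note: a hedged userspace sketch of querying the new CS_OPS_TEAMID operation through csops(); the prototype is declared manually (an assumption about the userspace syscall wrapper), and the output buffer is treated as opaque because this diff does not show the payload layout.

/*
 * Query the team identifier blob of the current process.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

#define CS_OPS_TEAMID 14   /* value added by this change */

/* Assumed userspace prototype for the codesign-operations syscall. */
extern int csops(pid_t pid, unsigned int ops, void *useraddr, size_t usersize);

int main(void)
{
        char buf[128];

        memset(buf, 0, sizeof(buf));
        if (csops(getpid(), CS_OPS_TEAMID, buf, sizeof(buf)) != 0) {
                /* Unsigned or team-less binaries will not have a team identifier. */
                fprintf(stderr, "csops(CS_OPS_TEAMID): %s\n", strerror(errno));
                return 1;
        }
        printf("team id blob retrieved into a %zu-byte buffer\n", sizeof(buf));
        return 0;
}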
 
@@ -71,6 +72,7 @@ __END_DECLS
 
 #else /* !KERNEL */
 
+#include <mach/machine.h>
 #include <mach/vm_types.h>
 
 #include <sys/cdefs.h>
@@ -82,8 +84,11 @@ struct fileglob;
 
 __BEGIN_DECLS
 int    cs_valid(struct proc *);
-int    cs_enforcement(struct proc *);
+int    cs_process_enforcement(struct proc *);
+int cs_process_global_enforcement(void);
+int cs_system_enforcement(void);
 int    cs_require_lv(struct proc *);
+int csproc_forced_lv(struct proc* p);
 int    cs_system_require_lv(void);
 uint32_t cs_entitlement_flags(struct proc *p);
 int    cs_entitlements_blob_get(struct proc *, void **, size_t *);
@@ -108,6 +113,11 @@ unsigned int       csblob_get_signer_type(struct cs_blob *);
 void                   csproc_clear_platform_binary(struct proc *);
 #endif
 
+void csproc_disable_enforcement(struct proc* p);
+void csproc_mark_invalid_allowed(struct proc* p);
+int csproc_check_invalid_allowed(struct proc* p);
+int csproc_hardened_runtime(struct proc* p);
+
 int            csblob_get_entitlements(struct cs_blob *, void **, size_t *);
 
 const CS_GenericBlob *
@@ -139,6 +149,14 @@ uint8_t csvnode_get_platform_identifier(struct vnode *, off_t);
 uint8_t csproc_get_platform_identifier(struct proc *);
 
 extern int cs_debug;
+extern int cs_debug_fail_on_unsigned_code;
+extern unsigned int cs_debug_unsigned_exec_failures;
+extern unsigned int cs_debug_unsigned_mmap_failures;
+
+int cs_blob_create_validated(vm_address_t* addr, vm_size_t size,
+                                                        struct cs_blob ** ret_blob, CS_CodeDirectory const **ret_cd);
+
+void cs_blob_free(struct cs_blob *blob);
 
 #ifdef XNU_KERNEL_PRIVATE
 
index 4c7f51ece75611db6e3480dff0df5698ee5b9815..9b6c0d0caeb0df8ad2212d129f2edce91d6666e8 100644 (file)
@@ -50,6 +50,7 @@ typedef uint32_t csr_op_t;
 #define CSR_ALLOW_DEVICE_CONFIGURATION (1 << 7)
 #define CSR_ALLOW_ANY_RECOVERY_OS      (1 << 8)
 #define CSR_ALLOW_UNAPPROVED_KEXTS     (1 << 9)
+#define CSR_ALLOW_EXECUTABLE_POLICY_OVERRIDE   (1 << 10)
 
 #define CSR_VALID_FLAGS (CSR_ALLOW_UNTRUSTED_KEXTS | \
                          CSR_ALLOW_UNRESTRICTED_FS | \
@@ -60,7 +61,8 @@ typedef uint32_t csr_op_t;
                          CSR_ALLOW_UNRESTRICTED_NVRAM | \
                          CSR_ALLOW_DEVICE_CONFIGURATION | \
                          CSR_ALLOW_ANY_RECOVERY_OS | \
-                         CSR_ALLOW_UNAPPROVED_KEXTS)
+                         CSR_ALLOW_UNAPPROVED_KEXTS | \
+                         CSR_ALLOW_EXECUTABLE_POLICY_OVERRIDE)
 
 #define CSR_ALWAYS_ENFORCED_FLAGS (CSR_ALLOW_DEVICE_CONFIGURATION | CSR_ALLOW_ANY_RECOVERY_OS)
 
index f30a6decc4b144d6ac481831448548d74e2725a8..1f57e93bf4ccddaa36c0b2079bddfd1a513a9d15 100644 (file)
 #ifndef _SYS_DECMPFS_H_
 #define _SYS_DECMPFS_H_ 1
 
-#include <sys/kernel_types.h>
 #include <stdbool.h>
+#include <sys/kdebug.h>
+#include <sys/kernel_types.h>
 #include <sys/vnode.h>
 
+/*
+ * Please switch on @DECMPFS_ENABLE_KDEBUG_TRACES to enable tracepoints.
+ * Tracepoints are compiled out by default to eliminate any overhead due to
+ * kernel tracing.
+ *
+ * #define DECMPFS_ENABLE_KDEBUG_TRACES 1
+ */
+#if DECMPFS_ENABLE_KDEBUG_TRACES
+#define DECMPFS_EMIT_TRACE_ENTRY(D, ...)\
+        KDBG_FILTERED((D) | DBG_FUNC_START, ## __VA_ARGS__)
+#define DECMPFS_EMIT_TRACE_RETURN(D, ...)\
+        KDBG_FILTERED((D) | DBG_FUNC_END, ##__VA_ARGS__)
+#else
+#define DECMPFS_EMIT_TRACE_ENTRY(D, ...) do {} while (0)
+#define DECMPFS_EMIT_TRACE_RETURN(D, ...) do {} while (0)
+#endif /* DECMPFS_ENABLE_KDEBUG_TRACES */
+
+/*
+ * KERNEL_DEBUG related definitions for decmpfs.
+ *
+ * Please NOTE: The Class DBG_FSYSTEM = 3, and Subclass DBG_DECMP = 0x12, so
+ * these debug codes are of the form 0x0312nnnn.
+ */
+#define DECMPDBG_CODE(code)  FSDBG_CODE(DBG_DECMP, code)
+
+enum {
+    DECMPDBG_DECOMPRESS_FILE            = DECMPDBG_CODE(0), /* 0x03120000 */
+    DECMPDBG_FETCH_COMPRESSED_HEADER    = DECMPDBG_CODE(1), /* 0x03120004 */
+    DECMPDBG_FETCH_UNCOMPRESSED_DATA    = DECMPDBG_CODE(2), /* 0x03120008 */
+    DECMPDBG_FREE_COMPRESSED_DATA       = DECMPDBG_CODE(4), /* 0x03120010 */
+    DECMPDBG_FILE_IS_COMPRESSED         = DECMPDBG_CODE(5), /* 0x03120014 */
+};
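
Editor's note: with DECMPFS_ENABLE_KDEBUG_TRACES defined, the entry/return macros above bracket an operation with filtered kdebug events. A userspace-runnable sketch of the pattern follows; KDBG_FILTERED is stubbed with printf, the DBG_FUNC_START/END qualifier values are assumptions, and the debugid arithmetic follows the 0x0312nnnn scheme documented in the comment (class 3, subclass 0x12, code << 2).

/*
 * Stubbed illustration of the decmpfs entry/return tracing pattern.
 */
#include <stdint.h>
#include <stdio.h>

#define DBG_FUNC_START 1u   /* assumed qualifier values */
#define DBG_FUNC_END   2u
#define FSDBG_CODE(subcls, code) \
        ((3u << 24) | ((uint32_t)(subcls) << 16) | ((uint32_t)(code) << 2))
#define DECMPDBG_CODE(code) FSDBG_CODE(0x12, code)
#define DECMPDBG_DECOMPRESS_FILE DECMPDBG_CODE(0)   /* 0x03120000 */

/* Stand-in for the kernel's filtered tracepoint emitter. */
#define KDBG_FILTERED(debugid, a, b, c, d) \
        printf("kdebug 0x%08x args %llu %llu %llu %llu\n", (uint32_t)(debugid), \
            (unsigned long long)(a), (unsigned long long)(b), \
            (unsigned long long)(c), (unsigned long long)(d))

#define DECMPFS_EMIT_TRACE_ENTRY(D, ...)  KDBG_FILTERED((D) | DBG_FUNC_START, __VA_ARGS__)
#define DECMPFS_EMIT_TRACE_RETURN(D, ...) KDBG_FILTERED((D) | DBG_FUNC_END, __VA_ARGS__)

static int decompress_file(unsigned long long vnode_id)
{
        DECMPFS_EMIT_TRACE_ENTRY(DECMPDBG_DECOMPRESS_FILE, vnode_id, 0, 0, 0);
        /* ... decompression work would happen here ... */
        DECMPFS_EMIT_TRACE_RETURN(DECMPDBG_DECOMPRESS_FILE, vnode_id, 0, 0, 0);
        return 0;
}

int main(void)
{
        return decompress_file(42);
}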
+
 #define MAX_DECMPFS_XATTR_SIZE 3802
 
 /*
index f046e7f76f49e8bab933f26caa0d832932a7003d..f1c4c821ecf6931c1de779eca3cc766ae00bf3ee 100644 (file)
@@ -78,6 +78,8 @@
  * DKIOCGETIOMINSATURATIONBYTECOUNT      get minimum byte count to saturate storage bandwidth
  *
  * DKIOCGETERRORDESCRIPTION              get description of any drive error
+ *
+ * DKIOCGETMAXSWAPWRITE                  get maximum swap file write per day in bytes
  */
 
 #define DK_FEATURE_BARRIER                    0x00000002
@@ -339,6 +341,18 @@ typedef enum {
 
 #define DKIOCGETAPFSFLAVOUR    _IOR('d', 91, dk_apfs_flavour_t)
 
+// Extent's offset and length returned in bytes
+typedef struct dk_apfs_wbc_range {
+       dev_t dev;              // Physical device for extents
+       uint32_t count;         // Number of extents
+       dk_extent_t extents[2]; // Addresses are relative to device we return
+} dk_apfs_wbc_range_t;
+
+#define DKIOCAPFSGETWBCRANGE           _IOR('d', 92, dk_apfs_wbc_range_t)
+#define DKIOCAPFSRELEASEWBCRANGE       _IO('d', 93)
+
+#define DKIOCGETMAXSWAPWRITE           _IOR('d', 94, uint64_t)
+
 #endif /* PRIVATE */
 #endif /* KERNEL */
 
index 077e1f3d9505575360de0888e925da0325136d50..bf6a94092afd37e32cc971ee64215b6c6ef4905d 100644 (file)
 /*
  * cmn_err
  */
-#define        CE_CONT         0       /* continuation         */
 #define        CE_NOTE         1       /* notice               */
 #define        CE_WARN         2       /* warning              */
-#define        CE_PANIC        3       /* panic                */
-#define        CE_IGNORE       4       /* print nothing        */
 
 extern void cmn_err( int, const char *, ... );
 
@@ -69,6 +66,9 @@ extern void cmn_err( int, const char *, ... );
 proc_t* sprlock(pid_t pid);
 void sprunlock(proc_t *p);
 
+void dtrace_sprlock(proc_t *p);
+void dtrace_sprunlock(proc_t *p);
+
 /*
  * uread/uwrite
  */
@@ -85,15 +85,11 @@ int fuword16(user_addr_t, uint16_t *);
 int fuword32(user_addr_t, uint32_t *);
 int fuword64(user_addr_t, uint64_t *);
 
-void fuword8_noerr(user_addr_t, uint8_t *);
-void fuword16_noerr(user_addr_t, uint16_t *);
 void fuword32_noerr(user_addr_t, uint32_t *);
 void fuword64_noerr(user_addr_t, uint64_t *);
 
 int suword64(user_addr_t, uint64_t value);
 int suword32(user_addr_t, uint32_t value);
-int suword16(user_addr_t, uint16_t value);
-int suword8(user_addr_t, uint8_t value);
 
 /*
  * cpuvar
@@ -233,7 +229,8 @@ typedef struct modctl {
 #define MODCTL_FBT_PRIVATE_PROBES_PROVIDED     0x80  // fbt private probes have been provided
 #define MODCTL_FBT_PROVIDE_PRIVATE_PROBES      0x100 // fbt provider must provide private probes
 #define MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES  0x200 // fbt provider must provide blacklisted probes
-#define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED 0x400 // fbt blacklisted probes have been provided
+#define MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED 0x400 // fbt blacklisted probes have been provided
+#define MODCTL_IS_STATIC_KEXT                  0x800 // module is a static kext
 
 /* Simple/singular mod_flags accessors */
 #define MOD_IS_MACH_KERNEL(mod)                        (mod->mod_flags & MODCTL_IS_MACH_KERNEL)
@@ -248,6 +245,7 @@ typedef struct modctl {
 #define MOD_FBT_PROVIDE_PRIVATE_PROBES(mod)    (mod->mod_flags & MODCTL_FBT_PROVIDE_PRIVATE_PROBES)
 #define MOD_FBT_BLACKLISTED_PROBES_PROVIDED(mod) (mod->mod_flags & MODCTL_FBT_BLACKLISTED_PROBES_PROVIDED)
 #define MOD_FBT_PROVIDE_BLACKLISTED_PROBES(mod)        (mod->mod_flags & MODCTL_FBT_PROVIDE_BLACKLISTED_PROBES)
+#define MOD_IS_STATIC_KEXT(mod)                        (mod->mod_flags & MODCTL_IS_STATIC_KEXT)
 
 /* Compound accessors */
 #define MOD_FBT_PRIVATE_PROBES_DONE(mod)       (MOD_FBT_PRIVATE_PROBES_PROVIDED(mod) || !MOD_FBT_PROVIDE_PRIVATE_PROBES(mod))
@@ -258,6 +256,8 @@ typedef struct modctl {
 
 extern modctl_t *dtrace_modctl_list;
 
+extern int dtrace_addr_in_module(void*, struct modctl*);
+
 /*
  * cred_t
  */
@@ -280,20 +280,14 @@ extern cred_t *dtrace_CRED(void); /* Safe to call from probe context. */
 #define CRED() kauth_cred_get() /* Can't be called from probe context! */
 extern int PRIV_POLICY_CHOICE(void *, int, int);
 extern int PRIV_POLICY_ONLY(void *, int, int);
-extern gid_t crgetgid(const cred_t *);
 extern uid_t crgetuid(const cred_t *);
 #define crgetzoneid(x) ((zoneid_t)0)
 
-#define crhold(a) {}
-#define crfree(a) {}
-
 /*
  * "cyclic"
  */
 #define        CY_LOW_LEVEL            0
-#define        CY_LOCK_LEVEL           1
 #define        CY_HIGH_LEVEL           2
-#define        CY_SOFT_LEVELS          2
 #define        CY_LEVELS                       3
 
 typedef uintptr_t cyclic_id_t;
@@ -338,18 +332,8 @@ extern void cyclic_timer_remove(cyclic_id_t);
 #define DDI_SUCCESS                    0
 #define DDI_FAILURE                    -1
 
-#define        DDI_DEV_T_NONE  ((dev_t)-1)
-#define        DDI_DEV_T_ANY   ((dev_t)-2)
-#define        DDI_MAJOR_T_UNKNOWN     ((major_t)0)
-
 #define DDI_PSEUDO "ddi_pseudo"
 
-typedef enum {
-       DDI_ATTACH = 0,
-       DDI_RESUME = 1,
-       DDI_PM_RESUME = 2
-} ddi_attach_cmd_t;
-
 typedef enum {
        DDI_DETACH = 0,
        DDI_SUSPEND = 1,
@@ -365,10 +349,6 @@ typedef uint_t minor_t;
 
 typedef struct __dev_info *dev_info_t;
 
-extern void ddi_report_dev(dev_info_t *);
-
-int ddi_getprop(dev_t dev, dev_info_t *dip, int flags, const char *name, int defvalue);
-
 extern int ddi_driver_major(dev_info_t *);
 
 extern int ddi_create_minor_node(dev_info_t *, const char *, int, minor_t, const char *, int);
@@ -377,43 +357,15 @@ extern void ddi_remove_minor_node(dev_info_t *, char *);
 extern major_t getemajor(dev_t);
 extern minor_t getminor(dev_t);
 
-extern dev_t makedevice(major_t, minor_t);
-
 /*
  * Kernel Debug Interface
  */
-
-typedef enum kdi_dtrace_set {
-       KDI_DTSET_DTRACE_ACTIVATE,
-       KDI_DTSET_DTRACE_DEACTIVATE,
-       KDI_DTSET_KMDB_BPT_ACTIVATE,
-       KDI_DTSET_KMDB_BPT_DEACTIVATE
-} kdi_dtrace_set_t;
-
-extern int kdi_dtrace_set(kdi_dtrace_set_t);
 extern void debug_enter(char *);
 
 /*
  * DTrace specific zone allocation
  */
 
-/*
- * To break dtrace memory usage out in a trackable
- * fashion, uncomment the #define below. This will
- * enable emulation of the general kalloc.XXX zones
- * for most dtrace allocations. (kalloc.large is not
- * emulated)
- *
- * #define DTRACE_MEMORY_ZONES 1
- *
- */
-
-#if defined(DTRACE_MEMORY_ZONES)
-void dtrace_alloc_init(void);
-void *dtrace_alloc(vm_size_t);
-void dtrace_free(void *, vm_size_t);
-#endif
-
 /*
  * kmem
  */
@@ -424,15 +376,32 @@ void dtrace_free(void *, vm_size_t);
 typedef struct vmem vmem_t;
 typedef struct kmem_cache kmem_cache_t;
 
-#define kmem_alloc dt_kmem_alloc /* Avoid clash with Darwin's kmem_alloc */
 #define kmem_free dt_kmem_free /* Avoid clash with Darwin's kmem_free */
-#define kmem_zalloc dt_kmem_zalloc /* Avoid clash with Darwin's kmem_zalloc */
-extern void *dt_kmem_alloc(size_t, int);
+#define kmem_free_aligned dt_kmem_free_aligned
+
+#define kmem_alloc(size, kmflag) \
+       ({ VM_ALLOC_SITE_STATIC(0, 0); \
+       dt_kmem_alloc_site(size, kmflag, &site); })
+
+extern void *dt_kmem_alloc_site(size_t, int, vm_allocation_site_t*);
 extern void dt_kmem_free(void *, size_t);
-extern void *dt_kmem_zalloc(size_t, int);
 
-extern void *dt_kmem_alloc_aligned(size_t, size_t, int);
-extern void *dt_kmem_zalloc_aligned(size_t, size_t, int);
+#define kmem_zalloc(size, kmflag) \
+       ({ VM_ALLOC_SITE_STATIC(0, 0); \
+       dt_kmem_zalloc_site(size, kmflag, &site); })
+
+extern void *dt_kmem_zalloc_site(size_t, int, vm_allocation_site_t*);
+
+#define kmem_alloc_aligned(size, align, kmflag) \
+       ({ VM_ALLOC_SITE_STATIC(0, 0); \
+       dt_kmem_alloc_aligned_site(size, align, kmflag, &site); })
+extern void *dt_kmem_alloc_aligned_site(size_t, size_t, int, vm_allocation_site_t*);
+
+#define kmem_zalloc_aligned(size, align, kmflag) \
+       ({ VM_ALLOC_SITE_STATIC(0, 0); \
+       dt_kmem_zalloc_aligned_site(size, align, kmflag, &site); })
+extern void *dt_kmem_zalloc_aligned_site(size_t, size_t, int, vm_allocation_site_t*);
+
 extern void dt_kmem_free_aligned(void*, size_t);
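The wrappers above replace the old dt_kmem_alloc()/dt_kmem_zalloc() entry points with statement-expression macros so every DTrace allocation is tagged with a per-call-site vm_allocation_site_t. A hedged usage sketch (the 4096 size and the KM_SLEEP flag are illustrative values from the usual dtrace_glue conventions, not taken from this hunk):

	/*
	 * Sketch only: each expansion of kmem_zalloc() declares its own static
	 * allocation site, so the bytes are attributed to this exact call site.
	 */
	void *buf = kmem_zalloc(4096, KM_SLEEP);  /* -> dt_kmem_zalloc_site(4096, KM_SLEEP, &site) */
	if (buf != NULL) {
		/* ... use buf ... */
		kmem_free(buf, 4096);             /* dt_kmem_free() needs the original size */
	}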
 
 extern kmem_cache_t *
@@ -452,7 +421,6 @@ typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked
  * proc
  */
 
-#define DATAMODEL_MASK  0x0FF00000
 
 #define DATAMODEL_ILP32 0x00100000
 #define DATAMODEL_LP64  0x00200000
@@ -467,23 +435,6 @@ typedef struct _kthread kthread_t; /* For dtrace_vtime_switch(), dtrace_panicked
 
 typedef unsigned int model_t; /* For dtrace_instr_size_isa() prototype in <sys/dtrace.h> */
 
-/*
- * taskq
- */
-
-#define        TQ_SLEEP        0x00    /* Can block for memory */
-
-typedef uint_t pri_t;
-typedef struct taskq taskq_t;
-typedef void (task_func_t)(void *);
-typedef uintptr_t taskqid_t;
-
-extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
-extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
-extern void    taskq_destroy(taskq_t *);
-
-extern pri_t maxclsyspri;
-
 /*
  * vmem
  */
@@ -569,6 +520,7 @@ extern hrtime_t dtrace_abs_to_nano(uint64_t);
 __private_extern__ const char * strstr(const char *, const char *);
 const void* bsearch(const void*, const void*, size_t, size_t, int (*compar)(const void *, const void *));
 
+int dtrace_copy_maxsize(void);
 int dtrace_buffer_copyout(const void*, user_addr_t, vm_size_t);
 
 
index 125293dbfb084b652ea16865d9b583fcde782222..f463b49e35e8c8886e75f5cdab9993ab33c870ff 100644 (file)
@@ -110,6 +110,8 @@ struct dtrace_probe {
        char *dtpr_mod;                         /* probe's module name */
        char *dtpr_func;                        /* probe's function name */
        char *dtpr_name;                        /* probe's name */
+       dtrace_probe_t *dtpr_nextprov;          /* next in provider hash */
+       dtrace_probe_t *dtpr_prevprov;          /* previous in provider hash */
        dtrace_probe_t *dtpr_nextmod;           /* next in module hash */
        dtrace_probe_t *dtpr_prevmod;           /* previous in module hash */
        dtrace_probe_t *dtpr_nextfunc;          /* next in function hash */
@@ -135,18 +137,21 @@ typedef struct dtrace_probekey {
 
 typedef struct dtrace_hashbucket {
        struct dtrace_hashbucket *dthb_next;    /* next on hash chain */
-       dtrace_probe_t *dthb_chain;             /* chain of probes */
+       void *dthb_chain;                       /* chain of elements */
        int dthb_len;                           /* number of probes here */
 } dtrace_hashbucket_t;
 
+typedef const char* dtrace_strkey_f(void*, uintptr_t);
+
 typedef struct dtrace_hash {
-       dtrace_hashbucket_t **dth_tab;          /* hash table */
-       int dth_size;                           /* size of hash table */
-       int dth_mask;                           /* mask to index into table */
-       int dth_nbuckets;                       /* total number of buckets */
-       uintptr_t dth_nextoffs;                 /* offset of next in probe */
-       uintptr_t dth_prevoffs;                 /* offset of prev in probe */
-       uintptr_t dth_stroffs;                  /* offset of str in probe */
+       dtrace_hashbucket_t **dth_tab;  /* hash table */
+       int dth_size;                   /* size of hash table */
+       int dth_mask;                   /* mask to index into table */
+       int dth_nbuckets;               /* total number of buckets */
+       uintptr_t dth_nextoffs;         /* offset of next in element */
+       uintptr_t dth_prevoffs;         /* offset of prev in element */
+       dtrace_strkey_f *dth_getstr;    /* func to retrieve str in element */
+       uintptr_t dth_stroffs;          /* offset of str in element */
 } dtrace_hash_t;
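With dth_getstr, the hash table no longer assumes its elements are probes: the callback extracts the key string from whatever element type the table holds. A hedged sketch of a dtrace_strkey_f implementation (the extractor below is hypothetical; only the typedef's signature comes from this hunk):

	/*
	 * Hypothetical key extractor: returns the string embedded `stroffs`
	 * bytes into the element, matching the dtrace_strkey_f signature above.
	 */
	static const char *
	example_strkey(void *elem, uintptr_t stroffs)
	{
		return (const char *)((uintptr_t)elem + stroffs);
	}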
 
 /*
@@ -1310,6 +1315,16 @@ typedef struct dtrace_errhash {
 
 #endif /* DTRACE_ERRDEBUG */
 
+
+typedef struct dtrace_string dtrace_string_t;
+
+typedef struct dtrace_string {
+       dtrace_string_t *dtst_next;
+       dtrace_string_t *dtst_prev;
+       uint32_t dtst_refcount;
+       char dtst_str[];
+} dtrace_string_t;
+
 /**
  * DTrace Matching pre-conditions
  *
@@ -1374,6 +1389,8 @@ extern int dtrace_attached(void);
 extern hrtime_t dtrace_gethrestime(void);
 extern void dtrace_isa_init(void);
 
+extern void dtrace_flush_caches(void);
+
 extern void dtrace_copy(uintptr_t, uintptr_t, size_t);
 extern void dtrace_copystr(uintptr_t, uintptr_t, size_t, volatile uint16_t *);
 
index e7d8d9b0cc24781004ed09a5f633750aeee1501a..93382d71d986b9d61069256af2356f7abd95a1b6 100644 (file)
@@ -74,19 +74,17 @@ extern "C" {
 
 #define DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD (64)
 
-#define DTRACE_PTSS_ENTRIES_PER_PAGE (PAGE_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD)
+#define DTRACE_PTSS_ENTRIES_PER_PAGE (PAGE_MAX_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD)
 
 struct dtrace_ptss_page_entry {
        struct dtrace_ptss_page_entry*  next;
        user_addr_t                     addr;
-#if CONFIG_EMBEDDED
        user_addr_t                     write_addr;
-#endif
 };
 
 struct dtrace_ptss_page {
        struct dtrace_ptss_page*       next;
-       struct dtrace_ptss_page_entry  entries[PAGE_MAX_SIZE / DTRACE_PTSS_SCRATCH_SPACE_PER_THREAD]; 
+       struct dtrace_ptss_page_entry  entries[DTRACE_PTSS_ENTRIES_PER_PAGE]; 
 };
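Using PAGE_MAX_SIZE for both the entry count and the array bound keeps the two in sync and makes the per-page capacity independent of the runtime page size. Rough arithmetic, assuming a 16 KB PAGE_MAX_SIZE (a platform detail, not defined in this header):

	/*
	 * Illustration only: 16384 / 64 = 256 scratch entries per page,
	 * whether the kernel runs with 4 KB or 16 KB pages.
	 */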
 
 struct dtrace_ptss_page_entry* dtrace_ptss_claim_entry(struct proc* p); /* sprlock not held */
index 04385bc6c5d204199bced0ded7103ad65384b07f..60eee50aba19a95513cd7909291dba59348a6853 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2003-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*-
@@ -189,9 +189,9 @@ typedef uint64_t kqueue_id_t;
 
 
 /* kevent system call flags */
-#define KEVENT_FLAG_NONE                         0x000 /* no flag value */
-#define KEVENT_FLAG_IMMEDIATE                    0x001 /* immediate timeout */
-#define KEVENT_FLAG_ERROR_EVENTS                 0x002 /* output events only include change errors */
+#define KEVENT_FLAG_NONE                         0x000000      /* no flag value */
+#define KEVENT_FLAG_IMMEDIATE                    0x000001      /* immediate timeout */
+#define KEVENT_FLAG_ERROR_EVENTS                 0x000002      /* output events only include change errors */
 
 #ifdef PRIVATE
 
@@ -201,34 +201,36 @@ typedef uint64_t kqueue_id_t;
  * instead.
  */
 
-#define KEVENT_FLAG_STACK_EVENTS                 0x004   /* output events treated as stack (grows down) */
-#define KEVENT_FLAG_STACK_DATA                   0x008   /* output data allocated as stack (grows down) */
-#define KEVENT_FLAG_UNBIND_CHECK_FLAGS           0x010   /* check the flags passed to kevent_qos_internal_unbind */
-#define KEVENT_FLAG_WORKQ                        0x020   /* interact with the default workq kq */
-#define KEVENT_FLAG_WORKQ_MANAGER                0x200   /* current thread is the workq manager */
-#define KEVENT_FLAG_WORKLOOP                     0x400   /* interact with the specified workloop kq */
-#define KEVENT_FLAG_SYNCHRONOUS_BIND             0x800   /* synchronous bind callback */
-
-#define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH     0x8000  /* attach current thread to workloop */
-#define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH     0x10000 /* unbind current thread from workloop */
-#define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST        0x20000 /* kq lookup by id must exist */
-#define KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST    0x40000 /* kq lookup by id must not exist */
-#define KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD        0x80000 /* do not create workqueue threads for this worloop */
+#define KEVENT_FLAG_STACK_EVENTS                 0x000004   /* output events treated as stack (grows down) */
+#define KEVENT_FLAG_STACK_DATA                   0x000008   /* output data allocated as stack (grows down) */
+//                                               0x000010
+#define KEVENT_FLAG_WORKQ                        0x000020   /* interact with the default workq kq */
+//      KEVENT_FLAG_LEGACY32                     0x000040
+//      KEVENT_FLAG_LEGACY64                     0x000080
+//                                               0x000100
+#define KEVENT_FLAG_WORKQ_MANAGER                0x000200   /* obsolete */
+#define KEVENT_FLAG_WORKLOOP                     0x000400   /* interact with the specified workloop kq */
+#define KEVENT_FLAG_PARKING                      0x000800   /* workq thread is parking */
+//      KEVENT_FLAG_KERNEL                       0x001000
+//      KEVENT_FLAG_DYNAMIC_KQUEUE               0x002000
+//                                               0x004000
+#define KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH     0x008000   /* obsolete */
+#define KEVENT_FLAG_WORKLOOP_SERVICER_DETACH     0x010000   /* obsolete */
+#define KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST        0x020000   /* kq lookup by id must exist */
+#define KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST    0x040000   /* kq lookup by id must not exist */
+#define KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD        0x080000   /* obsolete */
 
 #ifdef XNU_KERNEL_PRIVATE
 
-#define KEVENT_FLAG_LEGACY32                     0x040   /* event data in legacy 32-bit format */
-#define KEVENT_FLAG_LEGACY64                     0x080   /* event data in legacy 64-bit format */
+#define KEVENT_FLAG_LEGACY32                     0x0040  /* event data in legacy 32-bit format */
+#define KEVENT_FLAG_LEGACY64                     0x0080  /* event data in legacy 64-bit format */
 #define KEVENT_FLAG_KERNEL                       0x1000  /* caller is in-kernel */
 #define KEVENT_FLAG_DYNAMIC_KQUEUE               0x2000  /* kqueue is dynamically allocated */
-#define KEVENT_FLAG_WORKLOOP_CANCELED            0x4000  /* workloop bind was cancelled */
 
 #define KEVENT_FLAG_USER (KEVENT_FLAG_IMMEDIATE | KEVENT_FLAG_ERROR_EVENTS | \
-                          KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \
-                          KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \
-                          KEVENT_FLAG_WORKLOOP_SERVICER_ATTACH | KEVENT_FLAG_WORKLOOP_SERVICER_DETACH | \
-                          KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST | \
-                         KEVENT_FLAG_WORKLOOP_NO_WQ_THREAD)
+               KEVENT_FLAG_STACK_EVENTS | KEVENT_FLAG_STACK_DATA | \
+               KEVENT_FLAG_WORKQ | KEVENT_FLAG_WORKLOOP | \
+               KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST | KEVENT_FLAG_DYNAMIC_KQ_MUST_NOT_EXIST)
 
 /*
  * Since some filter ops are not part of the standard sysfilt_ops, we use
@@ -260,48 +262,48 @@ typedef uint64_t kqueue_id_t;
 #endif /* PRIVATE */
 
 /* actions */
-#define EV_ADD              0x0001             /* add event to kq (implies enable) */
-#define EV_DELETE           0x0002             /* delete event from kq */
-#define EV_ENABLE           0x0004             /* enable event */
-#define EV_DISABLE          0x0008             /* disable event (not reported) */
+#define EV_ADD              0x0001      /* add event to kq (implies enable) */
+#define EV_DELETE           0x0002      /* delete event from kq */
+#define EV_ENABLE           0x0004      /* enable event */
+#define EV_DISABLE          0x0008      /* disable event (not reported) */
 
 /* flags */
-#define EV_ONESHOT          0x0010             /* only report one occurrence */
-#define EV_CLEAR            0x0020             /* clear event state after reporting */
-#define EV_RECEIPT          0x0040             /* force immediate event output */
-                                               /* ... with or without EV_ERROR */
-                                               /* ... use KEVENT_FLAG_ERROR_EVENTS */
-                                               /*     on syscalls supporting flags */
+#define EV_ONESHOT          0x0010      /* only report one occurrence */
+#define EV_CLEAR            0x0020      /* clear event state after reporting */
+#define EV_RECEIPT          0x0040      /* force immediate event output */
+                                        /* ... with or without EV_ERROR */
+                                        /* ... use KEVENT_FLAG_ERROR_EVENTS */
+                                        /*     on syscalls supporting flags */
 
-#define EV_DISPATCH         0x0080             /* disable event after reporting */
-#define EV_UDATA_SPECIFIC   0x0100             /* unique kevent per udata value */
+#define EV_DISPATCH         0x0080      /* disable event after reporting */
+#define EV_UDATA_SPECIFIC   0x0100      /* unique kevent per udata value */
 
 #define EV_DISPATCH2        (EV_DISPATCH | EV_UDATA_SPECIFIC)
-                                               /* ... in combination with EV_DELETE */
-                                               /* will defer delete until udata-specific */
-                                               /* event enabled. EINPROGRESS will be */
-                                               /* returned to indicate the deferral */
+                                        /* ... in combination with EV_DELETE */
+                                        /* will defer delete until udata-specific */
+                                        /* event enabled. EINPROGRESS will be */
+                                        /* returned to indicate the deferral */
 
-#define EV_VANISHED         0x0200             /* report that source has vanished  */
-                                               /* ... only valid with EV_DISPATCH2 */
+#define EV_VANISHED         0x0200      /* report that source has vanished  */
+                                        /* ... only valid with EV_DISPATCH2 */
 
-#define EV_SYSFLAGS         0xF000             /* reserved by system */
-#define EV_FLAG0            0x1000             /* filter-specific flag */
-#define EV_FLAG1            0x2000             /* filter-specific flag */
+#define EV_SYSFLAGS         0xF000      /* reserved by system */
+#define EV_FLAG0            0x1000      /* filter-specific flag */
+#define EV_FLAG1            0x2000      /* filter-specific flag */
 
 /* returned values */
-#define EV_EOF              0x8000             /* EOF detected */
-#define EV_ERROR            0x4000             /* error, data contains errno */
+#define EV_EOF              0x8000      /* EOF detected */
+#define EV_ERROR            0x4000      /* error, data contains errno */
 
 /*
  * Filter specific flags for EVFILT_READ
  *
  * The default behavior for EVFILT_READ is to make the "read" determination
- * relative to the current file descriptor read pointer. 
+ * relative to the current file descriptor read pointer.
  *
  * The EV_POLL flag indicates the determination should be made via poll(2)
  * semantics. These semantics dictate always returning true for regular files,
- * regardless of the amount of unread data in the file.  
+ * regardless of the amount of unread data in the file.
  *
  * On input, EV_OOBAND specifies that filter should actively return in the
  * presence of OOB on the descriptor. It implies that filter will return
@@ -331,7 +333,7 @@ typedef uint64_t kqueue_id_t;
 #define NOTE_TRIGGER   0x01000000
 
 /*
- * On input, the top two bits of fflags specifies how the lower twenty four 
+ * On input, the top two bits of fflags specify how the lower twenty-four
  * bits should be applied to the stored value of fflags.
  *
  * On output, the top two bits will always be set to NOTE_FFNOP and the
@@ -342,7 +344,7 @@ typedef uint64_t kqueue_id_t;
 #define NOTE_FFOR       0x80000000              /* or fflags */
 #define NOTE_FFCOPY     0xc0000000              /* copy fflags */
 #define NOTE_FFCTRLMASK 0xc0000000              /* mask for operations */
-#define NOTE_FFLAGSMASK        0x00ffffff 
+#define NOTE_FFLAGSMASK        0x00ffffff
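These control bits are primarily consumed by EVFILT_USER, where userspace manipulates the stored fflags when triggering the event. A hedged userspace sketch (the kq descriptor, the ident value 1, and the application bit 0x1 are illustrative; the knote is assumed to have been added earlier with EV_ADD):

	/*
	 * Sketch: OR an application-defined bit into the knote's fflags and
	 * fire the EVFILT_USER event in one kevent64() call.
	 */
	struct kevent64_s kev;
	EV_SET64(&kev, 1, EVFILT_USER, 0, NOTE_FFOR | NOTE_TRIGGER | 0x1, 0, 0, 0, 0);
	(void)kevent64(kq, &kev, 1, NULL, 0, 0, NULL);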
 
 #ifdef PRIVATE
 /*
@@ -434,13 +436,13 @@ typedef uint64_t kqueue_id_t;
 /*
  * data/hint fflags for EVFILT_VNODE, shared with userspace
  */
-#define        NOTE_DELETE     0x00000001              /* vnode was removed */
-#define        NOTE_WRITE      0x00000002              /* data contents changed */
-#define        NOTE_EXTEND     0x00000004              /* size increased */
-#define        NOTE_ATTRIB     0x00000008              /* attributes changed */
-#define        NOTE_LINK       0x00000010              /* link count changed */
-#define        NOTE_RENAME     0x00000020              /* vnode was renamed */
-#define        NOTE_REVOKE     0x00000040              /* vnode access was revoked */
+#define NOTE_DELETE    0x00000001              /* vnode was removed */
+#define NOTE_WRITE     0x00000002              /* data contents changed */
+#define NOTE_EXTEND    0x00000004              /* size increased */
+#define NOTE_ATTRIB    0x00000008              /* attributes changed */
+#define NOTE_LINK      0x00000010              /* link count changed */
+#define NOTE_RENAME    0x00000020              /* vnode was renamed */
+#define NOTE_REVOKE    0x00000040              /* vnode access was revoked */
 #define NOTE_NONE      0x00000080              /* No specific vnode event: to test for EVFILT_READ activation*/
 #define NOTE_FUNLOCK   0x00000100              /* vnode was unlocked by flock(2) */
 
@@ -458,22 +460,22 @@ enum {
        eNoteReapDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is deprecated") = 0x10000000
 };
 
-#define        NOTE_EXIT               0x80000000      /* process exited */
-#define        NOTE_FORK               0x40000000      /* process forked */
-#define        NOTE_EXEC               0x20000000      /* process exec'd */
-#define        NOTE_REAP               ((unsigned int)eNoteReapDeprecated /* 0x10000000 */)    /* process reaped */
-#define        NOTE_SIGNAL             0x08000000      /* shared with EVFILT_SIGNAL */
-#define        NOTE_EXITSTATUS         0x04000000      /* exit status to be returned, valid for child process only */
-#define        NOTE_EXIT_DETAIL        0x02000000      /* provide details on reasons for exit */
+#define NOTE_EXIT              0x80000000      /* process exited */
+#define NOTE_FORK              0x40000000      /* process forked */
+#define NOTE_EXEC              0x20000000      /* process exec'd */
+#define NOTE_REAP              ((unsigned int)eNoteReapDeprecated /* 0x10000000 */)    /* process reaped */
+#define NOTE_SIGNAL            0x08000000      /* shared with EVFILT_SIGNAL */
+#define NOTE_EXITSTATUS                0x04000000      /* exit status to be returned, valid for child process only */
+#define NOTE_EXIT_DETAIL       0x02000000      /* provide details on reasons for exit */
 
-#define        NOTE_PDATAMASK  0x000fffff              /* mask for signal & exit status */
-#define        NOTE_PCTRLMASK  (~NOTE_PDATAMASK)
+#define NOTE_PDATAMASK 0x000fffff              /* mask for signal & exit status */
+#define NOTE_PCTRLMASK (~NOTE_PDATAMASK)
 
 /*
  * If NOTE_EXITSTATUS is present, provide additional info about exiting process.
  */
 enum {
-       eNoteExitReparentedDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is no longer sent") = 0x00080000 
+       eNoteExitReparentedDeprecated __deprecated_enum_msg("This kqueue(2) EVFILT_PROC flag is no longer sent") = 0x00080000
 };
 #define NOTE_EXIT_REPARENTED   ((unsigned int)eNoteExitReparentedDeprecated)   /* exited while reparented */
 
@@ -481,8 +483,8 @@ enum {
  * If NOTE_EXIT_DETAIL is present, these bits indicate specific reasons for exiting.
  */
 #define NOTE_EXIT_DETAIL_MASK          0x00070000
-#define        NOTE_EXIT_DECRYPTFAIL           0x00010000 
-#define        NOTE_EXIT_MEMORY                0x00020000
+#define NOTE_EXIT_DECRYPTFAIL          0x00010000
+#define NOTE_EXIT_MEMORY               0x00020000
 #define NOTE_EXIT_CSERROR              0x00040000
 
 #ifdef PRIVATE
@@ -536,15 +538,15 @@ enum {
  */
 #define EVFILT_MEMORYSTATUS_ALL_MASK \
        (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL | NOTE_MEMORYSTATUS_LOW_SWAP | \
-        NOTE_MEMORYSTATUS_PROC_LIMIT_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL | NOTE_MEMORYSTATUS_MSL_STATUS)
+        NOTE_MEMORYSTATUS_PROC_LIMIT_WARN | NOTE_MEMORYSTATUS_PROC_LIMIT_CRITICAL | NOTE_MEMORYSTATUS_MSL_STATUS)
 
 #endif /* KERNEL_PRIVATE */
 
 typedef enum vm_pressure_level {
-        kVMPressureNormal   = 0,
-        kVMPressureWarning  = 1,
-        kVMPressureUrgent   = 2,
-        kVMPressureCritical = 3,
+       kVMPressureNormal   = 0,
+       kVMPressureWarning  = 1,
+       kVMPressureUrgent   = 2,
+       kVMPressureCritical = 3,
 } vm_pressure_level_t;
 
 #endif /* PRIVATE */
@@ -561,7 +563,7 @@ typedef enum vm_pressure_level {
 #define NOTE_NSECONDS  0x00000004              /* data is nanoseconds     */
 #define NOTE_ABSOLUTE  0x00000008              /* absolute timeout        */
        /* ... implicit EV_ONESHOT, timeout uses the gettimeofday epoch */
-#define NOTE_LEEWAY    0x00000010              /* ext[1] holds leeway for power aware timers */
+#define NOTE_LEEWAY            0x00000010              /* ext[1] holds leeway for power aware timers */
 #define NOTE_CRITICAL  0x00000020              /* system does minimal timer coalescing */
 #define NOTE_BACKGROUND        0x00000040              /* system does maximum timer coalescing */
 #define NOTE_MACH_CONTINUOUS_TIME      0x00000080
@@ -580,27 +582,32 @@ typedef enum vm_pressure_level {
  * data/hint fflags for EVFILT_SOCK, shared with userspace.
  *
  */
-#define        NOTE_CONNRESET          0x00000001 /* Received RST */
-#define        NOTE_READCLOSED         0x00000002 /* Read side is shutdown */
-#define        NOTE_WRITECLOSED        0x00000004 /* Write side is shutdown */
-#define        NOTE_TIMEOUT            0x00000008 /* timeout: rexmt, keep-alive or persist */
-#define        NOTE_NOSRCADDR          0x00000010 /* source address not available */
-#define        NOTE_IFDENIED           0x00000020 /* interface denied connection */
-#define        NOTE_SUSPEND            0x00000040 /* output queue suspended */
-#define        NOTE_RESUME             0x00000080 /* output queue resumed */
+#define NOTE_CONNRESET         0x00000001 /* Received RST */
+#define NOTE_READCLOSED                0x00000002 /* Read side is shutdown */
+#define NOTE_WRITECLOSED       0x00000004 /* Write side is shutdown */
+#define NOTE_TIMEOUT           0x00000008 /* timeout: rexmt, keep-alive or persist */
+#define NOTE_NOSRCADDR         0x00000010 /* source address not available */
+#define NOTE_IFDENIED          0x00000020 /* interface denied connection */
+#define NOTE_SUSPEND           0x00000040 /* output queue suspended */
+#define NOTE_RESUME            0x00000080 /* output queue resumed */
 #define NOTE_KEEPALIVE         0x00000100 /* TCP Keepalive received */
 #define NOTE_ADAPTIVE_WTIMO    0x00000200 /* TCP adaptive write timeout */
 #define NOTE_ADAPTIVE_RTIMO    0x00000400 /* TCP adaptive read timeout */
-#define        NOTE_CONNECTED          0x00000800 /* socket is connected */
-#define        NOTE_DISCONNECTED       0x00001000 /* socket is disconnected */
-#define        NOTE_CONNINFO_UPDATED   0x00002000 /* connection info was updated */
-#define        NOTE_NOTIFY_ACK         0x00004000 /* notify acknowledgement */
+#define NOTE_CONNECTED         0x00000800 /* socket is connected */
+#define NOTE_DISCONNECTED      0x00001000 /* socket is disconnected */
+#define NOTE_CONNINFO_UPDATED  0x00002000 /* connection info was updated */
+#define NOTE_NOTIFY_ACK                0x00004000 /* notify acknowledgement */
 
-#define        EVFILT_SOCK_LEVEL_TRIGGER_MASK \
-    (NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_SUSPEND | NOTE_RESUME | NOTE_CONNECTED | NOTE_DISCONNECTED)
+#define EVFILT_SOCK_LEVEL_TRIGGER_MASK \
+               (NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_SUSPEND | NOTE_RESUME | \
+                NOTE_CONNECTED | NOTE_DISCONNECTED)
 
 #define EVFILT_SOCK_ALL_MASK \
-    (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED | NOTE_NOTIFY_ACK)
+               (NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED | NOTE_TIMEOUT | \
+               NOTE_NOSRCADDR | NOTE_IFDENIED | NOTE_SUSPEND | NOTE_RESUME | \
+               NOTE_KEEPALIVE | NOTE_ADAPTIVE_WTIMO | NOTE_ADAPTIVE_RTIMO | \
+               NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNINFO_UPDATED | \
+               NOTE_NOTIFY_ACK)
 
 #endif /* PRIVATE */
 
@@ -623,7 +630,7 @@ typedef enum vm_pressure_level {
  * system call argument specifying an output area (kevent_qos) will be consulted. If
  * the system call specified an output data area, the user-space address
  * of the received message is carved from that provided output data area (if enough
- * space remains there). The address and length of each received message is 
+ * space remains there). The address and length of each received message is
  * returned in the ext[0] and ext[1] fields (respectively) of the corresponding kevent.
  *
  * IF_MACH_RCV_VOUCHER_CONTENT is specified, the contents of the message voucher is
@@ -642,9 +649,9 @@ typedef enum vm_pressure_level {
  * NOTE_TRACK, NOTE_TRACKERR, and NOTE_CHILD are no longer supported as of 10.5
  */
 /* additional flags for EVFILT_PROC */
-#define        NOTE_TRACK      0x00000001              /* follow across forks */
-#define        NOTE_TRACKERR   0x00000002              /* could not track child */
-#define        NOTE_CHILD      0x00000004              /* am a child process */
+#define NOTE_TRACK     0x00000001              /* follow across forks */
+#define NOTE_TRACKERR  0x00000002              /* could not track child */
+#define NOTE_CHILD     0x00000004              /* am a child process */
 
 
 #ifdef PRIVATE
@@ -652,7 +659,7 @@ typedef enum vm_pressure_level {
 
 #ifndef KERNEL
 /* Temporary solution for BootX to use inode.h until kqueue moves to vfs layer */
-#include <sys/queue.h> 
+#include <sys/queue.h>
 struct knote;
 SLIST_HEAD(klist, knote);
 #endif
@@ -660,10 +667,11 @@ SLIST_HEAD(klist, knote);
 #ifdef KERNEL
 
 #ifdef XNU_KERNEL_PRIVATE
-#include <sys/queue.h> 
+#include <sys/queue.h>
 #include <kern/kern_types.h>
 #include <sys/fcntl.h> /* FREAD, FWRITE */
 #include <kern/debug.h> /* panic */
+#include <pthread/priority_private.h>
 
 #ifdef MALLOC_DECLARE
 MALLOC_DECLARE(M_KQUEUE);
@@ -671,58 +679,61 @@ MALLOC_DECLARE(M_KQUEUE);
 
 TAILQ_HEAD(kqtailq, knote);    /* a list of "queued" events */
 
-/* Bit size for packed field within knote */
-#define KNOTE_KQ_BITSIZE                       40
-
-
 /* index into various kq queues */
-typedef uint8_t kq_index_t; 
+typedef uint8_t kq_index_t;
 typedef uint16_t kn_status_t;
 
-#define KN_ACTIVE          0x0001              /* event has been triggered */
-#define KN_QUEUED          0x0002              /* event is on queue */
-#define KN_DISABLED        0x0004              /* event is disabled */
-#define KN_DROPPING        0x0008              /* knote is being dropped */
-#define KN_USEWAIT         0x0010              /* wait for knote use */
-#define KN_ATTACHING       0x0020              /* event is pending attach */
-#define KN_STAYACTIVE      0x0040              /* force event to stay active */
-#define KN_DEFERDELETE     0x0080              /* defer delete until re-enabled */
-#define KN_ATTACHED        0x0100              /* currently attached to source */
-#define KN_DISPATCH        0x0200              /* disables as part of deliver */
-#define KN_UDATA_SPECIFIC  0x0400              /* udata is part of matching */
-#define KN_SUPPRESSED      0x0800              /* event is suppressed during delivery */
-#define KN_STOLENDROP     0x1000               /* someone stole the drop privilege */
-#define KN_REQVANISH       0x2000       /* requested EV_VANISH */
-#define KN_VANISHED        0x4000       /* has vanished */
-
+#define KN_ACTIVE          0x0001      /* event has been triggered */
+#define KN_QUEUED          0x0002      /* event is on queue */
+#define KN_DISABLED        0x0004      /* event is disabled */
+#define KN_DROPPING        0x0008      /* knote is being dropped */
+#define KN_LOCKED          0x0010      /* knote is locked (kq_knlocks) */
+#define KN_ATTACHING       0x0020      /* event is pending attach */
+#define KN_STAYACTIVE      0x0040      /* force event to stay active */
+#define KN_DEFERDELETE     0x0080      /* defer delete until re-enabled */
+#define KN_ATTACHED        0x0100      /* currently attached to source */
+#define KN_DISPATCH        0x0200      /* disables as part of deliver */
+#define KN_UDATA_SPECIFIC  0x0400      /* udata is part of matching */
+#define KN_SUPPRESSED      0x0800      /* event is suppressed during delivery */
+#define KN_MERGE_QOS       0x1000      /* f_event() / f_* ran concurrently and
+                                                                          overrides must merge */
+#define KN_REQVANISH       0x2000      /* requested EV_VANISH */
+#define KN_VANISHED        0x4000      /* has vanished */
+//                         0x8000
+
+/* combination defines deferred-delete mode enabled */
 #define KN_DISPATCH2           (KN_DISPATCH | KN_UDATA_SPECIFIC)
-                                       /* combination defines deferred-delete mode enabled */
 
+#define KNOTE_KQ_BITSIZE    42
+_Static_assert(KNOTE_KQ_BITSIZE >= VM_KERNEL_POINTER_SIGNIFICANT_BITS,
+               "Make sure sign extending kn_kq_packed is legit");
+
+struct kqueue;
 struct knote {
        TAILQ_ENTRY(knote)       kn_tqe;            /* linkage for tail queue */
        SLIST_ENTRY(knote)       kn_link;           /* linkage for search list */
        SLIST_ENTRY(knote)       kn_selnext;        /* klist element chain */
-       union {
-               struct fileproc      *p_fp;             /* file data pointer */
-               struct proc          *p_proc;           /* proc pointer */
-               struct ipc_mqueue    *p_mqueue;         /* pset pointer */
-       } kn_ptr;
-       uint64_t                     kn_req_index:3,                   /* requested qos index */
-                                    kn_qos_index:3,                   /* in-use qos index */
-                                    kn_qos_override:3,                /* qos override index */
-                                    kn_qos_sync_override:3,           /* qos sync override index */
-                                    kn_vnode_kqok:1,
-                                    kn_vnode_use_ofst:1,
-                                    kn_qos_override_is_sync:1,        /* qos override index is a sync override */
-                                    kn_reserved:1,                    /* reserved bits */
-                                    kn_filtid:8,                      /* filter id to index filter ops */
-                                    kn_kq_packed:KNOTE_KQ_BITSIZE;    /* packed pointer for kq */
-
+       uintptr_t                kn_filtid:8,       /* filter id to index filter ops */
+                                kn_req_index:4,    /* requested qos index */
+                                kn_qos_index:4,    /* in-use qos index */
+                                kn_qos_override:4, /* qos override index */
+                                kn_vnode_kqok:1,
+                                kn_vnode_use_ofst:1;
+#if __LP64__
+       intptr_t                 kn_kq_packed : KNOTE_KQ_BITSIZE;
+#else
+       intptr_t                 kn_kq_packed;
+#endif
        union {
                void                 *kn_hook;
                uint64_t             kn_hook_data;
        };
        int64_t                  kn_sdata;          /* saved data field */
+       union {
+               struct fileproc      *p_fp;             /* file data pointer */
+               struct proc          *p_proc;           /* proc pointer */
+               struct ipc_mqueue    *p_mqueue;         /* pset pointer */
+       } kn_ptr;
        struct kevent_internal_s kn_kevent;
        int                      kn_sfflags;        /* saved filter flags */
        int                      kn_hookid;
@@ -741,28 +752,16 @@ struct knote {
 #define kn_fp          kn_ptr.p_fp
 };
 
-static inline struct kqueue *knote_get_kq(struct knote *kn)
-{
-       if (!(kn->kn_kq_packed))
-               return 0;
-       else
-               return (struct kqueue *)((uintptr_t)(kn->kn_kq_packed) + (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
-}
-
-static inline void knote_set_kq(struct knote *kn, void *kq)
+static inline struct kqueue *
+knote_get_kq(struct knote *kn)
 {
-       if (!kq)
-               kn->kn_kq_packed = 0;
-       else {
-               uint64_t offset = ((uintptr_t)kq - (uintptr_t)VM_MIN_KERNEL_AND_KEXT_ADDRESS);
-               kn->kn_kq_packed = offset;
-       }
+       return (struct kqueue *)kn->kn_kq_packed;
 }
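knote_get_kq() can shrink to a plain cast because kn_kq_packed is now a signed bitfield: the store truncates the kqueue pointer to KNOTE_KQ_BITSIZE bits and the load sign-extends it, which round-trips exactly when every significant kernel-pointer bit fits in that width (the _Static_assert above). Illustrative numbers only (the example pointer value is made up):

	/*
	 * Example: a kernel pointer such as 0xffffff80deadbeef has all of its
	 * top bits (bit 41 and above) set, so keeping the low 42 bits on store
	 * and sign-extending on load reproduces the original value:
	 *   kn->kn_kq_packed = (intptr_t)kq;           // store: low 42 bits kept
	 *   kq == (struct kqueue *)kn->kn_kq_packed;   // load: sign extension restores the rest
	 */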
 
 static inline int knote_get_seltype(struct knote *kn)
 {
        switch (kn->kn_filter) {
-       case EVFILT_READ: 
+       case EVFILT_READ:
                return FREAD;
        case EVFILT_WRITE:
                return FWRITE;
@@ -792,8 +791,20 @@ typedef struct filt_process_s *filt_process_data_t;
  * Filter operators
  *
  * These routines, provided by each filter, are called to attach, detach, deliver events,
- * change/update filter registration and process/deliver events.  They are called with the
- * with a use-count referenced knote, with the kq unlocked.  Here are more details:
+ * change/update filter registration and process/deliver events:
+ *
+ * - the f_attach, f_touch, f_process, f_peek and f_detach callbacks are always
+ *   serialized with respect to each other for the same knote.
+ *
+ * - the f_event routine is called with a use-count taken on the knote to
+ *   prolongate its lifetime and protect against drop, but is not otherwise
+ *   serialized with other routine calls.
+ *
+ * - the f_detach routine is always called last, and is serialized with all
+ *   other callbacks, including f_event calls.
+ *
+ *
+ * Here are more details:
  *
  * f_isfd -
  *        identifies if the "ident" field in the kevent structure is a file-descriptor.
@@ -808,17 +819,17 @@ typedef struct filt_process_s *filt_process_data_t;
  * f_adjusts_qos -
  *        identifies if the filter can adjust its QoS during its lifetime.
  *
- *        Currently, EVFILT_MAACHPORT is the only filter using this facility.
+ *        Filters using this facility should request the new overrides they want
+ *        using the appropriate FILTER_{RESET,ADJUST}_EVENT_QOS extended codes.
  *
- * f_needs_boost -
- *        [OPTIONAL] used by filters to communicate they need to hold a boost
- *        while holding a usecount on this knote. This is called with the kqlock
- *        held.
+ *        Currently, EVFILT_MACHPORT is the only filter using this facility.
  *
- *        This is only used by EVFILT_WORKLOOP currently.
+ * f_extended_codes -
+ *        identifies if the filter returns extended codes from its routines
+ *        (see FILTER_ACTIVE, ...) or 0 / 1 values.
  *
  * f_attach -
- *           called to attach the knote to the underlying object that will be delivering events
+ *        called to attach the knote to the underlying object that will be delivering events
  *        through it when EV_ADD is supplied and no existing matching event is found
  *
  *        provided a knote that is pre-attached to the fd or hashed (see above) but is
@@ -836,21 +847,9 @@ typedef struct filt_process_s *filt_process_data_t;
  *        The return value indicates if the knote should already be considered "activated" at
 *        the time of attach (one or more of the interest events has already occurred).
  *
- * f_post_attach -
- *        [OPTIONAL] called after a successful attach, with the kqueue lock held,
- *        returns lock held, may drop and re-acquire
- *
- *        If this function is non-null, then it indicates that the filter wants
- *        to perform an action after a successful ATTACH of a knote.
- *
- *        Currently, EVFILT_WORKLOOP is the only filter using this facility.
- *
- *        The return value indicates an error to report to userland.
- *
- *
  * f_detach -
  *        called to disassociate the knote from the underlying object delivering events
- *           the filter should not attempt to deliver events through this knote after this
+ *        the filter should not attempt to deliver events through this knote after this
  *        operation returns control to the kq system.
  *
  * f_event -
@@ -864,24 +863,8 @@ typedef struct filt_process_s *filt_process_data_t;
  *        The return value indicates if the knote should already be considered "activated" at
 *        the time of attach (one or more of the interest events has already occurred).
  *
- * f_drop_and_unlock -
- *        [OPTIONAL] called with the kqueue locked, and has to unlock
- *
- *        If this function is non-null, then it indicates that the filter
- *        wants to handle EV_DELETE events. This is necessary if a particular
- *        filter needs to synchronize knote deletion with its own filter lock.
- *        Currently, EVFILT_WORKLOOP is the only filter using this facility.
- *
- *        The return value indicates an error during the knote drop, i.e., the
- *        knote still exists and user space should re-drive the EV_DELETE.
- *
- *        If the return value is ERESTART, kevent_register() is called from
- *        scratch again (useful to wait for usecounts to drop and then
- *        reevaluate the relevance of that drop)
- *
- *
  * f_process -
- *        called when attempting to deliver triggered events to user-space. 
+ *        called when attempting to deliver triggered events to user-space.
  *
  *        If the knote was previously activated, this operator will be called when a
  *        thread is trying to deliver events to user-space.  The filter gets one last
@@ -912,47 +895,148 @@ typedef struct filt_process_s *filt_process_data_t;
  *        Unless one of the special output flags was set in the output kevent, a non-
  *        zero return value ALSO indicates that the knote should be re-activated
  *        for future event processing (in case it delivers level-based or a multi-edge
- *        type events like message queues that already exist).  
+ *        type events like message queues that already exist).
  *
  *        NOTE: In the future, the boolean may change to an enum that allows more
  *              explicit indication of just delivering a current event vs delivering
  *              an event with more events still pending.
  *
  * f_touch -
- *        called to update the knote with new state from the user during EVFILT_ADD/ENABLE/DISABLE
- *        on an already-attached knote.
+ *        called to update the knote with new state from the user during
+ *        EVFILT_ADD/ENABLE/DISABLE on an already-attached knote.
  *
  *        f_touch should copy relevant new data from the kevent into the knote.
- *        (if KN_UDATA_SPECIFIC is not set, you may need to update the udata too)
  *
- *        operator must lock against concurrent f_event and f_process operations.
+ *        operator must lock against concurrent f_event operations.
  *
- *        A return value of 1 indicates that the knote should now be considered 'activated'.
+ *        A return value of 1 indicates that the knote should now be considered
+ *        'activated'.
  *
- *        f_touch can set EV_ERROR with specific error in the data field to return an error to the client.
- *        You should return 1 to indicate that the kevent needs to be activated and processed.
+ *        f_touch can set EV_ERROR with specific error in the data field to
+ *        return an error to the client. You should return 1 to indicate that
+ *        the kevent needs to be activated and processed.
  *
  * f_peek -
- *        For knotes marked KN_STAYACTIVE, indicate if the knote is truly active at
- *        the moment (not used for event delivery, but for status checks).
+ *        For knotes marked KN_STAYACTIVE, indicate if the knote is truly active
+ *        at the moment (not used for event delivery, but for status checks).
+ *
+ * f_allow_drop -
+ *
+ *        [OPTIONAL] If this function is non-null, then it indicates that the
+ *        filter wants to validate EV_DELETE events. This is necessary if
+ *        a particular filter needs to synchronize knote deletion with its own
+ *        filter lock.
+ *
+ *        When true is returned, the EV_DELETE is allowed and can proceed.
+ *
+ *        If false is returned, the EV_DELETE doesn't proceed, and the passed in
+ *        kevent is used for the copyout to userspace.
+ *
+ *        Currently, EVFILT_WORKLOOP is the only filter using this facility.
+ *
+ * f_post_register_wait -
+ *        [OPTIONAL] called when attach or touch return the FILTER_REGISTER_WAIT
+ *        extended code bit. It is possible to use this facility when the last
+ *        register command wants to wait.
+ *
+ *        Currently, EVFILT_WORKLOOP is the only filter using this facility.
  */
 
+struct _kevent_register;
+struct knote_lock_ctx;
+struct proc;
+struct uthread;
+struct waitq;
+
 struct filterops {
-       bool    f_isfd;         /* true if ident == filedescriptor */
-       bool    f_adjusts_qos; /* true if the filter can override the knote */
-       bool    (*f_needs_boost)(struct kevent_internal_s *kev);
+       bool    f_isfd;               /* true if ident == filedescriptor */
+       bool    f_adjusts_qos;    /* true if the filter can override the knote */
+       bool    f_extended_codes; /* hooks return extended codes */
+
        int     (*f_attach)(struct knote *kn, struct kevent_internal_s *kev);
-       int     (*f_post_attach)(struct knote *kn, struct kevent_internal_s *kev);
        void    (*f_detach)(struct knote *kn);
        int     (*f_event)(struct knote *kn, long hint);
        int     (*f_touch)(struct knote *kn, struct kevent_internal_s *kev);
-       int     (*f_drop_and_unlock)(struct knote *kn, struct kevent_internal_s *kev);
        int     (*f_process)(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-       unsigned (*f_peek)(struct knote *kn);
+       int     (*f_peek)(struct knote *kn);
+
+       /* optional & advanced */
+       bool    (*f_allow_drop)(struct knote *kn, struct kevent_internal_s *kev);
+       void    (*f_post_register_wait)(struct uthread *uth, struct knote_lock_ctx *ctx,
+                       struct _kevent_register *ss_kr);
 };
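To make the new layout concrete, here is a hedged sketch of how a filter might populate the table under the extended-codes convention. The filter name and callbacks are hypothetical; only the field names and signatures come from the struct above:

	/* Hypothetical filter; none of these callbacks exist in xnu. */
	static int  filt_example_attach(struct knote *kn, struct kevent_internal_s *kev);
	static void filt_example_detach(struct knote *kn);
	static int  filt_example_event(struct knote *kn, long hint);
	static int  filt_example_touch(struct knote *kn, struct kevent_internal_s *kev);
	static int  filt_example_process(struct knote *kn, struct filt_process_s *data,
	                struct kevent_internal_s *kev);

	static const struct filterops example_filtops = {
		.f_isfd           = false,  /* ident is not a file descriptor */
		.f_extended_codes = true,   /* callbacks return FILTER_* codes below */
		.f_attach         = filt_example_attach,
		.f_detach         = filt_example_detach,
		.f_event          = filt_example_event,
		.f_touch          = filt_example_touch,
		.f_process        = filt_example_process,
	};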
 
-struct proc;
-struct waitq;
+/*
+ * Extended codes returned by filter routines when f_extended_codes is set.
+ *
+ * FILTER_ACTIVE
+ *     The filter is active and a call to f_process() may return an event.
+ *
+ *     For f_process() the meaning is slightly different: the knote will be
+ *     activated again as long as f_process returns FILTER_ACTIVE, unless
+ *     EV_CLEAR is set, which requires a new f_event to reactivate the knote.
+ *
+ *     Valid:    f_attach, f_event, f_touch, f_process, f_peek
+ *     Implicit: -
+ *     Ignored:  -
+ *
+ * FILTER_REGISTER_WAIT
+ *     The filter wants its f_post_register_wait() to be called.
+ *
+ *     Note: It is only valid to ask for this behavior for a workloop kqueue,
+ *     and is really only meant to be used by EVFILT_WORKLOOP.
+ *
+ *     Valid:    f_attach, f_touch
+ *     Implicit: -
+ *     Ignored:  f_event, f_process, f_peek
+ *
+ * FILTER_UPDATE_REQ_QOS
+ *     The filter wants the passed in QoS to be updated as the new intrinsic qos
+ *     for this knote. If the kevent `qos` field is 0, no update is performed.
+ *
+ *     This also will reset the event QoS, so FILTER_ADJUST_EVENT_QOS() must
+ *     also be used if an override should be maintained.
+ *
+ *     Valid:    f_touch
+ *     Implicit: f_attach
+ *     Ignored:  f_event, f_process, f_peek
+ *
+ * FILTER_RESET_EVENT_QOS
+ * FILTER_ADJUST_EVENT_QOS(qos)
+ *     The filter wants the QoS of the next event delivery to be overridden
+ *     at the specified QoS.  This allows for the next event QoS to be elevated
+ *     from the knote requested qos (See FILTER_UPDATE_REQ_QOS).
+ *
+ *     Event QoS Overrides are reset when a particular knote is no longer
+ *     active. Hence this is ignored if FILTER_ACTIVE isn't also returned.
+ *
+ *     Races between an f_event() and any other f_* routine asking for
+ *     a specific QoS override are handled generically and the filters do not
+ *     have to worry about them.
+ *
+ *     To use this facility, filters MUST set their f_adjusts_qos bit to true.
+ *
+ *     It is expected that filters will return the new QoS they expect to be
+ *     applied from any f_* callback except for f_process() where no specific
+ *     information should be provided. Filters should not try to hide no-ops,
+ *     kevent will already optimize these away.
+ *
+ *     Valid:    f_touch, f_attach, f_event, f_process
+ *     Implicit: -
+ *     Ignored:  f_peek
+ */
+#define FILTER_ACTIVE                       0x00000001
+#define FILTER_REGISTER_WAIT                0x00000002
+#define FILTER_UPDATE_REQ_QOS               0x00000004
+#define FILTER_ADJUST_EVENT_QOS_BIT         0x00000008
+#define FILTER_ADJUST_EVENT_QOS_MASK        0x00000070
+#define FILTER_ADJUST_EVENT_QOS_SHIFT 4
+#define FILTER_ADJUST_EVENT_QOS(qos) \
+               (((qos) << FILTER_ADJUST_EVENT_QOS_SHIFT) | FILTER_ADJUST_EVENT_QOS_BIT)
+#define FILTER_RESET_EVENT_QOS              FILTER_ADJUST_EVENT_QOS_BIT
+
+#define filter_call(_ops, call)  \
+               ((_ops)->f_extended_codes ? (_ops)->call : !!((_ops)->call))
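A hedged example of how an extended-codes filter combines these bits, for instance from its f_event() callback (became_active and qos are placeholders, not kernel symbols):

	/*
	 * Illustrative f_event() return path: activate the knote and request
	 * an event QoS override in the same return value.
	 */
	if (became_active) {
		return FILTER_ACTIVE | FILTER_ADJUST_EVENT_QOS(qos);
	}
	return 0;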
 
 SLIST_HEAD(klist, knote);
 extern void    knote_init(void);
@@ -965,17 +1049,20 @@ extern void      klist_init(struct klist *list);
 extern void    knote(struct klist *list, long hint);
 extern int     knote_attach(struct klist *list, struct knote *kn);
 extern int     knote_detach(struct klist *list, struct knote *kn);
-extern void knote_vanish(struct klist *list);
+extern void    knote_vanish(struct klist *list);
+extern void    knote_link_waitqset_lazy_alloc(struct knote *kn);
+extern boolean_t knote_link_waitqset_should_lazy_alloc(struct knote *kn);
 extern int     knote_link_waitq(struct knote *kn, struct waitq *wq, uint64_t *reserved_link);
 extern int     knote_unlink_waitq(struct knote *kn, struct waitq *wq);
-extern void    knote_fdclose(struct proc *p, int fd, int force);
+extern void    knote_fdclose(struct proc *p, int fd);
 extern void    knote_markstayactive(struct knote *kn);
 extern void    knote_clearstayactive(struct knote *kn);
-extern void knote_adjust_qos(struct knote *kn, int qos, int override, kq_index_t sync_override_index);
-extern void knote_adjust_sync_qos(struct knote *kn, kq_index_t sync_qos, boolean_t lock_kq);
 extern const struct filterops *knote_fops(struct knote *kn);
 extern void knote_set_error(struct knote *kn, int error);
 
+extern struct turnstile *kqueue_turnstile(struct kqueue *);
+extern struct turnstile *kqueue_alloc_turnstile(struct kqueue *);
+
 int kevent_exit_on_workloop_ownership_leak(thread_t thread);
 int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize);
 int kevent_copyout_proc_dynkqids(void *proc, user_addr_t ubuf,
@@ -987,7 +1074,7 @@ int kevent_copyout_dynkqextinfo(void *proc, kqueue_id_t kq_id, user_addr_t ubuf,
 
 #elif defined(KERNEL_PRIVATE) /* !XNU_KERNEL_PRIVATE: kexts still need a klist structure definition */
 
-#include <sys/queue.h> 
+#include <sys/queue.h>
 struct proc;
 struct knote;
 SLIST_HEAD(klist, knote);
@@ -998,17 +1085,12 @@ SLIST_HEAD(klist, knote);
 #ifdef PRIVATE
 
 /* make these private functions available to the pthread kext */
-extern int     kevent_qos_internal(struct proc *p, int fd, 
+extern int     kevent_qos_internal(struct proc *p, int fd,
                            user_addr_t changelist, int nchanges,
                            user_addr_t eventlist, int nevents,
                            user_addr_t data_out, user_size_t *data_available,
                            unsigned int flags, int32_t *retval);
 
-extern int  kevent_qos_internal_bind(struct proc *p,
-                int qos, thread_t thread, unsigned int flags);
-extern int  kevent_qos_internal_unbind(struct proc *p,
-                int qos, thread_t thread, unsigned int flags);
-
 extern int     kevent_id_internal(struct proc *p, kqueue_id_t *id,
                            user_addr_t changelist, int nchanges,
                            user_addr_t eventlist, int nevents,
@@ -1018,7 +1100,7 @@ extern int        kevent_id_internal(struct proc *p, kqueue_id_t *id,
 #endif  /* PRIVATE */
 #endif  /* KERNEL_PRIVATE */
 
-#else  /* KERNEL */
+#else  /* KERNEL */
 
 #include <sys/types.h>
 
@@ -1026,24 +1108,24 @@ struct timespec;
 
 __BEGIN_DECLS
 int     kqueue(void);
-int     kevent(int kq, 
+int     kevent(int kq,
               const struct kevent *changelist, int nchanges,
               struct kevent *eventlist, int nevents,
               const struct timespec *timeout);
-int     kevent64(int kq, 
+int     kevent64(int kq,
                 const struct kevent64_s *changelist, int nchanges,
                 struct kevent64_s *eventlist, int nevents,
-                unsigned int flags, 
+                unsigned int flags,
                 const struct timespec *timeout);
 
 #ifdef PRIVATE
-int     kevent_qos(int kq, 
+int     kevent_qos(int kq,
                   const struct kevent_qos_s *changelist, int nchanges,
                   struct kevent_qos_s *eventlist, int nevents,
                   void *data_out, size_t *data_available,
                   unsigned int flags);
 
-int     kevent_id(kqueue_id_t id, 
+int     kevent_id(kqueue_id_t id,
                   const struct kevent_qos_s *changelist, int nchanges,
                   struct kevent_qos_s *eventlist, int nevents,
                   void *data_out, size_t *data_available,
@@ -1063,5 +1145,4 @@ __END_DECLS
 
 #endif /* PRIVATE */
 
-
 #endif /* !_SYS_EVENT_H_ */
index 79ed9351267859686f49741d81e407623da9ba04..e5b717bfed5a637e9e2e570354e28a1255812041 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2016-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -131,8 +131,7 @@ typedef struct eventhandler_entry   *eventhandler_tag;
                        EHL_LOCK_SPIN((list));                          \
                }                                                       \
        }                                                               \
-       KASSERT((list)->el_runcount > 0,                                \
-           ("eventhandler_invoke: runcount underflow"));               \
+       VERIFY((list)->el_runcount > 0);                                \
        (list)->el_runcount--;                                          \
        if ((list)->el_runcount == 0) {                                 \
                EHL_LOCK_CONVERT((list));                               \
index 82323625f9887af1fa8e303557c5864bc0f3e138..e60eaeb867812ee441f263bc3e099b23f843f45c 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
-/*-
+/*
  * Copyright (c) 1999,2000 Jonathan Lemon <jlemon@FreeBSD.org>
  * All rights reserved.
  *
@@ -68,8 +68,8 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
 
 #include <stdint.h>
 #include <kern/locks.h>
-#include <sys/pthread_shims.h>
 #include <mach/thread_policy.h>
+#include <pthread/workqueue_internal.h>
 
 /*
  * Lock ordering:
@@ -100,6 +100,40 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
 
 #define KQEXTENT       256             /* linear growth by this amount */
 
+struct knote_lock_ctx {
+       struct knote                       *knlc_knote;
+       thread_t                            knlc_thread;
+       // TODO: knlc_turnstile
+       TAILQ_HEAD(, knote_lock_ctx)        knlc_head;
+       union {
+               LIST_ENTRY(knote_lock_ctx)      knlc_le;
+               TAILQ_ENTRY(knote_lock_ctx)     knlc_tqe;
+       };
+#if DEBUG || DEVELOPMENT
+#define KNOTE_LOCK_CTX_UNLOCKED 0
+#define KNOTE_LOCK_CTX_LOCKED   1
+#define KNOTE_LOCK_CTX_WAITING  2
+       int knlc_state;
+#endif
+};
+LIST_HEAD(knote_locks, knote_lock_ctx);
+
+#if DEBUG || DEVELOPMENT
+/*
+ * KNOTE_LOCK_CTX(name) is a convenience macro to define a knote lock context on
+ * the stack named `name`. In development kernels, it uses tricks to make sure
+ * no locks are still held when exiting the C-scope that contains this context.
+ */
+__attribute__((noinline,not_tail_called))
+void knote_lock_ctx_chk(struct knote_lock_ctx *ctx);
+#define KNOTE_LOCK_CTX(n) \
+               struct knote_lock_ctx n __attribute__((cleanup(knote_lock_ctx_chk))); \
+               n.knlc_state = KNOTE_LOCK_CTX_UNLOCKED
+#else
+#define KNOTE_LOCK_CTX(n) \
+               struct knote_lock_ctx n
+#endif
+
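A minimal sketch, assuming only what this hunk and the kevent_register() prototype later in this diff declare, of how the new lock context would be used inside the kernel; the function name is illustrative.

    /* Hedged sketch: declare a knote lock context on the stack and hand it to
     * kevent_register().  On DEBUG/DEVELOPMENT kernels the cleanup attribute
     * added by KNOTE_LOCK_CTX() checks the context is unlocked at scope exit. */
    static int
    register_one_event(struct kqueue *kq, struct kevent_internal_s *kev)
    {
            KNOTE_LOCK_CTX(knlc);   /* expands to a struct knote_lock_ctx */

            return kevent_register(kq, kev, &knlc);
    }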
 /*
  * kqueue - common core definition of a kqueue
  *
@@ -108,13 +142,17 @@ typedef void (*kqueue_continue_t)(struct kqueue *, void *, int);
  *          derived from this definition.
  */
 struct kqueue {
-       struct waitq_set    kq_wqs;       /* private waitq set */
-       lck_spin_t          kq_lock;      /* kqueue lock */
-       uint16_t            kq_state;     /* state of the kq */
-       uint16_t            kq_level;     /* nesting level of the kq */
-       uint32_t            kq_count;     /* number of queued events */
-       struct proc         *kq_p;        /* process containing kqueue */
-       struct kqtailq      kq_queue[1];  /* variable array of kqtailq structs */
+       struct {
+               struct waitq_set    kq_wqs;       /* private waitq set */
+               lck_spin_t          kq_lock;      /* kqueue lock */
+               uint16_t            kq_state;     /* state of the kq */
+               uint16_t            kq_level;     /* nesting level of the kq */
+               uint32_t            kq_count;     /* number of queued events */
+               struct proc        *kq_p;         /* process containing kqueue */
+               struct knote_locks  kq_knlocks;   /* list of knote locks held */
+               lck_spin_t          kq_reqlock;   /* kqueue request lock */
+       }; /* make sure struct padding is put before kq_queue */
+       struct kqtailq      kq_queue[0];      /* variable array of queues */
 };
 
 #define KQ_SEL            0x001  /* select was recorded for kq */
@@ -129,7 +167,6 @@ struct kqueue {
 #define KQ_DRAIN          0x200  /* kq is draining */
 #define KQ_WAKEUP         0x400  /* kq awakened while processing */
 #define KQ_DYNAMIC        0x800  /* kqueue is dynamically managed */
-#define KQ_NO_WQ_THREAD   0x1000 /* kq will not have workqueue threads dynamically created */
 /*
  * kqfile - definition of a typical kqueue opened as a file descriptor
  *          via the kqueue() system call.
@@ -139,6 +176,7 @@ struct kqueue {
  */
 struct kqfile {
        struct kqueue       kqf_kqueue;     /* common kqueue core */
+       struct kqtailq      kqf_queue;      /* queue of woken up knotes */
        struct kqtailq      kqf_suppressed; /* suppression queue */
        struct selinfo      kqf_sel;        /* parent select/kqueue info */
 };
@@ -149,57 +187,33 @@ struct kqfile {
 #define kqf_level    kqf_kqueue.kq_level
 #define kqf_count    kqf_kqueue.kq_count
 #define kqf_p        kqf_kqueue.kq_p
-#define kqf_queue    kqf_kqueue.kq_queue
 
 #define QOS_INDEX_KQFILE   0          /* number of qos levels in a file kq */
 
-struct kqr_bound {
-       struct kqtailq   kqrb_suppressed;     /* Per-QoS suppression queues */
-       thread_t         kqrb_thread;         /* thread to satisfy request */
-};
-
 /*
  * kqrequest - per-QoS thread request status
  */
 struct kqrequest {
-#if 0
-       union {
-               struct kqr_bound kqru_bound;       /* used when thread is bound */
-               struct workq_threadreq_s kqru_req; /* used when request oustanding */
-       } kqr_u;
-#define kqr_suppressed kqr_u.kqru_bound.kqrb_suppressed
-#define kqr_thread     kqr_u.kqru_bound.kqrb_thread
-#define kqr_req        kqr_u.kqru_req
-#else
-       struct kqr_bound kqr_bound;            /* used when thread is bound */
        struct workq_threadreq_s kqr_req;      /* used when request oustanding */
-#define kqr_suppressed kqr_bound.kqrb_suppressed
-#define kqr_thread     kqr_bound.kqrb_thread
-#endif
-       uint8_t          kqr_state;                    /* KQ/workq interaction state */
-       uint8_t          kqr_wakeup_indexes;           /* QoS/override levels that woke */
-       uint16_t         kqr_dsync_waiters:13,         /* number of dispatch sync waiters */
-                        kqr_dsync_owner_qos:3;        /* Qos override on dispatch sync owner */
-       uint16_t         kqr_sync_suppress_count;      /* number of suppressed sync ipc knotes */
-       kq_index_t       kqr_stayactive_qos:3,         /* max QoS of statyactive knotes */
-                        kqr_owner_override_is_sync:1, /* sync owner has sync ipc override */
-                        kqr_override_index:3,         /* highest wakeup override index */
-                        kqr_has_sync_override:1;      /* Qos/override at UI is sync ipc override */
-
-       /* set under both the kqlock and the filt_wllock */
-       kq_index_t       :0;                           /* prevent bitfields coalescing <rdar://problem/31854115> */
-       kq_index_t       kqr_qos_index:4,              /* QoS for the thread request */
-                        kqr_dsync_waiters_qos:4;      /* override from dispatch sync waiters */
+       struct kqtailq   kqr_suppressed;       /* Per-QoS suppression queues */
+       thread_t         kqr_thread;           /* thread to satisfy request */
+       uint8_t          kqr_state;            /* KQ/workq interaction state */
+#define KQWL_STAYACTIVE_FIRED_BIT     (1 << 0)
+       uint8_t          kqr_wakeup_indexes;   /* QoS/override levels that woke */
+       uint16_t         kqr_dsync_waiters;    /* number of dispatch sync waiters */
+       kq_index_t       kqr_stayactive_qos;   /* max QoS of stayactive knotes */
+       kq_index_t       kqr_override_index;   /* highest wakeup override index */
+       kq_index_t       kqr_qos_index;        /* QoS for the thread request */
 };
 
 
-#define KQR_PROCESSING              0x01       /* requested thread is running the q */
+#define KQR_WORKLOOP                 0x01   /* owner is a workloop */
 #define KQR_THREQUESTED              0x02      /* thread has been requested from workq */
 #define KQR_WAKEUP                   0x04      /* wakeup called during processing */
-#define KQR_BOUND                    0x08       /* servicing thread is bound */
-#define KQR_THOVERCOMMIT             0x20       /* overcommit needed for thread requests */
-#define KQR_DRAIN                    0x40       /* cancel initiated - drain fulfill */
-#define KQR_R2K_NOTIF_ARMED          0x80       /* ast notifications armed */
+#define KQR_THOVERCOMMIT             0x08   /* overcommit needed for thread requests */
+#define KQR_R2K_NOTIF_ARMED          0x10   /* ast notifications armed */
+#define KQR_ALLOCATED_TURNSTILE      0x20   /* kqwl_turnstile is allocated */
+
 /*
  * WorkQ kqueues need to request threads to service the triggered
  * knotes in the queue.  These threads are brought up on a
@@ -213,40 +227,8 @@ struct kqrequest {
 #define KQWQ_QOS_MANAGER (THREAD_QOS_LAST)
 #endif
 
-#if !defined(KQWQ_NQOS)
-#define KQWQ_NQOS    (KQWQ_QOS_MANAGER + 1)
-#endif
-
-/*
- * Workq thread start out a particular effective-requested-QoS, but
- * additional events processed by the filters may represent
- * backlogged events that may themselves have a higher requested-QoS.
- * To represent this, the filter may apply an override to a knote's
- * requested QoS.
- *
- * We further segregate these overridden knotes into different buckets
- * by <requested, override> grouping. This allows easy matching of
- * knotes to process vs. the highest workq thread override applied.
- *
- * Only certain override patterns need to be supported. A knote
- * cannot have an effective-requested-QoS of UNSPECIFIED - because
- * the kevent->qos (when canonicalized) will always be above that
- * or indicate manager.  And we don't allow an override to specify
- * manager.  This results in the following buckets being needed:
- *
- *                  Effective-Requested QoS
- *           MAINT  BG    UTIL  DEFAULT UINIT UINTER MANAGER
- * override:
- * MAINT      0
- * BG         1      6
- * UTILITY    2      7     11
- * DEFAULT    3      8     12    15
- * UINIT      4      9     13    16     18
- * UINTER     5     10     14    17     19     20
- *                                                    21
- */
 #if !defined(KQWQ_NBUCKETS)
-#define KQWQ_NBUCKETS 22
+#define KQWQ_NBUCKETS    (KQWQ_QOS_MANAGER + 1)
 #endif
 
 /*
@@ -259,9 +241,8 @@ struct kqrequest {
  */
 struct kqworkq {
        struct kqueue    kqwq_kqueue;
-       struct kqtailq   kqwq_queuecont[KQWQ_NBUCKETS-1]; /* continue array of queues */
-       struct kqrequest kqwq_request[KQWQ_NQOS];         /* per-QoS request states */
-       lck_spin_t       kqwq_reqlock;                    /* kqueue request lock */
+       struct kqtailq   kqwq_queue[KQWQ_NBUCKETS];       /* array of queues */
+       struct kqrequest kqwq_request[KQWQ_NBUCKETS];     /* per-QoS request states */
 };
 
 #define kqwq_wqs     kqwq_kqueue.kq_wqs
@@ -270,13 +251,6 @@ struct kqworkq {
 #define kqwq_level   kqwq_kqueue.kq_level
 #define kqwq_count   kqwq_kqueue.kq_count
 #define kqwq_p       kqwq_kqueue.kq_p
-#define kqwq_queue   kqwq_kqueue.kq_queue
-
-#define kqwq_req_lock(kqwq)    lck_spin_lock(&kqwq->kqwq_reqlock)
-#define kqwq_req_unlock(kqwq)  lck_spin_unlock(&kqwq->kqwq_reqlock)
-#define kqwq_req_held(kqwq)    LCK_SPIN_ASSERT(&kqwq->kqwq_reqlock, LCK_ASSERT_OWNED)
-
-#define KQWQ_THMANAGER    0x10      /* expect manager thread to run the queue */
 
 /*
  * WorkLoop kqueues need to request a thread to service the triggered
@@ -319,16 +293,49 @@ struct kqworkq {
  */
 struct kqworkloop {
        struct kqueue    kqwl_kqueue;                     /* queue of events */
-       struct kqtailq   kqwl_queuecont[KQWL_NBUCKETS-1]; /* continue array of queues */
+       struct kqtailq   kqwl_queue[KQWL_NBUCKETS];       /* array of queues */
        struct kqrequest kqwl_request;                    /* thread request state */
-       lck_spin_t       kqwl_reqlock;                    /* kqueue request lock */
        lck_mtx_t        kqwl_statelock;                  /* state/debounce lock */
        thread_t         kqwl_owner;                      /* current [sync] owner thread */
        uint32_t         kqwl_retains;                    /* retain references */
        kqueue_id_t      kqwl_dynamicid;                  /* dynamic identity */
+       uint64_t         kqwl_params;                     /* additional parameters */
+       struct turnstile *kqwl_turnstile;                 /* turnstile for sync IPC/waiters */
        SLIST_ENTRY(kqworkloop) kqwl_hashlink;            /* linkage for search list */
+#if CONFIG_WORKLOOP_DEBUG
+#define KQWL_HISTORY_COUNT 32
+#define KQWL_HISTORY_WRITE_ENTRY(kqwl, ...) ({ \
+               struct kqworkloop *__kqwl = (kqwl); \
+               unsigned int __index = os_atomic_inc_orig(&__kqwl->kqwl_index, relaxed); \
+               __kqwl->kqwl_history[__index % KQWL_HISTORY_COUNT] = \
+                               (struct kqwl_history)__VA_ARGS__; \
+       })
+       struct kqwl_history {
+               thread_t updater;  /* Note: updates can be reordered */
+               thread_t servicer;
+               thread_t old_owner;
+               thread_t new_owner;
+
+               uint64_t kev_ident;
+               int16_t  error;
+               uint16_t kev_flags;
+               uint32_t kev_fflags;
+
+               uint64_t kev_mask;
+               uint64_t kev_value;
+               uint64_t in_value;
+       } kqwl_history[KQWL_HISTORY_COUNT];
+       unsigned int kqwl_index;
+#endif // CONFIG_WORKLOOP_DEBUG
 };
 
+typedef union {
+       struct kqueue       *kq;
+       struct kqworkq      *kqwq;
+       struct kqfile       *kqf;
+       struct kqworkloop   *kqwl;
+} __attribute__((transparent_union)) kqueue_t;
+
 SLIST_HEAD(kqlist, kqworkloop);
 
 #define kqwl_wqs     kqwl_kqueue.kq_wqs
@@ -337,30 +344,39 @@ SLIST_HEAD(kqlist, kqworkloop);
 #define kqwl_level   kqwl_kqueue.kq_level
 #define kqwl_count   kqwl_kqueue.kq_count
 #define kqwl_p       kqwl_kqueue.kq_p
-#define kqwl_queue   kqwl_kqueue.kq_queue
-
-#define kqwl_req_lock(kqwl)    lck_spin_lock(&kqwl->kqwl_reqlock)
-#define kqwl_req_unlock(kqwl)  lck_spin_unlock(&kqwl->kqwl_reqlock)
-#define kqwl_req_held(kqwl)    LCK_SPIN_ASSERT(&kqwl->kqwl_reqlock, LCK_ASSERT_OWNED)
 
 #define KQ_WORKLOOP_RETAINS_MAX UINT32_MAX
 
-extern int workloop_fulfill_threadreq(struct proc *p, workq_threadreq_t req, thread_t thread, int flags);
+extern void kqueue_threadreq_unbind(struct proc *p, struct kqrequest *kqr);
+
+// called with the kq req held
+#define KQUEUE_THREADERQ_BIND_NO_INHERITOR_UPDATE 0x1
+extern void kqueue_threadreq_bind(struct proc *p, workq_threadreq_t req,
+               thread_t thread, unsigned int flags);
+
+// called with the wq lock held
+extern void kqueue_threadreq_bind_prepost(struct proc *p, workq_threadreq_t req, thread_t thread);
+
+// called with no lock held
+extern void kqueue_threadreq_bind_commit(struct proc *p, thread_t thread);
+
+extern void kqueue_threadreq_cancel(struct proc *p, workq_threadreq_t req);
+
+// lock not held as kqwl_params is immutable after creation
+extern workq_threadreq_param_t kqueue_threadreq_workloop_param(workq_threadreq_t req);
 
 extern struct kqueue *kqueue_alloc(struct proc *, unsigned int);
 extern void kqueue_dealloc(struct kqueue *);
 
 extern void knotes_dealloc(struct proc *);
+extern void kqworkloops_dealloc(struct proc *);
 
-extern void kevent_register(struct kqueue *, struct kevent_internal_s *, struct proc *);
+extern int kevent_register(struct kqueue *, struct kevent_internal_s *,
+               struct knote_lock_ctx *);
 extern int kqueue_scan(struct kqueue *, kevent_callback_t, kqueue_continue_t,
-                      void *, struct filt_process_s *, struct timeval *, struct proc *);
+               void *, struct filt_process_s *, struct timeval *, struct proc *);
 extern int kqueue_stat(struct kqueue *, void *, int, proc_t);
 
 #endif /* XNU_KERNEL_PRIVATE */
 
 #endif /* !_SYS_EVENTVAR_H_ */
-
-
-
-
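The kqueue_threadreq_bind_prepost()/bind_commit() pair declared above implies a two-phase handoff; the sketch below is only an inference from the lock annotations in the comments, with wq_lock()/wq_unlock() standing in as hypothetical placeholders.

    /* Hedged sketch: prepost the bind while the workqueue lock is held, then
     * commit once that lock has been dropped, per the comments above. */
    static void
    handoff_thread_to_kqueue(struct proc *p, workq_threadreq_t req, thread_t th)
    {
            wq_lock(p);                                  /* hypothetical */
            kqueue_threadreq_bind_prepost(p, req, th);   /* wq lock held */
            wq_unlock(p);                                /* hypothetical */

            kqueue_threadreq_bind_commit(p, th);         /* no lock held */
    }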
index 1ca389cb6eb1d8b5f1e6c03fd25827c2d047446a..863e6037eb2b591b45c0d2c9e79df934e2de206b 100644 (file)
@@ -190,11 +190,6 @@ extern fasttrap_hash_t             fasttrap_tpoints;
 #define        FASTTRAP_TPOINTS_INDEX(pid, pc) \
        (((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask)
 
-
-#ifdef CONFIG_EMBEDDED
-#define FASTTRAP_ASYNC_REMOVE
-#endif
-
 extern void fasttrap_tracepoint_retire(proc_t *p, fasttrap_tracepoint_t *tp);
 
 /*
index fdda0b16148917494b859244bb046e644e9c352d..a6411a57f727d9a908befefe30c8f4ee61c390c3 100644 (file)
@@ -69,7 +69,5 @@ extern int fbt_enable (void *arg, dtrace_id_t id, void *parg);
 extern int fbt_module_excluded(struct modctl*);
 extern int fbt_excluded(const char *);
 
-extern void fbt_provide_probe(struct modctl *ctl, uintptr_t instr_low, uintptr_t instr_high, char *modname, char* symbol_name, machine_inst_t* symbol_start);
-
-extern void fbt_provide_module_kernel_syms(struct modctl *ctl);
+extern void fbt_provide_probe(struct modctl *ctl, const char *modname, const char *name, machine_inst_t *instr, machine_inst_t *limit);
 #endif /* _FBT_H */
index b79440b48e2066626b3293c53bf03104c42b104d..16e33533aee0757efbd01808abec2f200f2eee17 100644 (file)
@@ -118,6 +118,9 @@ struct filedesc {
                                 /* if we're force unmounted and unable to */
                                 /* take a vnode_ref on fd_rdir during a fork */
 
+#define FD_WORKLOOP    0x02    /* process has created a kqworkloop that */
+                                /* requires manual cleanup on exit */
+
 /*
  * Per-process open flags.
  */
index 8a3624d3b7329d77e91675dd740f3c523b6c402b..eafcb9b45307463993a8f0d2d53ab340ccddc073 100644 (file)
@@ -251,6 +251,13 @@ typedef struct disk_conditioner_info {
   uint64_t read_throughput_mbps; // maximum throughput for reads
   uint64_t write_throughput_mbps; // maximum throughput for writes
   int is_ssd; // behave like an SSD
+
+  /* revision 2 */
+  uint32_t ioqueue_depth;
+  uint32_t maxreadcnt;
+  uint32_t maxwritecnt;
+  uint32_t segreadcnt;
+  uint32_t segwritecnt;
 } disk_conditioner_info;
 
 #define        FSCTL_SYNC_FULLSYNC     (1<<0)  /* Flush the data fully to disk, if supported by the filesystem */
@@ -328,6 +335,14 @@ typedef struct disk_conditioner_info {
 #define SPOTLIGHT_IOC_GET_LAST_MTIME             _IOR('h', 19, u_int32_t)
 #define SPOTLIGHT_FSCTL_GET_LAST_MTIME           IOCBASECMD(SPOTLIGHT_IOC_GET_LAST_MTIME)
 
+/* Mark file's extents as "frozen" because someone has references to physical address */
+#define FSIOC_FREEZE_EXTENTS                           _IO('h', 20)
+#define FSCTL_FREEZE_EXTENTS                           IOCBASECMD(FSIOC_FREEZE_EXTENTS)
+
+/* Clear the "frozen" status of file's extents */
+#define FSIOC_THAW_EXTENTS                             _IO('h', 21)
+#define FSCTL_THAW_EXTENTS                             IOCBASECMD(FSIOC_THAW_EXTENTS)
+
 #ifndef KERNEL
 
 #include <sys/cdefs.h>
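The new FSIOC_FREEZE_EXTENTS/FSIOC_THAW_EXTENTS requests carry no payload (both use _IO), so a user-space caller would issue them through the ordinary fsctl(2) interface; the sketch below is illustrative only.

    /* Hedged sketch: pin a file's extents while their physical addresses are
     * referenced, then release the pin.  Minimal error handling. */
    #include <sys/fsctl.h>
    #include <stdio.h>

    static int
    with_frozen_extents(const char *path)
    {
            if (fsctl(path, FSIOC_FREEZE_EXTENTS, NULL, 0) != 0) {
                    perror("FSIOC_FREEZE_EXTENTS");
                    return -1;
            }
            /* ... work with the file's physical extents here ... */
            if (fsctl(path, FSIOC_THAW_EXTENTS, NULL, 0) != 0) {
                    perror("FSIOC_THAW_EXTENTS");
                    return -1;
            }
            return 0;
    }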
index ae1ec05f983300d583071e864ec0e2135a5e8f28..f445d4fd64506e957571f8758e629e0d26384dea 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -150,8 +150,6 @@ enum guard_vn_exception_codes {
        kGUARD_EXC_EXCHDATA     = VNG_EXCHDATA,
 };
 
-#if defined(KERNEL)
-
 /* Guard violation behaviors: not all combinations make sense */
 
 #define kVNG_POLICY_LOGMSG     (1u << 0)
@@ -159,7 +157,9 @@ enum guard_vn_exception_codes {
 #define kVNG_POLICY_EXC                (1u << 2)
 #define kVNG_POLICY_EXC_CORPSE (1u << 3)
 #define kVNG_POLICY_SIGKILL    (1u << 4)
+#define kVNG_POLICY_UPRINTMSG  (1u << 5)
 
+#if defined(KERNEL)
 extern int vnguard_exceptions_active(void);
 extern void vnguard_policy_init(void);
 #endif /* KERNEL */
index c1fbde2526a6d646ed6248a302b01278177b2527..80344fce5a2123b882c0a636b4053a4a09f5edd0 100644 (file)
@@ -127,16 +127,18 @@ struct image_params {
 /*
  * Image flags
  */
-#define        IMGPF_NONE              0x00000000      /* No flags */
-#define        IMGPF_INTERPRET         0x00000001      /* Interpreter invoked */
-#define        IMGPF_RESERVED          0x00000002
-#define        IMGPF_WAS_64BIT         0x00000004      /* exec from a 64Bit binary */
-#define        IMGPF_IS_64BIT          0x00000008      /* exec to a 64Bit binary */
-#define        IMGPF_SPAWN             0x00000010      /* spawn (without setexec) */
-#define        IMGPF_DISABLE_ASLR      0x00000020      /* disable ASLR */
+#define        IMGPF_NONE                              0x00000000      /* No flags */
+#define        IMGPF_INTERPRET                 0x00000001      /* Interpreter invoked */
+#define        IMGPF_RESERVED                  0x00000002
+#define        IMGPF_WAS_64BIT_ADDR    0x00000004      /* exec from a 64Bit address space */
+#define        IMGPF_IS_64BIT_ADDR             0x00000008      /* exec to a 64Bit address space */
+#define        IMGPF_SPAWN                             0x00000010      /* spawn (without setexec) */
+#define        IMGPF_DISABLE_ASLR              0x00000020      /* disable ASLR */
 #define        IMGPF_ALLOW_DATA_EXEC   0x00000040      /* forcibly disallow data execution */
-#define        IMGPF_VFORK_EXEC        0x00000080      /* vfork followed by exec */
-#define        IMGPF_EXEC              0x00000100      /* exec */
+#define        IMGPF_VFORK_EXEC                0x00000080      /* vfork followed by exec */
+#define        IMGPF_EXEC                              0x00000100      /* exec */
 #define        IMGPF_HIGH_BITS_ASLR    0x00000200      /* randomize high bits of ASLR slide */
+#define        IMGPF_IS_64BIT_DATA             0x00000400      /* exec to a 64Bit register state */
+
 
 #endif /* !_SYS_IMGACT */
index 6d46a5afba720fd508851067ae9f7f20215ae06a..48ae2b3f29bd0ef77a14a2ba1c56061d0a881e7f 100644 (file)
@@ -582,6 +582,7 @@ __END_DECLS
 #define KAUTH_FILEOP_LINK                      5
 #define KAUTH_FILEOP_EXEC                      6
 #define KAUTH_FILEOP_DELETE                    7
+#define        KAUTH_FILEOP_WILL_RENAME                8
 
 /*
  * arguments passed to KAUTH_FILEOP_OPEN listeners
@@ -591,6 +592,10 @@ __END_DECLS
  *             arg0 is pointer to vnode (vnode *) for file to be closed.
  *             arg1 is pointer to path (char *) of file to be closed.
  *             arg2 is close flags.
+ * arguments passed to KAUTH_FILEOP_WILL_RENAME listeners
+ *             arg0 is pointer to vnode (vnode *) of the file being renamed
+ *             arg1 is pointer to the "from" path (char *)
+ *             arg2 is pointer to the "to" path (char *)
  * arguments passed to KAUTH_FILEOP_RENAME listeners
  *             arg0 is pointer to "from" path (char *).
  *             arg1 is pointer to "to" path (char *).
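A hedged sketch of a kauth fileop listener that reacts to the new KAUTH_FILEOP_WILL_RENAME action, assuming the standard kauth_listen_scope() KPI; only the action and the argument meanings documented above come from this diff, the rest is illustrative.

    /* Hedged sketch (kext context): log rename intents before they happen. */
    static int
    fileop_listener(kauth_cred_t cred, void *idata, kauth_action_t action,
        uintptr_t arg0, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3)
    {
            (void)cred; (void)idata; (void)arg0; (void)arg3;
            if (action == KAUTH_FILEOP_WILL_RENAME) {
                    printf("will rename: %s -> %s\n",
                        (const char *)arg1, (const char *)arg2);
            }
            return KAUTH_RESULT_DEFER;   /* fileop listeners are advisory */
    }

    /* Registration, typically from a kext start routine (illustrative): */
    /* listener = kauth_listen_scope(KAUTH_SCOPE_FILEOP, fileop_listener, NULL); */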
index 132698775f0280611b5b9b95ce4d163bccfc873b..7d5f89cf8cb8f00d6886285ed110c8e97d8ab7cd 100644 (file)
@@ -47,10 +47,6 @@ __BEGIN_DECLS
 #include <Availability.h>
 #endif
 
-#ifdef XNU_KERNEL_PRIVATE
-#include <mach/branch_predicates.h> /* __improbable */
-#endif
-
 /*
  * Kdebug is a facility for tracing events occurring on a system.
  *
@@ -190,7 +186,7 @@ extern void kernel_debug_enter(
 #define DBG_DRIVERS     6
 #define DBG_TRACE       7
 #define DBG_DLIL        8
-#define DBG_WORKQUEUE   9
+#define DBG_PTHREAD     9
 #define DBG_CORESTORAGE 10
 #define DBG_CG          11
 #define DBG_MONOTONIC   12
@@ -211,6 +207,7 @@ extern void kernel_debug_enter(
 #define DBG_DISPATCH    46
 #define DBG_IMG         49
 #define DBG_UMALLOC     51
+#define DBG_TURNSTILE   53
 
 
 #define DBG_MIG         255
@@ -389,12 +386,14 @@ extern void kdebug_reset(void);
 #define DBG_MACH_ZALLOC         0xA5 /* Zone allocator */
 #define DBG_MACH_THREAD_GROUP   0xA6 /* Thread groups */
 #define DBG_MACH_COALITION      0xA7 /* Coalitions */
+#define DBG_MACH_SHAREDREGION   0xA8 /* Shared region */
 
 /* Interrupt type bits for DBG_MACH_EXCP_INTR */
 #define DBG_INTR_TYPE_UNKNOWN   0x0     /* default/unknown interrupt */
 #define DBG_INTR_TYPE_IPI       0x1     /* interprocessor interrupt */
 #define DBG_INTR_TYPE_TIMER     0x2     /* timer interrupt */
 #define DBG_INTR_TYPE_OTHER     0x3     /* other (usually external) interrupt */
+#define DBG_INTR_TYPE_PMI       0x4     /* performance monitor interrupt */
 
 /* Codes for Scheduler (DBG_MACH_SCHED) */
 #define MACH_SCHED              0x0     /* Scheduler */
@@ -404,8 +403,8 @@ extern void kdebug_reset(void);
 #define MACH_CALLOUT            0x4     /* callouts */
 #define MACH_STACK_DETACH       0x5
 #define MACH_MAKE_RUNNABLE      0x6     /* make thread runnable */
-#define        MACH_PROMOTE            0x7     /* promoted due to resource */
-#define        MACH_DEMOTE             0x8     /* promotion undone */
+#define MACH_PROMOTE            0x7     /* promoted due to resource (replaced by MACH_PROMOTED) */
+#define MACH_DEMOTE             0x8     /* promotion undone (replaced by MACH_UNPROMOTED) */
 #define MACH_IDLE               0x9    /* processor idling */
 #define MACH_STACK_DEPTH        0xa    /* stack depth at switch */
 #define MACH_MOVED              0xb    /* did not use original scheduling decision */
@@ -447,6 +446,11 @@ extern void kdebug_reset(void);
 #define MACH_EXEC_DEMOTE           0x31 /* Thread demoted from exec boost */
 #define MACH_AMP_SIGNAL_SPILL      0x32 /* AMP spill signal sent to cpuid */
 #define MACH_AMP_STEAL             0x33 /* AMP thread stolen or spilled */
+#define MACH_SCHED_LOAD_EFFECTIVE  0x34 /* Effective scheduler load */
+#define        MACH_PROMOTED              0x35 /* thread promoted due to mutex priority promotion */
+#define        MACH_UNPROMOTED            0x36 /* thread unpromoted due to mutex priority promotion */
+#define        MACH_PROMOTED_UPDATE       0x37 /* thread already promoted, but promotion priority changed */
+#define        MACH_QUIESCENT_COUNTER     0x38 /* quiescent counter tick */
 
 /* Variants for MACH_MULTIQ_DEQUEUE */
 #define MACH_MULTIQ_BOUND     1
@@ -478,6 +482,7 @@ extern void kdebug_reset(void);
 #define MACH_IPC_VOUCHER_DESTROY               0x9     /* Voucher removed from global voucher hashtable */
 #define MACH_IPC_KMSG_INFO                     0xa     /* Send/Receive info for a kmsg */
 #define MACH_IPC_KMSG_LINK                     0xb     /* link a kernel kmsg pointer to user mach_msg_header_t */
+#define MACH_IPC_PORT_ENTRY_MODIFY     0xc     /* A port space gained or lost a port right (reference) */
 
 /* Codes for thread groups (DBG_MACH_THREAD_GROUP) */
 #define MACH_THREAD_GROUP_NEW           0x0
@@ -513,6 +518,9 @@ extern void kdebug_reset(void);
 #define PMAP__FLUSH_TLBS_TO    0xf
 #define PMAP__FLUSH_EPT        0x10
 #define PMAP__FAST_FAULT       0x11
+#define PMAP__SWITCH           0x12
+#define PMAP__TTE              0x13
+#define PMAP__SWITCH_USER_TTB  0x14
 
 /* Codes for clock (DBG_MACH_CLOCK) */
 #define        MACH_EPOCH_CHANGE       0x0     /* wake epoch change */
@@ -661,6 +669,7 @@ extern void kdebug_reset(void);
 #define DBG_DRVSSM           24 /* System State Manager(AppleSSM) */
 #define DBG_DRVSMC           25 /* System Management Controller */
 #define DBG_DRVMACEFIMANAGER 26 /* Mac EFI Manager */
+#define DBG_DRVANE           27 /* ANE */
 
 /* Backwards compatibility */
 #define        DBG_DRVPOINTING         DBG_DRVHID      /* OBSOLETE: Use DBG_DRVHID instead */
@@ -674,7 +683,11 @@ extern void kdebug_reset(void);
 #define DBG_DLIL_IF_FLT 5       /* DLIL Interface FIlter */
 
 
-/* The Kernel Debug Sub Classes for File System (DBG_FSYSTEM) */
+/*
+ * The Kernel Debug Sub Classes for File System (DBG_FSYSTEM)
+ *
+ * Please NOTE: sub class values 0xC and 0xD are currently unused.
+ */
 #define DBG_FSRW      0x1     /* reads and writes to the filesystem */
 #define DBG_DKRW      0x2     /* reads and writes to the disk */
 #define DBG_FSVN      0x3     /* vnode operations (inc. locking/unlocking) */
@@ -690,6 +703,7 @@ extern void kdebug_reset(void);
 #define DBG_MSDOS     0xF     /* FAT-specific events; see the msdosfs project */
 #define DBG_ACFS      0x10    /* Xsan-specific events; see the XsanFS project */
 #define DBG_THROTTLE  0x11    /* I/O Throttling events */
+#define DBG_DECMP     0x12    /* Decmpfs-specific events */
 #define DBG_CONTENT_PROT 0xCF /* Content Protection Events: see bsd/sys/cprotect.h */
 
 /*
@@ -736,7 +750,9 @@ extern void kdebug_reset(void);
 #ifdef  PRIVATE
 #define BSD_MEMSTAT_GRP_SET_PROP    12  /* set group properties */
 #define BSD_MEMSTAT_DO_KILL         13  /* memorystatus kills */
+#define BSD_MEMSTAT_CHANGE_PRIORITY 14  /* priority changed */
 #endif /* PRIVATE */
+#define BSD_MEMSTAT_FAST_JETSAM     15  /* Aggressive jetsam ("clear-the-deck") */
 
 /* Codes for BSD subcode class DBG_BSD_KEVENT */
 #define BSD_KEVENT_KQ_PROCESS_BEGIN   1
@@ -760,6 +776,7 @@ extern void kdebug_reset(void);
 #define BSD_KEVENT_KQWL_BIND          19
 #define BSD_KEVENT_KQWL_UNBIND        20
 #define BSD_KEVENT_KNOTE_ENABLE       21
+#define BSD_KEVENT_KNOTE_VANISHED     22
 
 /* The Kernel Debug Sub Classes for DBG_TRACE */
 #define DBG_TRACE_DATA      0
@@ -795,12 +812,14 @@ extern void kdebug_reset(void);
 
 /* The Kernel Debug Sub Classes for DBG_MONOTONIC */
 #define DBG_MT_INSTRS_CYCLES 1
+#define DBG_MT_DEBUG 2
 #define DBG_MT_TMPTH 0xfe
 #define DBG_MT_TMPCPU 0xff
 
 /* The Kernel Debug Sub Classes for DBG_MISC */
-#define DBG_EVENT      0x10
-#define        DBG_BUFFER      0x20
+#define DBG_EVENT       0x10
+#define DBG_MISC_LAYOUT 0x1a
+#define DBG_BUFFER      0x20
 
 /* The Kernel Debug Sub Classes for DBG_DYLD */
 #define DBG_DYLD_UUID (5)
@@ -841,7 +860,9 @@ extern void kdebug_reset(void);
 #define DBG_APP_SYSTEMUI        0x05
 #define DBG_APP_SIGNPOST        0x0A
 #define DBG_APP_APPKIT          0x0C
+#define DBG_APP_UIKIT           0x0D
 #define DBG_APP_DFR             0x0E
+#define DBG_APP_LAYOUT          0x0F
 #define DBG_APP_SAMBA           0x80
 #define DBG_APP_EOSSUPPORT      0x81
 #define DBG_APP_MACEFIMANAGER   0x82
@@ -898,6 +919,32 @@ extern void kdebug_reset(void);
 #define IMP_SYNC_IPC_QOS_OVERFLOW               0x2
 #define IMP_SYNC_IPC_QOS_UNDERFLOW              0x3
 
+/* Subclasses for Turnstiles (DBG_TURNSTILE) */
+#define TURNSTILE_HEAP_OPERATIONS               0x10
+#define TURNSTILE_PRIORITY_OPERATIONS           0x20
+#define TURNSTILE_FREELIST_OPERATIONS           0x30
+
+/* Codes for TURNSTILE_HEAP_OPERATIONS */
+#define THREAD_ADDED_TO_TURNSTILE_WAITQ         0x1
+#define THREAD_REMOVED_FROM_TURNSTILE_WAITQ     0x2
+#define THREAD_MOVED_IN_TURNSTILE_WAITQ         0x3
+#define TURNSTILE_ADDED_TO_TURNSTILE_HEAP       0x4
+#define TURNSTILE_REMOVED_FROM_TURNSTILE_HEAP   0x5
+#define TURNSTILE_MOVED_IN_TURNSTILE_HEAP       0x6
+#define TURNSTILE_ADDED_TO_THREAD_HEAP          0x7
+#define TURNSTILE_REMOVED_FROM_THREAD_HEAP      0x8
+#define TURNSTILE_MOVED_IN_THREAD_HEAP          0x9
+#define TURNSTILE_UPDATE_STOPPED_BY_LIMIT       0xa
+#define THREAD_NOT_WAITING_ON_TURNSTILE         0xb
+
+/* Codes for TURNSTILE_PRIORITY_OPERATIONS */
+#define TURNSTILE_PRIORITY_CHANGE               0x1
+#define THREAD_USER_PROMOTION_CHANGE            0x2
+
+/* Codes for TURNSTILE_FREELIST_OPERATIONS */
+#define TURNSTILE_PREPARE                       0x1
+#define TURNSTILE_COMPLETE                      0x2
+
 /* Subclasses for MACH Bank Voucher Attribute Manager (DBG_BANK) */
 #define BANK_ACCOUNT_INFO              0x10    /* Trace points related to bank account struct */
 #define BANK_TASK_INFO                 0x11    /* Trace points related to bank task struct */
@@ -968,6 +1015,7 @@ extern void kdebug_reset(void);
 #define IMPORTANCE_CODE(SubClass, code) KDBG_CODE(DBG_IMPORTANCE, (SubClass), (code))
 #define BANK_CODE(SubClass, code) KDBG_CODE(DBG_BANK, (SubClass), (code))
 #define ATM_CODE(SubClass, code) KDBG_CODE(DBG_ATM, (SubClass), (code))
+#define TURNSTILE_CODE(SubClass, code) KDBG_CODE(DBG_TURNSTILE, (SubClass), (code))
 
 /* Kernel Debug Macros for specific daemons */
 #define COREDUETDBG_CODE(code) DAEMONDBG_CODE(DBG_DAEMON_COREDUET, code)
@@ -1002,16 +1050,23 @@ extern void kdebug_reset(void);
  */
 
 /*
- * Traced on debug and development (and release OS X) kernels.
+ * Traced on debug and development (and release macOS) kernels.
  */
 #define KDBG(x, ...) KDBG_(, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
 
 /*
- * Traced on debug and development (and release OS X) kernels if explicitly
+ * Traced on debug and development (and release macOS) kernels if explicitly
  * requested.  Omitted from tracing without a typefilter.
  */
 #define KDBG_FILTERED(x, ...) KDBG_(_FILTERED, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
 
+/*
+ * Traced on debug and development (and release macOS) kernels, even if the
+ * process filter would reject it.
+ */
+#define KDBG_RELEASE_NOPROCFILT(x, ...) \
+               KDBG_(_RELEASE_NOPROCFILT, x, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
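As an illustration of how the new KDBG_RELEASE_NOPROCFILT() variant composes with the TURNSTILE_CODE() macro added earlier in this diff, here is a hedged sketch; the turnstile pointer is a placeholder.

    /* Hedged sketch: emit a turnstile freelist tracepoint that bypasses the
     * process filter, bracketing an operation with START/END qualifiers. */
    KDBG_RELEASE_NOPROCFILT(TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS,
        TURNSTILE_PREPARE) | DBG_FUNC_START, (uintptr_t)ts /* placeholder */);
    /* ... prepare the turnstile ... */
    KDBG_RELEASE_NOPROCFILT(TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS,
        TURNSTILE_PREPARE) | DBG_FUNC_END);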
 /*
  * Traced on debug, development, and release kernels.
  *
@@ -1096,17 +1151,31 @@ extern unsigned int kdebug_enable;
  * tracing without a typefilter.
  */
 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
-#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...)             \
-       do {                                                               \
-               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {     \
-                       kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b), \
-                               (uintptr_t)(c), (uintptr_t)(d));                       \
-               }                                                              \
+#define KERNEL_DEBUG_CONSTANT_FILTERED(x, a, b, c, d, ...)           \
+       do {                                                             \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {   \
+                       kernel_debug_filtered((x), (uintptr_t)(a), (uintptr_t)(b),  \
+                               (uintptr_t)(c), (uintptr_t)(d)); \
+               }                                                            \
        } while (0)
 #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
 #define KERNEL_DEBUG_CONSTANT_FILTERED(type, x, a, b, c, d, ...) do {} while (0)
 #endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD) */
 
+#if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
+#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...)   \
+       do {                                                               \
+               if (KDBG_IMPROBABLE(kdebug_enable & ~KDEBUG_ENABLE_PPT)) {     \
+                       kernel_debug_flags((x), (uintptr_t)(a), (uintptr_t)(b),    \
+                               (uintptr_t)(c), (uintptr_t)(d), KDBG_FLAG_NOPROCFILT); \
+               }                                                              \
+       } while (0)
+#else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+#define KERNEL_DEBUG_CONSTANT_RELEASE_NOPROCFILT(x, a, b, c, d, ...) \
+       do { } while (0)
+#endif /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
+
+
 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD)
 #define KERNEL_DEBUG_CONSTANT(x, a, b, c, d, e)                               \
        do {                                                                      \
@@ -1227,6 +1296,17 @@ extern void kernel_debug1(
                uintptr_t arg4,
                uintptr_t arg5);
 
+#define KDBG_FLAG_FILTERED 0x01
+#define KDBG_FLAG_NOPROCFILT 0x02
+
+extern void kernel_debug_flags(
+               uint32_t  debugid,
+               uintptr_t arg1,
+               uintptr_t arg2,
+               uintptr_t arg3,
+               uintptr_t arg4,
+               uint64_t flags);
+
 extern void kernel_debug_filtered(
                uint32_t  debugid,
                uintptr_t arg1,
@@ -1398,7 +1478,15 @@ boolean_t kdebug_debugid_enabled(uint32_t debugid);
 boolean_t kdebug_debugid_explicitly_enabled(uint32_t debugid);
 
 uint32_t kdebug_commpage_state(void);
-void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t lookup);
+
+#define KDBG_VFS_LOOKUP_FLAG_LOOKUP 0x01
+#define KDBG_VFS_LOOKUP_FLAG_NOPROCFILT 0x02
+void kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp,
+               uint32_t flags);
+
+void kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp,
+               boolean_t lookup);
+
 void kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid);
 
 void kdbg_trace_string(struct proc *proc, long *arg1, long *arg2, long *arg3, long *arg4);
@@ -1409,8 +1497,6 @@ void kdebug_trace_start(unsigned int n_events, const char *filterdesc,
                boolean_t wrapping, boolean_t at_wake);
 void kdebug_free_early_buf(void);
 struct task;
-boolean_t disable_wrap(uint32_t *old_slowcheck, uint32_t *old_flags);
-void enable_wrap(uint32_t old_slowcheck, boolean_t lostevents);
 void release_storage_unit(int cpu,  uint32_t storage_unit);
 int allocate_storage_unit(int cpu);
 
@@ -1427,78 +1513,92 @@ __END_DECLS
  * private kernel_debug definitions
  */
 
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__arm64__)
+typedef uint64_t kd_buf_argtype;
+#else
+typedef uintptr_t kd_buf_argtype;
+#endif
+
 typedef struct {
        uint64_t timestamp;
-       uintptr_t arg1;
-       uintptr_t arg2;
-       uintptr_t arg3;
-       uintptr_t arg4;
-       uintptr_t arg5; /* the thread ID */
+       kd_buf_argtype arg1;
+       kd_buf_argtype arg2;
+       kd_buf_argtype arg3;
+       kd_buf_argtype arg4;
+       kd_buf_argtype arg5; /* the thread ID */
        uint32_t debugid;
-#if defined(__LP64__)
+/*
+ * Ensure that both LP32 and LP64 variants of arm64 use the same kd_buf
+ * structure.
+ */
+#if defined(__LP64__) || defined(__arm64__)
        uint32_t cpuid;
-       uintptr_t unused;
+       kd_buf_argtype unused;
 #endif
 } kd_buf;
 
-#if !defined(__LP64__)
-#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL
-#define KDBG_CPU_MASK       0xff00000000000000ULL
-#define KDBG_CPU_SHIFT      56
+#if defined(__LP64__) || defined(__arm64__)
+#define KDBG_TIMESTAMP_MASK            0xffffffffffffffffULL
 static inline void
 kdbg_set_cpu(kd_buf *kp, int cpu)
 {
-       kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) |
-                               (((uint64_t) cpu) << KDBG_CPU_SHIFT);
+       kp->cpuid = (unsigned int)cpu;
 }
 static inline int
 kdbg_get_cpu(kd_buf *kp)
 {
-       return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT);
+       return (int)kp->cpuid;
 }
 static inline void
 kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
 {
-       kp->timestamp = thetime & KDBG_TIMESTAMP_MASK;
+       kp->timestamp = thetime;
 }
 static inline uint64_t
 kdbg_get_timestamp(kd_buf *kp)
 {
-       return kp->timestamp & KDBG_TIMESTAMP_MASK;
+       return kp->timestamp;
 }
 static inline void
 kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
 {
-       kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) |
-                               (((uint64_t) cpu) << KDBG_CPU_SHIFT);
+       kdbg_set_timestamp(kp, thetime);
+       kdbg_set_cpu(kp, cpu);
 }
 #else
-#define KDBG_TIMESTAMP_MASK            0xffffffffffffffffULL
+#define KDBG_TIMESTAMP_MASK 0x00ffffffffffffffULL
+#define KDBG_CPU_MASK       0xff00000000000000ULL
+#define KDBG_CPU_SHIFT      56
 static inline void
 kdbg_set_cpu(kd_buf *kp, int cpu)
 {
-       kp->cpuid = (unsigned int)cpu;
+       kp->timestamp = (kp->timestamp & KDBG_TIMESTAMP_MASK) |
+                               (((uint64_t) cpu) << KDBG_CPU_SHIFT);
 }
 static inline int
 kdbg_get_cpu(kd_buf *kp)
 {
-       return (int)kp->cpuid;
+       return (int) (((kp)->timestamp & KDBG_CPU_MASK) >> KDBG_CPU_SHIFT);
 }
 static inline void
 kdbg_set_timestamp(kd_buf *kp, uint64_t thetime)
 {
-       kp->timestamp = thetime;
+       kp->timestamp = thetime & KDBG_TIMESTAMP_MASK;
 }
 static inline uint64_t
 kdbg_get_timestamp(kd_buf *kp)
 {
-       return kp->timestamp;
+       return kp->timestamp & KDBG_TIMESTAMP_MASK;
 }
 static inline void
 kdbg_set_timestamp_and_cpu(kd_buf *kp, uint64_t thetime, int cpu)
 {
-       kdbg_set_timestamp(kp, thetime);
-       kdbg_set_cpu(kp, cpu);
+       kp->timestamp = (thetime & KDBG_TIMESTAMP_MASK) |
+                               (((uint64_t) cpu) << KDBG_CPU_SHIFT);
 }
 #endif
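To make the legacy packing concrete: on the non-LP64/non-arm64 layout the CPU number occupies the top byte of the 64-bit timestamp word, leaving 56 bits of timestamp. A small hedged example using only the masks and accessors above:

    /* Hedged sketch: 56-bit timestamp and 8-bit cpu packed into one uint64_t. */
    kd_buf kb = { 0 };
    kdbg_set_timestamp_and_cpu(&kb, 0x00123456789abcdeULL, 3);
    /* kdbg_get_timestamp(&kb) -> 0x00123456789abcde  (low 56 bits preserved) */
    /* kdbg_get_cpu(&kb)       -> 3                   (stored in the top byte) */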
 
@@ -1570,7 +1670,11 @@ typedef struct {
 
 typedef struct {
        /* the thread ID */
+#if defined(__arm64__)
+       uint64_t thread;
+#else
        uintptr_t thread;
+#endif
        /* 0 for invalid, otherwise the PID (or 1 for kernel_task) */
        int valid;
        /* the name of the process owning the thread */
index 52bce789c6c5e5791f8a6257a435377dad8034f0..c3f5cec27fb443547d31b435beb96545c11b5cac 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2006-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -96,6 +96,31 @@ typedef struct memorystatus_priority_entry {
        uint32_t state;
 } memorystatus_priority_entry_t;
 
+/*
+ * This should be the structure to specify different properties
+ * for processes (group or single) from user-space. Unfortunately,
+ * we can't move to it completely because the priority_entry structure
+ * above has been in use for a while now. We'll have to deprecate it.
+ *
+ * To support new fields/properties, we will add a new structure with a
+ * new version and a new size. 
+ */
+#define MEMORYSTATUS_MPE_VERSION_1             1
+
+#define MEMORYSTATUS_MPE_VERSION_1_SIZE                sizeof(struct memorystatus_properties_entry_v1)
+
+typedef        struct memorystatus_properties_entry_v1 {
+       int version;
+       pid_t pid;
+       int32_t priority;
+       int use_probability;
+       uint64_t user_data;
+       int32_t limit;  /* MB */
+       uint32_t state;
+       char proc_name[MAXCOMLEN+1];
+       char __pad1[3];
+} memorystatus_properties_entry_v1_t;
+       
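A hedged example of filling in the new versioned entry; pairing it with MEMORYSTATUS_CMD_GRP_SET_PROPERTIES and the MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY flag defined later in this header is an assumption for illustration, not a documented contract.

    /* Hedged sketch: mark one process as likely to be used, for jetsam ordering. */
    pid_t target_pid = 1234;                     /* placeholder */
    memorystatus_properties_entry_v1_t entry = {
            .version         = MEMORYSTATUS_MPE_VERSION_1,
            .pid             = target_pid,
            .use_probability = 1,
            .proc_name       = "ExampleApp",     /* placeholder */
    };

    int rc = memorystatus_control(MEMORYSTATUS_CMD_GRP_SET_PROPERTIES, 0,
        MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY, &entry, sizeof(entry));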
 typedef struct memorystatus_kernel_stats {
        uint32_t free_pages;
        uint32_t active_pages;
@@ -131,7 +156,6 @@ typedef struct jetsam_snapshot_entry {
        uint64_t user_data;
        uint64_t killed;
        uint64_t pages;
-       uint64_t max_pages;
        uint64_t max_pages_lifetime;
        uint64_t purgeable_pages;
        uint64_t jse_internal_pages;
@@ -148,7 +172,8 @@ typedef struct jetsam_snapshot_entry {
        uint64_t jse_killtime;                  /* absolute time when jetsam chooses to kill a process */
        uint64_t jse_idle_delta;                /* time spent in idle band */
        uint64_t jse_coalition_jetsam_id;       /* we only expose coalition id for COALITION_TYPE_JETSAM */
-       struct timeval cpu_time;
+       struct timeval64 cpu_time;
+       uint64_t jse_thaw_count;
 } memorystatus_jetsam_snapshot_entry_t;
 
 typedef struct jetsam_snapshot {
@@ -185,19 +210,21 @@ typedef struct memorystatus_freeze_entry {
  *     kMemorystatusKilled... Cause enum
  *     memorystatus_kill_cause_name[]
  */
-#define JETSAM_REASON_INVALID                  0
-#define JETSAM_REASON_GENERIC                  1
-#define JETSAM_REASON_MEMORY_HIGHWATER         2
-#define JETSAM_REASON_VNODE                    3
-#define JETSAM_REASON_MEMORY_VMPAGESHORTAGE    4
-#define JETSAM_REASON_MEMORY_VMTHRASHING       5
-#define JETSAM_REASON_MEMORY_FCTHRASHING       6
-#define JETSAM_REASON_MEMORY_PERPROCESSLIMIT   7
-#define JETSAM_REASON_MEMORY_DIAGNOSTIC                8
-#define JETSAM_REASON_MEMORY_IDLE_EXIT         9
-#define JETSAM_REASON_ZONE_MAP_EXHAUSTION      10
-
-#define JETSAM_REASON_MEMORYSTATUS_MAX   JETSAM_REASON_ZONE_MAP_EXHAUSTION
+#define JETSAM_REASON_INVALID                                                          0
+#define JETSAM_REASON_GENERIC                                                          1
+#define JETSAM_REASON_MEMORY_HIGHWATER                                         2
+#define JETSAM_REASON_VNODE                                                                    3
+#define JETSAM_REASON_MEMORY_VMPAGESHORTAGE                                    4
+#define JETSAM_REASON_MEMORY_PROCTHRASHING                                     5
+#define JETSAM_REASON_MEMORY_FCTHRASHING                                       6
+#define JETSAM_REASON_MEMORY_PERPROCESSLIMIT                           7
+#define JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE                       8
+#define JETSAM_REASON_MEMORY_IDLE_EXIT                                         9
+#define JETSAM_REASON_ZONE_MAP_EXHAUSTION                                      10
+#define JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING                    11
+#define JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE       12
+
+#define JETSAM_REASON_MEMORYSTATUS_MAX JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE
 
 /*
  * Jetsam exit reason definitions - not related to memorystatus
@@ -206,19 +233,26 @@ typedef struct memorystatus_freeze_entry {
 
 /* Cause */
 enum {
-       kMemorystatusInvalid                    = JETSAM_REASON_INVALID,
-       kMemorystatusKilled                     = JETSAM_REASON_GENERIC,
-       kMemorystatusKilledHiwat                = JETSAM_REASON_MEMORY_HIGHWATER,
-       kMemorystatusKilledVnodes               = JETSAM_REASON_VNODE,
-       kMemorystatusKilledVMPageShortage       = JETSAM_REASON_MEMORY_VMPAGESHORTAGE,
-       kMemorystatusKilledVMThrashing          = JETSAM_REASON_MEMORY_VMTHRASHING,
-       kMemorystatusKilledFCThrashing          = JETSAM_REASON_MEMORY_FCTHRASHING,
-       kMemorystatusKilledPerProcessLimit      = JETSAM_REASON_MEMORY_PERPROCESSLIMIT,
-       kMemorystatusKilledDiagnostic           = JETSAM_REASON_MEMORY_DIAGNOSTIC,
-       kMemorystatusKilledIdleExit             = JETSAM_REASON_MEMORY_IDLE_EXIT,
-       kMemorystatusKilledZoneMapExhaustion    = JETSAM_REASON_ZONE_MAP_EXHAUSTION
+       kMemorystatusInvalid                                                    = JETSAM_REASON_INVALID,
+       kMemorystatusKilled                                                             = JETSAM_REASON_GENERIC,
+       kMemorystatusKilledHiwat                                                = JETSAM_REASON_MEMORY_HIGHWATER,
+       kMemorystatusKilledVnodes                                               = JETSAM_REASON_VNODE,
+       kMemorystatusKilledVMPageShortage                               = JETSAM_REASON_MEMORY_VMPAGESHORTAGE,
+       kMemorystatusKilledProcThrashing                                = JETSAM_REASON_MEMORY_PROCTHRASHING,
+       kMemorystatusKilledFCThrashing                                  = JETSAM_REASON_MEMORY_FCTHRASHING,
+       kMemorystatusKilledPerProcessLimit                              = JETSAM_REASON_MEMORY_PERPROCESSLIMIT,
+       kMemorystatusKilledDiskSpaceShortage                    = JETSAM_REASON_MEMORY_DISK_SPACE_SHORTAGE,
+       kMemorystatusKilledIdleExit                                             = JETSAM_REASON_MEMORY_IDLE_EXIT,
+       kMemorystatusKilledZoneMapExhaustion                    = JETSAM_REASON_ZONE_MAP_EXHAUSTION,
+       kMemorystatusKilledVMCompressorThrashing                = JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING,
+       kMemorystatusKilledVMCompressorSpaceShortage    = JETSAM_REASON_MEMORY_VMCOMPRESSOR_SPACE_SHORTAGE,
 };
 
+/* For backwards compatibility */
+#define kMemorystatusKilledDiagnostic          kMemorystatusKilledDiskSpaceShortage
+#define kMemorystatusKilledVMThrashing         kMemorystatusKilledVMCompressorThrashing
+#define JETSAM_REASON_MEMORY_VMTHRASHING       JETSAM_REASON_MEMORY_VMCOMPRESSOR_THRASHING
+
 /* Memorystatus control */
 #define MEMORYSTATUS_BUFFERSIZE_MAX 65536
 
@@ -241,8 +275,19 @@ int memorystatus_control(uint32_t command, int32_t pid, uint32_t flags, void *bu
 #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_ENABLE  11   /* Enable the 'lenient' mode for aggressive jetsam. See comments in kern_memorystatus.c near the top. */
 #define MEMORYSTATUS_CMD_AGGRESSIVE_JETSAM_LENIENT_MODE_DISABLE 12   /* Disable the 'lenient' mode for aggressive jetsam. */
 #define MEMORYSTATUS_CMD_GET_MEMLIMIT_EXCESS          13   /* Compute how much a process's phys_footprint exceeds inactive memory limit */
-#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE        14
-#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE       15
+#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_ENABLE        14 /* Set the inactive jetsam band for a process to JETSAM_PRIORITY_ELEVATED_INACTIVE */
+#define MEMORYSTATUS_CMD_ELEVATED_INACTIVEJETSAMPRIORITY_DISABLE       15 /* Reset the inactive jetsam band for a process to the default band (0)*/
+#define MEMORYSTATUS_CMD_SET_PROCESS_IS_MANAGED       16   /* (Re-)Set state on a process that marks it as (un-)managed by a system entity e.g. assertiond */
+#define MEMORYSTATUS_CMD_GET_PROCESS_IS_MANAGED       17   /* Return the 'managed' status of a process */
+#define MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE     18   /* Is the process eligible for freezing? Apps and extensions can pass in FALSE to opt out of freezing, i.e.,
+                                                              if they would prefer being jetsam'ed in the idle band to being frozen in an elevated band. */
+#define MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE     19   /* Return the freezable state of a process. */
+
+#if CONFIG_FREEZE
+#if DEVELOPMENT || DEBUG
+#define MEMORYSTATUS_CMD_FREEZER_CONTROL             20
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* CONFIG_FREEZE */
 
 /* Commands that act on a group of processes */
 #define MEMORYSTATUS_CMD_GRP_SET_PROPERTIES           100
@@ -268,6 +313,14 @@ typedef struct memorystatus_jetsam_panic_options {
 
 #endif /* PRIVATE */
 
+/* memorystatus_control() flags */
+
+#define MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND          0x1     /* A populated snapshot buffer is returned on demand */
+#define MEMORYSTATUS_FLAGS_SNAPSHOT_AT_BOOT            0x2     /* Returns a snapshot with memstats collected at boot */
+#define MEMORYSTATUS_FLAGS_SNAPSHOT_COPY               0x4     /* Returns the previously populated snapshot created by the system */
+#define MEMORYSTATUS_FLAGS_GRP_SET_PRIORITY            0x8     /* Set jetsam priorities for a group of pids */
+#define MEMORYSTATUS_FLAGS_GRP_SET_PROBABILITY         0x10    /* Set probability of use for a group of processes */
+
 /*
  * For use with memorystatus_control:
  * MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT
@@ -287,14 +340,18 @@ typedef struct memorystatus_jetsam_panic_options {
  *     stats do not change.  In this mode,
  *     the snapshot entry_count is always 0.
  *
+ *     Copy mode - this returns the previous snapshot
+ *     collected by the system. The current snaphshot
+ *     collected by the system. The current snapshot
+ *
  * Snapshots are inherently racey between request
  * for buffer size and actual data compilation.
 */
 
-/* Flags */
-#define MEMORYSTATUS_SNAPSHOT_ON_DEMAND                0x1     /* A populated snapshot buffer is returned on demand */
-#define MEMORYSTATUS_SNAPSHOT_AT_BOOT          0x2     /* Returns a snapshot with memstats collected at boot */
-
+/* These definitions are required for backwards compatibility */
+#define MEMORYSTATUS_SNAPSHOT_ON_DEMAND                MEMORYSTATUS_FLAGS_SNAPSHOT_ON_DEMAND
+#define MEMORYSTATUS_SNAPSHOT_AT_BOOT          MEMORYSTATUS_FLAGS_SNAPSHOT_AT_BOOT
+#define MEMORYSTATUS_SNAPSHOT_COPY             MEMORYSTATUS_FLAGS_SNAPSHOT_COPY
 
 /*
  * For use with memorystatus_control:
@@ -348,19 +405,19 @@ typedef struct memorystatus_memlimit_properties {
  *     - in kernel process state and memlimit state
  */
 
-#define P_MEMSTAT_SUSPENDED            0x00000001
-#define P_MEMSTAT_FROZEN               0x00000002
-#define P_MEMSTAT_NORECLAIM            0x00000004
-#define P_MEMSTAT_ERROR                0x00000008
-#define P_MEMSTAT_LOCKED               0x00000010
-#define P_MEMSTAT_TERMINATED           0x00000020
-#define P_MEMSTAT_NOTFIED              0x00000040
-#define P_MEMSTAT_PRIORITYUPDATED      0x00000080
-#define P_MEMSTAT_FOREGROUND           0x00000100
-#define P_MEMSTAT_DIAG_SUSPENDED       0x00000200
-#define P_MEMSTAT_PRIOR_THAW           0x00000400
-/* unused                              0x00000800 */
-#define P_MEMSTAT_INTERNAL             0x00001000
+#define P_MEMSTAT_SUSPENDED            0x00000001 /* Process is suspended and likely in the IDLE band */
+#define P_MEMSTAT_FROZEN               0x00000002 /* Process has some state on disk. It should be suspended */
+#define P_MEMSTAT_FREEZE_DISABLED      0x00000004 /* Process isn't freeze-eligible and will not be frozen */
+#define P_MEMSTAT_ERROR                0x00000008 /* Process couldn't be jetsammed for some reason. Transient state so jetsam can skip it next time it sees it */
+#define P_MEMSTAT_LOCKED               0x00000010 /* Process is being actively worked on behind the proc_list_lock */
+#define P_MEMSTAT_TERMINATED           0x00000020 /* Process is exiting */
+#define P_MEMSTAT_FREEZE_IGNORE        0x00000040 /* Process was evaluated by freezer and will be ignored till the next time it goes active and does something */
+#define P_MEMSTAT_PRIORITYUPDATED      0x00000080 /* Process had its jetsam priority updated */
+#define P_MEMSTAT_FOREGROUND           0x00000100 /* Process is in the FG jetsam band...unused??? */
+#define P_MEMSTAT_DIAG_SUSPENDED       0x00000200 /* ...unused??? */
+#define P_MEMSTAT_REFREEZE_ELIGIBLE    0x00000400 /* Process was once thawed i.e. its state was brought back from disk. It is now refreeze eligible.*/
+#define P_MEMSTAT_MANAGED              0x00000800 /* Process is managed by assertiond i.e. is either application or extension */
+#define P_MEMSTAT_INTERNAL             0x00001000 /* Process is a system-critical-not-be-jetsammed process i.e. launchd */
 #define P_MEMSTAT_FATAL_MEMLIMIT                  0x00002000   /* current fatal state of the process's memlimit */
 #define P_MEMSTAT_MEMLIMIT_ACTIVE_FATAL           0x00004000   /* if set, exceeding limit is fatal when the process is active   */
 #define P_MEMSTAT_MEMLIMIT_INACTIVE_FATAL         0x00008000   /* if set, exceeding limit is fatal when the process is inactive */
@@ -378,7 +435,7 @@ extern int memorystatus_update(proc_t p, int priority, uint64_t user_data, boole
 
 extern int memorystatus_remove(proc_t p, boolean_t locked);
 
-int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, boolean_t effective_now);
+int memorystatus_update_inactive_jetsam_priority_band(pid_t pid, uint32_t opflags, int priority, boolean_t effective_now);
 
 
 extern int memorystatus_dirty_track(proc_t p, uint32_t pcontrol);
@@ -405,7 +462,7 @@ void memorystatus_knote_unregister(struct knote *kn);
 void memorystatus_log_exception(const int max_footprint_mb, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal);
 void memorystatus_on_ledger_footprint_exceeded(int warning, boolean_t memlimit_is_active, boolean_t memlimit_is_fatal);
 void proc_memstat_terminated(proc_t p, boolean_t set);
-boolean_t memorystatus_proc_is_dirty_unsafe(void *v);
+void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
 #endif /* CONFIG_MEMORYSTATUS */
 
 int memorystatus_get_pressure_status_kdp(void);
@@ -420,19 +477,18 @@ typedef enum memorystatus_policy {
        kPolicyDiagnoseActive = (kPolicyDiagnoseAll | kPolicyDiagnoseFirst),
 } memorystatus_policy_t;
 
-extern int memorystatus_jetsam_wakeup;
-extern unsigned int memorystatus_jetsam_running;
-
 boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
 boolean_t memorystatus_kill_on_FC_thrashing(boolean_t async);
+boolean_t memorystatus_kill_on_VM_compressor_thrashing(boolean_t async);
 boolean_t memorystatus_kill_on_vnode_limit(void);
 
 void jetsam_on_ledger_cpulimit_exceeded(void);
+void memorystatus_fast_jetsam_override(boolean_t enable_override);
 
 #endif /* CONFIG_JETSAM */
 
 boolean_t memorystatus_kill_on_zone_map_exhaustion(pid_t pid);
-boolean_t memorystatus_kill_on_VM_thrashing(boolean_t async);
+boolean_t memorystatus_kill_on_VM_compressor_space_shortage(boolean_t async);
 void memorystatus_pages_update(unsigned int pages_avail);
 
 boolean_t memorystatus_idle_exit_from_VM(void);
@@ -440,13 +496,19 @@ boolean_t memorystatus_idle_exit_from_VM(void);
 
 #ifdef CONFIG_FREEZE
 
-#define FREEZE_PAGES_MIN   ( 1 * 1024 * 1024 / PAGE_SIZE)
-#define FREEZE_PAGES_MAX   (16 * 1024 * 1024 / PAGE_SIZE)
+#define FREEZE_PAGES_MIN   ( 8 * 1024 * 1024 / PAGE_SIZE)
+#define FREEZE_PAGES_MAX   (32 * 1024 * 1024 / PAGE_SIZE)
 
-#define FREEZE_SUSPENDED_THRESHOLD_LOW     2
 #define FREEZE_SUSPENDED_THRESHOLD_DEFAULT 4
+#define FREEZE_PROCESSES_MAX              20
 
 #define FREEZE_DAILY_MB_MAX_DEFAULT      1024
+#define FREEZE_DEGRADATION_BUDGET_THRESHOLD    25 //degraded perf. when the daily budget left falls below this threshold percentage
+
+#define MAX_FROZEN_SHARED_MB_PERCENT 10 /* max shared MB calculated as percent of system task limit. */
+#define MAX_FROZEN_PROCESS_DEMOTIONS 2  /* max demotions of frozen processes into IDLE band done daily. */
+#define MIN_THAW_DEMOTION_THRESHOLD  5  /* min # of thaws required for a process to be safe from demotion. */
+#define MIN_THAW_REFREEZE_THRESHOLD  3  /* min # of global thaws needed for us to consider refreezing these processes. */
 
 typedef struct throttle_interval_t {
        uint32_t mins;
@@ -454,7 +516,6 @@ typedef struct throttle_interval_t {
        uint32_t pageouts;
        uint32_t max_pageouts;
        mach_timespec_t ts;
-       boolean_t throttle;
 } throttle_interval_t;
 
 extern boolean_t memorystatus_freeze_enabled;
@@ -462,6 +523,11 @@ extern int memorystatus_freeze_wakeup;
 
 extern void memorystatus_freeze_init(void) __attribute__((section("__TEXT, initcode")));
 extern int  memorystatus_freeze_process_sync(proc_t p);
+
+#if DEVELOPMENT || DEBUG
+#define FREEZER_CONTROL_GET_STATUS     (1)
+#endif /* DEVELOPMENT || DEBUG */
+
 #endif /* CONFIG_FREEZE */
 
 #if VM_PRESSURE_EVENTS
index b1b486528ebdac0eeb3a380520ec7a4b28bcf808..e2212a3c3904e124b2532a1522b96e0f80b2870b 100644 (file)
 
 __BEGIN_DECLS
 
+/*
+ * system_override() system call
+ *
+ * The system_override() syscall is used to modify some kernel performance mechanisms.
+ * The system call needs a special entitlement and should be used with extreme caution. 
+ * A misuse of this syscall could lead to severe performance and battery life issues.
+ *
+ * The caller needs to specify the mask for the specific mechanisms to modify and a 
+ * timeout. The implementation of this system call blocks the thread in the syscall 
+ * for the duration specified in the call. Blocking a thread in the system call allows 
+ * the kernel to revert the modification in case the calling process dies. It also 
+ * makes the change of behavior extremely obvious due to the backtrace of the calling 
+ * thread.
+ *
+ * Multiple agents are allowed to call this interface at the same time. The behavior 
+ * change is effective from the time the first call is made (for a specific mechanism) 
+ * until the longest timeout specified by any agent. If the caller wishes to disable 
+ * the behavior change caused by itself, it can call the same interface with the 
+ * SYS_OVERRIDE_DISABLE flag and the mechanism mask from another thread in the same 
+ * process. Note that this does not immediately unblock the original thread; it 
+ * simply undoes the underlying mechanism change.
+ *
+ * The currently supported overrides are:
+ * - SYS_OVERRIDE_IO_THROTTLE:   Modifies I/O throttling behavior
+ * - SYS_OVERRIDE_CPU_THROTTLE:  Modifies background stepper throttling mechanism
+ * - SYS_OVERRIDE_FAST_JETSAM:   Modifies jetsam behavior to use aggressive parallel jetsam
+ *
+ */
+
 /* System Overrides Flags */
-#define SYS_OVERRIDE_DISABLE           0x0
+#define SYS_OVERRIDE_DISABLE           (~(~0ull >> 1))
 #define SYS_OVERRIDE_IO_THROTTLE       0x1
 #define SYS_OVERRIDE_CPU_THROTTLE      0x2
+#define SYS_OVERRIDE_FAST_JETSAM       0x4
 
-
-#define SYS_OVERRIDE_FLAGS_MASK                (SYS_OVERRIDE_DISABLE | SYS_OVERRIDE_IO_THROTTLE | SYS_OVERRIDE_CPU_THROTTLE)
+#define SYS_OVERRIDE_FLAGS_MASK                (SYS_OVERRIDE_DISABLE | SYS_OVERRIDE_IO_THROTTLE | SYS_OVERRIDE_CPU_THROTTLE | SYS_OVERRIDE_FAST_JETSAM)
 
 #ifdef BSD_KERNEL_PRIVATE
 void init_system_override(void);
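
To make the blocking semantics above concrete: the syscall prototype is not part of this header, so the sketch below assumes a form like int system_override(uint64_t timeout, uint64_t flags), with an unspecified time unit, behind the required entitlement. Note that SYS_OVERRIDE_DISABLE is the most-significant bit of the 64-bit flags word, which is why it can be OR-ed with a mechanism mask when a second thread undoes an override.

	#include <stdint.h>

	/*
	 * Hypothetical prototype -- the actual declaration, timeout unit, and
	 * entitlement plumbing are not shown in this diff.
	 */
	extern int system_override(uint64_t timeout, uint64_t flags);

	/* Thread A: enable aggressive parallel jetsam; this call blocks for `timeout'. */
	static void
	enable_fast_jetsam_for(uint64_t timeout)
	{
		(void)system_override(timeout, SYS_OVERRIDE_FAST_JETSAM);
	}

	/*
	 * Thread B (same process): revert our contribution early.  The blocked
	 * thread in A stays blocked until its timeout; only the behavior change
	 * is undone.
	 */
	static void
	cancel_fast_jetsam_override(void)
	{
		(void)system_override(0, SYS_OVERRIDE_DISABLE | SYS_OVERRIDE_FAST_JETSAM);
	}
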
index d877f0974f7fd28bd5db406b917508adc436aa74..9ecef199cb7cad48cb4e036bc5e3ed1635ec86ba 100644 (file)
@@ -1535,6 +1535,10 @@ extern errno_t mbuf_get_traffic_class_index(mbuf_traffic_class_t tc,
                medium loss tolerant, elastic flow, constant packet interval,
                variable rate & size.  This level corresponds to WMM access
                class "VI" or MBUF_TC_VI.
+       @constant MBUF_SC_SIG "Signaling", low delay tolerant, low loss
+               tolerant, inelastic flow, jitter tolerant, rate is bursty but
+               short, variable size. e.g. SIP.  This level corresponds to WMM
+               access class "VI" or MBUF_TC_VI.
        @constant MBUF_SC_VO "Interactive Voice", low delay tolerant, low loss
                tolerant, inelastic flow, constant packet rate, somewhat fixed
                size.  This level corresponds to WMM access class "VO" or
@@ -1556,6 +1560,7 @@ typedef enum {
        MBUF_SC_AV              = 0x00280120,
        MBUF_SC_RV              = 0x00300110,
        MBUF_SC_VI              = 0x00380100,
+       MBUF_SC_SIG             = 0x00380130,
 
        MBUF_SC_VO              = 0x00400180,
        MBUF_SC_CTL             = 0x00480190,   /* highest class */
index 8fd29dbb9fc9874c8ccc952a2ea50919fea10cfc..4ebdcec61d606fb0b5ca5b4ca596dc09b48a539f 100644 (file)
@@ -146,15 +146,8 @@ struct linker_set_entry {
  *     Iterates over the members of _set within _object.  Since the set contains
  *     pointers to its elements, for a set of elements of type etyp, _pvar must
  *     be (etyp **).
- * set_member_type **LINKER_SET_OBJECT_ITEM(_object, _set, _i)
- *     Returns a pointer to the _i'th element of _set within _object.
- *
- * void **LINKER_SET_BEGIN(_set)
- * void **LINKER_SET_LIMINT(_set)
  * LINKER_SET_FOREACH((set_member_type **)_pvar, _cast, _set)
- * set_member_type **LINKER_SET_ITEM(_set, _i)
- *     These versions implicitly reference the kernel/application object.
- * 
+ *
  * Example of _cast: For the _pvar "struct sysctl_oid **oidpp", _cast would be
  *                  "struct sysctl_oid **"
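
Following the _cast example above, a minimal hypothetical consumer of LINKER_SET_FOREACH could look like the sketch below; the set name and the helper called on each element are illustrative, not taken from this header.

	/* Hypothetical: walk a linker set of sysctl_oid pointers in the kernel image. */
	struct sysctl_oid **oidpp;

	LINKER_SET_FOREACH(oidpp, struct sysctl_oid **, example_sysctl_set) {
		/* *oidpp is one element of the set "example_sysctl_set" */
		example_register_oid(*oidpp);
	}
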
  *
@@ -168,17 +161,11 @@ struct linker_set_entry {
             _pvar < (_cast) LINKER_SET_OBJECT_LIMIT(_object, _set);    \
             _pvar++)
 
-#define LINKER_SET_OBJECT_ITEM(_object, _set, _i)                      \
-       ((LINKER_SET_OBJECT_BEGIN(_object, _set))[_i])
+#define LINKER_SET_OBJECT_ITEM(_object, _cast, _set, _i)               \
+       (((_cast)(LINKER_SET_OBJECT_BEGIN(_object, _set)))[_i])
 
-#define LINKER_SET_BEGIN(_set)                                         \
-               LINKER_SET_OBJECT_BEGIN((kernel_mach_header_t *)&_mh_execute_header, _set)
-#define LINKER_SET_LIMIT(_set)                                         \
-               LINKER_SET_OBJECT_LIMIT((kernel_mach_header_t *)&_mh_execute_header, _set)
 #define LINKER_SET_FOREACH(_pvar, _cast, _set)                                 \
        LINKER_SET_OBJECT_FOREACH((kernel_mach_header_t *)&_mh_execute_header, _pvar, _cast, _set)
-#define LINKER_SET_ITEM(_set, _i)                                      \
-       LINKER_SET_OBJECT_ITEM((kernel_mach_header_t *)&_mh_execute_header, _set, _i)
 
 /*
  * Implementation.
index a9e536d7ae014f15ba8c94c94707c558b8bf0d6f..8707892610d786b991fa3d3210a078e7f37a69a5 100644 (file)
@@ -182,7 +182,7 @@ extern void (lockstat_probe_wrapper)(int, uintptr_t, int);
 #define        LOCKSTAT_RECORD4(probe, lp, arg0, arg1, arg2, arg3)             \
        {                                                               \
                dtrace_id_t id;                                         \
-               if ((id = lockstat_probemap[(probe)])) {                \
+               if (__improbable(id = lockstat_probemap[(probe)])) {            \
                        (*lockstat_probe)(id, (uintptr_t)(lp), (arg0),  \
                            (arg1), (arg2), (arg3));                    \
                }                                                       \
index 0dd7117f5b8fce73129d737ca72821c4e51804b4..fea78a29cf54655fa17020deacc51c4533f47c11 100644 (file)
 #define M_EVENTHANDLER 125     /* Eventhandler */
 #define M_LLTABLE      126     /* Link layer table */
 #define M_NWKWQ                127     /* Network work queue */
+#define M_CFIL      128 /* Content Filter */
 
-#define        M_LAST          128     /* Must be last type + 1 */
+#define        M_LAST          129     /* Must be last type + 1 */
 
 #else /* BSD_KERNEL_PRIVATE */
 
index 8dafca387c015f37a77fa4a0b35e7d344b4857f8..b89ee1459fa9c0fe939061086dd500a2450241df 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1999-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifndef        _SYS_MBUF_H_
 #define        _SYS_MBUF_H_
 
-#include <sys/cdefs.h>
 #include <sys/appleapiopts.h>
+#include <sys/cdefs.h>
 #include <sys/_types/_u_int32_t.h> /* u_int32_t */
 #include <sys/_types/_u_int64_t.h> /* u_int64_t */
 #include <sys/_types/_u_short.h> /* u_short */
 
-#ifdef XNU_KERNEL_PRIVATE
+#ifdef KERNEL
+#include <sys/kpi_mbuf.h>
+#endif
 
+#ifdef XNU_KERNEL_PRIVATE
 #include <sys/lock.h>
 #include <sys/queue.h>
 #include <machine/endian.h>
@@ -228,6 +231,8 @@ struct tcp_pktinfo {
                struct {
                        u_int32_t segsz;        /* segment size (actual MSS) */
                        u_int32_t start_seq;    /* start seq of this packet */
+                       pid_t     pid;
+                       pid_t     e_pid;
                } __tx;
                struct {
                        u_int16_t lro_pktlen;   /* max seg size encountered */
@@ -241,6 +246,8 @@ struct tcp_pktinfo {
        } __msgattr;
 #define tso_segsz      proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.segsz
 #define        tx_start_seq    proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.start_seq
+#define        tx_tcp_pid      proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.pid
+#define        tx_tcp_e_pid    proto_mtag.__pr_u.tcp.tm_tcp.__offload.__tx.e_pid
 #define lro_pktlen     proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_pktlen
 #define lro_npkts      proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_npkts
 #define lro_elapsed    proto_mtag.__pr_u.tcp.tm_tcp.__offload.__rx.lro_timediff
@@ -275,6 +282,20 @@ struct tcp_mtag {
        };
 };
 
+struct udp_mtag {
+       pid_t     _pid;
+       pid_t     _e_pid;
+#define        tx_udp_pid      proto_mtag.__pr_u.udp._pid
+#define        tx_udp_e_pid    proto_mtag.__pr_u.udp._e_pid
+};
+
+struct rawip_mtag {
+       pid_t     _pid;
+       pid_t     _e_pid;
+#define        tx_rawip_pid    proto_mtag.__pr_u.rawip._pid
+#define        tx_rawip_e_pid  proto_mtag.__pr_u.rawip._e_pid
+};
+
 struct driver_mtag_ {
        uintptr_t               _drv_tx_compl_arg;
        uintptr_t               _drv_tx_compl_data;
@@ -297,6 +318,8 @@ struct driver_mtag_ {
 struct proto_mtag_ {
        union {
                struct tcp_mtag tcp;            /* TCP specific */
+               struct udp_mtag         udp;    /* UDP specific */
+               struct rawip_mtag       rawip;  /* raw IPv4/IPv6 specific */
        } __pr_u;
 };
 
@@ -305,9 +328,10 @@ struct proto_mtag_ {
  */
 struct necp_mtag_ {
        u_int32_t       necp_policy_id;
-       u_int32_t       necp_last_interface_index;
+       u_int32_t       necp_skip_policy_id;
        u_int32_t       necp_route_rule_id;
-       u_int32_t       necp_app_id;
+       u_int16_t       necp_last_interface_index;
+       u_int16_t       necp_app_id;
 };
 
 union builtin_mtag {
@@ -346,7 +370,11 @@ struct pkthdr {
                } _csum_tx;
 #define        csum_tx_start   _csum_tx.start
 #define        csum_tx_stuff   _csum_tx.stuff
-               u_int32_t csum_data;    /* data field used by csum routines */
+               /*
+                * Generic data field used by csum routines.
+                * It gets used differently in different contexts.
+                */
+               u_int32_t csum_data;
        };
        u_int16_t vlan_tag;             /* VLAN tag, host byte order */
        /*
@@ -758,36 +786,61 @@ union m16kcluster {
 #define        M_COPY_CLASSIFIER(to, from)     m_copy_classifier(to, from)
 
 /*
- * Set the m_data pointer of a newly-allocated mbuf (m_get/MGET) to place
- * an object of the specified size at the end of the mbuf, longword aligned.
+ * Evaluates to TRUE if it's safe to write to the mbuf m's data region (this can
+ * be either the local data payload or an external buffer area, depending on
+ * whether M_EXT is set).
  */
-#define        M_ALIGN(m, len)                                                 \
-do {                                                                   \
-       (m)->m_data += (MLEN - (len)) &~ (sizeof (long) - 1);           \
-} while (0)
+#define        M_WRITABLE(m)   (((m)->m_flags & M_EXT) == 0 || !MCLHASREFERENCE(m))
 
 /*
- * As above, for mbufs allocated with m_gethdr/MGETHDR
- * or initialized by M_COPY_PKTHDR.
+ * These macros are mapped to the appropriate KPIs, so that private code
+ * can simply be recompiled to remain forward-compatible with future
+ * changes to the structure sizes.
+ */
+#define MLEN            mbuf_get_mlen()         /* normal mbuf data len */
+#define MHLEN           mbuf_get_mhlen()        /* data len in an mbuf w/pkthdr */
+#define MINCLSIZE       mbuf_get_minclsize()    /* cluster usage threshold */
+/*
+ * Return the address of the start of the buffer associated with an mbuf,
+ * handling external storage, packet-header mbufs, and regular data mbufs.
  */
-#define        MH_ALIGN(m, len)                                                \
-do {                                                                   \
-       (m)->m_data += (MHLEN - (len)) &~ (sizeof (long) - 1);          \
-} while (0)
+#define M_START(m)                                                      \
+        (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_buf :                  \
+         ((m)->m_flags & M_PKTHDR) ? &(m)->m_pktdat[0] :                \
+         &(m)->m_dat[0])
 
 /*
- * Compute the amount of space available
- * before the current start of data in an mbuf.
- * Subroutine - data not available if certain references.
+ * Return the size of the buffer associated with an mbuf, handling external
+ * storage, packet-header mbufs, and regular data mbufs.
  */
-#define        M_LEADINGSPACE(m)       m_leadingspace(m)
+#define M_SIZE(m)                                                       \
+        (((m)->m_flags & M_EXT) ? (m)->m_ext.ext_size :                 \
+         ((m)->m_flags & M_PKTHDR) ? MHLEN :                            \
+         MLEN)
+
+#define        M_ALIGN(m, len)         m_align(m, len)
+#define        MH_ALIGN(m, len)        m_align(m, len)
+#define        MEXT_ALIGN(m, len)      m_align(m, len)
+
+/*
+ * Compute the amount of space available before the current start of data in
+ * an mbuf.
+ *
+ * The M_WRITABLE() is a temporary, conservative safety measure: the burden
+ * of checking writability of the mbuf data area rests solely with the caller.
+ */
+#define        M_LEADINGSPACE(m)                                               \
+       (M_WRITABLE(m) ? ((m)->m_data - M_START(m)) : 0)
 
 /*
- * Compute the amount of space available
- * after the end of data in an mbuf.
- * Subroutine - data not available if certain references.
+ * Compute the amount of space available after the end of data in an mbuf.
+ *
+ * The M_WRITABLE() is a temporary, conservative safety measure: the burden
+ * of checking writability of the mbuf data area rests solely with the caller.
  */
-#define        M_TRAILINGSPACE(m)      m_trailingspace(m)
+#define        M_TRAILINGSPACE(m)                                              \
+       (M_WRITABLE(m) ?                                                \
+           ((M_START(m) + M_SIZE(m)) - ((m)->m_data + (m)->m_len)) : 0)
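
The rewritten macros derive everything from two primitives: M_START() yields the base of whichever buffer backs the mbuf (external cluster, packet-header data area, or plain data area) and M_SIZE() its size; leading space is m_data minus that base, trailing space is base plus size minus the end of valid data, and both collapse to 0 when M_WRITABLE() is false. A worked illustration for a plain (non-M_EXT, non-M_PKTHDR) mbuf:

	/*
	 * Illustrative only -- values assume a plain data mbuf with no external storage.
	 *
	 *   m_dat:   [.............. MLEN bytes ..............]
	 *   m_data:          ^---- m_len valid bytes ----^
	 *
	 *   M_START(m)          == &m->m_dat[0]
	 *   M_SIZE(m)           == MLEN
	 *   M_LEADINGSPACE(m)   == m->m_data - &m->m_dat[0]
	 *   M_TRAILINGSPACE(m)  == (&m->m_dat[0] + MLEN) - (m->m_data + m->m_len)
	 *
	 * Both space macros evaluate to 0 when M_WRITABLE(m) is false, i.e. when the
	 * mbuf uses an external cluster that is still referenced elsewhere.
	 */
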
 
 /*
  * Arrange to prepend space of size plen to mbuf m.
@@ -1175,16 +1228,6 @@ struct mbuf;
 #define        M_COPYM_MUST_COPY_HDR   3       /* MUST copy pkthdr from old to new */
 #define        M_COPYM_MUST_MOVE_HDR   4       /* MUST move pkthdr from old to new */
 
-/*
- * These macros are mapped to the appropriate KPIs, so that private code
- * can be simply recompiled in order to be forward-compatible with future
- * changes toward the struture sizes.
- */
-#define        MLEN            mbuf_get_mlen()         /* normal data len */
-#define        MHLEN           mbuf_get_mhlen()        /* data len w/pkthdr */
-
-#define        MINCLSIZE       mbuf_get_minclsize()    /* cluster usage threshold */
-
 extern void m_freem(struct mbuf *);
 extern u_int64_t mcl_to_paddr(char *);
 extern void m_adj(struct mbuf *, int);
@@ -1247,6 +1290,7 @@ extern void m_mclfree(caddr_t p);
  *     MBUF_SC_AV      ] ==>   MBUF_TC_VI
  *     MBUF_SC_RV      ]
  *     MBUF_SC_VI      ]
+ *     MBUF_SC_SIG     ]
  *
  *     MBUF_SC_VO      ] ==>   MBUF_TC_VO
  *     MBUF_SC_CTL     ]
@@ -1276,6 +1320,7 @@ extern void m_mclfree(caddr_t p);
 #define        SCIDX_AV                MBUF_SCIDX(MBUF_SC_AV)
 #define        SCIDX_RV                MBUF_SCIDX(MBUF_SC_RV)
 #define        SCIDX_VI                MBUF_SCIDX(MBUF_SC_VI)
+#define        SCIDX_SIG               MBUF_SCIDX(MBUF_SC_SIG)
 #define        SCIDX_VO                MBUF_SCIDX(MBUF_SC_VO)
 #define        SCIDX_CTL               MBUF_SCIDX(MBUF_SC_CTL)
 
@@ -1287,26 +1332,27 @@ extern void m_mclfree(caddr_t p);
 #define        SCVAL_AV                MBUF_SCVAL(MBUF_SC_AV)
 #define        SCVAL_RV                MBUF_SCVAL(MBUF_SC_RV)
 #define        SCVAL_VI                MBUF_SCVAL(MBUF_SC_VI)
+#define        SCVAL_SIG               MBUF_SCVAL(MBUF_SC_SIG)
 #define        SCVAL_VO                MBUF_SCVAL(MBUF_SC_VO)
 #define        SCVAL_CTL               MBUF_SCVAL(MBUF_SC_CTL)
 
 #define        MBUF_VALID_SC(c)                                                \
        (c == MBUF_SC_BK_SYS || c == MBUF_SC_BK || c == MBUF_SC_BE ||   \
        c == MBUF_SC_RD || c == MBUF_SC_OAM || c == MBUF_SC_AV ||       \
-       c == MBUF_SC_RV || c == MBUF_SC_VI || c == MBUF_SC_VO ||        \
-       c == MBUF_SC_CTL)
+       c == MBUF_SC_RV || c == MBUF_SC_VI || c == MBUF_SC_SIG ||       \
+       c == MBUF_SC_VO || c == MBUF_SC_CTL)
 
 #define        MBUF_VALID_SCIDX(c)                                             \
        (c == SCIDX_BK_SYS || c == SCIDX_BK || c == SCIDX_BE ||         \
        c == SCIDX_RD || c == SCIDX_OAM || c == SCIDX_AV ||             \
-       c == SCIDX_RV || c == SCIDX_VI || c == SCIDX_VO ||              \
-       c == SCIDX_CTL)
+       c == SCIDX_RV || c == SCIDX_VI || c == SCIDX_SIG ||             \
+       c == SCIDX_VO || c == SCIDX_CTL)
 
 #define        MBUF_VALID_SCVAL(c)                                             \
        (c == SCVAL_BK_SYS || c == SCVAL_BK || c == SCVAL_BE ||         \
        c == SCVAL_RD || c == SCVAL_OAM || c == SCVAL_AV ||             \
-       c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_VO ||              \
-       c == SCVAL_CTL)
+       c == SCVAL_RV || c == SCVAL_VI || c == SCVAL_SIG ||             \
+	c == SCVAL_VO || c == SCVAL_CTL)
 
 extern unsigned char *mbutl;   /* start VA of mbuf pool */
 extern unsigned char *embutl;  /* end VA of mbuf pool */
@@ -1363,8 +1409,7 @@ __private_extern__ struct mbuf *m_dtom(void *);
 __private_extern__ int m_mtocl(void *);
 __private_extern__ union mcluster *m_cltom(int);
 
-__private_extern__ int m_trailingspace(struct mbuf *);
-__private_extern__ int m_leadingspace(struct mbuf *);
+__private_extern__ void m_align(struct mbuf *, int);
 
 __private_extern__ struct mbuf *m_normalize(struct mbuf *m);
 __private_extern__ void m_mchtype(struct mbuf *m, int t);
@@ -1389,7 +1434,7 @@ __private_extern__ uint32_t m_ext_get_prop(struct mbuf *);
 __private_extern__ int m_ext_paired_is_active(struct mbuf *);
 __private_extern__ void m_ext_paired_activate(struct mbuf *);
 
-__private_extern__ void m_drain(void);
+__private_extern__ void mbuf_drain(boolean_t);
 
 /*
  * Packets may have annotations attached by affixing a list of "packet
@@ -1432,6 +1477,7 @@ enum {
        KERNEL_TAG_TYPE_INET6                   = 9,
        KERNEL_TAG_TYPE_IPSEC                   = 10,
        KERNEL_TAG_TYPE_DRVAUX                  = 11,
+       KERNEL_TAG_TYPE_CFIL_UDP                = 13,
 };
 
 /* Packet tag routines */
@@ -1451,13 +1497,6 @@ __private_extern__ void m_tag_init(struct mbuf *, int);
 __private_extern__ struct  m_tag *m_tag_first(struct mbuf *);
 __private_extern__ struct  m_tag *m_tag_next(struct mbuf *, struct m_tag *);
 
-__END_DECLS
-#endif /* XNU_KERNEL_PRIVATE */
-#ifdef KERNEL
-#include <sys/kpi_mbuf.h>
-#ifdef XNU_KERNEL_PRIVATE
-__BEGIN_DECLS
-
 __private_extern__ void m_scratch_init(struct mbuf *);
 __private_extern__ u_int32_t m_scratch_get(struct mbuf *, u_int8_t **);
 
@@ -1485,9 +1524,9 @@ __private_extern__ struct ext_ref *m_get_rfa(struct mbuf *);
 __private_extern__ m_ext_free_func_t m_get_ext_free(struct mbuf *);
 __private_extern__ caddr_t m_get_ext_arg(struct mbuf *);
 
-extern void m_do_tx_compl_callback(struct mbuf *, struct ifnet *);
+__private_extern__ void m_do_tx_compl_callback(struct mbuf *, struct ifnet *);
+__private_extern__ mbuf_tx_compl_func m_get_tx_compl_callback(u_int32_t);
 
 __END_DECLS
 #endif /* XNU_KERNEL_PRIVATE */
-#endif /* KERNEL */
 #endif /* !_SYS_MBUF_H_ */
index 9bf6ed529fd9c243be301c7836d38fac91ca60d1..b8007aa6d38c3455e34bc57ccf346161365f897a 100644 (file)
@@ -37,7 +37,6 @@ extern "C" {
 #include <sys/types.h>
 #include <sys/queue.h>
 #include <mach/boolean.h>
-#include <mach/branch_predicates.h>
 #include <kern/locks.h>
 #include <libkern/OSAtomic.h>
 
index 883b6a0adb332bde4d43bf93416ea8f8d4aa9608..e880a9a0a73cd151658bf2bc667fc850af9dd1de 100644 (file)
@@ -12,9 +12,16 @@ __BEGIN_DECLS
  * XXX These declarations are subject to change at any time.
  */
 
+#define MT_IOC(x) _IO('m', (x))
+
+#define MT_IOC_RESET MT_IOC(0)
+
+#define MT_IOC_ADD MT_IOC(1)
+
 struct monotonic_config {
        uint64_t event;
        uint64_t allowed_ctr_mask;
+       uint64_t cpu_mask;
 };
 
 union monotonic_ctl_add {
@@ -27,12 +34,20 @@ union monotonic_ctl_add {
        } out;
 };
 
+/*
+ * - Consider a separate IOC for disable -- to avoid the copyin to determine
+ *   which way to set it.
+ */
+#define MT_IOC_ENABLE MT_IOC(2)
+
 union monotonic_ctl_enable {
        struct {
                bool enable;
        } in;
 };
 
+#define MT_IOC_COUNTS MT_IOC(3)
+
 union monotonic_ctl_counts {
        struct {
                uint64_t ctr_mask;
@@ -43,24 +58,15 @@ union monotonic_ctl_counts {
        } out;
 };
 
-#define MT_IOC(x) _IO('m', (x))
+#define MT_IOC_GET_INFO MT_IOC(4)
 
-/*
- * FIXME
- *
- * - Consider a separate IOC for disable -- to avoid the copyin to determine which way to set it.
- *
- * - Maybe IOC_COUNTS should just return all the enable counters' counts.
- */
-enum monotonic_ioc {
-       MT_IOC_RESET = MT_IOC(0),
-       MT_IOC_ADD = MT_IOC(1),
-       MT_IOC_ENABLE = MT_IOC(2),
-       MT_IOC_COUNTS = MT_IOC(3),
+union monotonic_ctl_info {
+       struct {
+               unsigned int nmonitors;
+               unsigned int ncounters;
+       } out;
 };
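
Read together, the relocated MT_IOC_* definitions and their in/out unions describe a copyin/copyout ioctl protocol: add a counter configuration, enable the monitors, then read counts by mask. A hedged userspace sketch follows; the device node path, open mode, and error handling are assumptions, and only the commands and unions come from this header.

	#include <fcntl.h>
	#include <stdbool.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	static int
	monotonic_count_example(void)
	{
		int fd = open("/dev/monotonic/core", O_RDWR); /* hypothetical node */
		if (fd < 0) {
			return -1;
		}

		union monotonic_ctl_add add = {
			.in.config = {
				.event = 0x2,              /* hypothetical event selector */
				.allowed_ctr_mask = ~0ULL, /* any counter may back this event */
				.cpu_mask = ~0ULL,         /* count on all CPUs */
			},
		};
		if (ioctl(fd, MT_IOC_ADD, &add) < 0) {
			close(fd);
			return -1;
		}
		/* on success, add.out.ctr names the allocated counter */

		union monotonic_ctl_enable en = { .in.enable = true };
		(void)ioctl(fd, MT_IOC_ENABLE, &en);

		/*
		 * ... run the workload, then read values with MT_IOC_COUNTS, passing
		 * in.ctr_mask = (1ULL << add.out.ctr) and room for the out.counts array ...
		 */

		close(fd);
		return 0;
	}
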
 
-#undef MT_IOC
-
 #if XNU_KERNEL_PRIVATE
 
 #include <kern/monotonic.h>
@@ -125,18 +131,22 @@ enum monotonic_ioc {
 #define MT_KDBG_TMPTH_START(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_START)
 #define MT_KDBG_TMPTH_END(CODE) MT_KDBG_TMPTH_(CODE, DBG_FUNC_END)
 
-/* maybe provider, bank, group, set, unit, pmu */
-
-struct monotonic_dev {
+struct mt_device {
        const char *mtd_name;
-       int (*mtd_init)(void);
-       int (*mtd_add)(struct monotonic_config *config, uint32_t *ctr_out);
-       void (*mtd_reset)(void);
-       void (*mtd_enable)(bool enable);
-       int (*mtd_read)(uint64_t ctr_mask, uint64_t *counts_out);
+       int (* const mtd_init)(struct mt_device *dev);
+       int (* const mtd_add)(struct monotonic_config *config, uint32_t *ctr_out);
+       void (* const mtd_reset)(void);
+       void (* const mtd_enable)(bool enable);
+       int (* const mtd_read)(uint64_t ctr_mask, uint64_t *counts_out);
+       decl_lck_mtx_data(, mtd_lock);
+
+       uint8_t mtd_nmonitors;
+       uint8_t mtd_ncounters;
+       bool mtd_inuse;
 };
+typedef struct mt_device *mt_device_t;
 
-extern const struct monotonic_dev monotonic_devs[];
+extern struct mt_device mt_devices[];
 
 extern lck_grp_t *mt_lock_grp;
 
index 644faceb1d88f9022d78813f4f7310f35d0949c7..243c75e020dc7cd33ce9430be2ef7a540849bf40 100644 (file)
@@ -222,6 +222,11 @@ struct mount {
  */
 #define MNT_DEFAULT_IOQUEUE_DEPTH      32
 
+/*
+ * mnt_ioscale value for the given ioqueue depth
+ */
+#define MNT_IOSCALE(ioqueue_depth)     ((ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH)
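+
+MNT_IOSCALE() is a ceiling division against MNT_DEFAULT_IOQUEUE_DEPTH, so a device advertising exactly the default depth of 32 scales by 1 and deeper queues round up:
+
+	/*
+	 * Worked examples (MNT_DEFAULT_IOQUEUE_DEPTH == 32):
+	 *   MNT_IOSCALE(32)  == (32  + 31) / 32 == 1
+	 *   MNT_IOSCALE(33)  == (33  + 31) / 32 == 2
+	 *   MNT_IOSCALE(256) == (256 + 31) / 32 == 8
+	 */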
+
 /* mount point to which dead vps point to */
 extern struct mount * dead_mountp;
 
@@ -484,7 +489,7 @@ extern int num_trailing_0(uint64_t n);
 /* sync lock */
 extern lck_mtx_t * sync_mtx_lck;
 
-extern int sync_timeout;
+extern int sync_timeout_seconds;
 
 __END_DECLS
 
index 2f5b90bfbe6558ca4897d272b1baabd84d295a2c..26a5c707b65a133c0dcd7b8838ca3289de1fca42 100644 (file)
@@ -260,6 +260,7 @@ int relookup(struct vnode *dvp, struct vnode **vpp,
                struct componentname *cnp);
 int    lookup_traverse_union(vnode_t dvp, vnode_t *new_dvp, vfs_context_t ctx);
 void   lookup_compound_vnop_post_hook(int error, vnode_t dvp, vnode_t vp, struct nameidata *ndp, int did_create);
+void   kdebug_lookup(struct vnode *dp, struct componentname *cnp);
 
 /*
  * namecache function prototypes
index 4e3d71f46409fafd941366a1038d49cdbf55bedb..64d135168dfd54463569651fdf404544c99b377d 100644 (file)
@@ -177,10 +177,12 @@ int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen);
 #include <sys/cdefs.h>
 #include <sys/kauth.h>
 #include <libkern/libkern.h>
+#include <os/refcnt.h>
 
 #ifdef PERSONA_DEBUG
+#include <os/log.h>
 #define persona_dbg(fmt, ...) \
-       printf("[%4d] %s:  " fmt "\n", \
+       os_log(OS_LOG_DEFAULT, "[%4d] %s:  " fmt "\n", \
               current_proc() ? current_proc()->p_pid : -1, \
               __func__, ## __VA_ARGS__)
 #else
@@ -193,7 +195,7 @@ int kpersona_find(const char *name, uid_t uid, uid_t *id, size_t *idlen);
 #ifdef XNU_KERNEL_PRIVATE
 /* only XNU proper needs to see the persona structure */
 struct persona {
-       int32_t      pna_refcount;
+       os_refcnt_t  pna_refcount;
        int32_t      pna_valid;
 
        uid_t        pna_id;
@@ -323,6 +325,9 @@ void personas_bootstrap(void);
 struct persona *persona_alloc(uid_t id, const char *login,
                              int type, int *error);
 
+int persona_init_begin(struct persona *persona);
+void persona_init_end(struct persona *persona, int error);
+
 struct persona *persona_lookup_and_invalidate(uid_t id);
 
 static inline int proc_has_persona(proc_t p)
index 1933800d26c781914434d73d6c1c6298f2685dc6..688da64494196c1567445166b868376e33555bea 100644 (file)
 #define PRIV_VFS_SNAPSHOT_REVERT       14003   /* Allow reverting filesystem to a previous snapshot */
 
 #define PRIV_APFS_EMBED_DRIVER         14100   /* Allow embedding an EFI driver into the APFS container */
-#define PRIV_APFS_FUSION_DEBUG      14101   /* Allow getting internal statistics and controlling the APFS fusion container */
+#define PRIV_APFS_FUSION_DEBUG      14101   /* Allow getting internal statistics and controlling the APFS Fusion container */
+#define PRIV_APFS_FUSION_ALLOW_PIN_FASTPROMOTE  14102   /* Allow changing pinned/fastPromote inode flags in APFS Fusion container */
+
 #ifdef KERNEL
 /*
  * Privilege check interface.  No flags are currently defined for the API.
index 652cc74b96f1b64496d35f171f6ccdcbfaa54043..46536dee4f3882579c7bb4315221cc05e9cdcbb6 100644 (file)
@@ -219,6 +219,8 @@ struct extern_proc {
 #define P_DIRTY_MARKED                          0x00000080      /* marked dirty previously */
 #define P_DIRTY_AGING_IN_PROGRESS               0x00000100      /* aging in one of the 'aging bands' */
 #define P_DIRTY_LAUNCH_IN_PROGRESS              0x00000200      /* launch is in progress */
+#define P_DIRTY_DEFER_ALWAYS                    0x00000400      /* defer going to idle-exit after every dirty->clean transition.
+                                                                * For legacy jetsam policy only. This is the default with the other policies.*/
 
 #define P_DIRTY_IS_DIRTY                        (P_DIRTY | P_DIRTY_SHUTDOWN)
 #define P_DIRTY_IDLE_EXIT_ENABLED               (P_DIRTY_TRACK|P_DIRTY_ALLOW_IDLE_EXIT)
@@ -255,6 +257,8 @@ extern int proc_isinferior(int pid1, int pid2);
  * routine is to be used typically for debugging
  */
 void proc_name(int pid, char * buf, int size);
+/* returns the 32-byte name if it exists, otherwise returns the 16-byte name */
+extern char *proc_best_name(proc_t p);
 /* This routine is similar to proc_name except it returns the name of the current process */
 void proc_selfname(char * buf, int size);
 
@@ -274,15 +278,23 @@ extern int proc_noremotehang(proc_t);
 extern int proc_forcequota(proc_t);
 /* returns 1 if the process is chrooted */
 extern int proc_chrooted(proc_t);
+/* returns TRUE if a sync EXC_RESOURCE should be sent for the process */
+extern boolean_t proc_send_synchronous_EXC_RESOURCE(proc_t p);
 
-/* this routine returns 1 if the process is running with 64bit address space, else 0 */
+/* this routine returns 1 if the process is running with 64bit address space, else 0 */
 extern int proc_is64bit(proc_t);
+/* this routine returns 1 if the process is running with a 64bit register state, else 0 */
+extern int proc_is64bit_data(proc_t);
 /* is this process exiting? */
 extern int proc_exiting(proc_t);
+/* returns whether the process has started down proc_exit() */
+extern int proc_in_teardown(proc_t);
 /* this routine returns error if the process is not one with super user privileges */
 int proc_suser(proc_t p);
 /* returns the cred associated with the process; temporary api */
 kauth_cred_t proc_ucred(proc_t p);
+/* returns 1 if the process is tainted by uid or gid changes, else 0 */
+extern int proc_issetugid(proc_t p);
 
 extern int proc_tbe(proc_t);
 
@@ -367,7 +379,7 @@ extern void proc_coalitionids(proc_t, uint64_t [COALITION_NUM_TYPES]);
 #ifdef CONFIG_32BIT_TELEMETRY
 extern void proc_log_32bit_telemetry(proc_t p);
 #endif /* CONFIG_32BIT_TELEMETRY */
-
+extern uint64_t get_current_unique_pid(void);
 #endif /* XNU_KERNEL_PRIVATE*/
 
 #ifdef KERNEL_PRIVATE
index f28ae3d101ed5283a576940b7548284611e04110..8e247fcf13ada2b69abb83cd9f9b0c5a0f73458f 100644 (file)
@@ -646,7 +646,10 @@ struct kqueue_dyninfo {
        uint8_t  kqdi_async_qos;
        uint16_t kqdi_request_state;
        uint8_t  kqdi_events_qos;
-       uint8_t  _kqdi_reserved0[7];
+       uint8_t  kqdi_pri;
+       uint8_t  kqdi_pol;
+       uint8_t  kqdi_cpupercent;
+       uint8_t  _kqdi_reserved0[4];
        uint64_t _kqdi_reserved1[4];
 };
 
@@ -724,7 +727,6 @@ struct proc_fileportinfo {
 #define PROC_PIDLISTTHREADS            6
 #define PROC_PIDLISTTHREADS_SIZE       (2* sizeof(uint32_t))
 
-
 #define PROC_PIDREGIONINFO             7
 #define PROC_PIDREGIONINFO_SIZE                (sizeof(struct proc_regioninfo))
 
@@ -793,8 +795,12 @@ struct proc_fileportinfo {
 #define PROC_PIDLISTDYNKQUEUES      27
 #define PROC_PIDLISTDYNKQUEUES_SIZE (sizeof(kqueue_id_t))
 
-#endif
+#define PROC_PIDLISTTHREADIDS          28
+#define PROC_PIDLISTTHREADIDS_SIZE     (2* sizeof(uint32_t))
 
+#define PROC_PIDVMRTFAULTINFO          29
+#define PROC_PIDVMRTFAULTINFO_SIZE (7 * sizeof(uint64_t))
+#endif /* PRIVATE */
 /* Flavors for proc_pidfdinfo */
 
 #define PROC_PIDFDVNODEINFO            1
@@ -865,6 +871,7 @@ struct proc_fileportinfo {
 #define PROC_DIRTY_ALLOW_IDLE_EXIT      0x2
 #define PROC_DIRTY_DEFER                0x4
 #define PROC_DIRTY_LAUNCH_IN_PROGRESS   0x8
+#define PROC_DIRTY_DEFER_ALWAYS         0x10
 
 /* proc_get_dirty() flags */
 #define PROC_DIRTY_TRACKED              0x1
@@ -929,7 +936,6 @@ struct proc_fileportinfo {
 #define PROC_INFO_CALL_CANUSEFGHW        0xc
 #define PROC_INFO_CALL_PIDDYNKQUEUEINFO  0xd
 #define PROC_INFO_CALL_UDATA_INFO        0xe
-
 #endif /* PRIVATE */
 
 #ifdef XNU_KERNEL_PRIVATE
index 7119591d235adecd30e4ebd25e196a01e20b86ba..c2aacbc967ff3aa5a33e99778a5b2333061122e0 100644 (file)
@@ -194,7 +194,6 @@ struct proc;
 struct proc {
        LIST_ENTRY(proc) p_list;                /* List of all processes. */
 
-       pid_t           p_pid;                  /* Process identifier. (static)*/
        void *          task;                   /* corresponding task (static)*/
        struct  proc *  p_pptr;                 /* Pointer to parent process.(LL) */
        pid_t           p_ppid;                 /* process's parent pid number */
@@ -209,7 +208,7 @@ struct      proc {
        uint64_t        p_puniqueid;            /* parent's unique ID - set on fork/spawn/vfork, doesn't change if reparented. */
 
        lck_mtx_t       p_mlock;                /* mutex lock for proc */
-
+       pid_t           p_pid;                  /* Process identifier. (static)*/
        char            p_stat;                 /* S* process status. (PL)*/
        char            p_shutdownstate;
        char            p_kdebug;               /* P_KDEBUG eq (CC)*/ 
@@ -238,12 +237,12 @@ struct    proc {
        struct  plimit *p_limit;                /* Process limits.(PL) */
 
        struct  sigacts *p_sigacts;             /* Signal actions, state (PL) */
-        int            p_siglist;              /* signals captured back from threads */
        lck_spin_t      p_slock;                /* spin lock for itimer/profil protection */
 
 #define        p_rlimit        p_limit->pl_rlimit
 
        struct  plimit *p_olimit;               /* old process limits  - not inherited by child  (PL) */
+       int             p_siglist;              /* signals captured back from threads */
        unsigned int    p_flag;                 /* P_* flags. (atomic bit ops) */
        unsigned int    p_lflag;                /* local flags  (PL) */
        unsigned int    p_listflag;             /* list flags (LL) */
@@ -251,10 +250,8 @@ struct     proc {
        int             p_refcount;             /* number of outstanding users(LL) */
        int             p_childrencnt;          /* children holding ref on parent (LL) */
        int             p_parentref;            /* children lookup ref on parent (LL) */
-
        pid_t           p_oppid;                /* Save parent pid during ptrace. XXX */
        u_int           p_xstat;                /* Exit status for wait; also stop signal. */
-       uint8_t p_xhighbits;            /* Stores the top byte of exit status to avoid truncation*/
 
 #ifdef _PROC_HAS_SCHEDINFO_
        /* may need cleanup, not used */
@@ -273,11 +270,9 @@ struct     proc {
        boolean_t       sigwait;        /* indication to suspend (PL) */
        void    *sigwait_thread;        /* 'thread' holding sigwait(PL)  */
        void    *exit_thread;           /* Which thread is exiting(PL)  */
+       void *  p_vforkact;             /* activation running this vfork proc)(static)  */
        int     p_vforkcnt;             /* number of outstanding vforks(PL)  */
-       void *  p_vforkact;             /* activation running this vfork proc)(static)  */
        int     p_fpdrainwait;          /* (PFDL) */
-       pid_t   p_contproc;     /* last PID to send us a SIGCONT (PL) */
-
        /* Following fields are info from SIGCHLD (PL) */
        pid_t   si_pid;                 /* (PL) */
        u_int   si_status;              /* (PL) */
@@ -290,9 +285,9 @@ struct      proc {
        user_addr_t                     p_dtrace_argv;                  /* (write once, read only after that) */
        user_addr_t                     p_dtrace_envp;                  /* (write once, read only after that) */
        lck_mtx_t                       p_dtrace_sprlock;               /* sun proc lock emulation */
+       uint8_t                         p_dtrace_stop;                  /* indicates a DTrace-desired stop */
        int                             p_dtrace_probes;                /* (PL) are there probes for this proc? */
        u_int                           p_dtrace_count;                 /* (sprlock) number of DTrace tracepoints */
-        uint8_t                         p_dtrace_stop;                  /* indicates a DTrace-desired stop */
        struct dtrace_ptss_page*        p_dtrace_ptss_pages;            /* (sprlock) list of user ptss pages */
        struct dtrace_ptss_page_entry*  p_dtrace_ptss_free_list;        /* (atomic) list of individual ptss entries */
        struct dtrace_helpers*          p_dtrace_helpers;               /* (dtrace_lock) DTrace per-proc private */
@@ -321,7 +316,8 @@ struct      proc {
        // types currently in sys/param.h
        command_t   p_comm;
        proc_name_t p_name;     /* can be changed by the process */
-
+       uint8_t p_xhighbits;    /* Stores the top byte of exit status to avoid truncation*/
+       pid_t   p_contproc;     /* last PID to send us a SIGCONT (PL) */
 
        struct  pgrp *p_pgrp;           /* Pointer to process group. (LL) */
        uint32_t        p_csflags;      /* flags for codesign (PL) */
@@ -346,10 +342,9 @@ struct     proc {
        struct klist p_klist;  /* knote list (PL ?)*/
 
        struct  rusage_superset *p_ru;  /* Exit information. (PL) */
-       int             p_sigwaitcnt;
        thread_t        p_signalholder;
        thread_t        p_transholder;
-
+       int             p_sigwaitcnt;
        /* DEPRECATE following field  */
        u_short p_acflag;       /* Accounting flags. */
        volatile u_short p_vfs_iopolicy;        /* VFS iopolicy flags. (atomic bit ops) */
@@ -359,7 +354,7 @@ struct      proc {
        int     p_pthsize;                      /* pthread size */
        uint32_t        p_pth_tsd_offset;       /* offset from pthread_t to TSD for new threads */
        user_addr_t     p_stack_addr_hint;      /* stack allocation hint for wq threads */
-       void *  p_wqptr;                        /* workq ptr */
+       struct workqueue *_Atomic p_wqptr;                      /* workq ptr */
 
        struct  timeval p_start;                /* starting time */
        void *  p_rcall;
@@ -400,7 +395,9 @@ struct      proc {
        int32_t           p_memstat_memlimit_active;    /* memory limit enforced when process is in active jetsam state */
        int32_t           p_memstat_memlimit_inactive;  /* memory limit enforced when process is in inactive jetsam state */
 #if CONFIG_FREEZE
-       uint32_t          p_memstat_suspendedfootprint; /* footprint at time of suspensions */
+       uint32_t          p_memstat_freeze_sharedanon_pages; /* shared pages left behind after freeze */
+       uint32_t          p_memstat_frozen_count;
+       uint32_t          p_memstat_thaw_count;
 #endif /* CONFIG_FREEZE */
 #endif /* CONFIG_MEMORYSTATUS */
 
@@ -498,7 +495,9 @@ struct      proc {
 #define P_LXBKIDLEINPROG       0x02
 
 /* p_vfs_iopolicy flags */
-#define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY 0x0001
+#define P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY      0x0001
+#define P_VFS_IOPOLICY_ATIME_UPDATES                   0x0002
+#define P_VFS_IOPOLICY_VALID_MASK                      (P_VFS_IOPOLICY_ATIME_UPDATES | P_VFS_IOPOLICY_FORCE_HFS_CASE_SENSITIVITY)
 
 /* process creation arguments */
 #define        PROC_CREATE_FORK        0       /* independent child (running) */
@@ -514,10 +513,13 @@ struct    proc {
 #ifdef KERNEL
 #include <sys/time.h>  /* user_timeval, user_itimerval */
 
-/* This packing breaks symmetry with userspace side (struct extern_proc 
- * of proc.h) for the ARMV7K ABI where 64-bit types are 64-bit aligned
+/*
+ * This packing is required to ensure symmetry between userspace and kernelspace
+ * when the kernel is 64-bit and the user application is 32-bit. All currently
+ * supported ARM slices (arm64/armv7k/arm64_32) share the same struct
+ * alignment ABI, so this packing isn't needed for ARM.
  */
-#if !(__arm__ && (__BIGGEST_ALIGNMENT__ > 4))
+#if defined(__x86_64__)
 #pragma pack(4)
 #endif
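
The reasoning above can be made concrete with a small illustrative comparison (not taken from this header): a 64-bit member that follows a 32-bit member is 8-byte aligned by default on x86_64, but an i386 user process aligns it to 4 bytes, which is exactly the layout that pack(4) reproduces in the kernel's view of the shared structure.

	#include <stdint.h>

	/* Illustrative only: field offsets under the two layouts, compiled for x86_64. */
	struct unpacked_example {
		uint32_t a;     /* offset 0 */
		uint64_t b;     /* offset 8 -- padded out to natural 8-byte alignment */
	};

	#pragma pack(4)
	struct packed_example {
		uint32_t a;     /* offset 0 */
		uint64_t b;     /* offset 4 -- matches what a 32-bit i386 process expects */
	};
	#pragma pack()
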
 struct user32_extern_proc {
index 1dff9968f1f63693091b00abb2a4268bc6ff8f56..3f4c3f12cf877ee0d183ea24a88a58d99051ab6e 100644 (file)
@@ -40,12 +40,9 @@ struct ksyn_waitq_element {
 #endif
 };
 
-void workqueue_mark_exiting(struct proc *);
-void workqueue_exit(struct proc *);
+void workq_mark_exiting(struct proc *);
+void workq_exit(struct proc *);
 void pthread_init(void);
-int thread_qos_from_pthread_priority(unsigned long, unsigned long *);
-unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation);
-boolean_t workq_thread_has_been_unbound(thread_t th, int qos_class);
 
 #endif /* _SYS_PTHREAD_INTERNAL_H_ */
 
index 2256a4a010993e52abd40ba4d2a2eb636964c8a9..03b2333a1c45170fd1b09f9498cc3858fd3fafee 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
 
 #ifndef ASSEMBLER
 
+#include <kern/block_hint.h>
 #include <kern/clock.h>
 #include <kern/kern_types.h>
 #include <kern/kcdata.h>
 #include <kern/locks.h>
+#include <kern/turnstile.h>
+#include <pthread/priority_private.h>
 #include <sys/user.h>
 #include <sys/_types.h>
 #include <sys/_types/_sigset_t.h>
@@ -44,6 +47,7 @@
 
 #ifndef PTHREAD_INTERNAL
 struct uthread;
+struct ksyn_waitq_element;
 #define M_PROC 41
 #endif
 
@@ -52,22 +56,6 @@ struct uthread;
 typedef void (*sched_call_t)(int type, thread_t thread);
 #endif
 
-typedef struct workq_reqthreads_req_s {unsigned long priority; int count;} *workq_reqthreads_req_t;
-typedef struct workq_threadreq_s { void *opaqueptr[2]; uint32_t opaqueint[2];} *workq_threadreq_t;
-enum workq_threadreq_type {
-       WORKQ_THREADREQ_KEVENT = 1,
-       WORKQ_THREADREQ_WORKLOOP = 2,
-       WORKQ_THREADREQ_WORKLOOP_NO_THREAD_CALL = 3,
-       WORKQ_THREADREQ_REDRIVE = 4,
-};
-enum workq_threadreq_op {
-       WORKQ_THREADREQ_CHANGE_PRI = 1,
-       WORKQ_THREADREQ_CANCEL = 2,
-       WORKQ_THREADREQ_CHANGE_PRI_NO_THREAD_CALL = 3,
-};
-#define WORKQ_THREADREQ_FLAG_NOEMERGENCY 0x1
-
-
 /*
  * Increment each time new reserved slots are used. When the pthread
  * kext registers this table, it will include the version of the xnu
@@ -80,14 +68,14 @@ typedef const struct pthread_functions_s {
 
        /* internal calls, kernel core -> kext */
        void (*pthread_init)(void);
-       int (*fill_procworkqueue)(proc_t p, void* pwqinfo);
 
-       void (*__unused1)(void);
-       void (*__unused2)(void);
+       void *__unused_was_fill_procworkqueue;
+       void *__unused1;
+       void *__unused2;
+       void *__unused_was_workqueue_exit;
+       void *__unused_was_workqueue_mark_exiting;
+       void *__unused_was_workqueue_thread_yielded;
 
-       void (*workqueue_exit)(struct proc *p);
-       void (*workqueue_mark_exiting)(struct proc *p);
-       void (*workqueue_thread_yielded)(void);
        void (*pth_proc_hashinit)(proc_t p);
        void (*pth_proc_hashdelete)(proc_t p);
 
@@ -96,8 +84,8 @@ typedef const struct pthread_functions_s {
        int (*bsdthread_register)(struct proc *p, user_addr_t threadstart, user_addr_t wqthread, int pthsize, user_addr_t dummy_value, user_addr_t targetconc_ptr, uint64_t dispatchqueue_offset, int32_t *retval);
        int (*bsdthread_terminate)(struct proc *p, user_addr_t stackaddr, size_t size, uint32_t kthport, uint32_t sem, int32_t *retval);
        int (*thread_selfid)(struct proc *p, uint64_t *retval);
-       int (*workq_kernreturn)(struct proc *p, int options, user_addr_t item, int affinity, int prio, int32_t *retval);
-       int (*workq_open)(struct proc *p, int32_t *retval);
+       void *__unused_was_workq_kernreturn;
+       void *__unused_was_workq_open;
 
        /* psynch syscalls */
        int (*psynch_mutexwait)(proc_t p, user_addr_t mutex,  uint32_t mgen, uint32_t  ugen, uint64_t tid, uint32_t flags, uint32_t *retval);
@@ -112,68 +100,44 @@ typedef const struct pthread_functions_s {
        int (*psynch_rw_wrlock)(proc_t p, user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags, uint32_t *retval);
        int (*psynch_rw_yieldwrlock)(proc_t p, user_addr_t rwlock, uint32_t lgenval, uint32_t ugenval, uint32_t rw_wc, int flags, uint32_t *retval);
 
-       sched_call_t (*workqueue_get_sched_callback)(void);
+       void *__unused_was_workqueue_get_sched_callback;
 
        /* New register function with TSD offset */
        int (*bsdthread_register2)(struct proc *p, user_addr_t threadstart, user_addr_t wqthread, uint32_t flags, user_addr_t stack_addr_hint, user_addr_t targetconc_ptr, uint32_t dispatchqueue_offset, uint32_t tsd_offset, int32_t *retval);
 
-       /* New pthreadctl system. */
-       int (*bsdthread_ctl)(struct proc *p, user_addr_t cmd, user_addr_t arg1, user_addr_t arg2, user_addr_t arg3, int *retval);
+       void *__unused_was_bsdthread_ctl;
+       void *__unused_was_workq_reqthreads;
+
+       void *__unused_was_thread_qos_from_pthread_priority;
+       void *__unused_was_get_pwq_state_kdp;
+       void *__unused3;
+       void *__unused_was_pthread_priority_canonicalize2;
+       void *__unused_was_workq_thread_has_been_unbound;
 
-    /* Request threads to deliver kevents */
-    thread_t (*workq_reqthreads)(struct proc *p, int requests_count, workq_reqthreads_req_t requests);
+       void (*pthread_find_owner)(thread_t thread, struct stackshot_thread_waitinfo *waitinfo);
+       void *(*pthread_get_thread_kwq)(thread_t thread);
 
-    /* Resolve a pthread_priority_t to a QoS/relative pri */
-    integer_t (*thread_qos_from_pthread_priority)(unsigned long pthread_priority, unsigned long *flags);
+       void *__unused_was_workq_threadreq;
 
-       /* try to get wq flags in debugger context */
-       uint32_t (*get_pwq_state_kdp)(proc_t p);
+       int  (*workq_handle_stack_events)(proc_t p, thread_t th, vm_map_t map,
+                       user_addr_t stackaddr, mach_port_name_t kport,
+                       user_addr_t events, int nevents, int upcall_flags);
 
-       void (*__unused3)(void);
-       unsigned long (*pthread_priority_canonicalize2)(unsigned long pthread_priority, boolean_t propagation);
+       int (*workq_create_threadstack)(proc_t p, vm_map_t vmap,
+                       mach_vm_offset_t *out_addr);
 
-       /* Returns true on success, false on mismatch */
-       boolean_t (*workq_thread_has_been_unbound)(thread_t th, int qos_class);
+       int (*workq_destroy_threadstack)(proc_t p, vm_map_t vmap,
+                       mach_vm_offset_t stackaddr);
 
-       void (*pthread_find_owner)(thread_t thread, struct stackshot_thread_waitinfo *waitinfo);
-       void *(*pthread_get_thread_kwq)(thread_t thread);
+       void (*workq_setup_thread)(proc_t p, thread_t th, vm_map_t map,
+                       user_addr_t stackaddr, mach_port_name_t kport, int th_qos,
+                       int setup_flags, int upcall_flags);
 
-       /*
-        * Submits a threadreq to the workq system.
-        *
-        * If type is WORKQ_THREADREQ_KEVENT, the semantics are similar to a call
-        * to workq_reqthreads and the kevent bind function will be called to
-        * indicate the thread fulfilling the request.  The req argument is ignored.
-        *
-        * If type is WORKQ_THREADREQ_WORKLOOP, The req argument should point to
-        * allocated memory of at least the sizeof(workq_threadreq_t).  That memory
-        * is lent to the workq system until workloop_fulfill_threadreq is called
-        * and passed the pointer, at which point it may be freed.
-        *
-        * The properties of the request are passed in the (pthread) priority and flags arguments.
-        *
-        * Will return zero upon success or an error value on failure.  An error of
-        * ENOTSUP means the type argument was not understood.
-        */
-       int (*workq_threadreq)(struct proc *p, workq_threadreq_t req,
-               enum workq_threadreq_type, unsigned long priority, int flags);
-
-       /*
-        * Modifies an already submitted thread request.
-        *
-        * If operation is WORKQ_THREADREQ_CHANGE_PRI, arg1 is the new priority and arg2 is unused.
-        *
-        * If operation is WORKQ_THREADREQ_CANCEL, arg1 and arg2 are unused.
-        *
-        * Will return zero upon success or an error value on failure.  An error of
-        * ENOTSUP means the operation argument was not understood.
-        */
-       int (*workq_threadreq_modify)(struct proc *t, workq_threadreq_t req,
-                       enum workq_threadreq_op operation,
-                       unsigned long arg1, unsigned long arg2);
+       void (*workq_markfree_threadstack)(proc_t p, thread_t, vm_map_t map,
+                       user_addr_t stackaddr);
 
        /* padding for future */
-       void * _pad[87];
+       void * _pad[83];
 } * pthread_functions_t;
 
 typedef const struct pthread_callbacks_s {
@@ -193,35 +157,42 @@ typedef const struct pthread_callbacks_s {
        void (*proc_set_wqthread)(struct proc *t, user_addr_t addr);
        int (*proc_get_pthsize)(struct proc *t);
        void (*proc_set_pthsize)(struct proc *t, int size);
-#if defined(__arm64__)
-       unsigned __int128 (*atomic_fetch_add_128_relaxed)(_Atomic unsigned __int128 *ptr,
-                       unsigned __int128 value);
-       unsigned __int128 (*atomic_load_128_relaxed)(_Atomic unsigned __int128 *ptr);
-#else
-       void *unused_was_proc_get_targconc;
-       void *unused_was_proc_set_targconc;
-#endif
-       uint64_t (*proc_get_dispatchqueue_offset)(struct proc *t);
+
+       thread_t (*task_findtid)(task_t t, uint64_t tid);
+       void (*thread_deallocate_safe)(thread_t);
+       void *__unused_was_proc_get_dispatchqueue_offset;
        void (*proc_set_dispatchqueue_offset)(struct proc *t, uint64_t offset);
-       void *unused_was_proc_get_wqlockptr;
-       void *unused_was_proc_get_wqinitingptr;
-       void* (*proc_get_wqptr)(struct proc *t);
-       void (*proc_set_wqptr)(struct proc *t, void* ptr);
-       void *unused_was_proc_get_wqsize;
-       void *unused_was_proc_set_wqsize;
-       void (*proc_lock)(struct proc *t);
-       void (*proc_unlock)(struct proc *t);
-       task_t (*proc_get_task)(struct proc *t);
+       void *__unused_was_proc_get_wqlockptr;
+       void *__unused_was_proc_get_wqinitingptr;
+       void *__unused_was_proc_get_wqptr;
+
+       wait_result_t (*psynch_wait_prepare)(uintptr_t kwq,
+                       struct turnstile **tstore, thread_t owner, block_hint_t block_hint,
+                       uint64_t deadline);
+
+       void (*psynch_wait_update_complete)(struct turnstile *turnstile);
+
+       void (*psynch_wait_complete)(uintptr_t kwq, struct turnstile **tstore);
+
+       void (*psynch_wait_cleanup)(void);
+
+       kern_return_t (*psynch_wait_wakeup)(uintptr_t kwq,
+                       struct ksyn_waitq_element *kwe, struct turnstile **tstore);
+
+       void (*psynch_wait_update_owner)(uintptr_t kwq, thread_t owner,
+                       struct turnstile **tstore);
+
        void* (*proc_get_pthhash)(struct proc *t);
        void (*proc_set_pthhash)(struct proc *t, void* ptr);
 
        /* bsd/sys/user.h */
-       void* (*uthread_get_threadlist)(struct uthread *t);
-       void (*uthread_set_threadlist)(struct uthread *t, void* threadlist);
-       sigset_t (*uthread_get_sigmask)(struct uthread *t);
-       void (*uthread_set_sigmask)(struct uthread *t, sigset_t s);
+       void *__unused_was_uthread_get_threadlist;
+       void *__unused_was_uthread_set_threadlist;
+       void *__unused_was_uthread_get_sigmask;
+       void *__unused_was_uthread_set_sigmask;
+
        void* (*uthread_get_uukwe)(struct uthread *t);
-       int (*uthread_get_returnval)(struct uthread *t);
+       void *__unused_was_uthread_get_returnval;
        void (*uthread_set_returnval)(struct uthread *t, int val);
        int (*uthread_is_cancelled)(struct uthread *t);
 
@@ -231,7 +202,7 @@ typedef const struct pthread_callbacks_s {
 
        /* osfmk/vm/vm_map.h */
        kern_return_t (*vm_map_page_info)(vm_map_t map, vm_map_offset_t offset, vm_page_info_flavor_t flavor, vm_page_info_t info, mach_msg_type_number_t *count);
-       vm_map_t (*vm_map_switch)(vm_map_t map);
+       void *__unused_was_vm_map_switch;
 
        /* wq functions */
        kern_return_t (*thread_set_wq_state32)(thread_t thread, thread_state_t state);
@@ -243,29 +214,25 @@ typedef const struct pthread_callbacks_s {
        void (*thread_exception_return)(void);
        void (*thread_bootstrap_return)(void);
 
-       /* kern/clock.h */
-       void (*absolutetime_to_microtime)(uint64_t abstime, clock_sec_t *secs, clock_usec_t *microsecs);
-
-       kern_return_t (*thread_set_workq_pri)(thread_t thread, integer_t priority, integer_t policy);
-       kern_return_t (*thread_set_workq_qos)(thread_t thread, int qos_tier, int relprio);
+       void *__unused_was_absolutetime_to_microtime;
+       void *__unused_was_thread_set_workq_pri;
+       void *__unused_was_thread_set_workq_qos;
 
        /* osfmk/kern/thread.h */
        struct uthread* (*get_bsdthread_info)(thread_t th);
-       void (*thread_sched_call)(thread_t t, sched_call_t call);
-       void (*thread_static_param)(thread_t t, boolean_t state);
-       kern_return_t (*thread_create_workq)(task_t t, thread_continue_t c, thread_t *new_t);
+       void *__unused_was_thread_sched_call;
+       void *__unused_was_thread_static_param;
+       void *__unused_was_thread_create_workq_waiting_parameter;
        kern_return_t (*thread_policy_set_internal)(thread_t t, thread_policy_flavor_t flavour, thread_policy_t info, mach_msg_type_number_t count);
 
-       /* osfmk/kern/affinity.h */
-       kern_return_t (*thread_affinity_set)(thread_t thread, uint32_t tag);
+       void *__unused_was_thread_affinity_set;
 
        /* bsd/sys/systm.h */
        void (*unix_syscall_return)(int error);
 
-       /* osfmk/kern/zalloc.h */
-       void* (*zalloc)(zone_t zone);
-       void (*zfree)(zone_t zone, void* ptr);
-       zone_t (*zinit)(vm_size_t, vm_size_t maxmem, vm_size_t alloc, const char *name);
+       void *__unused_was_zalloc;
+       void *__unused_was_zfree;
+       void *__unused_was_zinit;
 
        /* bsd/kerb/kern_sig.c */
        void (*__pthread_testcancel)(int);
@@ -284,20 +251,16 @@ typedef const struct pthread_callbacks_s {
        /* mach/thread_act.h */
        kern_return_t (*thread_resume)(thread_act_t target_act);
 
-       /* osfmk/<arch>/machine_routines.h */
-       int (*ml_get_max_cpus)(void);
-
-       #if defined(__arm__)
-       uint32_t (*map_is_1gb)(vm_map_t);
-       #endif
+       void *__unused_was_ml_get_max_cpus;
+#if defined(__arm__)
+       void *__unused_was_map_is_1gb;
+#endif
 
-       /* <rdar://problem/12809089> xnu: struct proc p_dispatchqueue_serialno_offset additions */
-       uint64_t (*proc_get_dispatchqueue_serialno_offset)(struct proc *p);
-       void (*proc_set_dispatchqueue_serialno_offset)(struct proc *p, uint64_t offset);
+       void *__unused_was_proc_get_dispatchqueue_serialno_offset;
+       void *__unused_was_proc_set_dispatchqueue_serialno_offset;
 
-       int (*proc_usynch_thread_qos_add_override_for_resource_check_owner)(thread_t thread, int override_qos, boolean_t first_override_for_resource,
-                       user_addr_t resource, int resource_type, user_addr_t user_lock_addr, mach_port_name_t user_lock_owner);
-       void *unused_was_proc_set_stack_addr_hint;
+       void *__unused_was_proc_usynch_thread_qos_add_override_for_resource_check_owner;
+       void *__unused_was_proc_set_stack_addr_hint;
 
        uint32_t (*proc_get_pthread_tsd_offset)(struct proc *p);
        void (*proc_set_pthread_tsd_offset)(struct proc *p, uint32_t pthread_tsd_offset);
@@ -311,56 +274,46 @@ typedef const struct pthread_callbacks_s {
        kern_return_t (*thread_policy_get)(thread_t t, thread_policy_flavor_t flavor, thread_policy_t info, mach_msg_type_number_t *count, boolean_t *get_default);
        boolean_t (*qos_main_thread_active)(void);
 
-       kern_return_t (*thread_set_voucher_name)(mach_port_name_t voucher_name);
+       kern_return_t (*thread_set_voucher_name)(mach_port_name_t name);
 
        boolean_t (*proc_usynch_thread_qos_add_override_for_resource)(task_t task, struct uthread *, uint64_t tid, int override_qos, boolean_t first_override_for_resource, user_addr_t resource, int resource_type);
        boolean_t (*proc_usynch_thread_qos_remove_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type);
-       boolean_t (*proc_usynch_thread_qos_reset_override_for_resource)(task_t task, struct uthread *, uint64_t tid, user_addr_t resource, int resource_type);
+       void *__unused_was_proc_usynch_thread_qos_reset_override_for_resource;
 
-       boolean_t (*proc_init_wqptr_or_wait)(proc_t proc);
+       void *__unused_was_proc_init_wqptr_or_wait;
 
        uint16_t (*thread_set_tag)(thread_t thread, uint16_t tag);
        uint16_t (*thread_get_tag)(thread_t thread);
 
-       int (*proc_usynch_thread_qos_squash_override_for_resource)(thread_t thread, user_addr_t resource, int resource_type);
-       int (*task_get_default_manager_qos)(task_t task);
-
-       int (*thread_create_workq_waiting)(task_t task, thread_continue_t thread_return, event_t event, thread_t *new_thread);
+       void *__unused_was_proc_usynch_thread_qos_squash_override_for_resource;
+       void *__unused_was_task_get_default_manager_qos;
+       void *__unused_was_thread_create_workq_waiting;
 
        user_addr_t (*proc_get_stack_addr_hint)(struct proc *p);
        void (*proc_set_stack_addr_hint)(struct proc *p, user_addr_t stack_addr_hint);
 
-       uint64_t (*proc_get_return_to_kernel_offset)(struct proc *t);
+       void *__unused_was_proc_get_return_to_kernel_offset;
        void (*proc_set_return_to_kernel_offset)(struct proc *t, uint64_t offset);
 
-       /* indicates call is being made synchronously with workq_threadreq call */
-#      define WORKLOOP_FULFILL_THREADREQ_SYNC   0x1
-#      define WORKLOOP_FULFILL_THREADREQ_CANCEL 0x2
-       int (*workloop_fulfill_threadreq)(struct proc *p, workq_threadreq_t req, thread_t thread, int flags);
+       void *__unused_was_workloop_fulfill_threadreq;
        void (*thread_will_park_or_terminate)(thread_t thread);
 
-       /* For getting maximum parallelism for a given QoS */
-       uint32_t (*qos_max_parallelism)(int qos, uint64_t options);
+       void *__unused_was_qos_max_parallelism;
 
        /* proc_internal.h: struct proc user_stack accessor */
        user_addr_t (*proc_get_user_stack)(struct proc *p);
-       void (*proc_set_user_stack)(struct proc *p, user_addr_t user_stack);
+       void *__unused_was_proc_set_user_stack;
 
        /* padding for future */
        void* _pad[69];
-
 } *pthread_callbacks_t;
 
 void
 pthread_kext_register(pthread_functions_t fns, pthread_callbacks_t *callbacks);
 
 #ifdef BSD_KERNEL_PRIVATE
-void workqueue_mark_exiting(struct proc *);
-void workqueue_exit(struct proc *);
-void workqueue_thread_yielded(void);
-sched_call_t workqueue_get_sched_callback(void);
+void thread_will_park_or_terminate(thread_t thread);
 void pthread_init(void);
-
 extern pthread_callbacks_t pthread_kern;
 extern pthread_functions_t pthread_functions;
 #endif
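
The deprecation pattern above (retired callbacks becoming like-sized void *__unused_was_* members, plus the _pad tail) keeps the pthread kext's view of the callback table stable: surviving entries keep their offsets even as individual hooks are removed. A minimal sketch of the idea, with hypothetical names rather than the real pthread_callbacks_s contents:

#include <stddef.h>

/* Sketch only: why retired callbacks become void * slots instead of being
 * deleted. Names here are hypothetical, not the actual xnu/pthread ABI. */
typedef struct example_callbacks_s {
	int	version;
	int	(*op_a)(int);
	void	*__unused_was_op_b;	/* was: int (*op_b)(int); slot preserved */
	int	(*op_c)(int);
	void	*_pad[4];		/* room for future additions */
} *example_callbacks_t;

/* op_c keeps the same offset whether or not op_b is still implemented, so an
 * older kext and a newer kernel continue to agree on the table layout. */
_Static_assert(offsetof(struct example_callbacks_s, op_c) ==
    offsetof(struct example_callbacks_s, __unused_was_op_b) + sizeof(void *),
    "a retired slot preserves the offsets of later entries");
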
index 294eec935455f6dcf65d770ead99375efb30c370..aa26d76368174d0d426a2557c05d710e2dee4780 100644 (file)
 #ifndef _SYS_QUEUE_H_
 #define        _SYS_QUEUE_H_
 
+#ifdef KERNEL_PRIVATE
+#include <kern/debug.h> /* for panic() */
+#include <sys/cdefs.h> /* for __improbable() in kernel space */
+#else
+#ifndef __improbable
+#define __improbable(x) (x)            /* noop in userspace */
+#endif /* __improbable */
+#endif /* KERNEL_PRIVATE */
+
 /*
  * This file defines five types of data structures: singly-linked lists,
  * singly-linked tail queues, lists, tail queues, and circular queues.
@@ -436,30 +445,32 @@ __MISMATCH_TAGS_POP
  * List functions.
  */
 
-#if (defined(_KERNEL) && defined(INVARIANTS)) || defined(QUEUE_MACRO_DEBUG)
-#define        QMD_LIST_CHECK_HEAD(head, field) do {                           \
-       if (LIST_FIRST((head)) != NULL &&                               \
-           LIST_FIRST((head))->field.le_prev !=                        \
-            &LIST_FIRST((head)))                                       \
-               panic("Bad list head %p first->prev != head", (head));  \
+#ifdef KERNEL_PRIVATE
+#define        LIST_CHECK_HEAD(head, field) do {                               \
+       if (__improbable(                                               \
+             LIST_FIRST((head)) != NULL &&                             \
+             LIST_FIRST((head))->field.le_prev !=                      \
+             &LIST_FIRST((head))))                                     \
+                    panic("Bad list head %p first->prev != head", (head));     \
 } while (0)
 
-#define        QMD_LIST_CHECK_NEXT(elm, field) do {                            \
-       if (LIST_NEXT((elm), field) != NULL &&                          \
-           LIST_NEXT((elm), field)->field.le_prev !=                   \
-            &((elm)->field.le_next))                                   \
-               panic("Bad link elm %p next->prev != elm", (elm));      \
+#define        LIST_CHECK_NEXT(elm, field) do {                                \
+       if (__improbable(                                               \
+             LIST_NEXT((elm), field) != NULL &&                        \
+             LIST_NEXT((elm), field)->field.le_prev !=                 \
+             &((elm)->field.le_next)))                                 \
+                    panic("Bad link elm %p next->prev != elm", (elm)); \
 } while (0)
 
-#define        QMD_LIST_CHECK_PREV(elm, field) do {                            \
-       if (*(elm)->field.le_prev != (elm))                             \
+#define        LIST_CHECK_PREV(elm, field) do {                                \
+       if (__improbable(*(elm)->field.le_prev != (elm)))               \
                panic("Bad link elm %p prev->next != elm", (elm));      \
 } while (0)
 #else
-#define        QMD_LIST_CHECK_HEAD(head, field)
-#define        QMD_LIST_CHECK_NEXT(elm, field)
-#define        QMD_LIST_CHECK_PREV(elm, field)
-#endif /* (_KERNEL && INVARIANTS) || QUEUE_MACRO_DEBUG */
+#define        LIST_CHECK_HEAD(head, field)
+#define        LIST_CHECK_NEXT(elm, field)
+#define        LIST_CHECK_PREV(elm, field)
+#endif /* KERNEL_PRIVATE */
 
 #define        LIST_EMPTY(head)        ((head)->lh_first == NULL)
 
@@ -480,7 +491,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define        LIST_INSERT_AFTER(listelm, elm, field) do {                     \
-       QMD_LIST_CHECK_NEXT(listelm, field);                            \
+       LIST_CHECK_NEXT(listelm, field);                                \
        if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\
                LIST_NEXT((listelm), field)->field.le_prev =            \
                    &LIST_NEXT((elm), field);                           \
@@ -489,7 +500,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define        LIST_INSERT_BEFORE(listelm, elm, field) do {                    \
-       QMD_LIST_CHECK_PREV(listelm, field);                            \
+       LIST_CHECK_PREV(listelm, field);                                \
        (elm)->field.le_prev = (listelm)->field.le_prev;                \
        LIST_NEXT((elm), field) = (listelm);                            \
        *(listelm)->field.le_prev = (elm);                              \
@@ -497,7 +508,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define        LIST_INSERT_HEAD(head, elm, field) do {                         \
-       QMD_LIST_CHECK_HEAD((head), field);                             \
+       LIST_CHECK_HEAD((head), field);                         \
        if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL)     \
                LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\
        LIST_FIRST((head)) = (elm);                                     \
@@ -507,8 +518,8 @@ __MISMATCH_TAGS_POP
 #define        LIST_NEXT(elm, field)   ((elm)->field.le_next)
 
 #define        LIST_REMOVE(elm, field) do {                                    \
-       QMD_LIST_CHECK_NEXT(elm, field);                                \
-       QMD_LIST_CHECK_PREV(elm, field);                                \
+       LIST_CHECK_NEXT(elm, field);                            \
+       LIST_CHECK_PREV(elm, field);                            \
        if (LIST_NEXT((elm), field) != NULL)                            \
                LIST_NEXT((elm), field)->field.le_prev =                \
                    (elm)->field.le_prev;                               \
@@ -557,6 +568,33 @@ __MISMATCH_TAGS_POP
 /*
  * Tail queue functions.
  */
+#ifdef KERNEL_PRIVATE
+#define TAILQ_CHECK_HEAD(head, field) do {                             \
+       if (__improbable(                                               \
+             TAILQ_FIRST((head)) != NULL &&                            \
+             TAILQ_FIRST((head))->field.tqe_prev !=                    \
+             &TAILQ_FIRST((head))))                                    \
+                    panic("Bad tailq head %p first->prev != head", (head));    \
+} while (0)
+
+#define TAILQ_CHECK_NEXT(elm, field) do {                              \
+       if (__improbable(                                               \
+             TAILQ_NEXT((elm), field) != NULL &&                       \
+             TAILQ_NEXT((elm), field)->field.tqe_prev !=               \
+             &((elm)->field.tqe_next)))                                \
+                    panic("Bad tailq elm %p next->prev != elm", (elm));        \
+} while(0)
+
+#define        TAILQ_CHECK_PREV(elm, field) do {                               \
+       if (__improbable(*(elm)->field.tqe_prev != (elm)))              \
+             panic("Bad tailq elm %p prev->next != elm", (elm));       \
+} while(0)
+#else
+#define        TAILQ_CHECK_HEAD(head, field)
+#define        TAILQ_CHECK_NEXT(elm, field)
+#define        TAILQ_CHECK_PREV(elm, field)
+#endif /* KERNEL_PRIVATE */
+
 #define        TAILQ_CONCAT(head1, head2, field) do {                          \
        if (!TAILQ_EMPTY(head2)) {                                      \
                *(head1)->tqh_last = (head2)->tqh_first;                \
@@ -598,7 +636,9 @@ __MISMATCH_TAGS_POP
        QMD_TRACE_HEAD(head);                                           \
 } while (0)
 
+
 #define        TAILQ_INSERT_AFTER(head, listelm, elm, field) do {              \
+        TAILQ_CHECK_NEXT(listelm, field);                              \
        if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\
                TAILQ_NEXT((elm), field)->field.tqe_prev =              \
                    &TAILQ_NEXT((elm), field);                          \
@@ -613,6 +653,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define        TAILQ_INSERT_BEFORE(listelm, elm, field) do {                   \
+        TAILQ_CHECK_PREV(listelm, field);                              \
        (elm)->field.tqe_prev = (listelm)->field.tqe_prev;              \
        TAILQ_NEXT((elm), field) = (listelm);                           \
        *(listelm)->field.tqe_prev = (elm);                             \
@@ -622,6 +663,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define        TAILQ_INSERT_HEAD(head, elm, field) do {                        \
+        TAILQ_CHECK_HEAD(head, field);                                 \
        if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL)   \
                TAILQ_FIRST((head))->field.tqe_prev =                   \
                    &TAILQ_NEXT((elm), field);                          \
@@ -655,6 +697,8 @@ __MISMATCH_TAGS_PUSH                                                        \
 __MISMATCH_TAGS_POP
 
 #define        TAILQ_REMOVE(head, elm, field) do {                             \
+        TAILQ_CHECK_NEXT(elm, field);                                  \
+        TAILQ_CHECK_PREV(elm, field);                                  \
        if ((TAILQ_NEXT((elm), field)) != NULL)                         \
                TAILQ_NEXT((elm), field)->field.tqe_prev =              \
                    (elm)->field.tqe_prev;                              \
@@ -713,6 +757,31 @@ __MISMATCH_TAGS_POP
 /*
  * Circular queue functions.
  */
+#ifdef KERNEL_PRIVATE
+#define        CIRCLEQ_CHECK_HEAD(head, field) do {                            \
+       if (__improbable(                                               \
+             CIRCLEQ_FIRST((head)) != ((void*)(head)) &&               \
+             CIRCLEQ_FIRST((head))->field.cqe_prev != ((void*)(head))))\
+                    panic("Bad circleq head %p first->prev != head", (head));  \
+} while(0)
+#define        CIRCLEQ_CHECK_NEXT(head, elm, field) do {                       \
+       if (__improbable(                                               \
+             CIRCLEQ_NEXT((elm), field) != ((void*)(head)) &&          \
+             CIRCLEQ_NEXT((elm), field)->field.cqe_prev != (elm)))     \
+                    panic("Bad circleq elm %p next->prev != elm", (elm));      \
+} while(0)
+#define        CIRCLEQ_CHECK_PREV(head, elm, field) do {                       \
+       if (__improbable(                                               \
+             CIRCLEQ_PREV((elm), field) != ((void*)(head)) &&          \
+             CIRCLEQ_PREV((elm), field)->field.cqe_next != (elm)))     \
+                    panic("Bad circleq elm %p prev->next != elm", (elm));      \
+} while(0)
+#else
+#define        CIRCLEQ_CHECK_HEAD(head, field)
+#define        CIRCLEQ_CHECK_NEXT(head, elm, field)
+#define        CIRCLEQ_CHECK_PREV(head, elm, field)
+#endif /* KERNEL_PRIVATE */
+
 #define CIRCLEQ_EMPTY(head) ((head)->cqh_first == (void *)(head))
 
 #define CIRCLEQ_FIRST(head) ((head)->cqh_first)
@@ -728,6 +797,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define CIRCLEQ_INSERT_AFTER(head, listelm, elm, field) do {           \
+        CIRCLEQ_CHECK_NEXT(head, listelm, field);                      \
        (elm)->field.cqe_next = (listelm)->field.cqe_next;              \
        (elm)->field.cqe_prev = (listelm);                              \
        if ((listelm)->field.cqe_next == (void *)(head))                \
@@ -738,6 +808,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define CIRCLEQ_INSERT_BEFORE(head, listelm, elm, field) do {          \
+        CIRCLEQ_CHECK_PREV(head, listelm, field);                      \
        (elm)->field.cqe_next = (listelm);                              \
        (elm)->field.cqe_prev = (listelm)->field.cqe_prev;              \
        if ((listelm)->field.cqe_prev == (void *)(head))                \
@@ -748,6 +819,7 @@ __MISMATCH_TAGS_POP
 } while (0)
 
 #define CIRCLEQ_INSERT_HEAD(head, elm, field) do {                     \
+        CIRCLEQ_CHECK_HEAD(head, field);                               \
        (elm)->field.cqe_next = (head)->cqh_first;                      \
        (elm)->field.cqe_prev = (void *)(head);                         \
        if ((head)->cqh_last == (void *)(head))                         \
@@ -774,6 +846,8 @@ __MISMATCH_TAGS_POP
 #define CIRCLEQ_PREV(elm,field) ((elm)->field.cqe_prev)
 
 #define        CIRCLEQ_REMOVE(head, elm, field) do {                           \
+        CIRCLEQ_CHECK_NEXT(head, elm, field);                          \
+        CIRCLEQ_CHECK_PREV(head, elm, field);                          \
        if ((elm)->field.cqe_next == (void *)(head))                    \
                (head)->cqh_last = (elm)->field.cqe_prev;               \
        else                                                            \
@@ -801,12 +875,37 @@ struct quehead {
 };
 
 #ifdef __GNUC__
+#ifdef KERNEL_PRIVATE
+static __inline void
+chkquenext(void *a)
+{
+       struct quehead *element = (struct quehead *)a;
+       if (__improbable(element->qh_link != NULL &&
+                           element->qh_link->qh_rlink != element)) {
+             panic("Bad que elm %p next->prev != elm", a);
+       }
+}
+
+static __inline void
+chkqueprev(void *a)
+{
+       struct quehead *element = (struct quehead *)a;
+       if (__improbable(element->qh_rlink != NULL &&
+                           element->qh_rlink->qh_link != element)) {
+             panic("Bad que elm %p prev->next != elm", a);
+       }
+}
+#else /* !KERNEL_PRIVATE */
+#define chkquenext(a)
+#define chkqueprev(a)
+#endif /* KERNEL_PRIVATE */
 
 static __inline void
 insque(void *a, void *b)
 {
        struct quehead *element = (struct quehead *)a,
                 *head = (struct quehead *)b;
+       chkquenext(head);
 
        element->qh_link = head->qh_link;
        element->qh_rlink = head;
@@ -818,6 +917,8 @@ static __inline void
 remque(void *a)
 {
        struct quehead *element = (struct quehead *)a;
+       chkquenext(element);
+       chkqueprev(element);
 
        element->qh_link->qh_rlink = element->qh_rlink;
        element->qh_rlink->qh_link = element->qh_link;
@@ -831,7 +932,7 @@ void        remque(void *a);
 
 #endif /* __GNUC__ */
 
-#endif
+#endif /* NOTFB31 */
 #endif /* _KERNEL */
 
 #endif /* !_SYS_QUEUE_H_ */
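
The new LIST_CHECK_* / TAILQ_CHECK_* / CIRCLEQ_CHECK_* macros (and chkquenext()/chkqueprev() for insque/remque) validate linkage before every insert and remove: in KERNEL_PRIVATE builds a corrupted queue panics at the point of corruption, while in user space the checks compile away via the __improbable() shim at the top of the file. A small usage sketch with a hypothetical element type:

#include <sys/queue.h>

/* Hypothetical element type; the checks themselves come from the macros above. */
struct item {
	int value;
	TAILQ_ENTRY(item) link;
};
TAILQ_HEAD(item_head, item);

void
remove_item(struct item_head *head, struct item *it)
{
	/* In a KERNEL_PRIVATE build, TAILQ_REMOVE now runs TAILQ_CHECK_NEXT()
	 * and TAILQ_CHECK_PREV() first; if it->link has been scribbled over
	 * (next->prev no longer points back at it), the kernel panics here
	 * instead of silently unlinking garbage. */
	TAILQ_REMOVE(head, it, link);
}
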
index 81792a1b9c2a78cfa4c9934ca9e7647ce459f9f3..ce2d47670163ae981b94eb64cd140cf1c1564cfc 100644 (file)
@@ -105,11 +105,13 @@ void os_reason_free(os_reason_t cur_reason);
 #define OS_REASON_WATCHDOG      20
 #define OS_REASON_METAL         21
 #define OS_REASON_WATCHKIT      22
+#define OS_REASON_GUARD         23
+#define OS_REASON_ANALYTICS     24
 
 /*
  * Update whenever new OS_REASON namespaces are added.
  */
-#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_WATCHKIT
+#define OS_REASON_MAX_VALID_NAMESPACE OS_REASON_ANALYTICS
 
 #define OS_REASON_BUFFER_MAX_SIZE 5120
 
@@ -122,11 +124,12 @@ void os_reason_free(os_reason_t cur_reason);
 #define OS_REASON_FLAG_CONSISTENT_FAILURE       0x40  /* Whatever caused this reason to be created will happen again */
 #define OS_REASON_FLAG_ONE_TIME_FAILURE         0x80  /* Whatever caused this reason to be created was a one time issue */
 #define OS_REASON_FLAG_NO_CRASHED_TID           0x100 /* Don't include the TID that processed the exit in the crash report */
+#define OS_REASON_FLAG_ABORT                    0x200 /* Reason created from abort_* rather than terminate_* */
 
 /*
  * Set of flags that are allowed to be passed from userspace
  */
-#define OS_REASON_FLAG_MASK_ALLOWED_FROM_USER  (OS_REASON_FLAG_CONSISTENT_FAILURE | OS_REASON_FLAG_ONE_TIME_FAILURE | OS_REASON_FLAG_NO_CRASH_REPORT)
+#define OS_REASON_FLAG_MASK_ALLOWED_FROM_USER (OS_REASON_FLAG_CONSISTENT_FAILURE | OS_REASON_FLAG_ONE_TIME_FAILURE | OS_REASON_FLAG_NO_CRASH_REPORT | OS_REASON_FLAG_ABORT)
 
 /*
  * Macros to encode the exit reason namespace and first 32 bits of code in exception code
@@ -234,6 +237,13 @@ int terminate_with_payload(int pid, uint32_t reason_namespace, uint64_t reason_c
 #define EXEC_EXIT_REASON_UPX                12
 #define EXEC_EXIT_REASON_NO32EXEC           13
 
+/*
+ * guard reasons
+ */
+#define GUARD_REASON_VNODE       1
+#define GUARD_REASON_VIRT_MEMORY 2
+#define GUARD_REASON_MACH_PORT   3
+
 __END_DECLS
 
 #endif /* _REASON_H_ */
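
This hunk adds two exit-reason namespaces (OS_REASON_GUARD with its GUARD_REASON_* codes, and OS_REASON_ANALYTICS), bumps OS_REASON_MAX_VALID_NAMESPACE accordingly, and lets user space pass the new OS_REASON_FLAG_ABORT so abort_*-style exits can be distinguished from terminate_*-style ones. A hedged sketch of the flag in use, assuming the private user-space abort_with_reason() declaration elsewhere in this header keeps its (namespace, code, string, flags) shape:

#include <stdint.h>
#include <sys/reason.h>	/* private interface; availability here is an assumption */

static void
example_abort_with_guard_reason(void)
{
	/* The kernel rejects flags outside OS_REASON_FLAG_MASK_ALLOWED_FROM_USER,
	 * which now includes OS_REASON_FLAG_ABORT. Using the GUARD namespace
	 * from user space is purely illustrative. */
	abort_with_reason(OS_REASON_GUARD, GUARD_REASON_VNODE,
	    "example: guarded vnode misuse", OS_REASON_FLAG_ABORT);
}
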
index 2f0316c87b945e9b402f2a772bb90acf8d0eb949..55b553a1b48164f3dab7a0e4d5356101a07c7bd8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -126,6 +126,7 @@ typedef __uint64_t  rlim_t;
 #define PRIO_DARWIN_ROLE_NON_UI         0x3     /* Off screen, non-focal UI */
 #define PRIO_DARWIN_ROLE_UI_NON_FOCAL   0x4     /* On  screen, non-focal UI */
 #define PRIO_DARWIN_ROLE_TAL_LAUNCH     0x5     /* Throttled-launch (for OS X TAL resume) */
+#define PRIO_DARWIN_ROLE_DARWIN_BG      0x6     /* Throttled for running in the background */
 
 #endif /* PRIVATE */
 
@@ -337,8 +338,9 @@ struct rusage_info_v4 {
        uint64_t ri_cycles;
        uint64_t ri_billed_energy;
        uint64_t ri_serviced_energy;
-       // We're reserving 2 counters for future extension
-       uint64_t ri_unused[2];
+       uint64_t ri_interval_max_phys_footprint;
+       // 1 reserved counter remaining for future extension
+       uint64_t ri_unused[1];
 };
 
 typedef struct rusage_info_v4 rusage_info_current;
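
The new ri_interval_max_phys_footprint field reports the peak physical footprint observed since the footprint interval was last reset (see RLIMIT_FOOTPRINT_INTERVAL and FOOTPRINT_INTERVAL_RESET added below). A hedged user-space sketch, assuming the existing proc_pid_rusage() / RUSAGE_INFO_CURRENT interface from <libproc.h> is unchanged:

#include <libproc.h>
#include <stdint.h>
#include <unistd.h>

static int
example_read_interval_footprint(uint64_t *max_footprint)
{
	rusage_info_current ri;

	/* RUSAGE_INFO_CURRENT resolves to the v4 flavor, which now carries
	 * ri_interval_max_phys_footprint. */
	if (proc_pid_rusage(getpid(), RUSAGE_INFO_CURRENT, (rusage_info_t *)&ri) != 0) {
		return -1;
	}
	*max_footprint = ri.ri_interval_max_phys_footprint;
	return 0;
}
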
@@ -454,6 +456,7 @@ struct rlimit {
 #define RLIMIT_WAKEUPS_MONITOR         0x1 /* Configure the wakeups monitor. */
 #define        RLIMIT_CPU_USAGE_MONITOR        0x2 /* Configure the CPU usage monitor. */
 #define        RLIMIT_THREAD_CPULIMITS         0x3 /* Configure blocking, per-thread CPU limits. */
+#define        RLIMIT_FOOTPRINT_INTERVAL       0x4 /* Configure memory footprint interval tracking */
 
 /*
  * Flags for wakeups monitor control.
@@ -463,11 +466,17 @@ struct rlimit {
 #define WAKEMON_GET_PARAMS             0x04
 #define WAKEMON_SET_DEFAULTS           0x08
 #define        WAKEMON_MAKE_FATAL              0x10 /* Configure the task so that violations are fatal. */
+
 /*
  * Flags for CPU usage monitor control.
  */
 #define        CPUMON_MAKE_FATAL               0x1000
 
+/*
+ * Flags for memory footprint interval tracking.
+ */
+#define        FOOTPRINT_INTERVAL_RESET        0x1 /* Reset the footprint interval counter to zero */
+
 struct proc_rlimit_control_wakeupmon {
        uint32_t wm_flags;
        int32_t wm_rate;
@@ -488,6 +497,7 @@ struct proc_rlimit_control_wakeupmon {
 #if PRIVATE
 #define IOPOL_TYPE_VFS_HFS_CASE_SENSITIVITY 1
 #endif
+#define IOPOL_TYPE_VFS_ATIME_UPDATES 2
 
 /* scope */
 #define IOPOL_SCOPE_PROCESS   0
@@ -511,6 +521,9 @@ struct proc_rlimit_control_wakeupmon {
 #define IOPOL_VFS_HFS_CASE_SENSITIVITY_FORCE_CASE_SENSITIVE    1
 #endif
 
+#define IOPOL_ATIME_UPDATES_DEFAULT    0
+#define IOPOL_ATIME_UPDATES_OFF                1
+
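
IOPOL_TYPE_VFS_ATIME_UPDATES with IOPOL_ATIME_UPDATES_OFF gives a process (or thread) a way to opt out of access-time updates for the files it touches. A hedged sketch, assuming the existing setiopolicy_np() wrapper declared in this header keeps its (iotype, scope, policy) shape:

#include <sys/resource.h>

static int
example_disable_atime_updates(void)
{
	/* Applies to the whole process; IOPOL_SCOPE_THREAD would limit the
	 * policy to the calling thread. */
	return setiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES,
	    IOPOL_SCOPE_PROCESS, IOPOL_ATIME_UPDATES_OFF);
}
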
 #ifdef PRIVATE
 /*
  * Structures for use in communicating via iopolicysys() between Libc and the
index a0675b2b19f9baa77139ad9013ab538fbf0d68b1..f48f83e501db2e8e7da6f9ae41c147b5811e094c 100644 (file)
@@ -68,6 +68,7 @@ extern int sdt_invop(uintptr_t, uintptr_t *, uintptr_t);
 extern uint64_t sdt_getarg(void *, dtrace_id_t, void *, int, int);
 
 void sdt_provide_module(void *, struct modctl *);
+void sdt_early_init(void);
 void sdt_init(void);
 
 extern int          sdt_probetab_size;
index 817454ab57b50758de20c848a7ef23dc157bb179..e7218b5669641f09bce0152f9471905f2ecb4129 100644 (file)
@@ -473,6 +473,9 @@ struct      __kern_sigaction {
 /* This will provide 64bit register set in a 32bit user address space */
 #define        SA_64REGSET     0x0200  /* signal handler with SA_SIGINFO args with 64bit regs information */
 #endif /* (!_POSIX_C_SOURCE || _DARWIN_C_SOURCE) */
+#ifdef BSD_KERNEL_PRIVATE
+#define        SA_VALIDATE_SIGRETURN_FROM_SIGTRAMP     0x0400  /* use token to validate sigreturn was called from matching sigtramp */
+#endif /* BSD_KERNEL_PRIVATE */
 
 /* the following are the only bits we support from user space, the 
  * rest are for kernel use only.
index b280c686f82a23d1ac55e7584972791bb5a7b228..3419e8c236496b4992c0ea7709c143c926b8f3dd 100644 (file)
@@ -67,6 +67,9 @@
 #include <sys/appleapiopts.h>
 
 #ifdef BSD_KERNEL_PRIVATE
+
+#include <stdatomic.h>
+
 /*
  * Kernel signal definitions and data structures,
  * not exported to user programs.
@@ -86,13 +89,13 @@ struct      sigacts {
        sigset_t ps_signodefer;         /* signals not masked while handled */
        sigset_t ps_siginfo;            /* signals that want SA_SIGINFO args */
        sigset_t ps_oldmask;            /* saved mask from before sigpause */
+       user_addr_t ps_sigreturn_token; /* random token used to validate sigreturn arguments */
+       _Atomic uint32_t ps_sigreturn_validation; /* sigreturn argument validation state */
        int     ps_flags;               /* signal flags, below */
        struct kern_sigaltstack ps_sigstk;      /* sp, length & flags */
        int     ps_sig;                 /* for core dump/debugger XXX */
        int     ps_code;                /* for core dump/debugger XXX */
        int     ps_addr;                /* for core dump/debugger XXX */
-       sigset_t ps_usertramp;          /* SunOS compat; libc sigtramp XXX */
-       sigset_t ps_64regset;           /* signals that want SA_EXSIGINFO args */
 };
 
 /* signal flags */
@@ -108,6 +111,11 @@ struct     sigacts {
 #define        KERN_SIG_HOLD   CAST_USER_ADDR_T(3)
 #define        KERN_SIG_WAIT   CAST_USER_ADDR_T(4)
 
+/* Values for ps_sigreturn_validation */
+#define PS_SIGRETURN_VALIDATION_DEFAULT 0x0u
+#define PS_SIGRETURN_VALIDATION_ENABLED 0x1u
+#define PS_SIGRETURN_VALIDATION_DISABLED 0x2u
+
 /*
  * get signal action for process and signal; currently only for current process
  */
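
The ps_sigreturn_token / ps_sigreturn_validation pair backs the new SA_VALIDATE_SIGRETURN_FROM_SIGTRAMP handling: sigreturn arguments are checked against a per-process random token, and the _Atomic state above records whether validation is enabled, disabled, or not yet decided. The following is only an illustrative latch under that assumption; the real policy lives in kern_sig.c and the user-space sigtramp:

#include <stdatomic.h>
#include <stdbool.h>

static inline void
example_latch_sigreturn_validation(struct sigacts *ps, bool sigtramp_supports_token)
{
	uint32_t expected = PS_SIGRETURN_VALIDATION_DEFAULT;
	uint32_t desired = sigtramp_supports_token ?
	    PS_SIGRETURN_VALIDATION_ENABLED : PS_SIGRETURN_VALIDATION_DISABLED;

	/* Only the first decision moves the state away from DEFAULT; later
	 * callers observe the already-latched value. */
	atomic_compare_exchange_strong(&ps->ps_sigreturn_validation,
	    &expected, desired);
}
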
index 3602f776496324dab3cd03629502fa55a86f4092..f6bafa632ea2829e91d8a0d6d5caf27f029c5acd 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #ifdef PRIVATE
 #define        SO_NOWAKEFROMSLEEP      0x10000 /* Don't wake for traffic to this socket */
 #define        SO_NOAPNFALLBK          0x20000 /* Don't attempt APN fallback for the socket */
+#define        SO_TIMESTAMP_CONTINUOUS 0x40000 /* Continuous monotonic timestamp on rcvd dgram */
 #endif
 
 #endif  /* (!__APPLE__) */
        (c == SO_TC_BK_SYS || c == SO_TC_BK || c == SO_TC_BE ||         \
        c == SO_TC_RD || c == SO_TC_OAM || c == SO_TC_AV ||             \
        c == SO_TC_RV || c == SO_TC_VI || c == SO_TC_VO ||              \
-       c == SO_TC_CTL)
+       c == SO_TC_CTL || c == SO_TC_NETSVC_SIG)
 
 #define        SO_TC_UNSPEC    ((int)-1)               /* Traffic class not specified */
 
@@ -760,7 +761,12 @@ struct sockaddr_storage {
 #define        NET_RT_DUMPX            8       /* private */
 #define        NET_RT_DUMPX_FLAGS      9       /* private */
 #endif /* PRIVATE */
-#define        NET_RT_MAXID            10
+/*
+ * Allows read access to a non-local host's MAC address
+ * if the process has the neighbor cache entitlement.
+ */
+#define        NET_RT_FLAGS_PRIV       10
+#define        NET_RT_MAXID            11
 #endif /* (_POSIX_C_SOURCE && !_DARWIN_C_SOURCE) */
 
 #ifdef KERNEL_PRIVATE
@@ -1084,6 +1090,7 @@ struct cmsgcred {
 #ifdef PRIVATE
 #define        SCM_SEQNUM                      0x05    /* TCP unordered recv seq no */
 #define        SCM_MSG_PRIORITY                0x06    /* TCP unordered snd priority */
+#define        SCM_TIMESTAMP_CONTINUOUS                0x07    /* timestamp (uint64_t) */
 #endif /* PRIVATE */
 
 #ifdef KERNEL_PRIVATE
index b0be72420adc88f0e9f4b27f40a4d16a57959675..caf61205127d7b0791bfbe358ad5a84ac4a4b15d 100644 (file)
@@ -81,6 +81,9 @@
 #include <net/kext_net.h>
 #include <sys/ev.h>
 #include <uuid/uuid.h>
+#ifdef BSD_KERNEL_PRIVATE
+#include <sys/eventhandler.h>
+#endif /* BSD_KERNEL_PRIVATE */
 #endif /* KERNEL_PRIVATE */
 
 typedef        u_quad_t so_gen_t;
@@ -310,7 +313,11 @@ struct socket {
        struct msg_state *so_msg_state;         /* unordered snd/rcv state */
        struct flow_divert_pcb  *so_fd_pcb;     /* Flow Divert control block */
 
-       struct cfil_info        *so_cfil;
+#if CONTENT_FILTER
+       struct cfil_info    *so_cfil;
+       struct cfil_db      *so_cfil_db;
+       u_int32_t           so_state_change_cnt; /* incr for each connect, disconnect */
+#endif
 
        u_int32_t       so_eventmask;           /* event mask */
 
@@ -748,6 +755,7 @@ __BEGIN_DECLS
 /* Exported */
 extern int sbappendaddr(struct sockbuf *sb, struct sockaddr *asa,
     struct mbuf *m0, struct mbuf *control, int *error_out);
+extern int sbappendchain(struct sockbuf *sb, struct mbuf *m, int space);
 extern int sbappendrecord(struct sockbuf *sb, struct mbuf *m0);
 extern void sbflush(struct sockbuf *sb);
 extern int sbspace(struct sockbuf *sb);
@@ -776,11 +784,17 @@ extern void soreserve_preconnect(struct socket *so, unsigned int pre_cc);
 extern void sorwakeup(struct socket *so);
 extern int sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
     struct mbuf *top, struct mbuf *control, int flags);
+extern int sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top,
+                                                  struct mbuf *control, uint32_t sendflags);
 extern int sosend_list(struct socket *so, struct uio **uio, u_int uiocnt,
     int flags);
 extern int soreceive_list(struct socket *so, struct recv_msg_elem *msgarray,
     u_int msgcnt, int *flags);
 extern void sonullevent(struct socket *so, void *arg, uint32_t hint);
+extern struct mbuf *sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
+                                                                  struct mbuf *control);
+
+
 __END_DECLS
 
 #ifdef BSD_KERNEL_PRIVATE
index be5cfba2e69656d6783fd67aaafe2bd39ac4a45d..0ef6be269709f627f8c0f6ed1abd7a36b87810f8 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
 #define        SIOCSECNMODE            _IOW('i', 177, struct ifreq)
 
 #define        SIOCSIFORDER    _IOWR('i', 178, struct if_order)
-#define        SIOCGIFORDER    _IOWR('i', 179, struct if_order)
 
 #define        SIOCSQOSMARKINGMODE     _IOWR('i', 180, struct ifreq)
 #define        SIOCSFASTLANECAPABLE    SIOCSQOSMARKINGMODE
 #endif /* BSD_KERNEL_PRIVATE */
 #endif /* PRIVATE */
 
+#ifdef PRIVATE
+#define        SIOCGIFLOWPOWER _IOWR('i', 199, struct ifreq)   /* Low Power Mode */
+#define        SIOCSIFLOWPOWER _IOWR('i', 200, struct ifreq)   /* Low Power Mode */
+
+#if INET6
+#define        SIOCGIFCLAT46ADDR       _IOWR('i', 201, struct if_clat46req)
+#endif /* INET6 */
+#endif /* PRIVATE */
+
 #endif /* !_SYS_SOCKIO_H_ */
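
SIOCGIFLOWPOWER / SIOCSIFLOWPOWER expose a per-interface low-power-mode knob through the usual struct ifreq ioctl path. A hedged user-space sketch; which ifr_ifru member carries the value is an assumption, so the caller simply gets the filled-in ifreq back:

#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int
example_get_if_low_power(int sock, const char *ifname, struct ifreq *out)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
	if (ioctl(sock, SIOCGIFLOWPOWER, &ifr) == -1) {
		return -1;
	}
	*out = ifr;	/* caller interprets the returned request */
	return 0;
}
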
index 29ea49d3b5d2ead53e2be2e1a0190f90bb1bfd41..069897d1be06bfb31c0a1b3555cd2da3a770ef3b 100644 (file)
@@ -182,6 +182,9 @@ struct _posix_spawn_persona_info {
  * can be set, as well as any metadata whose validity is signalled by the
  * presence of a bit in the flags field.  All fields are initialized to the
  * appropriate default values by posix_spawnattr_init().
+ *
+ * Fields must be added at the end of this structure, but before the
+ * extensions array pointers.
  */
 
 typedef struct _posix_spawnattr {
@@ -205,6 +208,9 @@ typedef struct _posix_spawnattr {
 
        uint64_t        psa_qos_clamp;          /* QoS Clamp to set on the new process */
        uint64_t        psa_darwin_role;           /* PRIO_DARWIN_ROLE to set on the new process */
+       int             psa_thread_limit;       /* thread limit */
+
+       uint64_t        psa_max_addr;           /* Max valid VM address */
 
        /*
         * NOTE: Extensions array pointers must stay at the end so that
index 183fdd2079d5b6ea01e1b71503a8d57bc59fb7f6..1169924d59fe1f41238be1298cd7b90c8a6cd8ad 100644 (file)
@@ -368,14 +368,16 @@ struct user32_stat64 {
        __uint32_t      st_gen;                                 /* file generation number */
        __uint32_t      st_lspare;                              /* RESERVED: DO NOT USE! */
        __int64_t       st_qspare[2];                   /* RESERVED: DO NOT USE! */
-#if __arm__ && (__BIGGEST_ALIGNMENT__ > 4)
-/* For the newer ARMv7k ABI where 64-bit types are 64-bit aligned, but pointers
- * are 32-bit:
- * Applying attributes here causes a mismatch with the user-space struct stat64
+#if defined(__x86_64__)
+/*
+ * This packing is required to ensure symmetry between userspace and kernelspace
+ * when the kernel is 64-bit and the user application is 32-bit. All currently
+ * supported ARM slices (arm64/armv7k/arm64_32) contain the same struct
+ * alignment ABI so this packing isn't needed for ARM.
  */
-};
-#else
 } __attribute__((packed,aligned(4)));
+#else
+};
 #endif
 
 extern void munge_user64_stat64(struct stat64 *sbp, struct user64_stat64 *usbp);
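
The rewritten comment narrows the packed,aligned(4) attribute to x86_64 kernels, where 32-bit i386 user space aligns 64-bit members to 4 bytes; the supported ARM slices share one alignment ABI and need no packing. A small self-contained illustration of what the attribute changes (hypothetical struct, not stat64 itself):

#include <stdint.h>

struct example_packed {
	uint32_t a;
	uint64_t b;
} __attribute__((packed, aligned(4)));

/* With 4-byte packing the 64-bit member sits at offset 4 and the struct is
 * 12 bytes (the i386 user layout); naturally aligned LP64 code would pad it
 * to 16 bytes, so the 64-bit kernel must opt in to match the user struct. */
_Static_assert(sizeof(struct example_packed) == 12, "matches i386 user alignment");
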
index ff17198ca85dde7be375cd45fb30a2afbbfa7956..0d4414a584baf8549fbaa402bb1c36c7748ce5f4 100644 (file)
@@ -1139,6 +1139,7 @@ extern char       machine[];
 extern char    osrelease[];
 extern char    ostype[];
 extern char    osversion[];
+extern char    osbuild_config[];
 
 struct linker_set;
 
index 98dc4dd529f619835e46739441b8dff108ff271b..bec0bc45e09ab72aa836b2ce31cdbc0b3284f8a1 100644 (file)
@@ -263,9 +263,7 @@ void *exec_spawnattr_getmacpolicyinfo(const void *macextensions, const char *pol
 
 #ifdef BSD_KERNEL_PRIVATE
 
-#define THROTTLE_IO_ENABLE     1
-#define THROTTLE_IO_DISABLE    0
-void sys_override_io_throttle(int flag);
+void sys_override_io_throttle(boolean_t enable_override);
 
 #endif /* BSD_KERNEL_PRIVATE */
 
index 0699b5b0312065a458570f713cafe81b2775d513..4c209a6e7bf0d57b2e6369c9e8252c7fa1ee2cb8 100644 (file)
@@ -37,6 +37,7 @@
 #include <sys/cdefs.h>
 #include <sys/kernel_types.h>
 #include <kern/locks.h>
+#include <mach/machine.h>
 #include <mach/memory_object_types.h>
 #include <sys/ucred.h>
 
@@ -97,6 +98,8 @@ const char *cs_identity_get(proc_t);
 #endif
 
 /* cluster IO routines */
+void    cluster_update_state(vnode_t, vm_object_offset_t, vm_object_offset_t, boolean_t);
+
 int    advisory_read(vnode_t, off_t, off_t, int);
 int    advisory_read_ext(vnode_t, off_t, off_t, int, int (*)(buf_t, void *), void *, int);
 
index 3724348addb349005025913fd0a46695fdab2e38..be82f0f66a4b5648b965412471da1f490dfc93b7 100644 (file)
@@ -120,9 +120,11 @@ struct cs_blob {
        void *          csb_entitlements;       /* The entitlements as an OSDictionary */
        unsigned int    csb_signer_type;
 
+       unsigned int    csb_reconstituted;      /* signature has potentially been modified after validation */
        /* The following two will be replaced by the csb_signer_type. */
        unsigned int    csb_platform_binary:1;
        unsigned int    csb_platform_path:1;
+
 };
 
 /*
@@ -186,7 +188,6 @@ __private_extern__ uint32_t cluster_throttle_io_limit(vnode_t, uint32_t *);
 #define UBC_FOR_PAGEOUT         0x0002
 
 memory_object_control_t ubc_getobject(vnode_t, int);
-boolean_t      ubc_strict_uncached_IO(vnode_t);
 
 int    ubc_info_init(vnode_t);
 int    ubc_info_init_withsize(vnode_t, off_t);
index de799d8f1ade20e6895937cf9b85195b7e03554e..5a1b5f62e91b5c0bb226d6f0f6f8e52ddc507c3d 100644 (file)
@@ -84,12 +84,17 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
 
 /*
  * operation bits [23, 16] contain the flags for __ulock_wait
- */
-/* The waiter is contending on this lock for synchronization around global data.
+ *
+ * @const ULF_WAIT_WORKQ_DATA_CONTENTION
+ * The waiter is contending on this lock for synchronization around global data.
  * This causes the workqueue subsystem to not create new threads to offset for
  * waiters on this lock.
+ *
+ * @const ULF_WAIT_CANCEL_POINT
+ * This wait is a cancelation point
  */
-#define ULF_WAIT_WORKQ_DATA_CONTENTION 0x00010000
+#define ULF_WAIT_WORKQ_DATA_CONTENTION  0x00010000
+#define ULF_WAIT_CANCEL_POINT           0x00020000
 
 /*
  * operation bits [31, 24] contain the generic flags
@@ -104,7 +109,8 @@ extern int __ulock_wake(uint32_t operation, void *addr, uint64_t wake_value);
 #define ULF_GENERIC_MASK       0xFFFF0000
 
 #define ULF_WAIT_MASK          (ULF_NO_ERRNO | \
-                                                        ULF_WAIT_WORKQ_DATA_CONTENTION)
+                                                        ULF_WAIT_WORKQ_DATA_CONTENTION | \
+                                                        ULF_WAIT_CANCEL_POINT)
 
 #define ULF_WAKE_MASK          (ULF_WAKE_ALL | \
                                                         ULF_WAKE_THREAD | \
index 92b235bb938f50590b61b1b6a0cc1539539216eb..552bacac6d5bc1bdc6b2ba2c29b8f6479b7124e5 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /* Copyright (c) 1995, 1997 Apple Computer, Inc. All Rights Reserved */
@@ -112,18 +112,19 @@ struct label;             /* MAC label dummy struct */
 /*
  *     Per-thread U area.
  */
+
 struct uthread {
        /* syscall parameters, results and catches */
        u_int64_t uu_arg[8]; /* arguments to current system call */
-    int uu_rval[2];
+       int uu_rval[2];
+       char uu_cursig; /* p_cursig for exc. */
        unsigned int syscall_code; /* current syscall code */
 
        /* thread exception handling */
+       int     uu_exception;
        mach_exception_code_t uu_code;  /* ``code'' to trap */
        mach_exception_subcode_t uu_subcode;
-       int     uu_exception;
-       char uu_cursig;                 /* p_cursig for exc. */
+
        /* support for syscalls which use continuations */
        union {
                struct _select_data {
@@ -132,102 +133,165 @@ struct uthread {
                        int count;
                        struct select_nocancel_args *args;  /* original syscall arguments */
                        int32_t *retval;                    /* place to store return val */
-               } ss_select_data;
+               } uus_select_data;
+
                struct _kqueue_scan {
                        kevent_callback_t call;             /* per-event callback */
                        kqueue_continue_t cont;             /* whole call continuation */
                        filt_process_data_t process_data;   /* needed for filter processing */
                        uint64_t deadline;                  /* computed deadline for operation */
                        void *data;                         /* caller's private data */
-               } ss_kqueue_scan;                           /* saved state for kevent_scan() */
+               } uus_kqueue_scan;                       /* saved state for kevent_scan() */
+
                struct _kevent {
                        struct _kqueue_scan scan;           /* space for the generic data */
                        struct fileproc *fp;                /* fileproc we hold iocount on */
                        int fd;                             /* fd for fileproc (if held) */
-                       int eventcount;                     /* user-level event count */
+                       int eventcount;                     /* user-level event count */
                        int eventout;                       /* number of events output */
                        struct filt_process_s process_data; /* space for process data fed thru */
                        int32_t *retval;                    /* place to store return val */
                        user_addr_t eventlist;              /* user-level event list address */
                        uint64_t data_available;            /* [user/kernel] addr of in/out size */
-               } ss_kevent;                     /* saved state for kevent() */
+               } uus_kevent;                            /* saved state for kevent() */
+
+               struct _kevent_register {
+                       struct kevent_internal_s kev;       /* the kevent to maybe copy out */
+                       struct knote *knote;                /* the knote used for the wait */
+                       struct fileproc *fp;                /* fileproc we hold iocount on */
+                       thread_t handoff_thread;            /* thread we handed off to, has +1 */
+                       struct kqueue *kq;
+                       int fd;                             /* fd for fileproc (if held) */
+                       int eventcount;                     /* user-level event count */
+                       int eventout;                       /* number of events output */
+                       unsigned int flags;                 /* flags for kevent_copyout() */
+                       int32_t *retval;                    /* place to store return val */
+                       user_addr_t ueventlist;             /* the user-address to copyout to */
+               } uus_kevent_register;                   /* saved for EVFILT_WORKLOOP wait */
 
                struct _kauth {
-                       user_addr_t message;    /* message in progress */
-               } uu_kauth;
+                       user_addr_t message;                /* message in progress */
+               } uus_kauth;
 
-               struct ksyn_waitq_element  uu_kwe;              /* user for pthread synch */
+               struct ksyn_waitq_element uus_kwe;       /* user for pthread synch */
 
                struct _waitid_data {
-                       struct waitid_nocancel_args *args;      /* original syscall arguments */
-                       int32_t *retval;                        /* place to store return val */
-               } uu_waitid_data;
+                       struct waitid_nocancel_args *args;  /* original syscall arguments */
+                       int32_t *retval;                    /* place to store return val */
+               } uus_waitid_data;
 
                struct _wait4_data {
-                       struct wait4_nocancel_args *args;       /* original syscall arguments */
-                       int32_t *retval;                        /* place to store return val */
-               } uu_wait4_data;
-       } uu_kevent;
+                       struct wait4_nocancel_args *args;   /* original syscall arguments */
+                       int32_t *retval;                    /* place to store return val */
+               } uus_wait4_data;
+
+               struct _workq_park_data {
+                       uint64_t idle_stamp;
+                       uint64_t workloop_params;
+                       uint32_t fulfilled_snapshot;
+                       uint32_t yields;
+                       void *thread_request;                /* request being fulfilled, for tracing only */
+                       uint32_t upcall_flags;
+                       bool has_stack;
+                       thread_qos_t qos;
+               } uus_workq_park_data;                   /* saved for parked workq threads */
+
+               struct _ulock_wait_data {
+                       thread_t owner_thread;
+                       thread_t old_owner;
+                       int32_t *retval;
+                       uint flags;
+               } uus_ulock_wait_data;
+       } uu_save;
 
        /* Persistent memory allocations across system calls */
        struct _select {
-                       u_int32_t       *ibits, *obits; /* bits to select on */
-                       uint    nbytes; /* number of bytes in ibits and obits */
+               u_int32_t       *ibits, *obits; /* bits to select on */
+               uint    nbytes; /* number of bytes in ibits and obits */
        } uu_select;                    /* saved state for select() */
 
-  /* internal support for continuation framework */
-    int (*uu_continuation)(int);
-    int uu_pri;
-    int uu_timo;
+       /* internal support for continuation framework */
+       int (*uu_continuation)(int);
+       int uu_pri;
+       int uu_timo;
        caddr_t uu_wchan;                       /* sleeping thread wait channel */
        const char *uu_wmesg;                   /* ... wait message */
-       struct proc * uu_proc;
+       struct proc *uu_proc;
        thread_t uu_thread;
        void * uu_userstate;
        struct waitq_set *uu_wqset;             /* waitq state cached across select calls */
        size_t uu_wqstate_sz;                   /* ...size of uu_wqset buffer */
        int uu_flag;
        sigset_t uu_siglist;                            /* signals pending for the thread */
-       sigset_t  uu_sigwait;                           /*  sigwait on this thread*/
-       sigset_t  uu_sigmask;                           /* signal mask for the thread */
-       sigset_t  uu_oldmask;                           /* signal mask saved before sigpause */
-       sigset_t  uu_vforkmask;                         /* saved signal mask during vfork */
+       sigset_t uu_sigwait;                            /*  sigwait on this thread*/
+       sigset_t uu_sigmask;                            /* signal mask for the thread */
+       sigset_t uu_oldmask;                            /* signal mask saved before sigpause */
+       sigset_t uu_vforkmask;                          /* saved signal mask during vfork */
        struct vfs_context uu_context;                  /* thread + cred */
 
        TAILQ_ENTRY(uthread) uu_list;           /* List of uthreads in proc */
 
-       struct kaudit_record    *uu_ar;                 /* audit record */
+       struct kaudit_record    *uu_ar;                 /* audit record */
        struct task*    uu_aio_task;                    /* target task for async io */
-    
+
        lck_mtx_t       *uu_mtx;
 
        lck_spin_t      uu_rethrottle_lock;     /* locks was_rethrottled and is_throttled */
        TAILQ_ENTRY(uthread) uu_throttlelist;   /* List of uthreads currently throttled */
-       void    *       uu_throttle_info;       /* pointer to throttled I/Os info */
+       void    *       uu_throttle_info;       /* pointer to throttled I/Os info */
        int             uu_on_throttlelist;
        int             uu_lowpri_window;
-       boolean_t       uu_was_rethrottled;
-       boolean_t       uu_is_throttled;
-       boolean_t       uu_throttle_bc;
+       /* These boolean fields are protected by different locks */
+       bool            uu_was_rethrottled;
+       bool            uu_is_throttled;
+       bool            uu_throttle_bc;
 
        u_int32_t       uu_network_marks;       /* network control flow marks */
 
        struct kern_sigaltstack uu_sigstk;
-        vnode_t                uu_vreclaims;
+       vnode_t         uu_vreclaims;
        vnode_t         uu_cdir;                /* per thread CWD */
        int             uu_dupfd;               /* fd in fdesc_open/dupfdopen */
-        int            uu_defer_reclaims;
-
-       struct kqueue *uu_kqueue_bound;           /* kqueue we are bound to service */
-       unsigned int uu_kqueue_qos_index;         /* qos index we are bound to service */
-       unsigned int uu_kqueue_flags;             /* the flags we are using */
-       boolean_t uu_kqueue_override_is_sync;     /* sync qos override applied to servicer */
+       int             uu_defer_reclaims;
+
+       /*
+        * Bound kqueue request. This field is only cleared by the current thread,
+        * hence can be dereferenced safely by the current thread without locks.
+        */
+       struct kqrequest *uu_kqr_bound;
+       TAILQ_ENTRY(uthread) uu_workq_entry;
+       mach_vm_offset_t uu_workq_stackaddr;
+       mach_port_name_t uu_workq_thport;
+       struct uu_workq_policy {
+               uint16_t qos_req : 4;         /* requested QoS */
+               uint16_t qos_max : 4;         /* current acked max qos */
+               uint16_t qos_override : 4;    /* received async override */
+               uint16_t qos_bucket : 4;      /* current acked bucket */
+       } uu_workq_pri;
+       uint8_t uu_workq_flags;
+       kq_index_t uu_kqueue_override;
 
 #ifdef JOE_DEBUG
-        int            uu_iocount;
-        int            uu_vpindex;
-        void   *       uu_vps[32];
-        void    *       uu_pcs[32][10];
+       int             uu_iocount;
+       int             uu_vpindex;
+       void    *uu_vps[32];
+       void    *uu_pcs[32][10];
+#endif
+#if CONFIG_WORKLOOP_DEBUG
+#define UU_KEVENT_HISTORY_COUNT 32
+#define UU_KEVENT_HISTORY_WRITE_ENTRY(uth, ...)  ({ \
+               struct uthread *__uth = (uth); \
+               unsigned int __index = __uth->uu_kevent_index++; \
+               __uth->uu_kevent_history[__index % UU_KEVENT_HISTORY_COUNT] = \
+                               (struct uu_kevent_history)__VA_ARGS__; \
+       })
+       struct uu_kevent_history {
+               uint64_t uu_kqid;
+               struct kqueue *uu_kq;
+               int uu_error, uu_nchanges, uu_nevents;
+               unsigned int uu_flags;
+       } uu_kevent_history[UU_KEVENT_HISTORY_COUNT];
+       unsigned int uu_kevent_index;
 #endif
        int             uu_proc_refcount;
 #if PROC_REF_DEBUG
@@ -241,22 +305,22 @@ struct uthread {
 #if CONFIG_DTRACE
        uint32_t        t_dtrace_errno; /* Most recent errno */
        siginfo_t       t_dtrace_siginfo;
-        uint64_t        t_dtrace_resumepid; /* DTrace's pidresume() pid */
-        uint8_t         t_dtrace_stop;  /* indicates a DTrace desired stop */
-        uint8_t         t_dtrace_sig;   /* signal sent via DTrace's raise() */
-                            
-        union __tdu {
-                struct __tds {
-                        uint8_t _t_dtrace_on;   /* hit a fasttrap tracepoint */
-                        uint8_t _t_dtrace_step; /* about to return to kernel */
-                        uint8_t _t_dtrace_ret;  /* handling a return probe */
-                        uint8_t _t_dtrace_ast;  /* saved ast flag */
+       uint64_t        t_dtrace_resumepid; /* DTrace's pidresume() pid */
+       uint8_t         t_dtrace_stop;  /* indicates a DTrace desired stop */
+       uint8_t         t_dtrace_sig;   /* signal sent via DTrace's raise() */
+
+       union __tdu {
+               struct __tds {
+                       uint8_t _t_dtrace_on;   /* hit a fasttrap tracepoint */
+                       uint8_t _t_dtrace_step; /* about to return to kernel */
+                       uint8_t _t_dtrace_ret;  /* handling a return probe */
+                       uint8_t _t_dtrace_ast;  /* saved ast flag */
 #if __sol64 || defined(__APPLE__)
-                        uint8_t _t_dtrace_reg;  /* modified register */
+                       uint8_t _t_dtrace_reg;  /* modified register */
 #endif
-                } _tds;
-                u_int32_t _t_dtrace_ft;           /* bitwise or of these flags */
-        } _tdu;
+               } _tds;
+               u_int32_t _t_dtrace_ft;           /* bitwise or of these flags */
+       } _tdu;
 #define t_dtrace_ft     _tdu._t_dtrace_ft
 #define t_dtrace_on     _tdu._tds._t_dtrace_on
 #define t_dtrace_step   _tdu._tds._t_dtrace_step
@@ -266,20 +330,19 @@ struct uthread {
 #define t_dtrace_reg    _tdu._tds._t_dtrace_reg
 #endif
 
-        user_addr_t    t_dtrace_pc;    /* DTrace saved pc from fasttrap */
-        user_addr_t    t_dtrace_npc;   /* DTrace next pc from fasttrap */
-        user_addr_t    t_dtrace_scrpc; /* DTrace per-thread scratch location */
-        user_addr_t    t_dtrace_astpc; /* DTrace return sequence location */
+       user_addr_t     t_dtrace_pc;    /* DTrace saved pc from fasttrap */
+       user_addr_t     t_dtrace_npc;   /* DTrace next pc from fasttrap */
+       user_addr_t     t_dtrace_scrpc; /* DTrace per-thread scratch location */
+       user_addr_t     t_dtrace_astpc; /* DTrace return sequence location */
 
        struct dtrace_ptss_page_entry*  t_dtrace_scratch; /* scratch space entry */
 
 #if __sol64 || defined(__APPLE__)
-        uint64_t        t_dtrace_regv;  /* DTrace saved reg from fasttrap */
+       uint64_t        t_dtrace_regv;  /* DTrace saved reg from fasttrap */
 #endif
-       void *          t_dtrace_syscall_args;
+       void *t_dtrace_syscall_args;
 #endif /* CONFIG_DTRACE */
-       void *          uu_threadlist;
-       char *          pth_name;
+       char *pth_name;
 
        /* Document Tracking struct used to track a "tombstone" for a document */
        struct doc_tombstone *t_tombstone;
@@ -300,10 +363,10 @@ typedef struct uthread * uthread_t;
 #define UT_THROTTLE_IO 0x00000080      /* this thread issues throttle I/O */
 #define UT_PASSIVE_IO  0x00000100      /* this thread issues passive I/O */
 #define UT_PROCEXIT    0x00000200      /* this thread completed the  proc exit */
-#define UT_RAGE_VNODES 0x00000400      /* rapid age any vnodes created by this thread */       
-/* 0x00000800 unused, used to be UT_BACKGROUND */
+#define UT_RAGE_VNODES 0x00000400      /* rapid age any vnodes created by this thread */
+#define UT_KERN_RAGE_VNODES    0x00000800      /* rapid age any vnodes created by this thread (kernel set) */
 /* 0x00001000 unused, used to be UT_BACKGROUND_TRAFFIC_MGT */
-
+#define        UT_ATIME_UPDATE 0x00002000      /* don't update atime for files accessed by this thread */
 #define        UT_VFORK        0x02000000      /* thread has vfork children */
 #define        UT_SETUID       0x04000000      /* thread is settugid() */
 #define UT_WASSETUID   0x08000000      /* thread was settugid() (in vfork) */
@@ -321,9 +384,9 @@ typedef struct uthread * uthread_t;
  * This structure may or may not be at the same kernel address
  * in all processes.
  */
+
 struct user {
-  /* NOT USED ANYMORE */
+       /* NOT USED ANYMORE */
 };
 
 #endif /* !_SYS_USER_H_ */
index 836c658cae84d455d2d33ace7848c7a8bb86ef20..99352e29ad3f2fce500aad1b1f8033407f1342c7 100644 (file)
 
 #endif /* __APPLE_API_UNSTABLE */
 
-#ifdef KERNEL
-#ifdef __APPLE_API_PRIVATE
-/*
- *     Kernel data structures for Unix exception handler.
- */
+#ifdef XNU_KERNEL_PRIVATE
 
-#include <mach/port.h>
+/* Kernel functions for Unix exception handler. */
 
-#if defined(__x86_64__) || defined(__arm64__)
-extern mach_port_t                     ux_exception_port;
-#else
-extern mach_port_name_t                        ux_exception_port;
-#endif /* __x86_64__ */
+#include <mach/mach_types.h>
 
-boolean_t      machine_exception(int exception, mach_exception_code_t code, 
-                       mach_exception_subcode_t subcode,
-                       int *unix_signal, mach_exception_code_t *unix_code);
-void   ux_handler_init(void);
+extern int
+machine_exception(int exception, mach_exception_code_t code,
+                  mach_exception_subcode_t subcode);
 
-#endif /* __APPLE_API_PRIVATE */
-#endif /* KERNEL */
+extern kern_return_t
+handle_ux_exception(thread_t thread, int exception,
+                    mach_exception_code_t code,
+                    mach_exception_subcode_t subcode);
+
+#endif /* XNU_KERNEL_PRIVATE */
 
 #endif /* _SYS_UX_EXCEPTION_H_ */
+
index 74e0704e83ae8dae5fb37743a537638f9f1d2a80..b7aa1efe8f7ad3906418f16fbd80e3f21db69abd 100644 (file)
@@ -1102,6 +1102,17 @@ int      vnode_isswap(vnode_t vp);
  */
 int    vnode_isnamedstream(vnode_t vp);
 
+#ifdef KERNEL_PRIVATE
+/*!
+ @function vnode_setasnamedstream
+ @abstract Set svp as a named stream of vp and take appropriate references.
+ @param vp The vnode whose namedstream has to be set.
+ @param svp The namedstream vnode.
+ @return 0 if the operation is successful, an error otherwise.
+ */
+errno_t        vnode_setasnamedstream(vnode_t vp, vnode_t svp);
+#endif
+
 /*!
  @function vnode_ismountedon
  @abstract Determine if a vnode is a block device on which a filesystem has been mounted.
@@ -1653,6 +1664,7 @@ int       vn_getpath_fsenter_with_parent(struct vnode *dvp, struct vnode *vp, char *pa
 #endif /* KERNEL_PRIVATE */
 
 #define        VNODE_UPDATE_PARENT     0x01
+#define        VNODE_UPDATE_NAMEDSTREAM_PARENT VNODE_UPDATE_PARENT
 #define        VNODE_UPDATE_NAME       0x02
 #define        VNODE_UPDATE_CACHE      0x04
 #define VNODE_UPDATE_PURGE     0x08
@@ -2171,6 +2183,16 @@ const char *vnode_getname_printable(vnode_t vp);
 void vnode_putname_printable(const char *name);
 #endif // KERNEL_PRIVATE
 
+/*!
+ @function vnode_getbackingvnode
+ @abstract If the input vnode is a NULLFS mirrored vnode, then return the vnode it wraps.
+ @discussion Used to un-mirror files, primarily for security purposes. On success, *out_vpp is always set to a vnode holding an iocount, which the caller must release.
+ @param in_vp The vnode being asked about.
+ @param out_vpp A pointer to the output vnode; unchanged on error.
+ @return 0 on success, ENOENT if in_vp doesn't mirror anything, EINVAL on parameter errors.
+ */
+int vnode_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp);
+
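
A hedged usage sketch for vnode_getbackingvnode(); the fallback for the ENOENT case is the hypothetical caller's choice, not something this header prescribes:

#include <sys/errno.h>
#include <sys/vnode.h>

static int
example_unwrap_nullfs(vnode_t vp, vnode_t *realvpp)
{
	int err = vnode_getbackingvnode(vp, realvpp);

	if (err == ENOENT) {
		/* Not a NULLFS mirror: fall back to vp itself. No new iocount
		 * was taken, so there is nothing extra to release. */
		*realvpp = vp;
		return 0;
	}
	/* On success *realvpp holds an iocount that the caller must later
	 * drop with vnode_put(). */
	return err;
}
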
 /*
  * Helper functions for implementing VNOP_GETATTRLISTBULK for a filesystem
  */
index d06102237dc2fe0015340f7285a489a56f7f57e6..b34dbc110e1694dc824b88e9f494ddeb532292cc 100644 (file)
@@ -253,9 +253,7 @@ struct vnode {
 #define        VLOCKLOCAL      0x080000        /* this vnode does adv locking in vfs */
 #define        VISHARDLINK     0x100000        /* hard link needs special processing on lookup and in volfs */
 #define        VISUNION        0x200000        /* union special processing */
-#if NAMEDSTREAMS
 #define        VISNAMEDSTREAM  0x400000        /* vnode is a named stream (eg HFS resource fork) */
-#endif
 #define VOPENEVT        0x800000        /* if process is P_CHECKOPENEVT, then or in the O_EVTONLY flag on open */
 #define VNEEDSSNAPSHOT 0x1000000
 #define VNOCS         0x2000000        /* is there no code signature available */
@@ -444,6 +442,9 @@ int         vn_authorize_rename(struct vnode *fdvp, struct vnode *fvp, struct component
 int vn_authorize_renamex(struct vnode *fdvp,  struct vnode *fvp,  struct componentname *fcnp,
                                                 struct vnode *tdvp,  struct vnode *tvp,  struct componentname *tcnp,
                                                 vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved);
+int vn_authorize_renamex_with_paths(struct vnode *fdvp,  struct vnode *fvp,  struct componentname *fcnp, const char *from_path,
+                                                struct vnode *tdvp,  struct vnode *tvp,  struct componentname *tcnp, const char *to_path,
+                                                vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved);
 int    vn_authorize_rmdir(vnode_t dvp, vnode_t vp, struct componentname *cnp, vfs_context_t ctx, void *reserved);
 
 typedef int (*vn_create_authorizer_t)(vnode_t, struct componentname *, struct vnode_attr *, vfs_context_t, void*);
@@ -479,7 +480,7 @@ errno_t  vnode_verifynamedstream (vnode_t vp);
 
 
 void   nchinit(void);
-int    resize_namecache(uint32_t newsize);
+int    resize_namecache(int newsize);
 void   name_cache_lock_shared(void);
 void   name_cache_lock(void);
 void   name_cache_unlock(void);
index ae881a0f91d7bc429d1cc75bad64f5f005deba9f..797f929bc751935cf5c93ac9c967b4124a194e56 100644 (file)
@@ -117,6 +117,7 @@ __BEGIN_DECLS
 #define WORK_INTERVAL_TYPE_COREANIMATION        (0x2 << 28)
 #define WORK_INTERVAL_TYPE_CA_RENDER_SERVER     (0x2 << 28)
 #define WORK_INTERVAL_TYPE_CA_CLIENT            (0x3 << 28)
+#define WORK_INTERVAL_TYPE_HID_DELIVERY         (0x4 << 28)
 #define WORK_INTERVAL_TYPE_LAST                 (0xF << 28)
 
 #ifndef KERNEL
diff --git a/bsd/tests/bsd_tests.c b/bsd/tests/bsd_tests.c
new file mode 100644 (file)
index 0000000..dfb3791
--- /dev/null
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <tests/ktest.h>
+#include <tests/xnupost.h>
+#include <kern/assert.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/types.h>
+#include <sys/kdebug.h>
+#include <libkern/libkern.h>
+#include <kern/kalloc.h>
+#include <sys/cdefs.h>
+#include <libkern/version.h>
+#include <kern/clock.h>
+#include <kern/kern_cdata.h>
+#include <pexpert/pexpert.h>
+
+
+#if !(DEVELOPMENT || DEBUG)
+#error "Testing is not enabled on RELEASE configurations"
+#endif
+
+#ifdef __arm64__
+extern kern_return_t arm64_lock_test(void);
+#endif
+kern_return_t kalloc_test(void);
+kern_return_t ipi_test(void);
+
+struct xnupost_test bsd_post_tests[] = {
+#ifdef __arm64__
+       XNUPOST_TEST_CONFIG_BASIC(arm64_lock_test),
+#endif
+       XNUPOST_TEST_CONFIG_BASIC(kalloc_test),
+       XNUPOST_TEST_CONFIG_BASIC(ipi_test)
+};
+
+uint32_t bsd_post_tests_count = sizeof(bsd_post_tests) / sizeof(xnupost_test_data_t);
+
+extern uint64_t last_loaded_timestamp; /* updated by OSKext::load() */
+extern uint64_t kernel_post_args;
+int
+bsd_list_tests()
+{
+       if (kernel_post_args == 0) {
+               return 0;
+       }
+
+       uint64_t prev_load_time    = last_loaded_timestamp;
+       int no_load_counter        = 5;
+       int absolute_break_counter = 15;
+       int delay_duration_usecs   = 300000; /* 0.3 second for kext loading to stabilize */
+
+       while (no_load_counter > 0) {
+               printf("bsd_list_tests:INFO waiting for %d usecs\n", delay_duration_usecs);
+               printf("bsd_list_tests: prev: %llu current: %llu\n", prev_load_time, last_loaded_timestamp);
+
+               delay(delay_duration_usecs);
+               absolute_break_counter -= 1;
+
+               if (absolute_break_counter <= 0) {
+                       printf("bsd_list_tests: WARNING: Waiting beyond normal time for stabilizing kext loading\n");
+                       break;
+               }
+
+               if (prev_load_time == last_loaded_timestamp) {
+                       no_load_counter -= 1;
+                       printf("bsd_list_tests: INFO: no new kexts loaded. remaining checks: %d\n", no_load_counter);
+               }
+
+               prev_load_time = last_loaded_timestamp;
+       }
+
+       return xnupost_list_tests(bsd_post_tests, bsd_post_tests_count);
+}
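
With delay_duration_usecs = 300000, each pass of the loop above sleeps 0.3 s; the loop exits after 5 consecutive passes with no new kext load, or unconditionally after 15 passes, so listing the POST tests is delayed by roughly 1.5 s in the common case and by at most about 15 × 0.3 s = 4.5 s.
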
+
+int
+bsd_do_post()
+{
+       return xnupost_run_tests(bsd_post_tests, bsd_post_tests_count);
+}
+
+kern_return_t
+kalloc_test()
+{
+       uint64_t * data_ptr;
+       size_t alloc_size;
+
+       T_LOG("Running kalloc test.\n");
+
+       alloc_size = sizeof(uint64_t);
+       data_ptr = kalloc(alloc_size);
+       T_ASSERT_NOTNULL(data_ptr, "kalloc sizeof(uint64_t) return not null");
+       kfree(data_ptr, alloc_size);
+
+       alloc_size = 3544;
+       data_ptr = kalloc(alloc_size);
+       T_ASSERT_NOTNULL(data_ptr, "kalloc 3544 return not null");
+       kfree(data_ptr, alloc_size);
+
+       return KERN_SUCCESS;
+}
+
+/* kcdata type definition */
+#define XNUPOST_TNAME_MAXLEN 132
+
+struct kcdata_subtype_descriptor kc_xnupost_test_def[] = {
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 0, sizeof(uint16_t), "config"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT16, 1 * sizeof(uint16_t), sizeof(uint16_t), "test_num"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t), sizeof(int32_t), "retval"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_INT32, 2 * sizeof(uint16_t) + sizeof(int32_t), sizeof(int32_t), "expected_retval"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)), sizeof(uint64_t), "begin_time"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 2 * (sizeof(uint16_t) + sizeof(int32_t)) + sizeof(uint64_t), sizeof(uint64_t), "end_time"},
+    {KCS_SUBTYPE_FLAGS_ARRAY,
+     KC_ST_CHAR,
+     2 * (sizeof(uint16_t) + sizeof(int32_t) + sizeof(uint64_t)),
+     KCS_SUBTYPE_PACK_SIZE(XNUPOST_TNAME_MAXLEN * sizeof(char), sizeof(char)),
+     "test_name"}};
+
+const uint32_t kc_xnupost_test_def_count = sizeof(kc_xnupost_test_def) / sizeof(struct kcdata_subtype_descriptor);
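
The subtype descriptors above describe a packed record; reconstructed here as a hypothetical C struct purely for illustration (the kernel never declares this struct, it only emits the layout through kcdata):

    /* Illustrative only: the layout implied by kc_xnupost_test_def. */
    struct xnupost_test_record {
            uint16_t config;                           /* offset  0 */
            uint16_t test_num;                         /* offset  2 */
            int32_t  retval;                           /* offset  4 */
            int32_t  expected_retval;                  /* offset  8 */
            uint64_t begin_time;                       /* offset 12 (packed, not naturally aligned) */
            uint64_t end_time;                         /* offset 20 */
            char     test_name[XNUPOST_TNAME_MAXLEN];  /* offset 28, 132 bytes */
    } __attribute__((packed));                         /* element size: 28 + 132 = 160 bytes */
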
+
+kern_return_t xnupost_copyout_test(xnupost_test_t t, mach_vm_address_t outaddr);
+
+int
+xnupost_copyout_test(xnupost_test_t t, mach_vm_address_t outaddr)
+{
+       /* code to copyout test config */
+       int kret         = 0;
+       uint32_t namelen = 0;
+
+       kret = copyout(&t->xt_config, outaddr, sizeof(uint16_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint16_t);
+
+       kret = copyout(&t->xt_test_num, outaddr, sizeof(uint16_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint16_t);
+
+       kret = copyout(&t->xt_retval, outaddr, sizeof(uint32_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint32_t);
+
+       kret = copyout(&t->xt_expected_retval, outaddr, sizeof(uint32_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint32_t);
+
+       kret = copyout(&t->xt_begin_time, outaddr, sizeof(uint64_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint64_t);
+
+       kret = copyout(&t->xt_end_time, outaddr, sizeof(uint64_t));
+       if (kret)
+               return kret;
+       outaddr += sizeof(uint64_t);
+
+       namelen = strnlen(t->xt_name, XNUPOST_TNAME_MAXLEN);
+       kret = copyout(t->xt_name, outaddr, namelen);
+       if (kret)
+               return kret;
+       outaddr += namelen;
+
+       return 0;
+}
+
+uint32_t
+xnupost_get_estimated_testdata_size(void)
+{
+       uint32_t total_tests = bsd_post_tests_count + kernel_post_tests_count;
+       uint32_t elem_size = kc_xnupost_test_def[kc_xnupost_test_def_count - 1].kcs_elem_offset +
+                            kcs_get_elem_size(&kc_xnupost_test_def[kc_xnupost_test_def_count - 1]);
+       uint32_t retval = 1024; /* account for type definition and mach timebase */
+       retval += 1024;         /* kernel version and boot-args string data */
+       retval += (total_tests * elem_size);
+
+       return retval;
+}
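
A quick worked estimate using the 160-byte element implied by the descriptor above: with, say, 20 tests in total (a made-up count), the routine returns 1024 + 1024 + 20 × 160 = 5248 bytes.
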
+
+int
+xnupost_export_testdata(void * outp, uint32_t size, uint32_t * lenp)
+{
+       struct kcdata_descriptor kcd;
+       mach_vm_address_t user_addr        = 0;
+       mach_vm_address_t tmp_entry_addr   = 0;
+       kern_return_t kret                 = 0;
+       uint32_t i                         = 0;
+       char kctype_name[32]               = "xnupost_test_config";
+       mach_timebase_info_data_t timebase = {0, 0};
+       uint32_t length_to_copy            = 0;
+
+#define RET_IF_OP_FAIL                                                                                       \
+       do {                                                                                                     \
+               if (kret != KERN_SUCCESS) {                                                                          \
+                       return (kret == KERN_NO_ACCESS) ? EACCES : ((kret == KERN_RESOURCE_SHORTAGE) ? ENOMEM : EINVAL); \
+               }                                                                                                    \
+       } while (0)
+
+       kret = kcdata_memory_static_init(&kcd, (mach_vm_address_t)outp, KCDATA_BUFFER_BEGIN_XNUPOST_CONFIG, size, KCFLAG_USE_COPYOUT);
+       RET_IF_OP_FAIL;
+
+       /* add mach timebase info */
+       clock_timebase_info(&timebase);
+       kret = kcdata_get_memory_addr(&kcd, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &user_addr);
+       RET_IF_OP_FAIL;
+       kret = copyout(&timebase, user_addr, sizeof(timebase));
+       RET_IF_OP_FAIL;
+
+       /* save boot-args and osversion string */
+       length_to_copy = MIN((uint32_t)(strlen(version) + 1), OSVERSIZE);
+       kret           = kcdata_get_memory_addr(&kcd, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, &user_addr);
+       RET_IF_OP_FAIL;
+       kret = copyout(&version[0], user_addr, length_to_copy);
+       RET_IF_OP_FAIL;
+
+       length_to_copy = MIN((uint32_t)(strlen(PE_boot_args()) + 1), OSVERSIZE);
+       kret           = kcdata_get_memory_addr(&kcd, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, &user_addr);
+       RET_IF_OP_FAIL;
+       kret = copyout(PE_boot_args(), user_addr, length_to_copy);
+       RET_IF_OP_FAIL;
+
+       /* add type definition to buffer */
+       kret = kcdata_add_type_definition(&kcd, XNUPOST_KCTYPE_TESTCONFIG, kctype_name, &kc_xnupost_test_def[0],
+                                         kc_xnupost_test_def_count);
+       RET_IF_OP_FAIL;
+
+       /* add the tests to buffer as array */
+       uint32_t total_tests = bsd_post_tests_count + kernel_post_tests_count;
+       uint32_t elem_size = kc_xnupost_test_def[kc_xnupost_test_def_count - 1].kcs_elem_offset +
+                            kcs_get_elem_size(&kc_xnupost_test_def[kc_xnupost_test_def_count - 1]);
+
+       kret = kcdata_get_memory_addr_for_array(&kcd, XNUPOST_KCTYPE_TESTCONFIG, elem_size, total_tests, &user_addr);
+       RET_IF_OP_FAIL;
+
+       for (i = 0; i < bsd_post_tests_count; i++) {
+               tmp_entry_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size));
+               kret           = xnupost_copyout_test(&bsd_post_tests[i], tmp_entry_addr);
+               RET_IF_OP_FAIL;
+       }
+       user_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size));
+
+       for (i = 0; i < kernel_post_tests_count; i++) {
+               tmp_entry_addr = (mach_vm_address_t)((uint64_t)(user_addr) + (uint64_t)(i * elem_size));
+               kret           = xnupost_copyout_test(&kernel_post_tests[i], tmp_entry_addr);
+               RET_IF_OP_FAIL;
+       }
+
+       if (kret == KERN_SUCCESS && lenp != NULL)
+               *lenp = (uint32_t)kcdata_memory_get_used_bytes(&kcd);
+       RET_IF_OP_FAIL;
+
+#undef RET_IF_OP_FAIL
+       return kret;
+}
+
+int
+xnupost_reset_all_tests(void)
+{
+       xnupost_reset_tests(&bsd_post_tests[0], bsd_post_tests_count);
+       xnupost_reset_tests(&kernel_post_tests[0], kernel_post_tests_count);
+       return 0;
+}
diff --git a/bsd/tests/ctrr_test_sysctl.c b/bsd/tests/ctrr_test_sysctl.c
new file mode 100644 (file)
index 0000000..ca1056f
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/sysctl.h>
+
diff --git a/bsd/tests/pmap_test_sysctl.c b/bsd/tests/pmap_test_sysctl.c
new file mode 100644 (file)
index 0000000..d128037
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/sysctl.h>
+
+extern kern_return_t test_pmap_enter_disconnect(unsigned int);
+extern kern_return_t test_pmap_iommu_disconnect(void);
+
+static int
+sysctl_test_pmap_enter_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       unsigned int num_loops;
+       int error, changed;
+       error = sysctl_io_number(req, 0, sizeof(num_loops), &num_loops, &changed);
+       if (error || !changed)
+               return error;
+       return test_pmap_enter_disconnect(num_loops);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, pmap_enter_disconnect_test,
+        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+        0, 0, sysctl_test_pmap_enter_disconnect, "I", "");
+
+static int
+sysctl_test_pmap_iommu_disconnect(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused int arg2, struct sysctl_req *req)
+{
+       unsigned int run = 0;
+       int error, changed;
+       error = sysctl_io_number(req, 0, sizeof(run), &run, &changed);
+       if (error || !changed)
+               return error;
+       return test_pmap_iommu_disconnect();
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, pmap_iommu_disconnect_test,
+        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
+        0, 0, sysctl_test_pmap_iommu_disconnect, "I", "");
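
Since these handlers only exist on DEVELOPMENT/DEBUG kernels, a plausible user-space trigger is a plain sysctl write; a sketch using sysctlbyname (the sysctl name is taken from the SYSCTL_PROC declaration above, everything else is assumed):

    /* Hypothetical userspace trigger for the pmap_enter disconnect test. */
    #include <sys/sysctl.h>

    int
    run_pmap_enter_disconnect_test(unsigned int loops)
    {
            /* Writing a value invokes test_pmap_enter_disconnect(loops) in the kernel. */
            return sysctlbyname("kern.pmap_enter_disconnect_test",
                                NULL, NULL, &loops, sizeof(loops));
    }
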
index 65524909a2112bae2e96696a272da14f5187d6a1..f751dc2ed91fd9d7c7bbbe7b821ff67ca08c918f 100644 (file)
@@ -62,6 +62,8 @@ void uuid_generate(uuid_t out);
 void uuid_generate_random(uuid_t out);
 void uuid_generate_time(uuid_t out);
 
+void uuid_generate_early_random(uuid_t out);
+
 int uuid_is_null(const uuid_t uu);
 
 int uuid_parse(const uuid_string_t in, uuid_t uu);
index 21bd3eec9535affb2aebfb98e9fd4a0a14633770..b69437f3dad3d5adb070130c20a343d4a6eb3547 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1987 Carnegie-Mellon University
  * All rights reserved.  The CMU software License Agreement specifies
  * the terms and conditions for use and redistribution.
  */
 
-/*
- *********************************************************************
- * HISTORY
- **********************************************************************
- */
-
 #include <sys/param.h>
 
 #include <mach/boolean.h>
 #include <mach/exception.h>
 #include <mach/kern_return.h>
-#include <mach/message.h>
-#include <mach/port.h>
-#include <mach/mach_port.h>
-#include <mach/mig_errors.h>
-#include <mach/exc_server.h>
-#include <mach/mach_exc_server.h>
-#include <kern/task.h>
-#include <kern/thread.h>
-#include <kern/sched_prim.h>
-#include <kern/kalloc.h>
 
 #include <sys/proc.h>
 #include <sys/user.h>
 #include <sys/systm.h>
-#include <sys/ux_exception.h>
-#include <sys/vmparam.h>       /* MAXSSIZ */
-
-#include <vm/vm_protos.h>      /* get_task_ipcspace() */
-/*
- * XXX Things that should be retrieved from Mach headers, but aren't
- */
-struct ipc_object;
-extern kern_return_t ipc_object_copyin(ipc_space_t space, mach_port_name_t name,
-               mach_msg_type_name_t msgt_name, struct ipc_object **objectp);
-extern mach_msg_return_t mach_msg_receive(mach_msg_header_t *msg,
-               mach_msg_option_t option, mach_msg_size_t rcv_size,
-               mach_port_name_t rcv_name, mach_msg_timeout_t rcv_timeout,
-               void (*continuation)(mach_msg_return_t),
-               mach_msg_size_t slist_size);
-extern mach_msg_return_t mach_msg_send(mach_msg_header_t *msg,
-               mach_msg_option_t option, mach_msg_size_t send_size,
-               mach_msg_timeout_t send_timeout, mach_port_name_t notify);
-extern thread_t convert_port_to_thread(ipc_port_t port);
-extern void ipc_port_release_send(ipc_port_t port);
-
-
+#include <sys/vmparam.h>        /* MAXSSIZ */
 
+#include <sys/ux_exception.h>
 
 /*
- *     Unix exception handler.
+ * Translate Mach exceptions to UNIX signals.
+ *
+ * ux_exception translates a Mach exception, code and subcode to
+ * a signal.  It calls machine_exception() (machine dependent)
+ * to attempt the translation first.
  */
-
-static void    ux_exception(int exception, mach_exception_code_t code, 
-                               mach_exception_subcode_t subcode,
-                               int *ux_signal, mach_exception_code_t *ux_code);
-
-#if defined(__x86_64__) || defined(__arm64__)
-mach_port_t                    ux_exception_port;
-#else
-mach_port_name_t               ux_exception_port;
-#endif /* __x86_64__ */
-
-static task_t                  ux_handler_self;
-
-__attribute__((noreturn))
-static void
-ux_handler(void)
+static int
+ux_exception(int                        exception,
+             mach_exception_code_t      code,
+             mach_exception_subcode_t   subcode)
 {
-    task_t             self = current_task();
-    mach_port_name_t   exc_port_name;
-    mach_port_name_t   exc_set_name;
-
-    /* self->kernel_vm_space = TRUE; */
-    ux_handler_self = self;
-
-
-    /*
-     * Allocate a port set that we will receive on.
-     */
-    if (mach_port_allocate(get_task_ipcspace(ux_handler_self), MACH_PORT_RIGHT_PORT_SET,  &exc_set_name) != MACH_MSG_SUCCESS)
-           panic("ux_handler: port_set_allocate failed");
-
-    /*
-     * Allocate an exception port and use object_copyin to
-     * translate it to the global name.  Put it into the set.
-     */
-    if (mach_port_allocate(get_task_ipcspace(ux_handler_self), MACH_PORT_RIGHT_RECEIVE, &exc_port_name) != MACH_MSG_SUCCESS)
-       panic("ux_handler: port_allocate failed");
-    if (mach_port_move_member(get_task_ipcspace(ux_handler_self),
-                       exc_port_name,  exc_set_name) != MACH_MSG_SUCCESS)
-       panic("ux_handler: port_set_add failed");
-
-    if (ipc_object_copyin(get_task_ipcspace(self), exc_port_name,
-                       MACH_MSG_TYPE_MAKE_SEND, 
-                       (void *) &ux_exception_port) != MACH_MSG_SUCCESS)
-               panic("ux_handler: object_copyin(ux_exception_port) failed");
-
-    proc_list_lock();
-    thread_wakeup(&ux_exception_port);
-    proc_list_unlock();
-
-    /* Message handling loop. */
-
-    for (;;) {
-       struct rep_msg {
-               mach_msg_header_t Head;
-               NDR_record_t NDR;
-               kern_return_t RetCode;
-       } rep_msg;
-       struct exc_msg {
-               mach_msg_header_t Head;
-               /* start of the kernel processed data */
-               mach_msg_body_t msgh_body;
-               mach_msg_port_descriptor_t thread;
-               mach_msg_port_descriptor_t task;
-               /* end of the kernel processed data */
-               NDR_record_t NDR;
-               exception_type_t exception;
-               mach_msg_type_number_t codeCnt;
-               mach_exception_data_t code;
-               /* some times RCV_TO_LARGE probs */
-               char pad[512];
-       } exc_msg;
-       mach_port_name_t        reply_port;
-       kern_return_t    result;
-
-       exc_msg.Head.msgh_local_port = CAST_MACH_NAME_TO_PORT(exc_set_name);
-       exc_msg.Head.msgh_size = sizeof (exc_msg);
-#if 0
-       result = mach_msg_receive(&exc_msg.Head);
-#else
-       result = mach_msg_receive(&exc_msg.Head, MACH_RCV_MSG,
-                            sizeof (exc_msg), exc_set_name,
-                            MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL,
-                            0);
-#endif
-       if (result == MACH_MSG_SUCCESS) {
-           reply_port = CAST_MACH_PORT_TO_NAME(exc_msg.Head.msgh_remote_port);
-
-           if (mach_exc_server(&exc_msg.Head, &rep_msg.Head)) {
-               result = mach_msg_send(&rep_msg.Head, MACH_SEND_MSG,
-                       sizeof (rep_msg),MACH_MSG_TIMEOUT_NONE,MACH_PORT_NULL);
-               if (reply_port != 0 && result != MACH_MSG_SUCCESS)
-                       mach_port_deallocate(get_task_ipcspace(ux_handler_self), reply_port);
-           }
-
+       int machine_signal = 0;
+
+       /* Try machine-dependent translation first. */
+       if ((machine_signal = machine_exception(exception, code, subcode)) != 0)
+               return machine_signal;
+
+       switch(exception) {
+               case EXC_BAD_ACCESS:
+                       if (code == KERN_INVALID_ADDRESS)
+                               return SIGSEGV;
+                       else
+                               return SIGBUS;
+
+               case EXC_BAD_INSTRUCTION:
+                       return SIGILL;
+
+               case EXC_ARITHMETIC:
+                       return SIGFPE;
+
+               case EXC_EMULATION:
+                       return SIGEMT;
+
+               case EXC_SOFTWARE:
+                       switch (code) {
+                               case EXC_UNIX_BAD_SYSCALL:
+                                       return SIGSYS;
+                               case EXC_UNIX_BAD_PIPE:
+                                       return SIGPIPE;
+                               case EXC_UNIX_ABORT:
+                                       return SIGABRT;
+                               case EXC_SOFT_SIGNAL:
+                                       return SIGKILL;
+                       }
+                       break;
+
+               case EXC_BREAKPOINT:
+                       return SIGTRAP;
        }
-       else if (result == MACH_RCV_TOO_LARGE)
-               /* ignore oversized messages */;
-       else
-               panic("exception_handler");
-    }
-}
 
-void
-ux_handler_init(void)
-{
-       thread_t        thread = THREAD_NULL;
-
-       ux_exception_port = MACH_PORT_NULL;
-       (void) kernel_thread_start((thread_continue_t)ux_handler, NULL, &thread);
-       thread_deallocate(thread);
-       proc_list_lock();
-       if (ux_exception_port == MACH_PORT_NULL)  {
-               (void)msleep(&ux_exception_port, proc_list_mlock, 0, "ux_handler_wait", 0);
-       }
-       proc_list_unlock();
+       return 0;
 }
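
To make the mapping concrete, a few translations the routine above performs when machine_exception() declines (returns 0); values are illustrative:

    /*
     *   ux_exception(EXC_BAD_ACCESS,  KERN_INVALID_ADDRESS,    addr) -> SIGSEGV
     *   ux_exception(EXC_BAD_ACCESS,  KERN_PROTECTION_FAILURE, addr) -> SIGBUS
     *   ux_exception(EXC_SOFTWARE,    EXC_UNIX_BAD_SYSCALL,    0)    -> SIGSYS
     *   ux_exception(EXC_BREAKPOINT,  code,                    0)    -> SIGTRAP
     */
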
 
+/*
+ * Sends the corresponding UNIX signal to a thread that has triggered a Mach exception.
+ */
 kern_return_t
-catch_exception_raise(
-        __unused mach_port_t exception_port,
-        mach_port_t thread,
-        mach_port_t task,
-        exception_type_t exception,
-        exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt
-)
+handle_ux_exception(thread_t                    thread,
+                    int                         exception,
+                    mach_exception_code_t       code,
+                    mach_exception_subcode_t    subcode)
 {
-       mach_exception_data_type_t big_code[EXCEPTION_CODE_MAX];
-       big_code[0] = code[0];
-       big_code[1] = code[1];
+       /* Returns +1 proc reference */
+       proc_t p = proc_findthread(thread);
 
-       return catch_mach_exception_raise(exception_port,
-                       thread,
-                       task,
-                       exception,
-                       big_code,
-                       codeCnt);
+       /* Can't deliver a signal without a bsd process reference */
+       if (p == NULL)
+               return KERN_FAILURE;
 
-}
+       /* Translate exception and code to signal type */
+       int ux_signal = ux_exception(exception, code, subcode);
 
-kern_return_t
-catch_mach_exception_raise(
-        __unused mach_port_t exception_port,
-        mach_port_t thread,
-        mach_port_t task,
-        exception_type_t exception,
-        mach_exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt
-)
-{
-       task_t                  self = current_task();
-       thread_t                th_act;
-       ipc_port_t              thread_port;
-       struct proc             *p;
-       kern_return_t           result = MACH_MSG_SUCCESS;
-       int                     ux_signal = 0;
-       mach_exception_code_t   ucode = 0;
-       struct uthread          *ut;
-       mach_port_name_t thread_name = CAST_MACH_PORT_TO_NAME(thread);
-       mach_port_name_t task_name = CAST_MACH_PORT_TO_NAME(task);
+       uthread_t ut = get_bsdthread_info(thread);
 
        /*
-        *      Convert local thread name to global port.
+        * Stack overflow should result in a SIGSEGV signal
+        * on the alternate stack, but we have one or more
+        * guard pages after the stack top, so we would get a
+        * KERN_PROTECTION_FAILURE exception instead of
+        * KERN_INVALID_ADDRESS, resulting in a SIGBUS signal.
+        * Detect that situation and select the correct signal.
         */
-   if (MACH_PORT_VALID(thread_name) &&
-       (ipc_object_copyin(get_task_ipcspace(self), thread_name,
-                      MACH_MSG_TYPE_PORT_SEND,
-                      (void *) &thread_port) == MACH_MSG_SUCCESS)) {
-        if (IPC_PORT_VALID(thread_port)) {
-          th_act = convert_port_to_thread(thread_port);
-          ipc_port_release_send(thread_port);
-       } else {
-          th_act = THREAD_NULL;
+       if (code == KERN_PROTECTION_FAILURE &&
+           ux_signal == SIGBUS) {
+               user_addr_t sp = subcode;
+
+               user_addr_t stack_max = p->user_stack;
+               user_addr_t stack_min = p->user_stack - MAXSSIZ;
+               if (sp >= stack_min && sp < stack_max) {
+                       /*
+                        * This is indeed a stack overflow.  Deliver a
+                        * SIGSEGV signal.
+                        */
+                       ux_signal = SIGSEGV;
+
+                       /*
+                        * If the thread/process is not ready to handle
+                        * SIGSEGV on an alternate stack, force-deliver
+                        * SIGSEGV with a SIG_DFL handler.
+                        */
+                       int mask = sigmask(ux_signal);
+                       struct sigacts *ps = p->p_sigacts;
+                       if ((p->p_sigignore & mask) ||
+                           (ut->uu_sigwait & mask) ||
+                           (ut->uu_sigmask & mask) ||
+                           (ps->ps_sigact[SIGSEGV] == SIG_IGN) ||
+                           (! (ps->ps_sigonstack & mask))) {
+                               p->p_sigignore &= ~mask;
+                               p->p_sigcatch &= ~mask;
+                               ps->ps_sigact[SIGSEGV] = SIG_DFL;
+                               ut->uu_sigwait &= ~mask;
+                               ut->uu_sigmask &= ~mask;
+                       }
+               }
        }
 
-       /*
-        *      Catch bogus ports
-        */
-       if (th_act != THREAD_NULL) {
-
-           /*
-            *  Convert exception to unix signal and code.
-            */
-           ux_exception(exception, code[0], code[1], &ux_signal, &ucode);
-
-           ut = get_bsdthread_info(th_act);
-           p = proc_findthread(th_act);
-
-           /* Can't deliver a signal without a bsd process reference */
-           if (p == NULL) {
-                   ux_signal = 0;
-                   result = KERN_FAILURE;
-           }
-
-           /*
-            * Stack overflow should result in a SIGSEGV signal
-            * on the alternate stack.
-            * but we have one or more guard pages after the
-            * stack top, so we would get a KERN_PROTECTION_FAILURE
-            * exception instead of KERN_INVALID_ADDRESS, resulting in
-            * a SIGBUS signal.
-            * Detect that situation and select the correct signal.
-            */
-           if (code[0] == KERN_PROTECTION_FAILURE &&
-               ux_signal == SIGBUS) {
-                   user_addr_t         sp, stack_min, stack_max;
-                   int                 mask;
-                   struct sigacts      *ps;
-
-                   sp = code[1];
-
-                   stack_max = p->user_stack;
-                   stack_min = p->user_stack - MAXSSIZ;
-                   if (sp >= stack_min &&
-                       sp < stack_max) {
-                           /*
-                            * This is indeed a stack overflow.  Deliver a
-                            * SIGSEGV signal.
-                            */
-                           ux_signal = SIGSEGV;
-
-                           /*
-                            * If the thread/process is not ready to handle
-                            * SIGSEGV on an alternate stack, force-deliver
-                            * SIGSEGV with a SIG_DFL handler.
-                            */
-                           mask = sigmask(ux_signal);
-                           ps = p->p_sigacts;
-                           if ((p->p_sigignore & mask) ||
-                               (ut->uu_sigwait & mask) ||
-                               (ut->uu_sigmask & mask) ||
-                               (ps->ps_sigact[SIGSEGV] == SIG_IGN) ||
-                               (! (ps->ps_sigonstack & mask))) {
-                                   p->p_sigignore &= ~mask;
-                                   p->p_sigcatch &= ~mask;
-                                   ps->ps_sigact[SIGSEGV] = SIG_DFL;
-                                   ut->uu_sigwait &= ~mask;
-                                   ut->uu_sigmask &= ~mask;
-                           }
-                   }
-           }
-           /*
-            *  Send signal.
-            */
-           if (ux_signal != 0) {
-                       ut->uu_exception = exception;
-                       //ut->uu_code = code[0]; // filled in by threadsignal
-                       ut->uu_subcode = code[1];                       
-                       threadsignal(th_act, ux_signal, code[0], TRUE);
-           }
-           if (p != NULL) 
-                   proc_rele(p);
-           thread_deallocate(th_act);
+       /* Send signal to thread */
+       if (ux_signal != 0) {
+               ut->uu_exception = exception;
+               //ut->uu_code = code; // filled in by threadsignal
+               ut->uu_subcode = subcode;
+               threadsignal(thread, ux_signal, code, TRUE);
        }
-       else
-           result = KERN_INVALID_ARGUMENT;
-    }
-    else
-       result = KERN_INVALID_ARGUMENT;
 
-    /*
-     * Delete our send rights to the task port.
-     */
-    (void)mach_port_deallocate(get_task_ipcspace(ux_handler_self), task_name);
+       proc_rele(p);
 
-    return (result);
+       return KERN_SUCCESS;
 }
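
The stack-overflow special case above is just a range test against the maximum possible stack extent below p->user_stack; as a standalone sketch (hypothetical helper, not something this change adds):

    /*
     * True when a KERN_PROTECTION_FAILURE fault address falls within the
     * maximum stack range below the stack top, i.e. the thread hit a guard
     * page and the fault should be reported as a stack overflow (SIGSEGV).
     */
    static int
    fault_is_stack_overflow(user_addr_t sp, user_addr_t stack_top)
    {
            user_addr_t stack_min = stack_top - MAXSSIZ;  /* MAXSSIZ from <sys/vmparam.h> */
            return (sp >= stack_min && sp < stack_top);
    }
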
 
-kern_return_t
-catch_exception_raise_state(
-        __unused mach_port_t exception_port,
-        __unused exception_type_t exception,
-        __unused const exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt,
-        __unused int *flavor,
-        __unused const thread_state_t old_state,
-        __unused mach_msg_type_number_t old_stateCnt,
-        __unused thread_state_t new_state,
-        __unused mach_msg_type_number_t *new_stateCnt)
-{
-       return(KERN_INVALID_ARGUMENT);
-}
-
-kern_return_t
-catch_mach_exception_raise_state(
-        __unused mach_port_t exception_port,
-        __unused exception_type_t exception,
-        __unused const mach_exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt,
-        __unused int *flavor,
-        __unused const thread_state_t old_state,
-        __unused mach_msg_type_number_t old_stateCnt,
-        __unused thread_state_t new_state,
-        __unused mach_msg_type_number_t *new_stateCnt)
-{
-       return(KERN_INVALID_ARGUMENT);
-}
-
-kern_return_t
-catch_exception_raise_state_identity(
-        __unused mach_port_t exception_port,
-        __unused mach_port_t thread,
-        __unused mach_port_t task,
-        __unused exception_type_t exception,
-        __unused exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt,
-        __unused int *flavor,
-        __unused thread_state_t old_state,
-        __unused mach_msg_type_number_t old_stateCnt,
-        __unused thread_state_t new_state,
-        __unused mach_msg_type_number_t *new_stateCnt)
-{
-       return(KERN_INVALID_ARGUMENT);
-}
-
-kern_return_t
-catch_mach_exception_raise_state_identity(
-        __unused mach_port_t exception_port,
-        __unused mach_port_t thread,
-        __unused mach_port_t task,
-        __unused exception_type_t exception,
-        __unused mach_exception_data_t code,
-        __unused mach_msg_type_number_t codeCnt,
-        __unused int *flavor,
-        __unused thread_state_t old_state,
-        __unused mach_msg_type_number_t old_stateCnt,
-        __unused thread_state_t new_state,
-        __unused mach_msg_type_number_t *new_stateCnt)
-{
-       return(KERN_INVALID_ARGUMENT);
-}
-
-
-/*
- *     ux_exception translates a mach exception, code and subcode to
- *     a signal and u.u_code.  Calls machine_exception (machine dependent)
- *     to attempt translation first.
- */
-
-static
-void ux_exception(
-               int                     exception,
-               mach_exception_code_t   code,
-               mach_exception_subcode_t subcode,
-               int                     *ux_signal,
-               mach_exception_code_t   *ux_code)
-{
-    /*
-     * Try machine-dependent translation first.
-     */
-    if (machine_exception(exception, code, subcode, ux_signal, ux_code))
-       return;
-       
-    switch(exception) {
-
-       case EXC_BAD_ACCESS:
-               if (code == KERN_INVALID_ADDRESS)
-                       *ux_signal = SIGSEGV;
-               else
-                       *ux_signal = SIGBUS;
-               break;
-
-       case EXC_BAD_INSTRUCTION:
-           *ux_signal = SIGILL;
-           break;
-
-       case EXC_ARITHMETIC:
-           *ux_signal = SIGFPE;
-           break;
-
-       case EXC_EMULATION:
-           *ux_signal = SIGEMT;
-           break;
-
-       case EXC_SOFTWARE:
-           switch (code) {
-
-           case EXC_UNIX_BAD_SYSCALL:
-               *ux_signal = SIGSYS;
-               break;
-           case EXC_UNIX_BAD_PIPE:
-               *ux_signal = SIGPIPE;
-               break;
-           case EXC_UNIX_ABORT:
-               *ux_signal = SIGABRT;
-               break;
-           case EXC_SOFT_SIGNAL:
-               *ux_signal = SIGKILL;
-               break;
-           }
-           break;
-
-       case EXC_BREAKPOINT:
-           *ux_signal = SIGTRAP;
-           break;
-    }
-}
index 060866928d0df6558df03a3413c98da2436376f6..f09e98f741522138fe9e5ba0448a9bfa7a1a0895 100644 (file)
 #include <sys/user.h>
 #include <sys/lockf.h>
 #include <sys/xattr.h>
+#include <sys/kdebug.h>
 
 #include <kern/assert.h>
 #include <kern/kalloc.h>
 #include <kern/task.h>
+#include <kern/policy_internal.h>
 
 #include <libkern/OSByteOrder.h>
 
 #include <security/mac_framework.h>
 #endif
 
+#if NULLFS
+#include <miscfs/nullfs/nullfs.h>
+#endif
+
 #include <sys/sdt.h>
 
 #define ESUCCESS 0
@@ -1595,12 +1601,16 @@ vfs_ctx_skipatime (vfs_context_t ctx) {
                if (proc->p_lflag & P_LRAGE_VNODES) {
                        return 1;
                }
-               
+
                if (ut) {
-                       if  (ut->uu_flag & UT_RAGE_VNODES) {
+                       if  (ut->uu_flag & (UT_RAGE_VNODES | UT_ATIME_UPDATE)) {
                                return 1;
                        }
                }
+
+               if (proc->p_vfs_iopolicy & P_VFS_IOPOLICY_ATIME_UPDATES) {
+                       return 1;
+               }
        }
        return 0;
 }
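
The new P_VFS_IOPOLICY_ATIME_UPDATES / UT_ATIME_UPDATE checks let a process or thread opt out of access-time updates. A hedged user-space sketch, assuming this is surfaced through the setiopolicy_np() I/O-policy constants in this release's <sys/resource.h>:

    /* Hypothetical: stop maintaining atime for everything this process touches. */
    #include <sys/resource.h>

    static int
    disable_atime_updates(void)
    {
            return setiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES,
                                  IOPOL_SCOPE_PROCESS,
                                  IOPOL_ATIME_UPDATES_OFF);
    }
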
@@ -2904,6 +2914,20 @@ vnode_ismonitored(vnode_t vp) {
        return (vp->v_knotes.slh_first != NULL);
 }
 
+int
+vnode_getbackingvnode(vnode_t in_vp, vnode_t* out_vpp)
+{
+       if (out_vpp) {
+               *out_vpp = NULLVP;
+       }
+#if NULLFS
+       return nullfs_getbackingvnode(in_vp, out_vpp);
+#else
+#pragma unused(in_vp)
+       return ENOENT;
+#endif
+}
+
 /*
  * Initialize a struct vnode_attr and activate the attributes required
  * by the vnode_notify() call.
@@ -4003,37 +4027,35 @@ vn_rename(struct vnode *fdvp, struct vnode **fvpp, struct componentname *fcnp, s
                 * in the rename syscall. It's OK if the source file does not exist, since this
                 * is only for AppleDouble files.
                 */
-               if (xfromname != NULL) {
-                       MALLOC(fromnd, struct nameidata *, sizeof (struct nameidata), M_TEMP, M_WAITOK);
-                       NDINIT(fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK,
-                              UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx);
-                       fromnd->ni_dvp = fdvp;
-                       error = namei(fromnd);
-               
-                       /* 
-                        * If there was an error looking up source attribute file, 
-                        * we'll behave as if it didn't exist. 
-                        */
+               MALLOC(fromnd, struct nameidata *, sizeof (struct nameidata), M_TEMP, M_WAITOK);
+               NDINIT(fromnd, RENAME, OP_RENAME, NOFOLLOW | USEDVP | CN_NBMOUNTLOOK,
+                               UIO_SYSSPACE, CAST_USER_ADDR_T(xfromname), ctx);
+               fromnd->ni_dvp = fdvp;
+               error = namei(fromnd);
 
-                       if (error == 0) {
-                               if (fromnd->ni_vp) {
-                                       /* src_attr_vp indicates need to call vnode_put / nameidone later */
-                                       src_attr_vp = fromnd->ni_vp;
-                                                                               
-                                       if (fromnd->ni_vp->v_type != VREG) {
-                                               src_attr_vp = NULLVP;
-                                               vnode_put(fromnd->ni_vp);
-                                       }
-                               } 
-                               /*
-                                * Either we got an invalid vnode type (not a regular file) or the namei lookup 
-                                * suppressed ENOENT as a valid error since we're renaming. Either way, we don't 
-                                * have a vnode here, so we drop our namei buffer for the source attribute file
-                                */
-                               if (src_attr_vp == NULLVP) {
-                                       nameidone(fromnd);
+               /*
+                * If there was an error looking up source attribute file,
+                * we'll behave as if it didn't exist.
+                */
+
+               if (error == 0) {
+                       if (fromnd->ni_vp) {
+                               /* src_attr_vp indicates need to call vnode_put / nameidone later */
+                               src_attr_vp = fromnd->ni_vp;
+
+                               if (fromnd->ni_vp->v_type != VREG) {
+                                       src_attr_vp = NULLVP;
+                                       vnode_put(fromnd->ni_vp);
                                }
                        }
+                       /*
+                        * Either we got an invalid vnode type (not a regular file) or the namei lookup
+                        * suppressed ENOENT as a valid error since we're renaming. Either way, we don't
+                        * have a vnode here, so we drop our namei buffer for the source attribute file
+                        */
+                       if (src_attr_vp == NULLVP) {
+                               nameidone(fromnd);
+                       }
                }
        }
 #endif /* CONFIG_APPLEDOUBLE */
@@ -5466,8 +5488,11 @@ VNOP_CLONEFILE(vnode_t fvp, vnode_t dvp, vnode_t *vpp,
 
        _err = (*dvp->v_op[vnop_clonefile_desc.vdesc_offset])(&a);
 
-       if (_err == 0 && *vpp)
+       if (_err == 0 && *vpp) {
                DTRACE_FSINFO(clonefile, vnode_t, *vpp);
+               if (kdebug_enable)
+                       kdebug_lookup(*vpp, cnp);
+       }
 
        post_event_if_success(dvp, _err, NOTE_WRITE);
 
index cde828a7b391bcfbd544975568144f8f1a2174b6..cd8cbacad02a77712121ceb8ffc4688285fa2497 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1995-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 1995-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -1136,8 +1136,8 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
         * Note that since we won't ever copy out more than the caller requested,
         * we never need to allocate more than they offer.
         */
-       ab.allocated = ulmin(bufferSize, fixedsize + varsize);
-       if (ab.allocated > ATTR_MAX_BUFFER) {
+       ab.allocated = fixedsize + varsize;
+       if (((size_t)ab.allocated) > ATTR_MAX_BUFFER) {
                error = ENOMEM;
                VFS_DEBUG(ctx, vp, "ATTRLIST - ERROR: buffer size too large (%d limit %d)", ab.allocated, ATTR_MAX_BUFFER);
                goto out;
@@ -1182,6 +1182,10 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
        ab.needed = fixedsize + varsize;
 
        /* common attributes **************************************************/
+       if (alp->commonattr & ATTR_CMN_ERROR) {
+               ATTR_PACK4(ab, 0);
+               ab.actual.commonattr |= ATTR_CMN_ERROR;
+       }
        if (alp->commonattr & ATTR_CMN_NAME) {
                attrlist_pack_string(&ab, cnp, cnl);
                ab.actual.commonattr |= ATTR_CMN_NAME;
@@ -1477,7 +1481,7 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
         * of the result buffer, even if we copied less out.  The caller knows how big a buffer
         * they gave us, so they can always check for truncation themselves.
         */
-       *(uint32_t *)ab.base = (options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(ab.allocated, ab.needed);
+       *(uint32_t *)ab.base = (options & FSOPT_REPORT_FULLSIZE) ? ab.needed : imin(bufferSize, ab.needed);
 
        /* Return attribute set output if requested. */
        if (return_valid &&
@@ -1493,9 +1497,9 @@ getvolattrlist(vfs_context_t ctx, vnode_t vp, struct attrlist *alp,
 
        if (UIO_SEG_IS_USER_SPACE(segflg))
                error = copyout(ab.base, CAST_USER_ADDR_T(attributeBuffer),
-                               ab.allocated);
+                               ulmin(bufferSize, ab.needed));
        else
-               bcopy(ab.base, (void *)attributeBuffer, (size_t)ab.allocated);
+               bcopy(ab.base, (void *)attributeBuffer, (size_t)ulmin(bufferSize, ab.needed));
 
 out:
        if (vs.f_vol_name != NULL)
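
Worked example of the sizing change in the two hunks above (numbers are made up): if fixedsize + varsize comes to 600 bytes but the caller supplied a 512-byte buffer, the kernel now allocates and packs the full 600 bytes internally, copies out ulmin(512, 600) = 512 bytes, and reports imin(512, 600) = 512 in the leading length word (or the full 600 when FSOPT_REPORT_FULLSIZE is set), instead of sizing the internal buffer itself down to 512.
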
@@ -3700,6 +3704,7 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval)
        struct fileproc *fp;
        struct fd_vn_data *fvdata;
        vfs_context_t ctx;
+       uthread_t ut;
        enum uio_seg segflg;
        int count;
        uio_t auio = NULL;
@@ -3719,6 +3724,7 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval)
        fvdata = NULL;
        eofflag = 0;
        ctx = vfs_context_current();
+       ut = get_bsdthread_info(current_thread());
        segflg = IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
 
        if ((fp->f_fglob->fg_flag & FREAD) == 0) {
@@ -3865,8 +3871,14 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval)
                        (void)getattrlist_setupvattr_all(&al, &va, VNON, NULL,
                            IS_64BIT_PROCESS(p), (uap->options & FSOPT_ATTR_CMN_EXTENDED));
 
+                       /*
+                        * Set UT_KERN_RAGE_VNODES to cause all vnodes created by the
+                        * filesystem to be rapidly aged.
+                        */
+                       ut->uu_flag |= UT_KERN_RAGE_VNODES;
                        error = VNOP_GETATTRLISTBULK(dvp, &al, &va, auio, NULL,
                            options, &eofflag, &count, ctx);
+                       ut->uu_flag &= ~UT_KERN_RAGE_VNODES;
 
                        FREE(va_name, M_TEMP);
 
@@ -3887,8 +3899,10 @@ getattrlistbulk(proc_t p, struct getattrlistbulk_args *uap, int32_t *retval)
                eofflag = 0;
                count = 0;
 
+               ut->uu_flag |= UT_KERN_RAGE_VNODES;
                error = readdirattr(dvp, fvdata, auio, &al, options,
                    &count, &eofflag, ctx);
+               ut->uu_flag &= ~UT_KERN_RAGE_VNODES;
        }
 
        if (count) {
index c1019a32793b254e60575b4abef9be0dd1bfd6a2..d26613ce8be6b8e829f15bbb3a22af06e8b82111 100644 (file)
@@ -3255,13 +3255,14 @@ start:
                        if (kret != KERN_SUCCESS)
                                panic("getblk: ubc_upl_map() failed with (%d)", kret);
                        break;
-                 }
+                 } // end BLK_READ
                default:
                        panic("getblk: paging or unknown operation - %x", operation);
                        /*NOTREACHED*/
                        break;
-               }
-       }
+               } // end switch
+       } //end buf_t !incore
+
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
                     bp, bp->b_datap, bp->b_flags, 3, 0);
 
@@ -4044,9 +4045,11 @@ buf_biodone(buf_t bp)
                        code |= DKIO_TIER_UPGRADE;
                }
 
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
-                                         buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
-        }
+               KDBG_RELEASE_NOPROCFILT(FSDBG_CODE(DBG_DKRW, code),
+                               buf_kernel_addrperm_addr(bp),
+                               (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid,
+                               bp->b_error);
+       }
 
        microuptime(&real_elapsed);
        timevalsub(&real_elapsed, &bp->b_timestamp_tv);
@@ -4579,7 +4582,7 @@ fs_buffer_cache_gc_dispatch_callouts(int all)
        lck_mtx_unlock(buf_gc_callout);
 }
 
-boolean_t 
+static boolean_t 
 buffer_cache_gc(int all)
 {
        buf_t bp;
index b24dbc590153dbd9ddde5f37451d24b9b171f70a..56c69754c777d576f84a2cf98aee950068274545 100644 (file)
@@ -82,6 +82,7 @@
 #include <sys/kauth.h>
 #include <sys/user.h>
 #include <sys/paths.h>
+#include <os/overflow.h>
 
 #if CONFIG_MACF
 #include <security/mac_framework.h>
@@ -876,10 +877,8 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
 {
        struct  namecache *ncp;
         vnode_t        old_parentvp = NULLVP;
-#if NAMEDSTREAMS
        int isstream = (vp->v_flag & VISNAMEDSTREAM);
        int kusecountbumped = 0;
-#endif
        kauth_cred_t tcred = NULL;
        const char *vname = NULL;
        const char *tname = NULL;
@@ -888,7 +887,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
                if (dvp && vnode_ref(dvp) != 0) {
                        dvp = NULLVP;
                }
-#if NAMEDSTREAMS
                /* Don't count a stream's parent ref during unmounts */
                if (isstream && dvp && (dvp != vp) && (dvp != vp->v_parent) && (dvp->v_type == VREG)) {
                        vnode_lock_spin(dvp);
@@ -896,7 +894,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
                        kusecountbumped = 1;
                        vnode_unlock(dvp);
                }
-#endif
        } else {
                dvp = NULLVP;
        }
@@ -960,7 +957,6 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
                        kauth_cred_unref(&tcred);
        }
        if (dvp != NULLVP) {
-#if NAMEDSTREAMS
                /* Back-out the ref we took if we lost a race for vp->v_parent. */
                if (kusecountbumped) {
                        vnode_lock_spin(dvp);
@@ -968,20 +964,17 @@ vnode_update_identity(vnode_t vp, vnode_t dvp, const char *name, int name_len, u
                                --dvp->v_kusecount;  
                        vnode_unlock(dvp);
                }
-#endif
                vnode_rele(dvp);
        }
        if (old_parentvp) {
                struct  uthread *ut;
 
-#if NAMEDSTREAMS
                if (isstream) {
                        vnode_lock_spin(old_parentvp);
                        if ((old_parentvp->v_type != VDIR) && (old_parentvp->v_kusecount > 0))
                                --old_parentvp->v_kusecount;
                        vnode_unlock(old_parentvp);
                }
-#endif
                ut = get_bsdthread_info(current_thread());
 
                /*
@@ -1437,7 +1430,7 @@ skiprsrcfork:
                                 * Force directory hardlinks to go to
                                 * file system for ".." requests.
                                 */
-                               if (dp && (dp->v_flag & VISHARDLINK)) {
+                               if ((dp->v_flag & VISHARDLINK)) {
                                        break;
                                }
                                /*
@@ -2167,28 +2160,35 @@ name_cache_unlock(void)
 
 
 int
-resize_namecache(u_int newsize)
+resize_namecache(int newsize)
 {
     struct nchashhead  *new_table;
     struct nchashhead  *old_table;
     struct nchashhead  *old_head, *head;
     struct namecache   *entry, *next;
     uint32_t           i, hashval;
-    int                        dNodes, dNegNodes;
+    int                        dNodes, dNegNodes, nelements;
     u_long             new_size, old_size;
 
+    if (newsize < 0)
+        return EINVAL;
+
     dNegNodes = (newsize / 10);
     dNodes = newsize + dNegNodes;
-
     // we don't support shrinking yet
     if (dNodes <= desiredNodes) {
-       return 0;
+        return 0;
+    }
+
+    if (os_mul_overflow(dNodes, 2, &nelements)) {
+        return EINVAL;
     }
-    new_table = hashinit(2 * dNodes, M_CACHE, &nchashmask);
+
+    new_table = hashinit(nelements, M_CACHE, &nchashmask);
     new_size  = nchashmask + 1;
 
     if (new_table == NULL) {
-       return ENOMEM;
+        return ENOMEM;
     }
 
     NAME_CACHE_LOCK();
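
The os_mul_overflow() guard added above is a checked multiply: for example (made-up value), with dNodes = 1,200,000,000 the product 2 × dNodes exceeds INT_MAX (2,147,483,647), os_mul_overflow() reports the overflow, and resize_namecache() now returns EINVAL instead of handing a wrapped-around element count to hashinit().
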
index 60807acea98e397cb94542a950b99e9c22bb1df4..cb023ccf958f3c3ac7ac8e068815551a66ddeaff 100644 (file)
@@ -207,18 +207,25 @@ static int cluster_write_direct(vnode_t vp, struct uio *uio, off_t oldEOF, off_t
 static int cluster_write_contig(vnode_t vp, struct uio *uio, off_t newEOF,
                                int *write_type, u_int32_t *write_length, int (*)(buf_t, void *), void *callback_arg, int bflag);
 
+static void cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes, boolean_t *first_pass,
+                                         off_t write_off, int write_cnt, off_t newEOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
+
 static int cluster_align_phys_io(vnode_t vp, struct uio *uio, addr64_t usr_paddr, u_int32_t xsize, int flags, int (*)(buf_t, void *), void *callback_arg);
 
 static int     cluster_read_prefetch(vnode_t vp, off_t f_offset, u_int size, off_t filesize, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
-static void    cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra, int (*callback)(buf_t, void *), void *callback_arg, int bflag);
+static void    cluster_read_ahead(vnode_t vp, struct cl_extent *extent, off_t filesize, struct cl_readahead *ra,
+                                  int (*callback)(buf_t, void *), void *callback_arg, int bflag);
 
-static int     cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg);
+static int     cluster_push_now(vnode_t vp, struct cl_extent *, off_t EOF, int flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
 
-static int     cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *), void *callback_arg, int *err);
+static int     cluster_try_push(struct cl_writebehind *, vnode_t vp, off_t EOF, int push_flag, int flags, int (*)(buf_t, void *),
+                                void *callback_arg, int *err, boolean_t vm_initiated);
 
-static void    sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
-static int     sparse_cluster_push(void **cmapp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*)(buf_t, void *), void *callback_arg);
-static void    sparse_cluster_add(void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF, int (*)(buf_t, void *), void *callback_arg);
+static int     sparse_cluster_switch(struct cl_writebehind *, vnode_t vp, off_t EOF, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
+static int     sparse_cluster_push(struct cl_writebehind *, void **cmapp, vnode_t vp, off_t EOF, int push_flag,
+                                   int io_flags, int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
+static int     sparse_cluster_add(struct cl_writebehind *, void **cmapp, vnode_t vp, struct cl_extent *, off_t EOF,
+                                  int (*)(buf_t, void *), void *callback_arg, boolean_t vm_initiated);
 
 static kern_return_t vfs_drt_mark_pages(void **cmapp, off_t offset, u_int length, u_int *setcountp);
 static kern_return_t vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp);
@@ -487,7 +494,7 @@ cluster_syncup(vnode_t vp, off_t newEOF, int (*callback)(buf_t, void *), void *c
                if (wbp->cl_number) {
                        lck_mtx_lock(&wbp->cl_lockw);
 
-                       cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL);
+                       cluster_try_push(wbp, vp, newEOF, PUSH_ALL | flags, 0, callback, callback_arg, NULL, FALSE);
 
                        lck_mtx_unlock(&wbp->cl_lockw);
                }
@@ -704,9 +711,9 @@ cluster_ioerror(upl_t upl, int upl_offset, int abort_size, int error, int io_fla
                         * leave pages in the cache unchanged on error
                         */
                        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
-               else if (page_out && ((error != ENXIO) || vnode_isswap(vp)))
+               else if (((io_flags & B_READ) == 0)  && ((error != ENXIO) || vnode_isswap(vp)))
                        /*
-                        * transient error... leave pages unchanged
+                        * transient error on pageout/write path... leave pages unchanged
                         */
                        upl_abort_code = UPL_ABORT_FREE_ON_EMPTY;
                else if (page_in)
@@ -830,9 +837,9 @@ cluster_iodone(buf_t bp, void *callback_arg)
 
        if (ISSET(b_flags, B_COMMIT_UPL)) {
                cluster_handle_associated_upl(iostate,
-                                                                         cbp_head->b_upl,
-                                                                         upl_offset,
-                                                                         transaction_size);
+                                             cbp_head->b_upl,
+                                             upl_offset,
+                                             transaction_size);
        }
 
        if (error == 0 && total_resid)
@@ -881,12 +888,15 @@ cluster_iodone(buf_t bp, void *callback_arg)
        }
 
        if (b_flags & B_COMMIT_UPL) {
+
                pg_offset   = upl_offset & PAGE_MASK;
                commit_size = (pg_offset + transaction_size + (PAGE_SIZE - 1)) & ~PAGE_MASK;
 
-               if (error)
+               if (error) {
+                       upl_set_iodone_error(upl, error);
+
                        upl_flags = cluster_ioerror(upl, upl_offset - pg_offset, commit_size, error, b_flags, vp);
-               else {
+               } else {
                        upl_flags = UPL_COMMIT_FREE_ON_EMPTY;
 
                        if ((b_flags & B_PHYS) && (b_flags & B_READ)) 
@@ -2977,6 +2987,280 @@ cluster_zero_range(upl_t upl, upl_page_info_t *pl, int flags, int io_offset, off
 }
 
 
+void
+cluster_update_state(vnode_t vp, vm_object_offset_t s_offset, vm_object_offset_t e_offset, boolean_t vm_initiated)
+{
+       struct cl_extent cl;
+       boolean_t first_pass = TRUE;
+
+       assert(s_offset < e_offset);
+       assert((s_offset & PAGE_MASK_64) == 0);
+       assert((e_offset & PAGE_MASK_64) == 0);
+
+       cl.b_addr = (daddr64_t)(s_offset / PAGE_SIZE_64);
+       cl.e_addr = (daddr64_t)(e_offset / PAGE_SIZE_64);
+
+       cluster_update_state_internal(vp, &cl, 0, TRUE, &first_pass, s_offset, (int)(e_offset - s_offset),
+                                     vp->v_un.vu_ubcinfo->ui_size, NULL, NULL, vm_initiated);
+}
+
+
+static void
+cluster_update_state_internal(vnode_t vp, struct cl_extent *cl, int flags, boolean_t defer_writes,
+                             boolean_t *first_pass, off_t write_off, int write_cnt, off_t newEOF,
+                             int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
+{
+       struct cl_writebehind *wbp;
+       int     cl_index;
+       int     ret_cluster_try_push;
+       u_int   max_cluster_pgcount;
+
+
+       max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
+
+       /*
+        * take the lock to protect our accesses
+        * of the writebehind and sparse cluster state
+        */
+       wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
+
+       if (wbp->cl_scmap) {
+
+               if ( !(flags & IO_NOCACHE)) {
+                       /*
+                        * we've fallen into the sparse
+                        * cluster method of delaying dirty pages
+                        */
+                       sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
+
+                       lck_mtx_unlock(&wbp->cl_lockw);
+                       return;
+               }
+               /*
+                * must have done cached writes that fell into
+                * the sparse cluster mechanism... we've switched
+                * to uncached writes on the file, so go ahead
+                * and push whatever's in the sparse map
+                * and switch back to normal clustering
+                */
+               wbp->cl_number = 0;
+
+               sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg, vm_initiated);
+               /*
+                * no clusters of either type present at this point
+                * so just go directly to start_new_cluster since
+                * we know we need to delay this I/O since we've
+                * already released the pages back into the cache
+                * to avoid the deadlock with sparse_cluster_push
+                */
+               goto start_new_cluster;
+       }
+       if (*first_pass == TRUE) {
+               if (write_off == wbp->cl_last_write)
+                       wbp->cl_seq_written += write_cnt;
+               else
+                       wbp->cl_seq_written = write_cnt;
+
+               wbp->cl_last_write = write_off + write_cnt;
+
+               *first_pass = FALSE;
+       }
+       if (wbp->cl_number == 0)
+               /*
+                * no clusters currently present
+                */
+               goto start_new_cluster;
+
+       for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
+               /*
+                * check each cluster that we currently hold
+                * try to merge some or all of this write into
+                * one or more of the existing clusters... if
+                * any portion of the write remains, start a
+                * new cluster
+                */
+               if (cl->b_addr >= wbp->cl_clusters[cl_index].b_addr) {
+                       /*
+                        * the current write starts at or after the current cluster
+                        */
+                       if (cl->e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
+                               /*
+                                * we have a write that fits entirely
+                                * within the existing cluster limits
+                                */
+                               if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr)
+                                       /*
+                                        * update our idea of where the cluster ends
+                                        */
+                                       wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
+                               break;
+                       }
+                       if (cl->b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
+                               /*
+                                * we have a write that starts in the middle of the current cluster
+                                * but extends beyond the cluster's limit... we know this because
+                                * of the previous checks
+                                * we'll extend the current cluster to the max
+                                * and update the b_addr for the current write to reflect that
+                                * the head of it was absorbed into this cluster...
+                                * note that we'll always have a leftover tail in this case since
+                                        * full absorption would have occurred in the clause above
+                                */
+                               wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
+
+                               cl->b_addr = wbp->cl_clusters[cl_index].e_addr;
+                       }
+                       /*
+                        * we come here for the case where the current write starts
+                        * beyond the limit of the existing cluster or we have a leftover
+                        * tail after a partial absorption
+                        *
+                        * in either case, we'll check the remaining clusters before 
+                        * starting a new one
+                        */
+               } else {
+                       /*
+                        * the current write starts in front of the cluster we're currently considering
+                        */
+                       if ((wbp->cl_clusters[cl_index].e_addr - cl->b_addr) <= max_cluster_pgcount) {
+                               /*
+                                * we can just merge the new request into
+                                * this cluster and leave it in the cache
+                                * since the resulting cluster is still 
+                                * less than the maximum allowable size
+                                */
+                               wbp->cl_clusters[cl_index].b_addr = cl->b_addr;
+
+                               if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr) {
+                                       /*
+                                        * the current write completely
+                                        * envelops the existing cluster and since
+                                        * each write is limited to at most max_cluster_pgcount pages
+                                        * we can just use the start and last blocknos of the write
+                                        * to generate the cluster limits
+                                        */
+                                       wbp->cl_clusters[cl_index].e_addr = cl->e_addr;
+                               }
+                               break;
+                       }
+                       /*
+                        * if we were to combine this write with the current cluster
+                        * we would exceed the cluster size limit.... so,
+                        * let's see if there's any overlap of the new I/O with
+                        * the cluster we're currently considering... in fact, we'll
+                        * stretch the cluster out to its full limit and see if we
+                        * get an intersection with the current write
+                        * 
+                        */
+                       if (cl->e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
+                               /*
+                                * the current write extends into the proposed cluster
+                                * clip the length of the current write after first combining its
+                                * tail with the newly shaped cluster
+                                */
+                               wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
+
+                               cl->e_addr = wbp->cl_clusters[cl_index].b_addr;
+                       }
+                       /*
+                        * if we get here, there was no way to merge
+                        * any portion of this write with this cluster 
+                        * or we could only merge part of it which 
+                        * will leave a tail...
+                        * we'll check the remaining clusters before starting a new one
+                        */
+               }
+       }
+       if (cl_index < wbp->cl_number)
+               /*
+                * we found an existing cluster(s) that we
+                * could entirely merge this I/O into
+                */
+               goto delay_io;
+
+       if (defer_writes == FALSE &&
+           wbp->cl_number == MAX_CLUSTERS &&
+           wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
+               uint32_t        n;
+
+               if (vp->v_mount->mnt_minsaturationbytecount) {
+                       n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
+                                       
+                       if (n > MAX_CLUSTERS)
+                               n = MAX_CLUSTERS;
+               } else
+                       n = 0;
+
+               if (n == 0) {
+                       if (disk_conditioner_mount_is_ssd(vp->v_mount))
+                               n = WRITE_BEHIND_SSD;
+                       else
+                               n = WRITE_BEHIND;
+               }
+               while (n--)
+                       cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL, vm_initiated);
+       }
+       if (wbp->cl_number < MAX_CLUSTERS) {
+               /*
+                * we didn't find an existing cluster to
+                * merge into, but there's room to start
+                * a new one
+                */
+               goto start_new_cluster;
+       }
+       /*
+        * no existing cluster to merge with and no
+        * room to start a new one... we'll try 
+        * pushing one of the existing ones... if none of
+        * them are able to be pushed, we'll switch
+        * to the sparse cluster mechanism
+        * cluster_try_push updates cl_number to the
+        * number of remaining clusters... and
+        * returns the number of currently unused clusters
+        */
+       ret_cluster_try_push = 0;
+
+       /*
+        * if writes are not deferred, call cluster push immediately
+        */
+       if (defer_writes == FALSE) {
+               
+               ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL, vm_initiated);
+       }
+       /*
+        * execute following regardless of writes being deferred or not
+        */
+       if (ret_cluster_try_push == 0) {
+               /*
+                * no more room in the normal cluster mechanism
+                * so let's switch to the more expansive but expensive
+                * sparse mechanism....
+                */
+               sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg, vm_initiated);
+               sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, cl, newEOF, callback, callback_arg, vm_initiated);
+               
+               lck_mtx_unlock(&wbp->cl_lockw);
+               return;
+       }
+start_new_cluster:
+       wbp->cl_clusters[wbp->cl_number].b_addr = cl->b_addr;
+       wbp->cl_clusters[wbp->cl_number].e_addr = cl->e_addr;
+
+       wbp->cl_clusters[wbp->cl_number].io_flags = 0;
+
+       if (flags & IO_NOCACHE)
+               wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
+
+       if (flags & IO_PASSIVE)
+               wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
+
+       wbp->cl_number++;
+delay_io:
+       lck_mtx_unlock(&wbp->cl_lockw);
+       return;
+}
+
+
 static int
 cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t oldEOF, off_t newEOF, off_t headOff,
                   off_t tailOff, int flags, int (*callback)(buf_t, void *), void *callback_arg)
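
The merge loop in cluster_update_state_internal() above distinguishes four cases between the incoming write extent and each existing cluster: full absorption, head absorption with a leftover tail, prepending within the size limit, and tail clipping. The following self-contained sketch models just that decision logic with plain integers (struct extent, merge_write and max are illustrative names, not kernel types; addresses are page numbers):

#include <stdbool.h>
#include <stdint.h>

struct extent { int64_t b_addr, e_addr; };

/* returns true when the write was fully absorbed by the cluster */
static bool
merge_write(struct extent *cluster, struct extent *write, int64_t max)
{
	if (write->b_addr >= cluster->b_addr) {
		/* the write starts at or after the cluster */
		if (write->e_addr <= cluster->b_addr + max) {
			/* fits entirely within the cluster's limit: grow the tail */
			if (write->e_addr > cluster->e_addr)
				cluster->e_addr = write->e_addr;
			return true;
		}
		if (write->b_addr < cluster->b_addr + max) {
			/* head absorbed up to the limit, tail left for another cluster */
			cluster->e_addr = cluster->b_addr + max;
			write->b_addr = cluster->e_addr;
		}
		return false;
	}
	/* the write starts in front of the cluster */
	if (cluster->e_addr - write->b_addr <= max) {
		/* merged result still fits within the limit */
		cluster->b_addr = write->b_addr;
		if (write->e_addr > cluster->e_addr)
			cluster->e_addr = write->e_addr;
		return true;
	}
	if (write->e_addr > cluster->e_addr - max) {
		/* only the tail overlaps once the cluster is stretched backwards */
		cluster->b_addr = cluster->e_addr - max;
		write->e_addr = cluster->b_addr;
	}
	return false;
}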
@@ -3005,9 +3289,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old
        int              write_cnt = 0;
        boolean_t        first_pass = FALSE;
        struct cl_extent cl;
-       struct cl_writebehind *wbp;
        int              bflag;
-       u_int            max_cluster_pgcount;
        u_int            max_io_size;
 
        if (uio) {
@@ -3036,7 +3318,6 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old
        zero_off  = 0;
        zero_off1 = 0;
 
-       max_cluster_pgcount = MAX_CLUSTER_SIZE(vp) / PAGE_SIZE;
        max_io_size = cluster_max_io_size(vp->v_mount, CL_WRITE);
 
        if (flags & IO_HEADZEROFILL) {
@@ -3293,7 +3574,7 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old
                        retval = cluster_copy_upl_data(uio, upl, io_offset, (int *)&io_requested);
 
                        if (retval) {
-                               ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_DUMP_PAGES | UPL_ABORT_FREE_ON_EMPTY);
+                               ubc_upl_abort_range(upl, 0, upl_size, UPL_ABORT_FREE_ON_EMPTY);
 
                                KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 45)) | DBG_FUNC_NONE,
                                             upl, 0, 0, retval, 0);
@@ -3318,20 +3599,15 @@ cluster_write_copy(vnode_t vp, struct uio *uio, u_int32_t io_req_size, off_t old
                        io_offset  += bytes_to_zero;
                }
                if (retval == 0) {
-                       int cl_index;
-                       int ret_cluster_try_push;
                        int do_zeroing = 1;
-
                        
                        io_size += start_offset;
-                       
 
                        /* Force more restrictive zeroing behavior only on APFS */
                        if ((vnode_tag(vp) == VT_APFS) && (newEOF < oldEOF)) {
                                do_zeroing = 0;
                        }
 
-
                        if (do_zeroing && (upl_f_offset + io_size) >= newEOF && (u_int)io_size < upl_size) {
 
                                /*
@@ -3370,269 +3646,28 @@ check_cluster:
                        cl.e_addr = (daddr64_t)((upl_f_offset + (off_t)upl_size) / PAGE_SIZE_64);
 
                        if (flags & IO_SYNC) {
-                               /*
-                                * if the IO_SYNC flag is set than we need to 
-                                * bypass any clusters and immediately issue
-                                * the I/O
-                                */
-                               goto issue_io;
-                       }
-                       /*
-                        * take the lock to protect our accesses
-                        * of the writebehind and sparse cluster state
-                        */
-                       wbp = cluster_get_wbp(vp, CLW_ALLOCATE | CLW_RETURNLOCKED);
-
-                       if (wbp->cl_scmap) {
-
-                               if ( !(flags & IO_NOCACHE)) {
-                                       /*
-                                        * we've fallen into the sparse
-                                        * cluster method of delaying dirty pages
-                                        */
-                                       sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
-
-                                       lck_mtx_unlock(&wbp->cl_lockw);
-
-                                       continue;
-                               }
-                               /*
-                                * must have done cached writes that fell into
-                                * the sparse cluster mechanism... we've switched
-                                * to uncached writes on the file, so go ahead
-                                * and push whatever's in the sparse map
-                                * and switch back to normal clustering
-                                */
-                               wbp->cl_number = 0;
-
-                               sparse_cluster_push(&(wbp->cl_scmap), vp, newEOF, PUSH_ALL, 0, callback, callback_arg);
                                /*
-                                * no clusters of either type present at this point
-                                * so just go directly to start_new_cluster since
-                                * we know we need to delay this I/O since we've
-                                * already released the pages back into the cache
-                                * to avoid the deadlock with sparse_cluster_push
-                                */
-                               goto start_new_cluster;
-                       }
-                       if (first_pass) {
-                               if (write_off == wbp->cl_last_write)
-                                       wbp->cl_seq_written += write_cnt;
-                               else
-                                       wbp->cl_seq_written = write_cnt;
-
-                               wbp->cl_last_write = write_off + write_cnt;
-
-                               first_pass = FALSE;
-                       }
-                       if (wbp->cl_number == 0)
-                               /*
-                                * no clusters currently present
-                                */
-                               goto start_new_cluster;
-
-                       for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
-                               /*
-                                * check each cluster that we currently hold
-                                * try to merge some or all of this write into
-                                * one or more of the existing clusters... if
-                                * any portion of the write remains, start a
-                                * new cluster
-                                */
-                               if (cl.b_addr >= wbp->cl_clusters[cl_index].b_addr) {
-                                       /*
-                                        * the current write starts at or after the current cluster
-                                        */
-                                       if (cl.e_addr <= (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
-                                               /*
-                                                * we have a write that fits entirely
-                                                * within the existing cluster limits
-                                                */
-                                               if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr)
-                                                       /*
-                                                        * update our idea of where the cluster ends
-                                                        */
-                                                       wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
-                                               break;
-                                       }
-                                       if (cl.b_addr < (wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount)) {
-                                               /*
-                                                * we have a write that starts in the middle of the current cluster
-                                                * but extends beyond the cluster's limit... we know this because
-                                                * of the previous checks
-                                                * we'll extend the current cluster to the max
-                                                * and update the b_addr for the current write to reflect that
-                                                * the head of it was absorbed into this cluster...
-                                                * note that we'll always have a leftover tail in this case since
-                                                * full absorbtion would have occurred in the clause above
-                                                */
-                                               wbp->cl_clusters[cl_index].e_addr = wbp->cl_clusters[cl_index].b_addr + max_cluster_pgcount;
-
-                                               cl.b_addr = wbp->cl_clusters[cl_index].e_addr;
-                                       }
-                                       /*
-                                        * we come here for the case where the current write starts
-                                        * beyond the limit of the existing cluster or we have a leftover
-                                        * tail after a partial absorbtion
-                                        *
-                                        * in either case, we'll check the remaining clusters before 
-                                        * starting a new one
-                                        */
-                               } else {
-                                       /*
-                                        * the current write starts in front of the cluster we're currently considering
-                                        */
-                                       if ((wbp->cl_clusters[cl_index].e_addr - cl.b_addr) <= max_cluster_pgcount) {
-                                               /*
-                                                * we can just merge the new request into
-                                                * this cluster and leave it in the cache
-                                                * since the resulting cluster is still 
-                                                * less than the maximum allowable size
-                                                */
-                                               wbp->cl_clusters[cl_index].b_addr = cl.b_addr;
-
-                                               if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr) {
-                                                       /*
-                                                        * the current write completely
-                                                        * envelops the existing cluster and since
-                                                        * each write is limited to at most max_cluster_pgcount pages
-                                                        * we can just use the start and last blocknos of the write
-                                                        * to generate the cluster limits
-                                                        */
-                                                       wbp->cl_clusters[cl_index].e_addr = cl.e_addr;
-                                               }
-                                               break;
-                                       }
-
-                                       /*
-                                        * if we were to combine this write with the current cluster
-                                        * we would exceed the cluster size limit.... so,
-                                        * let's see if there's any overlap of the new I/O with
-                                        * the cluster we're currently considering... in fact, we'll
-                                        * stretch the cluster out to it's full limit and see if we
-                                        * get an intersection with the current write
-                                        * 
-                                        */
-                                       if (cl.e_addr > wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount) {
-                                               /*
-                                                * the current write extends into the proposed cluster
-                                                * clip the length of the current write after first combining it's
-                                                * tail with the newly shaped cluster
-                                                */
-                                               wbp->cl_clusters[cl_index].b_addr = wbp->cl_clusters[cl_index].e_addr - max_cluster_pgcount;
-
-                                               cl.e_addr = wbp->cl_clusters[cl_index].b_addr;
-                                       }
-                                       /*
-                                        * if we get here, there was no way to merge
-                                        * any portion of this write with this cluster 
-                                        * or we could only merge part of it which 
-                                        * will leave a tail...
-                                        * we'll check the remaining clusters before starting a new one
-                                        */
-                               }
-                       }
-                       if (cl_index < wbp->cl_number)
-                               /*
-                                * we found an existing cluster(s) that we
-                                * could entirely merge this I/O into
-                                */
-                               goto delay_io;
-
-                       if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE) &&
-                           wbp->cl_number == MAX_CLUSTERS &&
-                           wbp->cl_seq_written >= (MAX_CLUSTERS * (max_cluster_pgcount * PAGE_SIZE))) {
-                               uint32_t        n;
-
-                               if (vp->v_mount->mnt_minsaturationbytecount) {
-                                       n = vp->v_mount->mnt_minsaturationbytecount / MAX_CLUSTER_SIZE(vp);
-                                       
-                                       if (n > MAX_CLUSTERS)
-                                               n = MAX_CLUSTERS;
-                               } else
-                                       n = 0;
-
-                               if (n == 0) {
-                                       if (disk_conditioner_mount_is_ssd(vp->v_mount))
-                                               n = WRITE_BEHIND_SSD;
-                                       else
-                                               n = WRITE_BEHIND;
-                               }
-                               while (n--)
-                                       cluster_try_push(wbp, vp, newEOF, 0, 0, callback, callback_arg, NULL);
-                       }
-                       if (wbp->cl_number < MAX_CLUSTERS) {
-                               /*
-                                * we didn't find an existing cluster to
-                                * merge into, but there's room to start
-                                * a new one
-                                */
-                               goto start_new_cluster;
-                       }
-                       /*
-                        * no exisitng cluster to merge with and no
-                        * room to start a new one... we'll try 
-                        * pushing one of the existing ones... if none of
-                        * them are able to be pushed, we'll switch
-                        * to the sparse cluster mechanism
-                        * cluster_try_push updates cl_number to the
-                        * number of remaining clusters... and
-                        * returns the number of currently unused clusters
-                        */
-                       ret_cluster_try_push = 0;
-
-                       /*
-                        * if writes are not deferred, call cluster push immediately
-                        */
-                       if (!((unsigned int)vfs_flags(vp->v_mount) & MNT_DEFWRITE)) {
-                               
-                               ret_cluster_try_push = cluster_try_push(wbp, vp, newEOF, (flags & IO_NOCACHE) ? 0 : PUSH_DELAY, 0, callback, callback_arg, NULL);
-                       }
-
-                       /*
-                        * execute following regardless of writes being deferred or not
-                        */
-                       if (ret_cluster_try_push == 0) {
-                               /*
-                                * no more room in the normal cluster mechanism
-                                * so let's switch to the more expansive but expensive
-                                * sparse mechanism....
+                                * if the IO_SYNC flag is set then we need to bypass
+                                * any clustering and immediately issue the I/O
+                                *
+                                * we don't hold the lock at this point
+                                *
+                                * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
+                                * so that we correctly deal with a change in state of the hardware modify bit...
+                                * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
+                                * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
+                                * responsible for generating the correct sized I/O(s)
                                 */
-                               sparse_cluster_switch(wbp, vp, newEOF, callback, callback_arg);
-                               sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, newEOF, callback, callback_arg);
+                               retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg, FALSE);
+                       } else {
+                               boolean_t defer_writes = FALSE;
 
-                               lck_mtx_unlock(&wbp->cl_lockw);
+                               if (vfs_flags(vp->v_mount) & MNT_DEFWRITE)
+                                       defer_writes = TRUE;
 
-                               continue;
+                               cluster_update_state_internal(vp, &cl, flags, defer_writes, &first_pass,
+                                                             write_off, write_cnt, newEOF, callback, callback_arg, FALSE);
                        }
-start_new_cluster:
-                       wbp->cl_clusters[wbp->cl_number].b_addr = cl.b_addr;
-                       wbp->cl_clusters[wbp->cl_number].e_addr = cl.e_addr;
-
-                       wbp->cl_clusters[wbp->cl_number].io_flags = 0;
-
-                       if (flags & IO_NOCACHE)
-                               wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IONOCACHE;
-
-                       if (bflag & CL_PASSIVE)
-                               wbp->cl_clusters[wbp->cl_number].io_flags |= CLW_IOPASSIVE;
-
-                       wbp->cl_number++;
-delay_io:
-                       lck_mtx_unlock(&wbp->cl_lockw);
-
-                       continue;
-issue_io:
-                       /*
-                        * we don't hold the lock at this point
-                        *
-                        * we've already dropped the current upl, so pick it back up with COPYOUT_FROM set
-                        * so that we correctly deal with a change in state of the hardware modify bit...
-                        * we do this via cluster_push_now... by passing along the IO_SYNC flag, we force
-                        * cluster_push_now to wait until all the I/Os have completed... cluster_push_now is also
-                        * responsible for generating the correct sized I/O(s)
-                        */
-                       retval = cluster_push_now(vp, &cl, newEOF, flags, callback, callback_arg);
                }
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 40)) | DBG_FUNC_END, retval, 0, io_resid, 0, 0);
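
With the bookkeeping moved into cluster_update_state_internal(), the check_cluster path above reduces to a two-way dispatch: IO_SYNC writes bypass clustering and are pushed immediately, while everything else goes through the shared state machine with defer_writes taken from the MNT_DEFWRITE mount flag. A compressed sketch of that dispatch (the flag values, push_now_sync and update_clusters are stand-ins, not the kernel's definitions):

#include <stdbool.h>

#define IO_SYNC      0x1                 /* illustrative values only */
#define MNT_DEFWRITE 0x2

static int  push_now_sync(void)         { return 0; }    /* models cluster_push_now() */
static void update_clusters(bool defer) { (void)defer; } /* models cluster_update_state_internal() */

static int
finish_write_extent(int io_flags, int mnt_flags)
{
	if (io_flags & IO_SYNC) {
		/* bypass any clustering and wait for the I/O to complete */
		return push_now_sync();
	}
	/* otherwise let write-behind decide when to push; MNT_DEFWRITE
	 * mounts ask for the writes to be deferred */
	update_clusters((mnt_flags & MNT_DEFWRITE) != 0);
	return 0;
}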
@@ -4368,7 +4403,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type,
        u_int32_t        max_rd_size;
        u_int32_t        max_rd_ahead;
        u_int32_t        max_vector_size;
-       boolean_t        strict_uncached_IO = FALSE;
        boolean_t        io_throttled = FALSE;
 
        u_int32_t        vector_upl_iosize = 0;
@@ -4433,8 +4467,6 @@ cluster_read_direct(vnode_t vp, struct uio *uio, off_t filesize, int *read_type,
                devblocksize = PAGE_SIZE;
        }
 
-       strict_uncached_IO = ubc_strict_uncached_IO(vp);
-
        orig_iov_base = uio_curriovbase(uio);
        last_iov_base = orig_iov_base;
 
@@ -4512,7 +4544,7 @@ next_dread:
                 * cluster_copy_ubc_data returns the resid
                 * in io_size
                 */
-               if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
+               if ((flags & IO_ENCRYPTED) == 0) {
                        retval = cluster_copy_ubc_data_internal(vp, uio, (int *)&io_size, 0, 0);
                }
                /*
@@ -4602,7 +4634,7 @@ next_dread:
                 * Don't re-check the UBC data if we are looking for uncached IO
                 * or asking for encrypted blocks.
                 */
-               if ((strict_uncached_IO == FALSE) && ((flags & IO_ENCRYPTED) == 0)) {
+               if ((flags & IO_ENCRYPTED) == 0) {
 
                        if ((xsize = io_size) > max_rd_size)
                                xsize = max_rd_size;
@@ -4865,7 +4897,16 @@ wait_for_dreads:
                 * we couldn't handle the tail of this request in DIRECT mode
                 * so fire it through the copy path
                 */
-               retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
+               if (flags & IO_ENCRYPTED) {
+                       /*
+                        * We cannot fall back to the copy path for encrypted I/O. If this
+                        * happens, there is something wrong with the user buffer passed
+                        * down.
+                        */
+                       retval = EFAULT;
+               } else {
+                       retval = cluster_read_copy(vp, uio, io_req_size, filesize, flags, callback, callback_arg);
+               }
 
                *read_type = IO_UNKNOWN;
        }
@@ -5371,6 +5412,7 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca
         int    retval;
        int     my_sparse_wait = 0;
        struct  cl_writebehind *wbp;
+       int     local_err = 0;
 
        if (err)
                *err = 0;
@@ -5440,22 +5482,35 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca
 
                        lck_mtx_unlock(&wbp->cl_lockw);
 
-                       retval = sparse_cluster_push(&scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
+                       retval = sparse_cluster_push(wbp, &scmap, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
 
                        lck_mtx_lock(&wbp->cl_lockw);
 
                        wbp->cl_sparse_pushes--;
+
+                       if (retval) {
+                               if (wbp->cl_scmap != NULL) {
+                                       panic("cluster_push_err: Expected NULL cl_scmap\n");
+                               }
+
+                               wbp->cl_scmap = scmap;
+                       }
                        
                        if (wbp->cl_sparse_wait && wbp->cl_sparse_pushes == 0)
                                wakeup((caddr_t)&wbp->cl_sparse_pushes);
                } else {
-                       retval = sparse_cluster_push(&(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg);
+                       retval = sparse_cluster_push(wbp, &(wbp->cl_scmap), vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, FALSE);
                }
+
+               local_err = retval;
+
                if (err)
                        *err = retval;
                retval = 1;
        } else {
-               retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, err);
+               retval = cluster_try_push(wbp, vp, ubc_getsize(vp), PUSH_ALL, flags, callback, callback_arg, &local_err, FALSE);
+               if (err)
+                       *err = local_err;
        }
        lck_mtx_unlock(&wbp->cl_lockw);
 
@@ -5476,7 +5531,7 @@ cluster_push_err(vnode_t vp, int flags, int (*callback)(buf_t, void *), void *ca
                lck_mtx_unlock(&wbp->cl_lockw);
        }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 53)) | DBG_FUNC_END,
-                    wbp->cl_scmap, wbp->cl_number, retval, 0, 0);
+                    wbp->cl_scmap, wbp->cl_number, retval, local_err, 0);
 
        return (retval);
 }
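
One behavioural change above is worth calling out: when a concurrent sparse push fails, the privately held sparse map is reattached to wbp->cl_scmap instead of being discarded, so the dirty-page state survives the failed push. A userspace sketch of that ownership hand-off (struct wb, push_fn and the pthread mutex are illustrative; the kernel uses lck_mtx locking and sparse_cluster_push):

#include <assert.h>
#include <pthread.h>
#include <stddef.h>

struct wb {
	pthread_mutex_t lock;
	void            *scmap;          /* models wbp->cl_scmap */
	int             sparse_pushes;
};

/* push_fn models sparse_cluster_push(); returns 0 on success */
static int
push_private_map(struct wb *wbp, int (*push_fn)(void **))
{
	void *scmap;
	int error;

	pthread_mutex_lock(&wbp->lock);
	scmap = wbp->scmap;              /* take ownership of the map */
	wbp->scmap = NULL;
	wbp->sparse_pushes++;
	pthread_mutex_unlock(&wbp->lock);

	error = push_fn(&scmap);         /* push without holding the lock */

	pthread_mutex_lock(&wbp->lock);
	wbp->sparse_pushes--;
	if (error) {
		/* the push failed: the map still describes dirty pages, so
		 * hand it back rather than leak it (mirrors the diff's
		 * "Expected NULL cl_scmap" panic before restoring) */
		assert(wbp->scmap == NULL);
		wbp->scmap = scmap;
	}
	pthread_mutex_unlock(&wbp->lock);
	return error;
}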
@@ -5516,7 +5571,7 @@ cluster_release(struct ubc_info *ubc)
 
 
 static int
-cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err)
+cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg, int *err, boolean_t vm_initiated)
 {
         int cl_index;
        int cl_index1;
@@ -5597,6 +5652,9 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla
                                goto dont_try;
                }
        }
+       if (vm_initiated == TRUE)
+               lck_mtx_unlock(&wbp->cl_lockw);
+
        for (cl_index = 0; cl_index < cl_len; cl_index++) {
                int     flags;
                struct  cl_extent cl;
@@ -5619,19 +5677,23 @@ cluster_try_push(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int push_fla
                cl.b_addr = l_clusters[cl_index].b_addr;
                cl.e_addr = l_clusters[cl_index].e_addr;
 
-               retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg);
-
-               if (error == 0 && retval)
-                       error = retval;
+               retval = cluster_push_now(vp, &cl, EOF, flags, callback, callback_arg, vm_initiated);
 
-               l_clusters[cl_index].b_addr = 0;
-               l_clusters[cl_index].e_addr = 0;
+               if (retval == 0) {
+                       cl_pushed++;
 
-               cl_pushed++;
+                       l_clusters[cl_index].b_addr = 0;
+                       l_clusters[cl_index].e_addr = 0;
+               } else if (error == 0) {
+                       error = retval;
+               }
 
                if ( !(push_flag & PUSH_ALL) )
                        break;
        }
+       if (vm_initiated == TRUE)
+               lck_mtx_lock(&wbp->cl_lockw);
+
        if (err)
                *err = error;
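
Two changes above: cluster_try_push() now drops the write-behind lock around the pushes when called on behalf of the VM, and a local cluster slot is only retired when its push actually succeeded, so failed clusters keep their extents. A compact model of that loop (struct slot, push_one and try_push are illustrative names):

#include <stdbool.h>
#include <pthread.h>

struct slot { long b_addr, e_addr; };

/* push_one models cluster_push_now(); returns 0 on success */
static int push_one(struct slot *s) { (void)s; return 0; }

static int
try_push(pthread_mutex_t *lock, struct slot slots[], int nslots,
         bool push_all, bool vm_initiated, int *err)
{
	int i, pushed = 0, error = 0;

	if (vm_initiated)
		pthread_mutex_unlock(lock);      /* don't hold the lock across I/O */

	for (i = 0; i < nslots; i++) {
		int retval = push_one(&slots[i]);

		if (retval == 0) {
			slots[i].b_addr = slots[i].e_addr = 0;   /* retire only on success */
			pushed++;
		} else if (error == 0) {
			error = retval;                          /* remember the first failure */
		}
		if (!push_all)
			break;
	}
	if (vm_initiated)
		pthread_mutex_lock(lock);

	if (err)
		*err = error;
	return pushed;
}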
 
@@ -5651,7 +5713,7 @@ dont_try:
                         *
                         * collect the active public clusters...
                         */
-                       sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
+                       sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
 
                        for (cl_index = 0, cl_index1 = 0; cl_index < cl_len; cl_index++) {
                                if (l_clusters[cl_index].b_addr == l_clusters[cl_index].e_addr)
@@ -5671,7 +5733,7 @@ dont_try:
                         * and collect the original clusters that were moved into the 
                         * local storage for sorting purposes
                         */
-                       sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg);
+                       sparse_cluster_switch(wbp, vp, EOF, callback, callback_arg, vm_initiated);
 
                } else {
                        /*
@@ -5701,7 +5763,8 @@ dont_try:
 
 
 static int
-cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*callback)(buf_t, void *), void *callback_arg)
+cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags,
+                int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
 {
        upl_page_info_t *pl;
        upl_t            upl;
@@ -5758,6 +5821,13 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c
        } else
                size = upl_size;
 
+
+       if (vm_initiated) {
+               vnode_pageout(vp, NULL, (upl_offset_t)0, upl_f_offset, (upl_size_t)upl_size,
+                             UPL_MSYNC | UPL_VNODE_PAGER | UPL_KEEPCACHED, &error);
+
+               return (error);
+       }
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 41)) | DBG_FUNC_START, upl_size, size, 0, 0, 0);
 
        /*
@@ -5868,7 +5938,7 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c
 
                size -= io_size;
        }
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, 0, 0, 0);
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 51)) | DBG_FUNC_END, 1, 3, error, 0, 0);
 
        return(error);
 }
@@ -5877,12 +5947,13 @@ cluster_push_now(vnode_t vp, struct cl_extent *cl, off_t EOF, int flags, int (*c
 /*
  * sparse_cluster_switch is called with the write behind lock held
  */
-static void
-sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
+static int
+sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
 {
         int    cl_index;
+       int     error = 0;
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_START, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, 0, 0);
 
        for (cl_index = 0; cl_index < wbp->cl_number; cl_index++) {
                int       flags;
@@ -5894,14 +5965,20 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c
                                if (flags & UPL_POP_DIRTY) {
                                        cl.e_addr = cl.b_addr + 1;
 
-                                       sparse_cluster_add(&(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg);
+                                       error = sparse_cluster_add(wbp, &(wbp->cl_scmap), vp, &cl, EOF, callback, callback_arg, vm_initiated);
+
+                                       if (error) {
+                                               break;
+                                       }
                                }
                        }
                }
        }
-       wbp->cl_number = 0;
+       wbp->cl_number -= cl_index;
+
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, wbp->cl_number, error, 0);
 
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 78)) | DBG_FUNC_END, kdebug_vnode(vp), wbp->cl_scmap, 0, 0, 0);
+       return error;
 }
 
 
@@ -5911,11 +5988,13 @@ sparse_cluster_switch(struct cl_writebehind *wbp, vnode_t vp, off_t EOF, int (*c
  * from the write-behind context (the cluster_push case), the wb lock is not held
  */
 static int
-sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_flags, int (*callback)(buf_t, void *), void *callback_arg)
+sparse_cluster_push(struct cl_writebehind *wbp, void **scmap, vnode_t vp, off_t EOF, int push_flag,
+                   int io_flags, int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
 {
         struct cl_extent cl;
         off_t          offset;
        u_int           length;
+       void            *l_scmap;
        int error = 0;
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_START, kdebug_vnode(vp), (*scmap), 0, push_flag, 0);
@@ -5923,22 +6002,44 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_f
        if (push_flag & PUSH_ALL)
                vfs_drt_control(scmap, 1);
 
+       l_scmap = *scmap;
+
        for (;;) {
                int retval;
+
                if (vfs_drt_get_cluster(scmap, &offset, &length) != KERN_SUCCESS)
                        break;
 
+               if (vm_initiated == TRUE)
+                       lck_mtx_unlock(&wbp->cl_lockw);
+
                cl.b_addr = (daddr64_t)(offset / PAGE_SIZE_64);
                cl.e_addr = (daddr64_t)((offset + length) / PAGE_SIZE_64);
 
-               retval = cluster_push_now(vp, &cl, EOF, io_flags & (IO_PASSIVE|IO_CLOSE), callback, callback_arg);
+               retval = cluster_push_now(vp, &cl, EOF, io_flags, callback, callback_arg, vm_initiated);
                if (error == 0 && retval)
                        error = retval;
 
-               if ( !(push_flag & PUSH_ALL) )
+               if (vm_initiated == TRUE) {
+                       lck_mtx_lock(&wbp->cl_lockw);
+
+                       if (*scmap != l_scmap)
+                               break;
+               }
+
+               if (error) {
+                       if (vfs_drt_mark_pages(scmap, offset, length, NULL) != KERN_SUCCESS) {
+                               panic("Failed to restore dirty state on failure\n");
+                       }
+
+                       break;
+               }
+
+               if ( !(push_flag & PUSH_ALL)) {
                        break;
+               }
        }
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 79)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
 
        return error;
 }
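
sparse_cluster_push() gains two failure paths above: if the map pointer changed while the lock was dropped for a VM-initiated push, the walk stops, and if pushing an extent fails, the extent is re-marked dirty through vfs_drt_mark_pages() so no dirty state is lost. A toy model of that pop/write/restore cycle, using a single 64-bit bitmap in place of the dirty-region tree (all names are illustrative):

#include <stdbool.h>
#include <stdint.h>

#define NPAGES 64

static uint64_t dirty_map;               /* one bit per page */

static void
mark_dirty(int pg, int n)
{
	while (n--)
		dirty_map |= 1ULL << (pg + n);
}

/* pop the lowest dirty page, clearing it from the map */
static bool
pop_dirty(int *pg, int *n)
{
	for (int i = 0; i < NPAGES; i++) {
		if (dirty_map & (1ULL << i)) {
			*pg = i;
			*n = 1;
			dirty_map &= ~(1ULL << i);
			return true;
		}
	}
	return false;
}

/* write_pages models cluster_push_now(); returns 0 on success */
static int write_pages(int pg, int n) { (void)pg; (void)n; return 0; }

static int
push_dirty(bool push_all)
{
	int pg, n, error = 0;

	while (pop_dirty(&pg, &n)) {
		int retval = write_pages(pg, n);

		if (retval) {
			error = retval;
			mark_dirty(pg, n);       /* restore the dirty state on failure */
			break;
		}
		if (!push_all)
			break;
	}
	return error;
}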
@@ -5947,12 +6048,14 @@ sparse_cluster_push(void **scmap, vnode_t vp, off_t EOF, int push_flag, int io_f
 /*
  * sparse_cluster_add is called with the write behind lock held
  */
-static void
-sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, int (*callback)(buf_t, void *), void *callback_arg)
+static int
+sparse_cluster_add(struct cl_writebehind *wbp, void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF,
+                  int (*callback)(buf_t, void *), void *callback_arg, boolean_t vm_initiated)
 {
         u_int  new_dirty;
        u_int   length;
        off_t   offset;
+       int     error = 0;
 
        KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_START, (*scmap), 0, cl->b_addr, (int)cl->e_addr, 0);
 
@@ -5965,12 +6068,18 @@ sparse_cluster_add(void **scmap, vnode_t vp, struct cl_extent *cl, off_t EOF, in
                 * only a partial update was done
                 * push out some pages and try again
                 */
-               sparse_cluster_push(scmap, vp, EOF, 0, 0, callback, callback_arg);
+               error = sparse_cluster_push(wbp, scmap, vp, EOF, 0, 0, callback, callback_arg, vm_initiated);
+
+               if (error) {
+                       break;
+               }
 
                offset += (new_dirty * PAGE_SIZE_64);
                length -= (new_dirty * PAGE_SIZE);
        }
-       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), 0, 0, 0);
+       KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 80)) | DBG_FUNC_END, kdebug_vnode(vp), (*scmap), error, 0, 0);
+
+       return error;
 }
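
sparse_cluster_add() loops until the whole extent is recorded in the dirty map: whenever vfs_drt_mark_pages() can only take a prefix, some pages are pushed out to make room and the remainder is retried, and with this change a failed push now stops the loop and is reported to the caller. A simplified model of that retry loop (mark_prefix, push_some and the fixed capacity are illustrative stand-ins, not the vfs_drt_* behaviour):

/* mark_prefix models vfs_drt_mark_pages(): it records at most 'room'
 * pages and reports how many it took; push_some models
 * sparse_cluster_push() and frees up some capacity */
static unsigned room = 16;

static int
mark_prefix(unsigned npages, unsigned *taken)
{
	*taken = (npages <= room) ? npages : room;
	room -= *taken;
	return (*taken == npages) ? 0 : -1;      /* -1: only a partial update */
}

static int
push_some(void)
{
	room += 8;
	return 0;
}

static int
add_extent(unsigned offset, unsigned npages)
{
	unsigned taken;
	int error = 0;

	while (mark_prefix(npages, &taken) != 0) {
		/* only a partial update was done: push out some pages and retry */
		error = push_some();
		if (error)
			break;

		offset += taken;                 /* advance past what was recorded */
		npages -= taken;
	}
	return error;
}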
 
 
@@ -6259,7 +6368,7 @@ is_file_clean(vnode_t vp, off_t filesize)
  * single hashtable entry.  Each hashtable entry is aligned to this
  * size within the file.
  */
-#define DRT_BITVECTOR_PAGES            ((1024 * 1024) / PAGE_SIZE)
+#define DRT_BITVECTOR_PAGES            ((1024 * 256) / PAGE_SIZE)
 
 /*
  * File offset handling.
@@ -6306,6 +6415,7 @@ is_file_clean(vnode_t vp, off_t filesize)
        } while(0);
 
 
+#if CONFIG_EMBEDDED
 /*
  * Hash table moduli.
  *
@@ -6314,13 +6424,14 @@ is_file_clean(vnode_t vp, off_t filesize)
  * both being prime and fitting within the desired allocation
  * size, these values need to be manually determined.
  *
- * For DRT_BITVECTOR_SIZE = 256, the entry size is 40 bytes.
+ * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
  *
- * The small hashtable allocation is 1024 bytes, so the modulus is 23.
- * The large hashtable allocation is 16384 bytes, so the modulus is 401.
+ * The small hashtable allocation is 4096 bytes, so the modulus is 251.
+ * The large hashtable allocation is 32768 bytes, so the modulus is 2039.
  */
-#define DRT_HASH_SMALL_MODULUS 23
-#define DRT_HASH_LARGE_MODULUS 401
+
+#define DRT_HASH_SMALL_MODULUS 251
+#define DRT_HASH_LARGE_MODULUS 2039
 
 /*
  * Physical memory required before the large hash modulus is permitted.
@@ -6330,11 +6441,58 @@ is_file_clean(vnode_t vp, off_t filesize)
  */
 #define DRT_HASH_LARGE_MEMORY_REQUIRED (1024LL * 1024LL * 1024LL)      /* 1GiB */
 
-#define DRT_SMALL_ALLOCATION   1024    /* 104 bytes spare */
-#define DRT_LARGE_ALLOCATION   16384   /* 344 bytes spare */
+#define DRT_SMALL_ALLOCATION   4096    /* 80 bytes spare */
+#define DRT_LARGE_ALLOCATION   32768   /* 144 bytes spare */
+
+#else
+/*
+ * Hash table moduli.
+ *
+ * Since the hashtable entry's size is dependent on the size of
+ * the bitvector, and since the hashtable size is constrained to
+ * both being prime and fitting within the desired allocation
+ * size, these values need to be manually determined.
+ *
+ * For DRT_BITVECTOR_SIZE = 64, the entry size is 16 bytes.
+ *
+ * The small hashtable allocation is 16384 bytes, so the modulus is 1019.
+ * The large hashtable allocation is 131072 bytes, so the modulus is 8179.
+ */
+
+#define DRT_HASH_SMALL_MODULUS 1019
+#define DRT_HASH_LARGE_MODULUS 8179
+
+/*
+ * Physical memory required before the large hash modulus is permitted.
+ *
+ * On small memory systems, the large hash modulus can lead to physical
+ * memory starvation, so we avoid using it there.
+ */
+#define DRT_HASH_LARGE_MEMORY_REQUIRED (4 * 1024LL * 1024LL * 1024LL)  /* 4GiB */
+
+#define DRT_SMALL_ALLOCATION   16384   /* 80 bytes spare */
+#define DRT_LARGE_ALLOCATION   131072  /* 208 bytes spare */
+
+#endif
 
 /* *** nothing below here has secret dependencies on DRT_BITVECTOR_PAGES *** */
 
+/*
+ * Hashtable entry.
+ */
+struct vfs_drt_hashentry {
+       u_int64_t       dhe_control;
+/*
+* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
+* DRT_BITVECTOR_PAGES is defined as ((1024 * 256) / PAGE_SIZE)
+* Since PAGE_SIZE is only known at boot time, 
+*      -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) 
+*      -declare dhe_bitvector array for largest possible length
+*/
+#define MAX_DRT_BITVECTOR_PAGES (1024 * 256)/( 4 * 1024)
+       u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32];
+};
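
Given the entry layout above, the moduli and "spare" byte counts quoted in both #if branches are straightforward arithmetic: with a 256 KiB region per entry and 4 KiB pages the bitvector holds 64 bits (8 bytes), which together with the 8-byte dhe_control makes a 16-byte entry, and the spare figure is simply the allocation size minus modulus * 16. A small standalone check of those numbers (plain C, independent of the kernel headers):

#include <stdio.h>

int
main(void)
{
	const unsigned region  = 1024 * 256;    /* bytes tracked per hash entry */
	const unsigned pagesz  = 4 * 1024;      /* smallest supported page size */
	const unsigned entrysz = 8 /* dhe_control */ + (region / pagesz) / 8;

	printf("entry size: %u bytes\n", entrysz);                                        /* 16  */
	printf("4096-byte table,   modulus 251:  %u spare\n", 4096   - 251  * entrysz);   /* 80  */
	printf("32768-byte table,  modulus 2039: %u spare\n", 32768  - 2039 * entrysz);   /* 144 */
	printf("16384-byte table,  modulus 1019: %u spare\n", 16384  - 1019 * entrysz);   /* 80  */
	printf("131072-byte table, modulus 8179: %u spare\n", 131072 - 8179 * entrysz);   /* 208 */
	return 0;
}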
+
 /*
  * Hashtable bitvector handling.
  *
@@ -6351,30 +6509,12 @@ is_file_clean(vnode_t vp, off_t filesize)
        ((scm)->scm_hashtable[(i)].dhe_bitvector[(bit) / 32] & (1 << ((bit) % 32)))
     
 #define DRT_BITVECTOR_CLEAR(scm, i)                            \
-       bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
+       bzero(&(scm)->scm_hashtable[(i)].dhe_bitvector[0], (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
 
 #define DRT_BITVECTOR_COPY(oscm, oi, scm, i)                   \
        bcopy(&(oscm)->scm_hashtable[(oi)].dhe_bitvector[0],    \
            &(scm)->scm_hashtable[(i)].dhe_bitvector[0],        \
-           (DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
-
-
-/*
- * Hashtable entry.
- */
-struct vfs_drt_hashentry {
-       u_int64_t       dhe_control;
-/*
-* dhe_bitvector was declared as dhe_bitvector[DRT_BITVECTOR_PAGES / 32];
-* DRT_BITVECTOR_PAGES is defined as ((1024 * 1024) / PAGE_SIZE)
-* Since PAGE_SIZE is only known at boot time, 
-*      -define MAX_DRT_BITVECTOR_PAGES for smallest supported page size (4k) 
-*      -declare dhe_bitvector array for largest possible length
-*/
-#define MAX_DRT_BITVECTOR_PAGES (1024 * 1024)/( 4 * 1024)
-       u_int32_t       dhe_bitvector[MAX_DRT_BITVECTOR_PAGES/32];
-};
+           (MAX_DRT_BITVECTOR_PAGES / 32) * sizeof(u_int32_t))
 
 /*
  * Dirty Region Tracking structure.
@@ -6754,12 +6894,17 @@ vfs_drt_do_mark_pages(
                for (i = 0; i < pgcount; i++) {
                        if (dirty) {
                                if (!DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
+                                       if (ecount >= DRT_BITVECTOR_PAGES)
+                                               panic("ecount >= DRT_BITVECTOR_PAGES, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
                                        DRT_HASH_SET_BIT(cmap, index, pgoff + i);
                                        ecount++;
                                        setcount++;
                                }
                        } else {
                                if (DRT_HASH_TEST_BIT(cmap, index, pgoff + i)) {
+                                       if (ecount <= 0)
+                                               panic("ecount <= 0, cmap = %p, index = %d, bit = %d", cmap, index, pgoff+i);
+                                       assert(ecount > 0);
                                        DRT_HASH_CLEAR_BIT(cmap, index, pgoff + i);
                                        ecount--;
                                        setcount++;
@@ -6870,7 +7015,8 @@ vfs_drt_get_cluster(void **cmapp, off_t *offsetp, u_int *lengthp)
                }
                if (fs == -1) {
                        /*  didn't find any bits set */
-                       panic("vfs_drt: entry summary count > 0 but no bits set in map");
+                       panic("vfs_drt: entry summary count > 0 but no bits set in map, cmap = %p, index = %d, count = %lld",
+                             cmap, index, DRT_HASH_GET_COUNT(cmap, index));
                }
                for (ls = 0; i < DRT_BITVECTOR_PAGES; i++, ls++) {
                        if (!DRT_HASH_TEST_BIT(cmap, index, i))
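
The moduli above can be sanity-checked mechanically: each one must be prime and, with the 16-byte entry size quoted in the comments, must still leave a little slack in the allocation for the clustermap header (the "spare" bytes noted above). A minimal stand-alone check of those constants, assuming only the entry and allocation sizes stated in the comments:

    #include <assert.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Entry size implied by DRT_BITVECTOR_SIZE = 64: 8-byte control word + 8-byte bitvector. */
    #define DRT_ENTRY_SIZE 16u

    static bool is_prime(unsigned n)
    {
        if (n < 2)
            return false;
        for (unsigned d = 2; d * d <= n; d++)
            if (n % d == 0)
                return false;
        return true;
    }

    /* A modulus is acceptable if it is prime and the whole table fits in the allocation. */
    static bool modulus_fits(unsigned modulus, size_t allocation)
    {
        return is_prime(modulus) && (size_t)modulus * DRT_ENTRY_SIZE <= allocation;
    }

    int main(void)
    {
        assert(modulus_fits(251, 4096));      /* 251 * 16 = 4016, 80 bytes spare   */
        assert(modulus_fits(2039, 32768));    /* 2039 * 16 = 32624, 144 bytes spare */
        assert(modulus_fits(1019, 16384));    /* 1019 * 16 = 16304, 80 bytes spare  */
        assert(modulus_fits(8179, 131072));   /* 8179 * 16 = 130864, 208 bytes spare */
        printf("all DRT moduli are prime and fit their allocations\n");
        return 0;
    }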
index ff53ee1a3c516d90d16d33c45c89c97b327a41cc..26af78a7541880e0aa0d533dab5c300cb40a0ffa 100644 (file)
@@ -33,6 +33,9 @@
 #include <sys/content_protection.h>
 #include <libkern/crypto/sha1.h>
 #include <libkern/libkern.h>
+//for write protection
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
 
 #define PTR_ADD(type, base, offset)            (type)((uintptr_t)(base) + (offset))
 
@@ -54,7 +57,10 @@ enum {
        // Using AES IV context generated from key
        CPX_IV_AES_CTX_VFS                      = 0x08,
        CPX_SYNTHETIC_OFFSET_FOR_IV = 0x10,
-    CPX_COMPOSITEKEY            = 0x20
+       CPX_COMPOSITEKEY            = 0x20, 
+       
+       //write page protection
+       CPX_WRITE_PROTECTABLE           = 0x40
 };
 
 struct cpx {
@@ -88,21 +94,39 @@ size_t cpx_sizex(const struct cpx *cpx)
 
 cpx_t cpx_alloc(size_t key_len)
 {
-       cpx_t cpx;
+       cpx_t cpx = NULL;
        
-#if TARGET_OS_OSX
+#if CONFIG_KEYPAGE_WP
        /* 
         * Macs only use 1 key per volume, so force it into its own page.
         * This way, we can write-protect as needed.
         */
        size_t cpsize = cpx_size (key_len);
        if (cpsize < PAGE_SIZE) {
-               MALLOC(cpx, cpx_t, PAGE_SIZE, M_TEMP, M_WAITOK);
+               /* 
+                * Don't use MALLOC to allocate the page-sized structure.  Instead, 
+                * use kmem_alloc to bypass KASAN since we are supplying our own
+                * unilateral write protection on this page. Note that kmem_alloc 
+                * can block.
+                */
+               if (kmem_alloc (kernel_map, (vm_offset_t *)&cpx, PAGE_SIZE, VM_KERN_MEMORY_FILE)) {
+                       /*
+                        * returning NULL at this point (due to failed allocation) would just 
+                        * result in a panic. fall back to attempting a normal MALLOC, and don't
+                        * let the cpx get marked PROTECTABLE.
+                        */
+                       MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK);
+               }
+               else {
+                       //mark the page as protectable, since kmem_alloc succeeded.
+                       cpx->cpx_flags |= CPX_WRITE_PROTECTABLE;
+               }
        }
        else {
                panic ("cpx_size too large ! (%lu)", cpsize);
        }
 #else
+       /* If key page write protection disabled, just switch to kernel MALLOC */
        MALLOC(cpx, cpx_t, cpx_size(key_len), M_TEMP, M_WAITOK);
 #endif
        cpx_init(cpx, key_len);
@@ -113,10 +137,12 @@ cpx_t cpx_alloc(size_t key_len)
 /* this is really a void function */
 void cpx_writeprotect (cpx_t cpx) 
 {
-#if TARGET_OS_OSX
+#if CONFIG_KEYPAGE_WP
        void *cpxstart = (void*)cpx;
        void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE);
-       vm_map_protect (kernel_map, cpxstart, cpxend, (VM_PROT_READ), FALSE);
+       if (cpx->cpx_flags & CPX_WRITE_PROTECTABLE) {
+               vm_map_protect (kernel_map, (vm_map_offset_t)cpxstart, (vm_map_offset_t)cpxend, (VM_PROT_READ), FALSE);
+       }
 #else
        (void) cpx;
 #endif
@@ -136,15 +162,26 @@ void cpx_free(cpx_t cpx)
        assert(*PTR_ADD(uint32_t *, cpx, cpx_sizex(cpx) - 4) == cpx_magic2);
 #endif
        
-#if TARGET_OS_OSX
+#if CONFIG_KEYPAGE_WP
        /* unprotect the page before bzeroing */
        void *cpxstart = (void*)cpx;
-       void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE);
-       vm_map_protect (kernel_map, cpxstart, cpxend, (VM_PROT_DEFAULT), FALSE);
-#endif
+       void *cpxend = (void*)((uint8_t*)cpx + PAGE_SIZE); 
+       if (cpx->cpx_flags & CPX_WRITE_PROTECTABLE) {
+               vm_map_protect (kernel_map, (vm_map_offset_t)cpxstart, (vm_map_offset_t)cpxend, (VM_PROT_DEFAULT), FALSE);
 
+               //now zero the memory after un-protecting it
+               bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len);
+
+               //If we are here, then we used kmem_alloc to get the page. Must use kmem_free to drop it.
+               kmem_free(kernel_map, (vm_offset_t)cpx, PAGE_SIZE);
+               return;
+       }
+#else 
        bzero(cpx->cpx_cached_key, cpx->cpx_max_key_len);
        FREE(cpx, M_TEMP);
+       return;
+#endif
+
 }
 
 void cpx_init(cpx_t cpx, size_t key_len)
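
The write-protection flow above (allocate a dedicated page, flip it read-only once the key material is in place, flip it back before zeroing and freeing) can be illustrated outside the kernel with mmap/mprotect. This is only a user-space analogue of the kmem_alloc/vm_map_protect sequence, not the kernel code itself:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
        long page = sysconf(_SC_PAGESIZE);

        /* Stand-in for kmem_alloc: a whole page dedicated to the key. */
        unsigned char *key_page = mmap(NULL, (size_t)page, PROT_READ | PROT_WRITE,
                                       MAP_PRIVATE | MAP_ANON, -1, 0);
        if (key_page == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* Fill in the "key" while the page is still writable. */
        memset(key_page, 0xA5, 32);

        /* Analogue of cpx_writeprotect: drop write permission on the whole page. */
        if (mprotect(key_page, (size_t)page, PROT_READ) != 0)
            perror("mprotect (read-only)");

        /* Analogue of cpx_free: restore write access, zero the key, release the page. */
        if (mprotect(key_page, (size_t)page, PROT_READ | PROT_WRITE) != 0)
            perror("mprotect (read-write)");
        memset(key_page, 0, 32);
        munmap(key_page, (size_t)page);
        return 0;
    }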
index 8cc7237c6a332dd1c28cb4dc3ed7e6a84ff93baf..79872204b7ecbb9ddcae8de2440d01bc6a5c1a70 100644 (file)
 // idle period until assumed disk spin down
 #define DISK_IDLE_SEC (10 * 60)
 
+struct saved_mount_fields {
+       uint32_t        mnt_maxreadcnt;         /* Max. byte count for read */
+       uint32_t        mnt_maxwritecnt;        /* Max. byte count for write */
+       uint32_t        mnt_segreadcnt;         /* Max. segment count for read */
+       uint32_t        mnt_segwritecnt;        /* Max. segment count for write */
+       uint32_t        mnt_ioqueue_depth;      /* the maximum number of commands a device can accept */
+       uint32_t        mnt_ioscale;            /* scale the various throttles/limits imposed on the amount of I/O in flight */
+};
+
 struct _disk_conditioner_info_t {
-       boolean_t enabled; // if other fields have any effect
-       uint64_t access_time_usec; // maximum latency before an I/O transfer begins
-       uint64_t read_throughput_mbps; // throughput of an I/O read
-       uint64_t write_throughput_mbps; // throughput of an I/O write
-       boolean_t is_ssd; // behave like an SSD (for both conditioning and affecting behavior in other parts of VFS)
+       disk_conditioner_info dcinfo; // all the original data from fsctl
+       struct saved_mount_fields mnt_fields; // fields to restore in mount_t when conditioner is disabled
+
        daddr64_t last_blkno; // approx. last transfered block for simulating seek times
        struct timeval last_io_timestamp; // the last time an I/O completed
 };
@@ -85,25 +92,33 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e
        daddr64_t blkdiff;
        daddr64_t last_blkno;
        double access_time_scale;
-       struct _disk_conditioner_info_t *info = NULL;
+       struct _disk_conditioner_info_t *internal_info = NULL;
+       disk_conditioner_info *info = NULL;
        struct timeval elapsed;
        struct timeval start;
+       vnode_t vp;
 
-       mp = buf_vnode(bp)->v_mount;
+       vp = buf_vnode(bp);
+       if (!vp) {
+               return;
+       }
+
+       mp = vp->v_mount;
        if (!mp) {
                return;
        }
 
-       info = mp->mnt_disk_conditioner_info;
-       if (!info || !info->enabled) {
+       internal_info = mp->mnt_disk_conditioner_info;
+       if (!internal_info || !internal_info->dcinfo.enabled) {
                return;
        }
+       info = &(internal_info->dcinfo);
 
        if (!info->is_ssd) {
                // calculate approximate seek time based on difference in block number
-               last_blkno = info->last_blkno;
+               last_blkno = internal_info->last_blkno;
                blkdiff = bp->b_blkno > last_blkno ? bp->b_blkno - last_blkno : last_blkno - bp->b_blkno;
-               info->last_blkno = bp->b_blkno + bp->b_bcount;
+               internal_info->last_blkno = bp->b_blkno + bp->b_bcount;
        } else {
                blkdiff = BLK_MAX(mp);
        }
@@ -122,15 +137,15 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e
        // try simulating disk spinup based on time since last I/O
        if (!info->is_ssd) {
                microuptime(&elapsed);
-               timevalsub(&elapsed, &info->last_io_timestamp);
+               timevalsub(&elapsed, &internal_info->last_io_timestamp);
                // avoid this delay right after boot (assuming last_io_timestamp is 0 and disk is already spinning)
-               if (elapsed.tv_sec > DISK_IDLE_SEC && info->last_io_timestamp.tv_sec != 0) {
+               if (elapsed.tv_sec > DISK_IDLE_SEC && internal_info->last_io_timestamp.tv_sec != 0) {
                        delay_usec += DISK_SPINUP_SEC * USEC_PER_SEC;
                }
        }
 
        if (delay_usec <= already_elapsed_usec) {
-               microuptime(&info->last_io_timestamp);
+               microuptime(&internal_info->last_io_timestamp);
                return;
        }
 
@@ -153,7 +168,7 @@ disk_conditioner_delay(buf_t bp, int extents, int total_size, uint64_t already_e
                }
        }
 
-       microuptime(&info->last_io_timestamp);
+       microuptime(&internal_info->last_io_timestamp);
 }
 
 int
@@ -167,23 +182,29 @@ disk_conditioner_get_info(mount_t mp, disk_conditioner_info *uinfo)
 
        info = mp->mnt_disk_conditioner_info;
 
-       if (!info) {
-               return 0;
+       if (info) {
+               memcpy(uinfo, &(info->dcinfo), sizeof(disk_conditioner_info));
        }
 
-       uinfo->enabled = info->enabled;
-       uinfo->access_time_usec = info->access_time_usec;
-       uinfo->read_throughput_mbps = info->read_throughput_mbps;
-       uinfo->write_throughput_mbps = info->write_throughput_mbps;
-       uinfo->is_ssd = info->is_ssd;
-
        return 0;
 }
 
+static inline void
+disk_conditioner_restore_mount_fields(mount_t mp, struct saved_mount_fields *mnt_fields) {
+       mp->mnt_maxreadcnt = mnt_fields->mnt_maxreadcnt;
+       mp->mnt_maxwritecnt = mnt_fields->mnt_maxwritecnt;
+       mp->mnt_segreadcnt = mnt_fields->mnt_segreadcnt;
+       mp->mnt_segwritecnt = mnt_fields->mnt_segwritecnt;
+       mp->mnt_ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
+       mp->mnt_ioscale = mnt_fields->mnt_ioscale;
+}
+
 int
 disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
 {
-       struct _disk_conditioner_info_t *info;
+       struct _disk_conditioner_info_t *internal_info;
+       disk_conditioner_info *info;
+       struct saved_mount_fields *mnt_fields;
 
        if (!kauth_cred_issuser(kauth_cred_get()) || !IOTaskHasEntitlement(current_task(), DISK_CONDITIONER_SET_ENTITLEMENT)) {
                return EPERM;
@@ -193,18 +214,62 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
                return EINVAL;
        }
 
-       info = mp->mnt_disk_conditioner_info;
-       if (!info) {
-               info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t));
-               bzero(info, sizeof(struct _disk_conditioner_info_t));
+       mount_lock(mp);
+
+       internal_info = mp->mnt_disk_conditioner_info;
+       if (!internal_info) {
+               internal_info = mp->mnt_disk_conditioner_info = kalloc(sizeof(struct _disk_conditioner_info_t));
+               bzero(internal_info, sizeof(struct _disk_conditioner_info_t));
+               mnt_fields = &(internal_info->mnt_fields);
+
+               /* save mount_t fields for restoration later */
+               mnt_fields->mnt_maxreadcnt = mp->mnt_maxreadcnt;
+               mnt_fields->mnt_maxwritecnt = mp->mnt_maxwritecnt;
+               mnt_fields->mnt_segreadcnt = mp->mnt_segreadcnt;
+               mnt_fields->mnt_segwritecnt = mp->mnt_segwritecnt;
+               mnt_fields->mnt_ioqueue_depth = mp->mnt_ioqueue_depth;
+               mnt_fields->mnt_ioscale = mp->mnt_ioscale;
+       }
+
+       info = &(internal_info->dcinfo);
+       mnt_fields = &(internal_info->mnt_fields);
+
+       if (!uinfo->enabled && info->enabled) {
+               /* disk conditioner is being disabled when already enabled */
+               disk_conditioner_restore_mount_fields(mp, mnt_fields);
+       }
+
+       memcpy(info, uinfo, sizeof(disk_conditioner_info));
+
+       /* scale back based on hardware advertised limits */
+       if (uinfo->ioqueue_depth == 0 || uinfo->ioqueue_depth > mnt_fields->mnt_ioqueue_depth) {
+               info->ioqueue_depth = mnt_fields->mnt_ioqueue_depth;
+       }
+       if (uinfo->maxreadcnt == 0 || uinfo->maxreadcnt > mnt_fields->mnt_maxreadcnt) {
+               info->maxreadcnt = mnt_fields->mnt_maxreadcnt;
+       }
+       if (uinfo->maxwritecnt == 0 || uinfo->maxwritecnt > mnt_fields->mnt_maxwritecnt) {
+               info->maxwritecnt = mnt_fields->mnt_maxwritecnt;
+       }
+       if (uinfo->segreadcnt == 0 || uinfo->segreadcnt > mnt_fields->mnt_segreadcnt) {
+               info->segreadcnt = mnt_fields->mnt_segreadcnt;
        }
+       if (uinfo->segwritecnt == 0 || uinfo->segwritecnt > mnt_fields->mnt_segwritecnt) {
+               info->segwritecnt = mnt_fields->mnt_segwritecnt;
+       }
+
+       if (uinfo->enabled) {
+               mp->mnt_maxreadcnt = info->maxreadcnt;
+               mp->mnt_maxwritecnt = info->maxwritecnt;
+               mp->mnt_segreadcnt = info->segreadcnt;
+               mp->mnt_segwritecnt = info->segwritecnt;
+               mp->mnt_ioqueue_depth = info->ioqueue_depth;
+               mp->mnt_ioscale = MNT_IOSCALE(info->ioqueue_depth);
+       }
+
+       mount_unlock(mp);
 
-       info->enabled = uinfo->enabled;
-       info->access_time_usec = uinfo->access_time_usec;
-       info->read_throughput_mbps = uinfo->read_throughput_mbps;
-       info->write_throughput_mbps = uinfo->write_throughput_mbps;
-       info->is_ssd = uinfo->is_ssd;
-       microuptime(&info->last_io_timestamp);
+       microuptime(&internal_info->last_io_timestamp);
 
        // make sure throttling picks up the new periods
        throttle_info_mount_reset_period(mp, info->is_ssd);
@@ -215,21 +280,27 @@ disk_conditioner_set_info(mount_t mp, disk_conditioner_info *uinfo)
 void
 disk_conditioner_unmount(mount_t mp)
 {
-       if (!mp->mnt_disk_conditioner_info) {
+       struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
+
+       if (!internal_info) {
                return;
        }
-       kfree(mp->mnt_disk_conditioner_info, sizeof(struct _disk_conditioner_info_t));
+
+       if (internal_info->dcinfo.enabled) {
+               disk_conditioner_restore_mount_fields(mp, &(internal_info->mnt_fields));
+       }
        mp->mnt_disk_conditioner_info = NULL;
+       kfree(internal_info, sizeof(struct _disk_conditioner_info_t));
 }
 
 boolean_t
 disk_conditioner_mount_is_ssd(mount_t mp)
 {
-       struct _disk_conditioner_info_t *info = mp->mnt_disk_conditioner_info;
+       struct _disk_conditioner_info_t *internal_info = mp->mnt_disk_conditioner_info;
 
-       if (!info || !info->enabled) {
-               return (mp->mnt_kern_flag & MNTK_SSD);
+       if (!internal_info || !internal_info->dcinfo.enabled) {
+               return !!(mp->mnt_kern_flag & MNTK_SSD);
        }
 
-       return info->is_ssd;
+       return internal_info->dcinfo.is_ssd;
 }
index f2e6b0bc31cbbe6e8512b2ced22a1f094cce26ce..5b8eac30ec581d84b41d5656b4640de672f2e214 100644 (file)
@@ -1303,11 +1303,11 @@ copy_out_kfse(fs_event_watcher *watcher, kfs_event *kfse, struct uio *uio)
        return 0;
     }
 
-    if (kfse->type == FSE_RENAME && kfse->dest == NULL) {
+    if (((kfse->type == FSE_RENAME) || (kfse->type == FSE_CLONE)) && kfse->dest == NULL) {
        //
        // This can happen if an event gets recycled but we had a
        // pointer to it in our event queue.  The event is the
-       // destination of a rename which we'll process separately
+       // destination of a rename or clone which we'll process separately
        // (that is, another kfse points to this one so it's ok
        // to skip this guy because we'll process it when we process
        // the other one)
@@ -1967,7 +1967,7 @@ filt_fsevent(struct knote *kn, long hint)
        switch(kn->kn_filter) {
                case EVFILT_READ:
                        kn->kn_data = amt;
-                       
+
                        if (kn->kn_data != 0) {
                                activate = 1;
                        }
@@ -2001,8 +2001,6 @@ filt_fsevent_touch(struct knote *kn, struct kevent_internal_s *kev)
        /* accept new fflags/data as saved */
        kn->kn_sfflags = kev->fflags;
        kn->kn_sdata = kev->data;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /* restrict the current results to the (smaller?) set of new interest */
        /*
@@ -2079,8 +2077,6 @@ fseventsf_drain(struct fileproc *fp, __unused vfs_context_t ctx)
     int counter = 0;
     fsevent_handle *fseh = (struct fsevent_handle *)fp->f_fglob->fg_data;
 
-    fseh->watcher->flags |= WATCHER_CLOSING;
-
     // if there are people still waiting, sleep for 10ms to
     // let them clean up and get out of there.  however we
     // also don't want to get stuck forever so if they don't
index 6dbd62b930b767df7cb3c8f62720f173b9d9b19c..87db6067f5470105291a4f7c0dc84e26a2936029 100644 (file)
@@ -75,7 +75,7 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target)
                strlcat(c_name, "(", sizeof(c_name));
                strlcat(c_name, uuidstr, sizeof(c_name));
                strlcat(c_name, ")", sizeof(c_name));
-               if (0 != escape_str(c_name, strlen(c_name), sizeof(c_name))) {
+               if (0 != escape_str(c_name, strlen(c_name) + 1, sizeof(c_name))) {
                        return;
                }
 
@@ -84,7 +84,7 @@ fslog_extmod_msgtracer(proc_t caller, proc_t target)
                strlcat(t_name, "(", sizeof(t_name));
                strlcat(t_name, uuidstr, sizeof(t_name));
                strlcat(t_name, ")", sizeof(t_name));
-               if (0 != escape_str(t_name, strlen(t_name), sizeof(t_name))) {
+               if (0 != escape_str(t_name, strlen(t_name) + 1, sizeof(t_name))) {
                        return;
                }
 #if DEBUG
index 55b86f9e65efb6d812f20172cd9d96266c57bfeb..ccee2e1c59a14ce37e65329d4793d7632e4f071e 100644 (file)
 #define VOLFS_MIN_PATH_LEN  9
 
 
-static void kdebug_lookup(struct vnode *dp, struct componentname *cnp);
-
 #if CONFIG_VOLFS
 static int vfs_getrealpath(const char * path, char * realpath, size_t bufsize, vfs_context_t ctx);
 #define MAX_VOLFS_RESTARTS 5
@@ -1746,24 +1744,33 @@ nameidone(struct nameidata *ndp)
 #if (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST)
 
 void
-kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t lookup)
+kdebug_vfs_lookup(long *dbg_parms, int dbg_namelen, void *dp, uint32_t flags)
 {
        int code;
        unsigned int i;
+       bool lookup = flags & KDBG_VFS_LOOKUP_FLAG_LOOKUP;
+       bool noprocfilt = flags & KDBG_VFS_LOOKUP_FLAG_NOPROCFILT;
 
        /*
         * In the event that we collect multiple, consecutive pathname
         * entries, we must mark the start of the path's string and the end.
         */
-       if (lookup == TRUE)
+       if (lookup) {
                code = VFS_LOOKUP | DBG_FUNC_START;
-       else
+       } else {
                code = VFS_LOOKUP_DONE | DBG_FUNC_START;
+       }
 
        if (dbg_namelen <= (int)(3 * sizeof(long)))
                code |= DBG_FUNC_END;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, kdebug_vnode(dp), dbg_parms[0], dbg_parms[1], dbg_parms[2], 0);
+       if (noprocfilt) {
+               KDBG_RELEASE_NOPROCFILT(code, kdebug_vnode(dp), dbg_parms[0],
+                               dbg_parms[1], dbg_parms[2]);
+       } else {
+               KDBG_RELEASE(code, kdebug_vnode(dp), dbg_parms[0], dbg_parms[1],
+                               dbg_parms[2]);
+       }
 
        code &= ~DBG_FUNC_START;
 
@@ -1771,11 +1778,25 @@ kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp, boolean_t l
                if (dbg_namelen <= (int)(4 * sizeof(long)))
                        code |= DBG_FUNC_END;
 
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, code, dbg_parms[i], dbg_parms[i+1], dbg_parms[i+2], dbg_parms[i+3], 0);
+               if (noprocfilt) {
+                       KDBG_RELEASE_NOPROCFILT(code, dbg_parms[i], dbg_parms[i + 1],
+                                       dbg_parms[i + 2], dbg_parms[i + 3]);
+               } else {
+                       KDBG_RELEASE(code, dbg_parms[i], dbg_parms[i + 1], dbg_parms[i + 2],
+                                       dbg_parms[i + 3]);
+               }
        }
 }
 
-static void
+void
+kdebug_lookup_gen_events(long *dbg_parms, int dbg_namelen, void *dp,
+               boolean_t lookup)
+{
+       kdebug_vfs_lookup(dbg_parms, dbg_namelen, dp,
+                       lookup ? KDBG_VFS_LOOKUP_FLAG_LOOKUP : 0);
+}
+
+void
 kdebug_lookup(vnode_t dp, struct componentname *cnp)
 {
        int dbg_namelen;
@@ -1799,13 +1820,15 @@ kdebug_lookup(vnode_t dp, struct componentname *cnp)
                       *(cnp->cn_nameptr + cnp->cn_namelen) ? '>' : 0,
                       sizeof(dbg_parms) - dbg_namelen);
        }
-       kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)dp, TRUE);
-}      
+       kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)dp,
+                       KDBG_VFS_LOOKUP_FLAG_LOOKUP);
+}
 
 #else /* (KDEBUG_LEVEL >= KDEBUG_LEVEL_IST) */
 
 void
-kdebug_lookup_gen_events(long *dbg_parms __unused, int dbg_namelen __unused, void *dp __unused)
+kdebug_vfs_lookup(long *dbg_parms __unused, int dbg_namelen __unused,
+               void *dp __unused, __unused uint32_t flags)
 {
 }
 
index 4da9d4535b4b5c90823fe5bd47e02bb533b2b5b0..e1d18c7c3b25821121f1499c9cf59230ce19239f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -3326,7 +3326,7 @@ vfs_init_io_attributes(vnode_t devvp, mount_t mp)
                temp = MNT_DEFAULT_IOQUEUE_DEPTH;
 
        mp->mnt_ioqueue_depth = temp;
-       mp->mnt_ioscale = (mp->mnt_ioqueue_depth + (MNT_DEFAULT_IOQUEUE_DEPTH - 1)) / MNT_DEFAULT_IOQUEUE_DEPTH;
+       mp->mnt_ioscale = MNT_IOSCALE(mp->mnt_ioqueue_depth);
 
        if (mp->mnt_ioscale > 1)
                printf("ioqueue_depth = %d,   ioscale = %d\n", (int)mp->mnt_ioqueue_depth, (int)mp->mnt_ioscale);
@@ -3782,8 +3782,6 @@ filt_fstouch(struct knote *kn, struct kevent_internal_s *kev)
        lck_mtx_lock(fs_klist_lock);
 
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        /*
         * the above filter function sets bits even if nobody is looking for them.
@@ -3919,7 +3917,7 @@ SYSCTL_PROC(_vfs_generic, OID_AUTO, noremotehang, CTLFLAG_RW | CTLFLAG_ANYBODY,
 SYSCTL_INT(_vfs_generic, VFS_MAXTYPENUM, maxtypenum,
                   CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED,
                   &maxvfstypenum, 0, "");
-SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout, 0, "");
+SYSCTL_INT(_vfs_generic, OID_AUTO, sync_timeout, CTLFLAG_RW | CTLFLAG_LOCKED, &sync_timeout_seconds, 0, "");
 SYSCTL_NODE(_vfs_generic, VFS_CONF, conf,
                   CTLFLAG_RD | CTLFLAG_LOCKED,
                   sysctl_vfs_generic_conf, "");
@@ -5133,12 +5131,17 @@ vnode_create_internal(uint32_t flavor, uint32_t size, void *data, vnode_t *vpp,
        ut = get_bsdthread_info(current_thread());
 
        if ((current_proc()->p_lflag & P_LRAGE_VNODES) ||
-           (ut->uu_flag & UT_RAGE_VNODES)) {
+           (ut->uu_flag & (UT_RAGE_VNODES | UT_KERN_RAGE_VNODES))) {
                /*
                 * process has indicated that it wants any
                 * vnodes created on its behalf to be rapidly
                 * aged to reduce the impact on the cached set
                 * of vnodes
+                *
+                * if UT_KERN_RAGE_VNODES is set, then the
+                * kernel internally wants vnodes to be rapidly
+                * aged, even if the process hasn't requested
+                * this
                 */
                vp->v_flag |= VRAGE;
        }
@@ -5843,9 +5846,17 @@ error:
                if (!batched) {
                        *vpp = (vnode_t) 0;
                        vnode_put(vp);
+                       vp = NULLVP;
                }
        }
 
+       /*
+        * For creation VNOPs, this is the equivalent of
+        * lookup_handle_found_vnode.
+        */
+       if (kdebug_enable && *vpp)
+               kdebug_lookup(*vpp, cnp);
+
 out:
        vn_attribute_cleanup(vap, defaulted);
 
@@ -6135,6 +6146,15 @@ vn_authorize_renamex(struct vnode *fdvp,  struct vnode *fvp,  struct componentna
                                         struct vnode *tdvp,  struct vnode *tvp,  struct componentname *tcnp,
                                         vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
 {
+
+       return vn_authorize_renamex_with_paths(fdvp, fvp, fcnp, NULL, tdvp, tvp, tcnp, NULL, ctx, flags, reserved);
+}
+
+int
+vn_authorize_renamex_with_paths(struct vnode *fdvp,  struct vnode *fvp,  struct componentname *fcnp, const char *from_path,
+                                        struct vnode *tdvp,  struct vnode *tvp,  struct componentname *tcnp, const char *to_path,
+                                        vfs_context_t ctx, vfs_rename_flags_t flags, void *reserved)
+{
        int error = 0;
        int moving = 0;
        bool swap = flags & VFS_RENAME_SWAP;
@@ -6231,6 +6251,23 @@ vn_authorize_renamex(struct vnode *fdvp,  struct vnode *fvp,  struct componentna
 
        /***** <Kauth> *****/
 
+       /*
+        * As part of the Kauth step, we call out to allow 3rd-party
+        * fileop notification of "about to rename".  This is needed
+        * in the event that 3rd-parties need to know that the DELETE
+        * authorization is actually part of a rename.  It's important
+        * that we guarantee that the DELETE call-out will always be
+        * made if the WILL_RENAME call-out is made.  Another fileop
+        * call-out will be performed once the operation is completed.
+        * We can ignore the result of kauth_authorize_fileop().
+        *
+        * N.B. We are passing the vnode and *both* paths to each
+        * call; kauth_authorize_fileop() extracts the "from" path
+        * when posting a KAUTH_FILEOP_WILL_RENAME notification.
+        * As such, we only post these notifications if all of the
+        * information we need is provided.
+        */
+
        if (swap) {
                kauth_action_t f = 0, t = 0;
 
@@ -6244,9 +6281,19 @@ vn_authorize_renamex(struct vnode *fdvp,  struct vnode *fvp,  struct componentna
                        if (vnode_isdir(tvp))
                                t = KAUTH_VNODE_ADD_SUBDIRECTORY;
                }
+               if (to_path != NULL)
+                       kauth_authorize_fileop(vfs_context_ucred(ctx),
+                                       KAUTH_FILEOP_WILL_RENAME,
+                                       (uintptr_t)fvp,
+                                       (uintptr_t)to_path);
                error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | f, ctx);
                if (error)
                        goto out;
+               if (from_path != NULL)
+                       kauth_authorize_fileop(vfs_context_ucred(ctx),
+                                       KAUTH_FILEOP_WILL_RENAME,
+                                       (uintptr_t)tvp,
+                                       (uintptr_t)from_path);
                error = vnode_authorize(tvp, tdvp, KAUTH_VNODE_DELETE | t, ctx);
                if (error)
                        goto out;
@@ -6278,6 +6325,11 @@ vn_authorize_renamex(struct vnode *fdvp,  struct vnode *fvp,  struct componentna
                 * If fvp is a directory, and we are changing its parent,
                 * then we also need rights to rewrite its ".." entry as well.
                 */
+               if (to_path != NULL)
+                       kauth_authorize_fileop(vfs_context_ucred(ctx),
+                                       KAUTH_FILEOP_WILL_RENAME,
+                                       (uintptr_t)fvp,
+                                       (uintptr_t)to_path);
                if (vnode_isdir(fvp)) {
                        if ((error = vnode_authorize(fvp, fdvp, KAUTH_VNODE_DELETE | KAUTH_VNODE_ADD_SUBDIRECTORY, ctx)) != 0)
                                goto out;
@@ -9880,7 +9932,8 @@ static int vnode_trace_path_callback(struct vnode *vp, void *arg) {
        /* vn_getpath() NUL-terminates, and len includes the NUL */
 
        if (!rv) {
-               kdebug_lookup_gen_events(ctx->path, len, vp, TRUE);
+               kdebug_vfs_lookup(ctx->path, len, vp,
+                               KDBG_VFS_LOOKUP_FLAG_LOOKUP | KDBG_VFS_LOOKUP_FLAG_NOPROCFILT);
 
                if (++(ctx->count) == 1000) {
                        thread_yield_to_preemption();
index dccc77bd61fddc787e9cffa80600b6c40e86cf55..767d352c6993af8969b2a1f37520912eee24e5a7 100644 (file)
@@ -174,8 +174,6 @@ static int getfsstat_callback(mount_t mp, void * arg);
 static int getutimes(user_addr_t usrtvp, struct timespec *tsp);
 static int setutimes(vfs_context_t ctx, vnode_t vp, const struct timespec *ts, int nullflag);
 static int sync_callback(mount_t, void *);
-static void hibernate_sync_thread(void *, __unused wait_result_t);
-static int hibernate_sync_async(int);
 static int munge_statfs(struct mount *mp, struct vfsstatfs *sfsp,
                        user_addr_t bufp, int *sizep, boolean_t is_64_bit,
                                                boolean_t partial_copy);
@@ -217,6 +215,13 @@ static void mount_end_update(mount_t mp);
 static int relocate_imageboot_source(vnode_t pvp, vnode_t vp, struct componentname *cnp, const char *fsname, vfs_context_t ctx, boolean_t is64bit, user_addr_t fsmountargs, boolean_t by_index);
 #endif /* CONFIG_IMGSRC_ACCESS */
 
+//snapshot functions
+#if CONFIG_MNT_ROOTSNAP
+static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx);
+#else
+static int snapshot_root(int dirfd, user_addr_t name, uint32_t flags, vfs_context_t ctx) __attribute__((unused));
+#endif
+
 int (*union_dircheckp)(struct vnode **, struct fileproc *, vfs_context_t);
 
 __private_extern__
@@ -2323,8 +2328,6 @@ int syncprt = 0;
 #endif
 
 int print_vmpage_stat=0;
-int sync_timeout = 60;  // Sync time limit (sec)
-
 
 static int
 sync_callback(mount_t mp, __unused void *arg)
@@ -2358,15 +2361,64 @@ sync(__unused proc_t p, __unused struct sync_args *uap, __unused int32_t *retval
        return 0;
 }
 
+typedef enum {
+       SYNC_ALL = 0,
+       SYNC_ONLY_RELIABLE_MEDIA = 1,
+       SYNC_ONLY_UNRELIABLE_MEDIA = 2
+} sync_type_t;
+
+static int
+sync_internal_callback(mount_t mp, void *arg)
+{
+       if (arg) {
+               int is_reliable = !(mp->mnt_kern_flag & MNTK_VIRTUALDEV) &&
+                                  (mp->mnt_flag & MNT_LOCAL);
+               sync_type_t sync_type = *((sync_type_t *)arg);
+
+               if ((sync_type == SYNC_ONLY_RELIABLE_MEDIA) && !is_reliable)
+                       return (VFS_RETURNED);
+               else if ((sync_type == SYNC_ONLY_UNRELIABLE_MEDIA) && is_reliable)
+                       return (VFS_RETURNED);
+       }
+
+       (void)sync_callback(mp, NULL);
+
+       return (VFS_RETURNED);
+}
+
+int sync_thread_state = 0;
+int sync_timeout_seconds = 5;
+
+#define SYNC_THREAD_RUN       0x0001
+#define SYNC_THREAD_RUNNING   0x0002
+
 static void
-hibernate_sync_thread(void *arg, __unused wait_result_t wr)
+sync_thread(__unused void *arg, __unused wait_result_t wr)
 {
-       int *timeout = (int *) arg;
+       sync_type_t sync_type;
 
-       vfs_iterate(LK_NOWAIT, sync_callback, NULL);
+       lck_mtx_lock(sync_mtx_lck);
+       while (sync_thread_state & SYNC_THREAD_RUN) {
+               sync_thread_state &= ~SYNC_THREAD_RUN;
+               lck_mtx_unlock(sync_mtx_lck);
+
+               sync_type = SYNC_ONLY_RELIABLE_MEDIA;
+               vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
+               sync_type = SYNC_ONLY_UNRELIABLE_MEDIA;
+               vfs_iterate(LK_NOWAIT, sync_internal_callback, &sync_type);
+
+               lck_mtx_lock(sync_mtx_lck);
+       }
+       /*
+        * This wakeup _has_ to be issued before the lock is released otherwise
+        * we may end up waking up a thread in sync_internal which is
+        * expecting a wakeup from a thread it just created and not from this
+        * thread which is about to exit.
+        */
+       wakeup(&sync_thread_state);
+       sync_thread_state &= ~SYNC_THREAD_RUNNING;
+       lck_mtx_unlock(sync_mtx_lck);
 
-       if (timeout)
-               wakeup((caddr_t) timeout);
        if (print_vmpage_stat) {
                vm_countdirtypages();
        }
@@ -2377,41 +2429,52 @@ hibernate_sync_thread(void *arg, __unused wait_result_t wr)
 #endif /* DIAGNOSTIC */
 }
 
+struct timeval sync_timeout_last_print = {0, 0};
+
 /*
- * Sync in a separate thread so we can time out if it blocks.
+ * An in-kernel sync for power management to call.
+ * This function always returns within sync_timeout seconds.
  */
-static int
-hibernate_sync_async(int timeout)
+__private_extern__ int
+sync_internal(void)
 {
        thread_t thd;
        int error;
-       struct timespec ts = {timeout, 0};
+       int thread_created = FALSE;
+       struct timespec ts = {sync_timeout_seconds, 0};
 
        lck_mtx_lock(sync_mtx_lck);
-       if (kernel_thread_start(hibernate_sync_thread, &timeout, &thd) != KERN_SUCCESS) {
-               printf("hibernate_sync_thread failed\n");
-               lck_mtx_unlock(sync_mtx_lck);
-               return (0);
+       sync_thread_state |= SYNC_THREAD_RUN;
+       if (!(sync_thread_state & SYNC_THREAD_RUNNING)) {
+               int kr;
+
+               sync_thread_state |= SYNC_THREAD_RUNNING;
+               kr = kernel_thread_start(sync_thread, NULL, &thd);
+               if (kr != KERN_SUCCESS) {
+                       sync_thread_state &= ~SYNC_THREAD_RUNNING;
+                       lck_mtx_unlock(sync_mtx_lck);
+                       printf("sync_thread failed\n");
+                       return (0);
+               }
+               thread_created = TRUE;
        }
 
-       error = msleep((caddr_t) &timeout, sync_mtx_lck, (PVFS | PDROP | PCATCH), "hibernate_sync_thread", &ts);
+       error = msleep((caddr_t)&sync_thread_state, sync_mtx_lck,
+           (PVFS | PDROP | PCATCH), "sync_thread", &ts);
        if (error) {
-               printf("sync timed out: %d sec\n", timeout);
+               struct timeval now;
+
+               microtime(&now);
+               if (now.tv_sec - sync_timeout_last_print.tv_sec > 120) {
+                       printf("sync timed out: %d sec\n", sync_timeout_seconds);
+                       sync_timeout_last_print.tv_sec = now.tv_sec;
+               }
        }
-       thread_deallocate(thd);
 
-       return (0);
-}
+       if (thread_created)
+               thread_deallocate(thd);
 
-/*
- * An in-kernel sync for power management to call.
- */
-__private_extern__ int
-sync_internal(void)
-{
-       (void) hibernate_sync_async(sync_timeout);
-
-       return 0;
+       return (0);
 } /* end of sync_internal call */
 
 /*
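
The SYNC_THREAD_RUN / SYNC_THREAD_RUNNING pair implements a common coalescing pattern: callers set the RUN bit and wait with a timeout, while at most one worker drains requests, re-checking RUN under the lock so a request that arrives mid-sync triggers exactly one more pass. A user-space pthread analogue of that state machine (not the kernel code, which uses lck_mtx, msleep, and wakeup):

    #include <pthread.h>
    #include <stdio.h>
    #include <time.h>

    static pthread_mutex_t sync_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  sync_cv  = PTHREAD_COND_INITIALIZER;
    static int sync_state;                 /* bitwise OR of the flags below */
    #define SYNC_THREAD_RUN      0x0001
    #define SYNC_THREAD_RUNNING  0x0002

    static void do_one_sync_pass(void)
    {
        printf("syncing all filesystems\n");
    }

    static void *sync_worker(void *arg)
    {
        (void)arg;
        pthread_mutex_lock(&sync_mtx);
        while (sync_state & SYNC_THREAD_RUN) {
            sync_state &= ~SYNC_THREAD_RUN;
            pthread_mutex_unlock(&sync_mtx);
            do_one_sync_pass();            /* work is done without the lock held */
            pthread_mutex_lock(&sync_mtx);
        }
        /* Wake waiters before clearing RUNNING, mirroring the ordering comment above. */
        pthread_cond_broadcast(&sync_cv);
        sync_state &= ~SYNC_THREAD_RUNNING;
        pthread_mutex_unlock(&sync_mtx);
        return NULL;
    }

    static void request_sync(void)
    {
        pthread_t thd;
        struct timespec ts;

        pthread_mutex_lock(&sync_mtx);
        sync_state |= SYNC_THREAD_RUN;
        if (!(sync_state & SYNC_THREAD_RUNNING)) {
            sync_state |= SYNC_THREAD_RUNNING;
            if (pthread_create(&thd, NULL, sync_worker, NULL) != 0) {
                sync_state &= ~SYNC_THREAD_RUNNING;
                pthread_mutex_unlock(&sync_mtx);
                return;
            }
            pthread_detach(thd);
        }
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 5;                    /* bounded wait, like sync_timeout_seconds */
        pthread_cond_timedwait(&sync_cv, &sync_mtx, &ts);
        pthread_mutex_unlock(&sync_mtx);
    }

    int main(void)
    {
        request_sync();
        return 0;
    }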
@@ -2422,12 +2485,12 @@ int
 quotactl(proc_t p, struct quotactl_args *uap, __unused int32_t *retval)
 {
        struct mount *mp;
-       int error, quota_cmd, quota_status;
+       int error, quota_cmd, quota_status = 0;
        caddr_t datap;
        size_t fnamelen;
        struct nameidata nd;
        vfs_context_t ctx = vfs_context_current();
-       struct dqblk my_dqblk;
+       struct dqblk my_dqblk = {};
 
        AUDIT_ARG(uid, uap->uid);
        AUDIT_ARG(cmd, uap->cmd);
@@ -3646,6 +3709,12 @@ open1(vfs_context_t ctx, struct nameidata *ndp, int uflags,
                                     strlen(vp->v_name)) ||
                            !strncmp(vp->v_name,
                                     "mediaserverd",
+                                    strlen(vp->v_name)) || 
+                           !strncmp(vp->v_name,
+                                    "SpringBoard",
+                                    strlen(vp->v_name)) || 
+                           !strncmp(vp->v_name,
+                                    "backboardd",
                                     strlen(vp->v_name))) {
                                /*
                                 * This file matters when launching Camera:
@@ -5294,7 +5363,7 @@ access_extended(__unused proc_t p, struct access_extended_args *uap, __unused in
                error = ENOMEM;
                goto out;
        }
-       MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK);
+       MALLOC(result, errno_t *, desc_actual * sizeof(errno_t), M_TEMP, M_WAITOK | M_ZERO);
        if (result == NULL) {
                error = ENOMEM;
                goto out;
@@ -7340,6 +7409,57 @@ continue_lookup:
        }
 
        batched = vnode_compound_rename_available(fdvp);
+
+#if CONFIG_FSE
+       need_event = need_fsevent(FSE_RENAME, fdvp);
+       if (need_event) {
+               if (fvp) {
+                       get_fse_info(fvp, &from_finfo, ctx);
+               } else {
+                       error = vfs_get_notify_attributes(&__rename_data->fv_attr);
+                       if (error) {
+                               goto out1;
+                       }
+
+                       fvap = &__rename_data->fv_attr;
+               }
+
+               if (tvp) {
+                       get_fse_info(tvp, &to_finfo, ctx);
+               } else if (batched) {
+                       error = vfs_get_notify_attributes(&__rename_data->tv_attr);
+                       if (error) {
+                               goto out1;
+                       }
+
+                       tvap = &__rename_data->tv_attr;
+               }
+       }
+#else
+       need_event = 0;
+#endif /* CONFIG_FSE */
+
+       if (need_event || kauth_authorize_fileop_has_listeners()) {
+               if (from_name == NULL) {
+                       GET_PATH(from_name);
+                       if (from_name == NULL) {
+                               error = ENOMEM;
+                               goto out1;
+                       }
+               }
+
+               from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
+
+               if (to_name == NULL) {
+                       GET_PATH(to_name);
+                       if (to_name == NULL) {
+                               error = ENOMEM;
+                               goto out1;
+                       }
+               }
+
+               to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
+       }
        if (!fvp) {
                /*
                 * Claim: this check will never reject a valid rename.
@@ -7359,7 +7479,7 @@ continue_lookup:
        }
 
        if (!batched) {
-               error = vn_authorize_renamex(fdvp, fvp, &fromnd->ni_cnd, tdvp, tvp, &tond->ni_cnd, ctx, flags, NULL);
+               error = vn_authorize_renamex_with_paths(fdvp, fvp, &fromnd->ni_cnd, from_name, tdvp, tvp, &tond->ni_cnd, to_name, ctx, flags, NULL);
                if (error) {
                        if (error == ENOENT) {
                                assert(retry_count < MAX_AUTHORIZE_ENOENT_RETRIES);
@@ -7550,56 +7670,6 @@ continue_lookup:
        oparent = fvp->v_parent;
 
 skipped_lookup:
-#if CONFIG_FSE
-       need_event = need_fsevent(FSE_RENAME, fdvp);
-       if (need_event) {
-               if (fvp) {
-                       get_fse_info(fvp, &from_finfo, ctx);
-               } else {
-                       error = vfs_get_notify_attributes(&__rename_data->fv_attr);
-                       if (error) {
-                               goto out1;
-                       }
-
-                       fvap = &__rename_data->fv_attr;
-               }
-
-               if (tvp) {
-                       get_fse_info(tvp, &to_finfo, ctx);
-               } else if (batched) {
-                       error = vfs_get_notify_attributes(&__rename_data->tv_attr);
-                       if (error) {
-                               goto out1;
-                       }
-
-                       tvap = &__rename_data->tv_attr;
-               }
-       }
-#else
-       need_event = 0;
-#endif /* CONFIG_FSE */
-
-       if (need_event || kauth_authorize_fileop_has_listeners()) {
-               if (from_name == NULL) {
-                       GET_PATH(from_name);
-                       if (from_name == NULL) {
-                               error = ENOMEM;
-                               goto out1;
-                       }
-               }
-
-               from_len = safe_getpath(fdvp, fromnd->ni_cnd.cn_nameptr, from_name, MAXPATHLEN, &from_truncated);
-
-               if (to_name == NULL) {
-                       GET_PATH(to_name);
-                       if (to_name == NULL) {
-                               error = ENOMEM;
-                               goto out1;
-                       }
-               }
-
-               to_len = safe_getpath(tdvp, tond->ni_cnd.cn_nameptr, to_name, MAXPATHLEN, &to_truncated);
-       }
        error = vn_rename(fdvp, &fvp, &fromnd->ni_cnd, fvap,
                            tdvp, &tvp, &tond->ni_cnd, tvap,
                            flags, ctx);
@@ -8658,10 +8728,10 @@ getdirentriesattr (proc_t p, struct getdirentriesattr_args *uap, int32_t *retval
        struct fileproc *fp;
        uio_t auio = NULL;
        int spacetype = proc_is64bit(p) ? UIO_USERSPACE64 : UIO_USERSPACE32;
-       uint32_t count, savecount;
-       uint32_t newstate;
+       uint32_t count = 0, savecount = 0;
+       uint32_t newstate = 0;
        int error, eofflag;
-       uint32_t loff;
+       uint32_t loff = 0;
        struct attrlist attributelist;
        vfs_context_t ctx = vfs_context_current();
        int fd = uap->fd;
@@ -10613,7 +10683,8 @@ getxattr(proc_t p, struct getxattr_args *uap, user_ssize_t *retval)
        vp = nd.ni_vp;
        nameidone(&nd);
 
-       if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
+       error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
+       if (error != 0) {
                goto out;
        }
        if (xattr_protected(attrname)) {
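
The pattern being replaced here (and in the fgetxattr/setxattr/fsetxattr hunks below) had a precedence problem: in `if ((error = copyinstr(...) != 0))` the `!=` binds tighter than `=`, so `error` ends up holding 0 or 1 rather than the value copyinstr returned, and later checks against specific errno values no longer see the real code. A tiny illustration of the parse, with a stand-in function in place of copyinstr:

    #include <assert.h>

    /* Stand-in for a call that reports failure via a specific errno-style code. */
    static int failing_call(void)
    {
        return 22;              /* e.g. EINVAL */
    }

    int main(void)
    {
        int error;

        /* Buggy form: `!=` binds before `=`, so error holds the comparison result. */
        if ((error = failing_call() != 0))
            assert(error == 1);             /* the real code (22) is lost */

        /* Fixed form: assign first, compare afterwards. */
        error = failing_call();
        if (error != 0)
            assert(error == 22);
        return 0;
    }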
@@ -10693,7 +10764,8 @@ fgetxattr(proc_t p, struct fgetxattr_args *uap, user_ssize_t *retval)
                file_drop(uap->fd);
                return(error);
        }
-       if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
+       error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
+       if (error != 0) {
                goto out;
        }
        if (xattr_protected(attrname)) {
@@ -10739,7 +10811,8 @@ setxattr(proc_t p, struct setxattr_args *uap, int *retval)
        if (uap->options & (XATTR_NOSECURITY | XATTR_NODEFAULT))
                return (EINVAL);
 
-       if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
+       error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
+       if (error != 0) {
                if (error == EPERM) {
                        /* if the string won't fit in attrname, copyinstr emits EPERM */
                        return (ENAMETOOLONG);
@@ -10798,7 +10871,8 @@ fsetxattr(proc_t p, struct fsetxattr_args *uap, int *retval)
        if (uap->options & (XATTR_NOFOLLOW | XATTR_NOSECURITY | XATTR_NODEFAULT))
                return (EINVAL);
 
-       if ((error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen) != 0)) {
+       error = copyinstr(uap->attrname, attrname, sizeof(attrname), &namelen);
+       if (error != 0) {
                if (error == EPERM) {
                        /* if the string won't fit in attrname, copyinstr emits EPERM */
                        return (ENAMETOOLONG);
@@ -11096,9 +11170,9 @@ unionget:
 
        if (kdebug_enable) {
                long dbg_parms[NUMPARMS];
-                int  dbg_namelen;
+               int  dbg_namelen;
 
-                dbg_namelen = (int)sizeof(dbg_parms);
+               dbg_namelen = (int)sizeof(dbg_parms);
 
         if (length < dbg_namelen) {
                        memcpy((char *)dbg_parms, buf, length);
@@ -11109,7 +11183,8 @@ unionget:
                        memcpy((char *)dbg_parms, buf + (length - dbg_namelen), dbg_namelen);
                }
 
-               kdebug_lookup_gen_events(dbg_parms, dbg_namelen, (void *)vp, TRUE);
+               kdebug_vfs_lookup(dbg_parms, dbg_namelen, (void *)vp,
+                               KDBG_VFS_LOOKUP_FLAG_LOOKUP);
        }
 
        *pathlen = (user_ssize_t)length; /* may be superseded by error */
@@ -11140,7 +11215,7 @@ fsgetpath(__unused proc_t p, struct fsgetpath_args *uap, user_ssize_t *retval)
        if (uap->bufsize > PAGE_SIZE) {
                return (EINVAL);
        }
-       MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK);
+       MALLOC(realpath, char *, uap->bufsize, M_TEMP, M_WAITOK | M_ZERO);
        if (realpath == NULL) {
                return (ENOMEM);
        }
@@ -12031,11 +12106,11 @@ fs_snapshot(__unused proc_t p, struct fs_snapshot_args *uap,
     case SNAPSHOT_OP_REVERT:
         error = snapshot_revert(uap->dirfd, uap->name1, uap->flags, ctx);
         break;
-#if !TARGET_OS_OSX
+#if CONFIG_MNT_ROOTSNAP
        case SNAPSHOT_OP_ROOT:
                error = snapshot_root(uap->dirfd, uap->name1, uap->flags, ctx);
                break;
-#endif /* !TARGET_OS_OSX */
+#endif /* CONFIG_MNT_ROOTSNAP */
        default:
                error = ENOSYS;
        }
index 797573d7540237f7f9bc211917f31dbe4f475450..6b03aa5a44f4d75f81a62ad5a6c0fd8edae9e978 100644 (file)
@@ -1795,7 +1795,7 @@ filt_vndetach(struct knote *kn)
  * differently than the regular case for VREG files.  If not in poll(),
  * then we need to know current fileproc offset for VREG.
  */
-static intptr_t
+static int64_t
 vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll)
 {
        if (vnode_isfifo(vp)) {
@@ -1803,25 +1803,25 @@ vnode_readable_data_count(vnode_t vp, off_t current_offset, int ispoll)
                int cnt;
                int err = fifo_charcount(vp, &cnt);
                if (err == 0) {
-                       return (intptr_t)cnt;
+                       return (int64_t)cnt;
                } else 
 #endif
                {
-                       return (intptr_t)0;
+                       return 0;
                }
        } else if (vnode_isreg(vp)) {
                if (ispoll) {
-                       return (intptr_t)1;
+                       return 1;
                }
 
                off_t amount;
                amount = vp->v_un.vu_ubcinfo->ui_size - current_offset;
-               if (amount > (off_t)INTPTR_MAX) {
-                       return INTPTR_MAX;
-               } else if (amount < (off_t)INTPTR_MIN) {
-                       return INTPTR_MIN;
+               if (amount > INT64_MAX) {
+                       return INT64_MAX;
+               } else if (amount < INT64_MIN) {
+                       return INT64_MIN;
                } else {
-                       return (intptr_t)amount;
+                       return (int64_t)amount;
                } 
        } else {
                panic("Should never have an EVFILT_READ except for reg or fifo.");
@@ -1936,8 +1936,6 @@ filt_vntouch(struct knote *kn, struct kevent_internal_s *kev)
 
        /* accept new input fflags mask */
        kn->kn_sfflags = kev->fflags;
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
 
        activate = filt_vnode_common(kn, vp, hint);
 
index b47ec555393023842523756c892e62ee17ef4742..f01d117b5c4f833e3563694295b1c26a489ed7e5 100644 (file)
@@ -397,6 +397,48 @@ xattr_protected(const char *attrname)
 }
 
 
+static void
+vnode_setasnamedstream_internal(vnode_t vp, vnode_t svp)
+{
+       uint32_t streamflags = VISNAMEDSTREAM;
+
+       if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) {
+               streamflags |= VISSHADOW;
+       }
+
+       /* Tag the vnode. */
+       vnode_lock_spin(svp);
+       svp->v_flag |= streamflags;
+       vnode_unlock(svp);
+
+       /* Tag the parent so we know to flush credentials for streams on setattr */
+       vnode_lock_spin(vp);
+       vp->v_lflag |= VL_HASSTREAMS;
+       vnode_unlock(vp);
+
+       /* Make the file its parent.
+        * Note:  This parent link helps us distinguish vnodes for
+        * shadow stream files from vnodes for resource fork on file
+        * systems that support namedstream natively (both have
+        * VISNAMEDSTREAM set) by allowing access to mount structure
+        * for checking MNTK_NAMED_STREAMS bit at many places in the
+        * code.
+        */
+       vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_NAMEDSTREAM_PARENT);
+
+       return;
+}
+
+errno_t
+vnode_setasnamedstream(vnode_t vp, vnode_t svp)
+{
+       if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0)
+               return (EINVAL);
+
+       vnode_setasnamedstream_internal(vp, svp);
+       return (0);
+}
+
 #if NAMEDSTREAMS
 
 /*
@@ -417,33 +459,8 @@ vnode_getnamedstream(vnode_t vp, vnode_t *svpp, const char *name, enum nsoperati
        }
 
        if (error == 0) {
-               uint32_t streamflags = VISNAMEDSTREAM;
-               vnode_t svp = *svpp;
-
-               if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) {
-                       streamflags |= VISSHADOW;
-               }
-               
-               /* Tag the vnode. */
-               vnode_lock_spin(svp);
-               svp->v_flag |= streamflags;
-               vnode_unlock(svp);
-
-               /* Tag the parent so we know to flush credentials for streams on setattr */
-               vnode_lock_spin(vp);
-               vp->v_lflag |= VL_HASSTREAMS;
-               vnode_unlock(vp);
-
-               /* Make the file it's parent.  
-                * Note:  This parent link helps us distinguish vnodes for 
-                * shadow stream files from vnodes for resource fork on file 
-                * systems that support namedstream natively (both have 
-                * VISNAMEDSTREAM set) by allowing access to mount structure 
-                * for checking MNTK_NAMED_STREAMS bit at many places in the 
-                * code.
-                */
-               vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_PARENT);
-       }               
+               vnode_setasnamedstream_internal(vp, *svpp);
+       }
 
        return (error);
 }
@@ -462,34 +479,9 @@ vnode_makenamedstream(vnode_t vp, vnode_t *svpp, const char *name, int flags, vf
                error = default_makenamedstream(vp, svpp, name, context);
 
        if (error == 0) {
-               uint32_t streamflags = VISNAMEDSTREAM;
-               vnode_t svp = *svpp;
-
-               /* Tag the vnode. */
-               if ((vp->v_mount->mnt_kern_flag & MNTK_NAMED_STREAMS) == 0) {
-                       streamflags |= VISSHADOW;
-               }
-               
-               /* Tag the vnode. */
-               vnode_lock_spin(svp);
-               svp->v_flag |= streamflags;
-               vnode_unlock(svp);
-
-               /* Tag the parent so we know to flush credentials for streams on setattr */
-               vnode_lock_spin(vp);
-               vp->v_lflag |= VL_HASSTREAMS;
-               vnode_unlock(vp);
-
-               /* Make the file it's parent.
-                * Note:  This parent link helps us distinguish vnodes for 
-                * shadow stream files from vnodes for resource fork on file 
-                * systems that support namedstream natively (both have 
-                * VISNAMEDSTREAM set) by allowing access to mount structure 
-                * for checking MNTK_NAMED_STREAMS bit at many places in the 
-                * code.
-                */
-               vnode_update_identity(svp, vp, NULL, 0, 0, VNODE_UPDATE_PARENT);
+               vnode_setasnamedstream_internal(vp, *svpp);
        }
+
        return (error);
 }
 
index 295d023fa4197c18ff564de15ddc8877ab08d94e..e54b6835630de01ffce1a103b109b20d6a40d808 100644 (file)
 #include <sys/disk.h>
 #include <vm/vm_protos.h>
 #include <vm/vm_pageout.h>
+#include <sys/content_protection.h>
 
 void vm_swapfile_open(const char *path, vnode_t *vp);
 void vm_swapfile_close(uint64_t path, vnode_t vp);
 int vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin);
 uint64_t vm_swapfile_get_blksize(vnode_t vp);
 uint64_t vm_swapfile_get_transfer_size(vnode_t vp);
-int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags);
+int vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *);
 int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);
 
+#if CONFIG_FREEZE
+int vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget);
+#endif /* CONFIG_FREEZE */
+
 
 void
 vm_swapfile_open(const char *path, vnode_t *vp)
@@ -115,7 +120,9 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
        int             error = 0;
        uint64_t        file_size = 0;
        vfs_context_t   ctx = NULL;
-
+#if CONFIG_FREEZE
+       struct vnode_attr va;
+#endif /* CONFIG_FREEZE */
 
        ctx = vfs_context_kernel();
 
@@ -148,6 +155,18 @@ vm_swapfile_preallocate(vnode_t vp, uint64_t *size, boolean_t *pin)
        vnode_lock_spin(vp);
        SET(vp->v_flag, VSWAP);
        vnode_unlock(vp);
+
+#if CONFIG_FREEZE
+       VATTR_INIT(&va);
+       VATTR_SET(&va, va_dataprotect_class, PROTECTION_CLASS_C);
+       error = VNOP_SETATTR(vp, &va, ctx);
+
+       if (error) {
+               printf("setattr PROTECTION_CLASS_C for swap file failed: %d\n", error);
+               goto done;
+       }
+#endif /* CONFIG_FREEZE */
+
 done:
        return error;
 }
@@ -170,7 +189,7 @@ vm_record_file_write(vnode_t vp, uint64_t offset, char *buf, int size)
 
 
 int
-vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags)
+vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_iodone)
 {
        int error = 0;
        uint64_t io_size = npages * PAGE_SIZE_64;
@@ -184,11 +203,13 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
 
        upl_create_flags = UPL_SET_INTERNAL | UPL_SET_LITE;
 
+       if (upl_iodone == NULL)
+               upl_control_flags = UPL_IOSYNC;
+
 #if ENCRYPTED_SWAP
-       upl_control_flags = UPL_IOSYNC | UPL_PAGING_ENCRYPTED;
-#else
-       upl_control_flags = UPL_IOSYNC;
+       upl_control_flags |= UPL_PAGING_ENCRYPTED;
 #endif
+
        if ((flags & SWAP_READ) == FALSE) {
                upl_create_flags |= UPL_COPYOUT_FROM;
        }
@@ -224,6 +245,8 @@ vm_swapfile_io(vnode_t vp, uint64_t offset, uint64_t start, int npages, int flag
                }
        
        } else {
+               upl_set_iodone(upl, upl_iodone);
+
                vnode_pageout(vp,
                              upl,
                              0,
@@ -367,3 +390,19 @@ trim_exit:
 
        return error;
 }
+
+#if CONFIG_FREEZE
+int
+vm_swap_vol_get_budget(vnode_t vp, uint64_t *freeze_daily_budget)
+{
+       vnode_t         devvp = NULL;
+       vfs_context_t   ctx = vfs_context_kernel();
+       errno_t         err = 0;
+
+       devvp = vp->v_mount->mnt_devvp;
+
+       err = VNOP_IOCTL(devvp, DKIOCGETMAXSWAPWRITE, (caddr_t)freeze_daily_budget, 0, ctx);
+
+       return err;
+}
+#endif /* CONFIG_FREEZE */
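
For context, a hedged sketch of how a freeze-budget consumer might call the new helper (the real call site is not part of this hunk; swap_vp is a stand-in for whatever swap-file vnode the caller holds):

        #if CONFIG_FREEZE
                uint64_t freeze_daily_budget = 0;

                /* Ask the swap volume's device for its daily freeze write budget
                 * via the DKIOCGETMAXSWAPWRITE ioctl wrapped above. */
                if (vm_swap_vol_get_budget(swap_vp, &freeze_daily_budget) == 0) {
                        printf("freeze daily budget: %llu\n", freeze_daily_budget);
                }
        #endif /* CONFIG_FREEZE */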
index df5f607ae13951edf46b0fe9dd269ee21af94f57..d3109c5646d200dd91160196818ea497dd02d7c4 100644 (file)
 #include <security/mac_framework.h>
 #endif
 
+#if CONFIG_CSR
+#include <sys/csr.h>
+#endif /* CONFIG_CSR */
+
 int _shared_region_map_and_slide(struct proc*, int, unsigned int, struct shared_file_mapping_np*, uint32_t, user_addr_t, user_addr_t);
 int shared_region_copyin_mappings(struct proc*, user_addr_t, unsigned int, struct shared_file_mapping_np *);
 
@@ -276,12 +280,6 @@ extern int allow_stack_exec, allow_data_exec;
 SYSCTL_INT(_vm, OID_AUTO, allow_stack_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_stack_exec, 0, "");
 SYSCTL_INT(_vm, OID_AUTO, allow_data_exec, CTLFLAG_RW | CTLFLAG_LOCKED, &allow_data_exec, 0, "");
 
-#if __arm64__
-extern int fourk_binary_compatibility_unsafe;
-extern int fourk_binary_compatibility_allow_wx;
-SYSCTL_INT(_vm, OID_AUTO, fourk_binary_compatibility_unsafe, CTLFLAG_RW | CTLFLAG_LOCKED, &fourk_binary_compatibility_unsafe, 0, "");
-SYSCTL_INT(_vm, OID_AUTO, fourk_binary_compatibility_allow_wx, CTLFLAG_RW | CTLFLAG_LOCKED, &fourk_binary_compatibility_allow_wx, 0, "");
-#endif /* __arm64__ */
 #endif /* DEVELOPMENT || DEBUG */
 
 static const char *prot_values[] = {
@@ -330,7 +328,18 @@ static char scdir_path[] = "/System/Library/Caches/com.apple.dyld/";
 #endif
 
 #ifndef SECURE_KERNEL
-SYSCTL_INT(_vm, OID_AUTO, enforce_shared_cache_dir, CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, "");
+static int sysctl_scdir_enforce SYSCTL_HANDLER_ARGS
+{
+#if CONFIG_CSR
+       if (csr_check(CSR_ALLOW_UNRESTRICTED_FS) != 0) {
+               printf("Failed attempt to set vm.enforce_shared_cache_dir sysctl\n");
+               return EPERM;
+       }
+#endif /* CONFIG_CSR */
+       return sysctl_handle_int(oidp, arg1, arg2, req);
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, enforce_shared_cache_dir, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &scdir_enforce, 0, sysctl_scdir_enforce, "I", "");
 #endif
 
 /* These log rate throttling state variables aren't thread safe, but
@@ -1759,7 +1768,7 @@ _shared_region_map_and_slide(
        }
 
        /* check that the mappings are properly covered by code signatures */
-       if (!cs_enforcement(NULL)) {
+       if (!cs_system_enforcement()) {
                /* code signing is not enforced: no need to check */
        } else for (i = 0; i < mappings_count; i++) {
                if (mappings[i].sfm_init_prot & VM_PROT_ZF) {
@@ -1790,7 +1799,7 @@ _shared_region_map_and_slide(
        }
 
        /* get the process's shared region (setup in vm_map_exec()) */
-       shared_region = vm_shared_region_get(current_task());
+       shared_region = vm_shared_region_trim_and_get(current_task());
        if (shared_region == NULL) {
                SHARED_REGION_TRACE_ERROR(
                        ("shared_region: %p [%d(%s)] map(%p:'%s'): "
@@ -1798,6 +1807,7 @@ _shared_region_map_and_slide(
                         (void *)VM_KERNEL_ADDRPERM(current_thread()),
                         p->p_pid, p->p_comm,
                         (void *)VM_KERNEL_ADDRPERM(vp), vp->v_name));
+               error = EINVAL;
                goto done;
        }
 
@@ -1970,9 +1980,8 @@ extern unsigned int       vm_page_free_target;
 SYSCTL_INT(_vm, OID_AUTO, vm_page_free_target, CTLFLAG_RD | CTLFLAG_LOCKED, 
                   &vm_page_free_target, 0, "Pageout daemon free target");
 
-extern unsigned int    vm_memory_pressure;
 SYSCTL_INT(_vm, OID_AUTO, memory_pressure, CTLFLAG_RD | CTLFLAG_LOCKED,
-          &vm_memory_pressure, 0, "Memory pressure indicator");
+          &vm_pageout_state.vm_memory_pressure, 0, "Memory pressure indicator");
 
 static int
 vm_ctl_page_free_wanted SYSCTL_HANDLER_ARGS
@@ -1995,9 +2004,42 @@ extern unsigned int      vm_page_purgeable_wired_count;
 SYSCTL_INT(_vm, OID_AUTO, page_purgeable_wired_count, CTLFLAG_RD | CTLFLAG_LOCKED,
           &vm_page_purgeable_wired_count, 0, "Wired purgeable page count");
 
-extern unsigned int    vm_pageout_purged_objects;
+#if DEVELOPMENT || DEBUG
+extern uint64_t get_pages_grabbed_count(void);
+
+static int
+pages_grabbed SYSCTL_HANDLER_ARGS
+{
+#pragma unused(arg1, arg2, oidp)
+       uint64_t value = get_pages_grabbed_count();
+       return SYSCTL_OUT(req, &value, sizeof(value));
+}
+
+SYSCTL_PROC(_vm, OID_AUTO, pages_grabbed, CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED,
+           0, 0, &pages_grabbed, "QU", "Total pages grabbed");
+SYSCTL_ULONG(_vm, OID_AUTO, pages_freed, CTLFLAG_RD | CTLFLAG_LOCKED,
+            &vm_pageout_vminfo.vm_page_pages_freed, "Total pages freed");
+
 SYSCTL_INT(_vm, OID_AUTO, pageout_purged_objects, CTLFLAG_RD | CTLFLAG_LOCKED,
-          &vm_pageout_purged_objects, 0, "System purged object count");
+          &vm_pageout_debug.vm_pageout_purged_objects, 0, "System purged object count");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
+
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated) */
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_vminfo.vm_pageout_freed_cleaned, "Cleaned pages freed");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED,
+           &vm_pageout_debug.vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
+#endif
 
 extern int madvise_free_debug;
 SYSCTL_INT(_vm, OID_AUTO, madvise_free_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
@@ -2049,34 +2091,16 @@ SYSCTL_UINT(_vm, OID_AUTO, page_pageable_internal_count, CTLFLAG_RD | CTLFLAG_LO
 SYSCTL_UINT(_vm, OID_AUTO, page_pageable_external_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pageable_external_count, 0, "");
 
 /* pageout counts */
-extern unsigned int vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external, vm_pageout_inactive_clean, vm_pageout_speculative_clean, vm_pageout_inactive_used;
-extern unsigned int vm_pageout_freed_from_inactive_clean, vm_pageout_freed_from_speculative;
-SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_internal, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_dirty_external, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_clean, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_speculative_clean, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_inactive_used, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_inactive_clean, 0, "");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_speculative, 0, "");
-
-extern unsigned int vm_pageout_freed_from_cleaned;
-SYSCTL_UINT(_vm, OID_AUTO, pageout_freed_from_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_freed_from_cleaned, 0, "");
-
-/* counts of pages entering the cleaned queue */
-extern unsigned int vm_pageout_enqueued_cleaned, vm_pageout_enqueued_cleaned_from_inactive_dirty;
-SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned, 0, ""); /* sum of next two */
-SYSCTL_UINT(_vm, OID_AUTO, pageout_enqueued_cleaned_from_inactive_dirty, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_enqueued_cleaned_from_inactive_dirty, 0, "");
-
-/* counts of pages leaving the cleaned queue */
-extern unsigned int vm_pageout_cleaned_reclaimed, vm_pageout_cleaned_reactivated, vm_pageout_cleaned_reference_reactivated, vm_pageout_cleaned_volatile_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated, vm_pageout_cleaned_busy, vm_pageout_cleaned_nolock;
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reclaimed, 0, "Cleaned pages reclaimed");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reactivated, 0, "Cleaned pages reactivated"); /* sum of all reactivated AND busy and nolock (even though those actually get reDEactivated */
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_reference_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_reference_reactivated, 0, "Cleaned pages reference reactivated");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_volatile_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_volatile_reactivated, 0, "Cleaned pages volatile reactivated");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_fault_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_fault_reactivated, 0, "Cleaned pages fault reactivated");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_commit_reactivated, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_commit_reactivated, 0, "Cleaned pages commit reactivated");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_busy, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_busy, 0, "Cleaned pages busy (deactivated)");
-SYSCTL_UINT(_vm, OID_AUTO, pageout_cleaned_nolock, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_cleaned_nolock, 0, "Cleaned pages no-lock (deactivated)");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_clean, 0, "");
+SYSCTL_UINT(_vm, OID_AUTO, pageout_inactive_used, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_state.vm_pageout_inactive_used, 0, "");
+
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_internal, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, "");
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_inactive_dirty_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_inactive_dirty_external, "");
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_speculative_clean, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_external, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_external, "");
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_speculative, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_speculative, "");
+SYSCTL_ULONG(_vm, OID_AUTO, pageout_freed_cleaned, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_vminfo.vm_pageout_freed_cleaned, "");
+
 
 /* counts of pages prefaulted when entering a memory object */
 extern int64_t vm_prefault_nb_pages, vm_prefault_nb_bailout;
@@ -2134,9 +2158,6 @@ SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_failure_dirty, CTLFLAG_RD | CTLFLA
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit, 0, "");
 SYSCTL_UINT(_vm, OID_AUTO, page_secluded_grab_for_iokit_success, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_secluded.grab_for_iokit_success, 0, "");
 
-extern uint64_t vm_pageout_secluded_burst_count;
-SYSCTL_QUAD(_vm, OID_AUTO, pageout_secluded_burst_count, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_pageout_secluded_burst_count, "");
-
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 #include <kern/thread.h>
@@ -2289,10 +2310,12 @@ SYSCTL_ULONG(_vm, OID_AUTO, vm_max_kernel_address, CTLFLAG_RD, (unsigned long *)
 extern uint32_t vm_page_pages;
 SYSCTL_UINT(_vm, OID_AUTO, pages, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_pages, 0, "");
 
+extern uint32_t vm_page_busy_absent_skipped;
+SYSCTL_UINT(_vm, OID_AUTO, page_busy_absent_skipped, CTLFLAG_RD | CTLFLAG_LOCKED, &vm_page_busy_absent_skipped, 0, "");
+
 #if (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG)
-extern int pacified_footprint_suspend;
-int footprint_suspend_allowed = 0;
-SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &footprint_suspend_allowed, 0, "");
+extern int vm_footprint_suspend_allowed;
+SYSCTL_INT(_vm, OID_AUTO, footprint_suspend_allowed, CTLFLAG_RW | CTLFLAG_LOCKED, &vm_footprint_suspend_allowed, 0, "");
 
 extern void pmap_footprint_suspend(vm_map_t map, boolean_t suspend);
 static int
@@ -2309,8 +2332,7 @@ sysctl_vm_footprint_suspend SYSCTL_HANDLER_ARGS
        if (error) {
                return error;
        }
-       if (pacified_footprint_suspend &&
-           !footprint_suspend_allowed) {
+       if (!vm_footprint_suspend_allowed) {
                if (new_value != 0) {
                        /* suspends are not allowed... */
                        return 0;
@@ -2329,3 +2351,46 @@ SYSCTL_PROC(_vm, OID_AUTO, footprint_suspend,
            CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_ANYBODY|CTLFLAG_LOCKED|CTLFLAG_MASKED,
            0, 0, &sysctl_vm_footprint_suspend, "I", "");
 #endif /* (__arm__ || __arm64__) && (DEVELOPMENT || DEBUG) */
+
+extern uint64_t vm_map_corpse_footprint_count;
+extern uint64_t vm_map_corpse_footprint_size_avg;
+extern uint64_t vm_map_corpse_footprint_size_max;
+extern uint64_t vm_map_corpse_footprint_full;
+extern uint64_t vm_map_corpse_footprint_no_buf;
+SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_count,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_count, "");
+SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_avg,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_avg, "");
+SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_size_max,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_size_max, "");
+SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_full,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_full, "");
+SYSCTL_QUAD(_vm, OID_AUTO, corpse_footprint_no_buf,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_map_corpse_footprint_no_buf, "");
+
+#if PMAP_CS
+extern uint64_t vm_cs_defer_to_pmap_cs;
+extern uint64_t vm_cs_defer_to_pmap_cs_not;
+SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_pmap_cs,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_pmap_cs, "");
+SYSCTL_QUAD(_vm, OID_AUTO, cs_defer_to_pmap_cs_not,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &vm_cs_defer_to_pmap_cs_not, "");
+#endif /* PMAP_CS */
+
+extern uint64_t shared_region_pager_copied;
+extern uint64_t shared_region_pager_slid;
+extern uint64_t shared_region_pager_slid_error;
+extern uint64_t shared_region_pager_reclaimed;
+SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_copied,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_copied, "");
+SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid, "");
+SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_slid_error,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_slid_error, "");
+SYSCTL_QUAD(_vm, OID_AUTO, shared_region_pager_reclaimed,
+           CTLFLAG_RD | CTLFLAG_LOCKED, &shared_region_pager_reclaimed, "");
+
+#if MACH_ASSERT
+extern int pmap_ledgers_panic_leeway;
+SYSCTL_INT(_vm, OID_AUTO, pmap_ledgers_panic_leeway, CTLFLAG_RW | CTLFLAG_LOCKED, &pmap_ledgers_panic_leeway, 0, "");
+#endif /* MACH_ASSERT */
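
For illustration only, a small user-space sketch of how the new SYSCTL_PROC for vm.enforce_shared_cache_dir behaves (sysctlbyname is the standard libc interface; the EPERM comes from the csr_check() path added above, and the example assumes the caller is root and SIP is enabled):

        #include <errno.h>
        #include <stdio.h>
        #include <sys/types.h>
        #include <sys/sysctl.h>

        int
        main(void)
        {
                int newval = 0;

                /* Try to turn off shared-cache-dir enforcement; with SIP enabled
                 * the kernel handler rejects the write and errno is EPERM. */
                if (sysctlbyname("vm.enforce_shared_cache_dir", NULL, NULL,
                    &newval, sizeof(newval)) == -1) {
                        perror("vm.enforce_shared_cache_dir");
                }
                return 0;
        }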
index 69dad298104bb6e094d20a828773cd35841e7097..fdbccff33a8b28c8099f0509d1c37928fb484a32 100644 (file)
@@ -126,6 +126,15 @@ vnode_pager_issue_reprioritize_io(struct vnode *devvp, uint64_t blkno, uint32_t
 }
 #endif
 
+void
+vnode_pager_was_dirtied(
+       struct vnode            *vp,
+       vm_object_offset_t      s_offset,
+       vm_object_offset_t      e_offset)
+{
+        cluster_update_state(vp, s_offset, e_offset, TRUE);
+}
+
 uint32_t
 vnode_pager_isinuse(struct vnode *vp)
 {
index 83659ed3c199d3ef4dbc2ff1880baba73fd92b81..934486bb8559b25b0db2f783ae7823a35c04171e 100644 (file)
@@ -474,6 +474,7 @@ _proc_exiting
 _proc_find
 _proc_forcequota
 _proc_is64bit
+_proc_is64bit_data
 _proc_is_classic
 _proc_isinferior
 _proc_issignal
index f4ee08125bae763955daa290117b5e5a76f106c0..ad89576cafab0a1526bfc8b938b10cee6de4e41b 100644 (file)
@@ -307,3 +307,5 @@ __ZNK8IOPMprot12getMetaClassEv
 __ZNK8IOPMprot9MetaClass5allocEv
 __ZTV8IOPMprot
 __ZTVN8IOPMprot9MetaClassE
+
+__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionarylU13block_pointerFbPS_P10IONotifierE
index ed271b62c7d8b1be6eef06e53ca1c4bb430c8329..065a36f0f2df065db674f0831119e8ae05b91f6f 100644 (file)
@@ -228,3 +228,5 @@ __ZNK15IORegistryEntry12copyPropertyEPK8OSSymbolPK15IORegistryPlanej
 __ZNK15IORegistryEntry12copyPropertyEPKcPK15IORegistryPlanej
 __ZNK18IOMemoryDescriptor19dmaCommandOperationEjPvj
 __ZNK25IOGeneralMemoryDescriptor19dmaCommandOperationEjPvj
+
+__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE
index 0f0678d3d5920f4c51adee71367c0ad9dee9c394..c5589237724883e490c37f7e26f2985b4c8637e4 100644 (file)
@@ -886,6 +886,7 @@ __ZN22_IOOpenServiceIteratorD0Ev
 __ZN22_IOOpenServiceIteratorD2Ev
 __ZN23IOMultiMemoryDescriptor10gMetaClassE
 __ZN23IOMultiMemoryDescriptor10superClassE
+__ZN23IOMultiMemoryDescriptor16getPreparationIDEv
 __ZN23IOMultiMemoryDescriptor4freeEv
 __ZN23IOMultiMemoryDescriptor9MetaClassC1Ev
 __ZN23IOMultiMemoryDescriptor9MetaClassC2Ev
@@ -964,6 +965,7 @@ __ZN28IOFilterInterruptEventSource20interruptEventSourceEP8OSObjectPFvS1_P22IOIn
 __ZN28IOFilterInterruptEventSource23normalInterruptOccurredEPvP9IOServicei
 __ZN28IOFilterInterruptEventSource24disableInterruptOccurredEPvP9IOServicei
 __ZN28IOFilterInterruptEventSource26filterInterruptEventSourceEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei
+__ZN28IOFilterInterruptEventSource4freeEv
 __ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEP9IOServicei
 __ZN28IOFilterInterruptEventSource4initEP8OSObjectPFvS1_P22IOInterruptEventSourceiEPFbS1_PS_EP9IOServicei
 __ZN28IOFilterInterruptEventSource9MetaClassC1Ev
@@ -1654,3 +1656,14 @@ __ZTVN14IOReportLegend9MetaClassE
 __ZTVN15IOStateReporter9MetaClassE
 __ZTVN16IOSimpleReporter9MetaClassE
 __ZTVN19IOHistogramReporter9MetaClassE
+__ZN10IOWorkLoop14runActionBlockEU13block_pointerFivE
+__ZN13IOCommandGate14runActionBlockEU13block_pointerFivE
+__ZN13IOEventSource14setActionBlockEU13block_pointerFivE
+__ZN18IOTimerEventSource16timerEventSourceEjP8OSObjectU13block_pointerFvPS_E
+__ZN22IOInterruptEventSource20interruptEventSourceEP8OSObjectP9IOServiceiU13block_pointerFvPS_iE
+__ZN28IOFilterInterruptEventSource26filterInterruptEventSourceEP8OSObjectP9IOServiceiU13block_pointerFvP22IOInterruptEventSourceiEU13block_pointerFbPS_E
+__ZN9IOService16registerInterestEPK8OSSymbolU13block_pointerFijPS_PvmE
+__ZN9IOService22registerInterruptBlockEiP8OSObjectU13block_pointerFvPS_iE
+__ZNK13IOEventSource14getActionBlockEU13block_pointerFivE
+__ZN13IOEventSource9setRefconEPv
+__ZNK13IOEventSource9getRefconEv
index 1f7734ca8ec605600269b2e8a4b0add69aac9511..d53a169a51a23489fda2a53ea52fbb5c57be255d 100644 (file)
@@ -499,3 +499,5 @@ __ZTV8IOSyncer
 __ZTVN8IOSyncer9MetaClassE
 _ev_try_lock
 _ev_unlock
+
+__ZN9IOService23addMatchingNotificationEPK8OSSymbolP12OSDictionaryiU13block_pointerFbPS_P10IONotifierE
index 051d0d07ba3ea8d98b7820a6f2992f2437058dfe..ab47a9396799cf5d2ab83def98d81a030ab16e5b 100644 (file)
@@ -1,4 +1,5 @@
 _OSAddAtomic64
 _OSCompareAndSwap64
+__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE
 __ZN12OSOrderedSet12withCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_
 __ZN12OSOrderedSet16initWithCapacityEjPFlPK15OSMetaClassBaseS2_PvES3_
index cc07f5dc2f4a10bc38541975aadd1692340cab4b..40f33219b2843ba86052d0241c67a3304d99d8c9 100644 (file)
@@ -1,5 +1,6 @@
 _OSAddAtomic64
 _OSCompareAndSwap64
 _PAGE_SHIFT_CONST
+__ZN15OSMetaClassBase9_ptmf2ptfEPKS_MS_FvvE
 __ZN12OSOrderedSet12withCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_
 __ZN12OSOrderedSet16initWithCapacityEjPFiPK15OSMetaClassBaseS2_PvES3_
index 1737075047b82d20c5bba9f2596b5e962ae5a14a..d0c0a5554bee7e122787f93bfc881b3fd7df1fb1 100644 (file)
@@ -1,4 +1,5 @@
 _Assert
+_img4_interface_register
 _MD5Final
 _MD5Init
 _MD5Update
@@ -654,6 +655,7 @@ _kern_os_free
 _kern_os_malloc
 _kern_os_realloc
 _kext_assertions_enable
+_kmod_info:_invalid_kmod_info
 _kprintf
 _lck_attr_alloc_init
 _lck_attr_free
@@ -708,6 +710,12 @@ _os_log_debug_enabled
 _os_log_info_enabled
 _os_release
 _os_retain
+_os_ref_init_count
+_os_ref_retain
+_os_ref_release_explicit
+_os_ref_retain_try
+_os_ref_retain_locked
+_os_ref_release_locked
 _osrelease
 _ostype
 _page_mask
@@ -761,3 +769,15 @@ _vsnprintf
 _vsscanf
 _zError
 _zlibVersion
+
+__Block_copy
+__Block_release
+__NSConcreteAutoBlock
+__NSConcreteFinalizingBlock
+__NSConcreteGlobalBlock
+__NSConcreteMallocBlock
+__NSConcreteStackBlock
+__NSConcreteWeakBlockVariable
+__ZN12OSCollection14iterateObjectsEU13block_pointerFbP8OSObjectE
+__ZN12OSDictionary14iterateObjectsEU13block_pointerFbPK8OSSymbolP8OSObjectE
+__ZN12OSSerializer9withBlockEU13block_pointerFbP11OSSerializeE
index 67b209861e8895ad6478f9830d44d802418aa24c..e594b265f3865253529339cd5003441264d528ae 100644 (file)
@@ -12,10 +12,6 @@ _mac_label_set
 _mac_audit_text
 
 _mac_iokit_check_hid_control
-_mac_iokit_check_nvram_delete
-_mac_iokit_check_nvram_get
-_mac_iokit_check_nvram_set
-
 _mac_vnode_check_trigger_resolve
 
 _sbuf_cat
index d4561e50dbfaf1b9506bd3324ed1b4dfbf632f8f..b3c36794afa2f4453002ed0c31d5126838606a35 100644 (file)
@@ -1,7 +1,7 @@
 #
 # Mach Operating System
 # Copyright (c) 1986 Carnegie-Mellon University
-# Copyright 2001-2014 Apple Inc.
+# Copyright 2001-2018 Apple Inc.
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
@@ -116,6 +116,7 @@ options             CONFIG_IMAGEBOOT        # local image boot      # <config_imageboot>
 options                CONFIG_MBUF_JUMBO       # jumbo cluster pool    # <config_mbuf_jumbo>
 
 options                CONFIG_WORKQUEUE        # <config_workqueue>
+options                CONFIG_WORKLOOP_DEBUG   # <config_workloop_debug>
 
 #
 #      4.4 filesystems 
@@ -141,6 +142,7 @@ options             CONFIG_TRIGGERS # trigger vnodes                # <config_triggers>
 options                CONFIG_EXT_RESOLVER # e.g. memberd              # <config_ext_resolver>
 options                CONFIG_SEARCHFS # searchfs syscall support      # <config_searchfs>
 options                CONFIG_MNT_SUID # allow suid binaries  # <config_mnt_suid>
+options                CONFIG_MNT_ROOTSNAP # allow rooting from snapshot # <config_mnt_rootsnap>
 
 #
 # NFS support
@@ -172,6 +174,8 @@ options                     CRYPTO                          # <ipsec,crypto>
 options                        CRYPTO_SHA2                     # <crypto_sha2>
 options                        ENCRYPTED_SWAP                  # <encrypted_swap>
 
+options                        CONFIG_IMG4                     # <config_img4>
+
 options                ZLIB    # inflate/deflate support       # <zlib>
 
 options                IF_BRIDGE                               # <if_bridge>
@@ -307,6 +311,12 @@ options   CONFIG_NO_KPRINTF_STRINGS                # <no_kprintf_str>
 #
 options   CONFIG_FINE_LOCK_GROUPS               # <medium,large,xlarge>
 
+#
+# configurable kernel - general switch to say we are building for an
+# embedded device
+#
+options   CONFIG_EMBEDDED                      # <config_embedded>
+
 
 # support dynamic signing of code
 #
@@ -326,6 +336,9 @@ options             CONFIG_CODE_DECRYPTION          # <config_code_decryption>
 #
 options                CONFIG_PROTECT                  # <config_protect>
 
+#allow write-protection of key page
+options                CONFIG_KEYPAGE_WP               # <config_keypage_wp>
+
 #
 # enable per-process memory priority tracking
 #
@@ -371,6 +384,11 @@ options            CONFIG_SECLUDED_MEMORY          # <config_secluded_memory>
 
 options                CONFIG_BACKGROUND_QUEUE         # <config_background_queue>
 
+#
+# Ledger features
+#
+options                CONFIG_LEDGER_INTERVAL_MAX      # <config_ledger_interval_max>
+
 #
 # I/O Scheduling
 #
@@ -477,6 +495,7 @@ options         NO_KERNEL_HID                       # <no_kernel_hid>
 #
 
 options                LIBKERNCPP              # C++ implementation    # <libkerncpp>
+options                CONFIG_BLOCKS           # Blocks runtime        # <config_blocks>
 options                CONFIG_KXLD             # kxld/runtime linking of kexts # <config_kxld>
 options                CONFIG_KEC_FIPS         # Kernel External Components for FIPS compliance (KEC_FIPS) # <config_kec_fips>
 
@@ -554,7 +573,7 @@ options             MACH_MP_DEBUG   #                               # <debug>
 #      operations on each element.
 #
 options                ZONE_DEBUG      #               # <debug>
-
+options                CONFIG_ZCACHE   #Enable per-cpu caching for zones       # <config_zcache>
 options                CONFIG_ZLEAKS   # Live zone leak debugging      # <zleaks>
 
 #
@@ -740,3 +759,6 @@ options             COPYOUT_SHIM                    # Shim for copyout memory analysis via kext #<copyout_sh
 # Telemetry for 32-bit process launch
 #
 options                CONFIG_32BIT_TELEMETRY # # <config_32bit_telemetry>
+
+options                CONFIG_QUIESCE_COUNTER # Support for _COMM_PAGE_CPU_QUIESCENT_COUNTER # <config_quiesce_counter>
+
index 8deb4e4459e04762e893588e73f11685571bb3de..d463ad18982e88ec981b6ba48a06659d0f3c9a82 100644 (file)
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm xsmall config_embedded ]
+#  KERNEL_BASE =    [ arm xsmall config_embedded config_enforce_signed_code config_zcache ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
-#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug ]
+#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
 #  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
 #  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  FILESYS_BASE =   [ devfs fifo fs_compression config_protect config_fse routefs quota namedstreams ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
@@ -37,7 +37,7 @@
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iokitstats iotracking ]
 #  IOKIT_DEBUG =    [ IOKIT_BASE iokitstats iotracking ]
-#  LIBKERN_BASE =   [ libkerncpp config_kec_fips zlib crypto_sha2 ]
+#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 config_img4 ]
 #  LIBKERN_RELEASE =[ LIBKERN_BASE ]
 #  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
 #  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose ]
+#  MACH_BASE =      [ mach slidable vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_library_validation config_iosched config_telemetry config_sysdiagnose config_quiesce_counter ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
-#  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace ]
-#  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ]
+#  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
+#  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
 #  SCHED_BASE =     [ config_sched_traditional config_sched_multiq ]
 #  SCHED_RELEASE =  [ SCHED_BASE ]
 #  SCHED_DEV =      [ SCHED_BASE ]
index a6636b77362dbf48f584951931d1b4b6ca7afe3d..32189c5fdf2882b3b648e0a91c0c9c7f51307453 100644 (file)
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ arm64 xsmall config_embedded config_requires_u32_munging ]
+#  KERNEL_BASE =    [ arm64 xsmall config_embedded config_enforce_signed_code config_requires_u32_munging config_zcache ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
-#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug pgtrace ]
+#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
 #  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ]
 #  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
 #  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
 #  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
-#  FILESYS_BASE =   [ devfs fifo fs_compression config_protect config_fse routefs quota namedstreams ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
@@ -37,7 +37,7 @@
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iokitstats iotracking ]
 #  IOKIT_DEBUG =    [ IOKIT_BASE iokitstats iotracking]
-#  LIBKERN_BASE =   [ libkerncpp config_kec_fips zlib crypto_sha2 ]
+#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kec_fips zlib crypto_sha2 config_img4 ]
 #  LIBKERN_RELEASE =[ LIBKERN_BASE ]
 #  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
 #  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
 #  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
 #  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
 #  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
-#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time]
+#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ]
 #  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
-#  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace ]
-#  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ]
+#  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max ]
+#  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace config_ledger_interval_max importance_debug ]
 #  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ]
 #  SCHED_RELEASE =  [ SCHED_BASE ]
 #  SCHED_DEV =      [ SCHED_BASE ]
diff --git a/config/MASTER.arm64.bcm2837 b/config/MASTER.arm64.bcm2837
new file mode 100644 (file)
index 0000000..65dd486
--- /dev/null
@@ -0,0 +1,88 @@
+#
+# Mach Operating System
+# Copyright (c) 1986 Carnegie-Mellon University
+# Copyright 2001-2016 Apple Inc.
+#
+# All rights reserved.  The CMU software License Agreement
+# specifies the terms and conditions for use and redistribution.
+#  
+######################################################################
+#
+#  Master Apple configuration file (see the master machine independent
+#  configuration file for a description of the file format).
+#
+######################################################################
+#  
+#  Standard Apple OS Configurations:
+#  -------- ----- -- ---------------
+#
+#  KERNEL_BASE =    [ arm64 xsmall config_embedded config_requires_u32_munging config_zcache ]
+#  KERNEL_RELEASE = [ KERNEL_BASE ]
+#  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug pgtrace ]
+#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug pgtrace ]
+#  BSD_BASE =       [ mach_bsd config_workqueue psynch config_proc_uuid_policy config_personas ]
+#  BSD_RELEASE =    [ BSD_BASE no_printf_str no_kprintf_str secure_kernel ]
+#  BSD_DEV =        [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
+#  BSD_DEBUG =      [ BSD_BASE config_imageboot config_coredump pgo config_vnguard ]
+#  FILESYS_BASE =   [ devfs fifo fs_compression config_mnt_rootsnap config_protect config_fse routefs quota namedstreams ]
+#  FILESYS_RELEASE= [ FILESYS_BASE ]
+#  FILESYS_DEV =    [ FILESYS_BASE fdesc ]
+#  FILESYS_DEBUG =  [ FILESYS_BASE fdesc ]
+#  NFS =            [ nfsclient nfsserver ]
+#  NETWORKING =     [ inet tcpdrop_synfin bpfilter inet6 ipv6send if_bridge traffic_mgt dummynet ah_all_crypto packet_mangler if_fake ]
+#  VPN =            [ ipsec flow_divert necp content_filter ]
+#  PF =             [ pf ]
+#  MULTIPATH =      [ multipath mptcp ]
+#  IOKIT_BASE =     [ iokit iokitcpp no_kextd no_kernel_hid config_sleep ]
+#  IOKIT_RELEASE =  [ IOKIT_BASE ]
+#  IOKIT_DEV =      [ IOKIT_BASE iokitstats iotracking ]
+#  IOKIT_DEBUG =    [ IOKIT_BASE iokitstats iotracking]
+#  LIBKERN_BASE =   [ libkerncpp config_kec_fips zlib crypto_sha2 ]
+#  LIBKERN_RELEASE =[ LIBKERN_BASE ]
+#  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
+#  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
+#  PERF_DBG_BASE =  [ mach_kdp config_serial_kdp kperf kpc ]
+#  PERF_DBG_RELEASE=[ PERF_DBG_BASE ist_kdebug ]
+#  PERF_DBG_DEV =   [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
+#  PERF_DBG_DEBUG = [ PERF_DBG_BASE config_dtrace zleaks kdp_interactive_debugging alternate_debugger interrupt_masked_debug ]
+#  MACH_BASE =      [ mach slidable config_ecc_logging vc_progress_white mdebug ipc_debug importance_inheritance config_atm config_coalitions config_iosched config_library_validation config_sysdiagnose config_telemetry config_mach_bridge_recv_time config_quiesce_counter ]
+#  MACH_RELEASE =   [ MACH_BASE config_skip_precise_user_kernel_time debugger_for_zone_info ]
+#  MACH_DEV =       [ MACH_BASE task_zone_info config_io_accounting importance_trace ]
+#  MACH_DEBUG =     [ MACH_BASE task_zone_info config_io_accounting importance_trace importance_debug ]
+#  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_sched_deferred_ast ]
+#  SCHED_RELEASE =  [ SCHED_BASE ]
+#  SCHED_DEV =      [ SCHED_BASE ]
+#  SCHED_DEBUG =    [ SCHED_BASE config_sched_grrr config_sched_proto ]
+#  VM_BASE =        [ vm_pressure_events jetsam freeze memorystatus config_code_decryption phantom_cache config_secluded_memory config_background_queue config_cs_validation_bitmap]
+#  VM_RELEASE =     [ VM_BASE ]
+#  VM_DEV =         [ VM_BASE dynamic_codesigning ]
+#  VM_DEBUG =       [ VM_BASE dynamic_codesigning ]
+#  SECURITY =       [ config_macf kernel_integrity ]
+#  RELEASE =        [ KERNEL_RELEASE BSD_RELEASE FILESYS_RELEASE SKYWALK_RELEASE NETWORKING PF MULTIPATH VPN IOKIT_RELEASE LIBKERN_RELEASE PERF_DBG_RELEASE MACH_RELEASE SCHED_RELEASE VM_RELEASE SECURITY ]
+#  DEVELOPMENT =    [ KERNEL_DEV     BSD_DEV     FILESYS_DEV NFS SKYWALK_DEV     NETWORKING PF MULTIPATH VPN IOKIT_DEV     LIBKERN_DEV     PERF_DBG_DEV     MACH_DEV     SCHED_DEV     VM_DEV     SECURITY ]
+#  DEBUG =          [ KERNEL_DEBUG   BSD_DEBUG   FILESYS_DEBUG   SKYWALK_DEBUG   NETWORKING PF MULTIPATH VPN IOKIT_DEBUG   LIBKERN_DEBUG   PERF_DBG_DEBUG   MACH_DEBUG   SCHED_DEBUG   VM_DEBUG   SECURITY ]
+#  KASAN =          [ DEVELOPMENT ]
+#
+######################################################################
+#
+machine                "arm64"                                         # <arm64>
+
+makeoptions    OSFMK_MACHINE = "arm64"                         # <mach>
+
+options                COUNT_SYSCALLS          # count bsd system calls        # <countcalls>
+options     TRASH_VFP_ON_SAVE   # <debug,trash_vfp>
+options                ALTERNATE_DEBUGGER      # <alternate_debugger>
+
+options   CONFIG_VNODES=1024           # <xsmall>
+
+options   CONFIG_FREEZE_SUSPENDED_MIN=4                # <xsmall>
+
+options          CONFIG_MACH_APPROXIMATE_TIME
+
+options   CONFIG_KERNEL_INTEGRITY              # <kernel_integrity>
+
+options   INTERRUPT_MASKED_DEBUG=1                     #      # <interrupt_masked_debug>
+
+options CONFIG_PGTRACE                                      # <pgtrace>
+options CONFIG_PGTRACE_NONKEXT                              # <pgtrace_nonkext>
+pseudo-device   pgtrace     1   init    pgtrace_dev_init    # <pgtrace_nonkext>
index 1a934777de2cc366f4d95f25cb5289ea9fd8d984..b14a338d615cfa9cc1c80450527bf7e39b0287cb 100644 (file)
@@ -1,7 +1,7 @@
 #
 # Mach Operating System
 # Copyright (c) 1986 Carnegie-Mellon University
-# Copyright 2001-2016 Apple Inc.
+# Copyright 2001-2018 Apple Inc.
 #
 # All rights reserved.  The CMU software License Agreement
 # specifies the terms and conditions for use and redistribution.
 #  Standard Apple OS Configurations:
 #  -------- ----- -- ---------------
 #
-#  KERNEL_BASE =    [ intel medium config_requires_u32_munging ]
+#  KERNEL_BASE =    [ intel medium config_requires_u32_munging config_zcache ]
 #  KERNEL_RELEASE = [ KERNEL_BASE ]
 #  KERNEL_DEV =     [ KERNEL_BASE development mach_assert config_xnupost proc_ref_debug os_reason_debug ]
-#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_waitq_debug ]
+#  KERNEL_DEBUG =   [ KERNEL_BASE debug mach_assert config_xnupost config_ltable_stats config_ltable_debug config_waitq_stats config_workloop_debug config_waitq_debug ]
 #  BSD_BASE =       [ mach_bsd sysv_sem sysv_msg sysv_shm config_imageboot config_workqueue psynch config_proc_uuid_policy config_coredump pgo config_32bit_telemetry ]
 #  BSD_RELEASE =    [ BSD_BASE ]
 #  BSD_DEV =        [ BSD_BASE config_vnguard ]
 #  BSD_DEBUG =      [ BSD_BASE config_vnguard ]
-#  FILESYS_BASE =   [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid ]
+#  FILESYS_BASE =   [ devfs fdesc config_dev_kmem config_fse quota namedstreams config_mnt_rootsnap config_keypage_wp config_protect fifo config_volfs fs_compression config_imgsrc_access config_triggers config_ext_resolver config_searchfs config_appledouble nullfs config_mnt_suid ]
 #  FILESYS_RELEASE= [ FILESYS_BASE ]
 #  FILESYS_DEV =    [ FILESYS_BASE ]
 #  FILESYS_DEBUG =  [ FILESYS_BASE ]
 #  IOKIT_RELEASE =  [ IOKIT_BASE ]
 #  IOKIT_DEV =      [ IOKIT_BASE iotracking ]
 #  IOKIT_DEBUG =    [ IOKIT_BASE iotracking ]
-#  LIBKERN_BASE =   [ libkerncpp config_kxld config_kec_fips zlib crypto_sha2 ]
+#  LIBKERN_BASE =   [ libkerncpp config_blocks config_kxld config_kec_fips zlib crypto_sha2 config_img4 ]
 #  LIBKERN_RELEASE =[ LIBKERN_BASE ]
 #  LIBKERN_DEV =    [ LIBKERN_BASE iotracking ]
 #  LIBKERN_DEBUG =  [ LIBKERN_BASE iotracking ]
 #  PERF_DBG =       [ config_dtrace mach_kdp config_serial_kdp kdp_interactive_debugging kperf kpc zleaks config_gzalloc MONOTONIC_BASE ]
 #  MACH_BASE =      [ mach config_kext_basement mdebug ipc_debug config_mca config_vmx config_mtrr config_lapic config_telemetry importance_inheritance config_atm config_coalitions hypervisor config_iosched config_sysdiagnose config_mach_bridge_send_time copyout_shim ]
 #  MACH_RELEASE =   [ MACH_BASE ]
-#  MACH_DEV =       [ MACH_BASE task_zone_info importance_trace ]
-#  MACH_DEBUG =     [ MACH_BASE task_zone_info importance_trace importance_debug ]
+#  MACH_DEV =       [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max ]
+#  MACH_DEBUG =     [ MACH_BASE task_zone_info importance_trace config_ledger_interval_max importance_debug ]
 #  SCHED_BASE =     [ config_sched_traditional config_sched_multiq config_sched_sfi ]
 #  SCHED_RELEASE =  [ SCHED_BASE ]
 #  SCHED_DEV =      [ SCHED_BASE ]
index 36b16f2596e57f3e6839cc5dba86083daaeebd49..da46458ffa7dc3563c7438264e0a371b2f98cf42 100644 (file)
@@ -50,6 +50,10 @@ EXPORTS_FILES = $(foreach symbolset,$(SYMBOL_COMPONENT_LIST),$(symbolset).export
 
 SYMBOL_SET_BUILD = $(foreach symbolset, $(SYMBOL_COMPONENT_LIST), $(OBJPATH)/$(symbolset).symbolset)
 
+ifeq ($(KASAN),1)
+KASAN_EXPORTS = $(SRCROOT)/san/Kasan_kasan.exports
+endif
+
 $(OBJPATH)/allsymbols: $(OBJPATH)/$(KERNEL_FILE_NAME)
        $(_v)$(NM) -gj $< > $@
 
@@ -140,9 +144,12 @@ endif
 
 
 $(OBJPATH)/all-kpi.exp: $(EXPORTS_FILES)
-       $(_v)$(SOURCE)/generate_linker_exports.sh $@ $+
+       $(_v)$(SOURCE)/generate_linker_exports.sh $@ $+ $(KASAN_EXPORTS)
+
+$(OBJPATH)/all-alias.exp: $(EXPORTS_FILES)
+       $(_v)$(SOURCE)/generate_linker_aliases.sh $@ $+ $(KASAN_EXPORTS)
 
-do_build_all:: $(OBJPATH)/all-kpi.exp
+do_build_all:: $(OBJPATH)/all-kpi.exp $(OBJPATH)/all-alias.exp
 
 include $(MakeInc_rule)
 include $(MakeInc_dir)
index b857211fe8f7f21b5b2532f92d75b368cb639225..68608495c0c29d9c8f8274f1dcb176e6b97a93d5 100644 (file)
@@ -1,4 +1,4 @@
-17.7.0
+18.2.0
 
 # The first line of this file contains the master version number for the kernel.
 # All other instances of the kernel version in xnu are derived from this file.
index 0b393134f5b34694b9fbf9d4c2514d7703854e74..36db1f3f74266a4f66297653053e28f8678d1aeb 100644 (file)
@@ -14,8 +14,10 @@ _cpu_broadcast_xcall
 _cpu_xcall
 _cpu_number
 _enable_kernel_vfp_context
+_get_preemption_level
 _PE_consistent_debug_register
 _ml_static_ptovirt
 _ml_static_mfree
 _sched_perfcontrol_register_callbacks
 _sched_perfcontrol_update_recommended_cores
+_PE_panic_debugging_enabled
index 3ac76793908b0932c0fc4849ee7f447511aa72a8..a9e2160a276a1bdc268fc45a3f2469e67ac32959 100644 (file)
@@ -7,6 +7,8 @@ _PE_mark_hwaccess
 _PE_smc_stashed_x86_system_state
 _PE_smc_stashed_x86_power_state
 _PE_smc_stashed_x86_efi_boot_state
+_PE_smc_stashed_x86_shutdown_cause
+_PE_smc_stashed_x86_prev_power_transitions
 _PE_pcie_stashed_link_state
 __ZN17IONVRAMController*
 __ZTV17IONVRAMController
@@ -16,6 +18,7 @@ _cpu_cluster_id
 _cpu_number
 _cpu_qos_update_register
 _ecc_log_record_event
+_get_preemption_level
 _ml_arm_sleep
 _ml_get_abstime_offset
 _ml_get_conttime_offset
@@ -36,3 +39,4 @@ _pgtrace_stop
 _pgtrace_active
 _pgtrace_add_probe
 _pgtrace_clear_probe
+_PE_panic_debugging_enabled
index 5630d2a5082096b53aeabc95df73ee3489db60f2..3e655ff1030c6083756a1fbc2ebb3470566868a4 100644 (file)
@@ -1,4 +1,6 @@
 _PE_i_can_has_debugger
+__ZN15IORegistryEntry18setIndexedPropertyEjP8OSObject
+__ZNK15IORegistryEntry18getIndexedPropertyEj
 __ZN16IOPlatformExpert*
 __ZNK16IOPlatformExpert*
 __ZTV16IOPlatformExpert
@@ -88,14 +90,21 @@ _cpx_sizex
 _cpx_use_offset_for_iv
 _cpx_synthetic_offset_for_iv
 _cpx_writeprotect
+_cs_blob_create_validated
+_cs_blob_free
 _cs_blob_reset_cache
 _cs_debug
-_cs_enforcement
 _cs_entitlement_flags
 _cs_entitlements_blob_get
+_cs_debug_fail_on_unsigned_code
+_cs_debug_unsigned_exec_failures
+_cs_debug_unsigned_mmap_failures
 _cs_get_cdhash
 _cs_identity_get
+_cs_process_enforcement
+_cs_process_global_enforcement
 _cs_require_lv
+_cs_system_enforcement
 _cs_system_require_lv
 _cs_restricted
 _cs_valid
@@ -196,6 +205,7 @@ _ifnet_tx_compl_status
 _ifnet_get_unsent_bytes
 _ifnet_get_buffer_status
 _ifnet_normalise_unsent_data
+_ifnet_set_low_power_mode
 _in6_localaddr
 _in6addr_local
 _in_localaddr
@@ -237,8 +247,6 @@ _kern_stack_snapshot_with_reason
 _kernel_debug_string
 _kevent_id_internal
 _kevent_qos_internal
-_kevent_qos_internal_bind
-_kevent_qos_internal_unbind
 _kmem_alloc_kobject:_kmem_alloc_kobject_external
 _kmem_alloc_pageable:_kmem_alloc_pageable_external
 _kx_qsort
@@ -330,8 +338,8 @@ _pffindproto:_pffindproto_old
 _port_name_to_task
 _port_name_to_thread
 _post_sys_powersource
-_prng_factory_register
 _proc_getexecutablevnode
+_proc_issetugid
 _proc_pidbackgrounded
 _proc_pidversion
 _proc_set_responsible_pid
@@ -355,6 +363,7 @@ _pru_sockaddr_notsupp
 _pru_sopoll_notsupp
 _pthread_kext_register
 _q_to_b
+_register_and_init_prng
 _register_crypto_functions
 _register_decmpfs_decompressor
 _rootdev
@@ -484,6 +493,8 @@ _vnode_istty
 _vnode_lookup_continue_needed
 _vnode_clearnoflush
 _vnode_isnoflush
+_vnode_getbackingvnode
+_vnode_setasnamedstream
 _vnop_compound_mkdir_desc
 _vnop_compound_open_desc
 _vnop_compound_remove_desc
@@ -600,3 +611,15 @@ _zone_change
 _fs_buffer_cache_gc_register
 _fs_buffer_cache_gc_unregister
 _cp_key_store_action_for_volume
+
+_Block_size
+__Block_extended_layout
+__Block_has_signature
+__Block_isDeallocating
+__Block_layout
+__Block_object_assign
+__Block_object_dispose
+__Block_signature
+__Block_tryRetain
+__Block_use_RR2
+__Block_use_stret
index 52902c403eb5f7da46005f5b1d097a569c81a6db..0ad58ec1a9505a7cc235267291816e20a9c3ceec 100644 (file)
@@ -61,3 +61,10 @@ _register_copyout_shim
 _getsegdatafromheader
 _getsegbynamefromheader
 __mh_execute_header
+
+#macOS only codesigning kpi
+_csproc_disable_enforcement
+_csproc_mark_invalid_allowed
+_csproc_check_invalid_allowed
+_csproc_hardened_runtime
+_csproc_forced_lv
index 883f244804a3607c98fc9f330b08ab8f05056059..3687a214769241ee63418228f3bb938a6c2de150 100644 (file)
@@ -4,6 +4,7 @@ __ZN9IODTNVRAM19convertPropToObjectEPhjS0_jPPK8OSSymbolPP8OSObject
 __ZN9IODTNVRAM19searchNVRAMPropertyEP17IONVRAMDescriptorPj
 __ZN9IODTNVRAM19unescapeBytesToDataEPKhj
 _bsd_set_dependency_capable
+__get_commpage_priv_address
 _kdp_register_callout
 _kdp_set_ip_and_mac_addresses
 _logwakeup
diff --git a/config/generate_linker_aliases.sh b/config/generate_linker_aliases.sh
new file mode 100755 (executable)
index 0000000..45c7700
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+set -e
+
+if [ $# -lt 2 ]; then
+    echo "Usage: $0 output.exp input1 [input2 ... ]" 1>&2
+    exit 1
+fi
+
+OUTPUT="$1"
+shift
+
+( grep -h ":" "$@" | awk -F: '{print $2 "  " $1}' ) | sort -u > "$OUTPUT"
+
+exit 0
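
For reference, the colon syntax this script consumes already appears in the exports lists in this commit; for example the entry

        _kmod_info:_invalid_kmod_info

is rewritten by the awk above (which swaps the two fields) into the following line of all-alias.exp:

        _invalid_kmod_info  _kmod_info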
index f093b3378e14bffd0dea991b384d9034fe7b2488..8d670049ea94e3718240fe9ce748ab75d941606c 100755 (executable)
@@ -14,6 +14,7 @@
 #   ###KERNEL_VERSION_REVISION###           3
 #   ###KERNEL_VERSION_STAGE###              VERSION_STAGE_BETA (see libkern/version.h)
 #   ###KERNEL_VERSION_PRERELEASE_LEVEL###   4
+#   ###KERNEL_BUILD_CONFIG###               development
 #   ###KERNEL_BUILDER###                    root
 #   ###KERNEL_BUILD_OBJROOT###              xnu/xnu-690.obj~2/RELEASE_PPC
 #   ###KERNEL_BUILD_DATE###                 Sun Oct 24 05:33:28 PDT 2004
@@ -56,6 +57,8 @@ my $BUILD_OBJPATH=$ENV{'TARGET'} || $ENV{'OBJROOT'};
 $BUILD_OBJPATH =~ s,/+$,,;
 my $BUILD_DATE = `date`;
 $BUILD_DATE =~ s/[\n\t]//g;
+my $BUILD_CONFIG = "unknown";
+$BUILD_CONFIG = $ENV{'CURRENT_KERNEL_CONFIG_LC'} if defined($ENV{'CURRENT_KERNEL_CONFIG_LC'});
 my $BUILDER=`whoami`;
 $BUILDER =~ s/[\n\t]//g;
 my $RC_STRING = $ENV{'RC_ProjectNameAndSourceVersion'} . "~" . $ENV{'RC_ProjectBuildVersion'} if defined($ENV{'RC_XBS'});
@@ -166,6 +169,7 @@ foreach $file (@ARGV) {
   $count += $data =~ s/###KERNEL_VERSION_REVISION###/$VERSION_REVISION/g;
   $count += $data =~ s/###KERNEL_VERSION_STAGE###/$VERSION_STAGE/g;
   $count += $data =~ s/###KERNEL_VERSION_PRERELEASE_LEVEL###/$VERSION_PRERELEASE_LEVEL/g;
+  $count += $data =~ s/###KERNEL_BUILD_CONFIG###/$BUILD_CONFIG/g;
   $count += $data =~ s/###KERNEL_BUILDER###/$BUILDER/g;
   $count += $data =~ s/###KERNEL_BUILD_OBJROOT###/$BUILD_OBJROOT/g;
   $count += $data =~ s/###KERNEL_BUILD_DATE###/$BUILD_DATE/g;
@@ -183,6 +187,7 @@ if (0==scalar @ARGV) {
   print "newvers.pl: ###KERNEL_VERSION_REVISION### = $VERSION_REVISION\n";
   print "newvers.pl: ###KERNEL_VERSION_STAGE### = $VERSION_STAGE\n";
   print "newvers.pl: ###KERNEL_VERSION_PRERELEASE_LEVEL### = $VERSION_PRERELEASE_LEVEL\n";
+  print "newvers.pl: ###KERNEL_BUILD_CONFIG### = $BUILD_CONFIG\n";
   print "newvers.pl: ###KERNEL_BUILDER### = $BUILDER\n";
   print "newvers.pl: ###KERNEL_BUILD_OBJROOT### = $BUILD_OBJROOT\n";
   print "newvers.pl: ###KERNEL_BUILD_DATE### = $BUILD_DATE\n";
index 4870d134c96e5e395c21129272ba3be8dd1ab1c0..894ed9468dcd94c97c4e0ff9e6a6620c7a0c5f75 100644 (file)
@@ -35,6 +35,8 @@
 
 #include <libkern/version.h>
 
+// for what(1):
+const char __kernelVersionString[] __attribute__((used)) = "@(#)VERSION: " OSTYPE " Kernel Version ###KERNEL_VERSION_LONG###: ###KERNEL_BUILD_DATE###; ###KERNEL_BUILDER###:###KERNEL_BUILD_OBJROOT###";
 const char version[] = OSTYPE " Kernel Version ###KERNEL_VERSION_LONG###: ###KERNEL_BUILD_DATE###; ###KERNEL_BUILDER###:###KERNEL_BUILD_OBJROOT###";
 const int  version_major = VERSION_MAJOR;
 const int  version_minor = VERSION_MINOR;
@@ -42,6 +44,7 @@ const int  version_revision = VERSION_REVISION;
 const int  version_stage = VERSION_STAGE;
 const int  version_prerelease_level = VERSION_PRERELEASE_LEVEL;
 const char version_variant[] = VERSION_VARIANT;
+const char osbuild_config[] = "###KERNEL_BUILD_CONFIG###";
 const char osbuilder[] = "###KERNEL_BUILDER###";
 const char osrelease[] = OSRELEASE;
 const char ostype[] = OSTYPE;
index 505e23efd48e3f286030417da178228e075547f7..b72a4e8f52364e53a3e5a42628b943fa458370bd 100644 (file)
@@ -63,6 +63,7 @@ extern void IOBSDMountChange(struct mount * mp, uint32_t op);
 extern boolean_t IOTaskHasEntitlement(task_t task, const char * entitlement);
 
 extern struct IOPolledFileIOVars * gIOPolledCoreFileVars;
+extern kern_return_t gIOPolledCoreFileOpenRet;
 
 #ifdef __cplusplus
 }
index 4bc13d299b95d45dac9a7117a85e05f3db9ff3cb..431f179d46507dd46fae4d76260c600008b13ac0 100644 (file)
@@ -154,6 +154,21 @@ work loop event sources.  If the command is disabled the attempt to run a comman
                               void *arg0 = 0, void *arg1 = 0,
                               void *arg2 = 0, void *arg3 = 0);
 
+#ifdef __BLOCKS__
+/*! @function runActionBlock
+    @abstract Single thread a call to an action with the target work loop.
+    @discussion Client function that causes the given action to be called in
+a single threaded manner.  Beware the work loop's gate is recursive and command
+gates can cause direct or indirect re-entrancy.         When the executing on a
+client's thread runAction will sleep until the work loop's gate opens for
+execution of client actions, the action is single threaded against all other
+work loop event sources.  If the command is disabled the attempt to run a command will be stalled until enable is called.
+    @param action Block to be executed in the context of the work loop.
+    @result The return value of action if it was called, kIOReturnBadArgument if action is not defined, kIOReturnAborted if a disabled command gate is free()ed before being reenabled.
+*/
+    IOReturn runActionBlock(ActionBlock action);
+#endif /* __BLOCKS__ */
+
 /*! @function attemptCommand
     @abstract Single thread a command with the target work loop.
     @discussion Client function that causes the current action to be called in
@@ -187,10 +202,10 @@ client's thread attemptCommand will fail if the work loop's gate is closed.
 
 /*! @function commandSleep  
     @abstract Put a thread that is currently holding the command gate to sleep.
-    @discussion Put a thread to sleep waiting for an event but release the gate first.  If the event occurs then the commandGate is closed before the function returns.
+    @discussion Put a thread to sleep waiting for an event but release the gate first.  If the event occurs then the commandGate is closed before the function returns. If the thread does not hold the gate, panic.
     @param event Pointer to an address.
     @param interruptible THREAD_UNINT, THREAD_INTERRUPTIBLE or THREAD_ABORTSAFE.  THREAD_UNINT specifies that the sleep cannot be interrupted by a signal.  THREAD_INTERRUPTIBLE specifies that the sleep may be interrupted by a "kill -9" signal.  THREAD_ABORTSAFE (the default value) specifies that the sleep may be interrupted by any user signal.
-    @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely, kIOReturnNotPermitted if the calling thread does not hold the command gate. */
+    @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely. */
     virtual IOReturn commandSleep(void *event,
                                   UInt32 interruptible = THREAD_ABORTSAFE);
 
@@ -212,11 +227,11 @@ client's thread attemptCommand will fail if the work loop's gate is closed.
 
 /*! @function commandSleep  
     @abstract Put a thread that is currently holding the command gate to sleep.
-    @discussion Put a thread to sleep waiting for an event but release the gate first.  If the event occurs or timeout occurs then the commandGate is closed before the function returns.
+    @discussion Put a thread to sleep waiting for an event but release the gate first.  If the event occurs or timeout occurs then the commandGate is closed before the function returns.  If the thread does not hold the gate, panic.
     @param event Pointer to an address.
        @param deadline Clock deadline to timeout the sleep.
     @param interruptible THREAD_UNINT, THREAD_INTERRUPTIBLE or THREAD_ABORTSAFE.  THREAD_UNINT specifies that the sleep cannot be interrupted by a signal.  THREAD_INTERRUPTIBLE specifies that the sleep may be interrupted by a "kill -9" signal.  THREAD_ABORTSAFE specifies that the sleep may be interrupted by any user signal.
-    @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely, kIOReturnNotPermitted if the calling thread does not hold the command gate. */
+    @result THREAD_AWAKENED - normal wakeup, THREAD_TIMED_OUT - timeout expired, THREAD_INTERRUPTED - interrupted, THREAD_RESTART - restart operation entirely. */
     virtual IOReturn commandSleep(void *event,
                                                                  AbsoluteTime deadline,
                                   UInt32 interruptible);
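
(Illustrative sketch only: fGate and fDataReady are hypothetical; the waiter must already hold the gate, e.g. inside a runAction/runActionBlock callout, since an ungated commandSleep now panics instead of returning kIOReturnNotPermitted.)

    // Wait, gate held, until another gated thread sets fDataReady and calls commandWakeup().
    while (!fDataReady) {
        IOReturn rc = fGate->commandSleep(&fDataReady, THREAD_UNINT);
        if (rc != THREAD_AWAKENED) break;
    }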
index 44502a12ea8a5fa3fe4e77b3630f678fbdeca1b8..34273c47078b87176db887de9b82d8675408b820 100644 (file)
@@ -106,6 +106,10 @@ is implicitly the first paramter in the target member function's parameter list.
     @discussion Backward compatibilty define for the old non-class scoped type definition.  See $link IOEventSource::Action */
  #define IOEventSourceAction IOEventSource::Action
 
+#ifdef __BLOCKS__
+    typedef IOReturn (^ActionBlock)();
+#endif /* __BLOCKS__ */
+
 protected:
 /*! @var eventChainNext
        The next event source in the event chain. nil at end of chain. */
@@ -116,18 +120,24 @@ protected:
 
 /*! @var action
        The action method called when an event has been delivered */
+
+#if XNU_KERNEL_PRIVATE
+    union { Action action; ActionBlock actionBlock; };
+#else /* XNU_KERNEL_PRIVATE */
     Action action;
+#endif /* !XNU_KERNEL_PRIVATE */
 
 /*! @var enabled
        Is this event source enabled to deliver requests to the work-loop. */
     bool enabled;
 
 #if XNU_KERNEL_PRIVATE
-
     enum
     {
-        kPassive = 0x0001,
-        kActive  = 0x0002,
+        kPassive         = 0x0001,
+        kActive          = 0x0002,
+        kActionBlock     = 0x0004,
+        kSubClass0       = 0x0008,
     };
     uint8_t  eventSourceReserved1[1];
     uint16_t flags;
@@ -231,6 +241,26 @@ public:
     @result value of action. */
     virtual IOEventSource::Action getAction() const;
 
+#ifdef __BLOCKS__
+/*! @function setActionBlock
+    @abstract Setter for the action ivar. The current block is released, and the new block is copied.
+    @param block Block pointer of type IOEventSource::ActionBlock. */
+    void setActionBlock(ActionBlock block);
+/*! @function getActionBlock
+    @abstract Getter for action ivar.
+    @result Block pointer of type IOEventSource::ActionBlock, if set, or NULL. */
+    ActionBlock getActionBlock(ActionBlock) const;
+#endif /* __BLOCKS__ */
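
(Illustrative sketch only: fEventSource is a hypothetical IOEventSource subclass instance; setActionBlock copies the block and releases any previously installed block.)

    fEventSource->setActionBlock(^{
        // Runs in work-loop context when the event source fires.
        return kIOReturnSuccess;
    });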
+
+/*! @function setRefcon
+    @abstract Setter for refcon ivar. This function will assert if a block action has been set.
+    @param refcon Refcon. */
+    void setRefcon(void *refcon);
+/*! @function getRefcon
+    @abstract Getter for refcon ivar.
+    @result The refcon. This function will assert if a block action has been set. */
+    void * getRefcon() const;
+
 /*! @function enable
     @abstract Enable event source.
     @discussion A subclass implementation is expected to respect the enabled
index 3cf68bf1105996830603250b1705189dffae5f80..1a5470b457609331abcf494766b981e9dc246c06 100644 (file)
@@ -66,6 +66,10 @@ public:
     @discussion Backward compatibilty define for the old non-class scoped type definition.  See $link IOFilterInterruptSource::Filter */
 #define IOFilterInterruptAction IOFilterInterruptEventSource::Filter
 
+#ifdef __BLOCKS__
+    typedef bool (^FilterBlock)(IOFilterInterruptEventSource *sender);
+#endif /* __BLOCKS__ */
+
 private:
     // Hide the superclass initializers
     virtual bool init(OSObject *inOwner,
@@ -81,7 +85,12 @@ private:
 
 protected:
 /*! @var filterAction Filter callout */
+
+#if XNU_KERNEL_PRIVATE
+    union { Filter filterAction; FilterBlock filterActionBlock; };
+#else /* XNU_KERNEL_PRIVATE */
     Filter filterAction;
+#endif /* !XNU_KERNEL_PRIVATE */
 
 /*! @struct ExpansionData
     @discussion This structure will be used to expand the capablilties of the IOWorkLoop in the future.
@@ -110,6 +119,30 @@ public:
                                   IOService *provider,
                                   int intIndex = 0);
 
+#ifdef __BLOCKS__
+/*! @function filterInterruptEventSource
+    @abstract Factory method to create and initialise an IOFilterInterruptEventSource.  See $link init.
+    @param owner Owner/client of this event source.
+    @param provider Service that provides interrupts.
+    @param intIndex The index of the interrupt within the provider's interrupt sources.
+    @param action Block for the callout routine of this event source.
+    @param filter Block to invoke when a hardware interrupt occurs.
+    @result A new event source if successful, 0 otherwise.  */
+    static IOFilterInterruptEventSource *
+       filterInterruptEventSource(OSObject *owner,
+                                  IOService *provider,
+                                  int intIndex,
+                                  IOInterruptEventSource::ActionBlock action,
+                                  FilterBlock filter);
+#endif /* __BLOCKS__ */
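
(Illustrative sketch only, assuming a driver's start() routine with a hypothetical interrupt provider and an attached work loop.)

    IOFilterInterruptEventSource *fies =
        IOFilterInterruptEventSource::filterInterruptEventSource(this, provider, 0,
            ^(IOInterruptEventSource *sender, int count) {
                // Deferred second-level work; runs on the work loop.
            },
            ^bool (IOFilterInterruptEventSource *sender) {
                // Primary interrupt filter; return true to schedule the action block.
                return true;
            });
    if (fies) getWorkLoop()->addEventSource(fies);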
+
+#if XNU_KERNEL_PRIVATE
+    enum
+    {
+        kFilterBlock = kSubClass0,
+    };
+#endif
+
 /*! @function init
     @abstract Primary initialiser for the IOFilterInterruptEventSource class.
     @param owner Owner/client of this event source.
@@ -125,6 +158,7 @@ successfully.  */
                      IOService *provider,
                      int intIndex = 0);
 
+    virtual void free( void ) APPLE_KEXT_OVERRIDE;
 
 /*! @function signalInterrupt
     @abstract Cause the work loop to schedule the action.
@@ -136,6 +170,13 @@ successfully.  */
     @result value of filterAction. */
     virtual Filter getFilterAction() const;
 
+#ifdef __BLOCKS__
+/*! @function getFilterActionBlock
+    @abstract Getter for the filter action block.
+    @result The filter block, if one has been set, or NULL. */
+    FilterBlock getFilterActionBlock() const;
+#endif /* __BLOCKS__ */
+
 /*! @function normalInterruptOccurred
     @abstract Override $link IOInterruptEventSource::normalInterruptOccured to make a filter callout. */
     virtual void normalInterruptOccurred(void *self, IOService *prov, int ind) APPLE_KEXT_OVERRIDE;
index b6f3b7f62a39822b2d17d7431bdc8b332c97c6b3..f3195d0f104578e4c491709794c1419d2a8a304e 100644 (file)
@@ -310,8 +310,6 @@ typedef struct hibernate_statistics_t hibernate_statistics_t;
 void     IOHibernateSystemInit(IOPMrootDomain * rootDomain);
 
 IOReturn IOHibernateSystemSleep(void);
-void     IOOpenDebugDataFile(const char *fname, uint64_t size);
-void     IOCloseDebugDataFile();
 IOReturn IOHibernateIOKitSleep(void);
 IOReturn IOHibernateSystemHasSlept(void);
 IOReturn IOHibernateSystemWake(void);
index 6acde040b62d6ff8bfffea99a56ac55fd41a0b01..1d63d5c3a5df6f7089a979e3aadebde66164ba05 100644 (file)
@@ -70,6 +70,10 @@ public:
     @param count Number of interrupts seen before delivery. */
     typedef void (*Action)(OSObject *owner, IOInterruptEventSource *sender, int count);
 
+#ifdef __BLOCKS__
+    typedef void (^ActionBlock)(IOInterruptEventSource *sender, int count);
+#endif /* __BLOCKS__ */
+
 /*! @defined IOInterruptEventAction
     @discussion Backward compatibilty define for the old non-class scoped type definition.  See $link IOInterruptEventSource::Action */
 #define IOInterruptEventAction IOInterruptEventSource::Action
@@ -137,6 +141,26 @@ public:
                             IOService *provider = 0,
                             int intIndex = 0);
 
+
+#ifdef __BLOCKS__
+/*! @function interruptEventSource
+    @abstract Factory method to create and initialise an IOInterruptEventSource.
+    @param owner Owning client of the new event source.
+    @param provider IOService that represents the interrupt source.  When no provider is defined, the event source assumes that the client will call the interruptOccurred method explicitly, which starts safe delivery of asynchronous events into the driver.
+    @param intIndex The index of the interrupt within the provider's interrupt sources.
+    @param action Block for the callout routine of this event source.
+    @result A new interrupt event source if successfully created and initialised, 0 otherwise.  */
+    static IOInterruptEventSource *
+       interruptEventSource(OSObject *owner,
+                            IOService *provider,
+                            int intIndex,
+                            ActionBlock action);
+#endif /* __BLOCKS__ */
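
(Illustrative sketch only, with a hypothetical provider nub and an attached work loop.)

    IOInterruptEventSource *ies =
        IOInterruptEventSource::interruptEventSource(this, provider, 0,
            ^(IOInterruptEventSource *sender, int count) {
                // 'count' interrupts were coalesced since the previous callout.
            });
    if (ies) getWorkLoop()->addEventSource(ies);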
+
+#if XNU_KERNEL_PRIVATE
+    static void actionToBlock(OSObject *owner, IOInterruptEventSource *sender, int count);
+#endif /* XNU_KERNEL_PRIVATE */
+
 /*! @function init
     @abstract Primary initialiser for the IOInterruptEventSource class.
     @param owner Owning client of the new event source.
index fa8aa7b33376017a886117084c8d242cadcf8226..d58ea9f074ddc53f48f9be70b95055102213aa35 100644 (file)
@@ -50,6 +50,16 @@ struct IOInterruptSource {
 };
 typedef struct IOInterruptSource IOInterruptSource;
 
+#ifdef XNU_KERNEL_PRIVATE
+
+struct IOInterruptSourcePrivate {
+    void * vectorBlock;
+};
+typedef struct IOInterruptSourcePrivate IOInterruptSourcePrivate;
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+
 #endif /* __cplusplus */
 
 typedef void (*IOInterruptHandler)(void *target, void *refCon,
index 44ed1180715a5021b1f8d571fa25ec529c39087b..1d5bf5afa77f4d3a1c329f77c15d90f6d1952b77 100644 (file)
 #define kIOMinimumSegmentAlignmentByteCountKey  "IOMinimumSegmentAlignmentByteCount"  // (OSNumber)
 #define kIOMaximumSegmentAddressableBitCountKey "IOMaximumSegmentAddressableBitCount" // (OSNumber)
 #define kIOMinimumSaturationByteCountKey        "IOMinimumSaturationByteCount"        // (OSNumber)
+#define kIOMaximumSwapWriteKey                  "IOMaximumSwapWrite"                  // (OSNumber)
 
 // properties found in services that wish to describe an icon
 //
index 35c3305816d0fa46e8fa86ec655b0007d3c81ff9..35d037da0f7a6dcc023c483be0d7e936dc32fa5b 100644 (file)
@@ -109,9 +109,7 @@ enum {
     kIOMemoryPreparedReadOnly  = 0x00008000,
 #endif
     kIOMemoryPersistent                = 0x00010000,
-#ifdef XNU_KERNEL_PRIVATE
     kIOMemoryMapCopyOnWrite    = 0x00020000,
-#endif
     kIOMemoryRemote            = 0x00040000,
     kIOMemoryThreadSafe                = 0x00100000,   // Shared with Buffer MD
     kIOMemoryClearEncrypt      = 0x00200000,   // Shared with Buffer MD
index 1a5883abd8fa0ef0d0047b689f20fe16730ce8c0..8d3fd47feef04bf1d60016666612277bfea7a2f2 100644 (file)
@@ -118,6 +118,8 @@ public:
     IOReturn getPageCounts(IOByteCount * residentPageCount,
                            IOByteCount * dirtyPageCount);
 
+    virtual uint64_t getPreparationID( void ) APPLE_KEXT_OVERRIDE;
+
 #define IOMULTIMEMORYDESCRIPTOR_SUPPORTS_GETPAGECOUNTS 1
 
 private:
index 84d27043b03cc749c6142e06843c239bd499d9cf..f22e999fcd5bc1decdfadc531e0ddd34dc88f232 100644 (file)
@@ -119,9 +119,17 @@ public:
 #include <IOKit/IOTypes.h>
 #include <IOKit/IOHibernatePrivate.h>
 
+// kern_open_file_for_direct_io() flags
 enum
 {
-    kIOPolledFileSSD = 0x00000001
+    kIOPolledFileCreate    = 0x00000001,
+    kIOPolledFileHibernate = 0x00000002,
+};
+
+// kern_open_file_for_direct_io() oflags
+enum
+{
+    kIOPolledFileSSD    = 0x00000001
 };
 
 #if !defined(__cplusplus)
@@ -174,7 +182,8 @@ typedef struct IOPolledFileCryptVars IOPolledFileCryptVars;
 
 #if defined(__cplusplus)
 
-IOReturn IOPolledFileOpen(const char * filename, 
+IOReturn IOPolledFileOpen(const char * filename,
+                         uint32_t flags,
                          uint64_t setFileSize, uint64_t fsFreeSize,
                          void * write_file_addr, size_t write_file_len,
                          IOPolledFileIOVars ** fileVars,
@@ -224,7 +233,8 @@ __BEGIN_DECLS
 typedef void (*kern_get_file_extents_callback_t)(void * ref, uint64_t start, uint64_t size);
 
 struct kern_direct_file_io_ref_t *
-kern_open_file_for_direct_io(const char * name, boolean_t create_file,
+kern_open_file_for_direct_io(const char * name,
+                            uint32_t flags,
                             kern_get_file_extents_callback_t callback, 
                             void * callback_ref,
                              off_t set_file_size,
index 97f66e612fae0c7932cac359061744152cc02421..59ba42d991caa53a556bca81c54ec8a746e1a78b 100644 (file)
@@ -57,6 +57,14 @@ enum {
     kIORegistryIterateParents       = 0x00000002,
 };
 
+#ifdef KERNEL_PRIVATE
+enum
+{
+       kIORegistryEntryIndexedPropertyCLPC = 0,
+       kIORegistryEntryIndexedPropertyCount,
+};
+#endif /* KERNEL_PRIVATE */
+
 /*! @class IORegistryEntry : public OSObject
     @abstract The base class for all objects in the registry.
     @discussion The IORegistryEntry base class provides functions for describing graphs of connected registry entries, each with a dictionary-based property table. Entries may be connected in different planes, with differing topologies. Access to the registry is protected against multiple threads. Inside the kernel planes are specified with plane objects and are published by the creator - IOService exports the gIOServicePlane plane object for example. Non kernel clients specify planes by their name.
@@ -280,6 +288,11 @@ public:
 
     virtual bool setProperty(const OSSymbol * aKey, OSObject * anObject);
 
+#ifdef KERNEL_PRIVATE
+    OSObject * setIndexedProperty(uint32_t index, OSObject * anObject);
+    OSObject * getIndexedProperty(uint32_t index) const;
+#endif /* KERNEL_PRIVATE */
+
 /*! @function setProperty
     @abstract Synchronized method to add a property to a registry entry's property table.
     @discussion This method will add or replace a property in a registry entry's property table, using the OSDictionary::setObject semantics. This method is synchronized with other IORegistryEntry accesses to the property table.
index 504d6f2219315de9d07fa34a97be3e1b4c6c4173..3f1b2877f67108e721c46ae60789e2da3b8a1b56 100644 (file)
@@ -62,6 +62,7 @@ typedef       kern_return_t           IOReturn;
 #define sub_iokit_hidsystem               err_sub(14)
 #define sub_iokit_scsi                    err_sub(16)
 #define sub_iokit_usbaudio                err_sub(17)
+#define sub_iokit_wirelesscharging        err_sub(18)
 //#define sub_iokit_pccard                err_sub(21)
 #ifdef PRIVATE
 #define sub_iokit_nvme                    err_sub(28)
@@ -81,6 +82,7 @@ typedef       kern_return_t           IOReturn;
 #define sub_iokit_sdio                    err_sub(0x174)
 #define sub_iokit_wlan                    err_sub(0x208)
 #define sub_iokit_appleembeddedsleepwakehandler  err_sub(0x209)
+#define sub_iokit_appleppm                err_sub(0x20A)
 
 #define sub_iokit_vendor_specific         err_sub(-2)
 #define sub_iokit_reserved                err_sub(-1)
index 20ad4e6abca80b1f4438e27a20bbd4fef06017dc..e06839160e26486056fc31425504cf7c2b73aca4 100644 (file)
@@ -155,6 +155,10 @@ extern SInt32 IOServiceOrdering( const OSMetaClassBase * inObj1, const OSMetaCla
 typedef void (*IOInterruptAction)( OSObject * target, void * refCon,
                    IOService * nub, int source );
 
+#ifdef __BLOCKS__
+typedef void (^IOInterruptActionBlock)(IOService * nub, int source);
+#endif /* __BLOCKS__ */
+
 /*! @typedef IOServiceNotificationHandler
     @param target Reference supplied when the notification was registered.
     @param refCon Reference constant supplied when the notification was registered.
@@ -167,6 +171,12 @@ typedef bool (*IOServiceMatchingNotificationHandler)( void * target, void * refC
                                   IOService * newService,
                                   IONotifier * notifier );
 
+#ifdef __BLOCKS__
+typedef bool (^IOServiceMatchingNotificationHandlerBlock)(IOService * newService,
+                                  IONotifier * notifier );
+#endif /* __BLOCKS__ */
+
+
 /*! @typedef IOServiceInterestHandler
     @param target Reference supplied when the notification was registered.
     @param refCon Reference constant supplied when the notification was registered.
@@ -179,6 +189,11 @@ typedef IOReturn (*IOServiceInterestHandler)( void * target, void * refCon,
                                               UInt32 messageType, IOService * provider,
                                               void * messageArgument, vm_size_t argSize );
 
+#ifdef __BLOCKS__
+typedef IOReturn (^IOServiceInterestHandlerBlock)( uint32_t messageType, IOService * provider,
+                                                   void   * messageArgument, size_t argSize );
+#endif /* __BLOCKS__ */
+
 typedef void (*IOServiceApplierFunction)(IOService * service, void * context);
 typedef void (*OSObjectApplierFunction)(OSObject * object, void * context);
 
@@ -774,6 +789,14 @@ public:
                             void * target, void * ref = 0,
                             SInt32 priority = 0 );
 
+
+#ifdef __BLOCKS__
+    static IONotifier * addMatchingNotification(
+                            const OSSymbol * type, OSDictionary * matching,
+                            SInt32 priority,
+                            IOServiceMatchingNotificationHandlerBlock handler);
+#endif /* __BLOCKS__ */
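
(Illustrative sketch only: the matching class "IOMedia" and priority 0 are arbitrary examples.)

    IONotifier *notifier = IOService::addMatchingNotification(
        gIOFirstMatchNotification, IOService::serviceMatching("IOMedia"), 0,
        ^bool (IOService *newService, IONotifier *note) {
            // Handle the newly matched service here.
            return true;
        });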
+
 /*! @function waitForService
     @abstract Deprecated use waitForMatchingService(). Waits for a matching to service to be published.
     @discussion Provides a method of waiting for an IOService object matching the supplied matching dictionary to be registered and fully matched. 
@@ -1113,6 +1136,19 @@ public:
     virtual IOReturn registerInterrupt(int source, OSObject *target,
                                        IOInterruptAction handler,
                                        void *refCon = 0);
+
+#ifdef __BLOCKS__
+/*! @function registerInterruptBlock
+    @abstract Registers a block handler for a device supplying interrupts.
+    @discussion This method installs a block to be invoked at primary interrupt time for a device's interrupt. Only one handler may be installed per interrupt source. IOInterruptEventSource provides a work loop based abstraction for interrupt delivery that may be more appropriate for work loop based drivers.
+    @param source The index of the interrupt source in the device.
+    @param target An object instance to be passed to the interrupt handler.
+    @param handler The block to be invoked at primary interrupt time when the interrupt occurs. The handler should process the interrupt by clearing the interrupt, or by disabling the source.
+    @result An IOReturn code.<br><code>kIOReturnNoInterrupt</code> is returned if the source is not valid; <code>kIOReturnNoResources</code> is returned if the interrupt already has an installed handler. */
+
+       IOReturn registerInterruptBlock(int source, OSObject *target,
+                                     IOInterruptActionBlock handler);
+#endif /* __BLOCKS__ */
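
(Illustrative sketch only: provider is a hypothetical nub supplying interrupt source 0.)

    IOReturn ret = provider->registerInterruptBlock(0, this,
        ^(IOService *nub, int source) {
            // Runs at primary interrupt time; clear or disable the source here.
        });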
                                        
 /*! @function unregisterInterrupt
     @abstract Removes a C function interrupt handler for a device supplying hardware interrupts.
@@ -1215,6 +1251,11 @@ public:
                                            IOServiceInterestHandler handler,
                                            void * target, void * ref = 0 );
 
+#ifdef __BLOCKS__
+    IONotifier * registerInterest(const OSSymbol * typeOfInterest,
+                                  IOServiceInterestHandlerBlock handler);
+#endif /* __BLOCKS__ */
+
     virtual void applyToProviders( IOServiceApplierFunction applier,
                                    void * context );
 
@@ -1841,13 +1882,15 @@ public:
     uint32_t getPowerStateForClient( const OSSymbol * client );
     static const char * getIOMessageString( uint32_t msg );
     static void setAdvisoryTickleEnable( bool enable );
-    void reset_watchdog_timer( void );
+    void reset_watchdog_timer(IOService *obj, int timeout);
     void start_watchdog_timer ( void );
-    bool stop_watchdog_timer ( void );
+    void stop_watchdog_timer ( void );
+    void start_watchdog_timer(uint64_t deadline);
     IOReturn registerInterestForNotifier( IONotifier *notify, const OSSymbol * typeOfInterest,
                   IOServiceInterestHandler handler, void * target, void * ref );
 
     static IOWorkLoop * getIOPMWorkloop( void );
+    bool getBlockingDriverCall(thread_t *thread, const void **callMethod);
 
 protected:
     bool tellClientsWithResponse( int messageType );
index 17662cb4e081a4e2b673800bc13fcb1fbd1cbb17..f04fe93a1783a2815cd172569400d961fabc23da 100644 (file)
@@ -72,6 +72,7 @@ struct IOPMDriverCallEntry {
     queue_chain_t   link;
     thread_t        thread;
     IOService *     target;
+    const void  *callMethod;
 };
 
 // Power clients (desires)
index 16e7cdde27a529b5b6d6f78ef20bf6319d2938b1..5b8a9f3e60086af8ae0cbdb5e2796b6db748eac6 100644 (file)
@@ -32,6 +32,9 @@
 #ifdef dequeue
 #undef dequeue
 #endif
+#ifdef enqueue
+#undef enqueue
+#endif
 
 #define DISABLE_DATAQUEUE_WARNING /* IODataQueue is deprecated, please use IOSharedDataQueue instead */
 
@@ -148,6 +151,12 @@ public:
      */
     virtual Boolean enqueue(void *data, UInt32 dataSize) APPLE_KEXT_OVERRIDE;
 
+#ifdef PRIVATE
+    /* workaround for queue.h redefine, please do not use */
+    __inline__ Boolean enqueue_tail(void *data, UInt32 dataSize) { return (IOSharedDataQueue::enqueue(data, dataSize)); }
+#endif
+
+#if APPLE_KEXT_VTABLE_PADDING
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 0);
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 1);
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 2);
@@ -156,6 +165,7 @@ public:
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 5);
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 6);
     OSMetaClassDeclareReservedUnused(IOSharedDataQueue, 7);
+#endif
 };
 
 #endif /* _IOKIT_IOSHAREDDATAQUEUE_H */
index 91ab47cf0293988ce4b5100a388e130ee5df7f66..8ef49ef191cd9411a34a688f5f6a80beef9ac8b1 100644 (file)
@@ -159,6 +159,10 @@ public:
     @param sender The object that timed out. */
     typedef void (*Action)(OSObject *owner, IOTimerEventSource *sender);
 
+#ifdef __BLOCKS__
+    typedef void (^ActionBlock)(IOTimerEventSource *sender);
+#endif /* __BLOCKS__ */
+
     static IOTimerEventSource *
        timerEventSource(OSObject *owner, Action action = 0);
 
@@ -171,6 +175,22 @@ public:
     static IOTimerEventSource *
        timerEventSource(uint32_t options, OSObject *owner, Action action = 0);
 
+#ifdef __BLOCKS__
+/*! @function timerEventSource
+    @abstract Allocates and returns an initialized timer instance.
+    @param options Mask of kIOTimerEventSourceOptions* options.
+    @param inOwner The object that will be passed to the Action callback.
+    @param action Block for the callout routine of this event source.
+    */
+    static IOTimerEventSource *
+       timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action);
+#endif /* __BLOCKS__ */
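
(Illustrative sketch only, assuming the driver already owns a work loop; passing 0 for options is taken here to mean no special timer options.)

    IOTimerEventSource *timer =
        IOTimerEventSource::timerEventSource(0, this, ^(IOTimerEventSource *sender) {
            // Timeout fired in work-loop context; rearm if more work is pending.
            sender->setTimeoutMS(100);
        });
    if (timer) {
        getWorkLoop()->addEventSource(timer);
        timer->setTimeoutMS(100);
    }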
+
+#if XNU_KERNEL_PRIVATE
+       __inline__ void invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts,
+            OSObject * owner, IOWorkLoop * workLoop);
+#endif /* XNU_KERNEL_PRIVATE */
+
 /*! @function init
     @abstract Initializes the timer with an owner, and a handler to call when the timeout expires.
     */
index 62b5a6b089c4402de5e87adf970ead7b76f6f77c..c3f056001336886f0c53dcdf155109014f199ff0 100644 (file)
@@ -83,7 +83,7 @@ typedef mach_vm_address_t     IOVirtualAddress;
 typedef vm_address_t           IOVirtualAddress;
 #endif
 
-#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL))
+#if !defined(__arm__) && !defined(__i386__) && !(defined(__x86_64__) && !defined(KERNEL)) && !(defined(__arm64__) && !defined(__LP64__))
 typedef IOByteCount64          IOByteCount;
 #else
 typedef IOByteCount32          IOByteCount;
index afc4979b01214be648781a8047daa22a9a7f3236..c62c13216ce4616c32199a3a3b0d356be031396d 100644 (file)
@@ -74,6 +74,11 @@ member function's parameter list.
     typedef IOReturn (*Action)(OSObject *target,
                               void *arg0, void *arg1,
                               void *arg2, void *arg3);
+
+#ifdef __BLOCKS__
+    typedef IOReturn (^ActionBlock)();
+#endif /* __BLOCKS__ */
+
     enum {
        kPreciousStack  = 0x00000001,
        kTimeLockPanics = 0x00000002,
@@ -292,6 +297,16 @@ public:
                               void *arg0 = 0, void *arg1 = 0,
                               void *arg2 = 0, void *arg3 = 0);
 
+#ifdef __BLOCKS__
+/*! @function runActionBlock
+    @abstract Single thread a call to an action block with the work-loop.
+    @discussion Client function that causes the given action to be called in a single threaded manner.  Beware: the work-loop's gate is recursive and runActionBlock can cause direct or indirect re-entrancy.  When executing on a client's thread, runActionBlock will sleep until the work-loop's gate opens for execution of client actions; the action is then single threaded against all other work-loop event sources.
+    @param action Block to be executed in work-loop context.
+    @result Returns the result of the action block.
+*/
+    IOReturn runActionBlock(ActionBlock action);
+#endif /* __BLOCKS__ */
+
 /*! @function runEventSources
     @discussion Consists of the inner 2 loops of the threadMain function(qv).
     The outer loop terminates when there is no more work, and the inside loop
diff --git a/iokit/IOKit/perfcontrol/IOPerfControl.h b/iokit/IOKit/perfcontrol/IOPerfControl.h
new file mode 100644 (file)
index 0000000..886d0a0
--- /dev/null
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ */
+
+#pragma once
+
+#ifdef KERNEL_PRIVATE
+#ifdef __cplusplus
+
+#include <IOKit/IOService.h>
+
+struct thread_group;
+
+enum
+{
+    kIOPerfControlClientWorkUntracked = 0,
+};
+
+/*!
+ * @class IOPerfControlClient : public OSObject
+ * @abstract Class which implements an interface allowing device drivers to participate in performance control.
+ * @discussion TODO
+ */
+class IOPerfControlClient final : public OSObject
+{
+    OSDeclareDefaultStructors(IOPerfControlClient);
+
+protected:
+    virtual bool init(IOService *driver, uint64_t maxWorkCapacity);
+
+public:
+    /*!
+     * @function copyClient
+     * @abstract Return a retained reference to a client object, to be released by the driver. It may be
+     * shared with other drivers in the system.
+     * @param driver The device driver that will be using this interface.
+     * @param maxWorkCapacity The maximum number of concurrent work items supported by the device driver.
+     * @returns An instance of IOPerfControlClient.
+     */
+    static IOPerfControlClient *copyClient(IOService *driver, uint64_t maxWorkCapacity);
+
+    /*!
+     * @function registerDevice
+     * @abstract Inform the system that work will be dispatched to a device in the future.
+     * @discussion The system will do some one-time setup work associated with the device, and may block the
+     * current thread during the setup. Devices should not be passed to workSubmit, workSubmitAndBegin,
+     * workBegin, or workEnd until they have been successfully registered. The unregistration process happens
+     * automatically when the device object is deallocated.
+     * @param device The device object. Some platforms require device to be a specific subclass of IOService.
+     * @returns kIOReturnSuccess or an IOReturn error code
+     */
+    virtual IOReturn registerDevice(IOService *driver, IOService *device);
+
+    /*!
+     * @function unregisterDevice
+     * @abstract Inform the system that work will no longer be dispatched to a device in the future.
+     * @discussion This call is optional as the unregistration process happens automatically when the device
+     * object is deallocated. This call may block the current thread and/or acquire locks. It should not be
+     * called until after all submitted work has been ended using workEnd.
+     * @param device The device object. Some platforms require device to be a specific subclass of IOService.
+     */
+    virtual void unregisterDevice(IOService *driver, IOService *device);
+
+    /*!
+     * @struct WorkSubmitArgs
+     * @discussion Drivers may submit additional device-specific arguments related to the submission of a work item
+     * by passing a struct with WorkSubmitArgs as its first member. Note: Drivers are responsible for publishing
+     * a header file describing these arguments.
+     */
+    struct WorkSubmitArgs
+    {
+        uint32_t version;
+        uint32_t size;
+        uint64_t submit_time;
+        uint64_t reserved[4];
+        void *driver_data;
+    };
+
+    /*!
+     * @function workSubmit
+     * @abstract Tell the performance controller that work was submitted.
+     * @param device The device that will execute the work. Some platforms require device to be a
+     * specific subclass of IOService.
+     * @param args Optional device-specific arguments related to the submission of this work item.
+     * @returns A token representing this work item, which must be passed to workEnd when the work is finished
+     * unless the token equals kIOPerfControlClientWorkUntracked. Failure to do this will result in memory leaks
+     * and a degradation of system performance.
+     */
+    virtual uint64_t workSubmit(IOService *device, WorkSubmitArgs *args = nullptr);
+
+    /*!
+     * @struct WorkBeginArgs
+     * @discussion Drivers may submit additional device-specific arguments related to the start of a work item
+     * by passing a struct with WorkBeginArgs as its first member. Note: Drivers are responsible for publishing
+     * a header file describing these arguments.
+     */
+    struct WorkBeginArgs
+    {
+        uint32_t version;
+        uint32_t size;
+        uint64_t begin_time;
+        uint64_t reserved[4];
+        void *driver_data;
+    };
+
+    /*!
+     * @function workSubmitAndBegin
+     * @abstract Tell the performance controller that work was submitted and immediately began executing.
+     * @param device The device that is executing the work. Some platforms require device to be a
+     * specific subclass of IOService.
+     * @param submitArgs Optional device-specific arguments related to the submission of this work item.
+     * @param beginArgs Optional device-specific arguments related to the start of this work item.
+     * @returns A token representing this work item, which must be passed to workEnd when the work is finished
+     * unless the token equals kIOPerfControlClientWorkUntracked. Failure to do this will result in memory leaks
+     * and a degradation of system performance.
+     */
+    virtual uint64_t workSubmitAndBegin(IOService *device, WorkSubmitArgs *submitArgs = nullptr,
+                                        WorkBeginArgs *beginArgs = nullptr);
+
+    /*!
+     * @function workBegin
+     * @abstract Tell the performance controller that previously submitted work began executing.
+     * @param device The device that is executing the work. Some platforms require device to be a
+     * specific subclass of IOService.
+     * @param args Optional device-specific arguments related to the start of this work item.
+     */
+    virtual void workBegin(IOService *device, uint64_t token, WorkBeginArgs *args = nullptr);
+
+    /*!
+     * @struct WorkEndArgs
+     * @discussion Drivers may submit additional device-specific arguments related to the end of a work item
+     * by passing a struct with WorkEndArgs as its first member. Note: Drivers are responsible for publishing
+     * a header file describing these arguments.
+     */
+    struct WorkEndArgs
+    {
+        uint32_t version;
+        uint32_t size;
+        uint64_t end_time;
+        uint64_t reserved[4];
+        void *driver_data;
+    };
+
+    /*!
+     * @function workEnd
+     * @abstract Tell the performance controller that previously started work finished executing.
+     * @param device The device that executed the work. Some platforms require device to be a
+     * specific subclass of IOService.
+     * @param args Optional device-specific arguments related to the end of this work item.
+     * @param done Optional. Set to false if the work has not yet completed. Drivers are then responsible for
+     * calling workBegin when the work resumes and workEnd with done set to true when it has completed.
+     */
+    virtual void workEnd(IOService *device, uint64_t token, WorkEndArgs *args = nullptr, bool done = true);
+
+    /*!
+     * @struct PerfControllerInterface
+     * @discussion Function pointers necessary to register a performance controller. Not for general driver use.
+     */
+    struct PerfControllerInterface
+    {
+        struct WorkState {
+            uint64_t thread_group_id;
+            void *thread_group_data;
+            void *work_data;
+            uint32_t work_data_size;
+        };
+
+        using RegisterDeviceFunction = IOReturn (*)(IOService *);
+        using WorkCanSubmitFunction = bool (*)(IOService *, WorkState *, WorkSubmitArgs *);
+        using WorkSubmitFunction = void (*)(IOService *, uint64_t, WorkState *, WorkSubmitArgs *);
+        using WorkBeginFunction = void (*)(IOService *, uint64_t, WorkState *, WorkBeginArgs *);
+        using WorkEndFunction = void (*)(IOService *, uint64_t, WorkState *, WorkEndArgs *, bool);
+
+        uint64_t version;
+        RegisterDeviceFunction registerDevice;
+        RegisterDeviceFunction unregisterDevice;
+        WorkCanSubmitFunction workCanSubmit;
+        WorkSubmitFunction workSubmit;
+        WorkBeginFunction workBegin;
+        WorkEndFunction workEnd;
+    };
+
+    /*!
+     * @function registerPerformanceController
+     * @abstract Register a performance controller to receive callbacks. Not for general driver use.
+     * @param interface Struct containing callback functions implemented by the performance controller.
+     * @returns kIOReturnSuccess or kIOReturnError if the interface was already registered.
+     */
+    virtual IOReturn registerPerformanceController(PerfControllerInterface interface);
+
+private:
+    struct WorkTableEntry
+    {
+        struct thread_group *thread_group;
+        bool started;
+        uint8_t perfcontrol_data[32];
+    };
+
+    // TODO: size of table should match sum(maxWorkCapacity) of all users
+    static constexpr size_t kWorkTableNumEntries = 1024;
+
+    uint64_t allocateToken(thread_group *thread_group);
+    void deallocateToken(uint64_t token);
+    bool getEntryForToken(uint64_t token, WorkTableEntry &entry);
+    void markEntryStarted(uint64_t token, bool started);
+
+    PerfControllerInterface interface;
+    IOLock *interfaceLock;
+    OSSet *deviceRegistrationList;
+
+    // TODO: replace with ltable or pool of objects
+    WorkTableEntry workTable[kWorkTableNumEntries];
+    size_t workTableNextIndex;
+    IOSimpleLock *workTableLock;
+};
+
+#endif /* __cplusplus */
+#endif /* KERNEL_PRIVATE */
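
(Illustrative sketch only of the token lifecycle documented above: this driver, device, and the capacity of 8 are hypothetical, and error paths are elided.)

    IOPerfControlClient *perfClient = IOPerfControlClient::copyClient(this, 8);
    if (perfClient && perfClient->registerDevice(this, device) == kIOReturnSuccess) {
        uint64_t token = perfClient->workSubmit(device);
        if (token != kIOPerfControlClientWorkUntracked) {
            perfClient->workBegin(device, token);
            // ... hardware executes the work ...
            perfClient->workEnd(device, token);
        }
    }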
diff --git a/iokit/IOKit/perfcontrol/Makefile b/iokit/IOKit/perfcontrol/Makefile
new file mode 100644 (file)
index 0000000..3f8cad1
--- /dev/null
@@ -0,0 +1,32 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+IOKIT_FRAMEDIR = $(FRAMEDIR)/IOKit.framework/Versions/A
+INCDIR = $(IOKIT_FRAMEDIR)/Headers
+LCLDIR = $(IOKIT_FRAMEDIR)/PrivateHeaders
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+MI_DIR = perfcontrol
+NOT_EXPORT_HEADERS =
+
+ALL_HEADERS = $(shell (cd $(SOURCE); echo *.h))
+
+# Install these only in Kernel.framework's PrivateHeaders (not Headers).
+NOT_KF_MI_HEADERS  = $(NOT_EXPORT_HEADERS)                     \
+                    IOPerfControl.h
+
+INSTALL_MI_LIST        =
+INSTALL_MI_LCL_LIST =
+INSTALL_MI_DIR = $(MI_DIR)
+
+EXPORT_MI_LIST = $(filter-out $(NOT_EXPORT_HEADERS), $(ALL_HEADERS))
+EXPORT_MI_DIR = IOKit/$(MI_DIR)
+
+INSTALL_KF_MI_LIST = $(filter-out $(NOT_KF_MI_HEADERS), $(ALL_HEADERS))
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
index c8393354369ee03fd35e6f1e620c6b129c6b371f..930a45d8d4ec7c85da46af14ec011ff86bee8dde 100644 (file)
@@ -249,11 +249,29 @@ enum {
 
 /* kIOPMDeepSleepDelayKey
  * Key refers to a CFNumberRef that represents the delay in seconds before
- * entering Deep Sleep state. The property is not present if Deep Sleep is
- * unsupported.
+ * entering Deep Sleep state when on battery power and when remaining
+ * battery capacity is below a particular threshold (e.g., 50%.) The
+ * property is not present if Deep Sleep is unsupported.
  */
 #define kIOPMDeepSleepDelayKey              "Standby Delay"
 
+/* kIOPMDeepSleepDelayHighKey
+ * Key refers to a CFNumberRef that represents the delay in seconds before
+ * entering Deep Sleep state. This is used instead of the value specified by
+ * kIOPMDeepSleepDelayKey if the remaining battery capacity is above a
+ * particular threshold (e.g. 50%) or on AC power. The property is not
+ * present if Deep Sleep is unsupported.
+ */
+#define kIOPMDeepSleepDelayHighKey          "High Standby Delay"
+
+/* kIOPMStandbyBatteryThresholdKey
+ * Key refers to a CFNumberRef that represents the threshold used to choose
+ * between the normal deep sleep delay and the high deep sleep delay (as a
+ * percentage of total battery capacity remaining.) The property is not
+ * present if Deep Sleep is unsupported.
+ */
+#define kIOPMStandbyBatteryThresholdKey     "Standby Battery Threshold"
+
 /* kIOPMDestroyFVKeyOnStandbyKey
  * Specifies if FileVault key can be stored when going to standby mode
  * It has a boolean value,
@@ -631,11 +649,15 @@ enum {
     kIOPSFamilyCodeUSBChargingPortDownstream  = iokit_family_err(sub_iokit_usb, 5),
     kIOPSFamilyCodeUSBChargingPort    = iokit_family_err(sub_iokit_usb, 6),
     kIOPSFamilyCodeUSBUnknown     = iokit_family_err(sub_iokit_usb, 7),
+    kIOPSFamilyCodeUSBCBrick      = iokit_family_err(sub_iokit_usb, 8),
+    kIOPSFamilyCodeUSBCTypeC      = iokit_family_err(sub_iokit_usb, 9),
+    kIOPSFamilyCodeUSBCPD         = iokit_family_err(sub_iokit_usb, 10),
     kIOPSFamilyCodeAC       = iokit_family_err(sub_iokit_pmu, 0),
     kIOPSFamilyCodeExternal     = iokit_family_err(sub_iokit_pmu, 1),
     kIOPSFamilyCodeExternal2     = iokit_family_err(sub_iokit_pmu, 2),
     kIOPSFamilyCodeExternal3     = iokit_family_err(sub_iokit_pmu, 3),
     kIOPSFamilyCodeExternal4     = iokit_family_err(sub_iokit_pmu, 4),
+    kIOPSFamilyCodeExternal5     = iokit_family_err(sub_iokit_pmu, 5),
 };
 
 // values for kIOPMPSAdapterDetailsErrorFlagsKey
index cd0db25bf743613106d120db491f2a7640138c10..015c70a054777545c5218096a1c50bde4f5444c6 100644 (file)
@@ -40,6 +40,7 @@ enum {
     kTenMinutesInSeconds = 600
 };
 
+
 /*! @class IOPMPowerSource
  *
  * See IOKit/pwr_mgt/IOPM.h for power source keys relevant to this class. These
index b3f7b33972a4375397c8cdfa8f58efb52d190074..798be5d88d8f4240304b8bc9ad0fb7623306d072 100644 (file)
@@ -857,11 +857,14 @@ typedef struct {
 
 #define SWD_HDR_SIGNATURE       0xdeb8da2a
 
-#define SWD_BUF_SIZE            (40*PAGE_SIZE)
-#define SWD_INITIAL_STACK_SIZE  ((SWD_BUF_SIZE/2)-sizeof(swd_hdr))
+#define SWD_STACKSHOT_SIZE      (40*PAGE_SIZE)
+#define SWD_COMPRESSED_BUFSIZE  (5*PAGE_SIZE)
+#define SWD_ZLIB_BUFSIZE        (10*PAGE_SIZE)
+#define SWD_STACKSHOT_VAR_PREFIX    "sleepwake_diags"
 
 #define SWD_SPINDUMP_SIZE          (256*1024)
 #define SWD_INITIAL_SPINDUMP_SIZE  ((SWD_SPINDUMP_SIZE/2)-sizeof(swd_hdr))
+#define SWD_MAX_STACKSHOTS          (10)
 
 /* Bits in swd_flags */
 #define SWD_WDOG_ENABLED        0x01
@@ -880,25 +883,11 @@ typedef struct {
 
 
 /* Filenames associated with the stackshots/logs generated by the SWD */
-#define kSleepWakeStackBinFilename          "/var/log/SleepWakeStacks.bin"
-#define kSleepWakeStackFilename             "/var/log/SleepWakeStacks.dump"
-#define kSleepWakeLogFilename               "/var/log/SleepWakeLog.dump"
-#define kAppleOSXWatchdogStackFilename      "/var/log/AppleOSXWatchdogStacks.dump"
-#define kAppleOSXWatchdogLogFilename        "/var/log/AppleOSXWatchdogLog.dump"
+#define kOSWatchdogStacksFilename           "/var/log/OSXWatchdogStacks.gz"
+#define kOSWatchdogFailureStringFile        "/var/log/OSWatchdogFailureString.txt"
+#define kSleepWakeStacksFilename            "/var/log/SleepWakeStacks.gz"
+#define kSleepWakeFailureStringFile         "/var/log/SleepWakeFailureString.txt"
 
-inline char const* getDumpStackFilename(swd_hdr *hdr)
-{
-    if (hdr && hdr->is_osx_watchdog)
-        return kAppleOSXWatchdogStackFilename;
-    return kSleepWakeStackFilename;
-}
-
-inline char const* getDumpLogFilename(swd_hdr *hdr)
-{
-    if (hdr && hdr->is_osx_watchdog)
-        return kAppleOSXWatchdogLogFilename;
-    return kSleepWakeLogFilename;
-}
 
 /* RootDomain IOReporting channels */
 #define kSleepCntChID IOREPORT_MAKEID('S','l','e','e','p','C','n','t')
index 95474d652b91155fe2327c5787af1341b6425ee9..eef58a3205980998cdbdf0dda98c6a3733dc539e 100644 (file)
@@ -511,7 +511,7 @@ public:
                             uintptr_t param1, uintptr_t param2, uintptr_t param3 = 0);
     void        tracePoint(uint8_t point);
     void        traceDetail(uint32_t msgType, uint32_t msgIndex, uint32_t delay);
-    void        traceDetail(OSObject *notifier);
+    void        traceDetail(OSObject *notifier, bool start);
     void        traceAckDelay(OSObject *notifier, uint32_t response, uint32_t delay_ms);
 
     void        startSpinDump(uint32_t spindumpKind);
@@ -553,12 +553,10 @@ public:
     void        sleepWakeDebugTrig(bool restart);
     void        sleepWakeDebugEnableWdog();
     bool        sleepWakeDebugIsWdogEnabled();
-    static void saveTimeoutAppStackShot(void *p0, void *p1);
     void        sleepWakeDebugSaveSpinDumpFile();
-    void        swdDebugSetup();
-    void        swdDebugTeardown();
     bool        checkShutdownTimeout();
     void        panicWithShutdownLog(uint32_t timeoutInMs);
+    uint32_t    getWatchdogTimeout();
 
 private:
     friend class PMSettingObject;
@@ -581,10 +579,6 @@ private:
                                     IOService * newService,
                                     IONotifier * notifier);
 
-    static bool IONVRAMMatchPublished( void * target, void * refCon,
-                                    IOService * newService,
-                                    IONotifier * notifier);
-
     static bool batteryPublished( void * target, void * refCon,
                                     IOService * resourceService,
                                     IONotifier * notifier);
@@ -654,8 +648,6 @@ private:
     thread_call_t           extraSleepTimer;
     thread_call_t           diskSyncCalloutEntry;
     thread_call_t           fullWakeThreadCall;
-    thread_call_t           swdDebugSetupEntry;
-    thread_call_t           swdDebugTearDownEntry;
     thread_call_t           updateConsoleUsersEntry;
 
     // Track system capabilities.
@@ -787,13 +779,13 @@ private:
     volatile uint32_t   swd_lock;    /* Lock to access swd_buffer & and its header */
     void  *             swd_buffer;  /* Memory allocated for dumping sleep/wake logs */
     uint32_t            swd_flags;   /* Flags defined in IOPMPrivate.h */
-    uint8_t             swd_DebugImageSetup;
+    void *              swd_compressed_buffer;
     void  *             swd_spindump_buffer;
+    thread_t            notifierThread;
+    OSObject            *notifierObject;
 
     IOBufferMemoryDescriptor    *swd_memDesc;
 
-    IOMemoryMap  *      swd_logBufMap; /* Memory with sleep/wake logs from previous boot */
-
     // Wake Event Reporting
     OSArray *               _systemWakeEventsArray;
     bool                    _acceptSystemWakeEvents;
@@ -858,19 +850,12 @@ private:
 
     uint32_t    checkForValidDebugData(const char *fname, vfs_context_t *ctx, 
                                             void *tmpBuf, struct vnode **vp);
+    void        getFailureData(thread_t *thread, char *failureStr, size_t strLen);
+    void        saveFailureData2File();
+    void        tracePhase2String(uint32_t tracePhase, const char **phaseString, const char **description);
     void        sleepWakeDebugMemAlloc( );
     void        sleepWakeDebugSpinDumpMemAlloc( );
-    void        sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap);
-    void        sleepWakeDebugDumpFromFile( );
-    IOMemoryMap *sleepWakeDebugRetrieve();
     errno_t     sleepWakeDebugSaveFile(const char *name, char *buf, int len);
-    errno_t     sleepWakeDebugCopyFile( struct vnode *srcVp,
-                               vfs_context_t srcCtx,
-                               char *tmpBuf, uint64_t tmpBufSize,
-                               uint64_t srcOffset, 
-                               const char *dstFname, 
-                               uint64_t numBytes,
-                               uint32_t crc);
 
 
 #if HIBERNATION
index 2757c8a13de4bc01dd40c7b81b06658260fb843f..159f3eb9398621c1a6257fe0884acbfa548b6edf 100644 (file)
@@ -55,7 +55,7 @@ protected:
     /*! @var reserved
         Reserved for future use.  (Internal use only)  */
     struct ExpansionData { };
-    ExpansionData *reserved;
+    ExpansionData *iortc_reserved __unused;
 
 public:
 
index e9173c6558bf85e48eab0764e60b4e8c210b9009..8ad8d76cd83457851598f2411d5ae56f4c887a00 100644 (file)
@@ -47,6 +47,8 @@ extern void kperf_kernel_configure(char *);
 
 extern "C" void console_suspend();
 extern "C" void console_resume();
+extern "C" void sched_override_recommended_cores_for_sleep(void);
+extern "C" void sched_restore_recommended_cores_after_sleep(void);
 
 typedef kern_return_t (*iocpu_platform_action_t)(void * refcon0, void * refcon1, uint32_t priority,
                                                 void * param1, void * param2, void * param3,
@@ -352,64 +354,63 @@ IORemoveServicePlatformActions(IOService * service)
 kern_return_t PE_cpu_start(cpu_id_t target,
                           vm_offset_t start_paddr, vm_offset_t arg_paddr)
 {
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *targetCPU = (IOCPU *)target;
   
-  if (targetCPU == 0) return KERN_FAILURE;
-  return targetCPU->startCPU(start_paddr, arg_paddr);
+    if (targetCPU == NULL) return KERN_FAILURE;
+    return targetCPU->startCPU(start_paddr, arg_paddr);
 }
 
 void PE_cpu_halt(cpu_id_t target)
 {
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *targetCPU = (IOCPU *)target;
   
-  if (targetCPU) targetCPU->haltCPU();
+    targetCPU->haltCPU();
 }
 
 void PE_cpu_signal(cpu_id_t source, cpu_id_t target)
 {
-  IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source);
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *sourceCPU = (IOCPU *)source;
+    IOCPU *targetCPU = (IOCPU *)target;
   
-  if (sourceCPU && targetCPU) sourceCPU->signalCPU(targetCPU);
+    sourceCPU->signalCPU(targetCPU);
 }
 
 void PE_cpu_signal_deferred(cpu_id_t source, cpu_id_t target)
 {
-  IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source);
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *sourceCPU = (IOCPU *)source;
+    IOCPU *targetCPU = (IOCPU *)target;
 
-  if (sourceCPU && targetCPU) sourceCPU->signalCPUDeferred(targetCPU);
+    sourceCPU->signalCPUDeferred(targetCPU);
 }
 
 void PE_cpu_signal_cancel(cpu_id_t source, cpu_id_t target)
 {
-  IOCPU *sourceCPU = OSDynamicCast(IOCPU, (OSObject *)source);
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *sourceCPU = (IOCPU *)source;
+    IOCPU *targetCPU = (IOCPU *)target;
 
-  if (sourceCPU && targetCPU) sourceCPU->signalCPUCancel(targetCPU);
+    sourceCPU->signalCPUCancel(targetCPU);
 }
 
 void PE_cpu_machine_init(cpu_id_t target, boolean_t bootb)
 {
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
-  
-  if (targetCPU) {
-   targetCPU->initCPU(bootb);
+    IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+
+    if (targetCPU == NULL)
+        panic("%s: invalid target CPU %p", __func__, target);
+
+    targetCPU->initCPU(bootb);
 #if defined(__arm__) || defined(__arm64__)
-   if (!bootb && (targetCPU->getCPUNumber() == (UInt32)master_cpu)) ml_set_is_quiescing(false);
+    if (!bootb && (targetCPU->getCPUNumber() == (UInt32)master_cpu)) ml_set_is_quiescing(false);
 #endif /* defined(__arm__) || defined(__arm64__) */
-  }
 }
 
 void PE_cpu_machine_quiesce(cpu_id_t target)
 {
-  IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
-  if (targetCPU) {
+    IOCPU *targetCPU = (IOCPU*)target;
 #if defined(__arm__) || defined(__arm64__)
-      if (targetCPU->getCPUNumber() == (UInt32)master_cpu) ml_set_is_quiescing(true);
+    if (targetCPU->getCPUNumber() == (UInt32)master_cpu) ml_set_is_quiescing(true);
 #endif /* defined(__arm__) || defined(__arm64__) */
-      targetCPU->quiesceCPU();
-  }
+    targetCPU->quiesceCPU();
 }
 
 #if defined(__arm__) || defined(__arm64__)
@@ -424,15 +425,17 @@ kern_return_t PE_cpu_perfmon_interrupt_install_handler(perfmon_interrupt_handler
 
 void PE_cpu_perfmon_interrupt_enable(cpu_id_t target, boolean_t enable)
 {
-    IOCPU *targetCPU = OSDynamicCast(IOCPU, (OSObject *)target);
+    IOCPU *targetCPU = (IOCPU*)target;
 
-    if (targetCPU) {
-        if (enable) {
-           targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0);
-           targetCPU->getProvider()->enableInterrupt(1);
-       } else {
-           targetCPU->getProvider()->disableInterrupt(1);
-       }
+    if (targetCPU == nullptr) {
+        return;
+    }
+
+    if (enable) {
+        targetCPU->getProvider()->registerInterrupt(1, targetCPU, (IOInterruptAction)pmi_handler, 0);
+        targetCPU->getProvider()->enableInterrupt(1);
+    } else {
+        targetCPU->getProvider()->disableInterrupt(1);
     }
 }
 #endif
@@ -461,6 +464,9 @@ void IOCPUSleepKernel(void)
     IOPMrootDomain  *rootDomain = IOService::getPMRootDomain();
 
     kprintf("IOCPUSleepKernel\n");
+#if defined(__arm64__)
+    sched_override_recommended_cores_for_sleep();
+#endif
 
     IORegistryIterator * iter;
     OSOrderedSet *       all;
@@ -526,10 +532,12 @@ void IOCPUSleepKernel(void)
     console_suspend();
 
     rootDomain->tracePoint( kIOPMTracePointSleepPlatformDriver );
+    rootDomain->stop_watchdog_timer();
 
     // Now sleep the boot CPU.
     bootCPU->haltCPU();
 
+    rootDomain->start_watchdog_timer();
     rootDomain->tracePoint( kIOPMTracePointWakePlatformActions );
 
     console_resume();
@@ -564,6 +572,10 @@ void IOCPUSleepKernel(void)
                 processor_start(target->getMachProcessor());
         }
     }
+
+#if defined(__arm64__)
+    sched_restore_recommended_cores_after_sleep();
+#endif
 }
 
 bool IOCPU::start(IOService *provider)
index 6c1f457679104da63a7eb8fce805eca9f6fa6d31..e69457efeeb2cbac44601421c5c89f1156d476fc 100644 (file)
@@ -162,6 +162,19 @@ IOReturn IOCommandGate::attemptCommand(void *arg0, void *arg1,
     return attemptAction((Action) action, arg0, arg1, arg2, arg3);
 }
 
+
+static IOReturn IOCommandGateActionToBlock(OSObject *owner,
+                              void *arg0, void *arg1,
+                              void *arg2, void *arg3)
+{
+    return ((IOEventSource::ActionBlock) arg0)();
+}
+
+IOReturn IOCommandGate::runActionBlock(ActionBlock action)
+{
+    return (runAction(&IOCommandGateActionToBlock, action));
+}
+
 IOReturn IOCommandGate::runAction(Action inAction,
                                   void *arg0, void *arg1,
                                   void *arg2, void *arg3)
@@ -275,16 +288,20 @@ IOReturn IOCommandGate::attemptAction(Action inAction,
 
 IOReturn IOCommandGate::commandSleep(void *event, UInt32 interruptible)
 {
-    if (!workLoop->inGate())
-        return kIOReturnNotPermitted;
+    if (!workLoop->inGate()) {
+        /* The equivalent of 'msleep' while not holding the mutex is invalid */
+        panic("invalid commandSleep while not holding the gate");
+    }
 
     return sleepGate(event, interruptible);
 }
 
 IOReturn IOCommandGate::commandSleep(void *event, AbsoluteTime deadline, UInt32 interruptible)
 {
-    if (!workLoop->inGate())
-        return kIOReturnNotPermitted;
+    if (!workLoop->inGate()) {
+        /* The equivalent of 'msleep' while not holding the mutex is invalid */
+        panic("invalid commandSleep while not holding the gate");
+    }
 
     return sleepGate(event, deadline, interruptible);
 }
index 261f86b1b3147d3fe17c2597adb135f032bc3410..ee6642b22a2eafeb89cc9e5e37ccb36690b2eccd 100644 (file)
@@ -379,7 +379,8 @@ IODMACommand::setMemoryDescriptor(const IOMemoryDescriptor *mem, bool autoPrepar
        fInternalState->fNewMD = true;
        mem->retain();
        fMemory = mem;
-       if (!fMapper) mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0);
+       fInternalState->fSetActiveNoMapper = (!fMapper);
+       if (fInternalState->fSetActiveNoMapper) mem->dmaCommandOperation(kIOMDSetDMAActive, this, 0);
        if (autoPrepare) {
            err = prepare();
            if (err) {
@@ -399,7 +400,7 @@ IODMACommand::clearMemoryDescriptor(bool autoComplete)
     if (fMemory)
     {
        while (fActive) complete();
-       if (!fMapper) fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0);
+       if (fInternalState->fSetActiveNoMapper) fMemory->dmaCommandOperation(kIOMDSetDMAInactive, this, 0);
        fMemory->release();
        fMemory = 0;
     }
@@ -823,8 +824,6 @@ IODMACommand::prepare(UInt64 offset, UInt64 length, bool flushCache, bool synchr
        state->fLocalMapperAllocValid  = false;
        state->fLocalMapperAllocLength = 0;
 
-       state->fLocalMapper    = (fMapper && (fMapper != IOMapper::gSystem));
-
        state->fSourceAlignMask = fAlignMask;
        if (fMapper)
            state->fSourceAlignMask &= page_mask;
index e6124dfcf874443922eec63c8a8a4bfb46008935..15f68a362bfd92bf206d45ac57e4e6932c563a39 100644 (file)
@@ -239,18 +239,28 @@ Boolean IODataQueue::enqueue(void * data, UInt32 dataSize)
         }
     }
 
-    // Store tail with a release memory barrier
-    __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE);
-
-    // Send notification (via mach message) that data is available.
-
-    if ( ( head == tail )                /* queue was empty prior to enqueue() */
-    ||   ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_ACQUIRE) ) )   /* queue was emptied during enqueue() */
-    {
-        sendDataAvailableNotification();
-    }
+       // Publish the data we just enqueued
+       __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE);
+
+       if (tail != head) {
+               //
+               // The memory barrier below pairs with the one in ::dequeue
+               // so that either our store to the tail cannot be missed by
+               // the next dequeue attempt, or we will observe the dequeuer
+               // making the queue empty.
+               //
+               // Of course, if we already think the queue is empty,
+               // there's no point paying this extra cost.
+               //
+               __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+               head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED);
+       }
 
-    return true;
+       if (tail == head) {
+               // Send notification (via mach message) that data is now available.
+               sendDataAvailableNotification();
+       }
+       return true;
 }
 
 void IODataQueue::setNotificationPort(mach_port_t port)
index 3393993e0a286123748daf7a2e514cafb39f13d9..76c2d5032cf529bb7966bc102b133f3ef9908cdc 100644 (file)
@@ -36,6 +36,7 @@ HISTORY
 
 #include <IOKit/IOEventSource.h>
 #include <IOKit/IOWorkLoop.h>
+#include <libkern/Block.h>
 
 #define super OSObject
 
@@ -162,6 +163,8 @@ bool IOEventSource::init(OSObject *inOwner,
 void IOEventSource::free( void )
 {
     IOStatisticsUnregisterCounter();
+
+       if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock);
        
     if (reserved)
                IODelete(reserved, ExpansionData, 1);
@@ -169,13 +172,41 @@ void IOEventSource::free( void )
     super::free();
 }
 
-IOEventSource::Action IOEventSource::getAction () const { return action; };
+void IOEventSource::setRefcon(void *newrefcon)
+{
+       refcon = newrefcon;
+}
+
+void * IOEventSource::getRefcon() const
+{
+       return refcon;
+}
+
+IOEventSource::Action IOEventSource::getAction() const
+{
+       if (kActionBlock & flags) return NULL;
+       return (action);
+}
+
+IOEventSource::ActionBlock IOEventSource::getActionBlock(ActionBlock) const
+{
+       if (kActionBlock & flags) return actionBlock;
+       return (NULL);
+}
 
 void IOEventSource::setAction(Action inAction)
 {
+       if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock);
     action = inAction;
 }
 
+void IOEventSource::setActionBlock(ActionBlock block)
+{
+       if ((kActionBlock & flags) && actionBlock) Block_release(actionBlock);
+       actionBlock = Block_copy(block);
+       flags |= kActionBlock;
+}
+
 IOEventSource *IOEventSource::getNext() const { return eventChainNext; };
 
 void IOEventSource::setNext(IOEventSource *inNext)
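
Editor's note: the IOEventSource hunks above add a block-based action (setActionBlock()/getActionBlock()) next to the existing function-pointer action, and they follow one ownership rule throughout: the setter Block_copy()s the incoming block, any previously installed block is Block_release()d on replacement, and free() releases whatever is still installed. A minimal sketch of that discipline follows, with hypothetical names and independent of IOEventSource, assuming Block_copy/Block_release from the Blocks runtime (<libkern/Block.h> in the kernel, <Block.h> in userspace).

    // Ownership sketch only; not the IOEventSource implementation.
    #include <Block.h>
    #include <stddef.h>

    typedef void (^WorkBlock)(void);

    static WorkBlock gInstalledBlock;           // the copy we currently own, if any

    static void installBlock(WorkBlock b)
    {
        if (gInstalledBlock) Block_release(gInstalledBlock);  // drop the prior copy first
        gInstalledBlock = Block_copy(b);        // heap-copy: the caller's stack block may go away
    }

    static void teardown(void)
    {
        if (gInstalledBlock) {                  // mirror of the new IOEventSource::free() check
            Block_release(gInstalledBlock);
            gInstalledBlock = NULL;
        }
    }
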
index c6f79e91dda530fac08cb7d1119d2552a8eae18d..f3c61367b3fdc26042992e8b9a655aa619a244f9 100644 (file)
@@ -32,6 +32,7 @@
 #include <IOKit/IOTimeStamp.h>
 #include <IOKit/IOWorkLoop.h>
 #include <IOKit/IOInterruptAccountingPrivate.h>
+#include <libkern/Block.h>
 
 #if IOKITSTATS
 
@@ -123,6 +124,39 @@ IOFilterInterruptEventSource *IOFilterInterruptEventSource
     return me;
 }
 
+
+IOFilterInterruptEventSource *IOFilterInterruptEventSource
+::filterInterruptEventSource(OSObject *inOwner,
+                             IOService *inProvider,
+                             int inIntIndex,
+                             ActionBlock inAction,
+                             FilterBlock inFilterAction)
+{
+    IOFilterInterruptEventSource *me = new IOFilterInterruptEventSource;
+
+    FilterBlock filter = Block_copy(inFilterAction);
+    if (!filter) return 0;
+
+    if (me
+    && !me->init(inOwner, (Action) NULL, (Filter) filter, inProvider, inIntIndex)) {
+        me->release();
+           Block_release(filter);
+        return 0;
+    }
+    me->flags |= kFilterBlock;
+    me->setActionBlock((IOEventSource::ActionBlock) inAction);
+
+    return me;
+}
+
+
+void IOFilterInterruptEventSource::free( void )
+{
+       if ((kFilterBlock & flags) && filterActionBlock) Block_release(filterActionBlock);
+
+    super::free();
+}
+
 void IOFilterInterruptEventSource::signalInterrupt()
 {
        bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false;
@@ -144,11 +178,16 @@ void IOFilterInterruptEventSource::signalInterrupt()
 IOFilterInterruptEventSource::Filter
 IOFilterInterruptEventSource::getFilterAction() const
 {
+       if (kFilterBlock & flags) return NULL;
     return filterAction;
 }
 
-
-
+IOFilterInterruptEventSource::FilterBlock
+IOFilterInterruptEventSource::getFilterActionBlock() const
+{
+       if (kFilterBlock & flags) return filterActionBlock;
+       return (NULL);
+}
 
 void IOFilterInterruptEventSource::normalInterruptOccurred
     (void */*refcon*/, IOService */*prov*/, int /*source*/)
@@ -169,7 +208,8 @@ void IOFilterInterruptEventSource::normalInterruptOccurred
     }
     
     // Call the filter.
-    filterRes = (*filterAction)(owner, this);
+    if (kFilterBlock & flags) filterRes = (filterActionBlock)(this);
+    else                      filterRes = (*filterAction)(owner, this);
 
     if (IOInterruptEventSource::reserved->statistics) {
         if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) {
@@ -210,7 +250,8 @@ void IOFilterInterruptEventSource::disableInterruptOccurred
     }
     
     // Call the filter.
-    filterRes = (*filterAction)(owner, this);
+    if (kFilterBlock & flags) filterRes = (filterActionBlock)(this);
+    else                      filterRes = (*filterAction)(owner, this);
 
     if (IOInterruptEventSource::reserved->statistics) {
         if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingFirstLevelCountIndex)) {
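
Editor's note: the new factory added above takes both the action and the filter as blocks, and the dispatch sites in the same hunks show how they are invoked: the filter block is called with the event source at primary interrupt level and its boolean result decides whether the work-loop action runs. Below is a hedged usage sketch for a hypothetical driver; the provider, work loop, and handler helpers are made up, and only the factory call shape comes from the hunk above.

    // Hypothetical driver code; only the factory signature is taken from this change.
    IOFilterInterruptEventSource * src =
        IOFilterInterruptEventSource::filterInterruptEventSource(
            this,                                   // owner
            provider,                               // IOService supplying the interrupt
            0,                                      // interrupt index
            ^(IOInterruptEventSource *sender, int count) {
                // Secondary level: runs on the work loop once per signalled batch.
                handleInterrupts(count);            // hypothetical helper
            },
            ^bool(IOFilterInterruptEventSource *sender) {
                // Primary level: runs at interrupt context; returning true schedules
                // the action block, false drops the interrupt.
                return hardwareInterruptIsOurs();   // hypothetical helper
            });
    if (src != NULL) workLoop->addEventSource(src); // workLoop is hypothetical too
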
index 6db950f67b8ed212d12ee12048639ca5ca1df8ea..94d5b465ed8cff67902ec98931a9967143f5bf15 100644 (file)
@@ -214,9 +214,6 @@ static OSData *                     gIOHibernateBoot0082Data;
 static OSData *                        gIOHibernateBootNextData;
 static OSObject *              gIOHibernateBootNextSave;
 
-static IOPolledFileIOVars *     gDebugImageFileVars;
-static IOLock             *     gDebugImageLock;
-
 #endif /* defined(__i386__) || defined(__x86_64__) */
 
 static IOLock *                           gFSLock;
@@ -530,19 +527,11 @@ IOHibernateSystemSleep(void)
            }
        }
 
-       // Invalidate the image file
-    if (gDebugImageLock) {
-        IOLockLock(gDebugImageLock);
-        if (gDebugImageFileVars != 0) {
-            IOSetBootImageNVRAM(0);
-            IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0);
-        }
-        IOLockUnlock(gDebugImageLock);
-    }
-
         vars->volumeCryptKeySize = sizeof(vars->volumeCryptKey);
-        err = IOPolledFileOpen(gIOHibernateFilename, setFileSize, 0,
-                               gIOHibernateCurrentHeader, sizeof(gIOHibernateCurrentHeader),
+        err = IOPolledFileOpen(gIOHibernateFilename,
+                                (kIOPolledFileCreate | kIOPolledFileHibernate),
+                                setFileSize, 0,
+                                gIOHibernateCurrentHeader, sizeof(gIOHibernateCurrentHeader),
                                 &vars->fileVars, &nvramData,
                                 &vars->volumeCryptKey[0], &vars->volumeCryptKeySize);
 
@@ -890,75 +879,6 @@ exit:
     return err;
 }
 
-extern "C" boolean_t root_is_CF_drive;
-
-void
-IOOpenDebugDataFile(const char *fname, uint64_t size)
-{
-    IOReturn   err;
-    OSData *   imagePath = NULL;
-    uint64_t   padding;
-
-    if (!gDebugImageLock) {
-        gDebugImageLock = IOLockAlloc();
-    }
-
-    if (root_is_CF_drive) return;
-
-    // Try to get a lock, but don't block for getting lock
-    if (!IOLockTryLock(gDebugImageLock)) {
-        HIBLOG("IOOpenDebugDataFile: Failed to get lock\n");
-        return;
-    }
-
-    if (gDebugImageFileVars ||  !fname || !size) {
-        HIBLOG("IOOpenDebugDataFile: conditions failed\n");
-        goto exit;
-    }
-
-    padding = (PAGE_SIZE*2);  // allocate couple more pages for header and fileextents
-    err = IOPolledFileOpen(fname, size+padding, 32ULL*1024*1024*1024,
-                           NULL, 0,
-                           &gDebugImageFileVars, &imagePath, NULL, 0);
-
-    if ((kIOReturnSuccess == err) && imagePath)
-    {
-        if ((gDebugImageFileVars->fileSize < (size+padding)) ||
-            (gDebugImageFileVars->fileExtents->getLength() > PAGE_SIZE)) {
-            // Can't use the file
-            IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0);
-            HIBLOG("IOOpenDebugDataFile: too many file extents\n");
-            goto exit;
-        }
-
-        // write extents for debug data usage in EFI
-        IOWriteExtentsToFile(gDebugImageFileVars, kIOHibernateHeaderOpenSignature);
-        IOSetBootImageNVRAM(imagePath);
-    }
-
-exit:
-    IOLockUnlock(gDebugImageLock);
-
-    if (imagePath) imagePath->release();
-    return;
-}
-
-void
-IOCloseDebugDataFile()
-{
-    IOSetBootImageNVRAM(0);
-
-    if (gDebugImageLock) {
-        IOLockLock(gDebugImageLock);
-        if (gDebugImageFileVars != 0) {
-            IOPolledFileClose(&gDebugImageFileVars, 0, 0, 0, 0, 0);
-        }
-        IOLockUnlock(gDebugImageLock);
-    }
-
-
-}
-
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 DECLARE_IOHIBERNATEPROGRESSALPHA
@@ -1394,6 +1314,8 @@ IOReturn
 IOHibernateSystemPostWake(bool now)
 {
     gIOHibernateCurrentHeader->signature = kIOHibernateHeaderInvalidSignature;
+    IOSetBootImageNVRAM(0);
+
     IOLockLock(gFSLock);
     if (kFSTrimDelay == gFSState)
     {
@@ -1913,12 +1835,11 @@ hibernate_write_image(void)
         };
 
         bool cpuAES = (0 != (CPUID_FEATURE_AES & cpuid_features()));
-#define _pmap_is_noencrypt(x) (cpuAES ? false : pmap_is_noencrypt((x)))
 
         for (pageType = kWiredEncrypt; pageType >= kUnwiredEncrypt; pageType--)
         {
            if (kUnwiredEncrypt == pageType)
-          {
+           {
                // start unwired image
                if (!vars->hwEncrypt && (kIOHibernateModeEncrypt & gIOHibernateMode))
                {
@@ -1933,27 +1854,36 @@ hibernate_write_image(void)
             }
             for (iterDone = false, ppnum = 0; !iterDone; )
             {
-                count = hibernate_page_list_iterate((kWired & pageType)
-                                                            ? vars->page_list_wired : vars->page_list,
-                                                        &ppnum);
+               if (cpuAES && (pageType == kWiredClear))
+               {
+                   count = 0;
+               }
+               else
+               {
+                   count = hibernate_page_list_iterate((kWired & pageType) ? vars->page_list_wired : vars->page_list,
+                                                       &ppnum);
+               }
 //              kprintf("[%d](%x : %x)\n", pageType, ppnum, count);
                 iterDone = !count;
 
-                if (count && (kWired & pageType) && needEncrypt)
-                {
-                    uint32_t checkIndex;
-                    for (checkIndex = 0;
-                            (checkIndex < count)
-                                && (((kEncrypt & pageType) == 0) == _pmap_is_noencrypt(ppnum + checkIndex));
-                            checkIndex++)
-                    {}
-                    if (!checkIndex)
-                    {
-                        ppnum++;
-                        continue;
-                    }
-                    count = checkIndex;
-                }
+               if (!cpuAES)
+               {
+                   if (count && (kWired & pageType) && needEncrypt)
+                   {
+                       uint32_t checkIndex;
+                       for (checkIndex = 0;
+                               (checkIndex < count)
+                                   && (((kEncrypt & pageType) == 0) == pmap_is_noencrypt(ppnum + checkIndex));
+                               checkIndex++)
+                       {}
+                       if (!checkIndex)
+                       {
+                           ppnum++;
+                           continue;
+                       }
+                       count = checkIndex;
+                   }
+               }
 
                 switch (pageType)
                 {
index a410de27e334983aa3ab6937109753d309754233..0d96bbb8649202557dfcf85df55c03682644a1a0 100644 (file)
@@ -220,6 +220,19 @@ IOInterruptEventSource::interruptEventSource(OSObject *inOwner,
     return me;
 }
 
+IOInterruptEventSource *
+IOInterruptEventSource::interruptEventSource(OSObject *inOwner,
+                                            IOService *inProvider,
+                                            int inIntIndex,
+                                            ActionBlock inAction)
+{
+    IOInterruptEventSource * ies;
+    ies = IOInterruptEventSource::interruptEventSource(inOwner, (Action) NULL, inProvider, inIntIndex);
+    if (ies) ies->setActionBlock((IOEventSource::ActionBlock) inAction);
+
+    return ies;
+}
+
 void IOInterruptEventSource::free()
 {
     if (provider && intIndex >= 0)
@@ -300,6 +313,7 @@ bool IOInterruptEventSource::checkForWork()
     unsigned int cacheProdCount = producerCount;
     int numInts = cacheProdCount - consumerCount;
     IOInterruptEventAction intAction = (IOInterruptEventAction) action;
+    ActionBlock intActionBlock = (ActionBlock) actionBlock;
        bool trace = (gIOKitTrace & kIOTraceIntEventSource) ? true : false;
        
     IOStatisticsCheckForWork();
@@ -322,7 +336,8 @@ bool IOInterruptEventSource::checkForWork()
                }
 
                // Call the handler
-               (*intAction)(owner, this, numInts);
+               if (kActionBlock & flags) (intActionBlock)(this, numInts);
+               else                      (*intAction)(owner, this, numInts);
 
                if (reserved->statistics) {
                        if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingSecondLevelCountIndex)) {
@@ -368,7 +383,8 @@ bool IOInterruptEventSource::checkForWork()
                }
                
                // Call the handler
-               (*intAction)(owner, this, -numInts);
+               if (kActionBlock & flags) (intActionBlock)(this, numInts);
+               else                      (*intAction)(owner, this, numInts);
 
                if (reserved->statistics) {
                        if (IA_GET_STATISTIC_ENABLED(kInterruptAccountingSecondLevelCountIndex)) {
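
Editor's note: IOInterruptEventSource gets the matching non-filtered block factory, and checkForWork() above shows the dispatch: when kActionBlock is set, the block is called with the event source and the coalesced interrupt count instead of the (owner, source, count) function pointer. A short usage sketch follows; everything except the factory call shape is hypothetical.

    // Hypothetical driver code; only the factory signature is taken from this change.
    IOInterruptEventSource * ies =
        IOInterruptEventSource::interruptEventSource(
            this, provider, 0,
            ^(IOInterruptEventSource *sender, int count) {
                for (int i = 0; i < count; i++) {
                    serviceOneInterrupt();          // hypothetical per-interrupt handler
                }
            });
    if (ies != NULL) workLoop->addEventSource(ies);
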
index d1711fcc3b97e6cb52282627524315afe6186dfa..4b0cf6ffc951993c4398c0e3485e5dc0198d6dd3 100644 (file)
@@ -131,10 +131,10 @@ struct IODMACommandInternal
     UInt8  fPrepared;
     UInt8  fDoubleBuffer;
     UInt8  fNewMD;
-    UInt8  fLocalMapper;
     UInt8  fLocalMapperAllocValid;
     UInt8  fIOVMAddrValid;
     UInt8  fForceDoubleBuffer;
+    UInt8  fSetActiveNoMapper;
 
     vm_page_t fCopyPageAlloc;
     vm_page_t fCopyNext;
index 73057d7a98a3a6f8d6c4ab22df8336128469b8f4..385ce056ff6db39b37e385f027c29f78d97d7e48 100644 (file)
@@ -548,8 +548,8 @@ IOKernelAllocateWithPhysicalRestrict(mach_vm_size_t size, mach_vm_address_t maxP
         alignment = 1;
 
     alignMask = alignment - 1;
-    adjustedSize = (2 * size) + sizeofIOLibPageMallocHeader;
-    if (adjustedSize < size) return (0);
+
+    if (os_mul_and_add_overflow(2, size, sizeofIOLibPageMallocHeader, &adjustedSize)) return (0);
 
     contiguous = (contiguous && (adjustedSize > page_size))
                    || (alignment > page_size);
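
Editor's note: the IOKernelAllocateWithPhysicalRestrict hunk replaces the open-coded wrap check on adjustedSize = (2 * size) + header with os_mul_and_add_overflow(), which, as the substitution shows, evaluates 2 * size + header and reports any overflow directly rather than relying on the wrapped result landing below size. The sketch below spells out the same check with the clang checked-arithmetic builtins that libkern's <os/overflow.h> helpers are, to my understanding, built on; types and names are illustrative.

    // Illustrative stand-alone version of the overflow-checked size computation.
    #include <stdbool.h>
    #include <stdint.h>

    // Returns true on overflow; on success *out == 2 * size + headerSize.
    static bool adjusted_alloc_size(uint64_t size, uint64_t headerSize, uint64_t *out)
    {
        uint64_t doubled;
        return __builtin_mul_overflow(size, (uint64_t)2, &doubled) ||
               __builtin_add_overflow(doubled, headerSize, out);
    }
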
index 99999991d92eef222d389b71e963a97670150158..3314a68a1587a0100794adf86658424501746b57 100644 (file)
@@ -244,8 +244,13 @@ IOBigMemoryCursor::outputSegment(PhysicalSegment inSegment,
     IOPhysicalAddress * segment;
 
     segment = &((PhysicalSegment *) inSegments)[inSegmentIndex].location;
+#if IOPhysSize == 64
+    OSWriteBigInt64(segment, 0, inSegment.location);
+    OSWriteBigInt64(segment, sizeof(IOPhysicalAddress), inSegment.length);
+#else
     OSWriteBigInt(segment, 0, inSegment.location);
     OSWriteBigInt(segment, sizeof(IOPhysicalAddress), inSegment.length);
+#endif
 }
 
 IOBigMemoryCursor *
@@ -291,8 +296,13 @@ IOLittleMemoryCursor::outputSegment(PhysicalSegment inSegment,
     IOPhysicalAddress * segment;
 
     segment = &((PhysicalSegment *) inSegments)[inSegmentIndex].location;
+#if IOPhysSize == 64
+    OSWriteLittleInt64(segment, 0, inSegment.location);
+    OSWriteLittleInt64(segment, sizeof(IOPhysicalAddress), inSegment.length);
+#else
     OSWriteLittleInt(segment, 0, inSegment.location);
     OSWriteLittleInt(segment, sizeof(IOPhysicalAddress), inSegment.length);
+#endif
 }
 
 IOLittleMemoryCursor *
index 0d03e32caffc006bfe4846424c0be0167729645c..3c1c4674b3a38e06bac1bd645fd970b99de7e1a1 100644 (file)
@@ -42,6 +42,7 @@
 
 #include <IOKit/IOKitDebug.h>
 #include <libkern/OSDebug.h>
+#include <libkern/OSKextLibPrivate.h>
 
 #include "IOKitKernelInternal.h"
 
@@ -873,7 +874,7 @@ IOGeneralMemoryDescriptor::memoryReferenceMap(
      * kIOMapPrefault is redundant in that case, so don't try to use it for UPL
      * operations.
      */ 
-    if ((reserved != NULL) && (reserved->dp.devicePager) && (_memoryEntries == NULL) && (_wireCount != 0))
+    if ((reserved != NULL) && (reserved->dp.devicePager) && (_wireCount != 0))
         options &= ~kIOMapPrefault;
 
     /*
@@ -1704,6 +1705,7 @@ IOGeneralMemoryDescriptor::initWithOptions(void * buffers,
              && (VM_KERN_MEMORY_NONE == _kernelTag))
             {
                _kernelTag = IOMemoryTag(kernel_map);
+                if (_kernelTag == gIOSurfaceTag) _userTag = VM_MEMORY_IOSURFACE;
             }
 
            if ( (kIOMemoryPersistent & _flags) && !_memRef)
@@ -1962,7 +1964,11 @@ IOByteCount IOMemoryDescriptor::writeBytes
 
     assert(!remaining);
 
+#if defined(__x86_64__)
+    // copypv does not cppvFsnk on intel
+#else
     if (!srcAddr) performOperation(kIOMemoryIncoherentIOFlush, inoffset, length);
+#endif
 
     return length - remaining;
 }
@@ -3642,6 +3648,7 @@ IOReturn IOGeneralMemoryDescriptor::doMap(
      && (mapping->fAddressTask == _task)
      && (mapping->fAddressMap == get_task_map(_task)) 
      && (options & kIOMapAnywhere)
+     && (!(kIOMapUnique & options))
      && (1 == _rangesCount) 
      && (0 == offset)
      && range0Addr 
@@ -4535,9 +4542,8 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping(
     if (!(kIOMap64Bit & options)) panic("IOMemoryDescriptor::makeMapping !64bit");
 #endif /* !__LP64__ */
 
-    IOMemoryDescriptor * mapDesc = 0;
-    IOMemoryMap *       result = 0;
-    OSIterator *        iter;
+    IOMemoryDescriptor *  mapDesc = 0;
+    __block IOMemoryMap * result  = 0;
 
     IOMemoryMap *  mapping = (IOMemoryMap *) __address;
     mach_vm_size_t offset  = mapping->fOffset + __offset;
@@ -4582,20 +4588,17 @@ IOMemoryMap * IOMemoryDescriptor::makeMapping(
        else
        {
            // look for a compatible existing mapping
-           if( (iter = OSCollectionIterator::withCollection(_mappings)))
+           if (_mappings) _mappings->iterateObjects(^(OSObject * object)
            {
-               IOMemoryMap * lookMapping;
-               while ((lookMapping = (IOMemoryMap *) iter->getNextObject()))
+               IOMemoryMap * lookMapping = (IOMemoryMap *) object;
+               if ((result = lookMapping->copyCompatible(mapping)))
                {
-                   if ((result = lookMapping->copyCompatible(mapping)))
-                   {
-                       addMapping(result);
-                       result->setMemoryDescriptor(this, offset);
-                       break;
-                   }
+                   addMapping(result);
+                   result->setMemoryDescriptor(this, offset);
+                   return (true);
                }
-               iter->release();
-           }
+               return (false);
+           });
            if (result || (options & kIOMapReference))
            {
                if (result != mapping)
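
Editor's note: the makeMapping() hunk swaps an explicit OSCollectionIterator loop for OSCollection::iterateObjects() with a block, which is why result becomes a __block variable: the block both writes it and stops the walk by returning true on the first compatible mapping. A small sketch of the same idiom over an arbitrary collection; the array and predicate names are hypothetical.

    // Block-based collection walk, as used above. Returning true stops iteration.
    __block OSObject * found = NULL;

    anArray->iterateObjects(^bool (OSObject * object) {
        if (objectMatches(object)) {      // hypothetical predicate
            found = object;               // writable because 'found' is __block
            return true;                  // stop: first match wins
        }
        return false;                     // keep iterating
    });
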
index 13a5a39a644dbd0e69eecb2e0b30ae5185b8faa4..d54824088321219d850357ea62def6d269df6922 100644 (file)
@@ -394,3 +394,31 @@ IOReturn IOMultiMemoryDescriptor::getPageCounts(IOByteCount * pResidentPageCount
 
     return (err);
 }
+
+uint64_t IOMultiMemoryDescriptor::getPreparationID( void )
+{
+
+    if (!super::getKernelReserved())
+    {
+        return (kIOPreparationIDUnsupported);
+    }
+
+    for (unsigned index = 0; index < _descriptorsCount; index++)
+    {
+        uint64_t preparationID = _descriptors[index]->getPreparationID();
+
+        if ( preparationID == kIOPreparationIDUnsupported )
+        {
+           return (kIOPreparationIDUnsupported);
+        }
+
+        if ( preparationID == kIOPreparationIDUnprepared )
+        {
+            return (kIOPreparationIDUnprepared);
+        }
+    }
+
+    super::setPreparationID();
+
+    return (super::getPreparationID());
+}
index 94d6b75dd77d0ce7bb88afa83cf6ef4874ee2d64..4814258d1ed8e053d0b80a9547ab5dde1a0df6d6 100644 (file)
 #include <kern/debug.h>
 #include <pexpert/pexpert.h>
 
-#if CONFIG_MACF
-extern "C" {
-#include <security/mac.h>
-#include <security/mac_framework.h>
-};
-#endif /* MAC */
-
 #define super IOService
 
 #define kIONVRAMPrivilege      kIOClientPrivilegeAdministrator
@@ -296,11 +289,7 @@ bool IODTNVRAM::serializeProperties(OSSerialize *s) const
       
       variablePerm = getOFVariablePerm(key);
       if ((hasPrivilege || (variablePerm != kOFVariablePermRootOnly)) &&
-         ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )
-#if CONFIG_MACF
-          && (current_task() == kernel_task || mac_iokit_check_nvram_get(kauth_cred_get(), key->getCStringNoCopy()) == 0)
-#endif
-         ) { }
+         ( ! (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) )) { }
       else {
         dict->removeObject(key);
         iter->reset();
@@ -332,12 +321,6 @@ OSObject *IODTNVRAM::copyProperty(const OSSymbol *aKey) const
   }
   if (variablePerm == kOFVariablePermKernelOnly && current_task() != kernel_task) return 0;
 
-#if CONFIG_MACF
-  if (current_task() != kernel_task &&
-      mac_iokit_check_nvram_get(kauth_cred_get(), aKey->getCStringNoCopy()) != 0)
-    return 0;
-#endif
-
   IOLockLock(_ofLock);
   theObject = _ofDict->getObject(aKey);
   if (theObject) theObject->retain();
@@ -384,7 +367,7 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
 {
   bool     result;
   UInt32   propType, propPerm;
-  OSString *tmpString;
+  OSString *tmpString = 0;
   OSObject *propObject = 0, *oldObject;
 
   if (_ofDict == 0) return false;
@@ -399,12 +382,6 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
   // Don't allow change of 'aapl,panic-info'.
   if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) return false;
 
-#if CONFIG_MACF
-  if (current_task() != kernel_task &&
-      mac_iokit_check_nvram_set(kauth_cred_get(), aKey->getCStringNoCopy(), anObject) != 0)
-    return false;
-#endif
-  
   // Make sure the object is of the correct type.
   propType = getOFVariableType(aKey);
   switch (propType) {
@@ -458,6 +435,9 @@ bool IODTNVRAM::setProperty(const OSSymbol *aKey, OSObject *anObject)
   if (oldObject) {
     oldObject->release();
   }
+  if (tmpString) {
+    propObject->release();
+  }
 
   IOLockUnlock(_ofLock);
 
@@ -482,12 +462,6 @@ void IODTNVRAM::removeProperty(const OSSymbol *aKey)
   // Don't allow change of 'aapl,panic-info'.
   if (aKey->isEqualTo(kIODTNVRAMPanicInfoKey)) return;
   
-#if CONFIG_MACF
-  if (current_task() != kernel_task &&
-      mac_iokit_check_nvram_delete(kauth_cred_get(), aKey->getCStringNoCopy()) != 0)
-    return;
-#endif
-
   // If the object exists, remove it from the dictionary.
 
   IOLockLock(_ofLock);
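
Editor's note: the one functional addition in the IODTNVRAM::setProperty() hunks, the release of propObject when tmpString was used, closes a reference-count leak: when an OSString value is converted into a freshly created OSData, the dictionary takes its own retain, so the creation reference has to be dropped. The usual libkern create/insert/release idiom, sketched here with hypothetical variable names.

    // Create with refcount 1, let the dictionary retain it, then drop our reference.
    OSData * converted = OSData::withBytes(str->getCStringNoCopy(),
                                           str->getLength() + 1);   // creation => refcount 1
    if (converted != NULL) {
        dict->setObject(key, converted);   // dictionary adds its own retain
        converted->release();              // drop the creation reference; dict still holds one
    }
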
index b6ee3bce858bb4a4c27c8d84e70eb3eaad6a30a8..40a11e05eb85688a369bb73d1865ce7cae39f4cc 100644 (file)
 #include <sys/fcntl.h>
 #include <os/log.h>
 #include <pexpert/protos.h>
+#include <AssertMacros.h>
 
 #include <sys/time.h>
 #include "IOServicePrivate.h"   // _IOServiceInterestNotifier
 #include "IOServicePMPrivate.h"
 
+#include <libkern/zlib.h>
+
 __BEGIN_DECLS
 #include <mach/shared_region.h>
 #include <kern/clock.h>
@@ -182,6 +185,7 @@ IOReturn OSKextSystemSleepOrWake( UInt32 );
 }
 extern "C" ppnum_t      pmap_find_phys(pmap_t pmap, addr64_t va);
 extern "C" addr64_t     kvtophys(vm_offset_t va);
+extern "C" boolean_t    kdp_has_polled_corefile();
 
 static void idleSleepTimerExpired( thread_call_param_t, thread_call_param_t );
 static void notifySystemShutdown( IOService * root, uint32_t messageType );
@@ -198,7 +202,8 @@ static const OSSymbol *sleepMessagePEFunction   = NULL;
 #define kIORequestWranglerIdleKey   "IORequestIdle"
 #define kDefaultWranglerIdlePeriod  1000 // in milliseconds
 
-#define kIOSleepWakeDebugKey        "Persistent-memory-note"
+#define kIOSleepWakeFailureString   "SleepWakeFailureString"
+#define kIOOSWatchdogFailureString  "OSWatchdogFailureString"
 #define kIOEFIBootRomFailureKey     "wake-failure"
 
 #define kRD_AllPowerSources (kIOPMSupportedOnAC \
@@ -331,8 +336,13 @@ uuid_string_t bootsessionuuid_string;
 
 static uint32_t         gDarkWakeFlags = kDarkWakeFlagHIDTickleNone;
 static uint32_t         gNoIdleFlag = 0;
+static uint32_t         gSwdPanic = 0;
+static uint32_t         gSwdSleepTimeout = 0;
+static uint32_t         gSwdWakeTimeout = 0;
+static uint32_t         gSwdSleepWakeTimeout = 0;
 static PMStatsStruct    gPMStats;
 
+
 #if HIBERNATION
 static IOPMSystemSleepPolicyHandler     gSleepPolicyHandler = 0;
 static IOPMSystemSleepPolicyVariables * gSleepPolicyVars = 0;
@@ -346,11 +356,18 @@ static char gWakeReasonString[128];
 static bool gWakeReasonSysctlRegistered = false;
 static AbsoluteTime gIOLastWakeAbsTime;
 static AbsoluteTime gIOLastSleepAbsTime;
+static AbsoluteTime gUserActiveAbsTime;
+static AbsoluteTime gUserInactiveAbsTime;
 
 #if defined(__i386__) || defined(__x86_64__)
 static bool gSpinDumpBufferFull = false;
 #endif
 
+z_stream          swd_zs;
+vm_offset_t swd_zs_zmem;
+//size_t swd_zs_zsize;
+size_t swd_zs_zoffset;
+
 static unsigned int     gPMHaltBusyCount;
 static unsigned int     gPMHaltIdleCount;
 static int              gPMHaltDepth;
@@ -359,7 +376,6 @@ static IOLock *         gPMHaltLock  = 0;
 static OSArray *        gPMHaltArray = 0;
 static const OSSymbol * gPMHaltClientAcknowledgeKey = 0;
 static bool             gPMQuiesced;
-static uint32_t         gIOPMPCIHostBridgeWakeDelay;
 
 // Constants used as arguments to IOPMrootDomain::informCPUStateChange
 #define kCPUUnknownIndex    9999999
@@ -697,7 +713,6 @@ extern "C" void IOSystemShutdownNotification(int stage)
 #if HIBERNATION
     startTime = mach_absolute_time();
     IOHibernateSystemPostWake(true);
-    gRootDomain->swdDebugTeardown();
     halt_log_enter("IOHibernateSystemPostWake", 0, mach_absolute_time() - startTime);
 #endif
     if (OSCompareAndSwap(0, 1, &gPagingOff))
@@ -789,76 +804,6 @@ void IOPMrootDomain::updateConsoleUsers(void)
 
 //******************************************************************************
 
-static void swdDebugSetupCallout( thread_call_param_t p0, thread_call_param_t p1 )
-{
-    IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0;
-    uint32_t    notifyRef  = (uint32_t)(uintptr_t) p1;
-
-    rootDomain->swdDebugSetup();
-
-    if (p1) {
-        rootDomain->allowPowerChange(notifyRef);
-    }
-    DLOG("swdDebugSetupCallout finish\n");
-}
-
-void IOPMrootDomain::swdDebugSetup( )
-{
-#if    HIBERNATION
-    static int32_t noDebugFile = -1;
-    if (noDebugFile == -1) {
-        if (PEGetCoprocessorVersion() >= kCoprocessorVersion2)
-            noDebugFile = 1;
-        else if (PE_parse_boot_argn("swd_mem_only", &noDebugFile, sizeof(noDebugFile)) == false)
-            noDebugFile = 0;
-    }
-
-   if ((noDebugFile == 1) || (gRootDomain->sleepWakeDebugIsWdogEnabled() == false)) {
-       return;
-   }
-    DLOG("swdDebugSetup state:%d\n", swd_DebugImageSetup);
-    if (swd_DebugImageSetup == FALSE) {
-        swd_DebugImageSetup = TRUE;
-        if (CAP_GAIN(kIOPMSystemCapabilityGraphics) ||
-                (CAP_LOSS(kIOPMSystemCapabilityGraphics))) {
-            IOHibernateSystemPostWake(true);
-            IOCloseDebugDataFile();
-        }
-        IOOpenDebugDataFile(kSleepWakeStackBinFilename, SWD_BUF_SIZE);
-    }
-#endif
-
-
-}
-
-static void swdDebugTeardownCallout( thread_call_param_t p0, thread_call_param_t p1 )
-{
-    IOPMrootDomain * rootDomain = (IOPMrootDomain *) p0;
-    uint32_t    notifyRef  = (uint32_t)(uintptr_t) p1;
-
-    rootDomain->swdDebugTeardown();
-    if (p1) {
-        rootDomain->allowPowerChange(notifyRef);
-    }
-    DLOG("swdDebugTeardownCallout finish\n");
-}
-
-void IOPMrootDomain::swdDebugTeardown( )
-{
-
-#if    HIBERNATION
-    DLOG("swdDebugTeardown state:%d\n", swd_DebugImageSetup);
-    if (swd_DebugImageSetup == TRUE) {
-        swd_DebugImageSetup = FALSE;
-        IOCloseDebugDataFile();
-    }
-#endif
-
-
-}
-//******************************************************************************
-
-
 static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 )
 {
     IOService * rootDomain = (IOService *) p0;
@@ -875,12 +820,10 @@ static void disk_sync_callout( thread_call_param_t p0, thread_call_param_t p1 )
         // Block sleep until trim issued on previous wake path is completed.
         IOHibernateSystemPostWake(true);
 #endif
-        swdDebugSetupCallout(p0, NULL);
     }
 #if HIBERNATION
     else
     {
-        swdDebugTeardownCallout(p0, NULL);
         IOHibernateSystemPostWake(false);
 
         if (gRootDomain)
@@ -943,6 +886,8 @@ static SYSCTL_PROC(_kern, OID_AUTO, waketime,
 
 SYSCTL_QUAD(_kern, OID_AUTO, wake_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastWakeAbsTime, "");
 SYSCTL_QUAD(_kern, OID_AUTO, sleep_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gIOLastSleepAbsTime, "");
+SYSCTL_QUAD(_kern, OID_AUTO, useractive_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gUserActiveAbsTime, "");
+SYSCTL_QUAD(_kern, OID_AUTO, userinactive_abs_time, CTLFLAG_RD|CTLFLAG_LOCKED, &gUserInactiveAbsTime, "");
 
 static int
 sysctl_willshutdown
@@ -1081,6 +1026,11 @@ SYSCTL_PROC(_hw, OID_AUTO, targettype,
 
 static SYSCTL_INT(_debug, OID_AUTO, darkwake, CTLFLAG_RW, &gDarkWakeFlags, 0, "");
 static SYSCTL_INT(_debug, OID_AUTO, noidle, CTLFLAG_RW, &gNoIdleFlag, 0, "");
+static SYSCTL_INT(_debug, OID_AUTO, swd_sleep_timeout, CTLFLAG_RW, &gSwdSleepTimeout, 0, "");
+static SYSCTL_INT(_debug, OID_AUTO, swd_wake_timeout, CTLFLAG_RW, &gSwdWakeTimeout, 0, "");
+static SYSCTL_INT(_debug, OID_AUTO, swd_timeout, CTLFLAG_RW, &gSwdSleepWakeTimeout, 0, "");
+static SYSCTL_INT(_debug, OID_AUTO, swd_panic, CTLFLAG_RW, &gSwdPanic, 0, "");
+
 
 static const OSSymbol * gIOPMSettingAutoWakeCalendarKey;
 static const OSSymbol * gIOPMSettingAutoWakeSecondsKey;
@@ -1103,9 +1053,6 @@ bool IOPMrootDomain::start( IOService * nub )
     OSIterator      *psIterator;
     OSDictionary    *tmpDict;
     IORootParent *   patriarch;
-#if defined(__i386__) || defined(__x86_64__)
-    IONotifier   *   notifier;
-#endif
 
     super::start(nub);
 
@@ -1151,9 +1098,11 @@ bool IOPMrootDomain::start( IOService * nub )
 
     PE_parse_boot_argn("darkwake", &gDarkWakeFlags, sizeof(gDarkWakeFlags));
     PE_parse_boot_argn("noidle", &gNoIdleFlag, sizeof(gNoIdleFlag));
+    PE_parse_boot_argn("swd_sleeptimeout", &gSwdSleepTimeout, sizeof(gSwdSleepTimeout));
+    PE_parse_boot_argn("swd_waketimeout", &gSwdWakeTimeout, sizeof(gSwdWakeTimeout));
+    PE_parse_boot_argn("swd_timeout", &gSwdSleepWakeTimeout, sizeof(gSwdSleepWakeTimeout));
     PE_parse_boot_argn("haltmspanic", &gHaltTimeMaxPanic, sizeof(gHaltTimeMaxPanic));
     PE_parse_boot_argn("haltmslog", &gHaltTimeMaxLog, sizeof(gHaltTimeMaxLog));
-       PE_parse_boot_argn("pcihostbridge_wake_delay", &gIOPMPCIHostBridgeWakeDelay, sizeof(gIOPMPCIHostBridgeWakeDelay));
 
     queue_init(&aggressivesQueue);
     aggressivesThreadCall = thread_call_allocate(handleAggressivesFunction, this);
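
Editor's note: the additions above expose the sleep/wake watchdog as four debug.* sysctls (debug.swd_sleep_timeout, debug.swd_wake_timeout, debug.swd_timeout, debug.swd_panic) plus matching swd_* boot-args; getWatchdogTimeout(), added later in this change, falls back to the built-in WATCHDOG_SLEEP_TIMEOUT / WATCHDOG_WAKE_TIMEOUT values when they are zero. Assuming the new OIDs are registered the same way as the existing debug.darkwake and debug.noidle knobs, they can be read or tuned from userspace with sysctlbyname(); the sketch below is illustrative only, and writing normally requires root.

    // Userspace sketch, not part of the kernel change.
    #include <sys/sysctl.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t timeout = 0;
        size_t len = sizeof(timeout);

        // 0 means "use the built-in default" per getWatchdogTimeout().
        if (sysctlbyname("debug.swd_timeout", &timeout, &len, NULL, 0) == 0) {
            printf("sleep/wake watchdog override: %u seconds\n", timeout);
        }

        uint32_t newval = 60;   // illustrative value, in seconds
        (void)sysctlbyname("debug.swd_timeout", NULL, NULL, &newval, sizeof(newval));
        return 0;
    }
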
@@ -1173,12 +1122,6 @@ bool IOPMrootDomain::start( IOService * nub )
     diskSyncCalloutEntry = thread_call_allocate(
                         &disk_sync_callout,
                         (thread_call_param_t) this);
-    swdDebugSetupEntry = thread_call_allocate(
-                        &swdDebugSetupCallout,
-                        (thread_call_param_t) this);
-    swdDebugTearDownEntry = thread_call_allocate(
-                        &swdDebugTeardownCallout,
-                        (thread_call_param_t) this);
     updateConsoleUsersEntry = thread_call_allocate(
                         &updateConsoleUsersCallout,
                         (thread_call_param_t) this);
@@ -1215,6 +1158,7 @@ bool IOPMrootDomain::start( IOService * nub )
     // Will never transition to user inactive w/o wrangler.
     fullWakeReason = kFullWakeReasonLocalUser;
     userIsActive = userWasActive = true;
+    clock_get_uptime(&gUserActiveAbsTime);
     setProperty(gIOPMUserIsActiveKey, kOSBooleanTrue);
 
     // Set the default system capabilities at boot.
@@ -1302,15 +1246,6 @@ bool IOPMrootDomain::start( IOService * nub )
 
 #if defined(__i386__) || defined(__x86_64__)
 
-    if ((tmpDict = serviceMatching("IODTNVRAM")))
-    {
-        notifier = addMatchingNotification(
-                gIOFirstPublishNotification, tmpDict,
-                (IOServiceMatchingNotificationHandler) &IONVRAMMatchPublished,
-                this, 0);
-        tmpDict->release();
-    }
-
     wranglerIdleSettings = NULL;
     OSNumber * wranglerIdlePeriod = NULL;
     wranglerIdleSettings = OSDictionary::withCapacity(1);
@@ -2324,6 +2259,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
     DLOG("PowerChangeDone: %u->%u\n",
         (uint32_t) previousPowerState, (uint32_t) getPowerState());
 
+    notifierThread = current_thread();
     switch ( getPowerState() )
     {
         case SLEEP_STATE: {
@@ -2376,7 +2312,6 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
                 }
             }
             assertOnWakeSecs = 0;
-            ((IOService *)this)->stop_watchdog_timer(); //14456299
             lowBatteryCondition = false;
 
 #if DEVELOPMENT || DEBUG
@@ -2403,7 +2338,6 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
             IOLog("gIOLastWakeAbsTime: %lld\n", gIOLastWakeAbsTime);
             _highestCapability = 0;
 
-            ((IOService *)this)->start_watchdog_timer(); //14456299
 #if HIBERNATION
             IOHibernateSystemWake();
 #endif
@@ -2611,6 +2545,7 @@ void IOPMrootDomain::powerChangeDone( unsigned long previousPowerState )
 #endif
 
     }
+    notifierThread = NULL;
 }
 
 //******************************************************************************
@@ -3082,19 +3017,7 @@ IOReturn IOPMrootDomain::sysPowerDownHandler(
     if (!gRootDomain)
         return kIOReturnUnsupported;
 
-    if (messageType == kIOMessageSystemWillSleep)
-    {
-#if HIBERNATION
-        IOPowerStateChangeNotification *notify =
-            (IOPowerStateChangeNotification *)messageArgs;
-
-        notify->returnValue = 30 * 1000 * 1000;
-        thread_call_enter1(
-                           gRootDomain->swdDebugSetupEntry,
-                           (thread_call_param_t)(uintptr_t) notify->powerRef);
-#endif
-    }
-    else if (messageType == kIOMessageSystemCapabilityChange)
+    if (messageType == kIOMessageSystemCapabilityChange)
     {
         IOPMSystemCapabilityChangeParameters * params =
             (IOPMSystemCapabilityChangeParameters *) messageArgs;
@@ -3161,25 +3084,6 @@ IOReturn IOPMrootDomain::sysPowerDownHandler(
                 gRootDomain->diskSyncCalloutEntry,
                 (thread_call_param_t)(uintptr_t) params->notifyRef);
         }
-        else if (CAP_WILL_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) ||
-                 CAP_WILL_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics))
-        {
-            // WillChange for Full wake -> Darkwake
-           params->maxWaitForReply = 30 * 1000 * 1000;
-           thread_call_enter1(
-                              gRootDomain->swdDebugSetupEntry,
-                              (thread_call_param_t)(uintptr_t) params->notifyRef);
-        }
-        else if (CAP_DID_CHANGE_TO_OFF(params, kIOPMSystemCapabilityGraphics) ||
-                 CAP_DID_CHANGE_TO_ON(params, kIOPMSystemCapabilityGraphics))
-        {
-            // DidChange for Full wake -> Darkwake
-           params->maxWaitForReply = 30 * 1000 * 1000;
-           thread_call_enter1(
-                              gRootDomain->swdDebugTearDownEntry,
-                              (thread_call_param_t)(uintptr_t) params->notifyRef);
-
-        }
 #endif
         ret = kIOReturnSuccess;
     }
@@ -3402,6 +3306,7 @@ void IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState
 
     if (SLEEP_STATE == newPowerState)
     {
+        notifierThread = current_thread();
         if (!tasksSuspended)
         {
            AbsoluteTime deadline;
@@ -3431,6 +3336,7 @@ void IOPMrootDomain::willNotifyPowerChildren( IOPMPowerStateIndex newPowerState
             if (secs) secs->release();
         }
 
+        notifierThread = NULL;
     }
 }
 
@@ -4524,7 +4430,8 @@ void IOPMrootDomain::evaluateSystemSleepPolicyFinal( void )
     {
         if ((kIOPMSleepTypeStandby == params.sleepType)
          && gIOHibernateStandbyDisabled && gSleepPolicyVars
-         && (!(kIOPMSleepFactorStandbyForced & gSleepPolicyVars->sleepFactors)))
+         && (!((kIOPMSleepFactorStandbyForced|kIOPMSleepFactorAutoPowerOffForced|kIOPMSleepFactorHibernateForced)
+                 & gSleepPolicyVars->sleepFactors)))
         {
             standbyNixed = true;
             wakeNow = true;
@@ -4925,8 +4832,6 @@ IOReturn IOPMrootDomain::restartSystem( void )
 // MARK: -
 // MARK: System Capability
 
-SYSCTL_UINT(_kern, OID_AUTO, pcihostbridge_wake_delay, CTLFLAG_RD | CTLFLAG_KERN | CTLFLAG_LOCKED, (uint32_t *)&gIOPMPCIHostBridgeWakeDelay, 0, "");
-
 //******************************************************************************
 // tagPowerPlaneService
 //
@@ -4997,7 +4902,7 @@ void IOPMrootDomain::tagPowerPlaneService(
 
         while (child != this)
         {
-            if ((gIOPMPCIHostBridgeWakeDelay ? (parent == pciHostBridgeDriver) : (parent->metaCast("IOPCIDevice") != NULL)) ||
+            if (parent->metaCast("IOPCIDevice") ||
                 (parent == this))
             {
                 if (OSDynamicCast(IOPowerConnection, child))
@@ -6262,50 +6167,6 @@ bool IOPMrootDomain::displayWranglerMatchPublished(
     return true;
 }
 
-#if defined(__i386__) || defined(__x86_64__)
-
-bool IOPMrootDomain::IONVRAMMatchPublished(
-    void * target,
-    void * refCon,
-    IOService * newService,
-    IONotifier * notifier)
-{
-    unsigned int     len = 0;
-    IOPMrootDomain *rd = (IOPMrootDomain *)target;
-    OSNumber    *statusCode = NULL;
-
-    if (PEReadNVRAMProperty(kIOSleepWakeDebugKey, NULL, &len))
-    {
-        statusCode = OSDynamicCast(OSNumber, rd->getProperty(kIOPMSleepWakeFailureCodeKey));
-        if (statusCode != NULL) {
-            if (statusCode->unsigned64BitValue() != 0) {
-                rd->swd_flags |= SWD_BOOT_BY_SW_WDOG;
-                MSG("System was rebooted due to Sleep/Wake failure\n");
-            }
-            else {
-                rd->swd_flags |= SWD_BOOT_BY_OSX_WDOG;
-                MSG("System was non-responsive and was rebooted by watchdog\n");
-            }
-        }
-
-        rd->swd_logBufMap = rd->sleepWakeDebugRetrieve();
-    }
-    if (notifier) notifier->remove();
-    return true;
-}
-
-#else
-bool IOPMrootDomain::IONVRAMMatchPublished(
-    void * target,
-    void * refCon,
-    IOService * newService,
-    IONotifier * notifier __unused)
-{
-    return false;
-}
-
-#endif
-
 //******************************************************************************
 // reportUserInput
 //
@@ -6663,19 +6524,9 @@ void IOPMrootDomain::dispatchPowerEvent(
                     break;
                 }
 
-                if (swd_flags & SWD_VALID_LOGS) {
-                    if (swd_flags & SWD_LOGS_IN_MEM) {
-                        sleepWakeDebugDumpFromMem(swd_logBufMap);
-                        swd_logBufMap->release();
-                        swd_logBufMap = 0;
-                    }
-                    else if (swd_flags & SWD_LOGS_IN_FILE) 
-                        sleepWakeDebugDumpFromFile();
-                }
-                else if (swd_flags & (SWD_BOOT_BY_SW_WDOG|SWD_BOOT_BY_OSX_WDOG)) {
-                    // If logs are invalid, write the failure code
-                    sleepWakeDebugDumpFromMem(NULL);
-                }
+                sleepWakeDebugMemAlloc();
+                saveFailureData2File();
+
                 // If lid is closed, re-send lid closed notification
                 // now that booting is complete.
                 if ( clamshellClosed )
@@ -7004,20 +6855,25 @@ void IOPMrootDomain::handlePowerNotification( UInt32 msg )
      */
     if (msg & kIOPMClamshellClosed)
     {
-        DLOG("Clamshell closed\n");
-        // Received clamshel open message from clamshell controlling driver
-        // Update our internal state and tell general interest clients
-        clamshellClosed = true;
-        clamshellExists = true;
+        if (clamshellClosed && clamshellExists) {
+            DLOG("Ignoring redundant Clamshell close event\n");
+        }
+        else {
+            DLOG("Clamshell closed\n");
+            // Received clamshel open message from clamshell controlling driver
+            // Update our internal state and tell general interest clients
+            clamshellClosed = true;
+            clamshellExists = true;
 
-        // Tell PMCPU
-        informCPUStateChange(kInformLid, 1);
+            // Tell PMCPU
+            informCPUStateChange(kInformLid, 1);
 
-        // Tell general interest clients
-        sendClientClamshellNotification();
+            // Tell general interest clients
+            sendClientClamshellNotification();
 
-        // And set eval_clamshell = so we can attempt
-        eval_clamshell = true;
+            // And set eval_clamshell = so we can attempt
+            eval_clamshell = true;
+        }
     }
 
     /*
@@ -7190,6 +7046,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
             {
                 userIsActive = true;
                 userWasActive = true;
+                clock_get_uptime(&gUserActiveAbsTime);
 
                 // Stay awake after dropping demand for display power on
                 if (kFullWakeReasonDisplayOn == fullWakeReason) {
@@ -7209,6 +7066,7 @@ void IOPMrootDomain::evaluatePolicy( int stimulus, uint32_t arg )
             DLOG("evaluatePolicy( %d, 0x%x )\n", stimulus, arg);
             if (userIsActive)
             {
+                clock_get_uptime(&gUserInactiveAbsTime);
                 userIsActive = false;
                 clock_get_uptime(&userBecameInactiveTime);
                 flags.bit.userBecameInactive = true;
@@ -7987,15 +7845,20 @@ void IOPMrootDomain::tracePoint( uint8_t point )
     pmTracer->tracePoint(point);
 }
 
-void IOPMrootDomain::traceDetail(OSObject *object)
+void IOPMrootDomain::traceDetail(OSObject *object, bool start)
 {
-    IOPMServiceInterestNotifier *notifier = OSDynamicCast(IOPMServiceInterestNotifier, object);
+    IOPMServiceInterestNotifier *notifier;
+
+    if (systemBooting) {
+        return;
+    }
+
+    notifier = OSDynamicCast(IOPMServiceInterestNotifier, object);
     if (!notifier) {
-        DLOG("Unknown notifier\n");
         return;
     }
 
-    if (!systemBooting) {
+    if (start) {
         pmTracer->traceDetail( notifier->uuid0 >> 32 );
         kdebugTrace(kPMLogSleepWakeMessage, pmTracer->getTracePhase(), notifier->msgType, notifier->uuid0, notifier->uuid1);
         if (notifier->identifier) {
@@ -8005,8 +7868,15 @@ void IOPMrootDomain::traceDetail(OSObject *object)
         else {
             DLOG("trace point 0x%02x msg 0x%x\n", pmTracer->getTracePhase(), notifier->msgType);
         }
+        notifierThread = current_thread();
+        notifierObject = notifier;
+        notifier->retain();
+    }
+    else {
+        notifierThread = NULL;
+        notifierObject = NULL;
+        notifier->release();
     }
-
 }
 
 
@@ -9762,13 +9632,24 @@ OSObject * IORootParent::copyProperty( const char * aKey) const
     return (IOService::copyProperty(aKey));
 }
 
+uint32_t IOPMrootDomain::getWatchdogTimeout()
+{
+    if (gSwdSleepWakeTimeout) {
+        gSwdSleepTimeout = gSwdWakeTimeout = gSwdSleepWakeTimeout;
+    }
+    if ((pmTracer->getTracePhase() < kIOPMTracePointSystemSleep) ||
+            (pmTracer->getTracePhase() == kIOPMTracePointDarkWakeEntry)) {
+        return gSwdSleepTimeout ? gSwdSleepTimeout : WATCHDOG_SLEEP_TIMEOUT;
+    }
+    else {
+        return gSwdWakeTimeout ? gSwdWakeTimeout : WATCHDOG_WAKE_TIMEOUT;
+    }
+}
+
 
 #if defined(__i386__) || defined(__x86_64__)
 IOReturn IOPMrootDomain::restartWithStackshot()
 {
-    if ((swd_flags & SWD_WDOG_ENABLED) == 0)
-        return kIOReturnError;
-
     takeStackshot(true, true, false);
 
     return kIOReturnSuccess;
@@ -9779,200 +9660,688 @@ void IOPMrootDomain::sleepWakeDebugTrig(bool wdogTrigger)
     takeStackshot(wdogTrigger, false, false);
 }
 
-void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump)
+void IOPMrootDomain::tracePhase2String(uint32_t tracePhase, const char **phaseString, const char **description)
 {
-   swd_hdr *         hdr = NULL;
-   addr64_t          data[3];
-   int               wdog_panic = -1;
-   int               stress_rack = -1;
-   int               cnt = 0;
-   pid_t             pid = 0;
-   kern_return_t     kr = KERN_SUCCESS;
-   uint32_t          flags;
+    switch (tracePhase) {
 
-   char *            dstAddr;
-   uint32_t          size;
-   uint32_t          bytesRemaining;
-   unsigned          bytesWritten = 0;
-   unsigned          totalBytes = 0;
-   unsigned int      len;
-   OSString *        UUIDstring = NULL;
-   uint64_t          code;
-   IOMemoryMap *     logBufMap = NULL;
+        case kIOPMTracePointSleepStarted:
+            *phaseString = "kIOPMTracePointSleepStarted";
+            *description = "starting sleep";
+            break;
 
+        case kIOPMTracePointSleepApplications:
+            *phaseString = "kIOPMTracePointSleepApplications";
+            *description = "notifying applications";
+            break;
 
-   uint32_t          bufSize;
-   uint32_t          initialStackSize;
+        case kIOPMTracePointSleepPriorityClients:
+            *phaseString = "kIOPMTracePointSleepPriorityClients";
+            *description = "notifying clients about upcoming system capability changes";
+            break;
 
-   if (isSpinDump) {
-       if (_systemTransitionType != kSystemTransitionSleep &&
-           _systemTransitionType != kSystemTransitionWake)
-           return;
-   } else {
-       if ( kIOSleepWakeWdogOff & gIOKitDebug )
-           return;
-   }
+        case kIOPMTracePointSleepWillChangeInterests:
+            *phaseString = "kIOPMTracePointSleepWillChangeInterests";
+            *description = "creating hibernation file or while calling rootDomain's clients about upcoming rootDomain's state changes";
+            break;
 
-   if (wdogTrigger) {
-       PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic));
-       PE_parse_boot_argn("stress-rack", &stress_rack, sizeof(stress_rack));
-       if ((wdog_panic == 1) || (stress_rack == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) {
-           // If boot-arg specifies to panic then panic.
-           panic("Sleep/Wake hang detected");
-           return;
-       }
-       else if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
-           // If current boot is due to this watch dog trigger restart in previous boot,
-           // then don't trigger again until at least 1 successful sleep & wake.
-           if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) {
-               IOLog("Shutting down due to repeated Sleep/Wake failures\n");
-               if (!tasksSuspended) {
-                   tasksSuspended = TRUE;
-                   tasks_system_suspend(true);
-               }
-               PEHaltRestart(kPEHaltCPU);
-               return;
-           }
-       }
+        case kIOPMTracePointSleepPowerPlaneDrivers:
+            *phaseString = "kIOPMTracePointSleepPowerPlaneDrivers";
+            *description = "calling power state change callbacks";
+            break;
 
-   }
+        case kIOPMTracePointSleepDidChangeInterests:
+            *phaseString = "kIOPMTracePointSleepDidChangeInterests";
+            *description = "calling rootDomain's clients about rootDomain's state changes";
+            break;
 
-   if (isSpinDump) {
-      if (gSpinDumpBufferFull)
-         return;
-      if (swd_spindump_buffer == NULL) {
-         sleepWakeDebugSpinDumpMemAlloc();
-         if (swd_spindump_buffer == NULL) return;
-      }
+        case kIOPMTracePointSleepCapabilityClients:
+            *phaseString = "kIOPMTracePointSleepCapabilityClients";
+            *description = "notifying clients about current system capabilities";
+            break;
 
-      bufSize = SWD_SPINDUMP_SIZE;
-      initialStackSize = SWD_INITIAL_SPINDUMP_SIZE;
-   } else {
-      if (sleepWakeDebugIsWdogEnabled() == false)
-         return;
+        case kIOPMTracePointSleepPlatformActions:
+            *phaseString = "kIOPMTracePointSleepPlatformActions";
+            *description = "calling Quiesce/Sleep action callbacks";
+            break;
 
-      if (swd_buffer == NULL) {
-         sleepWakeDebugMemAlloc();
-         if (swd_buffer == NULL) return;
-      }
+        case kIOPMTracePointSleepCPUs:
+            *phaseString = "kIOPMTracePointSleepCPUs";
+            *description = "halting all non-boot CPUs";
+            break;
 
-      bufSize = SWD_BUF_SIZE;
-      initialStackSize = SWD_INITIAL_STACK_SIZE;
-   }
+        case kIOPMTracePointSleepPlatformDriver:
+            *phaseString = "kIOPMTracePointSleepPlatformDriver";
+            *description = "executing platform specific code";
+            break;
 
-   if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
-       return;
+        case kIOPMTracePointHibernate:
+            *phaseString = "kIOPMTracePointHibernate";
+            *description = "writing the hibernation image";
+            break;
 
-   if (isSpinDump) {
-      hdr = (swd_hdr *)swd_spindump_buffer;
-   }
-   else {
-      hdr = (swd_hdr *)swd_buffer;
-   }
+        case kIOPMTracePointSystemSleep:
+            *phaseString = "kIOPMTracePointSystemSleep";
+            *description = "in EFI/Bootrom after last point of entry to sleep";
+            break;
 
-   memset(hdr->UUID, 0x20, sizeof(hdr->UUID));
-   if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL ) {
+        case kIOPMTracePointWakePlatformDriver:
+            *phaseString = "kIOPMTracePointWakePlatformDriver";
+            *description = "executing platform specific code";
+            break;
 
-      if (wdogTrigger || (!UUIDstring->isEqualTo(hdr->UUID))) {
-         const char *str = UUIDstring->getCStringNoCopy();
-         snprintf(hdr->UUID, sizeof(hdr->UUID), "UUID: %s", str);
-      }
-      else {
-         DLOG("Data for current UUID already exists\n");
-         goto exit;
-      }
-   }
 
-   dstAddr = (char*)hdr + hdr->spindump_offset;
-   bytesRemaining = bufSize - hdr->spindump_offset;
+        case kIOPMTracePointWakePlatformActions:
+            *phaseString = "kIOPMTracePointWakePlatformActions";
+            *description = "calling Wake action callbacks";
+            break;
 
-   /* if AppleOSXWatchdog triggered the stackshot, set the flag in the heaer */
-   hdr->is_osx_watchdog = isOSXWatchdog;
+        case kIOPMTracePointWakeCPUs:
+            *phaseString = "kIOPMTracePointWakeCPUs";
+            *description = "starting non-boot CPUs";
+            break;
 
-   DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining);
+        case kIOPMTracePointWakeWillPowerOnClients:
+            *phaseString = "kIOPMTracePointWakeWillPowerOnClients";
+            *description = "sending kIOMessageSystemWillPowerOn message to kernel and userspace clients";
+            break;
 
-   flags = STACKSHOT_KCDATA_FORMAT|STACKSHOT_NO_IO_STATS|STACKSHOT_SAVE_KEXT_LOADINFO;
-   while (kr == KERN_SUCCESS) {
+        case kIOPMTracePointWakeWillChangeInterests:
+            *phaseString = "kIOPMTracePointWakeWillChangeInterests";
+            *description = "calling rootDomain's clients about upcoming rootDomain's state changes";
+            break;
 
-       if (cnt == 0) {
-           /*
-            * Take stackshot of all process on first sample. Size is restricted
-            * to SWD_INITIAL_STACK_SIZE
-            */
-           pid = -1;
-           size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining;
-           flags |= STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY;
-       }
-       else {
-           /* Take sample of kernel threads only */
-           pid = 0;
-           size = bytesRemaining;
-       }
+        case kIOPMTracePointWakeDidChangeInterests:
+            *phaseString = "kIOPMTracePointWakeDidChangeInterests";
+            *description = "calling rootDomain's clients about completed rootDomain's state changes";
+            break;
 
-       kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten);
-       DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n",
-               kr, pid, size, flags, bytesWritten);
-       if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) {
-           if (pid == -1) {
-               // Insufficient buffer when trying to take stackshot of user & kernel space threads.
-               // Continue to take stackshot of just kernel threads
-               ++cnt;
-               kr = KERN_SUCCESS;
-               continue;
-           }
-           else if (totalBytes == 0) {
-               MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags);
-           }
-       }
+        case kIOPMTracePointWakePowerPlaneDrivers:
+            *phaseString = "kIOPMTracePointWakePowerPlaneDrivers";
+            *description = "calling power state change callbacks";
+            break;
 
-       dstAddr += bytesWritten;
-       totalBytes += bytesWritten;
-       bytesRemaining -= bytesWritten;
+        case kIOPMTracePointWakeCapabilityClients:
+            *phaseString = "kIOPMTracePointWakeCapabilityClients";
+            *description = "informing clients about current system capabilities";
+            break;
 
-       if (++cnt == 10) {
-           break;
-       }
-       IOSleep(10); // 10 ms
-   }
+        case kIOPMTracePointWakeApplications:
+            *phaseString = "kIOPMTracePointWakeApplications";
+            *description = "sending asynchronous kIOMessageSystemHasPoweredOn message to userspace clients";
+            break;
 
-   hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset);
+        case kIOPMTracePointDarkWakeEntry:
+            *phaseString = "kIOPMTracePointDarkWakeEntry";
+            *description = "entering darkwake on way to sleep";
+            break;
 
+        case kIOPMTracePointDarkWakeExit:
+            *phaseString = "kIOPMTracePointDarkWakeExit";
+            *description = "entering fullwake from darkwake";
+            break;
 
-   memset(hdr->spindump_status, 0x20, sizeof(hdr->spindump_status));
-   code = pmTracer->getPMStatusCode();
-   memset(hdr->PMStatusCode, 0x20, sizeof(hdr->PMStatusCode));
-   snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: %08x %08x",
-           (uint32_t)((code >> 32) & 0xffffffff), (uint32_t)(code & 0xffffffff));
-   memset(hdr->reason, 0x20, sizeof(hdr->reason));
-   if (isSpinDump) {
-      snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: PSC Delay\n\n");
-      gRootDomain->swd_lock = 0;
-      gSpinDumpBufferFull = true;
-      return;
+        default:
+            *phaseString = NULL;
+            *description = NULL;
+    }
+
+}
+
+void IOPMrootDomain::saveFailureData2File( )
+{
+    unsigned int len = 0;
+    char  failureStr[512];
+    errno_t error;
+    char *outbuf;
+    bool oswatchdog = false;
+
+    if (!PEReadNVRAMProperty(kIOSleepWakeFailureString, NULL, &len) &&
+            !PEReadNVRAMProperty(kIOOSWatchdogFailureString, NULL, &len) ) {
+        DLOG("No SleepWake failure or OSWatchdog failure string to read\n");
+        return;
+    }
+
+    if (len == 0) {
+        DLOG("Ignoring zero byte SleepWake failure string\n");
+        goto exit;
+    }
+
+    if (len > sizeof(failureStr)) {
+        len = sizeof(failureStr);
+    }
+    failureStr[0] = 0;
+    if (PEReadNVRAMProperty(kIOSleepWakeFailureString, failureStr, &len) == false) {
+        if (PEReadNVRAMProperty(kIOOSWatchdogFailureString, failureStr, &len)) {
+            oswatchdog = true;
+        }
+    }
+    if (failureStr[0] != 0) {
+        error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogFailureStringFile : kSleepWakeFailureStringFile,
+                failureStr, len);
+        if (error) {
+            DLOG("Failed to save SleepWake failure string to file. error:%d\n", error);
+        }
+        else {
+            DLOG("Saved SleepWake failure string to file.\n");
+        }
+        if (!oswatchdog) {
+            swd_flags |= SWD_BOOT_BY_SW_WDOG;
+        }
+    }
+
+    if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
+        goto exit;
+
+    if (swd_buffer) {
+        unsigned int len = 0;
+        errno_t error;
+        char nvram_var_name_buffer[20];
+        unsigned int concat_len = 0;
+        swd_hdr      *hdr = NULL;
+
+
+        hdr = (swd_hdr *)swd_buffer;
+        outbuf = (char *)hdr + hdr->spindump_offset;
+
+        for (int i=0; i < 8; i++) {
+            snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, i+1);
+            if (!PEReadNVRAMProperty(nvram_var_name_buffer, NULL, &len)) {
+                LOG("No SleepWake blob to read beyond chunk %d\n", i);
+                break;
+            }
+            if (PEReadNVRAMProperty(nvram_var_name_buffer, outbuf+concat_len, &len) == FALSE) {
+                PERemoveNVRAMProperty(nvram_var_name_buffer);
+                LOG("Could not read the property :-(\n");
+                break;
+            }
+            PERemoveNVRAMProperty(nvram_var_name_buffer);
+            concat_len += len;
+        }
+        LOG("Concatenated length for the SWD blob %d\n", concat_len);
+
+        if (concat_len) {
+            error = sleepWakeDebugSaveFile(oswatchdog ? kOSWatchdogStacksFilename : kSleepWakeStacksFilename,
+                                outbuf, concat_len);
+            if (error) {
+                LOG("Failed to save SleepWake zipped data to file. error:%d\n", error);
+            } else {
+                LOG("Saved SleepWake zipped data to file.\n");
+            }
+        }
+
+    }
+    else {
+        LOG("No buffer allocated to save failure stackshot\n");
+    }
+
+
+    gRootDomain->swd_lock = 0;
+exit:
+    PERemoveNVRAMProperty(oswatchdog ? kIOOSWatchdogFailureString : kIOSleepWakeFailureString);
+    return;
+}
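The loop above reassembles the compressed stackshot from up to 8 numbered NVRAM variables; takeStackshot() below writes them out in (2096 - 200)-byte pieces. A minimal user-space sketch of that split, with store() standing in for PEWriteNVRAMProperty() and all names illustrative:

#include <stdio.h>

#define CHUNK_PAYLOAD (2096 - 200)   /* per-variable payload used by the commit */
#define MAX_CHUNKS    8

/* Split blob[0..len) into numbered variables "<prefix>01", "<prefix>02", ...
 * the way takeStackshot() writes them: full CHUNK_PAYLOAD pieces plus one
 * leftover piece. store() is a stand-in for PEWriteNVRAMProperty(); returns
 * the number of chunks written, or -1 if the blob would not fit. */
static int split_blob(const char *prefix, const char *blob, int len,
                      int (*store)(const char *name, const void *buf, int n))
{
    char name[20];
    int full = len / CHUNK_PAYLOAD;
    int leftover = len % CHUNK_PAYLOAD;
    int i;

    if (full >= MAX_CHUNKS)
        return -1;                              /* too large to save */
    for (i = 0; i < full; i++) {
        snprintf(name, sizeof(name), "%s%02d", prefix, i + 1);
        if (!store(name, blob + i * CHUNK_PAYLOAD, CHUNK_PAYLOAD))
            return -1;
    }
    if (leftover) {
        snprintf(name, sizeof(name), "%s%02d", prefix, i + 1);
        if (!store(name, blob + i * CHUNK_PAYLOAD, leftover))
            return -1;
        i++;
    }
    return i;
}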
+
+
+void IOPMrootDomain::getFailureData(thread_t *thread, char *failureStr, size_t strLen)
+{
+    IORegistryIterator *    iter;
+    IORegistryEntry *       entry;
+    IOService *             node;
+    bool                    nodeFound = false;
+
+    const void *            callMethod = NULL;
+    const char *            objectName = NULL;
+    uint32_t                timeout = getWatchdogTimeout();
+    const char *            phaseString = NULL;
+    const char *            phaseDescription = NULL;
+
+    IOPMServiceInterestNotifier *notifier = OSDynamicCast(IOPMServiceInterestNotifier, notifierObject);
+    uint32_t tracePhase = pmTracer->getTracePhase();
+
+    *thread = NULL;
+    if ((tracePhase < kIOPMTracePointSystemSleep) || (tracePhase == kIOPMTracePointDarkWakeEntry)) {
+        snprintf(failureStr, strLen, "%sSleep transition timed out after %d seconds", failureStr, timeout);
+    }
+    else {
+        snprintf(failureStr, strLen, "%sWake transition timed out after %d seconds", failureStr,timeout);
+    }
+    tracePhase2String(tracePhase, &phaseString, &phaseDescription);
+
+    if (notifierThread) {
+        if (notifier && (notifier->identifier)) {
+                objectName = notifier->identifier->getCStringNoCopy();
+        }
+        *thread = notifierThread;
+    }
+    else {
+
+        iter = IORegistryIterator::iterateOver(
+                getPMRootDomain(), gIOPowerPlane, kIORegistryIterateRecursively);
+
+        if (iter)
+        {
+            while ((entry = iter->getNextObject()))
+            {
+                node = OSDynamicCast(IOService, entry);
+                if (!node)
+                    continue;
+                if (OSDynamicCast(IOPowerConnection, node)) {
+                    continue;
+                }
+
+                if(node->getBlockingDriverCall(thread, &callMethod)) {
+                    nodeFound = true;
+                    break;
+                }
+            }
+            iter->release();
+        }
+        if (nodeFound) {
+            OSKext *kext = OSKext::lookupKextWithAddress((vm_address_t)callMethod);
+            if (kext) {
+                objectName = kext->getIdentifierCString();
+            }
+        }
+    }
+    if (phaseDescription) {
+        snprintf(failureStr, strLen, "%s while %s.", failureStr, phaseDescription);
+    }
+    if (objectName) {
+        snprintf(failureStr, strLen, "%s Suspected bundle: %s.", failureStr, objectName);
+    }
+    if (*thread) {
+        snprintf(failureStr, strLen, "%s Thread 0x%llx.", failureStr, thread_tid(*thread));
+    }
+
+    DLOG("%s\n", failureStr);
+}
+
+struct swd_stackshot_compressed_data
+{
+       z_output_func   zoutput;
+       size_t                  zipped;
+       uint64_t                totalbytes;
+       uint64_t                lastpercent;
+       IOReturn                error;
+       unsigned                outremain;
+       unsigned                outlen;
+       unsigned                writes;
+       Bytef *                 outbuf;
+};
+struct swd_stackshot_compressed_data swd_zip_var = { };
+
+static void *swd_zs_alloc(void *__unused ref, u_int items, u_int size)
+{
+       void *result;
+       LOG("Alloc in zipping %d items of size %d\n", items, size);
+
+       result = (void *)(swd_zs_zmem + swd_zs_zoffset);
+       swd_zs_zoffset += ~31L & (31 + (items * size)); // 32b align for vector crc
+       LOG("Offset %zu\n", swd_zs_zoffset);
+       return (result);
+}
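swd_zs_alloc() hands zlib carved-out pieces of a preallocated arena, rounding each request up to a 32-byte boundary (~31L & (31 + n) is the usual (n + 31) & ~31 round-up). A small stand-alone sketch of the same bump allocation, with illustrative names:

#include <stddef.h>

#define ARENA_SIZE (64 * 1024)

static unsigned char arena[ARENA_SIZE];
static size_t arena_off;

/* Round n up to the next multiple of 32 bytes. */
static size_t round_up_32(size_t n)
{
    return (n + 31) & ~(size_t)31;
}

/* Bump allocator: no free list, callers never release individual blocks. */
static void *bump_alloc(size_t items, size_t size)
{
    size_t need = round_up_32(items * size);

    if (arena_off + need > ARENA_SIZE)
        return NULL;                 /* arena exhausted */
    void *p = arena + arena_off;
    arena_off += need;
    return p;
}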
+
+static int swd_zinput(z_streamp strm, Bytef *buf, unsigned size)
+{
+       unsigned len;
+
+       len = strm->avail_in;
+
+       if (len > size)
+               len = size;
+       if (len == 0)
+               return 0;
+
+    if (strm->next_in != (Bytef *) strm)
+               memcpy(buf, strm->next_in, len);
+    else
+               bzero(buf, len);
+
+    strm->adler = z_crc32(strm->adler, buf, len);
+
+    strm->avail_in -= len;
+    strm->next_in  += len;
+    strm->total_in += len;
+
+    return (int)len;
+}
+
+static int swd_zoutput(z_streamp strm, Bytef *buf, unsigned len)
+{
+       unsigned int i = 0;
+       // if outlen > max size don't add to the buffer
+       if (strm && buf) {
+               if (swd_zip_var.outlen + len > SWD_COMPRESSED_BUFSIZE) {
+                       LOG("No space to GZIP... not writing to NVRAM\n");
+                       return (len);
+               }
+       }
+       for (i = 0; i < len; i++) {
+               *(swd_zip_var.outbuf + swd_zip_var.outlen + i) = *(buf +i);
+       }
+       swd_zip_var.outlen += len;
+       return (len);
+}
+static void swd_zs_free(void * __unused ref, void * __unused ptr) {}
+
+static int swd_compress(char *inPtr, char *outPtr, size_t numBytes)
+{
+   int wbits = 12;
+   int memlevel = 3;
+
+   if (!swd_zs.zalloc) {
+          swd_zs.zalloc = swd_zs_alloc;
+          swd_zs.zfree = swd_zs_free;
+          if (deflateInit2(&swd_zs, Z_BEST_SPEED, Z_DEFLATED, wbits + 16, memlevel, Z_DEFAULT_STRATEGY)) {
+                  // allocation failed
+                  bzero(&swd_zs, sizeof(swd_zs));
+                  // swd_zs_zoffset = 0;
+          } else {
+                  LOG("PMRD inited the zlib allocation routines\n");
+          }
    }
-   snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n");
 
 
-   data[0] = round_page(sizeof(swd_hdr) + hdr->spindump_size);
-   /* Header & rootdomain log is constantly changing and  is not covered by CRC */
-   data[1] = hdr->crc = crc32(0, ((char*)swd_buffer+hdr->spindump_offset), hdr->spindump_size);
-   data[2] = kvtophys((vm_offset_t)swd_buffer);
-   len = sizeof(addr64_t)*3;
-   DLOG("bytes: 0x%llx crc:0x%llx paddr:0x%llx\n",
-         data[0], data[1], data[2]);
 
-   if (PEWriteNVRAMProperty(kIOSleepWakeDebugKey, data, len) == false)
+    swd_zip_var.zipped = 0;
+    swd_zip_var.totalbytes = 0; // should this be the max that we have?
+    swd_zip_var.lastpercent = 0;
+    swd_zip_var.error = kIOReturnSuccess;
+    swd_zip_var.outremain = 0;
+    swd_zip_var.outlen = 0;
+    swd_zip_var.writes = 0;
+    swd_zip_var.outbuf = (Bytef *)outPtr;
+
+    swd_zip_var.totalbytes = numBytes;
+
+    swd_zs.avail_in = 0;
+    swd_zs.next_in = NULL;
+    swd_zs.avail_out = 0;
+    swd_zs.next_out = NULL;
+
+    deflateResetWithIO(&swd_zs, swd_zinput, swd_zoutput);
+
+    z_stream *zs;
+    int zr;
+    zs = &swd_zs;
+
+    zr = Z_OK;
+
+    while (swd_zip_var.error >= 0) {
+        if (!zs->avail_in) {
+            zs->next_in = (unsigned char *)inPtr ? (Bytef *)inPtr : (Bytef *)zs; /* zero marker? */
+            zs->avail_in = numBytes;
+        }
+        if (!zs->avail_out) {
+            zs->next_out = (Bytef *)zs;
+            zs->avail_out = UINT32_MAX;
+        }
+        zr = deflate(zs, Z_NO_FLUSH);
+        if (Z_STREAM_END == zr)
+            break;
+        if (zr != Z_OK) {
+            LOG("ZERR %d\n", zr);
+            swd_zip_var.error = zr;
+        } else {
+            if (zs->total_in == numBytes) {
+                break;
+            }
+        }
+    }
+    zr = Z_OK;
+    //now flush the stream
+    while (swd_zip_var.error >= 0) {
+        if (!zs->avail_out) {
+            zs->next_out = (Bytef *)zs;
+            zs->avail_out = UINT32_MAX;
+        }
+        zr = deflate(zs, Z_FINISH);
+        if (Z_STREAM_END == zr) {
+            break;
+        }
+        if (zr != Z_OK) {
+            LOG("ZERR %d\n", zr);
+            swd_zip_var.error = zr;
+        } else {
+            if (zs->total_in == numBytes) {
+                LOG("Total output size %d\n", swd_zip_var.outlen);
+                break;
+            }
+        }
+    }
+
+    return swd_zip_var.outlen;
+}
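swd_compress() drives zlib through kernel-private zinput/zoutput callbacks (deflateResetWithIO is not part of stock zlib). The same gzip framing, Z_BEST_SPEED with windowBits 12+16 and memLevel 3, can be reproduced in user space with the standard API; a minimal single-shot sketch, assuming the output buffer is large enough:

#include <string.h>
#include <zlib.h>

/* One-shot gzip of in[0..inlen) into out[0..*outlen); *outlen must be large
 * enough for the whole stream (the kernel path streams instead, via its
 * private zinput/zoutput callbacks). Returns Z_OK on success. */
static int gzip_buffer(const unsigned char *in, size_t inlen,
                       unsigned char *out, size_t *outlen)
{
    z_stream zs;
    int rc;

    memset(&zs, 0, sizeof(zs));
    /* Same parameters as swd_compress(): fastest level, gzip wrapper
     * (windowBits + 16), small memLevel. */
    rc = deflateInit2(&zs, Z_BEST_SPEED, Z_DEFLATED, 12 + 16, 3,
                      Z_DEFAULT_STRATEGY);
    if (rc != Z_OK)
        return rc;

    zs.next_in   = (Bytef *)in;
    zs.avail_in  = (uInt)inlen;
    zs.next_out  = out;
    zs.avail_out = (uInt)*outlen;

    rc = deflate(&zs, Z_FINISH);
    *outlen = zs.total_out;
    deflateEnd(&zs);

    return (rc == Z_STREAM_END) ? Z_OK : Z_BUF_ERROR;
}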
+
+void IOPMrootDomain::takeStackshot(bool wdogTrigger, bool isOSXWatchdog, bool isSpinDump)
+{
+   swd_hdr *         hdr = NULL;
+   int               wdog_panic = -1;
+   int               cnt = 0;
+   pid_t             pid = 0;
+   kern_return_t     kr = KERN_SUCCESS;
+   uint32_t          flags;
+
+   char *            dstAddr;
+   uint32_t          size;
+   uint32_t          bytesRemaining;
+   unsigned          bytesWritten = 0;
+   unsigned          totalBytes = 0;
+   OSString *        UUIDstring = NULL;
+
+   char              failureStr[512];
+   thread_t          thread = NULL;
+   const char *      uuid;
+
+
+   uint32_t          bufSize;
+   uint32_t          initialStackSize;
+
+
+
+   failureStr[0] = 0;
+   if (isSpinDump) {
+       if (_systemTransitionType != kSystemTransitionSleep &&
+           _systemTransitionType != kSystemTransitionWake)
+           return;
+
+      if (gSpinDumpBufferFull)
+         return;
+      if (swd_spindump_buffer == NULL) {
+         sleepWakeDebugSpinDumpMemAlloc();
+         if (swd_spindump_buffer == NULL) return;
+      }
+
+      bufSize = SWD_SPINDUMP_SIZE;
+      initialStackSize = SWD_INITIAL_SPINDUMP_SIZE;
+      hdr = (swd_hdr *)swd_spindump_buffer;
+
+   } else {
+       if ( (kIOSleepWakeWdogOff & gIOKitDebug) || systemBooting || systemShutdown || gWillShutdown)
+           return;
+
+       if (isOSXWatchdog) {
+           snprintf(failureStr, sizeof(failureStr), "Stackshot Reason: ");
+           snprintf(failureStr, sizeof(failureStr), "%smacOS watchdog triggered failure\n", failureStr);
+       }
+       else if (wdogTrigger) {
+           if ((UUIDstring = OSDynamicCast(OSString, getProperty(kIOPMSleepWakeUUIDKey))) != NULL ) {
+               uuid = UUIDstring->getCStringNoCopy();
+               snprintf(failureStr, sizeof(failureStr), "UUID: %s\n", uuid);
+           }
+
+           snprintf(failureStr, sizeof(failureStr), "%sStackshot Reason: ", failureStr);
+           getFailureData(&thread, failureStr, sizeof(failureStr));
+           if (PEGetCoprocessorVersion() >= kCoprocessorVersion2) {
+               goto skip_stackshot;
+           }
+
+       }
+       else {
+           snprintf(failureStr, sizeof(failureStr), "%sStackshot triggered for debugging stackshot collection.\n", failureStr);
+       }
+       // Take only one stackshot in this case.
+       cnt = SWD_MAX_STACKSHOTS-1;
+
+      if (swd_buffer == NULL) {
+         sleepWakeDebugMemAlloc();
+         if (swd_buffer == NULL) return;
+      }
+      hdr = (swd_hdr *)swd_buffer;
+
+      bufSize = hdr->alloc_size;
+      initialStackSize = bufSize;
+
+   }
+
+
+   if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
+       return;
+
+
+   dstAddr = (char*)hdr + hdr->spindump_offset;
+   bytesRemaining = bufSize - hdr->spindump_offset;
+
+   DLOG("Taking snapshot. bytesRemaining: %d\n", bytesRemaining);
+
+   flags = STACKSHOT_KCDATA_FORMAT|STACKSHOT_NO_IO_STATS|STACKSHOT_SAVE_KEXT_LOADINFO|STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY|STACKSHOT_THREAD_WAITINFO;
+   while (kr == KERN_SUCCESS) {
+
+       if (cnt == 0) {
+           /*
+            * Take stackshot of all processes on first sample. Size is restricted
+            * to SWD_INITIAL_STACK_SIZE
+            */
+           pid = -1;
+           size = (bytesRemaining > initialStackSize) ? initialStackSize : bytesRemaining;
+       }
+       else {
+           /* Take sample of kernel threads only */
+           pid = 0;
+           size = bytesRemaining;
+       }
+
+       kr = stack_snapshot_from_kernel(pid, dstAddr, size, flags, 0, &bytesWritten);
+       DLOG("stack_snapshot_from_kernel returned 0x%x. pid: %d bufsize:0x%x flags:0x%x bytesWritten: %d\n",
+               kr, pid, size, flags, bytesWritten);
+       if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) {
+           if (pid == -1) {
+               // Insufficient buffer when trying to take stackshot of user & kernel space threads.
+               // Continue to take stackshot of just kernel threads
+               ++cnt;
+               kr = KERN_SUCCESS;
+               continue;
+           }
+           else if (totalBytes == 0) {
+               MSG("Failed to get stackshot(0x%x) bufsize:0x%x flags:0x%x\n", kr, size, flags);
+           }
+       }
+
+       dstAddr += bytesWritten;
+       totalBytes += bytesWritten;
+       bytesRemaining -= bytesWritten;
+
+       if (++cnt == SWD_MAX_STACKSHOTS) {
+           break;
+       }
+       IOSleep(10); // 10 ms
+   }
+
+   hdr->spindump_size = (bufSize - bytesRemaining - hdr->spindump_offset);
+
+   memset(hdr->reason, 0x20, sizeof(hdr->reason));
+   if (isSpinDump) {
+      snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Power State Change Delay\n\n");
+      gRootDomain->swd_lock = 0;
+      gSpinDumpBufferFull = true;
+      return;
+   }
+
+   // Compress stackshot and save to NVRAM
    {
-      DLOG("Failed to update nvram boot-args\n");
-      goto exit;
+       char *outbuf = (char *)swd_compressed_buffer;
+       int outlen = 0;
+       int num_chunks = 0;
+       int max_chunks = 0;
+       int leftover = 0;
+       char nvram_var_name_buffer[20];
+
+       outlen = swd_compress((char*)hdr + hdr->spindump_offset, outbuf, bytesWritten);
+
+       if (outlen) {
+           max_chunks = outlen / (2096 - 200);
+           leftover = outlen % (2096 - 200);
+
+           if (max_chunks < 8) {
+               for (num_chunks = 0; num_chunks < max_chunks; num_chunks++) {
+                   snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks+1);
+                   if (PEWriteNVRAMProperty(nvram_var_name_buffer, (outbuf + (num_chunks * (2096-200))), (2096 - 200)) == FALSE) {
+                       LOG("Failed to update NVRAM %d\n", num_chunks);
+                       break;
+                   }
+               }
+               if (leftover) {
+                   snprintf(nvram_var_name_buffer, 20, "%s%02d", SWD_STACKSHOT_VAR_PREFIX, num_chunks+1);
+                   if (PEWriteNVRAMProperty(nvram_var_name_buffer, (outbuf + (num_chunks * (2096-200))), leftover) == FALSE) {
+                       LOG("Failed to update NVRAM with leftovers\n");
+                   }
+               }
+           }
+           else {
+               LOG("Compressed failure stackshot is too large. size=%d bytes\n", outlen);
+           }
+       }
    }
 
-exit:
+   if (failureStr[0]) {
 
+       if (!isOSXWatchdog) {
+           // append sleep-wake failure code
+           snprintf(failureStr, sizeof(failureStr), "%s\nFailure code:: 0x%08x %08x\n",
+                   failureStr, pmTracer->getTraceData(), pmTracer->getTracePhase());
+           if (PEWriteNVRAMProperty(kIOSleepWakeFailureString, failureStr, strlen(failureStr)) == false) {
+               DLOG("Failed to write SleepWake failure string\n");
+           }
+       }
+       else {
+           if (PEWriteNVRAMProperty(kIOOSWatchdogFailureString, failureStr, strlen(failureStr)) == false) {
+               DLOG("Failed to write OSWatchdog failure string\n");
+           }
+       }
+   }
    gRootDomain->swd_lock = 0;
 
+skip_stackshot:
    if (wdogTrigger) {
-       IOLog("Restarting to collect Sleep wake debug logs\n");
+       PE_parse_boot_argn("swd_panic", &wdog_panic, sizeof(wdog_panic));
+
+       if ((wdog_panic == 1) || (PEGetCoprocessorVersion() >= kCoprocessorVersion2)) {
+           if (thread) {
+               panic_with_thread_context(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, thread, "%s", failureStr);
+           }
+           else {
+               panic_with_options(0, NULL, DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT, "%s", failureStr);
+           }
+           return;
+       }
+       else if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
+           // If the current boot is due to a watchdog-triggered restart in the previous boot,
+           // don't trigger again until at least one successful sleep & wake.
+           if (!(sleepCnt && (displayWakeCnt || darkWakeCnt))) {
+               LOG("Shutting down due to repeated Sleep/Wake failures\n");
+               if (!tasksSuspended) {
+                   tasksSuspended = TRUE;
+                   tasks_system_suspend(true);
+               }
+               PEHaltRestart(kPEHaltCPU);
+               return;
+           }
+       }
+   }
+
+
+   if (wdogTrigger) {
+       LOG("Restarting to collect Sleep wake debug logs\n");
        if (!tasksSuspended) {
             tasksSuspended = TRUE;
            tasks_system_suspend(true);
@@ -9981,20 +10350,16 @@ exit:
        PEHaltRestart(kPERestartCPU);
    }
    else {
-     logBufMap = sleepWakeDebugRetrieve();
-      if (logBufMap) {
-          sleepWakeDebugDumpFromMem(logBufMap);
-          logBufMap->release();
-          logBufMap = 0;
-      }
+       saveFailureData2File();
    }
 }
 
 void IOPMrootDomain::sleepWakeDebugMemAlloc( )
 {
-    vm_size_t    size = SWD_BUF_SIZE;
+    vm_size_t    size = SWD_STACKSHOT_SIZE + SWD_COMPRESSED_BUFSIZE + SWD_ZLIB_BUFSIZE;
 
     swd_hdr      *hdr = NULL;
+    void         *bufPtr = NULL;
 
     IOBufferMemoryDescriptor  *memDesc = NULL;
 
@@ -10008,28 +10373,31 @@ void IOPMrootDomain::sleepWakeDebugMemAlloc( )
     if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
        return;
 
-    // Try allocating above 4GB. If that fails, try at 2GB
-    memDesc = IOBufferMemoryDescriptor::inTaskWithPhysicalMask(
-                            kernel_task, kIOMemoryPhysicallyContiguous|kIOMemoryMapperNone,
-                            size, 0xFFFFFFFF00000000ULL);
-    if (!memDesc) {
-       memDesc = IOBufferMemoryDescriptor::inTaskWithPhysicalMask(
-                            kernel_task, kIOMemoryPhysicallyContiguous|kIOMemoryMapperNone,
-                            size, 0xFFFFFFFF10000000ULL);
-    }
-
+    memDesc = IOBufferMemoryDescriptor::inTaskWithOptions(
+                            kernel_task, kIODirectionIn|kIOMemoryMapperNone,
+                            size);
     if (memDesc == NULL)
     {
       DLOG("Failed to allocate Memory descriptor for sleepWake debug\n");
       goto exit;
     }
 
+    bufPtr = memDesc->getBytesNoCopy();
 
-    hdr = (swd_hdr *)memDesc->getBytesNoCopy();
+    // Carve out memory for zlib routines
+    swd_zs_zmem = (vm_offset_t)bufPtr;
+    bufPtr = (char *)bufPtr + SWD_ZLIB_BUFSIZE;
+
+    // Carve out memory for compressed stackshots
+    swd_compressed_buffer = bufPtr;
+    bufPtr = (char *)bufPtr + SWD_COMPRESSED_BUFSIZE;
+
+    // Remaining is used for holding stackshot
+    hdr = (swd_hdr *)bufPtr;
     memset(hdr, 0, sizeof(swd_hdr));
 
     hdr->signature = SWD_HDR_SIGNATURE;
-    hdr->alloc_size = size;
+    hdr->alloc_size = SWD_STACKSHOT_SIZE;
 
     hdr->spindump_offset = sizeof(swd_hdr);
     swd_buffer = (void *)hdr;
@@ -10077,15 +10445,11 @@ exit:
 
 void IOPMrootDomain::sleepWakeDebugEnableWdog()
 {
-    swd_flags |= SWD_WDOG_ENABLED;
-    if (!swd_buffer)
-        sleepWakeDebugMemAlloc();
 }
 
 bool IOPMrootDomain::sleepWakeDebugIsWdogEnabled()
 {
-    return ((swd_flags & SWD_WDOG_ENABLED) &&
-            !systemBooting && !systemShutdown && !gWillShutdown);
+    return (!systemBooting && !systemShutdown && !gWillShutdown);
 }
 
 void IOPMrootDomain::sleepWakeDebugSaveSpinDumpFile()
@@ -10120,7 +10484,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int
    if (vnode_open(name, (O_CREAT | FWRITE | O_NOFOLLOW),
                         S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0)
    {
-      IOLog("Failed to open the file %s\n", name);
+      LOG("Failed to open the file %s\n", name);
       swd_flags |= SWD_FILEOP_ERROR;
       goto exit;
    }
@@ -10129,7 +10493,7 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int
    /* Don't dump to non-regular files or files with links. */
    if (vp->v_type != VREG ||
         vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) {
-        IOLog("Bailing as this is not a regular file\n");
+        LOG("Bailing as this is not a regular file\n");
         swd_flags |= SWD_FILEOP_ERROR;
         goto exit;
     }
@@ -10140,9 +10504,9 @@ errno_t IOPMrootDomain::sleepWakeDebugSaveFile(const char *name, char *buf, int
 
     if (buf != NULL) {
         error = vn_rdwr(UIO_WRITE, vp, buf, len, 0,
-                UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) 0,  vfs_context_proc(ctx));
+                UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, (int *) NULL,  vfs_context_proc(ctx));
         if (error != 0) {
-            IOLog("Failed to save sleep wake log. err 0x%x\n", error);
+            LOG("Failed to save sleep wake log. err 0x%x\n", error);
             swd_flags |= SWD_FILEOP_ERROR;
         }
         else {
@@ -10158,515 +10522,6 @@ exit:
 
 }
 
-errno_t IOPMrootDomain::sleepWakeDebugCopyFile(
-                               struct vnode *srcVp, 
-                               vfs_context_t srcCtx,
-                               char *tmpBuf, uint64_t tmpBufSize,
-                               uint64_t srcOffset, 
-                               const char *dstFname, 
-                               uint64_t numBytes,
-                               uint32_t crc)
-{
-   struct vnode         *vp = NULL;
-   vfs_context_t        ctx = vfs_context_create(vfs_context_current());
-   struct vnode_attr    va;
-   errno_t      error = EIO;
-   uint64_t bytesToRead, bytesToWrite;
-   uint64_t readFileOffset, writeFileOffset, srcDataOffset; 
-   uint32_t newcrc = 0;
-
-   if (vnode_open(dstFname, (O_CREAT | FWRITE | O_NOFOLLOW), 
-                        S_IRUSR|S_IRGRP|S_IROTH, VNODE_LOOKUP_NOFOLLOW, &vp, ctx) != 0) 
-   {
-      IOLog("Failed to open the file %s\n", dstFname);
-      swd_flags |= SWD_FILEOP_ERROR;
-      goto exit;
-   }
-   VATTR_INIT(&va);
-   VATTR_WANTED(&va, va_nlink);
-   /* Don't dump to non-regular files or files with links. */
-   if (vp->v_type != VREG ||
-        vnode_getattr(vp, &va, ctx) || va.va_nlink != 1) {
-        IOLog("Bailing as this is not a regular file\n");
-        swd_flags |= SWD_FILEOP_ERROR;
-        goto exit;
-       }
-    VATTR_INIT(&va);   
-    VATTR_SET(&va, va_data_size, 0);
-    vnode_setattr(vp, &va, ctx);
-   
-    writeFileOffset = 0;
-    while(numBytes) {
-        bytesToRead = (round_page(numBytes) > tmpBufSize) ? tmpBufSize : round_page(numBytes);
-        readFileOffset = trunc_page(srcOffset);
-
-       DLOG("Read file (numBytes:0x%llx offset:0x%llx)\n", bytesToRead, readFileOffset);
-       error = vn_rdwr(UIO_READ, srcVp, tmpBuf, bytesToRead, readFileOffset,
-               UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, 
-               vfs_context_ucred(srcCtx), (int *) 0,
-               vfs_context_proc(srcCtx));
-       if (error) {
-           IOLog("Failed to read file(numBytes:0x%llx)\n", bytesToRead);
-           swd_flags |= SWD_FILEOP_ERROR;
-           break;
-       }
-
-       srcDataOffset = (uint64_t)tmpBuf + (srcOffset - readFileOffset);
-       bytesToWrite = bytesToRead - (srcOffset - readFileOffset);
-       if (bytesToWrite > numBytes) bytesToWrite = numBytes;
-
-       if (crc) {
-           newcrc = crc32(newcrc, (void *)srcDataOffset, bytesToWrite);
-       }
-       DLOG("Write file (numBytes:0x%llx offset:0x%llx)\n", bytesToWrite, writeFileOffset);
-       error = vn_rdwr(UIO_WRITE, vp, (char *)srcDataOffset, bytesToWrite, writeFileOffset,
-               UIO_SYSSPACE, IO_SYNC|IO_NODELOCKED|IO_UNIT, 
-               vfs_context_ucred(ctx), (int *) 0,
-               vfs_context_proc(ctx));
-       if (error) {
-           IOLog("Failed to write file(numBytes:0x%llx)\n", bytesToWrite);
-           swd_flags |= SWD_FILEOP_ERROR;
-           break;
-       }
-
-       writeFileOffset += bytesToWrite;
-       numBytes -= bytesToWrite;
-       srcOffset += bytesToWrite;
-
-    }
-    if (crc != newcrc) {
-        /* Set stackshot size to 0 if crc doesn't match */
-        VATTR_INIT(&va);
-        VATTR_SET(&va, va_data_size, 0);
-        vnode_setattr(vp, &va, ctx);
-
-        IOLog("CRC check failed. expected:0x%x actual:0x%x\n", crc, newcrc);
-        swd_flags |= SWD_DATA_CRC_ERROR;
-        error = EFAULT;
-    }
-exit:
-    if (vp) { 
-        error = vnode_close(vp, FWRITE, ctx);
-        DLOG("vnode_close on file %s returned 0x%x\n",dstFname, error);
-    }
-    if (ctx) vfs_context_rele(ctx);
-
-    return error;
-
-
-
-}
-uint32_t IOPMrootDomain::checkForValidDebugData(const char *fname, vfs_context_t *ctx, 
-                                            void *tmpBuf, struct vnode **vp)
-{
-    int             rc;
-    uint64_t        hdrOffset;
-    uint32_t        error = 0;
-
-    struct vnode_attr           va;
-    IOHibernateImageHeader      *imageHdr;
-
-    *vp = NULL;
-    if (vnode_open(fname, (FREAD | O_NOFOLLOW), 0,
-                   VNODE_LOOKUP_NOFOLLOW, vp, *ctx) != 0) 
-    {
-        DMSG("sleepWakeDebugDumpFromFile: Failed to open the file %s\n", fname);
-        goto err;
-    }
-    VATTR_INIT(&va);
-    VATTR_WANTED(&va, va_nlink);
-    VATTR_WANTED(&va, va_data_alloc);
-    if ((*vp)->v_type != VREG ||
-        vnode_getattr((*vp), &va, *ctx) || va.va_nlink != 1) {
-        IOLog("sleepWakeDebugDumpFromFile: Bailing as %s is not a regular file\n", fname);
-        error = SWD_FILEOP_ERROR;
-        goto err;
-    }
-
-    /* Read the sleepimage file header */
-    rc = vn_rdwr(UIO_READ, *vp, (char *)tmpBuf, round_page(sizeof(IOHibernateImageHeader)), 0,
-                UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, 
-                vfs_context_ucred(*ctx), (int *) 0,
-                vfs_context_proc(*ctx));
-    if (rc != 0) {
-        IOLog("sleepWakeDebugDumpFromFile: Failed to read header size %llu(rc=%d) from %s\n",
-             mach_vm_round_page(sizeof(IOHibernateImageHeader)), rc, fname);
-        error = SWD_FILEOP_ERROR;
-        goto err;
-    }
-
-    imageHdr = ((IOHibernateImageHeader *)tmpBuf);
-    if (imageHdr->signature != kIOHibernateHeaderDebugDataSignature) {
-        IOLog("sleepWakeDebugDumpFromFile: File %s header has unexpected value 0x%x\n", 
-             fname, imageHdr->signature);
-        error = SWD_HDR_SIGNATURE_ERROR;
-        goto err;
-    }
-
-    /* Sleep/Wake debug header(swd_hdr) is at the beggining of the second block */
-    hdrOffset = imageHdr->deviceBlockSize;
-    if (hdrOffset + sizeof(swd_hdr) >= va.va_data_alloc) {
-        IOLog("sleepWakeDebugDumpFromFile: header is crossing file size(0x%llx) in file %s\n",  
-             va.va_data_alloc, fname);
-        error = SWD_HDR_SIZE_ERROR;
-        goto err;
-    }
-
-    return 0; 
-
-err:
-    if (*vp) vnode_close(*vp, FREAD, *ctx);
-    *vp = NULL;
-
-    return error;
-}
-
-void IOPMrootDomain::sleepWakeDebugDumpFromFile( )
-{
-#if HIBERNATION
-    int             rc;
-    char                       hibernateFilename[MAXPATHLEN+1];
-    void            *tmpBuf;
-    swd_hdr         *hdr = NULL;
-    uint32_t        stacksSize, logSize;
-    uint64_t        tmpBufSize;
-    uint64_t        hdrOffset, stacksOffset, logOffset;
-    errno_t         error = EIO;
-    OSObject        *obj = NULL;
-    OSString        *str = NULL;
-    OSNumber        *failStat = NULL;
-    struct vnode    *vp = NULL;
-    vfs_context_t   ctx = NULL;
-    const char      *stacksFname, *logFname;
-
-    IOBufferMemoryDescriptor    *tmpBufDesc = NULL;
-
-    DLOG("sleepWakeDebugDumpFromFile\n");
-    if ((swd_flags & SWD_LOGS_IN_FILE) == 0)
-        return;
-
-   if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
-       return;
-
-
-    /* Allocate a temp buffer to copy data between files */
-    tmpBufSize = 2*4096;
-    tmpBufDesc = IOBufferMemoryDescriptor::
-        inTaskWithOptions(kernel_task, kIODirectionOutIn | kIOMemoryMapperNone, 
-                          tmpBufSize, PAGE_SIZE);
-
-    if (!tmpBufDesc) {
-        DMSG("sleepWakeDebugDumpFromFile: Fail to allocate temp buf\n");
-        goto exit;
-    }
-
-    tmpBuf = tmpBufDesc->getBytesNoCopy();
-
-   ctx = vfs_context_create(vfs_context_current());
-
-    /* First check if 'kSleepWakeStackBinFilename' has valid data */
-    swd_flags |= checkForValidDebugData(kSleepWakeStackBinFilename, &ctx, tmpBuf, &vp);
-    if (vp == NULL) {
-        /* Check if the debug data is saved to hibernation file */
-        hibernateFilename[0] = 0;
-        if ((obj = copyProperty(kIOHibernateFileKey)))
-        {
-            if ((str = OSDynamicCast(OSString, obj)))
-                strlcpy(hibernateFilename, str->getCStringNoCopy(),
-                        sizeof(hibernateFilename));
-            obj->release();
-        }
-        if (!hibernateFilename[0]) {
-            DMSG("sleepWakeDebugDumpFromFile: Failed to get hibernation file name\n");
-            goto exit;
-        }
-
-        swd_flags |= checkForValidDebugData(hibernateFilename, &ctx, tmpBuf, &vp);
-        if (vp == NULL) {
-            DMSG("sleepWakeDebugDumpFromFile: No valid debug data is found\n");
-            goto exit;
-        }
-        DLOG("Getting SW Stacks image from file %s\n", hibernateFilename);
-    }
-    else {
-        DLOG("Getting SW Stacks image from file %s\n", kSleepWakeStackBinFilename);
-    }
-
-    hdrOffset = ((IOHibernateImageHeader *)tmpBuf)->deviceBlockSize;
-
-    DLOG("Reading swd_hdr len 0x%llx offset 0x%lx\n", mach_vm_round_page(sizeof(swd_hdr)), trunc_page(hdrOffset));
-    /* Read the sleep/wake debug header(swd_hdr) */
-    rc = vn_rdwr(UIO_READ, vp, (char *)tmpBuf, round_page(sizeof(swd_hdr)), trunc_page(hdrOffset),
-                UIO_SYSSPACE, IO_SKIP_ENCRYPTION|IO_SYNC|IO_NODELOCKED|IO_UNIT|IO_NOCACHE, 
-                vfs_context_ucred(ctx), (int *) 0,
-                vfs_context_proc(ctx));
-    if (rc != 0) {
-        DMSG("sleepWakeDebugDumpFromFile: Failed to debug read header size %llu. rc=%d\n",
-             mach_vm_round_page(sizeof(swd_hdr)), rc);
-          swd_flags |= SWD_FILEOP_ERROR;
-        goto exit;
-    }
-
-    hdr = (swd_hdr *)((char *)tmpBuf + (hdrOffset - trunc_page(hdrOffset)));
-    if ((hdr->signature != SWD_HDR_SIGNATURE) || (hdr->alloc_size > SWD_BUF_SIZE) ||
-        (hdr->spindump_offset > SWD_BUF_SIZE) || (hdr->spindump_size > SWD_BUF_SIZE)) {
-        DMSG("sleepWakeDebugDumpFromFile: Invalid data in debug header. sign:0x%x size:0x%x spindump_offset:0x%x spindump_size:0x%x\n",
-             hdr->signature, hdr->alloc_size, hdr->spindump_offset, hdr->spindump_size);
-          swd_flags |= SWD_BUF_SIZE_ERROR;
-        goto exit;
-    }
-    stacksSize = hdr->spindump_size;
-
-    /* Get stacks & log offsets in the image file */
-    stacksOffset = hdrOffset + hdr->spindump_offset;
-    logOffset = hdrOffset + offsetof(swd_hdr, UUID);
-    logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID); 
-    stacksFname = getDumpStackFilename(hdr);
-    logFname = getDumpLogFilename(hdr);
-
-    error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, stacksOffset,
-                                   stacksFname, stacksSize, hdr->crc);
-    if (error == EFAULT) {
-        DMSG("sleepWakeDebugDumpFromFile: Stackshot CRC doesn't match\n");
-        goto exit;
-    }
-    error = sleepWakeDebugCopyFile(vp, ctx, (char *)tmpBuf, tmpBufSize, logOffset, 
-                                   logFname, logSize, 0);
-    if (error) {
-        DMSG("sleepWakeDebugDumpFromFile: Failed to write the log file(0x%x)\n", error);
-        goto exit;
-    }
-exit:
-    if (error) {
-      // Write just the SleepWakeLog.dump with failure code
-      uint64_t fcode = 0;
-      const char *fname;
-      swd_hdr hdrCopy;
-      char *offset = NULL;
-      int  size;
-
-      hdr = &hdrCopy;
-      if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
-          failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey));
-          fcode = failStat->unsigned64BitValue();
-          fname = kSleepWakeLogFilename;
-      }
-      else {
-          fname = kAppleOSXWatchdogLogFilename;
-      }
-
-      offset = (char*)hdr+offsetof(swd_hdr, UUID);
-      size = sizeof(swd_hdr)-offsetof(swd_hdr, UUID);
-      memset(offset, 0x20, size); // Fill with spaces
-
-
-      snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 0x%x", swd_flags);
-      snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode);
-      snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n");
-      sleepWakeDebugSaveFile(fname, offset, size);
-
-    }
-    gRootDomain->swd_lock = 0;
-
-    if (vp) vnode_close(vp, FREAD, ctx);
-    if (ctx) vfs_context_rele(ctx);
-    if (tmpBufDesc) tmpBufDesc->release();
-#endif /* HIBERNATION */
-}
-
-void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *logBufMap)
-{
-   IOVirtualAddress     srcBuf = NULL;
-   char                 *stackBuf = NULL, *logOffset = NULL;
-   int                  logSize = 0;
-
-   errno_t      error = EIO;
-   uint64_t     bufSize = 0;
-   swd_hdr      *hdr = NULL;
-   OSNumber  *failStat = NULL;
-
-   if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
-       return;
-
-   if ((logBufMap == 0) || ( (srcBuf = logBufMap->getVirtualAddress()) == 0) )
-   {
-      DLOG("Nothing saved to dump to file\n");
-      goto exit;
-   }
-
-   hdr = (swd_hdr *)srcBuf;
-   bufSize = logBufMap->getLength();
-   if (bufSize <= sizeof(swd_hdr))
-   {
-      IOLog("SleepWake log buffer size is invalid\n");
-      swd_flags |= SWD_BUF_SIZE_ERROR;
-      goto exit;
-   }
-
-   stackBuf = (char*)hdr+hdr->spindump_offset;
-
-   error = sleepWakeDebugSaveFile(getDumpStackFilename(hdr), stackBuf, hdr->spindump_size);
-   if (error) goto exit;
-
-   logOffset = (char*)hdr+offsetof(swd_hdr, UUID);
-   logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID);
-
-   error = sleepWakeDebugSaveFile(getDumpLogFilename(hdr), logOffset, logSize);
-   if (error) goto exit;
-
-    hdr->spindump_size = 0;
-    error = 0;
-
-exit:
-    if (error) {
-      // Write just the SleepWakeLog.dump with failure code
-      uint64_t fcode = 0;
-      const char *sname, *lname;
-      swd_hdr hdrCopy;
-
-      /* Try writing an empty stacks file */
-      hdr = &hdrCopy;
-      if (swd_flags & SWD_BOOT_BY_SW_WDOG) {
-          failStat = OSDynamicCast(OSNumber, getProperty(kIOPMSleepWakeFailureCodeKey));
-          fcode = failStat->unsigned64BitValue();
-          lname = kSleepWakeLogFilename;
-          sname = kSleepWakeStackFilename;
-      }
-      else {
-          lname = kAppleOSXWatchdogLogFilename;
-          sname= kAppleOSXWatchdogStackFilename;
-      }
-
-      sleepWakeDebugSaveFile(sname, NULL, 0);
-
-      logOffset = (char*)hdr+offsetof(swd_hdr, UUID);
-      logSize = sizeof(swd_hdr)-offsetof(swd_hdr, UUID);
-      memset(logOffset, 0x20, logSize); // Fill with spaces
-
-
-      snprintf(hdr->spindump_status, sizeof(hdr->spindump_status), "\nstatus: 0x%x", swd_flags);
-      snprintf(hdr->PMStatusCode, sizeof(hdr->PMStatusCode), "\nCode: 0x%llx", fcode);
-      snprintf(hdr->reason, sizeof(hdr->reason), "\nStackshot reason: Watchdog\n\n");
-      sleepWakeDebugSaveFile(lname, logOffset, logSize);
-    }
-
-    gRootDomain->swd_lock = 0;
-}
-
-IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( )
-{
-   IOVirtualAddress     vaddr = NULL;
-   IOMemoryDescriptor * desc = NULL;
-   IOMemoryMap *        logBufMap = NULL;
-
-   uint32_t          len = INT_MAX;
-   addr64_t          data[3];
-   uint64_t          bufSize = 0;
-   uint64_t          crc = 0;
-   uint64_t          newcrc = 0;
-   uint64_t          paddr = 0;
-   swd_hdr           *hdr = NULL;
-   bool              ret = false;
-   char              str[20];
-
-
-   if (!OSCompareAndSwap(0, 1, &gRootDomain->swd_lock))
-       return NULL;
-
-   if (!PEReadNVRAMProperty(kIOSleepWakeDebugKey, 0, &len)) {
-      DLOG("No sleepWakeDebug note to read\n");
-      goto exit;
-   }
-
-   if (len == strlen("sleepimage")) {
-       str[0] = 0;
-       PEReadNVRAMProperty(kIOSleepWakeDebugKey, str, &len);
-
-       if (!strncmp((char*)str, "sleepimage", strlen("sleepimage"))) {
-           DLOG("sleepWakeDebugRetrieve: in file logs\n");
-           swd_flags |= SWD_LOGS_IN_FILE|SWD_VALID_LOGS;
-           goto exit;
-       }
-   }
-   else if (len == sizeof(addr64_t)*3) {
-       PEReadNVRAMProperty(kIOSleepWakeDebugKey, data, &len);
-   }
-   else {
-      DLOG("Invalid sleepWakeDebug note length(%d)\n", len);
-      goto exit;
-   }
-
-
-
-   DLOG("sleepWakeDebugRetrieve: data[0]:0x%llx data[1]:0x%llx data[2]:0x%llx\n",
-        data[0], data[1], data[2]);
-   DLOG("sleepWakeDebugRetrieve: in mem logs\n");
-   bufSize = data[0];
-   crc = data[1];
-   paddr = data[2];
-   if ( (bufSize <= sizeof(swd_hdr)) ||(bufSize > SWD_BUF_SIZE) || (crc == 0) )
-   {
-      IOLog("SleepWake log buffer size is invalid\n");
-      swd_flags |= SWD_BUF_SIZE_ERROR;
-      return NULL;
-   }
-
-   DLOG("size:0x%llx crc:0x%llx paddr:0x%llx\n",
-         bufSize, crc, paddr);
-
-
-   desc = IOMemoryDescriptor::withAddressRange( paddr, bufSize,
-                          kIODirectionOutIn | kIOMemoryMapperNone, NULL);
-   if (desc == NULL)
-   {
-      IOLog("Fail to map SleepWake log buffer\n");
-      swd_flags |= SWD_INTERNAL_FAILURE;
-      goto exit;
-   }
-
-   logBufMap = desc->map();
-
-   vaddr = logBufMap->getVirtualAddress();
-
-
-   if ( (logBufMap->getLength() <= sizeof(swd_hdr)) || (vaddr == NULL) ) {
-      IOLog("Fail to map SleepWake log buffer\n");
-      swd_flags |= SWD_INTERNAL_FAILURE;
-      goto exit;
-   }
-
-   hdr = (swd_hdr *)vaddr;
-   if (hdr->spindump_offset+hdr->spindump_size > bufSize)
-   {
-      IOLog("SleepWake log header size is invalid\n");
-      swd_flags |= SWD_HDR_SIZE_ERROR;
-      goto exit;
-   }
-
-   hdr->crc = crc;
-   newcrc = crc32(0, (void *)((char*)vaddr+hdr->spindump_offset),
-            hdr->spindump_size);
-   if (newcrc != crc) {
-      IOLog("SleepWake log buffer contents are invalid\n");
-      swd_flags |= SWD_DATA_CRC_ERROR;
-      goto exit;
-   }
-
-   ret = true;
-   swd_flags |= SWD_LOGS_IN_MEM | SWD_VALID_LOGS;
-
-
-exit:
-   PERemoveNVRAMProperty(kIOSleepWakeDebugKey);
-   if (!ret) {
-      if (logBufMap) logBufMap->release();
-      logBufMap = 0;
-   }
-   if (desc) desc->release();
-    gRootDomain->swd_lock = 0;
-
-   return logBufMap;
-}
 
 #else
 
@@ -10693,28 +10548,8 @@ void IOPMrootDomain::takeStackshot(bool restart, bool isOSXWatchdog, bool isSpin
 void IOPMrootDomain::sleepWakeDebugMemAlloc( )
 {
 }
-void IOPMrootDomain::sleepWakeDebugDumpFromMem(IOMemoryMap *map)
-{
-}
-errno_t IOPMrootDomain::sleepWakeDebugCopyFile(
-                               struct vnode *srcVp, 
-                               vfs_context_t srcCtx,
-                               char *tmpBuf, uint64_t tmpBufSize,
-                               uint64_t srcOffset, 
-                               const char *dstFname, 
-                               uint64_t numBytes,
-                               uint32_t crc)
-{
-    return EIO;
-}
-
-void IOPMrootDomain::sleepWakeDebugDumpFromFile()
-{
-}
-
-IOMemoryMap *IOPMrootDomain::sleepWakeDebugRetrieve( )
+void IOPMrootDomain::saveFailureData2File( )
 {
-   return NULL;
 }
 
 void IOPMrootDomain::sleepWakeDebugEnableWdog()
diff --git a/iokit/Kernel/IOPerfControl.cpp b/iokit/Kernel/IOPerfControl.cpp
new file mode 100644 (file)
index 0000000..e5ece34
--- /dev/null
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ */
+
+#include <IOKit/perfcontrol/IOPerfControl.h>
+
+#include <stdatomic.h>
+
+#include <kern/thread_group.h>
+
+#undef super
+#define super OSObject
+OSDefineMetaClassAndStructors(IOPerfControlClient, OSObject);
+
+bool IOPerfControlClient::init(IOService *driver, uint64_t maxWorkCapacity)
+{
+    if (!super::init())
+        return false;
+
+    interface = PerfControllerInterface{
+        .version = 0,
+        .registerDevice =
+            [](IOService *device) {
+                return kIOReturnSuccess;
+            },
+        .unregisterDevice =
+            [](IOService *device) {
+                return kIOReturnSuccess;
+            },
+        .workCanSubmit =
+            [](IOService *device, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
+                return false;
+            },
+        .workSubmit =
+            [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkSubmitArgs *args) {
+            },
+        .workBegin =
+            [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkBeginArgs *args) {
+            },
+        .workEnd =
+            [](IOService *device, uint64_t token, PerfControllerInterface::WorkState *state, WorkEndArgs *args, bool done) {
+            },
+    };
+
+    interfaceLock = IOLockAlloc();
+    if (!interfaceLock)
+        goto error;
+
+    deviceRegistrationList = OSSet::withCapacity(4);
+    if (!deviceRegistrationList)
+        goto error;
+
+    bzero(workTable, sizeof(workTable));
+    memset(&workTable[kIOPerfControlClientWorkUntracked], ~0, sizeof(WorkTableEntry));
+    workTableNextIndex = kIOPerfControlClientWorkUntracked + 1;
+
+    workTableLock = IOSimpleLockAlloc();
+    if (!workTableLock)
+        goto error;
+
+    // TODO: check sum(maxWorkCapacities) < table size
+
+    return true;
+
+error:
+    if (interfaceLock)
+        IOLockFree(interfaceLock);
+    if (deviceRegistrationList)
+        deviceRegistrationList->release();
+    if (workTableLock)
+        IOSimpleLockFree(workTableLock);
+    return false;
+}
+
+IOPerfControlClient *_Atomic gSharedClient = nullptr;
+
+IOPerfControlClient *IOPerfControlClient::copyClient(IOService *driver, uint64_t maxWorkCapacity)
+{
+    IOPerfControlClient *client = atomic_load_explicit(&gSharedClient, memory_order_acquire);
+    if (client == nullptr) {
+        IOPerfControlClient *expected = client;
+        client = new IOPerfControlClient;
+        if (!client || !client->init(driver, maxWorkCapacity))
+            panic("could not create IOPerfControlClient");
+        if (!atomic_compare_exchange_strong_explicit(&gSharedClient, &expected, client, memory_order_acq_rel,
+                                                     memory_order_acquire)) {
+            client->release();
+            client = expected;
+        }
+    }
+    // TODO: add maxWorkCapacity to existing client
+    client->retain();
+    return client;
+}
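copyClient() publishes one shared client with a create-then-compare-exchange race, releasing the loser's allocation. A stand-alone C11 sketch of the same pattern (names and the plain refs counter are illustrative, not from the commit):

#include <stdatomic.h>
#include <stdlib.h>

typedef struct client { int refs; } client_t;

static client_t *_Atomic g_shared;

/* Return the process-wide client, creating it on first use. The thread that
 * loses the compare-exchange frees its own allocation and adopts the winner's,
 * mirroring the client->release() path in copyClient(). */
static client_t *copy_shared_client(void)
{
    client_t *cur = atomic_load_explicit(&g_shared, memory_order_acquire);

    if (cur == NULL) {
        client_t *expected = NULL;
        client_t *fresh = calloc(1, sizeof(*fresh));
        if (fresh == NULL)
            return NULL;
        if (atomic_compare_exchange_strong_explicit(&g_shared, &expected, fresh,
                                                    memory_order_acq_rel,
                                                    memory_order_acquire)) {
            cur = fresh;            /* we published our object */
        } else {
            free(fresh);            /* another thread won the race */
            cur = expected;
        }
    }
    cur->refs++;    /* illustrative only; the real code uses retain() */
    return cur;
}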
+
+uint64_t IOPerfControlClient::allocateToken(thread_group *thread_group)
+{
+    uint64_t token = kIOPerfControlClientWorkUntracked;
+
+
+    return token;
+}
+
+void IOPerfControlClient::deallocateToken(uint64_t token)
+{
+}
+
+bool IOPerfControlClient::getEntryForToken(uint64_t token, IOPerfControlClient::WorkTableEntry &entry)
+{
+    if (token == kIOPerfControlClientWorkUntracked)
+        return false;
+
+    if (token >= kWorkTableNumEntries)
+        panic("Invalid work token (%llu): index out of bounds.", token);
+
+    entry = workTable[token];
+    auto *thread_group = entry.thread_group;
+    assertf(thread_group, "Invalid work token: %llu", token);
+    return thread_group != nullptr;
+}
+
+void IOPerfControlClient::markEntryStarted(uint64_t token, bool started)
+{
+    if (token == kIOPerfControlClientWorkUntracked)
+        return;
+
+    if (token >= kWorkTableNumEntries)
+        panic("Invalid work token (%llu): index out of bounds.", token);
+
+    workTable[token].started = started;
+}
+
+IOReturn IOPerfControlClient::registerDevice(__unused IOService *driver, IOService *device)
+{
+    IOReturn ret = kIOReturnSuccess;
+
+    IOLockLock(interfaceLock);
+
+    if (interface.version > 0)
+        ret = interface.registerDevice(device);
+    else
+        deviceRegistrationList->setObject(device);
+
+    IOLockUnlock(interfaceLock);
+
+    return ret;
+}
+
+void IOPerfControlClient::unregisterDevice(__unused IOService *driver, IOService *device)
+{
+    IOLockLock(interfaceLock);
+
+    if (interface.version > 0)
+        interface.unregisterDevice(device);
+    else
+        deviceRegistrationList->removeObject(device);
+
+    IOLockUnlock(interfaceLock);
+}
+
+uint64_t IOPerfControlClient::workSubmit(IOService *device, WorkSubmitArgs *args)
+{
+    return kIOPerfControlClientWorkUntracked;
+}
+
+uint64_t IOPerfControlClient::workSubmitAndBegin(IOService *device, WorkSubmitArgs *submitArgs, WorkBeginArgs *beginArgs)
+{
+    return kIOPerfControlClientWorkUntracked;
+}
+
+void IOPerfControlClient::workBegin(IOService *device, uint64_t token, WorkBeginArgs *args)
+{
+}
+
+void IOPerfControlClient::workEnd(IOService *device, uint64_t token, WorkEndArgs *args, bool done)
+{
+}
+
+IOReturn IOPerfControlClient::registerPerformanceController(PerfControllerInterface pci)
+{
+    IOReturn result = kIOReturnError;
+
+    IOLockLock(interfaceLock);
+
+    if (interface.version == 0 && pci.version > 0) {
+        assert(pci.registerDevice && pci.unregisterDevice && pci.workCanSubmit && pci.workSubmit && pci.workBegin && pci.workEnd);
+        result = kIOReturnSuccess;
+
+        OSObject *obj;
+        while ((obj = deviceRegistrationList->getAnyObject())) {
+            IOService *device = OSDynamicCast(IOService, obj);
+            if (device)
+                pci.registerDevice(device);
+            deviceRegistrationList->removeObject(obj);
+        }
+
+        interface = pci;
+    }
+
+    IOLockUnlock(interfaceLock);
+
+    return result;
+}
index 7372f799ac903e0553e436b9af7df930aa0b7cf6..035177e8fd6f1476509a817e8016520d8761e316 100644 (file)
@@ -1637,48 +1637,6 @@ IOWorkLoop *IOPlatformExpertDevice::getWorkLoop() const
 
 IOReturn IOPlatformExpertDevice::setProperties( OSObject * properties )
 {
-    OSDictionary * dictionary;
-    OSObject *     object;
-    IOReturn       status;
-
-    status = super::setProperties( properties );
-    if ( status != kIOReturnUnsupported ) return status;
-
-    status = IOUserClient::clientHasPrivilege( current_task( ), kIOClientPrivilegeAdministrator );
-    if ( status != kIOReturnSuccess ) return status;
-
-    dictionary = OSDynamicCast( OSDictionary, properties );
-    if ( dictionary == 0 ) return kIOReturnBadArgument;
-
-    object = dictionary->getObject( kIOPlatformUUIDKey );
-    if ( object )
-    {
-        IORegistryEntry * entry;
-        OSString *        string;
-        uuid_t            uuid;
-
-        string = ( OSString * ) getProperty( kIOPlatformUUIDKey );
-        if ( string ) return kIOReturnNotPermitted;
-
-        string = OSDynamicCast( OSString, object );
-        if ( string == 0 ) return kIOReturnBadArgument;
-
-        status = uuid_parse( string->getCStringNoCopy( ), uuid );
-        if ( status != 0 ) return kIOReturnBadArgument;
-
-        entry = IORegistryEntry::fromPath( "/options", gIODTPlane );
-        if ( entry )
-        {
-            entry->setProperty( "platform-uuid", uuid, sizeof( uuid_t ) );
-            entry->release( );
-        }
-
-        setProperty( kIOPlatformUUIDKey, string );
-        publishResource( kIOPlatformUUIDKey, string );
-
-        return kIOReturnSuccess;
-    }
-
     return kIOReturnUnsupported;
 }
 
index 28256a35cc7f70104ffed594f0c9fab721934401..8218c5dde9ed6f430e0f1d7cab9e6e205b12e765 100644 (file)
@@ -584,6 +584,7 @@ IOGetVolumeCryptKey(dev_t block_dev,  OSString ** pKeyUUID,
 
 IOReturn
 IOPolledFileOpen(const char * filename,
+                 uint32_t flags,
                  uint64_t setFileSize, uint64_t fsFreeSize,
                  void * write_file_addr, size_t write_file_len,
                  IOPolledFileIOVars ** fileVars,
@@ -614,7 +615,7 @@ IOPolledFileOpen(const char * filename,
         clock_get_uptime(&startTime);
 
         vars->fileRef = kern_open_file_for_direct_io(filename,
-                                                     (write_file_addr != NULL) || (0 != setFileSize),
+                                                     flags,
                                                      &file_extent_callback, &ctx,
                                                      setFileSize,
                                                      fsFreeSize,
index f07b4231805428004c7417c315a87b4d645e3730..03bb8724f1f3f672ea84a618de44ca48bd90cefd 100644 (file)
@@ -33,7 +33,7 @@
 #include <IOKit/IOTimeStamp.h>
 
 #include <IOKit/IOLib.h>
-
+#include <stdatomic.h>
 #include <IOKit/assert.h>
 
 #include "IOKitKernelInternal.h"
@@ -60,9 +60,10 @@ OSDefineMetaClassAndStructors(IORegistryEntry, OSObject)
 
 struct IORegistryEntry::ExpansionData
 {
-    IORecursiveLock * fLock;
-    uint64_t         fRegistryEntryID;
-    SInt32            fRegistryEntryGenerationCount;
+    IORecursiveLock *        fLock;
+    uint64_t                fRegistryEntryID;
+    SInt32                   fRegistryEntryGenerationCount;
+    OSObject       **_Atomic fIndexedProperties;
 };
 
 
@@ -404,7 +405,15 @@ void IORegistryEntry::free( void )
 
     if (reserved)
     {
-       if (reserved->fLock) IORecursiveLockFree(reserved->fLock);
+       if (reserved->fIndexedProperties)
+       {
+           for (int idx = 0; idx < kIORegistryEntryIndexedPropertyCount; idx++)
+           {
+               if (reserved->fIndexedProperties[idx]) reserved->fIndexedProperties[idx]->release();
+           }
+           IODelete(reserved->fIndexedProperties, OSObject *, kIORegistryEntryIndexedPropertyCount);
+       }
+       if (reserved->fLock)              IORecursiveLockFree(reserved->fLock);
        IODelete(reserved, ExpansionData, 1);
     }
 
@@ -744,6 +753,40 @@ IORegistryEntry::setProperty( const char *      aKey,
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
+OSObject * IORegistryEntry::setIndexedProperty(uint32_t index, OSObject * anObject)
+{
+    OSObject ** array;
+    OSObject *  prior;
+
+    if (index >= kIORegistryEntryIndexedPropertyCount) return (0);
+
+    array = atomic_load_explicit(&reserved->fIndexedProperties, memory_order_acquire);
+    if (!array)
+    {
+       array = IONew(OSObject *, kIORegistryEntryIndexedPropertyCount);
+       if (!array) return (0);
+       bzero(array, kIORegistryEntryIndexedPropertyCount * sizeof(array[0]));
+       if (!OSCompareAndSwapPtr(NULL, array, &reserved->fIndexedProperties)) IODelete(array, OSObject *, kIORegistryEntryIndexedPropertyCount);
+    }
+    if (!reserved->fIndexedProperties) return (0);
+
+    prior = reserved->fIndexedProperties[index];
+    if (anObject) anObject->retain();
+    reserved->fIndexedProperties[index] = anObject;
+
+    return (prior);
+}
+
+OSObject * IORegistryEntry::getIndexedProperty(uint32_t index) const
+{
+    if (index >= kIORegistryEntryIndexedPropertyCount) return (0);
+    if (!reserved->fIndexedProperties) return (0);
+
+    return (reserved->fIndexedProperties[index]);
+}
+
+/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
+
 /* Name, location, paths */
 
 const char * IORegistryEntry::getName( const IORegistryPlane * plane ) const
@@ -886,7 +929,7 @@ IORegistryEntry::compareName( OSString * name, OSString ** matched ) const
     const OSSymbol *   sym = copyName();
     bool               isEqual;
 
-    isEqual = sym->isEqualTo( name );
+    isEqual = (sym && sym->isEqualTo(name));
 
     if( isEqual && matched) {
        name->retain();
index 5d4be71a60f65b42fbd9da7efd34bebdf7a02393..cfb2d9c0f47a567e95a3e76429b524ba4e0d3552 100644 (file)
@@ -33,6 +33,7 @@
 #include <libkern/c++/OSContainers.h>
 #include <libkern/c++/OSKext.h>
 #include <libkern/c++/OSUnserialize.h>
+#include <libkern/Block.h>
 #include <IOKit/IOCatalogue.h>
 #include <IOKit/IOCommand.h>
 #include <IOKit/IODeviceTreeSupport.h>
@@ -613,7 +614,12 @@ void IOService::free( void )
 
     if (_numInterruptSources && _interruptSources)
     {
-       IOFree(_interruptSources, _numInterruptSources * sizeof(IOInterruptSource));
+       for (i = 0; i < _numInterruptSources; i++) {
+           void * block = _interruptSourcesPrivate(this)[i].vectorBlock;
+           if (block) Block_release(block);
+       }
+       IOFree(_interruptSources,
+               _numInterruptSources * sizeofAllIOInterruptSource);
        _interruptSources = 0;
     }
 
@@ -724,7 +730,7 @@ void IOService::detach( IOService * provider )
 
     unlockForArbitration();
 
-    if( newProvider) {
+    if( newProvider && adjParent) {
         newProvider->lockForArbitration();
         newProvider->_adjustBusy(1);
         newProvider->unlockForArbitration();
@@ -1810,12 +1816,45 @@ IONotifier * IOService::registerInterest( const OSSymbol * typeOfInterest,
     return( notify );
 }
 
+
+
+static IOReturn
+IOServiceInterestHandlerToBlock( void * target __unused, void * refCon,
+                                              UInt32 messageType, IOService * provider,
+                                              void * messageArgument, vm_size_t argSize )
+{
+    return ((IOServiceInterestHandlerBlock) refCon)(messageType, provider, messageArgument, argSize);
+}
+
+IONotifier * IOService::registerInterest(const OSSymbol * typeOfInterest,
+                  IOServiceInterestHandlerBlock handler)
+{
+    IONotifier * notify;
+    void       * block;
+
+    block = Block_copy(handler);
+    if (!block) return (NULL);
+
+    notify = registerInterest(typeOfInterest, &IOServiceInterestHandlerToBlock, NULL, block);
+
+    if (!notify) Block_release(block);
+
+    return (notify);
+}
+
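registerInterest() bridges a caller's block to the existing function-pointer interface by heap-copying the block and smuggling it through the refcon, with IOServiceInterestHandlerToBlock as the trampoline. A user-space sketch of the same bridging, assuming Apple's Blocks runtime (<Block.h>, clang -fblocks); register_legacy() is a stand-in for the real registration call:

#include <Block.h>

/* Legacy callback shape: fixed function pointer plus an opaque refcon. */
typedef void (*legacy_cb_t)(void *refcon, int event);
typedef void (^event_block_t)(int event);

/* Trampoline: the refcon carries the heap-copied block, just as the shim
 * above carries it in 'ref'. */
static void block_trampoline(void *refcon, int event)
{
    ((event_block_t)refcon)(event);
}

/* Register a block with an API that only takes function pointers. Returns the
 * copied block so the caller can Block_release() it when unregistering. */
static void *register_block(void (*register_legacy)(legacy_cb_t, void *),
                            event_block_t handler)
{
    void *copied = Block_copy(handler);    /* move the block to the heap */

    if (copied)
        register_legacy(block_trampoline, copied);
    return copied;
}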
 IOReturn IOService::registerInterestForNotifier( IONotifier *svcNotify, const OSSymbol * typeOfInterest,
                   IOServiceInterestHandler handler, void * target, void * ref )
 {
     IOReturn rc = kIOReturnSuccess;
     _IOServiceInterestNotifier  *notify = 0;
 
+    if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify)))
+        return( kIOReturnBadArgument );
+
+    notify->handler = handler;
+    notify->target = target;
+    notify->ref = ref;
+
     if( (typeOfInterest != gIOGeneralInterest)
      && (typeOfInterest != gIOBusyInterest)
      && (typeOfInterest != gIOAppPowerStateInterest)
@@ -1823,15 +1862,9 @@ IOReturn IOService::registerInterestForNotifier( IONotifier *svcNotify, const OS
      && (typeOfInterest != gIOPriorityPowerStateInterest))
         return( kIOReturnBadArgument );
 
-    if (!svcNotify || !(notify = OSDynamicCast(_IOServiceInterestNotifier, svcNotify)))
-        return( kIOReturnBadArgument );
-
     lockForArbitration();
     if( 0 == (__state[0] & kIOServiceInactiveState)) {
 
-        notify->handler = handler;
-        notify->target = target;
-        notify->ref = ref;
         notify->state = kIOServiceNotifyEnable;
 
         ////// queue
@@ -1942,6 +1975,9 @@ void _IOServiceInterestNotifier::wait()
 void _IOServiceInterestNotifier::free()
 {
     assert( queue_empty( &handlerInvocations ));
+
+    if (handler == &IOServiceInterestHandlerToBlock) Block_release(ref);
+
     OSObject::free();
 }
 
@@ -4680,6 +4716,34 @@ IONotifier * IOService::addMatchingNotification(
     return( ret );
 }
 
+static bool
+IOServiceMatchingNotificationHandlerToBlock( void * target __unused, void * refCon,
+                                  IOService * newService,
+                                  IONotifier * notifier )
+{
+    return ((IOServiceMatchingNotificationHandlerBlock) refCon)(newService, notifier);
+}
+
+IONotifier * IOService::addMatchingNotification(
+                            const OSSymbol * type, OSDictionary * matching,
+                            SInt32 priority,
+                            IOServiceMatchingNotificationHandlerBlock handler)
+{
+    IONotifier * notify;
+    void       * block;
+
+    block = Block_copy(handler);
+    if (!block) return (NULL);
+
+    notify = addMatchingNotification(type, matching,
+               &IOServiceMatchingNotificationHandlerToBlock, NULL, block, priority);
+
+    if (!notify) Block_release(block);
+
+    return (notify);
+}
+
+
 bool IOService::syncNotificationHandler(
                        void * /* target */, void * ref,
                        IOService * newService,
@@ -4981,6 +5045,9 @@ void _IOServiceNotifier::wait()
 void _IOServiceNotifier::free()
 {
     assert( queue_empty( &handlerInvocations ));
+
+    if (handler == &IOServiceMatchingNotificationHandlerToBlock) Block_release(ref);
+
     OSObject::free();
 }
 
@@ -6303,10 +6370,11 @@ IOReturn IOService::resolveInterrupt(IOService *nub, int source)
   // Allocate space for the IOInterruptSources if needed... then return early.
   if (nub->_interruptSources == 0) {
     numSources = array->getCount();
-    interruptSources = (IOInterruptSource *)IOMalloc(numSources * sizeof(IOInterruptSource));
+    interruptSources = (IOInterruptSource *)IOMalloc(
+       numSources * sizeofAllIOInterruptSource);
     if (interruptSources == 0) return kIOReturnNoMemory;
     
-    bzero(interruptSources, numSources * sizeof(IOInterruptSource));
+    bzero(interruptSources, numSources * sizeofAllIOInterruptSource);
     
     nub->_numInterruptSources = numSources;
     nub->_interruptSources = interruptSources;
@@ -6353,7 +6421,7 @@ IOReturn IOService::lookupInterrupt(int source, bool resolve, IOInterruptControl
   if (*interruptController == NULL) {
     if (!resolve) return kIOReturnNoInterrupt;
     
-    /* Try to reslove the interrupt */
+    /* Try to resolve the interrupt */
     ret = resolveInterrupt(this, source);
     if (ret != kIOReturnSuccess) return ret;    
     
@@ -6379,16 +6447,49 @@ IOReturn IOService::registerInterrupt(int source, OSObject *target,
                                                refCon);
 }
 
+static void IOServiceInterruptActionToBlock( OSObject * target, void * refCon,
+                   IOService * nub, int source )
+{
+  ((IOInterruptActionBlock)(refCon))(nub, source);
+}
+
+IOReturn IOService::registerInterruptBlock(int source, OSObject *target,
+                                     IOInterruptActionBlock handler)
+{
+    IOReturn ret;
+    void   * block;
+
+    block = Block_copy(handler);
+    if (!block) return (kIOReturnNoMemory);
+
+    ret = registerInterrupt(source, target, &IOServiceInterruptActionToBlock, block);
+    if (kIOReturnSuccess != ret) {
+      Block_release(block);
+      return (ret);
+    }
+    _interruptSourcesPrivate(this)[source].vectorBlock = block;
+
+    return (ret);
+}
+
 IOReturn IOService::unregisterInterrupt(int source)
 {
-  IOInterruptController *interruptController;
   IOReturn              ret;
+  IOInterruptController *interruptController;
+  void                  *block;
   
   ret = lookupInterrupt(source, false, &interruptController);
   if (ret != kIOReturnSuccess) return ret;
   
   /* Unregister the source */
-  return interruptController->unregisterInterrupt(this, source);
+  block = _interruptSourcesPrivate(this)[source].vectorBlock;
+  ret = interruptController->unregisterInterrupt(this, source);
+  if ((kIOReturnSuccess == ret) && (block = _interruptSourcesPrivate(this)[source].vectorBlock)) {
+    _interruptSourcesPrivate(this)[source].vectorBlock = NULL;
+    Block_release(block);
+  }
+
+  return ret;
 }
 
 IOReturn IOService::addInterruptStatistics(IOInterruptAccountingData * statistics, int source)
index 6fc90a603c97334a568a12fe79041a45c9d3ce6f..e6bf85a01b6b5c3d14d860e765a0d1fd163fe752 100644 (file)
@@ -474,6 +474,9 @@ void IOService::PMinit( void )
         {
             fWatchdogTimer = thread_call_allocate(
                   &IOService::watchdog_timer_expired, (thread_call_param_t)this);
+            fWatchdogLock = IOLockAlloc();
+
+            fBlockedArray =  OSArray::withCapacity(4);
         }
 
         fAckTimer = thread_call_allocate(
@@ -544,6 +547,16 @@ void IOService::PMfree( void )
             fWatchdogTimer = NULL;
         }
 
+        if (fWatchdogLock) {
+            IOLockFree(fWatchdogLock);
+            fWatchdogLock = NULL;
+        }
+
+        if (fBlockedArray) {
+            fBlockedArray->release();
+            fBlockedArray = NULL;
+        }
+
         if ( fSettleTimer ) {
             thread_call_cancel(fSettleTimer);
             thread_call_free(fSettleTimer);
@@ -1080,6 +1093,7 @@ IOReturn IOService::removePowerChild( IOPowerConnection * theNub )
             if ( fHeadNotePendingAcks == 0 )
             {
                 stop_ack_timer();
+                getPMRootDomain()->reset_watchdog_timer(this, 0);
 
                 // This parent may have a request in the work queue that is
                 // blocked on fHeadNotePendingAcks=0. And removePowerChild()
@@ -1600,6 +1614,7 @@ bool IOService::handleAcknowledgePowerChange( IOPMRequest * request )
             stop_ack_timer();
             // and now we can continue
             all_acked = true;
+            getPMRootDomain()->reset_watchdog_timer(this, 0);
         }
     } else {
         OUR_PMLog(kPMLogAcknowledgeErr3, 0, 0); // not expecting anybody to ack
@@ -3608,6 +3623,7 @@ void IOService::notifyInterestedDriversDone( void )
     IOItemCount         count;
     DriverCallParam *   param;
     IOReturn            result;
+    int                 maxTimeout = 0;
 
     PM_ASSERT_IN_GATE();
     assert( fDriverCallBusy == false );
@@ -3650,6 +3666,9 @@ void IOService::notifyInterestedDriversDone( void )
                     result = kMinAckTimeoutTicks;
 
                 informee->timer = (result / (ACK_TIMER_PERIOD / ns_per_us)) + 1;
+                if (result > maxTimeout) {
+                    maxTimeout = result;
+                }
             }
             // else, child has already acked or driver has removed interest,
             // and head_note_pendingAcks decremented.
@@ -3665,6 +3684,7 @@ void IOService::notifyInterestedDriversDone( void )
         {
             OUR_PMLog(kPMLogStartAckTimer, 0, 0);
             start_ack_timer();
+            getPMRootDomain()->reset_watchdog_timer(this, maxTimeout/USEC_PER_SEC+1);
         }
     }
 
@@ -3986,6 +4006,7 @@ void IOService::driverSetPowerState( void )
     param = (DriverCallParam *) fDriverCallParamPtr;
     powerState = fHeadNotePowerState;
 
+    callEntry.callMethod = OSMemberFunctionCast(const void *, fControllingDriver, &IOService::setPowerState);
     if (assertPMDriverCall(&callEntry))
     {
         OUR_PMLogFuncStart(kPMLogProgramHardware, (uintptr_t) this, powerState);
@@ -4066,6 +4087,12 @@ void IOService::driverInformPowerChange( void )
         informee = (IOPMinformee *) param->Target;
         driver   = informee->whatObject;
 
+        if (fDriverCallReason == kDriverCallInformPreChange) {
+            callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateWillChangeTo);
+        }
+        else {
+            callEntry.callMethod = OSMemberFunctionCast(const void *, driver, &IOService::powerStateDidChangeTo);
+        }
         if (assertPMDriverCall(&callEntry, 0, informee))
         {
             if (fDriverCallReason == kDriverCallInformPreChange)
@@ -4277,6 +4304,7 @@ void IOService::notifyControllingDriverDone( void )
         {
             OUR_PMLog(kPMLogStartAckTimer, 0, 0);
             start_ack_timer();
+            getPMRootDomain()->reset_watchdog_timer(this, result/USEC_PER_SEC+1);
         }
     }
 
@@ -5311,6 +5339,7 @@ bool IOService::ackTimerTick( void )
                         done = true;
                     }
 #endif
+                    getPMRootDomain()->reset_watchdog_timer(this, 0);
                 } else {
                     // still waiting, set timer again
                     start_ack_timer();
@@ -5392,55 +5421,124 @@ bool IOService::ackTimerTick( void )
 //*********************************************************************************
 void IOService::start_watchdog_timer( void )
 {
-    AbsoluteTime    deadline;
-    boolean_t       pending;
-    static int      timeout = -1;
+    int             timeout;
+    uint64_t        deadline;
 
     if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug))
        return;
 
-    if (thread_call_isactive(fWatchdogTimer)) return;
-    if (timeout == -1) {
-       PE_parse_boot_argn("swd_timeout", &timeout, sizeof(timeout));
-    }
-    if (timeout < 60) {
-       timeout = WATCHDOG_TIMER_PERIOD;
-    }
+    IOLockLock(fWatchdogLock);
 
+    timeout = getPMRootDomain()->getWatchdogTimeout();
     clock_interval_to_deadline(timeout, kSecondScale, &deadline);
+    fWatchdogDeadline = deadline;
+    start_watchdog_timer(deadline);
+    IOLockUnlock(fWatchdogLock);
+}
 
-    retain();
-    pending = thread_call_enter_delayed(fWatchdogTimer, deadline);
-    if (pending) release();
+void IOService::start_watchdog_timer(uint64_t deadline)
+{
+
+    IOLockAssert(fWatchdogLock, kIOLockAssertOwned);
+
+    if (!thread_call_isactive(fWatchdogTimer)) {
+        thread_call_enter_delayed(fWatchdogTimer, deadline);
+    }
 
 }
 
 //*********************************************************************************
 // [private] stop_watchdog_timer
-// Returns true if watchdog was enabled and stopped now
 //*********************************************************************************
 
-bool IOService::stop_watchdog_timer( void )
+void IOService::stop_watchdog_timer( void )
 {
-    boolean_t   pending;
-
     if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug))
-       return false;
+       return;
 
-    pending = thread_call_cancel(fWatchdogTimer);
-    if (pending) release();
+    IOLockLock(fWatchdogLock);
+
+    thread_call_cancel(fWatchdogTimer);
+    fWatchdogDeadline = 0;
+
+    while (fBlockedArray->getCount()) {
+        IOService *obj = OSDynamicCast(IOService, fBlockedArray->getObject(0));
+        if (obj) {
+            PM_ERROR("WDOG:Object %s unexpected in blocked array\n", obj->fName);
+            fBlockedArray->removeObject(0);
+        }
+    }
 
-    return pending;
+    IOLockUnlock(fWatchdogLock);
 }
 
 //*********************************************************************************
 // reset_watchdog_timer
 //*********************************************************************************
 
-void IOService::reset_watchdog_timer( void )
+void IOService::reset_watchdog_timer(IOService *blockedObject, int pendingResponseTimeout)
 {
-    if (stop_watchdog_timer())
-        start_watchdog_timer();
+    unsigned int i;
+    uint64_t    deadline;
+    IOService *obj;
+
+    if (!fWatchdogTimer || (kIOSleepWakeWdogOff & gIOKitDebug))
+        return;
+
+
+    IOLockLock(fWatchdogLock);
+    if (!fWatchdogDeadline) {
+        goto exit;
+    }
+
+    i = fBlockedArray->getNextIndexOfObject(blockedObject, 0);
+    if (pendingResponseTimeout == 0) {
+        blockedObject->fPendingResponseDeadline = 0;
+        if (i == (unsigned int)-1) {
+            goto exit;
+        }
+        fBlockedArray->removeObject(i);
+    }
+    else {
+        // Set deadline 2secs after the expected response timeout to allow
+        // ack timer to handle the timeout.
+        clock_interval_to_deadline(pendingResponseTimeout+2, kSecondScale, &deadline);
+
+        if (i != (unsigned int)-1) {
+            PM_ERROR("WDOG:Object %s is already blocked for responses. Ignoring timeout %d\n",
+                    fName, pendingResponseTimeout);
+            goto exit;
+        }
+
+
+        for (i = 0; i < fBlockedArray->getCount(); i++) {
+            obj = OSDynamicCast(IOService, fBlockedArray->getObject(i));
+            if (obj && (obj->fPendingResponseDeadline < deadline)) {
+                blockedObject->fPendingResponseDeadline = deadline;
+                fBlockedArray->setObject(i, blockedObject);
+                break;
+            }
+        }
+        if (i == fBlockedArray->getCount()) {
+            blockedObject->fPendingResponseDeadline = deadline;
+            fBlockedArray->setObject(blockedObject);
+        }
+    }
+
+    obj = OSDynamicCast(IOService, fBlockedArray->getObject(0));
+    if (!obj) {
+        int timeout = getPMRootDomain()->getWatchdogTimeout();
+        clock_interval_to_deadline(timeout, kSecondScale, &deadline);
+    }
+    else {
+        deadline = obj->fPendingResponseDeadline;
+    }
+
+    thread_call_cancel(fWatchdogTimer);
+    start_watchdog_timer(deadline);
+
+exit:
+    IOLockUnlock(fWatchdogLock);
 }
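
The reworked watchdog above replaces a single fixed-period timer with one thread call that is re-armed from a list of pending response deadlines: reset_watchdog_timer() inserts the blocked service before the first entry whose deadline is earlier, removes it again once it has responded (the pendingResponseTimeout == 0 path), and re-arms the timer from the entry at index 0, falling back to the root domain's default watchdog timeout when the list is empty. A compact sketch of that bookkeeping, with standard containers standing in for OSArray and thread_call (all names here are illustrative):

    #include <cstdint>
    #include <vector>

    struct Watchdog {
        std::vector<uint64_t> deadlines;   // plays the role of fBlockedArray
        uint64_t defaultDeadline = 0;      // plays the role of getWatchdogTimeout()
        uint64_t armedAt = 0;              // deadline the single timer is currently armed with

        void rearm() {
            armedAt = deadlines.empty() ? defaultDeadline : deadlines.front();
        }

        // A client now owes a response by `deadline`: insert it before the first
        // entry that expires earlier, mirroring the loop over fBlockedArray above.
        void block(uint64_t deadline) {
            auto it = deadlines.begin();
            while (it != deadlines.end() && *it >= deadline) ++it;
            deadlines.insert(it, deadline);
            rearm();
        }

        // The client responded (the real code keys this off the blocked IOService,
        // not the deadline value).
        void unblock(uint64_t deadline) {
            for (auto it = deadlines.begin(); it != deadlines.end(); ++it) {
                if (*it == deadline) { deadlines.erase(it); break; }
            }
            rearm();
        }
    };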
 
 
@@ -5493,10 +5591,6 @@ void IOService::start_ack_timer ( UInt32 interval, UInt32 scale )
     pending = thread_call_enter_delayed(fAckTimer, deadline);
     if (pending) release();
 
-    // Stop watchdog if ack is delayed by more than a sec
-    if (interval * scale > kSecondScale) {
-        stop_watchdog_timer();
-    }
 }
 
 //*********************************************************************************
@@ -5509,8 +5603,6 @@ void IOService::stop_ack_timer( void )
 
     pending = thread_call_cancel(fAckTimer);
     if (pending) release();
-
-    start_watchdog_timer();
 }
 
 //*********************************************************************************
@@ -5535,7 +5627,6 @@ IOService::actionAckTimerExpired(
     if (done && gIOPMWorkQueue)
     {
         gIOPMWorkQueue->signalWorkAvailable();
-        me->start_watchdog_timer();
     }
 
     return kIOReturnSuccess;
@@ -5832,6 +5923,9 @@ void IOService::cleanClientResponses( bool logErrors )
         }
     }
 
+    if (IS_ROOT_DOMAIN) {
+        getPMRootDomain()->reset_watchdog_timer(this, 0);
+    }
     if (fResponseArray)
     {
         fResponseArray->release();
@@ -5924,6 +6018,7 @@ bool IOService::tellClientsWithResponse( int messageType )
                }
            }
            context.maxTimeRequested = maxTimeOut;
+            context.enableTracing = isRootDomain;
             applyToInterested( gIOGeneralInterest,
                 pmTellClientWithResponse, (void *) &context );
 
@@ -5972,6 +6067,7 @@ bool IOService::tellClientsWithResponse( int messageType )
         OUR_PMLog(kPMLogStartAckTimer, context.maxTimeRequested, 0);
         if (context.enableTracing) {
             getPMRootDomain()->traceDetail(context.messageType, 0, context.maxTimeRequested / 1000);
+            getPMRootDomain()->reset_watchdog_timer(this, context.maxTimeRequested/USEC_PER_SEC+1);
         }
         start_ack_timer( context.maxTimeRequested / 1000, kMillisecondScale );
         return false;
@@ -6160,13 +6256,19 @@ void IOService::pmTellClientWithResponse( OSObject * object, void * arg )
 
     if (context->enableTracing && (notifier != 0))
     {
-        getPMRootDomain()->traceDetail(notifier);
+        getPMRootDomain()->traceDetail(notifier, true);
     }
 
     clock_get_uptime(&start);
     retCode = context->us->messageClient(msgType, object, (void *) &notify, sizeof(notify));
     clock_get_uptime(&end);
 
+    if (context->enableTracing && (notifier != NULL))
+    {
+        getPMRootDomain()->traceDetail(notifier, false);
+    }
+
+
     if (kIOReturnSuccess == retCode)
     {
         if (0 == notify.returnValue) {
@@ -6373,13 +6475,17 @@ void IOService::pmTellCapabilityClientWithResponse(
 
     if (context->enableTracing && (notifier != 0))
     {
-        getPMRootDomain()->traceDetail(notifier);
+        getPMRootDomain()->traceDetail(notifier, true);
     }
 
     clock_get_uptime(&start);
     retCode = context->us->messageClient(
         msgType, object, (void *) &msgArg, sizeof(msgArg));
     clock_get_uptime(&end);
+    if (context->enableTracing && (notifier != NULL))
+    {
+        getPMRootDomain()->traceDetail(notifier, false);
+    }
 
     if ( kIOReturnSuccess == retCode )
     {
@@ -6490,6 +6596,7 @@ void IOService::tellClients( int messageType )
     context.stateNumber   = fHeadNotePowerState;
     context.stateFlags    = fHeadNotePowerArrayEntry->capabilityFlags;
     context.changeFlags   = fHeadNoteChangeFlags;
+    context.enableTracing = IS_ROOT_DOMAIN;
     context.messageFilter = (IS_ROOT_DOMAIN) ?
                             OSMemberFunctionCast(
                                 IOPMMessageFilter,
@@ -6539,7 +6646,17 @@ static void tellKernelClientApplier( OSObject * object, void * arg )
     notify.stateNumber  = context->stateNumber;
     notify.stateFlags   = context->stateFlags;
 
+    if (context->enableTracing && object)
+    {
+        IOService::getPMRootDomain()->traceDetail(object, true);
+    }
     context->us->messageClient(context->messageType, object, &notify, sizeof(notify));
+    if (context->enableTracing && object)
+    {
+        IOService::getPMRootDomain()->traceDetail(object, false);
+    }
+
+
 
     if ((kIOLogDebugPower & gIOKitDebug) &&
         (OSDynamicCast(_IOServiceInterestNotifier, object)))
@@ -7974,6 +8091,7 @@ bool IOService::actionPMReplyQueue( IOPMRequest * request, IOPMRequestQueue * qu
                 // expected ack, stop the timer
                 stop_ack_timer();
 
+                getPMRootDomain()->reset_watchdog_timer(this, 0);
 
                 uint64_t nsec = computeTimeDeltaNS(&fDriverCallStartTime);
                 if (nsec > LOG_SETPOWER_TIMES) {
@@ -8109,6 +8227,33 @@ void IOService::deassertPMDriverCall( IOPMDriverCallEntry * entry )
         PM_LOCK_WAKEUP(&fPMDriverCallQueue);
 }
 
+bool IOService::getBlockingDriverCall(thread_t *thread, const void **callMethod)
+{
+    const IOPMDriverCallEntry * entry = NULL;
+    bool    blocked = false;
+
+    if (!initialized) {
+        return false;
+    }
+
+    if (current_thread() != gIOPMWatchDogThread) {
+        // Meant to be accessed only from watchdog thread
+        return false;
+    }
+
+    PM_LOCK();
+    entry = qe_queue_first(&fPMDriverCallQueue, IOPMDriverCallEntry, link);
+    if (entry) {
+        *thread = entry->thread;
+        *callMethod = entry->callMethod;
+        blocked = true;
+    }
+    PM_UNLOCK();
+
+    return blocked;
+}
+
+
 void IOService::waitForPMDriverCall( IOService * target )
 {
     const IOPMDriverCallEntry * entry;
index 0dbc58aca37219afad96fb45dc075b9415075635..26bfbee7f8ce44ff7108d7d46a6f12fcfa20a5dc 100644 (file)
@@ -186,6 +186,11 @@ private:
     thread_call_t           WatchdogTimer;
     thread_call_t           SpinDumpTimer;
 
+    IOLock  *               WatchdogLock;
+    OSArray *               BlockedArray;
+    uint64_t                PendingResponseDeadline;
+    uint64_t                WatchdogDeadline;
+
     // Settle time after changing power state.
     uint32_t                SettleTimeUS;
     uint32_t                IdleTimerGeneration;
@@ -360,6 +365,10 @@ private:
 #define fSettleTimer                pwrMgt->SettleTimer
 #define fIdleTimer                  pwrMgt->IdleTimer
 #define fWatchdogTimer              pwrMgt->WatchdogTimer
+#define fWatchdogDeadline           pwrMgt->WatchdogDeadline
+#define fWatchdogLock               pwrMgt->WatchdogLock
+#define fBlockedArray               pwrMgt->BlockedArray
+#define fPendingResponseDeadline    pwrMgt->PendingResponseDeadline
 #define fSpinDumpTimer              pwrMgt->SpinDumpTimer
 #define fSettleTimeUS               pwrMgt->SettleTimeUS
 #define fIdleTimerGeneration        pwrMgt->IdleTimerGeneration
@@ -459,9 +468,11 @@ the ack timer is ticking every tenth of a second.
 #define ACK_TIMER_PERIOD            100000000
 
 #if defined(__i386__) || defined(__x86_64__)
-#define WATCHDOG_TIMER_PERIOD       (300)   // 300 secs
+#define WATCHDOG_SLEEP_TIMEOUT      (180)   // 180 secs
+#define WATCHDOG_WAKE_TIMEOUT       (180)   // 180 secs
 #else
-#define WATCHDOG_TIMER_PERIOD       (180)   // 180 secs
+#define WATCHDOG_SLEEP_TIMEOUT      (180)   // 180 secs
+#define WATCHDOG_WAKE_TIMEOUT       (180)   // 180 secs
 #endif
 
 // Max wait time in microseconds for kernel priority and capability clients
index 7b01220b3be461e84cb74a0ca253bfc1d5837048..5b420452eebcd1fe75e8fb864fdaf8b44d8328de 100644 (file)
@@ -71,7 +71,8 @@ enum {
 // notify state
 enum {
     kIOServiceNotifyEnable     = 0x00000001,
-    kIOServiceNotifyWaiter     = 0x00000002
+    kIOServiceNotifyWaiter     = 0x00000002,
+    kIOServiceNotifyBlock      = 0x00000004
 };
 
 struct _IOServiceNotifierInvocation
@@ -225,5 +226,12 @@ extern const OSSymbol *    gIOConsoleSessionAuditIDKey;
 extern const OSSymbol * gIOConsoleSessionOnConsoleKey;
 extern const OSSymbol * gIOConsoleSessionSecureInputPIDKey;
 
+
+#define _interruptSourcesPrivate(service)   \
+    ((IOInterruptSourcePrivate *)(&(service)->_interruptSources[(service)->_numInterruptSources]))
+
+#define sizeofAllIOInterruptSource          \
+    (sizeof(IOInterruptSourcePrivate) + sizeof(IOInterruptSource))
+
 #endif /* ! _IOKIT_IOSERVICEPRIVATE_H */
 
index 17656644ad104e1882528a49dc78cda2b0ac7b5d..385393f65c1bb7d34538870f0391859f6264d773 100644 (file)
@@ -285,18 +285,28 @@ Boolean IOSharedDataQueue::enqueue(void * data, UInt32 dataSize)
         }
     }
 
-    // Update tail with release barrier
-    __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE);
-    
-    // Send notification (via mach message) that data is available.
-    
-    if ( ( tail == head )                                                   /* queue was empty prior to enqueue() */
-      || ( tail == __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_ACQUIRE) ) )   /* queue was emptied during enqueue() */
-    {
-        sendDataAvailableNotification();
-    }
-    
-    return true;
+       // Publish the data we just enqueued
+       __c11_atomic_store((_Atomic UInt32 *)&dataQueue->tail, newTail, __ATOMIC_RELEASE);
+
+       if (tail != head) {
+               //
+               // The memory barrier below pairs with the one in ::dequeue
+               // so that either our store to the tail cannot be missed by
+               // the next dequeue attempt, or we will observe the dequeuer
+               // making the queue empty.
+               //
+               // Of course, if we already think the queue is empty,
+               // there's no point paying this extra cost.
+               //
+               __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+               head = __c11_atomic_load((_Atomic UInt32 *)&dataQueue->head, __ATOMIC_RELAXED);
+       }
+
+       if (tail == head) {
+               // Send notification (via mach message) that data is now available.
+               sendDataAvailableNotification();
+       }
+       return true;
 }
 
 Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
@@ -308,7 +318,7 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
     UInt32              tailOffset      = 0;
     UInt32              newHeadOffset   = 0;
 
-    if (!dataQueue) {
+       if (!dataQueue || (data && !dataSize)) {
         return false;
     }
 
@@ -356,30 +366,30 @@ Boolean IOSharedDataQueue::dequeue(void *data, UInt32 *dataSize)
             }
             newHeadOffset   = headOffset + entrySize + DATA_QUEUE_ENTRY_HEADER_SIZE;
         }
-    }
-
-    if (entry) {
-        if (data) {
-            if (dataSize) {
-                if (entrySize <= *dataSize) {
-                    memcpy(data, &(entry->data), entrySize);
-                    __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE);
-                } else {
-                    retVal = FALSE;
-                }
-            } else {
-                retVal = FALSE;
-            }
-        } else {
-            __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE);
-        }
-
-        if (dataSize) {
-            *dataSize = entrySize;
-        }
-    } else {
-        retVal = FALSE;
-    }
+       } else {
+               // empty queue
+               return false;
+       }
+
+       if (data) {
+               if (entrySize > *dataSize) {
+                       // not enough space
+                       return false;
+               }
+               memcpy(data, &(entry->data), entrySize);
+               *dataSize = entrySize;
+       }
+
+       __c11_atomic_store((_Atomic UInt32 *)&dataQueue->head, newHeadOffset, __ATOMIC_RELEASE);
+
+       if (newHeadOffset == tailOffset) {
+               //
+               // If we are making the queue empty, then we need to make sure
+               // that either the enqueuer notices, or we notice the enqueue
+               // that raced with our making of the queue empty.
+               //
+               __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
+       }
     
     return retVal;
 }
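
The rewritten enqueue/dequeue above guards the "did the queue just become empty?" check with a Dekker-style handshake: each side publishes its index with a release store, then issues a full fence only when the queue may have just become (or stopped being) empty, so either the producer observes the consumer draining the queue and sends the wakeup, or the consumer observes the freshly published tail. A stand-alone sketch of that pairing, with std::atomic in place of the __c11_atomic_* builtins; single producer and single consumer are assumed, and all names are illustrative.

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    static std::atomic<uint32_t> head{0}, tail{0};

    static void notifyConsumer() { std::printf("data available\n"); }  // stands in for the mach message

    // Producer: called after the new entry's bytes have been written into the ring.
    static void publish(uint32_t oldTail, uint32_t newTail, uint32_t headAtStart)
    {
        tail.store(newTail, std::memory_order_release);      // make the entry visible

        uint32_t h = headAtStart;
        if (oldTail != h) {
            // Queue looked non-empty: pair a full fence with the consumer's fence so
            // either it cannot miss our tail store, or we see it emptying the queue.
            std::atomic_thread_fence(std::memory_order_seq_cst);
            h = head.load(std::memory_order_relaxed);
        }
        if (oldTail == h) notifyConsumer();                   // it was empty, or just became empty
    }

    // Consumer: called after the entry has been copied out.
    static void retire(uint32_t newHead, uint32_t tailAtStart)
    {
        head.store(newHead, std::memory_order_release);       // free the slot
        if (newHead == tailAtStart) {
            // We think we emptied the queue: fence so a racing producer cannot miss
            // our head store while we simultaneously miss its tail store.
            std::atomic_thread_fence(std::memory_order_seq_cst);
        }
    }

    int main()
    {
        publish(0, 1, 0);    // enqueue into an empty queue -> wakeup is sent
        retire(1, 1);        // dequeue the entry; the queue is empty again
        return 0;
    }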
index f3771602a03c174a6316689a1e9b3857719336ae..141eecabf632b8387a9f77f6cf79de7deab6a156 100644 (file)
@@ -762,7 +762,7 @@ int IOStatistics::getWorkLoopStatistics(sysctl_req *req)
                error = ENOMEM;
                goto exit;
        }
-
+       memset(buffer, 0, calculatedSize);
        header = (IOStatisticsWorkLoopHeader*)((void*)buffer);
        
        header->sig = IOSTATISTICS_SIG_WORKLOOP;
@@ -827,7 +827,7 @@ int IOStatistics::getUserClientStatistics(sysctl_req *req)
                error = ENOMEM;
                goto exit;
        }
-
+       memset(buffer, 0, calculatedSize);
        header = (IOStatisticsUserClientHeader*)((void*)buffer);
 
        header->sig = IOSTATISTICS_SIG_USERCLIENT;
index c4f9458feab9f32386c485cf486fa08e4c317570..6890305d67e15fa4680b36fea3fa78ced4268a6c 100644 (file)
@@ -475,17 +475,18 @@ strtouq(const char *nptr,
 char *
 strncat(char *s1, const char *s2, unsigned long n)
 {
-       char *os1;
-       int i = n;
+       if (n != 0) {
+               char *d = s1;
+               const char *s = s2;
 
-       os1 = s1;
-       while (*s1++)
-               ;
-       --s1;
-       while ((*s1++ = *s2++))
-               if (--i < 0) {
-                       *--s1 = '\0';
-                       break;
-               }
-       return(os1);
+               while (*d != 0)
+                       d++;
+               do {
+                       if ((*d = *s++) == '\0')
+                               break;
+                       d++;
+               } while (--n != 0);
+               *d = '\0';
+       }
+       return (s1);
 }
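
The strncat() above is essentially the classic BSD implementation: it appends at most n bytes from s2 and always writes a terminating NUL, so the destination must have room for strlen(s1) + n + 1 bytes. A small user-space usage sketch (buffer contents are illustrative):

    #include <cstdio>
    #include <cstring>

    int main()
    {
        char buf[16] = "power";
        strncat(buf, " manager", 4);    // appends " man" (4 bytes), then the terminating '\0'
        std::printf("%s\n", buf);       // prints "power man"
        return 0;
    }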
index eedde763ea78e7191a86c12cd8d279990c8c7137..22987d8b82975c845759ca0cbacfff6e52050e2b 100644 (file)
@@ -45,6 +45,9 @@ __END_DECLS
 #include <mach/sdt.h>
 #endif
 
+#include <libkern/Block.h>
+
+
 #define super IOEventSource
 OSDefineMetaClassAndStructors(IOTimerEventSource, IOEventSource)
 OSMetaClassDefineReservedUsed(IOTimerEventSource, 0);
@@ -96,8 +99,8 @@ do { \
 // the timeout interval expires.
 //
 
-static __inline__ void
-InvokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts,
+__inline__ void
+IOTimerEventSource::invokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts,
             OSObject * owner, IOWorkLoop * workLoop)
 {
     bool    trace = (gIOKitTrace & kIOTraceTimers) ? true : false;
@@ -106,7 +109,8 @@ InvokeAction(IOTimerEventSource::Action action, IOTimerEventSource * ts,
        IOTimeStampStartConstant(IODBG_TIMES(IOTIMES_ACTION),
                                 VM_KERNEL_ADDRHIDE(action), VM_KERNEL_ADDRHIDE(owner));
 
-    (*action)(owner, ts);
+    if (kActionBlock & flags) ((IOTimerEventSource::ActionBlock) actionBlock)(ts);
+    else                      (*action)(owner, ts);
 
 #if CONFIG_DTRACE
     DTRACE_TMR3(iotescallout__expire, Action, action, OSObject, owner, void, workLoop);
@@ -135,7 +139,7 @@ void IOTimerEventSource::timeout(void *self)
             doit = (Action) me->action;
             if (doit && me->enabled && AbsoluteTime_to_scalar(&me->abstime))
             {
-                InvokeAction(doit, me, me->owner, me->workLoop);
+                me->invokeAction(doit, me, me->owner, me->workLoop);
             }
             IOStatisticsOpenGate();
             wl->openGate();
@@ -164,7 +168,7 @@ void IOTimerEventSource::timeoutAndRelease(void * self, void * c)
             doit = (Action) me->action;
             if (doit && (me->reserved->calloutGeneration == count))
             {
-                InvokeAction(doit, me, me->owner, me->workLoop);
+                me->invokeAction(doit, me, me->owner, me->workLoop);
             }
             IOStatisticsOpenGate();
             wl->openGate();
@@ -186,7 +190,7 @@ bool IOTimerEventSource::checkForWork()
      && enabled && (doit = (Action) action))
     {
        reserved->calloutGenerationSignaled = ~reserved->calloutGeneration;
-       InvokeAction(doit, this, owner, workLoop);
+       invokeAction(doit, this, owner, workLoop);
     }
 
     return false;
@@ -303,6 +307,16 @@ IOTimerEventSource::timerEventSource(uint32_t inOptions, OSObject *inOwner, Acti
     return me;
 }
 
+IOTimerEventSource *
+IOTimerEventSource::timerEventSource(uint32_t options, OSObject *inOwner, ActionBlock action)
+{
+    IOTimerEventSource * tes;
+    tes = IOTimerEventSource::timerEventSource(options, inOwner, (Action) NULL);
+    if (tes) tes->setActionBlock((IOEventSource::ActionBlock) action);
+
+    return tes;
+}
+
 #define _thread_call_cancel(tc)   ((kActive & flags) ? thread_call_cancel_wait((tc)) : thread_call_cancel((tc)))
 
 IOTimerEventSource *
index d331cb2b0a3af0e86dcd791abe62137a82af9c00..b99f027ddd6fa28a952469428e29dd759f473360 100644 (file)
@@ -383,6 +383,7 @@ public:
     virtual void reset() APPLE_KEXT_OVERRIDE;
     virtual bool isValid() APPLE_KEXT_OVERRIDE;
     virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE;
+    virtual OSObject * copyNextObject();
 };
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -471,11 +472,20 @@ IOUserIterator::isValid()
 OSObject *
 IOUserIterator::getNextObject()
 {
-    OSObject * ret;
+    assert(false);
+    return (NULL);
+}
+
+OSObject *
+IOUserIterator::copyNextObject()
+{
+    OSObject * ret = NULL;
 
     IOLockLock(lock);
-    assert(OSDynamicCast(OSIterator, userIteratorObject));
-    ret = ((OSIterator *)userIteratorObject)->getNextObject();
+    if (userIteratorObject) {
+        ret = ((OSIterator *)userIteratorObject)->getNextObject();
+        if (ret) ret->retain();
+    }
     IOLockUnlock(lock);
 
     return (ret);
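
IOUserIterator::copyNextObject() above takes the lock, advances the wrapped iterator and retains the result before unlocking, so the user-client path receives a reference it owns even if the registry mutates underneath it; getNextObject() is now asserted unreachable. A rough user-space analogue of that "copy out a retained reference under the lock" pattern, with std::shared_ptr standing in for retain/release (all names are illustrative):

    #include <cstdio>
    #include <memory>
    #include <mutex>
    #include <vector>

    struct Entry { int id; };

    class UserIterator {
        std::mutex lock;
        std::vector<std::shared_ptr<Entry>> entries;
        size_t next = 0;
    public:
        explicit UserIterator(std::vector<std::shared_ptr<Entry>> v) : entries(std::move(v)) {}

        // Analogous to copyNextObject(): the shared_ptr copy is made while the lock
        // is held, so the caller keeps the object alive on its own.
        std::shared_ptr<Entry> copyNextObject()
        {
            std::lock_guard<std::mutex> g(lock);
            if (next >= entries.size()) return nullptr;
            return entries[next++];
        }
    };

    int main()
    {
        UserIterator it({ std::make_shared<Entry>(Entry{1}), std::make_shared<Entry>(Entry{2}) });
        while (auto e = it.copyNextObject()) std::printf("entry %d\n", e->id);
        return 0;
    }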
@@ -616,7 +626,6 @@ class IOServiceUserNotification : public IOUserNotification
     PingMsg    *       pingMsg;
     vm_size_t          msgSize;
     OSArray    *       newSet;
-    OSObject   *       lastEntry;
     bool               armed;
     bool                ipcLogged;
 
@@ -633,6 +642,7 @@ public:
     virtual bool handler( void * ref, IOService * newService );
 
     virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE;
+    virtual OSObject * copyNextObject() APPLE_KEXT_OVERRIDE;
 };
 
 class IOServiceMessageUserNotification : public IOUserNotification
@@ -670,6 +680,7 @@ public:
                               void * messageArgument, vm_size_t argSize );
 
     virtual OSObject * getNextObject() APPLE_KEXT_OVERRIDE;
+    virtual OSObject * copyNextObject() APPLE_KEXT_OVERRIDE;
 };
 
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
@@ -776,11 +787,9 @@ void IOServiceUserNotification::free( void )
     PingMsg   *        _pingMsg;
     vm_size_t  _msgSize;
     OSArray   *        _newSet;
-    OSObject  *        _lastEntry;
 
     _pingMsg   = pingMsg;
     _msgSize   = msgSize;
-    _lastEntry = lastEntry;
     _newSet    = newSet;
 
     super::free();
@@ -792,9 +801,6 @@ void IOServiceUserNotification::free( void )
         IOFree(_pingMsg, _msgSize);
        }
 
-    if( _lastEntry)
-        _lastEntry->release();
-
     if( _newSet)
         _newSet->release();
 }
@@ -850,16 +856,19 @@ bool IOServiceUserNotification::handler( void * ref,
 
     return( true );
 }
-
 OSObject * IOServiceUserNotification::getNextObject()
+{
+    assert(false);
+    return (NULL);
+}
+
+OSObject * IOServiceUserNotification::copyNextObject()
 {
     unsigned int       count;
     OSObject *         result;
-    OSObject *         releaseEntry;
 
     IOLockLock(lock);
 
-    releaseEntry = lastEntry;
     count = newSet->getCount();
     if( count ) {
         result = newSet->getObject( count - 1 );
@@ -869,12 +878,9 @@ OSObject * IOServiceUserNotification::getNextObject()
         result = 0;
         armed = true;
     }
-    lastEntry = result;
 
     IOLockUnlock(lock);
 
-    if (releaseEntry) releaseEntry->release();
-
     return( result );
 }
 
@@ -1068,6 +1074,11 @@ OSObject * IOServiceMessageUserNotification::getNextObject()
     return( 0 );
 }
 
+OSObject * IOServiceMessageUserNotification::copyNextObject()
+{
+    return( NULL );
+}
+
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 #undef super
@@ -1438,7 +1449,7 @@ IOUserClient::registerOwner(task_t task)
     if (newOwner)
     {
         owner = IONew(IOUserClientOwner, 1);
-        if (!newOwner) ret = kIOReturnNoMemory;
+        if (!owner) ret = kIOReturnNoMemory;
         else
         {
             owner->task = task;
@@ -2049,12 +2060,24 @@ kern_return_t is_io_iterator_next(
 {
     IOReturn    ret;
     OSObject * obj;
+    OSIterator * iter;
+    IOUserIterator * uiter;
 
-    CHECK( OSIterator, iterator, iter );
+       if ((uiter = OSDynamicCast(IOUserIterator, iterator)))
+       {
+               obj = uiter->copyNextObject();
+       }
+       else if ((iter = OSDynamicCast(OSIterator, iterator)))
+       {
+               obj = iter->getNextObject();
+               if (obj) obj->retain();
+       }
+       else
+       {
+           return( kIOReturnBadArgument );
+       }
 
-    obj = iter->getNextObject();
     if( obj) {
-       obj->retain();
        *object = obj;
         ret = kIOReturnSuccess;
     } else
@@ -3292,8 +3315,8 @@ kern_return_t is_io_registry_entry_get_child_iterator(
 {
     CHECK( IORegistryEntry, registry_entry, entry );
 
-    *iterator = entry->getChildIterator(
-    IORegistryEntry::getPlane( plane ));
+    *iterator = IOUserIterator::withIterator(entry->getChildIterator(
+    IORegistryEntry::getPlane( plane )));
 
     return( kIOReturnSuccess );
 }
@@ -3306,8 +3329,8 @@ kern_return_t is_io_registry_entry_get_parent_iterator(
 {
     CHECK( IORegistryEntry, registry_entry, entry );
 
-    *iterator = entry->getParentIterator(
-       IORegistryEntry::getPlane( plane ));
+    *iterator = IOUserIterator::withIterator(entry->getParentIterator(
+       IORegistryEntry::getPlane( plane )));
 
     return( kIOReturnSuccess );
 }
@@ -4944,7 +4967,7 @@ kern_return_t is_io_catalog_send_data(
         return kIOReturnBadArgument;
     }
 
-    if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-management"))
+    if (!IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management"))
     {
         OSString * taskName = IOCopyLogNameForPID(proc_selfpid());
         IOLog("IOCatalogueSendData(%s): Not entitled\n", taskName ? taskName->getCStringNoCopy() : "");
index a5eb8518145a35c8328f575d97ad0d11aa6a135b..74efbd0cd8b2eac242a0073ff8901f9bf34258f7 100644 (file)
@@ -416,11 +416,14 @@ restartThread:
     } while(workToDo);
 
 exitThread:
-       thread_t thread = workThread;
+    closeGate();
+    thread_t thread = workThread;
     workThread = 0;    // Say we don't have a loop and free ourselves
+    openGate();
+
     free();
 
-       thread_deallocate(thread);
+    thread_deallocate(thread);
     (void) thread_terminate(thread);
 }
 
@@ -494,6 +497,18 @@ void IOWorkLoop::wakeupGate(void *event, bool oneThread)
     IORecursiveLockWakeup(gateLock, event, oneThread);
 }
 
+static IOReturn IOWorkLoopActionToBlock(OSObject *owner,
+                              void *arg0, void *arg1,
+                              void *arg2, void *arg3)
+{
+    return ((IOWorkLoop::ActionBlock) arg0)();
+}
+
+IOReturn IOWorkLoop::runActionBlock(ActionBlock action)
+{
+    return (runAction(&IOWorkLoopActionToBlock, this, action));
+}
+
 IOReturn IOWorkLoop::runAction(Action inAction, OSObject *target,
                                   void *arg0, void *arg1,
                                   void *arg2, void *arg3)
index d1ba499d81f9dc6a9ea4d47322c44f31f9c3703a..dca24997b2133bca767ed9b5d886dd6e79fe5066 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2018 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 1998-2000 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -29,7 +29,7 @@
  *
  */
 
-#define TEST_HEADERS   0
+#define TEST_HEADERS    0
 
 #if TEST_HEADERS
 
 #include <libkern/c++/OSData.h>
 #include "Tests.h"
 
-#include <IOKit/IOTimerEventSource.h>
-#include <IOKit/IOWorkLoop.h>
 
 #if DEVELOPMENT || DEBUG
 
+#include <IOKit/IOWorkLoop.h>
+#include <IOKit/IOTimerEventSource.h>
+#include <IOKit/IOInterruptEventSource.h>
+#include <IOKit/IOCommandGate.h>
+#include <IOKit/IOPlatformExpert.h>
+#include <libkern/Block.h>
+#include <libkern/Block_private.h>
+
 static uint64_t gIOWorkLoopTestDeadline;
 
 static void
@@ -197,6 +203,7 @@ IOWorkLoopTest(int newValue)
     uint32_t idx;
     IOWorkLoop * wl;
     IOTimerEventSource * tes;
+    IOInterruptEventSource * ies;
 
     wl = IOWorkLoop::workLoop();
     assert(wl);
@@ -204,7 +211,7 @@ IOWorkLoopTest(int newValue)
     assert(tes);
     err = wl->addEventSource(tes);
     assert(kIOReturnSuccess == err);
-    clock_interval_to_deadline(2000, kMillisecondScale, &gIOWorkLoopTestDeadline);
+    clock_interval_to_deadline(100, kMillisecondScale, &gIOWorkLoopTestDeadline);
     for (idx = 0; mach_absolute_time() < gIOWorkLoopTestDeadline; idx++)
     {
        tes->setTimeout(idx & 1023, kNanosecondScale);
@@ -212,11 +219,166 @@ IOWorkLoopTest(int newValue)
     tes->cancelTimeout();
     wl->removeEventSource(tes);
     tes->release();
+
+    int value = 3;
+
+    tes = IOTimerEventSource::timerEventSource(kIOTimerEventSourceOptionsDefault, wl, ^(IOTimerEventSource * tes){
+       kprintf("wl %p, value %d\n", wl, value);
+    });
+    err = wl->addEventSource(tes);
+    assert(kIOReturnSuccess == err);
+
+    value = 2;
+    tes->setTimeout(1, kNanosecondScale);
+    IOSleep(1);
+    wl->removeEventSource(tes);
+    tes->release();
+
+    ies = IOInterruptEventSource::interruptEventSource(wl, NULL, 0, ^void(IOInterruptEventSource *sender, int count){
+       kprintf("ies block %p, %d\n", sender, count);
+    });
+
+    assert(ies);
+    kprintf("ies %p\n", ies);
+    err = wl->addEventSource(ies);
+    assert(kIOReturnSuccess == err);
+    ies->interruptOccurred(NULL, NULL, 0);
+    IOSleep(1);
+    ies->interruptOccurred(NULL, NULL, 0);
+    IOSleep(1);
+    wl->removeEventSource(ies);
+    ies->release();
+
     wl->release();
 
     return (0);
 }
 
+static int
+OSCollectionTest(int newValue)
+{
+    OSArray * array = OSArray::withCapacity(8);
+    array->setObject(kOSBooleanTrue);
+    array->setObject(kOSBooleanFalse);
+    array->setObject(kOSBooleanFalse);
+    array->setObject(kOSBooleanTrue);
+    array->setObject(kOSBooleanFalse);
+    array->setObject(kOSBooleanTrue);
+
+    __block unsigned int index;
+    index = 0;
+    array->iterateObjects(^bool(OSObject * obj) {
+       kprintf("%d:%d ", index, (obj == kOSBooleanTrue) ? 1 : (obj == kOSBooleanFalse) ? 0 : 2);
+       index++;
+       return (false);
+    });
+    kprintf("\n");
+    array->release();
+
+    OSDictionary * dict = IOService::resourceMatching("hello");
+    assert(dict);
+    index = 0;
+    dict->iterateObjects(^bool(const OSSymbol * sym, OSObject * obj) {
+       OSString * str = OSDynamicCast(OSString, obj);
+       assert(str);
+       kprintf("%d:%s=%s\n", index, sym->getCStringNoCopy(), str->getCStringNoCopy());
+       index++;
+       return (false);
+    });
+    dict->release();
+
+    OSSerializer * serializer = OSSerializer::withBlock(^bool(OSSerialize * s){
+       return (gIOBSDUnitKey->serialize(s));
+    });
+    assert(serializer);
+    IOService::getPlatform()->setProperty("OSSerializer_withBlock", serializer);
+    serializer->release();
+
+    return (0);
+}
+
+#if 0
+#include <IOKit/IOUserClient.h>
+class TestUserClient : public IOUserClient
+{
+    OSDeclareDefaultStructors(TestUserClient);
+    virtual void stop( IOService *provider) APPLE_KEXT_OVERRIDE;
+    virtual bool finalize(IOOptionBits options) APPLE_KEXT_OVERRIDE;
+    virtual IOReturn externalMethod( uint32_t selector,
+                    IOExternalMethodArguments * arguments,
+                    IOExternalMethodDispatch * dispatch,
+                    OSObject * target,
+                    void * reference ) APPLE_KEXT_OVERRIDE;
+};
+
+void TestUserClient::stop( IOService *provider)
+{
+    kprintf("TestUserClient::stop\n");
+}
+bool TestUserClient::finalize(IOOptionBits options)
+{
+    kprintf("TestUserClient::finalize\n");
+    return(true);
+}
+IOReturn TestUserClient::externalMethod( uint32_t selector,
+                IOExternalMethodArguments * arguments,
+                IOExternalMethodDispatch * dispatch,
+                OSObject * target,
+                void * reference )
+{
+    getProvider()->terminate();
+    IOSleep(500);
+    return (0);
+}
+OSDefineMetaClassAndStructors(TestUserClient, IOUserClient);
+#endif
+
+static int
+IOServiceTest(int newValue)
+{
+    OSDictionary      * matching;
+    IONotifier        * note;
+    __block IOService * found;
+
+#if 0
+    found = new IOService;
+    found->init();
+    found->setName("IOTestUserClientProvider");
+    found->attach(IOService::getPlatform());
+    found->setProperty("IOUserClientClass", "TestUserClient");
+    found->registerService();
+#endif
+
+    matching = IOService::serviceMatching("IOPlatformExpert");
+    assert(matching);
+    found = nullptr;
+    note = IOService::addMatchingNotification(gIOMatchedNotification, matching, 0,
+       ^bool(IOService * newService, IONotifier * notifier) {
+           kprintf("found %s, %d\n", newService->getName(), newService->getRetainCount());
+           found = newService;
+           found->retain();
+           return (true);
+       }
+    );
+    assert(note);
+    assert(found);
+    matching->release();
+    note->remove();
+
+    note = found->registerInterest(gIOBusyInterest,
+       ^IOReturn(uint32_t messageType, IOService * provider,
+                 void   * messageArgument, size_t argSize) {
+       kprintf("%p messageType 0x%08x %p\n", provider, messageType, messageArgument);
+       return (kIOReturnSuccess);
+    });
+    assert(note);
+    IOSleep(1*1000);
+    note->remove();
+    found->release();
+
+    return (0);
+}
+
 #endif  /* DEVELOPMENT || DEBUG */
 
 static int
@@ -229,9 +391,34 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
     if (error) return (error);
 
 #if DEVELOPMENT || DEBUG
+    if (changed && (66==newValue))
+    {
+       IOReturn ret;
+       IOWorkLoop * wl = IOWorkLoop::workLoop();
+       IOCommandGate * cg = IOCommandGate::commandGate(wl);
+       ret = wl->addEventSource(cg);
+
+       struct x
+       {
+           uint64_t h;
+           uint64_t l;
+       };
+       struct x y;
+
+       y.h = 0x1111111122222222;
+       y.l = 0x3333333344444444;
+
+       kprintf("ret1 %d\n", ret);
+       ret = cg->runActionBlock(^(){
+           printf("hello %d 0x%qx\n", wl->inGate(), y.h);
+           return 99;
+       });
+       kprintf("ret %d\n", ret);
+    }
+
     if (changed && (999==newValue))
     {
-       OSData * data = OSData::withCapacity(16);
+       OSData * data = OSData::withCapacity(16);
        data->release();
        data->release();
     }
@@ -241,6 +428,10 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
     {
        error = IOWorkLoopTest(newValue);
        assert(KERN_SUCCESS == error);
+       error = IOServiceTest(newValue);
+       assert(KERN_SUCCESS == error);
+       error = OSCollectionTest(newValue);
+       assert(KERN_SUCCESS == error);
        error = IOMemoryDescriptorTest(newValue);
        assert(KERN_SUCCESS == error);
     }
@@ -250,7 +441,7 @@ sysctl_iokittest(__unused struct sysctl_oid *oidp, __unused void *arg1, __unused
 }
 
 SYSCTL_PROC(_kern, OID_AUTO, iokittest,
-        CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
-        0, 0, sysctl_iokittest, "I", "");
+       CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NOAUTO | CTLFLAG_KERN | CTLFLAG_LOCKED,
+       0, 0, sysctl_iokittest, "I", "");
 
 
index 27ef74433357d6663bdb543dd060c6b1160a2471..7c0e5e9a763341001f07b7809b52a09bd7cf476f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2011 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -38,6 +38,7 @@ extern "C" {
 
 #include <pexpert/pexpert.h>
 #include <kern/clock.h>
+#include <mach/machine.h>
 #include <uuid/uuid.h>
 #include <sys/vnode_internal.h>
 #include <sys/mount.h>
@@ -57,30 +58,33 @@ extern void mdevremoveall(void);
 extern int mdevgetrange(int devid, uint64_t *base, uint64_t *size);
 extern void di_root_ramfile(IORegistryEntry * entry);
 
-
 #if CONFIG_EMBEDDED
 #define IOPOLLED_COREFILE      (CONFIG_KDP_INTERACTIVE_DEBUGGING)
 
 #if defined(XNU_TARGET_OS_BRIDGE)
+
 #define kIOCoreDumpSize         150ULL*1024ULL*1024ULL
 // leave free space on volume:
 #define kIOCoreDumpFreeSize     150ULL*1024ULL*1024ULL
 #define kIOCoreDumpPath         "/private/var/internal/kernelcore"
-#else
-#define kIOCoreDumpSize         350ULL*1024ULL*1024ULL
+
+#else /* defined(XNU_TARGET_OS_BRIDGE) */
+#define kIOCoreDumpMinSize      350ULL*1024ULL*1024ULL
+#define kIOCoreDumpLargeSize    500ULL*1024ULL*1024ULL
 // leave free space on volume:
 #define kIOCoreDumpFreeSize     350ULL*1024ULL*1024ULL
 #define kIOCoreDumpPath         "/private/var/vm/kernelcore"
-#endif
 
-#elif DEVELOPMENT
+#endif /* defined(XNU_TARGET_OS_BRIDGE) */
+
+#elif DEVELOPMENT /* CONFIG_EMBEDDED */
 #define IOPOLLED_COREFILE      1
 // no sizing
 #define kIOCoreDumpSize                0ULL
 #define kIOCoreDumpFreeSize    0ULL
-#else
+#else /* CONFIG_EMBEDDED */
 #define IOPOLLED_COREFILE      0
-#endif
+#endif /* CONFIG_EMBEDDED */
 
 
 #if IOPOLLED_COREFILE
@@ -764,7 +768,7 @@ kern_return_t IOBSDGetPlatformUUID( uuid_t uuid, mach_timespec_t timeout )
 #include <IOKit/IOBufferMemoryDescriptor.h>
 
 IOPolledFileIOVars * gIOPolledCoreFileVars;
-
+kern_return_t gIOPolledCoreFileOpenRet = kIOReturnNotReady;
 #if IOPOLLED_COREFILE
 
 static IOReturn 
@@ -772,6 +776,7 @@ IOOpenPolledCoreFile(const char * filename)
 {
     IOReturn err;
     unsigned int debug;
+    uint64_t corefile_size_bytes = 0;
 
     if (gIOPolledCoreFileVars)                             return (kIOReturnBusy);
     if (!IOPolledInterface::gMetaClass.getInstanceCount()) return (kIOReturnUnsupported);
@@ -780,15 +785,89 @@ IOOpenPolledCoreFile(const char * filename)
     PE_parse_boot_argn("debug", &debug, sizeof (debug));
     if (DB_DISABLE_LOCAL_CORE & debug)                     return (kIOReturnUnsupported);
 
-    err = IOPolledFileOpen(filename, kIOCoreDumpSize, kIOCoreDumpFreeSize,
-                           NULL, 0,
-                           &gIOPolledCoreFileVars, NULL, NULL, 0);
-    if (kIOReturnSuccess != err)                           return (err);
+#if CONFIG_EMBEDDED
+    unsigned int requested_corefile_size = 0;
+    if (PE_parse_boot_argn("corefile_size_mb", &requested_corefile_size, sizeof(requested_corefile_size))) {
+        IOLog("Boot-args specify %d MB kernel corefile\n", requested_corefile_size);
+
+        corefile_size_bytes = (requested_corefile_size * 1024ULL * 1024ULL);
+    }
+#endif
+
+
+    do {
+#if defined(kIOCoreDumpLargeSize)
+        if (0 == corefile_size_bytes)
+        {
+                // If no custom size was requested and we're on a device with >3GB of DRAM,
+                // attempt to allocate a large corefile; otherwise use a small file.
+                if (max_mem > (3 * 1024ULL * 1024ULL * 1024ULL))
+                {
+                        corefile_size_bytes = kIOCoreDumpLargeSize;
+                        err = IOPolledFileOpen(filename,
+                                                kIOPolledFileCreate,
+                                                corefile_size_bytes, kIOCoreDumpFreeSize,
+                                                NULL, 0,
+                                                &gIOPolledCoreFileVars, NULL, NULL, 0);
+                        if (kIOReturnSuccess == err)
+                        {
+                                break;
+                        }
+                        else if (kIOReturnNoSpace == err)
+                        {
+                                IOLog("Failed to open corefile of size %llu MB (low disk space)",
+                                        (corefile_size_bytes / (1024ULL * 1024ULL)));
+                                if (corefile_size_bytes == kIOCoreDumpMinSize)
+                                {
+                                        gIOPolledCoreFileOpenRet = err;
+                                        return (err);
+                                }
+                                // Try to open a smaller corefile (set size and fall-through)
+                                corefile_size_bytes = kIOCoreDumpMinSize;
+                        }
+                        else
+                        {
+                                IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n",
+                                        (corefile_size_bytes / (1024ULL * 1024ULL)), err);
+                                gIOPolledCoreFileOpenRet = err;
+                                return (err);
+                        }
+                }
+                else
+                {
+                        corefile_size_bytes = kIOCoreDumpMinSize;
+                }
+        }
+#else /* defined(kIOCoreDumpLargeSize) */
+        if (0 == corefile_size_bytes)
+        {
+            corefile_size_bytes = kIOCoreDumpSize;
+        }
+#endif /* defined(kIOCoreDumpLargeSize) */
+        err = IOPolledFileOpen(filename,
+                    kIOPolledFileCreate,
+                    corefile_size_bytes, kIOCoreDumpFreeSize,
+                    NULL, 0,
+                    &gIOPolledCoreFileVars, NULL, NULL, 0);
+        if (kIOReturnSuccess != err)
+        {
+                IOLog("Failed to open corefile of size %llu MB (returned error 0x%x)\n",
+                                (corefile_size_bytes / (1024ULL * 1024ULL)), err);
+                gIOPolledCoreFileOpenRet = err;
+                return (err);
+        }
+    } while (false);
 
     err = IOPolledFilePollersSetup(gIOPolledCoreFileVars, kIOPolledPreflightCoreDumpState);
     if (kIOReturnSuccess != err)
     {
-       IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0);
+        IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0);
+        IOLog("IOPolledFilePollersSetup for corefile failed with error: 0x%x\n", err);
+        gIOPolledCoreFileOpenRet = err;
+    }
+    else
+    {
+        IOLog("Opened corefile of size %llu MB\n", (corefile_size_bytes / (1024ULL * 1024ULL)));
     }
 
     return (err);
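
The corefile-open path above now sizes the dump from the corefile_size_mb boot-arg on embedded configurations, otherwise picks kIOCoreDumpLargeSize or kIOCoreDumpMinSize depending on whether the device has more than 3GB of DRAM, and retries once at the minimum size only when the large open fails with kIOReturnNoSpace; every other failure is recorded in gIOPolledCoreFileOpenRet and returned. A stripped-down sketch of that single-fallback flow (the open function, threshold and sizes below are stand-ins, not the real IOPolledFileOpen signature):

    #include <cstdint>
    #include <cstdio>

    enum Status { kSuccess, kNoSpace, kOtherError };

    // Stand-in for IOPolledFileOpen(): pretend anything over 400 MB does not fit.
    static Status openCoreFile(uint64_t bytes)
    {
        return (bytes > 400ULL * 1024 * 1024) ? kNoSpace : kSuccess;
    }

    static Status openWithFallback(uint64_t largeBytes, uint64_t minBytes)
    {
        Status st = openCoreFile(largeBytes);
        if (st == kNoSpace && largeBytes != minBytes) {
            std::printf("large corefile did not fit, retrying with %llu MB\n",
                        (unsigned long long)(minBytes / (1024 * 1024)));
            st = openCoreFile(minBytes);    // exactly one smaller attempt
        }
        return st;                          // any other failure is reported as-is
    }

    int main()
    {
        Status st = openWithFallback(500ULL * 1024 * 1024, 350ULL * 1024 * 1024);
        std::printf("corefile open %s\n", st == kSuccess ? "succeeded" : "failed");
        return 0;
    }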
@@ -797,6 +876,7 @@ IOOpenPolledCoreFile(const char * filename)
 static void 
 IOClosePolledCoreFile(void)
 {
+    gIOPolledCoreFileOpenRet = kIOReturnNotOpen;
     IOPolledFilePollersClose(gIOPolledCoreFileVars, kIOPolledPostflightCoreDumpState);
     IOPolledFileClose(&gIOPolledCoreFileVars, NULL, NULL, 0, 0, 0);
 }
@@ -940,7 +1020,6 @@ IOBSDMountChange(struct mount * mp, uint32_t op)
 #endif /* IOPOLLED_COREFILE */
 }
 
-
 /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
 
 extern "C" boolean_t 
index 61de9d5843c630135527427d281a556148500703..587fe426428077a3cc907575bdec15aa9c973a94 100644 (file)
@@ -9,7 +9,7 @@ UNCONFIGURED_HIB_FILES= \
 HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS))
 
 # Unconfigured __HIB files must be Mach-O for "setsegname"
-IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
+IOHibernateRestoreKernel.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
 
 ######################################################################
 #END    Machine dependent Makefile fragment for x86_64
index bc356a362345e5fdc75a4dc764f930a5057b7a7d..ac6b06e0a71db2ae50ab86c02f37e5376e2c2293 100644 (file)
@@ -107,3 +107,4 @@ iokit/Kernel/IOPowerConnection.cpp                  optional iokitcpp
 # System Management
 iokit/Families/IOSystemManagement/IOWatchDogTimer.cpp  optional iokitcpp
 
+
index 702bfacbc30ea742facb12a3ce64fba2e4ac6695..e36c55352fdfbdaebccc25e7b25bef2af9b935f0 100644 (file)
@@ -436,47 +436,49 @@ struct kcdata_type_definition {
  * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes
  * in STACKSHOT_KCTYPE_* types.
  */
-#define STACKSHOT_KCTYPE_IOSTATS 0x901u          /* io_stats_snapshot */
-#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */
+#define STACKSHOT_KCTYPE_IOSTATS 0x901u                   /* io_stats_snapshot */
+#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u          /* struct mem_and_io_snapshot */
 #define STACKSHOT_KCCONTAINER_TASK 0x903u
 #define STACKSHOT_KCCONTAINER_THREAD 0x904u
-#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u         /* task_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u       /* thread_snapshot_v2, thread_snapshot_v3 */
-#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u         /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u  /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u           /* char[] */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au       /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu     /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu       /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du     /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu              /* boot args string */
-#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu             /* os version string */
-#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u        /* kernel page size in uint32_t */
-#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u          /* jetsam level in uint32_t */
-#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u             /* task_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u           /* thread_snapshot_v2, thread_snapshot_v3 */
+#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u             /* int[] */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u      /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u               /* char[] */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au           /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu         /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu           /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du         /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu                  /* boot args string */
+#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu                 /* os version string */
+#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u            /* kernel page size in uint32_t */
+#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u              /* jetsam level in uint32_t */
+#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u     /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u              /* uint32_t */
+#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u            /* uint64_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u              /* uint32_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u            /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u          /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u         /* uint64_t */
+#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u                 /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
+#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au        /* struct stackshot_duration */
+#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu     /* struct stackshot_fault_stats */
+#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu     /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du           /* struct stackshot_thread_waitinfo */
+#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu     /* struct thread_group_snapshot or thread_group_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu              /* uint64_t */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u          /* uint64_t */
+#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u     /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
+#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u             /* struct instrs_cycles_snapshot */
+#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u             /* struct stack_snapshot_stacktop */
+#define STACKSHOT_KCTYPE_ASID 0x925u                      /* uint32_t */
+#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u               /* uint64_t */
+#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u    /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
 
 #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u   /* task_delta_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */
 
-#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u          /* uint32_t */
-#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u        /* uint64_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u          /* uint32_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u        /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u      /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u     /* uint64_t */
-#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u             /* struct stackshot_cpu_times */
-#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au    /* struct stackshot_duration */
-#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */
-#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du       /* struct stackshot_thread_waitinfo */
-#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu          /* uint64_t */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u      /* uint64_t */
-#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u         /* struct instrs_cycles_snapshot */
-
-#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
-
 struct stack_snapshot_frame32 {
        uint32_t lr;
        uint32_t sp;
@@ -537,6 +539,10 @@ enum task_snapshot_flags {
        kTaskUUIDInfoMissing                  = 0x200000, /* some UUID info was paged out */
        kTaskUUIDInfoTriedFault               = 0x400000, /* tried to fault in UUID info */
        kTaskSharedRegionInfoUnavailable      = 0x800000,  /* shared region info unavailable */
+       kTaskTALEngaged                       = 0x1000000,
+       /* 0x2000000 unused */
+       kTaskIsDirtyTracked                   = 0x4000000,
+       kTaskAllowIdleExit                    = 0x8000000,
 };
 
 enum thread_snapshot_flags {
@@ -785,6 +791,12 @@ struct stackshot_cpu_times {
        uint64_t system_usec;
 } __attribute__((packed));
 
+struct stackshot_cpu_times_v2 {
+       uint64_t user_usec;
+       uint64_t system_usec;
+       uint64_t runnable_usec;
+} __attribute__((packed));
+
 struct stackshot_duration {
        uint64_t stackshot_duration;
        uint64_t stackshot_duration_outer;
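
Both layouts are emitted under the same STACKSHOT_KCTYPE_CPU_TIMES tag, so a consumer presumably has to tell them apart by payload size. The following standalone sketch illustrates that assumption only; it is not code from this change:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct cpu_times_v1 { uint64_t user_usec, system_usec; } __attribute__((packed));
    struct cpu_times_v2 { uint64_t user_usec, system_usec, runnable_usec; } __attribute__((packed));

    /* Return runnable time only if the item is large enough to be a v2 payload. */
    static uint64_t runnable_usec_or_zero(const void *payload, size_t payload_size)
    {
        if (payload_size >= sizeof(struct cpu_times_v2)) {
            struct cpu_times_v2 t;
            memcpy(&t, payload, sizeof(t));   /* copy out: packed data may be unaligned */
            return t.runnable_usec;
        }
        return 0;                             /* a v1 payload carries no runnable time */
    }
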
@@ -813,6 +825,12 @@ typedef struct stackshot_thread_waitinfo {
 #define STACKSHOT_WAITOWNER_SUSPENDED      (UINT64_MAX - 7) /* workloop is suspended */
 
 
+struct stack_snapshot_stacktop {
+       uint64_t sp;
+       uint8_t stack_contents[8];
+};
+
+
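
Judging by the field names, the new STACKSHOT_KCTYPE_USER_STACKTOP payload pairs the user stack pointer with the first eight bytes stored at the top of the stack. A hypothetical consumer-side dump, assuming the struct above is in scope (illustration only):

    #include <cinttypes>
    #include <cstdio>

    static void dump_stacktop(const struct stack_snapshot_stacktop *top)
    {
        printf("sp=0x%016" PRIx64 " top-of-stack bytes:", top->sp);
        for (int i = 0; i < 8; i++)
            printf(" %02x", top->stack_contents[i]);
        printf("\n");
    }
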
 /**************** definitions for crashinfo *********************/
 
 /*
@@ -866,6 +884,22 @@ struct crashinfo_proc_uniqidentifierinfo {
 #define TASK_CRASHINFO_UDATA_PTRS           0x81C  /* uint64_t */
 #define TASK_CRASHINFO_MEMORY_LIMIT         0x81D  /* uint64_t */
 
+#define TASK_CRASHINFO_LEDGER_INTERNAL                          0x81E /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED               0x81F /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_IOKIT_MAPPED                      0x820 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING              0x821 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED   0x822 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE             0x823 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED  0x824 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PAGE_TABLE                        0x825 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT                    0x826 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX       0x827 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE               0x828 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED    0x829 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_WIRED_MEM                         0x82A /* uint64_t */
+
+
+
 #define TASK_CRASHINFO_END                  KCDATA_TYPE_BUFFER_END
 
 /**************** definitions for os reasons *********************/
@@ -963,7 +997,7 @@ kcdata_iter_type(kcdata_iter_t iter)
 static inline uint32_t
 kcdata_calc_padding(uint32_t size)
 {
-       /* calculate number of bits to add to size to get something divisible by 16 */
+       /* calculate number of bytes to add to size to get something divisible by 16 */
        return (-size) & 0xf;
 }
 
index 0a36b5aa78b33325c349459ca766ba64658b0140..c9a2809d5cd2bbfb3c21220bba25b1e8cc8242da 100644 (file)
@@ -121,15 +121,15 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
                break;
        }
     
-        case KCDATA_TYPE_TYPEDEFINTION: {
-            i = 0;
-            setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_type_identifier), "typeID");
-            setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_num_elements), "numOfFields");
-            setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, offsetof(struct kcdata_type_definition, kct_name), KCDATA_DESC_MAXLEN, "name");
-            // Note "fields" is an array of run time defined length. So we populate fields at parsing time.
-            setup_type_definition(retval, type_id, i, "typedef");
-            break;
-        }
+       case KCDATA_TYPE_TYPEDEFINTION: {
+               i = 0;
+               setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_type_identifier), "typeID");
+               setup_subtype_description(&subtypes[i++], KC_ST_UINT32, offsetof(struct kcdata_type_definition, kct_num_elements), "numOfFields");
+               setup_subtype_array_description(&subtypes[i++], KC_ST_CHAR, offsetof(struct kcdata_type_definition, kct_name), KCDATA_DESC_MAXLEN, "name");
+               // Note "fields" is an array of run time defined length. So we populate fields at parsing time.
+               setup_type_definition(retval, type_id, i, "typedef");
+               break;
+       }
 
        case KCDATA_TYPE_CONTAINER_BEGIN: {
                i = 0;
@@ -536,8 +536,9 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
 
        case STACKSHOT_KCTYPE_CPU_TIMES: {
                i = 0;
-               _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, user_usec);
-               _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times, system_usec);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, user_usec);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, system_usec);
+               _SUBTYPE(KC_ST_UINT64, struct stackshot_cpu_times_v2, runnable_usec);
                setup_type_definition(retval, type_id, i, "cpu_times");
                break;
        }
@@ -614,6 +615,14 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
                break;
         }
 
+       case STACKSHOT_KCTYPE_USER_STACKTOP: {
+               i = 0;
+               _SUBTYPE(KC_ST_UINT64, struct stack_snapshot_stacktop, sp);
+               _SUBTYPE_ARRAY(KC_ST_UINT8, struct stack_snapshot_stacktop, stack_contents, 8);
+               setup_type_definition(retval, type_id, i, "user_stacktop");
+               break;
+       }
+
        case TASK_CRASHINFO_PROC_STARTTIME: {
                i = 0;
                _SUBTYPE(KC_ST_INT64, struct timeval64, tv_sec);
@@ -784,8 +793,8 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
                _SUBTYPE(KC_ST_UINT8, struct codesigning_exit_reason_info, ceri_page_dirty);
                _SUBTYPE(KC_ST_UINT32, struct codesigning_exit_reason_info, ceri_page_shadow_depth);
                setup_type_definition(retval, type_id, i, "exit_reason_codesigning_info");
-
                break;
+       }
 
        case EXIT_REASON_WORKLOOP_ID: {
                i = 0;
@@ -801,8 +810,28 @@ kcdata_get_typedescription(unsigned type_id, uint8_t * buffer, uint32_t buffer_s
                break;
        }
 
+       case STACKSHOT_KCTYPE_ASID: {
+               i = 0;
+               setup_subtype_description(&subtypes[i++], KC_ST_UINT32, 0, "ts_asid");
+               setup_type_definition(retval, type_id, i, "ts_asid");
+               break;
        }
 
+       case STACKSHOT_KCTYPE_PAGE_TABLES: {
+               i = 0;
+               setup_subtype_description(&subtypes[i++], KC_ST_UINT64, 0, "ts_pagetable");
+               setup_type_definition(retval, type_id, i, "ts_pagetable");
+               break;
+       }
+
+       case STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT: {
+               i = 0;
+               _SUBTYPE(KC_ST_UINT64, struct user64_dyld_uuid_info, imageLoadAddress);
+               _SUBTYPE_ARRAY(KC_ST_UINT8, struct user64_dyld_uuid_info, imageUUID, 16);
+               setup_type_definition(retval, type_id, i, "system_shared_cache_layout");
+               break;
+       }
+    
        default:
                retval = NULL;
                break;
index f9e1e542556944000d33697d0226078f32dd8300..a16b8bdcaf5fd8f22d5a0b5c61253b6730b5217a 100644 (file)
@@ -6,6 +6,21 @@
        objectVersion = 46;
        objects = {
 
+/* Begin PBXAggregateTarget section */
+               08CFD8441FBB9E39008D51F6 /* Default */ = {
+                       isa = PBXAggregateTarget;
+                       buildConfigurationList = 08CFD8471FBB9E39008D51F6 /* Build configuration list for PBXAggregateTarget "Default" */;
+                       buildPhases = (
+                       );
+                       dependencies = (
+                               08CFD8491FBB9E42008D51F6 /* PBXTargetDependency */,
+                               08CFD84B1FBB9E43008D51F6 /* PBXTargetDependency */,
+                       );
+                       name = Default;
+                       productName = Default;
+               };
+/* End PBXAggregateTarget section */
+
 /* Begin PBXBuildFile section */
                045F7F121D2ADE7C00B4808B /* stackshot-with-waitinfo in Resources */ = {isa = PBXBuildFile; fileRef = 04C64AC91D25C43400C6C781 /* stackshot-with-waitinfo */; };
                045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 04C64ACA1D25C43400C6C781 /* stackshot-with-waitinfo.plist.gz */; };
@@ -18,6 +33,8 @@
                084085AC1FA3CE3D005BAD16 /* kdd.h in Headers */ = {isa = PBXBuildFile; fileRef = 084085AA1FA3CE32005BAD16 /* kdd.h */; settings = {ATTRIBUTES = (Public, ); }; };
                0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE911BF6AFB700CD4150 /* stackshot-sample */; };
                0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */; };
+               084422F82048BABB008A085B /* stackshot-sample-asid in Resources */ = {isa = PBXBuildFile; fileRef = 084422F62048B801008A085B /* stackshot-sample-asid */; };
+               084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */; };
                08603F371BF69EDE007D3784 /* Tests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 08603F361BF69EDE007D3784 /* Tests.swift */; };
                08603F391BF69EDE007D3784 /* libkdd.a in Frameworks */ = {isa = PBXBuildFile; fileRef = C91C93C71ACB58B700119B60 /* libkdd.a */; };
                0860F87A1BFC3857007E1301 /* stackshot-sample-tailspin-2 in Resources */ = {isa = PBXBuildFile; fileRef = 0860F8781BFC3845007E1301 /* stackshot-sample-tailspin-2 */; };
@@ -37,6 +54,8 @@
                08A4C94C1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = 08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */; };
                08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94D1C470F0900D5F010 /* nested-sample */; };
                08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */ = {isa = PBXBuildFile; fileRef = 08A4C94E1C470F0900D5F010 /* nested-sample.plist */; };
+               08AD0BF01FBE370000CB41B2 /* stackshot-sample-stacktop in Resources */ = {isa = PBXBuildFile; fileRef = 08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */; };
+               08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */; };
                08B480781BF8297500B4AAE0 /* stackshot-sample-new-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */; };
                08B480791BF8297500B4AAE0 /* stackshot-sample-new-arrays.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */; };
                08B4807A1BF8297500B4AAE0 /* stackshot-sample-old-arrays in Resources */ = {isa = PBXBuildFile; fileRef = 08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */; };
@@ -59,6 +78,8 @@
                08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */; };
                1368F0851C87E06A00940FC6 /* exitreason-codesigning.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */; };
                1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */ = {isa = PBXBuildFile; fileRef = 1368F0831C87E06300940FC6 /* exitreason-codesigning */; };
+               13739E8520DB18B600D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */; };
+               13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */ = {isa = PBXBuildFile; fileRef = 13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */; };
                13A79CAA1CF8C5D600FFC181 /* stackshot-with-kcid in Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */; };
                13A79CAB1CF8C5D600FFC181 /* stackshot-with-kcid.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = 13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */; };
                13CC08441CB97F8D00EA6069 /* stackshot-fault-stats in Resources */ = {isa = PBXBuildFile; fileRef = 13CC08421CB97F8A00EA6069 /* stackshot-fault-stats */; };
                C91C93E51ACB598700119B60 /* KCDBasicTypeDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93E11ACB598700119B60 /* KCDBasicTypeDescription.m */; };
                C91C93E61ACB598700119B60 /* KCDStructTypeDescription.h in Headers */ = {isa = PBXBuildFile; fileRef = C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */; };
                C91C93E71ACB598700119B60 /* KCDStructTypeDescription.m in Sources */ = {isa = PBXBuildFile; fileRef = C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */; };
+               C95E4D1A204F42C500FD2229 /* stackshot-sample-cpu-times.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */; };
+               C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */ = {isa = PBXBuildFile; fileRef = C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */; };
                C9C5C68C1ACDAFDB00BE0E5E /* kcdtypes.c in Sources */ = {isa = PBXBuildFile; fileRef = C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */; };
                C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */; };
                C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */ = {isa = PBXBuildFile; fileRef = C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */; };
 /* End PBXBuildFile section */
 
 /* Begin PBXContainerItemProxy section */
-               08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */ = {
+               086395BA1BF565AB005ED913 /* PBXContainerItemProxy */ = {
                        isa = PBXContainerItemProxy;
                        containerPortal = C91C93BF1ACB58B700119B60 /* Project object */;
                        proxyType = 1;
                        remoteGlobalIDString = C91C93C61ACB58B700119B60;
                        remoteInfo = libkdd;
                };
-               086395BA1BF565AB005ED913 /* PBXContainerItemProxy */ = {
+               08CFD8481FBB9E42008D51F6 /* PBXContainerItemProxy */ = {
                        isa = PBXContainerItemProxy;
                        containerPortal = C91C93BF1ACB58B700119B60 /* Project object */;
                        proxyType = 1;
                        remoteGlobalIDString = C91C93C61ACB58B700119B60;
                        remoteInfo = libkdd;
                };
+               08CFD84A1FBB9E43008D51F6 /* PBXContainerItemProxy */ = {
+                       isa = PBXContainerItemProxy;
+                       containerPortal = C91C93BF1ACB58B700119B60 /* Project object */;
+                       proxyType = 1;
+                       remoteGlobalIDString = 0864FCEE1FA3C0B7001B7B0B;
+                       remoteInfo = kdd.framework;
+               };
 /* End PBXContainerItemProxy section */
 
 /* Begin PBXCopyFilesBuildPhase section */
                084085AE1FA3D156005BAD16 /* module.modulemap */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = "sourcecode.module-map"; path = module.modulemap; sourceTree = "<group>"; };
                0843EE911BF6AFB700CD4150 /* stackshot-sample */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample"; path = "tests/stackshot-sample"; sourceTree = SOURCE_ROOT; };
                0843EE931BF6BAB400CD4150 /* stackshot-sample.plist.gz */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = archive.gzip; name = "stackshot-sample.plist.gz"; path = "tests/stackshot-sample.plist.gz"; sourceTree = SOURCE_ROOT; };
+               084422F62048B801008A085B /* stackshot-sample-asid */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-asid"; sourceTree = "<group>"; };
+               084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-asid.plist.gz"; sourceTree = "<group>"; };
                08603F341BF69EDE007D3784 /* tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
                08603F361BF69EDE007D3784 /* Tests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Tests.swift; sourceTree = "<group>"; };
                08603F381BF69EDE007D3784 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
                08A4C94B1C4701B800D5F010 /* KCDEmbeddedBufferDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDEmbeddedBufferDescription.m; sourceTree = "<group>"; };
                08A4C94D1C470F0900D5F010 /* nested-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "nested-sample"; path = "tests/nested-sample"; sourceTree = SOURCE_ROOT; };
                08A4C94E1C470F0900D5F010 /* nested-sample.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; name = "nested-sample.plist"; path = "tests/nested-sample.plist"; sourceTree = SOURCE_ROOT; };
+               08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-stacktop"; sourceTree = "<group>"; };
+               08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-stacktop.plist.gz"; sourceTree = "<group>"; };
                08B480741BF8294E00B4AAE0 /* stackshot-sample-new-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-new-arrays"; path = "tests/stackshot-sample-new-arrays"; sourceTree = SOURCE_ROOT; };
                08B480751BF8294E00B4AAE0 /* stackshot-sample-new-arrays.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-sample-new-arrays.plist.gz"; path = "tests/stackshot-sample-new-arrays.plist.gz"; sourceTree = SOURCE_ROOT; };
                08B480761BF8294E00B4AAE0 /* stackshot-sample-old-arrays */ = {isa = PBXFileReference; lastKnownFileType = text; name = "stackshot-sample-old-arrays"; path = "tests/stackshot-sample-old-arrays"; sourceTree = SOURCE_ROOT; };
                08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-delta-thread-policy.plist.gz"; sourceTree = "<group>"; };
                1368F0831C87E06300940FC6 /* exitreason-codesigning */ = {isa = PBXFileReference; lastKnownFileType = file; name = "exitreason-codesigning"; path = "tests/exitreason-codesigning"; sourceTree = SOURCE_ROOT; };
                1368F0841C87E06300940FC6 /* exitreason-codesigning.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "exitreason-codesigning.plist.gz"; path = "tests/exitreason-codesigning.plist.gz"; sourceTree = SOURCE_ROOT; };
+               13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-with-shared-cache-layout.plist.gz"; sourceTree = "<group>"; };
+               13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-with-shared-cache-layout"; sourceTree = "<group>"; };
                13A79CA81CF8C5D200FFC181 /* stackshot-with-kcid */ = {isa = PBXFileReference; lastKnownFileType = file; name = "stackshot-with-kcid"; path = "tests/stackshot-with-kcid"; sourceTree = SOURCE_ROOT; };
                13A79CA91CF8C5D200FFC181 /* stackshot-with-kcid.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; name = "stackshot-with-kcid.plist.gz"; path = "tests/stackshot-with-kcid.plist.gz"; sourceTree = SOURCE_ROOT; };
                13AF287B1C4A0D6A000795E2 /* corpse-twr-sample */ = {isa = PBXFileReference; lastKnownFileType = file; name = "corpse-twr-sample"; path = "tests/corpse-twr-sample"; sourceTree = SOURCE_ROOT; };
                C91C93E11ACB598700119B60 /* KCDBasicTypeDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDBasicTypeDescription.m; sourceTree = "<group>"; };
                C91C93E21ACB598700119B60 /* KCDStructTypeDescription.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = KCDStructTypeDescription.h; sourceTree = "<group>"; };
                C91C93E31ACB598700119B60 /* KCDStructTypeDescription.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = KCDStructTypeDescription.m; sourceTree = "<group>"; };
+               C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "stackshot-sample-cpu-times.plist.gz"; sourceTree = "<group>"; };
+               C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */ = {isa = PBXFileReference; lastKnownFileType = file; path = "stackshot-sample-cpu-times"; sourceTree = "<group>"; };
                C9C5C68B1ACDAFDB00BE0E5E /* kcdtypes.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = kcdtypes.c; sourceTree = "<group>"; };
                C9D7B53D1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz */ = {isa = PBXFileReference; lastKnownFileType = archive.gzip; path = "xnupost_testconfig-sample.plist.gz"; sourceTree = "<group>"; };
                C9D7B53E1D1B41D700F1019D /* xnupost_testconfig-sample */ = {isa = PBXFileReference; lastKnownFileType = file; path = "xnupost_testconfig-sample"; sourceTree = "<group>"; };
                08603F351BF69EDE007D3784 /* tests */ = {
                        isa = PBXGroup;
                        children = (
+                               13739E8420DB18B600D8D9B9 /* stackshot-with-shared-cache-layout */,
+                               13739E8320DB18B500D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz */,
+                               C95E4D19204F42C500FD2229 /* stackshot-sample-cpu-times */,
+                               C95E4D18204F42C400FD2229 /* stackshot-sample-cpu-times.plist.gz */,
+                               084422F62048B801008A085B /* stackshot-sample-asid */,
+                               084422F72048B801008A085B /* stackshot-sample-asid.plist.gz */,
+                               08AD0BEE1FBE370000CB41B2 /* stackshot-sample-stacktop */,
+                               08AD0BEF1FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz */,
                                08F2AC081FA136EB00271A11 /* stackshot-sample-delta-thread-policy */,
                                08F2AC091FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz */,
                                18C577C51F96DB7100C67EB3 /* stackshot-sample-thread-groups-flags.plist.gz */,
                        buildRules = (
                        );
                        dependencies = (
-                               08603F3B1BF69EDE007D3784 /* PBXTargetDependency */,
                        );
                        name = tests;
                        productName = Tests;
                                                CreatedOnToolsVersion = 9.1;
                                                ProvisioningStyle = Automatic;
                                        };
+                                       08CFD8441FBB9E39008D51F6 = {
+                                               CreatedOnToolsVersion = 9.0.1;
+                                               ProvisioningStyle = Automatic;
+                                       };
                                        C91C93C61ACB58B700119B60 = {
                                                CreatedOnToolsVersion = 7.0;
                                        };
                        projectDirPath = "";
                        projectRoot = "";
                        targets = (
+                               08CFD8441FBB9E39008D51F6 /* Default */,
                                C91C93C61ACB58B700119B60 /* libkdd */,
                                086395B11BF5655D005ED913 /* kdd */,
                                08603F331BF69EDE007D3784 /* tests */,
                        isa = PBXResourcesBuildPhase;
                        buildActionMask = 2147483647;
                        files = (
+                               13739E8620DB18B600D8D9B9 /* stackshot-with-shared-cache-layout in Resources */,
+                               084422F82048BABB008A085B /* stackshot-sample-asid in Resources */,
+                               084422F92048BABB008A085B /* stackshot-sample-asid.plist.gz in Resources */,
                                08F2AC0A1FA136EB00271A11 /* stackshot-sample-delta-thread-policy in Resources */,
                                18C577C61F96DB7100C67EB3 /* stackshot-sample-thread-groups-flags.plist.gz in Resources */,
                                18C577C31F96DB5200C67EB3 /* stackshot-sample-thread-groups-flags in Resources */,
                                088C36E11EF323C300ABB2E0 /* stackshot-sample-thread-policy.plist.gz in Resources */,
                                045F7F131D2ADE8000B4808B /* stackshot-with-waitinfo.plist.gz in Resources */,
                                045F7F121D2ADE7C00B4808B /* stackshot-with-waitinfo in Resources */,
+                               C95E4D1A204F42C500FD2229 /* stackshot-sample-cpu-times.plist.gz in Resources */,
                                08A4C94F1C470F1C00D5F010 /* nested-sample in Resources */,
                                1862B0341E7A083F0005ADF4 /* stackshot-sample-thread-groups in Resources */,
                                08A4C9501C470F1C00D5F010 /* nested-sample.plist in Resources */,
                                13D6C5D21C4DDDBE005E617C /* test-twr-sample in Resources */,
                                13D6C5D01C4DDDB6005E617C /* corpse-twr-sample in Resources */,
+                               C95E4D1B204F42C500FD2229 /* stackshot-sample-cpu-times in Resources */,
                                C9D7B5401D1B41D700F1019D /* xnupost_testconfig-sample in Resources */,
                                081725D51C3F476500371A54 /* stackshot-sample-duration in Resources */,
                                08F2AC0B1FA136EB00271A11 /* stackshot-sample-delta-thread-policy.plist.gz in Resources */,
                                0860F87B1BFC3857007E1301 /* stackshot-sample-tailspin-2.plist.gz in Resources */,
                                08CF18FF1BF9B7B100D05813 /* stackshot-sample-tailspin in Resources */,
                                1368F0861C87E06C00940FC6 /* exitreason-codesigning in Resources */,
+                               13739E8520DB18B600D8D9B9 /* stackshot-with-shared-cache-layout.plist.gz in Resources */,
                                13DBA26A1CAB1BA000227EB2 /* stackshot-sample-sharedcachev2 in Resources */,
                                C9D7B53F1D1B41D700F1019D /* xnupost_testconfig-sample.plist.gz in Resources */,
                                13DBA2681CAB1AD600227EB2 /* stackshot-sample-sharedcachev2.plist.gz in Resources */,
                                08C9D83E1BFFF8E100DF6C05 /* exitreason-sample.plist.gz in Resources */,
                                18E592981E9451A20018612A /* stackshot-sample-coalitions in Resources */,
                                08B4808B1BF9474A00B4AAE0 /* corpse-sample in Resources */,
+                               08AD0BF01FBE370000CB41B2 /* stackshot-sample-stacktop in Resources */,
                                13D6C5D11C4DDDB8005E617C /* corpse-twr-sample.plist.gz in Resources */,
                                08B4808C1BF9474A00B4AAE0 /* corpse-sample.plist.gz in Resources */,
                                08B480881BF92E0500B4AAE0 /* kcdata.py in Resources */,
                                08B4807B1BF8297500B4AAE0 /* stackshot-sample-old-arrays.plist.gz in Resources */,
                                0843EE941BF6BAC100CD4150 /* stackshot-sample.plist.gz in Resources */,
                                0843EE921BF6AFC600CD4150 /* stackshot-sample in Resources */,
+                               08AD0BF11FBE370000CB41B2 /* stackshot-sample-stacktop.plist.gz in Resources */,
                                18E592991E9451A20018612A /* stackshot-sample-coalitions.plist.gz in Resources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
 /* End PBXSourcesBuildPhase section */
 
 /* Begin PBXTargetDependency section */
-               08603F3B1BF69EDE007D3784 /* PBXTargetDependency */ = {
+               086395BB1BF565AB005ED913 /* PBXTargetDependency */ = {
                        isa = PBXTargetDependency;
                        target = C91C93C61ACB58B700119B60 /* libkdd */;
-                       targetProxy = 08603F3A1BF69EDE007D3784 /* PBXContainerItemProxy */;
+                       targetProxy = 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */;
                };
-               086395BB1BF565AB005ED913 /* PBXTargetDependency */ = {
+               08CFD8491FBB9E42008D51F6 /* PBXTargetDependency */ = {
                        isa = PBXTargetDependency;
                        target = C91C93C61ACB58B700119B60 /* libkdd */;
-                       targetProxy = 086395BA1BF565AB005ED913 /* PBXContainerItemProxy */;
+                       targetProxy = 08CFD8481FBB9E42008D51F6 /* PBXContainerItemProxy */;
+               };
+               08CFD84B1FBB9E43008D51F6 /* PBXTargetDependency */ = {
+                       isa = PBXTargetDependency;
+                       target = 0864FCEE1FA3C0B7001B7B0B /* kdd.framework */;
+                       targetProxy = 08CFD84A1FBB9E43008D51F6 /* PBXContainerItemProxy */;
                };
 /* End PBXTargetDependency section */
 
                                COMBINE_HIDPI_IMAGES = YES;
                                ENABLE_TESTABILITY = YES;
                                INFOPLIST_FILE = tests/Info.plist;
+                               INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd;
                                LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks";
                                PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests;
                                PRODUCT_NAME = "$(TARGET_NAME)";
                                SDKROOT = macosx;
                                SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h;
                                SWIFT_OPTIMIZATION_LEVEL = "-Onone";
-                               SWIFT_VERSION = 3.0;
+                               SWIFT_VERSION = 4.0;
                        };
                        name = Debug;
                };
                                CODE_SIGN_IDENTITY = "-";
                                COMBINE_HIDPI_IMAGES = YES;
                                INFOPLIST_FILE = tests/Info.plist;
+                               INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd;
                                LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks";
                                PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests;
                                PRODUCT_NAME = "$(TARGET_NAME)";
                                SDKROOT = macosx;
                                SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h;
-                               SWIFT_VERSION = 3.0;
+                               SWIFT_VERSION = 4.0;
                        };
                        name = Release;
                };
                                GCC_C_LANGUAGE_STANDARD = gnu11;
                                INFOPLIST_FILE = kdd.framework/Info.plist;
                                INSTALLHDRS_SCRIPT_PHASE = YES;
-                               INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/;
+                               INSTALL_PATH = /AppleInternal/Library/Frameworks/;
                                LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks";
                                MACOSX_DEPLOYMENT_TARGET = 10.13;
                                MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap";
                                GCC_C_LANGUAGE_STANDARD = gnu11;
                                INFOPLIST_FILE = kdd.framework/Info.plist;
                                INSTALLHDRS_SCRIPT_PHASE = YES;
-                               INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/;
+                               INSTALL_PATH = /AppleInternal/Library/Frameworks/;
                                LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks";
                                MACOSX_DEPLOYMENT_TARGET = 10.13;
                                MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap";
                        };
                        name = Release;
                };
+               08CFD8451FBB9E39008D51F6 /* Debug */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CODE_SIGN_STYLE = Automatic;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                       };
+                       name = Debug;
+               };
+               08CFD8461FBB9E39008D51F6 /* Release */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CODE_SIGN_STYLE = Automatic;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                       };
+                       name = Release;
+               };
+               08CFD84C1FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               ALWAYS_SEARCH_USER_PATHS = NO;
+                               CLANG_CXX_LANGUAGE_STANDARD = "gnu++0x";
+                               CLANG_CXX_LIBRARY = "libc++";
+                               CLANG_ENABLE_OBJC_ARC = YES;
+                               CLANG_WARN_BOOL_CONVERSION = YES;
+                               CLANG_WARN_CONSTANT_CONVERSION = YES;
+                               CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
+                               CLANG_WARN_EMPTY_BODY = YES;
+                               CLANG_WARN_ENUM_CONVERSION = YES;
+                               CLANG_WARN_IMPLICIT_SIGN_CONVERSION = YES;
+                               CLANG_WARN_INT_CONVERSION = YES;
+                               CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
+                               CLANG_WARN_UNREACHABLE_CODE = YES;
+                               CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+                               COPY_PHASE_STRIP = NO;
+                               DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+                               ENABLE_NS_ASSERTIONS = NO;
+                               ENABLE_STRICT_OBJC_MSGSEND = YES;
+                               GCC_C_LANGUAGE_STANDARD = gnu99;
+                               GCC_NO_COMMON_BLOCKS = YES;
+                               GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
+                               GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
+                               GCC_WARN_UNDECLARED_SELECTOR = YES;
+                               GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
+                               GCC_WARN_UNUSED_FUNCTION = YES;
+                               GCC_WARN_UNUSED_VARIABLE = YES;
+                               HEADER_SEARCH_PATHS = "$(SRCROOT)";
+                               MTL_ENABLE_DEBUG_INFO = NO;
+                               OTHER_CFLAGS = "";
+                               SDKROOT = macosx.internal;
+                       };
+                       name = ReleaseHost;
+               };
+               08CFD84D1FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CODE_SIGN_STYLE = Automatic;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                       };
+                       name = ReleaseHost;
+               };
+               08CFD84E1FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               COMBINE_HIDPI_IMAGES = YES;
+                               EXECUTABLE_PREFIX = lib;
+                               OTHER_CFLAGS = "-I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders";
+                               PRODUCT_NAME = kdd;
+                       };
+                       name = ReleaseHost;
+               };
+               08CFD84F1FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CLANG_ENABLE_MODULES = YES;
+                               CODE_SIGN_IDENTITY = "-";
+                               MACOSX_DEPLOYMENT_TARGET = 10.11;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                               SDKROOT = macosx;
+                       };
+                       name = ReleaseHost;
+               };
+               08CFD8501FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CLANG_ENABLE_MODULES = YES;
+                               CODE_SIGN_IDENTITY = "-";
+                               COMBINE_HIDPI_IMAGES = YES;
+                               INFOPLIST_FILE = tests/Info.plist;
+                               INSTALL_PATH = /AppleInternal/XCTests/com.apple.libkdd;
+                               LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/../Frameworks";
+                               PRODUCT_BUNDLE_IDENTIFIER = apple.com.Tests;
+                               PRODUCT_NAME = "$(TARGET_NAME)";
+                               SDKROOT = macosx;
+                               SWIFT_OBJC_BRIDGING_HEADER = tests/kdd_bridge.h;
+                               SWIFT_VERSION = 4.0;
+                       };
+                       name = ReleaseHost;
+               };
+               08CFD8511FBB9E72008D51F6 /* ReleaseHost */ = {
+                       isa = XCBuildConfiguration;
+                       buildSettings = {
+                               CLANG_ANALYZER_NONNULL = YES;
+                               CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
+                               CLANG_CXX_LANGUAGE_STANDARD = "gnu++14";
+                               CLANG_ENABLE_MODULES = YES;
+                               CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+                               CLANG_WARN_COMMA = YES;
+                               CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
+                               CLANG_WARN_INFINITE_RECURSION = YES;
+                               CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+                               CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+                               CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+                               CLANG_WARN_STRICT_PROTOTYPES = YES;
+                               CLANG_WARN_SUSPICIOUS_MOVE = YES;
+                               CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
+                               CODE_SIGN_IDENTITY = "-";
+                               CODE_SIGN_STYLE = Automatic;
+                               COMBINE_HIDPI_IMAGES = YES;
+                               CURRENT_PROJECT_VERSION = 1;
+                               DEFINES_MODULE = YES;
+                               DYLIB_COMPATIBILITY_VERSION = 1;
+                               DYLIB_CURRENT_VERSION = 1;
+                               DYLIB_INSTALL_NAME_BASE = "@rpath";
+                               FRAMEWORK_VERSION = A;
+                               GCC_C_LANGUAGE_STANDARD = gnu11;
+                               INFOPLIST_FILE = kdd.framework/Info.plist;
+                               INSTALLHDRS_SCRIPT_PHASE = YES;
+                               INSTALL_PATH = /AppleInternal/Ariadne/Frameworks/;
+                               LD_RUNPATH_SEARCH_PATHS = "$(inherited) @executable_path/../Frameworks @loader_path/Frameworks";
+                               MACOSX_DEPLOYMENT_TARGET = 10.13;
+                               MODULEMAP_FILE = "$(SRCROOT)/kdd.framework/module.modulemap";
+                               PRODUCT_BUNDLE_IDENTIFIER = "test.kdd-framework";
+                               PRODUCT_NAME = kdd;
+                               SDKROOT = macosx.internal;
+                               SKIP_INSTALL = NO;
+                               SUPPORTS_TEXT_BASED_API = YES;
+                               VERSIONING_SYSTEM = "apple-generic";
+                               VERSION_INFO_PREFIX = "";
+                       };
+                       name = ReleaseHost;
+               };
                C91C93D81ACB58B700119B60 /* Debug */ = {
                        isa = XCBuildConfiguration;
                        buildSettings = {
                        buildConfigurations = (
                                08603F3C1BF69EDE007D3784 /* Debug */,
                                08603F3D1BF69EDE007D3784 /* Release */,
+                               08CFD8501FBB9E72008D51F6 /* ReleaseHost */,
                        );
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
                        buildConfigurations = (
                                086395B71BF5655D005ED913 /* Debug */,
                                086395B81BF5655D005ED913 /* Release */,
+                               08CFD84F1FBB9E72008D51F6 /* ReleaseHost */,
                        );
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
                        buildConfigurations = (
                                0864FD001FA3C0B7001B7B0B /* Debug */,
                                0864FD011FA3C0B7001B7B0B /* Release */,
+                               08CFD8511FBB9E72008D51F6 /* ReleaseHost */,
+                       );
+                       defaultConfigurationIsVisible = 0;
+                       defaultConfigurationName = Release;
+               };
+               08CFD8471FBB9E39008D51F6 /* Build configuration list for PBXAggregateTarget "Default" */ = {
+                       isa = XCConfigurationList;
+                       buildConfigurations = (
+                               08CFD8451FBB9E39008D51F6 /* Debug */,
+                               08CFD8461FBB9E39008D51F6 /* Release */,
+                               08CFD84D1FBB9E72008D51F6 /* ReleaseHost */,
                        );
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
                        buildConfigurations = (
                                C91C93D81ACB58B700119B60 /* Debug */,
                                C91C93D91ACB58B700119B60 /* Release */,
+                               08CFD84C1FBB9E72008D51F6 /* ReleaseHost */,
                        );
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
                        buildConfigurations = (
                                C91C93DB1ACB58B700119B60 /* Debug */,
                                C91C93DC1ACB58B700119B60 /* Release */,
+                               08CFD84E1FBB9E72008D51F6 /* ReleaseHost */,
                        );
                        defaultConfigurationIsVisible = 0;
                        defaultConfigurationName = Release;
index 7a8a6ffeed0b98408e63bc5ce0015545b3dea7d9..e073f85a406950ee8f9c068c509f024da6a3f4bb 100644 (file)
@@ -1208,7 +1208,7 @@ class Tests: XCTestCase {
         // check that we agree with sample file
 
         guard let sampledata = self.dataWithResource(name)
-            else { XCTFail(); return }
+            else { XCTFail("failed to open bundle resource named " + name); return }
         var dict : NSDictionary?
 
         dict = try? self.parseBuffer(sampledata) as NSDictionary
@@ -1233,14 +1233,16 @@ class Tests: XCTestCase {
                               self.dataWithResource(name + ".plist")
             else {XCTFail(); return}
 
-        var dict2 = try? PropertyListSerialization.propertyList(from: plistdata as Data, options: [], format: nil)
-        if dict2 == nil {
-            dict2 = try? PropertyListSerialization.propertyList(from:decompress(plistdata) as Data, options:[], format: nil)
+        var opt_dict2 = try? PropertyListSerialization.propertyList(from: plistdata as Data, options: [], format: nil)
+        if opt_dict2 == nil {
+            opt_dict2 = try? PropertyListSerialization.propertyList(from:decompress(plistdata) as Data, options:[], format: nil)
         }
+        guard let dict2 = opt_dict2
+            else { XCTFail(); return}
 
-        XCTAssert(dict2 != nil)
+        XCTAssertEqual(dict, dict2 as! NSDictionary);
 
-        XCTAssert(dict == dict2 as? NSDictionary)
+        //XCTAssert(dict == dict2 as? NSDictionary)
 
         // check that we agree with python
 
@@ -1378,6 +1380,26 @@ class Tests: XCTestCase {
         self.testSampleStackshot("stackshot-sample-instrs-cycles")
     }
 
+    func testStackshotWithStacktop() {
+        self.testSampleStackshot("stackshot-sample-stacktop")
+    }
+
+    func testStackshotWithASID() {
+        self.testSampleStackshot("stackshot-sample-asid")
+    }
+
+    func testStackshotWithPageTables() {
+        self.testSampleStackshot("stackshot-sample-asid-pagetable")
+    }
+
+    func testStackshotCPUTimes() {
+        self.testSampleStackshot("stackshot-sample-cpu-times")
+    }
+    
+    func testStackshotWithSharedCacheLayout() {
+        self.testSampleStackshot("stackshot-with-shared-cache-layout")
+    }
+
     func testTrivial() {
     }
 }
diff --git a/libkdd/tests/stackshot-sample-asid b/libkdd/tests/stackshot-sample-asid
new file mode 100644 (file)
index 0000000..048e7c4
Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid differ
diff --git a/libkdd/tests/stackshot-sample-asid-pagetable b/libkdd/tests/stackshot-sample-asid-pagetable
new file mode 100644 (file)
index 0000000..5f278e9
Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid-pagetable differ
diff --git a/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz b/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz
new file mode 100644 (file)
index 0000000..8542e5e
Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid-pagetable.plist.gz differ
diff --git a/libkdd/tests/stackshot-sample-asid.plist.gz b/libkdd/tests/stackshot-sample-asid.plist.gz
new file mode 100644 (file)
index 0000000..4c371a2
Binary files /dev/null and b/libkdd/tests/stackshot-sample-asid.plist.gz differ
diff --git a/libkdd/tests/stackshot-sample-cpu-times b/libkdd/tests/stackshot-sample-cpu-times
new file mode 100644 (file)
index 0000000..f7d7f84
Binary files /dev/null and b/libkdd/tests/stackshot-sample-cpu-times differ
diff --git a/libkdd/tests/stackshot-sample-cpu-times.plist.gz b/libkdd/tests/stackshot-sample-cpu-times.plist.gz
new file mode 100644 (file)
index 0000000..f0092d9
Binary files /dev/null and b/libkdd/tests/stackshot-sample-cpu-times.plist.gz differ
diff --git a/libkdd/tests/stackshot-sample-stacktop b/libkdd/tests/stackshot-sample-stacktop
new file mode 100644 (file)
index 0000000..3d3bbed
Binary files /dev/null and b/libkdd/tests/stackshot-sample-stacktop differ
diff --git a/libkdd/tests/stackshot-sample-stacktop.plist.gz b/libkdd/tests/stackshot-sample-stacktop.plist.gz
new file mode 100644 (file)
index 0000000..079ca7d
Binary files /dev/null and b/libkdd/tests/stackshot-sample-stacktop.plist.gz differ
diff --git a/libkdd/tests/stackshot-with-shared-cache-layout b/libkdd/tests/stackshot-with-shared-cache-layout
new file mode 100644 (file)
index 0000000..8f218d1
Binary files /dev/null and b/libkdd/tests/stackshot-with-shared-cache-layout differ
diff --git a/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz b/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz
new file mode 100644 (file)
index 0000000..54808ef
Binary files /dev/null and b/libkdd/tests/stackshot-with-shared-cache-layout.plist.gz differ
index ea6a8e53cb0952c4c04999eede242b9cfb241b9c..236984072cb9d5028edb99882726c3397941ccad 100644 (file)
 #include <libkern/OSKextLibPrivate.h>
 
 #define VERS_MAJOR_DIGITS        (4)
-#define VERS_MINOR_DIGITS        (2)
-#define VERS_REVISION_DIGITS     (2)
+#define VERS_MINOR_DIGITS        (4)
+#define VERS_REVISION_DIGITS     (4)
 #define VERS_STAGE_DIGITS        (1)
 #define VERS_STAGE_LEVEL_DIGITS  (3)
 
 #define VERS_MAJOR_MAX           (9999)
 #define VERS_STAGE_LEVEL_MAX      (255)
 
-#define VERS_MAJOR_MULT    (100000000)
-#define VERS_MINOR_MULT      (1000000)
-#define VERS_REVISION_MULT     (10000)
-#define VERS_STAGE_MULT         (1000)
+#define VERS_MAJOR_MULT  (1000000000000)
+#define VERS_MINOR_MULT      (100000000)
+#define VERS_REVISION_MULT       (10000)
+#define VERS_STAGE_MULT           (1000)
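
Widening the minor and revision fields from two to four decimal digits shifts the multipliers so each component still occupies its own decimal slice of the packed version. Assuming the usual packing these macros suggest (major, minor, and revision scaled by their multipliers and summed; the release-stage terms are omitted here, and the packing itself is an assumption, not shown in this hunk), version 12.345.6789 would encode roughly as:

    #include <cinttypes>
    #include <cstdio>

    int main()
    {
        const uint64_t MAJOR_MULT    = 1000000000000ULL; /* new VERS_MAJOR_MULT */
        const uint64_t MINOR_MULT    = 100000000ULL;     /* new VERS_MINOR_MULT */
        const uint64_t REVISION_MULT = 10000ULL;         /* VERS_REVISION_MULT  */

        uint64_t v = 12 * MAJOR_MULT + 345 * MINOR_MULT + 6789 * REVISION_MULT;
        printf("%" PRIu64 "\n", v);                      /* prints 12034567890000 */
        return 0;
    }
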
 
 
 typedef enum {
index 53b3b7b960d138eccd3eae7096def146e190d7f9..260d9b361e4d2f0a723967006fac5d759a42f960 100644 (file)
@@ -102,3 +102,37 @@ OSCollection *  OSCollection::copyCollection(OSDictionary *cycleDict)
        return this;
     }
 }
+
+bool OSCollection::iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object))
+{
+    uint64_t     iteratorStore[2];
+    unsigned int initialUpdateStamp;
+    bool         done;
+
+    assert(iteratorSize() < sizeof(iteratorStore));
+
+    if (!initIterator(&iteratorStore[0])) return (false);
+
+    initialUpdateStamp = updateStamp;
+    done = false;
+    do
+    {
+        OSObject * object;
+        if (!getNextObjectForIterator(&iteratorStore[0], &object)) break;
+        done = callback(refcon, object);
+    }
+    while (!done && (initialUpdateStamp == updateStamp));
+
+    return initialUpdateStamp == updateStamp;
+}
+
+static bool OSCollectionIterateObjectsBlock(void * refcon, OSObject * object)
+{
+    bool (^block)(OSObject * object) = (typeof(block)) refcon;
+    return (block(object));
+}
+
+bool OSCollection::iterateObjects(bool (^block)(OSObject * object))
+{
+    return (iterateObjects((void *) block, OSCollectionIterateObjectsBlock));
+}
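
The block overload simply forwards to the callback form with the block as the refcon, so both spellings iterate the same way: the callback returns true to stop early, and iterateObjects itself reports whether the collection stayed unmodified. A hypothetical caller of the function-pointer variant (sketch only; the collection pointer is a placeholder):

    #include <libkern/c++/OSCollection.h>
    #include <libkern/c++/OSBoolean.h>

    /* Stop at the first kOSBooleanTrue found in the collection. */
    static bool stopOnTrue(void *refcon, OSObject *object)
    {
        bool *found = (bool *) refcon;
        if (object == kOSBooleanTrue) {
            *found = true;
            return true;        /* returning true ends the iteration */
        }
        return false;           /* keep going */
    }

    static bool containsTrue(OSCollection *collection)
    {
        bool found = false;
        collection->iterateObjects(&found, &stopOnTrue);
        return found;
    }
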
index fda3dd7c9c3f1a84880f97fae580d60d8a93aafa..e37bb128b00aca9f91d55a7a295866b640878d4f 100644 (file)
@@ -289,7 +289,9 @@ bool OSData::appendBytes(const void *bytes, unsigned int inLength)
     if (capacity == EXTERNAL)
         return false;
     
-    newSize = length + inLength;
+    if (os_add_overflow(length, inLength, &newSize))
+        return false;
+
     if ( (newSize > capacity) && newSize > ensureCapacity(newSize) )
         return false;
 
@@ -313,7 +315,9 @@ bool OSData::appendByte(unsigned char byte, unsigned int inLength)
     if (capacity == EXTERNAL)
         return false;
     
-    newSize = length + inLength;
+    if (os_add_overflow(length, inLength, &newSize))
+        return false;
+
     if ( (newSize > capacity) && newSize > ensureCapacity(newSize) )
         return false;
 
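
Both append paths now refuse a length that would wrap instead of silently computing a truncated newSize. This is the standard checked-addition idiom; os_add_overflow() behaves like the compiler builtin used in this standalone sketch (illustration only, not kernel code):

    #include <cstdint>
    #include <cstdio>

    static bool checked_append_size(uint32_t length, uint32_t inLength, uint32_t *newSize)
    {
        /* __builtin_add_overflow returns true when the 32-bit sum wraps */
        if (__builtin_add_overflow(length, inLength, newSize))
            return false;                    /* reject the append, as the patch does */
        return true;
    }

    int main()
    {
        uint32_t n;
        printf("%d\n", checked_append_size(100, 28, &n));            /* 1, n == 128 */
        printf("%d\n", checked_append_size(0xFFFFFFF0u, 0x20u, &n)); /* 0, would wrap */
        return 0;
    }
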
index 27224c7079090a6be9e55aca2e262398af346e14..868152ba1c9e5c306707357342fb4ebdceeea619 100644 (file)
@@ -723,3 +723,31 @@ OSArray * OSDictionary::copyKeys(void)
        }
     return (array);
 }
+
+bool OSDictionary::iterateObjects(void * refcon, bool (*callback)(void * refcon, const OSSymbol * key, OSObject * object))
+{
+    unsigned int initialUpdateStamp;
+    bool         done;
+
+    initialUpdateStamp = updateStamp;
+    done = false;
+       for (unsigned int i = 0; i < count; i++)
+    {
+        done = callback(refcon, dictionary[i].key, EXT_CAST(dictionary[i].value));
+        if (done)                              break;
+        if (initialUpdateStamp != updateStamp) break;
+    }
+
+    return initialUpdateStamp == updateStamp;
+}
+
+static bool OSDictionaryIterateObjectsBlock(void * refcon, const OSSymbol * key, OSObject * object)
+{
+    bool (^block)(const OSSymbol * key, OSObject * object) = (typeof(block)) refcon;
+    return (block(key, object));
+}
+
+bool OSDictionary::iterateObjects(bool (^block)(const OSSymbol * key, OSObject * object))
+{
+       return (iterateObjects((void *)block, &OSDictionaryIterateObjectsBlock));
+}
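
The dictionary flavor hands the key to the callback as well as the value. A hypothetical block-based caller, assuming the blocks extension is available as it is in the kernel build (sketch only; the dictionary pointer is a placeholder):

    #include <libkern/c++/OSDictionary.h>
    #include <libkern/c++/OSString.h>

    /* Count how many values in the dictionary are OSString instances. */
    static unsigned int countStringValues(OSDictionary *dict)
    {
        unsigned int strings = 0;
        unsigned int *counter = &strings;

        dict->iterateObjects(^bool(const OSSymbol *key, OSObject *object) {
            (void) key;
            if (OSDynamicCast(OSString, object))
                (*counter)++;
            return false;       /* returning false keeps the iteration going */
        });
        return strings;
    }
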
index feb99abadd39246d64868fb7ad45f74fb2fba07e..3f448512cba8efdfcafa1759592d998444bbb369 100644 (file)
@@ -48,7 +48,6 @@ extern "C" {
 #include <mach/mach_time.h>
 #include <sys/sysctl.h>
 #include <uuid/uuid.h>
-// 04/18/11 - gab: <rdar://problem/9236163>
 #include <sys/random.h>
 
 #include <sys/pgo.h>
@@ -81,12 +80,17 @@ extern "C" {
 extern "C" {
 extern int  IODTGetLoaderInfo(const char * key, void ** infoAddr, int * infoSize);
 extern void IODTFreeLoaderInfo(const char * key, void * infoAddr, int infoSize);
-extern void OSRuntimeUnloadCPPForSegment(kernel_segment_command_t * segment);
-extern void OSRuntimeUnloadCPP(kmod_info_t * ki, void * data);
 
 extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va); /* osfmk/machine/pmap.h */
+extern int dtrace_keep_kernel_symbols(void);
 }
 
+extern unsigned long gVirtBase;
+extern unsigned long gPhysBase;
+#if CONFIG_EMBEDDED
+extern vm_offset_t   segLOWESTTEXT;
+#endif /* CONFIG_EMBEDDED */
+
 static OSReturn _OSKextCreateRequest(
     const char    * predicate,
     OSDictionary ** requestP);
@@ -110,10 +114,6 @@ static bool _OSKextInUnloadedPrelinkedKexts(const OSSymbol * theBundleID);
 // So few pad slots, though....
 static bool _OSArrayContainsCString(OSArray * array, const char * cString);
 
-#if CONFIG_KEC_FIPS
-static void * GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict);
-#endif // CONFIG_KEC_FIPS
-
 /* Prelinked arm kexts do not have VM entries because the method we use to
  * fake an entry (see libsa/bootstrap.cpp:readPrelinkedExtensions()) does
  * not work on ARM.  To get around that, we must free prelinked kext
@@ -327,6 +327,23 @@ kmod_info_t g_kernel_kmod_info = {
     /* stop            */ 0
 };
 
+/* Set up a fake kmod_info struct for statically linked kexts that don't have one. */
+
+kmod_info_t invalid_kmod_info = {
+    /* next            */ 0,
+    /* info_version    */ KMOD_INFO_VERSION,
+    /* id              */ UINT32_MAX,
+    /* name            */ "invalid",
+    /* version         */ "0",
+    /* reference_count */ -1,
+    /* reference_list  */ NULL,
+    /* address         */ 0,
+    /* size            */ 0,
+    /* hdr_size        */ 0,
+    /* start           */ 0,
+    /* stop            */ 0
+};
+
 extern "C" {
 // symbol 'kmod' referenced in: model_dep.c, db_trace.c, symbols.c, db_low_trace.c,
 // dtrace.c, dtrace_glue.h, OSKext.cpp, locore.s, lowmem_vectors.s,
@@ -352,6 +369,25 @@ static u_long     last_unloaded_strlen          = 0;
 static void     * last_unloaded_address         = NULL;
 static u_long     last_unloaded_size            = 0;
 
+// Statically linked kmods described by several mach-o sections:
+//
+// kPrelinkInfoSegment:kBuiltinInfoSection
+// Array of pointers to kmod_info_t structs.
+//
+// kPrelinkInfoSegment:kBuiltinStartSection
+// Array of pointers to an embedded mach-o header.
+//
+// __DATA:kBuiltinInitSection, kBuiltinTermSection
+// Structors for all kmods. Has to be filtered by proc address.
+//
+
+static uint32_t gBuiltinKmodsCount;
+static kernel_section_t * gBuiltinKmodsSectionInfo;
+static kernel_section_t * gBuiltinKmodsSectionStart;
+
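In other words (a sketch only; the real lookups appear in initialize() and
initWithPrelinkedInfoDict() below), builtin kmod i is described by pairing the
two sections entry-for-entry:

    kmod_info_t * ki    = ((kmod_info_t **) gBuiltinKmodsSectionInfo->addr)[i];
    uintptr_t     start = ((uintptr_t *)    gBuiltinKmodsSectionStart->addr)[i];
    uintptr_t     end   = ((uintptr_t *)    gBuiltinKmodsSectionStart->addr)[i + 1];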
+static const OSSymbol       * gIOSurfaceIdentifier;
+vm_tag_t                      gIOSurfaceTag;
+
 /*********************************************************************
 * sKextInnerLock protects against cross-calls with IOService and
 * IOCatalogue, and owns the variables declared immediately below.
@@ -687,6 +723,10 @@ OSKext::initialize(void)
     }
 
     PE_parse_boot_argn("keepsyms", &sKeepSymbols, sizeof(sKeepSymbols));
+#if CONFIG_DTRACE
+    if (dtrace_keep_kernel_symbols())
+        sKeepSymbols = true;
+#endif /* CONFIG_DTRACE */
 #if KASAN_DYNAMIC_BLACKLIST
     /* needed for function lookup */
     sKeepSymbols = true;
@@ -717,6 +757,7 @@ OSKext::initialize(void)
     sKernelKext->version = OSKextParseVersionString(osrelease);
     sKernelKext->compatibleVersion = sKernelKext->version;
     sKernelKext->linkedExecutable = kernelExecutable;
+    sKernelKext->interfaceUUID = sKernelKext->copyUUID();
     
     sKernelKext->flags.hasAllDependencies = 1;
     sKernelKext->flags.kernelComponent = 1;
@@ -783,6 +824,27 @@ OSKext::initialize(void)
     OSSafeReleaseNULL(kernelCPUType);
     OSSafeReleaseNULL(kernelCPUSubtype);
 
+    gBuiltinKmodsSectionInfo = getsectbyname(kPrelinkInfoSegment, kBuiltinInfoSection);
+    if (gBuiltinKmodsSectionInfo) {
+        uint32_t count;
+
+        assert(gBuiltinKmodsSectionInfo->addr);
+        assert(gBuiltinKmodsSectionInfo->size);
+        gBuiltinKmodsCount = (gBuiltinKmodsSectionInfo->size / sizeof(kmod_info_t *));
+
+        gBuiltinKmodsSectionStart = getsectbyname(kPrelinkInfoSegment, kBuiltinStartSection);
+        assert(gBuiltinKmodsSectionStart);
+        assert(gBuiltinKmodsSectionStart->addr);
+        assert(gBuiltinKmodsSectionStart->size);
+        count = (gBuiltinKmodsSectionStart->size / sizeof(uintptr_t));
+        // one extra pointer for the end of last kmod
+        assert(count == (gBuiltinKmodsCount + 1));
+
+        vm_kernel_builtinkmod_text     = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[0];
+        vm_kernel_builtinkmod_text_end = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[count - 1];
+    }
+    gIOSurfaceIdentifier = OSSymbol::withCStringNoCopy("com.apple.iokit.IOSurface");
+
     timestamp = __OSAbsoluteTimePtr(&last_loaded_timestamp);
     *timestamp = 0;
     timestamp = __OSAbsoluteTimePtr(&last_unloaded_timestamp);
@@ -801,8 +863,8 @@ OSKext::initialize(void)
 }
 
 /*********************************************************************
-* This could be in OSKextLib.cpp but we need to hold a lock
-* while removing all the segments and sKextLock will do.
+* This is expected to be called exactly once, from exactly one thread
+* context, during kernel bootstrap.
 *********************************************************************/
 /* static */
 OSReturn
@@ -810,8 +872,6 @@ OSKext::removeKextBootstrap(void)
 {
     OSReturn                   result                = kOSReturnError;
     
-    static bool                alreadyDone           = false;
-
     const char               * dt_kernel_header_name = "Kernel-__HEADER";
     const char               * dt_kernel_symtab_name = "Kernel-__SYMTAB";
     kernel_mach_header_t     * dt_mach_header        = NULL;
@@ -828,17 +888,6 @@ OSKext::removeKextBootstrap(void)
     int                        segment_size          = 0;
 #endif
 
-   /* This must be the very first thing done by this function.
-    */
-    IORecursiveLockLock(sKextLock);
-
-   /* If we already did this, it's a success.
-    */
-    if (alreadyDone) {
-        result = kOSReturnSuccess;
-        goto finish;
-    }
-
     OSKextLog(/* kext */ NULL,
         kOSKextLogProgressLevel |
         kOSKextLogGeneralFlag,
@@ -870,7 +919,6 @@ OSKext::removeKextBootstrap(void)
     }
 
 #if __arm__ || __arm64__
-#if !(defined(KERNEL_INTEGRITY_KTRR))
    /* Free the memory that was set up by bootx.
     */
     dt_segment_name = "Kernel-__KLD";
@@ -882,7 +930,6 @@ OSKext::removeKextBootstrap(void)
         IODTFreeLoaderInfo(dt_segment_name, (void *)segment_paddress,
             (int)segment_size);
     }
-#endif /* !(defined(KERNEL_INTEGRITY_KTRR)) */
 #elif __i386__ || __x86_64__
    /* On x86, use the mapping data from the segment load command to
     * unload KLD directly.
@@ -943,7 +990,7 @@ OSKext::removeKextBootstrap(void)
                 kOSKextLogErrorLevel |
                 kOSKextLogGeneralFlag | kOSKextLogArchiveFlag,
                 "Can't copy __LINKEDIT segment for VM reassign.");
-            goto finish;
+            return result;
         }
         seg_copy_offset = (vm_map_offset_t) seg_copy;
 
@@ -978,7 +1025,7 @@ OSKext::removeKextBootstrap(void)
                 kOSKextLogGeneralFlag | kOSKextLogArchiveFlag,
                 "Can't create __LINKEDIT VM entry at %p, length 0x%llx (error 0x%x).",
                 seg_data, seg_length, mem_result);
-            goto finish;
+            return result;
         }
 
        /* And copy it back.
@@ -1018,15 +1065,8 @@ OSKext::removeKextBootstrap(void)
 
     seg_to_remove = NULL;
 
-    alreadyDone = true;
     result = kOSReturnSuccess;
 
-finish:
-
-   /* This must be the very last thing done before returning.
-    */
-    IORecursiveLockUnlock(sKextLock);
-
     return result;
 }
 
@@ -1503,12 +1543,12 @@ OSKext::initWithPrelinkedInfoDict(
             goto finish;
         }
 
-        data = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide);
+        data = (void *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue()));
         length = (uint32_t) (lengthNum->unsigned32BitValue());
 
 #if KASLR_KEXT_DEBUG
         IOLog("kaslr: unslid 0x%lx slid 0x%lx length %u - prelink executable \n",
-              (unsigned long)VM_KERNEL_UNSLIDE(data), 
+              (unsigned long)ml_static_unslide(data), 
               (unsigned long)data,
               length);
 #endif
@@ -1521,11 +1561,11 @@ OSKext::initWithPrelinkedInfoDict(
          */
         addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject(kPrelinkExecutableSourceKey));
         if (addressNum) {
-            srcData = (void *) ((intptr_t) (addressNum->unsigned64BitValue()) + vm_kernel_slide);
+            srcData = (void *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue()));
             
 #if KASLR_KEXT_DEBUG
             IOLog("kaslr: unslid 0x%lx slid 0x%lx - prelink executable source \n",
-                  (unsigned long)VM_KERNEL_UNSLIDE(srcData),
+                  (unsigned long)ml_static_unslide(srcData),
                   (unsigned long)srcData);
 #endif
             
@@ -1583,14 +1623,14 @@ OSKext::initWithPrelinkedInfoDict(
         }
 
         if (addressNum->unsigned64BitValue() != 0) {
-            kmod_info = (kmod_info_t *) (intptr_t) (addressNum->unsigned64BitValue() + vm_kernel_slide);
-            kmod_info->address += vm_kernel_slide;
+            kmod_info = (kmod_info_t *) ml_static_slide((intptr_t) (addressNum->unsigned64BitValue()));
+            kmod_info->address = ml_static_slide(kmod_info->address);
 #if KASLR_KEXT_DEBUG
             IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info \n",
-                  (unsigned long)VM_KERNEL_UNSLIDE(kmod_info), 
+                  (unsigned long)ml_static_unslide(kmod_info), 
                   (unsigned long)kmod_info);
             IOLog("kaslr: unslid 0x%lx slid 0x%lx - kmod_info->address \n", 
-                  (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address), 
+                  (unsigned long)ml_static_unslide(kmod_info->address), 
                   (unsigned long)kmod_info->address);
  #endif
         }
@@ -1598,6 +1638,23 @@ OSKext::initWithPrelinkedInfoDict(
         anInfoDict->removeObject(kPrelinkKmodInfoKey);
     }
 
+    if ((addressNum = OSDynamicCast(OSNumber, anInfoDict->getObject("ModuleIndex"))))
+    {
+        uintptr_t builtinTextStart;
+        uintptr_t builtinTextEnd;
+
+        flags.builtin = true;
+        builtinKmodIdx = addressNum->unsigned32BitValue();
+        assert(builtinKmodIdx < gBuiltinKmodsCount);
+
+        builtinTextStart = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[builtinKmodIdx];
+        builtinTextEnd   = ((uintptr_t *)gBuiltinKmodsSectionStart->addr)[builtinKmodIdx + 1];
+
+        kmod_info = ((kmod_info_t **)gBuiltinKmodsSectionInfo->addr)[builtinKmodIdx];
+        kmod_info->address = builtinTextStart;
+        kmod_info->size    = builtinTextEnd - builtinTextStart;
+    }
+
    /* If the plist has a UUID for an interface, save that off.
     */
     if (isInterface()) {
@@ -3345,13 +3402,17 @@ finish:
 OSData *
 OSKext::copyKextUUIDForAddress(OSNumber *address)
 {
-       OSData *uuid = NULL;
+    OSData              * uuid = NULL;
+    OSKextActiveAccount * active;
+    OSKext              * kext = NULL;
+    uint32_t              baseIdx;
+    uint32_t              lim;
 
        if (!address) {
                return NULL;
        }
 
-       uintptr_t addr = (uintptr_t)address->unsigned64BitValue() + vm_kernel_slide;
+       uintptr_t addr = ml_static_slide((uintptr_t)address->unsigned64BitValue());
 
 #if CONFIG_MACF
        /* Is the calling process allowed to query kext info? */
@@ -3373,18 +3434,36 @@ OSKext::copyKextUUIDForAddress(OSNumber *address)
        }
 #endif
 
-       if (((vm_offset_t)addr >= vm_kernel_stext) && ((vm_offset_t)addr < vm_kernel_etext)) {
-               /* address in xnu proper */
-               unsigned long uuid_len = 0;
-               uuid = OSData::withBytes(getuuidfromheader(&_mh_execute_header, &uuid_len), uuid_len);
-       } else {
-               IOLockLock(sKextSummariesLock);
-               OSKextLoadedKextSummary *summary = OSKext::summaryForAddress(addr);
-               if (summary) {
-                       uuid = OSData::withBytes(summary->uuid, sizeof(uuid_t));
-               }
-               IOLockUnlock(sKextSummariesLock);
-       }
+    IOSimpleLockLock(sKextAccountsLock);
+    // bsearch sKextAccounts list
+    for (baseIdx = 0, lim = sKextAccountsCount; lim; lim >>= 1)
+    {
+        active = &sKextAccounts[baseIdx + (lim >> 1)];
+        if ((addr >= active->address) && (addr < active->address_end))
+        {
+            kext = active->account->kext;
+            if (kext) kext->retain();
+            break;
+        }
+        else if (addr > active->address)
+        {
+            // move right
+            baseIdx += (lim >> 1) + 1;
+            lim--;
+        }
+        // else move left
+    }
+    IOSimpleLockUnlock(sKextAccountsLock);
+
+    if (kext)
+    {
+        uuid = kext->copyTextUUID();
+        kext->release();
+    }
+    else if (((vm_offset_t)addr >= vm_kernel_stext) && ((vm_offset_t)addr < vm_kernel_etext))
+    {
+        uuid = sKernelKext->copyTextUUID();
+    }
 
        return uuid;
 }
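The lookup above is a binary search over sKextAccounts, which the search assumes
is sorted by address and made of non-overlapping [address, address_end) ranges.
A self-contained sketch of the same algorithm (the struct and names are
stand-ins, not the kernel types):

    #include <stdint.h>

    struct range { uintptr_t start, end; };   // stand-in for OSKextActiveAccount

    // Returns the index whose [start, end) range contains addr, or -1.
    static int
    find_range(const struct range * list, uint32_t count, uintptr_t addr)
    {
        uint32_t baseIdx = 0, lim;

        for (lim = count; lim; lim >>= 1) {
            uint32_t mid = baseIdx + (lim >> 1);
            if ((addr >= list[mid].start) && (addr < list[mid].end)) {
                return (int) mid;             // hit
            }
            if (addr > list[mid].start) {
                baseIdx = mid + 1;            // move right
                lim--;
            }
            // else move left: lim has already been halved by the loop
        }
        return -1;                            // no containing range
    }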
@@ -3923,6 +4002,8 @@ OSKext::getExecutable(void)
     OSData * extractedExecutable = NULL;  // must release
     OSData * mkextExecutableRef  = NULL;  // do not release
 
+    if (flags.builtin) return (sKernelKext->linkedExecutable);
+
     result = OSDynamicCast(OSData, infoDict->getObject(_kOSKextExecutableKey));
     if (result) {
         goto finish;
@@ -4124,10 +4205,7 @@ OSKext::copyUUID(void)
 {
     OSData                     * result        = NULL;
     OSData                     * theExecutable = NULL;  // do not release
-    const kernel_mach_header_t * header        = NULL;
-    const struct load_command  * load_cmd      = NULL;
-    const struct uuid_command  * uuid_cmd      = NULL;
-    uint32_t                     i;
+    const kernel_mach_header_t * header;
 
    /* An interface kext doesn't have a linked executable with an LC_UUID,
     * we create one when it's linked.
@@ -4138,6 +4216,8 @@ OSKext::copyUUID(void)
         goto finish;
     }
 
+    if (flags.builtin || isInterface()) return (sKernelKext->copyUUID());
+
    /* For real kexts, try to get the UUID from the linked executable,
    * or if it hasn't been linked yet, the unrelocated executable.
     */
@@ -4150,6 +4230,34 @@ OSKext::copyUUID(void)
     }
 
     header = (const kernel_mach_header_t *)theExecutable->getBytesNoCopy();
+    result = copyMachoUUID(header);
+
+finish:
+    return result;
+}
+
+/*********************************************************************
+*********************************************************************/
+OSData *
+OSKext::copyTextUUID(void)
+{
+    if (flags.builtin)
+    {
+        return (copyMachoUUID((const kernel_mach_header_t *)kmod_info->address));
+    }
+    return (copyUUID());
+}
+
+/*********************************************************************
+*********************************************************************/
+OSData *
+OSKext::copyMachoUUID(const kernel_mach_header_t * header)
+{
+    OSData                     * result        = NULL;
+    const struct load_command  * load_cmd      = NULL;
+    const struct uuid_command  * uuid_cmd      = NULL;
+    uint32_t                     i;
+
     load_cmd = (const struct load_command *)&header[1];
         
     if (header->magic != MH_MAGIC_KERNEL) {
@@ -4799,7 +4907,9 @@ OSKext::load(
     pendingPgoHead.next = &pendingPgoHead;
     pendingPgoHead.prev = &pendingPgoHead;
 
-    uuid_generate(instance_uuid);
+    // The kernel PRNG is not initialized when the first kext is
+    // loaded, so use early random
+    uuid_generate_early_random(instance_uuid);
     account = IONew(OSKextAccount, 1);
     if (!account) {
        result = KERN_MEMORY_ERROR;
@@ -4810,6 +4920,10 @@ OSKext::load(
     account->site.refcount = 0;
     account->site.flags = VM_TAG_KMOD;
     account->kext = this;
+    if (gIOSurfaceIdentifier == bundleID) {
+        vm_tag_alloc(&account->site);
+        gIOSurfaceTag = account->site.tag;
+    }
 
     flags.loaded = true;
 
@@ -4964,6 +5078,8 @@ OSKext::lookupSection(const char *segname, const char *secname)
     kernel_segment_command_t * seg           = NULL;
     kernel_section_t         * sec           = NULL;
 
+    if (!linkedExecutable) return (NULL);
+
     mh = (kernel_mach_header_t *)linkedExecutable->getBytesNoCopy();
 
     for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) {
@@ -5010,7 +5126,7 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides)
     int                            reloc_size;
     vm_offset_t                    new_kextsize;
 
-    if (linkedExecutable == NULL || vm_kernel_slide == 0) {
+    if (linkedExecutable == NULL || flags.builtin) {
         result = kOSReturnSuccess;
         goto finish;
     }
@@ -5022,12 +5138,13 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides)
         if (!seg->vmaddr) {
             continue;
         }
-        seg->vmaddr += vm_kernel_slide;
-                
+
+        seg->vmaddr = ml_static_slide(seg->vmaddr);
+
 #if KASLR_KEXT_DEBUG
         IOLog("kaslr: segname %s unslid 0x%lx slid 0x%lx \n",
               seg->segname,
-              (unsigned long)VM_KERNEL_UNSLIDE(seg->vmaddr), 
+              (unsigned long)ml_static_unslide(seg->vmaddr), 
               (unsigned long)seg->vmaddr);
 #endif
        
@@ -5041,12 +5158,12 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides)
             linkeditSeg = seg;
         }
         for (sec = firstsect(seg); sec != NULL; sec = nextsect(seg, sec)) {
-            sec->addr += vm_kernel_slide;
+            sec->addr = ml_static_slide(sec->addr);
 
 #if KASLR_KEXT_DEBUG
             IOLog("kaslr: sectname %s unslid 0x%lx slid 0x%lx \n",
                   sec->sectname,
-                  (unsigned long)VM_KERNEL_UNSLIDE(sec->addr), 
+                  (unsigned long)ml_static_unslide(sec->addr), 
                   (unsigned long)sec->addr);
 #endif
         }
@@ -5065,13 +5182,13 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides)
                 if (sym[i].n_type & N_STAB) {
                     continue;
                 }
-                sym[i].n_value += vm_kernel_slide;
+                sym[i].n_value = ml_static_slide(sym[i].n_value);
                 
 #if KASLR_KEXT_DEBUG
 #define MAX_SYMS_TO_LOG 5
                 if ( i < MAX_SYMS_TO_LOG ) {
                     IOLog("kaslr: LC_SYMTAB unslid 0x%lx slid 0x%lx \n", 
-                          (unsigned long)VM_KERNEL_UNSLIDE(sym[i].n_value), 
+                          (unsigned long)ml_static_unslide(sym[i].n_value), 
                           (unsigned long)sym[i].n_value);
                 }
 #endif
@@ -5130,13 +5247,14 @@ OSKext::slidePrelinkedExecutable(bool doCoalesedSlides)
                 if (reloc[i].r_pcrel != 0) {
                     continue;
                 }
-                *((uintptr_t *)(relocBase + reloc[i].r_address)) += vm_kernel_slide;
+                uintptr_t *relocAddr = (uintptr_t*)(relocBase + reloc[i].r_address);
+                *relocAddr = ml_static_slide(*relocAddr);
 
 #if KASLR_KEXT_DEBUG
 #define MAX_DYSYMS_TO_LOG 5
                 if ( i < MAX_DYSYMS_TO_LOG ) {
                     IOLog("kaslr: LC_DYSYMTAB unslid 0x%lx slid 0x%lx \n", 
-                          (unsigned long)VM_KERNEL_UNSLIDE(*((uintptr_t *)(relocBase + reloc[i].r_address))), 
+                          (unsigned long)ml_static_unslide(*((uintptr_t *)(relocAddr))), 
                           (unsigned long)*((uintptr_t *)(relocBase + reloc[i].r_address)));
                 }
 #endif
@@ -5272,7 +5390,7 @@ OSKext::loadExecutable()
     }
 
     /* <rdar://problem/21444003> all callers must be entitled */
-    if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-management")) {
+    if (FALSE == IOTaskHasEntitlement(current_task(), "com.apple.rootless.kext-secure-management")) {
         OSKextLog(this,
                   kOSKextLogErrorLevel | kOSKextLogLoadFlag,
                   "Not entitled to link kext '%s'",
@@ -5502,7 +5620,7 @@ register_kmod:
             "Kext %s executable loaded; %u pages at 0x%lx (load tag %u).", 
             kmod_info->name,
             (unsigned)kmod_info->size / PAGE_SIZE,
-            (unsigned long)VM_KERNEL_UNSLIDE(kmod_info->address),
+            (unsigned long)ml_static_unslide(kmod_info->address),
             (unsigned)kmod_info->id);
     }
 
@@ -5513,8 +5631,10 @@ register_kmod:
     }
 
 #if KASAN
-    kasan_load_kext((vm_offset_t)linkedExecutable->getBytesNoCopy(),
-                    linkedExecutable->getLength(), getIdentifierCString());
+    if (linkedExecutable) {
+        kasan_load_kext((vm_offset_t)linkedExecutable->getBytesNoCopy(),
+                        linkedExecutable->getLength(), getIdentifierCString());
+    }
 #else
     if (lookupSection(KASAN_GLOBAL_SEGNAME, KASAN_GLOBAL_SECTNAME)) {
         OSKextLog(this,
@@ -5669,6 +5789,7 @@ OSKext::jettisonDATASegmentPadding(void)
     vm_offset_t dataSegEnd, lastSecEnd;
     vm_size_t padSize;
 
+    if (flags.builtin) return;
     mh = (kernel_mach_header_t *)kmod_info->address;
 
     dataSeg = getsegbynamefromheader(mh, SEG_DATA);
@@ -5774,6 +5895,9 @@ OSKext::registerWithDTrace(void)
         if (forceInit == kOSBooleanTrue) {
             modflag |= KMOD_DTRACE_FORCE_INIT;
         }
+        if (flags.builtin) {
+            modflag |= KMOD_DTRACE_STATIC_KEXT;
+        }
 
         (void)(*dtrace_modload)(kmod_info, modflag);
         flags.dtraceInitialized = true;
@@ -5872,7 +5996,7 @@ OSKext::setVMAttributes(bool protect, bool wire)
     vm_map_offset_t             end             = 0;
     OSReturn                    result          = kOSReturnError;
 
-    if (isInterface() || !declaresExecutable()) {
+    if (isInterface() || !declaresExecutable() || flags.builtin) {
         result = kOSReturnSuccess;
         goto finish;
     }
@@ -5981,6 +6105,8 @@ OSKext::validateKextMapping(bool startFlag)
     mach_msg_type_number_t                count;
     vm_region_submap_short_info_data_64_t info;
 
+    if (flags.builtin) return (kOSReturnSuccess);
+
     count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
     bzero(&info, sizeof(info));
 
@@ -6041,8 +6167,8 @@ OSKext::validateKextMapping(bool startFlag)
                 getIdentifierCString(),
                 whichOp,
                 whichOp,
-                (void *)VM_KERNEL_UNSLIDE(address),
-                (void *)VM_KERNEL_UNSLIDE(kmod_info->address));
+                (void *)ml_static_unslide(address),
+                (void *)ml_static_unslide(kmod_info->address));
             result = kOSKextReturnBadData;
             goto finish;
         }
@@ -6060,9 +6186,9 @@ OSKext::validateKextMapping(bool startFlag)
                 getIdentifierCString(),
                 whichOp,
                 whichOp,
-                (void *)VM_KERNEL_UNSLIDE(address),
-                (void *)VM_KERNEL_UNSLIDE(kmod_info->address),
-                (void *)(VM_KERNEL_UNSLIDE(kmod_info->address) + kmod_info->size));
+                (void *)ml_static_unslide(address),
+                (void *)ml_static_unslide(kmod_info->address),
+                (void *)(ml_static_unslide(kmod_info->address) + kmod_info->size));
             result = kOSKextReturnBadData;
             goto finish;
         }
@@ -6084,7 +6210,7 @@ OSKext::validateKextMapping(bool startFlag)
                 kOSKextLogLoadFlag,
                 "Kext %s - bad %s pointer %p.",
                 getIdentifierCString(),
-                whichOp, (void *)VM_KERNEL_UNSLIDE(address)); 
+                whichOp, (void *)ml_static_unslide(address)); 
             result = kOSKextReturnBadData;
             goto finish;
         }
@@ -6164,14 +6290,14 @@ OSKextLogKextInfo(OSKext *aKext, uint64_t address, uint64_t size, firehose_trace
     stamp = firehose_tracepoint_time(firehose_activity_flags_default);
     trace_id.ftid_value = FIREHOSE_TRACE_ID_MAKE(firehose_tracepoint_namespace_metadata, _firehose_tracepoint_type_metadata_kext, (firehose_tracepoint_flags_t)0, code);
 
-    uuid_data = aKext->copyUUID();
+    uuid_data = aKext->copyTextUUID();
     if (uuid_data) {
         memcpy(uuid_info->ftui_uuid, uuid_data->getBytesNoCopy(), sizeof(uuid_info->ftui_uuid));
         OSSafeReleaseNULL(uuid_data);
     }
 
     uuid_info->ftui_size    = size;
-    uuid_info->ftui_address = VM_KERNEL_UNSLIDE(address);
+    uuid_info->ftui_address = ml_static_unslide(address);
 
     firehose_trace_metadata(firehose_stream_metadata, trace_id, stamp, uuid_info, uuid_info_len);
     return;
@@ -6267,33 +6393,10 @@ OSKext::start(bool startDependenciesFlag)
 
     // Drop a log message so logd can grab the needed information to decode this kext
     OSKextLogKextInfo(this, kmod_info->address, kmod_info->size, firehose_tracepoint_code_load);
-
-#if !CONFIG_STATIC_CPPINIT
-    result = OSRuntimeInitializeCPP(kmod_info, NULL);
+    result = OSRuntimeInitializeCPP(this);
     if (result == KERN_SUCCESS) {
-#endif
-
-#if CONFIG_KEC_FIPS
-        kmodStartData = GetAppleTEXTHashForKext(this, this->infoDict);
-        
-#if 0
-        if (kmodStartData) {
-            OSKextLog(this,
-                      kOSKextLogErrorLevel |
-                      kOSKextLogGeneralFlag,
-                      "Kext %s calling module start function. kmodStartData %p. arch %s",
-                      getIdentifierCString(), kmodStartData, ARCHNAME); 
-        }
-#endif
-#endif // CONFIG_KEC_FIPS 
         result = startfunc(kmod_info, kmodStartData);
-
-#if !CONFIG_STATIC_CPPINIT
-        if (result != KERN_SUCCESS) {
-            (void) OSRuntimeFinalizeCPP(kmod_info, NULL);
-        }
     }
-#endif
 
     flags.starting = 0;
 
@@ -6426,11 +6529,9 @@ OSKext::stop(void)
         flags.stopping = 1;
 
         result = stopfunc(kmod_info, /* userData */ NULL);
-#if !CONFIG_STATIC_CPPINIT
         if (result == KERN_SUCCESS) {
-            result = OSRuntimeFinalizeCPP(kmod_info, NULL);
+            result = OSRuntimeFinalizeCPP(this);
         }
-#endif
 
         flags.stopping = 0;
 
@@ -6571,6 +6672,7 @@ OSKext::unload(void)
     if (metaClasses) {
         metaClasses->flushCollection();
     }
+    (void) OSRuntimeFinalizeCPP(this);
 
    /* Remove the kext from the list of loaded kexts, patch the gap
     * in the kmod_info_t linked list, and reset "kmod" to point to the
@@ -8634,14 +8736,10 @@ OSKext::copyLoadedKextInfoByUUID(
         OSKext       *thisKext     = NULL;  // do not release
         Boolean       includeThis  = true;
         uuid_t        thisKextUUID;
+        uuid_t        thisKextTextUUID;
         OSData       *uuid_data;
         uuid_string_t uuid_key;
 
-        if (kextInfo) {
-            kextInfo->release();
-            kextInfo = NULL;
-        }
-
         thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
         if (!thisKext) {
             continue;
@@ -8657,6 +8755,13 @@ OSKext::copyLoadedKextInfoByUUID(
 
        uuid_unparse(thisKextUUID, uuid_key);
 
+       uuid_data = thisKext->copyTextUUID();
+       if (!uuid_data) {
+           continue;
+       }
+       memcpy(&thisKextTextUUID, uuid_data->getBytesNoCopy(), sizeof(thisKextTextUUID));
+       OSSafeReleaseNULL(uuid_data);
+
        /* Skip current kext if we have a list of UUIDs and
         * it isn't in the list.
         */
@@ -8670,8 +8775,15 @@ OSKext::copyLoadedKextInfoByUUID(
                 uuid_t uuid;
                 uuid_parse(wantedUUID->getCStringNoCopy(), uuid);
 
-                if (0 == uuid_compare(uuid, thisKextUUID)) {
+                if ((0 == uuid_compare(uuid, thisKextUUID))
+                 || (0 == uuid_compare(uuid, thisKextTextUUID))) {
                     includeThis = true;
+                   /* Only need to find the first kext if multiple match,
+                    * i.e. asking for the kernel UUID does not need to find
+                    * interface kexts or builtin static kexts.
+                    */
+                    kextIdentifiers->removeObject(idIndex);
+                    uuid_unparse(uuid, uuid_key);
                     break;
                 }
 
@@ -8685,14 +8797,17 @@ OSKext::copyLoadedKextInfoByUUID(
         kextInfo = thisKext->copyInfo(infoKeys);
         if (kextInfo) {
             result->setObject(uuid_key, kextInfo);
+            kextInfo->release();
+        }
+
+        if (kextIdentifiers && !kextIdentifiers->getCount()) {
+            break;
         }
     }
 
 finish:
     IORecursiveLockUnlock(sKextLock);
 
-    if (kextInfo) kextInfo->release();
-
     return result;
 }
 
@@ -8705,10 +8820,8 @@ OSKext::copyLoadedKextInfo(
     OSArray * infoKeys) 
 {
     OSDictionary * result = NULL;
-    OSDictionary * kextInfo = NULL;  // must release
-    uint32_t       count, i;
     uint32_t       idCount = 0;
-    uint32_t       idIndex = 0;
+    bool           onlyLoaded;
 
     IORecursiveLockLock(sKextLock);
 
@@ -8746,8 +8859,9 @@ OSKext::copyLoadedKextInfo(
         infoKeys = NULL;
     }
 
-    count = sLoadedKexts->getCount();
-    result = OSDictionary::withCapacity(count);
+    onlyLoaded =  (!infoKeys || !_OSArrayContainsCString(infoKeys, kOSBundleAllPrelinkedKey));
+
+    result = OSDictionary::withCapacity(128);
     if (!result) {
         goto finish;
     }
@@ -8790,28 +8904,31 @@ OSKext::copyLoadedKextInfo(
               vm_slinkedit, vm_elinkedit);
 #endif
 
-    for (i = 0; i < count; i++) {
-        OSKext   * thisKext     = NULL;  // do not release
-        Boolean    includeThis  = true;
+    sKextsByID->iterateObjects(^bool(const OSSymbol * thisKextID, OSObject * obj)
+    {
+        OSKext       * thisKext     = NULL;  // do not release
+        Boolean        includeThis  = true;
+        OSDictionary * kextInfo     = NULL;  // must release
 
-        if (kextInfo) {
-            kextInfo->release();
-            kextInfo = NULL;
-        }
-        thisKext = OSDynamicCast(OSKext, sLoadedKexts->getObject(i));
+        thisKext = OSDynamicCast(OSKext, obj);
         if (!thisKext) {
-            continue;
+            return (false);
+        }
+
+        /* Skip current kext if not yet started and caller didn't request all.
+         */
+        if (onlyLoaded && (-1U == sLoadedKexts->getNextIndexOfObject(thisKext, 0))) {
+            return (false);
         }
 
        /* Skip current kext if we have a list of bundle IDs and
         * it isn't in the list.
         */
         if (kextIdentifiers) {
-            const OSString * thisKextID = thisKext->getIdentifier();
 
             includeThis = false;
 
-            for (idIndex = 0; idIndex < idCount; idIndex++) {
+            for (uint32_t idIndex = 0; idIndex < idCount; idIndex++) {
                 const OSString * thisRequestID = OSDynamicCast(OSString,
                     kextIdentifiers->getObject(idIndex));
                 if (thisKextID->isEqualTo(thisRequestID)) {
@@ -8822,20 +8939,20 @@ OSKext::copyLoadedKextInfo(
         }
         
         if (!includeThis) {
-            continue;
+            return (false);
         }
 
         kextInfo = thisKext->copyInfo(infoKeys);
         if (kextInfo) {
             result->setObject(thisKext->getIdentifier(), kextInfo);
+            kextInfo->release();
         }
-    }
+        return (false);
+    });
     
 finish:
     IORecursiveLockUnlock(sKextLock);
 
-    if (kextInfo) kextInfo->release();
-
     return result;
 }
 
@@ -8951,10 +9068,10 @@ OSKext::copyInfo(OSArray * infoKeys)
                                       __FUNCTION__, segp->vmaddr, vm_kext_base, vm_kext_top);
                         }
 #endif
-                        segp->vmaddr = VM_KERNEL_UNSLIDE(segp->vmaddr);
+                        segp->vmaddr = ml_static_unslide(segp->vmaddr);
 
                         for (secp = firstsect(segp); secp != NULL; secp = nextsect(segp, secp)) {
-                            secp->addr = VM_KERNEL_UNSLIDE(secp->addr);
+                            secp->addr = ml_static_unslide(secp->addr);
                         }
                     }
                     lcp = (struct load_command *)((caddr_t)lcp + lcp->cmdsize);
@@ -9096,6 +9213,8 @@ OSKext::copyInfo(OSArray * infoKeys)
             }
 
             result->setObject(kOSBundleExecutablePathKey, executablePathString);
+        } else if (flags.builtin) {
+            result->setObject(kOSBundleExecutablePathKey, bundleID);
         }
     }
 
@@ -9105,6 +9224,13 @@ OSKext::copyInfo(OSArray * infoKeys)
         uuid = copyUUID();
         if (uuid) {
             result->setObject(kOSBundleUUIDKey, uuid);
+            uuid->release();
+        }
+    }
+    if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleTextUUIDKey)) {
+        uuid = copyTextUUID();
+        if (uuid) {
+            result->setObject(kOSBundleTextUUIDKey, uuid); uuid->release();
         }
     }
     
@@ -9152,7 +9278,7 @@ OSKext::copyInfo(OSArray * infoKeys)
         _OSArrayContainsCString(infoKeys, kOSBundleExecLoadSizeKey) ||
         _OSArrayContainsCString(infoKeys, kOSBundleWiredSizeKey))
     {
-        if (isInterface() || linkedExecutable) {
+        if (isInterface() || flags.builtin || linkedExecutable) {
            /* These go to userspace via serialization, so we don't want any doubts
             * about their size.
             */
@@ -9168,21 +9294,27 @@ OSKext::copyInfo(OSArray * infoKeys)
             * xxx - leaving in # when we have a linkedExecutable...a kernelcomp
             * xxx - shouldn't have one!
             */
-            if (linkedExecutable /* && !isInterface() */) {
+
+            if (flags.builtin || linkedExecutable) {
                 kernel_mach_header_t     *mh  = NULL;
                 kernel_segment_command_t *seg = NULL;
 
-                loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy();
+                if (flags.builtin) {
+                    loadAddress = kmod_info->address;
+                    loadSize    = kmod_info->size;
+                } else  {
+                    loadAddress = (uint64_t)linkedExecutable->getBytesNoCopy();
+                    loadSize = linkedExecutable->getLength();
+                }
                 mh = (kernel_mach_header_t *)loadAddress;
-                loadAddress = VM_KERNEL_UNSLIDE(loadAddress);
-                loadSize = linkedExecutable->getLength();
+                loadAddress = ml_static_unslide(loadAddress);
 
                /* Walk through the kext, looking for the first executable
                 * segment in case we were asked for its size/address.
                 */
                 for (seg = firstsegfromheader(mh); seg != NULL; seg = nextsegfromheader(mh, seg)) {
                     if (seg->initprot & VM_PROT_EXECUTE) {
-                        execLoadAddress = VM_KERNEL_UNSLIDE(seg->vmaddr);
+                        execLoadAddress = ml_static_unslide(seg->vmaddr);
                         execLoadSize = seg->vmsize;
                         break;
                     }
@@ -9208,6 +9340,23 @@ OSKext::copyInfo(OSArray * infoKeys)
                 result->setObject(kOSBundleLoadAddressKey, scratchNumber);
                 OSSafeReleaseNULL(scratchNumber);
             }
+#if CONFIG_EMBEDDED
+            if ((!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleCacheLoadAddressKey))
+              && loadAddress && loadSize) {
+                scratchNumber = OSNumber::withNumber(
+                    (unsigned long long)ml_static_unslide((uintptr_t)segLOWESTTEXT),
+                    /* numBits */ 8 * sizeof(loadAddress));
+                if (!scratchNumber) {
+                    goto finish;
+                }
+                result->setObject(kOSBundleCacheLoadAddressKey, scratchNumber);
+                OSSafeReleaseNULL(scratchNumber);
+            }
+            if ((!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleKextsInKernelTextKey))
+             && (this == sKernelKext) && gBuiltinKmodsCount) {
+                result->setObject(kOSBundleKextsInKernelTextKey, kOSBooleanTrue);
+            }
+#endif /* CONFIG_EMBEDDED */
             if (!infoKeys || _OSArrayContainsCString(infoKeys, kOSBundleExecLoadAddressKey)) {
                 scratchNumber = OSNumber::withNumber(
                     (unsigned long long)(execLoadAddress),
@@ -9358,7 +9507,6 @@ finish:
     OSSafeReleaseNULL(cpuSubtypeNumber);
     OSSafeReleaseNULL(executablePathString);
     if (executablePathCString) kfree(executablePathCString, executablePathCStringSize);
-    OSSafeReleaseNULL(uuid);
     OSSafeReleaseNULL(scratchNumber);
     OSSafeReleaseNULL(dependencyLoadTags);
     OSSafeReleaseNULL(metaClassIterator);
@@ -10913,10 +11061,10 @@ bool ScanForAddrInObject(OSObject * theObject,
 /* static */
 void
 OSKext::printKextsInBacktrace(
-    vm_offset_t  * addr,
-    unsigned int   cnt,
-    int         (* printf_func)(const char *fmt, ...),
-    uint32_t       flags)
+    vm_offset_t  * addr __unused,
+    unsigned int   cnt __unused,
+    int         (* printf_func)(const char *fmt, ...) __unused,
+    uint32_t       flags __unused)
 {
     addr64_t    summary_page = 0;
     addr64_t    last_summary_page = 0;
@@ -11038,24 +11186,46 @@ OSKext::summaryForAddress(const uintptr_t addr)
 
 /* static */
 void *
-OSKext::kextForAddress(const void *addr)
+OSKext::kextForAddress(const void *address)
 {
-       void *image = NULL;
+    void                * image = NULL;
+    OSKextActiveAccount * active;
+    OSKext              * kext = NULL;
+    uint32_t              baseIdx;
+    uint32_t              lim;
+    uintptr_t             addr = (uintptr_t) address;
 
-       if (((vm_offset_t)(uintptr_t)addr >= vm_kernel_stext) &&
-                       ((vm_offset_t)(uintptr_t)addr < vm_kernel_etext)) {
-               return (void *)&_mh_execute_header;
-       }
-
-       if (!sKextSummariesLock) {
+       if (!addr) {
                return NULL;
        }
-       IOLockLock(sKextSummariesLock);
-       OSKextLoadedKextSummary *summary = OSKext::summaryForAddress((uintptr_t)addr);
-       if (summary) {
-               image = (void *)summary->address;
-       }
-       IOLockUnlock(sKextSummariesLock);
+
+    if (sKextAccountsCount)
+    {
+        IOSimpleLockLock(sKextAccountsLock);
+        // bsearch sKextAccounts list
+        for (baseIdx = 0, lim = sKextAccountsCount; lim; lim >>= 1)
+        {
+            active = &sKextAccounts[baseIdx + (lim >> 1)];
+            if ((addr >= active->address) && (addr < active->address_end))
+            {
+                kext = active->account->kext;
+                if (kext && kext->kmod_info) image = (void *) kext->kmod_info->address;
+                break;
+            }
+            else if (addr > active->address)
+            {
+                // move right
+                baseIdx += (lim >> 1) + 1;
+                lim--;
+            }
+            // else move left
+        }
+        IOSimpleLockUnlock(sKextAccountsLock);
+    }
+    if (!image && (addr >= vm_kernel_stext) && (addr < vm_kernel_etext))
+    {
+        image = (void *) &_mh_execute_header;
+    }
 
        return image;
 }
@@ -11108,7 +11278,7 @@ void OSKext::printSummary(
     (void) uuid_unparse(summary->uuid, uuid);
     
     if (kPrintKextsUnslide & flags) {
-        tmpAddr = VM_KERNEL_UNSLIDE(summary->address);
+        tmpAddr = ml_static_unslide(summary->address);
     }
     else {
         tmpAddr = summary->address;
@@ -11129,7 +11299,7 @@ void OSKext::printSummary(
         if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)kmod_ref)) == 0) {
             (*printf_func)("            kmod dependency scan stopped "
                            "due to missing dependency page: %p\n",
-                          (kPrintKextsUnslide & flags) ? (void *)VM_KERNEL_UNSLIDE(kmod_ref) : kmod_ref);
+                          (kPrintKextsUnslide & flags) ? (void *)ml_static_unslide((vm_offset_t)kmod_ref) : kmod_ref);
             break;
         }
         rinfo = kmod_ref->info;
@@ -11137,7 +11307,7 @@ void OSKext::printSummary(
         if (pmap_find_phys(kernel_pmap, (addr64_t)((uintptr_t)rinfo)) == 0) {
             (*printf_func)("            kmod dependency scan stopped "
                            "due to missing kmod page: %p\n",
-                          (kPrintKextsUnslide & flags) ? (void *)VM_KERNEL_UNSLIDE(rinfo) : rinfo);
+                          (kPrintKextsUnslide & flags) ? (void *)ml_static_unslide((vm_offset_t)rinfo) : rinfo);
             break;
         }
         
@@ -11149,7 +11319,7 @@ void OSKext::printSummary(
         findSummaryUUID(rinfo->id, uuid);
         
         if (kPrintKextsUnslide & flags) {
-            tmpAddr = VM_KERNEL_UNSLIDE(rinfo->address);
+            tmpAddr = ml_static_unslide(rinfo->address);
         }
         else {
             tmpAddr = rinfo->address;
@@ -11718,8 +11888,16 @@ OSKext::updateLoadedKextSummary(OSKextLoadedKextSummary *summary)
         OSSafeReleaseNULL(uuid);
     }
 
-    summary->address = kmod_info->address;
-    summary->size = kmod_info->size;
+    if (flags.builtin) {
+//      this value will stop lldb from parsing the mach-o header
+//      summary->address = UINT64_MAX;
+//      summary->size = 0;
+        summary->address = kmod_info->address;
+        summary->size = kmod_info->size;
+    } else {
+        summary->address = kmod_info->address;
+        summary->size = kmod_info->size;
+    }
     summary->version = getVersion();
     summary->loadTag = kmod_info->id;
     summary->flags = 0;
@@ -11737,8 +11915,9 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp)
     kernel_mach_header_t     *hdr = NULL;
     kernel_segment_command_t *seg = NULL;
 
-    hdr = (kernel_mach_header_t *)kmod_info->address;
+    bzero(accountp, sizeof(*accountp));
 
+    hdr = (kernel_mach_header_t *)kmod_info->address;
     if (getcommandfromheader(hdr, LC_SEGMENT_SPLIT_INFO)) {
         /* If this kext supports split segments, use the first
          * executable segment as the range for instructions
@@ -11750,8 +11929,6 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp)
             }
         }
     }
-
-    bzero(accountp, sizeof(*accountp));
     if (seg) {
         accountp->address = seg->vmaddr;
         if (accountp->address) {
@@ -11768,6 +11945,7 @@ OSKext::updateActiveAccount(OSKextActiveAccount *accountp)
             accountp->address_end = kmod_info->address + kmod_info->size;
         }
     }
+
     accountp->account = this->account;
 }
 
@@ -11794,8 +11972,8 @@ OSKextGetAllocationSiteForCaller(uintptr_t address)
            if (!site->tag) vm_tag_alloc_locked(site, &releasesite);
            break;
        }
-       else if (address > active->address) 
-       {       
+       else if (address > active->address)
+       {
            // move right
            baseIdx += (lim >> 1) + 1;
            lim--;
@@ -11833,75 +12011,6 @@ OSKextFreeSite(vm_allocation_site_t * site)
 
 /*********************************************************************
 *********************************************************************/
-    
-#if CONFIG_KEC_FIPS
-    
-#if PRAGMA_MARK
-#pragma mark Kernel External Components for FIPS compliance
-#endif
-    
-/*********************************************************************
- * Kernel External Components for FIPS compliance (KEC_FIPS)
- *********************************************************************/
-static void * 
-GetAppleTEXTHashForKext(OSKext * theKext, OSDictionary *theInfoDict)
-{
-    AppleTEXTHash_t         my_ath = {2, 0, NULL};
-    AppleTEXTHash_t *       my_athp = NULL;         // do not release
-    OSData *                segmentHash = NULL;     // do not release
-    
-    if (theKext == NULL || theInfoDict == NULL) {
-        return(NULL);
-    }
-    
-    // Get the part of the plist associate with kAppleTextHashesKey and let
-    // the crypto library do further parsing (slice/architecture)
-    segmentHash = OSDynamicCast(OSData, theInfoDict->getObject(kAppleTextHashesKey));
-    // Support for ATH v1 while rolling out ATH v2 without revision locking submissions
-    // Remove this when v2 PLIST are supported
-    if (segmentHash == NULL) {
-        // If this fails, we may be dealing with a v1 PLIST
-        OSDictionary *          textHashDict = NULL;    // do not release
-        textHashDict = OSDynamicCast(OSDictionary, theInfoDict->getObject(kAppleTextHashesKey));
-        if (textHashDict == NULL) {
-            return(NULL);
-        }
-        my_ath.ath_version=1;
-        segmentHash = OSDynamicCast(OSData,textHashDict->getObject(ARCHNAME));
-    } // end of v2 rollout
-
-    if (segmentHash == NULL) {
-        return(NULL);
-    }
-    
-    // KEC_FIPS type kexts never unload so we don't have to clean up our 
-    // AppleTEXTHash_t
-    if (kmem_alloc(kernel_map, (vm_offset_t *) &my_athp, 
-                   sizeof(AppleTEXTHash_t), VM_KERN_MEMORY_OSKEXT) != KERN_SUCCESS) {
-        return(NULL);
-    }
-    
-    memcpy(my_athp, &my_ath, sizeof(my_ath));
-    my_athp->ath_length = segmentHash->getLength();
-    if (my_athp->ath_length > 0) {
-        my_athp->ath_hash = (void *)segmentHash->getBytesNoCopy();
-    }
-        
-#if 0
-    OSKextLog(theKext,
-              kOSKextLogErrorLevel |
-              kOSKextLogGeneralFlag,
-              "Kext %s ath_version %d ath_length %d ath_hash %p",
-              theKext->getIdentifierCString(), 
-              my_athp->ath_version,
-              my_athp->ath_length,
-              my_athp->ath_hash); 
-#endif
-        
-    return( (void *) my_athp );
-}
-    
-#endif // CONFIG_KEC_FIPS
 
 #if CONFIG_IMAGEBOOT
 int OSKextGetUUIDForName(const char *name, uuid_t uuid)
index 71b316c8c13f946ab62332ccc0caf057bfe54b34..f7c0594d8a3d37535457b8df830ccd97ce9bc14e 100644 (file)
@@ -148,6 +148,66 @@ void OSMetaClassBase::_RESERVEDOSMetaClassBase6()
     { panic("OSMetaClassBase::_RESERVEDOSMetaClassBase%d called.", 6); }
 #endif
 
+
+/*********************************************************************
+*********************************************************************/
+
+#if defined(__arm__) || defined(__arm64__)
+
+
+
+/*
+IHI0059A "C++ Application Binary Interface Standard for the ARM 64-bit Architecture":
+
+3.2.1 Representation of pointer to member function
+
+The generic C++ ABI [GC++ABI] specifies that a pointer to member function is a
+pair of words <ptr, adj>. The least significant bit of ptr discriminates between
+(0) the address of a non-virtual member function and (1) the offset in the
+class's virtual table of the address of a virtual function. This encoding cannot
+work for the AArch64 instruction set, where the architecture reserves all bits
+of code addresses. This ABI specifies that adj contains twice the this
+adjustment, plus 1 if the member function is virtual. The least significant bit
+of adj then makes exactly the same discrimination as the least significant bit
+of ptr does for Itanium. A pointer to member function is NULL when ptr = 0 and
+the least significant bit of adj is zero.
+*/
+
+OSMetaClassBase::_ptf_t
+OSMetaClassBase::_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
+{
+       typedef long int ptrdiff_t;
+    struct ptmf_t {
+        _ptf_t fPFN;
+        ptrdiff_t delta;
+    };
+    union {
+        void (OSMetaClassBase::*fIn)(void);
+        struct ptmf_t pTMF;
+    } map;
+    _ptf_t pfn;
+
+    map.fIn = func;
+    pfn     = map.pTMF.fPFN;
+
+    if (map.pTMF.delta & 1) {
+        // virtual
+        union {
+            const OSMetaClassBase *fObj;
+            _ptf_t **vtablep;
+        } u;
+        u.fObj = self;
+
+        // Virtual member function so dereference table
+        pfn = *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)pfn);
+        return pfn;
+
+    } else {
+        // Not virtual, i.e. plain member func
+        return pfn;
+    }
+}
+
+#endif /* defined(__arm__) || defined(__arm64__) */
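Callers normally reach this conversion through the OSMemberFunctionCast() macro
rather than invoking _ptmf2ptf() directly; a hedged sketch, where MyDriver and
handleEvent are hypothetical:

    // Inside a MyDriver member function: turn a member function pointer into a
    // plain C function pointer that can be handed to a C-style callback API.
    typedef IOReturn (*EventHandler)(OSObject * target, void * arg);

    EventHandler handler = OSMemberFunctionCast(EventHandler, this,
                                                &MyDriver::handleEvent);
    (*handler)(this, NULL);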
 /*********************************************************************
 * These used to be inline in the header but gcc didn't believe us
 * Now we MUST pull the inline out at least until the compiler is
index e6cf48ba7d4d8ee2d8d3903ffe42dc0938fd7dd8..d0a09ca06f9dd43335fe05429f34c4f010a2edda 100644 (file)
@@ -42,6 +42,7 @@ __BEGIN_DECLS
 #include <string.h>
 #include <mach/mach_types.h>
 #include <libkern/kernel_mach_header.h>
+#include <libkern/prelink.h>
 #include <stdarg.h>
 
 #if PRAGMA_MARK
@@ -169,160 +170,189 @@ kern_os_realloc(
 }
 
 #if PRAGMA_MARK
-#pragma mark C++ Runtime Load/Unload
+#pragma mark Libkern Init
 #endif /* PRAGMA_MARK */
 /*********************************************************************
-* kern_os C++ Runtime Load/Unload
+* Libkern Init
 *********************************************************************/
 
-/*********************************************************************
-*********************************************************************/
 #if __GNUC__ >= 3
 void __cxa_pure_virtual( void )    { panic("%s", __FUNCTION__); }
 #else
 void __pure_virtual( void )        { panic("%s", __FUNCTION__); }
 #endif
 
-typedef void (*structor_t)(void);
-
-/*********************************************************************
-*********************************************************************/
-static boolean_t
-sectionIsDestructor(kernel_section_t * section)
-{
-    boolean_t result;
-
-    result = !strncmp(section->sectname, SECT_MODTERMFUNC,
-        sizeof(SECT_MODTERMFUNC) - 1);
-#if !__LP64__
-    result = result || !strncmp(section->sectname, SECT_DESTRUCTOR, 
-        sizeof(SECT_DESTRUCTOR) - 1);
-#endif
-
-    return result;
-}
-
-/*********************************************************************
-*********************************************************************/
-static boolean_t
-sectionIsConstructor(kernel_section_t * section)
-{
-    boolean_t result;
-
-    result = !strncmp(section->sectname, SECT_MODINITFUNC,
-        sizeof(SECT_MODINITFUNC) - 1);
-#if !__LP64__
-    result = result || !strncmp(section->sectname, SECT_CONSTRUCTOR, 
-        sizeof(SECT_CONSTRUCTOR) - 1);
-#endif
-
-    return result;
-}
+extern lck_grp_t * IOLockGroup;
+extern kmod_info_t g_kernel_kmod_info;
 
+enum {
+    kOSSectionNamesDefault     = 0,
+    kOSSectionNamesBuiltinKext = 1,
+    kOSSectionNamesCount       = 2,
+};
+enum {
+    kOSSectionNameInitializer = 0,
+    kOSSectionNameFinalizer   = 1,
+    kOSSectionNameCount       = 2
+};
 
-/*********************************************************************
-* OSRuntimeUnloadCPPForSegment()
-*
-* Given a pointer to a mach object segment, iterate the segment to
-* obtain a destructor section for C++ objects, and call each of the
-* destructors there.
-*********************************************************************/
+static const char *
+gOSStructorSectionNames[kOSSectionNamesCount][kOSSectionNameCount] = {
+    { SECT_MODINITFUNC,    SECT_MODTERMFUNC },
+    { kBuiltinInitSection, kBuiltinTermSection }
+};
 
-void
-OSRuntimeUnloadCPPForSegmentInKmod(
-    kernel_segment_command_t * segment,
-    kmod_info_t              * kmodInfo)
+void OSlibkernInit(void)
 {
+    // This must be called before calling OSRuntimeInitializeCPP.
+    OSMetaClassBase::initialize();
 
-    kernel_section_t * section = NULL;  // do not free
-    OSKext           * theKext = NULL;  // must release
-
-    if (gKernelCPPInitialized && kmodInfo) {
-        theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name);
+    g_kernel_kmod_info.address = (vm_address_t) &_mh_execute_header;
+    if (kOSReturnSuccess != OSRuntimeInitializeCPP(NULL)) {
+    // &g_kernel_kmod_info, gOSSectionNamesStandard, 0, 0)) {
+        panic("OSRuntime: C++ runtime failed to initialize.");
     }
 
-    for (section = firstsect(segment);
-         section != 0;
-         section = nextsect(segment, section)) {
-
-        if (sectionIsDestructor(section)) {
-            structor_t * destructors = (structor_t *)section->addr;
-
-            if (destructors) {
-                int num_destructors = section->size / sizeof(structor_t);
-                int hit_null_destructor = 0;
-
-                for (int i = 0; i < num_destructors; i++) {
-                    if (destructors[i]) {
-                        (*destructors[i])();
-                    } else if (!hit_null_destructor) {
-                        hit_null_destructor = 1;
-                        OSRuntimeLog(theKext, kOSRuntimeLogSpec,
-                            "Null destructor in kext %s segment %s!",
-                            kmodInfo ? kmodInfo->name : "(unknown)",
-                            section->segname);
-                    }
-                }
-            } /* if (destructors) */
-        } /* if (strncmp...) */
-    } /* for (section...) */
+    gKernelCPPInitialized = true;
 
-    OSSafeReleaseNULL(theKext);
     return;
 }
 
-void
-OSRuntimeUnloadCPPForSegment(kernel_segment_command_t * segment) {
-    OSRuntimeUnloadCPPForSegmentInKmod(segment, NULL);
-}
+__END_DECLS
 
+#if PRAGMA_MARK
+#pragma mark C++ Runtime Load/Unload
+#endif /* PRAGMA_MARK */
 /*********************************************************************
+* kern_os C++ Runtime Load/Unload
 *********************************************************************/
-void
-OSRuntimeUnloadCPP(
-    kmod_info_t * kmodInfo,
-    void        * data __unused)
-{
-    if (kmodInfo && kmodInfo->address) {
-
-        kernel_segment_command_t * segment;
-        kernel_mach_header_t * header;
 
-        OSSymbol::checkForPageUnload((void *)kmodInfo->address,
-            (void *)(kmodInfo->address + kmodInfo->size));
 
-        header = (kernel_mach_header_t *)kmodInfo->address;
-        segment = firstsegfromheader(header);
+typedef void (*structor_t)(void);
 
-        for (segment = firstsegfromheader(header);
-             segment != 0;
-             segment = nextsegfromheader(header, segment)) {
+static bool
+OSRuntimeCallStructorsInSection(
+    OSKext                   * theKext,
+    kmod_info_t              * kmodInfo,
+    void                     * metaHandle,
+    kernel_segment_command_t * segment,
+    const char               * sectionName,
+    uintptr_t                  textStart,
+    uintptr_t                  textEnd)
+{
+    kernel_section_t * section;
+    bool result = TRUE;
 
-            OSRuntimeUnloadCPPForSegmentInKmod(segment, kmodInfo);
+    for (section = firstsect(segment);
+         section != NULL;
+         section = nextsect(segment, section))
+    {
+        if (strncmp(section->sectname, sectionName, sizeof(section->sectname) - 1)) continue;
+
+        structor_t * structors = (structor_t *)section->addr;
+        if (!structors) continue;
+
+        structor_t structor;
+        unsigned int num_structors = section->size / sizeof(structor_t);
+        unsigned int hit_null_structor = 0;
+        unsigned int firstIndex = 0;
+
+        if (textStart)
+        {
+            // bsearch for any in range
+            unsigned int baseIdx;
+            unsigned int lim;
+            uintptr_t value;
+            firstIndex = num_structors;
+            for (lim = num_structors, baseIdx = 0; lim; lim >>= 1)
+            {
+                value = (uintptr_t) structors[baseIdx + (lim >> 1)];
+                if (!value) panic("%s: null structor", kmodInfo->name);
+                if ((value >= textStart) && (value < textEnd))
+                {
+                    firstIndex = (baseIdx + (lim >> 1));
+                    // scan back for the first in range
+                    for (; firstIndex; firstIndex--)
+                    {
+                        value = (uintptr_t) structors[firstIndex - 1];
+                        if ((value < textStart) || (value >= textEnd)) break;
+                    }
+                    break;
+                }
+                if (textStart > value)
+                {
+                    // move right
+                    baseIdx += (lim >> 1) + 1;
+                    lim--;
+                }
+                // else move left
+            }
+            baseIdx = (baseIdx + (lim >> 1));
         }
-    }
-
-    return;
+        for (;
+             (firstIndex < num_structors)
+              && (!metaHandle || OSMetaClass::checkModLoad(metaHandle));
+             firstIndex++)
+        {
+            if ((structor = structors[firstIndex]))
+            {
+                if ((textStart && ((uintptr_t) structor < textStart))
+                 || (textEnd && ((uintptr_t) structor >= textEnd))) break;
+
+                (*structor)();
+            }
+            else if (!hit_null_structor)
+            {
+                hit_null_structor = 1;
+                OSRuntimeLog(theKext, kOSRuntimeLogSpec,
+                    "Null structor in kext %s segment %s!",
+                    kmodInfo->name, section->segname);
+            }
+        }
+        if (metaHandle) result = OSMetaClass::checkModLoad(metaHandle);
+        break;
+    } /* for (section...) */
+    return (result);
 }
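/*
 * Illustrative sketch (not part of this change): OSRuntimeCallStructorsInSection
 * walks a Mach-O section full of function pointers and invokes each one,
 * optionally restricted to the [textStart, textEnd) window so a built-in kext
 * only runs its own entries out of the kernel's shared section.  Ignoring the
 * binary search above, the core pattern is roughly (structor_t as typedef'd above):
 */
static void
call_structors_in_range(structor_t *structors, unsigned int count,
    uintptr_t textStart, uintptr_t textEnd)
{
    for (unsigned int i = 0; i < count; i++) {
        uintptr_t addr = (uintptr_t)structors[i];
        if (!addr) {
            continue;                   /* skip null slots */
        }
        if (textStart && (addr < textStart || addr >= textEnd)) {
            continue;                   /* entry belongs to another built-in kext */
        }
        structors[i]();                 /* run the constructor/destructor */
    }
}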
 
 /*********************************************************************
 *********************************************************************/
 kern_return_t
 OSRuntimeFinalizeCPP(
-    kmod_info_t * kmodInfo,
-    void        * data __unused)
+    OSKext                   * theKext)
 {
-    kern_return_t   result = KMOD_RETURN_FAILURE;
-    void          * metaHandle = NULL;  // do not free
-    OSKext        * theKext    = NULL;  // must release
-
-    if (gKernelCPPInitialized) {
-        theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name);
-    }
-
-    if (theKext && !theKext->isCPPInitialized()) {
-        result = KMOD_RETURN_SUCCESS;
-        goto finish;
+    kern_return_t              result = KMOD_RETURN_FAILURE;
+    void                     * metaHandle = NULL;  // do not free
+    kernel_mach_header_t     * header;
+    kernel_segment_command_t * segment;
+    kmod_info_t              * kmodInfo;
+    const char              ** sectionNames;
+    uintptr_t                  textStart;
+    uintptr_t                  textEnd;
+
+    textStart    = 0;
+    textEnd      = 0;
+    sectionNames = gOSStructorSectionNames[kOSSectionNamesDefault];
+    if (theKext) {
+        if (!theKext->isCPPInitialized()) {
+            result = KMOD_RETURN_SUCCESS;
+            goto finish;
+        }
+        kmodInfo = theKext->kmod_info;
+        if (!kmodInfo || !kmodInfo->address) {
+            result = kOSKextReturnInvalidArgument;
+            goto finish;
+        }
+        header = (kernel_mach_header_t *)kmodInfo->address;
+        if (theKext->flags.builtin) {
+            header       = (kernel_mach_header_t *)g_kernel_kmod_info.address;
+            textStart    = kmodInfo->address;
+            textEnd      = textStart + kmodInfo->size;
+            sectionNames = gOSStructorSectionNames[kOSSectionNamesBuiltinKext];
+        }
+    } else {
+        kmodInfo = &g_kernel_kmod_info;
+        header   = (kernel_mach_header_t *)kmodInfo->address;
     }
 
    /* OSKext checks for this condition now, but somebody might call
@@ -344,7 +374,21 @@ OSRuntimeFinalizeCPP(
     * return a failure (it only does actual work on the init path anyhow).
     */
     metaHandle = OSMetaClass::preModLoad(kmodInfo->name);
-    OSRuntimeUnloadCPP(kmodInfo, 0);
+
+    OSSymbol::checkForPageUnload((void *)kmodInfo->address,
+        (void *)(kmodInfo->address + kmodInfo->size));
+
+    header = (kernel_mach_header_t *)kmodInfo->address;
+    segment = firstsegfromheader(header);
+
+    for (segment = firstsegfromheader(header);
+         segment != 0;
+         segment = nextsegfromheader(header, segment)) {
+
+        OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment,
+                sectionNames[kOSSectionNameFinalizer], textStart, textEnd);
+    }
+
     (void)OSMetaClass::postModLoad(metaHandle);
 
     if (theKext) {
@@ -352,43 +396,53 @@ OSRuntimeFinalizeCPP(
     }
     result = KMOD_RETURN_SUCCESS;
 finish:
-    OSSafeReleaseNULL(theKext);
     return result;
 }
 
-// Functions used by the extenTools/kmod library project
-
 /*********************************************************************
 *********************************************************************/
 kern_return_t
 OSRuntimeInitializeCPP(
-    kmod_info_t * kmodInfo,
-    void *        data __unused)
+    OSKext                   * theKext)
 {
     kern_return_t              result          = KMOD_RETURN_FAILURE;
-    OSKext                   * theKext         = NULL;  // must release
     kernel_mach_header_t     * header          = NULL;
     void                     * metaHandle      = NULL;  // do not free
     bool                       load_success    = true;
     kernel_segment_command_t * segment         = NULL;  // do not free
     kernel_segment_command_t * failure_segment = NULL;  // do not free
+    kmod_info_t             *  kmodInfo;
+    const char              ** sectionNames;
+    uintptr_t                  textStart;
+    uintptr_t                  textEnd;
+
+    textStart    = 0;
+    textEnd      = 0;
+    sectionNames = gOSStructorSectionNames[kOSSectionNamesDefault];
+    if (theKext) {
+        if (theKext->isCPPInitialized()) {
+            result = KMOD_RETURN_SUCCESS;
+            goto finish;
+        }
 
-    if (!kmodInfo || !kmodInfo->address) {
-        result = kOSKextReturnInvalidArgument;
-        goto finish;
-    }
-
-    if (gKernelCPPInitialized) {
-        theKext = OSKext::lookupKextWithIdentifier(kmodInfo->name);
-    }
+        kmodInfo = theKext->kmod_info;
+        if (!kmodInfo || !kmodInfo->address) {
+            result = kOSKextReturnInvalidArgument;
+            goto finish;
+        }
+        header = (kernel_mach_header_t *)kmodInfo->address;
 
-    if (theKext && theKext->isCPPInitialized()) {
-        result = KMOD_RETURN_SUCCESS;
-        goto finish;
+        if (theKext->flags.builtin) {
+            header       = (kernel_mach_header_t *)g_kernel_kmod_info.address;
+            textStart    = kmodInfo->address;
+            textEnd      = textStart + kmodInfo->size;
+            sectionNames = gOSStructorSectionNames[kOSSectionNamesBuiltinKext];
+        }
+    } else {
+        kmodInfo = &g_kernel_kmod_info;
+        header   = (kernel_mach_header_t *)kmodInfo->address;
     }
 
-    header = (kernel_mach_header_t *)kmodInfo->address;
-
    /* Tell the meta class system that we are starting the load
     */
     metaHandle = OSMetaClass::preModLoad(kmodInfo->name);
@@ -404,45 +458,15 @@ OSRuntimeInitializeCPP(
     */
     for (segment = firstsegfromheader(header);
          segment != NULL && load_success;
-         segment = nextsegfromheader(header, segment)) {
-
-        kernel_section_t * section;
-
+         segment = nextsegfromheader(header, segment))
+    {
        /* Record the current segment in the event of a failure.
         */
         failure_segment = segment;
-
-        for (section = firstsect(segment);
-             section != NULL;
-             section = nextsect(segment, section)) {
-
-            if (sectionIsConstructor(section)) {
-                structor_t * constructors = (structor_t *)section->addr;
-
-                if (constructors) {
-                    int num_constructors = section->size / sizeof(structor_t);
-                    int hit_null_constructor = 0;
-
-                    for (int i = 0;
-                         i < num_constructors &&
-                         OSMetaClass::checkModLoad(metaHandle);
-                         i++) {
-
-                        if (constructors[i]) {
-                            (*constructors[i])();
-                        } else if (!hit_null_constructor) {
-                            hit_null_constructor = 1;
-                            OSRuntimeLog(theKext, kOSRuntimeLogSpec,
-                                "Null constructor in kext %s segment %s!",
-                                kmodInfo->name, section->segname);
-                        }
-                    }
-                    load_success = OSMetaClass::checkModLoad(metaHandle);
-
-                    break;
-                } /* if (constructors) */
-            } /* if (strncmp...) */
-        } /* for (section...) */
+        load_success = OSRuntimeCallStructorsInSection(
+                            theKext, kmodInfo, metaHandle, segment,
+                            sectionNames[kOSSectionNameInitializer],
+                            textStart, textEnd);
     } /* for (segment...) */
 
    /* We failed so call all of the destructors. We must do this before
@@ -458,7 +482,8 @@ OSRuntimeInitializeCPP(
              segment != failure_segment && segment != 0;
              segment = nextsegfromheader(header, segment)) {
 
-            OSRuntimeUnloadCPPForSegment(segment);
+            OSRuntimeCallStructorsInSection(theKext, kmodInfo, NULL, segment,
+                    sectionNames[kOSSectionNameFinalizer], textStart, textEnd);
 
         } /* for (segment...) */
     }
@@ -478,46 +503,28 @@ OSRuntimeInitializeCPP(
     * classes, and there had better not be any created on the C++ init path.
     */
     if (load_success && result != KMOD_RETURN_SUCCESS) {
-        (void)OSRuntimeFinalizeCPP(kmodInfo, NULL);
+        (void)OSRuntimeFinalizeCPP(theKext); //kmodInfo, sectionNames, textStart, textEnd);
     }
 
     if (theKext && load_success && result == KMOD_RETURN_SUCCESS) {
         theKext->setCPPInitialized(true);
     }
 finish:
-    OSSafeReleaseNULL(theKext);
     return result;
 }
 
-#if PRAGMA_MARK
-#pragma mark Libkern Init
-#endif /* PRAGMA_MARK */
-/*********************************************************************
-* Libkern Init
-*********************************************************************/
-
 /*********************************************************************
+Unload a kernel segment.
 *********************************************************************/
-extern lck_grp_t * IOLockGroup;
-extern kmod_info_t g_kernel_kmod_info;
 
-void OSlibkernInit(void)
+void
+OSRuntimeUnloadCPPForSegment(
+    kernel_segment_command_t * segment)
 {
-    // This must be called before calling OSRuntimeInitializeCPP.
-    OSMetaClassBase::initialize();
-    
-    g_kernel_kmod_info.address = (vm_address_t) &_mh_execute_header;
-    if (kOSReturnSuccess != OSRuntimeInitializeCPP(&g_kernel_kmod_info, 0)) {
-        panic("OSRuntime: C++ runtime failed to initialize.");
-    }
-    
-    gKernelCPPInitialized = true;
-
-    return;
+    OSRuntimeCallStructorsInSection(NULL, &g_kernel_kmod_info, NULL, segment,
+                    gOSStructorSectionNames[kOSSectionNamesDefault][kOSSectionNameFinalizer], 0, 0);
 }
 
-__END_DECLS
-
 #if PRAGMA_MARK
 #pragma mark C++ Allocators & Deallocators
 #endif /* PRAGMA_MARK */
@@ -526,9 +533,6 @@ __END_DECLS
 *********************************************************************/
 void *
 operator new(size_t size)
-#if __cplusplus >= 201103L
-                                                               noexcept
-#endif
 {
     void * result;
 
@@ -548,9 +552,6 @@ operator delete(void * addr)
 
 void *
 operator new[](unsigned long sz)
-#if __cplusplus >= 201103L
-                                                               noexcept
-#endif
 {
     if (sz == 0) sz = 1;
     return kern_os_malloc(sz);
@@ -584,4 +585,3 @@ __throw_length_error(const char *msg __unused)
 }
 
 };
-
index e2d93058e9c7bbe3b5d5e800876b8377a3a07d91..a82a37891f71e304e4585ec84068c5f886ca2b36 100644 (file)
@@ -37,6 +37,7 @@ __END_DECLS
 #include <libkern/c++/OSLib.h>
 #include <libkern/c++/OSDictionary.h>
 #include <libkern/OSSerializeBinary.h>
+#include <libkern/Block.h>
 #include <IOKit/IOLib.h>
 
 #define super OSObject
@@ -290,6 +291,35 @@ OSSerializer * OSSerializer::forTarget( void * target,
     return( thing );
 }
 
+bool OSSerializer::callbackToBlock(void * target __unused, void * ref,
+                                     OSSerialize * serializer)
+{
+    return ((OSSerializerBlock)ref)(serializer);
+}
+
+OSSerializer * OSSerializer::withBlock(
+        OSSerializerBlock callback)
+{
+    OSSerializer * serializer;
+    OSSerializerBlock block;
+
+    block = Block_copy(callback);
+    if (!block) return (0);
+
+    serializer = (OSSerializer::forTarget(NULL, &OSSerializer::callbackToBlock, block));
+
+    if (!serializer) Block_release(block);
+
+    return (serializer);
+}
+
+void OSSerializer::free(void)
+{
+    if (callback == &callbackToBlock) Block_release(ref);
+
+    super::free();
+}
+
 bool OSSerializer::serialize( OSSerialize * s ) const
 {
     return( (*callback)(target, ref, s) );
index ffaef6a68cb826502dbb7e31bd28cd4bd2c0d4d0..3de4336af9ffe717397271cadff447a95281b041 100644 (file)
@@ -75,6 +75,7 @@ bool OSSerialize::addBinary(const void * bits, size_t size)
        if (newCapacity >= capacity) 
        {
           newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement;
+          if (newCapacity < capacity) return (false);
           if (newCapacity > ensureCapacity(newCapacity)) return (false);
     }
 
@@ -99,6 +100,7 @@ bool OSSerialize::addBinaryObject(const OSMetaClassBase * o, uint32_t key,
        if (newCapacity >= capacity) 
        {
           newCapacity = (((newCapacity - 1) / capacityIncrement) + 1) * capacityIncrement;
+          if (newCapacity < capacity) return (false);
           if (newCapacity > ensureCapacity(newCapacity)) return (false);
     }
 
index 6b32a76eee608c385a6461e6c19780a6e4d9b96f..3d82a99ce6d4d8c9eb17cdd807ea88b40984eece 100644 (file)
@@ -185,11 +185,7 @@ static OSObject    *parsedObject;
 
 #define YYSTYPE object_t *
 
-extern "C" {
-extern void *kern_os_malloc(size_t size);
-extern void *kern_os_realloc(void * addr, size_t size);
-extern void kern_os_free(void * addr);
-} /* extern "C" */
+#include <libkern/OSRuntime.h>
 
 #define malloc(s) kern_os_malloc(s)
 #define realloc(a, s) kern_os_realloc(a, s)
index 450ce081133401bbcd4b04a3201677ae1f281a1c..86f396784c8df94d7ef65f3ea71db68f4d42a68a 100644 (file)
@@ -99,11 +99,7 @@ static OSObject      *parsedObject;
 
 #define YYSTYPE object_t *
 
-extern "C" {
-extern void *kern_os_malloc(size_t size);
-extern void *kern_os_realloc(void * addr, size_t size);
-extern void kern_os_free(void * addr);
-} /* extern "C" */
+#include <libkern/OSRuntime.h>
 
 #define malloc(s) kern_os_malloc(s)
 #define realloc(a, s) kern_os_realloc(a, s)
index 60f1bb2387f4bbe290b2556866ed1d5d49420613..6905c39797cc3c6c1a107ab4dc5d245d3ff3b3a9 100644 (file)
@@ -219,12 +219,7 @@ static object_t            *buildData(parser_state_t *state, object_t *o);
 static object_t                *buildNumber(parser_state_t *state, object_t *o);
 static object_t                *buildBoolean(parser_state_t *state, object_t *o);
 
-extern "C" {
-extern void            *kern_os_malloc(size_t size);
-extern void            *kern_os_realloc(void * addr, size_t size);
-extern void            kern_os_free(void * addr);
-
-} /* extern "C" */
+#include <libkern/OSRuntime.h>
 
 #define malloc(s) kern_os_malloc(s)
 #define realloc(a, s) kern_os_realloc(a, s)
@@ -1635,11 +1630,11 @@ yyreduce:
     { (yyval) = retrieveObject(STATE, (yyvsp[(1) - (1)])->idref);
                                  if ((yyval)) {
                                    STATE->retrievedObjectCount++;
+                                   (yyval)->object->retain();
                                    if (STATE->retrievedObjectCount > MAX_REFED_OBJECTS) {
                                      yyerror("maximum object reference count");
                                      YYERROR;
                                    }
-                                   (yyval)->object->retain();
                                  } else { 
                                    yyerror("forward reference detected");
                                    YYERROR;
index 5bd2167702f45ef755615ee5288e63777feb2bbd..4f1c3cc97e1861fdde4b5534529738c4294c7077 100644 (file)
@@ -122,12 +122,7 @@ static object_t            *buildData(parser_state_t *state, object_t *o);
 static object_t                *buildNumber(parser_state_t *state, object_t *o);
 static object_t                *buildBoolean(parser_state_t *state, object_t *o);
 
-extern "C" {
-extern void            *kern_os_malloc(size_t size);
-extern void            *kern_os_realloc(void * addr, size_t size);
-extern void            kern_os_free(void * addr);
-
-} /* extern "C" */
+#include <libkern/OSRuntime.h>
 
 #define malloc(s) kern_os_malloc(s)
 #define realloc(a, s) kern_os_realloc(a, s)
index c448bb853a3de4fd11c161b513dba72c413c3b46..b38b73fe1a353954cd33fc1a14831bfecf8cccbc 100644 (file)
@@ -37,6 +37,10 @@ inflate.o_CWARNFLAGS_ADD = -Wno-cast-qual
 trees.o_CWARNFLAGS_ADD = -Wno-cast-qual
 uncompr.o_CWARNFLAGS_ADD = -Wno-cast-qual
 
+# libclosure
+runtime.cpo_CXXWARNFLAGS_ADD = -Wno-cast-qual
+
+
 # warnings in bison-generated code
 OSUnserializeXML.cpo_CXXWARNFLAGS_ADD += -Wno-uninitialized -Wno-unreachable-code -Wno-unreachable-code-break
 OSUnserialize.cpo_CXXWARNFLAGS_ADD += -Wno-unreachable-code
index 5867b937352a570af61528f156035a0635362330..5181c01431c00fe866c809a2354afceac2930ecf 100644 (file)
@@ -54,12 +54,13 @@ libkern/uuid/uuid.c                                 standard
 libkern/os/log.c                                       standard
 libkern/os/object.c                                    standard
 libkern/os/internal.c                                  standard
+libkern/os/refcnt.c                                    standard
 
 libkern/kernel_mach_header.c                            standard
 
 libkern/zlib/adler32.c                                  optional zlib
 libkern/zlib/compress.c                                 optional zlib
-libkern/zlib/crc32.c                                    optional zlib
+libkern/zlib/z_crc32.c                                  optional zlib
 libkern/zlib/deflate.c                                  optional zlib
 libkern/zlib/infback.c                                  optional zlib
 libkern/zlib/inffast.c                                  optional zlib
@@ -80,6 +81,8 @@ libkern/crypto/corecrypto_rand.c              optional crypto
 libkern/crypto/corecrypto_rsa.c                    optional crypto
 libkern/crypto/corecrypto_chacha20poly1305.c   optional        crypto
 
+libkern/img4/interface.c               standard
+
 libkern/stack_protector.c       standard
 
 libkern/kxld/kxld.c             optional config_kxld
@@ -101,3 +104,6 @@ libkern/kxld/kxld_uuid.c        optional config_kxld
 libkern/kxld/kxld_versionmin.c  optional config_kxld
 libkern/kxld/kxld_vtable.c      optional config_kxld
 libkern/kxld/kxld_stubs.c       standard
+
+libkern/libclosure/runtime.cpp      optional config_blocks
+libkern/libclosure/libclosuredata.c optional config_blocks
diff --git a/libkern/conf/files.arm64 b/libkern/conf/files.arm64
new file mode 100644 (file)
index 0000000..8b13789
--- /dev/null
@@ -0,0 +1 @@
+
index ea1f9127914e33c4a986a6bc2f254bfde48d6cd6..d99a5819f7b9981bcb7840661827531f6cd5ca16 100644 (file)
@@ -73,7 +73,7 @@ OS_ENUM(firehose_stream, uint8_t,
        firehose_stream_special                                                         = 1,
        firehose_stream_memory                                                          = 2,
        firehose_stream_metadata                                                        = 3,
-       firehose_stream_memory_high_traffic                                     = 4,
+       firehose_stream_signpost                                                        = 4,
        firehose_stream_memory_wifi                                                     = 5,
        firehose_stream_memory_baseband                                         = 6,
 
@@ -92,6 +92,7 @@ OS_ENUM(firehose_tracepoint_namespace, uint8_t,
        firehose_tracepoint_namespace_log                                       = 0x04,
        firehose_tracepoint_namespace_metadata                          = 0x05,
        firehose_tracepoint_namespace_signpost                          = 0x06,
+       firehose_tracepoint_namespace_loss                                      = 0x07,
 );
 
 /*!
@@ -203,6 +204,7 @@ OS_ENUM(_firehose_tracepoint_flags_log, uint16_t,
        _firehose_tracepoint_flags_log_has_subsystem            = 0x0200,
        _firehose_tracepoint_flags_log_has_rules                        = 0x0400,
        _firehose_tracepoint_flags_log_has_oversize                     = 0x0800,
+       _firehose_tracepoint_flags_log_has_context_data         = 0x1000,
 );
 
 /*!
@@ -239,12 +241,15 @@ OS_ENUM(_firehose_tracepoint_type_signpost, firehose_tracepoint_type_t,
  *
  * @abstract
  * Flags for Log tracepoints (namespace signpost).
+ *
+ * When flags are shared with the log type, they should have the same values.
  */
 OS_ENUM(_firehose_tracepoint_flags_signpost, uint16_t,
        _firehose_tracepoint_flags_signpost_has_private_data    = 0x0100,
        _firehose_tracepoint_flags_signpost_has_subsystem               = 0x0200,
        _firehose_tracepoint_flags_signpost_has_rules                   = 0x0400,
        _firehose_tracepoint_flags_signpost_has_oversize                = 0x0800,
+       _firehose_tracepoint_flags_signpost_has_context_data    = 0x1000,
 );
 
 /* MIG firehose push reply structure */
index d4c4e104087fe5e0645bb451de86a2e52964c123..399c95a549d9642bef1410098e7f49ebfb801964 100644 (file)
@@ -21,7 +21,7 @@
 #ifndef __FIREHOSE_FIREHOSE_PRIVATE__
 #define __FIREHOSE_FIREHOSE_PRIVATE__
 
-#define FIREHOSE_SPI_VERSION 20170907
+#define FIREHOSE_SPI_VERSION 20180416
 
 #include "firehose_types_private.h"
 #include "tracepoint_private.h"
index a0c7122124aab2aaa668b3a8a599b062861fbf82..76531fd28c64e762d9dc613441be2ff91ee8fe04 100644 (file)
@@ -114,6 +114,21 @@ typedef struct firehose_tracepoint_s {
 #define FIREHOSE_TRACE_ID_SET_CODE(tid, code) \
        ((tid).ftid._code = code)
 
+/*!
+ * @typedef firehose_loss_payload_s
+ *
+ * @abstract
+ * The payload for tracepoints in the loss namespace, generated by the firehose
+ * itself when unreliable tracepoints are lost.
+ */
+typedef struct firehose_loss_payload_s {
+       uint64_t start_stamp; /* may (rarely!) disagree with the tracepoint stamp */
+       uint64_t end_stamp;
+#define FIREHOSE_LOSS_COUNT_WIDTH 6 /* as many bits as can be spared */
+#define FIREHOSE_LOSS_COUNT_MAX ((1u << FIREHOSE_LOSS_COUNT_WIDTH) - 1)
+       uint32_t count;
+} firehose_loss_payload_s, *firehose_loss_payload_t;
+
 __BEGIN_DECLS
 
 #if __has_feature(address_sanitizer)
index f112636313088c93fa1bbca021516b51d54c0aec..cbcdd57287ee50baf58b50afcdae7a953c58fbed 100644 (file)
@@ -43,6 +43,7 @@
 #include <sys/kdebug.h>
 #include <kern/thread.h>
 
+
 extern int etext;
 __BEGIN_DECLS
 // From osfmk/kern/thread.h but considered to be private
diff --git a/libkern/img4/interface.c b/libkern/img4/interface.c
new file mode 100644 (file)
index 0000000..3863334
--- /dev/null
@@ -0,0 +1,18 @@
+#include <libkern/libkern.h>
+#include <libkern/section_keywords.h>
+#include <libkern/img4/interface.h>
+
+#if defined(SECURITY_READ_ONLY_LATE)
+SECURITY_READ_ONLY_LATE(const img4_interface_t *) img4if = NULL;
+#else
+const img4_interface_t *img4if = NULL;
+#endif
+
+void
+img4_interface_register(const img4_interface_t *i4)
+{
+       if (img4if) {
+               panic("img4 interface already set");
+       }
+       img4if = i4;
+}
index eb77e72a990debfa340acf184131ee016fb16ce9..f0b5f0f6431250c3875ca5e082bcfbb082cb0f82 100644 (file)
@@ -42,7 +42,3 @@
     The linkline must look like this.
         *.o -lkmodc++ kmod_info.o -lkmod
  */
-
-/* The following preprocessor test must match exactly with the architectures
- * that define the CONFIG_STATIC_CPPINIT config option.
- */
index eb77e72a990debfa340acf184131ee016fb16ce9..f0b5f0f6431250c3875ca5e082bcfbb082cb0f82 100644 (file)
@@ -42,7 +42,3 @@
     The linkline must look like this.
         *.o -lkmodc++ kmod_info.o -lkmod
  */
-
-/* The following preprocessor test must match exactly with the architectures
- * that define the CONFIG_STATIC_CPPINIT config option.
- */
index 81160694c769fddc166b1e46411e394a494a6c42..bebe8982910030b431f5c5a3237da5aa42078238 100644 (file)
@@ -66,6 +66,7 @@ ifeq ($(strip $(SDK_DIR)),)
        SDK_DIR := /
 endif
 
+
 DEFINES = -DPRIVATE
 CFLAGS=-std=c99 -Wall -Wextra -Werror -pedantic -Wformat=2 -Wcast-align \
        -Wwrite-strings -Wshorten-64-to-32 -Wshadow -Winit-self -Wpointer-arith \
@@ -140,6 +141,7 @@ $(LIBKXLDSYM_ARCHIVE): $(LIBKXLDOBJ_ARCHIVE)
        @mkdir -p $(SYMROOT)
        install -c -m 644 $< $@
 
+
 $(LIBKXLDOBJ_DYLIB): $(OBJS)
        $(CC) $(LDFLAGS) -o $@ $^
 
diff --git a/libkern/libclosure/libclosuredata.c b/libkern/libclosure/libclosuredata.c
new file mode 100644 (file)
index 0000000..27e906f
--- /dev/null
@@ -0,0 +1,24 @@
+/*
+ * data.c
+ * libclosure
+ *
+ * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LLVM_LICENSE_HEADER@
+ *
+ */
+
+/********************
+NSBlock support
+
+We allocate space and export a symbol to be used as the Class for the on-stack and malloc'ed copies until ObjC arrives on the scene.  These data areas are set up by Foundation to link in as real classes post facto.
+
+We keep these in a separate file so that we can include the runtime code in test subprojects but not include the data so that compiled code that sees the data in libSystem doesn't get confused by a second copy.  Somehow these don't get unified in a common block.
+**********************/
+
+void * _NSConcreteStackBlock[32] = { 0 };
+void * _NSConcreteMallocBlock[32] = { 0 };
+void * _NSConcreteAutoBlock[32] = { 0 };
+void * _NSConcreteFinalizingBlock[32] = { 0 };
+void * _NSConcreteGlobalBlock[32] = { 0 };
+void * _NSConcreteWeakBlockVariable[32] = { 0 };
diff --git a/libkern/libclosure/runtime.cpp b/libkern/libclosure/runtime.cpp
new file mode 100644 (file)
index 0000000..42e3798
--- /dev/null
@@ -0,0 +1,540 @@
+/*
+ * runtime.c
+ * libclosure
+ *
+ * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LLVM_LICENSE_HEADER@
+ */
+
+
+#ifndef KERNEL
+
+#include "Block_private.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+#include <os/assumes.h>
+
+#else /* !KERNEL */
+
+#include <libkern/Block_private.h>
+#include <libkern/OSRuntime.h>
+
+#define malloc(s)  kern_os_malloc((s))
+#define free(a)    kern_os_free((a))
+
+#endif /* KERNEL */
+
+#include <string.h>
+#include <stdint.h>
+#ifndef os_assumes
+#define os_assumes(_x) (_x)
+#endif
+#ifndef os_assert
+#define os_assert(_x) assert(_x)
+#endif
+
+#if TARGET_OS_WIN32
+#define _CRT_SECURE_NO_WARNINGS 1
+#include <windows.h>
+static __inline bool OSAtomicCompareAndSwapLong(long oldl, long newl, long volatile *dst) 
+{ 
+    // fixme barrier is overkill -- see objc-os.h
+    long original = InterlockedCompareExchange(dst, newl, oldl);
+    return (original == oldl);
+}
+
+static __inline bool OSAtomicCompareAndSwapInt(int oldi, int newi, int volatile *dst) 
+{ 
+    // fixme barrier is overkill -- see objc-os.h
+    int original = InterlockedCompareExchange(dst, newi, oldi);
+    return (original == oldi);
+}
+#else
+#define OSAtomicCompareAndSwapLong(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New)
+#define OSAtomicCompareAndSwapInt(_Old, _New, _Ptr) __sync_bool_compare_and_swap(_Ptr, _Old, _New)
+#endif
+
+
+/*******************************************************************************
+Internal Utilities
+********************************************************************************/
+
+static int32_t latching_incr_int(volatile int32_t *where) {
+    while (1) {
+        int32_t old_value = *where;
+        if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) {
+            return BLOCK_REFCOUNT_MASK;
+        }
+        if (OSAtomicCompareAndSwapInt(old_value, old_value+2, where)) {
+            return old_value+2;
+        }
+    }
+}
+
+static bool latching_incr_int_not_deallocating(volatile int32_t *where) {
+    while (1) {
+        int32_t old_value = *where;
+        if (old_value & BLOCK_DEALLOCATING) {
+            // if deallocating we can't do this
+            return false;
+        }
+        if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) {
+            // if latched, we're leaking this block, and we succeed
+            return true;
+        }
+        if (OSAtomicCompareAndSwapInt(old_value, old_value+2, where)) {
+            // otherwise, we must store a new retained value without the deallocating bit set
+            return true;
+        }
+    }
+}
+
+
+// return should_deallocate?
+static bool latching_decr_int_should_deallocate(volatile int32_t *where) {
+    while (1) {
+        int32_t old_value = *where;
+        if ((old_value & BLOCK_REFCOUNT_MASK) == BLOCK_REFCOUNT_MASK) {
+            return false; // latched high
+        }
+        if ((old_value & BLOCK_REFCOUNT_MASK) == 0) {
+            return false;   // underflow, latch low
+        }
+        int32_t new_value = old_value - 2;
+        bool result = false;
+        if ((old_value & (BLOCK_REFCOUNT_MASK|BLOCK_DEALLOCATING)) == 2) {
+            new_value = old_value - 1;
+            result = true;
+        }
+        if (OSAtomicCompareAndSwapInt(old_value, new_value, where)) {
+            return result;
+        }
+    }
+}
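/*
 * Sketch of the encoding the latching helpers above rely on (this helper is
 * illustrative, not part of the runtime): the BLOCK_REFCOUNT_MASK bits of
 * flags hold twice the logical reference count, bit 0 is BLOCK_DEALLOCATING,
 * and a count latched at the mask value is never decremented again.
 */
static inline int32_t
block_logical_refcount(int32_t flags)
{
    return (flags & BLOCK_REFCOUNT_MASK) >> 1;  /* e.g. BLOCK_NEEDS_FREE | 2  ->  1 */
}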
+
+
+/**************************************************************************
+Framework callback functions and their default implementations.
+***************************************************************************/
+#if !TARGET_OS_WIN32
+#pragma mark Framework Callback Routines
+#endif
+
+static void _Block_retain_object_default(const void *ptr __unused) { }
+
+static void _Block_release_object_default(const void *ptr __unused) { }
+
+static void _Block_destructInstance_default(const void *aBlock __unused) {}
+
+static void (*_Block_retain_object)(const void *ptr) = _Block_retain_object_default;
+static void (*_Block_release_object)(const void *ptr) = _Block_release_object_default;
+static void (*_Block_destructInstance) (const void *aBlock) = _Block_destructInstance_default;
+
+
+/**************************************************************************
+Callback registration from ObjC runtime and CoreFoundation
+***************************************************************************/
+
+void _Block_use_RR2(const Block_callbacks_RR *callbacks) {
+    _Block_retain_object = callbacks->retain;
+    _Block_release_object = callbacks->release;
+    _Block_destructInstance = callbacks->destructInstance;
+}
+
+/****************************************************************************
+Accessors for block descriptor fields
+*****************************************************************************/
+#if 0
+static struct Block_descriptor_1 * _Block_descriptor_1(struct Block_layout *aBlock)
+{
+    return aBlock->descriptor;
+}
+#endif
+
+static struct Block_descriptor_2 * _Block_descriptor_2(struct Block_layout *aBlock)
+{
+    if (! (aBlock->flags & BLOCK_HAS_COPY_DISPOSE)) return NULL;
+    uint8_t *desc = (uint8_t *)aBlock->descriptor;
+    desc += sizeof(struct Block_descriptor_1);
+    return __IGNORE_WCASTALIGN((struct Block_descriptor_2 *)desc);
+}
+
+static struct Block_descriptor_3 * _Block_descriptor_3(struct Block_layout *aBlock)
+{
+    if (! (aBlock->flags & BLOCK_HAS_SIGNATURE)) return NULL;
+    uint8_t *desc = (uint8_t *)aBlock->descriptor;
+    desc += sizeof(struct Block_descriptor_1);
+    if (aBlock->flags & BLOCK_HAS_COPY_DISPOSE) {
+        desc += sizeof(struct Block_descriptor_2);
+    }
+    return __IGNORE_WCASTALIGN((struct Block_descriptor_3 *)desc);
+}
+
+static void _Block_call_copy_helper(void *result, struct Block_layout *aBlock)
+{
+    struct Block_descriptor_2 *desc = _Block_descriptor_2(aBlock);
+    if (!desc) return;
+
+    (*desc->copy)(result, aBlock); // do fixup
+}
+
+static void _Block_call_dispose_helper(struct Block_layout *aBlock)
+{
+    struct Block_descriptor_2 *desc = _Block_descriptor_2(aBlock);
+    if (!desc) return;
+
+    (*desc->dispose)(aBlock);
+}
+
+/*******************************************************************************
+Internal Support routines for copying
+********************************************************************************/
+
+#if !TARGET_OS_WIN32
+#pragma mark Copy/Release support
+#endif
+
+// Copy, or bump refcount, of a block.  If really copying, call the copy helper if present.
+void *_Block_copy(const void *arg) {
+    struct Block_layout *aBlock;
+
+    if (!arg) return NULL;
+    
+    // The following would be better done as a switch statement
+    aBlock = (struct Block_layout *)arg;
+    if (aBlock->flags & BLOCK_NEEDS_FREE) {
+        // latches on high
+        latching_incr_int(&aBlock->flags);
+        return aBlock;
+    }
+    else if (aBlock->flags & BLOCK_IS_GLOBAL) {
+        return aBlock;
+    }
+    else {
+        // It's a stack block.  Make a copy.
+        struct Block_layout *result = (typeof(result)) malloc(aBlock->descriptor->size);
+        if (!result) return NULL;
+        memmove(result, aBlock, aBlock->descriptor->size); // bitcopy first
+#if __has_feature(ptrauth_calls)
+        // Resign the invoke pointer as it uses address authentication.
+        result->invoke = aBlock->invoke;
+#endif
+        // reset refcount
+        result->flags &= ~(BLOCK_REFCOUNT_MASK|BLOCK_DEALLOCATING);    // XXX not needed
+        result->flags |= BLOCK_NEEDS_FREE | 2;  // logical refcount 1
+        _Block_call_copy_helper(result, aBlock);
+        // Set isa last so memory analysis tools see a fully-initialized object.
+        result->isa = _NSConcreteMallocBlock;
+        return result;
+    }
+}
+
+
+// Runtime entry points for maintaining the sharing knowledge of byref data blocks.
+
+// A closure has been copied and its fixup routine is asking us to fix up the reference to the shared byref data
+// Closures that aren't copied must still work, so everyone always accesses variables after dereferencing the forwarding ptr.
+// We ask if the byref pointer that we know about has already been copied to the heap, and if so, increment and return it.
+// Otherwise we need to copy it and update the stack forwarding pointer
+static struct Block_byref *_Block_byref_copy(const void *arg) {
+    struct Block_byref *src = (struct Block_byref *)arg;
+
+    if ((src->forwarding->flags & BLOCK_REFCOUNT_MASK) == 0) {
+        // src points to stack
+        struct Block_byref *copy = (struct Block_byref *)malloc(src->size);
+        copy->isa = NULL;
+        // byref value 4 is logical refcount of 2: one for caller, one for stack
+        copy->flags = src->flags | BLOCK_BYREF_NEEDS_FREE | 4;
+        copy->forwarding = copy; // patch heap copy to point to itself
+        src->forwarding = copy;  // patch stack to point to heap copy
+        copy->size = src->size;
+
+        if (src->flags & BLOCK_BYREF_HAS_COPY_DISPOSE) {
+            // Trust copy helper to copy everything of interest
+            // If more than one field shows up in a byref block this is wrong XXX
+            struct Block_byref_2 *src2 = (struct Block_byref_2 *)(src+1);
+            struct Block_byref_2 *copy2 = (struct Block_byref_2 *)(copy+1);
+            copy2->byref_keep = src2->byref_keep;
+            copy2->byref_destroy = src2->byref_destroy;
+
+            if (src->flags & BLOCK_BYREF_LAYOUT_EXTENDED) {
+                struct Block_byref_3 *src3 = (struct Block_byref_3 *)(src2+1);
+                struct Block_byref_3 *copy3 = (struct Block_byref_3*)(copy2+1);
+                copy3->layout = src3->layout;
+            }
+
+            (*src2->byref_keep)(copy, src);
+        }
+        else {
+            // Bitwise copy.
+            // This copy includes Block_byref_3, if any.
+            memmove(copy+1, src+1, src->size - sizeof(*src));
+        }
+    }
+    // already copied to heap
+    else if ((src->forwarding->flags & BLOCK_BYREF_NEEDS_FREE) == BLOCK_BYREF_NEEDS_FREE) {
+        latching_incr_int(&src->forwarding->flags);
+    }
+    
+    return src->forwarding;
+}
+
+static void _Block_byref_release(const void *arg) {
+    struct Block_byref *byref = (struct Block_byref *)arg;
+
+    // dereference the forwarding pointer since the compiler isn't doing this anymore (ever?)
+    byref = byref->forwarding;
+    
+    if (byref->flags & BLOCK_BYREF_NEEDS_FREE) {
+        __assert_only int32_t refcount = byref->flags & BLOCK_REFCOUNT_MASK;
+        os_assert(refcount);
+        if (latching_decr_int_should_deallocate(&byref->flags)) {
+            if (byref->flags & BLOCK_BYREF_HAS_COPY_DISPOSE) {
+                struct Block_byref_2 *byref2 = (struct Block_byref_2 *)(byref+1);
+                (*byref2->byref_destroy)(byref);
+            }
+            free(byref);
+        }
+    }
+}
+
+
+/************************************************************
+ *
+ * API supporting SPI
+ * _Block_copy, _Block_release, and (old) _Block_destroy
+ *
+ ***********************************************************/
+
+#if !TARGET_OS_WIN32
+#pragma mark SPI/API
+#endif
+
+
+// API entry point to release a copied Block
+void _Block_release(const void *arg) {
+    struct Block_layout *aBlock = (struct Block_layout *)arg;
+    if (!aBlock) return;
+    if (aBlock->flags & BLOCK_IS_GLOBAL) return;
+    if (! (aBlock->flags & BLOCK_NEEDS_FREE)) return;
+
+    if (latching_decr_int_should_deallocate(&aBlock->flags)) {
+        _Block_call_dispose_helper(aBlock);
+        _Block_destructInstance(aBlock);
+        free(aBlock);
+    }
+}
+
+bool _Block_tryRetain(const void *arg) {
+    struct Block_layout *aBlock = (struct Block_layout *)arg;
+    return latching_incr_int_not_deallocating(&aBlock->flags);
+}
+
+bool _Block_isDeallocating(const void *arg) {
+    struct Block_layout *aBlock = (struct Block_layout *)arg;
+    return (aBlock->flags & BLOCK_DEALLOCATING) != 0;
+}
+
+
+/************************************************************
+ *
+ * SPI used by other layers
+ *
+ ***********************************************************/
+
+size_t Block_size(void *aBlock) {
+    return ((struct Block_layout *)aBlock)->descriptor->size;
+}
+
+bool _Block_use_stret(void *aBlock) {
+    struct Block_layout *layout = (struct Block_layout *)aBlock;
+
+    int requiredFlags = BLOCK_HAS_SIGNATURE | BLOCK_USE_STRET;
+    return (layout->flags & requiredFlags) == requiredFlags;
+}
+
+// Checks for a valid signature, not merely the BLOCK_HAS_SIGNATURE bit.
+bool _Block_has_signature(void *aBlock) {
+    return _Block_signature(aBlock) ? true : false;
+}
+
+const char * _Block_signature(void *aBlock)
+{
+    struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock);
+    if (!desc3) return NULL;
+
+    return desc3->signature;
+}
+
+const char * _Block_layout(void *aBlock)
+{
+    // Don't return extended layout to callers expecting old GC layout
+    struct Block_layout *layout = (struct Block_layout *)aBlock;
+    if (layout->flags & BLOCK_HAS_EXTENDED_LAYOUT) return NULL;
+
+    struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock);
+    if (!desc3) return NULL;
+
+    return desc3->layout;
+}
+
+const char * _Block_extended_layout(void *aBlock)
+{
+    // Don't return old GC layout to callers expecting extended layout
+    struct Block_layout *layout = (struct Block_layout *)aBlock;
+    if (! (layout->flags & BLOCK_HAS_EXTENDED_LAYOUT)) return NULL;
+
+    struct Block_descriptor_3 *desc3 = _Block_descriptor_3((struct Block_layout *)aBlock);
+    if (!desc3) return NULL;
+
+    // Return empty string (all non-object bytes) instead of NULL 
+    // so callers can distinguish "empty layout" from "no layout".
+    if (!desc3->layout) return "";
+    else return desc3->layout;
+}
+
+#if !TARGET_OS_WIN32
+#pragma mark Compiler SPI entry points
+#endif
+
+    
+/*******************************************************
+
+Entry points used by the compiler - the real API!
+
+
+A Block can reference four different kinds of things that require help when the Block is copied to the heap.
+1) C++ stack based objects
+2) References to Objective-C objects
+3) Other Blocks
+4) __block variables
+
+In these cases helper functions are synthesized by the compiler for use in Block_copy and Block_release, called the copy and dispose helpers.  The copy helper emits a call to the C++ const copy constructor for C++ stack based objects and for the rest calls into the runtime support function _Block_object_assign.  The dispose helper has a call to the C++ destructor for case 1 and a call into _Block_object_dispose for the rest.
+
+The flags parameter of _Block_object_assign and _Block_object_dispose is set to
+    * BLOCK_FIELD_IS_OBJECT (3), for the case of an Objective-C Object,
+    * BLOCK_FIELD_IS_BLOCK (7), for the case of another Block, and
+    * BLOCK_FIELD_IS_BYREF (8), for the case of a __block variable.
+If the __block variable is marked weak the compiler also or's in BLOCK_FIELD_IS_WEAK (16)
+
+So the Block copy/dispose helpers should only ever generate the four flag values of 3, 7, 8, and 24.
+
+When  a __block variable is either a C++ object, an Objective-C object, or another Block then the compiler also generates copy/dispose helper functions.  Similarly to the Block copy helper, the "__block" copy helper (formerly and still a.k.a. "byref" copy helper) will do a C++ copy constructor (not a const one though!) and the dispose helper will do the destructor.  And similarly the helpers will call into the same two support functions with the same values for objects and Blocks with the additional BLOCK_BYREF_CALLER (128) bit of information supplied.
+
+So the __block copy/dispose helpers will generate flag values of 3 or 7 for objects and Blocks respectively, with BLOCK_FIELD_IS_WEAK (16) or'ed as appropriate and always 128 or'd in, for the following set of possibilities:
+    __block id                   128+3       (0x83)
+    __block (^Block)             128+7       (0x87)
+    __weak __block id            128+3+16    (0x93)
+    __weak __block (^Block)      128+7+16    (0x97)
+        
+
+********************************************************/
+
+//
+// When Blocks or Block_byrefs hold objects then their copy routine helpers use this entry point
+// to do the assignment.
+//
+void _Block_object_assign(void *destArg, const void *object, const int flags) {
+    const void **dest = (const void **)destArg;
+    switch (os_assumes(flags & BLOCK_ALL_COPY_DISPOSE_FLAGS)) {
+      case BLOCK_FIELD_IS_OBJECT:
+        /*******
+        id object = ...;
+        [^{ object; } copy];
+        ********/
+
+        _Block_retain_object(object);
+        *dest = object;
+        break;
+
+      case BLOCK_FIELD_IS_BLOCK:
+        /*******
+        void (^object)(void) = ...;
+        [^{ object; } copy];
+        ********/
+
+        *dest = _Block_copy(object);
+        break;
+    
+      case BLOCK_FIELD_IS_BYREF | BLOCK_FIELD_IS_WEAK:
+      case BLOCK_FIELD_IS_BYREF:
+        /*******
+         // copy the onstack __block container to the heap
+         // Note this __weak is old GC-weak/MRC-unretained.
+         // ARC-style __weak is handled by the copy helper directly.
+         __block ... x;
+         __weak __block ... x;
+         [^{ x; } copy];
+         ********/
+
+        *dest = _Block_byref_copy(object);
+        break;
+        
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT:
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK:
+        /*******
+         // copy the actual field held in the __block container
+         // Note this is MRC unretained __block only. 
+         // ARC retained __block is handled by the copy helper directly.
+         __block id object;
+         __block void (^object)(void);
+         [^{ object; } copy];
+         ********/
+
+        *dest = object;
+        break;
+
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_WEAK:
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK  | BLOCK_FIELD_IS_WEAK:
+        /*******
+         // copy the actual field held in the __block container
+         // Note this __weak is old GC-weak/MRC-unretained.
+         // ARC-style __weak is handled by the copy helper directly.
+         __weak __block id object;
+         __weak __block void (^object)(void);
+         [^{ object; } copy];
+         ********/
+
+        *dest = object;
+        break;
+
+      default:
+        break;
+    }
+}
+
+// When Blocks or Block_byrefs hold objects their destroy helper routines call this entry point
+// to help dispose of the contents
+void _Block_object_dispose(const void *object, const int flags) {
+    switch (os_assumes(flags & BLOCK_ALL_COPY_DISPOSE_FLAGS)) {
+      case BLOCK_FIELD_IS_BYREF | BLOCK_FIELD_IS_WEAK:
+      case BLOCK_FIELD_IS_BYREF:
+        // get rid of the __block data structure held in a Block
+        _Block_byref_release(object);
+        break;
+      case BLOCK_FIELD_IS_BLOCK:
+        _Block_release(object);
+        break;
+      case BLOCK_FIELD_IS_OBJECT:
+        _Block_release_object(object);
+        break;
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT:
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK:
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_WEAK:
+      case BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_BLOCK  | BLOCK_FIELD_IS_WEAK:
+        break;
+      default:
+        break;
+    }
+}
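/*
 * Illustrative sketch (not compiler output from this file): for a block that
 * captures an object pointer and a __block variable, the synthesized copy and
 * dispose helpers reduce to calls into the two entry points above with the
 * flag values documented earlier (3 == BLOCK_FIELD_IS_OBJECT,
 * 8 == BLOCK_FIELD_IS_BYREF).  The struct layout and names here are made up;
 * Block_layout and Block_byref come from Block_private.h included above.
 */
struct example_block_literal {
    struct Block_layout base;
    const void *capturedObject;       /* id-like capture            */
    struct Block_byref *capturedVar;  /* __block variable container */
};

static void
example_copy_helper(void *dst, const void *src)
{
    struct example_block_literal *d = (struct example_block_literal *)dst;
    const struct example_block_literal *s = (const struct example_block_literal *)src;
    _Block_object_assign(&d->capturedObject, s->capturedObject, BLOCK_FIELD_IS_OBJECT);
    _Block_object_assign(&d->capturedVar, s->capturedVar, BLOCK_FIELD_IS_BYREF);
}

static void
example_dispose_helper(const void *src)
{
    const struct example_block_literal *s = (const struct example_block_literal *)src;
    _Block_object_dispose(s->capturedObject, BLOCK_FIELD_IS_OBJECT);
    _Block_object_dispose(s->capturedVar, BLOCK_FIELD_IS_BYREF);
}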
+
+
+// Workaround for <rdar://26015603> dylib with no __DATA segment fails to rebase
+__attribute__((used))
+static int let_there_be_data = 42;
+
+#undef malloc
+#undef free
+
diff --git a/libkern/libkern/Block.h b/libkern/libkern/Block.h
new file mode 100644 (file)
index 0000000..5509250
--- /dev/null
@@ -0,0 +1,66 @@
+/*
+ *  Block.h
+ *
+ * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LLVM_LICENSE_HEADER@
+ *
+ */
+
+#ifndef _Block_H_
+#define _Block_H_
+
+#if !defined(BLOCK_EXPORT)
+#   if defined(__cplusplus)
+#       define BLOCK_EXPORT extern "C" 
+#   else
+#       define BLOCK_EXPORT extern
+#   endif
+#endif
+
+#include <Availability.h>
+#ifndef KERNEL
+#include <TargetConditionals.h>
+#endif /* KERNEL */
+
+#if __cplusplus
+extern "C" {
+#endif
+
+// Create a heap based copy of a Block or simply add a reference to an existing one.
+// This must be paired with Block_release to recover memory, even when running
+// under Objective-C Garbage Collection.
+BLOCK_EXPORT void *_Block_copy(const void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+
+// Lose the reference, and if heap based and last reference, recover the memory
+BLOCK_EXPORT void _Block_release(const void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+
+
+// Used by the compiler. Do not call this function yourself.
+BLOCK_EXPORT void _Block_object_assign(void *, const void *, const int)
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+
+// Used by the compiler. Do not call this function yourself.
+BLOCK_EXPORT void _Block_object_dispose(const void *, const int)
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+
+// Used by the compiler. Do not use these variables yourself.
+BLOCK_EXPORT void * _NSConcreteGlobalBlock[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+BLOCK_EXPORT void * _NSConcreteStackBlock[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+
+
+#if __cplusplus
+}
+#endif
+
+// Type correct macros
+
+#define Block_copy(...) ((__typeof(__VA_ARGS__))_Block_copy((const void *)(__VA_ARGS__)))
+#define Block_release(...) _Block_release((const void *)(__VA_ARGS__))
+
+
+#endif
diff --git a/libkern/libkern/Block_private.h b/libkern/libkern/Block_private.h
new file mode 100644 (file)
index 0000000..d122f92
--- /dev/null
@@ -0,0 +1,458 @@
+/*
+ * Block_private.h
+ *
+ * SPI for Blocks
+ *
+ * Copyright (c) 2008-2010 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LLVM_LICENSE_HEADER@
+ *
+ */
+
+#ifndef _BLOCK_PRIVATE_H_
+#define _BLOCK_PRIVATE_H_
+
+#include <Availability.h>
+#include <AvailabilityMacros.h>
+#ifndef KERNEL
+#include <TargetConditionals.h>
+#endif
+
+#include <stdbool.h>
+#include <stdint.h>
+#ifdef KERNEL
+#include <sys/systm.h>
+#else
+#include <stdio.h>
+#endif
+
+
+#ifdef KERNEL
+#include <libkern/Block.h>
+struct Block_byref;
+#else
+#include <Block.h>
+#endif
+
+#if __has_include(<ptrauth.h>)
+#include <ptrauth.h>
+#endif
+
+#if __has_feature(ptrauth_calls) &&  __cplusplus < 201103L
+
+// C ptrauth or old C++ ptrauth
+
+#define _Block_set_function_pointer(field, value)                       \
+    ((value)                                                            \
+     ? ((field) =                                                       \
+        (__typeof__(field))                                             \
+        ptrauth_auth_and_resign((void*)(value),                         \
+                                ptrauth_key_function_pointer, 0,        \
+                                ptrauth_key_block_function, &(field)))  \
+     : ((field) = 0))
+
+#define _Block_get_function_pointer(field)                              \
+    ((field)                                                            \
+     ? (__typeof__(field))                                              \
+       ptrauth_auth_function((void*)(field),                            \
+                             ptrauth_key_block_function, &(field))      \
+     : (__typeof__(field))0)
+
+#else
+
+// C++11 ptrauth or no ptrauth
+
+#define _Block_set_function_pointer(field, value)       \
+    (field) = (value)
+
+#define _Block_get_function_pointer(field)      \
+    (field)
+
+#endif
+
+
+#if __has_feature(ptrauth_calls)  &&  __cplusplus >= 201103L
+
+// StorageSignedFunctionPointer<Key, Fn> stores a function pointer of type
+// Fn but signed with the given ptrauth key and with the address of its
+// storage as extra data.
+// Function pointers inside block objects are signed this way.
+template <typename Fn, ptrauth_key Key>
+class StorageSignedFunctionPointer {
+    uintptr_t bits;
+
+ public:
+
+    // Authenticate function pointer fn as a C function pointer.
+    // Re-sign it with our key and the storage address as extra data.
+    // DOES NOT actually write to our storage.
+    uintptr_t prepareWrite(Fn fn) const
+    {
+        if (fn == nullptr) {
+            return 0;
+        } else {
+            return (uintptr_t)
+                ptrauth_auth_and_resign(fn, ptrauth_key_function_pointer, 0,
+                                        Key, &bits);
+        }
+    }
+
+    // Authenticate otherBits at otherStorage.
+    // Re-sign it with our storage address.
+    // DOES NOT actually write to our storage.
+    uintptr_t prepareWrite(const StorageSignedFunctionPointer& other) const
+    {
+        if (other.bits == 0) {
+            return 0;
+        } else {
+            return (uintptr_t)
+                ptrauth_auth_and_resign((void*)other.bits, Key, &other.bits,
+                                        Key, &bits);
+        }
+    }
+
+    // Authenticate ptr as if it were stored at our storage address.
+    // Re-sign it as a C function pointer.
+    // DOES NOT actually read from our storage.
+    Fn completeReadFn(uintptr_t ptr) const
+    {
+        if (ptr == 0) {
+            return nullptr;
+        } else {
+            return ptrauth_auth_function((Fn)ptr, Key, &bits);
+        }
+    }
+
+    // Authenticate ptr as if it were at our storage address.
+    // Return it as a dereferenceable pointer.
+    // DOES NOT actually read from our storage.
+    void* completeReadRaw(uintptr_t ptr) const
+    {
+        if (ptr == 0) {
+            return nullptr;
+        } else {
+            return ptrauth_auth_data((void*)ptr, Key, &bits);
+        }
+    }
+
+    StorageSignedFunctionPointer() { }
+
+    StorageSignedFunctionPointer(Fn value)
+        : bits(prepareWrite(value)) { }
+
+    StorageSignedFunctionPointer(const StorageSignedFunctionPointer& value)
+        : bits(prepareWrite(value)) { }
+
+    StorageSignedFunctionPointer&
+    operator = (Fn rhs) {
+        bits = prepareWrite(rhs);
+        return *this;
+    }
+
+    StorageSignedFunctionPointer&
+    operator = (const StorageSignedFunctionPointer& rhs) {
+        bits = prepareWrite(rhs);
+        return *this;
+    }
+
+    operator Fn () const {
+        return completeReadFn(bits);
+    }
+
+    explicit operator void* () const {
+        return completeReadRaw(bits);
+    }
+
+    explicit operator bool () const {
+        return completeReadRaw(bits) != nullptr;
+    }
+};
+
+using BlockCopyFunction = StorageSignedFunctionPointer
+    <void(*)(void *, const void *),
+     ptrauth_key_block_function>;
+
+using BlockDisposeFunction = StorageSignedFunctionPointer
+    <void(*)(const void *),
+     ptrauth_key_block_function>;
+
+using BlockInvokeFunction = StorageSignedFunctionPointer
+    <void(*)(void *, ...),
+     ptrauth_key_block_function>;
+
+using BlockByrefKeepFunction = StorageSignedFunctionPointer
+    <void(*)(struct Block_byref *, struct Block_byref *),
+     ptrauth_key_block_function>;
+
+using BlockByrefDestroyFunction = StorageSignedFunctionPointer
+    <void(*)(struct Block_byref *),
+     ptrauth_key_block_function>;
+
+// c++11 and ptrauth_calls
+#elif !__has_feature(ptrauth_calls)
+// not ptrauth_calls
+
+typedef void(*BlockCopyFunction)(void *, const void *);
+typedef void(*BlockDisposeFunction)(const void *);
+typedef void(*BlockInvokeFunction)(void *, ...);
+typedef void(*BlockByrefKeepFunction)(struct Block_byref*, struct Block_byref*);
+typedef void(*BlockByrefDestroyFunction)(struct Block_byref *);
+
+#else
+// ptrauth_calls but not c++11
+
+typedef uintptr_t BlockCopyFunction;
+typedef uintptr_t BlockDisposeFunction;
+typedef uintptr_t BlockInvokeFunction;
+typedef uintptr_t BlockByrefKeepFunction;
+typedef uintptr_t BlockByrefDestroyFunction;
+
+#endif
+
+
+// Values for Block_layout->flags to describe block objects
+enum {
+    BLOCK_DEALLOCATING =      (0x0001),  // runtime
+    BLOCK_REFCOUNT_MASK =     (0xfffe),  // runtime
+    BLOCK_NEEDS_FREE =        (1 << 24), // runtime
+    BLOCK_HAS_COPY_DISPOSE =  (1 << 25), // compiler
+    BLOCK_HAS_CTOR =          (1 << 26), // compiler: helpers have C++ code
+    BLOCK_IS_GC =             (1 << 27), // runtime
+    BLOCK_IS_GLOBAL =         (1 << 28), // compiler
+    BLOCK_USE_STRET =         (1 << 29), // compiler: undefined if !BLOCK_HAS_SIGNATURE
+    BLOCK_HAS_SIGNATURE  =    (1 << 30), // compiler
+    BLOCK_HAS_EXTENDED_LAYOUT=(1 << 31)  // compiler
+};
+
+#define BLOCK_DESCRIPTOR_1 1
+struct Block_descriptor_1 {
+    uintptr_t reserved;
+    uintptr_t size;
+};
+
+#define BLOCK_DESCRIPTOR_2 1
+struct Block_descriptor_2 {
+    // requires BLOCK_HAS_COPY_DISPOSE
+    BlockCopyFunction copy;
+    BlockDisposeFunction dispose;
+};
+
+#define BLOCK_DESCRIPTOR_3 1
+struct Block_descriptor_3 {
+    // requires BLOCK_HAS_SIGNATURE
+    const char *signature;
+    const char *layout;     // contents depend on BLOCK_HAS_EXTENDED_LAYOUT
+};
+
+struct Block_layout {
+    void *isa;
+    volatile int32_t flags; // contains ref count
+    int32_t reserved;
+    BlockInvokeFunction invoke;
+    struct Block_descriptor_1 *descriptor;
+    // imported variables
+};
+
+
+// Values for Block_byref->flags to describe __block variables
+enum {
+    // Byref refcount must use the same bits as Block_layout's refcount.
+    // BLOCK_DEALLOCATING =      (0x0001),  // runtime
+    // BLOCK_REFCOUNT_MASK =     (0xfffe),  // runtime
+
+    BLOCK_BYREF_LAYOUT_MASK =       (0xf << 28), // compiler
+    BLOCK_BYREF_LAYOUT_EXTENDED =   (  1 << 28), // compiler
+    BLOCK_BYREF_LAYOUT_NON_OBJECT = (  2 << 28), // compiler
+    BLOCK_BYREF_LAYOUT_STRONG =     (  3 << 28), // compiler
+    BLOCK_BYREF_LAYOUT_WEAK =       (  4 << 28), // compiler
+    BLOCK_BYREF_LAYOUT_UNRETAINED = (  5 << 28), // compiler
+
+    BLOCK_BYREF_IS_GC =             (  1 << 27), // runtime
+
+    BLOCK_BYREF_HAS_COPY_DISPOSE =  (  1 << 25), // compiler
+    BLOCK_BYREF_NEEDS_FREE =        (  1 << 24), // runtime
+};
+
+struct Block_byref {
+    void *isa;
+    struct Block_byref *forwarding;
+    volatile int32_t flags; // contains ref count
+    uint32_t size;
+};
+
+struct Block_byref_2 {
+    // requires BLOCK_BYREF_HAS_COPY_DISPOSE
+    BlockByrefKeepFunction byref_keep;
+    BlockByrefDestroyFunction byref_destroy;
+};
+
+struct Block_byref_3 {
+    // requires BLOCK_BYREF_LAYOUT_EXTENDED
+    const char *layout;
+};
+
+
+// Extended layout encoding.
+
+// Values for Block_descriptor_3->layout with BLOCK_HAS_EXTENDED_LAYOUT
+// and for Block_byref_3->layout with BLOCK_BYREF_LAYOUT_EXTENDED
+
+// If the layout field is less than 0x1000, then it is a compact encoding 
+// of the form 0xXYZ: X strong pointers, then Y byref pointers, 
+// then Z weak pointers.
+
+// If the layout field is 0x1000 or greater, it points to a 
+// string of layout bytes. Each byte is of the form 0xPN.
+// Operator P is from the list below. Value N is a parameter for the operator.
+// Byte 0x00 terminates the layout; remaining block data is non-pointer bytes.
+
+enum {
+    BLOCK_LAYOUT_ESCAPE = 0, // N=0 halt, rest is non-pointer. N!=0 reserved.
+    BLOCK_LAYOUT_NON_OBJECT_BYTES = 1,    // N bytes non-objects
+    BLOCK_LAYOUT_NON_OBJECT_WORDS = 2,    // N words non-objects
+    BLOCK_LAYOUT_STRONG           = 3,    // N words strong pointers
+    BLOCK_LAYOUT_BYREF            = 4,    // N words byref pointers
+    BLOCK_LAYOUT_WEAK             = 5,    // N words weak pointers
+    BLOCK_LAYOUT_UNRETAINED       = 6,    // N words unretained pointers
+    BLOCK_LAYOUT_UNKNOWN_WORDS_7  = 7,    // N words, reserved
+    BLOCK_LAYOUT_UNKNOWN_WORDS_8  = 8,    // N words, reserved
+    BLOCK_LAYOUT_UNKNOWN_WORDS_9  = 9,    // N words, reserved
+    BLOCK_LAYOUT_UNKNOWN_WORDS_A  = 0xA,  // N words, reserved
+    BLOCK_LAYOUT_UNUSED_B         = 0xB,  // unspecified, reserved
+    BLOCK_LAYOUT_UNUSED_C         = 0xC,  // unspecified, reserved
+    BLOCK_LAYOUT_UNUSED_D         = 0xD,  // unspecified, reserved
+    BLOCK_LAYOUT_UNUSED_E         = 0xE,  // unspecified, reserved
+    BLOCK_LAYOUT_UNUSED_F         = 0xF,  // unspecified, reserved
+};
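+
+// Worked example of the encodings described above: a compact layout value of
+// 0x211 means 2 strong pointers, then 1 byref pointer, then 1 weak pointer.
+// In the long form, the byte 0x34 (P = BLOCK_LAYOUT_STRONG, N = 4) means
+// 4 words of strong pointers, and a 0x00 byte terminates the description.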
+
+
+// Runtime support functions used by compiler when generating copy/dispose helpers
+
+// Values for _Block_object_assign() and _Block_object_dispose() parameters
+enum {
+    // see function implementation for a more complete description of these fields and combinations
+    BLOCK_FIELD_IS_OBJECT   =  3,  // id, NSObject, __attribute__((NSObject)), block, ...
+    BLOCK_FIELD_IS_BLOCK    =  7,  // a block variable
+    BLOCK_FIELD_IS_BYREF    =  8,  // the on stack structure holding the __block variable
+    BLOCK_FIELD_IS_WEAK     = 16,  // declared __weak, only used in byref copy helpers
+    BLOCK_BYREF_CALLER      = 128, // called from __block (byref) copy/dispose support routines.
+};
+
+enum {
+    BLOCK_ALL_COPY_DISPOSE_FLAGS = 
+        BLOCK_FIELD_IS_OBJECT | BLOCK_FIELD_IS_BLOCK | BLOCK_FIELD_IS_BYREF |
+        BLOCK_FIELD_IS_WEAK | BLOCK_BYREF_CALLER
+};
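+
+// Illustrative combinations (see _Block_object_assign() for the authoritative
+// list): a byref copy helper assigning a __block id variable would pass
+// BLOCK_BYREF_CALLER | BLOCK_FIELD_IS_OBJECT, and would additionally set
+// BLOCK_FIELD_IS_WEAK if the variable was declared __weak.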
+
+
+// Function pointer accessors
+
+static inline __typeof__(void (*)(void *, ...))
+_Block_get_invoke_fn(struct Block_layout *block)
+{
+    return (void (*)(void *, ...))_Block_get_function_pointer(block->invoke);
+}
+
+static inline void 
+_Block_set_invoke_fn(struct Block_layout *block, void (*fn)(void *, ...))
+{
+    _Block_set_function_pointer(block->invoke, fn);
+}
+
+
+static inline __typeof__(void (*)(void *, const void *))
+_Block_get_copy_fn(struct Block_descriptor_2 *desc)
+{
+    return (void (*)(void *, const void *))_Block_get_function_pointer(desc->copy);
+}
+
+static inline void 
+_Block_set_copy_fn(struct Block_descriptor_2 *desc,
+                   void (*fn)(void *, const void *))
+{
+    _Block_set_function_pointer(desc->copy, fn);
+}
+
+
+static inline __typeof__(void (*)(const void *))
+_Block_get_dispose_fn(struct Block_descriptor_2 *desc)
+{
+    return (void (*)(const void *))_Block_get_function_pointer(desc->dispose);
+}
+
+static inline void 
+_Block_set_dispose_fn(struct Block_descriptor_2 *desc,
+                      void (*fn)(const void *))
+{
+    _Block_set_function_pointer(desc->dispose, fn);
+}
+
+
+// Other support functions
+
+
+// runtime entry to get total size of a closure
+BLOCK_EXPORT size_t Block_size(void *aBlock);
+
+// indicates whether the block was compiled with a compiler that sets the ABI-related metadata bits
+BLOCK_EXPORT bool _Block_has_signature(void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+// returns TRUE if the block's return value is returned on the stack (stret), FALSE otherwise
+BLOCK_EXPORT bool _Block_use_stret(void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+// Returns a string describing the block's parameter and return types.
+// The encoding scheme is the same as Objective-C @encode.
+// Returns NULL for blocks compiled with some compilers.
+BLOCK_EXPORT const char * _Block_signature(void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+// Returns a string describing the block's GC layout.
+// This uses the GC skip/scan encoding.
+// May return NULL.
+BLOCK_EXPORT const char * _Block_layout(void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+// Returns a string describing the block's layout.
+// This uses the "extended layout" form described above.
+// May return NULL.
+BLOCK_EXPORT const char * _Block_extended_layout(void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_8, __IPHONE_7_0);
+
+// Callable only from the ARR weak subsystem while in exclusion zone
+BLOCK_EXPORT bool _Block_tryRetain(const void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+// Callable only from the ARR weak subsystem while in exclusion zone
+BLOCK_EXPORT bool _Block_isDeallocating(const void *aBlock)
+    __OSX_AVAILABLE_STARTING(__MAC_10_7, __IPHONE_4_3);
+
+
+// the raw data space for runtime classes for blocks
+// class+meta used for stack, malloc, and collectable based blocks
+BLOCK_EXPORT void * _NSConcreteMallocBlock[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+BLOCK_EXPORT void * _NSConcreteAutoBlock[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+BLOCK_EXPORT void * _NSConcreteFinalizingBlock[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+BLOCK_EXPORT void * _NSConcreteWeakBlockVariable[32]
+    __OSX_AVAILABLE_STARTING(__MAC_10_6, __IPHONE_3_2);
+// declared in Block.h
+// BLOCK_EXPORT void * _NSConcreteGlobalBlock[32];
+// BLOCK_EXPORT void * _NSConcreteStackBlock[32];
+
+
+struct Block_callbacks_RR {
+    size_t  size;                   // size == sizeof(struct Block_callbacks_RR)
+    void  (*retain)(const void *);
+    void  (*release)(const void *);
+    void  (*destructInstance)(const void *);
+};
+typedef struct Block_callbacks_RR Block_callbacks_RR;
+
+BLOCK_EXPORT void _Block_use_RR2(const Block_callbacks_RR *callbacks);
+
+
+#endif
index e1c7b3a4ab16ba071a417c90423bf1a5b23491f9..a7b66fe8a94fd001514ae49762aa7f51c324f59e 100644 (file)
@@ -7,9 +7,10 @@ include $(MakeInc_cmd)
 include $(MakeInc_def)
 
 INSTINC_SUBDIRS = \
-        machine \
-       c++ \
-       crypto
+    machine \
+    c++ \
+    crypto \
+    img4
 INSTINC_SUBDIRS_X86_64 = \
         i386
 INSTINC_SUBDIRS_X86_64H = \
@@ -42,7 +43,9 @@ KERNELFILES = \
        sysctl.h \
        tree.h \
        zconf.h \
-       zlib.h
+       zlib.h \
+       crc.h \
+       Block.h
 
 PRIVATE_KERNELFILES = \
        OSKextLibPrivate.h \
@@ -50,7 +53,8 @@ PRIVATE_KERNELFILES = \
        kext_request_keys.h \
        mkext.h \
        prelink.h \
-       section_keywords.h
+       section_keywords.h \
+       Block_private.h
 
 PRIVATE_DATAFILES = \
        ${PRIVATE_KERNELFILES} \
index fd08744ed58277a2cf31f2310931134d716a8774..147ab96f6b7b0fd41beb537b011f94cb3e017080 100644 (file)
@@ -130,6 +130,7 @@ typedef uint8_t OSKextExcludeLevel;
 #define kOSBundlePathKey                        "OSBundlePath"
 #define kOSBundleExecutablePathKey              "OSBundleExecutablePath"
 #define kOSBundleUUIDKey                        "OSBundleUUID"
+#define kOSBundleTextUUIDKey                    "OSBundleTextUUID"
 #define kOSBundleStartedKey                     "OSBundleStarted"
 #define kOSBundlePrelinkedKey                   "OSBundlePrelinked"
 #define kOSBundleLoadTagKey                     "OSBundleLoadTag"
@@ -140,6 +141,11 @@ typedef uint8_t OSKextExcludeLevel;
 #define kOSBundleWiredSizeKey                   "OSBundleWiredSize"
 #define kOSBundleDependenciesKey                "OSBundleDependencies"
 #define kOSBundleRetainCountKey                 "OSBundleRetainCount"
+#define kOSBundleCacheLoadAddressKey            "OSBundleCacheLoadAddress"
+// Kernel TEXT encompasses kexts
+#define kOSBundleKextsInKernelTextKey           "OSBundleKextsInKernelText"
+// OSKextCopyLoadedKextInfo includes non-started kexts when present:
+#define kOSBundleAllPrelinkedKey                "OSBundleAllPrelinked"
 
 /* Dictionary of metaclass info keyed by classname.
  */
@@ -934,6 +940,10 @@ extern void                         OSKextFreeSite(vm_allocation_site_t * site);
 extern int OSKextGetUUIDForName(const char *, uuid_t);
 #endif
 
+extern vm_tag_t gIOSurfaceTag;
+
+extern void *OSKextKextForAddress(const void *addr);
+
 #endif /* XNU_KERNEL_PRIVATE */
 
 __END_DECLS
diff --git a/libkern/libkern/OSRuntime.h b/libkern/libkern/OSRuntime.h
new file mode 100644 (file)
index 0000000..bf7232a
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 1999-2012 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _OS_OSRUNTIME_H
+#define _OS_OSRUNTIME_H
+
+#include <libkern/OSBase.h>
+
+__BEGIN_DECLS
+
+extern void *kern_os_malloc(size_t size) __attribute__((alloc_size(1)));
+extern void *kern_os_realloc(void * addr, size_t size) __attribute__((alloc_size(2)));
+extern void kern_os_free(void * address);
+
+__END_DECLS
+
+#endif /* _OS_OSRUNTIME_H */
index 91deba1fad3df6ecba7885cf72ea78c68c40fb14..f162bbdc3aaa38d340472433546bc801a3618474 100644 (file)
@@ -448,6 +448,46 @@ public:
     virtual OSCollection *copyCollection(OSDictionary * cycleDict = 0);
     OSMetaClassDeclareReservedUsed(OSCollection, 1);
 
+   /*!
+    * @function iterateObjects
+    *
+    * @abstract
+    * Invoke a callback for each member of the collection.
+    *
+    * @param refcon   A reference constant for the callback.
+    * @param callback The callback function,
+    *                 called with the refcon and each member object
+    *                 of the collection in turn, on the caller's thread.
+    *                 The callback should return true to terminate the
+    *                 iteration early, false otherwise.
+    *
+    * @result
+    * False if the collection iteration was made invalid
+    * (see OSCollectionIterator::isValid()), otherwise true.
+    */
+    bool iterateObjects(void * refcon, bool (*callback)(void * refcon, OSObject * object));
+
+#ifdef __BLOCKS__
+
+   /*!
+    * @function iterateObjects
+    *
+    * @abstract
+    * Invoke a block for each member of the collection.
+    *
+    * @param block    The block,
+    *                 called with each member object
+    *                 of the collection in turn, on the caller's thread.
+    *                 The block should return true to terminate the
+    *                 iteration early, false otherwise.
+    *
+    * @result
+    * False if the collection iteration was made invalid
+    * (see OSCollectionIterator::isValid()), otherwise true.
+    */
+    bool iterateObjects(bool (^block)(OSObject * object));
+
+#endif /* __BLOCKS__ */
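+
+   /* Usage sketch (illustrative, assuming a concrete subclass such as OSArray):
+    *
+    *     collection->iterateObjects(^bool(OSObject * object) {
+    *         // examine object; return true to stop the iteration early
+    *         return false;
+    *     });
+    */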
 
     OSMetaClassDeclareReservedUnused(OSCollection, 2);
     OSMetaClassDeclareReservedUnused(OSCollection, 3);
index c5438e9d3ad47f72060dc5e9fd8167e47418acfc..5168ca4d8bfca5b534e2b8e4eb797b100037d8bb 100644 (file)
@@ -925,6 +925,48 @@ public:
     OSArray * copyKeys(void);
 #endif /* XNU_KERNEL_PRIVATE */
 
+
+   /*!
+    * @function iterateObjects
+    *
+    * @abstract
+    * Invoke a callback for each member of the collection.
+    *
+    * @param refcon   A reference constant for the callback.
+    * @param callback The callback function,
+    *                 called with the refcon and each member key & object
+    *                 of the dictionary in turn, on the caller's thread.
+    *                 The callback should return true to terminate the
+    *                 iteration early, false otherwise.
+    *
+    * @result
+    * False if the dictionary iteration was made invalid
+    * (see OSCollectionIterator::isValid()), otherwise true.
+    */
+    bool iterateObjects(void * refcon, bool (*callback)(void * refcon, const OSSymbol * key, OSObject * object));
+
+#ifdef __BLOCKS__
+
+   /*!
+    * @function iterateObjects
+    *
+    * @abstract
+    * Invoke a block for each member of the collection.
+    *
+    * @param block    The block,
+    *                 called with each member key & object
+    *                 of the dictionary in turn, on the caller's thread.
+    *                 The block should return true to terminate the
+    *                 iteration early, false otherwise.
+    *
+    * @result
+    * False if the dictionary iteration was made invalid
+    * (see OSCollectionIterator::isValid()), otherwise true.
+    */
+    bool iterateObjects(bool (^block)(const OSSymbol * key, OSObject * object));
+
+#endif /* __BLOCKS__ */
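+   /* Usage sketch (illustrative): visiting each key/value pair with a block:
+    *
+    *     dict->iterateObjects(^bool(const OSSymbol * key, OSObject * object) {
+    *         // inspect key and object; return true to stop the iteration early
+    *         return false;
+    *     });
+    */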
+
     OSMetaClassDeclareReservedUnused(OSDictionary, 0);
     OSMetaClassDeclareReservedUnused(OSDictionary, 1);
     OSMetaClassDeclareReservedUnused(OSDictionary, 2);
index 2930a5fc7c70fd7c2d5e5c703b84917b7d2ed69f..2abc2929c36906c12eb1f6479db0920efa7d51d1 100644 (file)
@@ -88,11 +88,11 @@ void OSKextVLog(
 void OSKextRemoveKextBootstrap(void);
 
 kern_return_t OSRuntimeInitializeCPP(
-    kmod_info_t * kmodInfo,
-    void *        data);
+    OSKext * kext);
 kern_return_t OSRuntimeFinalizeCPP(
-    kmod_info_t * kmodInfo,
-    void        * data);
+    OSKext * kext);
+void OSRuntimeUnloadCPPForSegment(
+    kernel_segment_command_t * segment);
 
 kern_return_t is_io_catalog_send_data(
     mach_port_t              masterPort,
@@ -212,11 +212,11 @@ class OSKext : public OSObject
         __unused thread_call_param_t p1);
 
     friend kern_return_t OSRuntimeInitializeCPP(
-        kmod_info_t * kmodInfo,
-        void *        data);
+        OSKext * kext);
     friend kern_return_t OSRuntimeFinalizeCPP(
-        kmod_info_t * kmodInfo,
-        void        * data);
+        OSKext * kext);
+       friend void OSRuntimeUnloadCPPForSegment(
+        kernel_segment_command_t * segment);
 
     friend kern_return_t is_io_catalog_send_data(
             mach_port_t              masterPort,
@@ -275,6 +275,7 @@ private:
         unsigned int interface:1;
         unsigned int kernelComponent:1;
         unsigned int prelinked:1;
+        unsigned int builtin:1;
         unsigned int loaded:1;
         unsigned int dtraceInitialized:1;
         unsigned int starting:1;
@@ -292,6 +293,7 @@ private:
     struct list_head pendingPgoHead;
     uuid_t instance_uuid;
     OSKextAccount * account;
+    uint32_t builtinKmodIdx;
 
 #if PRAGMA_MARK
 /**************************************/
@@ -307,6 +309,10 @@ public:
     static OSDictionary * copyKexts(void);
     static OSReturn       removeKextBootstrap(void);
     static void           willShutdown(void);  // called by IOPMrootDomain on shutdown
+    static  void reportOSMetaClassInstances(
+        const char     * kextIdentifier,
+        OSKextLogSpec    msgLogSpec);
+
 #endif /* XNU_KERNEL_PRIVATE */
 
 private:
@@ -500,9 +506,7 @@ private:
         OSMetaClass * aClass);
     virtual bool    hasOSMetaClassInstances(void);
     virtual OSSet * getMetaClasses(void);
-    static  void reportOSMetaClassInstances(
-        const char     * kextIdentifier,
-        OSKextLogSpec    msgLogSpec);
+
     virtual void reportOSMetaClassInstances(
         OSKextLogSpec msgLogSpec);
 
@@ -565,12 +569,14 @@ private:
     void updateLoadedKextSummary(OSKextLoadedKextSummary *summary);
     void updateActiveAccount(OSKextActiveAccount *accountp);
 
+#ifdef XNU_KERNEL_PRIVATE
+public:
+#endif /* XNU_KERNEL_PRIVATE */
+
     /* C++ Initialization.
      */
     virtual void               setCPPInitialized(bool initialized=true);
 
-
-
 #if PRAGMA_MARK
 /**************************************/
 #pragma mark Public Functions
@@ -645,6 +651,8 @@ public:
     virtual OSKextLoadTag      getLoadTag(void);
     virtual void               getSizeInfo(uint32_t *loadSize, uint32_t *wiredSize);
     virtual OSData           * copyUUID(void);
+    OSData                   * copyTextUUID(void);
+    OSData                   * copyMachoUUID(const kernel_mach_header_t * header);
     virtual OSArray          * copyPersonalitiesArray(void);
     
    /* This removes personalities naming the kext (by CFBundleIdentifier),
index 6098d6b790f097f215278c60061c3195a11330d9..f05a9b858550d0276ac3a595b2f2986182c4ba0c 100644 (file)
@@ -318,87 +318,14 @@ public:
  *  @abstract Release an object if not <code>NULL</code>, then set it to <code>NULL</code>.
  *  @param    inst  Instance of an OSObject, may be <code>NULL</code>.
  */
-#define OSSafeReleaseNULL(inst)   do { if (inst) (inst)->release(); (inst) = NULL; } while (0)
+#define OSSafeReleaseNULL(inst)   do { if (inst != NULL) (inst)->release(); (inst) = NULL; } while (0)
 
 typedef void (*_ptf_t)(void);
 
-#if APPLE_KEXT_LEGACY_ABI
-
-// Arcane evil code interprets a C++ pointer to function as specified in the
-// -fapple-kext ABI, i.e. the gcc-2.95 generated code.  IT DOES NOT ALLOW
-// the conversion of functions that are from MULTIPLY inherited classes.
-
-static inline _ptf_t
-_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
-{
-    union {
-        void (OSMetaClassBase::*fIn)(void);
-        struct {     // Pointer to member function 2.95
-            unsigned short fToff;
-            short  fVInd;
-            union {
-                _ptf_t fPFN;
-                short  fVOff;
-            } u;
-        } fptmf2;
-    } map;
-
-    map.fIn = func;
-    if (map.fptmf2.fToff) {
-        panic("Multiple inheritance is not supported");
-        return 0;
-    } else if (map.fptmf2.fVInd < 0) {
-        // Not virtual, i.e. plain member func
-        return map.fptmf2.u.fPFN;
-    } else {
-        union {
-            const OSMetaClassBase *fObj;
-            _ptf_t **vtablep;
-        } u;
-        u.fObj = self;
-
-        // Virtual member function so dereference vtable
-        return (*u.vtablep)[map.fptmf2.fVInd - 1];
-    }
-}
-
-#else /* !APPLE_KEXT_LEGACY_ABI */
 #if defined(__arm__) || defined(__arm64__)
-typedef long int ptrdiff_t;
-/*
- * Ugly reverse engineered ABI.  Where does it come from?  Nobody knows.
- * <rdar://problem/5641129> gcc 4.2-built ARM kernel panics with multiple inheritance (no, really)
- */
-static inline _ptf_t
-_ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
-{
-    struct ptmf_t {
-        _ptf_t fPFN;
-        ptrdiff_t delta;
-    };
-    union {
-        void (OSMetaClassBase::*fIn)(void);
-        struct ptmf_t pTMF;
-    } map;
 
+    static _ptf_t _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void));
 
-    map.fIn = func;
-
-    if (map.pTMF.delta & 1) {
-        // virtual
-        union {
-            const OSMetaClassBase *fObj;
-            _ptf_t **vtablep;
-        } u;
-        u.fObj = self;
-
-        // Virtual member function so dereference table
-        return *(_ptf_t *)(((uintptr_t)*u.vtablep) + (uintptr_t)map.pTMF.fPFN);
-    } else {
-        // Not virtual, i.e. plain member func
-        return map.pTMF.fPFN;
-    } 
-}
 #elif defined(__i386__) || defined(__x86_64__)
 
 // Slightly less arcane and slightly less evil code to do
@@ -436,7 +363,6 @@ _ptmf2ptf(const OSMetaClassBase *self, void (OSMetaClassBase::*func)(void))
 #error Unknown architecture.
 #endif /* __arm__ */
 
-#endif /* !APPLE_KEXT_LEGACY_ABI */
 
    /*!
     * @define OSMemberFunctionCast
index 59e12d1088b6c0f84392818329f2d76804fa46e7..53de72aa6d427053e6e0d3c75c80402a83969d6a 100644 (file)
@@ -310,11 +310,15 @@ public:
     OSMetaClassDeclareReservedUnused(OSSerialize, 7);
 };
 
-// xx-review: this whole class seems to be unused!
 
 typedef bool (*OSSerializerCallback)(void * target, void * ref,
                                      OSSerialize * serializer);
 
+#ifdef __BLOCKS__
+typedef bool (^OSSerializerBlock)(OSSerialize * serializer);
+#endif /* __BLOCKS__ */
+
+
 class OSSerializer : public OSObject
 {
     OSDeclareDefaultStructors(OSSerializer)
@@ -330,6 +334,18 @@ public:
         OSSerializerCallback callback,
         void * ref = 0);
 
+#ifdef __BLOCKS__
+    static OSSerializer * withBlock(
+        OSSerializerBlock callback);
+#endif
+
+    virtual void free( void ) APPLE_KEXT_OVERRIDE;
+
+#if XNU_KERNEL_PRIVATE
+         static bool callbackToBlock(void * target, void * ref,
+                                     OSSerialize * serializer);
+#endif /* XNU_KERNEL_PRIVATE */
+
     virtual bool serialize(OSSerialize * serializer) const APPLE_KEXT_OVERRIDE;
 };
 
diff --git a/libkern/libkern/crc.h b/libkern/libkern/crc.h
new file mode 100644 (file)
index 0000000..bf7f42b
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2017-2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _LIBKERN_CRC_H_
+#define _LIBKERN_CRC_H_
+
+#include <sys/cdefs.h>
+#include <stdint.h>
+
+__BEGIN_DECLS
+
+uint16_t       crc16(uint16_t crc, const void *bufp, size_t len);
+uint32_t       crc32(uint32_t crc, const void *bufp, size_t len);
+
+__END_DECLS
+
+#endif /* _LIBKERN_CRC_H_ */
diff --git a/libkern/libkern/img4/Makefile b/libkern/libkern/img4/Makefile
new file mode 100644 (file)
index 0000000..1ae8a81
--- /dev/null
@@ -0,0 +1,24 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+DATAFILES =
+PRIVATE_DATAFILES =
+KERNELFILES =
+PRIVATE_KERNELFILES = interface.h
+
+INSTALL_MI_LIST = ${DATAFILES}
+INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
+INSTALL_KF_MI_LIST = ${KERNELFILES}
+INSTALL_KF_MI_LCL_LIST = ${PRIVATE_KERNELFILES}
+EXPORT_MI_LIST = ${INSTALL_KF_MI_LCL_LIST}
+
+INSTALL_MI_DIR = libkern/img4
+EXPORT_MI_DIR = libkern/img4
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/libkern/libkern/img4/interface.h b/libkern/libkern/img4/interface.h
new file mode 100644 (file)
index 0000000..f88d89b
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*!
+ * @header
+ * Interfaces to register the AppleImage4 interface with xnu-proper to avoid a
+ * build-time layering inversion.
+ */
+#ifndef __IMG4_INTERFACE_H
+#define __IMG4_INTERFACE_H
+
+#include <os/base.h>
+#include <sys/cdefs.h>
+
+/*
+ * We rely on img4.h's logic for either including sys/types.h or declaring
+ * errno_t ourselves.
+ */
+#include <img4/img4.h>
+
+/*!
+ * @const IMG4_INTERFACE_VERSION
+ * The version of the interface supported by the implementation. As new
+ * functions are added to the interface, this value will be incremented so that
+ * it can be tested at build-time and not require rev-locked submissions of xnu
+ * and AppleImage4.
+ */
+#define IMG4_INTERFACE_VERSION (1u)
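+
+/*
+ * For example, xnu-side code that depends on members added in a later version
+ * could be guarded at build time (illustrative sketch; version 2 is
+ * hypothetical and does not exist yet):
+ *
+ *     #if IMG4_INTERFACE_VERSION >= 2
+ *     // use interface members introduced in version 2
+ *     #endif
+ */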
+
+/*!
+ * @typedef img4_init_t
+ * A type describing the img4_init() function.
+ */
+typedef errno_t (*img4_init_t)(
+       img4_t *i4,
+       img4_flags_t flags,
+       const uint8_t *bytes,
+       size_t len,
+       img4_destructor_t destructor
+);
+
+/*!
+ * @typedef img4_set_custom_tag_handler_t
+ * A type describing the img4_set_custom_tag_handler() function.
+ */
+typedef void (*img4_set_custom_tag_handler_t)(
+       img4_t *i4,
+       const img4_custom_tag_t *tags,
+       size_t tags_cnt
+);
+
+/*!
+ * @typedef img4_get_trusted_payload_t
+ * A type describing the img4_get_trusted_payload() function.
+ */
+typedef errno_t (*img4_get_trusted_payload_t)(
+       img4_t *i4,
+       img4_tag_t tag,
+       const img4_environment_t *env,
+       void *ctx,
+       const uint8_t **bytes,
+       size_t *len
+);
+
+/*!
+ * @typedef img4_get_trusted_external_payload_t
+ * A type describing the img4_get_trusted_external_payload() function.
+ */
+typedef errno_t (*img4_get_trusted_external_payload_t)(
+       img4_t *img4,
+       img4_payload_t *payload,
+       const img4_environment_t *env,
+       void *ctx,
+       const uint8_t **bytes,
+       size_t *len
+);
+
+/*!
+ * @typedef img4_get_entitlement_bool_t
+ * A type describing the img4_get_entitlement_bool() function.
+ */
+typedef bool (*img4_get_entitlement_bool_t)(
+       img4_t *i4,
+       img4_tag_t entitlement
+);
+
+/*!
+ * @typedef img4_get_object_entitlement_bool_t
+ * A type describing the img4_get_object_entitlement_bool() function.
+ */
+typedef bool (*img4_get_object_entitlement_bool_t)(
+       img4_t *i4,
+       img4_tag_t object,
+       img4_tag_t entitlement
+);
+
+/*!
+ * @typedef img4_destroy_t
+ * A type describing the img4_destroy() function.
+ */
+typedef void (*img4_destroy_t)(
+       img4_t *i4
+);
+
+/*!
+ * @typedef img4_interface_t
+ * A structure describing the interface to the AppleImage4 kext.
+ *
+ * @property i4if_version
+ * The version of the structure supported by the implementation.
+ *
+ * @property i4if_init
+ * A pointer to the img4_init function.
+ *
+ * @property i4if_set_custom_tag_handler
+ * A pointer to the img4_set_custom_tag_handler function.
+ *
+ * @property i4if_get_trusted_payload
+ * A pointer to the img4_get_trusted_payload function.
+ *
+ * @property i4if_get_trusted_external_payload
+ * A pointer to the img4_get_trusted_external_payload function.
+ *
+ * @property i4if_get_entitlement_bool
+ * A pointer to the img4_get_entitlement_bool function.
+ *
+ * @property i4if_get_object_entitlement_bool
+ * A pointer to the img4_get_object_entitlement_bool function.
+ *
+ * @property i4if_destroy
+ * A pointer to the img4_destroy function.
+ *
+ * @property i4if_v1
+ * All members added in version 1 of the structure.
+ *
+ * @property environment_platform
+ * The IMG4_ENVIRONMENT_PLATFORM global.
+ */
+typedef struct _img4_interface {
+       const uint32_t i4if_version;
+       const img4_init_t i4if_init;
+       const img4_set_custom_tag_handler_t i4if_set_custom_tag_handler;
+       const img4_get_trusted_payload_t i4if_get_trusted_payload;
+       const img4_get_trusted_external_payload_t i4if_get_trusted_external_payload;
+       const img4_get_entitlement_bool_t i4if_get_entitlement_bool;
+       const img4_get_object_entitlement_bool_t i4if_get_object_entitlement_bool;
+       const img4_destroy_t i4if_destroy;
+       struct {
+               const img4_environment_t *environment_platform;
+       } i4if_v1;
+       void *__reserved[23];
+} img4_interface_t;
+
+__BEGIN_DECLS;
+
+/*!
+ * @const img4if
+ * The AppleImage4 interface that was registered.
+ */
+extern const img4_interface_t *img4if;
+
+/*!
+ * @function img4_interface_register
+ * Registers the AppleImage4 kext interface with xnu.
+ *
+ * @param i4
+ * The interface to register.
+ *
+ * @discussion
+ * This routine may only be called once and must be called before late-const has
+ * been applied to kernel memory.
+ */
+OS_EXPORT OS_NONNULL1
+void
+img4_interface_register(const img4_interface_t *i4);
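+
+/*
+ * Illustrative sketch (hypothetical caller-side code): the AppleImage4 kext
+ * would populate the structure with its implementation and register it once
+ * during early boot, e.g.
+ *
+ *     static const img4_interface_t i4if_impl = {
+ *         .i4if_version = IMG4_INTERFACE_VERSION,
+ *         .i4if_init = img4_init,
+ *         .i4if_get_trusted_payload = img4_get_trusted_payload,
+ *         // ... remaining function pointers ...
+ *         .i4if_v1 = { .environment_platform = IMG4_ENVIRONMENT_PLATFORM },
+ *     };
+ *     img4_interface_register(&i4if_impl);
+ */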
+
+__END_DECLS;
+
+#endif // __IMG4_INTERFACE_H
index 929ab17bc56913f225f2569c1fdf551e1c0738fe..38996dbb908918a98a18df942bac3167be649edc 100644 (file)
 
 #define kPrelinkInfoSegment                "__PRELINK_INFO"
 #define kPrelinkInfoSection                "__info"
+#define kBuiltinInfoSection                "__kmod_info"
+#define kBuiltinStartSection               "__kmod_start"
+
+// __DATA segment
+#define kBuiltinInitSection                "__kmod_init"
+#define kBuiltinTermSection                "__kmod_term"
 
 #define kPrelinkBundlePathKey              "_PrelinkBundlePath"
 #define kPrelinkExecutableRelativePathKey  "_PrelinkExecutableRelativePath"
index 57b97d48a29a04e57ce4198f2d7114ccea28607c..281f8e41054b22c1be641f0a0b459dc31ab54666 100644 (file)
  */
 #define VERSION_PRERELEASE_LEVEL       ###KERNEL_VERSION_PRERELEASE_LEVEL###
 
+/* OSBUILD_CONFIG, osbuild_config is a one-word string describing the build
+ * configuration of the kernel, e.g., development or release */
+#define OSBUILD_CONFIG  "###KERNEL_BUILD_CONFIG###"
+
 /* OSTYPE, ostype, is a string as returned by uname -s */
 #define        OSTYPE          "Darwin"
 
@@ -89,6 +93,9 @@ extern const int version_stage;
 /* Build-time value of VERSION_PRERELEASE_LEVEL */
 extern const int version_prerelease_level;
 
+/* Build-time value of CURRENT_KERNEL_CONFIG */
+extern const char osbuild_config[];
+
 /* Build-time value of OSTYPE */
 extern const char ostype[];
 
index 390b9b86174e41b2fdfcf615e04e02be23f68971..dc30508ab5cfffba8cb9a047ceb2939684242374 100644 (file)
@@ -17,7 +17,8 @@ KERNELFILES = \
 
 PRIVATE_KERNELFILES = \
        object_private.h \
-       reason_private.h
+       reason_private.h \
+       refcnt.h
 
 PRIVATE_DATAFILES = \
        reason_private.h
index 14326986279b30c7e29cc424dfd3aa500f7b0554..d4a8a3e7a8f9e505cf8823131daa9900f33f6ef0 100644 (file)
 
 #include "log_encode.h"
 
+/* On embedded, where there is no kext loading or unloading,
+ * make the kernel use the libtrace shared cache path for logging.
+ */
+#define FIREHOSE_USES_SHARED_CACHE NO_KEXTD
+
+#if FIREHOSE_USES_SHARED_CACHE
+extern vm_offset_t   segLOWESTTEXT;
+#endif
+
 struct os_log_s {
        int a;
 };
@@ -280,6 +289,13 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
     uint8_t pubdata[OS_LOG_BUFFER_MAX_SIZE];
     va_list args_copy;
 
+    if (addr == NULL) {
+        return;
+    }
+
+#if FIREHOSE_USES_SHARED_CACHE
+    dso = (void *) segLOWESTTEXT;
+#else /* FIREHOSE_USES_SHARED_CACHE */
     if (dso == NULL) {
         dso = (void *) OSKextKextForAddress(format);
         if (dso == NULL) {
@@ -291,14 +307,11 @@ _os_log_to_log_internal(os_log_t oslog, os_log_type_t type,
         return;
     }
 
-    if (addr == NULL) {
-        return;
-    }
-
     void *dso_addr = (void *) OSKextKextForAddress(addr);
     if (dso != dso_addr) {
         return;
     }
+#endif /* FIREHOSE_USES_SHARED_CACHE */
 
     memset(&context, 0, sizeof(context));
     memset(buffer, 0, OS_LOG_BUFFER_MAX_SIZE);
@@ -326,10 +339,18 @@ static inline size_t
 _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)],
                void *dso, const void *address, firehose_tracepoint_flags_t *flags)
 {
-       kernel_mach_header_t *mh = dso;
+#if FIREHOSE_USES_SHARED_CACHE
+    *flags = _firehose_tracepoint_flags_pc_style_shared_cache;
+    memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso },
+                       sizeof(uint32_t));
+       return sizeof(uint32_t);
+
+#else /* FIREHOSE_USES_SHARED_CACHE */
+    kernel_mach_header_t *mh = dso;
 
        if (mh->filetype == MH_EXECUTE) {
                *flags = _firehose_tracepoint_flags_pc_style_main_exe;
+
                memcpy(buf, (uint32_t[]){ (uintptr_t)address - (uintptr_t)dso },
                                sizeof(uint32_t));
                return sizeof(uint32_t);
@@ -342,6 +363,7 @@ _os_trace_write_location_for_address(uint8_t buf[static sizeof(uint64_t)],
                return sizeof(uintptr_t);
 #endif
        }
+#endif /* !FIREHOSE_USES_SHARED_CACHE */
 }
 
 
@@ -616,3 +638,260 @@ __firehose_critical_region_leave(void) {
         return;
 }
 
+#ifdef CONFIG_XNUPOST
+
+#include <tests/xnupost.h>
+#define TESTOSLOGFMT(fn_name) "%u^%llu/%llu^kernel^0^test^" fn_name
+#define TESTOSLOGPFX "TESTLOG:%u#"
+#define TESTOSLOG(fn_name) TESTOSLOGPFX TESTOSLOGFMT(fn_name "#")
+
+extern u_int32_t RandomULong(void);
+extern uint32_t find_pattern_in_buffer(char * pattern, uint32_t len, int expected_count);
+void test_oslog_default_helper(uint32_t uniqid, uint64_t count);
+void test_oslog_info_helper(uint32_t uniqid, uint64_t count);
+void test_oslog_debug_helper(uint32_t uniqid, uint64_t count);
+void test_oslog_error_helper(uint32_t uniqid, uint64_t count);
+void test_oslog_fault_helper(uint32_t uniqid, uint64_t count);
+void _test_log_loop(void * arg __unused, wait_result_t wres __unused);
+void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t len);
+kern_return_t test_stresslog_dropmsg(uint32_t uniqid);
+
+kern_return_t test_os_log(void);
+kern_return_t test_os_log_parallel(void);
+
+#define GENOSLOGHELPER(fname, ident, callout_f)                                                            \
+    void fname(uint32_t uniqid, uint64_t count)                                                            \
+    {                                                                                                      \
+        int32_t datalen = 0;                                                                               \
+        uint32_t checksum = 0;                                                                             \
+        char databuffer[256];                                                                              \
+        T_LOG("Doing os_log of %llu TESTLOG msgs for fn " ident, count);                                   \
+        for (uint64_t i = 0; i < count; i++)                                                               \
+        {                                                                                                  \
+            datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT(ident), uniqid, i + 1, count); \
+            checksum = crc32(0, databuffer, datalen);                                                      \
+            callout_f(OS_LOG_DEFAULT, TESTOSLOG(ident), checksum, uniqid, i + 1, count);                   \
+            /*T_LOG(TESTOSLOG(ident), checksum, uniqid, i + 1, count);*/                                   \
+        }                                                                                                  \
+    }
+
+GENOSLOGHELPER(test_oslog_info_helper, "oslog_info_helper", os_log_info);
+GENOSLOGHELPER(test_oslog_fault_helper, "oslog_fault_helper", os_log_fault);
+GENOSLOGHELPER(test_oslog_debug_helper, "oslog_debug_helper", os_log_debug);
+GENOSLOGHELPER(test_oslog_error_helper, "oslog_error_helper", os_log_error);
+GENOSLOGHELPER(test_oslog_default_helper, "oslog_default_helper", os_log);
+
+kern_return_t test_os_log()
+{
+    char databuffer[256];
+    uint32_t uniqid = RandomULong();
+    uint32_t match_count = 0;
+    uint32_t checksum = 0;
+    uint32_t total_msg = 0;
+    uint32_t saved_msg = 0;
+    uint32_t dropped_msg = 0;
+    int datalen = 0;
+    uint64_t a = mach_absolute_time();
+    uint64_t seqno = 1;
+    uint64_t total_seqno = 2;
+
+    os_log_t log_handle = os_log_create("com.apple.xnu.test.t1", "kpost");
+
+    T_ASSERT_EQ_PTR(&_os_log_default, log_handle, "os_log_create returns valid value.");
+    T_ASSERT_EQ_INT(TRUE, os_log_info_enabled(log_handle), "os_log_info is enabled");
+    T_ASSERT_EQ_INT(TRUE, os_log_debug_enabled(log_handle), "os_log_debug is enabled");
+    T_ASSERT_EQ_PTR(&_os_log_default, OS_LOG_DEFAULT, "ensure OS_LOG_DEFAULT is _os_log_default");
+
+    total_msg = oslog_p_total_msgcount;
+    saved_msg = oslog_p_saved_msgcount;
+    dropped_msg = oslog_p_dropped_msgcount;
+    T_LOG("oslog internal counters total %u , saved %u, dropped %u", total_msg, saved_msg, dropped_msg);
+
+    T_LOG("Validating with uniqid %u u64 %llu", uniqid, a);
+    T_ASSERT_NE_UINT(0, uniqid, "random number should not be zero");
+    T_ASSERT_NE_ULLONG(0, a, "absolute time should not be zero");
+
+    datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("printf_only"), uniqid, seqno, total_seqno);
+    checksum = crc32(0, databuffer, datalen);
+    printf(TESTOSLOG("printf_only") "mat%llu\n", checksum, uniqid, seqno, total_seqno, a);
+
+    seqno += 1;
+    datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("printf_only"), uniqid, seqno, total_seqno);
+    checksum = crc32(0, databuffer, datalen);
+    printf(TESTOSLOG("printf_only") "mat%llu\n", checksum, uniqid, seqno, total_seqno, a);
+
+    datalen = snprintf(databuffer, sizeof(databuffer), "kernel^0^test^printf_only#mat%llu", a);
+    match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno);
+    T_EXPECT_EQ_UINT(match_count, 2, "verify printf_only goes to systemlog buffer");
+
+    uint32_t logging_config = atm_get_diagnostic_config();
+    T_LOG("checking atm_diagnostic_config 0x%X", logging_config);
+
+    if ((logging_config & ATM_TRACE_OFF) || (logging_config & ATM_TRACE_DISABLE))
+    {
+        T_LOG("ATM_TRACE_OFF / ATM_TRACE_DISABLE is set. Would not see oslog messages. Skipping the rest of the test.");
+        return KERN_SUCCESS;
+    }
+
+    /* with logging enabled, printfs should be saved in oslog as well */
+    T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 2, "at least 2 msgs should be seen by oslog system");
+
+    a = mach_absolute_time();
+    total_seqno = 1;
+    seqno = 1;
+    total_msg = oslog_p_total_msgcount;
+    saved_msg = oslog_p_saved_msgcount;
+    dropped_msg = oslog_p_dropped_msgcount;
+    datalen = snprintf(databuffer, sizeof(databuffer), TESTOSLOGFMT("oslog_info"), uniqid, seqno, total_seqno);
+    checksum = crc32(0, databuffer, datalen);
+    os_log_info(log_handle, TESTOSLOG("oslog_info") "mat%llu", checksum, uniqid, seqno, total_seqno, a);
+    T_EXPECT_GE_UINT((oslog_p_total_msgcount - total_msg), 1, "total message count in buffer");
+
+    datalen = snprintf(databuffer, sizeof(databuffer), "kernel^0^test^oslog_info#mat%llu", a);
+    match_count = find_pattern_in_buffer(databuffer, datalen, total_seqno);
+    T_EXPECT_EQ_UINT(match_count, 1, "verify oslog_info does not go to systemlog buffer");
+
+    total_msg = oslog_p_total_msgcount;
+    test_oslog_info_helper(uniqid, 10);
+    T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_info_helper: Should have seen 10 msgs");
+
+    total_msg = oslog_p_total_msgcount;
+    test_oslog_debug_helper(uniqid, 10);
+    T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_debug_helper: Should have seen 10 msgs");
+
+    total_msg = oslog_p_total_msgcount;
+    test_oslog_error_helper(uniqid, 10);
+    T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_error_helper: Should have seen 10 msgs");
+
+    total_msg = oslog_p_total_msgcount;
+    test_oslog_default_helper(uniqid, 10);
+    T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_default_helper: Should have seen 10 msgs");
+
+    total_msg = oslog_p_total_msgcount;
+    test_oslog_fault_helper(uniqid, 10);
+    T_EXPECT_GE_UINT(oslog_p_total_msgcount - total_msg, 10, "test_oslog_fault_helper: Should have seen 10 msgs");
+
+    T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
+          oslog_p_dropped_msgcount);
+
+    return KERN_SUCCESS;
+}
+
+static uint32_t _test_log_loop_count = 0;
+void _test_log_loop(void * arg __unused, wait_result_t wres __unused)
+{
+    uint32_t uniqid = RandomULong();
+    test_oslog_debug_helper(uniqid, 100);
+    (void)hw_atomic_add(&_test_log_loop_count, 100);
+}
+
+kern_return_t test_os_log_parallel(void)
+{
+    thread_t thread[2];
+    kern_return_t kr;
+    uint32_t uniqid = RandomULong();
+
+    printf("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
+           oslog_p_dropped_msgcount);
+
+    kr = kernel_thread_start(_test_log_loop, NULL, &thread[0]);
+    T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully");
+
+    kr = kernel_thread_start(_test_log_loop, NULL, &thread[1]);
+    T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully");
+
+    test_oslog_info_helper(uniqid, 100);
+
+    /* wait until other thread has also finished */
+    while (_test_log_loop_count < 200)
+    {
+        delay(1000);
+    }
+
+    thread_deallocate(thread[0]);
+    thread_deallocate(thread[1]);
+
+    T_LOG("oslog internal counters total %u , saved %u, dropped %u", oslog_p_total_msgcount, oslog_p_saved_msgcount,
+          oslog_p_dropped_msgcount);
+    T_PASS("parallel_logging test is now complete");
+
+    return KERN_SUCCESS;
+}
+
+void test_oslog_handleOSLogCtl(int32_t * in, int32_t * out, int32_t len)
+{
+    if (!in || !out || len != 4)
+        return;
+    switch (in[0]) {
+           case 1:
+           {
+               /* send out counters */
+               out[1] = oslog_p_total_msgcount;
+               out[2] = oslog_p_saved_msgcount;
+               out[3] = oslog_p_dropped_msgcount;
+               out[0] = KERN_SUCCESS;
+               break;
+           }
+           case 2:
+           {
+               /* mini stress run */
+               out[0] = test_os_log_parallel();
+               break;
+           }
+           case 3:
+           {
+               /* drop msg tests */
+                       out[1] = RandomULong();
+               out[0] = test_stresslog_dropmsg(out[1]);
+               break;
+           }
+           case 4:
+           {
+               /* invoke log helpers */
+               uint32_t uniqid = in[3];
+               int32_t msgcount = in[2];
+               if (uniqid == 0 || msgcount == 0)
+               {
+                   out[0] = KERN_INVALID_VALUE;
+                   return;
+               }
+
+               switch (in[1]) {
+                       case OS_LOG_TYPE_INFO: test_oslog_info_helper(uniqid, msgcount); break;
+                       case OS_LOG_TYPE_DEBUG: test_oslog_debug_helper(uniqid, msgcount); break;
+                       case OS_LOG_TYPE_ERROR: test_oslog_error_helper(uniqid, msgcount); break;
+                       case OS_LOG_TYPE_FAULT: test_oslog_fault_helper(uniqid, msgcount); break;
+                       case OS_LOG_TYPE_DEFAULT:
+                       default: test_oslog_default_helper(uniqid, msgcount); break;
+               }
+               out[0] = KERN_SUCCESS;
+               break;
+               /* end of case 4 */
+           }
+           default:
+           {
+               out[0] = KERN_INVALID_VALUE;
+               break;
+           }
+    }
+    return;
+}
+
+kern_return_t test_stresslog_dropmsg(uint32_t uniqid)
+{
+    uint32_t total, saved, dropped;
+    total = oslog_p_total_msgcount;
+    saved = oslog_p_saved_msgcount;
+    dropped = oslog_p_dropped_msgcount;
+    uniqid = RandomULong();
+    test_oslog_debug_helper(uniqid, 100);
+    while ((oslog_p_dropped_msgcount - dropped) == 0)
+    {
+        test_oslog_debug_helper(uniqid, 100);
+    }
+    printf("test_stresslog_dropmsg: logged %u msgs, saved %u and caused a drop of %u msgs. \n", oslog_p_total_msgcount - total,
+           oslog_p_saved_msgcount - saved, oslog_p_dropped_msgcount - dropped);
+    return KERN_SUCCESS;
+}
+
+#endif
index 4f8afae5c394ac96cdbba32d905998ab8aee1b9b..d214bab2100ed22ffa1e84b7406154056e7013f9 100644 (file)
 #include "log_encode_types.h"
 #include <sys/param.h>
 
+#if __has_feature(ptrauth_calls)
+#include <mach/vm_param.h>
+#include <ptrauth.h>
+#endif /* __has_feature(ptrauth_calls) */
+
 #ifdef KERNEL
 #define isdigit(ch) (((ch) >= '0') && ((ch) <= '9'))
 extern boolean_t doprnt_hide_pointers;
@@ -156,13 +161,21 @@ _os_log_encode_arg(void *arg, uint16_t arg_len, os_log_value_type_t ctype, bool
         unsigned long long value = 0;
         memcpy(&value, arg, arg_len);
 
+#if __has_feature(ptrauth_calls)
+                       /**
+                        * Strip out the pointer authentication code before
+                        * checking whether the pointer is a kernel address.
+                        */
+                       value = (unsigned long long)VM_KERNEL_STRIP_PTR(value);
+#endif /* __has_feature(ptrauth_calls) */
+
         if (value >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && value <= VM_MAX_KERNEL_ADDRESS) {
             is_private = true;
             bzero(arg, arg_len);
         }
     }
 #endif
-    
+
     content->type = ctype;
     content->flags = (is_private ? OS_LOG_CONTENT_FLAG_PRIVATE : 0);
     
index 477bceeed45f35f3da39484f121ad472ddec1d97..56a68f1f11a80775fcd2c693d4a2a45577141dd9 100644 (file)
@@ -37,6 +37,7 @@
 OS_ENUM(os_reason_libsystem_code, uint64_t,
        OS_REASON_LIBSYSTEM_CODE_WORKLOOP_OWNERSHIP_LEAK = 1,
        OS_REASON_LIBSYSTEM_CODE_FAULT = 2, /* generated by os_log_fault */
+       OS_REASON_LIBSYSTEM_CODE_SECINIT_INITIALIZER = 3,
 );
 
 #ifndef KERNEL
diff --git a/libkern/os/refcnt.c b/libkern/os/refcnt.c
new file mode 100644 (file)
index 0000000..5396598
--- /dev/null
@@ -0,0 +1,298 @@
+#include <kern/assert.h>
+#include <kern/debug.h>
+#include <pexpert/pexpert.h>
+#include <kern/btlog.h>
+#include <kern/backtrace.h>
+#include <libkern/libkern.h>
+#include "refcnt.h"
+
+#define OS_REFCNT_MAX_COUNT     ((os_ref_count_t)0x0FFFFFFFUL)
+
+#if OS_REFCNT_DEBUG
+os_refgrp_decl(static, global_ref_group, "all", NULL);
+static bool ref_debug_enable = false;
+static const size_t ref_log_nrecords = 1000000;
+
+#define REFLOG_BTDEPTH   10
+#define REFLOG_RETAIN    1
+#define REFLOG_RELEASE   2
+
+#define __debug_only
+#else
+# define __debug_only __unused
+#endif /* OS_REFCNT_DEBUG */
+
+static const char *
+ref_grp_name(struct os_refcnt __debug_only *rc)
+{
+#if OS_REFCNT_DEBUG
+       if (rc && rc->ref_group && rc->ref_group->grp_name) {
+               return rc->ref_group->grp_name;
+       }
+#endif
+       return "<null>";
+}
+
+static void
+os_ref_check_underflow(struct os_refcnt *rc, os_ref_count_t count)
+{
+       if (__improbable(count == 0)) {
+               panic("os_refcnt: underflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
+               __builtin_unreachable();
+       }
+}
+
+static void
+os_ref_assert_referenced(struct os_refcnt *rc, os_ref_count_t count)
+{
+       if (__improbable(count == 0)) {
+               panic("os_refcnt: used unsafely when zero (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
+               __builtin_unreachable();
+       }
+}
+
+static void
+os_ref_check_overflow(struct os_refcnt *rc, os_ref_count_t count)
+{
+       if (__improbable(count >= OS_REFCNT_MAX_COUNT)) {
+               panic("os_refcnt: overflow (rc=%p, grp=%s)\n", rc, ref_grp_name(rc));
+               __builtin_unreachable();
+       }
+}
+
+static void
+os_ref_check_retain(struct os_refcnt *rc, os_ref_count_t count)
+{
+       os_ref_assert_referenced(rc, count);
+       os_ref_check_overflow(rc, count);
+}
+
+#if OS_REFCNT_DEBUG
+static void
+ref_log_op(struct os_refgrp *grp, void *elem, int op)
+{
+       if (!ref_debug_enable || grp == NULL) {
+               return;
+       }
+
+       if (grp->grp_log == NULL) {
+               ref_log_op(grp->grp_parent, elem, op);
+               return;
+       }
+
+       uintptr_t bt[REFLOG_BTDEPTH];
+       uint32_t nframes = backtrace(bt, REFLOG_BTDEPTH);
+       btlog_add_entry((btlog_t *)grp->grp_log, elem, op, (void **)bt, nframes);
+}
+
+static void
+ref_log_drop(struct os_refgrp *grp, void *elem)
+{
+       if (!ref_debug_enable || grp == NULL) {
+               return;
+       }
+
+       if (grp->grp_log == NULL) {
+               ref_log_drop(grp->grp_parent, elem);
+               return;
+       }
+
+       btlog_remove_entries_for_element(grp->grp_log, elem);
+}
+
+static void
+ref_log_init(struct os_refgrp *grp)
+{
+       if (grp->grp_log != NULL) {
+               return;
+       }
+
+       char grpbuf[128];
+       char *refgrp = grpbuf;
+       if (!PE_parse_boot_argn("rlog", refgrp, sizeof(grpbuf))) {
+               return;
+       }
+
+       const char *g;
+       while ((g = strsep(&refgrp, ",")) != NULL) {
+               if (strcmp(g, grp->grp_name) == 0) {
+                       /* enable logging on this refgrp */
+                       grp->grp_log = btlog_create(ref_log_nrecords, REFLOG_BTDEPTH, true);
+                       assert(grp->grp_log);
+                       ref_debug_enable = true;
+                       return;
+               }
+       }
+
+}
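+
+/*
+ * Example (illustrative): booting with rlog=all enables refcount logging for
+ * every group, since "all" is the name of the global group and child groups
+ * fall back to their parent's log; rlog=grp1,grp2 enables logging only for
+ * the named groups.
+ */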
+
+/*
+ * attach a new refcnt to a group
+ */
+static void
+ref_attach_to_group(struct os_refcnt *rc, struct os_refgrp *grp, os_ref_count_t init_count)
+{
+       if (grp == NULL) {
+               return;
+       }
+
+       if (atomic_fetch_add_explicit(&grp->grp_children, 1, memory_order_relaxed) == 0) {
+               /* First reference count object in this group. Check if we should enable
+                * refcount logging. */
+               ref_log_init(grp);
+       }
+
+       atomic_fetch_add_explicit(&grp->grp_count, init_count, memory_order_relaxed);
+       atomic_fetch_add_explicit(&grp->grp_retain_total, init_count, memory_order_relaxed);
+
+       if (grp == &global_ref_group) {
+               return;
+       }
+
+       if (grp->grp_parent == NULL) {
+               grp->grp_parent = &global_ref_group;
+       }
+
+       ref_attach_to_group(rc, grp->grp_parent, init_count);
+}
+
+static inline void
+ref_retain_group(struct os_refgrp *grp)
+{
+       if (grp) {
+               atomic_fetch_add_explicit(&grp->grp_count, 1, memory_order_relaxed);
+               atomic_fetch_add_explicit(&grp->grp_retain_total, 1, memory_order_relaxed);
+               ref_retain_group(grp->grp_parent);
+       }
+}
+
+static inline void
+ref_release_group(struct os_refgrp *grp, bool final)
+{
+       if (grp) {
+               atomic_fetch_sub_explicit(&grp->grp_count, 1, memory_order_relaxed);
+               atomic_fetch_add_explicit(&grp->grp_release_total, 1, memory_order_relaxed);
+               if (final) {
+                       atomic_fetch_sub_explicit(&grp->grp_children, 1, memory_order_relaxed);
+               }
+
+               ref_release_group(grp->grp_parent, final);
+       }
+}
+#endif
+
+#undef os_ref_init_count
+void
+os_ref_init_count(struct os_refcnt *rc, struct os_refgrp __debug_only *grp, os_ref_count_t count)
+{
+       atomic_init(&rc->ref_count, count);
+
+#if OS_REFCNT_DEBUG
+       assert(count > 0);
+       if (grp) {
+               rc->ref_group = grp;
+       } else {
+               rc->ref_group = &global_ref_group;
+       }
+
+       ref_attach_to_group(rc, rc->ref_group, count);
+
+       for (os_ref_count_t i = 0; i < count; i++) {
+               ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+       }
+#endif
+}
+
+void
+os_ref_retain(struct os_refcnt *rc)
+{
+       os_ref_count_t old = atomic_fetch_add_explicit(&rc->ref_count, 1, memory_order_relaxed);
+       os_ref_check_retain(rc, old);
+
+#if OS_REFCNT_DEBUG
+       ref_retain_group(rc->ref_group);
+       ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+#endif
+}
+
+bool
+os_ref_retain_try(struct os_refcnt *rc)
+{
+       os_ref_count_t cur = os_ref_get_count(rc);
+
+       while (1) {
+               if (__improbable(cur == 0)) {
+                       return false;
+               }
+
+               os_ref_check_retain(rc, cur);
+
+               if (atomic_compare_exchange_weak_explicit(&rc->ref_count, &cur, cur+1,
+                                       memory_order_relaxed, memory_order_relaxed)) {
+#if OS_REFCNT_DEBUG
+                       ref_retain_group(rc->ref_group);
+                       ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+#endif
+                       return true;
+               }
+       }
+}
+
+os_ref_count_t
+os_ref_release_explicit(struct os_refcnt *rc, memory_order release_order, memory_order dealloc_order)
+{
+#if OS_REFCNT_DEBUG
+       /*
+        * Care not to use 'rc' after the decrement because it might be deallocated
+        * under us.
+        */
+       struct os_refgrp *grp = rc->ref_group;
+       ref_log_op(grp, (void *)rc, REFLOG_RELEASE);
+#endif
+
+       os_ref_count_t val = atomic_fetch_sub_explicit(&rc->ref_count, 1, release_order);
+       os_ref_check_underflow(rc, val);
+       if (__improbable(--val == 0)) {
+               atomic_load_explicit(&rc->ref_count, dealloc_order);
+#if OS_REFCNT_DEBUG
+               ref_log_drop(grp, (void *)rc); /* rc is only used as an identifier */
+#endif
+       }
+
+#if OS_REFCNT_DEBUG
+       ref_release_group(grp, !val);
+#endif
+
+       return val;
+}
+
+void
+os_ref_retain_locked(struct os_refcnt *rc)
+{
+       os_ref_count_t val = rc->ref_count;
+       os_ref_check_retain(rc, val);
+       rc->ref_count = ++val;
+
+#if OS_REFCNT_DEBUG
+       ref_retain_group(rc->ref_group);
+       ref_log_op(rc->ref_group, (void *)rc, REFLOG_RETAIN);
+#endif
+}
+
+os_ref_count_t
+os_ref_release_locked(struct os_refcnt *rc)
+{
+       os_ref_count_t val = rc->ref_count;
+       os_ref_check_underflow(rc, val);
+       rc->ref_count = --val;
+
+#if OS_REFCNT_DEBUG
+       ref_release_group(rc->ref_group, !val);
+       ref_log_op(rc->ref_group, (void *)rc, REFLOG_RELEASE);
+       if (val == 0) {
+               ref_log_drop(rc->ref_group, (void *)rc);
+       }
+#endif
+       return val;
+}
+
diff --git a/libkern/os/refcnt.h b/libkern/os/refcnt.h
new file mode 100644 (file)
index 0000000..6148059
--- /dev/null
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _OS_REFCNT_H_
+#define _OS_REFCNT_H_
+
+/*
+ * os_refcnt reference counting API
+ *
+ * Two flavors are provided: atomic and locked. Atomic internally uses C11 atomic
+ * operations and requires no external synchronization, whereas the locked flavor
+ * assumes the refcnt object is locked by the caller. It is NOT safe to
+ * mix-and-match locked and atomic calls.
+ */
+
+#include <stdatomic.h>
+#include <stdbool.h>
+#include <os/base.h>
+
+struct os_refcnt;
+struct os_refgrp;
+typedef struct os_refcnt os_refcnt_t;
+
+/* type of the internal counter */
+typedef uint32_t os_ref_count_t;
+
+#if DEVELOPMENT || DEBUG
+# define OS_REFCNT_DEBUG 1
+#else
+# define OS_REFCNT_DEBUG 0
+#endif
+
+/*
+ * Debugging is keyed off ref_group, so leave that field for kexts so that the
+ * combination of dev/debug kernel and release kext works.
+ */
+#if XNU_KERNEL_PRIVATE
+# define OS_REFCNT_HAS_GROUP OS_REFCNT_DEBUG
+#else
+# define OS_REFCNT_HAS_GROUP 1
+#endif
+
+struct os_refcnt {
+       _Atomic os_ref_count_t ref_count;
+#if OS_REFCNT_HAS_GROUP
+       struct os_refgrp *ref_group;
+#endif
+};
+
+#if OS_REFCNT_DEBUG
+struct os_refgrp {
+       const char *const grp_name;
+       _Atomic os_ref_count_t grp_children; /* number of refcount objects in group */
+       _Atomic os_ref_count_t grp_count;    /* current reference count of group */
+       _Atomic uint64_t grp_retain_total;
+       _Atomic uint64_t grp_release_total;
+       struct os_refgrp *grp_parent;
+       void *grp_log;                       /* refcount logging context */
+};
+#endif
+
+#if __has_attribute(diagnose_if)
+# define os_error_if(cond, msg) __attribute__((diagnose_if((cond), (msg), "error")))
+#else
+# define os_error_if(...)
+#endif
+
+__BEGIN_DECLS
+
+/*
+ * os_ref_init: initialize an os_refcnt with a count of 1
+ * os_ref_init_count: initialize an os_refcnt with a specific count >= 1
+ */
+#define os_ref_init(rc, grp) os_ref_init_count((rc), (grp), 1)
+void os_ref_init_count(struct os_refcnt *, struct os_refgrp *, os_ref_count_t count)
+       os_error_if(count == 0, "Reference count must be non-zero initialized");
+
+#if OS_REFCNT_DEBUG
+# define os_refgrp_decl(qual, var, name, parent) \
+       qual struct os_refgrp __attribute__((section("__DATA,__refgrps"))) var = { \
+               .grp_name =          (name), \
+               .grp_children =      ATOMIC_VAR_INIT(0), \
+               .grp_count =         ATOMIC_VAR_INIT(0), \
+               .grp_retain_total =  ATOMIC_VAR_INIT(0), \
+               .grp_release_total = ATOMIC_VAR_INIT(0), \
+               .grp_parent =        (parent), \
+               .grp_log =           NULL, \
+       }
+
+/* Create a default group based on the init() callsite if no explicit group
+ * is provided. */
+# define os_ref_init_count(rc, grp, count) ({ \
+               os_refgrp_decl(static, __grp, __func__, NULL); \
+               (os_ref_init_count)((rc), (grp) ? (grp) : &__grp, (count)); \
+       })
+#else
+# define os_refgrp_decl(...)
+# define os_ref_init_count(rc, grp, count) (os_ref_init_count)((rc), NULL, (count))
+#endif /* OS_REFCNT_DEBUG */
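
[editor's note] A minimal sketch of how a client would declare a group and seed a counter with these macros; struct my_obj and my_obj_setup are hypothetical, only the os_refgrp_decl/os_ref_init calls come from this header:

    struct my_obj {
            struct os_refcnt ref;
            /* ... payload ... */
    };

    /* one group shared by every my_obj; ignored (and compiled away) when
     * OS_REFCNT_DEBUG is 0 */
    os_refgrp_decl(static, my_obj_refgrp, "my_obj", NULL);

    static void
    my_obj_setup(struct my_obj *o)
    {
            /* the object starts with a single reference owned by the caller */
            os_ref_init(&o->ref, &my_obj_refgrp);
    }
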
+
+/*
+ * os_ref_retain: acquire a reference (increment reference count by 1) atomically.
+ *
+ * os_ref_release: release a reference (decrement reference count) atomically and
+ *             return the new count. Memory is synchronized such that the dealloc block
+ *             (i.e. code handling the final release() == 0 call) sees up-to-date memory
+ *             with respect to all prior release()s on the same refcnt object. This
+ *             memory ordering is sufficient for most use cases.
+ *
+ * os_ref_release_relaxed: same as release() but with weaker relaxed memory ordering.
+ *             This can be used when the dealloc block is already synchronized with other
+ *             accesses to the object (for example, with a lock).
+ *
+ * os_ref_release_live: release a reference that is guaranteed not to be the last one.
+ */
+void os_ref_retain(struct os_refcnt *);
+
+os_ref_count_t os_ref_release_explicit(struct os_refcnt *rc,
+               memory_order release_order, memory_order dealloc_order) OS_WARN_RESULT;
+
+static inline os_ref_count_t OS_WARN_RESULT
+os_ref_release(struct os_refcnt *rc)
+{
+       return os_ref_release_explicit(rc, memory_order_release, memory_order_acquire);
+}
+
+static inline os_ref_count_t OS_WARN_RESULT
+os_ref_release_relaxed(struct os_refcnt *rc)
+{
+       return os_ref_release_explicit(rc, memory_order_relaxed, memory_order_relaxed);
+}
+
+static inline void
+os_ref_release_live(struct os_refcnt *rc)
+{
+       if (__improbable(os_ref_release_explicit(rc,
+                       memory_order_release, memory_order_relaxed) == 0)) {
+               panic("os_refcnt: unexpected release of final reference (rc=%p)\n", rc);
+               __builtin_unreachable();
+       }
+}
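
[editor's note] A hedged illustration of the atomic flavor described above, reusing the hypothetical my_obj from the earlier sketch; my_obj_free is a placeholder for whatever deallocates the object:

    static void
    my_obj_retain(struct my_obj *o)
    {
            os_ref_retain(&o->ref);
    }

    static void
    my_obj_release(struct my_obj *o)
    {
            /* os_ref_release() returns the new count; 0 means this was the
             * final reference and the caller must free the object */
            if (os_ref_release(&o->ref) == 0) {
                    my_obj_free(o);   /* hypothetical deallocation routine */
            }
    }

Callers that know they are not dropping the last reference (for example, while another reference is pinned by the same code path) would use os_ref_release_live() instead and let it panic on an unexpected final release.
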
+
+
+/*
+ * os_ref_retain_try: a variant of atomic retain that fails for objects with a
+ *             zero reference count. The caller must therefore ensure that the object
+ *             remains alive for any possible retain_try() caller, usually by using a
+ *             lock protecting both the retain and dealloc paths. This variant is useful
+ *             for objects stored in a collection, because no lock is required on the
+ *             release() side until the object is deallocated.
+ */
+bool os_ref_retain_try(struct os_refcnt *) OS_WARN_RESULT;
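
[editor's note] A sketch of the collection-lookup pattern this comment describes, assuming a lock that covers both the lookup/retain side and the removal/dealloc side; table_lock and table_find are hypothetical, only os_ref_retain_try comes from this header:

    static struct my_obj *
    my_obj_lookup(uint64_t key)
    {
            struct my_obj *o;

            lck_mtx_lock(&table_lock);
            o = table_find(key);
            if (o != NULL && !os_ref_retain_try(&o->ref)) {
                    /* found, but its count already hit zero: it is being
                     * torn down, so treat it as absent */
                    o = NULL;
            }
            lck_mtx_unlock(&table_lock);
            return o;
    }
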
+
+
+/*
+ * os_ref_retain_locked: acquire a reference on an object protected by a held
+ *             lock. The caller must ensure mutual exclusivity of retain_locked() and
+ *             release_locked() calls on the same object.
+ *
+ * os_ref_release_locked: release a reference on an object protected by a held
+ *             lock.
+ */
+void os_ref_retain_locked(struct os_refcnt *);
+os_ref_count_t os_ref_release_locked(struct os_refcnt *) OS_WARN_RESULT;
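
[editor's note] The locked flavor follows the same shape as the atomic one, only under the caller's lock; a brief sketch with the same hypothetical my_obj/my_obj_free names:

    static void
    my_obj_release_with_lock_held(struct my_obj *o)
    {
            /* caller holds the lock that serializes every retain/release
             * on 'o', per the contract above */
            if (os_ref_release_locked(&o->ref) == 0) {
                    /* last reference: tear down (in practice, after the
                     * lock protecting the object is dropped) */
                    my_obj_free(o);
            }
    }
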
+
+
+/*
+ * os_ref_get_count: return the current reference count. This is unsafe for
+ *             synchronization.
+ */
+static inline os_ref_count_t
+os_ref_get_count(struct os_refcnt *rc)
+{
+       return atomic_load_explicit(&rc->ref_count, memory_order_relaxed);
+}
+
+__END_DECLS
+
+#endif
index 1ee1c984e5877cbc09f21762d5b33835350520ad..dc40b5fa0acc7b9917fa9b2ddca069250f2ab907 100644 (file)
@@ -26,6 +26,7 @@
 
 #include <os/log.h>
 #include <uuid/uuid.h>
+#include <kern/assert.h>
 #include <firehose/firehose_types_private.h>
 
 __BEGIN_DECLS
@@ -34,6 +35,7 @@ OS_ALWAYS_INLINE
 inline uint32_t
 _os_trace_offset(const void *dso, const void *addr, _firehose_tracepoint_flags_activity_t flags __unused)
 {
+    assert((uintptr_t)addr >= (uintptr_t)dso);
     return (uint32_t) ((uintptr_t)addr - (uintptr_t)dso);
 }
 
index ce69ad76640cc10da9b0343b891f3b5e89ec12c1..eec3f494369c8d62978bec8e36a246f6ec4e365a 100644 (file)
@@ -82,13 +82,25 @@ uuid_copy(uuid_t dst, const uuid_t src)
        memcpy(dst, src, sizeof(uuid_t));
 }
 
+static void
+uuid_random_setflags(uuid_t out)
+{
+       out[6] = (out[6] & 0x0F) | 0x40;
+       out[8] = (out[8] & 0x3F) | 0x80;
+}
+
 void
 uuid_generate_random(uuid_t out)
 {
        read_random(out, sizeof(uuid_t));
+       uuid_random_setflags(out);
+}
 
-       out[6] = (out[6] & 0x0F) | 0x40;
-       out[8] = (out[8] & 0x3F) | 0x80;
+void
+uuid_generate_early_random(uuid_t out)
+{
+       read_frandom(out, sizeof(uuid_t));
+       uuid_random_setflags(out);
 }
 
 void
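
[editor's note] The two masked assignments in uuid_random_setflags() stamp the standard fields of a random UUID: byte 6 keeps its low nibble and gets 0x4 in the high nibble (version 4, randomly generated), and byte 8 keeps its low six bits with the top two forced to 10 (the RFC 4122 variant). A small, illustrative check of those invariants, not part of the patch (u is a uuid_t filled by uuid_generate_random; assert comes from <kern/assert.h>):

    assert((u[6] >> 4) == 0x4);       /* version 4 (random)  */
    assert((u[8] & 0xC0) == 0x80);    /* RFC 4122 variant    */
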
diff --git a/libkern/zlib/crc32.c b/libkern/zlib/crc32.c
deleted file mode 100644 (file)
index ac0acac..0000000
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/* crc32.c -- compute the CRC-32 of a data stream
- * Copyright (C) 1995-2005 Mark Adler
- * For conditions of distribution and use, see copyright notice in zlib.h
- *
- * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
- * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
- * tables for updating the shift register in one step with three exclusive-ors
- * instead of four steps with four exclusive-ors.  This results in about a
- * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
- */
-
-/* @(#) $Id$ */
-
-/*
-  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
-  protection on the static variables used to control the first-use generation
-  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
-  first call get_crc_table() to initialize the tables before allowing more than
-  one thread to use crc32().
- */
-
-
-#ifdef MAKECRCH
-#  include <stdio.h>
-#  ifndef DYNAMIC_CRC_TABLE
-#    define DYNAMIC_CRC_TABLE
-#  endif /* !DYNAMIC_CRC_TABLE */
-#endif /* MAKECRCH */
-
-#include "zutil.h"      /* for STDC and FAR definitions */
-
-#define local static
-
-/* Find a four-byte integer type for crc32_little() and crc32_big(). */
-#ifndef NOBYFOUR
-#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
-#    include <machine/limits.h>
-#    define BYFOUR
-#    if (UINT_MAX == 0xffffffffUL)
-       typedef unsigned int u4;
-#    else
-#      if (ULONG_MAX == 0xffffffffUL)
-         typedef unsigned long u4;
-#      else
-#        if (USHRT_MAX == 0xffffffffUL)
-           typedef unsigned short u4;
-#        else
-#          undef BYFOUR     /* can't find a four-byte integer type! */
-#        endif
-#      endif
-#    endif
-#  endif /* STDC */
-#endif /* !NOBYFOUR */
-
-/* Definitions for doing the crc four data bytes at a time. */
-#ifdef BYFOUR
-#  define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
-                (((w)&0xff00)<<8)+(((w)&0xff)<<24))
-   local unsigned long crc32_little OF((unsigned long,
-                        const unsigned char FAR *, unsigned));
-   local unsigned long crc32_big OF((unsigned long,
-                        const unsigned char FAR *, unsigned));
-#  define TBLS 8
-#else
-#  define TBLS 1
-#endif /* BYFOUR */
-
-/* Local functions for crc concatenation */
-local unsigned long gf2_matrix_times OF((unsigned long *mat,
-                                         unsigned long vec));
-local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
-
-#ifdef DYNAMIC_CRC_TABLE
-
-local volatile int crc_table_empty = 1;
-local unsigned long FAR crc_table[TBLS][256];
-local void make_crc_table OF((void));
-#ifdef MAKECRCH
-   local void write_table OF((FILE *, const unsigned long FAR *));
-#endif /* MAKECRCH */
-/*
-  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
-  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
-
-  Polynomials over GF(2) are represented in binary, one bit per coefficient,
-  with the lowest powers in the most significant bit.  Then adding polynomials
-  is just exclusive-or, and multiplying a polynomial by x is a right shift by
-  one.  If we call the above polynomial p, and represent a byte as the
-  polynomial q, also with the lowest power in the most significant bit (so the
-  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
-  where a mod b means the remainder after dividing a by b.
-
-  This calculation is done using the shift-register method of multiplying and
-  taking the remainder.  The register is initialized to zero, and for each
-  incoming bit, x^32 is added mod p to the register if the bit is a one (where
-  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
-  x (which is shifting right by one and adding x^32 mod p if the bit shifted
-  out is a one).  We start with the highest power (least significant bit) of
-  q and repeat for all eight bits of q.
-
-  The first table is simply the CRC of all possible eight bit values.  This is
-  all the information needed to generate CRCs on data a byte at a time for all
-  combinations of CRC register values and incoming bytes.  The remaining tables
-  allow for word-at-a-time CRC calculation for both big-endian and little-
-  endian machines, where a word is four bytes.
-*/
-local void
-make_crc_table(void)
-{
-    unsigned long c;
-    int n, k;
-    unsigned long poly;                 /* polynomial exclusive-or pattern */
-    /* terms of polynomial defining this crc (except x^32): */
-    static volatile int first = 1;      /* flag to limit concurrent making */
-    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
-
-    /* See if another task is already doing this (not thread-safe, but better
-       than nothing -- significantly reduces duration of vulnerability in
-       case the advice about DYNAMIC_CRC_TABLE is ignored) */
-    if (first) {
-        first = 0;
-
-        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
-        poly = 0UL;
-        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
-            poly |= 1UL << (31 - p[n]);
-
-        /* generate a crc for every 8-bit value */
-        for (n = 0; n < 256; n++) {
-            c = (unsigned long)n;
-            for (k = 0; k < 8; k++)
-                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
-            crc_table[0][n] = c;
-        }
-
-#ifdef BYFOUR
-        /* generate crc for each value followed by one, two, and three zeros,
-           and then the byte reversal of those as well as the first table */
-        for (n = 0; n < 256; n++) {
-            c = crc_table[0][n];
-            crc_table[4][n] = REV(c);
-            for (k = 1; k < 4; k++) {
-                c = crc_table[0][c & 0xff] ^ (c >> 8);
-                crc_table[k][n] = c;
-                crc_table[k + 4][n] = REV(c);
-            }
-        }
-#endif /* BYFOUR */
-
-        crc_table_empty = 0;
-    }
-    else {      /* not first */
-        /* wait for the other guy to finish (not efficient, but rare) */
-        while (crc_table_empty)
-            ;
-    }
-
-#ifdef MAKECRCH
-    /* write out CRC tables to crc32.h */
-    {
-        FILE *out;
-
-        out = fopen("crc32.h", "w");
-        if (out == NULL) return;
-        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
-        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
-        fprintf(out, "local const unsigned long FAR ");
-        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
-        write_table(out, crc_table[0]);
-#  ifdef BYFOUR
-        fprintf(out, "#ifdef BYFOUR\n");
-        for (k = 1; k < 8; k++) {
-            fprintf(out, "  },\n  {\n");
-            write_table(out, crc_table[k]);
-        }
-        fprintf(out, "#endif\n");
-#  endif /* BYFOUR */
-        fprintf(out, "  }\n};\n");
-        fclose(out);
-    }
-#endif /* MAKECRCH */
-}
-
-#ifdef MAKECRCH
-local void
-write_table(FILE *out, const unsigned long FAR *table)
-{
-    int n;
-
-    for (n = 0; n < 256; n++)
-        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ", table[n],
-                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
-}
-#endif /* MAKECRCH */
-
-#else /* !DYNAMIC_CRC_TABLE */
-/* ========================================================================
- * Tables of CRC-32s of all single-byte values, made by make_crc_table().
- */
-#include "crc32.h"
-#endif /* DYNAMIC_CRC_TABLE */
-
-/* =========================================================================
- * This function can be used by asm versions of crc32()
- */
-const unsigned long FAR * ZEXPORT
-get_crc_table(void)
-{
-#ifdef DYNAMIC_CRC_TABLE
-    if (crc_table_empty)
-        make_crc_table();
-#endif /* DYNAMIC_CRC_TABLE */
-    return (const unsigned long FAR *)crc_table;
-}
-
-/* ========================================================================= */
-#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
-#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
-
-/* ========================================================================= */
-unsigned long ZEXPORT
-z_crc32(unsigned long crc, const unsigned char FAR *buf, unsigned len)
-{
-    if (buf == Z_NULL) return 0UL;
-
-#ifdef DYNAMIC_CRC_TABLE
-    if (crc_table_empty)
-        make_crc_table();
-#endif /* DYNAMIC_CRC_TABLE */
-
-#ifdef BYFOUR
-    if (sizeof(void *) == sizeof(ptrdiff_t)) {
-        u4 endian;
-
-        endian = 1;
-        if (*((unsigned char *)(&endian)))
-            return crc32_little(crc, buf, len);
-        else
-            return crc32_big(crc, buf, len);
-    }
-#endif /* BYFOUR */
-    crc = crc ^ 0xffffffffUL;
-    while (len >= 8) {
-        DO8;
-        len -= 8;
-    }
-    if (len) do {
-        DO1;
-    } while (--len);
-    return crc ^ 0xffffffffUL;
-}
-
-#ifdef BYFOUR
-
-/* ========================================================================= */
-#define DOLIT4 c ^= *buf4++; \
-        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
-            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
-#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
-
-/* ========================================================================= */
-local unsigned long
-crc32_little(unsigned long crc, const unsigned char FAR *buf, unsigned len)
-{
-    u4 c;
-    const u4 FAR *buf4;
-
-    c = (u4)crc;
-    c = ~c;
-    while (len && ((ptrdiff_t)buf & 3)) {
-        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
-        len--;
-    }
-
-    buf4 = (const u4 FAR *)(const void FAR *)buf;
-    while (len >= 32) {
-        DOLIT32;
-        len -= 32;
-    }
-    while (len >= 4) {
-        DOLIT4;
-        len -= 4;
-    }
-    buf = (const unsigned char FAR *)buf4;
-
-    if (len) do {
-        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
-    } while (--len);
-    c = ~c;
-    return (unsigned long)c;
-}
-
-/* ========================================================================= */
-#define DOBIG4 c ^= *++buf4; \
-        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
-            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
-#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
-
-/* ========================================================================= */
-local unsigned long
-crc32_big(unsigned long crc, const unsigned char FAR *buf, unsigned len)
-{
-    u4 c;
-    const u4 FAR *buf4;
-
-    c = REV((u4)crc);
-    c = ~c;
-    while (len && ((ptrdiff_t)buf & 3)) {
-        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
-        len--;
-    }
-
-    buf4 = (const u4 FAR *)(const void FAR *)buf;
-    buf4--;
-    while (len >= 32) {
-        DOBIG32;
-        len -= 32;
-    }
-    while (len >= 4) {
-        DOBIG4;
-        len -= 4;
-    }
-    buf4++;
-    buf = (const unsigned char FAR *)buf4;
-
-    if (len) do {
-        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
-    } while (--len);
-    c = ~c;
-    return (unsigned long)(REV(c));
-}
-
-#endif /* BYFOUR */
-
-#define GF2_DIM 32      /* dimension of GF(2) vectors (length of CRC) */
-
-/* ========================================================================= */
-local unsigned long
-gf2_matrix_times(unsigned long *mat, unsigned long vec)
-{
-    unsigned long sum;
-
-    sum = 0;
-    while (vec) {
-        if (vec & 1)
-            sum ^= *mat;
-        vec >>= 1;
-        mat++;
-    }
-    return sum;
-}
-
-/* ========================================================================= */
-local void
-gf2_matrix_square(unsigned long *square, unsigned long *mat)
-{
-    int n;
-
-    for (n = 0; n < GF2_DIM; n++)
-        square[n] = gf2_matrix_times(mat, mat[n]);
-}
-
-/* ========================================================================= */
-uLong ZEXPORT
-z_crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
-{
-    int n;
-    unsigned long row;
-    unsigned long even[GF2_DIM];    /* even-power-of-two zeros operator */
-    unsigned long odd[GF2_DIM];     /* odd-power-of-two zeros operator */
-
-    /* degenerate case */
-    if (len2 == 0)
-        return crc1;
-
-    /* put operator for one zero bit in odd */
-    odd[0] = 0xedb88320L;           /* CRC-32 polynomial */
-    row = 1;
-    for (n = 1; n < GF2_DIM; n++) {
-        odd[n] = row;
-        row <<= 1;
-    }
-
-    /* put operator for two zero bits in even */
-    gf2_matrix_square(even, odd);
-
-    /* put operator for four zero bits in odd */
-    gf2_matrix_square(odd, even);
-
-    /* apply len2 zeros to crc1 (first square will put the operator for one
-       zero byte, eight zero bits, in even) */
-    do {
-        /* apply zeros operator for this bit of len2 */
-        gf2_matrix_square(even, odd);
-        if (len2 & 1)
-            crc1 = gf2_matrix_times(even, crc1);
-        len2 >>= 1;
-
-        /* if no more bits set, then done */
-        if (len2 == 0)
-            break;
-
-        /* another iteration of the loop with odd and even swapped */
-        gf2_matrix_square(odd, even);
-        if (len2 & 1)
-            crc1 = gf2_matrix_times(odd, crc1);
-        len2 >>= 1;
-
-        /* if no more bits set, then done */
-    } while (len2 != 0);
-
-    /* return combined crc */
-    crc1 ^= crc2;
-    return crc1;
-}
diff --git a/libkern/zlib/z_crc32.c b/libkern/zlib/z_crc32.c
new file mode 100644 (file)
index 0000000..ac0acac
--- /dev/null
@@ -0,0 +1,442 @@
+/*
+ * Copyright (c) 2008-2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2005 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors.  This results in about a
+ * factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+  Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+  protection on the static variables used to control the first-use generation
+  of the crc tables.  Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+  first call get_crc_table() to initialize the tables before allowing more than
+  one thread to use crc32().
+ */
+
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+#include "zutil.h"      /* for STDC and FAR definitions */
+
+#define local static
+
+/* Find a four-byte integer type for crc32_little() and crc32_big(). */
+#ifndef NOBYFOUR
+#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
+#    include <machine/limits.h>
+#    define BYFOUR
+#    if (UINT_MAX == 0xffffffffUL)
+       typedef unsigned int u4;
+#    else
+#      if (ULONG_MAX == 0xffffffffUL)
+         typedef unsigned long u4;
+#      else
+#        if (USHRT_MAX == 0xffffffffUL)
+           typedef unsigned short u4;
+#        else
+#          undef BYFOUR     /* can't find a four-byte integer type! */
+#        endif
+#      endif
+#    endif
+#  endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time. */
+#ifdef BYFOUR
+#  define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+                (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+   local unsigned long crc32_little OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+   local unsigned long crc32_big OF((unsigned long,
+                        const unsigned char FAR *, unsigned));
+#  define TBLS 8
+#else
+#  define TBLS 1
+#endif /* BYFOUR */
+
+/* Local functions for crc concatenation */
+local unsigned long gf2_matrix_times OF((unsigned long *mat,
+                                         unsigned long vec));
+local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1;
+local unsigned long FAR crc_table[TBLS][256];
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+   local void write_table OF((FILE *, const unsigned long FAR *));
+#endif /* MAKECRCH */
+/*
+  Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+  Polynomials over GF(2) are represented in binary, one bit per coefficient,
+  with the lowest powers in the most significant bit.  Then adding polynomials
+  is just exclusive-or, and multiplying a polynomial by x is a right shift by
+  one.  If we call the above polynomial p, and represent a byte as the
+  polynomial q, also with the lowest power in the most significant bit (so the
+  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+  where a mod b means the remainder after dividing a by b.
+
+  This calculation is done using the shift-register method of multiplying and
+  taking the remainder.  The register is initialized to zero, and for each
+  incoming bit, x^32 is added mod p to the register if the bit is a one (where
+  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+  x (which is shifting right by one and adding x^32 mod p if the bit shifted
+  out is a one).  We start with the highest power (least significant bit) of
+  q and repeat for all eight bits of q.
+
+  The first table is simply the CRC of all possible eight bit values.  This is
+  all the information needed to generate CRCs on data a byte at a time for all
+  combinations of CRC register values and incoming bytes.  The remaining tables
+  allow for word-at-a-time CRC calculation for both big-endian and little-
+  endian machines, where a word is four bytes.
+*/
+local void
+make_crc_table(void)
+{
+    unsigned long c;
+    int n, k;
+    unsigned long poly;                 /* polynomial exclusive-or pattern */
+    /* terms of polynomial defining this crc (except x^32): */
+    static volatile int first = 1;      /* flag to limit concurrent making */
+    static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+
+    /* See if another task is already doing this (not thread-safe, but better
+       than nothing -- significantly reduces duration of vulnerability in
+       case the advice about DYNAMIC_CRC_TABLE is ignored) */
+    if (first) {
+        first = 0;
+
+        /* make exclusive-or pattern from polynomial (0xedb88320UL) */
+        poly = 0UL;
+        for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
+            poly |= 1UL << (31 - p[n]);
+
+        /* generate a crc for every 8-bit value */
+        for (n = 0; n < 256; n++) {
+            c = (unsigned long)n;
+            for (k = 0; k < 8; k++)
+                c = c & 1 ? poly ^ (c >> 1) : c >> 1;
+            crc_table[0][n] = c;
+        }
+
+#ifdef BYFOUR
+        /* generate crc for each value followed by one, two, and three zeros,
+           and then the byte reversal of those as well as the first table */
+        for (n = 0; n < 256; n++) {
+            c = crc_table[0][n];
+            crc_table[4][n] = REV(c);
+            for (k = 1; k < 4; k++) {
+                c = crc_table[0][c & 0xff] ^ (c >> 8);
+                crc_table[k][n] = c;
+                crc_table[k + 4][n] = REV(c);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0;
+    }
+    else {      /* not first */
+        /* wait for the other guy to finish (not efficient, but rare) */
+        while (crc_table_empty)
+            ;
+    }
+
+#ifdef MAKECRCH
+    /* write out CRC tables to crc32.h */
+    {
+        FILE *out;
+
+        out = fopen("crc32.h", "w");
+        if (out == NULL) return;
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const unsigned long FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n  {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");
+        for (k = 1; k < 8; k++) {
+            fprintf(out, "  },\n  {\n");
+            write_table(out, crc_table[k]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, "  }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+local void
+write_table(FILE *out, const unsigned long FAR *table)
+{
+    int n;
+
+    for (n = 0; n < 256; n++)
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : "    ", table[n],
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * This function can be used by asm versions of crc32()
+ */
+const unsigned long FAR * ZEXPORT
+get_crc_table(void)
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const unsigned long FAR *)crc_table;
+}
+
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
+
+/* ========================================================================= */
+unsigned long ZEXPORT
+z_crc32(unsigned long crc, const unsigned char FAR *buf, unsigned len)
+{
+    if (buf == Z_NULL) return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        u4 endian;
+
+        endian = 1;
+        if (*((unsigned char *)(&endian)))
+            return crc32_little(crc, buf, len);
+        else
+            return crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+    crc = crc ^ 0xffffffffUL;
+    while (len >= 8) {
+        DO8;
+        len -= 8;
+    }
+    if (len) do {
+        DO1;
+    } while (--len);
+    return crc ^ 0xffffffffUL;
+}
+
+#ifdef BYFOUR
+
+/* ========================================================================= */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* ========================================================================= */
+local unsigned long
+crc32_little(unsigned long crc, const unsigned char FAR *buf, unsigned len)
+{
+    u4 c;
+    const u4 FAR *buf4;
+
+    c = (u4)crc;
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)(const void FAR *)buf;
+    while (len >= 32) {
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* ========================================================================= */
+#define DOBIG4 c ^= *++buf4; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* ========================================================================= */
+local unsigned long
+crc32_big(unsigned long crc, const unsigned char FAR *buf, unsigned len)
+{
+    u4 c;
+    const u4 FAR *buf4;
+
+    c = REV((u4)crc);
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+
+    buf4 = (const u4 FAR *)(const void FAR *)buf;
+    buf4--;
+    while (len >= 32) {
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOBIG4;
+        len -= 4;
+    }
+    buf4++;
+    buf = (const unsigned char FAR *)buf4;
+
+    if (len) do {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
+
+#define GF2_DIM 32      /* dimension of GF(2) vectors (length of CRC) */
+
+/* ========================================================================= */
+local unsigned long
+gf2_matrix_times(unsigned long *mat, unsigned long vec)
+{
+    unsigned long sum;
+
+    sum = 0;
+    while (vec) {
+        if (vec & 1)
+            sum ^= *mat;
+        vec >>= 1;
+        mat++;
+    }
+    return sum;
+}
+
+/* ========================================================================= */
+local void
+gf2_matrix_square(unsigned long *square, unsigned long *mat)
+{
+    int n;
+
+    for (n = 0; n < GF2_DIM; n++)
+        square[n] = gf2_matrix_times(mat, mat[n]);
+}
+
+/* ========================================================================= */
+uLong ZEXPORT
+z_crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
+{
+    int n;
+    unsigned long row;
+    unsigned long even[GF2_DIM];    /* even-power-of-two zeros operator */
+    unsigned long odd[GF2_DIM];     /* odd-power-of-two zeros operator */
+
+    /* degenerate case */
+    if (len2 == 0)
+        return crc1;
+
+    /* put operator for one zero bit in odd */
+    odd[0] = 0xedb88320L;           /* CRC-32 polynomial */
+    row = 1;
+    for (n = 1; n < GF2_DIM; n++) {
+        odd[n] = row;
+        row <<= 1;
+    }
+
+    /* put operator for two zero bits in even */
+    gf2_matrix_square(even, odd);
+
+    /* put operator for four zero bits in odd */
+    gf2_matrix_square(odd, even);
+
+    /* apply len2 zeros to crc1 (first square will put the operator for one
+       zero byte, eight zero bits, in even) */
+    do {
+        /* apply zeros operator for this bit of len2 */
+        gf2_matrix_square(even, odd);
+        if (len2 & 1)
+            crc1 = gf2_matrix_times(even, crc1);
+        len2 >>= 1;
+
+        /* if no more bits set, then done */
+        if (len2 == 0)
+            break;
+
+        /* another iteration of the loop with odd and even swapped */
+        gf2_matrix_square(odd, even);
+        if (len2 & 1)
+            crc1 = gf2_matrix_times(odd, crc1);
+        len2 >>= 1;
+
+        /* if no more bits set, then done */
+    } while (len2 != 0);
+
+    /* return combined crc */
+    crc1 ^= crc2;
+    return crc1;
+}
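
[editor's note] A short usage sketch for the renamed entry points, assuming a caller with the zlib kernel headers in scope; part_a/part_b and their lengths are placeholders. Passing a NULL buffer to z_crc32() yields the conventional initial value (0), and z_crc32_combine() joins two independently computed CRCs as if the second buffer had been appended to the first:

    unsigned long crc_a, crc_b, crc_all;

    crc_a = z_crc32(0UL, part_a, len_a);    /* CRC of the first buffer  */
    crc_b = z_crc32(0UL, part_b, len_b);    /* CRC of the second buffer */

    /* equivalent to z_crc32 over the concatenation of part_a and part_b */
    crc_all = z_crc32_combine(crc_a, crc_b, (z_off_t)len_b);
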
index a73df0946000809541121cc8fd58740d2cd30ca9..fecfc43dbd515e933c2d7876350dba23d43f1b3f 100644 (file)
@@ -29,10 +29,6 @@ extern "C" {
 #include <mach/kmod.h>
 #include <libkern/kernel_mach_header.h>
 #include <libkern/prelink.h>
-
-#if CONFIG_EMBEDDED
-extern uuid_t kernelcache_uuid;
-#endif
 }
 
 #include <libkern/version.h>
@@ -104,6 +100,7 @@ static const char * sKernelComponentNames[] = {
    "com.apple.kpi.bsd",
    "com.apple.kpi.dsep",
    "com.apple.kpi.iokit",
+   "com.apple.kpi.kasan",
    "com.apple.kpi.libkern",
    "com.apple.kpi.mach",
    "com.apple.kpi.private",
@@ -246,9 +243,7 @@ KLDBootstrap::readPrelinkedExtensions(
     OSDictionary              * prelinkInfoDict         = NULL;  // do not release
     OSString                  * errorString             = NULL;  // must release
     OSKext                    * theKernel               = NULL;  // must release
-#if CONFIG_EMBEDDED
     OSData                    * kernelcacheUUID         = NULL;  // do not release
-#endif
 
     kernel_segment_command_t  * prelinkTextSegment      = NULL;  // see code
     kernel_segment_command_t  * prelinkInfoSegment      = NULL;  // see code
@@ -374,19 +369,19 @@ KLDBootstrap::readPrelinkedExtensions(
     ramDiskBoot = IORamDiskBSDRoot();
 #endif /* NO_KEXTD */
 
-#if CONFIG_EMBEDDED
     /* Copy in the kernelcache UUID */
     kernelcacheUUID = OSDynamicCast(OSData,
         prelinkInfoDict->getObject(kPrelinkInfoKCIDKey));
-    if (!kernelcacheUUID) {
-       bzero(&kernelcache_uuid, sizeof(kernelcache_uuid));
-    } else if (kernelcacheUUID->getLength() != sizeof(kernelcache_uuid)) {
-        panic("kernelcacheUUID length is %d, expected %lu", kernelcacheUUID->getLength(),
-            sizeof(kernelcache_uuid));
-    } else {
-        memcpy((void *)&kernelcache_uuid, (const void *)kernelcacheUUID->getBytesNoCopy(), kernelcacheUUID->getLength());
+    if (kernelcacheUUID) {
+        if (kernelcacheUUID->getLength() != sizeof(kernelcache_uuid)) {
+            panic("kernelcacheUUID length is %d, expected %lu", kernelcacheUUID->getLength(),
+                sizeof(kernelcache_uuid));
+        } else {
+            kernelcache_uuid_valid = TRUE;
+            memcpy((void *)&kernelcache_uuid, (const void *)kernelcacheUUID->getBytesNoCopy(), kernelcacheUUID->getLength());
+            uuid_unparse_upper(kernelcache_uuid, kernelcache_uuid_string);
+        }
     }
-#endif /* CONFIG_EMBEDDED */
 
     infoDictArray = OSDynamicCast(OSArray, 
         prelinkInfoDict->getObject(kPrelinkInfoDictionaryKey));
@@ -454,7 +449,7 @@ KLDBootstrap::readPrelinkedExtensions(
                 infoDict->getObject(kPrelinkExecutableSizeKey));
             if (addressNum && lengthNum) {
 #if __arm__ || __arm64__
-                vm_offset_t data = (vm_offset_t) ((addressNum->unsigned64BitValue()) + vm_kernel_slide);
+                vm_offset_t data = ml_static_slide(addressNum->unsigned64BitValue());
                 vm_size_t length = (vm_size_t) (lengthNum->unsigned32BitValue());
                 ml_static_mfree(data, length);
 #else
@@ -493,7 +488,7 @@ KLDBootstrap::readPrelinkedExtensions(
 
                    slideAddrSegIndex = __whereIsAddr( (vm_offset_t)slideAddr, &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS );
                    if (slideAddrSegIndex >= 0) {
-                           addrToSlideSegIndex = __whereIsAddr( (vm_offset_t)(*slideAddr + vm_kernel_slide), &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS );
+                           addrToSlideSegIndex = __whereIsAddr(ml_static_slide((vm_offset_t)(*slideAddr)), &plk_segSizes[0], &plk_segAddrs[0], PLK_SEGMENTS );
                            if (addrToSlideSegIndex < 0) {
                                    badSlideTarget++;
                                    continue;
@@ -505,7 +500,7 @@ KLDBootstrap::readPrelinkedExtensions(
                    }
 
                    slidKextAddrCount++;
-                   *(slideAddr) += vm_kernel_slide;
+                   *slideAddr = ml_static_slide(*slideAddr);
            } // for ...
 
            /* All kexts are now slid, set VM protections for them */
index 105571937e11c30eccdce62e221f7f42b408a1c9..737c41068be97d7808b463a2e5c4a5d5a715a2cc 100644 (file)
@@ -28,7 +28,7 @@ GCC_TREAT_WARNINGS_AS_ERRORS = YES
 GCC_WARN_ABOUT_MISSING_NEWLINE = YES
 CODE_SIGN_IDENTITY = -
 DYLIB_CURRENT_VERSION = $(RC_ProjectSourceVersion)
-DYLIB_LDFLAGS = -umbrella System -all_load -lCrashReporterClient
+DYLIB_LDFLAGS = -umbrella System -all_load
 DYLIB_LDFLAGS[sdk=iphoneos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 DYLIB_LDFLAGS[sdk=watchos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
 DYLIB_LDFLAGS[sdk=tvos*] = $(inherited) -Wl,-sectalign,__DATA,__data,1000
index b489c7ed0d5dadf5b75d49aa7e56c69b599a622a..e138202c11b626cff410c44dbc4e8da90636168d 100644 (file)
                40DF0F741E5CD7BB0035A864 /* cpu_copy_in_cksum_gen.c in Sources */ = {isa = PBXBuildFile; fileRef = 40DF0F731E5CD7B30035A864 /* cpu_copy_in_cksum_gen.c */; settings = {COMPILER_FLAGS = "-fno-builtin"; }; };
                435F3CAA1B06B7BA005ED9EF /* work_interval.c in Sources */ = {isa = PBXBuildFile; fileRef = 435F3CA91B06B7BA005ED9EF /* work_interval.c */; };
                467DAFD4157E8AF200CE68F0 /* guarded_open_np.c in Sources */ = {isa = PBXBuildFile; fileRef = 467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */; };
+               4BCDD8AF20741A5B00FA37A3 /* mach_right.h in Headers */ = {isa = PBXBuildFile; fileRef = 4BCDD8AE20741A4700FA37A3 /* mach_right.h */; };
+               4BCDD8B020741BC400FA37A3 /* mach_right.h in Headers */ = {isa = PBXBuildFile; fileRef = 4BCDD8AE20741A4700FA37A3 /* mach_right.h */; };
+               4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BCDD8B120741C2F00FA37A3 /* mach_right.c */; };
                4BDD5F1D1891AB2F004BF300 /* mach_approximate_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */; };
                4BDD5F1E1891AB2F004BF300 /* mach_approximate_time.s in Sources */ = {isa = PBXBuildFile; fileRef = 4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */; };
                726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */ = {isa = PBXBuildFile; fileRef = 726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */; };
                929FD46F1C5711DB0087B9C8 /* mach_timebase_info.c in Sources */ = {isa = PBXBuildFile; fileRef = 929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */; };
                978228281B8678DC008385AC /* pselect-darwinext.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228271B8678CB008385AC /* pselect-darwinext.c */; };
                978228291B8678DF008385AC /* pselect-darwinext-cancel.c in Sources */ = {isa = PBXBuildFile; fileRef = 978228261B8678C2008385AC /* pselect-darwinext-cancel.c */; };
+               9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; };
+               9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */ = {isa = PBXBuildFile; fileRef = 9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */; };
                9CCF28271E68E993002EE6CD /* pid_shutdown_networking.c in Sources */ = {isa = PBXBuildFile; fileRef = 9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */; };
                A50845861DDA69AC0041C0E0 /* thread_self_restrict.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; };
                A50845871DDA69C90041C0E0 /* thread_self_restrict.h in CopyFiles */ = {isa = PBXBuildFile; fileRef = A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */; };
                C6D3EFC816542C510052CF30 /* exc_catcher.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A091611F8E7A800E4693F /* exc_catcher.h */; };
                C6D3EFC916542C510052CF30 /* _libkernel_init.h in Headers */ = {isa = PBXBuildFile; fileRef = 247A08B211F8B05900E4693F /* _libkernel_init.h */; settings = {ATTRIBUTES = (Private, ); }; };
                C6D3F03016542C980052CF30 /* dummy.c in Sources */ = {isa = PBXBuildFile; fileRef = C6D3F02F16542C980052CF30 /* dummy.c */; };
+               C9001753206B00AC0070D674 /* port_descriptions.c in Sources */ = {isa = PBXBuildFile; fileRef = C9001751206B00850070D674 /* port_descriptions.c */; };
+               C9001754206B00D00070D674 /* port_descriptions.h in Headers */ = {isa = PBXBuildFile; fileRef = C9001752206B008B0070D674 /* port_descriptions.h */; };
                C962B16C18DBA2C80031244A /* setpriority.c in Sources */ = {isa = PBXBuildFile; fileRef = C962B16B18DBA2C80031244A /* setpriority.c */; };
                C962B16E18DBB43F0031244A /* thread_act.c in Sources */ = {isa = PBXBuildFile; fileRef = C962B16D18DBB43F0031244A /* thread_act.c */; };
                C99A4F501305B2BD0054B7B7 /* __get_cpu_capabilities.s in Sources */ = {isa = PBXBuildFile; fileRef = C99A4F4E1305B1B70054B7B7 /* __get_cpu_capabilities.s */; };
                40DF0F731E5CD7B30035A864 /* cpu_copy_in_cksum_gen.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = cpu_copy_in_cksum_gen.c; path = skywalk/cpu_copy_in_cksum_gen.c; sourceTree = "<group>"; };
                435F3CA91B06B7BA005ED9EF /* work_interval.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = work_interval.c; sourceTree = "<group>"; };
                467DAFD3157E8AF200CE68F0 /* guarded_open_np.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = guarded_open_np.c; sourceTree = "<group>"; };
+               4BCDD8AE20741A4700FA37A3 /* mach_right.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = mach_right.h; sourceTree = "<group>"; };
+               4BCDD8B120741C2F00FA37A3 /* mach_right.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = mach_right.c; sourceTree = "<group>"; };
                4BDD5F1B1891AB2F004BF300 /* mach_approximate_time.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_approximate_time.c; sourceTree = "<group>"; };
                4BDD5F1C1891AB2F004BF300 /* mach_approximate_time.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = mach_approximate_time.s; sourceTree = "<group>"; };
                726D915420ACD7FC0039A2FE /* mach_bridge_remote_time.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = mach_bridge_remote_time.c; path = wrappers/mach_bridge_remote_time.c; sourceTree = "<group>"; };
                929FD46E1C5711CF0087B9C8 /* mach_timebase_info.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = mach_timebase_info.c; sourceTree = "<group>"; };
                978228261B8678C2008385AC /* pselect-darwinext-cancel.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext-cancel.c"; sourceTree = "<group>"; };
                978228271B8678CB008385AC /* pselect-darwinext.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = "pselect-darwinext.c"; sourceTree = "<group>"; };
+               9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mach_sync_ipc.h; sourceTree = "<group>"; };
                9CCF28261E68E993002EE6CD /* pid_shutdown_networking.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = pid_shutdown_networking.c; sourceTree = "<group>"; };
                A50BD52E1DDA548F006622C8 /* thread_self_restrict.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = thread_self_restrict.h; sourceTree = "<group>"; };
                A59CB95516669DB700B064B3 /* stack_logging_internal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = stack_logging_internal.h; sourceTree = "<group>"; };
                C6C40121174154D9000AE69F /* gethostuuid_private.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = gethostuuid_private.h; sourceTree = "<group>"; };
                C6D3F02E16542C510052CF30 /* libsystem_Libsyscall_headers_Sim.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libsystem_Libsyscall_headers_Sim.a; sourceTree = BUILT_PRODUCTS_DIR; };
                C6D3F02F16542C980052CF30 /* dummy.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = dummy.c; sourceTree = "<group>"; };
+               C9001751206B00850070D674 /* port_descriptions.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = port_descriptions.c; sourceTree = "<group>"; };
+               C9001752206B008B0070D674 /* port_descriptions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = port_descriptions.h; sourceTree = "<group>"; };
                C93B50491C487698009DD6AB /* __kdebug_trace_string.s */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = __kdebug_trace_string.s; sourceTree = "<group>"; };
                C962B16B18DBA2C80031244A /* setpriority.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = setpriority.c; sourceTree = "<group>"; };
                C962B16D18DBB43F0031244A /* thread_act.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; path = thread_act.c; sourceTree = "<group>"; };
                08FB7795FE84155DC02AAC07 /* mach */ = {
                        isa = PBXGroup;
                        children = (
+                               C9001751206B00850070D674 /* port_descriptions.c */,
                                247A08FF11F8E18000E4693F /* abort.h */,
                                C9D9BCC5114B00600000D8B9 /* clock_priv.defs */,
                                C9D9BCC6114B00600000D8B9 /* clock_reply.defs */,
                                C9D9BCF1114B00600000D8B9 /* mach_msg.c */,
                                291D3C261354FDD100D46061 /* mach_port.c */,
                                C9D9BCF2114B00600000D8B9 /* mach_port.defs */,
+                               4BCDD8B120741C2F00FA37A3 /* mach_right.c */,
                                C9D9BCF3114B00600000D8B9 /* mach_traps.s */,
                                291D3C271354FDD100D46061 /* mach_vm.c */,
                                E4216C301822D404006F2632 /* mach_voucher.defs */,
                C9D9BCD8114B00600000D8B9 /* mach */ = {
                        isa = PBXGroup;
                        children = (
+                               C9001752206B008B0070D674 /* port_descriptions.h */,
                                C9D9BCD9114B00600000D8B9 /* errorlib.h */,
                                C9D9BCDA114B00600000D8B9 /* mach.h */,
                                C9D9BCDB114B00600000D8B9 /* mach_error.h */,
                                C9D9BCDC114B00600000D8B9 /* mach_init.h */,
                                C9D9BCDD114B00600000D8B9 /* mach_interface.h */,
+                               9C6DA3D120A3D09F0090330B /* mach_sync_ipc.h */,
+                               4BCDD8AE20741A4700FA37A3 /* mach_right.h */,
                                C9D9BCDF114B00600000D8B9 /* port_obj.h */,
                                C9D9BCE0114B00600000D8B9 /* sync.h */,
                                928336A21B8412C100873B90 /* thread_state.h */,
                                C6D3EFC616542C510052CF30 /* SYS.h in Headers */,
                                C6D3EFC716542C510052CF30 /* abort.h in Headers */,
                                C6D3EFC816542C510052CF30 /* exc_catcher.h in Headers */,
+                               4BCDD8B020741BC400FA37A3 /* mach_right.h in Headers */,
                                C6D3EFC916542C510052CF30 /* _libkernel_init.h in Headers */,
                                E453AF3A17013F4C00F2C94C /* stack_logging_internal.h in Headers */,
                                E453AF3817013F1400F2C94C /* spawn.h in Headers */,
+                               9C6DA3D320A3D09F0090330B /* mach_sync_ipc.h in Headers */,
                                E453AF3917013F1B00F2C94C /* spawn_private.h in Headers */,
                                E453AF3617013CBF00F2C94C /* libproc.h in Headers */,
                                E453AF3717013CC200F2C94C /* libproc_internal.h in Headers */,
                                C9D9BD26114B00600000D8B9 /* mach.h in Headers */,
                                C9D9BD27114B00600000D8B9 /* mach_error.h in Headers */,
                                C9D9BD28114B00600000D8B9 /* mach_init.h in Headers */,
+                               C9001754206B00D00070D674 /* port_descriptions.h in Headers */,
                                9299E14A1B841E74005B7350 /* thread_state.h in Headers */,
                                C6C40122174155E3000AE69F /* gethostuuid_private.h in Headers */,
                                C9D9BD29114B00600000D8B9 /* mach_interface.h in Headers */,
                                24D1158311E671B20063D54D /* SYS.h in Headers */,
                                247A090011F8E18000E4693F /* abort.h in Headers */,
                                247A091711F8E7A800E4693F /* exc_catcher.h in Headers */,
+                               4BCDD8AF20741A5B00FA37A3 /* mach_right.h in Headers */,
                                24B028F511FF5C3500CA64A9 /* _libkernel_init.h in Headers */,
                                A59CB95616669EFB00B064B3 /* stack_logging_internal.h in Headers */,
                                E4D45C3F16FB20D30002AF25 /* spawn.h in Headers */,
+                               9C6DA3D220A3D09F0090330B /* mach_sync_ipc.h in Headers */,
                                E4D45C4016FB20DC0002AF25 /* spawn_private.h in Headers */,
                                E4D45C2F16F868ED0002AF25 /* libproc.h in Headers */,
                                E4D45C3016F868ED0002AF25 /* libproc_internal.h in Headers */,
                08FB7793FE84155DC02AAC07 /* Project object */ = {
                        isa = PBXProject;
                        attributes = {
-                               LastUpgradeCheck = 0500;
+                               LastUpgradeCheck = 1000;
                        };
                        buildConfigurationList = 1DEB914E08733D8E0010E9CD /* Build configuration list for PBXProject "Libsyscall" */;
                        compatibilityVersion = "Xcode 3.2";
                        isa = PBXSourcesBuildPhase;
                        buildActionMask = 2147483647;
                        files = (
+                               C9001753206B00AC0070D674 /* port_descriptions.c in Sources */,
                                726D915520ACD7FC0039A2FE /* mach_bridge_remote_time.c in Sources */,
                                403C7CEE1E1F4E4400D6FEEF /* os_packet.c in Sources */,
                                E214BDC81C2E358300CEE8A3 /* clonefile.c in Sources */,
                                978228281B8678DC008385AC /* pselect-darwinext.c in Sources */,
                                2485235511582D8F0051B413 /* mach_legacy.c in Sources */,
                                242AB66611EBDC1200107336 /* errno.c in Sources */,
+                               4BCDD8B220741C2F00FA37A3 /* mach_right.c in Sources */,
                                E4D45C2E16F868ED0002AF25 /* libproc.c in Sources */,
                                24A7C5BC11FF8DA6007669EB /* accept.c in Sources */,
                                24A7C5BD11FF8DA6007669EB /* bind.c in Sources */,
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                INSTALL_PATH = /usr/local/lib/dyld;
                                STRIP_INSTALLED_PRODUCT = NO;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ANALYZER_LOCALIZABILITY_NONLOCALIZED = YES;
+                               CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
+                               CLANG_WARN_BOOL_CONVERSION = YES;
+                               CLANG_WARN_COMMA = YES;
+                               CLANG_WARN_CONSTANT_CONVERSION = YES;
+                               CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
+                               CLANG_WARN_EMPTY_BODY = YES;
+                               CLANG_WARN_ENUM_CONVERSION = YES;
+                               CLANG_WARN_INFINITE_RECURSION = YES;
+                               CLANG_WARN_INT_CONVERSION = NO;
+                               CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
+                               CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
+                               CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
+                               CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
+                               CLANG_WARN_STRICT_PROTOTYPES = YES;
+                               CLANG_WARN_SUSPICIOUS_MOVE = YES;
+                               CLANG_WARN_UNREACHABLE_CODE = YES;
+                               CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
+                               ENABLE_STRICT_OBJC_MSGSEND = YES;
                                GCC_C_LANGUAGE_STANDARD = gnu99;
+                               GCC_NO_COMMON_BLOCKS = YES;
+                               GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
                                GCC_WARN_ABOUT_RETURN_TYPE = YES;
+                               GCC_WARN_UNDECLARED_SELECTOR = YES;
                                GCC_WARN_UNINITIALIZED_AUTOS = YES;
                                GCC_WARN_UNUSED_FUNCTION = YES;
                                GCC_WARN_UNUSED_PARAMETER = YES;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
-                               ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = YES;
                                DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
                                MAP_PLATFORM = "$(MAP_PLATFORM_$(PLATFORM_NAME))";
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                OTHER_LDFLAGS = "$(DYLIB_LDFLAGS)";
                                VERSION_INFO_PREFIX = "___";
                        };
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                INSTALLHDRS_COPY_PHASE = NO;
                                PRODUCT_NAME = Build;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                PRODUCT_NAME = "$(TARGET_NAME)";
                        };
                        name = Release;
                        isa = XCBuildConfiguration;
                        baseConfigurationReference = C9D9BE0F114FFADC0000D8B9 /* Libsyscall.xcconfig */;
                        buildSettings = {
+                               CLANG_ENABLE_OBJC_WEAK = YES;
                                COPY_PHASE_STRIP = NO;
                                PRODUCT_NAME = Libsyscall_headers_Sim;
                                SKIP_INSTALL = YES;
index ff93f852a09ae7a4ad7e877058fbb7df184c6df0..a62cc1f996821bf905b1ff4a081059e2b37613e5 100644 (file)
@@ -228,9 +228,9 @@ LEAF(pseudo, 0)                                     ;\
 #endif
 
 #define MI_ENTRY_POINT(name)                           \
+       .text                                                                   ;\
        .align 2        ;\
        .globl  name                                                    ;\
-       .text                                                                   ;\
 name:
 
 /* load the syscall number into r12 and trap */
@@ -425,6 +425,18 @@ pseudo:                                                                    ;\
 #include <mach/arm/vm_param.h>
 #include <mach/arm64/asm.h>
 
+#if defined(__arm64__) && !defined(__LP64__)
+#define ZERO_EXTEND(argnum) uxtw  x ## argnum, w ## argnum
+#else
+#define ZERO_EXTEND(argnum)
+#endif
+
+#if defined(__arm64__) && !defined(__LP64__)
+#define SIGN_EXTEND(argnum) sxtw  x ## argnum, w ## argnum
+#else
+#define SIGN_EXTEND(argnum)
+#endif
+
 /*
  * ARM64 system call interface:
  *
index dc517a1a28c3cde24d34f7709b0ebbdde8922adc..ffcbe5ab4e5294d635921fe730ac59329455ca03 100644 (file)
@@ -132,6 +132,7 @@ Lparent:
 #include <mach/arm64/asm.h>
        
 MI_ENTRY_POINT(___fork)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        // ARM moves a 1 in to r1 here, but I can't see why.
        mov             x16, #SYS_fork                          // Syscall code
@@ -144,14 +145,14 @@ MI_ENTRY_POINT(___fork)
        mov             w0, #0  
        str             w0, [x9]                                        // Clear cached current pid                             
        POP_FRAME                                                       // And done
-       ret
+       ARM64_STACK_EPILOG
 
 Lbotch:
        MI_CALL_EXTERNAL(_cerror)                       // Handle error
        mov             w0, #-1                                         // Return value is -1
 Lparent:
        POP_FRAME                                                       // Return
-       ret
+       ARM64_STACK_EPILOG
 
 #else
 #error Unsupported architecture
index a048f48aaac12b73c7c37f23ed6db2e946d1aaa7..a8daa7398196cf16cbfd00cb973c4511bdda11ad 100644 (file)
@@ -159,7 +159,7 @@ MI_ENTRY_POINT(___getpid)
        MI_GET_ADDRESS(x9, __current_pid)       // Get address of cached value
        ldr             w0, [x9]                                        // Load it
        cmp             w0, #0                                          // See if there's a cached value
-       b.ls    L_notcached                                     // If not, make syscall
+       b.le    L_notcached                                     // If not, make syscall
        ret                                                                     // Else, we're done
 L_notcached:
        SYSCALL_NONAME(getpid, 0, cerror_nocancel)
index a6a24404e789b033ae9d75d310921da5dcfd9b31..62238b9eda4ffe5fb9d9e3aa4d85ed3b82dba779 100644 (file)
 
 #if defined(__x86_64__)
 
-__SYSCALL(___sigreturn, sigreturn, 2)
+__SYSCALL(___sigreturn, sigreturn, 3)
 
 #elif defined(__i386__)
 
-__SYSCALL_INT(___sigreturn, sigreturn, 2)
+__SYSCALL_INT(___sigreturn, sigreturn, 3)
 
 #elif defined(__arm__)
 
-__SYSCALL(___sigreturn, sigreturn, 2)
+__SYSCALL(___sigreturn, sigreturn, 3)
 
 #elif defined(__arm64__)
 
-__SYSCALL(___sigreturn, sigreturn, 2)
+__SYSCALL(___sigreturn, sigreturn, 3)
 
 #else
 #error Unsupported architecture
index 00abb72164ef9ce632a3b044d0b0b32f83e2769b..fbe13755aecdf33372b464a57a937e9c0d74d3d8 100644 (file)
@@ -129,6 +129,7 @@ extern mach_msg_return_t    mach_msg_server_importance(boolean_t (*)
                                                mach_msg_size_t,
                                                mach_port_t,
                                                mach_msg_options_t);
+
 /*
  * Prototypes for compatibility
  */
diff --git a/libsyscall/mach/mach/mach_right.h b/libsyscall/mach/mach/mach_right.h
new file mode 100644 (file)
index 0000000..2a7522e
--- /dev/null
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef __MACH_RIGHT_H
+#define __MACH_RIGHT_H
+
+#include <os/base.h>
+#include <mach/mach.h>
+#include <mach/port.h>
+#include <mach/mach_port.h>
+#include <sys/cdefs.h>
+#include <stdbool.h>
+
+__BEGIN_DECLS;
+
+/*!
+ * @typedef mach_right_recv_t
+ * A type representing the receive right to a Mach port.
+ */
+typedef struct _mach_right_recv {
+       mach_port_t mrr_name;
+} mach_right_recv_t;
+
+/*!
+ * @const MACH_RIGHT_RECV_NULL
+ * A convenience initializer for a receive right object.
+ */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MACH_RIGHT_RECV_NULL ((mach_right_recv_t){MACH_PORT_NULL})
+#elif defined(__cplusplus) && __cplusplus >= 201103L
+#define MACH_RIGHT_RECV_NULL (mach_right_recv_t{MACH_PORT_NULL})
+#elif defined(__cplusplus)
+#define MACH_RIGHT_RECV_NULL \
+               (mach_right_recv_t((mach_right_recv_t){MACH_PORT_NULL}))
+#else
+#define MACH_RIGHT_RECV_NULL {MACH_PORT_NULL}
+#endif
+
+/*!
+ * @typedef mach_right_send_t
+ * A type representing a send right to a Mach port.
+ */
+typedef struct _mach_right_send {
+       mach_port_t mrs_name;
+} mach_right_send_t;
+
+/*!
+ * @const MACH_RIGHT_SEND_NULL
+ * A convenience initializer for a send right object.
+ */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MACH_RIGHT_SEND_NULL ((mach_right_send_t){MACH_PORT_NULL})
+#elif defined(__cplusplus) && __cplusplus >= 201103L
+#define MACH_RIGHT_SEND_NULL (mach_right_send_t{MACH_PORT_NULL})
+#elif defined(__cplusplus)
+#define MACH_RIGHT_SEND_NULL \
+               (mach_right_send_t((mach_right_send_t){MACH_PORT_NULL}))
+#else
+#define MACH_RIGHT_SEND_NULL {MACH_PORT_NULL}
+#endif
+
+/*!
+ * @typedef mach_right_send_once_t
+ * A type representing a send-once right to a Mach port.
+ */
+typedef struct _mach_right_send_once {
+       mach_port_t mrso_name;
+} mach_right_send_once_t;
+
+/*!
+ * @const MACH_RIGHT_SEND_ONCE_NULL
+ * A convenience initializer for a send-once right object.
+ */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define MACH_RIGHT_SEND_ONCE_NULL ((mach_right_send_once_t){MACH_PORT_NULL})
+#elif defined(__cplusplus) && __cplusplus >= 201103L
+#define MACH_RIGHT_SEND_ONCE_NULL (mach_right_send_once_t{MACH_PORT_NULL})
+#elif defined(__cplusplus)
+#define MACH_RIGHT_SEND_ONCE_NULL \
+               (mach_right_send_once_t((mach_right_send_once_t){MACH_PORT_NULL}))
+#else
+#define MACH_RIGHT_SEND_ONCE_NULL {MACH_PORT_NULL}
+#endif
+
+/*!
+ * @function mach_right_recv
+ * Wraps a port name as a receive right object.
+ *
+ * @param pn
+ * The port name. If this name is valid but does not represent a receive right,
+ * the behavior of mach_right_recv_* implementations is undefined.
+ *
+ * @result
+ * A new receive right object.
+ */
+OS_ALWAYS_INLINE OS_WARN_RESULT
+static inline mach_right_recv_t
+mach_right_recv(mach_port_name_t pn)
+{
+       mach_right_recv_t mrr = {pn};
+       return mrr;
+}
+
+/*!
+ * @function mach_right_send
+ * Wraps a port name as a send right object.
+ *
+ * @param pn
+ * The port name. If this name is valid but does not represent a send right, the
+ * behavior of mach_right_send_* implementations is undefined.
+ *
+ * @result
+ * A new send right object.
+ */
+OS_ALWAYS_INLINE OS_WARN_RESULT
+static inline mach_right_send_t
+mach_right_send(mach_port_name_t pn)
+{
+       mach_right_send_t mrs = {pn};
+       return mrs;
+}
+
+/*!
+ * @function mach_right_send_valid
+ * Checks if the given send right object is valid.
+ *
+ * @param mrs
+ * The send right object to check.
+ *
+ * @result
+ * A Boolean indicating whether the right is valid.
+ */
+OS_ALWAYS_INLINE OS_WARN_RESULT
+static inline bool
+mach_right_send_valid(mach_right_send_t mrs)
+{
+       return MACH_PORT_VALID(mrs.mrs_name);
+}
+
+/*!
+ * @function mach_right_send_once
+ * Wraps a port name as a send-once right object.
+ *
+ * @param pn
+ * The port name. If this name is valid but does not represent a send-once
+ * right, the behavior of mach_right_send_once_* implementations is undefined.
+ *
+ * @result
+ * A new send-once right object.
+ */
+OS_ALWAYS_INLINE OS_WARN_RESULT
+static inline mach_right_send_once_t
+mach_right_send_once(mach_port_name_t pn)
+{
+       mach_right_send_once_t mrso = {pn};
+       return mrso;
+}
+
+/*!
+ * @function mach_right_send_once_valid
+ * Checks if the given send-once right object is valid.
+ *
+ * @param mrso
+ * The send-once right object to check.
+ *
+ * @result
+ * A Boolean indicating whether the right is valid.
+ */
+OS_ALWAYS_INLINE OS_WARN_RESULT
+static inline bool
+mach_right_send_once_valid(mach_right_send_once_t mrso)
+{
+       return MACH_PORT_VALID(mrso.mrso_name);
+}
+
+/*!
+ * @typedef mach_right_flags_t
+ * Flags influencing the behavior of a constructed Mach port.
+ *
+ * @const MACH_RIGHT_RECV_FLAG_INIT
+ * No flags set. This value is suitable for initialization purposes.
+ *
+ * @const MACH_RIGHT_RECV_FLAG_UNGUARDED
+ * The given context should not serve as a guard for the underlying port's
+ * destruction.
+ */
+OS_ENUM(mach_right_flags, uint64_t,
+       MACH_RIGHT_RECV_FLAG_INIT = 0,
+       MACH_RIGHT_RECV_FLAG_UNGUARDED = (1 << 0),
+);
+
+/*!
+ * @function mach_right_recv_construct
+ * Allocates a new Mach port and returns the receive right to the caller.
+ *
+ * @param flags
+ * Flags to influence the behavior of the new port.
+ *
+ * @param sr
+ * If non-NULL, will be filled in with the name of a send right which
+ * corresponds to the new port. The caller is responsible for disposing of this
+ * send right with {@link mach_right_send_release}.
+ *
+ * @param ctx
+ * Context to be associated with the new port. By default, this context must be
+ * passed to {@link mach_right_recv_destruct} in order to destroy the underlying
+ * port. This requirement may be elided with the
+ * {@link MACH_RIGHT_RECV_FLAG_UNGUARDED} flag.
+ *
+ * @result
+ * A new port handle which refers to the receive right for the newly-created
+ * port. The caller is responsible for disposing of this handle with
+ * {@link mach_right_recv_destruct}.
+ *
+ * @discussion
+ * The implementation will abort on any failure to allocate a new port object in
+ * the kernel. Thus the caller may assert that a new, valid receive right is
+ * always returned.
+ */
+OS_EXPORT OS_WARN_RESULT
+mach_right_recv_t
+mach_right_recv_construct(mach_right_flags_t flags,
+               mach_right_send_t *_Nullable sr, uintptr_t ctx);
+
+/*!
+ * @function mach_right_recv_destruct
+ * Closes the port referred to by the given receive right.
+ *
+ * @param r
+ * The receive right for the port to manipulate.
+ *
+ * @param s
+ * A pointer to the send right to dispose of. If NULL is given, no attempt will
+ * be made to clean up any send right associated with the port. If the name of
+ * the given send right does not match the name of the given receive right, the
+ * implementation's behavior is undefined.
+ *
+ * @param ctx
+ * The context which guards the underlying port destruction. If the receive
+ * right was created with {@link MACH_RIGHT_RECV_FLAG_UNGUARDED}, this parameter is
+ * ignored.
+ *
+ * @discussion
+ * If a send right is passed, the implementation performs the moral equivalent
+ * of
+ *
+ *     mach_right_recv_destruct(r, MACH_PORT_NULL, ctx);
+ *     mach_right_send_release(s);
+ *
+ * except in a more efficient manner, requiring only one system call.
+ *
+ * The implementation will abort on any failure to dispose of the port. As such,
+ * this routine should only be used on ports that are known to be under the
+ * caller's complete control.
+ */
+OS_EXPORT
+void
+mach_right_recv_destruct(mach_right_recv_t r, mach_right_send_t *_Nullable s,
+               uintptr_t ctx);
+
+/*!
+ * @function mach_right_send_create
+ * Creates a send right to the port referenced by the given receive right.
+ *
+ * @param r
+ * The receive right for the port for which to create the send right.
+ *
+ * @result
+ * The name of the new send right. The caller is responsible for disposing of
+ * this send right with {@link mach_right_send_release}.
+ *
+ * This operation will increment the make-send count of the port referenced by
+ * the given receive right.
+ *
+ * @discussion
+ * The implementation will abort on any failure to create the send right. As
+ * such, this routine should only be used on ports that are known to be under
+ * the caller's complete control.
+ */
+OS_EXPORT OS_WARN_RESULT
+mach_right_send_t
+mach_right_send_create(mach_right_recv_t r);
+
+/*!
+ * @function mach_right_send_retain
+ * Increments the user reference count for the given send right.
+ *
+ * @param s
+ * The send right to manipulate.
+ *
+ * @result
+ * If the reference count was successfully incremented, the given port name is
+ * returned. If either MACH_PORT_NULL or MACH_PORT_DEAD is given, the given
+ * value is returned. If the given send right became a dead name before or
+ * during the attempt to retain the send right, MACH_PORT_DEAD is returned.
+ *
+ * If the implementation encounters any other failure condition, it will abort.
+ */
+OS_EXPORT OS_WARN_RESULT
+mach_right_send_t
+mach_right_send_retain(mach_right_send_t s);
+
+/*!
+ * @function mach_right_send_release
+ * Decrements the user reference count for the given send right.
+ *
+ * @param s
+ * The send right to manipulate.
+ *
+ * @discussion
+ * If the given send right became a dead name before or during the attempt to
+ * release it, the implementation will dispose of that dead name.
+ *
+ * If the implementation encounters any other failure condition, it will abort.
+ */
+OS_EXPORT
+void
+mach_right_send_release(mach_right_send_t s);
+
+/*!
+ * @function mach_right_send_once_create
+ * Creates a send-once right from the given receive right.
+ *
+ * @param r
+ * The receive right for the port for which to create the send-once right.
+ *
+ * @result
+ * The newly-created send-once right.
+ *
+ * @discussion
+ * The implementation will abort on any failure to allocate a new send-once
+ * right, and therefore the caller should only provide a receive right which is
+ * under its complete control. The caller may assert that a new, valid send-once
+ * right is always returned.
+ *
+ * The returned send-once right will never share a name with the given receive
+ * right. A send-once right must be consumed either by using it to send a
+ * message or by consuming it with {@link mach_right_send_once_consume}.
+ *
+ * The returned right does not support retain/release semantics despite the
+ * presence of "create" in the name.
+ */
+OS_EXPORT OS_WARN_RESULT
+mach_right_send_once_t
+mach_right_send_once_create(mach_right_recv_t r);
+
+/*!
+ * @function mach_right_send_once_consume
+ * Consumes the given send-once right.
+ *
+ * @param so
+ * The send-once right to manipulate.
+ *
+ * @discussion
+ * If the given send-once right became a dead name before or during the attempt
+ * to release it, the implementation will dispose of that dead name.
+ *
+ * If the implementation encounters any other failure condition, it will abort.
+ *
+ * This operation will cause a send-once notification to be delivered to the
+ * port to which the send-once right refers unless the right is a dead name, in
+ * which case there are no side effects.
+ */
+OS_EXPORT
+void
+mach_right_send_once_consume(mach_right_send_once_t so);
+
+__END_DECLS;
+
+#endif // __MACH_RIGHT_H
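The header above gives each kind of port right its own C type. A minimal usage sketch (illustrative only, not part of this commit; the function name and guard value are made up) pairing a guarded mach_right_recv_construct() with the matching mach_right_recv_destruct() might look like:

    #include <mach/mach_right.h>

    static void
    example_guarded_port(void)
    {
            uintptr_t guard = (uintptr_t)&guard;    /* context value doubles as the port guard */
            mach_right_send_t s = MACH_RIGHT_SEND_NULL;
            mach_right_recv_t r;

            /* Allocate a new port; a send right is also returned because sr != NULL. */
            r = mach_right_recv_construct(MACH_RIGHT_RECV_FLAG_INIT, &s, guard);

            /* ... use r.mrr_name / s.mrs_name with mach_msg() ... */

            /* Dispose of the receive and send rights in one call; the guard must match. */
            mach_right_recv_destruct(r, &s, guard);
    }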
diff --git a/libsyscall/mach/mach/mach_sync_ipc.h b/libsyscall/mach/mach/mach_sync_ipc.h
new file mode 100644 (file)
index 0000000..032e7ac
--- /dev/null
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/* 
+ * Mach Operating System
+ * Copyright (c) 1991,1990,1989 Carnegie Mellon University
+ * All Rights Reserved.
+ * 
+ * Permission to use, copy, modify and distribute this software and its
+ * documentation is hereby granted, provided that both the copyright
+ * notice and this permission notice appear in all copies of the
+ * software, derivative works or modified versions, and any portions
+ * thereof, and that both notices appear in supporting documentation.
+ * 
+ * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
+ * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
+ * 
+ * Carnegie Mellon requests users of this software to return to
+ * 
+ *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
+ *  School of Computer Science
+ *  Carnegie Mellon University
+ *  Pittsburgh PA 15213-3890
+ * 
+ * any improvements or extensions that they make and grant Carnegie Mellon
+ * the rights to redistribute these changes.
+ */
+
+#ifndef        _MACH_SYNC_IPC_H_
+#define        _MACH_SYNC_IPC_H_
+
+#include <mach/mach.h>
+
+__BEGIN_DECLS
+
+/*!
+ * @function mach_sync_ipc_link_monitoring_start
+ *
+ * @abstract
+ * Starts monitoring the sync IPC priority inversion avoidance
+ * facility of the current thread.
+ * A subsequent call to mach_sync_ipc_link_monitoring_stop() will
+ * validate that the facility took effect for all synchronous IPC
+ * performed from this thread between the calls to start and stop.
+ *
+ * @discussion
+ * In case of success, a port right is returned, which has to be
+ * deallocated by passing it to mach_sync_ipc_link_monitoring_stop().
+ *
+ * @param port
+ * Pointer to a mach_port_t that will be populated in case of success.
+ *
+ * @result
+ * KERN_SUCCESS in case of success, specific error otherwise.
+ * If the call is not supported, KERN_NOT_SUPPORTED is returned.
+ */
+extern kern_return_t mach_sync_ipc_link_monitoring_start(mach_port_t* port);
+
+/*!
+ * @function mach_sync_ipc_link_monitoring_stop
+ *
+ * @abstract
+ * Stops monitoring the sync IPC priority inversion avoidance facility
+ * of the current thread started by a call to mach_sync_ipc_link_monitoring_start().
+ *
+ * Returns whether the facility took effect for all synchronous IPC performed
+ * from this thread between the calls to start and stop.
+ *
+ * Reasons for this function to return false include:
+ * - remote message event handler did not reply to the message itself
+ * - remote message was not received by a workloop (xpc connection or dispatch mach channel)
+ *
+ * @discussion
+ * To be called after mach_sync_ipc_link_monitoring_start(). If
+ * mach_sync_ipc_link_monitoring_start() didn't return an error this
+ * function must be called to deallocate the port right that was returned.
+ *
+ * @param port
+ * mach_port_t returned by mach_sync_ipc_link_monitoring_start().
+ *
+ * @param in_effect
+ * Pointer to boolean_t value that will be populated in the case of success.
+ * Indicates whether the sync IPC priority inversion avoidance facility took
+ * effect for all synchronous IPC performed from this thread between the calls
+ * to start and stop.
+ *
+ * @result
+ * KERN_SUCCESS in case of no errors, specific error otherwise.
+ * If the call is not supported, KERN_NOT_SUPPORTED is returned.
+ */
+extern kern_return_t mach_sync_ipc_link_monitoring_stop(mach_port_t port, boolean_t* in_effect);
+
+typedef enum thread_destruct_special_reply_port_rights {
+       THREAD_SPECIAL_REPLY_PORT_ALL,
+       THREAD_SPECIAL_REPLY_PORT_RECEIVE_ONLY,
+       THREAD_SPECIAL_REPLY_PORT_SEND_ONLY,
+} thread_destruct_special_reply_port_rights_t;
+
+extern kern_return_t thread_destruct_special_reply_port(mach_port_name_t port, thread_destruct_special_reply_port_rights_t rights);
+
+extern mach_port_t mig_get_special_reply_port(void);
+
+extern void mig_dealloc_special_reply_port(mach_port_t migport);
+
+
+__END_DECLS
+
+#endif /* _MACH_SYNC_IPC_H_ */
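Taken together, the two monitoring calls above bracket a region of synchronous IPC. A hedged sketch of the intended call pattern (illustrative only, not part of this commit):

    #include <mach/mach.h>
    #include <mach/mach_sync_ipc.h>

    static boolean_t
    example_monitor_sync_ipc(void)
    {
            mach_port_t port;
            boolean_t in_effect = FALSE;
            kern_return_t kr;

            kr = mach_sync_ipc_link_monitoring_start(&port);
            if (kr != KERN_SUCCESS) {
                    return FALSE;           /* e.g. KERN_NOT_SUPPORTED */
            }

            /* ... perform synchronous IPC from this thread ... */

            /* Also deallocates the port right returned by _start(). */
            kr = mach_sync_ipc_link_monitoring_stop(port, &in_effect);
            return (kr == KERN_SUCCESS) && in_effect;
    }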
diff --git a/libsyscall/mach/mach/port_descriptions.h b/libsyscall/mach/mach/port_descriptions.h
new file mode 100644 (file)
index 0000000..e237e27
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _MACH_PORT_DESCRIPTIONS_
+#define _MACH_PORT_DESCRIPTIONS_
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+/*
+ * Returns a string describing the host special port offset provided, or NULL if
+ * the provided offset is not a host special port offset.
+ */
+const char *mach_host_special_port_description(int offset);
+
+/*
+ * Returns a string describing the task special port offset provided, or NULL if
+ * the provided offset is not a task special port offset.
+ */
+const char *mach_task_special_port_description(int offset);
+
+/*
+ * Returns the port for the given identifier of a host special port.  For
+ * instance, passing "HOST_PRIV_PORT" would return 1.
+ *
+ * Returns -1 on error.
+ */
+int mach_host_special_port_for_id(const char *id);
+
+/*
+ * Returns the port for the given identifier of a task special port.
+ *
+ * Returns -1 on error.
+ */
+int mach_task_special_port_for_id(const char *id);
+
+__END_DECLS
+
+#endif /* !defined(_MACH_PORT_DESCRIPTIONS_) */
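The two lookup directions above are inverses of each other. A short illustrative sketch (not part of the commit) translating an identifier string to its index and back to a description:

    #include <mach/port_descriptions.h>
    #include <stdio.h>

    static void
    example_describe_host_port(const char *id)
    {
            /* Reverse lookup by identifier string; returns -1 and sets errno on failure. */
            int idx = mach_host_special_port_for_id(id);
            if (idx < 0) {
                    perror("mach_host_special_port_for_id");
                    return;
            }

            /* Forward lookup from the index back to a human-readable description. */
            printf("%s = %d (%s)\n", id, idx, mach_host_special_port_description(idx));
    }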
index 42abc4ec258dafd167449f433e760d126c231cc9..67afb68357ea6e2cb5f7a0a11433d7d0c6da37bb 100644 (file)
 #ifndef KERNEL
 /*
  * Gets all register values in the target thread with pointer-like contents.
- * There's no guarantee that the returned values are valid pointers, but all
+ *
+ * There is no guarantee that the returned values are valid pointers, but all
  * valid pointers will be returned.  The order and count of the provided
  * register values is unspecified and may change; registers with values that
  * are not valid pointers may be omitted, so the number of pointers returned
  * may vary from call to call.
  *
- * sp is an out parameter that will contain the stack pointer
- * length is an in/out parameter for the length of the values array
- * values is an array of pointers
+ * sp is an out parameter that will contain the stack pointer.
+ * length is an in/out parameter for the length of the values array.
+ * values is an array of pointers.
  *
  * This may only be called on threads in the current task.  If the current
  * platform defines a stack red zone, the stack pointer returned will be
  * adjusted to account for red zone.
  *
- * If length is insufficient KERN_INSUFFICIENT_BUFFER_SIZE will be returned and
- * length set to the amount of memory required.  Callers MUST NOT assume that
- * any particular size of buffer will be sufficient and should retry with an
- * aproproately sized buffer upon this error.
+ * If length is insufficient, KERN_INSUFFICIENT_BUFFER_SIZE will be returned
+ * and length set to the amount of memory required.  Callers MUST NOT assume
+ * that any particular size of buffer will be sufficient and should retry with
+ * an appropriately sized buffer upon this error.
  */
-__OSX_UNAVAILABLE
-__IOS_UNAVAILABLE
-__TVOS_AVAILABLE(9.0)
-__WATCHOS_UNAVAILABLE
-kern_return_t thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *length, uintptr_t *values);
+__API_AVAILABLE(macosx(10.14), ios(12.0), tvos(9.0), watchos(5.0))
+kern_return_t thread_get_register_pointer_values(thread_t thread,
+               uintptr_t *sp, size_t *length, uintptr_t *values);
 #endif
 
 #endif /* _MACH_THREAD_STATE_H_ */
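The rewritten comment spells out the buffer-resizing contract. A hedged sketch of a conforming caller (illustrative only; the header path and buffer size are assumptions, and real code would retry with a larger buffer rather than give up):

    #include <mach/mach.h>
    #include <mach/thread_state.h>

    static kern_return_t
    example_dump_pointer_registers(thread_t thread)
    {
            uintptr_t sp = 0;
            uintptr_t values[64];
            size_t length = sizeof(values) / sizeof(values[0]);

            kern_return_t kr = thread_get_register_pointer_values(thread, &sp,
                            &length, values);
            if (kr == KERN_INSUFFICIENT_BUFFER_SIZE) {
                    /* length now reports the space required; a real caller retries
                     * with an appropriately sized buffer instead of failing. */
                    return kr;
            }
            return kr;
    }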
index bdb446c33fc04313c556f2e8ea317651c6bf6c9e..4b90d19e5a79807702288140a528cfc632e7529d 100644 (file)
@@ -559,6 +559,7 @@ mach_msg_server(
 
                        buffers_swapped = FALSE;
                        old_state = voucher_mach_msg_adopt(&bufRequest->Head);
+                       bufReply->Head = (mach_msg_header_t){};
 
                        (void) (*demux)(&bufRequest->Head, &bufReply->Head);
 
index e2cf670be29849b48d2992ebc44c02ae07dedf00..3219d730162e105c135e215075b1249ae886f81c 100644 (file)
@@ -30,6 +30,8 @@
 #include <mach/mach.h>
 #include <mach/mach_vm.h>
 #include <mach/mach_traps.h>
+#include <mach/mach_sync_ipc.h>
+#include "tsd.h"
 
 kern_return_t
 mach_port_names(
@@ -302,9 +304,23 @@ mach_port_get_attributes(
 {
        kern_return_t rv;
 
-       rv = _kernelrpc_mach_port_get_attributes(task, name, flavor,
+       rv = _kernelrpc_mach_port_get_attributes_trap(task, name, flavor,
                        port_info_out, port_info_outCnt);
 
+#ifdef __x86_64__
+       /* REMOVE once XBS kernel has new trap */
+       if (rv == ((1 << 24) | 40)) /* see mach/i386/syscall_sw.h */
+               rv = MACH_SEND_INVALID_DEST;
+#elif defined(__i386__)
+       /* REMOVE once XBS kernel has new trap */
+       if (rv == (kern_return_t)(-40))
+               rv = MACH_SEND_INVALID_DEST;
+#endif
+
+       if (rv == MACH_SEND_INVALID_DEST)
+               rv = _kernelrpc_mach_port_get_attributes(task, name, flavor,
+                               port_info_out, port_info_outCnt);
+
        return (rv);
 }
 
@@ -407,6 +423,93 @@ mach_port_space_basic_info(
        return (rv);
 }
 
+static inline mach_port_t
+_tsd_get_special_reply_port()
+{
+       return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MACH_SPECIAL_REPLY);
+}
+
+static inline void
+_tsd_set_special_reply_port(mach_port_t port)
+{
+       _os_tsd_set_direct(__TSD_MACH_SPECIAL_REPLY, (void *)(uintptr_t)port);
+}
+
+mach_port_t
+mig_get_special_reply_port(void)
+{
+       mach_port_t srp;
+
+       srp = _tsd_get_special_reply_port();
+       if (!MACH_PORT_VALID(srp)) {
+               srp = thread_get_special_reply_port();
+               _tsd_set_special_reply_port(srp);
+       }
+
+       return srp;
+}
+
+void
+mig_dealloc_special_reply_port(mach_port_t migport)
+{
+       mach_port_t srp = _tsd_get_special_reply_port();
+       if (MACH_PORT_VALID(srp)) {
+               thread_destruct_special_reply_port(srp, THREAD_SPECIAL_REPLY_PORT_ALL);
+               if (migport != srp) {
+                       mach_port_deallocate(mach_task_self(), migport);
+               }
+               _tsd_set_special_reply_port(MACH_PORT_NULL);
+       }
+}
+
+kern_return_t
+mach_sync_ipc_link_monitoring_start(mach_port_t *special_reply_port)
+{
+       mach_port_t srp;
+       boolean_t link_broken;
+       kern_return_t kr;
+
+       *special_reply_port = MACH_PORT_DEAD;
+
+       srp = mig_get_special_reply_port();
+
+       kr = mach_port_mod_refs(mach_task_self(), srp, MACH_PORT_RIGHT_SEND, 1);
+
+       if (kr != KERN_SUCCESS) {
+               return kr;
+       }
+
+       kr = _kernelrpc_mach_port_special_reply_port_reset_link(mach_task_self(), srp, &link_broken);
+       if (kr != KERN_SUCCESS) {
+               mach_port_deallocate(mach_task_self(), srp);
+               return kr;
+       }
+
+       *special_reply_port = srp;
+
+       return kr;
+}
+
+kern_return_t
+mach_sync_ipc_link_monitoring_stop(mach_port_t srp, boolean_t* in_effect)
+{
+       kern_return_t kr;
+       boolean_t link_broken = TRUE;
+
+       kr = _kernelrpc_mach_port_special_reply_port_reset_link(mach_task_self(), srp, &link_broken);
+
+       /*
+	 * We report whether the sync IPC priority inversion avoidance facility
+	 * took effect; if the link was broken it did not take effect, so the
+	 * result is inverted before being returned.
+        */
+       *in_effect = !link_broken;
+
+       mach_port_deallocate(mach_task_self(), srp);
+
+       return kr;
+}
+
 kern_return_t
 mach_port_dnrequest_info(
        ipc_space_t task,
@@ -602,18 +705,29 @@ mach_voucher_extract_attr_recipe(
 
        rv = mach_voucher_extract_attr_recipe_trap(voucher, key, recipe, recipe_size);
 
-#ifdef __x86_64__
-       /* REMOVE once XBS kernel has new trap */
-       if (rv == ((1 << 24) | 72)) /* see mach/i386/syscall_sw.h */
-               rv = MACH_SEND_INVALID_DEST;
-#elif defined(__i386__)
-       /* REMOVE once XBS kernel has new trap */
-       if (rv == (kern_return_t)(-72))
-               rv = MACH_SEND_INVALID_DEST;
-#endif
-
        if (rv == MACH_SEND_INVALID_DEST)
                rv = _kernelrpc_mach_voucher_extract_attr_recipe(voucher, key, recipe, recipe_size);
 
        return rv;
 }
+
+
+kern_return_t
+thread_destruct_special_reply_port(
+               mach_port_name_t port,
+               thread_destruct_special_reply_port_rights_t rights)
+{
+       switch (rights) {
+       case THREAD_SPECIAL_REPLY_PORT_ALL:
+               return mach_port_destruct(mach_task_self(), port, -1, 0);
+
+       case THREAD_SPECIAL_REPLY_PORT_RECEIVE_ONLY:
+               return mach_port_destruct(mach_task_self(), port, 0, 0);
+
+       case THREAD_SPECIAL_REPLY_PORT_SEND_ONLY:
+               return mach_port_deallocate(mach_task_self(), port);
+
+       default:
+               return KERN_INVALID_ARGUMENT;
+       }
+}
diff --git a/libsyscall/mach/mach_right.c b/libsyscall/mach/mach_right.c
new file mode 100644 (file)
index 0000000..c69133e
--- /dev/null
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <mach/mach.h>
+#include <mach/mach_traps.h>
+#include <mach/mach_port.h>
+#include <mach/mach_right.h>
+
+
+#pragma mark Utilities
+#define _mach_assert(__op, __kr) \
+       do { \
+               if ((__kr) != KERN_SUCCESS) { \
+                       __builtin_trap(); \
+               } \
+       } while (0)
+
+#pragma mark API
+mach_right_recv_t
+mach_right_recv_construct(mach_right_flags_t flags,
+               mach_right_send_t *_Nullable sr, uintptr_t ctx)
+{
+       kern_return_t kr = KERN_FAILURE;
+       mach_port_t p = MACH_PORT_NULL;
+       mach_port_options_t opts = {
+               .flags = MPO_CONTEXT_AS_GUARD,
+               .mpl = {
+                       .mpl_qlimit = MACH_PORT_QLIMIT_BASIC,
+               },
+       };
+
+       if (flags & MACH_RIGHT_RECV_FLAG_UNGUARDED) {
+               opts.flags &= (~MPO_CONTEXT_AS_GUARD);
+       }
+       if (sr) {
+               opts.flags |= MPO_INSERT_SEND_RIGHT;
+       }
+
+       kr = mach_port_construct(mach_task_self(), &opts, ctx, &p);
+       _mach_assert("construct recv right", kr);
+
+       if (sr) {
+               sr->mrs_name = p;
+       }
+
+       return mach_right_recv(p);
+}
+
+void
+mach_right_recv_destruct(mach_right_recv_t r, mach_right_send_t *s,
+               uintptr_t ctx)
+{
+       kern_return_t kr = KERN_FAILURE;
+       mach_port_delta_t srd = 0;
+
+       if (s) {
+               if (r.mrr_name != s->mrs_name) {
+                       _os_set_crash_log_cause_and_message(s->mrs_name,
+                                       "api misuse: bad send right");
+                       __builtin_trap();
+               }
+
+               srd = -1;
+       }
+
+       kr = mach_port_destruct(mach_task_self(), r.mrr_name, srd, ctx);
+       _mach_assert("destruct recv right", kr);
+}
+
+mach_right_send_t
+mach_right_send_create(mach_right_recv_t r)
+{
+       kern_return_t kr = KERN_FAILURE;
+
+       kr = mach_port_insert_right(mach_task_self(), r.mrr_name, r.mrr_name,
+                       MACH_MSG_TYPE_MAKE_SEND);
+       _mach_assert("create send right", kr);
+
+       return mach_right_send(r.mrr_name);
+}
+
+mach_right_send_t
+mach_right_send_retain(mach_right_send_t s)
+{
+       kern_return_t kr = KERN_FAILURE;
+       mach_right_send_t rs = MACH_RIGHT_SEND_NULL;
+
+       kr = mach_port_mod_refs(mach_task_self(), s.mrs_name,
+                       MACH_PORT_RIGHT_SEND, 1);
+       switch (kr) {
+       case 0:
+               rs = s;
+               break;
+       case KERN_INVALID_RIGHT:
+               rs.mrs_name = MACH_PORT_DEAD;
+               break;
+       case KERN_INVALID_NAME:
+               // mach_port_mod_refs() will return success when given either
+               // MACH_PORT_DEAD or MACH_PORT_NULL with send or send-once right
+               // operations, so this is always fatal.
+       default:
+               _mach_assert("retain send right", kr);
+       }
+
+       return rs;
+}
+
+void
+mach_right_send_release(mach_right_send_t s)
+{
+       kern_return_t kr = KERN_FAILURE;
+
+       kr = mach_port_mod_refs(mach_task_self(), s.mrs_name,
+                       MACH_PORT_RIGHT_SEND, -1);
+       switch (kr) {
+       case 0:
+               break;
+       case KERN_INVALID_RIGHT:
+               kr = mach_port_mod_refs(mach_task_self(), s.mrs_name,
+                               MACH_PORT_RIGHT_DEAD_NAME, -1);
+               _mach_assert("release dead name", kr);
+               break;
+       default:
+               _mach_assert("release send right", kr);
+       }
+}
+
+mach_right_send_once_t
+mach_right_send_once_create(mach_right_recv_t r)
+{
+       mach_msg_type_name_t right = 0;
+       mach_port_t so = MACH_PORT_NULL;
+       kern_return_t kr = mach_port_extract_right(mach_task_self(), r.mrr_name,
+                       MACH_MSG_TYPE_MAKE_SEND_ONCE, &so, &right);
+       _mach_assert("create send-once right", kr);
+
+       return mach_right_send_once(so);
+}
+
+void
+mach_right_send_once_consume(mach_right_send_once_t so)
+{
+       kern_return_t kr = KERN_FAILURE;
+
+       kr = mach_port_mod_refs(mach_task_self(), so.mrso_name,
+                       MACH_PORT_RIGHT_SEND_ONCE, -1);
+       switch (kr) {
+       case 0:
+               break;
+       case KERN_INVALID_RIGHT:
+               kr = mach_port_mod_refs(mach_task_self(), so.mrso_name,
+                               MACH_PORT_RIGHT_DEAD_NAME, -1);
+               _mach_assert("release dead name", kr);
+               break;
+       default:
+               _mach_assert("consume send-once right", kr);
+       }
+}
index 934c1aa208a367ca469268034c9949e79a6460c7..ee7e867cde2df2a4c6efc1847f499d54f3ac9ac2 100644 (file)
@@ -36,13 +36,13 @@ __XNU_PRIVATE_EXTERN mach_port_t _task_reply_port = MACH_PORT_NULL;
 static inline mach_port_t
 _mig_get_reply_port()
 {
-       return _os_tsd_get_direct(__TSD_MIG_REPLY);
+       return (mach_port_t)(uintptr_t)_os_tsd_get_direct(__TSD_MIG_REPLY);
 }
 
 static inline void
 _mig_set_reply_port(mach_port_t port)
 {
-       _os_tsd_set_direct(__TSD_MIG_REPLY, port);
+       _os_tsd_set_direct(__TSD_MIG_REPLY, (void *)(uintptr_t)port);
 }
 
 /*
diff --git a/libsyscall/mach/port_descriptions.c b/libsyscall/mach/port_descriptions.c
new file mode 100644 (file)
index 0000000..a5d8a93
--- /dev/null
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <errno.h>
+#include <mach/host_special_ports.h>
+#include <mach/task_special_ports.h>
+#include <mach/port_descriptions.h>
+#include <stdlib.h>
+#include <strings.h>
+
+const char *
+mach_host_special_port_description(int port)
+{
+       int port_index = (int)port;
+
+       if (port_index < 0 || port_index > HOST_MAX_SPECIAL_PORT) {
+               return NULL;
+       }
+
+       static const char *hsp_descs[] = {
+               [HOST_PORT] = "host (restricted)",
+               [HOST_PRIV_PORT] = "host private (restricted)",
+               [HOST_IO_MASTER_PORT] = "I/O master (restricted)",
+
+               [HOST_DYNAMIC_PAGER_PORT] = "dynamic pager",
+               [HOST_AUDIT_CONTROL_PORT] = "audit control",
+               [HOST_USER_NOTIFICATION_PORT] = "user notification",
+               [HOST_AUTOMOUNTD_PORT] = "automounter",
+               [HOST_LOCKD_PORT] = "lockd",
+               [HOST_KTRACE_BACKGROUND_PORT] = "ktrace background notification",
+               [HOST_SEATBELT_PORT] = "seatbelt",
+               [HOST_KEXTD_PORT] = "kextd",
+               [HOST_LAUNCHCTL_PORT] = "launchctl",
+               [HOST_UNFREED_PORT] = "fairplay",
+               [HOST_AMFID_PORT] = "amfi",
+               [HOST_GSSD_PORT] = "gssd",
+               [HOST_TELEMETRY_PORT] = "telemetry",
+               [HOST_ATM_NOTIFICATION_PORT] = "atm notification",
+               [HOST_COALITION_PORT] = "coalition notification",
+               [HOST_SYSDIAGNOSE_PORT] = "sysdiagnose notification",
+               [HOST_XPC_EXCEPTION_PORT] = "XPC exception",
+               [HOST_CONTAINERD_PORT] = "container manager",
+               [HOST_NODE_PORT] = "node",
+               [HOST_RESOURCE_NOTIFY_PORT] = "resource notify",
+               [HOST_CLOSURED_PORT] = "closured",
+               [HOST_SYSPOLICYD_PORT] = "syspolicyd",
+       };
+       _Static_assert(HOST_SYSPOLICYD_PORT == HOST_MAX_SPECIAL_PORT,
+                       "all host special ports must have descriptions");
+
+       return hsp_descs[port_index];
+}
+
+const char *
+mach_task_special_port_description(int port)
+{
+       int port_index = (int)port;
+
+       if (port_index < 0 || port_index > TASK_MAX_SPECIAL_PORT) {
+               return NULL;
+       }
+
+       static const char *tsp_descs[] = {
+               [TASK_KERNEL_PORT] = "kernel",
+               [TASK_HOST_PORT] = "host",
+               [TASK_NAME_PORT] = "name",
+               [TASK_BOOTSTRAP_PORT] = "bootstrap",
+               [TASK_SEATBELT_PORT] = "seatbelt",
+               [TASK_ACCESS_PORT] = "access",
+               [TASK_DEBUG_CONTROL_PORT] = "debug control",
+               [TASK_RESOURCE_NOTIFY_PORT] = "resource notify",
+       };
+       _Static_assert(TASK_RESOURCE_NOTIFY_PORT == TASK_MAX_SPECIAL_PORT,
+                       "all task special ports must have descriptions");
+
+       return tsp_descs[port_index];
+}
+
+static int
+port_for_id_internal(const char *id, const char **ids, int nids)
+{
+       if (!id) {
+               errno = EINVAL;
+               return -1;
+       }
+
+       for (int i = 0; i < nids; i++) {
+               if (ids[i] && strcmp(ids[i], id) == 0) {
+                       return i;
+               }
+       }
+
+       errno = ENOENT;
+       return -1;
+}
+
+int
+mach_host_special_port_for_id(const char *id)
+{
+       static const char *hsp_ids[] = {
+#define SP_ENTRY(id) [id] = #id
+               SP_ENTRY(HOST_PORT),
+               SP_ENTRY(HOST_PRIV_PORT),
+               SP_ENTRY(HOST_IO_MASTER_PORT),
+               SP_ENTRY(HOST_DYNAMIC_PAGER_PORT),
+               SP_ENTRY(HOST_AUDIT_CONTROL_PORT),
+               SP_ENTRY(HOST_USER_NOTIFICATION_PORT),
+               SP_ENTRY(HOST_AUTOMOUNTD_PORT),
+               SP_ENTRY(HOST_LOCKD_PORT),
+               SP_ENTRY(HOST_KTRACE_BACKGROUND_PORT),
+               SP_ENTRY(HOST_SEATBELT_PORT),
+               SP_ENTRY(HOST_KEXTD_PORT),
+               SP_ENTRY(HOST_LAUNCHCTL_PORT),
+               SP_ENTRY(HOST_UNFREED_PORT),
+               SP_ENTRY(HOST_AMFID_PORT),
+               SP_ENTRY(HOST_GSSD_PORT),
+               SP_ENTRY(HOST_TELEMETRY_PORT),
+               SP_ENTRY(HOST_ATM_NOTIFICATION_PORT),
+               SP_ENTRY(HOST_COALITION_PORT),
+               SP_ENTRY(HOST_SYSDIAGNOSE_PORT),
+               SP_ENTRY(HOST_XPC_EXCEPTION_PORT),
+               SP_ENTRY(HOST_CONTAINERD_PORT),
+               SP_ENTRY(HOST_NODE_PORT),
+               SP_ENTRY(HOST_RESOURCE_NOTIFY_PORT),
+               SP_ENTRY(HOST_CLOSURED_PORT),
+               SP_ENTRY(HOST_SYSPOLICYD_PORT),
+       };
+
+       return port_for_id_internal(id, hsp_ids,
+                       sizeof(hsp_ids) / sizeof(hsp_ids[0]));
+}
+
+int
+mach_task_special_port_for_id(const char *id)
+{
+       static const char *tsp_ids[] = {
+               SP_ENTRY(TASK_KERNEL_PORT),
+               SP_ENTRY(TASK_HOST_PORT),
+               SP_ENTRY(TASK_NAME_PORT),
+               SP_ENTRY(TASK_BOOTSTRAP_PORT),
+               SP_ENTRY(TASK_SEATBELT_PORT),
+               SP_ENTRY(TASK_ACCESS_PORT),
+               SP_ENTRY(TASK_DEBUG_CONTROL_PORT),
+               SP_ENTRY(TASK_RESOURCE_NOTIFY_PORT),
+#undef SP_ENTRY
+       };
+
+       return port_for_id_internal(id, tsp_ids,
+                       sizeof(tsp_ids) / sizeof(tsp_ids[0]));
+}
diff --git a/libsyscall/os/thread_self_restrict.h b/libsyscall/os/thread_self_restrict.h
new file mode 100644 (file)
index 0000000..153f516
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef OS_THREAD_SELF_RESTRICT_H
+#define OS_THREAD_SELF_RESTRICT_H
+#endif /* OS_THREAD_SELF_RESTRICT_H */
+
index d49087f1481048c693b2e9e930450a2bd624dbf4..474c97aec2976bba7491dc0bd2e3c457e3acac1e 100644 (file)
@@ -29,8 +29,6 @@
 #ifndef OS_TSD_H
 #define OS_TSD_H
 
-#include <stdint.h>
-
 /* The low nine slots of the TSD are reserved for libsyscall usage. */
 #define __TSD_RESERVED_BASE 0
 #define __TSD_RESERVED_MAX 9
 #define __TSD_THREAD_QOS_CLASS 4
 #define __TSD_RETURN_TO_KERNEL 5
 /* slot 6 is reserved for Windows/WINE compatibility reasons */
+#define __TSD_PTR_MUNGE 7
+#define __TSD_MACH_SPECIAL_REPLY 8
 #define __TSD_SEMAPHORE_CACHE 9
 
+#ifndef __ASSEMBLER__
+
+#include <stdint.h>
+
 #ifdef __arm__
 #include <arm/arch.h>
 #endif
 
+extern void _thread_set_tsd_base(void *tsd_base);
+
 __attribute__((always_inline))
 static __inline__ unsigned int
 _os_cpu_number(void)
@@ -147,6 +153,61 @@ _os_tsd_set_direct(unsigned long slot, void *val)
 }
 #endif
 
-extern void _thread_set_tsd_base(void *tsd_base);
+__attribute__((always_inline, pure))
+static __inline__ uintptr_t
+_os_ptr_munge_token(void)
+{
+       return (uintptr_t)_os_tsd_get_direct(__TSD_PTR_MUNGE);
+}
 
-#endif
+__attribute__((always_inline, pure))
+static __inline__ uintptr_t
+_os_ptr_munge(uintptr_t ptr)
+{
+       return ptr ^ _os_ptr_munge_token();
+}
+#define _OS_PTR_MUNGE(_ptr) _os_ptr_munge((uintptr_t)(_ptr))
+#define _OS_PTR_UNMUNGE(_ptr) _os_ptr_munge((uintptr_t)(_ptr))
+
+#else // __ASSEMBLER__
+
+#define _OS_TSD_OFFSET(_key) \
+       ((__POINTER_WIDTH__/__CHAR_BIT__)*_key)
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#define _OS_PTR_MUNGE(_reg) \
+       xor %gs:_OS_TSD_OFFSET(__TSD_PTR_MUNGE), _reg
+
+#define _OS_PTR_UNMUNGE(_reg) \
+       _OS_PTR_MUNGE(_reg)
+
+#elif defined(__arm__) || defined(__arm64__)
+
+#if defined(__arm__)
+
+#define _OS_PTR_MUNGE_TOKEN(_reg, _token) \
+       mrc p15, 0, _reg, c13, c0, 3; \
+       bic     _reg, _reg, #3; \
+       ldr     _token, [ _reg,  #_OS_TSD_OFFSET(__TSD_PTR_MUNGE) ]
+
+#elif defined(__arm64__)
+
+#define _OS_PTR_MUNGE_TOKEN(_reg, _token) \
+       mrs _reg, TPIDRRO_EL0 %% \
+       and     _reg, _reg, #~0x7 %% \
+       ldr     _token, [ _reg,  #_OS_TSD_OFFSET(__TSD_PTR_MUNGE) ]
+
+#endif // defined(__arm64__)
+
+#define _OS_PTR_MUNGE(_regdest, _regsrc, _token) \
+       eor _regdest, _regsrc, _token
+
+#define _OS_PTR_UNMUNGE(_regdest, _regsrc, _token) \
+       _OS_PTR_MUNGE(_regdest, _regsrc, _token)
+
+#endif // defined(__arm__) || defined(__arm64__)
+
+#endif // __ASSEMBLER__
+
+#endif // OS_TSD_H
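Since the munge is an XOR against the per-process token stored in TSD slot __TSD_PTR_MUNGE, applying it twice with the same token round-trips a pointer. A small illustrative sketch (not part of the commit; "tsd.h" is the Libsyscall-internal header shown above):

    #include <stdint.h>
    #include "tsd.h"    /* Libsyscall-internal header */

    static void *
    example_munge_roundtrip(void *p)
    {
            /* Obscure the pointer before parking it somewhere long-lived... */
            uintptr_t stored = _OS_PTR_MUNGE(p);

            /* ...and XOR with the same token again to recover the original value. */
            return (void *)_OS_PTR_UNMUNGE(stored);
    }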
index d7ecb2573d6265d2d51943cb37f50e188cfe8df4..e1e7e001d593911ef82d5f48df121dd0939b2a15 100644 (file)
@@ -95,7 +95,11 @@ __commpage_gettimeofday_internal(struct timeval *tp, uint64_t *tbr_out)
        if (delta >= Ticks_per_sec)
                return(1);
 
-       tp->tv_sec = TimeStamp_sec;
+       if (TimeStamp_sec > __LONG_MAX__) {
+               return(1);
+       }
+
+       tp->tv_sec = (__darwin_time_t)TimeStamp_sec;
 
        over = multi_overflow(Tick_scale, delta);
        if(over){
index 63eb09ca6e90b54fce5e2985319b363a258e247f..9c65ef3ff391cd4beda13eeffc106ca81529c5c9 100644 (file)
@@ -27,6 +27,8 @@
  */
 
 #include "_libkernel_init.h"
+#include "strings.h"
+
 extern _libkernel_functions_t _libkernel_functions;
 extern void mig_os_release(void* ptr);
 
@@ -82,6 +84,157 @@ _pthread_clear_qos_tsd(mach_port_t thread_port)
        }
 }
 
+/*
+ * Upcalls to optimized libplatform string functions
+ */
+
+static const struct _libkernel_string_functions
+               _libkernel_generic_string_functions = {
+       .bzero = _libkernel_bzero,
+       .memmove = _libkernel_memmove,
+       .memset = _libkernel_memset,
+       .strchr = _libkernel_strchr,
+       .strcmp = _libkernel_strcmp,
+       .strcpy = _libkernel_strcpy,
+       .strlcpy = _libkernel_strlcpy,
+       .strlen = _libkernel_strlen,
+};
+static _libkernel_string_functions_t _libkernel_string_functions =
+               &_libkernel_generic_string_functions;
+
+kern_return_t
+__libkernel_platform_init(_libkernel_string_functions_t fns)
+{
+       _libkernel_string_functions = fns;
+       return KERN_SUCCESS;
+}
+
+__attribute__((visibility("hidden")))
+void
+bzero(void *s, size_t n)
+{
+       return _libkernel_string_functions->bzero(s, n);
+}
+
+__attribute__((visibility("hidden")))
+void *
+memchr(const void *s, int c, size_t n)
+{
+       return _libkernel_string_functions->memchr(s, c, n);
+}
+
+__attribute__((visibility("hidden")))
+int
+memcmp(const void *s1, const void *s2, size_t n)
+{
+       return _libkernel_string_functions->memcmp(s1, s2, n);
+}
+
+__attribute__((visibility("hidden")))
+void *
+memmove(void *dst, const void *src, size_t n)
+{
+       return _libkernel_string_functions->memmove(dst, src, n);
+}
+
+__attribute__((visibility("hidden")))
+void *
+memcpy(void *dst, const void *src, size_t n)
+{
+       return _libkernel_string_functions->memmove(dst, src, n);
+}
+
+__attribute__((visibility("hidden")))
+void *
+memccpy(void *__restrict dst, const void *__restrict src, int c, size_t n)
+{
+       return _libkernel_string_functions->memccpy(dst, src, c, n);
+}
+
+__attribute__((visibility("hidden")))
+void *
+memset(void *b, int c, size_t len)
+{
+       return _libkernel_string_functions->memset(b, c, len);
+}
+
+__attribute__((visibility("hidden")))
+char *
+strchr(const char *s, int c)
+{
+       return _libkernel_string_functions->strchr(s, c);
+}
+
+__attribute__((visibility("hidden")))
+char *
+index(const char *s, int c)
+{
+       return _libkernel_string_functions->strchr(s, c);
+}
+
+__attribute__((visibility("hidden")))
+int
+strcmp(const char *s1, const char *s2)
+{
+       return _libkernel_string_functions->strcmp(s1, s2);
+}
+
+__attribute__((visibility("hidden")))
+char *
+strcpy(char * restrict dst, const char * restrict src)
+{
+       return _libkernel_string_functions->strcpy(dst, src);
+}
+
+__attribute__((visibility("hidden")))
+size_t
+strlcat(char * restrict dst, const char * restrict src, size_t maxlen)
+{
+       return _libkernel_string_functions->strlcat(dst, src, maxlen);
+}
+
+__attribute__((visibility("hidden")))
+size_t
+strlcpy(char * restrict dst, const char * restrict src, size_t maxlen)
+{
+       return _libkernel_string_functions->strlcpy(dst, src, maxlen);
+}
+
+__attribute__((visibility("hidden")))
+size_t
+strlen(const char *str)
+{
+       return _libkernel_string_functions->strlen(str);
+}
+
+__attribute__((visibility("hidden")))
+int
+strncmp(const char *s1, const char *s2, size_t n)
+{
+       return _libkernel_string_functions->strncmp(s1, s2, n);
+}
+
+__attribute__((visibility("hidden")))
+char *
+strncpy(char * restrict dst, const char * restrict src, size_t maxlen)
+{
+       return _libkernel_string_functions->strncpy(dst, src, maxlen);
+}
+
+__attribute__((visibility("hidden")))
+size_t
+strnlen(const char *s, size_t maxlen)
+{
+       return _libkernel_string_functions->strnlen(s, maxlen);
+}
+
+__attribute__((visibility("hidden")))
+char *
+strstr(const char *s, const char *find)
+{
+       return _libkernel_string_functions->strstr(s, find);
+}
+
 /*
  * mach/mach.h voucher_mach_msg API
  */
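With this change every string/memory primitive used inside Libsyscall is dispatched through a function-pointer table that initially points at the generic _libkernel_* implementations; __libkernel_platform_init lets libplatform substitute optimized routines at process start. A rough sketch of the registration pattern (struct trimmed, and the "optimized" implementations are placeholders, not the real libplatform entry points):

#include <stddef.h>
#include <string.h>

/* Mirrors the shape of the version-1 table (fields trimmed for brevity). */
struct string_fns {
    unsigned long version;
    void  (*bzero)(void *, size_t);
    void *(*memmove)(void *, const void *, size_t);
    size_t (*strlen)(const char *);
};

/* Hypothetical optimized implementations that a platform library would provide. */
static void   fast_bzero(void *s, size_t n)                  { memset(s, 0, n); }
static void  *fast_memmove(void *d, const void *s, size_t n) { return memmove(d, s, n); }
static size_t fast_strlen(const char *s)                     { return strlen(s); }

static const struct string_fns platform_fns = {
    .version = 1,
    .bzero   = fast_bzero,
    .memmove = fast_memmove,
    .strlen  = fast_strlen,
};

/* Stand-in for __libkernel_platform_init(): the library just keeps the pointer. */
static const struct string_fns *current_fns;

static int platform_init(const struct string_fns *fns)
{
    current_fns = fns;
    return 0;
}

int main(void) {
    char buf[8] = "abcdefg";
    platform_init(&platform_fns);
    current_fns->bzero(buf, sizeof buf);
    return (int)current_fns->strlen(buf);   /* 0 */
}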
index 3eb67853d09d5ed6d3fb888da03b1e780adf6271..31e6cb47accec4d088555aaa5c844a0b656959d8 100644 (file)
@@ -46,6 +46,7 @@ void* (*_dlsym)(void*, const char*) __attribute__((visibility("hidden")));
 __attribute__((visibility("hidden")))
 _libkernel_functions_t _libkernel_functions;
 
+
 void
 __libkernel_init(_libkernel_functions_t fns,
                const char *envp[] __attribute__((unused)),
index 68a7067e4f8c5ab41349a2cfb762c76f08b1ff5a..b081ebc90ef45ac2038c9ecccf14c6e72a9897ec 100644 (file)
@@ -67,6 +67,28 @@ typedef const struct _libkernel_functions {
        /* Subsequent versions must only add pointers! */
 } *_libkernel_functions_t;
 
+typedef const struct _libkernel_string_functions {
+       /* The following functions are included in version 1 of this structure */
+       unsigned long version;
+       void (*bzero)(void *s, size_t n);
+       void * (*memchr)(const void *s, int c, size_t n);
+       int (*memcmp)(const void *s1, const void *s2, size_t n);
+       void * (*memmove)(void *dst, const void *src, size_t n);
+       void * (*memccpy)(void *__restrict dst, const void *__restrict src, int c, size_t n);
+       void * (*memset)(void *b, int c, size_t len);
+       char * (*strchr)(const char *s, int c);
+       int (*strcmp)(const char *s1, const char *s2);
+       char * (*strcpy)(char * restrict dst, const char * restrict src);
+       size_t (*strlcat)(char * restrict dst, const char * restrict src, size_t maxlen);
+       size_t (*strlcpy)(char * restrict dst, const char * restrict src, size_t maxlen);
+       size_t (*strlen)(const char *str);
+       int (*strncmp)(const char *s1, const char *s2, size_t n);
+       char * (*strncpy)(char * restrict dst, const char * restrict src, size_t maxlen);
+       size_t (*strnlen)(const char *s, size_t maxlen);
+       char * (*strstr)(const char *s, const char *find);
+       /* Subsequent versions must only add pointers! */
+} *_libkernel_string_functions_t;
+
 typedef const struct _libkernel_voucher_functions {
        /* The following functions are included in version 1 of this structure */
        unsigned long version;
@@ -83,6 +105,8 @@ struct ProgramVars; /* forward reference */
 void __libkernel_init(_libkernel_functions_t fns, const char *envp[],
                const char *apple[], const struct ProgramVars *vars);
 
+kern_return_t __libkernel_platform_init(_libkernel_string_functions_t fns);
+
 kern_return_t __libkernel_voucher_init(_libkernel_voucher_functions_t fns);
 
 #endif // __LIBKERNEL_INIT_H
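Because these tables are handed across a library boundary by pointer, the comment "Subsequent versions must only add pointers!" is load-bearing: a newer consumer has to check the version field before touching members that an older provider never filled in. A minimal illustration of that convention (names hypothetical):

#include <stdio.h>

struct callbacks_v2 {
    unsigned long version;
    void (*old_fn)(void);      /* present since version 1 */
    void (*new_fn)(void);      /* appended in version 2 */
};

static void old_impl(void) { puts("old"); }

int main(void) {
    /* A version-1 provider only fills in the original fields. */
    struct callbacks_v2 cbs = { .version = 1, .old_fn = old_impl };

    cbs.old_fn();
    /* Guard appended members by version so a v1 table is still safe to use. */
    if (cbs.version >= 2 && cbs.new_fn) {
        cbs.new_fn();
    }
    return 0;
}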
index ba535082493160fc34cfb0b82abfca8110ec1f04..fc98ea7aedc39b370e7b5debe6d29ca006cee332 100644 (file)
@@ -28,8 +28,6 @@ int __FCNTL(int, int, void *);
  * Stub function to account for the differences in the size of the third
  * argument when int and void * are different sizes. Also add pthread
  * cancelability.
- *
- * This is for LP64 only.
  */
 int
 fcntl(int fd, int cmd, ...)
index e5db000a647008584ec5e1785451d707e681df02..3354657ed9e101465ebb6b0c4c035773611c050e 100644 (file)
@@ -21,7 +21,7 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 
-#if defined(__LP64__) || defined(__arm__)
+#if !defined(__i386__)
 
 #include <fcntl.h>
 #define __FCNTL        __fcntl
index f31bff7ef3e9bdd70b26e62ace931f5bbd5eb24b..830a79f5dfe8eab7f12bd95eab9fd7e236347da6 100644 (file)
@@ -21,7 +21,7 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 
-#if defined(__LP64__) || defined(__arm__)
+#if !defined(__i386__)
 
 #undef __DARWIN_NON_CANCELABLE
 #define __DARWIN_NON_CANCELABLE 1
index 627da2261984a14f72b85ad03a473e6fd36c8f50..ecc36b1ecbcb56ae8d72325d4750dcb6025fd28a 100644 (file)
@@ -29,7 +29,7 @@
 
 /* Syscall entry points */
 int __coalition(uint32_t operation, uint64_t *cid, uint32_t flags);
-int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t bufsize);
+int __coalition_info(uint32_t operation, uint64_t *cid, void *buffer, size_t *bufsize);
 
 int coalition_create(uint64_t *cid_out, uint32_t flags)
 {
index 5833216228be4eef6adb9c4a8750d11b84f67537..335717f2b7c6af5ef33a3795d09393a8f280794c 100644 (file)
@@ -33,7 +33,7 @@ getiopolicy_np(int iotype, int scope)
        int policy, error;
        struct _iopol_param_t iop_param;
 
-       if (iotype != IOPOL_TYPE_DISK ||
+       if ((iotype != IOPOL_TYPE_DISK && iotype != IOPOL_TYPE_VFS_ATIME_UPDATES) ||
                (scope != IOPOL_SCOPE_PROCESS && scope != IOPOL_SCOPE_THREAD)) {
                errno = EINVAL;
                policy = -1;
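getiopolicy_np() previously accepted only IOPOL_TYPE_DISK; the relaxed check also lets callers query the new VFS atime-updates policy. A short usage sketch, assuming a macOS SDK that defines IOPOL_TYPE_VFS_ATIME_UPDATES:

#include <stdio.h>
#include <sys/resource.h>

int main(void) {
    /* With the relaxed check, the atime-updates policy can be queried
     * alongside the disk I/O policy (Darwin-specific API). */
    int p = getiopolicy_np(IOPOL_TYPE_VFS_ATIME_UPDATES, IOPOL_SCOPE_PROCESS);
    if (p == -1) {
        perror("getiopolicy_np");
        return 1;
    }
    printf("atime update policy: %d\n", p);
    return 0;
}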
index 8414c1dfab8db510919f3e4e6ddf982917df9cc9..70271d320fe3d22c1c89230fb235f3b8f2ba92f9 100644 (file)
@@ -35,7 +35,7 @@ int _cpu_capabilities = 0;
 void
 _init_cpu_capabilities( void )
 {
-       _cpu_capabilities = _get_cpu_capabilities();
+       _cpu_capabilities = (int)_get_cpu_capabilities();
 }
 
 #elif defined(__arm__) || defined(__arm64__)
index eced7e7e16792c12874796e9f885b4eee211dbff..a0f12a27e1ac8cb62cee134b25c9637dafabfe27 100644 (file)
@@ -21,7 +21,7 @@
  * @APPLE_LICENSE_HEADER_END@
  */
 
-#if defined(__LP64__) || defined(__arm__)
+#if !defined(__i386__)
 
 #include <sys/ioctl.h>
 #include <stdarg.h>
@@ -29,8 +29,6 @@
 int __ioctl(int, unsigned long, void *);
 /*
  * Stub function to account for the third argument being void *
- *
- * This is for LP64 only.
  */
 int
 ioctl(int d, unsigned long request, ...)
index 39958a1a520a6a1ee81fd4136eae3421524dbece..5cc1f7258413f80badb37845ef44f6f31ca4887f 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2006, 2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2006-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
@@ -499,7 +499,7 @@ proc_set_cpumon_params_fatal(pid_t pid, int percentage, int interval)
                return (ret);
        }
        
-       if ((ret = proc_rlimit_control(pid, RLIMIT_CPU_USAGE_MONITOR, CPUMON_MAKE_FATAL)) != 0) {
+       if ((ret = proc_rlimit_control(pid, RLIMIT_CPU_USAGE_MONITOR, (void *)(uintptr_t)CPUMON_MAKE_FATAL)) != 0) {
                /* Failed to set termination, back out the CPU monitor settings. */
                (void)proc_disable_cpumon(pid);
        }
@@ -592,6 +592,12 @@ proc_setcpu_percentage(pid_t pid, int action, int percentage)
                return(errno);
 }
 
+int
+proc_reset_footprint_interval(pid_t pid)
+{
+       return (proc_rlimit_control(pid, RLIMIT_FOOTPRINT_INTERVAL, (void *)(uintptr_t)FOOTPRINT_INTERVAL_RESET));
+}
+
 int
 proc_clear_cpulimits(pid_t pid)
 {
@@ -712,7 +718,7 @@ proc_pidbind(int pid, uint64_t threadid, int bind)
 int
 proc_can_use_foreground_hw(int pid, uint32_t *reason)
 {
-       return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0,  NULL,  reason, sizeof(*reason));
+       return __proc_info(PROC_INFO_CALL_CANUSEFGHW, pid, 0,  0,  reason, sizeof(*reason));
 }
 #endif /* TARGET_OS_EMBEDDED */
 
index f18366427aa8ad1126f905925578d7f9387890bc..513fda9ba682ae0e2312e97f8406df3135c4988c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  * 
@@ -150,6 +150,8 @@ int proc_set_wakemon_params(pid_t pid, int rate_hz, int flags) __OSX_AVAILABLE_S
 int proc_get_wakemon_params(pid_t pid, int *rate_hz, int *flags) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0);
 int proc_disable_wakemon(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_7_0);
 
+int proc_reset_footprint_interval(pid_t pid) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0);
+
 /* request trace buffer collection */
 int proc_trace_log(pid_t pid, uint64_t uniqueid) __OSX_AVAILABLE_STARTING(__MAC_10_10, __IPHONE_8_0);
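proc_reset_footprint_interval() is a thin wrapper that asks the kernel, via proc_rlimit_control with RLIMIT_FOOTPRINT_INTERVAL, to reset interval-based memory-footprint accounting for a process. A minimal caller might look like this (private libproc interface; declaration repeated here only for the sketch):

#include <stdio.h>
#include <unistd.h>

/* Declared in the (private) libproc header shown above. */
extern int proc_reset_footprint_interval(pid_t pid);

int main(void) {
    /* Reset interval footprint accounting for this process; a non-zero
     * return is an errno-style failure. */
    int ret = proc_reset_footprint_interval(getpid());
    printf("proc_reset_footprint_interval: %d\n", ret);
    return ret;
}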
 
diff --git a/libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum.s
new file mode 100644 (file)
index 0000000..a32c0a2
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  extern int cpu_copy_in_cksum(const void *src, void *dst, uint32_t len,
+ *     uint32_t initial_sum);
+ *
+ *  input :
+ *      src : source starting address
+ *      dst : destination starting address
+ *      len : byte stream length
+ *      initial_sum : 32-bit sum
+ *
+ *  output :
+ *      the source byte stream is copied into the destination buffer
+ *      the function returns the final 16-bit checksum
+ */
+
diff --git a/libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c b/libsyscall/wrappers/skywalk/cpu_copy_in_cksum_gen.c
new file mode 100644 (file)
index 0000000..d21226b
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
diff --git a/libsyscall/wrappers/skywalk/cpu_in_cksum.s b/libsyscall/wrappers/skywalk/cpu_in_cksum.s
new file mode 100644 (file)
index 0000000..9410d93
--- /dev/null
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ *  extern uint32_t os_cpu_in_cksum(const void *data, uint32_t len,
+ *      uint32_t initial_sum);
+ *
+ *  input :
+ *      data : starting address
+ *      len : byte stream length
+ *      initial_sum : 32-bit sum
+ *
+ *  output :
+ *     This function returns the partial 16-bit checksum accumulated in
+ *     a 32-bit variable (without 1's complement); caller is responsible
+ *     for folding the 32-bit sum into 16-bit and performing the 1's
+ *     complement if applicable
+ */
+
diff --git a/libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c b/libsyscall/wrappers/skywalk/cpu_in_cksum_gen.c
new file mode 100644 (file)
index 0000000..d21226b
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
diff --git a/libsyscall/wrappers/skywalk/os_channel.c b/libsyscall/wrappers/skywalk/os_channel.c
new file mode 100644 (file)
index 0000000..4aee6e0
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
diff --git a/libsyscall/wrappers/skywalk/os_nexus.c b/libsyscall/wrappers/skywalk/os_nexus.c
new file mode 100644 (file)
index 0000000..121ec4a
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2015-2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
diff --git a/libsyscall/wrappers/skywalk/os_packet.c b/libsyscall/wrappers/skywalk/os_packet.c
new file mode 100644 (file)
index 0000000..6eda01c
--- /dev/null
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2015-2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
index 3a149790c8d0f9c72ee9f2bc24369a3f28cb9e02..20083809a4358086a258c044cef1324f6d0164d2 100644 (file)
@@ -123,6 +123,9 @@ posix_spawnattr_init(posix_spawnattr_t *attr)
                (*psattrp)->psa_memlimit_active = -1;
                (*psattrp)->psa_memlimit_inactive = -1;
 
+               /* Default is no thread limit */
+               (*psattrp)->psa_thread_limit = 0;
+
                /* Default is no CPU usage monitor active. */
                (*psattrp)->psa_cpumonitor_percent = 0;
                (*psattrp)->psa_cpumonitor_interval = 0;
@@ -150,6 +153,8 @@ posix_spawnattr_init(posix_spawnattr_t *attr)
 
                /* Default is no change to role */
                (*psattrp)->psa_darwin_role = POSIX_SPAWN_DARWIN_ROLE_NONE;
+
+               (*psattrp)->psa_max_addr = 0;
        }
 
        return (err);
@@ -1415,6 +1420,23 @@ posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr,
        return (0);
 }
 
+int
+posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr,
+       int thread_limit)
+{
+       _posix_spawnattr_t psattr;
+
+       if (attr == NULL || *attr == NULL)
+               return EINVAL;
+
+       psattr = *(_posix_spawnattr_t *)attr;
+
+       psattr->psa_thread_limit = thread_limit;
+
+       return (0);
+
+}
+
 
 /*
  * posix_spawnattr_set_importancewatch_port_np
@@ -1752,7 +1774,20 @@ posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict attr,
        return 0;
 }
 
+int
+posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr)
+{
+       _posix_spawnattr_t psattr;
 
+       if (attr == NULL || *attr == NULL) {
+               return EINVAL;
+       }
+
+       psattr = *(_posix_spawnattr_t *)attr;
+       psattr->psa_max_addr = max_addr;
+
+       return 0;
+}
 
 /*
  * posix_spawn
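The two new spawn attributes let a parent cap the number of threads in the child (psa_thread_limit) and constrain the top of its usable address space (psa_max_addr). A sketch of how they might be used together with posix_spawn, assuming the private declarations added in the spawn_private.h hunk that follows (limit values purely illustrative):

#include <spawn.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/types.h>

/* Private declarations from spawn_private.h, repeated here for the sketch. */
extern int posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t *attr, int thread_limit);
extern int posix_spawnattr_set_max_addr_np(const posix_spawnattr_t *attr, uint64_t max_addr);

int main(void) {
    posix_spawnattr_t attr;
    posix_spawnattr_init(&attr);

    /* Cap the child at 64 threads and keep its VM below 4 GiB (illustrative). */
    posix_spawnattr_set_threadlimit_ext(&attr, 64);
    posix_spawnattr_set_max_addr_np(&attr, 0x100000000ULL);

    char *argv[] = { "/usr/bin/true", NULL };
    char *envp[] = { NULL };
    pid_t pid;
    int err = posix_spawn(&pid, argv[0], NULL, &attr, argv, envp);

    posix_spawnattr_destroy(&attr);
    printf("posix_spawn: %d\n", err);
    return err;
}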
index bebd58e60dae62d0bd915336b278fda90280a89c..e0ea9d495a5ca6d85694f7cbf0c355a073ba8e65 100644 (file)
@@ -48,6 +48,9 @@ int     posix_spawnattr_setjetsam(posix_spawnattr_t * __restrict attr,
 int     posix_spawnattr_setjetsam_ext(posix_spawnattr_t * __restrict attr,
                   short flags, int priority, int memlimit_active, int memlimit_inactive) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
 
+int    posix_spawnattr_set_threadlimit_ext(posix_spawnattr_t * __restrict attr,
+               int thread_limit);
+
 #define POSIX_SPAWN_IMPORTANCE_PORT_COUNT 32
 int    posix_spawnattr_set_importancewatch_port_np(posix_spawnattr_t * __restrict attr,
                  int count, mach_port_t portarray[])  __OSX_AVAILABLE_STARTING(__MAC_10_9, __IPHONE_6_0);
@@ -68,5 +71,6 @@ int     posix_spawnattr_set_persona_np(const posix_spawnattr_t * __restrict, uid
 int     posix_spawnattr_set_persona_uid_np(const posix_spawnattr_t * __restrict, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
 int     posix_spawnattr_set_persona_gid_np(const posix_spawnattr_t * __restrict, gid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
 int     posix_spawnattr_set_persona_groups_np(const posix_spawnattr_t * __restrict, int, gid_t *, uid_t) __OSX_AVAILABLE_STARTING(__MAC_10_11, __IPHONE_9_0);
+int     posix_spawnattr_set_max_addr_np(const posix_spawnattr_t * __restrict attr, uint64_t max_addr) __OSX_AVAILABLE_STARTING(__MAC_10_14, __IPHONE_12_0);
 
 #endif /* !defined _SPAWN_PRIVATE_H_*/
index 0e065edd2f93cd408a0fe4588d6bb59f6b74d9ef..3a7489e02ac5b5afb59186e01a88dd9bcbf8874b 100644 (file)
@@ -140,9 +140,9 @@ stackshot_capture_with_config(stackshot_config_t *stackshot_config)
                return EINVAL;
        }
 
-       s_config->sc_out_buffer_addr = &s_config->sc_buffer;
-       s_config->sc_out_size_addr = &s_config->sc_size;
-       ret = __stack_snapshot_with_config(STACKSHOT_CONFIG_TYPE, s_config, sizeof(stackshot_config_t));
+       s_config->sc_out_buffer_addr = (uintptr_t)&s_config->sc_buffer;
+       s_config->sc_out_size_addr = (uintptr_t)&s_config->sc_size;
+       ret = __stack_snapshot_with_config(STACKSHOT_CONFIG_TYPE, (uintptr_t)s_config, sizeof(stackshot_config_t));
        
        if (ret != 0) {
                ret = errno;
index 8c4d3e05e528258b126145f25db19cbf517c92cd..fcbc147daac53c18fc2bd8e81764fb35ff85d5cb 100644 (file)
@@ -31,8 +31,7 @@
 
 __attribute__((visibility("hidden")))
 char *
-index
-(const char *p, int ch)
+_libkernel_strchr(const char *p, int ch)
 {
        char c;
 
index ef30a90ba3897563673db14ac5f6684955efe861..53d527b6ac1cf65acd39de57877fcfa466d8061a 100644 (file)
@@ -48,7 +48,7 @@ typedef       int word;               /* "word" used for optimal copy speed */
  */
 
 __attribute__((visibility("hidden")))
-void * memcpy(void *dst0, const void *src0, size_t length)
+void * _libkernel_memmove(void *dst0, const void *src0, size_t length)
 {
        char *dst = dst0;
        const char *src = src0;
@@ -113,13 +113,6 @@ done:
        return (dst0);
 }
 
-__attribute__((visibility("hidden")))
-void *
-memmove(void *s1, const void *s2, size_t n)
-{
-       return memcpy(s1, s2, n);
-}
-
 __attribute__((visibility("hidden")))
 void
 bcopy(const void *s1, void *s2, size_t n)
index cab6587d6f059317ae0a244c941ba5693ba67e93..82c1eb0a9125370a2e49e217fa00558bba96798a 100644 (file)
@@ -40,9 +40,9 @@
 // a recursive call to bzero.
 __attribute__((visibility("hidden")))
 void
-bzero(void *dst0, size_t length)
+_libkernel_bzero(void *dst0, size_t length)
 {
-    return (void)memset(dst0, 0, length);
+    return (void)_libkernel_memset(dst0, 0, length);
 }
 
 #define        RETURN  return (dst0)
@@ -51,7 +51,7 @@ bzero(void *dst0, size_t length)
 
 __attribute__((visibility("hidden")))
 void *
-memset(void *dst0, int c0, size_t length)
+_libkernel_memset(void *dst0, int c0, size_t length)
 {
        size_t t;
        u_int c;
index cffe0788323b0c2e5ef329dd11e6b96fe9e8f868..cfe403516e85655a54737c6bd0358d99f10b49f9 100644 (file)
@@ -37,7 +37,7 @@
  */
 __attribute__((visibility("hidden")))
 int
-strcmp(const char *s1, const char *s2)
+_libkernel_strcmp(const char *s1, const char *s2)
 {
        while (*s1 == *s2++)
                if (*s1++ == '\0')
index 026d098c764a47d3e5659a195e9ded74e823a3ad..e67282e069ec9e96a76ac106208614d848dc2474 100644 (file)
 
 __attribute__((visibility("hidden")))
 char *
-strcpy(char * restrict dst, const char * restrict src) {
-       const size_t length = strlen(src);
+_libkernel_strcpy(char * restrict dst, const char * restrict src) {
+       const size_t length = _libkernel_strlen(src);
     //  The stpcpy() and strcpy() functions copy the string src to dst
     //  (including the terminating '\0' character).
-    memcpy(dst, src, length+1);
+    _libkernel_memmove(dst, src, length+1);
     //  The strcpy() and strncpy() functions return dst.
     return dst;
 }
index 5407112604557a5b3afac477778c6628d38af9fc..a8222044ae2205bdba386c48b565c37d2dacf8b5 100644 (file)
@@ -72,8 +72,17 @@ char *strsep(char **, const char *);
 void    bcopy(const void *, void *, size_t);
 void    bzero(void *, size_t);
 char   *index(const char *, int);
+char   *strchr(const char *, int);
 
 #include "string.h"
 
-#endif  /* _STRINGS_H_ */
+void    *_libkernel_memmove(void *, const void *, size_t);
+void    *_libkernel_memset(void *, int, size_t);
+int      _libkernel_strcmp(const char *, const char *);
+char    *_libkernel_strcpy(char *, const char *);
+size_t   _libkernel_strlen(const char *);
+size_t   _libkernel_strlcpy(char *, const char *, size_t);
+void     _libkernel_bzero(void *, size_t);
+char    *_libkernel_strchr(const char *, int);
 
+#endif  /* _STRINGS_H_ */
index 72e4b2bd8b3c3a448ae7ee66581d48ef9ae8e3bc..1be4fe3330246a5f2c5572521d5e121a7d04a445 100644 (file)
 
 __attribute__((visibility("hidden")))
 size_t
-strlcpy(char * restrict dst, const char * restrict src, size_t maxlen) {
-    const size_t srclen = strlen(src);
+_libkernel_strlcpy(char * restrict dst, const char * restrict src, size_t maxlen) {
+    const size_t srclen = _libkernel_strlen(src);
     if (srclen < maxlen) {
-        memcpy(dst, src, srclen+1);
+        _libkernel_memmove(dst, src, srclen+1);
     } else if (maxlen != 0) {
-        memcpy(dst, src, maxlen-1);
+        _libkernel_memmove(dst, src, maxlen-1);
         dst[maxlen-1] = '\0';
     }
     return srclen;
index 6854e0343480ade1ad782571eb433779c38c3dfd..9054ac39e165cf005ba0fc1729e143b4bbed6de3 100644 (file)
@@ -75,7 +75,7 @@ static const unsigned long mask80 = 0x8080808080808080;
 
 __attribute__((visibility("hidden")))
 size_t
-strlen(const char *str)
+_libkernel_strlen(const char *str)
 {
        const char *p;
        const unsigned long *lp;
index 3bb8a6683cf76a3c06cce268e859fbebd1c9759e..6277f4be021c61cb6b3b723f1e90ccc4af9e4739 100644 (file)
@@ -73,7 +73,7 @@ static void abort_with_payload_wrapper_internal(uint32_t reason_namespace, uint6
 
        /* If sending a SIGABRT failed, we fall back to SIGKILL */
        terminate_with_payload(getpid(), reason_namespace, reason_code, payload, payload_size,
-                       reason_string, reason_flags);
+                       reason_string, reason_flags | OS_REASON_FLAG_ABORT);
 
        __builtin_unreachable();
 }
index e1181d251d28d922e38cb10804709ee95f0d4ae9..d2a9f3268865eb99a05b98845fc4bd3f98579df8 100644 (file)
@@ -74,8 +74,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt
 #if defined(__i386__)
     if (sp) *sp = state.__esp;
 
-    push_register_value(state.__eip);
-
     push_register_value(state.__eax);
     push_register_value(state.__ebx);
     push_register_value(state.__ecx);
@@ -91,8 +89,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt
                *sp = 0;
     }
 
-    push_register_value(state.__rip);
-
     push_register_value(state.__rax);
     push_register_value(state.__rbx);
     push_register_value(state.__rcx);
@@ -110,7 +106,6 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt
 #elif defined(__arm__)
     if (sp) *sp = state.__sp;
 
-    push_register_value(state.__pc);
     push_register_value(state.__lr);
 
     for (int i = 0; i < 13; i++){
@@ -118,14 +113,14 @@ thread_get_register_pointer_values(thread_t thread, uintptr_t *sp, size_t *lengt
     }
 #elif defined(__arm64__)
     if (sp) {
-       if (state.__sp > 128)
-               *sp = state.__sp - 128 /* redzone */;
+       uintptr_t __sp = arm_thread_state64_get_sp(state);
+       if (__sp > 128)
+               *sp = __sp - 128 /* redzone */;
        else
                *sp = 0;
     }
 
-    push_register_value(state.__pc);
-    push_register_value(state.__lr);
+    push_register_value(arm_thread_state64_get_lr(state));
 
     for (int i = 0; i < 29; i++){
         push_register_value(state.__x[i]);
index 6a22a53958b42ccc894b4a08e41796123b4c8ebc..fae37a483fc0f5bed890da68789442afd2f8b36e 100644 (file)
  *     sem_t* __sem_open(const char *name, int oflag, int mode, int value);
  */
 MI_ENTRY_POINT(_sem_open) 
+       ARM64_STACK_PROLOG
        PUSH_FRAME
+#if __LP64__
        ldp     x2, x3, [fp, #16]
+#else
+       ldp     w2, w3, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___sem_open)
+#if !__LP64__
+       /* xnu returns a 64-bit '-1' on failure, but pointers must have the high
+        * 32-bits set to zero. The following instruction is equivalent to
+        * masking off the top 32-bits.
+        */
+       mov w0, w0
+#endif
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  *     int open(const char *name, int oflag, ...);
  *     int __open(const char *name, int oflag, int mode, int value);
  */
 MI_ENTRY_POINT(_open) 
+       ARM64_STACK_PROLOG
        PUSH_FRAME
+#if __LP64__
        ldr     x2, [fp, #16]
+#else
+       ldr     w2, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___open)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  *     int open_nocancel(const char *name, int oflag, ...);
  *     int __open_nocancel(const char *name, int oflag, int mode);
  */
 MI_ENTRY_POINT(_open$NOCANCEL) 
+       ARM64_STACK_PROLOG
        PUSH_FRAME
+#if __LP64__
        ldr     x2, [fp, #16]
+#else
+       ldr     w2, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___open_nocancel)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  *     int openat(int fd,const char *name, int oflag, ...);
  *     int __openat(int fd, const char *name, int oflag, int mode, int value);
  */
 MI_ENTRY_POINT(_openat)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
+#if __LP64__
        ldr     x3, [fp, #16]
+#else
+       ldr     w3, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___openat)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  *     int openat_nocancel(int fd, const char *name, int oflag, ...);
  *     int __openat_nocancel(int fd, const char *name, int oflag, int mode);
  */
 MI_ENTRY_POINT(_openat$NOCANCEL)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
+#if __LP64__
        ldr     x3, [fp, #16]
+#else
+       ldr     w3, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___openat_nocancel)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /* 
  * int shm_open(const char *, int, ...);
  * int __shm_open(const char*, int oflag, int mode);
  */
 MI_ENTRY_POINT(_shm_open)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       ldr x2, [fp, #16]
+#if __LP64__
+       ldr     x2, [fp, #16]
+#else
+       ldr     w2, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___shm_open)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int msgsys(int, ...);
  * int __msgsys(int which, int a2, int a3, int a4, int a5);
  */
 MI_ENTRY_POINT(_msgsys)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       ldp x1, x2, [fp, #16]
-       ldp x3, x4, [fp, #32]
+#if __LP64__
+       ldp     x1, x2, [fp, #16]
+       ldp     x3, x4, [fp, #32]
+#else
+       ldp     w1, w2, [fp, #16]
+       ldp     w3, w4, [fp, #24]
+#endif
        MI_CALL_EXTERNAL(___msgsys)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int semsys(int, ...);
  * int __semsys(int which, int a2, int a3, int a4, int a5);
  */
 MI_ENTRY_POINT(_semsys)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       ldp x1, x2, [fp, #16]
-       ldp x3, x4, [fp, #32]
+#if __LP64__
+       ldp     x1, x2, [fp, #16]
+       ldp     x3, x4, [fp, #32]
+#else
+       ldp     w1, w2, [fp, #16]
+       ldp     w3, w4, [fp, #24]
+#endif
        MI_CALL_EXTERNAL(___semsys)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /* 
  * int semctl(int, int, int, ...);
  * int __semctl(int semid, int semnum, int cmd, semun_t arg);
  */
  MI_ENTRY_POINT(_semctl)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       ldr x3, [fp, #16]
+#if __LP64__
+       ldr     x3, [fp, #16]
+#else
+       ldr     w3, [fp, #16]
+#endif
        MI_CALL_EXTERNAL(___semctl)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /* 
  * int shmsys(int, ...);
  * int __shmsys(int which, int a2, int a3, int a4);
  */
  MI_ENTRY_POINT(_shmsys)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       ldp x1, x2, [fp, #16]
-       ldr x3, [fp, #32]
+#if __LP64__
+       ldp     x1, x2, [fp, #16]
+       ldr     x3, [fp, #32]
+#else
+       ldp     w1, w2, [fp, #16]
+       ldr     w3, [fp, #24]
+#endif
        MI_CALL_EXTERNAL(___shmsys)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 #endif /* defined(__arm64__) */
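On ILP32 arm64 targets (the !__LP64__ paths added above) the kernel still reports failure as a 64-bit -1, so the sem_open stub adds "mov w0, w0" to clear the upper 32 bits before the value is treated as a pointer. The cast below shows the equivalent effect in C (illustration only):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* The kernel reports sem_open() failure as a 64-bit -1 in x0. */
    uint64_t x0 = (uint64_t)-1;

    /* In an ILP32 process, valid pointers must have the top 32 bits clear;
     * "mov w0, w0" zero-extends w0 into x0, the same effect as this cast. */
    uint64_t masked = (uint64_t)(uint32_t)x0;   /* 0x00000000ffffffff */

    printf("raw: %#llx  masked: %#llx\n",
           (unsigned long long)x0, (unsigned long long)masked);
    return 0;
}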
index 81dfc8a8a7baddc030f51a4f55b76b17ee7d7b26..a4e17d689ec971d5ec83e097c8914f35fed9b8c6 100755 (executable)
@@ -94,6 +94,17 @@ my %TypeBytes = (
     'uuid_t'           => 4,
 );
 
+# Types that potentially have different sizes in user-space compared to
+# kernel-space as well as whether the value should be sign/zero-extended when
+# passing the user/kernel boundary.
+my %UserKernelMismatchTypes = (
+    'long'          => 'SIGN_EXTEND',
+    'size_t'        => 'ZERO_EXTEND',
+    'u_long'        => 'ZERO_EXTEND',
+    'user_size_t'   => 'ZERO_EXTEND',
+    'user_ssize_t'  => 'SIGN_EXTEND'
+);
+
 # Moving towards storing all data in this hash, then we always know
 # if data is aliased or not, or promoted or not.
 my %Symbols = (
@@ -106,6 +117,7 @@ my %Symbols = (
         nargs => 0,
         bytes => 0,
         aliases => {},
+        mismatch_args => {}, # Arguments that might need to be zero/sign-extended
     },
 );
 
@@ -178,12 +190,15 @@ sub readMaster {
         $args =~ s/\s+$//;
         my $argbytes = 0;
         my $nargs = 0;
+        my %mismatch_args;
         if($args ne '' && $args ne 'void') {
             my @a = split(',', $args);
             $nargs = scalar(@a);
-            # Calculate the size of all the arguments (only used for i386)
+            my $index = 0;
             for my $type (@a) {
                 $type =~ s/\s*\w+$//; # remove the argument name
+
+                # Calculate the size of all the arguments (only used for i386)
                 if($type =~ /\*$/) {
                     $argbytes += 4; # a pointer type
                 } else {
@@ -192,6 +207,12 @@ sub readMaster {
                     die "$MyName: $name: unknown type '$type'\n" unless defined($b);
                     $argbytes += $b;
                 }
+                # Determine which arguments might need to be zero/sign-extended
+                if(exists $UserKernelMismatchTypes{$type}) {
+                    $mismatch_args{$index} = $UserKernelMismatchTypes{$type};
+                }
+
+                $index++;
             }
         }
         $Symbols{$name} = {
@@ -203,6 +224,7 @@ sub readMaster {
             nargs => $nargs,
             bytes => $argbytes,
             aliases => {},
+            mismatch_args => \%mismatch_args, # Arguments that might need to be zero/sign-extended
             except => [],
         };
     }
@@ -301,23 +323,47 @@ sub writeStubForSymbol {
     my ($f, $symbol) = @_;
     
     my @conditions;
+    my $has_arm64 = 0;
     for my $subarch (@Architectures) {
         (my $arch = $subarch) =~ s/arm(v.*)/arm/;
         $arch =~ s/x86_64(.*)/x86_64/;
         $arch =~ s/arm64(.*)/arm64/;
         push(@conditions, "defined(__${arch}__)") unless grep { $_ eq $arch } @{$$symbol{except}};
+
+        if($arch eq 'arm64') {
+            $has_arm64 = 1 unless grep { $_ eq $arch } @{$$symbol{except}};
+        }
     }
 
        my %is_cancel;
        for (@Cancelable) { $is_cancel{$_} = 1 };
-    
+
     print $f "#define __SYSCALL_32BIT_ARG_BYTES $$symbol{bytes}\n";
     print $f "#include \"SYS.h\"\n\n";
+
     if (scalar(@conditions)) {
         printf $f "#ifndef SYS_%s\n", $$symbol{syscall};
         printf $f "#error \"SYS_%s not defined. The header files libsyscall is building against do not match syscalls.master.\"\n", $$symbol{syscall};
-        printf $f "#endif\n\n";    
-        my $nc = ($is_cancel{$$symbol{syscall}} ? "cerror" : "cerror_nocancel");
+        printf $f "#endif\n\n";
+    }
+
+    my $nc = ($is_cancel{$$symbol{syscall}} ? "cerror" : "cerror_nocancel");
+
+    if($has_arm64) {
+        printf $f "#if defined(__arm64__)\n";
+        printf $f "MI_ENTRY_POINT(%s)\n", $$symbol{asm_sym};
+        if(keys %{$$symbol{mismatch_args}}) {
+            while(my($argnum, $extend) = each %{$$symbol{mismatch_args}}) {
+                printf $f "%s(%d)\n", $extend, $argnum;
+            }
+        }
+
+        printf $f "SYSCALL_NONAME(%s, %d, %s)\n", $$symbol{syscall}, $$symbol{nargs}, $nc;
+        printf $f "ret\n";
+        printf $f "#else\n";
+    }
+
+    if (scalar(@conditions)) {
         printf $f "#if " . join(" || ", @conditions) . "\n";
         printf $f "__SYSCALL2(%s, %s, %d, %s)\n", $$symbol{asm_sym}, $$symbol{syscall}, $$symbol{nargs}, $nc;
         if (!$$symbol{is_private} && (scalar(@conditions) < scalar(@Architectures))) {
@@ -329,6 +375,10 @@ sub writeStubForSymbol {
         # actually this isn't an inconsistency. kernel can expose what it wants but if all our arches
         # override it we need to honour that.
     }
+
+    if($has_arm64) {
+        printf $f "#endif\n\n";
+    }
 }
 
 sub writeAliasesForSymbol {
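The new %UserKernelMismatchTypes table marks syscall arguments whose width differs between a 32-bit user process and the 64-bit kernel, so the generated arm64 stubs can widen them with SIGN_EXTEND or ZERO_EXTEND before trapping. The distinction matters because the two widenings give different 64-bit values for the same 32-bit register contents, as this standalone C illustration shows:

#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* A 32-bit caller passes a long/ssize_t of -1 in a 32-bit register slot. */
    uint32_t w1 = (uint32_t)-1;                 /* 0xffffffff */

    /* Zero extension (what ZERO_EXTEND produces) is right for size_t-like types... */
    uint64_t as_zero = (uint64_t)w1;            /* 4294967295 */

    /* ...but a signed long/user_ssize_t needs SIGN_EXTEND to stay -1. */
    int64_t as_sign = (int64_t)(int32_t)w1;     /* -1 */

    printf("zero-extended: %llu\n", (unsigned long long)as_zero);
    printf("sign-extended: %lld\n", (long long)as_sign);
    return 0;
}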
index 94cc8dbb742c9cf44239f9a221d5789952e641c5..cdd598aee7b745712463017cde103c73738d0602 100755 (executable)
@@ -39,6 +39,7 @@ MIG_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach"
 MIG_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
 SERVER_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/servers"
 MACH_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/include/mach"
+MACH_PRIVATE_HEADER_DST="$BUILT_PRODUCTS_DIR/mig_hdr/local/include/mach"
 
 # from old Libsystem makefiles
 MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 1`
@@ -46,16 +47,20 @@ if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $
 then
        # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly.
        MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 2`
-    if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $ARCHS | wc -w` -gt 1 ]]
-    then
-           # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly.
-           MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 3`
-    fi
+       if [[ ( "$MACHINE_ARCH" =~ ^"arm64" || "$MACHINE_ARCH" =~ ^"x86_64" ) && `echo $ARCHS | wc -w` -gt 2 ]]
+       then
+               # MACHINE_ARCH needs to be a 32-bit arch to generate vm_map_internal.h correctly.
+               MACHINE_ARCH=`echo $ARCHS | cut -d' ' -f 3`
+       fi
 fi
+# MACHINE_ARCH *really* needs to be a 32-bit arch to generate vm_map_internal.h correctly, even if there are no 32-bit targets.
 if [[ ( "$MACHINE_ARCH" =~ ^"arm64" ) ]]
 then
-    # MACHINE_ARCH *really* needs to be a 32-bit arch to generate vm_map_internal.h correctly, even if there are no 32-bit targets.
-    MACHINE_ARCH="armv7"
+       MACHINE_ARCH="armv7"
+fi
+if [[ ( "$MACHINE_ARCH" =~ ^"x86_64" ) ]]
+then
+       MACHINE_ARCH="i386"
 fi
 
 SRC="$SRCROOT/mach"
@@ -109,12 +114,16 @@ MACH_HDRS="mach.h
        mach_error.h
        mach_init.h
        mach_interface.h
+       mach_right.h
        port_obj.h
        sync.h
        vm_task.h
        vm_page_size.h
        thread_state.h"
 
+MACH_PRIVATE_HDRS="port_descriptions.h
+       mach_sync_ipc.h"
+
 MIG_FILTERS="watchos_prohibited_mig.txt tvos_prohibited_mig.txt"
 
 # install /usr/include/server headers 
@@ -129,6 +138,12 @@ for hdr in $MACH_HDRS; do
        install $ASROOT -c -m 444 $SRC/mach/$hdr $MACH_HEADER_DST
 done
 
+# install /usr/local/include/mach headers
+mkdir -p $MACH_PRIVATE_HEADER_DST
+for hdr in $MACH_PRIVATE_HDRS; do
+       install $ASROOT -c -m 444 $SRC/mach/$hdr $MACH_PRIVATE_HEADER_DST
+done
+
 # special case because we only have one to do here
 $MIG -novouchers -arch $MACHINE_ARCH -header "$SERVER_HEADER_DST/netname.h" $SRC/servers/netname.defs
 
@@ -153,7 +168,7 @@ for mig in $MIGS_PRIVATE $MIGS_DUAL_PUBLIC_PRIVATE; do
        MIG_NAME=`basename $mig .defs`
        $MIG -novouchers -arch $MACHINE_ARCH -cc $MIGCC -header "$MIG_PRIVATE_HEADER_DST/$MIG_NAME.h" $MIG_DEFINES $MIG_PRIVATE_DEFS_INCFLAGS $SRC/$mig
        if [ ! -e "$MIG_HEADER_DST/$MIG_NAME.h" ]; then
-           echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h"
+               echo "#error $MIG_NAME.h unsupported." > "$MIG_HEADER_DST/$MIG_NAME.h"
        fi
 done
 
index a70a2d815858f35ab08c3dcd533c51ce33c41684..3f6f71317537500a96aef226c4cc38e786588791 100644 (file)
@@ -34,7 +34,7 @@ else
        XCRUN = /usr/bin/xcrun
 endif
 
-SDKROOT ?= macosx.internal
+SDKROOT ?= macosx
 HOST_SDKROOT ?= macosx
 
 # SDKROOT may be passed as a shorthand like "iphoneos.internal". We
@@ -50,6 +50,9 @@ override SDKROOT = $(SDKROOT_RESOLVED)
 
 ifeq ($(HOST_SDKROOT_RESOLVED),)
 export HOST_SDKROOT_RESOLVED := $(shell $(XCRUN) -sdk $(HOST_SDKROOT) -show-sdk-path)
+ifeq ($(strip $(HOST_SDKROOT_RESOLVED)),)
+export HOST_SDKROOT_RESOLVED := /
+endif
 endif
 override HOST_SDKROOT = $(HOST_SDKROOT_RESOLVED)
 
index dcba4ab78590dc2e3643e6bad715e64303dcd042..73f7cdc57094f71cce43798fb5f77df423056017 100644 (file)
@@ -9,7 +9,7 @@
 #
 # Architecture Configuration options
 #
-SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H
+SUPPORTED_ARCH_CONFIGS := X86_64 X86_64H ARM ARM64
 
 #
 # Kernel Configuration options
@@ -23,6 +23,9 @@ SUPPORTED_KERNEL_CONFIGS = RELEASE DEVELOPMENT DEBUG PROFILE KASAN
 SUPPORTED_X86_64_MACHINE_CONFIGS = NONE
 SUPPORTED_X86_64H_MACHINE_CONFIGS = NONE
 
+SUPPORTED_ARM_MACHINE_CONFIGS = S7002 T8002 T8004
+SUPPORTED_ARM64_MACHINE_CONFIGS = S5L8960X T7000 T7001 S8000 S8001 T8010 T8011 BCM2837
+
 
 #
 # Setup up *_LC variables during recursive invocations
@@ -47,26 +50,45 @@ COMPONENT_LIST      = osfmk bsd libkern iokit pexpert libsa security san
 COMPONENT      = $(if $(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(word 2,$(subst /, ,$(RELATIVE_SOURCE_PATH))),$(firstword $(subst /, ,$(RELATIVE_SOURCE_PATH))))
 COMPONENT_IMPORT_LIST = $(filter-out $(COMPONENT),$(COMPONENT_LIST))
 
+MACHINE_FLAGS_ARM64_S5L8960X = -DARM64_BOARD_CONFIG_S5L8960X
+MACHINE_FLAGS_ARM64_T7000 = -DARM64_BOARD_CONFIG_T7000
+MACHINE_FLAGS_ARM64_T7001 = -DARM64_BOARD_CONFIG_T7001
+MACHINE_FLAGS_ARM_S7002 = -DARM_BOARD_CONFIG_S7002
+MACHINE_FLAGS_ARM64_S8000 = -DARM64_BOARD_CONFIG_S8000
+MACHINE_FLAGS_ARM64_S8001 = -DARM64_BOARD_CONFIG_S8001
+MACHINE_FLAGS_ARM_T8002 = -DARM_BOARD_CONFIG_T8002
+MACHINE_FLAGS_ARM_T8004 = -DARM_BOARD_CONFIG_T8004
+MACHINE_FLAGS_ARM64_T8010 = -DARM64_BOARD_CONFIG_T8010 -mcpu=hurricane
+MACHINE_FLAGS_ARM64_T8011 = -DARM64_BOARD_CONFIG_T8011 -mcpu=hurricane
+MACHINE_FLAGS_ARM64_BCM2837 = -DARM64_BOARD_CONFIG_BCM2837
+
 
 #
 # Deployment target flag
 #
 ifeq ($(PLATFORM),MacOSX)
     DEPLOYMENT_TARGET_FLAGS = -mmacosx-version-min=$(SDKVERSION)
+    DEPLOYMENT_LINKER_FLAGS = -Wl,-macosx_version_min,$(SDKVERSION)
 else ifeq ($(PLATFORM),WatchOS)
-    DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION)
+    DEPLOYMENT_TARGET_FLAGS = -mwatchos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_WATCH
+    DEPLOYMENT_LINKER_FLAGS =
 else ifeq ($(PLATFORM),tvOS)
     DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION)
+    DEPLOYMENT_LINKER_FLAGS =
 else ifeq ($(PLATFORM),AppleTVOS)
     DEPLOYMENT_TARGET_FLAGS = -mtvos-version-min=$(SDKVERSION)
 else ifeq ($(PLATFORM),BridgeOS)
     DEPLOYMENT_TARGET_FLAGS = -mbridgeos-version-min=$(SDKVERSION) -DXNU_TARGET_OS_BRIDGE
+    DEPLOYMENT_LINKER_FLAGS =
 else ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),)
     DEPLOYMENT_TARGET_FLAGS = -miphoneos-version-min=$(SDKVERSION)
+    DEPLOYMENT_LINKER_FLAGS = -Wl,-ios_version_min,$(SDKVERSION)
 else ifneq ($(filter $(SUPPORTED_SIMULATOR_PLATFORMS),$(PLATFORM)),)
     DEPLOYMENT_TARGET_FLAGS =
+    DEPLOYMENT_LINKER_FLAGS =
 else
     DEPLOYMENT_TARGET_FLAGS =
+    DEPLOYMENT_LINKER_FLAGS =
 endif
 
 DEPLOYMENT_TARGET_DEFINES = -DPLATFORM_$(PLATFORM)
@@ -176,7 +198,34 @@ endef
 ARCH_FLAGS_X86_64        = -arch x86_64
 ARCH_FLAGS_X86_64H       = -arch x86_64h
 
+ifneq ($(filter ARM ARM64,$(CURRENT_ARCH_CONFIG)),)
+
+ifndef ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG
+export ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT KernelMachOArchitecture FROM Targets WHERE KernelPlatform IS \"$(CURRENT_MACHINE_CONFIG_LC)\" LIMIT 1 || echo UNKNOWN )
+endif
+
+BUILD_STATIC_LINK := 1
 
+endif
+
+ARCH_FLAGS_ARM           = -arch $(ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG)
+ARCH_FLAGS_ARM64         = -arch $(ARCH_STRING_FOR_CURRENT_MACHINE_CONFIG)
+
+#
+# Clang static analyzer flags
+#
+ANALYZER                       = $(CC)
+ANALYZERPP                     = $(CXX)
+ANALYZERFLAGS          = --analyze -D__clang_analyzer__
+ifneq ($(ANALYZE_FORMAT),text)
+ANALYZERFLAGS          += -Xanalyzer -analyzer-output=html
+ANALYZERFLAGS          += -o $(OBJROOT)/analyzer-html
+else
+ANALYZERFLAGS          += -Xanalyzer -analyzer-output=text
+endif
+ifneq ($(ANALYZE_VERBOSE),YES)
+ANALYZERFLAGS          += -Xclang -analyzer-disable-checker -Xclang deadcode.DeadStores
+endif
 
 #
 # Default CFLAGS
@@ -215,6 +264,11 @@ CFLAGS_X86_64      = -Dx86_64 -DX86_64 -D__X86_64__ -DLP64 \
 
 CFLAGS_X86_64H = $(CFLAGS_X86_64)
 
+CFLAGS_ARM     = -Darm -DARM -D__ARM__ -DPAGE_SIZE_FIXED \
+                       -fno-strict-aliasing -D__API__=v4
+
+CFLAGS_ARM64   = -Darm64 -DARM64 -D__ARM64__ -DLP64 -DPAGE_SIZE_FIXED \
+                       -fno-strict-aliasing -D__API__=v4 -mkernel
 
 CFLAGS_RELEASEX86_64 = -O2
 CFLAGS_DEVELOPMENTX86_64 = -O2
@@ -235,31 +289,58 @@ CFLAGS_DEVELOPMENTARM = -O2
 CFLAGS_DEBUGARM = -O0
 CFLAGS_PROFILEARM = -O2
 
-
+CFLAGS_RELEASEARM64 = -O2
+CFLAGS_DEVELOPMENTARM64 = -O2
+CFLAGS_KASANARM64 = $(CFLAGS_DEVELOPMENTARM64)
+CFLAGS_DEBUGARM64 = -O0
+CFLAGS_PROFILEARM64 = -O2
 
 #
-# KASAN support
+# Sanitizers Support (KASan, UBSan)
 #
 
 
+SAN=0
+
 ifeq ($(CURRENT_KERNEL_CONFIG),KASAN)
 KASAN = 1
 endif
 
 ifeq ($(KASAN),1)
-
+SAN=1
 BUILD_LTO = 0
+KASAN_SHIFT_ARM64=0xdffffff800000000
 KASAN_SHIFT_X86_64=0xdffffe1000000000
 KASAN_SHIFT_X86_64H=$(KASAN_SHIFT_X86_64)
 KASAN_SHIFT=$($(addsuffix $(CURRENT_ARCH_CONFIG),KASAN_SHIFT_))
-KASAN_BLACKLIST=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC)
 CFLAGS_GEN += -DKASAN=1 -DKASAN_SHIFT=$(KASAN_SHIFT) -fsanitize=address \
                -mllvm -asan-globals-live-support \
-               -mllvm -asan-mapping-offset=$(KASAN_SHIFT) \
-               -fsanitize-blacklist=$(KASAN_BLACKLIST)
+               -mllvm -asan-mapping-offset=$(KASAN_SHIFT)
 
 endif
 
+ifeq ($(UBSAN),1)
+SAN=1
+UBSAN_CHECKS = signed-integer-overflow shift pointer-overflow  # non-fatal (calls runtime, can return)
+UBSAN_CHECKS_FATAL =                                           # fatal (calls runtime, must not return)
+UBSAN_CHECKS_TRAP = vla-bound builtin                          # emit a trap instruction (no runtime support)
+UBSAN_DISABLED = bounds object-size
+
+ifneq ($(KASAN),1)
+UBSAN_CHECKS += alignment         # UBSan alignment + KASan code size is too large
+UBSAN_CHECKS_FATAL += unreachable # UBSan unreachable doesn't play nice with ASan (40723397)
+endif
+
+CFLAGS_GEN += -DUBSAN=1
+CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS) $(UBSAN_CHECKS_FATAL) $(UBSAN_CHECKS_TRAP),-fsanitize=$(x))
+CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_FATAL),-fno-sanitize-recover=$(x))
+CFLAGS_GEN += $(foreach x,$(UBSAN_CHECKS_TRAP),-fsanitize-trap=$(x))
+endif
+
+ifeq ($(SAN),1)
+CFLAGS_GEN += -fsanitize-blacklist=$(OBJROOT)/san/kasan-blacklist-$(CURRENT_ARCH_CONFIG_LC)
+endif
+
 CFLAGS = $(CFLAGS_GEN) \
                  $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \
                  $($(addsuffix $(CURRENT_ARCH_CONFIG),ARCH_FLAGS_)) \
@@ -276,7 +357,7 @@ CFLAGS      = $(CFLAGS_GEN) \
 
 OTHER_CXXFLAGS =
 
-CXXFLAGS_GEN  = -std=gnu++11 -fapple-kext $(OTHER_CXXFLAGS)
+CXXFLAGS_GEN  = -std=gnu++1z -fapple-kext $(OTHER_CXXFLAGS)
 
 CXXFLAGS      = $(CXXFLAGS_GEN) \
                  $($(addsuffix $(CURRENT_ARCH_CONFIG),CXXFLAGS_)) \
@@ -301,6 +382,8 @@ SFLAGS_PROFILE      =
 
 SFLAGS_X86_64  = $(CFLAGS_X86_64)
 SFLAGS_X86_64H = $(CFLAGS_X86_64H)
+SFLAGS_ARM     = $(CFLAGS_ARM)
+SFLAGS_ARM64   = $(CFLAGS_ARM64)
 
 SFLAGS = $(SFLAGS_GEN) \
                  $($(addsuffix $(CURRENT_MACHINE_CONFIG),MACHINE_FLAGS_$(CURRENT_ARCH_CONFIG)_)) \
@@ -330,12 +413,7 @@ LDFLAGS_KERNEL_GEN = \
        -Wl,-sectalign,__TEXT,__text,0x1000 \
        -Wl,-sectalign,__DATA,__common,0x1000 \
        -Wl,-sectalign,__DATA,__bss,0x1000 \
-    -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \
-    -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \
-    -Wl,-sectcreate,__PRELINK_DATA,__data,/dev/null \
-    -Wl,-sectcreate,"__PLK_DATA_CONST",__data,/dev/null \
-    -Wl,-sectcreate,"__PLK_LLVM_COV",__llvm_covmap,/dev/null \
-    -Wl,-sectcreate,"__PLK_LINKEDIT",__data,/dev/null \
+       -Wl,-sectcreate,__PRELINK_TEXT,__text,/dev/null \
        -Wl,-sectcreate,__PRELINK_INFO,__info,/dev/null \
        -Wl,-new_linker \
        -Wl,-pagezero_size,0x0 \
@@ -343,7 +421,8 @@ LDFLAGS_KERNEL_GEN = \
        -Wl,-function_starts \
        -Wl,-headerpad,152
 
-LDFLAGS_KERNEL_SDK     = -L$(SDKROOT)/usr/local/lib/kernel -lfirehose_kernel
+# LDFLAGS_KERNEL_SDK   = -L$(SDKROOT)/usr/local/lib/kernel -lfirehose_kernel
+LDFLAGS_KERNEL_SDK     = -L$(SDKROOT)/usr/local/lib/kernel
 
 LDFLAGS_KERNEL_RELEASE =
 LDFLAGS_KERNEL_DEVELOPMENT     =
@@ -386,9 +465,13 @@ LDFLAGS_KERNEL_RELEASEX86_64 = \
        -Wl,-no_zero_fill_sections \
        $(LDFLAGS_NOSTRIP_FLAG)
 
+ifeq ($(SAN),1)
+LDFLAGS_KERNEL_RELEASEX86_64 += \
+       -Wl,-sectalign,__HIB,__cstring,0x1000
+endif
+
 ifeq ($(KASAN),1)
 LDFLAGS_KERNEL_RELEASEX86_64 += \
-       -Wl,-sectalign,__HIB,__cstring,0x1000 \
        -Wl,-sectalign,__HIB,__asan_globals,0x1000 \
        -Wl,-sectalign,__HIB,__asan_liveness,0x1000 \
        -Wl,-sectalign,__HIB,__mod_term_func,0x1000 \
@@ -411,6 +494,99 @@ LDFLAGS_KERNEL_DEVELOPMENTX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
 LDFLAGS_KERNEL_KASANX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
 LDFLAGS_KERNEL_PROFILEX86_64H = $(LDFLAGS_KERNEL_RELEASEX86_64H)
 
+# We preload ___udivmoddi4 in order to work around an issue with building
+# LTO on armv7.
+LDFLAGS_KERNEL_GENARM = \
+       -Wl,-pie \
+       -Wl,-static \
+       -Wl,-image_base,0x80001000 \
+       -Wl,-sectalign,__DATA,__const,0x1000 \
+       -Wl,-u,___udivmoddi4
+
+LDFLAGS_KERNEL_RELEASEARM     = \
+       $(LDFLAGS_KERNEL_GENARM)
+
+LDFLAGS_KERNEL_EXPORTS_RELEASEARM     = \
+       -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp
+
+LDFLAGS_KERNEL_DEVELOPMENTARM     = \
+       $(LDFLAGS_KERNEL_GENARM) \
+       $(LDFLAGS_NOSTRIP_FLAG)
+
+LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM =
+
+LDFLAGS_KERNEL_DEBUGARM = $(LDFLAGS_KERNEL_DEVELOPMENTARM)
+LDFLAGS_KERNEL_EXPORTS_DEBUGARM = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM)
+
+# Offset image base by page to have iBoot load kernel TEXT correctly.
+# First page is used for various purposes : sleep token, reset vector.
+# We also need a 32MB offset, as this is the minimum block mapping size
+# for a 16KB page runtime, and we wish to use the first virtual block
+# to map the low globals page.  We also need another 4MB to account for
+# the address space reserved by L4 (because the reservation is not a
+# multiple of the block size in alignment/length, we will implictly map
+# it with our block mapping, and we therefore must reflect that the
+# first 4MB of the block mapping for xnu do not belong to xnu).
+# For the moment, kaliber has a unique memory layout (monitor at the top
+# of memory).  Support this by breaking 16KB on other platforms and
+# mandating 32MB alignment. Image base (i.e. __TEXT) must be 16KB
+# aligned since ld64 will link with 16KB alignment for ARM64.
+#
+# We currently offset by an additional 32MB in order to reclaim memory.
+# We need a dedicated virtual page for the low globals.  Our bootloader
+# may have a significant chunk of memory (up to an L2 entry in size)
+# that lies before the kernel.  The additional 32MB of virtual padding
+# ensures that we have enough virtual address space to map all of that
+# memory as part of the V-to-P mapping.
+# 23355738 - put __PRELINK_TEXT first. We reserve enough room
+# for 0x0000000003000000 = 48MB of kexts
+#
+# 0xfffffff000000000 (32MB range for low globals)
+# 0xfffffff002000000 (32MB range to allow for large page physical slide)
+# 0xfffffff004000000 (16KB range to reserve the first available page)
+# 0xfffffff004004000 (48MB range for kexts)
+# 0xfffffff007004000 (Start of xnu proper; the arithmetic is checked in the sketch below).
+LDFLAGS_KERNEL_GENARM64 = \
+       -Wl,-pie \
+       -Wl,-static \
+       -Wl,-segaddr,__PRELINK_TEXT,0xfffffff004004000 \
+       -Wl,-image_base,0xfffffff007004000 \
+       -Wl,-sectalign,__DATA,__const,0x4000 \
+       -Wl,-rename_section,__DATA,__mod_init_func,__DATA_CONST,__mod_init_func \
+       -Wl,-rename_section,__DATA,__mod_term_func,__DATA_CONST,__mod_term_func \
+       -Wl,-rename_section,__DATA,__const,__DATA_CONST,__const \
+       -Wl,-rename_section,__TEXT,__text,__TEXT_EXEC,__text \
+       -Wl,-rename_section,__TEXT,__stubs,__TEXT_EXEC,__stubs \
+       -Wl,-rename_section,__TEXT,initcode,__TEXT_EXEC,initcode \
+       -Wl,-sectcreate,"__PLK_TEXT_EXEC",__text,/dev/null \
+       -Wl,-sectcreate,__PRELINK_DATA,__data,/dev/null \
+       -Wl,-sectcreate,"__PLK_DATA_CONST",__data,/dev/null \
+       -Wl,-sectcreate,"__PLK_LLVM_COV",__llvm_covmap,/dev/null \
+       -Wl,-sectcreate,"__PLK_LINKEDIT",__data,/dev/null
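
The addresses handed to -segaddr,__PRELINK_TEXT and -image_base above follow from the layout in the comment; the following is a minimal standalone C check of that arithmetic (illustrative only, not part of the build — the variable names are ours):

#include <assert.h>
#include <stdint.h>

int
main(void)
{
	const uint64_t base        = 0xfffffff000000000ULL;
	const uint64_t low_globals = 32ULL << 20; /* 32MB low-globals range      */
	const uint64_t phys_slide  = 32ULL << 20; /* 32MB large-page slide range */
	const uint64_t first_page  = 16ULL << 10; /* 16KB reserved first page    */
	const uint64_t kext_room   = 48ULL << 20; /* 48MB reserved for kexts     */

	const uint64_t prelink_text = base + low_globals + phys_slide + first_page;
	const uint64_t image_base   = prelink_text + kext_room;

	assert(prelink_text == 0xfffffff004004000ULL); /* -segaddr,__PRELINK_TEXT */
	assert(image_base   == 0xfffffff007004000ULL); /* -image_base             */
	return 0;
}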
+
+
+LDFLAGS_KERNEL_SEGARM64 ?= \
+       -Wl,-segment_order,__TEXT:__DATA_CONST:__LINKEDIT:__TEXT_EXEC:__LAST:__KLD:__DATA:__BOOTDATA
+
+LDFLAGS_KERNEL_RELEASEARM64     = \
+       $(LDFLAGS_KERNEL_GENARM64) \
+       $(LDFLAGS_KERNEL_SEGARM64)
+
+LDFLAGS_KERNEL_EXPORTS_RELEASEARM64     = \
+       -Wl,-exported_symbols_list,$(TARGET)/all-kpi.exp
+
+LDFLAGS_KERNEL_DEVELOPMENTARM64     = \
+       $(LDFLAGS_KERNEL_GENARM64) \
+       $(LDFLAGS_KERNEL_SEGARM64) \
+       $(LDFLAGS_NOSTRIP_FLAG)
+
+LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64 =
+
+LDFLAGS_KERNEL_KASANARM64 = $(LDFLAGS_KERNEL_DEVELOPMENTARM64)
+LDFLAGS_KERNEL_DEBUGARM64 = $(LDFLAGS_KERNEL_DEVELOPMENTARM64)
+
+LDFLAGS_KERNEL_EXPORTS_KASANARM64 = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64)
+LDFLAGS_KERNEL_EXPORTS_DEBUGARM64 = $(LDFLAGS_KERNEL_EXPORTS_DEVELOPMENTARM64)
 
 LDFLAGS_KERNEL = $(LDFLAGS_KERNEL_GEN) \
                  $(LDFLAGS_KERNEL_SDK) \
@@ -420,10 +596,15 @@ LDFLAGS_KERNEL    = $(LDFLAGS_KERNEL_GEN) \
                  $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_))) \
                  $(DEPLOYMENT_TARGET_FLAGS)
 
+
+LDFLAGS_KERNEL_EXPORTS   =   \
+                 $($(addsuffix $(CURRENT_ARCH_CONFIG), $(addsuffix $(CURRENT_KERNEL_CONFIG),LDFLAGS_KERNEL_EXPORTS_)))
+
 #
 # Default runtime libraries to be linked with the kernel
 #
-LD_KERNEL_LIBS = -lcc_kext
+LD_KERNEL_LIBS    = -lcc_kext
+LD_KERNEL_ARCHIVES = $(LDFLAGS_KERNEL_SDK) -lfirehose_kernel
 
 #
 # DTrace support
@@ -575,10 +756,12 @@ XNU_PRIVATE_UNIFDEF = -UMACH_KERNEL_PRIVATE -UBSD_KERNEL_PRIVATE -UIOKIT_KERNEL_
 
 PLATFORM_UNIFDEF = $(foreach x,$(SUPPORTED_PLATFORMS),$(if $(filter $(PLATFORM),$(x)),-DPLATFORM_$(x) $(foreach token,$(PLATFORM_UNIFDEF_BLACKLIST_TOKENS_$(x)),-U$(token)),-UPLATFORM_$(x)))
 
+
 SPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -DPRIVATE -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
 SINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UKERNEL -UPRIVATE -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
 KPINCFRAME_UNIFDEF = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -DKERNEL_PRIVATE -DPRIVATE -DKERNEL -U_OPEN_SOURCE_ -U__OPEN_SOURCE__
 KINCFRAME_UNIFDEF  = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -UKERNEL_PRIVATE -UPRIVATE -DKERNEL -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
+DATA_UNIFDEF       = $(PLATFORM_UNIFDEF) $(XNU_PRIVATE_UNIFDEF) $(SEED_DEFINES) -D_OPEN_SOURCE_ -D__OPEN_SOURCE__
 
 #
 # Component Header file destinations
@@ -603,6 +786,8 @@ DSYMUTIL_FLAGS_GEN  = --minimize
 
 DSYMUTIL_FLAGS_X86_64  = --arch=x86_64
 DSYMUTIL_FLAGS_X86_64H = --arch=x86_64h
+DSYMUTIL_FLAGS_ARM     = --arch=arm
+DSYMUTIL_FLAGS_ARM64   =
 
 DSYMUTIL_FLAGS = $(DSYMUTIL_FLAGS_GEN) \
        $($(addsuffix $(CURRENT_ARCH_CONFIG),DSYMUTIL_FLAGS_))
index 0885b3fabfbfd44533cd8ee1e680fa229a77d20f..55de6d307d6280655cb4197f9640c4ad9ff9f222 100644 (file)
@@ -26,8 +26,6 @@ ifeq ($(filter $(PLATFORM),$(SUPPORTED_PLATFORMS)),)
 $(error Unsupported PLATFORM $(PLATFORM))
 endif
 
-STATIC_KMODS =  $(SRCROOT)/kmods.a
-
 ifeq ($(BUILD_JSON_COMPILATION_DATABASE),1)
 do_build_setup::
        $(_v)$(CAT) > $(OBJPATH)/compile_commands.json < /dev/null
@@ -41,13 +39,29 @@ endif
 # 1) $(KERNEL_FILE_NAME).unstripped    (raw linked kernel, unstripped)
 # 2) $(KERNEL_FILE_NAME)               (stripped kernel, with optional CTF data)
 # 3) $(KERNEL_FILE_NAME).dSYM          (dSYM)
-#
+# 4) $(KERNEL_FILE_NAME).link          (bits for static linking)
+
+ifeq ($(BUILD_STATIC_LINK),1)
+
+KERNEL_STATIC_LINK_TARGETS = \
+        $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a
+
+KERNEL_STATIC_LINK_DST = \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a             \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives  \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp           \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp     \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros                   \
+                       $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)
+
+endif
 
 do_build_all:: do_build_kernel
 
 .PHONY: do_build_kernel
 
-do_build_kernel: $(TARGET)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).unstripped
+do_build_kernel: $(TARGET)/$(KERNEL_FILE_NAME) $(TARGET)/$(KERNEL_FILE_NAME).unstripped $(KERNEL_STATIC_LINK_TARGETS)
        @:
 
 ifeq ($(BUILD_DSYM),1)
@@ -60,7 +74,7 @@ do_build_kernel_dSYM: $(TARGET)/$(KERNEL_FILE_NAME).dSYM
        @:
 
 .LDFLAGS: ALWAYS
-       $(_v)$(REPLACECONTENTS) $@ $(LD) $(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS)
+       $(_v)$(REPLACECONTENTS) $@ $(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) $(LD_KERNEL_LIBS)
 .CFLAGS: ALWAYS
        $(_v)$(REPLACECONTENTS) $@ $(KCC) $(CFLAGS) $(INCFLAGS)
 
@@ -90,7 +104,18 @@ $(TARGET)/$(KERNEL_FILE_NAME).unstripped: $(addprefix $(TARGET)/,$(foreach compo
        $(_v)${MAKE} -f $(firstword $(MAKEFILE_LIST)) version.o
        @echo "$(ColorL)LD$(Color0)  $(ColorLF)$(@F)$(Color0)"
        $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > link.filelist
-       $(_v)$(LD) $(LDFLAGS_KERNEL) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS)
+       $(_v)$(LD) $(LDFLAGS_KERNEL) $(LDFLAGS_KERNEL_EXPORTS) -filelist link.filelist version.o $(filter %.o,$+) -o $@ $(LD_KERNEL_LIBS) $(LD_KERNEL_ARCHIVES)
+
+$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(addprefix $(TARGET)/,$(foreach component,$(COMPONENT_LIST),$(component)/$(CURRENT_KERNEL_CONFIG)/$(component).filelist)) lastkerneldataconst.o lastkernelconstructor.o $(TARGET)/$(KERNEL_FILE_NAME).unstripped .LDFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
+       @echo "$(ColorL)LIBTOOL$(Color0)    $(ColorLF)$(@F)$(Color0)"
+       $(_v)$(MKDIR) $(dir $@)
+       $(_v)$(CAT) $(filter %.filelist,$+) < /dev/null > libtool.filelist
+       $(_v)$(LIBTOOL) -ca -filelist libtool.filelist $(filter %.o,$+) version.o -o $@
+       $(_v)cp $(TARGET)/all-kpi.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp
+       $(_v)cp $(TARGET)/all-alias.exp $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp
+       $(_v)echo "$(LD_KERNEL_ARCHIVES)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives
+       $(_v)echo "$(LDFLAGS_KERNEL) $(LD_KERNEL_LIBS)" >$(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments
+       $(_v)$(LN) $(call function_convert_build_config_to_objdir,$(CURRENT_BUILD_CONFIG))/$(KERNEL_FILE_NAME).link $(OBJROOT)/$(KERNEL_FILE_NAME).link
 
 -include version.d
 version.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
@@ -105,6 +130,7 @@ $(OBJPATH)/version.c: $(SRCROOT)/config/version.c $(NEWVERS) $(SRCROOT)/config/M
        $(_v)$(CP) $< $@
        $(_v)$(NEWVERS) $(OBJPATH)/version.c > /dev/null;
 
+
 -include lastkerneldataconst.d
 lastkerneldataconst.o: .CFLAGS $(filter %/MakeInc.kernel,$(MAKEFILE_LIST))
 lastkerneldataconst.o: $(SRCROOT)/libsa/lastkerneldataconst.c
@@ -194,6 +220,36 @@ $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NA
        fi;                                                             \
        exit $$cmdstatus
 
+ifeq ($(BUILD_STATIC_LINK),1)
+
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).a ALWAYS
+       $(_v)$(MKDIR) $(dir $@)
+       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
+
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarguments ALWAYS
+       $(_v)$(MKDIR) $(dir $@)
+       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
+
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).linkarchives ALWAYS
+       $(_v)$(MKDIR) $(dir $@)
+       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
+
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).exp ALWAYS
+       $(_v)$(MKDIR) $(dir $@)
+       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
+
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp: $(TARGET)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).alias.exp ALWAYS
+       $(_v)$(MKDIR) $(dir $@)
+       @echo "$(ColorH)INSTALL$(Color0)       $(ColorF)$(@F)$(Color0)"
+       $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
+
+# BUILD_STATIC_LINK
+endif
+
 $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS
        $(_v)$(MKDIR) $(dir $@)
        $(_v)if [ $(OBJROOT)/.mach_kernel.timestamp -nt $@ ]; then              \
@@ -207,13 +263,20 @@ $(SYMROOT)/$(KERNEL_FILE_NAME): $(TARGET)/$(KERNEL_FILE_NAME).unstripped ALWAYS
        fi;                                                                     \
        exit $$cmdstatus
 
-$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros: $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros
+
+$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros                                                                     \
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros                      \
+$(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros:                                          \
+$(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros
        $(_v)$(MKDIR) $(dir $@)
        @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
        $(_v)$(CP) -r $< $(dir $@)
        $(_v)$(TOUCH) $@
 
-$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME) $(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME): $(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)
+$(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)                                                   \
+$(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)    \
+$(DSTROOT)/$(INSTALL_KERNEL_SYM_DIR)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME):                        \
+$(TARGET)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/$(KERNEL_LLDBBOOTSTRAP_NAME)
        $(_v)$(MKDIR) $(dir $@)
        @echo "$(ColorH)INSTALLMACROS$(Color0) $(ColorF)$(@F)$(Color0) \"($(ColorLF)$(CURRENT_ARCH_CONFIG_LC)$(Color0))\""
        $(_v)$(INSTALL) $(INSTALL_FLAGS) $< $@
@@ -243,8 +306,9 @@ $(SYMROOT)/$(KERNEL_FILE_NAME).dSYM/$(DSYMDWARFDIR)/$(KERNEL_FILE_NAME) $(DSTROO
 
 .PHONY: do_install_machine_specific_kernel do_install_machine_specific_kernel_dSYM
 
-do_install_machine_specific_kernel: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME) \
-                       $(SYMROOT)/$(KERNEL_FILE_NAME)
+do_install_machine_specific_kernel: $(DSTROOT)/$(INSTALL_KERNEL_DIR)/$(KERNEL_FILE_NAME)                \
+                       $(SYMROOT)/$(KERNEL_FILE_NAME)                                                              \
+                       $(KERNEL_STATIC_LINK_DST)
        @:
 
 do_install_machine_specific_kernel_dSYM: \
index 8c1c5d0c86efb7daf64e23ca0bdd6ab2ed4ec0ec..d2e05a89fea2cdf3c00ef2f4ed107c7629143411 100644 (file)
@@ -112,7 +112,7 @@ $(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR
        $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) $(4)
 
 $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR
-       @echo "$$(ColorH)INSTALLHDR$(Color0)    $$(ColorF)$$*$$(Color0)"
+       @echo "$$(ColorH)INSTALLHDR$$(Color0)    $$(ColorF)$$*$$(Color0)"
        $$(_v)$$(UNIFDEF) $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$;       \
        if [ $$$$? -eq 2 ]; then                                                \
                echo Parse failure for $$<;                             \
@@ -126,6 +126,43 @@ $(1): $(dir $(firstword $(1)))% : $(if $(2),%,$$(SOURCE)/%) | $(3)_MKDIR
        $$(RM) ./$(3)/$$*.unifdef.$$$$$$$$ ./$(3)/$$*.strip.$$$$$$$$
 endef
 
+# $(1) is the list of install paths
+# $(2) is the source path pattern (using % to match with $(5)) or source file
+# $(3) is the local temp directory for processing
+# $(4) is the unifdef flags
+# $(5) is the destination directory (when pattern matching) or empty
+#
+# $$$$$$$$ is a double-escaped "$$" to represent the current pid
+# of the shell process for creating uniquely named temporary files
+
+define INSTALLPYTHON_RULE_template
+
+.PHONY: $(3)_MKDIR
+
+$(3)_MKDIR:
+       $$(_v)$$(MKDIR) ./$(3)
+
+# Rebuild if unifdef flags change
+$(1): $(3)/.UNIFDEF_FLAGS
+$(3)/.UNIFDEF_FLAGS: ALWAYS | $(3)_MKDIR
+       $$(_v)$$(REPLACECONTENTS) $$@ $$(UNIFDEF) -t $(4)
+
+$(1): $(5)% : $(2) | $(3)_MKDIR
+       @echo "$$(ColorH)INSTALLPY$$(Color0)    $$(ColorF)$$*$$(Color0)"
+       $$(_v)$$(MKDIR) $$(dir $$@) $$(dir ./$(3)/$$*)
+       $$(_v)$$(UNIFDEF) -t $(4) $$< > ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*); \
+       if [ $$$$? -eq 2 ]; then                                                \
+               echo Parse failure for $$<;                             \
+               exit 1;                                                 \
+       fi;                                                             \
+       $$(INSTALL) $$(DATA_INSTALL_FLAGS) \
+               ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*) $$@ || exit 1; \
+       $$(PYTHON) $$(LLDBMACROS_SOURCE)/core/syntax_checker.py \
+               ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*) $$(_vstdout) || exit 1; \
+       $$(RM) ./$(3)/$$*.unifdef.$$$$$$$$$$(suffix $$*)
+       $$(_v)if [ -n "$(5)" ]; then $$(TOUCH) "$(5)"; fi
+endef
+
 #
 # Machine-independent (public) files
 #
index 76fd8500a04043913b8a1a1ed10387608c33d43e..6d9bcf1469a827cf24cc7a9cd34e81fb6f274390 100644 (file)
@@ -87,7 +87,8 @@ endif
 override DEFAULT_I386_MACHINE_CONFIG := NONE
 override DEFAULT_X86_64_MACHINE_CONFIG := NONE
 override DEFAULT_X86_64H_MACHINE_CONFIG := NONE
-
+override DEFAULT_ARM_MACHINE_CONFIG    := T8002
+override DEFAULT_ARM64_MACHINE_CONFIG  := S5L8960X
 
 # This is typically never specified (TARGET_CONFIGS is used)
 ifndef MACHINE_CONFIGS
@@ -114,6 +115,76 @@ endif
 # default kernel configuration = DEFAULT_KERNEL_CONFIG
 # default architecture configuration = system architecture where you are running make.
 
+ifneq ($(filter $(SUPPORTED_EMBEDDED_PLATFORMS),$(PLATFORM)),)
+
+# Defaults for "make all_embedded"
+ifeq ($(KERNEL_CONFIGS),DEFAULT)
+KERNEL_CONFIGS_EMBEDDED := RELEASE DEVELOPMENT
+else
+KERNEL_CONFIGS_EMBEDDED := $(KERNEL_CONFIGS)
+endif
+
+ifeq ($(ARCH_CONFIGS),DEFAULT)
+ARCH_CONFIGS_EMBEDDED := ARM ARM64
+else
+ARCH_CONFIGS_EMBEDDED := $(strip $(shell echo $(ARCH_CONFIGS) | $(TR) a-z A-Z))
+endif
+
+# Find supported products from the device map
+DEVICEMAP_PRODUCTS_ARMV7  := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH)          \
+                               -query 'SELECT DISTINCT TargetType                      \
+                                       FROM Files                                      \
+                                         INNER JOIN Manifests USING (manifestID)       \
+                                         INNER JOIN Targets USING (Target)             \
+                                       WHERE (KernelMachOArchitecture LIKE "armv7"     \
+                                          AND fileType in ("KernelCache", "RestoreKernelCache"))')
+DEVICEMAP_PRODUCTS_ARMV7S := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH)          \
+                               -query 'SELECT DISTINCT TargetType                      \
+                                       FROM Files                                      \
+                                         INNER JOIN Manifests USING (manifestID)       \
+                                         INNER JOIN Targets USING (Target)             \
+                                       WHERE (KernelMachOArchitecture LIKE "armv7s"    \
+                                          AND fileType in ("KernelCache", "RestoreKernelCache"))')
+DEVICEMAP_PRODUCTS_ARMV7K := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH)          \
+                               -query 'SELECT DISTINCT TargetType                      \
+                                       FROM Files                                      \
+                                         INNER JOIN Manifests USING (manifestID)       \
+                                         INNER JOIN Targets USING (Target)             \
+                                       WHERE (KernelMachOArchitecture LIKE "armv7k"    \
+                                          AND fileType in ("KernelCache", "RestoreKernelCache"))')
+DEVICEMAP_PRODUCTS_ARM := $(DEVICEMAP_PRODUCTS_ARMV7) $(DEVICEMAP_PRODUCTS_ARMV7S) $(DEVICEMAP_PRODUCTS_ARMV7K)
+
+
+DEVICEMAP_PRODUCTS_ARM64 := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH)           \
+                               -query 'SELECT DISTINCT TargetType                      \
+                                       FROM Files                                      \
+                                         INNER JOIN Manifests USING (manifestID)       \
+                                         INNER JOIN Targets USING (Target)             \
+                                       WHERE (KernelMachOArchitecture LIKE "arm64"     \
+                                          AND fileType in ("KernelCache", "RestoreKernelCache"))')
+
+
+# Generate a list of mappings of the form "n75:arm;t8002" based on the device map
+DEVICEMAP_PRODUCT_SOC_MAPPINGS := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT TargetType, KernelMachOArchitecture, KernelPlatform FROM Targets | awk -F\| '{ if ($$2 ~ /armv[0-9][a-z]?/) { print $$1 ":arm;" $$3 } else if ($$2 ~ /arm64[a-z]?/) { print $$1 ":arm64;" $$3 ";" $$4} else { print $$1 ":" $$2 ";" $$3 ";" $$4} }' )
+
+# Map a product like "n75" to "arm;t8002"
+# $(1) is a product name in lower case
+function_lookup_product = $(call function_substitute_word_with_replacement,    \
+                                $(1),                                          \
+                                $(DEVICEMAP_PRODUCT_SOC_MAPPINGS),             \
+                                unknown_arch_for_$(1);unknown_platform_for_$(1) \
+                          )
+
+# Generate a list of mappings for products that use a different platform for their kernel configuration than their true platform
+# of the form "n71m:arm64;s8000;s8003". The 4th element is the true SoC platform, which will get an on-disk copy, while the
+# kernel's recursive build system will build the 3rd element as the KernelPlatform
+DEVICEMAP_PRODUCT_SOC_ALIASES := $(shell $(EMBEDDED_DEVICE_MAP) -db $(EDM_DBPATH) -query SELECT DISTINCT TargetType, KernelMachOArchitecture, KernelPlatform, Platform FROM Targets WHERE KernelPlatform "!=" Platform | awk -F\| '{ if ($$2 ~ /armv[0-9][a-z]?/) { print $$1 ":arm;" $$3 ";" $$4} else if ($$2 ~ /arm64[a-z]?/) { print $$1 ":arm64;" $$3 ";" $$4} else { print $$1 ":" $$2 ";" $$3 ";" $$4} }' )
+
+function_lookup_product_alias = $(call function_substitute_word_with_replacement,      \
+                                       $(1),                                           \
+                                       $(DEVICEMAP_PRODUCT_SOC_ALIASES),               \
+                                )
+endif
 
 ifeq ($(PLATFORM),MacOSX)
 
index 64af5e1e0b9747fabaa3480ed39f51422fdd44b9..b3beb261918043a0c9c9dace56b805bfbca41817 100644 (file)
@@ -6,6 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
+
 INSTINC_SUBDIRS = \
        mach    \
        atm     \
@@ -70,7 +71,8 @@ EXPINC_SUBDIRS = \
        libsa \
        console \
        kperf \
-       prng
+       prng \
+       tests
 
 
 EXPINC_SUBDIRS_X86_64 = \
index 8fd552a10e6dc8a874dff9d2d9acccd28a439d45..38c19f380ab98b1aca070b1da6a41f79753d4da5 100644 (file)
@@ -21,11 +21,11 @@ ARM_HEADER_FILES =  \
                machine_cpuid.h \
                machine_routines.h \
                pal_routines.h \
+               pmap_public.h \
                proc_reg.h \
+               simple_lock.h \
                smp.h \
-               thread.h \
-               simple_lock.h
-
+               thread.h
 
 INSTALL_MD_DIR = arm
 
index e81af968b0aa82c261cd702b99dc67522c9f8e33..b38086203b62e4fdadd5435755aa3569170e1a9e 100644 (file)
@@ -83,9 +83,10 @@ extern int   serial_init(void);
 extern void sleep_token_buffer_init(void);
 
 extern vm_offset_t intstack_top;
-extern vm_offset_t fiqstack_top;
 #if __arm64__
 extern vm_offset_t excepstack_top;
+#else
+extern vm_offset_t fiqstack_top;
 #endif
 
 extern const char version[];
@@ -132,10 +133,74 @@ unsigned int page_shift_user32;   /* for page_size as seen by a 32-bit task */
 #endif /* __arm64__ */
 
 
+/*
+ * JOP rebasing
+ */
+
+
+// Note: the following should come from a dyld header
+static void
+rebase_chain(uintptr_t chainStartAddress, uint64_t stepMultiplier, uintptr_t baseAddress __unused, uint64_t slide)
+{
+       uint64_t delta = 0;
+       uintptr_t address = chainStartAddress;
+       do {
+               uint64_t value = *(uint64_t*)address;
+
+               bool isAuthenticated = (value & (1ULL << 63)) != 0;
+               bool isRebase = (value & (1ULL << 62)) == 0;
+               if (isRebase) {
+                       if (isAuthenticated) {
+                               // The new value for a rebase is the low 32-bits of the threaded value plus the slide.
+                               uint64_t newValue = (value & 0xFFFFFFFF) + slide;
+                               // Add in the offset from the mach_header
+                               newValue += baseAddress;
+                               *(uint64_t*)address = newValue;
+
+                       } else
+                       {
+                               // Regular pointer which needs to fit in 51-bits of value.
+                               // C++ RTTI uses the top bit, so we'll allow the whole top byte
+                               // and the bottom 43 bits to fit into the 51 bits.
+                               uint64_t top8Bits = value & 0x0007F80000000000ULL;
+                               uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL;
+                               uint64_t targetValue = ( top8Bits << 13 ) | (((intptr_t)(bottom43Bits << 21) >> 21) & 0x00FFFFFFFFFFFFFF);
+                               targetValue = targetValue + slide;
+                               *(uint64_t*)address = targetValue;
+                       }
+               }
+
+               // The delta is bits [51..61]
+               // And bit 62 is to tell us if we are a rebase (0) or bind (1)
+               value &= ~(1ULL << 62);
+               delta = ( value & 0x3FF8000000000000 ) >> 51;
+               address += delta * stepMultiplier;
+       } while ( delta != 0 );
+}
+
+// Note: the following method should come from a dyld header
+static bool
+rebase_threaded_starts(uint32_t *threadArrayStart, uint32_t *threadArrayEnd,
+                           uintptr_t macho_header_addr, uintptr_t macho_header_vmaddr, size_t slide)
+{
+       uint32_t threadStartsHeader = *threadArrayStart;
+       uint64_t stepMultiplier = (threadStartsHeader & 1) == 1 ? 8 : 4;
+       for (uint32_t* threadOffset = threadArrayStart + 1; threadOffset != threadArrayEnd; ++threadOffset) {
+               if (*threadOffset == 0xFFFFFFFF)
+                       break;
+               rebase_chain(macho_header_addr + *threadOffset, stepMultiplier, macho_header_vmaddr, slide);
+       }
+       return true;
+}
+
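
A standalone sketch of the threaded-fixup word that rebase_chain() walks, restating the bit layout described in the comments above (bit 63 = authenticated, bit 62 = bind vs. rebase, bits 51..61 = delta to the next fixup); the struct and function names here are illustrative only:

#include <stdbool.h>
#include <stdint.h>

struct threaded_fixup {
	bool     is_auth;   /* bit 63: pointer carries an authentication payload */
	bool     is_bind;   /* bit 62: bind (1) vs. rebase (0)                   */
	uint64_t delta;     /* bits 51..61: distance to the next fixup, in units
	                       of the step multiplier (4 or 8 bytes)             */
};

static struct threaded_fixup
decode_threaded_fixup(uint64_t value)
{
	struct threaded_fixup f;
	f.is_auth = (value & (1ULL << 63)) != 0;
	f.is_bind = (value & (1ULL << 62)) != 0;
	f.delta   = (value & 0x3FF8000000000000ULL) >> 51;
	return f;
}

/* For a non-authenticated rebase, the target is the sign-extended 51-bit
 * value (top byte | bottom 43 bits) plus the slide, mirroring the
 * expression in rebase_chain() above. */
static uint64_t
rebase_target(uint64_t value, uint64_t slide)
{
	uint64_t top8     = value & 0x0007F80000000000ULL;
	uint64_t bottom43 = value & 0x000007FFFFFFFFFFULL;
	uint64_t target   = (top8 << 13) |
	    (((int64_t)(bottom43 << 21) >> 21) & 0x00FFFFFFFFFFFFFFULL);
	return target + slide;
}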
 /*
  *             Routine:                arm_init
  *             Function:
  */
+
+extern uint32_t __thread_starts_sect_start[] __asm("section$start$__TEXT$__thread_starts");
+extern uint32_t __thread_starts_sect_end[]   __asm("section$end$__TEXT$__thread_starts");
+
 void
 arm_init(
        boot_args       *args)
@@ -146,15 +211,27 @@ arm_init(
        thread_t        thread;
        processor_t     my_master_proc;
 
+    // rebase and sign jops
+       if (&__thread_starts_sect_end[0] != &__thread_starts_sect_start[0])
+       {
+               uintptr_t mh    = (uintptr_t) &_mh_execute_header;
+               uintptr_t slide = mh - VM_KERNEL_LINK_ADDRESS;
+               rebase_threaded_starts( &__thread_starts_sect_start[0],
+                                                               &__thread_starts_sect_end[0],
+                                                               mh, mh - slide, slide);
+       }
+
        /* If kernel integrity is supported, use a constant copy of the boot args. */
        const_boot_args = *args;
-       BootArgs = &const_boot_args;
+       BootArgs = args = &const_boot_args;
 
        cpu_data_init(&BootCpuData);
 
-       PE_init_platform(FALSE, args);  /* Get platform expert set up */
+       PE_init_platform(FALSE, args); /* Get platform expert set up */
 
 #if __arm64__
+
+
        {
                unsigned int    tmp_16k = 0;
 
@@ -221,11 +298,12 @@ arm_init(
 #endif
        BootCpuData.intstack_top = (vm_offset_t) & intstack_top;
        BootCpuData.istackptr = BootCpuData.intstack_top;
-       BootCpuData.fiqstack_top = (vm_offset_t) & fiqstack_top;
-       BootCpuData.fiqstackptr = BootCpuData.fiqstack_top;
 #if __arm64__
        BootCpuData.excepstack_top = (vm_offset_t) & excepstack_top;
        BootCpuData.excepstackptr = BootCpuData.excepstack_top;
+#else
+       BootCpuData.fiqstack_top = (vm_offset_t) & fiqstack_top;
+       BootCpuData.fiqstackptr = BootCpuData.fiqstack_top;
 #endif
        BootCpuData.cpu_processor = cpu_processor_alloc(TRUE);
        BootCpuData.cpu_console_buf = (void *)NULL;
@@ -312,6 +390,10 @@ arm_init(
 
        printf_init();
        panic_init();
+#if __arm64__
+       /* Enable asynchronous exceptions */
+       __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF);
+#endif
 #if __arm64__ && WITH_CLASSIC_S2R
        sleep_token_buffer_init();
 #endif
@@ -367,7 +449,7 @@ arm_init(
 
        PE_init_platform(TRUE, &BootCpuData);
        cpu_timebase_init(TRUE);
-       fiq_context_init(TRUE);
+       fiq_context_bootstrap(TRUE);
 
 
        /*
@@ -407,8 +489,10 @@ arm_init_cpu(
        machine_set_current_thread(cpu_data_ptr->cpu_active_thread);
 
 #if __arm64__
+       pmap_clear_user_ttb();
+       flush_mmu_tlb();
        /* Enable asynchronous exceptions */
-        __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF);
+       __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF);
 #endif
 
        cpu_machine_idle_init(FALSE);
@@ -455,10 +539,11 @@ arm_init_cpu(
 #if CONFIG_TELEMETRY
                bootprofile_wake_from_sleep();
 #endif /* CONFIG_TELEMETRY */
+       }
 #if MONOTONIC && defined(__arm64__)
-               mt_wake();
+       mt_wake_per_core();
 #endif /* MONOTONIC && defined(__arm64__) */
-       }
+
 
        slave_main(NULL);
 }
@@ -481,8 +566,10 @@ arm_init_idle_cpu(
        machine_set_current_thread(cpu_data_ptr->cpu_active_thread);
 
 #if __arm64__
+       pmap_clear_user_ttb();
+       flush_mmu_tlb();
        /* Enable asynchronous exceptions */
-        __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF);
+       __builtin_arm_wsr("DAIFClr", DAIFSC_ASYNCF);
 #endif
 
 #if    (__ARM_ARCH__ == 7)
@@ -496,5 +583,5 @@ arm_init_idle_cpu(
 
        fiq_context_init(FALSE);
 
-       cpu_idle_exit();
+       cpu_idle_exit(TRUE);
 }
index 07bcfb9b263cf1e7cad4b6795e62eaf45b2433a4..ebdfe7735df96cbe91c9bfec50f993487e39da68 100644 (file)
 
 #include <arm/proc_reg.h>
 #include <arm/caches_internal.h>
+#include <arm/cpu_data_internal.h>
 #include <arm/pmap.h>
 #include <arm/misc_protos.h>
 #include <arm/lowglobals.h>
 
 #include <pexpert/arm/boot.h>
+#include <pexpert/device_tree.h>
 
 #include <libkern/kernel_mach_header.h>
 
@@ -77,6 +79,9 @@ vm_offset_t vm_elinkedit;
 vm_offset_t vm_prelink_sdata;
 vm_offset_t vm_prelink_edata;
 
+vm_offset_t vm_kernel_builtinkmod_text;
+vm_offset_t vm_kernel_builtinkmod_text_end;
+
 unsigned long gVirtBase, gPhysBase, gPhysSize;     /* Used by <mach/arm/vm_param.h> */
 
 vm_offset_t   mem_size;                             /* Size of actual physical memory present
@@ -93,6 +98,9 @@ addr64_t      vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel
                                                      * virtual address known
                                                      * to the VM system */
 
+vm_offset_t            segEXTRADATA;
+unsigned long          segSizeEXTRADATA;
+vm_offset_t            segLOWESTTEXT;
 static vm_offset_t     segTEXTB;
 static unsigned long   segSizeTEXT;
 static vm_offset_t     segDATAB;
@@ -105,6 +113,11 @@ static vm_offset_t     segLASTB;
 static unsigned long   segSizeLAST;
 static vm_offset_t     sectCONSTB;
 static unsigned long   sectSizeCONST;
+vm_offset_t            segBOOTDATAB;
+unsigned long          segSizeBOOTDATA;
+extern vm_offset_t     intstack_low_guard;
+extern vm_offset_t     intstack_high_guard;
+extern vm_offset_t     fiqstack_high_guard;
 
 vm_offset_t     segPRELINKTEXTB;
 unsigned long   segSizePRELINKTEXT;
@@ -139,6 +152,11 @@ extern vm_offset_t ExceptionVectorsBase; /* the code we want to load there */
 #define round_x_table(x) \
        (((pmap_paddr_t)(x) + (ARM_PGBYTES<<2) - 1) & ~((ARM_PGBYTES<<2) - 1))
 
+vm_map_address_t
+phystokv(pmap_paddr_t pa)
+{
+       return (pa - gPhysBase + gVirtBase);
+}
 
 static void
 arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, 
@@ -154,6 +172,11 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
 
                pa = va - gVirtBase + gPhysBase;
 
+               if (pa >= avail_end)
+                       return;
+
+               assert(_end >= va);
+
                if (ARM_TTE_TYPE_TABLE == (tmplate & ARM_TTE_TYPE_MASK)) {
                        /* pick up the existing page table. */
                        ppte = (pt_entry_t *)phystokv((tmplate & ARM_TTE_TABLE_MASK));
@@ -161,13 +184,17 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
                        /* TTE must be reincarnated COARSE. */
                        ppte = (pt_entry_t *)phystokv(avail_start);
                        avail_start += ARM_PGBYTES;
-
-                       pmap_init_pte_static_page(kernel_pmap, ppte, pa);
+                       bzero(ppte, ARM_PGBYTES);
 
                        for (i = 0; i < 4; ++i)
                                tte[i] = pa_to_tte(kvtophys((vm_offset_t)ppte) + (i * 0x400)) | ARM_TTE_TYPE_TABLE;
                }
 
+               vm_offset_t len = _end - va;
+               if ((pa + len) > avail_end)
+                       _end -= (pa + len - avail_end);
+               assert((start - gVirtBase + gPhysBase) >= gPhysBase);
+
                /* Apply the desired protections to the specified page range */
                for (i = 0; i < (ARM_PGBYTES / sizeof(*ppte)); i++) {
                        if (start <= va && va < _end) {
@@ -189,7 +216,7 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
 
 static void
 arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, 
-                          int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int forceCoarse)
+                          int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int force_page_granule)
 {
        vm_offset_t _end = start + size;
        vm_offset_t align_start = (start + ARM_TT_L1_PT_OFFMASK) & ~ARM_TT_L1_PT_OFFMASK;
@@ -198,7 +225,7 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size,
        arm_vm_page_granular_helper(start, _end, start, pte_prot_APX, pte_prot_XN);
 
        while (align_start < align_end) {
-               if (forceCoarse) {
+               if (force_page_granule) {
                        arm_vm_page_granular_helper(align_start, align_end, align_start + 1, 
                                                    pte_prot_APX, pte_prot_XN);
                } else {
@@ -221,27 +248,27 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size,
 }
 
 static inline void
-arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, int force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, force_page_granule);
 }
 
 void
@@ -276,6 +303,10 @@ arm_vm_prot_init(boot_args * args)
                /* If we aren't protecting const, just map DATA as a single blob. */
                arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE);
        }
+       arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&fiqstack_high_guard, PAGE_MAX_SIZE, TRUE);
 
        arm_vm_page_granular_ROX(segKLDB, segSizeKLD, force_coarse_physmap);
        arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, force_coarse_physmap);
@@ -283,7 +314,8 @@ arm_vm_prot_init(boot_args * args)
        arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, TRUE); // Refined in OSKext::readPrelinkedExtensions
        arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT,
                                     end_kern - (segPRELINKTEXTB + segSizePRELINKTEXT), force_coarse_physmap); // PreLinkInfoDictionary
-       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, force_coarse_physmap); // Device Tree, RAM Disk (if present), bootArgs
+       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, force_coarse_physmap); // Device Tree, RAM Disk (if present), bootArgs, trust caches
+       arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE); // tighter trust cache protection
        arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData), ARM_PGBYTES * 8, FALSE); // boot_tte, cpu_tte
 
        /*
@@ -319,6 +351,8 @@ arm_vm_prot_init(boot_args * args)
 void
 arm_vm_prot_finalize(boot_args * args)
 {
+       cpu_stack_alloc(&BootCpuData);
+       ml_static_mfree(segBOOTDATAB, segSizeBOOTDATA);
        /*
         * Naively we could have:
         * arm_vm_page_granular_ROX(segTEXTB, segSizeTEXT, FALSE);
@@ -335,6 +369,13 @@ arm_vm_prot_finalize(boot_args * args)
        flush_mmu_tlb();
 }
 
+/* used in the chosen/memory-map node, populated by iBoot. */
+typedef struct MemoryMapFileInfo {
+       vm_offset_t paddr;
+       size_t length;
+} MemoryMapFileInfo;
+
+
 void
 arm_vm_init(uint64_t memory_size, boot_args * args)
 {
@@ -391,9 +432,9 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        }
 
        while (tte < tte_limit) {
-                       *tte = ARM_TTE_TYPE_FAULT; 
-                       tte++;
-               }
+               *tte = ARM_TTE_TYPE_FAULT; 
+               tte++;
+       }
                
        /* Skip 6 pages (four L1 + two L2 entries) */
        avail_start = cpu_ttep + ARM_PGBYTES * 6;
@@ -404,12 +445,33 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         * from MACH-O headers for the currently running 32 bit kernel.
         */
        segTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT", &segSizeTEXT);
+       segLOWESTTEXT = segTEXTB;
        segDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA);
        segLINKB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
        segKLDB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD);
        segLASTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &segSizeLAST);
        segPRELINKTEXTB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &segSizePRELINKTEXT);
        segPRELINKINFOB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_INFO", &segSizePRELINKINFO);
+       segBOOTDATAB = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA);
+
+       segEXTRADATA = 0;
+       segSizeEXTRADATA = 0;
+
+       DTEntry memory_map;
+       MemoryMapFileInfo *trustCacheRange;
+       unsigned int trustCacheRangeSize;
+       int err;
+
+       err = DTLookupEntry(NULL, "chosen/memory-map", &memory_map);
+       assert(err == kSuccess);
+
+       err = DTGetProperty(memory_map, "TrustCache", (void**)&trustCacheRange, &trustCacheRangeSize);
+       if (err == kSuccess) {
+               assert(trustCacheRangeSize == sizeof(MemoryMapFileInfo));
+
+               segEXTRADATA = phystokv(trustCacheRange->paddr);
+               segSizeEXTRADATA = trustCacheRange->length;
+       }
 
        etext = (vm_offset_t) segTEXTB + segSizeTEXT;
        sdata = (vm_offset_t) segDATAB;
@@ -492,7 +554,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
        sane_size = mem_size - (avail_start - gPhysBase);
        max_mem = mem_size;
-       vm_kernel_slide = gVirtBase-0x80000000;
+       vm_kernel_slide = gVirtBase-VM_KERNEL_LINK_ADDRESS;
        vm_kernel_stext = segTEXTB;
        vm_kernel_etext = segTEXTB + segSizeTEXT;
        vm_kernel_base = gVirtBase;
index 3da426b3d9ae23a86bd0e7541a9de0300e0e92e4..8f83828edcc18ae6e4f7fe08f670a7cf961dd16f 100644 (file)
@@ -29,6 +29,7 @@
 #ifndef _ARM_ATOMIC_H_
 #define _ARM_ATOMIC_H_
 
+#include <mach/boolean.h>
 #include <arm/smp.h>
 
 // Parameter for __builtin_arm_dmb
@@ -213,6 +214,7 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
 #endif // ATOMIC_PRIVATE
 
 #if __arm__
+#undef os_atomic_rmw_loop
 #define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
                boolean_t _result = FALSE; uint32_t _err = 0; \
                typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \
@@ -234,7 +236,14 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
                } \
                _result; \
        })
+
+#undef os_atomic_rmw_loop_give_up
+#define os_atomic_rmw_loop_give_up(expr) \
+               ({ __builtin_arm_clrex(); expr; __builtin_trap(); })
+
 #else
+
+#undef os_atomic_rmw_loop
 #define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
                boolean_t _result = FALSE; \
                typeof(atomic_load(p)) *_p = (typeof(atomic_load(p)) *)(p); \
@@ -253,9 +262,25 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
                } while (__builtin_expect(!_result, 0)); \
                _result; \
        })
-#endif
 
+#undef os_atomic_rmw_loop_give_up
 #define os_atomic_rmw_loop_give_up(expr) \
                ({ __builtin_arm_clrex(); expr; __builtin_trap(); })
+#endif
+
+#undef os_atomic_force_dependency_on
+#if defined(__arm64__)
+#define os_atomic_force_dependency_on(p, e) ({ \
+               unsigned long _v; \
+               __asm__("and %x[_v], %x[_e], xzr" : [_v] "=r" (_v) : [_e] "r" (e)); \
+               (typeof(*(p)) *)((char *)(p) + _v); \
+       })
+#else
+#define os_atomic_force_dependency_on(p, e) ({ \
+               unsigned long _v; \
+               __asm__("and %[_v], %[_e], #0" : [_v] "=r" (_v) : [_e] "r" (e)); \
+               (typeof(*(p)) *)((char *)(p) + _v); \
+       })
+#endif // defined(__arm64__)
 
 #endif // _ARM_ATOMIC_H_
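
A minimal usage sketch (not from this header) of the rmw-loop macros above: the macro binds ov to the freshly loaded value, the trailing statement block computes nv, and os_atomic_rmw_loop_give_up() clears the exclusive monitor before bailing out. The function name, the relaxed memory-order token, and the _Atomic flag word are assumptions for illustration:

static inline boolean_t
example_try_set_bit(_Atomic uintptr_t *flags, uintptr_t bit)
{
	uintptr_t ov, nv;

	return os_atomic_rmw_loop(flags, ov, nv, relaxed, {
		if (ov & bit) {
			/* Already set: give up the loop and return early. */
			os_atomic_rmw_loop_give_up(return FALSE);
		}
		nv = ov | bit;
	});
}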
index 5f37e202dcaf50c7754c7fb6b4a69ef8afdcd2b9..91d489d085fe74973962ce7e394963cdcc017fd5 100644 (file)
@@ -66,13 +66,13 @@ flush_dcache(
        cpu_data_t      *cpu_data_ptr = getCpuDatap();
 
        if (phys) {
-               unsigned int    paddr;
-               unsigned int    vaddr;
+               pmap_paddr_t    paddr;
+               vm_offset_t     vaddr;
 
-               paddr = CAST_DOWN(unsigned int, addr);
+               paddr = CAST_DOWN(pmap_paddr_t, addr);
                if (!isphysmem(paddr))
                        return;
-               vaddr = (unsigned int)phystokv(paddr);
+               vaddr = phystokv(paddr);
                FlushPoC_DcacheRegion( (vm_offset_t) vaddr, length);
 
                if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL)
@@ -111,19 +111,19 @@ clean_dcache(
        cpu_data_t      *cpu_data_ptr = getCpuDatap();
 
        if (phys) {
-               unsigned int    paddr;
-               unsigned int    vaddr;
+               pmap_paddr_t    paddr;
+               vm_offset_t     vaddr;
 
-               paddr = CAST_DOWN(unsigned int, addr);
+               paddr = CAST_DOWN(pmap_paddr_t, addr);
                if (!isphysmem(paddr))
                        return;
 
-               vaddr = (unsigned int)phystokv(paddr);
+               vaddr = phystokv(paddr);
                CleanPoC_DcacheRegion( (vm_offset_t) vaddr, length);
 
                if (cpu_data_ptr->cpu_cache_dispatch != (cache_dispatch_t) NULL)
                        ((cache_dispatch_t) cpu_data_ptr->cpu_cache_dispatch) (
-                                           cpu_data_ptr->cpu_id, CacheCleanRegion, paddr, length);
+                                           cpu_data_ptr->cpu_id, CacheCleanRegion, (unsigned int) paddr, length);
                return;
        }
        
@@ -175,8 +175,8 @@ dcache_incoherent_io_flush64(
        unsigned int remaining,
        unsigned int *res)
 {
-       unsigned int vaddr;
-       unsigned int paddr = CAST_DOWN(unsigned int, pa);
+       vm_offset_t vaddr;
+       pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa);
        cpu_data_t *cpu_data_ptr = getCpuDatap();
 
        if ((cache_info()->c_bulksize_op !=0) && (remaining >= (cache_info()->c_bulksize_op))) {
@@ -190,7 +190,7 @@ dcache_incoherent_io_flush64(
                *res = BWOpDone;
        } else {
                if (isphysmem(paddr)) {
-                       vaddr = (unsigned int)phystokv(pa);
+                       vaddr = phystokv(pa);
                        {
                                FlushPoC_DcacheRegion( (vm_offset_t) vaddr, size);
 
@@ -209,8 +209,8 @@ dcache_incoherent_io_flush64(
                                if (count > size)
                                        count = size;
 
-                               wimg_bits = pmap_cache_attributes((paddr >> PAGE_SHIFT));
-                               index = pmap_map_cpu_windows_copy((paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits);
+                               wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
+                               index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits);
                                vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK);
 
                                CleanPoC_DcacheRegion( (vm_offset_t) vaddr, count);
@@ -235,12 +235,12 @@ dcache_incoherent_io_store64(
        unsigned int remaining,
        unsigned int *res)
 {
-       unsigned int vaddr;
-       unsigned int paddr = CAST_DOWN(unsigned int, pa);
+       vm_offset_t vaddr;
+       pmap_paddr_t paddr = CAST_DOWN(pmap_paddr_t, pa);
        cpu_data_t *cpu_data_ptr = getCpuDatap();
 
        if (isphysmem(paddr)) {
-               unsigned int wimg_bits = pmap_cache_attributes(paddr >> PAGE_SHIFT);
+               unsigned int wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
                if ((wimg_bits == VM_WIMG_IO) || (wimg_bits == VM_WIMG_WCOMB)) {
                        return;
                }
@@ -259,7 +259,7 @@ dcache_incoherent_io_store64(
                *res = BWOpDone;
        } else {
                if (isphysmem(paddr)) {
-                       vaddr = (unsigned int)phystokv(pa);
+                       vaddr = phystokv(pa);
                        {
                                CleanPoC_DcacheRegion( (vm_offset_t) vaddr, size);
 
@@ -278,8 +278,8 @@ dcache_incoherent_io_store64(
                                if (count > size)
                                        count = size;
 
-                               wimg_bits = pmap_cache_attributes((paddr >> PAGE_SHIFT));
-                               index = pmap_map_cpu_windows_copy((paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits);
+                               wimg_bits = pmap_cache_attributes((ppnum_t) (paddr >> PAGE_SHIFT));
+                               index = pmap_map_cpu_windows_copy((ppnum_t) (paddr >> PAGE_SHIFT), VM_PROT_READ|VM_PROT_WRITE, wimg_bits);
                                vaddr = pmap_cpu_windows_copy_addr(cpu_number(), index) | (paddr & PAGE_MASK);
 
                                CleanPoC_DcacheRegion( (vm_offset_t) vaddr, count);
@@ -384,6 +384,7 @@ platform_cache_shutdown(
 void
 platform_cache_disable(void)
 {
+#if (__ARM_ARCH__ < 8)
        uint32_t sctlr_value = 0;
 
        /* Disable dcache allocation. */
@@ -395,7 +396,7 @@ platform_cache_disable(void)
        __asm__ volatile("mcr p15, 0, %0, c1, c0, 0\n"
                         "isb"
                         :: "r"(sctlr_value));
-
+#endif /* (__ARM_ARCH__ < 8) */
 }
 
 void
@@ -414,15 +415,16 @@ platform_cache_idle_enter(
        if (up_style_idle_exit && (real_ncpus == 1))
                CleanPoU_Dcache();
        else {
-               cpu_data_t      *cpu_data_ptr = getCpuDatap();
-
                FlushPoU_Dcache();
 
+#if (__ARM_ARCH__ < 8)
+               cpu_data_t      *cpu_data_ptr = getCpuDatap();
                cpu_data_ptr->cpu_CLW_active = 0;
                __asm__ volatile("dmb ish");
                cpu_data_ptr->cpu_CLWFlush_req = 0;
                cpu_data_ptr->cpu_CLWClean_req = 0;
                CleanPoC_DcacheRegion((vm_offset_t) cpu_data_ptr, sizeof(cpu_data_t));
+#endif /* (__ARM_ARCH__ < 8) */
        }
 #else
        CleanPoU_Dcache();
index 27cfadba928380bed85679e89538c620c731d0aa..74aa72f31f6d33aea48396ff1aa315a916eff0bf 100644 (file)
@@ -36,6 +36,7 @@
  *     File:           arm/commpage/commpage.c
  *     Purpose:        Set up and export a RO/RW page
  */
+#include <libkern/section_keywords.h>
 #include <mach/mach_types.h>
 #include <mach/machine.h>
 #include <mach/vm_map.h>
@@ -50,8 +51,6 @@
 #include <arm/rtclock.h>
 #include <libkern/OSAtomic.h>
 #include <stdatomic.h>
-#include <kern/remote_time.h>
-#include <machine/machine_remote_time.h>
 
 #include <sys/kdebug.h>
 
 static void commpage_init_cpu_capabilities( void );
 static int commpage_cpus( void );
 
-vm_address_t   commPagePtr=0;
-vm_address_t   sharedpage_rw_addr = 0;
-uint32_t       _cpu_capabilities = 0;
+SECURITY_READ_ONLY_LATE(vm_address_t)  commPagePtr=0;
+SECURITY_READ_ONLY_LATE(vm_address_t)  sharedpage_rw_addr = 0;
+SECURITY_READ_ONLY_LATE(uint32_t)      _cpu_capabilities = 0;
 
-extern int     gARMv81Atomics; /* For sysctl access from BSD side */
+/* For sysctl access from BSD side */
+extern int     gARMv81Atomics;
+extern int     gARMv8Crc32;
 
 void
 commpage_populate(
@@ -231,6 +232,12 @@ commpage_cpus( void )
        return cpus;
 }
 
+vm_address_t
+_get_commpage_priv_address(void)
+{
+       return sharedpage_rw_addr;
+}
+
 /*
  * Initialize _cpu_capabilities vector
  */
@@ -273,6 +280,8 @@ commpage_init_cpu_capabilities( void )
                bits |= kHasNeon;
        if (mvfp_info->neon_hpfp)
                bits |= kHasNeonHPFP;
+       if (mvfp_info->neon_fp16)
+               bits |= kHasNeonFP16;
 #endif
 #if defined(__arm64__)
        bits |= kHasFMA;
@@ -290,10 +299,15 @@ commpage_init_cpu_capabilities( void )
        bits |= kHasARMv8Crypto;
 #endif
 #ifdef __arm64__
-       if ((__builtin_arm_rsr64("ID_AA64ISAR0_EL1") & ID_AA64ISAR0_EL1_ATOMIC_MASK) == ID_AA64ISAR0_EL1_ATOMIC_8_1) {
+       uint64_t isar0 = __builtin_arm_rsr64("ID_AA64ISAR0_EL1");
+       if ((isar0 & ID_AA64ISAR0_EL1_ATOMIC_MASK) == ID_AA64ISAR0_EL1_ATOMIC_8_1) {
                bits |= kHasARMv81Atomics;
                gARMv81Atomics = 1;
        }
+       if ((isar0 & ID_AA64ISAR0_EL1_CRC32_MASK) == ID_AA64ISAR0_EL1_CRC32_EN) {
+               bits |= kHasARMv8Crc32;
+               gARMv8Crc32 = 1;
+       }
 #endif
        _cpu_capabilities = bits;
 
@@ -425,26 +439,41 @@ commpage_update_boottime(uint64_t value)
        }
 }
 
+
 /*
- * set the commpage's remote time params for
- * userspace call to mach_bridge_remote_time()
+ * After this counter has incremented, all running CPUs are guaranteed to
+ * have quiesced, i.e. executed serially dependent memory barriers.
+ * This is only tracked for CPUs running in userspace, therefore only useful
+ * outside the kernel.
+ *
+ * Note that you can't know which side of those barriers your read was from,
+ * so you have to observe 2 increments in order to ensure that you saw a
+ * serially dependent barrier chain across all running CPUs.
  */
- void
- commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts)
- {
-        if (commPagePtr) {
-#ifdef __arm64__
-               struct bt_params *paramsp = (struct bt_params *)(_COMM_PAGE_REMOTETIME_PARAMS + _COMM_PAGE_RW_OFFSET);
-               paramsp->base_local_ts = 0;
-               __asm__ volatile("dmb ish" ::: "memory");
-               paramsp->rate = rate;
-               paramsp->base_remote_ts = base_remote_ts;
-               __asm__ volatile("dmb ish" ::: "memory");
-               paramsp->base_local_ts = base_local_ts;  //This will act as a generation count
+uint64_t
+commpage_increment_cpu_quiescent_counter(void)
+{
+       if (!commPagePtr)
+               return 0;
+
+       uint64_t old_gen;
+
+       _Atomic uint64_t *sched_gen = (_Atomic uint64_t *)(_COMM_PAGE_CPU_QUIESCENT_COUNTER +
+                                                          _COMM_PAGE_RW_OFFSET);
+       /*
+        * On 32-bit architectures, double-wide atomic loads or stores are a CAS,
+        * so the atomic increment is the most efficient way to increment the
+        * counter.
+        *
+        * On 64-bit architectures, however, because the update is synchronized by
+        * the cpu mask, relaxed loads and stores are more efficient.
+        */
+#if __LP64__
+       old_gen = atomic_load_explicit(sched_gen, memory_order_relaxed);
+       atomic_store_explicit(sched_gen, old_gen + 1, memory_order_relaxed);
 #else
-               (void)rate;
-               (void)base_local_ts;
-               (void)base_remote_ts;
-#endif /* __arm64__ */
-       }
+       old_gen = atomic_fetch_add_explicit(sched_gen, 1, memory_order_relaxed);
+#endif
+       return old_gen;
 }
+
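
A userspace-flavoured sketch of the two-increment rule described above; the helper name and the raw counter pointer are illustrative only, since a real observer would reach the counter through the commpage layout rather than a plain pointer:

#include <stdatomic.h>
#include <stdint.h>

/* Block until the quiescent counter has advanced by at least two.  A single
 * increment might be observed from either side of the barrier chain, so only
 * the second one proves that a serially dependent chain was crossed by every
 * running CPU. */
static void
wait_for_cpu_quiescence(_Atomic uint64_t *counter)
{
	uint64_t start = atomic_load_explicit(counter, memory_order_relaxed);

	while (atomic_load_explicit(counter, memory_order_relaxed) - start < 2) {
		/* yield or pause here in a real implementation */
	}
}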
index 711be4cda138d936e607cf2acbf0a2bde11b4a3d..d7f349c295190aa25790e6586738ad484ef2cded 100644 (file)
@@ -46,5 +46,6 @@ extern  void  commpage_update_mach_continuous_time(uint64_t sleeptime);
 extern void    commpage_update_multiuser_config(uint32_t);
 extern  void    commpage_update_boottime(uint64_t boottime_usec);
 extern void    commpage_set_remotetime_params(double rate, uint64_t base_local_ts, uint64_t base_remote_ts);
+extern uint64_t commpage_increment_cpu_quiescent_counter(void);
 
 #endif /* _ARM_COMMPAGE_H */
diff --git a/osfmk/arm/conf.c b/osfmk/arm/conf.c
deleted file mode 100644 (file)
index a9da871..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2007 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/*
- * @APPLE_FREE_COPYRIGHT@
- */
-/*
- * Mach Operating System Copyright (c) 1991,1990,1989 Carnegie Mellon
- * University All Rights Reserved.
- * 
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright notice
- * and this permission notice appear in all copies of the software,
- * derivative works or modified versions, and any portions thereof, and that
- * both notices appear in supporting documentation.
- * 
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" CONDITION.
- * CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR ANY DAMAGES
- * WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
- * Carnegie Mellon requests users of this software to return to
- * 
- * Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- * School of Computer Science Carnegie Mellon University Pittsburgh PA
- * 15213-3890
- * 
- * any improvements or extensions that they make and grant Carnegie Mellon the
- * rights to redistribute these changes.
- */
-/*
- * */
-
-#include <types.h>
-#include <kern/clock.h>
-#include <libkern/section_keywords.h>
-
-/*
- * Clock device subsystem configuration. The clock_list[]
- * table contains the clock structures for all clocks in
- * the system.
- */
-
-extern const struct clock_ops sysclk_ops, calend_ops;
-
-/*
- * List of clock devices.
- */
-SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = {
-
-       /* SYSTEM_CLOCK */
-       {&sysclk_ops, 0, 0},
-
-       /* CALENDAR_CLOCK */
-       {&calend_ops, 0, 0},
-};
-int             clock_count = sizeof(clock_list) / sizeof(clock_list[0]);
index 46cfcddb7d787602d0330e87e748bf8463d928b5..49b5833cce56b7d005dae0ad87c8a0f30e3df2a9 100644 (file)
@@ -157,7 +157,7 @@ cpu_idle(void)
        platform_cache_idle_exit();
 
        ClearIdlePop(TRUE);
-       cpu_idle_exit();
+       cpu_idle_exit(FALSE);
 }
 
 /*
@@ -165,7 +165,7 @@ cpu_idle(void)
  *     Function:
  */
 void
-cpu_idle_exit(void)
+cpu_idle_exit(boolean_t from_reset __unused)
 {
        uint64_t        new_idle_timeout_ticks = 0x0ULL;
        cpu_data_t     *cpu_data_ptr = getCpuDatap();
@@ -267,55 +267,35 @@ cpu_init(void)
 
 }
 
-cpu_data_t *
-cpu_data_alloc(boolean_t is_boot_cpu)
+void
+cpu_stack_alloc(cpu_data_t *cpu_data_ptr)
 {
-       cpu_data_t              *cpu_data_ptr = NULL;
-
-       if (is_boot_cpu)
-               cpu_data_ptr = &BootCpuData;
-       else {
-               void    *irq_stack = NULL;
-               void    *fiq_stack = NULL;
-
-               if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS)
-                       goto cpu_data_alloc_error;
-
-               bzero((void *)cpu_data_ptr, sizeof(cpu_data_t));
-
-               if ((irq_stack = kalloc(INTSTACK_SIZE)) == 0) 
-                       goto cpu_data_alloc_error;
-#if __BIGGEST_ALIGNMENT__
-               /* force 16-byte alignment */
-               if ((uint32_t)irq_stack & 0x0F)
-                       irq_stack = (void *)((uint32_t)irq_stack + (0x10 - ((uint32_t)irq_stack & 0x0F)));
-#endif
-               cpu_data_ptr->intstack_top = (vm_offset_t)irq_stack + INTSTACK_SIZE ;
-               cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top;
-
-               if ((fiq_stack = kalloc(PAGE_SIZE)) == 0) 
-                       goto cpu_data_alloc_error;
-#if __BIGGEST_ALIGNMENT__
-               /* force 16-byte alignment */
-               if ((uint32_t)fiq_stack & 0x0F)
-                       fiq_stack = (void *)((uint32_t)fiq_stack + (0x10 - ((uint32_t)fiq_stack & 0x0F)));
-#endif
-               cpu_data_ptr->fiqstack_top = (vm_offset_t)fiq_stack + PAGE_SIZE ;
-               cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top;
-       }
-
-       cpu_data_ptr->cpu_processor = cpu_processor_alloc(is_boot_cpu);
-       if (cpu_data_ptr->cpu_processor == (struct processor *)NULL)
-               goto cpu_data_alloc_error;
-
-       return cpu_data_ptr;
-
-cpu_data_alloc_error:
-       panic("cpu_data_alloc() failed\n");
-       return (cpu_data_t *)NULL;
+       vm_offset_t             irq_stack = 0;
+       vm_offset_t             fiq_stack = 0;
+
+       kern_return_t kr = kernel_memory_allocate(kernel_map, &irq_stack,
+                                  INTSTACK_SIZE + (2 * PAGE_SIZE),
+                                  PAGE_MASK,
+                                  KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT,
+                                  VM_KERN_MEMORY_STACK);
+       if (kr != KERN_SUCCESS)
+               panic("Unable to allocate cpu interrupt stack\n");
+
+       cpu_data_ptr->intstack_top = irq_stack + PAGE_SIZE + INTSTACK_SIZE;
+       cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top;
+
+       kr = kernel_memory_allocate(kernel_map, &fiq_stack,
+                                  FIQSTACK_SIZE + (2 * PAGE_SIZE),
+                                  PAGE_MASK,
+                                  KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT,
+                                  VM_KERN_MEMORY_STACK);
+       if (kr != KERN_SUCCESS)
+               panic("Unable to allocate cpu exception stack\n");
+
+       cpu_data_ptr->fiqstack_top = fiq_stack + PAGE_SIZE + FIQSTACK_SIZE;
+       cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top;
 }
 
-
 void
 cpu_data_free(cpu_data_t *cpu_data_ptr)
 {
@@ -324,7 +304,7 @@ cpu_data_free(cpu_data_t *cpu_data_ptr)
 
        cpu_processor_free( cpu_data_ptr->cpu_processor);
        kfree( (void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE);
-       kfree( (void *)(cpu_data_ptr->fiqstack_top - PAGE_SIZE), PAGE_SIZE);
+       kfree( (void *)(cpu_data_ptr->fiqstack_top - FIQSTACK_SIZE), FIQSTACK_SIZE);
        kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t));
 }
 
@@ -579,7 +559,7 @@ cpu_machine_idle_init(boolean_t from_boot)
                                     ((unsigned int)&(ResetHandlerData.cpu_data_entries) - (unsigned int)&ExceptionLowVectorsBase)),
                           4);
 
-               CleanPoC_DcacheRegion((vm_offset_t) phystokv((char *) (gPhysBase)), PAGE_SIZE);
+               CleanPoC_DcacheRegion((vm_offset_t) phystokv(gPhysBase), PAGE_SIZE);
 
                resume_idle_cpu_paddr = (unsigned int)ml_static_vtop((vm_offset_t)&resume_idle_cpu);
 
index b686c0ed1f0124edd5791adcca82e0375b41f1fa..32044d7d07f08fb43b87057c61f0c75a75985382 100644 (file)
 
 #ifndef        __ASSEMBLER__
 #include <stdint.h>
-#ifdef KERNEL_PRIVATE
 #include <mach/vm_types.h>
 #endif
-#endif
  
 /*
  * This is the authoritative way to determine from user mode what
@@ -47,6 +45,7 @@
 /*
  * Bit definitions for _cpu_capabilities:
  */
+#define        kHasNeonFP16                    0x00000008      // ARM v8.2 NEON FP16 supported
 #define        kCache32                        0x00000010      // cache line size is 32 bytes
 #define        kCache64                        0x00000020      // cache line size is 64 bytes
 #define        kCache128                       0x00000040      // cache line size is 128 bytes
@@ -60,6 +59,7 @@
 #define        kNumCPUs                        0x00FF0000      // number of CPUs (see _NumCPUs() below)
 #define kHasARMv8Crypto                        0x01000000      // Optional ARMv8 Crypto extensions
 #define kHasARMv81Atomics              0x02000000      // ARMv8.1 Atomic instructions supported
+#define kHasARMv8Crc32                 0x04000000      // Optional ARMv8 crc32 instructions (required in ARMv8.1)
 
 #define        kNumCPUsShift           16                      // see _NumCPUs() below
 
@@ -91,6 +91,8 @@ typedef struct {
        volatile uint32_t       TimeBase_shift;
 } commpage_timeofday_data_t;
 
+extern vm_address_t                            _get_commpage_priv_address(void);
+
 #endif /* __ASSEMBLER__ */
 
 
@@ -98,26 +100,22 @@ typedef struct {
  * The shared kernel/user "comm page(s)":
  */
 
-#if defined(__arm64__)
+#if defined(__LP64__)
 
 #define _COMM_PAGE64_BASE_ADDRESS              (0x0000000FFFFFC000ULL) /* In TTBR0 */
 #define _COMM_HIGH_PAGE64_BASE_ADDRESS (0xFFFFFFF0001FC000ULL) /* Just below the kernel, safely in TTBR1; only used for testing */
-#define _COMM_PRIV_PAGE64_BASE_ADDRESS (_COMM_HIGH_PAGE64_BASE_ADDRESS - (PAGE_SIZE))          /* Privileged RO in kernel mode */
 
 #define _COMM_PAGE64_AREA_LENGTH               (_COMM_PAGE32_AREA_LENGTH)
 #define _COMM_PAGE64_AREA_USED                 (-1)
 
-// macro to change a user comm page address to one that is accessible from privileged mode
-// we can no longer access user memory in privileged mode once PAN is enabled
-#define _COMM_PAGE_PRIV(_addr_)                        ((_addr_) - (_COMM_PAGE_START_ADDRESS) + (_COMM_PRIV_PAGE64_BASE_ADDRESS))
+#define _COMM_PAGE_PRIV(_addr_)                        ((_addr_) - (_COMM_PAGE_START_ADDRESS) + _get_commpage_priv_address())
 
 #ifdef KERNEL_PRIVATE
-extern vm_address_t                                            sharedpage_rw_addr;
 #define        _COMM_PAGE_RW_OFFSET                    (0)
 #define        _COMM_PAGE_AREA_LENGTH                  (PAGE_SIZE)
 
-#define        _COMM_PAGE_BASE_ADDRESS                 (sharedpage_rw_addr)
-#define _COMM_PAGE_START_ADDRESS               (sharedpage_rw_addr)
+#define        _COMM_PAGE_BASE_ADDRESS                 (_get_commpage_priv_address())
+#define _COMM_PAGE_START_ADDRESS               (_get_commpage_priv_address())
 #else /* KERNEL_PRIVATE */
 #define        _COMM_PAGE_AREA_LENGTH                  (4096)
 
@@ -125,7 +123,7 @@ extern vm_address_t                                         sharedpage_rw_addr;
 #define _COMM_PAGE_START_ADDRESS               _COMM_PAGE64_BASE_ADDRESS
 #endif /* KERNEL_PRIVATE */
 
-#elif defined(__arm__)
+#else
 
 #define _COMM_PAGE64_BASE_ADDRESS              (-1)
 #define _COMM_PAGE64_AREA_LENGTH               (-1)
@@ -137,8 +135,7 @@ extern vm_address_t                                         sharedpage_rw_addr;
 #define _COMM_PAGE_PRIV(_addr_)                        (_addr_)
 
 #ifdef KERNEL_PRIVATE
-extern vm_address_t                            sharedpage_rw_addr;
-#define        _COMM_PAGE_RW_OFFSET                    (sharedpage_rw_addr-_COMM_PAGE_BASE_ADDRESS)
+#define        _COMM_PAGE_RW_OFFSET                    (_get_commpage_priv_address()-_COMM_PAGE_BASE_ADDRESS)
 #define        _COMM_PAGE_AREA_LENGTH                  (PAGE_SIZE)
 #else
 #define        _COMM_PAGE_AREA_LENGTH                  (4096)
@@ -147,8 +144,6 @@ extern vm_address_t                         sharedpage_rw_addr;
 #define        _COMM_PAGE_BASE_ADDRESS                 _COMM_PAGE32_BASE_ADDRESS
 #define _COMM_PAGE_START_ADDRESS               _COMM_PAGE32_BASE_ADDRESS
 
-#else
-#error Unknown architecture.
 #endif
 
 #define _COMM_PAGE32_BASE_ADDRESS              (0xFFFF4000)            /* Must be outside of normal map bounds */
@@ -208,6 +203,9 @@ extern vm_address_t                         sharedpage_rw_addr;
 
 #define _COMM_PAGE_NEWTIMEOFDAY_DATA           (_COMM_PAGE_START_ADDRESS+0x120)        // used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40.
 
+// aligning to 128 bytes for cacheline/fabric size
+#define _COMM_PAGE_CPU_QUIESCENT_COUNTER        (_COMM_PAGE_START_ADDRESS+0x180)        // uint64_t, but reserve the whole 128 (0x80) bytes
+
 #define _COMM_PAGE_END                         (_COMM_PAGE_START_ADDRESS+0x1000)       // end of common page
 
 #endif /* _ARM_CPU_CAPABILITIES_H */
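Since commpage slots live at fixed, per-ABI virtual addresses, a 64-bit user process could in principle sample the new quiescent counter directly. A hedged sketch using the base address and 0x180 offset defined above; whether user code is expected to consume this particular slot is an assumption made for illustration:

#include <stdint.h>

static inline uint64_t
read_cpu_quiescent_counter(void)
{
        /* _COMM_PAGE64_BASE_ADDRESS + 0x180, values taken from the header above */
        const volatile uint64_t *p =
            (const volatile uint64_t *)(0x0000000FFFFFC000ULL + 0x180);
        return *p;
}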
index 2b9b0a2614c70247f673dca40cbb7fe39293b392..d976ce5c108898cd279d04a2633098e9e925b4af 100644 (file)
@@ -515,6 +515,33 @@ processor_to_cpu_datap(processor_t processor)
        return target_cpu_datap;
 }
 
+cpu_data_t *
+cpu_data_alloc(boolean_t is_boot_cpu)
+{
+       cpu_data_t              *cpu_data_ptr = NULL;
+
+       if (is_boot_cpu)
+               cpu_data_ptr = &BootCpuData;
+       else {
+               if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS)
+                       goto cpu_data_alloc_error;
+
+               bzero((void *)cpu_data_ptr, sizeof(cpu_data_t));
+
+               cpu_stack_alloc(cpu_data_ptr);
+       }
+
+       cpu_data_ptr->cpu_processor = cpu_processor_alloc(is_boot_cpu);
+       if (cpu_data_ptr->cpu_processor == (struct processor *)NULL)
+               goto cpu_data_alloc_error;
+
+       return cpu_data_ptr;
+
+cpu_data_alloc_error:
+       panic("cpu_data_alloc() failed\n");
+       return (cpu_data_t *)NULL;
+}
+
 ast_t *
 ast_pending(void)
 {
index f35121e35bda0f6802d9ca6c846426208d9fade4..3b8c8885442794af16831b6524d30e4f13152a3d 100644 (file)
 #include <mach/mach_types.h>
 #include <machine/thread.h>
 
-
 #define current_thread()       current_thread_fast()
 
-static inline thread_t current_thread_fast(void) 
+static inline __pure2 thread_t current_thread_fast(void) 
+{
+#if defined(__arm64__)
+       return (thread_t)(__builtin_arm_rsr64("TPIDR_EL1"));
+#else
+       return (thread_t)(__builtin_arm_mrc(15, 0, 13, 0, 4));  // TPIDRPRW
+#endif
+}
+
+/*
+ * The "volatile" flavor of current_thread() is intended for use by
+ * scheduler code which may need to update the thread pointer in the
+ * course of a context switch.  Any call to current_thread() made
+ * prior to the thread pointer update should be safe to optimize away
+ * as it should be consistent with that thread's state to the extent
+ * the compiler can reason about it.  Likewise, the context switch
+ * path will eventually result in an arbitrary branch to the new
+ * thread's pc, about which the compiler won't be able to reason.
+ * Thus any compile-time optimization of current_thread() calls made
+ * within the new thread should be safely encapsulated in its
+ * register/stack state.  The volatile form therefore exists to cover
+ * the window between the thread pointer update and the branch to
+ * the new pc.
+ */
+static inline thread_t current_thread_volatile(void)
 {
-        thread_t        result;
+       /* The compiler treats rsr64 as const, which can allow
+          it to eliminate redundant calls, which we don't want here.
+          Thus we use volatile asm.  The mrc used for arm32 should be
+          treated as volatile however. */
 #if defined(__arm64__)
-        __asm__ volatile("mrs %0, TPIDR_EL1" : "=r" (result));
+       thread_t result;
+       __asm__ volatile("mrs %0, TPIDR_EL1" : "=r" (result));
+       return result;
 #else
-       result = (thread_t)__builtin_arm_mrc(15, 0, 13, 0, 4);  // TPIDRPRW
+       return (thread_t)(__builtin_arm_mrc(15, 0, 13, 0, 4));  // TPIDRPRW
 #endif
-        return result;
 }
 
 #if defined(__arm64__)
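A hedged illustration of the hazard the current_thread_volatile() comment describes; the context-switch body is elided pseudocode, only the two accessor calls are real:

static void
context_switch_window_sketch(void)
{
        thread_t before = current_thread_fast();   /* const read: may be cached/CSE'd */

        /* ... scheduler updates TPIDR_EL1 / TPIDRPRW to point at the new thread ... */

        /* A current_thread_fast() here could legally be folded into `before`,
         * so code running inside this window must force a re-read: */
        thread_t after = current_thread_volatile();

        (void)before;
        (void)after;
}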
index e660c1e36ee704e170c6956bc68217d203a02794..29acbc1e8b01dbd78db345cec87647be5c677f0b 100644 (file)
@@ -119,12 +119,13 @@ typedef struct cpu_data
        unsigned short                          cpu_flags;
        vm_offset_t                             istackptr;
        vm_offset_t                             intstack_top;
-       vm_offset_t                             fiqstackptr;
-       vm_offset_t                             fiqstack_top;
 #if __arm64__
        vm_offset_t                             excepstackptr;
        vm_offset_t                             excepstack_top;
        boolean_t                               cluster_master;
+#else
+       vm_offset_t                             fiqstackptr;
+       vm_offset_t                             fiqstack_top;
 #endif
        boolean_t                               interrupts_enabled;
        thread_t                                cpu_active_thread;
@@ -259,7 +260,6 @@ typedef struct cpu_data
 #if MONOTONIC
        struct mt_cpu                           cpu_monotonic;
 #endif /* MONOTONIC */
-       struct prngContext                      *cpu_prng;
        cluster_type_t                          cpu_cluster_type;
        uint32_t                                cpu_cluster_id;
        uint32_t                                cpu_l2_id;
@@ -302,9 +302,10 @@ extern     unsigned int            LowExceptionVectorBase;
 
 extern cpu_data_t                      *cpu_datap(int cpu);
 extern cpu_data_t                      *cpu_data_alloc(boolean_t is_boot);
-extern void                                    cpu_data_init(cpu_data_t *cpu_data_ptr);
-extern void                                    cpu_data_free(cpu_data_t *cpu_data_ptr);
-extern kern_return_t           cpu_data_register(cpu_data_t *cpu_data_ptr);
+extern void                            cpu_stack_alloc(cpu_data_t*);
+extern void                            cpu_data_init(cpu_data_t *cpu_data_ptr);
+extern void                            cpu_data_free(cpu_data_t *cpu_data_ptr);
+extern kern_return_t                   cpu_data_register(cpu_data_t *cpu_data_ptr);
 extern cpu_data_t                      *processor_to_cpu_datap( processor_t processor);
 
 #if __arm64__
index c7c846d75c0366cf10d2698f2eb4aebecdf39b82..34b4bce727f548691ef9e8bb861e3a226a34a53f 100644 (file)
@@ -69,5 +69,9 @@ extern void                   cpu_signal_cancel(
 
 extern unsigned int real_ncpus;
 
+#if defined(CONFIG_XNUPOST) && __arm64__ 
+extern void arm64_ipi_test(void);
+#endif /* defined(CONFIG_XNUPOST) && __arm64__ */
+
 
 #endif /* _ARM_CPU_INTERNAL_H_ */
index 2782475e284b93d8752a76a23f13e19ad8b0b20a..22435e76bc90017ec051bd8d03ecf4a9a0833420 100644 (file)
@@ -172,6 +172,10 @@ cpuid_get_cpufamily(void)
                case CPU_PART_HURRICANE_MYST:
                        cpufamily = CPUFAMILY_ARM_HURRICANE;
                        break;
+               case CPU_PART_MONSOON:
+               case CPU_PART_MISTRAL:
+                       cpufamily = CPUFAMILY_ARM_MONSOON_MISTRAL;
+                       break;
                default:
                        cpufamily = CPUFAMILY_UNKNOWN;
                        break;
index 07778404b490d31b15de389f21267e1b0ffee6b1..bf642b6c591242072c1a97f8b786c5d1d3cbfd7f 100644 (file)
@@ -132,6 +132,12 @@ typedef union {
 /* H9G (ARMv8 architecture) */
 #define CPU_PART_HURRICANE_MYST 0x7
 
+/* H10 p-Core (ARMv8 architecture) */
+#define CPU_PART_MONSOON       0x8
+
+/* H10 e-Core (ARMv8 architecture) */
+#define CPU_PART_MISTRAL       0x9
+
 
 /* Cache type identification */
 
@@ -198,6 +204,7 @@ typedef union {
 typedef struct {
        uint32_t                neon;
        uint32_t                neon_hpfp;
+       uint32_t                neon_fp16;
 } arm_mvfp_info_t;
 
 #ifdef __cplusplus
index 7c3812dd01e68996edc7de03e0016cac88e7e759..7851e0ed345db3c1f8527054867cb27b7960f6e9 100644 (file)
@@ -97,10 +97,12 @@ LEXT(machine_load_context)
        bx              lr                                                                      // Return
 
 /*
- *     void Call_continuation( void (*continuation)(void), 
- *                             void *param, 
- *                             wait_result_t wresult, 
- *                             vm_offset_t stack_ptr)
+ *  typedef void (*thread_continue_t)(void *param, wait_result_t)
+ *
+ *     void Call_continuation( thread_continue_t continuation,
+ *                             void *param,
+ *                             wait_result_t wresult,
+ *                             boolean_t enable_interrupts)
  */
        .text
        .align  5
@@ -110,10 +112,21 @@ LEXT(Call_continuation)
        mrc             p15, 0, r9, c13, c0, 4                          // Read TPIDRPRW
        ldr             sp, [r9, TH_KSTACKPTR]                          // Set stack pointer
        mov             r7, #0                                                          // Clear frame pointer
-       mov             r6,r0                                                           // Load continuation
-       mov             r0,r1                                                           // Set first parameter
-       mov             r1,r2                                                           // Set wait result arg
-       blx             r6                                                                      // Branch to continuation
+
+       mov             r4,r0                                                           // Load continuation
+       mov             r5,r1                                                           // continuation parameter
+       mov             r6,r2                                                           // Set wait result arg
+
+       teq             r3, #0                                                          // Caller wants interrupts enabled?
+       beq             1f                                                              // No: leave them masked
+       mov             r0, #1
+       bl              _ml_set_interrupts_enabled                                      // Enable interrupts
+1:
+
+       mov             r0,r5                                                           // Set first parameter
+       mov             r1,r6                                                           // Set wait result arg
+       blx             r4                                                                      // Branch to continuation
+
        mrc             p15, 0, r0, c13, c0, 4                          // Read TPIDRPRW
        LOAD_ADDR_PC(thread_terminate)
        b               .                                                                       // Not reach
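A hedged C-level sketch of the rewritten Call_continuation's control flow; thread_continue_t, ml_set_interrupts_enabled() and thread_terminate() are existing kernel names, the function itself is only an illustration of what the assembly above does:

static void
call_continuation_sketch(thread_continue_t continuation, void *param,
    wait_result_t wresult, boolean_t enable_interrupts)
{
        /* (the assembly first switches to the thread's kernel stack and
         *  clears the frame pointer) */
        if (enable_interrupts)
                ml_set_interrupts_enabled(TRUE);

        continuation(param, wresult);

        /* continuations never return to their caller */
        thread_terminate(current_thread());
}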
index b82b5033917ea38b41bb8459a127548b5016d169..1ffa7e5d48e4c67f8d34fc28eeeda59b4e5e3182 100644 (file)
 #error Unknown architecture.
 #endif
 
+       .section __BOOTDATA, __data                                     // Aligned data
 
-       .section __DATA, __data                                         // Aligned data
+       .align 14 
 
-#if __arm64__
-       /*
-        * Exception stack; this is above the interrupt stack so we don't squash the interrupt
-        * stack on an exception.
-        */
-       .global EXT(excepstack)
-LEXT(excepstack)
-       .space  (4096)
-       .globl  EXT(excepstack_top)
-LEXT(excepstack_top)
-#endif
+       .globl EXT(intstack_low_guard)
+LEXT(intstack_low_guard)
+       .space (PAGE_MAX_SIZE_NUM)
 
        /* IRQ stack */
        .globl  EXT(intstack)                                           // Boot processor IRQ stack
 LEXT(intstack)
-       .space  (4*4096)
+       .space  (INTSTACK_SIZE_NUM)
        .globl  EXT(intstack_top)
 LEXT(intstack_top)
 
+       .globl EXT(intstack_high_guard)
+LEXT(intstack_high_guard)
+       .space (PAGE_MAX_SIZE_NUM)
 
-       .align 12                                                       // Page aligned Section
+/* Low guard for fiq/exception stack is shared w/ interrupt stack high guard */
+
+#ifndef __arm64__
 
        .globl  EXT(fiqstack)                                           // Boot processor FIQ stack
 LEXT(fiqstack)
-       .space  (4096)                                                  // One page size
+       .space  (FIQSTACK_SIZE_NUM)
        .globl  EXT(fiqstack_top)                                       // Boot processor FIQ stack top
 LEXT(fiqstack_top)
 
+       .globl EXT(fiqstack_high_guard)
+LEXT(fiqstack_high_guard)
+       .space (PAGE_MAX_SIZE_NUM)
+
+#else
+
+       .global EXT(excepstack)
+LEXT(excepstack)
+       .space  (EXCEPSTACK_SIZE_NUM)
+       .globl  EXT(excepstack_top)
+LEXT(excepstack_top)
+
+       .globl EXT(excepstack_high_guard)
+LEXT(excepstack_high_guard)
+       .space (PAGE_MAX_SIZE_NUM)
+
+#endif
+
+// Must align to 16K here, due to <rdar://problem/33268668>
+        .global EXT(kd_early_buffer)
+        .align 14
+LEXT(kd_early_buffer) // space for kdebug's early event buffer
+        .space 16*1024,0
+
+       .section __DATA, __data                                         // Aligned data
+
        .globl  EXT(CpuDataEntries)
        .align  12                                                      // Page aligned
 LEXT(CpuDataEntries)                                                   // Cpu Data Entry Array               
@@ -90,12 +114,6 @@ LEXT(vfptrash_data)
        .fill   64, 4, 0xca55e77e
 #endif
 
-// Must align to 16K here, due to <rdar://problem/33268668>
-        .global EXT(kd_early_buffer)
-        .align 14
-LEXT(kd_early_buffer) // space for kdebug's early event buffer
-        .space 16*1024,0
-
 #if __arm64__
         .section __DATA, __const
 
@@ -103,7 +121,7 @@ LEXT(kd_early_buffer) // space for kdebug's early event buffer
 /* reserve space for read only page tables */
         .align 14
 LEXT(ropagetable_begin)
-        .space 16*16*1024,0
+        .space 14*16*1024,0
 #else
 LEXT(ropagetable_begin)
 #endif /* defined(KERNEL_INTEGRITY_KTRR)*/
index 9403467191e90dd0f6b3a9554813c34287b66bc6..a91fd5dc4b0ea85942e7e08565d66a7e0f493c59 100644 (file)
@@ -51,6 +51,24 @@ typedef enum {
        DBGWRAP_WARN_CPU_OFFLINE
 } dbgwrap_status_t;
 
+static inline const char*
+ml_dbgwrap_strerror(dbgwrap_status_t status) {
+       switch (status) {
+
+       case DBGWRAP_ERR_SELF_HALT:             return "CPU attempted to halt itself";
+       case DBGWRAP_ERR_UNSUPPORTED:           return "halt not supported for this configuration";
+       case DBGWRAP_ERR_INPROGRESS:            return "halt in progress on another CPU";
+       case DBGWRAP_ERR_INSTR_ERROR:           return "instruction-stuffing failure";
+       case DBGWRAP_ERR_INSTR_TIMEOUT:         return "instruction-stuffing timeout";
+       case DBGWRAP_ERR_HALT_TIMEOUT:          return "halt ack timeout, CPU likely wedged";
+       case DBGWRAP_SUCCESS:                   return "halt succeeded";
+       case DBGWRAP_WARN_ALREADY_HALTED:       return "CPU already halted";
+       case DBGWRAP_WARN_CPU_OFFLINE:          return "CPU offline";
+       default:                                return "unrecognized status";
+
+       }
+}
+
 boolean_t ml_dbgwrap_cpu_is_halted(int cpu_index);
 
 dbgwrap_status_t ml_dbgwrap_wait_cpu_halted(int cpu_index, uint64_t timeout_ns);
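A hedged usage sketch for the new ml_dbgwrap_strerror() helper; the call site is hypothetical:

static void
report_halt_status_sketch(int cpu_index, uint64_t timeout_ns)
{
        dbgwrap_status_t st = ml_dbgwrap_wait_cpu_halted(cpu_index, timeout_ns);

        if (st != DBGWRAP_SUCCESS)
                kprintf("dbgwrap: cpu %d: %s\n", cpu_index, ml_dbgwrap_strerror(st));
}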
index 4f3c1b8ba68e8f932af2e99ddc10c8911961de88..36e59b9c755f059ec7628fbb1326499b0d1b204e 100644 (file)
@@ -316,8 +316,10 @@ main(
 
        DECLARE("CPU_DATA_PADDR",       offsetof(struct cpu_data_entry, cpu_data_paddr));
 
-
        DECLARE("INTSTACK_SIZE",        INTSTACK_SIZE);
+       DECLARE("FIQSTACK_SIZE",        FIQSTACK_SIZE);
+
+       DECLARE("PAGE_MAX_SIZE",        PAGE_MAX_SIZE);
 
        /* values from kern/timer.h */
        DECLARE("TIMER_LOW",
index 3a58a7fccea1f5fa9a6f9ba7f4d0e77a2506f40b..16acddb90ed12ec989a9b0457dd152ed70f0a86a 100644 (file)
@@ -255,8 +255,6 @@ typedef struct {
 
 #define LOCK_PANIC_TIMEOUT     0xc00000        // 12.5 m ticks = 250ms with 24MHz OSC
 
-#define LOCK_TRY_DISABLE_INT 1 // Disable interrupts for a quick acquire attempt
-
 #define PLATFORM_LCK_ILOCK LCK_ILOCK
 
 
@@ -276,6 +274,7 @@ typedef struct {
 #define LCK_MTX_THREAD_MASK (~(uintptr_t)(LCK_ILOCK | ARM_LCK_WAITERS))
 
 #define disable_preemption_for_thread(t) ((volatile thread_t)t)->machine.preemption_count++
+#define preemption_disabled_for_thread(t) (((volatile thread_t)t)->machine.preemption_count > 0)
 
 
 __unused static void disable_interrupts_noread(void)
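A hedged sketch of how the new preemption_disabled_for_thread() predicate can back an assertion on lock paths; the macro name and its use are hypothetical:

/* assert that the current thread already holds preemption disabled */
#define ASSERT_PREEMPTION_DISABLED() \
        assert(preemption_disabled_for_thread(current_thread()))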
index 941ec38fc831c9e70e35cdece8ff8e39b38219fe..b43f665db1d83b0de86aacc1309138557257efee 100644 (file)
@@ -232,7 +232,6 @@ static boolean_t lck_rw_lock_shared_to_exclusive_success(lck_rw_t *lck);
 static boolean_t lck_rw_lock_shared_to_exclusive_failure(lck_rw_t *lck, uint32_t prior_lock_state);
 static void lck_rw_lock_exclusive_to_shared_gen(lck_rw_t *lck, uint32_t prior_lock_state);
 static lck_rw_type_t lck_rw_done_gen(lck_rw_t *lck, uint32_t prior_lock_state);
-void lck_rw_clear_promotions_x86(thread_t thread);
 static boolean_t lck_rw_grab(lck_rw_t *lock, int mode, boolean_t wait);
 
 /*
@@ -358,8 +357,8 @@ static unsigned int
 hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout);
 #endif
 
-unsigned int
-hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
+static inline unsigned int
+hw_lock_bit_to_internal(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
 {
        unsigned int success = 0;
        uint32_t        mask = (1 << bit);
@@ -367,7 +366,6 @@ hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
        uint32_t        state;
 #endif
 
-       _disable_preemption();
 #if    __SMP__
        if (__improbable(!atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE)))
                success = hw_lock_bit_to_contended(lock, mask, timeout);
@@ -390,6 +388,13 @@ hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
        return success;
 }
 
+unsigned int
+hw_lock_bit_to(hw_lock_bit_t *lock, unsigned int bit, uint32_t timeout)
+{
+       _disable_preemption();
+       return hw_lock_bit_to_internal(lock, bit, timeout);
+}
+
 #if    __SMP__
 static unsigned int NOINLINE
 hw_lock_bit_to_contended(hw_lock_bit_t *lock, uint32_t mask, uint32_t timeout)
@@ -440,17 +445,30 @@ hw_lock_bit(hw_lock_bit_t *lock, unsigned int bit)
 #endif
 }
 
+void
+hw_lock_bit_nopreempt(hw_lock_bit_t *lock, unsigned int bit)
+{
+       if (__improbable(get_preemption_level() == 0))
+               panic("Attempt to take no-preempt bitlock %p in preemptible context", lock);
+       if (hw_lock_bit_to_internal(lock, bit, LOCK_PANIC_TIMEOUT))
+               return;
+#if    __SMP__
+       panic("hw_lock_bit_nopreempt(): timed out (%p)", lock);
+#else
+       panic("hw_lock_bit_nopreempt(): interlock held (%p)", lock);
+#endif
+}
+
 unsigned int
 hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit)
 {
-       long            intmask;
        uint32_t        mask = (1 << bit);
 #if    !__SMP__
        uint32_t        state;
 #endif
        boolean_t       success = FALSE;
 
-       intmask = disable_interrupts();
+       _disable_preemption();
 #if    __SMP__
        // TODO: consider weak (non-looping) atomic test-and-set
        success = atomic_test_and_set32(lock, mask, mask, memory_order_acquire, FALSE);
@@ -461,9 +479,8 @@ hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit)
                success = TRUE;
        }
 #endif // __SMP__
-       if (success)
-               disable_preemption();
-       restore_interrupts(intmask);
+       if (!success)
+               _enable_preemption();
 
 #if CONFIG_DTRACE
        if (success)
@@ -473,14 +490,8 @@ hw_lock_bit_try(hw_lock_bit_t *lock, unsigned int bit)
        return success;
 }
 
-/*
- *     Routine:        hw_unlock_bit
- *
- *             Release spin-lock. The second parameter is the bit number to test and set.
- *             Decrement the preemption level.
- */
-void
-hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit)
+static inline void
+hw_unlock_bit_internal(hw_lock_bit_t *lock, unsigned int bit)
 {
        uint32_t        mask = (1 << bit);
 #if    !__SMP__
@@ -497,9 +508,28 @@ hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit)
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, bit);
 #endif
-       enable_preemption();
 }
 
+/*
+ *     Routine:        hw_unlock_bit
+ *
+ *             Release spin-lock. The second parameter is the bit number to test and set.
+ *             Decrement the preemption level.
+ */
+void
+hw_unlock_bit(hw_lock_bit_t *lock, unsigned int bit)
+{
+       hw_unlock_bit_internal(lock, bit);
+       _enable_preemption();
+}
+
+void
+hw_unlock_bit_nopreempt(hw_lock_bit_t *lock, unsigned int bit)
+{
+       if (__improbable(get_preemption_level() == 0))
+               panic("Attempt to release no-preempt bitlock %p in preemptible context", lock);
+       hw_unlock_bit_internal(lock, bit);
+}
 
 /*
  *      Routine:        lck_spin_alloc_init
@@ -570,6 +600,19 @@ lck_spin_lock(lck_spin_t *lock)
        hw_lock_lock(&lock->hwlock);
 }
 
+/*
+ *      Routine:        lck_spin_lock_nopreempt
+ */
+void
+lck_spin_lock_nopreempt(lck_spin_t *lock)
+{
+#if    DEVELOPMENT || DEBUG
+       if (lock->type != LCK_SPIN_TYPE)
+               panic("Invalid spinlock %p", lock);
+#endif // DEVELOPMENT || DEBUG
+       hw_lock_lock_nopreempt(&lock->hwlock);
+}
+
 /*
  *      Routine:        lck_spin_try_lock
  */
@@ -579,6 +622,15 @@ lck_spin_try_lock(lck_spin_t *lock)
        return hw_lock_try(&lock->hwlock);
 }
 
+/*
+ *      Routine:        lck_spin_try_lock_nopreempt
+ */
+int
+lck_spin_try_lock_nopreempt(lck_spin_t *lock)
+{
+       return hw_lock_try_nopreempt(&lock->hwlock);
+}
+
 /*
  *      Routine:        lck_spin_unlock
  */
@@ -594,6 +646,21 @@ lck_spin_unlock(lck_spin_t *lock)
        hw_lock_unlock(&lock->hwlock);
 }
 
+/*
+ *      Routine:        lck_spin_unlock_nopreempt
+ */
+void
+lck_spin_unlock_nopreempt(lck_spin_t *lock)
+{
+#if    DEVELOPMENT || DEBUG
+       if ((LCK_MTX_STATE_TO_THREAD(lock->lck_spin_data) != current_thread()) && LOCK_CORRECTNESS_PANIC())
+               panic("Spinlock not owned by thread %p = %lx", lock, lock->lck_spin_data);
+       if (lock->type != LCK_SPIN_TYPE)
+               panic("Invalid spinlock type %p", lock);
+#endif // DEVELOPMENT || DEBUG
+       hw_lock_unlock_nopreempt(&lock->hwlock);
+}
+
 /*
  *      Routine:        lck_spin_destroy
  */
@@ -1373,7 +1440,7 @@ lck_rw_lock_shared_to_exclusive_failure(
 
        if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                /* sched_flags checked without lock, but will be rechecked while clearing */
-               lck_rw_clear_promotion(thread);
+               lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
        }
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
@@ -1457,7 +1524,8 @@ lck_rw_lock_shared_to_exclusive_success(
                                ordered_store_rw(lock, word.data);
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
-                               res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
+                               res = assert_wait(LCK_RW_WRITER_EVENT(lock),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lock, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1796,7 +1864,8 @@ lck_rw_lock_exclusive_gen(
                                ordered_store_rw(lock, word.data);
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
-                               res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
+                               res = assert_wait(LCK_RW_WRITER_EVENT(lock),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lock, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1866,7 +1935,8 @@ lck_rw_lock_exclusive_gen(
                                ordered_store_rw(lock, word.data);
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
-                               res = assert_wait(LCK_RW_WRITER_EVENT(lock), THREAD_UNINT);
+                               res = assert_wait(LCK_RW_WRITER_EVENT(lock),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lock, istate);
 
                                if (res == THREAD_WAITING) {
@@ -2034,7 +2104,7 @@ lck_rw_done_gen(
 #endif
        if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                /* sched_flags checked without lock, but will be rechecked while clearing */
-               lck_rw_clear_promotion(thread);
+               lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
        }
 #if CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_RW_DONE_RELEASE, lck, lock_type == LCK_RW_TYPE_SHARED ? 0 : 1);
@@ -2116,7 +2186,8 @@ lck_rw_lock_shared_gen(
                                ordered_store_rw(lck, word.data);
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
-                               res = assert_wait(LCK_RW_READER_EVENT(lck), THREAD_UNINT);
+                               res = assert_wait(LCK_RW_READER_EVENT(lck),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -2414,10 +2485,10 @@ lck_mtx_lock_contended(lck_mtx_t *lock, thread_t thread, boolean_t interlocked)
        if (interlocked)
                goto interlock_held;
 
+       /* TODO: short-duration spin for on-core contention <rdar://problem/10234625> */
+
+       /* Loop waiting until I see that the mutex is unowned */
        for ( ; ; ) {
-               if (atomic_compare_exchange(&lock->lck_mtx_data, 0, LCK_MTX_THREAD_TO_STATE(thread),
-                                               memory_order_acquire_smp, FALSE))
-                       return;
                interlock_lock(lock);
 interlock_held:
                state = ordered_load_mtx(lock);
@@ -2426,7 +2497,10 @@ interlock_held:
                        break;
                ordered_store_mtx(lock, (state | LCK_ILOCK | ARM_LCK_WAITERS)); // Set waiters bit and wait
                lck_mtx_lock_wait(lock, holding_thread);
+               /* returns interlock unlocked */
        }
+
+       /* Hooray, I'm the new owner! */
        waiters = lck_mtx_lock_acquire(lock);
        state = LCK_MTX_THREAD_TO_STATE(thread);
        if (waiters != 0)
@@ -2661,14 +2735,14 @@ lck_mtx_unlock_contended(lck_mtx_t *lock, thread_t thread, boolean_t ilk_held)
                state |= LCK_ILOCK;
                ordered_store_mtx(lock, state);
 #endif
+               if (state & ARM_LCK_WAITERS) {
+                       lck_mtx_unlock_wakeup(lock, thread);
+                       state = ordered_load_mtx(lock);
+               } else {
+                       assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri);
+               }
        }
-       if (state & ARM_LCK_WAITERS) {
-               lck_mtx_unlock_wakeup(lock, thread);
-               state = ordered_load_mtx(lock);
-       } else {
-               assertf(lock->lck_mtx_pri == 0, "pri=0x%x", lock->lck_mtx_pri);
-       }
-       state &= ARM_LCK_WAITERS;               // Retain waiters bit
+       state &= ARM_LCK_WAITERS;   /* Clear state, retain waiters bit */
 #if __SMP__
        state |= LCK_ILOCK;
        ordered_store_mtx(lock, state);
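A hedged sketch of the calling discipline the new _nopreempt spinlock variants assume: the caller disables preemption once around the whole critical section, and the lock/unlock paths (which now panic if entered preemptible) skip the per-call disable/enable. The data structure and function are hypothetical:

struct percpu_queue_sketch {
        lck_spin_t      lock;
        queue_head_t    head;
};

static void
percpu_queue_touch_sketch(struct percpu_queue_sketch *q)
{
        disable_preemption();                   /* held by the caller, once */
        lck_spin_lock_nopreempt(&q->lock);
        /* ... manipulate q->head ... */
        lck_spin_unlock_nopreempt(&q->lock);
        enable_preemption();
}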
index 233166b555854a943493afa0e4283585e2bd36bd..ce41150c18042a699d0468a9bffd4dc572b39937 100644 (file)
@@ -1260,6 +1260,7 @@ fleh_irq_user:
        ldr             r2, [r2]
        movs    r2, r2
        beq             1f
+       mov             r1, #0                                  // (not a PMI record)
        bl              EXT(telemetry_mark_curthread)           // ...if so, mark the current thread...
        mrc             p15, 0, r9, c13, c0, 4                          // ...and restore the thread pointer from TPIDRPRW
 1:
@@ -1313,6 +1314,7 @@ fleh_irq_kernel:
        ldr             r2, [r2]
        movs    r2, r2
        beq             1f
+       mov             r1, #0                                  // (not a PMI record)
        bl              EXT(telemetry_mark_curthread)           // ...if so, mark the current thread...
        mrc             p15, 0, r9, c13, c0, 4                          // ...and restore the thread pointer from TPIDRPRW
 1:
@@ -1470,6 +1472,7 @@ fleh_decirq_user:
        ldr             r2, [r2]
        movs    r2, r2
        beq             1f
+       mov             r1, #0                                  // (not a PMI record)
        bl              EXT(telemetry_mark_curthread)           // ...if so, mark the current thread...
        mrc             p15, 0, r9, c13, c0, 4                          // ...and restore the thread pointer from TPIDRPRW
 1:
@@ -1523,6 +1526,7 @@ fleh_decirq_kernel:
        ldr             r2, [r2]
        movs    r2, r2
        beq             1f
+       mov             r1, #0                                  // (not a pmi record)
        bl              EXT(telemetry_mark_curthread)           // ...if so, mark the current thread...
        mrc             p15, 0, r9, c13, c0, 4                          // ...and restore the thread pointer from TPIDRPRW
 1:
@@ -1777,8 +1781,9 @@ LEXT(fleh_dec)
 #if CONFIG_TELEMETRY
        LOAD_ADDR(r4, telemetry_needs_record)           // Check if a telemetry record was requested...
        ldr             r4, [r4]
-       movs    r4, r4
+       movs            r4, r4
        beq             6f
+       mov             r1, #0                                  // (not a PMI record)
        bl              EXT(telemetry_mark_curthread)           // ...if so, mark the current thread...
        mrc             p15, 0, r9, c13, c0, 4                          // ...and restore the thread pointer from TPIDRPRW
 6:
index 46aeec6daa19e519a597dc8e54d834eb083c8b91..35999a7f7ea94f8b9041088c0c07428dc352d83b 100644 (file)
@@ -495,9 +495,9 @@ copypv(addr64_t source, addr64_t sink, unsigned int size, int which)
                panic("copypv: no more than 1 parameter may be virtual\n");     /* Not allowed */
 
        if (which & cppvPsrc)
-               from = (void *)phystokv(from);
+               from = (void *)phystokv((pmap_paddr_t)from);
        if (which & cppvPsnk)
-               to = (void *)phystokv(to);
+               to = (void *)phystokv((pmap_paddr_t)to);
 
        if ((which & (cppvPsrc | cppvKmap)) == 0)       /* Source is virtual in
                                                         * current map */
@@ -549,17 +549,17 @@ copy_validate(const user_addr_t user_addr,
 {
        uintptr_t kernel_addr_last = kernel_addr + nbytes;
 
-       if (kernel_addr < VM_MIN_KERNEL_ADDRESS ||
+       if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS ||
            kernel_addr > VM_MAX_KERNEL_ADDRESS ||
            kernel_addr_last < kernel_addr ||
-           kernel_addr_last > VM_MAX_KERNEL_ADDRESS)
+           kernel_addr_last > VM_MAX_KERNEL_ADDRESS))
                panic("%s(%p, %p, %u) - kaddr not in kernel", __func__,
                    (void *)user_addr, (void *)kernel_addr, nbytes);
 
        user_addr_t user_addr_last = user_addr + nbytes;
 
-       if (user_addr_last < user_addr ||
-           user_addr_last > VM_MIN_KERNEL_ADDRESS)
+       if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) ||
+           (user_addr < vm_map_min(current_thread()->map))))
                return (EFAULT);
 
        if (__improbable(nbytes > copysize_limit_panic))
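A hedged restatement of the rewritten user-range check in copy_validate(), pulled out into an isolated helper with simplified types (illustration only):

#include <stdint.h>

static int
user_range_ok_sketch(uintptr_t user_addr, uintptr_t nbytes,
    uintptr_t map_min, uintptr_t map_max)
{
        uintptr_t user_addr_last = user_addr + nbytes;

        if (user_addr_last < user_addr)         /* wrap-around */
                return 0;
        if (user_addr < map_min)                /* below the current user map */
                return 0;
        if (user_addr_last > map_max)           /* beyond the current user map */
                return 0;
        return 1;
}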
index f710eb65f4ffa7ac69a4e1914416b7a4d8b28679..eb6bcdf4f4fd1626a8ce7892fc32d3e83f7be930 100644 (file)
@@ -62,8 +62,8 @@ lowglo lowGlo __attribute__ ((aligned(PAGE_MAX_SIZE))) = {
        .lgManualPktAddr = (uint32_t)&manual_pkt,
 #endif
        .lgPmapMemQ = (uint32_t)&(pmap_object_store.memq),
-       .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, phys_page),
-       .lgPmapMemChainOffset = offsetof(struct vm_page, listq),
+       .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, vmp_phys_page),
+       .lgPmapMemChainOffset = offsetof(struct vm_page, vmp_listq),
        .lgPmapMemPagesize = (uint32_t)sizeof(struct vm_page),
 
        .lgPmapMemStartAddr = -1,
index 64d795e70fd08e61cf3c421887102ae1489b3baf..d6584de3f332d0a6da2dc4981c6a75b15f2e128b 100644 (file)
@@ -44,7 +44,7 @@ extern void cpu_signal_handler_internal(boolean_t disable_signal);
 extern void cpu_doshutdown(void (*doshutdown)(processor_t), processor_t processor);
 
 extern void cpu_idle(void);
-extern void cpu_idle_exit(void) __attribute__((noreturn));
+extern void cpu_idle_exit(boolean_t from_reset) __attribute__((noreturn));
 extern void cpu_idle_tickle(void);
 
 extern void cpu_machine_idle_init(boolean_t from_boot);
index 3943f47d4c862bdcdeec3560db0041fee5865b67..ac54a0be51896b4a024f904322b0d0edad1fae24 100644 (file)
@@ -147,7 +147,7 @@ machine_do_mvfpid()
 #else
        cpuid_mvfp_info.neon = 1;
        cpuid_mvfp_info.neon_hpfp = 1;
-#endif
+#endif /* __arm__ */
 
 }
 
index 83e60754060eb41e3f964345e488b80e01b95412..0a6777ea588a1cf9000603397d1dc088cfebe4d3 100644 (file)
@@ -435,6 +435,12 @@ void ml_init_timebase(
        }
 }
 
+void
+fiq_context_bootstrap(boolean_t enable_fiq)
+{
+       fiq_context_init(enable_fiq);
+}
+
 void
 ml_parse_cpu_topology(void)
 {
@@ -593,7 +599,7 @@ ml_processor_register(
 #endif
 
        if (!is_boot_cpu)
-               prng_cpu_init(this_cpu_datap->cpu_number);
+               early_random_cpu_init(this_cpu_datap->cpu_number);
 
        return KERN_SUCCESS;
 
@@ -639,23 +645,6 @@ cause_ast_check(
        }
 }
 
-
-/*
- *     Routine:        ml_at_interrupt_context
- *     Function:       Check if running at interrupt context
- */
-boolean_t 
-ml_at_interrupt_context(void)
-{
-       boolean_t at_interrupt_context = FALSE;
-
-       disable_preemption();
-       at_interrupt_context = (getCpuDatap()->cpu_int_state != NULL);
-       enable_preemption();
-
-       return at_interrupt_context;
-}
-
 extern uint32_t cpu_idle_count;
 
 void ml_get_power_state(boolean_t *icp, boolean_t *pidlep) {
@@ -722,6 +711,19 @@ ml_static_vtop(
        return ((vm_address_t)(vaddr) - gVirtBase + gPhysBase);
 }
 
+vm_offset_t
+ml_static_slide(
+       vm_offset_t vaddr)
+{
+       return VM_KERNEL_SLIDE(vaddr);
+}
+
+vm_offset_t
+ml_static_unslide(
+       vm_offset_t vaddr)
+{
+       return VM_KERNEL_UNSLIDE(vaddr);
+}
 
 kern_return_t
 ml_static_protect(
@@ -963,20 +965,6 @@ machine_choose_processor(__unused processor_set_t pset, processor_t processor)
        return (processor);
 }
 
-vm_offset_t 
-ml_stack_remaining(void)
-{
-       uintptr_t local = (uintptr_t) &local;
-       vm_offset_t     intstack_top_ptr;
-
-       intstack_top_ptr = getCpuDatap()->intstack_top;
-       if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
-           return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE));
-       } else {
-           return (local - current_thread()->kernel_stack);
-       }
-}
-
 boolean_t machine_timeout_suspended(void) {
        return FALSE;
 }
@@ -1023,7 +1011,7 @@ ml_delay_should_spin(uint64_t interval)
 
 boolean_t ml_thread_is64bit(thread_t thread)
 {
-       return (thread_is_64bit(thread));
+       return (thread_is_64bit_addr(thread));
 }
 
 void ml_timer_evaluate(void) {
@@ -1151,8 +1139,3 @@ arm_user_protect_end(thread_t thread, uintptr_t ttbr0, boolean_t disable_interru
     }
 }
 #endif // __ARM_USER_PROTECT__
-
-void ml_task_set_rop_pid(__unused task_t task, __unused task_t parent_task, __unused boolean_t inherit)
-{
-       return;
-}
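A hedged usage sketch for the new ml_static_slide()/ml_static_unslide() wrappers, which let machine-independent code apply or strip the KASLR slide; the logging call site is hypothetical:

static void
log_unslid_address_sketch(vm_offset_t vaddr)
{
        /* strip the kernel slide before emitting an address externally */
        printf("symbol @ unslid 0x%lx\n", (unsigned long)ml_static_unslide(vaddr));
}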
index cee6477b1e391192b9fc095675b9b04af9bb65ea..4a7061b652e8bee53074fa5f0255e7eb76ad76e1 100644 (file)
@@ -131,6 +131,11 @@ typedef void (*platform_error_handler_t)(void *refcon, vm_offset_t fault_addr);
 typedef enum
 {
        EXCB_CLASS_ILLEGAL_INSTR_SET,
+#ifdef CONFIG_XNUPOST
+       EXCB_CLASS_TEST1,
+       EXCB_CLASS_TEST2,
+       EXCB_CLASS_TEST3,
+#endif
        EXCB_CLASS_MAX          // this must be last
 }
 ex_cb_class_t;
@@ -140,6 +145,9 @@ typedef enum
 {
        EXCB_ACTION_RERUN,      // re-run the faulting instruction
        EXCB_ACTION_NONE,       // continue normal exception handling
+#ifdef CONFIG_XNUPOST
+       EXCB_ACTION_TEST_FAIL,
+#endif
 }
 ex_cb_action_t;
 
@@ -289,6 +297,12 @@ vm_offset_t
 ml_static_ptovirt(
        vm_offset_t);
 
+vm_offset_t ml_static_slide(
+       vm_offset_t vaddr);
+
+vm_offset_t ml_static_unslide(
+       vm_offset_t vaddr);
+
 /* Offset required to obtain absolute time value from tick counter */
 uint64_t ml_get_abstime_offset(void);
 
@@ -527,10 +541,15 @@ vm_offset_t ml_stack_remaining(void);
 uint32_t       get_fpscr(void);
 void           set_fpscr(uint32_t);
 
+#ifdef __arm64__
+unsigned long update_mdscr(unsigned long clear, unsigned long set);
+#endif /* __arm64__ */
+
 extern void            init_vfp(void);
 extern boolean_t       get_vfp_enabled(void);
 extern void            arm_debug_set_cp14(arm_debug_state_t *debug_state);
 extern void            fiq_context_init(boolean_t enable_fiq);
+extern void            fiq_context_bootstrap(boolean_t enable_fiq);
 
 extern void            reenable_async_aborts(void);
 extern void            cpu_idle_wfi(boolean_t wfi_fast);
@@ -849,6 +868,8 @@ extern void sched_perfcontrol_register_callbacks(sched_perfcontrol_callbacks_t c
 
 extern void sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores);
 extern void sched_perfcontrol_thread_group_recommend(void *data, cluster_type_t recommendation);
+extern void sched_override_recommended_cores_for_sleep(void);
+extern void sched_restore_recommended_cores_after_sleep(void);
 
 /*
  * Update the deadline after which sched_perfcontrol_deadline_passed will be called.
@@ -890,7 +911,6 @@ void ml_get_power_state(boolean_t *, boolean_t *);
 boolean_t user_cont_hwclock_allowed(void);
 boolean_t user_timebase_allowed(void);
 boolean_t ml_thread_is64bit(thread_t thread);
-void ml_task_set_rop_pid(task_t task, task_t parent_task, boolean_t inherit);
 
 #ifdef __arm64__
 void ml_set_align_checking(void);
index d0b0a6b96c82e2440e90e7ab4549615799c7d57c..d175af88d1062c46537638d97c89c3a2cc31fb39 100644 (file)
@@ -123,8 +123,8 @@ LEXT(timer_grab)
        bx              lr
 
        .align  2
-       .globl  EXT(timer_update)
-LEXT(timer_update)
+       .globl  EXT(timer_advance_internal_32)
+LEXT(timer_advance_internal_32)
        str             r1, [r0, TIMER_HIGHCHK]
 #if    __ARM_SMP__
        dmb             ish                                                                     // dmb ish
@@ -188,39 +188,106 @@ LEXT(OSSynchronizeIO)
        dsb
        bx              lr
 
+.macro SYNC_TLB_FLUSH
+       dsb     ish
+       isb
+.endmacro
+
 /*
- *     void flush_mmu_tlb(void)
+ *     void sync_tlb_flush
  *
- *             Flush all TLBs
+ *             Synchronize one or more prior TLB flush operations
  */
        .text
        .align 2
-       .globl EXT(flush_mmu_tlb)
-LEXT(flush_mmu_tlb)
+       .globl EXT(sync_tlb_flush)
+LEXT(sync_tlb_flush)
+       SYNC_TLB_FLUSH
+       bx      lr
+
+.macro FLUSH_MMU_TLB
        mov     r0, #0
 #if    __ARM_SMP__
        mcr     p15, 0, r0, c8, c3, 0                           // Invalidate Inner Shareable entire TLBs
 #else
        mcr     p15, 0, r0, c8, c7, 0                           // Invalidate entire TLB
 #endif
-       dsb             ish
-       isb
-       bx              lr
+.endmacro
+
+/*
+ *     void flush_mmu_tlb_async(void)
+ *
+ *             Flush all TLBs, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_async)
+LEXT(flush_mmu_tlb_async)
+       FLUSH_MMU_TLB
+       bx      lr
+
+/*
+ *     void flush_mmu_tlb(void)
+ *
+ *             Flush all TLBs
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb)
+LEXT(flush_mmu_tlb)
+       FLUSH_MMU_TLB
+       SYNC_TLB_FLUSH
+       bx      lr
+
+.macro FLUSH_CORE_TLB
+       mov     r0, #0
+       mcr     p15, 0, r0, c8, c7, 0                           // Invalidate entire TLB
+.endmacro
+
+/*
+ *
+ *     void flush_core_tlb_async(void)
+ *
+ *             Flush local core's TLB, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_core_tlb_async)
+LEXT(flush_core_tlb_async)
+       FLUSH_CORE_TLB
+       bx      lr
 
 /*
  *     void flush_core_tlb(void)
  *
- *             Flush core TLB
+ *             Flush local core's TLB
  */
        .text
        .align 2
        .globl EXT(flush_core_tlb)
 LEXT(flush_core_tlb)
-       mov     r0, #0
-       mcr     p15, 0, r0, c8, c7, 0                           // Invalidate entire TLB
-       dsb             ish
-       isb
-       bx              lr
+       FLUSH_CORE_TLB
+       SYNC_TLB_FLUSH
+       bx      lr
+
+.macro FLUSH_MMU_TLB_ENTRY
+#if    __ARM_SMP__
+       mcr     p15, 0, r0, c8, c3, 1                           // Invalidate TLB Inner Shareable entry
+#else
+       mcr     p15, 0, r0, c8, c7, 1                           // Invalidate TLB entry
+#endif
+.endmacro
+/*
+ *     void flush_mmu_tlb_entry_async(uint32_t)
+ *
+ *             Flush TLB entry, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_entry_async)
+LEXT(flush_mmu_tlb_entry_async)
+       FLUSH_MMU_TLB_ENTRY
+       bx      lr
 
 /*
  *     void flush_mmu_tlb_entry(uint32_t)
@@ -231,40 +298,70 @@ LEXT(flush_core_tlb)
        .align 2
        .globl EXT(flush_mmu_tlb_entry)
 LEXT(flush_mmu_tlb_entry)
+       FLUSH_MMU_TLB_ENTRY
+       SYNC_TLB_FLUSH
+       bx      lr
+
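The async/sync split above lets callers batch several invalidations behind a single dsb/isb. A hedged C-level sketch of that pattern using the routines introduced in this file (the ranged call site is hypothetical; flush_mmu_tlb_entries_async() below already covers this case, the loop is only to make the pattern explicit):

static void
flush_range_sketch(uint32_t va_start, uint32_t va_end)
{
        for (uint32_t va = va_start; va < va_end; va += ARM_PGBYTES)
                flush_mmu_tlb_entry_async(va);  /* queue invalidations */
        sync_tlb_flush();                       /* one barrier for the whole batch */
}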
+.macro FLUSH_MMU_TLB_ENTRIES
+1:
 #if    __ARM_SMP__
-       mcr     p15, 0, r0, c8, c3, 1                           // Invalidate TLB  Inner Shareableentry
+       mcr     p15, 0, r0, c8, c3, 1                           // Invalidate TLB Inner Shareable entry 
 #else
        mcr     p15, 0, r0, c8, c7, 1                           // Invalidate TLB entry
 #endif
-       dsb             ish
-       isb
-       bx              lr
+       add     r0, r0, ARM_PGBYTES                             // Increment to the next page
+       cmp     r0, r1                                          // Loop if current address < end address
+       blt     1b
+.endmacro
+
+/*
+ *     void flush_mmu_tlb_entries_async(uint32_t, uint32_t)
+ *
+ *             Flush TLB entries for address range, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_entries_async)
+LEXT(flush_mmu_tlb_entries_async)
+       FLUSH_MMU_TLB_ENTRIES
+       bx      lr
 
 /*
  *     void flush_mmu_tlb_entries(uint32_t, uint32_t)
  *
- *             Flush TLB entries
+ *             Flush TLB entries for address range
  */
        .text
        .align 2
        .globl EXT(flush_mmu_tlb_entries)
 LEXT(flush_mmu_tlb_entries)
-1:
+       FLUSH_MMU_TLB_ENTRIES
+       SYNC_TLB_FLUSH
+       bx      lr
+
+
+.macro FLUSH_MMU_TLB_MVA_ENTRIES
 #if    __ARM_SMP__
-       mcr     p15, 0, r0, c8, c3, 1                           // Invalidate TLB Inner Shareable entry 
+       mcr     p15, 0, r0, c8, c3, 3                           // Invalidate TLB Inner Shareable entries by mva
 #else
-       mcr     p15, 0, r0, c8, c7, 1                           // Invalidate TLB entry
+       mcr     p15, 0, r0, c8, c7, 3                           // Invalidate TLB entries by mva
 #endif
-       add             r0, r0, ARM_PGBYTES                                     // Increment to the next page
-       cmp             r0, r1                                                          // Loop if current address < end address
-       blt             1b
-       dsb             ish                                                                     // Synchronize
-       isb
-       bx              lr
+.endmacro
 
+/*
+ *     void flush_mmu_tlb_mva_entries_async(uint32_t)
+ *
+ *             Flush TLB entries for mva, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_mva_entries_async)
+LEXT(flush_mmu_tlb_mva_entries_async)
+       FLUSH_MMU_TLB_MVA_ENTRIES
+       bx      lr
 
 /*
- *     void flush_mmu_tlb_mva_entries(uint32_t)
+ *     void flush_mmu_tlb_mva_entries(uint32_t)
  *
  *             Flush TLB entries for mva
  */
@@ -272,46 +369,71 @@ LEXT(flush_mmu_tlb_entries)
        .align 2
        .globl EXT(flush_mmu_tlb_mva_entries)
 LEXT(flush_mmu_tlb_mva_entries)
+       FLUSH_MMU_TLB_MVA_ENTRIES
+       SYNC_TLB_FLUSH
+       bx      lr
+
+.macro FLUSH_MMU_TLB_ASID
 #if    __ARM_SMP__
-       mcr     p15, 0, r0, c8, c3, 3                           // Invalidate TLB Inner Shareable entries by mva
+       mcr     p15, 0, r0, c8, c3, 2                           // Invalidate TLB Inner Shareable entries by asid
 #else
-       mcr     p15, 0, r0, c8, c7, 3                           // Invalidate TLB Inner Shareable entries by mva
+       mcr     p15, 0, r0, c8, c7, 2                           // Invalidate TLB entries by asid
 #endif
-       dsb             ish
-       isb
-       bx              lr
+.endmacro
+
+/*
+ *     void flush_mmu_tlb_asid_async(uint32_t)
+ *
+ *             Flush TLB entries for asid, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_asid_async)
+LEXT(flush_mmu_tlb_asid_async)
+       FLUSH_MMU_TLB_ASID
+       bx      lr
 
 /*
  *     void flush_mmu_tlb_asid(uint32_t)
  *
- *             Flush TLB entriesfor requested asid
+ *             Flush TLB entries for asid
  */
        .text
        .align 2
        .globl EXT(flush_mmu_tlb_asid)
 LEXT(flush_mmu_tlb_asid)
-#if    __ARM_SMP__
-       mcr     p15, 0, r0, c8, c3, 2                           // Invalidate TLB Inner Shareable entries by asid
-#else
+       FLUSH_MMU_TLB_ASID
+       SYNC_TLB_FLUSH
+       bx      lr
+
+.macro FLUSH_CORE_TLB_ASID
        mcr     p15, 0, r0, c8, c7, 2                           // Invalidate TLB entries by asid
-#endif
-       dsb             ish
-       isb
-       bx              lr
+.endmacro
+
+/*
+ *     void flush_core_tlb_asid_async(uint32_t)
+ *
+ *             Flush local core TLB entries for asid, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_core_tlb_asid_async)
+LEXT(flush_core_tlb_asid_async)
+       FLUSH_CORE_TLB_ASID
+       bx      lr
 
 /*
  *     void flush_core_tlb_asid(uint32_t)
  *
- *             Flush TLB entries for core for requested asid
+ *             Flush local core TLB entries for asid
  */
        .text
        .align 2
        .globl EXT(flush_core_tlb_asid)
 LEXT(flush_core_tlb_asid)
-       mcr     p15, 0, r0, c8, c7, 2                           // Invalidate TLB entries by asid
-       dsb             ish
-       isb
-       bx              lr
+       FLUSH_CORE_TLB_ASID
+       SYNC_TLB_FLUSH
+       bx      lr
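
Editor's note: the hunks above split each TLB maintenance routine into a shared invalidate macro, an *_async entry point that issues only the CP15 invalidate, and a synchronous entry point that appends SYNC_TLB_FLUSH, which replaces the dsb ish / isb pairs removed from the individual routines. A minimal C sketch of how a caller could batch invalidations under this scheme follows; the sync_tlb_flush() prototype and the two-ASID scenario are assumptions for illustration, not lines from this commit.

        /* Sketch only: batch two ASID invalidates, then pay for one barrier. */
        extern void flush_mmu_tlb_asid_async(uint32_t val);    /* per the asm entry point above   */
        extern void sync_tlb_flush(void);                      /* assumed dsb ish + isb wrapper   */

        static void
        flush_two_asids(uint32_t asid_a, uint32_t asid_b)
        {
                flush_mmu_tlb_asid_async(asid_a);       /* TLBIASID(IS), no wait            */
                flush_mmu_tlb_asid_async(asid_b);       /* second invalidate, still no wait */
                sync_tlb_flush();                       /* one synchronization for both     */
        }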
 
 /*
  *     Set MMU Translation Table Base
@@ -499,22 +621,7 @@ LEXT(set_context_id)
        isb
        bx              lr
 
-#define COPYIO_HEADER(rUser, kLabel)                                   \
-       /* test for zero len */                                         ;\
-       cmp             r2, #0                                          ;\
-       moveq           r0, #0                                          ;\
-       bxeq            lr                                              ;\
-       /* test user_addr, user_addr+len to see if it's in kernel space */              ;\
-       add             r12, rUser, r2                                  ;\
-       cmp             r12, KERNELBASE                                 ;\
-       bhs             kLabel                                          ;\
-       cmp             r12, rUser                                      ;\
-       bcc             kLabel
-
-#define        COPYIO_VALIDATE(NAME, SIZE)                                     \
-       /* branch around for small sizes */                             ;\
-       cmp             r2, #(SIZE)                                     ;\
-       bls             L##NAME##_validate_done                         ;\
+#define        COPYIO_VALIDATE(NAME)                                           \
        /* call NAME_validate to check the arguments */                 ;\
        push            {r0, r1, r2, r7, lr}                            ;\
        add             r7, sp, #12                                     ;\
@@ -523,7 +630,6 @@ LEXT(set_context_id)
        addne           sp, #12                                         ;\
        popne           {r7, pc}                                        ;\
        pop             {r0, r1, r2, r7, lr}                            ;\
-L##NAME##_validate_done:
 
 #define        COPYIO_SET_RECOVER()                                            \
        /* set recovery address */                                      ;\
@@ -533,6 +639,15 @@ L##NAME##_validate_done:
        ldr             r4, [r12, TH_RECOVER]                           ;\
        str             r3, [r12, TH_RECOVER]
 
+#define COPYIO_TRY_KERNEL()                                                            \
+       /* if (current_thread()->map->pmap == kernel_pmap) copyio_kernel() */           ;\
+       mrc             p15, 0, r12, c13, c0, 4                 // Read TPIDRPRW        ;\
+       ldr             r3, [r12, ACT_MAP]                                              ;\
+       ldr             r3, [r3, MAP_PMAP]                                              ;\
+       LOAD_ADDR(ip, kernel_pmap_store)                                                ;\
+       cmp             r3, ip                                                          ;\
+       beq             copyio_kern_body
+
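Editor's note: the old copyio_kernel tail checked the pmap only after a kernel-range address had been detected, returning EFAULT otherwise; COPYIO_TRY_KERNEL now tests the pmap up front and branches straight to copyio_kern_body when the current thread runs on the kernel pmap. A hedged C rendering of the new prologue, with hypothetical helper names standing in for the asm labels:

        /* Illustrative C rendering of COPYIO_HEADER + COPYIO_TRY_KERNEL (sketch only). */
        extern int copyio_kern_body(user_addr_t, char *, vm_size_t);   /* hypothetical stand-ins  */
        extern int copyin_user_body(user_addr_t, char *, vm_size_t);   /* for the asm code paths  */

        static int
        copyin_sketch(user_addr_t src, char *dst, vm_size_t nbytes)
        {
                if (nbytes == 0)
                        return 0;                               /* COPYIO_HEADER: zero length is a no-op */
                if (current_thread()->map->pmap == kernel_pmap)
                        return copyio_kern_body(src, dst, nbytes);      /* COPYIO_TRY_KERNEL */
                /* else: COPYIO_SET_RECOVER + COPYIO_MAP_USER + COPYIO_BODY */
                return copyin_user_body(src, dst, nbytes);
        }
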
 #if __ARM_USER_PROTECT__
 #define        COPYIO_MAP_USER()                                       \
        /* disable interrupts to prevent expansion to 2GB at L1 ;\
@@ -549,7 +664,7 @@ L##NAME##_validate_done:
 #define        COPYIO_MAP_USER()
 #endif
 
-#define COPYIO_HEADER_KERN()                                           ;\
+#define COPYIO_HEADER()                                                        ;\
        /* test for zero len */                                         ;\
        cmp             r2, #0                                          ;\
        moveq           r0, #0                                          ;\
@@ -615,22 +730,21 @@ L$0_noerror:
        .align 2
        .globl EXT(copyinstr)
 LEXT(copyinstr)
+       cmp             r2, #0
+       moveq           r0, #ENAMETOOLONG
+       moveq           r12, #0
+       streq           r12, [r3]
+       bxeq            lr
+       COPYIO_VALIDATE(copyin)
        stmfd   sp!, { r4, r5, r6 }
        
        mov             r6, r3
-       add             r3, r0, r2                                              // user_addr + max
-       cmp             r3, KERNELBASE                                  // Check KERNELBASE < user_addr + max
-       bhs             copyinstr_param_error                   // Drop out if it is
-       cmp             r3, r0                                                  // Check we're copying from user space
-       bcc             copyinstr_param_error                   // Drop out if we aren't
        adr             r3, copyinstr_error                     // Get address for recover
        mrc             p15, 0, r12, c13, c0, 4                 // Read TPIDRPRW
        ldr             r4, [r12, TH_RECOVER]                           ;\
        str             r3, [r12, TH_RECOVER]
        COPYIO_MAP_USER()
        mov             r12, #0                                                 // Number of bytes copied so far
-       cmp             r2, #0
-       beq             copyinstr_too_long
 copyinstr_loop:
        ldrb            r3, [r0], #1                                    // Load a byte from the source (user)
        strb            r3, [r1], #1                                    // Store a byte to the destination (kernel)
@@ -647,16 +761,15 @@ copyinstr_too_long:
 copyinstr_done:
 //
 // When we get here, we have finished copying the string.  We came here from
-// either the "beq copyinstr_done" above, in which case r4 == 0 (which is also
+// either the "beq copyinstr_done" above, in which case r3 == 0 (which is also
 // the function result for success), or falling through from copyinstr_too_long,
-// in which case r4 == ENAMETOOLONG.
+// in which case r3 == ENAMETOOLONG.
 //
        str             r12, [r6]                                               // Save the count for actual
        mov             r0, r3                                                  // Return error code from r3
 copyinstr_exit:
        COPYIO_UNMAP_USER()
        str             r4, [r12, TH_RECOVER]
-copyinstr_exit2:
        ldmfd   sp!, { r4, r5, r6 }
        bx              lr
 
@@ -665,11 +778,6 @@ copyinstr_error:
        mov             r0, #EFAULT
        b               copyinstr_exit
 
-copyinstr_param_error:
-       /* set error, exit routine */
-       mov             r0, #EFAULT
-       b               copyinstr_exit2
-
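
Editor's note: with the parameter checks folded into COPYIO_VALIDATE, copyinstr now fails a zero-length buffer up front with ENAMETOOLONG and a zero byte count, and the old copyinstr_param_error path is gone; EFAULT is reported only through the fault-recovery handler. A caller sketch, assuming the usual copyinstr prototype and a caller-supplied user_path address:

        /* Sketch: copy a NUL-terminated user string into a fixed kernel buffer. */
        static int
        fetch_user_path(user_addr_t user_path, char kpath[MAXPATHLEN])
        {
                size_t copied = 0;
                int err = copyinstr(user_path, kpath, MAXPATHLEN, &copied);

                if (err == ENAMETOOLONG) {
                        /* buffer (or a zero-length max) could not hold the string plus NUL */
                } else if (err == EFAULT) {
                        /* faulted on an unmapped or non-user address */
                }
                return err;
        }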
 /*
  * int copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
  */
@@ -677,8 +785,9 @@ copyinstr_param_error:
        .align 2
        .globl EXT(copyin)
 LEXT(copyin)
-       COPYIO_HEADER(r0,copyio_kernel)
-       COPYIO_VALIDATE(copyin,4096)
+       COPYIO_HEADER()
+       COPYIO_VALIDATE(copyin)
+       COPYIO_TRY_KERNEL()
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
        COPYIO_BODY copyin
@@ -693,8 +802,9 @@ LEXT(copyin)
        .align 2
        .globl EXT(copyout)
 LEXT(copyout)
-       COPYIO_HEADER(r1,copyio_kernel)
-       COPYIO_VALIDATE(copyout,4096)
+       COPYIO_HEADER()
+       COPYIO_VALIDATE(copyout)
+       COPYIO_TRY_KERNEL()
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
        COPYIO_BODY copyout
@@ -717,7 +827,7 @@ LEXT(copyin_word)
        tst             r0, r3                  // Test alignment of user address
        bne             L_copyin_invalid
 
-       COPYIO_HEADER(r0,L_copyin_word_fault)
+       COPYIO_VALIDATE(copyin)
        COPYIO_SET_RECOVER()
        COPYIO_MAP_USER()
 
@@ -734,9 +844,6 @@ LEXT(copyin_word)
 L_copyin_invalid:
        mov             r0, #EINVAL
        bx              lr
-L_copyin_word_fault:
-       mov             r0, #EFAULT
-       bx              lr
 
 
 copyio_error:
@@ -753,8 +860,8 @@ copyio_error:
        .align 2
        .globl EXT(copyin_kern)
 LEXT(copyin_kern)
-       COPYIO_HEADER_KERN()
-       b               bypass_check
+       COPYIO_HEADER()
+       b               copyio_kern_body
 
 /*
  *  int copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
@@ -763,23 +870,10 @@ LEXT(copyin_kern)
        .align 2
        .globl EXT(copyout_kern)
 LEXT(copyout_kern)
-       COPYIO_HEADER_KERN()
-       b               bypass_check
-
-copyio_kernel_error:
-       mov             r0, #EFAULT
-       bx              lr
-
-copyio_kernel:
-       /* if (current_thread()->map->pmap != kernel_pmap) return EFAULT */
-       mrc             p15, 0, r12, c13, c0, 4                 // Read TPIDRPRW
-       ldr             r3, [r12, ACT_MAP]
-       ldr             r3, [r3, MAP_PMAP]
-       LOAD_ADDR(ip, kernel_pmap_store)
-       cmp             r3, ip
-       bne             copyio_kernel_error
+       COPYIO_HEADER()
+       b               copyio_kern_body
 
-bypass_check:
+copyio_kern_body:
        stmfd   sp!, { r5, r6 }
        COPYIO_BODY copyio_kernel
        ldmfd   sp!, { r5, r6 }
index 9afa6a74f335a7c473be42384852a358fcb32bb6..0c6da73f7bcad1b5cead3275dcec76fbc386406c 100644 (file)
@@ -40,6 +40,7 @@
 #include <kern/thread_group.h>
 #include <kern/policy_internal.h>
 #include <machine/config.h>
+#include <pexpert/pexpert.h>
 
 #if MONOTONIC
 #include <kern/monotonic.h>
@@ -361,7 +362,7 @@ machine_thread_going_on_core(thread_t   new_thread,
        on_core.energy_estimate_nj = 0;
        on_core.qos_class = proc_get_effective_thread_policy(new_thread, TASK_POLICY_QOS);
        on_core.urgency = urgency;
-       on_core.is_32_bit = thread_is_64bit(new_thread) ? FALSE : TRUE;
+       on_core.is_32_bit = thread_is_64bit_data(new_thread) ? FALSE : TRUE;
        on_core.is_kernel_thread = new_thread->task == kernel_task;
        on_core.scheduling_latency = sched_latency;
        on_core.start_time = timestamp;
@@ -467,7 +468,7 @@ machine_perfcontrol_deadline_passed(uint64_t deadline)
 void
 ml_spin_debug_reset(thread_t thread)
 {
-    thread->machine.intmask_timestamp = mach_absolute_time();
+       thread->machine.intmask_timestamp = mach_absolute_time();
 }
 
 /*
@@ -478,7 +479,7 @@ ml_spin_debug_reset(thread_t thread)
 void
 ml_spin_debug_clear(thread_t thread)
 {
-    thread->machine.intmask_timestamp = 0;
+       thread->machine.intmask_timestamp = 0;
 }
 
 /*
@@ -495,28 +496,28 @@ ml_spin_debug_clear_self()
 void
 ml_check_interrupts_disabled_duration(thread_t thread)
 {
-    uint64_t start;
-    uint64_t now;
+       uint64_t start;
+       uint64_t now;
 
-    start = thread->machine.intmask_timestamp;
-    if (start != 0) {
-        now = mach_absolute_time();
+       start = thread->machine.intmask_timestamp;
+       if (start != 0) {
+               now = mach_absolute_time();
 
-        if ((now - start) > interrupt_masked_timeout) {
-            mach_timebase_info_data_t timebase;
-            clock_timebase_info(&timebase);
+               if ((now - start) > interrupt_masked_timeout * debug_cpu_performance_degradation_factor) {
+                       mach_timebase_info_data_t timebase;
+                       clock_timebase_info(&timebase);
 
 #ifndef KASAN
-            /*
-             * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the
-             * mechanism enabled so that KASAN can catch any bugs in the mechanism itself.
-             */
-            panic("Interrupts held disabled for %llu nanoseconds", (((now - start) * timebase.numer)/timebase.denom));
+                       /*
+                       * Disable the actual panic for KASAN due to the overhead of KASAN itself, leave the rest of the
+                       * mechanism enabled so that KASAN can catch any bugs in the mechanism itself.
+                       */
+                       panic("Interrupts held disabled for %llu nanoseconds", (((now - start) * timebase.numer)/timebase.denom));
 #endif
-        }
-    }
+               }
+       }
 
-    return;
+       return;
 }
 #endif // INTERRUPT_MASKED_DEBUG
 
@@ -524,84 +525,121 @@ ml_check_interrupts_disabled_duration(thread_t thread)
 boolean_t
 ml_set_interrupts_enabled(boolean_t enable)
 {
-    thread_t   thread;
-    uint64_t   state;
+       thread_t        thread;
+       uint64_t        state;
 
 #if __arm__
 #define INTERRUPT_MASK PSR_IRQF
-    state = __builtin_arm_rsr("cpsr");
+       state = __builtin_arm_rsr("cpsr");
 #else
 #define INTERRUPT_MASK DAIF_IRQF
-    state = __builtin_arm_rsr("DAIF");
+       state = __builtin_arm_rsr("DAIF");
 #endif
-    if (enable) {
+       if (enable && (state & INTERRUPT_MASK)) {
 #if INTERRUPT_MASKED_DEBUG
-        if (interrupt_masked_debug && (state & INTERRUPT_MASK)) {
-            // Interrupts are currently masked, we will enable them (after finishing this check)
-            thread = current_thread();
-            ml_check_interrupts_disabled_duration(thread);
-            thread->machine.intmask_timestamp = 0;
-        }
+               if (interrupt_masked_debug) {
+                       // Interrupts are currently masked, we will enable them (after finishing this check)
+                       thread = current_thread();
+                       ml_check_interrupts_disabled_duration(thread);
+                       thread->machine.intmask_timestamp = 0;
+               }
 #endif // INTERRUPT_MASKED_DEBUG
-        if (get_preemption_level() == 0) {
-            thread = current_thread();
-            while (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
+               if (get_preemption_level() == 0) {
+                       thread = current_thread();
+                       while (thread->machine.CpuDatap->cpu_pending_ast & AST_URGENT) {
 #if __ARM_USER_PROTECT__
-                uintptr_t up = arm_user_protect_begin(thread);
+                               uintptr_t up = arm_user_protect_begin(thread);
 #endif
-                ast_taken_kernel();
+                               ast_taken_kernel();
 #if __ARM_USER_PROTECT__
-                arm_user_protect_end(thread, up, FALSE);
+                               arm_user_protect_end(thread, up, FALSE);
 #endif
-            }
-        }
+                       }
+               }
 #if __arm__
-        __asm__ volatile ("cpsie if" ::: "memory"); // Enable IRQ FIQ
+               __asm__ volatile ("cpsie if" ::: "memory"); // Enable IRQ FIQ
 #else
-        __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
+               __builtin_arm_wsr("DAIFClr", (DAIFSC_IRQF | DAIFSC_FIQF));
 #endif
-    } else {
+       } else if (!enable && ((state & INTERRUPT_MASK) == 0)) {
 #if __arm__
-        __asm__ volatile ("cpsid if" ::: "memory"); // Mask IRQ FIQ
+               __asm__ volatile ("cpsid if" ::: "memory"); // Mask IRQ FIQ
 #else
-        __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF));
+               __builtin_arm_wsr("DAIFSet", (DAIFSC_IRQF | DAIFSC_FIQF));
 #endif
 #if INTERRUPT_MASKED_DEBUG
-        if (interrupt_masked_debug && ((state & INTERRUPT_MASK) == 0)) {
-            // Interrupts were enabled, we just masked them
-            current_thread()->machine.intmask_timestamp = mach_absolute_time();
-        }
+               if (interrupt_masked_debug) {
+                       // Interrupts were enabled, we just masked them
+                       current_thread()->machine.intmask_timestamp = mach_absolute_time();
+               }
 #endif
-    }
-    return ((state & INTERRUPT_MASK) == 0);
+       }
+       return ((state & INTERRUPT_MASK) == 0);
+}
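
Editor's note: the rewritten ml_set_interrupts_enabled only touches CPSR/DAIF when the requested state differs from the current one, but its return value is unchanged: the previously-enabled state. The usual save/restore pattern therefore still nests correctly; a small sketch, not taken from this file:

        static void
        critical_section_sketch(void)
        {
                boolean_t istate = ml_set_interrupts_enabled(FALSE);    /* mask IRQ/FIQ, remember prior state */
                /* ... short critical section ... */
                (void) ml_set_interrupts_enabled(istate);               /* restore whatever the caller had    */
        }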
+
+/*
+ *     Routine:        ml_at_interrupt_context
+ *     Function:       Check if running at interrupt context
+ */
+boolean_t
+ml_at_interrupt_context(void)
+{
+       /* Do not use a stack-based check here, as the top-level exception handler
+        * is free to use some other stack besides the per-CPU interrupt stack.
+        * Interrupts should always be disabled if we're at interrupt context.
+        * Check that first, as we may be in a preemptible non-interrupt context, in
+        * which case we could be migrated to a different CPU between obtaining
+        * the per-cpu data pointer and loading cpu_int_state.  We then might end
+        * up checking the interrupt state of a different CPU, resulting in a false
+        * positive.  But if interrupts are disabled, we also know we cannot be
+        * preempted. */
+       return (!ml_get_interrupts_enabled() && (getCpuDatap()->cpu_int_state != NULL));
+}
+
+vm_offset_t 
+ml_stack_remaining(void)
+{
+       uintptr_t local = (uintptr_t) &local;
+       vm_offset_t     intstack_top_ptr;
+
+       /* Since this is a stack-based check, we don't need to worry about
+        * preemption as we do in ml_at_interrupt_context().  If we are preemptible,
+        * then the sp should never be within any CPU's interrupt stack unless
+        * something has gone horribly wrong. */
+       intstack_top_ptr = getCpuDatap()->intstack_top;
+       if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
+               return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE));
+       } else {
+               return (local - current_thread()->kernel_stack);
+       }
 }
 
 static boolean_t ml_quiescing;
 
 void ml_set_is_quiescing(boolean_t quiescing)
 {
-    assert(FALSE == ml_get_interrupts_enabled());
-    ml_quiescing = quiescing;
+       assert(FALSE == ml_get_interrupts_enabled());
+       ml_quiescing = quiescing;
 }
 
 boolean_t ml_is_quiescing(void)
 {
-    assert(FALSE == ml_get_interrupts_enabled());
-    return (ml_quiescing);
+       assert(FALSE == ml_get_interrupts_enabled());
+       return (ml_quiescing);
 }
 
 uint64_t ml_get_booter_memory_size(void)
 {
-    enum { kRoundSize = 512*1024*1024ULL };
        uint64_t size;
+       uint64_t roundsize = 512*1024*1024ULL;
        size = BootArgs->memSizeActual;
-    if (!size)
-    {
+       if (!size) {
                size  = BootArgs->memSize;
-               size  = (size + kRoundSize - 1) & ~(kRoundSize - 1);
+               if (size < (2 * roundsize)) roundsize >>= 1;
+               size  = (size + roundsize - 1) & ~(roundsize - 1);
                size -= BootArgs->memSize;
-    }
-    return (size);
+       }
+       return (size);
 }
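
Editor's note: the rounding granule now drops from 512 MB to 256 MB when the reported memSize is below 1 GB, which keeps the estimated booter carve-out from being grossly overstated on small-memory configurations. A worked example; the numbers are illustrative, not from a real device tree:

        /* memSize = 736 MB, memSizeActual = 0:
         *   old: round up to 1024 MB           -> size = 1024 - 736 = 288 MB
         *   new: 736 MB < 2 * 512 MB, so roundsize = 256 MB
         *        round up to 768 MB            -> size =  768 - 736 =  32 MB
         */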
 
 uint64_t
index 416bcb2a349477f7eda0a4deaf87a0ebde5ab125..a36edabcb46456376da61c1a3d9681df2c5ccd0a 100644 (file)
@@ -56,7 +56,7 @@ extern void Load_context(thread_t);
 extern void Idle_load_context(void) __attribute__((noreturn));
 extern thread_t Switch_context(thread_t, thread_continue_t, thread_t);
 extern thread_t Shutdown_context(void (*doshutdown)(processor_t), processor_t  processor);
-extern void Call_continuation(thread_continue_t, void *, wait_result_t, vm_offset_t);
+extern void Call_continuation(thread_continue_t, void *, wait_result_t, boolean_t enable_interrupts);
 
 extern void DebuggerCall(unsigned int reason, void *ctx);
 extern void DebuggerXCall(void *ctx);
@@ -82,8 +82,6 @@ extern boolean_t debug_state_is_valid32(arm_debug_state32_t *ds);
 extern boolean_t debug_state_is_valid64(arm_debug_state64_t *ds);
 
 extern int copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes);
-extern int _emulate_swp(user_addr_t addr, uint32_t newval, uint32_t *oldval);
-extern int _emulate_swpb(user_addr_t addr, uint8_t newval, uint32_t *oldval);
 
 /* Top-Byte-Ignore */
 extern boolean_t user_tbi;
index 1ae4b3c5586c9fd8f75809adff1335808ca22fe6..49503dfeba5de978dd165fee4cc50cda23870108 100644 (file)
@@ -40,6 +40,7 @@
 #include <pexpert/boot.h>
 #include <pexpert/pexpert.h>
 
+
 #include <kern/misc_protos.h>
 #include <kern/startup.h>
 #include <kern/clock.h>
@@ -57,6 +58,7 @@
 #include <kdp/kdp_dyld.h>
 #include <kdp/kdp_internal.h>
 #include <uuid/uuid.h>
+#include <sys/codesign.h>
 #include <sys/time.h>
 
 #include <IOKit/IOPlatformExpert.h>
@@ -92,7 +94,7 @@ extern int            kdp_stack_snapshot_bytes_traced(void);
  * Increment the PANICLOG_VERSION if you change the format of the panic
  * log in any way.
  */
-#define PANICLOG_VERSION 9
+#define PANICLOG_VERSION 11
 static struct kcdata_descriptor kc_panic_data;
 
 extern char                 firmware_version[];
@@ -123,6 +125,7 @@ decl_simple_lock_data(extern,clock_lock)
 extern struct timeval   gIOLastSleepTime;
 extern struct timeval   gIOLastWakeTime;
 extern boolean_t                is_clock_configured;
+extern boolean_t kernelcache_uuid_valid;
 extern uuid_t kernelcache_uuid;
 
 /* Definitions for frame pointers */
@@ -152,6 +155,8 @@ unsigned int          DebugContextCount = 0;
 uint8_t PE_smc_stashed_x86_system_state = 0xFF;
 uint8_t PE_smc_stashed_x86_power_state = 0xFF;
 uint8_t PE_smc_stashed_x86_efi_boot_state = 0xFF;
+uint8_t PE_smc_stashed_x86_shutdown_cause = 0xFF;
+uint64_t PE_smc_stashed_x86_prev_power_transitions = UINT64_MAX;
 uint32_t PE_pcie_stashed_link_state = UINT32_MAX;
 #endif
 
@@ -327,24 +332,21 @@ do_print_all_backtraces(
                if (last_hwaccess_thread) {
                        paniclog_append_noflush("AppleHWAccess Thread: 0x%llx\n", last_hwaccess_thread);
                }
-#if defined(XNU_TARGET_OS_BRIDGE)
-               paniclog_append_noflush("PCIeUp link state: ");
-               if (PE_pcie_stashed_link_state != UINT32_MAX) {
-                       paniclog_append_noflush("0x%x\n", PE_pcie_stashed_link_state);
-               } else {
-                       paniclog_append_noflush("not available\n");
-               }
-#endif
+               paniclog_append_noflush("Boot args: %s\n", PE_boot_args());
        }
        paniclog_append_noflush("Memory ID: 0x%x\n", gPlatformMemoryID);
        paniclog_append_noflush("OS version: %.256s\n",
                        ('\0' != osversion[0]) ? osversion : "Not set yet");
        paniclog_append_noflush("Kernel version: %.512s\n", version);
-       paniclog_append_noflush("KernelCache UUID: ");
-       for (index = 0; index < sizeof(uuid_t); index++) {
-               paniclog_append_noflush("%02X", kernelcache_uuid[index]);
+
+       if (kernelcache_uuid_valid) {
+               paniclog_append_noflush("KernelCache UUID: ");
+               for (index = 0; index < sizeof(uuid_t); index++) {
+                       paniclog_append_noflush("%02X", kernelcache_uuid[index]);
+               }
+               paniclog_append_noflush("\n");
        }
-       paniclog_append_noflush("\n");
+       panic_display_kernel_uuid();
 
        paniclog_append_noflush("iBoot version: %.128s\n", firmware_version);
        paniclog_append_noflush("secure boot?: %s\n", debug_enabled ? "NO": "YES");
@@ -367,6 +369,24 @@ do_print_all_backtraces(
        } else {
                paniclog_append_noflush("not available\n");
        }
+       paniclog_append_noflush("x86 Shutdown Cause: ");
+       if (PE_smc_stashed_x86_shutdown_cause != 0xFF) {
+               paniclog_append_noflush("0x%x\n", PE_smc_stashed_x86_shutdown_cause);
+       } else {
+               paniclog_append_noflush("not available\n");
+       }
+       paniclog_append_noflush("x86 Previous Power Transitions: ");
+       if (PE_smc_stashed_x86_prev_power_transitions != UINT64_MAX) {
+               paniclog_append_noflush("0x%llx\n", PE_smc_stashed_x86_prev_power_transitions);
+       } else {
+               paniclog_append_noflush("not available\n");
+       }
+       paniclog_append_noflush("PCIeUp link state: ");
+       if (PE_pcie_stashed_link_state != UINT32_MAX) {
+               paniclog_append_noflush("0x%x\n", PE_pcie_stashed_link_state);
+       } else {
+               paniclog_append_noflush("not available\n");
+       }
 #endif
        paniclog_append_noflush("Paniclog version: %d\n", logversion);
 
@@ -380,6 +400,13 @@ do_print_all_backtraces(
        panic_display_ecc_errors();
 #endif /* CONFIG_ECC_LOGGING */
 
+#if DEVELOPMENT || DEBUG
+       if (cs_debug_unsigned_exec_failures != 0 || cs_debug_unsigned_mmap_failures != 0) {
+               paniclog_append_noflush("Unsigned code exec failures: %u\n", cs_debug_unsigned_exec_failures);
+               paniclog_append_noflush("Unsigned code mmap failures: %u\n", cs_debug_unsigned_mmap_failures);
+       }
+#endif
+
        // Just print threads with high CPU usage for WDT timeouts
        if (strncmp(message, "WDT timeout", 11) == 0) {
                thread_t        top_runnable[5] = {0};
@@ -596,7 +623,7 @@ void panic_print_symbol_name(vm_address_t search)
 
 void
 SavePanicInfo(
-       const char *message, __unused uint64_t panic_options)
+       const char *message, __unused void *panic_data, __unused uint64_t panic_options)
 {
 
        /* This should be initialized by the time we get here */
@@ -787,10 +814,12 @@ DebuggerXCallEnter(
                                paniclog_append_noflush("Attempting to forcibly halt cpu %d\n", cpu);
                                dbgwrap_status_t halt_status = ml_dbgwrap_halt_cpu(cpu, 0);
                                if (halt_status < 0)
-                                       paniclog_append_noflush("Unable to halt cpu %d: %d\n", cpu, halt_status);
+                                       paniclog_append_noflush("cpu %d failed to halt with error %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status));
                                else {
                                        if (halt_status > 0)
-                                               paniclog_append_noflush("cpu %d halted with warning %d\n", cpu, halt_status);
+                                               paniclog_append_noflush("cpu %d halted with warning %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status));
+                                       else
+                                               paniclog_append_noflush("cpu %d successfully halted\n", cpu);
                                        target_cpu_datap->halt_status = CPU_HALTED;
                                }
                        } else
@@ -806,7 +835,7 @@ DebuggerXCallEnter(
                                dbgwrap_status_t halt_status = ml_dbgwrap_halt_cpu_with_state(cpu,
                                    NSEC_PER_SEC, &target_cpu_datap->halt_state);
                                if ((halt_status < 0) || (halt_status == DBGWRAP_WARN_CPU_OFFLINE))
-                                       paniclog_append_noflush("Unable to obtain state for cpu %d: %d\n", cpu, halt_status);
+                                       paniclog_append_noflush("Unable to obtain state for cpu %d with status %d: %s\n", cpu, halt_status, ml_dbgwrap_strerror(halt_status));
                                else
                                        target_cpu_datap->halt_status = CPU_HALTED_WITH_STATE;
                        }
@@ -871,6 +900,7 @@ DebuggerXCall(
        if (save_context) {
                /* Save the interrupted context before acknowledging the signal */
                *state = *regs;
+
        } else if (regs) {
                /* zero old state so machine_trace_thread knows not to backtrace it */
                set_saved_state_fp(state, 0);
index a5b071631b8646205eeb558e5c70f4a48fe5966c..5526c515bded6cc0f26af7e3baf3a47e150a1094 100644 (file)
@@ -32,7 +32,7 @@
 bool mt_core_supported = false;
 
 void
-mt_init(void)
+mt_early_init(void)
 {
 }
 
@@ -48,4 +48,10 @@ mt_cur_cpu(void)
        return &getCpuDatap()->cpu_monotonic;
 }
 
-const struct monotonic_dev monotonic_devs[0];
+int
+mt_microstackshot_start_arch(__unused uint64_t period)
+{
+       return 1;
+}
+
+struct mt_device mt_devices[0];
index 2d06b559e0788d1df41dc18a7c553d05463c78d4..60510d8b26afa633aecd465f25b7628eda738883 100644 (file)
@@ -285,13 +285,14 @@ void
 call_continuation(
                  thread_continue_t continuation,
                  void *parameter,
-                 wait_result_t wresult)
+                 wait_result_t wresult, 
+                 boolean_t enable_interrupts)
 {
 #define call_continuation_kprintf(x...)        /* kprintf("call_continuation_kprintf:
                                         *  " x) */
 
        call_continuation_kprintf("thread = %x continuation = %x, stack = %x\n", current_thread(), continuation, current_thread()->machine.kstackptr);
-       Call_continuation(continuation, parameter, wresult, current_thread()->machine.kstackptr);
+       Call_continuation(continuation, parameter, wresult, enable_interrupts);
 }
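
Editor's note: call_continuation's last argument changes from the kernel stack pointer (presumably derived inside the Call_continuation trampoline now) to an enable_interrupts flag telling the trampoline whether to unmask interrupts before invoking the continuation. A hedged caller sketch; the field names come from struct thread, and whether a given call site passes TRUE is an assumption, not something shown in this diff:

        /* Sketch: hand control to a blocked thread's continuation. */
        static void
        resume_continuation_sketch(thread_t self, wait_result_t wresult)
        {
                call_continuation(self->continuation, self->parameter, wresult,
                                  TRUE /* unmask interrupts on the new stack */);
        }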
 
 void arm_debug_set(arm_debug_state_t *debug_state)
index a62fd171e6ebd18a3292c61350c938458b6df509..754b84930d34d405a7f955880769ae8b6d64d937 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2011-2016 Apple Inc. All rights reserved.
+ * Copyright (c) 2011-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -45,6 +45,9 @@
 #include <kern/misc_protos.h>
 #include <kern/spl.h>
 #include <kern/xpr.h>
+#include <kern/trustcache.h>
+
+#include <os/overflow.h>
 
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -55,6 +58,7 @@
 #include <vm/vm_pageout.h>
 #include <vm/cpm.h>
 
+#include <libkern/img4/interface.h>
 #include <libkern/section_keywords.h>
 
 #include <machine/atomic.h>
@@ -70,8 +74,6 @@
 #include <arm/misc_protos.h>
 #include <arm/trap.h>
 
-#include <libkern/section_keywords.h>
-
 #if    (__ARM_VMSA__ > 7)
 #include <arm64/proc_reg.h>
 #include <pexpert/arm64/boot.h>
 #include <pexpert/device_tree.h>
 
 #include <san/kasan.h>
+#include <sys/cdefs.h>
+
 
 #if MACH_ASSERT
+int vm_footprint_suspend_allowed = 1;
+
+extern int pmap_ledgers_panic;
+extern int pmap_ledgers_panic_leeway;
+
 int pmap_stats_assert = 1;
 #define PMAP_STATS_ASSERTF(cond, pmap, fmt, ...)                   \
        MACRO_BEGIN                                         \
@@ -100,13 +109,18 @@ int pmap_stats_assert = 1;
 #endif /* MACH_ASSERT */
 
 #if DEVELOPMENT || DEBUG
-#define PMAP_FOOTPRINT_SUSPENDED(pmap) ((pmap)->footprint_suspended)
+#define PMAP_FOOTPRINT_SUSPENDED(pmap) \
+       (current_thread()->pmap_footprint_suspended)
 #else /* DEVELOPMENT || DEBUG */
 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
 #endif /* DEVELOPMENT || DEBUG */
 
 
 
+#define pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a)
+#define pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a)
+
+
 #if DEVELOPMENT || DEBUG
 int panic_on_unsigned_execute = 0;
 #endif /* DEVELOPMENT || DEBUG */
@@ -130,6 +144,7 @@ extern pmap_paddr_t avail_end;
 
 extern vm_offset_t     virtual_space_start;    /* Next available kernel VA */
 extern vm_offset_t     virtual_space_end;      /* End of kernel address space */
+extern vm_offset_t     static_memory_end;
 
 extern int hard_maxproc;
 
@@ -168,10 +183,6 @@ decl_simple_lock_data(, pmaps_lock MARK_AS_PMAP_DATA)
 unsigned int   pmap_stamp MARK_AS_PMAP_DATA;
 queue_head_t   map_pmap_list MARK_AS_PMAP_DATA;
 
-queue_head_t   tt_pmap_list MARK_AS_PMAP_DATA;
-unsigned int   tt_pmap_count MARK_AS_PMAP_DATA;
-unsigned int   tt_pmap_max MARK_AS_PMAP_DATA;
-
 decl_simple_lock_data(, pt_pages_lock MARK_AS_PMAP_DATA)
 queue_head_t   pt_page_list MARK_AS_PMAP_DATA; /* pt page ptd entries list */
 
@@ -302,15 +313,28 @@ decl_simple_lock_data(,phys_backup_lock)
 #endif
 #endif
 
-#define        PT_DESC_REFCOUNT        0x4000U
+#define        PT_DESC_REFCOUNT                0x4000U
+#define PT_DESC_IOMMU_REFCOUNT         0x8000U
 
 typedef struct pt_desc {
-       queue_chain_t           pt_page;
+       queue_chain_t                   pt_page;
        struct {
-               unsigned short  refcnt;
-               unsigned short  wiredcnt;
+               /*
+                * For non-leaf pagetables, should always be PT_DESC_REFCOUNT
+                * For leaf pagetables, should reflect the number of non-empty PTEs
+                * For IOMMU pages, should always be PT_DESC_IOMMU_REFCOUNT
+                */
+               unsigned short          refcnt;
+               /*
+                * For non-leaf pagetables, should be 0
+                * For leaf pagetables, should reflect the number of wired entries
+                * For IOMMU pages, may optionally reflect a driver-defined refcount (IOMMU operations are implicitly wired)
+                */
+               unsigned short          wiredcnt;
        } pt_cnt[PT_INDEX_MAX];
-       struct pmap                     *pmap;
+       union {
+               struct pmap             *pmap;
+       };
        struct {
                vm_offset_t             va;
        } pt_map[PT_INDEX_MAX];
@@ -348,20 +372,21 @@ typedef   u_int16_t pp_attr_t;
 
 SECURITY_READ_ONLY_LATE(pp_attr_t*)    pp_attr_table;
 
+typedef struct pmap_io_range
+{
+       uint64_t addr;
+       uint32_t len;
+       uint32_t wimg; // treated as pp_attr_t
+} __attribute__((packed)) pmap_io_range_t;
 
-typedef uint8_t io_attr_t;
-
-#define IO_ATTR_WIMG_MASK              0x3F
-#define IO_ATTR_WIMG(x)                        ((x) & IO_ATTR_WIMG_MASK)
-
-SECURITY_READ_ONLY_LATE(io_attr_t*)    io_attr_table;
+SECURITY_READ_ONLY_LATE(pmap_io_range_t*)      io_attr_table;
 
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)  vm_first_phys = (pmap_paddr_t) 0;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)  vm_last_phys = (pmap_paddr_t) 0;
 
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)  io_rgn_start = 0;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)  io_rgn_end = 0;
-SECURITY_READ_ONLY_LATE(uint32_t)      io_rgn_granule = 0;
+SECURITY_READ_ONLY_LATE(unsigned int)  num_io_rgns = 0;
 
 SECURITY_READ_ONLY_LATE(boolean_t)     pmap_initialized = FALSE;       /* Has pmap_init completed? */
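
Editor's note: io_attr_table changes from a dense per-granule io_attr_t array into a compact list of pmap_io_range_t records (base address, length, wimg attribute), with num_io_rgns replacing the fixed granule size. A lookup over that table might look like the sketch below; the helper name and the linear scan are illustrative only, since the commit's actual lookup code is not shown here.

        /* Sketch: find the wimg attribute for a physical address, if it lies in
         * one of the declared I/O ranges.  Returns -1 when no range matches. */
        static int
        io_range_wimg_for_paddr(pmap_paddr_t paddr)
        {
                for (unsigned int i = 0; i < num_io_rgns; i++) {
                        pmap_io_range_t *r = &io_attr_table[i];
                        if ((paddr >= r->addr) && (paddr < (r->addr + r->len)))
                                return (int)r->wimg;
                }
                return -1;
        }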
 
@@ -381,67 +406,67 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #endif
 
 
-#define pa_index(pa)                                                                           \
+#define pa_index(pa)                                                                   \
        (atop((pa) - vm_first_phys))
 
-#define pai_to_pvh(pai)                                                                                \
+#define pai_to_pvh(pai)                                                                        \
        (&pv_head_table[pai])
 
-#define pa_valid(x)                                                                            \
+#define pa_valid(x)                                                                    \
        ((x) >= vm_first_phys && (x) < vm_last_phys)
 
 /* PTE Define Macros */
 
-#define        pte_is_wired(pte)                                                                       \
+#define        pte_is_wired(pte)                                                               \
        (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
 
-#define        pte_set_wired(ptep, wired)                                                      \
+#define        pte_set_wired(ptep, wired)                                                                              \
        do {                                                                                                    \
-               SInt16  *ptd_wiredcnt_ptr;                                                      \
+               SInt16  *ptd_wiredcnt_ptr;                                                                      \
                ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_ptd(ptep)->pt_cnt[ARM_PT_DESC_INDEX(ptep)].wiredcnt);   \
-               if (wired) {                                                                            \
-                               *ptep |= ARM_PTE_WIRED;                                         \
-                               OSAddAtomic16(1, ptd_wiredcnt_ptr);                     \
+               if (wired) {                                                                                    \
+                               *ptep |= ARM_PTE_WIRED;                                                         \
+                               OSAddAtomic16(1, ptd_wiredcnt_ptr);                                             \
                } else {                                                                                        \
-                               *ptep &= ~ARM_PTE_WIRED;                                        \
-                               OSAddAtomic16(-1, ptd_wiredcnt_ptr);            \
+                               *ptep &= ~ARM_PTE_WIRED;                                                        \
+                               OSAddAtomic16(-1, ptd_wiredcnt_ptr);                                            \
                }                                                                                               \
        } while(0)
 
-#define        pte_is_ffr(pte)                                                                         \
+#define        pte_is_ffr(pte)                                                                 \
        (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
 
 #define        pte_set_ffr(pte, ffr)                                                           \
-       do {                                                                                                    \
-               if (ffr) {                                                                                      \
-                       pte |= ARM_PTE_WRITEABLE;                                               \
-               } else {                                                                                        \
-                       pte &= ~ARM_PTE_WRITEABLE;                                              \
-               }                                                                                                       \
+       do {                                                                            \
+               if (ffr) {                                                              \
+                       pte |= ARM_PTE_WRITEABLE;                                       \
+               } else {                                                                \
+                       pte &= ~ARM_PTE_WRITEABLE;                                      \
+               }                                                                       \
        } while(0)
 
 /* PVE Define Macros */
 
-#define pve_next(pve)                                                                          \
+#define pve_next(pve)                                                                  \
        ((pve)->pve_next)
 
-#define pve_link_field(pve)                                                                    \
+#define pve_link_field(pve)                                                            \
        (&pve_next(pve))
 
-#define pve_link(pp, e)                                                                                \
+#define pve_link(pp, e)                                                                        \
        ((pve_next(e) = pve_next(pp)),  (pve_next(pp) = (e)))
 
-#define pve_unlink(pp, e)                                                                      \
+#define pve_unlink(pp, e)                                                              \
        (pve_next(pp) = pve_next(e))
 
 /* bits held in the ptep pointer field */
 
-#define pve_get_ptep(pve)                                                                      \
+#define pve_get_ptep(pve)                                                              \
        ((pve)->pve_ptep)
 
-#define pve_set_ptep(pve, ptep_new)                                                            \
-       do {                                                                                    \
-               (pve)->pve_ptep = (ptep_new);                                                   \
+#define pve_set_ptep(pve, ptep_new)                                                    \
+       do {                                                                            \
+               (pve)->pve_ptep = (ptep_new);                                           \
        } while (0)
 
 /* PTEP Define Macros */
@@ -461,18 +486,11 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
         */
 #define ARM_TT_PT_ADDR_SHIFT           (10U)
 
-#define        ARM_PT_DESC_INDEX(ptep)                                                                 \
-       (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT)
-
-#define ptep_get_ptd(ptep)                                                                             \
-       ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)(ptep) - gVirtBase + gPhysBase))))) & PVH_LIST_MASK))
-
 #define ptep_get_va(ptep)                                                                              \
-       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~0xFFF) - gVirtBase + gPhysBase))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
+       ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
 
 #define ptep_get_pmap(ptep)                                                                            \
-        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~0xFFF) - gVirtBase + gPhysBase))))))->pmap))
-
+        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~0xFFF))))))))->pmap))
 
 #else
 
@@ -508,157 +526,164 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #define ARM_TT_PT_OTHER_MASK           (0x3fffULL)
 #endif
 
-#define        ARM_PT_DESC_INDEX(ptep)                                                                 \
+#define        ARM_PT_DESC_INDEX(ptep)                                                                         \
        (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT)
 
-
-#define ptep_get_ptd(ptep)                                                                             \
-       ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)(ptep) - gVirtBase + gPhysBase))))) & PVH_LIST_MASK))
-
 #define ptep_get_va(ptep)                                                                              \
-        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK) - gVirtBase + gPhysBase))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
+        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pt_map[ARM_PT_DESC_INDEX(ptep)].va)+ ((((unsigned)(ptep)) & ARM_TT_PT_INDEX_MASK)<<ARM_TT_PT_ADDR_SHIFT))
 
 #define ptep_get_pmap(ptep)                                                                            \
-        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK) - gVirtBase + gPhysBase))))))->pmap))
+        ((((pt_desc_t *) (pvh_list(pai_to_pvh(pa_index(ml_static_vtop((((vm_offset_t)(ptep) & ~ARM_TT_PT_OTHER_MASK))))))))->pmap))
 
 #endif
 
+#define        ARM_PT_DESC_INDEX(ptep)                                                                         \
+       (((unsigned)(ptep) & ARM_PT_DESC_INDEX_MASK) >> ARM_PT_DESC_INDEX_SHIFT)
+
+#define ptep_get_ptd(ptep)                                                                             \
+       ((struct pt_desc *)(pvh_list(pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)(ptep)))))))
+
 
 /* PVH Define Macros */
 
 /* pvhead type */
-#define        PVH_TYPE_NULL   0x0UL
-#define        PVH_TYPE_PVEP   0x1UL
-#define        PVH_TYPE_PTEP   0x2UL
-#define        PVH_TYPE_PTDP   0x3UL
+#define        PVH_TYPE_NULL        0x0UL
+#define        PVH_TYPE_PVEP        0x1UL
+#define        PVH_TYPE_PTEP        0x2UL
+#define        PVH_TYPE_PTDP        0x3UL
 
-#define PVH_TYPE_MASK  (0x3UL)
-#define PVH_LIST_MASK  (~PVH_TYPE_MASK)
+#define PVH_TYPE_MASK        (0x3UL)
 
-#if    (__ARM_VMSA__ == 7)
-#define pvh_set_bits(h, b)                                                                             \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap(*(vm_offset_t *)(h), *(vm_offset_t *)(h) | (b), (vm_offset_t *)(h)));  \
-       } while (0)
+#ifdef __arm64__
 
-#define pvh_clear_bits(h, b)                                                                   \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap(*(vm_offset_t *)(h), *(vm_offset_t *)(h) & ~(b), (vm_offset_t *)(h))); \
-       } while (0)
-#else
-#define pvh_set_bits(h, b)                                                                             \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap64(*(vm_offset_t *)(h), *(vm_offset_t *)(h) | ((int64_t)b), (vm_offset_t *)(h)));       \
-       } while (0)
+#define PVH_FLAG_IOMMU       0x4UL
+#define PVH_FLAG_IOMMU_TABLE (1ULL << 63)
+#define PVH_FLAG_CPU         (1ULL << 62)
+#define PVH_LOCK_BIT         61
+#define PVH_FLAG_LOCK        (1ULL << PVH_LOCK_BIT)
+#define PVH_FLAG_EXEC        (1ULL << 60)
+#define PVH_FLAG_LOCKDOWN    (1ULL << 59)
+#define PVH_HIGH_FLAGS       (PVH_FLAG_CPU | PVH_FLAG_LOCK | PVH_FLAG_EXEC | PVH_FLAG_LOCKDOWN)
+
+#else  /* !__arm64__ */
+
+#define PVH_LOCK_BIT         31
+#define PVH_FLAG_LOCK        (1UL << PVH_LOCK_BIT)
+#define PVH_HIGH_FLAGS       PVH_FLAG_LOCK
 
-#define pvh_clear_bits(h, b)                                                                   \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap64(*(vm_offset_t *)(h), *(vm_offset_t *)(h) & ~((int64_t)b), (vm_offset_t *)(h)));      \
-       } while (0)
 #endif
 
-#define pvh_test_type(h, b)                                                                            \
+#define PVH_LIST_MASK  (~PVH_TYPE_MASK)
+
+#define pvh_test_type(h, b)                                                                                    \
        ((*(vm_offset_t *)(h) & (PVH_TYPE_MASK)) == (b))
 
 #define pvh_ptep(h)                                                                                            \
-               ((pt_entry_t *)(*(vm_offset_t *)(h) & PVH_LIST_MASK))
+       ((pt_entry_t *)((*(vm_offset_t *)(h) & PVH_LIST_MASK) | PVH_HIGH_FLAGS))
 
 #define pvh_list(h)                                                                                            \
-               ((pv_entry_t *)(*(vm_offset_t *)(h) & PVH_LIST_MASK))
+       ((pv_entry_t *)((*(vm_offset_t *)(h) & PVH_LIST_MASK) | PVH_HIGH_FLAGS))
 
-#define pvh_bits(h)                                                                                            \
-       (*(vm_offset_t *)(h) & PVH_TYPE_MASK)
+#define pvh_get_flags(h)                                                                                       \
+       (*(vm_offset_t *)(h) & PVH_HIGH_FLAGS)
 
-#if    (__ARM_VMSA__ == 7)
-#define pvh_update_head(h, e, t)                                                                       \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap(*(vm_offset_t *)(h), (vm_offset_t)(e) | (t), (vm_offset_t *)(h)));     \
+#define pvh_set_flags(h, f)                                                                                    \
+       do {                                                                                                    \
+               __c11_atomic_store((_Atomic vm_offset_t *)(h), (*(vm_offset_t *)(h) & ~PVH_HIGH_FLAGS) | (f),   \
+                    memory_order_relaxed);                                                                     \
        } while (0)
-#else
-#define pvh_update_head(h, e, t)                                                                       \
-       do {                                                                                                            \
-               while (!OSCompareAndSwap64(*(vm_offset_t *)(h), (vm_offset_t)(e) | (t), (vm_offset_t *)(h)));   \
+
+#define pvh_update_head(h, e, t)                                                                               \
+       do {                                                                                                    \
+               assert(*(vm_offset_t *)(h) & PVH_FLAG_LOCK);                                                    \
+               __c11_atomic_store((_Atomic vm_offset_t *)(h), (vm_offset_t)(e) | (t) | PVH_FLAG_LOCK,          \
+                    memory_order_relaxed);                                                                     \
        } while (0)
-#endif
 
-#define pvh_add(h, e)                                                  \
-       do {                                                            \
-               assert(!pvh_test_type((h), PVH_TYPE_PTEP));             \
-               pve_next(e) = pvh_list(h);                              \
-               pvh_update_head((h), (e), PVH_TYPE_PVEP);               \
+#define pvh_update_head_unlocked(h, e, t)                                                                      \
+       do {                                                                                                    \
+               assert(!(*(vm_offset_t *)(h) & PVH_FLAG_LOCK));                                                 \
+               *(vm_offset_t *)(h) = ((vm_offset_t)(e) | (t)) & ~PVH_FLAG_LOCK;                                \
        } while (0)
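
Editor's note: on arm64 the upper bits of a pv_head entry now carry state (PVH_FLAG_CPU, PVH_FLAG_LOCK, PVH_FLAG_EXEC, PVH_FLAG_LOCKDOWN, plus the IOMMU markers), so pvh_ptep()/pvh_list() OR PVH_HIGH_FLAGS back into the extracted pointer and pvh_update_head() asserts the lock bit is held. Code that rewrites a locked head while keeping its flags would follow a save/update/restore pattern such as this sketch; the variable names are illustrative:

        static void
        pvh_replace_head_sketch(pv_entry_t **pv_h, pv_entry_t *new_pve)
        {
                vm_offset_t saved = pvh_get_flags(pv_h);        /* stash PVH_HIGH_FLAGS            */
                pvh_update_head(pv_h, new_pve, PVH_TYPE_PVEP);  /* asserts PVH_FLAG_LOCK is held   */
                pvh_set_flags(pv_h, saved);                     /* restore EXEC/CPU/LOCKDOWN bits  */
        }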
 
-#define pvh_remove(h, p, e)                                            \
-       do {                                                            \
-               assert(!PVE_NEXT_IS_ALTACCT(pve_next((e))));            \
-               if ((p) == (h)) {                                       \
-                       if (PVE_NEXT_PTR(pve_next((e))) == PV_ENTRY_NULL) { \
-                               pvh_update_head((h), PV_ENTRY_NULL, PVH_TYPE_NULL); \
-                       } else {                                        \
-                               pvh_update_head((h), PVE_NEXT_PTR(pve_next((e))), PVH_TYPE_PVEP); \
-                       }                                               \
-               } else {                                                \
-                       /*                                              \
-                        * PMAP LEDGERS:                                \
-                        * preserve the "alternate accounting" bit      \
-                        * when updating "p" (the previous entry's      \
-                        * "pve_next").                                 \
-                        */                                             \
-                       boolean_t       __is_altacct;                   \
-                       __is_altacct = PVE_NEXT_IS_ALTACCT(*(p));       \
-                       *(p) = PVE_NEXT_PTR(pve_next((e)));             \
-                       if (__is_altacct) {                             \
-                               PVE_NEXT_SET_ALTACCT((p));              \
-                       } else {                                        \
-                               PVE_NEXT_CLR_ALTACCT((p));              \
-                       }                                               \
-               }                                                       \
+#define pvh_add(h, e)                                                                  \
+       do {                                                                            \
+               assert(!pvh_test_type((h), PVH_TYPE_PTEP));                             \
+               pve_next(e) = pvh_list(h);                                              \
+               pvh_update_head((h), (e), PVH_TYPE_PVEP);                               \
+       } while (0)
+
+#define pvh_remove(h, p, e)                                                                            \
+       do {                                                                                            \
+               assert(!PVE_NEXT_IS_ALTACCT(pve_next((e))));                                            \
+               if ((p) == (h)) {                                                                       \
+                       if (PVE_NEXT_PTR(pve_next((e))) == PV_ENTRY_NULL) {                             \
+                               pvh_update_head((h), PV_ENTRY_NULL, PVH_TYPE_NULL);                     \
+                       } else {                                                                        \
+                               pvh_update_head((h), PVE_NEXT_PTR(pve_next((e))), PVH_TYPE_PVEP);       \
+                       }                                                                               \
+               } else {                                                                                \
+                       /*                                                                              \
+                        * PMAP LEDGERS:                                                                \
+                        * preserve the "alternate accounting" bit                                      \
+                        * when updating "p" (the previous entry's                                      \
+                        * "pve_next").                                                                 \
+                        */                                                                             \
+                       boolean_t       __is_altacct;                                                   \
+                       __is_altacct = PVE_NEXT_IS_ALTACCT(*(p));                                       \
+                       *(p) = PVE_NEXT_PTR(pve_next((e)));                                             \
+                       if (__is_altacct) {                                                             \
+                               PVE_NEXT_SET_ALTACCT((p));                                              \
+                       } else {                                                                        \
+                               PVE_NEXT_CLR_ALTACCT((p));                                              \
+                       }                                                                               \
+               }                                                                                       \
        } while (0)
 
 
 /* PPATTR Define Macros */
 
-#define ppattr_set_bits(h, b)                                                                          \
-       do {                                                                                                            \
+#define ppattr_set_bits(h, b)                                                                                  \
+       do {                                                                                                    \
                while (!OSCompareAndSwap16(*(pp_attr_t *)(h), *(pp_attr_t *)(h) | (b), (pp_attr_t *)(h)));      \
        } while (0)
 
-#define ppattr_clear_bits(h, b)                                                                        \
-       do {                                                                                                            \
+#define ppattr_clear_bits(h, b)                                                                                        \
+       do {                                                                                                    \
                while (!OSCompareAndSwap16(*(pp_attr_t *)(h), *(pp_attr_t *)(h) & ~(b), (pp_attr_t *)(h)));     \
        } while (0)
 
-#define ppattr_test_bits(h, b)                                                                         \
+#define ppattr_test_bits(h, b)                                                         \
        ((*(pp_attr_t *)(h) & (b)) == (b))
 
-#define pa_set_bits(x, b)                                                                              \
-       do {                                                                                                            \
-               if (pa_valid(x))                                                                                \
-                       ppattr_set_bits(&pp_attr_table[pa_index(x)],            \
-                                    (b));                                                                              \
+#define pa_set_bits(x, b)                                                              \
+       do {                                                                            \
+               if (pa_valid(x))                                                        \
+                       ppattr_set_bits(&pp_attr_table[pa_index(x)],                    \
+                                    (b));                                              \
        } while (0)
 
-#define pa_test_bits(x, b)                                                                             \
+#define pa_test_bits(x, b)                                                             \
        (pa_valid(x) ? ppattr_test_bits(&pp_attr_table[pa_index(x)],\
                                     (b)) : FALSE)
 
-#define pa_clear_bits(x, b)                                                                            \
-       do {                                                                                                            \
-               if (pa_valid(x))                                                                                \
-                       ppattr_clear_bits(&pp_attr_table[pa_index(x)],          \
-                                      (b));                                                                    \
+#define pa_clear_bits(x, b)                                                            \
+       do {                                                                            \
+               if (pa_valid(x))                                                        \
+                       ppattr_clear_bits(&pp_attr_table[pa_index(x)],                  \
+                                      (b));                                            \
        } while (0)
 
-#define pa_set_modify(x)                                                                               \
+#define pa_set_modify(x)                                                               \
        pa_set_bits(x, PP_ATTR_MODIFIED)
 
-#define pa_clear_modify(x)                                                                             \
+#define pa_clear_modify(x)                                                             \
        pa_clear_bits(x, PP_ATTR_MODIFIED)
 
-#define pa_set_reference(x)                                                                            \
+#define pa_set_reference(x)                                                            \
        pa_set_bits(x, PP_ATTR_REFERENCED)
 
-#define pa_clear_reference(x)                                                                  \
+#define pa_clear_reference(x)                                                          \
        pa_clear_bits(x, PP_ATTR_REFERENCED)
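/*
 * A minimal userspace sketch of the compare-and-swap retry loop used by
 * ppattr_set_bits()/ppattr_clear_bits() above. The 16-bit attribute word,
 * C11 atomics and main() are illustrative stand-ins, not the XNU
 * OSCompareAndSwap16 primitive itself.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef uint16_t pp_attr_sketch_t;

static void
sketch_set_bits(_Atomic pp_attr_sketch_t *attr, pp_attr_sketch_t bits)
{
	pp_attr_sketch_t old = atomic_load(attr);
	/* Retry until no other CPU raced an update in between. */
	while (!atomic_compare_exchange_weak(attr, &old, (pp_attr_sketch_t)(old | bits)))
		;
}

static void
sketch_clear_bits(_Atomic pp_attr_sketch_t *attr, pp_attr_sketch_t bits)
{
	pp_attr_sketch_t old = atomic_load(attr);
	while (!atomic_compare_exchange_weak(attr, &old, (pp_attr_sketch_t)(old & ~bits)))
		;
}

int
main(void)
{
	_Atomic pp_attr_sketch_t attr = 0;
	sketch_set_bits(&attr, 0x0008 | 0x0010);    /* e.g. MODIFIED | REFERENCED */
	sketch_clear_bits(&attr, 0x0010);
	printf("attr = 0x%04x\n", atomic_load(&attr));   /* prints attr = 0x0008 */
	return 0;
}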
 
 
@@ -676,21 +701,21 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #define CLR_REUSABLE_PAGE(pai) \
        ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_REUSABLE)
 
-#define IS_ALTACCT_PAGE(pai, pve_p)                            \
-       (((pve_p) == NULL)                                        \
-        ? ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT)  \
+#define IS_ALTACCT_PAGE(pai, pve_p)                                                    \
+       (((pve_p) == NULL)                                                              \
+        ? ppattr_test_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT)                       \
         : PVE_NEXT_IS_ALTACCT(pve_next((pve_p))))
-#define SET_ALTACCT_PAGE(pai, pve_p)                                   \
-       if ((pve_p) == NULL) {                                          \
-               ppattr_set_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT);  \
-       } else {                                                        \
-               PVE_NEXT_SET_ALTACCT(&pve_next((pve_p)));               \
+#define SET_ALTACCT_PAGE(pai, pve_p)                                                   \
+       if ((pve_p) == NULL) {                                                          \
+               ppattr_set_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT);                  \
+       } else {                                                                        \
+               PVE_NEXT_SET_ALTACCT(&pve_next((pve_p)));                               \
        }
-#define CLR_ALTACCT_PAGE(pai, pve_p)                                   \
-       if ((pve_p) == NULL) {                                          \
-               ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT);\
-       } else {                                                        \
-               PVE_NEXT_CLR_ALTACCT(&pve_next((pve_p)));               \
+#define CLR_ALTACCT_PAGE(pai, pve_p)                                                   \
+       if ((pve_p) == NULL) {                                                          \
+               ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_ALTACCT);                \
+       } else {                                                                        \
+               PVE_NEXT_CLR_ALTACCT(&pve_next((pve_p)));                               \
        }
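/*
 * The "alternate accounting" flag for a mapping lives either in pp_attr_table
 * (when the page's sole mapping sits directly in pv_head_table, pve_p == NULL)
 * or in a low tag bit of the entry's pve_next pointer. A minimal sketch of that
 * low-bit pointer tagging, assuming entries are at least 4-byte aligned; the
 * struct and helper names are illustrative, not the PVE_NEXT_* macros themselves.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_pve {
	struct sketch_pve *next;
};

#define SKETCH_ALTACCT_BIT 0x1UL

static void sketch_set_altacct(struct sketch_pve **p)
{ *p = (struct sketch_pve *)((uintptr_t)*p | SKETCH_ALTACCT_BIT); }

static void sketch_clr_altacct(struct sketch_pve **p)
{ *p = (struct sketch_pve *)((uintptr_t)*p & ~SKETCH_ALTACCT_BIT); }

static int sketch_is_altacct(struct sketch_pve *p)
{ return ((uintptr_t)p & SKETCH_ALTACCT_BIT) != 0; }

static struct sketch_pve *sketch_next_ptr(struct sketch_pve *p)
{ return (struct sketch_pve *)((uintptr_t)p & ~SKETCH_ALTACCT_BIT); }

int
main(void)
{
	struct sketch_pve b = { NULL }, a = { &b };
	sketch_set_altacct(&a.next);
	assert(sketch_is_altacct(a.next));
	assert(sketch_next_ptr(a.next) == &b);   /* the tag does not disturb the link */
	sketch_clr_altacct(&a.next);
	printf("altacct=%d\n", sketch_is_altacct(a.next));
	return 0;
}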
 
 #define IS_REFFAULT_PAGE(pai) \
@@ -707,171 +732,184 @@ SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap;
 #define CLR_MODFAULT_PAGE(pai) \
        ppattr_clear_bits(&pp_attr_table[pai], PP_ATTR_MODFAULT)
 
+#define tte_get_ptd(tte)                                                               \
+       ((struct pt_desc *)(pvh_list(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK))))))
+
 
 #if    (__ARM_VMSA__ == 7)
 
-#define tte_index(pmap, addr)                                                                  \
+#define tte_index(pmap, addr)                                                          \
        ttenum((addr))
 
-#define tte_get_ptd(tte)                                                                               \
-       ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK)))))) & PVH_LIST_MASK))
-
 #else
 
-#define tt0_index(pmap, addr)                                                                  \
+#define tt0_index(pmap, addr)                                                          \
        (((addr) & ARM_TT_L0_INDEX_MASK) >> ARM_TT_L0_SHIFT)
 
-#define tt1_index(pmap, addr)                                                                  \
+#define tt1_index(pmap, addr)                                                          \
        (((addr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)
 
-#define tt2_index(pmap, addr)                                                                  \
+#define tt2_index(pmap, addr)                                                          \
        (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
 
-#define tt3_index(pmap, addr)                                                                  \
+#define tt3_index(pmap, addr)                                                          \
        (((addr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)
 
-#define tte_index(pmap, addr)                                                                  \
+#define tte_index(pmap, addr)                                                          \
        (((addr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
 
-#define tte_get_ptd(tte)                                                                               \
-       ((struct pt_desc *)((*((vm_offset_t *)(pai_to_pvh(pa_index((vm_offset_t)((tte) & ~PAGE_MASK)))))) & PVH_LIST_MASK))
-
 #endif
 
 /*
  *     Lock on pmap system
  */
 
-#define PMAP_LOCK_INIT(pmap) {                                                                 \
-       simple_lock_init(&(pmap)->lock, 0);                                                     \
+#define PMAP_LOCK_INIT(pmap) {                                                         \
+       simple_lock_init(&(pmap)->lock, 0);                                             \
                        }
 
-#define PMAP_LOCK(pmap) {                                                                              \
-       simple_lock(&(pmap)->lock);                                                                     \
+#define PMAP_LOCK(pmap) {                                                              \
+       pmap_simple_lock(&(pmap)->lock);                                                \
 }
 
-#define PMAP_UNLOCK(pmap) {                                                                            \
-       simple_unlock(&(pmap)->lock);                                                           \
+#define PMAP_UNLOCK(pmap) {                                                            \
+       pmap_simple_unlock(&(pmap)->lock);                                              \
 }
 
 #if MACH_ASSERT
-#define PMAP_ASSERT_LOCKED(pmap) {                                                             \
-       simple_lock_assert(&(pmap)->lock, LCK_ASSERT_OWNED);                                    \
+#define PMAP_ASSERT_LOCKED(pmap) {                                                     \
+       simple_lock_assert(&(pmap)->lock, LCK_ASSERT_OWNED);                            \
 }
 #else
 #define PMAP_ASSERT_LOCKED(pmap)
 #endif
 
-/*
- *     Each entry in the pv_head_table is locked by a bit in the
- *     pv lock array, which is stored in the region preceding pv_head_table.
- *     The lock bits are accessed by the physical address of the page they lock.
- */
-#define LOCK_PVH(index)        {                                                                               \
-       hw_lock_bit((hw_lock_bit_t *)                                                                           \
-               ((unsigned int*)pv_head_table)-1-(index>>5),                    \
-               (index&0x1F));                                                                                  \
-       }
+#if defined(__arm64__)
+#define PVH_LOCK_WORD 1 /* Assumes little-endian */
+#else
+#define PVH_LOCK_WORD 0
+#endif
 
-#define UNLOCK_PVH(index)      {                                                                       \
-       hw_unlock_bit((hw_lock_bit_t *)                                                                 \
-               ((unsigned int*)pv_head_table)-1-(index>>5),                    \
-               (index&0x1F));                                                                                  \
-       }
+#define ASSERT_PVH_LOCKED(index)                                                                                                       \
+       do {                                                                                                                            \
+               assert((vm_offset_t)(pv_head_table[index]) & PVH_FLAG_LOCK);                                                            \
+       } while (0)
 
-#define ASSERT_PVH_LOCKED(index) {                                                             \
-       assert(*(((unsigned int*)pv_head_table)-1-(index>>5)) & (1 << (index & 0x1F)));         \
-}
+#define LOCK_PVH(index)                                                                                                                        \
+       do {                                                                                                                            \
+               pmap_lock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32));         \
+       } while (0)
+
+#define UNLOCK_PVH(index)                                                                                                      \
+       do {                                                                                                                    \
+               ASSERT_PVH_LOCKED(index);                                                                                       \
+               pmap_unlock_bit((uint32_t*)(&pv_head_table[index]) + PVH_LOCK_WORD, PVH_LOCK_BIT - (PVH_LOCK_WORD * 32));       \
+       } while (0)
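/*
 * LOCK_PVH/UNLOCK_PVH now use a lock bit embedded in each 64-bit pv_head_table
 * entry (PVH_FLAG_LOCK) instead of a separate lock-bit array. The lock primitive
 * operates on a 32-bit word, so PVH_LOCK_WORD selects which half of the entry
 * holds the bit, assuming little-endian layout. A sketch of that word/bit
 * arithmetic; the bit position 61 below is illustrative, not taken from pmap.h.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PVH_LOCK_BIT   61
#define SKETCH_PVH_LOCK_WORD  1   /* upper 32-bit word on little-endian */

union sketch_entry {
	uint64_t u64;
	uint32_t u32[2];
};

int
main(void)
{
	union sketch_entry e = { .u64 = 0 };

	/* Set bit (LOCK_BIT - WORD*32) within the selected 32-bit word... */
	e.u32[SKETCH_PVH_LOCK_WORD] |=
	    1u << (SKETCH_PVH_LOCK_BIT - SKETCH_PVH_LOCK_WORD * 32);

	/* ...and observe it is the same bit of the full 64-bit entry. */
	printf("entry = 0x%016llx, bit %d set: %d\n",
	    (unsigned long long)e.u64, SKETCH_PVH_LOCK_BIT,
	    (int)((e.u64 >> SKETCH_PVH_LOCK_BIT) & 1));
	return 0;
}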
 
 #define PMAP_UPDATE_TLBS(pmap, s, e) {                                                 \
-       flush_mmu_tlb_region_asid(s, (unsigned)(e - s), pmap);                                  \
+       flush_mmu_tlb_region_asid_async(s, (unsigned)(e - s), pmap);                    \
+       sync_tlb_flush();                                                               \
 }
 
 #ifdef __ARM_L1_PTW__
 
-#define FLUSH_PTE_RANGE(spte, epte)                                                            \
-       __asm__ volatile("dsb ish");
+#define FLUSH_PTE_RANGE(spte, epte)                                                    \
+       __builtin_arm_dmb(DMB_ISH);
 
-#define FLUSH_PTE(pte_p)                                                                               \
-       __asm__ volatile("dsb ish");
+#define FLUSH_PTE(pte_p)                                                               \
+       __builtin_arm_dmb(DMB_ISH);
 
-#else
+#define FLUSH_PTE_STRONG(pte_p)                                                                \
+       __builtin_arm_dsb(DSB_ISH);
+
+#define FLUSH_PTE_RANGE_STRONG(spte, epte)                                             \
+       __builtin_arm_dsb(DSB_ISH);
 
-#define FLUSH_PTE_RANGE(spte, epte)                                                            \
+#else /* __ARM_L1_PTW */
+
+#define FLUSH_PTE_RANGE(spte, epte)                                                    \
                CleanPoU_DcacheRegion((vm_offset_t)spte,                                \
                        (vm_offset_t)epte - (vm_offset_t)spte);
 
-#define FLUSH_PTE(pte_p)                                                                               \
-       CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t));
-#endif
+#define FLUSH_PTE(pte_p)                                                               \
+       __unreachable_ok_push                                                           \
+       if (TEST_PAGE_RATIO_4)                                                          \
+               FLUSH_PTE_RANGE((pte_p), (pte_p) + 4);                                  \
+       else                                                                            \
+               FLUSH_PTE_RANGE((pte_p), (pte_p) + 1);                                  \
+       CleanPoU_DcacheRegion((vm_offset_t)pte_p, sizeof(pt_entry_t));                  \
+       __unreachable_ok_pop
 
-#define WRITE_PTE(pte_p, pte_entry)                                                            \
-    __unreachable_ok_push                                                                              \
-       if (TEST_PAGE_RATIO_4) {                                                                        \
-       do {                                                                                                            \
-               if (((unsigned)(pte_p)) & 0x1f) panic("WRITE_PTE\n");           \
-               if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {        \
-               *(pte_p) = (pte_entry);                                                                 \
-               *((pte_p)+1) = (pte_entry);                                                             \
-               *((pte_p)+2) = (pte_entry);                                                             \
-               *((pte_p)+3) = (pte_entry);                                                             \
-               } else {                                                                                                \
-               *(pte_p) = (pte_entry);                                                                 \
-               *((pte_p)+1) = (pte_entry) | 0x1000;                                            \
-               *((pte_p)+2) = (pte_entry) | 0x2000;                                            \
-               *((pte_p)+3) = (pte_entry) | 0x3000;                                            \
-               }                                                                                                               \
-               FLUSH_PTE_RANGE((pte_p),((pte_p)+4));                                           \
-       } while(0);                                                                                                     \
-       } else {                                                                                                        \
-       do {                                                                                                            \
-               *(pte_p) = (pte_entry);                                                                 \
-               FLUSH_PTE(pte_p);                                                                               \
-       } while(0);                                                                                                     \
-       }                                                                                                                       \
-    __unreachable_ok_pop
+#define FLUSH_PTE_STRONG(pte_p)        FLUSH_PTE(pte_p)
 
-#define WRITE_PTE_FAST(pte_p, pte_entry)                                               \
-    __unreachable_ok_push                                                                              \
-       if (TEST_PAGE_RATIO_4) {                                                                        \
-       if (((unsigned)(pte_p)) & 0x1f) panic("WRITE_PTE\n");                   \
-       if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {        \
-       *(pte_p) = (pte_entry);                                                                         \
-       *((pte_p)+1) = (pte_entry);                                                                     \
-       *((pte_p)+2) = (pte_entry);                                                                     \
-       *((pte_p)+3) = (pte_entry);                                                                     \
-       } else {                                                                                                        \
-       *(pte_p) = (pte_entry);                                                                         \
-       *((pte_p)+1) = (pte_entry) | 0x1000;                                                    \
-       *((pte_p)+2) = (pte_entry) | 0x2000;                                                    \
-       *((pte_p)+3) = (pte_entry) | 0x3000;                                                    \
-       }                                                                                                                       \
-       } else {                                                                                                        \
-       *(pte_p) = (pte_entry);                                                                         \
-       }                                                                                                                       \
-    __unreachable_ok_pop
+#define FLUSH_PTE_RANGE_STRONG(spte, epte) FLUSH_PTE_RANGE(spte, epte)
 
+#endif /* !defined(__ARM_L1_PTW) */
+
+#define WRITE_PTE_FAST(pte_p, pte_entry)                                               \
+       __unreachable_ok_push                                                           \
+       if (TEST_PAGE_RATIO_4) {                                                        \
+               if (((unsigned)(pte_p)) & 0x1f)                                         \
+                       panic("WRITE_PTE\n");                                           \
+               if (((pte_entry) & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {        \
+                       *(pte_p) = (pte_entry);                                         \
+                       *((pte_p)+1) = (pte_entry);                                     \
+                       *((pte_p)+2) = (pte_entry);                                     \
+                       *((pte_p)+3) = (pte_entry);                                     \
+               } else {                                                                \
+                       *(pte_p) = (pte_entry);                                         \
+                       *((pte_p)+1) = (pte_entry) | 0x1000;                            \
+                       *((pte_p)+2) = (pte_entry) | 0x2000;                            \
+                       *((pte_p)+3) = (pte_entry) | 0x3000;                            \
+               }                                                                       \
+       } else {                                                                        \
+               *(pte_p) = (pte_entry);                                                 \
+       }                                                                               \
+       __unreachable_ok_pop
+
+#define WRITE_PTE(pte_p, pte_entry)                                                    \
+       WRITE_PTE_FAST(pte_p, pte_entry);                                               \
+       FLUSH_PTE(pte_p);
+
+#define WRITE_PTE_STRONG(pte_p, pte_entry)                                             \
+       WRITE_PTE_FAST(pte_p, pte_entry);                                               \
+       FLUSH_PTE_STRONG(pte_p);
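/*
 * Under TEST_PAGE_RATIO_4 a 16KB VM page is backed by four contiguous 4KB
 * hardware PTEs, so WRITE_PTE_FAST above writes the template into four slots,
 * stepping the output address by 0x1000/0x2000/0x3000 for valid mappings while
 * replicating empty/compressed templates unchanged. A standalone sketch of that
 * expansion; the constants and the "empty/compressed" test are simplified
 * stand-ins for ARM_PTE_EMPTY and ARM_PTE_COMPRESSED_MASK.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PTE_EMPTY        0x0ULL
#define SKETCH_COMPRESSED_MASK  0x3ULL   /* illustrative low "compressed" bits */

static void
sketch_write_pte_fast(uint64_t pte[4], uint64_t tmpl)
{
	if ((tmpl & ~SKETCH_COMPRESSED_MASK) == SKETCH_PTE_EMPTY) {
		/* Empty/compressed markers carry no physical address: copy as-is. */
		for (int i = 0; i < 4; i++)
			pte[i] = tmpl;
	} else {
		/* Valid mappings advance the output address one 4KB page per slot. */
		for (int i = 0; i < 4; i++)
			pte[i] = tmpl | ((uint64_t)i << 12);   /* +0x0000/0x1000/0x2000/0x3000 */
	}
}

int
main(void)
{
	uint64_t pte[4];
	sketch_write_pte_fast(pte, 0x0000000080004003ULL);
	for (int i = 0; i < 4; i++)
		printf("pte[%d] = 0x%016llx\n", i, (unsigned long long)pte[i]);
	return 0;
}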
 
 /*
  * Other useful macros.
  */
-#define current_pmap()                                                                                 \
+#define current_pmap()                                                                 \
        (vm_map_pmap(current_thread()->map))
 
-#define PMAP_IS_VALID(x) (TRUE)
 
-#ifdef PMAP_TRACES
-unsigned int pmap_trace = 0;
+#define VALIDATE_USER_PMAP(x)
+#define VALIDATE_PMAP(x)
+#define VALIDATE_LEDGER(x)
+
+
+#if DEVELOPMENT || DEBUG
+
+/*
+ * Trace levels are controlled by a bitmask in which each
+ * level can be enabled/disabled by the (1<<level) position
+ * in the "pmap_trace" boot-arg.
+ * Level 1: pmap lifecycle (create/destroy/switch)
+ * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
+ * Level 3: internal state management (tte/attributes/fast-fault)
+ */
+
+SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
 
-#define PMAP_TRACE(...) \
-       if (pmap_trace) { \
+#define PMAP_TRACE(level, ...) \
+       if (__improbable((1 << (level)) & pmap_trace_mask)) { \
                KDBG_RELEASE(__VA_ARGS__); \
        }
 #else
-#define PMAP_TRACE(...) KDBG_DEBUG(__VA_ARGS__)
+
+#define PMAP_TRACE(level, ...)
+
 #endif
 
-#define PMAP_TRACE_CONSTANT(...) KDBG_RELEASE(__VA_ARGS__)
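/*
 * With the trace levels above, PMAP_TRACE emits an event only when the level's
 * bit is set in pmap_trace_mask, so booting with e.g. pmap_trace=0x6 enables
 * levels 1 and 2 but not 3. A standalone sketch of the mask test; printf stands
 * in for KDBG_RELEASE and the helper name is illustrative.
 */
#include <stdio.h>

static unsigned int sketch_trace_mask = 0x6;   /* e.g. boot-arg pmap_trace=0x6 */

static void
sketch_pmap_trace(int level, const char *msg)
{
	if ((1u << level) & sketch_trace_mask)
		printf("trace L%d: %s\n", level, msg);
}

int
main(void)
{
	sketch_pmap_trace(1, "pmap lifecycle");      /* emitted */
	sketch_pmap_trace(2, "mapping lifecycle");   /* emitted */
	sketch_pmap_trace(3, "internal state");      /* suppressed: bit 3 is clear */
	return 0;
}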
 
 /*
  * Internal function prototypes (forward declarations).
@@ -896,11 +934,11 @@ static void pv_list_free(
 static void ptd_bootstrap(
                                pt_desc_t *ptdp, unsigned int ptd_cnt);
 
-static pt_desc_t *ptd_alloc(
-                               pmap_t pmap);
+static inline pt_desc_t *ptd_alloc_unlinked(void);
 
-static void ptd_deallocate(
-                               pt_desc_t *ptdp);
+static pt_desc_t *ptd_alloc(pmap_t pmap);
+
+static void ptd_deallocate(pt_desc_t *ptdp);
 
 static void ptd_init(
                                pt_desc_t *ptdp, pmap_t pmap, vm_map_address_t va, unsigned int ttlevel, pt_entry_t * pte_p);
@@ -917,7 +955,7 @@ ppnum_t                     pmap_vtophys(
 void pmap_switch_user_ttb(
                                pmap_t pmap);
 
-static void    flush_mmu_tlb_region_asid(
+static void    flush_mmu_tlb_region_asid_async(
                                vm_offset_t va, unsigned length, pmap_t pmap);
 
 static kern_return_t pmap_expand(
@@ -971,10 +1009,6 @@ const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
 
 #define        PMAP_TT_DEALLOCATE_NOBLOCK      0x1
 
-void pmap_init_pte_page_internal(
-                               pmap_t, pt_entry_t *, vm_offset_t, unsigned int , pt_desc_t **);
-
-
 #if    (__ARM_VMSA__ > 7)
 
 static inline tt_entry_t *pmap_tt1e(
@@ -989,9 +1023,6 @@ static inline pt_entry_t *pmap_tt3e(
 static void pmap_unmap_sharedpage(
                                pmap_t pmap);
 
-static void pmap_sharedpage_flush_32_to_64(
-                               void);
-
 static boolean_t
                        pmap_is_64bit(pmap_t);
 
@@ -1025,6 +1056,13 @@ static void pmap_pages_free(
                                pmap_paddr_t    pa,
                                unsigned        size);
 
+static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes);
+
+static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
+
+
+static void pmap_trim_self(pmap_t pmap);
+static void pmap_trim_subord(pmap_t subord);
 
 #define PMAP_SUPPORT_PROTOTYPES(__return_type, __function_name, __function_args, __function_index) \
        static __return_type __function_name##_internal __function_args;
@@ -1076,8 +1114,6 @@ PMAP_SUPPORT_PROTOTYPES(
 void,
 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
 
-
-
 PMAP_SUPPORT_PROTOTYPES(
 kern_return_t,
 pmap_enter_options, (pmap_t pmap,
@@ -1101,7 +1137,7 @@ pmap_find_phys, (pmap_t pmap,
 
 #if (__ARM_VMSA__ > 7)
 PMAP_SUPPORT_PROTOTYPES(
-void,
+kern_return_t,
 pmap_insert_sharedpage, (pmap_t pmap), PMAP_INSERT_SHAREDPAGE_INDEX);
 #endif
 
@@ -1149,11 +1185,10 @@ pmap_query_page_info, (pmap_t pmap,
                        int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
 
 PMAP_SUPPORT_PROTOTYPES(
-boolean_t,
+mach_vm_size_t,
 pmap_query_resident, (pmap_t pmap,
                       vm_map_address_t start,
                       vm_map_address_t end,
-                      mach_vm_size_t *resident_bytes_p,
                       mach_vm_size_t *compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
 
 PMAP_SUPPORT_PROTOTYPES(
@@ -1189,7 +1224,6 @@ pmap_set_process, (pmap_t pmap,
                    char *procname), PMAP_SET_PROCESS_INDEX);
 #endif
 
-
 PMAP_SUPPORT_PROTOTYPES(
 void,
 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
@@ -1223,6 +1257,25 @@ PMAP_SUPPORT_PROTOTYPES(
 void,
 pmap_switch_user_ttb, (pmap_t pmap), PMAP_SWITCH_USER_TTB_INDEX);
 
+PMAP_SUPPORT_PROTOTYPES(
+void,
+pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
+
+
+PMAP_SUPPORT_PROTOTYPES(
+void,
+pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
+
+PMAP_SUPPORT_PROTOTYPES(
+void,
+pmap_trim, (pmap_t grand,
+                pmap_t subord,
+                addr64_t vstart,
+                addr64_t nstart,
+                uint64_t size), PMAP_TRIM_INDEX);
+
+
+
 
 
 void pmap_footprint_suspend(vm_map_t   map,
@@ -1233,6 +1286,7 @@ PMAP_SUPPORT_PROTOTYPES(
                                 boolean_t suspend),
        PMAP_FOOTPRINT_SUSPEND_INDEX);
 
+
 #if CONFIG_PGTRACE
 boolean_t pgtrace_enabled = 0;
 
@@ -1294,12 +1348,11 @@ int pt_fake_zone_index = -1;            /* index of pmap fake zone */
 /*
  * Allocates and initializes a per-CPU data structure for the pmap.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_cpu_data_init_internal(unsigned int cpu_number)
 {
-       pmap_cpu_data_t * pmap_cpu_data = NULL;
+       pmap_cpu_data_t * pmap_cpu_data = pmap_get_cpu_data();
 
-       pmap_cpu_data = pmap_get_cpu_data();
        pmap_cpu_data->cpu_number = cpu_number;
 }
 
@@ -1351,7 +1404,7 @@ pmap_pages_reclaim(
         * If no eligible page were found in the pt page list, panic.
         */
 
-       simple_lock(&pmap_pages_lock);
+       pmap_simple_lock(&pmap_pages_lock);
        pmap_pages_request_count++;
        pmap_pages_request_acum++;
 
@@ -1362,30 +1415,28 @@ pmap_pages_reclaim(
 
                        page_entry = pmap_pages_reclaim_list;
                        pmap_pages_reclaim_list = pmap_pages_reclaim_list->next;
-                       simple_unlock(&pmap_pages_lock);
+                       pmap_simple_unlock(&pmap_pages_lock);
 
                        return((pmap_paddr_t)ml_static_vtop((vm_offset_t)page_entry));
                }
 
-               simple_unlock(&pmap_pages_lock);
+               pmap_simple_unlock(&pmap_pages_lock);
 
-               simple_lock(&pt_pages_lock);
+               pmap_simple_lock(&pt_pages_lock);
                ptdp = (pt_desc_t *)queue_first(&pt_page_list);
                found_page = FALSE;
 
                while (!queue_end(&pt_page_list, (queue_entry_t)ptdp)) {
-                       if ((ptdp->pmap != kernel_pmap)
-                           && (ptdp->pmap->nested == FALSE)
-                           && (simple_lock_try(&ptdp->pmap->lock))) {
+                       if ((ptdp->pmap->nested == FALSE)
+                           && (pmap_simple_lock_try(&ptdp->pmap->lock))) {
 
+                               assert(ptdp->pmap != kernel_pmap);
                                unsigned refcnt_acc = 0;
                                unsigned wiredcnt_acc = 0;
 
                                for (i = 0 ; i < PT_INDEX_MAX ; i++) {
-                                       if (ptdp->pt_cnt[i].refcnt & PT_DESC_REFCOUNT) {
-                                               /* Do not attempt to free a page that contains an L2 table
-                                                * or is currently being operated on by pmap_enter(), 
-                                                * which can drop the pmap lock. */
+                                       if (ptdp->pt_cnt[i].refcnt == PT_DESC_REFCOUNT) {
+                                               /* Do not attempt to free a page that contains an L2 table */
                                                refcnt_acc = 0;
                                                break;
                                        }
@@ -1399,7 +1450,7 @@ pmap_pages_reclaim(
                                         * with it while we do that. */
                                        break;
                                }
-                               simple_unlock(&ptdp->pmap->lock);
+                               pmap_simple_unlock(&ptdp->pmap->lock);
                        }
                        ptdp = (pt_desc_t *)queue_next((queue_t)ptdp);
                }
@@ -1414,12 +1465,18 @@ pmap_pages_reclaim(
                        tt_entry_t                      *tte_p;
                        uint32_t                        rmv_spte=0;
 
-                       simple_unlock(&pt_pages_lock);
+                       pmap_simple_unlock(&pt_pages_lock);
                        pmap = ptdp->pmap;
                        PMAP_ASSERT_LOCKED(pmap); // pmap lock should be held from loop above
                        for (i = 0 ; i < PT_INDEX_MAX ; i++) {
                                va = ptdp->pt_map[i].va;
 
+                               /* If the VA is bogus, this may represent an unallocated region
+                                * or one which is in transition (already being freed or expanded).
+                                * Don't try to remove mappings here. */
+                               if (va == (vm_offset_t)-1)
+                                       continue;
+
                                tte_p = pmap_tte(pmap, va);
                                if ((tte_p != (tt_entry_t *) NULL)
                                    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
@@ -1451,28 +1508,29 @@ pmap_pages_reclaim(
                                                panic("pmap_pages_reclaim(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt);
 #if    (__ARM_VMSA__ == 7)
                                        pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL);
-                                       flush_mmu_tlb_entry((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + 2*ARM_TT_L1_SIZE)| (pmap->asid & 0xff));
-                                       flush_mmu_tlb_entry(((va & ~ARM_TT_L1_PT_OFFMASK) + 3*ARM_TT_L1_SIZE)| (pmap->asid & 0xff));
+                                       flush_mmu_tlb_entry_async((va & ~ARM_TT_L1_PT_OFFMASK) | (pmap->asid & 0xff));
+                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->asid & 0xff));
+                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 2*ARM_TT_L1_SIZE)| (pmap->asid & 0xff));
+                                       flush_mmu_tlb_entry_async(((va & ~ARM_TT_L1_PT_OFFMASK) + 3*ARM_TT_L1_SIZE)| (pmap->asid & 0xff));
 #else
                                        pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL);
-                                       flush_mmu_tlb_entry(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+                                       flush_mmu_tlb_entry_async(tlbi_addr(va & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
 #endif
 
                                        if (remove_count > 0) {
 #if    (__ARM_VMSA__ == 7)
-                                               PMAP_UPDATE_TLBS(pmap, va, va+4*ARM_TT_L1_SIZE);
+                                               flush_mmu_tlb_region_asid_async(va, 4*ARM_TT_L1_SIZE, pmap);
 #else
-                                               PMAP_UPDATE_TLBS(pmap, va, va+ARM_TT_L2_SIZE);
+                                               flush_mmu_tlb_region_asid_async(va, ARM_TT_L2_SIZE, pmap);
 #endif
                                        }
                                }
                        }
+                       sync_tlb_flush();
                        // Undo the lock we grabbed when we found ptdp above
                        PMAP_UNLOCK(pmap);
                }
-               simple_lock(&pmap_pages_lock);
+               pmap_simple_lock(&pmap_pages_lock);
        }
 }
 
@@ -1534,7 +1592,7 @@ pmap_pages_free(
        pmap_paddr_t    pa,
        unsigned        size)
 {
-       simple_lock(&pmap_pages_lock);
+       pmap_simple_lock(&pmap_pages_lock);
 
        if (pmap_pages_request_count != 0) {
                page_free_entry_t       *page_entry;
@@ -1543,12 +1601,12 @@ pmap_pages_free(
                page_entry = (page_free_entry_t *)phystokv(pa);
                page_entry->next = pmap_pages_reclaim_list;
                pmap_pages_reclaim_list = page_entry;
-               simple_unlock(&pmap_pages_lock);
+               pmap_simple_unlock(&pmap_pages_lock);
 
                return;
        }
 
-       simple_unlock(&pmap_pages_lock);
+       pmap_simple_unlock(&pmap_pages_lock);
 
        vm_page_t       m;
        pmap_paddr_t    pa_max;
@@ -1610,7 +1668,7 @@ alloc_asid(
 {
        unsigned int    asid_bitmap_index;
 
-       simple_lock(&pmaps_lock);
+       pmap_simple_lock(&pmaps_lock);
        for (asid_bitmap_index = 0; asid_bitmap_index < (MAX_ASID / (sizeof(uint32_t) * NBBY)); asid_bitmap_index++) {
                unsigned int    temp = ffs(asid_bitmap[asid_bitmap_index]);
                if (temp > 0) {
@@ -1630,7 +1688,7 @@ alloc_asid(
                        /* Grab the second ASID. */
                        asid_bitmap[asid_bitmap_index] &= ~(1 << temp2);
 #endif /* __ARM_KERNEL_PROTECT__ */
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
 
                        /*
                         * We should never vend out physical ASID 0 through this
@@ -1646,7 +1704,7 @@ alloc_asid(
                        return (asid_bitmap_index * sizeof(uint32_t) * NBBY + temp);
                }
        }
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
        /*
         * ToDo: Add code to deal with pmap with no asid panic for now. Not
         * an issue with the small config  process hard limit
@@ -1662,7 +1720,7 @@ free_asid(
        /* Don't free up any alias of physical ASID 0. */
        assert((asid % ARM_MAX_ASID) != 0);
 
-       simple_lock(&pmaps_lock);
+       pmap_simple_lock(&pmaps_lock);
        setbit(asid, (int *) asid_bitmap);
 
 #if __ARM_KERNEL_PROTECT__
@@ -1671,17 +1729,20 @@ free_asid(
        setbit(asid | 1, (int *) asid_bitmap);
 #endif /* __ARM_KERNEL_PROTECT__ */
 
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
 }
 
-#define PV_LOW_WATER_MARK_DEFAULT      0x200
-#define PV_KERN_LOW_WATER_MARK_DEFAULT 0x200
-#define PV_ALLOC_CHUNK_INITIAL         0x200
-#define PV_KERN_ALLOC_CHUNK_INITIAL    0x200
+#ifndef PMAP_PV_LOAD_FACTOR
+#define PMAP_PV_LOAD_FACTOR            1
+#endif
+
+#define PV_LOW_WATER_MARK_DEFAULT      (0x200 * PMAP_PV_LOAD_FACTOR)
+#define PV_KERN_LOW_WATER_MARK_DEFAULT (0x200 * PMAP_PV_LOAD_FACTOR)
+#define PV_ALLOC_CHUNK_INITIAL         (0x200 * PMAP_PV_LOAD_FACTOR)
+#define PV_KERN_ALLOC_CHUNK_INITIAL    (0x200 * PMAP_PV_LOAD_FACTOR)
 #define PV_ALLOC_INITIAL_TARGET        (PV_ALLOC_CHUNK_INITIAL * 5)
 #define PV_KERN_ALLOC_INITIAL_TARGET   (PV_KERN_ALLOC_CHUNK_INITIAL)
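/*
 * With PMAP_PV_LOAD_FACTOR at its default of 1, the watermarks and allocation
 * chunks above evaluate to 0x200 = 512 pv_entry_t objects each, and the initial
 * targets to 2560 (general) and 512 (kernel reserve). A sketch that just works
 * those expressions out; the macro names are illustrative stand-ins.
 */
#include <stdio.h>

#define SKETCH_PV_LOAD_FACTOR             1
#define SKETCH_LOW_WATER_MARK             (0x200 * SKETCH_PV_LOAD_FACTOR)
#define SKETCH_ALLOC_CHUNK                (0x200 * SKETCH_PV_LOAD_FACTOR)
#define SKETCH_ALLOC_INITIAL_TARGET       (SKETCH_ALLOC_CHUNK * 5)
#define SKETCH_KERN_ALLOC_INITIAL_TARGET  (SKETCH_ALLOC_CHUNK)

int
main(void)
{
	printf("low water mark      = %d entries\n", SKETCH_LOW_WATER_MARK);             /* 512  */
	printf("initial target      = %d entries\n", SKETCH_ALLOC_INITIAL_TARGET);       /* 2560 */
	printf("kern initial target = %d entries\n", SKETCH_KERN_ALLOC_INITIAL_TARGET);  /* 512  */
	return 0;
}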
 
-
 uint32_t pv_free_count MARK_AS_PMAP_DATA = 0;
 uint32_t pv_page_count MARK_AS_PMAP_DATA = 0;
 uint32_t pv_kern_free_count MARK_AS_PMAP_DATA = 0;
@@ -1730,12 +1791,13 @@ pv_alloc(
        unsigned int pai,
        pv_entry_t **pvepp)
 {
-       PMAP_ASSERT_LOCKED(pmap);
+       if (pmap != NULL)
+               PMAP_ASSERT_LOCKED(pmap);
        ASSERT_PVH_LOCKED(pai);
        PV_ALLOC(pvepp);
        if (PV_ENTRY_NULL == *pvepp) {
 
-               if (kernel_pmap == pmap) {
+               if ((pmap == NULL) || (kernel_pmap == pmap)) {
 
                        PV_KERN_ALLOC(pvepp);
 
@@ -1749,7 +1811,8 @@ pv_alloc(
                                kern_return_t   ret;
 
                                UNLOCK_PVH(pai);
-                               PMAP_UNLOCK(pmap);
+                               if (pmap != NULL)
+                                       PMAP_UNLOCK(pmap);
 
                                ret = pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
 
@@ -1782,7 +1845,8 @@ pv_alloc(
                                        pv_e++;
                                }
                                PV_KERN_FREE_LIST(pv_eh, pv_et, pv_cnt);
-                               PMAP_LOCK(pmap);
+                               if (pmap != NULL)
+                                       PMAP_LOCK(pmap);
                                LOCK_PVH(pai);
                                return FALSE;
                        }
@@ -1852,11 +1916,18 @@ pv_list_free(
        PV_FREE_LIST(pvehp, pvetp, cnt);
 }
 
-
+static inline void
+pv_water_mark_check(void)
+{
+       if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark)) {
+               if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
+                       thread_wakeup(&mapping_replenish_event);
+       }
+}
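/*
 * pv_water_mark_check wakes the replenish thread at most once per refill cycle:
 * hw_compare_and_store(0, 1, &mappingrecurse) lets only the first caller that
 * observes the low watermark issue the wakeup. A userspace sketch of that
 * single-wakeup latch using C11 atomics; the atomic CAS and printf are
 * stand-ins for hw_compare_and_store and thread_wakeup, not the kernel calls.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sketch_mappingrecurse = 0;

static void sketch_wakeup(void) { printf("wake replenish thread\n"); }

static void
sketch_water_mark_check(unsigned free_count, unsigned low_water_mark)
{
	unsigned expected = 0;
	if (free_count < low_water_mark &&
	    atomic_compare_exchange_strong(&sketch_mappingrecurse, &expected, 1)) {
		sketch_wakeup();
	}
}

int
main(void)
{
	sketch_water_mark_check(100, 512);          /* below watermark: wakes the thread */
	sketch_water_mark_check(90, 512);           /* still below: no duplicate wakeup  */
	atomic_store(&sketch_mappingrecurse, 0);    /* replenish thread clears the latch */
	sketch_water_mark_check(80, 512);           /* wakes again */
	return 0;
}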
 
 static inline void     PV_ALLOC(pv_entry_t **pv_ep) {
        assert(*pv_ep == PV_ENTRY_NULL);
-       simple_lock(&pv_free_list_lock);
+       pmap_simple_lock(&pv_free_list_lock);
        /*
         * If the kernel reserved pool is low, let non-kernel mappings allocate
         * synchronously, possibly subject to a throttle.
@@ -1867,25 +1938,20 @@ static inline void      PV_ALLOC(pv_entry_t **pv_ep) {
                pv_free_count--;
        }
 
-       simple_unlock(&pv_free_list_lock);
-
-       if ((pv_free_count < pv_low_water_mark) || (pv_kern_free_count < pv_kern_low_water_mark)) {
-               if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse))
-                       thread_wakeup(&mapping_replenish_event);
-       }
+       pmap_simple_unlock(&pv_free_list_lock);
 }
 
 static inline void     PV_FREE_LIST(pv_entry_t *pv_eh, pv_entry_t *pv_et, int pv_cnt) {
-       simple_lock(&pv_free_list_lock);
+       pmap_simple_lock(&pv_free_list_lock);
        pv_et->pve_next = (pv_entry_t *)pv_free_list;
        pv_free_list = pv_eh;
        pv_free_count += pv_cnt;
-       simple_unlock(&pv_free_list_lock);
+       pmap_simple_unlock(&pv_free_list_lock);
 }
 
 static inline void     PV_KERN_ALLOC(pv_entry_t **pv_e) {
        assert(*pv_e == PV_ENTRY_NULL);
-       simple_lock(&pv_kern_free_list_lock);
+       pmap_simple_lock(&pv_kern_free_list_lock);
 
        if ((*pv_e = pv_kern_free_list) != 0) {
                pv_kern_free_list = (pv_entry_t *)(*pv_e)->pve_next;
@@ -1894,21 +1960,15 @@ static inline void      PV_KERN_ALLOC(pv_entry_t **pv_e) {
                pmap_kern_reserve_alloc_stat++;
        }
 
-       simple_unlock(&pv_kern_free_list_lock);
-
-       if (pv_kern_free_count < pv_kern_low_water_mark) {
-               if (!mappingrecurse && hw_compare_and_store(0,1, &mappingrecurse)) {
-                       thread_wakeup(&mapping_replenish_event);
-               }
-       }
+       pmap_simple_unlock(&pv_kern_free_list_lock);
 }
 
 static inline void     PV_KERN_FREE_LIST(pv_entry_t *pv_eh, pv_entry_t *pv_et, int pv_cnt) {
-       simple_lock(&pv_kern_free_list_lock);
+       pmap_simple_lock(&pv_kern_free_list_lock);
        pv_et->pve_next = pv_kern_free_list;
        pv_kern_free_list = pv_eh;
        pv_kern_free_count += pv_cnt;
-       simple_unlock(&pv_kern_free_list_lock);
+       pmap_simple_unlock(&pv_kern_free_list_lock);
 }
 
 static inline void pmap_pv_throttle(__unused pmap_t p) {
@@ -1929,7 +1989,7 @@ static inline void pmap_pv_throttle(__unused pmap_t p) {
  * Creates a target number of free pv_entry_t objects for the kernel free list
  * and the general free list.
  */
-static kern_return_t
+MARK_AS_PMAP_TEXT static kern_return_t
 mapping_free_prime_internal(void)
 {
        unsigned       j;
@@ -2069,7 +2129,7 @@ void mapping_adjust(void) {
 /*
  * Fills the kernel and general PV free lists back up to their low watermarks.
  */
-static kern_return_t
+MARK_AS_PMAP_TEXT static kern_return_t
 mapping_replenish_internal(void)
 {
        pv_entry_t    *pv_e;
@@ -2190,14 +2250,13 @@ ptd_bootstrap(
 }
 
 static pt_desc_t
-*ptd_alloc(
-       pmap_t pmap)
+*ptd_alloc_unlinked(void)
 {
        pt_desc_t       *ptdp;
        unsigned        i;
 
        if (!ptd_preboot)
-               simple_lock(&ptd_free_list_lock);
+               pmap_simple_lock(&ptd_free_list_lock);
 
        if (ptd_free_count == 0) {
                unsigned int    ptd_cnt;
@@ -2212,7 +2271,7 @@ static pt_desc_t
                        pmap_paddr_t    pa;
                        kern_return_t   ret;
 
-                       simple_unlock(&ptd_free_list_lock);
+                       pmap_simple_unlock(&ptd_free_list_lock);
 
                        if (pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT) != KERN_SUCCESS) {
                                ret =  pmap_pages_alloc(&pa, PAGE_SIZE, PMAP_PAGES_RECLAIM_NOWAIT);
@@ -2220,7 +2279,7 @@ static pt_desc_t
                        }
                        ptdp = (pt_desc_t *)phystokv(pa);
 
-                       simple_lock(&ptd_free_list_lock);
+                       pmap_simple_lock(&ptd_free_list_lock);
                        ptdp_next = ptdp;
                        ptd_cnt = PAGE_SIZE/sizeof(pt_desc_t);
                }
@@ -2242,52 +2301,60 @@ static pt_desc_t
        }
 
        if (!ptd_preboot)
-               simple_unlock(&ptd_free_list_lock);
+               pmap_simple_unlock(&ptd_free_list_lock);
 
        ptdp->pt_page.next = NULL;
        ptdp->pt_page.prev = NULL;
-       ptdp->pmap = pmap;
+       ptdp->pmap = NULL;
 
        for (i = 0 ; i < PT_INDEX_MAX ; i++) {
-               ptdp->pt_map[i].va = 0;
+               ptdp->pt_map[i].va = (vm_offset_t)-1;
                ptdp->pt_cnt[i].refcnt = 0;
                ptdp->pt_cnt[i].wiredcnt = 0;
        }
-       simple_lock(&pt_pages_lock);
-       queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page);
-       simple_unlock(&pt_pages_lock);
-
-       pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
 
        return(ptdp);
 }
 
+static inline pt_desc_t*
+ptd_alloc(pmap_t pmap)
+{
+       pt_desc_t *ptdp = ptd_alloc_unlinked();
+
+       ptdp->pmap = pmap;
+       if (pmap != kernel_pmap) {
+               /* We should never try to reclaim kernel pagetable pages in
+                * pmap_pages_reclaim(), so don't enter them into the list. */
+               pmap_simple_lock(&pt_pages_lock);
+               queue_enter(&pt_page_list, ptdp, pt_desc_t *, pt_page);
+               pmap_simple_unlock(&pt_pages_lock);
+       }
+
+       pmap_tt_ledger_credit(pmap, sizeof(*ptdp));
+       return ptdp;
+}
+
 static void
-ptd_deallocate(
-       pt_desc_t *ptdp)
+ptd_deallocate(pt_desc_t *ptdp)
 {
-       unsigned        i;
        pmap_t          pmap = ptdp->pmap;
 
        if (ptd_preboot) {
                panic("ptd_deallocate(): early boot\n");
        }
-       for (i = 0 ; i < PT_INDEX_MAX ; i++) {
-               if (ptdp->pt_cnt[i].refcnt != 0)
-                       panic("ptd_deallocate(): ptdp=%p refcnt=0x%x \n", ptdp, ptdp->pt_cnt[i].refcnt);
-       }
 
        if (ptdp->pt_page.next != NULL) {
-               simple_lock(&pt_pages_lock);
+               pmap_simple_lock(&pt_pages_lock);
                queue_remove(&pt_page_list, ptdp, pt_desc_t *, pt_page);
-               simple_unlock(&pt_pages_lock);
+               pmap_simple_unlock(&pt_pages_lock);
        }
-       simple_lock(&ptd_free_list_lock);
+       pmap_simple_lock(&ptd_free_list_lock);
        (*(void **)ptdp) = (void *)ptd_free_list;
        ptd_free_list = (pt_desc_t *)ptdp;
        ptd_free_count++;
-       simple_unlock(&ptd_free_list_lock);
-       pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
+       pmap_simple_unlock(&ptd_free_list_lock);
+       if (pmap != NULL)
+               pmap_tt_ledger_debit(pmap, sizeof(*ptdp));
 }
 
 static void
@@ -2305,14 +2372,13 @@ ptd_init(
        assert(level == 2);
        ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~(ARM_TT_L1_PT_OFFMASK);
 #else
-       if (level == 3) {
-               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK ;
-       else if (level == 2)
-               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK ;
+       if (level == 3)
+               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L2_OFFMASK;
+       else if (level == 2)
+               ptdp->pt_map[ARM_PT_DESC_INDEX(pte_p)].va = (vm_offset_t) va & ~ARM_TT_L1_OFFMASK;
 #endif
        if (level < PMAP_TT_MAX_LEVEL)
                ptdp->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt = PT_DESC_REFCOUNT;
-
 }
 
 
@@ -2379,6 +2445,7 @@ static inline tt_entry_t *
 pmap_tt1e(pmap_t pmap,
         vm_map_address_t addr)
 {
+       /* Level 0 currently unused */
 #if __ARM64_TWO_LEVEL_PMAP__
 #pragma unused(pmap, addr)
        panic("pmap_tt1e called on a two level pmap");
@@ -2430,23 +2497,12 @@ pmap_tt3e(
        tt_entry_t     *ttp;
        tt_entry_t      tte;
 
-       /* Level 0 currently unused */
-#if __ARM64_TWO_LEVEL_PMAP__
        ttp = pmap_tt2e(pmap, addr);
-       tte = *ttp;
-#else
-       /* Get first-level (1GB) entry */
-       ttp = pmap_tt1e(pmap, addr);
-       tte = *ttp;
-       #if MACH_ASSERT
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID))
-               panic("Attempt to demote L1 block (?!): pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte);
-       #endif
-       if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) != (ARM_TTE_TYPE_TABLE | ARM_TTE_VALID))
+       if (ttp == PT_ENTRY_NULL)
                return (PT_ENTRY_NULL);
 
-       tte = ((tt_entry_t*) phystokv(tte & ARM_TTE_TABLE_MASK))[tt2_index(pmap, addr)];
-#endif
+       tte = *ttp;
+
 #if MACH_ASSERT
        if ((tte & (ARM_TTE_TYPE_MASK | ARM_TTE_VALID)) == (ARM_TTE_TYPE_BLOCK | ARM_TTE_VALID))
                panic("Attempt to demote L2 block: pmap=%p, va=0x%llx, tte=0x%llx\n", pmap, (uint64_t)addr, (uint64_t)tte);
@@ -2563,7 +2619,7 @@ pmap_map_bd_with_options(
                        panic("pmap_map_bd");
                }
                assert(!ARM_PTE_IS_COMPRESSED(*ptep));
-               WRITE_PTE(ptep, tmplate);
+               WRITE_PTE_STRONG(ptep, tmplate);
 
                pte_increment_pa(tmplate);
                vaddr += PAGE_SIZE;
@@ -2612,7 +2668,7 @@ pmap_map_bd(
                        panic("pmap_map_bd");
                }
                assert(!ARM_PTE_IS_COMPRESSED(*ptep));
-               WRITE_PTE(ptep, tmplate);
+               WRITE_PTE_STRONG(ptep, tmplate);
 
                pte_increment_pa(tmplate);
                vaddr += PAGE_SIZE;
@@ -2693,7 +2749,7 @@ scan:
 #if __ARM_KERNEL_PROTECT__
                pte |= ARM_PTE_NG;
 #endif /* __ARM_KERNEL_PROTECT__ */
-               WRITE_PTE(ptep, pte);
+               WRITE_PTE_STRONG(ptep, pte);
        }
        PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len);
 #if KASAN
@@ -2704,18 +2760,12 @@ scan:
 
 #define PMAP_ALIGN(addr, align) ((addr) + ((align) - 1) & ~((align) - 1))
 
-typedef struct pmap_io_range
-{
-       uint64_t addr;
-       uint32_t len;
-       uint32_t wimg;
-} __attribute__((packed))  pmap_io_range_t;
-
-static unsigned int 
+static vm_size_t
 pmap_compute_io_rgns(void)
 {
        DTEntry entry;
        pmap_io_range_t *ranges;
+       uint64_t rgn_end;
        void *prop = NULL;
         int err;
        unsigned int prop_size;
@@ -2723,23 +2773,22 @@ pmap_compute_io_rgns(void)
         err = DTLookupEntry(NULL, "/defaults", &entry);
         assert(err == kSuccess);
 
-       if (kSuccess != DTGetProperty(entry, "pmap-io-granule", &prop, &prop_size))
-               return 0;
-
-       io_rgn_granule = *((uint32_t*)prop);
-
        if (kSuccess != DTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size))
                return 0;
 
-       if ((io_rgn_granule == 0) || (io_rgn_granule & PAGE_MASK))
-               panic("pmap I/O region granularity is not page-aligned!\n");
-
        ranges = prop;
        for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
+               if (ranges[i].addr & PAGE_MASK)
+                       panic("pmap I/O region %u addr 0x%llx is not page-aligned", i, ranges[i].addr);
+               if (ranges[i].len & PAGE_MASK)
+                       panic("pmap I/O region %u length 0x%x is not page-aligned", i, ranges[i].len);
+               if (os_add_overflow(ranges[i].addr, ranges[i].len, &rgn_end))
+                       panic("pmap I/O region %u addr 0x%llx length 0x%x wraps around", i, ranges[i].addr, ranges[i].len);
                if ((i == 0) || (ranges[i].addr < io_rgn_start))
                        io_rgn_start = ranges[i].addr;
-               if ((i == 0) || ((ranges[i].addr + ranges[i].len) > io_rgn_end))
-                       io_rgn_end = ranges[i].addr + ranges[i].len;
+               if ((i == 0) || (rgn_end > io_rgn_end))
+                       io_rgn_end = rgn_end;
+               ++num_io_rgns;
        }
 
        if (io_rgn_start & PAGE_MASK)
@@ -2748,11 +2797,35 @@ pmap_compute_io_rgns(void)
        if (io_rgn_end & PAGE_MASK)
                panic("pmap I/O region end is not page-aligned!\n");
 
-       if (((io_rgn_start < gPhysBase) && (io_rgn_end >= gPhysBase)) ||
-           ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end)))
+       if (((io_rgn_start <= gPhysBase) && (io_rgn_end > gPhysBase)) ||
+           ((io_rgn_start < avail_end) && (io_rgn_end >= avail_end)) ||
+           ((io_rgn_start > gPhysBase) && (io_rgn_end < avail_end)))
                panic("pmap I/O region overlaps physical memory!\n");
 
-       return (unsigned int)((io_rgn_end - io_rgn_start) / io_rgn_granule);
+       return (num_io_rgns * sizeof(*ranges));
+}
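/*
 * pmap_compute_io_rgns validates each device-tree range: page alignment of addr
 * and len, plus an os_add_overflow check so addr+len cannot wrap before it is
 * compared against the accumulated I/O region bounds. A sketch of those checks
 * using __builtin_add_overflow; the struct, page size and return convention are
 * illustrative stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_PAGE_MASK 0xFFFULL   /* 4KB pages, illustrative */

struct sketch_io_rgn { uint64_t addr; uint32_t len; };

static int
sketch_validate_range(const struct sketch_io_rgn *r, uint64_t *end_out)
{
	if (r->addr & SKETCH_PAGE_MASK)
		return -1;                                        /* addr not page-aligned */
	if (r->len & SKETCH_PAGE_MASK)
		return -1;                                        /* len not page-aligned  */
	if (__builtin_add_overflow(r->addr, (uint64_t)r->len, end_out))
		return -1;                                        /* addr + len wraps      */
	return 0;
}

int
main(void)
{
	struct sketch_io_rgn ok  = { 0x200000000ULL, 0x4000 };
	struct sketch_io_rgn bad = { 0xFFFFFFFFFFFFF000ULL, 0x2000 };
	uint64_t end;
	printf("ok:  %d\n", sketch_validate_range(&ok, &end));    /* 0  */
	printf("bad: %d\n", sketch_validate_range(&bad, &end));   /* -1 */
	return 0;
}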
+
+/*
+ * return < 0 for a < b
+ *          0 for a == b
+ *        > 0 for a > b
+ */
+typedef int (*cmpfunc_t)(const void *a, const void *b);
+
+extern void
+qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
+
+static int
+cmp_io_rgns(const void *a, const void *b)
+{
+       const pmap_io_range_t *range_a = a;
+       const pmap_io_range_t *range_b = b;
+       if ((range_b->addr + range_b->len) <= range_a->addr)
+               return 1;
+       else if ((range_a->addr + range_a->len) <= range_b->addr)
+               return -1;
+       else
+               return 0;
 }
 
 static void
@@ -2764,27 +2837,83 @@ pmap_load_io_rgns(void)
         int err;
        unsigned int prop_size;
 
-       if (io_rgn_granule == 0)
+       if (num_io_rgns == 0)
                return;
 
-        err = DTLookupEntry(NULL, "/defaults", &entry);
-        assert(err == kSuccess);
+       err = DTLookupEntry(NULL, "/defaults", &entry);
+       assert(err == kSuccess);
 
        err = DTGetProperty(entry, "pmap-io-ranges", &prop, &prop_size);
-        assert(err == kSuccess);
+       assert(err == kSuccess);
 
        ranges = prop;
-       for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i) {
-               if ((ranges[i].addr - io_rgn_start) % io_rgn_granule)
-                       panic("pmap I/O region %d is not aligned to I/O granularity!\n", i);
-               if (ranges[i].len % io_rgn_granule)
-                       panic("pmap I/O region %d size is not a multiple of I/O granularity!\n", i);
-               for (uint32_t offs = 0; offs < ranges[i].len; offs += io_rgn_granule) {
-                       io_attr_table[(ranges[i].addr + offs - io_rgn_start) / io_rgn_granule] =
-                           IO_ATTR_WIMG(ranges[i].wimg);
-               }
+       for (unsigned int i = 0; i < (prop_size / sizeof(*ranges)); ++i)
+               io_attr_table[i] = ranges[i];
+
+       qsort(io_attr_table, num_io_rgns, sizeof(*ranges), cmp_io_rgns);
+}
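/*
 * cmp_io_rgns treats any two overlapping ranges as "equal", so after the qsort
 * above io_attr_table is ordered by address, and a later lookup can reuse the
 * same comparator with a one-page "key" range to binary-search for the entry
 * covering a physical address. A sketch of that lookup with libc qsort/bsearch;
 * the struct layout and sample data are illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct sketch_io_range { uint64_t addr; uint32_t len; uint32_t wimg; };

static int
sketch_cmp_io_rgns(const void *a, const void *b)
{
	const struct sketch_io_range *ra = a;
	const struct sketch_io_range *rb = b;
	if ((rb->addr + rb->len) <= ra->addr)
		return 1;
	else if ((ra->addr + ra->len) <= rb->addr)
		return -1;
	return 0;   /* ranges overlap: treated as equal */
}

int
main(void)
{
	struct sketch_io_range table[] = {
		{ 0x30000000, 0x1000, 2 },
		{ 0x10000000, 0x4000, 1 },
		{ 0x20000000, 0x2000, 3 },
	};
	size_t n = sizeof(table) / sizeof(table[0]);
	qsort(table, n, sizeof(table[0]), sketch_cmp_io_rgns);

	/* Look up the range containing a single page at 0x20001000. */
	struct sketch_io_range key = { 0x20001000, 0x1000, 0 };
	struct sketch_io_range *hit =
	    bsearch(&key, table, n, sizeof(table[0]), sketch_cmp_io_rgns);
	if (hit)
		printf("found range 0x%llx len 0x%x wimg %u\n",
		    (unsigned long long)hit->addr, hit->len, hit->wimg);
	return 0;
}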
+
+#if __arm64__
+/*
+ * pmap_get_arm64_prot
+ *
+ * return effective armv8 VMSA block protections including
+ * table AP/PXN/XN overrides of a pmap entry
+ *
+ */
+
+uint64_t
+pmap_get_arm64_prot(
+       pmap_t pmap,
+       vm_offset_t addr)
+{
+       uint64_t tte;
+       uint64_t tt_type, table_ap, table_xn, table_pxn;
+       uint64_t prot = 0;
+
+       tte = *pmap_tt1e(pmap, addr);
+
+       if (!(tte & ARM_TTE_VALID)) {
+               return 0;
+       }
+
+       tt_type = tte & ARM_TTE_TYPE_MASK;
+
+       if (tt_type == ARM_TTE_TYPE_BLOCK) {
+               return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
+       }
+
+       table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3;
+       table_xn = tte & ARM_TTE_TABLE_XN;
+       table_pxn = tte & ARM_TTE_TABLE_PXN;
+
+       prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0);
+
+       tte = *pmap_tt2e(pmap, addr);
+       if (!(tte & ARM_TTE_VALID)) {
+               return 0;
+       }
+
+       tt_type = tte & ARM_TTE_TYPE_MASK;
+
+       if (tt_type == ARM_TTE_TYPE_BLOCK) {
+               return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
+       }
+
+       table_ap = (tte >> ARM_TTE_TABLE_APSHIFT) & 0x3;
+       table_xn = tte & ARM_TTE_TABLE_XN;
+       table_pxn = tte & ARM_TTE_TABLE_PXN;
+
+       prot |= (table_ap << ARM_TTE_BLOCK_APSHIFT) | (table_xn ? ARM_TTE_BLOCK_NX : 0) | (table_pxn ? ARM_TTE_BLOCK_PNX : 0);
+
+       tte = *pmap_tt3e(pmap, addr);
+       if (!(tte & ARM_TTE_VALID)) {
+               return 0;
        }
+
+       return prot | (tte & ARM_TTE_BLOCK_NX) | (tte & ARM_TTE_BLOCK_PNX) | (tte & ARM_TTE_BLOCK_APMASK) | ARM_TTE_VALID;
 }
+#endif /* __arm64__ */
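/*
 * pmap_get_arm64_prot walks L1 -> L2 -> L3 and, at each table level, folds the
 * table descriptor's AP/XN/PXN overrides into the protections reported for the
 * final block/page entry, since table-level bits can only further restrict what
 * the leaf allows. A simplified sketch of that accumulation with boolean flags;
 * the two-level walk and field names are illustrative, not the VMSA bit layout.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_level {
	bool table_xn;    /* table-level XN override  (hierarchical) */
	bool table_pxn;   /* table-level PXN override (hierarchical) */
};

struct sketch_leaf {
	bool nx;          /* leaf no-execute (unprivileged) */
	bool pnx;         /* leaf no-execute (privileged)   */
};

/* The effective execute-never state is the OR of every level's restriction. */
static void
sketch_effective_prot(const struct sketch_level *levels, int nlevels,
    const struct sketch_leaf *leaf, bool *nx_out, bool *pnx_out)
{
	bool nx = leaf->nx, pnx = leaf->pnx;
	for (int i = 0; i < nlevels; i++) {
		nx  |= levels[i].table_xn;
		pnx |= levels[i].table_pxn;
	}
	*nx_out = nx;
	*pnx_out = pnx;
}

int
main(void)
{
	struct sketch_level walk[2] = { { false, true }, { false, false } };
	struct sketch_leaf  leaf    = { false, false };
	bool nx, pnx;
	sketch_effective_prot(walk, 2, &leaf, &nx, &pnx);
	/* The leaf allows privileged execute, but the upper table's PXN override wins. */
	printf("effective NX=%d PNX=%d\n", nx, pnx);
	return 0;
}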
 
 
 /*
@@ -2815,18 +2944,16 @@ pmap_bootstrap(
 {
        pmap_paddr_t    pmap_struct_start;
        vm_size_t       pv_head_size;
-       vm_size_t       pv_lock_table_size;
        vm_size_t       ptd_root_table_size;
        vm_size_t       pp_attr_table_size;
        vm_size_t       io_attr_table_size;
-       unsigned int    niorgns;
        unsigned int    npages;
        unsigned int    i;
        vm_map_offset_t maxoffset;
 
 
-#ifdef PMAP_TRACES
-       if (PE_parse_boot_argn("-pmap_trace", &pmap_trace, sizeof (pmap_trace))) {
+#if DEVELOPMENT || DEBUG
+       if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof (pmap_trace_mask))) {
                kprintf("Kernel traces for pmap operations enabled\n");
        }
 #endif
@@ -2843,7 +2970,6 @@ pmap_bootstrap(
        kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
 #endif
        kernel_pmap->max = VM_MAX_KERNEL_ADDRESS;
-       kernel_pmap->wired = 0;
        kernel_pmap->ref_count = 1;
        kernel_pmap->gc_status = 0;
        kernel_pmap->nx_enabled = TRUE;
@@ -2866,20 +2992,18 @@ pmap_bootstrap(
        kernel_pmap->tte_index_max = (ARM_PGBYTES / sizeof(tt_entry_t));
 #endif
        kernel_pmap->prev_tte = (tt_entry_t *) NULL;
-       kernel_pmap->cpu_ref = 0;
 
        PMAP_LOCK_INIT(kernel_pmap);
 #if    (__ARM_VMSA__ == 7)
        simple_lock_init(&kernel_pmap->tt1_lock, 0);
+       kernel_pmap->cpu_ref = 0;
 #endif
        memset((void *) &kernel_pmap->stats, 0, sizeof(kernel_pmap->stats));
 
        /* allocate space for and initialize the bookkeeping structures */
-       niorgns = pmap_compute_io_rgns();
+       io_attr_table_size = pmap_compute_io_rgns();
        npages = (unsigned int)atop(mem_size);
        pp_attr_table_size = npages * sizeof(pp_attr_t);
-       io_attr_table_size = niorgns * sizeof(io_attr_t);
-       pv_lock_table_size = npages;
        pv_head_size = round_page(sizeof(pv_entry_t *) * npages);
 #if    (__ARM_VMSA__ == 7)
        ptd_root_table_size = sizeof(pt_desc_t) * (1<<((mem_size>>30)+12));
@@ -2891,8 +3015,8 @@ pmap_bootstrap(
 
        pp_attr_table = (pp_attr_t *) phystokv(avail_start);
        avail_start = PMAP_ALIGN(avail_start + pp_attr_table_size, __alignof(pp_attr_t));
-       io_attr_table = (io_attr_t *) phystokv(avail_start);
-       avail_start = PMAP_ALIGN(avail_start + io_attr_table_size + pv_lock_table_size, __alignof(pv_entry_t*));
+       io_attr_table = (pmap_io_range_t *) phystokv(avail_start);
+       avail_start = PMAP_ALIGN(avail_start + io_attr_table_size, __alignof(pv_entry_t*));
        pv_head_table = (pv_entry_t **) phystokv(avail_start);
        avail_start = PMAP_ALIGN(avail_start + pv_head_size, __alignof(pt_desc_t));
        ptd_root_table = (pt_desc_t *)phystokv(avail_start);
@@ -2911,9 +3035,6 @@ pmap_bootstrap(
        simple_lock_init(&pmaps_lock, 0);
        queue_init(&map_pmap_list);
        queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
-       queue_init(&tt_pmap_list);
-       tt_pmap_count = 0;
-       tt_pmap_max = 0;
        free_page_size_tt_list = TT_FREE_ENTRY_NULL;
        free_page_size_tt_count = 0;
        free_page_size_tt_max = 0;
@@ -2957,6 +3078,7 @@ pmap_bootstrap(
        kernel_pmap->asid = 0;
        kernel_pmap->vasid = 0;
 
+
        if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof (maxoffset))) {
                maxoffset = trunc_page(maxoffset);
                if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
@@ -2983,10 +3105,14 @@ pmap_bootstrap(
 
        simple_lock_init(&phys_backup_lock, 0);
 
+
 #if MACH_ASSERT
        PE_parse_boot_argn("pmap_stats_assert",
                           &pmap_stats_assert,
                           sizeof (pmap_stats_assert));
+       PE_parse_boot_argn("vm_footprint_suspend_allowed",
+                          &vm_footprint_suspend_allowed,
+                          sizeof (vm_footprint_suspend_allowed));
 #endif /* MACH_ASSERT */
 
 #if KASAN
@@ -3154,7 +3280,7 @@ pmap_init(
        assert(hard_maxproc < MAX_ASID);
 
 #if CONFIG_PGTRACE
-    pmap_pgtrace_init();
+       pmap_pgtrace_init();
 #endif
 }
 
@@ -3164,7 +3290,6 @@ pmap_verify_free(
 {
        pv_entry_t              **pv_h;
        int             pai;
-       boolean_t       result = TRUE;
        pmap_paddr_t    phys = ptoa(ppnum);
 
        assert(phys != vm_page_fictitious_addr);
@@ -3175,9 +3300,7 @@ pmap_verify_free(
        pai = (int)pa_index(phys);
        pv_h = pai_to_pvh(pai);
 
-       result = (pvh_list(pv_h) == PV_ENTRY_NULL);
-
-       return (result);
+       return (pvh_test_type(pv_h, PVH_TYPE_NULL));
 }
 
 
@@ -3197,6 +3320,33 @@ pmap_zone_init(
 }
 
 
+void
+pmap_ledger_alloc_init(size_t size)
+{
+       panic("%s: unsupported, "
+             "size=%lu",
+             __func__, size);
+}
+
+ledger_t
+pmap_ledger_alloc(void)
+{
+       ledger_t retval = NULL;
+
+       panic("%s: unsupported",
+             __func__);
+
+       return retval;
+}
+
+void
+pmap_ledger_free(ledger_t ledger)
+{
+       panic("%s: unsupported, "
+             "ledger=%p",
+             __func__, ledger);
+}
+
 /*
  *     Create and return a physical map.
  *
@@ -3209,7 +3359,7 @@ pmap_zone_init(
  *     the map will be used in software only, and
  *     is bounded by that size.
  */
-static pmap_t
+MARK_AS_PMAP_TEXT static pmap_t
 pmap_create_internal(
        ledger_t ledger,
        vm_map_size_t size,
@@ -3241,7 +3391,9 @@ pmap_create_internal(
                p->max = VM_MAX_ADDRESS;
        }
 
-       p->wired = 0;
+       p->nested_region_true_start = 0;
+       p->nested_region_true_end = ~0;
+
        p->ref_count = 1;
        p->gc_status = 0;
        p->stamp = hw_atomic_add(&pmap_stamp, 1);
@@ -3251,12 +3403,13 @@ pmap_create_internal(
        p->nested_pmap = PMAP_NULL;
 
 
-       ledger_reference(ledger);
+
        p->ledger = ledger;
 
        PMAP_LOCK_INIT(p);
 #if    (__ARM_VMSA__ == 7)
        simple_lock_init(&p->tt1_lock, 0);
+       p->cpu_ref = 0;
 #endif
        memset((void *) &p->stats, 0, sizeof(p->stats));
 
@@ -3264,6 +3417,7 @@ pmap_create_internal(
 
        p->tte = pmap_tt1_allocate(p, PMAP_ROOT_ALLOC_SIZE, 0);
        p->ttep = ml_static_vtop((vm_offset_t)p->tte);
+       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
 
 #if (__ARM_VMSA__ == 7)
        p->tte_index_max = NTTES;
@@ -3271,17 +3425,13 @@ pmap_create_internal(
        p->tte_index_max = (PMAP_ROOT_ALLOC_SIZE / sizeof(tt_entry_t));
 #endif
        p->prev_tte = (tt_entry_t *) NULL;
-       p->cpu_ref = 0;
 
        /* nullify the translation table */
        for (i = 0; i < p->tte_index_max; i++)
                p->tte[i] = ARM_TTE_TYPE_FAULT;
 
-#ifndef  __ARM_L1_PTW__
-       CleanPoU_DcacheRegion((vm_offset_t) (p->tte), PMAP_ROOT_ALLOC_SIZE);
-#else
-       __asm__ volatile("dsb ish");
-#endif
+       FLUSH_PTE_RANGE(p->tte, p->tte + p->tte_index_max);
+
        /* assign a asid */
        p->vasid = alloc_asid();
        p->asid = p->vasid % ARM_MAX_ASID;
@@ -3295,19 +3445,23 @@ pmap_create_internal(
        p->nested_region_asid_bitmap = NULL;
        p->nested_region_asid_bitmap_size = 0x0UL;
 
+       p->nested_has_no_bounds_ref = false;
+       p->nested_no_bounds_refcnt = 0;
+       p->nested_bounds_set = false;
+
+
 #if MACH_ASSERT
        p->pmap_stats_assert = TRUE;
        p->pmap_pid = 0;
        strlcpy(p->pmap_procname, "<nil>", sizeof (p->pmap_procname));
 #endif /* MACH_ASSERT */
 #if DEVELOPMENT || DEBUG
-       p->footprint_suspended = FALSE;
        p->footprint_was_suspended = FALSE;
 #endif /* DEVELOPMENT || DEBUG */
 
-       simple_lock(&pmaps_lock);
+       pmap_simple_lock(&pmaps_lock);
        queue_enter(&map_pmap_list, p, pmap_t, pmaps);
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
 
        return (p);
 }
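The vasid/asid split above means several pmaps can share one hardware ASID (p->asid = p->vasid % ARM_MAX_ASID); pmap_switch_internal() later compares per-CPU "ASID high bits" to decide whether stale TLB entries for that hardware ASID need a local flush. An illustrative restatement, assuming the high bits are simply the reuse generation:

static void
example_asid_assignment(void)
{
	uint32_t vasid = alloc_asid();               /* globally unique "virtual" ASID        */
	uint32_t hw_asid = vasid % ARM_MAX_ASID;     /* value actually used to tag TLB entries */
	uint32_t generation = vasid / ARM_MAX_ASID;  /* assumed reuse generation of hw_asid    */

	/*
	 * If this CPU last ran a pmap from a different generation of hw_asid,
	 * translations tagged with hw_asid may still be cached, so the switch
	 * path flushes the local TLB entries for that ASID before activating
	 * the new pmap.
	 */
	(void)hw_asid;
	(void)generation;
}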
@@ -3320,18 +3474,23 @@ pmap_create(
 {
        pmap_t pmap;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, is_64bit);
+
+       ledger_reference(ledger);
 
        pmap = pmap_create_internal(ledger, size, is_64bit);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END,
-                  VM_KERNEL_ADDRHIDE(pmap));
+       if (pmap == PMAP_NULL) {
+               ledger_dereference(ledger);
+       }
+
+       PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
 
        return pmap;
 }
 
 #if MACH_ASSERT
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_set_process_internal(
        __unused pmap_t pmap,
        __unused int pid,
@@ -3342,15 +3501,21 @@ pmap_set_process_internal(
                return;
        }
 
+       VALIDATE_PMAP(pmap);
+
        pmap->pmap_pid = pid;
        strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname));
-       if (!strncmp(procname, "corecaptured", sizeof (pmap->pmap_procname))) {
+       if (pmap_ledgers_panic_leeway) {
                /*
                 * XXX FBDP
-                * "corecaptured" somehow triggers some issues that make
-                * the pmap stats and ledgers to go off track, causing
+                * Some processes somehow trigger some issues that make
+                * the pmap stats and ledgers go off track, causing
                 * some assertion failures and ledger panics.
-                * Turn that off if the terminating process is "corecaptured".
+                * Turn off the sanity checks if we allow some ledger leeway
+                * because of that.  We'll still do a final check in
+                * pmap_check_ledgers() for discrepancies larger than the
+                * allowed leeway after the address space has been fully
+                * cleaned up.
                 */
                pmap->pmap_stats_assert = FALSE;
                ledger_disable_panic_on_negative(pmap->ledger,
@@ -3470,6 +3635,34 @@ struct {
        int             purgeable_nonvolatile_compressed_under;
        ledger_amount_t purgeable_nonvolatile_compressed_under_total;
        ledger_amount_t purgeable_nonvolatile_compressed_under_max;
+
+       int             network_volatile_over;
+       ledger_amount_t network_volatile_over_total;
+       ledger_amount_t network_volatile_over_max;
+       int             network_volatile_under;
+       ledger_amount_t network_volatile_under_total;
+       ledger_amount_t network_volatile_under_max;
+
+       int             network_nonvolatile_over;
+       ledger_amount_t network_nonvolatile_over_total;
+       ledger_amount_t network_nonvolatile_over_max;
+       int             network_nonvolatile_under;
+       ledger_amount_t network_nonvolatile_under_total;
+       ledger_amount_t network_nonvolatile_under_max;
+
+       int             network_volatile_compressed_over;
+       ledger_amount_t network_volatile_compressed_over_total;
+       ledger_amount_t network_volatile_compressed_over_max;
+       int             network_volatile_compressed_under;
+       ledger_amount_t network_volatile_compressed_under_total;
+       ledger_amount_t network_volatile_compressed_under_max;
+
+       int             network_nonvolatile_compressed_over;
+       ledger_amount_t network_nonvolatile_compressed_over_total;
+       ledger_amount_t network_nonvolatile_compressed_over_max;
+       int             network_nonvolatile_compressed_under;
+       ledger_amount_t network_nonvolatile_compressed_under_total;
+       ledger_amount_t network_nonvolatile_compressed_under_max;
 } pmap_ledgers_drift;
 #endif /* MACH_ASSERT */
 
@@ -3478,76 +3671,43 @@ struct {
  *     Should only be called if the map contains
  *     no valid mappings.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_destroy_internal(
        pmap_t pmap)
 {
-#if (__ARM_VMSA__ == 7)
-       pt_entry_t     *ttep;
-       unsigned int    i;
-       pmap_t          tmp_pmap, tt_pmap;
-       queue_head_t    tmp_pmap_list;
-
-       queue_init(&tmp_pmap_list);
-       simple_lock(&pmaps_lock);
-       tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&tt_pmap_list));
-       while (!queue_end(&tt_pmap_list, (queue_entry_t)tt_pmap)) {
-               if (tt_pmap->cpu_ref == 0 ) {
-                       tmp_pmap = tt_pmap;
-                       tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tmp_pmap->pmaps));
-                       queue_remove(&tt_pmap_list, tmp_pmap, pmap_t, pmaps);
-                       tt_pmap_count--;
-                       queue_enter(&tmp_pmap_list, tmp_pmap, pmap_t, pmaps);
-               } else {
-                       tmp_pmap = tt_pmap;
-                       tt_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tmp_pmap->pmaps));
-               }
-       }
-       simple_unlock(&pmaps_lock);
-
-       tmp_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&tmp_pmap_list));
-       while (!queue_end(&tmp_pmap_list, (queue_entry_t)tmp_pmap)) {
-                       tt_pmap = tmp_pmap;
-                       tmp_pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&tt_pmap->pmaps));
-                       queue_remove(&tmp_pmap_list, tt_pmap, pmap_t, pmaps);
-                       if (tt_pmap->tte) {
-                               pmap_tt1_deallocate(pmap, tt_pmap->tte, tt_pmap->tte_index_max*sizeof(tt_entry_t), 0);
-                               tt_pmap->tte = (tt_entry_t *) NULL;
-                               tt_pmap->ttep = 0;
-                               tt_pmap->tte_index_max = 0;
-                       }
-                       if (tt_pmap->prev_tte) {
-                               pmap_tt1_deallocate(pmap, tt_pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0);
-                               tt_pmap->prev_tte = (tt_entry_t *) NULL;
-                       }
-                       assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
-                       free_asid(tt_pmap->vasid);
-
-                       pmap_check_ledgers(tt_pmap);
-                       ledger_dereference(tt_pmap->ledger);
-
-                       zfree(pmap_zone, tt_pmap);
-       }
-
        if (pmap == PMAP_NULL)
                return;
 
-       if (hw_atomic_sub(&pmap->ref_count, 1) != 0)
+       VALIDATE_PMAP(pmap);
+
+       int32_t ref_count = __c11_atomic_fetch_sub(&pmap->ref_count, 1, memory_order_relaxed) - 1;
+       if (ref_count > 0)
                return;
+       else if (ref_count < 0)
+               panic("pmap %p: refcount underflow", pmap);
+       else if (pmap == kernel_pmap)
+               panic("pmap %p: attempt to destroy kernel pmap", pmap);
 
-       simple_lock(&pmaps_lock);
+#if (__ARM_VMSA__ == 7)
+       pt_entry_t     *ttep;
+       unsigned int    i;
 
+       pmap_simple_lock(&pmaps_lock);
        while (pmap->gc_status & PMAP_GC_INFLIGHT) {
                pmap->gc_status |= PMAP_GC_WAIT;
                 assert_wait((event_t) & pmap->gc_status, THREAD_UNINT);
-               simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&pmaps_lock);
                 (void) thread_block(THREAD_CONTINUE_NULL);
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
 
        }
-
        queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
+
+       if (pmap->cpu_ref != 0)
+               panic("pmap_destroy(%p): cpu_ref = %u", pmap, pmap->cpu_ref);
+
+       pmap_trim_self(pmap);
 
        /*
         *      Free the memory maps, then the
@@ -3558,133 +3718,123 @@ pmap_destroy_internal(
                ttep = &pmap->tte[i];
                if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
                        pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL);
-                       flush_mmu_tlb_entry((i<<ARM_TT_L1_SHIFT) | (pmap->asid & 0xff));
                }
        }
        PMAP_UNLOCK(pmap);
 
-       if (pmap->cpu_ref == 0) {
-               if (pmap->tte) {
-                       pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max*sizeof(tt_entry_t), 0);
-                       pmap->tte = (tt_entry_t *) NULL;
-                       pmap->ttep = 0;
-                       pmap->tte_index_max = 0;
-               }
-               if (pmap->prev_tte) {
-                       pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0);
-                       pmap->prev_tte = (tt_entry_t *) NULL;
-               }
-               assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
+       if (pmap->tte) {
+               pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max*sizeof(tt_entry_t), 0);
+               pmap->tte = (tt_entry_t *) NULL;
+               pmap->ttep = 0;
+               pmap->tte_index_max = 0;
+       }
+       if (pmap->prev_tte) {
+               pmap_tt1_deallocate(pmap, pmap->prev_tte, PMAP_ROOT_ALLOC_SIZE, 0);
+               pmap->prev_tte = (tt_entry_t *) NULL;
+       }
+       assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
 
-               /* return its asid to the pool */
-               free_asid(pmap->vasid);
-               pmap_check_ledgers(pmap);
+       flush_mmu_tlb_asid(pmap->asid);
+       /* return its asid to the pool */
+       free_asid(pmap->vasid);
+       pmap_check_ledgers(pmap);
 
-               ledger_dereference(pmap->ledger);
-               if (pmap->nested_region_asid_bitmap)
-                       kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int));
-               zfree(pmap_zone, pmap);
-       } else {
-               simple_lock(&pmaps_lock);
-               queue_enter(&tt_pmap_list, pmap, pmap_t, pmaps);
-               tt_pmap_count++;
-               if (tt_pmap_count > tt_pmap_max)
-                       tt_pmap_max = tt_pmap_count;
-               simple_unlock(&pmaps_lock);
-       }
-#else
+
+       if (pmap->nested_region_asid_bitmap)
+               kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int));
+       zfree(pmap_zone, pmap);
+#else /* __ARM_VMSA__ == 7 */
        pt_entry_t     *ttep;
        pmap_paddr_t    pa;
        vm_map_address_t c;
 
-       if (pmap == PMAP_NULL) {
-               return;
-       }
-
        pmap_unmap_sharedpage(pmap);
 
-       if (hw_atomic_sub(&pmap->ref_count, 1) == 0) {
+       pmap_simple_lock(&pmaps_lock);
+       while (pmap->gc_status & PMAP_GC_INFLIGHT) {
+               pmap->gc_status |= PMAP_GC_WAIT;
+               assert_wait((event_t) & pmap->gc_status, THREAD_UNINT);
+               pmap_simple_unlock(&pmaps_lock);
+               (void) thread_block(THREAD_CONTINUE_NULL);
+               pmap_simple_lock(&pmaps_lock);
+       }
+       queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
+       pmap_simple_unlock(&pmaps_lock);
 
-               simple_lock(&pmaps_lock);
-               while (pmap->gc_status & PMAP_GC_INFLIGHT) {
-                       pmap->gc_status |= PMAP_GC_WAIT;
-                       assert_wait((event_t) & pmap->gc_status, THREAD_UNINT);
-                       simple_unlock(&pmaps_lock);
-                       (void) thread_block(THREAD_CONTINUE_NULL);
-                       simple_lock(&pmaps_lock);
-               }
-               queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
-               simple_unlock(&pmaps_lock);
+       pmap_trim_self(pmap);
 
-               /*
-                *      Free the memory maps, then the
-                *      pmap structure.
-                */
-               for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) {
-                       ttep = pmap_tt2e(pmap, c);
-                       if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-                               PMAP_LOCK(pmap);
-                               pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL);
-                               PMAP_UNLOCK(pmap);
-                               flush_mmu_tlb_entry(tlbi_addr(c) | tlbi_asid(pmap->asid));
-                       }
+       /*
+        *      Free the memory maps, then the
+        *      pmap structure.
+        */
+       for (c = pmap->min; c < pmap->max; c += ARM_TT_L2_SIZE) {
+               ttep = pmap_tt2e(pmap, c);
+               if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
+                       PMAP_LOCK(pmap);
+                       pmap_tte_deallocate(pmap, ttep, PMAP_TT_L2_LEVEL);
+                       PMAP_UNLOCK(pmap);
                }
+       }
 #if !__ARM64_TWO_LEVEL_PMAP__
-               for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) {
-                       ttep = pmap_tt1e(pmap, c);
-                       if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
-                               PMAP_LOCK(pmap);
-                               pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL);
-                               PMAP_UNLOCK(pmap);
-                       }
+       for (c = pmap->min; c < pmap->max; c += ARM_TT_L1_SIZE) {
+               ttep = pmap_tt1e(pmap, c);
+               if ((ttep != PT_ENTRY_NULL) && (*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
+                       PMAP_LOCK(pmap);
+                       pmap_tte_deallocate(pmap, ttep, PMAP_TT_L1_LEVEL);
+                       PMAP_UNLOCK(pmap);
                }
+       }
 #endif
 
-               if (pmap->tte) {
-                       pa = pmap->ttep;
-                       pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0);
-               }
 
+       if (pmap->tte) {
+               pa = pmap->ttep;
+               pmap_tt1_deallocate(pmap, (tt_entry_t *)phystokv(pa), PMAP_ROOT_ALLOC_SIZE, 0);
+       }
 
-               assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
-               flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT);
-               free_asid(pmap->vasid);
+       assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
+       flush_mmu_tlb_asid((uint64_t)(pmap->asid) << TLBI_ASID_SHIFT);
+       free_asid(pmap->vasid);
 
-               if (pmap->nested_region_asid_bitmap) {
-                       kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int));
-               }
+       if (pmap->nested_region_asid_bitmap) {
+               kfree(pmap->nested_region_asid_bitmap, pmap->nested_region_asid_bitmap_size*sizeof(unsigned int));
+       }
 
-               pmap_check_ledgers(pmap);
-               ledger_dereference(pmap->ledger);
+       pmap_check_ledgers(pmap);
 
-               zfree(pmap_zone, pmap);
-       }
+       zfree(pmap_zone, pmap);
 
-#endif
+#endif /* __ARM_VMSA__ == 7 */
 }
 
 void
 pmap_destroy(
        pmap_t pmap)
 {
-       PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START,
-                  VM_KERNEL_ADDRHIDE(pmap));
+       ledger_t ledger;
+
+       PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
+
+       ledger = pmap->ledger;
 
        pmap_destroy_internal(pmap);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
+       ledger_dereference(ledger);
+
+       PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
 }
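Ledger reference counting has moved out of the *_internal bodies into the outer wrappers: pmap_create() takes the reference before calling pmap_create_internal() and drops it if creation fails, while pmap_destroy() saves the ledger pointer and dereferences it only after pmap_destroy_internal() returns. A hedged sketch of the resulting lifetime as seen by a caller that already holds a ledger reference:

/* Illustrative lifetime only; error handling beyond the NULL check is omitted. */
static void
example_pmap_lifetime(ledger_t ledger)
{
	pmap_t p = pmap_create(ledger, 0, FALSE);  /* pmap takes its own ledger reference */

	if (p == PMAP_NULL)
		return;                            /* pmap_create already dropped that reference */

	/* ... enter and remove mappings ... */

	pmap_destroy(p);                           /* drops the pmap's ledger reference */
}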
 
 
 /*
  *     Add a reference to the specified pmap.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_reference_internal(
        pmap_t pmap)
 {
        if (pmap != PMAP_NULL) {
-               (void) hw_atomic_add(&pmap->ref_count, 1);
+               VALIDATE_PMAP(pmap);
+               __c11_atomic_fetch_add(&pmap->ref_count, 1, memory_order_relaxed);
        }
 }
 
@@ -3708,12 +3858,12 @@ pmap_tt1_allocate(
        vm_address_t    va_end;
        kern_return_t   ret;
 
-       simple_lock(&pmaps_lock);
+       pmap_simple_lock(&pmaps_lock);
        if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
                        free_page_size_tt_count--;
                        tt1 = (tt_entry_t *)free_page_size_tt_list;
                        free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
                        pmap_tt_ledger_credit(pmap, size);
                        return (tt_entry_t *)tt1;
        };
@@ -3721,7 +3871,7 @@ pmap_tt1_allocate(
                        free_two_page_size_tt_count--;
                        tt1 = (tt_entry_t *)free_two_page_size_tt_list;
                        free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
                        pmap_tt_ledger_credit(pmap, size);
                        return (tt_entry_t *)tt1;
        };
@@ -3729,12 +3879,12 @@ pmap_tt1_allocate(
                        free_tt_count--;
                        tt1 = (tt_entry_t *)free_tt_list;
                        free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
                        pmap_tt_ledger_credit(pmap, size);
                        return (tt_entry_t *)tt1;
        }
 
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
 
        ret = pmap_pages_alloc(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
 
@@ -3743,7 +3893,7 @@ pmap_tt1_allocate(
 
 
        if (size < PAGE_SIZE) {
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
 
                for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + size; va < va_end; va = va+size) {
                        tt1_free = (tt_free_entry_t *)va;
@@ -3754,7 +3904,7 @@ pmap_tt1_allocate(
                if (free_tt_count > free_tt_max)
                        free_tt_max = free_tt_count;
 
-               simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&pmaps_lock);
        }
 
        /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
@@ -3777,7 +3927,7 @@ pmap_tt1_deallocate(
 
        tt_entry = (tt_free_entry_t *)tt;
        if (not_in_kdp)
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
 
        if (size <  PAGE_SIZE) {
                free_tt_count++;
@@ -3805,7 +3955,7 @@ pmap_tt1_deallocate(
 
        if ((option & PMAP_TT_DEALLOCATE_NOBLOCK) || (!not_in_kdp)) {
                if (not_in_kdp)
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
                pmap_tt_ledger_debit(pmap, size);
                return;
        }
@@ -3816,13 +3966,13 @@ pmap_tt1_deallocate(
                tt = (tt_entry_t *)free_page_size_tt_list;
                free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
 
-               simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&pmaps_lock);
 
                pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
 
                OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
 
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
        }
 
        while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
@@ -3830,15 +3980,15 @@ pmap_tt1_deallocate(
                tt = (tt_entry_t *)free_two_page_size_tt_list;
                free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
 
-               simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&pmaps_lock);
 
                pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2*PAGE_SIZE);
 
                OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
 
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
        }
-       simple_unlock(&pmaps_lock);
+       pmap_simple_unlock(&pmaps_lock);
        pmap_tt_ledger_debit(pmap, size);
 }
 
@@ -3888,7 +4038,7 @@ pmap_tt_allocate(
                PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
 
                ptdp = ptd_alloc(pmap);
-               *(pt_desc_t **)pai_to_pvh(pa_index(pa)) = ptdp;
+               pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
 
                __unreachable_ok_push
                if (TEST_PAGE_RATIO_4) {
@@ -3928,17 +4078,16 @@ pmap_tt_deallocate(
 
        ptdp = ptep_get_ptd((vm_offset_t)ttp);
 
-       if (level < PMAP_TT_MAX_LEVEL) {
-
-               if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT)
-                       ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
-       }
+       ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = (vm_offset_t)-1;
 
-       ptdp->pt_map[ARM_PT_DESC_INDEX(ttp)].va = 0;
+       if ((level < PMAP_TT_MAX_LEVEL) && (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt == PT_DESC_REFCOUNT))
+               ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
 
        if (ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt != 0)
                panic("pmap_tt_deallocate(): ptdp %p, count %d\n", ptdp, ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt);
 
+       ptdp->pt_cnt[ARM_PT_DESC_INDEX(ttp)].refcnt = 0;
+
        for (i = 0, pt_acc_cnt = 0 ; i < max_pt_index ; i++)
                pt_acc_cnt += ptdp->pt_cnt[i].refcnt;
 
@@ -3999,28 +4148,17 @@ pmap_tt_deallocate(
 }
 
 static void
-pmap_tte_deallocate(
+pmap_tte_remove(
        pmap_t pmap,
        tt_entry_t *ttep,
        unsigned int level)
 {
-       pmap_paddr_t pa;
-       tt_entry_t tte;
-
-       PMAP_ASSERT_LOCKED(pmap);
-
-       tte = *ttep;
+       tt_entry_t tte = *ttep;
 
        if (tte == 0) {
                panic("pmap_tte_deallocate(): null tt_entry ttep==%p\n", ttep);
        }
 
-#if     MACH_ASSERT
-       if (tte_get_ptd(tte)->pmap != pmap) {
-               panic("pmap_tte_deallocate(): ptd=%p ptd->pmap=%p pmap=%p \n",
-                     tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
-       }
-#endif
        if (((level+1) == PMAP_TT_MAX_LEVEL) && (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt != 0)) {
                panic("pmap_tte_deallocate(): pmap=%p ttep=%p ptd=%p refcnt=0x%x \n", pmap, ttep,
                       tte_get_ptd(tte), (tte_get_ptd(tte)->pt_cnt[ARM_PT_DESC_INDEX(ttetokv(*ttep))].refcnt));
@@ -4033,16 +4171,36 @@ pmap_tte_deallocate(
 
                for (i = 0; i<4; i++, ttep_4M++)
                        *ttep_4M = (tt_entry_t) 0;
+               FLUSH_PTE_RANGE_STRONG(ttep_4M - 4, ttep_4M);
        }
 #else
        *ttep = (tt_entry_t) 0;
+       FLUSH_PTE_STRONG(ttep);
 #endif
+}
 
-#ifndef  __ARM_L1_PTW__
-       CleanPoU_DcacheRegion((vm_offset_t) ttep, sizeof(tt_entry_t));
-#else
-       __asm__ volatile("dsb ish");
+static void
+pmap_tte_deallocate(
+       pmap_t pmap,
+       tt_entry_t *ttep,
+       unsigned int level)
+{
+       pmap_paddr_t pa;
+       tt_entry_t tte;
+
+       PMAP_ASSERT_LOCKED(pmap);
+
+       tte = *ttep;
+
+#if     MACH_ASSERT
+       if (tte_get_ptd(tte)->pmap != pmap) {
+               panic("pmap_tte_deallocate(): ptd=%p ptd->pmap=%p pmap=%p \n",
+                     tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
+       }
 #endif
+
+       pmap_tte_remove(pmap, ttep, level);
+
        if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
 #if    MACH_ASSERT
                {
@@ -4096,9 +4254,136 @@ pmap_remove_range(
                                         PMAP_OPTIONS_REMOVE);
 }
 
+
+#ifdef PVH_FLAG_EXEC
+
+/*
+ *     Update the access protection bits of the physical aperture mapping for a page.
+ *     This is useful, for example, in guranteeing that a verified executable page
+ *     This is useful, for example, in guaranteeing that a verified executable page
+ *     aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
+ *     synchronization overhead in cases where the call to this function is
+ *     guaranteed to be followed by other TLB operations.
+ */
+static void
+pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
+{
+#if __ARM_PTE_PHYSMAP__
+       ASSERT_PVH_LOCKED(pai);
+       vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
+       pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
+
+       pt_entry_t tmplate = *pte_p;
+       if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap))
+               return;
+       tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
+#if (__ARM_VMSA__ > 7)
+       if (tmplate & ARM_PTE_HINT_MASK) {
+               panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
+                     __func__, pte_p, (void *)kva, tmplate);
+       }
+#endif
+       WRITE_PTE_STRONG(pte_p, tmplate);
+       flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap);
+       if (!flush_tlb_async)
+               sync_tlb_flush();
+#endif
+}
+
+#endif /* defined(PVH_FLAG_EXEC) */
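When several aperture permission updates are batched, flush_tlb_async lets each call queue its TLB invalidation and a single sync_tlb_flush() completes them all; a hedged sketch of that pattern (the caller, the loop, and the page-index source are hypothetical):

/* Hypothetical batching caller; pmap_set_ptov_ap() is the helper above. */
static void
example_restore_aperture_batch(const unsigned int *pai_list, unsigned int count)
{
	for (unsigned int i = 0; i < count; ++i) {
		LOCK_PVH(pai_list[i]);
		/* queue the TLB invalidation; defer the synchronization to the end */
		pmap_set_ptov_ap(pai_list[i], AP_RWNA, TRUE);
		UNLOCK_PVH(pai_list[i]);
	}
	sync_tlb_flush();   /* one synchronization completes every queued flush */
}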
+
+static void
+pmap_remove_pv(
+       pmap_t pmap,
+       pt_entry_t *cpte,
+       int pai,
+       int *num_internal,
+       int *num_alt_internal,
+       int *num_reusable,
+       int *num_external)
+{
+       pv_entry_t    **pv_h, **pve_pp;
+       pv_entry_t     *pve_p;
+
+       ASSERT_PVH_LOCKED(pai);
+       pv_h = pai_to_pvh(pai);
+       vm_offset_t pvh_flags = pvh_get_flags(pv_h);
+
+
+       if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
+               if (__builtin_expect((cpte != pvh_ptep(pv_h)), 0))
+                       panic("%s: cpte=%p does not match pv_h=%p (%p), pai=0x%x\n", __func__, cpte, pv_h, pvh_ptep(pv_h), pai);
+               if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) {
+                       assert(IS_INTERNAL_PAGE(pai));
+                       (*num_internal)++;
+                       (*num_alt_internal)++;
+                       CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
+               } else if (IS_INTERNAL_PAGE(pai)) {
+                       if (IS_REUSABLE_PAGE(pai)) {
+                               (*num_reusable)++;
+                       } else {
+                               (*num_internal)++;
+                       }
+               } else {
+                       (*num_external)++;
+               }
+               pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
+       } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
+
+               pve_pp = pv_h;
+               pve_p = pvh_list(pv_h);
+
+               while (pve_p != PV_ENTRY_NULL &&
+                      (pve_get_ptep(pve_p) != cpte)) {
+                       pve_pp = pve_link_field(pve_p);
+                       pve_p = PVE_NEXT_PTR(pve_next(pve_p));
+               }
+
+               if (__builtin_expect((pve_p == PV_ENTRY_NULL), 0))
+                       panic("%s: cpte=%p (pai=0x%x) not in pv_h=%p\n", __func__, cpte, pai, pv_h);
+
 #if MACH_ASSERT
-int num_reusable_mismatch = 0;
-#endif /* MACH_ASSERT */
+               if ((pmap != NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
+                       pv_entry_t *check_pve_p = PVE_NEXT_PTR(pve_next(pve_p));
+                       while (check_pve_p != PV_ENTRY_NULL) {
+                               if (pve_get_ptep(check_pve_p) == cpte) {
+                                       panic("%s: duplicate pve entry cpte=%p pmap=%p, pv_h=%p, pve_p=%p, pai=0x%x",
+                                           __func__, cpte, pmap, pv_h, pve_p, pai);
+                               }
+                               check_pve_p = PVE_NEXT_PTR(pve_next(check_pve_p));
+                       }
+               }
+#endif
+
+               if (IS_ALTACCT_PAGE(pai, pve_p)) {
+                       assert(IS_INTERNAL_PAGE(pai));
+                       (*num_internal)++;
+                       (*num_alt_internal)++;
+                       CLR_ALTACCT_PAGE(pai, pve_p);
+               } else if (IS_INTERNAL_PAGE(pai)) {
+                       if (IS_REUSABLE_PAGE(pai)) {
+                               (*num_reusable)++;
+                       } else {
+                               (*num_internal)++;
+                       }
+               } else {
+                       (*num_external)++;
+               }
+
+               pvh_remove(pv_h, pve_pp, pve_p);
+               pv_free(pve_p);
+               if (!pvh_test_type(pv_h, PVH_TYPE_NULL))
+                       pvh_set_flags(pv_h, pvh_flags);
+       } else {
+               panic("%s: unexpected PV head %p, cpte=%p pmap=%p pv_h=%p pai=0x%x",
+                     __func__, *pv_h, cpte, pmap, pv_h, pai);
+       }
+
+#ifdef PVH_FLAG_EXEC
+       if ((pvh_flags & PVH_FLAG_EXEC) && pvh_test_type(pv_h, PVH_TYPE_NULL))
+               pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
+#endif
+}
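pmap_remove_range_options() below calls this helper once per managed PTE and then folds the four counters into pmap->stats and the owning task's ledgers; a compact, illustrative restatement of that caller-side bookkeeping:

/* Illustrative only; mirrors the per-PTE accounting in pmap_remove_range_options(). */
static void
example_remove_one_mapping(pmap_t pmap, pt_entry_t *cpte, int pai)
{
	int num_internal = 0, num_alt_internal = 0, num_reusable = 0, num_external = 0;

	LOCK_PVH(pai);
	pmap_remove_pv(pmap, cpte, pai,
	    &num_internal, &num_alt_internal, &num_reusable, &num_external);
	UNLOCK_PVH(pai);

	/*
	 * Roughly: internal/alt_internal count anonymous (and alternate-accounted)
	 * pages, reusable counts pages the VM layer marked reusable, and external
	 * counts file-backed pages.  The real caller accumulates these over the
	 * whole range before adjusting the statistics and ledgers.
	 */
	(void)num_internal; (void)num_alt_internal;
	(void)num_reusable; (void)num_external;
}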
 
 static int
 pmap_remove_range_options(
@@ -4132,17 +4417,14 @@ pmap_remove_range_options(
 
        for (cpte = bpte; cpte < epte;
             cpte += PAGE_SIZE/ARM_PGBYTES, va += PAGE_SIZE) {
-               pv_entry_t    **pv_h, **pve_pp;
-               pv_entry_t     *pve_p;
                pt_entry_t      spte;
                boolean_t       managed=FALSE;
 
                spte = *cpte;
 
 #if CONFIG_PGTRACE
-        if (pgtrace_enabled) {
-            pmap_pgtrace_remove_clone(pmap, pte_to_pa(spte), va);
-        }
+               if (pgtrace_enabled)
+                       pmap_pgtrace_remove_clone(pmap, pte_to_pa(spte), va);
 #endif
 
                while (!managed) {
@@ -4238,77 +4520,8 @@ pmap_remove_range_options(
                 * find and remove the mapping from the chain for this
                 * physical address.
                 */
-               ASSERT_PVH_LOCKED(pai); // Should have been locked when we found the managed PTE above
-               pv_h = pai_to_pvh(pai);
-
-               if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
-                       if (__builtin_expect((cpte != pvh_ptep(pv_h)), 0))
-                               panic("pmap_remove_range(): cpte=%p (0x%llx) does not match pv_h=%p (%p)\n", cpte, (uint64_t)spte, pv_h, pvh_ptep(pv_h));
-                       if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) {
-                               assert(IS_INTERNAL_PAGE(pai));
-                               num_internal++;
-                               num_alt_internal++;
-                               CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
-                       } else if (IS_INTERNAL_PAGE(pai)) {
-                               if (IS_REUSABLE_PAGE(pai)) {
-                                       num_reusable++;
-                               } else {
-                                       num_internal++;
-                               }
-                       } else {
-                               num_external++;
-                       }
-                       pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
-               } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
-
-                       pve_pp = pv_h;
-                       pve_p = pvh_list(pv_h);
-
-                       while (pve_p != PV_ENTRY_NULL &&
-                              (pve_get_ptep(pve_p) != cpte)) {
-                               pve_pp = pve_link_field(pve_p);
-                               pve_p = PVE_NEXT_PTR(pve_next(pve_p));
-                       }
-
-                       if (__builtin_expect((pve_p == PV_ENTRY_NULL), 0)) {
-                               UNLOCK_PVH(pai);
-                               panic("pmap_remove_range(): cpte=%p (0x%llx) not in pv_h=%p\n", cpte, (uint64_t)spte, pv_h);
-                       }
-
-#if MACH_ASSERT
-                       if (kern_feature_override(KF_PMAPV_OVRD) == FALSE) {
-                               pv_entry_t *check_pve_p = PVE_NEXT_PTR(pve_next(pve_p));
-                               while (check_pve_p != PV_ENTRY_NULL) {
-                                       if (pve_get_ptep(check_pve_p) == cpte) {
-                                               panic("pmap_remove_range(): duplicate pve entry cpte=%p pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, va=0x%llx\n",
-                                                   cpte, pmap, pv_h, pve_p, (uint64_t)spte, (uint64_t)va);
-                                       }
-                                       check_pve_p = PVE_NEXT_PTR(pve_next(check_pve_p));
-                               }
-                       }
-#endif
-
-                       if (IS_ALTACCT_PAGE(pai, pve_p)) {
-                               assert(IS_INTERNAL_PAGE(pai));
-                               num_internal++;
-                               num_alt_internal++;
-                               CLR_ALTACCT_PAGE(pai, pve_p);
-                       } else if (IS_INTERNAL_PAGE(pai)) {
-                               if (IS_REUSABLE_PAGE(pai)) {
-                                       num_reusable++;
-                               } else {
-                                       num_internal++;
-                               }
-                       } else {
-                               num_external++;
-                       }
 
-                       pvh_remove(pv_h, pve_pp, pve_p) ;
-                       pv_free(pve_p);
-               } else {
-                       panic("pmap_remove_range(): unexpected PV head %p, cpte=%p pmap=%p pv_h=%p pte=0x%llx va=0x%llx\n",
-                             *pv_h, cpte, pmap, pv_h, (uint64_t)spte, (uint64_t)va);
-               }
+               pmap_remove_pv(pmap, cpte, pai, &num_internal, &num_alt_internal, &num_reusable, &num_external);
 
                UNLOCK_PVH(pai);
                num_removed++;
@@ -4325,11 +4538,10 @@ pmap_remove_range_options(
 #if MACH_ASSERT
                if (pmap->stats.internal < num_internal) {
                        if ((! pmap_stats_assert ||
-                            ! pmap->pmap_stats_assert) ||
-                           (pmap->stats.internal + pmap->stats.reusable) ==
-                           (num_internal + num_reusable)) {
-                               num_reusable_mismatch++;
-                               printf("pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d\n",
+                            ! pmap->pmap_stats_assert)) {
+                               printf("%d[%s] pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d\n",
+                                      pmap->pmap_pid,
+                                      pmap->pmap_procname,
                                       pmap,
                                       (uint64_t) va,
                                       bpte,
@@ -4346,11 +4558,10 @@ pmap_remove_range_options(
                                       num_pte_changed,
                                       pmap->stats.internal,
                                       pmap->stats.reusable);
-                               /* slight mismatch: fix it... */
-                               num_internal = pmap->stats.internal;
-                               num_reusable = pmap->stats.reusable;
                        } else {
-                               panic("pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d",
+                               panic("%d[%s] pmap_remove_range_options(%p,0x%llx,%p,%p,0x%x): num_internal=%d num_removed=%d num_unwired=%d num_external=%d num_reusable=%d num_compressed=%lld num_alt_internal=%d num_alt_compressed=%lld num_pte_changed=%d stats.internal=%d stats.reusable=%d",
+                                     pmap->pmap_pid,
+                                     pmap->pmap_procname,
                                      pmap,
                                      (uint64_t) va,
                                      bpte,
@@ -4418,7 +4629,7 @@ pmap_remove_range_options(
 
        /* flush the ptable entries we have written */
        if (num_pte_changed > 0)
-               FLUSH_PTE_RANGE(bpte, epte);
+               FLUSH_PTE_RANGE_STRONG(bpte, epte);
 
        return num_pte_changed;
 }
@@ -4440,11 +4651,12 @@ pmap_remove(
        pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
 }
 
-static int
-pmap_remove_options_internal(pmap_t pmap,
-vm_map_address_t start,
-vm_map_address_t end,
-int options)
+MARK_AS_PMAP_TEXT static int
+pmap_remove_options_internal(
+       pmap_t pmap,
+       vm_map_address_t start,
+       vm_map_address_t end,
+       int options)
 {
        int remove_count = 0;
        pt_entry_t     *bpte, *epte;
@@ -4452,6 +4664,10 @@ int options)
        tt_entry_t     *tte_p;
        uint32_t        rmv_spte=0;
 
+       if (__improbable(end < start))
+               panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
+
+       VALIDATE_PMAP(pmap);
        PMAP_LOCK(pmap);
 
        tte_p = pmap_tte(pmap, start);
@@ -4485,7 +4701,6 @@ int options)
 
 done:
        PMAP_UNLOCK(pmap);
-
        return remove_count;
 }
 
@@ -4502,7 +4717,7 @@ pmap_remove_options(
        if (pmap == PMAP_NULL)
                return;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
+       PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
                   VM_KERNEL_ADDRHIDE(end));
 
@@ -4537,11 +4752,10 @@ pmap_remove_options(
                va = l;
        }
 
-
        if (remove_count > 0)
                PMAP_UPDATE_TLBS(pmap, start, end);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
 }
 
 
@@ -4556,7 +4770,6 @@ pmap_remove_some_phys(
        /* Implement to support working set code */
 }
 
-
 void
 pmap_set_pmap(
        pmap_t pmap,
@@ -4585,30 +4798,25 @@ pmap_flush_core_tlb_asid(pmap_t pmap)
        flush_core_tlb_asid(pmap->asid);
 #else
        flush_core_tlb_asid(((uint64_t) pmap->asid) << TLBI_ASID_SHIFT);
-#if __ARM_KERNEL_PROTECT__
-       flush_core_tlb_asid(((uint64_t) pmap->asid + 1) << TLBI_ASID_SHIFT);
-#endif /* __ARM_KERNEL_PROTECT__ */
 #endif
 }
 
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_switch_internal(
        pmap_t pmap)
 {
+       VALIDATE_PMAP(pmap);
        pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
-       uint32_t        last_asid_high_bits, asid_high_bits;
-       pmap_t          cur_pmap;
-       pmap_t          cur_user_pmap;
-       boolean_t       do_asid_flush = FALSE;
+       uint32_t         last_asid_high_bits, asid_high_bits;
+       boolean_t        do_asid_flush = FALSE;
 
 #if    (__ARM_VMSA__ == 7)
        if (not_in_kdp)
-               simple_lock(&pmap->tt1_lock);
+               pmap_simple_lock(&pmap->tt1_lock);
+#else
+       pmap_t           last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
 #endif
 
-       cur_pmap = current_pmap();
-       cur_user_pmap = cpu_data_ptr->cpu_user_pmap;
-
        /* Paranoia. */
        assert(pmap->asid < (sizeof(cpu_data_ptr->cpu_asid_high_bits) / sizeof(*cpu_data_ptr->cpu_asid_high_bits)));
 
@@ -4627,50 +4835,23 @@ pmap_switch_internal(
                do_asid_flush = TRUE;
        }
 
-       if ((cur_user_pmap == cur_pmap) && (cur_pmap == pmap)) {
-               if (cpu_data_ptr->cpu_user_pmap_stamp == pmap->stamp) {
-                       pmap_switch_user_ttb_internal(pmap);
-
-#if    (__ARM_VMSA__ == 7)
-                       if (not_in_kdp)
-                               simple_unlock(&pmap->tt1_lock);
-#endif
-
-                       if (do_asid_flush) {
-                               pmap_flush_core_tlb_asid(pmap);
-                       }
-
-                       return;
-               } else
-                       cur_user_pmap = NULL;
-       } else if ((cur_user_pmap == pmap) && (cpu_data_ptr->cpu_user_pmap_stamp != pmap->stamp))
-                       cur_user_pmap = NULL;
-
        pmap_switch_user_ttb_internal(pmap);
 
-       if (do_asid_flush) {
+#if    (__ARM_VMSA__ > 7)
+       /* If we're switching to a different nested pmap (i.e. shared region), we'll need
+        * to flush the userspace mappings for that region.  Those mappings are global
+        * and will not be protected by the ASID.  It should also be cheaper to flush the
+        * entire local TLB rather than to do a broadcast MMU flush by VA region. */
+       if ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap))
+               flush_core_tlb();
+       else
+#endif
+       if (do_asid_flush)
                pmap_flush_core_tlb_asid(pmap);
-       }
 
 #if    (__ARM_VMSA__ == 7)
        if (not_in_kdp)
-               simple_unlock(&pmap->tt1_lock);
-#else
-       if (pmap != kernel_pmap) {
-
-               if (cur_user_pmap != PMAP_NULL) {
-                       /*
-                        * We have a low-address global mapping for the commpage
-                        * for 32-bit processes; flush it if we switch to a 64-bot
-                        * process.
-                        */
-                       if (pmap_is_64bit(pmap) && !pmap_is_64bit(cur_user_pmap)) {
-                               pmap_sharedpage_flush_32_to_64();
-                       }
-
-               } else
-                       flush_core_tlb();
-       }
+               pmap_simple_unlock(&pmap->tt1_lock);
 #endif
 }
 
@@ -4678,7 +4859,9 @@ void
 pmap_switch(
        pmap_t pmap)
 {
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
        pmap_switch_internal(pmap);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
 }
 
 void
@@ -4696,7 +4879,7 @@ pmap_page_protect(
  *             Lower the permission for all mappings to a given
  *             page.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_page_protect_options_internal(
        ppnum_t ppnum,
        vm_prot_t prot,
@@ -4704,13 +4887,18 @@ pmap_page_protect_options_internal(
 {
        pmap_paddr_t    phys = ptoa(ppnum);
        pv_entry_t    **pv_h;
+       pv_entry_t    **pve_pp;
        pv_entry_t     *pve_p;
        pv_entry_t     *pveh_p;
        pv_entry_t     *pvet_p;
        pt_entry_t     *pte_p;
+       pv_entry_t     *new_pve_p;
+       pt_entry_t     *new_pte_p;
+       vm_offset_t     pvh_flags;
        int             pai;
        boolean_t       remove;
        boolean_t       set_NX;
+       boolean_t       tlb_flush_needed = FALSE;
        unsigned int    pvh_cnt = 0;
 
        assert(ppnum != vm_page_fictitious_addr);
@@ -4738,11 +4926,16 @@ pmap_page_protect_options_internal(
        pai = (int)pa_index(phys);
        LOCK_PVH(pai);
        pv_h = pai_to_pvh(pai);
+       pvh_flags = pvh_get_flags(pv_h);
+
 
        pte_p = PT_ENTRY_NULL;
        pve_p = PV_ENTRY_NULL;
+       pve_pp = pv_h;
        pveh_p = PV_ENTRY_NULL;
        pvet_p = PV_ENTRY_NULL;
+       new_pve_p = PV_ENTRY_NULL;
+       new_pte_p = PT_ENTRY_NULL;
        if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
                pte_p = pvh_ptep(pv_h);
        } else if  (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
@@ -4759,6 +4952,29 @@ pmap_page_protect_options_internal(
                if (pve_p != PV_ENTRY_NULL)
                        pte_p = pve_get_ptep(pve_p);
 
+#ifdef PVH_FLAG_IOMMU
+               if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) {
+                       if (remove) {
+                               if (options & PMAP_OPTIONS_COMPRESSOR) {
+                                       panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu 0x%llx, pve_p=%p",
+                                             ppnum, (uint64_t)pte_p & ~PVH_FLAG_IOMMU, pve_p);
+                               }
+                               if (pve_p != PV_ENTRY_NULL) {
+                                       pv_entry_t *temp_pve_p = PVE_NEXT_PTR(pve_next(pve_p));
+                                       pvh_remove(pv_h, pve_pp, pve_p);
+                                       pveh_p = pvh_list(pv_h);
+                                       pve_next(pve_p) = new_pve_p;
+                                       new_pve_p = pve_p;
+                                       pve_p = temp_pve_p;
+                                       continue;
+                               } else {
+                                       new_pte_p = pte_p;
+                                       break;
+                               }
+                       }
+                       goto protect_skip_pve;
+               }
+#endif
                pmap = ptep_get_pmap(pte_p);
                va = ptep_get_va(pte_p);
 
@@ -4833,7 +5049,7 @@ pmap_page_protect_options_internal(
                        }
 
                        if (*pte_p != tmplate) {
-                               WRITE_PTE(pte_p, tmplate);
+                               WRITE_PTE_STRONG(pte_p, tmplate);
                                update = TRUE;
                        }
                        pvh_cnt++;
@@ -4976,34 +5192,54 @@ pmap_page_protect_options_internal(
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
                            !ARM_PTE_IS_COMPRESSED(*pte_p) &&
                            *pte_p != tmplate) {
-                               WRITE_PTE(pte_p, tmplate);
+                               WRITE_PTE_STRONG(pte_p, tmplate);
                                update = TRUE;
                        }
                }
 
                /* Invalidate TLBs for all CPUs using it */
-               if (update)
-                       PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
+               if (update) {
+                       tlb_flush_needed = TRUE;
+                       flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+               }
 
+#ifdef PVH_FLAG_IOMMU
+       protect_skip_pve:
+#endif
                pte_p = PT_ENTRY_NULL;
                pvet_p = pve_p;
                if (pve_p != PV_ENTRY_NULL) {
-                       pvet_p = pve_p;
                        if (remove) {
                                assert(pve_next(pve_p) == PVE_NEXT_PTR(pve_next(pve_p)));
                        }
+                       pve_pp = pve_link_field(pve_p);
                        pve_p = PVE_NEXT_PTR(pve_next(pve_p));
                }
        }
 
+#ifdef PVH_FLAG_EXEC
+       if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC))
+               pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
+#endif
+       if (tlb_flush_needed)
+               sync_tlb_flush();
+
        /* if we removed a bunch of entries, take care of them now */
        if (remove) {
-               pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
+               if (new_pve_p != PV_ENTRY_NULL) {
+                       pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
+                       pvh_set_flags(pv_h, pvh_flags);
+               } else if (new_pte_p != PT_ENTRY_NULL) {
+                       pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
+                       pvh_set_flags(pv_h, pvh_flags);
+               } else {
+                       pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
+               }
        }
 
        UNLOCK_PVH(pai);
 
-       if (remove && (pveh_p != PV_ENTRY_NULL)) {
+       if (remove && (pvet_p != PV_ENTRY_NULL)) {
                pv_list_free(pveh_p, pvet_p, pvh_cnt);
        }
 }
@@ -5030,11 +5266,11 @@ pmap_page_protect_options(
                return;         /* nothing to do */
        }
 
-       PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
 
        pmap_page_protect_options_internal(ppnum, prot, options);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
 }
 
 /*
@@ -5062,7 +5298,7 @@ pmap_protect(
        pmap_protect_options(pmap, b, e, prot, 0, NULL);
 }
 
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_protect_options_internal(pmap_t pmap,
        vm_map_address_t start,
        vm_map_address_t end,
@@ -5083,6 +5319,9 @@ pmap_protect_options_internal(pmap_t pmap,
        boolean_t       InvalidatePoU_Icache_Done = FALSE;
 #endif
 
+       if (__improbable(end < start))
+               panic("%s called with bogus range: %p, %p", __func__, (void*)start, (void*)end);
+
 #if DEVELOPMENT || DEBUG
        if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
                if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
@@ -5127,6 +5366,7 @@ pmap_protect_options_internal(pmap_t pmap,
                set_NX = TRUE;
        }
 
+       VALIDATE_PMAP(pmap);
        PMAP_LOCK(pmap);
        tte_p = pmap_tte(pmap, start);
 
@@ -5307,7 +5547,7 @@ pmap_protect_options_internal(pmap_t pmap,
                        }
                }
 
-               FLUSH_PTE_RANGE(bpte_p, epte_p);
+               FLUSH_PTE_RANGE_STRONG(bpte_p, epte_p);
                PMAP_UPDATE_TLBS(pmap, start, end);
        }
 
@@ -5354,7 +5594,7 @@ pmap_protect_options(
                }
        }
 
-       PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
+       PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
                   VM_KERNEL_ADDRHIDE(e));
 
@@ -5371,7 +5611,7 @@ pmap_protect_options(
                beg = l;
        }
 
-       PMAP_TRACE(PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
 }
 
 /* Map a (possibly) autogenned block */
@@ -5457,12 +5697,14 @@ static inline void pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t pte
        }
        if (*pte_p != ARM_PTE_TYPE_FAULT &&
            !ARM_PTE_IS_COMPRESSED(*pte_p)) {
-               WRITE_PTE(pte_p, pte);
+               WRITE_PTE_STRONG(pte_p, pte);
                PMAP_UPDATE_TLBS(pmap, v, v + PAGE_SIZE);
        } else {
                WRITE_PTE(pte_p, pte);
-               __asm__ volatile("isb");
+               __builtin_arm_isb(ISB_SY);
        }
+
+       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + PAGE_SIZE), pte);
 }
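
pmap_enter_pte() above chooses between two update paths: overwriting a live, non-compressed PTE uses the strong write plus a TLB invalidate, while filling a previously invalid slot only needs a plain write followed by an ISB, since no stale translation can be cached for it. A simplified sketch of that decision with stand-in types (the compressed-PTE case is folded into the invalid case for brevity; the comments mark where the real macros act):

    #include <stdbool.h>
    #include <stdint.h>

    typedef uint64_t pte_t;
    #define PTE_INVALID 0ULL                    /* stand-in for ARM_PTE_TYPE_FAULT */

    static bool pte_is_live(pte_t p) { return p != PTE_INVALID; }

    static void
    update_pte(volatile pte_t *slot, pte_t new_pte)
    {
            if (pte_is_live(*slot)) {
                    *slot = new_pte;    /* WRITE_PTE_STRONG: store + barrier so the walker sees it */
                    /* ...then invalidate the stale TLB entry (PMAP_UPDATE_TLBS)                   */
            } else {
                    *slot = new_pte;    /* WRITE_PTE: plain store                                  */
                    /* ...ISB only: no translation for this VA can be cached yet                   */
            }
    }
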
 
 static pt_entry_t
@@ -5472,6 +5714,7 @@ wimg_to_pte(unsigned int wimg)
 
        switch (wimg & (VM_WIMG_MASK)) {
                case VM_WIMG_IO:
+               case VM_WIMG_RT:
                        pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
                        pte |= ARM_PTE_NX | ARM_PTE_PNX;
                        break;
@@ -5519,7 +5762,145 @@ wimg_to_pte(unsigned int wimg)
        return pte;
 }
 
-static kern_return_t
+static boolean_t
+pmap_enter_pv(
+       pmap_t pmap,
+       pt_entry_t *pte_p,
+       int pai,
+       unsigned int options,
+       pv_entry_t **pve_p,
+       boolean_t *is_altacct)
+{
+       pv_entry_t    **pv_h;
+       pv_h = pai_to_pvh(pai);
+       boolean_t first_cpu_mapping;
+
+       ASSERT_PVH_LOCKED(pai);
+
+       vm_offset_t pvh_flags = pvh_get_flags(pv_h);
+
+
+#ifdef PVH_FLAG_CPU
+       /* An IOMMU mapping may already be present for a page that hasn't yet
+        * had a CPU mapping established, so we use PVH_FLAG_CPU to determine
+        * if this is the first CPU mapping.  We base internal/reusable
+        * accounting on the options specified for the first CPU mapping.
+        * PVH_FLAG_CPU, and thus this accounting, will then persist as long
+        * as there are *any* mappings of the page.  The accounting for a
+        * page should not need to change until the page is recycled by the
+        * VM layer, and we assert that there are no mappings when a page
+        * is recycled.   An IOMMU mapping of a freed/recycled page is
+        * considered a security violation & potential DMA corruption path.*/
+       first_cpu_mapping = ((pmap != NULL) && !(pvh_flags & PVH_FLAG_CPU));
+       if (first_cpu_mapping)
+               pvh_flags |= PVH_FLAG_CPU;
+#else
+       first_cpu_mapping = pvh_test_type(pv_h, PVH_TYPE_NULL);
+#endif
+
+       if (first_cpu_mapping) {
+               if (options & PMAP_OPTIONS_INTERNAL) {
+                       SET_INTERNAL_PAGE(pai);
+               } else {
+                       CLR_INTERNAL_PAGE(pai);
+               }
+               if ((options & PMAP_OPTIONS_INTERNAL) &&
+                   (options & PMAP_OPTIONS_REUSABLE)) {
+                       SET_REUSABLE_PAGE(pai);
+               } else {
+                       CLR_REUSABLE_PAGE(pai);
+               }
+       }
+       if (pvh_test_type(pv_h, PVH_TYPE_NULL)) {
+               pvh_update_head(pv_h, pte_p, PVH_TYPE_PTEP);
+               if (pmap != NULL && pmap != kernel_pmap &&
+                   ((options & PMAP_OPTIONS_ALT_ACCT) ||
+                    PMAP_FOOTPRINT_SUSPENDED(pmap)) &&
+                   IS_INTERNAL_PAGE(pai)) {
+                       /*
+                        * Make a note to ourselves that this mapping is using alternative
+                        * accounting. We'll need this in order to know which ledger to
+                        * debit when the mapping is removed.
+                        *
+                        * The altacct bit must be set while the pv head is locked. Defer
+                        * the ledger accounting until after we've dropped the lock.
+                        */
+                       SET_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
+                       *is_altacct = TRUE;
+               } else {
+                       CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
+               }
+       } else {
+               if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
+                       pt_entry_t      *pte1_p;
+
+                       /*
+                        * convert pvh list from PVH_TYPE_PTEP to PVH_TYPE_PVEP
+                        */
+                       pte1_p = pvh_ptep(pv_h);
+                       pvh_set_flags(pv_h, pvh_flags);
+                       if((*pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, pve_p)))
+                               return FALSE;
+
+                       pve_set_ptep(*pve_p, pte1_p);
+                       (*pve_p)->pve_next = PV_ENTRY_NULL;
+
+                       if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) {
+                               /*
+                                * transfer "altacct" from
+                                * pp_attr to this pve
+                                */
+                               CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
+                               SET_ALTACCT_PAGE(pai, *pve_p);
+                       }
+                       pvh_update_head(pv_h, *pve_p, PVH_TYPE_PVEP);
+                       *pve_p = PV_ENTRY_NULL;
+               } else if (!pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
+                       panic("%s: unexpected PV head %p, pte_p=%p pmap=%p pv_h=%p",
+                             __func__, *pv_h, pte_p, pmap, pv_h);
+               }
+               /*
+                * Set up pv_entry for this new mapping and then
+                * add it to the list for this physical page.
+                */
+               pvh_set_flags(pv_h, pvh_flags);
+               if((*pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, pve_p)))
+                       return FALSE;
+
+               pve_set_ptep(*pve_p, pte_p);
+               (*pve_p)->pve_next = PV_ENTRY_NULL;
+
+               pvh_add(pv_h, *pve_p);
+
+               if (pmap != NULL && pmap != kernel_pmap &&
+                   ((options & PMAP_OPTIONS_ALT_ACCT) ||
+                    PMAP_FOOTPRINT_SUSPENDED(pmap)) &&
+                   IS_INTERNAL_PAGE(pai)) {
+                       /*
+                        * Make a note to ourselves that this
+                        * mapping is using alternative
+                        * accounting. We'll need this in order
+                        * to know which ledger to debit when
+                        * the mapping is removed.
+                        *
+                        * The altacct bit must be set while
+                        * the pv head is locked. Defer the
+                        * ledger accounting until after we've
+                        * dropped the lock.
+                        */
+                       SET_ALTACCT_PAGE(pai, *pve_p);
+                       *is_altacct = TRUE;
+               }
+
+               *pve_p = PV_ENTRY_NULL;
+       }
+
+       pvh_set_flags(pv_h, pvh_flags);
+
+       return TRUE;
+} 
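
pmap_enter_pv() above manages the physical page's PV head as a small state machine: an unmapped page holds nothing, a page with a single CPU mapping points directly at its PTE (PVH_TYPE_PTEP), and a page with several mappings points at a linked list of pv_entry records (PVH_TYPE_PVEP). A rough stand-alone sketch of those transitions, with stand-in types rather than the kernel structures (the real code retries via pv_alloc() when an allocation fails):

    #include <stddef.h>
    #include <stdlib.h>

    typedef unsigned long pte_slot_t;               /* stand-in for pt_entry_t * */

    struct pv_entry {                               /* stand-in for pv_entry_t   */
            struct pv_entry *next;
            pte_slot_t       pte;
    };

    enum pvh_type { PVH_NULL, PVH_PTEP, PVH_PVEP }; /* mirrors PVH_TYPE_*        */

    struct pv_head {
            enum pvh_type type;
            union {
                    pte_slot_t       ptep;          /* exactly one mapping       */
                    struct pv_entry *list;          /* two or more mappings      */
            } u;
    };

    static int
    pv_head_add(struct pv_head *h, pte_slot_t pte)
    {
            if (h->type == PVH_NULL) {              /* first mapping: store the PTE directly */
                    h->type = PVH_PTEP;
                    h->u.ptep = pte;
                    return 0;
            }
            if (h->type == PVH_PTEP) {              /* second mapping: convert to a list     */
                    struct pv_entry *old = malloc(sizeof(*old));
                    if (old == NULL)
                            return -1;
                    old->pte = h->u.ptep;
                    old->next = NULL;
                    h->type = PVH_PVEP;
                    h->u.list = old;
            }
            struct pv_entry *pve = malloc(sizeof(*pve));
            if (pve == NULL)
                    return -1;
            pve->pte = pte;                         /* further mappings: push onto the list  */
            pve->next = h->u.list;
            h->u.list = pve;
            return 0;
    }

    int
    main(void)
    {
            struct pv_head h = { PVH_NULL, { 0 } };
            pv_head_add(&h, 0x1000);                /* first mapping  -> PVH_PTEP */
            pv_head_add(&h, 0x2000);                /* second mapping -> PVH_PVEP */
            return h.type == PVH_PVEP ? 0 : 1;
    }
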
+
+MARK_AS_PMAP_TEXT static kern_return_t
 pmap_enter_options_internal(
        pmap_t pmap,
        vm_map_address_t v,
@@ -5541,6 +5922,9 @@ pmap_enter_options_internal(
        boolean_t       wiredcnt_updated;
        unsigned int    wimg_bits;
        boolean_t       was_compressed, was_alt_compressed;
+       kern_return_t   kr = KERN_SUCCESS;
+
+       VALIDATE_PMAP(pmap);
 
        if ((v) & PAGE_MASK) {
                panic("pmap_enter_options() pmap %p v 0x%llx\n",
@@ -5585,11 +5969,10 @@ pmap_enter_options_internal(
                /* Must unlock to expand the pmap. */
                PMAP_UNLOCK(pmap);
 
-               kern_return_t kr=pmap_expand(pmap, v, options, PMAP_TT_MAX_LEVEL);
+               kr = pmap_expand(pmap, v, options, PMAP_TT_MAX_LEVEL);
 
-               if(kr) {
+               if (kr != KERN_SUCCESS)
                        return kr;
-               }
 
                PMAP_LOCK(pmap);
        }
@@ -5618,10 +6001,7 @@ Pmap_enter_retry:
                was_compressed = TRUE;
                if (spte & ARM_PTE_COMPRESSED_ALT) {
                        was_alt_compressed = TRUE;
-                       pmap_ledger_debit(
-                               pmap,
-                               task_ledgers.alternate_accounting_compressed,
-                               PAGE_SIZE);
+                       pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, PAGE_SIZE);
                } else {
                        /* was part of the footprint */
                        pmap_ledger_debit(pmap, task_ledgers.phys_footprint, PAGE_SIZE);
@@ -5668,13 +6048,6 @@ Pmap_enter_retry:
        }
 #endif
 
-       if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)))
-               wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
-       else
-               wimg_bits = pmap_cache_attributes(pn);
-
-       pte |= wimg_to_pte(wimg_bits);
-
        if (pmap == kernel_pmap) {
 #if __ARM_KERNEL_PROTECT__
                pte |= ARM_PTE_NG;
@@ -5783,40 +6156,42 @@ Pmap_enter_retry:
        }
 
        if (pa_valid(pa)) {
-               pv_entry_t    **pv_h;
-               int             pai;
-               boolean_t       is_altacct, is_internal;
+               int         pai;
+               boolean_t   is_altacct, is_internal;
 
                is_internal = FALSE;
                is_altacct = FALSE;
 
                pai = (int)pa_index(pa);
-               pv_h = pai_to_pvh(pai);
 
                LOCK_PVH(pai);
+       
 Pmap_enter_loop:
+               if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)))
+                       wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
+               else
+                       wimg_bits = pmap_cache_attributes(pn);
+
+               /* We may be retrying this operation after dropping the PVH lock.
+                * Cache attributes for the physical page may have changed while the lock
+                * was dropped, so clear any cache attributes we may have previously set
+                * in the PTE template. */
+               pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
+               pte |= wimg_to_pte(wimg_bits);
+
+
 
                if (pte == *pte_p) {
                        /*
                         * This pmap_enter operation has been completed by another thread
                         * undo refcnt on pt and return
                         */
-                       if (refcnt != NULL) {
-                               assert(refcnt_updated);
-                               if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0)
-                                       panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p);
-                       }
                        UNLOCK_PVH(pai);
-                       goto Pmap_enter_return;
+                       goto Pmap_enter_cleanup;
                } else if (pte_to_pa(*pte_p) == pa) {
-                       if (refcnt != NULL) {
-                               assert(refcnt_updated);
-                               if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0)
-                                       panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p);
-                       }
                        pmap_enter_pte(pmap, pte_p, pte, v);
                        UNLOCK_PVH(pai);
-                       goto Pmap_enter_return;
+                       goto Pmap_enter_cleanup;
                } else if (*pte_p != ARM_PTE_TYPE_FAULT) {
                        /*
                         * pte has been modified by another thread
@@ -5825,96 +6200,8 @@ Pmap_enter_loop:
                        UNLOCK_PVH(pai);
                        goto Pmap_enter_retry;
                }
-               if (pvh_test_type(pv_h, PVH_TYPE_NULL)) {
-                       pvh_update_head(pv_h, pte_p, PVH_TYPE_PTEP);
-                       /* 1st mapping: see what kind of page it is */
-                       if (options & PMAP_OPTIONS_INTERNAL) {
-                               SET_INTERNAL_PAGE(pai);
-                       } else {
-                               CLR_INTERNAL_PAGE(pai);
-                       }
-                       if ((options & PMAP_OPTIONS_INTERNAL) &&
-                           (options & PMAP_OPTIONS_REUSABLE)) {
-                               SET_REUSABLE_PAGE(pai);
-                       } else {
-                               CLR_REUSABLE_PAGE(pai);
-                       }
-                       if (pmap != kernel_pmap &&
-                           ((options & PMAP_OPTIONS_ALT_ACCT) ||
-                            PMAP_FOOTPRINT_SUSPENDED(pmap)) &&
-                           IS_INTERNAL_PAGE(pai)) {
-                               /*
-                                * Make a note to ourselves that this mapping is using alternative
-                                * accounting. We'll need this in order to know which ledger to
-                                * debit when the mapping is removed.
-                                *
-                                * The altacct bit must be set while the pv head is locked. Defer
-                                * the ledger accounting until after we've dropped the lock.
-                                */
-                               SET_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
-                               is_altacct = TRUE;
-                       } else {
-                               CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
-                       }
-               } else {
-                       if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
-                               pt_entry_t      *pte1_p;
-
-                               /*
-                                * convert pvh list from PVH_TYPE_PTEP to PVH_TYPE_PVEP
-                                */
-                               pte1_p = pvh_ptep(pv_h);
-                               if((pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, &pve_p))) {
-                                       goto Pmap_enter_loop;
-                               }
-                               pve_set_ptep(pve_p, pte1_p);
-                               pve_p->pve_next = PV_ENTRY_NULL;
-
-                               if (IS_ALTACCT_PAGE(pai, PV_ENTRY_NULL)) {
-                                       /*
-                                        * transfer "altacct" from
-                                        * pp_attr to this pve
-                                        */
-                                       CLR_ALTACCT_PAGE(pai, PV_ENTRY_NULL);
-                                       SET_ALTACCT_PAGE(pai, pve_p);
-                               }
-                               pvh_update_head(pv_h, pve_p, PVH_TYPE_PVEP);
-                               pve_p = PV_ENTRY_NULL;
-                       }
-                       /*
-                        * Set up pv_entry for this new mapping and then
-                        * add it to the list for this physical page.
-                        */
-                       if((pve_p == PV_ENTRY_NULL) && (!pv_alloc(pmap, pai, &pve_p))) {
-                               goto Pmap_enter_loop;
-                       }
-                       pve_set_ptep(pve_p, pte_p);
-                       pve_p->pve_next = PV_ENTRY_NULL;
-
-                       pvh_add(pv_h, pve_p);
-
-                       if (pmap != kernel_pmap &&
-                           ((options & PMAP_OPTIONS_ALT_ACCT) ||
-                            PMAP_FOOTPRINT_SUSPENDED(pmap)) &&
-                           IS_INTERNAL_PAGE(pai)) {
-                               /*
-                                * Make a note to ourselves that this
-                                * mapping is using alternative
-                                * accounting. We'll need this in order
-                                * to know which ledger to debit when
-                                * the mapping is removed.
-                                *
-                                * The altacct bit must be set while
-                                * the pv head is locked. Defer the
-                                * ledger accounting until after we've
-                                * dropped the lock.
-                                */
-                               SET_ALTACCT_PAGE(pai, pve_p);
-                               is_altacct = TRUE;
-                       }
-
-                       pve_p = PV_ENTRY_NULL;
-               }
+               if (!pmap_enter_pv(pmap, pte_p, pai, options, &pve_p, &is_altacct))
+                       goto Pmap_enter_loop;
 
                pmap_enter_pte(pmap, pte_p, pte, v);
 
@@ -5977,9 +6264,31 @@ Pmap_enter_loop:
                if (pmap->stats.resident_count > pmap->stats.resident_max)
                        pmap->stats.resident_max = pmap->stats.resident_count;
        } else {
+
+               if (prot & VM_PROT_EXECUTE) {
+                       kr = KERN_FAILURE;
+                       goto Pmap_enter_cleanup;
+               }
+
+               wimg_bits = pmap_cache_attributes(pn);
+               if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT)))
+                       wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
+
+               pte |= wimg_to_pte(wimg_bits);
+
                pmap_enter_pte(pmap, pte_p, pte, v);
        }
 
+       goto Pmap_enter_return;
+
+Pmap_enter_cleanup:
+
+       if (refcnt != NULL) {
+               assert(refcnt_updated);
+               if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0)
+                       panic("pmap_enter(): over-release of ptdp %p for pte %p\n", ptep_get_ptd(pte_p), pte_p);
+       }
+
 Pmap_enter_return:
 
 #if CONFIG_PGTRACE
@@ -5999,7 +6308,7 @@ Pmap_enter_return:
 
        PMAP_UNLOCK(pmap);
 
-       return KERN_SUCCESS;
+       return kr;
 }
 
 kern_return_t
@@ -6016,12 +6325,13 @@ pmap_enter_options(
 {
        kern_return_t kr = KERN_FAILURE;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
+       PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pn, prot);
 
        kr = pmap_enter_options_internal(pmap, v, pn, prot, fault_type, flags, wired, options);
+       pv_water_mark_check();
 
-       PMAP_TRACE(PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
 
        return kr;
 }
@@ -6033,7 +6343,7 @@ pmap_enter_options(
  *     In/out conditions:
  *                     The mapping must already exist in the pmap.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_change_wiring_internal(
        pmap_t pmap,
        vm_map_address_t v,
@@ -6049,6 +6359,7 @@ pmap_change_wiring_internal(
        if (pmap == kernel_pmap) {
                return;
        }
+       VALIDATE_USER_PMAP(pmap);
 
        PMAP_LOCK(pmap);
        pte_p = pmap_pte(pmap, v);
@@ -6083,13 +6394,15 @@ pmap_change_wiring(
        pmap_change_wiring_internal(pmap, v, wired);
 }
 
-static ppnum_t
+MARK_AS_PMAP_TEXT static ppnum_t
 pmap_find_phys_internal(
        pmap_t pmap,
        addr64_t va)
 {
        ppnum_t         ppn=0;
 
+       VALIDATE_PMAP(pmap);
+
        if (pmap != kernel_pmap) {
                PMAP_LOCK(pmap);
        }
@@ -6209,7 +6522,7 @@ pmap_vtophys(
        return ppn;
 }
 
-static vm_offset_t
+MARK_AS_PMAP_TEXT static vm_offset_t
 pmap_extract_internal(
        pmap_t pmap,
        vm_map_address_t va)
@@ -6221,6 +6534,8 @@ pmap_extract_internal(
                return 0;
        }
 
+       VALIDATE_PMAP(pmap);
+
        PMAP_LOCK(pmap);
 
        ppn = pmap_vtophys(pmap, va);
@@ -6268,11 +6583,12 @@ pmap_init_pte_page(
        unsigned int ttlevel,
        boolean_t alloc_ptd)
 {
-       pt_desc_t       *ptdp;
+       pt_desc_t   *ptdp = NULL;
+       vm_offset_t *pvh;
 
-       ptdp = *(pt_desc_t **)pai_to_pvh(pa_index((((vm_offset_t)pte_p) - gVirtBase + gPhysBase)));
+       pvh = (vm_offset_t *)(pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p))));
 
-       if (ptdp == NULL) {
+       if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
                if (alloc_ptd) {
                        /*
                         * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
@@ -6280,94 +6596,23 @@ pmap_init_pte_page(
                         * bootstrap request, so we check for an existing PTD here.
                         */
                        ptdp = ptd_alloc(pmap);
-                       *(pt_desc_t **)pai_to_pvh(pa_index((((vm_offset_t)pte_p) - gVirtBase + gPhysBase))) = ptdp;
+                       pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
                } else {
-                       panic("pmap_init_pte_page(): pte_p %p\n", pte_p);
+                       panic("pmap_init_pte_page(): pte_p %p", pte_p);
                }
+       } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
+               ptdp = (pt_desc_t*)(pvh_list(pvh));
+       } else {
+               panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
        }
 
-       pmap_init_pte_page_internal(pmap, pte_p, va, ttlevel, &ptdp);
-}
-
-/*
- *     pmap_init_pte_page_internal - Initialize page table page and page table descriptor
- */
-void
-pmap_init_pte_page_internal(
-       pmap_t pmap,
-       pt_entry_t *pte_p,
-       vm_offset_t va,
-       unsigned int ttlevel,
-       pt_desc_t **ptdp)
-{
        bzero(pte_p, ARM_PGBYTES);
        // below barrier ensures the page zeroing is visible to PTW before
        // it is linked to the PTE of previous level
-       __asm__ volatile("DMB ST" : : : "memory");
-       ptd_init(*ptdp, pmap, va, ttlevel, pte_p);
-}
-
-/*
- * pmap_init_pte_static_page - for static mappings to a known contiguous range of pa's
- * Called from arm_vm_init().
- */
-void
-pmap_init_pte_static_page(
-       __unused pmap_t pmap,
-       pt_entry_t * pte_p,
-       pmap_paddr_t pa)
-{
-#if    (__ARM_VMSA__ == 7)
-       unsigned int    i;
-       pt_entry_t      *pte_cur;
-
-       for (i = 0, pte_cur = pte_p;
-            i < (ARM_PGBYTES / sizeof(*pte_p));
-            i++, pa += PAGE_SIZE) {
-               if (pa >= avail_end) {
-                       /* We don't want to map memory xnu does not own through this routine. */
-                       break;
-               }
-
-               *pte_cur = pa_to_pte(pa)
-                          | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_SH | ARM_PTE_AP(AP_RONA)
-                          | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
-               pte_cur++;
-       }
-#else
-       unsigned int    i;
-       pt_entry_t      *pte_cur;
-       pt_entry_t      template;
-
-       template = ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_AP(AP_RONA) | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT) | ARM_PTE_NX;
-
-       for (i = 0, pte_cur = pte_p;
-            i < (ARM_PGBYTES / sizeof(*pte_p));
-            i++, pa += PAGE_SIZE) {
-               if (pa >= avail_end) {
-                       /* We don't want to map memory xnu does not own through this routine. */
-                       break;
-               }
-
-               /* TEST_PAGE_RATIO_4 may be pre-processor defined to 0 */
-               __unreachable_ok_push
-               if (TEST_PAGE_RATIO_4) {
-                       *pte_cur = pa_to_pte(pa) | template;
-                       *(pte_cur+1) = pa_to_pte(pa+0x1000) | template;
-                       *(pte_cur+2) = pa_to_pte(pa+0x2000) | template;
-                       *(pte_cur+3) = pa_to_pte(pa+0x3000) | template;
-                       pte_cur += 4;
-               } else {
-                       *pte_cur = pa_to_pte(pa) | template;
-                       pte_cur++;
-               }
-               __unreachable_ok_pop
-       }
-#endif
-       bzero(pte_cur, ARM_PGBYTES - ((vm_offset_t)pte_cur - (vm_offset_t)pte_p));
+       __builtin_arm_dmb(DMB_ISHST);
+       ptd_init(ptdp, pmap, va, ttlevel, pte_p);
 }
 
-
 /*
  *     Routine:        pmap_expand
  *
@@ -6405,7 +6650,7 @@ pmap_expand(
                        break;
                }
 
-               simple_lock(&pmap->tt1_lock);
+               pmap_simple_lock(&pmap->tt1_lock);
                for (i = 0; i < pmap->tte_index_max; i++)
                        tte_p[i] = pmap->tte[i];
                for (i = NTTES; i < 2*NTTES; i++)
@@ -6414,23 +6659,18 @@ pmap_expand(
                pmap->prev_tte = pmap->tte;
                pmap->tte = tte_p;
                pmap->ttep = ml_static_vtop((vm_offset_t)pmap->tte);
-#ifndef  __ARM_L1_PTW__
-               CleanPoU_DcacheRegion((vm_offset_t) pmap->tte, 2*NTTES * sizeof(tt_entry_t));
-#else
-               __builtin_arm_dsb(DSB_ISH);
-#endif
+
+               FLUSH_PTE_RANGE(pmap->tte, pmap->tte + (2*NTTES));
+
                pmap->tte_index_max = 2*NTTES;
                pmap->stamp = hw_atomic_add(&pmap_stamp, 1);
 
                for (i = 0; i < NTTES; i++)
                        pmap->prev_tte[i] = ARM_TTE_TYPE_FAULT;
-#ifndef  __ARM_L1_PTW__
-               CleanPoU_DcacheRegion((vm_offset_t) pmap->prev_tte, NTTES * sizeof(tt_entry_t));
-#else
-               __builtin_arm_dsb(DSB_ISH);
-#endif
 
-               simple_unlock(&pmap->tt1_lock);
+               FLUSH_PTE_RANGE(pmap->prev_tte, pmap->prev_tte + NTTES);
+
+               pmap_simple_unlock(&pmap->tt1_lock);
                PMAP_UNLOCK(pmap);
                pmap_set_pmap(pmap, current_thread());
 
@@ -6460,9 +6700,9 @@ pmap_expand(
                if (pa) {
                        tte_p =  &pmap->tte[ttenum(v)];
                        *tte_p =  pa_to_tte(pa) | (((v >> ARM_TT_L1_SHIFT) & 0x3) << 10) | ARM_TTE_TYPE_TABLE;
-#ifndef  __ARM_L1_PTW__
-                       CleanPoU_DcacheRegion((vm_offset_t) tte_p, sizeof(tt_entry_t));
-#endif
+                       FLUSH_PTE(tte_p);
+                       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
+                           VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
                        PMAP_UNLOCK(pmap);
                        return (KERN_SUCCESS);
                }
@@ -6497,12 +6737,13 @@ pmap_expand(
                        tte_p = &pmap->tte[ttenum(v)];
                        for (i = 0, tte_next_p = tte_p; i<4; i++) {
                                *tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE;
+                               PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + (i * ARM_TT_L1_SIZE)),
+                                   VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + ((i + 1) * ARM_TT_L1_SIZE)), *tte_p);
                                tte_next_p++;
                                pa = pa +0x400;
                        }
-#ifndef  __ARM_L1_PTW__
-                       CleanPoU_DcacheRegion((vm_offset_t) tte_p, 4*sizeof(tt_entry_t));
-#endif
+                       FLUSH_PTE_RANGE(tte_p, tte_p + 4);
+
                        pa = 0x0ULL;
                        tt_p = (tt_entry_t *)NULL;
                }
@@ -6547,6 +6788,8 @@ pmap_expand(
                                        pa = kvtophys((vm_offset_t)tt_p);
                                        tte_p = pmap_tt1e( pmap, v);
                                        *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
+                                       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
+                                               VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
                                        pa = 0x0ULL;
                                        tt_p = (tt_entry_t *)NULL;
                                        if ((pmap == kernel_pmap) && (VM_MIN_KERNEL_ADDRESS < 0x00000000FFFFFFFFULL))
@@ -6569,6 +6812,8 @@ pmap_expand(
                                        pa = kvtophys((vm_offset_t)tt_p);
                                        tte_p = pmap_tt2e( pmap, v);
                                        *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
+                                       PMAP_TRACE(3, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L2_OFFMASK),
+                                               VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L2_OFFMASK) + ARM_TT_L2_SIZE), *tte_p);
                                        pa = 0x0ULL;
                                        tt_p = (tt_entry_t *)NULL;
                                }
@@ -6632,38 +6877,39 @@ pmap_gc(
             pmap_gc_forced)) {
                pmap_gc_forced = FALSE;
                pmap_gc_allowed_by_time_throttle = FALSE;
-               simple_lock(&pmaps_lock);
+               pmap_simple_lock(&pmaps_lock);
                pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&map_pmap_list));
                while (!queue_end(&map_pmap_list, (queue_entry_t)pmap)) {
                        if (!(pmap->gc_status & PMAP_GC_INFLIGHT))
                                pmap->gc_status |= PMAP_GC_INFLIGHT;
-                       simple_unlock(&pmaps_lock);
+                       pmap_simple_unlock(&pmaps_lock);
 
                        pmap_collect(pmap);
 
-                       simple_lock(&pmaps_lock);
+                       pmap_simple_lock(&pmaps_lock);
                        gc_wait = (pmap->gc_status & PMAP_GC_WAIT);
                        pmap->gc_status &= ~(PMAP_GC_INFLIGHT|PMAP_GC_WAIT);
                        pmap_next = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&pmap->pmaps));
                        if (gc_wait) {
                                if (!queue_end(&map_pmap_list, (queue_entry_t)pmap_next))
                                        pmap_next->gc_status |= PMAP_GC_INFLIGHT;
-                               simple_unlock(&pmaps_lock);
+                               pmap_simple_unlock(&pmaps_lock);
                                thread_wakeup((event_t) & pmap->gc_status);
-                               simple_lock(&pmaps_lock);
+                               pmap_simple_lock(&pmaps_lock);
                        }
                        pmap = pmap_next;
                }
-               simple_unlock(&pmaps_lock);
+               pmap_simple_unlock(&pmaps_lock);
        }
 }
 
 /*
  * Called by the VM to reclaim pages that we can reclaim quickly and cheaply.
  */
-void
+uint64_t
 pmap_release_pages_fast(void)
 {
+       return 0;
 }
 
 /*
@@ -6776,15 +7022,15 @@ mapping_set_ref(
 }
 
 /*
- *     Clear specified attribute bits.
+ * Clear specified attribute bits.
  *
- *             Try to force an arm_fast_fault() for all mappings of
- *     the page - to force attributes to be set again at fault time.
- *  If the forcing succeeds, clear the cached bits at the head.
- *  Otherwise, something must have been wired, so leave the cached
- *  attributes alone.
+ * Try to force an arm_fast_fault() for all mappings of
+ * the page - to force attributes to be set again at fault time.
+ * If the forcing succeeds, clear the cached bits at the head.
+ * Otherwise, something must have been wired, so leave the cached
+ * attributes alone.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 phys_attribute_clear_internal(
        ppnum_t         pn,
        unsigned int    bits,
@@ -6804,6 +7050,21 @@ phys_attribute_clear_internal(
        }
 
        assert(pn != vm_page_fictitious_addr);
+
+       if (options & PMAP_OPTIONS_CLEAR_WRITE) {
+               assert(bits == PP_ATTR_MODIFIED);
+               
+               pmap_page_protect_options_internal(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), 0);
+               /*
+                * We short circuit this case; it should not need to
+                * invoke arm_force_fast_fault, so just clear the modified bit.
+                * pmap_page_protect has taken care of resetting
+                * the state so that we'll see the next write as a fault to
+                * the VM (i.e. we don't want a fast fault).
+                */
+               pa_clear_bits(pa, bits);
+               return;
+       }
        if (bits & PP_ATTR_REFERENCED)
                allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
        if (bits & PP_ATTR_MODIFIED)
@@ -6835,11 +7096,11 @@ phys_attribute_clear(
         * Do we really want this tracepoint?  It will be extremely chatty.
         * Also, should we have a corresponding trace point for the set path?
         */
-       PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
+       PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
 
        phys_attribute_clear_internal(pn, bits, options, arg);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
+       PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
 }
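
The PMAP_OPTIONS_CLEAR_WRITE path added to phys_attribute_clear_internal() clears the modified bit by write-protecting every mapping of the page up front, so the next store takes an ordinary protection fault into the VM rather than arming a software fast fault. A stand-alone sketch of that short circuit with stub names (the real call is pmap_page_protect_options_internal(pn, VM_PROT_ALL & ~VM_PROT_WRITE, 0)):

    #include <stdio.h>

    #define ATTR_MODIFIED 0x1u                      /* stand-in for PP_ATTR_MODIFIED */

    struct page_attr { unsigned bits; };

    /* Stand-in for pmap_page_protect_options_internal(pn, VM_PROT_ALL & ~VM_PROT_WRITE, 0). */
    static void
    write_protect_all_mappings(unsigned int pn)
    {
            printf("page %u: every mapping downgraded to read-only\n", pn);
    }

    static void
    clear_modified(struct page_attr *pa, unsigned int pn)
    {
            write_protect_all_mappings(pn);         /* next write faults to the VM     */
            pa->bits &= ~ATTR_MODIFIED;             /* safe to drop the cached bit now */
    }

    int
    main(void)
    {
            struct page_attr pa = { .bits = ATTR_MODIFIED };
            clear_modified(&pa, 42);
            printf("modified=%u\n", pa.bits & ATTR_MODIFIED);
            return 0;
    }
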
 
 /*
@@ -6849,7 +7110,7 @@ phys_attribute_clear(
  *     no per-mapping hardware support for referenced and
  *     modify bits.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 phys_attribute_set_internal(
        ppnum_t pn,
        unsigned int bits)
@@ -7122,7 +7383,7 @@ pmap_lock_phys_page(ppnum_t pn)
                pai = (int)pa_index(phys);
                LOCK_PVH(pai);
        } else
-               simple_lock(&phys_backup_lock);
+       simple_lock(&phys_backup_lock);
 }
 
 
@@ -7136,18 +7397,19 @@ pmap_unlock_phys_page(ppnum_t pn)
                pai = (int)pa_index(phys);
                UNLOCK_PVH(pai);
        } else
-               simple_unlock(&phys_backup_lock);
+       simple_unlock(&phys_backup_lock);
 }
 
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_switch_user_ttb_internal(
        pmap_t pmap)
 {
-#if    (__ARM_VMSA__ == 7)
+       VALIDATE_PMAP(pmap);
        pmap_cpu_data_t *cpu_data_ptr;
-
        cpu_data_ptr = pmap_get_cpu_data();
 
+#if    (__ARM_VMSA__ == 7)
+
        if ((cpu_data_ptr->cpu_user_pmap != PMAP_NULL)
            && (cpu_data_ptr->cpu_user_pmap != kernel_pmap)) {
                unsigned int    c;
@@ -7180,7 +7442,7 @@ pmap_switch_user_ttb_internal(
        if (pmap->tte_index_max == NTTES) {
                /* Setting TTBCR.N for TTBR0 TTBR1 boundary at  0x40000000 */
                __asm__ volatile("mcr   p15,0,%0,c2,c0,2" : : "r"(2));
-               __asm__ volatile("isb");
+               __builtin_arm_isb(ISB_SY);
 #if !__ARM_USER_PROTECT__
                set_mmu_ttb(pmap->ttep);
 #endif
@@ -7190,7 +7452,7 @@ pmap_switch_user_ttb_internal(
 #endif
                /* Setting TTBCR.N for TTBR0 TTBR1 boundary at  0x80000000 */
                __asm__ volatile("mcr   p15,0,%0,c2,c0,2" : : "r"(1));
-               __asm__ volatile("isb");
+               __builtin_arm_isb(ISB_SY);
 #if    MACH_ASSERT && __ARM_USER_PROTECT__
                if (pmap->ttep & 0x1000) {
                        panic("Misaligned ttbr0  %08X\n", pmap->ttep);
@@ -7201,16 +7463,14 @@ pmap_switch_user_ttb_internal(
 #if !__ARM_USER_PROTECT__
        set_context_id(pmap->asid);
 #endif
-#else
 
-       pmap_get_cpu_data()->cpu_user_pmap = pmap;
-       pmap_get_cpu_data()->cpu_user_pmap_stamp = pmap->stamp;
+#else /* (__ARM_VMSA__ == 7) */
+
+       if (pmap != kernel_pmap)
+               cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;      
 
-#if !__arm64__
-       set_context_id(pmap->asid); /* Not required */
-#endif
        if (pmap == kernel_pmap) {
-               set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
+               pmap_clear_user_ttb_internal();
        } else {
                set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK)|(((uint64_t)pmap->asid) << TTBR_ASID_SHIFT));
        }
@@ -7221,52 +7481,26 @@ void
 pmap_switch_user_ttb(
        pmap_t pmap)
 {
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), pmap->vasid, pmap->asid);
        pmap_switch_user_ttb_internal(pmap);
+       PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH_USER_TTB) | DBG_FUNC_END);
 }
 
-/*
- * Try to "intuit" whether we need to raise a VM_PROT_WRITE fault
- * for the given address when a "swp" instruction raised the fault.
- * We have to look at the existing pte for the address to see
- * if it needs to get bumped, or just added. If just added, do it
- * as a read-only mapping first (this could result in extra faults -
- * but better that than extra copy-on-write evaluations).
- */
-
-#if    (__ARM_VMSA__ == 7)
-boolean_t
-arm_swap_readable_type(
-       vm_map_address_t addr,
-       unsigned int spsr)
+MARK_AS_PMAP_TEXT static void
+pmap_clear_user_ttb_internal(void)
 {
-       int             ap;
-       pt_entry_t      spte;
-       pt_entry_t     *ptep;
-
-       ptep = pmap_pte(current_pmap(), addr);
-       if (ptep == PT_ENTRY_NULL)
-               return (FALSE);
-
-       spte = *ptep;
-       if (spte == ARM_PTE_TYPE_FAULT ||
-           ARM_PTE_IS_COMPRESSED(spte))
-               return (FALSE);
-
-       /* get the access permission bitmaps */
-       /* (all subpages should be the same) */
-       ap = (spte & ARM_PTE_APMASK);
+#if (__ARM_VMSA__ > 7)
+       set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
+#else
+       set_mmu_ttb(kernel_pmap->ttep);
+#endif
+}
 
-       if (spsr & 0xf) {       /* Supervisor mode */
-               panic("arm_swap_readable_type supv");
-               return TRUE;
-       } else {                /* User mode */
-               if ((ap == ARM_PTE_AP(AP_RWRW)) || (ap == ARM_PTE_AP(AP_RORO)))
-                       return (FALSE);
-               else
-                       return (TRUE);
-       }
+void
+pmap_clear_user_ttb(void)
+{
+       pmap_clear_user_ttb_internal();
 }
-#endif
 
 /*
  *     Routine:        arm_force_fast_fault
@@ -7276,7 +7510,7 @@ arm_swap_readable_type(
  *             to the access modes allowed, so we can gather ref/modify
  *             bits again.
  */
-static boolean_t
+MARK_AS_PMAP_TEXT static boolean_t
 arm_force_fast_fault_internal(
        ppnum_t         ppnum,
        vm_prot_t       allow_mode,
@@ -7289,6 +7523,7 @@ arm_force_fast_fault_internal(
        boolean_t       result;
        pv_entry_t    **pv_h;
        boolean_t       is_reusable, is_internal;
+       boolean_t       tlb_flush_needed = FALSE;
        boolean_t       ref_fault;
        boolean_t       mod_fault;
 
@@ -7318,10 +7553,10 @@ arm_force_fast_fault_internal(
 
        while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
                vm_map_address_t va;
-               pt_entry_t              spte;
-               pt_entry_t      tmplate;
-               pmap_t          pmap;
-               boolean_t       update_pte;
+               pt_entry_t       spte;
+               pt_entry_t       tmplate;
+               pmap_t           pmap;
+               boolean_t        update_pte;
 
                if (pve_p != PV_ENTRY_NULL)
                        pte_p = pve_get_ptep(pve_p);
@@ -7329,6 +7564,11 @@ arm_force_fast_fault_internal(
                if (pte_p == PT_ENTRY_NULL) {
                        panic("pte_p is NULL: pve_p=%p ppnum=0x%x\n", pve_p, ppnum);
                }
+#ifdef PVH_FLAG_IOMMU
+               if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) {
+                       goto fff_skip_pve;
+               } 
+#endif
                if (*pte_p == ARM_PTE_EMPTY) {
                        panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum);
                }
@@ -7377,11 +7617,12 @@ arm_force_fast_fault_internal(
                if (update_pte) {
                        if (*pte_p != ARM_PTE_TYPE_FAULT &&
                            !ARM_PTE_IS_COMPRESSED(*pte_p)) {
-                               WRITE_PTE(pte_p, tmplate);
-                               PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
+                               WRITE_PTE_STRONG(pte_p, tmplate);
+                               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+                               tlb_flush_needed = TRUE;
                        } else {
                                WRITE_PTE(pte_p, tmplate);
-                               __asm__ volatile("isb");
+                               __builtin_arm_isb(ISB_SY);
                        }
                }
 
@@ -7402,14 +7643,10 @@ arm_force_fast_fault_internal(
                        OSAddAtomic(+1, &pmap->stats.internal);
                        PMAP_STATS_PEAK(pmap->stats.internal);
                        PMAP_STATS_ASSERTF(pmap->stats.internal > 0, pmap, "stats.internal %d", pmap->stats.internal);
-                       pmap_ledger_credit(pmap,
-                                          task_ledgers.internal,
-                                          machine_ptob(1));
+                       pmap_ledger_credit(pmap, task_ledgers.internal, machine_ptob(1));
                        assert(!IS_ALTACCT_PAGE(pai, pve_p));
                        assert(IS_INTERNAL_PAGE(pai));
-                       pmap_ledger_credit(pmap,
-                                          task_ledgers.phys_footprint,
-                                          machine_ptob(1));
+                       pmap_ledger_credit(pmap, task_ledgers.phys_footprint, machine_ptob(1));
 
                        /*
                         * Avoid the cost of another trap to handle the fast
@@ -7431,21 +7668,23 @@ arm_force_fast_fault_internal(
                        /* one less "internal" */
                        PMAP_STATS_ASSERTF(pmap->stats.internal > 0, pmap, "stats.internal %d", pmap->stats.internal);
                        OSAddAtomic(-1, &pmap->stats.internal);
-                       pmap_ledger_debit(pmap,
-                                         task_ledgers.internal,
-                                         machine_ptob(1));
+                       pmap_ledger_debit(pmap, task_ledgers.internal, machine_ptob(1));
                        assert(!IS_ALTACCT_PAGE(pai, pve_p));
                        assert(IS_INTERNAL_PAGE(pai));
-                       pmap_ledger_debit(pmap,
-                                         task_ledgers.phys_footprint,
-                                         machine_ptob(1));
+                       pmap_ledger_debit(pmap, task_ledgers.phys_footprint, machine_ptob(1));
                }
 
+#ifdef PVH_FLAG_IOMMU
+       fff_skip_pve:
+#endif
                pte_p = PT_ENTRY_NULL;
                if (pve_p != PV_ENTRY_NULL)
                        pve_p = PVE_NEXT_PTR(pve_next(pve_p));
        }
 
+       if (tlb_flush_needed)
+               sync_tlb_flush();
+
        /* update global "reusable" status for this page */
        if (is_internal) {
                if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
@@ -7503,6 +7742,7 @@ arm_clear_fast_fault(
        pt_entry_t     *pte_p;
        int             pai;
        boolean_t       result;
+       boolean_t       tlb_flush_needed = FALSE;
        pv_entry_t    **pv_h;
 
        assert(ppnum != vm_page_fictitious_addr);
@@ -7536,6 +7776,11 @@ arm_clear_fast_fault(
                if (pte_p == PT_ENTRY_NULL) {
                        panic("pte_p is NULL: pve_p=%p ppnum=0x%x\n", pve_p, ppnum);
                }
+#ifdef PVH_FLAG_IOMMU
+               if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU) {
+                       goto cff_skip_pve;
+               } 
+#endif
                if (*pte_p == ARM_PTE_EMPTY) {
                        panic("pte is NULL: pte_p=%p ppnum=0x%x\n", pte_p, ppnum);
                }
@@ -7572,19 +7817,25 @@ arm_clear_fast_fault(
 
                if (spte != tmplate) {
                        if (spte != ARM_PTE_TYPE_FAULT) {
-                               WRITE_PTE(pte_p, tmplate);
-                               PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
+                               WRITE_PTE_STRONG(pte_p, tmplate);
+                               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+                               tlb_flush_needed = TRUE;
                        } else {
                                WRITE_PTE(pte_p, tmplate);
-                               __asm__ volatile("isb");
+                               __builtin_arm_isb(ISB_SY);
                        }
                        result = TRUE;
                }
 
+#ifdef PVH_FLAG_IOMMU
+       cff_skip_pve:
+#endif
                pte_p = PT_ENTRY_NULL;
                if (pve_p != PV_ENTRY_NULL)
                        pve_p = PVE_NEXT_PTR(pve_next(pve_p));
        }
+       if (tlb_flush_needed)
+               sync_tlb_flush();
        return result;
 }
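
arm_force_fast_fault_internal() and arm_clear_fast_fault() together emulate per-page referenced/modified tracking: the first strips access (now batching its TLB invalidates and syncing once, as above) so the next touch traps, and the second restores access on that fault while the attribute bits are latched. A rough stand-alone model of the cycle, with stand-in fields rather than the kernel structures:

    #include <stdbool.h>
    #include <stdio.h>

    struct page_state {
            bool readable, writable;        /* effective mapping permissions  */
            bool referenced, modified;      /* software-cached attribute bits */
    };

    static void
    force_fast_fault(struct page_state *p)  /* ~ arm_force_fast_fault_internal */
    {
            p->readable = p->writable = false;      /* next access will fault  */
    }

    static void
    fast_fault(struct page_state *p, bool is_write) /* ~ arm_fast_fault + arm_clear_fast_fault */
    {
            p->referenced = true;                   /* any access references the page */
            p->readable = true;
            if (is_write) {
                    p->modified = true;             /* stores also dirty it           */
                    p->writable = true;
            }
    }

    int
    main(void)
    {
            struct page_state p = { false, false, false, false };
            force_fast_fault(&p);
            fast_fault(&p, true);           /* a store restores RW and sets ref+mod */
            printf("ref=%d mod=%d\n", p.referenced, p.modified);
            return 0;
    }
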
 
@@ -7602,7 +7853,7 @@ arm_clear_fast_fault(
  * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
  * disallows this type of access.
  */
-static kern_return_t
+MARK_AS_PMAP_TEXT static kern_return_t
 arm_fast_fault_internal(
        pmap_t pmap,
        vm_map_address_t va,
@@ -7615,6 +7866,8 @@ arm_fast_fault_internal(
        int             pai;
        pmap_paddr_t    pa;
 
+       VALIDATE_PMAP(pmap);
+
        PMAP_LOCK(pmap);
 
        /*
@@ -7629,12 +7882,15 @@ arm_fast_fault_internal(
                pa = pte_to_pa(spte);
 
                if ((spte == ARM_PTE_TYPE_FAULT) ||
-                   ARM_PTE_IS_COMPRESSED(spte) ||
-                   (!pa_valid(pa))) {
-                               PMAP_UNLOCK(pmap);
-                               return result;
+                   ARM_PTE_IS_COMPRESSED(spte)) {
+                       PMAP_UNLOCK(pmap);
+                       return result;
                }
 
+               if (!pa_valid(pa)) {
+                       PMAP_UNLOCK(pmap);
+                       return result;

+               }
                pai = (int)pa_index(pa);
                LOCK_PVH(pai);
        } else {
@@ -7689,7 +7945,7 @@ arm_fast_fault(
        if (va < pmap->min || va >= pmap->max)
                return result;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
+       PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
                   from_user);
 
@@ -7718,7 +7974,7 @@ arm_fast_fault(
 done:
 #endif
 
-       PMAP_TRACE(PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
+       PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
 
        return result;
 }
@@ -7815,10 +8071,12 @@ pmap_map_globals(
 vm_offset_t
 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
 {
+       if (__improbable(index >= CPUWINDOWS_MAX))
+               panic("%s: invalid index %u", __func__, index);
        return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
 }
 
-static unsigned int
+MARK_AS_PMAP_TEXT static unsigned int
 pmap_map_cpu_windows_copy_internal(
        ppnum_t pn,
        vm_prot_t prot,
@@ -7855,11 +8113,12 @@ pmap_map_cpu_windows_copy_internal(
                pte |= ARM_PTE_AP(AP_RONA);
        }
 
-       WRITE_PTE(ptep, pte);
+       WRITE_PTE_FAST(ptep, pte);
        /*
         * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
         * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
         */
+       FLUSH_PTE_STRONG(ptep);
        PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE);
 
        return(i);
@@ -7874,7 +8133,7 @@ pmap_map_cpu_windows_copy(
        return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
 }
 
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_unmap_cpu_windows_copy_internal(
        unsigned int index)
 {
@@ -7885,9 +8144,12 @@ pmap_unmap_cpu_windows_copy_internal(
        cpu_num = pmap_get_cpu_data()->cpu_number;
 
        cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
-       __asm__ volatile("dsb sy");
+       /* Issue full-system DSB to ensure prior operations on the per-CPU window
+        * (which are likely to have been on I/O memory) are complete before
+        * tearing down the mapping. */
+       __builtin_arm_dsb(DSB_SY);
        ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
-       WRITE_PTE(ptep, ARM_PTE_TYPE_FAULT);
+       WRITE_PTE_STRONG(ptep, ARM_PTE_TYPE_FAULT);
        PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE);
 }
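
A hypothetical caller of the per-CPU copy window, using the three routines defined in this file; the bcopy in the middle is illustrative only, and the sketch assumes the caller already runs pinned to the CPU whose window it borrows, as the kernel's real users do:

    /* Hypothetical helper, not from the kernel source. */
    static void
    copy_phys_page_to_buffer(ppnum_t src_pn, void *dst)
    {
            unsigned int index;
            vm_offset_t  window_va;

            index = pmap_map_cpu_windows_copy(src_pn, VM_PROT_READ,
                                              pmap_cache_attributes(src_pn));
            window_va = pmap_cpu_windows_copy_addr(pmap_get_cpu_data()->cpu_number, index);

            bcopy((const void *)window_va, dst, PAGE_SIZE);  /* use the temporary mapping   */

            pmap_unmap_cpu_windows_copy(index);     /* DSB, PTE teardown, TLB invalidate    */
    }
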
 
@@ -7899,12 +8161,15 @@ pmap_unmap_cpu_windows_copy(
 }
 
 /*
- * Marked a pmap has nested
+ * Indicate that a pmap is intended to be used as a nested pmap
+ * within one or more larger address spaces.  This must be set
+ * before pmap_nest() is called with this pmap as the 'subordinate'.
  */
-static void
+MARK_AS_PMAP_TEXT static void
 pmap_set_nested_internal(
        pmap_t pmap)
 {
+       VALIDATE_PMAP(pmap);
        pmap->nested = TRUE;
 }
 
@@ -7915,6 +8180,357 @@ pmap_set_nested(
        pmap_set_nested_internal(pmap);
 }
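
A hypothetical call sequence following the comment above: the subordinate pmap must be flagged as nested before pmap_nest() installs it into a task's pmap. Here task_pmap, shared_pmap, base, and size are placeholders for values the VM shared-region layer would supply:

    kern_return_t kr;

    pmap_set_nested(shared_pmap);               /* mark the subordinate nestable first */
    kr = pmap_nest(task_pmap, shared_pmap,      /* grand, subord                       */
                   base,                        /* vstart: address in the task map     */
                   base,                        /* nstart: same address in the subord  */
                   size);                       /* size of the shared window           */
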
 
+/*
+ * pmap_trim_range(pmap, start, end)
+ *
+ * pmap  = pmap to operate on
+ * start = start of the range
+ * end   = end of the range
+ *
+ * Attempts to deallocate TTEs for the given range within the pmap's nested region.
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_trim_range(
+       pmap_t pmap,
+       addr64_t start,
+       addr64_t end)
+{
+       addr64_t cur;
+       addr64_t nested_region_start;
+       addr64_t nested_region_end;
+       addr64_t adjusted_start;
+       addr64_t adjusted_end;
+       addr64_t adjust_offmask;
+       tt_entry_t * tte_p;
+       pt_entry_t * pte_p;
+
+       if (__improbable(end < start)) {
+               panic("%s: invalid address range, "
+                      "pmap=%p, start=%p, end=%p",
+                      __func__,
+                      pmap, (void*)start, (void*)end);
+       }
+
+       nested_region_start = pmap->nested ? pmap->nested_region_subord_addr : pmap->nested_region_subord_addr;
+       nested_region_end = nested_region_start + pmap->nested_region_size;
+
+       if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
+               panic("%s: range outside nested region %p-%p, "
+                      "pmap=%p, start=%p, end=%p",
+                      __func__, (void *)nested_region_start, (void *)nested_region_end,
+                      pmap, (void*)start, (void*)end);
+       }
+
+       /* Contract the range to TT page boundaries. */
+#if (__ARM_VMSA__ > 7)
+       adjust_offmask = ARM_TT_TWIG_OFFMASK;
+#else /* (__ARM_VMSA__ > 7) */
+       adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1);
+#endif /* (__ARM_VMSA__ > 7) */
+
+       adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
+       adjusted_end = end & ~adjust_offmask;
+
+       /* Iterate over the range, trying to remove TTEs. */
+       for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_TWIG_SIZE) {
+               bool modified = false;
+
+               PMAP_LOCK(pmap);
+
+               tte_p = pmap_tte(pmap, cur);
+
+               if (tte_p == (tt_entry_t *) NULL) {
+                       goto done;
+               }
+
+               if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
+                       pte_p = (pt_entry_t *) ttetokv(*tte_p);
+
+#if (__ARM_VMSA__ == 7)
+                       if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
+                           (pmap != kernel_pmap)) {
+                               if (pmap->nested == TRUE) {
+                                       /* Deallocate for the nested map. */
+                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L1_LEVEL);
+                               } else {
+                                       /* Just remove for the parent map. */
+                                       pmap_tte_remove(pmap, tte_p, PMAP_TT_L1_LEVEL);
+                               }
+
+                               flush_mmu_tlb_entry((cur & ~ARM_TT_L1_OFFMASK) | (pmap->asid & 0xff));
+                               modified = true;
+                       }
+#else
+                       if ((ptep_get_ptd(pte_p)->pt_cnt[ARM_PT_DESC_INDEX(pte_p)].refcnt == 0) &&
+                          (pmap != kernel_pmap)) {
+                               if (pmap->nested == TRUE) {
+                                       /* Deallocate for the nested map. */
+                                       pmap_tte_deallocate(pmap, tte_p, PMAP_TT_L2_LEVEL);
+                               } else {
+                                       /* Just remove for the parent map. */
+                                       pmap_tte_remove(pmap, tte_p, PMAP_TT_L2_LEVEL);
+                               }
+
+                               flush_mmu_tlb_entry(tlbi_addr(cur & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+                               modified = true;
+                       }
+#endif
+               }
+
+done:
+               PMAP_UNLOCK(pmap);
+
+               if (modified) {
+                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE);
+               }
+       }
+
+#if (__ARM_VMSA__ > 7)
+       /* Remove empty L2 TTs. */
+       adjusted_start = ((start + ARM_TT_L1_OFFMASK) & ~ARM_TT_L1_OFFMASK);
+       adjusted_end = end & ~ARM_TT_L1_OFFMASK;
+
+       for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += ARM_TT_L1_SIZE) {
+               /* For each L1 entry in our range... */
+               PMAP_LOCK(pmap);
+
+               bool remove_tt1e = true;
+               tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
+               tt_entry_t * tt2e_start;
+               tt_entry_t * tt2e_end;
+               tt_entry_t * tt2e_p;
+               tt_entry_t tt1e;
+
+               if (tt1e_p == NULL) {
+                       PMAP_UNLOCK(pmap);
+                       continue;
+               }
+
+               tt1e = *tt1e_p;
+
+               if (tt1e == ARM_TTE_TYPE_FAULT) {
+                       PMAP_UNLOCK(pmap);
+                       continue;
+               }
+
+               tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
+               tt2e_end = &tt2e_start[TTE_PGENTRIES];
+
+               for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
+                       if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
+                               /*
+                                * If any TTEs are populated, don't remove the
+                                * L1 TT.
+                                */
+                               remove_tt1e = false;
+                       }
+               }
+
+               if (remove_tt1e) {
+                       pmap_tte_deallocate(pmap, tt1e_p, PMAP_TT_L1_LEVEL);
+                       PMAP_UPDATE_TLBS(pmap, cur, cur + PAGE_SIZE);
+               }
+
+               PMAP_UNLOCK(pmap);
+       }
+#endif /* (__ARM_VMSA__ > 7) */
+}
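
The boundary contraction at the top of pmap_trim_range() rounds the start up and the end down to twig-table boundaries, so only page tables lying entirely inside [start, end) become candidates for removal. A self-contained illustration of that arithmetic; the 2MB twig size is an assumption (ARM_TT_TWIG_SIZE for 4K pages on arm64), not taken from this diff:

    #include <inttypes.h>
    #include <stdio.h>

    int
    main(void)
    {
            uint64_t twig_size      = 0x200000ULL;          /* assumed ARM_TT_TWIG_SIZE */
            uint64_t adjust_offmask = twig_size - 1;        /* ARM_TT_TWIG_OFFMASK      */
            uint64_t start          = 0x180123000ULL;
            uint64_t end            = 0x1805ff000ULL;

            uint64_t adjusted_start = (start + adjust_offmask) & ~adjust_offmask; /* round up   */
            uint64_t adjusted_end   = end & ~adjust_offmask;                      /* round down */

            /* Prints 0x180200000 and 0x180400000: exactly one whole 2MB block survives. */
            printf("adjusted_start=0x%" PRIx64 " adjusted_end=0x%" PRIx64 "\n",
                   adjusted_start, adjusted_end);
            return 0;
    }
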
+
+/*
+ * pmap_trim_internal(grand, subord, vstart, nstart, size)
+ *
+ * grand  = pmap subord is nested in
+ * subord = nested pmap
+ * vstart = start of the used range in grand
+ * nstart = start of the used range in subord
+ * size   = size of the used range
+ *
+ * Attempts to trim the shared region page tables down to only cover the given
+ * range in subord and grand.
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_trim_internal(
+       pmap_t grand,
+       pmap_t subord,
+       addr64_t vstart,
+       addr64_t nstart,
+       uint64_t size)
+{
+       addr64_t vend, nend;
+       addr64_t adjust_offmask;
+
+       if (__improbable(os_add_overflow(vstart, size, &vend))) {
+               panic("%s: grand addr wraps around, "
+                     "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                     __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+       }
+
+       if (__improbable(os_add_overflow(nstart, size, &nend))) {
+               panic("%s: nested addr wraps around, "
+                     "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                     __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+       }
+
+       VALIDATE_PMAP(grand);
+       VALIDATE_PMAP(subord);
+
+       PMAP_LOCK(subord);
+
+       if (!subord->nested) {
+               panic("%s: subord is not nestable, "
+                     "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                     __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+       }
+
+       if (grand->nested) {
+               panic("%s: grand is nestable, "
+                     "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                     __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+       }
+
+       if (grand->nested_pmap != subord) {
+               panic("%s: grand->nested != subord, "
+                     "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                     __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+       }
+
+       if (size != 0) {
+               if ((vstart < grand->nested_region_grand_addr) || (vend > (grand->nested_region_grand_addr + grand->nested_region_size))) {
+                       panic("%s: grand range not in nested region, "
+                             "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                             __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+               }
+
+               if ((nstart < grand->nested_region_grand_addr) || (nend > (grand->nested_region_grand_addr + grand->nested_region_size))) {
+                       panic("%s: subord range not in nested region, "
+                             "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                             __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+               }
+       }
+
+
+       if (!grand->nested_has_no_bounds_ref) {
+               assert(subord->nested_bounds_set);
+
+               if (!grand->nested_bounds_set) {
+                       /* Inherit the bounds from subord. */
+                       grand->nested_region_true_start = (subord->nested_region_true_start - grand->nested_region_subord_addr) + grand->nested_region_grand_addr;
+                       grand->nested_region_true_end = (subord->nested_region_true_end - grand->nested_region_subord_addr) + grand->nested_region_grand_addr;
+                       grand->nested_bounds_set = true;
+               }
+
+               PMAP_UNLOCK(subord);
+               return;
+       }
+
+       if ((!subord->nested_bounds_set) && size) {
+#if (__ARM_VMSA__ > 7)
+               adjust_offmask = ARM_TT_TWIG_OFFMASK;
+#else /* (__ARM_VMSA__ > 7) */
+               adjust_offmask = ((ARM_TT_TWIG_SIZE * 4) - 1);
+#endif /* (__ARM_VMSA__ > 7) */
+
+               subord->nested_region_true_start = nstart;
+               subord->nested_region_true_end = nend;
+               subord->nested_region_true_start &= ~adjust_offmask;
+
+               if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
+                       panic("%s: padded true end wraps around, "
+                             "grand=%p, subord=%p, vstart=%p, nstart=%p, size=%#llx",
+                             __func__, grand, subord, (void*)vstart, (void*)nstart, size);
+               }
+
+               subord->nested_region_true_end &= ~adjust_offmask;
+               subord->nested_bounds_set = true;
+       }
+
+       if (subord->nested_bounds_set) {
+               /* Inherit the bounds from subord. */
+               grand->nested_region_true_start = (subord->nested_region_true_start - grand->nested_region_subord_addr) + grand->nested_region_grand_addr;
+               grand->nested_region_true_end = (subord->nested_region_true_end - grand->nested_region_subord_addr) + grand->nested_region_grand_addr;
+               grand->nested_bounds_set = true;
+
+               /* If we know the bounds, we can trim the pmap. */
+               grand->nested_has_no_bounds_ref = false;
+               PMAP_UNLOCK(subord);
+       } else {
+               /* Don't trim if we don't know the bounds. */
+               PMAP_UNLOCK(subord);
+               return;
+       }
+
+       /* Trim grand to only cover the given range. */
+       pmap_trim_range(grand, grand->nested_region_grand_addr, grand->nested_region_true_start);
+       pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_grand_addr + grand->nested_region_size));
+
+       /* Try to trim subord. */
+       pmap_trim_subord(subord);
+}
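The new bounds checks rely on overflow-checked addition so that a size large enough to wrap the address space can never slip past the range comparisons. A minimal sketch of that pattern, assuming os_add_overflow behaves like the compiler builtin used here; the values are illustrative.

/* Illustrative sketch only; not XNU source. */
#include <stdint.h>
#include <stdio.h>

static int range_ok(uint64_t start, uint64_t size, uint64_t lo, uint64_t hi)
{
        uint64_t end;

        if (__builtin_add_overflow(start, size, &end))
                return 0;                       /* start + size wrapped around */
        return (start >= lo) && (end <= hi);
}

int main(void)
{
        printf("%d\n", range_ok(0x1000, 0x2000, 0x0, 0x10000));             /* 1 */
        printf("%d\n", range_ok(UINT64_MAX - 0x10, 0x100, 0x0, 0x10000));   /* 0: wraps */
        return 0;
}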
+
+MARK_AS_PMAP_TEXT static void pmap_trim_self(pmap_t pmap)
+{
+       if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
+               /* If we have a no bounds ref, we need to drop it. */
+               PMAP_LOCK(pmap->nested_pmap);
+               pmap->nested_has_no_bounds_ref = false;
+               boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
+               vm_map_offset_t nested_region_true_start = (pmap->nested_pmap->nested_region_true_start - pmap->nested_region_subord_addr) + pmap->nested_region_grand_addr;
+               vm_map_offset_t nested_region_true_end = (pmap->nested_pmap->nested_region_true_end - pmap->nested_region_subord_addr) + pmap->nested_region_grand_addr;
+               PMAP_UNLOCK(pmap->nested_pmap);
+
+               if (nested_bounds_set) {
+                       pmap_trim_range(pmap, pmap->nested_region_grand_addr, nested_region_true_start);
+                       pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_grand_addr + pmap->nested_region_size));
+               }
+               /*
+                * Try trimming the nested pmap, in case we had the
+                * last reference.
+                */
+               pmap_trim_subord(pmap->nested_pmap);
+       }
+}
+
+/*
+ * pmap_trim_subord(subord)
+ *
+ * subord = nested pmap we are attempting to trim
+ *
+ * Trims subord if possible (i.e., once the last no-bounds reference to it
+ * has been dropped and its true bounds are known).
+ */
+MARK_AS_PMAP_TEXT static void
+pmap_trim_subord(pmap_t subord)
+{
+       bool contract_subord = false;
+
+       PMAP_LOCK(subord);
+
+       subord->nested_no_bounds_refcnt--;
+
+       if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
+               /* If this was the last no bounds reference, trim subord. */
+               contract_subord = true;
+       }
+
+       PMAP_UNLOCK(subord);
+
+       if (contract_subord) {
+               pmap_trim_range(subord, subord->nested_region_subord_addr, subord->nested_region_true_start);
+               pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_subord_addr + subord->nested_region_size);
+       }
+}
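pmap_trim_subord() is a drop-the-last-reference cleanup: the count is decremented under the lock, and the actual trim happens outside the lock only when the count reaches zero and the bounds are known. A hypothetical user-space rendition of the same pattern, with a pthread mutex standing in for the pmap lock.

/* Illustrative sketch only; not XNU source. */
#include <pthread.h>
#include <stdbool.h>

struct nested {
        pthread_mutex_t lock;
        unsigned        no_bounds_refcnt;
        bool            bounds_set;
};

static void do_trim(struct nested *n) { (void)n; /* contract page tables here */ }

static void drop_no_bounds_ref(struct nested *n)
{
        bool contract = false;

        pthread_mutex_lock(&n->lock);
        if (--n->no_bounds_refcnt == 0 && n->bounds_set)
                contract = true;                /* last reference: safe to trim */
        pthread_mutex_unlock(&n->lock);

        if (contract)
                do_trim(n);                     /* heavy work done outside the lock */
}

int main(void)
{
        struct nested n = { PTHREAD_MUTEX_INITIALIZER, 1, true };
        drop_no_bounds_ref(&n);                 /* last ref: do_trim() runs */
        return 0;
}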
+
+void
+pmap_trim(
+       pmap_t grand,
+       pmap_t subord,
+       addr64_t vstart,
+       addr64_t nstart,
+       uint64_t size)
+{
+       pmap_trim_internal(grand, subord, vstart, nstart, size);
+}
+
 /*
  *     kern_return_t pmap_nest(grand, subord, vstart, size)
  *
@@ -7928,7 +8544,7 @@ pmap_set_nested(
  *
  */
 
-static kern_return_t
+MARK_AS_PMAP_TEXT static kern_return_t
 pmap_nest_internal(
        pmap_t grand,
        pmap_t subord,
@@ -7946,6 +8562,14 @@ pmap_nest_internal(
        unsigned int*   nested_region_asid_bitmap;
        int expand_options = 0;
 
+       addr64_t vend, nend;
+       if (__improbable(os_add_overflow(vstart, size, &vend)))
+               panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
+       if (__improbable(os_add_overflow(nstart, size, &nend)))
+               panic("%s: %p nested addr wraps around: 0x%llx + 0x%llx", __func__, subord, nstart, size);
+       VALIDATE_PMAP(grand);
+       VALIDATE_PMAP(subord);
+
 
 #if    (__ARM_VMSA__ == 7)
        if (((size|vstart|nstart) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) {
@@ -7953,10 +8577,13 @@ pmap_nest_internal(
        }
 #else
        if (((size|vstart|nstart) & (ARM_TT_L2_OFFMASK)) != 0x0ULL) {
-               panic("pmap_nest() pmap %p has a nested pmap 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size);
+               panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx\n", grand, vstart, nstart, size);
        }
 #endif
 
+       if (!subord->nested)
+               panic("%s: subordinate pmap %p is not nestable", __func__, subord);
+
        if ((grand->nested_pmap != PMAP_NULL) && (grand->nested_pmap != subord)) {
                panic("pmap_nest() pmap %p has a nested pmap\n", grand);
        }
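The alignment check a few lines above ORs size, vstart and nstart together before masking, so a single test rejects a misalignment in any of the three. A small sketch with an illustrative alignment mask (not the real ARM_TT_L2_OFFMASK value):

/* Illustrative sketch only; not XNU source. */
#include <stdint.h>
#include <stdio.h>

#define TWIG_OFFMASK 0x3FFFULL          /* hypothetical 16KB alignment mask */

static int all_aligned(uint64_t size, uint64_t vstart, uint64_t nstart)
{
        return ((size | vstart | nstart) & TWIG_OFFMASK) == 0;
}

int main(void)
{
        printf("%d\n", all_aligned(0x8000, 0x4000000, 0x10004000));  /* 1 */
        printf("%d\n", all_aligned(0x8000, 0x4000200, 0x10004000));  /* 0: vstart misaligned */
        return 0;
}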
@@ -7980,14 +8607,14 @@ pmap_nest_internal(
                        kfree(nested_region_asid_bitmap, nested_region_asid_bitmap_size*sizeof(unsigned int));
                }
        }
-       if ((subord->nested_region_subord_addr + subord->nested_region_size) < (nstart+size)) {
+       if ((subord->nested_region_subord_addr + subord->nested_region_size) < nend) {
                uint64_t        new_size;
                unsigned int    new_nested_region_asid_bitmap_size;
                unsigned int*   new_nested_region_asid_bitmap;
 
                nested_region_asid_bitmap = NULL;
                nested_region_asid_bitmap_size = 0;
-               new_size =  nstart + size - subord->nested_region_subord_addr;
+               new_size =  nend - subord->nested_region_subord_addr;
 
                /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
                new_nested_region_asid_bitmap_size  = (unsigned int)((new_size>>ARM_TT_TWIG_SHIFT)/(sizeof(unsigned int)*NBBY)) + 1;
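The ASID bitmap sized above keeps one bit per twig-sized chunk of the nested region; dividing the bit count by the bits per word truncates, so one extra word is always allocated, as the comment notes. A sketch of the arithmetic with made-up sizes:

/* Illustrative sketch only; not XNU source. */
#include <limits.h>
#include <stdio.h>

static unsigned bitmap_words(unsigned long long region_size, unsigned twig_shift)
{
        unsigned long long bits = region_size >> twig_shift;    /* one bit per twig */
        /* +1 word so integer truncation can never leave trailing bits without storage */
        return (unsigned)(bits / (sizeof(unsigned int) * CHAR_BIT)) + 1;
}

int main(void)
{
        /* 1GB region with hypothetical 32MB twigs: 32 bits -> 1 word + 1 = 2 words */
        printf("%u\n", bitmap_words(1ULL << 30, 25));
        return 0;
}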
@@ -8014,6 +8641,16 @@ pmap_nest_internal(
        PMAP_LOCK(subord);
        if (grand->nested_pmap == PMAP_NULL) {
                grand->nested_pmap = subord;
+
+               if (!subord->nested_bounds_set) {
+                       /*
+                        * We are nesting without the shared region's bounds
+                        * being known.  We'll have to trim the pmap later.
+                        */
+                       grand->nested_has_no_bounds_ref = true;
+                       subord->nested_no_bounds_refcnt++;
+               }
+
                grand->nested_region_grand_addr = vstart;
                grand->nested_region_subord_addr = nstart;
                grand->nested_region_size = (mach_vm_offset_t) size;
@@ -8021,7 +8658,7 @@ pmap_nest_internal(
                if ((grand->nested_region_grand_addr > vstart)) {
                        panic("pmap_nest() pmap %p : attempt to nest outside the nested region\n", grand);
                }
-               else if ((grand->nested_region_grand_addr + grand->nested_region_size) < (vstart+size)) {
+               else if ((grand->nested_region_grand_addr + grand->nested_region_size) < vend) {
                        grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_grand_addr + size);
                }
        }
@@ -8032,6 +8669,10 @@ pmap_nest_internal(
        num_tte = size >> ARM_TT_L1_SHIFT;
 
        for (i = 0; i < num_tte; i++) {
+               if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) {
+                       goto expand_next;
+               }
+
                stte_p = pmap_tte(subord, nvaddr);
                if ((stte_p == (tt_entry_t *)NULL) || (((*stte_p) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) {
                        PMAP_UNLOCK(subord);
@@ -8061,7 +8702,7 @@ pmap_nest_internal(
                }
                PMAP_LOCK(subord);
 
-
+expand_next:
                nvaddr += ARM_TT_L1_SIZE;
                vaddr += ARM_TT_L1_SIZE;
        }
@@ -8071,6 +8712,10 @@ pmap_nest_internal(
        num_tte = (unsigned int)(size >> ARM_TT_L2_SHIFT);
 
        for (i = 0; i < num_tte; i++) {
+               if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) {
+                       goto expand_next;
+               }
+
                stte_p = pmap_tt2e(subord, nvaddr);
                if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
                        PMAP_UNLOCK(subord);
@@ -8083,6 +8728,7 @@ pmap_nest_internal(
 
                        PMAP_LOCK(subord);
                }
+expand_next:
                nvaddr += ARM_TT_L2_SIZE;
        }
 #endif
@@ -8099,18 +8745,25 @@ pmap_nest_internal(
 
 #if    (__ARM_VMSA__ == 7)
        for (i = 0; i < num_tte; i++) {
+               if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) {
+                       goto nest_next;
+               }
 
                stte_p = pmap_tte(subord, nvaddr);
                gtte_p = pmap_tte(grand, vaddr);
                *gtte_p = *stte_p;
 
+nest_next:
                nvaddr += ARM_TT_L1_SIZE;
                vaddr += ARM_TT_L1_SIZE;
        }
 #else
        for (i = 0; i < num_tte; i++) {
+               if (((subord->nested_region_true_start) > nvaddr) || ((subord->nested_region_true_end) <= nvaddr)) {
+                       goto nest_next;
+               }
 
-               stte_p = pmap_tt2e(subord, nstart);
+               stte_p = pmap_tt2e(subord, nvaddr);
                gtte_p = pmap_tt2e(grand, vaddr);
                if (gtte_p == PT_ENTRY_NULL) {
                        PMAP_UNLOCK(grand);
@@ -8124,17 +8777,18 @@ pmap_nest_internal(
                        gtte_p = pmap_tt2e(grand, vaddr);
                }
                *gtte_p = *stte_p;
+
+nest_next:
                vaddr += ARM_TT_L2_SIZE;
-               nstart += ARM_TT_L2_SIZE;
+               nvaddr += ARM_TT_L2_SIZE;
        }
 #endif
 
        kr = KERN_SUCCESS;
 done:
 
-#ifndef        __ARM_L1_PTW__
-       CleanPoU_DcacheRegion((vm_offset_t) pmap_tte(grand, vstart), num_tte * sizeof(tt_entry_t));
-#endif
+       stte_p = pmap_tte(grand, vstart);
+       FLUSH_PTE_RANGE_STRONG(stte_p, stte_p + num_tte);
 
 #if    (__ARM_VMSA__ > 7)
        /*
@@ -8142,7 +8796,7 @@ done:
         */
        assert((size & 0xFFFFFFFF00000000ULL) == 0);
 #endif
-       PMAP_UPDATE_TLBS(grand, vstart, vstart + size);
+       PMAP_UPDATE_TLBS(grand, vstart, vend);
 
        PMAP_UNLOCK(grand);
        return kr;
@@ -8157,13 +8811,13 @@ kern_return_t pmap_nest(
 {
        kern_return_t kr = KERN_FAILURE;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
+       PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
                   VM_KERNEL_ADDRHIDE(vstart));
 
        kr = pmap_nest_internal(grand, subord, vstart, nstart, size);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
 
        return kr;
 }
@@ -8171,7 +8825,7 @@ kern_return_t pmap_nest(
 /*
  *     kern_return_t pmap_unnest(grand, vaddr)
  *
- *     grand  = the pmap that we will nest subord into
+ *     grand  = the pmap that will have the virtual range unnested
  *     vaddr  = start of range in pmap to be unnested
  *     size   = size of range in pmap to be unnested
  *
@@ -8186,7 +8840,7 @@ pmap_unnest(
        return(pmap_unnest_options(grand, vaddr, size, 0));
 }
 
-static kern_return_t
+MARK_AS_PMAP_TEXT static kern_return_t
 pmap_unnest_options_internal(
        pmap_t grand,
        addr64_t vaddr,
@@ -8202,6 +8856,12 @@ pmap_unnest_options_internal(
        unsigned int    num_tte;
        unsigned int    i;
 
+       addr64_t vend;
+       if (__improbable(os_add_overflow(vaddr, size, &vend)))
+               panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
+
+       VALIDATE_PMAP(grand);
+
 #if    (__ARM_VMSA__ == 7)
        if (((size|vaddr) & ARM_TT_L1_PT_OFFMASK) != 0x0ULL) {
                panic("pmap_unnest(): unaligned request\n");
@@ -8214,6 +8874,12 @@ pmap_unnest_options_internal(
 
        if ((option & PMAP_UNNEST_CLEAN) == 0)
        {
+               if (grand->nested_pmap == NULL)
+                       panic("%s: %p has no nested pmap", __func__, grand);
+
+               if ((vaddr < grand->nested_region_grand_addr) || (vend > (grand->nested_region_grand_addr + grand->nested_region_size)))
+                       panic("%s: %p: unnest request for not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
+
                PMAP_LOCK(grand->nested_pmap);
 
                start = vaddr - grand->nested_region_grand_addr + grand->nested_region_subord_addr ;
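The unnest path rebases a grand-pmap address onto the nested pmap by subtracting the grand-side start of the nested region and adding the subord-side start, as in the start = vaddr - ... + ... line above. Illustrative addresses only:

/* Illustrative sketch only; not XNU source. */
#include <assert.h>
#include <stdint.h>

static uint64_t grand_to_subord(uint64_t vaddr, uint64_t grand_base, uint64_t subord_base)
{
        return vaddr - grand_base + subord_base;
}

int main(void)
{
        /* Hypothetical bases: nested region at 0x180000000 on the grand side,
         * backed starting at 0 on the subord side. */
        assert(grand_to_subord(0x180040000ULL, 0x180000000ULL, 0x0ULL) == 0x40000ULL);
        return 0;
}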
@@ -8221,19 +8887,24 @@ pmap_unnest_options_internal(
                max_index = (unsigned int)(start_index + (size >> ARM_TT_TWIG_SHIFT));
                num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT);
 
-               if (size > grand->nested_region_size) {
-                       panic("pmap_unnest() pmap %p %llu, %llu\n", grand, size,  (uint64_t)grand->nested_region_size);
-               }
-
-               for (current_index = start_index,  addr = start; current_index < max_index; current_index++) {
+               for (current_index = start_index,  addr = start; current_index < max_index; current_index++, addr += ARM_TT_TWIG_SIZE) {
                        pt_entry_t  *bpte, *epte, *cpte;
 
+                       if (addr < grand->nested_pmap->nested_region_true_start) {
+                               /* We haven't reached the interesting range. */
+                               continue;
+                       }
 
-                       if(!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) {
+                       if (addr >= grand->nested_pmap->nested_region_true_end) {
+                               /* We're done with the interesting range. */
+                               break;
+                       }
+
+                       bpte = pmap_pte(grand->nested_pmap, addr);
+                       epte = bpte + (ARM_TT_LEAF_INDEX_MASK>>ARM_TT_LEAF_SHIFT);
 
+                       if(!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap)) {
                                setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
-                               bpte = pmap_pte(grand->nested_pmap, addr);
-                               epte = bpte + (ARM_TT_LEAF_INDEX_MASK>>ARM_TT_LEAF_SHIFT);
 
                                for (cpte = bpte; cpte <= epte; cpte++) {
                                        pmap_paddr_t    pa;
@@ -8262,7 +8933,7 @@ pmap_unnest_options_internal(
 
                                                if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
 
-                                                       WRITE_PTE(cpte, (spte | ARM_PTE_NG));
+                                                       WRITE_PTE_FAST(cpte, (spte | ARM_PTE_NG));
                                                }
 
                                                if (managed)
@@ -8274,14 +8945,12 @@ pmap_unnest_options_internal(
                                }
                        }
 
-                       addr += ARM_TT_TWIG_SIZE;
-
-#ifndef        __ARM_L1_PTW__
-                       CleanPoU_DcacheRegion((vm_offset_t) pmap_pte(grand->nested_pmap, start), num_tte * sizeof(tt_entry_t));
-#endif
-                       PMAP_UPDATE_TLBS(grand->nested_pmap, start, start + size);
+                       FLUSH_PTE_RANGE_STRONG(bpte, epte); 
+                       flush_mmu_tlb_region_asid_async(start, (unsigned)size, grand->nested_pmap);
                }
 
+               sync_tlb_flush();
+
                PMAP_UNLOCK(grand->nested_pmap);
        }
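The loop above now queues an asynchronous TLB invalidation per chunk and pays for a single sync_tlb_flush() once the loop finishes, instead of a full synchronous flush on every iteration. A toy stand-in for that issue-async-then-barrier-once shape (the real routines are hardware TLB maintenance operations):

/* Illustrative sketch only; not XNU source. */
#include <stddef.h>
#include <stdio.h>

static void invalidate_async(size_t chunk) { printf("queue invalidate for chunk %zu\n", chunk); }
static void barrier(void)                  { printf("single completion barrier\n"); }

int main(void)
{
        for (size_t chunk = 0; chunk < 4; chunk++)
                invalidate_async(chunk);        /* cheap: no wait per iteration */
        barrier();                              /* wait once for all of them */
        return 0;
}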
 
@@ -8295,17 +8964,24 @@ pmap_unnest_options_internal(
 
        num_tte = (unsigned int)(size >> ARM_TT_TWIG_SHIFT);
 
-       for (i = 0; i < num_tte; i++) {
+       for (i = 0; i < num_tte; i++, addr += ARM_TT_TWIG_SIZE) {
+               if (addr < grand->nested_pmap->nested_region_true_start) {
+                       /* We haven't reached the interesting range. */
+                       continue;
+               }
+
+               if (addr >= grand->nested_pmap->nested_region_true_end) {
+                       /* We're done with the interesting range. */
+                       break;
+               }
+
                tte_p = pmap_tte(grand, addr);
                *tte_p = ARM_TTE_TYPE_FAULT;
-
-               addr += ARM_TT_TWIG_SIZE;
        }
 
-#ifndef        __ARM_L1_PTW__
-       CleanPoU_DcacheRegion((vm_offset_t) pmap_tte(grand, start), num_tte * sizeof(tt_entry_t));
-#endif
-       PMAP_UPDATE_TLBS(grand, start, start + size);
+       tte_p = pmap_tte(grand, start);
+       FLUSH_PTE_RANGE_STRONG(tte_p, tte_p + num_tte);
+       PMAP_UPDATE_TLBS(grand, start, vend);
 
        PMAP_UNLOCK(grand);
 
@@ -8321,12 +8997,12 @@ pmap_unnest_options(
 {
        kern_return_t kr = KERN_FAILURE;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
+       PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
 
        kr = pmap_unnest_options_internal(grand, vaddr, size, option);
 
-       PMAP_TRACE(PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr);
+       PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, kr);
 
        return kr;
 }
@@ -8394,7 +9070,7 @@ pt_fake_zone_info(
 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
 
 static void
-flush_mmu_tlb_region_asid(
+flush_mmu_tlb_region_asid_async(
        vm_offset_t va,
        unsigned length,
        pmap_t pmap)
@@ -8411,9 +9087,9 @@ flush_mmu_tlb_region_asid(
                if ((asid == 0) || (pmap->nested == TRUE))
                        flush_all = TRUE;
                if (flush_all)
-                       flush_mmu_tlb();
+                       flush_mmu_tlb_async();
                else
-                       flush_mmu_tlb_asid(asid);
+                       flush_mmu_tlb_asid_async(asid);
 
                return;
        }
@@ -8423,14 +9099,14 @@ flush_mmu_tlb_region_asid(
 #else
                va = arm_trunc_page(va);
                while (va < end) {
-                       flush_mmu_tlb_mva_entries(va);
+                       flush_mmu_tlb_mva_entries_async(va);
                        va += ARM_SMALL_PAGE_SIZE;
                }
 #endif
                return;
        }
        va = arm_trunc_page(va) | (asid & 0xff);
-       flush_mmu_tlb_entries(va, end);
+       flush_mmu_tlb_entries_async(va, end);
 
 #else
        vm_offset_t             end = va + length;
@@ -8444,17 +9120,17 @@ flush_mmu_tlb_region_asid(
                if ((asid == 0) || (pmap->nested == TRUE))
                        flush_all = TRUE;
                if (flush_all)
-                       flush_mmu_tlb();
+                       flush_mmu_tlb_async();
                else
-                       flush_mmu_tlb_asid((uint64_t)asid << TLBI_ASID_SHIFT);
+                       flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
                return;
        }
        va = tlbi_asid(asid) | tlbi_addr(va);
        end = tlbi_asid(asid) | tlbi_addr(end);
        if (pmap->nested == TRUE) {
-               flush_mmu_tlb_allentries(va, end);
+               flush_mmu_tlb_allentries_async(va, end);
        } else {
-               flush_mmu_tlb_entries(va, end);
+               flush_mmu_tlb_entries_async(va, end);
        }
 
 #endif
@@ -8465,7 +9141,31 @@ flush_mmu_tlb_region(
        vm_offset_t va,
        unsigned length)
 {
-       flush_mmu_tlb_region_asid(va, length, kernel_pmap);
+       flush_mmu_tlb_region_asid_async(va, length, kernel_pmap);
+       sync_tlb_flush();
+}
+
+static unsigned int
+pmap_find_io_attr(pmap_paddr_t paddr)
+{
+       pmap_io_range_t find_range = {.addr = paddr, .len = PAGE_SIZE};
+       unsigned int begin = 0, end = num_io_rgns - 1;
+       assert(num_io_rgns > 0);
+
+       for (;;) {
+               unsigned int middle = (begin + end) / 2;
+               int cmp = cmp_io_rgns(&find_range, &io_attr_table[middle]);
+               if (cmp == 0)
+                       return io_attr_table[middle].wimg;
+               else if (begin == end)
+                       break;
+               else if (cmp > 0)
+                       begin = middle + 1;
+               else
+                       end = middle;
+       };
+
+       return (VM_WIMG_IO);
 }
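pmap_find_io_attr() above binary-searches a sorted, non-overlapping table of I/O ranges and falls back to a default attribute when the physical address is not covered. A user-space rendition of the same search, with hypothetical types and data:

/* Illustrative sketch only; not XNU source. */
#include <stdint.h>
#include <stdio.h>

struct io_range { uint64_t addr, len; unsigned attr; };

/* Must be sorted by addr and non-overlapping. */
static const struct io_range table[] = {
        { 0x10000000, 0x1000, 7 },
        { 0x20000000, 0x4000, 3 },
};

#define DEFAULT_ATTR 0u

static unsigned find_io_attr(uint64_t paddr)
{
        unsigned begin = 0, end = (unsigned)(sizeof(table) / sizeof(table[0])) - 1;

        for (;;) {
                unsigned mid = (begin + end) / 2;
                if (paddr >= table[mid].addr && paddr < table[mid].addr + table[mid].len)
                        return table[mid].attr;         /* hit */
                if (begin == end)
                        return DEFAULT_ATTR;            /* range exhausted */
                if (paddr >= table[mid].addr + table[mid].len)
                        begin = mid + 1;                /* search upper half */
                else
                        end = mid;                      /* search lower half */
        }
}

int main(void)
{
        printf("%u\n", find_io_attr(0x20001000));       /* 3 */
        printf("%u\n", find_io_attr(0x30000000));       /* 0: default */
        return 0;
}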
 
 unsigned int
@@ -8479,14 +9179,8 @@ pmap_cache_attributes(
 
        paddr = ptoa(pn);
 
-       if ((paddr >= io_rgn_start) && (paddr < io_rgn_end)) {
-               unsigned int attr = IO_ATTR_WIMG(io_attr_table[(paddr - io_rgn_start) / io_rgn_granule]);
-               if (attr)
-                       return attr;
-               else
-                       return (VM_WIMG_IO);
-       }
-
+       if ((paddr >= io_rgn_start) && (paddr < io_rgn_end))
+               return pmap_find_io_attr(paddr);
 
        if (!pmap_initialized) {
                if  ((paddr >= gPhysBase) && (paddr < gPhysBase+gPhysSize))
@@ -8509,7 +9203,7 @@ pmap_cache_attributes(
        return result;
 }
 
-static boolean_t
+MARK_AS_PMAP_TEXT static boolean_t
 pmap_batch_set_cache_attributes_internal(
        ppnum_t pn,
        unsigned int cacheattr,
@@ -8528,7 +9222,9 @@ pmap_batch_set_cache_attributes_internal(
                cacheattr = VM_WIMG_DEFAULT;
 
        if ((doit == FALSE) &&  (*res == 0)) {
+               pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
                *res = page_cnt;
+               pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
                if (platform_cache_batch_wimg(cacheattr & (VM_WIMG_MASK), page_cnt<<PAGE_SHIFT) == FALSE) {
                        return FALSE;
                }
@@ -8537,23 +9233,29 @@ pmap_batch_set_cache_attributes_internal(
        paddr = ptoa(pn);
 
        if (!pa_valid(paddr)) {
-               panic("pmap_batch_set_cache_attributes(): pn 0x%08x not managed\n", pn);
+               panic("pmap_batch_set_cache_attributes(): pn 0x%08x not managed", pn);
        }
 
        pai = (int)pa_index(paddr);
 
-       if (doit)
+       if (doit) {
                LOCK_PVH(pai);
+       }
 
-       pp_attr_current = pp_attr_table[pai];
-       wimg_bits_prev = VM_WIMG_DEFAULT;
-       if (pp_attr_current & PP_ATTR_WIMG_MASK)
-               wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
+       do {
+               pp_attr_current = pp_attr_table[pai];
+               wimg_bits_prev = VM_WIMG_DEFAULT;
+               if (pp_attr_current & PP_ATTR_WIMG_MASK)
+                       wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
 
-       pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
+               pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
+
+               if (!doit)
+                       break;
 
-       if (doit)
-               pp_attr_table[pai] = pp_attr_template;
+               /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
+                * to avoid losing simultaneous updates to other bits like refmod. */
+       } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
 
        wimg_bits_new = VM_WIMG_DEFAULT;
        if (pp_attr_template & PP_ATTR_WIMG_MASK)
@@ -8563,12 +9265,16 @@ pmap_batch_set_cache_attributes_internal(
                if (wimg_bits_new != wimg_bits_prev)
                        pmap_update_cache_attributes_locked(pn, cacheattr);
                UNLOCK_PVH(pai);
+               if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT))
+                       pmap_force_dcache_clean(phystokv(paddr), PAGE_SIZE);
        } else {
                if (wimg_bits_new == VM_WIMG_COPYBACK) {
                        return FALSE;
                }
                if (wimg_bits_prev == wimg_bits_new) {
+                       pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
                        *res = *res-1;
+                       pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
                        if (!platform_cache_batch_wimg(wimg_bits_new, (*res)<<PAGE_SHIFT)) {
                                return FALSE;
                        }
@@ -8578,7 +9284,7 @@ pmap_batch_set_cache_attributes_internal(
 
        if (page_cnt ==  (page_index+1)) {
                wimg_bits_prev = VM_WIMG_COPYBACK;
-               if (((page_cnt ==  (page_index+1)) && (wimg_bits_prev != wimg_bits_new))
+               if (((wimg_bits_prev != wimg_bits_new))
                    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
                         || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
                            && (wimg_bits_new != VM_WIMG_COPYBACK))
@@ -8603,10 +9309,11 @@ pmap_batch_set_cache_attributes(
        return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res);
 }
 
-static void
-pmap_set_cache_attributes_internal(
+MARK_AS_PMAP_TEXT static void
+pmap_set_cache_attributes_priv(
        ppnum_t pn,
-       unsigned int cacheattr)
+       unsigned int cacheattr,
+       boolean_t external __unused)
 {
        pmap_paddr_t    paddr;
        int             pai;
@@ -8627,14 +9334,19 @@ pmap_set_cache_attributes_internal(
 
        LOCK_PVH(pai);
 
-       pp_attr_current = pp_attr_table[pai];
-       wimg_bits_prev = VM_WIMG_DEFAULT;
-       if (pp_attr_current & PP_ATTR_WIMG_MASK)
-               wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
 
-       pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK)) ;
+       do {
+               pp_attr_current = pp_attr_table[pai];
+               wimg_bits_prev = VM_WIMG_DEFAULT;
+               if (pp_attr_current & PP_ATTR_WIMG_MASK)
+                       wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
+
+               pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK)) ;
+
+               /* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
+                * to avoid losing simultaneous updates to other bits like refmod. */
+       } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
 
-       pp_attr_table[pai] = pp_attr_template;
        wimg_bits_new = VM_WIMG_DEFAULT;
        if (pp_attr_template & PP_ATTR_WIMG_MASK)
                wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
@@ -8652,6 +9364,16 @@ pmap_set_cache_attributes_internal(
                    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK)))))
                pmap_sync_page_attributes_phys(pn);
 
+       if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT))
+               pmap_force_dcache_clean(phystokv(paddr), PAGE_SIZE);
+}
+
+MARK_AS_PMAP_TEXT static void
+pmap_set_cache_attributes_internal(
+       ppnum_t pn,
+       unsigned int cacheattr)
+{
+       pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
 }
 
 void
@@ -8667,16 +9389,13 @@ pmap_update_cache_attributes_locked(
        ppnum_t ppnum,
        unsigned attributes)
 {
-       pmap_paddr_t    phys = ptoa(ppnum);
-       pv_entry_t      *pve_p;
-       pt_entry_t      *pte_p;
-       pv_entry_t      **pv_h;
+       pmap_paddr_t    phys = ptoa(ppnum);
+       pv_entry_t      *pve_p;
+       pt_entry_t      *pte_p;
+       pv_entry_t      **pv_h;
        pt_entry_t      tmplate;
-       unsigned int    pai;
-
-#if (__ARM_VMSA__ == 7)
-       #define ARM_PTE_SHMASK ARM_PTE_SH
-#endif
+       unsigned int    pai;
+       boolean_t       tlb_flush_needed = FALSE;
 
 #if __ARM_PTE_PHYSMAP__
        vm_offset_t kva = phystokv(phys);
@@ -8685,9 +9404,15 @@ pmap_update_cache_attributes_locked(
        tmplate = *pte_p;
        tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
        tmplate |= wimg_to_pte(attributes);
-
-       WRITE_PTE(pte_p, tmplate);
-       PMAP_UPDATE_TLBS(kernel_pmap, kva, kva + PAGE_SIZE);
+#if (__ARM_VMSA__ > 7)
+       if (tmplate & ARM_PTE_HINT_MASK) {
+               panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
+                     __FUNCTION__, pte_p, (void *)kva, tmplate);
+       }
+#endif
+       WRITE_PTE_STRONG(pte_p, tmplate);
+       flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap);
+       tlb_flush_needed = TRUE;
 #endif
 
        pai = (unsigned int)pa_index(phys);
@@ -8709,7 +9434,10 @@ pmap_update_cache_attributes_locked(
 
                if (pve_p != PV_ENTRY_NULL)
                        pte_p = pve_get_ptep(pve_p);
-
+#ifdef PVH_FLAG_IOMMU
+               if ((vm_offset_t)pte_p & PVH_FLAG_IOMMU)
+                       goto cache_skip_pve;
+#endif
                pmap = ptep_get_pmap(pte_p);
                va = ptep_get_va(pte_p);
 
@@ -8717,14 +9445,20 @@ pmap_update_cache_attributes_locked(
                tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
                tmplate |= wimg_to_pte(attributes);
 
-               WRITE_PTE(pte_p, tmplate);
-               PMAP_UPDATE_TLBS(pmap, va, va + PAGE_SIZE);
+               WRITE_PTE_STRONG(pte_p, tmplate);
+               flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap);
+               tlb_flush_needed = TRUE;
 
+#ifdef PVH_FLAG_IOMMU
+       cache_skip_pve:
+#endif
                pte_p = PT_ENTRY_NULL;
                if (pve_p != PV_ENTRY_NULL)
                        pve_p = PVE_NEXT_PTR(pve_next(pve_p));
 
        }
+       if (tlb_flush_needed)
+               sync_tlb_flush();
 }
 
 #if    (__ARM_VMSA__ == 7)
@@ -8760,7 +9494,7 @@ pmap_update_tt3e(
 
        pte = *ptep;
        pte = tte_to_pa(pte) | template;
-       WRITE_PTE(ptep, pte);
+       WRITE_PTE_STRONG(ptep, pte);
 }
 
 /* Note absence of non-global bit */
@@ -8782,12 +9516,11 @@ pmap_create_sharedpage(
 
        memset((char *) phystokv(pa), 0, PAGE_SIZE);
 
+#ifdef CONFIG_XNUPOST
        /*
         * The kernel pmap maintains a user accessible mapping of the commpage
         * to test PAN.
         */
-       kr = pmap_expand(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, 0, PMAP_TT_L3_LEVEL);
-       assert(kr == KERN_SUCCESS);
        kr = pmap_enter(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, (ppnum_t)atop(pa), VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
        assert(kr == KERN_SUCCESS);
 
@@ -8797,14 +9530,10 @@ pmap_create_sharedpage(
         */
        pmap_update_tt3e(kernel_pmap, _COMM_HIGH_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE | ARM_PTE_NG);
 
-       /*
-        * With PAN enabled kernel drivers can no longer use the previous mapping which is user readable
-        * They should use the following mapping instead
-        */
-       kr = pmap_expand(kernel_pmap, _COMM_PRIV_PAGE64_BASE_ADDRESS, 0, PMAP_TT_L3_LEVEL);
-       assert(kr == KERN_SUCCESS);
-       kr = pmap_enter(kernel_pmap, _COMM_PRIV_PAGE64_BASE_ADDRESS, (ppnum_t)atop(pa), VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
-       assert(kr == KERN_SUCCESS);
+#if KASAN
+       kasan_map_shadow(_COMM_HIGH_PAGE64_BASE_ADDRESS, PAGE_SIZE, true);
+#endif
+#endif /* CONFIG_XNUPOST */
 
        /*
         * In order to avoid burning extra pages on mapping the shared page, we
@@ -8833,8 +9562,7 @@ pmap_create_sharedpage(
        pmap_update_tt3e(sharedpage_pmap, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
 
        /* For manipulation in kernel, go straight to physical page */
-       sharedpage_rw_addr = phystokv(pa);
-       return((vm_map_address_t)sharedpage_rw_addr);
+       return ((vm_map_address_t)phystokv(pa));
 }
 
 /*
@@ -8851,15 +9579,17 @@ static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= VM_MAX_ADDRESS
 #error Nested shared page mapping is unsupported on this config
 #endif
 
-static void
+MARK_AS_PMAP_TEXT static kern_return_t
 pmap_insert_sharedpage_internal(
        pmap_t pmap)
 {
-#if (ARM_PGSHIFT == 14) && !__ARM64_TWO_LEVEL_PMAP__
-       kern_return_t kr;
-#endif
+       kern_return_t kr = KERN_SUCCESS;
        vm_offset_t sharedpage_vaddr;
        pt_entry_t *ttep, *src_ttep;
+       int options = 0;
+
+       VALIDATE_PMAP(pmap);
+
 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
 #error We assume a single page.
 #endif
@@ -8883,6 +9613,8 @@ pmap_insert_sharedpage_internal(
 #if __ARM64_TWO_LEVEL_PMAP__
 #error A two level page table with a page shift of 12 is not currently supported
 #endif
+       (void)options;
+
        /* Just slam in the L1 entry.  */
        ttep = pmap_tt1e(pmap, sharedpage_vaddr);
 
@@ -8901,10 +9633,12 @@ pmap_insert_sharedpage_internal(
        while (*pmap_tt1e(pmap, sharedpage_vaddr) == ARM_PTE_EMPTY) {
                PMAP_UNLOCK(pmap);
 
-               kr = pmap_expand(pmap, _COMM_PAGE32_BASE_ADDRESS, 0, PMAP_TT_L2_LEVEL);
+               kr = pmap_expand(pmap, sharedpage_vaddr, options, PMAP_TT_L2_LEVEL);
 
                if (kr != KERN_SUCCESS) {
-                       panic("Failed to pmap_expand for 32-bit commpage, pmap=%p", pmap);
+                       {
+                               panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
+                       }
                }
 
                PMAP_LOCK(pmap);
@@ -8921,26 +9655,21 @@ pmap_insert_sharedpage_internal(
 #endif
 
        *ttep =  *src_ttep;
-#ifndef __ARM_L1_PTW__
-       CleanPoU_DcacheRegion((vm_offset_t) ttep, sizeof(tt_entry_t));
-#endif
+       FLUSH_PTE_STRONG(ttep);
+
        /* TODO: Should we flush in the 64-bit case? */
-       flush_mmu_tlb_region(sharedpage_vaddr, PAGE_SIZE);
+       flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap);
 
 #if (ARM_PGSHIFT == 12) && !__ARM64_TWO_LEVEL_PMAP__
-       flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
 #elif (ARM_PGSHIFT == 14)
-       flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
 #endif
+       sync_tlb_flush();
 
        PMAP_UNLOCK(pmap);
-}
 
-static void
-pmap_sharedpage_flush_32_to_64(
-       void)
-{
-       flush_mmu_tlb_region(_COMM_PAGE32_BASE_ADDRESS, PAGE_SIZE);
+       return kr;
 }
 
 static void
@@ -8988,16 +9717,17 @@ pmap_unmap_sharedpage(
 #endif
 
        *ttep = ARM_TTE_EMPTY;
-       flush_mmu_tlb_region(sharedpage_vaddr, PAGE_SIZE);
+       flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, kernel_pmap);
 
 #if (ARM_PGSHIFT == 12)
 #if __ARM64_TWO_LEVEL_PMAP__
 #error A two level page table with a page shift of 12 is not currently supported
 #endif
-       flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L1_OFFMASK) | tlbi_asid(pmap->asid));
 #elif (ARM_PGSHIFT == 14)
-       flush_mmu_tlb_entry(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
+       flush_mmu_tlb_entry_async(tlbi_addr(sharedpage_vaddr & ~ARM_TT_L2_OFFMASK) | tlbi_asid(pmap->asid));
 #endif
+       sync_tlb_flush();
 }
 
 void
@@ -9025,7 +9755,7 @@ pmap_valid_page(
        return pa_valid(ptoa(pn));
 }
 
-static boolean_t
+MARK_AS_PMAP_TEXT static boolean_t
 pmap_is_empty_internal(
        pmap_t pmap,
        vm_map_offset_t va_start,
@@ -9038,6 +9768,8 @@ pmap_is_empty_internal(
                return TRUE;
        }
 
+       VALIDATE_PMAP(pmap);
+
        if ((pmap != kernel_pmap) && (not_in_kdp)) {
                PMAP_LOCK(pmap);
        }
@@ -9127,14 +9859,19 @@ pmap_is_empty(
 }
 
 vm_map_offset_t pmap_max_offset(
-       boolean_t       is64 __unused,
+       boolean_t               is64,
        unsigned int    option)
+{
+       return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
+}
+
+vm_map_offset_t pmap_max_64bit_offset(
+       __unused unsigned int option)
 {
        vm_map_offset_t max_offset_ret = 0;
 
 #if defined(__arm64__)
-       assert (is64);
-       vm_map_offset_t min_max_offset = SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000; // end of shared region + 512MB for various purposes
+       const vm_map_offset_t min_max_offset = SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000; // end of shared region + 512MB for various purposes
        if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
                max_offset_ret = arm64_pmap_max_offset_default;
        } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
@@ -9145,9 +9882,9 @@ vm_map_offset_t pmap_max_offset(
                if (arm64_pmap_max_offset_default) {
                        max_offset_ret = arm64_pmap_max_offset_default;
                } else if (max_mem > 0xC0000000) {
-                       max_offset_ret = 0x0000000318000000ULL;     // Max offset is 12.375GB for devices with > 3GB of memory
+                       max_offset_ret = min_max_offset + 0x138000000; // Max offset is 13.375GB for devices with > 3GB of memory
                } else if (max_mem > 0x40000000) {
-                       max_offset_ret = 0x0000000218000000ULL;     // Max offset is 8.375GB for devices with > 1GB and <= 3GB of memory
+                       max_offset_ret = min_max_offset + 0x38000000;  // Max offset is 9.375GB for devices with > 1GB and <= 3GB of memory
                } else {
                        max_offset_ret = min_max_offset;
                }
@@ -9156,16 +9893,26 @@ vm_map_offset_t pmap_max_offset(
                        // Allow the boot-arg to override jumbo size
                        max_offset_ret = arm64_pmap_max_offset_default;
                } else {
-                       max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is MACH_VM_MAX_ADDRESS for pmaps with special "jumbo" blessing
+                       max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
                }
        } else {
-               panic("pmap_max_offset illegal option 0x%x\n", option);
+               panic("pmap_max_64bit_offset illegal option 0x%x\n", option);
        }
 
-       assert(max_offset_ret >= min_max_offset);
        assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
-       return max_offset_ret;
+       assert(max_offset_ret >= min_max_offset);
 #else
+       panic("Can't run pmap_max_64bit_offset on non-64bit architectures\n");
+#endif
+
+       return max_offset_ret;
+}
+
+vm_map_offset_t pmap_max_32bit_offset(
+       unsigned int option)
+{
+       vm_map_offset_t max_offset_ret = 0;
+
        if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
                max_offset_ret = arm_pmap_max_offset_default;
        } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
@@ -9180,13 +9927,14 @@ vm_map_offset_t pmap_max_offset(
                } else {
                        max_offset_ret = 0x66000000;
                }
+       } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
+               max_offset_ret = 0x80000000;
        } else {
-               panic("pmap_max_offset illegal option 0x%x\n", option);
+               panic("pmap_max_32bit_offset illegal option 0x%x\n", option);
        }
 
-       assert(max_offset_ret <= VM_MAX_ADDRESS);
+       assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
        return max_offset_ret;
-#endif
 }
 
 #if CONFIG_DTRACE
@@ -9227,12 +9975,26 @@ pmap_flush(
        return;
 }
 
-static boolean_t
+
+static void __unused
+pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
+{
+}
+
+static void __unused
+pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
+{
+}
+
+
+
+#define PMAP_RESIDENT_INVALID  ((mach_vm_size_t)-1)
+
+MARK_AS_PMAP_TEXT static mach_vm_size_t
 pmap_query_resident_internal(
        pmap_t                  pmap,
        vm_map_address_t        start,
        vm_map_address_t        end,
-       mach_vm_size_t          *resident_bytes_p,
        mach_vm_size_t          *compressed_bytes_p)
 {
        mach_vm_size_t  resident_bytes = 0;
@@ -9243,20 +10005,23 @@ pmap_query_resident_internal(
        tt_entry_t     *tte_p;
 
        if (pmap == NULL) {
-               return FALSE;
+               return PMAP_RESIDENT_INVALID;
        }
 
+       VALIDATE_PMAP(pmap);
+
        /* Ensure that this request is valid, and addresses exactly one TTE. */
-       assert(!(start % ARM_PGBYTES));
-       assert(!(end % ARM_PGBYTES));
-       assert(end >= start);
-       assert((end - start) <= (PTE_PGENTRIES * ARM_PGBYTES));
+       if (__improbable((start % ARM_PGBYTES) || (end % ARM_PGBYTES)))
+               panic("%s: address range %p, %p not page-aligned", __func__, (void*)start, (void*)end);
+
+       if (__improbable((end < start) || ((end - start) > (PTE_PGENTRIES * ARM_PGBYTES))))
+               panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
 
        PMAP_LOCK(pmap);
        tte_p = pmap_tte(pmap, start);
        if (tte_p == (tt_entry_t *) NULL) {
                PMAP_UNLOCK(pmap);
-               return FALSE;
+               return PMAP_RESIDENT_INVALID;
        }
        if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
 
@@ -9281,14 +10046,12 @@ pmap_query_resident_internal(
        PMAP_UNLOCK(pmap);
 
        if (compressed_bytes_p) {
+               pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
                *compressed_bytes_p += compressed_bytes;
+               pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
        }
 
-       if (resident_bytes_p) {
-               *resident_bytes_p += resident_bytes;
-       }
-
-       return TRUE;
+       return resident_bytes;
 }
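pmap_query_resident_internal() now returns the resident byte count directly and reserves (mach_vm_size_t)-1 as an "invalid" sentinel, replacing the old boolean-plus-out-parameter interface. A small illustration of that convention, with hypothetical names:

/* Illustrative sketch only; not XNU source. */
#include <stdint.h>
#include <stdio.h>

#define RESIDENT_INVALID ((uint64_t)-1)

static uint64_t query_chunk(int valid)
{
        if (!valid)
                return RESIDENT_INVALID;        /* caller stops walking */
        return 4096;                            /* bytes resident in this chunk */
}

int main(void)
{
        uint64_t total = 0, r;

        for (int i = 0; i < 3; i++) {
                r = query_chunk(i < 2);         /* third chunk is invalid */
                if (r == RESIDENT_INVALID)
                        break;
                total += r;
        }
        printf("%llu\n", (unsigned long long)total);    /* 8192 */
        return 0;
}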
 
 mach_vm_size_t
@@ -9298,7 +10061,7 @@ pmap_query_resident(
        vm_map_address_t        end,
        mach_vm_size_t          *compressed_bytes_p)
 {
-       mach_vm_size_t          resident_bytes;
+       mach_vm_size_t          total_resident_bytes;
        mach_vm_size_t          compressed_bytes;
        vm_map_address_t        va;
 
@@ -9310,24 +10073,27 @@ pmap_query_resident(
                return 0;
        }
 
-       resident_bytes = 0;
+       total_resident_bytes = 0;
        compressed_bytes = 0;
 
-       PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
+       PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
                   VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
                   VM_KERNEL_ADDRHIDE(end));
 
        va = start;
        while (va < end) {
                vm_map_address_t l;
+               mach_vm_size_t resident_bytes;
 
                l = ((va + ARM_TT_TWIG_SIZE) & ~ARM_TT_TWIG_OFFMASK);
 
                if (l > end)
                        l = end;
-               if (!pmap_query_resident_internal(pmap, va, l, &resident_bytes, compressed_bytes_p)) {
+               resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
+               if (resident_bytes == PMAP_RESIDENT_INVALID)
                        break;
-               }
+
+               total_resident_bytes += resident_bytes;
 
                va = l;
        }
@@ -9336,14 +10102,13 @@ pmap_query_resident(
                *compressed_bytes_p = compressed_bytes;
        }
 
-       PMAP_TRACE(PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
-                  resident_bytes);
+       PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
+                  total_resident_bytes);
 
-       return resident_bytes;
+       return total_resident_bytes;
 }
 
 #if MACH_ASSERT
-extern int pmap_ledgers_panic;
 static void
 pmap_check_ledgers(
        pmap_t pmap)
@@ -9375,255 +10140,60 @@ pmap_check_ledgers(
 
        pmap_ledgers_drift.num_pmaps_checked++;
 
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.phys_footprint,
-                          &bal);
-       if (bal != 0) {
-#if DEVELOPMENT || DEBUG
-//             if (!pmap->footprint_was_suspended)
-#endif /* DEVELOPMENT || DEBUG */
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"phys_footprint\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.phys_footprint_over++;
-                       pmap_ledgers_drift.phys_footprint_over_total += bal;
-                       if (bal > pmap_ledgers_drift.phys_footprint_over_max) {
-                               pmap_ledgers_drift.phys_footprint_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.phys_footprint_under++;
-                       pmap_ledgers_drift.phys_footprint_under_total += bal;
-                       if (bal < pmap_ledgers_drift.phys_footprint_under_max) {
-                               pmap_ledgers_drift.phys_footprint_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.internal,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"internal\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.internal_over++;
-                       pmap_ledgers_drift.internal_over_total += bal;
-                       if (bal > pmap_ledgers_drift.internal_over_max) {
-                               pmap_ledgers_drift.internal_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.internal_under++;
-                       pmap_ledgers_drift.internal_under_total += bal;
-                       if (bal < pmap_ledgers_drift.internal_under_max) {
-                               pmap_ledgers_drift.internal_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.internal_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"internal_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.internal_compressed_over++;
-                       pmap_ledgers_drift.internal_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.internal_compressed_over_max) {
-                               pmap_ledgers_drift.internal_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.internal_compressed_under++;
-                       pmap_ledgers_drift.internal_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.internal_compressed_under_max) {
-                               pmap_ledgers_drift.internal_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.iokit_mapped,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"iokit_mapped\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.iokit_mapped_over++;
-                       pmap_ledgers_drift.iokit_mapped_over_total += bal;
-                       if (bal > pmap_ledgers_drift.iokit_mapped_over_max) {
-                               pmap_ledgers_drift.iokit_mapped_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.iokit_mapped_under++;
-                       pmap_ledgers_drift.iokit_mapped_under_total += bal;
-                       if (bal < pmap_ledgers_drift.iokit_mapped_under_max) {
-                               pmap_ledgers_drift.iokit_mapped_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.alternate_accounting,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"alternate_accounting\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.alternate_accounting_over++;
-                       pmap_ledgers_drift.alternate_accounting_over_total += bal;
-                       if (bal > pmap_ledgers_drift.alternate_accounting_over_max) {
-                               pmap_ledgers_drift.alternate_accounting_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.alternate_accounting_under++;
-                       pmap_ledgers_drift.alternate_accounting_under_total += bal;
-                       if (bal < pmap_ledgers_drift.alternate_accounting_under_max) {
-                               pmap_ledgers_drift.alternate_accounting_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.alternate_accounting_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"alternate_accounting_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.alternate_accounting_compressed_over++;
-                       pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) {
-                               pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.alternate_accounting_compressed_under++;
-                       pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) {
-                               pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.page_table,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"page_table\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.page_table_over++;
-                       pmap_ledgers_drift.page_table_over_total += bal;
-                       if (bal > pmap_ledgers_drift.page_table_over_max) {
-                               pmap_ledgers_drift.page_table_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.page_table_under++;
-                       pmap_ledgers_drift.page_table_under_total += bal;
-                       if (bal < pmap_ledgers_drift.page_table_under_max) {
-                               pmap_ledgers_drift.page_table_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_volatile,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_volatile\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_volatile_over++;
-                       pmap_ledgers_drift.purgeable_volatile_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) {
-                               pmap_ledgers_drift.purgeable_volatile_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_volatile_under++;
-                       pmap_ledgers_drift.purgeable_volatile_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) {
-                               pmap_ledgers_drift.purgeable_volatile_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_nonvolatile,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_nonvolatile\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_nonvolatile_over++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_nonvolatile_under++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_volatile_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_volatile_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_volatile_compressed_over++;
-                       pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) {
-                               pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_volatile_compressed_under++;
-                       pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) {
-                               pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_nonvolatile_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_nonvolatile_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal;
-                       }
-               }
-       }
+#define LEDGER_CHECK_BALANCE(__LEDGER)                                 \
+MACRO_BEGIN                                                            \
+       int panic_on_negative = TRUE;                                   \
+       ledger_get_balance(pmap->ledger,                                \
+                          task_ledgers.__LEDGER,                       \
+                          &bal);                                       \
+       ledger_get_panic_on_negative(pmap->ledger,                      \
+                                    task_ledgers.__LEDGER,             \
+                                    &panic_on_negative);               \
+       if (bal != 0) {                                                 \
+               if (panic_on_negative ||                                \
+                   (pmap_ledgers_panic &&                              \
+                    pmap_ledgers_panic_leeway > 0 &&                   \
+                    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
+                     bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
+                       do_panic = TRUE;                                \
+               }                                                       \
+               printf("LEDGER BALANCE proc %d (%s) "                   \
+                      "\"%s\" = %lld\n",                               \
+                      pid, procname, #__LEDGER, bal);                  \
+               if (bal > 0) {                                          \
+                       pmap_ledgers_drift.__LEDGER##_over++;           \
+                       pmap_ledgers_drift.__LEDGER##_over_total += bal; \
+                       if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
+                               pmap_ledgers_drift.__LEDGER##_over_max = bal; \
+                       }                                               \
+               } else if (bal < 0) {                                   \
+                       pmap_ledgers_drift.__LEDGER##_under++;          \
+                       pmap_ledgers_drift.__LEDGER##_under_total += bal; \
+                       if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
+                               pmap_ledgers_drift.__LEDGER##_under_max = bal; \
+                       }                                               \
+               }                                                       \
+       }                                                               \
+MACRO_END
+
+       LEDGER_CHECK_BALANCE(phys_footprint);
+       LEDGER_CHECK_BALANCE(internal);
+       LEDGER_CHECK_BALANCE(internal_compressed);
+       LEDGER_CHECK_BALANCE(iokit_mapped);
+       LEDGER_CHECK_BALANCE(alternate_accounting);
+       LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
+       LEDGER_CHECK_BALANCE(page_table);
+       LEDGER_CHECK_BALANCE(purgeable_volatile);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
+       LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
+       LEDGER_CHECK_BALANCE(network_volatile);
+       LEDGER_CHECK_BALANCE(network_nonvolatile);
+       LEDGER_CHECK_BALANCE(network_volatile_compressed);
+       LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
 
        if (do_panic) {
-               if (pmap_ledgers_panic &&
-                   pmap->pmap_stats_assert) {
+               if (pmap_ledgers_panic) {
                        panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
                              pmap, pid, procname);
                } else {
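The eleven hand-rolled per-ledger checks above collapse into the single LEDGER_CHECK_BALANCE() macro, which additionally consults ledger_get_panic_on_negative() and a pmap_ledgers_panic_leeway threshold before deciding whether an imbalance is panic-worthy. As a rough sketch of what the preprocessor generates (assuming MACRO_BEGIN/MACRO_END are XNU's usual do { ... } while (0) wrappers), LEDGER_CHECK_BALANCE(phys_footprint) expands to approximately:

do {
        int panic_on_negative = TRUE;
        ledger_get_balance(pmap->ledger, task_ledgers.phys_footprint, &bal);
        ledger_get_panic_on_negative(pmap->ledger, task_ledgers.phys_footprint,
                                     &panic_on_negative);
        if (bal != 0) {
                /* panic only if the ledger demands it, or the drift exceeds the
                 * configured leeway (in pages) in either direction */
                if (panic_on_negative ||
                    (pmap_ledgers_panic && pmap_ledgers_panic_leeway > 0 &&
                     (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||
                      bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) {
                        do_panic = TRUE;
                }
                printf("LEDGER BALANCE proc %d (%s) \"%s\" = %lld\n",
                       pid, procname, "phys_footprint", bal);
                if (bal > 0) {
                        pmap_ledgers_drift.phys_footprint_over++;
                        /* ... accumulate _over_total and track _over_max ... */
                } else if (bal < 0) {
                        pmap_ledgers_drift.phys_footprint_under++;
                        /* ... accumulate _under_total and track _under_max ... */
                }
        }
} while (0);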
@@ -10588,32 +11158,42 @@ pmap_enforces_execute_only(
 #endif
 }
 
-void
-pmap_set_jit_entitled(
+MARK_AS_PMAP_TEXT void
+pmap_set_jit_entitled_internal(
        __unused pmap_t pmap)
 {
        return;
 }
 
-static kern_return_t
+void
+pmap_set_jit_entitled(
+       pmap_t pmap)
+{
+       pmap_set_jit_entitled_internal(pmap);
+}
+
+MARK_AS_PMAP_TEXT static kern_return_t
 pmap_query_page_info_internal(
        pmap_t          pmap,
        vm_map_offset_t va,
        int             *disp_p)
 {
-       int             disp;
-       pmap_paddr_t    pa;
-       int             pai;
-       pt_entry_t      *pte;
-       pv_entry_t      **pv_h, *pve_p;
+       pmap_paddr_t    pa;
+       int             disp;
+       int             pai;
+       pt_entry_t      *pte;
+       pv_entry_t      **pv_h, *pve_p;
 
        if (pmap == PMAP_NULL || pmap == kernel_pmap) {
+               pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
                *disp_p = 0;
+               pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
                return KERN_INVALID_ARGUMENT;
        }
 
        disp = 0;
 
+       VALIDATE_PMAP(pmap);
        PMAP_LOCK(pmap);
 
        pte = pmap_pte(pmap, va);
@@ -10657,7 +11237,9 @@ pmap_query_page_info_internal(
 
 done:
        PMAP_UNLOCK(pmap);
+       pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
        *disp_p = disp;
+       pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
        return KERN_SUCCESS;
 }
 
@@ -10670,7 +11252,7 @@ pmap_query_page_info(
        return pmap_query_page_info_internal(pmap, va, disp_p);
 }
 
-kern_return_t
+MARK_AS_PMAP_TEXT kern_return_t
 pmap_return_internal(__unused boolean_t do_panic, __unused boolean_t do_recurse)
 {
 
@@ -10683,23 +11265,26 @@ pmap_return(boolean_t do_panic, boolean_t do_recurse)
        return pmap_return_internal(do_panic, do_recurse);
 }
 
-static void
+
+
+MARK_AS_PMAP_TEXT static void
 pmap_footprint_suspend_internal(
        vm_map_t        map,
        boolean_t       suspend)
 {
 #if DEVELOPMENT || DEBUG
        if (suspend) {
-               map->pmap->footprint_suspended = TRUE;
+               current_thread()->pmap_footprint_suspended = TRUE;
                map->pmap->footprint_was_suspended = TRUE;
        } else {
-               map->pmap->footprint_suspended = FALSE;
+               current_thread()->pmap_footprint_suspended = FALSE;
        }
 #else /* DEVELOPMENT || DEBUG */
        (void) map;
        (void) suspend;
 #endif /* DEVELOPMENT || DEBUG */
 }
+
 void
 pmap_footprint_suspend(
        vm_map_t map,
@@ -10707,3 +11292,113 @@ pmap_footprint_suspend(
 {
        pmap_footprint_suspend_internal(map, suspend);
 }
+
+#if defined(__arm64__) && (DEVELOPMENT || DEBUG)
+
+struct page_table_level_info {
+       uint64_t size;
+       uint64_t offmask;
+       uint64_t shift;
+       uint64_t index_mask;
+       uint64_t valid_mask;
+       uint64_t type_mask;
+       uint64_t type_block;
+};
+
+struct page_table_dump_header {
+       uint64_t pa;
+       uint64_t num_entries;
+       uint64_t start_va;
+       uint64_t end_va;
+};
+
+struct page_table_level_info page_table_levels[] =
+       { { ARM_TT_L0_SIZE, ARM_TT_L0_OFFMASK, ARM_TT_L0_SHIFT, ARM_TT_L0_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
+         { ARM_TT_L1_SIZE, ARM_TT_L1_OFFMASK, ARM_TT_L1_SHIFT, ARM_TT_L1_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
+         { ARM_TT_L2_SIZE, ARM_TT_L2_OFFMASK, ARM_TT_L2_SHIFT, ARM_TT_L2_INDEX_MASK, ARM_TTE_VALID, ARM_TTE_TYPE_MASK, ARM_TTE_TYPE_BLOCK },
+         { ARM_TT_L3_SIZE, ARM_TT_L3_OFFMASK, ARM_TT_L3_SHIFT, ARM_TT_L3_INDEX_MASK, ARM_PTE_TYPE_VALID, ARM_PTE_TYPE_MASK, ARM_TTE_TYPE_L3BLOCK } };
+
+static size_t
+pmap_dump_page_tables_recurse(const tt_entry_t *ttp,
+                              unsigned int cur_level,
+                              uint64_t start_va,
+                              void *bufp,
+                              void *buf_end)
+{
+       size_t bytes_used = 0;
+       uint64_t num_entries = ARM_PGBYTES / sizeof(*ttp);
+       uint64_t size = page_table_levels[cur_level].size;
+       uint64_t valid_mask = page_table_levels[cur_level].valid_mask;
+       uint64_t type_mask = page_table_levels[cur_level].type_mask;
+       uint64_t type_block = page_table_levels[cur_level].type_block;
+
+       if (cur_level == arm64_root_pgtable_level)
+               num_entries = arm64_root_pgtable_num_ttes;
+
+       uint64_t tt_size = num_entries * sizeof(tt_entry_t);
+       const tt_entry_t *tt_end = &ttp[num_entries];
+
+       if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
+               return 0;
+       }
+
+       struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
+       header->pa = ml_static_vtop((vm_offset_t)ttp);
+       header->num_entries = num_entries;
+       header->start_va = start_va;
+       header->end_va = start_va + (num_entries * size);
+
+       bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
+       bytes_used += (sizeof(*header) + tt_size);
+       uint64_t current_va = start_va;
+
+       for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
+               tt_entry_t tte = *ttep;
+
+               if (!(tte & valid_mask)) {
+                       continue;
+               }
+
+               if ((tte & type_mask) == type_block) {
+                       continue;
+               } else {
+                       if (cur_level >= PMAP_TT_MAX_LEVEL) {
+                               panic("%s: corrupt entry %#llx at %p, "
+                                     "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
+                                     __FUNCTION__, tte, ttep,
+                                     ttp, cur_level, bufp, buf_end);
+                       }
+
+                       const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
+
+                       size_t recurse_result = pmap_dump_page_tables_recurse(next_tt, cur_level + 1, current_va, (uint8_t*)bufp + bytes_used, buf_end);
+
+                       if (recurse_result == 0) {
+                               return 0;
+                       }
+
+                       bytes_used += recurse_result;
+               }
+       }
+
+       return bytes_used;
+}
+
+size_t
+pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end)
+{
+       if (not_in_kdp)
+               panic("pmap_dump_page_tables must only be called from kernel debugger context");
+       return pmap_dump_page_tables_recurse(pmap->tte, arm64_root_pgtable_level, pmap->min, bufp, buf_end);
+}
+
+#else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
+
+size_t
+pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused)
+{
+       return (size_t)-1;
+}
+
+#endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
+
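pmap_dump_page_tables() emits a flat byte stream: each table is written as a page_table_dump_header followed immediately by a verbatim copy of its num_entries translation-table entries, and child tables are appended recursively in the same format. A hypothetical debugger-side consumer (the walker below is illustrative only and not part of XNU) could enumerate the dumped tables like this:

#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct page_table_dump_header {
        uint64_t pa;            /* physical address of the dumped table */
        uint64_t num_entries;   /* number of TTEs copied after the header */
        uint64_t start_va;      /* VA covered by the first entry */
        uint64_t end_va;        /* VA just past the last entry */
};

/* 'len' is the byte count returned by pmap_dump_page_tables(). */
static void
walk_page_table_dump(const uint8_t *buf, size_t len)
{
        size_t off = 0;
        while (off + sizeof(struct page_table_dump_header) <= len) {
                const struct page_table_dump_header *h =
                    (const struct page_table_dump_header *)(const void *)(buf + off);
                printf("table @ pa 0x%" PRIx64 ": %" PRIu64 " entries, "
                       "VA [0x%" PRIx64 ", 0x%" PRIx64 ")\n",
                       h->pa, h->num_entries, h->start_va, h->end_va);
                /* skip the header plus the copied 64-bit table entries */
                off += sizeof(*h) + h->num_entries * sizeof(uint64_t);
        }
}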
index 7653401cf7e68e4fb8cdb6583a985adc67990e03..88d89086c6a5c0e2ab6d45d923379720e3401a80 100644 (file)
 
 #ifndef ASSEMBLER
 
+#include <stdatomic.h>
+#include <libkern/section_keywords.h>
 #include <mach/kern_return.h>
 #include <mach/machine/vm_types.h>
+#include <arm/pmap_public.h>
+#include <mach/arm/thread_status.h>
 
 #if __ARM_KERNEL_PROTECT__
 /*
 #define NBBY                           8
 
 struct pmap_cpu_data {
+#if defined(__arm64__)
+       pmap_t cpu_nested_pmap;
+#else
        pmap_t cpu_user_pmap;
-       unsigned int cpu_number;
        unsigned int cpu_user_pmap_stamp;
+#endif
+       unsigned int cpu_number;
+
 
        /*
         * This supports overloading of ARM ASIDs by the pmap.  The field needs
@@ -87,6 +96,9 @@ typedef struct pmap_cpu_data pmap_cpu_data_t;
 #include <kern/thread.h>
 #include <kern/queue.h>
 
+
+#include <sys/cdefs.h>
+
 /* Base address for low globals. */
 #define LOW_GLOBAL_BASE_ADDRESS 0xfffffff000000000ULL
 
@@ -102,14 +114,14 @@ typedef struct pmap_cpu_data pmap_cpu_data_t;
 
 #if defined(__arm64__)
 
+#define BOOTSTRAP_TABLE_SIZE (ARM_PGBYTES * 8)
+
 typedef uint64_t       tt_entry_t;                                     /* translation table entry type */
 #define TT_ENTRY_NULL   ((tt_entry_t *) 0)
 
 typedef uint64_t       pt_entry_t;                                     /* page table entry type */
 #define PT_ENTRY_NULL   ((pt_entry_t *) 0)
 
-typedef        uint64_t        pmap_paddr_t;                           /* physical address (not ppnum_t) */
-
 #elif defined(__arm__)
 
 typedef uint32_t        tt_entry_t;                            /* translation table entry type */
@@ -118,8 +130,6 @@ typedef uint32_t     tt_entry_t;                            /* translation table entry type */
 typedef uint32_t       pt_entry_t;                                     /* page table entry type */
 #define TT_ENTRY_NULL   ((tt_entry_t *) 0)
 
-typedef  uint32_t       pmap_paddr_t;                  /* physical address (not ppnum_t) */
-
 #else
 #error unknown arch
 #endif
@@ -164,22 +174,35 @@ typedef  uint32_t       pmap_paddr_t;                     /* physical address (not ppnum_t) */
 #define NPTES  (ARM_PGBYTES / sizeof(pt_entry_t))
 #endif
 
+extern void sync_tlb_flush(void);
+extern void flush_mmu_tlb_async(void);
 extern void flush_mmu_tlb(void);
+extern void flush_core_tlb_async(void);
 extern void flush_core_tlb(void);
 #if defined(__arm64__)
+extern void flush_mmu_tlb_allentries_async(uint64_t, uint64_t);
 extern void flush_mmu_tlb_allentries(uint64_t, uint64_t);
+extern void flush_mmu_tlb_entry_async(uint64_t);
 extern void flush_mmu_tlb_entry(uint64_t);
+extern void flush_mmu_tlb_entries_async(uint64_t, uint64_t);
 extern void flush_mmu_tlb_entries(uint64_t, uint64_t);
+extern void flush_mmu_tlb_asid_async(uint64_t);
 extern void flush_mmu_tlb_asid(uint64_t);
+extern void flush_core_tlb_asid_async(uint64_t);
 extern void flush_core_tlb_asid(uint64_t);
 
 #define tlbi_addr(x) (((x) >> TLBI_ADDR_SHIFT) & TLBI_ADDR_MASK)
 #define tlbi_asid(x) (((uint64_t)x << TLBI_ASID_SHIFT) & TLBI_ASID_MASK)
 #else
+extern void flush_mmu_tlb_entry_async(uint32_t);
 extern void flush_mmu_tlb_entry(uint32_t);
+extern void flush_mmu_tlb_entries_async(uint32_t, uint32_t);
 extern void flush_mmu_tlb_entries(uint32_t, uint32_t);
+extern void flush_mmu_tlb_mva_entries_async(uint32_t);
 extern void flush_mmu_tlb_mva_entries(uint32_t);
+extern void flush_mmu_tlb_asid_async(uint32_t);
 extern void flush_mmu_tlb_asid(uint32_t);
+extern void flush_core_tlb_asid_async(uint32_t);
 extern void flush_core_tlb_asid(uint32_t);
 #endif
 extern void flush_mmu_tlb_region(vm_offset_t va, unsigned length);
@@ -192,6 +215,7 @@ extern void set_mmu_ttb(uint64_t);
 extern void set_mmu_ttb_alternate(uint64_t);
 extern uint64_t get_tcr(void);
 extern void set_tcr(uint64_t);
+extern uint64_t pmap_get_arm64_prot(pmap_t, vm_offset_t);
 #else
 extern uint32_t get_mmu_control(void);
 extern void set_mmu_control(uint32_t);
@@ -255,6 +279,18 @@ extern pmap_paddr_t mmu_uvtop(vm_offset_t va);
 #define        PMAP_GC_INFLIGHT        1
 #define        PMAP_GC_WAIT            2
 
+#if DEVELOPMENT || DEBUG
+#define pmap_cs_log(msg, args...) printf("PMAP_CS: " msg "\n", args)
+#define pmap_cs_log_h(msg, args...) { if(pmap_cs_log_hacks) printf("PMAP_CS: " msg "\n", args); }
+
+#define PMAP_CS_EXCEPTION_LIST_HACK 1
+
+#else
+#define pmap_cs_log(msg, args...)
+#define pmap_cs_log_h(msg, args...)
+#endif /* DEVELOPMENT || DEBUG */
+
+
 /*
  *     Convert translation/page table entry to kernel virtual address
  */
@@ -266,42 +302,49 @@ struct pmap {
        pmap_paddr_t            ttep;                   /* translation table physical */
        vm_map_address_t        min;                    /* min address in pmap */
        vm_map_address_t        max;                    /* max address in pmap */
-       unsigned int            asid;                   /* address space id */
-       unsigned int            vasid;                  /* Virtual address space id */
-       unsigned int            stamp;                  /* creation stamp */
-       unsigned int            wired;                  /* wired bits */
-       volatile uint32_t       ref_count;              /* pmap reference count */
-       unsigned int            cpu_ref;                /* number of cpus using pmap */
-       unsigned int            gc_status;              /* gc status */
-       ledger_t                        ledger;                 /* ledger tracking phys mappings */
+       ledger_t                ledger;                 /* ledger tracking phys mappings */
        decl_simple_lock_data(,lock)            /* lock on map */
        struct pmap_statistics  stats;          /* map statistics */
        queue_chain_t           pmaps;                  /* global list of pmaps */
        tt_entry_t                      *tt_entry_free; /* free translation table entries */
        tt_entry_t                      *prev_tte;              /* previous translation table */
-       unsigned int            tte_index_max;  /* max tte index in translation table entries */
-       boolean_t                       nx_enabled;             /* no execute */
-       boolean_t                       nested;                 /* is nested */
-       boolean_t                       is_64bit;               /* is 64bit */
        struct pmap                     *nested_pmap;   /* nested pmap */
        vm_map_address_t        nested_region_grand_addr;
        vm_map_address_t        nested_region_subord_addr;
        vm_map_offset_t         nested_region_size;
+       vm_map_offset_t         nested_region_true_start;
+       vm_map_offset_t         nested_region_true_end;
        unsigned int            *nested_region_asid_bitmap;
-       unsigned int            nested_region_asid_bitmap_size;
 
 #if (__ARM_VMSA__ <= 7)
        decl_simple_lock_data(,tt1_lock)        /* lock on tt1 */
+       unsigned int            cpu_ref;                /* number of cpus using pmap */
 #endif
+
+
+       unsigned int            asid;                   /* address space id */
+       unsigned int            vasid;                  /* Virtual address space id */
+       unsigned int            stamp;                  /* creation stamp */
+       _Atomic int32_t         ref_count;              /* pmap reference count */
+       unsigned int            gc_status;              /* gc status */
+       unsigned int            nested_region_asid_bitmap_size;
+       unsigned int            tte_index_max;  /* max tte index in translation table entries */
+       uint32_t        nested_no_bounds_refcnt;/* number of pmaps that nested this pmap without bounds set */
+
 #if MACH_ASSERT
-       boolean_t               pmap_stats_assert;
        int                     pmap_pid;
        char                    pmap_procname[17];
+       bool            pmap_stats_assert;
 #endif /* MACH_ASSERT */
 #if DEVELOPMENT || DEBUG
-       boolean_t               footprint_suspended;
-       boolean_t               footprint_was_suspended;
+       bool            footprint_suspended;
+       bool            footprint_was_suspended;
 #endif /* DEVELOPMENT || DEBUG */
+       bool                    nx_enabled;                             /* no execute */
+       bool                    nested;                                 /* is nested */
+       bool                    is_64bit;                               /* is 64bit */
+       bool            nested_has_no_bounds_ref;       /* nested a pmap when the bounds were not set */
+       bool            nested_bounds_set;                      /* The nesting bounds have been set */
 };
 
 /* typedef struct pmap *pmap_t; */
@@ -312,6 +355,7 @@ struct pmap {
  * WIMG control
  */
 #define        VM_MEM_INNER            0x10
+#define VM_MEM_RT              0x10 // intentionally alias VM_MEM_INNER; will be used with mutually exclusive caching policies
 #define VM_MEM_EARLY_ACK       0x20
 
 #define        VM_WIMG_DEFAULT         (VM_MEM_COHERENT)
@@ -320,8 +364,8 @@ struct pmap {
 #define VM_WIMG_IO             (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED)
 #define VM_WIMG_POSTED         (VM_MEM_COHERENT | VM_MEM_NOT_CACHEABLE | VM_MEM_GUARDED | VM_MEM_EARLY_ACK)
 #define VM_WIMG_WTHRU          (VM_MEM_WRITE_THROUGH | VM_MEM_COHERENT | VM_MEM_GUARDED)
-#define VM_WIMG_WCOMB          (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT) 
-
+#define VM_WIMG_WCOMB          (VM_MEM_NOT_CACHEABLE | VM_MEM_COHERENT)
+#define VM_WIMG_RT             (VM_WIMG_IO | VM_MEM_RT)
 
 #if VM_DEBUG
 extern int      pmap_list_resident_pages(
@@ -355,6 +399,7 @@ extern pmap_paddr_t invalid_ttep;  /* physical invalid translation table addr */
  * platform dependent Prototypes
  */
 extern void pmap_switch_user_ttb(pmap_t pmap);
+extern void pmap_clear_user_ttb(void);
 extern void pmap_bootstrap(vm_offset_t);
 extern vm_map_address_t        pmap_ptov(pmap_t, ppnum_t);
 extern ppnum_t pmap_find_phys(pmap_t map, addr64_t va);
@@ -362,7 +407,7 @@ extern void pmap_set_pmap(pmap_t pmap, thread_t thread);
 extern void pmap_collect(pmap_t pmap);
 extern void pmap_gc(void);
 #if defined(__arm64__)
-extern vm_offset_t     pmap_extract(pmap_t pmap, vm_map_offset_t va);
+extern vm_offset_t pmap_extract(pmap_t pmap, vm_map_offset_t va);
 #endif
 
 /*
@@ -407,6 +452,8 @@ extern vm_offset_t  pmap_extract(pmap_t pmap, vm_map_offset_t va);
        copyout(from, to, cnt)
 
 extern pmap_paddr_t kvtophys(vm_offset_t va); 
+extern vm_map_address_t phystokv(pmap_paddr_t pa);
+extern vm_map_address_t phystokv_range(pmap_paddr_t pa, vm_size_t *max_len);
 
 extern vm_map_address_t pmap_map(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot, unsigned int flags);
 extern vm_map_address_t pmap_map_high_window_bd( vm_offset_t pa, vm_size_t len, vm_prot_t prot);
@@ -422,7 +469,6 @@ extern vm_map_address_t pmap_map_bd_with_options(vm_map_address_t va, vm_offset_
 extern vm_map_address_t pmap_map_bd(vm_map_address_t va, vm_offset_t sa, vm_offset_t ea, vm_prot_t prot);
 
 extern void pmap_init_pte_page(pmap_t, pt_entry_t *, vm_offset_t, unsigned int ttlevel, boolean_t alloc_ptd);
-extern void pmap_init_pte_static_page(pmap_t, pt_entry_t *, pmap_paddr_t);
 
 extern boolean_t pmap_valid_address(pmap_paddr_t addr);
 extern void pmap_disable_NX(pmap_t pmap);
@@ -452,6 +498,8 @@ extern boolean_t pmap_is_empty(pmap_t pmap, vm_map_offset_t start, vm_map_offset
 
 
 extern vm_map_offset_t pmap_max_offset(boolean_t is64, unsigned int option);
+extern vm_map_offset_t pmap_max_64bit_offset(unsigned int option);
+extern vm_map_offset_t pmap_max_32bit_offset(unsigned int option);
 
 boolean_t pmap_virtual_region(unsigned int region_select, vm_map_offset_t *startp, vm_map_size_t *size);
 
@@ -488,28 +536,50 @@ boolean_t pmap_enforces_execute_only(pmap_t pmap);
 #define PMAP_SET_PROCESS_INDEX 27
 #define PMAP_SWITCH_INDEX 28
 #define PMAP_SWITCH_USER_TTB_INDEX 29
-#define PMAP_UNHINT_KV_ADDR_INDEX 30
+#define PMAP_CLEAR_USER_TTB_INDEX 30
 #define PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX 31
 #define PMAP_UNNEST_OPTIONS_INDEX 32
 #define PMAP_FOOTPRINT_SUSPEND_INDEX 33
 #define PMAP_CPU_DATA_INIT_INDEX 34
 #define PMAP_RELEASE_PAGES_TO_KERNEL_INDEX 35
+#define PMAP_SET_JIT_ENTITLED_INDEX 36
+
+
+#define PMAP_TRIM_INDEX 64
+#define PMAP_LEDGER_ALLOC_INIT_INDEX 65
+#define PMAP_LEDGER_ALLOC_INDEX 66
+#define PMAP_LEDGER_FREE_INDEX 67
 
-#define MAX_PMAP_INDEX 36
+#define PMAP_COUNT 68
 
 #define PMAP_INVALID_CPU_NUM (~0U)
 
+struct pmap_cpu_data_array_entry {
+       pmap_cpu_data_t cpu_data;
+} __attribute__((aligned(1 << L2_CLINE)));
+
 /* Initialize the pmap per-CPU data for the current CPU. */
 extern void pmap_cpu_data_init(void);
 
 /* Get the pmap per-CPU data for the current CPU. */
 extern pmap_cpu_data_t * pmap_get_cpu_data(void);
 
+
 #define MARK_AS_PMAP_TEXT
 #define MARK_AS_PMAP_DATA
+#define MARK_AS_PMAP_RODATA
+
+
 
 extern kern_return_t pmap_return(boolean_t do_panic, boolean_t do_recurse);
 
+#define pmap_force_dcache_clean(va, sz)        CleanPoC_DcacheRegion_Force(va, sz)
+#define pmap_simple_lock(l)            simple_lock(l)
+#define pmap_simple_unlock(l)          simple_unlock(l)
+#define pmap_simple_lock_try(l)                simple_lock_try(l)
+#define pmap_lock_bit(l, i)            hw_lock_bit(l, i)
+#define pmap_unlock_bit(l, i)          hw_unlock_bit(l, i)
+
 #endif /* #ifndef ASSEMBLER */
 
 #if __ARM_KERNEL_PROTECT__
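Note that the new struct pmap_cpu_data_array_entry pads every CPU's pmap data out to an L2 cache-line boundary (1 << L2_CLINE bytes), so adjacent per-CPU entries can never false-share a line. Because aligning a struct also rounds its size up to a multiple of that alignment, the invariant could be sanity-checked with something like the following (illustrative only, not part of the diff):

_Static_assert((sizeof(struct pmap_cpu_data_array_entry) % (1 << L2_CLINE)) == 0,
               "per-CPU pmap data entries must not share an L2 cache line");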
diff --git a/osfmk/arm/pmap_public.h b/osfmk/arm/pmap_public.h
new file mode 100644 (file)
index 0000000..98393cc
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#ifndef _ARM_PMAP_PUBLIC_H_
+#define _ARM_PMAP_PUBLIC_H_
+
+#include <stddef.h>
+#include <mach/kern_return.h>
+#include <mach/vm_types.h>
+#include <mach/vm_prot.h>
+
+__BEGIN_DECLS
+
+#if defined(__arm64__)
+typedef uint64_t pmap_paddr_t;                 /* physical address (not ppnum_t) */
+#else
+typedef uint32_t pmap_paddr_t;                 /* physical address (not ppnum_t) */
+#endif
+
+
+__END_DECLS
+
+#endif /* _ARM_PMAP_PUBLIC_H_ */
index 28a99e3f56d268e4eea6905df1b295d82bff9bce..5d170e88dce40ed5d58e14d4fa052977811929af 100644 (file)
 #ifndef        _ARM_PROC_REG_H_
 #define        _ARM_PROC_REG_H_
 
-#if __ARM_KERNEL_PROTECT__
-/*
- * This feature is not currently implemented for 32-bit ARM CPU architectures.
- * A discussion of this feature for 64-bit ARM CPU architectures can be found
- * in the ARM64 version of this file.
- */
-#if __arm__
-#error __ARM_KERNEL_PROTECT__ is not supported on ARM32
-#endif
-#endif /* __ARM_KERNEL_PROTECT__ */
-
 #if defined (__arm64__)
 #include <pexpert/arm64/board_config.h>
 #elif defined (__arm__)
 #define __ARM_GLOBAL_SLEEP_BIT__ 1
 #define __ARM_PAN_AVAILABLE__ 1
 
+#elif defined (APPLEMONSOON)
+#define        __ARM_ARCH__    8
+#define __ARM_VMSA__   8
+#define        __ARM_SMP__     1
+#define        __ARM_AMP__     1
+#define        __ARM_VFP__     4
+#define __ARM_COHERENT_CACHE__ 1
+#define __ARM_COHERENT_IO__ 1
+#define        __ARM_IC_NOALIAS_ICACHE__ 1
+#define __ARM_L1_PTW__ 1
+#define __ARM_DEBUG__  7
+#define __ARM_ENABLE_SWAP__ 1
+#define __ARM_V8_CRYPTO_EXTENSIONS__ 1
+#define        __ARM_16K_PG__  1
+#define __ARM64_PMAP_SUBPAGE_L1__ 1
+#define __ARM_KERNEL_PROTECT__ 1
+#define __ARM_GLOBAL_SLEEP_BIT__ 1
+#define __ARM_PAN_AVAILABLE__ 1
+#define __ARM_WKDM_ISA_AVAILABLE__ 1
+#define __PLATFORM_WKDM_ALIGNMENT_MASK__ (0x3FULL)
+#define __PLATFORM_WKDM_ALIGNMENT_BOUNDARY__ (64)
+#define __ARM_CLUSTER_COUNT__ 2
+
+#elif defined (BCM2837)
+#define        __ARM_ARCH__    8
+#define __ARM_VMSA__   8
+#define        __ARM_SMP__     1
+#define        __ARM_VFP__             4
+#define __ARM_COHERENT_CACHE__ 1
+#define __ARM_L1_PTW__ 1
+#define __ARM_DEBUG__  7
+#define __ARM64_PMAP_SUBPAGE_L1__ 1
 #else
 #error processor not supported
 #endif
 
+#if __ARM_KERNEL_PROTECT__
+/*
+ * This feature is not currently implemented for 32-bit ARM CPU architectures.
+ * A discussion of this feature for 64-bit ARM CPU architectures can be found
+ * in the ARM64 version of this file.
+ */
+#if __arm__
+#error __ARM_KERNEL_PROTECT__ is not supported on ARM32
+#endif
+#endif /* __ARM_KERNEL_PROTECT__ */
+
 #if defined(ARM_BOARD_WFE_TIMEOUT_NS)
 #define __ARM_ENABLE_WFE_ 1
 #else
 
 #define        DFSR_WRITE              0x00000800      /* write data abort fault */
 
-#if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY)
+#if defined (ARMA7) || defined (APPLE_ARM64_ARCH_FAMILY) || defined (BCM2837)
 
 #define TEST_FSR_VMFAULT(status)       \
                                (((status) == FSR_PFAULT)       \
                                || ((status) == FSR_SACCESS)    \
                                || ((status) == FSR_PACCESS))
 
+#define TEST_FSR_TRANSLATION_FAULT(status)     \
+                               (((status) == FSR_SFAULT)       \
+                               || ((status) == FSR_PFAULT))
+
 #else
 
 #error Incompatible CPU type configured
 #define L2_SWAY        (L2_CSIZE - L2_NWAY)            /* set size 1<<L2_SWAY */
 #define L2_NSET        (L2_SWAY - L2_CLINE)            /* lines per way 1<<L2_NSET */
 
+#elif defined (APPLEMONSOON)
+
+/* I-Cache, 96KB for Monsoon, 48KB for Mistral, 6-way. */
+#define MMU_I_CLINE    6               /* cache line size as 1<<MMU_I_CLINE (64) */
+
+/* D-Cache, 64KB for Monsoon, 32KB for Mistral, 4-way. */
+#define MMU_CSIZE      16              /* cache size as 1<<MMU_CSIZE (64K) */
+#define MMU_CLINE      6               /* cache line size is 1<<MMU_CLINE (64) */
+#define MMU_NWAY       2               /* set associativity 1<<MMU_NWAY (4) */
+#define MMU_I7SET      6               /* cp15 c7 set incrementer 1<<MMU_I7SET */
+#define MMU_I7WAY      30              /* cp15 c7 way incrementer 1<<MMU_I7WAY */
+#define MMU_I9WAY      30              /* cp15 c9 way incrementer 1<<MMU_I9WAY */
+
+#define MMU_SWAY       (MMU_CSIZE - MMU_NWAY)  /* set size 1<<MMU_SWAY */
+#define MMU_NSET       (MMU_SWAY - MMU_CLINE)  /* lines per way 1<<MMU_NSET */
+
+/* L2-Cache */
+#define __ARM_L2CACHE__ 1
+
+/*
+ * LLC (Monsoon L2, Mistral L3): 8MB, 128-byte lines, 16-way.
+ * L2E (Mistral L2): 1MB, 64-byte lines, 8-way.
+ *
+ * TODO: Our L2 caches have different line sizes.  I begin to suspect
+ * this may be a problem.
+ */
+#define L2_CSIZE       __ARM_L2CACHE_SIZE_LOG__        /* cache size as 1<<L2_CSIZE */
+#define L2_CLINE       7               /* cache line size as 1<<L2_CLINE (128) */
+#define L2_NWAY                4               /* set associativity as 1<<L2_NWAY (16) */
+#define L2_I7SET       6               /* TODO: cp15 c7 set incrementer 1<<L2_I7SET */
+#define L2_I7WAY       28              /* TODO: cp15 c7 way incrementer 1<<L2_I7WAY */
+#define L2_I9WAY       28              /* TODO: cp15 c9 way incrementer 1<<L2_I9WAY */
+
+#define L2_SWAY        (L2_CSIZE - L2_NWAY)            /* set size 1<<L2_SWAY */
+#define L2_NSET        (L2_SWAY - L2_CLINE)            /* lines per way 1<<L2_NSET */
+
+#elif defined (BCM2837) /* Raspberry Pi 3 */
+
+/* I-Cache. We don't have a detailed spec, so we just follow the ARM technical reference. */
+#define MMU_I_CLINE    6
+
+/* D-Cache. */
+#define MMU_CSIZE      15
+#define MMU_CLINE      6
+#define MMU_NWAY       4
+
+#define MMU_I7SET      6
+#define MMU_I7WAY      30
+#define MMU_I9WAY      30
+
+#define MMU_SWAY       (MMU_CSIZE - MMU_NWAY)
+#define MMU_NSET       (MMU_SWAY - MMU_CLINE)
+
+#define __ARM_L2CACHE__ 1
+
+#define L2_CSIZE       __ARM_L2CACHE_SIZE_LOG__
+#define L2_CLINE       6
+#define L2_NWAY                4               
+#define L2_I7SET       6               
+#define L2_I7WAY       28              
+#define L2_I9WAY       28              
+#define L2_SWAY        (L2_CSIZE - L2_NWAY)
+#define L2_NSET        (L2_SWAY - L2_CLINE)
+
 #else
 #error processor not supported
 #endif
 #define ARM_PTE_NG                                             0x00000800                       /* value for a per-process mapping */
 
 #define ARM_PTE_SHSHIFT                                        10
-#define ARM_PTE_SH_MASK                                        0x00000400                       /* shared (SMP) mapping mask */
-#define ARM_PTE_SH                                             0x00000400                       /* shared (SMP) mapping */
+#define ARM_PTE_SHMASK                                 0x00000400                       /* shared (SMP) mapping mask */
+#define ARM_PTE_SH                                     0x00000400                       /* shared (SMP) mapping */
 
 #define ARM_PTE_CBSHIFT                                        2
 #define ARM_PTE_CB(x)                                  ((x)<<ARM_PTE_CBSHIFT)
index 0fb708ede83624a2dfdd1ed6215024c4a4c3d2db..f1fac9bee351521f6c817effca4b886bbce46f0c 100644 (file)
@@ -78,10 +78,18 @@ extern void hw_lock_bit(
                                hw_lock_bit_t *,
                                unsigned int);
 
+extern void    hw_lock_bit_nopreempt(
+                               hw_lock_bit_t *,
+                               unsigned int);
+
 extern void    hw_unlock_bit(
                                hw_lock_bit_t *,
                                unsigned int);
 
+extern void    hw_unlock_bit_nopreempt(
+                               hw_lock_bit_t *,
+                               unsigned int);
+
 extern unsigned int hw_lock_bit_try(
                                hw_lock_bit_t *,
                                unsigned int);
@@ -179,8 +187,11 @@ extern void        arm_usimple_lock_init(simple_lock_t, __unused unsigned short);
 
 #define simple_lock_init(l,t)  arm_usimple_lock_init(l,t)
 #define simple_lock(l)                 lck_spin_lock(l)
+#define simple_lock_nopreempt(l)       lck_spin_lock_nopreempt(l)
 #define simple_unlock(l)               lck_spin_unlock(l)
+#define simple_unlock_nopreempt(l)     lck_spin_unlock_nopreempt(l)
 #define simple_lock_try(l)             lck_spin_try_lock(l)
+#define simple_lock_try_nopreempt(l)   lck_spin_try_lock_nopreempt(l)
 #define simple_lock_try_lock_loop(l)   simple_lock(l)
 #define simple_lock_addr(l)            (&(l))
 #define simple_lock_assert(l,t)        lck_spin_assert(l,t)
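The new *_nopreempt wrappers map onto the lck_spin_*_nopreempt primitives, which acquire or release the spinlock without touching the preemption count; they are meant for paths that already run with preemption disabled and want to avoid a redundant disable/enable pair. A minimal usage sketch (my_lock is a hypothetical simple lock, and the surrounding preemption handling is assumed):

        /* preemption is already off in this path, so skip the extra toggle */
        mp_disable_preemption();
        simple_lock_nopreempt(&my_lock);
        /* ... short critical section ... */
        simple_unlock_nopreempt(&my_lock);
        mp_enable_preemption();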
index 94128c4bf09009830959ade1a01279863eddd91e..8fffe7c1ca05e0912e980b784c7cc17a502c9101 100644 (file)
@@ -71,6 +71,47 @@ unsigned int    _MachineStateCount[] = {
 
 extern zone_t ads_zone;
 
+kern_return_t
+machine_thread_state_convert_to_user(
+                        __unused thread_t thread,
+                        __unused thread_flavor_t flavor,
+                        __unused thread_state_t tstate,
+                        __unused mach_msg_type_number_t *count)
+{
+       // No conversion to userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_state_convert_from_user(
+                        __unused thread_t thread,
+                        __unused thread_flavor_t flavor,
+                        __unused thread_state_t tstate,
+                        __unused mach_msg_type_number_t count)
+{
+       // No conversion from userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_siguctx_pointer_convert_to_user(
+                        __unused thread_t thread,
+                        __unused user_addr_t *uctxp)
+{
+       // No conversion to userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_function_pointers_convert_from_user(
+                        __unused thread_t thread,
+                        __unused user_addr_t *fptrs,
+                        __unused uint32_t count)
+{
+       // No conversion from userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
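On this platform the four conversion hooks above are identity operations; they give generic Mach thread-state code a single place to translate between the in-kernel and user-visible register representations on platforms where the two differ (for instance, where return addresses carry pointer-authentication signatures). A hypothetical caller-side sketch (thread_state_to_user is an assumed name, not an XNU function):

kern_return_t
thread_state_to_user(thread_t thread, thread_flavor_t flavor,
                     thread_state_t state, mach_msg_type_number_t *count)
{
        /* Fetch the raw machine state, then convert it to the form that may
         * be copied out to userspace. */
        kern_return_t kr = machine_thread_get_state(thread, flavor, state, count);
        if (kr != KERN_SUCCESS) {
                return kr;
        }
        return machine_thread_state_convert_to_user(thread, flavor, state, count);
}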
 /*
  * Routine:    machine_thread_get_state
  *
@@ -468,7 +509,8 @@ vfp_state_initialize(struct arm_vfpsaved_state *vfp_state)
 kern_return_t
 machine_thread_dup(
                   thread_t self,
-                  thread_t target)
+                  thread_t target,
+                  __unused boolean_t is_corpse)
 {
        struct arm_saved_state *self_saved_state;
        struct arm_saved_state *target_saved_state;
index a2c691c740e427d82ed1e6864728662b114f2634..6c1f357277bc5ad6d97024b54c6c9abc920908e7 100644 (file)
@@ -51,6 +51,7 @@
 #include <kern/sched_prim.h>
 
 #include <sys/kdebug.h>
+#include <kperf/kperf.h>
 
 #include <arm/trap.h>
 #include <arm/caches_internal.h>
@@ -142,7 +143,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
        /* Check to see if we've hit a userland probe */
        if ((regs->cpsr & PSR_MODE_MASK) == PSR_USER_MODE) {
                if (regs->cpsr & PSR_TF) {
-                       uint16_t instr;
+                       uint16_t instr = 0;
 
                        if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint16_t))) != KERN_SUCCESS)
                                goto exit;
@@ -153,7 +154,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
                                        goto exit;
                        }
                } else {
-                       uint32_t instr;
+                       uint32_t instr = 0;
 
                        if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint32_t))) != KERN_SUCCESS)
                                goto exit;
@@ -169,13 +170,13 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
 
 
        if (regs->cpsr & PSR_TF) {
-               unsigned short instr;
+               unsigned short instr = 0;
 
                if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(unsigned short))) != KERN_SUCCESS)
                        goto exit;
 
                if (IS_THUMB32(instr)) {
-                       unsigned int    instr32;
+                       unsigned int instr32;
 
                        instr32 = (instr<<16);
 
@@ -202,7 +203,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
                        }
                }
        } else {
-               uint32_t instr;
+               uint32_t instr = 0;
 
                if(COPYIN((user_addr_t)(regs->pc), (char *)&instr,(vm_size_t)(sizeof(uint32_t))) != KERN_SUCCESS)
                        goto exit;
@@ -238,17 +239,7 @@ sleh_undef(struct arm_saved_state * regs, struct arm_vfpsaved_state * vfp_ss __u
                        (void) ml_set_interrupts_enabled(intr);
                        goto exit;
                }
-               panic_context(exception, (void *)regs, "undefined kernel instruction\n"
-                     "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                     "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                     "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                     "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                     "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                     regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                     regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                     regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                     regs->r[12], regs->sp, regs->lr, regs->pc,
-                     regs->cpsr, regs->fsr, regs->far);
+               panic_with_thread_kernel_state("undefined kernel instruction", regs);
 
                (void) ml_set_interrupts_enabled(intr);
 
@@ -306,8 +297,14 @@ sleh_abort(struct arm_saved_state * regs, int type)
        /* Done with asynchronous handling; re-enable here so that subsequent aborts are taken as early as possible. */
        reenable_async_aborts();
 
-       if (ml_at_interrupt_context())
-               panic_with_thread_kernel_state("sleh_abort at interrupt context", regs);
+       if (ml_at_interrupt_context()) {
+#if CONFIG_DTRACE
+               if (!(thread->options & TH_OPT_DTRACE))
+#endif /* CONFIG_DTRACE */
+               {
+                       panic_with_thread_kernel_state("sleh_abort at interrupt context", regs);
+               }
+       }
 
        fault_addr = vaddr = regs->far;
 
@@ -339,7 +336,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                        /* Cache operations report faults as write access, change these to read access */
                        /* Cache operations are invoked from arm mode for now */
                        if (!(regs->cpsr & PSR_TF)) {
-                               unsigned int    ins;
+                               unsigned int ins = 0;
 
                                if(COPYIN((user_addr_t)(regs->pc), (char *)&ins,(vm_size_t)(sizeof(unsigned int))) != KERN_SUCCESS)
                                        goto exit;
@@ -355,7 +352,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                         * a write fault.
                         */
                        if (!(regs->cpsr & PSR_TF)) {
-                               unsigned int    ins;
+                               unsigned int ins = 0;
 
                                if(COPYIN((user_addr_t)(regs->pc), (char *)&ins,(vm_size_t)(sizeof(unsigned int))) != KERN_SUCCESS)
                                        goto exit;
@@ -387,18 +384,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                                (void) ml_set_interrupts_enabled(intr);
                                goto exit;
                        }
-                       panic_context(EXC_BAD_ACCESS, (void*)regs, "sleh_abort: prefetch abort in kernel mode: fault_addr=0x%x\n"
-                             "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                             "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                             "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                             "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                             "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                             fault_addr,
-                             regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                             regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                             regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                             regs->r[12], regs->sp, regs->lr, regs->pc,
-                             regs->cpsr, regs->fsr, regs->far);
+                       panic_with_thread_kernel_state("prefetch abort in kernel mode", regs);
 
                        (void) ml_set_interrupts_enabled(intr);
 
@@ -412,17 +398,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                                        goto exit;
                                } else {
                                        intr = ml_set_interrupts_enabled(FALSE);
-                                       panic_context(EXC_BAD_ACCESS, (void *)regs, "Unexpected page fault under dtrace_probe"
-                                             "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                                             "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                                             "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                                             "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                                             "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                                             regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                                             regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                                             regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                                             regs->r[12], regs->sp, regs->lr, regs->pc,
-                                             regs->cpsr, regs->fsr, regs->far);
+                                       panic_with_thread_kernel_state("Unexpected page fault under dtrace_probe", regs);
 
                                        (void) ml_set_interrupts_enabled(intr);
 
@@ -436,10 +412,12 @@ sleh_abort(struct arm_saved_state * regs, int type)
                        else
                                map = thread->map;
 
-                       /* check to see if it is just a pmap ref/modify fault */
-                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
-                       if (result == KERN_SUCCESS)
-                               goto exit;
+                       if (!TEST_FSR_TRANSLATION_FAULT(status)) {
+                               /* check to see if it is just a pmap ref/modify fault */
+                               result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
+                               if (result == KERN_SUCCESS)
+                                       goto exit;
+                       }
 
                        /*
                         *  We have to "fault" the page in.
@@ -468,18 +446,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                        } else {
                                intr = ml_set_interrupts_enabled(FALSE);
 
-                               panic_context(EXC_BAD_ACCESS, (void *)regs, "unaligned kernel data access: pc=0x%08x fault_addr=0x%x\n"
-                                     "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                                     "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                                     "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                                     "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                                     "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                                     regs->pc, fault_addr,
-                                     regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                                     regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                                     regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                                     regs->r[12], regs->sp, regs->lr, regs->pc,
-                                     regs->cpsr, regs->fsr, regs->far);
+                               panic_with_thread_kernel_state("unaligned kernel data access", regs);
 
                                (void) ml_set_interrupts_enabled(intr);
 
@@ -489,7 +456,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                }
                intr = ml_set_interrupts_enabled(FALSE);
 
-               panic_context(EXC_BAD_ACCESS, (void *)regs, "kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n"
+               panic_plain("kernel abort type %d: fault_type=0x%x, fault_addr=0x%x\n"
                      "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
                      "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
                      "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
@@ -519,17 +486,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                                } else {
                                        intr = ml_set_interrupts_enabled(FALSE);
 
-                                       panic_context(EXC_BAD_ACCESS, (void *)regs, "copyin/out has no recovery point"
-                                             "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                                             "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                                             "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                                             "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                                             "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                                             regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                                             regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                                             regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                                             regs->r[12], regs->sp, regs->lr, regs->pc,
-                                             regs->cpsr, regs->fsr, regs->far);
+                                       panic_with_thread_kernel_state("copyin/out has no recovery point", regs);
 
                                        (void) ml_set_interrupts_enabled(intr);
                                }
@@ -537,17 +494,7 @@ sleh_abort(struct arm_saved_state * regs, int type)
                        } else {
                                intr = ml_set_interrupts_enabled(FALSE);
 
-                               panic_context(EXC_BAD_ACCESS, (void*)regs, "Unexpected UMW page fault under dtrace_probe"
-                                     "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                                     "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                                     "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                                     "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                                     "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                                     regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                                     regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                                     regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                                     regs->r[12], regs->sp, regs->lr, regs->pc,
-                                     regs->cpsr, regs->fsr, regs->far);
+                               panic_with_thread_kernel_state("Unexpected UMW page fault under dtrace_probe", regs);
 
                                (void) ml_set_interrupts_enabled(intr);
 
@@ -556,16 +503,19 @@ sleh_abort(struct arm_saved_state * regs, int type)
                }
 #endif
 
-               /* check to see if it is just a pmap ref/modify fault */
-               result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE);
-               if (result != KERN_SUCCESS) {
-                       /*
-                        * We have to "fault" the page in.
-                        */
-                       result = vm_fault(map, fault_addr, fault_type,
-                                         FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
-                                         THREAD_ABORTSAFE, NULL, 0);
+               if (!TEST_FSR_TRANSLATION_FAULT(status)) {
+                       /* check to see if it is just a pmap ref/modify fault */
+                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, TRUE);
+                       if (result == KERN_SUCCESS)
+                               goto exception_return;
                }
+
+               /*
+                * We have to "fault" the page in.
+                */
+               result = vm_fault(map, fault_addr, fault_type,
+                                 FALSE /* change_wiring */, VM_KERN_MEMORY_NONE,
+                                 THREAD_ABORTSAFE, NULL, 0);
                if (result == KERN_SUCCESS || result == KERN_ABORTED) {
                        goto exception_return;
                }
@@ -614,7 +564,7 @@ static kern_return_t
 sleh_alignment(struct arm_saved_state * regs)
 {
        unsigned int    status;
-       unsigned int    ins;
+       unsigned int    ins = 0;
        unsigned int    rd_index;
        unsigned int    base_index;
        unsigned int    paddr;
@@ -650,7 +600,7 @@ sleh_alignment(struct arm_saved_state * regs)
        paddr = regs->far;
 
        if (regs->cpsr & PSR_TF) {
-                unsigned short  ins16;
+                unsigned short ins16 = 0;
 
                /* Get aborted instruction */
 #if    __ARM_SMP__ || __ARM_USER_PROTECT__
@@ -859,9 +809,10 @@ void
 interrupt_trace_exit(
                     void)
 {
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-               MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END,
-               0, 0, 0, 0, 0);
+#if KPERF
+       kperf_interrupt();
+#endif /* KPERF */
+       KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END);
 }
 #endif
 
@@ -878,17 +829,17 @@ interrupt_stats(void)
 static void 
 panic_with_thread_kernel_state(const char *msg, struct arm_saved_state *regs)
 {
-               panic_context(0, (void*)regs, "%s (saved state:%p)\n"
-                             "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
-                             "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
-                             "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
-                             "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
-                             "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
-                                 msg, regs,
-                             regs->r[0], regs->r[1], regs->r[2], regs->r[3],
-                             regs->r[4], regs->r[5], regs->r[6], regs->r[7],
-                             regs->r[8], regs->r[9], regs->r[10], regs->r[11],
-                             regs->r[12], regs->sp, regs->lr, regs->pc,
-                             regs->cpsr, regs->fsr, regs->far);
+       panic_plain("%s (saved state:%p)\n"
+                   "r0:   0x%08x  r1: 0x%08x  r2: 0x%08x  r3: 0x%08x\n"
+                   "r4:   0x%08x  r5: 0x%08x  r6: 0x%08x  r7: 0x%08x\n"
+                   "r8:   0x%08x  r9: 0x%08x r10: 0x%08x r11: 0x%08x\n"
+                   "r12:  0x%08x  sp: 0x%08x  lr: 0x%08x  pc: 0x%08x\n"
+                   "cpsr: 0x%08x fsr: 0x%08x far: 0x%08x\n",
+                   msg, regs,
+                   regs->r[0], regs->r[1], regs->r[2], regs->r[3],
+                   regs->r[4], regs->r[5], regs->r[6], regs->r[7],
+                   regs->r[8], regs->r[9], regs->r[10], regs->r[11],
+                   regs->r[12], regs->sp, regs->lr, regs->pc,
+                   regs->cpsr, regs->fsr, regs->far);
 
 }
index a4caeaa5a69ebfdfa1d505de01a7913b1b2ae629..b1d02a39c3764bc1738a0e7ef6ff4694db4a1f8e 100644 (file)
         || (((op) & THUMB_SIMD_VFP_MASK2) == THUMB_SIMD_VFP_CODE2 )    \
         || (((op) & THUMB_SIMD_VFP_MASK3) == THUMB_SIMD_VFP_CODE3))
 
-extern boolean_t arm_swap_readable_type(vm_map_address_t, unsigned int /* spsr */);
 extern boolean_t arm_force_fast_fault(ppnum_t, vm_prot_t, int, void *);
 extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean_t);
 
@@ -250,9 +249,7 @@ extern kern_return_t arm_fast_fault(pmap_t, vm_map_address_t, vm_prot_t, boolean
        (((((op)&ARM_CDX_MASK) == ARM_STC) || \
         (((op)&ARM_STRH_MASK) == ARM_STRH) || \
         (((op)&ARM_BDX_MASK) == ARM_STM) || \
-        (((op)&ARM_SDX_MASK) == ARM_STR) || \
-        ((((op)&ARM_SWP_MASK) == ARM_SWP) && \
-               arm_swap_readable_type(vaddr,spsr))) ?  \
+        (((op)&ARM_SDX_MASK) == ARM_STR)) ?  \
                        (VM_PROT_WRITE|VM_PROT_READ) : (VM_PROT_READ))
        
 #define thumb_fault_type(op,spsr,vaddr) \
index ad75e8a11a124ee67fb17e08bc3baa8ebd8ba462..4498b9ff016f26c85671866f72c39724c7022f16 100644 (file)
@@ -6,6 +6,7 @@ export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
 include $(MakeInc_cmd)
 include $(MakeInc_def)
 
+
 ARM_HEADER_FILES = \
                lowglobals.h            \
                machine_cpuid.h         \
@@ -22,7 +23,7 @@ INSTALL_KF_MD_LIST = $(ARM_HEADER_FILES)
 
 INSTALL_KF_MD_LCL_LIST = machine_kpc.h monotonic.h pgtrace.h $(ARM_HEADER_FILES)
 
-EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h monotonic.h proc_reg.h pgtrace.h
+EXPORT_MD_LIST = machine_cpuid.h machine_kpc.h monotonic.h proc_reg.h pgtrace.h asm.h
 
 
 EXPORT_MD_DIR = arm64
index 9b06f950478d0dace888789179068a0f9af23ef2..bfad29bf552e93bcd5afd0762cab607cb8c5fd77 100644 (file)
 #include <vm/vm_page.h>
 #include <vm/pmap.h>
 
+#include <arm/atomic.h>
 #include <arm64/proc_reg.h>
 #include <arm64/lowglobals.h>
 #include <arm/cpu_data_internal.h>
 #include <arm/misc_protos.h>
 #include <pexpert/arm64/boot.h>
+#include <pexpert/device_tree.h>
 
 #include <libkern/kernel_mach_header.h>
 #include <libkern/section_keywords.h>
 
-#if __ARM_KERNEL_PROTECT__
-#include <arm/atomic.h>
-#endif /* __ARM_KERNEL_PROTECT__ */
+#include <san/kasan.h>
 
 #if __ARM_KERNEL_PROTECT__
 /*
@@ -69,6 +69,8 @@ static_assert((KERNEL_PMAP_HEAP_RANGE_START & ~ARM_TT_ROOT_OFFMASK) > ARM_KERNEL
 static_assert((((~ARM_KERNEL_PROTECT_EXCEPTION_START) + 1) * 2ULL) <= (ARM_TT_ROOT_SIZE + ARM_TT_ROOT_INDEX_MASK));
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+#define ARM_DYNAMIC_TABLE_XN (ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN)
+
 #if KASAN
 extern vm_offset_t shadow_pbase;
 extern vm_offset_t shadow_ptop;
@@ -81,6 +83,9 @@ extern vm_offset_t physmap_vtop;
  */
 extern void *last_kernel_symbol;
 
+extern void arm64_replace_bootstack(cpu_data_t*);
+extern void PE_slide_devicetree(vm_offset_t);
+
 /*
  * KASLR parameters
  */
@@ -103,17 +108,67 @@ SECURITY_READ_ONLY_LATE(vm_offset_t) vm_prelink_einfo;
 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_slinkedit;
 SECURITY_READ_ONLY_LATE(vm_offset_t) vm_elinkedit;
 
+SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text;
+SECURITY_READ_ONLY_LATE(vm_offset_t) vm_kernel_builtinkmod_text_end;
+
 /* Used by <mach/arm/vm_param.h> */
 SECURITY_READ_ONLY_LATE(unsigned long) gVirtBase;
 SECURITY_READ_ONLY_LATE(unsigned long) gPhysBase;
 SECURITY_READ_ONLY_LATE(unsigned long) gPhysSize;
+SECURITY_READ_ONLY_LATE(unsigned long) gT0Sz = T0SZ_BOOT;
+SECURITY_READ_ONLY_LATE(unsigned long) gT1Sz = T1SZ_BOOT;
+
+/* 23543331 - step 1 of kext / kernel __TEXT and __DATA colocation is to move 
+ * all kexts before the kernel.  This is only for arm64 devices and looks 
+ * something like the following:
+ * -- vmaddr order --
+ * 0xffffff8004004000 __PRELINK_TEXT
+ * 0xffffff8007004000 __TEXT (xnu)
+ * 0xffffff80075ec000 __DATA (xnu)
+ * 0xffffff80076dc000 __KLD (xnu)
+ * 0xffffff80076e0000 __LAST (xnu)
+ * 0xffffff80076e4000 __LINKEDIT (xnu)
+ * 0xffffff80076e4000 __PRELINK_DATA (not used yet)
+ * 0xffffff800782c000 __PRELINK_INFO
+ * 0xffffff80078e4000 -- End of kernelcache
+ */
 
-
-/*
- * NOTE: mem_size is bogus on large memory machines. 
- *       We will pin it to 0x80000000 if there is more than 2 GB
- *       This is left only for compatibility and max_mem should be used.
+/* 24921709 - make XNU ready for KTRR
+ *
+ * Two possible kernel cache layouts, depending on which kcgen is being used.
+ * VAs increasing downwards.
+ * Old KCGEN:
+ *
+ * __PRELINK_TEXT
+ * __TEXT
+ * __DATA_CONST
+ * __TEXT_EXEC
+ * __KLD
+ * __LAST
+ * __DATA
+ * __PRELINK_DATA (expected empty)
+ * __LINKEDIT
+ * __PRELINK_INFO
+ *
+ * New kcgen:
+ *
+ * __PRELINK_TEXT    <--- First KTRR (ReadOnly) segment
+ * __PLK_DATA_CONST
+ * __PLK_TEXT_EXEC
+ * __TEXT
+ * __DATA_CONST
+ * __TEXT_EXEC
+ * __KLD
+ * __LAST            <--- Last KTRR (ReadOnly) segment
+ * __DATA
+ * __BOOTDATA (if present)
+ * __LINKEDIT
+ * __PRELINK_DATA (expected populated now)
+ * __PLK_LINKEDIT
+ * __PRELINK_INFO
+ *
  */
+
 vm_offset_t mem_size;                             /* Size of actual physical memory present
                                                    * minus any performance buffer and possibly
                                                    * limited by mem_limit in bytes */
@@ -129,6 +184,11 @@ addr64_t    vm_last_addr = VM_MAX_KERNEL_ADDRESS; /* Highest kernel
                                                    * virtual address known
                                                    * to the VM system */
 
+SECURITY_READ_ONLY_LATE(vm_offset_t)             segEXTRADATA;
+SECURITY_READ_ONLY_LATE(unsigned long)           segSizeEXTRADATA;
+
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segLOWESTTEXT;
+
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segTEXTB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeTEXT;
 
@@ -143,13 +203,19 @@ SECURITY_READ_ONLY_LATE(static vm_offset_t)   segDATAB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeDATA;
 
 
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segBOOTDATAB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizeBOOTDATA;
+extern vm_offset_t                            intstack_low_guard;
+extern vm_offset_t                            intstack_high_guard;
+extern vm_offset_t                            excepstack_high_guard;
+
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segLINKB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLINK;
 
 SECURITY_READ_ONLY_LATE(static vm_offset_t)   segKLDB;
 SECURITY_READ_ONLY_LATE(static unsigned long) segSizeKLD;
-SECURITY_READ_ONLY_LATE(static vm_offset_t)   segLASTB;
-SECURITY_READ_ONLY_LATE(static unsigned long) segSizeLAST;
+SECURITY_READ_ONLY_LATE(vm_offset_t)          segLASTB;
+SECURITY_READ_ONLY_LATE(unsigned long)        segSizeLAST;
 
 SECURITY_READ_ONLY_LATE(vm_offset_t)          segPRELINKTEXTB;
 SECURITY_READ_ONLY_LATE(unsigned long)        segSizePRELINKTEXT;
@@ -199,14 +265,13 @@ SECURITY_READ_ONLY_LATE(vm_offset_t)     first_avail;
 SECURITY_READ_ONLY_LATE(vm_offset_t)     static_memory_end;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_start;
 SECURITY_READ_ONLY_LATE(pmap_paddr_t)    avail_end;
+SECURITY_READ_ONLY_LATE(pmap_paddr_t)    real_avail_end;
 
 #if __ARM_KERNEL_PROTECT__
 extern void ExceptionVectorsBase;
 extern void ExceptionVectorsEnd;
 #endif /* __ARM_KERNEL_PROTECT__ */
 
-#define        MEM_SIZE_MAX            0x100000000ULL
-
 #if defined(KERNEL_INTEGRITY_KTRR)
 #if __ARM64_TWO_LEVEL_PMAP__
 /* We could support this configuration, but it adds memory overhead. */
@@ -214,6 +279,57 @@ extern void ExceptionVectorsEnd;
 #endif
 #endif
 
+typedef struct {
+       pmap_paddr_t pa;
+       vm_map_address_t va;
+       vm_size_t len;
+} ptov_table_entry;
+
+#define PTOV_TABLE_SIZE        8
+SECURITY_READ_ONLY_LATE(static ptov_table_entry)       ptov_table[PTOV_TABLE_SIZE];
+SECURITY_READ_ONLY_LATE(static boolean_t)              kva_active = FALSE;
+
+
+vm_map_address_t
+phystokv(pmap_paddr_t pa)
+{
+       for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
+               if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len)))
+                       return (pa - ptov_table[i].pa + ptov_table[i].va);
+       }
+       return (pa - gPhysBase + gVirtBase);
+}
+
+vm_map_address_t
+phystokv_range(pmap_paddr_t pa, vm_size_t *max_len)
+{
+       vm_size_t len;
+       for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
+               if ((pa >= ptov_table[i].pa) && (pa < (ptov_table[i].pa + ptov_table[i].len))) {
+                       len = ptov_table[i].len - (pa - ptov_table[i].pa);
+                       if (*max_len > len)
+                               *max_len = len;
+                       return (pa - ptov_table[i].pa + ptov_table[i].va);
+               }
+       }
+       len = PAGE_SIZE - (pa & PAGE_MASK);
+       if (*max_len > len)
+               *max_len = len;
+       return (pa - gPhysBase + gVirtBase);
+}
+
+vm_offset_t
+ml_static_vtop(vm_offset_t va)
+{
+       for (size_t i = 0; (i < PTOV_TABLE_SIZE) && (ptov_table[i].len != 0); i++) {
+               if ((va >= ptov_table[i].va) && (va < (ptov_table[i].va + ptov_table[i].len)))
+                       return (va - ptov_table[i].va + ptov_table[i].pa);
+       }
+       if (((vm_address_t)(va) - gVirtBase) >= gPhysSize)
+               panic("ml_static_vtop(): illegal VA: %p\n", (void*)va);
+       return ((vm_address_t)(va) - gVirtBase + gPhysBase);
+}
+
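The ptov_table introduced above replaces the single linear gVirtBase/gPhysBase offset with a small lookup table, so individual physical ranges can be slid to independent virtual addresses while everything else falls through to the legacy linear mapping. A minimal standalone sketch of the same lookup, using made-up bases and one hypothetical table entry (none of these values are the kernel's real ones):

    /* Standalone sketch of the ptov_table lookup used by phystokv() above.
     * All addresses are illustrative, not the kernel's actual values. */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t pa, va, len; } ptov_entry;

    #define PTOV_SIZE 8
    static ptov_entry table[PTOV_SIZE] = {
        { 0x800000000ULL, 0xfffffff070000000ULL, 0x40000000ULL },  /* one slid 1 GB region */
    };
    static uint64_t phys_base = 0x800000000ULL;          /* stand-in for gPhysBase */
    static uint64_t virt_base = 0xfffffff007004000ULL;   /* stand-in for gVirtBase */

    static uint64_t sketch_phystokv(uint64_t pa)
    {
        for (size_t i = 0; i < PTOV_SIZE && table[i].len != 0; i++) {
            if (pa >= table[i].pa && pa < table[i].pa + table[i].len)
                return pa - table[i].pa + table[i].va;    /* hit: use the slid aperture */
        }
        return pa - phys_base + virt_base;                /* miss: legacy linear mapping */
    }

    int main(void)
    {
        printf("0x%llx\n", (unsigned long long)sketch_phystokv(0x800001000ULL));
        return 0;
    }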
 /*
  * This rounds the given address up to the nearest boundary for a PTE contiguous
  * hint.
@@ -637,33 +753,30 @@ static void arm_replace_identity_map(boot_args * args)
 
        /*
         * The V=P page tables (at the time this comment was written) start
-        * after the last bit of kernel data, and consist of 1 to 2 pages.
+        * after the last bit of kernel data, and consist of 1 L1 page and 1 or
+        * more L2 pages.
         * Grab references to those pages, and allocate an L3 page.
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
        l1_ptp_phys = args->topOfKernelData;
        l1_ptp_virt = (tt_entry_t *)phystokv(l1_ptp_phys);
-       tte1 = &l1_ptp_virt[(((paddr) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)];
+       tte1 = &l1_ptp_virt[L1_TABLE_INDEX(paddr)];
 
-       l2_ptp_phys = l1_ptp_phys + ARM_PGBYTES;
-#else
-       l2_ptp_phys = args->topOfKernelData;
-#endif
-       l2_ptp_virt = (tt_entry_t *)phystokv(l2_ptp_phys);
-       tte2 = &l2_ptp_virt[(((paddr) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
+       l2_ptp_virt = L2_TABLE_VA(tte1);
+       l2_ptp_phys = (*tte1) & ARM_TTE_TABLE_MASK;
+       tte2 = &l2_ptp_virt[L2_TABLE_INDEX(paddr)];
 
        l3_ptp_virt = (pt_entry_t *)alloc_ptpage(FALSE);
        l3_ptp_phys = kvtophys((vm_offset_t)l3_ptp_virt);
-       ptep = &l3_ptp_virt[(((paddr) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)];
+       ptep = &l3_ptp_virt[L3_TABLE_INDEX(paddr)];
 
        /*
         * Replace the large V=P mapping with a mapping that provides only the
         * mappings needed to turn on the MMU.
         */
-#if !__ARM64_TWO_LEVEL_PMAP__
+
        bzero(l1_ptp_virt, ARM_PGBYTES);
        *tte1 = ARM_TTE_BOOT_TABLE | (l2_ptp_phys & ARM_TTE_TABLE_MASK);
-#endif
+
        bzero(l2_ptp_virt, ARM_PGBYTES);
        *tte2 = ARM_TTE_BOOT_TABLE | (l3_ptp_phys & ARM_TTE_TABLE_MASK);
 
@@ -677,6 +790,23 @@ static void arm_replace_identity_map(boot_args * args)
 }
 #endif /* defined(KERNEL_INTEGRITY_KTRR)*/
 
+tt_entry_t *arm_kva_to_tte(vm_offset_t);
+
+tt_entry_t *
+arm_kva_to_tte(vm_offset_t va)
+{
+#if __ARM64_TWO_LEVEL_PMAP__
+       tt_entry_t *tte2;
+       tte2 = cpu_tte + L2_TABLE_INDEX(va);
+#else
+       tt_entry_t *tte1, *tte2;
+       tte1 = cpu_tte + L1_TABLE_INDEX(va);
+       tte2 = L2_TABLE_VA(tte1) + L2_TABLE_INDEX(va);
+#endif
+       return tte2;
+}
+
+
 /*
  * arm_vm_page_granular_helper updates protections at the L3 level.  It will (if
  * necessary) allocate a page for the L3 table and update the corresponding L2
@@ -684,18 +814,18 @@ static void arm_replace_identity_map(boot_args * args)
  * This expects to be invoked on a L2 entry or sub L2 entry granularity, so this should
  * not be invoked from a context that does not do L2 iteration separately (basically,
  * don't call this except from arm_vm_page_granular_prot).
+ *
+ * bool force_page_granule: true: will force page level mappings for this entry
+ *                        false: will try to use block level mappings
  */
+
 static void
-arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
-                            int pte_prot_APX, int pte_prot_XN, int forceCoarse,
+arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va, pmap_paddr_t pa_offset,
+                            int pte_prot_APX, int pte_prot_XN, bool force_page_granule,
                             pt_entry_t **deferred_pte, pt_entry_t *deferred_ptmp)
 {
        if (va & ARM_TT_L2_OFFMASK) { /* ragged edge hanging over a ARM_TT_L2_SIZE  boundary */
-#if __ARM64_TWO_LEVEL_PMAP__
                tt_entry_t *tte2;
-#else
-               tt_entry_t *tte1, *tte2;
-#endif
                tt_entry_t tmplate;
                pmap_paddr_t pa;
                pt_entry_t *ppte, *recursive_pte = NULL, ptmp, recursive_ptmp = 0;
@@ -703,33 +833,40 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
                unsigned i;
 
                va &= ~ARM_TT_L2_OFFMASK;
-               pa = va - gVirtBase + gPhysBase;
+               pa = va - gVirtBase + gPhysBase - pa_offset;
 
-#if __ARM64_TWO_LEVEL_PMAP__
-               tte2 = &cpu_tte[(((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#else
-               tte1 = &cpu_tte[(((va) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)];
-               tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[(((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#endif
+               if (pa >= real_avail_end)
+                       return;
 
+               tte2 = arm_kva_to_tte(va);
+
+               assert(_end >= va);
                tmplate = *tte2;
 
                if (ARM_TTE_TYPE_TABLE == (tmplate & ARM_TTE_TYPE_MASK)) {
                        /* pick up the existing page table. */
                        ppte = (pt_entry_t *)phystokv((tmplate & ARM_TTE_TABLE_MASK));
                } else {
-                       // TTE must be reincarnated COARSE.
-                       ppte = (pt_entry_t*)alloc_ptpage(TRUE);
+                       // TTE must be reincarnated with page level mappings.
+                       ppte = (pt_entry_t*)alloc_ptpage(pa_offset == 0);
+                       bzero(ppte, ARM_PGBYTES);
                        ppte_phys = kvtophys((vm_offset_t)ppte);
 
-                       pmap_init_pte_static_page(kernel_pmap, ppte, pa);
-
-                       *tte2 = pa_to_tte(ppte_phys) | ARM_TTE_TYPE_TABLE  | ARM_TTE_VALID;
+                       *tte2 = pa_to_tte(ppte_phys) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
                }
 
+               vm_offset_t len = _end - va;
+               if ((pa + len) > real_avail_end)
+                       _end -= (pa + len - real_avail_end);
+               assert((start - gVirtBase + gPhysBase - pa_offset) >= gPhysBase);
+
+               /* Round up to the nearest PAGE_SIZE boundary when creating mappings:
+                * PAGE_SIZE may be a multiple of ARM_PGBYTES, and we don't want to leave
+                * a ragged non-PAGE_SIZE-aligned edge. */
+               vm_offset_t rounded_end = round_page(_end);
                /* Apply the desired protections to the specified page range */
                for (i = 0; i <= (ARM_TT_L3_INDEX_MASK>>ARM_TT_L3_SHIFT); i++) {
-                       if ((start <= va) && (va < _end)) {
+                       if ((start <= va) && (va < rounded_end)) {
 
                                ptmp = pa | ARM_PTE_AF | ARM_PTE_SH(SH_OUTER_MEMORY) | ARM_PTE_TYPE;
                                ptmp = ptmp | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
@@ -745,19 +882,35 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
 
                                /*
                                 * If we can, apply the contiguous hint to this range.  The hint is
-                                * applicable if we are not trying to create per-page mappings and
-                                * if the current address falls within a hint-sized range that will
+                                * applicable if the current address falls within a hint-sized range that will
                                 * be fully covered by this mapping request.
                                 */
-                               if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) < _end) &&
-                                   !forceCoarse && use_contiguous_hint) {
+                               if ((va >= round_up_pte_hint_address(start)) && (round_up_pte_hint_address(va + 1) <= _end) &&
+                                   !force_page_granule && use_contiguous_hint) {
+                                       assert((va & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1)) == ((pa & ((1 << ARM_PTE_HINT_ADDR_SHIFT) - 1))));
                                        ptmp |= ARM_PTE_HINT;
                                }
-
-                               if ((pt_entry_t*)(phystokv(pa)) == ppte) {
+                               /*
+                                * Do not change the contiguous bit on an active mapping.  Even in a single-threaded
+                                * environment, it's possible for prefetch to produce a TLB conflict by trying to pull in
+                                * a hint-sized entry on top of one or more existing page-sized entries.  It's also useful
+                                * to make sure we're not trying to unhint a sub-range of a larger hinted range, which
+                                * could produce a later TLB conflict.
+                                */
+                               assert(!kva_active || (ppte[i] == ARM_PTE_TYPE_FAULT) || ((ppte[i] & ARM_PTE_HINT) == (ptmp & ARM_PTE_HINT)));
+
+                               /* 
+                                * If we reach an entry that maps the current pte page, delay updating it until the very end.
+                                * Otherwise we might end up making the PTE page read-only, leading to a fault later on in
+                                * this function if we manage to outrun the TLB.  This can happen on KTRR-enabled devices when
+                                * marking segDATACONST read-only.  Mappings for this region may straddle a PT page boundary,
+                                * so we must also defer assignment of the following PTE.  We will assume that if the region
+                                * were to require one or more full L3 pages, it would instead use L2 blocks where possible,
+                                * therefore only requiring at most one L3 page at the beginning and one at the end. 
+                                */
+                               if (kva_active && ((pt_entry_t*)(phystokv(pa)) == ppte)) {
                                        assert(recursive_pte == NULL);  
-                                       /* This assert should be reenabled as part of rdar://problem/30149465 */
-                                       assert(!forceCoarse);
+                                       assert(!force_page_granule);
                                        recursive_pte = &ppte[i];
                                        recursive_ptmp = ptmp;
                                } else if ((deferred_pte != NULL) && (&ppte[i] == &recursive_pte[1])) {
@@ -783,15 +936,11 @@ arm_vm_page_granular_helper(vm_offset_t start, vm_offset_t _end, vm_offset_t va,
  * changing them.  If a particular chunk necessitates L3 entries (for reasons of
  * alignment or length, or an explicit request that the entry be fully expanded), we
  * hand off to arm_vm_page_granular_helper to deal with the L3 chunk of the logic.
- *
- * Note that counterintuitively a forceCoarse request is a request to expand the entries
- * out to L3, i.e. to make *finer* grained mappings. That comes from historical arm32
- * nomenclature in which the 4K granule is "coarse" vs. the 1K "fine" granule (which we
- * don't use). 
  */
 static void
-arm_vm_page_granular_prot(vm_offset_t start, unsigned long size,
-                          int tte_prot_XN, int pte_prot_APX, int pte_prot_XN, int forceCoarse)
+arm_vm_page_granular_prot(vm_offset_t start, unsigned long size, pmap_paddr_t pa_offset,
+                          int tte_prot_XN, int pte_prot_APX, int pte_prot_XN,
+                          bool force_page_granule)
 {
        pt_entry_t *deferred_pte = NULL, deferred_ptmp = 0;
        vm_offset_t _end = start + size;
@@ -801,94 +950,110 @@ arm_vm_page_granular_prot(vm_offset_t start, unsigned long size,
                return;
 
        if (align_start > _end) {
-               arm_vm_page_granular_helper(start, _end, start, pte_prot_APX, pte_prot_XN, forceCoarse, NULL, NULL);
+               arm_vm_page_granular_helper(start, _end, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL);
                return;
        }
 
-       arm_vm_page_granular_helper(start, align_start, start, pte_prot_APX, pte_prot_XN, forceCoarse, &deferred_pte, &deferred_ptmp);
+       arm_vm_page_granular_helper(start, align_start, start, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp);
 
-       while ((_end - align_start)  >= ARM_TT_L2_SIZE) {
-               if (forceCoarse)
-                       arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1,
-                                                   pte_prot_APX, pte_prot_XN, forceCoarse, NULL, NULL);
+       while ((_end - align_start) >= ARM_TT_L2_SIZE) {
+               if (force_page_granule)
+                       arm_vm_page_granular_helper(align_start, align_start+ARM_TT_L2_SIZE, align_start + 1, pa_offset,
+                                                   pte_prot_APX, pte_prot_XN, force_page_granule, NULL, NULL);
                else {
-#if __ARM64_TWO_LEVEL_PMAP__
+                       pmap_paddr_t pa = align_start - gVirtBase + gPhysBase - pa_offset;
+                       assert((pa & ARM_TT_L2_OFFMASK) == 0); 
                        tt_entry_t *tte2;
-#else
-                       tt_entry_t *tte1, *tte2;
-#endif
                        tt_entry_t tmplate;
 
-#if __ARM64_TWO_LEVEL_PMAP__
-                       tte2 = &cpu_tte[((align_start & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#else
-                       tte1 = &cpu_tte[((align_start & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)];
-                       tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[((align_start & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#endif
+                       tte2 = arm_kva_to_tte(align_start);
 
-                       tmplate = *tte2;
-
-                       tmplate = (tmplate & ~ARM_TTE_BLOCK_APMASK) | ARM_TTE_BLOCK_AP(pte_prot_APX);
-                       tmplate = tmplate | ARM_TTE_BLOCK_NX;
+                       if ((pa >= gPhysBase) && (pa < real_avail_end)) {
+                               tmplate = (pa & ARM_TTE_BLOCK_L2_MASK) | ARM_TTE_TYPE_BLOCK
+                                       | ARM_TTE_VALID | ARM_TTE_BLOCK_AF | ARM_TTE_BLOCK_NX
+                                       | ARM_TTE_BLOCK_AP(pte_prot_APX) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)
+                                       | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
+                               
 #if __ARM_KERNEL_PROTECT__
-                       tmplate = tmplate | ARM_TTE_BLOCK_NG;
+                               tmplate = tmplate | ARM_TTE_BLOCK_NG;
 #endif /* __ARM_KERNEL_PROTECT__ */
-                       if (tte_prot_XN)
-                               tmplate = tmplate | ARM_TTE_BLOCK_PNX;
+                               if (tte_prot_XN)
+                                       tmplate = tmplate | ARM_TTE_BLOCK_PNX;
 
-                       *tte2 = tmplate;
+                               *tte2 = tmplate;
+                       }
                }
                align_start += ARM_TT_L2_SIZE;
        }
 
        if (align_start < _end)
-               arm_vm_page_granular_helper(align_start, _end, _end, pte_prot_APX, pte_prot_XN, forceCoarse, &deferred_pte, &deferred_ptmp);
+               arm_vm_page_granular_helper(align_start, _end, _end, pa_offset, pte_prot_APX, pte_prot_XN, force_page_granule, &deferred_pte, &deferred_ptmp);
 
        if (deferred_pte != NULL)
                *deferred_pte = deferred_ptmp;
 }
 
 static inline void
-arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RNX(vm_offset_t start, unsigned long size, bool force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 1, AP_RONA, 1, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, 1, AP_RONA, 1, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_ROX(vm_offset_t start, unsigned long size, bool force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, AP_RONA, 0, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, 0, AP_RONA, 0, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RWNX(vm_offset_t start, unsigned long size, bool force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 1, AP_RWNA, 1, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, 1, AP_RWNA, 1, force_page_granule);
 }
 
 static inline void
-arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, int forceCoarse)
+arm_vm_page_granular_RWX(vm_offset_t start, unsigned long size, bool force_page_granule)
 {
-       arm_vm_page_granular_prot(start, size, 0, AP_RWNA, 0, forceCoarse);
+       arm_vm_page_granular_prot(start, size, 0, 0, AP_RWNA, 0, force_page_granule);
 }
 
+/* used in the chosen/memory-map node, populated by iBoot. */
+typedef struct MemoryMapFileInfo {
+       vm_offset_t paddr;
+       size_t length;
+} MemoryMapFileInfo;
+
+
 void
 arm_vm_prot_init(boot_args * args)
 {
-       /*
-        * Enforce W^X protections on sections that have been identified so far. This will be
-        * further refined for each KEXT's TEXT and DATA segments in readPrelinkedExtensions()
-        */
-       bool use_small_page_mappings = FALSE;
 
-       /*
-        * First off, we'll create mappings for any physical memory preceeding the kernel TEXT.
-        * This is memory that we want to give to the VM; this will be accomplished through an
-        * ml_static_mfree call in arm_vm_prot_finalize.  This allows the pmap/vm bootstrap
-        * routines to assume they will have a physically contiguous chunk of memory to deal
-        * with during bootstrap, while reclaiming this memory later.
-        */
-       arm_vm_page_granular_RWNX(gVirtBase, segPRELINKTEXTB - gVirtBase, use_small_page_mappings); // Memory for the VM
+       segLOWESTTEXT = UINT64_MAX;
+       if (segSizePRELINKTEXT  && (segPRELINKTEXTB < segLOWESTTEXT)) segLOWESTTEXT = segPRELINKTEXTB;
+       assert(segSizeTEXT);
+       if (segTEXTB < segLOWESTTEXT) segLOWESTTEXT = segTEXTB;
+       assert(segLOWESTTEXT < UINT64_MAX);
+
+       segEXTRADATA = segLOWESTTEXT;
+       segSizeEXTRADATA = 0;
+
+       DTEntry memory_map;
+       MemoryMapFileInfo *trustCacheRange;
+       unsigned int trustCacheRangeSize;
+       int err;
+
+       err = DTLookupEntry(NULL, "chosen/memory-map", &memory_map);
+       assert(err == kSuccess);
+
+       err = DTGetProperty(memory_map, "TrustCache", (void**)&trustCacheRange, &trustCacheRangeSize);
+       if (err == kSuccess) {
+               assert(trustCacheRangeSize == sizeof(MemoryMapFileInfo));
+
+               segEXTRADATA = phystokv(trustCacheRange->paddr);
+               segSizeEXTRADATA = trustCacheRange->length;
+
+               arm_vm_page_granular_RNX(segEXTRADATA, segSizeEXTRADATA, FALSE);
+       }
 
        /* Map coalesced kext TEXT segment RWNX for now */
        arm_vm_page_granular_RWNX(segPRELINKTEXTB, segSizePRELINKTEXT, FALSE); // Refined in OSKext::readPrelinkedExtensions
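The TrustCache lookup above is the standard pattern for consuming ranges that iBoot publishes under chosen/memory-map: find the node with DTLookupEntry(), fetch the property with DTGetProperty(), interpret it as a MemoryMapFileInfo, and translate the physical address with phystokv(). A hedged sketch that generalizes the same pattern to any named range; the helper name and the "RAMDisk" key are illustrative, and the snippet assumes the same headers and types as the surrounding file:

    /* Hypothetical helper: fetch a named MemoryMapFileInfo range from
     * chosen/memory-map, following the TrustCache lookup above. */
    static boolean_t
    lookup_memory_map_range(const char *prop, vm_offset_t *vaddr, size_t *len)
    {
        DTEntry memory_map;
        MemoryMapFileInfo *range;
        unsigned int range_size;

        if (DTLookupEntry(NULL, "chosen/memory-map", &memory_map) != kSuccess)
            return FALSE;
        if (DTGetProperty(memory_map, prop, (void **)&range, &range_size) != kSuccess)
            return FALSE;

        assert(range_size == sizeof(MemoryMapFileInfo));
        *vaddr = phystokv(range->paddr);   /* translate iBoot's physical address */
        *len   = range->length;
        return TRUE;
    }

    /* e.g. lookup_memory_map_range("RAMDisk", &va, &len) before deciding how to map it */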
@@ -900,10 +1065,11 @@ arm_vm_prot_init(boot_args * args)
        arm_vm_page_granular_ROX(segPLKTEXTEXECB, segSizePLKTEXTEXEC, FALSE); // Refined in OSKext::readPrelinkedExtensions
 
        /* if new segments not present, set space between PRELINK_TEXT and xnu TEXT to RWNX
-        * otherwise we no longer expecting any space between the coalesced kext read only segments and xnu rosegments
+        * otherwise we no longer expect any space between the coalesced kext read only segments and xnu rosegments
         */
        if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) {
-               arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE);
+               if (segSizePRELINKTEXT)
+                       arm_vm_page_granular_RWNX(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT), FALSE);
        } else {
                /*
                 * If we have the new segments, we should still protect the gap between kext
@@ -937,8 +1103,14 @@ arm_vm_prot_init(boot_args * args)
        /* DATA segment will remain RWNX */
        arm_vm_page_granular_RWNX(segDATAB, segSizeDATA, FALSE);
 
+       arm_vm_page_granular_RWNX(segBOOTDATAB, segSizeBOOTDATA, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_low_guard, PAGE_MAX_SIZE, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&intstack_high_guard, PAGE_MAX_SIZE, TRUE);
+       arm_vm_page_granular_RNX((vm_offset_t)&excepstack_high_guard, PAGE_MAX_SIZE, TRUE);
+
        arm_vm_page_granular_ROX(segKLDB, segSizeKLD, FALSE);
        arm_vm_page_granular_RWNX(segLINKB, segSizeLINK, FALSE);
+       arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, FALSE); // Coalesced kext LINKEDIT segment
        arm_vm_page_granular_ROX(segLASTB, segSizeLAST, FALSE); // __LAST may be empty, but we cannot assume this
 
        arm_vm_page_granular_RWNX(segPRELINKDATAB, segSizePRELINKDATA, FALSE); // Prelink __DATA for kexts (RW data)
@@ -946,33 +1118,94 @@ arm_vm_prot_init(boot_args * args)
        if (segSizePLKLLVMCOV > 0)
                arm_vm_page_granular_RWNX(segPLKLLVMCOVB, segSizePLKLLVMCOV, FALSE); // LLVM code coverage data
 
-       arm_vm_page_granular_RWNX(segPLKLINKEDITB, segSizePLKLINKEDIT, use_small_page_mappings); // Coalesced kext LINKEDIT segment
-
        arm_vm_page_granular_RWNX(segPRELINKINFOB, segSizePRELINKINFO, FALSE); /* PreLinkInfoDictionary */
-       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, use_small_page_mappings); /* Device Tree, RAM Disk (if present), bootArgs */
 
-       /*
-        * This is offset by 4 pages to make room for the boot page tables; we could probably
-        * include them in the overall mapping, but we'll be paranoid for now.
-        */
-       vm_offset_t extra = 0;
-#if KASAN
-       /* add the KASAN stolen memory to the physmap */
-       extra = shadow_ptop - shadow_pbase;
+       arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), BOOTSTRAP_TABLE_SIZE, FALSE); // Boot page tables; they should not be mutable.
+}
 
-       /* record the extent of the physmap */
-       physmap_vbase = phystokv(args->topOfKernelData) + ARM_PGBYTES * 4;
-       physmap_vtop = static_memory_end;
-#endif
-       arm_vm_page_granular_RNX(phystokv(args->topOfKernelData), ARM_PGBYTES * 4, FALSE); // Boot page tables; they should not be mutable.
-       arm_vm_page_granular_RWNX(phystokv(args->topOfKernelData) + ARM_PGBYTES * 4,
-                                 extra + static_memory_end - ((phystokv(args->topOfKernelData) + ARM_PGBYTES * 4)), use_small_page_mappings); // rest of physmem
+/*
+ * return < 0 for a < b
+ *          0 for a == b
+ *        > 0 for a > b
+ */
+typedef int (*cmpfunc_t)(const void *a, const void *b);
+
+extern void
+qsort(void *a, size_t n, size_t es, cmpfunc_t cmp);
+
+static int
+cmp_ptov_entries(const void *a, const void *b)
+{
+       const ptov_table_entry *entry_a = a;
+       const ptov_table_entry *entry_b = b;
+       // Sort in descending order of segment length
+       if (entry_a->len < entry_b->len)
+               return 1;
+       else if (entry_a->len > entry_b->len)
+               return -1;
+       else
+               return 0;
+}
+
+SECURITY_READ_ONLY_LATE(static unsigned int) ptov_index = 0;
+
+#define ROUND_TWIG(addr) (((addr) + ARM_TT_TWIG_OFFMASK) & ~(ARM_TT_TWIG_OFFMASK))
+
+static void
+arm_vm_physmap_slide(ptov_table_entry *temp_ptov_table, vm_map_address_t physmap_base, vm_map_address_t orig_va, vm_size_t len, int pte_prot_APX, boolean_t force_page_granule)
+{
+       pmap_paddr_t pa_offset;
+
+       assert(ptov_index < PTOV_TABLE_SIZE);
+       assert((orig_va & ARM_PGMASK) == 0);
+       temp_ptov_table[ptov_index].pa = orig_va - gVirtBase + gPhysBase;
+       if (ptov_index == 0)
+               temp_ptov_table[ptov_index].va = physmap_base;
+       else
+               temp_ptov_table[ptov_index].va = temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len;
+       if (!force_page_granule) {
+               vm_map_address_t orig_offset = temp_ptov_table[ptov_index].pa & ARM_TT_TWIG_OFFMASK;
+               vm_map_address_t new_offset = temp_ptov_table[ptov_index].va & ARM_TT_TWIG_OFFMASK;
+               if (new_offset < orig_offset)
+                       temp_ptov_table[ptov_index].va += (orig_offset - new_offset);
+               else if (new_offset > orig_offset)
+                       temp_ptov_table[ptov_index].va = ROUND_TWIG(temp_ptov_table[ptov_index].va) + orig_offset;
+       }
+       assert((temp_ptov_table[ptov_index].va & ARM_PGMASK) == 0);
+       temp_ptov_table[ptov_index].len = round_page(len);
+       pa_offset = temp_ptov_table[ptov_index].va - orig_va; 
+       arm_vm_page_granular_prot(temp_ptov_table[ptov_index].va, temp_ptov_table[ptov_index].len, pa_offset, 1, pte_prot_APX, 1, force_page_granule);
+       ++ptov_index;
 }
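arm_vm_physmap_slide() above picks a slid VA for each physical range but keeps that VA congruent with the source PA modulo the L2 "twig" block size, so the range can still be covered by block mappings rather than forced down to page granularity. A small worked sketch of the adjustment, assuming a 2 MB block size (the real ARM_TT_TWIG_SIZE depends on the configured page size) and illustrative addresses:

    /* Worked sketch of the twig-offset alignment; the 2 MB block size and the
     * addresses are illustrative only. */
    #include <stdint.h>
    #include <stdio.h>

    #define TWIG_SIZE     0x200000ULL                 /* assumed 2 MB L2 block */
    #define TWIG_OFFMASK  (TWIG_SIZE - 1)
    #define ROUND_TWIG(a) (((a) + TWIG_OFFMASK) & ~TWIG_OFFMASK)

    int main(void)
    {
        uint64_t pa = 0x800123000ULL;                 /* offset 0x123000 within its block */
        uint64_t va = 0xfffffff070050000ULL;          /* candidate slid VA, offset 0x50000 */

        uint64_t orig_offset = pa & TWIG_OFFMASK;
        uint64_t new_offset  = va & TWIG_OFFMASK;

        if (new_offset < orig_offset)
            va += orig_offset - new_offset;           /* bump forward within this block */
        else if (new_offset > orig_offset)
            va = ROUND_TWIG(va) + orig_offset;        /* start at the next block boundary */

        printf("aligned va = 0x%llx\n", (unsigned long long)va);   /* ...070123000 */
        return 0;
    }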
 
+
+static void
+arm_vm_physmap_init(boot_args *args, vm_map_address_t physmap_base, vm_map_address_t dynamic_memory_begin __unused)
+{
+       ptov_table_entry temp_ptov_table[PTOV_TABLE_SIZE];
+       bzero(temp_ptov_table, sizeof(temp_ptov_table));
+
+       // Will be handed back to VM layer through ml_static_mfree() in arm_vm_prot_finalize()
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, gVirtBase, segEXTRADATA - gVirtBase, AP_RWNA, FALSE);
+
+       arm_vm_page_granular_RWNX(end_kern, phystokv(args->topOfKernelData) - end_kern, FALSE); /* Device Tree, RAM Disk (if present), bootArgs */
+
+       arm_vm_physmap_slide(temp_ptov_table, physmap_base, (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE - gPhysBase + gVirtBase),
+                            real_avail_end - (args->topOfKernelData + BOOTSTRAP_TABLE_SIZE), AP_RWNA, FALSE); // rest of physmem
+
+       assert((temp_ptov_table[ptov_index - 1].va + temp_ptov_table[ptov_index - 1].len) <= dynamic_memory_begin);
+
+       // Sort in descending order of segment length.  LUT traversal is linear, so largest (most likely used)
+       // segments should be placed earliest in the table to optimize lookup performance.
+       qsort(temp_ptov_table, PTOV_TABLE_SIZE, sizeof(temp_ptov_table[0]), cmp_ptov_entries); 
+
+       memcpy(ptov_table, temp_ptov_table, sizeof(ptov_table));
+}
+
+
 void
-arm_vm_prot_finalize(boot_args * args)
+arm_vm_prot_finalize(boot_args * args __unused)
 {
-#pragma unused(args)
        /*
         * At this point, we are far enough along in the boot process that it will be
         * safe to free up all of the memory preceding the kernel.  It may in fact
@@ -989,9 +1222,13 @@ arm_vm_prot_finalize(boot_args * args)
         * should be immediately followed by XNU's TEXT segment
         */
 
-       ml_static_mfree(gVirtBase, segPRELINKTEXTB - gVirtBase);
+       ml_static_mfree(phystokv(gPhysBase), segEXTRADATA - gVirtBase);
 
-       if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC) {
+       /*
+        * KTRR support means we will be mucking with these pages and trying to
+        * protect them; we cannot free the pages to the VM if we do this.
+        */
+       if (!segSizePLKDATACONST && !segSizePLKTEXTEXEC && segSizePRELINKTEXT) {
                /* If new segments not present, PRELINK_TEXT is not dynamically sized, free DRAM between it and xnu TEXT */
                ml_static_mfree(segPRELINKTEXTB + segSizePRELINKTEXT, segTEXTB - (segPRELINKTEXTB + segSizePRELINKTEXT));
        }
@@ -1009,10 +1246,15 @@ arm_vm_prot_finalize(boot_args * args)
                arm_vm_page_granular_RNX(segPLKDATACONSTB, segSizePLKDATACONST, FALSE);
        }
 
+       cpu_stack_alloc(&BootCpuData);
+       arm64_replace_bootstack(&BootCpuData);
+       ml_static_mfree(phystokv(segBOOTDATAB - gVirtBase + gPhysBase), segSizeBOOTDATA);
+
 #if __ARM_KERNEL_PROTECT__
        arm_vm_populate_kernel_el0_mappings();
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+
 #if defined(KERNEL_INTEGRITY_KTRR)
        /*
         * __LAST,__pinst should no longer be executable.
@@ -1031,6 +1273,7 @@ arm_vm_prot_finalize(boot_args * args)
 #ifndef __ARM_L1_PTW__
        FlushPoC_Dcache();
 #endif
+       __builtin_arm_dsb(DSB_ISH);
        flush_mmu_tlb();
 }
 
@@ -1068,12 +1311,14 @@ set_tbi(void)
 #endif /* !__ARM_KERNEL_PROTECT__ */
 }
 
+#define ARM64_PHYSMAP_SLIDE_RANGE (1ULL << 30) // 1 GB
+#define ARM64_PHYSMAP_SLIDE_MASK  (ARM64_PHYSMAP_SLIDE_RANGE - 1)
+
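These two constants bound the physical-aperture randomization performed in arm_vm_init() below: an early_random() draw is masked down to less than 1 GB and then page-aligned before being added to physmap_base. A small sketch of that arithmetic, with a fixed stand-in for the entropy and an assumed 16 KB page size:

    /* Sketch of the physmap slide computation below; the entropy value and the
     * 16 KB PAGE_MASK are stand-ins, not what the kernel actually uses. */
    #include <stdint.h>
    #include <stdio.h>

    #define SLIDE_RANGE (1ULL << 30)              /* 1 GB */
    #define SLIDE_MASK  (SLIDE_RANGE - 1)
    #define PAGE_MASK_  0x3fffULL                 /* assumed 16 KB pages */

    int main(void)
    {
        uint64_t entropy = 0x3f2a1c7b9e1ULL;      /* stand-in for early_random() */
        uint64_t slide   = entropy & SLIDE_MASK & ~PAGE_MASK_;

        printf("slide = 0x%llx, below 1 GB: %d, page aligned: %d\n",
               (unsigned long long)slide,
               slide < SLIDE_RANGE, (slide & PAGE_MASK_) == 0);
        return 0;
    }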
 void
 arm_vm_init(uint64_t memory_size, boot_args * args)
 {
 #if !__ARM64_TWO_LEVEL_PMAP__
        vm_map_address_t va_l1, va_l1_end;
-       pmap_paddr_t     pa_l1;
        tt_entry_t       *cpu_l1_tte;
 #else
        /*
@@ -1086,12 +1331,13 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         */
 #endif
        vm_map_address_t va_l2, va_l2_end;
-       pmap_paddr_t     pa_l2;
        tt_entry_t       *cpu_l2_tte;
        pmap_paddr_t     boot_ttep;
        tt_entry_t       *boot_tte;
        uint64_t         mem_segments;
        vm_offset_t      ptpage_vaddr;
+       vm_map_address_t dynamic_memory_begin;
+       vm_map_address_t physmap_base;
 
 
        /*
@@ -1103,19 +1349,47 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        mem_size = args->memSize;
        if ((memory_size != 0) && (mem_size > memory_size))
                mem_size = memory_size;
-       if (mem_size > MEM_SIZE_MAX )
-               mem_size = MEM_SIZE_MAX;
-       static_memory_end = gVirtBase + mem_size;
+       if (mem_size >= ((VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS) / 4))
+               panic("Unsupported memory configuration %lx\n", mem_size);
+
+       physmap_base = phystokv(args->topOfKernelData) + BOOTSTRAP_TABLE_SIZE;
+
+       // Slide the physical aperture to a random page-aligned location within the slide range
+       uint64_t physmap_slide = early_random() & ARM64_PHYSMAP_SLIDE_MASK & ~((uint64_t)PAGE_MASK);
+       assert(physmap_slide < ARM64_PHYSMAP_SLIDE_RANGE);
+
+       physmap_base += physmap_slide;
+
+       static_memory_end = physmap_base + mem_size + (PTOV_TABLE_SIZE * ARM_TT_TWIG_SIZE); // worst possible case for block alignment
+#if KASAN
+       /* add the KASAN stolen memory to the physmap */
+       dynamic_memory_begin = static_memory_end + (shadow_ptop - shadow_pbase);
+#else
+       dynamic_memory_begin = static_memory_end;
+#endif
+       if (dynamic_memory_begin > VM_MAX_KERNEL_ADDRESS)
+               panic("Unsupported memory configuration %lx\n", mem_size);
 
        boot_ttep = args->topOfKernelData;
        boot_tte = (tt_entry_t *) phystokv(boot_ttep);
 
-       /* 
-        * Four pages: 
+#if DEVELOPMENT || DEBUG
+       /* Sanity check - assert that BOOTSTRAP_TABLE_SIZE is sufficiently large to
+        * hold our bootstrap mappings for any possible slide */
+       size_t bytes_mapped = dynamic_memory_begin - gVirtBase;
+       size_t l1_entries = 1 + ((bytes_mapped + ARM_TT_L1_SIZE - 1) / ARM_TT_L1_SIZE);
+       /* 1 L1 each for V=P and KVA, plus 1 page for each L2 */
+       size_t pages_used = 2 * (l1_entries + 1);
+       if (pages_used > BOOTSTRAP_TABLE_SIZE) {
+               panic("BOOTSTRAP_TABLE_SIZE too small for memory config\n");
+       }
+#endif
+
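For a concrete feel of the check above — assuming each L1 entry spans 64 GB (the 16 KB-granule figure) and roughly 5 GB of VA between gVirtBase and dynamic_memory_begin, both illustrative — the arithmetic yields two L1 entries and six bootstrap pages:

    /* Worked sketch of the bootstrap page-table sizing check; the 64 GB L1 span
     * and the 5 GB mapped span are illustrative assumptions. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t l1_size      = 64ULL << 30;      /* assumed ARM_TT_L1_SIZE */
        uint64_t bytes_mapped = 5ULL << 30;       /* gVirtBase .. dynamic_memory_begin */

        uint64_t l1_entries = 1 + ((bytes_mapped + l1_size - 1) / l1_size);
        /* 1 L1 table each for the V=P and KVA mappings, plus 1 page per L2 table */
        uint64_t pages_used = 2 * (l1_entries + 1);

        printf("l1_entries=%llu pages_used=%llu\n",
               (unsigned long long)l1_entries, (unsigned long long)pages_used);
        return 0;  /* the kernel panics instead if this exceeds BOOTSTRAP_TABLE_SIZE */
    }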
+       /*
         *  TTBR0 L1, TTBR0 L2 - 1:1 bootstrap mapping.
         *  TTBR1 L1, TTBR1 L2 - kernel mapping
         */
-       avail_start = boot_ttep + 4*ARM_PGBYTES; 
+       avail_start = boot_ttep + BOOTSTRAP_TABLE_SIZE;
 
 #if defined(KERNEL_INTEGRITY_KTRR)
        arm_replace_identity_map(args);
@@ -1142,6 +1416,12 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        bzero(cpu_tte, ARM_PGBYTES);
        avail_end = gPhysBase + mem_size;
 
+#if KASAN
+       real_avail_end = avail_end + (shadow_ptop - shadow_pbase);
+#else
+       real_avail_end = avail_end;
+#endif
+
        /*
         * Initialize l1 and l2 page table pages :
         *   map physical memory at the kernel base virtual address
@@ -1150,62 +1430,25 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         *   the so called physical aperture should be statically mapped
         */
 #if !__ARM64_TWO_LEVEL_PMAP__
-       pa_l1 = gPhysBase;
        va_l1 = gVirtBase;
-       va_l1_end = gVirtBase + mem_size;
-#if KASAN
-       /* add the KASAN stolen memory to the physmap */
-       va_l1_end = gVirtBase + (shadow_ptop - gPhysBase);
-#endif
+       va_l1_end = dynamic_memory_begin; 
        cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
 
        while (va_l1 < va_l1_end) {
-               tt_entry_t *new_tte = (tt_entry_t *)alloc_ptpage(TRUE);
-               /* Allocate a page and setup L1 Table TTE in L1 */
-               *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK)  | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
-               bzero((void *)new_tte, ARM_PGBYTES);
-
-               va_l2 = va_l1;
+               if (*cpu_l1_tte == ARM_TTE_EMPTY) {
+                       /* Allocate a page and setup L1 Table TTE in L1 */
+                       ptpage_vaddr = alloc_ptpage(TRUE);
+                       *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
+                       bzero((void *)ptpage_vaddr, ARM_PGBYTES);
+               }
 
-               if (((va_l1 & ~ARM_TT_L1_OFFMASK)+ARM_TT_L1_SIZE) < va_l1) {
+               if ((va_l1 + ARM_TT_L1_SIZE) < va_l1) {
                        /* If this is the last L1 entry, it must cover the last mapping. */
-                       va_l2_end = va_l1_end;
-               } else {
-                       va_l2_end = MIN((va_l1 & ~ARM_TT_L1_OFFMASK)+ARM_TT_L1_SIZE, va_l1_end);
+                       break;
                }
 
-               pa_l2 = pa_l1;
-               cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l1 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-#else
-               va_l2 = gVirtBase;
-               va_l2_end = gVirtBase + mem_size;
-               pa_l2 = gPhysBase;
-               cpu_l2_tte = cpu_tte + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
-
-#if KASAN
-               /* add the KASAN stolen memory to the physmap */
-               va_l2_end = gVirtBase + (shadow_ptop - gPhysBase);
-#endif
-
-#endif
-
-               while (va_l2 < va_l2_end) {
-                       /* Set up L2 Block TTE in L2 */
-                       *cpu_l2_tte = (pa_l2 & ARM_TTE_BLOCK_L2_MASK) | ARM_TTE_TYPE_BLOCK
-                                     | ARM_TTE_VALID | ARM_TTE_BLOCK_AF
-                                     | ARM_TTE_BLOCK_AP(AP_RWNA) | ARM_TTE_BLOCK_SH(SH_OUTER_MEMORY)
-                                     | ARM_TTE_BLOCK_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
-#if __ARM_KERNEL_PROTECT__
-                       *cpu_l2_tte |= ARM_TTE_BLOCK_NG;
-#endif /* __ARM_KERNEL_PROTECT__ */
-                       va_l2 += ARM_TT_L2_SIZE;
-                       pa_l2 += ARM_TT_L2_SIZE;
-                       cpu_l2_tte++;
-               }
-#if !__ARM64_TWO_LEVEL_PMAP__
+               va_l1 += ARM_TT_L1_SIZE;
                cpu_l1_tte++;
-               va_l1 = va_l2;
-               pa_l1 = pa_l2;
        }
 #endif
 
@@ -1224,6 +1467,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        segDATACONSTB    = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA_CONST", &segSizeDATACONST);
        segTEXTEXECB     = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__TEXT_EXEC", &segSizeTEXTEXEC);
        segDATAB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__DATA", &segSizeDATA);
+
+       segBOOTDATAB    = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__BOOTDATA", &segSizeBOOTDATA);
        segLINKB         = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__LINKEDIT", &segSizeLINK);
        segKLDB          = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__KLD", &segSizeKLD);
        segPRELINKDATAB  = (vm_offset_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_DATA", &segSizePRELINKDATA);
@@ -1292,7 +1537,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         *    KERNEL_DYNAMIC_ADDR - VM_MAX_KERNEL_ADDRESS
         */
 #if !__ARM64_TWO_LEVEL_PMAP__
-       va_l1 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL;
+       va_l1 = dynamic_memory_begin;
        va_l1_end = VM_MAX_KERNEL_ADDRESS;
        cpu_l1_tte = cpu_tte + ((va_l1 & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT);
 
@@ -1300,7 +1545,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                if (*cpu_l1_tte == ARM_TTE_EMPTY) {
                        /* Allocate a page and setup L1 Table TTE in L1 */
                        ptpage_vaddr = alloc_ptpage(TRUE);
-                       *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
+                       *cpu_l1_tte = (kvtophys(ptpage_vaddr) & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
                        bzero((void *)ptpage_vaddr, ARM_PGBYTES);
                }
 
@@ -1315,31 +1560,37 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 #endif
 
 #if KASAN
+       /* record the extent of the physmap */
+       physmap_vbase = physmap_base;
+       physmap_vtop = static_memory_end;
        kasan_init();
 #endif
 
+       set_tbi();
        set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
+
+       arm_vm_physmap_init(args, physmap_base, dynamic_memory_begin);
        set_mmu_ttb_alternate(cpu_ttep & TTBR_BADDR_MASK);
-       set_tbi();
        flush_mmu_tlb();
+       kva_active = TRUE;
+       // global table pointers may need to be different due to physical aperture remapping
+       cpu_tte = (tt_entry_t*)(phystokv(cpu_ttep));
+       invalid_tte = (tt_entry_t*)(phystokv(invalid_ttep));
 
-       /*
-        * TODO: We're hardcoding the expected virtual TEXT base here;
-        * that gives us an ugly dependency on a linker argument in
-        * the make files.  Clean this up, so we don't hardcode it
-        * twice; this is nothing but trouble.
-        */
        sane_size = mem_size - (avail_start - gPhysBase);
        max_mem = mem_size;
-       vm_kernel_slid_base = segPRELINKTEXTB;
+       vm_kernel_slid_base = segLOWESTTEXT;
        vm_kernel_slid_top = vm_prelink_einfo;
-       vm_kernel_slide = segTEXTB-0xfffffff007004000;
+       vm_kernel_slide = segTEXTB-VM_KERNEL_LINK_ADDRESS;
        vm_kernel_stext = segTEXTB;
        assert(segDATACONSTB == segTEXTB + segSizeTEXT);
-        assert(segTEXTEXECB == segDATACONSTB + segSizeDATACONST);
+       assert(segTEXTEXECB == segDATACONSTB + segSizeDATACONST);
        vm_kernel_etext = segTEXTB + segSizeTEXT + segSizeDATACONST + segSizeTEXTEXEC;
 
-       pmap_bootstrap((gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL);
+       dynamic_memory_begin = ROUND_TWIG(dynamic_memory_begin);
+       pmap_bootstrap(dynamic_memory_begin);
+
+       disable_preemption();
 
        /*
         * Initialize l3 page table pages :
@@ -1350,7 +1601,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
        mem_segments = (mem_size + 0x0FFFFFFF) >> 28;
 
 #if !__ARM64_TWO_LEVEL_PMAP__
-       va_l1 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL;
+       va_l1 = dynamic_memory_begin;
        va_l1_end = va_l1 + ((2 + (mem_segments * 10)) << 20);
        va_l1_end += round_page(args->Video.v_height * args->Video.v_rowBytes);
        va_l1_end = (va_l1_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL;
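The sizing above reserves about 2 MB of VA plus 10 MB per 256 MB memory segment for early L3 page-table coverage, adds the framebuffer, and rounds the total up to an 8 MB boundary. A quick worked sketch for a hypothetical 2 GB device with an illustrative framebuffer:

    /* Worked sketch of the early L3-coverage VA sizing; the 2 GB of DRAM and
     * the framebuffer dimensions are illustrative values only. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t mem_size     = 2ULL << 30;                      /* 2 GB */
        uint64_t mem_segments = (mem_size + 0x0FFFFFFF) >> 28;   /* 256 MB chunks, rounded up: 8 */
        uint64_t fb_bytes     = 1334ULL * 750 * 4;               /* v_height * v_rowBytes stand-in */

        uint64_t span = (2 + (mem_segments * 10)) << 20;         /* 82 MB */
        span += (fb_bytes + 0x3FFFULL) & ~0x3FFFULL;             /* round framebuffer to a 16 KB page */
        span  = (span + 0x7FFFFFULL) & ~0x7FFFFFULL;             /* round the total to 8 MB */

        printf("mem_segments=%llu reserved=%llu MB\n",
               (unsigned long long)mem_segments, (unsigned long long)(span >> 20));
        return 0;
    }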
@@ -1370,7 +1621,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
                cpu_l2_tte = ((tt_entry_t *) phystokv(((*cpu_l1_tte) & ARM_TTE_TABLE_MASK))) + ((va_l2 & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);
 #else
-               va_l2 = (gVirtBase+MEM_SIZE_MAX+ ~0xFFFFFFFFFF800000ULL) & 0xFFFFFFFFFF800000ULL;
+               va_l2 = dynamic_memory_begin;
                va_l2_end = va_l2 + ((2 + (mem_segments * 10)) << 20);
                va_l2_end += round_page(args->Video.v_height * args->Video.v_rowBytes);
                va_l2_end = (va_l2_end + 0x00000000007FFFFFULL) & 0xFFFFFFFFFF800000ULL;
@@ -1387,7 +1638,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
                        pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
 
-                       *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
+                       *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
 
                        va_l2 += ARM_TT_L2_SIZE;
                        cpu_l2_tte++;
@@ -1437,7 +1688,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
 
                        pmap_init_pte_page(kernel_pmap, ptp, va_l2, 3, TRUE);
 
-                       *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
+                       *cpu_l2_tte = (pa_to_tte (ptp_phys)) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
 
                        va_l2 += ARM_TT_L2_SIZE;
                        cpu_l2_tte++;
@@ -1465,7 +1716,7 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
                if (*cpu_l1_tte == ARM_TTE_EMPTY) {
                        tt_entry_t *new_tte = (tt_entry_t*)alloc_ptpage(FALSE);
                        bzero(new_tte, ARM_PGBYTES);
-                       *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK)  | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_TTE_TABLE_PXN | ARM_TTE_TABLE_XN;
+                       *cpu_l1_tte = (kvtophys((vm_offset_t)new_tte) & ARM_TTE_TABLE_MASK)  | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID | ARM_DYNAMIC_TABLE_XN;
                }
 
                cpu_l1_tte++;
@@ -1479,8 +1730,8 @@ arm_vm_init(uint64_t memory_size, boot_args * args)
         */
        avail_start = (avail_start + PAGE_MASK) & ~PAGE_MASK;
 
-
        first_avail = avail_start;
        patch_low_glo_static_region(args->topOfKernelData, avail_start - args->topOfKernelData);
+       enable_preemption();
 }
 
index f756f22aef629abc0ba16b73b1d41f3a3247a1c4..fb2c1ea8a5520a1807064230949461e82ee0c43a 100644 (file)
        movk $0, #((($1) >> 00) & 0x000000000000FFFF), lsl #00
 .endmacro
 
+.macro ARM64_STACK_PROLOG
+#if __has_feature(ptrauth_returns)
+       pacibsp
+#endif
+.endmacro
+
+.macro ARM64_STACK_EPILOG
+#if __has_feature(ptrauth_returns)
+       retab
+#else
+       ret
+#endif
+.endmacro
+
 #define PUSH_FRAME                     \
        stp fp, lr, [sp, #-16]!         %% \
        mov fp, sp                      %%
index 01f33d61e5b0d32cffe7b1fdb7e541f995caffc2..67266940de6bdf553f02ba042252ed514ec05d34 100644 (file)
@@ -90,6 +90,7 @@ _memmove:
 //     can only be smaller than length if the buffers do not overlap, so we don't
 //     need to worry about false positives due to the overflow (they happen, but
 //     only in cases where copying in either order is correct).
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        sub     x3,      x0, x1
        cmp     x3,      x2
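The overlap test described in the comment above relies on unsigned wraparound: dst - src (mod 2^64) is smaller than length exactly when dst lies inside [src, src + length), i.e. exactly when a forward copy would overwrite source bytes before they are read. A C sketch of the same decision (illustrative only; the production routine stays in hand-written assembly):

    /* C sketch of memmove's copy-direction test: the unsigned difference
     * dst - src is < len precisely when dst overlaps the tail of src. */
    #include <stddef.h>
    #include <stdint.h>

    static void sketch_memmove(void *dst, const void *src, size_t len)
    {
        uint8_t *d = dst;
        const uint8_t *s = src;

        if ((uintptr_t)d - (uintptr_t)s < len) {
            while (len--)                     /* dst overlaps the end of src: copy backwards */
                d[len] = s[len];
        } else {
            for (size_t i = 0; i < len; i++)  /* disjoint, or dst below src: forward copy is safe */
                d[i] = s[i];
        }
    }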
@@ -178,7 +179,7 @@ L_forwardCleanup:
        stp     x12,x13,[x3, #32]
        stp     x14,x15,[x3, #48]
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*****************************************************************************
  *  forward small copy                                                       *
@@ -204,7 +205,7 @@ L_forwardSmallCopy:
        subs    x2,      x2, #1
        b.ne    1b
 2:     POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*****************************************************************************
  *  Reverse copy engines                                                     *
@@ -271,7 +272,7 @@ L_reverseCleanup:
        stp     x12,x13,[x0, #16] // In the forward copy, we need to compute the
        stp     x14,x15,[x0]      // address of these stores, but here we already
        POP_FRAME       // have a pointer to the start of the buffer.
-       ret
+       ARM64_STACK_EPILOG
 
 /*****************************************************************************
  *  reverse small copy                                                       *
@@ -289,8 +290,9 @@ L_reverseSmallCopy:
        subs    x2,      x2, #1
        b.ne    1b
 2:     POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
+
 
 L_return:
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
index c2f084e474d84f1a670ff1d459a6fadb7d2e5147..a7abca2cb908000011161a8d42e8d9695c2e6b08 100644 (file)
@@ -50,6 +50,7 @@
 .align 4
 _bzero:
 ___bzero:
+    ARM64_STACK_PROLOG
     PUSH_FRAME
     mov     x2,      x1
     eor     x1,      x1, x1
@@ -85,7 +86,7 @@ L_bzeroLarge:
     stp     x1, x1, [x3, #32]
     stp     x1, x1, [x3, #48]
     POP_FRAME
-       ret
+    ARM64_STACK_EPILOG
 
 /*****************************************************************************
  *  memset entrypoint                                                        *
@@ -98,6 +99,7 @@ L_bzeroLarge:
  */
 _secure_memset:
 _memset:
+    ARM64_STACK_PROLOG
     PUSH_FRAME
     and     x1,      x1, #0xff
     orr     x3,      xzr,#0x0101010101010101
@@ -134,7 +136,7 @@ L_memsetLarge:
     stp     x1, x1, [x3, #32]
     stp     x1, x1, [x3, #48]
     POP_FRAME
-       ret
+    ARM64_STACK_EPILOG
 
 /*****************************************************************************
  *  Small buffer store engine                                                *
@@ -150,4 +152,5 @@ L_memsetSmall:
     subs    x2,      x2, #1
     b.ne    1b
 2:  POP_FRAME
-       ret
+    ARM64_STACK_EPILOG
+
index ac50fd03774c8865d881e6cc4eb2bb126c03c511..a673abaf302d8465c480dd33a1fb4556a72fbb11 100644 (file)
@@ -130,7 +130,19 @@ LEXT(CleanPoU_Dcache)
 #if defined(APPLE_ARM64_ARCH_FAMILY)
        /* "Fully Coherent." */
 #else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
-#error CleanPoU_Dcache needs an implementation
+       mov             x0, #0
+       mov             x9, #(1 << MMU_I7SET)
+       mov             x10, #(1 << (MMU_NSET + MMU_I7SET))
+       mov             x11, #(1 << MMU_I7WAY)
+L_cpud_dcacheway:
+L_cpud_dcacheline:
+       dc              csw, x0                                                         // clean dcache line by way/set
+       add             x0, x0, x9                                                      // increment set index
+       tst             x0, #(1 << (MMU_NSET + MMU_I7SET))      // look for overflow
+       b.eq    L_cpud_dcacheline
+       bic             x0, x0, x10                                                     // clear set overflow
+       adds    x0, x0, x11                                                     // increment way
+       b.cc    L_cpud_dcacheway                                        // loop
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
        dsb sy
        ret
@@ -170,6 +182,7 @@ L_cpudr_loop:
        .text
        .align 2
 LEXT(CleanPoC_DcacheRegion_internal)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        mov             x9, #((1<<MMU_CLINE)-1)
        and             x2, x0, x9
@@ -193,7 +206,7 @@ L_cpcdr_loop:
        b.pl    L_cpcdr_loop                                            // Loop in counter not null
        dsb             sy
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  *     void CleanPoC_DcacheRegion(vm_offset_t va, unsigned length)
@@ -212,35 +225,50 @@ LEXT(CleanPoC_DcacheRegion)
        b EXT(CleanPoC_DcacheRegion_internal)
 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
 
-/*
- *     void CleanPoC_DcacheRegion_Force(vm_offset_t va, unsigned length)
- *
- *             Clean d-cache region to Point of Coherency -  when you really 
- *             need to flush even on coherent platforms, e.g. panic log
- */
-.text
+       .text
        .align 2
-       .globl EXT(CleanPoC_DcacheRegion_Force)
-LEXT(CleanPoC_DcacheRegion_Force)
+       .globl EXT(CleanPoC_DcacheRegion_Force_nopreempt)
+LEXT(CleanPoC_DcacheRegion_Force_nopreempt)
 #if defined(APPLE_ARM64_ARCH_FAMILY)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
-       stp             x0, x1, [sp, #-16]!
-       bl              EXT(_disable_preemption)
        isb             sy
        ARM64_IS_PCORE x15
        ARM64_READ_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
        and             x14, x14, (~ARM64_REG_HID4_DisDcMVAOps)
        ARM64_WRITE_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
        isb             sy
-       ldp             x0, x1, [sp], #16
        bl              EXT(CleanPoC_DcacheRegion_internal)
        isb             sy
        orr             x14, x14, ARM64_REG_HID4_DisDcMVAOps
        ARM64_WRITE_EP_SPR x15, x14, ARM64_REG_EHID4, ARM64_REG_HID4
        isb             sy
+       POP_FRAME
+       ARM64_STACK_EPILOG
+#else
+       b               EXT(CleanPoC_DcacheRegion_internal)
+#endif // APPLE_ARM64_ARCH_FAMILY
+
+/*
+ *     void CleanPoC_DcacheRegion_Force(vm_offset_t va, unsigned length)
+ *
+ *             Clean d-cache region to Point of Coherency -  when you really 
+ *             need to flush even on coherent platforms, e.g. panic log
+ */
+       .text
+       .align 2
+       .globl EXT(CleanPoC_DcacheRegion_Force)
+LEXT(CleanPoC_DcacheRegion_Force)
+#if defined(APPLE_ARM64_ARCH_FAMILY)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       stp             x0, x1, [sp, #-16]!
+       bl              EXT(_disable_preemption)
+       ldp             x0, x1, [sp], #16
+       bl              EXT(CleanPoC_DcacheRegion_Force_nopreempt)
        bl              EXT(_enable_preemption)
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 #else
        b               EXT(CleanPoC_DcacheRegion_internal)
 #endif // APPLE_ARM64_ARCH_FAMILY
index 2e47825ddeea49d5103d333b28af23aefa143210..599353c492b3190aee30c2c951ceecd772f5423d 100644 (file)
@@ -34,6 +34,9 @@
 #include <vm/vm_map.h>
 #include <san/kasan.h>
 
+#undef copyin
+#undef copyout
+
 extern int _bcopyin(const char *src, char *dst, vm_size_t len);
 extern int _bcopyinstr(const char *src, char *dst, vm_size_t max, vm_size_t *actual);
 extern int _bcopyout(const char *src, char *dst, vm_size_t len);
@@ -41,6 +44,9 @@ extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len);
 
 extern pmap_t kernel_pmap;
 
+/* On by default, optionally disabled by boot-arg */
+extern boolean_t copyio_zalloc_check;
+
 typedef enum copyio_type {
        COPYIO_IN,
        COPYIO_IN_WORD,
@@ -48,18 +54,6 @@ typedef enum copyio_type {
        COPYIO_OUT,
 } copyio_type_t;
 
-int
-copyio_check_user_addr(user_addr_t user_addr, vm_size_t nbytes)
-{
-       if (nbytes && (user_addr + nbytes <= user_addr))
-               return EFAULT;
-
-       if ((user_addr + nbytes) > vm_map_max(current_thread()->map))
-               return EFAULT;
-
-       return 0;
-}
-
 static inline void
 user_access_enable(void)
 {
@@ -82,6 +76,8 @@ copyio(copyio_type_t copytype, const char *src, char *dst,
 {
        int result = 0;
        vm_size_t bytes_copied = 0;
+       vm_size_t kernel_buf_size = 0;
+       void * kernel_addr = NULL;
 
        /* Reject TBI addresses */
        if (copytype == COPYIO_OUT) {
@@ -92,8 +88,16 @@ copyio(copyio_type_t copytype, const char *src, char *dst,
                        return EINVAL;
        }
 
-       if (!nbytes) {
-               return 0;
+       if (__probable(copyio_zalloc_check)) {
+               if (copytype == COPYIO_IN || copytype == COPYIO_INSTR || copytype == COPYIO_IN_WORD) {
+                       kernel_addr = (void*)dst;
+               } else if (copytype == COPYIO_OUT) {
+                       kernel_addr = (void*)(uintptr_t)src;
+               }
+               if (kernel_addr)
+                       kernel_buf_size = zone_element_size(kernel_addr, NULL);
+               if (__improbable(kernel_buf_size && kernel_buf_size < nbytes))
+                       panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes);
        }
 
 #if KASAN
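
The added precheck cross-checks the kernel-side buffer against the zone allocator: when the buffer is zone-backed and the zone's element size is smaller than nbytes, the copy would overrun the allocation, so the kernel panics instead of proceeding. A minimal userspace sketch of that check follows; zone_element_size_model() is a made-up stand-in for the real allocator lookup.

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Stand-in for the zone allocator lookup: returns the element size if the
     * address belongs to a zone, or 0 if it is not zone-backed. */
    static size_t zone_element_size_model(const void *addr)
    {
        (void)addr;
        return 64;              /* pretend the buffer came from a 64-byte zone */
    }

    /* Model of the precheck: treat a zone-backed kernel buffer that is smaller
     * than the requested copy length as fatal. */
    static void copyio_zone_precheck(const void *kernel_addr, size_t nbytes)
    {
        size_t elem = zone_element_size_model(kernel_addr);
        if (elem != 0 && elem < nbytes)
            assert(!"copyio: kernel buffer smaller than nbytes");
    }

    int main(void)
    {
        char buf[64];
        copyio_zone_precheck(buf, 32);      /* fits within the element: fine */
        printf("32-byte copy into a 64-byte zone element: ok\n");
        /* copyio_zone_precheck(buf, 128) would trip the assert */
        return 0;
    }
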
@@ -153,27 +157,20 @@ copyout_kern(const char *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
 }
 
 int
-copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
+copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes)
 {
        int result;
 
-       if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) {
-               if (current_thread()->map->pmap == kernel_pmap)
-                       return copyin_kern(user_addr, kernel_addr, nbytes);
-               else
-                       return EFAULT;
-       }
-
-       if (nbytes >= 4096) {
-               result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
-               if (result) return result;
-       }
-
-       result = copyio_check_user_addr(user_addr, nbytes);
+       if (nbytes == 0)
+               return 0;
 
+       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
        if (result) return result;
 
-       return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL);
+       if (current_thread()->map->pmap == kernel_pmap)
+               return copyin_kern(user_addr, kernel_addr, nbytes);
+       else
+               return copyio(COPYIO_IN, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, NULL);
 }
 
 /*
@@ -194,11 +191,7 @@ copyin_word(const user_addr_t user_addr, uint64_t *kernel_addr, vm_size_t nbytes
        if (user_addr & (nbytes - 1))
                return EINVAL;
 
-       /* Address must be user */
-       if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS)
-               return EFAULT;
-
-       result = copyio_check_user_addr(user_addr, nbytes);
+       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
        if (result)
                return result;
 
@@ -210,18 +203,14 @@ copyinstr(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes, vm_s
 {
        int result;
 
-       if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) {
-               return EFAULT;
-       }
+       *lencopied = 0;
+       if (nbytes == 0)
+               return ENAMETOOLONG;
 
-       result = copyio_check_user_addr(user_addr, nbytes);
+       result = copyin_validate(user_addr, (uintptr_t)kernel_addr, nbytes);
 
        if (result) return result;
 
-       if (!nbytes) {
-               return ENAMETOOLONG;
-       }
-
        return copyio(COPYIO_INSTR, (const char *)(uintptr_t)user_addr, kernel_addr, nbytes, lencopied);
 }
 
@@ -230,23 +219,16 @@ copyout(const void *kernel_addr, user_addr_t user_addr, vm_size_t nbytes)
 {
        int result;
 
-       if (user_addr >= VM_MIN_KERNEL_ADDRESS || user_addr + nbytes >= VM_MIN_KERNEL_ADDRESS) {
-               if (current_thread()->map->pmap == kernel_pmap)
-                       return copyout_kern(kernel_addr, user_addr, nbytes);
-               else
-                       return EFAULT;
-       }
-
-       if (nbytes >= 4096) {
-               result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes);
-               if (result) return result;
-       }
-
-       result = copyio_check_user_addr(user_addr, nbytes);
+       if (nbytes == 0)
+               return 0;
 
+       result = copyout_validate((uintptr_t)kernel_addr, user_addr, nbytes);
        if (result) return result;
 
-       return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL);
+       if (current_thread()->map->pmap == kernel_pmap)
+               return copyout_kern(kernel_addr, user_addr, nbytes);
+       else
+               return copyio(COPYIO_OUT, kernel_addr, (char *)(uintptr_t)user_addr, nbytes, NULL);
 }
 
 
@@ -262,10 +244,6 @@ const int copysize_limit_panic = (64 * 1024 * 1024);
 
 /*
  * Validate the arguments to copy{in,out} on this platform.
- *
- * Called when nbytes is "large" e.g. more than a page.  Such sizes are
- * infrequent, and very large sizes are likely indications of attempts
- * to exploit kernel programming errors (bugs).
  */
 static int
 copy_validate(const user_addr_t user_addr,
@@ -273,16 +251,17 @@ copy_validate(const user_addr_t user_addr,
 {
        uintptr_t kernel_addr_last = kernel_addr + nbytes;
 
-       if (kernel_addr < VM_MIN_KERNEL_ADDRESS ||
+       if (__improbable(kernel_addr < VM_MIN_KERNEL_ADDRESS ||
            kernel_addr > VM_MAX_KERNEL_ADDRESS ||
            kernel_addr_last < kernel_addr ||
-           kernel_addr_last > VM_MAX_KERNEL_ADDRESS)
+           kernel_addr_last > VM_MAX_KERNEL_ADDRESS))
                panic("%s(%p, %p, %lu) - kaddr not in kernel", __func__,
                       (void *)user_addr, (void *)kernel_addr, nbytes);
 
        user_addr_t user_addr_last = user_addr + nbytes;
 
-       if (user_addr_last < user_addr || user_addr_last > VM_MIN_KERNEL_ADDRESS)
+       if (__improbable((user_addr_last < user_addr) || ((user_addr + nbytes) > vm_map_max(current_thread()->map)) ||
+           (user_addr < vm_map_min(current_thread()->map))))
                return (EFAULT);
 
        if (__improbable(nbytes > copysize_limit_panic))
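
With the per-caller VM_MIN_KERNEL_ADDRESS checks removed above, copy_validate() is now the single place that rejects user ranges that wrap or fall outside the current map. A small standalone sketch of the same bounds arithmetic; the map limits are made-up constants standing in for vm_map_min()/vm_map_max().

    #include <errno.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Made-up limits standing in for vm_map_min()/vm_map_max(). */
    #define MAP_MIN 0x0000000000001000ULL
    #define MAP_MAX 0x00007ffffffff000ULL

    /* Returns 0 if [user_addr, user_addr + nbytes) is a sane user range,
     * EFAULT if it wraps or leaves the map. */
    static int validate_user_range(uint64_t user_addr, uint64_t nbytes)
    {
        uint64_t last = user_addr + nbytes;

        if (last < user_addr)                    /* overflow / wrap-around */
            return EFAULT;
        if (user_addr < MAP_MIN || last > MAP_MAX)
            return EFAULT;
        return 0;
    }

    int main(void)
    {
        printf("%d\n", validate_user_range(0x10000, 4096));          /* 0 */
        printf("%d\n", validate_user_range(UINT64_MAX - 16, 4096));  /* EFAULT: wraps */
        printf("%d\n", validate_user_range(MAP_MAX, 1));             /* EFAULT: past map */
        return 0;
    }
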
index d4712a612c328038ee5120ab8407b178d4087b31..4d1300d7509263cd478600ee5b43d171743f1151 100644 (file)
@@ -134,6 +134,55 @@ static vm_offset_t sleepTokenBuffer = (vm_offset_t)NULL;
 #endif
 static boolean_t coresight_debug_enabled = FALSE;
 
+#if defined(CONFIG_XNUPOST)
+void arm64_ipi_test_callback(void *);
+
+void arm64_ipi_test_callback(void *parm) {
+       volatile uint64_t *ipi_test_data = parm;
+       cpu_data_t *cpu_data;
+
+       cpu_data = getCpuDatap();
+
+       *ipi_test_data = cpu_data->cpu_number;
+}
+
+uint64_t arm64_ipi_test_data[MAX_CPUS];
+
+void arm64_ipi_test() {
+       volatile uint64_t *ipi_test_data;
+       uint32_t timeout_ms = 100;
+       uint64_t then, now, delta;
+       int current_cpu_number = getCpuDatap()->cpu_number;
+
+       /*
+        * probably the only way to have this on most systems is with the
+        * cpus=1 boot-arg, but nonetheless, if we only have 1 CPU active,
+        * IPI is not available
+        */
+       if (real_ncpus == 1) {
+               return;
+       }
+
+       for (unsigned int i = 0; i < MAX_CPUS; ++i) {
+               ipi_test_data = &arm64_ipi_test_data[i];
+               *ipi_test_data = ~i;
+               kern_return_t error = cpu_xcall((int)i, (void *)arm64_ipi_test_callback, (void *)(uintptr_t)ipi_test_data);
+               if (error != KERN_SUCCESS)
+                       panic("CPU %d was unable to IPI CPU %u: error %d", current_cpu_number, i, error);
+
+               then = mach_absolute_time();
+
+               while (*ipi_test_data != i) {
+                       now = mach_absolute_time();
+                       absolutetime_to_nanoseconds(now-then, &delta);
+                       if ((delta / NSEC_PER_MSEC) > timeout_ms) {
+                               panic("CPU %d tried to IPI CPU %d but didn't get correct response within %dms, response: %llx", current_cpu_number, i, timeout_ms, *ipi_test_data);
+                       }
+               }
+       }
+
+}
+#endif /* defined(CONFIG_XNUPOST) */
 
 static void
 configure_coresight_registers(cpu_data_t *cdp)
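
arm64_ipi_test() above seeds each per-CPU slot with ~i, cross-calls the target CPU so it writes its cpu_number back, and spins until the value appears or a 100 ms deadline passes. A self-contained model of that poll-with-deadline pattern, using clock_gettime() in place of mach_absolute_time()/absolutetime_to_nanoseconds():

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    static uint64_t now_ns(void)
    {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
    }

    /* Spin until *slot == expected or timeout_ms elapses.
     * Returns 1 on success, 0 on timeout. */
    static int wait_for_reply(volatile uint64_t *slot, uint64_t expected,
                              uint32_t timeout_ms)
    {
        uint64_t then = now_ns();

        while (*slot != expected) {
            if ((now_ns() - then) / 1000000ULL > timeout_ms)
                return 0;
        }
        return 1;
    }

    int main(void)
    {
        volatile uint64_t slot = ~0ULL;
        slot = 3;                     /* pretend the target CPU answered */
        printf("reply seen: %d\n", wait_for_reply(&slot, 3, 100));
        return 0;
    }
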
@@ -316,7 +365,7 @@ cpu_idle(void)
 
        ClearIdlePop(TRUE);
 
-       cpu_idle_exit();
+       cpu_idle_exit(FALSE);
 }
 
 /*
@@ -324,7 +373,7 @@ cpu_idle(void)
  *     Function:
  */
 void
-cpu_idle_exit(void)
+cpu_idle_exit(boolean_t from_reset)
 {
        uint64_t        new_idle_timeout_ticks = 0x0ULL;
        cpu_data_t     *cpu_data_ptr = getCpuDatap();
@@ -332,7 +381,8 @@ cpu_idle_exit(void)
        assert(exception_stack_pointer() != 0);
 
        /* Back from WFI, unlock OSLAR and EDLAR. */
-       configure_coresight_registers(cpu_data_ptr);
+       if (from_reset)
+               configure_coresight_registers(cpu_data_ptr);
 
 #if KPC
        kpc_idle_exit();
@@ -420,51 +470,35 @@ cpu_init(void)
 #endif /* MONOTONIC */
 }
 
-cpu_data_t *
-cpu_data_alloc(boolean_t is_boot_cpu)
+void
+cpu_stack_alloc(cpu_data_t *cpu_data_ptr)
 {
-       cpu_data_t              *cpu_data_ptr = NULL;
-
-       if (is_boot_cpu)
-               cpu_data_ptr = &BootCpuData;
-       else {
-               void    *irq_stack = NULL;
-               void    *exc_stack = NULL;
-               void    *fiq_stack = NULL;
-
-               if ((kmem_alloc(kernel_map, (vm_offset_t *)&cpu_data_ptr, sizeof(cpu_data_t), VM_KERN_MEMORY_CPU)) != KERN_SUCCESS)
-                       goto cpu_data_alloc_error;
-
-               bzero((void *)cpu_data_ptr, sizeof(cpu_data_t));
-
-               if ((irq_stack = kalloc(INTSTACK_SIZE)) == 0)
-                       goto cpu_data_alloc_error;
-               cpu_data_ptr->intstack_top = (vm_offset_t)irq_stack + INTSTACK_SIZE ;
-               cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top;
-
-               if ((exc_stack = kalloc(PAGE_SIZE)) == 0)
-                       goto cpu_data_alloc_error;
-               cpu_data_ptr->excepstack_top = (vm_offset_t)exc_stack + PAGE_SIZE ;
-               cpu_data_ptr->excepstackptr = cpu_data_ptr->excepstack_top;
-
-               if ((fiq_stack = kalloc(PAGE_SIZE)) == 0)
-                       goto cpu_data_alloc_error;
-               cpu_data_ptr->fiqstack_top = (vm_offset_t)fiq_stack + PAGE_SIZE ;
-               cpu_data_ptr->fiqstackptr = cpu_data_ptr->fiqstack_top;
-       }
-
-       cpu_data_ptr->cpu_processor = cpu_processor_alloc(is_boot_cpu);
-       if (cpu_data_ptr->cpu_processor == (struct processor *)NULL)
-               goto cpu_data_alloc_error;
-
-       return cpu_data_ptr;
-
-cpu_data_alloc_error:
-       panic("cpu_data_alloc() failed\n");
-       return (cpu_data_t *)NULL;
+       vm_offset_t             irq_stack = 0;
+       vm_offset_t             exc_stack = 0;
+
+       kern_return_t kr = kernel_memory_allocate(kernel_map, &irq_stack,
+                                  INTSTACK_SIZE + (2 * PAGE_SIZE),
+                                  PAGE_MASK,
+                                  KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT,
+                                  VM_KERN_MEMORY_STACK);
+       if (kr != KERN_SUCCESS)
+               panic("Unable to allocate cpu interrupt stack\n");
+
+       cpu_data_ptr->intstack_top = irq_stack + PAGE_SIZE + INTSTACK_SIZE;
+       cpu_data_ptr->istackptr = cpu_data_ptr->intstack_top;
+
+       kr = kernel_memory_allocate(kernel_map, &exc_stack,
+                                  EXCEPSTACK_SIZE + (2 * PAGE_SIZE),
+                                  PAGE_MASK,
+                                  KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT,
+                                  VM_KERN_MEMORY_STACK);
+       if (kr != KERN_SUCCESS)
+               panic("Unable to allocate cpu exception stack\n");
+
+       cpu_data_ptr->excepstack_top = exc_stack + PAGE_SIZE + EXCEPSTACK_SIZE;
+       cpu_data_ptr->excepstackptr = cpu_data_ptr->excepstack_top;
 }
 
-
 void
 cpu_data_free(cpu_data_t *cpu_data_ptr)
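
cpu_stack_alloc() now requests INTSTACK_SIZE (or EXCEPSTACK_SIZE) plus two extra pages and marks the first and last page as guards, so the usable stack sits between unmapped pages and intstack_top lands at base + PAGE_SIZE + INTSTACK_SIZE. A quick sketch of that layout arithmetic with illustrative sizes and a made-up base address:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE_MODEL     0x4000ULL                 /* 16KB pages */
    #define INTSTACK_SIZE_MODEL (4 * PAGE_SIZE_MODEL)     /* illustrative */

    int main(void)
    {
        /* Pretend the allocator returned this base for
         * INTSTACK_SIZE + 2 * PAGE_SIZE bytes. */
        uint64_t base = 0xfffffff010000000ULL;

        uint64_t guard_lo  = base;                               /* unmapped guard */
        uint64_t stack_bot = base + PAGE_SIZE_MODEL;
        uint64_t stack_top = stack_bot + INTSTACK_SIZE_MODEL;    /* intstack_top */
        uint64_t guard_hi  = stack_top;                          /* unmapped guard */

        printf("low guard   %#llx\n", (unsigned long long)guard_lo);
        printf("stack       %#llx..%#llx\n", (unsigned long long)stack_bot,
               (unsigned long long)stack_top);
        printf("high guard  %#llx\n", (unsigned long long)guard_hi);
        return 0;
    }
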
 {
@@ -473,7 +507,7 @@ cpu_data_free(cpu_data_t *cpu_data_ptr)
 
        cpu_processor_free( cpu_data_ptr->cpu_processor);
        kfree( (void *)(cpu_data_ptr->intstack_top - INTSTACK_SIZE), INTSTACK_SIZE);
-       kfree( (void *)(cpu_data_ptr->fiqstack_top - PAGE_SIZE), PAGE_SIZE);
+       kfree( (void *)(cpu_data_ptr->excepstack_top - EXCEPSTACK_SIZE), EXCEPSTACK_SIZE);
        kmem_free(kernel_map, (vm_offset_t)cpu_data_ptr, sizeof(cpu_data_t));
 }
 
@@ -533,8 +567,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
 
        pmap_cpu_data_t * pmap_cpu_data_ptr = &cpu_data_ptr->cpu_pmap_cpu_data;
 
-       pmap_cpu_data_ptr->cpu_user_pmap = (struct pmap *) NULL;
-       pmap_cpu_data_ptr->cpu_user_pmap_stamp = 0;
+       pmap_cpu_data_ptr->cpu_nested_pmap = (struct pmap *) NULL;
        pmap_cpu_data_ptr->cpu_number = PMAP_INVALID_CPU_NUM;
 
        for (i = 0; i < (sizeof(pmap_cpu_data_ptr->cpu_asid_high_bits) / sizeof(*pmap_cpu_data_ptr->cpu_asid_high_bits)); i++) {
@@ -544,6 +577,7 @@ cpu_data_init(cpu_data_t *cpu_data_ptr)
 #if __ARM_KERNEL_PROTECT__
        cpu_data_ptr->cpu_exc_vectors = (vm_offset_t)&exc_vectors_table;
 #endif /* __ARM_KERNEL_PROTECT__ */
+
 }
 
 kern_return_t
@@ -563,6 +597,7 @@ cpu_data_register(cpu_data_t *cpu_data_ptr)
 
 }
 
+
 kern_return_t
 cpu_start(int cpu)
 {
@@ -578,7 +613,7 @@ cpu_start(int cpu)
 
                cpu_data_ptr->cpu_reset_handler = (vm_offset_t) start_cpu_paddr;
 
-               cpu_data_ptr->cpu_pmap_cpu_data.cpu_user_pmap = NULL;
+               cpu_data_ptr->cpu_pmap_cpu_data.cpu_nested_pmap = NULL;
 
                if (cpu_data_ptr->cpu_processor->next_thread != THREAD_NULL)
                        first_thread = cpu_data_ptr->cpu_processor->next_thread;
index e3a0cb317dc8e819e7279c2f21cd7fcca72965e8..7aa9614a13f52877f779a5e12c4c0c7fa2a4cbd5 100644 (file)
@@ -78,6 +78,7 @@
  *   arg1 - Scratch register
  */
 .macro load_general_registers
+
        ldp             x16, x17, [$0, SS64_X16]
        ldp             x19, x20, [$0, SS64_X19]
        ldp             x21, x22, [$0, SS64_X21]
@@ -134,14 +135,16 @@ LEXT(machine_load_context)
        set_thread_registers    x0, x1, x2
        ldr             x1, [x0, TH_KSTACKPTR]                          // Get top of kernel stack
        load_general_registers  x1, x2
-       mov             x0, xzr                                                         // Clear argument to thread_continue
+       mov             x0, #0                                                          // Clear argument to thread_continue
        ret
 
 /*
- *     void Call_continuation( void (*continuation)(void), 
- *                             void *param, 
- *                             wait_result_t wresult, 
- *                             vm_offset_t stack_ptr)
+ *  typedef void (*thread_continue_t)(void *param, wait_result_t)
+ *
+ *  void Call_continuation( thread_continue_t continuation,
+ *                          void *param,
+ *                          wait_result_t wresult,
+ *                          bool enable_interrupts)
  */
        .text
        .align  5
@@ -153,12 +156,21 @@ LEXT(Call_continuation)
        /* ARM64_TODO arm loads the kstack top instead of arg4. What should we use? */
        ldr             x5, [x4, TH_KSTACKPTR]                          // Get the top of the kernel stack
        mov             sp, x5                                                          // Set stack pointer
+       mov             fp, #0                                                          // Clear the frame pointer
+
+
+       mov             x20, x0                                                         // Stash the continuation
+       mov             x21, x1                                                         // Stash the continuation parameter
+       mov             x22, x2                                                         // Stash the wait result
+
+       cbz             x3, 1f                                                          // Skip if not asked to enable interrupts
+       mov             x0, #1
+       bl              _ml_set_interrupts_enabled
+1:
 
-       mov             fp, xzr                                                         // Clear the frame pointer
-       mov             x4, x0                                                          // Load the continuation
-       mov             x0, x1                                                          // Set the first parameter
-       mov             x1, x2                                                          // Set the wait result arg
-       blr             x4                                                                      // Branch to the continuation
+       mov             x0, x21                                                         // Set the first parameter
+       mov             x1, x22                                                         // Set the wait result arg
+       blr             x20                                                                     // Branch to the continuation
        mrs             x0, TPIDR_EL1                                           // Get the current thread pointer
        b               EXT(thread_terminate)                           // Kill the thread
 
index 6e47758b1ae0bfa14d9960832cc1e70510a8bbe8..c9755bde8087daa8bbb0afd1c935e574851cdb8c 100644 (file)
@@ -260,7 +260,6 @@ main(
        DECLARE("PGSHIFT", ARM_PGSHIFT);
        DECLARE("PGMASK", ARM_PGMASK);
 
-
        DECLARE("VM_MIN_ADDRESS",       VM_MIN_ADDRESS);
        DECLARE("VM_MAX_ADDRESS",       VM_MAX_ADDRESS);
        DECLARE("VM_MIN_KERNEL_ADDRESS",        VM_MIN_KERNEL_ADDRESS);
@@ -292,10 +291,6 @@ main(
                offsetof(cpu_data_t, excepstackptr));
         DECLARE("CPU_EXCEPSTACK_TOP",
                offsetof(cpu_data_t, excepstack_top));
-        DECLARE("CPU_FIQSTACKPTR",
-               offsetof(cpu_data_t, fiqstackptr));
-        DECLARE("CPU_FIQSTACK_TOP",
-               offsetof(cpu_data_t, fiqstack_top));
 #if __ARM_KERNEL_PROTECT__
        DECLARE("CPU_EXC_VECTORS",
                offsetof(cpu_data_t, cpu_exc_vectors));
@@ -356,6 +351,8 @@ main(
                offsetof(cpu_data_t, cpu_phys_id));
        DECLARE("RTCLOCK_DATAP",
                offsetof(cpu_data_t, rtclock_datap));
+       DECLARE("CLUSTER_MASTER",
+               offsetof(cpu_data_t, cluster_master));
 
        DECLARE("RTCLOCKDataSize",
                sizeof(rtclock_data_t));
@@ -382,8 +379,10 @@ main(
 
        DECLARE("CPU_DATA_PADDR",       offsetof(struct cpu_data_entry, cpu_data_paddr));
 
-
        DECLARE("INTSTACK_SIZE",        INTSTACK_SIZE);
+       DECLARE("EXCEPSTACK_SIZE",      EXCEPSTACK_SIZE);
+
+       DECLARE("PAGE_MAX_SIZE",        PAGE_MAX_SIZE);
 
        DECLARE("TIMER_TSTAMP",
                offsetof(struct timer, tstamp));
@@ -420,6 +419,8 @@ main(
                offsetof(struct boot_args, deviceTreeP));
        DECLARE("BA_DEVICE_TREE_LENGTH",
                offsetof(struct boot_args, deviceTreeLength));
+       DECLARE("BA_BOOT_FLAGS",
+               offsetof(struct boot_args, bootFlags));
 
        DECLARE("ENTROPY_INDEX_PTR",
                offsetof(entropy_data_t, index_ptr));
@@ -430,5 +431,6 @@ main(
        DECLARE("SR_RESTORE_TCR_EL1", offsetof(struct sysreg_restore, tcr_el1));
 
 
+
        return (0);
 }
index b1eae91fe497d047c52fd177470a30923381796f..d69c2d270d80050b1bfd22842b300dea918516a8 100644 (file)
@@ -41,6 +41,8 @@
 #include <kern/monotonic.h>
 #endif /* MONOTONIC */
 
+void kpc_pmi_handler(unsigned int ctr);
+
 /*
  * PMCs 8 and 9 were added to Hurricane and to maintain the existing bit
  * positions of the other PMCs, their configuration bits start at position 32.
@@ -230,8 +232,6 @@ static uint64_t kpc_running_cfg_pmc_mask = 0;
 static uint32_t kpc_running_classes = 0;
 static uint32_t kpc_configured = 0;
 
-static int first_time = 1;
-
 /*
  * The whitelist is disabled by default on development/debug kernel. This can
  * be changed via the kpc.disable_whitelist sysctl. The whitelist is enabled on
@@ -305,6 +305,20 @@ static kpc_config_t whitelist[] = {
        0xd3, /* FED_IC_MISS_DEM */
        0xd4, /* FED_ITLB_MISS */
 
+#elif defined(APPLEMONSOON)
+       0x02, /* CORE_CYCLE */
+       0x8a, /* INST_A32 */
+       0x8b, /* INST_THUMB */
+       0x8c, /* INST_A64 */
+       0x8d, /* INST_BRANCH */
+       0xbf, /* SYNC_DC_LOAD_MISS */
+       0xc0, /* SYNC_DC_STORE_MISS */
+       0xc1, /* SYNC_DTLB_MISS */
+       0xc4, /* SYNC_ST_HIT_YNGR_LD */
+       0xcb, /* SYNC_BR_ANY_MISP */
+       0xd3, /* FED_IC_MISS_DEM */
+       0xd4, /* FED_ITLB_MISS */
+
 #else
        /* An unknown CPU gets a trivial { NO_EVENT } whitelist. */
 #endif
@@ -984,43 +998,16 @@ kpc_set_reload_xcall(void *vmp_config)
                thread_wakeup((event_t) &kpc_reload_sync);
 }
 
-void kpc_pmi_handler(cpu_id_t source);
 void
-kpc_pmi_handler(cpu_id_t source __unused)
+kpc_pmi_handler(unsigned int ctr)
 {
-       uint64_t PMSR, extra;
-       int ctr;
-       int enabled;
+       uint64_t extra = kpc_reload_counter(ctr);
 
-       enabled = ml_set_interrupts_enabled(FALSE);
+       FIXED_SHADOW(ctr) += (kpc_fixed_max() - FIXED_RELOAD(ctr) + 1 /* Wrap */) + extra;
 
-       /* The pmi must be delivered to the CPU that generated it */
-       if (source != getCpuDatap()->interrupt_nub) {
-               panic("pmi from IOCPU %p delivered to IOCPU %p", source, getCpuDatap()->interrupt_nub); 
+       if (FIXED_ACTIONID(ctr)) {
+               kpc_sample_kperf(FIXED_ACTIONID(ctr));
        }
-
-       /* Get the PMSR which has the overflow bits for all the counters */
-       __asm__ volatile("mrs %0, S3_1_c15_c13_0" : "=r"(PMSR));
-
-       for (ctr = 0; ctr < (KPC_ARM64_FIXED_COUNT + KPC_ARM64_CONFIGURABLE_COUNT); ctr++) {
-               if ((1ull << ctr) & PMSR) {
-                       if (ctr < 2) {
-#if MONOTONIC
-                               mt_cpu_pmi(getCpuDatap(), PMSR);
-#endif /* MONOTONIC */
-                       } else {
-                               extra = kpc_reload_counter(ctr);
-
-                               FIXED_SHADOW(ctr)
-                                       += (kpc_fixed_max() - FIXED_RELOAD(ctr) + 1 /* Wrap */) + extra;
-
-                               if (FIXED_ACTIONID(ctr))
-                                       kpc_sample_kperf(FIXED_ACTIONID(ctr));
-                       }
-               }
-       }
-
-       ml_set_interrupts_enabled(enabled);
 }
 
 uint32_t
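
In the rewritten handler, a PMI arrives after the counter has wrapped past kpc_fixed_max(); kpc_reload_counter() rearms the counter and returns the residual count, and the shadow value advances by the distance from the reload value to the wrap point plus that residue. The same accounting, worked through with a toy 8-bit counter instead of the real 47-bit one:

    #include <stdint.h>
    #include <stdio.h>

    #define COUNTER_MAX 0xffULL      /* toy 8-bit counter; the real width is 47 bits */

    int main(void)
    {
        uint64_t reload = 0xf0;      /* armed to overflow after 16 events */
        uint64_t extra  = 3;         /* events counted after the wrap, read at PMI time */
        uint64_t shadow = 0;

        /* Events since the last reload: (max - reload + 1) to reach the wrap,
         * plus whatever accumulated before the handler ran. */
        uint64_t credited = (COUNTER_MAX - reload + 1) + extra;
        shadow += credited;

        printf("events credited this PMI: %llu\n", (unsigned long long)credited); /* 19 */
        printf("shadow total: %llu\n", (unsigned long long)shadow);
        return 0;
    }
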
@@ -1032,20 +1019,7 @@ kpc_get_classes(void)
 int
 kpc_set_running_arch(struct kpc_running_remote *mp_config)
 {
-       int cpu;
-
-       assert(mp_config);
-
-       if (first_time) {
-               PE_cpu_perfmon_interrupt_install_handler(kpc_pmi_handler);
-               int max_cpu = ml_get_max_cpu_number();
-               for (cpu = 0; cpu <= max_cpu; cpu++) {
-                       cpu_data_t *target_cpu_datap = (cpu_data_t *)CpuDataEntries[cpu].cpu_data_vaddr;
-                       if (target_cpu_datap != NULL)
-                               PE_cpu_perfmon_interrupt_enable(target_cpu_datap->cpu_id, TRUE);
-               }
-               first_time = 0;
-       }
+       assert(mp_config != NULL);
 
        /* dispatch to all CPUs */
        cpu_broadcast_xcall(&kpc_xcall_sync, TRUE, kpc_set_running_xcall, mp_config);
index 376e901953de96617b5c0af84383c22d06614f73..6a8d109f785184ec72be4639a88740552a7171bf 100644 (file)
        stp             q28, q29, [x0, NS64_Q28]
        stp             q30, q31, [x0, NS64_Q30]
 
-       mrs             lr, ELR_EL1                                                     // Get exception link register
+       mrs             lr,  ELR_EL1                                            // Get exception link register
        mrs             x23, SPSR_EL1                                           // Load CPSR into var reg x23
        mrs             x24, FPSR
        mrs             x25, FPCR
 
+
        str             lr, [x0, SS64_PC]                                       // Save ELR to PCB
        str             w23, [x0, SS64_CPSR]                            // Save CPSR to PCB
        str             w24, [x0, NS64_FPSR]
@@ -372,6 +373,8 @@ Lel1_sp0_serror_vector_long:
 .endmacro
 
 Lel1_sp1_synchronous_vector_long:
+       b               check_exception_stack
+Lel1_sp1_synchronous_valid_stack:
 #if defined(KERNEL_INTEGRITY_KTRR)
        b               check_ktrr_sctlr_trap
 Lel1_sp1_synchronous_vector_continue:
@@ -400,7 +403,7 @@ Lel1_sp1_serror_vector_long:
        b               fleh_dispatch64
 
 .macro EL0_64_VECTOR
-       mov             x18, xzr                                                // Zero x18 to avoid leaking data to user SS
+       mov             x18, #0                                                 // Zero x18 to avoid leaking data to user SS
        stp             x0, x1, [sp, #-16]!                                     // Save x0 and x1 to the exception stack
        mrs             x0, TPIDR_EL1                                           // Load the thread register
        mrs             x1, SP_EL0                                                      // Load the user stack pointer
@@ -412,8 +415,8 @@ Lel1_sp1_serror_vector_long:
        msr             SPSel, #0                                                       // Switch to SP0
        stp             x0, x1, [sp, SS64_X0]                           // Save x0, x1 to the user PCB
        stp             fp, lr, [sp, SS64_FP]                           // Save fp and lr to the user PCB
-       mov             fp, xzr                                                         // Clear the fp and lr for the
-       mov             lr, xzr                                                         // debugger stack frame
+       mov             fp, #0                                                          // Clear the fp and lr for the
+       mov             lr, #0                                                          // debugger stack frame
        mov             x0, sp                                                          // Copy the user PCB pointer to x0
 .endmacro
 
@@ -457,6 +460,30 @@ Lel0_serror_vector_64_long:
        b               fleh_dispatch64
 
 
+/*
+ * check_exception_stack
+ *
+ * Verifies that stack pointer at SP1 is within exception stack
+ * If not, will simply hang as we have no more stack to fall back on.
+ */
+       .text
+       .align 2
+check_exception_stack:
+       mrs             x18, TPIDR_EL1                                  // Get thread pointer
+       cbz             x18, Lvalid_exception_stack                     // Thread context may not be set early in boot
+       ldr             x18, [x18, ACT_CPUDATAP]
+       cbz             x18, .                                          // If thread context is set, cpu data should be too
+       ldr             x18, [x18, CPU_EXCEPSTACK_TOP]
+       cmp             sp, x18
+       b.gt            .                                               // Hang if above exception stack top
+       sub             x18, x18, EXCEPSTACK_SIZE_NUM                   // Find bottom of exception stack
+       cmp             sp, x18
+       b.lt            .                                               // Hang if below exception stack bottom
+Lvalid_exception_stack:
+       mov             x18, #0
+       b               Lel1_sp1_synchronous_valid_stack
+
 /*
  * check_kernel_stack
  *
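
The check_exception_stack handler above hangs unless SP1 lies within [excepstack_top - EXCEPSTACK_SIZE, excepstack_top]. Expressed in C, the test reduces to a two-sided bounds check; the size and top values below are placeholders, not the kernel's.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EXCEPSTACK_SIZE_MODEL 0x4000ULL              /* placeholder size */

    /* Mirrors the assembly: valid iff bottom <= sp <= top. */
    static bool sp_on_exception_stack(uint64_t sp, uint64_t top)
    {
        uint64_t bottom = top - EXCEPSTACK_SIZE_MODEL;
        return sp <= top && sp >= bottom;
    }

    int main(void)
    {
        uint64_t top = 0xfffffff01c008000ULL;            /* placeholder excepstack_top */
        printf("%d\n", sp_on_exception_stack(top - 0x100, top));  /* 1 */
        printf("%d\n", sp_on_exception_stack(top + 0x100, top));  /* 0: above the top */
        return 0;
    }
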
@@ -492,17 +519,10 @@ Ltest_kstack:
 Ltest_istack:
        ldr             x1, [x1, ACT_CPUDATAP]                          // Load the cpu data ptr
        ldr             x2, [x1, CPU_INTSTACK_TOP]                      // Get top of istack
-       sub             x3, x2, PGBYTES                                         // Find bottom of istack
+       sub             x3, x2, INTSTACK_SIZE_NUM                       // Find bottom of istack
        cmp             x0, x2                                                          // if (SP_EL0 >= istack top)
-       b.ge    Ltest_fiqstack                                          //    jump to fiqstack test
-       cmp             x0, x3                                                          // if (SP_EL0 > istack bottom)
-       b.gt    Lvalid_stack                                            //    stack pointer valid
-Ltest_fiqstack:
-       ldr             x2, [x1, CPU_FIQSTACK_TOP]                      // Get top of fiqstack
-       sub             x3, x2, PGBYTES                                         // Find bottom of fiqstack
-       cmp             x0, x2                                                          // if (SP_EL0 >= fiqstack top)
        b.ge    Lcorrupt_stack                                          //    corrupt stack pointer
-       cmp             x0, x3                                                          // if (SP_EL0 > fiqstack bottom)
+       cmp             x0, x3                                                          // if (SP_EL0 > istack bottom)
        b.gt    Lvalid_stack                                            //    stack pointer valid
 Lcorrupt_stack:
        INIT_SAVED_STATE_FLAVORS sp, w0, w1
@@ -570,32 +590,32 @@ fleh_dispatch64:
        cmp             x23, #(PSR64_MODE_EL0)
        bne             1f
 
-       mov             x2, xzr
-       mov             x3, xzr
-       mov             x4, xzr
-       mov             x5, xzr
-       mov             x6, xzr
-       mov             x7, xzr
-       mov             x8, xzr
-       mov             x9, xzr
-       mov             x10, xzr
-       mov             x11, xzr
-       mov             x12, xzr
-       mov             x13, xzr
-       mov             x14, xzr
-       mov             x15, xzr
-       mov             x16, xzr
-       mov             x17, xzr
-       mov             x18, xzr
-       mov             x19, xzr
-       mov             x20, xzr
+       mov             x2, #0
+       mov             x3, #0
+       mov             x4, #0
+       mov             x5, #0
+       mov             x6, #0
+       mov             x7, #0
+       mov             x8, #0
+       mov             x9, #0
+       mov             x10, #0
+       mov             x11, #0
+       mov             x12, #0
+       mov             x13, #0
+       mov             x14, #0
+       mov             x15, #0
+       mov             x16, #0
+       mov             x17, #0
+       mov             x18, #0
+       mov             x19, #0
+       mov             x20, #0
        /* x21, x22 cleared in common case below */
-       mov             x23, xzr
-       mov             x24, xzr
-       mov             x25, xzr
-       mov             x26, xzr
-       mov             x27, xzr
-       mov             x28, xzr
+       mov             x23, #0
+       mov             x24, #0
+       mov             x25, #0
+       mov             x26, #0
+       mov             x27, #0
+       mov             x28, #0
        /* fp/lr already cleared by EL0_64_VECTOR */
 1:
 
@@ -910,7 +930,6 @@ check_user_asts:
        // return_to_user, the latter will have to change.
        //
 
-
 exception_return:
        msr             DAIFSet, #DAIFSC_ALL                            // Disable exceptions
        mrs             x3, TPIDR_EL1                                   // Load thread pointer
@@ -1021,7 +1040,7 @@ Lexception_return_restore_registers:
        mrs             x18, TTBR0_EL1
        bic             x18, x18, #(1 << TTBR_ASID_SHIFT)
        msr             TTBR0_EL1, x18
-       mov             x18, xzr
+       mov             x18, #0
 
        /* We don't need an ISB here, as the eret is synchronizing. */
 Lskip_ttbr1_switch:
@@ -1044,7 +1063,7 @@ user_set_debug_state_and_return:
        POP_FRAME
        isb
        mrs             x3, TPIDR_EL1                                           // Reload thread pointer
-       b               exception_return                                        // And continue
+       b               exception_return                        // And continue
 
        .text
        .align 2
index 1eec5310480f2c12ed2d6083045eb5cd919e554b..e211448c7c4fcbd1bc3cf696314e7052c625892c 100644 (file)
 
 #define INT_SIZE        (BYTE_SIZE * sizeof (int))
 
-void
-bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes)
+#define BCOPY_PHYS_SRC_IS_PHYS(flags) (((flags) & cppvPsrc) != 0)
+#define BCOPY_PHYS_DST_IS_PHYS(flags) (((flags) & cppvPsnk) != 0)
+#define BCOPY_PHYS_SRC_IS_USER(flags) (((flags) & (cppvPsrc | cppvKmap)) == 0)
+#define BCOPY_PHYS_DST_IS_USER(flags) (((flags) & (cppvPsnk | cppvKmap)) == 0)
+
+static kern_return_t
+bcopy_phys_internal(addr64_t src, addr64_t dst, vm_size_t bytes, int flags)
 {
        unsigned int    src_index;
        unsigned int    dst_index;
@@ -62,49 +67,108 @@ bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes)
        vm_offset_t     dst_offset;
        unsigned int    wimg_bits_src, wimg_bits_dst;
        unsigned int    cpu_num = 0;
-       ppnum_t         pn_src = (ppnum_t)(src >> PAGE_SHIFT);
-       ppnum_t         pn_dst = (ppnum_t)(dst >> PAGE_SHIFT);
-
-#ifdef __ARM_COHERENT_IO__
-       if (pmap_valid_address(src) &&
-           pmap_valid_address(dst) &&
-           (mmu_kvtop_wpreflight(phystokv((pmap_paddr_t) dst)))) {
-               bcopy((char *)phystokv((pmap_paddr_t) src), (char *)phystokv((pmap_paddr_t) dst), bytes);
-               return;
-       }
+       ppnum_t         pn_src;
+       ppnum_t         pn_dst;
+       addr64_t        end __assert_only;
+       kern_return_t   res = KERN_SUCCESS;
+
+       assert(!__improbable(os_add_overflow(src, bytes, &end)));
+       assert(!__improbable(os_add_overflow(dst, bytes, &end)));
+
+       while ((bytes > 0) && (res == KERN_SUCCESS)) {
+               src_offset = src & PAGE_MASK;
+               dst_offset = dst & PAGE_MASK;
+               boolean_t use_copy_window_src = FALSE;
+               boolean_t use_copy_window_dst = FALSE;
+               vm_size_t count = bytes;
+               vm_size_t count2 = bytes;
+               if (BCOPY_PHYS_SRC_IS_PHYS(flags)) {
+                       use_copy_window_src = !pmap_valid_address(src);
+                       pn_src = (ppnum_t)(src >> PAGE_SHIFT);
+#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__
+                       count = PAGE_SIZE - src_offset;
+                       wimg_bits_src = pmap_cache_attributes(pn_src);
+                       if ((wimg_bits_src & VM_WIMG_MASK) != VM_WIMG_DEFAULT)
+                               use_copy_window_src = TRUE;
+#else
+                       if (use_copy_window_src) {
+                               wimg_bits_src = pmap_cache_attributes(pn_src);
+                               count = PAGE_SIZE - src_offset;
+                       }
 #endif
+               }
+               if (BCOPY_PHYS_DST_IS_PHYS(flags)) {
+                       // write preflighting needed for things like dtrace which may write static read-only mappings
+                       use_copy_window_dst = (!pmap_valid_address(dst) || !mmu_kvtop_wpreflight(phystokv((pmap_paddr_t)dst)));
+                       pn_dst = (ppnum_t)(dst >> PAGE_SHIFT);
+#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__
+                       count2 = PAGE_SIZE - dst_offset;
+                       wimg_bits_dst = pmap_cache_attributes(pn_dst);
+                       if ((wimg_bits_dst & VM_WIMG_MASK) != VM_WIMG_DEFAULT)
+                               use_copy_window_dst = TRUE;
+#else
+                       if (use_copy_window_dst) {
+                               wimg_bits_dst = pmap_cache_attributes(pn_dst);
+                               count2 = PAGE_SIZE - dst_offset;
+                       }
+#endif
+               }
 
-       wimg_bits_src = pmap_cache_attributes(pn_src);
-       wimg_bits_dst = pmap_cache_attributes(pn_dst);
+               char *tmp_src;
+               char *tmp_dst;
 
-#ifndef        __ARM_COHERENT_IO__
-       if (((wimg_bits_src & VM_WIMG_MASK) == VM_WIMG_DEFAULT) && 
-               ((wimg_bits_dst & VM_WIMG_MASK) == VM_WIMG_DEFAULT) &&
-               (mmu_kvtop_wpreflight(phystokv((pmap_paddr_t) dst)))) {
-               /* Fast path - dst is writable and both source and destination have default attributes */
-               bcopy((char *)phystokv((pmap_paddr_t) src), (char *)phystokv((pmap_paddr_t) dst), bytes);
-               return;
-       }
-#endif
+               if (use_copy_window_src || use_copy_window_dst) {
+                       mp_disable_preemption();
+                       cpu_num = cpu_number();
+               }
+
+               if (use_copy_window_src) {
+                       src_index = pmap_map_cpu_windows_copy(pn_src, VM_PROT_READ, wimg_bits_src);
+                       tmp_src = (char*)(pmap_cpu_windows_copy_addr(cpu_num, src_index) + src_offset);
+               } else if (BCOPY_PHYS_SRC_IS_PHYS(flags)) {
+                       tmp_src = (char*)phystokv_range((pmap_paddr_t)src, &count);
+               } else {
+                       tmp_src = (char*)src;
+               }
+               if (use_copy_window_dst) {
+                       dst_index = pmap_map_cpu_windows_copy(pn_dst, VM_PROT_READ | VM_PROT_WRITE, wimg_bits_dst);
+                       tmp_dst = (char*)(pmap_cpu_windows_copy_addr(cpu_num, dst_index) + dst_offset);
+               } else if (BCOPY_PHYS_DST_IS_PHYS(flags)) {
+                       tmp_dst = (char*)phystokv_range((pmap_paddr_t)dst, &count2);
+               } else {
+                       tmp_dst = (char*)dst;
+               }
 
-       src_offset = src & PAGE_MASK;
-       dst_offset = dst & PAGE_MASK;
+               if (count > count2)
+                       count = count2;
+               if (count > bytes)
+                       count = bytes;
 
-       if ((src_offset + bytes) > PAGE_SIZE || (dst_offset + bytes) > PAGE_SIZE)
-               panic("bcopy extends beyond copy windows");
+               if (BCOPY_PHYS_SRC_IS_USER(flags))
+                       res = copyin((user_addr_t)src, tmp_dst, count);
+               else if (BCOPY_PHYS_DST_IS_USER(flags))
+                       res = copyout(tmp_src, (user_addr_t)dst, count);
+               else
+                       bcopy(tmp_src, tmp_dst, count);
 
-       mp_disable_preemption();
-       cpu_num = cpu_number();
-       src_index = pmap_map_cpu_windows_copy(pn_src, VM_PROT_READ, wimg_bits_src);
-       dst_index = pmap_map_cpu_windows_copy(pn_dst, VM_PROT_READ|VM_PROT_WRITE, wimg_bits_dst);
+               if (use_copy_window_src)
+                       pmap_unmap_cpu_windows_copy(src_index);
+               if (use_copy_window_dst)
+                       pmap_unmap_cpu_windows_copy(dst_index);
+               if (use_copy_window_src || use_copy_window_dst)
+                       mp_enable_preemption();
 
-       bcopy((char *)(pmap_cpu_windows_copy_addr(cpu_num, src_index) + src_offset),
-             (char *)(pmap_cpu_windows_copy_addr(cpu_num, dst_index) + dst_offset),
-             bytes);
+               src += count;
+               dst += count;
+               bytes -= count;
+       }
+       return res;
+}
 
-       pmap_unmap_cpu_windows_copy(src_index);
-       pmap_unmap_cpu_windows_copy(dst_index);
-       mp_enable_preemption();
+void
+bcopy_phys(addr64_t src, addr64_t dst, vm_size_t bytes)
+{
+       bcopy_phys_internal(src, dst, bytes, cppvPsrc | cppvPsnk);
 }
 
 void
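
bcopy_phys_internal() keys everything off copypv-style flag bits: cppvPsrc/cppvPsnk mark an address as physical, and an address that is neither physical nor covered by cppvKmap is a user virtual address that must go through copyin/copyout. A tiny model of that decode; the flag values here are illustrative, not the kernel's cppv* constants.

    #include <stdio.h>

    /* Illustrative flag bits standing in for cppvPsrc / cppvPsnk / cppvKmap. */
    #define P_SRC   0x1   /* source address is physical      */
    #define P_SNK   0x2   /* destination address is physical */
    #define KMAP    0x4   /* virtual addresses are kernel    */

    #define SRC_IS_PHYS(f)  (((f) & P_SRC) != 0)
    #define DST_IS_PHYS(f)  (((f) & P_SNK) != 0)
    #define SRC_IS_USER(f)  (((f) & (P_SRC | KMAP)) == 0)
    #define DST_IS_USER(f)  (((f) & (P_SNK | KMAP)) == 0)

    static void describe(int flags)
    {
        printf("src: %s, dst: %s\n",
               SRC_IS_PHYS(flags) ? "phys" : SRC_IS_USER(flags) ? "user" : "kernel",
               DST_IS_PHYS(flags) ? "phys" : DST_IS_USER(flags) ? "user" : "kernel");
    }

    int main(void)
    {
        describe(P_SRC | P_SNK);   /* plain bcopy_phys: both physical */
        describe(P_SRC);           /* physical -> user: copyout path  */
        describe(P_SNK);           /* user -> physical: copyin path   */
        return 0;
    }
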
@@ -119,48 +183,53 @@ bzero_phys(addr64_t src, vm_size_t bytes)
 {
        unsigned int    wimg_bits;
        unsigned int    cpu_num = cpu_number();
-       ppnum_t         pn = (ppnum_t)(src >> PAGE_SHIFT);
+       ppnum_t         pn;
+       addr64_t        end __assert_only;
 
-#ifdef __ARM_COHERENT_IO__
-       if (pmap_valid_address(src)) {
-               bzero((char *)phystokv((pmap_paddr_t) src), bytes);
-               return;
-       }
-#endif
+       assert(!__improbable(os_add_overflow(src, bytes, &end)));
 
-       wimg_bits = pmap_cache_attributes(pn);
+       vm_offset_t offset = src & PAGE_MASK;
+       while (bytes > 0) {
+               vm_size_t count = bytes;
 
-#ifndef        __ARM_COHERENT_IO__
-       if ((wimg_bits & VM_WIMG_MASK) == VM_WIMG_DEFAULT) {
-               /* Fast path - default attributes */
-               bzero((char *)phystokv((pmap_paddr_t) src), bytes);
-               return;
-       }
+               boolean_t use_copy_window = !pmap_valid_address(src);
+               pn = (ppnum_t)(src >> PAGE_SHIFT);
+#if !defined(__ARM_COHERENT_IO__) && !__ARM_PTE_PHYSMAP__
+               count = PAGE_SIZE - offset;
+               wimg_bits = pmap_cache_attributes(pn);
+               if ((wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT)
+                       use_copy_window = TRUE;
+#else
+               if (use_copy_window) {
+                       wimg_bits = pmap_cache_attributes(pn);
+                       count = PAGE_SIZE - offset;
+               }
 #endif
-
-       mp_disable_preemption();
-       cpu_num = cpu_number();
-
-       while (bytes > 0) {
-               vm_offset_t offset = src & PAGE_MASK;
-               uint64_t count = PAGE_SIZE - offset;
+               char *buf;
+               unsigned int index;
+               if (use_copy_window) {
+                       mp_disable_preemption();
+                       cpu_num = cpu_number();
+                       index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg_bits);
+                       buf = (char *)(pmap_cpu_windows_copy_addr(cpu_num, index) + offset);
+               } else {
+                       buf = (char *)phystokv_range((pmap_paddr_t)src, &count);
+               }
 
                if (count > bytes)
                        count = bytes;
 
-               pn = (ppnum_t)(src >> PAGE_SHIFT);
-
-               unsigned int index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg_bits);
-
-               bzero((char *)(pmap_cpu_windows_copy_addr(cpu_num, index) + offset), count);
+               bzero(buf, count);
 
-               pmap_unmap_cpu_windows_copy(index);
+               if (use_copy_window) {
+                       pmap_unmap_cpu_windows_copy(index);
+                       mp_enable_preemption();
+               }
 
                src += count;
                bytes -= count;
+               offset = 0;
        }
-
-       mp_enable_preemption();
 }
 
 /*
@@ -174,13 +243,17 @@ ml_phys_read_data(pmap_paddr_t paddr, int size)
        unsigned int   index;
        unsigned int   wimg_bits;
        ppnum_t        pn = (ppnum_t)(paddr >> PAGE_SHIFT);
+       ppnum_t        pn_end = (ppnum_t)((paddr + size - 1) >> PAGE_SHIFT); 
        unsigned long  long result = 0;
        vm_offset_t    copywindow_vaddr = 0;
        unsigned char  s1;
        unsigned short s2;
        unsigned int   s4;
 
-#ifdef __ARM_COHERENT_IO__
+       if (__improbable(pn_end != pn))
+               panic("%s: paddr 0x%llx spans a page boundary", __func__, (uint64_t)paddr);
+
+#if defined(__ARM_COHERENT_IO__) || __ARM_PTE_PHYSMAP__
        if (pmap_valid_address(paddr)) {
                switch (size) {
                case 1:
@@ -301,9 +374,13 @@ ml_phys_write_data(pmap_paddr_t paddr, unsigned long long data, int size)
        unsigned int    index;
        unsigned int    wimg_bits;
        ppnum_t         pn = (ppnum_t)(paddr >> PAGE_SHIFT);
+       ppnum_t         pn_end = (ppnum_t)((paddr + size - 1) >> PAGE_SHIFT); 
        vm_offset_t     copywindow_vaddr = 0;
 
-#ifdef __ARM_COHERENT_IO__
+       if (__improbable(pn_end != pn))
+               panic("%s: paddr 0x%llx spans a page boundary", __func__, (uint64_t)paddr);
+
+#if defined(__ARM_COHERENT_IO__) || __ARM_PTE_PHYSMAP__
        if (pmap_valid_address(paddr)) {
                switch (size) {
                case 1:
@@ -539,53 +616,21 @@ memcmp(const void *s1, const void *s2, size_t n)
 kern_return_t
 copypv(addr64_t source, addr64_t sink, unsigned int size, int which)
 {
-       kern_return_t   retval = KERN_SUCCESS;
-       void            *from, *to;
-#ifndef        __ARM_COHERENT_IO__
-       unsigned int    from_wimg_bits, to_wimg_bits;
-#endif
+       if ((which & (cppvPsrc | cppvPsnk)) == 0)       /* Make sure that only one is virtual */
+               panic("%s: no more than 1 parameter may be virtual", __func__);
 
-       from = CAST_DOWN(void *, source);
-       to = CAST_DOWN(void *, sink);
-
-       if ((which & (cppvPsrc | cppvPsnk)) == 0)       /* Make sure that only
-                                                        * one is virtual */
-               panic("copypv: no more than 1 parameter may be virtual\n");     /* Not allowed */
-
-       if (which & cppvPsrc)
-               from = (void *)phystokv(from);
-       if (which & cppvPsnk)
-               to = (void *)phystokv(to);
-
-       if ((which & (cppvPsrc | cppvKmap)) == 0)       /* Source is virtual in
-                                                        * current map */
-               retval = copyin((user_addr_t) from, to, size);
-       else if ((which & (cppvPsnk | cppvKmap)) == 0)  /* Sink is virtual in
-                                                        * current map */
-               retval = copyout(from, (user_addr_t) to, size);
-       else                    /* both addresses are physical or kernel map */
-               bcopy(from, to, size);
-
-#ifndef        __ARM_COHERENT_IO__
-       if (which & cppvFsrc) {
-               flush_dcache64(source, size, ((which & cppvPsrc) == cppvPsrc));
-       } else if (which & cppvPsrc) {
-               from_wimg_bits = pmap_cache_attributes(source >> PAGE_SHIFT);
-               if ((from_wimg_bits != VM_WIMG_COPYBACK) && (from_wimg_bits != VM_WIMG_WTHRU))
-                       flush_dcache64(source, size, TRUE);
-       }
+       kern_return_t res = bcopy_phys_internal(source, sink, size, which);
 
-       if (which & cppvFsnk) {
-               flush_dcache64(sink, size, ((which & cppvPsnk) == cppvPsnk));
-       } else if (which & cppvPsnk) { 
-               to_wimg_bits = pmap_cache_attributes(sink >> PAGE_SHIFT);
-               if (to_wimg_bits != VM_WIMG_COPYBACK)
-                       flush_dcache64(sink, size, TRUE);
-       }
+#ifndef __ARM_COHERENT_IO__
+       if (which & cppvFsrc)
+               flush_dcache64(source, size, ((which & cppvPsrc) == cppvPsrc));
+
+       if (which & cppvFsnk)
+               flush_dcache64(sink, size, ((which & cppvPsnk) == cppvPsnk));
 #endif
-       return retval;
-}
 
+       return res;
+}
 
 #if     MACH_ASSERT
 
index a04ef6d76737813d2faabf02006c9114c2a0bad3..6cd326a3048fb3c00963da01d42938d75fb2d5cd 100644 (file)
@@ -66,21 +66,21 @@ lowglo lowGlo __attribute__ ((aligned(PAGE_MAX_SIZE))) = {
        .lgManualPktAddr = (uint64_t) &manual_pkt,
 #endif
        .lgPmapMemQ = (uint64_t)&(pmap_object_store.memq),
-       .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, phys_page),
-       .lgPmapMemChainOffset = offsetof(struct vm_page, listq),
+       .lgPmapMemPageOffset = offsetof(struct vm_page_with_ppnum, vmp_phys_page),
+       .lgPmapMemChainOffset = offsetof(struct vm_page, vmp_listq),
        .lgPmapMemPagesize = (uint64_t)sizeof(struct vm_page),
        .lgPmapMemFromArrayMask = VM_PACKED_FROM_VM_PAGES_ARRAY,
        .lgPmapMemPackedShift = VM_PACKED_POINTER_SHIFT,
        .lgPmapMemPackedBaseAddr = VM_MIN_KERNEL_AND_KEXT_ADDRESS,
        .lgPmapMemStartAddr = -1,
        .lgPmapMemEndAddr = -1,
-       .lgPmapMemFirstppnum = -1
+       .lgPmapMemFirstppnum = -1,
+       .lgPageShift = ARM_PGSHIFT
 };
 
 void patch_low_glo(void)
 {
        lowGlo.lgStext = (uint64_t)vm_kernel_stext;
-       lowGlo.lgPageShift = PAGE_SHIFT;
 }
 
 void patch_low_glo_static_region(uint64_t address, uint64_t size)
@@ -95,4 +95,5 @@ void patch_low_glo_vm_page_info(void * start_addr, void * end_addr, uint32_t fir
        lowGlo.lgPmapMemStartAddr = (uint64_t)start_addr;
        lowGlo.lgPmapMemEndAddr = (uint64_t)end_addr;
        lowGlo.lgPmapMemFirstppnum = first_ppnum;
+       lowGlo.lgPageShift = PAGE_SHIFT;
 }
index 2c7353465b646138f819377efca9c5320635a063..632776e956640d2f811c67b4a169a7aa80cf214a 100644 (file)
@@ -26,6 +26,7 @@
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 #include <vm/lz4_assembly_select.h>
+#include <arm64/asm.h>
 #if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64
 
 /*
 #define src_good          x20
 
 .macro establish_frame
+    ARM64_STACK_PROLOG
     stp     fp, lr,    [sp, #-16]!
     mov     fp, sp
 .endm
 
 .macro clear_frame_and_return
     ldp     fp, lr,    [sp], #16
-    ret     lr
+    ARM64_STACK_EPILOG
 .endm
 
 // copy_1x16 SOURCE_ADDR DESTINATION_ADDR
index bf94a8536e28be237c367ff68a0127b99f0ae509..1c5a51e8c448b4f28e54ed2bec483a3cfc9ef4b4 100644 (file)
@@ -28,6 +28,7 @@
 
 #include <vm/lz4_assembly_select.h>
 #include <vm/lz4_constants.h>
+#include <arm64/asm.h>
 
 #if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
 
@@ -54,6 +55,7 @@
 _lz4_encode_2gb:
 
     // establish frame
+    ARM64_STACK_PROLOG
     stp     fp, lr,    [sp, #-16]!
     mov     fp, sp
 
@@ -391,7 +393,7 @@ L_done:
 
     // clear frame
     ldp     fp, lr,    [sp], #16
-    ret     lr
+    ARM64_STACK_EPILOG
 
 L_revert_x9_and_done:
     sub x9, x9, #1
index 1ba778dc74ec5cad9918da2dbd29398ad2690e05..7f1b0b6408ce89c944f2df937ae663fe397b5fd4 100644 (file)
 typedef uint64_t kpc_config_t;
 
 #define KPC_ARM64_FIXED_COUNT        (2)
-#if NO_MONITOR
-/* Addition of 2 counters to the SoC happens to coincide with removal of
- * EL3 monitor.   If this changes again in the future, consider moving
- * counter config to per-SoC headers. */
-#define KPC_ARM64_CONFIGURABLE_COUNT (8)
-#else
-#define KPC_ARM64_CONFIGURABLE_COUNT (6)
-#endif
+#define KPC_ARM64_CONFIGURABLE_COUNT (CORE_NCTRS - KPC_ARM64_FIXED_COUNT)
 
 #define KPC_ARM64_COUNTER_WIDTH    (47)
 #define KPC_ARM64_COUNTER_MASK     ((UINT64_C(1) << KPC_ARM64_COUNTER_WIDTH) - 1)
index f6a5db38c0cca74cbc6cec553feea0922180fcc1..e9a22430d930cc0b31daebf392371a9d9ba14eee 100644 (file)
@@ -56,6 +56,8 @@
 #include <libkern/kernel_mach_header.h>
 #endif
 
+#include <libkern/section_keywords.h>
+
 #if KPC
 #include <kern/kpc.h>
 #endif
@@ -73,16 +75,23 @@ boolean_t is_clock_configured = FALSE;
 extern int mach_assert;
 extern volatile uint32_t debug_enabled;
 
+extern vm_offset_t   segEXTRADATA;
+extern vm_offset_t   segLOWESTTEXT;
+extern vm_offset_t   segLASTB;
+extern unsigned long segSizeLAST;
+
 
 void machine_conf(void);
 
 thread_t Idle_context(void);
 
-static uint32_t cpu_phys_ids[MAX_CPUS] = {[0 ... MAX_CPUS - 1] = (uint32_t)-1};
-static unsigned int avail_cpus = 0;
-static int boot_cpu = -1;
-static int max_cpu_number = 0;
-cluster_type_t boot_cluster = CLUSTER_TYPE_SMP;
+SECURITY_READ_ONLY_LATE(static uint32_t) cpu_phys_ids[MAX_CPUS] = {[0 ... MAX_CPUS - 1] = (uint32_t)-1};
+SECURITY_READ_ONLY_LATE(static unsigned int) avail_cpus = 0;
+SECURITY_READ_ONLY_LATE(static int) boot_cpu = -1;
+SECURITY_READ_ONLY_LATE(static int) max_cpu_number = 0;
+SECURITY_READ_ONLY_LATE(cluster_type_t) boot_cluster = CLUSTER_TYPE_SMP;
+
+SECURITY_READ_ONLY_LATE(static uint32_t) fiq_eventi = UINT32_MAX; 
 
 lockdown_handler_t lockdown_handler;
 void *lockdown_this;
@@ -183,19 +192,6 @@ pmap_paddr_t get_mmu_ttb(void)
        return value;
 }
 
-MARK_AS_PMAP_TEXT
-void set_mmu_ttb(pmap_paddr_t value)
-{
-#if __ARM_KERNEL_PROTECT__
-       /* All EL1-mode ASIDs are odd. */
-       value |= (1ULL << TTBR_ASID_SHIFT);
-#endif /* __ARM_KERNEL_PROTECT__ */
-
-       __builtin_arm_dsb(DSB_ISH);
-       MSR("TTBR0_EL1", value);
-       __builtin_arm_isb(ISB_SY);
-}
-
 static uint32_t get_midr_el1(void)
 {
        uint64_t value;
@@ -267,7 +263,15 @@ void rorgn_stash_range(void)
        }
 #endif
 
-       /* Get the AMC values, and stash them into rorgn_begin, rorgn_end. */
+       /* Get the AMC values, and stash them into rorgn_begin, rorgn_end.
+        * gPhysBase is the base of DRAM managed by xnu. We need DRAM_BASE because
+        * the AMCC RO region begin/end registers are in units of 16KB page
+        * numbers from DRAM_BASE, so we truncate gPhysBase to a 512MB granule
+        * and assert the value is the canonical DRAM_BASE PA of 0x8_0000_0000 for arm64.
+        */
+
+       uint64_t dram_base = gPhysBase & ~0x1FFFFFFFULL;  /* 512MB */
+       assert(dram_base == 0x800000000ULL);
 
 #if defined(KERNEL_INTEGRITY_KTRR)
        uint64_t soc_base = 0;
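
The stash code above derives dram_base by clearing the low 29 bits of gPhysBase (a 512MB granule), because the AMCC begin/end registers count 16KB pages from DRAM_BASE rather than from the kernel's physical base. The mask arithmetic, worked through with an example base value (the base and register value below are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Example physical base of xnu-managed DRAM. */
        uint64_t phys_base = 0x801200000ULL;

        /* Truncate to a 512MB (2^29) boundary, as rorgn_stash_range() does. */
        uint64_t dram_base = phys_base & ~0x1FFFFFFFULL;

        printf("phys_base  %#llx\n", (unsigned long long)phys_base);
        printf("dram_base  %#llx\n", (unsigned long long)dram_base);   /* 0x800000000 */

        /* A RO-region register value of N then maps to dram_base + (N << 14),
         * i.e. N 16KB pages above DRAM_BASE. */
        uint64_t reg = 0x100;
        printf("reg 0x100 -> %#llx\n", (unsigned long long)(dram_base + (reg << 14)));
        return 0;
    }
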
@@ -288,8 +292,8 @@ void rorgn_stash_range(void)
 
 #if defined(KERNEL_INTEGRITY_KTRR)
        assert(rRORGNENDADDR > rRORGNBASEADDR);
-       rorgn_begin = (rRORGNBASEADDR << ARM_PGSHIFT) + gPhysBase;
-       rorgn_end   = (rRORGNENDADDR << ARM_PGSHIFT) + gPhysBase;
+       rorgn_begin = (rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base;
+       rorgn_end   = (rRORGNENDADDR << AMCC_PGSHIFT) + dram_base;
 #else
 #error KERNEL_INTEGRITY config error
 #endif /* defined (KERNEL_INTEGRITY_KTRR) */
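Editor's note: a worked example of the new stashing math, with assumed values (AMCC_PGSHIFT is taken to be 14 for 16KB pages; the register value is illustrative):

    /* Sketch: convert AMCC register page numbers into physical addresses.
     * rRORGNBASEADDR/rRORGNENDADDR hold 16KB-page indices relative to DRAM_BASE. */
    uint64_t dram_base   = gPhysBase & ~0x1FFFFFFFULL;                      /* e.g. 0x800000000 */
    uint64_t rorgn_begin = ((uint64_t)rRORGNBASEADDR << AMCC_PGSHIFT) + dram_base;
    /* e.g. a base register value of 0x400 maps to 0x800000000 + (0x400 << 14) = 0x801000000 */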
@@ -358,7 +362,7 @@ static void assert_amcc_cache_disabled() {
 void rorgn_lockdown(void)
 {
        vm_offset_t ktrr_begin, ktrr_end;
-       unsigned long plt_segsz, last_segsz;
+       unsigned long last_segsz;
 
 #if DEVELOPMENT || DEBUG
        boolean_t ktrr_disable = FALSE;
@@ -377,22 +381,24 @@ void rorgn_lockdown(void)
        assert_unlocked();
 
        /* [x] - Use final method of determining all kernel text range or expect crashes */
-
-       ktrr_begin = (uint64_t) getsegdatafromheader(&_mh_execute_header, "__PRELINK_TEXT", &plt_segsz);
+       ktrr_begin = segEXTRADATA;
        assert(ktrr_begin && gVirtBase && gPhysBase);
 
        ktrr_begin = kvtophys(ktrr_begin);
 
+       ktrr_end   = kvtophys(segLASTB);
+       last_segsz = segSizeLAST;
+#if defined(KERNEL_INTEGRITY_KTRR)
        /* __LAST is not part of the MMU KTRR region (it is however part of the AMCC KTRR region) */
-       ktrr_end = (uint64_t) getsegdatafromheader(&_mh_execute_header, "__LAST", &last_segsz);
-       ktrr_end = (kvtophys(ktrr_end) - 1) & ~PAGE_MASK;
-
+       ktrr_end = (ktrr_end - 1) & ~AMCC_PGMASK;
        /* ensure that iboot and xnu agree on the ktrr range */
        assert(rorgn_begin == ktrr_begin && rorgn_end == (ktrr_end + last_segsz));
        /* assert that __LAST segment containing privileged insns is only a single page */
        assert(last_segsz == PAGE_SIZE);
+#endif
+
 
-#if DEBUG
+#if DEBUG || DEVELOPMENT
        printf("KTRR Begin: %p End: %p, setting lockdown\n", (void *)ktrr_begin, (void *)ktrr_end);
 #endif
 
@@ -401,7 +407,7 @@ void rorgn_lockdown(void)
        assert_amcc_cache_disabled();
 
        CleanPoC_DcacheRegion_Force(phystokv(ktrr_begin),
-               (unsigned)((ktrr_end + last_segsz) - ktrr_begin + PAGE_MASK));
+               (unsigned)((ktrr_end + last_segsz) - ktrr_begin + AMCC_PGMASK));
 
        lock_amcc();
 
@@ -446,7 +452,7 @@ void machine_lockdown_preflight(void)
 #if CONFIG_KERNEL_INTEGRITY
 
 #if defined(KERNEL_INTEGRITY_KTRR)
-       rorgn_stash_range();
+       rorgn_stash_range();
 #endif
 
 #endif
@@ -470,13 +476,13 @@ void machine_lockdown(void)
 
 
 #if defined(KERNEL_INTEGRITY_KTRR)
-        /* KTRR
-         *
-         * Lock physical KTRR region. KTRR region is read-only. Memory outside
-         * the region is not executable at EL1.
-         */
+       /* KTRR
+        *
+        * Lock physical KTRR region. KTRR region is read-only. Memory outside
+        * the region is not executable at EL1.
+        */
 
-         rorgn_lockdown();
+       rorgn_lockdown();
 #endif /* defined(KERNEL_INTEGRITY_KTRR)*/
 
 
@@ -833,6 +839,16 @@ ml_parse_cpu_topology(void)
 
        if (boot_cpu == -1)
                panic("unable to determine boot cpu!");
+
+       /*
+        * Set TPIDRRO_EL0 to indicate the correct cpu number, as we may
+        * not be booting from cpu 0.  Userspace will consume the current
+        * CPU number through this register.  For non-boot cores, this is
+        * done in start.s (start_cpu) using the cpu_number field of the
+        * per-cpu data object.
+        */
+       assert(__builtin_arm_rsr64("TPIDRRO_EL0") == 0);
+       __builtin_arm_wsr64("TPIDRRO_EL0", (uint64_t)boot_cpu);
 }
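Editor's note: since the boot CPU's number is now published in TPIDRRO_EL0, user code can read its current CPU number without a syscall. A minimal sketch of the EL0 side (the register read is architectural; treating the whole value as the cpu number matches the write above, but how library code actually consumes it is an assumption here):

    #include <stdint.h>

    /* Sketch: read the CPU number the kernel stashed in TPIDRRO_EL0. */
    static inline uint64_t
    current_cpu_number(void)
    {
        uint64_t tpidrro;
        __asm__ volatile ("mrs %0, TPIDRRO_EL0" : "=r"(tpidrro));
        return tpidrro;   /* assumed: the full register holds cpu_number on this xnu version */
    }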
 
 unsigned int
@@ -875,6 +891,7 @@ void ml_lockdown_init() {
     assert(lockdown_handler_grp != NULL);
 
     lck_mtx_init(&lockdown_handler_lck, lockdown_handler_grp, NULL);
+
 }
 
 kern_return_t
@@ -1006,7 +1023,7 @@ ml_processor_register(
 #endif
 
        if (!is_boot_cpu) {
-               prng_cpu_init(this_cpu_datap->cpu_number);
+               early_random_cpu_init(this_cpu_datap->cpu_number);
                // now let next CPU register itself
                OSIncrementAtomic((SInt32*)&real_ncpus);
        }
@@ -1056,20 +1073,6 @@ cause_ast_check(
        }
 }
 
-
-/*
- *     Routine:        ml_at_interrupt_context
- *     Function:       Check if running at interrupt context
- */
-boolean_t
-ml_at_interrupt_context(void)
-{
-       unsigned int    local;
-       vm_offset_t     intstack_top_ptr;
-
-       intstack_top_ptr = getCpuDatap()->intstack_top;
-       return (((vm_offset_t)(&local) < intstack_top_ptr) && ((vm_offset_t)(&local) > (intstack_top_ptr - INTSTACK_SIZE)));
-}
 extern uint32_t cpu_idle_count;
 
 void ml_get_power_state(boolean_t *icp, boolean_t *pidlep) {
@@ -1128,14 +1131,21 @@ ml_static_ptovirt(
 }
 
 vm_offset_t
-ml_static_vtop(
-                 vm_offset_t vaddr)
+ml_static_slide(
+       vm_offset_t vaddr)
 {
-       if (((vm_address_t)(vaddr) - gVirtBase) >= gPhysSize)
-               panic("ml_static_ptovirt(): illegal vaddr: %p\n", (void*)vaddr);
-       return ((vm_address_t)(vaddr) - gVirtBase + gPhysBase);
+       return phystokv(vaddr + vm_kernel_slide - gVirtBase + gPhysBase);
 }
 
+vm_offset_t
+ml_static_unslide(
+       vm_offset_t vaddr)
+{
+       return (ml_static_vtop(vaddr) - gPhysBase + gVirtBase - vm_kernel_slide) ;
+}
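Editor's note: the slide/unslide pair is plain offset arithmetic around the KASLR slide. A hedged sketch of the round-trip, assuming the static-region identity phystokv(p) == p - gPhysBase + gVirtBase:

    /* Sketch: round-tripping an unslid (link-time) kernel VA. */
    static void
    slide_round_trip_example(vm_offset_t unslid_va)
    {
            vm_offset_t slid = ml_static_slide(unslid_va);   /* == unslid_va + vm_kernel_slide */
            assert(ml_static_unslide(slid) == unslid_va);
    }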
+
+extern tt_entry_t *arm_kva_to_tte(vm_offset_t va);
+
 kern_return_t
 ml_static_protect(
        vm_offset_t vaddr, /* kernel virtual address */
@@ -1181,21 +1191,12 @@ ml_static_protect(
             vaddr_cur += PAGE_SIZE) {
                ppn = pmap_find_phys(kernel_pmap, vaddr_cur);
                if (ppn != (vm_offset_t) NULL) {
-#if __ARM64_TWO_LEVEL_PMAP__
                        tt_entry_t      *tte2;
-#else
-                       tt_entry_t      *tte1, *tte2;
-#endif
                        pt_entry_t      *pte_p;
                        pt_entry_t      ptmp;
 
 
-#if __ARM64_TWO_LEVEL_PMAP__
-                       tte2 = &kernel_pmap->tte[(((vaddr_cur) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#else
-                       tte1 = &kernel_pmap->tte[(((vaddr_cur) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)];
-                       tte2 = &((tt_entry_t*) phystokv((*tte1) & ARM_TTE_TABLE_MASK))[(((vaddr_cur) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)];
-#endif
+                       tte2 = arm_kva_to_tte(vaddr_cur);
 
                        if (((*tte2) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
                                if ((((*tte2) & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) &&
@@ -1452,18 +1453,6 @@ machine_choose_processor(__unused processor_set_t pset, processor_t processor)
        return (processor);
 }
 
-vm_offset_t
-ml_stack_remaining(void)
-{
-       uintptr_t local = (uintptr_t) &local;
-
-       if (ml_at_interrupt_context()) {
-           return (local - (getCpuDatap()->intstack_top - INTSTACK_SIZE));
-       } else {
-           return (local - current_thread()->kernel_stack);
-       }
-}
-
 #if KASAN
 vm_offset_t ml_stack_base(void);
 vm_size_t ml_stack_size(void);
@@ -1471,19 +1460,27 @@ vm_size_t ml_stack_size(void);
 vm_offset_t
 ml_stack_base(void)
 {
-       if (ml_at_interrupt_context()) {
-           return getCpuDatap()->intstack_top - INTSTACK_SIZE;
+       uintptr_t local = (uintptr_t) &local;
+       vm_offset_t     intstack_top_ptr;
+
+       intstack_top_ptr = getCpuDatap()->intstack_top;
+       if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
+               return intstack_top_ptr - INTSTACK_SIZE;
        } else {
-           return current_thread()->kernel_stack;
+               return current_thread()->kernel_stack;
        }
 }
 vm_size_t
 ml_stack_size(void)
 {
-       if (ml_at_interrupt_context()) {
-           return INTSTACK_SIZE;
+       uintptr_t local = (uintptr_t) &local;
+       vm_offset_t     intstack_top_ptr;
+
+       intstack_top_ptr = getCpuDatap()->intstack_top;
+       if ((local < intstack_top_ptr) && (local > intstack_top_ptr - INTSTACK_SIZE)) {
+               return INTSTACK_SIZE;
        } else {
-           return kernel_stack_size;
+               return kernel_stack_size;
        }
 }
 #endif
@@ -1605,7 +1602,7 @@ dcache_flush_trap(vm_map_address_t start, vm_map_size_t size)
        vm_offset_t old_recover = thread->recover;
 
        /* Check bounds */
-       if (task_has_64BitAddr(current_task())) {
+       if (task_has_64Bit_addr(current_task())) {
                if (end > MACH_VM_MAX_ADDRESS) {
                        cache_trap_error(thread, end & ((1 << ARM64_CLINE_SHIFT) - 1));
                }
@@ -1622,17 +1619,12 @@ dcache_flush_trap(vm_map_address_t start, vm_map_size_t size)
        /* Set recovery function */
        thread->recover = (vm_address_t)cache_trap_recover;
 
-#if defined(APPLE_ARM64_ARCH_FAMILY)
        /*
         * We're coherent on Apple ARM64 CPUs, so this could be a nop.  However,
         * if the region given us is bad, it would be good to catch it and
         * crash, ergo we still do the flush.
         */
-       assert((size & 0xFFFFFFFF00000000ULL) == 0);
        FlushPoC_DcacheRegion(start, (uint32_t)size);
-#else
-#error "Make sure you don't need to xcall."
-#endif
 
        /* Restore recovery function */
        thread->recover = old_recover;
@@ -1648,7 +1640,7 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size)
        vm_offset_t old_recover = thread->recover;
 
        /* Check bounds */
-       if (task_has_64BitAddr(current_task())) {
+       if (task_has_64Bit_addr(current_task())) {
                if (end > MACH_VM_MAX_ADDRESS) {
                        cache_trap_error(thread, end & ((1 << ARM64_CLINE_SHIFT) - 1));
                }
@@ -1665,15 +1657,14 @@ icache_invalidate_trap(vm_map_address_t start, vm_map_size_t size)
        /* Set recovery function */
        thread->recover = (vm_address_t)cache_trap_recover;
 
-#if defined(APPLE_ARM64_ARCH_FAMILY)
-       /* Clean dcache to unification, except we're coherent on Apple ARM64 CPUs */
-#else
-#error Make sure not cleaning is right for this platform!
-#endif
+       CleanPoU_DcacheRegion(start, (uint32_t) size);
 
        /* Invalidate iCache to point of unification */
-       assert((size & 0xFFFFFFFF00000000ULL) == 0);
+#if __ARM_IC_NOALIAS_ICACHE__
        InvalidatePoU_IcacheRegion(start, (uint32_t)size);
+#else
+       InvalidatePoU_Icache();
+#endif
 
        /* Restore recovery function */
        thread->recover = old_recover;
@@ -1759,7 +1750,17 @@ _enable_virtual_timer(void)
 void
 fiq_context_init(boolean_t enable_fiq __unused)
 {
-#if defined(APPLE_ARM64_ARCH_FAMILY)
+       _enable_timebase_event_stream(fiq_eventi);
+
+       /* Interrupts still disabled. */
+       assert(ml_get_interrupts_enabled() == FALSE);
+       _enable_virtual_timer();
+}
+
+void
+fiq_context_bootstrap(boolean_t enable_fiq)
+{
+#if defined(APPLE_ARM64_ARCH_FAMILY) || defined(BCM2837)
        /* Could fill in our own ops here, if we needed them */
        uint64_t        ticks_per_sec, ticks_per_event, events_per_sec;
        uint32_t        bit_index;
@@ -1775,9 +1776,8 @@ fiq_context_init(boolean_t enable_fiq __unused)
        bit_index = flsll(ticks_per_event) - 1; /* Highest bit set */
 
        /* Round up to power of two */
-       if ((ticks_per_event & ((1 << bit_index) - 1)) != 0) {
+       if ((ticks_per_event & ((1 << bit_index) - 1)) != 0)
                bit_index++;
-       }
 
        /*
         * The timer can only trigger on rising or falling edge,
@@ -1788,89 +1788,11 @@ fiq_context_init(boolean_t enable_fiq __unused)
        if (bit_index != 0)
                bit_index--;
 
-       _enable_timebase_event_stream(bit_index);
+       fiq_eventi = bit_index;
 #else
 #error Need a board configuration.
 #endif
-
-       /* Interrupts still disabled. */
-       assert(ml_get_interrupts_enabled() == FALSE);
-       _enable_virtual_timer();
-}
-
-/*
- * ARM64_TODO: remove me (just a convenience while we don't have crashreporter)
- */
-extern int copyinframe(vm_address_t, char *, boolean_t);
-size_t                 _OSUserBacktrace(char *buffer, size_t bufsize);
-
-size_t _OSUserBacktrace(char *buffer, size_t bufsize) 
-{
-       thread_t thread = current_thread();
-       boolean_t is64bit = thread_is_64bit(thread);
-       size_t trace_size_bytes = 0, lr_size;
-       vm_address_t frame_addr; // Should really by mach_vm_offset_t...
-
-       if (bufsize < 8) {
-               return 0;
-       }
-
-       if (get_threadtask(thread) == kernel_task) {
-               panic("%s: Should never be called from a kernel thread.", __FUNCTION__);
-       }
-
-       frame_addr = get_saved_state_fp(thread->machine.upcb);
-       if (is64bit) {
-               uint64_t frame[2];
-               lr_size = sizeof(frame[1]);
-
-               *((uint64_t*)buffer) = get_saved_state_pc(thread->machine.upcb);
-               trace_size_bytes = lr_size;
-
-               while (trace_size_bytes + lr_size < bufsize) {
-                       if (!(frame_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) {
-                               break;
-                       }
-
-                       if (0 != copyinframe(frame_addr, (char*)frame, TRUE)) {
-                               break;
-                       }
-
-                       *((uint64_t*)(buffer + trace_size_bytes)) = frame[1]; /* lr */
-                       frame_addr = frame[0];
-                       trace_size_bytes += lr_size;
-
-                       if (frame[0] == 0x0ULL) {
-                               break;
-                       }
-               }
-       } else {
-               uint32_t frame[2];
-               lr_size = sizeof(frame[1]);
-
-               *((uint32_t*)buffer) = (uint32_t)get_saved_state_pc(thread->machine.upcb);
-               trace_size_bytes = lr_size;
-
-               while (trace_size_bytes + lr_size < bufsize) {
-                       if (!(frame_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS)) {
-                               break;
-                       }
-
-                       if (0 != copyinframe(frame_addr, (char*)frame, FALSE)) {
-                               break;
-                       }
-
-                       *((uint32_t*)(buffer + trace_size_bytes)) = frame[1]; /* lr */
-                       frame_addr = frame[0];
-                       trace_size_bytes += lr_size;
-
-                       if (frame[0] == 0x0ULL) {
-                               break;
-                       }
-               }
-       }
-
-       return trace_size_bytes;
+       fiq_context_init(enable_fiq);
 }
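Editor's note: fiq_context_bootstrap() now only computes and stashes the timebase event-stream bit index; the stream is actually enabled later by fiq_context_init(). A worked sketch of the bit-index computation with assumed inputs (a 24 MHz timebase and roughly 1000 events per second; the real values come from the clock configuration):

    static uint32_t
    timebase_event_bit_example(void)
    {
            uint64_t ticks_per_event = 24000000 / 1000;        /* assumed: 24 MHz / 1000 = 24000 */
            uint32_t bit_index = flsll(ticks_per_event) - 1;   /* 14: highest set bit */
            if ((ticks_per_event & ((1ULL << bit_index) - 1)) != 0)
                    bit_index++;                               /* 15: round up to a power of two */
            if (bit_index != 0)
                    bit_index--;                               /* 14: edge-trigger adjustment, as above */
            return bit_index;                                  /* stashed in fiq_eventi for fiq_context_init() */
    }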
 
 boolean_t
@@ -1890,7 +1812,7 @@ ml_delay_should_spin(uint64_t interval)
 }
 
 boolean_t ml_thread_is64bit(thread_t thread) {
-       return (thread_is_64bit(thread));
+       return (thread_is_64bit_addr(thread));
 }
 
 void ml_timer_evaluate(void) {
index 04224600684bd51327de71ffff2c2ad35ef37db7..8f51e2a222b7f5f5026a67a9965316edd0c0a4ec 100644 (file)
@@ -35,6 +35,7 @@
 #include "assym.s"
 
 
+
 /*     uint32_t get_fpscr(void):
  *             Returns (FPSR | FPCR).
  */
@@ -82,6 +83,54 @@ LEXT(set_fpscr)
 #endif
        ret
 
+/*
+ * void update_mdscr(unsigned long clear, unsigned long set)
+ *   Clears and sets the specified bits in MDSCR_EL1.
+ *
+ * Setting breakpoints in EL1 is effectively a KTRR bypass. The ability to do so is
+ * controlled by MDSCR.KDE. The MSR to set MDSCR must be present to allow
+ * self-hosted user mode debug. Any checks before the MRS can be skipped with ROP,
+ * so we need to put the checks after the MRS where they can't be skipped. That
+ * still leaves a small window if a breakpoint is set on the instruction
+ * immediately after the MRS. To handle that, we also do a check and then set of
+ * the breakpoint control registers. This allows us to guarantee that a given
+ * core will never have both KDE set and a breakpoint targeting EL1.
+ *
+ * If KDE gets set, unset it and then panic
+ */
+       .align 2
+       .globl EXT(update_mdscr)
+LEXT(update_mdscr)
+       mov     x4, #0
+       mrs     x2, MDSCR_EL1
+       bic     x2, x2, x0
+       orr     x2, x2, x1
+1:
+       bic     x2, x2, #0x2000
+       msr     MDSCR_EL1, x2
+#if defined(CONFIG_KERNEL_INTEGRITY)
+       /*
+        * verify KDE didn't get set (including via ROP)
+        * If set, clear it and then panic
+        */
+       ands    x3, x2, #0x2000
+       orr     x4, x4, x3
+       bne     1b
+       cmp     x4, xzr
+       b.ne    Lupdate_mdscr_panic
+#endif
+       ret
+
+Lupdate_mdscr_panic:
+       adrp    x0, Lupdate_mdscr_panic_str@page
+       add     x0, x0, Lupdate_mdscr_panic_str@pageoff
+       b       EXT(panic)
+       b       .
+
+Lupdate_mdscr_panic_str:
+       .asciz "MDSCR.KDE was set"
+
+
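Editor's note: for reference, the C-visible shape of the new routine and a hedged usage sketch. The bit positions are architectural (MDSCR_EL1.SS is bit 0, MDSCR_EL1.KDE is bit 13, i.e. the 0x2000 checked above, and MDSCR_EL1.MDE is bit 15); the call shown is illustrative, not one of the actual call sites in this commit:

    /* Prototype as implied by the assembly above. */
    extern void update_mdscr(unsigned long clear, unsigned long set);

    #define MDSCR_SS   (1UL << 0)    /* software step */
    #define MDSCR_MDE  (1UL << 15)   /* monitor debug events */

    static void
    enable_monitor_debug_example(void)
    {
            /* Clears SS and sets MDE; KDE (bit 13, 0x2000) can never stick. */
            update_mdscr(MDSCR_SS, MDSCR_MDE);
    }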
 #if __ARM_KERNEL_PROTECT__
 /*
  * __ARM_KERNEL_PROTECT__ adds two complications to TLB management:
@@ -96,6 +145,40 @@ LEXT(set_fpscr)
  */
 #endif /* __ARM_KERNEL_PROTECT__ */
 
+.macro SYNC_TLB_FLUSH
+       dsb     ish
+       isb     sy
+.endmacro
+
+
+/*
+ *     void sync_tlb_flush(void)
+ *
+ *             Synchronize one or more prior TLB flush operations
+ */
+       .text
+       .align 2
+       .globl EXT(sync_tlb_flush)
+LEXT(sync_tlb_flush)
+       SYNC_TLB_FLUSH
+       ret
+
+
+.macro FLUSH_MMU_TLB
+       tlbi    vmalle1is
+.endmacro
+/*
+ *     void flush_mmu_tlb_async(void)
+ *
+ *             Flush all TLBs, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_async)
+LEXT(flush_mmu_tlb_async)
+       FLUSH_MMU_TLB
+       ret
+
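Editor's note: the split into *_async variants plus sync_tlb_flush() lets callers batch several TLB invalidates behind a single DSB/ISB pair. A hedged usage sketch (the function names are the ones declared in this file; the batching pattern itself is the assumption):

    extern void flush_mmu_tlb_entry_async(uint64_t val);
    extern void sync_tlb_flush(void);

    /* Sketch: issue several invalidates, then synchronize once. */
    static void
    flush_two_pages_example(uint64_t tlbi_arg_a, uint64_t tlbi_arg_b)
    {
            flush_mmu_tlb_entry_async(tlbi_arg_a);
            flush_mmu_tlb_entry_async(tlbi_arg_b);
            sync_tlb_flush();        /* one dsb ish + isb sy covers both operations */
    }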
 /*
  *     void flush_mmu_tlb(void)
  *
@@ -105,34 +188,40 @@ LEXT(set_fpscr)
        .align 2
        .globl EXT(flush_mmu_tlb)
 LEXT(flush_mmu_tlb)
-       tlbi    vmalle1is
-       dsb             ish
-       isb             sy
+       FLUSH_MMU_TLB
+       SYNC_TLB_FLUSH
        ret
 
+.macro FLUSH_CORE_TLB
+       tlbi    vmalle1
+.endmacro
+
 /*
- *     void flush_core_tlb(void)
+ *     void flush_core_tlb_async(void)
  *
- *             Flush core TLB
+ *             Flush local core TLB, don't wait for completion
  */
        .text
        .align 2
-       .globl EXT(flush_core_tlb)
-LEXT(flush_core_tlb)
-       tlbi    vmalle1
-       dsb             ish
-       isb             sy
+       .globl EXT(flush_core_tlb_async)
+LEXT(flush_core_tlb_async)
+       FLUSH_CORE_TLB
        ret
 
 /*
- *     void flush_mmu_tlb_allentries(uint64_t, uint64_t)
+ *     void flush_core_tlb(void)
  *
- *             Flush TLB entries
+ *             Flush local core TLB
  */
        .text
        .align 2
-       .globl EXT(flush_mmu_tlb_allentries)
-LEXT(flush_mmu_tlb_allentries)
+       .globl EXT(flush_core_tlb)
+LEXT(flush_core_tlb)
+       FLUSH_CORE_TLB
+       SYNC_TLB_FLUSH
+       ret
+
+.macro FLUSH_MMU_TLB_ALLENTRIES
 #if __ARM_16K_PG__
        and             x0, x0, #~0x3
 
@@ -154,24 +243,37 @@ LEXT(flush_mmu_tlb_allentries)
        add             x1, x1, #0x3
        and             x1, x1, #~0x3
 #endif
-Lflush_mmu_tlb_allentries_loop:
+1: // Lflush_mmu_tlb_allentries_loop:
        tlbi    vaae1is, x0
        add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are 4KB pages, as defined by the ISA
        cmp             x0, x1
-       b.lt    Lflush_mmu_tlb_allentries_loop
-       dsb             ish
-       isb             sy
-       ret
+       b.lt    1b // Lflush_mmu_tlb_allentries_loop
+.endmacro
 
 /*
- *     void flush_mmu_tlb_entry(uint64_t)
+ *     void flush_mmu_tlb_allentries_async(uint64_t, uint64_t)
  *
- *             Flush TLB entry
+ *             Flush TLB entries, don't wait for completion
  */
        .text
        .align 2
-       .globl EXT(flush_mmu_tlb_entry)
-LEXT(flush_mmu_tlb_entry)
+       .globl EXT(flush_mmu_tlb_allentries_async)
+LEXT(flush_mmu_tlb_allentries_async)
+       FLUSH_MMU_TLB_ALLENTRIES
+       ret
+
+/*
+ *     void flush_mmu_tlb_allentries(uint64_t, uint64_t)
+ *
+ *             Flush TLB entries
+ */
+       .globl EXT(flush_mmu_tlb_allentries)
+LEXT(flush_mmu_tlb_allentries)
+       FLUSH_MMU_TLB_ALLENTRIES
+       SYNC_TLB_FLUSH
+       ret
+
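Editor's note, on the 16KB-page adjustment inside FLUSH_MMU_TLB_ALLENTRIES: the TLBI VAAE1IS operand counts 4KB pages, so with ARM_PGBYTES = 16384 each loop iteration advances x0 by 16384 / 4096 = 4 units, and the masking above ("and ... #~0x3" on the start, "add #0x3; and #~0x3" on the end) rounds the start down and the end up to multiples of 4, i.e. to whole 16KB pages.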
+.macro FLUSH_MMU_TLB_ENTRY
 #if __ARM_KERNEL_PROTECT__
        /*
         * If we are flushing ASID 0, this is a kernel operation.  With this
@@ -179,33 +281,46 @@ LEXT(flush_mmu_tlb_entry)
         */
        lsr             x2, x0, #TLBI_ASID_SHIFT
        cmp             x2, #0
-       b.eq            Lflush_mmu_tlb_entry_globally
+       b.eq            1f // Lflush_mmu_tlb_entry_globally
 
        bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
        tlbi    vae1is, x0
        orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
 #endif /* __ARM_KERNEL_PROTECT__ */
        tlbi    vae1is, x0
-       dsb             ish
-       isb             sy
-       ret
 #if __ARM_KERNEL_PROTECT__
-Lflush_mmu_tlb_entry_globally:
+       b               2f // Lflush_mmu_tlb_entry_done
+1: // Lflush_mmu_tlb_entry_globally:
        tlbi    vaae1is, x0
-       dsb             ish
-       isb             sy
-       ret
+2: // Lflush_mmu_tlb_entry_done
 #endif /* __ARM_KERNEL_PROTECT__ */
+.endmacro
+/*
+ *     void flush_mmu_tlb_entry_async(uint64_t)
+ *
+ *             Flush TLB entry, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_entry_async)
+LEXT(flush_mmu_tlb_entry_async)
+       FLUSH_MMU_TLB_ENTRY
+       ret
 
 /*
- *     void flush_mmu_tlb_entries(uint64_t, uint64_t)
+ *     void flush_mmu_tlb_entry(uint64_t)
  *
- *             Flush TLB entries
+ *             Flush TLB entry
  */
        .text
        .align 2
-       .globl EXT(flush_mmu_tlb_entries)
-LEXT(flush_mmu_tlb_entries)
+       .globl EXT(flush_mmu_tlb_entry)
+LEXT(flush_mmu_tlb_entry)
+       FLUSH_MMU_TLB_ENTRY
+       SYNC_TLB_FLUSH
+       ret
+
+.macro FLUSH_MMU_TLB_ENTRIES
 #if __ARM_16K_PG__
        and             x0, x0, #~0x3
 
@@ -226,7 +341,7 @@ LEXT(flush_mmu_tlb_entries)
         */
        add             x1, x1, #0x3
        and             x1, x1, #~0x3
-#endif /* __ARM_KERNEL_PROTECT__ */
+#endif /* __ARM_16K_PG__ */
 #if __ARM_KERNEL_PROTECT__
        /*
         * If we are flushing ASID 0, this is a kernel operation.  With this
@@ -234,11 +349,11 @@ LEXT(flush_mmu_tlb_entries)
         */
        lsr             x2, x0, #TLBI_ASID_SHIFT
        cmp             x2, #0
-       b.eq            Lflush_mmu_tlb_entries_globally_loop
+       b.eq            2f // Lflush_mmu_tlb_entries_globally_loop
 
        bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
 #endif /* __ARM_KERNEL_PROTECT__ */
-Lflush_mmu_tlb_entries_loop:
+1: // Lflush_mmu_tlb_entries_loop
        tlbi    vae1is, x0
 #if __ARM_KERNEL_PROTECT__
        orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
@@ -247,30 +362,44 @@ Lflush_mmu_tlb_entries_loop:
 #endif /* __ARM_KERNEL_PROTECT__ */
        add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are pages
        cmp             x0, x1
-       b.lt    Lflush_mmu_tlb_entries_loop
-       dsb             ish
-       isb             sy
-       ret
+       b.lt            1b // Lflush_mmu_tlb_entries_loop
 #if __ARM_KERNEL_PROTECT__
-Lflush_mmu_tlb_entries_globally_loop:
+       b               3f // Lflush_mmu_tlb_entries_done
+2: // Lflush_mmu_tlb_entries_globally_loop:
        tlbi    vaae1is, x0
        add             x0, x0, #(ARM_PGBYTES / 4096)   // Units are pages
        cmp             x0, x1
-       b.lt    Lflush_mmu_tlb_entries_globally_loop
-       dsb             ish
-       isb             sy
-       ret
+       b.lt            2b // Lflush_mmu_tlb_entries_globally_loop
+3: // Lflush_mmu_tlb_entries_done
 #endif /* __ARM_KERNEL_PROTECT__ */
+.endmacro
 
 /*
- *     void flush_mmu_tlb_asid(uint64_t)
+ *     void flush_mmu_tlb_entries_async(uint64_t, uint64_t)
  *
- *             Flush TLB entriesfor requested asid
+ *             Flush TLB entries, don't wait for completion
  */
        .text
        .align 2
-       .globl EXT(flush_mmu_tlb_asid)
-LEXT(flush_mmu_tlb_asid)
+       .globl EXT(flush_mmu_tlb_entries_async)
+LEXT(flush_mmu_tlb_entries_async)
+       FLUSH_MMU_TLB_ENTRIES
+       ret
+
+/*
+ *     void flush_mmu_tlb_entries(uint64_t, uint64_t)
+ *
+ *             Flush TLB entries
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_entries)
+LEXT(flush_mmu_tlb_entries)
+       FLUSH_MMU_TLB_ENTRIES
+       SYNC_TLB_FLUSH
+       ret
+
+.macro FLUSH_MMU_TLB_ASID
 #if __ARM_KERNEL_PROTECT__
        /*
         * If we are flushing ASID 0, this is a kernel operation.  With this
@@ -278,33 +407,47 @@ LEXT(flush_mmu_tlb_asid)
         */
        lsr             x1, x0, #TLBI_ASID_SHIFT
        cmp             x1, #0
-       b.eq            Lflush_mmu_tlb_globally
+       b.eq            1f // Lflush_mmu_tlb_globally
 
        bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
        tlbi    aside1is, x0
        orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
 #endif /* __ARM_KERNEL_PROTECT__ */
        tlbi    aside1is, x0
-       dsb             ish
-       isb             sy
-       ret
 #if __ARM_KERNEL_PROTECT__
-Lflush_mmu_tlb_globally:
+       b               2f // Lflush_mmu_tlb_asid_done
+1: // Lflush_mmu_tlb_globally:
        tlbi    vmalle1is
-       dsb             ish
-       isb             sy
-       ret
+2: // Lflush_mmu_tlb_asid_done:
 #endif /* __ARM_KERNEL_PROTECT__ */
+.endmacro
 
 /*
- *     void flush_core_tlb_asid(uint64_t)
+ *     void flush_mmu_tlb_asid_async(uint64_t)
  *
- *             Flush TLB entries for core for requested asid
+ *             Flush TLB entries for requested asid, don't wait for completion
  */
        .text
        .align 2
-       .globl EXT(flush_core_tlb_asid)
-LEXT(flush_core_tlb_asid)
+       .globl EXT(flush_mmu_tlb_asid_async)
+LEXT(flush_mmu_tlb_asid_async)
+       FLUSH_MMU_TLB_ASID
+       ret
+
+/*
+ *     void flush_mmu_tlb_asid(uint64_t)
+ *
+ *             Flush TLB entries for requested asid
+ */
+       .text
+       .align 2
+       .globl EXT(flush_mmu_tlb_asid)
+LEXT(flush_mmu_tlb_asid)
+       FLUSH_MMU_TLB_ASID
+       SYNC_TLB_FLUSH
+       ret
+
+.macro FLUSH_CORE_TLB_ASID
 #if __ARM_KERNEL_PROTECT__
        /*
         * If we are flushing ASID 0, this is a kernel operation.  With this
@@ -312,23 +455,44 @@ LEXT(flush_core_tlb_asid)
         */
        lsr             x1, x0, #TLBI_ASID_SHIFT
        cmp             x1, #0
-       b.eq            Lflush_core_tlb_asid_globally
+       b.eq            1f // Lflush_core_tlb_asid_globally
 
        bic             x0, x0, #(1 << TLBI_ASID_SHIFT)
        tlbi    aside1, x0
        orr             x0, x0, #(1 << TLBI_ASID_SHIFT)
 #endif /* __ARM_KERNEL_PROTECT__ */
        tlbi    aside1, x0
-       dsb             ish
-       isb             sy
-       ret
 #if __ARM_KERNEL_PROTECT__
-Lflush_core_tlb_asid_globally:
+       b               2f // Lflush_core_tlb_asid_done
+1: // Lflush_core_tlb_asid_globally:
        tlbi    vmalle1
-       dsb             ish
-       isb             sy
-       ret
+2: // Lflush_core_tlb_asid_done:
 #endif /* __ARM_KERNEL_PROTECT__ */
+.endmacro
+
+/*
+ *     void flush_core_tlb_asid_async(uint64_t)
+ *
+ *             Flush TLB entries for core for requested asid, don't wait for completion
+ */
+       .text
+       .align 2
+       .globl EXT(flush_core_tlb_asid_async)
+LEXT(flush_core_tlb_asid_async)
+       FLUSH_CORE_TLB_ASID
+       ret
+/*
+ *     void flush_core_tlb_asid(uint64_t)
+ *
+ *             Flush TLB entries for core for requested asid
+ */
+       .text
+       .align 2
+       .globl EXT(flush_core_tlb_asid)
+LEXT(flush_core_tlb_asid)
+       FLUSH_CORE_TLB_ASID
+       SYNC_TLB_FLUSH
+       ret
 
 /*
  *     Set MMU Translation Table Base Alternate
@@ -348,6 +512,19 @@ LEXT(set_mmu_ttb_alternate)
        isb             sy
        ret
 
+       .text
+       .align 2
+       .globl EXT(set_mmu_ttb)
+LEXT(set_mmu_ttb)
+#if __ARM_KERNEL_PROTECT__
+       /* All EL1-mode ASIDs are odd. */
+       orr             x0, x0, #(1 << TTBR_ASID_SHIFT)
+#endif /* __ARM_KERNEL_PROTECT__ */
+       dsb             ish
+       msr             TTBR0_EL1, x0
+       isb             sy
+       ret
+
 /*
  *     set AUX control register
  */
@@ -447,7 +624,7 @@ LEXT(mmu_kvtop)
        and             x0, x1, #0x0000ffffffffffff                                     // Clear non-address bits 
        ret
 L_mmu_kvtop_invalid:
-       mov             x0, xzr                                                                         // Return invalid
+       mov             x0, #0                                                                          // Return invalid
        ret
 
 /*
@@ -469,7 +646,7 @@ LEXT(mmu_uvtop)
        and             x0, x1, #0x0000ffffffffffff                                     // Clear non-address bits 
        ret
 L_mmu_uvtop_invalid:
-       mov             x0, xzr                                                                         // Return invalid
+       mov             x0, #0                                                                          // Return invalid
        ret
 
 /*
@@ -489,7 +666,7 @@ LEXT(mmu_kvtop_wpreflight)
        and             x0, x1, #0x0000ffffffffffff                                     // Clear non-address bits
        ret
 L_mmu_kvtop_wpreflight_invalid:
-       mov             x0, xzr                                                                         // Return invalid
+       mov             x0, #0                                                                          // Return invalid
        ret
 
 /*
@@ -529,7 +706,7 @@ copyio_error:
        CLEAR_RECOVERY_HANDLER x10, x11
        mov             x0, #EFAULT                                     // Return an EFAULT error
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int _bcopyin(const char *src, char *dst, vm_size_t len)
@@ -538,6 +715,7 @@ copyio_error:
        .align 2
        .globl EXT(_bcopyin)
 LEXT(_bcopyin)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
        /* If len is less than 16 bytes, just do a bytewise copy */
@@ -560,9 +738,9 @@ LEXT(_bcopyin)
        b.hi    2b
 3:
        CLEAR_RECOVERY_HANDLER x10, x11
-       mov             x0, xzr
+       mov             x0, #0
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int _copyin_word(const char *src, uint64_t *dst, vm_size_t len)
@@ -571,6 +749,7 @@ LEXT(_bcopyin)
        .align 2
        .globl EXT(_copyin_word)
 LEXT(_copyin_word)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
        cmp             x2, #4
@@ -586,11 +765,12 @@ L_copyin_word_8:
        ldr             x8, [x0]
 L_copyin_word_store:
        str             x8, [x1]
-       mov             x0, xzr
+       mov             x0, #0
        CLEAR_RECOVERY_HANDLER x10, x11
 L_copying_exit:
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
+
 
 
 /*
@@ -600,6 +780,7 @@ L_copying_exit:
        .align 2
        .globl EXT(_bcopyout)
 LEXT(_bcopyout)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
        /* If len is less than 16 bytes, just do a bytewise copy */
@@ -622,9 +803,9 @@ LEXT(_bcopyout)
        b.hi    2b
 3:
        CLEAR_RECOVERY_HANDLER x10, x11
-       mov             x0, xzr
+       mov             x0, #0
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int _bcopyinstr(
@@ -637,12 +818,13 @@ LEXT(_bcopyout)
        .align 2
        .globl EXT(_bcopyinstr)
 LEXT(_bcopyinstr)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        adr             x4, Lcopyinstr_error            // Get address for recover
        mrs             x10, TPIDR_EL1                          // Get thread pointer
        ldr             x11, [x10, TH_RECOVER]          // Save previous recover
        str             x4, [x10, TH_RECOVER]           // Store new recover
-       mov             x4, xzr                                         // x4 - total bytes copied
+       mov             x4, #0                                          // x4 - total bytes copied
 Lcopyinstr_loop:
        ldrb    w5, [x0], #1                                    // Load a byte from the user source
        strb    w5, [x1], #1                            // Store a byte to the kernel dest
@@ -661,7 +843,7 @@ Lcopyinstr_error:
 Lcopyinstr_exit:
        str             x11, [x10, TH_RECOVER]          // Restore old recover
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 /*
  * int copyinframe(const vm_address_t frame_addr, char *kernel_addr, bool is64bit)
@@ -684,6 +866,7 @@ Lcopyinstr_exit:
        .align 2
        .globl EXT(copyinframe)
 LEXT(copyinframe)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
        SET_RECOVERY_HANDLER x10, x11, x3, copyio_error
        cbnz    w2, Lcopyinframe64              // Check frame size
@@ -718,90 +901,8 @@ Lcopyinframe_valid:
 Lcopyinframe_done:
        CLEAR_RECOVERY_HANDLER x10, x11
        POP_FRAME
-       ret
-
-
-/*
- * int _emulate_swp(user_addr_t addr, uint32_t newval, uint32_t *oldval)
- *
- *  Securely emulates the swp instruction removed from armv8.
- *    Returns true on success.
- *    Returns false if the user address is not user accessible.
- *
- *  x0 : address to swap
- *  x1 : new value to store
- *  x2 : address to save old value
- *  x3 : scratch reg
- *  x10 : thread pointer (set by SET_RECOVERY_HANDLER)
- *  x11 : old recovery handler (set by SET_RECOVERY_HANDLER)
- *  x12 : interrupt state
- *  x13 : return value
- */
-       .text
-       .align 2
-       .globl EXT(_emulate_swp)
-LEXT(_emulate_swp)
-       PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, swp_error
-
-       // Perform swap
-Lswp_try:
-       ldxr    w3, [x0]                                                                        // Load data at target address
-       stxr    w4, w1, [x0]                                                            // Store new value to target address
-       cbnz    w4, Lswp_try                                                            // Retry if store failed
-       str             w3, [x2]                                                                        // Save old value
-       mov             x13, #1                                                                         // Set successful return value
-
-Lswp_exit:
-       mov             x0, x13                                                                         // Set return value
-       CLEAR_RECOVERY_HANDLER x10, x11
-       POP_FRAME
-       ret
-
-/*
- * int _emulate_swpb(user_addr_t addr, uint32_t newval, uint32_t *oldval)
- *
- *  Securely emulates the swpb instruction removed from armv8.
- *    Returns true on success.
- *    Returns false if the user address is not user accessible.
- *
- *  x0 : address to swap
- *  x1 : new value to store
- *  x2 : address to save old value
- *  x3 : scratch reg
- *  x10 : thread pointer (set by SET_RECOVERY_HANDLER)
- *  x11 : old recovery handler (set by SET_RECOVERY_HANDLER)
- *  x12 : interrupt state
- *  x13 : return value
- */
-       .text
-       .align 2
-       .globl EXT(_emulate_swpb)
-LEXT(_emulate_swpb)
-       PUSH_FRAME
-       SET_RECOVERY_HANDLER x10, x11, x3, swp_error
-
-       // Perform swap
-Lswpb_try:
-       ldxrb   w3, [x0]                                                                        // Load data at target address
-       stxrb   w4, w1, [x0]                                                            // Store new value to target address
-       cbnz    w4, Lswp_try                                                            // Retry if store failed
-       str             w3, [x2]                                                                        // Save old value
-       mov             x13, #1                                                                         // Set successful return value
-
-Lswpb_exit:
-       mov             x0, x13                                                                         // Set return value
-       CLEAR_RECOVERY_HANDLER x10, x11
-       POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
-       .text
-       .align 2
-swp_error:
-       mov             x0, xzr                                                                         // Return false
-       CLEAR_RECOVERY_HANDLER x10, x11
-       POP_FRAME
-       ret
 
 /*
  * uint32_t arm_debug_read_dscr(void)
@@ -826,7 +927,6 @@ LEXT(arm_debug_read_dscr)
 LEXT(arm_debug_set_cp14)
        PANIC_UNIMPLEMENTED
 
-
 #if defined(APPLE_ARM64_ARCH_FAMILY)
 /*
  * Note: still have to ISB before executing wfi!
@@ -871,6 +971,28 @@ LEXT(arm64_prepare_for_sleep)
        orr             x0, x0, #(ARM64_REG_CYC_OVRD_ok2pwrdn_force_down)
        msr             ARM64_REG_CYC_OVRD, x0
 
+#if defined(APPLEMONSOON)
+       ARM64_IS_PCORE x0
+       cbz             x0, Lwfi_inst // skip if not p-core 
+
+       /* <rdar://problem/32512947>: Flush the GUPS prefetcher prior to
+        * wfi.  A Skye HW bug can cause the GUPS prefetcher on p-cores
+        * to be left with valid entries that fail to drain if a
+        * subsequent wfi is issued.  This can prevent the core from
+        * power-gating.  For the idle case that is recoverable, but
+        * for the deep-sleep (S2R) case in which cores MUST power-gate,
+        * it can lead to a hang.  This can be prevented by disabling
+        * and re-enabling GUPS, which forces the prefetch queue to
+        * drain.  This should be done as close to wfi as possible, i.e.
+        * at the very end of arm64_prepare_for_sleep(). */
+       mrs             x0, ARM64_REG_HID10
+       orr             x0, x0, #(ARM64_REG_HID10_DisHwpGups)
+       msr             ARM64_REG_HID10, x0
+       isb             sy
+       and             x0, x0, #(~(ARM64_REG_HID10_DisHwpGups))
+       msr             ARM64_REG_HID10, x0
+       isb             sy
+#endif
 Lwfi_inst:
        dsb             sy
        isb             sy
@@ -885,6 +1007,7 @@ Lwfi_inst:
        .align 2
        .globl EXT(arm64_force_wfi_clock_gate)
 LEXT(arm64_force_wfi_clock_gate)
+       ARM64_STACK_PROLOG
        PUSH_FRAME
 
        mrs             x0, ARM64_REG_CYC_OVRD
@@ -892,7 +1015,7 @@ LEXT(arm64_force_wfi_clock_gate)
        msr             ARM64_REG_CYC_OVRD, x0
        
        POP_FRAME
-       ret
+       ARM64_STACK_EPILOG
 
 
 
@@ -1030,7 +1153,64 @@ cpu_defeatures_set_ret:
        ret
 #endif
 
+#else /* !defined(APPLE_ARM64_ARCH_FAMILY) */
+       .text
+       .align 2
+       .globl EXT(arm64_prepare_for_sleep)
+LEXT(arm64_prepare_for_sleep)
+       PUSH_FRAME
+Lwfi_inst:
+       dsb             sy
+       isb             sy
+       wfi
+       b               Lwfi_inst
+
+/*
+ * Force WFI to use clock gating only
+ * Note: for non-Apple device, do nothing.
+ */    
+       .text
+       .align 2
+       .globl EXT(arm64_force_wfi_clock_gate)
+LEXT(arm64_force_wfi_clock_gate)
+       PUSH_FRAME
+       nop
+       POP_FRAME
+
+#endif /* defined(APPLE_ARM64_ARCH_FAMILY) */
+
+/*
+ * void arm64_replace_bootstack(cpu_data_t *cpu_data)
+ *
+ * This must be called from a kernel thread context running on the boot CPU,
+ * after setting up new exception stacks in per-CPU data. That will guarantee
+ * that the stack(s) we're trying to replace aren't currently in use.  For
+ * KTRR-protected devices, this must also be called prior to VM prot finalization
+ * and lockdown, as updating SP1 requires a sensitive instruction.
+ */
+       .text
+       .align 2
+       .globl EXT(arm64_replace_bootstack)
+LEXT(arm64_replace_bootstack)
+       ARM64_STACK_PROLOG
+       PUSH_FRAME
+       // Set the exception stack pointer
+       ldr             x0, [x0, CPU_EXCEPSTACK_TOP]
+       mrs             x4, DAIF                                        // Load current DAIF; use x4 as pinst may trash x1-x3
+       msr             DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF | DAIFSC_ASYNCF)           // Disable IRQ/FIQ/serror
+       // Set SP_EL1 to exception stack
+#if defined(KERNEL_INTEGRITY_KTRR)
+       mov             x1, lr
+       bl              _pinst_spsel_1
+       mov             lr, x1
+#else
+       msr             SPSel, #1
 #endif
+       mov             sp, x0
+       msr             SPSel, #0
+       msr             DAIF, x4                                        // Restore interrupt state
+       POP_FRAME
+       ARM64_STACK_EPILOG
 
 #ifdef MONITOR
 /*
@@ -1050,4 +1230,5 @@ LEXT(monitor_call)
        ret
 #endif
 
+
 /* vim: set sw=4 ts=4: */
index d9efa1cade7d006f83e6f5572c2740c142f64313..a07df700759bc792a8d3336bba850d7b3f4654c8 100644 (file)
@@ -71,7 +71,7 @@ machine_task_set_state(
        case ARM_DEBUG_STATE:
        {
                arm_legacy_debug_state_t *tstate = (arm_legacy_debug_state_t *) state;
-               if (task_has_64BitAddr(task) ||
+               if (task_has_64Bit_data(task) ||
                                (state_count != ARM_LEGACY_DEBUG_STATE_COUNT) ||
                                (!debug_legacy_state_is_valid(tstate))) {
                        return KERN_INVALID_ARGUMENT;
@@ -90,7 +90,7 @@ machine_task_set_state(
        case ARM_DEBUG_STATE32:
        {
                arm_debug_state32_t *tstate = (arm_debug_state32_t *) state;
-               if (task_has_64BitAddr(task) ||
+               if (task_has_64Bit_data(task) ||
                                (state_count != ARM_DEBUG_STATE32_COUNT) ||
                                (!debug_state_is_valid32(tstate))) {
                        return KERN_INVALID_ARGUMENT;
@@ -110,7 +110,7 @@ machine_task_set_state(
        {
                arm_debug_state64_t *tstate = (arm_debug_state64_t *) state;
                
-               if ((!task_has_64BitAddr(task)) ||
+               if ((!task_has_64Bit_data(task)) ||
                                (state_count != ARM_DEBUG_STATE64_COUNT) ||
                                (!debug_state_is_valid64(tstate))) {
                        return KERN_INVALID_ARGUMENT;
@@ -156,7 +156,7 @@ machine_task_get_state(task_t task,
        {
                arm_legacy_debug_state_t *tstate = (arm_legacy_debug_state_t *) state;
                
-               if (task_has_64BitAddr(task) || (*state_count != ARM_LEGACY_DEBUG_STATE_COUNT)) {
+               if (task_has_64Bit_data(task) || (*state_count != ARM_LEGACY_DEBUG_STATE_COUNT)) {
                        return KERN_INVALID_ARGUMENT;
                }
                
@@ -172,7 +172,7 @@ machine_task_get_state(task_t task,
        {
                arm_debug_state32_t *tstate = (arm_debug_state32_t *) state;
                
-               if (task_has_64BitAddr(task) || (*state_count != ARM_DEBUG_STATE32_COUNT)) {
+               if (task_has_64Bit_data(task) || (*state_count != ARM_DEBUG_STATE32_COUNT)) {
                        return KERN_INVALID_ARGUMENT;
                }
                
@@ -188,7 +188,7 @@ machine_task_get_state(task_t task,
        {
                arm_debug_state64_t *tstate = (arm_debug_state64_t *) state;
                
-               if ((!task_has_64BitAddr(task)) || (*state_count != ARM_DEBUG_STATE64_COUNT)) {
+               if ((!task_has_64Bit_data(task)) || (*state_count != ARM_DEBUG_STATE64_COUNT)) {
                        return KERN_INVALID_ARGUMENT;
                }
                
@@ -233,8 +233,8 @@ machine_thread_inherit_taskwide(
                int flavor;
                mach_msg_type_number_t count;
 
-               flavor = task_has_64BitAddr(parent_task) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32;
-               count = task_has_64BitAddr(parent_task) ? ARM_DEBUG_STATE64_COUNT : ARM_DEBUG_STATE32_COUNT;
+               flavor = task_has_64Bit_data(parent_task) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32;
+               count = task_has_64Bit_data(parent_task) ? ARM_DEBUG_STATE64_COUNT : ARM_DEBUG_STATE32_COUNT;
 
                return machine_thread_set_state(thread, flavor, parent_task->task_debug, count);
        }
index 1cc446028a5911b630bd7cef7c90cb85547d0566..ec10b19817cf80f95930c8fcb2a02bc9e2506c56 100644 (file)
 
 #include <stdbool.h>
 
+#define PMSR "s3_1_c15_c13_0"
+#define PMSR_PMI(REG) ((REG) & ((1 << CORE_NCTRS) - 1))
 
-static inline void
-mt_fiq(void)
+
+static inline bool
+mt_pmi_pending(uint64_t * restrict pmsr, uint64_t * restrict upmsr)
 {
+       *pmsr = __builtin_arm_rsr64(PMSR);
+       bool pmi = PMSR_PMI(*pmsr);
+
+#pragma unused(upmsr)
+
+       return pmi;
 }
 
+void mt_fiq(void *cpu, uint64_t pmsr, uint64_t upmsr);
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* !defined(ARM64_MONOTONIC_H) */
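Editor's note: a hedged sketch of how the FIQ path is expected to use the new helper and handler signature (the caller shown is illustrative; the real dispatch lives in the platform FIQ handler, not in this header):

    /* Sketch: check for a core-PMC PMI and hand it to monotonic. */
    static void
    handle_pmi_example(void *cpu_data)
    {
            uint64_t pmsr, upmsr;

            if (mt_pmi_pending(&pmsr, &upmsr)) {
                    mt_fiq(cpu_data, pmsr, upmsr);
            }
    }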
index 4a1563ea747f12c18d3fbd356424e4e5d386ff80..321205db8c4f48671537073531abfc8cb931c1a3 100644 (file)
@@ -29,7 +29,7 @@
 #include <arm/cpu_data_internal.h>
 #include <arm/machine_routines.h>
 #include <arm64/monotonic.h>
-#include <kern/assert.h> /* static_assert, assert */
+#include <kern/assert.h>
 #include <kern/debug.h> /* panic */
 #include <kern/monotonic.h>
 #include <machine/limits.h> /* CHAR_BIT */
 #include <sys/errno.h>
 #include <sys/monotonic.h>
 #include <pexpert/arm64/board_config.h>
+#include <pexpert/device_tree.h> /* DTFindEntry */
 #include <pexpert/pexpert.h>
 
 #pragma mark core counters
 
 bool mt_core_supported = true;
-void mt_fiq_internal(uint64_t upmsr);
 
 /*
  * PMC[0-1] are the 48-bit fixed counters -- PMC0 is cycles and PMC1 is
@@ -64,6 +64,8 @@ void mt_fiq_internal(uint64_t upmsr);
 #define PMC8 "s3_2_c15_c9_0"
 #define PMC9 "s3_2_c15_c10_0"
 
+#define CTR_MAX ((UINT64_C(1) << 47) - 1)
+
 #define CYCLES 0
 #define INSTRS 1
 
@@ -103,14 +105,13 @@ enum {
        PMCR0_INTGEN_FIQ = 4,
 };
 #define PMCR0_INTGEN_SET(INT) ((uint64_t)(INT) << 8)
-/* use AIC for backwards compatibility with kpc */
-#define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_AIC)
+#define PMCR0_INTGEN_INIT PMCR0_INTGEN_SET(PMCR0_INTGEN_FIQ)
 /* set by hardware if a PMI was delivered */
 #define PMCR0_PMAI        (UINT64_C(1) << 11)
 #define PMCR0_PMI_EN(CTR) (UINT64_C(1) << (12 + CTR_POS(CTR)))
-/* fixed counters are always counting XXX probably need to just set this to all true */
+/* fixed counters are always counting */
 #define PMCR0_PMI_INIT (PMCR0_PMI_EN(CYCLES) | PMCR0_PMI_EN(INSTRS))
-/* disable counting on a PMI (except for AIC interrupts) */
+/* disable counting on a PMI */
 #define PMCR0_DISCNT_EN (UINT64_C(1) << 20)
 /* block PMIs until ERET retires */
 #define PMCR0_WFRFE_EN (UINT64_C(1) << 22)
@@ -119,7 +120,6 @@ enum {
 /* user mode access to configuration registers */
 #define PMCR0_USEREN_EN (UINT64_C(1) << 30)
 
-/* XXX this needs to be synchronized with kpc... */
 #define PMCR0_INIT (PMCR0_INTGEN_INIT | PMCR0_PMI_INIT | PMCR0_DISCNT_EN)
 
 /*
@@ -153,14 +153,6 @@ core_init_execution_modes(void)
        __builtin_arm_wsr64(PMCR1, pmcr1);
 }
 
-/*
- * PMSR reports the overflow status of all counters.
- */
-
-#define PMSR "s3_1_c15_c13_0"
-
-#define PMSR_OVF(CTR) (UINT64_C(1) << (CTR))
-
 /*
  * PMCR2 controls watchpoint registers.
  *
@@ -173,19 +165,15 @@ core_init_execution_modes(void)
 #define PMCR3 "s3_1_c15_c3_0"
 #define PMCR4 "s3_1_c15_c4_0"
 
-/*
- * PMCR_AFFINITY does ??? XXX.
- */
-
-#define PMCR_AFFINITY "s3_1_c15_c11_0"
+#define PMSR_OVF(CTR) (1ULL << (CTR))
 
 void
-mt_init(void)
+mt_early_init(void)
 {
 }
 
 static int
-core_init(void)
+core_init(__unused mt_device_t dev)
 {
        /* the dev node interface to the core counters is still unsupported */
        return ENOTSUP;
@@ -207,7 +195,7 @@ mt_core_snap(unsigned int ctr)
                return __builtin_arm_rsr64(PMC1);
        default:
                panic("monotonic: invalid core counter read: %u", ctr);
-               __builtin_trap();
+               __builtin_unreachable();
        }
 }
 
@@ -223,7 +211,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
                break;
        default:
                panic("monotonic: invalid core counter %u write %llu", ctr, count);
-               __builtin_trap();
+               __builtin_unreachable();
        }
 }
 
@@ -260,8 +248,19 @@ core_idle(__unused cpu_data_t *cpu)
        mt_update_fixed_counts();
 }
 
-static void
-core_run(cpu_data_t *cpu)
+#pragma mark uncore performance monitor
+
+
+#pragma mark common hooks
+
+void
+mt_cpu_idle(cpu_data_t *cpu)
+{
+       core_idle(cpu);
+}
+
+void
+mt_cpu_run(cpu_data_t *cpu)
 {
        uint64_t pmcr0;
        struct mt_cpu *mtc;
@@ -283,47 +282,6 @@ core_run(cpu_data_t *cpu)
        __builtin_arm_wsr64(PMCR0, pmcr0);
 }
 
-static void
-core_up(__unused cpu_data_t *cpu)
-{
-       assert(ml_get_interrupts_enabled() == FALSE);
-
-       core_init_execution_modes();
-}
-
-#pragma mark uncore counters
-
-
-static void
-uncore_sleep(void)
-{
-}
-
-static void
-uncore_wake(void)
-{
-}
-
-static void
-uncore_fiq(uint64_t upmsr)
-{
-#pragma unused(upmsr)
-}
-
-#pragma mark common hooks
-
-void
-mt_cpu_idle(cpu_data_t *cpu)
-{
-       core_idle(cpu);
-}
-
-void
-mt_cpu_run(cpu_data_t *cpu)
-{
-       core_run(cpu);
-}
-
 void
 mt_cpu_down(cpu_data_t *cpu)
 {
@@ -333,59 +291,114 @@ mt_cpu_down(cpu_data_t *cpu)
 void
 mt_cpu_up(cpu_data_t *cpu)
 {
-       core_up(cpu);
        mt_cpu_run(cpu);
 }
 
 void
 mt_sleep(void)
 {
-       uncore_sleep();
 }
 
 void
-mt_wake(void)
+mt_wake_per_core(void)
 {
-       uncore_wake();
 }
 
-void
+static void
 mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmsr)
 {
-       bool found_overflow = false;
-
        assert(cpu != NULL);
        assert(ml_get_interrupts_enabled() == FALSE);
 
        (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed);
 
-       for (int i = 0; i < MT_CORE_NFIXED; i++) {
+       /*
+        * monotonic handles any fixed counter PMIs.
+        */
+       for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) {
+               if ((pmsr & PMSR_OVF(i)) == 0) {
+                       continue;
+               }
+
+               uint64_t count = mt_cpu_update_count(cpu, i);
+               cpu->cpu_monotonic.mtc_counts[i] += count;
+               mt_core_set_snap(i, mt_core_reset_values[i]);
+               cpu->cpu_monotonic.mtc_snaps[i] = mt_core_reset_values[i];
+
+               if (mt_microstackshots && mt_microstackshot_ctr == i) {
+                       bool user_mode = false;
+                       arm_saved_state_t *state = get_user_regs(current_thread());
+                       if (state) {
+                               user_mode = PSR64_IS_USER(get_saved_state_cpsr(state));
+                       }
+                       KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1),
+                                       mt_microstackshot_ctr, user_mode);
+                       mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx);
+               }
+       }
+
+       /*
+        * KPC handles the configurable counter PMIs.
+        */
+       for (unsigned int i = MT_CORE_NFIXED; i < CORE_NCTRS; i++) {
                if (pmsr & PMSR_OVF(i)) {
-                       mt_cpu_update_count(cpu, i);
-                       mt_core_set_snap(i, 0);
-                       found_overflow = true;
+                       extern void kpc_pmi_handler(unsigned int ctr);
+                       kpc_pmi_handler(i);
                }
        }
 
-       assert(found_overflow);
        core_set_enabled();
 }
 
 void
-mt_fiq_internal(uint64_t upmsr)
+mt_fiq(void *cpu, uint64_t pmsr, uint64_t upmsr)
+{
+       mt_cpu_pmi(cpu, pmsr);
+
+#pragma unused(upmsr)
+}
+
+static uint32_t mt_xc_sync;
+
+static void
+mt_microstackshot_start_remote(__unused void *arg)
+{
+       cpu_data_t *cpu = getCpuDatap();
+
+       __builtin_arm_wsr64(PMCR0, PMCR0_INIT);
+
+       for (int i = 0; i < MT_CORE_NFIXED; i++) {
+               uint64_t count = mt_cpu_update_count(cpu, i);
+               cpu->cpu_monotonic.mtc_counts[i] += count;
+               mt_core_set_snap(i, mt_core_reset_values[i]);
+               cpu->cpu_monotonic.mtc_snaps[i] = mt_core_reset_values[i];
+       }
+
+       core_set_enabled();
+
+       if (hw_atomic_sub(&mt_xc_sync, 1) == 0) {
+               thread_wakeup((event_t)&mt_xc_sync);
+       }
+}
+
+int
+mt_microstackshot_start_arch(uint64_t period)
 {
-       uncore_fiq(upmsr);
+       mt_core_reset_values[mt_microstackshot_ctr] = CTR_MAX - period;
+       cpu_broadcast_xcall(&mt_xc_sync, TRUE, mt_microstackshot_start_remote,
+                       mt_microstackshot_start_remote /* cannot pass NULL */);
+       return 0;
 }
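Editor's note: presetting a counter to CTR_MAX - period means it overflows, and so raises a PMI, after roughly `period` more increments. A short sketch with an assumed period:

    static uint64_t
    microstackshot_preset_example(void)
    {
            uint64_t period = 1000000;    /* assumed: one PMI per million counted events */
            return CTR_MAX - period;      /* counter counts up and overflows after ~period increments */
    }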
 
 #pragma mark dev nodes
 
-const struct monotonic_dev monotonic_devs[] = {
+struct mt_device mt_devices[] = {
        [0] = {
-               .mtd_name = "monotonic/core",
+               .mtd_name = "core",
                .mtd_init = core_init,
        },
 };
 
 static_assert(
-               (sizeof(monotonic_devs) / sizeof(monotonic_devs[0])) == MT_NDEVS,
-               "MT_NDEVS macro should be same as the length of monotonic_devs");
+               (sizeof(mt_devices) / sizeof(mt_devices[0])) == MT_NDEVS,
+               "MT_NDEVS macro should be the same as the length of mt_devices");
index a6fa9154b603365a6924d851aef6e293064c036f..d8809b38f2ce4e75e7a4d403a7afd3cf465b22ed 100644 (file)
@@ -61,6 +61,7 @@
 
 #include <sys/kdebug.h>
 
+
 #define USER_SS_ZONE_ALLOC_SIZE (0x4000)
 
 extern int debug_task;
@@ -160,7 +161,7 @@ machine_thread_create(
                thread->machine.upcb = &thread->machine.contextData->ss;
                thread->machine.uNeon = &thread->machine.contextData->ns;
 
-               if (task_has_64BitAddr(task)) {
+               if (task_has_64Bit_data(task)) {
                        thread->machine.upcb->ash.flavor = ARM_SAVED_STATE64;
                        thread->machine.upcb->ash.count = ARM_SAVED_STATE64_COUNT;
                        thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE64;
@@ -309,7 +310,7 @@ machine_stack_attach(
        savestate->lr = (uintptr_t)thread_continue;
        savestate->sp = thread->machine.kstackptr;
        savestate->cpsr = PSR64_KERNEL_DEFAULT;
-       machine_stack_attach_kprintf("thread = %x pc = %x, sp = %x\n", thread, savestate->lr, savestate->sp);
+       machine_stack_attach_kprintf("thread = %p pc = %llx, sp = %llx\n", thread, savestate->lr, savestate->sp);
 }
 
 
@@ -357,51 +358,15 @@ machine_stack_handoff(
  */
 void
 call_continuation(
-                 thread_continue_t continuation,
-                 void *parameter,
-                 wait_result_t wresult)
+       thread_continue_t continuation,
+       void *parameter,
+       wait_result_t wresult,
+       boolean_t enable_interrupts)
 {
 #define call_continuation_kprintf(x...)        /* kprintf("call_continuation_kprintf:" x) */
 
        call_continuation_kprintf("thread = %p continuation = %p, stack = %p\n", current_thread(), continuation, current_thread()->machine.kstackptr);
-       Call_continuation(continuation, parameter, wresult, current_thread()->machine.kstackptr);
-}
-
-/* Setting breakpoints in EL1 is effectively a KTRR bypass. The ability to do so is
- * controlled by MDSCR.KDE. The MSR to set MDSCR must be present to allow
- * self-hosted user mode debug. Any checks before the MRS can be skipped with ROP,
- * so we need to put the checks after the MRS where they can't be skipped. That
- * still leaves a small window if a breakpoint is set on the instruction
- * immediately after the MRS. To handle that, we also do a check and then set of
- * the breakpoint control registers. This allows us to guarantee that a given
- * core will never have both KDE set and a breakpoint targeting EL1.
- *
- * If KDE gets set, unset it and then panic */
-static void
-update_mdscr(uint64_t clear, uint64_t set)
-{  
-       uint64_t result = 0;
-       uint64_t tmp1, tmp2;
-       __asm__ volatile(
-               "mrs %[reg], MDSCR_EL1\n"
-               "bic %[reg], %[reg], %[clear]\n"
-               "orr %[reg], %[reg], %[set]\n"
-               "1:\n"
-               "bic %[reg], %[reg], #0x2000\n"
-               "msr MDSCR_EL1, %[reg]\n"
-#if defined(CONFIG_KERNEL_INTEGRITY)
-               /* verify KDE didn't get set (including via ROP)
-                * If set, clear it and then panic */
-               "ands %[tmp], %[reg], #0x2000\n"
-               "orr %[res], %[res], %[tmp]\n"
-               "bne 1b\n"
-#endif
-               : [res] "+r" (result), [tmp] "=r" (tmp1), [reg] "=r" (tmp2)
-               : [clear] "r" (clear), [set] "r" (set) : "x0");
-#if defined(CONFIG_KERNEL_INTEGRITY)
-       if (result)
-               panic("MDSCR.KDE was set: %llx %llx %llx", tmp1, tmp2, result);
-#endif
+       Call_continuation(continuation, parameter, wresult, enable_interrupts);
 }
 
 #define SET_DBGBCRn(n, value, accum) \
@@ -794,7 +759,7 @@ void arm_debug_set(arm_debug_state_t *debug_state)
                        break;
                }
        } else {
-               if (thread_is_64bit(current_thread()))
+               if (thread_is_64bit_data(current_thread()))
                        arm_debug_set64(debug_state);
                else
                        arm_debug_set32(debug_state);
@@ -898,7 +863,7 @@ machine_thread_set_tsd_base(
                return KERN_INVALID_ARGUMENT;
        }
 
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                if (tsd_base > vm_map_max(thread->map))
                        tsd_base = 0ULL;
        } else {
index 61b62741930dd155a7050eae1336a91b86fccf47..96a59cac267fef51451712abca9450eda6e17933 100644 (file)
@@ -56,6 +56,7 @@
 
 #define LOCK_PRIVATE 1
 
+#include <vm/pmap.h>
 #include <kern/kalloc.h>
 #include <kern/locks.h>
 #include <kern/misc_protos.h>
@@ -78,6 +79,7 @@
 #include <sys/munge.h>
 #include <machine/cpu_capabilities.h>
 #include <arm/cpu_data_internal.h>
+#include <arm/pmap.h>
 
 kern_return_t arm64_lock_test(void);
 kern_return_t arm64_munger_test(void);
@@ -1055,6 +1057,7 @@ ex_cb_test()
        return KERN_SUCCESS;
 }
 
+
 #if __ARM_PAN_AVAILABLE__
 kern_return_t
 arm64_pan_test()
@@ -1119,3 +1122,4 @@ arm64_munger_test()
        return 0;
 }
 
+
index a6971fa74cd986df586f49cb914c0c33009b9993..914cce974043ad5ba19a3ef66353e9d0036ef31d 100644 (file)
 
 #define SCTLR_RESERVED                 ((3 << 28) | (1 << 22) | (1 << 20) | (1 << 11))
 
-// 31          PACIA_ENABLED            AddPACIA and AuthIA functions enabled
-#define SCTLR_PACIA_ENABLED            (1 << 31)
-// 30          PACIB_ENABLED            AddPACIB and AuthIB functions enabled
-#define SCTLR_PACIB_ENABLED            (1 << 30)
-// 29:28       RES1    11
-// 27          PACDA_ENABLED            AddPACDA and AuthDA functions enabled
-#define SCTLR_PACDA_ENABLED            (1 << 27)
-
 // 26          UCI             User Cache Instructions
 #define SCTLR_UCI_ENABLED              (1 << 26)
 
 // 14          DZE             User Data Cache Zero (DC ZVA)
 #define SCTLR_DZE_ENABLED              (1 << 14)
 
-// 13          RES0    0
+// 13          PACDB_ENABLED            AddPACDB and AuthDB functions enabled
+#define SCTLR_PACDB_ENABLED            (1 << 13)
 
 // 12          I               Instruction cache enable
 #define SCTLR_I_ENABLED                        (1 << 12)
 // 0           M               MMU enable
 #define SCTLR_M_ENABLED                        (1 << 0)
 
-#define SCTLR_PAC_DEFAULT              0
-
-#define SCTLR_EL1_DEFAULT              (SCTLR_PAC_DEFAULT | SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \
+#define SCTLR_EL1_DEFAULT              (SCTLR_RESERVED | SCTLR_UCI_ENABLED | SCTLR_nTWE_WFE_ENABLED | SCTLR_DZE_ENABLED | \
                                                SCTLR_I_ENABLED | SCTLR_SED_DISABLED | SCTLR_CP15BEN_ENABLED |             \
                                                SCTLR_SA0_ENABLED | SCTLR_SA_ENABLED | SCTLR_C_ENABLED | SCTLR_M_ENABLED)
 
  */
 #define MPIDR_AFF0_MASK                                0xFF
 #define MPIDR_AFF1_MASK                                0xFF00
+#define MPIDR_AFF1_SHIFT                       8
 #define MPIDR_AFF2_MASK                                0xFF0000
+#define MPIDR_AFF2_SHIFT                       16
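A small illustration of how the new AFF1/AFF2 shift constants pair with the existing masks; the mpidr argument is hypothetical (for example, a value read from MPIDR_EL1):

/* Sketch only: split an MPIDR value into its affinity fields. */
static inline void
mpidr_split(uint64_t mpidr, uint32_t *aff0, uint32_t *aff1, uint32_t *aff2)
{
	*aff0 = (uint32_t)(mpidr & MPIDR_AFF0_MASK);
	*aff1 = (uint32_t)((mpidr & MPIDR_AFF1_MASK) >> MPIDR_AFF1_SHIFT);
	*aff2 = (uint32_t)((mpidr & MPIDR_AFF2_MASK) >> MPIDR_AFF2_SHIFT);
}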
 
 /*
  * We currently use a 3 level page table (rather than the full 4
 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
 #endif
 
+/* some sugar for getting pointers to page tables and entries */
+
+#define L1_TABLE_INDEX(va) (((va) & ARM_TT_L1_INDEX_MASK) >> ARM_TT_L1_SHIFT)
+#define L2_TABLE_INDEX(va) (((va) & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT)
+#define L3_TABLE_INDEX(va) (((va) & ARM_TT_L3_INDEX_MASK) >> ARM_TT_L3_SHIFT)
+
+#define L2_TABLE_VA(tte) ((tt_entry_t*) phystokv((*(tte)) & ARM_TTE_TABLE_MASK))
+#define L3_TABLE_VA(tte2) ((pt_entry_t*) phystokv((*(tte2)) & ARM_TTE_TABLE_MASK))
+
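The intent of the new helpers is easiest to see in a short walk from an L1 table down to an L3 page table entry; a minimal sketch, assuming l1_table is a valid L1 table pointer and va a mapped virtual address (both hypothetical here):

/* Sketch only: descend the 3-level table for va using the macros above. */
tt_entry_t *l1_tte = &l1_table[L1_TABLE_INDEX(va)];              /* L1 entry covering va */
tt_entry_t *l2_tte = &L2_TABLE_VA(l1_tte)[L2_TABLE_INDEX(va)];   /* entry in the L2 table it points to */
pt_entry_t *l3_pte = &L3_TABLE_VA(l2_tte)[L3_TABLE_INDEX(va)];   /* final page table entry */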
 /*
  *  L2 Translation table
  *
 #define ARM_TTE_BLOCK_NS_MASK          0x0000000000000020ULL   /* notSecure mapping mask */
 
 #define ARM_TTE_BLOCK_PNX                      0x0020000000000000ULL   /* value for privilege no execute bit */
-#define ARM_TTE_BLOCK_PNXMASK          0x0020000000000000ULL   /* privilege execute mask */
+#define ARM_TTE_BLOCK_PNXMASK          0x0020000000000000ULL   /* privilege no execute mask */
 
 #define ARM_TTE_BLOCK_NX                       0x0040000000000000ULL   /* value for no execute */
 #define ARM_TTE_BLOCK_NXMASK           0x0040000000000000ULL   /* no execute mask */
 #define ARM_PTE_HINT_ENTRIES_SHIFT     7ULL                                    /* shift to construct the number of entries */
 #define ARM_PTE_HINT_ADDR_MASK         0x0000FFFFFFE00000ULL                   /* mask to extract the starting hint address */
 #define ARM_PTE_HINT_ADDR_SHIFT                21                                      /* shift for the hint address */
+#define ARM_KVA_HINT_ADDR_MASK         0xFFFFFFFFFFE00000ULL                   /* mask to extract the starting hint address */
 #else
 #define ARM_PTE_HINT_ENTRIES           16ULL                                   /* number of entries the hint covers */
 #define ARM_PTE_HINT_ENTRIES_SHIFT     4ULL                                    /* shift to construct the number of entries */
 #define ARM_PTE_HINT_ADDR_MASK         0x0000FFFFFFFF0000ULL                   /* mask to extract the starting hint address */
 #define ARM_PTE_HINT_ADDR_SHIFT                16                                      /* shift for the hint address */
+#define ARM_KVA_HINT_ADDR_MASK         0xFFFFFFFFFFFF0000ULL                   /* mask to extract the starting hint address */
 #endif
 
-#define ARM_PTE_PNX                                    0x0020000000000000ULL   /* value for no execute */
-#define ARM_PTE_PNXMASK                                0x0020000000000000ULL   /* no execute mask */
+#define ARM_PTE_PNX                                    0x0020000000000000ULL   /* value for privilege no execute bit */
+#define ARM_PTE_PNXMASK                                0x0020000000000000ULL   /* privilege no execute mask */
 
-#define ARM_PTE_NX                                     0x0040000000000000ULL   /* value for privilege no execute bit */
-#define ARM_PTE_NXMASK                         0x0040000000000000ULL   /* privilege execute mask */
+#define ARM_PTE_NX                                     0x0040000000000000ULL   /* value for no execute bit */
+#define ARM_PTE_NXMASK                         0x0040000000000000ULL   /* no execute mask */
 
 #define ARM_PTE_WIRED                          0x0080000000000000ULL   /* value for software wired bit */
 #define ARM_PTE_WIRED_MASK                     0x0080000000000000ULL   /* software wired mask */
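Given the corrected comments, a one-line illustration of composing a mapping that is non-executable for both privileged and unprivileged accesses (pte is a hypothetical pt_entry_t value):

pte = (pte & ~(ARM_PTE_PNXMASK | ARM_PTE_NXMASK)) | ARM_PTE_PNX | ARM_PTE_NX;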
index cf022d32fcfe48bf0793de87a2f626194f05f7fa..4ace02086ade726299da43fb59c57fb2b7e17570 100644 (file)
@@ -54,6 +54,7 @@
 #include <vm/vm_kern.h>
 
 #include <sys/kdebug.h>
+#include <kperf/kperf.h>
 
 #include <kern/policy_internal.h>
 #if CONFIG_TELEMETRY
@@ -122,6 +123,7 @@ static void inspect_instruction_abort(uint32_t, fault_status_t *, vm_prot_t *);
 static void inspect_data_abort(uint32_t, fault_status_t *, vm_prot_t *);
 
 static int is_vm_fault(fault_status_t);
+static int is_translation_fault(fault_status_t);
 static int is_alignment_fault(fault_status_t);
 
 typedef void(*abort_handler_t)(arm_saved_state_t *, uint32_t, vm_offset_t, fault_status_t, vm_prot_t, vm_offset_t);
@@ -177,6 +179,12 @@ extern boolean_t pgtrace_enabled;
 #endif
 
 #if __ARM_PAN_AVAILABLE__
+#ifdef CONFIG_XNUPOST
+extern vm_offset_t pan_test_addr;
+extern vm_offset_t pan_ro_addr;
+extern volatile int pan_exception_level;
+extern volatile char pan_fault_value;
+#endif
 #endif
 
 #if defined(APPLECYCLONE)
@@ -207,6 +215,8 @@ extern boolean_t pgtrace_enabled;
 #endif
 
 
+extern vm_offset_t static_memory_end;
+
 static inline unsigned
 __ror(unsigned value, unsigned shift)
 {
@@ -284,6 +294,7 @@ arm64_implementation_specific_error(arm_saved_state_t *state, uint32_t esr, vm_o
                    (void *)llc_err_sts, (void *)llc_err_adr, (void *)llc_err_inf);
 #endif
 #else // !defined(APPLE_ARM64_ARCH_FAMILY)
+#pragma unused (state, esr, far)
        panic_plain("Unhandled implementation specific error\n");
 #endif
 }
@@ -437,8 +448,8 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
                assert(0); /* Unreachable */
 
        case ESR_EC_IABORT_EL1:
-               panic("Kernel instruction fetch abort: pc=%p iss=0x%x far=%p. Note: the faulting frame may be missing in the backtrace.",
-                         (void *)get_saved_state_pc(state), ESR_ISS(esr), (void*)far);
+                       
+               panic_with_thread_kernel_state("Kernel instruction fetch abort", state);
 
        case ESR_EC_PC_ALIGN:
                handle_pc_align(state);
@@ -609,13 +620,13 @@ sleh_synchronous(arm_context_t *context, uint32_t esr, vm_offset_t far)
 static void
 handle_uncategorized(arm_saved_state_t *state, boolean_t instrLen2)
 {
-       exception_type_t                        exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t          numcodes = 2;
-       uint32_t                                        instr;
+       exception_type_t           exception = EXC_BAD_INSTRUCTION;
+       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes = 2;
+       uint32_t                   instr = 0;
 
        if (instrLen2) {
-               uint16_t        instr16;
+               uint16_t instr16 = 0;
                COPYIN(get_saved_state_pc(state), (char *)&instr16, sizeof(instr16));
 
                instr = instr16;
@@ -886,6 +897,20 @@ is_vm_fault(fault_status_t status)
        }
 }
 
+static int
+is_translation_fault(fault_status_t status)
+{
+       switch (status) {
+       case FSC_TRANSLATION_FAULT_L0:
+       case FSC_TRANSLATION_FAULT_L1:
+       case FSC_TRANSLATION_FAULT_L2:
+       case FSC_TRANSLATION_FAULT_L3:
+               return TRUE;
+       default:
+               return FALSE;
+       }
+}
+
 #if __ARM_PAN_AVAILABLE__
 static int
 is_permission_fault(fault_status_t status)
@@ -940,9 +965,9 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
        thread->iotier_override = THROTTLE_LEVEL_NONE; /* Reset IO tier override before handling abort from userspace */
 
        if (is_vm_fault(fault_code)) {
-               kern_return_t   result = KERN_FAILURE;
-               vm_map_t                map = thread->map;
-               vm_offset_t             vm_fault_addr = fault_addr;
+               kern_return_t   result = KERN_FAILURE;
+               vm_map_t        map = thread->map;
+               vm_offset_t     vm_fault_addr = fault_addr;
 
                assert(map != kernel_map);
 
@@ -981,7 +1006,7 @@ handle_user_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_addr
 
                /* check to see if it is just a pmap ref/modify fault */
 
-               if (result != KERN_SUCCESS) {
+               if ((result != KERN_SUCCESS) && !is_translation_fault(fault_code)) {
                        result = arm_fast_fault(map->pmap, trunc_page(vm_fault_addr), fault_type, TRUE);
                }
                if (result != KERN_SUCCESS) {
@@ -1082,9 +1107,9 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 #endif
 
        if (is_vm_fault(fault_code)) {
-               kern_return_t           result;
-               vm_map_t                map;
-               int                     interruptible;
+               kern_return_t   result = KERN_FAILURE;
+               vm_map_t        map;
+               int             interruptible;
 
                /*
                 * Ensure no faults in the physical aperture. This could happen if
@@ -1093,7 +1118,20 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
                 */
 
 
-               if (fault_addr >= gVirtBase && fault_addr < (gVirtBase+gPhysSize)) {
+#if __ARM_PAN_AVAILABLE__ && defined(CONFIG_XNUPOST)
+               if (is_permission_fault(fault_code) && !(get_saved_state_cpsr(state) & PSR64_PAN) &&
+                   (pan_ro_addr != 0) && (fault_addr == pan_ro_addr)) {
+                       ++pan_exception_level;
+                       // On an exception taken from a PAN-disabled context, verify
+                       // that PAN is re-enabled for the exception handler and that
+                       // accessing the test address produces a PAN fault.
+                       pan_fault_value = *(char *)pan_test_addr;
+                       set_saved_state_pc(state, get_saved_state_pc(state) + 4);
+                       return;
+               }
+#endif
+
+               if (fault_addr >= gVirtBase && fault_addr < static_memory_end) {
                        panic_with_thread_kernel_state("Unexpected fault in kernel static region\n",state);
                }
 
@@ -1117,9 +1155,12 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 #endif
 
                /* check to see if it is just a pmap ref/modify fault */
-               result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
-               if (result == KERN_SUCCESS) return;
+               if (!is_translation_fault(fault_code)) {
+                       result = arm_fast_fault(map->pmap, trunc_page(fault_addr), fault_type, FALSE);
+                       if (result == KERN_SUCCESS) return;
+               }
 
+               if (result != KERN_PROTECTION_FAILURE)
                {
                        /*
                         *  We have to "fault" the page in.
@@ -1141,6 +1182,22 @@ handle_kernel_abort(arm_saved_state_t *state, uint32_t esr, vm_offset_t fault_ad
 
 #if __ARM_PAN_AVAILABLE__
                if (is_pan_fault(state, esr, fault_addr, fault_code)) {
+#ifdef CONFIG_XNUPOST
+                       if ((pan_test_addr != 0) && (fault_addr == pan_test_addr))
+                       {
+                               ++pan_exception_level;
+                               // read the user-accessible value to make sure
+                               // pan is enabled and produces a 2nd fault from
+                               // the exception handler
+                               if (pan_exception_level == 1)
+                                       pan_fault_value = *(char *)pan_test_addr;       
+                               // this fault address is used for PAN test
+                               // disable PAN and rerun
+                               set_saved_state_cpsr(state,
+                                       get_saved_state_cpsr(state) & (~PSR64_PAN));
+                               return;
+                       }
+#endif
                        panic_with_thread_kernel_state("Privileged access never abort.", state);
                }
 #endif
@@ -1232,10 +1289,10 @@ handle_mach_continuous_time_trap(arm_saved_state_t *state)
 static void
 handle_msr_trap(arm_saved_state_t *state, uint32_t iss)
 {
-       exception_type_t                        exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t          numcodes = 2;
-       uint32_t                                        instr;
+       exception_type_t           exception = EXC_BAD_INSTRUCTION;
+       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes = 2;
+       uint32_t                   instr = 0;
 
        (void)iss;
 
@@ -1256,10 +1313,10 @@ handle_msr_trap(arm_saved_state_t *state, uint32_t iss)
 static void
 handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr)
 {
-       exception_type_t                        exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t          numcodes = 2;
-       uint32_t                                        instr;
+       exception_type_t           exception = EXC_BAD_INSTRUCTION;
+       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes = 2;
+       uint32_t                   instr = 0;
 
        if (is_saved_state64(state)) {
                panic("ESR (0x%x) for instruction trapped from U32, but saved state is 64-bit.", esr);
@@ -1278,10 +1335,10 @@ handle_user_trapped_instruction32(arm_saved_state_t *state, uint32_t esr)
 static void
 handle_simd_trap(arm_saved_state_t *state, uint32_t esr)
 {
-       exception_type_t                        exception = EXC_BAD_INSTRUCTION;
-       mach_exception_data_type_t      codes[2] = {EXC_ARM_UNDEFINED};
-       mach_msg_type_number_t          numcodes = 2;
-       uint32_t                                        instr;
+       exception_type_t           exception = EXC_BAD_INSTRUCTION;
+       mach_exception_data_type_t codes[2] = {EXC_ARM_UNDEFINED};
+       mach_msg_type_number_t     numcodes = 2;
+       uint32_t                   instr = 0;
 
        if (PSR64_IS_KERNEL(get_saved_state_cpsr(state))) {
                panic("ESR (0x%x) for SIMD trap from userland, actually came from kernel?", esr);
@@ -1301,6 +1358,9 @@ sleh_irq(arm_saved_state_t *state)
        uint32_t *   old_entropy_data_ptr = NULL;
        uint32_t *   new_entropy_data_ptr = NULL;
        cpu_data_t * cdp                  = getCpuDatap();
+#if DEVELOPMENT || DEBUG
+       int preemption_level = get_preemption_level();
+#endif
 
        sleh_interrupt_handler_prologue(state, DBG_INTR_TYPE_OTHER);
 
@@ -1334,25 +1394,45 @@ sleh_irq(arm_saved_state_t *state)
        *old_entropy_data_ptr = (uint32_t)timestamp ^ __ror(old_entropy_data, 9);
 
        sleh_interrupt_handler_epilogue();
+#if DEVELOPMENT || DEBUG
+       if (preemption_level != get_preemption_level())
+               panic("irq handler %p changed preemption level from %d to %d", cdp->interrupt_handler, preemption_level, get_preemption_level());
+#endif
 }
 
 void
 sleh_fiq(arm_saved_state_t *state)
 {
        unsigned int type   = DBG_INTR_TYPE_UNKNOWN;
+#if DEVELOPMENT || DEBUG
+       int preemption_level = get_preemption_level();
+#endif
+#if MONOTONIC
+       uint64_t pmsr = 0, upmsr = 0;
+#endif /* MONOTONIC */
+
+#if MONOTONIC
+       if (mt_pmi_pending(&pmsr, &upmsr)) {
+               type = DBG_INTR_TYPE_PMI;
+       } else
+#endif /* MONOTONIC */
        if (ml_get_timer_pending()) {
                type = DBG_INTR_TYPE_TIMER;
        }
 
        sleh_interrupt_handler_prologue(state, type);
 
+#if MONOTONIC
+       if (type == DBG_INTR_TYPE_PMI) {
+               mt_fiq(getCpuDatap(), pmsr, upmsr);
+       } else
+#endif /* MONOTONIC */
        {
                /*
                 * We don't know that this is a timer, but we don't have insight into
                 * the other interrupts that go down this path.
                 */
 
-
                cpu_data_t *cdp = getCpuDatap();
 
                cdp->cpu_decrementer = -1; /* Large */
@@ -1366,15 +1446,26 @@ sleh_fiq(arm_saved_state_t *state)
        }
 
        sleh_interrupt_handler_epilogue();
+#if DEVELOPMENT || DEBUG
+       if (preemption_level != get_preemption_level())
+               panic("fiq type %u changed preemption level from %d to %d", type, preemption_level, get_preemption_level());
+#endif
 }
 
 void
 sleh_serror(arm_context_t *context, uint32_t esr, vm_offset_t far)
 {
        arm_saved_state_t               *state = &context->ss;
+#if DEVELOPMENT || DEBUG
+       int preemption_level = get_preemption_level();
+#endif
 
        ASSERT_CONTEXT_SANITY(context);
        arm64_platform_error(state, esr, far);
+#if DEVELOPMENT || DEBUG
+       if (preemption_level != get_preemption_level())
+               panic("serror changed preemption level from %d to %d", preemption_level, get_preemption_level());
+#endif
 }
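The same DEVELOPMENT/DEBUG guard is threaded through sleh_irq, sleh_fiq and sleh_serror above; shown once in isolation (a sketch, with the handler body elided):

#if DEVELOPMENT || DEBUG
	int preemption_level = get_preemption_level();
#endif
	/* ... dispatch the interrupt or platform error ... */
#if DEVELOPMENT || DEBUG
	if (preemption_level != get_preemption_level())
		panic("handler changed preemption level from %d to %d",
		    preemption_level, get_preemption_level());
#endif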
 
 void
@@ -1434,7 +1525,7 @@ sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type)
 
 #if CONFIG_TELEMETRY
        if (telemetry_needs_record) {
-               telemetry_mark_curthread((boolean_t)is_user);
+               telemetry_mark_curthread((boolean_t)is_user, FALSE);
        }
 #endif /* CONFIG_TELEMETRY */
 }
@@ -1442,6 +1533,9 @@ sleh_interrupt_handler_prologue(arm_saved_state_t *state, unsigned int type)
 static void
 sleh_interrupt_handler_epilogue(void)
 {
+#if KPERF
+       kperf_interrupt();
+#endif /* KPERF */
        KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END);
 }
 
index 24a6dba7f79269bb7cd6cab7e9ad3a59d7e48b71..f709217cf72a4db21e83200bf9056bb628c16c7b 100644 (file)
@@ -29,8 +29,6 @@
 #include <arm64/asm.h>
 #include <arm64/proc_reg.h>
 #include <pexpert/arm64/board_config.h>
-#include <pexpert/arm64/cyclone.h>
-#include <pexpert/arm64/hurricane.h>
 #include <mach_assert.h>
 #include <machine/asm.h>
 #include "assym.s"
@@ -104,6 +102,7 @@ LEXT(LowResetVectorBase)
 
        // Unlock the core for debugging
        msr             OSLAR_EL1, xzr
+       msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
 
 #if !(defined(KERNEL_INTEGRITY_KTRR))
        // Set low reset vector before attempting any loads
@@ -123,18 +122,19 @@ LEXT(LowResetVectorBase)
         * If either values are zero, we're debugging kernel so skip programming KTRR.
         */
 
+
        // load stashed rorgn_begin
        adrp    x17, EXT(rorgn_begin)@page
        add             x17, x17, EXT(rorgn_begin)@pageoff
        ldr             x17, [x17]
        // if rorgn_begin is zero, we're debugging. skip enabling ktrr
-       cbz             x17, 1f
+       cbz             x17, Lskip_ktrr
 
        // load stashed rorgn_end
        adrp    x19, EXT(rorgn_end)@page
        add             x19, x19, EXT(rorgn_end)@pageoff
        ldr             x19, [x19]
-       cbz             x19, 1f
+       cbz             x19, Lskip_ktrr
 
        // program and lock down KTRR
        // subtract one page from rorgn_end to make pinst insns NX
@@ -143,9 +143,8 @@ LEXT(LowResetVectorBase)
        msr             ARM64_REG_KTRR_UPPER_EL1, x19
        mov             x17, #1
        msr             ARM64_REG_KTRR_LOCK_EL1, x17
-
-1:
-#endif /* defined(KERNEL_INTEGRITY_KTRR) */
+Lskip_ktrr:
+#endif /* defined(KERNEL_INTEGRITY_KTRR)*/
 
        // Process reset handlers
        adrp    x19, EXT(ResetHandlerData)@page                 // Get address of the reset handler data
@@ -304,13 +303,13 @@ start_cpu:
 
        // x20 set to BootArgs phys address
        // x21 set to cpu data phys address
-       msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
 
        // Get the kernel memory parameters from the boot args
        ldr             x22, [x20, BA_VIRT_BASE]                        // Get the kernel virt base
        ldr             x23, [x20, BA_PHYS_BASE]                        // Get the kernel phys base
        ldr             x24, [x20, BA_MEM_SIZE]                         // Get the physical memory size
        ldr             x25, [x20, BA_TOP_OF_KERNEL_DATA]       // Get the top of the kernel data
+       ldr             x26, [x20, BA_BOOT_FLAGS]                       // Get the kernel boot flags
 
        // Set TPIDRRO_EL0 with the CPU number
        ldr             x0, [x21, CPU_NUMBER_GS]
@@ -395,6 +394,52 @@ start_cpu:
        b.ne    1b
 .endmacro
 
+/*
+ *  arg0 - virtual start address
+ *  arg1 - physical start address
+ *  arg2 - number of entries to map
+ *  arg3 - L1 table address
+ *  arg4 - free space pointer
+ *  arg5 - scratch (entries mapped per loop)
+ *  arg6 - scratch
+ *  arg7 - scratch
+ *  arg8 - scratch
+ *  arg9 - scratch
+ */
+.macro create_bootstrap_mapping
+       /* calculate entries left in this page */
+       and     $5, $0, #(ARM_TT_L2_INDEX_MASK)
+       lsr     $5, $5, #(ARM_TT_L2_SHIFT)
+       mov     $6, #(TTE_PGENTRIES)
+       sub     $5, $6, $5
+
+       /* allocate an L2 table */
+3:     add     $4, $4, PGBYTES
+
+       /* create_l1_table_entry(virt_base, L1 table, L2 table, scratch1, scratch2, scratch3) */
+       create_l1_table_entry   $0, $3, $4, $6, $7, $8
+
+       /* determine how many entries to map this loop - the smaller of entries
+        * remaining in page and total entries left */
+       cmp     $2, $5
+       csel    $5, $2, $5, lt
+
+       /* create_l2_block_entries(virt_base, phys_base, L2 table, num_ents, scratch1, scratch2, scratch3) */
+       create_l2_block_entries $0, $1, $4, $5, $6, $7, $8, $9
+
+       /* subtract entries just mapped and bail out if we're done */
+       subs    $2, $2, $5
+       beq     2f
+
+       /* entries left to map - advance base pointers */
+       add     $0, $0, $5, lsl #(ARM_TT_L2_SHIFT)
+       add     $1, $1, $5, lsl #(ARM_TT_L2_SHIFT)
+
+       mov     $5, #(TTE_PGENTRIES)  /* subsequent loops map (up to) a whole L2 page */
+       b       3b
+2:
+.endmacro
+
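Roughly what the macro above does per call, rendered as C for readability (a loose sketch: alloc_l2_page, set_l1_entry and map_l2_blocks are invented stand-ins for the PGBYTES bump, create_l1_table_entry and create_l2_block_entries steps, not xnu functions):

extern tt_entry_t *alloc_l2_page(uint64_t *free_ptr);
extern void set_l1_entry(tt_entry_t *l1_table, uint64_t va, tt_entry_t *l2_table);
extern void map_l2_blocks(tt_entry_t *l2_table, uint64_t va, uint64_t pa, uint64_t num_ents);

static void
create_bootstrap_mapping_sketch(uint64_t va, uint64_t pa, uint64_t num_ents,
    tt_entry_t *l1_table, uint64_t *free_ptr)
{
	/* entries left in the L2 page that covers the starting VA */
	uint64_t chunk = TTE_PGENTRIES - ((va & ARM_TT_L2_INDEX_MASK) >> ARM_TT_L2_SHIFT);

	while (num_ents != 0) {
		tt_entry_t *l2_table = alloc_l2_page(free_ptr);  /* bump the free pointer by one page */
		set_l1_entry(l1_table, va, l2_table);            /* point the L1 entry for va at it */

		if (chunk > num_ents)
			chunk = num_ents;                        /* do not map more than requested */
		map_l2_blocks(l2_table, va, pa, chunk);          /* fill chunk L2 block entries */

		num_ents -= chunk;
		va += chunk << ARM_TT_L2_SHIFT;
		pa += chunk << ARM_TT_L2_SHIFT;
		chunk = TTE_PGENTRIES;                           /* later iterations may fill a whole L2 page */
	}
}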
 /*
  * _start_first_cpu
  * Cold boot init routine.  Called from __start
@@ -406,8 +451,9 @@ LEXT(start_first_cpu)
 
        // Unlock the core for debugging
        msr             OSLAR_EL1, xzr
+       msr             DAIFSet, #(DAIFSC_ALL)                          // Disable all interrupts
        mov             x20, x0
-       mov             x21, xzr
+       mov             x21, #0
 
        // Set low reset vector before attempting any loads
        adrp    x0, EXT(LowExceptionVectorBase)@page
@@ -415,14 +461,16 @@ LEXT(start_first_cpu)
        MSR_VBAR_EL1_X0
 
 
-
        // Get the kernel memory parameters from the boot args
        ldr             x22, [x20, BA_VIRT_BASE]                        // Get the kernel virt base
        ldr             x23, [x20, BA_PHYS_BASE]                        // Get the kernel phys base
        ldr             x24, [x20, BA_MEM_SIZE]                         // Get the physical memory size
        ldr             x25, [x20, BA_TOP_OF_KERNEL_DATA]       // Get the top of the kernel data
+       ldr             x26, [x20, BA_BOOT_FLAGS]                       // Get the kernel boot flags
 
-       // Set CPU number to 0
+       // Clear the register that will be used to store the userspace thread pointer and CPU number.
+       // We may not actually be booting from ordinal CPU 0, so this register will be updated
+       // in ml_parse_cpu_topology(), which happens later in bootstrap.
        msr             TPIDRRO_EL0, x21
 
        // Set up exception stack pointer
@@ -480,21 +528,11 @@ LEXT(start_first_cpu)
 #else
        lsl             x2, x2, #2                                                      // Shift by 2 for num entries on 4 pages
 #endif
-       sub             x2, x2, #1                                                      // Subtract one to terminate on last entry
 Linvalidate_bootstrap:                                                 // do {
        str             x0, [x1], #(1 << TTE_SHIFT)                     //   Invalidate and advance
        subs    x2, x2, #1                                                      //   entries--
        b.ne    Linvalidate_bootstrap                           // } while (entries != 0)
 
-       /* Load addresses for page table construction macros
-        *  x0 - Physical base (used to identify V=P section to set up)
-        *      x1 - V=P L1 table base
-        *      x2 - V=P L2 table base
-        *      x3 - KVA L1 table base
-        *      x4 - KVA L2 table base
-        *      x5 - Mem size in entries (up to 1GB)
-        */
-
        /*
         * In order to reclaim memory on targets where TZ0 (or some other entity)
         * must be located at the base of memory, iBoot may set the virtual and
@@ -512,53 +550,55 @@ Linvalidate_bootstrap:                                                    // do {
         * mapping TZ0.
         */
        adrp    x0, EXT(_mh_execute_header)@page        // Use xnu's mach header as the start address
-       add             x0, x0, EXT(_mh_execute_header)@pageoff
-#if __ARM64_TWO_LEVEL_PMAP__
+       add     x0, x0, EXT(_mh_execute_header)@pageoff
+
        /*
-        * We don't need the L1 entries in this case, so skip them.
+        * Adjust physical and virtual base addresses to account for physical
+        * memory preceding xnu Mach-O header
+        * x22 - Kernel virtual base
+        * x23 - Kernel physical base
+        * x24 - Physical memory size
         */
-       mov             x2, x25                                                         // Load V=P L2 table address
-       add             x4, x2, PGBYTES                                         // Load KVA L2 table address
-#else
-       mov             x1, x25                                                         // Load V=P L1 table address
-       add             x2, x1, PGBYTES                                         // Load V=P L2 table address
-       add             x3, x2, PGBYTES                                         // Load KVA L1 table address
-       add             x4, x3, PGBYTES                                         // Load KVA L2 table address
-#endif
+       sub             x18, x0, x23
+       sub             x24, x24, x18
+       add             x22, x22, x18
+       add             x23, x23, x18
+
        /*
-        * We must adjust the amount we wish to map in order to account for the
-        * memory preceeding xnu's mach header.
+        * x0  - V=P virtual cursor
+        * x4  - V=P physical cursor
+        * x14 - KVA virtual cursor
+        * x15 - KVA physical cursor
         */
-       sub             x5, x0, x23                                                     // Map from the mach header up to the end of our memory
-       sub             x5, x24, x5
-       lsr             x5, x5, #(ARM_TT_L2_SHIFT)
-       mov             x6, #(TTE_PGENTRIES)                            // Load number of L2 entries per page
-       cmp             x5, x6                                                          // If memsize requires more than 1 page of entries
-       csel    x5, x5, x6, lt                                          // ... round down to a single page (first 1GB)
-
-#if !__ARM64_TWO_LEVEL_PMAP__
-       /* Create entry for L2 table in V=P L1 table
-        * create_l1_table_entry(V=P, L1 table, L2 table, scratch1, scratch2, scratch3)
-        */
-       create_l1_table_entry   x0, x1, x2, x10, x11, x12
-#endif
+       mov             x4, x0
+       mov             x14, x22
+       mov             x15, x23
 
-       /* Create block entry in V=P L2 table
-        * create_l2_block_entries(V=P virt, V=P phys, L2 table, num_ents, scratch1, scratch2, scratch3)
+       /*
+        * Allocate L1 tables
+        * x1 - V=P L1 page
+        * x3 - KVA L1 page
+        * x2 - free mem pointer from which we allocate a variable number of L2
+        * pages. The maximum number of bootstrap page table pages is limited to
+        * BOOTSTRAP_TABLE_SIZE. For a 2G 4k page device, assuming the worst-case
+        * slide, we need 1xL1 and up to 3xL2 pages (1GB mapped per L1 entry), so
+        * 8 total pages for V=P and KVA.
         */
-       create_l2_block_entries x0, x0, x2, x5, x10, x11, x12, x13
+       mov             x1, x25
+       add             x3, x1, PGBYTES
+       mov             x2, x3
 
-#if !__ARM64_TWO_LEVEL_PMAP__
-       /* Create entry for L2 table in KVA L1 table
-        * create_l1_table_entry(virt_base, L1 table, L2 table, scratch1, scratch2, scratch3)
+       /*
+        * Setup the V=P bootstrap mapping
+        * x5 - total number of L2 entries to allocate
         */
-       create_l1_table_entry   x22, x3, x4, x10, x11, x12
-#endif
+       lsr             x5,  x24, #(ARM_TT_L2_SHIFT)
+       /* create_bootstrap_mapping(vbase, pbase, num_ents, L1 table, freeptr) */
+       create_bootstrap_mapping x0,  x4,  x5, x1, x2, x6, x10, x11, x12, x13
 
-       /* Create block entries in KVA L2 table
-        * create_l2_block_entries(virt_base, phys_base, L2 table, num_ents, scratch1, scratch2, scratch3)
-        */
-       create_l2_block_entries x22, x23, x4, x5, x10, x11, x12, x13
+       /* Setup the KVA bootstrap mapping */
+       lsr             x5,  x24, #(ARM_TT_L2_SHIFT)
+       create_bootstrap_mapping x14, x15, x5, x3, x2, x9, x10, x11, x12, x13
 
        /* Ensure TTEs are visible */
        dsb             ish
@@ -573,8 +613,7 @@ Linvalidate_bootstrap:                                                      // do {
  *     x21 - zero on cold boot, PA of cpu data on warm reset
  *     x22 - Kernel virtual base
  *     x23 - Kernel physical base
- *     x24     - Physical memory size
- *     x25 - PA of the end of the kernl
+ *     x25 - PA of the end of the kernel
  *      lr - KVA of C init routine
  *      sp - SP_EL0 selected
  *
@@ -591,7 +630,7 @@ common_start:
 
        /* Set up translation table base registers.
         *      TTBR0 - V=P table @ top of kernel
-        *      TTBR1 - KVA table @ top of kernel + 2 pages
+        *      TTBR1 - KVA table @ top of kernel + 1 page
         */
 #if defined(KERNEL_INTEGRITY_KTRR)
        /* Note that for KTRR configurations, the V=P map will be modified by
@@ -599,25 +638,10 @@ common_start:
         */
 #endif
        and             x0, x25, #(TTBR_BADDR_MASK)
-#if __ARM_KERNEL_PROTECT__
-       /* We start out with a kernel ASID. */
-       orr             x0, x0, #(1 << TTBR_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-       msr             TTBR0_EL1, x0
-#if __ARM64_TWO_LEVEL_PMAP__
-       /*
-        * If we're using a two level pmap, we'll only need a
-        * single page per bootstrap pmap.
-        */
-       mov             x12, #1
-#else
-       /*
-        * If we're using a three level pmap, we'll need two
-        * pages per bootstrap pmap.
-        */
-       mov             x12, #2
-#endif
-       add             x0, x25, x12, lsl PGSHIFT
+       mov             x19, lr
+       bl              EXT(set_mmu_ttb)
+       mov             lr, x19
+       add             x0, x25, PGBYTES
        and             x0, x0, #(TTBR_BADDR_MASK)
        MSR_TTBR1_EL1_X0
 
@@ -637,9 +661,6 @@ common_start:
        orr             x0, x0, x1
        msr             MAIR_EL1, x0
 
-       // Disable interrupts
-       msr     DAIFSet, #(DAIFSC_IRQF | DAIFSC_FIQF)
-
 #if defined(APPLEHURRICANE)
 
        // <rdar://problem/26726624> Increase Snoop reservation in EDB to reduce starvation risk
@@ -651,6 +672,19 @@ common_start:
 
 #endif
 
+#if defined(BCM2837)
+       // Setup timer interrupt routing; must be done before MMU is enabled
+       mrs             x15, MPIDR_EL1                                          // Load MPIDR to get CPU number
+       and             x15, x15, #0xFF                                         // CPU number is in MPIDR Affinity Level 0
+       mov             x0, #0x4000
+       lsl             x0, x0, #16
+       add             x0, x0, #0x0040                                         // x0: 0x4000004X Core Timers interrupt control
+       add             x0, x0, x15, lsl #2
+       mov             w1, #0xF0                                               // x1: 0xF0       Route to Core FIQs
+       str             w1, [x0]
+       isb             sy
+#endif
+
 
 #ifndef __ARM_IC_NOALIAS_ICACHE__
        /* Invalidate the TLB and icache on systems that do not guarantee that the
@@ -725,13 +759,13 @@ common_start:
 #if defined(APPLECYCLONE) || defined(APPLETYPHOON)
        //
        // Cyclone/Typhoon-Specific initialization
-       // For tunable summary, see <rdar://problem/13503621>
+       // For tunable summary, see <rdar://problem/13503621> Alcatraz/H6: Confirm Cyclone CPU tunables have been set
        //
 
        //
        // Disable LSP flush with context switch to work around bug in LSP
        // that can cause Cyclone to wedge when CONTEXTIDR is written.
-       // <rdar://problem/12387704>
+       // <rdar://problem/12387704> Innsbruck11A175: panic(cpu 0 caller 0xffffff800024e30c): "wait queue deadlock - wq=0xffffff805a7a63c0, cpu=0\n"
        //
 
        mrs             x12, ARM64_REG_HID0
@@ -843,6 +877,83 @@ Lskip_isalive:
 
 #endif // APPLEHURRICANE
 
+#if defined(APPLEMONSOON)
+
+       /***** Tunables that apply to all skye cores, all chip revs *****/
+
+       // <rdar://problem/28512310> SW WAR/eval: WKdm write ack lost when bif_wke_colorWrAck_XXaH asserts concurrently for both colors
+       mrs             x12, ARM64_REG_HID8
+       orr             x12, x12, #ARM64_REG_HID8_WkeForceStrictOrder
+       msr             ARM64_REG_HID8, x12
+
+       // Skip if not E-core
+       ARM64_IS_PCORE x15
+       cbnz            x15, Lskip_skye_ecore_only
+
+       /***** Tunables that only apply to skye e-cores, all chip revs *****/
+
+       // <rdar://problem/30423928>: Atomic launch eligibility is erroneously taken away when a store at SMB gets invalidated
+       mrs             x12, ARM64_REG_EHID11
+       and             x12, x12, ~(ARM64_REG_EHID11_SmbDrainThresh_mask)
+       msr             ARM64_REG_EHID11, x12
+
+Lskip_skye_ecore_only:
+
+       SKIP_IF_CPU_VERSION_GREATER_OR_EQUAL x12, MONSOON_CPU_VERSION_B0, Lskip_skye_a0_workarounds
+
+       // Skip if not E-core
+       cbnz            x15, Lskip_skye_a0_ecore_only
+
+       /***** Tunables that only apply to skye e-cores, chip revs < B0 *****/
+
+       // Disable downstream fill bypass logic
+       // <rdar://problem/28545159> [Tunable] Skye - L2E fill bypass collision from both pipes to ecore
+       mrs             x12, ARM64_REG_EHID5
+       orr             x12, x12, ARM64_REG_EHID5_DisFillByp
+       msr             ARM64_REG_EHID5, x12
+
+       // Disable forwarding of return addresses to the NFP 
+       // <rdar://problem/30387067> Skye: FED incorrectly taking illegal va exception
+       mrs             x12, ARM64_REG_EHID0
+       orr             x12, x12, ARM64_REG_EHID0_nfpRetFwdDisb
+       msr             ARM64_REG_EHID0, x12
+
+Lskip_skye_a0_ecore_only:
+
+       /***** Tunables that apply to all skye cores, chip revs < B0 *****/
+
+       // Disable clock divider gating
+       // <rdar://problem/30854420> [Tunable/Errata][cpu_1p_1e] [CPGV2] ACC power down issue when link FSM switches from GO_DN to CANCEL and at the same time upStreamDrain request is set.
+       mrs             x12, ARM64_REG_HID6
+       orr             x12, x12, ARM64_REG_HID6_DisClkDivGating
+       msr             ARM64_REG_HID6, x12
+
+       // Disable clock dithering
+       // <rdar://problem/29022199> [Tunable] Skye A0: Linux: LLC PIO Errors
+       mrs             x12, ARM64_REG_ACC_OVRD
+       orr             x12, x12, ARM64_REG_ACC_OVRD_dsblClkDtr
+       msr             ARM64_REG_ACC_OVRD, x12
+
+       mrs             x12, ARM64_REG_ACC_EBLK_OVRD
+       orr             x12, x12, ARM64_REG_ACC_OVRD_dsblClkDtr
+       msr             ARM64_REG_ACC_EBLK_OVRD, x12
+
+Lskip_skye_a0_workarounds:
+
+       SKIP_IF_CPU_VERSION_LESS_THAN x12, MONSOON_CPU_VERSION_B0, Lskip_skye_post_a1_workarounds
+
+       /***** Tunables that apply to all skye cores, chip revs >= B0 *****/
+
+       // <rdar://problem/32512836>: Disable refcount syncing between E and P
+       mrs             x12, ARM64_REG_CYC_OVRD
+       and             x12, x12, ~ARM64_REG_CYC_OVRD_dsblSnoopTime_mask
+       orr             x12, x12, ARM64_REG_CYC_OVRD_dsblSnoopPTime
+       msr             ARM64_REG_CYC_OVRD, x12
+
+Lskip_skye_post_a1_workarounds:
+
+#endif /* defined(APPLEMONSOON) */
+
 
        // If x21 != 0, we're doing a warm reset, so we need to trampoline to the kernel pmap.
        cbnz    x21, Ltrampoline
@@ -913,29 +1024,12 @@ arm_init_tramp:
         *  +---Kernel Base---+
         */
 
-
-       adrp    x0, EXT(invalid_ttep)@page
-       add             x0, x0, EXT(invalid_ttep)@pageoff
-       ldr             x0, [x0]
-#if __ARM_KERNEL_PROTECT__
-       /* We start out with a kernel ASID. */
-       orr             x0, x0, #(1 << TTBR_ASID_SHIFT)
-#endif /* __ARM_KERNEL_PROTECT__ */
-
-       msr             TTBR0_EL1, x0
-
+       mov             x19, lr
        // Convert CPU data PA to VA and set as first argument
-       add             x0, x21, x22
-       sub             x0, x0, x23
-       mov             x1, #0
+       mov             x0, x21
+       bl              EXT(phystokv)
 
-       // Make sure that the TLB flush happens after the registers are set!
-       isb             sy
-
-       // Synchronize system for TTBR updates
-       tlbi    vmalle1
-       dsb             sy
-       isb             sy
+       mov             lr, x19
 
        /* Return to arm_init() */
        ret
index cf2d66cd83b770888557547b6d3dcf8de945d2d5..ff0429e36d0be330019676f50461dae4667a7502 100644 (file)
@@ -231,22 +231,92 @@ handle_set_arm_thread_state(
         * what the client is expecting.
         */
        if (count < ARM_UNIFIED_THREAD_STATE_COUNT) {
+               if (!is_saved_state32(saved_state)) {
+                       return (KERN_INVALID_ARGUMENT);
+               }
                return handle_set_arm32_thread_state(tstate, count, saved_state);
        }
 
        const arm_unified_thread_state_t *unified_state = (const arm_unified_thread_state_t *) tstate;
 #if __arm64__
        if (is_thread_state64(unified_state)) {
+               if (!is_saved_state64(saved_state)) {
+                       return (KERN_INVALID_ARGUMENT);
+               }
                (void)thread_state64_to_saved_state(const_thread_state64(unified_state), saved_state);
        } else
 #endif
        {
+               if (!is_saved_state32(saved_state)) {
+                       return (KERN_INVALID_ARGUMENT);
+               }
                (void)thread_state32_to_saved_state(const_thread_state32(unified_state), saved_state);
        }
 
        return (KERN_SUCCESS);
 }
 
+/*
+ * Translate thread state arguments to userspace representation
+ */
+
+kern_return_t
+machine_thread_state_convert_to_user(
+                        thread_t thread,
+                        thread_flavor_t flavor,
+                        thread_state_t tstate,
+                        mach_msg_type_number_t *count)
+{
+       // No conversion to userspace representation on this platform
+       (void)thread; (void)flavor; (void)tstate; (void)count;
+       return KERN_SUCCESS;
+}
+
+/*
+ * Translate thread state arguments from userspace representation
+ */
+
+kern_return_t
+machine_thread_state_convert_from_user(
+                        thread_t thread,
+                        thread_flavor_t flavor,
+                        thread_state_t tstate,
+                        mach_msg_type_number_t count)
+{
+       // No conversion from userspace representation on this platform
+       (void)thread; (void)flavor; (void)tstate; (void)count;
+       return KERN_SUCCESS;
+}
+
+/*
+ * Translate signal context data pointer to userspace representation
+ */
+
+kern_return_t
+machine_thread_siguctx_pointer_convert_to_user(
+                        __assert_only thread_t thread,
+                        user_addr_t *uctxp)
+{
+       // No conversion to userspace representation on this platform
+       (void)thread; (void)uctxp;
+       return KERN_SUCCESS;
+}
+
+/*
+ * Translate array of function pointer syscall arguments from userspace representation
+ */
+
+kern_return_t
+machine_thread_function_pointers_convert_from_user(
+                        __assert_only thread_t thread,
+                        user_addr_t *fptrs,
+                        uint32_t count)
+{
+       // No conversion from userspace representation on this platform
+       (void)thread; (void)fptrs; (void)count;
+       return KERN_SUCCESS;
+}
+
 /*
  * Routine:    machine_thread_get_state
  *
@@ -276,8 +346,8 @@ machine_thread_get_state(
 
                tstate[0] = ARM_THREAD_STATE;
                tstate[1] = ARM_VFP_STATE;
-               tstate[2] = thread_is_64bit(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE;
-               tstate[3] = thread_is_64bit(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32;
+               tstate[2] = thread_is_64bit_data(thread) ? ARM_EXCEPTION_STATE64 : ARM_EXCEPTION_STATE;
+               tstate[3] = thread_is_64bit_data(thread) ? ARM_DEBUG_STATE64 : ARM_DEBUG_STATE32;
                *count = 4;
                break;
 
@@ -289,7 +359,7 @@ machine_thread_get_state(
        }
        case ARM_THREAD_STATE32:
        {
-               if (thread_is_64bit(thread))
+               if (thread_is_64bit_data(thread))
                        return KERN_INVALID_ARGUMENT;
 
                kern_return_t rn = handle_get_arm32_thread_state(tstate, count, thread->machine.upcb);
@@ -299,7 +369,7 @@ machine_thread_get_state(
 #if __arm64__
        case ARM_THREAD_STATE64:
        {
-               if (!thread_is_64bit(thread))
+               if (!thread_is_64bit_data(thread))
                        return KERN_INVALID_ARGUMENT;
 
                kern_return_t rn = handle_get_arm64_thread_state(tstate, count, thread->machine.upcb);
@@ -313,7 +383,7 @@ machine_thread_get_state(
 
                        if (*count < ARM_EXCEPTION_STATE_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (struct arm_exception_state *) tstate;
@@ -332,7 +402,7 @@ machine_thread_get_state(
 
                        if (*count < ARM_EXCEPTION_STATE64_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (!thread_is_64bit(thread))
+                       if (!thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (struct arm_exception_state64 *) tstate;
@@ -352,7 +422,7 @@ machine_thread_get_state(
                        if (*count < ARM_LEGACY_DEBUG_STATE_COUNT)
                                return (KERN_INVALID_ARGUMENT);
                        
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_legacy_debug_state_t *) tstate;
@@ -373,7 +443,7 @@ machine_thread_get_state(
                        if (*count < ARM_DEBUG_STATE32_COUNT)
                                return (KERN_INVALID_ARGUMENT);
                        
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_debug_state32_t *) tstate;
@@ -395,7 +465,7 @@ machine_thread_get_state(
                        if (*count < ARM_DEBUG_STATE64_COUNT)
                                return (KERN_INVALID_ARGUMENT);
                        
-                       if (!thread_is_64bit(thread))
+                       if (!thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_debug_state64_t *) tstate;
@@ -439,10 +509,10 @@ machine_thread_get_state(
                arm_neon_state_t *state;
                arm_neon_saved_state32_t *thread_state;
 
-        if (*count < ARM_NEON_STATE_COUNT)
+               if (*count < ARM_NEON_STATE_COUNT)
                        return (KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thread))
+               if (thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                state = (arm_neon_state_t *)tstate;
@@ -460,10 +530,10 @@ machine_thread_get_state(
                arm_neon_state64_t *state;
                arm_neon_saved_state64_t *thread_state;
 
-        if (*count < ARM_NEON_STATE64_COUNT)
+               if (*count < ARM_NEON_STATE64_COUNT)
                        return (KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thread))
+               if (!thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                state = (arm_neon_state64_t *)tstate;
@@ -532,7 +602,7 @@ machine_thread_get_kern_state(
 void
 machine_thread_switch_addrmode(thread_t thread)
 {
-       if (task_has_64BitAddr(thread->task)) {
+       if (task_has_64Bit_data(thread->task)) {
                thread->machine.upcb->ash.flavor = ARM_SAVED_STATE64;
                thread->machine.upcb->ash.count = ARM_SAVED_STATE64_COUNT;
                thread->machine.uNeon->nsh.flavor = ARM_NEON_SAVED_STATE64;
@@ -579,7 +649,7 @@ machine_thread_set_state(
                break;
 
        case ARM_THREAD_STATE32:
-               if (thread_is_64bit(thread))
+               if (thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                rn = handle_set_arm32_thread_state(tstate, count, thread->machine.upcb);
@@ -588,7 +658,7 @@ machine_thread_set_state(
 
 #if __arm64__
        case ARM_THREAD_STATE64:
-               if (!thread_is_64bit(thread))
+               if (!thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                rn = handle_set_arm64_thread_state(tstate, count, thread->machine.upcb);
@@ -599,7 +669,7 @@ machine_thread_set_state(
 
                        if (count != ARM_EXCEPTION_STATE_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        break;
@@ -608,7 +678,7 @@ machine_thread_set_state(
 
                        if (count != ARM_EXCEPTION_STATE64_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (!thread_is_64bit(thread))
+                       if (!thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        break;
@@ -621,7 +691,7 @@ machine_thread_set_state(
 
                        if (count != ARM_LEGACY_DEBUG_STATE_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_legacy_debug_state_t *) tstate;
@@ -698,7 +768,7 @@ machine_thread_set_state(
 
                        if (count != ARM_DEBUG_STATE32_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (thread_is_64bit(thread))
+                       if (thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_debug_state32_t *) tstate;
@@ -781,7 +851,7 @@ machine_thread_set_state(
 
                        if (count != ARM_DEBUG_STATE64_COUNT)
                                return (KERN_INVALID_ARGUMENT);
-                       if (!thread_is_64bit(thread))
+                       if (!thread_is_64bit_data(thread))
                                return (KERN_INVALID_ARGUMENT);
 
                        state = (arm_debug_state64_t *) tstate;
@@ -886,7 +956,7 @@ machine_thread_set_state(
                if (count != ARM_NEON_STATE_COUNT)
                        return (KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thread))
+               if (thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                state = (arm_neon_state_t *)tstate;
@@ -908,7 +978,7 @@ machine_thread_set_state(
                if (count != ARM_NEON_STATE64_COUNT)
                        return (KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thread))
+               if (!thread_is_64bit_data(thread))
                        return (KERN_INVALID_ARGUMENT);
 
                state = (arm_neon_state64_t *)tstate;
@@ -958,6 +1028,7 @@ machine_thread_state_initialize(
 
        thread->machine.DebugData = NULL;
 
+
        return KERN_SUCCESS;
 }
 
@@ -968,7 +1039,8 @@ machine_thread_state_initialize(
 kern_return_t
 machine_thread_dup(
                   thread_t self,
-                  thread_t target)
+                  thread_t target,
+                  __unused boolean_t is_corpse)
 {
        struct arm_saved_state *self_saved_state;
        struct arm_saved_state *target_saved_state;
@@ -1056,13 +1128,13 @@ find_debug_state64(
  */
 kern_return_t
 thread_userstack(
-                thread_t thread,
-                int flavor,
-                thread_state_t tstate,
-                unsigned int count,
-                mach_vm_offset_t * user_stack,
-                int *customstack,
-                boolean_t is64bit
+               __unused thread_t thread,
+               int flavor,
+               thread_state_t tstate,
+               unsigned int count,
+               mach_vm_offset_t * user_stack,
+               int *customstack,
+               boolean_t is_64bit_data
 )
 {
        register_t sp;
@@ -1071,7 +1143,7 @@ thread_userstack(
        case ARM_THREAD_STATE:
                if (count == ARM_UNIFIED_THREAD_STATE_COUNT) {
 #if __arm64__
-                       if (thread_is_64bit(thread)) {
+                       if (is_64bit_data) {
                                sp = ((arm_unified_thread_state_t *)tstate)->ts_64.sp;
                        } else
 #endif
@@ -1086,7 +1158,7 @@ thread_userstack(
        case ARM_THREAD_STATE32:
                if (count != ARM_THREAD_STATE32_COUNT)
                        return (KERN_INVALID_ARGUMENT);
-               if (is64bit)
+               if (is_64bit_data)
                        return (KERN_INVALID_ARGUMENT);
 
                sp = ((arm_thread_state32_t *)tstate)->sp;
@@ -1095,7 +1167,7 @@ thread_userstack(
        case ARM_THREAD_STATE64:
                if (count != ARM_THREAD_STATE64_COUNT)
                        return (KERN_INVALID_ARGUMENT);
-               if (!is64bit)
+               if (!is_64bit_data)
                        return (KERN_INVALID_ARGUMENT);
 
                sp = ((arm_thread_state32_t *)tstate)->sp;
@@ -1312,7 +1384,7 @@ act_thread_csave(void)
        }
 
 #if __ARM_VFP__
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_data(thread)) {
                val = ARM_NEON_STATE64_COUNT;
                kret = machine_thread_get_state(thread,
                                ARM_NEON_STATE64,
@@ -1353,7 +1425,7 @@ act_thread_catt(void *ctx)
                goto out;
 
 #if __ARM_VFP__
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_data(thread)) {
                kret = machine_thread_set_state(thread,
                                ARM_NEON_STATE64,
                                (thread_state_t) & ic->ns,
@@ -1390,7 +1462,7 @@ thread_set_wq_state32(thread_t thread, thread_state_t tstate)
        thread_t curth = current_thread();
        spl_t s=0;
 
-       assert(!thread_is_64bit(thread));
+       assert(!thread_is_64bit_data(thread));
 
        saved_state = thread->machine.upcb;
        saved_state_32 = saved_state32(saved_state);
@@ -1427,7 +1499,7 @@ thread_set_wq_state64(thread_t thread, thread_state_t tstate)
        thread_t curth = current_thread();
        spl_t s=0;
 
-       assert(thread_is_64bit(thread));
+       assert(thread_is_64bit_data(thread));
 
        saved_state = thread->machine.upcb;
        saved_state_64 = saved_state64(saved_state);
@@ -1444,7 +1516,7 @@ thread_set_wq_state64(thread_t thread, thread_state_t tstate)
         * like sp.
         */
        thread_state64_to_saved_state(state, saved_state);
-       saved_state_64->cpsr = PSR64_USER64_DEFAULT;
+       set_saved_state_cpsr(saved_state, PSR64_USER64_DEFAULT);
 
        if (curth != thread) {
                thread_unlock(thread);
index eee2de7225831677ccdd14cd961d29f548f7db43..07c9a36d0c904cb745b53363a935c5432c509961 100644 (file)
@@ -34,6 +34,8 @@
  * of the first mismatched characters interpreted as uint8_t.
  */
 
+#include <arm64/asm.h>
+
 .globl _strncmp
 
 /*****************************************************************************
  *****************************************************************************/
 
 .macro EstablishFrame
+       ARM64_STACK_PROLOG
        stp       fp, lr, [sp, #-16]!
        mov       fp,      sp
 .endm
 
 .macro ClearFrameAndReturn
        ldp       fp, lr, [sp], #16
-       ret
+       ARM64_STACK_EPILOG
 .endm
 
 #include "../mach/arm/vm_param.h"
index 3e0080669678874b69f9a811a43b240f20f2bdc2..4ec162539da44d04a47e422ab05a0ef98c517c1a 100644 (file)
@@ -33,6 +33,8 @@
 * is smaller, without reading beyond the first maxlen characters of string.
  */
 
+#include <arm64/asm.h>
+
 .globl _strlen
 .globl _strnlen
 
  *****************************************************************************/
 
 .macro EstablishFrame
+       ARM64_STACK_PROLOG
        stp       fp, lr, [sp, #-16]!
        mov       fp,      sp
 .endm
 
 .macro ClearFrameAndReturn
        ldp       fp, lr, [sp], #16
-       ret
+       ARM64_STACK_EPILOG
 .endm
 
 /*****************************************************************************
@@ -116,7 +119,7 @@ _strnlen:
        ClearFrameAndReturn
 
 L_maxlenIsZero:
-       mov       x0,      xzr
+       mov       x0,      #0
        ret                         // No stack frame, so don't clear it.
 
 L_foundNUL:
index b7d5d11d0fd3a6e55f533b729bf95256d9a8cdf5..4b183e9b54b8343f4646ca60b3096d6aaa2dfde6 100644 (file)
@@ -74,7 +74,10 @@ static bank_account_t bank_account_alloc_init(bank_task_t bank_holder, bank_task
 static bank_task_t get_bank_task_context(task_t task, boolean_t initialize);
 static void bank_task_dealloc(bank_task_t bank_task, mach_voucher_attr_value_reference_t sync);
 static kern_return_t bank_account_dealloc_with_sync(bank_account_t bank_account, mach_voucher_attr_value_reference_t sync);
-static void bank_rollup_chit_to_tasks(ledger_t bill, bank_task_t bank_holder, bank_task_t bank_merchant);
+static void bank_rollup_chit_to_tasks(ledger_t bill, ledger_t bank_holder_ledger, ledger_t bank_merchant_ledger,
+       int bank_holder_pid, int bank_merchant_pid);
+static ledger_t bank_get_bank_task_ledger_with_ref(bank_task_t bank_task);
+static void bank_destroy_bank_task_ledger(bank_task_t bank_task);
 static void init_bank_ledgers(void);
 static boolean_t bank_task_is_propagate_entitled(task_t t);
 static struct thread_group *bank_get_bank_task_thread_group(bank_task_t bank_task __unused);
@@ -729,7 +732,7 @@ bank_release(
  * Purpose: Allocate and initialize a bank task structure.
  * Returns: bank_task_t on Success.
  *          BANK_TASK_NULL: on Failure.
- * Notes:   Leaves the task and creditcard blank and has only 1 ref,
+ * Notes:   Leaves the task and ledger blank and has only 1 ref,
             needs to take 1 extra ref after the task field is initialized.
  */
 static bank_task_t
@@ -745,7 +748,7 @@ bank_task_alloc_init(task_t task)
        new_bank_task->bt_voucher_ref = 0;
        new_bank_task->bt_refs = 1;
        new_bank_task->bt_made = 0;
-       new_bank_task->bt_creditcard = NULL;
+       new_bank_task->bt_ledger = LEDGER_NULL;
        new_bank_task->bt_hasentitlement = bank_task_is_propagate_entitled(task);
        queue_init(&new_bank_task->bt_accounts_to_pay);
        queue_init(&new_bank_task->bt_accounts_to_charge);
@@ -813,7 +816,7 @@ bank_account_alloc_init(
        boolean_t entry_found = FALSE;
        ledger_t new_ledger = ledger_instantiate(bank_ledger_template, LEDGER_CREATE_INACTIVE_ENTRIES);
 
-       if (new_ledger == NULL)
+       if (new_ledger == LEDGER_NULL)
                return BANK_ACCOUNT_NULL;
 
        ledger_entry_setactive(new_ledger, bank_ledgers.cpu_time);
@@ -919,7 +922,7 @@ get_bank_task_context
                return BANK_TASK_NULL;
        }
        /* We won the race. Take a ref on the ledger and initialize bank task. */
-       bank_task->bt_creditcard = task->ledger;
+       bank_task->bt_ledger = task->ledger;
 #if DEVELOPMENT || DEBUG
        bank_task->bt_task = task;
 #endif
@@ -954,7 +957,7 @@ bank_task_dealloc(
        assert(queue_empty(&bank_task->bt_accounts_to_pay));
        assert(queue_empty(&bank_task->bt_accounts_to_charge));
 
-       ledger_dereference(bank_task->bt_creditcard);
+       assert(!LEDGER_VALID(bank_task->bt_ledger));
        lck_mtx_destroy(&bank_task->bt_acc_to_pay_lock, &bank_lock_grp);
        lck_mtx_destroy(&bank_task->bt_acc_to_charge_lock, &bank_lock_grp);
 
@@ -983,12 +986,22 @@ bank_account_dealloc_with_sync(
        bank_task_t bank_merchant = bank_account->ba_merchant;
        bank_task_t bank_secureoriginator = bank_account->ba_secureoriginator;
        bank_task_t bank_proximateprocess = bank_account->ba_proximateprocess;
+       ledger_t bank_merchant_ledger = LEDGER_NULL;
+
+       /*
+        * Grab a reference on the bank_merchant_ledger, since we would not be able
+        * to take bt_acc_to_pay_lock for bank_merchant later.
+        */
+       bank_merchant_ledger = bank_get_bank_task_ledger_with_ref(bank_merchant);
 
        /* Grab the acc to pay list lock and check the sync value */
        lck_mtx_lock(&bank_holder->bt_acc_to_pay_lock);
 
        if (bank_account->ba_made != sync) {
                lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock);
+               if (bank_merchant_ledger) {
+                       ledger_dereference(bank_merchant_ledger);
+               }
                return KERN_FAILURE;
        }
                
@@ -1001,8 +1014,10 @@ bank_account_dealloc_with_sync(
        /* Grab both the acc to pay and acc to charge locks */
        lck_mtx_lock(&bank_merchant->bt_acc_to_charge_lock);
 
-       bank_rollup_chit_to_tasks(bank_account->ba_bill, bank_holder, bank_merchant);
-       
+       /* No need to take ledger reference for bank_holder ledger since bt_acc_to_pay_lock is locked */
+       bank_rollup_chit_to_tasks(bank_account->ba_bill, bank_holder->bt_ledger, bank_merchant_ledger,
+               bank_holder->bt_pid, bank_merchant->bt_pid);
+
        /* Remove the account entry from Accounts need to pay account link list. */
        queue_remove(&bank_holder->bt_accounts_to_pay, bank_account, bank_account_t, ba_next_acc_to_pay);
        
@@ -1012,6 +1027,9 @@ bank_account_dealloc_with_sync(
        lck_mtx_unlock(&bank_merchant->bt_acc_to_charge_lock);
        lck_mtx_unlock(&bank_holder->bt_acc_to_pay_lock);
 
+       if (bank_merchant_ledger) {
+               ledger_dereference(bank_merchant_ledger);
+       }
        ledger_dereference(bank_account->ba_bill);
 
        /* Drop the reference of bank holder and merchant */
@@ -1038,38 +1056,50 @@ bank_account_dealloc_with_sync(
 static void
 bank_rollup_chit_to_tasks(
        ledger_t bill,
-       bank_task_t bank_holder,
-       bank_task_t bank_merchant)
+       ledger_t bank_holder_ledger,
+       ledger_t bank_merchant_ledger,
+       int bank_holder_pid,
+       int bank_merchant_pid)
 {
        ledger_amount_t credit;
        ledger_amount_t debit;
        kern_return_t ret;
 
-       if (bank_holder == bank_merchant)
+       if (bank_holder_ledger == bank_merchant_ledger)
                return;
 
        ret = ledger_get_entries(bill, bank_ledgers.cpu_time, &credit, &debit);
        if (ret == KERN_SUCCESS) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                        (BANK_CODE(BANK_ACCOUNT_INFO, (BANK_SETTLE_CPU_TIME))) | DBG_FUNC_NONE,
-                       bank_merchant->bt_pid, bank_holder->bt_pid, credit, debit, 0);
-               ledger_credit(bank_holder->bt_creditcard, task_ledgers.cpu_time_billed_to_me, credit);
-               ledger_debit(bank_holder->bt_creditcard, task_ledgers.cpu_time_billed_to_me, debit);
+                       bank_merchant_pid, bank_holder_pid, credit, debit, 0);
 
-               ledger_credit(bank_merchant->bt_creditcard, task_ledgers.cpu_time_billed_to_others, credit);
-               ledger_debit(bank_merchant->bt_creditcard, task_ledgers.cpu_time_billed_to_others, debit);
+               if (bank_holder_ledger) {
+                       ledger_credit(bank_holder_ledger, task_ledgers.cpu_time_billed_to_me, credit);
+                       ledger_debit(bank_holder_ledger, task_ledgers.cpu_time_billed_to_me, debit);
+               }
+
+               if (bank_merchant_ledger) {
+                       ledger_credit(bank_merchant_ledger, task_ledgers.cpu_time_billed_to_others, credit);
+                       ledger_debit(bank_merchant_ledger, task_ledgers.cpu_time_billed_to_others, debit);
+               }
        }
 
        ret = ledger_get_entries(bill, bank_ledgers.energy, &credit, &debit);
        if (ret == KERN_SUCCESS) {
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                        (BANK_CODE(BANK_ACCOUNT_INFO, (BANK_SETTLE_ENERGY))) | DBG_FUNC_NONE,
-                       bank_merchant->bt_pid, bank_holder->bt_pid, credit, debit, 0);
-               ledger_credit(bank_holder->bt_creditcard, task_ledgers.energy_billed_to_me, credit);
-               ledger_debit(bank_holder->bt_creditcard, task_ledgers.energy_billed_to_me, debit);
+                       bank_merchant_pid, bank_holder_pid, credit, debit, 0);
 
-               ledger_credit(bank_merchant->bt_creditcard, task_ledgers.energy_billed_to_others, credit);
-               ledger_debit(bank_merchant->bt_creditcard, task_ledgers.energy_billed_to_others, debit);
+               if (bank_holder_ledger) {
+                       ledger_credit(bank_holder_ledger, task_ledgers.energy_billed_to_me, credit);
+                       ledger_debit(bank_holder_ledger, task_ledgers.energy_billed_to_me, debit);
+               }
+
+               if (bank_merchant_ledger) {
+                       ledger_credit(bank_merchant_ledger, task_ledgers.energy_billed_to_others, credit);
+                       ledger_debit(bank_merchant_ledger, task_ledgers.energy_billed_to_others, debit);
+               }
        }
 }
 
@@ -1091,6 +1121,7 @@ bank_task_destroy(task_t task)
        task->bank_context = NULL;
        global_bank_task_unlock();
 
+       bank_destroy_bank_task_ledger(bank_task);
        bank_task_dealloc(bank_task, 1);
 }
 
@@ -1200,19 +1231,22 @@ bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy)
        
        lck_mtx_lock(&bank_task->bt_acc_to_pay_lock);
 
-       kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.cpu_time_billed_to_me, &temp);
-       if (kr == KERN_SUCCESS && temp >= 0) {
-               cpu_balance += temp;
-       }
+       /* bt_acc_to_pay_lock locked, no need to take ledger reference for bt_ledger */
+       if (bank_task->bt_ledger != LEDGER_NULL) {
+               kr = ledger_get_balance(bank_task->bt_ledger, task_ledgers.cpu_time_billed_to_me, &temp);
+               if (kr == KERN_SUCCESS && temp >= 0) {
+                       cpu_balance += temp;
+               }
 #if DEVELOPMENT || DEBUG
-       else {
-               printf("bank_bill_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp);
-       }
+               else {
+                       printf("bank_bill_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp);
+               }
 #endif /* DEVELOPMENT || DEBUG */
 
-       kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.energy_billed_to_me, &temp);
-       if (kr == KERN_SUCCESS && temp >= 0) {
-               energy_balance += temp;
+               kr = ledger_get_balance(bank_task->bt_ledger, task_ledgers.energy_billed_to_me, &temp);
+               if (kr == KERN_SUCCESS && temp >= 0) {
+                       energy_balance += temp;
+               }
        }
 
        queue_iterate(&bank_task->bt_accounts_to_pay, bank_account, bank_account_t, ba_next_acc_to_pay) {
@@ -1297,27 +1331,33 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ
        bank_account_t bank_account;
        int64_t temp = 0;
        kern_return_t kr;
+       ledger_t ledger = LEDGER_NULL;
        if (bank_task == BANK_TASK_NULL) {
                *cpu_time = 0;
                *energy = 0;
                return;
        }
 
+       /* Grab a ledger reference on bt_ledger for bank_task */
+       ledger = bank_get_bank_task_ledger_with_ref(bank_task);
+
        lck_mtx_lock(&bank_task->bt_acc_to_charge_lock);
 
-       kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.cpu_time_billed_to_others, &temp);
-       if (kr == KERN_SUCCESS && temp >= 0) {
-               cpu_balance += temp;
-       }
+       if (ledger) {
+               kr = ledger_get_balance(ledger, task_ledgers.cpu_time_billed_to_others, &temp);
+               if (kr == KERN_SUCCESS && temp >= 0) {
+                       cpu_balance += temp;
+               }
 #if DEVELOPMENT || DEBUG
-       else {
-               printf("bank_serviced_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp);
-       }
+               else {
+                       printf("bank_serviced_time: ledger_get_balance failed or negative balance in ledger: %lld\n", temp);
+               }
 #endif /* DEVELOPMENT || DEBUG */
 
-       kr = ledger_get_balance(bank_task->bt_creditcard, task_ledgers.energy_billed_to_others, &temp);
-       if (kr == KERN_SUCCESS && temp >= 0) {
-               energy_balance += temp;
+               kr = ledger_get_balance(ledger, task_ledgers.energy_billed_to_others, &temp);
+               if (kr == KERN_SUCCESS && temp >= 0) {
+                       energy_balance += temp;
+               }
        }
 
        queue_iterate(&bank_task->bt_accounts_to_charge, bank_account, bank_account_t, ba_next_acc_to_charge) {
@@ -1338,6 +1378,9 @@ bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energ
                }
        }
        lck_mtx_unlock(&bank_task->bt_acc_to_charge_lock);
+       if (ledger) {
+               ledger_dereference(ledger);
+       }
        *cpu_time = (uint64_t)cpu_balance;
        *energy = (uint64_t)energy_balance;
        return;
@@ -1401,6 +1444,44 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher)
        return BANK_ACCOUNT_NULL;
 }
 
+/*
+ * Routine: bank_get_bank_task_ledger_with_ref
+ * Purpose: Get the bank ledger from the bank task and return a reference to it.
+ */
+static ledger_t
+bank_get_bank_task_ledger_with_ref(bank_task_t bank_task)
+{
+       ledger_t ledger = LEDGER_NULL;
+
+       lck_mtx_lock(&bank_task->bt_acc_to_pay_lock);
+       ledger = bank_task->bt_ledger;
+       if (ledger) {
+               ledger_reference(ledger);
+       }
+       lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock);
+
+       return ledger;
+}
+
+/*
+ * Routine: bank_destroy_bank_task_ledger
+ * Purpose: Drop the bank task reference on the task ledger.
+ */
+static void
+bank_destroy_bank_task_ledger(bank_task_t bank_task)
+{
+       ledger_t ledger;
+
+       /* Remove the ledger reference from the bank task */
+       lck_mtx_lock(&bank_task->bt_acc_to_pay_lock);
+       assert(LEDGER_VALID(bank_task->bt_ledger));
+       ledger = bank_task->bt_ledger;
+       bank_task->bt_ledger = LEDGER_NULL;
+       lck_mtx_unlock(&bank_task->bt_acc_to_pay_lock);
+
+       ledger_dereference(ledger);
+}
+
 /*
  * Routine: bank_get_bank_account_ledger
  * Purpose: Get the bankledger from the bank account if ba_merchant different than ba_holder
@@ -1408,7 +1489,7 @@ bank_get_voucher_bank_account(ipc_voucher_t voucher)
 static ledger_t
 bank_get_bank_account_ledger(bank_account_t bank_account)
 {
-       ledger_t bankledger = NULL;
+       ledger_t bankledger = LEDGER_NULL;
 
        if (bank_account != BANK_ACCOUNT_NULL &&
                bank_account->ba_holder != bank_account->ba_merchant)
@@ -1437,7 +1518,7 @@ bank_get_bank_task_thread_group(bank_task_t bank_task __unused)
 static struct thread_group *
 bank_get_bank_account_thread_group(bank_account_t bank_account __unused)
 {
-       thread_group_t banktg = NULL;
+       struct thread_group *banktg = NULL;
 
 
        return (banktg);
@@ -1453,7 +1534,7 @@ kern_return_t
 bank_get_bank_ledger_and_thread_group(
        ipc_voucher_t     voucher,
        ledger_t          *bankledger,
-       thread_group_t    *banktg)
+       struct thread_group **banktg)
 {
        bank_account_t bank_account;
        struct thread_group *thread_group = NULL;
@@ -1488,7 +1569,7 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu
        int64_t effective_energy_consumed = 0;
        uint64_t thread_energy;
        
-       if (old_ledger == NULL && new_ledger == NULL)
+       if (old_ledger == LEDGER_NULL && new_ledger == LEDGER_NULL)
                return;
 
        assert((thread == current_thread() || thread->started == 0));
@@ -1534,7 +1615,7 @@ bank_swap_thread_bank_ledger(thread_t thread __unused, ledger_t new_ledger __unu
        thread_unlock(thread);
        splx(s);
        
-       if (old_ledger != NULL) {
+       if (old_ledger != LEDGER_NULL) {
                ledger_credit(old_ledger,
                        bank_ledgers.cpu_time,
                        effective_ledger_time_consumed);
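The bank.c hunks above restructure settlement around explicit ledger references: bank_account_dealloc_with_sync() takes a counted reference on the merchant's ledger via bank_get_bank_task_ledger_with_ref() before acquiring the holder's pay lock, bank_rollup_chit_to_tasks() now receives bare ledgers plus pids and tolerates NULL, and bank_destroy_bank_task_ledger() clears bt_ledger under the lock and drops the reference outside it. A minimal sketch of that take-a-reference-under-the-lock pattern, written with pthreads and C11 atomics rather than the kernel's lck_mtx/ledger APIs (all names below are illustrative):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

/* Toy stand-ins for ledger_t and bank_task_t. */
struct ledger_ex { atomic_int refcnt; };

struct bank_task_ex {
	pthread_mutex_t  lock;       /* plays the role of bt_acc_to_pay_lock      */
	struct ledger_ex *ledger;    /* plays the role of bt_ledger; may be NULL  */
};

/* Take a counted reference while the pointer is stable under the lock. */
static struct ledger_ex *
ledger_get_with_ref(struct bank_task_ex *bt)
{
	struct ledger_ex *l;

	pthread_mutex_lock(&bt->lock);
	l = bt->ledger;
	if (l != NULL)
		atomic_fetch_add(&l->refcnt, 1);
	pthread_mutex_unlock(&bt->lock);
	return l;                    /* safe to use after the lock is dropped */
}

/* Drop a reference taken above; free on the last one. */
static void
ledger_drop_ref(struct ledger_ex *l)
{
	if (l != NULL && atomic_fetch_sub(&l->refcnt, 1) == 1)
		free(l);
}

/* Detach the task from its ledger: clear under the lock, release outside it. */
static void
task_destroy_ledger(struct bank_task_ex *bt)
{
	struct ledger_ex *l;

	pthread_mutex_lock(&bt->lock);
	l = bt->ledger;
	bt->ledger = NULL;
	pthread_mutex_unlock(&bt->lock);
	ledger_drop_ref(l);
}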
index eb8f5599c91e7c5d4ed36a2363f13c919d765124..e3b3480e243d12c6e4b83000a310728eb723c2b0 100644 (file)
@@ -66,7 +66,7 @@ typedef struct bank_element * bank_element_t;
 struct bank_task {
        struct bank_element       bt_elem;                 /* Bank element */
        struct proc_persona_info  bt_proc_persona;         /* Persona of the process */
-       ledger_t                  bt_creditcard;           /* Ledger of the customer task */
+       ledger_t                  bt_ledger;               /* Ledger of the customer task */
        queue_head_t              bt_accounts_to_pay;      /* List of accounts worked for me and need to pay */
        queue_head_t              bt_accounts_to_charge;   /* List of accounts I did work and need to charge */
        decl_lck_mtx_data(,       bt_acc_to_pay_lock)      /* Lock to protect accounts to pay list */
@@ -176,7 +176,7 @@ extern void bank_billed_balance(bank_task_t bank_task, uint64_t *cpu_time, uint6
 extern void bank_serviced_balance_safe(task_t task, uint64_t *cpu_time, uint64_t *energy);
 extern void bank_serviced_balance(bank_task_t bank_task, uint64_t *cpu_time, uint64_t *energy);
 extern kern_return_t bank_get_bank_ledger_and_thread_group(ipc_voucher_t voucher,
-       ledger_t *bankledger, thread_group_t *banktg);
+       ledger_t *bankledger, struct thread_group **banktg);
 extern void bank_swap_thread_bank_ledger(thread_t thread, ledger_t ledger);
 
 #endif /* MACH_KERNEL_PRIVATE */
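The prototype change from thread_group_t *banktg to struct thread_group **banktg keeps the out-parameter an opaque double pointer; one plausible motivation is that the elaborated struct tag only needs a forward declaration in this header, with no dependence on where (or whether) a thread_group_t typedef is defined. A small stand-alone sketch of that shape, with illustrative names rather than the xnu definitions:

#include <stddef.h>

struct thread_group_ex;                       /* opaque: no definition needed */

/* Out-parameter takes a pointer to the opaque pointer. */
static int
ex_get_group(struct thread_group_ex **out_tg)
{
	if (out_tg == NULL)
		return -1;
	*out_tg = NULL;                       /* nothing to hand back in this sketch */
	return 0;
}

int
main(void)
{
	struct thread_group_ex *tg;

	return ex_get_group(&tg);
}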
index 78235b8be2547f2fe353f91acbbe8ab8dc105355..2c3b7ec5c908afdae6a556f5b9d86017d1e7bd72 100644 (file)
@@ -8,6 +8,7 @@ CWARNFLAGS = $(CWARNFLAGS_STD) -Wshorten-64-to-32
 HIB_FILES=
 
 lz4.o_CFLAGS_ADD += -fbuiltin -O3
+
 ######################################################################
 #END   Machine dependent Makefile fragment for arm64
 ######################################################################
index d533b91a5edc755fedf2ce4da319c2e6b71a2af9..d296cb6a72917e91169b825a0dbd4e59dbca5ef6 100644 (file)
@@ -40,14 +40,12 @@ OBJS_NO_CAST_ALIGN =                        \
                cpu_threads.o           \
                cpuid.o                 \
                locks_i386.o            \
+               locks_i386_opt.o        \
                machine_task.o          \
                mp_desc.o               \
                pcb.o                   \
                pcb_native.o            \
                kdp_x86_common.o        \
-               memory_object.o         \
-               vm_apple_protect.o      \
-               vm_map.o                \
                startup64.o             \
                affinity.o              \
                sched_grrr.o            \
@@ -60,9 +58,6 @@ OBJS_NO_CAST_ALIGN =                  \
                status.o                \
                machine_routines.o      \
                loose_ends.o            \
-               fips_sha1.o             \
-               prng_yarrow.o           \
-               sha1mod.o               \
                sleh.o                  \
                ccdigest_final_64be.o   \
                ccdigest_init.o         \
@@ -71,17 +66,8 @@ OBJS_NO_CAST_ALIGN =                 \
                cchmac_init.o           \
                ccsha1.o                \
                ipc_object.o            \
-               ccmode_ctr_crypt.o      \
-               ccmode_factory_ctr_crypt.o      \
-               ccmode_ctr_init.o       \
-               ccmode_ctr_setctr.o     \
                ipc_kmsg.o              \
-               ipc_right.o             \
-               bsd_vm.o                \
-               vm_map_store.o          \
-               vm_map_store_ll.o       \
-               vm_map_store_rb.o       \
-               vm_debug.o
+               ipc_right.o
 
 # Objects that don't want -Wsign-compare warning (15294427)
 OBJS_NO_SIGN_COMPARE =         \
index efbb892f667470da44e6edabb0d70b9c34cee20a..57759351c04767edcd3b13f2d826c13cc02f278d 100644 (file)
@@ -16,17 +16,21 @@ UNCONFIGURED_HIB_FILES=                                     \
 HIB_FILES=$(filter $(UNCONFIGURED_HIB_FILES),$(OBJS))
 
 # Unconfigured __HIB files must be Mach-O for "setsegname"
-WKdmDecompress_new.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
-WKdmData_new.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
-hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
-hibernate_bootstrap.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
-bcopy.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
-bzero.o_CFLAGS_ADD += -fno-stack-protector $(CFLAGS_NOLTO_FLAG)
+WKdmDecompress_new.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+WKdmData_new.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+hibernate_restore.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+hibernate_bootstrap.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+bcopy.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+bzero.o_CFLAGS_ADD += -fno-stack-protector -fno-stack-check $(CFLAGS_NOLTO_FLAG)
+fp_simd.o_SFLAGS_ADD += -mavx512f
 
 # To appear at the beginning of the __HIB segment, emit
 # as Mach-O so that the linker can enforce symbol order
 boot_pt.o_CFLAGS_ADD += $(CFLAGS_NOLTO_FLAG)
 
+# fast path lock C leaf functions must be built without stack frames
+locks_i386_opt.o_CFLAGS_ADD += -momit-leaf-frame-pointer -O2
+
 ######################################################################
 #END   Machine dependent Makefile fragment for x86_64
 ######################################################################
index 53078380e40844eb86192a5ab77e83f63df7614a..342d24d836506ee2cdabbf3648a6ab0b504a346f 100644 (file)
@@ -61,6 +61,8 @@ OPTIONS/config_dtrace         optional config_dtrace
 
 OPTIONS/no_kextd               optional no_kextd
 
+OPTIONS/config_quiesce_counter  optional config_quiesce_counter
+
 #
 # gssd files
 #
@@ -119,6 +121,7 @@ osfmk/kern/clock.c                  standard
 osfmk/kern/clock_oldops.c              standard
 osfmk/kern/coalition.c                 optional config_coalitions
 osfmk/kern/counters.c                  standard
+osfmk/kern/cpu_quiesce.c               optional config_quiesce_counter
 osfmk/kern/debug.c                     standard
 osfmk/kern/energy_perf.c               standard
 osfmk/kern/exception.c         standard
@@ -133,7 +136,7 @@ osfmk/kern/ipc_misc.c                       standard
 osfmk/kern/ipc_sync.c                  standard
 osfmk/kern/ipc_tt.c                    standard
 osfmk/kern/kalloc.c                    standard
-osfmk/kern/kern_ecc.c                  optional config_ecc_logging
+osfmk/kern/ecc_logging.c                       optional config_ecc_logging
 osfmk/kern/ktrace_background_notify.c  standard
 osfmk/kern/ledger.c                    standard
 osfmk/kern/locks.c                     standard
@@ -145,6 +148,7 @@ osfmk/kern/mk_timer.c               standard
 osfmk/kern/page_decrypt.c      standard
 osfmk/kern/printf.c                    standard
 osfmk/kern/priority.c                  standard
+osfmk/kern/priority_queue.c            standard
 osfmk/kern/processor.c         standard
 osfmk/kern/processor_data.c            standard
 osfmk/kern/sched_average.c             standard
@@ -166,6 +170,8 @@ osfmk/kern/sysdiagnose.c    optional config_sysdiagnose
 osfmk/kern/task.c                      standard
 osfmk/kern/task_policy.c       standard
 osfmk/kern/task_swap.c         standard
+osfmk/kern/test_lock.c         optional development
+osfmk/kern/test_lock.c          optional debug
 osfmk/kern/thread.c                    standard
 osfmk/kern/thread_act.c                standard
 osfmk/kern/thread_call.c       standard
@@ -173,10 +179,13 @@ osfmk/kern/thread_group.c standard
 osfmk/kern/thread_policy.c     standard
 osfmk/kern/timer.c                     standard
 osfmk/kern/timer_call.c                standard
+osfmk/kern/turnstile.c         standard
+osfmk/kern/ux_handler.c                standard
 osfmk/kern/waitq.c                     standard
 osfmk/kern/work_interval.c             standard
 osfmk/kern/xpr.c                       optional xpr_debug
 osfmk/kern/zalloc.c                    standard
+osfmk/kern/zcache.c            optional config_zcache
 osfmk/kern/gzalloc.c           optional config_gzalloc
 osfmk/kern/bsd_kern.c          optional mach_bsd
 osfmk/kern/hibernate.c         optional hibernation
@@ -200,6 +209,7 @@ osfmk/kern/copyout_shim.c   optional copyout_shim
 ./mach/mach_vm_server.c                        standard
 ./mach/mach_voucher_server.c           standard
 ./mach/mach_voucher_attr_control_server.c              standard
+./mach/memory_entry_server.c           standard
 ./mach/memory_object_control_server.c  standard
 ./mach/resource_notify_user.c          standard
 ./mach/upl_server.c                    standard
@@ -207,6 +217,14 @@ osfmk/kern/copyout_shim.c  optional copyout_shim
 ./mach/task_access_user.c              standard
 osfmk/corpses/corpse.c                 standard
 osfmk/kern/kern_cdata.c                        standard
+osfmk/tests/kernel_tests.c             optional config_xnupost
+osfmk/tests/ktest.c                    optional config_xnupost
+osfmk/tests/ktest_accessor.c           optional config_xnupost
+osfmk/tests/ktest_emit.c               optional config_xnupost
+osfmk/tests/ktest_global.c             optional config_xnupost
+osfmk/tests/pmap_tests.c               optional config_xnupost
+osfmk/tests/bitmap_test.c              optional config_xnupost
+osfmk/tests/test_thread_call.c          optional config_xnupost
 ./mach/telemetry_notification_user.c optional config_telemetry
 osfmk/bank/bank.c                      standard
 osfmk/atm/atm.c                        optional config_atm
@@ -248,6 +266,7 @@ osfmk/vm/vm_pageout.c                       standard
 osfmk/vm/vm_purgeable.c                        standard
 osfmk/vm/vm_resident.c                 standard
 osfmk/vm/vm_shared_region.c            standard
+osfmk/vm/vm_shared_region_pager.c      standard
 osfmk/vm/vm_swapfile_pager.c           standard
 osfmk/vm/vm_user.c                     standard
 osfmk/vm/vm32_user.c                   standard
@@ -281,6 +300,7 @@ osfmk/kperf/meminfo.c                   optional kperf
 osfmk/kperf/kperf_timer.c               optional kperf
 osfmk/kperf/kperf_kpc.c                 optional kperf
 osfmk/kperf/kdebug_trigger.c            optional kperf
+osfmk/kperf/lazy.c                      optional kperf
 osfmk/kern/kpc_thread.c                 optional kpc
 osfmk/kern/kpc_common.c                 optional kpc
 
@@ -315,17 +335,4 @@ osfmk/corecrypto/ccsha2/src/ccsha256_K.c   standard
 osfmk/corecrypto/ccsha2/src/ccsha256_ltc_compress.c    standard
 osfmk/corecrypto/ccsha2/src/ccsha256_ltc_di.c  standard
 
-osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c        standard
-osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c standard
-osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c  standard
-osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c        standard
-osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c standard
-
-osfmk/prng/random.c                    standard
-osfmk/prng/prng_yarrow.c               standard
-osfmk/prng/fips_sha1.c                 standard
-osfmk/prng/YarrowCoreLib/port/smf.c    standard
-osfmk/prng/YarrowCoreLib/src/comp.c    standard
-osfmk/prng/YarrowCoreLib/src/prng.c    standard
-osfmk/prng/YarrowCoreLib/src/sha1mod.c standard
-osfmk/prng/YarrowCoreLib/src/yarrowUtils.c     standard
+osfmk/prng/prng_random.c               standard
index 208781e4709af256362fa48e1443483bcbd5482b..11ca5662789aedc5141433ea33d63bfcc60f6518 100644 (file)
@@ -50,7 +50,6 @@ osfmk/arm/strlcpy.c   standard
 
 osfmk/arm/model_dep.c          standard
 osfmk/arm/pcb.c                standard
-osfmk/arm/conf.c               standard
 osfmk/arm/rtclock.c            standard
 osfmk/arm/status.c             standard
 osfmk/arm/status_shared.c      standard
@@ -70,7 +69,6 @@ osfmk/OPTIONS/hi_res_clock    optional hi_res_clock
 
 # Kernel performance monitoring
 osfmk/kperf/arm/kperf_mp.c      optional kperf
-osfmk/kperf/arm/kperf_meminfo.c optional kperf
 osfmk/arm/kpc_arm.c            optional kpc
 
 osfmk/arm/monotonic_arm.c optional monotonic
index 61e470322701eae6926688c5a8e0cbf0b894a37b..68611422f43854978c5cdda24f7040d4f9f4347f 100644 (file)
@@ -55,7 +55,6 @@ osfmk/arm/strlcpy.c     standard
 
 osfmk/arm/model_dep.c          standard
 osfmk/arm64/pcb.c              standard
-osfmk/arm/conf.c               standard
 osfmk/arm/rtclock.c            standard
 osfmk/arm64/status.c           standard
 osfmk/arm/status_shared.c      standard
@@ -74,7 +73,6 @@ osfmk/OPTIONS/hi_res_clock    optional hi_res_clock
 
 # Kernel performance monitoring
 osfmk/kperf/arm/kperf_mp.c      optional kperf
-osfmk/kperf/arm/kperf_meminfo.c optional kperf
 osfmk/arm64/kpc.c              optional kpc
 
 osfmk/arm64/monotonic_arm64.c optional monotonic
index bf7e53cc68518d458dd06a4958b8296e2535cd66..a696fc0acaf45e893d8eb9ae73414c2c91a891fb 100644 (file)
@@ -47,7 +47,8 @@ osfmk/i386/ktss.c             standard
 osfmk/i386/ldt.c               standard
 osfmk/x86_64/loose_ends.c      standard
 osfmk/x86_64/copyio.c          standard
-osfmk/i386/locks_i386.c        standard
+osfmk/i386/locks_i386.c                standard
+osfmk/i386/locks_i386_opt.c    standard
 osfmk/x86_64/locore.s  standard
 osfmk/x86_64/lowmem_vectors.c  standard
 osfmk/x86_64/cswitch.s standard
@@ -73,7 +74,6 @@ osfmk/i386/commpage/commpage.c        standard
 osfmk/i386/commpage/commpage_asm.s     standard
 osfmk/i386/commpage/fifo_queues.s      standard
 
-osfmk/i386/AT386/conf.c                standard
 osfmk/i386/AT386/model_dep.c   standard
 
 osfmk/i386/lapic.c             standard
@@ -114,7 +114,6 @@ osfmk/kern/hv_support.c                             optional hypervisor
 
 # Kernel performance monitoring
 osfmk/kperf/x86_64/kperf_mp.c   optional kperf
-osfmk/kperf/x86_64/kperf_meminfo.c  optional kperf
 osfmk/x86_64/kpc_x86.c              optional kpc
 
 osfmk/x86_64/monotonic_x86_64.c optional monotonic
index 8161ef2805937fa55cd29a3cdd8b101397622991..2a74280b8506333c34c0b48594b19feaef9dcbbd 100644 (file)
 #include <arm/cpu_data_internal.h>
 #endif
 
+#ifdef CONFIG_XNUPOST
+#include <tests/xnupost.h>
+kern_return_t console_serial_test(void);
+kern_return_t console_serial_alloc_rel_tests(void);
+kern_return_t console_serial_parallel_log_tests(void);
+#define MAX_CPU_SLOTS (MAX_CPUS + 2)
+#endif
 
 #ifndef MAX_CPU_SLOTS
 #define MAX_CPU_SLOTS (MAX_CPUS)
@@ -127,7 +134,7 @@ SECURITY_READ_ONLY_EARLY(uint32_t) nconsops = (sizeof cons_ops / sizeof cons_ops
 
 uint32_t cons_ops_index = VC_CONS_OPS;
 
-#ifdef __arm__
+#if defined(__x86_64__) || defined(__arm__)
 // NMI static variables
 #define NMI_STRING_SIZE 32
 char nmi_string[NMI_STRING_SIZE] = "afDIGHr84A84jh19Kphgp428DNPdnapq";
@@ -598,7 +605,7 @@ _serial_getc(__unused int a, __unused int b, boolean_t wait, __unused boolean_t
                c = serial_getc();
        } while (wait && c < 0);
 
-#ifdef __arm__
+#if defined(__x86_64__) || defined(__arm__)
        // Check for the NMI string
        if (c == nmi_string[nmi_counter]) {
                nmi_counter++;
@@ -645,3 +652,174 @@ vcgetc(__unused int l, __unused int u, __unused boolean_t wait, __unused boolean
                return 0;
 }
 
+#ifdef CONFIG_XNUPOST
+static uint32_t cons_test_ops_count = 0;
+
+/*
+ * Try to do multiple cpu buffer allocs and free and intentionally
+ * allow for pre-emption.
+ */
+static void
+alloc_free_func(void * arg, wait_result_t wres __unused)
+{
+       console_buf_t * cbp = NULL;
+       int count           = (int)arg;
+
+       T_LOG("Doing %d iterations of console cpu alloc and free.", count);
+
+       while (count-- > 0) {
+               (void)hw_atomic_add(&cons_test_ops_count, 1);
+               cbp = (console_buf_t *)console_cpu_alloc(0);
+               if (cbp == NULL) {
+                       T_ASSERT_NOTNULL(cbp, "cpu allocation failed");
+               }
+               console_cpu_free(cbp);
+               cbp = NULL;
+               /* give chance to another thread to come in */
+               delay(10);
+       }
+}
+
+/*
+ * Log to console by multiple methods - printf, unbuffered write, console_write()
+ */
+static void
+log_to_console_func(void * arg __unused, wait_result_t wres __unused)
+{
+       uint64_t thread_id = current_thread()->thread_id;
+       char somedata[10] = "123456789";
+       for (int i = 0; i < 26; i++) {
+               (void)hw_atomic_add(&cons_test_ops_count, 1);
+               printf(" thid: %llu printf iteration %d\n", thread_id, i);
+               cnputc_unbuffered((char)('A' + i));
+               cnputc_unbuffered('\n');
+               console_write((char *)somedata, sizeof(somedata));
+               delay(10);
+       }
+       printf("finished the log_to_console_func operations\n\n");
+}
+
+kern_return_t
+console_serial_parallel_log_tests(void)
+{
+       thread_t thread;
+       kern_return_t kr;
+       cons_test_ops_count = 0;
+
+       kr = kernel_thread_start(log_to_console_func, NULL, &thread);
+       T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully");
+
+       delay(100);
+
+       log_to_console_func(NULL, 0);
+
+       /* wait until other thread has also finished */
+       while (cons_test_ops_count < 52) {
+               delay(1000);
+       }
+
+       thread_deallocate(thread);
+       T_LOG("parallel_logging tests is now complete. From this point forward we expect full lines\n");
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+console_serial_alloc_rel_tests(void)
+{
+       unsigned long i, free_buf_count = 0;
+       uint32_t * p;
+       console_buf_t * cbp;
+       thread_t thread;
+       kern_return_t kr;
+
+       T_LOG("doing alloc/release tests");
+
+       for (i = 0; i < MAX_CPU_SLOTS; i++) {
+               p   = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t)));
+               cbp = (console_buf_t *)(void *)p;
+               /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */
+               T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], "");
+               if (*p == CPU_BUF_FREE_HEX) {
+                       free_buf_count++;
+               }
+       }
+
+       T_ASSERT_GE_ULONG(free_buf_count, 2, "At least 2 buffers should be free");
+       cons_test_ops_count = 0;
+
+       kr = kernel_thread_start(alloc_free_func, (void *)1000, &thread);
+       T_ASSERT_EQ_INT(kr, KERN_SUCCESS, "kernel_thread_start returned successfully");
+
+       /* yield cpu to give the other thread a chance to get on-core */
+       delay(100);
+
+       alloc_free_func((void *)1000, 0);
+
+       /* wait until other thread finishes its tasks */
+       while (cons_test_ops_count < 2000) {
+               delay(1000);
+       }
+
+       thread_deallocate(thread);
+       /* verify again that at least 2 slots are free */
+       free_buf_count = 0;
+       for (i = 0; i < MAX_CPU_SLOTS; i++) {
+               p   = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t)));
+               cbp = (console_buf_t *)(void *)p;
+               /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */
+               T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], "");
+               if (*p == CPU_BUF_FREE_HEX) {
+                       free_buf_count++;
+               }
+       }
+       T_ASSERT_GE_ULONG(free_buf_count, 2, "At least 2 buffers should be free after alloc free tests");
+
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+console_serial_test(void)
+{
+       unsigned long i;
+       char buffer[CPU_BUFFER_LEN];
+       uint32_t * p;
+       console_buf_t * cbp;
+
+       T_LOG("Checking console_ring status.");
+       T_ASSERT_EQ_INT(console_ring.len, KERN_CONSOLE_RING_SIZE, "Console ring size is not correct.");
+       T_ASSERT_GT_INT(KERN_CONSOLE_BUF_SIZE, KERN_CONSOLE_RING_SIZE, "kernel console buffer size is < allocation.");
+
+       /* select the next slot from the per cpu buffers at end of console_ring.buffer */
+       for (i = 0; i < MAX_CPU_SLOTS; i++) {
+               p   = (uint32_t *)((uintptr_t)console_ring.buffer + console_ring.len + (i * sizeof(console_buf_t)));
+               cbp = (console_buf_t *)(void *)p;
+               /* p should either be allocated cpu buffer or have CPU_BUF_FREE_HEX in it */
+               T_ASSERT(*p == CPU_BUF_FREE_HEX || cbp->buf_base == &cbp->buf[0], "verified initialization of cpu buffers p=%p", (void *)p);
+       }
+
+       /* setup buffer to be chars */
+       for (i = 0; i < CPU_BUFFER_LEN; i++) {
+               buffer[i] = (char)('0' + (i % 10));
+       }
+       buffer[CPU_BUFFER_LEN - 1] = '\0';
+
+       T_LOG("Printing %d char string to serial one char at a time.", CPU_BUFFER_LEN);
+       for (i = 0; i < CPU_BUFFER_LEN; i++) {
+               printf("%c", buffer[i]);
+       }
+       printf("End\n");
+       T_LOG("Printing %d char string to serial as a whole", CPU_BUFFER_LEN);
+       printf("%s\n", buffer);
+
+       T_LOG("Using console_write call repeatedly for 100 iterations");
+       for (i = 0; i < 100; i++) {
+               console_write(&buffer[0], 14);
+               if ((i % 6) == 0)
+                       printf("\n");
+       }
+       printf("\n");
+
+       T_LOG("Using T_LOG to print buffer %s", buffer);
+       return KERN_SUCCESS;
+}
+#endif
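The CONFIG_XNUPOST tests added above share a simple shape: the same worker routine runs both on a freshly started kernel thread and on the calling thread, every operation bumps a shared counter, and the test spins until the expected total (26 iterations x 2 threads = 52 for the logging test) is reached before deallocating the helper thread. A user-space sketch of that shape, using pthreads and C11 atomics instead of kernel_thread_start()/hw_atomic_add():

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static atomic_uint test_ops_count;      /* plays the role of cons_test_ops_count */

/* Same worker runs on both threads, mirroring log_to_console_func(). */
static void *
log_worker(void *arg)
{
	int id = (int)(intptr_t)arg;

	for (int i = 0; i < 26; i++) {
		atomic_fetch_add(&test_ops_count, 1);
		printf("worker %d iteration %d\n", id, i);
		usleep(10);
	}
	return NULL;
}

int
main(void)
{
	pthread_t t;

	if (pthread_create(&t, NULL, log_worker, (void *)(intptr_t)1) != 0)
		return 1;
	log_worker((void *)(intptr_t)0);            /* this thread does the same work */
	while (atomic_load(&test_ops_count) < 52)   /* wait for both workers to finish */
		usleep(1000);
	pthread_join(t, NULL);
	return 0;
}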
index 2a0437671ec21cb70eb5ca713cc1d54c46f59523..a9e11890a1af3129770f6eb344eace844b7f7a88 100644 (file)
 #include <kern/debug.h>
 void cc_try_abort(const char * msg CC_UNUSED , ...)
 {
-    panic(msg);
+    panic("%s", msg);
 }
 
-#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT
+#elif CC_USE_SEPROM || CC_USE_S3 || CC_BASEBAND || CC_EFI || CC_IBOOT || CC_RTKIT || CC_RTKITROM
 void cc_try_abort(const char * msg CC_UNUSED, ...)
 {
     //Do nothing and return because we don't have panic() in those
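The one-line change from panic(msg) to panic("%s", msg) closes a classic format-string hole: msg originates from the caller, so any '%' conversions it happens to contain would otherwise be interpreted by the formatter. A small stand-alone illustration, where report() is just a varargs printer standing in for the kernel's panic():

#include <stdarg.h>
#include <stdio.h>

/* Stand-in for a printf-style sink such as panic(); illustration only. */
static void
report(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
}

int
main(void)
{
	const char *msg = "corecrypto failure: buffer 100% consumed";

	/* report(msg);  -- unsafe: "%" in msg would be parsed as a conversion */
	report("%s\n", msg);   /* safe: msg is treated purely as data */
	return 0;
}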
diff --git a/osfmk/corecrypto/ccaes/src/aes_tab.c b/osfmk/corecrypto/ccaes/src/aes_tab.c
deleted file mode 100644 (file)
index 0fe7b19..0000000
+++ /dev/null
@@ -1,1061 +0,0 @@
-/*
- *  aes_tab.c
- *  corecrypto
- *
- *  Created on 12/12/2010
- *
- *  Copyright (c) 2010,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/* LibTomCrypt, modular cryptographic library -- Tom St Denis
- *
- * LibTomCrypt is a library that provides various cryptographic
- * algorithms in a highly modular and flexible manner.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
- */
-
-/* The precomputed tables for AES */
-/*
-Te0[x] = S [x].[02, 01, 01, 03];
-Te1[x] = S [x].[03, 02, 01, 01];
-Te2[x] = S [x].[01, 03, 02, 01];
-Te3[x] = S [x].[01, 01, 03, 02];
-Te4[x] = S [x].[01, 01, 01, 01];
-
-Td0[x] = Si[x].[0e, 09, 0d, 0b];
-Td1[x] = Si[x].[0b, 0e, 09, 0d];
-Td2[x] = Si[x].[0d, 0b, 0e, 09];
-Td3[x] = Si[x].[09, 0d, 0b, 0e];
-Td4[x] = Si[x].[01, 01, 01, 01];
-*/
-
-#include <stdint.h>
-
-/*!
-  @file aes_tab.c
-  AES tables
-*/
-static const uint32_t TE0[256] = {
-    0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
-    0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
-    0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
-    0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
-    0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
-    0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
-    0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
-    0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
-    0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
-    0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
-    0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
-    0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
-    0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
-    0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
-    0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
-    0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
-    0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
-    0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
-    0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
-    0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
-    0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
-    0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
-    0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
-    0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
-    0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
-    0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
-    0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
-    0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
-    0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
-    0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
-    0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
-    0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
-    0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
-    0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
-    0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
-    0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
-    0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
-    0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
-    0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
-    0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
-    0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
-    0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
-    0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
-    0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
-    0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
-    0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
-    0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
-    0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
-    0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
-    0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
-    0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
-    0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
-    0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
-    0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
-    0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
-    0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
-    0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
-    0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
-    0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
-    0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
-    0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
-    0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
-    0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
-    0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a,
-};
-
-#ifndef PELI_TAB
-static const uint32_t Te4[256] = {
-    0x63636363, 0x7c7c7c7c, 0x77777777, 0x7b7b7b7b,
-    0xf2f2f2f2, 0x6b6b6b6b, 0x6f6f6f6f, 0xc5c5c5c5,
-    0x30303030, 0x01010101, 0x67676767, 0x2b2b2b2b,
-    0xfefefefe, 0xd7d7d7d7, 0xabababab, 0x76767676,
-    0xcacacaca, 0x82828282, 0xc9c9c9c9, 0x7d7d7d7d,
-    0xfafafafa, 0x59595959, 0x47474747, 0xf0f0f0f0,
-    0xadadadad, 0xd4d4d4d4, 0xa2a2a2a2, 0xafafafaf,
-    0x9c9c9c9c, 0xa4a4a4a4, 0x72727272, 0xc0c0c0c0,
-    0xb7b7b7b7, 0xfdfdfdfd, 0x93939393, 0x26262626,
-    0x36363636, 0x3f3f3f3f, 0xf7f7f7f7, 0xcccccccc,
-    0x34343434, 0xa5a5a5a5, 0xe5e5e5e5, 0xf1f1f1f1,
-    0x71717171, 0xd8d8d8d8, 0x31313131, 0x15151515,
-    0x04040404, 0xc7c7c7c7, 0x23232323, 0xc3c3c3c3,
-    0x18181818, 0x96969696, 0x05050505, 0x9a9a9a9a,
-    0x07070707, 0x12121212, 0x80808080, 0xe2e2e2e2,
-    0xebebebeb, 0x27272727, 0xb2b2b2b2, 0x75757575,
-    0x09090909, 0x83838383, 0x2c2c2c2c, 0x1a1a1a1a,
-    0x1b1b1b1b, 0x6e6e6e6e, 0x5a5a5a5a, 0xa0a0a0a0,
-    0x52525252, 0x3b3b3b3b, 0xd6d6d6d6, 0xb3b3b3b3,
-    0x29292929, 0xe3e3e3e3, 0x2f2f2f2f, 0x84848484,
-    0x53535353, 0xd1d1d1d1, 0x00000000, 0xedededed,
-    0x20202020, 0xfcfcfcfc, 0xb1b1b1b1, 0x5b5b5b5b,
-    0x6a6a6a6a, 0xcbcbcbcb, 0xbebebebe, 0x39393939,
-    0x4a4a4a4a, 0x4c4c4c4c, 0x58585858, 0xcfcfcfcf,
-    0xd0d0d0d0, 0xefefefef, 0xaaaaaaaa, 0xfbfbfbfb,
-    0x43434343, 0x4d4d4d4d, 0x33333333, 0x85858585,
-    0x45454545, 0xf9f9f9f9, 0x02020202, 0x7f7f7f7f,
-    0x50505050, 0x3c3c3c3c, 0x9f9f9f9f, 0xa8a8a8a8,
-    0x51515151, 0xa3a3a3a3, 0x40404040, 0x8f8f8f8f,
-    0x92929292, 0x9d9d9d9d, 0x38383838, 0xf5f5f5f5,
-    0xbcbcbcbc, 0xb6b6b6b6, 0xdadadada, 0x21212121,
-    0x10101010, 0xffffffff, 0xf3f3f3f3, 0xd2d2d2d2,
-    0xcdcdcdcd, 0x0c0c0c0c, 0x13131313, 0xecececec,
-    0x5f5f5f5f, 0x97979797, 0x44444444, 0x17171717,
-    0xc4c4c4c4, 0xa7a7a7a7, 0x7e7e7e7e, 0x3d3d3d3d,
-    0x64646464, 0x5d5d5d5d, 0x19191919, 0x73737373,
-    0x60606060, 0x81818181, 0x4f4f4f4f, 0xdcdcdcdc,
-    0x22222222, 0x2a2a2a2a, 0x90909090, 0x88888888,
-    0x46464646, 0xeeeeeeee, 0xb8b8b8b8, 0x14141414,
-    0xdededede, 0x5e5e5e5e, 0x0b0b0b0b, 0xdbdbdbdb,
-    0xe0e0e0e0, 0x32323232, 0x3a3a3a3a, 0x0a0a0a0a,
-    0x49494949, 0x06060606, 0x24242424, 0x5c5c5c5c,
-    0xc2c2c2c2, 0xd3d3d3d3, 0xacacacac, 0x62626262,
-    0x91919191, 0x95959595, 0xe4e4e4e4, 0x79797979,
-    0xe7e7e7e7, 0xc8c8c8c8, 0x37373737, 0x6d6d6d6d,
-    0x8d8d8d8d, 0xd5d5d5d5, 0x4e4e4e4e, 0xa9a9a9a9,
-    0x6c6c6c6c, 0x56565656, 0xf4f4f4f4, 0xeaeaeaea,
-    0x65656565, 0x7a7a7a7a, 0xaeaeaeae, 0x08080808,
-    0xbabababa, 0x78787878, 0x25252525, 0x2e2e2e2e,
-    0x1c1c1c1c, 0xa6a6a6a6, 0xb4b4b4b4, 0xc6c6c6c6,
-    0xe8e8e8e8, 0xdddddddd, 0x74747474, 0x1f1f1f1f,
-    0x4b4b4b4b, 0xbdbdbdbd, 0x8b8b8b8b, 0x8a8a8a8a,
-    0x70707070, 0x3e3e3e3e, 0xb5b5b5b5, 0x66666666,
-    0x48484848, 0x03030303, 0xf6f6f6f6, 0x0e0e0e0e,
-    0x61616161, 0x35353535, 0x57575757, 0xb9b9b9b9,
-    0x86868686, 0xc1c1c1c1, 0x1d1d1d1d, 0x9e9e9e9e,
-    0xe1e1e1e1, 0xf8f8f8f8, 0x98989898, 0x11111111,
-    0x69696969, 0xd9d9d9d9, 0x8e8e8e8e, 0x94949494,
-    0x9b9b9b9b, 0x1e1e1e1e, 0x87878787, 0xe9e9e9e9,
-    0xcececece, 0x55555555, 0x28282828, 0xdfdfdfdf,
-    0x8c8c8c8c, 0xa1a1a1a1, 0x89898989, 0x0d0d0d0d,
-    0xbfbfbfbf, 0xe6e6e6e6, 0x42424242, 0x68686868,
-    0x41414141, 0x99999999, 0x2d2d2d2d, 0x0f0f0f0f,
-    0xb0b0b0b0, 0x54545454, 0xbbbbbbbb, 0x16161616,
-};
-#endif
-
-#ifndef ENCRYPT_ONLY
-
-static const uint32_t TD0[256] = {
-    0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
-    0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
-    0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
-    0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
-    0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
-    0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
-    0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
-    0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
-    0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
-    0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
-    0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
-    0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
-    0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
-    0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
-    0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
-    0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
-    0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
-    0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
-    0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
-    0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
-    0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
-    0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
-    0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
-    0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
-    0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
-    0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
-    0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
-    0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
-    0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
-    0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
-    0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
-    0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
-    0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
-    0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
-    0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
-    0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
-    0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
-    0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
-    0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
-    0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
-    0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
-    0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
-    0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
-    0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
-    0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
-    0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
-    0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
-    0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
-    0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
-    0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
-    0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
-    0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
-    0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
-    0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
-    0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
-    0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
-    0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
-    0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
-    0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
-    0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
-    0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
-    0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
-    0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
-    0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742,
-};
-
-static const uint32_t Td4[256] = {
-    0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5,
-    0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838,
-    0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e,
-    0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb,
-    0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282,
-    0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787,
-    0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444,
-    0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb,
-    0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232,
-    0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d,
-    0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b,
-    0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e,
-    0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666,
-    0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2,
-    0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949,
-    0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525,
-    0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464,
-    0x86868686, 0x68686868, 0x98989898, 0x16161616,
-    0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc,
-    0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292,
-    0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050,
-    0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada,
-    0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757,
-    0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484,
-    0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000,
-    0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a,
-    0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505,
-    0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606,
-    0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f,
-    0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202,
-    0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303,
-    0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b,
-    0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141,
-    0x4f4f4f4f, 0x67676767, 0xdcdcdcdc, 0xeaeaeaea,
-    0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece,
-    0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373,
-    0x96969696, 0xacacacac, 0x74747474, 0x22222222,
-    0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585,
-    0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8,
-    0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e,
-    0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171,
-    0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989,
-    0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e,
-    0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b,
-    0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b,
-    0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020,
-    0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe,
-    0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4,
-    0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333,
-    0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131,
-    0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959,
-    0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f,
-    0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9,
-    0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d,
-    0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f,
-    0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef,
-    0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d,
-    0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0,
-    0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c,
-    0x83838383, 0x53535353, 0x99999999, 0x61616161,
-    0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e,
-    0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626,
-    0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363,
-    0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d,
-};
-
-#endif /* ENCRYPT_ONLY */
-
-#ifdef LTC_SMALL_CODE
-
-#define Te0(x) TE0[x]
-#define Te1(x) RORc(TE0[x], 8)
-#define Te2(x) RORc(TE0[x], 16)
-#define Te3(x) RORc(TE0[x], 24)
-
-#define Td0(x) TD0[x]
-#define Td1(x) RORc(TD0[x], 8)
-#define Td2(x) RORc(TD0[x], 16)
-#define Td3(x) RORc(TD0[x], 24)
-
-#define Te4_0 0x000000FF & Te4
-#define Te4_1 0x0000FF00 & Te4
-#define Te4_2 0x00FF0000 & Te4
-#define Te4_3 0xFF000000 & Te4
-
-#else
-
-#define Te0(x) TE0[x]
-#define Te1(x) TE1[x]
-#define Te2(x) TE2[x]
-#define Te3(x) TE3[x]
-
-#define Td0(x) TD0[x]
-#define Td1(x) TD1[x]
-#define Td2(x) TD2[x]
-#define Td3(x) TD3[x]
-
-static const uint32_t TE1[256] = {
-    0xa5c66363, 0x84f87c7c, 0x99ee7777, 0x8df67b7b,
-    0x0dfff2f2, 0xbdd66b6b, 0xb1de6f6f, 0x5491c5c5,
-    0x50603030, 0x03020101, 0xa9ce6767, 0x7d562b2b,
-    0x19e7fefe, 0x62b5d7d7, 0xe64dabab, 0x9aec7676,
-    0x458fcaca, 0x9d1f8282, 0x4089c9c9, 0x87fa7d7d,
-    0x15effafa, 0xebb25959, 0xc98e4747, 0x0bfbf0f0,
-    0xec41adad, 0x67b3d4d4, 0xfd5fa2a2, 0xea45afaf,
-    0xbf239c9c, 0xf753a4a4, 0x96e47272, 0x5b9bc0c0,
-    0xc275b7b7, 0x1ce1fdfd, 0xae3d9393, 0x6a4c2626,
-    0x5a6c3636, 0x417e3f3f, 0x02f5f7f7, 0x4f83cccc,
-    0x5c683434, 0xf451a5a5, 0x34d1e5e5, 0x08f9f1f1,
-    0x93e27171, 0x73abd8d8, 0x53623131, 0x3f2a1515,
-    0x0c080404, 0x5295c7c7, 0x65462323, 0x5e9dc3c3,
-    0x28301818, 0xa1379696, 0x0f0a0505, 0xb52f9a9a,
-    0x090e0707, 0x36241212, 0x9b1b8080, 0x3ddfe2e2,
-    0x26cdebeb, 0x694e2727, 0xcd7fb2b2, 0x9fea7575,
-    0x1b120909, 0x9e1d8383, 0x74582c2c, 0x2e341a1a,
-    0x2d361b1b, 0xb2dc6e6e, 0xeeb45a5a, 0xfb5ba0a0,
-    0xf6a45252, 0x4d763b3b, 0x61b7d6d6, 0xce7db3b3,
-    0x7b522929, 0x3edde3e3, 0x715e2f2f, 0x97138484,
-    0xf5a65353, 0x68b9d1d1, 0x00000000, 0x2cc1eded,
-    0x60402020, 0x1fe3fcfc, 0xc879b1b1, 0xedb65b5b,
-    0xbed46a6a, 0x468dcbcb, 0xd967bebe, 0x4b723939,
-    0xde944a4a, 0xd4984c4c, 0xe8b05858, 0x4a85cfcf,
-    0x6bbbd0d0, 0x2ac5efef, 0xe54faaaa, 0x16edfbfb,
-    0xc5864343, 0xd79a4d4d, 0x55663333, 0x94118585,
-    0xcf8a4545, 0x10e9f9f9, 0x06040202, 0x81fe7f7f,
-    0xf0a05050, 0x44783c3c, 0xba259f9f, 0xe34ba8a8,
-    0xf3a25151, 0xfe5da3a3, 0xc0804040, 0x8a058f8f,
-    0xad3f9292, 0xbc219d9d, 0x48703838, 0x04f1f5f5,
-    0xdf63bcbc, 0xc177b6b6, 0x75afdada, 0x63422121,
-    0x30201010, 0x1ae5ffff, 0x0efdf3f3, 0x6dbfd2d2,
-    0x4c81cdcd, 0x14180c0c, 0x35261313, 0x2fc3ecec,
-    0xe1be5f5f, 0xa2359797, 0xcc884444, 0x392e1717,
-    0x5793c4c4, 0xf255a7a7, 0x82fc7e7e, 0x477a3d3d,
-    0xacc86464, 0xe7ba5d5d, 0x2b321919, 0x95e67373,
-    0xa0c06060, 0x98198181, 0xd19e4f4f, 0x7fa3dcdc,
-    0x66442222, 0x7e542a2a, 0xab3b9090, 0x830b8888,
-    0xca8c4646, 0x29c7eeee, 0xd36bb8b8, 0x3c281414,
-    0x79a7dede, 0xe2bc5e5e, 0x1d160b0b, 0x76addbdb,
-    0x3bdbe0e0, 0x56643232, 0x4e743a3a, 0x1e140a0a,
-    0xdb924949, 0x0a0c0606, 0x6c482424, 0xe4b85c5c,
-    0x5d9fc2c2, 0x6ebdd3d3, 0xef43acac, 0xa6c46262,
-    0xa8399191, 0xa4319595, 0x37d3e4e4, 0x8bf27979,
-    0x32d5e7e7, 0x438bc8c8, 0x596e3737, 0xb7da6d6d,
-    0x8c018d8d, 0x64b1d5d5, 0xd29c4e4e, 0xe049a9a9,
-    0xb4d86c6c, 0xfaac5656, 0x07f3f4f4, 0x25cfeaea,
-    0xafca6565, 0x8ef47a7a, 0xe947aeae, 0x18100808,
-    0xd56fbaba, 0x88f07878, 0x6f4a2525, 0x725c2e2e,
-    0x24381c1c, 0xf157a6a6, 0xc773b4b4, 0x5197c6c6,
-    0x23cbe8e8, 0x7ca1dddd, 0x9ce87474, 0x213e1f1f,
-    0xdd964b4b, 0xdc61bdbd, 0x860d8b8b, 0x850f8a8a,
-    0x90e07070, 0x427c3e3e, 0xc471b5b5, 0xaacc6666,
-    0xd8904848, 0x05060303, 0x01f7f6f6, 0x121c0e0e,
-    0xa3c26161, 0x5f6a3535, 0xf9ae5757, 0xd069b9b9,
-    0x91178686, 0x5899c1c1, 0x273a1d1d, 0xb9279e9e,
-    0x38d9e1e1, 0x13ebf8f8, 0xb32b9898, 0x33221111,
-    0xbbd26969, 0x70a9d9d9, 0x89078e8e, 0xa7339494,
-    0xb62d9b9b, 0x223c1e1e, 0x92158787, 0x20c9e9e9,
-    0x4987cece, 0xffaa5555, 0x78502828, 0x7aa5dfdf,
-    0x8f038c8c, 0xf859a1a1, 0x80098989, 0x171a0d0d,
-    0xda65bfbf, 0x31d7e6e6, 0xc6844242, 0xb8d06868,
-    0xc3824141, 0xb0299999, 0x775a2d2d, 0x111e0f0f,
-    0xcb7bb0b0, 0xfca85454, 0xd66dbbbb, 0x3a2c1616,
-};
-static const uint32_t TE2[256] = {
-    0x63a5c663, 0x7c84f87c, 0x7799ee77, 0x7b8df67b,
-    0xf20dfff2, 0x6bbdd66b, 0x6fb1de6f, 0xc55491c5,
-    0x30506030, 0x01030201, 0x67a9ce67, 0x2b7d562b,
-    0xfe19e7fe, 0xd762b5d7, 0xabe64dab, 0x769aec76,
-    0xca458fca, 0x829d1f82, 0xc94089c9, 0x7d87fa7d,
-    0xfa15effa, 0x59ebb259, 0x47c98e47, 0xf00bfbf0,
-    0xadec41ad, 0xd467b3d4, 0xa2fd5fa2, 0xafea45af,
-    0x9cbf239c, 0xa4f753a4, 0x7296e472, 0xc05b9bc0,
-    0xb7c275b7, 0xfd1ce1fd, 0x93ae3d93, 0x266a4c26,
-    0x365a6c36, 0x3f417e3f, 0xf702f5f7, 0xcc4f83cc,
-    0x345c6834, 0xa5f451a5, 0xe534d1e5, 0xf108f9f1,
-    0x7193e271, 0xd873abd8, 0x31536231, 0x153f2a15,
-    0x040c0804, 0xc75295c7, 0x23654623, 0xc35e9dc3,
-    0x18283018, 0x96a13796, 0x050f0a05, 0x9ab52f9a,
-    0x07090e07, 0x12362412, 0x809b1b80, 0xe23ddfe2,
-    0xeb26cdeb, 0x27694e27, 0xb2cd7fb2, 0x759fea75,
-    0x091b1209, 0x839e1d83, 0x2c74582c, 0x1a2e341a,
-    0x1b2d361b, 0x6eb2dc6e, 0x5aeeb45a, 0xa0fb5ba0,
-    0x52f6a452, 0x3b4d763b, 0xd661b7d6, 0xb3ce7db3,
-    0x297b5229, 0xe33edde3, 0x2f715e2f, 0x84971384,
-    0x53f5a653, 0xd168b9d1, 0x00000000, 0xed2cc1ed,
-    0x20604020, 0xfc1fe3fc, 0xb1c879b1, 0x5bedb65b,
-    0x6abed46a, 0xcb468dcb, 0xbed967be, 0x394b7239,
-    0x4ade944a, 0x4cd4984c, 0x58e8b058, 0xcf4a85cf,
-    0xd06bbbd0, 0xef2ac5ef, 0xaae54faa, 0xfb16edfb,
-    0x43c58643, 0x4dd79a4d, 0x33556633, 0x85941185,
-    0x45cf8a45, 0xf910e9f9, 0x02060402, 0x7f81fe7f,
-    0x50f0a050, 0x3c44783c, 0x9fba259f, 0xa8e34ba8,
-    0x51f3a251, 0xa3fe5da3, 0x40c08040, 0x8f8a058f,
-    0x92ad3f92, 0x9dbc219d, 0x38487038, 0xf504f1f5,
-    0xbcdf63bc, 0xb6c177b6, 0xda75afda, 0x21634221,
-    0x10302010, 0xff1ae5ff, 0xf30efdf3, 0xd26dbfd2,
-    0xcd4c81cd, 0x0c14180c, 0x13352613, 0xec2fc3ec,
-    0x5fe1be5f, 0x97a23597, 0x44cc8844, 0x17392e17,
-    0xc45793c4, 0xa7f255a7, 0x7e82fc7e, 0x3d477a3d,
-    0x64acc864, 0x5de7ba5d, 0x192b3219, 0x7395e673,
-    0x60a0c060, 0x81981981, 0x4fd19e4f, 0xdc7fa3dc,
-    0x22664422, 0x2a7e542a, 0x90ab3b90, 0x88830b88,
-    0x46ca8c46, 0xee29c7ee, 0xb8d36bb8, 0x143c2814,
-    0xde79a7de, 0x5ee2bc5e, 0x0b1d160b, 0xdb76addb,
-    0xe03bdbe0, 0x32566432, 0x3a4e743a, 0x0a1e140a,
-    0x49db9249, 0x060a0c06, 0x246c4824, 0x5ce4b85c,
-    0xc25d9fc2, 0xd36ebdd3, 0xacef43ac, 0x62a6c462,
-    0x91a83991, 0x95a43195, 0xe437d3e4, 0x798bf279,
-    0xe732d5e7, 0xc8438bc8, 0x37596e37, 0x6db7da6d,
-    0x8d8c018d, 0xd564b1d5, 0x4ed29c4e, 0xa9e049a9,
-    0x6cb4d86c, 0x56faac56, 0xf407f3f4, 0xea25cfea,
-    0x65afca65, 0x7a8ef47a, 0xaee947ae, 0x08181008,
-    0xbad56fba, 0x7888f078, 0x256f4a25, 0x2e725c2e,
-    0x1c24381c, 0xa6f157a6, 0xb4c773b4, 0xc65197c6,
-    0xe823cbe8, 0xdd7ca1dd, 0x749ce874, 0x1f213e1f,
-    0x4bdd964b, 0xbddc61bd, 0x8b860d8b, 0x8a850f8a,
-    0x7090e070, 0x3e427c3e, 0xb5c471b5, 0x66aacc66,
-    0x48d89048, 0x03050603, 0xf601f7f6, 0x0e121c0e,
-    0x61a3c261, 0x355f6a35, 0x57f9ae57, 0xb9d069b9,
-    0x86911786, 0xc15899c1, 0x1d273a1d, 0x9eb9279e,
-    0xe138d9e1, 0xf813ebf8, 0x98b32b98, 0x11332211,
-    0x69bbd269, 0xd970a9d9, 0x8e89078e, 0x94a73394,
-    0x9bb62d9b, 0x1e223c1e, 0x87921587, 0xe920c9e9,
-    0xce4987ce, 0x55ffaa55, 0x28785028, 0xdf7aa5df,
-    0x8c8f038c, 0xa1f859a1, 0x89800989, 0x0d171a0d,
-    0xbfda65bf, 0xe631d7e6, 0x42c68442, 0x68b8d068,
-    0x41c38241, 0x99b02999, 0x2d775a2d, 0x0f111e0f,
-    0xb0cb7bb0, 0x54fca854, 0xbbd66dbb, 0x163a2c16,
-};
-static const uint32_t TE3[256] = {
-    0x6363a5c6, 0x7c7c84f8, 0x777799ee, 0x7b7b8df6,
-    0xf2f20dff, 0x6b6bbdd6, 0x6f6fb1de, 0xc5c55491,
-    0x30305060, 0x01010302, 0x6767a9ce, 0x2b2b7d56,
-    0xfefe19e7, 0xd7d762b5, 0xababe64d, 0x76769aec,
-    0xcaca458f, 0x82829d1f, 0xc9c94089, 0x7d7d87fa,
-    0xfafa15ef, 0x5959ebb2, 0x4747c98e, 0xf0f00bfb,
-    0xadadec41, 0xd4d467b3, 0xa2a2fd5f, 0xafafea45,
-    0x9c9cbf23, 0xa4a4f753, 0x727296e4, 0xc0c05b9b,
-    0xb7b7c275, 0xfdfd1ce1, 0x9393ae3d, 0x26266a4c,
-    0x36365a6c, 0x3f3f417e, 0xf7f702f5, 0xcccc4f83,
-    0x34345c68, 0xa5a5f451, 0xe5e534d1, 0xf1f108f9,
-    0x717193e2, 0xd8d873ab, 0x31315362, 0x15153f2a,
-    0x04040c08, 0xc7c75295, 0x23236546, 0xc3c35e9d,
-    0x18182830, 0x9696a137, 0x05050f0a, 0x9a9ab52f,
-    0x0707090e, 0x12123624, 0x80809b1b, 0xe2e23ddf,
-    0xebeb26cd, 0x2727694e, 0xb2b2cd7f, 0x75759fea,
-    0x09091b12, 0x83839e1d, 0x2c2c7458, 0x1a1a2e34,
-    0x1b1b2d36, 0x6e6eb2dc, 0x5a5aeeb4, 0xa0a0fb5b,
-    0x5252f6a4, 0x3b3b4d76, 0xd6d661b7, 0xb3b3ce7d,
-    0x29297b52, 0xe3e33edd, 0x2f2f715e, 0x84849713,
-    0x5353f5a6, 0xd1d168b9, 0x00000000, 0xeded2cc1,
-    0x20206040, 0xfcfc1fe3, 0xb1b1c879, 0x5b5bedb6,
-    0x6a6abed4, 0xcbcb468d, 0xbebed967, 0x39394b72,
-    0x4a4ade94, 0x4c4cd498, 0x5858e8b0, 0xcfcf4a85,
-    0xd0d06bbb, 0xefef2ac5, 0xaaaae54f, 0xfbfb16ed,
-    0x4343c586, 0x4d4dd79a, 0x33335566, 0x85859411,
-    0x4545cf8a, 0xf9f910e9, 0x02020604, 0x7f7f81fe,
-    0x5050f0a0, 0x3c3c4478, 0x9f9fba25, 0xa8a8e34b,
-    0x5151f3a2, 0xa3a3fe5d, 0x4040c080, 0x8f8f8a05,
-    0x9292ad3f, 0x9d9dbc21, 0x38384870, 0xf5f504f1,
-    0xbcbcdf63, 0xb6b6c177, 0xdada75af, 0x21216342,
-    0x10103020, 0xffff1ae5, 0xf3f30efd, 0xd2d26dbf,
-    0xcdcd4c81, 0x0c0c1418, 0x13133526, 0xecec2fc3,
-    0x5f5fe1be, 0x9797a235, 0x4444cc88, 0x1717392e,
-    0xc4c45793, 0xa7a7f255, 0x7e7e82fc, 0x3d3d477a,
-    0x6464acc8, 0x5d5de7ba, 0x19192b32, 0x737395e6,
-    0x6060a0c0, 0x81819819, 0x4f4fd19e, 0xdcdc7fa3,
-    0x22226644, 0x2a2a7e54, 0x9090ab3b, 0x8888830b,
-    0x4646ca8c, 0xeeee29c7, 0xb8b8d36b, 0x14143c28,
-    0xdede79a7, 0x5e5ee2bc, 0x0b0b1d16, 0xdbdb76ad,
-    0xe0e03bdb, 0x32325664, 0x3a3a4e74, 0x0a0a1e14,
-    0x4949db92, 0x06060a0c, 0x24246c48, 0x5c5ce4b8,
-    0xc2c25d9f, 0xd3d36ebd, 0xacacef43, 0x6262a6c4,
-    0x9191a839, 0x9595a431, 0xe4e437d3, 0x79798bf2,
-    0xe7e732d5, 0xc8c8438b, 0x3737596e, 0x6d6db7da,
-    0x8d8d8c01, 0xd5d564b1, 0x4e4ed29c, 0xa9a9e049,
-    0x6c6cb4d8, 0x5656faac, 0xf4f407f3, 0xeaea25cf,
-    0x6565afca, 0x7a7a8ef4, 0xaeaee947, 0x08081810,
-    0xbabad56f, 0x787888f0, 0x25256f4a, 0x2e2e725c,
-    0x1c1c2438, 0xa6a6f157, 0xb4b4c773, 0xc6c65197,
-    0xe8e823cb, 0xdddd7ca1, 0x74749ce8, 0x1f1f213e,
-    0x4b4bdd96, 0xbdbddc61, 0x8b8b860d, 0x8a8a850f,
-    0x707090e0, 0x3e3e427c, 0xb5b5c471, 0x6666aacc,
-    0x4848d890, 0x03030506, 0xf6f601f7, 0x0e0e121c,
-    0x6161a3c2, 0x35355f6a, 0x5757f9ae, 0xb9b9d069,
-    0x86869117, 0xc1c15899, 0x1d1d273a, 0x9e9eb927,
-    0xe1e138d9, 0xf8f813eb, 0x9898b32b, 0x11113322,
-    0x6969bbd2, 0xd9d970a9, 0x8e8e8907, 0x9494a733,
-    0x9b9bb62d, 0x1e1e223c, 0x87879215, 0xe9e920c9,
-    0xcece4987, 0x5555ffaa, 0x28287850, 0xdfdf7aa5,
-    0x8c8c8f03, 0xa1a1f859, 0x89898009, 0x0d0d171a,
-    0xbfbfda65, 0xe6e631d7, 0x4242c684, 0x6868b8d0,
-    0x4141c382, 0x9999b029, 0x2d2d775a, 0x0f0f111e,
-    0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c,
-};
-
-#ifndef PELI_TAB
-static const uint32_t Te4_0[] = {
-0x00000063, 0x0000007c, 0x00000077, 0x0000007b, 0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
-0x00000030, 0x00000001, 0x00000067, 0x0000002b, 0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
-0x000000ca, 0x00000082, 0x000000c9, 0x0000007d, 0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
-0x000000ad, 0x000000d4, 0x000000a2, 0x000000af, 0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
-0x000000b7, 0x000000fd, 0x00000093, 0x00000026, 0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
-0x00000034, 0x000000a5, 0x000000e5, 0x000000f1, 0x00000071, 0x000000d8, 0x00000031, 0x00000015,
-0x00000004, 0x000000c7, 0x00000023, 0x000000c3, 0x00000018, 0x00000096, 0x00000005, 0x0000009a,
-0x00000007, 0x00000012, 0x00000080, 0x000000e2, 0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
-0x00000009, 0x00000083, 0x0000002c, 0x0000001a, 0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
-0x00000052, 0x0000003b, 0x000000d6, 0x000000b3, 0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
-0x00000053, 0x000000d1, 0x00000000, 0x000000ed, 0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
-0x0000006a, 0x000000cb, 0x000000be, 0x00000039, 0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
-0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb, 0x00000043, 0x0000004d, 0x00000033, 0x00000085,
-0x00000045, 0x000000f9, 0x00000002, 0x0000007f, 0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
-0x00000051, 0x000000a3, 0x00000040, 0x0000008f, 0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
-0x000000bc, 0x000000b6, 0x000000da, 0x00000021, 0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
-0x000000cd, 0x0000000c, 0x00000013, 0x000000ec, 0x0000005f, 0x00000097, 0x00000044, 0x00000017,
-0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d, 0x00000064, 0x0000005d, 0x00000019, 0x00000073,
-0x00000060, 0x00000081, 0x0000004f, 0x000000dc, 0x00000022, 0x0000002a, 0x00000090, 0x00000088,
-0x00000046, 0x000000ee, 0x000000b8, 0x00000014, 0x000000de, 0x0000005e, 0x0000000b, 0x000000db,
-0x000000e0, 0x00000032, 0x0000003a, 0x0000000a, 0x00000049, 0x00000006, 0x00000024, 0x0000005c,
-0x000000c2, 0x000000d3, 0x000000ac, 0x00000062, 0x00000091, 0x00000095, 0x000000e4, 0x00000079,
-0x000000e7, 0x000000c8, 0x00000037, 0x0000006d, 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9,
-0x0000006c, 0x00000056, 0x000000f4, 0x000000ea, 0x00000065, 0x0000007a, 0x000000ae, 0x00000008,
-0x000000ba, 0x00000078, 0x00000025, 0x0000002e, 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6,
-0x000000e8, 0x000000dd, 0x00000074, 0x0000001f, 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a,
-0x00000070, 0x0000003e, 0x000000b5, 0x00000066, 0x00000048, 0x00000003, 0x000000f6, 0x0000000e,
-0x00000061, 0x00000035, 0x00000057, 0x000000b9, 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e,
-0x000000e1, 0x000000f8, 0x00000098, 0x00000011, 0x00000069, 0x000000d9, 0x0000008e, 0x00000094,
-0x0000009b, 0x0000001e, 0x00000087, 0x000000e9, 0x000000ce, 0x00000055, 0x00000028, 0x000000df,
-0x0000008c, 0x000000a1, 0x00000089, 0x0000000d, 0x000000bf, 0x000000e6, 0x00000042, 0x00000068,
-0x00000041, 0x00000099, 0x0000002d, 0x0000000f, 0x000000b0, 0x00000054, 0x000000bb, 0x00000016
-};
-
-static const uint32_t Te4_1[] = {
-0x00006300, 0x00007c00, 0x00007700, 0x00007b00, 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500,
-0x00003000, 0x00000100, 0x00006700, 0x00002b00, 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600,
-0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00, 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000,
-0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00, 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000,
-0x0000b700, 0x0000fd00, 0x00009300, 0x00002600, 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00,
-0x00003400, 0x0000a500, 0x0000e500, 0x0000f100, 0x00007100, 0x0000d800, 0x00003100, 0x00001500,
-0x00000400, 0x0000c700, 0x00002300, 0x0000c300, 0x00001800, 0x00009600, 0x00000500, 0x00009a00,
-0x00000700, 0x00001200, 0x00008000, 0x0000e200, 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500,
-0x00000900, 0x00008300, 0x00002c00, 0x00001a00, 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000,
-0x00005200, 0x00003b00, 0x0000d600, 0x0000b300, 0x00002900, 0x0000e300, 0x00002f00, 0x00008400,
-0x00005300, 0x0000d100, 0x00000000, 0x0000ed00, 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00,
-0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900, 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00,
-0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00, 0x00004300, 0x00004d00, 0x00003300, 0x00008500,
-0x00004500, 0x0000f900, 0x00000200, 0x00007f00, 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800,
-0x00005100, 0x0000a300, 0x00004000, 0x00008f00, 0x00009200, 0x00009d00, 0x00003800, 0x0000f500,
-0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100, 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200,
-0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00, 0x00005f00, 0x00009700, 0x00004400, 0x00001700,
-0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00, 0x00006400, 0x00005d00, 0x00001900, 0x00007300,
-0x00006000, 0x00008100, 0x00004f00, 0x0000dc00, 0x00002200, 0x00002a00, 0x00009000, 0x00008800,
-0x00004600, 0x0000ee00, 0x0000b800, 0x00001400, 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00,
-0x0000e000, 0x00003200, 0x00003a00, 0x00000a00, 0x00004900, 0x00000600, 0x00002400, 0x00005c00,
-0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200, 0x00009100, 0x00009500, 0x0000e400, 0x00007900,
-0x0000e700, 0x0000c800, 0x00003700, 0x00006d00, 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900,
-0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00, 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800,
-0x0000ba00, 0x00007800, 0x00002500, 0x00002e00, 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600,
-0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00, 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00,
-0x00007000, 0x00003e00, 0x0000b500, 0x00006600, 0x00004800, 0x00000300, 0x0000f600, 0x00000e00,
-0x00006100, 0x00003500, 0x00005700, 0x0000b900, 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00,
-0x0000e100, 0x0000f800, 0x00009800, 0x00001100, 0x00006900, 0x0000d900, 0x00008e00, 0x00009400,
-0x00009b00, 0x00001e00, 0x00008700, 0x0000e900, 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00,
-0x00008c00, 0x0000a100, 0x00008900, 0x00000d00, 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800,
-0x00004100, 0x00009900, 0x00002d00, 0x00000f00, 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
-};
-
-static const uint32_t Te4_2[] = {
-0x00630000, 0x007c0000, 0x00770000, 0x007b0000, 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000,
-0x00300000, 0x00010000, 0x00670000, 0x002b0000, 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000,
-0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000, 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000,
-0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000, 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000,
-0x00b70000, 0x00fd0000, 0x00930000, 0x00260000, 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000,
-0x00340000, 0x00a50000, 0x00e50000, 0x00f10000, 0x00710000, 0x00d80000, 0x00310000, 0x00150000,
-0x00040000, 0x00c70000, 0x00230000, 0x00c30000, 0x00180000, 0x00960000, 0x00050000, 0x009a0000,
-0x00070000, 0x00120000, 0x00800000, 0x00e20000, 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000,
-0x00090000, 0x00830000, 0x002c0000, 0x001a0000, 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000,
-0x00520000, 0x003b0000, 0x00d60000, 0x00b30000, 0x00290000, 0x00e30000, 0x002f0000, 0x00840000,
-0x00530000, 0x00d10000, 0x00000000, 0x00ed0000, 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000,
-0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000, 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000,
-0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000, 0x00430000, 0x004d0000, 0x00330000, 0x00850000,
-0x00450000, 0x00f90000, 0x00020000, 0x007f0000, 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000,
-0x00510000, 0x00a30000, 0x00400000, 0x008f0000, 0x00920000, 0x009d0000, 0x00380000, 0x00f50000,
-0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000, 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000,
-0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000, 0x005f0000, 0x00970000, 0x00440000, 0x00170000,
-0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000, 0x00640000, 0x005d0000, 0x00190000, 0x00730000,
-0x00600000, 0x00810000, 0x004f0000, 0x00dc0000, 0x00220000, 0x002a0000, 0x00900000, 0x00880000,
-0x00460000, 0x00ee0000, 0x00b80000, 0x00140000, 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000,
-0x00e00000, 0x00320000, 0x003a0000, 0x000a0000, 0x00490000, 0x00060000, 0x00240000, 0x005c0000,
-0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000, 0x00910000, 0x00950000, 0x00e40000, 0x00790000,
-0x00e70000, 0x00c80000, 0x00370000, 0x006d0000, 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000,
-0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000, 0x00650000, 0x007a0000, 0x00ae0000, 0x00080000,
-0x00ba0000, 0x00780000, 0x00250000, 0x002e0000, 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000,
-0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000, 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000,
-0x00700000, 0x003e0000, 0x00b50000, 0x00660000, 0x00480000, 0x00030000, 0x00f60000, 0x000e0000,
-0x00610000, 0x00350000, 0x00570000, 0x00b90000, 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000,
-0x00e10000, 0x00f80000, 0x00980000, 0x00110000, 0x00690000, 0x00d90000, 0x008e0000, 0x00940000,
-0x009b0000, 0x001e0000, 0x00870000, 0x00e90000, 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000,
-0x008c0000, 0x00a10000, 0x00890000, 0x000d0000, 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000,
-0x00410000, 0x00990000, 0x002d0000, 0x000f0000, 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
-};
-
-static const uint32_t Te4_3[] = {
-0x63000000, 0x7c000000, 0x77000000, 0x7b000000, 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000,
-0x30000000, 0x01000000, 0x67000000, 0x2b000000, 0xfe000000, 0xd7000000, 0xab000000, 0x76000000,
-0xca000000, 0x82000000, 0xc9000000, 0x7d000000, 0xfa000000, 0x59000000, 0x47000000, 0xf0000000,
-0xad000000, 0xd4000000, 0xa2000000, 0xaf000000, 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000,
-0xb7000000, 0xfd000000, 0x93000000, 0x26000000, 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000,
-0x34000000, 0xa5000000, 0xe5000000, 0xf1000000, 0x71000000, 0xd8000000, 0x31000000, 0x15000000,
-0x04000000, 0xc7000000, 0x23000000, 0xc3000000, 0x18000000, 0x96000000, 0x05000000, 0x9a000000,
-0x07000000, 0x12000000, 0x80000000, 0xe2000000, 0xeb000000, 0x27000000, 0xb2000000, 0x75000000,
-0x09000000, 0x83000000, 0x2c000000, 0x1a000000, 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000,
-0x52000000, 0x3b000000, 0xd6000000, 0xb3000000, 0x29000000, 0xe3000000, 0x2f000000, 0x84000000,
-0x53000000, 0xd1000000, 0x00000000, 0xed000000, 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000,
-0x6a000000, 0xcb000000, 0xbe000000, 0x39000000, 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000,
-0xd0000000, 0xef000000, 0xaa000000, 0xfb000000, 0x43000000, 0x4d000000, 0x33000000, 0x85000000,
-0x45000000, 0xf9000000, 0x02000000, 0x7f000000, 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000,
-0x51000000, 0xa3000000, 0x40000000, 0x8f000000, 0x92000000, 0x9d000000, 0x38000000, 0xf5000000,
-0xbc000000, 0xb6000000, 0xda000000, 0x21000000, 0x10000000, 0xff000000, 0xf3000000, 0xd2000000,
-0xcd000000, 0x0c000000, 0x13000000, 0xec000000, 0x5f000000, 0x97000000, 0x44000000, 0x17000000,
-0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000, 0x64000000, 0x5d000000, 0x19000000, 0x73000000,
-0x60000000, 0x81000000, 0x4f000000, 0xdc000000, 0x22000000, 0x2a000000, 0x90000000, 0x88000000,
-0x46000000, 0xee000000, 0xb8000000, 0x14000000, 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000,
-0xe0000000, 0x32000000, 0x3a000000, 0x0a000000, 0x49000000, 0x06000000, 0x24000000, 0x5c000000,
-0xc2000000, 0xd3000000, 0xac000000, 0x62000000, 0x91000000, 0x95000000, 0xe4000000, 0x79000000,
-0xe7000000, 0xc8000000, 0x37000000, 0x6d000000, 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000,
-0x6c000000, 0x56000000, 0xf4000000, 0xea000000, 0x65000000, 0x7a000000, 0xae000000, 0x08000000,
-0xba000000, 0x78000000, 0x25000000, 0x2e000000, 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000,
-0xe8000000, 0xdd000000, 0x74000000, 0x1f000000, 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000,
-0x70000000, 0x3e000000, 0xb5000000, 0x66000000, 0x48000000, 0x03000000, 0xf6000000, 0x0e000000,
-0x61000000, 0x35000000, 0x57000000, 0xb9000000, 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000,
-0xe1000000, 0xf8000000, 0x98000000, 0x11000000, 0x69000000, 0xd9000000, 0x8e000000, 0x94000000,
-0x9b000000, 0x1e000000, 0x87000000, 0xe9000000, 0xce000000, 0x55000000, 0x28000000, 0xdf000000,
-0x8c000000, 0xa1000000, 0x89000000, 0x0d000000, 0xbf000000, 0xe6000000, 0x42000000, 0x68000000,
-0x41000000, 0x99000000, 0x2d000000, 0x0f000000, 0xb0000000, 0x54000000, 0xbb000000, 0x16000000
-};
-#endif /* PELI_TAB */
-
-#ifndef ENCRYPT_ONLY
-
-static const uint32_t TD1[256] = {
-    0x5051f4a7, 0x537e4165, 0xc31a17a4, 0x963a275e,
-    0xcb3bab6b, 0xf11f9d45, 0xabacfa58, 0x934be303,
-    0x552030fa, 0xf6ad766d, 0x9188cc76, 0x25f5024c,
-    0xfc4fe5d7, 0xd7c52acb, 0x80263544, 0x8fb562a3,
-    0x49deb15a, 0x6725ba1b, 0x9845ea0e, 0xe15dfec0,
-    0x02c32f75, 0x12814cf0, 0xa38d4697, 0xc66bd3f9,
-    0xe7038f5f, 0x9515929c, 0xebbf6d7a, 0xda955259,
-    0x2dd4be83, 0xd3587421, 0x2949e069, 0x448ec9c8,
-    0x6a75c289, 0x78f48e79, 0x6b99583e, 0xdd27b971,
-    0xb6bee14f, 0x17f088ad, 0x66c920ac, 0xb47dce3a,
-    0x1863df4a, 0x82e51a31, 0x60975133, 0x4562537f,
-    0xe0b16477, 0x84bb6bae, 0x1cfe81a0, 0x94f9082b,
-    0x58704868, 0x198f45fd, 0x8794de6c, 0xb7527bf8,
-    0x23ab73d3, 0xe2724b02, 0x57e31f8f, 0x2a6655ab,
-    0x07b2eb28, 0x032fb5c2, 0x9a86c57b, 0xa5d33708,
-    0xf2302887, 0xb223bfa5, 0xba02036a, 0x5ced1682,
-    0x2b8acf1c, 0x92a779b4, 0xf0f307f2, 0xa14e69e2,
-    0xcd65daf4, 0xd50605be, 0x1fd13462, 0x8ac4a6fe,
-    0x9d342e53, 0xa0a2f355, 0x32058ae1, 0x75a4f6eb,
-    0x390b83ec, 0xaa4060ef, 0x065e719f, 0x51bd6e10,
-    0xf93e218a, 0x3d96dd06, 0xaedd3e05, 0x464de6bd,
-    0xb591548d, 0x0571c45d, 0x6f0406d4, 0xff605015,
-    0x241998fb, 0x97d6bde9, 0xcc894043, 0x7767d99e,
-    0xbdb0e842, 0x8807898b, 0x38e7195b, 0xdb79c8ee,
-    0x47a17c0a, 0xe97c420f, 0xc9f8841e, 0x00000000,
-    0x83098086, 0x48322bed, 0xac1e1170, 0x4e6c5a72,
-    0xfbfd0eff, 0x560f8538, 0x1e3daed5, 0x27362d39,
-    0x640a0fd9, 0x21685ca6, 0xd19b5b54, 0x3a24362e,
-    0xb10c0a67, 0x0f9357e7, 0xd2b4ee96, 0x9e1b9b91,
-    0x4f80c0c5, 0xa261dc20, 0x695a774b, 0x161c121a,
-    0x0ae293ba, 0xe5c0a02a, 0x433c22e0, 0x1d121b17,
-    0x0b0e090d, 0xadf28bc7, 0xb92db6a8, 0xc8141ea9,
-    0x8557f119, 0x4caf7507, 0xbbee99dd, 0xfda37f60,
-    0x9ff70126, 0xbc5c72f5, 0xc544663b, 0x345bfb7e,
-    0x768b4329, 0xdccb23c6, 0x68b6edfc, 0x63b8e4f1,
-    0xcad731dc, 0x10426385, 0x40139722, 0x2084c611,
-    0x7d854a24, 0xf8d2bb3d, 0x11aef932, 0x6dc729a1,
-    0x4b1d9e2f, 0xf3dcb230, 0xec0d8652, 0xd077c1e3,
-    0x6c2bb316, 0x99a970b9, 0xfa119448, 0x2247e964,
-    0xc4a8fc8c, 0x1aa0f03f, 0xd8567d2c, 0xef223390,
-    0xc787494e, 0xc1d938d1, 0xfe8ccaa2, 0x3698d40b,
-    0xcfa6f581, 0x28a57ade, 0x26dab78e, 0xa43fadbf,
-    0xe42c3a9d, 0x0d507892, 0x9b6a5fcc, 0x62547e46,
-    0xc2f68d13, 0xe890d8b8, 0x5e2e39f7, 0xf582c3af,
-    0xbe9f5d80, 0x7c69d093, 0xa96fd52d, 0xb3cf2512,
-    0x3bc8ac99, 0xa710187d, 0x6ee89c63, 0x7bdb3bbb,
-    0x09cd2678, 0xf46e5918, 0x01ec9ab7, 0xa8834f9a,
-    0x65e6956e, 0x7eaaffe6, 0x0821bccf, 0xe6ef15e8,
-    0xd9bae79b, 0xce4a6f36, 0xd4ea9f09, 0xd629b07c,
-    0xaf31a4b2, 0x312a3f23, 0x30c6a594, 0xc035a266,
-    0x37744ebc, 0xa6fc82ca, 0xb0e090d0, 0x1533a7d8,
-    0x4af10498, 0xf741ecda, 0x0e7fcd50, 0x2f1791f6,
-    0x8d764dd6, 0x4d43efb0, 0x54ccaa4d, 0xdfe49604,
-    0xe39ed1b5, 0x1b4c6a88, 0xb8c12c1f, 0x7f466551,
-    0x049d5eea, 0x5d018c35, 0x73fa8774, 0x2efb0b41,
-    0x5ab3671d, 0x5292dbd2, 0x33e91056, 0x136dd647,
-    0x8c9ad761, 0x7a37a10c, 0x8e59f814, 0x89eb133c,
-    0xeecea927, 0x35b761c9, 0xede11ce5, 0x3c7a47b1,
-    0x599cd2df, 0x3f55f273, 0x791814ce, 0xbf73c737,
-    0xea53f7cd, 0x5b5ffdaa, 0x14df3d6f, 0x867844db,
-    0x81caaff3, 0x3eb968c4, 0x2c382434, 0x5fc2a340,
-    0x72161dc3, 0x0cbce225, 0x8b283c49, 0x41ff0d95,
-    0x7139a801, 0xde080cb3, 0x9cd8b4e4, 0x906456c1,
-    0x617bcb84, 0x70d532b6, 0x74486c5c, 0x42d0b857,
-};
-static const uint32_t TD2[256] = {
-    0xa75051f4, 0x65537e41, 0xa4c31a17, 0x5e963a27,
-    0x6bcb3bab, 0x45f11f9d, 0x58abacfa, 0x03934be3,
-    0xfa552030, 0x6df6ad76, 0x769188cc, 0x4c25f502,
-    0xd7fc4fe5, 0xcbd7c52a, 0x44802635, 0xa38fb562,
-    0x5a49deb1, 0x1b6725ba, 0x0e9845ea, 0xc0e15dfe,
-    0x7502c32f, 0xf012814c, 0x97a38d46, 0xf9c66bd3,
-    0x5fe7038f, 0x9c951592, 0x7aebbf6d, 0x59da9552,
-    0x832dd4be, 0x21d35874, 0x692949e0, 0xc8448ec9,
-    0x896a75c2, 0x7978f48e, 0x3e6b9958, 0x71dd27b9,
-    0x4fb6bee1, 0xad17f088, 0xac66c920, 0x3ab47dce,
-    0x4a1863df, 0x3182e51a, 0x33609751, 0x7f456253,
-    0x77e0b164, 0xae84bb6b, 0xa01cfe81, 0x2b94f908,
-    0x68587048, 0xfd198f45, 0x6c8794de, 0xf8b7527b,
-    0xd323ab73, 0x02e2724b, 0x8f57e31f, 0xab2a6655,
-    0x2807b2eb, 0xc2032fb5, 0x7b9a86c5, 0x08a5d337,
-    0x87f23028, 0xa5b223bf, 0x6aba0203, 0x825ced16,
-    0x1c2b8acf, 0xb492a779, 0xf2f0f307, 0xe2a14e69,
-    0xf4cd65da, 0xbed50605, 0x621fd134, 0xfe8ac4a6,
-    0x539d342e, 0x55a0a2f3, 0xe132058a, 0xeb75a4f6,
-    0xec390b83, 0xefaa4060, 0x9f065e71, 0x1051bd6e,
-    0x8af93e21, 0x063d96dd, 0x05aedd3e, 0xbd464de6,
-    0x8db59154, 0x5d0571c4, 0xd46f0406, 0x15ff6050,
-    0xfb241998, 0xe997d6bd, 0x43cc8940, 0x9e7767d9,
-    0x42bdb0e8, 0x8b880789, 0x5b38e719, 0xeedb79c8,
-    0x0a47a17c, 0x0fe97c42, 0x1ec9f884, 0x00000000,
-    0x86830980, 0xed48322b, 0x70ac1e11, 0x724e6c5a,
-    0xfffbfd0e, 0x38560f85, 0xd51e3dae, 0x3927362d,
-    0xd9640a0f, 0xa621685c, 0x54d19b5b, 0x2e3a2436,
-    0x67b10c0a, 0xe70f9357, 0x96d2b4ee, 0x919e1b9b,
-    0xc54f80c0, 0x20a261dc, 0x4b695a77, 0x1a161c12,
-    0xba0ae293, 0x2ae5c0a0, 0xe0433c22, 0x171d121b,
-    0x0d0b0e09, 0xc7adf28b, 0xa8b92db6, 0xa9c8141e,
-    0x198557f1, 0x074caf75, 0xddbbee99, 0x60fda37f,
-    0x269ff701, 0xf5bc5c72, 0x3bc54466, 0x7e345bfb,
-    0x29768b43, 0xc6dccb23, 0xfc68b6ed, 0xf163b8e4,
-    0xdccad731, 0x85104263, 0x22401397, 0x112084c6,
-    0x247d854a, 0x3df8d2bb, 0x3211aef9, 0xa16dc729,
-    0x2f4b1d9e, 0x30f3dcb2, 0x52ec0d86, 0xe3d077c1,
-    0x166c2bb3, 0xb999a970, 0x48fa1194, 0x642247e9,
-    0x8cc4a8fc, 0x3f1aa0f0, 0x2cd8567d, 0x90ef2233,
-    0x4ec78749, 0xd1c1d938, 0xa2fe8cca, 0x0b3698d4,
-    0x81cfa6f5, 0xde28a57a, 0x8e26dab7, 0xbfa43fad,
-    0x9de42c3a, 0x920d5078, 0xcc9b6a5f, 0x4662547e,
-    0x13c2f68d, 0xb8e890d8, 0xf75e2e39, 0xaff582c3,
-    0x80be9f5d, 0x937c69d0, 0x2da96fd5, 0x12b3cf25,
-    0x993bc8ac, 0x7da71018, 0x636ee89c, 0xbb7bdb3b,
-    0x7809cd26, 0x18f46e59, 0xb701ec9a, 0x9aa8834f,
-    0x6e65e695, 0xe67eaaff, 0xcf0821bc, 0xe8e6ef15,
-    0x9bd9bae7, 0x36ce4a6f, 0x09d4ea9f, 0x7cd629b0,
-    0xb2af31a4, 0x23312a3f, 0x9430c6a5, 0x66c035a2,
-    0xbc37744e, 0xcaa6fc82, 0xd0b0e090, 0xd81533a7,
-    0x984af104, 0xdaf741ec, 0x500e7fcd, 0xf62f1791,
-    0xd68d764d, 0xb04d43ef, 0x4d54ccaa, 0x04dfe496,
-    0xb5e39ed1, 0x881b4c6a, 0x1fb8c12c, 0x517f4665,
-    0xea049d5e, 0x355d018c, 0x7473fa87, 0x412efb0b,
-    0x1d5ab367, 0xd25292db, 0x5633e910, 0x47136dd6,
-    0x618c9ad7, 0x0c7a37a1, 0x148e59f8, 0x3c89eb13,
-    0x27eecea9, 0xc935b761, 0xe5ede11c, 0xb13c7a47,
-    0xdf599cd2, 0x733f55f2, 0xce791814, 0x37bf73c7,
-    0xcdea53f7, 0xaa5b5ffd, 0x6f14df3d, 0xdb867844,
-    0xf381caaf, 0xc43eb968, 0x342c3824, 0x405fc2a3,
-    0xc372161d, 0x250cbce2, 0x498b283c, 0x9541ff0d,
-    0x017139a8, 0xb3de080c, 0xe49cd8b4, 0xc1906456,
-    0x84617bcb, 0xb670d532, 0x5c74486c, 0x5742d0b8,
-};
-static const uint32_t TD3[256] = {
-    0xf4a75051, 0x4165537e, 0x17a4c31a, 0x275e963a,
-    0xab6bcb3b, 0x9d45f11f, 0xfa58abac, 0xe303934b,
-    0x30fa5520, 0x766df6ad, 0xcc769188, 0x024c25f5,
-    0xe5d7fc4f, 0x2acbd7c5, 0x35448026, 0x62a38fb5,
-    0xb15a49de, 0xba1b6725, 0xea0e9845, 0xfec0e15d,
-    0x2f7502c3, 0x4cf01281, 0x4697a38d, 0xd3f9c66b,
-    0x8f5fe703, 0x929c9515, 0x6d7aebbf, 0x5259da95,
-    0xbe832dd4, 0x7421d358, 0xe0692949, 0xc9c8448e,
-    0xc2896a75, 0x8e7978f4, 0x583e6b99, 0xb971dd27,
-    0xe14fb6be, 0x88ad17f0, 0x20ac66c9, 0xce3ab47d,
-    0xdf4a1863, 0x1a3182e5, 0x51336097, 0x537f4562,
-    0x6477e0b1, 0x6bae84bb, 0x81a01cfe, 0x082b94f9,
-    0x48685870, 0x45fd198f, 0xde6c8794, 0x7bf8b752,
-    0x73d323ab, 0x4b02e272, 0x1f8f57e3, 0x55ab2a66,
-    0xeb2807b2, 0xb5c2032f, 0xc57b9a86, 0x3708a5d3,
-    0x2887f230, 0xbfa5b223, 0x036aba02, 0x16825ced,
-    0xcf1c2b8a, 0x79b492a7, 0x07f2f0f3, 0x69e2a14e,
-    0xdaf4cd65, 0x05bed506, 0x34621fd1, 0xa6fe8ac4,
-    0x2e539d34, 0xf355a0a2, 0x8ae13205, 0xf6eb75a4,
-    0x83ec390b, 0x60efaa40, 0x719f065e, 0x6e1051bd,
-    0x218af93e, 0xdd063d96, 0x3e05aedd, 0xe6bd464d,
-    0x548db591, 0xc45d0571, 0x06d46f04, 0x5015ff60,
-    0x98fb2419, 0xbde997d6, 0x4043cc89, 0xd99e7767,
-    0xe842bdb0, 0x898b8807, 0x195b38e7, 0xc8eedb79,
-    0x7c0a47a1, 0x420fe97c, 0x841ec9f8, 0x00000000,
-    0x80868309, 0x2bed4832, 0x1170ac1e, 0x5a724e6c,
-    0x0efffbfd, 0x8538560f, 0xaed51e3d, 0x2d392736,
-    0x0fd9640a, 0x5ca62168, 0x5b54d19b, 0x362e3a24,
-    0x0a67b10c, 0x57e70f93, 0xee96d2b4, 0x9b919e1b,
-    0xc0c54f80, 0xdc20a261, 0x774b695a, 0x121a161c,
-    0x93ba0ae2, 0xa02ae5c0, 0x22e0433c, 0x1b171d12,
-    0x090d0b0e, 0x8bc7adf2, 0xb6a8b92d, 0x1ea9c814,
-    0xf1198557, 0x75074caf, 0x99ddbbee, 0x7f60fda3,
-    0x01269ff7, 0x72f5bc5c, 0x663bc544, 0xfb7e345b,
-    0x4329768b, 0x23c6dccb, 0xedfc68b6, 0xe4f163b8,
-    0x31dccad7, 0x63851042, 0x97224013, 0xc6112084,
-    0x4a247d85, 0xbb3df8d2, 0xf93211ae, 0x29a16dc7,
-    0x9e2f4b1d, 0xb230f3dc, 0x8652ec0d, 0xc1e3d077,
-    0xb3166c2b, 0x70b999a9, 0x9448fa11, 0xe9642247,
-    0xfc8cc4a8, 0xf03f1aa0, 0x7d2cd856, 0x3390ef22,
-    0x494ec787, 0x38d1c1d9, 0xcaa2fe8c, 0xd40b3698,
-    0xf581cfa6, 0x7ade28a5, 0xb78e26da, 0xadbfa43f,
-    0x3a9de42c, 0x78920d50, 0x5fcc9b6a, 0x7e466254,
-    0x8d13c2f6, 0xd8b8e890, 0x39f75e2e, 0xc3aff582,
-    0x5d80be9f, 0xd0937c69, 0xd52da96f, 0x2512b3cf,
-    0xac993bc8, 0x187da710, 0x9c636ee8, 0x3bbb7bdb,
-    0x267809cd, 0x5918f46e, 0x9ab701ec, 0x4f9aa883,
-    0x956e65e6, 0xffe67eaa, 0xbccf0821, 0x15e8e6ef,
-    0xe79bd9ba, 0x6f36ce4a, 0x9f09d4ea, 0xb07cd629,
-    0xa4b2af31, 0x3f23312a, 0xa59430c6, 0xa266c035,
-    0x4ebc3774, 0x82caa6fc, 0x90d0b0e0, 0xa7d81533,
-    0x04984af1, 0xecdaf741, 0xcd500e7f, 0x91f62f17,
-    0x4dd68d76, 0xefb04d43, 0xaa4d54cc, 0x9604dfe4,
-    0xd1b5e39e, 0x6a881b4c, 0x2c1fb8c1, 0x65517f46,
-    0x5eea049d, 0x8c355d01, 0x877473fa, 0x0b412efb,
-    0x671d5ab3, 0xdbd25292, 0x105633e9, 0xd647136d,
-    0xd7618c9a, 0xa10c7a37, 0xf8148e59, 0x133c89eb,
-    0xa927eece, 0x61c935b7, 0x1ce5ede1, 0x47b13c7a,
-    0xd2df599c, 0xf2733f55, 0x14ce7918, 0xc737bf73,
-    0xf7cdea53, 0xfdaa5b5f, 0x3d6f14df, 0x44db8678,
-    0xaff381ca, 0x68c43eb9, 0x24342c38, 0xa3405fc2,
-    0x1dc37216, 0xe2250cbc, 0x3c498b28, 0x0d9541ff,
-    0xa8017139, 0x0cb3de08, 0xb4e49cd8, 0x56c19064,
-    0xcb84617b, 0x32b670d5, 0x6c5c7448, 0xb85742d0,
-};
-
-static const uint32_t Tks0[] = {
-0x00000000, 0x0e090d0b, 0x1c121a16, 0x121b171d, 0x3824342c, 0x362d3927, 0x24362e3a, 0x2a3f2331,
-0x70486858, 0x7e416553, 0x6c5a724e, 0x62537f45, 0x486c5c74, 0x4665517f, 0x547e4662, 0x5a774b69,
-0xe090d0b0, 0xee99ddbb, 0xfc82caa6, 0xf28bc7ad, 0xd8b4e49c, 0xd6bde997, 0xc4a6fe8a, 0xcaaff381,
-0x90d8b8e8, 0x9ed1b5e3, 0x8ccaa2fe, 0x82c3aff5, 0xa8fc8cc4, 0xa6f581cf, 0xb4ee96d2, 0xbae79bd9,
-0xdb3bbb7b, 0xd532b670, 0xc729a16d, 0xc920ac66, 0xe31f8f57, 0xed16825c, 0xff0d9541, 0xf104984a,
-0xab73d323, 0xa57ade28, 0xb761c935, 0xb968c43e, 0x9357e70f, 0x9d5eea04, 0x8f45fd19, 0x814cf012,
-0x3bab6bcb, 0x35a266c0, 0x27b971dd, 0x29b07cd6, 0x038f5fe7, 0x0d8652ec, 0x1f9d45f1, 0x119448fa,
-0x4be30393, 0x45ea0e98, 0x57f11985, 0x59f8148e, 0x73c737bf, 0x7dce3ab4, 0x6fd52da9, 0x61dc20a2,
-0xad766df6, 0xa37f60fd, 0xb16477e0, 0xbf6d7aeb, 0x955259da, 0x9b5b54d1, 0x894043cc, 0x87494ec7,
-0xdd3e05ae, 0xd33708a5, 0xc12c1fb8, 0xcf2512b3, 0xe51a3182, 0xeb133c89, 0xf9082b94, 0xf701269f,
-0x4de6bd46, 0x43efb04d, 0x51f4a750, 0x5ffdaa5b, 0x75c2896a, 0x7bcb8461, 0x69d0937c, 0x67d99e77,
-0x3daed51e, 0x33a7d815, 0x21bccf08, 0x2fb5c203, 0x058ae132, 0x0b83ec39, 0x1998fb24, 0x1791f62f,
-0x764dd68d, 0x7844db86, 0x6a5fcc9b, 0x6456c190, 0x4e69e2a1, 0x4060efaa, 0x527bf8b7, 0x5c72f5bc,
-0x0605bed5, 0x080cb3de, 0x1a17a4c3, 0x141ea9c8, 0x3e218af9, 0x302887f2, 0x223390ef, 0x2c3a9de4,
-0x96dd063d, 0x98d40b36, 0x8acf1c2b, 0x84c61120, 0xaef93211, 0xa0f03f1a, 0xb2eb2807, 0xbce2250c,
-0xe6956e65, 0xe89c636e, 0xfa877473, 0xf48e7978, 0xdeb15a49, 0xd0b85742, 0xc2a3405f, 0xccaa4d54,
-0x41ecdaf7, 0x4fe5d7fc, 0x5dfec0e1, 0x53f7cdea, 0x79c8eedb, 0x77c1e3d0, 0x65daf4cd, 0x6bd3f9c6,
-0x31a4b2af, 0x3fadbfa4, 0x2db6a8b9, 0x23bfa5b2, 0x09808683, 0x07898b88, 0x15929c95, 0x1b9b919e,
-0xa17c0a47, 0xaf75074c, 0xbd6e1051, 0xb3671d5a, 0x99583e6b, 0x97513360, 0x854a247d, 0x8b432976,
-0xd134621f, 0xdf3d6f14, 0xcd267809, 0xc32f7502, 0xe9105633, 0xe7195b38, 0xf5024c25, 0xfb0b412e,
-0x9ad7618c, 0x94de6c87, 0x86c57b9a, 0x88cc7691, 0xa2f355a0, 0xacfa58ab, 0xbee14fb6, 0xb0e842bd,
-0xea9f09d4, 0xe49604df, 0xf68d13c2, 0xf8841ec9, 0xd2bb3df8, 0xdcb230f3, 0xcea927ee, 0xc0a02ae5,
-0x7a47b13c, 0x744ebc37, 0x6655ab2a, 0x685ca621, 0x42638510, 0x4c6a881b, 0x5e719f06, 0x5078920d,
-0x0a0fd964, 0x0406d46f, 0x161dc372, 0x1814ce79, 0x322bed48, 0x3c22e043, 0x2e39f75e, 0x2030fa55,
-0xec9ab701, 0xe293ba0a, 0xf088ad17, 0xfe81a01c, 0xd4be832d, 0xdab78e26, 0xc8ac993b, 0xc6a59430,
-0x9cd2df59, 0x92dbd252, 0x80c0c54f, 0x8ec9c844, 0xa4f6eb75, 0xaaffe67e, 0xb8e4f163, 0xb6edfc68,
-0x0c0a67b1, 0x02036aba, 0x10187da7, 0x1e1170ac, 0x342e539d, 0x3a275e96, 0x283c498b, 0x26354480,
-0x7c420fe9, 0x724b02e2, 0x605015ff, 0x6e5918f4, 0x44663bc5, 0x4a6f36ce, 0x587421d3, 0x567d2cd8,
-0x37a10c7a, 0x39a80171, 0x2bb3166c, 0x25ba1b67, 0x0f853856, 0x018c355d, 0x13972240, 0x1d9e2f4b,
-0x47e96422, 0x49e06929, 0x5bfb7e34, 0x55f2733f, 0x7fcd500e, 0x71c45d05, 0x63df4a18, 0x6dd64713,
-0xd731dcca, 0xd938d1c1, 0xcb23c6dc, 0xc52acbd7, 0xef15e8e6, 0xe11ce5ed, 0xf307f2f0, 0xfd0efffb,
-0xa779b492, 0xa970b999, 0xbb6bae84, 0xb562a38f, 0x9f5d80be, 0x91548db5, 0x834f9aa8, 0x8d4697a3
-};
-
-static const uint32_t Tks1[] = {
-0x00000000, 0x0b0e090d, 0x161c121a, 0x1d121b17, 0x2c382434, 0x27362d39, 0x3a24362e, 0x312a3f23,
-0x58704868, 0x537e4165, 0x4e6c5a72, 0x4562537f, 0x74486c5c, 0x7f466551, 0x62547e46, 0x695a774b,
-0xb0e090d0, 0xbbee99dd, 0xa6fc82ca, 0xadf28bc7, 0x9cd8b4e4, 0x97d6bde9, 0x8ac4a6fe, 0x81caaff3,
-0xe890d8b8, 0xe39ed1b5, 0xfe8ccaa2, 0xf582c3af, 0xc4a8fc8c, 0xcfa6f581, 0xd2b4ee96, 0xd9bae79b,
-0x7bdb3bbb, 0x70d532b6, 0x6dc729a1, 0x66c920ac, 0x57e31f8f, 0x5ced1682, 0x41ff0d95, 0x4af10498,
-0x23ab73d3, 0x28a57ade, 0x35b761c9, 0x3eb968c4, 0x0f9357e7, 0x049d5eea, 0x198f45fd, 0x12814cf0,
-0xcb3bab6b, 0xc035a266, 0xdd27b971, 0xd629b07c, 0xe7038f5f, 0xec0d8652, 0xf11f9d45, 0xfa119448,
-0x934be303, 0x9845ea0e, 0x8557f119, 0x8e59f814, 0xbf73c737, 0xb47dce3a, 0xa96fd52d, 0xa261dc20,
-0xf6ad766d, 0xfda37f60, 0xe0b16477, 0xebbf6d7a, 0xda955259, 0xd19b5b54, 0xcc894043, 0xc787494e,
-0xaedd3e05, 0xa5d33708, 0xb8c12c1f, 0xb3cf2512, 0x82e51a31, 0x89eb133c, 0x94f9082b, 0x9ff70126,
-0x464de6bd, 0x4d43efb0, 0x5051f4a7, 0x5b5ffdaa, 0x6a75c289, 0x617bcb84, 0x7c69d093, 0x7767d99e,
-0x1e3daed5, 0x1533a7d8, 0x0821bccf, 0x032fb5c2, 0x32058ae1, 0x390b83ec, 0x241998fb, 0x2f1791f6,
-0x8d764dd6, 0x867844db, 0x9b6a5fcc, 0x906456c1, 0xa14e69e2, 0xaa4060ef, 0xb7527bf8, 0xbc5c72f5,
-0xd50605be, 0xde080cb3, 0xc31a17a4, 0xc8141ea9, 0xf93e218a, 0xf2302887, 0xef223390, 0xe42c3a9d,
-0x3d96dd06, 0x3698d40b, 0x2b8acf1c, 0x2084c611, 0x11aef932, 0x1aa0f03f, 0x07b2eb28, 0x0cbce225,
-0x65e6956e, 0x6ee89c63, 0x73fa8774, 0x78f48e79, 0x49deb15a, 0x42d0b857, 0x5fc2a340, 0x54ccaa4d,
-0xf741ecda, 0xfc4fe5d7, 0xe15dfec0, 0xea53f7cd, 0xdb79c8ee, 0xd077c1e3, 0xcd65daf4, 0xc66bd3f9,
-0xaf31a4b2, 0xa43fadbf, 0xb92db6a8, 0xb223bfa5, 0x83098086, 0x8807898b, 0x9515929c, 0x9e1b9b91,
-0x47a17c0a, 0x4caf7507, 0x51bd6e10, 0x5ab3671d, 0x6b99583e, 0x60975133, 0x7d854a24, 0x768b4329,
-0x1fd13462, 0x14df3d6f, 0x09cd2678, 0x02c32f75, 0x33e91056, 0x38e7195b, 0x25f5024c, 0x2efb0b41,
-0x8c9ad761, 0x8794de6c, 0x9a86c57b, 0x9188cc76, 0xa0a2f355, 0xabacfa58, 0xb6bee14f, 0xbdb0e842,
-0xd4ea9f09, 0xdfe49604, 0xc2f68d13, 0xc9f8841e, 0xf8d2bb3d, 0xf3dcb230, 0xeecea927, 0xe5c0a02a,
-0x3c7a47b1, 0x37744ebc, 0x2a6655ab, 0x21685ca6, 0x10426385, 0x1b4c6a88, 0x065e719f, 0x0d507892,
-0x640a0fd9, 0x6f0406d4, 0x72161dc3, 0x791814ce, 0x48322bed, 0x433c22e0, 0x5e2e39f7, 0x552030fa,
-0x01ec9ab7, 0x0ae293ba, 0x17f088ad, 0x1cfe81a0, 0x2dd4be83, 0x26dab78e, 0x3bc8ac99, 0x30c6a594,
-0x599cd2df, 0x5292dbd2, 0x4f80c0c5, 0x448ec9c8, 0x75a4f6eb, 0x7eaaffe6, 0x63b8e4f1, 0x68b6edfc,
-0xb10c0a67, 0xba02036a, 0xa710187d, 0xac1e1170, 0x9d342e53, 0x963a275e, 0x8b283c49, 0x80263544,
-0xe97c420f, 0xe2724b02, 0xff605015, 0xf46e5918, 0xc544663b, 0xce4a6f36, 0xd3587421, 0xd8567d2c,
-0x7a37a10c, 0x7139a801, 0x6c2bb316, 0x6725ba1b, 0x560f8538, 0x5d018c35, 0x40139722, 0x4b1d9e2f,
-0x2247e964, 0x2949e069, 0x345bfb7e, 0x3f55f273, 0x0e7fcd50, 0x0571c45d, 0x1863df4a, 0x136dd647,
-0xcad731dc, 0xc1d938d1, 0xdccb23c6, 0xd7c52acb, 0xe6ef15e8, 0xede11ce5, 0xf0f307f2, 0xfbfd0eff,
-0x92a779b4, 0x99a970b9, 0x84bb6bae, 0x8fb562a3, 0xbe9f5d80, 0xb591548d, 0xa8834f9a, 0xa38d4697
-};
-
-static const uint32_t Tks2[] = {
-0x00000000, 0x0d0b0e09, 0x1a161c12, 0x171d121b, 0x342c3824, 0x3927362d, 0x2e3a2436, 0x23312a3f,
-0x68587048, 0x65537e41, 0x724e6c5a, 0x7f456253, 0x5c74486c, 0x517f4665, 0x4662547e, 0x4b695a77,
-0xd0b0e090, 0xddbbee99, 0xcaa6fc82, 0xc7adf28b, 0xe49cd8b4, 0xe997d6bd, 0xfe8ac4a6, 0xf381caaf,
-0xb8e890d8, 0xb5e39ed1, 0xa2fe8cca, 0xaff582c3, 0x8cc4a8fc, 0x81cfa6f5, 0x96d2b4ee, 0x9bd9bae7,
-0xbb7bdb3b, 0xb670d532, 0xa16dc729, 0xac66c920, 0x8f57e31f, 0x825ced16, 0x9541ff0d, 0x984af104,
-0xd323ab73, 0xde28a57a, 0xc935b761, 0xc43eb968, 0xe70f9357, 0xea049d5e, 0xfd198f45, 0xf012814c,
-0x6bcb3bab, 0x66c035a2, 0x71dd27b9, 0x7cd629b0, 0x5fe7038f, 0x52ec0d86, 0x45f11f9d, 0x48fa1194,
-0x03934be3, 0x0e9845ea, 0x198557f1, 0x148e59f8, 0x37bf73c7, 0x3ab47dce, 0x2da96fd5, 0x20a261dc,
-0x6df6ad76, 0x60fda37f, 0x77e0b164, 0x7aebbf6d, 0x59da9552, 0x54d19b5b, 0x43cc8940, 0x4ec78749,
-0x05aedd3e, 0x08a5d337, 0x1fb8c12c, 0x12b3cf25, 0x3182e51a, 0x3c89eb13, 0x2b94f908, 0x269ff701,
-0xbd464de6, 0xb04d43ef, 0xa75051f4, 0xaa5b5ffd, 0x896a75c2, 0x84617bcb, 0x937c69d0, 0x9e7767d9,
-0xd51e3dae, 0xd81533a7, 0xcf0821bc, 0xc2032fb5, 0xe132058a, 0xec390b83, 0xfb241998, 0xf62f1791,
-0xd68d764d, 0xdb867844, 0xcc9b6a5f, 0xc1906456, 0xe2a14e69, 0xefaa4060, 0xf8b7527b, 0xf5bc5c72,
-0xbed50605, 0xb3de080c, 0xa4c31a17, 0xa9c8141e, 0x8af93e21, 0x87f23028, 0x90ef2233, 0x9de42c3a,
-0x063d96dd, 0x0b3698d4, 0x1c2b8acf, 0x112084c6, 0x3211aef9, 0x3f1aa0f0, 0x2807b2eb, 0x250cbce2,
-0x6e65e695, 0x636ee89c, 0x7473fa87, 0x7978f48e, 0x5a49deb1, 0x5742d0b8, 0x405fc2a3, 0x4d54ccaa,
-0xdaf741ec, 0xd7fc4fe5, 0xc0e15dfe, 0xcdea53f7, 0xeedb79c8, 0xe3d077c1, 0xf4cd65da, 0xf9c66bd3,
-0xb2af31a4, 0xbfa43fad, 0xa8b92db6, 0xa5b223bf, 0x86830980, 0x8b880789, 0x9c951592, 0x919e1b9b,
-0x0a47a17c, 0x074caf75, 0x1051bd6e, 0x1d5ab367, 0x3e6b9958, 0x33609751, 0x247d854a, 0x29768b43,
-0x621fd134, 0x6f14df3d, 0x7809cd26, 0x7502c32f, 0x5633e910, 0x5b38e719, 0x4c25f502, 0x412efb0b,
-0x618c9ad7, 0x6c8794de, 0x7b9a86c5, 0x769188cc, 0x55a0a2f3, 0x58abacfa, 0x4fb6bee1, 0x42bdb0e8,
-0x09d4ea9f, 0x04dfe496, 0x13c2f68d, 0x1ec9f884, 0x3df8d2bb, 0x30f3dcb2, 0x27eecea9, 0x2ae5c0a0,
-0xb13c7a47, 0xbc37744e, 0xab2a6655, 0xa621685c, 0x85104263, 0x881b4c6a, 0x9f065e71, 0x920d5078,
-0xd9640a0f, 0xd46f0406, 0xc372161d, 0xce791814, 0xed48322b, 0xe0433c22, 0xf75e2e39, 0xfa552030,
-0xb701ec9a, 0xba0ae293, 0xad17f088, 0xa01cfe81, 0x832dd4be, 0x8e26dab7, 0x993bc8ac, 0x9430c6a5,
-0xdf599cd2, 0xd25292db, 0xc54f80c0, 0xc8448ec9, 0xeb75a4f6, 0xe67eaaff, 0xf163b8e4, 0xfc68b6ed,
-0x67b10c0a, 0x6aba0203, 0x7da71018, 0x70ac1e11, 0x539d342e, 0x5e963a27, 0x498b283c, 0x44802635,
-0x0fe97c42, 0x02e2724b, 0x15ff6050, 0x18f46e59, 0x3bc54466, 0x36ce4a6f, 0x21d35874, 0x2cd8567d,
-0x0c7a37a1, 0x017139a8, 0x166c2bb3, 0x1b6725ba, 0x38560f85, 0x355d018c, 0x22401397, 0x2f4b1d9e,
-0x642247e9, 0x692949e0, 0x7e345bfb, 0x733f55f2, 0x500e7fcd, 0x5d0571c4, 0x4a1863df, 0x47136dd6,
-0xdccad731, 0xd1c1d938, 0xc6dccb23, 0xcbd7c52a, 0xe8e6ef15, 0xe5ede11c, 0xf2f0f307, 0xfffbfd0e,
-0xb492a779, 0xb999a970, 0xae84bb6b, 0xa38fb562, 0x80be9f5d, 0x8db59154, 0x9aa8834f, 0x97a38d46
-};
-
-static const uint32_t Tks3[] = {
-0x00000000, 0x090d0b0e, 0x121a161c, 0x1b171d12, 0x24342c38, 0x2d392736, 0x362e3a24, 0x3f23312a,
-0x48685870, 0x4165537e, 0x5a724e6c, 0x537f4562, 0x6c5c7448, 0x65517f46, 0x7e466254, 0x774b695a,
-0x90d0b0e0, 0x99ddbbee, 0x82caa6fc, 0x8bc7adf2, 0xb4e49cd8, 0xbde997d6, 0xa6fe8ac4, 0xaff381ca,
-0xd8b8e890, 0xd1b5e39e, 0xcaa2fe8c, 0xc3aff582, 0xfc8cc4a8, 0xf581cfa6, 0xee96d2b4, 0xe79bd9ba,
-0x3bbb7bdb, 0x32b670d5, 0x29a16dc7, 0x20ac66c9, 0x1f8f57e3, 0x16825ced, 0x0d9541ff, 0x04984af1,
-0x73d323ab, 0x7ade28a5, 0x61c935b7, 0x68c43eb9, 0x57e70f93, 0x5eea049d, 0x45fd198f, 0x4cf01281,
-0xab6bcb3b, 0xa266c035, 0xb971dd27, 0xb07cd629, 0x8f5fe703, 0x8652ec0d, 0x9d45f11f, 0x9448fa11,
-0xe303934b, 0xea0e9845, 0xf1198557, 0xf8148e59, 0xc737bf73, 0xce3ab47d, 0xd52da96f, 0xdc20a261,
-0x766df6ad, 0x7f60fda3, 0x6477e0b1, 0x6d7aebbf, 0x5259da95, 0x5b54d19b, 0x4043cc89, 0x494ec787,
-0x3e05aedd, 0x3708a5d3, 0x2c1fb8c1, 0x2512b3cf, 0x1a3182e5, 0x133c89eb, 0x082b94f9, 0x01269ff7,
-0xe6bd464d, 0xefb04d43, 0xf4a75051, 0xfdaa5b5f, 0xc2896a75, 0xcb84617b, 0xd0937c69, 0xd99e7767,
-0xaed51e3d, 0xa7d81533, 0xbccf0821, 0xb5c2032f, 0x8ae13205, 0x83ec390b, 0x98fb2419, 0x91f62f17,
-0x4dd68d76, 0x44db8678, 0x5fcc9b6a, 0x56c19064, 0x69e2a14e, 0x60efaa40, 0x7bf8b752, 0x72f5bc5c,
-0x05bed506, 0x0cb3de08, 0x17a4c31a, 0x1ea9c814, 0x218af93e, 0x2887f230, 0x3390ef22, 0x3a9de42c,
-0xdd063d96, 0xd40b3698, 0xcf1c2b8a, 0xc6112084, 0xf93211ae, 0xf03f1aa0, 0xeb2807b2, 0xe2250cbc,
-0x956e65e6, 0x9c636ee8, 0x877473fa, 0x8e7978f4, 0xb15a49de, 0xb85742d0, 0xa3405fc2, 0xaa4d54cc,
-0xecdaf741, 0xe5d7fc4f, 0xfec0e15d, 0xf7cdea53, 0xc8eedb79, 0xc1e3d077, 0xdaf4cd65, 0xd3f9c66b,
-0xa4b2af31, 0xadbfa43f, 0xb6a8b92d, 0xbfa5b223, 0x80868309, 0x898b8807, 0x929c9515, 0x9b919e1b,
-0x7c0a47a1, 0x75074caf, 0x6e1051bd, 0x671d5ab3, 0x583e6b99, 0x51336097, 0x4a247d85, 0x4329768b,
-0x34621fd1, 0x3d6f14df, 0x267809cd, 0x2f7502c3, 0x105633e9, 0x195b38e7, 0x024c25f5, 0x0b412efb,
-0xd7618c9a, 0xde6c8794, 0xc57b9a86, 0xcc769188, 0xf355a0a2, 0xfa58abac, 0xe14fb6be, 0xe842bdb0,
-0x9f09d4ea, 0x9604dfe4, 0x8d13c2f6, 0x841ec9f8, 0xbb3df8d2, 0xb230f3dc, 0xa927eece, 0xa02ae5c0,
-0x47b13c7a, 0x4ebc3774, 0x55ab2a66, 0x5ca62168, 0x63851042, 0x6a881b4c, 0x719f065e, 0x78920d50,
-0x0fd9640a, 0x06d46f04, 0x1dc37216, 0x14ce7918, 0x2bed4832, 0x22e0433c, 0x39f75e2e, 0x30fa5520,
-0x9ab701ec, 0x93ba0ae2, 0x88ad17f0, 0x81a01cfe, 0xbe832dd4, 0xb78e26da, 0xac993bc8, 0xa59430c6,
-0xd2df599c, 0xdbd25292, 0xc0c54f80, 0xc9c8448e, 0xf6eb75a4, 0xffe67eaa, 0xe4f163b8, 0xedfc68b6,
-0x0a67b10c, 0x036aba02, 0x187da710, 0x1170ac1e, 0x2e539d34, 0x275e963a, 0x3c498b28, 0x35448026,
-0x420fe97c, 0x4b02e272, 0x5015ff60, 0x5918f46e, 0x663bc544, 0x6f36ce4a, 0x7421d358, 0x7d2cd856,
-0xa10c7a37, 0xa8017139, 0xb3166c2b, 0xba1b6725, 0x8538560f, 0x8c355d01, 0x97224013, 0x9e2f4b1d,
-0xe9642247, 0xe0692949, 0xfb7e345b, 0xf2733f55, 0xcd500e7f, 0xc45d0571, 0xdf4a1863, 0xd647136d,
-0x31dccad7, 0x38d1c1d9, 0x23c6dccb, 0x2acbd7c5, 0x15e8e6ef, 0x1ce5ede1, 0x07f2f0f3, 0x0efffbfd,
-0x79b492a7, 0x70b999a9, 0x6bae84bb, 0x62a38fb5, 0x5d80be9f, 0x548db591, 0x4f9aa883, 0x4697a38d
-};
-
-#endif /* ENCRYPT_ONLY */
-
-#endif /* LTC_SMALL_CODE */
-
-static const uint32_t rcon[] = {
-    0x01000000, 0x02000000, 0x04000000, 0x08000000,
-    0x10000000, 0x20000000, 0x40000000, 0x80000000,
-    0x1B000000, 0x36000000, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
-};
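
The tables above are redundant encodings of one S-box: TE1, TE2 and TE3 are TE0 rotated right by 8, 16 and 24 bits (the relationship the LTC_SMALL_CODE branch recomputes on the fly with RORc()), and Te4_0..Te4_3 place the raw S-box byte in each of the four byte lanes. A minimal, self-contained sketch, not part of the original sources, that spot-checks these identities against the first entries of the tables listed above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t ror32(uint32_t v, unsigned r)
{
    return (v >> r) | (v << (32u - r));
}

int main(void)
{
    /* First entries of TE1, TE2, TE3 and Te4_0..Te4_3, copied from the tables above. */
    const uint32_t te1 = 0xa5c66363, te2 = 0x63a5c663, te3 = 0x6363a5c6;
    const uint32_t s0 = 0x00000063, s1 = 0x00006300, s2 = 0x00630000, s3 = 0x63000000;

    assert(ror32(te1, 8) == te2);   /* TE2[i] == RORc(TE1[i], 8) */
    assert(ror32(te2, 8) == te3);   /* TE3[i] == RORc(TE2[i], 8) */
    assert(s1 == (s0 << 8) && s2 == (s0 << 16) && s3 == (s0 << 24));
    printf("table identities hold for entry 0\n");
    return 0;
}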
diff --git a/osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c b/osfmk/corecrypto/ccaes/src/ccaes_ltc_ecb_encrypt_mode.c
deleted file mode 100644 (file)
index 0772f68..0000000
+++ /dev/null
@@ -1,421 +0,0 @@
-/*
- *  ccaes_ltc_ecb_encrypt_mode.c
- *  corecrypto
- *
- *  Created on 12/12/2010
- *
- *  Copyright (c) 2010,2011,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
- * Parts of this code adapted from LibTomCrypt
- *
- * LibTomCrypt, modular cryptographic library -- Tom St Denis
- *
- * LibTomCrypt is a library that provides various cryptographic
- * algorithms in a highly modular and flexible manner.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@gmail.com, http://libtom.org
- */
-
-
-#include <corecrypto/ccaes.h>
-#include <corecrypto/cc_priv.h>
-
-typedef struct ltc_rijndael_key {
-    uint32_t eK[60], dK[60];
-    int Nr;
-} ltc_rijndael_keysched;
-
-#include "aes_tab.c"
-
-static uint32_t setup_mix(uint32_t temp)
-{
-    return (Te4_3[cc_byte(temp, 2)]) ^
-           (Te4_2[cc_byte(temp, 1)]) ^
-           (Te4_1[cc_byte(temp, 0)]) ^
-           (Te4_0[cc_byte(temp, 3)]);
-}
-
-/*!
- Initialize the AES (Rijndael) block cipher
- @param key The symmetric key you wish to pass
- @param keylen The key length in bytes
- @param num_rounds The number of rounds desired (0 for default)
- @param skey The key schedule, as set up by this function.
- @return 0 if successful, -1 on an invalid key size or round count
- */
-static int ccaes_ltc_init(const unsigned char *key, int keylen, int num_rounds,
-                          ccecb_ctx *skey)
-{
-    int i, j;
-    uint32_t temp, *rk;
-#ifndef ENCRYPT_ONLY
-    uint32_t *rrk;
-#endif
-    ltc_rijndael_keysched *rijndael;
-
-    rijndael = (ltc_rijndael_keysched *)skey;
-
-    if (keylen != 16 && keylen != 24 && keylen != 32) {
-        return -1; //CRYPT_INVALID_KEYSIZE;
-    }
-
-    if (num_rounds != 0 && num_rounds != (10 + ((keylen/8)-2)*2)) {
-        return -1; //CRYPT_INVALID_ROUNDS;
-    }
-
-    rijndael->Nr = 10 + ((keylen/8)-2)*2;
-
-    /* setup the forward key */
-    i                 = 0;
-    rk                = rijndael->eK;
-    CC_LOAD32_BE(rk[0], key     );
-    CC_LOAD32_BE(rk[1], key +  4);
-    CC_LOAD32_BE(rk[2], key +  8);
-    CC_LOAD32_BE(rk[3], key + 12);
-    if (keylen == 16) {
-        j = 44;
-        for (;;) {
-            temp  = rk[3];
-            rk[4] = rk[0] ^ setup_mix(temp) ^ rcon[i];
-            rk[5] = rk[1] ^ rk[4];
-            rk[6] = rk[2] ^ rk[5];
-            rk[7] = rk[3] ^ rk[6];
-            if (++i == 10) {
-                break;
-            }
-            rk += 4;
-        }
-    } else if (keylen == 24) {
-        j = 52;
-        CC_LOAD32_BE(rk[4], key + 16);
-        CC_LOAD32_BE(rk[5], key + 20);
-        for (;;) {
-#ifdef _MSC_VER
-            temp = rijndael->eK[rk - rijndael->eK + 5];
-#else
-            temp = rk[5];
-#endif
-            rk[ 6] = rk[ 0] ^ setup_mix(temp) ^ rcon[i];
-            rk[ 7] = rk[ 1] ^ rk[ 6];
-            rk[ 8] = rk[ 2] ^ rk[ 7];
-            rk[ 9] = rk[ 3] ^ rk[ 8];
-            if (++i == 8) {
-                break;
-            }
-            rk[10] = rk[ 4] ^ rk[ 9];
-            rk[11] = rk[ 5] ^ rk[10];
-            rk += 6;
-        }
-    } else if (keylen == 32) {
-        j = 60;
-        CC_LOAD32_BE(rk[4], key + 16);
-        CC_LOAD32_BE(rk[5], key + 20);
-        CC_LOAD32_BE(rk[6], key + 24);
-        CC_LOAD32_BE(rk[7], key + 28);
-        for (;;) {
-#ifdef _MSC_VER
-            temp = rijndael->eK[rk - rijndael->eK + 7];
-#else
-            temp = rk[7];
-#endif
-            rk[ 8] = rk[ 0] ^ setup_mix(temp) ^ rcon[i];
-            rk[ 9] = rk[ 1] ^ rk[ 8];
-            rk[10] = rk[ 2] ^ rk[ 9];
-            rk[11] = rk[ 3] ^ rk[10];
-            if (++i == 7) {
-                break;
-            }
-            temp = rk[11];
-            rk[12] = rk[ 4] ^ setup_mix(CC_RORc(temp, 8));
-            rk[13] = rk[ 5] ^ rk[12];
-            rk[14] = rk[ 6] ^ rk[13];
-            rk[15] = rk[ 7] ^ rk[14];
-            rk += 8;
-        }
-    } else {
-        /* this can't happen */
-        return -1; //CRYPT_ERROR;
-    }
-
-#ifndef ENCRYPT_ONLY
-    /* setup the inverse key now */
-    rk   = rijndael->dK;
-    rrk  = rijndael->eK + j - 4;
-
-    /* apply the inverse MixColumn transform to all round keys but the first and the last: */
-    /* copy first */
-    *rk++ = *rrk++;
-    *rk++ = *rrk++;
-    *rk++ = *rrk++;
-    *rk   = *rrk;
-    rk -= 3; rrk -= 3;
-
-    for (i = 1; i < rijndael->Nr; i++) {
-        rrk -= 4;
-        rk  += 4;
-#ifdef LTC_SMALL_CODE
-        temp = rrk[0];
-        rk[0] = setup_mix2(temp);
-        temp = rrk[1];
-        rk[1] = setup_mix2(temp);
-        temp = rrk[2];
-        rk[2] = setup_mix2(temp);
-        temp = rrk[3];
-        rk[3] = setup_mix2(temp);
-#else
-        temp = rrk[0];
-        rk[0] =
-        Tks0[cc_byte(temp, 3)] ^
-        Tks1[cc_byte(temp, 2)] ^
-        Tks2[cc_byte(temp, 1)] ^
-        Tks3[cc_byte(temp, 0)];
-        temp = rrk[1];
-        rk[1] =
-        Tks0[cc_byte(temp, 3)] ^
-        Tks1[cc_byte(temp, 2)] ^
-        Tks2[cc_byte(temp, 1)] ^
-        Tks3[cc_byte(temp, 0)];
-        temp = rrk[2];
-        rk[2] =
-        Tks0[cc_byte(temp, 3)] ^
-        Tks1[cc_byte(temp, 2)] ^
-        Tks2[cc_byte(temp, 1)] ^
-        Tks3[cc_byte(temp, 0)];
-        temp = rrk[3];
-        rk[3] =
-        Tks0[cc_byte(temp, 3)] ^
-        Tks1[cc_byte(temp, 2)] ^
-        Tks2[cc_byte(temp, 1)] ^
-        Tks3[cc_byte(temp, 0)];
-#endif
-
-    }
-
-    /* copy last */
-    rrk -= 4;
-    rk  += 4;
-    *rk++ = *rrk++;
-    *rk++ = *rrk++;
-    *rk++ = *rrk++;
-    *rk   = *rrk;
-#endif /* ENCRYPT_ONLY */
-
-    return 0; //CRYPT_OK;
-}
-
-static int ccaes_ecb_encrypt_init(const struct ccmode_ecb *ecb CC_UNUSED, ccecb_ctx *key,
-                                  size_t rawkey_len, const void *rawkey) {
-    return ccaes_ltc_init(rawkey, (int)rawkey_len, 0, key);
-}
-
-static void ccaes_ltc_ecb_encrypt(const ccecb_ctx *skey, const unsigned char *pt,
-                                  unsigned char *ct)
-{
-    uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
-    const uint32_t *rk;
-    int Nr, r;
-    const ltc_rijndael_keysched *rijndael;
-
-    rijndael = (const ltc_rijndael_keysched *)skey;
-
-    Nr = rijndael->Nr;
-    rk = rijndael->eK;
-
-    /*
-     * map byte array block to cipher state
-     * and add initial round key:
-     */
-    CC_LOAD32_BE(s0, pt      ); s0 ^= rk[0];
-    CC_LOAD32_BE(s1, pt  +  4); s1 ^= rk[1];
-    CC_LOAD32_BE(s2, pt  +  8); s2 ^= rk[2];
-    CC_LOAD32_BE(s3, pt  + 12); s3 ^= rk[3];
-
-#ifdef LTC_SMALL_CODE
-
-    for (r = 0; ; r++) {
-        rk += 4;
-        t0 =
-        Te0(cc_byte(s0, 3)) ^
-        Te1(cc_byte(s1, 2)) ^
-        Te2(cc_byte(s2, 1)) ^
-        Te3(cc_byte(s3, 0)) ^
-        rk[0];
-        t1 =
-        Te0(cc_byte(s1, 3)) ^
-        Te1(cc_byte(s2, 2)) ^
-        Te2(cc_byte(s3, 1)) ^
-        Te3(cc_byte(s0, 0)) ^
-        rk[1];
-        t2 =
-        Te0(cc_byte(s2, 3)) ^
-        Te1(cc_byte(s3, 2)) ^
-        Te2(cc_byte(s0, 1)) ^
-        Te3(cc_byte(s1, 0)) ^
-        rk[2];
-        t3 =
-        Te0(cc_byte(s3, 3)) ^
-        Te1(cc_byte(s0, 2)) ^
-        Te2(cc_byte(s1, 1)) ^
-        Te3(cc_byte(s2, 0)) ^
-        rk[3];
-        if (r == Nr-2) {
-            break;
-        }
-        s0 = t0; s1 = t1; s2 = t2; s3 = t3;
-    }
-    rk += 4;
-
-#else
-
-    /*
-     * Nr - 1 full rounds:
-     */
-    r = Nr >> 1;
-    for (;;) {
-        t0 =
-        Te0(cc_byte(s0, 3)) ^
-        Te1(cc_byte(s1, 2)) ^
-        Te2(cc_byte(s2, 1)) ^
-        Te3(cc_byte(s3, 0)) ^
-        rk[4];
-        t1 =
-        Te0(cc_byte(s1, 3)) ^
-        Te1(cc_byte(s2, 2)) ^
-        Te2(cc_byte(s3, 1)) ^
-        Te3(cc_byte(s0, 0)) ^
-        rk[5];
-        t2 =
-        Te0(cc_byte(s2, 3)) ^
-        Te1(cc_byte(s3, 2)) ^
-        Te2(cc_byte(s0, 1)) ^
-        Te3(cc_byte(s1, 0)) ^
-        rk[6];
-        t3 =
-        Te0(cc_byte(s3, 3)) ^
-        Te1(cc_byte(s0, 2)) ^
-        Te2(cc_byte(s1, 1)) ^
-        Te3(cc_byte(s2, 0)) ^
-        rk[7];
-
-        rk += 8;
-        if (--r == 0) {
-            break;
-        }
-
-        s0 =
-        Te0(cc_byte(t0, 3)) ^
-        Te1(cc_byte(t1, 2)) ^
-        Te2(cc_byte(t2, 1)) ^
-        Te3(cc_byte(t3, 0)) ^
-        rk[0];
-        s1 =
-        Te0(cc_byte(t1, 3)) ^
-        Te1(cc_byte(t2, 2)) ^
-        Te2(cc_byte(t3, 1)) ^
-        Te3(cc_byte(t0, 0)) ^
-        rk[1];
-        s2 =
-        Te0(cc_byte(t2, 3)) ^
-        Te1(cc_byte(t3, 2)) ^
-        Te2(cc_byte(t0, 1)) ^
-        Te3(cc_byte(t1, 0)) ^
-        rk[2];
-        s3 =
-        Te0(cc_byte(t3, 3)) ^
-        Te1(cc_byte(t0, 2)) ^
-        Te2(cc_byte(t1, 1)) ^
-        Te3(cc_byte(t2, 0)) ^
-        rk[3];
-    }
-
-#endif
-
-    /*
-     * apply last round and
-     * map cipher state to byte array block:
-     */
-    s0 =
-    (Te4_3[cc_byte(t0, 3)]) ^
-    (Te4_2[cc_byte(t1, 2)]) ^
-    (Te4_1[cc_byte(t2, 1)]) ^
-    (Te4_0[cc_byte(t3, 0)]) ^
-    rk[0];
-    CC_STORE32_BE(s0, ct);
-    s1 =
-    (Te4_3[cc_byte(t1, 3)]) ^
-    (Te4_2[cc_byte(t2, 2)]) ^
-    (Te4_1[cc_byte(t3, 1)]) ^
-    (Te4_0[cc_byte(t0, 0)]) ^
-    rk[1];
-    CC_STORE32_BE(s1, ct+4);
-    s2 =
-    (Te4_3[cc_byte(t2, 3)]) ^
-    (Te4_2[cc_byte(t3, 2)]) ^
-    (Te4_1[cc_byte(t0, 1)]) ^
-    (Te4_0[cc_byte(t1, 0)]) ^
-    rk[2];
-    CC_STORE32_BE(s2, ct+8);
-    s3 =
-    (Te4_3[cc_byte(t3, 3)]) ^
-    (Te4_2[cc_byte(t0, 2)]) ^
-    (Te4_1[cc_byte(t1, 1)]) ^
-    (Te4_0[cc_byte(t2, 0)]) ^
-    rk[3];
-    CC_STORE32_BE(s3, ct+12);
-}
-
-static int ccaes_ecb_encrypt(const ccecb_ctx *key, size_t nblocks,
-                             const void *in, void *out) {
-    if (nblocks) {
-        const unsigned char *p = in;
-        unsigned char *c = out;
-        for (;;) {
-            ccaes_ltc_ecb_encrypt(key, p, c);
-            if (--nblocks) {
-                p += CCAES_BLOCK_SIZE;
-                c += CCAES_BLOCK_SIZE;
-            } else {
-                break;
-            }
-        }
-    }
-    
-    return 0;
-}
-
-const struct ccmode_ecb ccaes_ltc_ecb_encrypt_mode = {
-    .size = sizeof(ltc_rijndael_keysched),
-    .block_size = CCAES_BLOCK_SIZE,
-    .init = ccaes_ecb_encrypt_init,
-    .ecb = ccaes_ecb_encrypt,
-};
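
For context, callers consume the descriptor above through its function pointers: .init schedules the key (10, 12 or 14 rounds for 16-, 24- or 32-byte keys, per the 10 + ((keylen/8)-2)*2 formula in ccaes_ltc_init) and .ecb encrypts a run of 16-byte blocks. A minimal caller sketch, assuming a declaration of ccaes_ltc_ecb_encrypt_mode is still in scope (it historically lived in <corecrypto/ccaes.h>); the key and plaintext are arbitrary placeholders, not test vectors:

#include <stdint.h>
#include <corecrypto/ccaes.h>

/* 512 bytes comfortably exceeds mode->size (the roughly 484-byte key schedule);
 * a real caller would size and align the context from mode->size instead. */
static uint8_t ctx_buf[512] __attribute__((aligned(16)));

int encrypt_one_block(uint8_t out[CCAES_BLOCK_SIZE])
{
    const struct ccmode_ecb *mode = &ccaes_ltc_ecb_encrypt_mode;
    ccecb_ctx *ctx = (ccecb_ctx *)ctx_buf;

    const uint8_t key[16] = { 0 };               /* placeholder AES-128 key   */
    const uint8_t pt[CCAES_BLOCK_SIZE] = { 0 };  /* placeholder plaintext     */

    if (mode->init(mode, ctx, sizeof(key), key) != 0)
        return -1;                               /* bad key length            */
    return mode->ecb(ctx, 1, pt, out);           /* encrypt one 16-byte block */
}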
diff --git a/osfmk/corecrypto/ccaes/src/ccaes_private_types.h b/osfmk/corecrypto/ccaes/src/ccaes_private_types.h
deleted file mode 100644 (file)
index 7a30fad..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- *  ccaes_private_types.h
- *  corecrypto
- *
- *  Created on 02/15/2012
- *
- *  Copyright (c) 2012,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef _CORECRYPTO_CCAES_PRIVATE_TYPES_H_
-#define _CORECRYPTO_CCAES_PRIVATE_TYPES_H_
-
-#include <corecrypto/ccaes.h>
-#include <corecrypto/cc_priv.h>
-
-typedef struct ltc_rijndael_key 
-{
-    uint32_t eK[60], dK[60];
-    int Nr;
-} ltc_rijndael_keysched;
-
-
-#endif // _CORECRYPTO_CCAES_PRIVATE_TYPES_H_
index a7bfb84c06fba28baa28171c1efc228789be79b0..cbd7db453544f601006e008b6a2605c379977b00 100644 (file)
 
 void cchmac_final(const struct ccdigest_info *di, cchmac_ctx_t hc,
                   unsigned char *mac) {
+
+    // Finalize the inner state of the data being HMAC'd, i.e., H((key \oplus ipad) || m)
     ccdigest_final(di, cchmac_digest_ctx(di, hc), cchmac_data(di, hc));
-    /* typecast: output size will alwys fit in an unsigned int */
-    cchmac_num(di, hc) = (unsigned int)di->output_size;
+
+    // Set the HMAC output size based on the digest algorithm
+    cchmac_num(di, hc) = (unsigned int)di->output_size; /* typecast: output size will always fit in an unsigned int */
     cchmac_nbits(di, hc) = di->block_size * 8;
+
+    // Copy the pre-computed compress(key \oplus opad) back to digest state,
+    // and then run through the digest once more to finish the HMAC
     ccdigest_copy_state(di, cchmac_istate32(di, hc), cchmac_ostate32(di, hc));
     ccdigest_final(di, cchmac_digest_ctx(di, hc), mac);
 }
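The comments added above describe the standard HMAC construction, HMAC(K, m) = H((K xor opad) || H((K xor ipad) || m)); cchmac precomputes the compressed (K xor ipad) and (K xor opad) states at init time, so this final step only finishes the inner hash and then runs the outer one. A minimal standalone sketch of the same data flow, using a toy checksum in place of a real digest (illustrative names and sizes only, not the corecrypto API):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLOCK_SIZE 8   /* toy block size */
#define OUT_SIZE   4   /* toy digest size */

/* Toy "digest": FNV-1a over the input, written out big-endian.
 * Stands in for SHA-1/SHA-2 purely to show the HMAC data flow. */
static void toy_hash(const uint8_t *msg, size_t len, uint8_t out[OUT_SIZE])
{
    uint32_t h = 2166136261u;
    for (size_t i = 0; i < len; i++) {
        h ^= msg[i];
        h *= 16777619u;
    }
    out[0] = (uint8_t)(h >> 24); out[1] = (uint8_t)(h >> 16);
    out[2] = (uint8_t)(h >> 8);  out[3] = (uint8_t)h;
}

/* HMAC(K, m) = H((K ^ opad) || H((K ^ ipad) || m)).
 * Assumes keylen <= BLOCK_SIZE and msglen <= 256 for this sketch. */
static void toy_hmac(const uint8_t *key, size_t keylen,
                     const uint8_t *msg, size_t msglen,
                     uint8_t mac[OUT_SIZE])
{
    uint8_t k[BLOCK_SIZE] = {0};
    uint8_t inner_buf[BLOCK_SIZE + 256];
    uint8_t outer_buf[BLOCK_SIZE + OUT_SIZE];
    uint8_t inner[OUT_SIZE];

    memcpy(k, key, keylen);

    for (size_t i = 0; i < BLOCK_SIZE; i++)
        inner_buf[i] = k[i] ^ 0x36;            /* ipad */
    memcpy(inner_buf + BLOCK_SIZE, msg, msglen);
    toy_hash(inner_buf, BLOCK_SIZE + msglen, inner);   /* inner hash */

    for (size_t i = 0; i < BLOCK_SIZE; i++)
        outer_buf[i] = k[i] ^ 0x5c;            /* opad */
    memcpy(outer_buf + BLOCK_SIZE, inner, OUT_SIZE);
    toy_hash(outer_buf, BLOCK_SIZE + OUT_SIZE, mac);   /* outer hash */
}

int main(void)
{
    uint8_t mac[OUT_SIZE];
    toy_hmac((const uint8_t *)"key", 3, (const uint8_t *)"message", 7, mac);
    printf("%02x%02x%02x%02x\n", mac[0], mac[1], mac[2], mac[3]);
    return 0;
}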
diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c b/osfmk/corecrypto/ccmode/src/ccmode_ctr_crypt.c
deleted file mode 100644 (file)
index 3efce7d..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- *  ccmode_ctr_crypt.c
- *  corecrypto
- *
- *  Created on 12/17/2010
- *
- *  Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include "ccmode_internal.h"
-
-int ccmode_ctr_crypt(ccctr_ctx *key,
-                     size_t nbytes, const void *in, void *out) {
-    const struct ccmode_ecb *ecb = CCMODE_CTR_KEY_ECB(key);
-    const ccecb_ctx *ecb_key = CCMODE_CTR_KEY_ECB_KEY(key);
-    uint8_t *ctr = (uint8_t *)CCMODE_CTR_KEY_CTR(key);
-    uint8_t *pad = (uint8_t *)CCMODE_CTR_KEY_PAD(key);
-    size_t pad_offset = CCMODE_CTR_KEY_PAD_OFFSET(key);
-    const uint8_t *in_bytes = in;
-    // Counter is 64bit wide for cipher with block size of 64bit or more
-    // This is to match the assembly
-    const size_t counter_size=(CC_MIN(ecb->block_size,(typeof(ecb->block_size))8));
-    uint8_t *out_bytes = out;
-    size_t n;
-
-    while (nbytes) {
-        if (pad_offset == ecb->block_size) {
-            ecb->ecb(ecb_key, 1, ctr, pad);
-            pad_offset = 0;
-
-            /* increment the big endian counter */
-            inc_uint(ctr + ecb->block_size - counter_size, counter_size);
-
-            if (nbytes==0) break;
-        }
-        
-        n = CC_MIN(nbytes, ecb->block_size - pad_offset);
-        cc_xor(n, out_bytes, in_bytes, pad + pad_offset);
-        nbytes -= n;
-        in_bytes += n;
-        out_bytes += n;
-        pad_offset += n;
-    }
-    CCMODE_CTR_KEY_PAD_OFFSET(key) = pad_offset;
-    
-    return 0;
-}
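The deleted routine above is plain CTR mode: whenever the keystream pad is exhausted (pad_offset == block_size) it encrypts the counter into the pad, increments the big-endian counter (only the low 8 bytes, to match the assembly), and then XORs as many pad bytes as the caller still needs, so arbitrary byte lengths and resumed calls both work. A standalone sketch of that flow, with a toy 8-byte block "cipher" standing in for AES (not the corecrypto API):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define BLK 8  /* toy block size */

/* Toy block "cipher": XOR with a fixed key byte. Stands in for ECB-AES. */
static void toy_ecb(uint8_t key, const uint8_t in[BLK], uint8_t out[BLK])
{
    for (size_t i = 0; i < BLK; i++)
        out[i] = in[i] ^ key;
}

/* Big-endian increment of the low n bytes of the counter. */
static void inc_be(uint8_t *ctr, size_t n)
{
    for (size_t i = 0; i < n; i++)
        if (++ctr[n - 1 - i] != 0)
            break;
}

/* CTR encryption/decryption (they are the same operation). */
static void ctr_crypt(uint8_t key, uint8_t ctr[BLK],
                      const uint8_t *in, uint8_t *out, size_t nbytes)
{
    uint8_t pad[BLK];
    size_t pad_offset = BLK;          /* pad is empty until first block */

    while (nbytes) {
        if (pad_offset == BLK) {
            toy_ecb(key, ctr, pad);   /* keystream = E_K(counter) */
            inc_be(ctr, BLK);
            pad_offset = 0;
        }
        size_t n = nbytes < BLK - pad_offset ? nbytes : BLK - pad_offset;
        for (size_t i = 0; i < n; i++)
            out[i] = in[i] ^ pad[pad_offset + i];
        in += n; out += n; nbytes -= n; pad_offset += n;
    }
}

int main(void)
{
    uint8_t ctr[BLK] = {0}, ct[11], pt[11];
    const uint8_t msg[11] = "hello, ctr";
    ctr_crypt(0x5a, ctr, msg, ct, sizeof(ct));
    memset(ctr, 0, sizeof(ctr));
    ctr_crypt(0x5a, ctr, ct, pt, sizeof(pt));
    printf("%s\n", (char *)pt);       /* round-trips back to the plaintext */
    return 0;
}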
diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c b/osfmk/corecrypto/ccmode/src/ccmode_ctr_init.c
deleted file mode 100644 (file)
index 00e3ca6..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- *  ccmode_ctr_init.c
- *  corecrypto
- *
- *  Created on 12/17/2010
- *
- *  Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include "ccmode_internal.h"
-
-int ccmode_ctr_init(const struct ccmode_ctr *ctr, ccctr_ctx *key,
-                    size_t rawkey_len, const void *rawkey,
-                    const void *iv) {
-    int rc;
-    const struct ccmode_ecb *ecb = ctr->custom;
-    CCMODE_CTR_KEY_ECB(key) = ecb;
-
-    rc = ecb->init(ecb, CCMODE_CTR_KEY_ECB_KEY(key), rawkey_len, rawkey);
-    
-    ccctr_setctr(ctr, key, iv);
-
-    return rc;
-}
diff --git a/osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c b/osfmk/corecrypto/ccmode/src/ccmode_ctr_setctr.c
deleted file mode 100644 (file)
index 6b54e20..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- *  ccmode_ctr_setctr.c
- *  corecrypto
- *
- *  Created on 2/1/2017
- *
- *  Copyright (c) 2017 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include "ccmode_internal.h"
-
-int ccmode_ctr_setctr(CC_UNUSED const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr)
-{
-    CCMODE_CTR_KEY_PAD_OFFSET(ctx) = CCMODE_CTR_KEY_ECB(ctx)->block_size;
-    CC_MEMCPY(CCMODE_CTR_KEY_CTR(ctx), ctr, CCMODE_CTR_KEY_ECB(ctx)->block_size);
-    
-    return 0;
-}
diff --git a/osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c b/osfmk/corecrypto/ccmode/src/ccmode_factory_ctr_crypt.c
deleted file mode 100644 (file)
index ddac576..0000000
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- *  ccmode_factory_ctr_crypt.c
- *  corecrypto
- *
- *  Created on 05/19/2015
- *
- *  Copyright (c) 2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include "ccmode_internal.h"
-
-void ccmode_factory_ctr_crypt(struct ccmode_ctr *ctr,
-                              const struct ccmode_ecb *ecb) {
-    struct ccmode_ctr ctr_crypt = CCMODE_FACTORY_CTR_CRYPT(ecb);
-    *ctr = ctr_crypt;
-}
diff --git a/osfmk/corecrypto/ccmode/src/ccmode_internal.h b/osfmk/corecrypto/ccmode/src/ccmode_internal.h
deleted file mode 100644 (file)
index 0f7f0c6..0000000
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- *  ccmode_internal.h
- *  corecrypto
- *
- *  Created on 12/12/2010
- *
- *  Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved.
- *
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- *
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- *
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- *
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef _CORECRYPTO_CCMODE_INTERNAL_H_
-#define _CORECRYPTO_CCMODE_INTERNAL_H_
-
-#include <corecrypto/ccmode.h>
-#include <corecrypto/ccmode_factory.h>
-#include <corecrypto/cc_priv.h>
-#include <corecrypto/cc_macros.h>
-
-#define CCMODE_INVALID_INPUT         -1
-#define CCMODE_INVALID_CALL_SEQUENCE -2
-#define CCMODE_INTEGRITY_FAILURE     -3
-#define CCMODE_NOT_SUPPORTED         -4
-#define CCMODE_INTERNAL_ERROR        -5
-
-// VNG speed-up for GCM's AES encryption and finite field multiplication
-#if     \
-((CCAES_INTEL_ASM && defined(__x86_64__)) || (CCAES_ARM_ASM && defined(__ARM_NEON__)))
-#define        CCMODE_GCM_VNG_SPEEDUP  1
-#else
-#define        CCMODE_GCM_VNG_SPEEDUP  0
-#endif
-
-
-#define CCMODE_GCM_USE_GF_LOOKUP_TABLES 1
-
-/* Helper function used.  TODO: Probably not specific to xts, since
-   gcm uses it too */
-void ccmode_xts_mult_alpha(cc_unit *tweak);
-
-/* Macros for accessing a CCMODE_CBC_KEY.
- {
-     const struct ccmode_ecb *ecb
-     ccn_unit ecb_key[ecb->n]
- } */
-#define _CCMODE_CBC_KEY(K)       ((struct _ccmode_cbc_key *)(K))
-#define _CCMODE_CBC_KEY_CONST(K) ((const struct _ccmode_cbc_key *)(K))
-#define CCMODE_CBC_KEY_ECB(K) (_CCMODE_CBC_KEY(K)->ecb)
-#define CCMODE_CBC_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CBC_KEY(K)->u[0])
-
-CC_CONST CC_INLINE
-const struct ccmode_ecb * ccmode_cbc_key_ecb(const cccbc_ctx *K) {
-    return ((const struct _ccmode_cbc_key *)K)->ecb;
-}
-
-CC_CONST CC_INLINE
-const ccecb_ctx * ccmode_cbc_key_ecb_key(const cccbc_ctx *K) {
-    return (const ccecb_ctx *)&((const struct _ccmode_cbc_key *)K)->u[0];
-}
-
-/* Macros for accessing a CCMODE_CFB_KEY.
-{
-    const struct ccmode_ecb *ecb
-    cc_size pad_len;
-    ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit ecb_key[ecb->n]
-} */
-#define _CCMODE_CFB_KEY(K) ((struct _ccmode_cfb_key *)(K))
-#define CCMODE_CFB_KEY_ECB(K) (_CCMODE_CFB_KEY(K)->ecb)
-#define CCMODE_CFB_KEY_PAD_LEN(K) (_CCMODE_CFB_KEY(K)->pad_len)
-#define CCMODE_CFB_KEY_PAD(K) (&_CCMODE_CFB_KEY(K)->u[0])
-#define CCMODE_CFB_KEY_IV(K) (&_CCMODE_CFB_KEY(K)->u[ccn_nof_size(CCMODE_CFB_KEY_ECB(K)->block_size)])
-#define CCMODE_CFB_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CFB_KEY(K)->u[2 * ccn_nof_size(CCMODE_CFB_KEY_ECB(K)->block_size)])
-
-/* Macros for accessing a CCMODE_CFB8_KEY.
-{
-    const struct ccmode_ecb *ecb
-    ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit ecb_key[ecb->n]
-} */
-#define _CCMODE_CFB8_KEY(K) ((struct _ccmode_cfb8_key *)(K))
-#define CCMODE_CFB8_KEY_ECB(K) (_CCMODE_CFB8_KEY(K)->ecb)
-#define CCMODE_CFB8_KEY_PAD(K) (&_CCMODE_CFB8_KEY(K)->u[0])
-#define CCMODE_CFB8_KEY_IV(K) (&_CCMODE_CFB8_KEY(K)->u[ccn_nof_size(CCMODE_CFB8_KEY_ECB(K)->block_size)])
-#define CCMODE_CFB8_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CFB8_KEY(K)->u[2 * ccn_nof_size(CCMODE_CFB8_KEY_ECB(K)->block_size)])
-
-
-/* Macros for accessing a CCMODE_CTR_KEY.
-{
-    const struct ccmode_ecb *ecb
-    cc_size pad_offset;
-    ccn_unit pad[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit ctr[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit ecb_key[ecb->n]
-} */
-#define _CCMODE_CTR_KEY(K) ((struct _ccmode_ctr_key *)(K))
-#define CCMODE_CTR_KEY_ECB(K) (_CCMODE_CTR_KEY(K)->ecb)
-#define CCMODE_CTR_KEY_PAD_OFFSET(K) (_CCMODE_CTR_KEY(K)->pad_offset)
-#define CCMODE_CTR_KEY_PAD(K) (&_CCMODE_CTR_KEY(K)->u[0])
-#define CCMODE_CTR_KEY_CTR(K) (&_CCMODE_CTR_KEY(K)->u[ccn_nof_size(CCMODE_CTR_KEY_ECB(K)->block_size)])
-#define CCMODE_CTR_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CTR_KEY(K)->u[2 * ccn_nof_size(CCMODE_CTR_KEY_ECB(K)->block_size)])
-
-CC_INLINE int ccctr_setctr(const struct ccmode_ctr *mode, ccctr_ctx *ctx, const void *ctr)
-{
-    return mode->setctr(mode, ctx, ctr);
-}
-
-/* Macros for accessing a CCMODE_OFB_KEY.
-{
-    const struct ccmode_ecb *ecb
-    cc_size pad_len;
-    ccn_unit iv[ecb->block_size / CCN_UNIT_SIZE];
-    ccn_unit ecb_key[ecb->n]
-} */
-#define _CCMODE_OFB_KEY(K) ((struct _ccmode_ofb_key *)(K))
-#define CCMODE_OFB_KEY_ECB(K) (_CCMODE_OFB_KEY(K)->ecb)
-#define CCMODE_OFB_KEY_PAD_LEN(K) (_CCMODE_OFB_KEY(K)->pad_len)
-#define CCMODE_OFB_KEY_IV(K) (&_CCMODE_OFB_KEY(K)->u[0])
-#define CCMODE_OFB_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_OFB_KEY(K)->u[ccn_nof_size(CCMODE_OFB_KEY_ECB(K)->block_size)])
-
-
-/* Macros for accessing a CCMODE_XTS_KEY.
-{
-    const struct ccmode_ecb *ecb
-    const struct ccmode_ecb *ecb_encrypt
-    ccn_unit data_key[ecb->size]
-    ccn_unit tweak_key[ecb_encrypt->size]
-} */
-#define _CCMODE_XTS_KEY(K) ((struct _ccmode_xts_key *)(K))
-#define CCMODE_XTS_KEY_ECB(K) (_CCMODE_XTS_KEY(K)->ecb)
-#define CCMODE_XTS_KEY_ECB_ENCRYPT(K) (_CCMODE_XTS_KEY(K)->ecb_encrypt)
-#define CCMODE_XTS_KEY_DATA_KEY(K) ((ccecb_ctx *)&_CCMODE_XTS_KEY(K)->u[0])
-#define CCMODE_XTS_KEY_TWEAK_KEY(K) ((ccecb_ctx *)&_CCMODE_XTS_KEY(K)->u[ccn_nof_size(CCMODE_XTS_KEY_ECB(K)->size)])
-
-CC_CONST CC_INLINE
-const struct ccmode_ecb * ccmode_xts_key_ecb(const ccxts_ctx *K) {
-    return ((const struct _ccmode_xts_key *)K)->ecb;
-}
-
-CC_CONST CC_INLINE
-const struct ccmode_ecb * ccmode_xts_key_ecb_encrypt(const ccxts_ctx *K) {
-    return ((const struct _ccmode_xts_key *)K)->ecb_encrypt;
-}
-
-CC_CONST CC_INLINE
-const ccecb_ctx * ccmode_xts_key_data_key(const ccxts_ctx *K) {
-    return (const ccecb_ctx *)&((const struct _ccmode_xts_key *)K)->u[0];
-}
-
-CC_CONST CC_INLINE
-const ccecb_ctx * ccmode_xts_key_tweak_key(const ccxts_ctx *K) {
-    return (const ccecb_ctx *)&((const struct _ccmode_xts_key *)K)->u[ccn_nof_size(ccmode_xts_key_ecb(K)->size)];
-}
-
-/* Macros for accessing a CCMODE_XTS_TWEAK.
-{
- size_t  blocks_processed;
- uint8_t value[16];
-} */
-#define _CCMODE_XTS_TWEAK(T) ((struct _ccmode_xts_tweak *)(T))
-#define CCMODE_XTS_TWEAK_BLOCK_PROCESSED(T)(_CCMODE_XTS_TWEAK(T)->blocks_processed)
-#define CCMODE_XTS_TWEAK_VALUE(T) (_CCMODE_XTS_TWEAK(T)->u)
-
-
-/* Macros for accessing a CCMODE_GCM_KEY.
- Common to the generic (factory) and the VNG implementation
-*/
-
-#define _CCMODE_GCM_KEY(K) ((struct _ccmode_gcm_key *)(K))
-#define CCMODE_GCM_KEY_H(K) (_CCMODE_GCM_KEY(K)->H)
-#define CCMODE_GCM_KEY_X(K) (_CCMODE_GCM_KEY(K)->X)
-#define CCMODE_GCM_KEY_Y(K) (_CCMODE_GCM_KEY(K)->Y)
-#define CCMODE_GCM_KEY_Y_0(K) (_CCMODE_GCM_KEY(K)->Y_0)
-#define CCMODE_GCM_KEY_PAD_LEN(K) (_CCMODE_GCM_KEY(K)->buf_nbytes)
-#define CCMODE_GCM_KEY_PAD(K) (_CCMODE_GCM_KEY(K)->buf)
-
-#define _CCMODE_GCM_ECB_MODE(K) ((struct _ccmode_gcm_key *)(K))
-#define CCMODE_GCM_KEY_ECB(K) (_CCMODE_GCM_ECB_MODE(K)->ecb)
-#define CCMODE_GCM_KEY_ECB_KEY(K) ((ccecb_ctx *)_CCMODE_GCM_ECB_MODE(K)->ecb_key)  // set in init function
-
-#define CCMODE_GCM_STATE_IV    1
-#define CCMODE_GCM_STATE_AAD   2
-#define CCMODE_GCM_STATE_TEXT  3
-#define CCMODE_GCM_STATE_FINAL 4
-
-#define CCMODE_STATE_INIT 2     //first call to init
-#define CCMODE_STATE_IV_START 3 //first call to set_iv
-
-// rdar://problem/23523093
-//this allows users to bypass set_iv().
-//this is a temporary setting mainly to allow Security framework to adapt
-//ccgcm_set_iv_legacy() and check the tag on decryption without
-//needing to change the Security framework twice
-//#define CCMODE_STATE_IV_CONT 2 //subsequent calls to set_iv
-#define CCMODE_STATE_IV_CONT CCMODE_STATE_IV_START
-
-#define CCMODE_STATE_AAD     4
-#define CCMODE_STATE_TEXT    5
-
-#define CCMODE_CCM_STATE_IV 1
-
-void ccmode_gcm_gf_mult(const unsigned char *a, const unsigned char *b,
-                        unsigned char *c);
-void ccmode_gcm_mult_h(ccgcm_ctx *key, unsigned char *I);
-
-/* Macros for accessing a CCMODE_CCM_KEY. */
-#define _CCMODE_CCM_KEY(K) ((struct _ccmode_ccm_key *)(K))
-#define CCMODE_CCM_KEY_ECB(K) (_CCMODE_CCM_KEY(K)->ecb)
-#define CCMODE_CCM_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_CCM_KEY(K)->u[0])
-
-#define _CCMODE_CCM_NONCE(N) ((struct _ccmode_ccm_nonce *)(N))
-#define CCMODE_CCM_KEY_MAC(N) (_CCMODE_CCM_NONCE(N)->MAC)
-#define CCMODE_CCM_KEY_A_I(N) (_CCMODE_CCM_NONCE(N)->A_i)
-#define CCMODE_CCM_KEY_B_I(N) (_CCMODE_CCM_NONCE(N)->B_i)
-#define CCMODE_CCM_KEY_PAD_LEN(N) (_CCMODE_CCM_NONCE(N)->buflen)
-#define CCMODE_CCM_KEY_PAD(N) (_CCMODE_CCM_NONCE(N)->buf)
-#define CCMODE_CCM_KEY_MAC_LEN(N) (_CCMODE_CCM_NONCE(N)->mac_size)
-#define CCMODE_CCM_KEY_NONCE_LEN(N) (_CCMODE_CCM_NONCE(N)->nonce_size)
-#define CCMODE_CCM_KEY_AUTH_LEN(N) (_CCMODE_CCM_NONCE(N)->b_i_len)
-
-/* Macros for accessing a CCMODE_OMAC_KEY.
-{
-    const struct ccmode_ecb *ecb
-    cc_size tweak_size;
-    ccn_unit ecb_key1[ecb->n]
-    ccn_unit ecb_key2[ecb->n]
-} */
-#define _CCMODE_OMAC_KEY(K) ((struct _ccmode_omac_key *)(K))
-#define CCMODE_OMAC_KEY_ECB(K) (_CCMODE_OMAC_KEY(K)->ecb)
-#define CCMODE_OMAC_KEY_TWEAK_LEN(K) (_CCMODE_OMAC_KEY(K)->tweak_len)
-#define CCMODE_OMAC_KEY_ECB_KEY(K) ((ccecb_ctx *)&_CCMODE_OMAC_KEY(K)->u[0])
-
-CC_INLINE void inc_uint(uint8_t *buf, size_t nbytes)
-{
-    size_t i;
-    for (i = 0; i < nbytes; i += 1) {
-        if (++buf[nbytes-1-i] & 255) { break; }
-    }
-}
-
-CC_INLINE void ccmode_gcm_update_pad(ccgcm_ctx *key)
-{
-    inc_uint(CCMODE_GCM_KEY_Y(key) + 12, 4);
-    CCMODE_GCM_KEY_ECB(key)->ecb(CCMODE_GCM_KEY_ECB_KEY(key), 1,
-                                 CCMODE_GCM_KEY_Y(key),
-                                 CCMODE_GCM_KEY_PAD(key));
-}
-
-CC_INLINE void ccmode_gcm_aad_finalize(ccgcm_ctx *key)
-{
-    if (_CCMODE_GCM_KEY(key)->state == CCMODE_GCM_STATE_AAD) {
-        if (_CCMODE_GCM_KEY(key)->aad_nbytes % CCGCM_BLOCK_NBYTES > 0) {
-            ccmode_gcm_mult_h(key, CCMODE_GCM_KEY_X(key));
-        }
-        _CCMODE_GCM_KEY(key)->state = CCMODE_GCM_STATE_TEXT;
-    }
-}
-
-CC_INLINE void xor_128bits(unsigned char *r, const unsigned char *a, const unsigned char *b)
-{
-    cc_unit *r1 = (cc_unit *)r;
-    const cc_unit *a1 = (const cc_unit *)a;
-    const cc_unit *b1 = (const cc_unit *)b;
-
-    for (int i=0; i<128/(CCN_UNIT_SIZE*8); i++) {
-        r1[i] = a1[i] ^ b1[i];
-    }
-}
-
-
-
-#endif /* _CORECRYPTO_CCMODE_INTERNAL_H_ */
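The CCMODE_*_KEY_* macros in the deleted header all follow one pattern: a mode context is an opaque blob whose first field is the ecb vtable pointer, followed by pad/IV/counter buffers and finally the ECB key schedule, each region located by an offset derived from ecb->block_size (in cc_units via ccn_nof_size). A hedged standalone sketch of that layout trick, using plain bytes instead of cc_units and illustrative names rather than the corecrypto structures:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* A mode context as a header plus raw storage; the pad, counter and key
 * schedule are found by offsets computed from block_size, just as the
 * CCMODE_CTR_KEY_* accessors do above. */
struct mode_key {
    size_t  block_size;   /* stands in for ecb->block_size */
    size_t  pad_offset;
    uint8_t u[];          /* [ pad | counter | key schedule ] */
};

#define KEY_PAD(k)   (&(k)->u[0])
#define KEY_CTR(k)   (&(k)->u[(k)->block_size])
#define KEY_SCHED(k) (&(k)->u[2 * (k)->block_size])

int main(void)
{
    enum { BLK = 16, SCHED = 240 };
    struct mode_key *k = malloc(sizeof(*k) + 2 * BLK + SCHED);
    if (k == NULL)
        return 1;

    k->block_size = BLK;
    k->pad_offset = BLK;              /* pad is "empty" until first use */
    memset(KEY_CTR(k), 0, BLK);       /* zero the counter */
    memset(KEY_SCHED(k), 0, SCHED);   /* pretend key schedule */

    printf("pad at +%td, ctr at +%td, sched at +%td\n",
           KEY_PAD(k) - k->u, KEY_CTR(k) - k->u, KEY_SCHED(k) - k->u);
    free(k);
    return 0;
}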
index 8bec7daf7d97b9e261c45db902c7cf0cc69f1cba..be6acaa1a722b92a5cd13b5692a4f574f03ad2c3 100644 (file)
@@ -34,6 +34,7 @@
 
 #include <corecrypto/ccdigest_priv.h>
 #include <corecrypto/cc_priv.h>
+#include "ccdigest_internal.h"
 
 /* This can be used for SHA1, SHA256 and SHA224 */
 void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t ctx,
diff --git a/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha1/src/ccdigest_internal.h
new file mode 100644 (file)
index 0000000..bc3921e
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ *  ccdigest_internal.h
+ *  corecrypto
+ *
+ *  Created on 12/20/2017
+ *
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CORECRYPTO_CCDIGEST_INTERNAL_H_
+#define _CORECRYPTO_CCDIGEST_INTERNAL_H_
+
+#include <corecrypto/ccdigest.h>
+
+void ccdigest_final_common(const struct ccdigest_info *di,
+                           ccdigest_ctx_t ctx, void *digest);
+void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t,
+                         unsigned char *digest);
+void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t,
+                         unsigned char *digest);
+
+#endif /* _CORECRYPTO_CCDIGEST_INTERNAL_H_ */
index 3e945ad8c73a3695fc1cb3e97afbc28d78ad3b50..a28e38f9f41c1ad1fe926047fc957701905578f9 100644 (file)
 
 
 #include <corecrypto/ccsha1.h>
+#include "ccsha1_internal.h"
 #include <corecrypto/cc_priv.h>
 #include <corecrypto/ccdigest_priv.h>
+#include "ccdigest_internal.h"
 
 
 #ifndef SHA_LONG_LOG2
diff --git a/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h b/osfmk/corecrypto/ccsha1/src/ccsha1_internal.h
new file mode 100644 (file)
index 0000000..323bbb2
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ *  ccsha1_internal.h
+ *  corecrypto
+ *
+ *  Created on 12/19/2017
+ *
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CORECRYPTO_CCSHA1_INTERNAL_H_
+#define _CORECRYPTO_CCSHA1_INTERNAL_H_
+
+#include <corecrypto/ccdigest.h>
+#include <corecrypto/cc_config.h>
+
+extern const uint32_t ccsha1_initial_state[5];
+
+#if CCSHA1_VNG_INTEL && defined(__x86_64__)
+extern const struct ccdigest_info ccsha1_vng_intel_AVX2_di;
+extern const struct ccdigest_info ccsha1_vng_intel_AVX1_di;
+#endif
+
+#endif /* _CORECRYPTO_CCSHA1_INTERNAL_H_ */
diff --git a/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h b/osfmk/corecrypto/ccsha2/src/ccdigest_internal.h
new file mode 100644 (file)
index 0000000..bc3921e
--- /dev/null
@@ -0,0 +1,47 @@
+/*
+ *  ccdigest_internal.h
+ *  corecrypto
+ *
+ *  Created on 12/20/2017
+ *
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _CORECRYPTO_CCDIGEST_INTERNAL_H_
+#define _CORECRYPTO_CCDIGEST_INTERNAL_H_
+
+#include <corecrypto/ccdigest.h>
+
+void ccdigest_final_common(const struct ccdigest_info *di,
+                           ccdigest_ctx_t ctx, void *digest);
+void ccdigest_final_64be(const struct ccdigest_info *di, ccdigest_ctx_t,
+                         unsigned char *digest);
+void ccdigest_final_64le(const struct ccdigest_info *di, ccdigest_ctx_t,
+                         unsigned char *digest);
+
+#endif /* _CORECRYPTO_CCDIGEST_INTERNAL_H_ */
index c0b031a0d0d1013710f56e844efee1d77d41675e..c702b9736c7ab499ede23187f35a1e4f7452aa73 100644 (file)
@@ -33,6 +33,7 @@
  */
 
 #include <corecrypto/ccsha2.h>
+#include "ccsha2_internal.h"
 #include <corecrypto/cc_runtime_config.h>
 
 #include "corecrypto/fipspost_trace.h"
@@ -43,8 +44,11 @@ const struct ccdigest_info *ccsha256_di(void)
 
 #if  CCSHA2_VNG_INTEL
 #if defined (__x86_64__)
-    return ( (CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di : 
-               ( (CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di : 
+    if (CC_HAS_AVX512_AND_IN_KERNEL())
+        return &ccsha256_vng_intel_SupplementalSSE3_di;
+    else
+    return ( (CC_HAS_AVX2() ? &ccsha256_vng_intel_AVX2_di :
+               ( (CC_HAS_AVX1() ? &ccsha256_vng_intel_AVX1_di :
                        &ccsha256_vng_intel_SupplementalSSE3_di ) ) ) ) ;
 #else
     return &ccsha256_vng_intel_SupplementalSSE3_di;
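The hunk above picks a SHA-256 implementation at runtime: the SupplementalSSE3 variant when CC_HAS_AVX512_AND_IN_KERNEL() is true, otherwise AVX2, then AVX1, then SSSE3 as fallbacks. A standalone sketch of the same select-once dispatch pattern, using GCC/Clang's __builtin_cpu_supports() as a stand-in for the CC_HAS_AVX1()/CC_HAS_AVX2() macros; the impl_* functions are placeholders:

#include <stdio.h>

/* Three implementations of the same operation, standing in for the
 * SSSE3/AVX1/AVX2 compress variants selected above. */
static int impl_generic(int x) { return x + 1; }
static int impl_avx1(int x)    { return x + 1; }
static int impl_avx2(int x)    { return x + 1; }

/* Pick an implementation once, based on what the CPU reports. */
static int (*select_impl(void))(int)
{
    __builtin_cpu_init();
    if (__builtin_cpu_supports("avx2"))
        return impl_avx2;
    if (__builtin_cpu_supports("avx"))
        return impl_avx1;
    return impl_generic;
}

int main(void)
{
    int (*op)(int) = select_impl();
    printf("%d\n", op(41));
    return 0;
}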
index b9ff54b87496891afd8c959bd487bfdbc8f118c7..fb301b446ff82a2082ca8dfed145eb556b3f7095 100644 (file)
@@ -50,6 +50,8 @@
 #include <corecrypto/cc_priv.h>
 #include "ccsha2_internal.h"
 
+#if !CC_KERNEL || !CC_USE_ASM
+
 // Various logical functions
 #define Ch(x,y,z)       (z ^ (x & (y ^ z)))
 #define Maj(x,y,z)      (((x | y) & z) | (x & y))
@@ -91,7 +93,7 @@
     d += t0;                                                   \
     h  = t0 + t1;
 
-// compress 512-bits 
+// compress 512-bits
 void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in)
 {
     uint32_t W[64], t0, t1;
@@ -136,7 +138,7 @@ void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i
             RND(S2,S3,S4,S5,S6,S7,S0,S1,i+6);
             RND(S1,S2,S3,S4,S5,S6,S7,S0,i+7);
         }
-        
+
         // feedback
         s[0] += S0;
         s[1] += S1;
@@ -150,3 +152,5 @@ void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *i
         buf+=CCSHA256_BLOCK_SIZE/sizeof(buf[0]);
     }
 }
+
+#endif
index 1e4109b60b1a228f70f74979c6efb007a5e00324..7b9aef1cecc9cf0853cb98b466c36808f5401167 100644 (file)
 
 #include <corecrypto/ccsha2.h>
 #include <corecrypto/ccdigest_priv.h>
+#include "ccdigest_internal.h"
 #include "ccsha2_internal.h"
 
+#if !CC_KERNEL || !CC_USE_ASM
+
 const struct ccdigest_info ccsha256_ltc_di = {
     .output_size = CCSHA256_OUTPUT_SIZE,
     .state_size = CCSHA256_STATE_SIZE,
@@ -46,3 +49,5 @@ const struct ccdigest_info ccsha256_ltc_di = {
     .compress = ccsha256_ltc_compress,
     .final = ccdigest_final_64be,
 };
+
+#endif
index 14fd2d4fbcec256190d03f57f61201f0f218bc2f..7bf64bc04b86074bb7143b7d10685da7279a2d77 100644 (file)
@@ -2,9 +2,9 @@
  *  ccsha2_internal.h
  *  corecrypto
  *
- *  Created on 12/07/2010
+ *  Created on 12/19/2017
  *
- *  Copyright (c) 2010,2011,2012,2014,2015 Apple Inc. All rights reserved.
+ *  Copyright (c) 2017 Apple Inc. All rights reserved.
  *
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 
 #include <corecrypto/ccdigest.h>
 
-void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *buf);
+extern const struct ccdigest_info ccsha256_v6m_di;
 void ccsha256_v6m_compress(ccdigest_state_t state, size_t nblocks, const void *buf);
+
+void ccsha256_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *buf);
 void ccsha512_ltc_compress(ccdigest_state_t state, size_t nblocks, const void *in);
 
 #if  CCSHA2_VNG_INTEL
@@ -49,10 +51,31 @@ void ccsha256_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, c
 void ccsha512_vng_intel_avx2_compress(ccdigest_state_t state, size_t nblocks, const void *in);
 void ccsha512_vng_intel_avx1_compress(ccdigest_state_t state, size_t nblocks, const void *in);
 void ccsha512_vng_intel_ssse3_compress(ccdigest_state_t state, size_t nblocks, const void *in);
+
+extern const struct ccdigest_info ccsha224_vng_intel_AVX2_di;
+extern const struct ccdigest_info ccsha224_vng_intel_AVX1_di;
+extern const struct ccdigest_info ccsha256_vng_intel_AVX2_di;
+extern const struct ccdigest_info ccsha256_vng_intel_AVX1_di;
+extern const struct ccdigest_info ccsha384_vng_intel_AVX2_di;
+extern const struct ccdigest_info ccsha384_vng_intel_AVX1_di;
+extern const struct ccdigest_info ccsha384_vng_intel_SupplementalSSE3_di;
+extern const struct ccdigest_info ccsha512_vng_intel_AVX2_di;
+extern const struct ccdigest_info ccsha512_vng_intel_AVX1_di;
+extern const struct ccdigest_info ccsha512_vng_intel_SupplementalSSE3_di;
 #endif
 void ccsha256_vng_intel_sse3_compress(ccdigest_state_t state, size_t nblocks, const void *in);
 #endif
 
+#if  CCSHA2_VNG_ARMV7NEON
+extern const struct ccdigest_info ccsha384_vng_arm64_di;
+extern const struct ccdigest_info ccsha384_vng_armv7neon_di;
+extern const struct ccdigest_info ccsha512_vng_arm64_di;
+extern const struct ccdigest_info ccsha512_vng_armv7neon_di;
+#endif
+
+extern const uint32_t ccsha256_K[64];
+extern const uint64_t ccsha512_K[80];
+
 void ccsha512_final(const struct ccdigest_info *di, ccdigest_ctx_t ctx, unsigned char *digest);
 
 extern const uint32_t ccsha224_initial_state[8];
@@ -60,4 +83,5 @@ extern const uint32_t ccsha256_initial_state[8];
 extern const uint64_t ccsha384_initial_state[8];
 extern const uint64_t ccsha512_initial_state[8];
 
+
 #endif /* _CORECRYPTO_CCSHA2_INTERNAL_H_ */
index b2078a91a2cb03d77977e9e75c185c1639e2905d..2941e21254a3393aad5f436cb8fa40cafcdcf564 100644 (file)
@@ -369,9 +369,11 @@ task_purge_all_corpses(void)
                               * Final cleanup:
                               * + no unnesting
                               * + remove immutable mappings
+                              * + allow gaps in the range
                               */
                              (VM_MAP_REMOVE_NO_UNNESTING |
-                              VM_MAP_REMOVE_IMMUTABLE));
+                              VM_MAP_REMOVE_IMMUTABLE |
+                              VM_MAP_REMOVE_GAPS_OK));
        }
 
        lck_mtx_unlock(&tasks_corpse_lock);
@@ -413,7 +415,9 @@ task_generate_corpse(
        if (kr != KERN_SUCCESS) {
                return kr;
        }
-       assert(thread == THREAD_NULL);
+       if (thread != THREAD_NULL) {
+               thread_deallocate(thread);
+       }
 
        /* wait for all the threads in the task to terminate */
        task_lock(new_task);
@@ -476,7 +480,9 @@ task_enqueue_exception_with_corpse(
        kr = task_generate_corpse_internal(task, &new_task, &thread,
                        etype, code[0], code[1], reason);
        if (kr == KERN_SUCCESS) {
-               assert(thread != THREAD_NULL);
+               if (thread == THREAD_NULL) {
+                       return KERN_FAILURE;
+               }
                assert(new_task != TASK_NULL);
                assert(etype == EXC_RESOURCE || etype == EXC_GUARD);
                thread_exception_enqueue(new_task, thread, etype);
@@ -512,7 +518,8 @@ task_generate_corpse_internal(
        thread_t thread_next = THREAD_NULL;
        kern_return_t kr;
        struct proc *p = NULL;
-       int is64bit;
+       int is_64bit_addr;
+       int is_64bit_data;
        int t_flags;
        uint64_t *udata_buffer = NULL;
        int size = 0;
@@ -543,8 +550,13 @@ task_generate_corpse_internal(
                goto error_task_generate_corpse;
        }
 
-       is64bit = IS_64BIT_PROCESS(p);
-       t_flags = TF_CORPSE_FORK | TF_PENDING_CORPSE | TF_CORPSE | (is64bit ? TF_64B_ADDR : TF_NONE);
+       is_64bit_addr = IS_64BIT_PROCESS(p);
+       is_64bit_data = (task == TASK_NULL) ? is_64bit_addr : task_get_64bit_data(task);
+       t_flags = TF_CORPSE_FORK |
+                         TF_PENDING_CORPSE |
+                         TF_CORPSE |
+                         (is_64bit_addr ? TF_64B_ADDR : TF_NONE) |
+                         (is_64bit_data ? TF_64B_DATA : TF_NONE);
 
 #if CONFIG_MACF
        /* Create the corpse label credentials from the process. */
@@ -555,7 +567,8 @@ task_generate_corpse_internal(
        kr = task_create_internal(task,
                                NULL,
                                TRUE,
-                               is64bit,
+                               is_64bit_addr,
+                               is_64bit_data,
                                t_flags,
                                TPF_NONE,
                                &new_task);
index 59ab0759252d14725a1ac00b5305823af477bd45..120aeb97cdf5c200e59d28a28ab1f1c32a653d62 100644 (file)
@@ -68,8 +68,6 @@ extern kern_return_t task_deliver_crash_notification(task_t, thread_t, exception
 
 extern kcdata_descriptor_t task_get_corpseinfo(task_t task);
 
-extern unsigned long total_corposes_count(void) __attribute__((pure));
-
 extern kcdata_descriptor_t  task_crashinfo_alloc_init(
                                        mach_vm_address_t crash_data_p,
                                        unsigned size, uint32_t kc_u_flags, unsigned kc_flags);
diff --git a/osfmk/i386/AT386/conf.c b/osfmk/i386/AT386/conf.c
deleted file mode 100644 (file)
index 8fffcc6..0000000
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-/*
- * @OSF_COPYRIGHT@
- */
-/* 
- * Mach Operating System
- * Copyright (c) 1991,1990,1989 Carnegie Mellon University
- * All Rights Reserved.
- * 
- * Permission to use, copy, modify and distribute this software and its
- * documentation is hereby granted, provided that both the copyright
- * notice and this permission notice appear in all copies of the
- * software, derivative works or modified versions, and any portions
- * thereof, and that both notices appear in supporting documentation.
- * 
- * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
- * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
- * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
- * Carnegie Mellon requests users of this software to return to
- * 
- *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
- *  School of Computer Science
- *  Carnegie Mellon University
- *  Pittsburgh PA 15213-3890
- * 
- * any improvements or extensions that they make and grant Carnegie Mellon
- * the rights to redistribute these changes.
- */
-/* 
- */
-
-/*
- * Device switch for i386 AT bus.
- */
-
-#include <types.h>
-#include <kern/clock.h>
-#include <libkern/section_keywords.h>
-
-/*
- * Clock device subsystem configuration. The clock_list[]
- * table contains the clock structures for all clocks in
- * the system.
- */
-
-extern const struct clock_ops  sysclk_ops, calend_ops;
-
-/*
- * List of clock devices.
- */
-SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = {
-
-       /* SYSTEM_CLOCK */
-       { &sysclk_ops, 0, 0 },
-
-       /* CALENDAR_CLOCK */
-       { &calend_ops, 0, 0 }
-};
-int    clock_count = sizeof(clock_list) / sizeof(clock_list[0]);
index 614b310eb57f124ee1a614df97862e8141cd7520..3976f1f6bba343ee54595e65192472e7d495f708 100644 (file)
 
 #include <libkern/kernel_mach_header.h>
 #include <libkern/OSKextLibPrivate.h>
-
-#include <mach/branch_predicates.h>
+#include <libkern/crc.h>
 
 #if    DEBUG || DEVELOPMENT
 #define DPRINTF(x...)  kprintf(x)
@@ -342,112 +341,9 @@ machine_conf(void)
        machine_info.memory_size = (typeof(machine_info.memory_size))mem_size;
 }
 
-
 extern void *gPEEFIRuntimeServices;
 extern void *gPEEFISystemTable;
 
-/*-
- *  COPYRIGHT (C) 1986 Gary S. Brown.  You may use this program, or
- *  code or tables extracted from it, as desired without restriction.
- *
- *  First, the polynomial itself and its table of feedback terms.  The
- *  polynomial is
- *  X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
- *
- *  Note that we take it "backwards" and put the highest-order term in
- *  the lowest-order bit.  The X^32 term is "implied"; the LSB is the
- *  X^31 term, etc.  The X^0 term (usually shown as "+1") results in
- *  the MSB being 1
- *
- *  Note that the usual hardware shift register implementation, which
- *  is what we're using (we're merely optimizing it by doing eight-bit
- *  chunks at a time) shifts bits into the lowest-order term.  In our
- *  implementation, that means shifting towards the right.  Why do we
- *  do it this way?  Because the calculated CRC must be transmitted in
- *  order from highest-order term to lowest-order term.  UARTs transmit
- *  characters in order from LSB to MSB.  By storing the CRC this way
- *  we hand it to the UART in the order low-byte to high-byte; the UART
- *  sends each low-bit to high-bit; and the result is transmission bit
- *  by bit from highest- to lowest-order term without requiring any bit
- *  shuffling on our part.  Reception works similarly
- *
- *  The feedback terms table consists of 256, 32-bit entries.  Notes
- *
- *      The table can be generated at runtime if desired; code to do so
- *      is shown later.  It might not be obvious, but the feedback
- *      terms simply represent the results of eight shift/xor opera
- *      tions for all combinations of data and CRC register values
- *
- *      The values must be right-shifted by eight bits by the "updcrc
- *      logic; the shift must be unsigned (bring in zeroes).  On some
- *      hardware you could probably optimize the shift in assembler by
- *      using byte-swap instructions
- *      polynomial $edb88320
- *
- *
- * CRC32 code derived from work by Gary S. Brown.
- */
-
-static uint32_t crc32_tab[] = {
-       0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
-       0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
-       0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
-       0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
-       0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
-       0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
-       0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
-       0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
-       0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
-       0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
-       0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
-       0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
-       0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
-       0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
-       0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
-       0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
-       0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
-       0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
-       0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
-       0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
-       0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
-       0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
-       0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
-       0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
-       0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
-       0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
-       0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
-       0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
-       0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
-       0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
-       0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
-       0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
-       0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
-       0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
-       0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
-       0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
-       0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
-       0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
-       0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
-       0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
-       0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
-       0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
-       0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
-};
-
-static uint32_t
-crc32(uint32_t crc, const void *buf, size_t size)
-{
-       const uint8_t *p;
-
-       p = buf;
-       crc = crc ^ ~0U;
-
-       while (size--)
-               crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
-
-       return crc ^ ~0U;
-}
-
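The deleted comment notes that the 256-entry feedback table can be generated at runtime instead of being spelled out. A standalone sketch of the same reflected CRC-32 (polynomial 0xedb88320) with a runtime-built table follows; it reproduces the results of the routine removed here, which this file presumably now gets from <libkern/crc.h>:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_tab[256];

/* Build the feedback table: for each byte value, run eight shift/xor
 * steps of the reflected polynomial. Produces the same table deleted above
 * (0x00000000, 0x77073096, ...). */
static void crc32_init(void)
{
    for (uint32_t n = 0; n < 256; n++) {
        uint32_t c = n;
        for (int k = 0; k < 8; k++)
            c = (c & 1) ? 0xedb88320u ^ (c >> 1) : (c >> 1);
        crc32_tab[n] = c;
    }
}

static uint32_t crc32(uint32_t crc, const void *buf, size_t size)
{
    const uint8_t *p = buf;

    crc = crc ^ ~0U;
    while (size--)
        crc = crc32_tab[(crc ^ *p++) & 0xFF] ^ (crc >> 8);
    return crc ^ ~0U;
}

int main(void)
{
    crc32_init();
    const char *s = "123456789";
    printf("%08x\n", crc32(0, s, strlen(s)));   /* standard check value: cbf43926 */
    return 0;
}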
 static void
 efi_set_tables_64(EFI_SYSTEM_TABLE_64 * system_table)
 {
@@ -1017,9 +913,12 @@ RecordPanicStackshot()
 
 void
 SavePanicInfo(
-       __unused const char *message, uint64_t panic_options)
+       __unused const char *message, void *panic_data, uint64_t panic_options)
 {
-       void *stackptr;
+       void *stackptr  = NULL;
+       thread_t thread_to_trace = (thread_t) panic_data;
+       cframe_t synthetic_stack_frame = { };
+       char *debugger_msg = NULL;
        int cn = cpu_number();
 
        /*
@@ -1028,15 +927,37 @@ SavePanicInfo(
         */
        panic_io_port_read();
 
-       /* Obtain current frame pointer */
-       __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
+       /* Obtain frame pointer for stack to trace */
+       if (panic_options & DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE) {
+               if (!mp_kdp_all_cpus_halted()) {
+                       debugger_msg = "Backtracing panicked thread because failed to halt all CPUs\n";
+               } else if (thread_to_trace == THREAD_NULL) {
+                       debugger_msg = "Backtracing panicked thread because no thread pointer provided\n";
+               } else if (kvtophys((vm_offset_t)thread_to_trace) == 0ULL) {
+                       debugger_msg = "Backtracing panicked thread because unable to access specified thread\n";
+               } else if (thread_to_trace->kernel_stack == 0) {
+                       debugger_msg = "Backtracing panicked thread because kernel_stack is NULL for specified thread\n";
+               } else if (kvtophys((vm_offset_t)STACK_IKS(thread_to_trace->kernel_stack)) == 0ULL) {
+                       debugger_msg = "Backtracing panicked thread because unable to access kernel_stack for specified thread\n";
+               } else {
+                       debugger_msg = "Backtracing specified thread\n";
+                       /* We construct a synthetic stack frame so we can include the current instruction pointer */
+                       synthetic_stack_frame.prev = (cframe_t *)STACK_IKS(thread_to_trace->kernel_stack)->k_rbp;
+                       synthetic_stack_frame.caller = (uintptr_t) STACK_IKS(thread_to_trace->kernel_stack)->k_rip;
+                       stackptr = (void *) &synthetic_stack_frame;
+               }
+       }
+
+       if (stackptr == NULL) {
+               __asm__ volatile("movq %%rbp, %0" : "=m" (stackptr));
+       }
 
        /* Print backtrace - callee is internally synchronized */
        if (panic_options & DEBUGGER_OPTION_INITPROC_PANIC) {
                /* Special handling of launchd died panics */
                print_launchd_info();
        } else {
-               panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), NULL, FALSE, NULL);
+               panic_i386_backtrace(stackptr, ((panic_double_fault_cpu == cn) ? 80: 48), debugger_msg, FALSE, NULL);
        }
 
        if (panic_options & DEBUGGER_OPTION_COPROC_INITIATED_PANIC) {
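SavePanicInfo now seeds the backtracer with a synthetic first frame whose prev/caller come from the target thread's saved rbp/rip, so the interrupted instruction pointer shows up even though the panic handler is not running on that stack. A minimal user-space sketch of the same frame-pointer walk; it assumes frame pointers are compiled in (-O0 or -fno-omit-frame-pointer) and the struct name is illustrative, not the kernel's cframe_t:

#include <stdio.h>

/* Frame layout on x86-64/arm64 with frame pointers enabled: each frame
 * starts with the saved previous frame pointer, followed by the return
 * address, mirroring the synthetic frame built above. */
struct frame {
    struct frame *prev;
    void         *caller;
};

static void backtrace_from(struct frame *fp, int max)
{
    for (int i = 0; fp != NULL && i < max; i++) {
        printf("frame %d: return address %p\n", i, fp->caller);
        if (fp->prev <= fp)           /* frames must move up the stack */
            break;
        fp = fp->prev;
    }
}

static __attribute__((noinline)) void inner(void)
{
    /* A panic handler would instead start from a saved rbp/rip pair. */
    backtrace_from((struct frame *)__builtin_frame_address(0), 3);
}

static __attribute__((noinline)) void middle(void) { inner(); }

int main(void)
{
    middle();                         /* prints inner -> middle -> main */
    return 0;
}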
index 4df5e85102d508732216101b5d575aabb655891e..12e9c602505bfd67b06a0832f22148d05b5541b5 100644 (file)
@@ -22,6 +22,7 @@ EXPORT_ONLY_FILES =   \
                    lapic.h \
                    lock.h \
                    locks.h \
+                   locks_i386_inlines.h \
                    machine_routines.h \
                    machine_cpu.h \
                    mtrr.h \
index 5ceff836b2c4ce5f366d5df511f1ea9747f861fe..5a991c5971e00132038a9445cfb9a74305574541 100644 (file)
@@ -318,7 +318,7 @@ acpi_sleep_kernel(acpi_sleep_callback func, void *refcon)
         * The sleep implementation uses indirect noreturn calls, so we miss stack
         * unpoisoning. Do it explicitly.
         */
-       __asan_handle_no_return();
+       kasan_unpoison_curstack(true);
 #endif
 
 #if HIBERNATION
index ef4652d5cd5244da0f64c4f3bd7fef09df678298..940e5fcf2139d453854d5d7a663381fa11f7bc68 100644 (file)
@@ -51,7 +51,7 @@
 
 #ifdef ATOMIC_PRIVATE
 
-static boolean_t
+static inline boolean_t
 atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
                        enum memory_order ord, boolean_t wait)
 {
@@ -59,22 +59,14 @@ atomic_compare_exchange(uintptr_t *target, uintptr_t oldval, uintptr_t newval,
        return __c11_atomic_compare_exchange_strong((_Atomic uintptr_t *)target, &oldval, newval, ord, memory_order_relaxed);
 }
 
-#endif // ATOMIC_PRIVATE
-
-#define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
-               bool _result = false; \
-               typeof(p) _p = (p); \
-               ov = atomic_load_explicit(_p, memory_order_relaxed); \
-               do { \
-                       __VA_ARGS__; \
-                       typeof(ov) _r = (ov); \
-                       _result = atomic_compare_exchange_weak_explicit(_p, &_r, nv, \
-                                       memory_order_##m, memory_order_relaxed); \
-                       (ov) = _r; \
-               } while (__builtin_expect(!_result, 0)); \
-               _result; \
-       })
+static inline boolean_t
+atomic_compare_exchange32(uint32_t *target, uint32_t oldval, uint32_t newval,
+                       enum memory_order ord, boolean_t wait)
+{
+       (void)wait;
+       return __c11_atomic_compare_exchange_strong((_Atomic uint32_t *)target, &oldval, newval, ord, memory_order_relaxed);
+}
 
-#define os_atomic_rmw_loop_give_up(expr) ({ expr; __builtin_trap(); })
+#endif // ATOMIC_PRIVATE
 
 #endif // _I386_ATOMIC_H_
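The new atomic_compare_exchange32() is a strong 32-bit compare-and-swap; such primitives are normally driven from a load/modify/CAS retry loop, which is what the removed os_atomic_rmw_loop macro expressed. A standalone C11 sketch of that loop using <stdatomic.h> (not the kernel wrappers):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint32_t counter;

/* Atomically set counter to max(counter, v) with a CAS retry loop. */
static void atomic_store_max(uint32_t v)
{
    uint32_t cur = atomic_load_explicit(&counter, memory_order_relaxed);
    while (cur < v &&
           !atomic_compare_exchange_weak_explicit(&counter, &cur, v,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed)) {
        /* on failure, cur is reloaded with the latest value; retry */
    }
}

int main(void)
{
    atomic_store_max(7);
    atomic_store_max(3);
    printf("%u\n", atomic_load(&counter));   /* prints 7 */
    return 0;
}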
index b855d1c6611ad9366ea04aa30b678b767a4ce3f8..805cbc1de6dc51d6c121ae03c7b108de1794ba16 100644 (file)
@@ -211,7 +211,7 @@ thread_set_child(thread_t child, int pid)
 {
        pal_register_cache_state(child, DIRTY);
 
-       if (thread_is_64bit(child)) {
+       if (thread_is_64bit_addr(child)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(child);
@@ -609,7 +609,7 @@ thread_setuserstack(
        mach_vm_address_t       user_stack)
 {
        pal_register_cache_state(thread, DIRTY);
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(thread);
@@ -636,7 +636,7 @@ thread_adjuserstack(
        int             adjust)
 {
        pal_register_cache_state(thread, DIRTY);
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(thread);
@@ -665,7 +665,7 @@ void
 thread_setentrypoint(thread_t thread, mach_vm_address_t entry)
 {
        pal_register_cache_state(thread, DIRTY);
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(thread);
@@ -685,7 +685,7 @@ kern_return_t
 thread_setsinglestep(thread_t thread, int on)
 {
        pal_register_cache_state(thread, DIRTY);
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(thread);
index 4ec100effaf994f038c1f5978ebc3333a62ae86d..6b8e1d1249deb7890f7846380615237d5c150506 100644 (file)
@@ -75,7 +75,8 @@
 kern_return_t
 machine_thread_dup(
     thread_t           parent,
-    thread_t           child
+    thread_t           child,
+    __unused boolean_t is_corpse
 )
 {
        
@@ -85,7 +86,7 @@ machine_thread_dup(
        /*
         * Copy over the x86_saved_state registers
         */
-       if (thread_is_64bit(parent))
+       if (thread_is_64bit_addr(parent))
                bcopy(USER_REGS64(parent), USER_REGS64(child), sizeof(x86_saved_state64_t));
        else
                bcopy(USER_REGS32(parent), USER_REGS32(child), sizeof(x86_saved_state32_t));
@@ -101,7 +102,7 @@ machine_thread_dup(
         * Copy the parent's cthread id and USER_CTHREAD descriptor, if 32-bit.
         */
        child_pcb->cthread_self = parent_pcb->cthread_self;
-       if (!thread_is_64bit(parent))
+       if (!thread_is_64bit_addr(parent))
                child_pcb->cthread_desc = parent_pcb->cthread_desc;
 
        /*
@@ -125,7 +126,7 @@ thread_set_parent(thread_t parent, int pid)
 {
        pal_register_cache_state(parent, DIRTY);
 
-       if (thread_is_64bit(parent)) {
+       if (thread_is_64bit_addr(parent)) {
                x86_saved_state64_t     *iss64;
 
                iss64 = USER_REGS64(parent);
index 81b962c1bb2620fce5df6969a9f0bf4395b720d0..8d93e94564df8d29ff74f10564296075d1065c6c 100644 (file)
@@ -126,22 +126,25 @@ commpage_allocate(
        vm_map_entry_t  entry;
        ipc_port_t      handle;
        kern_return_t   kr;
+       vm_map_kernel_flags_t vmk_flags;
 
        if (submap == NULL)
                panic("commpage submap is null");
 
-       if ((kr = vm_map_kernel(kernel_map,
-                        &kernel_addr,
-                        area_used,
-                        0,
-                        VM_FLAGS_ANYWHERE,
-                        VM_KERN_MEMORY_OSFMK,
-                        NULL,
-                        0,
-                        FALSE,
-                        VM_PROT_ALL,
-                        VM_PROT_ALL,
-                        VM_INHERIT_NONE)))
+       kr = vm_map_kernel(kernel_map,
+                          &kernel_addr,
+                          area_used,
+                          0,
+                          VM_FLAGS_ANYWHERE,
+                          VM_MAP_KERNEL_FLAGS_NONE,
+                          VM_KERN_MEMORY_OSFMK,
+                          NULL,
+                          0,
+                          FALSE,
+                          VM_PROT_ALL,
+                          VM_PROT_ALL,
+                          VM_INHERIT_NONE);
+       if (kr != KERN_SUCCESS)
                panic("cannot allocate commpage %d", kr);
 
        if ((kr = vm_map_wire_kernel(kernel_map,
@@ -171,18 +174,31 @@ commpage_allocate(
                                    NULL )))            // parent_entry (what is this?)
                panic("cannot make entry for commpage %d", kr);
 
-       if ((kr = vm_map_64_kernel(     submap,                         // target map (shared submap)
-                       &zero,                          // address (map into 1st page in submap)
-                       area_used,                      // size
-                       0,                              // mask
-                       VM_FLAGS_FIXED,                 // flags (it must be 1st page in submap)
-                       VM_KERN_MEMORY_NONE,
-                       handle,                         // port is the memory entry we just made
-                       0,                              // offset (map 1st page in memory entry)
-                       FALSE,                          // copy
-                       uperm,   // cur_protection (R-only in user map)
-                       uperm,   // max_protection
-                       VM_INHERIT_SHARE )))             // inheritance
+       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       if (uperm == (VM_PROT_READ | VM_PROT_EXECUTE)) {
+               /*
+                * Mark this unsigned executable mapping as "jit" to avoid
+                * code-signing violations when attempting to execute unsigned
+                * code.
+                */
+               vmk_flags.vmkf_map_jit = TRUE;
+       }
+
+       kr = vm_map_64_kernel(
+               submap,                 // target map (shared submap)
+               &zero,                  // address (map into 1st page in submap)
+               area_used,              // size
+               0,                      // mask
+               VM_FLAGS_FIXED,         // flags (it must be 1st page in submap)
+               vmk_flags,
+               VM_KERN_MEMORY_NONE,
+               handle,                 // port is the memory entry we just made
+               0,                      // offset (map 1st page in memory entry)
+               FALSE,                  // copy
+               uperm,                  // cur_protection (R-only in user map)
+               uperm,                  // max_protection
+               VM_INHERIT_SHARE);      // inheritance
+       if (kr != KERN_SUCCESS)
                panic("cannot map commpage %d", kr);
 
        ipc_port_release(handle);
@@ -307,9 +323,9 @@ commpage_init_cpu_capabilities( void )
                                        CPUID_LEAF7_FEATURE_HLE);
        setif(bits, kHasAVX2_0,  cpuid_leaf7_features() &
                                        CPUID_LEAF7_FEATURE_AVX2);
-       setif(bits, kHasRDSEED,  cpuid_features() &
+       setif(bits, kHasRDSEED,  cpuid_leaf7_features() &
                                        CPUID_LEAF7_FEATURE_RDSEED);
-       setif(bits, kHasADX,     cpuid_features() &
+       setif(bits, kHasADX,     cpuid_leaf7_features() &
                                        CPUID_LEAF7_FEATURE_ADX);
        
 #if 0  /* The kernel doesn't support MPX or SGX */
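The two setif() fixes above matter because RDSEED and ADX are reported in CPUID leaf 7 (EAX=7, ECX=0), EBX bits 18 and 19, not in the leaf-1 feature word returned by cpuid_features(). A standalone sketch of where those bits live (illustrative only, not kernel code):

/* Sketch: read CPUID.(EAX=7,ECX=0):EBX, where bit 18 = RDSEED, bit 19 = ADX. */
static inline uint32_t
example_leaf7_ebx(void)
{
        uint32_t eax = 7, ebx, ecx = 0, edx;

        __asm__ volatile ("cpuid"
                          : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
        return ebx;
}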
index dff0ae545486701dfe166ab5dd2539558103d031..5caa412da3594fc125d2b926a40d66ef7bf59c18 100644 (file)
@@ -218,7 +218,7 @@ int _NumCPUs( void )
 /* Align following entries to next cache line */
 #define _COMM_PAGE_CONT_TIMEBASE       (_COMM_PAGE_START_ADDRESS+0x0C0)        /* used by mach_continuous_time() */
 #define _COMM_PAGE_BOOTTIME_USEC       (_COMM_PAGE_START_ADDRESS+0x0C8)        /* uint64_t boottime */
-#define _COMM_PAGE_NEWTIMEOFDAY_DATA   (_COMM_PAGE_START_ADDRESS+0x0D0)        /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40*/
+#define _COMM_PAGE_NEWTIMEOFDAY_DATA   (_COMM_PAGE_START_ADDRESS+0x0D0)        /* used by gettimeofday(). Currently, sizeof(new_commpage_timeofday_data_t) = 40 */
 
 #define _COMM_PAGE_END                 (_COMM_PAGE_START_ADDRESS+0xfff)        /* end of common page */
 
index 32ffc1d83eb61049f0c530b8e4abbd1a7119c762..4201068f4d7be6506c9125121493e2e789bcc6ae 100644 (file)
@@ -268,7 +268,6 @@ typedef struct cpu_data
 #if CONFIG_MCA
        struct mca_state        *cpu_mca_state;         /* State at MC fault */
 #endif
-       struct prngContext      *cpu_prng;              /* PRNG's context */
        int                     cpu_type;
        int                     cpu_subtype;
        int                     cpu_threadtype;
@@ -289,6 +288,7 @@ typedef struct cpu_data
        uint64_t                cpu_exit_cr3;
        uint64_t                cpu_pcid_last_cr3;
 #endif
+       boolean_t               cpu_rendezvous_in_progress;
 } cpu_data_t;
 
 extern cpu_data_t      *cpu_data_ptr[];  
@@ -365,12 +365,37 @@ extern cpu_data_t *cpu_data_ptr[];
  * inline versions of these routines.  Everyone outside, must call
  * the real thing,
  */
+
+
+/*
+ * The "volatile" flavor of current_thread() is intended for use by
+ * scheduler code which may need to update the thread pointer in the
+ * course of a context switch.  Any call to current_thread() made
+ * prior to the thread pointer update should be safe to optimize away
+ * as it should be consistent with that thread's state to the extent
+ * the compiler can reason about it.  Likewise, the context switch
+ * path will eventually result in an arbitrary branch to the new
+ * thread's pc, about which the compiler won't be able to reason.
+ * Thus any compile-time optimization of current_thread() calls made
+ * within the new thread should be safely encapsulated in its
+ * register/stack state.  The volatile form therefore exists to cover
+ * the window between the thread pointer update and the branch to
+ * the new pc.
+ */
 static inline thread_t
+get_active_thread_volatile(void)
+{
+       CPU_DATA_GET(cpu_active_thread,thread_t)
+}
+
+static inline __pure2 thread_t
 get_active_thread(void)
 {
        CPU_DATA_GET(cpu_active_thread,thread_t)
 }
+
 #define current_thread_fast()          get_active_thread()
+#define current_thread_volatile()      get_active_thread_volatile()
 #define current_thread()               current_thread_fast()
 
 #define cpu_mode_is64bit()             TRUE
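The comment above motivates the volatile flavor: get_active_thread() is __pure2, so the compiler may fold repeated calls into a single load of cpu_active_thread, while current_thread_volatile() forces a fresh load each time. A hypothetical illustration (not real scheduler code) of the window it covers:

/* Illustration only: with the __pure2 flavor both calls below could be
 * folded into one load; the volatile flavor reloads cpu_active_thread,
 * which matters if a context switch updated it in between. */
static void
example_reload_thread_pointer(void)
{
        thread_t before = current_thread_volatile();    /* load #1 */
        /* ... low-level switch code updates cpu_active_thread here ... */
        thread_t after  = current_thread_volatile();    /* load #2, not folded */
        (void)before;
        (void)after;
}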
index b6182d4a381af616db65b4873d9fdb9cab01a11b..5679190e32f56a0ab19317e777b717cd58424376 100644 (file)
@@ -40,6 +40,33 @@ Entry(vzeroall)
        vzeroall
        ret
 
+Entry(avx512_zero)
+       vzeroall
+
+       VPX %zmm16
+       VPX %zmm17
+       VPX %zmm18
+       VPX %zmm19
+
+       VPX %zmm20
+       VPX %zmm21
+       VPX %zmm22
+       VPX %zmm23
+
+       VPX %zmm24
+       VPX %zmm25
+       VPX %zmm26
+       VPX %zmm27
+
+       VPX %zmm28
+       VPX %zmm29
+       VPX %zmm30
+       VPX %zmm31
+
+       xor %eax, %eax
+       kmovw %eax, %k1
+       ret
+
 Entry(xmmzeroall)
        PX %xmm0
        PX %xmm1
index 51c89b83298e34bed78b72cd202b9aaf578d3085..c0883c821c5f6b67405ce902c616deed113e6310 100644 (file)
@@ -57,7 +57,6 @@
 #include <mach/exception_types.h>
 #include <mach/i386/thread_status.h>
 #include <mach/i386/fp_reg.h>
-#include <mach/branch_predicates.h>
 
 #include <kern/mach_param.h>
 #include <kern/processor.h>
@@ -557,7 +556,7 @@ static void fpu_load_registers(void *fstate) {
        fp_save_layout_t layout = ifps->fp_save_layout;
 
        assert(current_task() == NULL ||                                \
-              (thread_is_64bit(current_thread()) ?                     \
+              (thread_is_64bit_addr(current_thread()) ?                        \
                        (layout == FXSAVE64 || layout == XSAVE64) :     \
                        (layout == FXSAVE32 || layout == XSAVE32)));
        assert(ALIGNED(ifps, 64));
@@ -701,7 +700,10 @@ fpu_switch_context(thread_t old, thread_t new)
                 */
                clear_ts();
                /* registers are in FPU - save to memory */
-               fpu_store_registers(ifps, (thread_is_64bit(old) && is_saved_state64(old->machine.iss)));
+               boolean_t is64 = (thread_is_64bit_addr(old) &&
+                   is_saved_state64(old->machine.iss));
+
+               fpu_store_registers(ifps, is64);
                ifps->fp_valid = TRUE;
 
                if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
@@ -839,13 +841,13 @@ fpu_set_fxstate(
                        panic("fpu_set_fxstate() UNDEFINED xstate");
                        break;
                    case FP:
-                       ifps->fp_save_layout = thread_is_64bit(thr_act) ? FXSAVE64 : FXSAVE32;
+                       ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
                        break;
                    case AVX: {
                        struct x86_avx_thread_state *iavx = (void *) ifps;
                        x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
 
-                       iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32;
+                       iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
 
                        /* Sanitize XSAVE header */
                        bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
@@ -870,7 +872,7 @@ fpu_set_fxstate(
                                x86_avx512_state64_t *s64;
                        } xs = { .ts = tstate };
 
-                       iavx->fp.fp_save_layout = thread_is_64bit(thr_act) ? XSAVE64 : XSAVE32;
+                       iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
 
                        /* Sanitize XSAVE header */
                        bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
@@ -1182,7 +1184,7 @@ fpnoextflt(void)
                ifps = fp_state_alloc(xstate);
                bcopy((char *)&initial_fp_state, (char *)ifps,
                    fp_state_size[xstate]);
-               if (!thread_is_64bit(thr_act)) {
+               if (!thread_is_64bit_addr(thr_act)) {
                        ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
                }
                else
@@ -1343,7 +1345,7 @@ fp_save(
                assert((get_cr0() & CR0_TS) == 0);
                /* registers are in FPU */
                ifps->fp_valid = TRUE;
-               fpu_store_registers(ifps, thread_is_64bit(thr_act));
+               fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
        }
 }
 
index 8868fe29ce21c033512c66c01b236df578fc5daa..0ed1dda804d1bfc99310ae3a4d8b028f4ce1d411 100644 (file)
@@ -134,5 +134,6 @@ extern void         fpUDflt(user_addr_t rip);
 extern uint32_t        thread_fpsimd_hash(thread_t);
 extern void vzeroall(void);
 extern void xmmzeroall(void);
+extern void avx512_zero(void);
 #endif /* MKP */
 #endif /* _I386_FPU_H_ */
index 5a7655de9fbb7679518e0e157f9cd6697366a799..8eb6b7edf6bfbac5244410f60c09888bf1a52b08 100644 (file)
@@ -416,7 +416,7 @@ vstart(vm_offset_t boot_args_start)
 #endif
 
 #if MONOTONIC
-               mt_init();
+               mt_early_init();
 #endif /* MONOTONIC */
 
                first_avail = (vm_offset_t)ID_MAP_VTOP(physfree);
@@ -729,6 +729,7 @@ void doublemap_init(void) {
 
        dblmap_dist = dblmap_base - hdescb;
        idt64_hndl_table0[1] = DBLMAP(idt64_hndl_table0[1]);
+       idt64_hndl_table0[6] = (uint64_t)(uintptr_t)&kernel_stack_mask;
 
        extern cpu_data_t cpshadows[], scdatas[];
        uintptr_t cd1 = (uintptr_t) &cpshadows[0];
index d657afaee7aebbf117ab962fef950394345151ce..ac0726918fe241e26c3edc1fe5b3879498eae865 100644 (file)
@@ -41,7 +41,7 @@
 #include <i386/trap.h>
 #include <config_dtrace.h>
 #include <i386/mp.h>
-       
+
 #include "assym.s"
 
 #define        PAUSE           rep; nop
 #define LEAF_ENTRY(name)       \
        Entry(name)
 
-#define LEAF_ENTRY2(n1,n2)     \
-       Entry(n1);              \
-       Entry(n2)
-
 #define LEAF_RET               \
        ret
 
-/* Non-leaf routines always have a stack frame: */
-
-#define NONLEAF_ENTRY(name)    \
-       Entry(name);            \
-       FRAME
-
-#define NONLEAF_ENTRY2(n1,n2)  \
-       Entry(n1);              \
-       Entry(n2);              \
-       FRAME
-
-#define NONLEAF_RET            \
-       EMARF;                  \
-       ret
-
-
 /* For x86_64, the varargs ABI requires that %al indicate
  * how many SSE register contain arguments. In our case, 0 */
 #define ALIGN_STACK()          and  $0xFFFFFFFFFFFFFFF0, %rsp ;
 #define LOAD_PTR_ARG1(x)       mov x, %rsi ;
 #define CALL_PANIC()           xorb %al,%al ; call EXT(panic) ;
 
-#define        CHECK_UNLOCK(current, owner)                            \
-       cmp     current, owner                          ;       \
-       je      1f                                      ;       \
-       ALIGN_STACK()                                   ;       \
-       LOAD_STRING_ARG0(2f)                            ;       \
-       CALL_PANIC()                                    ;       \
-       hlt                                             ;       \
-       .data                                           ;       \
-2:     String  "Mutex unlock attempted from non-owner thread"; \
-       .text                                           ;       \
-1:
-
-#if    MACH_LDEBUG
-/*
- *  Routines for general lock debugging.
- */
-
-/* 
- * Checks for expected lock types and calls "panic" on
- * mismatch.  Detects calls to Mutex functions with
- * type simplelock and vice versa.
- */
-#define        CHECK_MUTEX_TYPE()                                      \
-       cmpl    $ MUTEX_TAG,M_TYPE                      ;       \
-       je      1f                                      ;       \
-       ALIGN_STACK()                                   ;       \
-       LOAD_STRING_ARG0(2f)                            ;       \
-       CALL_PANIC()                                    ;       \
-       hlt                                             ;       \
-       .data                                           ;       \
-2:     String  "not a mutex!"                          ;       \
-       .text                                           ;       \
-1:
-
-#define        CHECK_MYLOCK(current, owner)                            \
-       cmp     current, owner                          ;       \
-       jne     1f                                      ;       \
-       ALIGN_STACK()                                   ;       \
-       LOAD_STRING_ARG0(2f)                            ;       \
-       CALL_PANIC()                                    ;       \
-       hlt                                             ;       \
-       .data                                           ;       \
-2:     String  "Attempt to recursively lock a non-recursive lock";     \
-       .text                                           ;       \
-1:
-
-#else  /* MACH_LDEBUG */
-#define        CHECK_MUTEX_TYPE()
-#define        CHECK_MYLOCK(thd)
-#endif /* MACH_LDEBUG */
-
-#if DEVELOPMENT || DEBUG
-/*
- * If one or more simplelocks are currently held by a thread,
- * an attempt to acquire a mutex will cause this check to fail
- * (since a mutex lock may context switch, holding a simplelock
- * is not a good thing).
- */
-#define CHECK_PREEMPTION_LEVEL()                               \
-       cmpl    $0,%gs:CPU_PREEMPTION_LEVEL             ;       \
-       je      1f                                      ;       \
-       cmpl    $0,EXT(LckDisablePreemptCheck)(%rip)    ;       \
-       jne     1f                                      ;       \
-       cmpl    $0,%gs:CPU_HIBERNATE                    ;       \
-       jne     1f                                      ;       \
-       ALIGN_STACK()                                   ;       \
-       movl    %gs:CPU_PREEMPTION_LEVEL, %eax          ;       \
-       LOAD_ARG1(%eax)                                 ;       \
-       LOAD_STRING_ARG0(2f)                            ;       \
-       CALL_PANIC()                                    ;       \
-       hlt                                             ;       \
-       .data                                           ;       \
-2:     String  "preemption_level(%d) != 0!"            ;       \
-       .text                                           ;       \
-1:
-#else /* DEVELOPMENT || DEBUG */
-#define CHECK_PREEMPTION_LEVEL()
-#endif /* DEVELOPMENT || DEBUG */
-
 #define PREEMPTION_DISABLE                             \
        incl    %gs:CPU_PREEMPTION_LEVEL
 
 19:
 #endif
 
-
-#if    CONFIG_DTRACE
-
-       .globl  _lockstat_probe
-       .globl  _lockstat_probemap
-
-/*
- * LOCKSTAT_LABEL creates a dtrace symbol which contains
- * a pointer into the lock code function body. At that
- * point is a "ret" instruction that can be patched into
- * a "nop"
- */
-
-#define        LOCKSTAT_LABEL(lab) \
-       .data                                       ;\
-       .globl  lab                                 ;\
-       lab:                                        ;\
-       .quad 9f                                    ;\
-       .text                                       ;\
-       9:
-
-#define LOCKSTAT_RECORD(id, lck) \
-       push    %rbp                                ;       \
-       mov     %rsp,%rbp                           ;       \
-       movl    _lockstat_probemap + (id * 4)(%rip),%eax ;  \
-       test    %eax,%eax                           ;       \
-       je              9f                          ;       \
-       mov             lck, %rsi                   ;       \
-       mov             %rax, %rdi                  ;       \
-       mov             $0, %rdx                    ;       \
-       mov             $0, %rcx                    ;       \
-       mov             $0, %r8                     ;       \
-       mov             $0, %r9                     ;       \
-       call    *_lockstat_probe(%rip)              ;       \
-9:     leave
-       /* ret - left to subsequent code, e.g. return values */
-
-#endif /* CONFIG_DTRACE */
-
 /*
  * For most routines, the hw_lock_t pointer is loaded into a
  * register initially, and then either a byte or register-sized
@@ -286,801 +148,6 @@ LEAF_ENTRY(hw_lock_byte_unlock)
        movb $0, (%rdi)         /* Clear the lock byte */
        PREEMPTION_ENABLE
        LEAF_RET
-       
-/*
- * N.B.: On x86, statistics are currently recorded for all indirect mutexes.
- * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained
- * as a 64-bit quantity (this matches the existing PowerPC implementation,
- * and the new x86 specific statistics are also maintained as 32-bit
- * quantities).
- *
- *
- * Enable this preprocessor define to record the first miss alone
- * By default, we count every miss, hence multiple misses may be
- * recorded for a single lock acquire attempt via lck_mtx_lock
- */
-#undef LOG_FIRST_MISS_ALONE    
-
-/*
- * This preprocessor define controls whether the R-M-W update of the
- * per-group statistics elements are atomic (LOCK-prefixed)
- * Enabled by default.
- */
-#define ATOMIC_STAT_UPDATES 1
-
-#if defined(ATOMIC_STAT_UPDATES)
-#define LOCK_IF_ATOMIC_STAT_UPDATES lock
-#else
-#define LOCK_IF_ATOMIC_STAT_UPDATES
-#endif /* ATOMIC_STAT_UPDATES */
-
-
-/*
- * For most routines, the lck_mtx_t pointer is loaded into a
- * register initially, and the owner field checked for indirection.
- * Eventually the lock owner is loaded into a register and examined.
- */
-
-#define M_OWNER                MUTEX_OWNER
-#define M_PTR          MUTEX_PTR
-#define M_STATE                MUTEX_STATE     
-       
-
-#define LMTX_ENTER_EXTENDED                                    \
-       mov     M_PTR(%rdx), %rdx                       ;       \
-       xor     %r11, %r11                              ;       \
-       mov     MUTEX_GRP(%rdx), %r10                   ;       \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incq    GRP_MTX_STAT_UTIL(%r10)
-
-
-#if    LOG_FIRST_MISS_ALONE
-#define LMTX_UPDATE_MISS                                       \
-       test    $1, %r11                                ;       \
-       jnz     11f                                     ;       \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incl    GRP_MTX_STAT_MISS(%r10)                 ;       \
-       or      $1, %r11                                ;       \
-11:
-#else
-#define LMTX_UPDATE_MISS                                       \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incl    GRP_MTX_STAT_MISS(%r10)
-#endif
-       
-
-#if    LOG_FIRST_MISS_ALONE
-#define LMTX_UPDATE_WAIT                                       \
-       test    $2, %r11                                ;       \
-       jnz     11f                                     ;       \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incl    GRP_MTX_STAT_WAIT(%r10)                 ;       \
-       or      $2, %r11                                ;       \
-11:
-#else
-#define LMTX_UPDATE_WAIT                                       \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incl    GRP_MTX_STAT_WAIT(%r10)
-#endif
-
-
-/*
- * Record the "direct wait" statistic, which indicates if a
- * miss proceeded to block directly without spinning--occurs
- * if the owner of the mutex isn't running on another processor
- * at the time of the check.
- */
-#define LMTX_UPDATE_DIRECT_WAIT                                        \
-       LOCK_IF_ATOMIC_STAT_UPDATES                     ;       \
-       incl    GRP_MTX_STAT_DIRECT_WAIT(%r10)
-
-       
-#define LMTX_CALLEXT1(func_name)               \
-       cmp     %rdx, %rdi              ;       \
-       je      12f                     ;       \
-       push    %r10                    ;       \
-       push    %r11                    ;       \
-12:    push    %rdi                    ;       \
-       push    %rdx                    ;       \
-       mov     %rdx, %rdi              ;       \
-       call    EXT(func_name)          ;       \
-       pop     %rdx                    ;       \
-       pop     %rdi                    ;       \
-       cmp     %rdx, %rdi              ;       \
-       je      12f                     ;       \
-       pop     %r11                    ;       \
-       pop     %r10                    ;       \
-12:
-       
-#define LMTX_CALLEXT2(func_name, reg)          \
-       cmp     %rdx, %rdi              ;       \
-       je      12f                     ;       \
-       push    %r10                    ;       \
-       push    %r11                    ;       \
-12:    push    %rdi                    ;       \
-       push    %rdx                    ;       \
-       mov     reg, %rsi               ;       \
-       mov     %rdx, %rdi              ;       \
-       call    EXT(func_name)          ;       \
-       pop     %rdx                    ;       \
-       pop     %rdi                    ;       \
-       cmp     %rdx, %rdi              ;       \
-       je      12f                     ;       \
-       pop     %r11                    ;       \
-       pop     %r10                    ;       \
-12:
-
-
-#define M_WAITERS_MSK          0x0000ffff
-#define M_PRIORITY_MSK         0x00ff0000
-#define M_ILOCKED_MSK          0x01000000
-#define M_MLOCKED_MSK          0x02000000
-#define M_PROMOTED_MSK         0x04000000
-#define M_SPIN_MSK             0x08000000
-
-/*
- *     void lck_mtx_assert(lck_mtx_t* l, unsigned int)
- *     Takes the address of a lock, and an assertion type as parameters.
- *     The assertion can take one of two forms determine by the type
- *     parameter: either the lock is held by the current thread, and the
- *     type is LCK_MTX_ASSERT_OWNED, or it isn't and the type is
- *     LCK_MTX_ASSERT_NOTOWNED. Calls panic on assertion failure.
- *     
- */
-
-NONLEAF_ENTRY(lck_mtx_assert)
-        mov    %rdi, %rdx                      /* Load lock address */
-       mov     %gs:CPU_ACTIVE_THREAD, %rax     /* Load current thread */
-
-       mov     M_STATE(%rdx), %ecx
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex? */
-       jne     0f
-       mov     M_PTR(%rdx), %rdx               /* If so, take indirection */
-0:     
-       mov     M_OWNER(%rdx), %rcx             /* Load owner */
-       cmp     $(MUTEX_ASSERT_OWNED), %rsi
-       jne     2f                              /* Assert ownership? */
-       cmp     %rax, %rcx                      /* Current thread match? */
-       jne     3f                              /* no, go panic */
-       testl   $(M_ILOCKED_MSK | M_MLOCKED_MSK), M_STATE(%rdx)
-       je      3f
-1:                                             /* yes, we own it */
-       NONLEAF_RET
-2:
-       cmp     %rax, %rcx                      /* Current thread match? */
-       jne     1b                              /* No, return */
-       ALIGN_STACK()
-       LOAD_PTR_ARG1(%rdx)
-       LOAD_STRING_ARG0(mutex_assert_owned_str)
-       jmp     4f
-3:
-       ALIGN_STACK()
-       LOAD_PTR_ARG1(%rdx)
-       LOAD_STRING_ARG0(mutex_assert_not_owned_str)
-4:
-       CALL_PANIC()
-
-
-lck_mtx_destroyed:
-       ALIGN_STACK()
-       LOAD_PTR_ARG1(%rdx)
-       LOAD_STRING_ARG0(mutex_interlock_destroyed_str)
-       CALL_PANIC()
-       
-
-.data
-mutex_assert_not_owned_str:
-       .asciz  "mutex (%p) not owned\n"
-mutex_assert_owned_str:
-       .asciz  "mutex (%p) owned\n"
-mutex_interlock_destroyed_str:
-       .asciz  "trying to interlock destroyed mutex (%p)"
-.text
-
-
-
-/*
- * lck_mtx_lock()
- * lck_mtx_try_lock()
- * lck_mtx_unlock()
- * lck_mtx_lock_spin()
- * lck_mtx_lock_spin_always()
- * lck_mtx_try_lock_spin()
- * lck_mtx_try_lock_spin_always()
- * lck_mtx_convert_spin()
- */
-NONLEAF_ENTRY(lck_mtx_lock_spin_always)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-       jmp     Llmls_avoid_check
-       
-NONLEAF_ENTRY(lck_mtx_lock_spin)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-
-       CHECK_PREEMPTION_LEVEL()
-Llmls_avoid_check:
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx  /* is the interlock or mutex held */
-       jnz     Llmls_slow
-Llmls_try:                             /* no - can't be INDIRECT, DESTROYED or locked */
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK | M_SPIN_MSK), %ecx
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     Llmls_busy_disabled
-
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of interlock */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:     
-#endif /* MACH_LDEBUG */
-
-       /* return with the interlock held and preemption disabled */
-       leave
-#if    CONFIG_DTRACE
-       LOCKSTAT_LABEL(_lck_mtx_lock_spin_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, %rdx)
-#endif
-       ret
-
-Llmls_slow:    
-       test    $M_ILOCKED_MSK, %ecx            /* is the interlock held */
-       jz      Llml_contended                  /* no, must have been the mutex */
-
-       cmp     $(MUTEX_DESTROYED), %ecx        /* check to see if its marked destroyed */
-       je      lck_mtx_destroyed
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex */
-       jne     Llmls_loop                      /* no... must be interlocked */
-
-       LMTX_ENTER_EXTENDED
-
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_SPIN_MSK), %ecx
-       jz      Llmls_loop1
-
-       LMTX_UPDATE_MISS                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
-Llmls_loop:
-       PAUSE
-       mov     M_STATE(%rdx), %ecx
-Llmls_loop1:
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx
-       jz      Llmls_try
-       test    $(M_MLOCKED_MSK), %ecx
-       jnz     Llml_contended                  /* mutex owned by someone else, go contend for it */
-       jmp     Llmls_loop
-
-Llmls_busy_disabled:
-       PREEMPTION_ENABLE
-       jmp     Llmls_loop
-
-
-       
-NONLEAF_ENTRY(lck_mtx_lock)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-
-       CHECK_PREEMPTION_LEVEL()
-
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx  /* is the interlock or mutex held */
-       jnz     Llml_slow
-Llml_try:                              /* no - can't be INDIRECT, DESTROYED or locked */
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     Llml_busy_disabled
-
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of mutex */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-       testl   $(M_WAITERS_MSK), M_STATE(%rdx)
-       jz      Llml_finish
-
-       LMTX_CALLEXT1(lck_mtx_lock_acquire_x86)
-
-Llml_finish:
-       andl    $(~M_ILOCKED_MSK), M_STATE(%rdx)
-       PREEMPTION_ENABLE
-       
-       cmp     %rdx, %rdi              /* is this an extended mutex */
-       jne     2f
-
-       leave
-#if    CONFIG_DTRACE
-       LOCKSTAT_LABEL(_lck_mtx_lock_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, %rdx)
-#endif
-       ret
-2:     
-       leave
-#if    CONFIG_DTRACE
-       LOCKSTAT_LABEL(_lck_mtx_lock_ext_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, %rdx)
-#endif
-       ret
-
-       
-Llml_slow:
-       test    $M_ILOCKED_MSK, %ecx            /* is the interlock held */
-       jz      Llml_contended                  /* no, must have been the mutex */
-       
-       cmp     $(MUTEX_DESTROYED), %ecx        /* check to see if its marked destroyed */
-       je      lck_mtx_destroyed
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex? */
-       jne     Llml_loop                       /* no... must be interlocked */
-
-       LMTX_ENTER_EXTENDED
-
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_SPIN_MSK), %ecx
-       jz      Llml_loop1
-
-       LMTX_UPDATE_MISS                /* M_SPIN_MSK was set, so M_ILOCKED_MSK must also be present */
-Llml_loop:
-       PAUSE
-       mov     M_STATE(%rdx), %ecx
-Llml_loop1:
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx
-       jz      Llml_try
-       test    $(M_MLOCKED_MSK), %ecx
-       jnz     Llml_contended                  /* mutex owned by someone else, go contend for it */
-       jmp     Llml_loop
-
-Llml_busy_disabled:
-       PREEMPTION_ENABLE
-       jmp     Llml_loop
-
-       
-Llml_contended:
-       cmp     %rdx, %rdi              /* is this an extended mutex */
-       je      0f
-       LMTX_UPDATE_MISS
-0:     
-       LMTX_CALLEXT1(lck_mtx_lock_spinwait_x86)
-
-       test    %rax, %rax
-       jz      Llml_acquired           /* acquired mutex, interlock held and preemption disabled */
-
-       cmp     $1, %rax                /* check for direct wait status */
-       je      2f
-       cmp     %rdx, %rdi              /* is this an extended mutex */
-       je      2f
-       LMTX_UPDATE_DIRECT_WAIT
-2:     
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_ILOCKED_MSK), %ecx
-       jnz     6f
-
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK), %ecx  /* try to take the interlock */
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     5f
-
-       test    $(M_MLOCKED_MSK), %ecx  /* we've got the interlock and */
-       jnz     3f
-       or      $(M_MLOCKED_MSK), %ecx  /* the mutex is free... grab it directly */
-       mov     %ecx, M_STATE(%rdx)
-       
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of mutex */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-Llml_acquired:
-       testl   $(M_WAITERS_MSK), M_STATE(%rdx)
-       jnz     1f
-       mov     M_OWNER(%rdx), %rax
-       mov     TH_WAS_PROMOTED_ON_WAKEUP(%rax), %eax
-       test    %eax, %eax
-       jz      Llml_finish
-1:     
-       LMTX_CALLEXT1(lck_mtx_lock_acquire_x86)
-       jmp     Llml_finish
-
-3:                                     /* interlock held, mutex busy */
-       cmp     %rdx, %rdi              /* is this an extended mutex */
-       je      4f
-       LMTX_UPDATE_WAIT
-4:     
-       LMTX_CALLEXT1(lck_mtx_lock_wait_x86)
-       jmp     Llml_contended
-5:     
-       PREEMPTION_ENABLE
-6:
-       PAUSE
-       jmp     2b
-       
-
-NONLEAF_ENTRY(lck_mtx_try_lock_spin_always)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-       jmp     Llmts_avoid_check
-
-NONLEAF_ENTRY(lck_mtx_try_lock_spin)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-
-Llmts_avoid_check:
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx  /* is the interlock or mutex held */
-       jnz     Llmts_slow
-Llmts_try:                             /* no - can't be INDIRECT, DESTROYED or locked */
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK | M_SPIN_MSK), %rcx
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     Llmts_busy_disabled
-
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of mutex */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-       leave
-
-#if    CONFIG_DTRACE
-       mov     $1, %rax                        /* return success */
-       LOCKSTAT_LABEL(_lck_mtx_try_lock_spin_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, %rdx)
-#endif
-       mov     $1, %rax                        /* return success */
-       ret
-
-Llmts_slow:
-       test    $(M_ILOCKED_MSK), %ecx  /* is the interlock held */
-       jz      Llmts_fail                      /* no, must be held as a mutex */
-
-       cmp     $(MUTEX_DESTROYED), %ecx        /* check to see if its marked destroyed */
-       je      lck_mtx_destroyed
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex? */
-       jne     Llmts_loop1
-
-       LMTX_ENTER_EXTENDED
-Llmts_loop:
-       PAUSE
-       mov     M_STATE(%rdx), %ecx
-Llmts_loop1:
-       test    $(M_MLOCKED_MSK | M_SPIN_MSK), %ecx
-       jnz     Llmts_fail
-       test    $(M_ILOCKED_MSK), %ecx
-       jz      Llmts_try
-       jmp     Llmts_loop
-       
-Llmts_busy_disabled:
-       PREEMPTION_ENABLE
-       jmp     Llmts_loop
-
-
-       
-NONLEAF_ENTRY(lck_mtx_try_lock)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-
-       mov     M_STATE(%rdx), %ecx
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx  /* is the interlock or mutex held */
-       jnz     Llmt_slow       
-Llmt_try:                              /* no - can't be INDIRECT, DESTROYED or locked */
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx
-       
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     Llmt_busy_disabled
-
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of mutex */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-       test    $(M_WAITERS_MSK), %ecx
-       jz      0f
-
-       LMTX_CALLEXT1(lck_mtx_lock_acquire_x86)
-0:
-       andl    $(~M_ILOCKED_MSK), M_STATE(%rdx)
-       PREEMPTION_ENABLE
-
-       leave
-#if    CONFIG_DTRACE
-       mov     $1, %rax                        /* return success */
-       /* Dtrace probe: LS_LCK_MTX_TRY_LOCK_ACQUIRE */
-       LOCKSTAT_LABEL(_lck_mtx_try_lock_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx from above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, %rdx)
-#endif 
-       mov     $1, %rax                        /* return success */
-       ret
-
-Llmt_slow:
-       test    $(M_ILOCKED_MSK), %ecx  /* is the interlock held */
-       jz      Llmt_fail                       /* no, must be held as a mutex */
-
-       cmp     $(MUTEX_DESTROYED), %ecx        /* check to see if its marked destroyed */
-       je      lck_mtx_destroyed
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex? */
-       jne     Llmt_loop
-
-       LMTX_ENTER_EXTENDED
-Llmt_loop:
-       PAUSE
-       mov     M_STATE(%rdx), %ecx
-Llmt_loop1:
-       test    $(M_MLOCKED_MSK | M_SPIN_MSK), %ecx
-       jnz     Llmt_fail
-       test    $(M_ILOCKED_MSK), %ecx
-       jz      Llmt_try
-       jmp     Llmt_loop
-
-Llmt_busy_disabled:
-       PREEMPTION_ENABLE
-       jmp     Llmt_loop
-
-
-Llmt_fail:
-Llmts_fail:
-       cmp     %rdx, %rdi                      /* is this an extended mutex */
-       je      0f
-       LMTX_UPDATE_MISS
-0:
-       xor     %rax, %rax
-       NONLEAF_RET
-
-
-
-NONLEAF_ENTRY(lck_mtx_convert_spin)
-       mov     %rdi, %rdx                      /* fetch lock pointer */
-
-       mov     M_STATE(%rdx), %ecx
-       cmp     $(MUTEX_IND), %ecx              /* Is this an indirect mutex? */
-       jne     0f
-       mov     M_PTR(%rdx), %rdx               /* If so, take indirection */
-       mov     M_STATE(%rdx), %ecx
-0:
-       test    $(M_MLOCKED_MSK), %ecx          /* already owned as a mutex, just return */
-       jnz     2f
-       test    $(M_WAITERS_MSK), %ecx          /* are there any waiters? */
-       jz      1f
-
-       LMTX_CALLEXT1(lck_mtx_lock_acquire_x86)
-       mov     M_STATE(%rdx), %ecx
-1:     
-       and     $(~(M_ILOCKED_MSK | M_SPIN_MSK)), %ecx  /* convert from spin version to mutex */
-       or      $(M_MLOCKED_MSK), %ecx
-       mov     %ecx, M_STATE(%rdx)             /* since I own the interlock, I don't need an atomic update */
-
-       PREEMPTION_ENABLE
-2:     
-       NONLEAF_RET
-
-       
-
-NONLEAF_ENTRY(lck_mtx_unlock)
-       mov     %rdi, %rdx              /* fetch lock pointer */
-Llmu_entry:
-       mov     M_STATE(%rdx), %ecx
-Llmu_prim:
-       cmp     $(MUTEX_IND), %ecx      /* Is this an indirect mutex? */
-       je      Llmu_ext
-
-Llmu_chktype:
-       test    $(M_MLOCKED_MSK), %ecx  /* check for full mutex */
-       jz      Llmu_unlock
-Llmu_mutex:
-       test    $(M_ILOCKED_MSK), %rcx  /* have to wait for interlock to clear */
-       jnz     Llmu_busy
-
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       and     $(~M_MLOCKED_MSK), %ecx /* drop mutex */
-       or      $(M_ILOCKED_MSK), %ecx  /* pick up interlock */
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     Llmu_busy_disabled      /* branch on failure to spin loop */
-
-Llmu_unlock:
-       xor     %rax, %rax
-       mov     %rax, M_OWNER(%rdx)
-       mov     %rcx, %rax              /* keep original state in %ecx for later evaluation */
-       and     $(~(M_ILOCKED_MSK | M_SPIN_MSK | M_PROMOTED_MSK)), %rax
-
-       test    $(M_WAITERS_MSK), %eax
-       jz      2f
-       dec     %eax                    /* decrement waiter count */
-2:     
-       mov     %eax, M_STATE(%rdx)     /* since I own the interlock, I don't need an atomic update */
-
-#if    MACH_LDEBUG
-       /* perform lock statistics after drop to prevent delay */
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       test    %rax, %rax
-       jz      1f
-       decl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-       test    $(M_PROMOTED_MSK | M_WAITERS_MSK), %ecx
-       jz      3f
-
-       LMTX_CALLEXT2(lck_mtx_unlock_wakeup_x86, %rcx)
-3:     
-       PREEMPTION_ENABLE
-
-       cmp     %rdx, %rdi
-       jne     4f
-
-       leave
-#if    CONFIG_DTRACE
-       /* Dtrace: LS_LCK_MTX_UNLOCK_RELEASE */
-       LOCKSTAT_LABEL(_lck_mtx_unlock_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx from above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, %rdx)
-#endif
-       ret
-4:     
-       leave
-#if    CONFIG_DTRACE
-       /* Dtrace: LS_LCK_MTX_EXT_UNLOCK_RELEASE */
-       LOCKSTAT_LABEL(_lck_mtx_ext_unlock_lockstat_patch_point)
-       ret
-       /* inherit lock pointer in %rdx from above */
-       LOCKSTAT_RECORD(LS_LCK_MTX_EXT_UNLOCK_RELEASE, %rdx)
-#endif
-       ret
-
-
-Llmu_busy_disabled:
-       PREEMPTION_ENABLE
-Llmu_busy:
-       PAUSE
-       mov     M_STATE(%rdx), %ecx
-       jmp     Llmu_mutex
-
-Llmu_ext:
-       mov     M_PTR(%rdx), %rdx
-       mov     M_OWNER(%rdx), %rax
-       mov     %gs:CPU_ACTIVE_THREAD, %rcx
-       CHECK_UNLOCK(%rcx, %rax)
-       mov     M_STATE(%rdx), %ecx
-       jmp     Llmu_chktype
-
-
-       
-LEAF_ENTRY(lck_mtx_ilk_try_lock)
-       mov     %rdi, %rdx              /* fetch lock pointer - no indirection here */
-
-       mov     M_STATE(%rdx), %ecx
-
-       test    $(M_ILOCKED_MSK), %ecx  /* can't have the interlock yet */
-       jnz     3f
-
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK), %ecx
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     2f                      /* return failure after re-enabling preemption */
-
-       mov     $1, %rax                /* return success with preemption disabled */
-       LEAF_RET
-2:     
-       PREEMPTION_ENABLE               /* need to re-enable preemption */
-3:     
-       xor     %rax, %rax              /* return failure */
-       LEAF_RET
-       
-
-LEAF_ENTRY(lck_mtx_ilk_unlock)
-       mov     %rdi, %rdx              /* fetch lock pointer - no indirection here */
-
-       andl    $(~M_ILOCKED_MSK), M_STATE(%rdx)
-
-       PREEMPTION_ENABLE               /* need to re-enable preemption */
-
-       LEAF_RET
-
-       
-LEAF_ENTRY(lck_mtx_lock_grab_mutex)
-       mov     %rdi, %rdx              /* fetch lock pointer - no indirection here */
-
-       mov     M_STATE(%rdx), %ecx
-
-       test    $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx  /* can't have the mutex yet */
-       jnz     3f
-
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK | M_MLOCKED_MSK), %ecx
-
-       PREEMPTION_DISABLE
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     2f                              /* branch on failure to spin loop */
-
-       mov     %gs:CPU_ACTIVE_THREAD, %rax
-       mov     %rax, M_OWNER(%rdx)     /* record owner of mutex */
-#if    MACH_LDEBUG
-       test    %rax, %rax
-       jz      1f
-       incl    TH_MUTEX_COUNT(%rax)    /* lock statistic */
-1:
-#endif /* MACH_LDEBUG */
-
-       mov     $1, %rax                /* return success */
-       LEAF_RET
-2:                                             
-       PREEMPTION_ENABLE
-3:
-       xor     %rax, %rax      /* return failure */
-       LEAF_RET
-       
-
-
-LEAF_ENTRY(lck_mtx_lock_mark_destroyed)
-       mov     %rdi, %rdx
-1:
-       mov     M_STATE(%rdx), %ecx
-       cmp     $(MUTEX_IND), %ecx      /* Is this an indirect mutex? */
-       jne     2f
-
-       movl    $(MUTEX_DESTROYED), M_STATE(%rdx)       /* convert to destroyed state */
-       jmp     3f
-2:     
-       test    $(M_ILOCKED_MSK), %rcx  /* have to wait for interlock to clear */
-       jnz     5f
-
-       PREEMPTION_DISABLE
-       mov     %rcx, %rax              /* eax contains snapshot for cmpxchgl */
-       or      $(M_ILOCKED_MSK), %ecx
-       lock
-       cmpxchg %ecx, M_STATE(%rdx)     /* atomic compare and exchange */
-       jne     4f                      /* branch on failure to spin loop */
-       movl    $(MUTEX_DESTROYED), M_STATE(%rdx)       /* convert to destroyed state */
-       PREEMPTION_ENABLE
-3:
-       LEAF_RET                        /* return with M_ILOCKED set */
-4:
-       PREEMPTION_ENABLE
-5:
-       PAUSE
-       jmp     1b
 
 LEAF_ENTRY(preemption_underflow_panic)
        FRAME
@@ -1093,4 +160,3 @@ LEAF_ENTRY(preemption_underflow_panic)
 16:    String  "Preemption level underflow, possible cause unlocking an unlocked mutex or spinlock"
        .text
 
-
index e623d60045dd53e9ab3c37205549c9e76db3a889..9b74e8d449911ea58db3ba8e0e8edfa96ff2f799 100644 (file)
@@ -113,6 +113,9 @@ vm_offset_t vm_prelink_einfo;
 vm_offset_t vm_slinkedit;
 vm_offset_t vm_elinkedit;
 
+vm_offset_t vm_kernel_builtinkmod_text;
+vm_offset_t vm_kernel_builtinkmod_text_end;
+
 #define MAXLORESERVE   (32 * 1024 * 1024)
 
 ppnum_t                max_ppnum = 0;
index 3d337a1c8c0a784df73add40d9c49ae92f645d7b..9bdd394cf78bf54cb36753c6ff4b3ace92d29879 100644 (file)
@@ -31,6 +31,7 @@
 
 #include <sys/appleapiopts.h>
 #include <kern/kern_types.h>
+#include <kern/assert.h>
 
 #ifdef MACH_KERNEL_PRIVATE
 
@@ -100,23 +101,39 @@ typedef struct _lck_mtx_ {
        };
 } lck_mtx_t;
 
+#define LCK_MTX_WAITERS_MSK            0x0000ffff
+#define LCK_MTX_WAITER                 0x00000001
+#define LCK_MTX_PRIORITY_MSK           0x00ff0000
+#define LCK_MTX_ILOCKED_MSK            0x01000000
+#define LCK_MTX_MLOCKED_MSK            0x02000000
+#define LCK_MTX_PROMOTED_MSK           0x04000000
+#define LCK_MTX_SPIN_MSK               0x08000000
+
 /* This pattern must subsume the interlocked, mlocked and spin bits */
 #define        LCK_MTX_TAG_INDIRECT                    0x07ff1007      /* lock marked as Indirect  */
 #define        LCK_MTX_TAG_DESTROYED                   0x07fe2007      /* lock marked as Destroyed */
 
 /* Adaptive spin before blocking */
 extern uint64_t        MutexSpin;
-extern int             lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex);
-extern void            lck_mtx_lock_wait_x86(lck_mtx_t *mutex);
-extern void            lck_mtx_lock_acquire_x86(lck_mtx_t *mutex);
-extern void            lck_mtx_unlock_wakeup_x86(lck_mtx_t *mutex, int prior_lock_state);
-
-extern void            lck_mtx_lock_mark_destroyed(lck_mtx_t *mutex);
-extern int             lck_mtx_lock_grab_mutex(lck_mtx_t *mutex);
 
-extern void            hw_lock_byte_init(volatile uint8_t *lock_byte);
-extern void            hw_lock_byte_lock(volatile uint8_t *lock_byte);
-extern void            hw_lock_byte_unlock(volatile uint8_t *lock_byte);
+typedef enum lck_mtx_spinwait_ret_type {
+       LCK_MTX_SPINWAIT_ACQUIRED = 0,
+       LCK_MTX_SPINWAIT_SPUN = 1,
+       LCK_MTX_SPINWAIT_NO_SPIN = 2,
+} lck_mtx_spinwait_ret_type_t;
+
+extern lck_mtx_spinwait_ret_type_t             lck_mtx_lock_spinwait_x86(lck_mtx_t *mutex);
+extern void                                    lck_mtx_lock_wait_x86(lck_mtx_t *mutex);
+extern void                                    lck_mtx_lock_acquire_x86(lck_mtx_t *mutex);
+
+extern void                                    lck_mtx_lock_slow(lck_mtx_t *lock);
+extern boolean_t                               lck_mtx_try_lock_slow(lck_mtx_t *lock);
+extern void                                    lck_mtx_unlock_slow(lck_mtx_t *lock);
+extern void                                    lck_mtx_lock_spin_slow(lck_mtx_t *lock);
+extern boolean_t                               lck_mtx_try_lock_spin_slow(lck_mtx_t *lock);
+extern void                                    hw_lock_byte_init(volatile uint8_t *lock_byte);
+extern void                                    hw_lock_byte_lock(volatile uint8_t *lock_byte);
+extern void                                    hw_lock_byte_unlock(volatile uint8_t *lock_byte);
 
 typedef struct {
        unsigned int            type;
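With the lck_mtx_spinwait_ret_type_t enum above, lck_mtx_lock_spinwait_x86() now returns a typed result instead of a bare int, distinguishing acquiring the lock while spinning, spinning out, and not spinning at all because the owner is off-core. A hedged sketch of how a caller in the C slow path might consume it (illustrative only):

/* Illustrative caller; the surrounding accounting is hypothetical. */
static void
example_after_spinwait(lck_mtx_t *mutex)
{
        switch (lck_mtx_lock_spinwait_x86(mutex)) {
        case LCK_MTX_SPINWAIT_ACQUIRED:
                /* acquired the mutex while adaptively spinning */
                break;
        case LCK_MTX_SPINWAIT_SPUN:
                /* spun without acquiring; proceed to block on the mutex */
                break;
        case LCK_MTX_SPINWAIT_NO_SPIN:
                /* owner was not running on another CPU, so no spin was
                 * attempted; block directly (a "direct wait") */
                break;
        }
}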
@@ -176,7 +193,6 @@ typedef struct __lck_mtx_ext_t__    lck_mtx_ext_t;
 #endif
 
 #ifdef MACH_KERNEL_PRIVATE
-#pragma pack(1)                /* Make sure the structure stays as we defined it */
 typedef union _lck_rw_t_internal_ {
        struct {
                volatile uint16_t       lck_rw_shared_count;    /* No. of accepted readers */
@@ -199,7 +215,9 @@ typedef union _lck_rw_t_internal_ {
                uint32_t                lck_rw_pad12;
        };
 } lck_rw_t;
-#pragma pack()
+#define LCK_RW_T_SIZE          16
+
+static_assert(sizeof(lck_rw_t) == LCK_RW_T_SIZE);
 
 #define LCK_RW_SHARED_SHIFT     0
 #define LCK_RW_INTERLOCK_BIT   16
@@ -244,6 +262,7 @@ typedef union _lck_rw_t_internal_ {
 #if LOCK_PRIVATE
 
 #define disable_preemption_for_thread(t) ((cpu_data_t GS_RELATIVE *)0UL)->cpu_preemption_level++
+#define preemption_disabled_for_thread(t) (((cpu_data_t GS_RELATIVE *)0UL)->cpu_preemption_level > 0)
 
 #define LCK_MTX_THREAD_TO_STATE(t)     ((uintptr_t)t)
 #define PLATFORM_LCK_ILOCK             0
@@ -274,5 +293,4 @@ typedef struct __lck_rw_t__ lck_rw_t;
 extern void            kernel_preempt_check (void);
 
 #endif /* MACH_KERNEL_PRIVATE */
-
 #endif /* _I386_LOCKS_H_ */
index 039584749b070f6c7c6d286f792fb484cef84892..bc1669f7f48473427b74aa88893b28d28c1aaae9 100644 (file)
@@ -61,6 +61,9 @@
  *     Locking primitives implementation
  */
 
+#define ATOMIC_PRIVATE 1
+#define LOCK_PRIVATE 1
+
 #include <mach_ldebug.h>
 
 #include <kern/locks.h>
@@ -79,9 +82,9 @@
 #include <machine/atomic.h>
 #include <machine/machine_cpu.h>
 #include <i386/mp.h>
-
+#include <machine/atomic.h>
 #include <sys/kdebug.h>
-#include <mach/branch_predicates.h>
+#include <i386/locks_i386_inlines.h>
 
 /*
  * We need only enough declarations from the BSD-side to be able to
@@ -160,14 +163,6 @@ typedef void       *pc_t;
 #endif /* lint */
 #endif /* USLOCK_DEBUG */
 
-// Enforce program order of loads and stores.
-#define ordered_load(target) _Generic( (target),\
-               uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
-               uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
-#define ordered_store(target, value) _Generic( (target),\
-               uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_relaxed), \
-               uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_relaxed) )
-
 /*
  * atomic exchange API is a low level abstraction of the operations
  * to atomically read, modify, and write a pointer.  This abstraction works
@@ -235,7 +230,6 @@ int         usld_lock_common_checks(usimple_lock_t, char *);
 #define        USLDBG(stmt)
 #endif /* USLOCK_DEBUG */
 
-
 /*
  * Forward definitions
  */
@@ -250,6 +244,14 @@ void lck_rw_clear_promotions_x86(thread_t thread);
 static boolean_t lck_rw_held_read_or_upgrade(lck_rw_t *lock);
 static boolean_t lck_rw_grab_want(lck_rw_t *lock);
 static boolean_t lck_rw_grab_shared(lck_rw_t *lock);
+static void lck_mtx_unlock_wakeup_tail(lck_mtx_t *mutex, int prior_lock_state, boolean_t indirect);
+static void lck_mtx_interlock_lock(lck_mtx_t *mutex, uint32_t *new_state);
+static void lck_mtx_interlock_lock_clear_flags(lck_mtx_t *mutex, uint32_t and_flags, uint32_t *new_state);
+static int lck_mtx_interlock_try_lock(lck_mtx_t *mutex, uint32_t *new_state);
+static int lck_mtx_interlock_try_lock_set_flags(lck_mtx_t *mutex, uint32_t or_flags, uint32_t *new_state);
+static boolean_t lck_mtx_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
+static boolean_t lck_mtx_try_lock_wait_interlock_to_clear(lck_mtx_t *lock, uint32_t *new_state);
+
 
 /*
  *      Routine:        lck_spin_alloc_init
@@ -1030,7 +1032,7 @@ static void
 lck_rw_lock_exclusive_gen(
        lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        uint64_t        deadline = 0;
        int             slept = 0;
        int             gotlock = 0;
@@ -1096,7 +1098,8 @@ lck_rw_lock_exclusive_gen(
                                lck->lck_w_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
-                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1175,7 +1178,8 @@ lck_rw_lock_exclusive_gen(
                                lck->lck_w_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockWrite);
-                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1325,7 +1329,7 @@ lck_rw_done_gen(
 #endif
        if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                /* sched_flags checked without lock, but will be rechecked while clearing */
-               lck_rw_clear_promotion(thread);
+               lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
        }
 
 #if CONFIG_DTRACE
@@ -1440,7 +1444,7 @@ static void
 lck_rw_lock_shared_gen(
        lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        uint64_t        deadline = 0;
        int             gotlock = 0;
        int             slept = 0;
@@ -1506,7 +1510,8 @@ lck_rw_lock_shared_gen(
                                lck->lck_r_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockRead);
-                               res = assert_wait(RW_LOCK_READER_EVENT(lck), THREAD_UNINT);
+                               res = assert_wait(RW_LOCK_READER_EVENT(lck),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1634,7 +1639,7 @@ lck_rw_lock_shared_to_exclusive_failure(
 
        if ((rwlock_count == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                /* sched_flags checked without lock, but will be rechecked while clearing */
-               lck_rw_clear_promotion(thread);
+               lck_rw_clear_promotion(thread, unslide_for_kdebug(lck));
        }
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_RW_LCK_SH_TO_EX_CODE) | DBG_FUNC_NONE,
@@ -1656,7 +1661,7 @@ static boolean_t
 lck_rw_lock_shared_to_exclusive_success(
        lck_rw_t        *lck)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        uint64_t        deadline = 0;
        int             slept = 0;
        int             still_shared = 0;
@@ -1720,7 +1725,8 @@ lck_rw_lock_shared_to_exclusive_success(
                                lck->lck_w_waiting = TRUE;
 
                                thread_set_pending_block_hint(current_thread(), kThreadWaitKernelRWLockUpgrade);
-                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck), THREAD_UNINT);
+                               res = assert_wait(RW_LOCK_WRITER_EVENT(lck),
+                                               THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
                                lck_interlock_unlock(lck, istate);
 
                                if (res == THREAD_WAITING) {
@@ -1796,7 +1802,7 @@ lck_rw_lock_exclusive_to_shared_gen(
        lck_rw_t        *lck,
        uint32_t        prior_lock_state)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
        lck_rw_t                *fake_lck;
 
        fake_lck = (lck_rw_t *)&prior_lock_state;
@@ -1956,7 +1962,7 @@ lck_rw_clear_promotions_x86(thread_t thread)
 #else
        /* Paper over the issue */
        thread->rwlock_count = 0;
-       lck_rw_clear_promotion(thread);
+       lck_rw_clear_promotion(thread, 0);
 #endif
 }
 
@@ -1987,10 +1993,77 @@ kdp_lck_rw_lock_is_acquired_exclusive(lck_rw_t *lck) {
        return ((lck->lck_rw_want_upgrade || lck->lck_rw_want_write) && (lck->lck_rw_shared_count == 0)) ? TRUE : FALSE;
 }
 
+/*
+ * Slow path routines for lck_mtx locking and unlocking functions.
+ *
+ * These functions were previously implemented in x86 assembly,
+ * and some optimizations are in place in this C code so that the compiled code
+ * is as performant and compact as the assembly version.
+ *
+ * To avoid inlining these functions into the fast path, every function directly called by
+ * the fast paths is marked __attribute__((noinline)). They are also implemented
+ * in such a way that the fast path can tail call into them, so the return address
+ * does not need to be pushed on the caller's stack and stack optimization can happen in the caller.
+ *
+ * Slow path code is structured so that there are no calls to functions that return
+ * into the context of the caller, i.e. every function called is either a tail-call function
+ * or an inline function. Tail-call functions take fewer than six arguments,
+ * so they can be passed in registers and do not need to be pushed on the stack.
+ * This allows the compiler to avoid creating a stack frame for these functions.
+ *
+ * __improbable and __probable are used to compile the slow path code in such a way
+ * that the fast path case runs on a sequence of instructions with as few jumps as possible,
+ * keeping that case the most optimized even when it falls through the slow path.
+ */
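To illustrate the noinline/tail-call structure described above outside the kernel, here is a minimal user-space C sketch; the names example_lock, example_lock_slow and the one-bit lock word are hypothetical stand-ins for the real fast/slow path split, and __builtin_expect stands in for __probable/__improbable:

#include <stdatomic.h>

/* Hypothetical one-bit lock word: bit 0 set means "locked". */
typedef _Atomic unsigned int example_lock_t;

/* Slow path: noinline keeps the fast path small and lets it tail call in. */
__attribute__((noinline))
static void
example_lock_slow(example_lock_t *l)
{
	unsigned int expected = 0;
	/* Spin until the CAS succeeds; a real slow path would block instead. */
	while (!atomic_compare_exchange_weak_explicit(l, &expected, 1,
	    memory_order_acquire, memory_order_relaxed)) {
		expected = 0;
	}
}

/* Fast path: one CAS; on contention the call below is the last thing the
 * function does, so the compiler can emit it as a tail call (a jmp), and no
 * return address or stack frame is kept alive for the slow path. */
static inline void
example_lock(example_lock_t *l)
{
	unsigned int expected = 0;
	if (__builtin_expect(atomic_compare_exchange_strong_explicit(l, &expected, 1,
	    memory_order_acquire, memory_order_relaxed), 1))
		return;
	example_lock_slow(l);
}

static example_lock_t example_word;

void
example_usage(void)
{
	example_lock(&example_word);
	/* ... critical section ... */
	atomic_store_explicit(&example_word, 0, memory_order_release);	/* unlock */
}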
+
+/*
+ * Intel lock invariants:
+ *
+ * lck_mtx_waiters: contains the count of threads currently in the mutex waitqueue
+ * lck_mtx_pri: contains the max priority of all waiters during a contention period
+ *      not cleared on last unlock, but stomped over on next first contention
+ * lck_mtx_promoted: set when the current lock owner has been promoted
+ *      cleared when lock owner unlocks, set on acquire or wait.
+ *
+ * The lock owner is promoted to the max priority of all its waiters only if it
+ * had a lower priority when it acquired the lock, or was already the owner when a waiter waited.
+ * Max priority is capped at MAXPRI_PROMOTE.
+ *
+ * The last waiter will not be promoted as it is woken up, but the last
+ * lock owner may not have been the last thread to have been woken up depending on the
+ * luck of the draw.  Therefore a last-owner may still have the promoted-on-wakeup
+ * flag set.
+ *
+ * TODO: Figure out an algorithm for stopping a lock holder which is already at the right
+ *       priority from dropping priority in the future without having to take thread lock
+ *       on acquire.
+ */
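As a purely illustrative restatement of the invariants above, a user-space model with the corresponding assertions might look like the following; the struct, field names and the MAXPRI_PROMOTE stand-in are hypothetical, not the kernel's definitions:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_MAXPRI_PROMOTE 95	/* stand-in for MAXPRI_PROMOTE */

struct example_mtx_state {
	uint32_t waiters;	/* count of threads currently on the mutex waitqueue */
	uint32_t pri;		/* max waiter priority during a contention period */
	bool     promoted;	/* current owner has been promoted for this mutex */
	bool     owned;		/* mutex currently has an owner */
};

void
example_check_invariants(const struct example_mtx_state *m)
{
	/* promotions never push the owner above the promotion ceiling */
	assert(m->pri <= EXAMPLE_MAXPRI_PROMOTE);
	/* a set promoted bit implies there is an owner holding the promotion */
	assert(!m->promoted || m->owned);
	/* note: pri is not cleared on the last unlock; it is only overwritten
	 * ("stomped over") by the first waiter of the next contention period */
}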
 
 #ifdef MUTEX_ZONE
 extern zone_t lck_mtx_zone;
 #endif
+
+/*
+ * N.B.: On x86, statistics are currently recorded for all indirect mutexes.
+ * Also, only the acquire attempt count (GRP_MTX_STAT_UTIL) is maintained
+ * as a 64-bit quantity (the new x86-specific statistics are maintained
+ * as 32-bit quantities).
+ *
+ *
+ * Enable this preprocessor define to record only the first miss.
+ * By default, we count every miss, hence multiple misses may be
+ * recorded for a single lock acquire attempt via lck_mtx_lock.
+ */
+#undef LOG_FIRST_MISS_ALONE
+
+/*
+ * This preprocessor define controls whether the R-M-W update of the
+ * per-group statistics elements are atomic (LOCK-prefixed)
+ * Enabled by default.
+ */
+#define ATOMIC_STAT_UPDATES 1
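A minimal sketch of what these two knobs select between, using plain C11 atomics and illustrative names (the real counters live in the lock group, as the update helpers later in this file show):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t example_grp_miss;

/* ATOMIC_STAT_UPDATES: the counter bump is a LOCK-prefixed read-modify-write. */
static inline void
example_miss_atomic(void)
{
	atomic_fetch_add_explicit(&example_grp_miss, 1, memory_order_relaxed);
}

/* LOG_FIRST_MISS_ALONE: only the first miss of a given acquire attempt counts;
 * without it, every retry of the same lck_mtx_lock call bumps the counter. */
static inline void
example_update_miss(int *first_miss)
{
	if ((*first_miss & 1) == 0) {
		example_miss_atomic();
		*first_miss |= 1;
	}
}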
+
+
 /*
  *      Routine:        lck_mtx_alloc_init
  */
@@ -2114,6 +2187,27 @@ lck_mtx_init_ext(
        lck_grp_lckcnt_incr(grp, LCK_TYPE_MTX);
 }
 
+static void
+lck_mtx_lock_mark_destroyed(
+       lck_mtx_t *mutex,
+       boolean_t indirect)
+{
+       uint32_t state;
+
+       if (indirect) {
+               /* convert to destroyed state */
+               ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
+               return;
+       }
+
+       state = ordered_load_mtx_state(mutex);
+       lck_mtx_interlock_lock(mutex, &state);
+
+       ordered_store_mtx_state_release(mutex, LCK_MTX_TAG_DESTROYED);
+
+       enable_preemption();
+}
+
 /*
  *      Routine:        lck_mtx_destroy
  */
@@ -2122,18 +2216,18 @@ lck_mtx_destroy(
        lck_mtx_t       *lck,
        lck_grp_t       *grp)
 {
-       boolean_t lck_is_indirect;
+       boolean_t indirect;
        
        if (lck->lck_mtx_tag == LCK_MTX_TAG_DESTROYED)
                return;
 #if MACH_LDEBUG
        lck_mtx_assert(lck, LCK_MTX_ASSERT_NOTOWNED);
 #endif
-       lck_is_indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
+       indirect = (lck->lck_mtx_tag == LCK_MTX_TAG_INDIRECT);
 
-       lck_mtx_lock_mark_destroyed(lck);
+       lck_mtx_lock_mark_destroyed(lck, indirect);
 
-       if (lck_is_indirect)
+       if (indirect)
                kfree(lck->lck_mtx_ptr, sizeof(lck_mtx_ext_t));
        lck_grp_lckcnt_decr(grp, LCK_TYPE_MTX);
        lck_grp_deallocate(grp);
@@ -2141,29 +2235,133 @@ lck_mtx_destroy(
 }
 
 
+#if DEVELOPMENT | DEBUG
+__attribute__((noinline))
+void
+lck_mtx_owner_check_panic(
+       lck_mtx_t       *lock)
+{
+       thread_t owner = (thread_t)lock->lck_mtx_owner;
+       panic("Mutex unlock attempted from non-owner thread. Owner=%p lock=%p", owner, lock);
+}
+#endif
+
+__attribute__((always_inline))
+static boolean_t
+get_indirect_mutex(
+       lck_mtx_t       **lock,
+       uint32_t        *state)
+{
+       *lock = &((*lock)->lck_mtx_ptr->lck_mtx);
+       *state = ordered_load_mtx_state(*lock);
+       return TRUE;
+}
+
+/*
+ * Routine:    lck_mtx_unlock_slow
+ *
+ * Unlocks a mutex held by current thread.
+ *
+ * It will wake up waiters if necessary and
+ * drop promotions.
+ *
+ * Interlock can be held.
+ */
+__attribute__((noinline))
+void
+lck_mtx_unlock_slow(
+       lck_mtx_t       *lock)
+{
+       thread_t        thread;
+       uint32_t        state, prev;
+       boolean_t       indirect = FALSE;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* Is this an indirect mutex? */
+       if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+               indirect = get_indirect_mutex(&lock, &state);
+       }
+
+       thread = current_thread();
+
+#if DEVELOPMENT | DEBUG
+       thread_t owner = (thread_t)lock->lck_mtx_owner;
+       if(__improbable(owner != thread))
+               return lck_mtx_owner_check_panic(lock);
+#endif
+
+       /* check if it is held as a spinlock */
+       if (__improbable((state & LCK_MTX_MLOCKED_MSK) == 0))
+               goto unlock;
+
+       lck_mtx_interlock_lock_clear_flags(lock, LCK_MTX_MLOCKED_MSK, &state);
+
+unlock:
+       /* preemption disabled, interlock held and mutex not held */
+
+       /* clear owner */
+       ordered_store_mtx_owner(lock, 0);
+       /* keep original state in prev for later evaluation */
+       prev = state;
+       /* release the interlock and clear the promotion and spin flags */
+       state &= (~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK | LCK_MTX_PROMOTED_MSK));
+       if ((state & LCK_MTX_WAITERS_MSK))
+               state -= LCK_MTX_WAITER;        /* decrement waiter count */
+       ordered_store_mtx_state_release(lock, state);           /* since I own the interlock, I don't need an atomic update */
+
+#if    MACH_LDEBUG
+       /* perform lock statistics after drop to prevent delay */
+       if (thread)
+               thread->mutex_count--;          /* lock statistic */
+#endif /* MACH_LDEBUG */
+
+       /* check if there are waiters to wake up or priority to drop */
+       if ((prev & (LCK_MTX_PROMOTED_MSK | LCK_MTX_WAITERS_MSK)))
+               return lck_mtx_unlock_wakeup_tail(lock, prev, indirect);
+
+       /* re-enable preemption */
+       lck_mtx_unlock_finish_inline(lock, FALSE);
+
+       return;
+}
+
 #define        LCK_MTX_LCK_WAIT_CODE           0x20
 #define        LCK_MTX_LCK_WAKEUP_CODE         0x21
 #define        LCK_MTX_LCK_SPIN_CODE           0x22
 #define        LCK_MTX_LCK_ACQUIRE_CODE        0x23
 #define LCK_MTX_LCK_DEMOTE_CODE                0x24
 
-
 /*
- * Routine:    lck_mtx_unlock_wakeup_x86
+ * Routine:    lck_mtx_unlock_wakeup_tail
  *
- * Invoked on unlock when there is 
- * contention (i.e. the assembly routine sees that
- * that mutex->lck_mtx_waiters != 0 or 
- * that mutex->lck_mtx_promoted != 0...
+ * Invoked on unlock when there is
+ * contention, i.e. the assembly routine sees
+ * that mutex->lck_mtx_waiters != 0 or
+ * that mutex->lck_mtx_promoted != 0
  *
  * neither the mutex nor the interlock is held
+ *
+ * Note that this routine might not be called if there are pending
+ * waiters which have previously been woken up, and they didn't
+ * end up boosting the old owner.
+ *
+ * assembly routine previously did the following to mutex:
+ * (after saving the state in prior_lock_state)
+ *      cleared lck_mtx_promoted
+ *      decremented lck_mtx_waiters if nonzero
+ *
+ * This function needs to be called as a tail call
+ * to optimize the compiled code.
  */
-void
-lck_mtx_unlock_wakeup_x86 (
+__attribute__((noinline))
+static void
+lck_mtx_unlock_wakeup_tail (
        lck_mtx_t       *mutex,
-       int             prior_lock_state)
+       int             prior_lock_state,
+       boolean_t       indirect)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
        lck_mtx_t               fake_lck;
 
        /*
@@ -2175,56 +2373,50 @@ lck_mtx_unlock_wakeup_x86 (
        fake_lck.lck_mtx_state = prior_lock_state;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_START,
-                    trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
+                    trace_lck, fake_lck.lck_mtx_promoted, fake_lck.lck_mtx_waiters, fake_lck.lck_mtx_pri, 0);
 
        if (__probable(fake_lck.lck_mtx_waiters)) {
+               kern_return_t did_wake;
+
                if (fake_lck.lck_mtx_waiters > 1)
-                       thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
+                       did_wake = thread_wakeup_one_with_pri(LCK_MTX_EVENT(mutex), fake_lck.lck_mtx_pri);
                else
-                       thread_wakeup_one(LCK_MTX_EVENT(mutex));
+                       did_wake = thread_wakeup_one(LCK_MTX_EVENT(mutex));
+               /*
+                * The waiters count always precisely matches the number of threads on the waitqueue.
+                * i.e. we should never see ret == KERN_NOT_WAITING.
+                * i.e. we should never see did_wake == KERN_NOT_WAITING.
+               assert(did_wake == KERN_SUCCESS);
        }
 
+       /* If lck_mtx_promoted was set, then the owner (this thread) definitely holds a promotion */
        if (__improbable(fake_lck.lck_mtx_promoted)) {
-               thread_t        thread = current_thread();
+               thread_t thread = current_thread();
 
+               spl_t s = splsched();
+               thread_lock(thread);
 
                KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_DEMOTE_CODE) | DBG_FUNC_NONE,
-                            thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
+                            thread_tid(thread), thread->promotions, thread->sched_flags & TH_SFLAG_PROMOTED, 0, 0);
+               assert(thread->was_promoted_on_wakeup == 0);
+               assert(thread->promotions > 0);
 
-               if (thread->promotions > 0) {
-                       spl_t   s = splsched();
+               assert_promotions_invariant(thread);
 
-                       thread_lock(thread);
+               if (--thread->promotions == 0)
+                       sched_thread_unpromote(thread, trace_lck);
 
-                       if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED)) {
+               assert_promotions_invariant(thread);
 
-                               thread->sched_flags &= ~TH_SFLAG_PROMOTED;
-
-                               if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
-                                       /* Thread still has a RW lock promotion */
-                               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
-                                                             thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
-
-                                       set_sched_pri(thread, DEPRESSPRI);
-                               }
-                               else {
-                                       if (thread->base_pri < thread->sched_pri) {
-                                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_DEMOTE) | DBG_FUNC_NONE,
-                                                                     thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
-
-                                               thread_recompute_sched_pri(thread, FALSE);
-                                       }
-                               }
-                       }
-                       thread_unlock(thread);
-                       splx(s);
-               }
+               thread_unlock(thread);
+               splx(s);
        }
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAKEUP_CODE) | DBG_FUNC_END,
-                    trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
-}
+                 trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 
+       lck_mtx_unlock_finish_inline(mutex, indirect);
+}
 
 /*
  * Routine:    lck_mtx_lock_acquire_x86
@@ -2236,14 +2428,13 @@ lck_mtx_unlock_wakeup_x86 (
  *
  * mutex is owned...  interlock is held... preemption is disabled
  */
-void
-lck_mtx_lock_acquire_x86(
+__attribute__((always_inline))
+static void
+lck_mtx_lock_acquire_inline(
        lck_mtx_t       *mutex)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
-       thread_t                thread;
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
        integer_t               priority;
-       spl_t                   s;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_ACQUIRE_CODE) | DBG_FUNC_START,
                     trace_lck, thread->was_promoted_on_wakeup, mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
@@ -2251,31 +2442,49 @@ lck_mtx_lock_acquire_x86(
        if (mutex->lck_mtx_waiters)
                priority = mutex->lck_mtx_pri;
        else
-               priority = 0;
+               priority = 0; /* not worth resetting lck_mtx_pri here, it will be reset by next waiter */
 
-       thread = (thread_t)mutex->lck_mtx_owner;        /* faster then current_thread() */
+       /* the priority must have been set correctly by wait */
+       assert(priority <= MAXPRI_PROMOTE);
+       assert(priority == 0 || priority >= BASEPRI_DEFAULT);
 
-       if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
+       /* if the mutex wasn't owned, then the owner wasn't promoted */
+       assert(mutex->lck_mtx_promoted == 0);
 
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
-                                     thread->sched_pri, priority, thread->was_promoted_on_wakeup, trace_lck, 0);
+       thread_t thread = (thread_t)mutex->lck_mtx_owner;       /* faster than current_thread() */
 
-               s = splsched();
+       if (thread->sched_pri < priority || thread->was_promoted_on_wakeup) {
+               spl_t s = splsched();
                thread_lock(thread);
 
-               if (thread->sched_pri < priority) {
-                       /* Do not promote past promotion ceiling */
-                       assert(priority <= MAXPRI_PROMOTE);
-                       set_sched_pri(thread, priority);
-               }
-               if (mutex->lck_mtx_promoted == 0) {
+               if (thread->was_promoted_on_wakeup)
+                       assert(thread->promotions > 0);
+
+               /* Intel only promotes if priority goes up */
+               if (thread->sched_pri < priority && thread->promotion_priority < priority) {
+                       /* Remember that I need to drop this promotion on unlock */
                        mutex->lck_mtx_promoted = 1;
-                       
-                       thread->promotions++;
-                       thread->sched_flags |= TH_SFLAG_PROMOTED;
+
+                       if (thread->promotions++ == 0) {
+                               /* This is the first promotion for the owner */
+                               sched_thread_promote_to_pri(thread, priority, trace_lck);
+                       } else {
+                               /*
+                                * Holder was previously promoted due to a different mutex,
+                                * raise to match this one.
+                                * Or, this thread was promoted on wakeup but someone else
+                                * later contended on mutex at higher priority before we got here
+                                */
+                               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
+                       }
                }
-               thread->was_promoted_on_wakeup = 0;
-               
+
+               if (thread->was_promoted_on_wakeup) {
+                       thread->was_promoted_on_wakeup = 0;
+                       if (--thread->promotions == 0)
+                               sched_thread_unpromote(thread, trace_lck);
+               }
+
                thread_unlock(thread);
                splx(s);
        }
@@ -2283,28 +2492,794 @@ lck_mtx_lock_acquire_x86(
                     trace_lck, 0, mutex->lck_mtx_waiters, 0, 0);
 }
 
+void
+lck_mtx_lock_acquire_x86(
+       lck_mtx_t       *mutex)
+{
+       return lck_mtx_lock_acquire_inline(mutex);
+}
+
+/*
+ * Tail call helpers for lock functions that perform
+ * lck_mtx_lock_acquire followed by the caller's finish routine, to optimize
+ * the caller's compiled code.
+ */
 
-static int
-lck_mtx_interlock_try_lock(lck_mtx_t *mutex, boolean_t *istate)
+__attribute__((noinline))
+static void
+lck_mtx_lock_acquire_tail(
+       lck_mtx_t       *mutex,
+       boolean_t       indirect)
+{
+       lck_mtx_lock_acquire_inline(mutex);
+       lck_mtx_lock_finish_inline(mutex, ordered_load_mtx_state(mutex), indirect);
+}
+
+__attribute__((noinline))
+static boolean_t
+lck_mtx_try_lock_acquire_tail(
+       lck_mtx_t       *mutex)
+{
+       lck_mtx_lock_acquire_inline(mutex);
+       lck_mtx_try_lock_finish_inline(mutex, ordered_load_mtx_state(mutex));
+
+       return TRUE;
+}
+
+__attribute__((noinline))
+static void
+lck_mtx_convert_spin_acquire_tail(
+       lck_mtx_t       *mutex)
+{
+       lck_mtx_lock_acquire_inline(mutex);
+       lck_mtx_convert_spin_finish_inline(mutex, ordered_load_mtx_state(mutex));
+}
+
+boolean_t
+lck_mtx_ilk_unlock(
+       lck_mtx_t       *mutex)
+{
+       lck_mtx_ilk_unlock_inline(mutex, ordered_load_mtx_state(mutex));
+       return TRUE;
+}
+
+static inline void
+lck_mtx_interlock_lock_set_and_clear_flags(
+       lck_mtx_t *mutex,
+       uint32_t xor_flags,
+       uint32_t and_flags,
+       uint32_t *new_state)
 {
-       int             retval;
+       uint32_t state, prev;
+       state = *new_state;
+
+       for ( ; ; ) {
+               /* have to wait for interlock to clear */
+               while (__improbable(state & (LCK_MTX_ILOCKED_MSK | xor_flags))) {
+                       cpu_pause();
+                       state = ordered_load_mtx_state(mutex);
+               }
+               prev = state;                                   /* prev contains snapshot for exchange */
+               state |= LCK_MTX_ILOCKED_MSK | xor_flags;       /* pick up interlock */
+               state &= ~and_flags;                            /* clear flags */
+
+               disable_preemption();
+               if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE))
+                       break;
+               enable_preemption();
+               cpu_pause();
+               state = ordered_load_mtx_state(mutex);
+       }
+       *new_state = state;
+       return;
+}
+
+static inline void
+lck_mtx_interlock_lock_clear_flags(
+       lck_mtx_t *mutex,
+       uint32_t and_flags,
+       uint32_t *new_state)
+{
+       return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, and_flags, new_state);
+}
+
+static inline void
+lck_mtx_interlock_lock(
+       lck_mtx_t *mutex,
+       uint32_t *new_state)
+{
+       return lck_mtx_interlock_lock_set_and_clear_flags(mutex, 0, 0, new_state);
+}
+
+static inline int
+lck_mtx_interlock_try_lock_set_flags(
+       lck_mtx_t *mutex,
+       uint32_t or_flags,
+       uint32_t *new_state)
+{
+       uint32_t state, prev;
+       state = *new_state;
+
+       /* have to wait for interlock to clear */
+       if (state & (LCK_MTX_ILOCKED_MSK | or_flags)) {
+               return 0;
+       }
+       prev = state;                                   /* prev contains snapshot for exchange */
+       state |= LCK_MTX_ILOCKED_MSK | or_flags;        /* pick up interlock */
+       disable_preemption();
+       if (atomic_compare_exchange32(&mutex->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               *new_state = state;
+               return 1;
+       }
+
+       enable_preemption();
+       return 0;
+}
+
+static inline int
+lck_mtx_interlock_try_lock(
+       lck_mtx_t *mutex,
+       uint32_t *new_state)
+{
+       return lck_mtx_interlock_try_lock_set_flags(mutex, 0, new_state);
+}
+
+static inline int
+lck_mtx_interlock_try_lock_disable_interrupts(
+       lck_mtx_t *mutex,
+       boolean_t *istate)
+{
+       uint32_t        state;
 
        *istate = ml_set_interrupts_enabled(FALSE);
-       retval = lck_mtx_ilk_try_lock(mutex);
+       state = ordered_load_mtx_state(mutex);
 
-       if (retval == 0)
+       if (lck_mtx_interlock_try_lock(mutex, &state)) {
+               return 1;
+       } else {
                ml_set_interrupts_enabled(*istate);
-
-       return retval;
+               return 0;
+       }
 }
 
-static void
-lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
-{               
+static inline void
+lck_mtx_interlock_unlock_enable_interrupts(
+       lck_mtx_t *mutex,
+       boolean_t istate)
+{
        lck_mtx_ilk_unlock(mutex);
        ml_set_interrupts_enabled(istate);
 }
 
+static void __inline__
+lck_mtx_inc_stats(
+       uint64_t* stat)
+{
+#if ATOMIC_STAT_UPDATES
+               os_atomic_inc(stat, relaxed);
+#else
+               *stat += 1;
+#endif
+}
+
+static void __inline__
+lck_mtx_update_miss(
+       struct _lck_mtx_ext_ *lock,
+       int *first_miss)
+{
+#if LOG_FIRST_MISS_ALONE
+       if ((*first_miss & 1) == 0) {
+#else
+#pragma unused(first_miss)
+#endif
+               uint64_t* stat = &lock->lck_mtx_grp->lck_grp_miss;
+               lck_mtx_inc_stats(stat);
+
+#if LOG_FIRST_MISS_ALONE
+               *first_miss |= 1;
+       }
+#endif
+}
+
+static void __inline__
+lck_mtx_update_direct_wait(
+       struct _lck_mtx_ext_ *lock)
+{
+       uint64_t* stat = &lock->lck_mtx_grp->lck_grp_direct_wait;
+       lck_mtx_inc_stats(stat);
+}
+
+static void __inline__
+lck_mtx_update_wait(
+       struct _lck_mtx_ext_ *lock,
+       int *first_miss)
+{
+#if LOG_FIRST_MISS_ALONE
+       if ((*first_miss & 2) == 0) {
+#else
+#pragma unused(first_miss)
+#endif
+               uint64_t* stat = &lock->lck_mtx_grp->lck_grp_wait;
+               lck_mtx_inc_stats(stat);
+
+#if LOG_FIRST_MISS_ALONE
+               *first_miss |= 2;
+       }
+#endif
+}
+
+static void __inline__
+lck_mtx_update_util(
+       struct _lck_mtx_ext_ *lock)
+{
+       uint64_t* stat = &lock->lck_mtx_grp->lck_grp_util;
+       lck_mtx_inc_stats(stat);
+}
+
+__attribute__((noinline))
+static void
+lck_mtx_lock_contended(
+       lck_mtx_t       *lock,
+       boolean_t indirect,
+       boolean_t *first_miss)
+{
+       lck_mtx_spinwait_ret_type_t ret;
+       uint32_t state;
+       thread_t thread;
+
+try_again:
+
+       if (indirect) {
+               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, first_miss);
+       }
+
+       ret = lck_mtx_lock_spinwait_x86(lock);
+       state = ordered_load_mtx_state(lock);
+       switch (ret) {
+       case LCK_MTX_SPINWAIT_NO_SPIN:
+               /*
+                * owner not on core, lck_mtx_lock_spinwait_x86 didn't even
+                * try to spin.
+                */
+               if (indirect) {
+                       lck_mtx_update_direct_wait((struct _lck_mtx_ext_*)lock);
+               }
+
+               /* just fall through case LCK_MTX_SPINWAIT_SPUN */
+       case LCK_MTX_SPINWAIT_SPUN:
+               /*
+                * mutex not acquired but lck_mtx_lock_spinwait_x86 tried to spin
+                * interlock not held
+                */
+               lck_mtx_interlock_lock(lock, &state);
+               assert(state & LCK_MTX_ILOCKED_MSK);
+
+               if (state & LCK_MTX_MLOCKED_MSK) {
+                       if (indirect) {
+                               lck_mtx_update_wait((struct _lck_mtx_ext_*)lock, first_miss);
+                       }
+                       lck_mtx_lock_wait_x86(lock);
+                       /*
+                        * interlock is not held here.
+                        */
+                       goto try_again;
+               } else {
+
+                       /* grab the mutex */
+                       state |= LCK_MTX_MLOCKED_MSK;
+                       ordered_store_mtx_state_release(lock, state);
+                       thread = current_thread();
+                       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+#if     MACH_LDEBUG
+                       if (thread) {
+                               thread->mutex_count++;
+                       }
+#endif  /* MACH_LDEBUG */
+               }
+
+               break;
+       case LCK_MTX_SPINWAIT_ACQUIRED:
+               /*
+                * mutex has been acquired by lck_mtx_lock_spinwait_x86
+                * interlock is held and preemption disabled
+                * owner is set and mutex marked as locked
+                * statistics updated too
+                */
+               break;
+       default:
+               panic("lck_mtx_lock_spinwait_x86 returned %d for mutex %p\n", ret, lock);
+       }
+
+       /*
+        * interlock is already acquired here
+        */
+
+       /* mutex has been acquired */
+       thread = (thread_t)lock->lck_mtx_owner;
+       if (state & LCK_MTX_WAITERS_MSK || thread->was_promoted_on_wakeup) {
+               return lck_mtx_lock_acquire_tail(lock, indirect);
+       }
+
+       /* release the interlock */
+       lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
+}
+
+/*
+ * Helper noinline functions for calling
+ * panic to optimize compiled code.
+ */
+
+__attribute__((noinline))
+static void
+lck_mtx_destroyed(
+       lck_mtx_t       *lock)
+{
+       panic("trying to interlock destroyed mutex (%p)", lock);
+}
+
+__attribute__((noinline))
+static boolean_t
+lck_mtx_try_destroyed(
+       lck_mtx_t       *lock)
+{
+       panic("trying to interlock destroyed mutex (%p)", lock);
+       return FALSE;
+}
+
+__attribute__((always_inline))
+static boolean_t
+lck_mtx_lock_wait_interlock_to_clear(
+       lck_mtx_t       *lock,
+       uint32_t*        new_state)
+{
+       uint32_t state;
+
+       for ( ; ; ) {
+               cpu_pause();
+               state = ordered_load_mtx_state(lock);
+               if (!(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK))) {
+                       *new_state = state;
+                       return TRUE;
+               }
+               if (state & LCK_MTX_MLOCKED_MSK) {
+                       /* if it is held as mutex, just fail */
+                       return FALSE;
+               }
+       }
+}
+
+__attribute__((always_inline))
+static boolean_t
+lck_mtx_try_lock_wait_interlock_to_clear(
+       lck_mtx_t       *lock,
+       uint32_t*        new_state)
+{
+       uint32_t state;
+
+       for ( ; ; ) {
+               cpu_pause();
+               state = ordered_load_mtx_state(lock);
+               if (state & (LCK_MTX_MLOCKED_MSK | LCK_MTX_SPIN_MSK)) {
+                       /* if it is held as mutex or spin, just fail */
+                       return FALSE;
+               }
+               if (!(state & LCK_MTX_ILOCKED_MSK)) {
+                       *new_state = state;
+                       return TRUE;
+               }
+       }
+}
+
+/*
+ * Routine:    lck_mtx_lock_slow
+ *
+ * Locks a mutex for current thread.
+ * If the lock is contended this function might
+ * sleep.
+ *
+ * Called with interlock not held.
+ */
+__attribute__((noinline))
+void
+lck_mtx_lock_slow(
+       lck_mtx_t       *lock)
+{
+       boolean_t       indirect = FALSE;
+       uint32_t        state;
+       int             first_miss = 0;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* is the interlock or mutex held */
+       if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
+               /*
+                * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
+                * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
+                * set in state (state == lck_mtx_tag)
+                */
+
+
+               /* is the mutex already held and not indirect */
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
+                       /* no, must have been the mutex */
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+
+               /* check to see if it is marked destroyed */
+               if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
+                       return lck_mtx_destroyed(lock);
+               }
+
+               /* Is this an indirect mutex? */
+               if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+                       indirect = get_indirect_mutex(&lock, &state);
+
+                       first_miss = 0;
+                       lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
+
+                       if (state & LCK_MTX_SPIN_MSK) {
+                               /* LCK_MTX_SPIN_MSK was set, so LCK_MTX_ILOCKED_MSK must also be present */
+                               assert(state & LCK_MTX_ILOCKED_MSK);
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       }
+               }
+
+               if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+       }
+
+       /* no - can't be INDIRECT, DESTROYED or locked */
+       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
+               if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+       }
+
+       /* lock and interlock acquired */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+       /*
+        * Check if there are waiters whose
+        * priority the owner needs to inherit.
+        */
+       if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
+               return lck_mtx_lock_acquire_tail(lock, indirect);
+       }
+
+       /* release the interlock */
+       lck_mtx_lock_finish_inline(lock, ordered_load_mtx_state(lock), indirect);
+
+       return;
+}
+
+__attribute__((noinline))
+boolean_t
+lck_mtx_try_lock_slow(
+       lck_mtx_t       *lock)
+{
+       boolean_t       indirect = FALSE;
+       uint32_t        state;
+       int             first_miss = 0;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* is the interlock or mutex held */
+       if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
+               /*
+                * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
+                * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
+                * set in state (state == lck_mtx_tag)
+                */
+
+               /* is the mutex already held and not indirect */
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
+                       return FALSE;
+               }
+
+               /* check to see if it is marked destroyed */
+               if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
+                       return lck_mtx_try_destroyed(lock);
+               }
+
+               /* Is this an indirect mutex? */
+               if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+                       indirect = get_indirect_mutex(&lock, &state);
+
+                       first_miss = 0;
+                       lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
+               }
+
+               if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
+                       if (indirect)
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       return FALSE;
+               }
+       }
+
+       /* no - can't be INDIRECT, DESTROYED or locked */
+       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state))) {
+               if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
+                       if (indirect)
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       return FALSE;
+               }
+       }
+
+       /* lock and interlock acquired */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+       /*
+        * Check if there are waiters whose
+        * priority the owner needs to inherit.
+        */
+       if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
+               return lck_mtx_try_lock_acquire_tail(lock);
+       }
+
+       /* release the interlock */
+       lck_mtx_try_lock_finish_inline(lock, ordered_load_mtx_state(lock));
+
+       return TRUE;
+
+}
+
+__attribute__((noinline))
+void
+lck_mtx_lock_spin_slow(
+       lck_mtx_t       *lock)
+{
+       boolean_t       indirect = FALSE;
+       uint32_t        state;
+       int             first_miss = 0;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* is the interlock or mutex held */
+       if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
+               /*
+                * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
+                * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
+                * set in state (state == lck_mtx_tag)
+                */
+
+
+               /* is the mutex already held and not indirect */
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
+                       /* no, must have been the mutex */
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+
+               /* check to see if it is marked destroyed */
+               if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
+                       return lck_mtx_destroyed(lock);
+               }
+
+               /* Is this an indirect mutex? */
+               if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+                       indirect = get_indirect_mutex(&lock, &state);
+
+                       first_miss = 0;
+                       lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
+
+                       if (state & LCK_MTX_SPIN_MSK) {
+                               /* LCK_MTX_SPIN_MSK was set, so LCK_MTX_ILOCKED_MSK must also be present */
+                               assert(state & LCK_MTX_ILOCKED_MSK);
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       }
+               }
+
+               if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+       }
+
+       /* no - can't be INDIRECT, DESTROYED or locked */
+       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state) )) {
+               if (!lck_mtx_lock_wait_interlock_to_clear(lock, &state)) {
+                       return lck_mtx_lock_contended(lock, indirect, &first_miss);
+               }
+       }
+
+       /* lock as spinlock and interlock acquired */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
+#endif
+       /* return with the interlock held and preemption disabled */
+       return;
+}
+
+__attribute__((noinline))
+boolean_t
+lck_mtx_try_lock_spin_slow(
+       lck_mtx_t       *lock)
+{
+       boolean_t       indirect = FALSE;
+       uint32_t        state;
+       int             first_miss = 0;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* is the interlock or mutex held */
+       if (__improbable(state & ((LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))) {
+               /*
+                * Note: both LCK_MTX_TAG_DESTROYED and LCK_MTX_TAG_INDIRECT
+                * have LCK_MTX_ILOCKED_MSK and LCK_MTX_MLOCKED_MSK
+                * set in state (state == lck_mtx_tag)
+                */
+
+               /* is the mutex already held and not indirect */
+               if (__improbable(!(state & LCK_MTX_ILOCKED_MSK))){
+                       return FALSE;
+               }
+
+               /* check to see if it is marked destroyed */
+               if (__improbable(state == LCK_MTX_TAG_DESTROYED)) {
+                       return lck_mtx_try_destroyed(lock);
+               }
+
+               /* Is this an indirect mutex? */
+               if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+                       indirect = get_indirect_mutex(&lock, &state);
+
+                       first_miss = 0;
+                       lck_mtx_update_util((struct _lck_mtx_ext_*)lock);
+               }
+
+               if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
+                       if (indirect)
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       return FALSE;
+               }
+       }
+
+       /* no - can't be INDIRECT, DESTROYED or locked */
+       while (__improbable(!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_SPIN_MSK, &state))) {
+               if (!lck_mtx_try_lock_wait_interlock_to_clear(lock, &state)) {
+                       if (indirect)
+                               lck_mtx_update_miss((struct _lck_mtx_ext_*)lock, &first_miss);
+                       return FALSE;
+               }
+       }
+
+       /* lock and interlock acquired */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+#if     CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+       return TRUE;
+
+}
+
+__attribute__((noinline))
+void
+lck_mtx_convert_spin(
+       lck_mtx_t       *lock)
+{
+       uint32_t state;
+
+       state = ordered_load_mtx_state(lock);
+
+       /* Is this an indirect mutex? */
+       if (__improbable(state == LCK_MTX_TAG_INDIRECT)) {
+               /* If so, take indirection */
+               get_indirect_mutex(&lock, &state);
+       }
+
+       assertf((thread_t)lock->lck_mtx_owner == current_thread(), "lock %p not owned by thread %p (current owner %p)", lock, current_thread(), (thread_t)lock->lck_mtx_owner );
+
+       if (__improbable(state & LCK_MTX_MLOCKED_MSK)) {
+               /* already owned as a mutex, just return */
+               return;
+       }
+
+       assert(get_preemption_level() > 0);
+       assert(state & LCK_MTX_ILOCKED_MSK);
+       assert(state & LCK_MTX_SPIN_MSK);
+
+       /*
+        * Check if there are waiters whose
+        * priority the owner needs to inherit.
+        */
+       if (__improbable(state & LCK_MTX_WAITERS_MSK)) {
+               return lck_mtx_convert_spin_acquire_tail(lock);
+       }
+
+       lck_mtx_convert_spin_finish_inline(lock, ordered_load_mtx_state(lock));
+
+       return;
+}
+
+static inline boolean_t
+lck_mtx_lock_grab_mutex(
+       lck_mtx_t       *lock)
+{
+       uint32_t state;
+
+       state = ordered_load_mtx_state(lock);
+
+       if (!lck_mtx_interlock_try_lock_set_flags(lock, LCK_MTX_MLOCKED_MSK, &state)) {
+               return FALSE;
+       }
+
+       /* lock and interlock acquired */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+       return TRUE;
+}
+
+__attribute__((noinline))
+void
+lck_mtx_assert(
+       lck_mtx_t       *lock,
+       unsigned int    type)
+{
+       thread_t thread, owner;
+       uint32_t state;
+
+       thread = current_thread();
+       state = ordered_load_mtx_state(lock);
+
+       if (state == LCK_MTX_TAG_INDIRECT) {
+               get_indirect_mutex(&lock, &state);
+       }
+
+       owner = (thread_t)lock->lck_mtx_owner;
+
+       if (type == LCK_MTX_ASSERT_OWNED) {
+               if (owner != thread || !(state & (LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK)))
+                       panic("mutex (%p) not owned\n", lock);
+       } else {
+               assert (type == LCK_MTX_ASSERT_NOTOWNED);
+               if (owner == thread)
+                       panic("mutex (%p) owned\n", lock);
+       }
+}
 
 /*
  * Routine:    lck_mtx_lock_spinwait_x86
@@ -2314,20 +3289,21 @@ lck_mtx_interlock_unlock(lck_mtx_t *mutex, boolean_t istate)
  * time waiting for the lock to be released.
  *
  * Called with the interlock unlocked.
- * returns 0 if mutex acquired
- * returns 1 if we spun
- * returns 2 if we didn't spin due to the holder not running
+ * returns LCK_MTX_SPINWAIT_ACQUIRED if mutex acquired
+ * returns LCK_MTX_SPINWAIT_SPUN if we spun
+ * returns LCK_MTX_SPINWAIT_NO_SPIN if we didn't spin due to the holder not running
  */
-int
+__attribute__((noinline))
+lck_mtx_spinwait_ret_type_t
 lck_mtx_lock_spinwait_x86(
        lck_mtx_t       *mutex)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
        thread_t        holder;
        uint64_t        overall_deadline;
        uint64_t        check_owner_deadline;
        uint64_t        cur_time;
-       int             retval = 1;
+       lck_mtx_spinwait_ret_type_t             retval = LCK_MTX_SPINWAIT_SPUN;
        int             loopcount = 0;
 
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_SPIN_CODE) | DBG_FUNC_START,
@@ -2347,7 +3323,7 @@ lck_mtx_lock_spinwait_x86(
         */
        do {
                if (__probable(lck_mtx_lock_grab_mutex(mutex))) {
-                       retval = 0;
+                       retval = LCK_MTX_SPINWAIT_ACQUIRED;
                        break;
                }
                cur_time = mach_absolute_time();
@@ -2358,21 +3334,33 @@ lck_mtx_lock_spinwait_x86(
                if (cur_time >= check_owner_deadline && mutex->lck_mtx_owner) {
                        boolean_t       istate;
 
-                       if (lck_mtx_interlock_try_lock(mutex, &istate)) {
+                       /*
+                        * We will repeatedly peek at the state of the lock while spinning,
+                        * and we acquire the interlock to do so.
+                        * The thread that unlocks the mutex will also need to acquire
+                        * the interlock, and we want to avoid slowing it down.
+                        * To avoid taking an interrupt while holding the interlock,
+                        * which would increase the time we hold it, we
+                        * try to acquire the interlock with interrupts disabled.
+                        * This is safe because it is a "try_lock": if we can't acquire
+                        * the interlock, we re-enable interrupts and fail, so it is
+                        * ok to call it even if the interlock was already held.
+                        */
+                       if (lck_mtx_interlock_try_lock_disable_interrupts(mutex, &istate)) {
 
                                if ((holder = (thread_t) mutex->lck_mtx_owner) != NULL) {
 
                                        if ( !(holder->machine.specFlags & OnProc) ||
                                             (holder->state & TH_IDLE)) {
 
-                                               lck_mtx_interlock_unlock(mutex, istate);
+                                               lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
 
                                                if (loopcount == 0)
-                                                       retval = 2;
+                                                       retval = LCK_MTX_SPINWAIT_NO_SPIN;
                                                break;
                                        }
                                }
-                               lck_mtx_interlock_unlock(mutex, istate);
+                               lck_mtx_interlock_unlock_enable_interrupts(mutex, istate);
 
                                check_owner_deadline = cur_time + (MutexSpin / 4);
                        }
@@ -2418,79 +3406,119 @@ lck_mtx_lock_spinwait_x86(
  * Invoked in order to wait on contention.
  *
  * Called with the interlock locked and
- * preemption disabled...  
+ * preemption disabled...
  * returns it unlocked and with preemption enabled
+ *
+ * lck_mtx_waiters is 1:1 with a wakeup needing to occur.
+ *      A runnable waiter can exist between wait and acquire
+ *      without a waiters count being set.
+ *      This allows us to never make a spurious wakeup call.
+ *
+ * Priority:
+ *      This avoids taking the thread lock if the owning thread is the same priority.
+ *      This optimizes the case of same-priority threads contending on a lock.
+ *      However, that allows the owning thread to drop in priority while holding the lock,
+ *      because there is no state that the priority change can notice that
+ *      says that the targeted thread holds a contended mutex.
+ *
+ *      One possible solution: priority changes could look for some atomic tag
+ *      on the thread saying 'holding contended lock', and then set up a promotion.
+ *      Needs a story for dropping that promotion - the last contended unlock
+ *      has to notice that this has happened.
  */
+__attribute__((noinline))
 void
 lck_mtx_lock_wait_x86 (
        lck_mtx_t       *mutex)
 {
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(mutex);
-       thread_t        self = current_thread();
-       thread_t        holder;
-       integer_t       priority;
-       spl_t           s;
 #if    CONFIG_DTRACE
-       uint64_t        sleep_start = 0;
+       uint64_t sleep_start = 0;
 
        if (lockstat_probemap[LS_LCK_MTX_LOCK_BLOCK] || lockstat_probemap[LS_LCK_MTX_EXT_LOCK_BLOCK]) {
                sleep_start = mach_absolute_time();
        }
 #endif
+       thread_t self = current_thread();
+       assert(self->waiting_for_mutex == NULL);
+
+       self->waiting_for_mutex = mutex;
+
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(mutex);
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
-                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
+                    mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
-       priority = self->sched_pri;
+       integer_t waiter_pri = self->sched_pri;
+       waiter_pri = MAX(waiter_pri, self->base_pri);
+       waiter_pri = MAX(waiter_pri, BASEPRI_DEFAULT);
+       waiter_pri = MIN(waiter_pri, MAXPRI_PROMOTE);
 
-       if (priority < self->base_pri)
-               priority = self->base_pri;
-       if (priority < BASEPRI_DEFAULT)
-               priority = BASEPRI_DEFAULT;
+       assert(mutex->lck_mtx_pri <= MAXPRI_PROMOTE);
 
-       /* Do not promote past promotion ceiling */
-       priority = MIN(priority, MAXPRI_PROMOTE);
+       /* Re-initialize lck_mtx_pri if this is the first contention */
+       if (mutex->lck_mtx_waiters == 0 || mutex->lck_mtx_pri <= waiter_pri)
+               mutex->lck_mtx_pri = waiter_pri;
 
-       if (mutex->lck_mtx_waiters == 0 || priority > mutex->lck_mtx_pri)
-               mutex->lck_mtx_pri = priority;
-       mutex->lck_mtx_waiters++;
+       thread_t holder = (thread_t)mutex->lck_mtx_owner;
 
-       if ( (holder = (thread_t)mutex->lck_mtx_owner) &&
-            holder->sched_pri < mutex->lck_mtx_pri ) {
-               s = splsched();
+       assert(holder != NULL);
+
+       /*
+        * Intel only causes a promotion when priority needs to change,
+        * reducing thread lock holds but leaving us vulnerable to the holder
+        * dropping priority.
+        */
+       if (holder->sched_pri < mutex->lck_mtx_pri) {
+               int promote_pri = mutex->lck_mtx_pri;
+
+               spl_t s = splsched();
                thread_lock(holder);
 
-               /* holder priority may have been bumped by another thread
-                * before thread_lock was taken
-                */
-               if (holder->sched_pri < mutex->lck_mtx_pri) {
-                       KERNEL_DEBUG_CONSTANT(
-                               MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTE) | DBG_FUNC_NONE,
-                               holder->sched_pri, priority, thread_tid(holder), trace_lck, 0);
-                       /* Assert that we're not altering the priority of a
-                        * thread above the MAXPRI_PROMOTE band
-                        */
-                       assert(holder->sched_pri < MAXPRI_PROMOTE);
-                       set_sched_pri(holder, priority);
-                       
+               /* Check again in case sched_pri changed */
+               if (holder->sched_pri < promote_pri && holder->promotion_priority < promote_pri) {
                        if (mutex->lck_mtx_promoted == 0) {
-                               holder->promotions++;
-                               holder->sched_flags |= TH_SFLAG_PROMOTED;
-                               
+                               /* This is the first promotion for this mutex */
                                mutex->lck_mtx_promoted = 1;
+
+                               if (holder->promotions++ == 0) {
+                                       /* This is the first promotion for holder */
+                                       sched_thread_promote_to_pri(holder, promote_pri, trace_lck);
+                               } else {
+                                       /*
+                                        * Holder was previously promoted due to a different mutex,
+                                        * check if it needs to raise to match this one
+                                        */
+                                       sched_thread_update_promotion_to_pri(holder, promote_pri,
+                                                                            trace_lck);
+                               }
+                       } else {
+                               /*
+                                * Holder was previously promoted due to this mutex,
+                                * check if the pri needs to go up
+                                */
+                               sched_thread_update_promotion_to_pri(holder, promote_pri, trace_lck);
                        }
                }
+
                thread_unlock(holder);
                splx(s);
        }
+
+       mutex->lck_mtx_waiters++;
+
        thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
-       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
+       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
 
        lck_mtx_ilk_unlock(mutex);
 
        thread_block(THREAD_CONTINUE_NULL);
 
+       self->waiting_for_mutex = NULL;
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END,
-                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner), mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
+                    trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(mutex->lck_mtx_owner),
+                    mutex->lck_mtx_waiters, mutex->lck_mtx_pri, 0);
 
 #if    CONFIG_DTRACE
        /*
diff --git a/osfmk/i386/locks_i386_inlines.h b/osfmk/i386/locks_i386_inlines.h
new file mode 100644 (file)
index 0000000..7e4aa59
--- /dev/null
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 201 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _I386_LOCKS_I386_INLINES_H_
+#define _I386_LOCKS_I386_INLINES_H_
+
+#include <kern/locks.h>
+/*
+ * We need only enough declarations from the BSD-side to be able to
+ * test if our probe is active, and to call __dtrace_probe().  Setting
+ * NEED_DTRACE_DEFS gets a local copy of those definitions pulled in.
+ */
+#if    CONFIG_DTRACE
+#define NEED_DTRACE_DEFS
+#include <../bsd/sys/lockstat.h>
+#endif
+
+// Enforce program order of loads and stores.
+#define ordered_load(target) _Generic( (target),\
+               uint32_t* : __c11_atomic_load((_Atomic uint32_t* )(target), memory_order_relaxed), \
+               uintptr_t*: __c11_atomic_load((_Atomic uintptr_t*)(target), memory_order_relaxed) )
+#define ordered_store_release(target, value) _Generic( (target),\
+               uint32_t* : __c11_atomic_store((_Atomic uint32_t* )(target), (value), memory_order_release_smp), \
+               uintptr_t*: __c11_atomic_store((_Atomic uintptr_t*)(target), (value), memory_order_release_smp) )
+#define ordered_store_volatile(target, value) _Generic( (target),\
+               volatile uint32_t* : __c11_atomic_store((_Atomic volatile uint32_t* )(target), (value), memory_order_relaxed), \
+               volatile uintptr_t*: __c11_atomic_store((_Atomic volatile uintptr_t*)(target), (value), memory_order_relaxed) )
+
+/* Enforce program order of loads and stores. */
+#define ordered_load_mtx_state(lock)                   ordered_load(&(lock)->lck_mtx_state)
+#define ordered_store_mtx_state_release(lock, value)           ordered_store_release(&(lock)->lck_mtx_state, (value))
+#define ordered_store_mtx_owner(lock, value)   ordered_store_volatile(&(lock)->lck_mtx_owner, (value))
+
+#if DEVELOPMENT | DEBUG
+void lck_mtx_owner_check_panic(lck_mtx_t       *mutex);
+#endif
+
+__attribute__((always_inline))
+static inline void
+lck_mtx_ilk_unlock_inline(
+       lck_mtx_t       *mutex,
+       uint32_t        state)
+{
+       state &= ~LCK_MTX_ILOCKED_MSK;
+       ordered_store_mtx_state_release(mutex, state);
+
+       enable_preemption();
+}
+
+__attribute__((always_inline))
+static inline void
+lck_mtx_lock_finish_inline(
+       lck_mtx_t       *mutex,
+       uint32_t        state,
+       boolean_t       indirect)
+{
+       assert(state & LCK_MTX_ILOCKED_MSK);
+
+       /* release the interlock and re-enable preemption */
+       lck_mtx_ilk_unlock_inline(mutex, state);
+
+#if    CONFIG_DTRACE
+       if (indirect) {
+               LOCKSTAT_RECORD(LS_LCK_MTX_EXT_LOCK_ACQUIRE, mutex, 0);
+       } else {
+               LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_ACQUIRE, mutex, 0);
+       }
+#endif
+}
+
+__attribute__((always_inline))
+static inline void
+lck_mtx_try_lock_finish_inline(
+       lck_mtx_t       *mutex,
+       uint32_t        state)
+{
+       /* release the interlock and re-enable preemption */
+       lck_mtx_ilk_unlock_inline(mutex, state);
+
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_MTX_TRY_LOCK_ACQUIRE, mutex, 0);
+#endif
+}
+
+__attribute__((always_inline))
+static inline void
+lck_mtx_convert_spin_finish_inline(
+       lck_mtx_t       *mutex,
+       uint32_t        state)
+{
+       /* release the interlock and acquire it as mutex */
+       state &= ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK);
+       state |= LCK_MTX_MLOCKED_MSK;
+
+       ordered_store_mtx_state_release(mutex, state);
+       enable_preemption();
+}
+
+__attribute__((always_inline))
+static inline void
+lck_mtx_unlock_finish_inline(
+       lck_mtx_t       *mutex,
+       boolean_t       indirect)
+{
+       enable_preemption();
+
+#if    CONFIG_DTRACE
+       if (indirect) {
+               LOCKSTAT_RECORD(LS_LCK_MTX_EXT_UNLOCK_RELEASE, mutex, 0);
+       } else {
+               LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, mutex, 0);
+       }
+#endif // CONFIG_DTRACE
+}
+
+#endif /* _I386_LOCKS_I386_INLINES_H_ */
+
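The ordered_load / ordered_store macros added above use C11 _Generic to pick an atomic access of the right width from the pointer type they are handed. Below is a minimal user-space sketch of the same dispatch; it is not part of the commit, the sketch_* names are made up, and the standard <stdatomic.h> calls stand in for the clang __c11_atomic_* builtins and the kernel-internal memory_order_release_smp.

/*
 * Minimal user-space sketch, NOT part of the commit: the sketch_* names are
 * made up; <stdatomic.h> stands in for the clang __c11_atomic_* builtins.
 */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define sketch_ordered_load(target) _Generic((target), \
	uint32_t * : atomic_load_explicit((_Atomic uint32_t *)(target), memory_order_relaxed), \
	uintptr_t *: atomic_load_explicit((_Atomic uintptr_t *)(target), memory_order_relaxed))

#define sketch_ordered_store_release(target, value) _Generic((target), \
	uint32_t * : atomic_store_explicit((_Atomic uint32_t *)(target), (value), memory_order_release), \
	uintptr_t *: atomic_store_explicit((_Atomic uintptr_t *)(target), (value), memory_order_release))

int
main(void)
{
	uint32_t  state = 0;	/* stands in for lck_mtx_state */
	uintptr_t owner = 0;	/* stands in for lck_mtx_owner */

	/* each call expands to the branch matching the pointer type */
	sketch_ordered_store_release(&state, 0x1u);
	sketch_ordered_store_release(&owner, (uintptr_t)&state);

	printf("state=%u owner=%p\n",
	    sketch_ordered_load(&state), (void *)sketch_ordered_load(&owner));
	return 0;
}

Each macro invocation expands, at compile time, to either the uint32_t or the uintptr_t branch, which is how the kernel wrappers below the macros can cover both lck_mtx_state and lck_mtx_owner with one definition.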
diff --git a/osfmk/i386/locks_i386_opt.c b/osfmk/i386/locks_i386_opt.c
new file mode 100644 (file)
index 0000000..90dcf06
--- /dev/null
@@ -0,0 +1,462 @@
+/*
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#define ATOMIC_PRIVATE 1
+#define LOCK_PRIVATE 1
+
+#include <mach_ldebug.h>
+
+#include <kern/locks.h>
+#include <kern/kalloc.h>
+#include <kern/misc_protos.h>
+#include <kern/thread.h>
+#include <kern/processor.h>
+#include <kern/cpu_data.h>
+#include <kern/cpu_number.h>
+#include <kern/sched_prim.h>
+#include <kern/xpr.h>
+#include <kern/debug.h>
+#include <string.h>
+
+#include <i386/machine_routines.h> /* machine_timeout_suspended() */
+#include <machine/atomic.h>
+#include <machine/machine_cpu.h>
+#include <i386/mp.h>
+#include <machine/atomic.h>
+#include <sys/kdebug.h>
+#include <i386/locks_i386_inlines.h>
+
+/*
+ * Fast path routines for the lck_mtx locking and unlocking functions.
+ * Each fast path attempts a single compare-and-swap instruction to acquire or
+ * release the lock and interlock, and falls back to the slow path if the swap fails.
+ *
+ * These functions were previously implemented in x86 assembly,
+ * and several optimizations are in place in this C code so that the compiled code
+ * is as performant and compact as the assembly version.
+ *
+ * To keep these functions from being inlined, which would increase the kernel text size,
+ * they are all marked __attribute__((noinline)).
+ *
+ * The code is structured so that there are no calls to functions that return
+ * into the context of the caller, i.e. every function called is either a tail call
+ * or an inline function. The tail-called functions take fewer than six arguments,
+ * so the arguments can be passed in registers and do not need to be pushed on the stack.
+ * This allows the compiler to avoid creating a stack frame for these functions.
+ *
+ * The file is compiled with -momit-leaf-frame-pointer and -O2.
+ */
+
+#if DEVELOPMENT || DEBUG
+
+/*
+ * If one or more simplelocks are currently held by a thread,
+ * an attempt to acquire a mutex will cause this check to fail
+ * (since a mutex lock may context switch, holding a simplelock
+ * is not a good thing).
+ */
+void __inline__
+lck_mtx_check_preemption(void)
+{
+       if (get_preemption_level() == 0)
+               return;
+       if (LckDisablePreemptCheck)
+               return;
+       if (current_cpu_datap()->cpu_hibernate)
+               return;
+
+       panic("preemption_level(%d) != 0\n", get_preemption_level());
+}
+
+#else /* DEVELOPMENT || DEBUG */
+
+void __inline__
+lck_mtx_check_preemption(void)
+{
+       return;
+}
+
+#endif /* DEVELOPMENT || DEBUG */
+
+/*
+ * Routine:    lck_mtx_lock
+ *
+ * Locks a mutex for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ * In case of contention it might sleep.
+ */
+__attribute__((noinline))
+void
+lck_mtx_lock(
+       lck_mtx_t       *lock)
+{
+       uint32_t prev, state;
+
+       lck_mtx_check_preemption();
+       state = ordered_load_mtx_state(lock);
+
+       /*
+        * Fast path only if the mutex is not held,
+        * the interlock is not contended and there are no waiters.
+        * Indirect mutexes, as well as destroyed mutexes,
+        * will take the slow path.
+        */
+
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK | LCK_MTX_WAITERS_MSK);
+       state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK;
+
+       disable_preemption();
+       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               enable_preemption();
+               return lck_mtx_lock_slow(lock);
+       }
+
+       /* mutex acquired, interlock acquired and preemption disabled */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+       /* release interlock and re-enable preemption */
+       lck_mtx_lock_finish_inline(lock, state, FALSE);
+}
+
+/*
+ * Routine:    lck_mtx_try_lock
+ *
+ * Tries to lock a mutex for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ *
+ * If the mutex is already held (either as spin or mutex)
+ * the function fails; otherwise it acquires the mutex.
+ */
+__attribute__((noinline))
+boolean_t
+lck_mtx_try_lock(
+       lck_mtx_t       *lock)
+{
+       uint32_t prev, state;
+
+       state = ordered_load_mtx_state(lock);
+
+       /*
+        * Fast path only if the mutex is not held,
+        * the interlock is not contended and there are no waiters.
+        * Indirect mutexes, as well as destroyed mutexes,
+        * will take the slow path.
+        */
+
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK | LCK_MTX_WAITERS_MSK);
+       state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK;
+
+       disable_preemption();
+       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               enable_preemption();
+               return lck_mtx_try_lock_slow(lock);
+       }
+
+       /* mutex acquired, interlock acquired and preemption disabled */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+       /* release interlock and re-enable preemption */
+       lck_mtx_try_lock_finish_inline(lock, state);
+
+       return TRUE;
+}
+
+/*
+ * Routine:    lck_mtx_lock_spin_always
+ *
+ * Tries to lock a mutex as a spin lock for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ *
+ * If the mutex is held as a full mutex by another thread,
+ * this function switches behavior and tries to acquire the lock as a mutex.
+ *
+ * If the mutex is held as a spinlock, it spins contending
+ * for it.
+ *
+ * In case of contention it might sleep.
+ */
+__attribute__((noinline))
+void
+lck_mtx_lock_spin_always(
+       lck_mtx_t       *lock)
+{
+       uint32_t prev, state;
+
+       state = ordered_load_mtx_state(lock);
+
+       /*
+        * Fast path only if the mutex is held
+        * neither as mutex nor as spin and
+        * the interlock is not contended.
+        * Indirect mutexes, as well as destroyed mutexes,
+        * will take the slow path.
+        */
+
+       /* Note LCK_MTX_SPIN_MSK is set only if LCK_MTX_ILOCKED_MSK is set */
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK);
+       state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK;
+
+       disable_preemption();
+       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               enable_preemption();
+               return lck_mtx_lock_spin_slow(lock);
+       }
+
+       /* mutex acquired as spinlock, interlock acquired and preemption disabled */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+#if    CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_MTX_LOCK_SPIN_ACQUIRE, lock, 0);
+#endif
+       /* return with the interlock held and preemption disabled */
+       return;
+}
+
+/*
+ * Routine:    lck_mtx_lock_spin
+ *
+ * Tries to lock a mutex as a spin lock for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ *
+ * If the mutex is held as a full mutex by another thread,
+ * this function switches behavior and tries to acquire the lock as a mutex.
+ *
+ * If the mutex is held as a spinlock, it spins contending
+ * for it.
+ *
+ * In case of contention it might sleep.
+ */
+void
+lck_mtx_lock_spin(
+       lck_mtx_t       *lock)
+{
+       lck_mtx_check_preemption();
+       lck_mtx_lock_spin_always(lock);
+}
+
+/*
+ * Routine:    lck_mtx_try_lock_spin_always
+ *
+ * Tries to lock a mutex as a spin lock for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ *
+ * If the mutex is already held (either as spin or mutex)
+ * the function fails; otherwise it acquires the mutex as a
+ * spin lock.
+ *
+ */
+__attribute__((noinline))
+boolean_t
+lck_mtx_try_lock_spin_always(
+       lck_mtx_t       *lock)
+{
+       uint32_t prev, state;
+
+       state = ordered_load_mtx_state(lock);
+
+       /*
+        * Fast path only if the mutex is held
+        * neither as mutex nor as spin and
+        * the interlock is not contended.
+        * Indirect mutexes, as well as destroyed mutexes,
+        * will take the slow path.
+        */
+
+       /* Note LCK_MTX_SPIN_MSK is set only if LCK_MTX_ILOCKED_MSK is set */
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_MLOCKED_MSK);
+       state = prev | LCK_MTX_ILOCKED_MSK | LCK_MTX_SPIN_MSK;
+
+       disable_preemption();
+       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               enable_preemption();
+               return lck_mtx_try_lock_spin_slow(lock);
+       }
+
+       /* mutex acquired as spinlock, interlock acquired and preemption disabled */
+
+       thread_t thread = current_thread();
+       /* record owner of mutex */
+       ordered_store_mtx_owner(lock, (uintptr_t)thread);
+
+#if MACH_LDEBUG
+       if (thread) {
+               thread->mutex_count++;          /* lock statistic */
+       }
+#endif
+
+#if     CONFIG_DTRACE
+       LOCKSTAT_RECORD(LS_LCK_MTX_TRY_SPIN_LOCK_ACQUIRE, lock, 0);
+#endif
+
+       /* return with the interlock held and preemption disabled */
+       return TRUE;
+}
+
+/*
+ * Routine:    lck_mtx_try_lock_spin
+ *
+ * Tries to lock a mutex as a spin lock for the current thread.
+ * It tries the fast path first and
+ * falls back to the slow path in case
+ * of contention.
+ *
+ * Neither the interlock nor the mutex may already be held by the current thread.
+ *
+ * If the mutex is already held (either as spin or mutex)
+ * the function fails; otherwise it acquires the mutex as a
+ * spin lock.
+ *
+ */
+boolean_t
+lck_mtx_try_lock_spin(
+       lck_mtx_t       *lock)
+{
+       return lck_mtx_try_lock_spin_always(lock);
+}
+
+/*
+ * Routine:    lck_mtx_unlock
+ *
+ * Unlocks a mutex held by the current thread.
+ * It tries the fast path first, and falls
+ * back to the slow path if waiters need to
+ * be woken up or promotions need to be dropped.
+ *
+ * The interlock may be held, in which case the slow path
+ * unlocks the mutex.
+ */
+__attribute__((noinline))
+void
+lck_mtx_unlock(
+       lck_mtx_t       *lock)
+{
+       uint32_t prev, state;
+
+       state = ordered_load_mtx_state(lock);
+
+       if (state & LCK_MTX_SPIN_MSK)
+               return lck_mtx_unlock_slow(lock);
+
+       /*
+        * Only a full mutex goes through the fast path
+        * (a lock acquired as a spinlock takes the
+        * slow path instead).
+        * If there are waiters or promotions it
+        * takes the slow path.
+        * If it is indirect it also takes the slow path.
+        */
+
+        /*
+         * Fast path state:
+         * interlock not held, no waiters, no promotion and mutex held.
+         */
+       prev = state & ~(LCK_MTX_ILOCKED_MSK | LCK_MTX_WAITERS_MSK | LCK_MTX_PROMOTED_MSK);
+       prev |= LCK_MTX_MLOCKED_MSK;
+
+       state = prev | LCK_MTX_ILOCKED_MSK;
+       state &= ~LCK_MTX_MLOCKED_MSK;
+
+       disable_preemption();
+
+       /* the memory order needs to be acquire because it is acquiring the interlock */
+       if (!atomic_compare_exchange32(&lock->lck_mtx_state, prev, state, memory_order_acquire_smp, FALSE)) {
+               enable_preemption();
+               return lck_mtx_unlock_slow(lock);
+       }
+
+       /* mutex released, interlock acquired and preemption disabled */
+
+#if DEVELOPMENT | DEBUG
+       thread_t owner = (thread_t)lock->lck_mtx_owner;
+       if(__improbable(owner != current_thread()))
+               return lck_mtx_owner_check_panic(lock);
+#endif
+
+       /* clear owner */
+       ordered_store_mtx_owner(lock, 0);
+       /* release interlock */
+       state &= ~LCK_MTX_ILOCKED_MSK;
+       ordered_store_mtx_state_release(lock, state);
+
+#if     MACH_LDEBUG
+       thread_t thread = current_thread();
+       if (thread)
+               thread->mutex_count--;
+#endif  /* MACH_LDEBUG */
+
+       /* re-enable preemption */
+       lck_mtx_unlock_finish_inline(lock, FALSE);
+}
+
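The header comment of locks_i386_opt.c above describes the shape shared by every fast path in that file: read the state, mask off the bits that must be clear, attempt one compare-and-swap that takes the interlock and the mutex together, and tail-call the slow path if the swap fails. A minimal user-space sketch of that pattern follows; it is not from the commit, the toy_* names and masks are made up, and the slow path is reduced to a spin where the kernel would instead block, promote the holder, and handle indirect or destroyed mutexes.

/*
 * Minimal user-space sketch, NOT part of the commit: toy_* names and masks
 * are made up; the slow path is a toy spin loop instead of the kernel's
 * blocking slow path.
 */
#include <stdatomic.h>
#include <stdint.h>

#define TOY_ILOCKED_MSK 0x1u    /* interlock held */
#define TOY_MLOCKED_MSK 0x2u    /* mutex held */
#define TOY_WAITERS_MSK 0x4u    /* threads are waiting */

struct toy_mtx {
	_Atomic uint32_t state;
};

/* Toy fallback: spin until the CAS succeeds (the kernel slow path blocks,
 * promotes the holder, and handles indirect/destroyed mutexes instead). */
static void
toy_mtx_lock_slow(struct toy_mtx *m)
{
	for (;;) {
		uint32_t state = atomic_load_explicit(&m->state, memory_order_relaxed);
		uint32_t prev  = state & ~(TOY_ILOCKED_MSK | TOY_MLOCKED_MSK | TOY_WAITERS_MSK);
		uint32_t next  = prev | TOY_ILOCKED_MSK | TOY_MLOCKED_MSK;
		if (atomic_compare_exchange_weak_explicit(&m->state, &prev, next,
		    memory_order_acquire, memory_order_relaxed))
			return;
	}
}

static void
toy_mtx_lock(struct toy_mtx *m)
{
	uint32_t state = atomic_load_explicit(&m->state, memory_order_relaxed);

	/* Fast path only if the lock is free, the interlock is free and
	 * there are no waiters, mirroring lck_mtx_lock() above. */
	uint32_t prev = state & ~(TOY_ILOCKED_MSK | TOY_MLOCKED_MSK | TOY_WAITERS_MSK);
	uint32_t next = prev | TOY_ILOCKED_MSK | TOY_MLOCKED_MSK;

	if (!atomic_compare_exchange_strong_explicit(&m->state, &prev, next,
	    memory_order_acquire, memory_order_relaxed)) {
		toy_mtx_lock_slow(m);	/* a tail call in the kernel version */
		return;
	}
	/* Here the kernel records the owner and drops the interlock with a
	 * release store; the toy keeps both bits set until unlock. */
}

static void
toy_mtx_unlock(struct toy_mtx *m)
{
	/* The kernel re-takes the interlock, clears the owner, then does a
	 * release store; one release-ordered clear is enough for the toy. */
	atomic_fetch_and_explicit(&m->state,
	    ~(TOY_ILOCKED_MSK | TOY_MLOCKED_MSK), memory_order_release);
}

int
main(void)
{
	struct toy_mtx m = { .state = 0 };
	toy_mtx_lock(&m);
	toy_mtx_unlock(&m);
	return 0;
}

The property mirrored here is that the CAS only succeeds when the interlock, mutex and waiter bits are all clear, so every contended or unusual case funnels into the slow path through a single branch.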
index 68f9f781925480d7fbec1ce4c0cb810f77c45468..e62e821c6b91a38eacbac26f522b08a3b6f42154 100644 (file)
@@ -130,6 +130,20 @@ ml_static_ptovirt(
 #endif
 } 
 
+vm_offset_t
+ml_static_slide(
+       vm_offset_t vaddr)
+{
+       return VM_KERNEL_SLIDE(vaddr);
+}
+
+vm_offset_t
+ml_static_unslide(
+       vm_offset_t vaddr)
+{
+       return VM_KERNEL_UNSLIDE(vaddr);
+}
+
 
 /*
  *     Routine:        ml_static_mfree
@@ -505,7 +519,7 @@ ml_processor_register(
     /* allocate and initialize other per-cpu structures */
     if (!boot_cpu) {
        mp_cpus_call_cpu_init(cpunum);
-       prng_cpu_init(cpunum);
+       early_random_cpu_init(cpunum);
     }
 
     /* output arg */
@@ -796,7 +810,7 @@ boolean_t ml_is64bit(void) {
 
 boolean_t ml_thread_is64bit(thread_t thread) {
   
-        return (thread_is_64bit(thread));
+        return (thread_is_64bit_addr(thread));
 }
 
 
@@ -873,20 +887,21 @@ kernel_preempt_check(void)
 
        assert(get_preemption_level() == 0);
 
-       __asm__ volatile("pushf; pop    %0" :  "=r" (flags));
-
-       intr = ((flags & EFL_IF) != 0);
-
-       if ((*ast_pending() & AST_URGENT) && intr == TRUE) {
+       if (__improbable(*ast_pending() & AST_URGENT)) {
                /*
                 * can handle interrupts and preemptions 
                 * at this point
                 */
+               __asm__ volatile("pushf; pop    %0" :  "=r" (flags));
+
+               intr = ((flags & EFL_IF) != 0);
 
                /*
                 * now cause the PRE-EMPTION trap
                 */
-               __asm__ volatile ("int %0" :: "N" (T_PREEMPT));
+               if (intr == TRUE){
+                       __asm__ volatile ("int %0" :: "N" (T_PREEMPT));
+               }
        }
 }
 
index da6db8347d09206935e348eaa438532bc17def4a..8020990521059e34a61a712cb95acb6ac7d7220f 100644 (file)
@@ -100,6 +100,12 @@ void ml_static_mfree(
 vm_offset_t ml_static_malloc(
        vm_size_t size);
 
+vm_offset_t ml_static_slide(
+       vm_offset_t vaddr);
+
+vm_offset_t ml_static_unslide(
+       vm_offset_t vaddr);
+
 /* virtual to physical on wired pages */
 vm_offset_t ml_vtophys(
        vm_offset_t vaddr);
index f1cd81ce4564d39dee9e81910be583b8db5c49d6..3edd363c7adca03e6aab6e0ebcd11f3bfc429ea6 100644 (file)
@@ -76,7 +76,7 @@ machine_task_set_state(
                case x86_DEBUG_STATE32:
                {
                        x86_debug_state32_t *tstate = (x86_debug_state32_t*) state;
-                       if ((task_has_64BitAddr(task)) || 
+                       if ((task_has_64Bit_addr(task)) ||
                                        (state_count != x86_DEBUG_STATE32_COUNT) || 
                                        (!debug_state_is_valid32(tstate))) {
                                return KERN_INVALID_ARGUMENT;
@@ -94,7 +94,7 @@ machine_task_set_state(
                {
                        x86_debug_state64_t *tstate = (x86_debug_state64_t*) state;
 
-                       if ((!task_has_64BitAddr(task)) || 
+                       if ((!task_has_64Bit_addr(task)) ||
                                        (state_count != x86_DEBUG_STATE64_COUNT) || 
                                        (!debug_state_is_valid64(tstate))) {
                                return KERN_INVALID_ARGUMENT;
@@ -118,7 +118,7 @@ machine_task_set_state(
 
                        if ((tstate->dsh.flavor == x86_DEBUG_STATE32) && 
                                        (tstate->dsh.count == x86_DEBUG_STATE32_COUNT) &&
-                                       (!task_has_64BitAddr(task)) &&
+                                       (!task_has_64Bit_addr(task)) &&
                                        debug_state_is_valid32(&tstate->uds.ds32)) {
                                
                                if (task->task_debug == NULL) {
@@ -130,7 +130,7 @@ machine_task_set_state(
 
                        } else if ((tstate->dsh.flavor == x86_DEBUG_STATE64) && 
                                        (tstate->dsh.count == x86_DEBUG_STATE64_COUNT) &&
-                                       task_has_64BitAddr(task) &&
+                                       task_has_64Bit_addr(task) &&
                                        debug_state_is_valid64(&tstate->uds.ds64)) {
                                
                                if (task->task_debug == NULL) {
@@ -161,7 +161,7 @@ machine_task_get_state(task_t task,
                {
                        x86_debug_state32_t *tstate = (x86_debug_state32_t*) state;
 
-                       if ((task_has_64BitAddr(task)) || (*state_count != x86_DEBUG_STATE32_COUNT)) {
+                       if ((task_has_64Bit_addr(task)) || (*state_count != x86_DEBUG_STATE32_COUNT)) {
                                return KERN_INVALID_ARGUMENT;
                        }
 
@@ -177,7 +177,7 @@ machine_task_get_state(task_t task,
                {
                        x86_debug_state64_t *tstate = (x86_debug_state64_t*) state;
 
-                       if ((!task_has_64BitAddr(task)) || (*state_count != x86_DEBUG_STATE64_COUNT)) {
+                       if ((!task_has_64Bit_addr(task)) || (*state_count != x86_DEBUG_STATE64_COUNT)) {
                                return KERN_INVALID_ARGUMENT;
                        }
 
@@ -196,7 +196,7 @@ machine_task_get_state(task_t task,
                        if (*state_count != x86_DEBUG_STATE_COUNT)
                                return(KERN_INVALID_ARGUMENT);
 
-                       if (task_has_64BitAddr(task)) {
+                       if (task_has_64Bit_addr(task)) {
                                tstate->dsh.flavor = x86_DEBUG_STATE64;
                                tstate->dsh.count  = x86_DEBUG_STATE64_COUNT;
 
@@ -270,7 +270,7 @@ machine_thread_inherit_taskwide(
                int flavor;
                mach_msg_type_number_t count;
 
-               if (task_has_64BitAddr(parent_task)) {
+               if (task_has_64Bit_addr(parent_task)) {
                        flavor = x86_DEBUG_STATE64;
                        count = x86_DEBUG_STATE64_COUNT;
                } else {
index 3b72326873570e426d4e4582ba47b5aeb13129fc..e5c83e895e7d275b65de87f41638398e888450b4 100644 (file)
@@ -572,7 +572,7 @@ cpu_signal_handler(x86_saved_state_t *regs)
 }
 
 extern void kprintf_break_lock(void);
-static int
+int
 NMIInterruptHandler(x86_saved_state_t *regs)
 {
        void            *stackptr;
@@ -690,7 +690,7 @@ NMI_cpus(void)
        uint64_t        tsc_timeout;
 
        intrs_enabled = ml_set_interrupts_enabled(FALSE);
-
+       NMIPI_enable(TRUE);
        for (cpu = 0; cpu < real_ncpus; cpu++) {
                if (!cpu_is_running(cpu))
                        continue;
@@ -707,6 +707,7 @@ NMI_cpus(void)
                }
                cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
        }
+       NMIPI_enable(FALSE);
 
        ml_set_interrupts_enabled(intrs_enabled);
 }
@@ -849,6 +850,13 @@ mp_rendezvous_action(__unused void *null)
        boolean_t       intrs_enabled;
        uint64_t        tsc_spin_start;
 
+       /*
+        * Note that mp_rv_lock was acquired by the thread that initiated the
+        * rendezvous and must have been acquired before we enter
+        * mp_rendezvous_action().
+        */
+       current_cpu_datap()->cpu_rendezvous_in_progress = TRUE;
+
        /* setup function */
        if (mp_rv_setup_func != NULL)
                mp_rv_setup_func(mp_rv_func_arg);
@@ -886,6 +894,8 @@ mp_rendezvous_action(__unused void *null)
        if (mp_rv_teardown_func != NULL)
                mp_rv_teardown_func(mp_rv_func_arg);
 
+       current_cpu_datap()->cpu_rendezvous_in_progress = FALSE;
+
        /* Bump completion count */
        atomic_incl(&mp_rv_complete, 1);
 }
@@ -909,7 +919,7 @@ mp_rendezvous(void (*setup_func)(void *),
        }
                
        /* obtain rendezvous lock */
-       (void) mp_safe_spin_lock(&mp_rv_lock);
+       mp_rendezvous_lock();
 
        /* set static function pointers */
        mp_rv_setup_func = setup_func;
@@ -948,6 +958,18 @@ mp_rendezvous(void (*setup_func)(void *),
        mp_rv_func_arg = NULL;
 
        /* release lock */
+       mp_rendezvous_unlock();
+}
+
+void
+mp_rendezvous_lock(void)
+{
+       (void) mp_safe_spin_lock(&mp_rv_lock);
+}
+
+void
+mp_rendezvous_unlock(void)
+{
        simple_unlock(&mp_rv_lock);
 }
 
@@ -1024,7 +1046,8 @@ mp_call_head_lock(mp_call_queue_t *cqp)
  */
 void
 NMIPI_panic(cpumask_t cpu_mask, NMI_reason_t why) {
-       unsigned int cpu, cpu_bit;
+       unsigned int cpu;
+       cpumask_t cpu_bit;
        uint64_t deadline;
 
        NMIPI_enable(TRUE);
@@ -1575,7 +1598,7 @@ mp_kdp_enter(boolean_t proceed_on_failure)
                }
                if (proceed_on_failure) {
                        if (mach_absolute_time() - start_time > 500000000ll) {
-                               kprintf("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
+                               paniclog_append_noflush("mp_kdp_enter() can't get x86_topo_lock! Debugging anyway! #YOLO\n");
                                break;
                        }
                        locked = simple_lock_try(&x86_topo_lock);
@@ -1648,13 +1671,16 @@ mp_kdp_enter(boolean_t proceed_on_failure)
                        NMIPI_enable(TRUE);
                }
                if (mp_kdp_ncpus != ncpus) {
-                       cpumask_t cpus_NMI_pending = 0;
-                       DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
+                       unsigned int wait_cycles = 0;
+                       if (proceed_on_failure)
+                               paniclog_append_noflush("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
+                       else
+                               DBG("mp_kdp_enter() timed-out on cpu %d, NMI-ing\n", my_cpu);
                        for (cpu = 0; cpu < real_ncpus; cpu++) {
                                if (cpu == my_cpu || !cpu_is_running(cpu))
                                        continue;
                                if (cpu_signal_pending(cpu, MP_KDP)) {
-                                       cpus_NMI_pending |= cpu_to_cpumask(cpu);
+                                       cpu_datap(cpu)->cpu_NMI_acknowledged = FALSE;
                                        cpu_NMI_interrupt(cpu);
                                }
                        }
@@ -1663,9 +1689,24 @@ mp_kdp_enter(boolean_t proceed_on_failure)
                        while (mp_kdp_ncpus != ncpus && rdtsc64() < tsc_timeout) {
                                handle_pending_TLB_flushes();
                                cpu_pause();
+                               ++wait_cycles;
                        }
                        if (mp_kdp_ncpus != ncpus) {
-                               kdb_printf("mp_kdp_enter(): %llu, %lu, %u TIMED-OUT WAITING FOR NMI-ACK, PROCEEDING\n", cpus_NMI_pending, mp_kdp_ncpus, ncpus);
+                               paniclog_append_noflush("mp_kdp_enter() NMI pending on cpus:");
+                               for (cpu = 0; cpu < real_ncpus; cpu++) {
+                                       if (cpu_is_running(cpu) && !cpu_datap(cpu)->cpu_NMI_acknowledged)
+                                               paniclog_append_noflush(" %d", cpu);
+                               }
+                               paniclog_append_noflush("\n");
+                               if (proceed_on_failure) {
+                                       paniclog_append_noflush("mp_kdp_enter() timed-out during %s wait after NMI; "
+                                           "expected %u acks but received %lu after %u loops in %llu ticks\n",
+                                            (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
+                               } else {
+                                       panic("mp_kdp_enter() timed-out during %s wait after NMI; "
+                                           "expected %u acks but received %lu after %u loops in %llu ticks",
+                                            (locked ? "locked" : "unlocked"), ncpus, mp_kdp_ncpus, wait_cycles, LockTimeOutTSC);
+                               }
                        }
                }
        }
@@ -1686,6 +1727,22 @@ mp_kdp_enter(boolean_t proceed_on_failure)
        postcode(MP_KDP_ENTER);
 }
 
+boolean_t
+mp_kdp_all_cpus_halted()
+{
+       unsigned int ncpus = 0, cpu = 0, my_cpu = 0;
+
+       my_cpu = cpu_number();
+       ncpus = 1; /* current CPU */
+       for (cpu = 0; cpu < real_ncpus; cpu++) {
+               if (cpu == my_cpu || !cpu_is_running(cpu))
+                       continue;
+               ncpus++;
+       }
+
+       return (mp_kdp_ncpus == ncpus);
+}
+
 static boolean_t
 cpu_signal_pending(int cpu, mp_event_t event)
 {
index 6f46c5d4a9a5255fc150780cf2f6f7b4b70a91ba..705f41c189b92bf86fdf8bd26ba2db536646389f 100644 (file)
@@ -112,8 +112,9 @@ extern  uint32_t spinlock_timeout_NMI(uintptr_t thread_addr);
 
 extern uint64_t        LastDebuggerEntryAllowance;
 
-extern void    mp_kdp_enter(boolean_t proceed_on_failure);
-extern void    mp_kdp_exit(void);
+extern void      mp_kdp_enter(boolean_t proceed_on_failure);
+extern void      mp_kdp_exit(void);
+extern boolean_t mp_kdp_all_cpus_halted(void);
 
 extern boolean_t       mp_recent_debugger_activity(void);
 extern void    kernel_spin(uint64_t spin_ns);
@@ -130,6 +131,8 @@ extern void mp_rendezvous_no_intrs(
                void (*action_func)(void *),
                void *arg);
 extern void mp_rendezvous_break_lock(void);
+extern void mp_rendezvous_lock(void);
+extern void mp_rendezvous_unlock(void);
 
 /*
  * All cpu broadcast.
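mp_rendezvous_lock() and mp_rendezvous_unlock(), exported in the hunk above, simply wrap the existing mp_rv_lock spinlock; the mp.c hunk earlier in this commit shows mp_rendezvous() itself now bracketing its setup and teardown with them. A hedged sketch of an external caller follows; the function name is hypothetical and not part of the commit, and it assumes the declarations added above are in scope.

/*
 * Hypothetical caller, NOT part of this commit. Holding the rendezvous lock
 * keeps any new mp_rendezvous() from starting until it is released, because
 * mp_rendezvous() takes the same lock before doing its work.
 */
static void
hypothetical_without_rendezvous(void (*fn)(void *), void *arg)
{
	mp_rendezvous_lock();
	fn(arg);                /* work that must not race a rendezvous */
	mp_rendezvous_unlock();
}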
index 788e71663931347b217f1dd0b3550d4f6c7cb776..78c9e11d09c144c23d1054f13ebf51751b3dc1e7 100644 (file)
@@ -646,7 +646,7 @@ cpu_data_alloc(boolean_t is_boot_cpu)
         * started.
         */
        cdp->cpu_active_thread = (thread_t) (uintptr_t) cdp->cpu_number;
-
+       cdp->cpu_NMI_acknowledged = TRUE;
        cdp->cpu_nanotime = &pal_rtc_nanotime_info;
 
        kprintf("cpu_data_alloc(%d) %p desc_table: %p "
index 9a2ca43901462d59ef4e8f2978ae1d2fb245d5b2..9f1471f36940c00e8bed982b2153265d7ab45a62 100644 (file)
@@ -721,6 +721,46 @@ get_thread_state64(thread_t thread, x86_thread_state64_t *ts)
        ts->gs = saved_state->gs;
 }
 
+kern_return_t
+machine_thread_state_convert_to_user(
+                        __unused thread_t thread,
+                        __unused thread_flavor_t flavor,
+                        __unused thread_state_t tstate,
+                        __unused mach_msg_type_number_t *count)
+{
+       // No conversion to userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_state_convert_from_user(
+                        __unused thread_t thread,
+                        __unused thread_flavor_t flavor,
+                        __unused thread_state_t tstate,
+                        __unused mach_msg_type_number_t count)
+{
+       // No conversion from userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_siguctx_pointer_convert_to_user(
+                        __unused thread_t thread,
+                        __unused user_addr_t *uctxp)
+{
+       // No conversion to userspace representation on this platform
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+machine_thread_function_pointers_convert_from_user(
+                        __unused thread_t thread,
+                        __unused user_addr_t *fptrs,
+                        __unused uint32_t count)
+{
+       // No conversion from userspace representation on this platform
+       return KERN_SUCCESS;
+}
 
 /*
  *     act_machine_set_state:
@@ -744,7 +784,7 @@ machine_thread_set_state(
                if (count < x86_SAVED_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
         
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_saved_state32_t *) tstate;
@@ -809,7 +849,7 @@ machine_thread_set_state(
                if (count < x86_SAVED_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_saved_state64_t *) tstate;
@@ -879,7 +919,7 @@ machine_thread_set_state(
                if (count != _MachineStateCount[flavor])
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                return fpu_set_fxstate(thr_act, tstate, flavor);
@@ -894,7 +934,7 @@ machine_thread_set_state(
                if (count != _MachineStateCount[flavor])
                        return(KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                return fpu_set_fxstate(thr_act, tstate, flavor);
@@ -909,11 +949,11 @@ machine_thread_set_state(
 
                state = (x86_float_state_t *)tstate;
                if (state->fsh.flavor == x86_FLOAT_STATE64 && state->fsh.count == x86_FLOAT_STATE64_COUNT &&
-                   thread_is_64bit(thr_act)) {
+                   thread_is_64bit_addr(thr_act)) {
                        return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.fs64, x86_FLOAT_STATE64);
                }
                if (state->fsh.flavor == x86_FLOAT_STATE32 && state->fsh.count == x86_FLOAT_STATE32_COUNT &&
-                   !thread_is_64bit(thr_act)) {
+                   !thread_is_64bit_addr(thr_act)) {
                        return fpu_set_fxstate(thr_act, (thread_state_t)&state->ufs.fs32, x86_FLOAT_STATE32); 
                }
                return(KERN_INVALID_ARGUMENT);
@@ -934,7 +974,7 @@ machine_thread_set_state(
                /* 64-bit flavor? */
                if (state->ash.flavor == (flavor - 1) &&
                    state->ash.count  == _MachineStateCount[flavor - 1] &&
-                   thread_is_64bit(thr_act)) {
+                   thread_is_64bit_addr(thr_act)) {
                        return fpu_set_fxstate(thr_act,
                                               (thread_state_t)&state->ufs.as64,
                                               flavor - 1);
@@ -942,7 +982,7 @@ machine_thread_set_state(
                /* 32-bit flavor? */
                if (state->ash.flavor == (flavor - 2) &&
                    state->ash.count  == _MachineStateCount[flavor - 2] &&
-                   !thread_is_64bit(thr_act)) {
+                   !thread_is_64bit_addr(thr_act)) {
                        return fpu_set_fxstate(thr_act,
                                               (thread_state_t)&state->ufs.as32,
                                               flavor - 2); 
@@ -955,7 +995,7 @@ machine_thread_set_state(
                if (count != x86_THREAD_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                return set_thread_state32(thr_act, (x86_thread_state32_t *)tstate);
@@ -966,7 +1006,7 @@ machine_thread_set_state(
                if (count != x86_THREAD_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                return set_thread_state64(thr_act, (x86_thread_state64_t *)tstate);
@@ -983,11 +1023,11 @@ machine_thread_set_state(
 
                if (state->tsh.flavor == x86_THREAD_STATE64 &&
                    state->tsh.count == x86_THREAD_STATE64_COUNT &&
-                   thread_is_64bit(thr_act)) {
+                   thread_is_64bit_addr(thr_act)) {
                        return set_thread_state64(thr_act, &state->uts.ts64);
                } else if (state->tsh.flavor == x86_THREAD_STATE32 &&
                           state->tsh.count == x86_THREAD_STATE32_COUNT &&
-                          !thread_is_64bit(thr_act)) {
+                          !thread_is_64bit_addr(thr_act)) {
                        return set_thread_state32(thr_act, &state->uts.ts32);
                } else
                        return(KERN_INVALID_ARGUMENT);
@@ -997,7 +1037,7 @@ machine_thread_set_state(
                x86_debug_state32_t *state;
                kern_return_t ret;
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_debug_state32_t *)tstate;
@@ -1011,7 +1051,7 @@ machine_thread_set_state(
                x86_debug_state64_t *state;
                kern_return_t ret;
 
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_debug_state64_t *)tstate;
@@ -1031,13 +1071,13 @@ machine_thread_set_state(
                state = (x86_debug_state_t *)tstate;
                if (state->dsh.flavor == x86_DEBUG_STATE64 &&
                                state->dsh.count == x86_DEBUG_STATE64_COUNT &&
-                               thread_is_64bit(thr_act)) {
+                               thread_is_64bit_addr(thr_act)) {
                        ret = set_debug_state64(thr_act, &state->uds.ds64);
                }
                else
                        if (state->dsh.flavor == x86_DEBUG_STATE32 &&
                            state->dsh.count == x86_DEBUG_STATE32_COUNT &&
-                           !thread_is_64bit(thr_act)) {
+                           !thread_is_64bit_addr(thr_act)) {
                                ret = set_debug_state32(thr_act, &state->uds.ds32);
                }
                return ret;
@@ -1135,7 +1175,7 @@ machine_thread_get_state(
                if (*count < x86_SAVED_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_saved_state32_t *) tstate;
@@ -1162,7 +1202,7 @@ machine_thread_get_state(
                if (*count < x86_SAVED_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                state = (x86_saved_state64_t *)tstate;
@@ -1184,7 +1224,7 @@ machine_thread_get_state(
                if (*count < x86_FLOAT_STATE32_COUNT) 
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_FLOAT_STATE32_COUNT;
@@ -1197,7 +1237,7 @@ machine_thread_get_state(
                if (*count < x86_FLOAT_STATE64_COUNT) 
                        return(KERN_INVALID_ARGUMENT);
 
-               if ( !thread_is_64bit(thr_act))
+               if ( !thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_FLOAT_STATE64_COUNT;
@@ -1219,7 +1259,7 @@ machine_thread_get_state(
                 * no need to bzero... currently 
                 * x86_FLOAT_STATE64_COUNT == x86_FLOAT_STATE32_COUNT
                 */
-               if (thread_is_64bit(thr_act)) {
+               if (thread_is_64bit_addr(thr_act)) {
                        state->fsh.flavor = x86_FLOAT_STATE64;
                        state->fsh.count  = x86_FLOAT_STATE64_COUNT;
 
@@ -1243,7 +1283,7 @@ machine_thread_get_state(
                if (*count != _MachineStateCount[flavor])
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = _MachineStateCount[flavor];
@@ -1259,7 +1299,7 @@ machine_thread_get_state(
                if (*count != _MachineStateCount[flavor])
                        return(KERN_INVALID_ARGUMENT);
 
-               if ( !thread_is_64bit(thr_act))
+               if ( !thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = _MachineStateCount[flavor];
@@ -1283,7 +1323,7 @@ machine_thread_get_state(
 
                bzero((char *)state, *count * sizeof(int));
 
-               if (thread_is_64bit(thr_act)) {
+               if (thread_is_64bit_addr(thr_act)) {
                        flavor -= 1;    /* 64-bit flavor */
                        fstate = (thread_state_t) &state->ufs.as64;
                } else {
@@ -1301,7 +1341,7 @@ machine_thread_get_state(
                if (*count < x86_THREAD_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_THREAD_STATE32_COUNT;
@@ -1315,7 +1355,7 @@ machine_thread_get_state(
                if (*count < x86_THREAD_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-                if ( !thread_is_64bit(thr_act))
+                if ( !thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_THREAD_STATE64_COUNT;
@@ -1335,7 +1375,7 @@ machine_thread_get_state(
 
                bzero((char *)state, sizeof(x86_thread_state_t));
 
-               if (thread_is_64bit(thr_act)) {
+               if (thread_is_64bit_addr(thr_act)) {
                        state->tsh.flavor = x86_THREAD_STATE64;
                        state->tsh.count  = x86_THREAD_STATE64_COUNT;
 
@@ -1357,7 +1397,7 @@ machine_thread_get_state(
                if (*count < x86_EXCEPTION_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_EXCEPTION_STATE32_COUNT;
@@ -1376,7 +1416,7 @@ machine_thread_get_state(
                if (*count < x86_EXCEPTION_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if ( !thread_is_64bit(thr_act))
+               if ( !thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                *count = x86_EXCEPTION_STATE64_COUNT;
@@ -1401,7 +1441,7 @@ machine_thread_get_state(
 
                bzero((char *)state, sizeof(x86_exception_state_t));
 
-               if (thread_is_64bit(thr_act)) {
+               if (thread_is_64bit_addr(thr_act)) {
                        state->esh.flavor = x86_EXCEPTION_STATE64;
                        state->esh.count  = x86_EXCEPTION_STATE64_COUNT;
 
@@ -1421,7 +1461,7 @@ machine_thread_get_state(
                if (*count < x86_DEBUG_STATE32_COUNT)
                        return(KERN_INVALID_ARGUMENT);
 
-               if (thread_is_64bit(thr_act))
+               if (thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                get_debug_state32(thr_act, (x86_debug_state32_t *)tstate);
@@ -1435,7 +1475,7 @@ machine_thread_get_state(
                if (*count < x86_DEBUG_STATE64_COUNT)
                        return(KERN_INVALID_ARGUMENT);
                
-               if (!thread_is_64bit(thr_act))
+               if (!thread_is_64bit_addr(thr_act))
                        return(KERN_INVALID_ARGUMENT);
 
                get_debug_state64(thr_act, (x86_debug_state64_t *)tstate);
@@ -1455,7 +1495,7 @@ machine_thread_get_state(
 
                bzero(state, sizeof *state);
 
-               if (thread_is_64bit(thr_act)) {
+               if (thread_is_64bit_addr(thr_act)) {
                        state->dsh.flavor = x86_DEBUG_STATE64;
                        state->dsh.count  = x86_DEBUG_STATE64_COUNT;
 
@@ -1661,7 +1701,7 @@ machine_thread_switch_addrmode(thread_t thread)
        machine_thread_create(thread, thread->task);
 
        /* Adjust FPU state */
-       fpu_switch_addrmode(thread, task_has_64BitAddr(thread->task));
+       fpu_switch_addrmode(thread, task_has_64Bit_addr(thread->task));
 
        /* If we're switching ourselves, reset the pcb addresses etc. */
        if (thread == current_thread()) {
@@ -1711,7 +1751,7 @@ get_useraddr(void)
 {
         thread_t thr_act = current_thread();
  
-        if (thread_is_64bit(thr_act)) {
+        if (thread_is_64bit_addr(thr_act)) {
                x86_saved_state64_t     *iss64;
                
                iss64 = USER_REGS64(thr_act);
@@ -1849,7 +1889,7 @@ act_thread_csave(void)
        mach_msg_type_number_t val;
        thread_t thr_act = current_thread();
 
-       if (thread_is_64bit(thr_act)) {
+       if (thread_is_64bit_addr(thr_act)) {
                struct x86_act_context64 *ic64;
 
                ic64 = (struct x86_act_context64 *)kalloc(sizeof(struct x86_act_context64));
@@ -1929,7 +1969,7 @@ act_thread_catt(void *ctx)
        if (ctx == (void *)NULL)
                                return;
 
-        if (thread_is_64bit(thr_act)) {
+        if (thread_is_64bit_addr(thr_act)) {
                struct x86_act_context64 *ic64;
 
                ic64 = (struct x86_act_context64 *)ctx;
index 34df7c11932c2d5fa78822a2c2941602ae98c345..748bde0490accb322636352d4f3a6111ddebe0d2 100644 (file)
@@ -374,7 +374,7 @@ machine_thread_create(
 
        bzero((char *)pcb->iss, sizeof(x86_saved_state_t));
 
-        if (task_has_64BitAddr(task)) {
+        if (task_has_64Bit_addr(task)) {
                pcb->iss->flavor = x86_SAVED_STATE64;
 
                pcb->iss->ss_64.isf.cs = USER64_CS;
@@ -447,7 +447,7 @@ machine_thread_set_tsd_base(
                return KERN_INVALID_ARGUMENT;
        }
 
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                /* check for canonical address, set 0 otherwise  */
                if (!IS_USERADDR64_CANONICAL(tsd_base))
                        tsd_base = 0ULL;
@@ -459,7 +459,7 @@ machine_thread_set_tsd_base(
        pcb_t pcb = THREAD_TO_PCB(thread);
        pcb->cthread_self = tsd_base;
 
-       if (!thread_is_64bit(thread)) {
+       if (!thread_is_64bit_addr(thread)) {
                /* Set up descriptor for later use */
                struct real_descriptor desc = {
                        .limit_low = 1,
@@ -478,7 +478,7 @@ machine_thread_set_tsd_base(
        /* For current thread, make the TSD base active immediately */
        if (thread == current_thread()) {
 
-               if (thread_is_64bit(thread)) {
+               if (thread_is_64bit_addr(thread)) {
                        cpu_data_t              *cdp;
 
                        mp_disable_preemption();
index 3458ce7cf57806b6f60624d5275fc5944582e3fe..076b69aa3884269a7b61f73858a2aae70d8821b1 100644 (file)
@@ -78,7 +78,6 @@
 #include <kern/kern_types.h>
 #include <kern/thread.h>
 #include <kern/simple_lock.h>
-#include <mach/branch_predicates.h>
 
 #include <i386/mp.h>
 #include <i386/proc_reg.h>
@@ -433,6 +432,10 @@ extern boolean_t pmap_ept_support_ad;
 #define PMAP_ACTIVATE_CACHE    4
 #define PMAP_NO_GUARD_CACHE    8
 
+/* Per-pmap ledger operations */
+#define        pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a)
+#define        pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a)
+
 #ifndef        ASSEMBLER
 
 #include <sys/queue.h>
@@ -542,6 +545,7 @@ struct pmap {
        pmap_paddr_t    pm_eptp;        /* EPTP */
        ledger_t        ledger;         /* ledger tracking phys mappings */
 #if MACH_ASSERT
+       boolean_t       pmap_stats_assert;
        int             pmap_pid;
        char            pmap_procname[17];
 #endif /* MACH_ASSERT */
@@ -618,7 +622,8 @@ set_dirbase(pmap_t tpmap, thread_t thread, int my_cpu) {
        cpu_datap(ccpu)->cpu_ucr3 = ucr3;
        cpu_shadowp(ccpu)->cpu_ucr3 = ucr3;
 
-       cpu_datap(ccpu)->cpu_task_map = tpmap->pm_task_map;
+       cpu_datap(ccpu)->cpu_task_map = cpu_shadowp(ccpu)->cpu_task_map =
+           tpmap->pm_task_map;
 
        assert((get_preemption_level() > 0) || (ml_get_interrupts_enabled() == FALSE));
        assert(ccpu == cpu_number());
index 4ddabaa2049b5df0a7eee6adf2c7843357a7d100..1a7c75e3205e13501d4a89342fccea99471d2e11 100644 (file)
@@ -917,10 +917,6 @@ pmap_pv_is_altacct(
        pvhash_idx = pvhashidx(pmap, vaddr);
        LOCK_PV_HASH(pvhash_idx);
        pvh_e = *(pvhash(pvhash_idx));
-       if (PV_HASHED_ENTRY_NULL == pvh_e) {
-               panic("Possible memory corruption: pmap_pv_is_altacct(%p,0x%llx,0x%x): empty hash",
-                     pmap, vaddr, ppn);
-       }
        while (PV_HASHED_ENTRY_NULL != pvh_e) {
                if (pvh_e->pmap == pmap &&
                    PVE_VA(pvh_e) == vaddr &&
index b66630233d686a24a621d04f9c05173c34c10227..443b972175512104d58be0f95c246919cf365881 100644 (file)
@@ -1969,12 +1969,14 @@ phys_attribute_clear(
                                        pte_bits &= ept_bits_to_clear;
                                }
                        }
+                       if (options & PMAP_OPTIONS_CLEAR_WRITE)
+                               pte_bits |= PTE_WRITE(is_ept);
 
                         /*
                          * Clear modify and/or reference bits.
                          */
                        if (pte_bits) {
-                               pmap_update_pte(pte, bits, 0);
+                               pmap_update_pte(pte, pte_bits, 0);
 
                                /* Ensure all processors using this translation
                                 * invalidate this TLB entry. The invalidation
@@ -2472,13 +2474,15 @@ done:
        return KERN_SUCCESS;
 }
 
-void pmap_set_jit_entitled(__unused pmap_t pmap)
+void
+pmap_set_jit_entitled(__unused pmap_t pmap)
 {
        /* The x86 pmap layer does not care if a map has a JIT entry. */
        return;
 }
 
-bool pmap_has_prot_policy(__unused vm_prot_t prot)
+bool
+pmap_has_prot_policy(__unused vm_prot_t prot)
 {
        /*
         * The x86 pmap layer does not apply any policy to any protection
@@ -2487,8 +2491,43 @@ bool pmap_has_prot_policy(__unused vm_prot_t prot)
        return FALSE;
 }
 
-void pmap_release_pages_fast(void)
+uint64_t
+pmap_release_pages_fast(void)
+{
+       return 0;
+}
+
+void
+pmap_trim(__unused pmap_t grand, __unused pmap_t subord, __unused addr64_t vstart, __unused addr64_t nstart, __unused uint64_t size)
 {
        return;
 }
 
+void pmap_ledger_alloc_init(size_t size)
+{
+       panic("%s: unsupported, "
+             "size=%lu",
+             __func__, size);
+}
+
+ledger_t pmap_ledger_alloc(void)
+{
+       panic("%s: unsupported",
+             __func__);
+
+       return NULL;
+}
+
+void pmap_ledger_free(ledger_t ledger)
+{
+       panic("%s: unsupported, "
+             "ledger=%p",
+             __func__, ledger);
+}
+
+size_t
+pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused)
+{
+       return (size_t)-1;
+}
+
index 9c0174a434432d3718e4fd7ec3c83e2b6dfa908c..ef03460db8e344287e756a1393f44427c7a79b7d 100644 (file)
@@ -93,6 +93,7 @@
 #include <kern/telemetry.h>
 #endif
 #include <sys/kdebug.h>
+#include <kperf/kperf.h>
 #include <prng/random.h>
 
 #include <string.h>
@@ -140,7 +141,7 @@ thread_syscall_return(
 
        pal_register_cache_state(thr_act, DIRTY);
 
-        if (thread_is_64bit(thr_act)) {
+        if (thread_is_64bit_addr(thr_act)) {
                x86_saved_state64_t     *regs;
                
                regs = USER_REGS64(thr_act);
@@ -211,7 +212,7 @@ user_page_fault_continue(
        thread_t        thread = current_thread();
        user_addr_t     vaddr;
 
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                x86_saved_state64_t     *uregs;
 
                uregs = USER_REGS64(thread);
@@ -383,7 +384,7 @@ interrupt(x86_saved_state_t *state)
 
 #if CONFIG_TELEMETRY
        if (telemetry_needs_record) {
-               telemetry_mark_curthread(user_mode);
+               telemetry_mark_curthread(user_mode, FALSE);
        }
 #endif
 
@@ -454,13 +455,16 @@ interrupt(x86_saved_state_t *state)
                                (long) depth, (long) VM_KERNEL_UNSLIDE(rip), 0, 0, 0);
                }
        }
-       
+
        if (cnum == master_cpu)
                ml_entropy_collect();
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, 
-               MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END,
-               interrupt_num, 0, 0, 0, 0);
+#if KPERF
+       kperf_interrupt();
+#endif /* KPERF */
+
+       KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_EXCP_INTR, 0) | DBG_FUNC_END,
+                       interrupt_num);
 
        assert(ml_get_interrupts_enabled() == FALSE);
 }
@@ -884,8 +888,8 @@ user_trap(
        user_addr_t             rip;
        unsigned long           dr6 = 0; /* 32 bit for i386, 64 bit for x86_64 */
 
-       assert((is_saved_state32(saved_state) && !thread_is_64bit(thread)) ||
-              (is_saved_state64(saved_state) &&  thread_is_64bit(thread)));
+       assert((is_saved_state32(saved_state) && !thread_is_64bit_addr(thread)) ||
+              (is_saved_state64(saved_state) &&  thread_is_64bit_addr(thread)));
 
        if (is_saved_state64(saved_state)) {
                x86_saved_state64_t     *regs;
@@ -967,7 +971,7 @@ user_trap(
                                 * because the high order bits are not
                                 * used on x86_64
                                 */
-                               if (thread_is_64bit(thread)) {
+                               if (thread_is_64bit_addr(thread)) {
                                        x86_debug_state64_t *ids = pcb->ids;
                                        ids->dr6 = dr6;
                                } else { /* 32 bit thread */
@@ -1249,11 +1253,11 @@ extern void     thread_exception_return_internal(void) __dead2;
 void thread_exception_return(void) {
        thread_t thread = current_thread();
        ml_set_interrupts_enabled(FALSE);
-       if (thread_is_64bit(thread) != task_has_64BitAddr(thread->task)) {
-               panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit(thread),  task_has_64BitAddr(thread->task));
+       if (thread_is_64bit_addr(thread) != task_has_64Bit_addr(thread->task)) {
+               panic("Task/thread bitness mismatch %p %p, task: %d, thread: %d", thread, thread->task, thread_is_64bit_addr(thread),  task_has_64Bit_addr(thread->task));
        }
 
-       if (thread_is_64bit(thread)) {
+       if (thread_is_64bit_addr(thread)) {
                if ((gdt_desc_p(USER64_CS)->access & ACC_PL_U) == 0) {
                        panic("64-GDT mismatch %p, descriptor: %p", thread, gdt_desc_p(USER64_CS));
                }
index 288ea6a5801857470956296ef1c8c343f6932734..facaf2af0d0bf61f3f74ca090f9372a556fe2a22 100644 (file)
@@ -79,6 +79,7 @@
 #include <ipc/ipc_table.h>
 #include <ipc/ipc_port.h>
 #include <string.h>
+#include <sys/kdebug.h>
 
 /*
  *     Routine:        ipc_entry_lookup
@@ -489,6 +490,14 @@ ipc_entry_modified(
                space->is_low_mod = index;
        if (index > space->is_high_mod)
                space->is_high_mod = index;
+
+       KERNEL_DEBUG_CONSTANT(
+               MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_PORT_ENTRY_MODIFY) | DBG_FUNC_NONE,
+               space->is_task ? task_pid(space->is_task) : 0,
+               name,
+               entry->ie_bits,
+               0,
+               0);
 }
 
 #define IPC_ENTRY_GROW_STATS 1
index f89005cf35632a3e26348542bd306026a3f96767..7ab4eb35568decf25798fda1c32439c7b71af73c 100644 (file)
@@ -86,11 +86,6 @@ static lck_spin_t ipc_importance_lock_data;  /* single lock for now */
        lck_spin_unlock(&ipc_importance_lock_data)
 #define ipc_importance_assert_held() \
        lck_spin_assert(&ipc_importance_lock_data, LCK_ASSERT_OWNED)
-#define ipc_importance_sleep(elem) lck_spin_sleep(&ipc_importance_lock_data,   \
-                                       LCK_SLEEP_DEFAULT,                      \
-                                       (event_t)(elem),                        \
-                                       THREAD_UNINT)
-#define ipc_importance_wakeup(elem) thread_wakeup((event_t)(elem))
 
 #if IIE_REF_DEBUG
 #define incr_ref_counter(x) (hw_atomic_add(&(x), 1))
@@ -1660,7 +1655,7 @@ ipc_importance_task_mark_live_donor(ipc_importance_task_t task_imp, boolean_t li
 }
 
 /*
- *     Routine:        ipc_importance_task_marked_live_donor
+ *     Routine:        ipc_importance_task_is_marked_live_donor
  *     Purpose:
  *             Query the live donor and donor flags for the given task importance.
  *     Conditions:
@@ -2155,9 +2150,6 @@ ipc_importance_exec_switch_task(
 
        /* Create an importance linkage from old_task to new_task */
        inherit = ipc_importance_inherit_from_task(old_task, new_task);
-       if (inherit == III_NULL) {
-               return inherit;
-       }
 
        /* Switch task importance base from old task to new task */
        ipc_importance_lock();
@@ -2214,9 +2206,7 @@ ipc_importance_check_circularity(
        boolean_t imp_lock_held = FALSE;
        int assertcnt = 0;
        ipc_port_t base;
-       sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0};
-       sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0};
-       boolean_t update_knote = FALSE;
+       struct turnstile *send_turnstile = TURNSTILE_NULL;
 
        assert(port != IP_NULL);
        assert(dest != IP_NULL);
@@ -2225,6 +2215,9 @@ ipc_importance_check_circularity(
                return TRUE;
        base = dest;
 
+       /* Check if destination needs a turnstile */
+       ipc_port_send_turnstile_prepare(dest);
+
        /* port is in limbo, so donation status is safe to latch */
        if (port->ip_impdonation != 0) {
                imp_lock_held = TRUE;
@@ -2302,22 +2295,24 @@ ipc_importance_check_circularity(
                assert(port->ip_receiver_name == MACH_PORT_NULL);
                assert(port->ip_destination == IP_NULL);
 
-               while (dest != IP_NULL) {
+               base = dest;
+               while (base != IP_NULL) {
                        ipc_port_t next;
 
-                       /* dest is in transit or in limbo */
+                       /* base is in transit or in limbo */
 
-                       assert(ip_active(dest));
-                       assert(dest->ip_receiver_name == MACH_PORT_NULL);
+                       assert(ip_active(base));
+                       assert(base->ip_receiver_name == MACH_PORT_NULL);
 
-                       next = dest->ip_destination;
-                       ip_unlock(dest);
-                       dest = next;
+                       next = base->ip_destination;
+                       ip_unlock(base);
+                       base = next;
                }
 
                if (imp_lock_held)
                        ipc_importance_unlock();
 
+               ipc_port_send_turnstile_complete(dest);
                return TRUE;
        }
 
@@ -2331,9 +2326,8 @@ ipc_importance_check_circularity(
        ipc_port_multiple_unlock();
 
 not_circular:
-       imq_lock(&base->ip_messages);
-
        /* port is in limbo */
+       imq_lock(&port->ip_messages);
 
        assert(ip_active(port));
        assert(port->ip_receiver_name == MACH_PORT_NULL);
@@ -2359,10 +2353,22 @@ not_circular:
        /* take the port out of limbo w.r.t. assertions */
        port->ip_tempowner = 0;
 
-       /* Capture the sync qos count delta */
-       for (int i = 0; i < THREAD_QOS_LAST; i++) {
-               sync_qos_delta_add[i] = port_sync_qos(port, i);
+       /*
+        * Set up linkage for the source port if it has a send turnstile, i.e. it has
+        * a thread waiting in send, has a port enqueued in it, or has a sync ipc
+        * push from a special reply port.
+        */
+       if (port_send_turnstile(port)) {
+               send_turnstile = turnstile_prepare((uintptr_t)port,
+                       port_send_turnstile_address(port),
+                       TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
+
+               turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest),
+                       (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
+
+               /* update complete and turnstile complete called after dropping all locks */
        }
+       imq_unlock(&port->ip_messages);
 
        /* now unlock chain */
 
@@ -2370,9 +2376,9 @@ not_circular:
 
        for (;;) {
 
+               ipc_port_t next;
                /* every port along chain track assertions behind it */
                ipc_port_impcount_delta(dest, assertcnt, base);
-               update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub);
 
                if (dest == base)
                        break;
@@ -2384,9 +2390,9 @@ not_circular:
                assert(dest->ip_destination != IP_NULL);
                assert(dest->ip_tempowner == 0);
 
-               port = dest->ip_destination;
+               next = dest->ip_destination;
                ip_unlock(dest);
-               dest = port;
+               dest = next;
        }
 
        /* base is not in transit */
@@ -2425,10 +2431,6 @@ not_circular:
                }
        }
 
-       if (update_knote) {
-               KNOTE(&base->ip_messages.imq_klist, 0);
-       }
-       imq_unlock(&base->ip_messages);
        ip_unlock(base);
 
        /*
@@ -2457,6 +2459,18 @@ not_circular:
        if (imp_lock_held)
                ipc_importance_unlock();
 
+       /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
+               /* Take the mq lock to call turnstile complete */
+               imq_lock(&port->ip_messages);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
+               send_turnstile = TURNSTILE_NULL;
+               imq_unlock(&port->ip_messages);
+               turnstile_cleanup();
+       }
+
        if (imp_task != IIT_NULL)
                ipc_importance_task_release(imp_task);
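The rewritten ipc_importance_check_circularity() above drops the sync-qos delta bookkeeping in favour of a send turnstile: turnstile_prepare() and turnstile_update_inheritor() run while the message-queue lock is held, and turnstile_update_inheritor_complete(), turnstile_complete() and turnstile_cleanup() run only after every lock has been dropped. A minimal sketch of that lock-then-complete ordering follows; it uses a pthread mutex and mock types rather than the xnu turnstile API, so it only illustrates the discipline, not the implementation.

#include <pthread.h>
#include <stdio.h>

struct mock_turnstile { int inheritor; };

static pthread_mutex_t port_lock = PTHREAD_MUTEX_INITIALIZER;
static struct mock_turnstile send_ts;

/* Record the new inheritor while the port is locked; no propagation yet. */
static void mock_update_inheritor(struct mock_turnstile *ts, int who) {
    ts->inheritor = who;
}

/* Finish the priority propagation once every lock has been dropped. */
static void mock_update_inheritor_complete(struct mock_turnstile *ts) {
    printf("propagating push to inheritor %d\n", ts->inheritor);
}

int main(void) {
    pthread_mutex_lock(&port_lock);             /* analogue of imq_lock(&port->ip_messages) */
    mock_update_inheritor(&send_ts, 42);
    pthread_mutex_unlock(&port_lock);           /* drop all locks first... */
    mock_update_inheritor_complete(&send_ts);   /* ...then complete the update */
    return 0;
}
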
 
index 8d5ea071f771fa8fbb3b287f74b40b32b10a9624..d8e0917e756c69903bee2a8fe9944f640a36fe34 100644 (file)
@@ -87,6 +87,7 @@
 #include <kern/misc_protos.h>
 #include <kern/sync_lock.h>
 #include <kern/sync_sema.h>
+#include <kern/ux_handler.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
 
@@ -131,8 +132,6 @@ lck_attr_t          ipc_lck_attr;
 
 static lck_grp_attr_t  ipc_lck_grp_attr;
 
-extern void ikm_cache_init(void);
-
 /*
  *     Routine:        ipc_bootstrap
  *     Purpose:
@@ -190,6 +189,7 @@ ipc_bootstrap(void)
                              IKM_SAVED_KMSG_SIZE,
                              "ipc kmsgs");
        zone_change(ipc_kmsg_zone, Z_CALLERACCT, FALSE);
+       zone_change(ipc_kmsg_zone, Z_CACHING_ENABLED, TRUE);
 
        /* create special spaces */
 
@@ -275,6 +275,8 @@ ipc_init(void)
        msg_ool_size_small -= cpy_kdata_hdr_sz;
 
        ipc_host_init();
+       ux_handler_init();
+
 }
 
 
@@ -291,3 +293,4 @@ ipc_thread_call_init(void)
        ipc_importance_thread_call_init();
 #endif
 }
+
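Setting Z_CACHING_ENABLED on ipc_kmsg_zone above asks the zone allocator to cache freed kmsg buffers itself, which is what makes the hand-rolled per-CPU ikm_cache removed from ipc_kmsg_alloc()/ipc_kmsg_free() later in this diff redundant. A rough user-space analogue of the retired freelist pattern follows, assuming a single fixed buffer size; all names are invented and this is not the zalloc API.

#include <stdlib.h>

#define STASH_SIZE 4
#define MSG_SIZE   256   /* stand-in for one fixed size, like IKM_SAVED_MSG_SIZE */

static void *stash[STASH_SIZE];
static unsigned avail;

static void *cached_alloc(void) {
    if (avail > 0)
        return stash[--avail];     /* reuse a recently freed buffer */
    return malloc(MSG_SIZE);
}

static void cached_free(void *p) {
    if (avail < STASH_SIZE) {
        stash[avail++] = p;        /* keep it around for the next sender */
        return;
    }
    free(p);
}

int main(void) {
    void *msg = cached_alloc();
    cached_free(msg);
    msg = cached_alloc();          /* satisfied from the stash, not malloc */
    free(msg);
    return 0;
}
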
index 736fe824c8d75eb4c70a54196fa9c3af3b1f4ff8..81776c7291b4b6a9d52ed768d4e0067ae19bf35c 100644 (file)
@@ -94,6 +94,8 @@
 #include <kern/cpu_data.h>
 #include <kern/policy_internal.h>
 
+#include <pthread/priority_private.h>
+
 #include <machine/machlimits.h>
 
 #include <vm/vm_map.h>
@@ -797,7 +799,7 @@ void ipc_kmsg_trace_send(ipc_kmsg_t kmsg,
         * Trailer contents
         */
        trailer = (mach_msg_trailer_t *)((vm_offset_t)msg +
-                                        (vm_offset_t)msg->msgh_size);
+                                        round_msg((vm_offset_t)msg->msgh_size));
        if (trailer->msgh_trailer_size <= sizeof(mach_msg_security_trailer_t)) {
                extern security_token_t KERNEL_SECURITY_TOKEN;
                mach_msg_security_trailer_t *strailer;
@@ -849,11 +851,10 @@ void ipc_kmsg_clean_partial(
        vm_size_t               length);
 
 mach_msg_return_t ipc_kmsg_copyin_body(
-       ipc_kmsg_t              kmsg,
-       ipc_space_t             space,
-       vm_map_t                map);
-
-extern int thread_qos_from_pthread_priority(unsigned long, unsigned long *);
+       ipc_kmsg_t          kmsg,
+       ipc_space_t         space,
+       vm_map_t            map,
+       mach_msg_option_t   *optionp);
 
 /*
  *     We keep a per-processor cache of kernel message buffers.
@@ -914,21 +915,6 @@ ipc_kmsg_alloc(
                max_expanded_size = IKM_SAVED_MSG_SIZE;         /* round up for ikm_cache */
 
        if (max_expanded_size == IKM_SAVED_MSG_SIZE) {
-               struct ikm_cache        *cache;
-               unsigned int            i;
-
-               disable_preemption();
-               cache = &PROCESSOR_DATA(current_processor(), ikm_cache);
-               if ((i = cache->avail) > 0) {
-                       assert(i <= IKM_STASH);
-                       kmsg = cache->entries[--i];
-                       cache->avail = i;
-                       enable_preemption();
-                       ikm_check_init(kmsg, max_expanded_size);
-                       ikm_set_header(kmsg, msg_and_trailer_size);
-                       return (kmsg);
-               }
-               enable_preemption();
                kmsg = (ipc_kmsg_t)zalloc(ipc_kmsg_zone);
        } else {
                kmsg = (ipc_kmsg_t)kalloc(ikm_plus_overhead(max_expanded_size));
@@ -986,22 +972,7 @@ ipc_kmsg_free(
                ip_release(port); /* May be last reference */
        }
 
-       /*
-        * Peek and see if it has to go back in the cache.
-        */
        if (kmsg->ikm_size == IKM_SAVED_MSG_SIZE) {
-               struct ikm_cache        *cache;
-               unsigned int            i;
-
-               disable_preemption();
-               cache = &PROCESSOR_DATA(current_processor(), ikm_cache);
-               if ((i = cache->avail) < IKM_STASH) {
-                       cache->entries[i] = kmsg;
-                       cache->avail = i + 1;
-                       enable_preemption();
-                       return;
-               }
-               enable_preemption();
                zfree(ipc_kmsg_zone, kmsg);
                return;
        }
@@ -1472,16 +1443,10 @@ ipc_kmsg_set_prealloc(
        assert(kmsg->ikm_prealloc == IP_NULL);
   
        kmsg->ikm_prealloc = IP_NULL;
-       /* take the mqueue lock since the sync qos is protected under it */
-       imq_lock(&port->ip_messages);
 
-       /* copy the sync qos values to kmsg */
-       for (int i = 0; i < THREAD_QOS_LAST; i++) {
-               kmsg->sync_qos[i] = port_sync_qos(port, i);
-       }
-       kmsg->special_port_qos = port_special_qos(port);
+       assert(port_send_turnstile(port) == TURNSTILE_NULL);
+       kmsg->ikm_turnstile = TURNSTILE_NULL;
        IP_SET_PREALLOC(port, kmsg);
-       imq_unlock(&port->ip_messages);
 }
 
 /*
@@ -1496,20 +1461,11 @@ ipc_kmsg_clear_prealloc(
        ipc_kmsg_t              kmsg,
        ipc_port_t              port)
 {
-       assert(kmsg->ikm_prealloc == port);
-  
-       kmsg->ikm_prealloc = IP_NULL;
-
-       /* take the mqueue lock since the sync qos is protected under it */
+       /* take the mqueue lock since the turnstile is protected under it */
        imq_lock(&port->ip_messages);
 
        IP_CLEAR_PREALLOC(port, kmsg);
-
-       /* copy the sync qos values from kmsg to port */
-       for (int i = 0; i < THREAD_QOS_LAST; i++) {
-               set_port_sync_qos(port, i, kmsg->sync_qos[i]);
-       }
-       set_port_special_qos(port, kmsg->special_port_qos);
+       set_port_send_turnstile(port, kmsg->ikm_turnstile);
        imq_unlock(&port->ip_messages);
 }
 
@@ -1783,8 +1739,6 @@ ipc_kmsg_get_from_kernel(
  *             MACH_SEND_INTERRUPTED   Caller still has message.
  *             MACH_SEND_INVALID_DEST  Caller still has message.
  */
-
-
 mach_msg_return_t
 ipc_kmsg_send(
        ipc_kmsg_t              kmsg,
@@ -1806,7 +1760,7 @@ ipc_kmsg_send(
        }
 
 #if IMPORTANCE_INHERITANCE
-       boolean_t did_importance = FALSE;
+       bool did_importance = false;
 #if IMPORTANCE_TRACE
        mach_msg_id_t imp_msgh_id = -1;
        int           sender_pid  = -1;
@@ -1840,6 +1794,17 @@ retry:
         if (MACH_NODE_VALID(kmsg->ikm_node) && FPORT_VALID(port->ip_messages.imq_fport))
             flipc_msg_ack(kmsg->ikm_node, &port->ip_messages, FALSE);
 #endif
+               if (did_importance) {
+                       /*
+                        * We're going to pretend we delivered this message
+                        * successfully, and just eat the kmsg. However, the
+                        * kmsg is actually visible via the importance_task!
+                        * We need to cleanup this linkage before we destroy
+                        * the message, and more importantly before we set the
+                        * msgh_remote_port to NULL. See: 34302571
+                        */
+                       ipc_importance_clean(kmsg);
+               }
                ip_release(port);  /* JMM - Future: release right, not just ref */
                kmsg->ikm_header->msgh_remote_port = MACH_PORT_NULL;
                ipc_kmsg_destroy(kmsg);
@@ -1885,8 +1850,8 @@ retry:
         * propagation.  That routine can drop the port lock temporarily.
         * If it does we'll have to revalidate the destination.
         */
-       if (did_importance == FALSE) {
-               did_importance = TRUE;
+       if (!did_importance) {
+               did_importance = true;
                if (ipc_importance_send(kmsg, option))
                        goto retry;
        }
@@ -1901,6 +1866,9 @@ retry:
                 * queue. Lock message queue while port is locked.
                 */
                imq_lock(&port->ip_messages);
+
+               set_ip_srp_msg_sent(port);
+
                ip_unlock(port);
 
                error = ipc_mqueue_send(&port->ip_messages, kmsg, option,
@@ -1908,7 +1876,7 @@ retry:
        }
 
 #if IMPORTANCE_INHERITANCE
-       if (did_importance == TRUE) {
+       if (did_importance) {
                __unused int importance_cleared = 0;
                switch (error) {
                        case MACH_SEND_TIMED_OUT:
@@ -2024,7 +1992,7 @@ ipc_kmsg_put(
                                                         kmsg->ikm_header->msgh_id);
 
 #if defined(__LP64__)
-       if (current_task() != kernel_task) { /* don't if receiver expects fully-cooked in-kernel msg; ux_exception */
+       if (current_task() != kernel_task) { /* don't if receiver expects fully-cooked in-kernel msg; */
                mach_msg_legacy_header_t *legacy_header = 
                        (mach_msg_legacy_header_t *)((vm_offset_t)(kmsg->ikm_header) + LEGACY_HEADER_SIZE_DELTA);
 
@@ -2106,7 +2074,20 @@ ipc_kmsg_put_to_kernel(
        ipc_kmsg_free(kmsg);
 }
 
-unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t propagation);
+static mach_msg_priority_t
+ipc_get_current_thread_priority(void)
+{
+       thread_t thread = current_thread();
+       thread_qos_t qos;
+       int relpri;
+
+       qos = thread_get_requested_qos(thread, &relpri);
+       if (!qos) {
+               qos = thread_user_promotion_qos_for_pri(thread->base_pri);
+               relpri = 0;
+       }
+       return (mach_msg_priority_t)_pthread_priority_make_from_thread_qos(qos, relpri, 0);
+}
 
 static kern_return_t
 ipc_kmsg_set_qos(
@@ -2115,24 +2096,23 @@ ipc_kmsg_set_qos(
        mach_msg_priority_t override)
 {
        kern_return_t kr;
-       unsigned long flags = 0;
        ipc_port_t special_reply_port = kmsg->ikm_header->msgh_local_port;
        ipc_port_t dest_port = kmsg->ikm_header->msgh_remote_port;
 
        kr = ipc_get_pthpriority_from_kmsg_voucher(kmsg, &kmsg->ikm_qos);
        if (kr != KERN_SUCCESS) {
-               kmsg->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED;
+               if (options & MACH_SEND_PROPAGATE_QOS) {
+                       kmsg->ikm_qos = ipc_get_current_thread_priority();
+               } else {
+                       kmsg->ikm_qos = MACH_MSG_PRIORITY_UNSPECIFIED;
+               }
        }
        kmsg->ikm_qos_override = kmsg->ikm_qos;
 
        if (options & MACH_SEND_OVERRIDE) {
-               unsigned long canonical;
-               mach_msg_priority_t canon;
-
-               canonical = pthread_priority_canonicalize(override, TRUE);
-               canon = (mach_msg_priority_t)canonical;
-               if (canon > kmsg->ikm_qos)
-                       kmsg->ikm_qos_override = canon;
+               pthread_priority_t pp = _pthread_priority_normalize_for_ipc(override);
+               if (pp > kmsg->ikm_qos)
+                       kmsg->ikm_qos_override = (mach_msg_priority_t)pp;
        }
 
        kr = KERN_SUCCESS;
@@ -2140,16 +2120,10 @@ ipc_kmsg_set_qos(
                if (IP_VALID(special_reply_port) &&
                    MACH_MSGH_BITS_LOCAL(kmsg->ikm_header->msgh_bits) == MACH_MSG_TYPE_PORT_SEND_ONCE) {
                        /*
-                        * Update the sync override count if the reply port is a special reply port,
-                        * link the destination port to special reply port and update the qos count
-                        * of destination port.
-                        *
-                        * Use the qos value passed by voucher and not the one passed by notify field.
+                        * Link the destination port to special reply port and make sure that
+                        * dest port has a send turnstile, else allocate one.
                         */
-                       kr = ipc_port_link_special_reply_port_with_qos(special_reply_port, dest_port,
-                               thread_qos_from_pthread_priority(kmsg->ikm_qos, &flags));
-               } else {
-                       kr = KERN_FAILURE;
+                       ipc_port_link_special_reply_port(special_reply_port, dest_port);
                }
        }
        return kr;
@@ -2243,6 +2217,9 @@ ipc_kmsg_copyin_header(
                    (voucher_type != MACH_MSG_TYPE_MOVE_SEND &&
                     voucher_type != MACH_MSG_TYPE_COPY_SEND)) {
                        is_write_unlock(space);
+                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                               mach_port_guard_exception(voucher_name, 0, 0, kGUARD_EXC_SEND_INVALID_VOUCHER);
+                       }
                        return MACH_SEND_INVALID_VOUCHER;
                }
 
@@ -2252,6 +2229,9 @@ ipc_kmsg_copyin_header(
                            (voucher_entry->ie_bits & MACH_PORT_TYPE_SEND) == 0 ||
                            io_kotype(voucher_entry->ie_object) != IKOT_VOUCHER) {
                                is_write_unlock(space);
+                               if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                                       mach_port_guard_exception(voucher_name, 0, 0, kGUARD_EXC_SEND_INVALID_VOUCHER);
+                               }
                                return MACH_SEND_INVALID_VOUCHER;
                        }
                } else {
@@ -2609,6 +2589,9 @@ invalid_reply:
        assert(voucher_port == IP_NULL);
        assert(voucher_soright == IP_NULL);
 
+       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+               mach_port_guard_exception(reply_name, 0, 0, kGUARD_EXC_SEND_INVALID_REPLY);
+       }
        return MACH_SEND_INVALID_REPLY;
 
 invalid_dest:
@@ -2627,23 +2610,26 @@ invalid_dest:
 }
 
 mach_msg_descriptor_t *ipc_kmsg_copyin_port_descriptor(
-        volatile mach_msg_port_descriptor_t *dsc,
-        mach_msg_legacy_port_descriptor_t *user_dsc,
-        ipc_space_t space,
-        ipc_object_t dest,
-        ipc_kmsg_t kmsg,
-        mach_msg_return_t *mr);
+       volatile mach_msg_port_descriptor_t *dsc,
+       mach_msg_legacy_port_descriptor_t *user_dsc,
+       ipc_space_t space,
+       ipc_object_t dest,
+       ipc_kmsg_t kmsg,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr);
 
 void ipc_print_type_name(
    int type_name);
+
 mach_msg_descriptor_t *
 ipc_kmsg_copyin_port_descriptor(
-        volatile mach_msg_port_descriptor_t *dsc,
-        mach_msg_legacy_port_descriptor_t *user_dsc_in,
-        ipc_space_t space,
-        ipc_object_t dest,
-        ipc_kmsg_t kmsg,
-        mach_msg_return_t *mr)
+       volatile mach_msg_port_descriptor_t *dsc,
+       mach_msg_legacy_port_descriptor_t *user_dsc_in,
+       ipc_space_t space,
+       ipc_object_t dest,
+       ipc_kmsg_t kmsg,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr)
 {
     volatile mach_msg_legacy_port_descriptor_t *user_dsc = user_dsc_in;
     mach_msg_type_name_t       user_disp;
@@ -2659,6 +2645,9 @@ ipc_kmsg_copyin_port_descriptor(
 
         kern_return_t kr = ipc_object_copyin(space, name, user_disp, &object);
         if (kr != KERN_SUCCESS) {
+                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT);
+                       }
             *mr = MACH_SEND_INVALID_RIGHT;
             return NULL;
         }
@@ -2681,24 +2670,27 @@ ipc_kmsg_copyin_port_descriptor(
 }
 
 mach_msg_descriptor_t * ipc_kmsg_copyin_ool_descriptor(
-        mach_msg_ool_descriptor_t *dsc,
-        mach_msg_descriptor_t *user_dsc,
-        int is_64bit,
-        vm_offset_t *paddr,
-        vm_map_copy_t *copy,
-        vm_size_t *space_needed,
-        vm_map_t map,
-        mach_msg_return_t *mr);
+       mach_msg_ool_descriptor_t *dsc,
+       mach_msg_descriptor_t *user_dsc,
+       int is_64bit,
+       vm_offset_t *paddr,
+       vm_map_copy_t *copy,
+       vm_size_t *space_needed,
+       vm_map_t map,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr);
+
 mach_msg_descriptor_t *
 ipc_kmsg_copyin_ool_descriptor(
-        mach_msg_ool_descriptor_t *dsc,
-        mach_msg_descriptor_t *user_dsc,
-        int is_64bit,
-        vm_offset_t *paddr,
-        vm_map_copy_t *copy,
-        vm_size_t *space_needed,
-        vm_map_t map,
-        mach_msg_return_t *mr)
+       mach_msg_ool_descriptor_t *dsc,
+       mach_msg_descriptor_t *user_dsc,
+       int is_64bit,
+       vm_offset_t *paddr,
+       vm_map_copy_t *copy,
+       vm_size_t *space_needed,
+       vm_map_t map,
+       __unused mach_msg_option_t *optionp,
+       mach_msg_return_t *mr)
 {
     vm_size_t                          length;
     boolean_t                          dealloc;
@@ -2794,36 +2786,39 @@ ipc_kmsg_copyin_ool_descriptor(
 }
 
 mach_msg_descriptor_t * ipc_kmsg_copyin_ool_ports_descriptor(
-        mach_msg_ool_ports_descriptor_t *dsc,
-        mach_msg_descriptor_t *user_dsc,
-        int is_64bit,
-        vm_map_t map,
-        ipc_space_t space,
-        ipc_object_t dest,
-        ipc_kmsg_t kmsg,
-        mach_msg_return_t *mr);
+       mach_msg_ool_ports_descriptor_t *dsc,
+       mach_msg_descriptor_t *user_dsc,
+       int is_64bit,
+       vm_map_t map,
+       ipc_space_t space,
+       ipc_object_t dest,
+       ipc_kmsg_t kmsg,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr);
+
 mach_msg_descriptor_t *
 ipc_kmsg_copyin_ool_ports_descriptor(
-        mach_msg_ool_ports_descriptor_t *dsc,
-        mach_msg_descriptor_t *user_dsc,
-        int is_64bit,
-        vm_map_t map,
-        ipc_space_t space,
-        ipc_object_t dest,
-        ipc_kmsg_t kmsg,
-        mach_msg_return_t *mr)
+       mach_msg_ool_ports_descriptor_t *dsc,
+       mach_msg_descriptor_t *user_dsc,
+       int is_64bit,
+       vm_map_t map,
+       ipc_space_t space,
+       ipc_object_t dest,
+       ipc_kmsg_t kmsg,
+       mach_msg_option_t *optionp,
+       mach_msg_return_t *mr)
 {
-    void                                       *data;
-    ipc_object_t                       *objects;
-    unsigned int                               i;
-    mach_vm_offset_t                           addr;
-    mach_msg_type_name_t               user_disp;
-    mach_msg_type_name_t               result_disp;
-    mach_msg_type_number_t                     count;
-    mach_msg_copy_options_t                    copy_option;
-    boolean_t                          deallocate;
-    mach_msg_descriptor_type_t      type;
-    vm_size_t                          ports_length, names_length;
+    void *data;
+    ipc_object_t *objects;
+    unsigned int i;
+    mach_vm_offset_t addr;
+    mach_msg_type_name_t user_disp;
+    mach_msg_type_name_t result_disp;
+    mach_msg_type_number_t count;
+    mach_msg_copy_options_t copy_option;
+    boolean_t deallocate;
+    mach_msg_descriptor_type_t type;
+    vm_size_t ports_length, names_length;
 
     if (is_64bit) {
         mach_msg_ool_ports_descriptor64_t *user_ool_dsc = (typeof(user_ool_dsc))user_dsc;
@@ -2928,6 +2923,9 @@ ipc_kmsg_copyin_ool_ports_descriptor(
             }
             kfree(data, ports_length);
             dsc->address = NULL;
+                       if ((*optionp & MACH_SEND_KERNEL) == 0) {
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_SEND_INVALID_RIGHT);
+                       }
             *mr = MACH_SEND_INVALID_RIGHT;
             return NULL;
         }
@@ -2970,7 +2968,8 @@ mach_msg_return_t
 ipc_kmsg_copyin_body(
        ipc_kmsg_t      kmsg,
        ipc_space_t     space,
-       vm_map_t        map)
+       vm_map_t    map,
+       mach_msg_option_t *optionp)
 {
     ipc_object_t                       dest;
     mach_msg_body_t            *body;
@@ -3144,20 +3143,20 @@ ipc_kmsg_copyin_body(
         switch (user_addr->type.type) {
             case MACH_MSG_PORT_DESCRIPTOR:
                 user_addr = ipc_kmsg_copyin_port_descriptor((mach_msg_port_descriptor_t *)kern_addr, 
-                        (mach_msg_legacy_port_descriptor_t *)user_addr, space, dest, kmsg, &mr);
+                                           (mach_msg_legacy_port_descriptor_t *)user_addr, space, dest, kmsg, optionp, &mr);
                 kern_addr++;
                 complex = TRUE;
                 break;
             case MACH_MSG_OOL_VOLATILE_DESCRIPTOR:
             case MACH_MSG_OOL_DESCRIPTOR: 
                 user_addr = ipc_kmsg_copyin_ool_descriptor((mach_msg_ool_descriptor_t *)kern_addr, 
-                        user_addr, is_task_64bit, &paddr, &copy, &space_needed, map, &mr);
+                                           user_addr, is_task_64bit, &paddr, &copy, &space_needed, map, optionp, &mr);
                 kern_addr++;
                 complex = TRUE;
                 break;
             case MACH_MSG_OOL_PORTS_DESCRIPTOR: 
                 user_addr = ipc_kmsg_copyin_ool_ports_descriptor((mach_msg_ool_ports_descriptor_t *)kern_addr, 
-                        user_addr, is_task_64bit, map, space, dest, kmsg, &mr);
+                                           user_addr, is_task_64bit, map, space, dest, kmsg, optionp, &mr);
                 kern_addr++;
                 complex = TRUE;
                 break;
@@ -3248,7 +3247,7 @@ ipc_kmsg_copyin(
     if ((kmsg->ikm_header->msgh_bits & MACH_MSGH_BITS_COMPLEX) == 0)
        return MACH_MSG_SUCCESS;
     
-       mr = ipc_kmsg_copyin_body( kmsg, space, map);
+       mr = ipc_kmsg_copyin_body( kmsg, space, map, optionp);
 
        /* unreachable if !DEBUG */
        __unreachable_ok_push
@@ -4823,7 +4822,7 @@ ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space __unused,
 
        trailer->msgh_seqno = seqno;
        trailer->msgh_context = context;
-       trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option);
+       trailer->msgh_trailer_size = REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(thread), option);
 
        if (minimal_trailer) { 
                goto done;
@@ -4846,7 +4845,7 @@ ipc_kmsg_add_trailer(ipc_kmsg_t kmsg, ipc_space_t space __unused,
 
 done:
 #ifdef __arm64__
-       ipc_kmsg_munge_trailer(trailer, real_trailer_out, thread_is_64bit(thread));
+       ipc_kmsg_munge_trailer(trailer, real_trailer_out, thread_is_64bit_addr(thread));
 #endif /* __arm64__ */
 
        return trailer->msgh_trailer_size;
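A pattern repeated through the copyin hunks above: each failure path now pairs its error return with mach_port_guard_exception(), but only when MACH_SEND_KERNEL is clear, so kernel-originated sends fail quietly while user sends take a guard exception. The condensed sketch below captures just that gate; the MOCK_* constants and copyin_right() are stand-ins, not Mach definitions.

#include <stdbool.h>
#include <stdio.h>

#define MOCK_SEND_KERNEL        0x1   /* stand-in for MACH_SEND_KERNEL */
#define MOCK_SEND_INVALID_RIGHT 5     /* stand-in error code */

static void mock_guard_exception(unsigned name, int code) {
    printf("guard exception: name=0x%x code=%d\n", name, code);
}

/* Fail the copyin; raise a guard exception only for user-originated sends. */
static int copyin_right(unsigned name, bool valid, unsigned options) {
    if (!valid) {
        if ((options & MOCK_SEND_KERNEL) == 0)
            mock_guard_exception(name, MOCK_SEND_INVALID_RIGHT);
        return MOCK_SEND_INVALID_RIGHT;   /* caller still owns the message */
    }
    return 0;
}

int main(void) {
    copyin_right(0x1234, false, 0);                 /* user send: guard exception fires */
    copyin_right(0x1234, false, MOCK_SEND_KERNEL);  /* kernel send: quiet failure */
    return 0;
}
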
index f7ff4059cddd146b11d66893664a86dcbbd63cd6..f5598615f557d06f84f9f0aafb934a376c74f61a 100644 (file)
@@ -108,8 +108,7 @@ struct ipc_kmsg {
        mach_msg_priority_t        ikm_qos_override; /* qos override on this kmsg */
        struct ipc_importance_elem *ikm_importance;  /* inherited from */
        queue_chain_t              ikm_inheritance;  /* inherited from link */
-       sync_qos_count_t sync_qos[THREAD_QOS_LAST];  /* sync qos counters for ikm_prealloc port */
-       sync_qos_count_t special_port_qos;           /* special port qos for ikm_prealloc port */
+       struct turnstile           *ikm_turnstile;   /* send turnstile for ikm_prealloc port */
 #if MACH_FLIPC
        struct mach_node           *ikm_node;        /* Originating node - needed for ack */
 #endif
index 38af0db4cc330f9c33ccc9b328e148f4c23282dd..685950c902f9f5f95642eb228e06bc0e8597c691 100644 (file)
@@ -120,15 +120,14 @@ static void ipc_mqueue_peek_on_thread(
 void
 ipc_mqueue_init(
        ipc_mqueue_t    mqueue,
-       boolean_t       is_set,
-       uint64_t        *reserved_link)
+       boolean_t       is_set)
 {
        if (is_set) {
                waitq_set_init(&mqueue->imq_set_queue,
                               SYNC_POLICY_FIFO|SYNC_POLICY_PREPOST,
-                              reserved_link, NULL);
+                              NULL, NULL);
        } else {
-               waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO);
+               waitq_init(&mqueue->imq_wait_queue, SYNC_POLICY_FIFO | SYNC_POLICY_PORT);
                ipc_kmsg_queue_init(&mqueue->imq_messages);
                mqueue->imq_seqno = 0;
                mqueue->imq_msgcount = 0;
@@ -298,6 +297,7 @@ ipc_mqueue_add(
        kern_return_t    kr;
 
        assert(reserved_link && *reserved_link != 0);
+       assert(waitqs_is_linked(set_waitq));
 
        imq_lock(port_mqueue);
 
@@ -371,7 +371,7 @@ ipc_mqueue_add(
                         */
                        msize = ipc_kmsg_copyout_size(kmsg, th->map);
                        if (th->ith_rsize <
-                                       (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(th), th->ith_option))) {
+                                       (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(th), th->ith_option))) {
                                th->ith_state = MACH_RCV_TOO_LARGE;
                                th->ith_msize = msize;
                                if (th->ith_option & MACH_RCV_LARGE) {
@@ -427,8 +427,25 @@ void
 ipc_mqueue_changed(
        ipc_mqueue_t            mqueue)
 {
-       /* Indicate that this message queue is vanishing */
-       knote_vanish(&mqueue->imq_klist);
+       if (IMQ_KLIST_VALID(mqueue)) {
+               /*
+                * Indicate that this message queue is vanishing
+                *
+                * When this is called, the associated receive right may be in flight
+                * between two tasks: the one it used to live in, and the one that armed
+                * a port destroyed notification for it.
+                *
+                * The new process may want to register the port it gets back with an
+                * EVFILT_MACHPORT filter again, and may already have sync IPC pending
+                * on this port, in which case we want the imq_klist field to be
+                * reusable for nefarious purposes (see IMQ_SET_INHERITOR).
+                *
+                * Fortunately, we really don't need this linkage anymore after this
+                * point as EV_VANISHED / EV_EOF will be the last thing delivered ever.
+                */
+               knote_vanish(&mqueue->imq_klist);
+               klist_init(&mqueue->imq_klist);
+       }
 
        waitq_wakeup64_all_locked(&mqueue->imq_wait_queue,
                                  IPC_MQUEUE_RECEIVE,
@@ -439,13 +456,13 @@ ipc_mqueue_changed(
 }
 
 
-               
+
 
 /*
  *     Routine:        ipc_mqueue_send
  *     Purpose:
  *             Send a message to a message queue.  The message holds a reference
- *             for the destination port for this message queue in the 
+ *             for the destination port for this message queue in the
  *             msgh_remote_port field.
  *
  *             If unsuccessful, the caller still has possession of
@@ -474,7 +491,7 @@ ipc_mqueue_send(
         *      3) Message is sent to a send-once right.
         */
        if (!imq_full(mqueue) ||
-           (!imq_full_kernel(mqueue) && 
+           (!imq_full_kernel(mqueue) &&
             ((option & MACH_SEND_ALWAYS) ||
              (MACH_MSGH_BITS_REMOTE(kmsg->ikm_header->msgh_bits) ==
               MACH_MSG_TYPE_PORT_SEND_ONCE)))) {
@@ -483,9 +500,12 @@ ipc_mqueue_send(
                imq_unlock(mqueue);
        } else {
                thread_t cur_thread = current_thread();
+               ipc_port_t port = ip_from_mq(mqueue);
+               struct turnstile *send_turnstile = TURNSTILE_NULL;
+               turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
                uint64_t deadline;
 
-               /* 
+               /*
                 * We have to wait for space to be granted to us.
                 */
                if ((option & MACH_SEND_TIMEOUT) && (send_timeout == 0)) {
@@ -504,38 +524,65 @@ ipc_mqueue_send(
                        deadline = 0;
 
                thread_set_pending_block_hint(cur_thread, kThreadWaitPortSend);
-               wresult = waitq_assert_wait64_locked(
-                                               &mqueue->imq_wait_queue,
-                                               IPC_MQUEUE_FULL,
-                                               THREAD_ABORTSAFE,
-                                               TIMEOUT_URGENCY_USER_NORMAL,
-                                               deadline, TIMEOUT_NO_LEEWAY,
-                                               cur_thread);
+
+               send_turnstile = turnstile_prepare((uintptr_t)port,
+                       port_send_turnstile_address(port),
+                       TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
+
+               /* Check if the port is in transit, get the destination port's turnstile */
+               if (ip_active(port) &&
+                   port->ip_receiver_name == MACH_PORT_NULL &&
+                   port->ip_destination != NULL) {
+                       inheritor = port_send_turnstile(port->ip_destination);
+               } else {
+                       inheritor = ipc_port_get_inheritor(port);
+               }
+
+               turnstile_update_inheritor(send_turnstile, inheritor,
+                               TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
+
+               wresult = waitq_assert_wait64_leeway(
+                                       &send_turnstile->ts_waitq,
+                                       IPC_MQUEUE_FULL,
+                                       THREAD_ABORTSAFE,
+                                       TIMEOUT_URGENCY_USER_NORMAL,
+                                       deadline,
+                                       TIMEOUT_NO_LEEWAY);
 
                imq_unlock(mqueue);
-               
+               turnstile_update_inheritor_complete(send_turnstile,
+                               TURNSTILE_INTERLOCK_NOT_HELD);
+
                if (wresult == THREAD_WAITING) {
                        wresult = thread_block(THREAD_CONTINUE_NULL);
                        counter(c_ipc_mqueue_send_block++);
                }
-               
+
+               /* Call turnstile complete with interlock held */
+               imq_lock(mqueue);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
+               imq_unlock(mqueue);
+
+               /* Call cleanup after dropping the interlock */
+               turnstile_cleanup();
+
                switch (wresult) {
 
                case THREAD_AWAKENED:
-                       /* 
+                       /*
                         * we can proceed - inherited msgcount from waker
                         * or the message queue has been destroyed and the msgcount
                         * has been reset to zero (will detect in ipc_mqueue_post()).
                         */
                        break;
-                       
+
                case THREAD_TIMED_OUT:
                        assert(option & MACH_SEND_TIMEOUT);
                        return MACH_SEND_TIMED_OUT;
-                       
+
                case THREAD_INTERRUPTED:
                        return MACH_SEND_INTERRUPTED;
-                       
+
                case THREAD_RESTART:
                        /* mqueue is being destroyed */
                        return MACH_SEND_INVALID_DEST;
@@ -569,12 +616,14 @@ extern void ipc_mqueue_override_send(
        imq_lock(mqueue);
        assert(imq_valid(mqueue));
        assert(!imq_is_set(mqueue));
-       
+
        if (imq_full(mqueue)) {
                ipc_kmsg_t first = ipc_kmsg_queue_first(&mqueue->imq_messages);
 
-               if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override))
-                       KNOTE(&mqueue->imq_klist, 0);
+               if (first && ipc_kmsg_override_qos(&mqueue->imq_messages, first, override)) {
+                       if (IMQ_KLIST_VALID(mqueue))
+                               KNOTE(&mqueue->imq_klist, 0);
+               }
                if (!first)
                        full_queue_empty = TRUE;
        }
@@ -608,26 +657,32 @@ extern void ipc_mqueue_override_send(
 void
 ipc_mqueue_release_msgcount(ipc_mqueue_t port_mq, ipc_mqueue_t set_mq)
 {
+       struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(port_mq));
        (void)set_mq;
        assert(imq_held(port_mq));
        assert(port_mq->imq_msgcount > 1 || ipc_kmsg_queue_empty(&port_mq->imq_messages));
 
        port_mq->imq_msgcount--;
 
-       if (!imq_full(port_mq) && port_mq->imq_fullwaiters) {
+       if (!imq_full(port_mq) && port_mq->imq_fullwaiters &&
+               send_turnstile != TURNSTILE_NULL) {
                /*
                 * boost the priority of the awoken thread
                 * (WAITQ_PROMOTE_PRIORITY) to ensure it uses
                 * the message queue slot we've just reserved.
                 *
                 * NOTE: this will never prepost
+                *
+                * The wakeup happens on a turnstile waitq
+                * which will wakeup the highest priority waiter.
+                * A potential downside of this would be starving low
+                * priority senders if there is a constant churn of
+                * high priority threads trying to send to this port.
                 */
-               if (waitq_wakeup64_one_locked(&port_mq->imq_wait_queue,
+               if (waitq_wakeup64_one(&send_turnstile->ts_waitq,
                                              IPC_MQUEUE_FULL,
                                              THREAD_AWAKENED,
-                                             NULL,
-                                             WAITQ_PROMOTE_PRIORITY,
-                                             WAITQ_KEEP_LOCKED) != KERN_SUCCESS) {
+                                             WAITQ_PROMOTE_PRIORITY) != KERN_SUCCESS) {
                        port_mq->imq_fullwaiters = FALSE;
                } else {
                        /* gave away our slot - add reference back */
@@ -694,24 +749,26 @@ ipc_mqueue_post(
 
                if (receiver == THREAD_NULL) {
 
-                       /* 
+                       /*
                         * no receivers; queue kmsg if space still reserved
                         * Reservations are cancelled when the port goes inactive.
                         * note that this will enqueue the message for any
-                        * "peeking" receivers. 
+                        * "peeking" receivers.
                         *
                         * Also, post the knote to wake up any threads waiting
                         * on that style of interface if this insertion is of
                         * note (first insertion, or adjusted override qos all
                         * the way to the head of the queue).
-                        * 
+                        *
                         * This is just for ports. portset knotes are stay-active,
                         * and their threads get awakened through the !MACH_RCV_IN_PROGRESS
                         * logic below).
                         */
                        if (mqueue->imq_msgcount > 0) {
-                               if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg))
-                                       KNOTE(&mqueue->imq_klist, 0);
+                               if (ipc_kmsg_enqueue_qos(&mqueue->imq_messages, kmsg)) {
+                                       if (IMQ_KLIST_VALID(mqueue))
+                                               KNOTE(&mqueue->imq_klist, 0);
+                               }
                                break;
                        }
 
@@ -722,7 +779,7 @@ ipc_mqueue_post(
                        destroy_msg = TRUE;
                        goto out_unlock;
                }
-       
+
                /*
                 * If a thread is attempting a "peek" into the message queue
                 * (MACH_PEEK_IN_PROGRESS), then we enqueue the message and set the
@@ -753,7 +810,7 @@ ipc_mqueue_post(
                        continue;
                }
 
-       
+
                /*
                 * We found a waiting thread.
                 * If the message is too large or the scatter list is too small
@@ -761,7 +818,7 @@ ipc_mqueue_post(
                 */
                msize = ipc_kmsg_copyout_size(kmsg, receiver->map);
                if (receiver->ith_rsize <
-                               (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(receiver), receiver->ith_option))) {
+                               (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(receiver), receiver->ith_option))) {
                        receiver->ith_msize = msize;
                        receiver->ith_state = MACH_RCV_TOO_LARGE;
                } else {
@@ -924,7 +981,7 @@ ipc_mqueue_receive(
                return;
 
        if (wresult == THREAD_WAITING) {
-               counter((interruptible == THREAD_ABORTSAFE) ? 
+               counter((interruptible == THREAD_ABORTSAFE) ?
                        c_ipc_mqueue_receive_block_user++ :
                        c_ipc_mqueue_receive_block_kernel++);
 
@@ -986,6 +1043,8 @@ ipc_mqueue_receive_on_thread(
 {
        wait_result_t           wresult;
        uint64_t                deadline;
+       struct turnstile        *rcv_turnstile = TURNSTILE_NULL;
+       turnstile_inheritor_t   inheritor = NULL;
 
        /* called with mqueue locked */
 
@@ -1001,7 +1060,7 @@ ipc_mqueue_receive_on_thread(
                 */
                return THREAD_RESTART;
        }
-       
+
        if (imq_is_set(mqueue)) {
                ipc_mqueue_t port_mq = IMQ_NULL;
 
@@ -1040,7 +1099,7 @@ ipc_mqueue_receive_on_thread(
                /*
                 * Receive on a single port. Just try to get the messages.
                 */
-               kmsgs = &mqueue->imq_messages;
+               kmsgs = &mqueue->imq_messages;
                if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL) {
                        if (option & MACH_PEEK_MSG)
                                ipc_mqueue_peek_on_thread(mqueue, option, thread);
@@ -1054,7 +1113,7 @@ ipc_mqueue_receive_on_thread(
                panic("Unknown mqueue type 0x%x: likely memory corruption!\n",
                      mqueue->imq_wait_queue.waitq_type);
        }
-       
+
        /*
         * Looks like we'll have to block.  The mqueue we will
         * block on (whether the set's or the local port's) is
@@ -1082,6 +1141,37 @@ ipc_mqueue_receive_on_thread(
        else
                deadline = 0;
 
+       /*
+        * Threads waiting on a port (not a portset)
+        * wait on the port's receive turnstile.
+        * Donate the waiting thread's turnstile and
+        * set up the inheritor for the special reply port.
+        * Based on the state of the special reply
+        * port, the inheritor is either the send
+        * turnstile of the connection port on which
+        * the sync ipc send will happen, or the
+        * workloop's turnstile that will reply to
+        * the sync ipc message.
+        *
+        * Pass the mqueue waitq to waitq_assert_wait to
+        * support port-set wakeup. The port's mqueue waitq
+        * is converted to a turnstile waitq in
+        * waitq_assert_wait instead of the global waitqs.
+        */
+       if (imq_is_queue(mqueue)) {
+               ipc_port_t port = ip_from_mq(mqueue);
+               rcv_turnstile = turnstile_prepare((uintptr_t)port,
+                       port_rcv_turnstile_address(port),
+                       TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
+
+               if (port->ip_specialreply) {
+                       inheritor = ipc_port_get_special_reply_port_inheritor(port);
+               }
+
+               turnstile_update_inheritor(rcv_turnstile, inheritor,
+                       (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_DELAYED_UPDATE));
+       }
+
        thread_set_pending_block_hint(thread, kThreadWaitPortReceive);
        wresult = waitq_assert_wait64_locked(&mqueue->imq_wait_queue,
                                             IPC_MQUEUE_RECEIVE,
@@ -1096,6 +1186,12 @@ ipc_mqueue_receive_on_thread(
 
        imq_unlock(mqueue);
 
+       /* Check if it's a port mqueue and if it needs to call turnstile_update_inheritor_complete */
+       if (rcv_turnstile != TURNSTILE_NULL) {
+               turnstile_update_inheritor_complete(rcv_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+       }
+       /* It's the caller's responsibility to call turnstile_complete to get the turnstile back */
+
        return wresult;
 }
 
@@ -1175,7 +1271,7 @@ ipc_mqueue_select_on_thread(
         * (and size needed).
         */
        msize = ipc_kmsg_copyout_size(kmsg, thread->map);
-       if (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit(thread), option) > max_size) {
+       if (msize + REQUESTED_TRAILER_SIZE(thread_is_64bit_addr(thread), option) > max_size) {
                mr = MACH_RCV_TOO_LARGE;
                if (option & MACH_RCV_LARGE) {
                        thread->ith_receiver_name = port_mq->imq_receiver_name;
@@ -1236,7 +1332,7 @@ ipc_mqueue_peek_locked(ipc_mqueue_t mq,
        if (seqno == 0) {
                seqno = mq->imq_seqno;
                msgoff = 0;
-       } else if (seqno >= mq->imq_seqno && 
+       } else if (seqno >= mq->imq_seqno &&
                   seqno < mq->imq_seqno + mq->imq_msgcount) {
                msgoff = seqno - mq->imq_seqno;
        } else
@@ -1259,7 +1355,7 @@ ipc_mqueue_peek_locked(ipc_mqueue_t mq,
        if (msg_idp != NULL)
                *msg_idp = kmsg->ikm_header->msgh_id;
        if (msg_trailerp != NULL)
-               memcpy(msg_trailerp, 
+               memcpy(msg_trailerp,
                       (mach_msg_max_trailer_t *)((vm_offset_t)kmsg->ikm_header +
                                                  round_msg(kmsg->ikm_header->msgh_size)),
                       sizeof(mach_msg_max_trailer_t));
@@ -1353,7 +1449,7 @@ static int mqueue_peek_iterator(void *ctx, struct waitq *waitq,
 
        (void)ctx;
        (void)wqset;
-               
+
        if (ipc_kmsg_queue_first(kmsgs) != IKM_NULL)
                return WQ_ITERATE_BREAK; /* break out of the prepost iteration */
 
@@ -1482,6 +1578,7 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue)
        ipc_kmsg_queue_t kmqueue;
        ipc_kmsg_t kmsg;
        boolean_t reap = FALSE;
+       struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(mqueue));
 
        assert(!imq_is_set(mqueue));
 
@@ -1491,12 +1588,13 @@ ipc_mqueue_destroy_locked(ipc_mqueue_t mqueue)
         *      (never preposts)
         */
        mqueue->imq_fullwaiters = FALSE;
-       waitq_wakeup64_all_locked(&mqueue->imq_wait_queue,
-                                 IPC_MQUEUE_FULL,
-                                 THREAD_RESTART,
-                                 NULL,
-                                 WAITQ_ALL_PRIORITIES,
-                                 WAITQ_KEEP_LOCKED);
+
+       if (send_turnstile != TURNSTILE_NULL) {
+               waitq_wakeup64_all(&send_turnstile->ts_waitq,
+                                  IPC_MQUEUE_FULL,
+                                  THREAD_RESTART,
+                                  WAITQ_ALL_PRIORITIES);
+       }
 
        /*
         * Move messages from the specified queue to the per-thread
@@ -1559,6 +1657,7 @@ ipc_mqueue_set_qlimit(
         imq_lock(mqueue);
         if (qlimit > mqueue->imq_qlimit) {
                 mach_port_msgcount_t i, wakeup;
+                struct turnstile *send_turnstile = port_send_turnstile(ip_from_mq(mqueue));
 
                 /* caution: wakeup, qlimit are unsigned */
                 wakeup = qlimit - mqueue->imq_qlimit;
@@ -1571,12 +1670,11 @@ ipc_mqueue_set_qlimit(
                         *
                         * NOTE: this will never prepost
                         */
-                       if (waitq_wakeup64_one_locked(&mqueue->imq_wait_queue,
-                                                     IPC_MQUEUE_FULL,
-                                                     THREAD_AWAKENED,
-                                                     NULL,
-                                                     WAITQ_PROMOTE_PRIORITY,
-                                                     WAITQ_KEEP_LOCKED) == KERN_NOT_WAITING) {
+                       if (send_turnstile == TURNSTILE_NULL ||
+                           waitq_wakeup64_one(&send_turnstile->ts_waitq,
+                                              IPC_MQUEUE_FULL,
+                                              THREAD_AWAKENED,
+                                              WAITQ_PROMOTE_PRIORITY) == KERN_NOT_WAITING) {
                                mqueue->imq_fullwaiters = FALSE;
                                break;
                        }
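With senders now blocking on the port's send turnstile waitq instead of the mqueue waitq, the wakeups in ipc_mqueue_release_msgcount() and ipc_mqueue_set_qlimit() above hand the reserved slot to the highest priority waiter, with the starvation caveat spelled out in the comment. The toy model below shows only that selection policy, with invented types; it is not kernel code.

#include <stdio.h>

struct waiter { const char *name; int pri; };

/* Turnstile-style selection: hand the slot to the highest priority waiter. */
static int pick_highest(const struct waiter *w, int n) {
    int best = 0;
    for (int i = 1; i < n; i++)
        if (w[i].pri > w[best].pri)
            best = i;
    return best;
}

int main(void) {
    struct waiter blocked[] = {
        { "background sender",  4 },
        { "UI sender",         47 },
        { "default sender",    31 },
    };
    /* A FIFO waitq would wake index 0; the turnstile policy wakes the UI sender. */
    printf("woken: %s\n", blocked[pick_highest(blocked, 3)].name);
    return 0;
}
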
index af0a534a4027ded7a31a38a21154c694f5e54d4f..05d952ef32e7300cb37a1b9a04ed40ec8f3c8a1a 100644 (file)
@@ -86,7 +86,7 @@ typedef struct ipc_mqueue {
                struct {
                        struct  waitq           waitq;
                        struct ipc_kmsg_queue   messages;
-                       mach_port_seqno_t       seqno;
+                       mach_port_seqno_t       seqno;
                        mach_port_name_t        receiver_name;
                        uint16_t                msgcount;
                        uint16_t                qlimit;
@@ -98,11 +98,31 @@ typedef struct ipc_mqueue {
                        struct waitq_set        setq;
                } pset;
        } data;
-       struct klist imq_klist;
+       union {
+               struct klist imq_klist;
+               uintptr_t imq_inheritor;
+       };
 } *ipc_mqueue_t;
 
 #define        IMQ_NULL                ((ipc_mqueue_t) 0)
 
+/*
+ * When a receive right is in flight, before it can ever be registered with
+ * a new knote, its imq_klist field can be overloaded to hold a pointer
+ * to the knote that the port is pushing on through its turnstile.
+ *
+ * If IMQ_KLIST_VALID() returns true, the imq_klist field can be used;
+ * otherwise IMQ_INHERITOR() returns the pointer to the knote that is
+ * currently the port's turnstile inheritor.
+ */
+#define IMQ_KLIST_VALID(imq) (((imq)->imq_inheritor & 1) == 0)
+#define IMQ_INHERITOR(imq) ((struct turnstile *)((imq)->imq_inheritor ^ 1))
+#define IMQ_SET_INHERITOR(imq, inheritor) \
+MACRO_BEGIN                                                                   \
+               assert(((imq)->imq_inheritor & 1) || SLIST_EMPTY(&(imq)->imq_klist)); \
+               ((imq)->imq_inheritor = (uintptr_t)(inheritor) | 1);                  \
+MACRO_END
+
 #define imq_wait_queue         data.port.waitq
 #define imq_messages           data.port.messages
 #define imq_msgcount           data.port.msgcount
@@ -141,11 +161,11 @@ typedef struct ipc_mqueue {
 #define        imq_from_waitq(waitq)   (waitq_is_set(waitq) ? \
                                        ((struct ipc_mqueue *)((void *)( \
                                                (uintptr_t)(waitq) - \
-                                               __offsetof(struct ipc_mqueue, imq_wait_queue)) \
+                                               __offsetof(struct ipc_mqueue, imq_set_queue)) \
                                        )) : \
                                        ((struct ipc_mqueue *)((void *)( \
                                                (uintptr_t)(waitq) - \
-                                               __offsetof(struct ipc_mqueue, imq_set_queue)) \
+                                               __offsetof(struct ipc_mqueue, imq_wait_queue)) \
                                        )) \
                                 )
 
@@ -171,8 +191,7 @@ extern int ipc_mqueue_full;
 /* Initialize a newly-allocated message queue */
 extern void ipc_mqueue_init(
        ipc_mqueue_t            mqueue,
-       boolean_t               is_set,
-       uint64_t                *reserved_link);
+       boolean_t               is_set);
 
 /* de-initialize / cleanup an mqueue (specifically waitq resources) */
 extern void ipc_mqueue_deinit(
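The IMQ_KLIST_VALID / IMQ_INHERITOR / IMQ_SET_INHERITOR macros above overload a single pointer-sized union field by tagging its low bit, relying on pointer alignment to keep that bit clear whenever the field really holds a klist. The self-contained sketch below reproduces the trick with mock types (mock_klist, mock_turnstile); it mirrors the macros shown above but is not the kernel's code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct mock_klist { void *head; };
struct mock_turnstile { int id; };

union overloaded_field {
    struct mock_klist klist;   /* valid while bit 0 of the word is clear */
    uintptr_t inheritor;       /* tagged pointer while bit 0 is set */
};

#define FIELD_KLIST_VALID(f)      (((f)->inheritor & 1) == 0)
#define FIELD_INHERITOR(f)        ((struct mock_turnstile *)((f)->inheritor ^ 1))
#define FIELD_SET_INHERITOR(f, t) ((f)->inheritor = (uintptr_t)(t) | 1)

int main(void) {
    static struct mock_turnstile ts = { .id = 7 };
    union overloaded_field f = { .klist = { .head = NULL } };

    assert(FIELD_KLIST_VALID(&f));   /* empty klist: field usable as a klist */

    FIELD_SET_INHERITOR(&f, &ts);    /* overload it while the right is in flight */
    assert(!FIELD_KLIST_VALID(&f));
    printf("inheritor id: %d\n", FIELD_INHERITOR(&f)->id);
    return 0;
}
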
index d1f50d5c718df879ebd53de158cc1005b2e55d67..44e6ed6d9ba3e0564ff43a344ac0992919c00b4c 100644 (file)
@@ -158,7 +158,7 @@ void
 ipc_notify_send_once(
        ipc_port_t      port)
 {
-       ipc_port_unlink_special_reply_port(port, IPC_PORT_UNLINK_SR_NONE);
+       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE);
 
        (void)mach_notify_send_once(port);
        /* send-once right consumed */
index 5ff293fca76b487e6789983ba7ccbdb0462b97e3..6b40e4761b0b27efe525053ce0918c101d7cfc73 100644 (file)
@@ -202,11 +202,13 @@ ipc_object_translate_two(
 
        if ((entry1->ie_bits & MACH_PORT_TYPE(right1)) == MACH_PORT_TYPE_NONE) {
                is_read_unlock(space);
+               mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                return KERN_INVALID_RIGHT;
        }
 
        if ((entry2->ie_bits & MACH_PORT_TYPE(right2)) == MACH_PORT_TYPE_NONE) {
                is_read_unlock(space);
+               mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                return KERN_INVALID_RIGHT;
        }
 
@@ -576,6 +578,7 @@ ipc_object_copyin_from_kernel(
                ipc_port_t port = (ipc_port_t) object;
 
                ip_lock(port);
+               imq_lock(&port->ip_messages);
                assert(ip_active(port));
                if (port->ip_destination != IP_NULL) {
                        assert(port->ip_receiver == ipc_space_kernel);
@@ -586,6 +589,7 @@ ipc_object_copyin_from_kernel(
                        port->ip_receiver_name = MACH_PORT_NULL;
                        port->ip_destination = IP_NULL;
                }
+               imq_unlock(&port->ip_messages);
                ip_unlock(port);
                break;
            }
@@ -748,6 +752,7 @@ ipc_object_copyout(
        boolean_t               overflow,
        mach_port_name_t        *namep)
 {
+       struct knote *kn = current_thread()->ith_knote;
        mach_port_name_t name;
        ipc_entry_t entry;
        kern_return_t kr;
@@ -755,6 +760,11 @@ ipc_object_copyout(
        assert(IO_VALID(object));
        assert(io_otype(object) == IOT_PORT);
 
+       if (ITH_KNOTE_VALID(kn, msgt_name)) {
+               filt_machport_turnstile_prepare_lazily(kn,
+                               msgt_name, (ipc_port_t)object);
+       }
+
        is_write_lock(space);
 
        for (;;) {
@@ -842,6 +852,7 @@ ipc_object_copyout_name(
        ipc_entry_t oentry;
        ipc_entry_t entry;
        kern_return_t kr;
+       struct knote *kn = current_thread()->ith_knote;
 
 #if IMPORTANCE_INHERITANCE
        int assertcnt = 0;
@@ -851,6 +862,11 @@ ipc_object_copyout_name(
        assert(IO_VALID(object));
        assert(io_otype(object) == IOT_PORT);
 
+       if (ITH_KNOTE_VALID(kn, msgt_name)) {
+               filt_machport_turnstile_prepare_lazily(kn,
+                               msgt_name, (ipc_port_t)object);
+       }
+
        kr = ipc_entry_alloc_name(space, name, &entry);
        if (kr != KERN_SUCCESS)
                return kr;
index 6aaf285a64678ec20560b21075a01b33210800b5..17e5abc0240c4a0c9d2682f1eb0d78368ded99fc 100644 (file)
@@ -236,7 +236,7 @@ io_release(ipc_object_t io) {
        }
 }
 
-/*   
+/*
  * Retrieve a label for use in a kernel call that takes a security
  * label as a parameter. If necessary, io_getlabel acquires internal
  * (not io_lock) locks, and io_unlocklabel releases them.
index 871f98f49f5685e75ae612d709a3b05515b37059..823abe3a71663f9c73d13c80d903e5bf3a675a9e 100644 (file)
@@ -92,6 +92,7 @@
 #include <ipc/ipc_table.h>
 #include <ipc/ipc_importance.h>
 #include <machine/machlimits.h>
+#include <kern/turnstile.h>
 
 #include <security/mac_mach_internal.h>
 
@@ -638,10 +639,14 @@ ipc_port_init(
        port->ip_impcount    = 0;
 
        port->ip_specialreply = 0;
-       port->ip_link_sync_qos = 0;
+       port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
+
+       reset_ip_srp_bits(port);
+
+       port->ip_send_turnstile = TURNSTILE_NULL;
 
        ipc_mqueue_init(&port->ip_messages,
-                       FALSE /* !set */, NULL /* no reserved link */);
+                       FALSE /* !set */);
 }
 
 /*
@@ -919,13 +924,15 @@ ipc_port_destroy(ipc_port_t port)
                port->ip_pdrequest = IP_NULL;
 
                /* make port be in limbo */
+               imq_lock(&port->ip_messages);
                port->ip_receiver_name = MACH_PORT_NULL;
                port->ip_destination = IP_NULL;
+               imq_unlock(&port->ip_messages);
                ip_unlock(port);
 
                if (special_reply) {
-                       ipc_port_unlink_special_reply_port(port,
-                               IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE);
+                       ipc_port_adjust_special_reply_port(port,
+                               IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE);
                }
                /* consumes our refs for port and pdrequest */
                ipc_notify_port_destroyed(pdrequest, port);
@@ -933,8 +940,11 @@ ipc_port_destroy(ipc_port_t port)
                goto drop_assertions;
        }
 
+       /* port active bit needs to be guarded under mqueue lock for turnstiles */
+       imq_lock(&port->ip_messages);
        port->ip_object.io_bits &= ~IO_BITS_ACTIVE;
        port->ip_timestamp = ipc_port_timestamp();
+       imq_unlock(&port->ip_messages);
        nsrequest = port->ip_nsrequest;
 
        /*
@@ -966,7 +976,7 @@ ipc_port_destroy(ipc_port_t port)
                kmsg = port->ip_premsg;
                assert(kmsg != IKM_NULL);
                inuse_port = ikm_prealloc_inuse_port(kmsg);
-               IP_CLEAR_PREALLOC(port, kmsg);
+               ipc_kmsg_clear_prealloc(kmsg, port);
                ip_unlock(port);
                if (inuse_port != IP_NULL) {
                        assert(inuse_port == port);
@@ -979,8 +989,8 @@ ipc_port_destroy(ipc_port_t port)
 
        /* unlink the kmsg from special reply port */
        if (special_reply) {
-               ipc_port_unlink_special_reply_port(port,
-                       IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE);
+               ipc_port_adjust_special_reply_port(port,
+                       IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE, FALSE);
        }
 
        /* throw away no-senders request */
@@ -1056,9 +1066,6 @@ ipc_port_check_circularity(
        return ipc_importance_check_circularity(port, dest);
 #else
        ipc_port_t base;
-       sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0};
-       sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0};
-       boolean_t update_knote = FALSE;
 
        assert(port != IP_NULL);
        assert(dest != IP_NULL);
@@ -1067,6 +1074,9 @@ ipc_port_check_circularity(
                return TRUE;
        base = dest;
 
+       /* Check if destination needs a turnstile */
+       ipc_port_send_turnstile_prepare(dest);
+
        /*
         *      First try a quick check that can run in parallel.
         *      No circularity if dest is not in transit.
@@ -1115,19 +1125,21 @@ ipc_port_check_circularity(
                assert(port->ip_receiver_name == MACH_PORT_NULL);
                assert(port->ip_destination == IP_NULL);
 
-               while (dest != IP_NULL) {
+               base = dest;
+               while (base != IP_NULL) {
                        ipc_port_t next;
 
                        /* dest is in transit or in limbo */
 
-                       assert(ip_active(dest));
-                       assert(dest->ip_receiver_name == MACH_PORT_NULL);
+                       assert(ip_active(base));
+                       assert(base->ip_receiver_name == MACH_PORT_NULL);
 
-                       next = dest->ip_destination;
-                       ip_unlock(dest);
-                       dest = next;
+                       next = base->ip_destination;
+                       ip_unlock(base);
+                       base = next;
                }
 
+               ipc_port_send_turnstile_complete(dest);
                return TRUE;
        }
 
@@ -1141,7 +1153,7 @@ ipc_port_check_circularity(
        ipc_port_multiple_unlock();
 
 not_circular:
-       imq_lock(&base->ip_messages);
+       imq_lock(&port->ip_messages);
 
        /* port is in limbo */
 
@@ -1152,18 +1164,27 @@ not_circular:
        ip_reference(dest);
        port->ip_destination = dest;
 
-       /* Capture the sync qos count delta */
-       for (int i = 0; i < THREAD_QOS_LAST; i++) {
-               sync_qos_delta_add[i] = port_sync_qos(port, i);
+       /* Set up linkage for the source port if it has a sync IPC push */
+       struct turnstile *send_turnstile = TURNSTILE_NULL;
+       if (port_send_turnstile(port)) {
+               send_turnstile = turnstile_prepare((uintptr_t)port,
+                       port_send_turnstile_address(port),
+                       TURNSTILE_NULL, TURNSTILE_SYNC_IPC);
+
+               turnstile_update_inheritor(send_turnstile, port_send_turnstile(dest),
+                       (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
+
+               /* update complete and turnstile complete called after dropping all locks */
        }
+       imq_unlock(&port->ip_messages);
 
        /* now unlock chain */
 
        ip_unlock(port);
 
        for (;;) {
-               /* every port along chain tracks override behind it */
-               update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub);
+               ipc_port_t next;
+
                if (dest == base)
                        break;
 
@@ -1173,9 +1194,9 @@ not_circular:
                assert(dest->ip_receiver_name == MACH_PORT_NULL);
                assert(dest->ip_destination != IP_NULL);
 
-               port = dest->ip_destination;
+               next = dest->ip_destination;
                ip_unlock(dest);
-               dest = port;
+               dest = next;
        }
 
        /* base is not in transit */
@@ -1183,456 +1204,476 @@ not_circular:
               (base->ip_receiver_name != MACH_PORT_NULL) ||
               (base->ip_destination == IP_NULL));
 
-       if (update_knote) {
-               KNOTE(&base->ip_messages.imq_klist, 0);
-       }
-       imq_unlock(&base->ip_messages);
-
        ip_unlock(base);
 
+       /* All locks dropped, call turnstile_update_inheritor_complete for source port's turnstile */
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
+               /* Take the mq lock to call turnstile complete */
+               imq_lock(&port->ip_messages);
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port), NULL);
+               send_turnstile = TURNSTILE_NULL;
+               imq_unlock(&port->ip_messages);
+               turnstile_cleanup();
+       }
+
        return FALSE;
 #endif /* !IMPORTANCE_INHERITANCE */
 }
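
The turnstile work in this path is split in two phases: the inheritor change is recorded while the port/mqueue interlocks are held (turnstile_update_inheritor with TURNSTILE_IMMEDIATE_UPDATE), and the part that may take other locks (turnstile_update_inheritor_complete, turnstile_complete, turnstile_cleanup) runs only once every port lock has been dropped. A toy standalone sketch of that two-phase shape, with hypothetical names and a pthread mutex standing in for the interlock:

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
        pthread_mutex_t interlock;
        struct obj     *inheritor;        /* who this object currently pushes on */
    };

    /* Phase 1: record the new inheritor; cheap, done under the interlock. */
    static struct obj *
    update_inheritor_locked(struct obj *o, struct obj *new_inheritor)
    {
        o->inheritor = new_inheritor;
        return o;                         /* token for the deferred phase */
    }

    /* Phase 2: the expensive propagation; must not run with interlocks held. */
    static void
    update_inheritor_complete(struct obj *token)
    {
        printf("propagating from %p to %p\n",
            (void *)token, (void *)token->inheritor);
    }

    static void
    link_objects(struct obj *src, struct obj *dst)
    {
        struct obj *pending;

        pthread_mutex_lock(&src->interlock);
        pending = update_inheritor_locked(src, dst);
        pthread_mutex_unlock(&src->interlock);

        /* all locks dropped: safe to do the slow, lock-taking part */
        update_inheritor_complete(pending);
    }

    int main(void)
    {
        struct obj a = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct obj b = { PTHREAD_MUTEX_INITIALIZER, NULL };
        link_objects(&a, &b);
        return 0;
    }
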
 
-/*
- *     Routine:        ipc_port_link_special_reply_port_with_qos
- *     Purpose:
- *             Link the special reply port with the destination port.
- *             Update the sync qos count of special reply port,
- *             destination port.
- *
- *     Conditions:
- *             Nothing is locked.
- */
-kern_return_t
-ipc_port_link_special_reply_port_with_qos(
-       ipc_port_t special_reply_port,
-       ipc_port_t dest_port,
-       int qos)
+struct turnstile *
+ipc_port_get_inheritor(ipc_port_t port)
 {
-       ipc_port_t next, base;
-       sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0};
-       sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0};
-       boolean_t update_knote = FALSE;
-       boolean_t multiple_lock = FALSE;
+       ipc_mqueue_t mqueue = &port->ip_messages;
+       struct knote *kn;
 
-       ip_lock(dest_port);
+       assert(imq_held(mqueue));
 
-       /* Check if dest is active */
-       if (!ip_active(dest_port)) {
-               ip_unlock(dest_port);
-               return KERN_FAILURE;
+       if (!IMQ_KLIST_VALID(mqueue)) {
+               return IMQ_INHERITOR(mqueue);
        }
 
-       if ((dest_port->ip_receiver_name == MACH_PORT_NULL) &&
-           (dest_port->ip_destination != IP_NULL)) {
-               /* dest_port is in transit; need to take the serialize lock */
-               ip_unlock(dest_port);
-               goto take_multiple_lock;
-       }
-
-       /* Check if the port is a special reply port */
-       if (ip_lock_try(special_reply_port)) {
-               if (!special_reply_port->ip_specialreply ||
-                   !special_reply_port->ip_link_sync_qos ||
-                   (special_reply_port->ip_sync_qos_override_port != IP_NULL &&
-                    special_reply_port->ip_sync_qos_override_port != dest_port)) {
-
-                       boolean_t link_sync_qos = special_reply_port->ip_link_sync_qos;
-                       ip_unlock(special_reply_port);
-                       ip_unlock(dest_port);
-                       /* return KERN_SUCCESS when link_sync_qos is not set */
-                       if (!link_sync_qos) {
-                               return KERN_SUCCESS;
-                       }
-                       return KERN_FAILURE;
-               } else {
-                       goto both_ports_locked;
+       SLIST_FOREACH(kn, &port->ip_messages.imq_klist, kn_selnext) {
+               if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
+                       return filt_machport_kqueue_turnstile(kn);
                }
        }
 
-       ip_unlock(dest_port);
-
-take_multiple_lock:
+       return TURNSTILE_NULL;
+}
 
-       ipc_port_multiple_lock(); /* massive serialization */
-       multiple_lock = TRUE;
+/*
+ *     Routine:        ipc_port_send_turnstile_prepare
+ *     Purpose:
+ *             Get a reference on the port's send turnstile; if the
+ *             port does not have a send turnstile, allocate one.
+ *
+ *     Conditions:
+ *             Nothing is locked.
+ */
+void
+ipc_port_send_turnstile_prepare(ipc_port_t port)
+{
+       struct turnstile *turnstile = TURNSTILE_NULL;
+       struct turnstile *inheritor = TURNSTILE_NULL;
+       struct turnstile *send_turnstile = TURNSTILE_NULL;
 
-       ip_lock(special_reply_port);
+retry_alloc:
+       imq_lock(&port->ip_messages);
 
-       /* Check if the special reply port is marked regular */
-       if (!special_reply_port->ip_specialreply ||
-           !special_reply_port->ip_link_sync_qos ||
-           (special_reply_port->ip_sync_qos_override_port != IP_NULL &&
-            special_reply_port->ip_sync_qos_override_port != dest_port)) {
+       if (port_send_turnstile(port) == NULL ||
+           port_send_turnstile(port)->ts_port_ref == 0) {
 
-               boolean_t link_sync_qos = special_reply_port->ip_link_sync_qos;
-               ip_unlock(special_reply_port);
-               ipc_port_multiple_unlock();
-               /* return KERN_SUCCESS when link_sync_qos is not set */
-               if (!link_sync_qos) {
-                       return KERN_SUCCESS;
+               if (turnstile == TURNSTILE_NULL) {
+                       imq_unlock(&port->ip_messages);
+                       turnstile = turnstile_alloc();
+                       goto retry_alloc;
                }
-               return KERN_FAILURE;
-       }
 
-       ip_lock(dest_port);
+               send_turnstile = turnstile_prepare((uintptr_t)port,
+                       port_send_turnstile_address(port),
+                       turnstile, TURNSTILE_SYNC_IPC);
+               turnstile = TURNSTILE_NULL;
 
-both_ports_locked:
-       next = dest_port;
+               /*
+                * If the port is in transit, set up linkage for its turnstile;
+                * otherwise link it to the WL turnstile.
+                */
+               if (ip_active(port) &&
+                   port->ip_receiver_name == MACH_PORT_NULL &&
+                   port->ip_destination != IP_NULL) {
+                       assert(port->ip_receiver_name == MACH_PORT_NULL);
+                       assert(port->ip_destination != IP_NULL);
 
-       /* Apply the qos to special reply port, capture the old qos */
-       if (special_reply_port->ip_sync_qos_override_port != IP_NULL) {
-               /* Check if qos needs to be updated */
-               if ((sync_qos_count_t)qos <= port_special_qos(special_reply_port)) {
-                       imq_lock(&dest_port->ip_messages);
-                       goto done_update;
+                       inheritor = port_send_turnstile(port->ip_destination);
+               } else {
+                       inheritor = ipc_port_get_inheritor(port);
                }
-               sync_qos_delta_sub[port_special_qos(special_reply_port)]++;
+               turnstile_update_inheritor(send_turnstile, inheritor,
+                       TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE);
+               /* turnstile complete will be called in ipc_port_send_turnstile_complete */
        }
 
-       set_port_special_qos(special_reply_port, (sync_qos_count_t)qos);
-       sync_qos_delta_add[qos]++;
+       /* Increment turnstile counter */
+       port_send_turnstile(port)->ts_port_ref++;
+       imq_unlock(&port->ip_messages);
 
-       /* Link the special reply port to dest port */
-       if (special_reply_port->ip_sync_qos_override_port == IP_NULL) {
-               /* take a reference on dest_port */
-               ip_reference(dest_port);
-               special_reply_port->ip_sync_qos_override_port = dest_port;
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile,
+                       TURNSTILE_INTERLOCK_NOT_HELD);
        }
-
-       /* Apply the sync qos delta to all in-transit ports */
-       for (;;) {
-               boolean_t port_not_in_transit = FALSE;
-               if (!ip_active(next) ||
-                   (next->ip_receiver_name != MACH_PORT_NULL) ||
-                   (next->ip_destination == IP_NULL)) {
-                       /* Get the mqueue lock for destination port to update knotes */
-                       imq_lock(&next->ip_messages);
-                       port_not_in_transit = TRUE;
-               }
-               /* Apply the sync qos delta */
-               update_knote = ipc_port_sync_qos_delta(next, sync_qos_delta_add, sync_qos_delta_sub);
-
-               if (port_not_in_transit)
-                       break;
-
-               next = next->ip_destination;
-               ip_lock(next);
+       if (turnstile != TURNSTILE_NULL) {
+               turnstile_deallocate(turnstile);
        }
-done_update:
+}
 
-       if (multiple_lock) {
-               ipc_port_multiple_unlock();
-       }
 
-       ip_unlock(special_reply_port);
-       base = next;
-       next = dest_port;
+/*
+ *     Routine:        ipc_port_send_turnstile_complete
+ *     Purpose:
+ *             Drop a ref on the port's send turnstile; if the
+ *             ref becomes zero, deallocate the turnstile.
+ *
+ *     Conditions:
+ *             The space might be locked, use safe deallocate.
+ */
+void
+ipc_port_send_turnstile_complete(ipc_port_t port)
+{
+       struct turnstile *turnstile = TURNSTILE_NULL;
 
-       while (next != base) {
-               ipc_port_t prev = next;
-               next = next->ip_destination;
+       /* Drop turnstile count on dest port */
+       imq_lock(&port->ip_messages);
 
-               ip_unlock(prev);
+       port_send_turnstile(port)->ts_port_ref--;
+       if (port_send_turnstile(port)->ts_port_ref == 0) {
+               turnstile_complete((uintptr_t)port, port_send_turnstile_address(port),
+                               &turnstile);
+               assert(turnstile != TURNSTILE_NULL);
        }
+       imq_unlock(&port->ip_messages);
+       turnstile_cleanup();
 
-       if (update_knote) {
-               KNOTE(&base->ip_messages.imq_klist, 0);
+       if (turnstile != TURNSTILE_NULL) {
+               turnstile_deallocate_safe(turnstile);
+               turnstile = TURNSTILE_NULL;
        }
-       imq_unlock(&base->ip_messages);
-       ip_unlock(base);
-       return KERN_SUCCESS;
 }
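
ipc_port_send_turnstile_prepare() and ipc_port_send_turnstile_complete() behave as a reference-counted lazy-allocation pair: the allocation happens with the mqueue lock dropped (hence the retry loop), the first preparer installs the turnstile, later callers only bump ts_port_ref, and the last completer tears it down outside the lock. A small standalone sketch of the same pattern, with hypothetical names and a pthread mutex in place of the mqueue lock:

    #include <assert.h>
    #include <pthread.h>
    #include <stdlib.h>

    struct resource { int refs; };

    struct port {
        pthread_mutex_t  lock;
        struct resource *res;         /* lazily allocated, refcounted */
    };

    static void
    port_resource_prepare(struct port *p)
    {
        struct resource *fresh = NULL;

    retry:
        pthread_mutex_lock(&p->lock);
        if (p->res == NULL) {
            if (fresh == NULL) {
                /* allocation may block: drop the lock, allocate, retry */
                pthread_mutex_unlock(&p->lock);
                fresh = calloc(1, sizeof(*fresh));
                goto retry;
            }
            p->res = fresh;           /* we won the race: install ours */
            fresh = NULL;
        }
        p->res->refs++;
        pthread_mutex_unlock(&p->lock);

        if (fresh != NULL)            /* someone else installed first */
            free(fresh);
    }

    static void
    port_resource_complete(struct port *p)
    {
        struct resource *gone = NULL;

        pthread_mutex_lock(&p->lock);
        if (--p->res->refs == 0) {
            gone = p->res;
            p->res = NULL;
        }
        pthread_mutex_unlock(&p->lock);

        if (gone != NULL)
            free(gone);               /* deallocate outside the lock */
    }

    int main(void)
    {
        struct port p = { PTHREAD_MUTEX_INITIALIZER, NULL };
        port_resource_prepare(&p);
        port_resource_complete(&p);
        assert(p.res == NULL);
        return 0;
    }
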
 
+
 /*
- *     Routine:        ipc_port_unlink_special_reply_port_locked
+ *     Routine:        ipc_port_rcv_turnstile_waitq
  *     Purpose:
- *             If the special port is linked to a port, adjust it's sync qos override and unlink the port.
- *     Condition:
- *             Special reply port locked on entry.
- *             Special reply port unlocked on return.
- *     Returns:
- *             None.
+ *             Given the mqueue's waitq, find the port's
+ *             rcv turnstile and return its waitq.
+ *
+ *     Conditions:
+ *             mqueue locked or thread waiting on turnstile is locked.
  */
-void
-ipc_port_unlink_special_reply_port_locked(
-       ipc_port_t special_reply_port,
-       struct knote *kn,
-       uint8_t flags)
+struct waitq *
+ipc_port_rcv_turnstile_waitq(struct waitq *waitq)
 {
-       ipc_port_t dest_port;
-       sync_qos_count_t sync_qos;
-       sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0};
-       sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0};
+       struct waitq *safeq;
 
-       /* Return if called from copy out in pseudo receive */
-       if (kn == ITH_KNOTE_PSEUDO) {
-               ip_unlock(special_reply_port);
-               return;
-       }
-
-       /* check if special port has a port linked to it */
-       if (special_reply_port->ip_specialreply == 0 ||
-           special_reply_port->ip_sync_qos_override_port == IP_NULL) {
-               set_port_special_qos(special_reply_port, 0);
-               if (flags & IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY) {
-                       special_reply_port->ip_specialreply = 0;
-               }
-               if (flags & IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE) {
-                       special_reply_port->ip_link_sync_qos = 1;
-               }
-               ip_unlock(special_reply_port);
-               return;
-       }
-
-       /*
-        * port->ip_sync_qos_override_port is not null and it is safe
-        * to access it since ip_specialreply is set.
-        */
-       dest_port = special_reply_port->ip_sync_qos_override_port;
-       sync_qos_delta_sub[port_special_qos(special_reply_port)]++;
-       sync_qos = port_special_qos(special_reply_port);
-
-       /* Clear qos delta for special reply port */
-       set_port_special_qos(special_reply_port, 0);
-       special_reply_port->ip_sync_qos_override_port = IP_NULL;
-       if (flags & IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY) {
-               special_reply_port->ip_specialreply = 0;
-       }
+       ipc_mqueue_t mqueue = imq_from_waitq(waitq);
+       ipc_port_t port = ip_from_mq(mqueue);
+       struct turnstile *rcv_turnstile = ipc_port_rcv_turnstile(port);
 
-       if (flags & IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE) {
-               special_reply_port->ip_link_sync_qos = 1;
+       /* Check if the port has a rcv turnstile */
+       if (rcv_turnstile != TURNSTILE_NULL) {
+               safeq = &rcv_turnstile->ts_waitq;
        } else {
-               special_reply_port->ip_link_sync_qos = 0;
+               safeq = global_eventq(waitq);
        }
-
-       ip_unlock(special_reply_port);
-
-       /* Add the sync qos on knote */
-       if (ITH_KNOTE_VALID(kn)) {
-               knote_adjust_sync_qos(kn, sync_qos, TRUE);
-       }
-
-       /* Adjust the sync qos of destination */
-       ipc_port_adjust_sync_qos(dest_port, sync_qos_delta_add, sync_qos_delta_sub);
-       ip_release(dest_port);
+       return safeq;
 }
 
+
 /*
- *     Routine:        ipc_port_unlink_special_reply_port
+ *     Routine:        ipc_port_rcv_turnstile
  *     Purpose:
- *             If the special port is linked to a port, adjust it's sync qos override and unlink the port.
- *     Condition:
- *             Nothing locked.
- *     Returns:
- *             None.
+ *             Get the port's receive turnstile
+ *
+ *     Conditions:
+ *             mqueue locked or thread waiting on turnstile is locked.
  */
-void
-ipc_port_unlink_special_reply_port(
-       ipc_port_t special_reply_port,
-       uint8_t flags)
+struct turnstile *
+ipc_port_rcv_turnstile(ipc_port_t port)
 {
-       ip_lock(special_reply_port);
-       ipc_port_unlink_special_reply_port_locked(special_reply_port, NULL, flags);
-       /* special_reply_port unlocked */
+       return turnstile_lookup_by_proprietor((uintptr_t)port);
 }
 
+
 /*
- *     Routine:        ipc_port_sync_qos_delta
+ *     Routine:        ipc_port_link_special_reply_port
  *     Purpose:
- *             Adjust the sync qos count associated with a port.
+ *             Link the special reply port with the destination port.
+ *             Allocate a turnstile for the dest port.
  *
- *             For now, be defensive during deductions to make sure the
- *             sync_qos count for the port doesn't underflow zero.
- *     Returns:
- *             TRUE: if max sync qos of the port changes.
- *             FALSE: otherwise.
  *     Conditions:
- *             The port is referenced and locked.
- *             The mqueue is locked if port is not in-transit.
+ *             Nothing is locked.
  */
-boolean_t
-ipc_port_sync_qos_delta(
-       ipc_port_t        port,
-       sync_qos_count_t *sync_qos_delta_add,
-       sync_qos_count_t *sync_qos_delta_sub)
+void
+ipc_port_link_special_reply_port(
+       ipc_port_t special_reply_port,
+       ipc_port_t dest_port)
 {
-       sync_qos_count_t max_sync_qos_index;
+       boolean_t drop_turnstile_ref = FALSE;
 
-       if (!ip_active(port)) {
-               return FALSE;
-       }
+       /* Check if dest_port needs a turnstile */
+       ipc_port_send_turnstile_prepare(dest_port);
+
+       /* Lock the special reply port and establish the linkage */
+       ip_lock(special_reply_port);
+       imq_lock(&special_reply_port->ip_messages);
 
-       max_sync_qos_index = ipc_port_get_max_sync_qos_index(port);
+       /* Check if we need to drop the acquired turnstile ref on dest port */
+       if (!special_reply_port->ip_specialreply ||
+           special_reply_port->ip_sync_link_state != PORT_SYNC_LINK_ANY ||
+           special_reply_port->ip_sync_inheritor_port != IPC_PORT_NULL) {
+               drop_turnstile_ref = TRUE;
+       } else {
+               /* take a reference on dest_port */
+               ip_reference(dest_port);
+               special_reply_port->ip_sync_inheritor_port = dest_port;
+               special_reply_port->ip_sync_link_state = PORT_SYNC_LINK_PORT;
+       }
 
-       for (int i = 0; i < THREAD_QOS_LAST; i++) {
-               sync_qos_count_t port_sync_qos_count = port_sync_qos(port, i);
-               /* Do not let the sync qos underflow */
-               if (sync_qos_delta_sub[i] > port_sync_qos_count) {
-                       KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_UNDERFLOW),
-                             i, VM_KERNEL_UNSLIDE_OR_PERM(port),
-                             port_sync_qos_count, sync_qos_delta_sub[i]);
+       imq_unlock(&special_reply_port->ip_messages);
+       ip_unlock(special_reply_port);
 
-                       set_port_sync_qos(port, i, 0);
-               } else if (sync_qos_delta_sub[i] != 0) {
-                       KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_REMOVED),
-                             i, VM_KERNEL_UNSLIDE_OR_PERM(port),
-                             port_sync_qos_count, sync_qos_delta_sub[i]);
+       if (drop_turnstile_ref) {
+               ipc_port_send_turnstile_complete(dest_port);
+       }
 
-                       set_port_sync_qos(port, i, (port_sync_qos_count - sync_qos_delta_sub[i]));
-               }
+       return;
+}
 
-               port_sync_qos_count = port_sync_qos(port, i);
-               /* Do not let the sync qos overflow */
-               if (UCHAR_MAX - sync_qos_delta_add[i] < port_sync_qos_count) {
-                       KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_OVERFLOW),
-                             i, VM_KERNEL_UNSLIDE_OR_PERM(port),
-                             port_sync_qos_count, sync_qos_delta_add[i]);
+#if DEVELOPMENT || DEBUG
+inline void
+reset_ip_srp_bits(ipc_port_t special_reply_port)
+{
+       special_reply_port->ip_srp_lost_link = 0;
+       special_reply_port->ip_srp_msg_sent = 0;
+}
 
-                       set_port_sync_qos(port, i, UCHAR_MAX);
-               } else if (sync_qos_delta_add[i] != 0) {
-                       KDBG_FILTERED(IMPORTANCE_CODE(IMP_SYNC_IPC_QOS, IMP_SYNC_IPC_QOS_APPLIED),
-                             i, VM_KERNEL_UNSLIDE_OR_PERM(port),
-                             port_sync_qos_count, sync_qos_delta_add[i]);
+inline void
+reset_ip_srp_msg_sent(ipc_port_t special_reply_port)
+{
+       if (special_reply_port->ip_specialreply == 1) {
+               special_reply_port->ip_srp_msg_sent = 0;
+       }
+}
 
-                       set_port_sync_qos(port, i, (port_sync_qos_count + sync_qos_delta_add[i]));
-               }
+inline void
+set_ip_srp_msg_sent(ipc_port_t special_reply_port)
+{
+       if (special_reply_port->ip_specialreply == 1) {
+               special_reply_port->ip_srp_msg_sent = 1;
        }
-       return (ipc_port_get_max_sync_qos_index(port) != max_sync_qos_index);
 }
 
-/*
- *     Routine:        ipc_port_get_max_sync_qos_index
- *     Purpose:
- *             Return the max sync qos of the port.
- *
- *     Conditions:
- */
-sync_qos_count_t
-ipc_port_get_max_sync_qos_index(
-       ipc_port_t      port)
+inline void
+set_ip_srp_lost_link(ipc_port_t special_reply_port)
 {
-       int i;
-       for (i = THREAD_QOS_LAST - 1; i >= 0; i--) {
-               if (port_sync_qos(port, i) != 0) {
-                       return i;
-               }
+       if (special_reply_port->ip_specialreply == 1 && special_reply_port->ip_srp_msg_sent == 0) {
+               special_reply_port->ip_srp_lost_link = 1;
        }
-       return THREAD_QOS_UNSPECIFIED;
 }
 
+#else /* DEVELOPMENT || DEBUG */
+inline void
+reset_ip_srp_bits(__unused ipc_port_t special_reply_port)
+{
+       return;
+}
+
+inline void
+reset_ip_srp_msg_sent(__unused ipc_port_t special_reply_port)
+{
+       return;
+}
+
+inline void
+set_ip_srp_msg_sent(__unused ipc_port_t special_reply_port)
+{
+       return;
+}
+
+inline void
+set_ip_srp_lost_link(__unused ipc_port_t special_reply_port)
+{
+       return;
+}
+#endif /* DEVELOPMENT || DEBUG */
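
These helpers record special-reply-port diagnostics only on DEVELOPMENT/DEBUG kernels and collapse to empty inlines otherwise, so release builds pay nothing for the bookkeeping. A tiny standalone sketch of that conditional-instrumentation pattern, with a hypothetical DEBUG_BUILD switch:

    #include <stdio.h>

    #define DEBUG_BUILD 1             /* flip to 0 for the "release" variant */

    struct port { unsigned msg_sent:1, lost_link:1; };

    #if DEBUG_BUILD
    static inline void set_msg_sent(struct port *p) { p->msg_sent = 1; }
    static inline void reset_bits(struct port *p)   { p->msg_sent = 0; p->lost_link = 0; }
    #else
    /* release build: same call sites, but the helpers vanish after inlining */
    static inline void set_msg_sent(struct port *p) { (void)p; }
    static inline void reset_bits(struct port *p)   { (void)p; }
    #endif

    int main(void)
    {
        struct port p = { 0, 0 };
        set_msg_sent(&p);
        printf("msg_sent=%u\n", p.msg_sent);
        reset_bits(&p);
        return 0;
    }
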
+
 /*
- *     Routine:        ipc_port_adjust_sync_qos
+ *     Routine:        ipc_port_adjust_special_reply_port_locked
  *     Purpose:
- *             Adjust sync qos of the port and it's destination
- *             port if the port is in transit.
- *     Conditions:
- *             Nothing locked.
+ *             If the special port has a turnstile, update its inheritor.
+ *     Condition:
+ *             Special reply port locked on entry.
+ *             Special reply port unlocked on return.
  *     Returns:
  *             None.
  */
 void
-ipc_port_adjust_sync_qos(
-       ipc_port_t port,
-       sync_qos_count_t *sync_qos_delta_add,
-       sync_qos_count_t *sync_qos_delta_sub)
+ipc_port_adjust_special_reply_port_locked(
+       ipc_port_t special_reply_port,
+       struct knote *kn,
+       uint8_t flags,
+       boolean_t get_turnstile)
 {
-       boolean_t update_knote;
-       boolean_t multiple_lock = FALSE;
-       ipc_port_t dest, base, next;
+       ipc_port_t dest_port = IPC_PORT_NULL;
+       int sync_link_state = PORT_SYNC_LINK_NO_LINKAGE;
+       turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
+       struct turnstile *dest_ts = TURNSTILE_NULL, *ts = TURNSTILE_NULL;
 
-       ip_lock(port);
+       imq_lock(&special_reply_port->ip_messages);
 
-       /* Check if the port is in transit */
-       if (!ip_active(port) ||
-           (port->ip_receiver_name != MACH_PORT_NULL) ||
-           (port->ip_destination == IP_NULL)) {
-               /* lock the mqueue since port is not in-transit */
-               imq_lock(&port->ip_messages);
-               update_knote = ipc_port_sync_qos_delta(port, sync_qos_delta_add, sync_qos_delta_sub);
-               if (update_knote) {
-                       KNOTE(&port->ip_messages.imq_klist, 0);
+       if (flags & IPC_PORT_ADJUST_SR_RECEIVED_MSG) {
+               reset_ip_srp_msg_sent(special_reply_port);
+       }
+
+       /* Check if the special reply port is marked non-special */
+       if (special_reply_port->ip_specialreply == 0 ||
+                       special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY) {
+               if (get_turnstile) {
+                       turnstile_complete((uintptr_t)special_reply_port,
+                               port_rcv_turnstile_address(special_reply_port),
+                       NULL);
+               }
+               imq_unlock(&special_reply_port->ip_messages);
+               ip_unlock(special_reply_port);
+               if (get_turnstile) {
+                       turnstile_cleanup();
                }
-               imq_unlock(&port->ip_messages);
-               ip_unlock(port);
                return;
        }
 
-       dest = port->ip_destination;
-       assert(dest != IP_NULL);
+       /* Clear thread's special reply port and clear linkage */
+       if (flags & IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY) {
+               /* This option should only be specified by a non-blocking thread */
+               assert(get_turnstile == FALSE);
+               special_reply_port->ip_specialreply = 0;
 
-       if (ip_lock_try(dest)) {
-               if (!ip_active(dest) ||
-                   (dest->ip_receiver_name != MACH_PORT_NULL) ||
-                   (dest->ip_destination == IP_NULL)) {
-                       update_knote = ipc_port_sync_qos_delta(port, sync_qos_delta_add, sync_qos_delta_sub);
-                       ip_unlock(port);
+               reset_ip_srp_bits(special_reply_port);
 
-                       /* lock the mqueue since dest is not in-transit */
-                       imq_lock(&dest->ip_messages);
-                       update_knote = ipc_port_sync_qos_delta(dest, sync_qos_delta_add, sync_qos_delta_sub);
-                       if (update_knote) {
-                               KNOTE(&dest->ip_messages.imq_klist, 0);
-                       }
-                       imq_unlock(&dest->ip_messages);
-                       ip_unlock(dest);
+               /* Check if need to break linkage */
+               if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_NO_LINKAGE) {
+                       imq_unlock(&special_reply_port->ip_messages);
+                       ip_unlock(special_reply_port);
                        return;
                }
-
-               /* dest is in transit; need to take the serialize lock */
-               ip_unlock(dest);
+       } else if (flags & IPC_PORT_ADJUST_SR_LINK_WORKLOOP) {
+               if (special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_ANY ||
+                   special_reply_port->ip_sync_link_state == PORT_SYNC_LINK_PORT) {
+                       if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND_ONCE)) {
+                               inheritor = filt_machport_stash_port(kn, special_reply_port,
+                                               &sync_link_state);
+                       }
+               }
+       } else if (flags & IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE) {
+               sync_link_state = PORT_SYNC_LINK_ANY;
+       }
+
+       switch (special_reply_port->ip_sync_link_state) {
+       case PORT_SYNC_LINK_PORT:
+               dest_port = special_reply_port->ip_sync_inheritor_port;
+               special_reply_port->ip_sync_inheritor_port = IPC_PORT_NULL;
+               break;
+       case PORT_SYNC_LINK_WORKLOOP_KNOTE:
+               special_reply_port->ip_sync_inheritor_knote = NULL;
+               break;
+       case PORT_SYNC_LINK_WORKLOOP_STASH:
+               dest_ts = special_reply_port->ip_sync_inheritor_ts;
+               special_reply_port->ip_sync_inheritor_ts = NULL;
+               break;
+       }
+
+       special_reply_port->ip_sync_link_state = sync_link_state;
+
+       switch (sync_link_state) {
+       case PORT_SYNC_LINK_WORKLOOP_KNOTE:
+               special_reply_port->ip_sync_inheritor_knote = kn;
+               break;
+       case PORT_SYNC_LINK_WORKLOOP_STASH:
+               turnstile_reference(inheritor);
+               special_reply_port->ip_sync_inheritor_ts = inheritor;
+               break;
+       case PORT_SYNC_LINK_NO_LINKAGE:
+               if (flags & IPC_PORT_ADJUST_SR_ENABLE_EVENT) {
+                       set_ip_srp_lost_link(special_reply_port);
+               }
+               break;
        }
 
-       ip_unlock(port);
-
-       ipc_port_multiple_lock(); /* massive serialization */
-       multiple_lock = TRUE;
-
-       ip_lock(port);
-       next = port;
-
-       /* Apply the sync qos delta to all in-transit ports */
-       for (;;) {
-               boolean_t port_not_in_transit = FALSE;
-
-               if (!ip_active(next) ||
-                   (next->ip_receiver_name != MACH_PORT_NULL) ||
-                   (next->ip_destination == IP_NULL)) {
-                       /* Get the mqueue lock for destination port to update knotes */
-                       imq_lock(&next->ip_messages);
-                       port_not_in_transit = TRUE;
+       /* Get thread's turnstile donated to special reply port */
+       if (get_turnstile) {
+               turnstile_complete((uintptr_t)special_reply_port,
+                       port_rcv_turnstile_address(special_reply_port),
+                       NULL);
+       } else {
+               ts = ipc_port_rcv_turnstile(special_reply_port);
+               if (ts) {
+                       turnstile_reference(ts);
+                       turnstile_update_inheritor(ts, inheritor,
+                                       (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
                }
+       }
 
-               /* Apply the sync qos delta */
-               update_knote = ipc_port_sync_qos_delta(next, sync_qos_delta_add, sync_qos_delta_sub);
-
-               if (port_not_in_transit)
-                       break;
+       imq_unlock(&special_reply_port->ip_messages);
+       ip_unlock(special_reply_port);
 
-               next = next->ip_destination;
-               ip_lock(next);
+       if (get_turnstile) {
+               turnstile_cleanup();
+       } else if (ts) {
+               /* Call turnstile cleanup after dropping the interlock */
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+               turnstile_deallocate_safe(ts);
        }
 
-       if (multiple_lock) {
-               ipc_port_multiple_unlock();
+       /* Release the ref on the dest port and its turnstile */
+       if (dest_port) {
+               ipc_port_send_turnstile_complete(dest_port);
+               /* release the reference on the dest port */
+               ip_release(dest_port);
        }
 
-       base = next;
-       next = port;
+       if (dest_ts) {
+               turnstile_deallocate_safe(dest_ts);
+       }
+}
 
-       while (next != base) {
-               ipc_port_t prev = next;
-               next = next->ip_destination;
+/*
+ *     Routine:        ipc_port_adjust_special_reply_port
+ *     Purpose:
+ *             If the special port has a turnstile, update its inheritor.
+ *     Condition:
+ *             Nothing locked.
+ *     Returns:
+ *             None.
+ */
+void
+ipc_port_adjust_special_reply_port(
+       ipc_port_t special_reply_port,
+       uint8_t flags,
+       boolean_t get_turnstile)
+{
+       ip_lock(special_reply_port);
+       ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL, flags, get_turnstile);
+       /* special_reply_port unlocked */
+}
 
-               ip_unlock(prev);
-       }
+/*
+ *     Routine:        ipc_port_get_special_reply_port_inheritor
+ *     Purpose:
+ *             Returns the current inheritor of the special reply port
+ *     Condition:
+ *             mqueue is locked, port is a special reply port
+ *     Returns:
+ *             the current inheritor
+ */
+turnstile_inheritor_t
+ipc_port_get_special_reply_port_inheritor(
+       ipc_port_t port)
+{
+       assert(port->ip_specialreply);
+       imq_held(&port->ip_messages);
 
-       if (update_knote) {
-               KNOTE(&base->ip_messages.imq_klist, 0);
+       switch (port->ip_sync_link_state) {
+       case PORT_SYNC_LINK_PORT:
+               if (port->ip_sync_inheritor_port != NULL) {
+                       return port_send_turnstile(port->ip_sync_inheritor_port);
+               }
+               break;
+       case PORT_SYNC_LINK_WORKLOOP_KNOTE:
+               return filt_machport_stashed_special_reply_port_turnstile(port);
+       case PORT_SYNC_LINK_WORKLOOP_STASH:
+               return port->ip_sync_inheritor_ts;
        }
-       imq_unlock(&base->ip_messages);
-       ip_unlock(base);
+       return TURNSTILE_INHERITOR_NULL;
 }
 
 /*
@@ -2052,6 +2093,40 @@ ipc_port_copyout_send(
        return name;
 }
 
+/*
+ *     Routine:        ipc_port_copyout_name_send
+ *     Purpose:
+ *             Copyout a naked send right (possibly null/dead) to given name,
+ *             or if that fails, destroy the right.
+ *     Conditions:
+ *             Nothing locked.
+ */
+
+mach_port_name_t
+ipc_port_copyout_name_send(
+       ipc_port_t      sright,
+       ipc_space_t     space,
+       mach_port_name_t name)
+{
+       if (IP_VALID(sright)) {
+               kern_return_t kr;
+
+               kr = ipc_object_copyout_name(space, (ipc_object_t) sright,
+                                       MACH_MSG_TYPE_PORT_SEND, TRUE, name);
+               if (kr != KERN_SUCCESS) {
+                       ipc_port_release_send(sright);
+
+                       if (kr == KERN_INVALID_CAPABILITY)
+                               name = MACH_PORT_DEAD;
+                       else
+                               name = MACH_PORT_NULL;
+               }
+       } else
+               name = CAST_MACH_PORT_TO_NAME(sright);
+
+       return name;
+}
+
 /*
  *     Routine:        ipc_port_release_send
  *     Purpose:
@@ -2165,7 +2240,7 @@ ipc_port_release_sonce(
        if (!IP_VALID(port))
                return;
 
-       ipc_port_unlink_special_reply_port(port, IPC_PORT_UNLINK_SR_NONE);
+       ipc_port_adjust_special_reply_port(port, IPC_PORT_ADJUST_SR_NONE, FALSE);
 
        ip_lock(port);
 
@@ -2205,8 +2280,10 @@ ipc_port_release_receive(
 
        ipc_port_destroy(port); /* consumes ref, unlocks */
 
-       if (dest != IP_NULL)
+       if (dest != IP_NULL) {
+               ipc_port_send_turnstile_complete(dest);
                ip_release(dest);
+       }
 }
 
 /*
@@ -2272,8 +2349,10 @@ ipc_port_dealloc_special(
         *      the ipc_space_kernel check in ipc_mqueue_send.
         */
 
+       imq_lock(&port->ip_messages);
        port->ip_receiver_name = MACH_PORT_NULL;
        port->ip_receiver = IS_NULL;
+       imq_unlock(&port->ip_messages);
 
        /* relevant part of ipc_port_clear_receiver */
        ipc_port_set_mscount(port, 0);
@@ -2297,7 +2376,13 @@ ipc_port_finalize(
 {
        ipc_port_request_t requests = port->ip_requests;
 
-       assert(!ip_active(port));
+       assert(port_send_turnstile(port) == TURNSTILE_NULL);
+       assert(ipc_port_rcv_turnstile(port) == TURNSTILE_NULL);
+
+       if (ip_active(port)) {
+               panic("Trying to free an active port. port %p", port);
+       }
+
        if (requests != IPR_NULL) {
                ipc_table_size_t its = requests->ipr_size;
                it_requests_free(its, requests);
@@ -2327,11 +2412,13 @@ ipc_port_finalize(
 void
 kdp_mqueue_send_find_owner(struct waitq * waitq, __assert_only event64_t event, thread_waitinfo_t * waitinfo)
 {
+       struct turnstile *turnstile;
        assert(waitinfo->wait_type == kThreadWaitPortSend);
        assert(event == IPC_MQUEUE_FULL);
+       assert(waitq_is_turnstile_queue(waitq));
 
-       ipc_mqueue_t mqueue = imq_from_waitq(waitq);
-       ipc_port_t port     = ip_from_mq(mqueue); /* we are blocking on send */
+       turnstile = waitq_to_turnstile(waitq);
+       ipc_port_t port     = (ipc_port_t)turnstile->ts_proprietor; /* we are blocking on send */
        assert(kdp_is_in_zone(port, "ipc ports"));
 
        waitinfo->owner = 0;
index c2be8199b3ce1acf8343c41438d7f0ff651f75e4..16addb831135ba91a49f970ce07434900d28267b 100644 (file)
@@ -84,6 +84,7 @@
 
 #include <kern/assert.h>
 #include <kern/kern_types.h>
+#include <kern/turnstile.h>
 
 #include <ipc/ipc_types.h>
 #include <ipc/ipc_object.h>
@@ -128,18 +129,18 @@ struct ipc_port {
        union {
                ipc_kobject_t kobject;
                ipc_importance_task_t imp_task;
-               ipc_port_t sync_qos_override_port;
+               ipc_port_t sync_inheritor_port;
+               struct knote *sync_inheritor_knote;
+               struct turnstile *sync_inheritor_ts;
        } kdata;
-               
+
        struct ipc_port *ip_nsrequest;
        struct ipc_port *ip_pdrequest;
        struct ipc_port_request *ip_requests;
        union {
                struct ipc_kmsg *premsg;
-               struct {
-                       sync_qos_count_t sync_qos[THREAD_QOS_LAST];
-                       sync_qos_count_t special_port_qos;
-               } qos_counter;
+               struct turnstile *send_turnstile;
+               SLIST_ENTRY(ipc_port) dealloc_elm;
        } kdata2;
 
        mach_vm_address_t ip_context;
@@ -151,8 +152,8 @@ struct ipc_port {
                  ip_guarded:1,         /* port guarded (use context value as guard) */
                  ip_strict_guard:1,    /* Strict guarding; Prevents user manipulation of context values directly */
                  ip_specialreply:1,    /* port is a special reply port */
-                 ip_link_sync_qos:1,   /* link the special reply port to destination port */
-                 ip_impcount:24;       /* number of importance donations in nested queue */
+                 ip_sync_link_state:3, /* link the special reply port to a destination port / workloop */
+                 ip_impcount:22;       /* number of importance donations in nested queue */
 
        mach_port_mscount_t ip_mscount;
        mach_port_rights_t ip_srights;
@@ -167,6 +168,10 @@ struct ipc_port {
        uintptr_t       ip_callstack[IP_CALLSTACK_MAX]; /* stack trace */
        unsigned long   ip_spares[IP_NSPARES]; /* for debugging */
 #endif /* MACH_ASSERT */
+#if DEVELOPMENT || DEBUG
+       uint8_t         ip_srp_lost_link:1,     /* special reply port turnstile link chain broken */
+                       ip_srp_msg_sent:1;      /* special reply port msg sent */
+#endif
 };
 
 
@@ -182,32 +187,63 @@ struct ipc_port {
 
 #define ip_kobject             kdata.kobject
 #define ip_imp_task            kdata.imp_task
-#define ip_sync_qos_override_port      kdata.sync_qos_override_port
+#define ip_sync_inheritor_port kdata.sync_inheritor_port
+#define ip_sync_inheritor_knote        kdata.sync_inheritor_knote
+#define ip_sync_inheritor_ts   kdata.sync_inheritor_ts
 
 #define ip_premsg              kdata2.premsg
-#define ip_sync_qos            kdata2.qos_counter.sync_qos
-#define ip_special_port_qos     kdata2.qos_counter.special_port_qos
-
-#define port_sync_qos(port, i) (IP_PREALLOC(port) ? (port)->ip_premsg->sync_qos[(i)] : (port)->ip_sync_qos[(i)])
-#define port_special_qos(port)  (IP_PREALLOC(port) ? (port)->ip_premsg->special_port_qos : (port)->ip_special_port_qos)
-
-#define set_port_sync_qos(port, i, value)               \
-MACRO_BEGIN                                             \
-if (IP_PREALLOC(port)) {                                \
-        (port)->ip_premsg->sync_qos[(i)] = (value);     \
-} else {                                                \
-        (port)->ip_sync_qos[(i)] = (value);             \
-}                                                       \
+#define ip_send_turnstile      kdata2.send_turnstile
+#define ip_dealloc_elm         kdata2.dealloc_elm
+
+#define port_send_turnstile(port)      (IP_PREALLOC(port) ? (port)->ip_premsg->ikm_turnstile : (port)->ip_send_turnstile)
+
+#define set_port_send_turnstile(port, value)                 \
+MACRO_BEGIN                                                  \
+if (IP_PREALLOC(port)) {                                     \
+        (port)->ip_premsg->ikm_turnstile = (value);          \
+} else {                                                     \
+        (port)->ip_send_turnstile = (value);                 \
+}                                                            \
 MACRO_END
 
-#define set_port_special_qos(port, value)               \
-MACRO_BEGIN                                             \
-if (IP_PREALLOC(port)) {                                \
-        (port)->ip_premsg->special_port_qos = (value);  \
-} else {                                                \
-        (port)->ip_special_port_qos = (value);          \
-}                                                       \
-MACRO_END
+#define port_send_turnstile_address(port)                    \
+(IP_PREALLOC(port) ? &((port)->ip_premsg->ikm_turnstile) : &((port)->ip_send_turnstile))
+
+#define port_rcv_turnstile_address(port) (NULL)
+
+
+/*
+ * SYNC IPC state flags for special reply port.
+ *
+ * PORT_SYNC_LINK_ANY
+ *    Special reply port is not linked to any other port
+ *    or WL and linkage should be allowed.
+ *
+ * PORT_SYNC_LINK_PORT
+ *    Special reply port is linked to the port and
+ *    ip_sync_inheritor_port contains the inheritor
+ *    port.
+ *
+ * PORT_SYNC_LINK_WORKLOOP_KNOTE
+ *    Special reply port is linked to a WL (via a knote).
+ *    ip_sync_inheritor_knote contains a pointer to the knote
+ *    the port is stashed on.
+ *
+ * PORT_SYNC_LINK_WORKLOOP_STASH
+ *    Special reply port is linked to a WL (via a knote stash).
+ *    ip_sync_inheritor_ts contains a pointer to the turnstile with a +1
+ *    the port is stashed on.
+ *
+ * PORT_SYNC_LINK_NO_LINKAGE
+ *    A message was sent to the special reply port; do
+ *    not allow any linkage until the receive is
+ *    complete.
+ */
+#define PORT_SYNC_LINK_ANY              (0)
+#define PORT_SYNC_LINK_PORT             (0x1)
+#define PORT_SYNC_LINK_WORKLOOP_KNOTE   (0x2)
+#define PORT_SYNC_LINK_WORKLOOP_STASH   (0x3)
+#define PORT_SYNC_LINK_NO_LINKAGE       (0x4)
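
ip_sync_link_state has to encode the five states above, so it needs 3 bits; that is why ip_impcount shrinks from 24 to 22 bits and the flag word keeps its overall size. A small standalone layout check in the same spirit (hypothetical field names, C11 static asserts):

    #include <assert.h>
    #include <stdint.h>

    #define SYNC_LINK_ANY            0
    #define SYNC_LINK_PORT           1
    #define SYNC_LINK_WORKLOOP_KNOTE 2
    #define SYNC_LINK_WORKLOOP_STASH 3
    #define SYNC_LINK_NO_LINKAGE     4

    struct port_bits {
        uint32_t other_flags:7,       /* unrelated flag bits, for illustration */
                 sync_link_state:3,   /* must hold the values 0..4: 3 bits     */
                 impcount:22;         /* shrinks so the word stays 32 bits     */
    };

    /* the largest state fits in 3 bits and the bit-field word stays one u32 */
    _Static_assert(SYNC_LINK_NO_LINKAGE < (1u << 3), "state fits in 3 bits");
    _Static_assert(sizeof(struct port_bits) == sizeof(uint32_t), "one word");

    int main(void)
    {
        struct port_bits b = { 0, SYNC_LINK_NO_LINKAGE, 0 };
        assert(b.sync_link_state == SYNC_LINK_NO_LINKAGE);
        return 0;
    }
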
 
 #define IP_NULL                        IPC_PORT_NULL
 #define IP_DEAD                        IPC_PORT_DEAD
@@ -224,10 +260,8 @@ MACRO_END
 #define        ip_release(port)        io_release(&(port)->ip_object)
 
 /* get an ipc_port pointer from an ipc_mqueue pointer */
-#define        ip_from_mq(mq)          ((struct ipc_port *)((void *)( \
-                                       (char *)(mq) - \
-                                       __offsetof(struct ipc_port, ip_messages)) \
-                               ))
+#define        ip_from_mq(mq) \
+               __container_of(mq, struct ipc_port, ip_messages)
 
 #define        ip_reference_mq(mq)     ip_reference(ip_from_mq(mq))
 #define        ip_release_mq(mq)       ip_release(ip_from_mq(mq))
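
ip_from_mq() now spells the member-to-container conversion with __container_of instead of open-coded __offsetof arithmetic; both recover the enclosing structure from a pointer to an embedded member. A standalone sketch of the idiom with a hypothetical container_of macro:

    #include <assert.h>
    #include <stddef.h>

    /* recover a pointer to the enclosing struct from a pointer to a member */
    #define container_of(ptr, type, member) \
        ((type *)(void *)((char *)(ptr) - offsetof(type, member)))

    struct mqueue { int messages; };

    struct port {
        int           refs;
        struct mqueue ip_messages;    /* embedded member */
    };

    int main(void)
    {
        struct port p = { 1, { 0 } };
        struct mqueue *mq = &p.ip_messages;

        /* the same recovery ip_from_mq() performs for struct ipc_port */
        struct port *back = container_of(mq, struct port, ip_messages);
        assert(back == &p);
        return 0;
    }
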
@@ -475,46 +509,60 @@ enum {
 };
 
 /* link the destination port with special reply port */
-kern_return_t
-ipc_port_link_special_reply_port_with_qos(
+void
+ipc_port_link_special_reply_port(
        ipc_port_t special_reply_port,
-       ipc_port_t dest_port,
-       int qos);
+       ipc_port_t dest_port);
+
+#define IPC_PORT_ADJUST_SR_NONE                      0
+#define IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY       0x1
+#define IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE        0x2
+#define IPC_PORT_ADJUST_SR_LINK_WORKLOOP             0x4
+
+#define IPC_PORT_ADJUST_SR_RECEIVED_MSG                     0x8
+#define IPC_PORT_ADJUST_SR_ENABLE_EVENT                     0x10
+
+void
+reset_ip_srp_bits(ipc_port_t special_reply_port);
+
+void
+reset_ip_srp_msg_sent(ipc_port_t special_reply_port);
+
+void
+set_ip_srp_msg_sent(ipc_port_t special_reply_port);
+
+void
+set_ip_srp_lost_link(ipc_port_t special_reply_port);
 
-/* link the destination port with locked special reply port */
-void ipc_port_unlink_special_reply_port_locked(
+/* Adjust special reply port linkage */
+void ipc_port_adjust_special_reply_port_locked(
        ipc_port_t special_reply_port,
        struct knote *kn,
-       uint8_t flags);
+       uint8_t flags,
+       boolean_t get_turnstile);
 
-/* Unlink the destination port from special reply port */
+/* Adjust special reply port linkage */
 void
-ipc_port_unlink_special_reply_port(
+ipc_port_adjust_special_reply_port(
        ipc_port_t special_reply_port,
-       uint8_t flags);
-
-#define IPC_PORT_UNLINK_SR_NONE                      0
-#define IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY       0x1
-#define IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE    0x2
+       uint8_t flags,
+       boolean_t get_turnstile);
 
-/* Get the max sync qos override index applied to the port */
-sync_qos_count_t
-ipc_port_get_max_sync_qos_index(
-       ipc_port_t      port);
+turnstile_inheritor_t
+ipc_port_get_special_reply_port_inheritor(
+       ipc_port_t special_reply_port);
 
-/* Apply qos delta to the port */
-boolean_t
-ipc_port_sync_qos_delta(
-       ipc_port_t        port,
-       sync_qos_count_t *sync_qos_delta_add,
-       sync_qos_count_t *sync_qos_delta_sub);
+void
+ipc_port_send_turnstile_prepare(ipc_port_t port);
 
-/* Adjust the sync qos of the port and it's destination port */
 void
-ipc_port_adjust_sync_qos(
-       ipc_port_t port,
-       sync_qos_count_t *sync_qos_delta_add,
-       sync_qos_count_t *sync_qos_delta_sub);
+ipc_port_send_turnstile_complete(ipc_port_t port);
+
+struct waitq *
+ipc_port_rcv_turnstile_waitq(struct waitq *waitq);
+
+struct turnstile *
+ipc_port_rcv_turnstile(ipc_port_t port);
 
 /* apply importance delta to port only */
 extern mach_port_delta_t
@@ -561,6 +609,12 @@ extern mach_port_name_t ipc_port_copyout_send(
        ipc_port_t      sright,
        ipc_space_t     space);
 
+/* Copyout a naked send right to given name */
+extern mach_port_name_t ipc_port_copyout_name_send(
+       ipc_port_t      sright,
+       ipc_space_t     space,
+       mach_port_name_t name);
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 #if KERNEL_PRIVATE
@@ -617,6 +671,9 @@ extern void ipc_port_track_dealloc(
 extern void ipc_port_debug_init(void);
 #endif /* MACH_ASSERT */
 
+extern struct turnstile *ipc_port_get_inheritor(
+       ipc_port_t port);
+
 #define        ipc_port_alloc_kernel()         \
                ipc_port_alloc_special(ipc_space_kernel)
 #define        ipc_port_dealloc_kernel(port)   \
index fe989b283d21577f2a0e2401008f2a8d85d922f0..8a8e129792e6becb4f7185eee9e12290d9540554 100644 (file)
@@ -101,24 +101,18 @@ ipc_pset_alloc(
        ipc_pset_t pset;
        mach_port_name_t name;
        kern_return_t kr;
-       uint64_t reserved_link;
-
-       reserved_link = waitq_link_reserve(NULL);
 
        kr = ipc_object_alloc(space, IOT_PORT_SET,
                              MACH_PORT_TYPE_PORT_SET, 0,
                              &name, (ipc_object_t *) &pset);
        if (kr != KERN_SUCCESS) {
-               waitq_link_release(reserved_link);
                return kr;
        }
        /* pset and space are locked */
 
-       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link);
+       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
        is_write_unlock(space);
 
-       waitq_link_release(reserved_link);
-
        *namep = name;
        *psetp = pset;
        return KERN_SUCCESS;
@@ -146,23 +140,16 @@ ipc_pset_alloc_name(
 {
        ipc_pset_t pset;
        kern_return_t kr;
-       uint64_t reserved_link;
-
-
-       reserved_link = waitq_link_reserve(NULL);
 
        kr = ipc_object_alloc_name(space, IOT_PORT_SET,
                                   MACH_PORT_TYPE_PORT_SET, 0,
                                   name, (ipc_object_t *) &pset);
        if (kr != KERN_SUCCESS) {
-               waitq_link_release(reserved_link);
                return kr;
        }
        /* pset is locked */
 
-       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link);
-
-       waitq_link_release(reserved_link);
+       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
 
        *psetp = pset;
        return KERN_SUCCESS;
@@ -183,17 +170,13 @@ ipc_pset_alloc_special(
        __assert_only ipc_space_t space)
 {
        ipc_pset_t pset;
-       uint64_t reserved_link;
 
        assert(space != IS_NULL);
        assert(space->is_table == IE_NULL);
        assert(!is_active(space));
 
-       reserved_link = waitq_link_reserve(NULL);
-
        __IGNORE_WCASTALIGN(pset = (ipc_pset_t)io_alloc(IOT_PORT_SET));
        if (pset == IPS_NULL) {
-               waitq_link_release(reserved_link);
                return IPS_NULL;
        }
 
@@ -203,9 +186,7 @@ ipc_pset_alloc_special(
        pset->ips_references = 1;
        pset->ips_object.io_bits = io_makebits(TRUE, IOT_PORT_SET, 0);
 
-       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */, &reserved_link);
-
-       waitq_link_release(reserved_link);
+       ipc_mqueue_init(&pset->ips_messages, TRUE /* set */);
 
        return pset;
 }
@@ -250,7 +231,7 @@ ipc_pset_add(
 
        assert(ips_active(pset));
        assert(ip_active(port));
-       
+
        kr = ipc_mqueue_add(&port->ip_messages, &pset->ips_messages,
                            reserved_link, reserved_prepost);
 
@@ -286,6 +267,55 @@ ipc_pset_remove(
        return kr;
 }
 
+/*
+ *     Routine:        ipc_pset_lazy_allocate
+ *     Purpose:
+ *             lazily initialize the wqset of a port set.
+ *     Conditions:
+ *             Nothing locked.
+ */
+
+kern_return_t
+ipc_pset_lazy_allocate(
+       ipc_space_t space,
+       mach_port_name_t psname)
+{
+       kern_return_t kr;
+       ipc_entry_t entry;
+       ipc_object_t psobj;
+       ipc_pset_t pset;
+
+       kr = ipc_right_lookup_read(space, psname, &entry);
+       if (kr != KERN_SUCCESS)
+               return kr;
+
+       /* space is read-locked and active */
+       if ((entry->ie_bits & MACH_PORT_TYPE_PORT_SET) == 0) {
+               is_read_unlock(space);
+               kr = KERN_INVALID_RIGHT;
+               return kr;
+       }
+
+       psobj = entry->ie_object;
+       __IGNORE_WCASTALIGN(pset = (ipc_pset_t) psobj);
+       assert(pset != NULL);
+       ipc_mqueue_t set_mqueue = &pset->ips_messages;
+       struct waitq_set *wqset =  &set_mqueue->imq_set_queue;
+
+       io_reference(psobj);
+       is_read_unlock(space);
+
+       /*
+        * lazily initialize the wqset to avoid
+        * possible allocation while linking
+        * under spinlocks.
+        */
+       waitq_set_lazy_init_link(wqset);
+       io_release(psobj);
+
+       return KERN_SUCCESS;
+}
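
The inline comment above captures the point of this routine: get any allocation out of the way before the waitq linkage happens under spinlocks. A minimal userspace analogue of that discipline, with entirely made-up names (a pthread mutex standing in for the kernel's waitq spinlock):

#include <pthread.h>
#include <stdlib.h>

struct member {
        struct member  *next;
        int             payload;
};

struct member_set {
        pthread_mutex_t lock;   /* stand-in for the waitq spinlock */
        struct member  *head;
};

static int
set_add_member(struct member_set *set, int payload)
{
        /* allocate while no lock is held: this step may block or fail */
        struct member *m = malloc(sizeof(*m));
        if (m == NULL)
                return -1;
        m->payload = payload;

        /* the critical section only links; it never calls the allocator */
        pthread_mutex_lock(&set->lock);
        m->next = set->head;
        set->head = m;
        pthread_mutex_unlock(&set->lock);
        return 0;
}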
+
 /*
  *     Routine:        ipc_pset_remove_from_all
  *     Purpose:
@@ -347,26 +377,217 @@ ipc_pset_destroy(
        ips_release(pset);       /* consume the ref our caller gave us */
 }
 
-/* Kqueue EVFILT_MACHPORT support */
+/*
+ * Kqueue EVFILT_MACHPORT support
+ *
+ * - kn_ptr.p_mqueue points to the monitored mqueue
+ *
+ * - (in/out) ext[0] holds a mach_vm_address_t to a userspace buffer
+ *   that can be used to direct-deliver messages when
+ *   MACH_RCV_MSG is set in kn_sfflags
+ *
+ * - (in/out) ext[1] holds a mach_msg_size_t representing the size
+ *   of the userspace buffer held in ext[0].
+ *
+ * - (out)    ext[2] is used to deliver qos information
+ *   about the send queue to userspace.
+ *
+ * - (abused) ext[3] is used in kernel to hold a reference to the first port
+ *   with a turnstile that participates in sync IPC override.
+ *
+ * - kn_hook is optionally a "knote" turnstile. It is used as the inheritor
+ *   of turnstiles for rights copied out as part of direct message delivery
+ *   when they can participate in sync IPC override.
+ *
+ *   It is used to atomically neuter the sync IPC override when the knote is
+ *   re-enabled.
+ *
+ */
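
A hedged userspace sketch of how this layout is driven (assuming kevent64()/EV_SET64 as declared in <sys/event.h>; the function and buffer names are made up): fflags carries the MACH_RCV_* options, ext[0]/ext[1] describe the receive buffer, and ext[2] comes back holding the QoS/override pair on delivery.

#include <sys/event.h>
#include <mach/mach.h>
#include <stdint.h>
#include <unistd.h>

int
watch_port_once(mach_port_t rcv_port, void *buf, mach_msg_size_t buf_size)
{
        int kq = kqueue();
        if (kq < 0)
                return -1;

        struct kevent64_s kev;
        EV_SET64(&kev, rcv_port, EVFILT_MACHPORT, EV_ADD | EV_ENABLE,
            MACH_RCV_MSG,                       /* fflags: ask for direct delivery */
            0, 0,
            (uint64_t)(uintptr_t)buf,           /* ext[0]: receive buffer address */
            buf_size);                          /* ext[1]: receive buffer size */

        /* register the knote and wait for one message delivery;
         * on return kev.ext[2] carries the (qos << 32) | override pair */
        int nev = kevent64(kq, &kev, 1, &kev, 1, 0, NULL);
        close(kq);
        return nev;
}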
 
 #include <sys/event.h>
 #include <sys/errno.h>
 
-static int      filt_machportattach(struct knote *kn, struct kevent_internal_s *kev);
-static void    filt_machportdetach(struct knote *kn);
-static int     filt_machport(struct knote *kn, long hint);
-static int     filt_machporttouch(struct knote *kn, struct kevent_internal_s *kev);
-static int     filt_machportprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
-static unsigned filt_machportpeek(struct knote *kn);
-SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
-       .f_adjusts_qos = 1,
-       .f_attach = filt_machportattach,
-       .f_detach = filt_machportdetach,
-       .f_event = filt_machport,
-       .f_touch = filt_machporttouch,
-       .f_process = filt_machportprocess,
-       .f_peek = filt_machportpeek,
-};
+static int
+filt_machport_adjust_qos(struct knote *kn, ipc_kmsg_t first)
+{
+       if (kn->kn_sfflags & MACH_RCV_MSG) {
+               int qos = _pthread_priority_thread_qos(first->ikm_qos_override);
+               return FILTER_ADJUST_EVENT_QOS(qos);
+       }
+       return 0;
+}
+
+struct turnstile *
+filt_machport_kqueue_turnstile(struct knote *kn)
+{
+       if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
+               return kqueue_turnstile(knote_get_kq(kn));
+       }
+       return TURNSTILE_NULL;
+}
+
+/*
+ * Stashes a port that participates in sync IPC override until the knote
+ * is re-enabled.
+ *
+ * It returns:
+ * - the turnstile to use as an inheritor for the stashed port
+ * - the kind of stash that happened, as a PORT_SYNC_LINK_* value among:
+ *   o not stashed (no sync IPC support)
+ *   o stashed in the knote (in kn_ext[3])
+ *   o to be hooked to the kn_hook knote
+ */
+struct turnstile *
+filt_machport_stash_port(struct knote *kn, ipc_port_t port, int *link)
+{
+       struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
+
+       if (!ts) {
+               if (link) *link = PORT_SYNC_LINK_NO_LINKAGE;
+       } else if (kn->kn_ext[3] == 0) {
+               ip_reference(port);
+               kn->kn_ext[3] = (uintptr_t)port;
+               if (link) *link = PORT_SYNC_LINK_WORKLOOP_KNOTE;
+       } else {
+               ts = (struct turnstile *)kn->kn_hook;
+               if (link) *link = PORT_SYNC_LINK_WORKLOOP_STASH;
+       }
+
+       return ts;
+}
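
A hedged sketch of how a copyout path might consume the stash result; only filt_machport_stash_port() and the PORT_SYNC_LINK_* values come from this change, everything else below is hypothetical:

/* Hypothetical caller, for illustration only. */
static void
example_stash_copied_out_port(struct knote *kn, ipc_port_t port)
{
        int link_kind = 0;
        struct turnstile *inheritor;

        inheritor = filt_machport_stash_port(kn, port, &link_kind);

        switch (link_kind) {
        case PORT_SYNC_LINK_NO_LINKAGE:
                /* knote does not support sync IPC override: nothing stashed */
                break;
        case PORT_SYNC_LINK_WORKLOOP_KNOTE:
                /* a port reference now lives in kn->kn_ext[3]; the inheritor
                 * is the workloop (kqueue) turnstile */
                break;
        case PORT_SYNC_LINK_WORKLOOP_STASH:
                /* kn_ext[3] was already taken; the inheritor is the "knote"
                 * turnstile prepared by filt_machport_turnstile_prepare_lazily() */
                break;
        }
        /* `inheritor` would then be pushed on the copied-out right's turnstile */
        (void)inheritor;
}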
+
+struct turnstile *
+filt_machport_stashed_special_reply_port_turnstile(ipc_port_t port)
+{
+       struct knote *kn = port->ip_sync_inheritor_knote;
+
+       assert(port->ip_specialreply);
+       assert(port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE);
+       if (kn->kn_ext[3] == (uint64_t)port) {
+               return kqueue_turnstile(knote_get_kq(kn));
+       }
+       return kn->kn_hook;
+}
+
+/*
+ * Lazily prepare a turnstile so that filt_machport_stash_port()
+ * can be called with the mqueue lock held.
+ *
+ * It will allocate a turnstile in kn_hook if:
+ * - the knote supports sync IPC override,
+ * - we already stashed a port in kn_ext[3],
+ * - the object that will be copied out has a chance to ask to be stashed.
+ *
+ * It is set up so that its inheritor is the workloop turnstile that was
+ * allocated when this knote was attached.
+ */
+void
+filt_machport_turnstile_prepare_lazily(
+               struct knote *kn,
+               mach_msg_type_name_t msgt_name,
+               ipc_port_t port)
+{
+       /* This is called from within filt_machportprocess */
+       assert((kn->kn_status & KN_SUPPRESSED) && (kn->kn_status & KN_LOCKED));
+
+       struct turnstile *ts = filt_machport_kqueue_turnstile(kn);
+       if (ts == TURNSTILE_NULL || kn->kn_ext[3] == 0 || kn->kn_hook)
+               return;
+
+       if ((msgt_name == MACH_MSG_TYPE_PORT_SEND_ONCE && port->ip_specialreply) ||
+                       (msgt_name == MACH_MSG_TYPE_PORT_RECEIVE)) {
+               struct turnstile *kn_ts = turnstile_alloc();
+               kn_ts = turnstile_prepare((uintptr_t)kn,
+                               (struct turnstile **)&kn->kn_hook, kn_ts, TURNSTILE_KNOTE);
+               turnstile_update_inheritor(kn_ts, ts,
+                               TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE);
+               turnstile_cleanup();
+       }
+}
+
+/*
+ * Other half of filt_machport_turnstile_prepare_lazily()
+ *
+ * This is serialized by the knote state machine.
+ */
+static void
+filt_machport_turnstile_complete(struct knote *kn)
+{
+       struct turnstile *ts = TURNSTILE_NULL;
+
+       if (kn->kn_ext[3]) {
+               ipc_port_t port = (ipc_port_t)kn->kn_ext[3];
+               ipc_mqueue_t mqueue = &port->ip_messages;
+
+               ip_lock(port);
+               if (port->ip_specialreply) {
+                       /*
+                        * If the reply has been sent to the special reply port already,
+                        * then the special reply port may already be reused to do something
+                        * entirely different.
+                        *
+                        * However, the only reason for it to still point to this knote is
+                        * that it's still waiting for a reply, so when this is the case,
+                        * neuter the linkage.
+                        */
+                       if (port->ip_sync_link_state == PORT_SYNC_LINK_WORKLOOP_KNOTE &&
+                                       port->ip_sync_inheritor_knote == kn) {
+                               ipc_port_adjust_special_reply_port_locked(port, NULL,
+                                               (IPC_PORT_ADJUST_SR_NONE | IPC_PORT_ADJUST_SR_ENABLE_EVENT), FALSE);
+                       } else {
+                               ip_unlock(port);
+                       }
+               } else {
+                       struct turnstile *kq_ts = kqueue_turnstile(knote_get_kq(kn));
+
+                       /*
+                        * For receive rights, if their IMQ_INHERITOR() is still this
+                        * workloop, then sever the link.
+                        *
+                        * It has a theoretical hole: if the port is sent again to a new
+                        * receive right that is also monitored by the same kqueue,
+                        * we would sever the link incorrectly.
+                        *
+                        * However this would be a REALLY cumbersome thing to do.
+                        */
+                       imq_lock(mqueue);
+                       if (!IMQ_KLIST_VALID(mqueue) && IMQ_INHERITOR(mqueue) == kq_ts) {
+                               turnstile_deallocate_safe(kq_ts);
+                               klist_init(&mqueue->imq_klist);
+                               ts = port_send_turnstile(port);
+                       }
+                       if (ts) {
+                               turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
+                                               TURNSTILE_IMMEDIATE_UPDATE);
+                               turnstile_reference(ts);
+                       }
+                       imq_unlock(mqueue);
+                       ip_unlock(port);
+
+                       if (ts) {
+                               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+                               turnstile_deallocate(ts);
+                       }
+               }
+
+               ip_release(port);
+               kn->kn_ext[3] = 0;
+       }
+
+       if (kn->kn_hook) {
+               ts = kn->kn_hook;
+
+               turnstile_update_inheritor(ts, TURNSTILE_INHERITOR_NULL,
+                               TURNSTILE_IMMEDIATE_UPDATE);
+               turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_HELD);
+
+               turnstile_complete((uintptr_t)kn, (struct turnstile **)&kn->kn_hook, &ts);
+               turnstile_cleanup();
+
+               assert(ts);
+               turnstile_deallocate(ts);
+       }
+}
 
 static int
 filt_machportattach(
@@ -377,6 +598,8 @@ filt_machportattach(
        uint64_t wq_link_id = waitq_link_reserve(NULL);
        ipc_space_t space = current_space();
        ipc_kmsg_t first;
+       struct turnstile *turnstile = TURNSTILE_NULL;
+       struct turnstile *send_turnstile = TURNSTILE_NULL;
 
        int error;
        int result = 0;
@@ -384,13 +607,48 @@ filt_machportattach(
        ipc_entry_t entry;
        ipc_mqueue_t mqueue;
 
+       kn->kn_flags &= ~EV_EOF;
+       kn->kn_ext[3] = 0;
+
+       if ((kn->kn_sfflags & MACH_RCV_MSG) && (kn->kn_status & KN_DISPATCH)) {
+               /*
+                * If the filter is likely to support sync IPC override,
+                * and it happens to be attaching to a workloop,
+                * make sure the workloop has an allocated turnstile.
+                */
+               turnstile = kqueue_alloc_turnstile(knote_get_kq(kn));
+       }
+
        kr = ipc_right_lookup_read(space, name, &entry);
+
+check_lookup:
        if (kr == KERN_SUCCESS) {
                /* space is read-locked and active */
 
                if (entry->ie_bits & MACH_PORT_TYPE_PORT_SET) {
                        ipc_pset_t pset;
 
+                       if (knote_link_waitqset_should_lazy_alloc(kn)) {
+                               is_read_unlock(space);
+
+                               /*
+                                * We need to link the portset of the kn,
                                 * to ensure that the link is allocated before taking
+                                * any spinlocks.
+                                */
+                               knote_link_waitqset_lazy_alloc(kn);
+
+                               /*
+                                * We had to drop the space lock because knote_link_waitqset_lazy_alloc()
+                                * could have allocated memory. The ipc_right_lookup_read()
+                                * function returns with the space locked, so we need to revalidate state.
+                                */
+                               kr = ipc_right_lookup_read(space, name, &entry);
+                               if (!(kr == KERN_SUCCESS) || !(entry->ie_bits & MACH_PORT_TYPE_PORT_SET)) {
+                                       goto check_lookup;
+                               }
+                       }
+
                        __IGNORE_WCASTALIGN(pset = (ipc_pset_t)entry->ie_object);
                        mqueue = &pset->ips_messages;
                        ips_reference(pset);
@@ -407,11 +665,10 @@ filt_machportattach(
                         */
                        error = knote_link_waitq(kn, &mqueue->imq_wait_queue, &wq_link_id);
                        if (!error) {
+                               assert(IMQ_KLIST_VALID(mqueue));
                                KNOTE_ATTACH(&mqueue->imq_klist, kn);
                                imq_unlock(mqueue);
-
-                       }
-                       else {
+                       } else {
                                kn->kn_ptr.p_mqueue = IMQ_NULL;
                                imq_unlock(mqueue);
                                ips_release(pset);
@@ -440,18 +697,37 @@ filt_machportattach(
                         * first message in the queue.
                         */
                        imq_lock(mqueue);
-                       kn->kn_ptr.p_mqueue = mqueue; 
+                       kn->kn_ptr.p_mqueue = mqueue;
+                       if (!IMQ_KLIST_VALID(mqueue)) {
+                               /*
                                 * We're attaching a port that used to have an IMQ_INHERITOR:
+                                * clobber this state, and set the inheritor of its turnstile
+                                * to the kqueue it's now attached to.
+                                */
+                               turnstile_deallocate_safe(IMQ_INHERITOR(mqueue));
+                               klist_init(&mqueue->imq_klist);
+                       }
                        KNOTE_ATTACH(&mqueue->imq_klist, kn);
+
+                       /* Update the port's turnstile inheritor */
+                       send_turnstile = port_send_turnstile(port);
+                       if (send_turnstile) {
+                               turnstile_reference(send_turnstile);
+                               turnstile_update_inheritor(send_turnstile, turnstile,
+                                       (TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE));
+                       }
+
                        if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
-                               int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port);
-                               if (kn->kn_sfflags & MACH_RCV_MSG)
-                                       knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override,
-                                               sync_qos_override_index);
-                               result = 1;
+                               result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
                        }
                        imq_unlock(mqueue);
-
                        is_read_unlock(space);
+                       if (send_turnstile) {
+                               turnstile_update_inheritor_complete(send_turnstile,
+                                               TURNSTILE_INTERLOCK_NOT_HELD);
+                               turnstile_deallocate(send_turnstile);
+                       }
+
                        error = 0;
                } else {
                        is_read_unlock(space);
@@ -465,8 +741,7 @@ filt_machportattach(
 
        /* bail out on errors */
        if (error) {
-               kn->kn_flags |= EV_ERROR;
-               kn->kn_data = error;
+               knote_set_error(kn, error);
                return 0;
        }
 
@@ -485,24 +760,54 @@ filt_machportdetach(
 {
        ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
        ipc_object_t object = mqueue_to_object(mqueue);
+       struct turnstile *send_turnstile = TURNSTILE_NULL;
+
+       filt_machport_turnstile_complete(kn);
 
        imq_lock(mqueue);
-       KNOTE_DETACH(&mqueue->imq_klist, kn);
+       if ((kn->kn_status & KN_VANISHED) || (kn->kn_flags & EV_EOF)) {
+               /*
                 * ipc_mqueue_changed() already unhooked this knote from the mqueue.
+                */
+       } else {
+               assert(IMQ_KLIST_VALID(mqueue));
+               KNOTE_DETACH(&mqueue->imq_klist, kn);
+       }
+
+       if (io_otype(object) == IOT_PORT) {
+               ipc_port_t port = ip_from_mq(mqueue);
+
+               send_turnstile = port_send_turnstile(port);
+               if (send_turnstile) {
+                       turnstile_reference(send_turnstile);
+                       turnstile_update_inheritor(send_turnstile,
+                               ipc_port_get_inheritor(port),
+                               TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_IMMEDIATE_UPDATE);
+               }
+       }
+
+       /* Clear the knote pointer once the knote has been removed from turnstile */
        kn->kn_ptr.p_mqueue = IMQ_NULL;
        imq_unlock(mqueue);
 
+       if (send_turnstile) {
+               turnstile_update_inheritor_complete(send_turnstile,
+                               TURNSTILE_INTERLOCK_NOT_HELD);
+               turnstile_deallocate(send_turnstile);
+       }
+
        if (io_otype(object) == IOT_PORT_SET) {
                /*
                 * Unlink the portset wait queue from knote/kqueue.
-                * JMM - Does this need to be atomic under the mq lock?
+                * JMM - Does this need to be atomic under the mq lock?
                 */
                (void)knote_unlink_waitq(kn, &mqueue->imq_wait_queue);
-       } 
+       }
        io_release(object);
 }
 
 /*
- * filt_machport - deliver events into the mach port filter
+ * filt_machportevent - deliver events into the mach port filter
  *
  * Mach port message arrival events are currently only posted via the
  * kqueue filter routine for ports. Port sets are marked stay-active
@@ -524,7 +829,7 @@ filt_machportdetach(
  * avoiding a conflict).
  */
 static int
-filt_machport(
+filt_machportevent(
        struct knote *kn,
        long hint)
 {
@@ -537,17 +842,11 @@ filt_machport(
 
        if (hint == NOTE_REVOKE) {
                kn->kn_flags |= EV_EOF | EV_ONESHOT;
-               result = 1;
+               result = FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
        } else if (imq_is_valid(mqueue)) {
                assert(!imq_is_set(mqueue));
                if ((first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
-                       ipc_port_t port = ip_from_mq(mqueue);
-                       int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port);
-
-                       if (kn->kn_sfflags & MACH_RCV_MSG)
-                               knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override,
-                                       sync_qos_override_index);
-                       result = 1;
+                       result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
                }
        }
 
@@ -556,21 +855,25 @@ filt_machport(
 
 static int
 filt_machporttouch(
-       struct knote *kn, 
+       struct knote *kn,
        struct kevent_internal_s *kev)
 {
        ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
        ipc_kmsg_t first;
        int result = 0;
 
-       imq_lock(mqueue);
-
        /* copy in new settings and save off new input fflags */
        kn->kn_sfflags = kev->fflags;
        kn->kn_ext[0] = kev->ext[0];
        kn->kn_ext[1] = kev->ext[1];
-       if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
-               kn->kn_udata = kev->udata;
+
+       if (kev->flags & EV_ENABLE) {
+               /*
                 * If the knote is being enabled, make sure there are no lingering
+                * IPC overrides from the previous message delivery.
+                */
+               filt_machport_turnstile_complete(kn);
+       }
 
        /*
         * If the mqueue is a valid port and there is a message
@@ -579,20 +882,12 @@ filt_machporttouch(
         * the event. If there are no more messages, reset the
         * QoS to the value provided by the kevent.
         */
+       imq_lock(mqueue);
        if (imq_is_valid(mqueue) && !imq_is_set(mqueue) &&
            (first = ipc_kmsg_queue_first(&mqueue->imq_messages)) != IKM_NULL) {
-               ipc_port_t port = ip_from_mq(mqueue);
-               int sync_qos_override_index = ipc_port_get_max_sync_qos_index(port);
-
-               if (kn->kn_sfflags & MACH_RCV_MSG)
-                       knote_adjust_qos(kn, first->ikm_qos, first->ikm_qos_override,
-                               sync_qos_override_index);
-               result = 1;
+               result = FILTER_ACTIVE | filt_machport_adjust_qos(kn, first);
        } else if (kn->kn_sfflags & MACH_RCV_MSG) {
-               knote_adjust_qos(kn,
-                                MACH_MSG_PRIORITY_UNSPECIFIED,
-                                MACH_MSG_PRIORITY_UNSPECIFIED,
-                                THREAD_QOS_UNSPECIFIED);
+               result = FILTER_RESET_EVENT_QOS;
        }
        imq_unlock(mqueue);
 
@@ -615,16 +910,14 @@ filt_machportprocess(
        mach_vm_address_t addr;
        mach_msg_size_t size;
 
-       imq_lock(mqueue);
-
        /* Capture current state */
        *kev = kn->kn_kevent;
+       kev->ext[3] = 0; /* hide our port reference from userspace */
 
        /* If already deallocated/moved return one last EOF event */
        if (kev->flags & EV_EOF) {
-               imq_unlock(mqueue);
-               return 1;
-        }
+               return FILTER_ACTIVE | FILTER_RESET_EVENT_QOS;
+       }
 
        /*
         * Only honor supported receive options. If no options are
@@ -658,6 +951,8 @@ filt_machportprocess(
                size = 0;
        }
 
+       imq_lock(mqueue);
+
        /* just use the reference from here on out */
        io_reference(object);
 
@@ -693,6 +988,7 @@ filt_machportprocess(
         * reference on the ipc_object and return zero.
         */
        if (wresult == THREAD_RESTART || self->ith_state == MACH_RCV_TIMED_OUT) {
+               assert(self->turnstile != TURNSTILE_NULL);
                io_release(object);
                return 0;
        }
@@ -710,7 +1006,7 @@ filt_machportprocess(
                assert(self->ith_kmsg == IKM_NULL);
                kev->data = self->ith_receiver_name;
                io_release(object);
-               return 1;
+               return FILTER_ACTIVE;
        }
 
        /*
@@ -750,26 +1046,24 @@ filt_machportprocess(
                        process_data->fp_data_out += size;
                } else {
                        assert(option & MACH_RCV_STACK);
-                       kev->ext[0] = process_data->fp_data_out + 
+                       kev->ext[0] = process_data->fp_data_out +
                                      process_data->fp_data_resid;
                }
        }
 
        /*
         * Apply message-based QoS values to output kevent as prescribed.
-        * The kev->qos field gets max(msg-qos, kn->kn_qos).
         * The kev->ext[2] field gets (msg-qos << 32) | (override-qos).
         *
         * The mach_msg_receive_results() call saved off the message
         * QoS values in the continuation save area on successful receive.
         */
        if (kev->fflags == MACH_MSG_SUCCESS) {
-               kev->qos = mach_msg_priority_combine(self->ith_qos, kn->kn_qos);
-               kev->ext[2] = ((uint64_t)self->ith_qos << 32) | 
-                              (uint64_t)self->ith_qos_override;
+               kev->ext[2] = ((uint64_t)self->ith_qos << 32) |
+                               (uint64_t)self->ith_qos_override;
        }
 
-       return 1;
+       return FILTER_ACTIVE;
 }
 
 /*
@@ -785,10 +1079,21 @@ filt_machportprocess(
  * will catch changes in this status when the event gets posted
  * up to the knote's kqueue).
  */
-static unsigned
+static int
 filt_machportpeek(struct knote *kn)
 {
        ipc_mqueue_t mqueue = kn->kn_ptr.p_mqueue;
 
-       return (ipc_mqueue_set_peek(mqueue));
+       return ipc_mqueue_set_peek(mqueue) ? FILTER_ACTIVE : 0;
 }
+
+SECURITY_READ_ONLY_EARLY(struct filterops) machport_filtops = {
+       .f_adjusts_qos = true,
+       .f_extended_codes = true,
+       .f_attach = filt_machportattach,
+       .f_detach = filt_machportdetach,
+       .f_event = filt_machportevent,
+       .f_touch = filt_machporttouch,
+       .f_process = filt_machportprocess,
+       .f_peek = filt_machportpeek,
+};
index a18b9adcdfcade98441e3089db69022715d75216..42008febf8dbb904796ddad3aa857c442df786cf 100644 (file)
@@ -94,10 +94,8 @@ struct ipc_pset {
 #define        ips_release(pset)       io_release(&(pset)->ips_object)
 
 /* get an ipc_pset pointer from an ipc_mqueue pointer */
-#define        ips_from_mq(mq)         ((struct ipc_pset *)((void *)( \
-                                       (char *)(mq) - \
-                                       __offsetof(struct ipc_pset, ips_messages)) \
-                               ))
+#define        ips_from_mq(mq) \
+               __container_of(mq, struct ipc_pset, ips_messages)
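
The rewritten ips_from_mq() relies on the classic container_of idiom. A standalone illustration with stand-in types (xnu's actual __container_of may differ in detail):

#include <stddef.h>

/* Illustrative stand-ins only. */
struct fake_mqueue { int dummy; };

struct fake_pset {
        int                     ips_refs;
        struct fake_mqueue      ips_messages;
};

/* The classic container_of idiom: step back from a member pointer to the
 * enclosing structure. */
#define container_of_demo(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static struct fake_pset *
fake_ips_from_mq(struct fake_mqueue *mq)
{
        return container_of_demo(mq, struct fake_pset, ips_messages);
}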
 
 /* Allocate a port set */
 extern kern_return_t ipc_pset_alloc(
@@ -132,6 +130,11 @@ extern kern_return_t ipc_pset_remove(
        ipc_pset_t      pset,
        ipc_port_t      port);
 
+/* lazily initialize the wqset of a port set */
+extern kern_return_t ipc_pset_lazy_allocate(
+       ipc_space_t      space,
+       mach_port_name_t psname);
+
 /* Remove a port from all its current port sets */
 extern kern_return_t ipc_pset_remove_from_all(
        ipc_port_t      port);
@@ -140,4 +143,22 @@ extern kern_return_t ipc_pset_remove_from_all(
 extern void ipc_pset_destroy(
        ipc_pset_t      pset);
 
+#if MACH_KERNEL_PRIVATE
+extern struct turnstile *filt_machport_kqueue_turnstile(
+       struct knote *kn);
+
+extern struct turnstile *filt_machport_stashed_special_reply_port_turnstile(
+       ipc_port_t port);
+
+extern void filt_machport_turnstile_prepare_lazily(
+       struct knote *kn,
+       mach_msg_type_name_t    msgt_name,
+       ipc_port_t port);
+
+extern struct turnstile *filt_machport_stash_port(
+       struct knote *kn,
+       ipc_port_t port,
+       int *link);
+#endif
+
 #endif /* _IPC_IPC_PSET_H_ */
index 04043b3ea92f2ed4f2a82b36cb4be77f77487186..d1925e69cfb78575042c7a30816f77102ab370f7 100644 (file)
 #include <ipc/ipc_importance.h>
 #include <security/mac_mach_internal.h>
 
-/* Allow IPC to generate mach port guard exceptions */
-extern kern_return_t
-mach_port_guard_exception(
-       mach_port_name_t        name,
-       uint64_t                inguard,
-       uint64_t                portguard,
-       unsigned                reason);
 /*
  *     Routine:        ipc_right_lookup_write
  *     Purpose:
@@ -170,10 +163,12 @@ ipc_right_lookup_two_write(
 
        if ((entry1 = ipc_entry_lookup(space, name1)) == IE_NULL) {
                is_write_unlock(space);
+               mach_port_guard_exception(name1, 0, 0, kGUARD_EXC_INVALID_NAME);
                return KERN_INVALID_NAME;
        }
        if ((entry2 = ipc_entry_lookup(space, name2)) == IE_NULL) {
                is_write_unlock(space);
+               mach_port_guard_exception(name2, 0, 0, kGUARD_EXC_INVALID_NAME);
                return KERN_INVALID_NAME;
        }
        *entryp1 = entry1;
@@ -1042,6 +1037,7 @@ ipc_right_dealloc(
 
            default:
                is_write_unlock(space);
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                return KERN_INVALID_RIGHT;
        }
 
@@ -1075,7 +1071,6 @@ ipc_right_delta(
 
        bits = entry->ie_bits;
 
-
 /*
  *     The following is used (for case MACH_PORT_RIGHT_DEAD_NAME) in the
  *     switch below. It is used to keep track of those cases (in DIPC)
@@ -1093,8 +1088,10 @@ ipc_right_delta(
            case MACH_PORT_RIGHT_PORT_SET: {
                ipc_pset_t pset;
 
-               if ((bits & MACH_PORT_TYPE_PORT_SET) == 0)
+               if ((bits & MACH_PORT_TYPE_PORT_SET) == 0) {
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        goto invalid_right;
+               }
 
                assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_PORT_SET);
                assert(IE_BITS_UREFS(bits) == 0);
@@ -1123,8 +1120,10 @@ ipc_right_delta(
            case MACH_PORT_RIGHT_RECEIVE: {
                ipc_port_t request = IP_NULL;
 
-               if ((bits & MACH_PORT_TYPE_RECEIVE) == 0)
+               if ((bits & MACH_PORT_TYPE_RECEIVE) == 0) {
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        goto invalid_right;
+               }
 
                if (delta == 0)
                        goto success;
@@ -1230,6 +1229,7 @@ ipc_right_delta(
 
                if (ipc_right_check(space, port, name, entry)) {
                        assert(!(entry->ie_bits & MACH_PORT_TYPE_SEND_ONCE));
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        goto invalid_right;
                }
                /* port is locked and active */
@@ -1274,12 +1274,14 @@ ipc_right_delta(
                                /* port is locked and active */
                                ip_unlock(port);
                                port = IP_NULL;
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                                goto invalid_right;
                        }
                        bits = entry->ie_bits;
                        relport = port;
                        port = IP_NULL;
                } else if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) {
+                       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                        goto invalid_right;
                }
 
@@ -1334,8 +1336,13 @@ ipc_right_delta(
                ipc_port_t port_to_release = IP_NULL;
                mach_port_mscount_t mscount = 0;
 
-               if ((bits & MACH_PORT_TYPE_SEND) == 0)
+               if ((bits & MACH_PORT_TYPE_SEND) == 0) {
+                       /* invalid right exception only when not live/dead confusion */
+                       if ((bits & MACH_PORT_TYPE_DEAD_NAME) == 0) {
+                               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
+                       }
                        goto invalid_right;
+               }
 
                /* maximum urefs for send is MACH_PORT_UREFS_MAX */
 
@@ -1454,6 +1461,7 @@ ipc_right_delta(
 
     invalid_value:
        is_write_unlock(space);
+       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE);
        return KERN_INVALID_VALUE;
 
     guard_failure:
@@ -1491,12 +1499,13 @@ ipc_right_destruct(
        mach_port_mscount_t mscount = 0;
 
        bits = entry->ie_bits;
-       
+
        assert(is_active(space));
 
        if (((bits & MACH_PORT_TYPE_RECEIVE) == 0) ||
            (srdelta && ((bits & MACH_PORT_TYPE_SEND) == 0))) {
                is_write_unlock(space);
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_RIGHT);
                return KERN_INVALID_RIGHT;
        }
 
@@ -1636,8 +1645,8 @@ ipc_right_destruct(
        
     invalid_value:
        is_write_unlock(space);
+       mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE);
        return KERN_INVALID_VALUE;
-
 }
 
 
@@ -1933,8 +1942,10 @@ ipc_right_copyin(
                ipc_entry_modified(space, name, entry);
 
                (void)ipc_port_clear_receiver(port, FALSE); /* don't destroy the port/mqueue */
+               imq_lock(&port->ip_messages);
                port->ip_receiver_name = MACH_PORT_NULL;
                port->ip_destination = IP_NULL;
+               imq_unlock(&port->ip_messages);
 
 #if IMPORTANCE_INHERITANCE
                /*
@@ -2545,8 +2556,8 @@ ipc_right_copyout(
                assert(port->ip_sorights > 0);
 
                if (port->ip_specialreply) {
-                       ipc_port_unlink_special_reply_port_locked(port,
-                               current_thread()->ith_knote, IPC_PORT_UNLINK_SR_NONE);
+                       ipc_port_adjust_special_reply_port_locked(port,
+                               current_thread()->ith_knote, IPC_PORT_ADJUST_SR_LINK_WORKLOOP, FALSE);
                        /* port unlocked on return */
                } else {
                        ip_unlock(port);
@@ -2610,23 +2621,17 @@ ipc_right_copyout(
 
            case MACH_MSG_TYPE_PORT_RECEIVE: {
                ipc_port_t dest;
-               sync_qos_count_t max_sync_qos = THREAD_QOS_UNSPECIFIED;
-               sync_qos_count_t sync_qos_delta_add[THREAD_QOS_LAST] = {0};
-               sync_qos_count_t sync_qos_delta_sub[THREAD_QOS_LAST] = {0};
+               turnstile_inheritor_t inheritor = TURNSTILE_INHERITOR_NULL;
+               struct turnstile *ts = TURNSTILE_NULL;
 
 #if IMPORTANCE_INHERITANCE
                natural_t assertcnt = port->ip_impcount;
 #endif /* IMPORTANCE_INHERITANCE */
-               /* Capture the sync qos count delta */
-               for (int i = 0; i < THREAD_QOS_LAST; i++) {
-                       sync_qos_delta_sub[i] = port_sync_qos(port, i);
-                       if (sync_qos_delta_sub[i] != 0) {
-                               max_sync_qos = i;
-                       }
-               }
 
                assert(port->ip_mscount == 0);
                assert(port->ip_receiver_name == MACH_PORT_NULL);
+
+               imq_lock(&port->ip_messages);
                dest = port->ip_destination;
 
                port->ip_receiver_name = name;
@@ -2634,6 +2639,24 @@ ipc_right_copyout(
 
                assert((bits & MACH_PORT_TYPE_RECEIVE) == 0);
 
+               /* Update the port's turnstile linkage to WL turnstile */
+               ts = port_send_turnstile(port);
+               if (ts) {
+                       struct knote *kn = current_thread()->ith_knote;
+                       if (ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE)) {
+                               inheritor = filt_machport_stash_port(kn, port, NULL);
+                               if (inheritor) {
+                                       turnstile_reference(inheritor);
+                                       IMQ_SET_INHERITOR(&port->ip_messages, inheritor);
+                               }
+                       }
+                       turnstile_reference(ts);
+                       turnstile_update_inheritor(ts, inheritor,
+                               (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_TURNSTILE));
+               }
+
+               imq_unlock(&port->ip_messages);
+
                if (bits & MACH_PORT_TYPE_SEND) {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_SEND);
                        assert(IE_BITS_UREFS(bits) > 0);
@@ -2643,9 +2666,7 @@ ipc_right_copyout(
                        ip_release(port);
 
                        /* entry is locked holding ref, so can use port */
-
-                       ipc_hash_delete(space, (ipc_object_t) port,
-                                       name, entry);
+                       ipc_hash_delete(space, (ipc_object_t) port, name, entry);
                } else {
                        assert(IE_BITS_TYPE(bits) == MACH_PORT_TYPE_NONE);
                        assert(IE_BITS_UREFS(bits) == 0);
@@ -2656,9 +2677,9 @@ ipc_right_copyout(
                entry->ie_bits = bits | MACH_PORT_TYPE_RECEIVE;
                ipc_entry_modified(space, name, entry);
 
-               /* update the sync qos count on knote */
-               if (ITH_KNOTE_VALID(current_thread()->ith_knote)) {
-                       knote_adjust_sync_qos(current_thread()->ith_knote, max_sync_qos, TRUE);
+               if (ts) {
+                       turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+                       turnstile_deallocate_safe(ts);
                }
 
                if (dest != IP_NULL) {
@@ -2673,8 +2694,9 @@ ipc_right_copyout(
                        ipc_port_impcount_delta(dest, 0 - assertcnt, IP_NULL);
                        ip_unlock(dest);
 #endif /* IMPORTANCE_INHERITANCE */
-                       /* Adjust the sync qos of destination */
-                       ipc_port_adjust_sync_qos(dest, sync_qos_delta_add, sync_qos_delta_sub);
+
+                       /* Drop turnstile ref on dest */
+                       ipc_port_send_turnstile_complete(dest);
                        ip_release(dest);
                }
                break;
@@ -2775,11 +2797,13 @@ ipc_right_rename(
                assert(port != IP_NULL);
 
                ip_lock(port);
+               imq_lock(&port->ip_messages);
                assert(ip_active(port));
                assert(port->ip_receiver_name == oname);
                assert(port->ip_receiver == space);
 
                port->ip_receiver_name = nname;
+               imq_unlock(&port->ip_messages);
                ip_unlock(port);
                break;
            }
index 8596ad530f1f38f356bd191ea060d9b86cc877e5..9760d042e86412e0b28e7b89bc72e2526c30e4e3 100644 (file)
@@ -149,12 +149,12 @@ ipc_space_rand_freelist(
        mach_port_index_t       bottom,
        mach_port_index_t       top)
 {
+       int at_start = (bottom == 0);
 #ifdef CONFIG_SEMI_RANDOM_ENTRIES
        /*
         * Only make sequential entries at the start of the table, and not when
         * we're growing the space.
         */
-       int at_start = (bottom == 0);
        ipc_entry_num_t total = 0;
 #endif
 
@@ -210,6 +210,11 @@ ipc_space_rand_freelist(
        table[curr].ie_object = IO_NULL;
        table[curr].ie_index  = 0;
        table[curr].ie_bits   = IE_BITS_GEN_MASK;
+
+       /* The freelist head should always have generation number set to 0 */
+       if (at_start) {
+               table[0].ie_bits = 0;
+       }
 }
 
 
index ab0858c99f8d5e494fcb35046d473fd04e571e08..4e7b4b9502368d0f57e11d520feae4ca42aac5e9 100644 (file)
@@ -98,26 +98,22 @@ static lck_spin_t ivgt_lock_data;
 
 ipc_voucher_t iv_alloc(iv_index_t entries);
 void iv_dealloc(ipc_voucher_t iv, boolean_t unhash);
-extern int thread_qos_from_pthread_priority(unsigned long, unsigned long *);
 
-static inline iv_refs_t
+os_refgrp_decl(static, iv_refgrp, "voucher", NULL);
+os_refgrp_decl(static, ivac_refgrp, "voucher attribute control", NULL);
+
+static inline void
 iv_reference(ipc_voucher_t iv)
 {
-       iv_refs_t refs;
-
-       refs = hw_atomic_add(&iv->iv_refs, 1);
-       return refs;
+       os_ref_retain(&iv->iv_refs);
 }
 
 static inline void
 iv_release(ipc_voucher_t iv)
 {
-       iv_refs_t refs;
-
-       assert(0 < iv->iv_refs);
-       refs = hw_atomic_sub(&iv->iv_refs, 1);
-       if (0 == refs)
+       if (os_ref_release(&iv->iv_refs) == 0) {
                iv_dealloc(iv, TRUE);
+       }
 }
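
The hw_atomic_add/hw_atomic_sub reference counting above is being replaced by the os_refcnt API; the calls visible in this diff fit together roughly as sketched below (a hypothetical object, assuming a kernel build environment where <os/refcnt.h> is available):

#include <os/refcnt.h>

/* Hypothetical refcounted object, mirroring struct ipc_voucher's usage. */
os_refgrp_decl(static, demo_refgrp, "demo object", NULL);

struct demo_obj {
        os_refcnt_t     refs;
};

static void
demo_init(struct demo_obj *o)
{
        os_ref_init(&o->refs, &demo_refgrp);    /* count starts at 1 */
}

static void
demo_retain(struct demo_obj *o)
{
        os_ref_retain(&o->refs);                /* +1; count must not already be 0 */
}

static bool
demo_retain_if_live(struct demo_obj *o)
{
        /* fails instead of asserting when the count already hit 0,
         * which is what iv_dedup() needs under the hash-table lock */
        return os_ref_retain_try(&o->refs);
}

static void
demo_release(struct demo_obj *o, void (*dealloc)(struct demo_obj *))
{
        if (os_ref_release(&o->refs) == 0) {    /* dropped the last reference */
                dealloc(o);
        }
}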
 
 /*
@@ -242,7 +238,7 @@ iv_alloc(iv_index_t entries)
        if (IV_NULL == iv)
                return IV_NULL;
                
-       iv->iv_refs = 1;
+       os_ref_init(&iv->iv_refs, &iv_refgrp);
        iv->iv_sum = 0;
        iv->iv_hash = 0;
        iv->iv_port = IP_NULL;
@@ -298,7 +294,7 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash)
         */
        if (unhash) {
                ivht_lock();
-               assert(0 == iv->iv_refs);
+               assert(os_ref_get_count(&iv->iv_refs) == 0);
                assert(IV_HASH_BUCKETS > iv->iv_hash);
                queue_remove(&ivht_bucket[iv->iv_hash], iv, ipc_voucher_t, iv_hash_link);
                ivht_count--;
@@ -307,8 +303,10 @@ iv_dealloc(ipc_voucher_t iv, boolean_t unhash)
                KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_DESTROY) | DBG_FUNC_NONE,
                                      VM_KERNEL_ADDRPERM((uintptr_t)iv), 0, ivht_count, 0, 0);
 
-       } else
-               assert(0 == --iv->iv_refs);
+       } else {
+               os_ref_count_t cnt __assert_only = os_ref_release(&iv->iv_refs);
+               assert(cnt == 0);
+       }
 
        /*
         * if a port was allocated for this voucher,
@@ -451,13 +449,10 @@ convert_port_name_to_voucher(
 void
 ipc_voucher_reference(ipc_voucher_t voucher)
 {
-       iv_refs_t refs;
-
        if (IPC_VOUCHER_NULL == voucher)
                return;
 
-       refs = iv_reference(voucher);
-       assert(1 < refs);
+       iv_reference(voucher);
 }
 
 void
@@ -505,7 +500,7 @@ convert_voucher_to_port(ipc_voucher_t voucher)
        if (IV_NULL == voucher)
                return (IP_NULL);
 
-       assert(0 < voucher->iv_refs);
+       assert(os_ref_get_count(&voucher->iv_refs) > 0);
 
        /* create a port if needed */
        port = voucher->iv_port;
@@ -579,7 +574,7 @@ ivac_alloc(iv_index_t key_index)
        if (IVAC_NULL == ivac)
                return IVAC_NULL;
                
-       ivac->ivac_refs = 1;
+       os_ref_init(&ivac->ivac_refs, &ivac_refgrp);
        ivac->ivac_is_growing = FALSE;
        ivac->ivac_port = IP_NULL;
 
@@ -617,7 +612,7 @@ ivac_dealloc(ipc_voucher_attr_control_t ivac)
         * that the reference count is still zero.
         */
        ivgt_lock();
-       if (ivac->ivac_refs > 0) {
+       if (os_ref_get_count(&ivac->ivac_refs) > 0) {
                ivgt_unlock();
                return;
        }
@@ -1617,8 +1612,7 @@ iv_dedup(ipc_voucher_t new_iv)
                assert(iv->iv_hash == hash);
 
                /* if not already deallocating and sums match... */
-               if (0 < iv->iv_refs && iv->iv_sum == sum) {
-                       iv_refs_t refs;
+               if ((os_ref_get_count(&iv->iv_refs) > 0) && (iv->iv_sum == sum)) {
                        iv_index_t i;
 
                        assert(iv->iv_table_size <= new_iv->iv_table_size);
@@ -1641,16 +1635,12 @@ iv_dedup(ipc_voucher_t new_iv)
 
                        /* can we get a ref before it hits 0
                         *
-                        * This is thread safe. The reference is just an atomic
-                        * add. If the reference count is zero when we adjust it,
-                        * no other thread can have a reference to the voucher.
+                        * This is thread safe. If the reference count is zero before we
+                        * adjust it, no other thread can have a reference to the voucher.
                         * The dealloc code requires holding the ivht_lock, so
                         * the voucher cannot be yanked out from under us.
                         */
-                       refs = iv_reference(iv);
-                       if (1 == refs) {
-                               /* drats! going away. Put back to zero */
-                               iv->iv_refs = 0;
+                       if (!os_ref_retain_try(&iv->iv_refs)) {
                                continue;
                        }
 
@@ -1724,24 +1714,21 @@ iv_dedup(ipc_voucher_t new_iv)
                                }
                        }
 
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE) | DBG_FUNC_NONE,
-                                             voucher_addr,
-                                             new_iv->iv_table_size, ivht_count, payload_size, 0);
+                       KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE),
+                                       voucher_addr, new_iv->iv_table_size, ivht_count,
+                                       payload_size);
 
                        uintptr_t index = 0;
                        while (attr_tracepoints_needed--) {
-                               KERNEL_DEBUG_CONSTANT1(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE_ATTR_DATA) | DBG_FUNC_NONE,
-                                                      payload[index],
-                                                      payload[index+1],
-                                                      payload[index+2],
-                                                      payload[index+3],
-                                                      voucher_addr);
+                               KDBG(MACHDBG_CODE(DBG_MACH_IPC,
+                                               MACH_IPC_VOUCHER_CREATE_ATTR_DATA), payload[index],
+                                               payload[index + 1], payload[index + 2],
+                                               payload[index + 3]);
                                index += 4;
                        }
                } else {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_IPC,MACH_IPC_VOUCHER_CREATE) | DBG_FUNC_NONE,
-                                             voucher_addr,
-                                             new_iv->iv_table_size, ivht_count, 0, 0);
+                       KDBG(MACHDBG_CODE(DBG_MACH_IPC, MACH_IPC_VOUCHER_CREATE),
+                                       voucher_addr, new_iv->iv_table_size, ivht_count);
                }
        }
 #endif /* KDEBUG_LEVEL >= KDEBUG_LEVEL_STANDARD */
@@ -2353,7 +2340,7 @@ mach_voucher_attr_control_get_values(
 
        key_index = control->ivac_key_index;
 
-       assert(0 < voucher->iv_refs);
+       assert(os_ref_get_count(&voucher->iv_refs) > 0);
        value_index = iv_lookup(voucher, key_index);
        ivace_lookup_values(key_index, value_index,
                            out_values, in_out_size);
index 3f637856ce76ef098ba06ad7174581ff59bc7660..b306487a43c3dd635de42f6b06c8771fde7d5d35 100644 (file)
@@ -32,6 +32,7 @@
 #include <mach/mach_voucher_types.h>
 #include <mach/boolean.h>
 #include <ipc/ipc_types.h>
+#include <os/refcnt.h>
 
 #ifdef MACH_KERNEL_PRIVATE
 
@@ -50,8 +51,6 @@ extern void ipc_voucher_init(void);
 typedef mach_voucher_attr_value_handle_t        iv_value_handle_t;
 typedef mach_voucher_attr_value_reference_t     iv_value_refs_t;
 
-typedef natural_t              iv_refs_t;
-
 typedef natural_t              iv_index_t;
 #define IV_UNUSED_VALINDEX     ((iv_index_t) 0)
 #define IV_UNUSED_KEYINDEX     ((iv_index_t) ~0)
@@ -71,7 +70,7 @@ typedef iv_index_t            *iv_entry_t;
 struct ipc_voucher {
        iv_index_t              iv_hash;        /* checksum hash */
        iv_index_t              iv_sum;         /* checksum of values */
-       iv_refs_t               iv_refs;        /* reference count */
+       os_refcnt_t             iv_refs;        /* reference count */
        iv_index_t              iv_table_size;  /* size of the voucher table */
        iv_index_t              iv_inline_table[IV_ENTRIES_INLINE];
        iv_entry_t              iv_table;       /* table of voucher attr entries */
@@ -142,7 +141,7 @@ typedef ivac_entry              *ivac_entry_t;
 #define IVAC_ENTRIES_MAX        524288
 
 struct ipc_voucher_attr_control {
-       iv_refs_t               ivac_refs;
+       os_refcnt_t             ivac_refs;
        boolean_t               ivac_is_growing;        /* is the table being grown */
        ivac_entry_t            ivac_table;             /* table of voucher attr value entries */
        iv_index_t              ivac_table_size;        /* size of the attr value table */
@@ -182,20 +181,20 @@ extern void ivac_dealloc(ipc_voucher_attr_control_t ivac);
 static inline void
 ivac_reference(ipc_voucher_attr_control_t ivac)
 {
-       (void)hw_atomic_add(&ivac->ivac_refs, 1);
+       if (ivac == IVAC_NULL)
+               return;
+       os_ref_retain(&ivac->ivac_refs);
 }
 
 static inline void
 ivac_release(ipc_voucher_attr_control_t ivac)
 {
-       iv_refs_t refs;
-
        if (IVAC_NULL == ivac)
                return;
 
-       refs = hw_atomic_sub(&ivac->ivac_refs, 1);
-       if (refs == 0)
+       if (os_ref_release(&ivac->ivac_refs) == 0) {
                ivac_dealloc(ivac);
+       }
 }
 
 #define IVAM_NULL IPC_VOUCHER_ATTR_MANAGER_NULL
index 972073a71e512a528ed6888a8b4a7f6b47fbc755..a7b47831b579c8aa11faaca31e9e618674cfaf42 100644 (file)
@@ -545,3 +545,51 @@ mach_port_kernel_object(
        return kr;
 }
 #endif /* MACH_IPC_DEBUG */
+
+#if (DEVELOPMENT || DEBUG)
+kern_return_t
+mach_port_special_reply_port_reset_link(
+       ipc_space_t             space,
+       mach_port_name_t        name,
+       boolean_t               *srp_lost_link)
+{
+       ipc_port_t port;
+       kern_return_t kr;
+       thread_t thread = current_thread();
+
+       if (space != current_space())
+               return KERN_INVALID_TASK;
+
+       if (!MACH_PORT_VALID(name))
+               return KERN_INVALID_NAME;
+
+       if (!IP_VALID(thread->ith_special_reply_port))
+               return KERN_INVALID_VALUE;
+
+       kr = ipc_port_translate_receive(space, name, &port);
+       if (kr != KERN_SUCCESS)
+               return kr;
+
+       if (thread->ith_special_reply_port != port) {
+               ip_unlock(port);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       imq_lock(&port->ip_messages);
+       *srp_lost_link = (port->ip_srp_lost_link == 1)? TRUE : FALSE;
+       port->ip_srp_lost_link = 0;
+       imq_unlock(&port->ip_messages);
+
+       ip_unlock(port);
+       return KERN_SUCCESS;
+}
+#else
+kern_return_t
+mach_port_special_reply_port_reset_link(
+       __unused ipc_space_t            space,
+       __unused mach_port_name_t       name,
+       __unused boolean_t              *srp_lost_link)
+{
+       return KERN_NOT_SUPPORTED;
+}
+#endif
index b4ee58fecec039d0b671ae984dbf83b92611f4c3..05fd050bbb8d743fe69a3fefc990640c3b5c9917 100644 (file)
@@ -268,6 +268,44 @@ done:
        return (rv);
 }
 
+int
+_kernelrpc_mach_port_get_attributes_trap(struct _kernelrpc_mach_port_get_attributes_args *args)
+{
+       task_inspect_t task = port_name_to_task_inspect(args->target);
+       int rv = MACH_SEND_INVALID_DEST;
+       mach_msg_type_number_t count;
+
+       if (task != current_task())
+               goto done;
+
+       // MIG does not define the type or size of the mach_port_info_t out array
+       // anywhere, so derive them from the field in the generated reply struct
+#define MACH_PORT_INFO_OUT (((__Reply__mach_port_get_attributes_t*)NULL)->port_info_out)
+#define MACH_PORT_INFO_STACK_LIMIT 80 // current size is 68 == 17 * sizeof(integer_t)
+       _Static_assert(sizeof(MACH_PORT_INFO_OUT) < MACH_PORT_INFO_STACK_LIMIT,
+                       "mach_port_info_t has grown significantly, reevaluate stack usage");
+       const mach_msg_type_number_t max_count = (sizeof(MACH_PORT_INFO_OUT)/sizeof(MACH_PORT_INFO_OUT[0]));
+       typeof(MACH_PORT_INFO_OUT[0]) info[max_count];
+
+       if (copyin(CAST_USER_ADDR_T(args->count), &count, sizeof(count))) {
+               rv = MACH_SEND_INVALID_DATA;
+               goto done;
+       }
+       if (count > max_count)
+               count = max_count;
+
+       rv = mach_port_get_attributes(task->itk_space, args->name, args->flavor, info, &count);
+       if (rv == KERN_SUCCESS)
+               rv = copyout(&count, CAST_USER_ADDR_T(args->count), sizeof(count));
+       if (rv == KERN_SUCCESS && count > 0)
+               rv = copyout(info, CAST_USER_ADDR_T(args->info), count * sizeof(info[0]));
+
+done:
+       if (task)
+               task_deallocate(task);
+       return (rv);
+}
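
For context, the MIG routine this trap fast-paths is typically driven from userspace as sketched below (a hedged example using the public mach_port_get_attributes() interface; the count-in/count-out convention mirrors the copyin/copyout of args->count above):

#include <mach/mach.h>
#include <stdio.h>

int
print_receive_status(mach_port_t port)
{
        mach_port_status_t status;
        mach_msg_type_number_t count = MACH_PORT_RECEIVE_STATUS_COUNT;

        /* count goes in with the capacity of the info array and comes back
         * with the number of integers actually filled in */
        kern_return_t kr = mach_port_get_attributes(mach_task_self(), port,
            MACH_PORT_RECEIVE_STATUS, (mach_port_info_t)&status, &count);
        if (kr != KERN_SUCCESS)
                return -1;

        printf("msgcount=%u seqno=%u\n", status.mps_msgcount, status.mps_seqno);
        return 0;
}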
+
 int
 _kernelrpc_mach_port_insert_member_trap(struct _kernelrpc_mach_port_insert_member_args *args)
 {
@@ -487,7 +525,8 @@ mach_voucher_extract_attr_recipe_trap(struct mach_voucher_extract_attr_recipe_ar
                kfree(krecipe, (vm_size_t)max_sz);
        }
 
-       kr = copyout(&sz, args->recipe_size, sizeof(sz));
+       if (kr == KERN_SUCCESS)
+               kr = copyout(&sz, args->recipe_size, sizeof(sz));
 
 done:
        ipc_voucher_release(voucher);
index 128cd96053a783ae9ec247f8b603dc3904fc3633..d17cb24c3fc82b66a2a56a4e4186f05816fbbfa4 100644 (file)
@@ -95,6 +95,7 @@
 
 #include <vm/vm_map.h>
 
+#include <ipc/port.h>
 #include <ipc/ipc_types.h>
 #include <ipc/ipc_kmsg.h>
 #include <ipc/ipc_mqueue.h>
@@ -151,8 +152,8 @@ mach_msg_rcv_link_special_reply_port(
        ipc_port_t special_reply_port,
        mach_port_name_t dest_name_port);
 
-static void
-mach_msg_rcv_unlink_special_reply_port(void);
+void
+mach_msg_receive_results_complete(ipc_object_t object);
 
 security_token_t KERNEL_SECURITY_TOKEN = KERNEL_SECURITY_TOKEN_VALUE;
 audit_token_t KERNEL_AUDIT_TOKEN = KERNEL_AUDIT_TOKEN_VALUE;
@@ -205,6 +206,8 @@ mach_msg_send(
        mach_msg_size_t msg_and_trailer_size;
        mach_msg_max_trailer_t  *trailer;
 
+       option |= MACH_SEND_KERNEL;
+
        if ((send_size & 3) ||
            send_size < sizeof(mach_msg_header_t) ||
            (send_size < sizeof(mach_msg_base_t) && (msg->msgh_bits & MACH_MSGH_BITS_COMPLEX)))
@@ -320,8 +323,11 @@ mach_msg_receive_results(
        mach_msg_trailer_size_t trailer_size;
        mach_msg_size_t   size = 0;
 
-       /* unlink the special_reply_port before releasing reference to object */
-       mach_msg_rcv_unlink_special_reply_port();
+       /*
        * Unlink the special_reply_port before releasing the reference to the object.
        * Get the thread's turnstile back if the thread donated its turnstile to the port.
+        */
+       mach_msg_receive_results_complete(object);
        io_release(object);
 
        if (mr != MACH_MSG_SUCCESS) {
@@ -415,33 +421,6 @@ mach_msg_receive_results(
                *sizep = size;
        return mr;
 }
-#ifndef _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG
-#define _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG 0x02000000 /* pthread event manager bit */
-#endif
-#ifndef _PTHREAD_PRIORITY_OVERCOMMIT_FLAG
-#define _PTHREAD_PRIORITY_OVERCOMMIT_FLAG    0x80000000 /* request overcommit threads */
-#endif
-#ifndef _PTHREAD_PRIORITY_QOS_CLASS_MASK
-#define _PTHREAD_PRIORITY_QOS_CLASS_MASK    0x003fff00  /* QoS class mask */
-#endif
-
-/* JMM - this needs to invoke a pthread function to compute this */
-mach_msg_priority_t
-mach_msg_priority_combine(mach_msg_priority_t msg_qos,
-                          mach_msg_priority_t recv_qos)
-{
-    mach_msg_priority_t overcommit;
-       mach_msg_priority_t no_oc_qos;
-       mach_msg_priority_t res;
-
-       assert(msg_qos < _PTHREAD_PRIORITY_EVENT_MANAGER_FLAG);
-       overcommit = recv_qos & _PTHREAD_PRIORITY_OVERCOMMIT_FLAG;
-       no_oc_qos =  recv_qos & ~overcommit; 
-       res = (no_oc_qos > msg_qos) ? no_oc_qos : msg_qos;
-       res |= overcommit;
-
-       return res;
-}
 
 /*
  *     Routine:        mach_msg_receive [Kernel Internal]
@@ -451,7 +430,7 @@ mach_msg_priority_combine(mach_msg_priority_t msg_qos,
  *             Unlike being dispatched to by ipc_kobject_server() or the
  *             reply part of mach_msg_rpc_from_kernel(), this routine
  *             looks up the receive port name in the kernel's port
- *             namespace and copies out received port rights to that namespace
+ *             namespace and copies out received port rights to that namespace
  *             as well.  Out-of-line memory is copied out the kernel's
  *             address space (rather than just providing the vm_map_copy_t).
  *     Conditions:
@@ -586,6 +565,7 @@ mach_msg_overwrite_trap(
 
                mr = ipc_mqueue_copyin(space, rcv_name, &mqueue, &object);
                if (mr != MACH_MSG_SUCCESS) {
+                       mach_port_guard_exception(rcv_name, 0, 0, kGUARD_EXC_RCV_INVALID_NAME);
                        return mr;
                }
                /* hold ref for object */
@@ -640,7 +620,6 @@ mach_msg_rcv_link_special_reply_port(
 {
        ipc_port_t dest_port = IP_NULL;
        kern_return_t kr;
-       int qos;
 
        if (current_thread()->ith_special_reply_port != special_reply_port) {
                return MACH_RCV_INVALID_NOTIFY;
@@ -660,12 +639,8 @@ mach_msg_rcv_link_special_reply_port(
         * do not fail the receive in that case.
         */
        if (kr == KERN_SUCCESS && IP_VALID(dest_port)) {
-
-               /* Get the effective qos of the thread */
-               qos = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS);
-
-               ipc_port_link_special_reply_port_with_qos(special_reply_port,
-                       dest_port, qos);
+               ipc_port_link_special_reply_port(special_reply_port,
+                       dest_port);
 
                /* release the send right */
                ipc_port_release_send(dest_port);
@@ -674,29 +649,47 @@ mach_msg_rcv_link_special_reply_port(
 }
 
 /*
- *     Routine:        mach_msg_rcv_unlink_special_reply_port
+ *     Routine:        mach_msg_receive_results_complete
  *     Purpose:
- *             Unlink the special reply port to the other end
- *             of the sync ipc channel.
+ *             Get the thread's turnstile back from the object and,
+ *             if the object is a special reply port, reset its
+ *             linkage.
  *     Condition:
  *             Nothing locked.
  *     Returns:
  *             None.
  */
-static void
-mach_msg_rcv_unlink_special_reply_port(void)
+void
+mach_msg_receive_results_complete(ipc_object_t object)
 {
        thread_t self = current_thread();
-       ipc_port_t special_reply_port = self->ith_special_reply_port;
-       mach_msg_option_t option = self->ith_option;
+       ipc_port_t port = IPC_PORT_NULL;
+       boolean_t get_turnstile = self->turnstile ? FALSE : TRUE;
 
-       if ((special_reply_port == IP_NULL) ||
-           !(option & MACH_RCV_SYNC_WAIT)) {
+       if (io_otype(object) == IOT_PORT) {
+               __IGNORE_WCASTALIGN(port = (ipc_port_t) object);
+       } else {
+               assert(self->turnstile != TURNSTILE_NULL);
                return;
        }
 
-       ipc_port_unlink_special_reply_port(special_reply_port,
-               IPC_PORT_UNLINK_SR_ALLOW_SYNC_QOS_LINKAGE);
+       uint8_t flags = IPC_PORT_ADJUST_SR_ALLOW_SYNC_LINKAGE;
+
+       /*
+        * Don't clear the ip_srp_msg_sent bit if the message was not actually
+        * consumed (too large with MACH_RCV_LARGE, interrupted, timed out,
+        * port changed, or peek ready); a later receive can still pick it up.
+        */
+       if (!((self->ith_state == MACH_RCV_TOO_LARGE && self->ith_option & MACH_RCV_LARGE) || //msg was too large and the next receive will get it
+               self->ith_state == MACH_RCV_INTERRUPTED ||
+               self->ith_state == MACH_RCV_TIMED_OUT ||
+               self->ith_state == MACH_RCV_PORT_CHANGED ||
+               self->ith_state == MACH_PEEK_READY)) {
+
+               flags |= IPC_PORT_ADJUST_SR_RECEIVED_MSG;
+       }
+
+       ipc_port_adjust_special_reply_port(port,
+               flags, get_turnstile);
+       /* thread now has a turnstile */
 }
 
 /*
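mach_msg_receive_results_complete() only adds IPC_PORT_ADJUST_SR_RECEIVED_MSG when the receive actually consumed a message; for the retryable outcomes listed in the condition the flag is withheld so a later receive can still complete the sync IPC. A minimal userspace sketch of that decision, using hypothetical stand-ins for the kernel's receive-state values:

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel's ith_state values. */
typedef enum {
    RCV_SUCCESS,
    RCV_TOO_LARGE,     /* MACH_RCV_TOO_LARGE    */
    RCV_INTERRUPTED,   /* MACH_RCV_INTERRUPTED  */
    RCV_TIMED_OUT,     /* MACH_RCV_TIMED_OUT    */
    RCV_PORT_CHANGED,  /* MACH_RCV_PORT_CHANGED */
    PEEK_READY         /* MACH_PEEK_READY       */
} rcv_state_t;

#define OPT_RCV_LARGE 0x1  /* stand-in for MACH_RCV_LARGE */

/* True when the message was actually consumed, i.e. when the
 * "received msg" flag should be added to the port adjustment. */
static bool msg_was_consumed(rcv_state_t state, unsigned options)
{
    if ((state == RCV_TOO_LARGE && (options & OPT_RCV_LARGE)) ||
        state == RCV_INTERRUPTED ||
        state == RCV_TIMED_OUT ||
        state == RCV_PORT_CHANGED ||
        state == PEEK_READY) {
        return false;  /* a later receive can still pick the message up */
    }
    return true;
}

int main(void)
{
    printf("%d %d\n", msg_was_consumed(RCV_SUCCESS, 0),
        msg_was_consumed(RCV_TOO_LARGE, OPT_RCV_LARGE));
    return 0;
}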
index ced4e638475fffac43bf8b7b656d6c2062a41a46..7d88c54810e9ff2b53735a2d34c6f26cd72fa9ae 100644 (file)
@@ -87,6 +87,7 @@
 #include <mach/mach_port_server.h>
 #include <vm/vm_map.h>
 #include <vm/vm_kern.h>
+#include <ipc/port.h>
 #include <ipc/ipc_entry.h>
 #include <ipc/ipc_space.h>
 #include <ipc/ipc_object.h>
@@ -120,14 +121,6 @@ void mach_port_gst_helper(
        mach_port_name_t        *names,
        ipc_entry_num_t         *actualp);
 
-
-kern_return_t
-mach_port_guard_exception(
-       mach_port_name_t        name,
-       uint64_t                inguard,
-       uint64_t                portguard,
-       unsigned                reason);
-
 /* Needs port locked */
 void mach_port_get_status_helper(
        ipc_port_t              port,
@@ -464,8 +457,10 @@ mach_port_type(
        }
 
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
 
        /* space is write-locked and active */
        kr = ipc_right_info(space, name, entry, typep, &urefs);
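The same pattern recurs throughout this file's hunks: when a right lookup or port translation fails, the routine now raises mach_port_guard_exception() with kGUARD_EXC_INVALID_NAME (or kGUARD_EXC_INVALID_RIGHT when the name exists but denotes the wrong right) before handing back the original error. A hedged standalone sketch of that shape, with hypothetical stand-ins for the kernel types:

#include <stdio.h>

typedef int kern_return_t;
typedef unsigned int mach_port_name_t;
#define KERN_SUCCESS      0
#define KERN_INVALID_NAME 15

enum { kGUARD_EXC_INVALID_NAME = 1, kGUARD_EXC_INVALID_RIGHT = 2 };

/* In the kernel this is mach_port_guard_exception(); here we only log. */
static void guard_exception(mach_port_name_t name, unsigned reason)
{
    printf("guard exception: name 0x%x, reason %u\n", name, reason);
}

/* Shape of the recurring pattern: look up, raise a guard exception on
 * failure, and still return the original error to the caller. */
static kern_return_t lookup_with_guard(mach_port_name_t name)
{
    kern_return_t kr = KERN_INVALID_NAME;  /* pretend the lookup failed */

    if (kr != KERN_SUCCESS) {
        guard_exception(name, kGUARD_EXC_INVALID_NAME);
        return kr;
    }
    return KERN_SUCCESS;
}

int main(void)
{
    (void)lookup_with_guard(0x1103);
    return 0;
}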
@@ -677,12 +672,14 @@ mach_port_allocate_full(
                } else {
                        mach_msg_size_t size = qosp->len + MAX_TRAILER_SIZE;
 
-                       if (right != MACH_PORT_RIGHT_RECEIVE)
+                       if (right != MACH_PORT_RIGHT_RECEIVE) {
                                return (KERN_INVALID_VALUE);
+                       }
 
                        kmsg = (ipc_kmsg_t)ipc_kmsg_prealloc(size);
-                       if (kmsg == IKM_NULL)
+                       if (kmsg == IKM_NULL) {
                                return (KERN_RESOURCE_SHORTAGE);
+                       }
                }
        }
 
@@ -763,8 +760,10 @@ mach_port_destroy(
                return KERN_SUCCESS;
 
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
        /* space is write-locked and active */
 
        kr = ipc_right_destroy(space, name, entry, TRUE, 0); /* unlocks space */
@@ -804,8 +803,10 @@ mach_port_deallocate(
                return KERN_SUCCESS;
 
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
        /* space is write-locked */
 
        kr = ipc_right_dealloc(space, name, entry); /* unlocks space */
@@ -857,8 +858,10 @@ mach_port_get_refs(
        }
 
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
 
        /* space is write-locked and active */
        kr = ipc_right_info(space, name, entry, &type, &urefs);
@@ -937,8 +940,11 @@ mach_port_mod_refs(
        }
 
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
+
        /* space is write-locked and active */
 
        kr = ipc_right_delta(space, name, entry, right, delta); /* unlocks */
@@ -1011,14 +1017,21 @@ mach_port_peek(
         * leaking the context pointer and to avoid variable-sized context issues.
         */
        if (GET_RCV_ELEMENTS(trailer_type) > MACH_RCV_TRAILER_AUDIT ||
-           REQUESTED_TRAILER_SIZE(TRUE, trailer_type) > *trailer_sizep) 
+           REQUESTED_TRAILER_SIZE(TRUE, trailer_type) > *trailer_sizep) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_VALUE);
                return KERN_INVALID_VALUE;
+       }
 
        *trailer_sizep = REQUESTED_TRAILER_SIZE(TRUE, trailer_type);
 
        kr = ipc_port_translate_receive(space, name, &port);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0,
+                                         ((KERN_INVALID_NAME == kr) ?
+                                          kGUARD_EXC_INVALID_NAME :
+                                          kGUARD_EXC_INVALID_RIGHT));
                return kr;
+       }
 
        /* Port locked and active */
 
@@ -1392,6 +1405,9 @@ mach_port_move_member(
                wq_link_id = waitq_link_reserve(NULL);
                wq_reserved_prepost = waitq_prepost_reserve(NULL, 10,
                                                            WAITQ_DONT_LOCK);
+               kr = ipc_pset_lazy_allocate(space, after);
+               if (kr != KERN_SUCCESS)
+                       goto done;
        }
 
        kr = ipc_right_lookup_read(space, member, &entry);
@@ -2048,6 +2064,10 @@ mach_port_insert_member(
        wq_link_id = waitq_link_reserve(NULL);
        wq_reserved_prepost = waitq_prepost_reserve(NULL, 10,
                                                    WAITQ_DONT_LOCK);
+       kr = ipc_pset_lazy_allocate(space, psname);
+       if (kr != KERN_SUCCESS)
+               goto done;
+
 
        kr = ipc_object_translate_two(space, 
                                      name, MACH_PORT_RIGHT_RECEIVE, &obj,
@@ -2224,7 +2244,7 @@ mach_port_unguard_locked(
  *     Returns:
  *             KERN_FAILURE            Thread marked with AST_GUARD.
  */
-kern_return_t
+void
 mach_port_guard_exception(
        mach_port_name_t        name,
        __unused uint64_t       inguard,
@@ -2238,7 +2258,6 @@ mach_port_guard_exception(
        mach_exception_subcode_t subcode = (uint64_t)portguard;
        thread_t t = current_thread();
        thread_guard_violation(t, code, subcode);
-       return KERN_FAILURE;
 }
 
 
@@ -2253,16 +2272,65 @@ mach_port_guard_exception(
  */
 
 void
-mach_port_guard_ast(thread_t __unused t,
+mach_port_guard_ast(thread_t t,
        mach_exception_data_type_t code, mach_exception_data_type_t subcode)
 {
-       assert(t->task != kernel_task);
+       unsigned int reason = EXC_GUARD_DECODE_GUARD_FLAVOR(code);
+       task_t task = t->task;
+       unsigned int behavior = task->task_exc_guard;
+       assert(task == current_task());
+       assert(task != kernel_task);
 
-       /* Raise an EXC_GUARD exception */
-       task_exception_notify(EXC_GUARD, code, subcode);
+       switch (reason) {
+               /*
+                * Fatal Mach port guards - always delivered synchronously
+                */
+       case kGUARD_EXC_DESTROY:
+       case kGUARD_EXC_MOD_REFS:
+       case kGUARD_EXC_SET_CONTEXT:
+       case kGUARD_EXC_UNGUARDED:
+       case kGUARD_EXC_INCORRECT_GUARD:
+               task_exception_notify(EXC_GUARD, code, subcode);
+               task_bsdtask_kill(task);
+               break;
+
+       default:
+               /*
+                * Mach port guards controlled by task settings.
+                */
+
+               /* Is delivery enabled */
+               if ((behavior & TASK_EXC_GUARD_MP_DELIVER) == 0) {
+                       return;
+               }
+
+               /* If only once, make sure we're that once */
+               while (behavior & TASK_EXC_GUARD_MP_ONCE) {
+                       uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_MP_DELIVER;
+
+                       if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
+                               break;
+                       }
+                       behavior = task->task_exc_guard;
+                       if ((behavior & TASK_EXC_GUARD_MP_DELIVER) == 0) {
+                               return;
+                       }
+               }
+
+               /* Raise exception via corpse fork or synchronously */
+               if ((task->task_exc_guard & TASK_EXC_GUARD_MP_CORPSE) &&
+                   (task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) == 0) {
+                       task_violated_guard(code, subcode, NULL);
+               } else {
+                       task_exception_notify(EXC_GUARD, code, subcode);
+               }
 
-       /* Terminate task which caused the exception */
-       task_bsdtask_kill(current_task());
+               /* Terminate the task if desired */
+               if (task->task_exc_guard & TASK_EXC_GUARD_MP_FATAL) {
+                       task_bsdtask_kill(task);
+               }
+               break;
+       }
 }
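The default branch above delivers at most once when TASK_EXC_GUARD_MP_ONCE is set: a compare-and-swap loop clears the deliver bit so that exactly one racing thread wins, while the others observe the cleared bit and return. A minimal standalone sketch of the same idea with C11 atomics (the flag values are illustrative, not the kernel's):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define GUARD_DELIVER 0x1u  /* illustrative stand-in for TASK_EXC_GUARD_MP_DELIVER */
#define GUARD_ONCE    0x2u  /* illustrative stand-in for TASK_EXC_GUARD_MP_ONCE    */

static _Atomic unsigned task_exc_guard = GUARD_DELIVER | GUARD_ONCE;

/* Returns true if this caller won the right to deliver the exception. */
static bool should_deliver_once(void)
{
    unsigned behavior = atomic_load(&task_exc_guard);

    if ((behavior & GUARD_DELIVER) == 0)
        return false;

    while (behavior & GUARD_ONCE) {
        unsigned cleared = behavior & ~GUARD_DELIVER;
        /* Only one CAS can clear the bit; losers re-read and may bail out. */
        if (atomic_compare_exchange_weak(&task_exc_guard, &behavior, cleared))
            return true;
        if ((behavior & GUARD_DELIVER) == 0)
            return false;
    }
    return true;
}

int main(void)
{
    /* prints "first: 1, second: 0" */
    printf("first: %d, second: %d\n", should_deliver_once(), should_deliver_once());
    return 0;
}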
 
 /*
@@ -2390,8 +2458,10 @@ mach_port_destruct(
 
        /* Remove reference for receive right */
        kr = ipc_right_lookup_write(space, name, &entry);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_NAME);
                return kr;
+       }
        /* space is write-locked and active */
        kr = ipc_right_destruct(space, name, entry, srdelta, guard);    /* unlocks */
 
@@ -2431,15 +2501,23 @@ mach_port_guard(
 
        /* Guard can be applied only to receive rights */
        kr = ipc_port_translate_receive(space, name, &port);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0,
+                                         ((KERN_INVALID_NAME == kr) ?
+                                          kGUARD_EXC_INVALID_NAME :
+                                          kGUARD_EXC_INVALID_RIGHT));
                return kr;
+       }
 
        /* Port locked and active */
        kr = mach_port_guard_locked(port, guard, strict);
        ip_unlock(port);
 
-       return kr;
+       if (KERN_INVALID_ARGUMENT == kr) {
+               mach_port_guard_exception(name, 0, 0, kGUARD_EXC_INVALID_ARGUMENT);
+       }
 
+       return kr;
 }
 
 /*
@@ -2474,12 +2552,18 @@ mach_port_unguard(
                return KERN_INVALID_NAME;
 
        kr = ipc_port_translate_receive(space, name, &port);
-       if (kr != KERN_SUCCESS)
+       if (kr != KERN_SUCCESS) {
+               mach_port_guard_exception(name, 0, 0,
+                                         ((KERN_INVALID_NAME == kr) ?
+                                          kGUARD_EXC_INVALID_NAME :
+                                          kGUARD_EXC_INVALID_RIGHT));
                return kr;
+       }
 
        /* Port locked and active */
        kr = mach_port_unguard_locked(port, name, guard);
        ip_unlock(port);
+
        return kr;
 }
 
index 26dd9bcb10030e63814003a30fe6840fe0d5fcb1..7e25b7f05de4d899a11c11d27d96022ab8afa4b8 100644 (file)
 #define        MACH_PORT_UREFS_UNDERFLOW(urefs, delta)                         \
                (((delta) < 0) && (((mach_port_urefs_t)-(delta)) > (urefs)))
 
+__BEGIN_DECLS
+extern void mach_port_guard_exception(
+       mach_port_name_t        name,
+       uint64_t                inguard,
+       uint64_t                portguard,
+       unsigned                reason);
+__END_DECLS
+
 #endif /* _IPC_PORT_H_ */
index c3ea1037c9f42657fd34e5c99fb0142e51f5071a..d214c8e04eb244d88937a9d5c450672a23b2e200 100644 (file)
@@ -178,6 +178,10 @@ boolean_t kdp_has_polled_corefile(void)
     return (NULL != gIOPolledCoreFileVars);
 }
 
+kern_return_t kdp_polled_corefile_error(void)
+{
+    return gIOPolledCoreFileOpenRet;
+}
 #if CONFIG_EMBEDDED
 /*
  * Whenever we start a coredump, make sure the buffers
@@ -461,7 +465,7 @@ kern_dump_disk_proc(unsigned int request, __unused char *corename,
         case KDP_DATA:
            err = IOPolledFileWrite(gIOPolledCoreFileVars, data, length, NULL);
            if (kIOReturnSuccess != err) {
-                   kern_coredump_log(NULL, "IOPolledFileWrite(gIOPolledCoreFileVars, 0x%p, 0x%llx, NULL) returned 0x%x\n",
+                   kern_coredump_log(NULL, "IOPolledFileWrite(gIOPolledCoreFileVars, %p, 0x%llx, NULL) returned 0x%x\n",
                                    data, length, err);
                    break;
            }
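The logging changes in this file drop the literal 0x in front of %p: the printf behind kern_coredump_log() already prints pointers with a 0x prefix, so "0x%p" showed the prefix twice, which is why the explicit prefix is removed here. The output of %p is implementation defined in general, but a quick userspace check on macOS shows the same behaviour:

#include <stdio.h>

int main(void)
{
    int x = 0;
    /* Typical output on macOS:
     *   %p   -> 0x7ff7b5c0a9ec
     *   0x%p -> 0x0x7ff7b5c0a9ec   (doubled prefix)
     */
    printf("%%p   -> %p\n", (void *)&x);
    printf("0x%%p -> 0x%p\n", (void *)&x);
    return 0;
}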
@@ -510,7 +514,7 @@ kdp_core_zoutput(z_streamp strm, Bytef *buf, unsigned len)
     {
        if ((ret = (*vars->outproc)(KDP_DATA, NULL, len, buf)) != kIOReturnSuccess)
        { 
-           kern_coredump_log(NULL, "(kdp_core_zoutput) outproc(KDP_DATA, NULL, 0x%x, 0x%p) returned 0x%x\n",
+           kern_coredump_log(NULL, "(kdp_core_zoutput) outproc(KDP_DATA, NULL, 0x%x, %p) returned 0x%x\n",
                            len, buf, ret);
            vars->error = ret;
        }
@@ -553,7 +557,7 @@ kdp_core_zoutputbuf(z_streamp strm, Bytef *inbuf, unsigned inlen)
                                        vars->outlen - vars->outremain, 
                                        vars->outbuf)) != kIOReturnSuccess)
        { 
-           kern_coredump_log(NULL, "(kdp_core_zoutputbuf) outproc(KDP_DATA, NULL, 0x%x, 0x%p) returned 0x%x\n",
+           kern_coredump_log(NULL, "(kdp_core_zoutputbuf) outproc(KDP_DATA, NULL, 0x%x, %p) returned 0x%x\n",
                            (vars->outlen - vars->outremain), vars->outbuf, ret);
            vars->error = ret;
        }
@@ -708,7 +712,7 @@ kernel_pmap_present_mapping(uint64_t vaddr, uint64_t * pvincr, uintptr_t * pvphy
         vincr = kdp_core_ramdisk_size;
     }
     else
-#if defined(__arm64__)
+#if defined(__arm64__) && defined(CONFIG_XNUPOST)
     if (vaddr == _COMM_HIGH_PAGE64_BASE_ADDRESS)
     {
        /* not readable */
@@ -795,22 +799,25 @@ pmap_traverse_present_mappings(pmap_t __unused pmap,
                    ppn = VM_PAGE_GET_PHYS_PAGE(m);
                    break;
                }
-               m = (vm_page_t)vm_page_queue_next(&m->listq);
-           }
-           vcur = phystokv(ptoa(ppn));
-           if (vcur != vprev)
-           {
-               ret = callback(vcurstart, vprev, context);
-               lastvavalid = FALSE;
+               m = (vm_page_t)vm_page_queue_next(&m->vmp_listq);
            }
            vincr = PAGE_SIZE_64;
            if (ppn == atop(avail_end))
            {
                vm_object_unlock(&pmap_object_store);
                m = VM_PAGE_NULL;
+               // avail_end is not a valid physical address,
+               // so phystokv(avail_end) may not produce the expected result.
+               vcur = phystokv(avail_start) + (avail_end - avail_start);
+           } else {
+               m = (vm_page_t)vm_page_queue_next(&m->vmp_listq);
+               vcur = phystokv(ptoa(ppn));
+           }
+           if (vcur != vprev)
+           {
+               ret = callback(vcurstart, vprev, context);
+               lastvavalid = FALSE;
            }
-           else
-               m = (vm_page_t)vm_page_queue_next(&m->listq);
        }
        if (m == VM_PAGE_NULL)
            ppn = kernel_pmap_present_mapping(vcur, &vincr, NULL);
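The restructured loop also changes how the virtual cursor is computed when the walk reaches the end of managed memory: avail_end is one past the last managed physical address (the hunk's own comment notes it is not a valid physical address), so instead of calling phystokv(avail_end) the cursor is derived as an offset from phystokv(avail_start). A small sketch with a hypothetical linear phystokv() to show the two forms agree while only the second stays inside the managed range:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical linear physical-to-virtual translation, for illustration only. */
#define PHYS_BASE 0x0000000080000000ULL
#define VIRT_BASE 0xffffff8000000000ULL
static uint64_t phystokv(uint64_t pa) { return VIRT_BASE + (pa - PHYS_BASE); }

int main(void)
{
    uint64_t avail_start = PHYS_BASE;
    uint64_t avail_end   = PHYS_BASE + 0x40000000ULL;  /* one past the last byte */

    /* Equivalent for a linear map, but the second form never hands
     * phystokv() an address outside the managed physical range. */
    uint64_t direct = phystokv(avail_end);
    uint64_t offset = phystokv(avail_start) + (avail_end - avail_start);

    printf("direct 0x%llx, offset 0x%llx\n",
        (unsigned long long)direct, (unsigned long long)offset);
    return 0;
}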
@@ -1066,13 +1073,13 @@ kern_dump_update_header(struct kdp_core_out_vars *outvars)
        /* Write the file header -- first seek to the beginning of the file */
        foffset = 0;
        if ((ret = (outvars->outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) {
-               kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n",
+               kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n",
                                sizeof(foffset), &foffset, foffset, ret);
                return ret;
        }
 
        if ((ret = (outvars->outproc)(KDP_DATA, NULL, sizeof(kdp_core_header), &kdp_core_header)) != kIOReturnSuccess) {
-               kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n",
+               kern_coredump_log(NULL, "(kern_dump_update_header) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n",
                                sizeof(kdp_core_header), &kdp_core_header, ret);
                 return ret;
        }
@@ -1125,7 +1132,7 @@ kern_dump_seek_to_next_file(void *kdp_core_out_vars, uint64_t next_file_offset)
        int ret;
 
        if ((ret = (outvars->outproc)(KDP_SEEK, NULL, sizeof(next_file_offset), &next_file_offset)) != kIOReturnSuccess) {
-               kern_coredump_log(NULL, "(kern_dump_seek_to_next_file) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n",
+               kern_coredump_log(NULL, "(kern_dump_seek_to_next_file) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n",
                                sizeof(next_file_offset), &next_file_offset, next_file_offset, ret);
        }
 
@@ -1186,7 +1193,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
 
                /* Seek the calculated offset (we'll scrollback later to flush the logs and header) */
                if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) {
-                       kern_coredump_log(NULL, "(do_kern_dump seek begin) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n",
+                       kern_coredump_log(NULL, "(do_kern_dump seek begin) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n",
                                        sizeof(foffset), &foffset, foffset, ret);
                        dump_succeeded = FALSE;
                        goto exit;
@@ -1237,11 +1244,11 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
                                kern_coredump_log(NULL, "Failed to reset outvars for stackshot with len 0x%zx, returned 0x%x\n", panic_stackshot_len, ret);
                                dump_succeeded = FALSE;
                        } else if ((ret = kdp_core_output(&outvars, panic_stackshot_len, (void *)panic_stackshot_buf)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, 0x%p) returned 0x%x\n",
+                               kern_coredump_log(NULL, "Failed to write panic stackshot to file, kdp_coreoutput(outvars, %lu, %p) returned 0x%x\n",
                                               panic_stackshot_len, (void *) panic_stackshot_buf, ret);
                                dump_succeeded = FALSE;
                        } else if ((ret = kdp_core_output(&outvars, 0, NULL)) != KERN_SUCCESS) {
-                               kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(0x%p, 0, NULL) returned 0x%x\n", &outvars, ret);
+                               kern_coredump_log(NULL, "Failed to flush stackshot data : kdp_core_output(%p, 0, NULL) returned 0x%x\n", &outvars, ret);
                                dump_succeeded = FALSE;
                        } else if ((ret = kern_dump_record_file(&outvars, "panic_stackshot.kcdata", foffset, &compressed_stackshot_len)) != KERN_SUCCESS) {
                                kern_coredump_log(NULL, "Failed to record panic stackshot in corefile header, kern_dump_record_file returned 0x%x\n", ret);
@@ -1255,7 +1262,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
                /* Write the debug log -- first seek to the end of the corefile header */
                foffset = KERN_COREDUMP_HEADERSIZE;
                if ((ret = (*outproc)(KDP_SEEK, NULL, sizeof(foffset), &foffset)) != kIOReturnSuccess) {
-                       kern_coredump_log(NULL, "(do_kern_dump seek logfile) outproc(KDP_SEEK, NULL, %lu, 0x%p) foffset = 0x%llx returned 0x%x\n",
+                       kern_coredump_log(NULL, "(do_kern_dump seek logfile) outproc(KDP_SEEK, NULL, %lu, %p) foffset = 0x%llx returned 0x%x\n",
                                        sizeof(foffset), &foffset, foffset, ret);
                        dump_succeeded = FALSE;
                        goto exit;
@@ -1281,7 +1288,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
                 */
                buf = debug_buf_base;
                if ((ret = (*outproc)(KDP_DATA, NULL, existing_log_size, buf)) != kIOReturnSuccess) {
-                               kern_coredump_log(NULL, "(do_kern_dump paniclog) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n",
+                               kern_coredump_log(NULL, "(do_kern_dump paniclog) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n",
                                                existing_log_size, buf, ret);
                                dump_succeeded = FALSE;
                                goto exit;
@@ -1302,7 +1309,7 @@ do_kern_dump(kern_dump_output_proc outproc, enum kern_dump_type kd_variant)
 
                /* Write the coredump log */
                if ((ret = (*outproc)(KDP_DATA, NULL, new_log_len, buf)) != kIOReturnSuccess) {
-                       kern_coredump_log(NULL, "(do_kern_dump coredump log) outproc(KDP_DATA, NULL, %lu, 0x%p) returned 0x%x\n",
+                       kern_coredump_log(NULL, "(do_kern_dump coredump log) outproc(KDP_DATA, NULL, %lu, %p) returned 0x%x\n",
                                        new_log_len, buf, ret);
                        dump_succeeded = FALSE;
                        goto exit;
@@ -1382,17 +1389,18 @@ kern_dump(enum kern_dump_type kd_variant)
 }
 
 #if CONFIG_EMBEDDED
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wmissing-noreturn"
 void
 panic_spin_shmcon()
 {
-#pragma clang diagnostic pop
+       if (hwsd_info == NULL) {
+               kern_coredump_log(NULL, "handshake structure not initialized\n");
+               return;
+       }
+
        kern_coredump_log(NULL, "\nPlease go to https://panic.apple.com to report this panic\n");
        kern_coredump_log(NULL, "Waiting for hardware shared memory debugger, handshake structure is at virt: %p, phys %p\n",
                        hwsd_info, (void *)kvtophys((vm_offset_t)hwsd_info));
 
-       assert(hwsd_info != NULL);
        hwsd_info->xhsdci_status = XHSDCI_STATUS_KERNEL_READY;
        hwsd_info->xhsdci_seq_no = 0;
        FlushPoC_DcacheRegion((vm_offset_t) hwsd_info, sizeof(*hwsd_info));
index 45cee67b1f23662a589fa078374f8d70f04559c1..7e0b17cfd6cb4dd3f09be2b64429b273d3a7627f 100644 (file)
@@ -156,6 +156,7 @@ void kern_collectth_state_size(uint64_t * tstate_count, uint64_t * tstate_size);
 void kern_collectth_state(thread_t thread, void *buffer, uint64_t size, void **iter);
 
 boolean_t kdp_has_polled_corefile(void);
+kern_return_t kdp_polled_corefile_error(void);
 
 void kdp_core_init(void);
 
index 1bf25ac74fd9c3711d9224608162c200b9a20936..4e7fa06392ddbb2eb5c36637474b99ee1fe4faaf 100644 (file)
@@ -65,7 +65,8 @@ int machine_trace_thread64(thread_t thread,
                            int nframes,
                            boolean_t user_p,
                            boolean_t trace_fp,
-                           uint32_t * thread_trace_flags);
+                           uint32_t * thread_trace_flags,
+                                                  uint64_t *sp);
 
 void kdp_trap(unsigned int, struct arm_saved_state * saved_state);
 
@@ -495,11 +496,22 @@ machine_trace_thread(thread_t thread,
                                        if(target_cpu_datap == (cpu_data_t *)NULL)
                                                continue;
                                        
-                                       if ((prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) ||
-                                               (prevfp >= (target_cpu_datap->fiqstack_top-PAGE_SIZE) && prevfp < target_cpu_datap->fiqstack_top)) {
+                                       if (prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) {
                                                prev_in_interrupt_stack = TRUE;
                                                break;
                                        }
+
+#if defined(__arm__)
+                                       if (prevfp >= (target_cpu_datap->fiqstack_top-FIQSTACK_SIZE) && prevfp < target_cpu_datap->fiqstack_top) {
+                                               prev_in_interrupt_stack = TRUE;
+                                               break;
+                                       }
+#elif defined(__arm64__)
+                                       if (prevfp >= (target_cpu_datap->excepstack_top-EXCEPSTACK_SIZE) && prevfp < target_cpu_datap->excepstack_top) {
+                                               prev_in_interrupt_stack = TRUE;
+                                               break;
+                                       }
+#endif
                                }
                        }
 
@@ -555,8 +567,10 @@ machine_trace_thread64(thread_t thread,
                        int nframes,
                        boolean_t user_p,
                        boolean_t trace_fp,
-                       uint32_t * thread_trace_flags)
+                       uint32_t * thread_trace_flags,
+                                          uint64_t *sp_out)
 {
+#pragma unused(sp_out)
 #if defined(__arm__)
 #pragma unused(thread, tracepos, tracebound, nframes, user_p, trace_fp, thread_trace_flags)
        return 0;
@@ -577,6 +591,8 @@ machine_trace_thread64(thread_t thread,
        vm_offset_t kern_virt_addr    = 0;
        vm_map_t bt_vm_map            = VM_MAP_NULL;
 
+       const boolean_t is_64bit_addr = thread_is_64bit_addr(thread);
+
        nframes = (tracebound > tracepos) ? MIN(nframes, (int)((tracebound - tracepos) / framesize)) : 0;
        if (!nframes) {
                return (0);
@@ -586,8 +602,8 @@ machine_trace_thread64(thread_t thread,
        if (user_p) {
                /* Examine the user savearea */
                state = thread->machine.upcb;
-               stacklimit = MACH_VM_MAX_ADDRESS;
-               stacklimit_bottom = MACH_VM_MIN_ADDRESS;
+               stacklimit = (is_64bit_addr) ? MACH_VM_MAX_ADDRESS : VM_MAX_ADDRESS;
+               stacklimit_bottom = (is_64bit_addr) ? MACH_VM_MIN_ADDRESS : VM_MIN_ADDRESS;
 
                /* Fake up a stack frame for the PC */
                *tracebuf++ = get_saved_state_pc(state);
@@ -666,12 +682,21 @@ machine_trace_thread64(thread_t thread,
                                        if(target_cpu_datap == (cpu_data_t *)NULL)
                                                continue;
 
-                                       if ((prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) ||
-                                               (prevfp >= (target_cpu_datap->fiqstack_top-PAGE_SIZE) && prevfp < target_cpu_datap->fiqstack_top)) {
+                                       if (prevfp >= (target_cpu_datap->intstack_top-INTSTACK_SIZE) && prevfp < target_cpu_datap->intstack_top) {
                                                switched_stacks = TRUE;
                                                break;
                                        }
-
+#if defined(__arm__)
+                                       if (prevfp >= (target_cpu_datap->fiqstack_top-FIQSTACK_SIZE) && prevfp < target_cpu_datap->fiqstack_top) {
+                                               switched_stacks = TRUE;
+                                               break;
+                                       }
+#elif defined(__arm64__)
+                                       if (prevfp >= (target_cpu_datap->excepstack_top-EXCEPSTACK_SIZE) && prevfp < target_cpu_datap->excepstack_top) {
+                                               switched_stacks = TRUE;
+                                               break;
+                                       }
+#endif
                                }
 
                        }
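Both backtrace routines now test each auxiliary stack separately: the interrupt stack everywhere, the FIQ stack on arm (using FIQSTACK_SIZE rather than the previous PAGE_SIZE assumption), and the exception stack on arm64 (EXCEPSTACK_SIZE). Each test is the same half-open range check on the previous frame pointer; a minimal standalone version:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Is addr within [stack_top - stack_size, stack_top)?
 * Mirrors the intstack/fiqstack/excepstack tests above. */
static bool on_stack(uintptr_t addr, uintptr_t stack_top, size_t stack_size)
{
    return addr >= (stack_top - stack_size) && addr < stack_top;
}

int main(void)
{
    uintptr_t top = 0x10000;
    printf("%d %d %d\n",
        on_stack(0xF000, top, 0x4000),   /* 1: inside the range       */
        on_stack(0x10000, top, 0x4000),  /* 0: the top is excluded    */
        on_stack(0xBFFF, top, 0x4000));  /* 0: below the stack bottom */
    return 0;
}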
index 0d716b5d1202b55806ba53b305eb6a1923324b4d..5cfc3be3335ab0c59b8cc3b93943e764cef2f706 100644 (file)
@@ -586,7 +586,8 @@ machine_trace_thread64(thread_t thread,
                        int nframes,
                        boolean_t user_p,
                        boolean_t trace_fp,
-                       uint32_t * thread_trace_flags)
+                       uint32_t * thread_trace_flags,
+                       uint64_t *sp)
 {
        uint64_t * tracebuf = (uint64_t *)tracepos;
        unsigned framesize  = (trace_fp ? 2 : 1) * sizeof(addr64_t);
@@ -607,6 +608,9 @@ machine_trace_thread64(thread_t thread,
                prev_rip = iss64->isf.rip;
                stackptr = iss64->rbp;
                bt_vm_map = thread->task->map;
+        if (sp && user_p) {
+            *sp = iss64->isf.rsp;
+        }
        }
        else {
                stackptr = STACK_IKS(thread->kernel_stack)->k_rbp;
index f33d9915c57a1115c1db617ba6f193b0bd0d96f6..e1c40e14100bebb9bbddf94bdc692582a7d5800c 100644 (file)
@@ -258,7 +258,7 @@ coredump_save_summary(uint64_t core_segment_count, uint64_t core_byte_count,
                /* Send the core_header to the output procedure */
                ret =  kdp_core_output(core_context->core_outvars, sizeof(core_header), (caddr_t)&core_header);
                if (ret != KERN_SUCCESS) {
-                       kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(0x%p, %lu, 0x%p) returned error 0x%x\n",
+                       kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(%p, %lu, %p) returned error 0x%x\n",
                                        core_context->core_outvars, sizeof(core_header), &core_header, ret);
                        return ret;
                }
@@ -280,7 +280,7 @@ coredump_save_summary(uint64_t core_segment_count, uint64_t core_byte_count,
                /* Send the core_header to the output procedure */
                ret =  kdp_core_output(core_context->core_outvars, sizeof(core_header), (caddr_t)&core_header);
                if (ret != KERN_SUCCESS) {
-                       kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(0x%p, %lu, 0x%p) returned error 0x%x\n",
+                       kern_coredump_log(context, "coredump_save_summary() : failed to write mach header : kdp_core_output(%p, %lu, %p) returned error 0x%x\n",
                                        core_context->core_outvars, sizeof(core_header), &core_header, ret);
                        return ret;
                }
@@ -303,13 +303,13 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end,
        uint64_t size = seg_end - seg_start;
 
        if (seg_end <= seg_start) {
-               kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : called with invalid addresses : start 0x%llx >= end 0x%llx\n",
+               kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : called with invalid addresses : start 0x%llx >= end 0x%llx\n",
                                seg_start, seg_end, context, seg_start, seg_end);
                return KERN_INVALID_ARGUMENT;
        }
 
        if (core_context->core_segments_remaining == 0) {
-               kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : coredump_save_segment_descriptions() called too many times, %llu segment descriptions already recorded\n",
+               kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : coredump_save_segment_descriptions() called too many times, %llu segment descriptions already recorded\n",
                                seg_start, seg_end, context, core_context->core_segment_count);
                return KERN_INVALID_ARGUMENT;
        }
@@ -320,7 +320,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end,
                struct segment_command_64 seg_command = { };
 
                if (core_context->core_cur_hoffset + sizeof(seg_command) > core_context->core_header_size) {
-                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : ran out of space to save commands with %llu of %llu remaining\n",
+                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : ran out of space to save commands with %llu of %llu remaining\n",
                                seg_start, seg_end, context, core_context->core_segments_remaining, core_context->core_segment_count);
                        return KERN_NO_SPACE;
                }
@@ -338,7 +338,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end,
                /* Flush new command to output */
                ret = kdp_core_output(core_context->core_outvars, sizeof(seg_command), (caddr_t)&seg_command);
                if (ret != KERN_SUCCESS) {
-                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : failed to write segment %llu of %llu. kdp_core_output(0x%p, %lu, 0x%p) returned error %d\n",
+                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : failed to write segment %llu of %llu. kdp_core_output(%p, %lu, %p) returned error %d\n",
                                        seg_start, seg_end, context, core_context->core_segment_count - core_context->core_segments_remaining,
                                        core_context->core_segment_count, core_context->core_outvars, sizeof(seg_command), &seg_command, ret);
                        return ret;
@@ -351,13 +351,13 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end,
                struct segment_command seg_command = { };
 
                if (seg_start > UINT32_MAX || seg_end > UINT32_MAX) {
-                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : called with invalid addresses for 32-bit : start 0x%llx, end 0x%llx\n",
+                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : called with invalid addresses for 32-bit : start 0x%llx, end 0x%llx\n",
                                seg_start, seg_end, context, seg_start, seg_end);
                        return KERN_INVALID_ARGUMENT;
                }
 
                if (core_context->core_cur_hoffset + sizeof(seg_command) > core_context->core_header_size) {
-                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : ran out of space to save commands with %llu of %llu remaining\n",
+                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : ran out of space to save commands with %llu of %llu remaining\n",
                                seg_start, seg_end, context, core_context->core_segments_remaining, core_context->core_segment_count);
                        return KERN_NO_SPACE;
                }
@@ -375,7 +375,7 @@ coredump_save_segment_descriptions(uint64_t seg_start, uint64_t seg_end,
                /* Flush new command to output */
                ret = kdp_core_output(core_context->core_outvars, sizeof(seg_command), (caddr_t)&seg_command);
                if (ret != KERN_SUCCESS) {
-                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, 0x%p) : failed to write segment %llu of %llu : kdp_core_output(0x%p, %lu, 0x%p) returned  error 0x%x\n",
+                       kern_coredump_log(context, "coredump_save_segment_descriptions(0x%llx, 0x%llx, %p) : failed to write segment %llu of %llu : kdp_core_output(%p, %lu, %p) returned  error 0x%x\n",
                                        seg_start, seg_end, context, core_context->core_segment_count - core_context->core_segments_remaining,
                                        core_context->core_segment_count, core_context->core_outvars, sizeof(seg_command), &seg_command, ret);
                        return ret;
@@ -404,20 +404,20 @@ coredump_save_thread_state(void *thread_state, void *context)
        int ret;
 
        if (tc->cmd != LC_THREAD) {
-               kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : found %d expected LC_THREAD (%d)\n",
+               kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : found %d expected LC_THREAD (%d)\n",
                                thread_state, context, tc->cmd, LC_THREAD);
                return KERN_INVALID_ARGUMENT;
        }
 
        if (core_context->core_cur_hoffset + core_context->core_thread_state_size > core_context->core_header_size) {
-               kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : ran out of space to save threads with %llu of %llu remaining\n",
+               kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : ran out of space to save threads with %llu of %llu remaining\n",
                                thread_state, context, core_context->core_threads_remaining, core_context->core_thread_count);
                return KERN_NO_SPACE;
        }
 
        ret = kdp_core_output(core_context->core_outvars, core_context->core_thread_state_size, (caddr_t)thread_state);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(context, "coredump_save_thread_state(0x%p, 0x%p) : failed to write thread data : kdp_core_output(0x%p, %llu, 0x%p) returned 0x%x\n",
+               kern_coredump_log(context, "coredump_save_thread_state(%p, %p) : failed to write thread data : kdp_core_output(%p, %llu, %p) returned 0x%x\n",
                                thread_state, context, core_context->core_outvars, core_context->core_thread_state_size, thread_state, ret);
                return ret;
        }
@@ -436,13 +436,13 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context)
        int ret;
 
        if (length > KERN_COREDUMP_VERSIONSTRINGMAXSIZE || !length) {
-               kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : called with invalid length %llu\n",
+               kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : called with invalid length %llu\n",
                                sw_vers, length, context, length);
                return KERN_INVALID_ARGUMENT;
        }
 
        if (core_context->core_cur_hoffset + sizeof(struct ident_command) + length > core_context->core_header_size) {
-               kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : ran out of space to save data\n",
+               kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : ran out of space to save data\n",
                                sw_vers, length, context);
                return KERN_NO_SPACE;
        }
@@ -451,14 +451,14 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context)
        ident.cmdsize = (uint32_t)(sizeof(struct ident_command) + KERN_COREDUMP_VERSIONSTRINGMAXSIZE);
        ret = kdp_core_output(core_context->core_outvars, sizeof(struct ident_command), (caddr_t)&ident);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write ident command : kdp_core_output(0x%p, %lu, 0x%p) returned 0x%x\n",
+               kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to write ident command : kdp_core_output(%p, %lu, %p) returned 0x%x\n",
                                sw_vers, length, context, core_context->core_outvars, sizeof(struct ident_command), &ident, ret);
                return ret;
        }
 
        ret = kdp_core_output(core_context->core_outvars, length, (caddr_t)sw_vers);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write version string : kdp_core_output(0x%p, %llu, 0x%p) returned 0x%x\n",
+               kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to write version string : kdp_core_output(%p, %llu, %p) returned 0x%x\n",
                                sw_vers, length, context, core_context->core_outvars, length, sw_vers, ret);
                return ret;
        }
@@ -467,7 +467,7 @@ coredump_save_sw_vers(void *sw_vers, uint64_t length, void *context)
                /* Zero fill to the full command size */
                ret = kdp_core_output(core_context->core_outvars, (KERN_COREDUMP_VERSIONSTRINGMAXSIZE - length), NULL);
                if (ret != KERN_SUCCESS) {
-                       kern_coredump_log(context, "coredump_save_sw_vers(0x%p, %llu, 0x%p) : failed to write zero fill padding : kdp_core_output(0x%p, %llu, NULL) returned 0x%x\n",
+                       kern_coredump_log(context, "coredump_save_sw_vers(%p, %llu, %p) : failed to write zero fill padding : kdp_core_output(%p, %llu, NULL) returned 0x%x\n",
                                        sw_vers, length, context, core_context->core_outvars, (KERN_COREDUMP_VERSIONSTRINGMAXSIZE - length), ret);
                        return ret;
                }
@@ -485,7 +485,7 @@ coredump_save_segment_data(void *seg_data, uint64_t length, void *context)
        processor_core_context *core_context = (processor_core_context *)context;
 
        if (length > core_context->core_segment_bytes_remaining) {
-               kern_coredump_log(context, "coredump_save_segment_data(0x%p, %llu, 0x%p) : called with too much data, %llu written, %llu left\n",
+               kern_coredump_log(context, "coredump_save_segment_data(%p, %llu, %p) : called with too much data, %llu written, %llu left\n",
                                seg_data, length, context, core_context->core_segment_byte_total - core_context->core_segment_bytes_remaining,
                                core_context->core_segment_bytes_remaining);
                return KERN_INVALID_ARGUMENT;
@@ -493,7 +493,7 @@ coredump_save_segment_data(void *seg_data, uint64_t length, void *context)
 
        ret = kdp_core_output(core_context->core_outvars, length, (caddr_t)seg_data);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(context, "coredump_save_segment_data(0x%p, %llu, 0x%p) : failed to write data (%llu bytes remaining) :%d\n",
+               kern_coredump_log(context, "coredump_save_segment_data(%p, %llu, %p) : failed to write data (%llu bytes remaining) :%d\n",
                                seg_data, length, context, core_context->core_segment_bytes_remaining, ret);
                return ret;
        }
@@ -595,7 +595,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor
        /* Zero fill between the end of the header and the beginning of the segment data file offset */
        ret = kdp_core_output(context.core_outvars, (round_page(context.core_header_size) - context.core_header_size), NULL);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(&context, "(kern_coredump_routine) : failed to write zero fill padding (%llu bytes remaining) : kdp_core_output(0x%p, %llu, NULL) returned 0x%x\n",
+               kern_coredump_log(&context, "(kern_coredump_routine) : failed to write zero fill padding (%llu bytes remaining) : kdp_core_output(%p, %llu, NULL) returned 0x%x\n",
                                context.core_segment_bytes_remaining, context.core_outvars, (round_page(context.core_header_size) - context.core_header_size), ret);
                return ret;
        }
@@ -618,7 +618,7 @@ kern_coredump_routine(void *core_outvars, struct kern_coredump_core *current_cor
        /* Flush the last data out */
        ret = kdp_core_output(context.core_outvars, 0, NULL);
        if (ret != KERN_SUCCESS) {
-               kern_coredump_log(&context, "(kern_coredump_routine) : failed to flush final core data : kdp_core_output(0x%p, 0, NULL) returned 0x%x\n",
+               kern_coredump_log(&context, "(kern_coredump_routine) : failed to flush final core data : kdp_core_output(%p, 0, NULL) returned 0x%x\n",
                                context.core_outvars, ret);
                return ret;
        }
index dcde0c50eec202517bec8a7cae367df0bf6605ce..bf4cc9197a4f8b5daa9ab4d4e43420419882d016 100644 (file)
@@ -15,11 +15,13 @@ DATAFILES = \
 
 PRIVATE_DATAFILES = \
        cs_blobs.h \
+       trustcache.h \
        debug.h \
        ecc.h \
        block_hint.h \
        monotonic.h \
-       arithmetic_128.h
+       arithmetic_128.h  \
+       turnstile.h
 
 EXPORT_FILES = \
        affinity.h \
@@ -27,6 +29,7 @@ EXPORT_FILES = \
        audit_sessionport.h \
        backtrace.h \
        bits.h \
+       btlog.h \
        call_entry.h \
        clock.h \
        coalition.h \
@@ -52,6 +55,7 @@ EXPORT_FILES = \
        policy_internal.h \
        processor.h \
        queue.h \
+       priority_queue.h \
        sched_prim.h \
        sfi.h \
        simple_lock.h \
@@ -72,6 +76,11 @@ PRIVATE_EXPORT_FILES = \
        copyout_shim.h
 
 
+XNU_ONLY_EXPORTS = \
+       cpu_quiesce.h \
+       ipc_kobject.h \
+       ux_handler.h
+
 INSTALL_MI_LIST = ${DATAFILES}
 
 INSTALL_MI_LCL_LIST = ${PRIVATE_DATAFILES}
@@ -80,7 +89,7 @@ INSTALL_KF_MI_LCL_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ${PRI
 
 INSTALL_MI_DIR = kern
 
-EXPORT_MI_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ipc_kobject.h
+EXPORT_MI_LIST = ${DATAFILES} ${PRIVATE_DATAFILES} ${EXPORT_FILES} ${XNU_ONLY_EXPORTS}
 
 EXPORT_MI_DIR = kern
 
index 8f282ce580f316b6efb6dbfc893a38328dbf96c8..a75d45dff9c3e77a9446132d28522124ea69efd9 100644 (file)
@@ -56,6 +56,7 @@
 
 #include <kern/ast.h>
 #include <kern/counters.h>
+#include <kern/cpu_quiesce.h>
 #include <kern/misc_protos.h>
 #include <kern/queue.h>
 #include <kern/sched_prim.h>
@@ -297,7 +298,28 @@ ast_taken_user(void)
                }
        }
 
+       if (ast_consume(AST_UNQUIESCE) == AST_UNQUIESCE) {
+               cpu_quiescent_counter_ast();
+       }
+
+       cpu_quiescent_counter_assert_ast();
+
        splx(s);
+
+       /*
+        * Here's a good place to put assertions of things which must be true
+        * upon return to userspace.
+        */
+       assert((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_DEPRESS) == 0);
+
+       assert(thread->promotions == 0);
+       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->waiting_for_mutex == NULL);
+       assert(thread->rwlock_count == 0);
 }
 
 /*
index d41821e2d3bb85f19d804198cbca24e07df0a154..1fd6916ab08746ac93abf4a03917ec8b1f5eadc3 100644 (file)
@@ -122,11 +122,13 @@ typedef uint32_t ast_t;
 #define AST_GUARD              0x1000
 #define AST_TELEMETRY_USER     0x2000  /* telemetry sample requested on interrupt from userspace */
 #define AST_TELEMETRY_KERNEL   0x4000  /* telemetry sample requested on interrupt from kernel */
+#define AST_TELEMETRY_PMI      0x8000  /* telemetry sample requested on PMI */
 #define AST_SFI                        0x10000 /* Evaluate if SFI wait is needed before return to userspace */
 #define AST_DTRACE             0x20000
 #define AST_TELEMETRY_IO       0x40000 /* telemetry sample requested for I/O */
 #define AST_KEVENT             0x80000
 #define AST_REBALANCE           0x100000 /* thread context switched due to rebalancing */
+#define AST_UNQUIESCE           0x200000 /* catch unquiesced processor before returning to userspace */
 
 #define AST_NONE               0x00
 #define AST_ALL                        (~AST_NONE)
@@ -134,7 +136,8 @@ typedef uint32_t ast_t;
 #define AST_SCHEDULING (AST_PREEMPTION | AST_YIELD | AST_HANDOFF)
 #define AST_PREEMPTION (AST_PREEMPT | AST_QUANTUM | AST_URGENT)
 
-#define AST_TELEMETRY_ALL      (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | AST_TELEMETRY_IO)
+#define AST_TELEMETRY_ALL (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL | \
+               AST_TELEMETRY_PMI | AST_TELEMETRY_IO)
 
 /* Per-thread ASTs follow the thread at context-switch time. */
 #define AST_PER_THREAD (AST_APC | AST_BSD | AST_MACF | AST_LEDGER | AST_GUARD | AST_TELEMETRY_ALL | AST_KEVENT)
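AST_TELEMETRY_PMI and AST_UNQUIESCE extend the AST bitmask, and AST_TELEMETRY_ALL is reflowed to fold in the new PMI bit. Checking and clearing such a bit is plain mask arithmetic, as in the ast_consume(AST_UNQUIESCE) call added to ast_taken_user() above; a minimal sketch (the consume helper below is a stand-in, not the kernel's ast_consume()):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t ast_t;

#define AST_TELEMETRY_PMI 0x8000
#define AST_UNQUIESCE     0x200000

/* Stand-in for ast_consume(): return the requested bits and clear them. */
static ast_t ast_consume(ast_t *pending, ast_t mask)
{
    ast_t hit = *pending & mask;
    *pending &= ~mask;
    return hit;
}

int main(void)
{
    ast_t pending = AST_UNQUIESCE | AST_TELEMETRY_PMI;

    if (ast_consume(&pending, AST_UNQUIESCE) == AST_UNQUIESCE)
        printf("would run cpu_quiescent_counter_ast()\n");

    printf("remaining ASTs: 0x%x\n", pending);  /* 0x8000 */
    return 0;
}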
index b47ce7940b41e79f4a5d365c3abc4c19d9ba9028..0588970f0aea850a956272a3694a82c6b3ddbd66 100644 (file)
@@ -41,6 +41,7 @@
 #endif
 
 
+
 uint32_t __attribute__((noinline))
 backtrace(uintptr_t *bt, uint32_t max_frames)
 {
@@ -84,6 +85,7 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
 
        while (fp != NULL && frame_index < max_frames) {
                uintptr_t *next_fp = (uintptr_t *)*fp;
+               uintptr_t ret_addr = *(fp + 1); /* return address is one word higher than frame pointer */
 
                /*
                 * If the frame pointer is 0, backtracing has reached the top of
@@ -97,8 +99,7 @@ backtrace_frame(uintptr_t *bt, uint32_t max_frames, void *start_frame)
                        break;
                }
 
-               /* return address is one word higher than frame pointer */
-               bt[frame_index++] = *(fp + 1);
+               bt[frame_index++] = ret_addr;
 
                /* stacks grow down; backtracing should be moving to higher addresses */
                if (next_fp <= fp) {
@@ -218,7 +219,7 @@ backtrace_interrupted(uintptr_t *bt, uint32_t max_frames)
                return 1;
        }
 
-       return backtrace_frame(bt + 1, max_frames - 1, (void *)fp);
+       return backtrace_frame(bt + 1, max_frames - 1, (void *)fp) + 1;
 }
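backtrace_interrupted() stores the interrupted PC in bt[0] and hands the rest of the buffer to backtrace_frame(), so the added + 1 keeps the returned frame count in step with what was actually written. The walk itself follows saved frame pointers, where fp[0] holds the caller's frame pointer and fp[1] the return address, and stops once the chain stops moving to higher addresses. A compilable sketch of that walk over the current thread's own frames (it assumes frame pointers are kept, so treat it as illustrative):

#include <stdint.h>
#include <stdio.h>

/* Walk saved frame pointers: fp[0] is the caller's fp, fp[1] the return
 * address.  Stop when the chain ends or stops growing upward. */
static uint32_t walk_frames(uintptr_t *bt, uint32_t max, uintptr_t *fp)
{
    uint32_t n = 0;
    while (fp != NULL && n < max) {
        uintptr_t *next_fp = (uintptr_t *)*fp;
        uintptr_t ret_addr = *(fp + 1);
        if (next_fp == NULL)
            break;
        bt[n++] = ret_addr;
        if (next_fp <= fp)  /* stacks grow down; bail if not ascending */
            break;
        fp = next_fp;
    }
    return n;
}

int main(void)
{
    uintptr_t bt[16];
    uint32_t n = walk_frames(bt, 16, __builtin_frame_address(0));
    for (uint32_t i = 0; i < n; i++)
        printf("frame %u: %p\n", i, (void *)bt[i]);
    return 0;
}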
 
 int
@@ -235,16 +236,11 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
 {
        bool user_64;
        uintptr_t pc, fp, next_fp;
-       vm_map_t map, old_map;
+       vm_map_t map = NULL, old_map = NULL;
        uint32_t frame_index = 0;
        int err = 0;
        size_t frame_size;
 
-       assert(ml_get_interrupts_enabled() == TRUE);
-       if (!ml_get_interrupts_enabled()) {
-               return EINVAL;
-       }
-
        assert(bt != NULL);
        assert(max_frames > 0);
        assert(frames_out != NULL);
@@ -302,15 +298,23 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
 #error "backtrace_thread_user: unsupported architecture"
 #endif /* !defined(__arm__) */
 
-       /* switch to the correct map, for copyin */
-       if (thread != current_thread()) {
-               map = get_task_map_reference(get_threadtask(thread));
-               if (map == NULL) {
-                       return EINVAL;
-               }
-               old_map = vm_map_switch(map);
-       } else {
-               map = NULL;
+       if (max_frames == 0) {
+               goto out;
+       }
+
+       bt[frame_index++] = pc;
+
+       if (frame_index >= max_frames) {
+               goto out;
+       }
+
+       if (INVALID_USER_FP(fp)) {
+               goto out;
+       }
+
+       assert(ml_get_interrupts_enabled() == TRUE);
+       if (!ml_get_interrupts_enabled()) {
+               return EINVAL;
        }
 
        union {
@@ -323,12 +327,18 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
                        uint32_t ret;
                } u32;
        } frame;
-       frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t));
 
-       bt[frame_index++] = pc;
+       frame_size = 2 * (user_64 ? sizeof(uint64_t) : sizeof(uint32_t));
 
-       if (INVALID_USER_FP(fp)) {
-               goto out;
+       /* switch to the correct map, for copyin */
+       if (thread != current_thread()) {
+               map = get_task_map_reference(get_threadtask(thread));
+               if (map == NULL) {
+                       return EINVAL;
+               }
+               old_map = vm_map_switch(map);
+       } else {
+               map = NULL;
        }
 
        while (fp != 0 && frame_index < max_frames) {
@@ -343,7 +353,8 @@ backtrace_thread_user(void *thread, uintptr_t *bt, uint32_t max_frames,
                        break;
                }
 
-               bt[frame_index++] = user_64 ? frame.u64.ret : frame.u32.ret;
+               uintptr_t ret_addr = user_64 ? frame.u64.ret : frame.u32.ret;
+               bt[frame_index++] = ret_addr;
 
                /* stacks grow down; backtracing should be moving to higher addresses */
                if (next_fp <= fp) {
index 5c977497d662ddfbea1f9b7699b83b356fe5e41b..13ce948d2ad4313f9216d0df457c489101ce96fa 100644 (file)
@@ -39,7 +39,7 @@ typedef unsigned int                  uint;
 
 #define BIT(b)                         (1ULL << (b))
 
-#define mask(width)                    (BIT(width) - 1)
+#define mask(width)                    (width >= 64 ? -1 : (BIT(width) - 1))
 #define extract(x, shift, width)       ((((uint64_t)(x)) >> (shift)) & mask(width))
 #define bits(x, hi, lo)                        extract((x), (lo), (hi) - (lo) + 1)
 
@@ -47,6 +47,31 @@ typedef unsigned int                 uint;
 #define bit_clear(x, b)                        ((x) &= ~BIT(b))
 #define bit_test(x, b)                 ((bool)((x) & BIT(b)))
 
+inline static uint64_t
+bit_ror64(uint64_t bitmap, uint n)
+{
+#if defined(__arm64__)
+       uint64_t result;
+       uint64_t _n = (uint64_t)n;
+       asm volatile("ror %0, %1, %2" : "=r" (result) : "r" (bitmap), "r" (_n));
+       return result;
+#else
+       n = n & 63;
+       return ((bitmap >> n) | (bitmap << (64 - n)));
+#endif
+}
+
+inline static uint64_t
+bit_rol64(uint64_t bitmap, uint n)
+{
+#if defined(__arm64__)
+       return bit_ror64(bitmap, 64U - n);
+#else
+       n = n & 63;
+       return ((bitmap << n) | (bitmap >> (64 - n)));
+#endif
+}
+
 /* Non-atomically clear the bit and returns whether the bit value was changed */
 inline static bool
 bit_clear_if_set(uint64_t bitmap, int bit)
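The mask(width) change guards the width >= 64 case, where BIT(width) would otherwise shift a 64-bit value by its full width (undefined behaviour), and the new bit_ror64()/bit_rol64() helpers rotate 64-bit bitmaps, using the arm64 ror instruction where available and a shift pair otherwise. A quick userspace check of the portable forms:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned int uint;

/* Portable paths from the hunk above; note that a rotate count that is a
 * multiple of 64 would shift by 64 here, so avoid it on this path. */
static uint64_t bit_ror64(uint64_t bitmap, uint n)
{
    n = n & 63;
    return (bitmap >> n) | (bitmap << (64 - n));
}

static uint64_t bit_rol64(uint64_t bitmap, uint n)
{
    n = n & 63;
    return (bitmap << n) | (bitmap >> (64 - n));
}

int main(void)
{
    uint64_t x = 0x0123456789abcdefULL;

    assert(bit_rol64(bit_ror64(x, 13), 13) == x);  /* rotations invert each other */
    assert(bit_ror64(1ULL, 1) == (1ULL << 63));    /* low bit wraps to the top    */

    printf("ror 4: 0x%016llx\n", (unsigned long long)bit_ror64(x, 4));
    return 0;
}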
index c52b9488f0db77c065c00491bfb3d5f0f9d14d4f..f379d0850f45c60243555f95f2a2d518586da7ef 100644 (file)
@@ -29,7 +29,6 @@
 #ifndef        _KERN_BLOCK_HINT_H_
 #define _KERN_BLOCK_HINT_H_
 
-/* This must fit inside a short  */
 typedef enum thread_snapshot_wait_flags {
        kThreadWaitNone                 = 0x00,
        kThreadWaitKernelMutex          = 0x01,
@@ -48,8 +47,12 @@ typedef enum thread_snapshot_wait_flags {
        kThreadWaitPThreadCondVar       = 0x0e,
        kThreadWaitParkedWorkQueue      = 0x0f,
        kThreadWaitWorkloopSyncWait     = 0x10,
+       kThreadWaitOnProcess            = 0x11,
 } __attribute__((packed)) block_hint_t;
 
+_Static_assert(sizeof(block_hint_t) <= sizeof(short),
+               "block_hint_t must fit within a short");
+
 #ifdef XNU_KERNEL_PRIVATE
 
 struct waitq;
@@ -66,6 +69,7 @@ extern void kdp_rwlck_find_owner(struct waitq * waitq, event64_t event, thread_w
 extern void kdp_pthread_find_owner(thread_t thread, thread_waitinfo_t *waitinfo);
 extern void *kdp_pthread_get_thread_kwq(thread_t thread);
 extern void kdp_workloop_sync_wait_find_owner(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo);
+extern void kdp_wait4_find_process(thread_t thread, event64_t event, thread_waitinfo_t *waitinfo);
 
 #endif /* XNU_KERNEL_PRIVATE */
 
index f70820520a4b8064a2b4ce4c950102a12cb53037..d017ae5203fb76f2eb623143b8d3e42493aef797 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -308,7 +308,7 @@ int  get_task_numacts(task_t t)
 /* does this machine need  64bit register set for signal handler */
 int is_64signalregset(void)
 {
-       if (task_has_64BitData(current_task())) {
+       if (task_has_64Bit_data(current_task())) {
                return(1);
        }
 
@@ -430,21 +430,24 @@ uint64_t get_task_phys_footprint(task_t task)
        return 0;
 }
 
+#if CONFIG_LEDGER_INTERVAL_MAX
 /*
  *
  */
-uint64_t get_task_phys_footprint_recent_max(task_t task)
+uint64_t get_task_phys_footprint_interval_max(task_t task, int reset)
 {
        kern_return_t ret;
        ledger_amount_t max;
 
-       ret = ledger_get_recent_max(task->ledger, task_ledgers.phys_footprint, &max);
-       if (KERN_SUCCESS == ret) {
+       ret = ledger_get_interval_max(task->ledger, task_ledgers.phys_footprint, &max, reset);
+
+       if (KERN_SUCCESS == ret) {
                return max;
        }
 
        return 0;
 }
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
 
 /*
  *
@@ -583,6 +586,46 @@ uint64_t get_task_iokit_mapped(task_t task)
        return 0;
 }
 
+uint64_t get_task_network_nonvolatile(task_t task)
+{
+    kern_return_t ret;
+    ledger_amount_t credit, debit;
+
+    ret = ledger_get_entries(task->ledger, task_ledgers.network_nonvolatile, &credit, &debit);
+    if (KERN_SUCCESS == ret) {
+        return (credit - debit);
+    }
+
+    return 0;
+}
+
+uint64_t get_task_network_nonvolatile_compressed(task_t task)
+{
+    kern_return_t ret;
+    ledger_amount_t credit, debit;
+
+    ret = ledger_get_entries(task->ledger, task_ledgers.network_nonvolatile_compressed, &credit, &debit);
+    if (KERN_SUCCESS == ret) {
+        return (credit - debit);
+    }
+
+    return 0;
+}
+
+uint64_t get_task_wired_mem(task_t task)
+{
+    kern_return_t ret;
+    ledger_amount_t credit, debit;
+
+    ret = ledger_get_entries(task->ledger, task_ledgers.wired_mem, &credit, &debit);
+    if (KERN_SUCCESS == ret) {
+        return (credit - debit);
+    }
+
+    return 0;
+}
+
+
 uint64_t get_task_cpu_time(task_t task)
 {
        kern_return_t ret;
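The accessors added above (network_nonvolatile, network_nonvolatile_compressed, wired_mem) all follow the same ledger pattern: an entry's balance is its credit minus its debit, with 0 returned when the entry cannot be read. A generic sketch of that pattern follows; the helper name is hypothetical and not part of the kernel:

    static uint64_t
    get_task_ledger_balance(task_t task, int entry)    /* hypothetical helper */
    {
            ledger_amount_t credit, debit;

            if (ledger_get_entries(task->ledger, entry, &credit, &debit) == KERN_SUCCESS) {
                    return (credit - debit);
            }
            return 0;
    }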
@@ -885,7 +928,7 @@ fill_taskprocinfo(task_t task, struct proc_taskinfo_internal * ptinfo)
 }
 
 int 
-fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp)
+fill_taskthreadinfo(task_t task, uint64_t thaddr, bool thuniqueid, struct proc_threadinfo_internal * ptinfo, void * vpp, int *vidp)
 {
        thread_t  thact;
        int err=0;
@@ -898,7 +941,7 @@ fill_taskthreadinfo(task_t task, uint64_t thaddr, int thuniqueid, struct proc_th
 
        for (thact  = (thread_t)(void *)queue_first(&task->threads);
                        !queue_end(&task->threads, (queue_entry_t)thact); ) {
-               addr = (thuniqueid==0)?thact->machine.cthread_self: thact->thread_id;
+               addr = (thuniqueid) ? thact->thread_id : thact->machine.cthread_self;
                if (addr == thaddr)
                {
                
@@ -935,7 +978,7 @@ out:
 }
 
 int
-fill_taskthreadlist(task_t task, void * buffer, int thcount)
+fill_taskthreadlist(task_t task, void * buffer, int thcount, bool thuniqueid)
 {
        int numthr=0;
        thread_t thact;
@@ -948,7 +991,7 @@ fill_taskthreadlist(task_t task, void * buffer, int thcount)
 
        for (thact  = (thread_t)(void *)queue_first(&task->threads);
                        !queue_end(&task->threads, (queue_entry_t)thact); ) {
-               thaddr = thact->machine.cthread_self;
+               thaddr = (thuniqueid) ? thact->thread_id : thact->machine.cthread_self;
                *uptr++ = thaddr;
                numthr++;
                if (numthr >= thcount)
index 80a4799615a41d2e371687465409a045c0ae6560..a15aef98046324084695bb0f9d2c8395568ec9b7 100644 (file)
@@ -837,4 +837,66 @@ btlog_copy_backtraces_for_elements(btlog_t      * btlog,
        btlog_unlock(btlog);
 }
 
+/*
+ * Returns the number of records in the btlog struct.
+ *
+ * Called by the mach_zone_get_btlog_records() MIG routine.
+ */
+size_t
+get_btlog_records_count(btlog_t *btlog)
+{
+       if (btlog->btlog_buffersize < sizeof(btlog_t)) {
+               return 0;
+       }
+       return ((btlog->btlog_buffersize - sizeof(btlog_t))/btlog->btrecord_size);
+}
+
+/*
+ * Copies out relevant info from btlog_record_t's to zone_btrecord_t's. 'numrecs' points to the number of records
+ * the 'records' buffer can hold. Upon return 'numrecs' points to the number of records actually copied out.
+ *
+ * Called by the mach_zone_get_btlog_records() MIG routine.
+ */
+void
+get_btlog_records(btlog_t *btlog, zone_btrecord_t *records, unsigned int *numrecs)
+{
+       unsigned int count, recs_copied, frame;
+       zone_btrecord_t *current_rec;
+       btlog_record_t *zstack_record;
+       btlog_recordindex_t     zstack_index = BTLOG_RECORDINDEX_NONE;
+
+       btlog_lock(btlog);
+
+       count = 0;
+       if (btlog->btlog_buffersize > sizeof(btlog_t)) {
+               count = (unsigned int)((btlog->btlog_buffersize - sizeof(btlog_t))/btlog->btrecord_size);
+       }
+       /* Copy out only as many records as the pre-allocated buffer size permits. */
+       if (count > *numrecs) {
+               count = *numrecs;
+       }
+       zstack_index = btlog->head;
+
+       current_rec = &records[0];
+       recs_copied = 0;
+       while (recs_copied < count && (zstack_index != BTLOG_RECORDINDEX_NONE)) {
+               zstack_record = lookup_btrecord(btlog, zstack_index);
+               current_rec->operation_type = (uint32_t)(zstack_record->operation);
+               current_rec->ref_count = zstack_record->ref_count;
+
+               frame = 0;
+               while (frame < MIN(btlog->btrecord_btdepth, MAX_ZTRACE_DEPTH)) {
+                       current_rec->bt[frame] = (uint64_t)VM_KERNEL_UNSLIDE(zstack_record->bt[frame]);
+                       frame++;
+               }
+
+               zstack_index = zstack_record->next;
+               recs_copied++;
+               current_rec++;
+       }
+       *numrecs = recs_copied;
+
+       btlog_unlock(btlog);
+}
+
 #endif  /* DEBUG || DEVELOPMENT */
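The record count above is plain arithmetic on the log buffer. A worked example with hypothetical sizes (none of these numbers come from the source):

    size_t buffersize  = 64 * 1024;    /* btlog->btlog_buffersize, assumed */
    size_t header_size = 128;          /* sizeof(btlog_t), assumed         */
    size_t record_size = 104;          /* btlog->btrecord_size, assumed    */
    size_t count = (buffersize - header_size) / record_size;    /* 628 records */

get_btlog_records() then copies out at most min(count, *numrecs) records, starting at btlog->head and following each record's next index.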
index c9e937b60dc44fa763fbb90b634ea382696aad8d..3930703abe138d3d4445cbbb96660229f1d3025f 100644 (file)
@@ -33,6 +33,7 @@
 #include <kern/debug.h>
 #include <sys/cdefs.h>
 #include <stdint.h>
+#include <mach_debug/zone_info.h>
 
 #ifdef XNU_KERNEL_PRIVATE
 
@@ -87,6 +88,13 @@ void btlog_copy_backtraces_for_elements(btlog_t      * btlog,
                                         uint32_t       zoneSize,
                                         leak_site_proc proc,
                                         void         * refCon);
+
+size_t get_btlog_records_count(btlog_t *btlog);
+
+void get_btlog_records(btlog_t *btlog,
+                       zone_btrecord_t *records,
+                       unsigned int *numrecs);
+
 #endif  /* DEBUG || DEVELOPMENT */
 
 #endif /* XNU_KERNEL_PRIVATE */
index 9bd9f3b0e8249889767ff787b4c489306b4123dd..2cd05c562cf1a95512f49898190eece0d2014848 100644 (file)
@@ -272,14 +272,6 @@ static struct clock_calend {
 
 static uint64_t ticks_per_sec; /* ticks in a second (expressed in abs time) */
 
-#if DEVELOPMENT || DEBUG
-clock_sec_t last_utc_sec = 0;
-clock_usec_t last_utc_usec = 0;
-clock_sec_t max_utc_sec = 0;
-clock_sec_t last_sys_sec = 0;
-clock_usec_t last_sys_usec = 0;
-#endif
-
 #if DEVELOPMENT || DEBUG
 extern int g_should_log_clock_adjustments;
 
@@ -704,24 +696,6 @@ clock_gettimeofday_and_absolute_time(
        }
 }
 
-static void
-update_basesleep(struct bintime delta, bool forward)
-{
-       /*
-        * Update basesleep only if the platform does not have monotonic clock.
-        * In that case the sleep time computation will use the PMU time
-        * which offset gets modified by settimeofday.
-        * We don't need this for mononic clock because in that case the sleep
-        * time computation is independent from the offset value of the PMU.
-        */
-       if (!has_monotonic_clock) {
-               if (forward)
-                       bintime_add(&clock_calend.basesleep, &delta);
-               else
-                       bintime_sub(&clock_calend.basesleep, &delta);
-       }
-}
-
 /*
  *     clock_set_calendar_microtime:
  *
@@ -792,34 +766,19 @@ clock_set_calendar_microtime(
 
                TIME_SUB(deltasecs, oldsecs, deltamicrosecs, oldmicrosecs, USEC_PER_SEC);
 
-#if DEVELOPMENT || DEBUG
-               if (g_should_log_clock_adjustments) {
-                       os_log(OS_LOG_DEFAULT, "%s delta requested %lu s %d u\n",
-                              __func__, (unsigned long)deltasecs, deltamicrosecs);
-               }
-#endif
-
                TIME_ADD(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC);
                clock2bintime(&deltasecs, &deltamicrosecs, &bt);
                bintime_add(&clock_calend.boottime, &bt);
-               update_basesleep(bt, TRUE);
        } else {
                // moving backwards
                deltasecs = oldsecs;
                deltamicrosecs = oldmicrosecs;
 
                TIME_SUB(deltasecs, secs, deltamicrosecs, microsecs, USEC_PER_SEC);
-#if DEVELOPMENT || DEBUG
-               if (g_should_log_clock_adjustments) {
-                       os_log(OS_LOG_DEFAULT, "%s negative delta requested %lu s %d u\n",
-                              __func__, (unsigned long)deltasecs, deltamicrosecs);
-               }
-#endif
 
                TIME_SUB(clock_boottime, deltasecs, clock_boottime_usec, deltamicrosecs, USEC_PER_SEC);
                clock2bintime(&deltasecs, &deltamicrosecs, &bt);
                bintime_sub(&clock_calend.boottime, &bt);
-               update_basesleep(bt, FALSE);
        }
 
        clock_calend.bintime = clock_calend.boottime;
@@ -1065,26 +1024,24 @@ clock_initialize_calendar(void)
         clock_usec_t            microsys2, monotonic_usec;
         size_t                  size;
 
-       //Get PMU time with offset and corresponding sys time
+       //Get the UTC time and corresponding sys time
        PEGetUTCTimeOfDay(&secs, &microsecs);
        clock_get_system_microtime(&sys, &microsys);
 
        /*
         * If the platform has a monotonic clock, use kern.monotonicclock_usecs
-        * to estimate the sleep/wake time, otherwise use the PMU and adjustments
-        * provided through settimeofday to estimate the sleep time.
-        * NOTE: the latter case relies that the kernel is the only component
-        * to set the PMU offset.
+        * to estimate the sleep/wake time, otherwise use the UTC time to estimate
+        * the sleep time.
         */
        size = sizeof(monotonic_time);
        if (kernel_sysctlbyname("kern.monotonicclock_usecs", &monotonic_time, &size, NULL, 0) != 0) {
                has_monotonic_clock = 0;
-               os_log(OS_LOG_DEFAULT, "%s system does not have monotonic clock.\n", __func__);
+               os_log(OS_LOG_DEFAULT, "%s system does not have monotonic clock\n", __func__);
        } else {
                has_monotonic_clock = 1;
                monotonic_usec_total = monotonic_time.monotonic_time_usec;
                absolutetime_to_microtime(monotonic_time.mach_time, &sys2, &microsys2);
-               os_log(OS_LOG_DEFAULT, "%s system has monotonic clock.\n", __func__);
+               os_log(OS_LOG_DEFAULT, "%s system has monotonic clock\n", __func__);
        }
 
        s = splclock();
@@ -1095,15 +1052,6 @@ clock_initialize_calendar(void)
        utc_offset_secs = secs;
        utc_offset_microsecs = microsecs;
 
-#if DEVELOPMENT || DEBUG
-       last_utc_sec = secs;
-       last_utc_usec = microsecs;
-       last_sys_sec = sys;
-       last_sys_usec = microsys;
-       if (secs > max_utc_sec)
-               max_utc_sec = secs;
-#endif
-
        /*
         * We normally expect the UTC clock to be always-on and produce
         * greater readings than the tick counter.  There may be corner cases
@@ -1112,18 +1060,17 @@ clock_initialize_calendar(void)
         * on error) in which that doesn't hold true.  Bring the UTC measurements
         * in-line with the tick counter measurements as a best effort in that case.
         */
-       //FIXME if the current time is prior than 1970 secs will be negative
        if ((sys > secs) || ((sys == secs) && (microsys > microsecs))) {
-               os_log(OS_LOG_DEFAULT, "%s WARNING: PMU offset is less then sys PMU %lu s %d u sys %lu s %d u\n",
+               os_log(OS_LOG_DEFAULT, "%s WARNING: UTC time is less than sys time, (%lu s %d u) UTC (%lu s %d u) sys\n",
                        __func__, (unsigned long) secs, microsecs, (unsigned long)sys, microsys);
                secs = utc_offset_secs = sys;
                microsecs = utc_offset_microsecs = microsys;
        }
 
-       // PMU time with offset - sys
+       // UTC - sys
        // This macro stores the subtraction result in utc_offset_secs and utc_offset_microsecs
        TIME_SUB(utc_offset_secs, sys, utc_offset_microsecs, microsys, USEC_PER_SEC);
-
+       // This function converts utc_offset_secs and utc_offset_microsecs to bintime
        clock2bintime(&utc_offset_secs, &utc_offset_microsecs, &bt);
 
        /*
@@ -1151,16 +1098,13 @@ clock_initialize_calendar(void)
                monotonic_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC;
                monotonic_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC;
 
-               // PMU time without offset - sys
+               // monotonic clock - sys
                // This macro stores the subtraction result in monotonic_sec and monotonic_usec
                TIME_SUB(monotonic_sec, sys2, monotonic_usec, microsys2, USEC_PER_SEC);
                clock2bintime(&monotonic_sec, &monotonic_usec, &monotonic_bt);
 
                // set the basesleep as the difference between monotonic clock - sys
                clock_calend.basesleep = monotonic_bt;
-       } else {
-               // set the baseleep as the difference between PMU clock - sys
-               clock_calend.basesleep = bt;
        }
        commpage_update_mach_continuous_time(mach_absolutetime_asleep);
 
@@ -1187,149 +1131,189 @@ clock_initialize_calendar(void)
 void
 clock_wakeup_calendar(void)
 {
-       clock_sec_t             sys;
-       clock_sec_t             secs;
-       clock_usec_t            microsys;
-       clock_usec_t            microsecs;
+       clock_sec_t             wake_sys_sec;
+       clock_usec_t            wake_sys_usec;
+       clock_sec_t             wake_sec;
+       clock_usec_t            wake_usec;
+       clock_sec_t             wall_time_sec;
+       clock_usec_t            wall_time_usec;
+       clock_sec_t             diff_sec;
+       clock_usec_t            diff_usec;
+       clock_sec_t             var_s;
+       clock_usec_t            var_us;
        spl_t                   s;
        struct bintime          bt, last_sleep_bt;
-       clock_sec_t             basesleep_s, last_sleep_sec;
-       clock_usec_t            basesleep_us, last_sleep_usec;
        struct latched_time     monotonic_time;
        uint64_t                monotonic_usec_total;
+       uint64_t                wake_abs;
        size_t                  size;
-       clock_sec_t secs_copy;
-        clock_usec_t microsecs_copy;
-#if DEVELOPMENT || DEBUG
-       clock_sec_t utc_sec;
-       clock_usec_t utc_usec;
-       PEGetUTCTimeOfDay(&utc_sec, &utc_usec);
-#endif
 
        /*
         * If the platform has the monotonic clock use that to
         * compute the sleep time. The monotonic clock does not have an offset
         * that can be modified, so neither the kernel nor userspace can change the time
         * of this clock, it can only monotonically increase over time.
-        * During sleep mach_absolute_time does not tick,
-        * so the sleep time is the difference betwen the current monotonic time
+        * During sleep mach_absolute_time (sys time) does not tick,
+        * so the sleep time is the difference between the current monotonic time
         * less the absolute time and the previous difference stored at wake time.
         *
-        * basesleep = monotonic - sys ---> computed at last wake
+        * basesleep = (monotonic - sys) ---> computed at last wake
         * sleep_time = (monotonic - sys) - basesleep
         *
-        * If the platform does not support monotonic time we use the PMU time
-        * to compute the last sleep.
-        * The PMU time is the monotonic clock + an offset that can be set
+        * If the platform does not support a monotonic clock, we set the wall time to what the
+        * UTC clock returns.
+        * Setting the wall time to the UTC time implies that we lose all the adjustments
+        * done while awake through adjtime/ntp_adjtime.
+        * The UTC time is the monotonic clock + an offset that can be set
         * by kernel.
+        * The time slept in this case is the difference between wall time and UTC
+        * at wake.
         *
         * IMPORTANT:
-        * We assume that only the kernel is setting the offset of the PMU and that
+        * We assume that only the kernel is setting the offset of the PMU/RTC and that
         * it is doing it only through the settimeofday interface.
-        *
-        * basesleep is the different between the PMU time and the mach_absolute_time
-        * at wake.
-        * During awake time settimeofday can change the PMU offset by a delta,
-        * and basesleep is shifted by the same delta applyed to the PMU. So the sleep
-        * time computation becomes:
-        *
-        * PMU = monotonic + PMU_offset
-        * basesleep = PMU - sys ---> computed at last wake
-        * basesleep += settimeofday_delta
-        * PMU_offset += settimeofday_delta
-        * sleep_time = (PMU - sys) - basesleep
         */
        if (has_monotonic_clock) {
-               //Get monotonic time with corresponding sys time
+
+#if DEVELOPMENT || DEBUG
+               /*
+                * Just for debugging, get the wake UTC time.
+                */
+               PEGetUTCTimeOfDay(&var_s, &var_us);
+#endif
+               /*
+                * Get monotonic time with corresponding sys time
+                */
                size = sizeof(monotonic_time);
                if (kernel_sysctlbyname("kern.monotonicclock_usecs", &monotonic_time, &size, NULL, 0) != 0) {
                        panic("%s: could not call kern.monotonicclock_usecs", __func__);
                }
-               monotonic_usec_total = monotonic_time.monotonic_time_usec;
-               absolutetime_to_microtime(monotonic_time.mach_time, &sys, &microsys);
+               wake_abs = monotonic_time.mach_time;
+               absolutetime_to_microtime(wake_abs, &wake_sys_sec, &wake_sys_usec);
 
-               secs = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC;
-               microsecs = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC;
+               monotonic_usec_total = monotonic_time.monotonic_time_usec;
+               wake_sec = monotonic_usec_total / (clock_sec_t)USEC_PER_SEC;
+               wake_usec = monotonic_usec_total % (clock_usec_t)USEC_PER_SEC;
        } else {
-               //Get PMU time with offset and corresponding sys time
-               PEGetUTCTimeOfDay(&secs, &microsecs);
-               clock_get_system_microtime(&sys, &microsys);
-
+               /*
+                * Get UTC time and corresponding sys time
+                */
+               PEGetUTCTimeOfDay(&wake_sec, &wake_usec);
+               wake_abs = mach_absolute_time();
+               absolutetime_to_microtime(wake_abs, &wake_sys_sec, &wake_sys_usec);
        }
 
+#if DEVELOPMENT || DEBUG
+        os_log(OS_LOG_DEFAULT, "time at wake %lu s %d u from %s clock, abs %llu\n", (unsigned long)wake_sec, wake_usec, (has_monotonic_clock)?"monotonic":"UTC", wake_abs);
+        if (has_monotonic_clock) {
+                os_log(OS_LOG_DEFAULT, "UTC time %lu s %d u\n", (unsigned long)var_s, var_us);
+        }
+#endif /* DEVELOPMENT || DEBUG */
+
        s = splclock();
        clock_lock();
        
        commpage_disable_timestamp();
 
-       secs_copy = secs;
-       microsecs_copy = microsecs;
-
 #if DEVELOPMENT || DEBUG
        struct clock_calend clock_calend_cp1 = clock_calend;
 #endif /* DEVELOPMENT || DEBUG */
 
-#if DEVELOPMENT || DEBUG
-       last_utc_sec = secs;
-       last_utc_usec = microsecs;
-       last_sys_sec = sys;
-       last_sys_usec = microsys;
-       if (secs > max_utc_sec)
-               max_utc_sec = secs;
-#endif
        /*
-        * We normally expect the UTC clock to be always-on and produce
-        * greater readings than the tick counter.  There may be corner cases
-        * due to differing clock resolutions (UTC clock is likely lower) and
-        * and errors reading the UTC clock (some implementations return 0
-        * on error) in which that doesn't hold true.  Bring the UTC measurements
-        * in-line with the tick counter measurements as a best effort in that case.
+        * We normally expect the UTC/monotonic clock to be always-on and produce
+        * greater readings than the sys counter.  There may be corner cases
+        * due to differing clock resolutions (UTC/monotonic clock is likely lower) and
+        * errors reading the UTC/monotonic clock (some implementations return 0
+        * on error) in which that doesn't hold true.
         */
-       //FIXME if the current time is prior than 1970 secs will be negative
-       if ((sys > secs) || ((sys == secs) && (microsys > microsecs))) {
-               os_log(OS_LOG_DEFAULT, "%s WARNING: %s is less then sys %s %lu s %d u sys %lu s %d u\n",
-                       __func__, (has_monotonic_clock)?"monotonic":"PMU", (has_monotonic_clock)?"monotonic":"PMU", (unsigned long)secs, microsecs, (unsigned long)sys, microsys);
-               secs = sys;
-               microsecs = microsys;
+       if ((wake_sys_sec > wake_sec) || ((wake_sys_sec == wake_sec) && (wake_sys_usec > wake_usec))) {
+               os_log_error(OS_LOG_DEFAULT, "WARNING: %s clock is less than sys clock at wake: %lu s %d u vs %lu s %d u, defaulting sleep time to zero\n", (has_monotonic_clock)?"monotonic":"UTC", (unsigned long)wake_sec, wake_usec, (unsigned long)wake_sys_sec, wake_sys_usec);
+               mach_absolutetime_last_sleep = 0;
+               goto done;
        }
 
-       // PMU or monotonic - sys
-       // This macro stores the subtraction result in secs and microsecs
-       TIME_SUB(secs, sys, microsecs, microsys, USEC_PER_SEC);
-       clock2bintime(&secs, &microsecs, &bt);
+       if (has_monotonic_clock) {
+               /*
+                * compute the difference monotonic - sys
+                * we already checked that monotonic time is
+                * greater than sys.
+                */
+               diff_sec = wake_sec;
+               diff_usec = wake_usec;
+               // This macro stores the subtraction result in diff_sec and diff_usec
+               TIME_SUB(diff_sec, wake_sys_sec, diff_usec, wake_sys_usec, USEC_PER_SEC);
+               // This function converts diff_sec and diff_usec to bintime
+               clock2bintime(&diff_sec, &diff_usec, &bt);
 
-       /*
-        * Safety belt: the UTC clock will likely have a lower resolution than the tick counter.
-        * It's also possible that the device didn't fully transition to the powered-off state on
-        * the most recent sleep, so the tick counter may not have reset or may have only briefly
-        * tured off.  In that case it's possible for the difference between the UTC clock and the
-        * tick counter to be less than the previously recorded value in clock.calend.basesleep.
-        * In that case simply record that we slept for 0 ticks.
-        */ 
-       if ((bt.sec > clock_calend.basesleep.sec) ||
-           ((bt.sec == clock_calend.basesleep.sec) && (bt.frac > clock_calend.basesleep.frac))) {
-
-               //last_sleep is the difference between current PMU or monotonic - abs and last wake PMU or monotonic - abs
-               last_sleep_bt = bt;
-               bintime_sub(&last_sleep_bt, &clock_calend.basesleep);
-
-               //set baseseep to current PMU or monotonic - abs
-               clock_calend.basesleep = bt;
-               bintime2usclock(&last_sleep_bt, &last_sleep_sec, &last_sleep_usec);
-               bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep);
-               mach_absolutetime_asleep += mach_absolutetime_last_sleep;
-
-               bintime_add(&clock_calend.offset, &last_sleep_bt);
-               bintime_add(&clock_calend.bintime, &last_sleep_bt);
-
-       } else{
-               mach_absolutetime_last_sleep = 0;
-               last_sleep_sec = last_sleep_usec = 0;
-               bintime2usclock(&clock_calend.basesleep, &basesleep_s, &basesleep_us);
-               os_log(OS_LOG_DEFAULT, "%s WARNING: basesleep (%lu s %d u)  > %s-sys (%lu s %d u) \n",
-                       __func__, (unsigned long) basesleep_s, basesleep_us, (has_monotonic_clock)?"monotonic":"PMU", (unsigned long) secs_copy, microsecs_copy );
-       }
+               /*
+                * Safety belt: the monotonic clock will likely have a lower resolution than the sys counter.
+                * It's also possible that the device didn't fully transition to the powered-off state on
+                * the most recent sleep, so the sys counter may not have reset or may have only briefly
+                * turned off.  In that case it's possible for the difference between the monotonic clock and the
+                * sys counter to be less than the previously recorded value in clock.calend.basesleep.
+                * In that case simply record that we slept for 0 ticks.
+                */
+               if ((bt.sec > clock_calend.basesleep.sec) ||
+                   ((bt.sec == clock_calend.basesleep.sec) && (bt.frac > clock_calend.basesleep.frac))) {
 
+                       //last_sleep is the difference between (current monotonic - abs) and (last wake monotonic - abs)
+                       last_sleep_bt = bt;
+                       bintime_sub(&last_sleep_bt, &clock_calend.basesleep);
+
+                       bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep);
+                       mach_absolutetime_asleep += mach_absolutetime_last_sleep;
+
+                       //set basesleep to current monotonic - abs
+                       clock_calend.basesleep = bt;
+
+                       //update wall time
+                       bintime_add(&clock_calend.offset, &last_sleep_bt);
+                       bintime_add(&clock_calend.bintime, &last_sleep_bt);
+
+                       bintime2usclock(&last_sleep_bt, &var_s, &var_us);
+                       os_log(OS_LOG_DEFAULT, "time_slept (%lu s %d u)\n", (unsigned long) var_s, var_us);
+
+               } else {
+                       bintime2usclock(&clock_calend.basesleep, &var_s, &var_us);
+                       os_log_error(OS_LOG_DEFAULT, "WARNING: last wake monotonic-sys time (%lu s %d u) is greater than current monotonic-sys time (%lu s %d u), defaulting sleep time to zero\n", (unsigned long) var_s, var_us, (unsigned long) diff_sec, diff_usec);
+
+                       mach_absolutetime_last_sleep = 0;
+               }
+       } else {
+               /*
+                * set the wall time to UTC value
+                */
+               bt = get_scaled_time(wake_abs);
+               bintime_add(&bt, &clock_calend.bintime);
+               bintime2usclock(&bt, &wall_time_sec, &wall_time_usec);
+
+               if (wall_time_sec > wake_sec || (wall_time_sec == wake_sec && wall_time_usec > wake_usec)) {
+                       os_log(OS_LOG_DEFAULT, "WARNING: wall time (%lu s %d u) is greater than current UTC time (%lu s %d u), defaulting sleep time to zero\n", (unsigned long) wall_time_sec, wall_time_usec, (unsigned long) wake_sec, wake_usec);
+
+                       mach_absolutetime_last_sleep = 0;
+               } else {
+                       diff_sec = wake_sec;
+                       diff_usec = wake_usec;
+                       // This macro stores the subtraction result in diff_sec and diff_usec
+                       TIME_SUB(diff_sec, wall_time_sec, diff_usec, wall_time_usec, USEC_PER_SEC);
+                       // This function converts diff_sec and diff_usec to bintime
+                       clock2bintime(&diff_sec, &diff_usec, &bt);
+
+                       //time slept in this case is the difference between PMU/RTC and wall time
+                       last_sleep_bt = bt;
+
+                       bintime2absolutetime(&last_sleep_bt, &mach_absolutetime_last_sleep);
+                       mach_absolutetime_asleep += mach_absolutetime_last_sleep;
+
+                       //update wall time
+                       bintime_add(&clock_calend.offset, &last_sleep_bt);
+                       bintime_add(&clock_calend.bintime, &last_sleep_bt);
+
+                       bintime2usclock(&last_sleep_bt, &var_s, &var_us);
+                       os_log(OS_LOG_DEFAULT, "time_slept (%lu s %d u)\n", (unsigned long)var_s, var_us);
+               }
+       }
+done:
        KERNEL_DEBUG_CONSTANT(
                  MACHDBG_CODE(DBG_MACH_CLOCK,MACH_EPOCH_CHANGE) | DBG_FUNC_NONE,
                  (uintptr_t) mach_absolutetime_last_sleep,
@@ -1350,11 +1334,8 @@ clock_wakeup_calendar(void)
 
 #if DEVELOPMENT || DEBUG
        if (g_should_log_clock_adjustments) {
-               os_log(OS_LOG_DEFAULT, "PMU was %lu s %d u\n",(unsigned long) utc_sec, utc_usec);
-               os_log(OS_LOG_DEFAULT, "last sleep was %lu s %d u\n",(unsigned long) last_sleep_sec, last_sleep_usec);
-               print_all_clock_variables("clock_wakeup_calendar:BEFORE",
-                                 &secs_copy, &microsecs_copy, &sys, &microsys, &clock_calend_cp1);
-               print_all_clock_variables("clock_wakeup_calendar:AFTER", NULL, NULL, NULL, NULL, &clock_calend_cp);
+               print_all_clock_variables("clock_wakeup_calendar: BEFORE", NULL, NULL, NULL, NULL, &clock_calend_cp1);
+               print_all_clock_variables("clock_wakeup_calendar: AFTER", NULL, NULL, NULL, NULL, &clock_calend_cp);
        }
 #endif /* DEVELOPMENT || DEBUG */
 
index bc7357560dd03fdb0e2975dbe774201940e705dc..c77d40f2878b12d0608e4f896437dc0fb7ab9b8e 100644 (file)
@@ -158,6 +158,19 @@ SECURITY_READ_ONLY_EARLY(struct clock_ops) calend_ops = {
        calend_getattr,
 };
 
+/*
+ * List of clock devices.
+ */
+SECURITY_READ_ONLY_LATE(struct clock) clock_list[] = {
+
+       /* SYSTEM_CLOCK */
+       { &sysclk_ops, 0, 0 },
+
+       /* CALENDAR_CLOCK */
+       { &calend_ops, 0, 0 }
+};
+int    clock_count = sizeof(clock_list) / sizeof(clock_list[0]);
+
 /*
  *     Macros to lock/unlock clock system.
  */
index ed24958c17f84eda87e664f3ec16ce73b92f37b9..26f9d33a49a89da93c939d485565ccafa8ed58ab 100644 (file)
@@ -220,7 +220,7 @@ struct i_jetsam_coalition {
        queue_head_t extensions;
        queue_head_t services;
        queue_head_t other;
-       thread_group_t thread_group;
+       struct thread_group *thread_group;
 };
 
 
index 10cc5b742d22b73498828f6d66c2c96c8e3b9d42..195ba05be9c19f9c21b26e2beac64c27897faa45 100644 (file)
@@ -68,10 +68,10 @@ boolean_t task_coalition_adjust_focal_count(task_t task, int count, uint32_t *ne
 uint32_t task_coalition_focal_count(task_t task);
 boolean_t task_coalition_adjust_nonfocal_count(task_t task, int count, uint32_t *new_count);
 uint32_t task_coalition_nonfocal_count(task_t task);
-thread_group_t task_coalition_get_thread_group(task_t task);
-void    coalition_set_thread_group(coalition_t coal, thread_group_t tg);
-thread_group_t kdp_coalition_get_thread_group(coalition_t coal);
-thread_group_t coalition_get_thread_group(coalition_t coal);
+struct thread_group *task_coalition_get_thread_group(task_t task);
+void    coalition_set_thread_group(coalition_t coal, struct thread_group *tg);
+struct thread_group *kdp_coalition_get_thread_group(coalition_t coal);
+struct thread_group *coalition_get_thread_group(coalition_t coal);
 void task_coalition_thread_group_focal_update(task_t task);
 
 void coalition_for_each_task(coalition_t coal, void *ctx,
diff --git a/osfmk/kern/cpu_quiesce.c b/osfmk/kern/cpu_quiesce.c
new file mode 100644 (file)
index 0000000..977f5e5
--- /dev/null
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifdef __x86_64__
+#error This file is only needed on weakly-ordered systems!
+#endif
+
+#include <machine/atomic.h>
+#include <machine/commpage.h>
+#include <machine/machine_cpu.h>
+
+#include <kern/sched_prim.h>
+#include <kern/processor.h>
+#include <kern/ast.h>
+
+#include <kern/cpu_quiesce.h>
+
+/*
+ * CPU quiescing generation counter implemented with a checkin mask
+ *
+ * A tri-state bitfield, with 2 bits for each processor:
+ * 1) 'checkin' bit, saying this processor has 'checked in', i.e. executed the acqrel barrier
+ * 2) 'expected' bit, saying this processor is expected to check in, i.e. not idle.
+ *
+ * When a processor causes the 'expected' bits to equal the 'checkin' bits, which
+ * indicates that all processors have executed the barrier, it ticks the algorithm
+ * and resets the state.
+ *
+ * Idle CPUs won't check in, because they don't run, so the algorithm won't tick.
+ * However, they can't do anything in userspace while idle, so we don't need
+ * them to execute barriers, so we have them 'leave' the counter so that
+ * they don't delay the tick while idle.
+ *
+ * This bitfield currently limits MAX_CPUS to 32 on LP64.
+ * In the future, we can use double-wide atomics and int128 if we need 64 CPUs.
+ *
+ * The mask only guarantees ordering to code running in userspace.
+ * We defer joining the counter until we actually reach userspace, allowing
+ * processors that come out of idle and only run kernel code to avoid the overhead
+ * of participation.
+ *
+ * We additionally defer updating the counter for a minimum interval to
+ * reduce the frequency of executing the exclusive atomic operations.
+ *
+ * The longest delay between two checkins assuming that at least one processor
+ * joins is <checkin delay> + (<thread quantum> * 2)
+ */
+
+typedef unsigned long checkin_mask_t;
+
+static _Atomic checkin_mask_t cpu_quiescing_checkin_state;
+
+static uint64_t cpu_checkin_last_commit;
+
+#define CPU_CHECKIN_MIN_INTERVAL_US     4000 /* 4ms */
+#define CPU_CHECKIN_MIN_INTERVAL_MAX_US USEC_PER_SEC /* 1s */
+static uint64_t cpu_checkin_min_interval;
+uint32_t cpu_checkin_min_interval_us;
+
+#if __LP64__
+static_assert(MAX_CPUS <= 32);
+#define CPU_CHECKIN_MASK        0x5555555555555555UL
+#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
+#else
+/* Avoid double-wide CAS on 32-bit platforms by using a 32-bit state and mask */
+static_assert(MAX_CPUS <= 16);
+#define CPU_CHECKIN_MASK        0x55555555UL
+#define CPU_EXPECTED_MASK       (~CPU_CHECKIN_MASK)
+#endif
+
+static_assert(CPU_CHECKIN_MASK == CPU_EXPECTED_MASK >> 1);
+
+static inline checkin_mask_t
+cpu_checked_in_bit(int cpuid)
+{
+       return 1UL << (2 * cpuid);
+}
+
+static inline checkin_mask_t
+cpu_expected_bit(int cpuid)
+{
+       return 1UL << (2 * cpuid + 1);
+}
+
+void
+cpu_quiescent_counter_init(void)
+{
+       assert(CPU_CHECKIN_MASK & cpu_checked_in_bit(MAX_CPUS));
+       assert(CPU_EXPECTED_MASK & cpu_expected_bit(MAX_CPUS));
+       assert((CPU_CHECKIN_MASK & cpu_expected_bit(MAX_CPUS)) == 0);
+       assert((CPU_EXPECTED_MASK & cpu_checked_in_bit(MAX_CPUS)) == 0);
+
+       cpu_quiescent_counter_set_min_interval_us(CPU_CHECKIN_MIN_INTERVAL_US);
+}
+
+void
+cpu_quiescent_counter_set_min_interval_us(uint32_t new_value_us)
+{
+       /* clamp to something vaguely sane */
+       if (new_value_us > CPU_CHECKIN_MIN_INTERVAL_MAX_US)
+               new_value_us = CPU_CHECKIN_MIN_INTERVAL_MAX_US;
+
+       cpu_checkin_min_interval_us = new_value_us;
+
+       uint64_t abstime = 0;
+       clock_interval_to_absolutetime_interval(cpu_checkin_min_interval_us,
+                                               NSEC_PER_USEC, &abstime);
+       cpu_checkin_min_interval = abstime;
+}
+
+
+/*
+ * Called when all running CPUs have checked in.
+ *
+ * The commpage increment is protected by the 'lock' of having caused the tick,
+ * and it is published by the state reset release barrier.
+ */
+static void
+cpu_quiescent_counter_commit(uint64_t ctime)
+{
+       __kdebug_only uint64_t          old_gen;
+       __kdebug_only checkin_mask_t    old_state;
+
+       old_gen = commpage_increment_cpu_quiescent_counter();
+
+       cpu_checkin_last_commit = ctime;
+
+       old_state = os_atomic_and(&cpu_quiescing_checkin_state, ~CPU_CHECKIN_MASK, release);
+
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_QUIESCENT_COUNTER), old_gen, old_state, ctime, 0);
+}
+
+/*
+ * Have all the expected CPUs checked in?
+ */
+static bool
+cpu_quiescent_counter_needs_commit(checkin_mask_t state)
+{
+       return (state & CPU_CHECKIN_MASK) == ((state & CPU_EXPECTED_MASK) >> 1);
+}
+
+/*
+ * Called when a processor wants to start participating in the counter, e.g.
+ * 1) when context switching away from the idle thread
+ * 2) when coming up for the first time
+ * 3) when coming up after a shutdown
+ *
+ * Called with interrupts disabled.
+ */
+void
+cpu_quiescent_counter_join(__unused uint64_t ctime)
+{
+       processor_t processor = current_processor();
+       __assert_only int cpuid = processor->cpu_id;
+
+       assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_NONE ||
+              processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_LEFT);
+
+       assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
+               (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);
+
+       processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_PENDING_JOIN;
+
+       /*
+        * Mark the processor to call cpu_quiescent_counter_ast before it
+        * ever returns to userspace.
+        */
+       ast_on(AST_UNQUIESCE);
+}
+
+/*
+ * Called with interrupts disabled from the userspace boundary at the AST_UNQUIESCE callback
+ * It needs to acquire the counter to see data and the counter published by other CPUs.
+ */
+void
+cpu_quiescent_counter_ast(void)
+{
+       processor_t processor = current_processor();
+       int cpuid = processor->cpu_id;
+
+       assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);
+
+       /* We had better not already be joined. */
+       assert((os_atomic_load(&cpu_quiescing_checkin_state, relaxed) &
+               (cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid))) == 0);
+
+       /*
+        * No release barrier needed because we have no prior state to publish.
+        * Acquire barrier needed because we need this processor to see
+        * the latest counter value.
+        *
+        * The state may be in 'needs checkin' both before and after
+        * this atomic or.
+        *
+        * Additionally, if this is the first processor to come out of idle,
+        * it may need to kickstart the algorithm, otherwise it would
+        * stay in 'needs commit' perpetually with no processor assigned to
+        * actually do the commit.  To do that, the first processor only adds
+        * its expected bit.
+        */
+
+       processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_JOINED;
+       processor->cpu_quiesce_last_checkin = mach_absolute_time();
+
+       checkin_mask_t old_mask, new_mask;
+       os_atomic_rmw_loop(&cpu_quiescing_checkin_state, old_mask, new_mask, acquire, {
+               if (old_mask == 0) {
+                       new_mask = old_mask | cpu_expected_bit(cpuid);
+               } else {
+                       new_mask = old_mask | cpu_expected_bit(cpuid) | cpu_checked_in_bit(cpuid);
+               }
+       });
+}
+
+/*
+ * Called when a processor no longer wants to participate in the counter,
+ * i.e. when a processor is on its way to idle or shutdown.
+ *
+ * Called with interrupts disabled.
+ *
+ * The processor needs to remove itself from the expected mask, to allow the
+ * algorithm to continue ticking without its participation.
+ * However, it needs to ensure that anything it has done since the last time
+ * it checked in has been published before the next tick is allowed to commit.
+ */
+void
+cpu_quiescent_counter_leave(uint64_t ctime)
+{
+       processor_t processor = current_processor();
+       int cpuid = processor->cpu_id;
+
+       assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED ||
+              processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN);
+
+       /* We no longer need the cpu_quiescent_counter_ast callback to be armed */
+       ast_off(AST_UNQUIESCE);
+
+       if (processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_PENDING_JOIN) {
+               /* We never actually joined, so we don't have to do the work to leave. */
+               processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;
+               return;
+       }
+
+       /* Leaving can't be deferred, even if we're within the min interval */
+       processor->cpu_quiesce_last_checkin = ctime;
+
+       checkin_mask_t mask = cpu_checked_in_bit(cpuid) | cpu_expected_bit(cpuid);
+
+       checkin_mask_t orig_state = os_atomic_and_orig(&cpu_quiescing_checkin_state,
+                                                      ~mask, acq_rel);
+
+       assert((orig_state & cpu_expected_bit(cpuid)));
+
+       processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_LEFT;
+
+       if (cpu_quiescent_counter_needs_commit(orig_state)) {
+               /*
+                * the old state indicates someone else was already doing a commit
+                * but hadn't finished yet.  We successfully inserted the acq_rel
+                * before they finished the commit by resetting the bitfield,
+                * so we're done here.
+                */
+               return;
+       }
+
+       checkin_mask_t new_state = orig_state & ~mask;
+
+       if (cpu_quiescent_counter_needs_commit(new_state)) {
+               cpu_quiescent_counter_commit(ctime);
+       }
+}
+
+/*
+ * Called when a processor wants to check in to the counter
+ * If it hasn't yet fully joined, it doesn't need to check in.
+ *
+ * Called with interrupts disabled.
+ */
+void
+cpu_quiescent_counter_checkin(uint64_t ctime)
+{
+       processor_t processor = current_processor();
+       int cpuid = processor->cpu_id;
+
+       assert(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_NONE);
+
+       /* If we're not joined yet, we don't need to check in */
+       if (__probable(processor->cpu_quiesce_state != CPU_QUIESCE_COUNTER_JOINED))
+               return;
+
+       /* If we've checked in recently, we don't need to check in yet. */
+       if (__probable((ctime - processor->cpu_quiesce_last_checkin) <= cpu_checkin_min_interval))
+               return;
+
+       processor->cpu_quiesce_last_checkin = ctime;
+
+       checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
+
+       assert((state & cpu_expected_bit(cpuid)));
+
+       if (__probable((state & cpu_checked_in_bit(cpuid)))) {
+               /*
+                * Processor has already checked in for this round, no need to
+                * acquire the cacheline exclusive.
+                */
+               return;
+       }
+
+       checkin_mask_t orig_state = os_atomic_or_orig(&cpu_quiescing_checkin_state,
+                                                     cpu_checked_in_bit(cpuid), acq_rel);
+
+       checkin_mask_t new_state = orig_state | cpu_checked_in_bit(cpuid);
+
+       if (cpu_quiescent_counter_needs_commit(new_state)) {
+               assertf(!cpu_quiescent_counter_needs_commit(orig_state),
+                       "old: 0x%lx, new: 0x%lx", orig_state, new_state);
+               cpu_quiescent_counter_commit(ctime);
+       }
+}
+
+#if MACH_ASSERT
+/*
+ * Called on all AST exits to userspace to assert this processor actually joined
+ *
+ * Called with interrupts disabled after the AST should have been handled
+ */
+void
+cpu_quiescent_counter_assert_ast(void)
+{
+       processor_t processor = current_processor();
+       int cpuid = processor->cpu_id;
+
+       assert(processor->cpu_quiesce_state == CPU_QUIESCE_COUNTER_JOINED);
+
+       checkin_mask_t state = os_atomic_load(&cpu_quiescing_checkin_state, relaxed);
+       assert((state & cpu_expected_bit(cpuid)));
+}
+#endif /* MACH_ASSERT */
+
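A small illustration (not part of the source) of the checkin-mask layout used above, written against the helpers defined in this file. For each CPU n, bit 2n is 'checked in' and bit 2n+1 is 'expected'; a commit fires when the checked-in bits equal the expected bits shifted down by one:

    static void
    checkin_mask_example(void)    /* illustrative sketch only */
    {
            /* CPUs 0 and 2 participating, CPU 1 idle: */
            checkin_mask_t state = cpu_expected_bit(0) | cpu_expected_bit(2);    /* 0b100010 */
            assert(!cpu_quiescent_counter_needs_commit(state));

            /* CPU 0 checks in; CPU 2 is still outstanding: */
            state |= cpu_checked_in_bit(0);                                      /* 0b100011 */
            assert(!cpu_quiescent_counter_needs_commit(state));

            /* CPU 2 checks in; the checkin bits now equal the expected bits >> 1,
             * so a commit would run and clear the checkin bits back to 0b100010. */
            state |= cpu_checked_in_bit(2);                                      /* 0b110011 */
            assert(cpu_quiescent_counter_needs_commit(state));
    }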
diff --git a/osfmk/kern/cpu_quiesce.h b/osfmk/kern/cpu_quiesce.h
new file mode 100644 (file)
index 0000000..324a2b0
--- /dev/null
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_CPU_QUIESCE_H_
+#define _KERN_CPU_QUIESCE_H_
+
+#ifdef XNU_KERNEL_PRIVATE
+
+#include <sys/cdefs.h>
+
+/* State field kept on each CPU to track participation */
+typedef enum {
+       /* Processor that never participated */
+       CPU_QUIESCE_COUNTER_NONE = 0,
+       /* Processor ready to participate, waiting for return to user */
+       CPU_QUIESCE_COUNTER_PENDING_JOIN = 1,
+       /* Processor currently participating in counter */
+       CPU_QUIESCE_COUNTER_JOINED = 2,
+       /* Processor currently not participating in counter */
+       CPU_QUIESCE_COUNTER_LEFT = 3,
+} cpu_quiescent_state_t;
+
+#if CONFIG_QUIESCE_COUNTER
+
+extern void cpu_quiescent_counter_join(uint64_t ctime);
+extern void cpu_quiescent_counter_leave(uint64_t ctime);
+extern void cpu_quiescent_counter_checkin(uint64_t ctime);
+extern void cpu_quiescent_counter_ast(void);
+extern void cpu_quiescent_counter_init(void);
+
+/* use of these is guarded by the config */
+extern uint32_t cpu_checkin_min_interval_us;
+extern void cpu_quiescent_counter_set_min_interval_us(uint32_t new_value);
+
+#else /* CONFIG_QUIESCE_COUNTER */
+
+/* stub routines for platforms without the counter */
+
+static inline void cpu_quiescent_counter_join(__unused uint64_t ctime) { }
+static inline void cpu_quiescent_counter_leave(__unused uint64_t ctime) { }
+static inline void cpu_quiescent_counter_checkin(__unused uint64_t ctime) { }
+static inline void cpu_quiescent_counter_ast(void) { }
+static inline void cpu_quiescent_counter_init(void) { }
+
+#endif /* CONFIG_QUIESCE_COUNTER */
+
+#if MACH_ASSERT && CONFIG_QUIESCE_COUNTER
+extern void cpu_quiescent_counter_assert_ast(void);
+#else
+static inline void cpu_quiescent_counter_assert_ast(void) { }
+#endif
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* _KERN_CPU_QUIESCE_H_ */
+
index e8007e9f27c192935322e23e4b8fe1a51c844609..cafafcaffa7c87f6ba4d8ec4ff14ef74155eba0f 100644 (file)
 #define _KERN_CODESIGN_H_
 
 /* code signing attributes of a process */
-#define        CS_VALID                                        0x0000001       /* dynamically valid */
-#define CS_ADHOC                                       0x0000002       /* ad hoc signed */
-#define CS_GET_TASK_ALLOW                      0x0000004       /* has get-task-allow entitlement */
-#define CS_INSTALLER                           0x0000008       /* has installer entitlement */
-
-#define        CS_HARD                                         0x0000100       /* don't load invalid pages */
-#define        CS_KILL                                         0x0000200       /* kill process if it becomes invalid */
-#define CS_CHECK_EXPIRATION                    0x0000400       /* force expiration checking */
-#define CS_RESTRICT                                    0x0000800       /* tell dyld to treat restricted */
-#define CS_ENFORCEMENT                         0x0001000       /* require enforcement */
-#define CS_REQUIRE_LV                          0x0002000       /* require library validation */
-#define CS_ENTITLEMENTS_VALIDATED      0x0004000       /* code signature permits restricted entitlements */
-#define CS_NVRAM_UNRESTRICTED          0x0008000       /* has com.apple.rootless.restricted-nvram-variables.heritable entitlement */
-
-#define        CS_ALLOWED_MACHO                         (CS_ADHOC | CS_HARD | CS_KILL | CS_CHECK_EXPIRATION | \
-                                                                         CS_RESTRICT | CS_ENFORCEMENT | CS_REQUIRE_LV)
-
-#define CS_EXEC_SET_HARD                       0x0100000       /* set CS_HARD on any exec'ed process */
-#define CS_EXEC_SET_KILL                       0x0200000       /* set CS_KILL on any exec'ed process */
-#define CS_EXEC_SET_ENFORCEMENT                0x0400000       /* set CS_ENFORCEMENT on any exec'ed process */
-#define CS_EXEC_INHERIT_SIP                    0x0800000       /* set CS_INSTALLER on any exec'ed process */
-
-#define CS_KILLED                                      0x1000000       /* was killed by kernel for invalidity */
-#define CS_DYLD_PLATFORM                       0x2000000       /* dyld used to load this is a platform binary */
-#define CS_PLATFORM_BINARY                     0x4000000       /* this is a platform binary */
-#define CS_PLATFORM_PATH                       0x8000000       /* platform binary by the fact of path (osx only) */
-#define CS_DEBUGGED                                    0x10000000  /* process is currently or has previously been debugged and allowed to run with invalid pages */
-#define CS_SIGNED                                      0x20000000  /* process has a signature (may have gone invalid) */
-#define CS_DEV_CODE                                    0x40000000  /* code is dev signed, cannot be loaded into prod signed code (will go away with rdar://problem/28322552) */
-#define CS_DATAVAULT_CONTROLLER                0x80000000      /* has Data Vault controller entitlement */
-       
-#define CS_ENTITLEMENT_FLAGS           (CS_GET_TASK_ALLOW | CS_INSTALLER | CS_DATAVAULT_CONTROLLER | CS_NVRAM_UNRESTRICTED)
+#define CS_VALID                    0x00000001  /* dynamically valid */
+#define CS_ADHOC                    0x00000002  /* ad hoc signed */
+#define CS_GET_TASK_ALLOW           0x00000004  /* has get-task-allow entitlement */
+#define CS_INSTALLER                0x00000008  /* has installer entitlement */
+
+#define CS_FORCED_LV                0x00000010  /* Library Validation required by Hardened System Policy */
+#define CS_INVALID_ALLOWED          0x00000020  /* (macOS Only) Page invalidation allowed by task port policy */
+
+#define CS_HARD                     0x00000100  /* don't load invalid pages */
+#define CS_KILL                     0x00000200  /* kill process if it becomes invalid */
+#define CS_CHECK_EXPIRATION         0x00000400  /* force expiration checking */
+#define CS_RESTRICT                 0x00000800  /* tell dyld to treat restricted */
+
+#define CS_ENFORCEMENT              0x00001000  /* require enforcement */
+#define CS_REQUIRE_LV               0x00002000  /* require library validation */
+#define CS_ENTITLEMENTS_VALIDATED   0x00004000  /* code signature permits restricted entitlements */
+#define CS_NVRAM_UNRESTRICTED       0x00008000  /* has com.apple.rootless.restricted-nvram-variables.heritable entitlement */
+
+#define CS_RUNTIME                  0x00010000  /* Apply hardened runtime policies */
+
+#define CS_ALLOWED_MACHO            (CS_ADHOC | CS_HARD | CS_KILL | CS_CHECK_EXPIRATION | \
+                                     CS_RESTRICT | CS_ENFORCEMENT | CS_REQUIRE_LV | CS_RUNTIME)
+
+#define CS_EXEC_SET_HARD            0x00100000  /* set CS_HARD on any exec'ed process */
+#define CS_EXEC_SET_KILL            0x00200000  /* set CS_KILL on any exec'ed process */
+#define CS_EXEC_SET_ENFORCEMENT     0x00400000  /* set CS_ENFORCEMENT on any exec'ed process */
+#define CS_EXEC_INHERIT_SIP         0x00800000  /* set CS_INSTALLER on any exec'ed process */
+
+#define CS_KILLED                   0x01000000  /* was killed by kernel for invalidity */
+#define CS_DYLD_PLATFORM            0x02000000  /* dyld used to load this is a platform binary */
+#define CS_PLATFORM_BINARY          0x04000000  /* this is a platform binary */
+#define CS_PLATFORM_PATH            0x08000000  /* platform binary by the fact of path (osx only) */
+
+#define CS_DEBUGGED                 0x10000000  /* process is currently or has previously been debugged and allowed to run with invalid pages */
+#define CS_SIGNED                   0x20000000  /* process has a signature (may have gone invalid) */
+#define CS_DEV_CODE                 0x40000000  /* code is dev signed, cannot be loaded into prod signed code (will go away with rdar://problem/28322552) */
+#define CS_DATAVAULT_CONTROLLER     0x80000000  /* has Data Vault controller entitlement */
+
+#define CS_ENTITLEMENT_FLAGS        (CS_GET_TASK_ALLOW | CS_INSTALLER | CS_DATAVAULT_CONTROLLER | CS_NVRAM_UNRESTRICTED)
 
 /* executable segment flags */
 
@@ -69,7 +76,7 @@
 #define CS_EXECSEG_ALLOW_UNSIGNED      0x10            /* allow unsigned pages (for debugging) */
 #define CS_EXECSEG_DEBUGGER                    0x20            /* main binary is debugger */
 #define CS_EXECSEG_JIT                         0x40            /* JIT enabled */
-#define CS_EXECSEG_SKIP_LV                     0x80            /* skip library validation */
+#define CS_EXECSEG_SKIP_LV                     0x80            /* OBSOLETE: skip library validation */
 #define CS_EXECSEG_CAN_LOAD_CDHASH     0x100           /* can bless cdhash for execution */
 #define CS_EXECSEG_CAN_EXEC_CDHASH     0x200           /* can execute blessed cdhash */
 
@@ -103,6 +110,8 @@ enum {
        CSSLOT_ALTERNATE_CODEDIRECTORY_LIMIT = CSSLOT_ALTERNATE_CODEDIRECTORIES + CSSLOT_ALTERNATE_CODEDIRECTORY_MAX, /* one past the last */
 
        CSSLOT_SIGNATURESLOT = 0x10000,                 /* CMS Signature */
+       CSSLOT_IDENTIFICATIONSLOT = 0x10001,
+       CSSLOT_TICKETSLOT = 0x10002,
 
        CSTYPE_INDEX_REQUIREMENTS = 0x00000002,         /* compat with amfi */
        CSTYPE_INDEX_ENTITLEMENTS = 0x00000005,         /* compat with amfi */
index 313c979b52e1a8342e002ced916692bf37d8229a..0d15f8f8eee1f33697e927e2fa663332484d68e0 100644 (file)
 extern volatile struct xnu_hw_shmem_dbg_command_info *hwsd_info;
 #endif
 
+#if CONFIG_XNUPOST
+#include <tests/xnupost.h>
+extern int vsnprintf(char *, size_t, const char *, va_list);
+#endif
 
 unsigned int   halt_in_debugger = 0;
 unsigned int   current_debugger = 0;
@@ -142,6 +146,7 @@ unsigned int        kernel_debugger_entry_count = 0;
 #define CPUPANICSTR PROCESSOR_DATA(current_processor(), debugger_state).db_panic_str
 #define CPUPANICARGS PROCESSOR_DATA(current_processor(), debugger_state).db_panic_args
 #define CPUPANICOPTS PROCESSOR_DATA(current_processor(), debugger_state).db_panic_options
+#define CPUPANICDATAPTR PROCESSOR_DATA(current_processor(), debugger_state).db_panic_data_ptr
 #define CPUDEBUGGERSYNC PROCESSOR_DATA(current_processor(), debugger_state).db_proceed_on_sync_failure
 #define CPUDEBUGGERCOUNT PROCESSOR_DATA(current_processor(), debugger_state).db_entry_count
 #define CPUDEBUGGERRET PROCESSOR_DATA(current_processor(), debugger_state).db_op_return
@@ -160,11 +165,13 @@ MACRO_END
 debugger_op debugger_current_op = DBOP_NONE;
 const char *debugger_panic_str = NULL;
 va_list *debugger_panic_args = NULL;
+void *debugger_panic_data = NULL;
 uint64_t debugger_panic_options = 0;
 const char *debugger_message = NULL;
 unsigned long debugger_panic_caller = 0;
 
-void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx, uint64_t panic_options_mask, unsigned long panic_caller);
+void panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx,
+                               uint64_t panic_options_mask, void *panic_data, unsigned long panic_caller);
 static void kdp_machine_reboot_type(unsigned int type);
 __attribute__((noreturn)) void panic_spin_forever(void);
 extern kern_return_t do_stackshot(void);
@@ -215,6 +222,10 @@ unsigned int   debugger_context = 0;
 static char model_name[64];
 unsigned char *kernel_uuid;
 
+boolean_t kernelcache_uuid_valid = FALSE;
+uuid_t kernelcache_uuid;
+uuid_string_t kernelcache_uuid_string;
+
 /*
  * By default we treat Debugger() the same as calls to panic(), unless
  * we have debug boot-args present and the DB_KERN_DUMP_ON_NMI *NOT* set.
@@ -231,6 +242,7 @@ boolean_t debug_boot_arg_inited = FALSE;
 SECURITY_READ_ONLY_LATE(unsigned int) debug_boot_arg;
 
 char kernel_uuid_string[37]; /* uuid_string_t */
+char kernelcache_uuid_string[37]; /* uuid_string_t */
 char   panic_disk_error_description[512];
 size_t panic_disk_error_description_size = sizeof(panic_disk_error_description);
 
@@ -412,7 +424,7 @@ DebuggerResumeOtherCores()
 
 static void
 DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_panic_str,
-               va_list *db_panic_args, uint64_t db_panic_options,
+               va_list *db_panic_args, uint64_t db_panic_options, void *db_panic_data_ptr,
                boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller)
 {
        CPUDEBUGGEROP = db_op;
@@ -422,6 +434,7 @@ DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_pani
                CPUDEBUGGERMSG = db_message;
                CPUPANICSTR = db_panic_str;
                CPUPANICARGS = db_panic_args;
+               CPUPANICDATAPTR = db_panic_data_ptr;
                CPUPANICCALLER = db_panic_caller;
        } else if (CPUDEBUGGERCOUNT > 1 && db_panic_str != NULL) {
                kprintf("Nested panic detected:");
@@ -444,21 +457,21 @@ DebuggerSaveState(debugger_op db_op, const char *db_message, const char *db_pani
  */
 kern_return_t
 DebuggerTrapWithState(debugger_op db_op, const char *db_message, const char *db_panic_str,
-               va_list *db_panic_args, uint64_t db_panic_options,
+               va_list *db_panic_args, uint64_t db_panic_options, void *db_panic_data_ptr,
                boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller)
 {
        kern_return_t ret;
 
        assert(ml_get_interrupts_enabled() == FALSE);
-       DebuggerSaveState(db_op, db_message, db_panic_str,
-               db_panic_args, db_panic_options, db_proceed_on_sync_failure,
-               db_panic_caller);
+       DebuggerSaveState(db_op, db_message, db_panic_str, db_panic_args,
+                               db_panic_options, db_panic_data_ptr,
+                               db_proceed_on_sync_failure, db_panic_caller);
 
        TRAP_DEBUGGER;
 
        ret = CPUDEBUGGERRET;
 
-       DebuggerSaveState(DBOP_NONE, NULL, NULL, NULL, 0, FALSE, 0);
+       DebuggerSaveState(DBOP_NONE, NULL, NULL, NULL, 0, NULL, FALSE, 0);
 
        return ret;
 }
@@ -525,13 +538,13 @@ DebuggerWithContext(unsigned int reason, void *ctx, const char *message,
 
        if (ctx != NULL) {
                DebuggerSaveState(DBOP_DEBUGGER, message,
-                       NULL, NULL, debugger_options_mask, TRUE, 0);
+                       NULL, NULL, debugger_options_mask, NULL, TRUE, 0);
                handle_debugger_trap(reason, 0, 0, ctx);
                DebuggerSaveState(DBOP_NONE, NULL, NULL,
-                       NULL, 0, FALSE, 0);
+                       NULL, 0, NULL, FALSE, 0);
        } else {
                DebuggerTrapWithState(DBOP_DEBUGGER, message,
-                       NULL, NULL, debugger_options_mask, TRUE, 0);
+                       NULL, NULL, debugger_options_mask, NULL, TRUE, 0);
        }
 
        CPUDEBUGGERCOUNT--;
@@ -604,7 +617,7 @@ panic(const char *str, ...)
        va_list panic_str_args;
 
        va_start(panic_str_args, str);
-       panic_trap_to_debugger(str, &panic_str_args, 0, NULL, 0, (unsigned long)(char *)__builtin_return_address(0));
+       panic_trap_to_debugger(str, &panic_str_args, 0, NULL, 0, NULL, (unsigned long)(char *)__builtin_return_address(0));
        va_end(panic_str_args);
 }
 
@@ -614,25 +627,47 @@ panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mas
        va_list panic_str_args;
 
        va_start(panic_str_args, str);
-       panic_trap_to_debugger(str, &panic_str_args, reason, ctx, debugger_options_mask, (unsigned long)(char *)__builtin_return_address(0));
+       panic_trap_to_debugger(str, &panic_str_args, reason, ctx, (debugger_options_mask & ~DEBUGGER_INTERNAL_OPTIONS_MASK),
+                               NULL, (unsigned long)(char *)__builtin_return_address(0));
        va_end(panic_str_args);
 }
 
+#if defined (__x86_64__)
+/*
+ * panic_with_thread_context() is used on x86 platforms to specify a different thread that should be backtraced in the paniclog.
+ * We don't generally need this functionality on embedded platforms because embedded platforms include a panic time stackshot
+ * from customer devices. We plumb the thread pointer via the debugger trap mechanism and backtrace the kernel stack from the
+ * thread when writing the panic log.
+ *
+ * NOTE: panic_with_thread_context() should be called with an explicit thread reference held on the passed thread.
+ */
 void
-panic_context(unsigned int reason, void *ctx, const char *str, ...)
+panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, thread_t thread, const char *str, ...)
 {
        va_list panic_str_args;
+       __assert_only uint32_t th_ref_count;
+
+       assert_thread_magic(thread);
+       th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire);
+       assertf(th_ref_count > 0, "panic_with_thread_context called with invalid thread %p with refcount %u", thread, th_ref_count);
+
+       /* Take a reference on the thread so it doesn't disappear by the time we try to backtrace it */
+       thread_reference(thread);
 
        va_start(panic_str_args, str);
-       panic_trap_to_debugger(str, &panic_str_args, reason, ctx, 0, (unsigned long)(char *)__builtin_return_address(0));
+       panic_trap_to_debugger(str, &panic_str_args, reason, ctx, ((debugger_options_mask & ~DEBUGGER_INTERNAL_OPTIONS_MASK) | DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE),
+                               thread, (unsigned long)(char *)__builtin_return_address(0));
+
        va_end(panic_str_args);
+
 }
+#endif /* defined (__x86_64__) */
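A hedged sketch (not part of the commit) of how an x86_64 kernel caller might use the panic_with_thread_context() added above; the wrapper function and the offending thread are hypothetical, and, per the comment in the hunk, the caller holds an explicit reference on the thread it passes in.

#if defined (__x86_64__)
/* Hypothetical caller: panic and have the offender's kernel stack backtraced
 * in the paniclog instead of the current thread's. */
static void
example_panic_on_stuck_thread(thread_t offender)
{
	thread_reference(offender);	/* explicit reference held by the caller, as the comment requires */
	panic_with_thread_context(0 /* reason */, NULL /* ctx */, DEBUGGER_OPTION_NONE,
			offender, "example: thread %p appears stuck", offender);
	/* not reached */
}
#endif /* defined (__x86_64__) */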
 
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wmissing-noreturn"
 void
-panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void
-                       *ctx, uint64_t panic_options_mask, unsigned long panic_caller)
+panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsigned int reason, void *ctx,
+                       uint64_t panic_options_mask, void *panic_data_ptr, unsigned long panic_caller)
 {
 #pragma clang diagnostic pop
 
@@ -706,7 +741,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
                 */
                DebuggerSaveState(DBOP_PANIC, "panic",
                                panic_format_str, panic_args,
-                               panic_options_mask, TRUE, panic_caller);
+                               panic_options_mask, panic_data_ptr, TRUE, panic_caller);
                handle_debugger_trap(reason, 0, 0, ctx);
        }
 
@@ -718,7 +753,7 @@ panic_trap_to_debugger(const char *panic_format_str, va_list *panic_args, unsign
 #endif /* defined(__arm64__) */
 
        DebuggerTrapWithState(DBOP_PANIC, "panic", panic_format_str,
-                       panic_args, panic_options_mask, TRUE, panic_caller);
+                       panic_args, panic_options_mask, panic_data_ptr, TRUE, panic_caller);
 
        /*
         * Not reached.
@@ -840,7 +875,7 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
                 * TODO: Need to clear panic log when return from debugger
                 * hooked up for embedded
                 */
-               SavePanicInfo(debugger_message, debugger_panic_options);
+               SavePanicInfo(debugger_message, debugger_panic_data, debugger_panic_options);
 
 #if DEVELOPMENT || DEBUG
                DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTLOG));
@@ -866,31 +901,51 @@ debugger_collect_diagnostics(unsigned int exception, unsigned int code, unsigned
         * Consider generating a local corefile if the infrastructure is configured
         * and we haven't disabled on-device coredumps.
         */
-       if (kdp_has_polled_corefile() && !(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) {
-               int ret = -1;
+       if (!(debug_boot_arg & DB_DISABLE_LOCAL_CORE)) {
+               if (!kdp_has_polled_corefile()) {
+                       if (debug_boot_arg & (DB_KERN_DUMP_ON_PANIC | DB_KERN_DUMP_ON_NMI)) {
+                               paniclog_append_noflush("skipping local kernel core because core file could not be opened prior to panic (error : 0x%x)",
+                                                       kdp_polled_corefile_error());
+#if CONFIG_EMBEDDED
+                               panic_info->eph_panic_flags |= EMBEDDED_PANIC_HEADER_FLAG_COREDUMP_FAILED;
+                               paniclog_flush();
+#else /* CONFIG_EMBEDDED */
+                               if (panic_info->mph_panic_log_offset != 0) {
+                                       panic_info->mph_panic_flags |= MACOS_PANIC_HEADER_FLAG_COREDUMP_FAILED;
+                                       paniclog_flush();
+                               }
+#endif /* CONFIG_EMBEDDED */
+                       }
+               } else {
+                       int ret = -1;
 
 #if defined (__x86_64__)
-               /* On x86 we don't do a coredump on Debugger unless the DB_KERN_DUMP_ON_NMI boot-arg is specified. */
-               if (debugger_current_op != DBOP_DEBUGGER || (debug_boot_arg & DB_KERN_DUMP_ON_NMI))
+                       /* On x86 we don't do a coredump on Debugger unless the DB_KERN_DUMP_ON_NMI boot-arg is specified. */
+                       if (debugger_current_op != DBOP_DEBUGGER || (debug_boot_arg & DB_KERN_DUMP_ON_NMI))
 #endif
-               {
-                       /*
-                        * Doing an on-device coredump leaves the disk driver in a state
-                        * that can not be resumed.
-                        */
-                       debugger_safe_to_return = FALSE;
-                       begin_panic_transfer();
-                       ret = kern_dump(KERN_DUMP_DISK);
-                       abort_panic_transfer();
+                       {
+                               /*
+                                * Doing an on-device coredump leaves the disk driver in a state
+                                * that can not be resumed.
+                                */
+                               debugger_safe_to_return = FALSE;
+                               begin_panic_transfer();
+                               ret = kern_dump(KERN_DUMP_DISK);
+                               abort_panic_transfer();
 
 #if DEVELOPMENT || DEBUG
-                       DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTCORE));
+                               DEBUGGER_DEBUGGING_NESTED_PANIC_IF_REQUESTED((debugger_panic_options & DEBUGGER_OPTION_RECURPANIC_POSTCORE));
 #endif
-               }
+                       }
 
-               /* If we wrote a corefile and DB_REBOOT_POST_CORE is set, reboot */
-               if (ret == 0 && (debug_boot_arg & DB_REBOOT_POST_CORE)) {
-                       kdp_machine_reboot_type(kPEPanicRestartCPU);
+                       /*
+                        * If DB_REBOOT_POST_CORE is set, then reboot if the coredump is successfully saved
+                        * or if option to ignore failures is set.
+                        */
+                       if ((debug_boot_arg & DB_REBOOT_POST_CORE) &&
+                                       ((ret == 0) || (debugger_panic_options & DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT))) {
+                               kdp_machine_reboot_type(kPEPanicRestartCPU);
+                       }
                }
        }
 
@@ -984,6 +1039,7 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub
                if (debugger_panic_str == NULL) {
                        debugger_panic_str = CPUPANICSTR;
                        debugger_panic_args = CPUPANICARGS;
+                       debugger_panic_data = CPUPANICDATAPTR;
                        debugger_message = CPUDEBUGGERMSG;
                        debugger_panic_caller = CPUPANICCALLER;
                }
@@ -1026,6 +1082,7 @@ handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int sub
        if (debugger_current_op != DBOP_BREAKPOINT) {
                debugger_panic_str = NULL;
                debugger_panic_args = NULL;
+               debugger_panic_data = NULL;
                debugger_panic_options = 0;
                debugger_message = NULL;
        }
index 860824c047187cd8a57a9f843c087a29a0ce8fdd..1ad189d5464ec1793bf121e1941ca91180a34f94 100644 (file)
@@ -156,7 +156,9 @@ struct micro_snapshot {
 } __attribute__ ((packed));
 
 
-
+/*
+ * mirrors the dyld_cache_header struct defined in dyld_cache_format.h from dyld source code
+ */
 struct _dyld_cache_header
 {
     char       magic[16];                              // e.g. "dyld_v0    i386"
@@ -172,14 +174,47 @@ struct _dyld_cache_header
     uint64_t    localSymbolsOffset;     // file offset of where local symbols are stored
     uint64_t    localSymbolsSize;       // size of local symbols information
     uint8_t     uuid[16];               // unique value for each shared cache file
+    uint64_t    cacheType;              // 0 for development, 1 for production
+    uint32_t    branchPoolsOffset;      // file offset to table of uint64_t pool addresses
+    uint32_t    branchPoolsCount;       // number of uint64_t entries
+    uint64_t    accelerateInfoAddr;     // (unslid) address of optimization info
+    uint64_t    accelerateInfoSize;     // size of optimization info
+    uint64_t    imagesTextOffset;       // file offset to first dyld_cache_image_text_info
+    uint64_t    imagesTextCount;        // number of dyld_cache_image_text_info entries
+    uint64_t    dylibsImageGroupAddr;   // (unslid) address of ImageGroup for dylibs in this cache
+    uint64_t    dylibsImageGroupSize;   // size of ImageGroup for dylibs in this cache
+    uint64_t    otherImageGroupAddr;    // (unslid) address of ImageGroup for other OS dylibs
+    uint64_t    otherImageGroupSize;    // size of ImageGroup for other OS dylibs
+    uint64_t    progClosuresAddr;       // (unslid) address of list of program launch closures
+    uint64_t    progClosuresSize;       // size of list of program launch closures
+    uint64_t    progClosuresTrieAddr;   // (unslid) address of trie of indexes into program launch closures
+    uint64_t    progClosuresTrieSize;   // size of trie of indexes into program launch closures
+    uint32_t    platform;               // platform number (macOS=1, etc)
+    uint32_t    formatVersion        : 8,  // dyld3::closure::kFormatVersion
+                dylibsExpectedOnDisk : 1,  // dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid
+                simulator            : 1,  // for simulator of specified platform
+                locallyBuiltCache    : 1,  // 0 for B&I built cache, 1 for locally built cache
+                padding              : 21; // TBD
+};
+
+/*
+ * mirrors the dyld_cache_image_text_info struct defined in dyld_cache_format.h from dyld source code
+ */
+struct _dyld_cache_image_text_info
+{
+    uuid_t      uuid;
+    uint64_t    loadAddress;            // unslid address of start of __TEXT
+    uint32_t    textSegmentSize;
+    uint32_t    pathOffset;             // offset from start of cache file
 };
 
 
 enum micro_snapshot_flags {
        kInterruptRecord        = 0x1,
        kTimerArmingRecord      = 0x2,
-       kUserMode               = 0x4, /* interrupted usermode, or armed by usermode */
-       kIORecord               = 0x8,
+       kUserMode               = 0x4, /* interrupted usermode, or armed by usermode */
+       kIORecord               = 0x8,
+       kPMIRecord              = 0x10,
 };
 
 /*
@@ -209,25 +244,8 @@ enum {
        STACKSHOT_KCDATA_FORMAT                    = 0x10000,
        STACKSHOT_ENABLE_BT_FAULTING               = 0x20000,
        STACKSHOT_COLLECT_DELTA_SNAPSHOT           = 0x40000,
-       /*
-        * STACKSHOT_TAILSPIN flips on several features aimed at minimizing the size
-        * of stackshots.  It is meant to be used only by the tailspin daemon.  Its
-        * behavior may be changed at any time to suit the needs of the tailspin
-        * daemon.  Seriously, if you are not the tailspin daemon, don't use this
-        * flag.  If you need these features, ask us to add a stable SPI for what
-        * you need.   That being said, the features it turns on are:
-        *
-        * minimize_uuids: If the set of loaded dylibs or kexts has not changed in
-        * the delta period, do then not report them.
-        *
-        * iostats: do not include io statistics.
-        *
-        * trace_fp: do not include the frame pointers in stack traces.
-        *
-        * minimize_nonrunnables: Do not report detailed information about threads
-        * which were not runnable in the delta period.
-        */
-       STACKSHOT_TAILSPIN                         = 0x80000,
+       /* Include the layout of the system shared cache */
+       STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT       = 0x80000,
        /*
         * Kernel consumers of stackshot (via stack_snapshot_from_kernel) can ask
         * that we try to take the stackshot lock, and fail if we don't get it.
@@ -241,6 +259,8 @@ enum {
        STACKSHOT_THREAD_GROUP                     = 0x2000000,
        STACKSHOT_SAVE_JETSAM_COALITIONS           = 0x4000000,
        STACKSHOT_INSTRS_CYCLES                    = 0x8000000,
+       STACKSHOT_ASID                             = 0x10000000,
+       STACKSHOT_PAGE_TABLES                      = 0x20000000,
 };
 
 #define STACKSHOT_THREAD_SNAPSHOT_MAGIC     0xfeedface
@@ -429,15 +449,19 @@ enum {
 /*
  * Values for a 64-bit mask that's passed to the debugger.
  */
-#define DEBUGGER_OPTION_NONE                   0x0ULL
-#define DEBUGGER_OPTION_PANICLOGANDREBOOT      0x1ULL /* capture a panic log and then reboot immediately */
-#define DEBUGGER_OPTION_RECURPANIC_ENTRY        0x2ULL
-#define DEBUGGER_OPTION_RECURPANIC_PRELOG       0x4ULL
-#define DEBUGGER_OPTION_RECURPANIC_POSTLOG      0x8ULL
-#define DEBUGGER_OPTION_RECURPANIC_POSTCORE     0x10ULL
-#define DEBUGGER_OPTION_INITPROC_PANIC          0x20ULL
-#define DEBUGGER_OPTION_COPROC_INITIATED_PANIC  0x40ULL /* panic initiated by a co-processor */
-#define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP     0x80ULL /* don't try to save local coredumps for this panic */
+#define DEBUGGER_OPTION_NONE                        0x0ULL
+#define DEBUGGER_OPTION_PANICLOGANDREBOOT           0x1ULL /* capture a panic log and then reboot immediately */
+#define DEBUGGER_OPTION_RECURPANIC_ENTRY            0x2ULL
+#define DEBUGGER_OPTION_RECURPANIC_PRELOG           0x4ULL
+#define DEBUGGER_OPTION_RECURPANIC_POSTLOG          0x8ULL
+#define DEBUGGER_OPTION_RECURPANIC_POSTCORE         0x10ULL
+#define DEBUGGER_OPTION_INITPROC_PANIC              0x20ULL
+#define DEBUGGER_OPTION_COPROC_INITIATED_PANIC      0x40ULL /* panic initiated by a co-processor */
+#define DEBUGGER_OPTION_SKIP_LOCAL_COREDUMP         0x80ULL /* don't try to save local coredumps for this panic */
+#define DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT    0x100ULL /* attempt to save a coredump; always reboot */
+#define DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE   0x200ULL /* backtrace the specified thread in the paniclog (x86 only) */
+
+#define DEBUGGER_INTERNAL_OPTIONS_MASK              (DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE)
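A self-contained illustration of how caller-supplied options are sanitized against DEBUGGER_INTERNAL_OPTIONS_MASK, mirroring the `debugger_options_mask & ~DEBUGGER_INTERNAL_OPTIONS_MASK` expression used in panic_with_options() in the osfmk/kern/debug.c hunk above; the values are copied from the defines above and the sample input mask is invented.

#include <stdint.h>
#include <assert.h>

#define DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT    0x100ULL
#define DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE   0x200ULL
#define DEBUGGER_INTERNAL_OPTIONS_MASK              (DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE)

int main(void)
{
	/* A hypothetical caller tries to smuggle in the internal-only option. */
	uint64_t requested = DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT |
	                     DEBUGGER_INTERNAL_OPTION_THREAD_BACKTRACE;
	uint64_t sanitized = requested & ~DEBUGGER_INTERNAL_OPTIONS_MASK;

	/* Only the public option survives; the internal backtrace option is stripped. */
	assert(sanitized == DEBUGGER_OPTION_ATTEMPTCOREDUMPANDREBOOT);
	return 0;
}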
 
 __BEGIN_DECLS
 
@@ -453,7 +477,6 @@ __BEGIN_DECLS
 #define panic(ex, ...) (panic)(# ex "@" PANIC_LOCATION, ## __VA_ARGS__)
 #endif
 
-void panic_context(unsigned int reason, void *ctx, const char *string, ...);
 void panic_with_options(unsigned int reason, void *ctx, uint64_t debugger_options_mask, const char *str, ...);
 void Debugger(const char * message);
 void populate_model_name(char *);
@@ -466,6 +489,12 @@ __END_DECLS
 
 #if XNU_KERNEL_PRIVATE
 
+#if defined (__x86_64__)
+struct thread;
+
+void panic_with_thread_context(unsigned int reason, void *ctx, uint64_t debugger_options_mask, struct thread* th, const char *str, ...);
+#endif
+
 boolean_t oslog_is_safe(void);
 boolean_t debug_mode_active(void);
 boolean_t stackshot_active(void);
@@ -512,6 +541,10 @@ extern unsigned int        debug_boot_arg;
 extern boolean_t       debug_boot_arg_inited;
 #endif
 
+extern boolean_t kernelcache_uuid_valid;
+extern uuid_t kernelcache_uuid;
+extern uuid_string_t kernelcache_uuid_string;
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -557,7 +590,7 @@ extern size_t panic_stackshot_len;
 #endif /* DEVELOPMENT || DEBUG */
 #endif /* defined (__x86_64__) */
 
-void   SavePanicInfo(const char *message, uint64_t panic_options);
+void   SavePanicInfo(const char *message, void *panic_data, uint64_t panic_options);
 void    paniclog_flush(void);
 void   panic_display_system_configuration(boolean_t launchd_exit);
 void   panic_display_zprint(void);
@@ -592,7 +625,7 @@ typedef enum {
 } debugger_op;
 
 kern_return_t DebuggerTrapWithState(debugger_op db_op, const char *db_message, const char *db_panic_str, va_list *db_panic_args,
-               uint64_t db_panic_options, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller);
+               uint64_t db_panic_options, void *db_panic_data_ptr, boolean_t db_proceed_on_sync_failure, unsigned long db_panic_caller);
 void handle_debugger_trap(unsigned int exception, unsigned int code, unsigned int subcode, void *state);
 
 void DebuggerWithContext(unsigned int reason, void *ctx, const char *message, uint64_t debugger_options_mask);
diff --git a/osfmk/kern/ecc_logging.c b/osfmk/kern/ecc_logging.c
new file mode 100644 (file)
index 0000000..d62ab81
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+* Copyright (c) 2013 Apple Inc. All rights reserved.
+*
+* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+* 
+* This file contains Original Code and/or Modifications of Original Code
+* as defined in and that are subject to the Apple Public Source License
+* Version 2.0 (the 'License'). You may not use this file except in
+* compliance with the License. The rights granted to you under the License
+* may not be used to create, or enable the creation or redistribution of,
+* unlawful or unlicensed copies of an Apple operating system, or to
+* circumvent, violate, or enable the circumvention or violation of, any
+* terms of an Apple operating system software license agreement.
+* 
+* Please obtain a copy of the License at
+* http://www.opensource.apple.com/apsl/ and read it before using this file.
+* 
+* The Original Code and all software distributed under the License are
+* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+* Please see the License for the specific language governing rights and
+* limitations under the License.
+* 
+* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+*/
+#include <mach/mach_types.h>
+#include <mach/host_info.h>
+#include <kern/locks.h>
+#include <kern/ecc.h>
+#include <kern/spl.h>
+#include <pexpert/pexpert.h>
+#include <libkern/OSAtomic.h>
+
+/*
+ * ECC data.  Not really KPCs, but this still seems like the
+ * best home for this code.
+ *
+ * Circular buffer of events.  When we fill up, drop data.
+ */
+#define ECC_EVENT_BUFFER_COUNT 5
+struct ecc_event               ecc_data[ECC_EVENT_BUFFER_COUNT];
+static uint32_t                        ecc_data_next_read; 
+static uint32_t                        ecc_data_next_write; 
+static boolean_t               ecc_data_empty = TRUE; // next read == next write : empty or full?
+static lck_grp_t               *ecc_data_lock_group;
+static lck_spin_t              ecc_data_lock;
+static uint32_t                        ecc_correction_count;
+
+void
+ecc_log_init()
+{
+       ecc_data_lock_group = lck_grp_alloc_init("ecc-data", NULL);
+       lck_spin_init(&ecc_data_lock, ecc_data_lock_group, NULL);
+       OSMemoryBarrier();
+}
+
+uint32_t
+ecc_log_get_correction_count()
+{
+       return ecc_correction_count;
+}
+
+kern_return_t
+ecc_log_record_event(const struct ecc_event *ev)
+{
+       spl_t x;
+
+       if (ev->count > ECC_EVENT_INFO_DATA_ENTRIES) {
+               panic("Count of %u on ecc event is too large.", (unsigned)ev->count);
+       }
+
+       x = splhigh();
+       lck_spin_lock(&ecc_data_lock);
+
+       ecc_correction_count++;
+
+       if (ecc_data_next_read == ecc_data_next_write && !ecc_data_empty)  {
+               lck_spin_unlock(&ecc_data_lock);
+               splx(x);
+               return KERN_FAILURE;
+       }
+
+       bcopy(ev, &ecc_data[ecc_data_next_write], sizeof(*ev));
+       ecc_data_next_write++;
+       ecc_data_next_write %= ECC_EVENT_BUFFER_COUNT;
+       ecc_data_empty = FALSE;
+
+       lck_spin_unlock(&ecc_data_lock);
+       splx(x);
+
+       return KERN_SUCCESS;
+}
+
+
+kern_return_t
+ecc_log_get_next_event(struct ecc_event *ev)
+{
+       spl_t x;
+
+       x = splhigh();
+       lck_spin_lock(&ecc_data_lock);
+
+       if (ecc_data_empty)  {
+               assert(ecc_data_next_write == ecc_data_next_read);
+
+               lck_spin_unlock(&ecc_data_lock);
+               splx(x);
+               return KERN_FAILURE;
+       }
+
+       bcopy(&ecc_data[ecc_data_next_read], ev, sizeof(*ev));
+       ecc_data_next_read++;
+       ecc_data_next_read %= ECC_EVENT_BUFFER_COUNT;
+
+       if (ecc_data_next_read == ecc_data_next_write) {
+               ecc_data_empty = TRUE;
+       }
+
+       lck_spin_unlock(&ecc_data_lock);
+       splx(x);
+
+       return KERN_SUCCESS;
+}
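A hedged sketch (not part of the commit) of a consumer draining the 5-entry ECC ring buffer through the ecc_log_get_next_event() routine added above; the function name and what is done with each event are hypothetical, and the sketch assumes kernel context with <kern/ecc.h> available, as in the file above.

#include <kern/ecc.h>

static void
example_drain_ecc_events(void)
{
	struct ecc_event ev;

	/* ecc_log_get_next_event() returns KERN_FAILURE once the buffer is empty. */
	while (ecc_log_get_next_event(&ev) == KERN_SUCCESS) {
		/* hand the event to a (hypothetical) logging or telemetry sink */
	}
}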
index 8486ec569a5c92e9f0ea53e9fb1100d94b32923f..dbdb3f1937dcf800742887c153000d4fa4eac3bb 100644 (file)
@@ -64,6 +64,8 @@
 
 /* EXC_GUARD types */
 
+#define GUARD_TYPE_NONE         0x0
+
 /*
  * Mach port guards use the exception codes like this:
  *
 
 #define GUARD_TYPE_VN          0x4     /* guarded vnode */
 
+/*
+ * VM guards use the exception codes like this:
+ *
+ * code:
+ * +-------------------------------+----------------+-----------------+
+ * |[63:61] GUARD_TYPE_VIRT_MEMORY | [60:32] flavor | [31:0] unused   |
+ * +-------------------------------+----------------+-----------------+
+ *
+ * subcode:
+ * +----------------------------------------------------------------+
+ * |[63:0] offset                                                   |
+ * +----------------------------------------------------------------+
+ */
+
+#define GUARD_TYPE_VIRT_MEMORY 0x5     /* VM operation violating guard */
+
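A self-contained illustration (not from the commit) of packing an EXC_GUARD code word for GUARD_TYPE_VIRT_MEMORY according to the bit layout documented in the comment above ([63:61] type, [60:32] flavor, [31:0] unused; subcode carries the offset); the flavor value and offset are invented for the example.

#include <stdint.h>
#include <assert.h>

#define GUARD_TYPE_VIRT_MEMORY 0x5

int main(void)
{
	uint64_t flavor = 1;		/* hypothetical guard flavor */
	uint64_t offset = 0x10000;	/* hypothetical faulting offset */

	uint64_t code    = ((uint64_t)GUARD_TYPE_VIRT_MEMORY << 61) | (flavor << 32);
	uint64_t subcode = offset;

	assert((code >> 61) == GUARD_TYPE_VIRT_MEMORY);
	assert(((code >> 32) & 0x1FFFFFFFULL) == flavor);	/* 29-bit flavor field [60:32] */
	(void)subcode;
	return 0;
}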
 #ifdef KERNEL
 
 #define EXC_GUARD_ENCODE_TYPE(code, type) \
index c90fafc617f142bc16062fe978fef0563d3c91d3..21d0d0b6d2baa15c26abe8abc63c2f62d1cc069c 100644 (file)
@@ -62,6 +62,7 @@
 #define RESOURCE_TYPE_WAKEUPS  2
 #define        RESOURCE_TYPE_MEMORY    3
 #define RESOURCE_TYPE_IO       4
+#define RESOURCE_TYPE_THREADS  5
 
 /* RESOURCE_TYPE_CPU flavors */
 #define FLAVOR_CPU_MONITOR             1
         ((subcode) & 0x7FFFULL)
 
 
+/*
+ * RESOURCE_TYPE_THREADS exception code & subcode
+ *
+ * This is sent by the kernel when a task crosses its
+ * thread limit.
+ */
+
+#define EXC_RESOURCE_THREADS_DECODE_THREADS(code) \
+       ((code) & 0x7FFFULL)
+
+/* RESOURCE_TYPE_THREADS flavors */
+#define FLAVOR_THREADS_HIGH_WATERMARK 1
+
 #ifdef KERNEL
 
 /* EXC_RESOURCE type and flavor encoding macros */
 #define EXC_RESOURCE_IO_ENCODE_OBSERVED(subcode, num) \
        ((subcode) |= (((uint64_t)(num) & 0x7FFFULL)))
 
+/* RESOURCE_TYPE_THREADS specific encoding macros */
+#define EXC_RESOURCE_THREADS_ENCODE_THREADS(code, threads) \
+       ((code) |= (((uint64_t)(threads) & 0x7FFFULL)))
+
 #endif /* KERNEL */
 
 
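A self-contained round-trip check of the RESOURCE_TYPE_THREADS encode/decode macros added above; the macros are copied verbatim for self-containment and the thread count of 512 is an arbitrary example.

#include <stdint.h>
#include <assert.h>

#define EXC_RESOURCE_THREADS_DECODE_THREADS(code) \
	((code) & 0x7FFFULL)
#define EXC_RESOURCE_THREADS_ENCODE_THREADS(code, threads) \
	((code) |= (((uint64_t)(threads) & 0x7FFFULL)))

int main(void)
{
	uint64_t code = 0;

	EXC_RESOURCE_THREADS_ENCODE_THREADS(code, 512);
	assert(EXC_RESOURCE_THREADS_DECODE_THREADS(code) == 512);
	return 0;
}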
index 4042b91b318ab8f05c4f64e83abdcb85eec61875..9a67b727b0dcfc6f504b883fc6b3b7b8bc815cc8 100644 (file)
@@ -67,6 +67,7 @@
 #include <mach/exception_types.h>
 #include <mach/exc.h>
 #include <mach/mach_exc.h>
+
 #include <ipc/port.h>
 #include <ipc/ipc_entry.h>
 #include <ipc/ipc_object.h>
@@ -74,6 +75,7 @@
 #include <ipc/ipc_space.h>
 #include <ipc/ipc_pset.h>
 #include <ipc/ipc_machdep.h>
+
 #include <kern/counters.h>
 #include <kern/ipc_tt.h>
 #include <kern/task.h>
 #include <kern/sched_prim.h>
 #include <kern/host.h>
 #include <kern/misc_protos.h>
+#include <kern/ux_handler.h>
+
 #include <security/mac_mach_internal.h>
 #include <string.h>
+
 #include <pexpert/pexpert.h>
 
 extern int panic_on_exception_triage;
@@ -212,7 +217,10 @@ exception_deliver(
         * As with other failures, exception_triage_thread will go on
         * to the next level.
         */
-       if (mac_exc_action_check_exception_send(task, excp) != 0) {
+
+       /* The global exception-to-signal translation port is safe to be an exception handler. */
+       if (is_ux_handler_port(exc_port) == FALSE &&
+           mac_exc_action_check_exception_send(task, excp) != 0) {
                kr = KERN_FAILURE;
                goto out_release_right;
        }
@@ -241,7 +249,7 @@ exception_deliver(
 
                c_thr_exc_raise_state++;
                state_cnt = _MachineStateCount[flavor];
-               kr = thread_getstatus(thread, flavor, 
+               kr = thread_getstatus_to_user(thread, flavor,
                                      (thread_state_t)state,
                                      &state_cnt);
                if (kr == KERN_SUCCESS) {
@@ -263,7 +271,7 @@ exception_deliver(
                        }
                        if (kr == KERN_SUCCESS) {
                                if (exception != EXC_CORPSE_NOTIFY)
-                                       kr = thread_setstatus(thread, flavor,
+                                       kr = thread_setstatus_from_user(thread, flavor,
                                                        (thread_state_t)state,
                                                        state_cnt);
                                goto out_release_right;
@@ -300,7 +308,7 @@ exception_deliver(
 
                c_thr_exc_raise_state_id++;
                state_cnt = _MachineStateCount[flavor];
-               kr = thread_getstatus(thread, flavor,
+               kr = thread_getstatus_to_user(thread, flavor,
                                      (thread_state_t)state,
                                      &state_cnt);
                if (kr == KERN_SUCCESS) {
@@ -329,7 +337,7 @@ exception_deliver(
 
                        if (kr == KERN_SUCCESS) {
                                if (exception != EXC_CORPSE_NOTIFY)
-                                       kr = thread_setstatus(thread, flavor,
+                                       kr = thread_setstatus_from_user(thread, flavor,
                                                        (thread_state_t)state,
                                                        state_cnt);
                                goto out_release_right;
index 9bd11481f4e8df91eaf61936febeff603897ab22..64d0ba9eb353693a9a587a179b03e2396237f9dd 100644 (file)
@@ -562,26 +562,43 @@ boolean_t gzalloc_element_size(void *gzaddr, zone_t *z, vm_size_t *gzsz) {
        uintptr_t a = (uintptr_t)gzaddr;
        if (__improbable(gzalloc_mode && (a >= gzalloc_map_min) && (a < gzalloc_map_max))) {
                gzhdr_t *gzh;
+               boolean_t       vmef;
+               vm_map_entry_t  gzvme = NULL;
+               vm_map_lock_read(gzalloc_map);
+               vmef = vm_map_lookup_entry(gzalloc_map, (vm_map_offset_t)a, &gzvme);
+               vm_map_unlock(gzalloc_map);
+               if (vmef == FALSE) {
+                       panic("GZALLOC: unable to locate map entry for %p\n", (void *)a);
+               }
+               assertf(gzvme->vme_atomic != 0, "GZALLOC: VM map entry inconsistency, vme: %p, start: %llu end: %llu", gzvme, gzvme->vme_start, gzvme->vme_end);
 
                /* Locate the gzalloc metadata adjoining the element */
                if (gzalloc_uf_mode == TRUE) {
-                       boolean_t       vmef;
-                       vm_map_entry_t  gzvme = NULL;
 
                        /* In underflow detection mode, locate the map entry describing
                         * the element, and then locate the copy of the gzalloc
                         * header at the trailing edge of the range.
                         */
-                       vm_map_lock_read(gzalloc_map);
-                       vmef = vm_map_lookup_entry(gzalloc_map, (vm_map_offset_t)a, &gzvme);
-                       vm_map_unlock(gzalloc_map);
-                       if (vmef == FALSE) {
-                               panic("GZALLOC: unable to locate map entry for %p\n", (void *)a);
-                       }
-                       assertf(gzvme->vme_atomic != 0, "GZALLOC: VM map entry inconsistency, vme: %p, start: %llu end: %llu", gzvme, gzvme->vme_start, gzvme->vme_end);
                        gzh = (gzhdr_t *)(gzvme->vme_end - GZHEADER_SIZE);
                } else {
-                       gzh = (gzhdr_t *)(a - GZHEADER_SIZE);
+                       /* In overflow detection mode, scan forward from
+                        * the base of the map entry to locate the
+                        * gzalloc header.
+                        */
+                       uint32_t *p = (uint32_t*) gzvme->vme_start;
+                       while (p < (uint32_t *) gzvme->vme_end) {
+                               if (*p == GZALLOC_SIGNATURE)
+                                       break;
+                               else {
+                                       p++;
+                               }
+                       }
+                       if (p >= (uint32_t *) gzvme->vme_end) {
+                               panic("GZALLOC signature missing addr %p, zone %p", gzaddr, z);
+                       }
+                       p++;
+                       uintptr_t q = (uintptr_t) p;
+                       gzh = (gzhdr_t *) (q - sizeof(gzhdr_t));
                }
 
                if (gzh->gzsig != GZALLOC_SIGNATURE) {
index 47bab64b11ce197bbf85631f0a5ded756e879efb..0f4fe2fb934e02f57c32d3aa339c9bb46869ab6b 100644 (file)
@@ -307,6 +307,31 @@ host_info(host_t host, host_flavor_t flavor, host_info_t info, mach_msg_type_num
 #endif
        }
 
+       case HOST_PREFERRED_USER_ARCH: {
+               host_preferred_user_arch_t user_arch_info;
+
+               /*
+                *      Basic information about this host.
+                */
+               if (*count < HOST_PREFERRED_USER_ARCH_COUNT)
+                       return (KERN_FAILURE);
+
+               user_arch_info = (host_preferred_user_arch_t)info;
+
+#if defined(PREFERRED_USER_CPU_TYPE) && defined(PREFERRED_USER_CPU_SUBTYPE)
+               user_arch_info->cpu_type = PREFERRED_USER_CPU_TYPE;
+               user_arch_info->cpu_subtype = PREFERRED_USER_CPU_SUBTYPE;
+#else
+               int master_id = master_processor->cpu_id;
+               user_arch_info->cpu_type = slot_type(master_id);
+               user_arch_info->cpu_subtype = slot_subtype(master_id);
+#endif
+
+               *count = HOST_PREFERRED_USER_ARCH_COUNT;
+
+               return (KERN_SUCCESS);
+       }
+
        default: return (KERN_INVALID_ARGUMENT);
        }
 }
@@ -939,6 +964,27 @@ set_sched_stats_active(boolean_t active)
        return (KERN_SUCCESS);
 }
 
+
+uint64_t
+get_pages_grabbed_count(void)
+{
+       processor_t processor;
+        uint64_t pages_grabbed_count = 0;
+
+       simple_lock(&processor_list_lock);
+
+       processor = processor_list;
+
+       while (processor) {
+               pages_grabbed_count += PROCESSOR_DATA(processor, page_grab_count);
+               processor = processor->processor_list;
+       }
+       simple_unlock(&processor_list_lock);
+
+       return(pages_grabbed_count);
+}
+
+
 kern_return_t
 get_sched_statistics(struct _processor_statistics_np * out, uint32_t * count)
 {
@@ -1150,6 +1196,14 @@ host_processor_info(host_t host,
        return (KERN_SUCCESS);
 }
 
+static bool
+is_valid_host_special_port(int id)
+{
+       return (id <= HOST_MAX_SPECIAL_PORT) &&
+              (id >= HOST_MIN_SPECIAL_PORT) &&
+              ((id <= HOST_LAST_SPECIAL_KERNEL_PORT) || (id > HOST_MAX_SPECIAL_KERNEL_PORT));
+}
+
 /*
  *      Kernel interface for setting a special port.
  */
@@ -1158,9 +1212,12 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port)
 {
        ipc_port_t old_port;
 
+       if (!is_valid_host_special_port(id))
+               panic("attempted to set invalid special port %d", id);
+
 #if !MACH_FLIPC
-    if (id == HOST_NODE_PORT)
-        return (KERN_NOT_SUPPORTED);
+       if (id == HOST_NODE_PORT)
+               return (KERN_NOT_SUPPORTED);
 #endif
 
        host_lock(host_priv);
@@ -1169,7 +1226,7 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port)
        host_unlock(host_priv);
 
 #if MACH_FLIPC
-    if (id == HOST_NODE_PORT)
+       if (id == HOST_NODE_PORT)
                mach_node_port_changed();
 #endif
 
@@ -1184,10 +1241,13 @@ kernel_set_special_port(host_priv_t host_priv, int id, ipc_port_t port)
 kern_return_t
 kernel_get_special_port(host_priv_t host_priv, int id, ipc_port_t * portp)
 {
-        host_lock(host_priv);
-        *portp = host_priv->special[id];
-        host_unlock(host_priv);
-        return (KERN_SUCCESS);
+       if (!is_valid_host_special_port(id))
+               panic("attempted to get invalid special port %d", id);
+
+       host_lock(host_priv);
+       *portp = host_priv->special[id];
+       host_unlock(host_priv);
+       return (KERN_SUCCESS);
 }
 
 /*
@@ -1227,7 +1287,7 @@ host_get_special_port(host_priv_t host_priv, __unused int node, int id, ipc_port
 {
        ipc_port_t port;
 
-       if (host_priv == HOST_PRIV_NULL || id == HOST_SECURITY_PORT || id > HOST_MAX_SPECIAL_PORT || id < 0)
+       if (host_priv == HOST_PRIV_NULL || id == HOST_SECURITY_PORT || id > HOST_MAX_SPECIAL_PORT || id < HOST_MIN_SPECIAL_PORT)
                return (KERN_INVALID_ARGUMENT);
 
        host_lock(host_priv);
index c67af697e073120abc0a8403ec6218a87ab17631..cbccf8c22006f7606bf9b6576b9582b87560a7df 100644 (file)
@@ -42,6 +42,8 @@
 #include <mach/vm_statistics.h>
 #include <kern/processor.h>
 
+extern
+uint64_t get_pages_grabbed_count(void);
 
 #define VM_STAT_INCR(event)                                                                    \
 MACRO_BEGIN                                                                                    \
index 181bb0383f9d79010260599c6bb7a3708dcb5f60..2a216b2fbb96b291e95b23845bc7ced8484f4818 100644 (file)
@@ -89,6 +89,7 @@
 #include <mach/clock_server.h>
 #include <mach/clock_priv_server.h>
 #include <mach/lock_set_server.h>
+#include <mach/memory_entry_server.h>
 #include <mach/memory_object_control_server.h>
 #include <mach/memory_object_default_server.h>
 #include <mach/processor_server.h>
 #endif
 #include <mach/thread_act_server.h>
 
+#include <mach/exc_server.h>
+#include <mach/mach_exc_server.h>
+
 #include <device/device_types.h>
 #include <device/device_server.h>
 
@@ -190,6 +194,7 @@ const struct mig_subsystem *mig_e[] = {
        (const struct mig_subsystem *)&UNDReply_subsystem,
        (const struct mig_subsystem *)&mach_voucher_subsystem,
        (const struct mig_subsystem *)&mach_voucher_attr_control_subsystem,
+       (const struct mig_subsystem *)&memory_entry_subsystem,
 
 #if     XK_PROXY
         (const struct mig_subsystem *)&do_uproxy_xk_uproxy_subsystem,
@@ -200,6 +205,9 @@ const struct mig_subsystem *mig_e[] = {
 #if     MCMSG && iPSC860
        (const struct mig_subsystem *)&mcmsg_info_subsystem,
 #endif  /* MCMSG && iPSC860 */
+        (const struct mig_subsystem *)&catch_exc_subsystem,
+        (const struct mig_subsystem *)&catch_mach_exc_subsystem,
+
 };
 
 void
@@ -269,20 +277,20 @@ ipc_kobject_server(
        task_t task = TASK_NULL;
        uint32_t exec_token;
        boolean_t exec_token_changed = FALSE;
+       int request_msgh_id = request->ikm_header->msgh_id;
 
        /*
         * Find out corresponding mig_hash entry if any
         */
        {
-           int key = request->ikm_header->msgh_id;
-           unsigned int i = (unsigned int)MIG_HASH(key);
+           unsigned int i = (unsigned int)MIG_HASH(request_msgh_id);
            int max_iter = mig_table_max_displ;
 
            do {
                ptr = &mig_buckets[i++ % MAX_MIG_ENTRIES];
-           } while (key != ptr->num && ptr->num && --max_iter);
+           } while (request_msgh_id != ptr->num && ptr->num && --max_iter);
 
-           if (!ptr->routine || key != ptr->num) {
+           if (!ptr->routine || request_msgh_id != ptr->num) {
                ptr = (mig_hash_t *)0;
                reply_size = mig_reply_size;
            } else {
@@ -466,8 +474,7 @@ ipc_kobject_server(
                 */
 #if DEVELOPMENT || DEBUG
                printf("%s: refusing to send reply to kobject %d port (id:%d)\n",
-                      __func__, ip_kotype(replyp),
-                      request->ikm_header->msgh_id);
+                      __func__, ip_kotype(replyp), request_msgh_id);
 #endif /* DEVELOPMENT || DEBUG */
                ipc_kmsg_destroy(reply);
                return IKM_NULL;
index 52431b60e6f46214f25728eb5bdd6ba0a5e4721f..28db4e47df08f405bff3c12461a89991074a1c18 100644 (file)
@@ -129,12 +129,13 @@ typedef natural_t ipc_kobject_type_t;
 #define IKOT_VOUCHER                   37
 #define IKOT_VOUCHER_ATTR_CONTROL      38
 #define IKOT_WORK_INTERVAL              39
+#define IKOT_UX_HANDLER                 40
 
 /*
  * Add new entries here and adjust IKOT_UNKNOWN.
  * Please keep ipc/ipc_object.c:ikot_print_array up to date.
  */
-#define IKOT_UNKNOWN                    40      /* magic catchall       */
+#define IKOT_UNKNOWN                    41      /* magic catchall       */
 #define        IKOT_MAX_TYPE   (IKOT_UNKNOWN+1)        /* # of IKOT_ types     */
 
 
index 8114708a15bab6996dbc51940816cbc7677c221a..ddbfa0e5aa6f8129c572730ec6a065deff3fc46b 100644 (file)
@@ -84,6 +84,9 @@
 
 #include <libkern/OSAtomic.h>
 
+void
+mach_msg_receive_results_complete(ipc_object_t object);
+
 /*
  *     Routine:        mach_msg_send_from_kernel
  *     Purpose:
@@ -410,6 +413,7 @@ mach_msg_rpc_from_kernel_body(
 
        for (;;) {
                ipc_mqueue_t mqueue;
+               ipc_object_t object;
 
                assert(reply->ip_in_pset == 0);
                assert(ip_active(reply));
@@ -434,6 +438,9 @@ mach_msg_rpc_from_kernel_body(
                kmsg = self->ith_kmsg;
                seqno = self->ith_seqno;
 
+               __IGNORE_WCASTALIGN(object = (ipc_object_t) reply);
+               mach_msg_receive_results_complete(object);
+
                if (mr == MACH_MSG_SUCCESS)
                  {
                        break;
@@ -598,6 +605,7 @@ mach_msg_overwrite(
                                               &mqueue, &object);
                        if (mr != MACH_MSG_SUCCESS)
                                return mr;
+
                        /* hold ref for object */
 
                        self->ith_continuation = (void (*)(mach_msg_return_t))0;
@@ -610,6 +618,7 @@ mach_msg_overwrite(
                        kmsg = self->ith_kmsg;
                        seqno = self->ith_seqno;
 
+                       mach_msg_receive_results_complete(object);
                        io_release(object);
 
                } while (mr == MACH_RCV_INTERRUPTED);
index 920ac8fc5829471532ce2cfdd40c68b4dd835b87..6fcef9f174728ca8ce224b30d4178e34db68d11f 100644 (file)
@@ -867,6 +867,7 @@ mach_reply_port(
  *     Conditions:
  *             Nothing locked.
  *     Returns:
+ *             mach_port_name_t: send right & receive right for special reply port.
  *             MACH_PORT_NULL if there are any resource failures
  *             or other errors.
  */
@@ -877,6 +878,7 @@ thread_get_special_reply_port(
 {
        ipc_port_t port;
        mach_port_name_t name;
+       mach_port_name_t send_name;
        kern_return_t kr;
        thread_t thread = current_thread();
 
@@ -891,7 +893,22 @@ thread_get_special_reply_port(
        kr = ipc_port_alloc(current_task()->itk_space, &name, &port);
        if (kr == KERN_SUCCESS) {
                ipc_port_bind_special_reply_port_locked(port);
+
+               /* Make a send right and insert it in the space at specified name */
+               ipc_port_make_send_locked(port);
                ip_unlock(port);
+               send_name = ipc_port_copyout_name_send(port, current_task()->itk_space, name);
+               /*
+                * If insertion of the send right failed, userland is doing something bad; error out.
+                * Either the space was marked inactive or the receive right just inserted above at the
+                * given name was moved; in either case, do not try to deallocate the receive right.
+                */
+               if (send_name == MACH_PORT_NULL || send_name == MACH_PORT_DEAD) {
+                       if (IP_VALID(thread->ith_special_reply_port)) {
+                               ipc_port_unbind_special_reply_port(thread, TRUE);
+                       }
+                       name = MACH_PORT_NULL;
+               }
        } else {
                name = MACH_PORT_NULL;
        }
@@ -918,14 +935,17 @@ ipc_port_bind_special_reply_port_locked(
        ip_reference(port);
        thread->ith_special_reply_port = port;
        port->ip_specialreply = 1;
-       port->ip_link_sync_qos = 1;
+       port->ip_sync_link_state = PORT_SYNC_LINK_ANY;
+
+       reset_ip_srp_bits(port);
 }
 
 /*
  *     Routine:        ipc_port_unbind_special_reply_port
  *     Purpose:
  *             Unbind the thread's special reply port.
- *             If the special port is linked to a port, adjust it's sync qos delta`.
+ *             If the special port has threads waiting on turnstile,
+ *             update its inheritor.
  *     Condition:
  *             Nothing locked.
  *     Returns:
@@ -947,8 +967,8 @@ ipc_port_unbind_special_reply_port(
        }
 
        thread->ith_special_reply_port = NULL;
-       ipc_port_unlink_special_reply_port_locked(special_reply_port, NULL,
-               IPC_PORT_UNLINK_SR_CLEAR_SPECIAL_REPLY);
+       ipc_port_adjust_special_reply_port_locked(special_reply_port, NULL,
+               IPC_PORT_ADJUST_SR_CLEAR_SPECIAL_REPLY, FALSE);
        /* port unlocked */
 
        ip_release(special_reply_port);
@@ -1365,7 +1385,7 @@ task_conversion_eval(task_t caller, task_t victim)
         * Only the kernel can can resolve the kernel's task port. We've established
         * by this point that the caller is not kernel_task.
         */
-       if (victim == kernel_task) {
+       if (victim == TASK_NULL || victim == kernel_task) {
                return KERN_INVALID_SECURITY;
        }
 
@@ -1751,12 +1771,13 @@ convert_port_to_thread(
        if (IP_VALID(port)) {
                ip_lock(port);
 
-               if (    ip_active(port)                                 &&
-                               ip_kotype(port) == IKOT_THREAD          ) {
+               if (ip_active(port) &&
+                   ip_kotype(port) == IKOT_THREAD) {
                        thread = (thread_t)port->ip_kobject;
                        assert(thread != THREAD_NULL);
-                       if (thread->task && thread->task == kernel_task &&
-                           current_task() != kernel_task) {
+
+                       /* Use task conversion rules for thread control conversions */
+                       if (task_conversion_eval(current_task(), thread->task) != KERN_SUCCESS) {
                                ip_unlock(port);
                                return THREAD_NULL;
                        }
index 65e9df392e840f5bed327bf90533516c61759311..6527654f2e5c1e340c93edf941d1a606880cb363 100644 (file)
@@ -716,8 +716,6 @@ kalloc_external(
        return( kalloc_tag_bt(size, VM_KERN_MEMORY_KALLOC) );
 }
 
-volatile SInt32 kfree_nop_count = 0;
-
 void
 kfree(
        void            *data,
@@ -751,28 +749,7 @@ kfree(
                if ((((vm_offset_t) data) >= kalloc_map_min) && (((vm_offset_t) data) <= kalloc_map_max))
                        alloc_map = kalloc_map;
                if (size > kalloc_largest_allocated) {
-                               /*
-                                * work around double FREEs of small MALLOCs
-                                * this used to end up being a nop
-                                * since the pointer being freed from an
-                                * alloc backed by the zalloc world could
-                                * never show up in the kalloc_map... however,
-                                * the kernel_map is a different issue... since it
-                                * was released back into the zalloc pool, a pointer
-                                * would have gotten written over the 'size' that 
-                                * the MALLOC was retaining in the first 4 bytes of
-                                * the underlying allocation... that pointer ends up 
-                                * looking like a really big size on the 2nd FREE and
-                                * pushes the kfree into the kernel_map...  we
-                                * end up removing a ton of virtual space before we panic
-                                * this check causes us to ignore the kfree for a size
-                                * that must be 'bogus'... note that it might not be due
-                                * to the above scenario, but it would still be wrong and
-                                * cause serious damage.
-                                */
-
-                               OSAddAtomic(1, &kfree_nop_count);
-                               return;
+                       panic("kfree: size %lu > kalloc_largest_allocated %lu", (unsigned long)size, (unsigned long)kalloc_largest_allocated);
                }
                kmem_free(alloc_map, (vm_offset_t)data, size);
                kalloc_spin_lock();
@@ -797,7 +774,9 @@ kfree(
                    z, z->zone_name, (unsigned long)size);
 #endif
        assert(size <= z->elem_size);
+#if !KASAN_KALLOC
        DTRACE_VM3(kfree, vm_size_t, size, vm_size_t, z->elem_size, void*, data);
+#endif
        zfree(z, data);
 }
 
index 702bfacbc30ea742facb12a3ce64fba2e4ac6695..e36c55352fdfbdaebccc25e7b25bef2af9b935f0 100644 (file)
@@ -436,47 +436,49 @@ struct kcdata_type_definition {
  * NOTE: Please update kcdata/libkdd/kcdtypes.c if you make any changes
  * in STACKSHOT_KCTYPE_* types.
  */
-#define STACKSHOT_KCTYPE_IOSTATS 0x901u          /* io_stats_snapshot */
-#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u /* struct mem_and_io_snapshot */
+#define STACKSHOT_KCTYPE_IOSTATS 0x901u                   /* io_stats_snapshot */
+#define STACKSHOT_KCTYPE_GLOBAL_MEM_STATS 0x902u          /* struct mem_and_io_snapshot */
 #define STACKSHOT_KCCONTAINER_TASK 0x903u
 #define STACKSHOT_KCCONTAINER_THREAD 0x904u
-#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u         /* task_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u       /* thread_snapshot_v2, thread_snapshot_v3 */
-#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u         /* int[] */
-#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u  /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u           /* char[] */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au       /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu     /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu       /* struct stack_snapshot_frame32 */
-#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du     /* struct stack_snapshot_frame64 */
-#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu              /* boot args string */
-#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu             /* os version string */
-#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u        /* kernel page size in uint32_t */
-#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u          /* jetsam level in uint32_t */
-#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_TASK_SNAPSHOT 0x905u             /* task_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_SNAPSHOT 0x906u           /* thread_snapshot_v2, thread_snapshot_v3 */
+#define STACKSHOT_KCTYPE_DONATING_PIDS 0x907u             /* int[] */
+#define STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO 0x908u      /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_NAME 0x909u               /* char[] */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME 0x90Au           /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_KERN_STACKFRAME64 0x90Bu         /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME 0x90Cu           /* struct stack_snapshot_frame32 */
+#define STACKSHOT_KCTYPE_USER_STACKFRAME64 0x90Du         /* struct stack_snapshot_frame64 */
+#define STACKSHOT_KCTYPE_BOOTARGS 0x90Eu                  /* boot args string */
+#define STACKSHOT_KCTYPE_OSVERSION 0x90Fu                 /* os version string */
+#define STACKSHOT_KCTYPE_KERN_PAGE_SIZE 0x910u            /* kernel page size in uint32_t */
+#define STACKSHOT_KCTYPE_JETSAM_LEVEL 0x911u              /* jetsam level in uint32_t */
+#define STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP 0x912u     /* timestamp used for the delta stackshot */
+#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u              /* uint32_t */
+#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u            /* uint64_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u              /* uint32_t */
+#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u            /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u          /* uint64_t */
+#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u         /* uint64_t */
+#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u                 /* struct stackshot_cpu_times or stackshot_cpu_times_v2 */
+#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au        /* struct stackshot_duration */
+#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu     /* struct stackshot_fault_stats */
+#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu     /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
+#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du           /* struct stackshot_thread_waitinfo */
+#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu     /* struct thread_group_snapshot or thread_group_snapshot_v2 */
+#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu              /* uint64_t */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
+#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u          /* uint64_t */
+#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u     /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
+#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u             /* struct instrs_cycles_snapshot */
+#define STACKSHOT_KCTYPE_USER_STACKTOP 0x924u             /* struct stack_snapshot_stacktop */
+#define STACKSHOT_KCTYPE_ASID 0x925u                      /* uint32_t */
+#define STACKSHOT_KCTYPE_PAGE_TABLES 0x926u               /* uint64_t */
+#define STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT 0x927u    /* same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
 
 #define STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT 0x940u   /* task_delta_snapshot_v2 */
 #define STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT 0x941u /* thread_delta_snapshot_v* */
 
-#define STACKSHOT_KCTYPE_KERN_STACKLR 0x913u          /* uint32_t */
-#define STACKSHOT_KCTYPE_KERN_STACKLR64 0x914u        /* uint64_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR 0x915u          /* uint32_t */
-#define STACKSHOT_KCTYPE_USER_STACKLR64 0x916u        /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TIDS 0x917u      /* uint64_t */
-#define STACKSHOT_KCTYPE_NONRUNNABLE_TASKS 0x918u     /* uint64_t */
-#define STACKSHOT_KCTYPE_CPU_TIMES 0x919u             /* struct stackshot_cpu_times */
-#define STACKSHOT_KCTYPE_STACKSHOT_DURATION 0x91au    /* struct stackshot_duration */
-#define STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS 0x91bu /* struct stackshot_fault_stats */
-#define STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO  0x91cu /* kernelcache UUID -- same as KCDATA_TYPE_LIBRARY_LOADINFO64 */
-#define STACKSHOT_KCTYPE_THREAD_WAITINFO 0x91du       /* struct stackshot_thread_waitinfo */
-#define STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT 0x91eu /* struct thread_group_snapshot or thread_group_snapshot_v2 */
-#define STACKSHOT_KCTYPE_THREAD_GROUP 0x91fu          /* uint64_t */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT 0x920u /* struct jetsam_coalition_snapshot */
-#define STACKSHOT_KCTYPE_JETSAM_COALITION 0x921u      /* uint64_t */
-#define STACKSHOT_KCTYPE_INSTRS_CYCLES 0x923u         /* struct instrs_cycles_snapshot */
-
-#define STACKSHOT_KCTYPE_THREAD_POLICY_VERSION 0x922u /* THREAD_POLICY_INTERNAL_STRUCT_VERSION in uint32 */
-
 struct stack_snapshot_frame32 {
        uint32_t lr;
        uint32_t sp;
@@ -537,6 +539,10 @@ enum task_snapshot_flags {
        kTaskUUIDInfoMissing                  = 0x200000, /* some UUID info was paged out */
        kTaskUUIDInfoTriedFault               = 0x400000, /* tried to fault in UUID info */
        kTaskSharedRegionInfoUnavailable      = 0x800000,  /* shared region info unavailable */
+       kTaskTALEngaged                       = 0x1000000,
+       /* 0x2000000 unused */
+       kTaskIsDirtyTracked                   = 0x4000000,
+       kTaskAllowIdleExit                    = 0x8000000,
 };
 
 enum thread_snapshot_flags {
@@ -785,6 +791,12 @@ struct stackshot_cpu_times {
        uint64_t system_usec;
 } __attribute__((packed));
 
+struct stackshot_cpu_times_v2 {
+       uint64_t user_usec;
+       uint64_t system_usec;
+       uint64_t runnable_usec;
+} __attribute__((packed));
+
 struct stackshot_duration {
        uint64_t stackshot_duration;
        uint64_t stackshot_duration_outer;
@@ -813,6 +825,12 @@ typedef struct stackshot_thread_waitinfo {
 #define STACKSHOT_WAITOWNER_SUSPENDED      (UINT64_MAX - 7) /* workloop is suspended */
 
 
+struct stack_snapshot_stacktop {
+       uint64_t sp;
+       uint8_t stack_contents[8];
+};
+
+
 /**************** definitions for crashinfo *********************/
 
 /*
@@ -866,6 +884,22 @@ struct crashinfo_proc_uniqidentifierinfo {
 #define TASK_CRASHINFO_UDATA_PTRS           0x81C  /* uint64_t */
 #define TASK_CRASHINFO_MEMORY_LIMIT         0x81D  /* uint64_t */
 
+#define TASK_CRASHINFO_LEDGER_INTERNAL                          0x81E /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_INTERNAL_COMPRESSED               0x81F /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_IOKIT_MAPPED                      0x820 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING              0x821 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_ALTERNATE_ACCOUNTING_COMPRESSED   0x822 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE             0x823 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PURGEABLE_NONVOLATILE_COMPRESSED  0x824 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PAGE_TABLE                        0x825 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT                    0x826 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_PHYS_FOOTPRINT_LIFETIME_MAX       0x827 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE               0x828 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_NETWORK_NONVOLATILE_COMPRESSED    0x829 /* uint64_t */
+#define TASK_CRASHINFO_LEDGER_WIRED_MEM                         0x82A /* uint64_t */
+
+
+
 #define TASK_CRASHINFO_END                  KCDATA_TYPE_BUFFER_END
 
 /**************** definitions for os reasons *********************/
@@ -963,7 +997,7 @@ kcdata_iter_type(kcdata_iter_t iter)
 static inline uint32_t
 kcdata_calc_padding(uint32_t size)
 {
-       /* calculate number of bits to add to size to get something divisible by 16 */
+       /* calculate number of bytes to add to size to get something divisible by 16 */
        return (-size) & 0xf;
 }
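
The hunk above corrects the comment: kcdata records are padded to 16-byte boundaries, and (-size) & 0xf yields the number of padding bytes needed to reach the next multiple of 16. A few illustrative values (not from the commit; assumes 32-bit unsigned arithmetic):

    kcdata_calc_padding(16);  /* -> 0,  already 16-byte aligned */
    kcdata_calc_padding(20);  /* -> 12, since 20 + 12 == 32     */
    kcdata_calc_padding(33);  /* -> 15, since 33 + 15 == 48     */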
 
index 97bee226c7dd956196491fae6ab3b291ef45aef3..71a2368f6d32dada565d147951a309b193fab604 100644 (file)
@@ -185,11 +185,13 @@ static kern_return_t kcdata_get_memory_addr_with_flavor(
                uint64_t flags,
                mach_vm_address_t *user_addr)
 {
+       kern_return_t kr;
        struct kcdata_item info;
 
        uint32_t orig_size = size;
        /* make sure 16 byte aligned */
-       size += kcdata_calc_padding(size);
+       uint32_t padding = kcdata_calc_padding(size);
+       size += padding;
        uint32_t total_size  = size + sizeof(info);
 
        if (user_addr == NULL || data == NULL || total_size + sizeof(info) < orig_size) {
@@ -207,14 +209,18 @@ static kern_return_t kcdata_get_memory_addr_with_flavor(
                return KERN_RESOURCE_SHORTAGE;
        }
 
-       if (data->kcd_flags & KCFLAG_USE_COPYOUT) {
-               if (copyout(&info, data->kcd_addr_end, sizeof(info)))
-                       return KERN_NO_ACCESS;
-       } else {
-               memcpy((void *)data->kcd_addr_end, &info, sizeof(info));
-       }
+       kr = kcdata_memcpy(data, data->kcd_addr_end, &info, sizeof(info));
+       if (kr)
+               return kr;
 
        data->kcd_addr_end += sizeof(info);
+
+       if (padding) {
+               kr = kcdata_bzero(data, data->kcd_addr_end + size - padding, padding);
+               if (kr)
+                       return kr;
+       }
+
        *user_addr = data->kcd_addr_end;
        data->kcd_addr_end += size;
 
@@ -317,7 +323,7 @@ kcdata_undo_add_container_begin(kcdata_descriptor_t data)
  * returns: KERN_NO_ACCESS if copyout fails.
  */
 
-kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void *src_addr, uint32_t size)
+kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, const void *src_addr, uint32_t size)
 {
        if (data->kcd_flags & KCFLAG_USE_COPYOUT) {
                if (copyout(src_addr, dst_addr, size))
@@ -328,6 +334,30 @@ kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr
        return KERN_SUCCESS;
 }
 
+/*
+ * Routine: kcdata_bzero
+ * Desc: zero out a portion of a kcdata buffer.
+ */
+kern_return_t
+kcdata_bzero(kcdata_descriptor_t data, mach_vm_address_t dst_addr, uint32_t size)
+{
+       kern_return_t kr = KERN_SUCCESS;
+       if (data->kcd_flags & KCFLAG_USE_COPYOUT) {
+               uint8_t zeros[16] = {};
+               while (size) {
+                       uint32_t block_size = MIN(size, 16);
+                       kr = copyout(&zeros, dst_addr, block_size);
+                       if (kr)
+                               return KERN_NO_ACCESS;
+                       /* advance the destination so the whole range gets zeroed */
+                       dst_addr += block_size;
+                       size -= block_size;
+               }
+               return KERN_SUCCESS;
+       } else {
+               bzero((void*)dst_addr, size);
+               return KERN_SUCCESS;
+       }
+}
+
 /*
  * Routine: kcdata_add_type_definition
  * Desc: add type definition to kcdata buffer.
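
Taken together, the changes to kcdata_get_memory_addr_with_flavor() and the new kcdata_bzero() routine ensure that the alignment padding reserved after each payload is explicitly zeroed rather than left holding stale buffer contents. A rough sketch of the resulting record layout for a 20-byte payload (illustrative only; sizes follow from the 16-byte alignment noted above):

    | kcdata_item info header | payload (20 B) | padding (12 B, zeroed)      |
        kcdata_memcpy()           *user_addr       kcdata_bzero() at
                                                   kcd_addr_end + size - padding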
index ce49bf679ab083daca659f46b23b81b10e0f87bf..39739d76e699d4c00e0b3dd04cf549535f5a486e 100644 (file)
@@ -101,7 +101,8 @@ typedef void * kcdata_descriptor_t;
 
 uint32_t kcdata_estimate_required_buffer_size(uint32_t num_items, uint32_t payload_size);
 uint64_t kcdata_memory_get_used_bytes(kcdata_descriptor_t kcd);
-kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, void * src_addr, uint32_t size);
+kern_return_t kcdata_memcpy(kcdata_descriptor_t data, mach_vm_address_t dst_addr, const void * src_addr, uint32_t size);
+kern_return_t kcdata_bzero(kcdata_descriptor_t data, mach_vm_address_t dst_addr, uint32_t size);
 kern_return_t kcdata_get_memory_addr(kcdata_descriptor_t data, uint32_t type, uint32_t size, mach_vm_address_t * user_addr);
 kern_return_t kcdata_get_memory_addr_for_array(
     kcdata_descriptor_t data, uint32_t type_of_element, uint32_t size_of_element, uint32_t count, mach_vm_address_t * user_addr);
diff --git a/osfmk/kern/kern_ecc.c b/osfmk/kern/kern_ecc.c
deleted file mode 100644 (file)
index d62ab81..0000000
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
-* Copyright (c) 2013 Apple Inc. All rights reserved.
-*
-* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
-* 
-* This file contains Original Code and/or Modifications of Original Code
-* as defined in and that are subject to the Apple Public Source License
-* Version 2.0 (the 'License'). You may not use this file except in
-* compliance with the License. The rights granted to you under the License
-* may not be used to create, or enable the creation or redistribution of,
-* unlawful or unlicensed copies of an Apple operating system, or to
-* circumvent, violate, or enable the circumvention or violation of, any
-* terms of an Apple operating system software license agreement.
-* 
-* Please obtain a copy of the License at
-* http://www.opensource.apple.com/apsl/ and read it before using this file.
-* 
-* The Original Code and all software distributed under the License are
-* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
-* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
-* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
-* Please see the License for the specific language governing rights and
-* limitations under the License.
-* 
-* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
-*/
-#include <mach/mach_types.h>
-#include <mach/host_info.h>
-#include <kern/locks.h>
-#include <kern/ecc.h>
-#include <kern/spl.h>
-#include <pexpert/pexpert.h>
-#include <libkern/OSAtomic.h>
-
-/*
- * ECC data.  Not really KPCs, but this still seems like the
- * best home for this code.
- *
- * Circular buffer of events.  When we fill up, drop data.
- */
-#define ECC_EVENT_BUFFER_COUNT 5
-struct ecc_event               ecc_data[ECC_EVENT_BUFFER_COUNT];
-static uint32_t                        ecc_data_next_read; 
-static uint32_t                        ecc_data_next_write; 
-static boolean_t               ecc_data_empty = TRUE; // next read == next write : empty or full?
-static lck_grp_t               *ecc_data_lock_group;
-static lck_spin_t              ecc_data_lock;
-static uint32_t                        ecc_correction_count;
-
-void
-ecc_log_init()
-{
-       ecc_data_lock_group = lck_grp_alloc_init("ecc-data", NULL);
-       lck_spin_init(&ecc_data_lock, ecc_data_lock_group, NULL);
-       OSMemoryBarrier();
-}
-
-uint32_t
-ecc_log_get_correction_count()
-{
-       return ecc_correction_count;
-}
-
-kern_return_t
-ecc_log_record_event(const struct ecc_event *ev)
-{
-       spl_t x;
-
-       if (ev->count > ECC_EVENT_INFO_DATA_ENTRIES) {
-               panic("Count of %u on ecc event is too large.", (unsigned)ev->count);
-       }
-
-       x = splhigh();
-       lck_spin_lock(&ecc_data_lock);
-
-       ecc_correction_count++;
-
-       if (ecc_data_next_read == ecc_data_next_write && !ecc_data_empty)  {
-               lck_spin_unlock(&ecc_data_lock);
-               splx(x);
-               return KERN_FAILURE;
-       }
-
-       bcopy(ev, &ecc_data[ecc_data_next_write], sizeof(*ev));
-       ecc_data_next_write++;
-       ecc_data_next_write %= ECC_EVENT_BUFFER_COUNT;
-       ecc_data_empty = FALSE;
-
-       lck_spin_unlock(&ecc_data_lock);
-       splx(x);
-
-       return KERN_SUCCESS;
-}
-
-
-kern_return_t
-ecc_log_get_next_event(struct ecc_event *ev)
-{
-       spl_t x;
-
-       x = splhigh();
-       lck_spin_lock(&ecc_data_lock);
-
-       if (ecc_data_empty)  {
-               assert(ecc_data_next_write == ecc_data_next_read);
-
-               lck_spin_unlock(&ecc_data_lock);
-               splx(x);
-               return KERN_FAILURE;
-       }
-
-       bcopy(&ecc_data[ecc_data_next_read], ev, sizeof(*ev));
-       ecc_data_next_read++;
-       ecc_data_next_read %= ECC_EVENT_BUFFER_COUNT;
-
-       if (ecc_data_next_read == ecc_data_next_write) {
-               ecc_data_empty = TRUE;
-       }
-
-       lck_spin_unlock(&ecc_data_lock);
-       splx(x);
-
-       return KERN_SUCCESS;
-}
index 92bacff0326f420ec040d170694b8928f9b4dea0..0c9d825e93be769e61f4fa5ab60d35e45877c503 100644 (file)
@@ -61,6 +61,12 @@ _Atomic uint64_t mt_retrograde = 0;
 #define MAXSPINS   100
 #define MAXRETRIES 10
 
+/*
+ * Write the fixed counter values for the thread `thread` into `counts_out`.
+ *
+ * This function does not include the accumulated counter values since the
+ * thread's last context switch or quantum expiration.
+ */
 int
 mt_fixed_thread_counts(thread_t thread, uint64_t *counts_out)
 {
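
A purely hypothetical caller sketch for the routine documented above (not part of this commit; it assumes counts_out must have room for MT_CORE_NFIXED entries and that a zero return indicates success):

    uint64_t counts[MT_CORE_NFIXED] = { 0 };
    if (mt_fixed_thread_counts(thread, counts) == 0) {
            uint64_t thread_cycles = counts[MT_CORE_CYCLES];
            /* per-thread cycle count, excluding events since the last
             * context switch or quantum expiration, per the comment above */
    }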
@@ -521,3 +527,54 @@ mt_stackshot_task(task_t task, uint64_t *instrs, uint64_t *cycles)
 
        *cycles = task->task_monotonic.mtk_counts[MT_CORE_CYCLES];
 }
+
+/*
+ * Maintain reset values for the fixed instruction and cycle counters so
+ * clients can be notified after a given number of those events occur.  This is
+ * only used by microstackshot.
+ */
+
+bool mt_microstackshots = false;
+unsigned int mt_microstackshot_ctr = 0;
+mt_pmi_fn mt_microstackshot_pmi_handler = NULL;
+void *mt_microstackshot_ctx = NULL;
+uint64_t mt_core_reset_values[MT_CORE_NFIXED] = { 0 };
+
+#define MT_MIN_FIXED_PERIOD (10 * 1000 * 1000)
+
+int
+mt_microstackshot_start(unsigned int ctr, uint64_t period, mt_pmi_fn handler,
+               void *ctx)
+{
+       assert(ctr < MT_CORE_NFIXED);
+
+       if (period < MT_MIN_FIXED_PERIOD) {
+               return EINVAL;
+       }
+       if (mt_microstackshots) {
+               return EBUSY;
+       }
+
+       mt_microstackshot_ctr = ctr;
+       mt_microstackshot_pmi_handler = handler;
+       mt_microstackshot_ctx = ctx;
+
+       int error = mt_microstackshot_start_arch(period);
+       if (error) {
+               return error;
+       }
+
+       mt_microstackshots = true;
+
+       return 0;
+}
+
+int
+mt_microstackshot_stop(void)
+{
+       mt_microstackshots = false;
+       memset(mt_core_reset_values, 0, sizeof(mt_core_reset_values));
+
+       return 0;
+}
+
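
To illustrate how these new entry points might be driven, a minimal sketch (not part of the commit; the mt_pmi_fn handler is left opaque because its typedef is not shown in this hunk, and the 50M-cycle period is an arbitrary example):

    static int
    start_cycle_sampling(mt_pmi_fn handler, void *ctx)
    {
            /* the period must be at least MT_MIN_FIXED_PERIOD (10M events) */
            int err = mt_microstackshot_start(MT_CORE_CYCLES, 50 * 1000 * 1000,
                handler, ctx);
            if (err) {
                    return err; /* EINVAL: period too small; EBUSY: already sampling */
            }
            /* ... later, tear down with mt_microstackshot_stop() ... */
            return 0;
    }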
index 0c1e07bf17d08993c40af34716f43f82d8b5f2ec..28d6270fa5c2c4d74f324c1af738b065c3fa9a85 100644 (file)
@@ -45,6 +45,7 @@
 
 #include <libsa/types.h>
 #include <libkern/version.h>
+#include <libkern/section_keywords.h>
 
 #include <string.h> /* bcopy */
 
 #include <vm/vm_shared_region.h>
 #include <libkern/OSKextLibPrivate.h>
 
+#if defined(__x86_64__)
+#include <i386/mp.h>
+#include <i386/cpu_threads.h>
+#endif
+
 #if CONFIG_EMBEDDED
 #include <pexpert/pexpert.h> /* For gPanicBase/gPanicBase */
 #endif
 
 extern unsigned int not_in_kdp;
 
-#if CONFIG_EMBEDDED
-uuid_t kernelcache_uuid;
-#endif
 
 /* indicate to the compiler that some accesses are unaligned */
 typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
 
 extern addr64_t kdp_vtophys(pmap_t pmap, addr64_t va);
-extern void * proc_get_uthread_uu_threadlist(void * uthread_v);
 
 int kdp_snapshot                            = 0;
 static kern_return_t stack_snapshot_ret     = 0;
@@ -134,18 +136,20 @@ static void               stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t c
 #endif /* CONFIG_COALITIONS */
 
 
-extern uint32_t workqueue_get_pwq_state_kdp(void *proc);
+extern uint32_t        workqueue_get_pwq_state_kdp(void *proc);
 
 extern int             proc_pid(void *p);
 extern uint64_t                proc_uniqueid(void *p);
 extern uint64_t                proc_was_throttled(void *p);
 extern uint64_t                proc_did_throttle(void *p);
-static uint64_t proc_did_throttle_from_task(task_t task);
-extern void proc_name_kdp(task_t task, char * buf, int size);
-extern int proc_threadname_kdp(void * uth, char * buf, size_t size);
-extern void proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime);
+extern int             proc_exiting(void *p);
+extern int             proc_in_teardown(void *p);
+static uint64_t        proc_did_throttle_from_task(task_t task);
+extern void            proc_name_kdp(task_t task, char * buf, int size);
+extern int             proc_threadname_kdp(void * uth, char * buf, size_t size);
+extern void            proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime);
 extern int             memorystatus_get_pressure_status_kdp(void);
-extern boolean_t memorystatus_proc_is_dirty_unsafe(void * v);
+extern void            memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
 
 extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */
 extern void bcopy_phys(addr64_t, addr64_t, vm_size_t);
@@ -217,6 +221,8 @@ static lck_mtx_t    stackshot_subsys_mutex;
 #define SANE_BOOTPROFILE_TRACEBUF_SIZE (64 * 1024 * 1024)
 #define SANE_TRACEBUF_SIZE (8 * 1024 * 1024)
 
+SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE;
+
 /*
  * We currently set a ceiling of 3 milliseconds spent in the kdp fault path
  * for non-panic stackshots where faulting is requested.
@@ -245,6 +251,8 @@ stackshot_init( void )
 
        clock_timebase_info(&timebase);
        fault_stats.sfs_system_max_fault_time = ((KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS * timebase.denom)/ timebase.numer);
+
+       PE_parse_boot_argn("stackshot_maxsz", &max_tracebuf_size, sizeof(max_tracebuf_size));
 }
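
With max_tracebuf_size initialized here from the stackshot_maxsz boot-arg, the stackshot buffer ceiling is no longer hard-wired to SANE_TRACEBUF_SIZE; the comparisons in the hunks below now check max_tracebuf_size instead. As a purely illustrative example (the value is hypothetical, not taken from this commit), booting with

    stackshot_maxsz=16777216

would raise the per-stackshot buffer cap to 16 MB.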
 
 /* 
@@ -275,7 +283,38 @@ static uint64_t safe_grab_timer_value(struct timer *t)
 static kern_return_t 
 stackshot_trap()
 {
-       return DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, FALSE, 0);
+       kern_return_t   rv;
+
+#if defined(__x86_64__)
+       /*
+        * mp_rendezvous and stackshot both capture the system's CPUs before performing
+        * their operation, so the two mechanisms must be mutually exclusive: if they
+        * race to capture CPUs at the same time, they can deadlock.
+        *
+        * Further, invoking stackshot from within mp_rendezvous*() is not allowed, so
+        * we check that no rendezvous is in progress before trying to grab the lock
+        * (if one is, grabbing the lock would deadlock).  The mp rendezvous action
+        * function sets cpu_rendezvous_in_progress to TRUE, so if stackshot_trap()
+        * is reached anywhere in the call chain of the rendezvous action, the flag
+        * will be set and we can detect the deadlock that would otherwise occur when
+        * this thread tried to grab the rendezvous lock.
+        */
+
+       if (current_cpu_datap()->cpu_rendezvous_in_progress == TRUE) {
+               panic("Calling stackshot from a rendezvous is not allowed!");
+       }
+
+       mp_rendezvous_lock();
+#endif
+
+       rv = DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, NULL, FALSE, 0);
+
+#if defined(__x86_64__)
+       mp_rendezvous_unlock();
+#endif
+       return (rv);
 }
 
 
@@ -295,9 +334,9 @@ stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint32_t flags, ui
                return KERN_INVALID_ARGUMENT;
        }
 
-       /* cap in individual stackshot to SANE_TRACEBUF_SIZE */
-       if (size > SANE_TRACEBUF_SIZE) {
-               size = SANE_TRACEBUF_SIZE;
+       /* cap in individual stackshot to max_tracebuf_size */
+       if (size > max_tracebuf_size) {
+               size = max_tracebuf_size;
        }
 
        /* Serialize tracing */
@@ -375,7 +414,7 @@ stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flag
        STACKSHOT_SUBSYS_LOCK();
 
        if (flags & STACKSHOT_GET_MICROSTACKSHOT) {
-               if (tracebuf_size > SANE_TRACEBUF_SIZE) {
+               if (tracebuf_size > max_tracebuf_size) {
                        error = KERN_INVALID_ARGUMENT;
                        goto unlock_exit;
                }
@@ -517,7 +556,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
                        pid = config->sc_pid;
                        flags = config->sc_flags;
                        since_timestamp = config->sc_delta_timestamp;
-                       if (config->sc_size <= SANE_TRACEBUF_SIZE) {
+                       if (config->sc_size <= max_tracebuf_size) {
                                size_hint = config->sc_size;
                        }
                        break;
@@ -610,7 +649,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
 
        stackshotbuf_size = get_stackshot_estsize(size_hint);
 
-       for (; stackshotbuf_size <= SANE_TRACEBUF_SIZE; stackshotbuf_size <<= 1) {
+       for (; stackshotbuf_size <= max_tracebuf_size; stackshotbuf_size <<= 1) {
                if (kmem_alloc(kernel_map, (vm_offset_t *)&stackshotbuf, stackshotbuf_size, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
                        error = KERN_RESOURCE_SHORTAGE;
                        goto error_exit;
@@ -699,7 +738,7 @@ kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_confi
                goto error_exit;
        }
 
-       if (stackshotbuf_size > SANE_TRACEBUF_SIZE) {
+       if (stackshotbuf_size > max_tracebuf_size) {
                error = KERN_RESOURCE_SHORTAGE;
        }
 
@@ -794,11 +833,11 @@ static uint64_t
 kcdata_get_task_ss_flags(task_t task)
 {
        uint64_t ss_flags = 0;
-       boolean_t task64 = task_has_64BitAddr(task);
+       boolean_t task_64bit_addr = task_has_64Bit_addr(task);
 
-       if (task64)
+       if (task_64bit_addr)
                ss_flags |= kUser64_p;
-       if (!task->active || task_is_a_corpse(task))
+       if (!task->active || task_is_a_corpse(task) || proc_exiting(task->bsd_info))
                ss_flags |= kTerminatedSnapshot;
        if (task->pidsuspended)
                ss_flags |= kPidSuspended;
@@ -813,9 +852,19 @@ kcdata_get_task_ss_flags(task_t task)
        if (task->effective_policy.tep_sup_active == 1)
                ss_flags |= kTaskIsSuppressed;
 #if CONFIG_MEMORYSTATUS
-       if (memorystatus_proc_is_dirty_unsafe(task->bsd_info))
+
+       boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE;
+       memorystatus_proc_flags_unsafe(task->bsd_info, &dirty, &dirty_tracked, &allow_idle_exit);
+       if (dirty)
                ss_flags |= kTaskIsDirty;
+       if (dirty_tracked)
+               ss_flags |= kTaskIsDirtyTracked;
+       if (allow_idle_exit)
+               ss_flags |= kTaskAllowIdleExit;
+
 #endif
+       if (task->effective_policy.tep_tal_engaged)
+               ss_flags |= kTaskTALEngaged;
 
        ss_flags |= (0x7 & workqueue_get_pwq_state_kdp(task->bsd_info)) << 17;
 
@@ -827,19 +876,17 @@ kcdata_get_task_ss_flags(task_t task)
                        ss_flags |= kTaskIsLiveImpDonor;
        }
 #endif
-
        return ss_flags;
 }
 
 static kern_return_t
-kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, struct dyld_uuid_info_64_v2 *sys_shared_cache_loadinfo, unaligned_u64 *task_snap_ss_flags)
+kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_u64 *task_snap_ss_flags)
 {
        kern_return_t error = KERN_SUCCESS;
        mach_vm_address_t out_addr = 0;
 
        uint64_t shared_cache_slide = 0;
        uint64_t shared_cache_base_address = 0;
-       int task_pid = pid_from_task(task);
        uint32_t kdp_fault_results = 0;
 
        assert(task_snap_ss_flags != NULL);
@@ -863,22 +910,9 @@ kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, struct dyl
         */
        shared_cache_slide = task->shared_region->sr_slide_info.slide;
 
-       if (sys_shared_cache_loadinfo) {
-               if (task_pid == 1) {
-                       /* save launchd's shared cache info as system level */
-                       stackshot_memcpy(sys_shared_cache_loadinfo->imageUUID, &task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid));
-                       sys_shared_cache_loadinfo->imageLoadAddress = shared_cache_slide;
-                       sys_shared_cache_loadinfo->imageSlidBaseAddress = shared_cache_slide + task->shared_region->sr_base_address;
-
-                       goto error_exit;
-               } else {
-                       if (shared_cache_slide == sys_shared_cache_loadinfo->imageLoadAddress &&
-                           0 == memcmp(&task->shared_region->sr_uuid, sys_shared_cache_loadinfo->imageUUID,
-                                       sizeof(task->shared_region->sr_uuid))) {
-                               /* skip adding shared cache info. its same as system level one */
-                               goto error_exit;
-                       }
-               }
+       if (task->shared_region == init_task_shared_region) {
+               /* skip adding shared cache info -- it's the same as the system level one */
+               goto error_exit;
        }
 
        kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64_v2), &out_addr));
@@ -908,8 +942,6 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla
 {
        boolean_t save_loadinfo_p         = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0);
        boolean_t save_kextloadinfo_p     = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0);
-       boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
-       boolean_t minimize_uuids          = collect_delta_stackshot && ((trace_flags & STACKSHOT_TAILSPIN) != 0);
        boolean_t should_fault            = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING);
 
        kern_return_t error        = KERN_SUCCESS;
@@ -923,11 +955,11 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla
        assert(task_snap_ss_flags != NULL);
 
        int task_pid     = pid_from_task(task);
-       boolean_t task64 = task_has_64BitAddr(task);
+       boolean_t task_64bit_addr = task_has_64Bit_addr(task);
 
        if (save_loadinfo_p && have_pmap && task->active && task_pid > 0) {
                /* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */
-               if (task64) {
+               if (task_64bit_addr) {
                        struct user64_dyld_all_image_infos task_image_infos;
                        if (kdp_copyin(task->map, task->all_image_info_addr, &task_image_infos,
                                       sizeof(struct user64_dyld_all_image_infos), should_fault, &kdp_fault_results)) {
@@ -968,13 +1000,10 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla
        }
 
        if (task_pid > 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
-               if (minimize_uuids && uuid_info_timestamp != 0 && uuid_info_timestamp < stack_snapshot_delta_since_timestamp)
-                       goto error_exit;
-
-               uint32_t uuid_info_size       = (uint32_t)(task64 ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
+               uint32_t uuid_info_size       = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
                uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
 
-               kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task64 ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
+               kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
                                                                        uuid_info_size, uuid_info_count, &out_addr));
 
                /* Copy in the UUID info array
@@ -985,15 +1014,12 @@ kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint32_t trace_fla
                }
 
        } else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
-               if (minimize_uuids && gLoadedKextSummaries != 0 && gLoadedKextSummariesTimestamp < stack_snapshot_delta_since_timestamp)
-                       goto error_exit;
-
                uintptr_t image_load_address;
 
                do {
 
 #if CONFIG_EMBEDDED
-                       if (!save_kextloadinfo_p) {
+                       if (kernelcache_uuid_valid && !save_kextloadinfo_p) {
                                kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64), &out_addr));
                                struct dyld_uuid_info_64 *kc_uuid = (struct dyld_uuid_info_64 *)out_addr;
                                kc_uuid->imageLoadAddress = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
@@ -1102,10 +1128,15 @@ static kern_return_t
 kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, unaligned_u64 **task_snap_ss_flags)
 {
        boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
-       boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS);
+       boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
 #if MONOTONIC
        boolean_t collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
 #endif /* MONOTONIC */
+#if __arm__ || __arm64__
+       boolean_t collect_asid            = ((trace_flags & STACKSHOT_ASID) != 0);
+#endif
+       boolean_t collect_pagetables       = ((trace_flags & STACKSHOT_PAGE_TABLES) != 0);
+
 
        kern_return_t error                 = KERN_SUCCESS;
        mach_vm_address_t out_addr          = 0;
@@ -1118,8 +1149,8 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace
        uint64_t proc_starttime_secs = 0;
 
        kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v2), &out_addr));
-
        cur_tsnap = (struct task_snapshot_v2 *)out_addr;
+       bzero(cur_tsnap, sizeof(*cur_tsnap));
 
        cur_tsnap->ts_unique_pid = task_uniqueid;
        cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task);
@@ -1129,28 +1160,49 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace
 
        proc_starttime_kdp(task->bsd_info, &proc_starttime_secs, NULL, NULL);
        cur_tsnap->ts_p_start_sec = proc_starttime_secs;
-
-#if CONFIG_EMBEDDED
        cur_tsnap->ts_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
-#else
-       cur_tsnap->ts_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0;
-#endif
        cur_tsnap->ts_max_resident_size = get_task_resident_max(task);
+       cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task);
+       cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task);
+
        cur_tsnap->ts_suspend_count = task->suspend_count;
        cur_tsnap->ts_faults = task->faults;
        cur_tsnap->ts_pageins = task->pageins;
        cur_tsnap->ts_cow_faults = task->cow_faults;
-       cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task);
-       cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task);
        cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ?
                LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.tep_latency_qos);
        cur_tsnap->ts_pid = task_pid;
 
+#if __arm__ || __arm64__
+       if (collect_asid && have_pmap) {
+               uint32_t asid = task->map->pmap->asid;
+               kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
+               stackshot_memcpy((void*)out_addr, &asid, sizeof(asid));
+       }
+#endif
+       if (collect_pagetables && have_pmap) {
+#if INTERRUPT_MASKED_DEBUG
+               // pagetable dumps can be large; reset the interrupt timeout to avoid a panic
+               ml_spin_debug_clear_self();
+#endif
+               size_t bytes_dumped = pmap_dump_page_tables(task->map->pmap, kcd_end_address(kcd), kcd_max_address(kcd));
+               if (bytes_dumped == 0) {
+                       error = KERN_INSUFFICIENT_BUFFER_SIZE;
+                       goto error_exit;
+               } else if (bytes_dumped == (size_t)-1) {
+                       error = KERN_NOT_SUPPORTED;
+                       goto error_exit;
+               } else {
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, STACKSHOT_KCTYPE_PAGE_TABLES,
+                           sizeof(uint64_t), (uint32_t)(bytes_dumped / sizeof(uint64_t)), &out_addr));
+               }
+       }
+
        /* Add the BSD process identifiers */
        if (task_pid != -1 && task->bsd_info != NULL) {
                proc_name_kdp(task, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm));
 #if CONFIG_COALITIONS
-               if (trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
+               if ((trace_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) && (task->coalition[COALITION_TYPE_JETSAM] != NULL)) {
                        uint64_t jetsam_coal_id = coalition_id(task->coalition[COALITION_TYPE_JETSAM]);
                        kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_JETSAM_COALITION, sizeof(jetsam_coal_id), &out_addr));
                        stackshot_memcpy((void*)out_addr, &jetsam_coal_id, sizeof(jetsam_coal_id));
@@ -1162,7 +1214,7 @@ kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace
 #if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG)
                if (task->task_imp_base != NULL) {
                        stackshot_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0],
-                               MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm)));
+                                                         MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm)));
                }
 #endif /* IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) */
        }
@@ -1184,13 +1236,18 @@ error_exit:
 static kern_return_t
 kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t trace_flags, boolean_t have_pmap, unaligned_u64 **task_snap_ss_flags)
 {
+#if !MONOTONIC
+#pragma unused(trace_flags)
+#endif /* !MONOTONIC */
        kern_return_t error                       = KERN_SUCCESS;
        struct task_delta_snapshot_v2 * cur_tsnap = NULL;
        mach_vm_address_t out_addr                = 0;
+       (void) trace_flags;
+#if __arm__ || __arm64__
+       boolean_t collect_asid                    = ((trace_flags & STACKSHOT_ASID) != 0);
+#endif
 #if MONOTONIC
        boolean_t collect_instrs_cycles           = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
-#else
-       (void)trace_flags;
 #endif /* MONOTONIC */
 
        uint64_t task_uniqueid = get_task_uniqueid(task);
@@ -1207,11 +1264,7 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t
        cur_tsnap->tds_user_time_in_terminated_threads = task->total_user_time;
        cur_tsnap->tds_system_time_in_terminated_threads = task->total_system_time;
 
-#if CONFIG_EMBEDDED
        cur_tsnap->tds_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
-#else
-       cur_tsnap->tds_task_size = have_pmap ? (pmap_resident_count(task->map->pmap) * PAGE_SIZE) : 0;
-#endif
 
        cur_tsnap->tds_max_resident_size = get_task_resident_max(task);
        cur_tsnap->tds_suspend_count = task->suspend_count;
@@ -1221,8 +1274,16 @@ kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint32_t
        cur_tsnap->tds_was_throttled     = (uint32_t)proc_was_throttled_from_task(task);
        cur_tsnap->tds_did_throttle      = (uint32_t)proc_did_throttle_from_task(task);
        cur_tsnap->tds_latency_qos       = (task-> effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED)
-                                        ? LATENCY_QOS_TIER_UNSPECIFIED
-                                        : ((0xFF << 16) | task-> effective_policy.tep_latency_qos);
+               ? LATENCY_QOS_TIER_UNSPECIFIED
+               : ((0xFF << 16) | task-> effective_policy.tep_latency_qos);
+
+#if __arm__ || __arm64__
+       if (collect_asid && have_pmap) {
+               uint32_t asid = task->map->pmap->asid;
+               kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
+               stackshot_memcpy((void*)out_addr, &asid, sizeof(asid));
+       }
+#endif
 
 #if MONOTONIC
        if (collect_instrs_cycles) {
@@ -1275,7 +1336,7 @@ kcdata_record_thread_snapshot(
        boolean_t active_kthreads_only_p  = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
        boolean_t trace_fp_p              = false;
        boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
-       boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_TAILSPIN) && !(trace_flags & STACKSHOT_NO_IO_STATS);
+       boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
 #if MONOTONIC
        boolean_t collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
 #endif /* MONOTONIC */
@@ -1287,7 +1348,7 @@ kcdata_record_thread_snapshot(
        struct thread_snapshot_v4 * cur_thread_snap = NULL;
        char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE];
        uint64_t tval    = 0;
-       boolean_t task64 = task_has_64BitAddr(task);
+       const boolean_t is_64bit_data = task_has_64Bit_data(task);
 
        kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v4), &out_addr));
        cur_thread_snap = (struct thread_snapshot_v4 *)out_addr;
@@ -1365,7 +1426,8 @@ kcdata_record_thread_snapshot(
        cur_thread_snap->ths_sched_priority = thread->sched_pri;
        cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos;
        cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos;
-       cur_thread_snap->ths_rqos_override = thread->requested_policy.thrp_qos_override;
+       cur_thread_snap->ths_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
+                                                                                        thread->requested_policy.thrp_qos_workq_override);
        cur_thread_snap->ths_io_tier = proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
        cur_thread_snap->ths_thread_t = VM_KERNEL_UNSLIDE_OR_PERM(thread);
 
@@ -1382,30 +1444,47 @@ kcdata_record_thread_snapshot(
                stackshot_memcpy((void *)out_addr, (void *)cur_thread_name, sizeof(cur_thread_name));
        }
 
-       /* record system and user cpu times */
-       time_value_t user_time;
-       time_value_t system_time;
-       thread_read_times(thread, &user_time, &system_time);
-       kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times), &out_addr));
-       struct stackshot_cpu_times * stackshot_cpu_times = (struct stackshot_cpu_times *)out_addr;
-       stackshot_cpu_times->user_usec                   = ((uint64_t)user_time.seconds) * USEC_PER_SEC + user_time.microseconds;
-       stackshot_cpu_times->system_usec                 = ((uint64_t)system_time.seconds) * USEC_PER_SEC + system_time.microseconds;
+       /* record system, user, and runnable times */
+       time_value_t user_time, system_time, runnable_time;
+       thread_read_times(thread, &user_time, &system_time, &runnable_time);
+       kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times_v2), &out_addr));
+       struct stackshot_cpu_times_v2 *stackshot_cpu_times = (struct stackshot_cpu_times_v2 *)out_addr;
+       *stackshot_cpu_times = (struct stackshot_cpu_times_v2){
+               .user_usec = (uint64_t)user_time.seconds * USEC_PER_SEC + user_time.microseconds,
+               .system_usec = (uint64_t)system_time.seconds * USEC_PER_SEC + system_time.microseconds,
+               .runnable_usec = (uint64_t)runnable_time.seconds * USEC_PER_SEC + runnable_time.microseconds,
+       };
 
        /* Trace user stack, if any */
        if (!active_kthreads_only_p && task->active && thread->task->map != kernel_map) {
                uint32_t thread_snapshot_flags = 0;
-               /* 64-bit task? */
-               if (task64) {
+
+               /* Uses 64-bit machine state? */
+               if (is_64bit_data) {
+                       uint64_t sp = 0;
                        out_addr    = (mach_vm_address_t)kcd_end_address(kcd);
                        saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE,
-                                                            trace_fp_p, &thread_snapshot_flags);
+                                                            trace_fp_p, &thread_snapshot_flags, &sp);
                        if (saved_count > 0) {
                                int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t);
                                kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_USER_STACKFRAME64
-                                                                                                  : STACKSHOT_KCTYPE_USER_STACKLR64,
+                                                                                                                                  : STACKSHOT_KCTYPE_USER_STACKLR64,
                                                                                   frame_size, saved_count / frame_size, &out_addr));
                                cur_thread_snap->ths_ss_flags |= kUser64_p;
                        }
+#if __x86_64__
+                       if (sp) {
+                               // I'm using 8 here and not sizeof(stack_contents) because this
+                               // code would not work if you just made stack_contents bigger.
+                               vm_offset_t kern_virt_addr = machine_trace_thread_get_kva(sp, thread->task->map, &thread_snapshot_flags);
+                               if (kern_virt_addr && (kern_virt_addr % 8) == 0) {
+                                       kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_USER_STACKTOP, sizeof(struct stack_snapshot_stacktop), &out_addr));
+                                       struct stack_snapshot_stacktop *stacktop = (struct stack_snapshot_stacktop *)out_addr;
+                                       stacktop->sp = sp;
+                                       memcpy(stacktop->stack_contents, (void*) kern_virt_addr, 8);
+                               }
+                       }
+#endif
                } else {
                        out_addr    = (mach_vm_address_t)kcd_end_address(kcd);
                        saved_count = machine_trace_thread(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, TRUE, trace_fp_p,
@@ -1413,7 +1492,7 @@ kcdata_record_thread_snapshot(
                        if (saved_count > 0) {
                                int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame32) : sizeof(uint32_t);
                                kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_USER_STACKFRAME
-                                                                                                  : STACKSHOT_KCTYPE_USER_STACKLR,
+                                                                                                                                  : STACKSHOT_KCTYPE_USER_STACKLR,
                                                                                   frame_size, saved_count / frame_size, &out_addr));
                        }
                }
@@ -1431,12 +1510,12 @@ kcdata_record_thread_snapshot(
 #if defined(__LP64__)
                out_addr    = (mach_vm_address_t)kcd_end_address(kcd);
                saved_count = machine_trace_thread64(thread, (char *)out_addr, (char *)kcd_max_address(kcd), MAX_FRAMES, FALSE, trace_fp_p,
-                                                    &thread_snapshot_flags);
+                                                    &thread_snapshot_flags, NULL);
                if (saved_count > 0) {
                        int frame_size = trace_fp_p ? sizeof(struct stack_snapshot_frame64) : sizeof(uint64_t);
                        cur_thread_snap->ths_ss_flags |= kKernel64_p;
                        kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, trace_fp_p ? STACKSHOT_KCTYPE_KERN_STACKFRAME64
-                                                                                          : STACKSHOT_KCTYPE_KERN_STACKLR64,
+                                                                                                                          : STACKSHOT_KCTYPE_KERN_STACKLR64,
                                                                           frame_size, saved_count / frame_size, &out_addr));
                }
 #else
@@ -1506,7 +1585,8 @@ kcdata_record_thread_delta_snapshot(struct thread_delta_snapshot_v3 * cur_thread
        cur_thread_snap->tds_sched_priority          = thread->sched_pri;
        cur_thread_snap->tds_eqos                    = thread->effective_policy.thep_qos;
        cur_thread_snap->tds_rqos                    = thread->requested_policy.thrp_qos;
-       cur_thread_snap->tds_rqos_override           = thread->requested_policy.thrp_qos_override;
+       cur_thread_snap->tds_rqos_override           = MAX(thread->requested_policy.thrp_qos_override,
+                                                                                                          thread->requested_policy.thrp_qos_workq_override);
        cur_thread_snap->tds_io_tier                 = proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
 
        static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t));
@@ -1529,45 +1609,15 @@ struct saved_uniqueids {
        unsigned count;
 };
 
-static kern_return_t
-flush_nonrunnable_tasks(struct saved_uniqueids * ids)
-{
-       if (ids->count == 0)
-               return KERN_SUCCESS;
-       mach_vm_address_t out_addr = 0;
-       kern_return_t ret = kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TASKS, sizeof(uint64_t),
-                                                            ids->count, &out_addr);
-       if (ret != KERN_SUCCESS) {
-               return ret;
-       }
-       stackshot_memcpy((void *)out_addr, ids->ids, sizeof(uint64_t) * ids->count);
-       ids->count = 0;
-       return ret;
-}
-
-static kern_return_t
-handle_nonrunnable_task(struct saved_uniqueids * ids, uint64_t pid)
-{
-       kern_return_t ret    = KERN_SUCCESS;
-       ids->ids[ids->count] = pid;
-       ids->count++;
-       assert(ids->count <= UNIQUEIDSPERFLUSH);
-       if (ids->count == UNIQUEIDSPERFLUSH)
-               ret = flush_nonrunnable_tasks(ids);
-       return ret;
-}
-
 enum thread_classification {
        tc_full_snapshot,  /* take a full snapshot */
        tc_delta_snapshot, /* take a delta snapshot */
-       tc_nonrunnable,    /* only report id */
 };
 
 static enum thread_classification
 classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_flags)
 {
        boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
-       boolean_t minimize_nonrunnables   = ((trace_flags & STACKSHOT_TAILSPIN) != 0);
 
        processor_t last_processor = thread->last_processor;
 
@@ -1581,14 +1631,230 @@ classify_thread(thread_t thread, boolean_t * thread_on_core_p, uint32_t trace_fl
        if (!collect_delta_stackshot || thread_on_core || (thread->last_run_time > stack_snapshot_delta_since_timestamp)) {
                return tc_full_snapshot;
        } else {
-               if (minimize_nonrunnables && !(thread->state & TH_RUN)) {
-                       return tc_nonrunnable;
+               return tc_delta_snapshot;
+       }
+}
+
+struct stackshot_context
+{
+       int pid;
+       uint32_t trace_flags;
+};
+
+static kern_return_t
+kdp_stackshot_record_task(struct stackshot_context *ctx, task_t task)
+{
+       boolean_t active_kthreads_only_p  = ((ctx->trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
+       boolean_t save_donating_pids_p    = ((ctx->trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0);
+       boolean_t collect_delta_stackshot = ((ctx->trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
+       boolean_t save_owner_info         = ((ctx->trace_flags & STACKSHOT_THREAD_WAITINFO) != 0);
+
+
+       kern_return_t error = KERN_SUCCESS;
+       mach_vm_address_t out_addr = 0;
+       int saved_count = 0;
+
+       int task_pid                   = 0;
+       uint64_t task_uniqueid         = 0;
+       int num_delta_thread_snapshots = 0;
+       int num_nonrunnable_threads    = 0;
+       int num_waitinfo_threads       = 0;
+
+       uint64_t task_start_abstime    = 0;
+       boolean_t task_delta_stackshot = FALSE;
+       boolean_t have_map = FALSE, have_pmap = FALSE;
+       boolean_t some_thread_ran = FALSE;
+       unaligned_u64 *task_snap_ss_flags = NULL;
+
+       if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) {
+               error = KERN_FAILURE;
+               goto error_exit;
+       }
+
+       have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map)));
+       have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
+
+       task_pid = pid_from_task(task);
+       task_uniqueid = get_task_uniqueid(task);
+
+       if (!task->active || task_is_a_corpse(task)) {
+               /*
+                * Not interested in terminated tasks without threads, and
+                * at the moment, stackshot can't handle a task without a name.
+                */
+               if (queue_empty(&task->threads) || task_pid == -1) {
+                       return KERN_SUCCESS;
+               }
+       }
+
+       if (collect_delta_stackshot) {
+               proc_starttime_kdp(task->bsd_info, NULL, NULL, &task_start_abstime);
+       }
+
+       /* Trace everything, unless a process was specified */
+       if ((ctx->pid == -1) || (ctx->pid == task_pid)) {
+
+               /* add task snapshot marker */
+               kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
+                                                                                                         STACKSHOT_KCCONTAINER_TASK, task_uniqueid));
+
+               if (!collect_delta_stackshot || (task_start_abstime == 0) ||
+                       (task_start_abstime > stack_snapshot_delta_since_timestamp)) {
+                       kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, &task_snap_ss_flags));
                } else {
-                       return tc_delta_snapshot;
+                       task_delta_stackshot = TRUE;
+                       kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, &task_snap_ss_flags));
                }
+
+               /* Iterate over task threads */
+               thread_t thread = THREAD_NULL;
+               queue_iterate(&task->threads, thread, thread_t, task_threads)
+               {
+                       uint64_t thread_uniqueid;
+
+                       if ((thread == NULL) || !ml_validate_nofault((vm_offset_t)thread, sizeof(struct thread))) {
+                               error = KERN_FAILURE;
+                               goto error_exit;
+                       }
+
+                       if (active_kthreads_only_p && thread->kernel_stack == 0)
+                               continue;
+
+                       thread_uniqueid = thread_tid(thread);
+
+                       boolean_t thread_on_core;
+                       enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, ctx->trace_flags);
+
+                       switch (thread_classification) {
+                       case tc_full_snapshot:
+                               /* add thread marker */
+                               kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
+                                                                                                                         STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
+                               kcd_exit_on_error(
+                                       kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, ctx->trace_flags, have_pmap, thread_on_core));
+
+                               /* mark end of thread snapshot data */
+                               kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
+                                                                                                                         STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
+
+                               some_thread_ran = TRUE;
+                               break;
+
+                       case tc_delta_snapshot:
+                               num_delta_thread_snapshots++;
+                               break;
+                       }
+
+                       /* Report owner information regardless of a thread's classification:
+                        * whether it changed since the last delta, is part of a full snapshot,
+                        * or is nonrunnable. */
+                       if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread))
+                               num_waitinfo_threads++;
+               }
+
+               struct thread_delta_snapshot_v3 * delta_snapshots = NULL;
+               int current_delta_snapshot_index                  = 0;
+
+               if (num_delta_thread_snapshots > 0) {
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT,
+                                                                                                                          sizeof(struct thread_delta_snapshot_v3),
+                                                                                                                          num_delta_thread_snapshots, &out_addr));
+                       delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr;
+               }
+
+               uint64_t * nonrunnable_tids   = NULL;
+
+               if (num_nonrunnable_threads > 0) {
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TIDS,
+                                                                                                                          sizeof(uint64_t), num_nonrunnable_threads, &out_addr));
+                       nonrunnable_tids = (uint64_t *)out_addr;
+               }
+
+               thread_waitinfo_t *thread_waitinfo = NULL;
+               int current_waitinfo_index         = 0;
+
+               if (num_waitinfo_threads > 0) {
+                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO,
+                                                                                                                          sizeof(thread_waitinfo_t), num_waitinfo_threads, &out_addr));
+                       thread_waitinfo = (thread_waitinfo_t *)out_addr;
+               }
+
+               if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) {
+                       queue_iterate(&task->threads, thread, thread_t, task_threads)
+                       {
+                               if (active_kthreads_only_p && thread->kernel_stack == 0)
+                                       continue;
+
+                               /* If we want owner info, we should capture it regardless of its classification */
+                               if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) {
+                                       stackshot_thread_wait_owner_info(
+                                               thread,
+                                               &thread_waitinfo[current_waitinfo_index++]);
+                               }
+
+                               boolean_t thread_on_core;
+                               enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, ctx->trace_flags);
+
+                               switch (thread_classification) {
+                               case tc_full_snapshot:
+                                       /* full thread snapshot captured above */
+                                       continue;
+
+                               case tc_delta_snapshot:
+                                       kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++],
+                                                                                                                                                 thread, thread_on_core));
+                                       break;
+                               }
+                       }
+
+#if DEBUG || DEVELOPMENT
+                       if (current_delta_snapshot_index != num_delta_thread_snapshots) {
+                               panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
+                                         num_delta_thread_snapshots, current_delta_snapshot_index);
+                       }
+                       if (current_waitinfo_index != num_waitinfo_threads) {
+                               panic("thread wait info count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
+                                         num_waitinfo_threads, current_waitinfo_index);
+                       }
+#endif
+               }
+
+#if IMPORTANCE_INHERITANCE
+               if (save_donating_pids_p) {
+                       kcd_exit_on_error(
+                               ((((mach_vm_address_t)kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) <
+                                 (mach_vm_address_t)kcd_max_address(stackshot_kcdata_p))
+                                ? KERN_SUCCESS
+                                : KERN_RESOURCE_SHORTAGE));
+                       saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS,
+                                                                                                       (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT);
+                       if (saved_count > 0)
+                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS,
+                                                                                                                                  sizeof(int32_t), saved_count, &out_addr));
+               }
+#endif
+
+               if (!collect_delta_stackshot || (num_delta_thread_snapshots != task->thread_count) || !task_delta_stackshot) {
+                       /*
+                        * Collect shared cache info and UUID info in these scenarios
+                        * 1) a full stackshot
+                        * 2) a delta stackshot where the task started after the previous full stackshot OR
+                        *    any thread from the task has run since the previous full stackshot
+                        */
+
+                       kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, task_snap_ss_flags));
+                       kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, ctx->trace_flags, have_pmap, task_snap_ss_flags));
+               }
+               /* mark end of task snapshot data */
+               kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK,
+                                                                                                         task_uniqueid));
        }
+
+error_exit:
+       return error;
 }
 
+
 static kern_return_t
 kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTraced)
 {
@@ -1597,21 +1863,14 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
        uint64_t abs_time = 0, abs_time_end = 0;
        uint64_t *abs_time_addr = NULL;
        uint64_t system_state_flags = 0;
-       int saved_count = 0;
        task_t task = TASK_NULL;
-       thread_t thread = THREAD_NULL;
        mach_timebase_info_data_t timebase = {0, 0};
        uint32_t length_to_copy = 0, tmp32 = 0;
-
        abs_time = mach_absolute_time();
 
        /* process the flags */
-       boolean_t active_kthreads_only_p  = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
-       boolean_t save_donating_pids_p    = ((trace_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0);
        boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
-       boolean_t minimize_nonrunnables   = ((trace_flags & STACKSHOT_TAILSPIN) != 0);
        boolean_t use_fault_path          = ((trace_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0);
-       boolean_t save_owner_info         = ((trace_flags & STACKSHOT_THREAD_WAITINFO) != 0);
        stack_enable_faulting = (trace_flags & (STACKSHOT_ENABLE_BT_FAULTING));
 
 #if CONFIG_EMBEDDED
@@ -1619,7 +1878,9 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
        trace_flags &= ~(STACKSHOT_SAVE_KEXT_LOADINFO);
 #endif
 
-       struct saved_uniqueids saved_uniqueids = {.count = 0};
+       struct stackshot_context ctx = {};
+       ctx.trace_flags = trace_flags;
+       ctx.pid = pid;
 
        if (use_fault_path) {
                fault_stats.sfs_pages_faulted_in = 0;
@@ -1682,13 +1943,33 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
        kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &out_addr));
        stackshot_memcpy((void *)out_addr, &stackshot_microsecs, sizeof(uint64_t));
 
-       /* reserve space of system level shared cache load info */
-       struct dyld_uuid_info_64_v2 * sys_shared_cache_loadinfo = NULL;
-       if (!collect_delta_stackshot) {
+       /* record system level shared cache load info (if available) */
+       if (!collect_delta_stackshot && init_task_shared_region &&
+                       ml_validate_nofault((vm_offset_t)init_task_shared_region, sizeof(struct vm_shared_region))) {
+               struct dyld_uuid_info_64_v2 *sys_shared_cache_info = NULL;
                kcd_exit_on_error(kcdata_get_memory_addr(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO,
                                                         sizeof(struct dyld_uuid_info_64_v2), &out_addr));
-               sys_shared_cache_loadinfo = (struct dyld_uuid_info_64_v2 *)out_addr;
-               bzero((void *)sys_shared_cache_loadinfo, sizeof(struct dyld_uuid_info_64_v2));
+               sys_shared_cache_info = (struct dyld_uuid_info_64_v2 *)out_addr;
+
+               stackshot_memcpy(sys_shared_cache_info->imageUUID, &init_task_shared_region->sr_uuid, sizeof(init_task_shared_region->sr_uuid));
+               sys_shared_cache_info->imageLoadAddress = init_task_shared_region->sr_slide_info.slide;
+               sys_shared_cache_info->imageSlidBaseAddress = init_task_shared_region->sr_slide_info.slide + init_task_shared_region->sr_base_address;
+
+               if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) {
+                       /*
+                        * Include a map of the system shared cache layout if it has been populated
+                        * (which is only when the system is using a custom shared cache).
+                        */
+                       if (init_task_shared_region->sr_images && ml_validate_nofault((vm_offset_t)init_task_shared_region->sr_images,
+                                              (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)))) {
+                               assert(init_task_shared_region->sr_images_count != 0);
+                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT,
+                                                                               sizeof(struct dyld_uuid_info_64),
+                                                                               init_task_shared_region->sr_images_count, &out_addr));
+                               stackshot_memcpy((void*)out_addr, init_task_shared_region->sr_images,
+                                               (init_task_shared_region->sr_images_count * sizeof(struct dyld_uuid_info_64)));
+                       }
+               }
        }
 
        /* Add requested information first */
@@ -1724,252 +2005,28 @@ kdp_stackshot_kcdata_format(int pid, uint32_t trace_flags, uint32_t * pBytesTrac
 
        trace_flags &= ~(STACKSHOT_THREAD_GROUP);
 
+
        /* Iterate over tasks */
-       queue_head_t *task_list = &tasks;
-       queue_iterate(task_list, task, task_t, tasks) {
-               int task_pid                   = 0;
-               uint64_t task_uniqueid         = 0;
-               int num_delta_thread_snapshots = 0;
-               int num_nonrunnable_threads    = 0;
-               int num_waitinfo_threads       = 0;
-
-               uint64_t task_start_abstime    = 0;
-               boolean_t task_delta_stackshot = FALSE;
-               boolean_t task64 = FALSE, have_map = FALSE, have_pmap = FALSE;
-               boolean_t some_thread_ran = FALSE;
-               unaligned_u64 *task_snap_ss_flags = NULL;
-
-               if ((task == NULL) || !ml_validate_nofault((vm_offset_t)task, sizeof(struct task))) {
-                       error = KERN_FAILURE;
+       queue_iterate(&tasks, task, task_t, tasks)
+       {
+               error = kdp_stackshot_record_task(&ctx, task);
+               if (error)
                        goto error_exit;
-               }
-
-               have_map = (task->map != NULL) && (ml_validate_nofault((vm_offset_t)(task->map), sizeof(struct _vm_map)));
-               have_pmap = have_map && (task->map->pmap != NULL) && (ml_validate_nofault((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
-
-               task_pid = pid_from_task(task);
-               task_uniqueid = get_task_uniqueid(task);
-               task64 = task_has_64BitAddr(task);
-
-               if (!task->active || task_is_a_corpse(task)) {
-                       /*
-                        * Not interested in terminated tasks without threads, and
-                        * at the moment, stackshot can't handle a task  without a name.
-                        */
-                       if (queue_empty(&task->threads) || task_pid == -1) {
-                               continue;
-                       }
-               }
-
-               if (collect_delta_stackshot) {
-                       proc_starttime_kdp(task->bsd_info, NULL, NULL, &task_start_abstime);
-               }
-
-               /* Trace everything, unless a process was specified */
-               if ((pid == -1) || (pid == task_pid)) {
-#if DEBUG || DEVELOPMENT
-                       /* we might want to call kcdata_undo_add_container_begin(), which is
-                        * only safe if we call it after kcdata_add_container_marker() but
-                        * before adding any other kcdata items.  In development kernels,
-                        * we'll remember where the buffer end was and confirm after calling
-                        * kcdata_undo_add_container_begin() that it's in exactly the same
-                        * place.*/
-                       mach_vm_address_t revert_addr = stackshot_kcdata_p->kcd_addr_end;
-#endif
-
-                       /* add task snapshot marker */
-                       kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
-                                                                     STACKSHOT_KCCONTAINER_TASK, task_uniqueid));
-
-                       if (!collect_delta_stackshot || (task_start_abstime == 0) ||
-                           (task_start_abstime > stack_snapshot_delta_since_timestamp)) {
-                               kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags));
-                       } else {
-                               task_delta_stackshot = TRUE;
-                               if (minimize_nonrunnables) {
-                                       // delay taking the task snapshot.  If there are no runnable threads we'll skip it.
-                               } else {
-                                       kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags));
-                               }
-                       }
-
-                       /* Iterate over task threads */
-                       queue_iterate(&task->threads, thread, thread_t, task_threads)
-                       {
-                               uint64_t thread_uniqueid;
-
-                               if ((thread == NULL) || !ml_validate_nofault((vm_offset_t)thread, sizeof(struct thread))) {
-                                       error = KERN_FAILURE;
-                                       goto error_exit;
-                               }
-
-                               if (active_kthreads_only_p && thread->kernel_stack == 0)
-                                       continue;
-
-                               thread_uniqueid = thread_tid(thread);
-
-                               boolean_t thread_on_core;
-                               enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags);
-
-                               switch (thread_classification) {
-                               case tc_full_snapshot:
-                                       /* add thread marker */
-                                       kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
-                                                                                     STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
-                                       kcd_exit_on_error(
-                                           kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, trace_flags, have_pmap, thread_on_core));
-
-                                       /* mark end of thread snapshot data */
-                                       kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
-                                                                                     STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
-
-                                       some_thread_ran = TRUE;
-                                       break;
-
-                               case tc_delta_snapshot:
-                                       num_delta_thread_snapshots++;
-                                       break;
-
-                               case tc_nonrunnable:
-                                       num_nonrunnable_threads++;
-                                       break;
-                               }
-
-                               /* We want to report owner information regardless of whether a thread
-                                * has changed since the last delta, whether it's a normal stackshot,
-                                * or whether it's nonrunnable */
-                               if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread))
-                                       num_waitinfo_threads++;
-                       }
-
-                       if (task_delta_stackshot && minimize_nonrunnables) {
-                               if (some_thread_ran || num_delta_thread_snapshots > 0) {
-                                       kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, trace_flags, have_pmap, &task_snap_ss_flags));
-                               } else {
-                                       kcd_exit_on_error(kcdata_undo_add_container_begin(stackshot_kcdata_p));
-
-#if DEBUG || DEVELOPMENT
-                                       mach_vm_address_t undo_addr = stackshot_kcdata_p->kcd_addr_end;
-                                       if (revert_addr != undo_addr) {
-                                               panic("tried to revert a container begin but we already moved past it. revert=%p undo=%p",
-                                                     (void *)revert_addr, (void *)undo_addr);
-                                       }
-#endif
-                                       kcd_exit_on_error(handle_nonrunnable_task(&saved_uniqueids, task_uniqueid));
-                                       continue;
-                               }
-                       }
-
-                       struct thread_delta_snapshot_v3 * delta_snapshots = NULL;
-                       int current_delta_snapshot_index                  = 0;
-
-                       if (num_delta_thread_snapshots > 0) {
-                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT,
-                                                                                  sizeof(struct thread_delta_snapshot_v3),
-                                                                                  num_delta_thread_snapshots, &out_addr));
-                               delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr;
-                       }
-
-                       uint64_t * nonrunnable_tids   = NULL;
-                       int current_nonrunnable_index = 0;
-
-                       if (num_nonrunnable_threads > 0) {
-                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_NONRUNNABLE_TIDS,
-                                                                                  sizeof(uint64_t), num_nonrunnable_threads, &out_addr));
-                               nonrunnable_tids = (uint64_t *)out_addr;
-                       }
-
-                       thread_waitinfo_t *thread_waitinfo = NULL;
-                       int current_waitinfo_index         = 0;
-
-                       if (num_waitinfo_threads > 0) {
-                               kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO,
-                                                                          sizeof(thread_waitinfo_t), num_waitinfo_threads, &out_addr));
-                               thread_waitinfo = (thread_waitinfo_t *)out_addr;
-                       }
-
-                       if (num_delta_thread_snapshots > 0 || num_nonrunnable_threads > 0 || num_waitinfo_threads > 0) {
-                               queue_iterate(&task->threads, thread, thread_t, task_threads)
-                               {
-                                       if (active_kthreads_only_p && thread->kernel_stack == 0)
-                                               continue;
-
-                                       /* If we want owner info, we should capture it regardless of its classification */
-                                       if (save_owner_info && stackshot_thread_has_valid_waitinfo(thread)) {
-                                               stackshot_thread_wait_owner_info(
-                                                               thread,
-                                                               &thread_waitinfo[current_waitinfo_index++]);
-                                       }
-
-                                       boolean_t thread_on_core;
-                                       enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, trace_flags);
-
-                                       switch (thread_classification) {
-                                       case tc_full_snapshot:
-                                               /* full thread snapshot captured above */
-                                               continue;
-
-                                       case tc_delta_snapshot:
-                                               kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++],
-                                                                                                     thread, thread_on_core));
-                                               break;
-
-                                       case tc_nonrunnable:
-                                               nonrunnable_tids[current_nonrunnable_index++] = thread_tid(thread);
-                                               continue;
-                                       }
-                               }
-
-#if DEBUG || DEVELOPMENT
-                               if (current_delta_snapshot_index != num_delta_thread_snapshots) {
-                                       panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
-                                             num_delta_thread_snapshots, current_delta_snapshot_index);
-                               }
-                               if (current_nonrunnable_index != num_nonrunnable_threads) {
-                                       panic("nonrunnable thread count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
-                                             num_nonrunnable_threads, current_nonrunnable_index);
-                               }
-                               if (current_waitinfo_index != num_waitinfo_threads) {
-                                       panic("thread wait info count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
-                                             num_waitinfo_threads, current_waitinfo_index);
-                               }
-#endif
-                       }
-
-#if IMPORTANCE_INHERITANCE
-                       if (save_donating_pids_p) {
-                               kcd_exit_on_error(
-                                   ((((mach_vm_address_t)kcd_end_address(stackshot_kcdata_p) + (TASK_IMP_WALK_LIMIT * sizeof(int32_t))) <
-                                     (mach_vm_address_t)kcd_max_address(stackshot_kcdata_p))
-                                        ? KERN_SUCCESS
-                                        : KERN_RESOURCE_SHORTAGE));
-                               saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS,
-                                                                       (void *)kcd_end_address(stackshot_kcdata_p), TASK_IMP_WALK_LIMIT);
-                               if (saved_count > 0)
-                                       kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS,
-                                                                                          sizeof(int32_t), saved_count, &out_addr));
-                       }
-#endif
-
-                       if (!collect_delta_stackshot || (num_delta_thread_snapshots != task->thread_count) || !task_delta_stackshot) {
-                               /*
-                                * Collect shared cache info and UUID info in these scenarios
-                                * 1) a full stackshot
-                                * 2) a delta stackshot where the task started after the previous full stackshot OR
-                                *    any thread from the task has run since the previous full stackshot
-                                */
-
-                               kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, sys_shared_cache_loadinfo, task_snap_ss_flags));
-                               kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, trace_flags, have_pmap, task_snap_ss_flags));
-                       }
-                       /* mark end of task snapshot data */
-                       kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, STACKSHOT_KCCONTAINER_TASK,
-                                                                     task_uniqueid));
-               }
        }
-
-       if (minimize_nonrunnables) {
-               flush_nonrunnable_tasks(&saved_uniqueids);
+       /*
+        * Iterate over the tasks in the terminated tasks list. We only inspect
+        * tasks that have a valid bsd_info pointer and for which P_LPEXIT is
+        * NOT yet set. We're only interested in tasks that still have threads
+        * (which could be involved in a deadlock, etc.); the last thread that
+        * tears itself down during exit sets P_LPEXIT in proc_exit().
+        */
+       queue_iterate(&terminated_tasks, task, task_t, tasks)
+       {
+               if (task->bsd_info && !proc_in_teardown(task->bsd_info)) {
+                       error = kdp_stackshot_record_task(&ctx, task);
+                       if (error)
+                               goto error_exit;
+               }
        }
 
        if (use_fault_path) {
@@ -2192,7 +2249,7 @@ boolean_t
 kdp_copyin_word(
        task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results)
 {
-       if (task_has_64BitAddr(task)) {
+       if (task_has_64Bit_data(task)) {
                return kdp_copyin(task->map, addr, result, sizeof(uint64_t), try_fault, kdp_fault_results);
        } else {
                uint32_t buf;
@@ -2453,6 +2510,9 @@ stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_t *waitinfo)
                case kThreadWaitWorkloopSyncWait:
                        kdp_workloop_sync_wait_find_owner(thread, thread->wait_event, waitinfo);
                        break;
+               case kThreadWaitOnProcess:
+                       kdp_wait4_find_process(thread, thread->wait_event, waitinfo);
+                       break;
                default:
                        waitinfo->owner = 0;
                        waitinfo->context = 0;
index 727712ec9977c8bf35a27804e4bbce2601087f58..8308df68d90a51fa86e3e1e90beb4ea9407e2779 100644 (file)
@@ -116,12 +116,25 @@ typedef   void (*thread_continue_t)(void *, wait_result_t);
  *    You must provide this value for any unbounded wait - otherwise you will
  *    pend user signals forever.
  *
+ * THREAD_WAIT_NOREPORT:
+ *    The scheduler has a callback (sched_call) that some subsystems use to
+ *    decide whether more threads should be thrown at a given problem, in an
+ *    effort to maintain a good level of concurrency.
+ *
+ *    When the wait will not be helped by adding more threads (e.g. lock
+ *    contention), passing this flag to assert_wait* (or any of its wrappers)
+ *    prevents the next wait/block from triggering thread creation.
+ *
+ *    This comes in two flavors: THREAD_WAIT_NOREPORT_KERNEL and
+ *    THREAD_WAIT_NOREPORT_USER, which suppress reporting of the wait for
+ *    kernel and user threads respectively.
+ *
  * Thread interrupt mask:
  *
- *   The current maximum interruptible state for the thread, as set by
- *   thread_interrupt_level(), will limit the conditions that will cause a wake.
- *   This is useful for code that can't be interrupted to set before calling code
- *   that doesn't know that.
+ *    The current maximum interruptible state for the thread, as set by
+ *    thread_interrupt_level(), will limit the conditions that will cause a wake.
+ *    This is useful for code that can't be interrupted to set before calling code
+ *    that doesn't know that.
  *
  * Thread termination vs safe abort:
  *
@@ -152,9 +165,12 @@ typedef    void (*thread_continue_t)(void *, wait_result_t);
  *    call will always either return or call the passed in continuation.
  */
 typedef int wait_interrupt_t;
-#define THREAD_UNINT                   0               /* not interruptible      */
-#define THREAD_INTERRUPTIBLE   1               /* may not be restartable */
-#define THREAD_ABORTSAFE               2               /* abortable safely       */
+#define THREAD_UNINT                    0x00000000  /* not interruptible      */
+#define THREAD_INTERRUPTIBLE            0x00000001  /* may not be restartable */
+#define THREAD_ABORTSAFE                0x00000002  /* abortable safely       */
+#define THREAD_WAIT_NOREPORT_KERNEL     0x80000000
+#define THREAD_WAIT_NOREPORT_USER       0x40000000
+#define THREAD_WAIT_NOREPORT            (THREAD_WAIT_NOREPORT_KERNEL | THREAD_WAIT_NOREPORT_USER)
 
 typedef int wait_timeout_urgency_t;
 #define TIMEOUT_URGENCY_SYS_NORMAL     0x00            /* use default leeway thresholds for system */
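Editor's note: the new NOREPORT bits OR into the existing interruptibility values (THREAD_UNINT is 0, so the combination below is effectively just the NOREPORT mask). A minimal sketch of an unreported, uninterruptible wait follows; it is illustrative only, not part of this change, and my_lock_event is an assumed event.

/* Sketch: wait on lock contention without waking the sched_call
 * concurrency machinery (hypothetical event, not from this diff). */
static void
my_lock_wait(event_t my_lock_event)
{
	wait_result_t wr;

	wr = assert_wait(my_lock_event, THREAD_UNINT | THREAD_WAIT_NOREPORT);
	if (wr == THREAD_WAITING) {
		/* Contention will not be helped by more threads; no report is made. */
		thread_block(THREAD_CONTINUE_NULL);
	}
}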
index 479d114e32824c1a73fd18ac35d1992bca3fea85..02ef41fba56cdaa084b6fce3608d98000d888f45 100644 (file)
@@ -156,6 +156,7 @@ kext_alloc(vm_offset_t *_addr, vm_size_t size, boolean_t fixed)
                        size, 
                        0,
                        flags,
+                      VM_MAP_KERNEL_FLAGS_NONE,
                        VM_KERN_MEMORY_KEXT,
                        MACH_PORT_NULL,
                        0,
index aa2db20a11d5422b8443e6842959cfdfd8273b7f..3af8971844f9eaef6fca9c337fb90b678f16a13a 100644 (file)
@@ -91,6 +91,8 @@ struct cpu_data;
 extern boolean_t kpc_register_cpu(struct cpu_data *cpu_data);
 extern void kpc_unregister_cpu(struct cpu_data *cpu_data);
 
+extern bool kpc_supported;
+
 /* bootstrap */
 extern void kpc_init(void);
 
@@ -155,6 +157,7 @@ extern void kpc_thread_destroy(thread_t thread);
 /* allocate a buffer big enough for all counters */
 extern uint64_t *kpc_counterbuf_alloc(void);
 extern void      kpc_counterbuf_free(uint64_t*);
+extern uint32_t  kpc_get_counterbuf_size(void);
 
 /* whether we're currently accounting into threads */
 extern int kpc_threads_counting;
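Editor's note: the buffer returned by kpc_counterbuf_alloc() is sized for all possible counters; the new kpc_get_counterbuf_size() accessor exposes that size (in bytes) to callers. A hedged usage sketch, not taken from this change:

/* Sketch: allocate a KPC counter buffer, query its size, and free it. */
static void
kpc_counterbuf_example(void)
{
	uint64_t *buf = kpc_counterbuf_alloc();
	if (buf != NULL) {
		uint32_t bufsize = kpc_get_counterbuf_size();  /* size in bytes */
		(void)bufsize;  /* read at most bufsize bytes of counters into buf */
		kpc_counterbuf_free(buf);
	}
}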
index 96455de011ff9d3db890189646656b109851d5ec..53f382ec4079e889578e2f242515b0ce8acfff62 100644 (file)
@@ -68,8 +68,8 @@ static bool kpc_calling_pm = false;
 #endif /* MACH_ASSERT */
 
 boolean_t kpc_context_switch_active = FALSE;
+bool kpc_supported = true;
 
-void kpc_common_init(void);
 void
 kpc_common_init(void)
 {
@@ -503,13 +503,19 @@ kpc_set_config(uint32_t classes, kpc_config_t *configv)
        return ret;
 }
 
+uint32_t
+kpc_get_counterbuf_size(void)
+{
+       return COUNTERBUF_SIZE;
+}
+
 /* allocate a buffer large enough for all possible counters */
 uint64_t *
 kpc_counterbuf_alloc(void)
 {
        uint64_t *buf = NULL;
 
-       buf = kalloc(COUNTERBUF_SIZE);
+       buf = kalloc_tag(COUNTERBUF_SIZE, VM_KERN_MEMORY_DIAG);
        if (buf) {
                bzero(buf, COUNTERBUF_SIZE);
        }
@@ -529,16 +535,19 @@ void
 kpc_sample_kperf(uint32_t actionid)
 {
        struct kperf_sample sbuf;
-       struct kperf_context ctx;
 
        BUF_DATA(PERF_KPC_HNDLR | DBG_FUNC_START);
 
-       ctx.cur_pid = 0;
-       ctx.cur_thread = current_thread();
-       ctx.cur_pid = task_pid(current_task());
+       thread_t thread = current_thread();
+       task_t task = get_threadtask(thread);
 
-       ctx.trigger_type = TRIGGER_TYPE_PMI;
-       ctx.trigger_id = 0;
+       struct kperf_context ctx = {
+               .cur_thread = thread,
+               .cur_task = task,
+               .cur_pid = task_pid(task),
+               .trigger_type = TRIGGER_TYPE_PMI,
+               .trigger_id = 0,
+       };
 
        int r = kperf_sample(&sbuf, &ctx, actionid, SAMPLE_FLAG_PEND_USER);
 
index ec35a3a52351b35d29d703bad016b6c94ab89288..001cad83b4e3ea2e95fc492a654b6d6ecaa9e564 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -46,6 +46,8 @@
 #include <mach/mach_types.h>
 #include <os/overflow.h>
 
+#include <vm/pmap.h>
+
 /*
  * Ledger entry flags. Bits in second nibble (masked by 0xF0) are used for
  * ledger actions (LEDGER_ACTION_BLOCK, etc).
@@ -113,6 +115,7 @@ struct ledger_template {
        volatile uint32_t       lt_inuse;
        lck_mtx_t               lt_lock;
        zone_t                  lt_zone;
+       bool                    lt_initialized;
        struct entry_template   *lt_entries;
 };
 
@@ -130,47 +133,6 @@ struct ledger_template {
        splx(s);                                                \
 }
 
-/*
- * Use NTOCKS "tocks" to track the rolling maximum balance of a ledger entry.
- */
-#define        NTOCKS 1
-/*
- * The explicit alignment is to ensure that atomic operations don't panic
- * on ARM.
- */
-struct ledger_entry {
-        volatile uint32_t               le_flags;
-        ledger_amount_t                 le_limit;
-        ledger_amount_t                 le_warn_level;
-        volatile ledger_amount_t        le_credit __attribute__((aligned(8)));
-        volatile ledger_amount_t        le_debit  __attribute__((aligned(8)));
-       union {
-               struct {
-                       /*
-                        * XXX - the following two fields can go away if we move all of
-                        * the refill logic into process policy
-                        */
-                       uint64_t        le_refill_period;
-                       uint64_t        le_last_refill;
-               } le_refill;
-               struct _le_maxtracking {
-                       struct _le_peak {
-                               uint32_t        le_max;  /* Lower 32-bits of observed max balance */
-                               uint32_t        le_time; /* time when this peak was observed */
-                       } le_peaks[NTOCKS];
-                       ledger_amount_t    le_lifetime_max; /* greatest peak ever observed */
-               } le_maxtracking;
-       } _le;
-} __attribute__((aligned(8)));
-
-struct ledger {
-       uint64_t                l_id;
-       int32_t                 l_refs;
-       int32_t                 l_size;
-       struct ledger_template  *l_template;
-       struct ledger_entry     l_entries[0] __attribute__((aligned(8)));
-};
-
 static int ledger_cnt = 0;
 /* ledger ast helper functions */
 static uint32_t ledger_check_needblock(ledger_t l, uint64_t now);
@@ -366,6 +328,22 @@ ledger_template_complete(ledger_template_t template)
        template->lt_zone = zinit(ledger_size, CONFIG_TASK_MAX * ledger_size,
                               ledger_size,
                               template->lt_name);
+       template->lt_initialized = true;
+}
+
+/*
+ * Like ledger_template_complete, except we'll ask
+ * the pmap layer to manage allocations for us.
+ * Meant for ledgers that should be owned by the
+ * pmap layer.
+ */
+void
+ledger_template_complete_secure_alloc(ledger_template_t template)
+{
+       size_t ledger_size;
+       ledger_size = sizeof(struct ledger) + (template->lt_cnt * sizeof(struct ledger_entry));
+       pmap_ledger_alloc_init(ledger_size);
+       template->lt_initialized = true;
 }
 
 /*
@@ -385,10 +363,14 @@ ledger_instantiate(ledger_template_t template, int entry_type)
        template_lock(template);
        template->lt_refs++;
        cnt = template->lt_cnt;
-       assert(template->lt_zone);
        template_unlock(template);
 
-       ledger = (ledger_t)zalloc(template->lt_zone);
+       if (template->lt_zone) {
+               ledger = (ledger_t)zalloc(template->lt_zone);
+       } else {
+               ledger = pmap_ledger_alloc();
+       }
+
        if (ledger == NULL) {
                ledger_template_dereference(template);
                return LEDGER_NULL;
@@ -477,7 +459,11 @@ ledger_dereference(ledger_t ledger)
 
        /* Just released the last reference.  Free it. */
        if (v == 1) {
-               zfree(ledger->l_template->lt_zone, ledger);
+               if (ledger->l_template->lt_zone) {
+                       zfree(ledger->l_template->lt_zone, ledger);
+               } else {
+                       pmap_ledger_free(ledger);
+               }
        }
 
        return (KERN_SUCCESS);
@@ -657,74 +643,22 @@ ledger_refill(uint64_t now, ledger_t ledger, int entry)
                ledger_limit_entry_wakeup(le);
 }
 
-/*
- * In tenths of a second, the length of one lookback period (a "tock") for
- * ledger rolling maximum calculations. The effective lookback window will be this times
- * NTOCKS.
- *
- * Use a tock length of 2.5 seconds to get a total lookback period of 5 seconds.
- *
- * XXX Could make this caller-definable, at the point that rolling max tracking
- * is enabled for the entry.
- */
-#define        TOCKLEN 25
-
-/*
- * How many sched_tick's are there in one tock (one of our lookback periods)?
- *
- *  X sched_ticks        2.5 sec      N sched_ticks
- * ---------------   =  ----------  * -------------
- *      tock               tock            sec
- *
- * where N sched_ticks/sec is calculated via 1 << SCHED_TICK_SHIFT (see sched_prim.h)
- *
- * This should give us 20 sched_tick's in one 2.5 second-long tock.
- */
-#define SCHED_TICKS_PER_TOCK ((TOCKLEN * (1 << SCHED_TICK_SHIFT)) / 10)
-
-/*
- * Rolling max timestamps use their own unit (let's call this a "tock"). One tock is the
- * length of one lookback period that we use for our rolling max calculation.
- *
- * Calculate the current time in tocks from sched_tick (which runs at a some
- * fixed rate).
- */
-#define        CURRENT_TOCKSTAMP() (sched_tick / SCHED_TICKS_PER_TOCK)
-
-/*
- * Does the given tockstamp fall in either the current or the previous tocks?
- */
-#define TOCKSTAMP_IS_STALE(now, tock) ((((now) - (tock)) < NTOCKS) ? FALSE : TRUE)
-
 void
 ledger_entry_check_new_balance(thread_t thread, ledger_t ledger,
                                int entry, struct ledger_entry *le)
 {
-       ledger_amount_t credit, debit;
-
        if (le->le_flags & LF_TRACKING_MAX) {
                ledger_amount_t balance = le->le_credit - le->le_debit;
-               uint32_t now = CURRENT_TOCKSTAMP();
-               struct _le_peak *p = &le->_le.le_maxtracking.le_peaks[now % NTOCKS];
 
-               if (!TOCKSTAMP_IS_STALE(now, p->le_time) || (balance > p->le_max)) {
-                       /*
-                        * The current balance is greater than the previously
-                        * observed peak for the current time block, *or* we
-                        * haven't yet recorded a peak for the current time block --
-                        * so this is our new peak.
-                        *
-                        * (We only track the lower 32-bits of a balance for rolling
-                        * max purposes.)
-                        */
-                       p->le_max = (uint32_t)balance;
-                       p->le_time = now;
+               if (balance > le->_le._le_max.le_lifetime_max){
+                       le->_le._le_max.le_lifetime_max = balance;
                }
 
-               struct _le_maxtracking *m = &le->_le.le_maxtracking;
-               if(balance > m->le_lifetime_max){
-                       m->le_lifetime_max = balance;
+#if CONFIG_LEDGER_INTERVAL_MAX
+               if (balance > le->_le._le_max.le_interval_max) {
+                       le->_le._le_max.le_interval_max = balance;
                }
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
        }
 
        /* Check to see whether we're due a refill */
@@ -799,16 +733,13 @@ ledger_entry_check_new_balance(thread_t thread, ledger_t ledger,
                }
        }
 
-       credit = le->le_credit;
-       debit = le->le_debit;
        if ((le->le_flags & LF_PANIC_ON_NEGATIVE) &&
-           ((credit < debit) ||
-            (le->le_credit < le->le_debit))) {
-               panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld/%lld debit:%lld/%lld balance:%lld/%lld\n",
+           (le->le_credit < le->le_debit)) {
+               panic("ledger_entry_check_new_balance(%p,%d): negative ledger %p credit:%lld debit:%lld balance:%lld\n",
                      ledger, entry, le,
-                     credit, le->le_credit,
-                     debit, le->le_debit,
-                     credit - debit, le->le_credit - le->le_debit);
+                     le->le_credit,
+                     le->le_debit,
+                     le->le_credit - le->le_debit);
        }
 }
 
@@ -842,7 +773,9 @@ ledger_credit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_
        new = old + amount;
        lprintf(("%p Credit %lld->%lld\n", thread, old, new));
 
-       ledger_entry_check_new_balance(thread, ledger, entry, le);
+       if (thread) {
+               ledger_entry_check_new_balance(thread, ledger, entry, le);
+       }
 
        return (KERN_SUCCESS);
 }
@@ -856,6 +789,15 @@ ledger_credit(ledger_t ledger, int entry, ledger_amount_t amount)
        return ledger_credit_thread(current_thread(), ledger, entry, amount);
 }
 
+/*
+ * Add value to an entry in a ledger; do not check balance after update.
+ */
+kern_return_t
+ledger_credit_nocheck(ledger_t ledger, int entry, ledger_amount_t amount)
+{
+       return ledger_credit_thread(NULL, ledger, entry, amount);
+}
+
 /* Add all of one ledger's values into another.
  * They must have been created from the same template.
  * This is not done atomically. Another thread (if not otherwise synchronized)
@@ -1004,41 +946,29 @@ ledger_set_limit(ledger_t ledger, int entry, ledger_amount_t limit,
        return (KERN_SUCCESS);
 }
 
+#if CONFIG_LEDGER_INTERVAL_MAX
 kern_return_t
-ledger_get_recent_max(ledger_t ledger, int entry,
-       ledger_amount_t *max_observed_balance)
+ledger_get_interval_max(ledger_t ledger, int entry,
+        ledger_amount_t *max_interval_balance, int reset)
 {
-       struct ledger_entry     *le;
-       uint32_t                now = CURRENT_TOCKSTAMP();
-       int                     i;
-
+       struct ledger_entry *le;
        le = &ledger->l_entries[entry];
 
        if (!ENTRY_VALID(ledger, entry) || !(le->le_flags & LF_TRACKING_MAX)) {
                return (KERN_INVALID_VALUE);
        }
 
-       /*
-        * Start with the current balance; if neither of the recorded peaks are
-        * within recent history, we use this.
-        */
-       *max_observed_balance = le->le_credit - le->le_debit;
-
-       for (i = 0; i < NTOCKS; i++) {
-               if (!TOCKSTAMP_IS_STALE(now, le->_le.le_maxtracking.le_peaks[i].le_time) &&
-                   (le->_le.le_maxtracking.le_peaks[i].le_max > *max_observed_balance)) {
-                       /*
-                        * The peak for this time block isn't stale, and it
-                        * is greater than the current balance -- so use it.
-                        */
-                   *max_observed_balance = le->_le.le_maxtracking.le_peaks[i].le_max;
-               }
-       }
+       *max_interval_balance = le->_le._le_max.le_interval_max;
+       lprintf(("ledger_get_interval_max: %lld%s\n", *max_interval_balance,
+               (reset) ? " --> 0" : ""));
 
-       lprintf(("ledger_get_maximum: %lld\n", *max_observed_balance));
+       if (reset) {
+               le->_le._le_max.le_interval_max = 0;
+       }
 
        return (KERN_SUCCESS);
 }
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
 
 kern_return_t
 ledger_get_lifetime_max(ledger_t ledger, int entry,
@@ -1051,7 +981,7 @@ ledger_get_lifetime_max(ledger_t ledger, int entry,
                return (KERN_INVALID_VALUE);
        }
 
-       *max_lifetime_balance = le->_le.le_maxtracking.le_lifetime_max;
+       *max_lifetime_balance = le->_le._le_max.le_lifetime_max;
        lprintf(("ledger_get_lifetime_max: %lld\n", *max_lifetime_balance));
 
        return (KERN_SUCCESS);
@@ -1318,7 +1248,9 @@ ledger_debit_thread(thread_t thread, ledger_t ledger, int entry, ledger_amount_t
        }
        lprintf(("%p Debit %lld->%lld\n", thread, old, new));
 
-       ledger_entry_check_new_balance(thread, ledger, entry, le);
+       if (thread) {
+               ledger_entry_check_new_balance(thread, ledger, entry, le);
+       }
 
        return (KERN_SUCCESS);
 }
@@ -1329,6 +1261,12 @@ ledger_debit(ledger_t ledger, int entry, ledger_amount_t amount)
        return ledger_debit_thread(current_thread(), ledger, entry, amount);
 }
 
+kern_return_t
+ledger_debit_nocheck(ledger_t ledger, int entry, ledger_amount_t amount)
+{
+       return ledger_debit_thread(NULL, ledger, entry, amount);
+}
+
 void
 ledger_ast(thread_t thread)
 {
@@ -1523,7 +1461,7 @@ ledger_perform_blocking(ledger_t l)
                assert(!(le->le_flags & LF_TRACKING_MAX));
 
                /* Prepare to sleep until the resource is refilled */
-               ret = assert_wait_deadline(le, TRUE,
+               ret = assert_wait_deadline(le, THREAD_INTERRUPTIBLE,
                    le->_le.le_refill.le_last_refill + le->_le.le_refill.le_refill_period);
                if (ret != THREAD_WAITING)
                        return(KERN_SUCCESS);
@@ -1595,6 +1533,25 @@ ledger_disable_panic_on_negative(ledger_t ledger, int entry)
        return (KERN_SUCCESS);
 }
 
+kern_return_t
+ledger_get_panic_on_negative(ledger_t ledger, int entry, int *panic_on_negative)
+{
+       struct ledger_entry *le;
+
+       if (!ENTRY_VALID(ledger, entry))
+               return (KERN_INVALID_ARGUMENT);
+
+       le = &ledger->l_entries[entry];
+
+       if (le->le_flags & LF_PANIC_ON_NEGATIVE) {
+               *panic_on_negative = TRUE;
+       } else {
+               *panic_on_negative = FALSE;
+       }
+
+       return (KERN_SUCCESS);
+}
+
 kern_return_t
 ledger_get_balance(ledger_t ledger, int entry, ledger_amount_t *balance)
 {
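Editor's note: taken together, the ledger.c changes above replace the rolling "tock" maximum with lifetime/interval peaks, add pmap-owned ledger allocation, and add *_nocheck credit/debit entry points that skip the balance check by passing a NULL thread to ledger_credit_thread/ledger_debit_thread. A hedged sketch of the new call pattern follows, assuming a ledger l already instantiated from a completed template and an entry index idx obtained via ledger_key_lookup; it is not taken verbatim from the diff.

/* Sketch: exercise the new nocheck and peak-query entry points. */
static void
ledger_peak_example(ledger_t l, int idx)
{
	ledger_amount_t peak = 0;

	ledger_credit_nocheck(l, idx, 4096);   /* skips ledger_entry_check_new_balance */
	ledger_debit_nocheck(l, idx, 1024);

	ledger_get_lifetime_max(l, idx, &peak);        /* process-lifetime peak */
#if CONFIG_LEDGER_INTERVAL_MAX
	ledger_get_interval_max(l, idx, &peak, 1);     /* interval peak, then reset it */
#endif
	(void)peak;
}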
index d6b27ce11276b8320c24cf0a85c7e463bc4f85a0..78eb4f8484f005996836be1cb642d33281273ca7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2010 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2010-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -55,6 +55,50 @@ struct ledger_template_info {
        char            lti_units[LEDGER_NAME_MAX];
 };
 
+#ifdef MACH_KERNEL_PRIVATE
+/*
+ * These definitions are only here to allow pmap.c to determine the expected
+ * size of a ledger at build time.  Direct access to ledger fields or to
+ * ledger entries is prohibited.
+ */
+
+/*
+ * The explicit alignment is to ensure that atomic operations don't panic
+ * on ARM.
+ */
+struct ledger_entry {
+       volatile uint32_t        le_flags;
+       ledger_amount_t          le_limit;
+       ledger_amount_t          le_warn_level;
+       volatile ledger_amount_t le_credit __attribute__((aligned(8)));
+       volatile ledger_amount_t le_debit  __attribute__((aligned(8)));
+       union {
+               struct {
+                       /*
+                        * XXX - the following two fields can go away if we move all of
+                        * the refill logic into process policy
+                        */
+                       uint64_t le_refill_period;
+                       uint64_t le_last_refill;
+               } le_refill;
+               struct {
+                       ledger_amount_t le_lifetime_max; /* Process lifetime peak */
+#if CONFIG_LEDGER_INTERVAL_MAX
+                       ledger_amount_t le_interval_max; /* Interval peak XXX better name needed */
+#endif
+               } _le_max;
+       } _le;
+} __attribute__((aligned(8)));
+
+struct ledger {
+       uint64_t                l_id;
+       int32_t                 l_refs;
+       int32_t                 l_size;
+       struct ledger_template *l_template;
+       struct ledger_entry     l_entries[0] __attribute__((aligned(8)));
+};
+#endif /* MACH_KERNEL_PRIVATE */
+
 struct ledger_entry_info {
        int64_t         lei_balance;
        int64_t         lei_credit;
@@ -111,14 +155,17 @@ extern int ledger_key_lookup(ledger_template_t template, const char *key);
 #define        LEDGER_CREATE_INACTIVE_ENTRIES  1
 extern ledger_t ledger_instantiate(ledger_template_t template, int entry_type);
 extern void ledger_template_complete(ledger_template_t template);
+extern void ledger_template_complete_secure_alloc(ledger_template_t template);
 extern kern_return_t ledger_disable_callback(ledger_t ledger, int entry);
 extern kern_return_t ledger_enable_callback(ledger_t ledger, int entry);
 extern kern_return_t ledger_get_limit(ledger_t ledger, int entry,
        ledger_amount_t *limit);
 extern kern_return_t ledger_set_limit(ledger_t ledger, int entry,
        ledger_amount_t limit, uint8_t warn_level_percentage);
-extern kern_return_t ledger_get_recent_max(ledger_t ledger, int entry,
-       ledger_amount_t *max_observed_balance);
+#if CONFIG_LEDGER_INTERVAL_MAX
+extern kern_return_t ledger_get_interval_max(ledger_t ledger, int entry,
+       ledger_amount_t *max_interval_balance, int reset);
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
 extern kern_return_t ledger_get_lifetime_max(ledger_t ledger, int entry,
        ledger_amount_t *max_lifetime_balance);
 extern kern_return_t ledger_get_actions(ledger_t ledger, int entry, int *actions);
@@ -132,8 +179,12 @@ extern kern_return_t ledger_entry_setactive(ledger_t ledger, int entry);
 extern void ledger_check_new_balance(thread_t thread, ledger_t ledger, int entry);
 extern kern_return_t ledger_credit(ledger_t ledger, int entry,
        ledger_amount_t amount);
+extern kern_return_t ledger_credit_nocheck(ledger_t ledger, int entry,
+       ledger_amount_t amount);
 extern kern_return_t ledger_debit(ledger_t ledger, int entry,
        ledger_amount_t amount);
+extern kern_return_t ledger_debit_nocheck(ledger_t ledger, int entry,
+       ledger_amount_t amount);
 extern kern_return_t ledger_credit_thread(thread_t thread, ledger_t ledger,
                                           int entry, ledger_amount_t amount);
 extern kern_return_t ledger_debit_thread(thread_t thread, ledger_t ledger,
@@ -145,6 +196,7 @@ extern kern_return_t ledger_get_balance(ledger_t ledger, int entry,
        ledger_amount_t *balance);
 extern kern_return_t ledger_reset_callback_state(ledger_t ledger, int entry);
 extern kern_return_t ledger_disable_panic_on_negative(ledger_t ledger, int entry);
+extern kern_return_t ledger_get_panic_on_negative(ledger_t ledger, int entry, int *panic_on_negative);
 
 extern kern_return_t ledger_rollup(ledger_t to_ledger, ledger_t from_ledger);
 extern kern_return_t ledger_rollup_entry(ledger_t to_ledger, ledger_t from_ledger, int entry);
@@ -155,10 +207,6 @@ extern int ledger_reference_count(ledger_t ledger);
 extern kern_return_t ledger_reference(ledger_t ledger);
 extern kern_return_t ledger_dereference(ledger_t ledger);
 
-/* Per-pmap ledger operations */
-#define        pmap_ledger_debit(p, e, a) ledger_debit((p)->ledger, e, a)
-#define        pmap_ledger_credit(p, e, a) ledger_credit((p)->ledger, e, a)
-
 /* Support for ledger() syscall */
 #ifdef LEDGER_DEBUG
 extern int ledger_limit(task_t task, struct ledger_limit_args *args);
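Editor's note: the struct ledger / struct ledger_entry definitions are exposed under MACH_KERNEL_PRIVATE above solely so the pmap layer can compute the expected ledger size at build time. A hedged illustration of that computation, with a hypothetical entry count that is not from this diff:

/* Sketch: what a pmap-layer consumer of the exposed layout might compute. */
#define PMAP_LEDGER_ENTRY_COUNT  16   /* assumed number of template entries */
#define PMAP_LEDGER_EXPECTED_SIZE \
	(sizeof(struct ledger) + PMAP_LEDGER_ENTRY_COUNT * sizeof(struct ledger_entry))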
index 25641b8beb409ef593dec5205996e71db2ae7f05..04b5dd239fb8fa5e232cb902c9a74ad8fc663736 100644 (file)
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
 #include <kern/debug.h>
+#include <libkern/section_keywords.h>
 #include <machine/atomic.h>
 #include <machine/machine_cpu.h>
 #include <string.h>
 
-
 #include <sys/kdebug.h>
 
 #if    CONFIG_DTRACE
@@ -120,6 +120,8 @@ static unsigned int lck_grp_cnt;
 decl_lck_mtx_data(static,lck_grp_lock)
 static lck_mtx_ext_t lck_grp_lock_ext;
 
+SECURITY_READ_ONLY_LATE(boolean_t) spinlock_timeout_panic = TRUE;
+
 lck_grp_attr_t LockDefaultGroupAttr;
 lck_grp_t              LockCompatGroup;
 lck_attr_t             LockDefaultLckAttr;
@@ -132,6 +134,14 @@ uint64_t dtrace_spin_threshold = LOCK_PANIC_TIMEOUT / 1000000; // 500ns
 #endif
 #endif
 
+uintptr_t
+unslide_for_kdebug(void* object) {
+       if (__improbable(kdebug_enable))
+               return VM_KERNEL_UNSLIDE_OR_PERM(object);
+       else
+               return 0;
+}
+
 /*
  * Routine:    lck_mod_init
  */
@@ -535,20 +545,11 @@ hw_lock_lock_contended(hw_lock_t lock, uintptr_t data, uint64_t timeout, boolean
 }
 #endif // __SMP__
 
-/*
- *     Routine: hw_lock_lock
- *
- *     Acquire lock, spinning until it becomes available,
- *     return with preemption disabled.
- */
-void
-hw_lock_lock(hw_lock_t lock)
+static inline void
+hw_lock_lock_internal(hw_lock_t lock, thread_t thread)
 {
-       thread_t        thread;
        uintptr_t       state;
 
-       thread = current_thread();
-       disable_preemption_for_thread(thread);
        state = LCK_MTX_THREAD_TO_STATE(thread) | PLATFORM_LCK_ILOCK;
 #if    __SMP__
 
@@ -563,7 +564,7 @@ hw_lock_lock(hw_lock_t lock)
 #if    LOCK_PRETEST
 contended:
 #endif // LOCK_PRETEST
-       hw_lock_lock_contended(lock, state, 0, TRUE);
+       hw_lock_lock_contended(lock, state, 0, spinlock_timeout_panic);
 end:
 #else  // __SMP__
        if (lock->lock_data)
@@ -576,6 +577,34 @@ end:
        return;
 }
 
+/*
+ *     Routine: hw_lock_lock
+ *
+ *     Acquire lock, spinning until it becomes available,
+ *     return with preemption disabled.
+ */
+void
+hw_lock_lock(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       disable_preemption_for_thread(thread);
+       hw_lock_lock_internal(lock, thread);
+}
+
+/*
+ *     Routine: hw_lock_lock_nopreempt
+ *
+ *     Acquire lock, spinning until it becomes available.
+ */
+void
+hw_lock_lock_nopreempt(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       if (__improbable(!preemption_disabled_for_thread(thread)))
+               panic("Attempt to take no-preempt spinlock %p in preemptible context", lock);
+       hw_lock_lock_internal(lock, thread);
+}
+
 /*
  *     Routine: hw_lock_to
  *
@@ -628,18 +657,10 @@ end:
  *
  *     returns with preemption disabled on success.
  */
-unsigned int
-hw_lock_try(hw_lock_t lock)
+static inline unsigned int
+hw_lock_try_internal(hw_lock_t lock, thread_t thread)
 {
-       thread_t        thread = current_thread();
        int             success = 0;
-#if    LOCK_TRY_DISABLE_INT
-       long            intmask;
-
-       intmask = disable_interrupts();
-#else
-       disable_preemption_for_thread(thread);
-#endif // LOCK_TRY_DISABLE_INT
 
 #if    __SMP__
 #if    LOCK_PRETEST
@@ -655,20 +676,9 @@ hw_lock_try(hw_lock_t lock)
        }
 #endif // __SMP__
 
-#if    LOCK_TRY_DISABLE_INT
-       if (success)
-               disable_preemption_for_thread(thread);
-#if    LOCK_PRETEST
-failed:
-#endif // LOCK_PRETEST
-       restore_interrupts(intmask);
-#else
 #if    LOCK_PRETEST
 failed:
 #endif // LOCK_PRETEST
-       if (!success)
-               enable_preemption();
-#endif // LOCK_TRY_DISABLE_INT
 #if CONFIG_DTRACE
        if (success)
                LOCKSTAT_RECORD(LS_LCK_SPIN_LOCK_ACQUIRE, lock, 0);
@@ -676,13 +686,33 @@ failed:
        return success;
 }
 
+unsigned int
+hw_lock_try(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       disable_preemption_for_thread(thread);
+       unsigned int success = hw_lock_try_internal(lock, thread);
+       if (!success)
+               enable_preemption();
+       return success;
+}
+
+unsigned int
+hw_lock_try_nopreempt(hw_lock_t lock)
+{
+       thread_t thread = current_thread();
+       if (__improbable(!preemption_disabled_for_thread(thread)))
+               panic("Attempt to test no-preempt spinlock %p in preemptible context", lock);
+       return hw_lock_try_internal(lock, thread);
+}
+
 /*
  *     Routine: hw_lock_unlock
  *
  *     Unconditionally release lock, release preemption level.
  */
-void
-hw_lock_unlock(hw_lock_t lock)
+static inline void
+hw_lock_unlock_internal(hw_lock_t lock)
 {
        __c11_atomic_store((_Atomic uintptr_t *)&lock->lock_data, 0, memory_order_release_smp);
 #if __arm__ || __arm64__
@@ -692,9 +722,23 @@ hw_lock_unlock(hw_lock_t lock)
 #if    CONFIG_DTRACE
        LOCKSTAT_RECORD(LS_LCK_SPIN_UNLOCK_RELEASE, lock, 0);
 #endif /* CONFIG_DTRACE */
+}
+
+void
+hw_lock_unlock(hw_lock_t lock)
+{
+       hw_lock_unlock_internal(lock);
        enable_preemption();
 }
 
+void
+hw_lock_unlock_nopreempt(hw_lock_t lock)
+{
+       if (__improbable(!preemption_disabled_for_thread(current_thread())))
+               panic("Attempt to release no-preempt spinlock %p in preemptible context", lock);
+       hw_lock_unlock_internal(lock);
+}
+
 /*
  *     Routine hw_lock_held, doesn't change preemption state.
  *     N.B.  Racy, of course.
@@ -765,40 +809,6 @@ lck_spin_sleep_deadline(
        return res;
 }
 
-
-/*
- * Routine:    lck_mtx_clear_promoted
- *
- * Handle clearing of TH_SFLAG_PROMOTED,
- * adjusting thread priority as needed.
- *
- * Called with thread lock held
- */
-static void
-lck_mtx_clear_promoted (
-       thread_t                        thread,
-       __kdebug_only uintptr_t         trace_lck)
-{
-       thread->sched_flags &= ~TH_SFLAG_PROMOTED;
-
-       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
-               /* Thread still has a RW lock promotion */
-       } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                               thread->sched_pri, DEPRESSPRI, 0, trace_lck, 0);
-               set_sched_pri(thread, DEPRESSPRI);
-       } else {
-               if (thread->base_pri < thread->sched_pri) {
-                       KERNEL_DEBUG_CONSTANT(
-                               MACHDBG_CODE(DBG_MACH_SCHED,MACH_DEMOTE) | DBG_FUNC_NONE,
-                                       thread->sched_pri, thread->base_pri, 0, trace_lck, 0);
-               }
-               thread_recompute_sched_pri(thread, FALSE);
-       }
-}
-
-
 /*
  * Routine:    lck_mtx_sleep
  */
@@ -848,7 +858,7 @@ lck_mtx_sleep(
        if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
                if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                        /* sched_flags checked without lock, but will be rechecked while clearing */
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -903,7 +913,7 @@ lck_mtx_sleep_deadline(
        if (lck_sleep_action & LCK_SLEEP_PROMOTED_PRI) {
                if ((thread->rwlock_count-- == 1 /* field now 0 */) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
                        /* sched_flags checked without lock, but will be rechecked while clearing */
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -913,12 +923,58 @@ lck_mtx_sleep_deadline(
 }
 
 /*
- * Routine:    lck_mtx_lock_wait
+ * Lock Boosting Invariants:
+ *
+ * The lock owner is always promoted to the max priority of all its waiters.
+ * Max priority is capped at MAXPRI_PROMOTE.
+ *
+ * lck_mtx_pri being set implies that the lock owner is promoted to at least lck_mtx_pri
+ *      This prevents the thread from dropping in priority while holding a mutex
+ *      (note: Intel locks currently don't do this, to avoid thread lock churn)
+ *
+ * thread->promotions has a +1 for every mutex currently promoting the thread
+ * and a +1 for was_promoted_on_wakeup being set.
+ * TH_SFLAG_PROMOTED is set on a thread whenever it has any promotions
+ * from any mutex (i.e. thread->promotions != 0)
+ *
+ * was_promoted_on_wakeup is set on a thread which is woken up by a mutex when
+ * it raises the priority of the woken thread to match lck_mtx_pri.
+ * It can remain set across multiple iterations of wait, fail to acquire, re-wait, etc.
+ * was_promoted_on_wakeup being set always implies a +1 promotions count.
+ *
+ * The last waiter is not given a promotion when it wakes up or acquires the lock.
+ * When the last waiter is waking up, a new contender can always come in and
+ * steal the lock without having to wait for the last waiter to make forward progress.
+ *
+ * lck_mtx_waiters has a +1 for every waiter currently between wait and acquire
+ * This prevents us from asserting that every wakeup wakes up a thread.
+ * This also causes excess thread_wakeup calls in the unlock path.
+ * It can only be fooled into thinking there are more waiters than are
+ * actually blocked, not less.
+ * It does allow us to reduce the complexity of the lock state.
+ *
+ * This also means that a starved bg thread as the last waiter could end up
+ * keeping the lock in the contended state for a long period of time, which
+ * may keep lck_mtx_pri artificially high for a very long time even though
+ * it is not participating or blocking anyone else.
+ * Intel locks don't have this problem because they can go uncontended
+ * as soon as there are no blocked threads involved.
+ */
+
+/*
+ * Routine: lck_mtx_lock_wait
  *
  * Invoked in order to wait on contention.
  *
  * Called with the interlock locked and
  * returns it unlocked.
+ *
+ * Always aggressively sets the owning thread to promoted,
+ * even if it's the same or higher priority.
+ * This prevents it from lowering its own priority while holding a lock.
+ *
+ * TODO: Come up with a more efficient way to handle same-priority promotions
+ *      <rdar://problem/30737670> ARM mutex contention logic could avoid taking the thread lock
  */
 void
 lck_mtx_lock_wait (
@@ -927,10 +983,8 @@ lck_mtx_lock_wait (
 {
        thread_t                self = current_thread();
        lck_mtx_t               *mutex;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
-       __kdebug_only uintptr_t trace_holder = VM_KERNEL_UNSLIDE_OR_PERM(holder);
-       integer_t               priority;
-       spl_t                   s = splsched();
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+
 #if    CONFIG_DTRACE
        uint64_t                sleep_start = 0;
 
@@ -944,50 +998,65 @@ lck_mtx_lock_wait (
        else
                mutex = &lck->lck_mtx_ptr->lck_mtx;
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START, trace_lck, trace_holder, 0, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_START,
+                    trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
+
+       spl_t s = splsched();
+       thread_lock(holder);
+
+       assert_promotions_invariant(holder);
 
-       priority = self->sched_pri;
-       if (priority < self->base_pri)
-               priority = self->base_pri;
-       if (priority < BASEPRI_DEFAULT)
-               priority = BASEPRI_DEFAULT;
+       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0)
+               assert(holder->sched_pri >= mutex->lck_mtx_pri);
 
-       /* Do not promote past promotion ceiling */
+       integer_t priority = self->sched_pri;
+       priority = MAX(priority, self->base_pri);
+       priority = MAX(priority, BASEPRI_DEFAULT);
        priority = MIN(priority, MAXPRI_PROMOTE);
 
-       thread_lock(holder);
        if (mutex->lck_mtx_pri == 0) {
-               holder->promotions++;
-               holder->sched_flags |= TH_SFLAG_PROMOTED;
+               /* This is the first promotion for this mutex */
+               if (holder->promotions++ == 0) {
+                       /* This is the first promotion for holder */
+                       sched_thread_promote_to_pri(holder, priority, trace_lck);
+               } else {
+                       /* Holder was previously promoted due to a different mutex, raise to match this one */
+                       sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
+               }
+       } else {
+               /* Holder was previously promoted due to this mutex, check if the pri needs to go up */
+               sched_thread_update_promotion_to_pri(holder, priority, trace_lck);
        }
 
-       if (mutex->lck_mtx_pri < priority && holder->sched_pri < priority) {
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
-                                       holder->sched_pri, priority, trace_holder, trace_lck, 0);
-               set_sched_pri(holder, priority);
-       }
+       assert(holder->promotions > 0);
+       assert(holder->promotion_priority >= priority);
+
+       if ((holder->sched_flags & TH_SFLAG_DEPRESS) == 0)
+               assert(holder->sched_pri >= mutex->lck_mtx_pri);
+
+       assert_promotions_invariant(holder);
+
        thread_unlock(holder);
        splx(s);
 
        if (mutex->lck_mtx_pri < priority)
                mutex->lck_mtx_pri = priority;
-       if (self->pending_promoter[self->pending_promoter_index] == NULL) {
-               self->pending_promoter[self->pending_promoter_index] = mutex;
-               mutex->lck_mtx_waiters++;
-       }
-       else
-       if (self->pending_promoter[self->pending_promoter_index] != mutex) {
-               self->pending_promoter[++self->pending_promoter_index] = mutex;
+
+       if (self->waiting_for_mutex == NULL) {
+               self->waiting_for_mutex = mutex;
                mutex->lck_mtx_waiters++;
        }
 
+       assert(self->waiting_for_mutex == mutex);
+
        thread_set_pending_block_hint(self, kThreadWaitKernelMutex);
-       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT);
+       assert_wait(LCK_MTX_EVENT(mutex), THREAD_UNINT | THREAD_WAIT_NOREPORT_USER);
        lck_mtx_ilk_unlock(mutex);
 
        thread_block(THREAD_CONTINUE_NULL);
 
+       assert(mutex->lck_mtx_waiters > 0);
+
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_LCK_WAIT_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 #if    CONFIG_DTRACE
        /*
@@ -1023,50 +1092,80 @@ lck_mtx_lock_acquire(
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
        integer_t               priority;
-       spl_t                   s;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
                mutex = lck;
        else
                mutex = &lck->lck_mtx_ptr->lck_mtx;
 
-       if (thread->pending_promoter[thread->pending_promoter_index] == mutex) {
-               thread->pending_promoter[thread->pending_promoter_index] = NULL;
-               if (thread->pending_promoter_index > 0)
-                       thread->pending_promoter_index--;
+       /*
+        * If waiting_for_mutex is set, then this thread was previously blocked waiting on this lock
+        * If it's un-set, then this thread stole the lock from another waiter.
+        */
+       if (thread->waiting_for_mutex == mutex) {
+               assert(mutex->lck_mtx_waiters > 0);
+
+               thread->waiting_for_mutex = NULL;
                mutex->lck_mtx_waiters--;
        }
 
-       if (mutex->lck_mtx_waiters)
+       assert(thread->waiting_for_mutex == NULL);
+
+       if (mutex->lck_mtx_waiters > 0) {
                priority = mutex->lck_mtx_pri;
-       else {
+       } else {
+               /* I was the last waiter, so the mutex is no longer promoted or contended */
                mutex->lck_mtx_pri = 0;
                priority = 0;
        }
 
        if (priority || thread->was_promoted_on_wakeup) {
-               s = splsched();
+               __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
+
+               /*
+                * Note: was_promoted_on_wakeup can happen for multiple wakeups in a row without
+                * an intervening acquire if a thread keeps failing to acquire the lock
+                *
+                * If priority is true but not promoted on wakeup,
+                * then this is a lock steal of a promoted mutex, so it needs a ++ of promotions.
+                *
+                * If promoted on wakeup is true, but priority is not,
+                * then this is the last owner, and the last owner does not need a promotion.
+                */
+
+               spl_t s = splsched();
                thread_lock(thread);
 
+               assert_promotions_invariant(thread);
+
+               if (thread->was_promoted_on_wakeup)
+                       assert(thread->promotions > 0);
+
                if (priority) {
-                       thread->promotions++;
-                       thread->sched_flags |= TH_SFLAG_PROMOTED;
-                       if (thread->sched_pri < priority) {
-                               KERNEL_DEBUG_CONSTANT(
-                                       MACHDBG_CODE(DBG_MACH_SCHED,MACH_PROMOTE) | DBG_FUNC_NONE,
-                                                       thread->sched_pri, priority, 0, trace_lck, 0);
-                               /* Do not promote past promotion ceiling */
-                               assert(priority <= MAXPRI_PROMOTE);
-                               set_sched_pri(thread, priority);
+                       if (thread->promotions++ == 0) {
+                               /* This is the first promotion for holder */
+                               sched_thread_promote_to_pri(thread, priority, trace_lck);
+                       } else {
+                               /*
+                                * Holder was previously promoted due to a different mutex, raise to match this one
+                                * Or, this thread was promoted on wakeup but someone else later contended on mutex
+                                * at higher priority before we got here
+                                */
+                               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
                        }
                }
+
                if (thread->was_promoted_on_wakeup) {
                        thread->was_promoted_on_wakeup = 0;
-                       if (thread->promotions == 0)
-                               lck_mtx_clear_promoted(thread, trace_lck);
+                       if (--thread->promotions == 0)
+                               sched_thread_unpromote(thread, trace_lck);
                }
 
+               assert_promotions_invariant(thread);
+
+               if (priority && (thread->sched_flags & TH_SFLAG_DEPRESS) == 0)
+                       assert(thread->sched_pri >= priority);
+
                thread_unlock(thread);
                splx(s);
        }
@@ -1089,6 +1188,10 @@ lck_mtx_lock_acquire(
  * Invoked on unlock when there is contention.
  *
  * Called with the interlock locked.
+ *
+ * TODO: the 'waiters' flag does not indicate waiters exist on the waitqueue,
+ * it indicates waiters exist between wait and acquire.
+ * This means that here we may do extra unneeded wakeups.
  */
 void
 lck_mtx_unlock_wakeup (
@@ -1097,7 +1200,7 @@ lck_mtx_unlock_wakeup (
 {
        thread_t                thread = current_thread();
        lck_mtx_t               *mutex;
-       __kdebug_only uintptr_t trace_lck = VM_KERNEL_UNSLIDE_OR_PERM(lck);
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(lck);
 
        if (lck->lck_mtx_tag != LCK_MTX_TAG_INDIRECT)
                mutex = lck;
@@ -1107,20 +1210,36 @@ lck_mtx_unlock_wakeup (
        if (thread != holder)
                panic("lck_mtx_unlock_wakeup: mutex %p holder %p\n", mutex, holder);
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START, trace_lck, VM_KERNEL_UNSLIDE_OR_PERM(holder), 0, 0, 0);
+       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_START,
+                    trace_lck, (uintptr_t)thread_tid(thread), 0, 0, 0);
 
        assert(mutex->lck_mtx_waiters > 0);
+       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->waiting_for_mutex == NULL);
+
+       /*
+        * The waiters count does not precisely match the number of threads on the waitqueue,
+        * so we cannot assert that we actually wake up a thread here.
+        */
        if (mutex->lck_mtx_waiters > 1)
                thread_wakeup_one_with_pri(LCK_MTX_EVENT(lck), lck->lck_mtx_pri);
        else
                thread_wakeup_one(LCK_MTX_EVENT(lck));
 
-       if (thread->promotions > 0) {
-               spl_t           s = splsched();
-
+       /* When mutex->lck_mtx_pri is set, it means I as the owner have a promotion. */
+       if (mutex->lck_mtx_pri) {
+               spl_t s = splsched();
                thread_lock(thread);
-               if (--thread->promotions == 0 && (thread->sched_flags & TH_SFLAG_PROMOTED))
-                       lck_mtx_clear_promoted(thread, trace_lck);
+
+               assert(thread->promotions > 0);
+
+               assert_promotions_invariant(thread);
+
+               if (--thread->promotions == 0)
+                       sched_thread_unpromote(thread, trace_lck);
+
+               assert_promotions_invariant(thread);
+
                thread_unlock(thread);
                splx(s);
        }
@@ -1128,21 +1247,50 @@ lck_mtx_unlock_wakeup (
        KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
 }
 
+/*
+ * Callout from the waitqueue code from inside thread_wakeup_one_with_pri
+ * At splsched, thread is pulled from waitq, still locked, not on runqueue yet
+ *
+ * We always make sure to set the promotion flag, even if the thread is already at this priority,
+ * so that it doesn't go down.
+ */
 void
-lck_mtx_unlockspin_wakeup (
-       lck_mtx_t                       *lck)
+lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority)
 {
-       assert(lck->lck_mtx_waiters > 0);
-       thread_wakeup_one(LCK_MTX_EVENT(lck));
+       assert(priority <= MAXPRI_PROMOTE);
+       assert(thread->waiting_for_mutex != NULL);
 
-       KERNEL_DEBUG(MACHDBG_CODE(DBG_MACH_LOCKS, LCK_MTX_UNLCK_WAKEUP_CODE) | DBG_FUNC_NONE, VM_KERNEL_UNSLIDE_OR_PERM(lck), 0, 0, 1, 0);
-#if CONFIG_DTRACE
-       /*
-        * When there are waiters, we skip the hot-patch spot in the
-        * fastpath, so we record it here.
-        */
-       LOCKSTAT_RECORD(LS_LCK_MTX_UNLOCK_RELEASE, lck, 0);
-#endif
+       __kdebug_only uintptr_t trace_lck = unslide_for_kdebug(thread->waiting_for_mutex);
+
+       assert_promotions_invariant(thread);
+
+       if (thread->was_promoted_on_wakeup) {
+               /* Thread was previously promoted, but contended again */
+               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
+               return;
+       }
+
+       if (thread->promotions > 0 && priority <= thread->promotion_priority) {
+               /*
+                * Thread is already promoted to the right level, no need to do more
+                * I can draft off of another promotion here, which is OK
+                * because I know the thread will soon run acquire to get its own promotion
+                */
+               assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
+               return;
+       }
+
+       thread->was_promoted_on_wakeup = 1;
+
+       if (thread->promotions++ == 0) {
+               /* This is the first promotion for this thread */
+               sched_thread_promote_to_pri(thread, priority, trace_lck);
+       } else {
+               /* Holder was previously promoted due to a different mutex, raise to match this one */
+               sched_thread_update_promotion_to_pri(thread, priority, trace_lck);
+       }
+
+       assert_promotions_invariant(thread);
 }
 
 
@@ -1265,7 +1413,7 @@ lck_rw_sleep(
                        /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
                        assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
 
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -1319,7 +1467,7 @@ lck_rw_sleep_deadline(
                        /* Only if the caller wanted the lck_rw_t returned unlocked should we drop to 0 */
                        assert(lck_sleep_action & LCK_SLEEP_UNLOCK);
 
-                       lck_rw_clear_promotion(thread);
+                       lck_rw_clear_promotion(thread, unslide_for_kdebug(event));
                }
        }
 
@@ -1331,11 +1479,11 @@ lck_rw_sleep_deadline(
  *
  * We support a limited form of reader-writer
  * lock promotion whose effects are:
- * 
+ *
  *   * Qualifying threads have decay disabled
  *   * Scheduler priority is reset to a floor of
  *     of their statically assigned priority
- *     or BASEPRI_BACKGROUND
+ *     or MINPRI_RWLOCK
  *
  * The rationale is that lck_rw_ts do not have
  * a single owner, so we cannot apply a directed
@@ -1381,32 +1529,16 @@ lck_rw_sleep_deadline(
  * lck_rw_clear_promotion: Undo priority promotions when the last RW
  * lock is released by a thread (if a promotion was active)
  */
-void lck_rw_clear_promotion(thread_t thread)
+void lck_rw_clear_promotion(thread_t thread, uintptr_t trace_obj)
 {
        assert(thread->rwlock_count == 0);
 
        /* Cancel any promotions if the thread had actually blocked while holding a RW lock */
        spl_t s = splsched();
-
        thread_lock(thread);
 
-       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED) {
-               thread->sched_flags &= ~TH_SFLAG_RW_PROMOTED;
-
-               if (thread->sched_flags & TH_SFLAG_PROMOTED) {
-                       /* Thread still has a mutex promotion */
-               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread), thread->sched_pri, DEPRESSPRI, 0, 0);
-
-                       set_sched_pri(thread, DEPRESSPRI);
-               } else {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread), thread->sched_pri, thread->base_pri, 0, 0);
-
-                       thread_recompute_sched_pri(thread, FALSE);
-               }
-       }
+       if (thread->sched_flags & TH_SFLAG_RW_PROMOTED)
+               sched_thread_unpromote_reason(thread, TH_SFLAG_RW_PROMOTED, trace_obj);
 
        thread_unlock(thread);
        splx(s);
@@ -1424,27 +1556,10 @@ lck_rw_set_promotion_locked(thread_t thread)
        if (LcksOpts & disLkRWPrio)
                return;
 
-       integer_t priority;
+       assert(thread->rwlock_count > 0);
 
-       priority = thread->sched_pri;
-
-       if (priority < thread->base_pri)
-               priority = thread->base_pri;
-       if (priority < BASEPRI_BACKGROUND)
-               priority = BASEPRI_BACKGROUND;
-
-       if ((thread->sched_pri < priority) ||
-           !(thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
-               KERNEL_DEBUG_CONSTANT(
-                       MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE) | DBG_FUNC_NONE,
-                       (uintptr_t)thread_tid(thread), thread->sched_pri,
-                       thread->base_pri, priority, 0);
-
-               thread->sched_flags |= TH_SFLAG_RW_PROMOTED;
-
-               if (thread->sched_pri < priority)
-                       set_sched_pri(thread, priority);
-       }
+       if (!(thread->sched_flags & TH_SFLAG_RW_PROMOTED))
+               sched_thread_promote_reason(thread, TH_SFLAG_RW_PROMOTED, 0);
 }
 
 kern_return_t
index 99017d2cc19e04383d9acbb5207aed923f18926b..4db3c40f51de3609b7a04d3eed98d7d771be3984 100644 (file)
@@ -116,6 +116,12 @@ typedef    struct _lck_grp_ {
        lck_grp_stat_t          lck_grp_stat;
 } lck_grp_t;
 
+#define lck_grp_miss           lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_miss_cnt
+#define lck_grp_held           lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt
+#define lck_grp_util           lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_util_cnt
+#define lck_grp_wait           lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_wait_cnt
+#define lck_grp_direct_wait    lck_grp_stat.lck_grp_mtx_stat.lck_grp_mtx_held_cnt
+
 #define LCK_GRP_NULL   (lck_grp_t *)0
 
 #else
@@ -265,8 +271,14 @@ extern wait_result_t       lck_spin_sleep_deadline(
 
 #ifdef KERNEL_PRIVATE
 
+extern void                    lck_spin_lock_nopreempt(                lck_spin_t              *lck);
+
+extern void                    lck_spin_unlock_nopreempt(              lck_spin_t              *lck);
+
 extern boolean_t               lck_spin_try_lock(                      lck_spin_t              *lck);
 
+extern boolean_t               lck_spin_try_lock_nopreempt(            lck_spin_t              *lck);
+
 /* NOT SAFE: To be used only by kernel debugger to avoid deadlock. */
 extern boolean_t               kdp_lck_spin_is_acquired(               lck_spin_t              *lck);
 
@@ -313,6 +325,17 @@ extern wait_result_t       lck_mtx_sleep_deadline(
                                                                        event_t                         event,
                                                                        wait_interrupt_t        interruptible,
                                                                        uint64_t                        deadline);
+#if DEVELOPMENT || DEBUG
+extern void            erase_all_test_mtx_stats(void);
+extern int             get_test_mtx_stats_string(char* buffer, int buffer_size);
+extern void            lck_mtx_test_init(void);
+extern void            lck_mtx_test_lock(void);
+extern void            lck_mtx_test_unlock(void);
+extern int             lck_mtx_test_mtx_uncontended(int iter, char* buffer, int buffer_size);
+extern int             lck_mtx_test_mtx_contended(int iter, char* buffer, int buffer_size);
+extern int             lck_mtx_test_mtx_uncontended_loop_time(int iter, char* buffer, int buffer_size);
+extern int             lck_mtx_test_mtx_contended_loop_time(int iter, char* buffer, int buffer_size);
+#endif
 
 #ifdef KERNEL_PRIVATE
 
@@ -396,8 +419,6 @@ extern int                          lck_mtx_lock_acquire(
 extern void                            lck_mtx_unlock_wakeup(
                                                                        lck_mtx_t               *lck,
                                                                        thread_t                holder);
-extern void                            lck_mtx_unlockspin_wakeup(
-                                                                       lck_mtx_t               *lck);
 
 extern boolean_t               lck_mtx_ilk_unlock(
                                                                        lck_mtx_t               *lck);
@@ -405,6 +426,8 @@ extern boolean_t            lck_mtx_ilk_unlock(
 extern boolean_t               lck_mtx_ilk_try_lock(
                                                                        lck_mtx_t               *lck);
 
+extern void lck_mtx_wakeup_adjust_pri(thread_t thread, integer_t priority);
+
 #endif
 
 #define decl_lck_rw_data(class,name)     class lck_rw_t name;
@@ -466,9 +489,10 @@ extern void                                lck_rw_assert(
                                                                        lck_rw_t                *lck,
                                                                        unsigned int            type);
 
-extern void                            lck_rw_clear_promotion(
-                                                                       thread_t                thread);
+extern void lck_rw_clear_promotion(thread_t thread, uintptr_t trace_obj);
 extern void lck_rw_set_promotion_locked(thread_t thread);
+
+uintptr_t unslide_for_kdebug(void* object);
 #endif
 
 #ifdef KERNEL_PRIVATE
index 073f27c9ad799c2953f8393db63f6fb1b739065d..43eb195108b62277c577fcd154feabb4f1ba6ddb 100644 (file)
@@ -405,6 +405,22 @@ void ltable_grow(struct link_table *table, uint32_t min_free)
        return;
 }
 
+#if DEVELOPMENT || DEBUG
+
+int
+ltable_nelem(struct link_table *table)
+{
+       int nelem = 0;
+
+       lck_mtx_lock(&table->lock);
+
+       nelem = table->used_elem;
+
+       lck_mtx_unlock(&table->lock);
+
+       return nelem;
+}
+#endif
 
 /**
  * ltable_alloc_elem: allocate one or more elements from a given table
index aa62edfb9eb3f67f66698360221bf3f55e5785e6..c95743f10c95dd8ab7b97ce1bfc6f76718648cd0 100644 (file)
@@ -208,6 +208,15 @@ struct lt_elem *ltable_alloc_elem(struct link_table *table, int type,
                                  int nelem, int nattempts);
 
 
+#if DEVELOPMENT || DEBUG
+/**
+ * ltable_nelem: returns how many elements are used in this
+ * table.
+ */
+extern
+int ltable_nelem(struct link_table *table);
+#endif
+
 /**
  * ltable_realloc_elem: convert a reserved element to a particular type
  *
index 4a0d96dc8903d918d5398fe9ea77e870d0065574..d342f3b4878b3356a054f9148333ddc1dfabfdd9 100644 (file)
@@ -318,6 +318,7 @@ mach_node_register(mach_node_t      node)
         goto out;
     }
 
+    waitq_set_lazy_init_link(pp_set);
     /* Add the bootstrap port to the proxy port set */
     uint64_t wq_link_id = waitq_link_reserve(NULL);
     uint64_t wq_reserved_prepost = waitq_prepost_reserve(NULL, 10,
index cc0290ee119a161d32348d487af032d76ab52d3b..43d69835ca4cb7473d57ece068d4bbe91253973d 100644 (file)
@@ -77,6 +77,7 @@
 #include <kern/kern_types.h>
 #include <kern/counters.h>
 #include <kern/cpu_data.h>
+#include <kern/cpu_quiesce.h>
 #include <kern/ipc_host.h>
 #include <kern/host.h>
 #include <kern/machine.h>
@@ -126,10 +127,7 @@ processor_up(
        pset = processor->processor_set;
        pset_lock(pset);
        ++pset->online_processor_count;
-       enqueue_tail(&pset->active_queue, (queue_entry_t)processor);
-       processor->state = PROCESSOR_RUNNING;
-       pset->active_processor_count++;
-       sched_update_pset_load_average(pset);
+       pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
        (void)hw_atomic_add(&processor_avail_count, 1);
        commpage_update_active_cpus();
        pset_unlock(pset);
@@ -230,15 +228,7 @@ processor_shutdown(
                return (KERN_SUCCESS);
        }
 
-       if (processor->state == PROCESSOR_IDLE) {
-               remqueue((queue_entry_t)processor);
-       } else if (processor->state == PROCESSOR_RUNNING) {
-               remqueue((queue_entry_t)processor);
-               pset->active_processor_count--;
-               sched_update_pset_load_average(pset);
-       }
-
-       processor->state = PROCESSOR_SHUTDOWN;
+       pset_update_processor_state(pset, processor, PROCESSOR_SHUTDOWN);
 
        pset_unlock(pset);
 
@@ -285,7 +275,7 @@ processor_doshutdown(
 
        pset = processor->processor_set;
        pset_lock(pset);
-       processor->state = PROCESSOR_OFF_LINE;
+       pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
        --pset->online_processor_count;
        (void)hw_atomic_sub(&processor_avail_count, 1);
        commpage_update_active_cpus();
@@ -331,6 +321,12 @@ processor_offline(
        thread_t old_thread = processor->active_thread;
        thread_t new_thread = processor->idle_thread;
 
+       if (!new_thread->kernel_stack) {
+               /* the idle thread has a reserved stack, so this will never fail */
+               if (!stack_alloc_try(new_thread))
+                       panic("processor_offline");
+       }
+
        processor->active_thread = new_thread;
        processor_state_update_idle(processor);
        processor->starting_pri = IDLEPRI;
@@ -343,7 +339,7 @@ processor_offline(
        old_thread->last_run_time = ctime;
 
        /* Update processor->thread_timer and ->kernel_timer to point to the new thread */
-       thread_timer_event(ctime, &new_thread->system_timer);
+       processor_timer_switch_thread(ctime, &new_thread->system_timer);
        PROCESSOR_DATA(processor, kernel_timer) = &new_thread->system_timer;
        timer_stop(PROCESSOR_DATA(processor, current_state), ctime);
 
@@ -356,6 +352,8 @@ processor_offline(
 
        thread_dispatch(old_thread, new_thread);
 
+       cpu_quiescent_counter_leave(processor->last_dispatch);
+
        PMAP_DEACTIVATE_KERNEL(processor->cpu_id);
 
        cpu_sleep();
index 63eb58c09e36d2c1791335e6d01cdf18905d283c..74a0a7d631750e49120a7c0a81c3ead7876408e3 100644 (file)
@@ -46,6 +46,7 @@
 #include <mach/machine/vm_types.h>
 #include <ipc/ipc_types.h>
 #include <kern/debug.h>
+#include <libkern/copyio.h>
 
 #ifndef MIN
 #define MIN(a,b) (((a)<(b))?(a):(b))
@@ -85,12 +86,6 @@ extern int testbit(
        int             which,
        int             *bitmap);
 
-/* Move arbitrarily-aligned data from a user space to kernel space */
-extern int copyin(
-       const user_addr_t   user_addr,
-       char                *kernel_addr,
-       vm_size_t           nbytes);
-
 /* Move an aligned 32 or 64-bit word from user space to kernel space
  * using a single read instruction
  *
@@ -116,12 +111,6 @@ extern int copyinmsg(
        char                *kernel_addr,
        mach_msg_size_t     nbytes);
 
-/* Move arbitrarily-aligned data from a kernel space to user space */
-extern int copyout(
-       const void      *kernel_addr,
-       user_addr_t     user_addr,
-       vm_size_t       nbytes);
-
 /* Move arbitrarily-aligned data from a kernel space to user space */
 extern int copyoutmsg(
        const char      *kernel_addr,
index a082a35353a5f9e928713937d1b2cbf999e59289..65fd271402d767175f66caf872edce4a1b7a6ff4 100644 (file)
@@ -30,6 +30,9 @@
 
 #include <stdbool.h>
 #include <stdint.h>
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
 
 extern bool mt_debug;
 extern _Atomic uint64_t mt_pmis;
@@ -43,12 +46,16 @@ uint64_t mt_cur_cpu_cycles(void);
 uint64_t mt_cur_thread_instrs(void);
 uint64_t mt_cur_thread_cycles(void);
 
+__END_DECLS
+
 #if MACH_KERNEL_PRIVATE
 
 #include <kern/thread.h>
 #include <kern/task.h>
 #include <stdbool.h>
 
+__BEGIN_DECLS
+
 #if defined(__arm__) || defined(__arm64__)
 #include <arm/cpu_data_internal.h>
 #elif defined(__x86_64__)
@@ -57,7 +64,6 @@ uint64_t mt_cur_thread_cycles(void);
 #error unsupported architecture
 #endif /* !defined(__arm__) && !defined(__arm64__) && !defined(__x86_64__) */
 
-void mt_init(void);
 void mt_update_fixed_counts(void);
 void mt_update_task(task_t task, thread_t thread);
 bool mt_update_thread(thread_t thread);
@@ -65,22 +71,17 @@ int mt_fixed_thread_counts(thread_t thread, uint64_t *counts_out);
 int mt_fixed_task_counts(task_t task, uint64_t *counts_out);
 
 /*
- * Called when a thread is switching off-core or expires its quantum.
- */
-void mt_sched_update(thread_t thread);
-
-/*
- * Called when a thread is terminating to save its counters into the task.  The
- * task lock must be held and the thread should be removed from the task's
- * thread list in that same critical section.
+ * Private API for the platform layers.
  */
-void mt_terminate_update(task_t task, thread_t thread);
 
 /*
- * Called when a core receives a PMI.
+ * Called once early in boot, before CPU initialization occurs (where
+ * `mt_cpu_up` is called).
+ *
+ * This allows monotonic to detect whether the hardware supports performance
+ * counters and to install the global PMI handler.
  */
-void mt_cpu_pmi(cpu_data_t *cpu, uint64_t pmsr);
-uint64_t mt_cpu_update_count(cpu_data_t *cpu, unsigned int ctr);
+void mt_early_init(void);
 
 /*
  * Called when a core is idling and exiting from idle.
@@ -95,10 +96,42 @@ void mt_cpu_down(cpu_data_t *cpu);
 void mt_cpu_up(cpu_data_t *cpu);
 
 /*
- * Called while single-threaded when the system is going to sleep and waking up.
+ * Called while single-threaded when the system is going to sleep.
  */
 void mt_sleep(void);
-void mt_wake(void);
+
+/*
+ * Called on each CPU as the system is waking from sleep.
+ */
+void mt_wake_per_core(void);
+
+#if __ARM_CLUSTER_COUNT__
+/*
+ * Called when a cluster is initialized.
+ */
+void mt_cluster_init(void);
+#endif /* __ARM_CLUSTER_COUNT__ */
+
+/*
+ * "Up-call" to the Mach layer to update counters from a PMI.
+ */
+uint64_t mt_cpu_update_count(cpu_data_t *cpu, unsigned int ctr);
+
+/*
+ * Private API for the scheduler.
+ */
+
+/*
+ * Called when a thread is switching off-core or expires its quantum.
+ */
+void mt_sched_update(thread_t thread);
+
+/*
+ * Called when a thread is terminating to save its counters into the task.  The
+ * task lock must be held and the thread should be removed from the task's
+ * thread list in that same critical section.
+ */
+void mt_terminate_update(task_t task, thread_t thread);
 
 /*
  * Private API for the performance controller callout.
@@ -111,6 +144,16 @@ void mt_perfcontrol(uint64_t *instrs, uint64_t *cycles);
 void mt_stackshot_thread(thread_t thread, uint64_t *instrs, uint64_t *cycles);
 void mt_stackshot_task(task_t task, uint64_t *instrs, uint64_t *cycles);
 
+/*
+ * Private API for microstackshot.
+ */
+typedef void (*mt_pmi_fn)(bool user_mode, void *ctx);
+int mt_microstackshot_start(unsigned int ctr, uint64_t period, mt_pmi_fn fn,
+               void *ctx);
+int mt_microstackshot_stop(void);
+
+__END_DECLS
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 #endif /* !defined(KERN_MONOTONIC_H) */
index 91e4c0d1235df202ec408787f9bcdf8dd991a352..3e2814408e70b193aa2c84c2d679763b5c0a703a 100644 (file)
@@ -106,9 +106,10 @@ extern kern_return_t task_importance(task_t task, integer_t importance);
 #define TASK_POLICY_QOS                 0x35
 #define TASK_POLICY_QOS_OVERRIDE        0x36
 #define TASK_POLICY_QOS_AND_RELPRIO     0x38 /* QoS as value1, relative priority as value2 */
+#define TASK_POLICY_QOS_WORKQ_OVERRIDE  0x3B
 #define TASK_POLICY_QOS_PROMOTE         0x3C
 #define TASK_POLICY_QOS_IPC_OVERRIDE    0x3D
-#define TASK_POLICY_QOS_SYNC_IPC_OVERRIDE    0x3E
+// was TASK_POLICY_QOS_SYNC_IPC_OVERRIDE 0x3E
 
 #define TASK_POLICY_MAX                 0x3F
 
@@ -152,13 +153,8 @@ extern void proc_inherit_task_role(task_t new_task, task_t old_task);
 #define THROTTLE_LEVEL_COMPRESSOR_TIER1         THROTTLE_LEVEL_TIER1
 #define THROTTLE_LEVEL_COMPRESSOR_TIER2         THROTTLE_LEVEL_TIER2
 
-#if CONFIG_EMBEDDED
-#define THROTTLE_LEVEL_PAGEOUT_THROTTLED        THROTTLE_LEVEL_TIER3
-#define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED      THROTTLE_LEVEL_TIER1
-#else
 #define THROTTLE_LEVEL_PAGEOUT_THROTTLED        THROTTLE_LEVEL_TIER2
 #define THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED      THROTTLE_LEVEL_TIER1
-#endif
 
 #if CONFIG_IOSCHED
 #define IOSCHED_METADATA_TIER                   THROTTLE_LEVEL_TIER1
@@ -172,22 +168,17 @@ extern void proc_apply_task_networkbg(void * bsd_info, thread_t thread);
 #endif /* MACH_BSD */
 
 /* Functions used by pthread_shims.c */
-extern boolean_t proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid,
+extern int proc_thread_qos_add_override(task_t task, thread_t thread, uint64_t tid,
                                               int override_qos, boolean_t first_override_for_resource,
                                               user_addr_t resource, int resource_type);
-extern int proc_thread_qos_add_override_check_owner(thread_t thread, int override_qos,
-               boolean_t first_override_for_resource, user_addr_t resource, int resource_type,
-               user_addr_t user_lock_addr, mach_port_name_t user_lock_owner);
-extern boolean_t proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid,
+extern int proc_thread_qos_remove_override(task_t task, thread_t thread, uint64_t tid,
                                                  user_addr_t resource, int resource_type);
-extern boolean_t proc_thread_qos_reset_override(task_t task, thread_t thread, uint64_t tid,
-                                                 user_addr_t resource, int resource_type);
-extern int proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type);
 
-extern kern_return_t
-thread_set_workq_qos(thread_t thread, int qos_tier, int relprio);
-extern kern_return_t
-thread_set_workq_pri(thread_t thread, integer_t priority, integer_t policy);
+extern void thread_reset_workq_qos(thread_t thread, uint32_t qos);
+extern void thread_set_workq_override(thread_t thread, uint32_t qos);
+extern void thread_set_workq_pri(thread_t thread, thread_qos_t qos, integer_t priority, integer_t policy);
+extern uint8_t thread_workq_pri_for_qos(thread_qos_t qos) __pure2;
+extern thread_qos_t thread_workq_qos_for_pri(int priority);
 
 extern int
 task_get_default_manager_qos(task_t task);
@@ -204,6 +195,7 @@ extern int proc_lf_pidbind(task_t curtask, uint64_t tid, task_t target_task, int
 /* Importance inheritance functions not under IMPORTANCE_INHERITANCE */
 extern void task_importance_mark_donor(task_t task, boolean_t donating);
 extern void task_importance_reset(task_t task);
+extern void task_importance_init_from_parent(task_t new_task, task_t parent_task);
 
 #if IMPORTANCE_INHERITANCE
 extern boolean_t task_is_importance_donor(task_t task);
@@ -252,19 +244,8 @@ extern int task_importance_estimate(task_t task);
 extern kern_return_t thread_policy_set_internal(thread_t thread, thread_policy_flavor_t flavor,
                                                 thread_policy_t policy_info, mach_msg_type_number_t count);
 
-struct promote_token {
-       uint16_t        pt_basepri;
-       uint16_t        pt_qos;
-};
-
-#define PROMOTE_TOKEN_INIT ((struct promote_token){.pt_basepri = 0, .pt_qos = 0})
-
-extern void thread_user_promotion_add(thread_t thread, thread_t promoter, struct promote_token* promote_token);
-extern void thread_user_promotion_update(thread_t thread, thread_t promoter, struct promote_token* promote_token);
-extern void thread_user_promotion_drop(thread_t thread);
-
-/* for thread exec promotion */
-#define EXEC_BOOST_PRIORITY 31
+extern boolean_t thread_recompute_user_promotion_locked(thread_t thread);
+extern thread_qos_t thread_user_promotion_qos_for_pri(int priority);
 
 extern void thread_set_exec_promotion(thread_t thread);
 extern void thread_clear_exec_promotion(thread_t thread);
@@ -273,9 +254,9 @@ extern void thread_clear_exec_promotion(thread_t thread);
 extern void thread_add_ipc_override(thread_t thread, uint32_t qos_override);
 extern void thread_update_ipc_override(thread_t thread, uint32_t qos_override);
 extern void thread_drop_ipc_override(thread_t thread);
-extern void thread_add_sync_ipc_override(thread_t thread);
-extern void thread_drop_sync_ipc_override(thread_t thread);
-extern uint32_t thread_get_ipc_override(thread_t thread);
+
+/* for ipc_pset.c */
+extern thread_qos_t thread_get_requested_qos(thread_t thread, int *relpri);
 
 /*
  ******************************
index 54a3220d3608282e9ad2d28c3f2c57385b44ab20..c8abeb624fd7f4d0e07cbc7665d95d8d09a5b754 100644 (file)
 #include <arm/cpu_data_internal.h>
 #endif
 
+
 #define isdigit(d) ((d) >= '0' && (d) <= '9')
 #define Ctod(c) ((c) - '0')
 
@@ -585,6 +586,7 @@ __doprnt(
                        const char* strp = str;
                        int strl = sizeof(str) - 1;
 
+
                        if (u >= VM_MIN_KERNEL_AND_KEXT_ADDRESS && u <= VM_MAX_KERNEL_ADDRESS) {
                            while(*strp != '\0') {
                                (*putc)(*strp, arg);
index 29ee2506fcbabc9cd0d195cf7b41c9405042de33..40eb17242b70832b44113719822e03184b7b652d 100644 (file)
@@ -117,12 +117,14 @@ thread_quantum_expire(
        /*
         * We bill CPU time to both the individual thread and its task.
         *
-        * Because this balance adjustment could potentially attempt to wake this very
-        * thread, we must credit the ledger before taking the thread lock. The ledger
-        * pointers are only manipulated by the thread itself at the ast boundary.
+        * Because this balance adjustment could potentially attempt to wake this
+        * very thread, we must credit the ledger before taking the thread lock.
+        * The ledger pointers are only manipulated by the thread itself at the ast
+        * boundary.
         *
-        * TODO: This fails to account for the time between when the timer was armed and when it fired.
-        * It should be based on the system_timer and running a thread_timer_event operation here.
+        * TODO: This fails to account for the time between when the timer was
+        * armed and when it fired.  It should be based on the system_timer and
+        * running a timer_update operation here.
         */
        ledger_credit(thread->t_ledger, task_ledgers.cpu_time, thread->quantum_remaining);
        ledger_credit(thread->t_threadledger, thread_ledgers.cpu_time, thread->quantum_remaining);
@@ -154,14 +156,15 @@ thread_quantum_expire(
        /*
         *      Check for fail-safe trip.
         */
-       if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) && 
-           !(thread->sched_flags & TH_SFLAG_PROMOTED_MASK) &&
-           !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
-               uint64_t new_computation;
-  
-               new_computation = ctime - thread->computation_epoch;
-               new_computation += thread->computation_metered;
-               if (new_computation > max_unsafe_computation) {
+       if ((thread->sched_mode == TH_MODE_REALTIME || thread->sched_mode == TH_MODE_FIXED) &&
+           !(thread->sched_flags & TH_SFLAG_PROMOTED) &&
+           !(thread->sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) &&
+           !(thread->options & TH_OPT_SYSTEM_CRITICAL)) {
+               uint64_t new_computation;
+
+               new_computation = ctime - thread->computation_epoch;
+               new_computation += thread->computation_metered;
+               if (new_computation > max_unsafe_computation) {
                        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_FAILSAFE)|DBG_FUNC_NONE,
                                        (uintptr_t)thread->sched_pri, (uintptr_t)thread->sched_mode, 0, 0, 0);
 
@@ -199,12 +202,9 @@ thread_quantum_expire(
         * during privilege transitions, synthesize an event now.
         */
        if (!thread->precise_user_kernel_time) {
-               timer_switch(PROCESSOR_DATA(processor, current_state),
-                                        ctime,
-                                        PROCESSOR_DATA(processor, current_state));
-               timer_switch(PROCESSOR_DATA(processor, thread_timer),
-                                        ctime,
-                                        PROCESSOR_DATA(processor, thread_timer));
+               timer_update(PROCESSOR_DATA(processor, current_state), ctime);
+               timer_update(PROCESSOR_DATA(processor, thread_timer), ctime);
+               timer_update(&thread->runnable_timer, ctime);
        }
 
 
@@ -301,7 +301,7 @@ sched_set_thread_base_priority(thread_t thread, int priority)
        }
        sched_update_thread_bucket(thread);
 
-       thread_recompute_sched_pri(thread, FALSE);
+       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
 }
 
 /*
@@ -311,28 +311,54 @@ sched_set_thread_base_priority(thread_t thread, int priority)
  *     according to its base priority if the
  *     thread has not been promoted or depressed.
  *
- *     This is the standard way to push base_pri changes into sched_pri,
- *     or to recalculate the appropriate sched_pri after clearing
+ *     This is the only way to push base_pri changes into sched_pri,
+ *     or to recalculate the appropriate sched_pri after changing
  *     a promotion or depression.
  *
  *     Called at splsched with the thread locked.
+ *
+ *     TODO: Add an 'update urgency' flag to avoid urgency callouts on every rwlock operation
  */
 void
-thread_recompute_sched_pri(
-                           thread_t thread,
-                           boolean_t override_depress)
+thread_recompute_sched_pri(thread_t thread, set_sched_pri_options_t options)
 {
-       int priority;
+       uint32_t     sched_flags = thread->sched_flags;
+       sched_mode_t sched_mode  = thread->sched_mode;
 
-       if (thread->sched_mode == TH_MODE_TIMESHARE)
+       int priority = thread->base_pri;
+
+       if (sched_mode == TH_MODE_TIMESHARE)
                priority = SCHED(compute_timeshare_priority)(thread);
-       else
-               priority = thread->base_pri;
 
-       if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
-           (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) || override_depress)) {
-               set_sched_pri(thread, priority);
+       if (sched_flags & TH_SFLAG_DEPRESS) {
+               /* thread_yield_internal overrides kernel mutex promotion */
+               priority = DEPRESSPRI;
+       } else {
+               /* poll-depress is overridden by mutex promotion and promote-reasons */
+               if ((sched_flags & TH_SFLAG_POLLDEPRESS)) {
+                       priority = DEPRESSPRI;
+               }
+
+               if (sched_flags & TH_SFLAG_PROMOTED) {
+                       priority = MAX(priority, thread->promotion_priority);
+
+                       if (sched_mode != TH_MODE_REALTIME)
+                               priority = MIN(priority, MAXPRI_PROMOTE);
+               }
+
+               if (sched_flags & TH_SFLAG_PROMOTE_REASON_MASK) {
+                       if (sched_flags & TH_SFLAG_RW_PROMOTED)
+                               priority = MAX(priority, MINPRI_RWLOCK);
+
+                       if (sched_flags & TH_SFLAG_WAITQ_PROMOTED)
+                               priority = MAX(priority, MINPRI_WAITQ);
+
+                       if (sched_flags & TH_SFLAG_EXEC_PROMOTED)
+                               priority = MAX(priority, MINPRI_EXEC);
+               }
        }
+
+       set_sched_pri(thread, priority, options);
 }
 
 void
@@ -380,23 +406,8 @@ lightweight_update_priority(thread_t thread)
 
                priority = sched_compute_timeshare_priority(thread);
 
-               /*
-                * Adjust the scheduled priority like thread_recompute_sched_pri,
-                * except with the benefit of knowing the thread is on this core.
-                */
-               if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
-                   (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) &&
-                   priority != thread->sched_pri) {
-
-                       thread->sched_pri = priority;
-
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->base_pri,
-                                             thread->sched_pri,
-                                             thread->sched_usage,
-                                             0);
-               }
+               if (priority != thread->sched_pri)
+                       thread_recompute_sched_pri(thread, SETPRI_LAZY);
        }
 }
 
@@ -512,8 +523,6 @@ update_priority(
 
        thread->sched_stamp += ticks;
 
-       thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
-
        /* If requested, accelerate aging of sched_usage */
        if (sched_decay_usage_age_factor > 1)
                ticks *= sched_decay_usage_age_factor;
@@ -524,9 +533,9 @@ update_priority(
        thread_timer_delta(thread, delta);
        if (ticks < SCHED_DECAY_TICKS) {
                /*
-                *      Accumulate timesharing usage only
-                *      during contention for processor
-                *      resources.
+                *      Accumulate timesharing usage only during contention for processor
+                *      resources. Use the pri_shift from the previous tick window to 
+                *      determine if the system was in a contended state.
                 */
                if (thread->pri_shift < INT8_MAX)
                        thread->sched_usage += delta;
@@ -561,36 +570,17 @@ update_priority(
        }
 
        /*
-        *      Recompute scheduled priority if appropriate.
+        * Now that the thread's CPU usage has been accumulated and aged
+        * based on contention of the previous tick window, update the
+        * pri_shift of the thread to match the current global load/shift
+        * values. The updated pri_shift would be used to calculate the
+        * new priority of the thread.
         */
-       if (thread->sched_mode == TH_MODE_TIMESHARE) {
-               int priority = sched_compute_timeshare_priority(thread);
-
-               /*
-                * Adjust the scheduled priority like thread_recompute_sched_pri,
-                * except without setting an AST.
-                */
-               if ((!(thread->sched_flags & TH_SFLAG_PROMOTED_MASK)  || (priority > thread->sched_pri)) &&
-                   (!(thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)) &&
-                   priority != thread->sched_pri) {
-
-                       boolean_t removed = thread_run_queue_remove(thread);
-
-                       thread->sched_pri = priority;
-
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->base_pri,
-                                             thread->sched_pri,
-                                             thread->sched_usage,
-                                             0);
-
-                       if (removed)
-                               thread_run_queue_reinsert(thread, SCHED_TAILQ);
-               }
-       }
+       thread->pri_shift = sched_pri_shifts[thread->th_sched_bucket];
 
-       return;
+       /* Recompute scheduled priority if appropriate. */
+       if (thread->sched_mode == TH_MODE_TIMESHARE)
+               thread_recompute_sched_pri(thread, SETPRI_LAZY);
 }
 
 #endif /* CONFIG_SCHED_TIMESHARE_CORE */
@@ -662,8 +652,10 @@ sched_update_thread_bucket(thread_t thread)
                break;
 
        case TH_MODE_TIMESHARE:
-               if (thread->base_pri > BASEPRI_UTILITY)
+               if (thread->base_pri > BASEPRI_DEFAULT)
                        new_bucket = TH_BUCKET_SHARE_FG;
+               else if (thread->base_pri > BASEPRI_UTILITY)
+                       new_bucket = TH_BUCKET_SHARE_DF;
                else if (thread->base_pri > MAXPRI_THROTTLE)
                        new_bucket = TH_BUCKET_SHARE_UT;
                else
@@ -779,4 +771,168 @@ sched_thread_mode_undemote(thread_t thread, uint32_t reason)
                thread_run_queue_reinsert(thread, SCHED_TAILQ);
 }
 
+/*
+ * Promote thread to a specific priority
+ *
+ * Promotion must not last past syscall boundary
+ * Clients must always pair promote and unpromote 1:1
+ *
+ * Called at splsched with thread locked
+ */
+void
+sched_thread_promote_to_pri(thread_t    thread,
+                            int         priority,
+              __kdebug_only uintptr_t   trace_obj /* already unslid */)
+{
+       assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED);
+       assert(thread->promotion_priority == 0);
+       assert(priority <= MAXPRI_PROMOTE);
+       assert(priority > 0);
+
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED),
+            thread_tid(thread), trace_obj, priority);
+
+       thread->sched_flags |= TH_SFLAG_PROMOTED;
+       thread->promotion_priority = priority;
+
+       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+}
+
+
+/*
+ * Update a pre-existing priority promotion to have a higher priority floor
+ * Priority can only go up from the previous value
+ * Update must occur while a promotion is active
+ *
+ * Called at splsched with thread locked
+ */
+void
+sched_thread_update_promotion_to_pri(thread_t   thread,
+                                     int        priority,
+                       __kdebug_only uintptr_t  trace_obj /* already unslid */)
+{
+       assert(thread->promotions > 0);
+       assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
+       assert(thread->promotion_priority > 0);
+       assert(priority <= MAXPRI_PROMOTE);
+
+       if (thread->promotion_priority < priority) {
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_PROMOTED_UPDATE),
+                    thread_tid(thread), trace_obj, priority);
+
+               thread->promotion_priority = priority;
+               thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+       }
+}
+
+/*
+ * End a priority promotion
+ * Demotes a thread back to its expected priority without the promotion in place
+ *
+ * Called at splsched with thread locked
+ */
+void
+sched_thread_unpromote(thread_t     thread,
+         __kdebug_only uintptr_t    trace_obj /* already unslid */)
+{
+       assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
+       assert(thread->promotion_priority > 0);
+
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_UNPROMOTED),
+            thread_tid(thread), trace_obj, 0);
+
+       thread->sched_flags &= ~TH_SFLAG_PROMOTED;
+       thread->promotion_priority = 0;
+
+       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+}
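
To make the "pair promote and unpromote 1:1" contract above concrete, a hypothetical client inside a syscall could bracket the region that needs the priority floor as in the sketch below. This is illustrative only: the thread, trace_obj, and chosen priority are assumptions, not taken from this patch, and the promotion must be dropped before returning to user space.

    /* Illustrative sketch only -- not part of the patch. */
    spl_t s = splsched();
    thread_lock(thread);
    sched_thread_promote_to_pri(thread, MAXPRI_PROMOTE, trace_obj);
    thread_unlock(thread);
    splx(s);

    /* ... work that requires the elevated priority floor ... */

    s = splsched();
    thread_lock(thread);
    sched_thread_unpromote(thread, trace_obj);
    thread_unlock(thread);
    splx(s);
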
+
+/* called with thread locked */
+void
+assert_promotions_invariant(thread_t thread)
+{
+       if (thread->promotions > 0)
+               assert((thread->sched_flags & TH_SFLAG_PROMOTED) == TH_SFLAG_PROMOTED);
+
+       if (thread->promotions == 0)
+               assert((thread->sched_flags & TH_SFLAG_PROMOTED) != TH_SFLAG_PROMOTED);
+}
+
+/*
+ * Promote thread to have a sched pri floor for a specific reason
+ *
+ * Promotion must not last past syscall boundary
+ * Clients must always pair promote and demote 1:1,
+ * Handling nesting of the same promote reason is the client's responsibility
+ *
+ * Called at splsched with thread locked
+ */
+void
+sched_thread_promote_reason(thread_t    thread,
+                            uint32_t    reason,
+              __kdebug_only uintptr_t   trace_obj /* already unslid */)
+{
+       assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
+       assert((thread->sched_flags & reason) != reason);
+
+       switch (reason) {
+       case TH_SFLAG_RW_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_PROMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       case TH_SFLAG_WAITQ_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       case TH_SFLAG_EXEC_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       }
+
+       thread->sched_flags |= reason;
+
+       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+}
+
+/*
+ * End a specific promotion reason
+ * Demotes a thread back to its expected priority without the promotion in place
+ *
+ * Called at splsched with thread locked
+ */
+void
+sched_thread_unpromote_reason(thread_t  thread,
+                              uint32_t  reason,
+                __kdebug_only uintptr_t trace_obj /* already unslid */)
+{
+       assert(reason & TH_SFLAG_PROMOTE_REASON_MASK);
+       assert((thread->sched_flags & reason) == reason);
+
+       switch (reason) {
+       case TH_SFLAG_RW_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_RW_DEMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       case TH_SFLAG_WAITQ_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       case TH_SFLAG_EXEC_PROMOTED:
+               KDBG(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE),
+                    thread_tid(thread), thread->sched_pri,
+                    thread->base_pri, trace_obj);
+               break;
+       }
+
+       thread->sched_flags &= ~reason;
+
+       thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
+}
+
 
diff --git a/osfmk/kern/priority_queue.c b/osfmk/kern/priority_queue.c
new file mode 100644 (file)
index 0000000..5314d60
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/priority_queue.h>
+#include <mach/vm_param.h>
+
+#ifdef __LP64__
+static_assert(PRIORITY_QUEUE_ENTRY_CHILD_BITS >= VM_KERNEL_POINTER_SIGNIFICANT_BITS,
+               "Priority Queue child pointer packing failed");
+#endif
+
+priority_queue_entry_t
+pqueue_pair_meld(priority_queue_entry_t elt, priority_queue_compare_fn_t cmp_fn)
+{
+       priority_queue_entry_t pq_meld_result = NULL;
+       priority_queue_entry_t pair_list = NULL;
+
+       assert(elt); // caller needs to check this.
+
+       /* Phase 1: */
+       /* Split the list into a set of pairs going front to back. */
+       /* Hook these pairs onto an intermediary list in reverse order of traversal.*/
+
+       do {
+               /* Consider two elements at a time for pairing */
+               priority_queue_entry_t pair_item_a = elt;
+               priority_queue_entry_t pair_item_b = elt->next;
+               if (pair_item_b == NULL) {
+                       /* Odd number of elements in the list; link the odd element */
+                       /* as it is on the intermediate list. */
+                       pair_item_a->prev = pair_list;
+                       pair_list = pair_item_a;
+                       break;
+               }
+               /* Found two elements to pair up */
+               elt = pair_item_b->next;
+               priority_queue_entry_t pair = pqueue_merge(pair_item_a, pair_item_b, cmp_fn);
+               /* Link the pair onto the intermediary list */
+               pair->prev = pair_list;
+               pair_list = pair;
+       } while (elt != NULL);
+
+       /* Phase 2: Merge all the pairs in the pair_list */
+       do {
+               elt = pair_list->prev;
+               pq_meld_result = pqueue_merge(pq_meld_result, pair_list, cmp_fn);
+               pair_list = elt;
+       } while (pair_list != NULL);
+
+       return pq_meld_result;
+}
+
+void
+pqueue_destroy(struct priority_queue *q, size_t offset,
+               void (^callback)(void *e))
+{
+       assert(callback != NULL);
+       priority_queue_entry_t head = pqueue_unpack_root(q);
+       priority_queue_entry_t tail = head;
+
+       while (head != NULL) {
+               priority_queue_entry_t child_list = pqueue_entry_unpack_child(head);
+               if (child_list) {
+                       tail->next = child_list;
+                       while (tail->next) tail = tail->next;
+               }
+
+               priority_queue_entry_t elt = head;
+               head = head->next;
+               callback((void *)elt - offset);
+       }
+
+       /* poison the queue now that it's destroyed */
+       q->pq_root_packed = ~0UL;
+}
diff --git a/osfmk/kern/priority_queue.h b/osfmk/kern/priority_queue.h
new file mode 100644 (file)
index 0000000..ff9836b
--- /dev/null
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_PRIORITY_QUEUE_H_
+#define _KERN_PRIORITY_QUEUE_H_
+
+#include <mach/mach_types.h>
+#include <kern/macro_help.h>
+#include <kern/assert.h>
+
+#include <sys/cdefs.h>
+
+__BEGIN_DECLS
+
+/*
+ * A generic priority-ordered queue implementation based on pairing heaps.
+ *
+ * Reference Papers:
+ * - A Back-to-Basics Empirical Study of Priority Queues (https://arxiv.org/abs/1403.0252)
+ * - The Pairing Heap: A New Form of Self-Adjusting Heap (https://www.cs.cmu.edu/~sleator/papers/pairing-heaps.pdf)
+ *
+ * The XNU implementation is a basic version of the pairing heap. It allows for O(1) insertion and amortized O(log n)
+ * deletion. It is not a stable data structure since adding stability would need more pointers and hence more memory.
+ *
+ * The implementation supports two types of key storage:
+ *
+ *     Type 1: PRIORITY_QUEUE_GENERIC_KEY
+ *
+ *         This flag is useful when the priorities are either larger than 8 bits or the node comparison needs
+ *         extra information other than the priority. The nodes do not store the priorities themselves; on
+ *         comparison, the implementation calls out to the comparator (of type priority_queue_compare_fn_t)
+ *         provided as part of initialization.
+ *
+ *         Sample Initialization:
+ *
+ *         {
+ *             static struct priority_queue pq;
+ *             priority_queue_init(pq, PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_GENERIC_KEY);
+ *         }
+ *
+ *         For this type, all insertions, priority_increase, priority_decrease must pass PRIORITY_QUEUE_KEY_NONE
+ *         as the priority key field.
+ *
+ *     Type 2: PRIORITY_QUEUE_BUILTIN_KEY
+ *
+ *         This type is useful when the priorities need to be stored within the data structure itself.
+ *         Each node in the priority queue maintains an 8-bit priority key.
+ *
+ *         Sample Initialization:
+ *         {
+ *             static struct priority_queue pq;
+ *             priority_queue_init(pq, PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_BUILTIN_KEY);
+ *         }
+ *
+ *
+ * Min / Max Heap:
+ *
+ *     The semantics of Min/Max heap are not used by the implementation; it assumes that the comparison block
+ *     that is passed to the insertion / removal / ... macros provides the right ordering.
+ *
+ *     However for human readability purposes, whether this heap is a MIN or MAX heap is passed
+ *     at initialization time, and will influence whether accessors like priority_queue_min
+ *     or priority_queue_max can be used.
+ *
+ *
+ * Element Linkage:
+ *
+ *         Both types use a common queue head and linkage pattern.
+ *         The head of a priority queue is declared as:
+ *
+ *              struct priority_queue pq_head;
+ *
+ *         Elements in this queue are linked together using struct priority_queue_entry objects embedded within a structure:
+ *              struct some_data {
+ *                      int field1;
+ *                      int field2;
+ *                      ...
+ *                      struct priority_queue_entry link;
+ *                      ...
+ *                      int last_field;
+ *              };
+ *         struct some_data is referred to as the queue "element"
+ *
+ *         This method uses the next, prev and child pointers of the struct priority_queue_entry linkage object embedded in a
+ *         queue element to point to other elements in the queue. The head of the priority queue (the priority_queue
+ *         object) will point to the root of the pairing heap (NULL if heap is empty). This method allows multiple chains
+ *         through a given object, by embedding multiple priority_queue_entry objects in the structure, while simultaneously
+ *         providing fast removal and insertion into the heap using only priority_queue_entry object pointers.
+ */
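
As a minimal usage sketch of the builtin-key, max-heap variant (illustrative only: the element type and keys are invented, and the locking a real kernel caller would hold is omitted), insertion and removal with the comparator and routines defined later in this header look like this:

    struct demo_elem {
            int                          id;
            struct priority_queue_entry  link;
    };

    static struct priority_queue demo_pq;

    static void
    demo_priority_queue_usage(void)
    {
            struct demo_elem e1 = { .id = 1 };
            struct demo_elem e2 = { .id = 2 };

            priority_queue_init(&demo_pq,
                PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_BUILTIN_KEY);

            priority_queue_entry_init(&e1.link);
            priority_queue_entry_init(&e2.link);

            /* Builtin 8-bit keys: 10 and 20 */
            priority_queue_insert(&demo_pq, &e1.link, 10,
                PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
            priority_queue_insert(&demo_pq, &e2.link, 20,
                PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);

            /* Removes e2 (key 20); e1 becomes the new root */
            struct demo_elem *top = priority_queue_remove_max(&demo_pq,
                struct demo_elem, link, PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
            assert(top == &e2);
    }
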
+
+
+/*
+ * Priority keys maintained by the data structure.
+ * Since the priority is packed in the node itself, it restricts keys to be 8-bits only.
+ */
+#define PRIORITY_QUEUE_KEY_NONE             0
+typedef uint8_t priority_queue_key_t;
+
+/*
+ * Flags passed to priority_queue_init()
+ *
+ * One key type must be picked (default is BUILTIN_KEY)
+ * Min or Max heap must be picked (default is MAX_HEAP)
+ */
+typedef enum priority_queue_flags {
+       PRIORITY_QUEUE_BUILTIN_KEY    = 0x0,
+       PRIORITY_QUEUE_GENERIC_KEY    = 0x1,
+       PRIORITY_QUEUE_MAX_HEAP       = 0x0,
+       PRIORITY_QUEUE_MIN_HEAP       = 0x2,
+#define PRIORITY_QUEUE_BUILTIN_MAX_HEAP (PRIORITY_QUEUE_MAX_HEAP | PRIORITY_QUEUE_BUILTIN_KEY)
+} priority_queue_flags_t;
+
+#ifdef __LP64__
+
+/*
+ * For 64-bit platforms, pack the priority key into the child pointer
+ * The packing/unpacking is done using a compiler trick to sign extend long.
+ * This avoids the additional NULL checks needed in a typical packing
+ * implementation. The idea is to define the packed location as a signed long
+ * bit-field and, for unpacking, simply cast it back to a full pointer, which
+ * sign extends it.
+ */
+#define PRIORITY_QUEUE_ENTRY_CHILD_BITS     56
+#define PRIORITY_QUEUE_ENTRY_KEY_BITS       8
+
+typedef struct priority_queue_entry {
+       struct priority_queue_entry         *next;
+       struct priority_queue_entry         *prev;
+       long                                key: PRIORITY_QUEUE_ENTRY_KEY_BITS;
+       long                                child: PRIORITY_QUEUE_ENTRY_CHILD_BITS;
+} *priority_queue_entry_t;
+
+#else /* __LP64__ */
+
+/*
+ * For 32-bit platforms, use an extra field to store the key since child pointer packing
+ * is not an option. The child is maintained as a long to use the same packing/unpacking
+ * routines that work for 64-bit platforms.
+ */
+typedef struct priority_queue_entry {
+       struct priority_queue_entry         *next;
+       struct priority_queue_entry         *prev;
+       long                                child;
+       priority_queue_key_t                key;
+} *priority_queue_entry_t;
+
+#endif /* __LP64__ */
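
The packing trick above can be demonstrated outside the kernel. The following stand-alone user-space sketch (not kernel code, and with a simplified field layout) shows a pointer round-tripping through a signed 56-bit bit-field, relying on the same sign-extension behaviour of GCC/Clang that this header relies on:

    #include <assert.h>
    #include <stddef.h>

    struct packed_node {
            long key   : 8;
            long child : 56;    /* signed bit-field: reads are sign extended */
    };

    int
    main(void)
    {
            struct packed_node n;
            int x = 42;
            void *p = &x;

            n.child = (long)p;                  /* pack: top 8 bits dropped */
            void *unpacked = (void *)n.child;   /* unpack: sign extension   */

            /* Holds whenever the address sign-extends through bit 55: true for
             * typical user pointers (high bits zero) and for kernel virtual
             * addresses (high bits all ones). */
            assert(unpacked == p);

            n.child = (long)(void *)0;
            assert((void *)n.child == NULL);    /* NULL round-trips as well */
            return 0;
    }
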
+
+/*
+ * Comparator block prototype
+ * Args:
+ *      - elements to compare
+ * Return:
+ *      comparison result indicating the relative ordering of the elements according to the heap type
+ */
+typedef int (^priority_queue_compare_fn_t)(struct priority_queue_entry *e1,
+               struct priority_queue_entry *e2);
+
+/*
+ * Standard comparison routines for max and min heaps.
+ * Must be used with PRIORITY_QUEUE_BUILTIN_KEY only.
+ */
+static inline int
+priority_queue_element_builtin_key_compare(priority_queue_entry_t e1, priority_queue_entry_t e2)
+{
+       return (int)e2->key - (int)e1->key;
+}
+
+#define priority_heap_make_comparator(name1, name2, type, field, ...) \
+        (^int(priority_queue_entry_t __e1, priority_queue_entry_t __e2){                                        \
+            type *name1 = pqe_element_fast(__e1, type, field);                                                  \
+            type *name2 = pqe_element_fast(__e2, type, field);                                                  \
+            __VA_ARGS__;                                                                                        \
+        })
+
+#define PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE                                                               \
+        (^int(priority_queue_entry_t e1, priority_queue_entry_t e2){                                            \
+            return -priority_queue_element_builtin_key_compare(e1, e2);                                         \
+        })
+
+#define PRIORITY_QUEUE_SCHED_PRI_MIN_HEAP_COMPARE                                                               \
+        (^int(priority_queue_entry_t e1, priority_queue_entry_t e2){                                            \
+            return priority_queue_element_builtin_key_compare(e1, e2);                                          \
+        })
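
For the PRIORITY_QUEUE_GENERIC_KEY case, a caller builds its own comparator with priority_heap_make_comparator. A sketch with an invented element type (the struct and its deadline field are assumptions for illustration) that orders a MIN_HEAP queue by earliest deadline could look like:

    struct dl_elem {
            uint64_t                     deadline;
            struct priority_queue_entry  link;
    };

    /* Returns > 0 when e1 should sit closer to the root, i.e. has the
     * earlier deadline, matching how pqueue_merge() picks the parent. */
    #define DL_MIN_HEAP_COMPARE                                          \
            priority_heap_make_comparator(a, b, struct dl_elem, link, {  \
                    if (a->deadline < b->deadline) return 1;             \
                    if (a->deadline > b->deadline) return -1;            \
                    return 0;                                            \
            })

    /* Generic-key queues always pass PRIORITY_QUEUE_KEY_NONE as the key:
     *   priority_queue_init(&q, PRIORITY_QUEUE_MIN_HEAP | PRIORITY_QUEUE_GENERIC_KEY);
     *   priority_queue_insert(&q, &elem->link, PRIORITY_QUEUE_KEY_NONE, DL_MIN_HEAP_COMPARE);
     */
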
+
+/*
+ * Helper routines for packing/unpacking the child pointer in heap nodes.
+ * On 64-bit platforms, these routines rely on the fact that the sign extension
+ * for the lower 56-bits of a kernel pointer results in the real pointer. The trick
+ * works for NULL pointers as well.
+ */
+#define pqueue_entry_pack_child(qe, child_ptr)      ((qe)->child = (long)(child_ptr))
+#define pqueue_entry_unpack_child(qe)               ((struct priority_queue_entry *)((qe)->child))
+
+/*
+ * Priority queue head structure.
+ * The queue flags (key type and min/max heap type) are packed into the low
+ * bits of the root pointer.
+ */
+struct priority_queue {
+/*
+ * we pack priority_queue_flags_t in the least significant two bits
+ * of the root pointer.
+ */
+#define PRIORITY_QUEUE_ROOT_FLAGS_MASK    (3ul)
+#define PRIORITY_QUEUE_ROOT_POINTER_MASK  (~PRIORITY_QUEUE_ROOT_FLAGS_MASK)
+       unsigned long                       pq_root_packed;
+};
+
+/*
+ *      Macro:          pqe_element_fast
+ *      Function:
+ *              Convert a priority_queue_entry_t to a queue element pointer.
+ *              Get a pointer to the user-defined element containing
+ *              a given priority_queue_entry_t
+ *
+ *              The fast variant assumes that `qe` is not NULL
+ *      Header:
+ *              pqe_element_fast(qe, type, field)
+ *                      <priority_queue_entry_t> qe
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> containing qe
+ */
+#define pqe_element_fast(qe, type, field)  __container_of(qe, type, field)
+
+/*
+ *      Macro:          pqe_element
+ *      Function:
+ *              Convert a priority_queue_entry_t to a queue element pointer.
+ *              Get a pointer to the user-defined element containing
+ *              a given priority_queue_entry_t
+ *
+ *              The non fast variant handles NULL `qe`
+ *      Header:
+ *              pqe_element(qe, type, field)
+ *                      <priority_queue_entry_t> qe
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> containing qe
+ */
+#define pqe_element(qe, type, field)  ({                                                                        \
+        priority_queue_entry_t _tmp_entry = (qe);                                                               \
+        _tmp_entry ? pqe_element_fast(_tmp_entry, type, field) : ((type *)NULL);                                \
+})
+
+#define pqueue_has_generic_keys(p) \
+        (((p)->pq_root_packed & PRIORITY_QUEUE_GENERIC_KEY) != 0)
+
+#define pqueue_has_builtin_keys(p) \
+        (((p)->pq_root_packed & PRIORITY_QUEUE_GENERIC_KEY) == 0)
+
+#define pqueue_is_min_heap(p) \
+        (((p)->pq_root_packed & PRIORITY_QUEUE_MIN_HEAP) != 0)
+
+#define pqueue_is_max_heap(p) \
+        (((p)->pq_root_packed & PRIORITY_QUEUE_MIN_HEAP) == 0)
+
+/*
+ *      Macro:          pqueue_pack_root
+ *      Function:
+ *              Pack the root pointer of the head.
+ *      Header:
+ *              pqueue_pack_root(q, root_ptr)
+ *                      <struct priority_queue *> q
+ *                      <priority_queue_entry_t> root_ptr
+ */
+#define pqueue_pack_root(q, root_ptr)                                                                           \
+MACRO_BEGIN                                                                                                     \
+        uintptr_t __flags = (q)->pq_root_packed & PRIORITY_QUEUE_ROOT_FLAGS_MASK;                               \
+        (q)->pq_root_packed = (uintptr_t)(root_ptr) | __flags;                                                  \
+MACRO_END
+
+/*
+ *      Macro:          pqueue_unpack_root
+ *      Function:
+ *              Unpack the root pointer from the head of the priority queue.
+ *      Header:
+ *              pqueue_unpack_root(q)
+ *                      <struct priority_queue *> q
+ *      Returns:
+ *              <priority_queue_entry_t>
+ */
+#define pqueue_unpack_root(q) \
+        ((priority_queue_entry_t)((q)->pq_root_packed & PRIORITY_QUEUE_ROOT_POINTER_MASK))
+
+/*
+ *      Macro:          pqueue_list_remove
+ *      Function:
+ *              Helper routine to remove an element from the list at its level
+ *      Header:
+ *              pqueue_list_remove(elt)
+ *                      <priority_queue_entry_t> elt
+ *      Returns:
+ *              None
+ */
+static inline void
+pqueue_list_remove(priority_queue_entry_t elt)
+{
+       assert(elt->prev != NULL);
+       /* Check if elt is head of list at its level;        */
+       /* If yes, make the next node the head at that level */
+       /* Else, remove elt from the list at that level      */
+       if (pqueue_entry_unpack_child(elt->prev) == elt) {
+               pqueue_entry_pack_child(elt->prev, elt->next);
+       } else {
+               elt->prev->next = elt->next;
+       }
+       /* Update prev for next element in list */
+       if (elt->next != NULL)
+               elt->next->prev = elt->prev;
+}
+
+/*
+ *      Macro:          pqueue_merge
+ *      Function:
+ *              Helper routine to merge two subtrees of the heap to form a single tree and
+ *              maintain the parent > child invariant. If the two keys are equal, the current
+ *              implementation makes the first subtree the parent and the second one the child.
+ *      Header:
+ *              pqueue_merge(subtree_a, subtree_b, cmp_fn)
+ *                      <priority_queue_entry_t> subtree_a
+ *                      <priority_queue_entry_t> subtree_b
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              <priority_queue_entry_t> pointing to root of the merged tree
+ */
+static inline priority_queue_entry_t
+pqueue_merge(priority_queue_entry_t subtree_a, priority_queue_entry_t subtree_b,
+               priority_queue_compare_fn_t cmp_fn)
+{
+       priority_queue_entry_t merge_result = NULL;
+       if (subtree_a == NULL) {
+               merge_result = subtree_b;
+       } else if (subtree_b == NULL || (subtree_a == subtree_b)) {
+               merge_result = subtree_a;
+       } else {
+               priority_queue_entry_t parent = subtree_a;
+               priority_queue_entry_t child = subtree_b;
+               if (cmp_fn(subtree_a, subtree_b) < 0) {
+                       parent = subtree_b;
+                       child = subtree_a;
+               }
+               /* Insert the child as the first element in the parent's child list */
+               child->next = pqueue_entry_unpack_child(parent);
+               child->prev = parent;
+               if (pqueue_entry_unpack_child(parent) != NULL)
+                       pqueue_entry_unpack_child(parent)->prev = child;
+               /* Create the parent child relationship */
+               pqueue_entry_pack_child(parent, child);
+               parent->next = NULL;
+               parent->prev = NULL;
+               merge_result = parent;
+       }
+       return merge_result;
+}
+
+/*
+ *      Macro:          pqueue_pair_meld
+ *      Function:
+ *              Helper routine to pairwise merge a set of subtrees on a list at a given level and then
+ *              meld the resulting pairs together to form a new tree while maintaining the parent > child invariant.
+ *
+ *              The caller must check the element is non NULL.
+ *
+ *      Header:
+ *              pqueue_pair_meld(elt, cmp_fn)
+ *                      <priority_queue_entry_t> elt
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              <priority_queue_entry_t> pointing to root of the melded tree
+ */
+priority_queue_entry_t
+pqueue_pair_meld(priority_queue_entry_t e, priority_queue_compare_fn_t cmp_fn);
+
+/*
+ *      Macro:          pqueue_update_key
+ *      Function:
+ *              Helper routine to update the key for a node in the heap. Note that the priority keys are only
+ *              maintained for the PRIORITY_QUEUE_BUILTIN_KEY type of priority queue. For PRIORITY_QUEUE_GENERIC_KEY,
+ *              this routine does nothing.
+ *      Header:
+ *              pqueue_update_key(que, elt, new_key)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <priority_queue_key_t> new_key
+ *      Returns:
+ *              None
+ */
+static inline void
+pqueue_update_key(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_key_t new_key)
+{
+       if (pqueue_has_builtin_keys(que)) {
+               assert(new_key <= UINT8_MAX);
+               elt->key = new_key;
+       } else {
+               assert(new_key == PRIORITY_QUEUE_KEY_NONE);
+       }
+}
+
+/*
+ *      Macro:          pqueue_remove_root
+ *      Function:
+ *              Helper routine to remove the root element in a priority queue.
+ *      Header:
+ *              pqueue_remove_root(que, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> old_root
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              old_root
+ */
+static inline priority_queue_entry_t
+pqueue_remove_root(struct priority_queue *que, priority_queue_entry_t old_root,
+               priority_queue_compare_fn_t cmp_fn)
+{
+       priority_queue_entry_t new_root = pqueue_entry_unpack_child(old_root);
+       if (new_root) new_root = pqueue_pair_meld(new_root, cmp_fn);
+       pqueue_pack_root(que, new_root);
+       return old_root;
+}
+
+/*
+ *      Macro:          pqueue_remove_non_root
+ *      Function:
+ *              Helper routine to remove a non root element in a priority queue.
+ *      Header:
+ *              pqueue_remove_non_root(que, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              elt
+ */
+static inline priority_queue_entry_t
+pqueue_remove_non_root(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_compare_fn_t cmp_fn)
+{
+       priority_queue_entry_t child, new_root;
+
+       /* To remove a non-root element with children levels, */
+       /* - Remove element from its current level list */
+       /* - Pairwise split all the elements in the child level list */
+       /* - Meld all these splits (right-to-left) to form new subtree */
+       /* - Merge the root subtree with the newly formed subtree */
+       pqueue_list_remove(elt);
+
+       child = pqueue_entry_unpack_child(elt);
+       if (child) {
+               child = pqueue_pair_meld(child, cmp_fn);
+               new_root = pqueue_merge(pqueue_unpack_root(que), child, cmp_fn);
+               pqueue_pack_root(que, new_root);
+       }
+
+       return elt;
+}
+
+/*
+ *      Macro:          pqueue_destroy
+ *      Function:
+ *              Destroy a priority queue safely. This routine accepts a callback
+ *              to handle any cleanup for elements in the priority queue. The queue does
+ *              not maintain its invariants while getting destroyed. The priority queue and
+ *              the linkage nodes need to be re-initialized before re-using them.
+ *
+ *              Note: the offset is the offset of the linkage field inside the
+ *              elements linked in the priority heap, because pqueue_destroy
+ *              cannot use pqe_element.
+ *      Header:
+ *              pqueue_destroy(q, offset, callback)
+ *                      <struct priority_queue *> q
+ *                      <size_t> offset
+ *                      <callback> callback for each element
+ *
+ *      Returns:
+ *              None
+ */
+void
+pqueue_destroy(struct priority_queue *q, size_t offset,
+               void (^callback)(void *e));
+
+/*
+ * Priority Queue functionality routines
+ */
+
+/*
+ *      Macro:          priority_queue_empty
+ *      Function:
+ *              Tests whether a priority queue is empty.
+ *      Header:
+ *              boolean_t priority_queue_empty(q)
+ *                      <struct priority_queue *> q
+ */
+#define priority_queue_empty(q)         (pqueue_unpack_root((q)) == NULL)
+
+/*
+ *      Macro:          priority_queue_entry_key
+ *      Function:
+ *              Returns the priority queue entry key for an element on a PRIORITY_QUEUE_BUILTIN_KEY
+ *              queue. It should not be called for an element on a PRIORITY_QUEUE_GENERIC_KEY queue.
+ *      Header:
+ *              priority_queue_key_t priority_queue_entry_key(q, elt)
+ *                      <struct priority_queue *> q
+ *                      <struct priority_queue_entry *> elt
+ */
+#define priority_queue_entry_key(q, elt) ({                                                                     \
+        assert(pqueue_has_builtin_keys(q));                                                                     \
+        (priority_queue_key_t)((elt)->key);                                                                     \
+})
+
+/*
+ *      Macro:          priority_queue_init
+ *      Function:
+ *              Initialize a <struct priority_queue *> by setting the flags
+ *              Valid flags are:
+ *              - PRIORITY_QUEUE_BUILTIN_KEY or PRIORITY_QUEUE_GENERIC_KEY
+ *              - PRIORITY_QUEUE_MAX_HEAP or PRIORITY_QUEUE_MIN_HEAP
+ *      Header:
+ *              priority_queue_init(q, flags)
+ *                      <struct priority_queue *> q
+ *                      <priority_queue_flags_t> flags
+ *      Returns:
+ *              None
+ */
+#define priority_queue_init(q, flags)                                                                           \
+MACRO_BEGIN                                                                                                     \
+        pqueue_pack_root((q), NULL);                                                                            \
+        (q)->pq_root_packed = (flags);                                                                          \
+MACRO_END
+
+/*
+ *      Macro:          priority_queue_entry_init
+ *      Function:
+ *              Initialize a priority_queue_entry_t
+ *      Header:
+ *              priority_queue_entry_init(qe)
+ *                      <priority_queue_entry_t> qe
+ *      Returns:
+ *              None
+ */
+#define priority_queue_entry_init(qe)                                                                           \
+MACRO_BEGIN                                                                                                     \
+        (qe)->next = NULL;                                                                                      \
+        (qe)->prev = NULL;                                                                                      \
+        pqueue_entry_pack_child((qe), NULL);                                                                    \
+        (qe)->key = PRIORITY_QUEUE_KEY_NONE;                                                                    \
+MACRO_END
+
+/*
+ *      Macro:          priority_queue_insert
+ *      Function:
+ *              Insert an element into the priority queue
+ *      Header:
+ *              priority_queue_insert(que, elt, new_key, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <priority_queue_key_t> new_key
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              Whether the inserted element became the new root
+ */
+static inline boolean_t
+priority_queue_insert(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+       priority_queue_entry_t new_root;
+
+       pqueue_update_key(que, elt, new_key);
+       new_root = pqueue_merge(pqueue_unpack_root(que), elt, cmp_fn);
+       pqueue_pack_root(que, new_root);
+       return new_root == elt;
+}
+
+/*
+ *      Macro:          priority_queue_remove
+ *      Function:
+ *              Removes an element from the priority queue
+ *      Header:
+ *              priority_queue_remove(que, elt, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              Whether the removed element was the root
+ */
+static inline boolean_t
+priority_queue_remove(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_compare_fn_t cmp_fn)
+{
+       if (elt == pqueue_unpack_root(que)) {
+               pqueue_remove_root(que, elt, cmp_fn);
+               priority_queue_entry_init(elt);
+               return TRUE;
+       } else {
+               pqueue_remove_non_root(que, elt, cmp_fn);
+               priority_queue_entry_init(elt);
+               return FALSE;
+       }
+}
+
+/*
+ *      Macro:          priority_queue_entry_decrease
+ *
+ *      WARNING:
+ *              This function is badly named for a min-heap, as it means the element
+ *              moves toward the root, which happens if the key value became smaller.
+ *
+ *      Function:
+ *              Decrease the priority of an element in the priority queue. Since the heap invariant is to always
+ *              have the maximum element at the root, the most efficient way to implement this is to remove
+ *              the element and re-insert it into the heap.
+ *
+ *              For PRIORITY_QUEUE_BUILTIN_KEY, the new_key is passed into this routine since the priority is
+ *              maintained by the data structure. For PRIORITY_QUEUE_GENERIC_KEY, the caller must update the priority
+ *              in the element and then call this routine. For the new_key field, it must pass PRIORITY_QUEUE_KEY_NONE.
+ *      Header:
+ *              priority_queue_entry_decrease(que, elt, new_key, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <priority_queue_key_t> new_key
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              Whether the update caused the root or its key to change.
+ */
+static inline boolean_t
+priority_queue_entry_decrease(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+       boolean_t was_root = priority_queue_remove(que, elt, cmp_fn);
+       /* Insert it back in the heap; insertion also causes the priority update in the element */
+       priority_queue_insert(que, elt, new_key, cmp_fn);
+       return was_root;
+}
+
+/*
+ *      Macro:          priority_queue_entry_increase
+ *
+ *      WARNING:
+ *              This function is badly named for a min-heap, as it means the element
+ *              moves away from the root, which happens if the key value became larger.
+ *
+ *      Function:
+ *              Increase the priority of an element in the priority queue. If the root is being increased, no change
+ *              to the data structure is needed. For elements at any other level, unhook it from that level and
+ *              re-merge it.
+ *
+ *              For PRIORITY_QUEUE_BUILTIN_KEY, the new_key is passed into this routine since the priority is
+ *              maintained by the data structure. For PRIORITY_QUEUE_GENERIC_KEY, the caller must update the priority
+ *              in the element and then call this routine. For the new_key field, it must pass PRIORITY_QUEUE_KEY_NONE.
+ *      Header:
+ *              priority_queue_entry_increase(que, elt, new_key, cmp_fn)
+ *                      <struct priority_queue *> que
+ *                      <priority_queue_entry_t> elt
+ *                      <priority_queue_key_t> new_key
+ *                      <cmp_fn> comparator function
+ *      Returns:
+ *              Whether the update caused the root or its key to change.
+ */
+static inline boolean_t
+priority_queue_entry_increase(struct priority_queue *que, priority_queue_entry_t elt,
+               priority_queue_key_t new_key, priority_queue_compare_fn_t cmp_fn)
+{
+       if (elt == pqueue_unpack_root(que)) {
+               pqueue_update_key(que, elt, new_key);
+               return TRUE;
+       }
+
+       /* Remove the element from its current level list */
+       pqueue_list_remove(elt);
+       /* Re-insert the element into the heap with a merge */
+       return priority_queue_insert(que, elt, new_key, cmp_fn);
+}
+
+/*
+ * Min/Max nodes lookup and removal routines
+ * Since the data structure is unaware of the type of heap being constructed, it provides both the min
+ * and max variants of the lookup and removal routines. Both variants do the exact same operation and
+ * it is up to the callers to call the right variant which makes semantic sense for the type of heap.
+ */
+
+/*
+ *      Macro:          priority_queue_max
+ *      Function:
+ *              Lookup the max element in a priority queue. It simply returns the root of the
+ *              priority queue.
+ *      Header:
+ *              priority_queue_max(q, type, field)
+ *                      <struct priority_queue *> q
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> max element
+ */
+#define priority_queue_max(q, type, field) ({                                                                   \
+        assert(pqueue_is_max_heap(q));                                                                          \
+        pqe_element(pqueue_unpack_root(q), type, field);                                                        \
+})
+
+/*
+ *      Macro:          priority_queue_min
+ *      Function:
+ *              Lookup the min element in a priority queue. It simply returns the root of the
+ *              priority queue.
+ *      Header:
+ *              priority_queue_min(q, type, field)
+ *                      <struct priority_queue *> q
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> min element
+ */
+#define priority_queue_min(q, type, field) ({                                                                   \
+        assert(pqueue_is_min_heap(q));                                                                          \
+        pqe_element(pqueue_unpack_root(q), type, field);                                                        \
+})
+
+/*
+ *      Macro:          priority_queue_max_key
+ *      Function:
+ *              Lookup the max key in a priority queue.
+ *      Header:
+ *              priority_queue_max_key(q)
+ *                      <struct priority_queue *> q
+ *      Returns:
+ *              <priority_queue_key_t> max key
+ */
+#define priority_queue_max_key(q) ({                                                                            \
+        assert(pqueue_is_max_heap(q));                                                                          \
+        priority_queue_entry_key(q, pqueue_unpack_root(q));                                                     \
+})
+
+/*
+ *      Macro:          priority_queue_min_key
+ *      Function:
+ *              Lookup the min key in a priority queue.
+ *      Header:
+ *              priority_queue_min_key(q)
+ *                      <struct priority_queue *> q
+ *      Returns:
+ *              <priority_queue_key_t> min key
+ */
+#define priority_queue_min_key(q) ({                                                                            \
+        assert(pqueue_is_min_heap(q));                                                                          \
+        priority_queue_entry_key(q, pqueue_unpack_root(q));                                                     \
+})
+
+/*
+ *      Macro:          priority_queue_remove_max
+ *      Function:
+ *              Remove the max element in a priority queue.
+ *              Uses the priority_queue_remove() routine to actually do the removal.
+ *      Header:
+ *              priority_queue_remove_max(q, type, field)
+ *                      <struct priority_queue *> q
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> max element
+ */
+#define priority_queue_remove_max(q, type, field, cmp_fn) ({                                                    \
+        assert(pqueue_is_max_heap(q));                                                                          \
+        pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field);                         \
+})
+
+/*
+ *      Macro:          priority_queue_remove_min
+ *      Function:
+ *              Remove the min element in a priority queue.
+ *              Uses the priority_queue_remove() routine to actually do the removal.
+ *      Header:
+ *              priority_queue_remove_min(q, type, field)
+ *                      <struct priority_queue *> q
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *      Returns:
+ *              <type *> min element
+ */
+#define priority_queue_remove_min(q, type, field, cmp_fn) ({                                                    \
+        assert(pqueue_is_min_heap(q));                                                                          \
+        pqe_element(pqueue_remove_root(q, pqueue_unpack_root(q), cmp_fn), type, field);                         \
+})
+
+/*
+ *      Macro:          priority_queue_destroy
+ *      Function:
+ *              Destroy a priority queue safely. This routine accepts a callback
+ *              to handle any cleanup for elements in the priority queue. The queue does
+ *              not maintain its invariants while getting destroyed. The priority queue and
+ *              the linkage nodes need to be re-initialized before re-using them.
+ *      Header:
+ *              priority_queue_destroy(q, type, field, callback)
+ *                      <struct priority_queue *> q
+ *                      <type> type of element in priority queue
+ *                      <field> chain field in (*<type>)
+ *                      <callback> callback for each element
+ *
+ *      Returns:
+ *              None
+ */
+#define priority_queue_destroy(q, type, field, callback, ...) \
+        pqueue_destroy(q, offsetof(type, field), callback, ##__VA_ARGS__)
+
+__END_DECLS
+
+#endif /* _KERN_PRIORITY_QUEUE_H_ */
index 5aad73e37180b7a20e2cc8c793a1b92cf8211b45..479094c30569339ba261ba8dc984ec7e199d7c52 100644 (file)
 
 #include <security/mac_mach_internal.h>
 
+#if defined(CONFIG_XNUPOST)
+
+#include <tests/xnupost.h>
+
+#endif /* CONFIG_XNUPOST */
 
 /*
  * Exported interface
@@ -114,6 +119,36 @@ boolean_t          sched_stats_active = FALSE;
 
 processor_t            processor_array[MAX_SCHED_CPUS] = { 0 };
 
+#if defined(CONFIG_XNUPOST)
+kern_return_t ipi_test(void);
+extern void arm64_ipi_test(void);
+
+kern_return_t
+ipi_test()
+{
+#if __arm64__
+       processor_t p;
+
+       for (p = processor_list; p != NULL; p = p->processor_list) {
+               thread_bind(p);
+               thread_block(THREAD_CONTINUE_NULL);
+               kprintf("Running IPI test on cpu %d\n", p->cpu_id);
+               arm64_ipi_test();
+       }
+
+       /* unbind thread from specific cpu */
+       thread_bind(PROCESSOR_NULL);
+       thread_block(THREAD_CONTINUE_NULL);
+
+       T_PASS("Done running IPI tests");
+#else
+       T_PASS("Unsupported platform. Not running IPI tests");
+
+#endif /* __arm64__ */
+
+       return KERN_SUCCESS;
+}
+#endif /* defined(CONFIG_XNUPOST) */
 
 
 void
@@ -154,6 +189,8 @@ processor_init(
                SCHED(processor_init)(processor);
        }
 
+       assert(cpu_id < MAX_SCHED_CPUS);
+
        processor->state = PROCESSOR_OFF_LINE;
        processor->active_thread = processor->next_thread = processor->idle_thread = THREAD_NULL;
        processor->processor_set = pset;
@@ -171,6 +208,8 @@ processor_init(
        processor->processor_self = IP_NULL;
        processor_data_init(processor);
        processor->processor_list = NULL;
+       processor->cpu_quiesce_state = CPU_QUIESCE_COUNTER_NONE;
+       processor->cpu_quiesce_last_checkin = 0;
 
        s = splsched();
        pset_lock(pset);
@@ -191,7 +230,6 @@ processor_init(
                processor_list_tail->processor_list = processor;
        processor_list_tail = processor;
        processor_count++;
-       assert(cpu_id < MAX_SCHED_CPUS);
        processor_array[cpu_id] = processor;
        simple_unlock(&processor_list_lock);
 }
@@ -216,6 +254,9 @@ processor_set_primary(
                /* Mark both processors as SMT siblings */
                primary->is_SMT = TRUE;
                processor->is_SMT = TRUE;
+
+               processor_set_t pset = processor->processor_set;
+               atomic_bit_clear(&pset->primary_map, processor->cpu_id, memory_order_relaxed);
        }
 }
 
@@ -328,17 +369,18 @@ pset_init(
                SCHED(rt_init)(pset);
        }
 
-       queue_init(&pset->active_queue);
-       queue_init(&pset->idle_queue);
-       queue_init(&pset->idle_secondary_queue);
-       queue_init(&pset->unused_queue);
        pset->online_processor_count = 0;
-       pset->active_processor_count = 0;
        pset->load_average = 0;
        pset->cpu_set_low = pset->cpu_set_hi = 0;
        pset->cpu_set_count = 0;
+       pset->last_chosen = -1;
        pset->cpu_bitmask = 0;
        pset->recommended_bitmask = ~0ULL;
+       pset->primary_map = ~0ULL;
+       pset->cpu_state_map[PROCESSOR_OFF_LINE] = ~0ULL;
+       for (uint i = PROCESSOR_SHUTDOWN; i < PROCESSOR_STATE_LEN; i++) {
+               pset->cpu_state_map[i] = 0;
+       }
        pset->pending_AST_cpu_mask = 0;
 #if defined(CONFIG_SCHED_DEFERRED_AST)
        pset->pending_deferred_AST_cpu_mask = 0;
@@ -540,7 +582,7 @@ processor_start(
                return (KERN_FAILURE);
        }
 
-       processor->state = PROCESSOR_START;
+       pset_update_processor_state(pset, processor, PROCESSOR_START);
        pset_unlock(pset);
        splx(s);
 
@@ -552,7 +594,7 @@ processor_start(
                if (result != KERN_SUCCESS) {
                        s = splsched();
                        pset_lock(pset);
-                       processor->state = PROCESSOR_OFF_LINE;
+                       pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
                        pset_unlock(pset);
                        splx(s);
 
@@ -571,7 +613,7 @@ processor_start(
                if (result != KERN_SUCCESS) {
                        s = splsched();
                        pset_lock(pset);
-                       processor->state = PROCESSOR_OFF_LINE;
+                       pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
                        pset_unlock(pset);
                        splx(s);
 
@@ -597,7 +639,7 @@ processor_start(
        if (result != KERN_SUCCESS) {
                s = splsched();
                pset_lock(pset);
-               processor->state = PROCESSOR_OFF_LINE;
+               pset_update_processor_state(pset, processor, PROCESSOR_OFF_LINE);
                pset_unlock(pset);
                splx(s);
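
The processor_start() hunks above replace direct writes to processor->state with pset_update_processor_state(), which also keeps the per-state cpu_state_map bitmaps (added to struct processor_set in processor.h below) consistent. The helper's body is not part of this excerpt; a hedged sketch of its likely shape, for orientation only:

    /* Sketch only -- the real xnu helper is not shown in this diff and may differ. */
    static void
    pset_update_processor_state(processor_set_t pset, processor_t processor, int new_state)
    {
            /* caller holds the pset scheduling lock at splsched */
            uint64_t bit = 1ULL << processor->cpu_id;

            pset->cpu_state_map[processor->state] &= ~bit;   /* leave the old state's map */
            pset->cpu_state_map[new_state]        |= bit;    /* join the new state's map  */
            processor->state = new_state;
    }
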
 
index 09caf6a7fbe0b3c1a980656720073ef39803dd82..646ea801cf736ecb2759abef438c4bfcc9a1697c 100644 (file)
 #include <kern/sched.h>
 #include <mach/sfi_class.h>
 #include <kern/processor_data.h>
+#include <kern/cpu_quiesce.h>
+
+/*
+ *     Processor state is accessed by locking the scheduling lock
+ *     for the assigned processor set.
+ *
+ *           -------------------- SHUTDOWN
+ *          /                     ^     ^
+ *        _/                      |      \
+ *  OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING
+ *         \_________________^   ^ ^______/           /
+ *                                \__________________/
+ *
+ *  Most of these state transitions are externally driven as a
+ *  directive (for instance telling an IDLE processor to start
+ *  coming out of the idle state to run a thread). However these
+ *  are typically paired with a handshake by the processor itself
+ *  to indicate that it has completed a transition of indeterminate
+ *  length (for example, the DISPATCHING->RUNNING or START->RUNNING
+ *  transitions must occur on the processor itself).
+ *
+ *  The boot processor has some special cases, and skips the START state,
+ *  since it has already bootstrapped and is ready to context switch threads.
+ *
+ *  When a processor is in DISPATCHING or RUNNING state, the current_pri,
+ *  current_thmode, and deadline fields should be set, so that other
+ *  processors can evaluate if it is an appropriate candidate for preemption.
+ */
+#if defined(CONFIG_SCHED_DEFERRED_AST)
+/*
+ *           -------------------- SHUTDOWN
+ *          /                     ^     ^
+ *        _/                      |      \
+ *  OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING
+ *         \_________________^   ^ ^______/ ^_____ /  /
+ *                                \__________________/
+ *
+ *  A DISPATCHING processor may be put back into IDLE, if another
+ *  processor determines that the target processor will have nothing to do
+ *  upon reaching the RUNNING state.  This is racy, but if the target
+ *  responds and becomes RUNNING, it will not break the processor state
+ *  machine.
+ *
+ *  This change allows us to cancel an outstanding signal/AST on a processor
+ *  (if such an operation is supported through hardware or software), and
+ *  push the processor back into the IDLE state as a power optimization.
+ */
+#endif
+
+#define PROCESSOR_OFF_LINE             0       /* Not available */
+#define PROCESSOR_SHUTDOWN             1       /* Going off-line */
+#define PROCESSOR_START                        2       /* Being started */
+/*                                             3          Formerly Inactive (unavailable) */
+#define        PROCESSOR_IDLE                  4       /* Idle (available) */
+#define PROCESSOR_DISPATCHING  5       /* Dispatching (idle -> active) */
+#define        PROCESSOR_RUNNING               6       /* Normal execution */
+#define PROCESSOR_STATE_LEN             (PROCESSOR_RUNNING+1)
 
 typedef enum {
        PSET_SMP,
 } pset_cluster_type_t;
 
-struct processor_set {
-       queue_head_t            active_queue;   /* active processors */
-       queue_head_t            idle_queue;             /* idle processors */
-       queue_head_t            idle_secondary_queue;   /* idle secondary processors */
-       queue_head_t            unused_queue;           /* processors not recommended by CLPC */
+typedef bitmap_t cpumap_t;
 
-       int                                     online_processor_count;
-       int                                     active_processor_count;
-       int                                     load_average;
+struct processor_set {
+       int                     online_processor_count;
+       int                     load_average;
 
-       int                                     cpu_set_low, cpu_set_hi;
-       int                                     cpu_set_count;
-       uint64_t                                cpu_bitmask;
-       uint64_t                                recommended_bitmask;
+       int                     cpu_set_low, cpu_set_hi;
+       int                     cpu_set_count;
+       int                     last_chosen;
+       cpumap_t                cpu_bitmask;
+       cpumap_t                recommended_bitmask;
+       cpumap_t                cpu_state_map[PROCESSOR_STATE_LEN];
+       cpumap_t                primary_map;
 
 #if __SMP__
        decl_simple_lock_data(,sched_lock)      /* lock for above */
@@ -116,7 +172,7 @@ struct processor_set {
 #endif
 
        /* CPUs that have been sent an unacknowledged remote AST for scheduling purposes */
-       uint64_t                        pending_AST_cpu_mask;
+       cpumap_t                        pending_AST_cpu_mask;
 #if defined(CONFIG_SCHED_DEFERRED_AST)
        /*
         * A separate mask, for ASTs that we may be able to cancel.  This is dependent on
@@ -129,9 +185,9 @@ struct processor_set {
         * of spurious ASTs in the system, and let processors spend longer periods in
         * IDLE.
         */
-       uint64_t                        pending_deferred_AST_cpu_mask;
+       cpumap_t                        pending_deferred_AST_cpu_mask;
 #endif
-       uint64_t                        pending_spill_cpu_mask;
+       cpumap_t                        pending_spill_cpu_mask;
 
        struct ipc_port *       pset_self;              /* port for operations */
        struct ipc_port *       pset_name_self; /* port for information */
@@ -161,15 +217,12 @@ decl_lck_mtx_data(extern,tasks_threads_lock)
 decl_lck_mtx_data(extern,tasks_corpse_lock)
 
 struct processor {
-       queue_chain_t           processor_queue;/* idle/active queue link,
-                                                                                * MUST remain the first element */
-       int                                     state;                  /* See below */
-       boolean_t               is_SMT;
-       boolean_t               is_recommended;
-       struct thread
-                                               *active_thread, /* thread running on processor */
-                                               *next_thread,   /* next thread when dispatched */
-                                               *idle_thread;   /* this processor's idle thread. */
+       int                     state;                  /* See above */
+       bool                    is_SMT;
+       bool                    is_recommended;
+       struct thread           *active_thread;         /* thread running on processor */
+       struct thread           *next_thread;           /* next thread when dispatched */
+       struct thread           *idle_thread;           /* this processor's idle thread. */
 
        processor_set_t         processor_set;  /* assigned set */
 
@@ -179,13 +232,17 @@ struct processor {
        int                     starting_pri;       /* priority of current thread as it was when scheduled */
        pset_cluster_type_t     current_recommended_pset_type;  /* Cluster type recommended for current thread */
        int                     cpu_id;                 /* platform numeric id */
+       cpu_quiescent_state_t   cpu_quiesce_state;
+       uint64_t                cpu_quiesce_last_checkin;
 
        timer_call_data_t       quantum_timer;  /* timer for quantum expiration */
        uint64_t                        quantum_end;    /* time when current quantum ends */
        uint64_t                        last_dispatch;  /* time of last dispatch */
 
+       uint64_t                        kperf_last_sample_time; /* time of last kperf sample */
+
        uint64_t                        deadline;               /* current deadline */
-       boolean_t               first_timeslice;                /* has the quantum expired since context switch */
+       bool                    first_timeslice;        /* has the quantum expired since context switch */
 
 #if defined(CONFIG_SCHED_TRADITIONAL) || defined(CONFIG_SCHED_MULTIQ)
        struct run_queue        runq;                   /* runq for this processor */
@@ -220,61 +277,6 @@ extern processor_t         master_processor;
 
 extern boolean_t               sched_stats_active;
 
-/*
- *     Processor state is accessed by locking the scheduling lock
- *     for the assigned processor set.
- *
- *           -------------------- SHUTDOWN
- *          /                     ^     ^
- *        _/                      |      \
- *  OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING
- *         \_________________^   ^ ^______/           /
- *                                \__________________/
- *
- *  Most of these state transitions are externally driven as a
- *  a directive (for instance telling an IDLE processor to start
- *  coming out of the idle state to run a thread). However these
- *  are typically paired with a handshake by the processor itself
- *  to indicate that it has completed a transition of indeterminate
- *  length (for example, the DISPATCHING->RUNNING or START->RUNNING
- *  transitions must occur on the processor itself).
- *
- *  The boot processor has some special cases, and skips the START state,
- *  since it has already bootstrapped and is ready to context switch threads.
- *
- *  When a processor is in DISPATCHING or RUNNING state, the current_pri,
- *  current_thmode, and deadline fields should be set, so that other
- *  processors can evaluate if it is an appropriate candidate for preemption.
- */
-#if defined(CONFIG_SCHED_DEFERRED_AST)
-/*
- *           -------------------- SHUTDOWN
- *          /                     ^     ^
- *        _/                      |      \
- *  OFF_LINE ---> START ---> RUNNING ---> IDLE ---> DISPATCHING
- *         \_________________^   ^ ^______/ ^_____ /  /
- *                                \__________________/
- *
- *  A DISPATCHING processor may be put back into IDLE, if another
- *  processor determines that the target processor will have nothing to do
- *  upon reaching the RUNNING state.  This is racy, but if the target
- *  responds and becomes RUNNING, it will not break the processor state
- *  machine.
- *
- *  This change allows us to cancel an outstanding signal/AST on a processor
- *  (if such an operation is supported through hardware or software), and
- *  push the processor back into the IDLE state as a power optimization.
- */
-#endif
-
-#define PROCESSOR_OFF_LINE             0       /* Not available */
-#define PROCESSOR_SHUTDOWN             1       /* Going off-line */
-#define PROCESSOR_START                        2       /* Being started */
-/*                                             3          Formerly Inactive (unavailable) */
-#define        PROCESSOR_IDLE                  4       /* Idle (available) */
-#define PROCESSOR_DISPATCHING  5       /* Dispatching (idle -> active) */
-#define        PROCESSOR_RUNNING               6       /* Normal execution */
-
 extern processor_t     current_processor(void);
 
 /* Lock macros, always acquired and released with interrupts disabled (splsched()) */
@@ -283,6 +285,12 @@ extern processor_t current_processor(void);
 #define pset_lock(p)                   simple_lock(&(p)->sched_lock)
 #define pset_unlock(p)                 simple_unlock(&(p)->sched_lock)
 #define pset_lock_init(p)              simple_lock_init(&(p)->sched_lock, 0)
+#if defined(__arm__) || defined(__arm64__)
+#define pset_assert_locked(p)           LCK_SPIN_ASSERT(&(p)->sched_lock, LCK_ASSERT_OWNED)
+#else
+/* See <rdar://problem/39630910> pset_lock() should be converted to use lck_spin_lock() instead of simple_lock() */
+#define pset_assert_locked(p)           do { (void)p; } while(0)
+#endif
 
 #define rt_lock_lock(p)                        simple_lock(&SCHED(rt_runq)(p)->rt_lock)
 #define rt_lock_unlock(p)              simple_unlock(&SCHED(rt_runq)(p)->rt_lock)
@@ -291,6 +299,7 @@ extern processor_t  current_processor(void);
 #define pset_lock(p)                   do { (void)p; } while(0)
 #define pset_unlock(p)                 do { (void)p; } while(0)
 #define pset_lock_init(p)              do { (void)p; } while(0)
+#define pset_assert_locked(p)           do { (void)p; } while(0)
 
 #define rt_lock_lock(p)                        do { (void)p; } while(0)
 #define rt_lock_unlock(p)              do { (void)p; } while(0)
@@ -369,6 +378,40 @@ extern void processor_state_update_explicit(processor_t processor, int pri,
        sfi_class_id_t sfi_class, pset_cluster_type_t pset_type, 
        perfcontrol_class_t perfctl_class);
 
+#define PSET_LOAD_NUMERATOR_SHIFT   16
+#define PSET_LOAD_FRACTIONAL_SHIFT   4
+
+inline static int
+sched_get_pset_load_average(processor_set_t pset)
+{
+       return pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT);
+}
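Read as fixed-point arithmetic (an interpretation inferred from the shift values, not stated explicitly in the diff): load_average carries PSET_LOAD_NUMERATOR_SHIFT (16) fractional bits and the getter keeps PSET_LOAD_FRACTIONAL_SHIFT (4) of them, so callers see the load in sixteenths of a runnable thread:

	pset->load_average = 3 << PSET_LOAD_NUMERATOR_SHIFT;   /* a load of 3.0, scaled by 2^16 */
	int load = sched_get_pset_load_average(pset);          /* 3 << 4 == 48, i.e. 3.0 in 1/16 units */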
+extern void sched_update_pset_load_average(processor_set_t pset);
+
+inline static void
+pset_update_processor_state(processor_set_t pset, processor_t processor, uint new_state)
+{
+       pset_assert_locked(pset);
+
+       uint old_state = processor->state;
+       uint cpuid = processor->cpu_id;
+
+       assert(processor->processor_set == pset);
+       assert(bit_test(pset->cpu_bitmask, cpuid));
+
+       assert(old_state < PROCESSOR_STATE_LEN);
+       assert(new_state < PROCESSOR_STATE_LEN);
+
+       processor->state = new_state;
+
+       bit_clear(pset->cpu_state_map[old_state], cpuid);
+       bit_set(pset->cpu_state_map[new_state], cpuid);
+
+       if ((old_state == PROCESSOR_RUNNING) || (new_state == PROCESSOR_RUNNING)) {
+               sched_update_pset_load_average(pset);
+       }
+}
+
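The call sites converted later in this diff (realtime_setrun, processor_setrun, thread_select) all follow the same shape; condensed here as a usage sketch:

	pset_lock(pset);
	if (processor->state == PROCESSOR_IDLE) {
		processor->next_thread = thread;
		processor_state_update_from_thread(processor, thread);
		processor->deadline = UINT64_MAX;
		/* updates processor->state, cpu_state_map[] and the pset load average together */
		pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
	}
	pset_unlock(pset);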
 #else  /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
index 3f563c08fd3e90787a51b2ed4a4d1749bb3b3e1a..8e70723bc1f115d514a55f11ab20990883f7548d 100644 (file)
@@ -76,13 +76,6 @@ struct processor_data {
        /* VM event counters */
        vm_statistics64_data_t  vm_stat;
 
-       /* IPC free message cache */
-       struct ikm_cache {
-#define IKM_STASH      16
-               ipc_kmsg_t                              entries[IKM_STASH];
-               unsigned int                    avail;
-       }                                               ikm_cache;
-
        /* waitq prepost cache */
 #define WQP_CACHE_MAX  50
        struct wqp_cache {
@@ -104,6 +97,7 @@ struct processor_data {
                const char *db_panic_str; 
                va_list *db_panic_args;
                uint64_t db_panic_options;
+               void *db_panic_data_ptr;
                boolean_t db_proceed_on_sync_failure;
                uint32_t db_entry_count; /* incremented whenever we panic or call Debugger (current CPU panic level) */
                kern_return_t db_op_return;
index 1cdcd4f1bc6cb716f8f45d0435b5753db42b0f74..ee2f141c6dc8719a7e0026cc782fc2b42a46b995 100644 (file)
@@ -226,7 +226,6 @@ typedef     struct queue_entry      *queue_entry_t;
 
 #ifdef XNU_KERNEL_PRIVATE
 #include <kern/debug.h>
-#include <mach/branch_predicates.h>
 static inline void __QUEUE_ELT_VALIDATE(queue_entry_t elt) {
        queue_entry_t   elt_next, elt_prev;
        
@@ -730,12 +729,24 @@ movqueue(queue_t _old, queue_t _new)
  *                     <field> is the chain field in (*<type>)
  *     Note:
  *             This should only be used with Method 2 queue iteration (element chains)
+ *
+ *             We insert a compiler barrier after setting the fields in the element
+ *             to ensure that the element is updated before being added to the queue,
+ *             which is especially important because stackshot, which operates from
+ *             debugger context, iterates several queues that use this macro (the tasks
+ *             lists and threads lists) without locks. Without this barrier, the
+ *             compiler may re-order the instructions for this macro in a way that
+ *             could cause stackshot to trip over an inconsistent queue during
+ *             iteration.
  */
 #define queue_enter(head, elt, type, field)                    \
 MACRO_BEGIN                                                    \
        queue_entry_t __prev;                                   \
                                                                \
        __prev = (head)->prev;                                  \
+       (elt)->field.prev = __prev;                             \
+       (elt)->field.next = head;                               \
+       __compiler_barrier();                                   \
        if ((head) == __prev) {                                 \
                (head)->next = (queue_entry_t) (elt);           \
        }                                                       \
@@ -743,8 +754,6 @@ MACRO_BEGIN                                                 \
                ((type)(void *)__prev)->field.next =            \
                        (queue_entry_t)(elt);                   \
        }                                                       \
-       (elt)->field.prev = __prev;                             \
-       (elt)->field.next = head;                               \
        (head)->prev = (queue_entry_t) elt;                     \
 MACRO_END
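To make the hazard described in the comment above concrete (illustration only): if the element-link stores were allowed to sink below the stores that publish the element, a lockless reader such as stackshot could observe it half-initialized:

	/*
	 * Reordered (buggy) store order:                Lockless reader (stackshot):
	 *   ((type)(void *)__prev)->field.next = elt;     walks ...->field.next, reaches elt
	 *   (head)->prev = elt;                           reads elt->field.next -> stale garbage
	 *   elt->field.next = head;
	 *   elt->field.prev = __prev;
	 *
	 * Writing elt's own links first and placing __compiler_barrier() before the
	 * stores that make it reachable closes that window; note it is only a
	 * compiler barrier, not a hardware memory barrier.
	 */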
 
index d8fe8ee3669112144c1ed97582f6cf754d6a2924..43211cb9a4a3323382f62e8ab3f40fc0eaa2f6e2 100644 (file)
 #define MAXPRI_THROTTLE                (MINPRI + 4)                                    /*  4 */
 #define MINPRI_USER            MINPRI                                          /*  0 */
 
-#define DEPRESSPRI             MINPRI                  /* depress priority */
-#define MAXPRI_PROMOTE         (MAXPRI_KERNEL)         /* ceiling for mutex promotion */
+#define DEPRESSPRI              (MINPRI)                /* depress priority */
+
+#define MAXPRI_PROMOTE          (MAXPRI_KERNEL)         /* ceiling for mutex promotion */
+#define MINPRI_RWLOCK           (BASEPRI_BACKGROUND)    /* floor when holding rwlock count */
+#define MINPRI_EXEC             (BASEPRI_DEFAULT)       /* floor when in exec state */
+#define MINPRI_WAITQ            (BASEPRI_DEFAULT)       /* floor when in waitq handover state */
+
 
 /* Type used for thread->sched_mode and saved_mode */
 typedef enum {
@@ -182,7 +187,8 @@ typedef enum {
 typedef enum {
        TH_BUCKET_RUN = 0,      /* All runnable threads */
        TH_BUCKET_FIXPRI,       /* Fixed-priority */
-       TH_BUCKET_SHARE_FG,     /* Timeshare thread above BASEPRI_UTILITY */
+       TH_BUCKET_SHARE_FG,     /* Timeshare thread above BASEPRI_DEFAULT */
+       TH_BUCKET_SHARE_DF,     /* Timeshare thread between BASEPRI_DEFAULT and BASEPRI_UTILITY */
        TH_BUCKET_SHARE_UT,     /* Timeshare thread between BASEPRI_UTILITY and MAXPRI_THROTTLE */
        TH_BUCKET_SHARE_BG,     /* Timeshare thread between MAXPRI_THROTTLE and MINPRI */
        TH_BUCKET_MAX,
@@ -306,6 +312,8 @@ extern void         thread_quantum_expire(
 extern ast_t   csw_check(processor_t           processor,
                                                ast_t                   check_reason);
 
+extern void sched_update_generation_count(void);
+
 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 extern uint32_t        std_quantum, min_std_quantum;
 extern uint32_t        std_quantum_us;
@@ -338,6 +346,8 @@ extern uint32_t             sched_tick_interval;
 extern uint64_t                sched_one_second_interval;
 
 /* Periodic computation of various averages */
+extern void            compute_sched_load(void);
+
 extern void            compute_averages(uint64_t);
 
 extern void            compute_averunnable(
@@ -346,9 +356,6 @@ extern void         compute_averunnable(
 extern void            compute_stack_target(
                                        void                    *arg);
 
-extern void            compute_memory_pressure(
-                                       void                    *arg);
-
 extern void            compute_pageout_gc_throttle(
                                        void                    *arg);
 
index cf95209153bbea6f9c216fd023e3091b88a13407..e7b24bb0d415145621c26f609c5a622b9cad953d 100644 (file)
@@ -110,7 +110,6 @@ static struct sched_average {
 } sched_average[] = {
        { compute_averunnable, &sched_nrun, 5, 0 },
        { compute_stack_target, NULL, 5, 1 },
-       { compute_memory_pressure, NULL, 1, 0 },
        { compute_pageout_gc_throttle, NULL, 1, 0 },
        { compute_pmap_gc_throttle, NULL, 60, 0 },
 #if CONFIG_TELEMETRY
@@ -121,15 +120,63 @@ static struct sched_average {
 
 typedef struct sched_average   *sched_average_t;
 
-uint32_t load_now[TH_BUCKET_MAX];
+/*
+ * Scheduler load calculation algorithm
+ *
+ * The scheduler load values provide an estimate of the number of runnable 
+ * timeshare threads in the system at various priority bands. The load 
+ * ultimately affects the priority shifts applied to all threads in a band 
+ * causing them to timeshare with other threads in the system. The load is 
+ * maintained in buckets, with each bucket corresponding to a priority band.
+ *
+ * Each runnable thread on the system contributes its load to its priority 
+ * band and to the bands above it. The contribution of a thread to the bands 
+ * above it is not strictly 1:1 and is weighted based on the priority band 
+ * of the thread. The rules of thread load contribution to each of its higher 
+ * bands are as follows:
+ *
+ * - DF threads: Up to (2 * NCPUs) threads
+ * - UT threads: Up to NCPUs threads
+ * - BG threads: Up to 1 thread
+ *
+ * To calculate the load values, the various run buckets are sampled (every 
+ * sched_load_compute_interval_abs) and the weighted contributions of the
+ * lower bucket threads are added. The resultant value is plugged into an 
+ * exponentially weighted moving average formula: 
+ *     new-load = alpha * old-load + (1 - alpha) * run-bucket-sample-count
+ *     (where, alpha < 1)
+ * The calculations for the scheduler load are done using fixpoint math with 
+ * a scale factor of 16 to avoid expensive divides and floating point 
+ * operations. The final load values are a smooth curve representative of 
+ * the actual number of runnable threads in a priority band.
+ */
+
+/* Maintains the current (scaled for fixpoint) load in various buckets */
+uint32_t sched_load[TH_BUCKET_MAX];
 
-/* The "stdelta" parameter represents the number of scheduler maintenance
- * "ticks" that have elapsed since the last invocation, subject to
- * integer division imprecision.
+/* 
+ * Alpha factor for the EWMA algorithm. The current values are chosen as 
+ * 6:10 ("old load":"new samples") to make sure the scheduler reacts fast 
+ * enough to changing system load but does not see too many spikes from bursty 
+ * activity. The current values ensure that the scheduler would converge 
+ * to the latest load in 2-3 sched_load_compute_interval_abs intervals 
+ * (which amounts to ~30-45ms with current values).
  */
+#define SCHED_LOAD_EWMA_ALPHA_OLD      6
+#define SCHED_LOAD_EWMA_ALPHA_NEW      10
+#define SCHED_LOAD_EWMA_ALPHA_SHIFT    4
+static_assert((SCHED_LOAD_EWMA_ALPHA_OLD + SCHED_LOAD_EWMA_ALPHA_NEW) == (1ul << SCHED_LOAD_EWMA_ALPHA_SHIFT));
+
+/* For fixpoint EWMA, roundup the load to make it converge */
+#define SCHED_LOAD_EWMA_ROUNDUP(load)  (((load) & (1ul << (SCHED_LOAD_EWMA_ALPHA_SHIFT - 1))) != 0)
 
+/* Macro to convert scaled sched load to a real load value */
+#define SCHED_LOAD_EWMA_UNSCALE(load)  (((load) >> SCHED_LOAD_EWMA_ALPHA_SHIFT) + SCHED_LOAD_EWMA_ROUNDUP(load))
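A worked example of the fixpoint EWMA described above (and implemented in compute_sched_load() below); the numbers are chosen purely for illustration. A bucket whose normalized load jumps from 0 to a steady 4 contributes a scaled sample of 4 << 4 = 64 each interval:

	/*
	 * interval 1: sched_load = (0  * 6 + 64 * 10) >> 4 = 40   -> SCHED_LOAD_EWMA_UNSCALE = 3
	 * interval 2: sched_load = (40 * 6 + 64 * 10) >> 4 = 55   -> SCHED_LOAD_EWMA_UNSCALE = 3
	 * interval 3: sched_load = (55 * 6 + 64 * 10) >> 4 = 60   -> SCHED_LOAD_EWMA_UNSCALE = 4
	 *
	 * so the reported load reaches the new steady value by the third
	 * sched_load_compute_interval_abs, consistent with the 2-3 interval estimate above.
	 */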
+
+/*
+ * Routine to capture the latest runnable counts and update sched_load.
+ */
 void
-compute_averages(uint64_t stdelta)
+compute_sched_load(void)
 {
        /*
         * Retrieve a snapshot of the current run counts.
@@ -138,56 +185,65 @@ compute_averages(uint64_t stdelta)
         * not byte-by-byte copy.
         */
        uint32_t ncpus = processor_avail_count;
+       uint32_t load_now[TH_BUCKET_MAX];
 
        load_now[TH_BUCKET_RUN]      = sched_run_buckets[TH_BUCKET_RUN];
        load_now[TH_BUCKET_FIXPRI]   = sched_run_buckets[TH_BUCKET_FIXPRI];
        load_now[TH_BUCKET_SHARE_FG] = sched_run_buckets[TH_BUCKET_SHARE_FG];
+       load_now[TH_BUCKET_SHARE_DF] = sched_run_buckets[TH_BUCKET_SHARE_DF];
        load_now[TH_BUCKET_SHARE_UT] = sched_run_buckets[TH_BUCKET_SHARE_UT];
        load_now[TH_BUCKET_SHARE_BG] = sched_run_buckets[TH_BUCKET_SHARE_BG];
 
        assert(load_now[TH_BUCKET_RUN] >= 0);
        assert(load_now[TH_BUCKET_FIXPRI] >= 0);
 
-       /* Ignore the current thread, which is a running fixpri thread */
-
-       uint32_t nthreads = load_now[TH_BUCKET_RUN] - 1;
-       uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI] - 1;
+       uint32_t nthreads = load_now[TH_BUCKET_RUN];
+       uint32_t nfixpri  = load_now[TH_BUCKET_FIXPRI];
 
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD) | DBG_FUNC_NONE,
-               load_now[TH_BUCKET_FIXPRI] - 1, load_now[TH_BUCKET_SHARE_FG],
+               load_now[TH_BUCKET_FIXPRI], (load_now[TH_BUCKET_SHARE_FG] + load_now[TH_BUCKET_SHARE_DF]),
                load_now[TH_BUCKET_SHARE_BG],   load_now[TH_BUCKET_SHARE_UT], 0);
 
        /*
         * Compute the timeshare priority conversion factor based on loading.
         * Because our counters may be incremented and accessed
         * concurrently with respect to each other, we may have
-        * windows where the invariant (nthreads - nfixpri) == (fg + bg + ut)
+        * windows where the invariant (nthreads - nfixpri) == (fg + df + bg + ut)
         * is broken, so truncate values in these cases.
         */
-
        uint32_t timeshare_threads = (nthreads - nfixpri);
-
        for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) {
                if (load_now[i] > timeshare_threads)
                        load_now[i] = timeshare_threads;
        }
 
+       /* 
+        * Default threads contribute up to (NCPUS * 2) of load to FG threads 
+        */
+       if (load_now[TH_BUCKET_SHARE_DF] <= (ncpus * 2)) {
+               load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_DF];
+       } else {
+               load_now[TH_BUCKET_SHARE_FG] += (ncpus * 2);
+       }
+       
        /*
-        * Utility threads contribute up to NCPUS of load to FG threads
+        * Utility threads contribute up to NCPUS of load to FG & DF threads
         */
        if (load_now[TH_BUCKET_SHARE_UT] <= ncpus) {
                load_now[TH_BUCKET_SHARE_FG] += load_now[TH_BUCKET_SHARE_UT];
+               load_now[TH_BUCKET_SHARE_DF] += load_now[TH_BUCKET_SHARE_UT];
        } else {
                load_now[TH_BUCKET_SHARE_FG] += ncpus;
+               load_now[TH_BUCKET_SHARE_DF] += ncpus;
        }
 
        /*
-        * FG and UT should notice there's one thread of competition from BG,
-        * but no more.
+        * BG threads contribute up to 1 thread worth of load to FG, DF and UT threads
         */
        if (load_now[TH_BUCKET_SHARE_BG] > 0) {
                load_now[TH_BUCKET_SHARE_FG] += 1;
+               load_now[TH_BUCKET_SHARE_DF] += 1;
                load_now[TH_BUCKET_SHARE_UT] += 1;
        }
 
@@ -203,6 +259,7 @@ compute_averages(uint64_t stdelta)
                uint32_t bucket_load = 0;
 
                if (load_now[i] > ncpus) {
+                       /* Normalize the load to number of CPUs */
                        if (ncpus > 1)
                                bucket_load = load_now[i] / ncpus;
                        else
@@ -211,7 +268,27 @@ compute_averages(uint64_t stdelta)
                        if (bucket_load > MAX_LOAD)
                                bucket_load = MAX_LOAD;
                }
+               /* Plug the load values into the EWMA algorithm to calculate (scaled for fixpoint) sched_load */
+               sched_load[i] = (sched_load[i] * SCHED_LOAD_EWMA_ALPHA_OLD) + ((bucket_load << SCHED_LOAD_EWMA_ALPHA_SHIFT) * SCHED_LOAD_EWMA_ALPHA_NEW);
+               sched_load[i] = sched_load[i] >> SCHED_LOAD_EWMA_ALPHA_SHIFT;
+       }
 
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+               MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_LOAD_EFFECTIVE) | DBG_FUNC_NONE, 
+                       SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_FG]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_DF]),
+                       SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_UT]), SCHED_LOAD_EWMA_UNSCALE(sched_load[TH_BUCKET_SHARE_BG]), 0);
+}
+
+void
+compute_averages(uint64_t stdelta)
+{
+       
+       uint32_t nthreads = sched_run_buckets[TH_BUCKET_RUN] - 1;
+       uint32_t ncpus = processor_avail_count;
+       
+       /* Update the global pri_shifts based on the latest values */
+       for (uint32_t i = TH_BUCKET_SHARE_FG; i <= TH_BUCKET_SHARE_BG ; i++) {
+               uint32_t bucket_load = SCHED_LOAD_EWMA_UNSCALE(sched_load[i]);
                sched_pri_shifts[i] = sched_fixed_shift - sched_load_shifts[bucket_load];
        }
 
index f465d803e19d11165fb4c00c188fe9ae4af9f02c..855d758537a8bf337c030c89efd6280cc49cf53b 100644 (file)
@@ -94,6 +94,9 @@ sched_dualq_processor_queue_shutdown(processor_t processor);
 static sched_mode_t
 sched_dualq_initial_thread_sched_mode(task_t parent_task);
 
+static bool
+sched_dualq_thread_avoid_processor(processor_t processor, thread_t thread);
+
 const struct sched_dispatch_table sched_dualq_dispatch = {
        .sched_name                                     = "dualq",
        .init                                           = sched_dualq_init,
@@ -126,8 +129,8 @@ const struct sched_dispatch_table sched_dualq_dispatch = {
        .direct_dispatch_to_idle_processors             = FALSE,
        .multiple_psets_enabled                         = TRUE,
        .sched_groups_enabled                           = FALSE,
-       .avoid_processor_enabled                        = FALSE,
-       .thread_avoid_processor                         = NULL,
+       .avoid_processor_enabled                        = TRUE,
+       .thread_avoid_processor                         = sched_dualq_thread_avoid_processor,
        .processor_balance                              = sched_SMT_balance,
 
        .rt_runq                                        = sched_rtglobal_runq,
@@ -251,6 +254,10 @@ sched_dualq_processor_csw_check(processor_t processor)
        boolean_t       has_higher;
        int             pri;
 
+       if (sched_dualq_thread_avoid_processor(processor, current_thread())) {
+               return (AST_PREEMPT | AST_URGENT);
+       }
+
        run_queue_t main_runq  = dualq_main_runq(processor);
        run_queue_t bound_runq = dualq_bound_runq(processor);
 
@@ -476,4 +483,21 @@ sched_dualq_thread_update_scan(sched_update_scan_context_t scan_context)
        } while (restart_needed);
 }
 
+extern int sched_allow_rt_smt;
+
+/* Return true if this thread should not continue running on this processor */
+static bool
+sched_dualq_thread_avoid_processor(processor_t processor, thread_t thread)
+{
+       if (processor->processor_primary != processor) {
+               /*
+                * This is a secondary SMT processor.  If the primary is running
+                * a realtime thread, only allow realtime threads on the secondary.
+                */
+               if ((processor->processor_primary->current_pri >= BASEPRI_RTQUEUES) && ((thread->sched_pri < BASEPRI_RTQUEUES) || !sched_allow_rt_smt)) {
+                       return true;
+               }
+       }
 
+       return false;
+}
index 508c83bdb7064911c71ac3e4c820e7c532c9b732..751b57417fbd5c2090d8747d73ca01866bbf7fd8 100644 (file)
@@ -78,9 +78,7 @@
 #include <machine/machlimits.h>
 #include <machine/atomic.h>
 
-#ifdef CONFIG_MACH_APPROXIMATE_TIME
 #include <machine/commpage.h>
-#endif
 
 #include <kern/kern_types.h>
 #include <kern/backtrace.h>
 #include <kern/timer_queue.h>
 #include <kern/waitq.h>
 #include <kern/policy_internal.h>
+#include <kern/cpu_quiesce.h>
 
 #include <vm/pmap.h>
 #include <vm/vm_kern.h>
@@ -181,8 +180,13 @@ uint32_t   min_rt_quantum;
 
 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
 
-unsigned       sched_tick;
-uint32_t       sched_tick_interval;
+unsigned               sched_tick;
+uint32_t               sched_tick_interval;
+
+/* Timeshare load calculation interval (15ms) */
+uint32_t               sched_load_compute_interval_us = 15000; 
+uint64_t               sched_load_compute_interval_abs;
+static _Atomic uint64_t        sched_load_compute_deadline;
 
 uint32_t       sched_pri_shifts[TH_BUCKET_MAX];
 uint32_t       sched_fixed_shift;
@@ -341,7 +345,7 @@ sched_init_override(void)
                kprintf("Scheduler: Runtime selection of %s\n", SCHED(sched_name));
        } else {
 #if   defined(CONFIG_SCHED_MULTIQ)
-               sched_current_dispatch = &sched_multiq_dispatch;
+               sched_current_dispatch = &sched_dualq_dispatch;
 #elif defined(CONFIG_SCHED_TRADITIONAL)
                sched_current_dispatch = &sched_traditional_with_pset_runqueue_dispatch;
 #else
@@ -379,6 +383,8 @@ sched_init(void)
        }
        strlcpy(sched_string, SCHED(sched_name), sizeof(sched_string));
 
+       cpu_quiescent_counter_init();
+
        SCHED(init)();
        SCHED(rt_init)(&pset0);
        sched_timer_deadline_tracking_init();
@@ -454,6 +460,10 @@ sched_timeshare_timebase_init(void)
        assert((abstime >> 32) == 0 && (uint32_t)abstime != 0);
        sched_tick_interval = (uint32_t)abstime;
 
+       /* timeshare load calculation interval & deadline initialization */
+       clock_interval_to_absolutetime_interval(sched_load_compute_interval_us, NSEC_PER_USEC, &sched_load_compute_interval_abs);
+       sched_load_compute_deadline = sched_load_compute_interval_abs;  
+       
        /*
         * Compute conversion factor from usage to
         * timesharing priorities with 5/8 ** n aging.
@@ -662,6 +672,7 @@ thread_unblock(
        boolean_t               ready_for_runq = FALSE;
        thread_t                cthread = current_thread();
        uint32_t                new_run_count;
+       int                             old_thread_state;
 
        /*
         *      Set wait_result.
@@ -681,15 +692,20 @@ thread_unblock(
         *      Update scheduling state: not waiting,
         *      set running.
         */
-       thread->state &= ~(TH_WAIT|TH_UNINT);
+       old_thread_state = thread->state;
+       thread->state = (old_thread_state | TH_RUN) &
+                       ~(TH_WAIT|TH_UNINT|TH_WAIT_REPORT);
 
-       if (!(thread->state & TH_RUN)) {
-               thread->state |= TH_RUN;
-               thread->last_made_runnable_time = thread->last_basepri_change_time = mach_approximate_time();
+       if ((old_thread_state & TH_RUN) == 0) {
+               uint64_t ctime = mach_approximate_time();
+               thread->last_made_runnable_time = thread->last_basepri_change_time = ctime;
+               timer_start(&thread->runnable_timer, ctime);
 
                ready_for_runq = TRUE;
 
-               (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
+               if (old_thread_state & TH_WAIT_REPORT) {
+                       (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
+               }
 
                /* Update the runnable thread count */
                new_run_count = sched_run_incr(thread);
@@ -786,6 +802,12 @@ thread_unblock(
                thread->callout_woke_thread = FALSE;
        }
 
+#if KPERF
+       if (ready_for_runq) {
+               kperf_make_runnable(thread, aticontext);
+       }
+#endif /* KPERF */
+
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                MACHDBG_CODE(DBG_MACH_SCHED,MACH_MAKE_RUNNABLE) | DBG_FUNC_NONE,
                (uintptr_t)thread_tid(thread), thread->sched_pri, thread->wait_result,
@@ -848,11 +870,12 @@ __private_extern__
 wait_result_t
 thread_mark_wait_locked(
        thread_t                        thread,
-       wait_interrupt_t        interruptible)
+       wait_interrupt_t        interruptible_orig)
 {
-       boolean_t               at_safe_point;
+       boolean_t                       at_safe_point;
+       wait_interrupt_t        interruptible = interruptible_orig;
 
-       assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2)));
+       assert(!(thread->state & (TH_WAIT|TH_IDLE|TH_UNINT|TH_TERMINATE2|TH_WAIT_REPORT)));
 
        /*
         *      The thread may have certain types of interrupts/aborts masked
@@ -860,6 +883,7 @@ thread_mark_wait_locked(
         *      are OK, we have to honor mask settings (outer-scoped code may
         *      not be able to handle aborts at the moment).
         */
+       interruptible &= TH_OPT_INTMASK;
        if (interruptible > (thread->options & TH_OPT_INTMASK))
                interruptible = thread->options & TH_OPT_INTMASK;
 
@@ -873,7 +897,20 @@ thread_mark_wait_locked(
                if ( !(thread->state & TH_TERMINATE))
                        DTRACE_SCHED(sleep);
 
-               thread->state |= (interruptible) ? TH_WAIT : (TH_WAIT | TH_UNINT);
+               int state_bits = TH_WAIT;
+               if (!interruptible) {
+                       state_bits |= TH_UNINT;
+               }
+               if (thread->sched_call) {
+                       wait_interrupt_t mask = THREAD_WAIT_NOREPORT_USER;
+                       if (is_kerneltask(thread->task)) {
+                               mask = THREAD_WAIT_NOREPORT_KERNEL;
+                       }
+                       if ((interruptible_orig & mask) == 0) {
+                               state_bits |= TH_WAIT_REPORT;
+                       }
+               }
+               thread->state |= state_bits;
                thread->at_safe_point = at_safe_point;
 
                /* TODO: pass this through assert_wait instead, have
@@ -883,10 +920,10 @@ thread_mark_wait_locked(
                thread->pending_block_hint = kThreadWaitNone;
 
                return (thread->wait_result = THREAD_WAITING);
+       } else {
+               if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
+                       thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
        }
-       else
-       if (thread->sched_flags & TH_SFLAG_ABORTSAFELY)
-               thread->sched_flags &= ~TH_SFLAG_ABORTED_MASK;
        thread->pending_block_hint = kThreadWaitNone;
 
        return (thread->wait_result = THREAD_INTERRUPTED);
@@ -1745,18 +1782,19 @@ sched_SMT_balance(processor_t cprocessor, processor_set_t cpset) {
        processor_t sprocessor;
 
        sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
-       qe_foreach_element(sprocessor, &cpset->active_queue, processor_queue) {
-               if ((sprocessor->state == PROCESSOR_RUNNING) &&
-                   (sprocessor->processor_primary != sprocessor) &&
-                   (sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
+       uint64_t running_secondary_map = (cpset->cpu_state_map[PROCESSOR_RUNNING] &
+                                         ~cpset->primary_map);
+       for (int cpuid = lsb_first(running_secondary_map); cpuid >= 0; cpuid = lsb_next(running_secondary_map, cpuid)) {
+               sprocessor = processor_array[cpuid];
+               if ((sprocessor->processor_primary->state == PROCESSOR_RUNNING) &&
                    (sprocessor->current_pri < BASEPRI_RTQUEUES)) {
 
-                   ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
-                   if (ipi_type != SCHED_IPI_NONE) {
-                assert(sprocessor != cprocessor);
-                ast_processor = sprocessor;
-                break;
-                   }
+                       ipi_type = sched_ipi_action(sprocessor, NULL, false, SCHED_IPI_EVENT_SMT_REBAL);
+                       if (ipi_type != SCHED_IPI_NONE) {
+                               assert(sprocessor != cprocessor);
+                               ast_processor = sprocessor;
+                               break;
+                       }
                }
        }
 
@@ -1830,7 +1868,7 @@ thread_select(thread_t          thread,
                         * choose_processor(), so in those cases we should continue trying to dequeue work.
                         */
                        if (!SCHED(processor_bound_count)(processor)) {
-                               if (!queue_empty(&pset->idle_queue)) {
+                               if ((pset->recommended_bitmask & pset->primary_map & pset->cpu_state_map[PROCESSOR_IDLE]) != 0) {
                                        goto idle;
                                }
                                
@@ -2036,18 +2074,7 @@ pick_new_rt_thread:
                 *      was running.
                 */
                if (processor->state == PROCESSOR_RUNNING) {
-                       processor->state = PROCESSOR_IDLE;
-
-                       if (!processor->is_recommended) {
-                               re_queue_head(&pset->unused_queue, &processor->processor_queue);
-                       } else if (processor->processor_primary == processor) {
-                               re_queue_head(&pset->idle_queue, &processor->processor_queue);
-                       } else {
-                               re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
-                       }
-
-                       pset->active_processor_count--;
-                       sched_update_pset_load_average(pset);
+                       pset_update_processor_state(pset, processor, PROCESSOR_IDLE);
                }
 
 #if __SMP__
@@ -2127,7 +2154,8 @@ thread_select_idle(
 #endif
 
        thread->last_run_time = processor->last_dispatch;
-       thread_timer_event(processor->last_dispatch, &processor->idle_thread->system_timer);
+       processor_timer_switch_thread(processor->last_dispatch,
+                       &processor->idle_thread->system_timer);
        PROCESSOR_DATA(processor, kernel_timer) = &processor->idle_thread->system_timer;
 
 
@@ -2137,7 +2165,9 @@ thread_select_idle(
        timer_call_quantum_timer_cancel(&processor->quantum_timer);
        processor->first_timeslice = FALSE;
 
-       (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
+       if (thread->sched_call) {
+               (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
+       }
 
        thread_tell_urgency(THREAD_URGENCY_NONE, 0, 0, 0, NULL);
 
@@ -2150,7 +2180,9 @@ thread_select_idle(
        /*
         *      Return at splsched.
         */
-       (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
+       if (thread->sched_call) {
+               (*thread->sched_call)(SCHED_CALL_UNBLOCK, thread);
+       }
 
        thread_lock(thread);
 
@@ -2159,16 +2191,17 @@ thread_select_idle(
         *      Otherwise skip; we will context switch to another thread or return here.
         */
        if (!(thread->state & TH_WAIT)) {
-               processor->last_dispatch = mach_absolute_time();
-               thread_timer_event(processor->last_dispatch, &thread->system_timer);
+               uint64_t time_now = processor->last_dispatch = mach_absolute_time();
+               processor_timer_switch_thread(time_now, &thread->system_timer);
+               timer_update(&thread->runnable_timer, time_now);
                PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
                thread_quantum_init(thread);
-               processor->quantum_end = processor->last_dispatch + thread->quantum_remaining;
+               processor->quantum_end = time_now + thread->quantum_remaining;
                timer_call_quantum_timer_enter(&processor->quantum_timer,
-                       thread, processor->quantum_end, processor->last_dispatch);
+                       thread, processor->quantum_end, time_now);
                processor->first_timeslice = TRUE;
 
-               thread->computation_epoch = processor->last_dispatch;
+               thread->computation_epoch = time_now;
        }
 
        thread->state &= ~TH_IDLE;
@@ -2262,7 +2295,7 @@ thread_invoke(
 
     /* Prepare for spin debugging */
 #if INTERRUPT_MASKED_DEBUG
-    ml_spin_debug_clear(thread);
+       ml_spin_debug_clear(thread);
 #endif
 
        if (continuation != NULL) {
@@ -2300,7 +2333,8 @@ thread_invoke(
 
                        processor->last_dispatch = ctime;
                        self->last_run_time = ctime;
-                       thread_timer_event(ctime, &thread->system_timer);
+                       processor_timer_switch_thread(ctime, &thread->system_timer);
+                       timer_update(&thread->runnable_timer, ctime);
                        PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
 
                        /*
@@ -2308,11 +2342,9 @@ thread_invoke(
                         * during privilege transitions, synthesize an event now.
                         */
                        if (!thread->precise_user_kernel_time) {
-                               timer_switch(PROCESSOR_DATA(processor, current_state),
-                                                       ctime,
-                                                        PROCESSOR_DATA(processor, current_state));
+                               timer_update(PROCESSOR_DATA(processor, current_state), ctime);
                        }
-       
+
                        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                                MACHDBG_CODE(DBG_MACH_SCHED, MACH_STACK_HANDOFF)|DBG_FUNC_NONE,
                                self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
@@ -2326,11 +2358,15 @@ thread_invoke(
 
                        SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
 
+#if KPERF
+                       kperf_off_cpu(self);
+#endif /* KPERF */
+
                        TLOG(1, "thread_invoke: calling stack_handoff\n");
                        stack_handoff(self, thread);
 
                        /* 'self' is now off core */
-                       assert(thread == current_thread());
+                       assert(thread == current_thread_volatile());
 
                        DTRACE_SCHED(on__cpu);
 
@@ -2338,21 +2374,20 @@ thread_invoke(
                        kperf_on_cpu(thread, continuation, NULL);
 #endif /* KPERF */
 
+                       thread_dispatch(self, thread);
+
 #if KASAN
-                       kasan_unpoison_fakestack(self);
+                       /* Old thread's stack has been moved to the new thread, so explicitly
+                        * unpoison it. */
                        kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
 #endif
 
-                       thread_dispatch(self, thread);
-
                        thread->continuation = thread->parameter = NULL;
 
                        counter(c_thread_invoke_hits++);
 
-                       (void) spllo();
-
                        assert(continuation);
-                       call_continuation(continuation, parameter, thread->wait_result);
+                       call_continuation(continuation, parameter, thread->wait_result, TRUE);
                        /*NOTREACHED*/
                }
                else if (thread == self) {
@@ -2371,15 +2406,16 @@ thread_invoke(
                                self->reason, (uintptr_t)thread_tid(thread), self->sched_pri, thread->sched_pri, 0);
 
 #if KASAN
-                       kasan_unpoison_fakestack(self);
+                       /* stack handoff to self - no thread_dispatch(), so clear the stack
+                        * and free the fakestack directly */
+                       kasan_fakestack_drop(self);
+                       kasan_fakestack_gc(self);
                        kasan_unpoison_stack(self->kernel_stack, kernel_stack_size);
 #endif
 
                        self->continuation = self->parameter = NULL;
 
-                       (void) spllo();
-
-                       call_continuation(continuation, parameter, self->wait_result);
+                       call_continuation(continuation, parameter, self->wait_result, TRUE);
                        /*NOTREACHED*/
                }
        } else {
@@ -2431,7 +2467,8 @@ need_stack:
 
        processor->last_dispatch = ctime;
        self->last_run_time = ctime;
-       thread_timer_event(ctime, &thread->system_timer);
+       processor_timer_switch_thread(ctime, &thread->system_timer);
+       timer_update(&thread->runnable_timer, ctime);
        PROCESSOR_DATA(processor, kernel_timer) = &thread->system_timer;
 
        /*
@@ -2439,9 +2476,7 @@ need_stack:
         * during privilege transitions, synthesize an event now.
         */
        if (!thread->precise_user_kernel_time) {
-               timer_switch(PROCESSOR_DATA(processor, current_state),
-                                       ctime,
-                                        PROCESSOR_DATA(processor, current_state));
+               timer_update(PROCESSOR_DATA(processor, current_state), ctime);
        }
 
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -2457,6 +2492,10 @@ need_stack:
 
        SCHED_STATS_CSW(processor, self->reason, self->sched_pri, thread->sched_pri);
 
+#if KPERF
+       kperf_off_cpu(self);
+#endif /* KPERF */
+
        /*
         * This is where we actually switch register context,
         * and address space if required.  We will next run
@@ -2474,7 +2513,7 @@ need_stack:
         */
        assert(continuation == self->continuation);
        thread = machine_switch_context(self, continuation, thread);
-       assert(self == current_thread());
+       assert(self == current_thread_volatile());
        TLOG(1,"thread_invoke: returning machine_switch_context: self %p continuation %p thread %p\n", self, continuation, thread);
 
        DTRACE_SCHED(on__cpu);
@@ -2491,9 +2530,7 @@ need_stack:
        if (continuation) {
                self->continuation = self->parameter = NULL;
 
-               (void) spllo();
-
-               call_continuation(continuation, parameter, self->wait_result);
+               call_continuation(continuation, parameter, self->wait_result, TRUE);
                /*NOTREACHED*/
        }
 
@@ -2537,11 +2574,14 @@ pset_cancel_deferred_dispatch(
         * correct (we won't accidentally have a runnable thread that hasn't been
         * dispatched to an idle processor), if not ideal (we may be restarting the
         * dispatch process, which could have some overhead).
-        *
         */
-       if ((sampled_sched_run_count == 1) &&
-           (pset->pending_deferred_AST_cpu_mask)) {
-               qe_foreach_element_safe(active_processor, &pset->active_queue, processor_queue) {
+
+       if ((sampled_sched_run_count == 1) && (pset->pending_deferred_AST_cpu_mask)) {
+               uint64_t dispatching_map = (pset->cpu_state_map[PROCESSOR_DISPATCHING] &
+                                           pset->pending_deferred_AST_cpu_mask &
+                                           ~pset->pending_AST_cpu_mask);
+               for (int cpuid = lsb_first(dispatching_map); cpuid >= 0; cpuid = lsb_next(dispatching_map, cpuid)) {
+                       active_processor = processor_array[cpuid];
                        /*
                         * If a processor is DISPATCHING, it could be because of
                         * a cancelable signal.
@@ -2563,35 +2603,16 @@ pset_cancel_deferred_dispatch(
                         * should be no different than if the core took some
                         * interrupt while IDLE.
                         */
-                       if ((active_processor->state == PROCESSOR_DISPATCHING) &&
-                           (bit_test(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id)) &&
-                           (!bit_test(pset->pending_AST_cpu_mask, active_processor->cpu_id)) &&
-                           (active_processor != processor)) {
+                       if (active_processor != processor) {
                                /*
                                 * Squash all of the processor state back to some
                                 * reasonable facsimile of PROCESSOR_IDLE.
-                                *
-                                * TODO: What queue policy do we actually want here?
-                                * We want to promote selection of a good processor
-                                * to run on.  Do we want to enqueue at the head?
-                                * The tail?  At the (relative) old position in the
-                                * queue?  Or something else entirely?
                                 */
-                               if (!active_processor->is_recommended) {
-                                       re_queue_head(&pset->unused_queue, &active_processor->processor_queue);
-                               } else if (active_processor->processor_primary == active_processor) {
-                                       re_queue_head(&pset->idle_queue, &active_processor->processor_queue);
-                               } else {
-                                       re_queue_head(&pset->idle_secondary_queue, &active_processor->processor_queue);
-                               }
-
-                               pset->active_processor_count--;
-                               sched_update_pset_load_average(pset);
 
                                assert(active_processor->next_thread == THREAD_NULL);
                                processor_state_update_idle(active_processor);
                                active_processor->deadline = UINT64_MAX;
-                               active_processor->state = PROCESSOR_IDLE;
+                               pset_update_processor_state(pset, active_processor, PROCESSOR_IDLE);
                                bit_clear(pset->pending_deferred_AST_cpu_mask, active_processor->cpu_id);
                                machine_signal_idle_cancel(active_processor);
                        }
@@ -2639,7 +2660,7 @@ thread_dispatch(
        processor_t             processor = self->last_processor;
 
        assert(processor == current_processor());
-       assert(self == current_thread());
+       assert(self == current_thread_volatile());
        assert(thread != self);
 
        if (thread != THREAD_NULL) {
@@ -2656,7 +2677,25 @@ thread_dispatch(
                 * - We do not want to callout if "thread" is NULL.
                 */
                thread_csw_callout(thread, self, processor->last_dispatch);     
-               
+
+#if KASAN
+               if (thread->continuation != NULL) {
+                       /*
+                        * Thread has a continuation and the normal stack is going away.
+                        * Unpoison the stack and mark all fakestack objects as unused.
+                        */
+                       kasan_fakestack_drop(thread);
+                       if (thread->kernel_stack) {
+                               kasan_unpoison_stack(thread->kernel_stack, kernel_stack_size);
+                       }
+               }
+
+               /*
+                * Free all unused fakestack objects.
+                */
+               kasan_fakestack_gc(thread);
+#endif
+
                /*
                 *      If blocked at a continuation, discard
                 *      the stack.
@@ -2785,8 +2824,9 @@ thread_dispatch(
                                if (reason & AST_REBALANCE) {
                                        options |= SCHED_REBALANCE;
                                        if (reason & AST_QUANTUM) {
-                                               /* Having gone to the trouble of forcing this thread off a less preferred core,
-                                                * we should force the preferable core to reschedule immediatey to give this
+                                               /*
+                                                * Having gone to the trouble of forcing this thread off a less preferred core,
+                                                * we should force the preferable core to reschedule immediately to give this
                                                 * thread a chance to run instead of just sitting on the run queue where
                                                 * it may just be stolen back by the idle core we just forced it off.
                                                 * But only do this at the end of a quantum to prevent cascading effects.
@@ -2825,27 +2865,30 @@ thread_dispatch(
                                 */
                                boolean_t should_terminate = FALSE;
                                uint32_t new_run_count;
+                               int thread_state = thread->state;
 
                                /* Only the first call to thread_dispatch
                                 * after explicit termination should add
                                 * the thread to the termination queue
                                 */
-                               if ((thread->state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
+                               if ((thread_state & (TH_TERMINATE|TH_TERMINATE2)) == TH_TERMINATE) {
                                        should_terminate = TRUE;
-                                       thread->state |= TH_TERMINATE2;
+                                       thread_state |= TH_TERMINATE2;
                                }
 
-                               thread->state &= ~TH_RUN;
+                               timer_stop(&thread->runnable_timer, processor->last_dispatch);
+
+                               thread_state &= ~TH_RUN;
+                               thread->state = thread_state;
+
                                thread->last_made_runnable_time = thread->last_basepri_change_time = THREAD_NOT_RUNNABLE;
                                thread->chosen_processor = PROCESSOR_NULL;
 
                                new_run_count = sched_run_decr(thread);
 
 #if CONFIG_SCHED_SFI
-                               if ((thread->state & (TH_WAIT | TH_TERMINATE)) == TH_WAIT) {
-                                       if (thread->reason & AST_SFI) {
-                                               thread->wait_sfi_begin_time = processor->last_dispatch;
-                                       }
+                               if (thread->reason & AST_SFI) {
+                                       thread->wait_sfi_begin_time = processor->last_dispatch;
                                }
 #endif
 
@@ -2853,10 +2896,12 @@ thread_dispatch(
 
                                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                                        MACHDBG_CODE(DBG_MACH_SCHED,MACH_DISPATCH) | DBG_FUNC_NONE,
-                                       (uintptr_t)thread_tid(thread), thread->reason, thread->state,
+                                       (uintptr_t)thread_tid(thread), thread->reason, thread_state,
                                        new_run_count, 0);
 
-                               (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
+                               if (thread_state & TH_WAIT_REPORT) {
+                                       (*thread->sched_call)(SCHED_CALL_BLOCK, thread);
+                               }
 
                                if (thread->wake_active) {
                                        thread->wake_active = FALSE;
@@ -2949,7 +2994,6 @@ thread_dispatch(
                pset_cancel_deferred_dispatch(processor->processor_set, processor);
        }
 #endif
-
 }
 
 /*
@@ -3105,11 +3149,10 @@ thread_continue(
     ml_spin_debug_clear(self);
 #endif
 
-       if (thread != THREAD_NULL)
-               (void)spllo();
-
- TLOG(1, "thread_continue: calling call_continuation \n");
-       call_continuation(continuation, parameter, self->wait_result);
+       TLOG(1, "thread_continue: calling call_continuation\n");
+       
+       boolean_t enable_interrupts = thread != THREAD_NULL;
+       call_continuation(continuation, parameter, self->wait_result, enable_interrupts);
        /*NOTREACHED*/
 }
 
@@ -3347,8 +3390,9 @@ realtime_setrun(
        processor_t                     processor,
        thread_t                        thread)
 {
-       processor_set_t         pset = processor->processor_set;
-       ast_t                           preempt;
+       processor_set_t pset = processor->processor_set;
+       pset_assert_locked(pset);
+       ast_t preempt;
 
        sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
 
@@ -3362,15 +3406,11 @@ realtime_setrun(
         */
        if ( (thread->bound_processor == processor)
                && processor->state == PROCESSOR_IDLE) {
-               re_queue_tail(&pset->active_queue, &processor->processor_queue);
-
-               pset->active_processor_count++;
-               sched_update_pset_load_average(pset);
 
                processor->next_thread = thread;
                processor_state_update_from_thread(processor, thread);
                processor->deadline = thread->realtime.deadline;
-               processor->state = PROCESSOR_DISPATCHING;
+               pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
 
                ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
                pset_unlock(pset);
@@ -3390,15 +3430,10 @@ realtime_setrun(
        ipi_type = SCHED_IPI_NONE;
        if (preempt != AST_NONE) {
                if (processor->state == PROCESSOR_IDLE) {
-                       re_queue_tail(&pset->active_queue, &processor->processor_queue);
-                       
-                       pset->active_processor_count++;
-                       sched_update_pset_load_average(pset);
-
                        processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = thread->realtime.deadline;
-                       processor->state = PROCESSOR_DISPATCHING;
+                       pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
                        if (processor == current_processor()) {
                                ast_on(preempt);
                        } else {
@@ -3557,8 +3592,9 @@ processor_setrun(
        thread_t                        thread,
        integer_t                       options)
 {
-       processor_set_t         pset = processor->processor_set;
-       ast_t                           preempt;
+       processor_set_t pset = processor->processor_set;
+       pset_assert_locked(pset);
+       ast_t preempt;
        enum { eExitIdle, eInterruptRunning, eDoNothing } ipi_action = eDoNothing;
 
        sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
@@ -3572,15 +3608,10 @@ processor_setrun(
                  thread->bound_processor == processor)
                && processor->state == PROCESSOR_IDLE) {
 
-               re_queue_tail(&pset->active_queue, &processor->processor_queue);
-
-               pset->active_processor_count++;
-               sched_update_pset_load_average(pset);
-
                processor->next_thread = thread;
                processor_state_update_from_thread(processor, thread);
                processor->deadline = UINT64_MAX;
-               processor->state = PROCESSOR_DISPATCHING;
+               pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
 
                ipi_type = sched_ipi_action(processor, thread, true, SCHED_IPI_EVENT_BOUND_THR);
                pset_unlock(pset);
@@ -3607,17 +3638,25 @@ processor_setrun(
        } else
                preempt = (options & SCHED_PREEMPT)? AST_PREEMPT: AST_NONE;
 
+       if ((options & (SCHED_PREEMPT|SCHED_REBALANCE)) == (SCHED_PREEMPT|SCHED_REBALANCE)) {
+               /*
+                * Having gone to the trouble of forcing this thread off a less preferred core,
+                * we should force the preferable core to reschedule immediately to give this
+                * thread a chance to run instead of just sitting on the run queue where
+                * it may just be stolen back by the idle core we just forced it off.
+                */
+               preempt |= AST_PREEMPT;
+       }
+
        SCHED(processor_enqueue)(processor, thread, options);
        sched_update_pset_load_average(pset);
 
        if (preempt != AST_NONE) {
                if (processor->state == PROCESSOR_IDLE) {
-                       re_queue_tail(&pset->active_queue, &processor->processor_queue);
-                       pset->active_processor_count++;
                        processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = UINT64_MAX;
-                       processor->state = PROCESSOR_DISPATCHING;
+                       pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
                        ipi_action = eExitIdle;
                } else if ( processor->state == PROCESSOR_DISPATCHING) {
                        if ((processor->next_thread == THREAD_NULL) && (processor->current_pri < thread->sched_pri)) {
@@ -3638,15 +3677,11 @@ processor_setrun(
                        thread->sched_pri >= processor->current_pri     ) {
                        ipi_action = eInterruptRunning;
                } else if (processor->state == PROCESSOR_IDLE) {
-                       re_queue_tail(&pset->active_queue, &processor->processor_queue);
-
-                       pset->active_processor_count++;
-                       // sched_update_pset_load_average(pset);
 
                        processor->next_thread = THREAD_NULL;
                        processor_state_update_from_thread(processor, thread);
                        processor->deadline = UINT64_MAX;
-                       processor->state = PROCESSOR_DISPATCHING;
+                       pset_update_processor_state(pset, processor, PROCESSOR_DISPATCHING);
 
                        ipi_action = eExitIdle;
                }
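
A small sketch of the option test introduced above: only when the caller sets both SCHED_PREEMPT and SCHED_REBALANCE does the enqueue force an immediate preemption AST on the target core. The flag values below are placeholders; the real bits are defined in sched_prim.h.

    #include <assert.h>
    #include <stdbool.h>

    /* placeholder values, not the xnu definitions */
    enum { SCHED_PREEMPT = 0x4, SCHED_REBALANCE = 0x8 };

    static bool rebalance_forces_preempt(int options)
    {
        /* both bits must be set: a rebalanced thread should preempt its new core */
        return (options & (SCHED_PREEMPT | SCHED_REBALANCE)) ==
               (SCHED_PREEMPT | SCHED_REBALANCE);
    }

    int main(void)
    {
        assert(rebalance_forces_preempt(SCHED_PREEMPT | SCHED_REBALANCE));
        assert(!rebalance_forces_preempt(SCHED_PREEMPT));
        return 0;
    }
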
@@ -3701,11 +3736,12 @@ choose_next_pset(
  */
 processor_t
 choose_processor(
-       processor_set_t         pset,
-       processor_t                     processor,
-       thread_t                        thread)
+       processor_set_t         starting_pset,
+       processor_t             processor,
+       thread_t                thread)
 {
-       processor_set_t         nset, cset = pset;
+       processor_set_t pset = starting_pset;
+       processor_set_t nset;
 
        assert(thread->sched_pri <= BASEPRI_RTQUEUES);
 
@@ -3821,16 +3857,19 @@ choose_processor(
        }
 
        do {
-
                /*
                 * Choose an idle processor, in pset traversal order
                 */
-               qe_foreach_element(processor, &cset->idle_queue, processor_queue) {
-                       if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) {
-                               continue;
-                       }
-                       if (processor->is_recommended)
-                               return processor;
+
+               uint64_t idle_primary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
+                                            pset->primary_map &
+                                            pset->recommended_bitmask &
+                                            ~pset->pending_AST_cpu_mask);
+
+               int cpuid = lsb_first(idle_primary_map);
+               if (cpuid >= 0) {
+                       processor = processor_array[cpuid];
+                       return processor;
                }
 
                /*
@@ -3838,14 +3877,13 @@ choose_processor(
                 * with lower priority/etc.
                 */
 
-               qe_foreach_element(processor, &cset->active_queue, processor_queue) {
-
-                       if (!processor->is_recommended) {
-                               continue;
-                       }
-                       if (bit_test(cset->pending_AST_cpu_mask, processor->cpu_id)) {
-                               continue;
-                       }
+               uint64_t active_map = ((pset->cpu_state_map[PROCESSOR_RUNNING] | pset->cpu_state_map[PROCESSOR_DISPATCHING]) &
+                                            pset->recommended_bitmask &
+                                            ~pset->pending_AST_cpu_mask);
+               active_map = bit_ror64(active_map, (pset->last_chosen + 1));
+               for (int rotid = lsb_first(active_map); rotid >= 0; rotid = lsb_next(active_map, rotid)) {
+                       cpuid = ((rotid + pset->last_chosen + 1) & 63);
+                       processor = processor_array[cpuid];
 
                        integer_t cpri = processor->current_pri;
                        if (processor->processor_primary != processor) {
@@ -3876,15 +3914,20 @@ choose_processor(
                 * For SMT configs, these idle secondary processors must have active primary. Otherwise
                 * the idle primary would have short-circuited the loop above
                 */
-               qe_foreach_element(processor, &cset->idle_secondary_queue, processor_queue) {
+               uint64_t idle_secondary_map = (pset->cpu_state_map[PROCESSOR_IDLE] &
+                                              ~pset->primary_map &
+                                              pset->recommended_bitmask &
+                                              ~pset->pending_AST_cpu_mask);
 
-                       if (!processor->is_recommended) {
-                               continue;
-                       }
+               for (cpuid = lsb_first(idle_secondary_map); cpuid >= 0; cpuid = lsb_next(idle_secondary_map, cpuid)) {
+                       processor = processor_array[cpuid];
 
                        processor_t cprimary = processor->processor_primary;
 
-                       if (bit_test(cset->pending_AST_cpu_mask, cprimary->cpu_id)) {
+                       if (!cprimary->is_recommended) {
+                               continue;
+                       }
+                       if (bit_test(pset->pending_AST_cpu_mask, cprimary->cpu_id)) {
                                continue;
                        }
 
@@ -3911,16 +3954,15 @@ choose_processor(
                         */
 
                        if (thread->sched_pri > lowest_unpaired_primary_priority) {
-                               /* Move to end of active queue so that the next thread doesn't also pick it */
-                               re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
+                               pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
                                return lp_unpaired_primary_processor;
                        }
                        if (thread->sched_pri > lowest_priority) {
-                               /* Move to end of active queue so that the next thread doesn't also pick it */
-                               re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
+                               pset->last_chosen = lp_processor->cpu_id;
                                return lp_processor;
                        }
                        if (sched_allow_rt_smt && (thread->sched_pri > lowest_secondary_priority)) {
+                               pset->last_chosen = lp_paired_secondary_processor->cpu_id;
                                return lp_paired_secondary_processor;
                        }
                        if (thread->realtime.deadline < furthest_deadline)
@@ -3935,13 +3977,11 @@ choose_processor(
                else {
 
                        if (thread->sched_pri > lowest_unpaired_primary_priority) {
-                               /* Move to end of active queue so that the next thread doesn't also pick it */
-                               re_queue_tail(&cset->active_queue, &lp_unpaired_primary_processor->processor_queue);
+                               pset->last_chosen = lp_unpaired_primary_processor->cpu_id;
                                return lp_unpaired_primary_processor;
                        }
                        if (thread->sched_pri > lowest_priority) {
-                               /* Move to end of active queue so that the next thread doesn't also pick it */
-                               re_queue_tail(&cset->active_queue, &lp_processor->processor_queue);
+                               pset->last_chosen = lp_processor->cpu_id;
                                return lp_processor;
                        }
 
@@ -3955,15 +3995,15 @@ choose_processor(
                /*
                 * Move onto the next processor set.
                 */
-               nset = next_pset(cset);
+               nset = next_pset(pset);
 
-               if (nset != pset) {
-                       pset_unlock(cset);
+               if (nset != starting_pset) {
+                       pset_unlock(pset);
 
-                       cset = nset;
-                       pset_lock(cset);
+                       pset = nset;
+                       pset_lock(pset);
                }
-       } while (nset != pset);
+       } while (nset != starting_pset);
 
        /*
         * Make sure that we pick a running processor,
@@ -4001,10 +4041,10 @@ choose_processor(
                 * Check that the correct processor set is
                 * returned locked.
                 */
-               if (cset != processor->processor_set) {
-                       pset_unlock(cset);
-                       cset = processor->processor_set;
-                       pset_lock(cset);
+               if (pset != processor->processor_set) {
+                       pset_unlock(pset);
+                       pset = processor->processor_set;
+                       pset_lock(pset);
                }
 
                /*
@@ -4019,11 +4059,8 @@ choose_processor(
 
        } while (processor == PROCESSOR_NULL);
 
-       if (processor->state == PROCESSOR_RUNNING) {
-               re_queue_tail(&cset->active_queue, &processor->processor_queue);
-       }
-
-       return (processor);
+       pset->last_chosen = processor->cpu_id;
+       return processor;
 }
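
The rewrite above replaces walks of the per-pset idle/active processor queues with per-state CPU bitmaps (cpu_state_map[], primary_map, recommended_bitmask) that are intersected and scanned with find-first-set. A standalone sketch of the same pattern, assuming lsb_first() behaves like find-first-set (returning -1 when empty) and bit_ror64() is a 64-bit rotate right; field values are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    static int lsb_first(uint64_t map) { return map ? __builtin_ctzll(map) : -1; }
    static uint64_t bit_ror64(uint64_t x, unsigned n)
    { n &= 63; return (x >> n) | (x << ((64 - n) & 63)); }

    int main(void)
    {
        uint64_t idle_map    = 0x00f0;   /* CPUs 4-7 idle */
        uint64_t primary_map = 0x00ff;   /* CPUs 0-7 are primaries */
        uint64_t recommended = 0x00fe;   /* CPU 0 not recommended */
        uint64_t pending_ast = 0x0010;   /* CPU 4 already has an AST pending */

        /* idle primary selection: one mask intersection plus find-first-set */
        uint64_t idle_primary = idle_map & primary_map & recommended & ~pending_ast;
        int cpuid = lsb_first(idle_primary);
        printf("chosen idle primary: %d\n", cpuid);            /* -> 5 */

        /* round-robin over running CPUs starting after last_chosen, mirroring the
         * bit_ror64 + lsb_next walk in the hunk above */
        uint64_t active_map = 0x000f;    /* CPUs 0-3 running */
        int last_chosen = 1;
        uint64_t rotated = bit_ror64(active_map, last_chosen + 1);
        int rotid = lsb_first(rotated);
        if (rotid >= 0)
            printf("next active to probe: %d\n", (rotid + last_chosen + 1) & 63); /* -> 2 */
        return 0;
    }
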
 
 /*
@@ -4208,21 +4245,22 @@ csw_check_locked(
                }
        }
 
-       result = SCHED(processor_csw_check)(processor);
-       if (result != AST_NONE)
-               return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
-
 #if __SMP__
-
        /*
-        * If the current thread is running on a processor that is no longer recommended, gently
-        * (non-urgently) get to a point and then block, and which point thread_select() should
+        * If the current thread is running on a processor that is no longer recommended,
+        * urgently preempt it, at which point thread_select() should
         * try to idle the processor and re-dispatch the thread to a recommended processor.
         */
        if (!processor->is_recommended) {
-               return (check_reason | AST_PREEMPT);
+               return (check_reason | AST_PREEMPT | AST_URGENT);
        }
+#endif
 
+       result = SCHED(processor_csw_check)(processor);
+       if (result != AST_NONE)
+               return (check_reason | result | (thread_eager_preemption(thread) ? AST_URGENT : AST_NONE));
+
+#if __SMP__
        /*
         * Same for avoid-processor
         *
@@ -4239,11 +4277,6 @@ csw_check_locked(
         * TODO: Should this do the same check that thread_select does? i.e.
         * if no bound threads target this processor, and idle primaries exist, preempt
         * The case of RT threads existing is already taken care of above
-        * Consider Capri in this scenario.
-        *
-        * if (!SCHED(processor_bound_count)(processor) && !queue_empty(&pset->idle_queue))
-        *
-        * TODO: Alternatively - check if only primary is idle, or check if primary's pri is lower than mine.
         */
 
        if (processor->current_pri < BASEPRI_RTQUEUES &&
@@ -4279,7 +4312,8 @@ csw_check_locked(
 void
 set_sched_pri(
               thread_t        thread,
-              int             new_priority)
+              int             new_priority,
+              set_sched_pri_options_t options)
 {
        thread_t cthread = current_thread();
        boolean_t is_current_thread = (thread == cthread) ? TRUE : FALSE;
@@ -4287,6 +4321,8 @@ set_sched_pri(
        uint64_t urgency_param1, urgency_param2;
        boolean_t removed_from_runq = FALSE;
 
+       bool lazy_update = ((options & SETPRI_LAZY) == SETPRI_LAZY);
+
        int old_priority = thread->sched_pri;
 
        /* If we're already at this priority, no need to mess with the runqueue */
@@ -4337,13 +4373,14 @@ set_sched_pri(
                         * If a thread raises its own priority, don't aggressively rebalance it.
                         * <rdar://problem/31699165>
                         */
-                       if (new_priority < old_priority) {
+                       if (!lazy_update && new_priority < old_priority) {
                                ast_t preempt;
 
                                if ((preempt = csw_check(processor, AST_NONE)) != AST_NONE)
                                        ast_on(preempt);
                        }
-               } else if (processor != PROCESSOR_NULL && processor->active_thread == thread) {
+               } else if (!lazy_update && processor != PROCESSOR_NULL &&
+                          processor != current_processor() && processor->active_thread == thread) {
                        cause_ast_check(processor);
                }
        }
@@ -4479,12 +4516,12 @@ thread_run_queue_reinsert(thread_t thread, integer_t options)
 }
 
 void
-sys_override_cpu_throttle(int flag)
+sys_override_cpu_throttle(boolean_t enable_override)
 {
-       if (flag == CPU_THROTTLE_ENABLE)
-               cpu_throttle_enabled = 1;
-       if (flag == CPU_THROTTLE_DISABLE)
+       if (enable_override)
                cpu_throttle_enabled = 0;
+       else
+               cpu_throttle_enabled = 1;
 }
 
 int
@@ -4574,10 +4611,13 @@ processor_idle(
 
        SCHED_STATS_CPU_IDLE_START(processor);
 
-       timer_switch(&PROCESSOR_DATA(processor, system_state),
-                                                                       mach_absolute_time(), &PROCESSOR_DATA(processor, idle_state));
+       uint64_t ctime = mach_absolute_time();
+
+       timer_switch(&PROCESSOR_DATA(processor, system_state), ctime, &PROCESSOR_DATA(processor, idle_state));
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, idle_state);
 
+       cpu_quiescent_counter_leave(ctime);
+
        while (1) {
                /*
                 * Ensure that updates to my processor and pset state,
@@ -4622,6 +4662,17 @@ processor_idle(
 
                (void)splsched();
 
+               /*
+                * Check if we should call sched_timeshare_consider_maintenance() here.
+                * The CPU was woken out of idle due to an interrupt and we should do the
+                * call only if the processor is still idle. If the processor is non-idle,
+                * the threads running on the processor would do the call as part of
+                * context switching.
+                */
+               if (processor->state == PROCESSOR_IDLE) {
+                       sched_timeshare_consider_maintenance(mach_absolute_time());
+               }
+
                IDLE_KERNEL_DEBUG_CONSTANT(
                        MACHDBG_CODE(DBG_MACH_SCHED,MACH_IDLE) | DBG_FUNC_NONE, (uintptr_t)thread_tid(thread), rt_runq_count(pset), SCHED(processor_runq_count)(processor), -2, 0);
 
@@ -4634,10 +4685,13 @@ processor_idle(
                }
        }
 
-       timer_switch(&PROCESSOR_DATA(processor, idle_state),
-                                                                       mach_absolute_time(), &PROCESSOR_DATA(processor, system_state));
+       ctime = mach_absolute_time();
+
+       timer_switch(&PROCESSOR_DATA(processor, idle_state), ctime, &PROCESSOR_DATA(processor, system_state));
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
 
+       cpu_quiescent_counter_join(ctime);
+
        pset_lock(pset);
 
        /* If we were sent a remote AST and came out of idle, acknowledge it here with pset lock held */
@@ -4653,7 +4707,7 @@ processor_idle(
                 */
                new_thread = processor->next_thread;
                processor->next_thread = THREAD_NULL;
-               processor->state = PROCESSOR_RUNNING;
+               pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
 
                if ((new_thread != THREAD_NULL) && (SCHED(processor_queue_has_priority)(processor, new_thread->sched_pri, FALSE)                                        ||
                                                                                        (rt_runq_count(pset) > 0))      ) {
@@ -4686,12 +4740,7 @@ processor_idle(
                return (new_thread);
 
        } else if (state == PROCESSOR_IDLE) {
-               re_queue_tail(&pset->active_queue, &processor->processor_queue);
-
-               pset->active_processor_count++;
-               sched_update_pset_load_average(pset);
-
-               processor->state = PROCESSOR_RUNNING;
+               pset_update_processor_state(pset, processor, PROCESSOR_RUNNING);
                processor_state_update_idle(processor);
                processor->deadline = UINT64_MAX;
 
@@ -4935,8 +4984,8 @@ sched_timeshare_maintenance_continue(void)
 #endif /* DEBUG || DEVELOPMENT */
 
        KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_MAINTENANCE) | DBG_FUNC_END,
-               sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
-               sched_pri_shifts[TH_BUCKET_SHARE_UT], 0, 0);
+               sched_pri_shifts[TH_BUCKET_SHARE_FG], sched_pri_shifts[TH_BUCKET_SHARE_BG],
+               sched_pri_shifts[TH_BUCKET_SHARE_UT], sched_pri_shifts[TH_BUCKET_SHARE_DF], 0);
 
        assert_wait((event_t)sched_timeshare_maintenance_continue, THREAD_UNINT);
        thread_block((thread_continue_t)sched_timeshare_maintenance_continue);
@@ -4957,14 +5006,17 @@ static uint64_t sched_maintenance_wakeups;
  */
 void
 sched_timeshare_consider_maintenance(uint64_t ctime) {
-       uint64_t ndeadline, deadline = sched_maintenance_deadline;
+
+       cpu_quiescent_counter_checkin(ctime);
+
+       uint64_t deadline = sched_maintenance_deadline;
 
        if (__improbable(ctime >= deadline)) {
                if (__improbable(current_thread() == sched_maintenance_thread))
                        return;
                OSMemoryBarrier();
 
-               ndeadline = ctime + sched_tick_interval;
+               uint64_t ndeadline = ctime + sched_tick_interval;
 
                if (__probable(__sync_bool_compare_and_swap(&sched_maintenance_deadline, deadline, ndeadline))) {
                        thread_wakeup((event_t)sched_timeshare_maintenance_continue);
@@ -4972,6 +5024,18 @@ sched_timeshare_consider_maintenance(uint64_t ctime) {
                }
        }
 
+       uint64_t load_compute_deadline = __c11_atomic_load(&sched_load_compute_deadline, memory_order_relaxed);
+
+       if (__improbable(load_compute_deadline && ctime >= load_compute_deadline)) {
+               uint64_t new_deadline = 0;
+               if (__c11_atomic_compare_exchange_strong(&sched_load_compute_deadline, &load_compute_deadline, new_deadline,
+                                                        memory_order_relaxed, memory_order_relaxed)) {
+                       compute_sched_load();
+                       new_deadline = ctime + sched_load_compute_interval_abs;
+                       __c11_atomic_store(&sched_load_compute_deadline, new_deadline, memory_order_relaxed);
+               }
+       }
+
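
The new block above uses a relaxed compare-and-swap on a shared deadline so that, out of all the CPUs calling in, exactly one runs compute_sched_load() per interval and then re-arms the deadline. A compileable sketch of that pattern; the names and interval are illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    static _Atomic uint64_t load_compute_deadline;        /* 0 means "someone is computing" */
    static const uint64_t   load_compute_interval = 1000; /* illustrative, not the xnu value */

    static bool maybe_compute_load(uint64_t now)
    {
        uint64_t deadline = atomic_load_explicit(&load_compute_deadline, memory_order_relaxed);
        if (deadline == 0 || now < deadline)
            return false;
        /* only the CPU that swaps the deadline to 0 does the work */
        if (!atomic_compare_exchange_strong_explicit(&load_compute_deadline, &deadline, 0,
                                                     memory_order_relaxed, memory_order_relaxed))
            return false;
        /* ... the compute_sched_load() equivalent would run here ... */
        atomic_store_explicit(&load_compute_deadline, now + load_compute_interval,
                              memory_order_relaxed);
        return true;
    }

    int main(void)
    {
        atomic_store_explicit(&load_compute_deadline, 500, memory_order_relaxed);
        maybe_compute_load(400);   /* too early: nothing happens */
        maybe_compute_load(600);   /* wins the race, re-arms the deadline to 1600 */
        return 0;
    }
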
 #if __arm64__
        uint64_t perf_deadline = __c11_atomic_load(&sched_perfcontrol_callback_deadline, memory_order_relaxed);
 
@@ -5241,7 +5305,8 @@ sched_timer_deadline_tracking_init(void) {
 
 uint32_t    perfcontrol_requested_recommended_cores = ALL_CORES_RECOMMENDED;
 uint32_t    perfcontrol_requested_recommended_core_count = MAX_CPUS;
-boolean_t   perfcontrol_failsafe_active = FALSE;
+bool        perfcontrol_failsafe_active = false;
+bool        perfcontrol_sleep_override = false;
 
 uint64_t    perfcontrol_failsafe_maintenance_runnable_time;
 uint64_t    perfcontrol_failsafe_activation_time;
@@ -5279,7 +5344,7 @@ sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
        perfcontrol_requested_recommended_cores = recommended_cores;
        perfcontrol_requested_recommended_core_count = __builtin_popcountll(recommended_cores);
 
-       if (perfcontrol_failsafe_active == FALSE)
+       if ((perfcontrol_failsafe_active == false) && (perfcontrol_sleep_override == false))
                sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
        else
                KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
@@ -5291,6 +5356,36 @@ sched_perfcontrol_update_recommended_cores(uint32_t recommended_cores)
        splx(s);
 }
 
+void
+sched_override_recommended_cores_for_sleep(void)
+{
+       spl_t s = splsched();
+       simple_lock(&sched_recommended_cores_lock);
+
+       if (perfcontrol_sleep_override == false) {
+               perfcontrol_sleep_override = true;
+               sched_update_recommended_cores(ALL_CORES_RECOMMENDED);
+       }
+
+       simple_unlock(&sched_recommended_cores_lock);
+       splx(s);
+}
+
+void
+sched_restore_recommended_cores_after_sleep(void)
+{
+       spl_t s = splsched();
+       simple_lock(&sched_recommended_cores_lock);
+
+       if (perfcontrol_sleep_override == true) {
+               perfcontrol_sleep_override = false;
+               sched_update_recommended_cores(perfcontrol_requested_recommended_cores);
+       }
+
+       simple_unlock(&sched_recommended_cores_lock);
+       splx(s);
+}
+
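
A hypothetical call sequence for the two new entry points above, based only on their names and bodies: the sleep path widens the recommendation mask so every core can help quiesce, and the wake path restores the last performance-controller request.

    /* prototypes matching the definitions above */
    extern void sched_override_recommended_cores_for_sleep(void);
    extern void sched_restore_recommended_cores_after_sleep(void);

    /* hypothetical_platform_sleep() is illustrative; the real caller lives in the
     * platform sleep/wake path, which this diff does not show */
    static void hypothetical_platform_sleep(void)
    {
        sched_override_recommended_cores_for_sleep();
        /* ... quiesce devices, save state, sleep, resume ... */
        sched_restore_recommended_cores_after_sleep();
    }
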
 /*
  * Consider whether we need to activate the recommended cores failsafe
  *
@@ -5506,12 +5601,11 @@ sched_update_recommended_cores(uint32_t recommended_cores)
        processor = processor_list;
        pset = processor->processor_set;
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-               MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
-               recommended_cores, perfcontrol_failsafe_active, 0, 0, 0);
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_START,
+               recommended_cores, perfcontrol_failsafe_active, 0, 0);
 
        if (__builtin_popcount(recommended_cores) == 0) {
-               recommended_cores |= 0x1U; /* add boot processor or we hang */
+               bit_set(recommended_cores, master_processor->cpu_id); /* add boot processor or we hang */
        }
 
        /* First set recommended cores */
@@ -5525,19 +5619,13 @@ sched_update_recommended_cores(uint32_t recommended_cores)
                        pset_lock(pset);
                }
 
-               pset->recommended_bitmask = recommended_cores;
-
-               if (recommended_cores & (1ULL << processor->cpu_id)) {
+               if (bit_test(recommended_cores, processor->cpu_id)) {
                        processor->is_recommended = TRUE;
+                       bit_set(pset->recommended_bitmask, processor->cpu_id);
 
                        if (processor->state == PROCESSOR_IDLE) {
-                               if (processor->processor_primary == processor) {
-                                       re_queue_head(&pset->idle_queue, &processor->processor_queue);
-                               } else {
-                                       re_queue_head(&pset->idle_secondary_queue, &processor->processor_queue);
-                               }
                                if (processor != current_processor()) {
-                                       needs_exit_idle_mask |= (1ULL << processor->cpu_id);
+                                       bit_set(needs_exit_idle_mask, processor->cpu_id);
                                }
                        }
                }
@@ -5558,16 +5646,28 @@ sched_update_recommended_cores(uint32_t recommended_cores)
                        pset_lock(pset);
                }
 
-               if (!(recommended_cores & (1ULL << processor->cpu_id))) {
+               if (!bit_test(recommended_cores, processor->cpu_id)) {
+                       sched_ipi_type_t ipi_type = SCHED_IPI_NONE;
+
                        processor->is_recommended = FALSE;
-                       if (processor->state == PROCESSOR_IDLE) {
-                               re_queue_head(&pset->unused_queue, &processor->processor_queue);
+                       bit_clear(pset->recommended_bitmask, processor->cpu_id);
+
+                       if ((processor->state == PROCESSOR_RUNNING) || (processor->state == PROCESSOR_DISPATCHING)) {
+                               ipi_type = SCHED_IPI_IMMEDIATE;
                        }
                        SCHED(processor_queue_shutdown)(processor);
                        /* pset unlocked */
 
                        SCHED(rt_queue_shutdown)(processor);
 
+                       if (ipi_type != SCHED_IPI_NONE) {
+                               if (processor == current_processor()) {
+                                       ast_on(AST_PREEMPT);
+                               } else {
+                                       sched_ipi_perform(processor, ipi_type);
+                               }
+                       }
+
                        pset_lock(pset);
                }
        } while ((processor = processor->processor_list) != NULL);
@@ -5579,9 +5679,8 @@ sched_update_recommended_cores(uint32_t recommended_cores)
                machine_signal_idle(processor);
        }
 
-       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-               MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
-                                                         needs_exit_idle_mask, 0, 0, 0, 0);
+       KDBG(MACHDBG_CODE(DBG_MACH_SCHED,MACH_SCHED_UPDATE_REC_CORES) | DBG_FUNC_END,
+                                                         needs_exit_idle_mask, 0, 0, 0);
 }
 #endif /* __arm__ || __arm64__ */
 
@@ -5661,25 +5760,10 @@ sched_perfcontrol_update_callback_deadline(uint64_t new_deadline)
 
 #endif /* __arm64__ */
 
-int
-sched_get_pset_load_average(processor_set_t pset)
-{
-       return pset->load_average >> (PSET_LOAD_NUMERATOR_SHIFT - PSET_LOAD_FRACTIONAL_SHIFT);
-}
-
 void
 sched_update_pset_load_average(processor_set_t pset)
 {
-#if DEBUG
-       queue_entry_t iter;
-       int count = 0;
-       qe_foreach(iter, &pset->active_queue) {
-               count++;
-       }
-       assertf(count == pset->active_processor_count, "count %d pset->active_processor_count %d\n", count, pset->active_processor_count);
-#endif
-
-       int load = ((pset->active_processor_count + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
+       int load = ((bit_count(pset->cpu_state_map[PROCESSOR_RUNNING]) + pset->pset_runq.count + rt_runq_count(pset)) << PSET_LOAD_NUMERATOR_SHIFT);
        int new_load_average = (pset->load_average + load) >> 1;
 
        pset->load_average = new_load_average;
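
With the active-processor queue gone, the pset load average above is now derived from the PROCESSOR_RUNNING bitmap plus the run-queue counts, folded into a fixed-point exponential moving average with weight 1/2. A worked example of that arithmetic; the shift value mirrors PSET_LOAD_NUMERATOR_SHIFT but is assumed here:

    #include <stdint.h>
    #include <stdio.h>

    #define LOAD_SHIFT 16   /* stands in for PSET_LOAD_NUMERATOR_SHIFT */

    int main(void)
    {
        int load_average = 0;
        int runnable[] = { 3, 3, 1, 0 };   /* running CPUs + queued threads per update */

        for (size_t i = 0; i < sizeof runnable / sizeof runnable[0]; i++) {
            int load = runnable[i] << LOAD_SHIFT;
            load_average = (load_average + load) >> 1;   /* EMA with weight 1/2 */
            printf("update %zu: load_average = %d (%.3f threads)\n",
                   i, load_average, (double)load_average / (1 << LOAD_SHIFT));
        }
        return 0;
    }
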
@@ -5761,7 +5845,22 @@ all_available_primaries_are_running_realtime_threads(processor_set_t pset)
                        continue;
                }
 
-               if ((processor->state != PROCESSOR_RUNNING) && (processor->state != PROCESSOR_DISPATCHING)) {
+               if (processor->state == PROCESSOR_IDLE) {
+                       return false;
+               }
+
+               if (processor->state == PROCESSOR_DISPATCHING) {
+                       return false;
+               }
+
+               if (processor->state != PROCESSOR_RUNNING) {
+                       /*
+                        * All other processor states are considered unavailable to run
+                        * realtime threads.  In particular, we prefer an available secondary
+                        * processor over the risk of leaving a realtime thread on the run queue
+                        * while waiting for a processor in PROCESSOR_START state,
+                        * which should anyway be a rare case.
+                        */
                        continue;
                }
 
@@ -5772,3 +5871,5 @@ all_available_primaries_are_running_realtime_threads(processor_set_t pset)
 
        return true;
 }
+
+
index c6361a9cac9ecd087b9506ddfb807743ada2ff00..bd67f68693baa28e326964b1062cd18c0179f460 100644 (file)
@@ -77,8 +77,6 @@
 
 #ifdef MACH_KERNEL_PRIVATE
 
-#include <mach/branch_predicates.h>
-
 /* Initialization */
 extern void            sched_init(void);
 
@@ -144,12 +142,23 @@ extern void                       thread_continue(
 extern void            call_continuation(
                                        thread_continue_t       continuation,
                                        void                            *parameter,
-                                       wait_result_t           wresult);
+                                       wait_result_t           wresult,
+                                       boolean_t           enable_interrupts);
+
+/*
+ * Flags that can be passed to set_sched_pri
+ * to skip side effects
+ */
+typedef enum {
+       SETPRI_DEFAULT  = 0x0,
+       SETPRI_LAZY     = 0x1,  /* Avoid setting AST flags or sending IPIs */
+} set_sched_pri_options_t;
 
 /* Set the current scheduled priority */
-extern void            set_sched_pri(
-                                       thread_t                thread,
-                                       int                             priority);
+extern void set_sched_pri(
+                          thread_t      thread,
+                          int           priority,
+                          set_sched_pri_options_t options);
 
 /* Set base priority of the specified thread */
 extern void            sched_set_thread_base_priority(
@@ -166,16 +175,22 @@ extern void             sched_thread_mode_demote(thread_t thread,
 extern void             sched_thread_mode_undemote(thread_t thread,
                                                    uint32_t reason);
 
+extern void sched_thread_promote_to_pri(thread_t thread, int priority, uintptr_t trace_obj);
+extern void sched_thread_update_promotion_to_pri(thread_t thread, int priority, uintptr_t trace_obj);
+extern void sched_thread_unpromote(thread_t thread, uintptr_t trace_obj);
+
+extern void assert_promotions_invariant(thread_t thread);
+
+extern void sched_thread_promote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj);
+extern void sched_thread_unpromote_reason(thread_t thread, uint32_t reason, uintptr_t trace_obj);
+
 /* Re-evaluate base priority of thread (thread locked) */
 void thread_recompute_priority(thread_t thread);
 
-/* Re-evaluate base priority of thread (thread unlocked) */
-void thread_recompute_qos(thread_t thread);
-
-/* Reset scheduled priority of thread */
-extern void            thread_recompute_sched_pri(
-                                       thread_t                thread,
-                                       boolean_t               override_depress);
+/* Re-evaluate scheduled priority of thread (thread locked) */
+extern void thread_recompute_sched_pri(
+                                       thread_t thread,
+                                       set_sched_pri_options_t options);
 
 /* Periodic scheduler activity */
 extern void            sched_init_thread(void (*)(void));
@@ -435,12 +450,6 @@ extern void        active_rt_threads(
 extern perfcontrol_class_t thread_get_perfcontrol_class(
                                        thread_t        thread);
 
-#define PSET_LOAD_NUMERATOR_SHIFT   16
-#define PSET_LOAD_FRACTIONAL_SHIFT   4
-
-extern int sched_get_pset_load_average(processor_set_t pset);
-extern void sched_update_pset_load_average(processor_set_t pset);
-
 /* Generic routine for Non-AMP schedulers to calculate parallelism */
 extern uint32_t sched_qos_max_parallelism(int qos, uint64_t options);
 
@@ -451,9 +460,7 @@ __BEGIN_DECLS
 #ifdef XNU_KERNEL_PRIVATE
 
 /* Toggles a global override to turn off CPU Throttling */
-#define CPU_THROTTLE_DISABLE   0
-#define CPU_THROTTLE_ENABLE    1
-extern void    sys_override_cpu_throttle(int flag);
+extern void    sys_override_cpu_throttle(boolean_t enable_override);
 
 /*
  ****************** Only exported until BSD stops using ********************
@@ -479,7 +486,11 @@ extern char sched_string[SCHED_STRING_MAX_LENGTH];
 extern thread_t port_name_to_thread_for_ulock(mach_port_name_t thread_name);
 
 /* Attempt to context switch to a specific runnable thread */
-extern wait_result_t thread_handoff(thread_t thread);
+extern wait_result_t thread_handoff_deallocate(thread_t thread);
+
+__attribute__((nonnull(1, 2)))
+extern void thread_handoff_parameter(thread_t thread,
+               thread_continue_t continuation, void *parameter) __dead2;
 
 extern struct waitq    *assert_wait_queue(event_t event);
 
@@ -498,9 +509,14 @@ extern void                thread_set_pending_block_hint(
 #define QOS_PARALLELISM_COUNT_LOGICAL   0x1
 #define QOS_PARALLELISM_REALTIME        0x2
 extern uint32_t qos_max_parallelism(int qos, uint64_t options);
-
 #endif /* KERNEL_PRIVATE */
 
+#if XNU_KERNEL_PRIVATE
+extern void            thread_yield_with_continuation(
+                                               thread_continue_t       continuation,
+                                               void                            *parameter) __dead2;
+#endif
+
 /* Context switch */
 extern wait_result_t   thread_block(
                                                        thread_continue_t       continuation);
@@ -582,8 +598,8 @@ extern boolean_t preemption_enabled(void);
  * For DEV & REL kernels, use a static dispatch table instead of 
  * using the indirect function table.
  */
-extern const struct sched_dispatch_table sched_multiq_dispatch;
-#define SCHED(f) (sched_multiq_dispatch.f)
+extern const struct sched_dispatch_table sched_dualq_dispatch;
+#define SCHED(f) (sched_dualq_dispatch.f)
 
 #endif /* DEBUG */
 
index 7bc3d43931095da6a224f5d92bda6b4a110c07da..02066c97e723d2e5b0b7a205b2d7663df8bc5965 100644 (file)
@@ -689,21 +689,18 @@ sched_traditional_steal_thread(processor_set_t pset)
        thread_t        thread;
 
        do {
-               processor = (processor_t)(uintptr_t)queue_first(&cset->active_queue);
-               while (!queue_end(&cset->active_queue, (queue_entry_t)processor)) {
+               uint64_t active_map = (pset->cpu_state_map[PROCESSOR_RUNNING] |
+                                      pset->cpu_state_map[PROCESSOR_DISPATCHING]);
+               for (int cpuid = lsb_first(active_map); cpuid >= 0; cpuid = lsb_next(active_map, cpuid)) {
+                       processor = processor_array[cpuid];
                        if (runq_for_processor(processor)->count > 0) {
                                thread = sched_traditional_steal_processor_thread(processor);
                                if (thread != THREAD_NULL) {
-                                       remqueue((queue_entry_t)processor);
-                                       enqueue_tail(&cset->active_queue, (queue_entry_t)processor);
-
                                        pset_unlock(cset);
 
                                        return (thread);
                                }
                        }
-
-                       processor = (processor_t)(uintptr_t)queue_next((queue_entry_t)processor);
                }
 
                nset = next_pset(cset);
index 80fa2c105bd21096ec39b71ab277afa900d86ef6..139a6798e1b6141ff406281b235abc859dc708a2 100644 (file)
@@ -44,6 +44,8 @@
 #include <kern/ledger.h>
 #include <kern/policy_internal.h>
 
+#include <machine/atomic.h>
+
 #include <pexpert/pexpert.h>
 
 #include <libkern/kernel_mach_header.h>
 #define dprintf(...) do { } while(0)
 #endif
 
-#ifdef MACH_BSD
-extern sched_call_t workqueue_get_sched_callback(void);
-#endif /* MACH_BSD */
-
 /*
  * SFI (Selective Forced Idle) operates by enabling a global
  * timer on the SFI window interval. When it fires, all processors
@@ -131,36 +129,43 @@ typedef struct {
  * 5) Modify thermald to use the SFI class
  */
 
-static inline void _sfi_wait_cleanup(sched_call_t callback);
-
-#define SFI_CLASS_REGISTER(class_id, ledger_name)                                      \
-extern char compile_time_assert_ ## class_id[SFI_CLASS_ ## class_id < MAX_SFI_CLASS_ID ? 1 : -1];  \
-void __attribute__((noinline,noreturn)) SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused); \
-void SFI_ ## class_id ## _THREAD_IS_WAITING(void *callback, wait_result_t wret __unused) \
-{                                                                                                                                              \
-       _sfi_wait_cleanup(callback);                                                                            \
-       thread_exception_return();                                                                                      \
-}                                                                                                                                              \
-                                                                                                                                               \
-sfi_class_registration_t SFI_ ## class_id ## _registration __attribute__((section("__DATA,__sfi_class_reg"),used)) = { SFI_CLASS_ ## class_id, SFI_ ## class_id ## _THREAD_IS_WAITING, "SFI_CLASS_" # class_id, "SFI_CLASS_" # ledger_name };
+static inline void _sfi_wait_cleanup(void);
+
+#define SFI_CLASS_REGISTER(clsid, ledger_name)                                                         \
+static void __attribute__((noinline, noreturn))                                                                \
+SFI_ ## clsid ## _THREAD_IS_WAITING(void *arg __unused, wait_result_t wret __unused) \
+{                                                                                                                                                      \
+       _sfi_wait_cleanup();                                                                                                    \
+       thread_exception_return();                                                                                              \
+}                                                                                                                                                      \
+                                                                                                                                                       \
+_Static_assert(SFI_CLASS_ ## clsid < MAX_SFI_CLASS_ID, "Invalid ID");          \
+                                                                                                                                                       \
+__attribute__((section("__DATA,__sfi_class_reg"), used))                                       \
+static sfi_class_registration_t SFI_ ## clsid ## _registration = {                     \
+       .class_id = SFI_CLASS_ ## clsid,                                                                                \
+       .class_continuation = SFI_ ## clsid ## _THREAD_IS_WAITING,                              \
+       .class_name = "SFI_CLASS_" # clsid,                                                                             \
+       .class_ledger_name = "SFI_CLASS_" # ledger_name,                                                \
+}
 
 /* SFI_CLASS_UNSPECIFIED not included here */
-SFI_CLASS_REGISTER(MAINTENANCE,               MAINTENANCE)
-SFI_CLASS_REGISTER(DARWIN_BG,                 DARWIN_BG)
-SFI_CLASS_REGISTER(APP_NAP,                   APP_NAP)
-SFI_CLASS_REGISTER(MANAGED_FOCAL,             MANAGED)
-SFI_CLASS_REGISTER(MANAGED_NONFOCAL,          MANAGED)
-SFI_CLASS_REGISTER(UTILITY,                   UTILITY)
-SFI_CLASS_REGISTER(DEFAULT_FOCAL,             DEFAULT)
-SFI_CLASS_REGISTER(DEFAULT_NONFOCAL,          DEFAULT)
-SFI_CLASS_REGISTER(LEGACY_FOCAL,              LEGACY)
-SFI_CLASS_REGISTER(LEGACY_NONFOCAL,           LEGACY)
-SFI_CLASS_REGISTER(USER_INITIATED_FOCAL,      USER_INITIATED)
-SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL,   USER_INITIATED)
-SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL,    USER_INTERACTIVE)
-SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE)
-SFI_CLASS_REGISTER(KERNEL,                    OPTED_OUT)
-SFI_CLASS_REGISTER(OPTED_OUT,                 OPTED_OUT)
+SFI_CLASS_REGISTER(MAINTENANCE,               MAINTENANCE);
+SFI_CLASS_REGISTER(DARWIN_BG,                 DARWIN_BG);
+SFI_CLASS_REGISTER(APP_NAP,                   APP_NAP);
+SFI_CLASS_REGISTER(MANAGED_FOCAL,             MANAGED);
+SFI_CLASS_REGISTER(MANAGED_NONFOCAL,          MANAGED);
+SFI_CLASS_REGISTER(UTILITY,                   UTILITY);
+SFI_CLASS_REGISTER(DEFAULT_FOCAL,             DEFAULT);
+SFI_CLASS_REGISTER(DEFAULT_NONFOCAL,          DEFAULT);
+SFI_CLASS_REGISTER(LEGACY_FOCAL,              LEGACY);
+SFI_CLASS_REGISTER(LEGACY_NONFOCAL,           LEGACY);
+SFI_CLASS_REGISTER(USER_INITIATED_FOCAL,      USER_INITIATED);
+SFI_CLASS_REGISTER(USER_INITIATED_NONFOCAL,   USER_INITIATED);
+SFI_CLASS_REGISTER(USER_INTERACTIVE_FOCAL,    USER_INTERACTIVE);
+SFI_CLASS_REGISTER(USER_INTERACTIVE_NONFOCAL, USER_INTERACTIVE);
+SFI_CLASS_REGISTER(KERNEL,                    OPTED_OUT);
+SFI_CLASS_REGISTER(OPTED_OUT,                 OPTED_OUT);
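
The reworked SFI_CLASS_REGISTER() above emits each registration as a designated-initializer struct in a dedicated __DATA section and replaces the old negative-array-size trick with a _Static_assert. A simplified, self-contained version of the same pattern; the types, section name, and IDs are stand-ins, and the section syntax is Mach-O style:

    #include <stdint.h>

    typedef void (*wait_continuation_t)(void *arg, int wret);

    typedef struct {
        uint32_t            class_id;
        wait_continuation_t class_continuation;
        const char         *class_name;
        const char         *class_ledger_name;
    } class_registration_t;

    enum { CLASS_DEMO = 1, MAX_CLASS_ID = 16 };

    #define CLASS_REGISTER(clsid, ledger)                                     \
    static void clsid ## _thread_is_waiting(void *arg, int wret)              \
    { (void)arg; (void)wret; }                                                \
    _Static_assert(CLASS_ ## clsid < MAX_CLASS_ID, "Invalid ID");             \
    __attribute__((section("__DATA,__demo_class_reg"), used))                 \
    static class_registration_t clsid ## _registration = {                    \
        .class_id           = CLASS_ ## clsid,                                \
        .class_continuation = clsid ## _thread_is_waiting,                    \
        .class_name         = "CLASS_" #clsid,                                \
        .class_ledger_name  = "CLASS_" #ledger,                               \
    }

    CLASS_REGISTER(DEMO, DEMO_LEDGER);
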
 
 struct sfi_class_state {
        uint64_t        off_time_usecs;
@@ -788,12 +793,15 @@ sfi_class_id_t sfi_thread_classify(thread_t thread)
                break;
        case TASK_BACKGROUND_APPLICATION:
        case TASK_DEFAULT_APPLICATION:
-       case TASK_THROTTLE_APPLICATION:
        case TASK_UNSPECIFIED:
                /* Focal if the task is in a coalition with a FG/focal app */
                if (task_coalition_focal_count(thread->task) > 0)
                        focal = TRUE;
                break;
+       case TASK_THROTTLE_APPLICATION:
+       case TASK_DARWINBG_APPLICATION:
+       case TASK_NONUI_APPLICATION:
+               /* Definitely not focal */
        default:
                break;
        }
@@ -894,29 +902,50 @@ ast_t sfi_processor_needs_ast(processor_t processor)
 
 }
 
-static inline void _sfi_wait_cleanup(sched_call_t callback) {
+static inline void _sfi_wait_cleanup(void)
+{
        thread_t self = current_thread();
-       sfi_class_id_t current_sfi_wait_class = SFI_CLASS_UNSPECIFIED;
-       int64_t sfi_wait_time, sfi_wait_begin = 0;
 
        spl_t s = splsched();
-       thread_lock(self);
-       if (callback) {
-               thread_sched_call(self, callback);
-       }
-       sfi_wait_begin = self->wait_sfi_begin_time;
-       thread_unlock(self);
-
        simple_lock(&sfi_lock);
-       sfi_wait_time = mach_absolute_time() - sfi_wait_begin;
-       current_sfi_wait_class = self->sfi_wait_class;
+
+       sfi_class_id_t current_sfi_wait_class = self->sfi_wait_class;
+
+       assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) &&
+              (current_sfi_wait_class < MAX_SFI_CLASS_ID));
+
        self->sfi_wait_class = SFI_CLASS_UNSPECIFIED;
+
        simple_unlock(&sfi_lock);
        splx(s);
-       assert((SFI_CLASS_UNSPECIFIED < current_sfi_wait_class) && (current_sfi_wait_class < MAX_SFI_CLASS_ID));
-#if !CONFIG_EMBEDDED   
-       ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class], sfi_wait_time);
+
+       /*
+        * It's possible for the thread to be woken up due to the SFI period
+        * ending *before* it finishes blocking. In that case,
+        * wait_sfi_begin_time won't be set.
+        *
+        * Derive the time sacrificed to SFI by looking at when this thread was
+        * awoken by the on-timer, to avoid counting the time this thread spent
+        * waiting to get scheduled.
+        *
+        * Note that last_made_runnable_time could be reset if this thread
+        * gets preempted before we read the value. To fix that, we'd need to
+        * track wait time in a thread timer, sample the timer before blocking,
+        * pass the value through thread->parameter, and subtract that.
+        */
+
+       if (self->wait_sfi_begin_time != 0) {
+#if !CONFIG_EMBEDDED
+               uint64_t made_runnable = os_atomic_load(&self->last_made_runnable_time, relaxed);
+               int64_t sfi_wait_time = made_runnable - self->wait_sfi_begin_time;
+               assert(sfi_wait_time >= 0);
+
+               ledger_credit(self->task->ledger, task_ledgers.sfi_wait_times[current_sfi_wait_class],
+                             sfi_wait_time);
 #endif /* !CONFIG_EMBEDDED */
+
+               self->wait_sfi_begin_time = 0;
+       }
 }
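
A worked example of the ledger arithmetic described in the comment above: the credited SFI wait runs from when the thread parked for SFI to when the on-timer made it runnable again, deliberately excluding any scheduling delay after wakeup. The values are illustrative:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t wait_sfi_begin_time     = 1000;  /* thread blocked for SFI */
        uint64_t last_made_runnable_time = 1700;  /* SFI on-timer woke the class */
        /* the thread actually ran again at t=1900; those 200 units of
         * scheduling delay are not charged to the SFI ledger */

        int64_t sfi_wait_time = (int64_t)(last_made_runnable_time - wait_sfi_begin_time);
        assert(sfi_wait_time >= 0);
        printf("credit %lld time units to sfi_wait_times\n", (long long)sfi_wait_time);
        return 0;
    }
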
 
 /*
@@ -932,9 +961,7 @@ void sfi_ast(thread_t thread)
        struct sfi_class_state  *sfi_class;
        wait_result_t   waitret;
        boolean_t       did_wait = FALSE;
-       uint64_t        tid;
        thread_continue_t       continuation;
-       sched_call_t    workq_callback = workqueue_get_sched_callback();
 
        s = splsched();
 
@@ -955,7 +982,7 @@ void sfi_ast(thread_t thread)
 
        thread_lock(thread);
        thread->sfi_class = class_id = sfi_thread_classify(thread);
-       tid = thread_tid(thread);
+       thread_unlock(thread);
 
        /*
         * Once the sfi_lock is taken and the thread's ->sfi_class field is updated, we
@@ -967,23 +994,15 @@ void sfi_ast(thread_t thread)
         * classification.
         */
 
-       /* Optimistically clear workq callback while thread is already locked */
-       if (workq_callback && (thread->sched_call == workq_callback)) {
-               thread_sched_call(thread, NULL);
-       } else {
-               workq_callback = NULL;
-       }
-       thread_unlock(thread);
-
        sfi_class = &sfi_classes[class_id];
        if (!sfi_class->class_in_on_phase) {
                /* Need to block thread in wait queue */
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER), tid, class_id, 0, 0, 0);
+               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SFI, SFI_THREAD_DEFER),
+                               thread_tid(thread), class_id, 0, 0, 0);
 
                waitret = waitq_assert_wait64(&sfi_class->waitq,
                                              CAST_EVENT64_T(class_id),
-                                             THREAD_INTERRUPTIBLE,
-                                             0);
+                                             THREAD_INTERRUPTIBLE | THREAD_WAIT_NOREPORT, 0);
                if (waitret == THREAD_WAITING) {
                        thread->sfi_wait_class = class_id;
                        did_wait = TRUE;
@@ -994,13 +1013,13 @@ void sfi_ast(thread_t thread)
                }
        }
        simple_unlock(&sfi_lock);
-       
+
        splx(s);
 
        if (did_wait) {
-               thread_block_reason(continuation, workq_callback, AST_SFI);
-       } else if (workq_callback) {
-               thread_reenable_sched_call(thread, workq_callback);
+               assert(thread->wait_sfi_begin_time == 0);
+
+               thread_block_reason(continuation, NULL, AST_SFI);
        }
 }
 
index 8ef311a88731d955abb19258ddcad0e6812b410a..b66313f7fb56c6bc82ac4048922fe35bc1b161d5 100644 (file)
@@ -84,9 +84,15 @@ extern void                  hw_lock_init(
 extern void                    hw_lock_lock(
                                        hw_lock_t);
 
+extern void                    hw_lock_lock_nopreempt(
+                                       hw_lock_t);
+
 extern void                    hw_lock_unlock(
                                        hw_lock_t);
 
+extern void                    hw_lock_unlock_nopreempt(
+                                       hw_lock_t);
+
 extern unsigned int            hw_lock_to(
                                        hw_lock_t,
                                        uint64_t);
@@ -94,6 +100,9 @@ extern unsigned int          hw_lock_to(
 extern unsigned int            hw_lock_try(
                                        hw_lock_t);
 
+extern unsigned int            hw_lock_try_nopreempt(
+                                       hw_lock_t);
+
 extern unsigned int            hw_lock_held(
                                        hw_lock_t);
 
index 18db3f24b1cb84c8b548accbe34f2b392cabdf24..c56a597bc042aafa6a77be307ff9db27ff062f73 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2003-2007 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
@@ -100,7 +100,7 @@ log2(vm_offset_t size)
 static inline vm_offset_t
 roundup_pow2(vm_offset_t size)
 {
-       return 1UL << (log2(size - 1) + 1); 
+       return 1UL << (log2(size - 1) + 1);
 }
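
A quick check of the helper above: with an integer floor log2, roundup_pow2() returns the smallest power of two that is >= size (undefined for size < 2, which kernel stack sizes never are). The sketch renames log2 to ilog2 to avoid the libm symbol:

    #include <stdio.h>

    static unsigned long ilog2(unsigned long size)          /* floor log2, size >= 1 */
    { return (8 * sizeof(size) - 1) - __builtin_clzl(size); }

    static unsigned long roundup_pow2(unsigned long size)   /* size >= 2 */
    { return 1UL << (ilog2(size - 1) + 1); }

    int main(void)
    {
        printf("%lx -> %lx\n", 0x3000UL, roundup_pow2(0x3000UL));  /* 0x3000 -> 0x4000 */
        printf("%lx -> %lx\n", 0x4000UL, roundup_pow2(0x4000UL));  /* 0x4000 -> 0x4000 */
        return 0;
    }
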
 
 static vm_offset_t stack_alloc_internal(void);
@@ -110,7 +110,7 @@ void
 stack_init(void)
 {
        simple_lock_init(&stack_lock_data, 0);
-       
+
        kernel_stack_pages = KERNEL_STACK_SIZE / PAGE_SIZE;
        kernel_stack_size = KERNEL_STACK_SIZE;
        kernel_stack_mask = -KERNEL_STACK_SIZE;
@@ -127,7 +127,7 @@ stack_init(void)
        if (kernel_stack_size < round_page(kernel_stack_size))
                panic("stack_init: stack size %p not a multiple of page size %d\n",
                        (void *) kernel_stack_size, PAGE_SIZE);
-       
+
        stack_addr_mask = roundup_pow2(kernel_stack_size) - 1;
        kernel_stack_mask = ~stack_addr_mask;
 }
@@ -139,7 +139,7 @@ stack_init(void)
  *     block.
  */
 
-static vm_offset_t 
+static vm_offset_t
 stack_alloc_internal(void)
 {
        vm_offset_t             stack = 0;
@@ -163,7 +163,7 @@ stack_alloc_internal(void)
        stack_free_delta--;
        stack_unlock();
        splx(s);
-               
+
        if (stack == 0) {
 
                /*
@@ -172,7 +172,7 @@ stack_alloc_internal(void)
                 * for these.
                 */
 
-               flags = KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT;
+               flags = KMA_GUARD_FIRST | KMA_GUARD_LAST | KMA_KSTACK | KMA_KOBJECT | KMA_ZERO;
                kr = kernel_memory_allocate(kernel_map, &stack,
                                           kernel_stack_size + (2*PAGE_SIZE),
                                           stack_addr_mask,
@@ -219,11 +219,6 @@ stack_free(
 {
     vm_offset_t                stack = machine_stack_detach(thread);
 
-#if KASAN
-       kasan_unpoison_stack(stack, kernel_stack_size);
-       kasan_unpoison_fakestack(thread);
-#endif
-
        assert(stack);
        if (stack != thread->reserved_stack) {
                stack_free_stack(stack);
@@ -235,9 +230,6 @@ stack_free_reserved(
        thread_t        thread)
 {
        if (thread->reserved_stack != thread->kernel_stack) {
-#if KASAN
-               kasan_unpoison_stack(thread->reserved_stack, kernel_stack_size);
-#endif
                stack_free_stack(thread->reserved_stack);
        }
 }
@@ -249,6 +241,11 @@ stack_free_stack(
        struct stack_cache      *cache;
        spl_t                           s;
 
+#if KASAN_DEBUG
+       /* Sanity check - stack should be unpoisoned by now */
+       assert(kasan_check_shadow(stack, kernel_stack_size, 0));
+#endif
+
        s = splsched();
        cache = &PROCESSOR_DATA(current_processor(), stack_cache);
        if (cache->count < STACK_CACHE_SIZE) {
@@ -416,7 +413,7 @@ stack_fake_zone_init(int zone_index)
 }
 
 void
-stack_fake_zone_info(int *count, 
+stack_fake_zone_info(int *count,
                     vm_size_t *cur_size, vm_size_t *max_size, vm_size_t *elem_size, vm_size_t *alloc_size,
                     uint64_t *sum_size, int *collectable, int *exhaustable, int *caller_acct)
 {
index c9cfd5167356d5a3f007e7b60ce31dbcf598c865..c0693a1178289a34bed330143a754521469a2140 100644 (file)
 #include <kern/clock.h>
 #include <kern/coalition.h>
 #include <kern/cpu_number.h>
+#include <kern/cpu_quiesce.h>
 #include <kern/ledger.h>
 #include <kern/machine.h>
 #include <kern/processor.h>
 #include <kern/sched_prim.h>
+#include <kern/turnstile.h>
 #if CONFIG_SCHED_SFI
 #include <kern/sfi.h>
 #endif
 #include <sys/kdebug.h>
 #include <sys/random.h>
 #include <sys/ktrace.h>
+#include <libkern/section_keywords.h>
 
 #include <kern/ltable.h>
 #include <kern/waitq.h>
 #include <ipc/ipc_voucher.h>
 #include <voucher/ipc_pthread_priority_internal.h>
 #include <mach/host_info.h>
+#include <pthread/workqueue_internal.h>
 
+#if CONFIG_XNUPOST
+#include <tests/ktest.h>
+#include <tests/xnupost.h>
+#endif
 
 #if CONFIG_ATM
 #include <atm/atm_internal.h>
@@ -182,8 +190,13 @@ extern void cpu_physwindow_init(int);
 #include <i386/vmx/vmx_cpu.h>
 #endif
 
+#if CONFIG_DTRACE
+extern void dtrace_early_init(void);
+extern void sdt_early_init(void);
+#endif
+
 // libkern/OSKextLib.cpp
-extern void    OSKextRemoveKextBootstrap(void);
+extern void OSKextRemoveKextBootstrap(void);
 
 void scale_setup(void);
 extern void bsd_scale_setup(int);
@@ -207,6 +220,11 @@ unsigned int trace_wrap = 0;
 boolean_t trace_serial = FALSE;
 boolean_t early_boot_complete = FALSE;
 
+/* physically contiguous carveouts */
+SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout = 0;
+SECURITY_READ_ONLY_LATE(uintptr_t) phys_carveout_pa = 0;
+SECURITY_READ_ONLY_LATE(size_t) phys_carveout_size = 0;
+
 /* mach leak logging */
 int log_leaks = 0;
 
@@ -301,9 +319,14 @@ kernel_bootstrap(void)
        csr_init();
 #endif
 
-       if (PE_i_can_has_debugger(NULL) &&
-           PE_parse_boot_argn("-show_pointers", &namep, sizeof (namep))) {
-               doprnt_hide_pointers = FALSE;
+       if (PE_i_can_has_debugger(NULL)) {
+               if (PE_parse_boot_argn("-show_pointers", &namep, sizeof(namep))) {
+                       doprnt_hide_pointers = FALSE;
+               }
+               if (PE_parse_boot_argn("-no_slto_panic", &namep, sizeof(namep))) {
+                       extern boolean_t spinlock_timeout_panic;
+                       spinlock_timeout_panic = FALSE;
+               }
        }
 
        kernel_bootstrap_log("console_init");
@@ -365,6 +388,12 @@ kernel_bootstrap(void)
        kernel_bootstrap_log("thread_init");
        thread_init();
 
+       kernel_bootstrap_log("workq_init");
+       workq_init();
+
+       kernel_bootstrap_log("turnstiles_init");
+       turnstiles_init();
+
 #if CONFIG_ATM
        /* Initialize the Activity Trace Resource Manager. */
        kernel_bootstrap_log("atm_init");
@@ -497,9 +526,25 @@ kernel_bootstrap_thread(void)
        cpu_physwindow_init(0);
 #endif
 
+       if (PE_i_can_has_debugger(NULL)) {
+               unsigned int phys_carveout_mb = 0;
+               if (PE_parse_boot_argn("phys_carveout_mb", &phys_carveout_mb,
+                               sizeof(phys_carveout_mb)) && phys_carveout_mb > 0) {
+                       phys_carveout_size = phys_carveout_mb * 1024 * 1024;
+                       kern_return_t kr = kmem_alloc_contig(kernel_map,
+                                       (vm_offset_t *)&phys_carveout, phys_carveout_size,
+                                       VM_MAP_PAGE_MASK(kernel_map), 0, 0, KMA_NOPAGEWAIT,
+                                       VM_KERN_MEMORY_DIAG);
+                       if (kr != KERN_SUCCESS) {
+                               kprintf("failed to allocate %uMB for phys_carveout_mb: %u\n",
+                                               phys_carveout_mb, (unsigned int)kr);
+                       } else {
+                               phys_carveout_pa = kvtophys((vm_offset_t)phys_carveout);
+                       }
+               }
+       }
 
-       
-#if MACH_KDP 
+#if MACH_KDP
        kernel_bootstrap_log("kdp_init");
        kdp_init();
 #endif
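
The hunk above reserves a physically contiguous carveout via kmem_alloc_contig() when the phys_carveout_mb boot-arg is set on a debug-enabled device, recording its virtual base, physical base, and size in the three new globals. A minimal sketch of a consumer of those globals (hypothetical helper, not in this commit):

/* TRUE when [addr, addr + len) lies entirely inside the reserved carveout. */
static boolean_t
addr_in_phys_carveout(uintptr_t addr, size_t len)
{
	if (phys_carveout == 0 || phys_carveout_size == 0 ||
	    len > phys_carveout_size) {
		return FALSE;
	}
	return (addr >= phys_carveout &&
	    addr - phys_carveout <= phys_carveout_size - len);
}
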
@@ -534,16 +579,13 @@ kernel_bootstrap_thread(void)
 
        kdebug_init(new_nkdbufs, trace_typefilter, trace_wrap);
 
-       kernel_bootstrap_log("prng_init");
-       prng_cpu_init(master_cpu);
-
 #ifdef MACH_BSD
        kernel_bootstrap_log("bsd_early_init");
        bsd_early_init();
 #endif
 
 #if defined(__arm64__)
-    ml_lockdown_init();
+       ml_lockdown_init();
 #endif
 
 #ifdef IOKIT
@@ -591,9 +633,22 @@ kernel_bootstrap_thread(void)
 #endif
 #endif
 
+#if CONFIG_DTRACE
+       dtrace_early_init();
+       sdt_early_init();
+#endif
+
+
+       /*
+        * Get rid of segments used to bootstrap kext loading. This removes
+        * the KLD, PRELINK symtab, LINKEDIT, and symtab segments/load commands.
+        * Must be done prior to lockdown so that we can free (and possibly relocate)
+        * the static KVA mappings used for the jettisoned bootstrap segments.
+        */
+       OSKextRemoveKextBootstrap();
 #if defined(__arm__) || defined(__arm64__)
 #if CONFIG_KERNEL_INTEGRITY
-        machine_lockdown_preflight();
+       machine_lockdown_preflight();
 #endif
        /*
         *  Finalize protections on statically mapped pages now that comm page mapping is established.
@@ -627,6 +682,14 @@ kernel_bootstrap_thread(void)
        vm_set_restrictions();
 
 
+#ifdef CONFIG_XNUPOST
+       kern_return_t result = kernel_list_tests();
+       result = kernel_do_post();
+       if (result != KERN_SUCCESS) {
+               panic("kernel_do_post: Tests failed with result = 0x%08x\n", result);
+       }
+       kernel_bootstrap_log("kernel_do_post - done");
+#endif /* CONFIG_XNUPOST */
 
 
        /*
@@ -636,11 +699,6 @@ kernel_bootstrap_thread(void)
        bsd_init();
 #endif
 
-    /*
-     * Get rid of segments used to bootstrap kext loading. This removes
-     * the KLD, PRELINK symtab, LINKEDIT, and symtab segments/load commands.
-     */
-       OSKextRemoveKextBootstrap();
 
        /*
         * Get rid of pages used for early boot tracing.
@@ -772,6 +830,8 @@ load_context(
        PROCESSOR_DATA(processor, current_state) = &PROCESSOR_DATA(processor, system_state);
 
 
+       cpu_quiescent_counter_join(processor->last_dispatch);
+
        PMAP_ACTIVATE_USER(thread, processor->cpu_id);
 
        load_context_kprintf("machine_load_context\n");
index 0a6f7b33decd0158d2466bea1baf83999d097eb0..98f33ba8d69dae1e37e6ce1111ee1a92ec1a5911 100644 (file)
@@ -66,6 +66,8 @@ static unsigned int semaphore_event;
 zone_t semaphore_zone;
 unsigned int semaphore_max;
 
+os_refgrp_decl(static, sema_refgrp, "semaphore", NULL);
+
 /* Forward declarations */
 
 
@@ -184,7 +186,7 @@ semaphore_create(
         * Initialize the semaphore values.
         */
        s->port = IP_NULL;
-       s->ref_count = 1;
+       os_ref_init(&s->ref_count, &sema_refgrp);
        s->count = value;
        s->active = TRUE;
        s->owner = task;
@@ -280,11 +282,12 @@ semaphore_destroy(
 
        if (semaphore->owner != task) {
                semaphore_unlock(semaphore);
+               semaphore_dereference(semaphore);
                splx(spl_level);
                task_unlock(task);
                return KERN_INVALID_ARGUMENT;
        }
-                       
+
        semaphore_destroy_internal(task, semaphore);
        /* semaphore unlocked */
 
@@ -1105,7 +1108,7 @@ void
 semaphore_reference(
        semaphore_t             semaphore)
 {
-       (void)hw_atomic_add(&semaphore->ref_count, 1);
+       os_ref_retain(&semaphore->ref_count);
 }
 
 /*
@@ -1124,8 +1127,9 @@ semaphore_dereference(
        if (semaphore == NULL)
                return;
 
-       if (hw_atomic_sub(&semaphore->ref_count, 1) != 0)
+       if (os_ref_release(&semaphore->ref_count) > 0) {
                return;
+       }
 
        /*
         * Last ref, clean up the port [if any]
index 2187c6bae69b5e795088f390d4deb448b3e38a03..144e925d305bc12811683758c3b5a4b1f6ae10e4 100644 (file)
 
 #include <kern/queue.h>
 #include <kern/waitq.h>
+#include <os/refcnt.h>
 
 typedef struct semaphore {
        queue_chain_t     task_link;  /* chain of semaphores owned by a task */
        struct waitq      waitq;      /* queue of blocked threads & lock     */
        task_t            owner;      /* task that owns semaphore            */
        ipc_port_t        port;       /* semaphore port                      */
-       uint32_t          ref_count;  /* reference count                     */
+       os_refcnt_t       ref_count;  /* reference count                     */
        int               count;      /* current count value                 */
        boolean_t         active;     /* active status                       */
 } Semaphore;
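
The semaphore reference count moves from a raw uint32_t updated with hw_atomic_add/hw_atomic_sub to the os_refcnt_t type from <os/refcnt.h>, which attributes retains and releases to a named refgroup. A minimal sketch of the same pattern, using only the calls already shown above (the object and group names are illustrative):

#include <os/refcnt.h>

os_refgrp_decl(static, example_refgrp, "example", NULL);

struct example_object {
	os_refcnt_t eo_refs;
	/* ... payload ... */
};

static void
example_object_init(struct example_object *eo)
{
	os_ref_init(&eo->eo_refs, &example_refgrp);   /* count starts at 1 */
}

static void
example_object_reference(struct example_object *eo)
{
	os_ref_retain(&eo->eo_refs);
}

static void
example_object_release(struct example_object *eo)
{
	if (os_ref_release(&eo->eo_refs) > 0) {
		return;                               /* other references remain */
	}
	/* last reference dropped: tear the object down here */
}
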
index 27961d85cb50f9fef9d587fdca3bcfd36408d0e6..1732d7ab23e51e526824eb44bb4c61cc60a7cea3 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
@@ -53,8 +53,6 @@
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
-/*
- */
 
 #include <mach/boolean.h>
 #include <mach/thread_switch.h>
 #include <sys/kdebug.h>
 #include <kern/ast.h>
 
-#ifdef MACH_BSD
-extern void workqueue_thread_yielded(void);
-extern sched_call_t workqueue_get_sched_callback(void);
-#endif /* MACH_BSD */
-
-extern wait_result_t thread_handoff_reason(thread_t thread, ast_t reason);
+static void thread_depress_abstime(uint64_t interval);
+static void thread_depress_ms(mach_msg_timeout_t interval);
 
 /* Called from commpage to take a delayed preemption when exiting
  * the "Preemption Free Zone" (PFZ).
@@ -125,7 +119,6 @@ swtch(
        __unused struct swtch_args *args)
 {
        processor_t     myprocessor;
-       boolean_t                               result;
 
        disable_preemption();
        myprocessor = current_processor();
@@ -138,14 +131,7 @@ swtch(
 
        counter(c_swtch_block++);
 
-       thread_block_reason((thread_continue_t)swtch_continue, NULL, AST_YIELD);
-
-       disable_preemption();
-       myprocessor = current_processor();
-       result = SCHED(thread_should_yield)(myprocessor, current_thread());
-       enable_preemption();
-
-       return (result);
+       thread_yield_with_continuation((thread_continue_t)swtch_continue, NULL);
 }
 
 static void
@@ -154,7 +140,7 @@ swtch_pri_continue(void)
        processor_t     myprocessor;
        boolean_t       result;
 
-       thread_depress_abort_internal(current_thread());
+       thread_depress_abort(current_thread());
 
        disable_preemption();
        myprocessor = current_processor();
@@ -170,7 +156,6 @@ swtch_pri(
 __unused       struct swtch_pri_args *args)
 {
        processor_t     myprocessor;
-       boolean_t                               result;
 
        disable_preemption();
        myprocessor = current_processor();
@@ -185,45 +170,17 @@ __unused  struct swtch_pri_args *args)
 
        thread_depress_abstime(thread_depress_time);
 
-       thread_block_reason((thread_continue_t)swtch_pri_continue, NULL, AST_YIELD);
-
-       thread_depress_abort_internal(current_thread());
-
-       disable_preemption();
-       myprocessor = current_processor();
-       result = SCHED(thread_should_yield)(myprocessor, current_thread());
-       enable_preemption();
-
-       return (result);
-}
-
-static boolean_t
-thread_switch_disable_workqueue_sched_callback(void)
-{
-       sched_call_t callback = workqueue_get_sched_callback();
-       return thread_disable_sched_call(current_thread(), callback) != NULL;
+       thread_yield_with_continuation((thread_continue_t)swtch_pri_continue, NULL);
 }
 
 static void
-thread_switch_enable_workqueue_sched_callback(void)
+thread_switch_continue(void *parameter, __unused int ret)
 {
-       sched_call_t callback = workqueue_get_sched_callback();
-       thread_reenable_sched_call(current_thread(), callback);
-}
-
-static void
-thread_switch_continue(void)
-{
-       thread_t        self = current_thread();
-       int                                     option = self->saved.swtch.option;
-       boolean_t                       reenable_workq_callback = self->saved.swtch.reenable_workq_callback;
-
+       thread_t self = current_thread();
+       int option = (int)(intptr_t)parameter;
 
        if (option == SWITCH_OPTION_DEPRESS || option == SWITCH_OPTION_OSLOCK_DEPRESS)
-               thread_depress_abort_internal(self);
-
-       if (reenable_workq_callback)
-               thread_switch_enable_workqueue_sched_callback();
+               thread_depress_abort(self);
 
        thread_syscall_return(KERN_SUCCESS);
        /*NOTREACHED*/
@@ -244,41 +201,34 @@ thread_switch(
        int                                             option = args->option;
        mach_msg_timeout_t              option_time = args->option_time;
        uint32_t                                scale_factor = NSEC_PER_MSEC;
-       boolean_t                               reenable_workq_callback = FALSE;
        boolean_t                               depress_option = FALSE;
        boolean_t                               wait_option = FALSE;
+       wait_interrupt_t                interruptible = THREAD_ABORTSAFE;
 
     /*
-     * Validate and process option.
-     */
-    switch (option) {
-
+        *      Validate and process option.
+        */
+       switch (option) {
        case SWITCH_OPTION_NONE:
-               workqueue_thread_yielded();
                break;
        case SWITCH_OPTION_WAIT:
                wait_option = TRUE;
-               workqueue_thread_yielded();
                break;
        case SWITCH_OPTION_DEPRESS:
                depress_option = TRUE;
-               workqueue_thread_yielded();
                break;
        case SWITCH_OPTION_DISPATCH_CONTENTION:
                scale_factor = NSEC_PER_USEC;
                wait_option = TRUE;
-               if (thread_switch_disable_workqueue_sched_callback())
-                       reenable_workq_callback = TRUE;
+               interruptible |= THREAD_WAIT_NOREPORT;
                break;
        case SWITCH_OPTION_OSLOCK_DEPRESS:
                depress_option = TRUE;
-               if (thread_switch_disable_workqueue_sched_callback())
-                       reenable_workq_callback = TRUE;
+               interruptible |= THREAD_WAIT_NOREPORT;
                break;
        case SWITCH_OPTION_OSLOCK_WAIT:
                wait_option = TRUE;
-               if (thread_switch_disable_workqueue_sched_callback())
-                       reenable_workq_callback = TRUE;
+               interruptible |= THREAD_WAIT_NOREPORT;
                break;
        default:
            return (KERN_INVALID_ARGUMENT);
@@ -350,17 +300,13 @@ thread_switch(
                        thread_deallocate_safe(thread);
 
                        if (wait_option)
-                               assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE,
+                               assert_wait_timeout((event_t)assert_wait_timeout, interruptible,
                                                    option_time, scale_factor);
                        else if (depress_option)
                                thread_depress_ms(option_time);
 
-                       self->saved.swtch.option = option;
-                       self->saved.swtch.reenable_workq_callback = reenable_workq_callback;
-
-                       thread_run(self, (thread_continue_t)thread_switch_continue, NULL, pulled_thread);
-                       /* NOTREACHED */
-                       panic("returned from thread_run!");
+                       thread_run(self, thread_switch_continue, (void *)(intptr_t)option, pulled_thread);
+                       __builtin_unreachable();
                }
 
                splx(s);
@@ -369,24 +315,25 @@ thread_switch(
        }
 
        if (wait_option)
-               assert_wait_timeout((event_t)assert_wait_timeout, THREAD_ABORTSAFE, option_time, scale_factor);
+               assert_wait_timeout((event_t)assert_wait_timeout, interruptible, option_time, scale_factor);
        else if (depress_option)
                thread_depress_ms(option_time);
 
-       self->saved.swtch.option = option;
-       self->saved.swtch.reenable_workq_callback = reenable_workq_callback;
-
-       thread_block_reason((thread_continue_t)thread_switch_continue, NULL, AST_YIELD);
-
-       if (depress_option)
-               thread_depress_abort_internal(self);
-
-       if (reenable_workq_callback)
-               thread_switch_enable_workqueue_sched_callback();
+       thread_yield_with_continuation(thread_switch_continue, (void *)(intptr_t)option);
+       __builtin_unreachable();
+}
 
-    return (KERN_SUCCESS);
+void
+thread_yield_with_continuation(
+       thread_continue_t       continuation,
+       void                            *parameter)
+{
+       assert(continuation);
+       thread_block_reason(continuation, parameter, AST_YIELD);
+       __builtin_unreachable();
 }
 
+
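
A sketch of the calling convention for the new thread_yield_with_continuation() helper, modeled on the rewritten swtch()/swtch_pri()/thread_switch() paths above (the function names below are illustrative): the caller never regains its stack frame, and the continuation runs on a fresh kernel stack after the AST_YIELD block.

static void
example_yield_continue(void *parameter, __unused wait_result_t wr)
{
	/* resumed here after the yield; return to user through the trap path */
	thread_syscall_return(KERN_SUCCESS);
}

static void
example_yield_trap(void)
{
	thread_yield_with_continuation(example_yield_continue, NULL);
	/* not reached: the helper blocks with AST_YIELD and never returns */
}
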
 /* Returns a +1 thread reference */
 thread_t
 port_name_to_thread_for_ulock(mach_port_name_t thread_name)
@@ -425,10 +372,15 @@ port_name_to_thread_for_ulock(mach_port_name_t thread_name)
 /* This function is called after an assert_wait(), therefore it must not
  * cause another wait until after the thread_run() or thread_block()
  *
- * Consumes a ref on thread
+ *
+ * When called with a NULL continuation, the thread ref is consumed (the
+ * thread_handoff_deallocate calling convention); otherwise the cleanup is
+ * left to the continuation (the thread_handoff_parameter calling convention)
+ * and the call does not return.
  */
-wait_result_t
-thread_handoff(thread_t thread)
+static wait_result_t
+thread_handoff_internal(thread_t thread, thread_continue_t continuation,
+               void *parameter)
 {
        thread_t deallocate_thread = THREAD_NULL;
        thread_t self = current_thread();
@@ -446,10 +398,12 @@ thread_handoff(thread_t thread)
                                      pulled_thread ? TRUE : FALSE, 0, 0);
 
                if (pulled_thread != THREAD_NULL) {
-                       /* We can't be dropping the last ref here */
-                       thread_deallocate_safe(thread);
+                       if (continuation == NULL) {
+                               /* We can't be dropping the last ref here */
+                               thread_deallocate_safe(thread);
+                       }
 
-                       int result = thread_run(self, THREAD_CONTINUE_NULL, NULL, pulled_thread);
+                       int result = thread_run(self, continuation, parameter, pulled_thread);
 
                        splx(s);
                        return result;
@@ -461,7 +415,7 @@ thread_handoff(thread_t thread)
                thread = THREAD_NULL;
        }
 
-       int result = thread_block(THREAD_CONTINUE_NULL);
+       int result = thread_block_parameter(continuation, parameter);
        if (deallocate_thread != THREAD_NULL) {
                thread_deallocate(deallocate_thread);
        }
@@ -469,54 +423,75 @@ thread_handoff(thread_t thread)
        return result;
 }
 
+void
+thread_handoff_parameter(thread_t thread, thread_continue_t continuation,
+               void *parameter)
+{
+       thread_handoff_internal(thread, continuation, parameter);
+       panic("NULL continuation passed to %s", __func__);
+       __builtin_unreachable();
+}
+
+wait_result_t
+thread_handoff_deallocate(thread_t thread)
+{
+       return thread_handoff_internal(thread, NULL, NULL);
+}
+
+/*
+ * Thread depression
+ *
+ * This mechanism drops a thread to priority 0 in order for it to yield to
+ * all other runnable threads on the system.  It can be canceled or timed out,
+ * whereupon the thread goes back to where it was.
+ *
+ * Note that TH_SFLAG_DEPRESS and TH_SFLAG_POLLDEPRESS are never set at the
+ * same time.  DEPRESS always defers to POLLDEPRESS.
+ *
+ * DEPRESS only lasts across a single thread_block call, and never returns
+ * to userspace.
+ * POLLDEPRESS can be active anywhere up until thread termination.
+ */
+
 /*
  * Depress thread's priority to lowest possible for the specified interval,
- * with a value of zero resulting in no timeout being scheduled.
+ * with an interval of zero resulting in no timeout being scheduled.
+ *
+ * Must block with AST_YIELD afterwards to take effect
  */
 void
-thread_depress_abstime(
-       uint64_t                                interval)
+thread_depress_abstime(uint64_t interval)
 {
-       thread_t                self = current_thread();
-       uint64_t                                deadline;
-    spl_t                                      s;
-
-    s = splsched();
-    thread_lock(self);
-       if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
-               processor_t             myprocessor = self->last_processor;
-
-               self->sched_pri = DEPRESSPRI;
-
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
-                                     (uintptr_t)thread_tid(self),
-                                     self->base_pri,
-                                     self->sched_pri,
-                                     self->sched_usage,
-                                     0);
-
-               myprocessor->current_pri = self->sched_pri;
-               myprocessor->current_perfctl_class = thread_get_perfcontrol_class(self);
+       thread_t self = current_thread();
+
+       spl_t s = splsched();
+       thread_lock(self);
+
+       assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);
+
+       if ((self->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
                self->sched_flags |= TH_SFLAG_DEPRESS;
+               thread_recompute_sched_pri(self, SETPRI_LAZY);
 
                if (interval != 0) {
+                       uint64_t deadline;
+
                        clock_absolutetime_interval_to_deadline(interval, &deadline);
                        if (!timer_call_enter(&self->depress_timer, deadline, TIMER_CALL_USER_CRITICAL))
                                self->depress_timer_active++;
                }
        }
+
        thread_unlock(self);
-    splx(s);
+       splx(s);
 }
 
 void
-thread_depress_ms(
-       mach_msg_timeout_t              interval)
+thread_depress_ms(mach_msg_timeout_t interval)
 {
-       uint64_t                abstime;
+       uint64_t abstime;
 
-       clock_interval_to_absolutetime_interval(
-                                                       interval, NSEC_PER_MSEC, &abstime);
+       clock_interval_to_absolutetime_interval(interval, NSEC_PER_MSEC, &abstime);
        thread_depress_abstime(abstime);
 }
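
For context, the depress/yield/abort trio above is used in a fixed sequence; a minimal sketch of that sequence, mirroring thread_yield_internal() later in this file (the wrapper name is illustrative):

static void
example_depressed_yield(mach_msg_timeout_t ms)
{
	/* drop to priority 0 and arm the depress timer */
	thread_depress_ms(ms);

	/* the depression only takes effect across a block with AST_YIELD */
	thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);

	/* back on CPU: cancel the depression and restore normal priority */
	thread_depress_abort(current_thread());
}
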
 
@@ -524,111 +499,132 @@ thread_depress_ms(
  *     Priority depression expiration.
  */
 void
-thread_depress_expire(
-       void                    *p0,
-       __unused void   *p1)
+thread_depress_expire(void      *p0,
+             __unused void      *p1)
 {
-       thread_t                thread = p0;
-    spl_t                      s;
+       thread_t thread = (thread_t)p0;
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 
-    s = splsched();
-    thread_lock(thread);
        if (--thread->depress_timer_active == 0) {
                thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
-               thread_recompute_sched_pri(thread, FALSE);
+               thread_recompute_sched_pri(thread, SETPRI_DEFAULT);
        }
-    thread_unlock(thread);
-    splx(s);
+
+       thread_unlock(thread);
+       splx(s);
 }
 
 /*
- *     Prematurely abort priority depression if there is one.
+ * Prematurely abort priority depression if there is one.
  */
 kern_return_t
-thread_depress_abort_internal(
-       thread_t                                thread)
+thread_depress_abort(thread_t thread)
 {
-    kern_return_t                      result = KERN_NOT_DEPRESSED;
-    spl_t                                      s;
-
-    s = splsched();
-    thread_lock(thread);
-       if (!(thread->sched_flags & TH_SFLAG_POLLDEPRESS)) {
-               if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
-                       thread_recompute_sched_pri(thread, FALSE);
-                       result = KERN_SUCCESS;
-               }
+       kern_return_t result = KERN_NOT_DEPRESSED;
+
+       spl_t s = splsched();
+       thread_lock(thread);
 
-               if (timer_call_cancel(&thread->depress_timer))
-                       thread->depress_timer_active--;
+       assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
+
+       /*
+        * User-triggered depress-aborts should not get out
+        * of the poll-depress, but they should cancel a regular depress.
+        */
+       if ((thread->sched_flags & TH_SFLAG_POLLDEPRESS) == 0) {
+               result = thread_depress_abort_locked(thread);
        }
+
        thread_unlock(thread);
-    splx(s);
+       splx(s);
 
-    return (result);
+       return result;
 }
 
-void
-thread_poll_yield(
-       thread_t                self)
+/*
+ * Prematurely abort priority depression or poll depression if one is active.
+ * Called with the thread locked.
+ */
+kern_return_t
+thread_depress_abort_locked(thread_t thread)
 {
-       spl_t                   s;
+       if ((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0)
+               return KERN_NOT_DEPRESSED;
+
+       assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
 
+       thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
+
+       thread_recompute_sched_pri(thread, SETPRI_LAZY);
+
+       if (timer_call_cancel(&thread->depress_timer))
+               thread->depress_timer_active--;
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * Invoked as part of a polling operation like a no-timeout port receive
+ *
+ * Forces a fixpri thread to yield if it is detected polling without blocking for too long.
+ */
+void
+thread_poll_yield(thread_t self)
+{
        assert(self == current_thread());
+       assert((self->sched_flags & TH_SFLAG_DEPRESS) == 0);
 
-       s = splsched();
-       if (self->sched_mode == TH_MODE_FIXED) {
-               uint64_t                        total_computation, abstime;
-
-               abstime = mach_absolute_time();
-               total_computation = abstime - self->computation_epoch;
-               total_computation += self->computation_metered;
-               if (total_computation >= max_poll_computation) {
-                       processor_t             myprocessor = current_processor();
-                       ast_t                   preempt;
-
-                       thread_lock(self);
-                       if (!(self->sched_flags & TH_SFLAG_DEPRESSED_MASK)) {
-                               self->sched_pri = DEPRESSPRI;
-
-                               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
-                                                     (uintptr_t)thread_tid(self),
-                                                     self->base_pri,
-                                                     self->sched_pri,
-                                                     self->sched_usage,
-                                                     0);
-
-                               myprocessor->current_pri = self->sched_pri;
-                               myprocessor->current_perfctl_class = thread_get_perfcontrol_class(self);
-                       }
-                       self->computation_epoch = abstime;
-                       self->computation_metered = 0;
-                       self->sched_flags |= TH_SFLAG_POLLDEPRESS;
+       if (self->sched_mode != TH_MODE_FIXED)
+               return;
 
-                       abstime += (total_computation >> sched_poll_yield_shift);
-                       if (!timer_call_enter(&self->depress_timer, abstime, TIMER_CALL_USER_CRITICAL))
-                               self->depress_timer_active++;
+       spl_t s = splsched();
 
-                       if ((preempt = csw_check(myprocessor, AST_NONE)) != AST_NONE)
-                               ast_on(preempt);
+       uint64_t abstime = mach_absolute_time();
+       uint64_t total_computation = abstime -
+               self->computation_epoch + self->computation_metered;
 
-                       thread_unlock(self);
-               }
+       if (total_computation >= max_poll_computation) {
+               thread_lock(self);
+
+               self->computation_epoch   = abstime;
+               self->computation_metered = 0;
+
+               uint64_t yield_expiration = abstime +
+                        (total_computation >> sched_poll_yield_shift);
+
+               if (!timer_call_enter(&self->depress_timer, yield_expiration,
+                                     TIMER_CALL_USER_CRITICAL))
+                       self->depress_timer_active++;
+
+               self->sched_flags |= TH_SFLAG_POLLDEPRESS;
+               thread_recompute_sched_pri(self, SETPRI_DEFAULT);
+
+               thread_unlock(self);
        }
        splx(s);
 }
 
-
+/*
+ * Kernel-internal interface to yield for a specified period
+ *
+ * WARNING: Will still yield to priority 0 even if the thread is holding a contended lock!
+ */
 void
-thread_yield_internal(
-       mach_msg_timeout_t      ms)
+thread_yield_internal(mach_msg_timeout_t ms)
 {
+       thread_t self = current_thread();
+
+       assert((self->sched_flags & TH_SFLAG_DEPRESSED_MASK) != TH_SFLAG_DEPRESSED_MASK);
+
        processor_t     myprocessor;
 
        disable_preemption();
        myprocessor = current_processor();
-       if (!SCHED(thread_should_yield)(myprocessor, current_thread())) {
+       if (!SCHED(thread_should_yield)(myprocessor, self)) {
                mp_enable_preemption();
 
                return;
@@ -639,7 +635,7 @@ thread_yield_internal(
 
        thread_block_reason(THREAD_CONTINUE_NULL, NULL, AST_YIELD);
 
-       thread_depress_abort_internal(current_thread());
+       thread_depress_abort(self);
 }
 
 /*
index 5a68b926be55d31ac5124f814335f843f5137dc1..6d0984aec89910de0d9c1178cc383018a1aed828 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2017 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1991,1990,1989 Carnegie Mellon University
  * All Rights Reserved.
- * 
+ *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
- * 
+ *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
  * Carnegie Mellon requests users of this software to return to
- * 
+ *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
- * 
+ *
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
-/*
- */
 
-#ifndef        _KERN_SYSCALL_SUBR_H_
+#ifndef _KERN_SYSCALL_SUBR_H_
 #define _KERN_SYSCALL_SUBR_H_
 
 #include <mach/mach_traps.h>
 
-extern void            thread_depress_abstime(
-                                       uint64_t                interval);
+extern kern_return_t thread_depress_abort(thread_t thread);
 
-extern void            thread_depress_ms(
-                                       mach_msg_timeout_t      interval);
+extern kern_return_t thread_depress_abort_locked(thread_t thread);
 
-extern kern_return_t   thread_depress_abort_internal(
-                                                       thread_t                                thread);
+extern void thread_depress_expire(void *thread, void *p1);
 
-extern void            thread_depress_expire(
-                                       void                    *thread,
-                                       void                    *p1);
+extern void thread_poll_yield(thread_t self);
 
-extern void            thread_poll_yield(
-                                       thread_t                self);
+#endif /* _KERN_SYSCALL_SUBR_H_ */
 
-#endif /* _KERN_SYSCALL_SUBR_H_ */
index 5c4bd06f33c59cdc639d3307ec05caf4e77fd3c5..65e3b28906756c09f3a3a0602cb77b0deadd5216 100644 (file)
@@ -142,7 +142,7 @@ const mach_trap_t   mach_trap_table[MACH_TRAP_TABLE_COUNT] = {
 /* 37 */       MACH_TRAP(semaphore_wait_signal_trap, 2, 2, munge_ww),
 /* 38 */       MACH_TRAP(semaphore_timedwait_trap, 3, 3, munge_www),
 /* 39 */       MACH_TRAP(semaphore_timedwait_signal_trap, 4, 4, munge_wwww),
-/* 40 */       MACH_TRAP(kern_invalid, 0, 0, NULL),
+/* 40 */       MACH_TRAP(_kernelrpc_mach_port_get_attributes_trap, 5, 5, munge_wwwww),
 /* 41 */       MACH_TRAP(_kernelrpc_mach_port_guard_trap, 4, 5, munge_wwlw),
 /* 42 */       MACH_TRAP(_kernelrpc_mach_port_unguard_trap, 3, 4, munge_wwl),
 /* 43 */       MACH_TRAP(mach_generate_activity_id, 3, 3, munge_www),
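
Trap slot 40, previously kern_invalid, now carries a five-argument trap for mach_port_get_attributes. For illustration, the standard userspace call that such a trap can service (whether libsystem actually routes this call through the trap is not shown by this diff):

#include <mach/mach.h>

/* Query the queue limits of a receive right owned by the current task. */
static kern_return_t
query_port_limits(mach_port_t port, mach_port_limits_t *limits)
{
	mach_msg_type_number_t count = MACH_PORT_LIMITS_INFO_COUNT;

	return mach_port_get_attributes(mach_task_self(), port,
	    MACH_PORT_LIMITS_INFO, (mach_port_info_t)limits, &count);
}
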
index 873779ca9caa0beb119add33dbd41f1058e66c19..20eef5136946dd1f58dac660c95f0950230b27bf 100644 (file)
 
 #include <libkern/OSDebug.h>
 #include <libkern/OSAtomic.h>
+#include <libkern/section_keywords.h>
 
 #if CONFIG_ATM
 #include <atm/atm_internal.h>
@@ -184,6 +185,7 @@ lck_grp_attr_t  task_lck_grp_attr;
 
 extern int exc_via_corpse_forking;
 extern int corpse_for_fatal_memkill;
+extern boolean_t proc_send_synchronous_EXC_RESOURCE(void *p);
 
 /* Flag set by core audio when audio is playing. Used to stifle EXC_RESOURCE generation when active. */
 int audio_active = 0;
@@ -197,15 +199,39 @@ lck_spin_t                dead_task_statistics_lock;
 
 ledger_template_t task_ledger_template = NULL;
 
-struct _task_ledger_indices task_ledgers __attribute__((used)) =
-       {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+SECURITY_READ_ONLY_LATE(struct _task_ledger_indices) task_ledgers __attribute__((used)) =
+{.cpu_time = -1,
+ .tkm_private = -1,
+ .tkm_shared = -1,
+ .phys_mem = -1,
+ .wired_mem = -1,
+ .internal = -1,
+ .iokit_mapped = -1,
+ .alternate_accounting = -1,
+ .alternate_accounting_compressed = -1,
+ .page_table = -1,
+ .phys_footprint = -1,
+ .internal_compressed = -1,
+ .purgeable_volatile = -1,
+ .purgeable_nonvolatile = -1,
+ .purgeable_volatile_compressed = -1,
+ .purgeable_nonvolatile_compressed = -1,
+ .network_volatile = -1,
+ .network_nonvolatile = -1,
+ .network_volatile_compressed = -1,
+ .network_nonvolatile_compressed = -1,
+ .platform_idle_wakeups = -1,
+ .interrupt_wakeups = -1,
 #if !CONFIG_EMBEDDED
-        { 0 /* initialized at runtime */},
+ .sfi_wait_times = { 0 /* initialized at runtime */},
 #endif /* !CONFIG_EMBEDDED */   
-        -1, -1,
-        -1, -1,
-        -1, -1,
-       };
+ .cpu_time_billed_to_me = -1,
+ .cpu_time_billed_to_others = -1,
+ .physical_writes = -1,
+ .logical_writes = -1,
+ .energy_billed_to_me = -1,
+ .energy_billed_to_others = -1
+};
 
 /* System sleep state */
 boolean_t tasks_suspend_state;
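
The task_ledgers table above switches from a positional list of -1 sentinels to designated initializers, and is now placed read-only-late so it cannot be rewritten after early boot. A minimal sketch of that pattern with a hypothetical struct (names are illustrative, not from this commit):

#include <libkern/section_keywords.h>

struct example_indices {
	int cpu_time;
	int wired_mem;
};

/* Lives in a segment that is write-protected once lockdown completes;
 * -1 means "ledger entry not created yet" and is filled in during init. */
SECURITY_READ_ONLY_LATE(struct example_indices) example_indices = {
	.cpu_time  = -1,
	.wired_mem = -1,
};
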
@@ -266,8 +292,11 @@ int64_t io_telemetry_limit;                        /* Threshold to take a microstackshot (0 indicated
 int64_t global_logical_writes_count = 0;       /* Global count for logical writes */
 static boolean_t global_update_logical_writes(int64_t);
 
+#define TASK_MAX_THREAD_LIMIT 256
+
 #if MACH_ASSERT
 int pmap_ledgers_panic = 1;
+int pmap_ledgers_panic_leeway = 3;
 #endif /* MACH_ASSERT */
 
 int task_max = CONFIG_TASK_MAX; /* Max number of tasks */
@@ -280,9 +309,12 @@ int hwm_user_cores = 0; /* high watermark violations generate user core files */
 extern void    proc_getexecutableuuid(void *, unsigned char *, unsigned long);
 extern int     proc_pid(struct proc *p);
 extern int     proc_selfpid(void);
+extern struct proc *current_proc(void);
 extern char    *proc_name_address(struct proc *p);
 extern uint64_t get_dispatchqueue_offset_from_proc(void *);
 extern int kevent_proc_copy_uptrs(void *proc, uint64_t *buf, int bufsize);
+extern void workq_proc_suspended(struct proc *p);
+extern void workq_proc_resumed(struct proc *p);
 
 #if CONFIG_MEMORYSTATUS
 extern void    proc_memstat_terminated(struct proc* p, boolean_t set);
@@ -298,6 +330,17 @@ extern void memorystatus_abort_vm_map_fork(task_t);
 
 #endif /* MACH_BSD */
 
+#if DEVELOPMENT || DEBUG
+int exc_resource_threads_enabled;
+#endif /* DEVELOPMENT || DEBUG */
+
+#if (DEVELOPMENT || DEBUG) && TASK_EXC_GUARD_DELIVER_CORPSE
+uint32_t task_exc_guard_default = TASK_EXC_GUARD_MP_DELIVER | TASK_EXC_GUARD_MP_CORPSE |
+                                  TASK_EXC_GUARD_VM_DELIVER | TASK_EXC_GUARD_VM_CORPSE;
+#else
+uint32_t task_exc_guard_default = 0;
+#endif
+
 /* Forwards */
 
 static void task_hold_locked(task_t task);
@@ -306,21 +349,12 @@ static void task_release_locked(task_t task);
 
 static void task_synchronizer_destroy_all(task_t task);
 
-void
-task_backing_store_privileged(
-                       task_t task)
-{
-       task_lock(task);
-       task->priv_flags |= VM_BACKING_STORE_PRIV;
-       task_unlock(task);
-       return;
-}
-
 
 void
 task_set_64bit(
                task_t task,
-               boolean_t is64bit)
+               boolean_t is_64bit,
+               boolean_t is_64bit_data)
 {
 #if defined(__i386__) || defined(__x86_64__) || defined(__arm64__)
        thread_t thread;
@@ -328,15 +362,34 @@ task_set_64bit(
 
        task_lock(task);
 
-       if (is64bit) {
-               if (task_has_64BitAddr(task))
+       /*
+        * Switching to/from 64-bit address spaces
+        */
+       if (is_64bit) {
+               if (!task_has_64Bit_addr(task)) {
+                       task_set_64Bit_addr(task);
+               }
+       } else {
+               if (task_has_64Bit_addr(task)) {
+                       task_clear_64Bit_addr(task);
+               }
+       }
+
+       /*
+        * Switching to/from 64-bit register state.
+        */
+       if (is_64bit_data) {
+               if (task_has_64Bit_data(task))
                        goto out;
-               task_set_64BitAddr(task);
+
+               task_set_64Bit_data(task);
        } else {
-               if ( !task_has_64BitAddr(task))
+               if ( !task_has_64Bit_data(task))
                        goto out;
-               task_clear_64BitAddr(task);
+
+               task_clear_64Bit_data(task);
        }
+
        /* FIXME: On x86, the thread save state flavor can diverge from the
         * task's 64-bit feature flag due to the 32-bit/64-bit register save
         * state dichotomy. Since we can be pre-empted in this interval,
@@ -381,6 +434,12 @@ out:
        task_unlock(task);
 }
 
+boolean_t
+task_get_64bit_data(task_t task)
+{
+       return task_has_64Bit_data(task);
+}
+
 void
 task_set_platform_binary(
                task_t task,
@@ -526,6 +585,14 @@ task_wait_to_return(void)
 
        task_unlock(task);
 
+#if CONFIG_MACF
+       /*
+        * Before jumping to userspace and allowing this process to execute any code,
+        * notify any interested parties.
+        */
+       mac_proc_notify_exec_complete(current_proc());
+#endif
+
        thread_bootstrap_return();
 }
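
task_wait_to_return() now calls mac_proc_notify_exec_complete() so MAC policies hear about a completed exec just before the new image runs its first instruction. A hedged sketch of a policy-side consumer (the hook and function names below are assumptions, not taken from this diff):

#if CONFIG_MACF
/* Hypothetical policy callback: invoked once exec has fully completed. */
static void
example_proc_notify_exec_complete(struct proc *p)
{
	/* the new image is set up and about to run; begin tracking p here */
	(void)p;
}
#endif
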
 
@@ -702,10 +769,16 @@ task_init(void)
 #endif /* CONFIG_MEMORYSTATUS */
        }
 
-#if MACH_ASSERT
-       PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic,
-                         sizeof (pmap_ledgers_panic));
-#endif /* MACH_ASSERT */
+#if DEVELOPMENT || DEBUG
+       if (!PE_parse_boot_argn("exc_resource_threads",
+               &exc_resource_threads_enabled,
+               sizeof(exc_resource_threads_enabled))) {
+               exc_resource_threads_enabled = 1;
+       }
+       PE_parse_boot_argn("task_exc_guard_default",
+           &task_exc_guard_default,
+           sizeof(task_exc_guard_default));
+#endif /* DEVELOPMENT || DEBUG */
 
 #if CONFIG_COREDUMP
        if (!PE_parse_boot_argn("hwm_user_cores", &hwm_user_cores,
@@ -766,9 +839,9 @@ task_init(void)
         * Create the kernel task as the first task.
         */
 #ifdef __LP64__
-       if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
+       if (task_create_internal(TASK_NULL, NULL, FALSE, TRUE, TRUE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
 #else
-       if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
+       if (task_create_internal(TASK_NULL, NULL, FALSE, FALSE, FALSE, TF_NONE, TPF_NONE, &kernel_task) != KERN_SUCCESS)
 #endif
                panic("task_init\n");
 
@@ -874,8 +947,12 @@ init_task_ledgers(void)
        assert(kernel_task == TASK_NULL);
 
 #if MACH_ASSERT
-       PE_parse_boot_argn("pmap_ledgers_panic", &pmap_ledgers_panic,
+       PE_parse_boot_argn("pmap_ledgers_panic",
+                          &pmap_ledgers_panic,
                          sizeof (pmap_ledgers_panic));
+       PE_parse_boot_argn("pmap_ledgers_panic_leeway",
+                          &pmap_ledgers_panic_leeway,
+                         sizeof (pmap_ledgers_panic_leeway));
 #endif /* MACH_ASSERT */
 
        if ((t = ledger_template_create("Per-task ledger")) == NULL)
@@ -908,6 +985,12 @@ init_task_ledgers(void)
        task_ledgers.purgeable_nonvolatile = ledger_entry_add(t, "purgeable_nonvolatile", "physmem", "bytes");
        task_ledgers.purgeable_volatile_compressed = ledger_entry_add(t, "purgeable_volatile_compress", "physmem", "bytes");
        task_ledgers.purgeable_nonvolatile_compressed = ledger_entry_add(t, "purgeable_nonvolatile_compress", "physmem", "bytes");
+
+       task_ledgers.network_volatile = ledger_entry_add(t, "network_volatile", "physmem", "bytes");
+       task_ledgers.network_nonvolatile = ledger_entry_add(t, "network_nonvolatile", "physmem", "bytes");
+       task_ledgers.network_volatile_compressed = ledger_entry_add(t, "network_volatile_compressed", "physmem", "bytes");
+       task_ledgers.network_nonvolatile_compressed = ledger_entry_add(t, "network_nonvolatile_compressed", "physmem", "bytes");
+
        task_ledgers.platform_idle_wakeups = ledger_entry_add(t, "platform_idle_wakeups", "power",
            "count");
        task_ledgers.interrupt_wakeups = ledger_entry_add(t, "interrupt_wakeups", "power",
@@ -965,6 +1048,10 @@ init_task_ledgers(void)
            (task_ledgers.purgeable_nonvolatile < 0) ||
            (task_ledgers.purgeable_volatile_compressed < 0) ||
            (task_ledgers.purgeable_nonvolatile_compressed < 0) ||
+           (task_ledgers.network_volatile < 0) ||
+           (task_ledgers.network_nonvolatile < 0) ||
+           (task_ledgers.network_volatile_compressed < 0) ||
+           (task_ledgers.network_nonvolatile_compressed < 0) ||
            (task_ledgers.platform_idle_wakeups < 0) ||
            (task_ledgers.interrupt_wakeups < 0) ||
            (task_ledgers.cpu_time_billed_to_me < 0) || (task_ledgers.cpu_time_billed_to_others < 0) ||
@@ -988,6 +1075,11 @@ init_task_ledgers(void)
        ledger_track_credit_only(t, task_ledgers.purgeable_volatile_compressed);
        ledger_track_credit_only(t, task_ledgers.purgeable_nonvolatile_compressed);
 
+       ledger_track_credit_only(t, task_ledgers.network_volatile);
+       ledger_track_credit_only(t, task_ledgers.network_nonvolatile);
+       ledger_track_credit_only(t, task_ledgers.network_volatile_compressed);
+       ledger_track_credit_only(t, task_ledgers.network_nonvolatile_compressed);
+
        ledger_track_maximum(t, task_ledgers.phys_footprint, 60);
 #if MACH_ASSERT
        if (pmap_ledgers_panic) {
@@ -1002,6 +1094,11 @@ init_task_ledgers(void)
                ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile);
                ledger_panic_on_negative(t, task_ledgers.purgeable_volatile_compressed);
                ledger_panic_on_negative(t, task_ledgers.purgeable_nonvolatile_compressed);
+
+               ledger_panic_on_negative(t, task_ledgers.network_volatile);
+               ledger_panic_on_negative(t, task_ledgers.network_nonvolatile);
+               ledger_panic_on_negative(t, task_ledgers.network_volatile_compressed);
+               ledger_panic_on_negative(t, task_ledgers.network_nonvolatile_compressed);
        }
 #endif /* MACH_ASSERT */
 
@@ -1024,6 +1121,7 @@ task_create_internal(
        coalition_t     *parent_coalitions __unused,
        boolean_t       inherit_memory,
        __unused boolean_t      is_64bit,
+       boolean_t is_64bit_data,
        uint32_t        t_flags,
        uint32_t        t_procflags,
        task_t          *child_task)            /* OUT */
@@ -1076,7 +1174,6 @@ task_create_internal(
        new_task->legacy_stop_count = 0;
        new_task->active = TRUE;
        new_task->halting = FALSE;
-       new_task->user_data = NULL;
        new_task->priv_flags = 0;
        new_task->t_flags = t_flags;
        new_task->t_procflags = t_procflags;
@@ -1084,6 +1181,8 @@ task_create_internal(
        new_task->crashed_thread_id = 0;
        new_task->exec_token = 0;
 
+       new_task->task_exc_guard = task_exc_guard_default;
+       
 #if CONFIG_ATM
        new_task->atm_context = NULL;
 #endif
@@ -1160,13 +1259,12 @@ task_create_internal(
 
        new_task->mem_notify_reserved = 0;
        new_task->memlimit_attrs_reserved = 0;
-#if IMPORTANCE_INHERITANCE
-       new_task->task_imp_base = NULL;
-#endif /* IMPORTANCE_INHERITANCE */
 
        new_task->requested_policy = default_task_requested_policy;
        new_task->effective_policy = default_task_effective_policy;
 
+       task_importance_init_from_parent(new_task, parent_task);
+
        if (parent_task != TASK_NULL) {
                new_task->sec_token = parent_task->sec_token;
                new_task->audit_token = parent_task->audit_token;
@@ -1175,8 +1273,14 @@ task_create_internal(
                shared_region = vm_shared_region_get(parent_task);
                vm_shared_region_set(new_task, shared_region);
 
-               if(task_has_64BitAddr(parent_task))
-                       task_set_64BitAddr(new_task);
+               if(task_has_64Bit_addr(parent_task)) {
+                       task_set_64Bit_addr(new_task);
+               }
+
+               if(task_has_64Bit_data(parent_task)) {
+                       task_set_64Bit_data(new_task);
+               }
+
                new_task->all_image_info_addr = parent_task->all_image_info_addr;
                new_task->all_image_info_size = parent_task->all_image_info_size;
 
@@ -1185,43 +1289,6 @@ task_create_internal(
 
                new_task->pset_hint = parent_task->pset_hint = task_choose_pset(parent_task);
 
-#if IMPORTANCE_INHERITANCE
-               ipc_importance_task_t new_task_imp = IIT_NULL;
-               boolean_t inherit_receive = TRUE;
-
-               if (task_is_marked_importance_donor(parent_task)) {
-                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
-                       assert(IIT_NULL != new_task_imp);
-                       ipc_importance_task_mark_donor(new_task_imp, TRUE);
-               }
-#if CONFIG_EMBEDDED
-               /* Embedded only wants to inherit for exec copy task */
-               if ((t_procflags & TPF_EXEC_COPY) == 0) {
-                       inherit_receive = FALSE;
-               }
-#endif /* CONFIG_EMBEDDED */
-
-               if (inherit_receive) {
-                       if (task_is_marked_importance_receiver(parent_task)) {
-                               if (IIT_NULL == new_task_imp)
-                                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
-                               assert(IIT_NULL != new_task_imp);
-                               ipc_importance_task_mark_receiver(new_task_imp, TRUE);
-                       }
-                       if (task_is_marked_importance_denap_receiver(parent_task)) {
-                               if (IIT_NULL == new_task_imp)
-                                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
-                               assert(IIT_NULL != new_task_imp);
-                               ipc_importance_task_mark_denap_receiver(new_task_imp, TRUE);
-                       }
-               }
-               
-               if (IIT_NULL != new_task_imp) {
-                       assert(new_task->task_imp_base == new_task_imp);
-                       ipc_importance_task_release(new_task_imp);
-               }
-#endif /* IMPORTANCE_INHERITANCE */
-
                new_task->priority = BASEPRI_DEFAULT;
                new_task->max_priority = MAXPRI_USER;
 
@@ -1230,9 +1297,15 @@ task_create_internal(
                new_task->sec_token = KERNEL_SECURITY_TOKEN;
                new_task->audit_token = KERNEL_AUDIT_TOKEN;
 #ifdef __LP64__
-               if(is_64bit)
-                       task_set_64BitAddr(new_task);
+               if(is_64bit) {
+                       task_set_64Bit_addr(new_task);
+               }
 #endif
+
+               if(is_64bit_data) {
+                       task_set_64Bit_data(new_task);
+               }
+
                new_task->all_image_info_addr = (mach_vm_address_t)0;
                new_task->all_image_info_size = (mach_vm_size_t)0;
 
@@ -1269,6 +1342,7 @@ task_create_internal(
                new_task->total_user_time = 0;
                new_task->total_system_time = 0;
                new_task->total_ptime = 0;
+               new_task->total_runnable_time = 0;
                new_task->faults = 0;
                new_task->pageins = 0;
                new_task->cow_faults = 0;
@@ -1341,9 +1415,6 @@ task_create_internal(
                new_task->dispatchqueue_offset = parent_task->dispatchqueue_offset;
        }
 
-       if (vm_backing_store_low && parent_task != NULL)
-               new_task->priv_flags |= (parent_task->priv_flags&VM_BACKING_STORE_PRIV);
-
        new_task->task_volatile_objects = 0;
        new_task->task_nonvolatile_objects = 0;
        new_task->task_purgeable_disowning = FALSE;
@@ -1351,14 +1422,26 @@ task_create_internal(
        queue_init(&new_task->task_objq);
        task_objq_lock_init(new_task);
 
+#if __arm64__
+       new_task->task_legacy_footprint = FALSE;
+#endif /* __arm64__ */
        new_task->task_region_footprint = FALSE;
-
+       new_task->task_has_crossed_thread_limit = FALSE;
+       new_task->task_thread_limit = 0;
 #if CONFIG_SECLUDED_MEMORY
        new_task->task_can_use_secluded_mem = FALSE;
        new_task->task_could_use_secluded_mem = FALSE;
        new_task->task_could_also_use_secluded_mem = FALSE;
+       new_task->task_suppressed_secluded = FALSE;
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+       /*
+        * t_flags is set up above. But since we don't
+        * support darkwake mode being set that way
+        * currently, we clear it out here explicitly.
+        */
+       new_task->t_flags &= ~(TF_DARKWAKE_MODE);
+
         queue_init(&new_task->io_user_clients);
 
        ipc_task_enable(new_task);
@@ -1389,6 +1472,7 @@ task_rollup_accounting_info(task_t to_task, task_t from_task)
        to_task->total_user_time = from_task->total_user_time;
        to_task->total_system_time = from_task->total_system_time;
        to_task->total_ptime = from_task->total_ptime;
+       to_task->total_runnable_time = from_task->total_runnable_time;
        to_task->faults = from_task->faults;
        to_task->pageins = from_task->pageins;
        to_task->cow_faults = from_task->cow_faults;
@@ -2005,7 +2089,7 @@ task_duplicate_map_and_threads(
        kern_return_t kr = KERN_SUCCESS;
        int active;
        thread_t thread, self, thread_return = THREAD_NULL;
-       thread_t new_thread = THREAD_NULL;
+       thread_t new_thread = THREAD_NULL, first_thread = THREAD_NULL;
        thread_t *thread_array;
        uint32_t active_thread_count = 0, array_count = 0, i;
        vm_map_t oldmap;
@@ -2049,9 +2133,13 @@ task_duplicate_map_and_threads(
                new_task->map = vm_map_fork(new_task->ledger,
                                            task->map,
                                            (VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
-                                            VM_MAP_FORK_PRESERVE_PURGEABLE));
+                                            VM_MAP_FORK_PRESERVE_PURGEABLE |
+                                            VM_MAP_FORK_CORPSE_FOOTPRINT));
                vm_map_deallocate(oldmap);
 
+               /* copy ledgers that impact the memory footprint */
+               vm_map_copy_footprint_ledgers(task, new_task);
+
                /* Get all the udata pointers from kqueue */
                est_knotes = kevent_proc_copy_uptrs(p, NULL, 0);
                if (est_knotes > 0) {
@@ -2104,6 +2192,8 @@ task_duplicate_map_and_threads(
                if (thread_array[i] == self) {
                        thread_return = new_thread;
                        new_task->crashed_thread_id = thread_tid(new_thread);
+               } else if (first_thread == NULL) {
+                       first_thread = new_thread;
                } else {
                        /* drop the extra ref returned by thread_create_with_continuation */
                        thread_deallocate(new_thread);
@@ -2119,9 +2209,19 @@ task_duplicate_map_and_threads(
 
                /* Copy thread name */
                bsd_copythreadname(new_thread->uthread, thread_array[i]->uthread);
+               new_thread->thread_tag = thread_array[i]->thread_tag;
                thread_copy_resource_info(new_thread, thread_array[i]);
        }
 
+       /* return the first thread if we couldn't find the equivalent of current */
+       if (thread_return == THREAD_NULL) {
+               thread_return = first_thread;
+       }
+       else if (first_thread != THREAD_NULL) {
+               /* drop the extra ref returned by thread_create_with_continuation */
+               thread_deallocate(first_thread);
+       }
+
        task_resume_internal(task);
 
        for (i = 0; i < array_count; i++) {
@@ -2188,6 +2288,10 @@ task_terminate_internal(
        }
        task->task_could_use_secluded_mem = FALSE;
        task->task_could_also_use_secluded_mem = FALSE;
+
+       if (task->task_suppressed_secluded) {
+               stop_secluded_suppression(task);
+       }
 #endif /* CONFIG_SECLUDED_MEMORY */
 
        if (!task->active) {
@@ -2339,9 +2443,11 @@ task_terminate_internal(
                       * Final cleanup:
                       * + no unnesting
                       * + remove immutable mappings
+                      * + allow gaps in range
                       */
                      (VM_MAP_REMOVE_NO_UNNESTING |
-                      VM_MAP_REMOVE_IMMUTABLE));
+                      VM_MAP_REMOVE_IMMUTABLE |
+                      VM_MAP_REMOVE_GAPS_OK));
 
        /* release our shared region */
        vm_shared_region_set(task, NULL);
@@ -2530,9 +2636,11 @@ task_complete_halt(task_t task)
                       * Final cleanup:
                       * + no unnesting
                       * + remove immutable mappings
+                      * + allow gaps in the range
                       */
                      (VM_MAP_REMOVE_NO_UNNESTING |
-                      VM_MAP_REMOVE_IMMUTABLE));
+                      VM_MAP_REMOVE_IMMUTABLE |
+                      VM_MAP_REMOVE_GAPS_OK));
 
        /*
         * Kick out any IOKitUser handles to the task. At best they're stale,
@@ -2548,7 +2656,7 @@ task_complete_halt(task_t task)
  *     This is a recursive-style suspension of the task, a count of
  *     suspends is maintained.
  *
- *     CONDITIONS: the task is locked and active.
+ *     CONDITIONS: the task is locked and active.
  */
 void
 task_hold_locked(
@@ -2561,6 +2669,10 @@ task_hold_locked(
        if (task->suspend_count++ > 0)
                return;
 
+       if (task->bsd_info) {
+               workq_proc_suspended(task->bsd_info);
+       }
+
        /*
         *      Iterate through all the threads and hold them.
         */
@@ -2675,6 +2787,10 @@ task_release_locked(
        if (--task->suspend_count > 0)
                return;
 
+       if (task->bsd_info) {
+               workq_proc_resumed(task->bsd_info);
+       }
+
        queue_iterate(&task->threads, thread, thread_t, task_threads) {
                thread_mtx_lock(thread);
                thread_release(thread);
@@ -3400,8 +3516,9 @@ task_freeze(
        uint32_t           *clean_count,
        uint32_t           *dirty_count,
        uint32_t           dirty_budget,
-       boolean_t          *shared,
-       boolean_t          walk_only)
+       uint32_t           *shared_count,
+       int                *freezer_error_code,
+       boolean_t          eval_only)
 {
        kern_return_t kr = KERN_SUCCESS;
     
@@ -3426,22 +3543,29 @@ task_freeze(
 
        task_unlock(task);
 
-       if (walk_only) {
-               panic("task_freeze - walk_only == TRUE");
-       } else {
-               kr = vm_map_freeze(task->map, purgeable_count, wired_count, clean_count, dirty_count, dirty_budget, shared);
-       }
+       kr = vm_map_freeze(task->map,
+                          purgeable_count,
+                          wired_count,
+                          clean_count,
+                          dirty_count,
+                          dirty_budget,
+                          shared_count,
+                          freezer_error_code,
+                          eval_only);
 
        task_lock(task);
 
-       if (walk_only == FALSE && kr == KERN_SUCCESS)
+       if ((kr == KERN_SUCCESS) && (eval_only == FALSE)) {
                task->frozen = TRUE;
+       }
+
        task->changing_freeze_state = FALSE;
        thread_wakeup(&task->changing_freeze_state);
        
        task_unlock(task);
 
-       if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
+       if (VM_CONFIG_COMPRESSOR_IS_PRESENT &&
+           (eval_only == FALSE)) {
                vm_wake_compactor_swapper();
                /*
                 * We do an explicit wakeup of the swapout thread here
@@ -3807,7 +3931,7 @@ task_info(
                        if (thread->options & TH_OPT_IDLE_THREAD)
                                continue;
 
-                       thread_read_times(thread, &user_time, &system_time);
+                       thread_read_times(thread, &user_time, &system_time, NULL);
 
                        time_value_add(&times_info->user_time, &user_time);
                        time_value_add(&times_info->system_time, &system_time);
@@ -3891,7 +4015,7 @@ task_info(
 
                /* only set format on output for those expecting it */
                if (*task_info_count >= TASK_DYLD_INFO_COUNT) {
-                       info->all_image_info_format = task_has_64BitAddr(task) ?
+                       info->all_image_info_format = task_has_64Bit_addr(task) ?
                                                 TASK_DYLD_ALL_IMAGE_INFO_64 : 
                                                 TASK_DYLD_ALL_IMAGE_INFO_32 ;
                        *task_info_count = TASK_DYLD_INFO_COUNT;
@@ -4359,7 +4483,7 @@ task_info(
                flags_info = (task_flags_info_t)task_info_out;
 
                /* only publish the 64-bit flag of the task */
-               flags_info->flags = task->t_flags & TF_64B_ADDR;
+               flags_info->flags = task->t_flags & (TF_64B_ADDR | TF_64B_DATA);
 
                *task_info_count = TASK_FLAGS_INFO_COUNT;
                break;
@@ -4984,9 +5108,6 @@ PROC_VIOLATED_GUARD__SEND_EXC_GUARD_AND_SUSPEND(
        return kr;
 }
 
-extern kern_return_t
-task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *);
-
 kern_return_t
 task_violated_guard(
        mach_exception_code_t code,
@@ -5093,6 +5214,7 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb,
        int                                                     pid         = 0;
        const char                                      *procname       = "unknown";
        mach_exception_data_type_t      code[EXCEPTION_CODE_MAX];
+       boolean_t send_sync_exc_resource = FALSE;
 
 #ifdef MACH_BSD
        pid = proc_selfpid();
@@ -5105,8 +5227,10 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb,
                return;
        }
 
-       if (task->bsd_info != NULL)
+       if (task->bsd_info != NULL) {
                procname = proc_name_address(current_task()->bsd_info);
+               send_sync_exc_resource = proc_send_synchronous_EXC_RESOURCE(current_task()->bsd_info);
+       }
 #endif
 #if CONFIG_COREDUMP
        if (hwm_user_cores) {
@@ -5155,10 +5279,13 @@ PROC_CROSSED_HIGH_WATERMARK__SEND_EXC_RESOURCE_AND_SUSPEND(int max_footprint_mb,
        EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_HIGH_WATERMARK);
        EXC_RESOURCE_HWM_ENCODE_LIMIT(code[0], max_footprint_mb);
 
-       /* Do not generate a corpse fork if the violation is a fatal one */
-       if (is_fatal || exc_via_corpse_forking == 0) {
-               /* Do not send a EXC_RESOURCE is corpse_for_fatal_memkill is set */
-               if (corpse_for_fatal_memkill == 0) {
+       /*
+        * Do not generate a corpse fork if the violation is a fatal one
+        * or the process wants synchronous EXC_RESOURCE exceptions.
+        */
+       if (is_fatal || send_sync_exc_resource || exc_via_corpse_forking == 0) {
+               /* Do not send an EXC_RESOURCE if corpse_for_fatal_memkill is set */
+               if (send_sync_exc_resource || corpse_for_fatal_memkill == 0) {
                        /*
                         * Use the _internal_ variant so that no user-space
                         * process can resume our task from under us.
@@ -5396,6 +5523,17 @@ task_get_phys_footprint_limit(
 }
 #endif /* CONFIG_MEMORYSTATUS */
 
+void
+task_set_thread_limit(task_t task, uint16_t thread_limit)
+{
+       assert(task != kernel_task);
+       if (thread_limit <= TASK_MAX_THREAD_LIMIT) {
+               task_lock(task);
+               task->task_thread_limit = thread_limit;
+               task_unlock(task);
+       }
+}
+
 /*
  * We need to export some functions to other components that
  * are currently implemented in macros within the osfmk
@@ -6153,7 +6291,8 @@ task_set_could_also_use_secluded_mem(
 
 boolean_t
 task_can_use_secluded_mem(
-       task_t  task)
+       task_t          task,
+       boolean_t       is_alloc)
 {
        if (task->task_can_use_secluded_mem) {
                assert(task->task_could_use_secluded_mem);
@@ -6165,6 +6304,20 @@ task_can_use_secluded_mem(
                assert(num_tasks_can_use_secluded_mem > 0);
                return TRUE;
        }
+
+       /*
+        * If a single task is using more than the secluded_shutoff_trigger
+        * amount of memory, allow it to dip into secluded and also begin
+        * suppression of secluded memory until the task exits.
+        */
+       if (is_alloc && secluded_shutoff_trigger != 0) {
+               uint64_t phys_used = get_task_phys_footprint(task);
+               if (phys_used > secluded_shutoff_trigger) {
+                       start_secluded_suppression(task);
+                       return TRUE;
+               }
+       }
+
        return FALSE;
 }
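The suppression begun here is paired with the stop_secluded_suppression() call added to task_terminate_internal() earlier in this diff. A minimal sketch of the intended lifecycle, assuming a task_t named task and a nonzero secluded_shutoff_trigger (illustrative only, not part of this commit):

        /* On an allocation, a task whose footprint exceeds the shutoff
         * trigger is let into secluded memory and starts suppressing the
         * secluded pool for everyone else ... */
        if (task_can_use_secluded_mem(task, TRUE /* is_alloc */)) {
                /* the allocation may be satisfied from secluded memory */
        }

        /* ... and the suppression is undone when the task terminates. */
        if (task->task_suppressed_secluded) {
                stop_secluded_suppression(task);
        }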
 
@@ -6219,3 +6372,38 @@ task_self_region_footprint_set(
        }
        task_unlock(curtask);
 }
+
+void
+task_set_darkwake_mode(task_t task, boolean_t set_mode)
+{
+       assert(task);
+
+       task_lock(task);
+
+       if (set_mode) {
+               task->t_flags |= TF_DARKWAKE_MODE;
+       } else {
+               task->t_flags &= ~(TF_DARKWAKE_MODE);
+       }
+
+       task_unlock(task);
+}
+
+boolean_t
+task_get_darkwake_mode(task_t task)
+{
+       assert(task);
+       return ((task->t_flags & TF_DARKWAKE_MODE) != 0);
+}
+
+#if __arm64__
+void
+task_set_legacy_footprint(
+       task_t          task,
+       boolean_t       new_val)
+{
+       task_lock(task);
+       task->task_legacy_footprint = new_val;
+       task_unlock(task);
+}
+#endif /* __arm64__ */
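A hedged sketch of the reworked task_freeze() calling convention (the task and dirty_budget variables below are placeholders, not from this change): an eval_only pass populates the counters without freezing anything, and a second pass performs the freeze, reporting any failure reason through freezer_error_code.

        uint32_t purgeable = 0, wired = 0, clean = 0, dirty = 0, shared = 0;
        int freezer_error_code = 0;
        kern_return_t kr;

        /* Pass 1: evaluate only -- size the task, freeze nothing. */
        kr = task_freeze(task, &purgeable, &wired, &clean, &dirty,
                         dirty_budget, &shared, &freezer_error_code, TRUE);

        if (kr == KERN_SUCCESS && freezer_error_code == 0) {
                /* Pass 2: actually freeze, under the same dirty-page budget. */
                kr = task_freeze(task, &purgeable, &wired, &clean, &dirty,
                                 dirty_budget, &shared, &freezer_error_code, FALSE);
        }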
diff --git a/osfmk/kern/task.h b/osfmk/kern/task.h
index f0cbdff1dcce81ff19f6afe823d7b5d2c7afa936..fe43b2db1098a9a9485d6b416d4ed3a30b6ae1c7 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2010, 2015 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
@@ -150,11 +150,12 @@ struct task {
        _Atomic uint32_t        ref_count;      /* Number of references to me */
        boolean_t       active;         /* Task has not been terminated */
        boolean_t       halting;        /* Task is being halted */
+       /* Virtual timers */
+       uint32_t                vtimers;
 
        /* Miscellaneous */
        vm_map_t        map;            /* Address space description */
        queue_chain_t   tasks;  /* global list of tasks */
-       void            *user_data;     /* Arbitrary data settable via IPC */
 
 #if defined(CONFIG_SCHED_MULTIQ)
        sched_group_t sched_group;
@@ -182,14 +183,12 @@ struct task {
        /* Task security and audit tokens */
        security_token_t sec_token;
        audit_token_t   audit_token;
-        
+
        /* Statistics */
        uint64_t                total_user_time;        /* terminated threads only */
        uint64_t                total_system_time;
        uint64_t                total_ptime;
-       
-       /* Virtual timers */
-       uint32_t                vtimers;
+       uint64_t                total_runnable_time;
 
        /* IPC structures */
        decl_lck_mtx_data(,itk_lock_data)
@@ -210,12 +209,11 @@ struct task {
 
        struct ipc_space *itk_space;
 
+       ledger_t        ledger;
        /* Synchronizer ownership information */
        queue_head_t    semaphore_list;         /* list of owned semaphores   */
        int             semaphores_owned;       /* number of semaphores owned */
 
-       ledger_t        ledger;
-
        unsigned int    priv_flags;                     /* privilege resource flags */
 #define VM_BACKING_STORE_PRIV  0x1
 
@@ -257,15 +255,27 @@ struct task {
 #define TF_LRETURNWAITER        0x00000200                              /* task is waiting for TF_LRETURNWAIT to get cleared */
 #define TF_PLATFORM             0x00000400                              /* task is a platform binary */
 #define TF_CA_CLIENT_WI         0x00000800                              /* task has CA_CLIENT work interval */
+#define TF_DARKWAKE_MODE        0x00001000                              /* task is in darkwake mode */
+
+/*
+ * Task is running within a 64-bit address space.
+ */
+#define task_has_64Bit_addr(task)      \
+       (((task)->t_flags & TF_64B_ADDR) != 0)
+#define task_set_64Bit_addr(task)      \
+       ((task)->t_flags |= TF_64B_ADDR)
+#define task_clear_64Bit_addr(task)    \
+       ((task)->t_flags &= ~TF_64B_ADDR)
 
-#define task_has_64BitAddr(task)       \
-        (((task)->t_flags & TF_64B_ADDR) != 0)
-#define task_set_64BitAddr(task)       \
-        ((task)->t_flags |= TF_64B_ADDR)
-#define task_clear_64BitAddr(task)     \
-        ((task)->t_flags &= ~TF_64B_ADDR)
-#define task_has_64BitData(task)    \
-        (((task)->t_flags & TF_64B_DATA) != 0)
+/*
+ * Task is using 64-bit machine state.
+ */
+#define task_has_64Bit_data(task)      \
+       (((task)->t_flags & TF_64B_DATA) != 0)
+#define task_set_64Bit_data(task)      \
+       ((task)->t_flags |= TF_64B_DATA)
+#define task_clear_64Bit_data(task)    \
+       ((task)->t_flags &= ~TF_64B_DATA)
 
 #define task_is_a_corpse(task)      \
         (((task)->t_flags & TF_CORPSE) != 0)
@@ -316,8 +326,11 @@ struct task {
                 applied_ru_cpu_ext     :4;
        uint8_t  rusage_cpu_flags;
        uint8_t  rusage_cpu_percentage;         /* Task-wide CPU limit percentage */
-       uint64_t rusage_cpu_interval;           /* Task-wide CPU limit interval */
        uint8_t  rusage_cpu_perthr_percentage;  /* Per-thread CPU limit percentage */
+#if MACH_ASSERT
+       int8_t          suspends_outstanding;   /* suspends this task performed in excess of resumes */
+#endif
+       uint64_t rusage_cpu_interval;           /* Task-wide CPU limit interval */
        uint64_t rusage_cpu_perthr_interval;    /* Per-thread CPU limit interval */
        uint64_t rusage_cpu_deadline;
        thread_call_t rusage_cpu_callt;
@@ -338,10 +351,6 @@ struct task {
 
        vm_extmod_statistics_data_t     extmod_statistics;
 
-#if MACH_ASSERT
-       int8_t          suspends_outstanding;   /* suspends this task performed in excess of resumes */
-#endif
-
        struct task_requested_policy requested_policy;
        struct task_effective_policy effective_policy;
 
@@ -393,8 +402,13 @@ struct task {
        queue_head_t    task_objq;
        decl_lck_mtx_data(,task_objq_lock) /* protects "task_objq" */
 
-       boolean_t       task_region_footprint;
-
+       unsigned int    task_thread_limit:16;
+#if __arm64__
+       unsigned int    task_legacy_footprint:1;
+#endif /* __arm64__ */
+       unsigned int    task_region_footprint:1;
+       unsigned int    task_has_crossed_thread_limit:1;
+       uint32_t        exec_token;
        /*
         * A task's coalition set is "adopted" in task_create_internal
         * and unset in task_deallocate_internal, so each array member
@@ -416,15 +430,33 @@ struct task {
 #endif /* HYPERVISOR */
 
 #if CONFIG_SECLUDED_MEMORY
-       boolean_t       task_can_use_secluded_mem;
-       boolean_t       task_could_use_secluded_mem;
-       boolean_t       task_could_also_use_secluded_mem;
+       uint8_t task_can_use_secluded_mem;
+       uint8_t task_could_use_secluded_mem;
+       uint8_t task_could_also_use_secluded_mem;
+       uint8_t task_suppressed_secluded;
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+       uint32_t task_exc_guard;
+
        queue_head_t    io_user_clients;
-       uint32_t        exec_token;
 };
 
+#define TASK_EXC_GUARD_VM_DELIVER            0x01 /* Deliver virtual memory EXC_GUARD exceptions */
+#define TASK_EXC_GUARD_VM_ONCE               0x02 /* Deliver them only once */
+#define TASK_EXC_GUARD_VM_CORPSE             0x04 /* Deliver them via a forked corpse */
+#define TASK_EXC_GUARD_VM_FATAL              0x08 /* Virtual Memory EXC_GUARD delivery is fatal */
+#define TASK_EXC_GUARD_VM_ALL                0x0f
+
+#define TASK_EXC_GUARD_MP_DELIVER            0x10 /* Deliver mach port EXC_GUARD exceptions */
+#define TASK_EXC_GUARD_MP_ONCE               0x20 /* Deliver them only once */
+#define TASK_EXC_GUARD_MP_CORPSE             0x40 /* Deliver them via a forked corpse */
+#define TASK_EXC_GUARD_MP_FATAL              0x80 /* mach port EXC_GUARD delivery is fatal */
+
+extern uint32_t task_exc_guard_default;
+
+extern kern_return_t
+task_violated_guard(mach_exception_code_t, mach_exception_subcode_t, void *);
+
 #define task_lock(task)                        lck_mtx_lock(&(task)->lock)
 #define        task_lock_assert_owned(task)    LCK_MTX_ASSERT(&(task)->lock, LCK_MTX_ASSERT_OWNED)
 #define task_lock_try(task)            lck_mtx_try_lock(&(task)->lock)
@@ -552,8 +584,9 @@ extern kern_return_t        task_freeze(
                                                        uint32_t        *clean_count,
                                                        uint32_t        *dirty_count,
                                                        uint32_t        dirty_budget,
-                                                       boolean_t       *shared,
-                                                       boolean_t       walk_only);
+                                                       uint32_t        *shared_count,
+                                                       int             *freezer_error_code,
+                                                       boolean_t       eval_only);
 
 /* Thaw a currently frozen task */
 extern kern_return_t   task_thaw(
@@ -577,6 +610,7 @@ extern kern_return_t        task_create_internal(
                                                        coalition_t     *parent_coalitions,
                                                        boolean_t       inherit_memory,
                                                        boolean_t       is_64bit,
+                                                       boolean_t       is_64bit_data,
                                                        uint32_t        flags,
                                                        uint32_t        procflags,
                                                        task_t          *child_task);   /* OUT */
@@ -625,7 +659,11 @@ extern void                task_vtimer_update(
 
 extern void            task_set_64bit(
                                        task_t          task,
-                                       boolean_t       is64bit);
+                                       boolean_t       is_64bit,
+                                       boolean_t       is_64bit_data);
+
+extern boolean_t       task_get_64bit_data(
+                                               task_t task);
 
 extern void    task_set_platform_binary(
                                        task_t task,
@@ -634,9 +672,6 @@ extern bool task_set_ca_client_wi(
                                        task_t task,
                                        boolean_t ca_client_wi);
 
-extern void            task_backing_store_privileged(
-                                       task_t          task);
-
 extern void            task_set_dyld_info(
                                        task_t          task,
                                        mach_vm_address_t addr,
@@ -667,7 +702,9 @@ extern uint64_t     get_task_resident_size(task_t);
 extern uint64_t        get_task_compressed(task_t);
 extern uint64_t        get_task_resident_max(task_t);
 extern uint64_t        get_task_phys_footprint(task_t);
-extern uint64_t        get_task_phys_footprint_recent_max(task_t);
+#if CONFIG_LEDGER_INTERVAL_MAX
+extern uint64_t        get_task_phys_footprint_interval_max(task_t, int reset);
+#endif /* CONFIG_LEDGER_INTERVAL_MAX */
 extern uint64_t        get_task_phys_footprint_lifetime_max(task_t);
 extern uint64_t        get_task_phys_footprint_limit(task_t);
 extern uint64_t        get_task_purgeable_size(task_t);
@@ -686,6 +723,9 @@ extern uint64_t get_task_alternate_accounting(task_t);
 extern uint64_t get_task_alternate_accounting_compressed(task_t);
 extern uint64_t get_task_memory_region_count(task_t);
 extern uint64_t get_task_page_table(task_t);
+extern uint64_t get_task_network_nonvolatile(task_t);
+extern uint64_t get_task_network_nonvolatile_compressed(task_t);
+extern uint64_t get_task_wired_mem(task_t);
 
 extern kern_return_t task_convert_phys_footprint_limit(int, int *);
 extern kern_return_t task_set_phys_footprint_limit_internal(task_t, int, int *, boolean_t, boolean_t);
@@ -699,6 +739,9 @@ extern void task_set_memlimit_is_fatal(task_t task, boolean_t memlimit_is_fatal)
 extern boolean_t task_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active);
 extern void task_mark_has_triggered_exc_resource(task_t task, boolean_t memlimit_is_active);
 
+extern void task_set_thread_limit(task_t task, uint16_t thread_limit);
+
+
 extern boolean_t       is_kerneltask(task_t task);
 extern boolean_t       is_corpsetask(task_t task);
 
@@ -735,6 +778,10 @@ struct _task_ledger_indices {
        int purgeable_nonvolatile;
        int purgeable_volatile_compressed;
        int purgeable_nonvolatile_compressed;
+       int network_volatile;
+       int network_nonvolatile;
+       int network_volatile_compressed;
+       int network_nonvolatile_compressed;
        int platform_idle_wakeups;
        int interrupt_wakeups;
 #if CONFIG_SCHED_SFI
@@ -826,10 +873,19 @@ extern void task_set_could_use_secluded_mem(
 extern void task_set_could_also_use_secluded_mem(
        task_t task,
        boolean_t could_also_use_secluded_mem);
-extern boolean_t task_can_use_secluded_mem(task_t task);
+extern boolean_t task_can_use_secluded_mem(
+       task_t task,
+       boolean_t is_allocate);
 extern boolean_t task_could_use_secluded_mem(task_t task);
 #endif /* CONFIG_SECLUDED_MEMORY */
 
+extern void task_set_darkwake_mode(task_t, boolean_t);
+extern boolean_t task_get_darkwake_mode(task_t);
+
+#if __arm64__
+extern void task_set_legacy_footprint(task_t task, boolean_t new_val);
+#endif /* __arm64__ */
+
 #if CONFIG_MACF
 extern struct label *get_task_crash_label(task_t task);
 #endif /* CONFIG_MACF */
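As a reading aid for the new task_exc_guard bit mask, a small illustrative helper (hypothetical, not part of this commit) that decodes the virtual-memory bits; the mach-port (MP) bits are interpreted the same way.

        /* Illustrative only: report how VM EXC_GUARD violations would be handled. */
        static void
        describe_vm_exc_guard(uint32_t exc_guard)
        {
                if ((exc_guard & TASK_EXC_GUARD_VM_DELIVER) == 0) {
                        printf("VM EXC_GUARD: not delivered\n");
                        return;
                }
                printf("VM EXC_GUARD: delivered%s%s%s\n",
                    (exc_guard & TASK_EXC_GUARD_VM_ONCE)   ? ", only once"           : "",
                    (exc_guard & TASK_EXC_GUARD_VM_CORPSE) ? ", via a forked corpse" : "",
                    (exc_guard & TASK_EXC_GUARD_VM_FATAL)  ? ", fatal"               : "");
        }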
diff --git a/osfmk/kern/task_policy.c b/osfmk/kern/task_policy.c
index ca3e83f1847af1fb4c20c42be1c4f6eb39fe3dc4..f44ba4c845a4d10d9717ce3f4a1f512fc512ea19 100644 (file)
@@ -248,11 +248,7 @@ int proc_tal_disk_tier        = THROTTLE_LEVEL_TIER1;
 
 int proc_graphics_timer_qos   = (LATENCY_QOS_TIER_0 & 0xFF);
 
-#if CONFIG_EMBEDDED
-const int proc_default_bg_iotier  = THROTTLE_LEVEL_TIER3;
-#else
 const int proc_default_bg_iotier  = THROTTLE_LEVEL_TIER2;
-#endif
 
 /* Latency/throughput QoS fields remain zeroed, i.e. TIER_UNSPECIFIED at creation */
 const struct task_requested_policy default_task_requested_policy = {
@@ -323,8 +319,12 @@ qos_throughput_policy_package(uint32_t qv) {
        return (qv == THROUGHPUT_QOS_TIER_UNSPECIFIED) ? THROUGHPUT_QOS_TIER_UNSPECIFIED : ((0xFE << 16) | qv);
 }
 
+#define TASK_POLICY_SUPPRESSION_DISABLE  0x1
+#define TASK_POLICY_SUPPRESSION_IOTIER2  0x2
+#define TASK_POLICY_SUPPRESSION_NONDONOR 0x4
 /* TEMPORARY boot-arg controlling task_policy suppression (App Nap) */
-static boolean_t task_policy_suppression_disable = FALSE;
+static boolean_t task_policy_suppression_flags = TASK_POLICY_SUPPRESSION_IOTIER2 |
+                                                 TASK_POLICY_SUPPRESSION_NONDONOR;
 
 kern_return_t
 task_policy_set(
@@ -462,7 +462,8 @@ task_policy_set(
                        return kr;
 
                /* TEMPORARY disablement of task suppression */
-               if (task_policy_suppression_disable && info->active)
+               if (info->active &&
+                   (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_DISABLE))
                        return KERN_SUCCESS;
 
                struct task_pend_token pend_token = {};
@@ -826,6 +827,11 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
                                next.tep_qos_ceiling = THREAD_QOS_UTILITY;
                                break;
 
+                       case TASK_DARWINBG_APPLICATION:
+                               /* i.e. 'DARWIN_BG throttled background application' */
+                               next.tep_qos_ceiling = THREAD_QOS_BACKGROUND;
+                               break;
+
                        case TASK_UNSPECIFIED:
                        default:
                                /* Apps that don't have an application role get
@@ -849,16 +855,21 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
         *
         * Backgrounding due to apptype does.
         */
-       if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg)
+       if (requested.trp_int_darwinbg || requested.trp_ext_darwinbg ||
+           next.tep_role == TASK_DARWINBG_APPLICATION)
                wants_watchersbg = wants_all_sockets_bg = wants_darwinbg = TRUE;
 
-       /* Background TAL apps are throttled when TAL is enabled */
+       /*
+        * Deprecated TAL implementation for TAL apptype
+        * Background TAL apps are throttled when TAL is enabled
+        */
        if (requested.trp_apptype       == TASK_APPTYPE_APP_TAL         &&
            requested.trp_role          == TASK_BACKGROUND_APPLICATION  &&
            requested.trp_tal_enabled   == 1) {
                next.tep_tal_engaged = 1;
        }
 
+       /* New TAL implementation based on TAL role alone, works for all apps */
        if ((requested.trp_apptype      == TASK_APPTYPE_APP_DEFAULT ||
             requested.trp_apptype      == TASK_APPTYPE_APP_TAL)    &&
             requested.trp_role         == TASK_THROTTLE_APPLICATION) {
@@ -941,13 +952,13 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
                next.tep_io_passive = 1;
 
        /* Calculate suppression-active flag */
-       boolean_t memorystatus_appnap_transition = FALSE;
+       boolean_t appnap_transition = FALSE;
 
        if (requested.trp_sup_active && requested.trp_boosted == 0)
                next.tep_sup_active = 1;
 
        if (task->effective_policy.tep_sup_active != next.tep_sup_active)
-               memorystatus_appnap_transition = TRUE;
+               appnap_transition = TRUE;
 
        /* Calculate timer QOS */
        int latency_qos = requested.trp_base_latency_qos;
@@ -1001,10 +1012,14 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
        switch (requested.trp_apptype) {
                case TASK_APPTYPE_APP_TAL:
                case TASK_APPTYPE_APP_DEFAULT:
-                       if (requested.trp_ext_darwinbg == 0)
-                               next.tep_live_donor = 1;
-                       else
+                       if (requested.trp_ext_darwinbg == 1 ||
+                           (next.tep_sup_active == 1 &&
+                            (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_NONDONOR)) ||
+                           next.tep_role == TASK_DARWINBG_APPLICATION) {
                                next.tep_live_donor = 0;
+                       } else {
+                               next.tep_live_donor = 1;
+                       }
                        break;
 
                case TASK_APPTYPE_DAEMON_INTERACTIVE:
@@ -1193,11 +1208,13 @@ task_policy_update_internal_locked(task_t task, boolean_t in_create, task_pend_t
 
        /*
         * Use the app-nap transitions to influence the
-        * transition of the process within the jetsam band.
+        * transition of the process within the jetsam band
+        * [and optionally its live-donor status]
         * On macOS only.
         */
-       if (memorystatus_appnap_transition == TRUE) {
+       if (appnap_transition == TRUE) {
                if (task->effective_policy.tep_sup_active == 1) {
+                       
                        memorystatus_update_priority_for_appnap(((proc_t) task->bsd_info), TRUE);
                } else {
                        memorystatus_update_priority_for_appnap(((proc_t) task->bsd_info), FALSE);
@@ -1761,6 +1778,9 @@ proc_darwin_role_to_task_role(int darwin_role, int* task_role)
                case PRIO_DARWIN_ROLE_TAL_LAUNCH:
                        role = TASK_THROTTLE_APPLICATION;
                        break;
+               case PRIO_DARWIN_ROLE_DARWIN_BG:
+                       role = TASK_DARWINBG_APPLICATION;
+                       break;
                default:
                        return EINVAL;
        }
@@ -1784,6 +1804,8 @@ proc_task_role_to_darwin_role(int task_role)
                        return PRIO_DARWIN_ROLE_UI;
                case TASK_THROTTLE_APPLICATION:
                        return PRIO_DARWIN_ROLE_TAL_LAUNCH;
+               case TASK_DARWINBG_APPLICATION:
+                       return PRIO_DARWIN_ROLE_DARWIN_BG;
                case TASK_UNSPECIFIED:
                default:
                        return PRIO_DARWIN_ROLE_DEFAULT;
@@ -2305,9 +2327,14 @@ proc_init_cpumon_params(void)
        proc_max_cpumon_interval *= NSEC_PER_SEC;
 
        /* TEMPORARY boot arg to control App suppression */
-       PE_parse_boot_argn("task_policy_suppression_disable",
-                          &task_policy_suppression_disable,
-                          sizeof(task_policy_suppression_disable));
+       PE_parse_boot_argn("task_policy_suppression_flags",
+                          &task_policy_suppression_flags,
+                          sizeof(task_policy_suppression_flags));
+
+       /* adjust suppression disk policy if called for in boot arg */
+       if (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_IOTIER2) {
+               proc_suppressed_disk_tier = THROTTLE_LEVEL_TIER2;
+       }
 }
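Because the boot-arg is now a bit mask rather than an on/off switch, a few hedged example values (illustrative, not taken from this change) and their effect:

        /* Hypothetical boot-args settings:
         *   task_policy_suppression_flags=0x1  (DISABLE)  -- suppression requests
         *       return KERN_SUCCESS but are not applied.
         *   task_policy_suppression_flags=0x6  (IOTIER2 | NONDONOR, the default)
         *       -- suppressed tasks drop to Tier-2 disk I/O and stop being
         *       live importance donors.
         *   task_policy_suppression_flags=0x0  -- suppression applies with
         *       neither side effect.
         */
        boolean_t suppression_ignored =
            (task_policy_suppression_flags & TASK_POLICY_SUPPRESSION_DISABLE) != 0;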
 
 /*
@@ -3183,8 +3210,48 @@ task_importance_reset(__imp_only task_t task)
 #endif /* IMPORTANCE_INHERITANCE */
 }
 
+void
+task_importance_init_from_parent(__imp_only task_t new_task, __imp_only task_t parent_task)
+{
 #if IMPORTANCE_INHERITANCE
+       ipc_importance_task_t new_task_imp = IIT_NULL;
+
+       new_task->task_imp_base = NULL;
+       if (!parent_task) return;
+
+       if (task_is_marked_importance_donor(parent_task)) {
+               new_task_imp = ipc_importance_for_task(new_task, FALSE);
+               assert(IIT_NULL != new_task_imp);
+               ipc_importance_task_mark_donor(new_task_imp, TRUE);
+       }
+       if (task_is_marked_live_importance_donor(parent_task)) {
+               if (IIT_NULL == new_task_imp)
+                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
+               assert(IIT_NULL != new_task_imp);
+               ipc_importance_task_mark_live_donor(new_task_imp, TRUE);
+       }
+       /* Do not inherit 'receiver' on fork, vfexec or true spawn */
+       if (task_is_exec_copy(new_task) &&
+                               task_is_marked_importance_receiver(parent_task)) {
+               if (IIT_NULL == new_task_imp)
+                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
+               assert(IIT_NULL != new_task_imp);
+               ipc_importance_task_mark_receiver(new_task_imp, TRUE);
+       }
+       if (task_is_marked_importance_denap_receiver(parent_task)) {
+               if (IIT_NULL == new_task_imp)
+                       new_task_imp = ipc_importance_for_task(new_task, FALSE);
+               assert(IIT_NULL != new_task_imp);
+               ipc_importance_task_mark_denap_receiver(new_task_imp, TRUE);
+       }
+       if (IIT_NULL != new_task_imp) {
+               assert(new_task->task_imp_base == new_task_imp);
+               ipc_importance_task_release(new_task_imp);
+       }
+#endif /* IMPORTANCE_INHERITANCE */
+}
 
+#if IMPORTANCE_INHERITANCE
 /*
  * Sets the task boost bit to the provided value.  Does NOT run the update function.
  *
diff --git a/osfmk/kern/telemetry.c b/osfmk/kern/telemetry.c
index 120885eac63b46b6175f0eb2ed92c37c15be9543..723d48f5b46ae4a6a29622ce6dc1ccaaf8201e8c 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 #include <mach/host_priv.h>
@@ -35,9 +35,9 @@
 #include <kern/debug.h>
 #include <kern/host.h>
 #include <kern/kalloc.h>
-#include <kern/kern_types.h> 
-#include <kern/locks.h> 
-#include <kern/misc_protos.h> 
+#include <kern/kern_types.h>
+#include <kern/locks.h>
+#include <kern/misc_protos.h>
 #include <kern/sched.h>
 #include <kern/sched_prim.h>
 #include <kern/telemetry.h>
@@ -52,6 +52,7 @@
 
 #include <kperf/callstack.h>
 #include <kern/backtrace.h>
+#include <kern/monotonic.h>
 
 #include <sys/kdebug.h>
 #include <uuid/uuid.h>
@@ -93,10 +94,11 @@ volatile boolean_t  telemetry_needs_timer_arming_record = FALSE;
  * If TRUE, record micro-stackshot samples for all tasks.
  * If FALSE, only sample tasks which are marked for telemetry.
  */
-boolean_t                      telemetry_sample_all_tasks = FALSE;
-uint32_t                       telemetry_active_tasks = 0; // Number of tasks opted into telemetry
+boolean_t telemetry_sample_all_tasks = FALSE;
+boolean_t telemetry_sample_pmis = FALSE;
+uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry
 
-uint32_t                       telemetry_timestamp = 0;
+uint32_t telemetry_timestamp = 0;
 
 /*
  * The telemetry_buffer is responsible
@@ -109,12 +111,16 @@ struct micro_snapshot_buffer telemetry_buffer = {0, 0, 0, 0};
 int                                    telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
 int                                    telemetry_buffer_notify_at = 0;
 
-lck_grp_t              telemetry_lck_grp;
-lck_mtx_t              telemetry_mtx;
+lck_grp_t telemetry_lck_grp;
+lck_mtx_t telemetry_mtx;
+lck_mtx_t telemetry_pmi_mtx;
 
-#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while(0)
+#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while (0)
 #define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
-#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while(0)
+#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while (0)
+
+#define TELEMETRY_PMI_LOCK() do { lck_mtx_lock(&telemetry_pmi_mtx); } while (0)
+#define TELEMETRY_PMI_UNLOCK() do { lck_mtx_unlock(&telemetry_pmi_mtx); } while (0)
 
 void telemetry_init(void)
 {
@@ -123,6 +129,7 @@ void telemetry_init(void)
 
        lck_grp_init(&telemetry_lck_grp, "telemetry group", LCK_GRP_ATTR_NULL);
        lck_mtx_init(&telemetry_mtx, &telemetry_lck_grp, LCK_ATTR_NULL);
+       lck_mtx_init(&telemetry_pmi_mtx, &telemetry_lck_grp, LCK_ATTR_NULL);
 
        if (!PE_parse_boot_argn("telemetry_buffer_size", &telemetry_buffer.size, sizeof(telemetry_buffer.size))) {
                telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE;
@@ -180,7 +187,7 @@ void telemetry_init(void)
  * enable_disable == 0: turn it off
  */
 void
-telemetry_global_ctl(int enable_disable) 
+telemetry_global_ctl(int enable_disable)
 {
        if (enable_disable == 1) {
                telemetry_sample_all_tasks = TRUE;
@@ -222,9 +229,9 @@ telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
                task->t_flags |= reasons;
                if ((origflags & TF_TELEMETRY) == 0) {
                        OSIncrementAtomic(&telemetry_active_tasks);
-#if TELEMETRY_DEBUG                    
+#if TELEMETRY_DEBUG
                        printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
-#endif                 
+#endif
                }
        } else {
                task->t_flags &= ~reasons;
@@ -258,15 +265,15 @@ telemetry_is_active(thread_t thread)
                return FALSE;
        }
 
-       if (telemetry_sample_all_tasks == TRUE) {
-               return (TRUE);
+       if (telemetry_sample_all_tasks || telemetry_sample_pmis) {
+               return TRUE;
        }
 
        if ((telemetry_active_tasks > 0) && ((thread->task->t_flags & TF_TELEMETRY) != 0)) {
-               return (TRUE);
+               return TRUE;
        }
-       return (FALSE);
+
+       return FALSE;
 }
 
 /*
@@ -284,11 +291,82 @@ int telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval
        return (0);
 }
 
+#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
+static void
+telemetry_pmi_handler(bool user_mode, __unused void *ctx)
+{
+       telemetry_mark_curthread(user_mode, TRUE);
+}
+#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
+
+int telemetry_pmi_setup(enum telemetry_pmi pmi_ctr, uint64_t period)
+{
+#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
+       static boolean_t sample_all_tasks_aside = FALSE;
+       static uint32_t active_tasks_aside = FALSE;
+       int error = 0;
+       const char *name = "?";
+
+       unsigned int ctr = 0;
+
+       TELEMETRY_PMI_LOCK();
+
+       switch (pmi_ctr) {
+       case TELEMETRY_PMI_NONE:
+               if (!telemetry_sample_pmis) {
+                       error = 1;
+                       goto out;
+               }
+
+               telemetry_sample_pmis = FALSE;
+               telemetry_sample_all_tasks = sample_all_tasks_aside;
+               telemetry_active_tasks = active_tasks_aside;
+               error = mt_microstackshot_stop();
+               if (!error) {
+                       printf("telemetry: disabling ustackshot on PMI\n");
+               }
+               goto out;
+
+       case TELEMETRY_PMI_INSTRS:
+               ctr = MT_CORE_INSTRS;
+               name = "instructions";
+               break;
+
+       case TELEMETRY_PMI_CYCLES:
+               ctr = MT_CORE_CYCLES;
+               name = "cycles";
+               break;
+
+       default:
+               error = 1;
+               goto out;
+       }
+
+       telemetry_sample_pmis = TRUE;
+       sample_all_tasks_aside = telemetry_sample_all_tasks;
+       active_tasks_aside = telemetry_active_tasks;
+       telemetry_sample_all_tasks = FALSE;
+       telemetry_active_tasks = 0;
+
+       error = mt_microstackshot_start(ctr, period, telemetry_pmi_handler, NULL);
+       if (!error) {
+               printf("telemetry: ustackshot every %llu %s\n", period, name);
+       }
+
+out:
+       TELEMETRY_PMI_UNLOCK();
+       return error;
+#else /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
+#pragma unused(pmi_ctr, period)
+       return 1;
+#endif /* !defined(MT_CORE_INSTRS) || !defined(MT_CORE_CYCLES) */
+}
+
 /*
  * Mark the current thread for an interrupt-based
  * telemetry record, to be sampled at the next AST boundary.
  */
-void telemetry_mark_curthread(boolean_t interrupted_userspace)
+void telemetry_mark_curthread(boolean_t interrupted_userspace, boolean_t pmi)
 {
        uint32_t ast_bits = 0;
        thread_t thread = current_thread();
@@ -302,6 +380,9 @@ void telemetry_mark_curthread(boolean_t interrupted_userspace)
        }
 
        ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
+       if (pmi) {
+               ast_bits |= AST_TELEMETRY_PMI;
+       }
 
        telemetry_needs_record = FALSE;
        thread_ast_set(thread, ast_bits);
@@ -324,33 +405,33 @@ void compute_telemetry(void *arg __unused)
 static void
 telemetry_notify_user(void)
 {
-       mach_port_t user_port;
-       uint32_t        flags = 0;
-       int                     error;
+       mach_port_t user_port = MACH_PORT_NULL;
 
-       error = host_get_telemetry_port(host_priv_self(), &user_port);
-       if ((error != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
+       kern_return_t kr = host_get_telemetry_port(host_priv_self(), &user_port);
+       if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
                return;
        }
 
-       telemetry_notification(user_port, flags);
+       telemetry_notification(user_port, 0);
        ipc_port_release_send(user_port);
 }
 
 void telemetry_ast(thread_t thread, ast_t reasons)
 {
-       assert((reasons & AST_TELEMETRY_ALL) != AST_TELEMETRY_ALL); /* only one is valid at a time */
-
-       boolean_t io_telemetry = (reasons & AST_TELEMETRY_IO) ? TRUE : FALSE;
-       boolean_t interrupted_userspace = (reasons & AST_TELEMETRY_USER) ? TRUE : FALSE;
+       assert((reasons & AST_TELEMETRY_ALL) != 0);
 
-       uint8_t microsnapshot_flags = kInterruptRecord;
+       uint8_t record_type = 0;
+       if (reasons & AST_TELEMETRY_IO) {
+               record_type |= kIORecord;
+       }
+       if (reasons & (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL)) {
+               record_type |= (reasons & AST_TELEMETRY_PMI) ? kPMIRecord :
+                               kInterruptRecord;
+       }
 
-       if (io_telemetry == TRUE)
-               microsnapshot_flags = kIORecord;
+       uint8_t user_telemetry = (reasons & AST_TELEMETRY_USER) ? kUserMode : 0;
 
-       if (interrupted_userspace)
-               microsnapshot_flags |= kUserMode;
+       uint8_t microsnapshot_flags = record_type | user_telemetry;
 
        telemetry_take_sample(thread, microsnapshot_flags, &telemetry_buffer);
 }
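For reference, a hedged summary of the mapping this rework produces from the thread's AST reasons to microsnapshot flags (the k*Record and kUserMode constants come from the microsnapshot headers, not this hunk):

        /* AST reasons set on the thread            -> microsnapshot_flags
         * AST_TELEMETRY_IO                         -> kIORecord
         * AST_TELEMETRY_KERNEL                     -> kInterruptRecord
         * AST_TELEMETRY_USER                       -> kInterruptRecord | kUserMode
         * AST_TELEMETRY_KERNEL | AST_TELEMETRY_PMI -> kPMIRecord
         * AST_TELEMETRY_USER   | AST_TELEMETRY_PMI -> kPMIRecord | kUserMode
         */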
@@ -377,25 +458,10 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct
        if ((task == TASK_NULL) || (task == kernel_task) || task_did_exec(task) || task_is_exec_copy(task))
                return;
 
-       /*
-        * To avoid overloading the system with telemetry requests, make
-        * sure we don't add more requests while existing ones are
-        * in-flight.  Attempt this by checking if we can grab the lock.
-        *
-        * This concerns me a little; this working as intended is
-        * contingent on the workload being done in the context of the
-        * telemetry lock being the expensive part of telemetry.  This
-        * includes populating the buffer and the client gathering it,
-        * but excludes the copyin overhead.
-        */
-       if (!TELEMETRY_TRY_SPIN_LOCK())
-               return;
-
-       TELEMETRY_UNLOCK();
-
        /* telemetry_XXX accessed outside of lock for instrumentation only */
-       /* TODO */
-       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START, microsnapshot_flags, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer));
+       KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START,
+                       microsnapshot_flags, telemetry_bytes_since_last_mark, 0,
+                       (&telemetry_buffer != current_buffer));
 
        p = get_bsdtask_info(task);
 
@@ -444,7 +510,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct
         */
        uint32_t                        uuid_info_count = 0;
        mach_vm_address_t       uuid_info_addr = 0;
-       if (task_has_64BitAddr(task)) {
+       if (task_has_64Bit_addr(task)) {
                struct user64_dyld_all_image_infos task_image_infos;
                if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
                        uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
@@ -475,7 +541,7 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct
                uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
        }
 
-       uint32_t uuid_info_size = (uint32_t)(task_has_64BitAddr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
+       uint32_t uuid_info_size = (uint32_t)(task_has_64Bit_addr(thread->task) ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
        uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
        char     *uuid_info_array = NULL;
 
@@ -505,10 +571,10 @@ void telemetry_take_sample(thread_t thread, uint8_t microsnapshot_flags, struct
        if (dqkeyaddr != 0) {
                uint64_t dqaddr = 0;
                uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task);
-               if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64BitAddr(task) ? 8 : 4)) == 0) &&
+               if ((copyin(dqkeyaddr, (char *)&dqaddr, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) &&
                    (dqaddr != 0) && (dq_serialno_offset != 0)) {
                        uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
-                       if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64BitAddr(task) ? 8 : 4)) == 0) {
+                       if (copyin(dqserialnumaddr, (char *)&dqserialnum, (task_has_64Bit_addr(task) ? 8 : 4)) == 0) {
                                dqserialnum_valid = 1;
                        }
                }
@@ -556,7 +622,7 @@ copytobuffer:
        msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
        msnap->ms_flags = microsnapshot_flags;
        msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
-       msnap->ms_cpu = 0; /* XXX - does this field make sense for a micro-stackshot? */
+       msnap->ms_cpu = cpu_number();
        msnap->ms_time = secs;
        msnap->ms_time_microsecs = usecs;
 
@@ -580,7 +646,7 @@ copytobuffer:
        tsnap->user_time_in_terminated_threads = task->total_user_time;
        tsnap->system_time_in_terminated_threads = task->total_system_time;
        tsnap->suspend_count = task->suspend_count;
-       tsnap->task_size = pmap_resident_count(task->map->pmap);
+       tsnap->task_size = (typeof(tsnap->task_size)) (get_task_phys_footprint(task) / PAGE_SIZE);
        tsnap->faults = task->faults;
        tsnap->pageins = task->pageins;
        tsnap->cow_faults = task->cow_faults;
@@ -588,12 +654,12 @@ copytobuffer:
         * The throttling counters are maintained as 64-bit counters in the proc
         * structure. However, we reserve 32-bits (each) for them in the task_snapshot
         * struct to save space and since we do not expect them to overflow 32-bits. If we
-        * find these values overflowing in the future, the fix would be to simply 
+        * find these values overflowing in the future, the fix would be to simply
         * upgrade these counters to 64-bit in the task_snapshot struct
         */
        tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
        tsnap->did_throttle = (uint32_t) proc_did_throttle(p);
-       
+
        if (task->t_flags & TF_TELEMETRY) {
                tsnap->ss_flags |= kTaskRsrcFlagged;
        }
@@ -619,7 +685,7 @@ copytobuffer:
        tsnap->latency_qos = task_grab_latency_qos(task);
 
        strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
-       if (task_has_64BitAddr(thread->task)) {
+       if (task_has_64Bit_addr(thread->task)) {
                tsnap->ss_flags |= kUser64_p;
        }
 
@@ -660,7 +726,7 @@ copytobuffer:
 
        if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) {
                /* wrap and overwrite */
-               current_buffer->end_point = current_record_start;               
+               current_buffer->end_point = current_record_start;
                current_buffer->current_position = 0;
                if (current_record_start == 0) {
                        /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
@@ -681,7 +747,8 @@ copytobuffer:
        thsnap->ss_flags |= kStacksPCOnly;
        thsnap->ts_qos = thread->effective_policy.thep_qos;
        thsnap->ts_rqos = thread->requested_policy.thrp_qos;
-       thsnap->ts_rqos_override = thread->requested_policy.thrp_qos_override;
+       thsnap->ts_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
+                       thread->requested_policy.thrp_qos_workq_override);
 
        if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) {
                thsnap->ss_flags |= kThreadDarwinBG;
@@ -706,7 +773,7 @@ copytobuffer:
        if (dqserialnum_valid) {
                if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) {
                        /* wrap and overwrite */
-                       current_buffer->end_point = current_record_start;               
+                       current_buffer->end_point = current_record_start;
                        current_buffer->current_position = 0;
                        if (current_record_start == 0) {
                                /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
@@ -720,7 +787,7 @@ copytobuffer:
                current_buffer->current_position += sizeof (dqserialnum);
        }
 
-       if (task_has_64BitAddr(task)) {
+       if (user64) {
                framesize = 8;
                thsnap->ss_flags |= kUser64_p;
        } else {
@@ -772,11 +839,11 @@ copytobuffer:
        }
 
 cancel_sample:
-
        TELEMETRY_UNLOCK();
 
-       /* TODO */
-       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END, notify, telemetry_bytes_since_last_mark, current_buffer->current_position, current_buffer->end_point, (&telemetry_buffer != current_buffer));
+       KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END,
+                       notify, telemetry_bytes_since_last_mark,
+                       current_buffer->current_position, current_buffer->end_point);
 
        if (notify) {
                telemetry_notify_user();
@@ -793,7 +860,7 @@ log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
 {
        struct micro_snapshot *p;
        uint32_t offset;
-       
+
        printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);
 
        buf += pos;
@@ -820,13 +887,14 @@ int telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, boolean_t mark
        int result = 0;
        uint32_t oldest_record_offset;
 
-       /* TODO */
-       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START, mark, telemetry_bytes_since_last_mark, 0, 0, (&telemetry_buffer != current_buffer));
+       KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START,
+                       mark, telemetry_bytes_since_last_mark, 0,
+                       (&telemetry_buffer != current_buffer));
 
        TELEMETRY_LOCK();
 
        if (current_buffer->buffer == 0) {
-               *length = 0;            
+               *length = 0;
                goto out;
        }
 
@@ -910,7 +978,9 @@ out:
 
        TELEMETRY_UNLOCK();
 
-       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END, current_buffer->current_position, *length, current_buffer->end_point, 0, (&telemetry_buffer != current_buffer));
+       KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END,
+                       current_buffer->current_position, *length,
+                       current_buffer->end_point, (&telemetry_buffer != current_buffer));
 
        return (result);
 }
@@ -1007,7 +1077,7 @@ void bootprofile_init(void)
                if (0 == strcmp(type, "boot")) {
                        bootprofile_type = kBootProfileStartTimerAtBoot;
                } else if (0 == strcmp(type, "wake")) {
-                       bootprofile_type = kBootProfileStartTimerAtWake;                        
+                       bootprofile_type = kBootProfileStartTimerAtWake;
                } else {
                        bootprofile_type = kBootProfileDisabled;
                }
@@ -1182,7 +1252,7 @@ int bootprofile_gather(user_addr_t buffer, uint32_t *length)
        BOOTPROFILE_LOCK();
 
        if (bootprofile_buffer == 0) {
-               *length = 0;            
+               *length = 0;
                goto out;
        }
 
diff --git a/osfmk/kern/telemetry.h b/osfmk/kern/telemetry.h
index b5e023401f995c4b6c8b1647e8dc6253576f48e3..166b31c1a82163c569c8fa74b5a028b640255f10 100644 (file)
 
 __BEGIN_DECLS
 
+#define TELEMETRY_CMD_TIMER_EVENT 1
+#define TELEMETRY_CMD_VOUCHER_NAME 2
+#define TELEMETRY_CMD_VOUCHER_STAIN TELEMETRY_CMD_VOUCHER_NAME
+
+enum telemetry_pmi {
+       TELEMETRY_PMI_NONE,
+       TELEMETRY_PMI_INSTRS,
+       TELEMETRY_PMI_CYCLES,
+};
+#define TELEMETRY_CMD_PMI_SETUP 3
+
+#if XNU_KERNEL_PRIVATE
+
 extern volatile boolean_t telemetry_needs_record;
 
 extern void telemetry_init(void);
@@ -46,24 +59,23 @@ extern void telemetry_ast(thread_t thread, uint32_t reasons);
 
 extern int telemetry_gather(user_addr_t buffer, uint32_t *length, boolean_t mark);
 
-extern void telemetry_mark_curthread(boolean_t interrupted_userspace);
+extern void telemetry_mark_curthread(boolean_t interrupted_userspace,
+               boolean_t pmi);
 
 extern void telemetry_task_ctl(task_t task, uint32_t reason, int enable_disable);
 extern void telemetry_task_ctl_locked(task_t task, uint32_t reason, int enable_disable);
 extern void telemetry_global_ctl(int enable_disable);
 
 extern int telemetry_timer_event(uint64_t deadline, uint64_t interval, uint64_t leeway);
-
-#define TELEMETRY_CMD_TIMER_EVENT 1
-#define TELEMETRY_CMD_VOUCHER_NAME 2
-#define TELEMETRY_CMD_VOUCHER_STAIN TELEMETRY_CMD_VOUCHER_NAME
-
+extern int telemetry_pmi_setup(enum telemetry_pmi pmi_type, uint64_t interval);
 
 extern void bootprofile_init(void);
 extern void bootprofile_wake_from_sleep(void);
 extern void bootprofile_get(void **buffer, uint32_t *length);
 extern int bootprofile_gather(user_addr_t buffer, uint32_t *length);
 
+#endif /* XNU_KERNEL_PRIVATE */
+
 __END_DECLS
 
 #endif /* _KERNEL_TELEMETRY_H_ */
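A short hedged sketch of driving the new PMI-based microstackshot control (the sampling period is an arbitrary example): telemetry_pmi_setup() returns 0 on success and takes over telemetry sampling until it is turned back off with TELEMETRY_PMI_NONE.

        /* Hypothetical caller: sample a microstackshot roughly every 10M cycles. */
        int err = telemetry_pmi_setup(TELEMETRY_PMI_CYCLES, 10 * 1000 * 1000);
        if (err == 0) {
                /* ... workload of interest runs here ... */
                (void)telemetry_pmi_setup(TELEMETRY_PMI_NONE, 0);
        }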
diff --git a/osfmk/kern/test_lock.c b/osfmk/kern/test_lock.c
new file mode 100644 (file)
index 0000000..0805606
--- /dev/null
@@ -0,0 +1,932 @@
+#include <mach_ldebug.h>
+#include <debug.h>
+
+#include <mach/kern_return.h>
+#include <mach/mach_host_server.h>
+#include <mach_debug/lockgroup_info.h>
+
+#include <kern/locks.h>
+#include <kern/misc_protos.h>
+#include <kern/kalloc.h>
+#include <kern/thread.h>
+#include <kern/processor.h>
+#include <kern/sched_prim.h>
+#include <kern/debug.h>
+#include <libkern/section_keywords.h>
+#include <machine/atomic.h>
+#include <machine/machine_cpu.h>
+#include <machine/atomic.h>
+#include <string.h>
+#include <kern/kalloc.h>
+
+#include <sys/kdebug.h>
+
+static lck_mtx_t       test_mtx;
+static lck_grp_t       test_mtx_grp;
+static lck_grp_attr_t  test_mtx_grp_attr;
+static lck_attr_t      test_mtx_attr;
+
+static lck_grp_t        test_mtx_stats_grp;
+static lck_grp_attr_t  test_mtx_stats_grp_attr;
+static lck_attr_t      test_mtx_stats_attr;
+
+struct lck_mtx_test_stats_elem {
+       lck_spin_t      lock;
+       uint64_t        samples;
+       uint64_t        avg;
+       uint64_t        max;
+       uint64_t        min;
+       uint64_t        tot;
+};
+
+#define TEST_MTX_LOCK_STATS                    0
+#define TEST_MTX_TRY_LOCK_STATS                        1
+#define TEST_MTX_LOCK_SPIN_STATS               2
+#define TEST_MTX_LOCK_SPIN_ALWAYS_STATS                3
+#define TEST_MTX_TRY_LOCK_SPIN_STATS           4
+#define TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS    5
+#define TEST_MTX_UNLOCK_MTX_STATS              6
+#define TEST_MTX_UNLOCK_SPIN_STATS             7
+#define TEST_MTX_MAX_STATS                     8
+
+struct lck_mtx_test_stats_elem lck_mtx_test_stats[TEST_MTX_MAX_STATS];
+atomic_bool enabled = TRUE;
+
+static void
+init_test_mtx_stats(void)
+{
+       int i;
+
+       lck_grp_attr_setdefault(&test_mtx_stats_grp_attr);
+       lck_grp_init(&test_mtx_stats_grp, "testlck_stats_mtx", &test_mtx_stats_grp_attr);
+       lck_attr_setdefault(&test_mtx_stats_attr);
+
+       atomic_store(&enabled, TRUE);
+       for(i = 0; i < TEST_MTX_MAX_STATS; i++){
+               memset(&lck_mtx_test_stats[i], 0 , sizeof(struct lck_mtx_test_stats_elem));
+               lck_mtx_test_stats[i].min = ~0;
+               lck_spin_init(&lck_mtx_test_stats[i].lock, &test_mtx_stats_grp, &test_mtx_stats_attr);
+       }
+}
+
+static void
+update_test_mtx_stats(
+       uint64_t start,
+       uint64_t end,
+       uint type)
+{
+       if (atomic_load(&enabled) == TRUE) {
+               assert(type < TEST_MTX_MAX_STATS);
+               assert(start <= end);
+
+               uint64_t elapsed = end - start;
+               struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[type];
+
+               lck_spin_lock(&stat->lock);
+
+               stat->samples++;
+               stat->tot += elapsed;
+               stat->avg = stat->tot / stat->samples;
+               if (stat->max < elapsed)
+                       stat->max = elapsed;
+               if (stat->min > elapsed)
+                       stat->min = elapsed;
+               lck_spin_unlock(&stat->lock);
+       }
+}
+
+static void
+erase_test_mtx_stats(
+       uint type)
+{
+       assert(type < TEST_MTX_MAX_STATS);
+       struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[type];
+
+       lck_spin_lock(&stat->lock);
+
+       stat->samples = 0;
+       stat->tot = 0;
+       stat->avg = 0;
+       stat->max = 0;
+       stat->min = ~0;
+
+       lck_spin_unlock(&stat->lock);
+}
+
+void
+erase_all_test_mtx_stats(void)
+{
+       int i;
+       for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
+               erase_test_mtx_stats(i);
+       }
+}
+
+static void
+disable_all_test_mtx_stats(void)
+{
+       atomic_store(&enabled, FALSE);
+}
+
+static void
+enable_all_test_mtx_stats(void)
+{
+       atomic_store(&enabled, TRUE);
+}
+
+static int
+print_test_mtx_stats_string_name(
+       int type_num,
+       char* buffer,
+       int size)
+{
+       char* type = "";
+       switch (type_num) {
+       case TEST_MTX_LOCK_STATS:
+               type = "TEST_MTX_LOCK_STATS";
+               break;
+       case TEST_MTX_TRY_LOCK_STATS:
+               type = "TEST_MTX_TRY_LOCK_STATS";
+               break;
+       case TEST_MTX_LOCK_SPIN_STATS:
+               type = "TEST_MTX_LOCK_SPIN_STATS";
+               break;
+       case TEST_MTX_LOCK_SPIN_ALWAYS_STATS:
+               type = "TEST_MTX_LOCK_SPIN_ALWAYS_STATS";
+               break;
+       case TEST_MTX_TRY_LOCK_SPIN_STATS:
+               type = "TEST_MTX_TRY_LOCK_SPIN_STATS";
+               break;
+       case TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS:
+               type = "TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS";
+               break;
+       case TEST_MTX_UNLOCK_MTX_STATS:
+               type = "TEST_MTX_UNLOCK_MTX_STATS";
+               break;
+       case TEST_MTX_UNLOCK_SPIN_STATS:
+               type = "TEST_MTX_UNLOCK_SPIN_STATS";
+               break;
+       default:
+               break;
+       }
+
+       return snprintf(buffer, size, "%s ", type);
+}
+
+int
+get_test_mtx_stats_string(
+       char* buffer,
+       int size)
+{
+       int string_off = 0;
+       int ret = 0;
+
+       ret = snprintf(&buffer[string_off], size, "\n");
+       size -= ret;
+       string_off += ret;
+
+       int i;
+       for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
+               struct lck_mtx_test_stats_elem* stat = &lck_mtx_test_stats[i];
+
+               ret = snprintf(&buffer[string_off], size, "{ ");
+               size -= ret;
+               string_off += ret;
+
+               lck_spin_lock(&stat->lock);
+               uint64_t time;
+
+               ret = snprintf(&buffer[string_off], size, "samples %llu, ", stat->samples);
+               size -= ret;
+               string_off += ret;
+
+               absolutetime_to_nanoseconds(stat->tot, &time);
+               ret = snprintf(&buffer[string_off], size, "tot %llu ns, ", time);
+               size -= ret;
+               string_off += ret;
+
+               absolutetime_to_nanoseconds(stat->avg, &time);
+               ret = snprintf(&buffer[string_off], size, "avg %llu ns, ", time);
+               size -= ret;
+               string_off += ret;
+
+               absolutetime_to_nanoseconds(stat->max, &time);
+               ret = snprintf(&buffer[string_off], size, "max %llu ns, ", time);
+               size -= ret;
+               string_off += ret;
+
+               absolutetime_to_nanoseconds(stat->min, &time);
+               ret = snprintf(&buffer[string_off], size, "min %llu ns", time);
+               size -= ret;
+               string_off += ret;
+
+               lck_spin_unlock(&stat->lock);
+
+               ret = snprintf(&buffer[string_off], size, " } ");
+               size -= ret;
+               string_off += ret;
+
+               ret = print_test_mtx_stats_string_name(i, &buffer[string_off], size);
+               size -= ret;
+               string_off += ret;
+
+               ret = snprintf(&buffer[string_off], size, "\n");
+               size -= ret;
+               string_off += ret;
+       }
+
+       return string_off;
+}
+
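+/*
+ * One-time setup of the test mutex and its statistics. Safe to call from
+ * multiple threads: later callers spin until the first caller finishes.
+ */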
+void
+lck_mtx_test_init(void)
+{
+       static int first = 0;
+
+       /*
+        * This should be substituted with a version
+        * of dispatch_once for kernel (rdar:39537874)
+        */
+       if (os_atomic_load(&first, acquire) >= 2)
+               return;
+
+       if (os_atomic_cmpxchg(&first, 0, 1, relaxed)){
+               lck_grp_attr_setdefault(&test_mtx_grp_attr);
+               lck_grp_init(&test_mtx_grp, "testlck_mtx", &test_mtx_grp_attr);
+               lck_attr_setdefault(&test_mtx_attr);
+               lck_mtx_init(&test_mtx, &test_mtx_grp, &test_mtx_attr);
+
+               init_test_mtx_stats();
+
+               os_atomic_inc(&first, release);
+       }
+
+       while(os_atomic_load(&first, acquire) < 2);
+}
+
+void
+lck_mtx_test_lock(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_lock(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_STATS);
+}
+
+static void
+lck_mtx_test_try_lock(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_try_lock(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_STATS);
+}
+
+static void
+lck_mtx_test_lock_spin(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_lock_spin(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_SPIN_STATS);
+}
+
+static void
+lck_mtx_test_lock_spin_always(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_lock_spin_always(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_LOCK_SPIN_ALWAYS_STATS);
+}
+
+static void
+lck_mtx_test_try_lock_spin(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_try_lock_spin(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_SPIN_STATS);
+}
+
+static void
+lck_mtx_test_try_lock_spin_always(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_try_lock_spin_always(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS);
+}
+
+void
+lck_mtx_test_unlock(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_unlock(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_MTX_STATS);
+}
+
+static void
+lck_mtx_test_unlock_mtx(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_unlock(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_MTX_STATS);
+}
+
+static void
+lck_mtx_test_unlock_spin(void)
+{
+       uint64_t start;
+
+       start = mach_absolute_time();
+
+       lck_mtx_unlock(&test_mtx);
+
+       update_test_mtx_stats(start, mach_absolute_time(), TEST_MTX_UNLOCK_SPIN_STATS);
+}
+
+#define WARMUP_ITER    1000
+
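+/*
+ * Time iter uncontended lock/unlock pairs for each lock variant as a tight
+ * loop (wall time and thread run time), bypassing the per-sample stats,
+ * and print one summary line per variant into buffer.
+ */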
+int
+lck_mtx_test_mtx_uncontended_loop_time(
+       int iter, char *buffer, int size)
+{
+       int i;
+       uint64_t tot_time[TEST_MTX_MAX_STATS];
+       uint64_t run_time[TEST_MTX_MAX_STATS];
+       uint64_t start;
+       uint64_t start_run;
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_lock(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_lock(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_STATS]);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_try_lock(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_try_lock(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_STATS]);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_lock_spin(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_lock_spin(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_SPIN_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_SPIN_STATS]);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_lock_spin_always(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_lock_spin_always(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_LOCK_SPIN_ALWAYS_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_LOCK_SPIN_ALWAYS_STATS]);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_try_lock_spin(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_try_lock_spin(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_SPIN_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_SPIN_STATS]);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_try_lock_spin_always(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       start_run = thread_get_runtime_self();
+       start = mach_absolute_time();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_try_lock_spin_always(&test_mtx);
+               lck_mtx_unlock(&test_mtx);
+       }
+
+       absolutetime_to_nanoseconds(mach_absolute_time() - start, &tot_time[TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS]);
+       absolutetime_to_nanoseconds(thread_get_runtime_self() - start_run, &run_time[TEST_MTX_TRY_LOCK_SPIN_ALWAYS_STATS]);
+
+       int string_off = 0;
+       int ret = 0;
+
+       ret = snprintf(&buffer[string_off], size, "\n");
+       size -= ret;
+       string_off += ret;
+
+       for (i = 0; i < TEST_MTX_MAX_STATS - 2; i++) {
+
+               ret = snprintf(&buffer[string_off], size, "total time %llu ns total run time %llu ns ", tot_time[i], run_time[i]);
+               size -= ret;
+               string_off += ret;
+
+               ret = print_test_mtx_stats_string_name(i, &buffer[string_off], size);
+               size -= ret;
+               string_off += ret;
+
+               ret = snprintf(&buffer[string_off], size, "\n");
+               size -= ret;
+               string_off += ret;
+       }
+
+       return string_off;
+}
+
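+/*
+ * Exercise lck_mtx_test_lock()/lck_mtx_test_try_lock() with no contention:
+ * warm up with stats disabled, then collect per-sample stats for iter
+ * iterations of each.
+ */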
+static kern_return_t
+lck_mtx_test_mtx_lock_uncontended(
+       int iter)
+{
+       int i;
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for lock
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_lock();
+               lck_mtx_test_unlock_mtx();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_lock();
+               lck_mtx_test_unlock_mtx();
+       }
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for try_lock
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_try_lock();
+               lck_mtx_test_unlock_mtx();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_try_lock();
+               lck_mtx_test_unlock_mtx();
+       }
+
+       return KERN_SUCCESS;
+}
+
+static kern_return_t
+lck_mtx_test_mtx_spin_uncontended(
+       int iter)
+{
+       int i;
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for lock_spin
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_lock_spin();
+               lck_mtx_test_unlock_spin();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_lock_spin();
+               lck_mtx_test_unlock_spin();
+       }
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for try_lock_spin
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_try_lock_spin();
+               lck_mtx_test_unlock_spin();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_try_lock_spin();
+               lck_mtx_test_unlock_spin();
+       }
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for lock_spin_always
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_lock_spin_always();
+               lck_mtx_test_unlock_spin();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_lock_spin_always();
+               lck_mtx_test_unlock_spin();
+       }
+
+       disable_all_test_mtx_stats();
+
+       //warming up the test for try_lock_spin_always
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_try_lock_spin_always();
+               lck_mtx_test_unlock_spin();
+       }
+
+       enable_all_test_mtx_stats();
+
+       for (i = 0; i < iter; i++) {
+               lck_mtx_test_try_lock_spin_always();
+               lck_mtx_test_unlock_spin();
+       }
+
+       return KERN_SUCCESS;
+}
+
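+/*
+ * Entry point for the uncontended test: reset all stats, run the mutex and
+ * spin variants, then report the accumulated stats into buffer.
+ */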
+int
+lck_mtx_test_mtx_uncontended(
+       int iter,
+       char *buffer,
+       int size)
+{
+       erase_all_test_mtx_stats();
+       lck_mtx_test_mtx_lock_uncontended(iter);
+       lck_mtx_test_mtx_spin_uncontended(iter);
+
+       return get_test_mtx_stats_string(buffer, size);
+}
+
+static int synch;
+static int wait_barrier;
+static int iterations;
+static uint64_t start_loop_time;
+static uint64_t start_loop_time_run;
+static uint64_t end_loop_time;
+static uint64_t end_loop_time_run;
+
+struct lck_mtx_thread_arg {
+       int my_locked;
+       int* other_locked;
+       thread_t other_thread;
+};
+
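+/*
+ * Body of each contending thread: after a warmup phase the two threads
+ * alternate on test_mtx, each holding the lock until it sees the other
+ * thread blocked (not TH_RUN), so that every acquisition is contended.
+ */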
+static void
+test_mtx_lock_unlock_contended_thread(
+       void *arg,
+       __unused wait_result_t wr)
+{
+       int i, val;
+       struct lck_mtx_thread_arg *info = (struct lck_mtx_thread_arg *) arg;
+       thread_t other_thread;
+       int* my_locked;
+       int* other_locked;
+
+       printf("Starting thread %p\n", current_thread());
+
+       while(os_atomic_load(&info->other_thread, acquire) == NULL);
+       other_thread = info->other_thread;
+
+       printf("Other thread %p\n", other_thread);
+
+       my_locked = &info->my_locked;
+       other_locked = info->other_locked;
+
+       *my_locked = 0;
+       val = os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 2);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_test_lock();
+
+               os_atomic_xchg(my_locked, 1 , relaxed);
+               if (i != WARMUP_ITER - 1) {
+                       while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN);
+                       os_atomic_xchg(my_locked, 0 , relaxed);
+               }
+
+               lck_mtx_test_unlock();
+
+               if (i != WARMUP_ITER - 1)
+                       while(os_atomic_load(other_locked, relaxed) == 0);
+       }
+
+       printf("warmup done %p\n", current_thread());
+       os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 4);
+
+       //erase statistics
+       if (val == 1)
+               erase_all_test_mtx_stats();
+
+       *my_locked = 0;
+       /*
+        * synch the threads so they start
+        * concurrently.
+        */
+       os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 6);
+
+       for (i = 0; i < iterations; i++) {
+               lck_mtx_test_lock();
+
+               os_atomic_xchg(my_locked, 1 , relaxed);
+               if (i != iterations - 1) {
+                       while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN);
+                       os_atomic_xchg(my_locked, 0 , relaxed);
+               }
+               lck_mtx_test_unlock_mtx();
+
+               if (i != iterations - 1)
+                       while(os_atomic_load(other_locked, relaxed) == 0);
+
+       }
+
+       os_atomic_inc(&wait_barrier, relaxed);
+       thread_wakeup((event_t) &wait_barrier);
+       thread_terminate_self();
+}
+
+
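+/*
+ * Spawn two kernel threads that contend on test_mtx for iter iterations,
+ * wait for both on wait_barrier, and return the per-sample stats string.
+ */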
+kern_return_t
+lck_mtx_test_mtx_contended(
+       int iter,
+       char* buffer,
+       int buffer_size)
+{
+       thread_t thread1, thread2;
+       kern_return_t result;
+       struct lck_mtx_thread_arg targs[2] = {};
+       synch = 0;
+       wait_barrier = 0;
+       iterations = iter;
+
+       erase_all_test_mtx_stats();
+
+       targs[0].other_thread = NULL;
+       targs[1].other_thread = NULL;
+
+       result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[0], &thread1);
+       if (result != KERN_SUCCESS) {
+               return 0;
+       }
+
+       result = kernel_thread_start((thread_continue_t)test_mtx_lock_unlock_contended_thread, &targs[1], &thread2);
+       if (result != KERN_SUCCESS) {
+               thread_deallocate(thread1);
+               return 0;
+       }
+
+       /* these are t1's args */
+       targs[0].my_locked = 0;
+       targs[0].other_locked = &targs[1].my_locked;
+
+       os_atomic_xchg(&targs[0].other_thread, thread2, release);
+
+       /* these are t2's args */
+       targs[1].my_locked = 0;
+       targs[1].other_locked = &targs[0].my_locked;
+
+       os_atomic_xchg(&targs[1].other_thread, thread1, release);
+
+       while (os_atomic_load(&wait_barrier, relaxed) != 2) {
+               assert_wait((event_t) &wait_barrier, THREAD_UNINT);
+               if (os_atomic_load(&wait_barrier, relaxed) != 2) {
+                       (void) thread_block(THREAD_CONTINUE_NULL);
+               } else {
+                       clear_wait(current_thread(), THREAD_AWAKENED);
+               }
+       }
+
+       thread_deallocate(thread1);
+       thread_deallocate(thread2);
+
+       return  get_test_mtx_stats_string(buffer, buffer_size);
+}
+
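+/*
+ * Same contended alternation as above, but driving lck_mtx_lock/unlock
+ * directly and timestamping the whole loop (the first thread through the
+ * barrier records start/end wall and run times) instead of sampling each
+ * operation.
+ */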
+static void
+test_mtx_lck_unlock_contended_loop_time_thread(
+       void *arg,
+       __unused wait_result_t wr)
+{
+       int i, val;
+       struct lck_mtx_thread_arg *info = (struct lck_mtx_thread_arg *) arg;
+       thread_t other_thread;
+       int* my_locked;
+       int* other_locked;
+
+       printf("Starting thread %p\n", current_thread());
+
+       while(os_atomic_load(&info->other_thread, acquire) == NULL);
+       other_thread = info->other_thread;
+
+       printf("Other thread %p\n", other_thread);
+
+       my_locked = &info->my_locked;
+       other_locked = info->other_locked;
+
+       *my_locked = 0;
+       val = os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 2);
+
+       //warming up the test
+       for (i = 0; i < WARMUP_ITER; i++) {
+               lck_mtx_lock(&test_mtx);
+
+               os_atomic_xchg(my_locked, 1 , relaxed);
+               if (i != WARMUP_ITER - 1) {
+                       while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN);
+                       os_atomic_xchg(my_locked, 0 , relaxed);
+               }
+
+               lck_mtx_unlock(&test_mtx);
+
+               if (i != WARMUP_ITER - 1)
+                       while(os_atomic_load(other_locked, relaxed) == 0);
+       }
+
+       printf("warmup done %p\n", current_thread());
+
+       os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 4);
+
+       *my_locked = 0;
+
+       /*
+        * synch the threads so they start
+        * concurrently.
+        */
+       os_atomic_inc(&synch, relaxed);
+       while(os_atomic_load(&synch, relaxed) < 6);
+
+       if (val == 1) {
+               start_loop_time_run = thread_get_runtime_self();
+               start_loop_time = mach_absolute_time();
+       }
+
+       for (i = 0; i < iterations; i++) {
+               lck_mtx_lock(&test_mtx);
+
+               os_atomic_xchg(my_locked, 1 , relaxed);
+               if (i != iterations - 1) {
+                       while(os_atomic_load(&other_thread->state, relaxed) & TH_RUN);
+                       os_atomic_xchg(my_locked, 0 , relaxed);
+               }
+
+               lck_mtx_unlock(&test_mtx);
+
+               if (i != iterations - 1)
+                       while(os_atomic_load(other_locked, relaxed) == 0);
+       }
+
+       if (val == 1) {
+               end_loop_time = mach_absolute_time();
+               end_loop_time_run = thread_get_runtime_self();
+       }
+
+       os_atomic_inc(&wait_barrier, relaxed);
+       thread_wakeup((event_t) &wait_barrier);
+       thread_terminate_self();
+}
+
+
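+/*
+ * Spawn two contending threads, wait for both to finish, and report the
+ * wall time and run time of the contended loop into buffer.
+ */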
+int
+lck_mtx_test_mtx_contended_loop_time(
+       int iter,
+       char *buffer,
+       int buffer_size)
+{
+       thread_t thread1, thread2;
+       kern_return_t result;
+       int ret;
+       struct lck_mtx_thread_arg targs[2] = {};
+       synch = 0;
+       wait_barrier = 0;
+       iterations = iter;
+       uint64_t time, time_run;
+
+       targs[0].other_thread = NULL;
+       targs[1].other_thread = NULL;
+
+       result = kernel_thread_start((thread_continue_t)test_mtx_lck_unlock_contended_loop_time_thread, &targs[0], &thread1);
+       if (result != KERN_SUCCESS) {
+               return 0;
+       }
+
+       result = kernel_thread_start((thread_continue_t)test_mtx_lck_unlock_contended_loop_time_thread, &targs[1], &thread2);
+       if (result != KERN_SUCCESS) {
+               thread_deallocate(thread1);
+               return 0;
+       }
+
+       /* these are t1's args */
+       targs[0].my_locked = 0;
+       targs[0].other_locked = &targs[1].my_locked;
+
+       os_atomic_xchg(&targs[0].other_thread, thread2, release);
+
+       /* these are t2's args */
+       targs[1].my_locked = 0;
+       targs[1].other_locked = &targs[0].my_locked;
+
+       os_atomic_xchg(&targs[1].other_thread, thread1, release);
+
+       while (os_atomic_load(&wait_barrier, acquire) != 2) {
+               assert_wait((event_t) &wait_barrier, THREAD_UNINT);
+               if (os_atomic_load(&wait_barrier, acquire) != 2) {
+                       (void) thread_block(THREAD_CONTINUE_NULL);
+               } else {
+                       clear_wait(current_thread(), THREAD_AWAKENED);
+               }
+       }
+
+       thread_deallocate(thread1);
+       thread_deallocate(thread2);
+
+       absolutetime_to_nanoseconds(end_loop_time - start_loop_time, &time);
+       absolutetime_to_nanoseconds(end_loop_time_run - start_loop_time_run, &time_run);
+
+       ret = snprintf(buffer, buffer_size, "\n");
+       ret += snprintf(&buffer[ret], buffer_size - ret, "total time %llu ns total run time %llu ns ", time, time_run);
+       ret += print_test_mtx_stats_string_name(TEST_MTX_LOCK_STATS, &buffer[ret], buffer_size - ret);
+       ret += snprintf(&buffer[ret], buffer_size - ret, "\n");
+
+       return ret;
+}
+
index 433a1ae907fa678c45050b6927cc58cf92f3267b..81f934a17cc048044670970847ab6332a3f85d1d 100644 (file)
 #include <kern/exc_guard.h>
 #include <kern/telemetry.h>
 #include <kern/policy_internal.h>
+#include <kern/turnstile.h>
 
 #include <corpses/task_corpse.h>
 #if KPC
@@ -169,8 +170,14 @@ static queue_head_t                thread_stack_queue;
 decl_simple_lock_data(static,thread_terminate_lock)
 static queue_head_t            thread_terminate_queue;
 
+static queue_head_t            thread_deallocate_queue;
+
+static queue_head_t            turnstile_deallocate_queue;
+
 static queue_head_t            crashed_threads_queue;
 
+static queue_head_t            workq_deallocate_queue;
+
 decl_simple_lock_data(static,thread_exception_lock)
 static queue_head_t            thread_exception_queue;
 
@@ -182,10 +189,8 @@ struct thread_exception_elt {
 };
 
 static struct thread   thread_template, init_thread;
-
-static void            sched_call_null(
-                                       int                     type,
-                                       thread_t        thread);
+static void thread_deallocate_enqueue(thread_t thread);
+static void thread_deallocate_complete(thread_t thread);
 
 #ifdef MACH_BSD
 extern void proc_exit(void *);
@@ -193,6 +198,7 @@ extern mach_exception_data_type_t proc_encode_exit_exception_code(void *);
 extern uint64_t get_dispatchqueue_offset_from_proc(void *);
 extern uint64_t get_return_to_kernel_offset_from_proc(void *p);
 extern int      proc_selfpid(void);
+extern void     proc_name(int, char*, int);
 extern char *   proc_name_address(void *p);
 #endif /* MACH_BSD */
 
@@ -212,6 +218,13 @@ static void init_thread_ledgers(void);
 void jetsam_on_ledger_cpulimit_exceeded(void);
 #endif
 
+extern int task_thread_soft_limit;
+extern int exc_via_corpse_forking;
+
+#if DEVELOPMENT || DEBUG
+extern int exc_resource_threads_enabled;
+#endif /* DEVELOPMENT || DEBUG */
+
 /*
  * Level (in terms of percentage of the limit) at which the CPU usage monitor triggers telemetry.
  *
@@ -222,6 +235,9 @@ void jetsam_on_ledger_cpulimit_exceeded(void);
 
 int cpumon_ustackshots_trigger_pct; /* Percentage. Level at which we start gathering telemetry. */
 void __attribute__((noinline)) SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void);
+#if DEVELOPMENT || DEBUG
+void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t, int);
+#endif /* DEVELOPMENT || DEBUG */
 
 /*
  * The smallest interval over which we support limiting CPU consumption is 1ms
@@ -274,10 +290,8 @@ thread_bootstrap(void)
        thread_template.max_priority = 0;
        thread_template.task_priority = 0;
        thread_template.promotions = 0;
-       thread_template.pending_promoter_index = 0;
-       thread_template.pending_promoter[0] = NULL;
-       thread_template.pending_promoter[1] = NULL;
        thread_template.rwlock_count = 0;
+       thread_template.waiting_for_mutex = NULL;
 
 
        thread_template.realtime.deadline = UINT64_MAX;
@@ -307,11 +321,12 @@ thread_bootstrap(void)
        thread_template.bound_processor = PROCESSOR_NULL;
        thread_template.last_processor = PROCESSOR_NULL;
 
-       thread_template.sched_call = sched_call_null;
+       thread_template.sched_call = NULL;
 
        timer_init(&thread_template.user_timer);
        timer_init(&thread_template.system_timer);
        timer_init(&thread_template.ptime);
+       timer_init(&thread_template.runnable_timer);
        thread_template.user_timer_save = 0;
        thread_template.system_timer_save = 0;
        thread_template.vtimer_user_save = 0;
@@ -331,6 +346,9 @@ thread_bootstrap(void)
        thread_template.recover = (vm_offset_t)NULL;
        
        thread_template.map = VM_MAP_NULL;
+#if DEVELOPMENT || DEBUG
+       thread_template.pmap_footprint_suspended = FALSE;
+#endif /* DEVELOPMENT || DEBUG */
 
 #if CONFIG_DTRACE
        thread_template.t_dtrace_predcache = 0;
@@ -483,7 +501,6 @@ thread_terminate_self(void)
 {
        thread_t                thread = current_thread();
        task_t                  task;
-       spl_t                   s;
        int threadcnt;
 
        pal_thread_terminate_self(thread);
@@ -496,34 +513,12 @@ thread_terminate_self(void)
 
        thread_mtx_unlock(thread);
 
-       s = splsched();
-       thread_lock(thread);
-
-       /*
-        *      Cancel priority depression, wait for concurrent expirations
-        *      on other processors.
-        */
-       if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-               thread->sched_flags &= ~TH_SFLAG_DEPRESSED_MASK;
-
-               /* If our priority was low because of a depressed yield, restore it in case we block below */
-               thread_recompute_sched_pri(thread, FALSE);
-
-               if (timer_call_cancel(&thread->depress_timer))
-                       thread->depress_timer_active--;
-       }
-
-       while (thread->depress_timer_active > 0) {
-               thread_unlock(thread);
-               splx(s);
-
-               delay(1);
+       thread_sched_call(thread, NULL);
 
-               s = splsched();
-               thread_lock(thread);
-       }
+       spl_t s = splsched();
+       thread_lock(thread);
 
-       thread_sched_call(thread, NULL);
+       thread_depress_abort_locked(thread);
 
        thread_unlock(thread);
        splx(s);
@@ -608,6 +603,32 @@ thread_terminate_self(void)
        s = splsched();
        thread_lock(thread);
 
+       /*
+        * Ensure that the depress timer is no longer enqueued,
+        * so the timer (stored in the thread) can be safely deallocated
+        *
+        * TODO: build timer_call_cancel_wait
+        */
+
+       assert((thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) == 0);
+
+       uint32_t delay_us = 1;
+
+       while (thread->depress_timer_active > 0) {
+               thread_unlock(thread);
+               splx(s);
+
+               delay(delay_us++);
+
+               if (delay_us > USEC_PER_SEC)
+                       panic("depress timer failed to inactivate! "
+                             "thread: %p depress_timer_active: %d",
+                             thread, thread->depress_timer_active);
+
+               s = splsched();
+               thread_lock(thread);
+       }
+
        /*
         *      Cancel wait timer, and wait for
         *      concurrent expirations.
@@ -619,11 +640,18 @@ thread_terminate_self(void)
                        thread->wait_timer_active--;
        }
 
+       delay_us = 1;
+
        while (thread->wait_timer_active > 0) {
                thread_unlock(thread);
                splx(s);
 
-               delay(1);
+               delay(delay_us++);
+
+               if (delay_us > USEC_PER_SEC)
+                       panic("wait timer failed to inactivate! "
+                             "thread: %p wait_timer_active: %d",
+                             thread, thread->wait_timer_active);
 
                s = splsched();
                thread_lock(thread);
@@ -642,10 +670,16 @@ thread_terminate_self(void)
         */
        thread->state |= TH_TERMINATE;
        thread_mark_wait_locked(thread, THREAD_UNINT);
+
+       assert((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_RW_PROMOTED) == 0);
+       assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0);
        assert((thread->sched_flags & TH_SFLAG_PROMOTED) == 0);
        assert(thread->promotions == 0);
-       assert(!(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED));
+       assert(thread->was_promoted_on_wakeup == 0);
+       assert(thread->waiting_for_mutex == NULL);
        assert(thread->rwlock_count == 0);
+
        thread_unlock(thread);
        /* splsched */
 
@@ -653,23 +687,33 @@ thread_terminate_self(void)
        /*NOTREACHED*/
 }
 
-/* Drop a thread refcount that definitely isn't the last one. */
+/* Drop a thread refcount safely without triggering a zfree */
 void
 thread_deallocate_safe(thread_t thread)
 {
+       __assert_only uint32_t          th_ref_count;
+
+       if (thread == THREAD_NULL)
+               return;
+
        assert_thread_magic(thread);
 
-       uint32_t old_refcount = atomic_fetch_sub_explicit(&thread->ref_count, 1, memory_order_release);
+       if (__probable(atomic_fetch_sub_explicit(&thread->ref_count, 1,
+                       memory_order_release) - 1 > 0)) {
+               return;
+       }
+
+       th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire);
+       assert(th_ref_count == 0);
 
-       if (__improbable(old_refcount <= 1))
-               panic("bad thread refcount: %d", old_refcount);
+       /* enqueue the thread for the thread deallocate daemon to call thread_deallocate_complete */
+       thread_deallocate_enqueue(thread);
 }
 
 void
 thread_deallocate(
        thread_t                        thread)
 {
-       task_t                          task;
        __assert_only uint32_t          th_ref_count;
 
        if (thread == THREAD_NULL)
@@ -685,6 +729,19 @@ thread_deallocate(
        th_ref_count = atomic_load_explicit(&thread->ref_count, memory_order_acquire);
        assert(th_ref_count == 0);
 
+       thread_deallocate_complete(thread);
+}
+
+void
+thread_deallocate_complete(
+       thread_t                        thread)
+{
+       task_t                          task;
+
+       assert_thread_magic(thread);
+
+       assert(thread->ref_count == 0);
+
        assert(thread_owned_workloops_count(thread) == 0);
 
        if (!(thread->state & TH_TERMINATE2))
@@ -692,8 +749,6 @@ thread_deallocate(
 
        assert(thread->runq == PROCESSOR_NULL);
 
-       assert(thread->user_promotions == 0);
-
 #if KPC
        kpc_thread_destroy(thread);
 #endif
@@ -718,6 +773,10 @@ thread_deallocate(
        if (thread->t_threadledger)
                ledger_dereference(thread->t_threadledger);
 
+       assert(thread->turnstile != TURNSTILE_NULL);
+       if (thread->turnstile)
+               turnstile_deallocate(thread->turnstile);
+
        if (IPC_VOUCHER_NULL != thread->ith_voucher)
                ipc_voucher_release(thread->ith_voucher);
 
@@ -856,7 +915,6 @@ thread_copy_resource_info(
        thread_t dst_thread,
        thread_t src_thread)
 {
-       dst_thread->thread_tag = src_thread->thread_tag;
        dst_thread->c_switch = src_thread->c_switch;
        dst_thread->p_switch = src_thread->p_switch;
        dst_thread->ps_switch = src_thread->ps_switch;
@@ -865,6 +923,7 @@ thread_copy_resource_info(
        dst_thread->user_timer_save = src_thread->user_timer_save;
        dst_thread->system_timer = src_thread->system_timer;
        dst_thread->system_timer_save = src_thread->system_timer_save;
+       dst_thread->runnable_timer = src_thread->runnable_timer;
        dst_thread->vtimer_user_save = src_thread->vtimer_user_save;
        dst_thread->vtimer_prof_save = src_thread->vtimer_prof_save;
        dst_thread->vtimer_rlim_save = src_thread->vtimer_rlim_save;
@@ -892,6 +951,7 @@ thread_terminate_daemon(void)
        (void)splsched();
        simple_lock(&thread_terminate_lock);
 
+thread_terminate_start:
        while ((thread = qe_dequeue_head(&thread_terminate_queue, struct thread, runq_links)) != THREAD_NULL) {
                assert_thread_magic(thread);
 
@@ -913,6 +973,7 @@ thread_terminate_daemon(void)
                task_lock(task);
                task->total_user_time += timer_grab(&thread->user_timer);
                task->total_ptime += timer_grab(&thread->ptime);
+               task->total_runnable_time += timer_grab(&thread->runnable_timer);
                if (thread->precise_user_kernel_time) {
                        task->total_system_time += timer_grab(&thread->system_timer);
                } else {
@@ -960,6 +1021,55 @@ thread_terminate_daemon(void)
                simple_lock(&thread_terminate_lock);
        }
 
+       while ((thread = qe_dequeue_head(&thread_deallocate_queue, struct thread, runq_links)) != THREAD_NULL) {
+               assert_thread_magic(thread);
+
+               simple_unlock(&thread_terminate_lock);
+               (void)spllo();
+
+               thread_deallocate_complete(thread);
+
+               (void)splsched();
+               simple_lock(&thread_terminate_lock);
+       }
+
+       struct turnstile *turnstile;
+       while ((turnstile = qe_dequeue_head(&turnstile_deallocate_queue, struct turnstile, ts_deallocate_link)) != TURNSTILE_NULL) {
+
+               simple_unlock(&thread_terminate_lock);
+               (void)spllo();
+
+               turnstile_destroy(turnstile);
+
+               (void)splsched();
+               simple_lock(&thread_terminate_lock);
+       }
+
+       queue_entry_t qe;
+
+       /*
+        * see workq_deallocate_enqueue: struct workqueue is opaque to thread.c and
+        * we just link pieces of memory here
+        */
+       while ((qe = dequeue_head(&workq_deallocate_queue))) {
+               simple_unlock(&thread_terminate_lock);
+               (void)spllo();
+
+               workq_destroy((struct workqueue *)qe);
+
+               (void)splsched();
+               simple_lock(&thread_terminate_lock);
+       }
+
+       /*
+        * Check whether anything was enqueued on the thread terminate/deallocate
+        * queues while we were processing the workq deallocate queue
+        */
+       if (!queue_empty(&thread_terminate_queue) ||
+           !queue_empty(&thread_deallocate_queue) ||
+           !queue_empty(&turnstile_deallocate_queue))
+               goto thread_terminate_start;
+
        assert_wait((event_t)&thread_terminate_queue, THREAD_UNINT);
        simple_unlock(&thread_terminate_lock);
        /* splsched */
@@ -989,6 +1099,67 @@ thread_terminate_enqueue(
        thread_wakeup((event_t)&thread_terminate_queue);
 }
 
+/*
+ *     thread_deallocate_enqueue:
+ *
+ *     Enqueue a thread for final deallocation.
+ */
+static void
+thread_deallocate_enqueue(
+       thread_t                thread)
+{
+       spl_t s = splsched();
+
+       simple_lock(&thread_terminate_lock);
+       enqueue_tail(&thread_deallocate_queue, &thread->runq_links);
+       simple_unlock(&thread_terminate_lock);
+
+       thread_wakeup((event_t)&thread_terminate_queue);
+       splx(s);
+}
+
+/*
+ *     turnstile_deallocate_enqueue:
+ *
+ *     Enqueue a turnstile for final deallocation.
+ */
+void
+turnstile_deallocate_enqueue(
+       struct turnstile *turnstile)
+{
+       spl_t s = splsched();
+
+       simple_lock(&thread_terminate_lock);
+       enqueue_tail(&turnstile_deallocate_queue, &turnstile->ts_deallocate_link);
+       simple_unlock(&thread_terminate_lock);
+
+       thread_wakeup((event_t)&thread_terminate_queue);
+       splx(s);
+}
+
+/*
+ *     workq_deallocate_enqueue:
+ *
+ *     Enqueue a workqueue for final deallocation.
+ */
+void
+workq_deallocate_enqueue(
+       struct workqueue *wq)
+{
+       spl_t s = splsched();
+
+       simple_lock(&thread_terminate_lock);
+       /*
+        * this is just to delay a zfree(), so we link the memory with no regard
+        * for what the struct looks like.
+        */
+       enqueue_tail(&workq_deallocate_queue, (queue_entry_t)wq);
+       simple_unlock(&thread_terminate_lock);
+
+       thread_wakeup((event_t)&thread_terminate_queue);
+       splx(s);
+}
+
 /*
  * thread_terminate_crashed_threads:
  * walk the list of crashed threads and put back set of threads
@@ -999,6 +1170,7 @@ thread_terminate_crashed_threads()
 {
        thread_t th_remove;
        boolean_t should_wake_terminate_queue = FALSE;
+       spl_t s = splsched();
 
        simple_lock(&thread_terminate_lock);
        /*
@@ -1017,6 +1189,7 @@ thread_terminate_crashed_threads()
        }
 
        simple_unlock(&thread_terminate_lock);
+       splx(s);
        if (should_wake_terminate_queue == TRUE) {
                thread_wakeup((event_t)&thread_terminate_queue);
        }
@@ -1093,6 +1266,9 @@ thread_daemon_init(void)
 
        simple_lock_init(&thread_terminate_lock, 0);
        queue_init(&thread_terminate_queue);
+       queue_init(&thread_deallocate_queue);
+       queue_init(&workq_deallocate_queue);
+       queue_init(&turnstile_deallocate_queue);
        queue_init(&crashed_threads_queue);
 
        result = kernel_thread_start_priority((thread_continue_t)thread_terminate_daemon, NULL, MINPRI_KERNEL, &thread);
@@ -1123,6 +1299,7 @@ thread_daemon_init(void)
 #define TH_OPTION_NONE         0x00
 #define TH_OPTION_NOCRED       0x01
 #define TH_OPTION_NOSUSP       0x02
+#define TH_OPTION_WORKQ                0x04
 
 /*
  * Create a new thread.
@@ -1135,6 +1312,7 @@ thread_create_internal(
        task_t                                  parent_task,
        integer_t                               priority,
        thread_continue_t               continuation,
+       void                                    *parameter,
        int                                             options,
        thread_t                                *out_thread)
 {
@@ -1195,6 +1373,10 @@ thread_create_internal(
        ipc_thread_init(new_thread);
 
        new_thread->continuation = continuation;
+       new_thread->parameter = parameter;
+       new_thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
+       priority_queue_init(&new_thread->inheritor_queue,
+                       PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
 
        /* Allocate I/O Statistics structure */
        new_thread->thread_io_stats = (io_stat_info_t)kalloc(sizeof(struct io_stat_info));
@@ -1211,6 +1393,24 @@ thread_create_internal(
        new_thread->decmp_upl = NULL;
 #endif /* CONFIG_IOSCHED */ 
 
+#if DEVELOPMENT || DEBUG
+       task_lock(parent_task);
+       uint16_t thread_limit = parent_task->task_thread_limit;
+       if (exc_resource_threads_enabled &&
+           thread_limit > 0 &&
+           parent_task->thread_count >= thread_limit &&
+           !parent_task->task_has_crossed_thread_limit &&
+           !(parent_task->t_flags & TF_CORPSE)) {
+               int thread_count = parent_task->thread_count;
+               parent_task->task_has_crossed_thread_limit = TRUE;
+               task_unlock(parent_task);
+               SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(parent_task, thread_count);
+       } else {
+               task_unlock(parent_task);
+       }
+#endif
+
        lck_mtx_lock(&tasks_threads_lock);
        task_lock(parent_task);
 
@@ -1340,6 +1540,7 @@ thread_create_internal(
                new_thread->inspection = FALSE;
        }
        new_thread->corpse_dup = FALSE;
+       new_thread->turnstile = turnstile_alloc();
        *out_thread = new_thread;
 
        if (kdebug_enable) {
@@ -1390,7 +1591,7 @@ thread_create_internal2(
        if (task == TASK_NULL || task == kernel_task)
                return (KERN_INVALID_ARGUMENT);
 
-       result = thread_create_internal(task, -1, continuation, TH_OPTION_NONE, &thread);
+       result = thread_create_internal(task, -1, continuation, NULL, TH_OPTION_NONE, &thread);
        if (result != KERN_SUCCESS)
                return (result);
 
@@ -1449,6 +1650,7 @@ thread_create_waiting_internal(
        task_t                  task,
        thread_continue_t       continuation,
        event_t                 event,
+       block_hint_t            block_hint,
        int                     options,
        thread_t                *new_thread)
 {
@@ -1458,7 +1660,8 @@ thread_create_waiting_internal(
        if (task == TASK_NULL || task == kernel_task)
                return (KERN_INVALID_ARGUMENT);
 
-       result = thread_create_internal(task, -1, continuation, options, &thread);
+       result = thread_create_internal(task, -1, continuation, NULL,
+                       options, &thread);
        if (result != KERN_SUCCESS)
                return (result);
 
@@ -1468,6 +1671,11 @@ thread_create_waiting_internal(
                thread_hold(thread);
 
        thread_mtx_lock(thread);
+       thread_set_pending_block_hint(thread, block_hint);
+       if (options & TH_OPTION_WORKQ) {
+               thread->static_param = true;
+               event = workq_thread_init_and_wq_lock(task, thread);
+       }
        thread_start_in_assert_wait(thread, event, THREAD_INTERRUPTIBLE);
        thread_mtx_unlock(thread);
 
@@ -1487,7 +1695,7 @@ thread_create_waiting(
        thread_t                *new_thread)
 {
        return thread_create_waiting_internal(task, continuation, event,
-                                             TH_OPTION_NONE, new_thread);
+                       kThreadWaitNone, TH_OPTION_NONE, new_thread);
 }
 
 
@@ -1506,14 +1714,23 @@ thread_create_running_internal2(
        if (task == TASK_NULL || task == kernel_task)
                return (KERN_INVALID_ARGUMENT);
 
-       result = thread_create_internal(task, -1, (thread_continue_t)thread_bootstrap_return, TH_OPTION_NONE, &thread);
+       result = thread_create_internal(task, -1,
+                       (thread_continue_t)thread_bootstrap_return, NULL,
+                       TH_OPTION_NONE, &thread);
        if (result != KERN_SUCCESS)
                return (result);
 
        if (task->suspend_count > 0)
                thread_hold(thread);
 
-       result = machine_thread_set_state(thread, flavor, new_state, new_state_count);
+       if (from_user) {
+               result = machine_thread_state_convert_from_user(thread, flavor,
+                               new_state, new_state_count);
+       }
+       if (result == KERN_SUCCESS) {
+               result = machine_thread_set_state(thread, flavor, new_state,
+                               new_state_count);
+       }
        if (result != KERN_SUCCESS) {
                task_unlock(task);
                lck_mtx_unlock(&tasks_threads_lock);
@@ -1573,46 +1790,15 @@ thread_create_running_from_user(
                new_thread, TRUE);
 }
 
-kern_return_t
-thread_create_workq(
-       task_t                          task,
-       thread_continue_t               thread_return,
-       thread_t                        *new_thread)
-{
-       kern_return_t           result;
-       thread_t                        thread;
-
-       if (task == TASK_NULL || task == kernel_task)
-               return (KERN_INVALID_ARGUMENT);
-
-       result = thread_create_internal(task, -1, thread_return, TH_OPTION_NOCRED | TH_OPTION_NOSUSP, &thread);
-       if (result != KERN_SUCCESS)
-               return (result);
-
-       thread->user_stop_count = 1;
-       thread_hold(thread);
-       if (task->suspend_count > 0)
-               thread_hold(thread);
-
-       task_unlock(task);
-       lck_mtx_unlock(&tasks_threads_lock);
-       
-       *new_thread = thread;
-
-       return (KERN_SUCCESS);
-}
-
 kern_return_t
 thread_create_workq_waiting(
        task_t              task,
        thread_continue_t   continuation,
-       event_t             event,
        thread_t            *new_thread)
 {
-
-       return thread_create_waiting_internal(task, continuation, event,
-                                             TH_OPTION_NOCRED | TH_OPTION_NOSUSP,
-                                             new_thread);
+       int options = TH_OPTION_NOCRED | TH_OPTION_NOSUSP | TH_OPTION_WORKQ;
+       return thread_create_waiting_internal(task, continuation, NULL,
+                       kThreadWaitParkedWorkQueue, options, new_thread);
 }
 
 /*
@@ -1632,7 +1818,8 @@ kernel_thread_create(
        thread_t                        thread;
        task_t                          task = kernel_task;
 
-       result = thread_create_internal(task, priority, continuation, TH_OPTION_NOCRED | TH_OPTION_NONE, &thread);
+       result = thread_create_internal(task, priority, continuation, parameter,
+                       TH_OPTION_NOCRED | TH_OPTION_NONE, &thread);
        if (result != KERN_SUCCESS)
                return (result);
 
@@ -1646,8 +1833,6 @@ kernel_thread_create(
 #endif
        thread->reserved_stack = thread->kernel_stack;
 
-       thread->parameter = parameter;
-
 if(debug_task & 1)
        kprintf("kernel_thread_create: thread = %p continuation = %p\n", thread, continuation);
        *new_thread = thread;
@@ -1697,7 +1882,7 @@ retrieve_thread_basic_info(thread_t thread, thread_basic_info_t basic_info)
        /* fill in info */
 
        thread_read_times(thread, &basic_info->user_time,
-                                                               &basic_info->system_time);
+                       &basic_info->system_time, NULL);
 
        /*
         *      Update lazy-evaluated scheduler info because someone wants it.
@@ -1963,7 +2148,8 @@ void
 thread_read_times(
        thread_t                thread,
        time_value_t    *user_time,
-       time_value_t    *system_time)
+       time_value_t    *system_time,
+       time_value_t    *runnable_time)
 {
        clock_sec_t             secs;
        clock_usec_t    usecs;
@@ -1976,7 +2162,7 @@ thread_read_times(
                absolutetime_to_microtime(tval_user, &secs, &usecs);
                user_time->seconds = (typeof(user_time->seconds))secs;
                user_time->microseconds = usecs;
-               
+
                absolutetime_to_microtime(tval_system, &secs, &usecs);
                system_time->seconds = (typeof(system_time->seconds))secs;
                system_time->microseconds = usecs;
@@ -1990,6 +2176,13 @@ thread_read_times(
                system_time->seconds = 0;
                system_time->microseconds = 0;
        }
+
+       if (runnable_time) {
+               uint64_t tval_runnable = timer_grab(&thread->runnable_timer);
+               absolutetime_to_microtime(tval_runnable, &secs, &usecs);
+               runnable_time->seconds = (typeof(runnable_time->seconds))secs;
+               runnable_time->microseconds = usecs;
+       }
 }
 
 uint64_t thread_get_runtime_self(void)
@@ -2004,7 +2197,7 @@ uint64_t thread_get_runtime_self(void)
        /* Not interrupt safe, as the scheduler may otherwise update timer values underneath us */
        interrupt_state = ml_set_interrupts_enabled(FALSE);
        processor = current_processor();
-       timer_switch(PROCESSOR_DATA(processor, thread_timer), mach_absolute_time(), PROCESSOR_DATA(processor, thread_timer));
+       timer_update(PROCESSOR_DATA(processor, thread_timer), mach_absolute_time());
        runtime = (timer_grab(&thread->user_timer) + timer_grab(&thread->system_timer));
        ml_set_interrupts_enabled(interrupt_state);
 
@@ -2138,7 +2331,7 @@ clear_thread_rwlock_boost(void)
 
        if ((thread->rwlock_count-- == 1) && (thread->sched_flags & TH_SFLAG_RW_PROMOTED)) {
 
-               lck_rw_clear_promotion(thread);
+               lck_rw_clear_promotion(thread, 0);
        }
 }
 
@@ -2151,7 +2344,10 @@ thread_guard_violation(thread_t thread,
     mach_exception_data_type_t code, mach_exception_data_type_t subcode)
 {
        assert(thread == current_thread());
-       assert(thread->task != kernel_task);
+
+       /* don't set up the AST for kernel threads */
+       if (thread->task == kernel_task)
+               return;
 
        spl_t s = splsched();
        /*
@@ -2184,10 +2380,16 @@ guard_ast(thread_t t)
                code = t->guard_exc_info.code,
                subcode = t->guard_exc_info.subcode;
 
+       t->guard_exc_info.code = 0;
+       t->guard_exc_info.subcode = 0;
+
        switch (EXC_GUARD_DECODE_GUARD_TYPE(code)) {
+       case GUARD_TYPE_NONE:
+               /* lingering AST_GUARD on the processor? */
+               break;
        case GUARD_TYPE_MACH_PORT:
                mach_port_guard_ast(t, code, subcode);
-               break;
+               break;
        case GUARD_TYPE_FD:
                fd_guard_ast(t, code, subcode);
                break;
@@ -2196,6 +2398,9 @@ guard_ast(thread_t t)
                vn_guard_ast(t, code, subcode);
                break;
 #endif
+       case GUARD_TYPE_VIRT_MEMORY:
+               virt_memory_guard_ast(t, code, subcode);
+               break;
        default:
                panic("guard_exc_info %llx %llx", code, subcode);
        }
@@ -2289,7 +2494,7 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
 
        interval_sec = (uint32_t)(interval_ns / NSEC_PER_SEC);
 
-       thread_read_times(thread, &thread_user_time, &thread_system_time);
+       thread_read_times(thread, &thread_user_time, &thread_system_time, NULL);
        time_value_add(&thread_total_time, &thread_user_time);
        time_value_add(&thread_total_time, &thread_system_time);
        ledger_get_entry_info(thread->t_threadledger, thread_ledgers.cpu_time, &lei);
@@ -2376,6 +2581,52 @@ SENDING_NOTIFICATION__THIS_THREAD_IS_CONSUMING_TOO_MUCH_CPU(void)
        }
 }
 
+#if DEVELOPMENT || DEBUG
+void __attribute__((noinline)) SENDING_NOTIFICATION__TASK_HAS_TOO_MANY_THREADS(task_t task, int thread_count)
+{
+       mach_exception_data_type_t code[EXCEPTION_CODE_MAX] = {0};
+       int pid = task_pid(task);
+       char procname[MAXCOMLEN+1] = "unknown";
+
+       if (pid == 1) {
+               /*
+                * Cannot suspend launchd
+                */
+               return;
+       }
+
+       proc_name(pid, procname, sizeof(procname));
+
+       if (disable_exc_resource) {
+               printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
+                       "suppressed by a boot-arg.\n", procname, pid, thread_count);
+               return;
+       }
+
+       if (audio_active) {
+               printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
+                       "suppressed due to audio playback.\n", procname, pid, thread_count);
+               return;
+       }
+
+       if (exc_via_corpse_forking == 0) {
+               printf("process %s[%d] crossed thread count high watermark (%d), EXC_RESOURCE "
+                       "suppressed due to corpse forking being disabled.\n", procname, pid,
+                       thread_count);
+               return;
+       }
+
+       printf("process %s[%d] crossed thread count high watermark (%d), sending "
+               "EXC_RESOURCE\n", procname, pid, thread_count);
+
+       EXC_RESOURCE_ENCODE_TYPE(code[0], RESOURCE_TYPE_THREADS);
+       EXC_RESOURCE_ENCODE_FLAVOR(code[0], FLAVOR_THREADS_HIGH_WATERMARK);
+       EXC_RESOURCE_THREADS_ENCODE_THREADS(code[0], thread_count);
+
+       task_enqueue_exception_with_corpse(task, EXC_RESOURCE, code, EXCEPTION_CODE_MAX, NULL);
+}
+#endif /* DEVELOPMENT || DEBUG */
+
 void thread_update_io_stats(thread_t thread, int size, int io_flags)
 {
        int io_tier;
@@ -2585,77 +2836,38 @@ thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns)
        return (0);
 }
 
-static void
-sched_call_null(
-__unused       int                     type,
-__unused       thread_t        thread)
-{
-       return;
-}
-
 void
 thread_sched_call(
        thread_t                thread,
        sched_call_t    call)
 {
-       thread->sched_call = (call != NULL)? call: sched_call_null;
+       assert((thread->state & TH_WAIT_REPORT) == 0);
+       thread->sched_call = call;
 }
 
-sched_call_t
-thread_disable_sched_call(
-       thread_t                thread,
-       sched_call_t    call)
+uint64_t
+thread_tid(
+       thread_t        thread)
 {
-       if (call) {
-               spl_t s = splsched();
-               thread_lock(thread);
-               if (thread->sched_call == call) {
-                       thread->sched_call = sched_call_null;
-               } else {
-                       call = NULL;
-               }
-               thread_unlock(thread);
-               splx(s);
-       }
-       return call;
+       return (thread != THREAD_NULL? thread->thread_id: 0);
 }
 
-void
-thread_reenable_sched_call(
-       thread_t                thread,
-       sched_call_t    call)
+uint16_t
+thread_set_tag(thread_t th, uint16_t tag)
 {
-       if (call) {
-               spl_t s = splsched();
-               thread_lock(thread);
-               thread_sched_call(thread, call);
-               thread_unlock(thread);
-               splx(s);
-       }
+       return thread_set_tag_internal(th, tag);
 }
 
-void
-thread_static_param(
-       thread_t                thread,
-       boolean_t               state)
+uint16_t
+thread_get_tag(thread_t th)
 {
-       thread_mtx_lock(thread);
-       thread->static_param = state;
-       thread_mtx_unlock(thread);
+       return thread_get_tag_internal(th);
 }
 
 uint64_t
-thread_tid(
-       thread_t        thread)
+thread_last_run_time(thread_t th)
 {
-       return (thread != THREAD_NULL? thread->thread_id: 0);
-}
-
-uint16_t       thread_set_tag(thread_t th, uint16_t tag) {
-       return thread_set_tag_internal(th, tag);
-}
-uint16_t       thread_get_tag(thread_t th) {
-       return thread_get_tag_internal(th);
+       return th->last_run_time;
 }
 
 uint64_t
@@ -2718,6 +2930,22 @@ thread_rettokern_addr(
  * within the osfmk component.
  */
 
+#undef thread_mtx_lock
+void thread_mtx_lock(thread_t thread);
+void
+thread_mtx_lock(thread_t thread)
+{
+       lck_mtx_lock(&thread->mutex);
+}
+
+#undef thread_mtx_unlock
+void thread_mtx_unlock(thread_t thread);
+void
+thread_mtx_unlock(thread_t thread)
+{
+       lck_mtx_unlock(&thread->mutex);
+}
+
 #undef thread_reference
 void thread_reference(thread_t thread);
 void
@@ -2754,7 +2982,7 @@ thread_set_voucher_name(mach_port_name_t voucher_name)
        ipc_voucher_t new_voucher = IPC_VOUCHER_NULL;
        ipc_voucher_t voucher;
        ledger_t bankledger = NULL;
-       thread_group_t banktg = NULL;
+       struct thread_group *banktg = NULL;
 
        if (MACH_PORT_DEAD == voucher_name)
                return KERN_INVALID_RIGHT;
@@ -2879,7 +3107,7 @@ thread_set_mach_voucher(
 {
        ipc_voucher_t old_voucher;
        ledger_t bankledger = NULL;
-       thread_group_t banktg = NULL;
+       struct thread_group *banktg = NULL;
 
        if (THREAD_NULL == thread)
                return KERN_INVALID_ARGUMENT;
@@ -2916,78 +3144,22 @@ thread_set_mach_voucher(
  *  Conditions: callers holds a reference on the new and presumed old voucher(s).
  *             nothing locked.
  *
- *  If the old voucher is still the same as passed in, replace it with new voucher
- *  and discard the old (and the reference passed in).  Otherwise, discard the new
- *  and return an updated old voucher.
+ *  This function is no longer supported.
  */
 kern_return_t
 thread_swap_mach_voucher(
-       thread_t                thread,
-       ipc_voucher_t           new_voucher,
-       ipc_voucher_t           *in_out_old_voucher)
+       __unused thread_t               thread,
+       __unused ipc_voucher_t          new_voucher,
+       ipc_voucher_t                   *in_out_old_voucher)
 {
-       mach_port_name_t old_voucher_name;
-       ipc_voucher_t old_voucher;
-       ledger_t bankledger = NULL;
-       thread_group_t banktg = NULL;
-
-       if (THREAD_NULL == thread)
-               return KERN_INVALID_TASK;
-
-       if (thread != current_thread() && thread->started)
-               return KERN_INVALID_ARGUMENT;
-
-       bank_get_bank_ledger_and_thread_group(new_voucher, &bankledger, &banktg);
-
-       thread_mtx_lock(thread);
-
-       old_voucher = thread->ith_voucher;
-
-       if (IPC_VOUCHER_NULL == old_voucher) {
-               old_voucher_name = thread->ith_voucher_name;
-
-               /* perform lazy binding if needed */
-               if (MACH_PORT_VALID(old_voucher_name)) {
-                       old_voucher = convert_port_name_to_voucher(old_voucher_name);
-                       thread->ith_voucher_name = MACH_PORT_NULL;
-                       thread->ith_voucher = old_voucher;
-
-                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                                                 MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
-                                                 (uintptr_t)thread_tid(thread),
-                                                 (uintptr_t)old_voucher_name,
-                                                 VM_KERNEL_ADDRPERM((uintptr_t)old_voucher),
-                                                 4, 0);
-
-               }
-       }
-
-       /* swap in new voucher, if old voucher matches the one supplied */
-       if (old_voucher == *in_out_old_voucher) {
-               ipc_voucher_reference(new_voucher);
-               thread->ith_voucher = new_voucher;
-               thread->ith_voucher_name = MACH_PORT_NULL;
-               thread_mtx_unlock(thread);
-               bank_swap_thread_bank_ledger(thread, bankledger);
-
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
-                                         MACHDBG_CODE(DBG_MACH_IPC,MACH_THREAD_SET_VOUCHER) | DBG_FUNC_NONE,
-                                         (uintptr_t)thread_tid(thread),
-                                         (uintptr_t)MACH_PORT_NULL,
-                                         VM_KERNEL_ADDRPERM((uintptr_t)new_voucher),
-                                         5, 0);
-
-               ipc_voucher_release(old_voucher);
-
-               *in_out_old_voucher = IPC_VOUCHER_NULL;
-               return KERN_SUCCESS;
-       }
-
-       /* Otherwise, just return old voucher reference */
-       ipc_voucher_reference(old_voucher);
-       thread_mtx_unlock(thread);
-       *in_out_old_voucher = old_voucher;
-       return KERN_SUCCESS;
+       /*
+        * Currently this function is only called from a MIG generated
+        * routine which doesn't release the reference on the voucher
+        * addressed by in_out_old_voucher. To avoid leaking this reference,
+        * a call to release it has been added here.
+        */
+       ipc_voucher_release(*in_out_old_voucher);
+       return KERN_NOT_SUPPORTED;
 }
 
 /* 
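The replacement body above consumes the voucher reference that the MIG glue passes in and reports KERN_NOT_SUPPORTED instead of performing the swap. The sketch below is a self-contained user-space analogy of the same rule, not kernel code (the refcounted type and helper names are invented for illustration): a routine that rejects an operation must still release any reference it was handed, or the object leaks.

    #include <assert.h>
    #include <stdio.h>

    /* Hypothetical stand-in for a refcounted kernel object such as an ipc_voucher. */
    typedef struct { int refs; } voucher_t;

    static void voucher_reference(voucher_t *v) { v->refs++; }
    static void voucher_release(voucher_t *v)   { v->refs--; }

    /* Mirrors the new behaviour: refuse the swap, but still drop the caller's reference. */
    static int swap_voucher_not_supported(voucher_t **in_out_old)
    {
        voucher_release(*in_out_old);   /* avoid leaking the reference passed in */
        return -1;                      /* stand-in for KERN_NOT_SUPPORTED */
    }

    int main(void)
    {
        voucher_t v = { .refs = 1 };    /* creation reference */
        voucher_reference(&v);          /* extra reference handed to the routine */
        voucher_t *old = &v;
        (void)swap_voucher_not_supported(&old);
        assert(v.refs == 1);            /* the handed-in reference was released: no leak */
        printf("refs = %d\n", v.refs);
        return 0;
    }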
@@ -3073,6 +3245,12 @@ kern_allocation_name_t thread_set_allocation_name(kern_allocation_name_t new_nam
        return ret;
 }
 
+uint64_t
+thread_get_last_wait_duration(thread_t thread)
+{
+       return thread->last_made_runnable_time - thread->last_run_time;
+}
+
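thread_get_last_wait_duration() returns the difference of two thread timestamps which, as with other scheduler timestamps, appear to be in mach absolute-time units, so a consumer would normally convert the value to nanoseconds before reporting it. A minimal user-space sketch of that conversion (the delta below is synthetic, since the kernel accessor is obviously not callable from user space):

    #include <mach/mach_time.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Convert a mach absolute-time delta (e.g. a value like the one returned by
     * thread_get_last_wait_duration()) into nanoseconds. */
    static uint64_t abs_to_ns(uint64_t abs_delta)
    {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);
        return abs_delta * tb.numer / tb.denom;
    }

    int main(void)
    {
        uint64_t t0 = mach_absolute_time();
        uint64_t t1 = mach_absolute_time();
        /* Synthetic delta standing in for last_made_runnable_time - last_run_time. */
        printf("%llu ns\n", (unsigned long long)abs_to_ns(t1 - t0));
        return 0;
    }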
 #if CONFIG_DTRACE
 uint32_t dtrace_get_thread_predcache(thread_t thread)
 {
index ca2217584b3ad5fa91efa71b055004e77bc2238a..d2cf4278e7065fab322d12625e8a7bef9a4916f4 100644 (file)
 #include <kern/cpu_number.h>
 #include <kern/smp.h>
 #include <kern/queue.h>
+
 #include <kern/timer.h>
 #include <kern/simple_lock.h>
 #include <kern/locks.h>
 #include <kern/affinity.h>
 #include <kern/debug.h>
 #include <kern/block_hint.h>
+#include <kern/turnstile.h>
 
 #include <kern/waitq.h>
 #include <san/kasan.h>
 #include <machine/cpu_data.h>
 #include <machine/thread.h>
 
+#ifdef XNU_KERNEL_PRIVATE
+/* priority queue static asserts fail for __ARM64_ARCH_8_32__ kext builds */
+#include <kern/priority_queue.h>
+#endif /* XNU_KERNEL_PRIVATE */
+
 #if MONOTONIC
 #include <stdatomic.h>
 #include <machine/monotonic.h>
@@ -163,14 +170,18 @@ struct thread {
         *      anywhere in the thread structure.
         */
        union {
-               queue_chain_t           runq_links;     /* run queue links */
-               queue_chain_t           wait_links;     /* wait queue links */
+               queue_chain_t                   runq_links;             /* run queue links */
+               queue_chain_t                   wait_links;             /* wait queue links */
+               struct priority_queue_entry     wait_prioq_links;       /* priority ordered waitq links */
        };
 
        processor_t             runq;           /* run queue assignment */
 
        event64_t               wait_event;     /* wait queue event */
        struct waitq           *waitq;          /* wait queue this thread is enqueued on */
+       struct turnstile       *turnstile;      /* thread's turnstile, protected by primitives interlock */
+       void                   *inheritor;      /* inheritor of the primitive the thread will block on */
+       struct priority_queue  inheritor_queue; /* Inheritor queue */
 
        /* Data updated during assert_wait/thread_wakeup */
 #if __SMP__
@@ -218,9 +229,10 @@ struct thread {
 #define TH_SUSP                        0x02                    /* stopped or requested to stop */
 #define TH_RUN                 0x04                    /* running or on runq */
 #define TH_UNINT               0x08                    /* waiting uninterruptibly */
-#define TH_TERMINATE           0x10                    /* halted at termination */
-#define TH_TERMINATE2          0x20                    /* added to termination queue */
-
+#define TH_TERMINATE   0x10                    /* halted at termination */
+#define TH_TERMINATE2  0x20                    /* added to termination queue */
+#define TH_WAIT_REPORT 0x40                    /* the wait is using the sched_call,
+                                                                                  only set if TH_WAIT is also set */
 #define TH_IDLE                        0x80                    /* idling processor */
 
        /* Scheduling information */
@@ -240,7 +252,7 @@ struct thread {
 #define TH_SFLAG_THROTTLED             0x0004          /* throttled thread forced to timeshare mode (may be applied in addition to failsafe) */
 #define TH_SFLAG_DEMOTED_MASK      (TH_SFLAG_THROTTLED | TH_SFLAG_FAILSAFE)    /* saved_mode contains previous sched_mode */
 
-#define        TH_SFLAG_PROMOTED               0x0008          /* sched pri has been promoted */
+#define        TH_SFLAG_PROMOTED               0x0008          /* sched pri has been promoted by kernel mutex priority promotion */
 #define TH_SFLAG_ABORT                 0x0010          /* abort interruptible waits */
 #define TH_SFLAG_ABORTSAFELY           0x0020          /* ... but only those at safe point */
 #define TH_SFLAG_ABORTED_MASK          (TH_SFLAG_ABORT | TH_SFLAG_ABORTSAFELY)
@@ -249,13 +261,15 @@ struct thread {
 #define TH_SFLAG_DEPRESSED_MASK                (TH_SFLAG_DEPRESS | TH_SFLAG_POLLDEPRESS)
 /* unused TH_SFLAG_PRI_UPDATE           0x0100 */
 #define TH_SFLAG_EAGERPREEMPT          0x0200          /* Any preemption of this thread should be treated as if AST_URGENT applied */
-#define TH_SFLAG_RW_PROMOTED           0x0400          /* sched pri has been promoted due to blocking with RW lock held */
+#define TH_SFLAG_RW_PROMOTED           0x0400          /* promote reason: blocking with RW lock held */
 /* unused TH_SFLAG_THROTTLE_DEMOTED     0x0800 */
-#define TH_SFLAG_WAITQ_PROMOTED                0x1000          /* sched pri promoted from waitq wakeup (generally for IPC receive) */
+#define TH_SFLAG_WAITQ_PROMOTED                0x1000          /* promote reason: waitq wakeup (generally for IPC receive) */
 
 
-#define TH_SFLAG_EXEC_PROMOTED          0x8000         /* sched pri has been promoted since thread is in an exec */
-#define TH_SFLAG_PROMOTED_MASK         (TH_SFLAG_PROMOTED | TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED | TH_SFLAG_EXEC_PROMOTED)
+#define TH_SFLAG_EXEC_PROMOTED          0x8000         /* promote reason: thread is in an exec */
+
+/* 'promote reasons' that request a priority floor only, not a custom priority */
+#define TH_SFLAG_PROMOTE_REASON_MASK    (TH_SFLAG_RW_PROMOTED | TH_SFLAG_WAITQ_PROMOTED | TH_SFLAG_EXEC_PROMOTED)
 
 #define TH_SFLAG_RW_PROMOTED_BIT       (10)    /* 0x400 */
 
@@ -263,22 +277,24 @@ struct thread {
        int16_t                         base_pri;               /* base priority */
        int16_t                         max_priority;           /* copy of max base priority */
        int16_t                         task_priority;          /* copy of task base priority */
+       int16_t                         promotion_priority;     /* priority thread is currently promoted to */
 
 #if defined(CONFIG_SCHED_GRRR)
 #if 0
        uint16_t                        grrr_deficit;           /* fixed point (1/1000th quantum) fractional deficit */
 #endif
 #endif
-       
+
        int16_t                         promotions;                     /* level of promotion */
-       int16_t                         pending_promoter_index;
+       int                             iotier_override; /* atomic operations to set, cleared on ret to user */
        _Atomic uint32_t                ref_count;              /* number of references to me */
-       void                            *pending_promoter[2];
+
+       lck_mtx_t*                      waiting_for_mutex;      /* points to mutex we're waiting for until we acquire it */
 
        uint32_t                        rwlock_count;   /* Number of lck_rw_t locks held by thread */
 
        integer_t                       importance;                     /* task-relative importance */
-       uint32_t                        was_promoted_on_wakeup;
+       uint32_t                        was_promoted_on_wakeup;         /* thread promoted on wakeup to acquire mutex */
 
        /* Priority depression expiration */
        integer_t                       depress_timer_active;
@@ -321,7 +337,7 @@ struct thread {
 #if defined(CONFIG_SCHED_PROTO)
        uint32_t                        runqueue_generation;    /* last time runqueue was drained */
 #endif
-       
+
        /* Statistics and timesharing calculations */
 #if defined(CONFIG_SCHED_TIMESHARE_CORE)
        natural_t                       sched_stamp;    /* last scheduler tick */
@@ -347,6 +363,7 @@ struct thread {
        uint64_t                        vtimer_qos_save;
 
        timer_data_t            ptime;                  /* time executing in P mode */
+       timer_data_t            runnable_timer;         /* time the thread is runnable (including running) */
 
 #if CONFIG_SCHED_SFI
        /* Timing for wait state */
@@ -369,13 +386,13 @@ struct thread {
        /* Various bits of state to stash across a continuation, exclusive to the current thread block point */
        union {
                struct {
-                       mach_msg_return_t       state;          /* receive state */
+                       mach_msg_return_t       state;          /* receive state */
                        mach_port_seqno_t       seqno;          /* seqno of recvd message */
-                       ipc_object_t            object;         /* object received on */
-                       mach_vm_address_t       msg_addr;       /* receive buffer pointer */
+                       ipc_object_t            object;         /* object received on */
+                       mach_vm_address_t       msg_addr;       /* receive buffer pointer */
                        mach_msg_size_t         rsize;          /* max size for recvd msg */
                        mach_msg_size_t         msize;          /* actual size for recvd msg */
-                       mach_msg_option_t       option;         /* options for receive */
+                       mach_msg_option_t       option;         /* options for receive */
                        mach_port_name_t        receiver_name;  /* the receive port name */
                        struct knote            *knote;         /* knote fired for rcv */
                        union {
@@ -389,16 +406,12 @@ struct thread {
                        mach_msg_continue_t     continuation;
                } receive;
                struct {
-                       struct semaphore        *waitsemaphore;         /* semaphore ref */
+                       struct semaphore        *waitsemaphore;         /* semaphore ref */
                        struct semaphore        *signalsemaphore;       /* semaphore ref */
                        int                                     options;                        /* semaphore options */
                        kern_return_t           result;                         /* primary result */
                        mach_msg_continue_t continuation;
                } sema;
-               struct {
-                       int                                     option;         /* switch option */
-                       boolean_t                               reenable_workq_callback;        /* on entry, callbacks were suspended */
-               } swtch;
        } saved;
 
        /* Only user threads can cause guard exceptions, only kernel threads can be thread call threads */
@@ -439,6 +452,9 @@ struct thread {
                /* Task membership */
                struct task                             *task;
                vm_map_t                                map;
+#if DEVELOPMENT || DEBUG
+       boolean_t pmap_footprint_suspended;
+#endif /* DEVELOPMENT || DEBUG */
 
                decl_lck_mtx_data(,mutex)
 
@@ -543,17 +559,12 @@ struct thread {
                user_addr_t     override_resource;
        } *overrides;
 
-       _Atomic uint32_t kqwl_owning_count;
        uint32_t        ipc_overrides;
+       _Atomic uint32_t kqwl_owning_count;
        uint32_t        sync_ipc_overrides;
-       uint32_t        user_promotions;
        uint16_t        user_promotion_basepri;
        _Atomic uint16_t kevent_ast_bits;
 
-       block_hint_t    pending_block_hint;
-       block_hint_t    block_hint;      /* What type of primitive last caused us to block. */
-
-       int     iotier_override; /* atomic operations to set, cleared on ret to user */
        io_stat_info_t                  thread_io_stats; /* per-thread I/O statistics */
 
 #if CONFIG_EMBEDDED
@@ -582,6 +593,9 @@ struct thread {
 #if    SCHED_TRACE_THREAD_WAKEUPS
        uintptr_t               thread_wakeup_bt[64];
 #endif
+       turnstile_update_flags_t inheritor_flags; /* inheritor flags for inheritor field */
+       block_hint_t    pending_block_hint;
+       block_hint_t    block_hint;      /* What type of primitive last caused us to block. */
 };
 
 #define ith_state           saved.receive.state
@@ -607,7 +621,15 @@ struct thread {
 
 #define ITH_KNOTE_NULL      ((void *)NULL)
 #define ITH_KNOTE_PSEUDO    ((void *)0xdeadbeef)
-#define ITH_KNOTE_VALID(kn) ((kn) != ITH_KNOTE_NULL && (kn) != ITH_KNOTE_PSEUDO)
+/*
+ * The ith_knote is used during message delivery, and can safely be interpreted
+ * only when used for one of these codepaths, which the test for the msgt_name
+ * being RECEIVE or SEND_ONCE is about.
+ */
+#define ITH_KNOTE_VALID(kn, msgt_name) \
+               (((kn) != ITH_KNOTE_NULL && (kn) != ITH_KNOTE_PSEUDO) && \
+                ((msgt_name) == MACH_MSG_TYPE_PORT_RECEIVE || \
+                (msgt_name) == MACH_MSG_TYPE_PORT_SEND_ONCE))
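ITH_KNOTE_VALID() now also takes the message-type name, so the knote is treated as meaningful only on the RECEIVE and SEND_ONCE delivery paths. A self-contained sketch of how the new two-argument form evaluates (the sample pointer is fabricated; the MACH_MSG_TYPE_* constants come from <mach/message.h>):

    #include <mach/message.h>
    #include <stdio.h>

    #define ITH_KNOTE_NULL      ((void *)NULL)
    #define ITH_KNOTE_PSEUDO    ((void *)0xdeadbeef)

    /* Two-argument form introduced above. */
    #define ITH_KNOTE_VALID(kn, msgt_name) \
                    (((kn) != ITH_KNOTE_NULL && (kn) != ITH_KNOTE_PSEUDO) && \
                     ((msgt_name) == MACH_MSG_TYPE_PORT_RECEIVE || \
                      (msgt_name) == MACH_MSG_TYPE_PORT_SEND_ONCE))

    int main(void)
    {
        void *kn = (void *)0x1000;   /* fabricated non-NULL, non-pseudo knote */

        /* Valid only on the receive / send-once delivery paths. */
        printf("%d\n", ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_RECEIVE));   /* 1 */
        printf("%d\n", ITH_KNOTE_VALID(kn, MACH_MSG_TYPE_PORT_SEND));      /* 0 */
        printf("%d\n", ITH_KNOTE_VALID(ITH_KNOTE_PSEUDO,
                                       MACH_MSG_TYPE_PORT_SEND_ONCE));     /* 0 */
        return 0;
    }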
 
 #if MACH_ASSERT
 #define assert_thread_magic(thread) assertf((thread)->thread_magic == THREAD_MAGIC, \
@@ -668,6 +690,9 @@ extern void                 thread_copy_resource_info(
 
 extern void                    thread_terminate_crashed_threads(void);
 
+extern void                    turnstile_deallocate_enqueue(
+                                               struct turnstile *turnstile);
+
 extern void                    thread_stack_enqueue(
                                                thread_t                thread);
 
@@ -778,9 +803,22 @@ extern kern_return_t       machine_thread_get_state(
                                                        thread_state_t                  state,
                                                        mach_msg_type_number_t  *count);
 
+extern kern_return_t   machine_thread_state_convert_from_user(
+                                                       thread_t                                thread,
+                                                       thread_flavor_t                 flavor,
+                                                       thread_state_t                  tstate,
+                                                       mach_msg_type_number_t  count);
+
+extern kern_return_t   machine_thread_state_convert_to_user(
+                                                       thread_t                                thread,
+                                                       thread_flavor_t                 flavor,
+                                                       thread_state_t                  tstate,
+                                                       mach_msg_type_number_t  *count);
+
 extern kern_return_t   machine_thread_dup(
                                                        thread_t                self,
-                                                       thread_t                target);
+                                                       thread_t                target,
+                                                       boolean_t               is_corpse);
 
 extern void                            machine_thread_init(void);
 
@@ -839,6 +877,10 @@ extern void thread_set_options(uint32_t thopt);
 
 __BEGIN_DECLS
 
+extern void thread_mtx_lock(thread_t thread);
+
+extern void thread_mtx_unlock(thread_t thread);
+
 extern thread_t                current_thread(void);
 
 extern void                    thread_reference(
@@ -900,6 +942,7 @@ __BEGIN_DECLS
 
 uint16_t       thread_set_tag(thread_t, uint16_t);
 uint16_t       thread_get_tag(thread_t);
+uint64_t       thread_last_run_time(thread_t);
 
 extern kern_return_t    thread_state_initialize(
                                                        thread_t                                thread);
@@ -910,12 +953,24 @@ extern kern_return_t      thread_setstatus(
                                                        thread_state_t                  tstate,
                                                        mach_msg_type_number_t  count);
 
+extern kern_return_t   thread_setstatus_from_user(
+                                                       thread_t                                thread,
+                                                       int                                             flavor,
+                                                       thread_state_t                  tstate,
+                                                       mach_msg_type_number_t  count);
+
 extern kern_return_t   thread_getstatus(
                                                        thread_t                                thread,
                                                        int                                             flavor,
                                                        thread_state_t                  tstate,
                                                        mach_msg_type_number_t  *count);
 
+extern kern_return_t   thread_getstatus_to_user(
+                                                       thread_t                                thread,
+                                                       int                                             flavor,
+                                                       thread_state_t                  tstate,
+                                                       mach_msg_type_number_t  *count);
+
 extern kern_return_t   thread_create_with_continuation(
                                                        task_t task,
                                                        thread_t *new_thread,
@@ -926,21 +981,15 @@ extern kern_return_t thread_create_waiting(task_t               task,
                                            event_t              event,
                                            thread_t             *new_thread);
 
-extern kern_return_t   thread_create_workq(
-                                                       task_t                  task,
-                                                       thread_continue_t       thread_return,
-                                                       thread_t                *new_thread);
-
 extern kern_return_t   thread_create_workq_waiting(
                                                        task_t                  task,
                                                        thread_continue_t       thread_return,
-                                                       event_t         event,
                                                        thread_t                *new_thread);
 
 extern void    thread_yield_internal(
        mach_msg_timeout_t      interval);
 
-extern void    thread_yield_to_preemption(void);
+extern void            thread_yield_to_preemption(void);
 
 /*
  * Thread-private CPU limits: apply a private CPU limit to this thread only. Available actions are:
@@ -963,9 +1012,10 @@ extern int thread_get_cpulimit(int *action, uint8_t *percentage, uint64_t *inter
 extern int thread_set_cpulimit(int action, uint8_t percentage, uint64_t interval_ns);
 
 extern void                    thread_read_times(
-                                               thread_t                thread,
+                                               thread_t                thread,
                                                time_value_t    *user_time,
-                                               time_value_t    *system_time);
+                                               time_value_t    *system_time,
+                                               time_value_t    *runnable_time);
 
 extern uint64_t                thread_get_runtime_self(void);
 
@@ -1034,29 +1084,27 @@ extern void             thread_sched_call(
                                        thread_t                thread,
                                        sched_call_t    call);
 
-extern sched_call_t    thread_disable_sched_call(
-                                       thread_t                thread,
-                                       sched_call_t    call);
-
-extern void    thread_reenable_sched_call(
-                                       thread_t                thread,
-                                       sched_call_t    call);
-
-extern void            thread_static_param(
-                                       thread_t                thread,
-                                       boolean_t               state);
-
 extern boolean_t       thread_is_static_param(
                                        thread_t                thread);
 
 extern task_t  get_threadtask(thread_t);
-#define thread_is_64bit(thd)   \
-       task_has_64BitAddr(get_threadtask(thd))
 
+/*
+ * Thread is running within a 64-bit address space.
+ */
+#define thread_is_64bit_addr(thd)      \
+       task_has_64Bit_addr(get_threadtask(thd))
+
+/*
+ * Thread is using 64-bit machine state.
+ */
+#define thread_is_64bit_data(thd)      \
+       task_has_64Bit_data(get_threadtask(thd))
 
 extern void            *get_bsdthread_info(thread_t);
 extern void            set_bsdthread_info(thread_t, void *);
 extern void            *uthread_alloc(task_t, thread_t, int);
+extern event_t workq_thread_init_and_wq_lock(task_t, thread_t); // bsd/pthread/
 extern void            uthread_cleanup_name(void *uthread);
 extern void            uthread_cleanup(task_t, void *, void *);
 extern void            uthread_zone_free(void *); 
@@ -1119,6 +1167,8 @@ extern void vn_guard_ast(thread_t,
 #endif
 extern void mach_port_guard_ast(thread_t,
        mach_exception_code_t, mach_exception_subcode_t);
+extern void virt_memory_guard_ast(thread_t,
+       mach_exception_code_t, mach_exception_subcode_t);
 extern void thread_guard_violation(thread_t,
        mach_exception_code_t, mach_exception_subcode_t);
 extern void thread_update_io_stats(thread_t, int size, int io_flags);
@@ -1147,6 +1197,23 @@ extern void thread_set_thread_name(thread_t th, const char* name);
 
 extern void thread_enable_send_importance(thread_t thread, boolean_t enable);
 
+/*
+ * Translate signal context data pointer to userspace representation
+ */
+
+extern kern_return_t   machine_thread_siguctx_pointer_convert_to_user(
+                                                       thread_t thread,
+                                                       user_addr_t *uctxp);
+
+/*
+ * Translate array of function pointer syscall arguments from userspace representation
+ */
+
+extern kern_return_t   machine_thread_function_pointers_convert_from_user(
+                                                       thread_t thread,
+                                                       user_addr_t *fptrs,
+                                                       uint32_t count);
+
 /* Get a backtrace for a threads kernel or user stack (user_p), with pc and optionally
  * frame pointer (getfp). Returns bytes added to buffer, and kThreadTruncatedBT in
  * thread_trace_flags if a user page is not present after kdp_lightweight_fault() is
@@ -1163,12 +1230,18 @@ extern int                              machine_trace_thread(
                                                        uint32_t *thread_trace_flags);
 
 extern int                             machine_trace_thread64(thread_t thread,
-                                                       char *tracepos,
-                                                       char *tracebound,
-                                                       int nframes,
-                                                       boolean_t user_p,
-                                                       boolean_t getfp,
-                                                       uint32_t *thread_trace_flags);
+                                                                                          char *tracepos,
+                                                                                          char *tracebound,
+                                                                                          int nframes,
+                                                                                          boolean_t user_p,
+                                                                                          boolean_t getfp,
+                                                                                          uint32_t *thread_trace_flags,
+                                                                                          uint64_t *sp);
+
+/*
+ * Get the duration of the given thread's last wait.
+ */
+uint64_t thread_get_last_wait_duration(thread_t thread);
 
 #endif /* XNU_KERNEL_PRIVATE */
 
index 9c7aa300c8657aab2e3c3bc6f8f458f7cb58720d..4faa1e9b569014dfef9b01786c340fe27e9638ea 100644 (file)
@@ -49,6 +49,7 @@
  *
  *     Thread management routines
  */
+
 #include <mach/mach_types.h>
 #include <mach/kern_return.h>
 #include <mach/thread_act_server.h>
@@ -315,27 +316,26 @@ thread_resume(thread_t thread)
 }
 
 /*
- *     thread_depress_abort:
+ *     thread_depress_abort_from_user:
  *
  *     Prematurely abort priority depression if there is one.
  */
 kern_return_t
-thread_depress_abort(
-       thread_t        thread)
+thread_depress_abort_from_user(thread_t thread)
 {
-       kern_return_t           result;
+       kern_return_t result;
 
-    if (thread == THREAD_NULL)
+       if (thread == THREAD_NULL)
                return (KERN_INVALID_ARGUMENT);
 
-    thread_mtx_lock(thread);
+       thread_mtx_lock(thread);
 
        if (thread->active)
-               result = thread_depress_abort_internal(thread);
+               result = thread_depress_abort(thread);
        else
                result = KERN_TERMINATED;
 
-    thread_mtx_unlock(thread);
+       thread_mtx_unlock(thread);
 
        return (result);
 }
@@ -358,6 +358,7 @@ act_abort(
        if (!(thread->sched_flags & TH_SFLAG_ABORT)) {
                thread->sched_flags |= TH_SFLAG_ABORT;
                thread_set_apc_ast_locked(thread);
+               thread_depress_abort_locked(thread);
        } else {
                thread->sched_flags &= ~TH_SFLAG_ABORTSAFELY;
        }
@@ -409,6 +410,7 @@ thread_abort_safely(
                        if (!(thread->sched_flags & TH_SFLAG_ABORT)) {
                                thread->sched_flags |= TH_SFLAG_ABORTED_MASK;
                                thread_set_apc_ast_locked(thread);
+                               thread_depress_abort_locked(thread);
                        }
                }
                thread_unlock(thread);
@@ -452,12 +454,13 @@ thread_info(
        return (result);
 }
 
-kern_return_t
-thread_get_state(
+static inline kern_return_t
+thread_get_state_internal(
        thread_t                thread,
        int                                             flavor,
        thread_state_t                  state,                  /* pointer to OUT array */
-       mach_msg_type_number_t  *state_count)   /*IN/OUT*/
+       mach_msg_type_number_t  *state_count,   /*IN/OUT*/
+       boolean_t                               to_user)
 {
        kern_return_t           result = KERN_SUCCESS;
 
@@ -497,16 +500,50 @@ thread_get_state(
        else
                result = KERN_TERMINATED;
 
+       if (to_user && result == KERN_SUCCESS) {
+               result = machine_thread_state_convert_to_user(thread, flavor, state,
+                               state_count);
+       }
+
        thread_mtx_unlock(thread);
 
        return (result);
 }
 
+/* No prototype, since thread_act_server.h has the _to_user version if KERNEL_SERVER */
+
+kern_return_t
+thread_get_state(
+       thread_t                thread,
+       int                                             flavor,
+       thread_state_t                  state,
+       mach_msg_type_number_t  *state_count);
+
+kern_return_t
+thread_get_state(
+       thread_t                thread,
+       int                                             flavor,
+       thread_state_t                  state,                  /* pointer to OUT array */
+       mach_msg_type_number_t  *state_count)   /*IN/OUT*/
+{
+       return thread_get_state_internal(thread, flavor, state, state_count, FALSE);
+}
+
+kern_return_t
+thread_get_state_to_user(
+       thread_t                thread,
+       int                                             flavor,
+       thread_state_t                  state,                  /* pointer to OUT array */
+       mach_msg_type_number_t  *state_count)   /*IN/OUT*/
+{
+       return thread_get_state_internal(thread, flavor, state, state_count, TRUE);
+}
+
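Per the comment above, the MIG server side now binds to the _to_user variant, so state read on behalf of user space passes through machine_thread_state_convert_to_user() before the thread mutex is dropped. From user space the call still looks like the ordinary thread_get_state() routine; a small runnable macOS example (using the generic MACHINE_THREAD_STATE flavor) that exercises that path against a suspended worker thread:

    #include <mach/mach.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static void *spin(void *arg)
    {
        (void)arg;
        for (;;) { }                 /* keep the worker alive so we can inspect it */
    }

    int main(void)
    {
        pthread_t worker;
        pthread_create(&worker, NULL, spin, NULL);
        sleep(1);                                   /* let the worker start running */

        thread_act_t port = pthread_mach_thread_np(worker);
        thread_suspend(port);                       /* suspend so the register snapshot is stable */

        natural_t state[MACHINE_THREAD_STATE_COUNT];
        mach_msg_type_number_t count = MACHINE_THREAD_STATE_COUNT;
        kern_return_t kr = thread_get_state(port, MACHINE_THREAD_STATE,
                                            (thread_state_t)state, &count);
        printf("thread_get_state: kr=%d, %u words of state\n", kr, count);

        thread_resume(port);
        return 0;
    }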
 /*
  *     Change thread's machine-dependent state.  Called with nothing
  *     locked.  Returns same way.
  */
-static kern_return_t
+static inline kern_return_t
 thread_set_state_internal(
        thread_t                thread,
        int                                             flavor,
@@ -522,6 +559,13 @@ thread_set_state_internal(
        thread_mtx_lock(thread);
 
        if (thread->active) {
+               if (from_user) {
+                       result = machine_thread_state_convert_from_user(thread, flavor,
+                                       state, state_count);
+                       if (result != KERN_SUCCESS) {
+                               goto out;
+                       }
+               }
                if (thread != current_thread()) {
                        thread_hold(thread);
 
@@ -550,6 +594,7 @@ thread_set_state_internal(
        if ((result == KERN_SUCCESS) && from_user)
                extmod_statistics_incr_thread_set_state(thread);
 
+out:
        thread_mtx_unlock(thread);
 
        return (result);
@@ -650,7 +695,8 @@ thread_dup(
 
                if (thread_stop(target, TRUE)) {
                        thread_mtx_lock(target);
-                       result = machine_thread_dup(self, target);
+                       result = machine_thread_dup(self, target, FALSE);
+
                        if (self->affinity_set != AFFINITY_SET_NULL)
                                thread_affinity_dup(self, target);
                        thread_unstop(target);
@@ -699,7 +745,7 @@ thread_dup2(
 
                if (thread_stop(target, TRUE)) {
                        thread_mtx_lock(target);
-                       result = machine_thread_dup(source, target);
+                       result = machine_thread_dup(source, target, TRUE);
                        if (source->affinity_set != AFFINITY_SET_NULL)
                                thread_affinity_dup(source, target);
                        thread_unstop(target);
@@ -736,6 +782,17 @@ thread_setstatus(
        return (thread_set_state(thread, flavor, tstate, count));
 }
 
+kern_return_t
+thread_setstatus_from_user(
+       thread_t                thread,
+       int                                             flavor,
+       thread_state_t                  tstate,
+       mach_msg_type_number_t  count)
+{
+
+       return (thread_set_state_from_user(thread, flavor, tstate, count));
+}
+
 /*
  *     thread_getstatus:
  *
@@ -751,6 +808,16 @@ thread_getstatus(
        return (thread_get_state(thread, flavor, tstate, count));
 }
 
+kern_return_t
+thread_getstatus_to_user(
+       thread_t                thread,
+       int                                             flavor,
+       thread_state_t                  tstate,
+       mach_msg_type_number_t  *count)
+{
+       return (thread_get_state_to_user(thread, flavor, tstate, count));
+}
+
 /*
  *     Change thread's machine-dependent userspace TSD base.
  *  Called with nothing locked.  Returns same way.
@@ -826,16 +893,6 @@ thread_set_apc_ast(thread_t thread)
 static void
 thread_set_apc_ast_locked(thread_t thread)
 {
-       /*
-        * Temporarily undepress, so target has
-        * a chance to do locking required to
-        * block itself in thread_suspended.
-        *
-        * Leaves the depress flag set so we can reinstate when it's blocked.
-        */
-       if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK)
-               thread_recompute_sched_pri(thread, TRUE);
-
        thread_ast_set(thread, AST_APC);
 
        if (thread == current_thread()) {
@@ -861,9 +918,7 @@ thread_set_apc_ast_locked(thread_t thread)
  *
  * Continuation routine for thread suspension.  It checks
  * to see whether there has been any new suspensions.  If so, it
- * installs the AST_APC handler again.  Otherwise, it checks to see
- * if the current depression needs to be re-instated (it may have
- * been temporarily removed in order to get to this point in a hurry).
+ * installs the AST_APC handler again.
  */
 __attribute__((noreturn))
 static void
@@ -878,27 +933,8 @@ thread_suspended(__unused void *parameter, wait_result_t result)
        else
                assert(thread->suspend_parked == FALSE);
 
-       if (thread->suspend_count > 0) {
+       if (thread->suspend_count > 0)
                thread_set_apc_ast(thread);
-       } else {
-               spl_t s = splsched();
-
-               thread_lock(thread);
-               if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       thread->sched_pri = DEPRESSPRI;
-                       thread->last_processor->current_pri = thread->sched_pri;
-                       thread->last_processor->current_perfctl_class = thread_get_perfcontrol_class(thread);
-
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_SCHED_CHANGE_PRIORITY),
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->base_pri,
-                                             thread->sched_pri,
-                                             thread->sched_usage,
-                                             0);
-               }
-               thread_unlock(thread);
-               splx(s);
-       }
 
        thread_mtx_unlock(thread);
 
@@ -938,7 +974,8 @@ thread_apc_ast(thread_t thread)
        /* If we're suspended, go to sleep and wait for someone to wake us up. */
        if (thread->suspend_count > 0) {
                thread->suspend_parked = TRUE;
-               assert_wait(&thread->suspend_count, THREAD_ABORTSAFE);
+               assert_wait(&thread->suspend_count,
+                               THREAD_ABORTSAFE | THREAD_WAIT_NOREPORT_USER);
                thread_mtx_unlock(thread);
 
                thread_block(thread_suspended);
@@ -984,6 +1021,14 @@ act_set_state_from_user(
     
 }
 
+/* Prototype, see justification above */
+kern_return_t
+act_get_state(
+       thread_t                                thread,
+       int                                             flavor,
+       thread_state_t                  state,
+       mach_msg_type_number_t  *count);
+
 kern_return_t
 act_get_state(
        thread_t                                thread,
@@ -997,6 +1042,19 @@ act_get_state(
     return (thread_get_state(thread, flavor, state, count));
 }
 
+kern_return_t
+act_get_state_to_user(
+       thread_t                                thread,
+       int                                             flavor,
+       thread_state_t                  state,
+       mach_msg_type_number_t  *count)
+{
+    if (thread == current_thread())
+           return (KERN_INVALID_ARGUMENT);
+
+    return (thread_get_state_to_user(thread, flavor, state, count));
+}
+
 static void
 act_set_ast(
            thread_t thread,
index d43248ee52b811e4e6679917e63e28b5f24f9de6..ec92802f878d1e40b50b6a5dacfbbf11cb7e8b91 100644 (file)
@@ -1285,7 +1285,6 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
        uint64_t  time;
        uint32_t  flags;
        boolean_t signal;
-       boolean_t dowake = FALSE;
        boolean_t repend = FALSE;
 
        call->tc_finish_count++;
@@ -1328,22 +1327,8 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
                }
        }
 
-       if ((flags & THREAD_CALL_WAIT) != 0) {
-               dowake = TRUE;
-
-               /* 
-                * Dropping lock here because the sched call for the 
-                * high-pri group can take the big lock from under
-                * a thread lock.
-                */
-               thread_call_unlock();
-               thread_wakeup((event_t)call);
-               thread_call_lock_spin();
-               /* THREAD_CALL_SIGNAL call may have been freed */
-       }
-
        if (!signal && (call->tc_refs == 0)) {
-               if (dowake) {
+               if ((flags & THREAD_CALL_WAIT) != 0) {
                        panic("Someone waiting on a thread call that is scheduled for free: %p\n", call->tc_call.func);
                }
 
@@ -1356,6 +1341,18 @@ thread_call_finish(thread_call_t call, thread_call_group_t group, spl_t *s)
                *s = disable_ints_and_lock();
        }
 
+       if ((flags & THREAD_CALL_WAIT) != 0) {
+               /*
+                * Dropping lock here because the sched call for the
+                * high-pri group can take the big lock from under
+                * a thread lock.
+                */
+               thread_call_unlock();
+               thread_wakeup((event_t)call);
+               thread_call_lock_spin();
+               /* THREAD_CALL_SIGNAL call may have been freed */
+       }
+
        return (repend);
 }
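With this reordering, the THREAD_CALL_WAIT wakeup happens only after the free-on-last-reference bookkeeping, and the lock is still dropped around it because the high-priority group's sched call can take the "big" lock from underneath a thread lock. The following is a small user-space analogy of that lock-ordering rule, not the kernel code (both mutexes and the handler are invented for illustration): a notifier whose handler needs lock B must not run it while holding lock A if other code acquires B before A.

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER; /* analogue of the thread_call lock */
    static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER; /* analogue of a broader "big" lock */

    /* Handler that needs lock_b, like the sched call mentioned in the comment above. */
    static void wake_handler(void)
    {
        pthread_mutex_lock(&lock_b);
        puts("handler ran under lock_b");
        pthread_mutex_unlock(&lock_b);
    }

    static void finish_and_wake(void)
    {
        pthread_mutex_lock(&lock_a);
        /* ... bookkeeping that must happen under lock_a ... */
        pthread_mutex_unlock(&lock_a);   /* drop lock_a before calling into code that takes lock_b */
        wake_handler();
        pthread_mutex_lock(&lock_a);
        /* ... the object may have been reused or freed meanwhile; revalidate before touching it ... */
        pthread_mutex_unlock(&lock_a);
    }

    int main(void)
    {
        finish_and_wake();
        return 0;
    }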
 
index 6e79915071445826daf56d62ce49eebc750eaeaf..e19269ee28a4292ca7eda53af635c33ca470433b 100644 (file)
@@ -33,7 +33,6 @@
 #define _KERN_THREAD_GROUP_H_
 
 struct thread_group;
-typedef struct thread_group *thread_group_t;
 
 #include <mach/thread_status.h> /* for proc_reg.h / CONFIG_THREAD_GROUPS */
 
index 6ede110d1ee60ab547ad9bdc531db049fc283654..7b7e4f87def4c369203acffa240566d9f9d9383b 100644 (file)
@@ -53,13 +53,12 @@ uint32_t qos_override_mode;
 #define QOS_OVERRIDE_MODE_OVERHANG_PEAK 0
 #define QOS_OVERRIDE_MODE_IGNORE_OVERRIDE 1
 #define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE 2
-#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH 3
-#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 4
+#define QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE 3
 
 extern zone_t thread_qos_override_zone;
 
-static boolean_t
-proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset, boolean_t squash);
+static void
+proc_thread_qos_remove_override_internal(thread_t thread, user_addr_t resource, int resource_type, boolean_t reset);
 
 /*
  * THREAD_QOS_UNSPECIFIED is assigned the highest tier available, so it does not provide a limit
@@ -83,15 +82,6 @@ const qos_policy_params_t thread_qos_policy_params = {
         * This table defines the highest IO priority that a thread marked with this
         * QoS class can have.
         */
-#if CONFIG_EMBEDDED
-       .qos_iotier[THREAD_QOS_UNSPECIFIED]             = THROTTLE_LEVEL_TIER0,
-       .qos_iotier[THREAD_QOS_USER_INTERACTIVE]        = THROTTLE_LEVEL_TIER0,
-       .qos_iotier[THREAD_QOS_USER_INITIATED]          = THROTTLE_LEVEL_TIER0,
-       .qos_iotier[THREAD_QOS_LEGACY]                  = THROTTLE_LEVEL_TIER0,
-       .qos_iotier[THREAD_QOS_UTILITY]                 = THROTTLE_LEVEL_TIER0,
-       .qos_iotier[THREAD_QOS_BACKGROUND]              = THROTTLE_LEVEL_TIER3,
-       .qos_iotier[THREAD_QOS_MAINTENANCE]             = THROTTLE_LEVEL_TIER3,
-#else
        .qos_iotier[THREAD_QOS_UNSPECIFIED]             = THROTTLE_LEVEL_TIER0,
        .qos_iotier[THREAD_QOS_USER_INTERACTIVE]        = THROTTLE_LEVEL_TIER0,
        .qos_iotier[THREAD_QOS_USER_INITIATED]          = THROTTLE_LEVEL_TIER0,
@@ -99,7 +89,6 @@ const qos_policy_params_t thread_qos_policy_params = {
        .qos_iotier[THREAD_QOS_UTILITY]                 = THROTTLE_LEVEL_TIER1,
        .qos_iotier[THREAD_QOS_BACKGROUND]              = THROTTLE_LEVEL_TIER2, /* possibly overridden by bg_iotier */
        .qos_iotier[THREAD_QOS_MAINTENANCE]             = THROTTLE_LEVEL_TIER3,
-#endif
 
        /*
         * This table defines the highest QoS level that
@@ -643,35 +632,133 @@ unlock:
        return kr;
 }
 
+uint8_t
+thread_workq_pri_for_qos(thread_qos_t qos)
+{
+       assert(qos < THREAD_QOS_LAST);
+       return (uint8_t)thread_qos_policy_params.qos_pri[qos];
+}
+
+thread_qos_t
+thread_workq_qos_for_pri(int priority)
+{
+       int qos;
+       if (priority > thread_qos_policy_params.qos_pri[THREAD_QOS_USER_INTERACTIVE]) {
+               // indicate that workq should map >UI threads to workq's
+               // internal notation for above-UI work.
+               return THREAD_QOS_UNSPECIFIED;
+       }
+       for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) {
+               // map a given priority up to the next nearest qos band.
+               if (thread_qos_policy_params.qos_pri[qos - 1] < priority) {
+                       return qos;
+               }
+       }
+       return THREAD_QOS_MAINTENANCE;
+}
+
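thread_workq_qos_for_pri() walks down from USER_INTERACTIVE and returns the first band whose next-lower band sits strictly below the requested priority, i.e. a priority is rounded up to the nearest QoS band. A standalone sketch of that mapping with an invented qos_pri table (the real values live in thread_qos_policy_params and are not shown in this hunk; the tier ordering mirrors the kernel's ascending THREAD_QOS_* numbering):

    #include <stdio.h>

    enum { QOS_UNSPECIFIED, QOS_MAINTENANCE, QOS_BACKGROUND, QOS_UTILITY,
           QOS_LEGACY, QOS_USER_INITIATED, QOS_USER_INTERACTIVE, QOS_LAST };

    /* Invented base priorities per tier, standing in for thread_qos_policy_params.qos_pri[]. */
    static const int qos_pri[QOS_LAST] = { 0, 4, 11, 20, 31, 37, 47 };

    static int qos_for_pri(int priority)
    {
        if (priority > qos_pri[QOS_USER_INTERACTIVE])
            return QOS_UNSPECIFIED;           /* above-UI work: workq handles it specially */
        for (int qos = QOS_USER_INTERACTIVE; qos > QOS_MAINTENANCE; qos--) {
            if (qos_pri[qos - 1] < priority)  /* round up to the nearest band */
                return qos;
        }
        return QOS_MAINTENANCE;
    }

    int main(void)
    {
        /* 25 falls between the UTILITY (20) and LEGACY (31) bases, so it maps to LEGACY. */
        printf("pri 25 -> qos %d\n", qos_for_pri(25));
        printf("pri 48 -> qos %d\n", qos_for_pri(48));   /* above UI -> UNSPECIFIED */
        return 0;
    }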
 /*
- * KPI for pthread kext
+ * private interface for pthread workqueues
  *
  * Set scheduling policy & absolute priority for thread
- * May be called from waitqueue callout context with spinlocks held
+ * May be called with spinlocks held
  * Thread mutex lock is not held
  */
-kern_return_t
+void
+thread_reset_workq_qos(thread_t thread, uint32_t qos)
+{
+       struct task_pend_token pend_token = {};
+
+       assert(qos < THREAD_QOS_LAST);
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token);
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED, 0,
+                       &pend_token);
+
+       assert(pend_token.tpt_update_sockets == 0);
+
+       thread_unlock(thread);
+       splx(s);
+
+       thread_policy_update_complete_unlocked(thread, &pend_token);
+}
+
+/*
+ * private interface for pthread workqueues
+ *
+ * Set scheduling policy & absolute priority for thread
+ * May be called with spinlocks held
+ * Thread mutex lock is held
+ */
+void
+thread_set_workq_override(thread_t thread, uint32_t qos)
+{
+       struct task_pend_token pend_token = {};
+
+       assert(qos < THREAD_QOS_LAST);
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_WORKQ_OVERRIDE, qos, 0, &pend_token);
+
+       assert(pend_token.tpt_update_sockets == 0);
+
+       thread_unlock(thread);
+       splx(s);
+
+       thread_policy_update_complete_unlocked(thread, &pend_token);
+}
+
+/*
+ * private interface for pthread workqueues
+ *
+ * Set scheduling policy & absolute priority for thread
+ * May be called with spinlocks held
+ * Thread mutex lock is not held
+ */
+void
 thread_set_workq_pri(thread_t  thread,
+                     thread_qos_t qos,
                      integer_t priority,
                      integer_t policy)
 {
        struct task_pend_token pend_token = {};
        sched_mode_t mode = convert_policy_to_sched_mode(policy);
 
+       assert(qos < THREAD_QOS_LAST);
        assert(thread->static_param);
-       if (!thread->static_param)
-               return KERN_FAILURE;
+
+       if (!thread->static_param || !thread->active)
+               return;
+
+       spl_t s = splsched();
+       thread_lock(thread);
+
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_AND_RELPRIO, qos, 0, &pend_token);
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_WORKQ_OVERRIDE, THREAD_QOS_UNSPECIFIED,
+                       0, &pend_token);
+
+       thread_unlock(thread);
+       splx(s);
 
        /* Concern: this doesn't hold the mutex... */
-       if (!thread->active)
-               return KERN_TERMINATED;
 
-       kern_return_t kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority, &pend_token);
+       __assert_only kern_return_t kr;
+       kr = thread_set_mode_and_absolute_pri_internal(thread, mode, priority,
+                       &pend_token);
+       assert(kr == KERN_SUCCESS);
 
        if (pend_token.tpt_update_thread_sfi)
                sfi_reevaluate(thread);
-
-       return kr;
 }
 
 /*
@@ -762,7 +849,7 @@ thread_update_qos_cpu_time_locked(thread_t thread)
         * last context switch (embedded) or last user/kernel boundary transition (desktop)
         * because user_timer and system_timer are only updated then.
         *
-        * TODO: Consider running a thread_timer_event operation here to update it first.
+        * TODO: Consider running a timer_update operation here to update it first.
         *       Maybe doable with interrupts disabled from current thread.
         *       If the thread is on a different core, may not be easy to get right.
         *
@@ -779,7 +866,7 @@ thread_update_qos_cpu_time_locked(thread_t thread)
 
        /* Update the task-level effective and requested qos stats atomically, because we don't have the task lock. */
        switch (thread->effective_policy.thep_qos) {
-               case THREAD_QOS_DEFAULT:            task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_default;          break;
+               case THREAD_QOS_UNSPECIFIED:        task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_default;          break;
                case THREAD_QOS_MAINTENANCE:        task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_maintenance;      break;
                case THREAD_QOS_BACKGROUND:         task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_background;       break;
                case THREAD_QOS_UTILITY:            task_counter = &task->cpu_time_eqos_stats.cpu_time_qos_utility;          break;
@@ -794,7 +881,7 @@ thread_update_qos_cpu_time_locked(thread_t thread)
 
        /* Update the task-level qos stats atomically, because we don't have the task lock. */
        switch (thread->requested_policy.thrp_qos) {
-               case THREAD_QOS_DEFAULT:            task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_default;          break;
+               case THREAD_QOS_UNSPECIFIED:        task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_default;          break;
                case THREAD_QOS_MAINTENANCE:        task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_maintenance;      break;
                case THREAD_QOS_BACKGROUND:         task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_background;       break;
                case THREAD_QOS_UTILITY:            task_counter = &task->cpu_time_rqos_stats.cpu_time_qos_utility;          break;
@@ -1183,7 +1270,7 @@ thread_policy_get(
                        info->thps_requested_policy = *(uint64_t*)(void*)(&thread->requested_policy);
                        info->thps_effective_policy = *(uint64_t*)(void*)(&thread->effective_policy);
 
-                       info->thps_user_promotions          = thread->user_promotions;
+                       info->thps_user_promotions          = 0;
                        info->thps_user_promotion_basepri   = thread->user_promotion_basepri;
                        info->thps_ipc_overrides            = thread->ipc_overrides;
 
@@ -1346,14 +1433,10 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr
        uint32_t next_qos = requested.thrp_qos;
 
        if (requested.thrp_qos != THREAD_QOS_UNSPECIFIED) {
-               if (requested.thrp_qos_override != THREAD_QOS_UNSPECIFIED)
-                       next_qos = MAX(requested.thrp_qos_override, next_qos);
-
-               if (requested.thrp_qos_promote != THREAD_QOS_UNSPECIFIED)
-                       next_qos = MAX(requested.thrp_qos_promote, next_qos);
-
-               if (requested.thrp_qos_ipc_override != THREAD_QOS_UNSPECIFIED)
-                       next_qos = MAX(requested.thrp_qos_ipc_override, next_qos);
+               next_qos = MAX(requested.thrp_qos_override, next_qos);
+               next_qos = MAX(requested.thrp_qos_promote, next_qos);
+               next_qos = MAX(requested.thrp_qos_ipc_override, next_qos);
+               next_qos = MAX(requested.thrp_qos_workq_override, next_qos);
        }
 
        next.thep_qos = next_qos;
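The override folding above can become a plain MAX chain because THREAD_QOS_UNSPECIFIED is the lowest QoS value, so an unset override never raises the result, and the new thrp_qos_workq_override input participates the same way. A compact standalone illustration of why the unconditional MAX works (the QoS numbering is invented to match the ascending kernel ordering, and the sample inputs are arbitrary):

    #include <stdio.h>

    #define MAX(a, b) ((a) > (b) ? (a) : (b))

    enum { QOS_UNSPECIFIED, QOS_MAINTENANCE, QOS_BACKGROUND, QOS_UTILITY,
           QOS_LEGACY, QOS_USER_INITIATED, QOS_USER_INTERACTIVE };

    int main(void)
    {
        int requested      = QOS_UTILITY;
        int qos_override   = QOS_UNSPECIFIED;      /* unset: MAX() leaves the result alone */
        int qos_promote    = QOS_USER_INITIATED;   /* promotion input */
        int ipc_override   = QOS_UNSPECIFIED;
        int workq_override = QOS_LEGACY;           /* the new thrp_qos_workq_override input */

        int next_qos = requested;
        next_qos = MAX(qos_override, next_qos);
        next_qos = MAX(qos_promote, next_qos);
        next_qos = MAX(ipc_override, next_qos);
        next_qos = MAX(workq_override, next_qos);

        printf("effective qos = %d\n", next_qos);  /* USER_INITIATED (5) */
        return 0;
    }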
@@ -1379,8 +1462,7 @@ thread_policy_update_internal_spinlocked(thread_t thread, boolean_t recompute_pr
        }
 
        /* Apply the sync ipc qos override */
-       if (requested.thrp_qos_sync_ipc_override != THREAD_QOS_UNSPECIFIED)
-               next.thep_qos = MAX(requested.thrp_qos_sync_ipc_override, next.thep_qos);
+       assert(requested.thrp_qos_sync_ipc_override == THREAD_QOS_UNSPECIFIED);
 
        /*
         * The QoS relative priority is only applicable when the original programmer's
@@ -1597,55 +1679,6 @@ proc_set_thread_policy(thread_t   thread,
        thread_policy_update_complete_unlocked(thread, &pend_token);
 }
 
-/*
- * KPI for pthread kext to call to set thread base QoS values during a workq wakeup
- * May be called with interrupts disabled and workqueue/waitqueue/kqueue locks held
- *
- * Does NOT do update completion, so the thread MUST be in a safe place WRT
- * IO throttling and SFI.
- *
- * TODO: Can I assert 'it must be in a safe place'?
- */
-kern_return_t
-thread_set_workq_qos(thread_t   thread,
-                     int        qos_tier,
-                     int        relprio) /* relprio is -16 to 0 */
-{
-       assert(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST);
-       assert(relprio  <= 0 && relprio  >= THREAD_QOS_MIN_TIER_IMPORTANCE);
-
-       if (!(qos_tier >= 0 && qos_tier <= THREAD_QOS_LAST))
-               return KERN_FAILURE;
-       if (!(relprio  <= 0 && relprio  >= THREAD_QOS_MIN_TIER_IMPORTANCE))
-               return KERN_FAILURE;
-
-       if (qos_tier == THREAD_QOS_UNSPECIFIED) {
-               assert(relprio == 0);
-               if (relprio != 0)
-                       return KERN_FAILURE;
-       }
-
-       assert(thread->static_param);
-       if (!thread->static_param) {
-               return KERN_FAILURE;
-       }
-
-       /* Concern: this doesn't hold the mutex... */
-       //if (!thread->active)
-       //      return KERN_TERMINATED;
-
-       struct task_pend_token pend_token = {};
-
-       proc_set_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_AND_RELPRIO, qos_tier, -relprio, &pend_token);
-
-       assert(pend_token.tpt_update_sockets == 0);
-       /* we don't need to update throttle or sfi because pthread kext promises the thread is in a safe place */
-       /* TODO: Do we need to update SFI to ensure it gets tagged with the AST? */
-
-       return KERN_SUCCESS;
-}
-
-
 /*
  * Do the things that can't be done while holding a thread mutex.
  * These are set up to call back into thread policy to get the latest value,
@@ -1804,6 +1837,11 @@ thread_set_requested_policy_spinlocked(thread_t     thread,
                        DTRACE_BOOST3(qos_set, uint64_t, thread->thread_id, int, requested.thrp_qos, int, requested.thrp_qos_relprio);
                        break;
 
+               case TASK_POLICY_QOS_WORKQ_OVERRIDE:
+                       assert(category == TASK_POLICY_ATTRIBUTE);
+                       requested.thrp_qos_workq_override = value;
+                       break;
+
                case TASK_POLICY_QOS_PROMOTE:
                        assert(category == TASK_POLICY_ATTRIBUTE);
                        requested.thrp_qos_promote = value;
@@ -1814,11 +1852,6 @@ thread_set_requested_policy_spinlocked(thread_t     thread,
                        requested.thrp_qos_ipc_override = value;
                        break;
 
-               case TASK_POLICY_QOS_SYNC_IPC_OVERRIDE:
-                       assert(category == TASK_POLICY_ATTRIBUTE);
-                       requested.thrp_qos_sync_ipc_override = value;
-                       break;
-
                case TASK_POLICY_TERMINATED:
                        assert(category == TASK_POLICY_ATTRIBUTE);
                        requested.thrp_terminated = value;
@@ -1923,6 +1956,10 @@ thread_get_requested_policy_spinlocked(thread_t thread,
                        assert(category == TASK_POLICY_ATTRIBUTE);
                        value = requested.thrp_through_qos;
                        break;
+               case TASK_POLICY_QOS_WORKQ_OVERRIDE:
+                       assert(category == TASK_POLICY_ATTRIBUTE);
+                       value = requested.thrp_qos_workq_override;
+                       break;
                case TASK_POLICY_QOS_AND_RELPRIO:
                        assert(category == TASK_POLICY_ATTRIBUTE);
                        assert(value2 != NULL);
@@ -2218,11 +2255,6 @@ static void canonicalize_resource_and_type(user_addr_t *resource, int *resource_
                *resource_type = THREAD_QOS_OVERRIDE_TYPE_UNKNOWN;
        } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE) {
                /* no transform */
-       } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH) {
-               /* Map all dispatch overrides to a single one, to avoid memory overhead */
-               if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
-                       *resource = USER_ADDR_NULL;
-               }
        } else if (qos_override_mode == QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_SINGLE_MUTEX_OVERRIDE) {
                /* Map all mutex overrides to a single one, to avoid memory overhead */
                if (*resource_type == THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX) {
@@ -2314,11 +2346,7 @@ calculate_requested_qos_override(thread_t thread)
 
        override = thread->overrides;
        while (override) {
-               if (qos_override_mode != QOS_OVERRIDE_MODE_FINE_GRAINED_OVERRIDE_BUT_IGNORE_DISPATCH ||
-                       override->override_resource_type != THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE) {
-                       qos_override = MAX(qos_override, override->override_qos);
-               }
-
+               qos_override = MAX(qos_override, override->override_qos);
                override = override->override_next;
        }
 
@@ -2329,19 +2357,13 @@ calculate_requested_qos_override(thread_t thread)
  * Returns:
  * - 0 on success
  * - EINVAL if some invalid input was passed
- * - EFAULT if user_lock_addr != NULL and needs to be faulted (userland has to
- *   fault and retry)
- * - ESTALE if user_lock_addr != NULL &&
- *   ulock_owner_value_to_port_name(*user_lock_addr) != user_lock_owner
  */
 static int
 proc_thread_qos_add_override_internal(thread_t         thread,
                                       int              override_qos,
                                       boolean_t        first_override_for_resource,
                                       user_addr_t      resource,
-                                      int              resource_type,
-                                      user_addr_t      user_lock_addr,
-                                      mach_port_name_t user_lock_owner)
+                                      int              resource_type)
 {
        struct task_pend_token pend_token = {};
        int rc = 0;
@@ -2373,26 +2395,6 @@ proc_thread_qos_add_override_internal(thread_t         thread,
                thread_mtx_lock(thread);
                override = find_qos_override(thread, resource, resource_type);
        }
-       if (user_lock_addr) {
-               uint64_t val;
-               /* Workaround lack of explicit support for 'no-fault copyin'
-                * <rdar://problem/24999882>, as disabling preemption prevents paging in
-                */
-               disable_preemption();
-               rc = copyin_word(user_lock_addr, &val, sizeof(user_lock_owner));
-               enable_preemption();
-               if (rc == 0 && ulock_owner_value_to_port_name((uint32_t)val) != user_lock_owner) {
-                       rc = ESTALE;
-               }
-               if (rc) {
-                       prev_qos_override = proc_get_thread_policy_locked(thread,
-                                       TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL);
-                       new_qos_override = prev_qos_override;
-                       new_effective_qos = proc_get_effective_thread_policy(thread, TASK_POLICY_QOS);
-                       thread_mtx_unlock(thread);
-                       goto out;
-               }
-       }
        if (first_override_for_resource && override) {
                /* Someone else already allocated while the thread lock was dropped */
                override->override_contended_resource_count++;
@@ -2435,7 +2437,6 @@ proc_thread_qos_add_override_internal(thread_t         thread,
 
        thread_policy_update_complete_unlocked(thread, &pend_token);
 
-out:
        if (override_new) {
                zfree(thread_qos_override_zone, override_new);
        }
@@ -2450,20 +2451,6 @@ out:
 }
 
 int
-proc_thread_qos_add_override_check_owner(thread_t thread,
-                                         int override_qos,
-                                         boolean_t first_override_for_resource,
-                                         user_addr_t resource,
-                                         int resource_type,
-                                         user_addr_t user_lock_addr,
-                                         mach_port_name_t user_lock_owner)
-{
-       return proc_thread_qos_add_override_internal(thread, override_qos,
-                       first_override_for_resource, resource, resource_type,
-                       user_lock_addr, user_lock_owner);
-}
-
-boolean_t
 proc_thread_qos_add_override(task_t           task,
                              thread_t         thread,
                              uint64_t         tid,
@@ -2482,33 +2469,31 @@ proc_thread_qos_add_override(task_t           task,
                if (thread == THREAD_NULL) {
                        KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_ADD_OVERRIDE)) | DBG_FUNC_NONE,
                                                                  tid, 0, 0xdead, 0, 0);
-                       return FALSE;
+                       return ESRCH;
                }
                has_thread_reference = TRUE;
        } else {
                assert(thread->task == task);
        }
        rc = proc_thread_qos_add_override_internal(thread, override_qos,
-                       first_override_for_resource, resource, resource_type, 0, 0);
+                       first_override_for_resource, resource, resource_type);
        if (has_thread_reference) {
                thread_deallocate(thread);
        }
 
-       return rc == 0;
+       return rc;
 }
 
-static int
+static void
 proc_thread_qos_remove_override_internal(thread_t       thread,
                                          user_addr_t    resource,
                                          int            resource_type,
-                                         boolean_t      reset,
-                                         boolean_t      squash)
+                                         boolean_t      reset)
 {
        struct task_pend_token pend_token = {};
 
        struct thread_qos_override *deferred_free_override_list = NULL;
-       int new_qos_override, prev_qos_override, new_effective_qos, prev_qos;
-       int new_qos = THREAD_QOS_UNSPECIFIED;
+       int new_qos_override, prev_qos_override, new_effective_qos;
 
        thread_mtx_lock(thread);
 
@@ -2536,24 +2521,6 @@ proc_thread_qos_remove_override_internal(thread_t       thread,
         */
        prev_qos_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, NULL);
 
-       if (squash) {
-               int prev_ipc_override;
-               int prev_override;
-
-               /*
-                * Remove the specified overrides, and set the current override as the new base QoS.
-                * Return the new QoS value.
-                */
-               prev_ipc_override = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL);
-               prev_override = MAX(prev_qos_override, prev_ipc_override);
-
-               prev_qos = thread_get_requested_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, NULL);
-
-               new_qos = MAX(prev_qos, prev_override);
-               if (new_qos != prev_qos)
-                       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS, new_qos, 0, &pend_token);
-       }
-
        if (new_qos_override != prev_qos_override)
                proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_OVERRIDE, new_qos_override, 0, &pend_token);
 
@@ -2577,12 +2544,10 @@ proc_thread_qos_remove_override_internal(thread_t       thread,
                      int, new_qos_override, int, new_effective_qos);
 
        KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_END,
-                             thread_tid(thread), squash, 0, 0, 0);
-
-       return new_qos;
+                             thread_tid(thread), 0, 0, 0, 0);
 }
 
-boolean_t
+int
 proc_thread_qos_remove_override(task_t      task,
                                 thread_t    thread,
                                 uint64_t    tid,
@@ -2598,80 +2563,24 @@ proc_thread_qos_remove_override(task_t      task,
                if (thread == THREAD_NULL) {
                        KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE,
                                              tid, 0, 0xdead, 0, 0);
-                       return FALSE;
-               }
-               has_thread_reference = TRUE;
-       } else {
-               assert(task == thread->task);
-       }
-
-       proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE, FALSE);
-
-       if (has_thread_reference)
-               thread_deallocate(thread);
-
-       return TRUE;
-}
-
-boolean_t
-proc_thread_qos_reset_override(task_t       task,
-                               thread_t     thread,
-                               uint64_t     tid,
-                               user_addr_t  resource,
-                               int          resource_type)
-
-{
-       boolean_t has_thread_reference = FALSE;
-
-       if (thread == THREAD_NULL) {
-               thread = task_findtid(task, tid);
-               /* returns referenced thread */
-
-               if (thread == THREAD_NULL) {
-                       KERNEL_DEBUG_CONSTANT((IMPORTANCE_CODE(IMP_USYNCH_QOS_OVERRIDE, IMP_USYNCH_REMOVE_OVERRIDE)) | DBG_FUNC_NONE,
-                                             tid, 0, 0xdead, 0, 0);
-                       return FALSE;
+                       return ESRCH;
                }
                has_thread_reference = TRUE;
        } else {
                assert(task == thread->task);
        }
 
-       proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, FALSE);
+       proc_thread_qos_remove_override_internal(thread, resource, resource_type, FALSE);
 
        if (has_thread_reference)
                thread_deallocate(thread);
 
-       return TRUE;
-}
-
-/*
- * Clears the requested overrides, and replaces the current QoS with the max
- * of the current QoS and the current override, then returns the new QoS.
- *
- * This is useful in order to reset overrides before parking a workqueue thread,
- * but avoid dropping priority and getting preempted right before parking.
- *
- * Called without any locks held.
- */
-int
-proc_thread_qos_squash_override(thread_t thread, user_addr_t resource, int resource_type)
-{
-       return proc_thread_qos_remove_override_internal(thread, resource, resource_type, TRUE, TRUE);
+       return 0;
 }
 
 /* Deallocate before thread termination */
 void proc_thread_qos_deallocate(thread_t thread)
 {
-       /*
-        * There are no more references to this thread,
-        * therefore this thread must not own any more locks,
-        * therefore there must not be any more user promotions.
-        */
-       assert(thread->user_promotions == 0);
-       assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED);
-       assert(thread->user_promotion_basepri == 0);
-
        /* This thread must have no more IPC overrides. */
        assert(thread->ipc_overrides == 0);
        assert(thread->requested_policy.thrp_qos_ipc_override == THREAD_QOS_UNSPECIFIED);
@@ -2746,148 +2655,77 @@ task_get_default_manager_qos(task_t task)
        return primordial_qos;
 }
 
-
 /*
- * Promote thread with the user level properties of 'promoter'
- * Mutexes may be held, but it's OK to take the throttle lock
+ * Check if the user promotion on thread has changed
+ * and apply it.
  *
- * if 'new_promotion' is TRUE, this is a new promotion.
- * if FALSE, we are updating an existing promotion.
+ * thread locked on entry, might drop the thread lock
+ * and reacquire it.
  */
-static void
-thread_user_promotion_promote(thread_t  thread,
-                              thread_t  promoter,
-                              struct promote_token* promote_token,
-                              boolean_t new_promotion)
+boolean_t
+thread_recompute_user_promotion_locked(thread_t thread)
 {
+       boolean_t needs_update = FALSE;
        struct task_pend_token pend_token = {};
+       int user_promotion_basepri = MIN(thread_get_inheritor_turnstile_priority(thread), MAXPRI_USER);
+       int old_base_pri = thread->base_pri;
+       thread_qos_t qos_promotion;
 
-       uint32_t promoter_base_pri = 0, promoter_qos = THREAD_QOS_UNSPECIFIED;
-
-       spl_t s = splsched();
-       thread_lock(promoter);
-
-       /*
-        * We capture the 'promotion qos' here, which is captured
-        * before task-level clamping.
-        *
-        * This means that if the process gets unclamped while a promotion,
-        * is in effect, the owning thread ends up with the correct QoS.
-        *
-        * This does NOT work correctly across processes, as the correct QoS
-        * in one is not necessarily the correct QoS in another.
-        * When we add support for multi-process ulock boosting, we need to
-        * do something more complex.
-        */
-       promoter_qos = promoter->effective_policy.thep_qos_promote;
-
-       /* TODO: extract 'effective unclamped base pri' instead */
-       promoter_base_pri = promoter->base_pri;
-
-       thread_unlock(promoter);
-       splx(s);
-
-       /* clamp out realtime to max user pri */
-       promoter_base_pri = MIN(promoter_base_pri, MAXPRI_USER);
-
-       /* add in the saved promotion token */
-       assert(promote_token->pt_basepri <= MAXPRI_USER);
-
-       promoter_base_pri = MAX(promoter_base_pri, promote_token->pt_basepri);
-       promoter_qos = MAX(promoter_qos, promote_token->pt_qos);
-
-       /* save the max for later */
-       promote_token->pt_basepri = promoter_base_pri;
-       promote_token->pt_qos = promoter_qos;
-
-       s = splsched();
-       thread_lock(thread);
-
-       if (new_promotion) {
-               if (thread->user_promotions == 0) {
-                       assert(thread->requested_policy.thrp_qos_promote == THREAD_QOS_UNSPECIFIED);
-                       assert(thread->user_promotion_basepri == 0);
-               }
-
-               thread->user_promotions++;
+       /* Check if user promotion has changed */
+       if (thread->user_promotion_basepri == user_promotion_basepri) {
+               return needs_update;
        } else {
-               assert(thread->user_promotions > 0);
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_PRIORITY_OPERATIONS, (THREAD_USER_PROMOTION_CHANGE))) | DBG_FUNC_NONE,
+                       thread_tid(thread),
+                       user_promotion_basepri,
+                       thread->user_promotion_basepri,
+                       0, 0);
        }
 
-       uint32_t thread_qos     = thread->requested_policy.thrp_qos_promote;
-       uint32_t thread_basepri = thread->user_promotion_basepri;
+       /* Update the user promotion base pri */
+       thread->user_promotion_basepri = user_promotion_basepri;
+       pend_token.tpt_force_recompute_pri = 1;
 
-       uint32_t new_qos     = MAX(thread_qos, promoter_qos);
-       uint32_t new_basepri = MAX(thread_basepri, promoter_base_pri);
-
-       /* TODO: Fast path the 'new is lower than effective' case to avoid full reevaluation */
-       if (thread_qos != new_qos || thread_basepri != new_basepri) {
-
-               thread->user_promotion_basepri = new_basepri;
+       if (user_promotion_basepri <= MAXPRI_THROTTLE) {
+               qos_promotion = THREAD_QOS_UNSPECIFIED;
+       } else {
+               qos_promotion = thread_user_promotion_qos_for_pri(user_promotion_basepri);
+       }
 
-               pend_token.tpt_force_recompute_pri = 1;
+       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_PROMOTE, qos_promotion, 0, &pend_token);
 
-               proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                                                 TASK_POLICY_QOS_PROMOTE, new_qos,
-                                                 0, &pend_token);
+       if (thread_get_waiting_turnstile(thread) &&
+           thread->base_pri != old_base_pri) {
+               needs_update = TRUE;
        }
 
        thread_unlock(thread);
-       splx(s);
 
        thread_policy_update_complete_unlocked(thread, &pend_token);
-}
 
-/* Add a user promotion to thread */
-void
-thread_user_promotion_add(thread_t thread,
-                          thread_t promoter,
-                          struct promote_token* promote_token)
-{
-       thread_user_promotion_promote(thread, promoter, promote_token, TRUE);
-}
+       thread_lock(thread);
 
-/* Update an existing user promotion on thread */
-void
-thread_user_promotion_update(thread_t thread,
-                             thread_t promoter,
-                             struct promote_token* promote_token)
-{
-       thread_user_promotion_promote(thread, promoter, promote_token, FALSE);
+       return needs_update;
 }
 
 /*
- * Drop a user promotion on thread
- * Mutexes may be held, but it's OK to take the throttle lock
+ * Convert the thread's user promotion base priority to a QoS, for threads
+ * managed in the QoS world.  Priorities above the UI QoS map to UI.
  */
-void
-thread_user_promotion_drop(thread_t thread)
+thread_qos_t
+thread_user_promotion_qos_for_pri(int priority)
 {
-       struct task_pend_token pend_token = {};
-
-       spl_t s = splsched();
-       thread_lock(thread);
-
-       assert(thread->user_promotions > 0);
-
-       if (--thread->user_promotions == 0) {
-               thread->requested_policy.thrp_qos_promote = THREAD_QOS_UNSPECIFIED;
-               thread->user_promotion_basepri = 0;
-
-               pend_token.tpt_force_recompute_pri = 1;
-
-               proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                                                 TASK_POLICY_QOS_PROMOTE, THREAD_QOS_UNSPECIFIED,
-                                                 0, &pend_token);
+       int qos;
+       for (qos = THREAD_QOS_USER_INTERACTIVE; qos > THREAD_QOS_MAINTENANCE; qos--) {
+               if (thread_qos_policy_params.qos_pri[qos] <= priority) {
+                       return qos;
+               }
        }
-
-       thread_unlock(thread);
-       splx(s);
-
-       thread_policy_update_complete_unlocked(thread, &pend_token);
+       return THREAD_QOS_MAINTENANCE;
 }
 
-
 /*
  * Set the thread's QoS IPC override
  * Owned by the IPC subsystem
@@ -2914,6 +2752,7 @@ thread_ipc_override(thread_t    thread,
 
        assert(qos_override > THREAD_QOS_UNSPECIFIED);
        assert(qos_override < THREAD_QOS_LAST);
+
        if (is_new_override) {
                if (thread->ipc_overrides++ == 0) {
                        /* This add is the first override for this thread */
@@ -2948,10 +2787,6 @@ thread_ipc_override(thread_t    thread,
        thread_unlock(thread);
        splx(s);
 
-       /*
-        * this is only safe after rethrottle_thread supports
-        * being called from spinlock context
-        */
        thread_policy_update_complete_unlocked(thread, &pend_token);
 }
 
@@ -2993,88 +2828,20 @@ thread_drop_ipc_override(thread_t thread)
        thread_unlock(thread);
        splx(s);
 
-       /*
-        * this is only safe after rethrottle_thread supports
-        * being called from spinlock context
-        */
        thread_policy_update_complete_unlocked(thread, &pend_token);
 }
 
-void
-thread_add_sync_ipc_override(thread_t  thread)
+/* Get current requested qos / relpri, may be called from spinlock context */
+thread_qos_t
+thread_get_requested_qos(thread_t thread, int *relpri)
 {
-       struct task_pend_token pend_token = {};
-
-       spl_t s = splsched();
-       thread_lock(thread);
-
-       uint32_t old_override __unused = thread->requested_policy.thrp_qos_sync_ipc_override;
-
-       if (thread->sync_ipc_overrides++ == 0) {
-               /* This add is the first override for this thread */
-               assert(old_override == THREAD_QOS_UNSPECIFIED);
-       } else {
-               /* There are already other overrides in effect for this thread */
-               assert(old_override == THREAD_QOS_USER_INTERACTIVE);
-               thread_unlock(thread);
-               splx(s);
-               return;
-       }
-
-       uint32_t new_override = THREAD_QOS_USER_INTERACTIVE;
-
-       proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                                         TASK_POLICY_QOS_SYNC_IPC_OVERRIDE,
-                                         new_override, 0, &pend_token);
-
-       assert(pend_token.tpt_update_sockets == 0);
+       int relprio_value = 0;
+       thread_qos_t qos;
 
-       thread_unlock(thread);
-       splx(s);
-
-       /*
-        * this is only safe after rethrottle_thread supports
-        * being called from spinlock context
-        */
-       thread_policy_update_complete_unlocked(thread, &pend_token);
-}
-
-void
-thread_drop_sync_ipc_override(thread_t thread)
-{
-       struct task_pend_token pend_token = {};
-
-       spl_t s = splsched();
-       thread_lock(thread);
-
-       assert(thread->sync_ipc_overrides > 0);
-
-       if (--thread->sync_ipc_overrides == 0) {
-               /*
-                * There are no more overrides for this thread, so we should
-                * clear out the saturated override value
-                */
-
-               proc_set_thread_policy_spinlocked(thread, TASK_POLICY_ATTRIBUTE,
-                                                 TASK_POLICY_QOS_SYNC_IPC_OVERRIDE, THREAD_QOS_UNSPECIFIED,
-                                                 0, &pend_token);
-       }
-
-       thread_unlock(thread);
-       splx(s);
-
-       /*
-        * this is only safe after rethrottle_thread supports
-        * being called from spinlock context
-        */
-       thread_policy_update_complete_unlocked(thread, &pend_token);
-}
-
-/* Get current IPC override, may be called from spinlock context */
-uint32_t
-thread_get_ipc_override(thread_t thread)
-{
-       return proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE, TASK_POLICY_QOS_IPC_OVERRIDE, NULL);
+       qos = proc_get_thread_policy_locked(thread, TASK_POLICY_ATTRIBUTE,
+                       TASK_POLICY_QOS_AND_RELPRIO, &relprio_value);
+       if (relpri) *relpri = -relprio_value;
+       return qos;
 }
 
 /*
@@ -3082,27 +2849,16 @@ thread_get_ipc_override(thread_t thread)
  * since exec could block other threads calling
  * proc_find on the proc. This boost must be removed
  * via call to thread_clear_exec_promotion.
+ *
+ * This should be replaced with a generic 'priority inheriting gate' mechanism (24194397)
  */
 void
 thread_set_exec_promotion(thread_t thread)
 {
-       spl_t s;
-
-       s = splsched();
+       spl_t s = splsched();
        thread_lock(thread);
 
-       assert((thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) == 0);
-
-       if (thread->sched_pri < EXEC_BOOST_PRIORITY ||
-           !(thread->sched_flags & TH_SFLAG_EXEC_PROMOTED)) {
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_PROMOTE) | DBG_FUNC_NONE,
-                                     (uintptr_t)thread_tid(thread),
-                                     thread->sched_pri, thread->base_pri,
-                                     EXEC_BOOST_PRIORITY, 0);
-               thread->sched_flags |= TH_SFLAG_EXEC_PROMOTED;
-               if (thread->sched_pri < EXEC_BOOST_PRIORITY)
-                       set_sched_pri(thread, EXEC_BOOST_PRIORITY);
-       }
+       sched_thread_promote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0);
 
        thread_unlock(thread);
        splx(s);
@@ -3115,34 +2871,12 @@ thread_set_exec_promotion(thread_t thread)
 void
 thread_clear_exec_promotion(thread_t thread)
 {
-       spl_t s;
-
-       s = splsched();
+       spl_t s = splsched();
        thread_lock(thread);
-       assert(thread->sched_flags & TH_SFLAG_EXEC_PROMOTED);
-
-       if (thread->sched_flags & TH_SFLAG_EXEC_PROMOTED) {
-               thread->sched_flags &= ~TH_SFLAG_EXEC_PROMOTED;
-
-               if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) {
-                       /* it still has other promotions (mutex/rw_lock) */
-               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->sched_pri,
-                                             thread->base_pri,
-                                             DEPRESSPRI, 0);
-                       set_sched_pri(thread, DEPRESSPRI);
-               } else {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_EXEC_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->sched_pri,
-                                             thread->base_pri,
-                                             thread->base_pri, 0);
-                       thread_recompute_sched_pri(thread, FALSE);
-               }
-       }
+
+       sched_thread_unpromote_reason(thread, TH_SFLAG_EXEC_PROMOTED, 0);
 
        thread_unlock(thread);
        splx(s);
 }
+
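
With the old promote/drop pair gone, thread_recompute_user_promotion_locked() derives the whole promotion from the thread's turnstile inheritor priority: clamp it to MAXPRI_USER, drop the QoS promotion when it is at or below MAXPRI_THROTTLE, otherwise map it to a QoS with thread_user_promotion_qos_for_pri(), and tell the caller whether the change still has to be pushed to the turnstile the thread is waiting on. The sketch below models that decision in plain C; the qos_pri values are invented stand-ins for thread_qos_policy_params, and the real function additionally checks that the base priority actually moved and that the thread is waiting on a turnstile before asking for propagation.

    /* Self-contained model of the promotion recompute, under assumed
     * priority bands; nothing here is the scheduler's real table. */
    #include <stdbool.h>
    #include <stdio.h>

    #define MAXPRI_USER      63
    #define MAXPRI_THROTTLE   4

    enum { QOS_UNSPECIFIED, QOS_MAINTENANCE, QOS_BACKGROUND, QOS_UTILITY,
           QOS_DEFAULT, QOS_USER_INITIATED, QOS_USER_INTERACTIVE, QOS_LAST };

    /* Hypothetical stand-in for thread_qos_policy_params.qos_pri[] */
    static const int qos_pri[QOS_LAST] = { 0, 1, 4, 20, 31, 37, 47 };

    /* Mirrors the loop in thread_user_promotion_qos_for_pri(). */
    static int qos_for_pri(int priority)
    {
        for (int qos = QOS_USER_INTERACTIVE; qos > QOS_MAINTENANCE; qos--) {
            if (qos_pri[qos] <= priority)
                return qos;
        }
        return QOS_MAINTENANCE;
    }

    /* Returns true when the promotion changed and should be propagated. */
    static bool recompute_user_promotion(int *promotion_basepri, int inheritor_pri)
    {
        int new_basepri = inheritor_pri > MAXPRI_USER ? MAXPRI_USER : inheritor_pri;

        if (*promotion_basepri == new_basepri)
            return false;                         /* nothing changed */

        *promotion_basepri = new_basepri;
        int qos = (new_basepri <= MAXPRI_THROTTLE)
            ? QOS_UNSPECIFIED : qos_for_pri(new_basepri);
        printf("promotion basepri=%d -> qos=%d\n", new_basepri, qos);
        return true;
    }

    int main(void)
    {
        int basepri = 0;
        recompute_user_promotion(&basepri, 47);   /* boosted by a UI-priority inheritor */
        recompute_user_promotion(&basepri, 47);   /* unchanged: no further propagation */
        return 0;
    }
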
index f101eb17142ac9218c0382b51c2d127d2789cc4b..8ccba9e2cd6429be742d68bfd7ce92c5f5360abf 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2000-2007 Apple Inc. All rights reserved.
+ * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  * All Rights Reserved.
- * 
+ *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
- * 
+ *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
  * Carnegie Mellon requests users of this software to return to
- * 
+ *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
- * 
+ *
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
-/* 
- */
 
 #include <mach/kern_return.h>
 #include <mach/port.h>
@@ -70,110 +68,81 @@ int precise_user_kernel_time = 0;
 int precise_user_kernel_time = 1;
 #endif
 
-/*
- *     timer_init initializes a timer.
- */
 void
-timer_init(
-       timer_t         timer)
+timer_init(timer_t timer)
 {
-       timer->tstamp = 0;
-#if    defined(__LP64__)
-       timer->all_bits = 0;
-#else
-       timer->low_bits = 0;
-       timer->high_bits = 0;
-       timer->high_bits_check = 0;
-#endif /* defined(__LP64__) */
+       memset(timer, 0, sizeof(*timer));
 }
 
-/*
- *     Calculate the difference between a timer
- *     and saved value, and update the saved value.
- */
 uint64_t
-timer_delta(
-       timer_t         timer,
-       uint64_t        *save)
+timer_delta(timer_t timer, uint64_t *prev_in_cur_out)
 {
-       uint64_t        new, old = *save;
-
-       *save = new = timer_grab(timer);
-
+       uint64_t old = *prev_in_cur_out;
+       uint64_t new = *prev_in_cur_out = timer_grab(timer);
        return (new - old);
 }
 
-void
-timer_advance(
-       timer_t         timer,
-       uint64_t        delta)
+static void
+timer_advance(timer_t timer, uint64_t delta)
 {
-#if    defined(__LP64__)
+#if defined(__LP64__)
        timer->all_bits += delta;
-#else
-       uint64_t        low;
-
-       low = delta + timer->low_bits;
-       if (low >> 32)
-               timer_update(timer, (uint32_t)(timer->high_bits + (low >> 32)), (uint32_t)low);
-       else
+#else /* defined(__LP64__) */
+       extern void timer_advance_internal_32(timer_t timer, uint32_t high,
+                       uint32_t low);
+       uint64_t low = delta + timer->low_bits;
+       if (low >> 32) {
+               timer_advance_internal_32(timer,
+                               (uint32_t)(timer->high_bits + (low >> 32)), (uint32_t)low);
+       } else {
                timer->low_bits = (uint32_t)low;
-#endif         /* defined(__LP64__) */
+       }
+#endif /* defined(__LP64__) */
 }
 
 void
-timer_start(
-       timer_t         timer,
-       uint64_t        tstamp)
+timer_start(timer_t timer, uint64_t tstamp)
 {
        timer->tstamp = tstamp;
 }
 
 void
-timer_stop(
-       timer_t         timer,
-       uint64_t        tstamp)
+timer_stop(timer_t timer, uint64_t tstamp)
 {
        timer_advance(timer, tstamp - timer->tstamp);
 }
 
-/*
- *     Update the timer and start a new one.
- */
 void
-timer_switch(
-       timer_t                 timer,
-       uint64_t                tstamp,
-       timer_t                 new_timer)
+timer_update(timer_t timer, uint64_t tstamp)
+{
+       timer_advance(timer, tstamp - timer->tstamp);
+       timer->tstamp = tstamp;
+}
+
+void
+timer_switch(timer_t timer, uint64_t tstamp, timer_t new_timer)
 {
        timer_advance(timer, tstamp - timer->tstamp);
        new_timer->tstamp = tstamp;
 }
 
 /*
- *     Update the current thread timer and
- *     start the new timer.  Requires a current
- *     and new timer.
+ * Update the current processor's thread timer with `tstamp` and switch the
+ * processor's thread timer to `new_timer`.
  *
- *     Called with interrupts disabled.
+ * Called with interrupts disabled.
  */
 void
-thread_timer_event(
-       uint64_t                tstamp,
-       timer_t                 new_timer)
+processor_timer_switch_thread(uint64_t tstamp, timer_t new_timer)
 {
-       processor_t             processor = current_processor();
-       timer_t                 timer;
+       processor_t processor = current_processor();
+       timer_t timer;
 
-       /*
-        *      Update current timer.
-        */
+       /* Update current timer. */
        timer = PROCESSOR_DATA(processor, thread_timer);
        timer_advance(timer, tstamp - timer->tstamp);
 
-       /*
-        *      Start new timer.
-        */
+       /* Start new timer. */
        PROCESSOR_DATA(processor, thread_timer) = new_timer;
        new_timer->tstamp = tstamp;
 }
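
The rewritten timer_delta() makes its contract explicit in the parameter name: the caller's saved value comes in as the previous reading, leaves holding the current one, and the return value is the difference between the two. A small userspace model of that contract (the model_* names are invented; timer_grab() is reduced to a plain read):

    #include <stdint.h>
    #include <stdio.h>

    struct model_timer { uint64_t all_bits; };

    static uint64_t model_timer_grab(struct model_timer *t)
    {
        return t->all_bits;
    }

    /* Same prev-in/cur-out shape as timer_delta(). */
    static uint64_t model_timer_delta(struct model_timer *t, uint64_t *prev_in_cur_out)
    {
        uint64_t old = *prev_in_cur_out;
        uint64_t new = *prev_in_cur_out = model_timer_grab(t);
        return new - old;
    }

    int main(void)
    {
        struct model_timer user_timer = { 0 };
        uint64_t saved = 0;

        user_timer.all_bits += 1500;    /* time accrued, as timer_advance() would */
        printf("delta = %llu\n", (unsigned long long)model_timer_delta(&user_timer, &saved));

        user_timer.all_bits += 300;
        printf("delta = %llu\n", (unsigned long long)model_timer_delta(&user_timer, &saved));
        return 0;
    }
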
index a353c6c29b7f336fffb75706c9b615840bcf4982..648a47c79be4ff8ec5a77ff42338fcf80879316c 100644 (file)
@@ -56,7 +56,7 @@
 /*
  */
 
-#ifndef        _KERN_TIMER_H_
+#ifndef _KERN_TIMER_H_
 #define _KERN_TIMER_H_
 
 #include <kern/kern_types.h>
@@ -80,85 +80,78 @@ extern int precise_user_kernel_time;
  * thread-local value (or in kernel debugger context). In the future,
  * we make take into account task-level or thread-level policy.
  */
-#define use_precise_user_kernel_time(thread) ( precise_user_kernel_time ) 
+#define use_precise_user_kernel_time(thread) (precise_user_kernel_time)
 
 /*
- *     Definitions for high resolution timers.  A check
- *     word on the high portion allows atomic updates.
+ * Definitions for high resolution timers.
  */
 
 struct timer {
-       uint64_t        tstamp;
-#if    defined(__LP64__)
-       uint64_t        all_bits;
-#else
-       uint32_t        low_bits;
-       uint32_t        high_bits;
-       uint32_t        high_bits_check;
-#endif
+       uint64_t tstamp;
+#if defined(__LP64__)
+       uint64_t all_bits;
+#else /* defined(__LP64__) */
+       /* A check word on the high portion allows atomic updates. */
+       uint32_t low_bits;
+       uint32_t high_bits;
+       uint32_t high_bits_check;
+#endif /* !defined(__LP64__) */
 };
 
-typedef struct timer   timer_data_t, *timer_t;
+typedef struct timer timer_data_t, *timer_t;
 
 /*
- *     Exported kernel interface to timers
+ * Initialize the `timer`.
  */
+void timer_init(timer_t timer);
 
-/* Start a timer by setting the timestamp */
-extern void            timer_start(
-                                       timer_t         timer,
-                                       uint64_t        tstamp);
-
-/* Stop a timer by updating from the timestamp */
-extern void            timer_stop(
-                                       timer_t         timer,
-                                       uint64_t        tstamp);
-
-/* Update the timer and start a new one */
-extern void            timer_switch(
-                                       timer_t         timer,
-                                       uint64_t        tstamp,
-                                       timer_t         new_timer);
-
-/* Update the thread timer at an event */
-extern void            thread_timer_event(
-                                       uint64_t        tstamp,
-                                       timer_t         new_timer);
-
-/* Initialize a timer */
-extern void            timer_init(
-                                       timer_t         timer);
-
-/* Update a saved timer value and return delta to current value */
-extern uint64_t        timer_delta(
-                                       timer_t         timer,
-                                       uint64_t        *save);
-
-/* Advance a timer by a 64 bit value */
-extern void            timer_advance(
-                                       timer_t         timer,
-                                       uint64_t        delta);
+/*
+ * Start the `timer` at time `tstamp`.
+ */
+void timer_start(timer_t timer, uint64_t tstamp);
+
+/*
+ * Stop the `timer` and update it with time `tstamp`.
+ */
+void timer_stop(timer_t timer, uint64_t tstamp);
+
+/*
+ * Update the `timer` at time `tstamp`, leaving it running.
+ */
+void timer_update(timer_t timer, uint64_t tstamp);
+
+/*
+ * Update the `timer` with time `tstamp` and start `new_timer`.
+ */
+void timer_switch(timer_t timer, uint64_t tstamp, timer_t new_timer);
 
 /*
- *     Exported hardware interface to timers
+ * Update the thread timer at an "event," like a context switch, at time
+ * `tstamp`.  This stops the current timer and starts the `new_timer` running.
+ *
+ * Must be called with interrupts disabled.
  */
+void processor_timer_switch_thread(uint64_t tstamp, timer_t new_timer);
 
-/* Read timer value */
-#if    defined(__LP64__)
-static inline uint64_t timer_grab(
-                                       timer_t         timer)
+/*
+ * Return the difference between the `timer` and a previous value pointed to by
+ * `prev_in_cur_out`.  Store the current value of the timer to
+ * `prev_in_cur_out`.
+ */
+uint64_t timer_delta(timer_t timer, uint64_t *prev_in_cur_out);
+
+/*
+ * Read the accumulated time of `timer`.
+ */
+#if defined(__LP64__)
+static inline
+uint64_t
+timer_grab(timer_t timer)
 {
        return timer->all_bits;
 }
-#else
-extern uint64_t        timer_grab(
-                                       timer_t         timer);
-
-/* Update timer value */
-extern void            timer_update(
-                                       timer_t         timer,
-                                       uint32_t        new_high,
-                                       uint32_t        new_low);
-#endif /* defined(__LP64__) */
-
-#endif /* _KERN_TIMER_H_ */
+#else /* defined(__LP64__) */
+uint64_t timer_grab(timer_t timer);
+#endif /* !defined(__LP64__) */
+
+#endif /* _KERN_TIMER_H_ */
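
On non-LP64 targets the struct keeps a check word on the high half so the 64-bit value can be read without a lock, and timer_grab() stays out of line in machine-dependent code. The sketch below shows one common shape for such a check-word protocol; it is an assumption about the technique rather than a copy of the actual implementation, and real code also has to make the compiler and CPU preserve the store and load order shown.

    #include <stdint.h>

    struct timer32 {
        volatile uint32_t low_bits;
        volatile uint32_t high_bits;
        volatile uint32_t high_bits_check;
    };

    /* Single writer: publish the check word first and the high word last. */
    static void timer32_publish(struct timer32 *t, uint32_t high, uint32_t low)
    {
        t->high_bits_check = high;
        t->low_bits  = low;
        t->high_bits = high;
    }

    /* Lock-free reader: if high_bits and high_bits_check disagree, a writer
     * was mid-update, so the low word may be torn -- retry. */
    static uint64_t timer32_grab(const struct timer32 *t)
    {
        uint32_t high, low;
        do {
            high = t->high_bits;
            low  = t->low_bits;
        } while (high != t->high_bits_check);
        return ((uint64_t)high << 32) | low;
    }

    int main(void)
    {
        struct timer32 t = { 0, 0, 0 };
        timer32_publish(&t, 1, 0x80000000u);
        return timer32_grab(&t) == 0x180000000ull ? 0 : 1;
    }
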
diff --git a/osfmk/kern/trustcache.h b/osfmk/kern/trustcache.h
new file mode 100644 (file)
index 0000000..4fd57d5
--- /dev/null
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_TRUSTCACHE_H_
+#define _KERN_TRUSTCACHE_H_
+
+#include <stdint.h>
+
+#include <kern/cs_blobs.h>
+
+#include <uuid/uuid.h>
+
+/* Version 0 trust caches: No defined sorting order (thus only suitable for small trust caches).
+ * Used for loadable trust caches only, until support for them is phased out. */
+typedef uint8_t trust_cache_hash0[CS_CDHASH_LEN];
+struct trust_cache_module0 {
+    uint32_t version;
+    uuid_t uuid;
+    uint32_t num_hashes;
+    trust_cache_hash0 hashes[];
+} __attribute__((__packed__));
+
+
+/* Version 1 trust caches: Always sorted by cdhash, added hash type and flags field.
+ * Suitable for all trust caches. */
+
+struct trust_cache_entry1 {
+       uint8_t cdhash[CS_CDHASH_LEN];
+       uint8_t hash_type;
+       uint8_t flags;
+} __attribute__((__packed__));
+
+struct trust_cache_module1 {
+    uint32_t version;
+    uuid_t uuid;
+    uint32_t num_entries;
+    struct trust_cache_entry1 entries[];
+} __attribute__((__packed__));
+
+// Trust Cache Entry Flags
+#define CS_TRUST_CACHE_AMFID    0x1                    // valid cdhash for amfid
+
+#define TC_LOOKUP_HASH_TYPE_SHIFT               16
+#define TC_LOOKUP_HASH_TYPE_MASK                0xff0000L
+#define TC_LOOKUP_FLAGS_SHIFT                   8
+#define TC_LOOKUP_FLAGS_MASK                    0xff00L
+#define TC_LOOKUP_RESULT_SHIFT                  0
+#define TC_LOOKUP_RESULT_MASK                   0xffL
+
+#define TC_LOOKUP_FOUND         1
+#define TC_LOOKUP_FALLBACK      2
+
+#ifdef XNU_KERNEL_PRIVATE
+
+// Serialized Trust Caches
+
+/* This is how iBoot delivers them to us. */
+struct serialized_trust_caches {
+       uint32_t num_caches;
+       uint32_t offsets[0];
+} __attribute__((__packed__));
+
+
+// Legacy Static Trust Cache
+
+/* This is the old legacy trust cache baked into the AMFI kext.
+ * We support it for a transitional period, until external trust caches
+ * are fully established and the AMFI trust cache can be removed. */
+
+struct legacy_trust_cache_bucket {
+       uint16_t count;
+       uint16_t offset;
+} __attribute__((__packed__));
+
+#define LEGACY_TRUST_CACHE_ENTRY_LEN (CS_CDHASH_LEN-1)
+#define LEGACY_TRUST_CACHE_BUCKET_COUNT (256)
+
+typedef uint8_t pmap_cs_legacy_stc_entry[CS_CDHASH_LEN-1]; // bucketized with first byte
+
+void trust_cache_init(void);
+
+uint32_t lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN]);
+
+bool lookup_in_trust_cache_module(struct trust_cache_module1 const * const module,
+                                                                 uint8_t const cdhash[CS_CDHASH_LEN],
+                                                                 uint8_t       * const hash_type,
+                                                                 uint8_t       * const flags);
+
+#endif
+
+#endif /* _KERN_TRUSTCACHE_H_ */
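
Version 1 modules are documented as always sorted by cdhash, which is what makes them "suitable for all trust caches": a lookup can binary-search the entry array, and the TC_LOOKUP_* shifts and masks describe how a hit's hash type, flags, and result code pack into a single 32-bit value. Below is a sketch under those assumptions; the real lookup_in_trust_cache_module() and lookup_in_static_trust_cache() may differ in detail, and the entry values in main() are invented.

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define CS_CDHASH_LEN 20      /* length of a code directory hash (cs_blobs.h) */

    struct trust_cache_entry1 {
        uint8_t cdhash[CS_CDHASH_LEN];
        uint8_t hash_type;
        uint8_t flags;
    } __attribute__((__packed__));

    /* Binary search over entries assumed sorted by memcmp() on cdhash. */
    static bool
    module1_lookup(const struct trust_cache_entry1 *entries, uint32_t num_entries,
                   const uint8_t cdhash[CS_CDHASH_LEN], uint8_t *hash_type, uint8_t *flags)
    {
        uint32_t lo = 0, hi = num_entries;

        while (lo < hi) {
            uint32_t mid = lo + (hi - lo) / 2;
            int cmp = memcmp(cdhash, entries[mid].cdhash, CS_CDHASH_LEN);
            if (cmp == 0) {
                *hash_type = entries[mid].hash_type;
                *flags = entries[mid].flags;
                return true;
            }
            if (cmp < 0)
                hi = mid;
            else
                lo = mid + 1;
        }
        return false;
    }

    /* Pack a positive result following the TC_LOOKUP_* shift/mask layout. */
    static uint32_t
    pack_lookup_result(uint8_t hash_type, uint8_t flags)
    {
        return ((uint32_t)hash_type << 16) |   /* TC_LOOKUP_HASH_TYPE_SHIFT */
               ((uint32_t)flags << 8) |        /* TC_LOOKUP_FLAGS_SHIFT */
               1u;                             /* TC_LOOKUP_FOUND */
    }

    int main(void)
    {
        struct trust_cache_entry1 entries[2] = {
            { { 0x01 }, 1, 0x0 },
            { { 0xab }, 1, 0x1 },              /* 0x1: CS_TRUST_CACHE_AMFID */
        };
        uint8_t cd[CS_CDHASH_LEN] = { 0xab };
        uint8_t ht = 0, fl = 0;

        if (!module1_lookup(entries, 2, cd, &ht, &fl))
            return 1;
        return pack_lookup_result(ht, fl) == 0x00010101u ? 0 : 1;
    }
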
diff --git a/osfmk/kern/turnstile.c b/osfmk/kern/turnstile.c
new file mode 100644 (file)
index 0000000..7a11341
--- /dev/null
@@ -0,0 +1,2745 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/turnstile.h>
+#include <kern/cpu_data.h>
+#include <kern/mach_param.h>
+#include <kern/kern_types.h>
+#include <kern/assert.h>
+#include <kern/kalloc.h>
+#include <kern/thread.h>
+#include <kern/clock.h>
+#include <kern/policy_internal.h>
+#include <kern/task.h>
+#include <kern/waitq.h>
+#include <kern/sched_prim.h>
+#include <kern/zalloc.h>
+#include <kern/debug.h>
+#include <machine/machlimits.h>
+#include <machine/atomic.h>
+
+#include <pexpert/pexpert.h>
+#include <libkern/section_keywords.h>
+
+static zone_t turnstiles_zone;
+static int turnstile_max_hop;
+#define MAX_TURNSTILES (thread_max)
+#define TURNSTILES_CHUNK (THREAD_CHUNK)
+
+/* Global table for turnstile promote policy for all type of turnstiles */
+turnstile_promote_policy_t turnstile_promote_policy[TURNSTILE_TOTAL_TYPES] = {
+       [TURNSTILE_NONE]          = TURNSTILE_PROMOTE_NONE,
+       [TURNSTILE_KERNEL_MUTEX]  = TURNSTILE_KERNEL_PROMOTE,
+       [TURNSTILE_ULOCK]         = TURNSTILE_USER_PROMOTE,
+       [TURNSTILE_PTHREAD_MUTEX] = TURNSTILE_USER_PROMOTE,
+       [TURNSTILE_SYNC_IPC]      = TURNSTILE_USER_IPC_PROMOTE,
+       [TURNSTILE_WORKLOOPS]     = TURNSTILE_USER_IPC_PROMOTE,
+       [TURNSTILE_WORKQS]        = TURNSTILE_USER_IPC_PROMOTE,
+       [TURNSTILE_KNOTE]         = TURNSTILE_USER_IPC_PROMOTE,
+};
+
+os_refgrp_decl(static, turnstile_refgrp, "turnstile", NULL);
+
+#if DEVELOPMENT || DEBUG
+static queue_head_t turnstiles_list;
+static lck_spin_t global_turnstile_lock;
+
+lck_grp_t              turnstiles_dev_lock_grp;
+lck_attr_t             turnstiles_dev_lock_attr;
+lck_grp_attr_t         turnstiles_dev_lock_grp_attr;
+
+#define global_turnstiles_lock_init() \
+       lck_spin_init(&global_turnstile_lock, &turnstiles_dev_lock_grp, &turnstiles_dev_lock_attr)
+#define global_turnstiles_lock_destroy() \
+       lck_spin_destroy(&global_turnstile_lock, &turnstiles_dev_lock_grp)
+#define        global_turnstiles_lock() \
+       lck_spin_lock(&global_turnstile_lock)
+#define        global_turnstiles_lock_try() \
+       lck_spin_try_lock(&global_turnstile_lock)
+#define        global_turnstiles_unlock() \
+       lck_spin_unlock(&global_turnstile_lock)
+
+/* Array to store stats for multi-hop boosting */
+static struct turnstile_stats turnstile_boost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {};
+static struct turnstile_stats turnstile_unboost_stats[TURNSTILE_MAX_HOP_DEFAULT] = {};
+uint64_t thread_block_on_turnstile_count;
+uint64_t thread_block_on_regular_waitq_count;
+
+#endif
+
+#ifndef max
+#define max(a,b)        (((a) > (b)) ? (a) : (b))
+#endif /* max */
+
+/* Static function declarations */
+static turnstile_type_t
+turnstile_get_type(struct turnstile *turnstile);
+static uint32_t
+turnstile_get_gencount(struct turnstile *turnstile);
+static void
+turnstile_set_type_and_increment_gencount(struct turnstile *turnstile, turnstile_type_t type);
+static void
+turnstile_init(struct turnstile *turnstile);
+static void
+turnstile_update_inheritor_workq_priority_chain(struct turnstile *in_turnstile, spl_t s);
+static void
+turnstile_update_inheritor_thread_priority_chain(struct turnstile **in_turnstile,
+               thread_t *out_thread, int total_hop, turnstile_stats_update_flags_t tsu_flags);
+static void
+turnstile_update_inheritor_turnstile_priority_chain(struct turnstile **in_out_turnstile,
+               int total_hop, turnstile_stats_update_flags_t tsu_flags);
+static void
+thread_update_waiting_turnstile_priority_chain(thread_t *in_thread,
+               struct turnstile **out_turnstile, int thread_hop, int total_hop,
+               turnstile_stats_update_flags_t tsu_flags);
+static boolean_t
+turnstile_update_turnstile_promotion_locked(struct turnstile *dst_turnstile,
+               struct turnstile *src_turnstile);
+static boolean_t
+turnstile_update_turnstile_promotion(struct turnstile *dst_turnstile,
+               struct turnstile *src_turnstile);
+static boolean_t
+turnstile_need_turnstile_promotion_update(struct turnstile *dst_turnstile,
+               struct turnstile *src_turnstile);
+static boolean_t
+turnstile_add_turnstile_promotion(struct turnstile *dst_turnstile,
+               struct turnstile *src_turnstile);
+static boolean_t
+turnstile_remove_turnstile_promotion(struct turnstile *dst_turnstile,
+               struct turnstile *src_turnstile);
+static boolean_t
+turnstile_update_thread_promotion_locked(struct turnstile *dst_turnstile,
+               thread_t thread);
+static boolean_t
+turnstile_need_thread_promotion_update(struct turnstile *dst_turnstile,
+               thread_t thread);
+static boolean_t
+thread_add_turnstile_promotion(
+               thread_t thread, struct turnstile *turnstile);
+static boolean_t
+thread_remove_turnstile_promotion(
+               thread_t thread, struct turnstile *turnstile);
+static boolean_t
+thread_needs_turnstile_promotion_update(thread_t thread,
+               struct turnstile *turnstile);
+static boolean_t
+thread_update_turnstile_promotion(
+               thread_t thread, struct turnstile *turnstile);
+static boolean_t
+thread_update_turnstile_promotion_locked(
+               thread_t thread, struct turnstile *turnstile);
+static boolean_t
+workq_add_turnstile_promotion(
+               struct workqueue *wq_inheritor, struct turnstile *turnstile);
+static turnstile_stats_update_flags_t
+thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread);
+static turnstile_stats_update_flags_t
+turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile);
+
+#if DEVELOPMENT || DEBUG
+/* Test primitives and interfaces for testing turnstiles */
+struct tstile_test_prim {
+       struct turnstile *ttprim_turnstile;
+       thread_t ttprim_owner;
+       lck_spin_t ttprim_interlock;
+       uint32_t tt_prim_waiters;
+};
+
+struct tstile_test_prim *test_prim_ts_inline;
+struct tstile_test_prim *test_prim_global_htable;
+static void
+tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr);
+#endif
+
+union turnstile_type_gencount {
+       uint32_t value;
+       struct {
+               uint32_t ts_type:(8 * sizeof(turnstile_type_t)),
+                        ts_gencount: (8 *(sizeof(uint32_t) - sizeof(turnstile_type_t)));
+       };
+};
+
+static turnstile_type_t
+turnstile_get_type(struct turnstile *turnstile)
+{
+       union turnstile_type_gencount type_and_gencount;
+
+       type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed);
+       return (turnstile_type_t) type_and_gencount.ts_type;
+}
+
+static uint32_t
+turnstile_get_gencount(struct turnstile *turnstile)
+{
+       union turnstile_type_gencount type_and_gencount;
+
+       type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed);
+       return (uint32_t) type_and_gencount.ts_gencount;
+}
+
+static void
+turnstile_set_type_and_increment_gencount(struct turnstile *turnstile, turnstile_type_t type)
+{
+       union turnstile_type_gencount type_and_gencount;
+
+       /* No need to compare exchange since the store happens under interlock of the primitive */
+       type_and_gencount.value = atomic_load_explicit(&turnstile->ts_type_gencount, memory_order_relaxed);
+       type_and_gencount.ts_type = type;
+       type_and_gencount.ts_gencount++;
+       atomic_store_explicit(&turnstile->ts_type_gencount, type_and_gencount.value, memory_order_relaxed);
+}
+
+
+/* Turnstile hashtable Implementation */
+
+/*
+ * Maximum number of buckets in the turnstile hashtable. This number affects the 
+ * performance of the hashtable since it determines the hash collision 
+ * rate. To experiment with the number of buckets in this hashtable use the 
+ * "ts_htable_buckets" boot-arg.
+ */
+#define TURNSTILE_HTABLE_BUCKETS_DEFAULT   32
+#define TURNSTILE_HTABLE_BUCKETS_MAX       1024
+
+SLIST_HEAD(turnstile_hashlist, turnstile);
+
+struct turnstile_htable_bucket {
+        lck_spin_t                    ts_ht_bucket_lock;
+        struct turnstile_hashlist     ts_ht_bucket_list;
+};
+
+SECURITY_READ_ONLY_LATE(static uint32_t) ts_htable_buckets;
+/* Global hashtable for turnstiles */
+SECURITY_READ_ONLY_LATE(static struct turnstile_htable_bucket *)turnstile_htable;
+
+/* Bucket locks for turnstile hashtable */
+lck_grp_t               turnstiles_htable_lock_grp;
+lck_attr_t              turnstiles_htable_lock_attr;
+lck_grp_attr_t          turnstiles_htable_lock_grp_attr;
+
+#define turnstile_bucket_lock_init(bucket) \
+        lck_spin_init(&bucket->ts_ht_bucket_lock, &turnstiles_htable_lock_grp, &turnstiles_htable_lock_attr)
+#define turnstile_bucket_lock(bucket) \
+        lck_spin_lock(&bucket->ts_ht_bucket_lock)
+#define turnstile_bucket_unlock(bucket) \
+        lck_spin_unlock(&bucket->ts_ht_bucket_lock)
+
+/*
+ * Name: turnstiles_hashtable_init
+ *
+ * Description: Initializes the global turnstile hash table.
+ *
+ * Args:
+ *   None
+ *
+ * Returns:
+ *   None
+ */
+static void
+turnstiles_hashtable_init(void)
+{
+       /* Initialize number of buckets in the hashtable */
+       if (PE_parse_boot_argn("ts_htable_buckets", &ts_htable_buckets, sizeof(ts_htable_buckets)) != TRUE)
+               ts_htable_buckets = TURNSTILE_HTABLE_BUCKETS_DEFAULT;
+       assert(ts_htable_buckets <= TURNSTILE_HTABLE_BUCKETS_MAX);
+       uint32_t ts_htable_size = ts_htable_buckets * sizeof(struct turnstile_htable_bucket);
+       turnstile_htable = (struct turnstile_htable_bucket *)kalloc(ts_htable_size);
+       if (turnstile_htable == NULL)
+               panic("Turnstiles hash table memory allocation failed!");
+       
+       lck_grp_attr_setdefault(&turnstiles_htable_lock_grp_attr);
+       lck_grp_init(&turnstiles_htable_lock_grp, "turnstiles_htable_locks", &turnstiles_htable_lock_grp_attr);
+       lck_attr_setdefault(&turnstiles_htable_lock_attr);
+
+       /* Initialize all the buckets of the hashtable */
+       for (uint32_t i = 0; i < ts_htable_buckets; i++) {
+               struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[i]);
+               turnstile_bucket_lock_init(ts_bucket);
+               SLIST_INIT(&ts_bucket->ts_ht_bucket_list);
+       }
+}
+
+/*
+ * Name: turnstile_freelist_empty
+ *
+ * Description: Checks if the turnstile's freelist is empty
+ *              Should be called with the primitive IL held.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *
+ * Returns:
+ *   true if freelist is empty; false otherwise
+ */
+static inline boolean_t
+turnstile_freelist_empty(
+       struct turnstile *ts)
+{
+       return SLIST_EMPTY(&ts->ts_free_turnstiles);
+}
+
+
+/*
+ * Name: turnstile_freelist_insert
+ *
+ * Description: Inserts the turnstile into the freelist of another turnstile
+ *              Should be called with the primitive IL held.
+ *
+ * Args:
+ *   Arg1: primitive turnstile
+ *   Arg2: turnstile to add to the freelist
+ *
+ * Returns:
+ *   None
+ */
+static void
+turnstile_freelist_insert(
+       struct turnstile *dst_ts,
+       struct turnstile *free_ts)
+{
+       assert(turnstile_get_type(dst_ts) == turnstile_get_type(free_ts));
+       assert(dst_ts->ts_proprietor == free_ts->ts_proprietor);
+       turnstile_state_add(free_ts, TURNSTILE_STATE_FREELIST);
+       SLIST_INSERT_HEAD(&dst_ts->ts_free_turnstiles, free_ts, ts_free_elm);
+}
+
+/*
+ * Name: turnstile_freelist_remove
+ *
+ * Description: Removes a turnstile from the freelist of a turnstile
+ *              Should be called with the primitive IL held.
+ *
+ * Args:
+ *   Arg1: primitive turnstile
+ *
+ * Returns:
+ *   turnstile removed from the freelist
+ */
+static struct turnstile *
+turnstile_freelist_remove(
+       struct turnstile *ts)
+{
+       struct turnstile *ret_turnstile = TURNSTILE_NULL;
+       assert(!SLIST_EMPTY(&ts->ts_free_turnstiles));
+       ret_turnstile = SLIST_FIRST(&ts->ts_free_turnstiles);
+       SLIST_REMOVE_HEAD(&ts->ts_free_turnstiles, ts_free_elm);
+       assert(ret_turnstile != TURNSTILE_NULL);
+       turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_FREELIST);
+       /* Need to initialize the list again, since head and elm are in union */
+       SLIST_INIT(&ret_turnstile->ts_free_turnstiles);
+       return ret_turnstile;
+}
+
+/*
+ * Name: turnstile_hash
+ *
+ * Description: Calculates the hash bucket index for a given proprietor
+ *
+ * Args:
+ *   Arg1: proprietor (key) for hashing
+ *
+ * Returns:
+ *   hash table bucket index for provided proprietor
+ */
+static inline uint32_t 
+turnstile_hash(uintptr_t proprietor)
+{
+       char *key = (char *)&proprietor;
+       uint32_t hash = jenkins_hash(key, sizeof(key));
+       hash &= (ts_htable_buckets - 1);
+       return hash;
+}
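
turnstile_hash() reduces the Jenkins hash with a mask of (ts_htable_buckets - 1), which only behaves like a modulo when the bucket count is a power of two; the default (32) and the maximum (1024) both are, and a "ts_htable_buckets" boot-arg would need to be one as well. A small check of that equivalence:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    static bool is_power_of_two(uint32_t n)
    {
        return n != 0 && (n & (n - 1)) == 0;
    }

    int main(void)
    {
        uint32_t buckets = 32;     /* TURNSTILE_HTABLE_BUCKETS_DEFAULT */
        assert(is_power_of_two(buckets));

        /* For power-of-two bucket counts, masking equals taking a modulo. */
        for (uint32_t h = 0; h < 100000; h++)
            assert((h & (buckets - 1)) == (h % buckets));
        return 0;
    }
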
+
+/*
+ * Name: turnstile_htable_lookup_add
+ *
+ * Description: Lookup the proprietor in the global turnstile hash table.
+ *              If an entry is present, add the new turnstile to the entry's freelist.
+ *              Otherwise add the passed in turnstile for that proprietor.
+ *              The routine assumes that the turnstile->proprietor does not change
+ *              while the turnstile is in the global hash table.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: new turnstile for primitive
+ *
+ * Returns:
+ *   Previous turnstile for proprietor in the hash table
+ */
+static struct turnstile *
+turnstile_htable_lookup_add(
+       uintptr_t proprietor, 
+       struct turnstile *new_turnstile)
+{
+       uint32_t index = turnstile_hash(proprietor);
+       assert(index < ts_htable_buckets);
+       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       spl_t s;
+       
+       s = splsched();
+       turnstile_bucket_lock(ts_bucket);
+       struct turnstile *ts;
+
+       SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) {
+               if (ts->ts_proprietor == proprietor) {
+                       /* 
+                        * Found an entry in the hashtable for this proprietor; add thread turnstile to freelist
+                        * and return this turnstile
+                        */
+                       turnstile_bucket_unlock(ts_bucket);
+                       splx(s);
+                       turnstile_freelist_insert(ts, new_turnstile);
+                       return ts;
+               }
+       }
+
+       /* No entry for this proprietor; add the new turnstile in the hash table */
+       SLIST_INSERT_HEAD(&ts_bucket->ts_ht_bucket_list, new_turnstile, ts_htable_link);
+       turnstile_state_add(new_turnstile, TURNSTILE_STATE_HASHTABLE);
+       turnstile_bucket_unlock(ts_bucket);
+       splx(s);
+       /* Since there was no previous entry for this proprietor, return TURNSTILE_NULL */
+       return TURNSTILE_NULL;
+}
+
+/*
+ * Name: turnstable_htable_lookup_remove
+ *
+ * Description: Lookup the proprietor in the global turnstile hash table.
+ *              For the turnstile in the hash table: if its freelist has turnstiles
+ *              on it, return one of them from the freelist. Otherwise remove the
+ *              turnstile itself from the hashtable and return that.
+ *              The routine assumes that the turnstile->proprietor does not change
+ *              while the turnstile is in the global hash table.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: free turnstile to be returned
+ *
+ * Returns:
+ *   turnstile for this proprietor in the hashtable after the removal
+ */
+static struct turnstile *
+turnstable_htable_lookup_remove(
+       uintptr_t proprietor,
+       struct turnstile **free_turnstile)
+{
+       uint32_t index = turnstile_hash(proprietor);
+       assert(index < ts_htable_buckets);
+       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       struct turnstile *ret_turnstile = TURNSTILE_NULL;
+       spl_t s;
+
+       s = splsched();
+       turnstile_bucket_lock(ts_bucket);
+       struct turnstile *ts, **prev_tslink;
+       /* Find the turnstile for the given proprietor in the hashtable */
+       SLIST_FOREACH_PREVPTR(ts, prev_tslink, &ts_bucket->ts_ht_bucket_list, ts_htable_link) {
+               if (ts->ts_proprietor == proprietor) {
+                       ret_turnstile = ts;
+                       break;
+               }
+       }
+       assert(ret_turnstile != TURNSTILE_NULL);
+
+       /* Check if the turnstile has any turnstiles on its freelist */
+       if (turnstile_freelist_empty(ret_turnstile)) {
+               /* No turnstiles on the freelist; remove the turnstile from the hashtable and mark it freed */
+               *prev_tslink = SLIST_NEXT(ret_turnstile, ts_htable_link);
+               turnstile_state_remove(ret_turnstile, TURNSTILE_STATE_HASHTABLE);
+               turnstile_bucket_unlock(ts_bucket);
+               splx(s);
+               *free_turnstile = ret_turnstile;
+               return TURNSTILE_NULL;
+       } else {
+               /* 
+                * Turnstile has free turnstiles on its list; leave the hashtable unchanged
+                * and return the first turnstile in the freelist as the free turnstile
+                */
+               turnstile_bucket_unlock(ts_bucket);
+               splx(s);
+               *free_turnstile = turnstile_freelist_remove(ret_turnstile);
+               return ret_turnstile;
+       }
+}
+
+/*
+ * Name: turnstile_htable_lookup
+ *
+ * Description: Lookup the proprietor in the global turnstile hash table.
+ *              The routine assumes that the turnstile->proprietor does not change
+ *              while the turnstile is in the global hash table.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *
+ * Returns:
+ *   Turnstile for proprietor in the hash table
+ */
+static struct turnstile *
+turnstile_htable_lookup(
+       uintptr_t proprietor)
+{
+       uint32_t index = turnstile_hash(proprietor);
+       assert(index < ts_htable_buckets);
+       struct turnstile_htable_bucket *ts_bucket = &(turnstile_htable[index]);
+       spl_t s;
+
+       s = splsched();
+       turnstile_bucket_lock(ts_bucket);
+       struct turnstile *ts = TURNSTILE_NULL;
+       struct turnstile *ret_turnstile = TURNSTILE_NULL;
+
+       SLIST_FOREACH(ts, &ts_bucket->ts_ht_bucket_list, ts_htable_link) {
+               if (ts->ts_proprietor == proprietor) {
+                       /* Found an entry in the hashtable for this proprietor */
+                       ret_turnstile = ts;
+                       break;
+               }
+       }
+
+       turnstile_bucket_unlock(ts_bucket);
+       splx(s);
+       return ret_turnstile;
+}
+
+/*
+ * Name: turnstiles_init
+ *
+ * Description: Initialize the turnstile subsystem.
+ *
+ * Args: None.
+ *
+ * Returns: None.
+ */
+void
+turnstiles_init(void)
+{
+       turnstiles_zone = zinit(sizeof(struct turnstile),
+                         MAX_TURNSTILES * sizeof(struct turnstile),
+                         TURNSTILES_CHUNK * sizeof(struct turnstile),
+                         "turnstiles");
+
+       if (!PE_parse_boot_argn("turnstile_max_hop", &turnstile_max_hop, sizeof(turnstile_max_hop))) {
+               turnstile_max_hop = TURNSTILE_MAX_HOP_DEFAULT;
+       }
+       
+       turnstiles_hashtable_init();
+
+#if DEVELOPMENT || DEBUG
+       /* Initialize the global turnstile locks and lock group */
+
+       lck_grp_attr_setdefault(&turnstiles_dev_lock_grp_attr);
+       lck_grp_init(&turnstiles_dev_lock_grp, "turnstiles_dev_lock", &turnstiles_dev_lock_grp_attr);
+       lck_attr_setdefault(&turnstiles_dev_lock_attr);
+       global_turnstiles_lock_init();
+
+       queue_init(&turnstiles_list);
+
+       /* Initialize turnstile test primitive */
+       tstile_test_prim_init(&test_prim_ts_inline);
+       tstile_test_prim_init(&test_prim_global_htable);
+#endif
+       return;
+}
+
+/*
+ * Name: turnstile_alloc
+ *
+ * Description: Allocate a turnstile.
+ *
+ * Args: None.
+ *
+ * Returns:
+ *   turnstile on Success.
+ */
+struct turnstile *
+turnstile_alloc(void)
+{
+       struct turnstile *turnstile = TURNSTILE_NULL;
+
+       turnstile = zalloc(turnstiles_zone);
+       turnstile_init(turnstile);
+
+#if DEVELOPMENT || DEBUG
+       /* Add turnstile to global list */
+       global_turnstiles_lock();
+       queue_enter(&turnstiles_list, turnstile,
+               struct turnstile *, ts_global_elm);
+       global_turnstiles_unlock();
+#endif
+       return turnstile;
+}
+
+/*
+ * Name: turnstile_init
+ *
+ * Description: Initialize the turnstile.
+ *
+ * Args:
+ *   Arg1: turnstile to initialize
+ *
+ * Returns: None.
+ */
+static void
+turnstile_init(struct turnstile *turnstile)
+{
+       kern_return_t kret;
+
+       /* Initialize the waitq */
+       kret = waitq_init(&turnstile->ts_waitq, SYNC_POLICY_DISABLE_IRQ | SYNC_POLICY_REVERSED |
+               SYNC_POLICY_TURNSTILE);
+       assert(kret == KERN_SUCCESS);
+
+       turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL;
+       SLIST_INIT(&turnstile->ts_free_turnstiles);
+       turnstile->ts_type_gencount = 0;
+       turnstile_set_type_and_increment_gencount(turnstile, TURNSTILE_NONE);
+       turnstile_state_init(turnstile, TURNSTILE_STATE_THREAD);
+       os_ref_init_count(&turnstile->ts_refcount, &turnstile_refgrp, 1);
+       turnstile->ts_proprietor = TURNSTILE_PROPRIETOR_NULL;
+       turnstile->ts_priority = MAXPRI_THROTTLE;
+       turnstile->ts_inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
+       turnstile->ts_port_ref = 0;
+       priority_queue_init(&turnstile->ts_inheritor_queue,
+                       PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+
+#if DEVELOPMENT || DEBUG
+       turnstile->ts_thread = current_thread();
+       turnstile->ts_prev_thread = NULL;
+#endif
+}
+
+/*
+ * Name: turnstile_reference
+ *
+ * Description: Take a reference on the turnstile.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_reference(struct turnstile *turnstile)
+{
+       if (turnstile == TURNSTILE_NULL) {
+               return;
+       }
+       os_ref_retain(&turnstile->ts_refcount);
+}
+
+/*
+ * Name: turnstile_deallocate
+ *
+ * Description: Drop a reference on the turnstile.
+ *              Destroy the turnstile if this is the last reference.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_deallocate(struct turnstile *turnstile)
+{
+       if (turnstile == TURNSTILE_NULL) {
+               return;
+       }
+
+       if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) {
+               turnstile_destroy(turnstile);
+       }
+}
+
+/*
+ * Name: turnstile_deallocate_safe
+ *
+ * Description: Drop a reference on the turnstile safely without triggering zfree.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_deallocate_safe(struct turnstile *turnstile)
+{
+       if (turnstile == TURNSTILE_NULL) {
+               return;
+       }
+
+       if (__improbable(os_ref_release(&turnstile->ts_refcount) == 0)) {
+               /* enqueue the turnstile for the thread deallocate daemon to call turnstile_destroy */
+               turnstile_deallocate_enqueue(turnstile);
+       }
+}
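+
+/*
+ * Editor's note (illustrative sketch, not part of this change): the
+ * retain/release pair above follows the usual "last release owns destruction"
+ * pattern; turnstile_deallocate frees inline, while turnstile_deallocate_safe
+ * defers the zfree to the thread deallocate daemon for contexts where freeing
+ * inline is not safe.  A generic analogue with C11 atomics:
+ *
+ *   #include <stdatomic.h>
+ *   #include <stdbool.h>
+ *
+ *   struct object { atomic_uint refcnt; };
+ *
+ *   static void object_retain(struct object *o)
+ *   {
+ *           atomic_fetch_add(&o->refcnt, 1);
+ *   }
+ *
+ *   // True when the caller dropped the last reference and therefore owns
+ *   // destruction, either inline or handed off to a worker.
+ *   static bool object_release(struct object *o)
+ *   {
+ *           return atomic_fetch_sub(&o->refcnt, 1) == 1;
+ *   }
+ */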
+
+/*
+ * Name: turnstile_destroy
+ *
+ * Description: Deallocates the turnstile.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_destroy(struct turnstile *turnstile)
+{
+       /* destroy the waitq */
+       waitq_deinit(&turnstile->ts_waitq);
+
+       assert(turnstile->ts_inheritor == TURNSTILE_INHERITOR_NULL);
+       assert(SLIST_EMPTY(&turnstile->ts_free_turnstiles));
+       assert(turnstile->ts_state & TURNSTILE_STATE_THREAD);
+#if DEVELOPMENT || DEBUG
+       /* Remove turnstile from global list */
+       global_turnstiles_lock();
+       queue_remove(&turnstiles_list, turnstile,
+               struct turnstile *, ts_global_elm);
+       global_turnstiles_unlock();
+#endif
+       zfree(turnstiles_zone, turnstile);
+}
+
+/*
+ * Name: turnstile_prepare
+ *
+ * Description: Transfer the current thread's turnstile to the primitive or to its free turnstile list.
+ *              Function is called holding the interlock (spinlock) of the primitive.
+ *              The turnstile returned by this function is safe to use until the thread calls turnstile_complete.
+ *              When no turnstile is provided explicitly, the calling thread will not have a turnstile attached to
+ *              it until it calls turnstile_complete.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: pointer in primitive struct to store turnstile
+ *   Arg3: turnstile to use instead of taking it from thread.
+ *   Arg4: type of primitive
+ *
+ * Returns:
+ *   turnstile.
+ */
+struct turnstile *
+turnstile_prepare(
+       uintptr_t proprietor,
+       struct turnstile **tstore,
+       struct turnstile *turnstile,
+       turnstile_type_t type)
+{
+       thread_t thread = current_thread();
+       struct turnstile *ret_turnstile = TURNSTILE_NULL;
+       struct turnstile *thread_turnstile = turnstile;
+
+       /* Get the thread's turnstile if no turnstile provided */
+       if (thread_turnstile == TURNSTILE_NULL) {
+               thread_turnstile = thread->turnstile;
+               assert(thread_turnstile != TURNSTILE_NULL);
+               assert(thread->inheritor == NULL);
+               thread->turnstile = TURNSTILE_NULL;
+       }
+
+       /* Prepare the thread turnstile to be the primitive turnstile */
+       SLIST_INIT(&thread_turnstile->ts_free_turnstiles);
+       turnstile_set_type_and_increment_gencount(thread_turnstile, type);
+       thread_turnstile->ts_inheritor = TURNSTILE_INHERITOR_NULL;
+       thread_turnstile->ts_proprietor = proprietor;
+       turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_THREAD);
+
+       thread_turnstile->ts_priority = MAXPRI_THROTTLE;
+#if DEVELOPMENT || DEBUG
+       thread_turnstile->ts_prev_thread = thread_turnstile->ts_thread;
+       thread_turnstile->ts_thread = NULL;
+#endif
+       
+       if (tstore != NULL) {
+               /*
+                * The primitive stores the turnstile:
+                * if the primitive currently does not have a turnstile, install the
+                * thread_turnstile as the primitive turnstile.
+                * Else, add the thread turnstile to the freelist of the primitive turnstile.
+                */
+               ret_turnstile = *tstore;
+               if (*tstore == TURNSTILE_NULL) {
+                       turnstile_state_add(thread_turnstile, TURNSTILE_STATE_PROPRIETOR);
+                       *tstore = thread_turnstile;
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                               (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_PREPARE))) | DBG_FUNC_NONE,
+                               VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile),
+                               VM_KERNEL_UNSLIDE_OR_PERM(proprietor),
+                               turnstile_get_type(thread_turnstile), 0, 0);
+               } else {
+                       turnstile_freelist_insert(ret_turnstile, thread_turnstile);
+               }
+               ret_turnstile = *tstore;
+       } else {
+               /* 
+                * Look up the proprietor in the global turnstile hash table and see if it already has an entry.
+                */
+               ret_turnstile = turnstile_htable_lookup_add(proprietor, thread_turnstile);
+               if (ret_turnstile == NULL) {
+                       ret_turnstile = thread_turnstile;
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                               (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_PREPARE))) | DBG_FUNC_NONE,
+                               VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile),
+                               VM_KERNEL_UNSLIDE_OR_PERM(proprietor),
+                               turnstile_get_type(thread_turnstile), 0, 0);
+               }
+       }
+       
+       return ret_turnstile;
+}
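+
+/*
+ * Editor's note (usage sketch, not part of this change): a typical blocking
+ * slow path of a hypothetical primitive 'struct my_prim' that embeds a
+ * 'struct turnstile *ts' and an interlock.  my_prim_interlock_lock/unlock
+ * are placeholders, and waitq_assert_wait64(), thread_block() and the
+ * flag/type values are assumed to behave as elsewhere in XNU:
+ *
+ *   my_prim_interlock_lock(prim);
+ *   struct turnstile *ts = turnstile_prepare((uintptr_t)prim, &prim->ts,
+ *           TURNSTILE_NULL, TURNSTILE_ULOCK);          // type is illustrative
+ *   turnstile_update_inheritor(ts, owner_thread,
+ *           (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+ *   waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(prim),
+ *           THREAD_UNINT, TIMEOUT_WAIT_FOREVER);
+ *   my_prim_interlock_unlock(prim);
+ *   turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+ *   thread_block(THREAD_CONTINUE_NULL);
+ *
+ * The paired turnstile_complete() call is sketched after that function below.
+ */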
+
+/*
+ * Name: turnstile_complete
+ *
+ * Description: Transfer the primitive's turnstile, or one from its freelist, to the current thread.
+ *              Function is called holding the interlock (spinlock) of the primitive.
+ *              Current thread will have a turnstile attached to it after this call.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: pointer in primitive struct to update turnstile
+ *   Arg3: pointer to store the returned turnstile instead of attaching it to thread
+ *
+ * Returns:
+ *   None.
+ */
+void
+turnstile_complete(
+       uintptr_t proprietor,
+       struct turnstile **tstore,
+       struct turnstile **out_turnstile)
+{
+       thread_t thread = current_thread();
+       struct turnstile *primitive_turnstile = TURNSTILE_NULL;
+       struct turnstile *thread_turnstile = TURNSTILE_NULL;
+
+       assert(thread->inheritor == NULL);
+
+       if (tstore != NULL) {
+               /*
+                * If the primitive stores the turnstile, check if the primitive turnstile
+                * has any turnstiles on its freelist.
+                */
+               assert(*tstore != TURNSTILE_NULL);
+               if (turnstile_freelist_empty(*tstore)) {
+                       /* Last turnstile scenario; remove the primitive->turnstile */
+                       thread_turnstile = *tstore;
+                       *tstore = TURNSTILE_NULL;
+                       turnstile_state_remove(thread_turnstile, TURNSTILE_STATE_PROPRIETOR);
+               } else {
+                       /* Freelist has turnstiles; remove one from the freelist */
+                       thread_turnstile = turnstile_freelist_remove(*tstore);
+               }
+               primitive_turnstile = *tstore;
+       } else {
+               /* Use the global hash to find and remove a turnstile */
+               primitive_turnstile = turnstable_htable_lookup_remove(proprietor, &thread_turnstile);
+       }
+       if (primitive_turnstile == NULL) {
+               /*
+                * The primitive no longer has a turnstile associated with it: thread_turnstile
+                * was the last turnstile attached to the primitive. Clear out the inheritor and
+                * stash the old inheritor for turnstile cleanup.
+                */
+               if (thread_turnstile->ts_inheritor != TURNSTILE_INHERITOR_NULL) {
+                       turnstile_update_inheritor(thread_turnstile, TURNSTILE_INHERITOR_NULL,
+                               (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+                       /*
+                        * old inheritor is set on the current thread and its priority propagation
+                        * will happen in turnstile cleanup call
+                        */
+               }
+               assert(thread_turnstile->ts_inheritor == TURNSTILE_INHERITOR_NULL);
+
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                               (TURNSTILE_CODE(TURNSTILE_FREELIST_OPERATIONS, (TURNSTILE_COMPLETE))) | DBG_FUNC_NONE,
+                               VM_KERNEL_UNSLIDE_OR_PERM(thread_turnstile),
+                               VM_KERNEL_UNSLIDE_OR_PERM(proprietor),
+                               turnstile_get_type(thread_turnstile), 0, 0);
+       } else {
+               /* If primitive's turnstile needs priority update, set it up for turnstile cleanup */
+               if (turnstile_recompute_priority(primitive_turnstile)) {
+                       turnstile_reference(primitive_turnstile);
+                       thread->inheritor = primitive_turnstile;
+                       thread->inheritor_flags = (TURNSTILE_INHERITOR_TURNSTILE |
+                               TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE);
+               }
+       }
+
+       turnstile_set_type_and_increment_gencount(thread_turnstile, TURNSTILE_NONE);
+#if DEVELOPMENT || DEBUG
+       thread_turnstile->ts_prev_thread = NULL;
+       thread_turnstile->ts_thread = thread;
+#endif
+
+       turnstile_state_add(thread_turnstile, TURNSTILE_STATE_THREAD);
+       if (out_turnstile == NULL) {
+               /* Prepare the turnstile to become the thread's turnstile */
+               thread->turnstile = thread_turnstile;
+       } else {
+               *out_turnstile = thread_turnstile;
+       }
+       return;
+}
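+
+/*
+ * Editor's note (usage sketch, not part of this change): continuation of the
+ * sketch shown after turnstile_prepare above.  Once the waiter wakes up it
+ * must give a turnstile back to itself; 'prim' and the interlock helpers
+ * remain hypothetical, and turnstile_cleanup() is assumed to drop any
+ * inheritor reference stashed on the thread:
+ *
+ *   my_prim_interlock_lock(prim);
+ *   turnstile_complete((uintptr_t)prim, &prim->ts, NULL);
+ *   my_prim_interlock_unlock(prim);
+ *   turnstile_cleanup();
+ *
+ * If this waiter was the last one, prim->ts goes back to TURNSTILE_NULL and
+ * the thread leaves with the primitive's turnstile attached; otherwise it
+ * takes one from the freelist.
+ */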
+
+/*
+ * Name: turnstile_update_inheritor_locked
+ *
+ * Description: Update the inheritor of the turnstile and boost the
+ *              inheritor, called with turnstile locked.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Implicit arg: new inheritor value is stashed in current thread's struct
+ *
+ * Returns:
+ *   old inheritor reference is returned on current thread's struct.
+ */
+void
+turnstile_update_inheritor_locked(
+       struct turnstile *turnstile)
+{
+       turnstile_inheritor_t old_inheritor = turnstile->ts_inheritor;
+       turnstile_update_flags_t old_inheritor_flags = turnstile->ts_inheritor_flags;
+       thread_t thread = current_thread();
+       boolean_t old_inheritor_needs_update = FALSE;
+       boolean_t new_inheritor_needs_update = FALSE;
+       turnstile_stats_update_flags_t tsu_flags =
+               turnstile_get_update_flags_for_above_UI_pri_change(turnstile);
+
+       assert(waitq_held(&turnstile->ts_waitq));
+
+       /*
+        * Get the new inheritor value from current thread's
+        * struct, the value was stashed by turnstile_update_inheritor
+        */
+       turnstile_inheritor_t new_inheritor = thread->inheritor;
+       turnstile_update_flags_t new_inheritor_flags = thread->inheritor_flags;
+
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+
+               /* Check if update is needed */
+               if (old_inheritor == new_inheritor && old_inheritor == NULL) {
+                       break;
+               }
+
+               if (old_inheritor == new_inheritor) {
+                       if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                               thread_t thread_inheritor = (thread_t)new_inheritor;
+
+                               assert(old_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+
+                               /* adjust turnstile position in the thread's inheritor list */
+                               new_inheritor_needs_update = thread_update_turnstile_promotion(
+                                       thread_inheritor, turnstile);
+
+                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                               struct turnstile *inheritor_turnstile = new_inheritor;
+
+                               assert(old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE);
+
+                               new_inheritor_needs_update = turnstile_update_turnstile_promotion(
+                                       inheritor_turnstile, turnstile);
+
+                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                               /*
+                                * While the inheritor is still "WORKQ", any racing
+                                * updates will redrive through their own propagation,
+                                * so we don't need to update anything here.
+                                */
+                               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
+                       } else {
+                               panic("Inheritor flags lost along the way");
+                       }
+
+                       /* Update turnstile stats */
+                       if (!new_inheritor_needs_update) {
+                               turnstile_stats_update(1, TSU_PRI_PROPAGATION |
+                                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
+                       }
+                       break;
+               }
+
+               if (old_inheritor != NULL) {
+                       if (old_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                               thread_t thread_inheritor = (thread_t)old_inheritor;
+
+                               /* remove turnstile from thread's inheritor list */
+                               old_inheritor_needs_update = thread_remove_turnstile_promotion(thread_inheritor, turnstile);
+
+                       } else if (old_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                               struct turnstile *old_turnstile = old_inheritor;
+
+                               old_inheritor_needs_update = turnstile_remove_turnstile_promotion(
+                                       old_turnstile, turnstile);
+
+                       } else if (old_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                               /*
+                                * Nothing needs to be undone when the old inheritor was a WORKQ,
+                                * because no priority was ever pushed onto it in the first place.
+                                */
+                               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                                       TSU_TURNSTILE_ARG, turnstile);
+                       } else {
+                               panic("Inheritor flags lost along the way");
+                       }
+                       /* Update turnstile stats */
+                       if (!old_inheritor_needs_update) {
+                               turnstile_stats_update(1, TSU_PRI_PROPAGATION | TSU_TURNSTILE_ARG,
+                                       turnstile);
+                       }
+               }
+
+               if (new_inheritor != NULL) {
+                       if (new_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                               thread_t thread_inheritor = (thread_t)new_inheritor;
+
+                               assert(new_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+                               /* add turnstile to thread's inheritor list */
+                               new_inheritor_needs_update = thread_add_turnstile_promotion(
+                                               thread_inheritor, turnstile);
+
+                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                               struct turnstile *new_turnstile = new_inheritor;
+
+                               new_inheritor_needs_update = turnstile_add_turnstile_promotion(
+                                       new_turnstile, turnstile);
+
+                       } else if (new_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                               struct workqueue *wq_inheritor = new_inheritor;
+
+                               new_inheritor_needs_update = workq_add_turnstile_promotion(
+                                               wq_inheritor, turnstile);
+                               if (!new_inheritor_needs_update) {
+                                       turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                                               TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
+                               }
+                       } else {
+                               panic("Inheritor flags lost along the way");
+                       }
+                       /* Update turnstile stats */
+                       if (!new_inheritor_needs_update) {
+                               turnstile_stats_update(1, TSU_PRI_PROPAGATION |
+                                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG | tsu_flags, turnstile);
+                       }
+               }
+
+               break;
+
+       case TURNSTILE_KERNEL_PROMOTE:
+               break;
+       default:
+               panic("turnstile promotion for type %d not yet implemented", turnstile_get_type(turnstile));
+       }
+
+       if (old_inheritor_needs_update) {
+               old_inheritor_flags |= TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE;
+       }
+
+       /*
+        * If the new inheritor needs a priority update, set TURNSTILE_NEEDS_PRI_UPDATE
+        * on old_inheritor_flags, which will be copied to the thread.
+        */
+       if (new_inheritor_needs_update) {
+               old_inheritor_flags |= TURNSTILE_NEEDS_PRI_UPDATE;
+       }
+
+       turnstile->ts_inheritor = new_inheritor;
+       turnstile->ts_inheritor_flags = new_inheritor_flags;
+       thread->inheritor = old_inheritor;
+       thread->inheritor_flags = old_inheritor_flags;
+       return;
+}
+
+/*
+ * Name: turnstile_update_inheritor
+ *
+ * Description: Update the inheritor of the turnstile and boost the
+ *              inheritor. It takes a reference on the new inheritor.
+ *              Called with the interlock of the primitive held.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: inheritor
+ *   Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait
+ *
+ * Returns:
+ *   old inheritor reference is stashed on current thread's struct.
+ */
+void
+turnstile_update_inheritor(
+       struct turnstile *turnstile,
+       turnstile_inheritor_t new_inheritor,
+       turnstile_update_flags_t flags)
+{
+       thread_t thread = current_thread();
+       spl_t spl;
+
+       /*
+        * Set the inheritor on calling thread struct, no need
+        * to take the turnstile waitq lock since the inheritor
+        * is protected by the primitive's interlock
+        */
+       assert(thread->inheritor == TURNSTILE_INHERITOR_NULL);
+       thread->inheritor = new_inheritor;
+       thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
+       if (new_inheritor == TURNSTILE_INHERITOR_NULL) {
+               /* nothing to retain or remember */
+       } else if (flags & TURNSTILE_INHERITOR_THREAD) {
+               thread->inheritor_flags |= TURNSTILE_INHERITOR_THREAD;
+               thread_reference((thread_t)new_inheritor);
+       } else if (flags & TURNSTILE_INHERITOR_TURNSTILE) {
+               thread->inheritor_flags |= TURNSTILE_INHERITOR_TURNSTILE;
+               turnstile_reference((struct turnstile *)new_inheritor);
+       } else if (flags & TURNSTILE_INHERITOR_WORKQ) {
+               thread->inheritor_flags |= TURNSTILE_INHERITOR_WORKQ;
+               workq_reference((struct workqueue *)new_inheritor);
+       } else {
+               panic("Missing type in flags (%x) for inheritor (%p)", flags,
+                               new_inheritor);
+       }
+
+       /* Do not perform the update if delayed update is specified */
+       if (flags & TURNSTILE_DELAYED_UPDATE) {
+               return;
+       }
+
+       /* lock the turnstile waitq */
+       spl = splsched();
+       waitq_lock(&turnstile->ts_waitq);
+
+       turnstile_update_inheritor_locked(turnstile);
+
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(spl);
+
+       return;
+}
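+
+/*
+ * Editor's note (usage sketch, not part of this change): with
+ * TURNSTILE_IMMEDIATE_UPDATE the inheritor swap happens right away, and the
+ * reference on the old inheritor comes back stashed on the calling thread;
+ * the caller is expected to drop the primitive interlock and then let
+ * turnstile_cleanup() propagate and release it.  'prim' and the interlock
+ * helpers below are hypothetical:
+ *
+ *   turnstile_update_inheritor(ts, new_owner,
+ *           (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+ *   // ... finish updating the primitive's state under the interlock ...
+ *   my_prim_interlock_unlock(prim);
+ *   turnstile_cleanup();    // drops the stashed old-inheritor reference
+ */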
+
+
+/*
+ * Name: turnstile_need_thread_promotion_update
+ *
+ * Description: Check if thread's place in the turnstile waitq needs to be updated.
+ *
+ * Arg1: dst turnstile
+ * Arg2: thread
+ *
+ * Returns: TRUE: if turnstile_update_thread_promotion_locked needs to be called.
+ *          FALSE: otherwise.
+ *
+ * Condition: thread locked.
+ */
+static boolean_t
+turnstile_need_thread_promotion_update(
+       struct turnstile *dst_turnstile __assert_only,
+       thread_t thread)
+{
+       int thread_link_priority;
+       boolean_t needs_update = FALSE;
+
+       thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue),
+                                &(thread->wait_prioq_links));
+
+       needs_update = (thread_link_priority == thread->base_pri) ? FALSE : TRUE;
+       return needs_update;
+}
+
+/*
+ * Name: turnstile_priority_queue_update_entry_key
+ *
+ * Description: Updates the priority of an entry in a priority queue
+ *
+ * Arg1: a turnstile/thread/... priority queue
+ * Arg2: the element to change the priority of
+ * Arg3: the new priority
+ *
+ * Returns: whether the maximum priority of the queue changed.
+ */
+static boolean_t
+turnstile_priority_queue_update_entry_key(struct priority_queue *q,
+               priority_queue_entry_t elt, priority_queue_key_t pri)
+{
+       priority_queue_key_t old_key = priority_queue_max_key(q);
+
+       if (priority_queue_entry_key(q, elt) < pri) {
+               if (priority_queue_entry_increase(q, elt, pri,
+                               PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       return old_key != priority_queue_max_key(q);
+               }
+       } else if (priority_queue_entry_key(q, elt) > pri) {
+               if (priority_queue_entry_decrease(q, elt, pri,
+                               PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+                       return old_key != priority_queue_max_key(q);
+               }
+       }
+
+       return FALSE;
+}
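+
+/*
+ * Editor's note (illustrative sketch, not part of this change): the helper
+ * above reports whether re-keying one entry moved the queue's maximum, which
+ * is what decides whether the change has to keep propagating.  A toy
+ * analogue over a plain array:
+ *
+ *   #include <stdio.h>
+ *
+ *   static int max_of(const int *v, int n)
+ *   {
+ *           int m = v[0];
+ *           for (int i = 1; i < n; i++)
+ *                   if (v[i] > m)
+ *                           m = v[i];
+ *           return m;
+ *   }
+ *
+ *   int main(void)
+ *   {
+ *           int pri[3] = { 31, 47, 4 };
+ *           int old_max = max_of(pri, 3);
+ *           pri[2] = 63;                       // re-key one entry upward
+ *           printf("max changed: %d\n", max_of(pri, 3) != old_max);  // 1
+ *           return 0;
+ *   }
+ */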
+
+/*
+ * Name: turnstile_update_thread_promotion_locked
+ *
+ * Description: Update dst turnstile's inheritor link since one of the waiting
+ *              thread's priority has changed.
+ *
+ * Arg1: dst turnstile
+ * Arg2: thread
+ *
+ * Returns: TRUE: if the dst turnstile priority has changed and needs propagation.
+ *          FALSE: if the dst turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: dst turnstile and thread are locked.
+ */
+static boolean_t
+turnstile_update_thread_promotion_locked(
+       struct turnstile *dst_turnstile,
+       thread_t thread)
+{
+       int thread_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_waitq.waitq_prio_queue),
+                                &(thread->wait_prioq_links));
+
+       if (thread->base_pri != thread_link_priority) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_MOVED_IN_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
+                       VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile),
+                       thread_tid(thread),
+                       thread->base_pri,
+                       thread_link_priority, 0);
+       }
+
+       if (!turnstile_priority_queue_update_entry_key(
+                       &dst_turnstile->ts_waitq.waitq_prio_queue,
+                       &thread->wait_prioq_links, thread->base_pri)) {
+               return FALSE;
+       }
+
+       /* Update dst turnstile's priority */
+       return turnstile_recompute_priority_locked(dst_turnstile);
+}
+
+
+/*
+ * Name: thread_add_turnstile_promotion
+ *
+ * Description: Add a turnstile to thread's inheritor list and update thread's priority.
+ *
+ * Arg1: thread
+ * Arg2: turnstile
+ *
+ * Returns: TRUE: if the thread's priority has changed and needs propagation.
+ *          FALSE: if the thread's priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile locked.
+ */
+static boolean_t
+thread_add_turnstile_promotion(
+       thread_t thread,
+       struct turnstile *turnstile)
+{
+       boolean_t needs_update = FALSE;
+
+       /* Update the pairing heap */
+       thread_lock(thread);
+
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+               (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_ADDED_TO_THREAD_HEAP))) | DBG_FUNC_NONE,
+               thread_tid(thread),
+               VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
+               turnstile->ts_priority, 0, 0);
+
+       priority_queue_entry_init(&(turnstile->ts_inheritor_links));
+       if (priority_queue_insert(&thread->inheritor_queue,
+                       &turnstile->ts_inheritor_links, turnstile->ts_priority,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               /* Update thread priority */
+               needs_update = thread_recompute_user_promotion_locked(thread);
+       }
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       thread_get_update_flags_for_turnstile_propagation_stoppage(thread) |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG,
+                       turnstile);
+       }
+
+       thread_unlock(thread);
+       return needs_update;
+}
+
+
+/*
+ * Name: thread_remove_turnstile_promotion
+ *
+ * Description: Remove turnstile from thread's inheritor list and update thread's priority.
+ *
+ * Arg1: thread
+ * Arg2: turnstile
+ *
+ * Returns: TRUE: if the thread's priority has changed and needs propagation.
+ *          FALSE: if the thread's priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile locked.
+ */
+static boolean_t
+thread_remove_turnstile_promotion(
+       thread_t thread,
+       struct turnstile *turnstile)
+{
+       boolean_t needs_update = FALSE;
+
+       /* Update the pairing heap */
+       thread_lock(thread);
+
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+               (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_REMOVED_FROM_THREAD_HEAP))) | DBG_FUNC_NONE,
+               thread_tid(thread),
+               VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
+               0, 0, 0);
+
+       if (priority_queue_remove(&thread->inheritor_queue,
+                       &turnstile->ts_inheritor_links,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               /* Update thread priority */
+               needs_update = thread_recompute_user_promotion_locked(thread);
+       }
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       thread_get_update_flags_for_turnstile_propagation_stoppage(thread) | TSU_TURNSTILE_ARG,
+                       turnstile);
+       }
+
+       thread_unlock(thread);
+       return needs_update;
+}
+
+/*
+ * Name: thread_needs_turnstile_promotion_update
+ *
+ * Description: Check if turnstile position in thread's inheritor list needs to be updated.
+ *
+ * Arg1: thread
+ * Arg2: turnstile
+ *
+ * Returns: TRUE: if thread_update_turnstile_promotion needs to be called.
+ *          FALSE: otherwise.
+ *
+ * Condition: turnstile locked.
+ */
+static boolean_t
+thread_needs_turnstile_promotion_update(
+       thread_t thread __assert_only,
+       struct turnstile *turnstile)
+{
+       boolean_t needs_update = FALSE;
+       int turnstile_link_priority;
+
+       /* Update the pairing heap */
+       turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue),
+                                &(turnstile->ts_inheritor_links));
+
+       needs_update = (turnstile_link_priority == turnstile->ts_priority) ? FALSE : TRUE;
+       return needs_update;
+}
+
+/*
+ * Name: thread_update_turnstile_promotion_locked
+ *
+ * Description: Update turnstile position in thread's inheritor list and update thread's priority.
+ *
+ * Arg1: thread
+ * Arg2: turnstile
+ *
+ * Returns: TRUE: if the thread's priority has changed and needs propagation.
+ *          FALSE: if the thread's priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile and thread are locked.
+ */
+static boolean_t
+thread_update_turnstile_promotion_locked(
+       thread_t thread,
+       struct turnstile *turnstile)
+{
+       int turnstile_link_priority = priority_queue_entry_key(&(thread->inheritor_queue),
+                                &(turnstile->ts_inheritor_links));
+
+       if (turnstile->ts_priority != turnstile_link_priority) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_MOVED_IN_THREAD_HEAP))) | DBG_FUNC_NONE,
+                       thread_tid(thread),
+                       VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
+                       turnstile->ts_priority,
+                       turnstile_link_priority, 0);
+       }
+
+       if (!turnstile_priority_queue_update_entry_key(&thread->inheritor_queue,
+                       &turnstile->ts_inheritor_links, turnstile->ts_priority)) {
+               return FALSE;
+       }
+
+       /* Update thread priority */
+       return thread_recompute_user_promotion_locked(thread);
+}
+
+
+/*
+ * Name: thread_update_turnstile_promotion
+ *
+ * Description: Update turnstile position in thread's inheritor list and update thread's priority.
+ *
+ * Arg1: thread
+ * Arg2: turnstile
+ *
+ * Returns: TRUE: if the thread's priority has changed and needs propagation.
+ *          FALSE: if the thread's priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile locked.
+ */
+static boolean_t
+thread_update_turnstile_promotion(
+       thread_t thread,
+       struct turnstile *turnstile)
+{
+       /* Before grabbing the thread lock, check if update is needed */
+       boolean_t needs_update = thread_needs_turnstile_promotion_update(thread, turnstile);
+
+       if (!needs_update) {
+               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG, turnstile);
+               return needs_update;
+       }
+
+       /* Update the pairing heap */
+       thread_lock(thread);
+       needs_update = thread_update_turnstile_promotion_locked(thread, turnstile);
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       thread_get_update_flags_for_turnstile_propagation_stoppage(thread) |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG,
+                       turnstile);
+       }
+       thread_unlock(thread);
+       return needs_update;
+}
+
+
+/*
+ * Name: thread_get_inheritor_turnstile_priority
+ *
+ * Description: Get the max priority of all the inheritor turnstiles
+ *
+ * Arg1: thread
+ *
+ * Returns: Max priority of all the inheritor turnstiles.
+ *
+ * Condition: thread locked
+ */
+int
+thread_get_inheritor_turnstile_priority(thread_t thread)
+{
+       struct turnstile *max_turnstile;
+
+       max_turnstile = priority_queue_max(&thread->inheritor_queue,
+                       struct turnstile, ts_inheritor_links);
+
+       if (max_turnstile) {
+               return priority_queue_entry_key(&thread->inheritor_queue,
+                               &max_turnstile->ts_inheritor_links);
+       }
+
+       return MAXPRI_THROTTLE;
+}
+
+
+/*
+ * Name: thread_get_waiting_turnstile
+ *
+ * Description: Get the turnstile if the thread is waiting on a turnstile.
+ *
+ * Arg1: thread
+ *
+ * Returns: turnstile: if the thread is blocked on a turnstile.
+ *          TURNSTILE_NULL: otherwise.
+ *
+ * Condition: thread locked.
+ */
+struct turnstile *
+thread_get_waiting_turnstile(thread_t thread)
+{
+       struct turnstile *turnstile = TURNSTILE_NULL;
+       struct waitq *waitq = thread->waitq;
+
+       /* Check if the thread is on a waitq */
+       if (waitq == NULL) {
+               return turnstile;
+       }
+
+       /* Get the safeq if the waitq is a port queue */
+       if (waitq_is_port_queue(waitq)) {
+               waitq = waitq_get_safeq(waitq);
+       }
+
+       /* Check if the waitq is a turnstile queue */
+       if (waitq_is_turnstile_queue(waitq)) {
+               turnstile = waitq_to_turnstile(waitq);
+       }
+       return turnstile;
+}
+
+
+/*
+ * Name: turnstile_lookup_by_proprietor
+ *
+ * Description: Get turnstile for a proprietor from global
+ *              turnstile hash.
+ *
+ * Arg1: proprietor
+ *
+ * Returns: turnstile: if the proprietor has a turnstile.
+ *          TURNSTILE_NULL: otherwise.
+ *
+ * Condition: proprietor interlock held.
+ */
+struct turnstile *
+turnstile_lookup_by_proprietor(uintptr_t proprietor)
+{
+       return turnstile_htable_lookup(proprietor);
+}
+
+
+/*
+ * Name: thread_get_update_flags_for_turnstile_propagation_stoppage
+ *
+ * Description: Get the turnstile stats flags based on the thread wait status.
+ *
+ * Arg1: thread
+ *
+ * Returns: TSU_THREAD_RUNNABLE: if the thread is runnable.
+ *          TSU_NO_TURNSTILE: if the thread is waiting on a regular waitq.
+ *          TSU_NO_PRI_CHANGE_NEEDED: otherwise.
+ *
+ * Condition: thread locked.
+ */
+static turnstile_stats_update_flags_t
+thread_get_update_flags_for_turnstile_propagation_stoppage(thread_t thread)
+{
+       struct waitq *waitq = thread->waitq;
+
+       /* Check if the thread is on a waitq */
+       if (waitq == NULL) {
+               return TSU_THREAD_RUNNABLE;
+       }
+
+       /* Get the safeq if the waitq is a port queue */
+       if (waitq_is_port_queue(waitq)) {
+               waitq = waitq_get_safeq(waitq);
+       }
+
+       /* Check if the waitq is a turnstile queue */
+       if (!waitq_is_turnstile_queue(waitq)) {
+               return TSU_NO_TURNSTILE;
+       }
+
+       /* Thread blocked on turnstile waitq but no propagation needed */
+       return TSU_NO_PRI_CHANGE_NEEDED;
+}
+
+
+/*
+ * Name: turnstile_get_update_flags_for_above_UI_pri_change
+ *
+ * Description: Get the turnstile stats flags based on the turnstile priority.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: TSU_ABOVE_UI_PRI_CHANGE: if turnstile priority is above 47 and it is not an ulock.
+ *          TSU_FLAGS_NONE: otherwise.
+ *
+ * Condition: turnstile locked.
+ */
+static turnstile_stats_update_flags_t
+turnstile_get_update_flags_for_above_UI_pri_change(struct turnstile *turnstile)
+{
+       if (turnstile->ts_priority >
+           (thread_qos_policy_params.qos_pri[THREAD_QOS_USER_INTERACTIVE] + 1) &&
+           turnstile_get_type(turnstile) != TURNSTILE_ULOCK) {
+               return TSU_ABOVE_UI_PRI_CHANGE;
+
+       }
+
+       return TSU_FLAGS_NONE;
+}
+
+
+/*
+ * Name: workq_add_turnstile_promotion
+ *
+ * Description: Connect the workqueue turnstile to the workqueue as a fake
+ *              inheritor
+ *
+ * Arg1: workqueue
+ * Arg2: turnstile
+ *
+ * Condition: turnstile locked.
+ */
+static boolean_t
+workq_add_turnstile_promotion(
+       struct workqueue *wq_inheritor __unused,
+       struct turnstile *turnstile)
+{
+       /*
+        * If the push is higher than MAXPRI_THROTTLE then the workqueue should
+        * bring up a thread.
+        */
+       return turnstile->ts_priority > MAXPRI_THROTTLE;
+}
+
+/*
+ * Name: turnstile_need_turnstile_promotion_update
+ *
+ * Description: Check if turnstile position in turnstile's inheritor list needs to be updated.
+ *
+ * Arg1: dst turnstile
+ * Arg2: src turnstile
+ *
+ * Returns: TRUE: if turnstile_update_turnstile_promotion needs to be called.
+ *          FALSE: otherwise.
+ *
+ * Condition: src turnstile locked.
+ */
+static boolean_t
+turnstile_need_turnstile_promotion_update(
+       struct turnstile *dst_turnstile __assert_only,
+       struct turnstile *src_turnstile)
+{
+       int src_turnstile_link_priority;
+       boolean_t needs_update = FALSE;
+
+       src_turnstile_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_inheritor_queue),
+                                &(src_turnstile->ts_inheritor_links));
+
+       needs_update = (src_turnstile_link_priority == src_turnstile->ts_priority) ? FALSE : TRUE;
+       return needs_update;
+}
+
+/*
+ * Name: turnstile_update_turnstile_promotion_locked
+ *
+ * Description: Update dst turnstile's inheritor link since src turnstile's
+ *              promote priority has changed.
+ *
+ * Arg1: dst turnstile
+ * Arg2: src turnstile
+ *
+ * Returns: TRUE: if the dst turnstile priority has changed and needs propagation.
+ *          FALSE: if the dst turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: src and dst turnstile locked.
+ */
+static boolean_t
+turnstile_update_turnstile_promotion_locked(
+       struct turnstile *dst_turnstile,
+       struct turnstile *src_turnstile)
+{
+       int src_turnstile_link_priority;
+       src_turnstile_link_priority = priority_queue_entry_key(&(dst_turnstile->ts_inheritor_queue),
+                                &(src_turnstile->ts_inheritor_links));
+
+       if (src_turnstile->ts_priority != src_turnstile_link_priority) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_MOVED_IN_TURNSTILE_HEAP))) | DBG_FUNC_NONE,
+                       VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile),
+                       VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile),
+                       src_turnstile->ts_priority, src_turnstile_link_priority, 0);
+       }
+
+       if (!turnstile_priority_queue_update_entry_key(
+                       &dst_turnstile->ts_inheritor_queue, &src_turnstile->ts_inheritor_links,
+                       src_turnstile->ts_priority)) {
+               return FALSE;
+       }
+
+       /* Update dst turnstile's priority */
+       return turnstile_recompute_priority_locked(dst_turnstile);
+}
+
+/*
+ * Name: turnstile_update_turnstile_promotion
+ *
+ * Description: Update dst turnstile's inheritor link since src turnstile's
+ *              promote priority has changed.
+ *
+ * Arg1: dst turnstile
+ * Arg2: src turnstile
+ *
+ * Returns: TRUE: if the dst turnstile priority has changed and needs propagation.
+ *          FALSE: if the dst turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: src turnstile locked.
+ */
+static boolean_t
+turnstile_update_turnstile_promotion(
+       struct turnstile *dst_turnstile,
+       struct turnstile *src_turnstile)
+{
+       /* Check if update is needed before grabbing the src turnstile lock */
+       boolean_t needs_update = turnstile_need_turnstile_promotion_update(dst_turnstile, src_turnstile);
+       if (!needs_update) {
+               turnstile_stats_update(1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG,
+                       src_turnstile);
+               return needs_update;
+       }
+
+       /* Update the pairing heap */
+       waitq_lock(&dst_turnstile->ts_waitq);
+       needs_update = turnstile_update_turnstile_promotion_locked(dst_turnstile, src_turnstile);
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG, src_turnstile);
+       }
+       waitq_unlock(&dst_turnstile->ts_waitq);
+       return needs_update;
+}
+
+/*
+ * Name: turnstile_add_turnstile_promotion
+ *
+ * Description: Add src turnstile to dst turnstile's inheritor link
+ *              and update dst turnstile's priority.
+ *
+ * Arg1: dst turnstile
+ * Arg2: src turnstile
+ *
+ * Returns: TRUE: if the dst turnstile priority has changed and needs propagation.
+ *          FALSE: if the dst turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: src turnstile locked.
+ */
+static boolean_t
+turnstile_add_turnstile_promotion(
+       struct turnstile *dst_turnstile,
+       struct turnstile *src_turnstile)
+{
+       boolean_t needs_update = FALSE;
+
+       /* Update the pairing heap */
+       waitq_lock(&dst_turnstile->ts_waitq);
+
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+               (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_ADDED_TO_TURNSTILE_HEAP))) | DBG_FUNC_NONE,
+               VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile),
+               VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile),
+               src_turnstile->ts_priority, 0, 0);
+
+       priority_queue_entry_init(&(src_turnstile->ts_inheritor_links));
+       if (priority_queue_insert(&dst_turnstile->ts_inheritor_queue,
+                       &src_turnstile->ts_inheritor_links, src_turnstile->ts_priority,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               /* Update dst turnstile priority */
+               needs_update = turnstile_recompute_priority_locked(dst_turnstile);
+       }
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) |
+                       TSU_TURNSTILE_ARG | TSU_BOOST_ARG, src_turnstile);
+       }
+
+       waitq_unlock(&dst_turnstile->ts_waitq);
+       return needs_update;
+}
+
+/*
+ * Name: turnstile_remove_turnstile_promotion
+ *
+ * Description: Remove src turnstile from dst turnstile's inheritor link
+ *              and update dst turnstile's priority.
+ *
+ * Arg1: dst turnstile
+ * Arg2: src turnstile
+ *
+ * Returns: TRUE: if the dst turnstile priority has changed and needs propagation.
+ *          FALSE: if the dst turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: src turnstile locked.
+ */
+static boolean_t
+turnstile_remove_turnstile_promotion(
+       struct turnstile *dst_turnstile,
+       struct turnstile *src_turnstile)
+{
+       boolean_t needs_update = FALSE;
+
+       /* Update the pairing heap */
+       waitq_lock(&dst_turnstile->ts_waitq);
+
+       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+               (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (TURNSTILE_REMOVED_FROM_TURNSTILE_HEAP))) | DBG_FUNC_NONE,
+               VM_KERNEL_UNSLIDE_OR_PERM(dst_turnstile),
+               VM_KERNEL_UNSLIDE_OR_PERM(src_turnstile),
+               0, 0, 0);
+
+       if (priority_queue_remove(&dst_turnstile->ts_inheritor_queue,
+                       &src_turnstile->ts_inheritor_links,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE)) {
+               /* Update dst turnstile priority */
+               needs_update = turnstile_recompute_priority_locked(dst_turnstile);
+       }
+
+       /* Update turnstile stats */
+       if (!needs_update) {
+               turnstile_stats_update(1,
+                       (dst_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) |
+                       TSU_TURNSTILE_ARG, src_turnstile);
+       }
+
+       waitq_unlock(&dst_turnstile->ts_waitq);
+       return needs_update;
+}
+
+/*
+ * Name: turnstile_recompute_priority_locked
+ *
+ * Description: Update turnstile priority based
+ *              on highest waiter thread and highest blocking
+ *              turnstile.
+ *
+ * Args: turnstile
+ *
+ * Returns: TRUE: if the turnstile priority changed and needs propagation.
+ *          FALSE: if the turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile locked
+ */
+boolean_t
+turnstile_recompute_priority_locked(
+       struct turnstile *turnstile)
+{
+       int old_priority;
+       int new_priority;
+       boolean_t needs_priority_update = FALSE;
+       thread_t max_thread = THREAD_NULL;
+       struct turnstile *max_turnstile;
+       int thread_max_pri = MAXPRI_THROTTLE;
+       int turnstile_max_pri = MAXPRI_THROTTLE;
+
+       switch (turnstile_promote_policy[turnstile_get_type(turnstile)]) {
+
+       case TURNSTILE_USER_PROMOTE:
+       case TURNSTILE_USER_IPC_PROMOTE:
+
+               old_priority = turnstile->ts_priority;
+
+               max_thread = priority_queue_max(&turnstile->ts_waitq.waitq_prio_queue,
+                               struct thread, wait_prioq_links);
+
+               if (max_thread) {
+                       thread_max_pri = priority_queue_entry_key(&turnstile->ts_waitq.waitq_prio_queue,
+                                       &max_thread->wait_prioq_links);
+               }
+
+               max_turnstile = priority_queue_max(&turnstile->ts_inheritor_queue,
+                               struct turnstile, ts_inheritor_links);
+
+               if (max_turnstile) {
+                       turnstile_max_pri = priority_queue_entry_key(&turnstile->ts_inheritor_queue,
+                                       &max_turnstile->ts_inheritor_links);
+               }
+
+               new_priority = max(thread_max_pri, turnstile_max_pri);
+               turnstile->ts_priority = new_priority;
+
+               if (old_priority != new_priority) {
+                       KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                               (TURNSTILE_CODE(TURNSTILE_PRIORITY_OPERATIONS,
+                               (TURNSTILE_PRIORITY_CHANGE))) | DBG_FUNC_NONE,
+                               VM_KERNEL_UNSLIDE_OR_PERM(turnstile),
+                               new_priority,
+                               old_priority,
+                               0, 0);
+               }
+               needs_priority_update = (old_priority != new_priority) &&
+                               (turnstile->ts_inheritor != NULL);
+       break;
+
+       case TURNSTILE_PROMOTE_NONE:
+       case TURNSTILE_KERNEL_PROMOTE:
+
+       /* The turnstile was repurposed, do nothing */
+       break;
+
+       default:
+
+       panic("Needs implementation for turnstile_recompute_priority");
+       break;
+
+       }
+       return needs_priority_update;
+
+}
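+
+/*
+ * Editor's note (worked example, not part of this change): with waiter
+ * threads at priorities {31, 37} on ts_waitq and pushing turnstiles at
+ * {4, 47} on ts_inheritor_queue, the recomputation above yields
+ *
+ *   new_priority = max(thread_max_pri, turnstile_max_pri) = max(37, 47) = 47;
+ *
+ * if ts_priority was previously 37 and ts_inheritor is non-NULL, the change
+ * must be propagated to the inheritor.
+ */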
+
+
+/*
+ * Name: turnstile_recompute_priority
+ *
+ * Description: Update turnstile priority based
+ *              on highest waiter thread and highest blocking
+ *              turnstile.
+ *
+ * Args: turnstile
+ *
+ * Returns: TRUE: if the turnstile priority changed and needs propagation.
+ *          FALSE: if the turnstile priority did not change or it does not need propagation.
+ */
+boolean_t
+turnstile_recompute_priority(
+       struct turnstile *turnstile)
+{
+       boolean_t needs_priority_update = FALSE;
+       spl_t s = splsched();
+
+       waitq_lock(&turnstile->ts_waitq);
+
+       needs_priority_update = turnstile_recompute_priority_locked(turnstile);
+
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(s);
+       return needs_priority_update;
+
+}
+
+
+/*
+ * Name: turnstile_workq_proprietor_of_max_turnstile
+ *
+ * Description: Returns the highest priority and proprietor of a turnstile
+ *              pushing on a workqueue turnstile.
+ *
+ *              This will not return waiters that are at priority
+ *              MAXPRI_THROTTLE or lower.
+ *
+ * Args: turnstile
+ *
+ * Returns:
+ *    Priority of the max entry, or 0
+ *    Pointer to the max entry proprietor
+ */
+int
+turnstile_workq_proprietor_of_max_turnstile(
+       struct turnstile *turnstile,
+       uintptr_t *proprietor_out)
+{
+       struct turnstile *max_turnstile;
+       int max_priority = 0;
+       uintptr_t proprietor = 0;
+
+       assert(turnstile_get_type(turnstile) == TURNSTILE_WORKQS);
+
+       spl_t s = splsched();
+
+       waitq_lock(&turnstile->ts_waitq);
+
+       max_turnstile = priority_queue_max(&turnstile->ts_inheritor_queue,
+                       struct turnstile, ts_inheritor_links);
+       if (max_turnstile) {
+               max_priority = priority_queue_entry_key(&turnstile->ts_inheritor_queue,
+                               &max_turnstile->ts_inheritor_links);
+               proprietor = max_turnstile->ts_proprietor;
+       }
+
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(s);
+
+       if (max_priority <= MAXPRI_THROTTLE) {
+               max_priority = 0;
+               proprietor = 0;
+       }
+       if (proprietor_out) *proprietor_out = proprietor;
+       return max_priority;
+}
+
+
+/*
+ * Name: turnstile_update_inheritor_priority_chain
+ *
+ * Description: Update turnstile inheritor's priority and propagate
+ *              the priority if the inheritor is blocked on a turnstile.
+ *
+ * Arg1: inheritor
+ * Arg2: inheritor flags
+ *
+ * Returns: None.
+ */
+static void
+turnstile_update_inheritor_priority_chain(
+       turnstile_inheritor_t inheritor,
+       turnstile_update_flags_t turnstile_flags)
+{
+       struct turnstile *turnstile = TURNSTILE_NULL;
+       thread_t thread = THREAD_NULL;
+       int total_hop = 0, thread_hop = 0;
+       spl_t s;
+       turnstile_stats_update_flags_t tsu_flags = ((turnstile_flags & TURNSTILE_UPDATE_BOOST) ?
+               TSU_BOOST_ARG : TSU_FLAGS_NONE) | TSU_PRI_PROPAGATION;
+
+       if (inheritor == NULL) {
+               return;
+       }
+
+       s = splsched();
+
+       if (turnstile_flags & TURNSTILE_INHERITOR_THREAD) {
+               thread = inheritor;
+               thread_lock(thread);
+               //TODO: Need to call sched promotion for kernel mutex.
+               thread_recompute_user_promotion_locked(thread);
+       } else if (turnstile_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+               turnstile = inheritor;
+               waitq_lock(&turnstile->ts_waitq);
+               turnstile_recompute_priority_locked(turnstile);
+               tsu_flags |= turnstile_get_update_flags_for_above_UI_pri_change(turnstile);
+       } else {
+               /*
+                * we should never call turnstile_update_inheritor_priority_chain()
+                * for a workqueue, they have no "chain" after them.
+                */
+               assert((turnstile_flags & TURNSTILE_INHERITOR_WORKQ) == 0);
+       }
+
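+       /*
+        * Hand-over-hand walk of the blocking chain: each iteration holds
+        * either a locked turnstile or a locked thread.  The helper for that
+        * hop updates the next inheritor, unlocks the object it was given and
+        * hands back the next locked object (or nothing), until propagation
+        * stops or the hop limit is reached.
+        */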
+       while (turnstile != TURNSTILE_NULL || thread != THREAD_NULL) {
+               if (turnstile != TURNSTILE_NULL) {
+                       if (turnstile->ts_inheritor == NULL) {
+                               turnstile_stats_update(total_hop + 1, TSU_NO_INHERITOR |
+                                       TSU_TURNSTILE_ARG | tsu_flags,
+                                       turnstile);
+                               waitq_unlock(&turnstile->ts_waitq);
+                               turnstile = TURNSTILE_NULL;
+                               break;
+                       }
+                       if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+                               turnstile_update_inheritor_thread_priority_chain(&turnstile, &thread,
+                                       total_hop, tsu_flags);
+
+                       } else if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+                               turnstile_update_inheritor_turnstile_priority_chain(&turnstile,
+                                       total_hop, tsu_flags);
+
+                       } else if (turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+                               turnstile_update_inheritor_workq_priority_chain(turnstile, s);
+                               turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED | tsu_flags,
+                                       NULL);
+                               return;
+
+                       } else {
+                               panic("Inheritor flags not passed in turnstile_update_inheritor");
+                       }
+               } else if (thread != THREAD_NULL) {
+                       thread_update_waiting_turnstile_priority_chain(&thread, &turnstile,
+                                       thread_hop, total_hop, tsu_flags);
+                       thread_hop++;
+               }
+               total_hop++;
+       }
+
+       splx(s);
+       return;
+}
+
+/*
+ * Name: turnstile_update_inheritor_complete
+ *
+ * Description: Update turnstile inheritor's priority and propagate the
+ *              priority if the inheritor is blocked on a turnstile.
+ *              Consumes thread ref of old inheritor returned by
+ *              turnstile_update_inheritor. Recursive priority update
+ *              will only happen when called with interlock dropped.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: interlock held
+ *
+ * Returns: None.
+ */
+void
+turnstile_update_inheritor_complete(
+       struct turnstile *turnstile,
+       turnstile_update_complete_flags_t flags __unused)
+{
+       thread_t thread = current_thread();
+
+       turnstile_update_flags_t inheritor_flags = thread->inheritor_flags;
+
+       turnstile_cleanup();
+
+       /* Perform priority update for new inheritor */
+       if (inheritor_flags & TURNSTILE_NEEDS_PRI_UPDATE) {
+               turnstile_update_inheritor_priority_chain(turnstile,
+                       TURNSTILE_INHERITOR_TURNSTILE | TURNSTILE_UPDATE_BOOST);
+       }
+}
+
+/*
+ * Name: turnstile_cleanup
+ *
+ * Description: Update priority of a turnstile inheritor
+ *              if needed.
+ *
+ * Args: inheritor and flags passed on thread struct.
+ *
+ * Returns: None.
+ */
+void
+turnstile_cleanup(void)
+{
+       thread_t thread = current_thread();
+
+       /* Get the old inheritor from calling thread struct */
+       turnstile_inheritor_t old_inheritor = thread->inheritor;
+       turnstile_update_flags_t inheritor_flags = thread->inheritor_flags;
+       thread->inheritor = THREAD_NULL;
+       thread->inheritor_flags = TURNSTILE_UPDATE_FLAGS_NONE;
+
+       if (old_inheritor == TURNSTILE_INHERITOR_NULL) {
+               /* no cleanup to do */
+               return;
+       }
+
+       /* Perform priority demotion for old inheritor */
+       if (inheritor_flags & TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE) {
+               turnstile_update_inheritor_priority_chain(old_inheritor,
+                       inheritor_flags);
+       }
+
+       /* Drop thread reference for old inheritor */
+       if (inheritor_flags & TURNSTILE_INHERITOR_THREAD) {
+               thread_deallocate_safe(old_inheritor);
+       } else if (inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE) {
+               turnstile_deallocate_safe((struct turnstile *)old_inheritor);
+       } else if (inheritor_flags & TURNSTILE_INHERITOR_WORKQ) {
+               workq_deallocate_safe((struct workqueue *)old_inheritor);
+       } else {
+               panic("Inheritor flags lost along the way");
+       }
+}
+
+/*
+ * Name: turnstile_update_inheritor_workq_priority_chain
+ *
+ * Description: Helper function to update turnstile's inheritor(workq)
+ *              priority and possibly redrive thread creation
+ *
+ * Arg1: turnstile: turnstile
+ * Arg2: s: saved interrupt state (spl) to restore on exit.
+ *
+ * Condition: turnstile is locked on entry, it is unlocked on exit,
+ *            and interrupts re-enabled.
+ */
+static void
+turnstile_update_inheritor_workq_priority_chain(struct turnstile *turnstile, spl_t s)
+{
+       struct workqueue *wq = turnstile->ts_inheritor;
+       bool workq_lock_held = workq_is_current_thread_updating_turnstile(wq);
+
+       if (__improbable(turnstile->ts_priority <= MAXPRI_THROTTLE)) {
+               waitq_unlock(&turnstile->ts_waitq);
+               splx(s);
+               return;
+       }
+
+       if (!workq_lock_held) workq_reference(wq);
+       waitq_unlock(&turnstile->ts_waitq);
+       splx(s);
+
+       workq_schedule_creator_turnstile_redrive(wq, workq_lock_held);
+
+       if (!workq_lock_held) workq_deallocate_safe(wq);
+}
+
+/*
+ * Name: turnstile_update_inheritor_thread_priority_chain
+ *
+ * Description: Helper function to update turnstile's inheritor(thread)
+ *              priority.
+ *
+ * Arg1: in_turnstile: address to turnstile
+ * Arg2: out_thread: address to return the thread inheritor
+ * Arg3: total_hop: total number of hops in the propagation chain
+ * Arg4: tsu_flags: turnstile update flags
+ *
+ * Returns: Implicit returns locked thread in out_thread if it needs
+ *          further propagation.
+ *
+ * Condition: *in_turnstile is locked on entry, it is unlocked on exit and
+ *            *in_turnstile is set to NULL.
+ */
+static void
+turnstile_update_inheritor_thread_priority_chain(
+       struct turnstile **in_turnstile,
+       thread_t *out_thread,
+       int total_hop,
+       turnstile_stats_update_flags_t tsu_flags)
+{
+       boolean_t needs_update = FALSE;
+       struct turnstile *turnstile = *in_turnstile;
+       thread_t thread_inheritor = turnstile->ts_inheritor;
+       boolean_t first_update = !total_hop;
+
+       assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_THREAD);
+       *in_turnstile = TURNSTILE_NULL;
+
+       /* Check if update is needed before grabbing the thread lock */
+       needs_update = thread_needs_turnstile_promotion_update(thread_inheritor, turnstile);
+       if (!needs_update && !first_update) {
+               turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_TURNSTILE_ARG | tsu_flags, turnstile);
+               waitq_unlock(&turnstile->ts_waitq);
+               return;
+       }
+
+       thread_lock(thread_inheritor);
+
+       /* adjust turnstile position in the thread's inheritor list */
+       needs_update = thread_update_turnstile_promotion_locked(
+               thread_inheritor, turnstile);
+
+       /*
+        * Check if the thread needs further priority propagation.
+        * Since the first hop priority update was already done in
+        * turnstile_update_inheritor, do not bail out on the first
+        * update even though needs_update evaluates to false in
+        * that case.
+        */
+       if (!needs_update && !first_update) {
+               /* Update turnstile stats before returning */
+               turnstile_stats_update(total_hop + 1,
+                       (thread_get_update_flags_for_turnstile_propagation_stoppage(thread_inheritor)) |
+                       TSU_TURNSTILE_ARG | tsu_flags,
+                       turnstile);
+               thread_unlock(thread_inheritor);
+               waitq_unlock(&turnstile->ts_waitq);
+               return;
+       }
+
+       /* Unlock the turnstile and update the thread */
+       waitq_unlock(&turnstile->ts_waitq);
+       *out_thread = thread_inheritor;
+       return;
+}
+
+/*
+ * Name: turnstile_update_inheritor_turnstile_priority_chain
+ *
+ * Description: Helper function to update turnstile's inheritor(turnstile)
+ *              priority.
+ *
+ * Arg1: in_out_turnstile: address to turnstile
+ * Arg2: total_hop: total number of hops in the propagation chain
+ * Arg3: tsu_flags: turnstile update flags
+ *
+ * Returns: Implicit returns locked turnstile in in_out_turnstile if it needs
+ *          further propagation.
+ *
+ * Condition: *in_out_turnstile is locked on entry and locked on exit, but the
+ *            value of *in_out_turnstile may change; the turnstile lock is
+ *            dropped for the old value and acquired for the new value.
+ */
+static void
+turnstile_update_inheritor_turnstile_priority_chain(
+       struct turnstile **in_out_turnstile,
+       int total_hop,
+       turnstile_stats_update_flags_t tsu_flags)
+{
+       boolean_t needs_update = FALSE;
+       struct turnstile *turnstile = *in_out_turnstile;
+       struct turnstile *inheritor_turnstile = turnstile->ts_inheritor;
+       boolean_t first_update = !total_hop;
+
+       assert(turnstile->ts_inheritor_flags & TURNSTILE_INHERITOR_TURNSTILE);
+       *in_out_turnstile = TURNSTILE_NULL;
+
+       /* Check if the inheritor turnstile needs to be updated before grabbing the lock */
+       needs_update = turnstile_need_turnstile_promotion_update(inheritor_turnstile, turnstile);
+       if (!needs_update && !first_update) {
+               turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_TURNSTILE_ARG | tsu_flags,
+                       turnstile);
+               waitq_unlock(&turnstile->ts_waitq);
+               return;
+       }
+
+       waitq_lock(&inheritor_turnstile->ts_waitq);
+
+       needs_update = turnstile_update_turnstile_promotion_locked(
+               inheritor_turnstile, turnstile);
+
+       /*
+        * Check if the turnstile needs further priority propagation.
+        * Since the first hop priority update was already done in
+        * turnstile_update_inheritor, do not bail out on the first
+        * update even though needs_update evaluates to false in
+        * that case.
+        */
+       if (!needs_update && !first_update) {
+               /* Update turnstile stats before returning */
+               turnstile_stats_update(total_hop + 1,
+                       (inheritor_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) |
+                       TSU_TURNSTILE_ARG | tsu_flags,
+                       turnstile);
+               waitq_unlock(&inheritor_turnstile->ts_waitq);
+               waitq_unlock(&turnstile->ts_waitq);
+               return;
+       }
+
+       /* Unlock the outer turnstile and update the inner turnstile */
+       waitq_unlock(&turnstile->ts_waitq);
+       *in_out_turnstile = inheritor_turnstile;
+       return;
+}
+
+/*
+ * Name: thread_update_waiting_turnstile_priority_chain
+ *
+ * Description: Helper function to update thread's waiting
+ *              turnstile priority.
+ *
+ * Arg1: in_thread: pointer to thread
+ * Arg2: out_turnstile: pointer to turnstile to return to caller
+ * Arg3: thread_hop: Number of thread hops visited
+ * Arg4: total_hop: total hops visited
+ * Arg5: tsu_flags: turnstile update flags
+ *
+ * Returns: *out_turnstile returns the inheritor if it needs further propagation.
+ *
+ * Condition: *in_thread locked on entry, unlocked on exit and set to NULL.
+ */
+static void
+thread_update_waiting_turnstile_priority_chain(
+       thread_t *in_thread,
+       struct turnstile **out_turnstile,
+       int thread_hop,
+       int total_hop,
+       turnstile_stats_update_flags_t tsu_flags)
+{
+       boolean_t needs_update = FALSE;
+       thread_t thread = *in_thread;
+       struct turnstile *waiting_turnstile = TURNSTILE_NULL;
+       uint32_t turnstile_gencount;
+       boolean_t first_update = !total_hop;
+
+       *in_thread = THREAD_NULL;
+
+       /* Check if thread waiting on a turnstile */
+       /* Check if the thread is waiting on a turnstile */
+
+       if (waiting_turnstile == TURNSTILE_NULL || thread_hop > turnstile_max_hop) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS,
+                        (waiting_turnstile ? TURNSTILE_UPDATE_STOPPED_BY_LIMIT : THREAD_NOT_WAITING_ON_TURNSTILE)
+                       )) | DBG_FUNC_NONE,
+                       thread_tid(thread),
+                       turnstile_max_hop,
+                       thread_hop,
+                       VM_KERNEL_UNSLIDE_OR_PERM(waiting_turnstile), 0);
+               turnstile_stats_update(total_hop + 1, TSU_NO_TURNSTILE |
+                       TSU_THREAD_ARG | tsu_flags, thread);
+               thread_unlock(thread);
+               return;
+       }
+
+       /* Check if the thread needs to update the waiting turnstile */
+       needs_update = turnstile_need_thread_promotion_update(waiting_turnstile, thread);
+       if (!needs_update && !first_update) {
+               turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_THREAD_ARG | tsu_flags, thread);
+               thread_unlock(thread);
+               return;
+       }
+
+       /* take a reference on thread, turnstile and snapshot of gencount */
+       turnstile_gencount = turnstile_get_gencount(waiting_turnstile);
+       turnstile_reference(waiting_turnstile);
+       thread_reference(thread);
+
+       /* drop the thread lock and acquire the turnstile lock */
+       thread_unlock(thread);
+       waitq_lock(&waiting_turnstile->ts_waitq);
+       thread_lock(thread);
+
+       /* Check if the gencount matches and thread is still waiting on same turnstile */
+       if (turnstile_gencount != turnstile_get_gencount(waiting_turnstile) ||
+           waiting_turnstile != thread_get_waiting_turnstile(thread)) {
+               turnstile_stats_update(total_hop + 1, TSU_NO_PRI_CHANGE_NEEDED |
+                       TSU_THREAD_ARG | tsu_flags, thread);
+               /* No updates required, bail out */
+               thread_unlock(thread);
+               waitq_unlock(&waiting_turnstile->ts_waitq);
+               thread_deallocate_safe(thread);
+               turnstile_deallocate_safe(waiting_turnstile);
+               return;
+       }
+
+       /*
+        * The thread is waiting on the waiting_turnstile and we hold the thread
+        * lock; we can drop the thread and turnstile references since the thread
+        * is on the waitq and cannot be removed from it without the thread lock.
+        */
+       thread_deallocate_safe(thread);
+       turnstile_deallocate_safe(waiting_turnstile);
+
+       /* adjust thread's position on turnstile waitq */
+       needs_update = turnstile_update_thread_promotion_locked(waiting_turnstile, thread);
+
+       /*
+        * Check if the thread needs further priority propagation.
+        * Since the first hop priority update was already done in
+        * turnstile_update_inheritor, do not bail out on the first
+        * update even though needs_update evaluates to false in
+        * that case.
+        */
+       if (!needs_update && !first_update) {
+               turnstile_stats_update(total_hop + 1,
+                       (waiting_turnstile->ts_inheritor ? TSU_NO_PRI_CHANGE_NEEDED : TSU_NO_INHERITOR) |
+                       TSU_THREAD_ARG | tsu_flags, thread);
+               thread_unlock(thread);
+               waitq_unlock(&waiting_turnstile->ts_waitq);
+               return;
+       }
+
+       /* drop the thread lock and update the turnstile */
+       thread_unlock(thread);
+       *out_turnstile = waiting_turnstile;
+}
+
+/*
+ * Name: turnstile_stats_update
+ *
+ * Description: Function to update turnstile stats for dev kernel.
+ *
+ * Arg1: hops : number of thread hops in priority propagation
+ * Arg2: flags : turnstile stats update flags
+ * Arg3: inheritor: inheritor
+ *
+ * Returns: Nothing
+ */
+void
+turnstile_stats_update(
+       int hop __assert_only,
+       turnstile_stats_update_flags_t flags __assert_only,
+       turnstile_inheritor_t inheritor __assert_only)
+{
+#if DEVELOPMENT || DEBUG
+       if (flags & TSU_TURNSTILE_BLOCK_COUNT) {
+               os_atomic_inc(&thread_block_on_turnstile_count, relaxed);
+       }
+
+       if (flags & TSU_REGULAR_WAITQ_BLOCK_COUNT) {
+               os_atomic_inc(&thread_block_on_regular_waitq_count, relaxed);
+       }
+
+       if (hop > TURNSTILE_MAX_HOP_DEFAULT || hop == 0) {
+               return;
+       }
+
+       assert(hop >= 0);
+
+       /*
+        * Check if the turnstile stats need to be updated.
+        * Bail out if the turnstile or thread does not have any
+        * user promotion, i.e. priority at or below MAXPRI_THROTTLE.
+        * Also bail out on the first hop of a WQ turnstile, since the
+        * workqueue's use of a turnstile for the admission check
+        * introduces a lot of noise due to state changes.
+        */
+       if (flags & TSU_TURNSTILE_ARG) {
+               struct turnstile *ts = (struct turnstile *)inheritor;
+               if (ts->ts_priority <= MAXPRI_THROTTLE) {
+                       return;
+               }
+
+               if (hop == 1 && turnstile_get_type(ts) == TURNSTILE_WORKQS) {
+                       return;
+               }
+       } else if (flags & TSU_THREAD_ARG) {
+               thread_t thread = (thread_t)inheritor;
+               if (thread->user_promotion_basepri <= MAXPRI_THROTTLE) {
+                       return;
+               }
+       } else {
+               assert(inheritor == NULL);
+       }
+
+       struct turnstile_stats *turnstile_stats;
+       if (flags & TSU_BOOST_ARG) {
+               turnstile_stats = turnstile_boost_stats;
+       } else {
+               turnstile_stats = turnstile_unboost_stats;
+       }
+
+       if (flags & TSU_PRI_PROPAGATION) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_priority_propagation, relaxed);
+       }
+
+       if (flags & TSU_NO_INHERITOR) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_no_inheritor, relaxed);
+       }
+
+       if (flags & TSU_NO_TURNSTILE) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_no_turnstile, relaxed);
+       }
+
+       if (flags & TSU_NO_PRI_CHANGE_NEEDED) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_no_priority_change_required, relaxed);
+       }
+
+       if (flags & TSU_THREAD_RUNNABLE) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_thread_runnable, relaxed);
+       }
+
+       if (flags & TSU_ABOVE_UI_PRI_CHANGE) {
+               os_atomic_inc(&turnstile_stats[hop - 1].ts_above_ui_pri_change, relaxed);
+       }
+#endif
+}
+
+
+#if DEVELOPMENT || DEBUG
+
+int sysctl_io_opaque(void *req, void *pValue, size_t valueSize, int *changed);
+
+/*
+ * Name: turnstile_get_boost_stats_sysctl
+ *
+ * Description: Function to get turnstile boost stats.
+ *
+ * Args: req : opaque struct to pass to sysctl_io_opaque
+ *
+ * Returns: errno
+ */
+int
+turnstile_get_boost_stats_sysctl(
+       void *req)
+{
+       return sysctl_io_opaque(req, turnstile_boost_stats, sizeof (struct turnstile_stats) * TURNSTILE_MAX_HOP_DEFAULT, NULL);
+}
+
+/*
+ * Name: turnstile_get_unboost_stats_sysctl
+ *
+ * Description: Function to get turnstile unboost stats.
+ *
+ * Args: req : opaque struct to pass to sysctl_io_opaque
+ *
+ * Returns: errno
+ */
+int
+turnstile_get_unboost_stats_sysctl(
+       void *req)
+{
+       return sysctl_io_opaque(req, turnstile_unboost_stats, sizeof (struct turnstile_stats) * TURNSTILE_MAX_HOP_DEFAULT, NULL);
+}
+
+/* Testing interface for Development kernels */
+#define        tstile_test_prim_lock_interlock(test_prim) \
+       lck_spin_lock(&test_prim->ttprim_interlock)
+#define        tstile_test_prim_unlock_interlock(test_prim) \
+       lck_spin_unlock(&test_prim->ttprim_interlock)
+
+static void
+tstile_test_prim_init(struct tstile_test_prim **test_prim_ptr)
+{
+       struct tstile_test_prim *test_prim = (struct tstile_test_prim *) kalloc(sizeof(struct tstile_test_prim));
+
+       test_prim->ttprim_turnstile = TURNSTILE_NULL;
+       test_prim->ttprim_owner = NULL;
+       lck_spin_init(&test_prim->ttprim_interlock, &turnstiles_dev_lock_grp, &turnstiles_dev_lock_attr);
+       test_prim->tt_prim_waiters = 0;
+
+       *test_prim_ptr = test_prim;
+       return;
+}
+
+int
+tstile_test_prim_lock(boolean_t use_hashtable)
+{
+       struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline;
+lock_start:
+       /* take the interlock of the primitive */
+       tstile_test_prim_lock_interlock(test_prim);
+
+       /* Check if the lock is available */
+       if (test_prim->ttprim_owner == NULL && test_prim->tt_prim_waiters == 0) {
+               thread_reference(current_thread());
+               test_prim->ttprim_owner = current_thread();
+               tstile_test_prim_unlock_interlock(test_prim);
+               return 0;
+       }
+
+       struct turnstile *prim_turnstile = TURNSTILE_NULL;
+
+       /* primitive locked, get a turnstile */
+       prim_turnstile = turnstile_prepare((uintptr_t)test_prim,
+                       use_hashtable ? NULL : &test_prim->ttprim_turnstile,
+                       TURNSTILE_NULL, TURNSTILE_ULOCK);
+
+       assert(prim_turnstile != TURNSTILE_NULL);
+
+       /* This is the contended acquire case */
+       if (test_prim->ttprim_owner == NULL) {
+               thread_reference(current_thread());
+               test_prim->ttprim_owner = current_thread();
+
+               /* Update the turnstile owner */
+               turnstile_update_inheritor(prim_turnstile,
+                               current_thread(),
+                               (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+               turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD);
+
+               turnstile_complete((uintptr_t)test_prim,
+                       use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+
+               tstile_test_prim_unlock_interlock(test_prim);
+
+               turnstile_cleanup();
+
+               return 0;
+       }
+
+       test_prim->tt_prim_waiters++;
+       turnstile_update_inheritor(prim_turnstile,
+                               test_prim->ttprim_owner,
+                               (TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       waitq_assert_wait64(&prim_turnstile->ts_waitq,
+               CAST_EVENT64_T(test_prim), THREAD_ABORTSAFE,
+               TIMEOUT_WAIT_FOREVER);
+
+       /* drop the interlock */
+       tstile_test_prim_unlock_interlock(test_prim);
+
+       turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_NOT_HELD);
+
+       wait_result_t result;
+       result = thread_block(THREAD_CONTINUE_NULL);
+
+       /* re-acquire the interlock to get turnstile back */
+       tstile_test_prim_lock_interlock(test_prim);
+       test_prim->tt_prim_waiters--;
+       turnstile_complete((uintptr_t)test_prim,
+               use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+
+       tstile_test_prim_unlock_interlock(test_prim);
+
+       turnstile_cleanup();
+
+       /* Return if thread interrupted */
+       if (result == THREAD_INTERRUPTED) {
+               return 1;
+       }
+
+       goto lock_start;
+}
+
+int
+tstile_test_prim_unlock(boolean_t use_hashtable)
+{
+
+       struct tstile_test_prim *test_prim = use_hashtable ? test_prim_global_htable : test_prim_ts_inline;
+       /* take the interlock of the primitive */
+       tstile_test_prim_lock_interlock(test_prim);
+
+       if (test_prim->ttprim_owner == NULL) {
+               tstile_test_prim_unlock_interlock(test_prim);
+               return 1;
+       }
+
+       /* Check if the lock is contended */
+       if (test_prim->ttprim_owner != NULL && test_prim->tt_prim_waiters == 0) {
+               /* lock is not contended */
+               thread_t old_owner = test_prim->ttprim_owner;
+               test_prim->ttprim_owner = NULL;
+               tstile_test_prim_unlock_interlock(test_prim);
+
+               thread_deallocate(old_owner);
+               return 0;
+       }
+
+       struct turnstile *prim_turnstile = TURNSTILE_NULL;
+
+       thread_t old_owner = test_prim->ttprim_owner;
+       test_prim->ttprim_owner = NULL;
+
+       /* lock is contended, get the turnstile to wake a waiter */
+       prim_turnstile = turnstile_prepare((uintptr_t)test_prim,
+                       use_hashtable ? NULL : &test_prim->ttprim_turnstile,
+                       TURNSTILE_NULL, TURNSTILE_ULOCK);
+
+       assert(prim_turnstile != TURNSTILE_NULL);
+
+       /* Update the turnstile owner */
+       turnstile_update_inheritor(prim_turnstile,
+                       NULL,
+                       (TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD));
+
+       waitq_wakeup64_one(&prim_turnstile->ts_waitq,
+               CAST_EVENT64_T(test_prim),
+               THREAD_AWAKENED, WAITQ_SELECT_MAX_PRI);
+
+       turnstile_update_inheritor_complete(prim_turnstile, TURNSTILE_INTERLOCK_HELD);
+
+       turnstile_complete((uintptr_t)test_prim,
+               use_hashtable ? NULL : &test_prim->ttprim_turnstile, NULL);
+
+       tstile_test_prim_unlock_interlock(test_prim);
+
+       turnstile_cleanup();
+
+       if (old_owner) {
+               /* Use thread_deallocate_safe here to exercise that code path */
+               thread_deallocate_safe(old_owner);
+       }
+
+       return 0;
+}
+
+#endif
diff --git a/osfmk/kern/turnstile.h b/osfmk/kern/turnstile.h
new file mode 100644 (file)
index 0000000..f8f9ebe
--- /dev/null
@@ -0,0 +1,686 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _TURNSTILE_H_
+#define _TURNSTILE_H_
+
+#include <mach/mach_types.h>
+#include <mach/kern_return.h>
+#include <sys/cdefs.h>
+
+#if PRIVATE
+#define TURNSTILE_MAX_HOP_DEFAULT (10)
+struct turnstile_stats {
+       uint64_t ts_priority_propagation;
+       uint64_t ts_no_inheritor;
+       uint64_t ts_thread_runnable;
+       uint64_t ts_no_priority_change_required;
+       uint64_t ts_above_ui_pri_change;
+       uint64_t ts_no_turnstile;
+};
+#endif
+
+#ifdef KERNEL_PRIVATE
+#include <kern/queue.h>
+#include <sys/queue.h>
+#include <kern/waitq.h>
+#include <kern/priority_queue.h>
+#include <os/refcnt.h>
+#include <kern/assert.h>
+#include <kern/kern_types.h>
+
+/*
+ * turnstile_type_t : Indicates the type of primitive the turnstile is associated with
+ *                    Please populate turnstile_promote_policy array if a new type is added here.
+ */
+typedef enum __attribute__((packed)) turnstile_type {
+       TURNSTILE_NONE = 0,
+       TURNSTILE_KERNEL_MUTEX = 1,
+       TURNSTILE_ULOCK = 2,
+       TURNSTILE_PTHREAD_MUTEX = 3,
+       TURNSTILE_SYNC_IPC = 4,
+       TURNSTILE_WORKLOOPS = 5,
+       TURNSTILE_WORKQS = 6,
+       TURNSTILE_KNOTE = 7,
+       TURNSTILE_TOTAL_TYPES = 8,
+} turnstile_type_t;
+
+/*
+ * For each type of turnstile, the following are the types of
+ * inheritors passed:
+ *
+ * TURNSTILE_KERNEL_MUTEX
+ *    Interlock: kernel mutex interlock.
+ *    Inheritor: threads.
+ *    Lock order: turnstile lock, thread lock.
+ *
+ * TURNSTILE_ULOCK
+ *    Interlock: ulocks interlock.
+ *    Inheritor: threads.
+ *    Lock order: turnstile lock, thread lock.
+ *
+ * TURNSTILE_PTHREAD_MUTEX
+ *    Interlock: pthread mtx interlock.
+ *    Inheritor: threads.
+ *    Lock order: turnstile lock, thread lock.
+ *
+ * TURNSTILE_SYNC_IPC
+ *    Interlock: port's mqueue lock
+ *    Inheritor: turnstile (of the port in which we are enqueued) or WL turnstile.
+ *    Lock order: Our turnstile, then turnstile of the port we are enqueued in.
+ *                Port circularity will make sure there is never a cycle formation
+ *                and lock order is maintained.
+ *
+ * TURNSTILE_WORKLOOPS
+ *    Interlock:
+ *    - kq req lock
+ *    - wq lock when "filt_wlworkq_interlock_needed() is true"
+ *    Inheritor: thread, turnstile (of workq)
+ *    Lock order: turnstile lock, thread lock
+ *                WL turnstile lock, Workq turnstile lock
+ *
+ * TURNSTILE_WORKQS
+ *    Interlock: workqueue lock
+ *    Inheritor: thread
+ *    Lock order: turnstile lock, thread lock.
+ *
+ * TURNSTILE_KNOTE
+ *    Interlock: the knote lock
+ *    Inheritor: WL turnstile
+ */
+
+typedef enum __attribute__((flag_enum)) turnstile_promote_policy {
+       TURNSTILE_PROMOTE_NONE = 0,
+       TURNSTILE_KERNEL_PROMOTE = 0x1,
+       TURNSTILE_USER_PROMOTE = 0x2,
+       TURNSTILE_USER_IPC_PROMOTE = 0x4,
+} turnstile_promote_policy_t;
+
+/*
+ * Turnstile state flags
+ *
+ * The turnstile state flags represent the current ownership of a turnstile.
+ * The supported flags are:
+ * - TURNSTILE_STATE_THREAD    : Turnstile is attached to a thread
+ * - TURNSTILE_STATE_FREELIST  : Turnstile is hanging off the freelist of another turnstile
+ * - TURNSTILE_STATE_HASHTABLE : Turnstile is in the global hash table as the turnstile for a primitive
+ * - TURNSTILE_STATE_PROPRIETOR : Turnstile is attached to a proprietor
+ *
+ * The flag updates are done while holding the primitive interlock.
+ */
+
+#define TURNSTILE_STATE_THREAD         0x1
+#define TURNSTILE_STATE_FREELIST       0x2
+#define TURNSTILE_STATE_HASHTABLE      0x4
+#define TURNSTILE_STATE_PROPRIETOR     0x8
+
+/* Helper macros to set/unset turnstile state flags */
+#if DEVELOPMENT || DEBUG
+
+#define turnstile_state_init(ts, state)         \
+MACRO_BEGIN                                     \
+               ts->ts_state = state;           \
+MACRO_END
+
+#define turnstile_state_add(ts, state)          \
+MACRO_BEGIN                                     \
+               assert((ts->ts_state & (state)) == 0);  \
+               ts->ts_state |= state;                  \
+MACRO_END
+
+#define turnstile_state_remove(ts, state)       \
+MACRO_BEGIN                                     \
+               assert(ts->ts_state & (state));         \
+               ts->ts_state &= ~(state);               \
+MACRO_END
+
+#else  /* DEVELOPMENT || DEBUG */
+
+#define turnstile_state_init(ts, state)         \
+MACRO_BEGIN                                     \
+               (void)ts;                       \
+MACRO_END
+
+#define turnstile_state_add(ts, state)          \
+MACRO_BEGIN                                     \
+               (void)ts;                       \
+MACRO_END
+
+#define turnstile_state_remove(ts, state)       \
+MACRO_BEGIN                                     \
+               (void)ts;                       \
+MACRO_END
+
+#endif /* DEVELOPMENT || DEBUG */
+
+/* Forward declaration of turnstile */
+struct turnstile;
+
+/*
+ * Turnstile update flags
+ *
+ * TURNSTILE_IMMEDIATE_UPDATE
+ *    When passed to turnstile_update_inheritor
+ *    update the inheritor of the turnstile in
+ *    the same call.
+ *
+ * TURNSTILE_DELAYED_UPDATE
+ *    When passed to turnstile_update_inheritor
+ *    it stashes the inheritor on the thread and the
+ *    turnstile's inheritor is updated later in
+ *    assert_wait.
+ *
+ * TURNSTILE_INHERITOR_THREAD
+ *    The turnstile inheritor is of type thread.
+ *
+ * TURNSTILE_INHERITOR_TURNSTILE
+ *    The turnstile inheritor is of type turnstile.
+ *
+ * TURNSTILE_INHERITOR_WORKQ
+ *    The turnstile inheritor is of type workqueue
+ *
+ * TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE
+ *    The inheritor needs a chain priority update.
+ *
+ * TURNSTILE_NEEDS_PRI_UPDATE
+ *    Current turnstile needs a chain priority update.
+ *
+ * Locking order for passing thread and turnstile as inheritor
+ *
+ * Thread as an inheritor:
+ *    When thread is passed as an inheritor of a turnstile
+ *    turnstile lock is taken and then thread lock.
+ *
+ * Turnstile as an inheritor:
+ *    When turnstile (T1) is passed as an inheritor of
+ *    a turnstile (T2), turnstile lock of T2 is taken
+ *    and then turnstile lock of T1 is taken.
+ *
+ * Caution: While passing a turnstile as an inheritor, it is
+ *    the adopter's job to make sure that there is no
+ *    lock inversion.
+ */
+typedef enum __attribute__((flag_enum)) __attribute__((packed)) turnstile_update_flags {
+       TURNSTILE_UPDATE_FLAGS_NONE = 0,
+       TURNSTILE_IMMEDIATE_UPDATE = 0x1,
+       TURNSTILE_DELAYED_UPDATE = 0x2,
+       TURNSTILE_INHERITOR_THREAD = 0x4,
+       TURNSTILE_INHERITOR_TURNSTILE = 0x8,
+       TURNSTILE_INHERITOR_NEEDS_PRI_UPDATE = 0x10,
+       TURNSTILE_NEEDS_PRI_UPDATE = 0x20,
+       TURNSTILE_INHERITOR_WORKQ = 0x40,
+       TURNSTILE_UPDATE_BOOST = 0x80,
+} turnstile_update_flags_t;
+
+#define TURNSTILE_NULL ((struct turnstile *)0)
+
+typedef void * turnstile_inheritor_t;
+
+#define TURNSTILE_INHERITOR_NULL NULL
+
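+/*
+ * Illustrative sketch (not part of the original interface comments; "prim"
+ * and "owner_thread" are hypothetical): the two update modes above differ
+ * only in when the new inheritor is applied.
+ *
+ *    turnstile_update_inheritor(ts, owner_thread,
+ *        TURNSTILE_IMMEDIATE_UPDATE | TURNSTILE_INHERITOR_THREAD);
+ *        -> the inheritor is switched within this call.
+ *
+ *    turnstile_update_inheritor(ts, owner_thread,
+ *        TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
+ *    waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(prim),
+ *        THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
+ *        -> the inheritor is stashed on the current thread and applied
+ *           during assert wait, as in the contended-lock path of the test
+ *           primitive in osfmk/kern/turnstile.c.
+ */
+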
+#ifdef XNU_KERNEL_PRIVATE
+
+/* Turnstile stats update flags
+ *
+ * TSU_TURNSTILE_BLOCK_COUNT
+ *    thread blocking on turnstile waitq, increment global
+ *    thread block on turnstile count.
+ *
+ * TSU_REGULAR_WAITQ_BLOCK_COUNT
+ *    thread blocking on regular waitq, increment global
+ *    thread block on regular waitq count.
+ *
+ * TSU_PRI_PROPAGATION
+ *    turnstile propagation update stopped at nth hop, update
+ *    priority change count for nth element in stats array.
+ *
+ * TSU_NO_INHERITOR
+ *    turnstile propagation update stopped due to turnstile
+ *    not having an inheritor after nth hop, update the no
+ *    inheritor count for nth element in the stats array.
+ *
+ * TSU_NO_TURNSTILE
+ *    turnstile propagation update stopped due to thread
+ *    not blocked on a turnstile waitq after nth hop, update
+ *    the no turnstile count for the nth element in the stats
+ *    array.
+ *
+ * TSU_NO_PRI_CHANGE_NEEDED
+ *    turnstile propagation update stopped due to thread or
+ *    turnstile having the correct priority or not blocked.
+ *    update the no priority change count for the nth element
+ *    in the stats array.
+ *
+ * TSU_THREAD_RUNNABLE
+ *    turnstile propagation update stopped due to thread
+ *    being runnable, update the thread runnable count for
+ *    the nth element in the stats array.
+ *
+ * TSU_ABOVE_UI_PRI_CHANGE
+ *    turnstile propagation caused an above UI priority change.
+ */
+typedef enum __attribute__((flag_enum)) turnstile_stats_update_flags {
+       TSU_FLAGS_NONE = 0,
+       TSU_TURNSTILE_BLOCK_COUNT = 0x1,
+       TSU_REGULAR_WAITQ_BLOCK_COUNT = 0x2,
+       TSU_PRI_PROPAGATION = 0x4,
+       TSU_NO_INHERITOR = 0x8,
+       TSU_NO_TURNSTILE = 0x10,
+       TSU_NO_PRI_CHANGE_NEEDED = 0x20,
+       TSU_THREAD_RUNNABLE = 0x40,
+       TSU_ABOVE_UI_PRI_CHANGE = 0x80,
+       TSU_THREAD_ARG = 0x100,
+       TSU_TURNSTILE_ARG = 0x200,
+       TSU_BOOST_ARG = 0x400,
+} turnstile_stats_update_flags_t;
+
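+/*
+ * Illustrative combination (as used by the propagation helpers in
+ * osfmk/kern/turnstile.c): a hop that required no priority change on a
+ * turnstile argument is recorded with
+ *
+ *    turnstile_stats_update(total_hop + 1,
+ *        TSU_NO_PRI_CHANGE_NEEDED | TSU_TURNSTILE_ARG | tsu_flags, turnstile);
+ */
+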
+SLIST_HEAD(turnstile_list, turnstile);
+
+struct turnstile {
+       struct waitq                  ts_waitq;              /* waitq embedded in turnstile */
+       turnstile_inheritor_t         ts_inheritor;          /* thread/turnstile inheriting the priority (IL, WL) */
+       union {
+               struct turnstile_list ts_free_turnstiles;    /* turnstile free list (IL) */
+               SLIST_ENTRY(turnstile) ts_free_elm;          /* turnstile free list element (IL) */
+       };
+       struct priority_queue         ts_inheritor_queue;    /* Queue of turnstiles with us as an inheritor (WL) */
+       union {
+               struct priority_queue_entry ts_inheritor_links;    /* Inheritor queue links */
+               queue_chain_t         ts_deallocate_link;    /* thread deallocate link */
+       };
+       SLIST_ENTRY(turnstile)        ts_htable_link;        /* linkage for turnstile in global hash table */
+       uintptr_t                     ts_proprietor;         /* hash key lookup turnstile (IL) */
+       os_refcnt_t                   ts_refcount;           /* reference count for turnstiles */
+       _Atomic uint32_t              ts_type_gencount;      /* gen count used for priority chaining (IL), type of turnstile (IL) */
+       uint32_t                      ts_port_ref;           /* number of explicit refs from ports on send turnstile */
+       turnstile_update_flags_t      ts_inheritor_flags;    /* flags for turnstile inheritor (IL, WL) */
+       uint8_t                       ts_priority;           /* priority of turnstile (WL) */
+
+#if DEVELOPMENT || DEBUG
+       uint8_t                       ts_state;              /* current state of turnstile (IL) */
+       queue_chain_t                 ts_global_elm;         /* global turnstile chain */
+       thread_t                      ts_thread;             /* thread the turnstile is attached to */
+       thread_t                      ts_prev_thread;        /* thread the turnstile was attached before donation */
+#endif
+};
+
+#define waitq_to_turnstile(waitq) __container_of(waitq, struct turnstile, ts_waitq)
+
+/* IL - interlock, WL - turnstile lock i.e. waitq lock */
+
+#define TURNSTILE_PROPRIETOR_NULL 0
+
+/*
+ * Name: turnstiles_init
+ *
+ * Description: Initialize turnstile sub system.
+ *
+ * Args: None.
+ *
+ * Returns: None.
+ */
+void
+turnstiles_init(void);
+
+/*
+ * Name: turnstile_alloc
+ *
+ * Description: Allocate a turnstile.
+ *
+ * Args: None.
+ *
+ * Returns:
+ *   turnstile on Success.
+ */
+struct turnstile *
+turnstile_alloc(void);
+
+/*
+ * Name: turnstile_destroy
+ *
+ * Description: Deallocates the turnstile.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_destroy(struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_reference
+ *
+ * Description: Take a reference on the turnstile.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_reference(struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_deallocate
+ *
+ * Description: Drop a reference on the turnstile.
+ *              Destroy the turnstile if the last ref.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_deallocate(struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_deallocate_safe
+ *
+ * Description: Drop a reference on the turnstile safely without triggering zfree.
+ *
+ * Arg1: turnstile
+ *
+ * Returns: None.
+ */
+void
+turnstile_deallocate_safe(struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_recompute_priority_locked
+ *
+ * Description: Update turnstile priority based
+ *              on highest waiter thread and highest blocking
+ *              turnstile.
+ *
+ * Args: turnstile
+ *
+ * Returns: TRUE: if the turnstile priority changed and needs propagation.
+ *          FALSE: if the turnstile priority did not change or it does not need propagation.
+ *
+ * Condition: turnstile locked
+ */
+boolean_t
+turnstile_recompute_priority_locked(
+       struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_recompute_priority
+ *
+ * Description: Update turnstile priority based
+ *              on highest waiter thread and highest blocking
+ *              turnstile.
+ *
+ * Args: turnstile
+ *
+ * Returns: TRUE: if the turnstile priority changed and needs propagation.
+ *          FALSE: if the turnstile priority did not change or it does not need propagation.
+ */
+boolean_t
+turnstile_recompute_priority(
+       struct turnstile *turnstile);
+
+/*
+ * Name: turnstile_workq_proprietor_of_max_turnstile
+ *
+ * Description: Returns the highest priority and proprietor of a turnstile
+ *              pushing on a workqueue turnstile.
+ *
+ *              This will not return waiters that are at priority
+ *              MAXPRI_THROTTLE or lower.
+ *
+ * Args: turnstile
+ *
+ * Returns:
+ *    Priority of the max entry, or 0
+ *    Pointer to the max entry proprietor
+ */
+int
+turnstile_workq_proprietor_of_max_turnstile(
+       struct turnstile *turnstile,
+       uintptr_t *proprietor);
+
+/*
+ * Name: turnstile_cleanup
+ *
+ * Description: Update priority of a turnstile inheritor
+ *              if needed.
+ *
+ * Args: inheritor and flags passed on thread struct.
+ *
+ * Returns: None.
+ */
+void
+turnstile_cleanup(void);
+
+/*
+ * Name: turnstile_update_inheritor_locked
+ *
+ * Description: Update the inheritor of the turnstile and boost the
+ *              inheritor, called with turnstile locked.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Implicit arg: new inheritor value is stashed in current thread's struct
+ *
+ * Returns:
+ *   old inheritor reference is returned on current thread's struct.
+ */
+void
+turnstile_update_inheritor_locked(struct turnstile *turnstile);
+
+/*
+ * Name: thread_get_inheritor_turnstile_priority
+ *
+ * Description: Get the max priority of all the inheritor turnstiles
+ *
+ * Arg1: thread
+ *
+ * Returns: Max priority of all the inheritor turnstiles.
+ *
+ * Condition: thread locked
+ */
+int
+thread_get_inheritor_turnstile_priority(thread_t thread);
+
+/*
+ * Name: thread_get_waiting_turnstile
+ *
+ * Description: Get the turnstile if the thread is waiting on a turnstile.
+ *
+ * Arg1: thread
+ *
+ * Returns: turnstile: if the thread is blocked on a turnstile.
+ *          TURNSTILE_NULL: otherwise.
+ *
+ * Condition: thread locked.
+ */
+struct turnstile *
+thread_get_waiting_turnstile(thread_t thread);
+
+/*
+ * Name: turnstile_lookup_by_proprietor
+ *
+ * Description: Get turnstile for a proprietor from global
+ *              turnstile hash.
+ *
+ * Arg1: proprietor
+ *
+ * Returns: turnstile: if the proprietor has a turnstile.
+ *          TURNSTILE_NULL: otherwise.
+ *
+ * Condition: proprietor interlock held.
+ */
+struct turnstile *
+turnstile_lookup_by_proprietor(uintptr_t proprietor);
+
+/*
+ * Name: turnstile_stats_update
+ *
+ * Description: Function to update turnstile stats for dev kernel.
+ *
+ * Arg1: hops : number of thread hops in priority propagation
+ * Arg2: flags : turnstile stats update flags
+ * Arg3: inheritor: inheritor
+ *
+ * Returns: Nothing
+ */
+void
+turnstile_stats_update(
+       int hop __assert_only,
+       turnstile_stats_update_flags_t flags __assert_only,
+       turnstile_inheritor_t inheritor __assert_only);
+
+#if DEVELOPMENT || DEBUG
+
+/* Functions used by debug test primitive exported by sysctls */
+int
+tstile_test_prim_lock(boolean_t use_hashtable);
+
+int
+tstile_test_prim_unlock(boolean_t use_hashtable);
+
+int
+turnstile_get_boost_stats_sysctl(void *req);
+int
+turnstile_get_unboost_stats_sysctl(void *req);
+#endif /* DEVELOPMENT || DEBUG */
+#endif /* XNU_KERNEL_PRIVATE */
+
+/* Interface */
+
+/*
+ * Name: turnstile_prepare
+ *
+ * Description: Transfer the current thread's turnstile to the primitive or its free turnstile list.
+ *              Function is called holding the interlock (spinlock) of the primitive.
+ *              The turnstile returned by this function is safe to use until the thread calls turnstile_complete.
+ *              When no turnstile is provided explicitly, the calling thread will not have a turnstile attached to
+ *              it until it calls turnstile_complete.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: pointer in primitive struct to store turnstile
+ *   Arg3: turnstile to use instead of taking it from thread.
+ *   Arg4: type of primitive
+ *
+ * Returns:
+ *   turnstile.
+ */
+struct turnstile *
+turnstile_prepare(
+       uintptr_t proprietor,
+       struct turnstile **tstore,
+       struct turnstile *turnstile,
+       turnstile_type_t type);
+
+/*
+ * Name: turnstile_complete
+ *
+ * Description: Transfer the primitive's turnstile, or one from its freelist, to the current thread.
+ *              Function is called holding the interlock (spinlock) of the primitive.
+ *              Current thread will have a turnstile attached to it after this call.
+ *
+ * Args:
+ *   Arg1: proprietor
+ *   Arg2: pointer in primitive struct to update turnstile
+ *   Arg3: pointer to store the returned turnstile instead of attaching it to thread
+ *
+ * Returns:
+ *   None.
+ */
+void
+turnstile_complete(
+       uintptr_t proprietor,
+       struct turnstile **tstore,
+       struct turnstile **turnstile);
+
+/*
+ * Name: turnstile_update_inheritor
+ *
+ * Description: Update the inheritor of the turnstile and boost the
+ *              inheritor. It will take a thread reference on the inheritor.
+ *              Called with the interlock of the primitive held.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: inheritor
+ *   Arg3: flags - TURNSTILE_DELAYED_UPDATE - update will happen later in assert_wait
+ *
+ * Returns:
+ *   old inheritor reference is stashed on current thread's struct.
+ */
+void
+turnstile_update_inheritor(
+       struct turnstile *turnstile,
+       turnstile_inheritor_t new_inheritor,
+       turnstile_update_flags_t flags);
+
+typedef enum turnstile_update_complete_flags {
+       TURNSTILE_INTERLOCK_NOT_HELD = 0x1,
+       TURNSTILE_INTERLOCK_HELD = 0x2,
+} turnstile_update_complete_flags_t;
+
+/*
+ * Name: turnstile_update_inheritor_complete
+ *
+ * Description: Update turnstile inheritor's priority and propagate the
+ *              priority if the inheritor is blocked on a turnstile.
+ *              Consumes thread ref of old inheritor returned by
+ *              turnstile_update_inheritor. Recursive priority update
+ *              will only happen when called with interlock dropped.
+ *
+ * Args:
+ *   Arg1: turnstile
+ *   Arg2: interlock held
+ *
+ * Returns: None.
+ */
+void
+turnstile_update_inheritor_complete(
+       struct turnstile *turnstile,
+       turnstile_update_complete_flags_t flags);
+
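+/*
+ * Usage sketch (not from the original header; modeled on the test primitive
+ * tstile_test_prim_lock in osfmk/kern/turnstile.c -- "my_prim", its interlock
+ * helpers and the "turnstile"/"owner" fields are hypothetical, everything
+ * else is the interface declared above).  A contended acquire typically
+ * looks like:
+ *
+ *    my_prim_interlock_lock(prim);
+ *    ts = turnstile_prepare((uintptr_t)prim, &prim->turnstile,
+ *                           TURNSTILE_NULL, TURNSTILE_ULOCK);
+ *    turnstile_update_inheritor(ts, prim->owner,
+ *        TURNSTILE_DELAYED_UPDATE | TURNSTILE_INHERITOR_THREAD);
+ *    waitq_assert_wait64(&ts->ts_waitq, CAST_EVENT64_T(prim),
+ *        THREAD_ABORTSAFE, TIMEOUT_WAIT_FOREVER);
+ *    my_prim_interlock_unlock(prim);
+ *    turnstile_update_inheritor_complete(ts, TURNSTILE_INTERLOCK_NOT_HELD);
+ *    thread_block(THREAD_CONTINUE_NULL);
+ *
+ *    my_prim_interlock_lock(prim);
+ *    turnstile_complete((uintptr_t)prim, &prim->turnstile, NULL);
+ *    my_prim_interlock_unlock(prim);
+ *    turnstile_cleanup();
+ */
+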
+#endif /* KERNEL_PRIVATE */
+#if XNU_KERNEL_PRIVATE
+
+struct workqueue;
+
+/* pthread_workqueue.c */
+extern void workq_reference(struct workqueue *wq);
+extern void workq_deallocate_safe(struct workqueue *wq);
+extern void workq_destroy(struct workqueue *wq);
+extern bool workq_is_current_thread_updating_turnstile(struct workqueue *wq);
+extern void workq_schedule_creator_turnstile_redrive(struct workqueue *wq,
+               bool locked);
+
+/* thread.c */
+extern void    workq_deallocate_enqueue(struct workqueue *wq);
+
+#endif /* XNU_KERNEL_PRIVATE */
+
+#endif /* _TURNSTILE_H_ */
diff --git a/osfmk/kern/ux_handler.c b/osfmk/kern/ux_handler.c
new file mode 100644 (file)
index 0000000..5f81a2e
--- /dev/null
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/ux_handler.h>
+#include <sys/ux_exception.h>
+
+#include <mach/exception.h>
+#include <mach/kern_return.h>
+#include <mach/port.h>
+#include <mach/mach_port.h>
+#include <mach/mig_errors.h>
+
+#include <kern/thread.h>
+#include <kern/task.h>
+#include <kern/ipc_kobject.h>
+#include <kern/ipc_tt.h>
+
+#include <ipc/ipc_port.h>
+
+#include <mach/host_priv.h>
+#include <kern/host.h>
+
+#include <mach/exc_server.h>
+#include <mach/mach_exc_server.h>
+
+#include <libkern/section_keywords.h>
+
+/*
+ * Mach kobject port to reflect Mach exceptions into Unix signals.
+ *
+ * This is the default Mach exception handler for initproc, which
+ * then filters to all subprocesses as the host level exception handler for
+ * most Mach exceptions.
+ */
+
+static const void                      *ux_handler_kobject    = NULL;
+SECURITY_READ_ONLY_LATE(ipc_port_t)     ux_handler_port       = IP_NULL;
+
+/*
+ * init is called early in Mach initialization
+ * when we can initialize read-only memory
+ */
+void
+ux_handler_init(void)
+{
+       ux_handler_port = ipc_port_alloc_kernel();
+
+       if (ux_handler_port == IP_NULL)
+               panic("can't allocate unix exception port");
+
+       ipc_kobject_set(ux_handler_port, (ipc_kobject_t)&ux_handler_kobject, IKOT_UX_HANDLER);
+}
+
+/*
+ * setup is called late in BSD initialization from initproc's context
+ * so the MAC hook goo inside host_set_exception_ports will be able to
+ * set up labels without falling over.
+ */
+void
+ux_handler_setup(void)
+{
+       ipc_port_t ux_handler_send_right = ipc_port_make_send(ux_handler_port);
+
+       if (!IP_VALID(ux_handler_send_right))
+               panic("Couldn't allocate send right for ux_handler_port!\n");
+
+       kern_return_t kr = KERN_SUCCESS;
+
+       /*
+        * Consumes 1 send right.
+        *
+        * Instruments uses the RPC_ALERT port, so don't register for that.
+        */
+       kr = host_set_exception_ports(host_priv_self(),
+                                     EXC_MASK_ALL & ~(EXC_MASK_RPC_ALERT),
+                                     ux_handler_send_right,
+                                     EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES,
+                                     0);
+
+       if (kr != KERN_SUCCESS)
+               panic("host_set_exception_ports failed to set ux_handler! %d", kr);
+}
+
+/*
+ * Is this port the ux_handler?
+ * If so, it's safe to send an exception without checking labels.
+ */
+boolean_t
+is_ux_handler_port(mach_port_t port)
+{
+       if (ux_handler_port == port)
+               return TRUE;
+       else
+               return FALSE;
+}
+
+kern_return_t
+catch_mach_exception_raise(
+                           mach_port_t                  exception_port,
+                           mach_port_t                  thread_port,
+                           mach_port_t                  task_port,
+                           exception_type_t             exception,
+                           mach_exception_data_t        code,
+                  __unused mach_msg_type_number_t       codeCnt)
+{
+       if (exception_port != ux_handler_port)
+               return KERN_FAILURE;
+
+       kern_return_t kr = KERN_SUCCESS;
+
+       thread_t    target_thread   = THREAD_NULL;
+       task_t      target_task     = TASK_NULL;
+
+       if ((target_thread = convert_port_to_thread(thread_port)) == THREAD_NULL) {
+               kr = KERN_INVALID_ARGUMENT;
+               goto out;
+       }
+
+       if ((target_task = convert_port_to_task(task_port)) == TASK_NULL) {
+               kr = KERN_INVALID_ARGUMENT;
+               goto out;
+       }
+
+       kr = handle_ux_exception(target_thread, exception, code[0], code[1]);
+
+out:
+       if (kr == KERN_SUCCESS) {
+               /*
+                * Following the MIG 'consume on success' protocol,
+                * consume references to the port arguments.
+                * (but NOT the exception_port, as the first argument is borrowed)
+                *
+                * If we return non-success, the kobject server will eat the port
+                * references for us.
+                */
+
+               ipc_port_release_send(thread_port);
+               ipc_port_release_send(task_port);
+       }
+
+       thread_deallocate(target_thread);
+       task_deallocate(target_task);
+
+       return kr;
+}
+
+kern_return_t
+catch_exception_raise(
+                      mach_port_t               exception_port,
+                      mach_port_t               thread,
+                      mach_port_t               task,
+                      exception_type_t          exception,
+                      exception_data_t          code,
+                      mach_msg_type_number_t    codeCnt)
+{
+       if (exception_port != ux_handler_port)
+               return KERN_FAILURE;
+
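+       /*
+        * Widen the 32-bit exception codes to 64-bit mach exception codes
+        * and forward to catch_mach_exception_raise().
+        */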
+       mach_exception_data_type_t big_code[EXCEPTION_CODE_MAX] = {
+               [0] = code[0],
+               [1] = code[1],
+       };
+
+       return catch_mach_exception_raise(exception_port,
+                                         thread,
+                                         task,
+                                         exception,
+                                         big_code,
+                                         codeCnt);
+}
+
+kern_return_t
+catch_exception_raise_state(
+                   __unused mach_port_t                 exception_port,
+                   __unused exception_type_t            exception,
+                   __unused const exception_data_t      code,
+                   __unused mach_msg_type_number_t      codeCnt,
+                   __unused int                        *flavor,
+                   __unused const thread_state_t        old_state,
+                   __unused mach_msg_type_number_t      old_stateCnt,
+                   __unused thread_state_t              new_state,
+                   __unused mach_msg_type_number_t     *new_stateCnt)
+{
+       return(KERN_INVALID_ARGUMENT);
+}
+
+kern_return_t
+catch_mach_exception_raise_state(
+                        __unused mach_port_t                    exception_port,
+                        __unused exception_type_t               exception,
+                        __unused const mach_exception_data_t    code,
+                        __unused mach_msg_type_number_t         codeCnt,
+                        __unused int                           *flavor,
+                        __unused const thread_state_t           old_state,
+                        __unused mach_msg_type_number_t         old_stateCnt,
+                        __unused thread_state_t                 new_state,
+                        __unused mach_msg_type_number_t        *new_stateCnt)
+{
+       return(KERN_INVALID_ARGUMENT);
+}
+
+kern_return_t
+catch_exception_raise_state_identity(
+                            __unused mach_port_t                exception_port,
+                            __unused mach_port_t                thread,
+                            __unused mach_port_t                task,
+                            __unused exception_type_t           exception,
+                            __unused exception_data_t           code,
+                            __unused mach_msg_type_number_t     codeCnt,
+                            __unused int                       *flavor,
+                            __unused thread_state_t             old_state,
+                            __unused mach_msg_type_number_t     old_stateCnt,
+                            __unused thread_state_t             new_state,
+                            __unused mach_msg_type_number_t    *new_stateCnt)
+{
+       return(KERN_INVALID_ARGUMENT);
+}
+
+kern_return_t
+catch_mach_exception_raise_state_identity(
+                                 __unused mach_port_t                   exception_port,
+                                 __unused mach_port_t                   thread,
+                                 __unused mach_port_t                   task,
+                                 __unused exception_type_t              exception,
+                                 __unused mach_exception_data_t         code,
+                                 __unused mach_msg_type_number_t        codeCnt,
+                                 __unused int                          *flavor,
+                                 __unused thread_state_t                old_state,
+                                 __unused mach_msg_type_number_t        old_stateCnt,
+                                 __unused thread_state_t                new_state,
+                                 __unused mach_msg_type_number_t       *new_stateCnt)
+{
+       return(KERN_INVALID_ARGUMENT);
+}
+
diff --git a/osfmk/kern/ux_handler.h b/osfmk/kern/ux_handler.h
new file mode 100644 (file)
index 0000000..a3c473b
--- /dev/null
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _KERN_UX_HANDLER_H_
+#define _KERN_UX_HANDLER_H_
+
+#include <mach/port.h>
+
+extern void             ux_handler_init(void);
+extern void             ux_handler_setup(void);
+extern boolean_t        is_ux_handler_port(mach_port_t port);
+
+#endif /* !defined(_KERN_UX_HANDLER_H_) */
+
index 380b3f1f92149c2f778d607470d490971d6f08b2..98ee900ba9f671828023714386c3e5e8480d03b0 100644 (file)
@@ -73,6 +73,7 @@
 #include <kern/waitq.h>
 #include <kern/zalloc.h>
 #include <kern/policy_internal.h>
+#include <kern/turnstile.h>
 
 #include <libkern/OSAtomic.h>
 #include <mach/sync_policy.h>
@@ -135,7 +136,7 @@ static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int s
 #if __arm64__
 
 #define waitq_lock_to(wq,to) \
-       (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, (uint32_t)to))
+       (hw_lock_bit_to(&(wq)->waitq_interlock, LCK_ILOCK, to))
 
 #define waitq_lock_unlock(wq) \
        (hw_unlock_bit(&(wq)->waitq_interlock, LCK_ILOCK))
@@ -146,7 +147,7 @@ static __inline__ void waitq_grab_backtrace(uintptr_t bt[NWAITQ_BTFRAMES], int s
 #else
 
 #define waitq_lock_to(wq,to) \
-       (hw_lock_to(&(wq)->waitq_interlock, (uint32_t)to))
+       (hw_lock_to(&(wq)->waitq_interlock, to))
 
 #define waitq_lock_unlock(wq) \
        (hw_lock_unlock(&(wq)->waitq_interlock))
@@ -533,7 +534,7 @@ int walk_waitq_links(int walk_type, struct waitq *waitq,
                 * invalidated before we grabbed the lock!
                 */
                if (wqset->wqset_id != link->wql_setid.id) {
-                       /*This is the bottom of the tree: just get out */
+                       /* This is the bottom of the tree: just get out */
                        if (should_unlock) {
                                waitq_set_unlock(wqset);
                        }
@@ -1390,6 +1391,8 @@ static void wq_prepost_do_post_locked(struct waitq_set *wqset,
        if (wq_is_preposted_on_set(waitq, wqset))
                return;
 
+       assert(waitqs_is_linked(wqset));
+
        /*
         * This function is called because an event is being posted to 'waitq'.
         * We need a prepost object associated with this queue. Allocate one
@@ -1683,7 +1686,7 @@ static __inline__ void waitq_stats_count_fail(struct waitq *waitq)
 
 int waitq_is_valid(struct waitq *waitq)
 {
-       return (waitq != NULL) && waitq->waitq_isvalid && ((waitq->waitq_type & ~1) == WQT_QUEUE);
+       return (waitq != NULL) && waitq->waitq_isvalid;
 }
 
 int waitq_set_is_valid(struct waitq_set *wqset)
@@ -1704,6 +1707,20 @@ int waitq_irq_safe(struct waitq *waitq)
        return waitq->waitq_irq;
 }
 
+struct waitq * waitq_get_safeq(struct waitq *waitq)
+{
+       struct waitq *safeq;
+
+       /* Check if it's a port waitq */
+       if (waitq_is_port_queue(waitq)) {
+               assert(!waitq_irq_safe(waitq));
+               safeq = ipc_port_rcv_turnstile_waitq(waitq);
+       } else {
+               safeq = global_eventq(waitq);
+       }
+       return safeq;
+}
+
 static uint32_t waitq_hash_size(void)
 {
        uint32_t hsize, queues;
@@ -1717,6 +1734,65 @@ static uint32_t waitq_hash_size(void)
        return hsize;
 }
 
+/*
+ * Since the priority ordered waitq uses basepri as the
+ * ordering key, assert that this value fits in a uint8_t.
+ */
+static_assert(MAXPRI <= UINT8_MAX);
+
+static inline void waitq_thread_insert(struct waitq *wq,
+                                       thread_t thread, boolean_t fifo)
+{
+       if (waitq_is_turnstile_queue(wq)) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_ADDED_TO_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
+                       VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
+                       thread_tid(thread),
+                       thread->base_pri, 0, 0);
+
+               turnstile_stats_update(0, TSU_TURNSTILE_BLOCK_COUNT, NULL);
+
+               /*
+                * For turnstile queues (which use priority queues),
+                * insert the thread in the heap based on its current
+                * base_pri. Note that the priority queue implementation
+                * is currently not stable, so it does not maintain FIFO
+                * order for threads at the same base_pri. Also, if the
+                * base_pri of the thread changes while it is blocked in
+                * the waitq, the thread's position should be updated in
+                * the priority queue by calling the priority queue
+                * increase/decrease operations.
+                */
+               priority_queue_entry_init(&(thread->wait_prioq_links));
+               priority_queue_insert(&wq->waitq_prio_queue,
+                               &thread->wait_prioq_links, thread->base_pri,
+                               PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       } else {
+               turnstile_stats_update(0, TSU_REGULAR_WAITQ_BLOCK_COUNT, NULL);
+               if (fifo) {
+                       enqueue_tail(&wq->waitq_queue, &thread->wait_links);
+               } else {
+                       enqueue_head(&wq->waitq_queue, &thread->wait_links);
+               }
+       }
+}
+
+static inline void waitq_thread_remove(struct waitq *wq,
+                                       thread_t thread)
+{
+       if (waitq_is_turnstile_queue(wq)) {
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
+                       (TURNSTILE_CODE(TURNSTILE_HEAP_OPERATIONS, (THREAD_REMOVED_FROM_TURNSTILE_WAITQ))) | DBG_FUNC_NONE,
+                       VM_KERNEL_UNSLIDE_OR_PERM(waitq_to_turnstile(wq)),
+                       thread_tid(thread),
+                       0, 0, 0);
+               priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
+                               PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+       } else {
+               remqueue(&(thread->wait_links));
+       }
+}
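The comment in waitq_thread_insert() above refers to dedicated priority queue increase/decrease operations for repositioning a blocked thread whose base_pri changed. As a rough, illustrative sketch only (not part of this change), using just the primitives already visible in this hunk and assuming the same locking as waitq_thread_remove(), the repositioning amounts to a remove-and-reinsert with the new key:

static inline void
waitq_thread_reposition_sketch(struct waitq *wq, thread_t thread)
{
        /* drop the stale heap entry ... */
        priority_queue_remove(&wq->waitq_prio_queue, &thread->wait_prioq_links,
                        PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
        /* ... and re-insert it keyed on the thread's current base_pri */
        priority_queue_entry_init(&(thread->wait_prioq_links));
        priority_queue_insert(&wq->waitq_prio_queue,
                        &thread->wait_prioq_links, thread->base_pri,
                        PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
}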
+
 void waitq_bootstrap(void)
 {
        kern_return_t kret;
@@ -1914,6 +1990,8 @@ static int waitq_select_walk_cb(struct waitq *waitq, void *ctx,
        if (wqset->wqset_id != link->wql_setid.id)
                goto out_unlock;
 
+       assert(waitqs_is_linked(wqset));
+
        /*
         * Find any threads waiting on this wait queue set,
         * and recurse into any waitq set to which this set belongs.
@@ -1956,6 +2034,187 @@ out_unlock:
        return ret;
 }
 
+/**
+ * Routine to iterate over the waitq for non-priority ordered waitqs
+ *
+ * Conditions:
+ *     args->waitq (and args->posted_waitq) is locked
+ *
+ * Notes:
+ *     Uses the optional select callback function to refine the selection
+ *     of one or more threads from a waitq. The select callback is invoked
+ *     once for every thread that is found to be waiting on the input args->waitq.
+ *
+ *     If one or more threads are selected, this may disable interrupts.
+ *     The previous interrupt state is returned in args->spl and should
+ *     be used in a call to splx() if threads are returned to the caller.
+ */
+static thread_t waitq_queue_iterate_locked(struct waitq *safeq, struct waitq *waitq,
+                                           spl_t spl, struct waitq_select_args *args,
+                                           uint32_t *remaining_eventmask)
+{
+       int max_threads = args->max_threads;
+       int *nthreads = args->nthreads;
+       thread_t thread = THREAD_NULL;
+       thread_t first_thread = THREAD_NULL;
+
+       qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
+               thread_t t = THREAD_NULL;
+               assert_thread_magic(thread);
+
+               /*
+                * For non-priority ordered waitqs, we allow multiple events to be
+                * mux'ed into the same waitq. Also, safeqs may contain threads from
+                * multiple waitqs. Only pick threads that match the
+                * requested wait event.
+                */
+               if (thread->waitq == waitq && thread->wait_event == args->event) {
+                       t = thread;
+                       if (first_thread == THREAD_NULL)
+                               first_thread = thread;
+
+                       /* allow the caller to further refine the selection */
+                       if (args->select_cb)
+                               t = args->select_cb(args->select_ctx, waitq,
+                                                   waitq_is_global(waitq), thread);
+                       if (t != THREAD_NULL) {
+                               *nthreads += 1;
+                               if (args->threadq) {
+                                       /* if output queue, add locked thread to it */
+                                       if (*nthreads == 1)
+                                               *(args->spl) = (safeq != waitq) ? spl : splsched();
+                                       thread_lock(t);
+                                       thread_clear_waitq_state(t);
+                                       re_queue_tail(args->threadq, &t->wait_links);
+                               }
+                               /* only enqueue up to 'max' threads */
+                               if (*nthreads >= max_threads && max_threads > 0)
+                                       break;
+                       }
+               }
+               /* thread wasn't selected so track its event */
+               if (t == THREAD_NULL) {
+                       *remaining_eventmask |= (thread->waitq != safeq) ?
+                                _CAST_TO_EVENT_MASK(thread->waitq) : _CAST_TO_EVENT_MASK(thread->wait_event);
+               }
+       }
+
+       return first_thread;
+}
+
+/**
+ * Routine to iterate and remove threads from priority ordered waitqs
+ *
+ * Conditions:
+ *     args->waitq (and args->posted_waitq) is locked
+ *
+ * Notes:
+ *     The priority ordered waitqs only support maximum priority element removal.
+ *
+ *     Also, the implementation makes sure that all threads in a priority ordered
+ *     waitq are waiting on the same wait event. This is not necessarily true for
+ *     non-priority ordered waitqs. If one or more threads are selected, this may
+ *     disable interrupts. The previous interrupt state is returned in args->spl
+ *     and should be used in a call to splx() if threads are returned to the caller.
+ *
+ *     In the future, we could support priority ordered waitqs with multiple wait
+ *     events in the same queue. The way to implement that would be to keep removing
+ *     elements from the waitq and, if the event does not match the requested one,
+ *     add them to a local list. That local list of elements would need to be
+ *     re-inserted into the priority queue at the end, and the select_cb return
+ *     value and remaining_eventmask would need to be handled appropriately. Such
+ *     an implementation would not be very efficient, but it would work functionally.
+ */
+static thread_t waitq_prioq_iterate_locked(struct waitq *safeq, struct waitq *waitq,
+                                           spl_t spl, struct waitq_select_args *args,
+                                           uint32_t *remaining_eventmask)
+{
+       int max_threads = args->max_threads;
+       int *nthreads = args->nthreads;
+       thread_t first_thread = THREAD_NULL;
+       thread_t thread = THREAD_NULL;
+
+       /* 
+        * The waitq select routines need to handle two cases:
+        * Case 1: Peek at maximum priority thread in the waitq (remove_op = 0)
+        *         Get the maximum priority thread from the waitq without removing it.
+        *         In that case args->threadq == NULL and max_threads == 1.
+        * Case 2: Remove 'n' highest priority threads from waitq (remove_op = 1)
+        *         Get max_threads (if available) while removing them from the waitq.
+        *         In that case args->threadq != NULL and max_threads is one of {-1, 1}.
+        * 
+        * The only possible values for remaining_eventmask for the priority queue 
+        * waitq are either 0 (for the remove all threads case) or the original 
+        * safeq->waitq_eventmask (for the lookup/remove one thread cases).
+        */
+       *remaining_eventmask = safeq->waitq_eventmask;
+       boolean_t remove_op = !!(args->threadq);
+
+       while ((max_threads <= 0) || (*nthreads < max_threads)) {
+
+               if (priority_queue_empty(&(safeq->waitq_prio_queue))) {
+                       *remaining_eventmask = 0;
+                       break;
+               }
+
+               if (remove_op) {
+                       thread = priority_queue_remove_max(&safeq->waitq_prio_queue,
+                                       struct thread, wait_prioq_links,
+                                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+               } else {
+                       /* For the peek operation, the only valid value for max_threads is 1 */
+                       assert(max_threads == 1);
+                       thread = priority_queue_max(&safeq->waitq_prio_queue,
+                                       struct thread, wait_prioq_links);
+               }
+               /* 
+                * Ensure the wait event matches since priority ordered waitqs do not 
+                * support multiple events in the same waitq.
+                */
+               assert((thread->waitq == waitq) && (thread->wait_event == args->event));
+               
+               if (args->select_cb) {
+                       /*
+                        * Call the select_cb passed into the waitq_select args. The callback
+                        * updates the select_ctx with information about the highest priority
+                        * thread, which the caller eventually uses.
+                        */
+                       thread_t __assert_only ret_thread = args->select_cb(args->select_ctx, waitq,
+                                                                           waitq_is_global(waitq), thread);
+                       if (!remove_op) {
+                               /* For the peek operation, the thread should not be selected for addition */
+                               assert(ret_thread == THREAD_NULL);
+                       } else {
+                               /* 
+                                * For the remove operation, the select routine should always return a valid 
+                                * thread for priority waitqs. Since all threads in a prioq are equally 
+                                * eligible, it should match the thread removed from the prioq. If this 
+                                * invariant changes, the implementation would need to handle the 
+                                * remaining_eventmask here correctly.
+                                */
+                               assert(ret_thread == thread);
+                       }
+               }
+               
+               if (first_thread == THREAD_NULL)
+                       first_thread = thread;
+
+               /* For the peek operation, break out early */
+               if (!remove_op)
+                       break;
+
+               /* Add the thread to the result thread list */
+               *nthreads += 1;
+               if (*nthreads == 1)
+                       *(args->spl) = (safeq != waitq) ? spl : splsched();
+               thread_lock(thread);
+               thread_clear_waitq_state(thread);
+               enqueue_tail(args->threadq, &(thread->wait_links));
+       }
+
+       return first_thread;
+}
+
 /**
  * generic thread selection from a waitq (and sets to which the waitq belongs)
  *
@@ -1976,7 +2235,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
 {
        struct waitq *waitq = args->waitq;
        int max_threads = args->max_threads;
-       thread_t thread = THREAD_NULL, first_thread = THREAD_NULL;
+       thread_t first_thread = THREAD_NULL;
        struct waitq *safeq;
        uint32_t remaining_eventmask = 0;
        uint32_t eventmask;
@@ -1988,7 +2247,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
        if (!waitq_irq_safe(waitq)) {
                /* JMM - add flag to waitq to avoid global lookup if no waiters */
                eventmask = _CAST_TO_EVENT_MASK(waitq);
-               safeq = global_eventq(waitq);
+               safeq = waitq_get_safeq(waitq);
                if (*nthreads == 0)
                        spl = splsched();
                waitq_lock(safeq);
@@ -2005,41 +2264,14 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
        if (!waitq_is_global(safeq) ||
            (safeq->waitq_eventmask & eventmask) == eventmask) {
 
-               /* look through each thread waiting directly on the safeq */
-               qe_foreach_element_safe(thread, &safeq->waitq_queue, wait_links) {
-                       thread_t t = THREAD_NULL;
-                       assert_thread_magic(thread);
-
-                       if (thread->waitq == waitq && thread->wait_event == args->event) {
-                               t = thread;
-                               if (first_thread == THREAD_NULL)
-                                       first_thread = thread;
-
-                               /* allow the caller to futher refine the selection */
-                               if (args->select_cb)
-                                       t = args->select_cb(args->select_ctx, waitq,
-                                                           waitq_is_global(waitq), thread);
-                               if (t != THREAD_NULL) {
-                                       *nthreads += 1;
-                                       if (args->threadq) {
-                                               if (*nthreads == 1)
-                                                       *(args->spl) = (safeq != waitq) ? spl : splsched();
-                                               thread_lock(t);
-                                               thread_clear_waitq_state(t);
-                                               /* put locked thread on output queue */
-                                               re_queue_tail(args->threadq, &t->wait_links);
-                                       }
-                                       /* only enqueue up to 'max' threads */
-                                       if (*nthreads >= max_threads && max_threads > 0)
-                                               break;
-                               }
-                       }
-                       /* thread wasn't selected so track it's event */
-                       if (t == THREAD_NULL) {
-                               remaining_eventmask |= (thread->waitq != safeq) ?
-                                   _CAST_TO_EVENT_MASK(thread->waitq):
-                                   _CAST_TO_EVENT_MASK(thread->wait_event);
-                       }
+               if (waitq_is_turnstile_queue(safeq)) {
+                       first_thread = waitq_prioq_iterate_locked(safeq, waitq,
+                                                                 spl, args,
+                                                                 &remaining_eventmask);
+               } else {
+                       first_thread = waitq_queue_iterate_locked(safeq, waitq,
+                                                                 spl, args,
+                                                                 &remaining_eventmask);
                }
 
                /*
@@ -2052,7 +2284,7 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
                 *   computed is complete - so reset it.
                 */
                if (waitq_is_global(safeq)) {
-                       if (queue_empty(&safeq->waitq_queue))
+                       if (waitq_empty(safeq))
                                safeq->waitq_eventmask = 0;
                        else if (max_threads < 0 || *nthreads < max_threads)
                                safeq->waitq_eventmask = remaining_eventmask;
@@ -2070,10 +2302,11 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
                *(args->spl) = (safeq != waitq) ? spl : splsched();
                thread_lock(first_thread);
                thread_clear_waitq_state(first_thread);
-               re_queue_tail(args->threadq, &first_thread->wait_links);
+               waitq_thread_remove(safeq, first_thread);
+               enqueue_tail(args->threadq, &(first_thread->wait_links));
 
                /* update the eventmask on [now] empty global queues */
-               if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue))
+               if (waitq_is_global(safeq) && waitq_empty(safeq))
                        safeq->waitq_eventmask = 0;
        }
 
@@ -2127,8 +2360,8 @@ static void do_waitq_select_n_locked(struct waitq_select_args *args)
  *     been placed onto the input 'threadq'
  *
  * Notes:
- *     The 'select_cb' function is invoked for every thread found waiting
- *     on 'waitq' for 'event'. The thread is _not_ locked upon callback
+ *     The 'select_cb' function is invoked for every thread found waiting on 
+ *     'waitq' for 'event'. The thread is _not_ locked upon callback 
  *     invocation. This parameter may be NULL.
  *
  *     If one or more threads are returned in 'threadq' then the caller is
@@ -2269,8 +2502,9 @@ waitq_select_max_locked(struct waitq *waitq, event64_t event,
         * Scan the waitq to find the highest priority thread.
         * This doesn't remove any thread from the queue
         */
-       nthreads = waitq_select_n_locked(waitq, event, waitq_find_max_pri_cb, &ctx,
-                                        reserved_preposts, NULL, 1, spl);
+       nthreads = waitq_select_n_locked(waitq, event,
+                                        waitq_find_max_pri_cb,
+                                        &ctx, reserved_preposts, NULL, 1, spl);
 
        assert(nthreads == 0);
 
@@ -2336,14 +2570,14 @@ static int waitq_select_thread_cb(struct waitq *waitq, void *ctx,
        s = splsched();
 
        /* find and lock the interrupt-safe waitq the thread is thought to be on */
-       safeq = global_eventq(wqsetq);
+       safeq = waitq_get_safeq(wqsetq);
        waitq_lock(safeq);
 
        thread_lock(thread);
 
        if ((thread->waitq == wqsetq) && (thread->wait_event == event)) {
-               remqueue(&thread->wait_links);
-               if (queue_empty(&safeq->waitq_queue)) {
+               waitq_thread_remove(wqsetq, thread);
+               if (waitq_empty(safeq)) {
                        safeq->waitq_eventmask = 0;
                }
                thread_clear_waitq_state(thread);
@@ -2387,7 +2621,7 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
 
        /* Find and lock the interrupts disabled queue the thread is actually on */
        if (!waitq_irq_safe(waitq)) {
-               safeq = global_eventq(waitq);
+               safeq = waitq_get_safeq(waitq);
                waitq_lock(safeq);
        } else {
                safeq = waitq;
@@ -2396,8 +2630,8 @@ static kern_return_t waitq_select_thread_locked(struct waitq *waitq,
        thread_lock(thread);
 
        if ((thread->waitq == waitq) && (thread->wait_event == event)) {
-               remqueue(&thread->wait_links);
-               if (queue_empty(&safeq->waitq_queue)) {
+               waitq_thread_remove(safeq, thread);
+               if (waitq_empty(safeq)) {
                        safeq->waitq_eventmask = 0;
                }
                thread_clear_waitq_state(thread);
@@ -2517,7 +2751,7 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
         * Otherwise, determine a global queue to use and lock it.
         */
        if (!waitq_irq_safe(waitq)) {
-               safeq = global_eventq(waitq);
+               safeq = waitq_get_safeq(waitq);
                eventmask = _CAST_TO_EVENT_MASK(waitq);
                waitq_lock(safeq);
        } else {
@@ -2551,9 +2785,9 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
                
                if (!safeq->waitq_fifo
                    || (thread->options & TH_OPT_VMPRIV) || realtime)
-                       enqueue_head(&safeq->waitq_queue, &thread->wait_links);
+                       waitq_thread_insert(safeq, thread, false);
                else
-                       enqueue_tail(&safeq->waitq_queue, &thread->wait_links);
+                       waitq_thread_insert(safeq, thread, true);
 
                /* mark the event and real waitq, even if enqueued on a global safeq */
                thread->wait_event = wait_event;
@@ -2580,6 +2814,12 @@ wait_result_t waitq_assert_wait64_locked(struct waitq *waitq,
        /* unlock the thread */
        thread_unlock(thread);
 
+       /* update the inheritor's thread priority if the waitq is embedded in turnstile */
+       if (waitq_is_turnstile_queue(safeq) && wait_result == THREAD_WAITING) {
+               turnstile_recompute_priority_locked(waitq_to_turnstile(safeq));
+               turnstile_update_inheritor_locked(waitq_to_turnstile(safeq));
+       }
+
        /* unlock the safeq if we locked it here */
        if (safeq != waitq) {
                waitq_unlock(safeq);
@@ -2610,7 +2850,7 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
 
        /* Find the interrupts disabled queue thread is waiting on */
        if (!waitq_irq_safe(waitq)) {
-               safeq = global_eventq(waitq);
+               safeq = waitq_get_safeq(waitq);
        } else {
                safeq = waitq;
        }
@@ -2619,12 +2859,12 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
        if (!waitq_lock_try(safeq))
                return 0;
 
-       remqueue(&thread->wait_links);
+       waitq_thread_remove(safeq, thread);
        thread_clear_waitq_state(thread);
        waitq_stats_count_clear_wakeup(waitq);
 
        /* clear the global event mask if this was the last thread there! */
-       if (waitq_is_global(safeq) && queue_empty(&safeq->waitq_queue)) {
+       if (waitq_is_global(safeq) && waitq_empty(safeq)) {
                safeq->waitq_eventmask = 0;
                /* JMM - also mark no-waiters on waitq (if not the same as the safeq) */
        }
@@ -2636,80 +2876,58 @@ int waitq_pull_thread_locked(struct waitq *waitq, thread_t thread)
 
 
 static __inline__
-void maybe_adjust_thread_pri(thread_t thread, int priority) {
-       if (thread->sched_pri < priority) {
-               if (priority <= MAXPRI) {
-                       set_sched_pri(thread, priority);
-
-                       thread->was_promoted_on_wakeup = 1;
-                       thread->sched_flags |= TH_SFLAG_PROMOTED;
-               }
-               return;
-       }
+void maybe_adjust_thread_pri(thread_t   thread,
+                             int        priority,
+               __kdebug_only struct waitq *waitq)
+{
 
        /*
         * If the caller is requesting the waitq subsystem to promote the
         * priority of the awoken thread, then boost the thread's priority to
         * the default WAITQ_BOOST_PRIORITY (if it's not already equal or
         * higher priority).  This boost must be removed via a call to
-        * waitq_clear_promotion_locked.
+        * waitq_clear_promotion_locked before the thread waits again.
+        *
+        * WAITQ_PROMOTE_PRIORITY is -2.
+        * Anything above 0 represents a mutex promotion.
+        * The default 'no action' value is -1.
+        * TODO: define this in a header
         */
-       if (priority == WAITQ_PROMOTE_PRIORITY &&
-           (thread->sched_pri < WAITQ_BOOST_PRIORITY ||
-            !(thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED))) {
-
-               KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_PROMOTE) | DBG_FUNC_NONE,
-                                     (uintptr_t)thread_tid(thread),
-                                     thread->sched_pri, thread->base_pri,
-                                     WAITQ_BOOST_PRIORITY, 0);
-               thread->sched_flags |= TH_SFLAG_WAITQ_PROMOTED;
-               if (thread->sched_pri < WAITQ_BOOST_PRIORITY)
-                       set_sched_pri(thread, WAITQ_BOOST_PRIORITY);
+       if (priority == WAITQ_PROMOTE_PRIORITY) {
+               uintptr_t trace_waitq = 0;
+               if (__improbable(kdebug_enable))
+                       trace_waitq = VM_KERNEL_UNSLIDE_OR_PERM(waitq);
+
+               sched_thread_promote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, trace_waitq);
+       } else if (priority > 0) {
+               /* Mutex subsystem wants to see this thread before we 'go' it */
+               lck_mtx_wakeup_adjust_pri(thread, priority);
        }
 }
 
-/**
- * Clear a thread's waitq priority promotion state and the waitq's boost flag
+/*
+ * Clear a potential thread priority promotion from a waitq wakeup
+ * with WAITQ_PROMOTE_PRIORITY.
  *
- * This function will always clear the waitq's 'waitq_boost' flag. If the
- * 'thread' parameter is non-null, the this function will also check the
- * priority promotion (boost) state of that thread. If this thread was boosted
- * (by having been awoken from a boosting waitq), then this boost state is
- * cleared. This function is to be paired with waitq_enable_promote_locked.
+ * This must be called on the thread which was woken up with TH_SFLAG_WAITQ_PROMOTED.
  */
 void waitq_clear_promotion_locked(struct waitq *waitq, thread_t thread)
 {
        spl_t s;
 
        assert(waitq_held(waitq));
-       if (thread == THREAD_NULL)
+       assert(thread != THREAD_NULL);
+       assert(thread == current_thread());
+
+       /* This flag is only cleared by the thread itself, so safe to check outside lock */
+       if ((thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) != TH_SFLAG_WAITQ_PROMOTED)
                return;
 
        if (!waitq_irq_safe(waitq))
                s = splsched();
        thread_lock(thread);
 
-       if (thread->sched_flags & TH_SFLAG_WAITQ_PROMOTED) {
-               thread->sched_flags &= ~TH_SFLAG_WAITQ_PROMOTED;
-
-               if (thread->sched_flags & TH_SFLAG_PROMOTED_MASK) {
-                       /* it still has other promotions (mutex/rw_lock) */
-               } else if (thread->sched_flags & TH_SFLAG_DEPRESSED_MASK) {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->sched_pri,
-                                             thread->base_pri,
-                                             DEPRESSPRI, 0);
-                       set_sched_pri(thread, DEPRESSPRI);
-               } else {
-                       KERNEL_DEBUG_CONSTANT(MACHDBG_CODE(DBG_MACH_SCHED, MACH_WAITQ_DEMOTE) | DBG_FUNC_NONE,
-                                             (uintptr_t)thread_tid(thread),
-                                             thread->sched_pri,
-                                             thread->base_pri,
-                                             thread->base_pri, 0);
-                       thread_recompute_sched_pri(thread, FALSE);
-               }
-       }
+       sched_thread_unpromote_reason(thread, TH_SFLAG_WAITQ_PROMOTED, 0);
 
        thread_unlock(thread);
        if (!waitq_irq_safe(waitq))
@@ -2763,7 +2981,7 @@ kern_return_t waitq_wakeup64_all_locked(struct waitq *waitq,
        qe_foreach_element_safe(thread, &wakeup_queue, wait_links) {
                assert_thread_magic(thread);
                remqueue(&thread->wait_links);
-               maybe_adjust_thread_pri(thread, priority);
+               maybe_adjust_thread_pri(thread, priority, waitq);
                ret = thread_go(thread, result);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
@@ -2817,7 +3035,7 @@ kern_return_t waitq_wakeup64_one_locked(struct waitq *waitq,
                waitq_unlock(waitq);
 
        if (thread != THREAD_NULL) {
-               maybe_adjust_thread_pri(thread, priority);
+               maybe_adjust_thread_pri(thread, priority, waitq);
                kern_return_t ret = thread_go(thread, result);
                assert(ret == KERN_SUCCESS);
                thread_unlock(thread);
@@ -2954,13 +3172,21 @@ kern_return_t waitq_init(struct waitq *waitq, int policy)
        waitq->waitq_irq = !!(policy & SYNC_POLICY_DISABLE_IRQ);
        waitq->waitq_prepost = 0;
        waitq->waitq_type = WQT_QUEUE;
+       waitq->waitq_turnstile_or_port = !!(policy & SYNC_POLICY_TURNSTILE);
        waitq->waitq_eventmask = 0;
 
        waitq->waitq_set_id = 0;
        waitq->waitq_prepost_id = 0;
 
        waitq_lock_init(waitq);
-       queue_init(&waitq->waitq_queue);
+       if (waitq_is_turnstile_queue(waitq)) {
+               /* For turnstile, initialize it as a priority queue */
+               priority_queue_init(&waitq->waitq_prio_queue,
+                               PRIORITY_QUEUE_BUILTIN_MAX_HEAP);
+               assert(waitq->waitq_fifo == 0);
+       } else {
+               queue_init(&waitq->waitq_queue);
+       }
 
        waitq->waitq_isvalid = 1;
        return KERN_SUCCESS;
@@ -3050,7 +3276,6 @@ void waitq_deinit(struct waitq *waitq)
                return;
        }
 
-       waitq->waitq_type = WQT_INVALID;
        waitq->waitq_isvalid = 0;
 
        if (!waitq_irq_safe(waitq)) {
@@ -3061,7 +3286,7 @@ void waitq_deinit(struct waitq *waitq)
                splx(s);
        }
 
-       assert(queue_empty(&waitq->waitq_queue));
+       assert(waitq_empty(waitq));
 }
 
 void waitq_invalidate_locked(struct waitq *waitq)
@@ -3095,7 +3320,10 @@ static int wqset_clear_prepost_chain_cb(struct waitq_set __unused *wqset,
  *     may block
  *
  * Returns:
- *     allocated / initialized waitq_set object
+ *     allocated / initialized waitq_set object.
+ *     The waitq_set object returned does not have
+ *     a waitq_link associated.
+ *
  *     NULL on failure
  */
 struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook)
@@ -3119,9 +3347,9 @@ struct waitq_set *waitq_set_alloc(int policy, void *prepost_hook)
 /**
  * initialize a waitq set object
  *
- * Conditions:
- *     may (rarely) block if link table needs to grow, and
- *     no 'reserved_link' object is passed.
+ * If no 'reserved_link' object is passed,
+ * the waitq_link will be lazily allocated
+ * on demand through waitq_set_lazy_init_link.
  */
 kern_return_t waitq_set_init(struct waitq_set *wqset,
                             int policy, uint64_t *reserved_link,
@@ -3148,21 +3376,96 @@ kern_return_t waitq_set_init(struct waitq_set *wqset,
 
        if (reserved_link && *reserved_link != 0) {
                link = wql_get_reserved(*reserved_link, WQL_WQS);
+
+               if (!link)
+                       panic("Can't allocate link object for waitq set: %p", wqset);
+
                /* always consume the caller's reference */
                *reserved_link = 0;
+
+               link->wql_wqs.wql_set = wqset;
+               wql_mkvalid(link);
+
+               wqset->wqset_id = link->wql_setid.id;
+               wql_put_link(link);
+
        } else {
-               link = wql_alloc_link(WQL_WQS);
+               /*
+                * Lazily allocate the link only when an actual id is needed.
+                */
+               wqset->wqset_id = WQSET_NOT_LINKED;
        }
+
+       return KERN_SUCCESS;
+}
+
+#if DEVELOPMENT || DEBUG
+
+int
+sysctl_helper_waitq_set_nelem(void)
+{
+       return ltable_nelem(&g_wqlinktable);
+}
+
+#endif
+
+/**
+ * initialize a waitq set link.
+ *
+ * Conditions:
+ *     may block
+ *     locks and unlocks the waitq set lock
+ *
+ */
+void
+waitq_set_lazy_init_link(struct waitq_set *wqset)
+{
+       struct waitq_link *link;
+
+       assert(get_preemption_level() == 0 && waitq_wait_possible(current_thread()));
+
+       waitq_set_lock(wqset);
+       if (!waitq_set_should_lazy_init_link(wqset)){
+               waitq_set_unlock(wqset);
+               return;
+       }
+
+       assert(wqset->wqset_id == WQSET_NOT_LINKED);
+       waitq_set_unlock(wqset);
+
+       link = wql_alloc_link(WQL_WQS);
        if (!link)
                panic("Can't allocate link object for waitq set: %p", wqset);
 
        link->wql_wqs.wql_set = wqset;
-       wql_mkvalid(link);
 
-       wqset->wqset_id = link->wql_setid.id;
+       waitq_set_lock(wqset);
+       if (waitq_set_should_lazy_init_link(wqset)) {
+               wql_mkvalid(link);
+               wqset->wqset_id = link->wql_setid.id;
+       }
+
+       assert(wqset->wqset_id != 0);
+       assert(wqset->wqset_id != WQSET_NOT_LINKED);
+
+       waitq_set_unlock(wqset);
+
        wql_put_link(link);
 
-       return KERN_SUCCESS;
+       return;
+}
+
+/**
+ * checks if a waitq set needs to be linked.
+ *
+ */
+boolean_t
+waitq_set_should_lazy_init_link(struct waitq_set *wqset)
+{
+       if (waitqs_is_linked(wqset) || wqset->wqset_id == 0) {
+               return FALSE;
+       }
+       return TRUE;
 }
 
 /**
@@ -3183,27 +3486,32 @@ void waitq_set_deinit(struct waitq_set *wqset)
                panic("trying to de-initialize an invalid wqset @%p", wqset);
 
        assert(!waitq_irq_safe(&wqset->wqset_q));
+
        waitq_set_lock(wqset);
 
        set_id = wqset->wqset_id;
 
-       /* grab the set's link object */
-       link = wql_get_link(set_id);
-       if (link)
-               wql_invalidate(link);
+       if (waitqs_is_linked(wqset) || set_id == 0) {
 
-       /* someone raced us to deinit */
-       if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
-               if (link)
-                       wql_put_link(link);
-               waitq_set_unlock(wqset);
-               return;
-       }
+               /* grab the set's link object */
+               link = wql_get_link(set_id);
+               if (link) {
+                       wql_invalidate(link);
+               }
+               /* someone raced us to deinit */
+               if (!link || wqset->wqset_id != set_id || set_id != link->wql_setid.id) {
+                       if (link) {
+                               wql_put_link(link);
+                       }
+                       waitq_set_unlock(wqset);
+                       return;
+               }
 
-       /* every wait queue set should have a valid link object */
-       assert(link != NULL && wql_type(link) == WQL_WQS);
+               /* the link should be a valid link object at this point */
+               assert(link != NULL && wql_type(link) == WQL_WQS);
 
-       wqset->wqset_id = 0;
+               wqset->wqset_id = 0;
+       }
 
        /*
         * This set may have a lot of preposts, or may have been a member of
@@ -3213,12 +3521,13 @@ void waitq_set_deinit(struct waitq_set *wqset)
         * objects and free those outside the critical section.
         */
        prepost_id = 0;
-       if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id)
+       if (wqset->wqset_q.waitq_prepost && wqset->wqset_prepost_id) {
+               assert(link != NULL);
                prepost_id = wqset->wqset_prepost_id;
+       }
        /* else { TODO: notify kqueue subsystem? } */
        wqset->wqset_prepost_id = 0;
 
-       wqset->wqset_q.waitq_type = WQT_INVALID;
        wqset->wqset_q.waitq_fifo = 0;
        wqset->wqset_q.waitq_prepost = 0;
        wqset->wqset_q.waitq_isvalid = 0;
@@ -3229,16 +3538,19 @@ void waitq_set_deinit(struct waitq_set *wqset)
        waitq_unlink_all_unlock(&wqset->wqset_q);
        /* wqset->wqset_q unlocked and set links deallocated */
 
-       /*
-        * walk_waitq_links may race with us for access to the waitq set.
-        * If walk_waitq_links has a reference to the set, then we should wait
-        * until the link's refcount goes to 1 (our reference) before we exit
-        * this function. That way we ensure that the waitq set memory will
-        * remain valid even though it's been cleared out.
-        */
-       while (wql_refcnt(link) > 1)
-               delay(1);
-       wql_put_link(link);
+
+       if (link) {
+               /*
+                * walk_waitq_links may race with us for access to the waitq set.
+                * If walk_waitq_links has a reference to the set, then we should wait
+                * until the link's refcount goes to 1 (our reference) before we exit
+                * this function. That way we ensure that the waitq set memory will
+                * remain valid even though it's been cleared out.
+                */
+               while (wql_refcnt(link) > 1)
+                       delay(1);
+               wql_put_link(link);
+       }
 
        /* drop / unlink all the prepost table objects */
        /* JMM - can this happen before the delay? */
@@ -3274,6 +3586,11 @@ uint64_t wqset_id(struct waitq_set *wqset)
                return 0;
 
        assert(waitqs_is_set(wqset));
+
+       if (!waitqs_is_linked(wqset)) {
+               waitq_set_lazy_init_link(wqset);
+       }
+
        return wqset->wqset_id;
 }
 
@@ -3483,12 +3800,13 @@ boolean_t waitq_member(struct waitq *waitq, struct waitq_set *wqset)
 
        if (!waitqs_is_set(wqset))
                return FALSE;
-       
+
        waitq_lock(waitq);
 
+       if (!waitqs_is_linked(wqset))
+                goto out_unlock;
+
        setid = wqset->wqset_id;
-       if (!setid)
-               goto out_unlock;
 
        /* fast path: most waitqs are members of only 1 set */
        if (waitq->waitq_set_id == setid) {
@@ -3606,6 +3924,8 @@ static kern_return_t waitq_link_internal(struct waitq *waitq,
        kern_return_t kr;
 
        assert(waitq_held(waitq));
+       assert(setid != 0);
+       assert(setid != WQSET_NOT_LINKED);
 
        /*
         * If the waitq_set_id field is empty, then this waitq is not
@@ -3636,7 +3956,7 @@ static kern_return_t waitq_link_internal(struct waitq *waitq,
        kr = walk_waitq_links(LINK_WALK_ONE_LEVEL, waitq, waitq->waitq_set_id,
                              WQL_ALL, (void *)&setid, waitq_inset_cb);
        if (kr == WQ_ITERATE_FOUND)
-               return kr;
+               return KERN_ALREADY_IN_SET;
 
        /*
         * This wait queue is a member of at least one set already,
@@ -3666,9 +3986,14 @@ static kern_return_t waitq_link_internal(struct waitq *waitq,
  *     may (rarely) block on link table allocation if the table has to grow,
  *     and no 'reserved_link' object is passed.
  *
+ *     may block and acquire wqset lock if the wqset passed has no link.
+ *
  * Notes:
  *     The caller can guarantee that this function will never block by
- *     pre-allocating a link table object and passing its ID in 'reserved_link'
+ *     - pre-allocating a link table object and passing its ID in 'reserved_link'
+ *     - and pre-allocating the waitq set link by calling waitq_set_lazy_init_link.
+ *     It is not possible to provide a reserved_link without having also linked
+ *     the wqset.
  */
 kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset,
                         waitq_lock_state_t lock_state, uint64_t *reserved_link)
@@ -3683,6 +4008,12 @@ kern_return_t waitq_link(struct waitq *waitq, struct waitq_set *wqset,
        if (!waitqs_is_set(wqset))
                return KERN_INVALID_ARGUMENT;
 
+       if (!reserved_link || *reserved_link == 0) {
+               if (!waitqs_is_linked(wqset)) {
+                       waitq_set_lazy_init_link(wqset);
+               }
+       }
+
        wqdbg_v("Link waitq %p to wqset 0x%llx",
                (void *)VM_KERNEL_UNSLIDE_OR_PERM(waitq), wqset->wqset_id);
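As a hedged sketch of the never-block pattern described in the Notes above (WAITQ_SHOULD_LOCK is assumed to be the usual waitq_lock_state_t value asking waitq_link() to take the locks itself; it is not shown in this diff):

/* do the potentially-blocking work up front */
waitq_set_lazy_init_link(wqset);                      /* may block: link the set */
uint64_t reserved_link = waitq_link_reserve(waitq);   /* may block: pre-allocate a link table object */

/* this call can now proceed without blocking on allocation */
kern_return_t kr = waitq_link(waitq, wqset, WAITQ_SHOULD_LOCK, &reserved_link);

/* release whatever is left of the reservation if it was not consumed */
waitq_link_release(reserved_link);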
 
@@ -3990,8 +4321,6 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq,
 
        assert(!waitq_irq_safe(waitq));
 
-       setid = wqset->wqset_id;
-
        if (waitq->waitq_set_id == 0) {
                /*
                 * TODO:
@@ -4004,6 +4333,16 @@ static kern_return_t waitq_unlink_locked(struct waitq *waitq,
                return KERN_NOT_IN_SET;
        }
 
+       if (!waitqs_is_linked(wqset)) {
+               /*
+                * No link has been allocated for the wqset,
+                * so no waitq could have been linked to it.
+                */
+               return KERN_NOT_IN_SET;
+       }
+
+       setid = wqset->wqset_id;
+
        if (waitq->waitq_set_id == setid) {
                waitq->waitq_set_id = 0;
                /*
@@ -4284,24 +4623,27 @@ kern_return_t waitq_set_unlink_all_unlock(struct waitq_set *wqset)
         * constituent wait queues. All we have to do is invalidate the SetID
         */
 
-       /* invalidate and re-alloc the link object first */
-       link = wql_get_link(wqset->wqset_id);
+       if (waitqs_is_linked(wqset)){
 
-       /* we may have raced with a waitq_set_deinit: handle this */
-       if (!link) {
-               waitq_set_unlock(wqset);
-               return KERN_SUCCESS;
-       }
+               /* invalidate and re-alloc the link object first */
+               link = wql_get_link(wqset->wqset_id);
+
+               /* we may have raced with a waitq_set_deinit: handle this */
+               if (!link) {
+                       waitq_set_unlock(wqset);
+                       return KERN_SUCCESS;
+               }
 
-       wql_invalidate(link);
+               wql_invalidate(link);
 
-       /* re-alloc the object to get a new generation ID */
-       wql_realloc_link(link, WQL_WQS);
-       link->wql_wqs.wql_set = wqset;
+               /* re-alloc the object to get a new generation ID */
+               wql_realloc_link(link, WQL_WQS);
+               link->wql_wqs.wql_set = wqset;
 
-       wqset->wqset_id = link->wql_setid.id;
-       wql_mkvalid(link);
-       wql_put_link(link);
+               wqset->wqset_id = link->wql_setid.id;
+               wql_mkvalid(link);
+               wql_put_link(link);
+       }
 
        /* clear any preposts attached to this set */
        prepost_id = 0;
index e5874895ea4d534fbd7598fc8addb6510fee038a..c3fee4a8c3122c42d441f8ddbf5fbe59a596a6c8 100644 (file)
 
 #include <sys/cdefs.h>
 
+#ifdef XNU_KERNEL_PRIVATE
+/* priority queue static asserts fail for __ARM64_ARCH_8_32__ kext builds */
+#include <kern/priority_queue.h>
+#endif /* XNU_KERNEL_PRIVATE */
+
 /*
  * Constants and types used in the waitq APIs
  */
@@ -102,13 +107,12 @@ jenkins_hash(char *key, size_t length)
 
 #include <kern/spl.h>
 #include <kern/simple_lock.h>
-#include <mach/branch_predicates.h>
 
 #include <machine/cpu_number.h>
 #include <machine/machine_routines.h> /* machine_timeout_suspended() */
 
 /*
- * The event mask is of 59 bits on 64 bit architeture and 27 bits on
+ * The event mask is 57 bits on 64 bit architecture and 25 bits on
  * 32 bit architecture and so we calculate its size using sizeof(long).
  * If the bitfield for wq_type and wq_fifo is changed, then value of
  * EVENT_MASK_BITS will also change.
@@ -116,9 +120,8 @@ jenkins_hash(char *key, size_t length)
  * New plan: this is an optimization anyway, so I'm stealing 32bits
  * from the mask to shrink the waitq object even further.
  */
-#define _EVENT_MASK_BITS   ((sizeof(uint32_t) * 8) - 6)
+#define _EVENT_MASK_BITS   ((sizeof(uint32_t) * 8) - 7)
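For illustration, the 7 subtracted bits account for the flag bitfields of struct waitq (the 2-bit width of waitq_type and the 1-bit width of waitq_fifo are assumptions taken from the surrounding struct definition, which this diff only shows in part):

/* waitq_type(2) + waitq_fifo(1) + waitq_prepost(1) + waitq_irq(1) +
 * waitq_isvalid(1) + waitq_turnstile_or_port(1) = 7 flag bits,
 * leaving (sizeof(uint32_t) * 8) - 7 = 25 bits for waitq_eventmask. */
static_assert((2 + 1 + 1 + 1 + 1 + 1) + _EVENT_MASK_BITS == 32,
              "waitq flag bits plus waitq_eventmask should fill a 32-bit word");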
 
-#define WAITQ_BOOST_PRIORITY 31
 
 enum waitq_type {
        WQT_INVALID = 0,
@@ -162,6 +165,7 @@ struct waitq {
                waitq_prepost:1, /* waitq supports prepost? */
                waitq_irq:1,     /* waitq requires interrupts disabled */
                waitq_isvalid:1, /* waitq structure is valid */
+               waitq_turnstile_or_port:1, /* waitq is embedded in a turnstile (if irq safe), or port (if not irq safe) */
                waitq_eventmask:_EVENT_MASK_BITS;
                /* the wait queue set (set-of-sets) to which this queue belongs */
 #if __arm64__
@@ -172,7 +176,10 @@ struct waitq {
 
        uint64_t waitq_set_id;
        uint64_t waitq_prepost_id;
-       queue_head_t    waitq_queue;            /* queue of elements */
+       union {
+               queue_head_t            waitq_queue;            /* queue of elements */
+               struct priority_queue   waitq_prio_queue;       /* priority ordered queue of elements */
+       };
 };
 
 static_assert(sizeof(struct waitq) == WQ_OPAQUE_SIZE, "waitq structure size mismatch");
@@ -192,6 +199,7 @@ struct waitq_set {
        };
 };
 
+#define WQSET_NOT_LINKED ((uint64_t)(~0))
 static_assert(sizeof(struct waitq_set) == WQS_OPAQUE_SIZE, "waitq_set structure size mismatch");
 static_assert(__alignof(struct waitq_set) == WQS_OPAQUE_ALIGN, "waitq_set structure alignment mismatch");
 
@@ -200,6 +208,12 @@ extern void waitq_bootstrap(void);
 #define waitq_is_queue(wq) \
        ((wq)->waitq_type == WQT_QUEUE)
 
+#define waitq_is_turnstile_queue(wq) \
+       (((wq)->waitq_irq) && (wq)->waitq_turnstile_or_port)
+
+#define waitq_is_port_queue(wq) \
+       (!((wq)->waitq_irq) && (wq)->waitq_turnstile_or_port)
+
 #define waitq_is_set(wq) \
        ((wq)->waitq_type == WQT_SET && ((struct waitq_set *)(wq))->wqset_id != 0)
 
@@ -207,7 +221,10 @@ extern void waitq_bootstrap(void);
        (((wqs)->wqset_q.waitq_type == WQT_SET) && ((wqs)->wqset_id != 0))
 
 #define waitq_valid(wq) \
-       ((wq) != NULL && (wq)->waitq_isvalid && ((wq)->waitq_type & ~1) == WQT_QUEUE)
+       ((wq) != NULL && (wq)->waitq_isvalid)
+
+#define waitqs_is_linked(wqs) \
+       (((wqs)->wqset_id != WQSET_NOT_LINKED) && ((wqs)->wqset_id != 0))
 
 /*
  * Invalidate a waitq. The only valid waitq functions to call after this are:
@@ -216,8 +233,14 @@ extern void waitq_bootstrap(void);
  */
 extern void waitq_invalidate_locked(struct waitq *wq);
 
-#define waitq_empty(wq) \
-       (queue_empty(&(wq)->waitq_queue))
+static inline boolean_t waitq_empty(struct waitq *wq)
+{
+       if (waitq_is_turnstile_queue(wq)) {
+               return priority_queue_empty(&(wq->waitq_prio_queue));
+       } else {
+               return queue_empty(&(wq->waitq_queue));
+       }
+}
 
 #if __arm64__
 
@@ -400,6 +423,7 @@ extern void waitq_set_deinit(struct waitq_set *wqset);
 extern kern_return_t waitq_set_free(struct waitq_set *wqset);
 
 #if DEVELOPMENT || DEBUG
+extern int sysctl_helper_waitq_set_nelem(void);
 #if CONFIG_WAITQ_DEBUG
 extern uint64_t wqset_id(struct waitq_set *wqset);
 
@@ -412,6 +436,8 @@ struct waitq *wqset_waitq(struct waitq_set *wqset);
  * set membership
  */
 extern uint64_t waitq_link_reserve(struct waitq *waitq);
+extern void waitq_set_lazy_init_link(struct waitq_set *wqset);
+extern boolean_t waitq_set_should_lazy_init_link(struct waitq_set *wqset);
 
 extern void waitq_link_release(uint64_t id);
 
@@ -458,6 +484,8 @@ extern int waitq_is_global(struct waitq *waitq);
 
 extern int waitq_irq_safe(struct waitq *waitq);
 
+extern struct waitq * waitq_get_safeq(struct waitq *waitq);
+
 #if CONFIG_WAITQ_STATS
 /*
  * waitq statistics
index 8da4fe3c8df6d282ec76e36f3bf1d58c636bc578..a9091abf4b83120490a31220b1b0603464c88584 100644 (file)
@@ -70,7 +70,6 @@
 #include <mach/mach_host_server.h>
 #include <mach/task_server.h>
 #include <mach/machine/vm_types.h>
-#include <mach_debug/zone_info.h>
 #include <mach/vm_map.h>
 #include <mach/sdt.h>
 
 
 #include <libkern/OSDebug.h>
 #include <libkern/OSAtomic.h>
+#include <libkern/section_keywords.h>
 #include <sys/kdebug.h>
 
 #include <san/kasan.h>
@@ -191,6 +191,9 @@ sample_counter(volatile uint32_t * count_p, uint32_t factor)
 #define ZP_POISON       0xdeadbeef
 #endif
 
+boolean_t zfree_poison_element(zone_t zone, vm_offset_t elem);
+void zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr);
+
 #define ZP_DEFAULT_SAMPLING_FACTOR 16
 #define ZP_DEFAULT_SCALE_FACTOR 4
 
@@ -202,7 +205,12 @@ sample_counter(volatile uint32_t * count_p, uint32_t factor)
  */
 
 /* set by zp-factor=N boot arg, zero indicates non-tiny poisoning disabled */
-uint32_t        zp_factor               = 0;
+#if DEBUG
+#define DEFAULT_ZP_FACTOR (1)
+#else
+#define DEFAULT_ZP_FACTOR (0)
+#endif
+uint32_t        zp_factor               = DEFAULT_ZP_FACTOR;
 
 /* set by zp-scale=N boot arg, scales zp_factor by zone size */
 uint32_t        zp_scale                = 0;
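As a usage note only (the values shown simply mirror the defaults above, ZP_DEFAULT_SAMPLING_FACTOR and ZP_DEFAULT_SCALE_FACTOR), the two poisoning knobs could be set explicitly on the boot command line with:

zp-factor=16 zp-scale=4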
@@ -218,6 +226,7 @@ uintptr_t       zp_nopoison_cookie      = 0;
 boolean_t       zone_tagging_on;
 #endif /* VM_MAX_TAG_ZONES */
 
+SECURITY_READ_ONLY_LATE(boolean_t) copyio_zalloc_check = TRUE;
 static struct bool_gen zone_bool_gen;
 
 /*
@@ -362,6 +371,33 @@ struct zone_free_element {
        /* void *backup_ptr; */
 };
 
+#if CONFIG_ZCACHE
+
+#if !CONFIG_GZALLOC
+bool use_caching = TRUE;
+#else
+bool use_caching = FALSE;
+#endif /* !CONFIG_GZALLOC */
+
+/*
+ * Decides whether per-cpu zone caching is to be enabled for all zones.
+ * Can be set to TRUE via the boot-arg '-zcache_all'.
+ */
+bool cache_all_zones = FALSE;
+
+/*
+ * Specifies a single zone to enable CPU caching for. 
+ * Can be set using boot-args: zcc_enable_for_zone_name=<zone> 
+ */
+static char cache_zone_name[MAX_ZONE_NAME];
+
+static inline bool zone_caching_enabled(zone_t z)
+{
+       return (z->cpu_cache_enabled && !z->tags && !z->zleak_on);
+}
+
+#endif /* CONFIG_ZCACHE */
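As a usage note only (the zone name below is hypothetical, and normally only one of the two options would be used), the caching controls described above translate to boot-args such as:

zcc_enable_for_zone_name=kalloc.128
-zcache_all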
+
 /*
  *      Protects zone_array, num_zones, num_zones_in_use, and zone_empty_bitmap
  */
@@ -446,6 +482,7 @@ struct zone_page_metadata {
 /* Magic value to indicate empty element free list */
 #define PAGE_METADATA_EMPTY_FREELIST           ((uint32_t)(~0))
 
+vm_map_copy_t create_vm_map_copy(vm_offset_t start_addr, vm_size_t total_size, vm_size_t used_size);
 boolean_t get_zone_info(zone_t z, mach_zone_name_t *zn, mach_zone_info_t *zi);
 boolean_t is_zone_map_nearing_exhaustion(void);
 extern void vm_pageout_garbage_collect(int collect);
@@ -513,14 +550,22 @@ zone_populate_metadata_page(struct zone_page_metadata *page_meta)
 {
        vm_offset_t page_metadata_begin = trunc_page(page_meta);
        vm_offset_t page_metadata_end = trunc_page((vm_offset_t)page_meta + sizeof(struct zone_page_metadata));
-       
+
        for(;page_metadata_begin <= page_metadata_end; page_metadata_begin += PAGE_SIZE) {
+#if !KASAN
+               /*
+                * This can race with another thread doing a populate on the same metadata
+                * page, where we see an updated pmap but unmapped KASan shadow, causing a
+                * fault in the shadow when we first access the metadata page. Avoid this
+                * by always synchronizing on the zone_metadata_region lock with KASan.
+                */
                if (pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin))
                        continue;
+#endif
                /* All updates to the zone_metadata_region are done under the zone_metadata_region_lck */
                lck_mtx_lock(&zone_metadata_region_lck);
                if (0 == pmap_find_phys(kernel_pmap, (vm_map_address_t)page_metadata_begin)) {
-                       kern_return_t __unused ret = kernel_memory_populate(zone_map,
+                       kern_return_t __assert_only ret = kernel_memory_populate(zone_map,
                                       page_metadata_begin,
                                       PAGE_SIZE,
                                       KMA_KOBJECT,
@@ -559,8 +604,9 @@ get_zone_page_metadata(struct zone_free_element *element, boolean_t init)
        } else {
                page_meta = (struct zone_page_metadata *)(trunc_page((vm_offset_t)element));
        }
-       if (init)
-               __nosan_bzero((char *)page_meta, sizeof(struct zone_page_metadata));
+       if (init) {
+               bzero((char *)page_meta, sizeof(struct zone_page_metadata));
+       }
        return ((PAGE_METADATA_GET_ZINDEX(page_meta) != MULTIPAGE_METADATA_MAGIC) ? page_meta : page_metadata_get_realmeta(page_meta));
 }
 
@@ -1200,16 +1246,17 @@ free_to_zone(zone_t      zone,
        assert(PAGE_METADATA_GET_ZONE(page_meta) == zone);
        old_head = (vm_offset_t)page_metadata_get_freelist(page_meta);
 
-#if MACH_ASSERT
        if (__improbable(!is_sane_zone_element(zone, old_head)))
                panic("zfree: invalid head pointer %p for freelist of zone %s\n",
                      (void *) old_head, zone->zone_name);
-#endif
 
        if (__improbable(!is_sane_zone_element(zone, element)))
                panic("zfree: freeing invalid pointer %p to zone %s\n",
                      (void *) element, zone->zone_name);
 
+       if (__improbable(old_head == element))
+               panic("zfree: double free of %p to zone %s\n",
+                     (void *) element, zone->zone_name);
        /*
         * Always write a redundant next pointer
         * So that it is more difficult to forge, xor it with a random cookie
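
The new check above turns an immediate double free (freeing the element already at the head of the freelist) into a panic, and the surrounding comment describes the redundant, cookie-XORed backup next pointer. A self-contained user-space sketch of both ideas, with hypothetical toy_* names and a fixed cookie standing in for the boot-time random cookie:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_elem {
	struct toy_elem *next;		/* primary freelist pointer */
	uintptr_t        backup;	/* next ^ cookie, checked when the element is reused */
};

static struct toy_elem *toy_free_head;
static uintptr_t toy_cookie = 0x5a5aa5a5;	/* the kernel derives its cookies from early random entropy */

void toy_free(struct toy_elem *e)
{
	if (e == toy_free_head) {
		fprintf(stderr, "double free of %p\n", (void *)e);
		abort();			/* free_to_zone() panics here */
	}
	e->next   = toy_free_head;
	e->backup = (uintptr_t)toy_free_head ^ toy_cookie;
	toy_free_head = e;
}

struct toy_elem *toy_alloc(void)
{
	struct toy_elem *e = toy_free_head;
	if (e == NULL) {
		return NULL;
	}
	/* a single stray write to ->next no longer matches the backup */
	assert(((uintptr_t)e->next ^ toy_cookie) == e->backup);
	toy_free_head = e->next;
	return e;
}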
@@ -1485,7 +1532,7 @@ static int  num_zones_logged = 0;
 static char zone_name_to_log[MAX_ZONE_NAME] = "";      /* the zone name we're logging, if any */
 
 /* Log allocations and frees to help debug a zone element corruption */
-boolean_t       corruption_debug_flag    = FALSE;    /* enabled by "-zc" boot-arg */
+boolean_t       corruption_debug_flag    = DEBUG;    /* enabled by "-zc" boot-arg */
 /* Making pointer scanning leaks detection possible for all zones */
 
 #if DEBUG || DEVELOPMENT
@@ -1515,13 +1562,6 @@ boolean_t       leak_scan_debug_flag     = FALSE;    /* enabled by "-zl" boot-ar
  */
 
 
-/*
- * Opcodes for the btlog operation field:
- */
-
-#define ZOP_ALLOC      1
-#define ZOP_FREE       0
-
 /*
  * Decide if we want to log this zone by doing a string compare between a zone name and the name
  * of the zone to log. Return true if the strings are equal, false otherwise.  Because it's not
@@ -1532,7 +1572,7 @@ boolean_t       leak_scan_debug_flag     = FALSE;    /* enabled by "-zl" boot-ar
 int
 track_this_zone(const char *zonename, const char *logname)
 {
-       int len;
+       unsigned int len;
        const char *zc = zonename;
        const char *lc = logname;
 
@@ -2068,6 +2108,101 @@ compute_element_size(vm_size_t requested_size)
        return element_size;
 }
 
+#if KASAN_ZALLOC
+
+/*
+ * Called from zinit().
+ *
+ * Fixes up the zone's element size to incorporate the redzones.
+ */
+static void
+kasan_update_element_size_for_redzone(
+       zone_t          zone,           /* the zone that needs to be updated */
+       vm_size_t       *size,          /* requested zone element size */
+       vm_size_t       *max,           /* maximum memory to use */
+       const char      *name)          /* zone name */
+{
+	/* Expand the zone allocation size to include the redzones. For page-multiple
+	 * zones add a full guard page because they likely require alignment. kalloc
+	 * and fakestack handle their own KASan state, so ignore those zones. */
+       /* XXX: remove this when zinit_with_options() is a thing */
+       const char *kalloc_name = "kalloc.";
+       const char *fakestack_name = "fakestack.";
+       if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
+               zone->kasan_redzone = 0;
+       } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
+               zone->kasan_redzone = 0;
+       } else {
+               if ((*size % PAGE_SIZE) != 0) {
+                       zone->kasan_redzone = KASAN_GUARD_SIZE;
+               } else {
+                       zone->kasan_redzone = PAGE_SIZE;
+               }
+               *max = (*max / *size) * (*size + zone->kasan_redzone * 2);
+               *size += zone->kasan_redzone * 2;
+       }
+}
+
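
A worked sketch of the size fix-up this helper performs, using hypothetical constants (TOY_GUARD_SIZE merely stands in for KASAN_GUARD_SIZE): the element grows by one redzone on each side, and the zone's memory ceiling is rescaled so it still covers the same number of (now larger) elements.

#include <assert.h>
#include <stddef.h>

#define TOY_PAGE_SIZE	4096u
#define TOY_GUARD_SIZE	16u	/* stand-in for KASAN_GUARD_SIZE */

static void toy_redzone_fixup(size_t *size, size_t *max)
{
	/* page-multiple elements get a whole guard page, others a small redzone */
	size_t redzone = (*size % TOY_PAGE_SIZE) ? TOY_GUARD_SIZE : TOY_PAGE_SIZE;
	*max  = (*max / *size) * (*size + 2 * redzone);	/* same element count, bigger elements */
	*size += 2 * redzone;
}

int main(void)
{
	size_t size = 256, max = 256 * 1024;	/* 1024 elements of 256 bytes */
	toy_redzone_fixup(&size, &max);
	assert(size == 256 + 2 * 16);		/* one redzone on each side */
	assert(max  == 1024 * (256 + 2 * 16));	/* ceiling still covers 1024 elements */
	return 0;
}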
+/*
+ * Called from zalloc_internal() to fix up the address of the newly
+ * allocated element.
+ *
+ * Returns the element address skipping over the redzone on the left.
+ */
+static vm_offset_t
+kasan_fixup_allocated_element_address(
+       zone_t                  zone,   /* the zone the element belongs to */
+       vm_offset_t             addr)   /* address of the element, including the redzone */
+{
+       /* Fixup the return address to skip the redzone */
+       if (zone->kasan_redzone) {
+               addr = kasan_alloc(addr, zone->elem_size,
+                               zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
+       }
+       return addr;
+}
+
+/*
+ * Called from zfree() to add the element being freed to the KASan quarantine.
+ *
+ * Returns true if the newly-freed element made it into the quarantine without
+ * displacing another, false otherwise. In the latter case, addrp points to the
+ * address of the displaced element, which will be freed by the zone.
+ */
+static bool
+kasan_quarantine_freed_element(
+       zone_t          *zonep,         /* the zone the element is being freed to */
+       void            **addrp)        /* address of the element being freed */
+{
+       zone_t zone = *zonep;
+       void *addr = *addrp;
+
+       /*
+        * Resize back to the real allocation size and hand off to the KASan
+        * quarantine. `addr` may then point to a different allocation, if the
+        * current element replaced another in the quarantine. The zone then
+        * takes ownership of the swapped out free element.
+        */
+       vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
+       vm_size_t sz = usersz;
+
+       if (addr && zone->kasan_redzone) {
+               kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
+               addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
+               assert(sz == zone->elem_size);
+       }
+       if (addr && zone->kasan_quarantine) {
+               kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, zonep, usersz, true);
+               if (!addr) {
+                       return TRUE;
+               }
+       }
+       *addrp = addr;
+       return FALSE;
+}
+
+#endif /* KASAN_ZALLOC */
+
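
A sketch of the caller contract the helper above establishes, using a toy one-slot quarantine and hypothetical names (the real KASan quarantine is larger and per-heap): a free either parks the element in the quarantine, or receives back an older, displaced element that must then really be freed.

#include <stdbool.h>
#include <stdlib.h>

static void *toy_slot;			/* one-element quarantine */

static bool toy_quarantine(void **addrp)
{
	void *displaced = toy_slot;
	toy_slot = *addrp;		/* park the newly freed element */
	if (displaced == NULL) {
		return true;		/* nothing displaced; caller is done */
	}
	*addrp = displaced;		/* caller must free the element we evicted */
	return false;
}

static void toy_zfree(void *addr)
{
	if (toy_quarantine(&addr)) {
		return;			/* mirrors the early return added to zfree() */
	}
	free(addr);			/* addr now points at the displaced element */
}

int main(void)
{
	toy_zfree(malloc(32));		/* parked, not released yet */
	toy_zfree(malloc(32));		/* parks this one, really frees the first */
	return 0;
}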
 /*
  *     zinit initializes a new zone.  The zone data structures themselves
  *     are stored in a zone, which is initially a static structure that
@@ -2138,25 +2273,7 @@ zinit(
        simple_unlock(&all_zones_lock);
 
 #if KASAN_ZALLOC
-       /* Expand the zone allocation size to include the redzones. For page-multiple
-        * zones add a full guard page because they likely require alignment. kalloc
-        * and fakestack handles its own KASan state, so ignore those zones. */
-       /* XXX: remove this when zinit_with_options() is a thing */
-       const char *kalloc_name = "kalloc.";
-       const char *fakestack_name = "fakestack.";
-       if (strncmp(name, kalloc_name, strlen(kalloc_name)) == 0) {
-               z->kasan_redzone = 0;
-       } else if (strncmp(name, fakestack_name, strlen(fakestack_name)) == 0) {
-               z->kasan_redzone = 0;
-       } else {
-               if ((size % PAGE_SIZE) != 0) {
-                       z->kasan_redzone = KASAN_GUARD_SIZE;
-               } else {
-                       z->kasan_redzone = PAGE_SIZE;
-               }
-               max = (max / size) * (size + z->kasan_redzone * 2);
-               size += z->kasan_redzone * 2;
-       }
+       kasan_update_element_size_for_redzone(z, &size, &max, name);
 #endif
 
        max = round_page(max);
@@ -2213,6 +2330,7 @@ zinit(
        z->zp_count = 0;
        z->kasan_quarantine = TRUE;
        z->zone_valid = TRUE;
+       z->cpu_cache_enabled = FALSE;
 
 #if CONFIG_ZLEAKS
        z->zleak_capture = 0;
@@ -2367,6 +2485,13 @@ zinit(
        gzalloc_zone_init(z);
 #endif
 
+#if    CONFIG_ZCACHE
+       /* Check if boot-arg specified it should have a cache */
+       if (cache_all_zones || track_this_zone(name, cache_zone_name)) {
+               zone_change(z, Z_CACHING_ENABLED, TRUE);
+       }
+#endif
+
        return(z);
 }
 unsigned       zone_replenish_loops, zone_replenish_wakeups, zone_replenish_wakeups_initiated, zone_replenish_throttle_count;
@@ -2486,6 +2611,13 @@ zdestroy(zone_t z)
 #endif
        unlock_zone(z);
 
+#if CONFIG_ZCACHE
+       /* Drain the per-cpu caches if caching is enabled for the zone. */
+       if (zone_caching_enabled(z)) {
+               panic("zdestroy: Zone caching enabled for zone %s", z->zone_name);
+       }
+#endif /* CONFIG_ZCACHE */
+
        /* Dump all the free elements */
        drop_free_elements(z);
 
@@ -2545,6 +2677,7 @@ zcram_metadata_init(vm_offset_t newmem, vm_size_t size, struct zone_page_metadat
        return;
 }
 
+
 static void
 random_free_to_zone(
                        zone_t          zone,
@@ -2558,7 +2691,7 @@ random_free_to_zone(
        vm_size_t       elem_size;
        int             index;
 
-       assert(element_count  <= ZONE_CHUNK_MAXELEMENTS);
+       assert(element_count && element_count <= ZONE_CHUNK_MAXELEMENTS);
        elem_size = zone->elem_size;
        last_element_offset = first_element_offset + ((element_count * elem_size) - elem_size);
        for (index = 0; index < element_count; index++) {
@@ -2668,11 +2801,11 @@ zcram(
                        } else {
                                first_element_offset = zone_page_metadata_size + (ZONE_ELEMENT_ALIGNMENT - (zone_page_metadata_size % ZONE_ELEMENT_ALIGNMENT));
                        }
-                       element_count = (int)((PAGE_SIZE - first_element_offset) / elem_size);
+                       element_count = (unsigned int)((PAGE_SIZE - first_element_offset) / elem_size);
                        random_free_to_zone(zone, newmem, first_element_offset, element_count, entropy_buffer);                         
                }
        } else {
-               element_count = (int)(size / elem_size);
+               element_count = (unsigned int)(size / elem_size);
                random_free_to_zone(zone, newmem, 0, element_count, entropy_buffer);    
        }
        unlock_zone(zone);
@@ -2742,9 +2875,13 @@ zone_bootstrap(void)
        /* should zlog log to debug zone corruption instead of leaks? */
        if (PE_parse_boot_argn("-zc", temp_buf, sizeof(temp_buf))) {
                corruption_debug_flag = TRUE;
-       }       
+       }
 
 #if DEBUG || DEVELOPMENT
+       /* should perform zone element size checking in copyin/copyout? */
+       if (PE_parse_boot_argn("-no-copyio-zalloc-check", temp_buf, sizeof(temp_buf))) {
+               copyio_zalloc_check = FALSE;
+       }
 #if VM_MAX_TAG_ZONES
        /* enable tags for zones that ask for  */
        if (PE_parse_boot_argn("-zt", temp_buf, sizeof(temp_buf))) {
@@ -2777,6 +2914,19 @@ zone_bootstrap(void)
 
        lck_attr_setdefault(&zone_metadata_lock_attr); 
        lck_mtx_init_ext(&zone_metadata_region_lck, &zone_metadata_region_lck_ext, &zone_locks_grp, &zone_metadata_lock_attr);
+
+#if    CONFIG_ZCACHE
+	/* zcc_enable_for_zone_name=<zone>: enable per-cpu zone caching for <zone>. */
+	if (PE_parse_boot_arg_str("zcc_enable_for_zone_name", cache_zone_name, sizeof(cache_zone_name))) {
+		printf("zcache: caching enabled for zone %s\n", cache_zone_name);
+	}
+
+	/* -zcache_all: enable per-cpu zone caching for all zones, overrides 'zcc_enable_for_zone_name'. */
+	if (PE_parse_boot_argn("-zcache_all", temp_buf, sizeof(temp_buf))) {
+		cache_all_zones = TRUE;
+		printf("zcache: caching enabled for all zones\n");
+	}
+#endif /* CONFIG_ZCACHE */
 }
 
 /*
@@ -2854,8 +3004,8 @@ static void kill_process_in_largest_zone(void)
         * vm_map_entry_zone as the largest. This lets us target a specific process to jetsam to quickly recover from the zone map bloat.
         */
        if (largest_zone == vm_object_zone) {
-               int vm_object_zone_count = vm_object_zone->count;
-               int vm_map_entry_zone_count = vm_map_entry_zone->count;
+               unsigned int vm_object_zone_count = vm_object_zone->count;
+               unsigned int vm_map_entry_zone_count = vm_map_entry_zone->count;
                /* Is the VM map entries zone count >= 98% of the VM objects zone count? */
                if (vm_map_entry_zone_count >= ((vm_object_zone_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
                        largest_zone = vm_map_entry_zone;
@@ -2904,6 +3054,7 @@ zone_init(
 #if    CONFIG_GZALLOC
        gzalloc_init(max_zonemap_size);
 #endif
+
        /*
         * Setup garbage collection information:
         */
@@ -2953,13 +3104,42 @@ zone_init(
                zone_map_jetsam_limit = jetsam_limit_temp;
 }
 
-extern volatile SInt32 kfree_nop_count;
-
 #pragma mark -
 #pragma mark zalloc_canblock
 
 extern boolean_t early_boot_complete;
 
+void
+zalloc_poison_element(boolean_t check_poison, zone_t zone, vm_offset_t addr)
+{
+       vm_offset_t     inner_size = zone->elem_size;
+       if (__improbable(check_poison && addr)) {
+               vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
+               vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);
+
+               for ( ; element_cursor < backup ; element_cursor++)
+                       if (__improbable(*element_cursor != ZP_POISON))
+                               zone_element_was_modified_panic(zone,
+                                                               addr,
+                                                               *element_cursor,
+                                                               ZP_POISON,
+                                                               ((vm_offset_t)element_cursor) - addr);
+       }
+
+       if (addr) {
+               /*
+                * Clear out the old next pointer and backup to avoid leaking the cookie
+                * and so that only values on the freelist have a valid cookie
+                */
+
+               vm_offset_t *primary  = (vm_offset_t *) addr;
+               vm_offset_t *backup   = get_backup_ptr(inner_size, primary);
+
+               *primary = ZP_POISON;
+               *backup  = ZP_POISON;
+       }
+}
+
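
A user-space sketch of the check zalloc_poison_element() performs, with hypothetical toy_* names: every word of a poisoned free element, apart from the freelist pointer at the start and its backup at the end, must still contain the poison pattern when the element is handed out again; the two pointer words are then scrubbed so the freelist cookie does not leak.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define TOY_POISON 0xdeadbeefUL		/* same value as ZP_POISON */

void toy_check_poison(uintptr_t *elem, size_t nwords)
{
	/* word 0 holds the freelist pointer, word nwords-1 its backup;
	 * everything in between should be untouched poison */
	for (size_t i = 1; i < nwords - 1; i++) {
		if (elem[i] != TOY_POISON) {
			fprintf(stderr, "element modified at offset %zu while free\n",
			    i * sizeof(uintptr_t));
			abort();	/* zone_element_was_modified_panic() in the kernel */
		}
	}
	elem[0]          = TOY_POISON;	/* scrub the cookie words before handing out */
	elem[nwords - 1] = TOY_POISON;
}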
 /*
  *     zalloc returns an element from the specified zone.
  */
@@ -2978,7 +3158,7 @@ zalloc_internal(
        vm_offset_t     addr = 0;
        kern_return_t   retval;
        uintptr_t       zbt[MAX_ZTRACE_DEPTH];  /* used in zone leak logging and zone leak detection */
-       int             numsaved = 0;
+       unsigned int            numsaved = 0;
        boolean_t       zone_replenish_wakeup = FALSE, zone_alloc_throttle = FALSE;
        thread_t thr = current_thread();
        boolean_t       check_poison = FALSE;
@@ -3033,6 +3213,21 @@ zalloc_internal(
        if (__improbable(zone->tags)) vm_tag_will_update_zone(tag, zone->tag_zone_index);
 #endif /* VM_MAX_TAG_ZONES */
 
+#if CONFIG_ZCACHE
+       if (__probable(addr == 0)) {
+               if (zone_caching_enabled(zone)) {
+                       addr = zcache_alloc_from_cpu_cache(zone);
+                       if (addr) {
+#if KASAN_ZALLOC
+                               addr = kasan_fixup_allocated_element_address(zone, addr);
+#endif
+                               DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
+                               return((void *)addr);
+                       }
+               }
+       }
+#endif /* CONFIG_ZCACHE */
+
        lock_zone(zone);
        assert(zone->zone_valid);
 
@@ -3220,7 +3415,7 @@ zalloc_internal(
                                                        (unsigned long)zone_largest->cur_size, zone_largest->count);
 
                                                }
-                                               panic("zalloc: \"%s\" (%d elements) retry fail %d, kfree_nop_count: %d", zone->zone_name, zone->count, retval, (int)kfree_nop_count);
+                                               panic("zalloc: \"%s\" (%d elements) retry fail %d", zone->zone_name, zone->count, retval);
                                        }
                                } else {
                                        break;
@@ -3288,43 +3483,19 @@ zalloc_internal(
 
        unlock_zone(zone);
 
-       vm_offset_t     inner_size = zone->elem_size;
-
        if (__improbable(DO_LOGGING(zone) && addr)) {
                btlog_add_entry(zone->zlog_btlog, (void *)addr, ZOP_ALLOC, (void **)zbt, numsaved);
        }
 
-       if (__improbable(check_poison && addr)) {
-               vm_offset_t *element_cursor  = ((vm_offset_t *) addr) + 1;
-               vm_offset_t *backup  = get_backup_ptr(inner_size, (vm_offset_t *) addr);
-
-               for ( ; element_cursor < backup ; element_cursor++)
-                       if (__improbable(*element_cursor != ZP_POISON))
-                               zone_element_was_modified_panic(zone,
-                                                               addr,
-                                                               *element_cursor,
-                                                               ZP_POISON,
-                                                               ((vm_offset_t)element_cursor) - addr);
-       }
+       zalloc_poison_element(check_poison, zone, addr);
 
        if (addr) {
-               /*
-                * Clear out the old next pointer and backup to avoid leaking the cookie
-                * and so that only values on the freelist have a valid cookie
-                */
-
-               vm_offset_t *primary  = (vm_offset_t *) addr;
-               vm_offset_t *backup   = get_backup_ptr(inner_size, primary);
-
-               *primary = ZP_POISON;
-               *backup  = ZP_POISON;
-
 #if DEBUG || DEVELOPMENT
                if (__improbable(leak_scan_debug_flag && !(zone->elem_size & (sizeof(uintptr_t) - 1)))) {
-                       int count, idx;
+                       unsigned int count, idx;
                        /* Fill element, from tail, with backtrace in reverse order */
                        if (numsaved == 0) numsaved = backtrace(zbt, MAX_ZTRACE_DEPTH);
-                       count = (int) (zone->elem_size / sizeof(uintptr_t));
+                       count = (unsigned int)(zone->elem_size / sizeof(uintptr_t));
                        if (count >= numsaved) count = numsaved - 1;
                        for (idx = 0; idx < count; idx++) ((uintptr_t *)addr)[count - 1 - idx] = zbt[idx + 1];
                }
@@ -3333,12 +3504,9 @@ zalloc_internal(
 
        TRACE_MACHLEAKS(ZALLOC_CODE, ZALLOC_CODE_2, zone->elem_size, addr);
 
+
 #if KASAN_ZALLOC
-       /* Fixup the return address to skip the redzone */
-       if (zone->kasan_redzone) {
-               addr = kasan_alloc(addr, zone->elem_size,
-                               zone->elem_size - 2 * zone->kasan_redzone, zone->kasan_redzone);
-       }
+       addr = kasan_fixup_allocated_element_address(zone, addr);
 #endif
 
        DTRACE_VM2(zalloc, zone_t, zone, void*, addr);
@@ -3376,6 +3544,22 @@ zalloc_canblock(zone_t zone, boolean_t canblock)
     return (zalloc_internal(zone, canblock, FALSE, 0, VM_KERN_MEMORY_NONE));
 }
 
+void *
+zalloc_attempt(zone_t zone)
+{
+       boolean_t check_poison = FALSE;
+       vm_offset_t addr = try_alloc_from_zone(zone, VM_KERN_MEMORY_NONE, &check_poison);
+       zalloc_poison_element(check_poison, zone, addr);
+       return (void *)addr;
+}
+
+void
+zfree_direct(zone_t zone, vm_offset_t elem)
+{
+       boolean_t       poison = zfree_poison_element(zone, elem);
+       free_to_zone(zone, elem, poison);
+}
+
 
 void
 zalloc_async(
@@ -3467,6 +3651,41 @@ static void zone_check_freelist(zone_t zone, vm_offset_t elem)
        }
 }
 
+boolean_t
+zfree_poison_element(zone_t zone, vm_offset_t elem)
+{
+       boolean_t       poison = FALSE;
+       if (zp_factor != 0 || zp_tiny_zone_limit != 0) {
+               /*
+                * Poison the memory before it ends up on the freelist to catch
+                * use-after-free and use of uninitialized memory
+                *
+                * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
+                * Also poison larger elements periodically
+                */
+
+               vm_offset_t     inner_size = zone->elem_size;
+
+               uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
+
+               if (inner_size <= zp_tiny_zone_limit)
+                       poison = TRUE;
+               else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
+                       poison = TRUE;
+
+               if (__improbable(poison)) {
+
+                       /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
+                       /* Poison everything but primary and backup */
+                       vm_offset_t *element_cursor  = ((vm_offset_t *) elem) + 1;
+                       vm_offset_t *backup   = get_backup_ptr(inner_size, (vm_offset_t *)elem);
+
+                       for ( ; element_cursor < backup; element_cursor++)
+                               *element_cursor = ZP_POISON;
+               }
+       }
+       return poison;
+}
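
A sketch of the sampling decision above, with hypothetical default values (the real limits come from the zp-factor/zp-scale boot-args and the tiny-zone cutoff): tiny elements are always poisoned, and larger elements are poisoned roughly once every zp_factor + (size >> zp_scale) frees, so the cost scales down as elements get bigger.

#include <stdbool.h>
#include <stdint.h>

static uint32_t toy_zp_factor  = 16;	/* ZP_DEFAULT_SAMPLING_FACTOR */
static uint32_t toy_zp_scale   = 4;	/* ZP_DEFAULT_SCALE_FACTOR */
static uint32_t toy_tiny_limit = 16;	/* hypothetical tiny-zone cutoff, in bytes */

bool toy_should_poison(uint32_t elem_size, uint32_t *counter)
{
	if (elem_size <= toy_tiny_limit) {
		return true;				/* tiny elements: always poison */
	}
	uint32_t sample_factor = toy_zp_factor + (elem_size >> toy_zp_scale);
	/* e.g. a 512-byte element gives 16 + 32 = 48, i.e. poison ~1 in 48 frees;
	 * the kernel's sample_counter() is probabilistic but has the same effect */
	return (++(*counter) % sample_factor) == 0;
}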
 void
 zfree(
        zone_t  zone,
@@ -3474,7 +3693,7 @@ zfree(
 {
        vm_offset_t     elem = (vm_offset_t) addr;
        uintptr_t       zbt[MAX_ZTRACE_DEPTH];                  /* only used if zone logging is enabled via boot-args */
-       int             numsaved = 0;
+       unsigned int            numsaved = 0;
        boolean_t       gzfreed = FALSE;
        boolean_t       poison = FALSE;
 #if VM_MAX_TAG_ZONES
@@ -3483,24 +3702,9 @@ zfree(
 
        assert(zone != ZONE_NULL);
        DTRACE_VM2(zfree, zone_t, zone, void*, addr);
-
 #if KASAN_ZALLOC
-       /*
-        * Resize back to the real allocation size and hand off to the KASan
-        * quarantine. `addr` may then point to a different allocation.
-        */
-       vm_size_t usersz = zone->elem_size - 2 * zone->kasan_redzone;
-       vm_size_t sz = usersz;
-       if (addr && zone->kasan_redzone) {
-               kasan_check_free((vm_address_t)addr, usersz, KASAN_HEAP_ZALLOC);
-               addr = (void *)kasan_dealloc((vm_address_t)addr, &sz);
-               assert(sz == zone->elem_size);
-       }
-       if (addr && zone->kasan_quarantine) {
-               kasan_free(&addr, &sz, KASAN_HEAP_ZALLOC, &zone, usersz, true);
-               if (!addr) {
-                       return;
-               }
+       if (kasan_quarantine_freed_element(&zone, &addr)) {
+               return;
        }
        elem = (vm_offset_t)addr;
 #endif
@@ -3536,34 +3740,8 @@ zfree(
                panic("zfree: non-allocated memory in collectable zone!");
        }
 
-       if ((zp_factor != 0 || zp_tiny_zone_limit != 0) && !gzfreed) {
-               /*
-                * Poison the memory before it ends up on the freelist to catch
-                * use-after-free and use of uninitialized memory
-                *
-                * Always poison tiny zones' elements (limit is 0 if -no-zp is set)
-                * Also poison larger elements periodically
-                */
-
-               vm_offset_t     inner_size = zone->elem_size;
-
-               uint32_t sample_factor = zp_factor + (((uint32_t)inner_size) >> zp_scale);
-
-               if (inner_size <= zp_tiny_zone_limit)
-                       poison = TRUE;
-               else if (zp_factor != 0 && sample_counter(&zone->zp_count, sample_factor) == TRUE)
-                       poison = TRUE;
-
-               if (__improbable(poison)) {
-
-                       /* memset_pattern{4|8} could help make this faster: <rdar://problem/4662004> */
-                       /* Poison everything but primary and backup */
-                       vm_offset_t *element_cursor  = ((vm_offset_t *) elem) + 1;
-                       vm_offset_t *backup   = get_backup_ptr(inner_size, (vm_offset_t *)elem);
-
-                       for ( ; element_cursor < backup; element_cursor++)
-                               *element_cursor = ZP_POISON;
-               }
+       if (!gzfreed) {
+               poison = zfree_poison_element(zone, elem);
        }
 
        /*
@@ -3589,6 +3767,14 @@ zfree(
                }
        }
 
+#if CONFIG_ZCACHE
+               if (zone_caching_enabled(zone)) {
+                       int __assert_only ret = zcache_free_to_cpu_cache(zone, addr);
+                       assert(ret != FALSE);
+                       return;
+               }
+#endif /* CONFIG_ZCACHE */
+
        lock_zone(zone);
        assert(zone->zone_valid);
 
@@ -3607,12 +3793,10 @@ zfree(
                free_to_zone(zone, elem, poison);
        }
 
-#if MACH_ASSERT
-       if (zone->count < 0)
+       if (__improbable(zone->count < 0)) {
                panic("zfree: zone count underflow in zone %s while freeing element %p, possible cause: double frees or freeing memory that did not come from this zone",
                zone->zone_name, addr);
-#endif
-       
+       }
 
 #if CONFIG_ZLEAKS
        /*
@@ -3699,6 +3883,18 @@ zone_change(
                case Z_KASAN_QUARANTINE:
                        zone->kasan_quarantine = value;
                        break;
+               case Z_CACHING_ENABLED:
+#if    CONFIG_ZCACHE
+                       if (value == TRUE && use_caching) {
+                               if (zcache_ready()) {
+                                       zcache_init(zone);
+                               } else {
+                                       zone->cpu_cache_enable_when_ready = TRUE;
+                               }
+
+                       }
+#endif
+                       break;
                default:
                        panic("Zone_change: Wrong Item Type!");
                        /* break; */
@@ -3731,7 +3927,7 @@ void
 drop_free_elements(zone_t z)
 {
        vm_size_t                                       elt_size, size_freed;
-       int                                                     total_freed_pages = 0;
+       unsigned int                                                    total_freed_pages = 0;
        uint64_t                                        old_all_free_count;
        struct zone_page_metadata       *page_meta;
        queue_head_t                            page_meta_head;
@@ -3834,7 +4030,11 @@ zone_gc(boolean_t consider_jetsams)
                if (!z->collectable) {
                        continue;
                }
-               
+#if CONFIG_ZCACHE
+               if (zone_caching_enabled(z)) {
+                       zcache_drain_depot(z);
+               }
+#endif /* CONFIG_ZCACHE */
                if (queue_empty(&z->pages.all_free)) {
                        continue;
                }
@@ -3873,6 +4073,40 @@ consider_zone_gc(boolean_t consider_jetsams)
                zone_gc(consider_jetsams);
 }
 
+/*
+ * Creates a vm_map_copy_t to return to the caller of mach_* MIG calls
+ * requesting zone information.
+ * Frees unused pages towards the end of the region, and zero'es out unused
+ * space on the last page.
+ */
+vm_map_copy_t
+create_vm_map_copy(
+       vm_offset_t             start_addr,
+       vm_size_t               total_size,
+       vm_size_t               used_size)
+{
+       kern_return_t   kr;
+       vm_offset_t             end_addr;
+       vm_size_t               free_size;
+       vm_map_copy_t   copy;
+
+       if (used_size != total_size) {
+               end_addr = start_addr + used_size;
+               free_size = total_size - (round_page(end_addr) - start_addr);
+
+               if (free_size >= PAGE_SIZE) {
+                       kmem_free(ipc_kernel_map,
+                                       round_page(end_addr), free_size);
+               }
+               bzero((char *) end_addr, round_page(end_addr) - end_addr);
+       }
+
+       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)start_addr,
+                          (vm_map_size_t)used_size, TRUE, &copy);
+       assert(kr == KERN_SUCCESS);
+
+       return copy;
+}
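
A worked sketch of the trimming arithmetic above with hypothetical numbers: given a three-page buffer of which only 5000 bytes were used, the last fully unused page is handed back (kmem_free() in the kernel) and the tail of the partially used page is zeroed before the region is copied out.

#include <assert.h>
#include <stddef.h>

#define TOY_PAGE_SIZE 4096u
#define toy_round_page(x) (((x) + TOY_PAGE_SIZE - 1) & ~(size_t)(TOY_PAGE_SIZE - 1))

int main(void)
{
	size_t start      = 0x10000;			/* page aligned */
	size_t total_size = 3 * TOY_PAGE_SIZE;		/* 12288 bytes allocated */
	size_t used_size  = 5000;			/* only part of the second page used */

	size_t end_addr  = start + used_size;
	size_t free_size = total_size - (toy_round_page(end_addr) - start);

	assert(toy_round_page(end_addr) - start == 2 * TOY_PAGE_SIZE);
	assert(free_size == TOY_PAGE_SIZE);		/* the third page goes back via kmem_free() */
	/* bzero() then clears bytes 5000..8191 before vm_map_copyin() */
	return 0;
}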
 
 boolean_t
 get_zone_info(
@@ -3960,15 +4194,13 @@ mach_memory_info(
        vm_offset_t             memory_info_addr;
        vm_size_t               memory_info_size;
        vm_size_t               memory_info_vmsize;
-        unsigned int           num_info;
+       unsigned int            num_info;
 
        unsigned int            max_zones, used_zones, i;
        mach_zone_name_t        *zn;
        mach_zone_info_t        *zi;
        kern_return_t           kr;
        
-       vm_size_t               used;
-       vm_map_copy_t           copy;
        uint64_t                zones_collectable_bytes = 0;
 
        if (host == HOST_NULL)
@@ -4018,42 +4250,10 @@ mach_memory_info(
                zi++;
        }
 
-       used = used_zones * sizeof *names;
-       if (used != names_size) {
-               vm_offset_t names_addr_end = names_addr + used;
-               vm_size_t free_size = names_size - (round_page(names_addr_end) - names_addr);
-
-               if (free_size >= PAGE_SIZE) {
-                       kmem_free(ipc_kernel_map,
-                                       round_page(names_addr_end), free_size);
-               }
-               bzero((char *) names_addr_end, round_page(names_addr_end) - names_addr_end);
-       }
-
-       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)names_addr,
-                          (vm_map_size_t)used, TRUE, &copy);
-       assert(kr == KERN_SUCCESS);
-
-       *namesp = (mach_zone_name_t *) copy;
+       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, used_zones * sizeof *names);
        *namesCntp = used_zones;
 
-       used = used_zones * sizeof *info;
-       if (used != info_size) {
-               vm_offset_t info_addr_end = info_addr + used;
-               vm_size_t free_size = info_size - (round_page(info_addr_end) - info_addr);
-
-               if (free_size >= PAGE_SIZE) {
-                       kmem_free(ipc_kernel_map,
-                                       round_page(info_addr_end), free_size);
-               }
-               bzero((char *) info_addr_end, round_page(info_addr_end) - info_addr_end);
-       }
-
-       kr = vm_map_copyin(ipc_kernel_map, (vm_map_address_t)info_addr,
-                          (vm_map_size_t)used, TRUE, &copy);
-       assert(kr == KERN_SUCCESS);
-
-       *infop = (mach_zone_info_t *) copy;
+       *infop = (mach_zone_info_t *) create_vm_map_copy(info_addr, info_size, used_zones * sizeof *info);
        *infoCntp = used_zones;
        
        num_info = 0;
@@ -4061,6 +4261,7 @@ mach_memory_info(
 
        if (memoryInfop && memoryInfoCntp)
        {
+               vm_map_copy_t           copy;
                num_info = vm_page_diagnose_estimate();
                memory_info_size = num_info * sizeof(*memory_info);
                memory_info_vmsize = round_page(memory_info_size);
@@ -4121,7 +4322,7 @@ mach_zone_info_for_zone(
                assert(z != ZONE_NULL);
 
                /* Find the requested zone by name */
-               if (!strncmp(name.mzn_name, z->zone_name, strlen(z->zone_name))) {
+               if (track_this_zone(z->zone_name, name.mzn_name)) {
                        zone_ptr = z;
                        break;
                }
@@ -4181,6 +4382,143 @@ get_zones_collectable_bytes(void)
        return zones_collectable_bytes;
 }
 
+kern_return_t
+mach_zone_get_zlog_zones(
+       host_priv_t                             host,
+       mach_zone_name_array_t  *namesp,
+       mach_msg_type_number_t  *namesCntp)
+{
+#if DEBUG || DEVELOPMENT
+       unsigned int max_zones, logged_zones, i;
+       kern_return_t kr;
+       zone_t zone_ptr;
+       mach_zone_name_t *names;
+       vm_offset_t names_addr;
+       vm_size_t names_size;
+
+       if (host == HOST_NULL)
+               return KERN_INVALID_HOST;
+
+       if (namesp == NULL || namesCntp == NULL)
+               return KERN_INVALID_ARGUMENT;
+
+       simple_lock(&all_zones_lock);
+       max_zones = (unsigned int)(num_zones);
+       simple_unlock(&all_zones_lock);
+
+       names_size = round_page(max_zones * sizeof *names);
+       kr = kmem_alloc_pageable(ipc_kernel_map,
+                                &names_addr, names_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS)
+               return kr;
+       names = (mach_zone_name_t *) names_addr;
+
+       zone_ptr = ZONE_NULL;
+       logged_zones = 0;
+       for (i = 0; i < max_zones; i++) {
+               zone_t z = &(zone_array[i]);
+               assert(z != ZONE_NULL);
+
+               /* Copy out the zone name if zone logging is enabled */
+               if(z->zlog_btlog) {
+                       get_zone_info(z, &names[logged_zones], NULL);
+                       logged_zones++;
+               }
+       }
+
+       *namesp = (mach_zone_name_t *) create_vm_map_copy(names_addr, names_size, logged_zones * sizeof *names);
+       *namesCntp = logged_zones;
+
+       return KERN_SUCCESS;
+
+#else /* DEBUG || DEVELOPMENT */
+#pragma unused(host, namesp, namesCntp)
+       return KERN_FAILURE;
+#endif /* DEBUG || DEVELOPMENT */
+}
+
+kern_return_t
+mach_zone_get_btlog_records(
+       host_priv_t                             host,
+       mach_zone_name_t                name,
+       zone_btrecord_array_t   *recsp,
+       mach_msg_type_number_t  *recsCntp)
+{
+#if DEBUG || DEVELOPMENT
+       unsigned int max_zones, i, numrecs = 0;
+       zone_btrecord_t *recs;
+       kern_return_t kr;
+       zone_t zone_ptr;
+       vm_offset_t recs_addr;
+       vm_size_t recs_size;
+
+       if (host == HOST_NULL)
+               return KERN_INVALID_HOST;
+
+       if (recsp == NULL || recsCntp == NULL)
+               return KERN_INVALID_ARGUMENT;
+
+       simple_lock(&all_zones_lock);
+       max_zones = (unsigned int)(num_zones);
+       simple_unlock(&all_zones_lock);
+
+       zone_ptr = ZONE_NULL;
+       for (i = 0; i < max_zones; i++) {
+               zone_t z = &(zone_array[i]);
+               assert(z != ZONE_NULL);
+
+               /* Find the requested zone by name */
+               if (track_this_zone(z->zone_name, name.mzn_name)) {
+                       zone_ptr = z;
+                       break;
+               }
+       }
+
+       /* No zones found with the requested zone name */
+       if (zone_ptr == ZONE_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       /* Logging not turned on for the requested zone */
+       if (!DO_LOGGING(zone_ptr)) {
+               return KERN_FAILURE;
+       }
+
+       /* Allocate memory for btlog records */
+       numrecs = (unsigned int)(get_btlog_records_count(zone_ptr->zlog_btlog));
+       recs_size = round_page(numrecs * sizeof *recs);
+
+       kr = kmem_alloc_pageable(ipc_kernel_map, &recs_addr, recs_size, VM_KERN_MEMORY_IPC);
+       if (kr != KERN_SUCCESS) {
+               return kr;
+       }
+
+       /*
+        * We will call get_btlog_records() below which populates this region while holding a spinlock
+        * (the btlog lock). So these pages need to be wired.
+        */
+       kr = vm_map_wire_kernel(ipc_kernel_map, recs_addr, recs_addr + recs_size,
+                       VM_PROT_READ|VM_PROT_WRITE, VM_KERN_MEMORY_IPC, FALSE);
+       assert(kr == KERN_SUCCESS);
+
+       recs = (zone_btrecord_t *)recs_addr;
+       get_btlog_records(zone_ptr->zlog_btlog, recs, &numrecs);
+
+       kr = vm_map_unwire(ipc_kernel_map, recs_addr, recs_addr + recs_size, FALSE);
+       assert(kr == KERN_SUCCESS);
+
+       *recsp = (zone_btrecord_t *) create_vm_map_copy(recs_addr, recs_size, numrecs * sizeof *recs);
+       *recsCntp = numrecs;
+
+       return KERN_SUCCESS;
+
+#else /* DEBUG || DEVELOPMENT */
+#pragma unused(host, name, recsp, recsCntp)
+       return KERN_FAILURE;
+#endif /* DEBUG || DEVELOPMENT */
+}
+
+
 #if DEBUG || DEVELOPMENT
 
 kern_return_t
@@ -4235,6 +4573,8 @@ mach_memory_info_check(void)
     return (kr);
 }
 
+extern boolean_t (* volatile consider_buffer_cache_collect)(int);
+
 #endif /* DEBUG || DEVELOPMENT */
 
 kern_return_t
@@ -4245,6 +4585,10 @@ mach_zone_force_gc(
                return KERN_INVALID_HOST;
 
 #if DEBUG || DEVELOPMENT
+       /* Callout to buffer cache GC to drop elements in the apfs zones */
+       if (consider_buffer_cache_collect != NULL) {
+               (void)(*consider_buffer_cache_collect)(0);
+       }
        consider_zone_gc(FALSE);
 #endif /* DEBUG || DEVELOPMENT */
        return (KERN_SUCCESS);
@@ -4445,7 +4789,7 @@ kdp_is_in_zone(void *addr, const char *zone_name)
 boolean_t
 run_zone_test(void)
 {
-       int i = 0, max_iter = 5;
+       unsigned int i = 0, max_iter = 5;
        void * test_ptr;
        zone_t test_zone;
 
index 6a585b83fc87ee92b868e96e37fe1e2ff4d6f91d..b45020f11f5f1eb6c3d83b7d496d4449a4f38ff9 100644 (file)
@@ -68,6 +68,7 @@
 #define _KERN_ZALLOC_H_
 
 #include <mach/machine/vm_types.h>
+#include <mach_debug/zone_info.h>
 #include <kern/kern_types.h>
 #include <sys/cdefs.h>
 
 #include <san/kasan.h>
 #endif
 
+#ifdef CONFIG_ZCACHE
+#include <kern/zcache.h>
+#endif
+
 #if    CONFIG_GZALLOC
 typedef struct gzalloc_data {
        uint32_t        gzfc_index;
@@ -103,6 +108,9 @@ struct zone_free_element;
 struct zone_page_metadata;
 
 struct zone {
+#ifdef  CONFIG_ZCACHE
+       struct zone_cache *zcache;
+#endif /* CONFIG_ZCACHE */
        struct zone_free_element *free_elements;        /* free elements directly linked */
        struct {
                queue_head_t                    any_free_foreign;       /* foreign pages crammed into zone */
@@ -145,7 +153,9 @@ struct zone {
        /* boolean_t */ tags_inline        :1,
        /* future    */ tag_zone_index     :6,
        /* boolean_t */ zone_valid         :1,
-       /* future    */ _reserved          :5;
+       /* boolean_t */ cpu_cache_enable_when_ready  :1,
+       /* boolean_t */ cpu_cache_enabled  :1,
+       /* future    */ _reserved          :3;
 
        int             index;          /* index into zone_info arrays for this zone */
        const char      *zone_name;     /* a name for the zone */
@@ -267,12 +277,19 @@ __BEGIN_DECLS
 #ifdef XNU_KERNEL_PRIVATE
 #define Z_TAGS_ENABLED 11      /* Store tags */
 #endif  /* XNU_KERNEL_PRIVATE */
+#define Z_CACHING_ENABLED 12   /* enable and initialize per-cpu caches for the zone */
 
 #ifdef XNU_KERNEL_PRIVATE
 
 extern vm_offset_t     zone_map_min_address;
 extern vm_offset_t     zone_map_max_address;
 
+/* free an element with no regard for gzalloc, zleaks, or kasan*/
+extern void    zfree_direct(           zone_t          zone,
+                                       vm_offset_t     elem);
+
+/* attempts to allocate an element with no regard for gzalloc, zleaks, or kasan*/
+extern void *  zalloc_attempt(         zone_t          zone);
 
 /* Non-waiting for memory version of zalloc */
 extern void *  zalloc_nopagewait(
@@ -321,16 +338,6 @@ extern vm_size_t   zone_element_size(
                                                void            *addr,
                                                zone_t          *z);
 
-/*
- * MAX_ZTRACE_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interest.  15
- * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual
- * caller is up above these lower levels.
- *
- * This is used both for the zone leak detector and the zone corruption log.
- */
-
-#define MAX_ZTRACE_DEPTH       15
-
 /* 
  *  Structure for keeping track of a backtrace, used for leak detection.
  *  This is in the .h file because it is used during panic, see kern/debug.c
diff --git a/osfmk/kern/zcache.c b/osfmk/kern/zcache.c
new file mode 100644 (file)
index 0000000..dab30e6
--- /dev/null
@@ -0,0 +1,642 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/assert.h>
+#include <kern/cpu_data.h>
+#include <mach/mach_host.h>
+#include <vm/vm_kern.h>
+
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <i386/mp.h>
+#endif
+
+#if defined (__arm__) || defined (__arm64__)
+#include <arm/cpu_data_internal.h>
+#endif
+
+#define DEFAULT_MAGAZINE_SIZE  8               /* Default number of elements for all magazines allocated from the magazine_zone */
+#define DEFAULT_DEPOT_SIZE     8               /* Default number of elements for the array zcc_depot_list */
+#define ZCC_MAX_CPU_CACHE_LINE_SIZE    64      /* We should use a platform-specific macro for this in the future; right now this is the max cache line size for all platforms */
+
+lck_grp_t      zcache_locks_grp;                       /* lock group for depot_lock */
+zone_t                 magazine_zone;                          /* zone to allocate zcc_magazine structs from */
+uint16_t       magazine_element_count = 0;             /* Size of array in magazine determined by boot-arg or default */
+uint16_t       depot_element_count = 0;                /* Size of depot lists determined by boot-arg or default */
+bool           zone_cache_ready = FALSE;               /* Flag to check if zone caching has been set up by zcache_bootstrap */
+uintptr_t      zcache_canary = 0;                      /* Canary used for the caching layer to prevent UaF attacks */
+
+/*     The zcc_magazine is used as a stack to store cached zone elements. These
+ *     sets of elements can be moved around to perform bulk operations.
+*/
+struct zcc_magazine {
+	uint32_t zcc_magazine_index;		/* Used as a stack pointer to access elements in the array */
+       uint32_t zcc_magazine_capacity;         /* Number of pointers able to be stored in the zcc_elements array */
+       void *zcc_elements[0];                  /* Array of pointers to objects */
+};
+
+
+/*     Each CPU will use one of these to store its elements
+*/
+struct zcc_per_cpu_cache {
+       struct zcc_magazine *current;           /* Magazine from which we will always try to allocate from and free to first */
+	struct zcc_magazine *previous;		/* Dedicated magazine for a quick reload and to prevent thrashing when we swap with the depot */
+} __attribute__(( aligned(ZCC_MAX_CPU_CACHE_LINE_SIZE) ));	/* we want to align this to a cache line size so it does not thrash when multiple CPUs want to access their caches in parallel */
+
+
+/*
+ * The depot layer can be invalid while zone_gc() is draining it out.
+ * During that time, the CPU caches are active. For CPU magazine allocs and 
+ * frees, the caching layer reaches directly into the zone allocator.
+ */
+#define ZCACHE_DEPOT_INVALID                   -1
+#define zcache_depot_available(zcache)         (zcache->zcc_depot_index != ZCACHE_DEPOT_INVALID)
+
+/*	This is the basic struct to take care of caching and is included within
+ *     the zone.
+*/
+struct zone_cache {
+       lck_mtx_t zcc_depot_lock;                               /* Lock for the depot layer of caching */
+       struct zcc_per_cpu_cache zcc_per_cpu_caches[MAX_CPUS];  /* An array of caches, one for each CPU */
+       int zcc_depot_index;                                    /* marks the point in the array where empty magazines begin */
+       struct zcc_magazine *zcc_depot_list[0];                 /* Stores full and empty magazines in the depot layer */
+};
+
+
+void zcache_init_marked_zones(void);
+bool zcache_mag_fill(zone_t zone, struct zcc_magazine *mag);
+void zcache_mag_drain(zone_t zone, struct zcc_magazine *mag);
+void zcache_mag_init(struct zcc_magazine *mag, int count);
+void *zcache_mag_pop(struct zcc_magazine *mag);
+void zcache_mag_push(struct zcc_magazine *mag, void *elem);
+bool zcache_mag_has_space(struct zcc_magazine *mag);
+bool zcache_mag_has_elements(struct zcc_magazine *mag);
+void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b);
+void zcache_mag_depot_swap_for_alloc(struct zone_cache *depot, struct zcc_per_cpu_cache *cache);
+void zcache_mag_depot_swap_for_free(struct zone_cache *depot, struct zcc_per_cpu_cache *cache);
+void zcache_mag_depot_swap(struct zone_cache *depot, struct zcc_per_cpu_cache *cache, boolean_t load_full);
+void zcache_canary_add(zone_t zone, void *addr);
+void zcache_canary_validate(zone_t zone, void *addr);
+
+/*
+ * zcache_ready
+ *
+ * Description: returns whether or not the zone caches are ready to use
+ *
+ */
+bool zcache_ready(void){
+       return zone_cache_ready;
+}
+
+/*
+ * zcache_init_marked_zones
+ *
+ * Description: Initializes all parts of the per-cpu caches for the list of
+ *		marked zones once we are able to initialize caches. This should
+ *             only be called once, and will be called during the time that the
+ *             system is single threaded so we don't have to take the lock.
+ *
+ */
+void zcache_init_marked_zones(void){
+       unsigned int i;
+       for(i = 0; i < num_zones; i ++){
+               if(zone_array[i].cpu_cache_enable_when_ready){
+                       zcache_init(&zone_array[i]);
+                       zone_array[i].cpu_cache_enable_when_ready = FALSE;
+               }
+       }
+}
+
+/*
+ * zcache_bootstrap
+ *
+ * Description: initializes zone to allocate magazines from and sets
+ *             magazine_element_count and depot_element_count from
+ *             boot-args or default values
+ *
+ */
+void zcache_bootstrap(void)
+{
+       /* use boot-arg for custom magazine size*/
+       if (! PE_parse_boot_argn("zcc_magazine_element_count", &magazine_element_count, sizeof (uint16_t)))
+               magazine_element_count = DEFAULT_MAGAZINE_SIZE;
+
+       int magazine_size = sizeof(struct zcc_magazine) + magazine_element_count * sizeof(void *);
+
+       magazine_zone = zinit(magazine_size, 100000 * magazine_size , magazine_size, "zcc_magazine_zone");
+
+       assert(magazine_zone != NULL);
+
+       /* use boot-arg for custom depot size*/
+       if (! PE_parse_boot_argn("zcc_depot_element_count", &depot_element_count, sizeof (uint16_t)))
+               depot_element_count = DEFAULT_DEPOT_SIZE;
+
+       lck_grp_init(&zcache_locks_grp, "zcc_depot_lock", LCK_GRP_ATTR_NULL);
+
+       /* Generate the canary value for zone caches */
+       zcache_canary = (uintptr_t) early_random();
+
+       zone_cache_ready = TRUE;
+
+       zcache_init_marked_zones();
+}
+
+
+/*
+ * zcache_init
+ *
+ * Description: Initializes all parts of the per-cpu caches for a given zone
+ *
+ * Parameters: zone    pointer to zone on which to initialize caching
+ *
+ */
+ void zcache_init(zone_t zone)
+ {
+       int     i;                      /* used as index in for loops */
+       vm_size_t       total_size;             /* Used for allocating the zone_cache struct with the proper size of depot list */
+       struct zone_cache *temp_cache;  /* Temporary variable to initialize a zone_cache before assigning to the specified zone */
+
+       /* Allocate chunk of memory for all structs */
+       total_size = sizeof(struct zone_cache) + (depot_element_count * sizeof(void *));
+       
+       temp_cache = (struct zone_cache *) kalloc(total_size);
+
+
+       /* Initialize a cache for every CPU */
+       for (i = 0; i < MAX_CPUS; i++) {
+               temp_cache->zcc_per_cpu_caches[i].current = (struct zcc_magazine *)zalloc(magazine_zone);
+               temp_cache->zcc_per_cpu_caches[i].previous = (struct zcc_magazine *)zalloc(magazine_zone);
+
+               assert(temp_cache->zcc_per_cpu_caches[i].current != NULL && temp_cache->zcc_per_cpu_caches[i].previous != NULL);
+
+               zcache_mag_init(temp_cache->zcc_per_cpu_caches[i].current, magazine_element_count);
+               zcache_mag_init(temp_cache->zcc_per_cpu_caches[i].previous, magazine_element_count);
+       }
+
+       /* Initialize the lock on the depot layer */
+       lck_mtx_init(&(temp_cache->zcc_depot_lock), &zcache_locks_grp, LCK_ATTR_NULL);
+
+       /* Initialize empty magazines in the depot list */
+       for (i = 0; i < depot_element_count; i++) {
+               temp_cache->zcc_depot_list[i] = (struct zcc_magazine *)zalloc(magazine_zone);
+
+               assert(temp_cache->zcc_depot_list[i] != NULL);
+
+               zcache_mag_init(temp_cache->zcc_depot_list[i], magazine_element_count);
+       }
+
+       temp_cache->zcc_depot_index = 0;
+
+       lock_zone(zone);
+       zone->zcache = temp_cache;
+       /* Set flag to know caching is enabled */
+       zone->cpu_cache_enabled = TRUE;
+       unlock_zone(zone);
+       return;
+ }
+
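
A sketch of the allocation pattern zcache_init() uses, with hypothetical toy_* names: the depot list is a flexible array member hanging off the end of the cache structure, so a single allocation sized as "header plus depot_element_count pointers" covers both.

#include <stdlib.h>

struct toy_mag;				/* opaque magazine type */

struct toy_cache {
	int depot_index;
	struct toy_mag *depot_list[];	/* C99 flexible array member ([0] in the kernel) */
};

struct toy_cache *toy_cache_create(size_t depot_count)
{
	struct toy_cache *c = malloc(sizeof(*c) + depot_count * sizeof(struct toy_mag *));
	if (c == NULL) {
		return NULL;
	}
	c->depot_index = 0;
	for (size_t i = 0; i < depot_count; i++) {
		c->depot_list[i] = NULL;	/* the kernel stores freshly allocated empty magazines here */
	}
	return c;
}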
+/*
+ * zcache_drain_depot
+ *
+ * Description: Frees all the full magazines from the depot layer to the zone allocator as part
+ *              of zone_gc(). The routine assumes that only one zone_gc() is in progress (zone_gc_lock
+ *              ensures that)
+ *
+ * Parameters: zone    pointer to zone for which the depot layer needs to be drained
+ *
+ * Returns: None
+ *
+ */
+void zcache_drain_depot(zone_t zone)
+{
+       struct zone_cache *zcache = zone->zcache;
+       int drain_depot_index = 0;
+
+       /*
+        * Grab the current depot list from the zone cache. If it has full magazines, 
+        * mark the depot as invalid and drain it.
+        */
+       lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock));
+       if (!zcache_depot_available(zcache) || (zcache->zcc_depot_index == 0)) {
+               /* no full magazines in the depot or depot unavailable; nothing to drain here */
+               lck_mtx_unlock(&(zcache->zcc_depot_lock));
+               return;
+       }
+       drain_depot_index = zcache->zcc_depot_index;
+       /* Mark the depot as unavailable */
+       zcache->zcc_depot_index = ZCACHE_DEPOT_INVALID;
+       lck_mtx_unlock(&(zcache->zcc_depot_lock));
+
+       /* Now drain the full magazines in the depot */
+       for (int i = 0; i < drain_depot_index; i++)
+               zcache_mag_drain(zone, zcache->zcc_depot_list[i]);
+
+       lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock));
+       /* Mark the depot as available again */
+       zcache->zcc_depot_index = 0;
+       lck_mtx_unlock(&(zcache->zcc_depot_lock));
+}
+
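
A sketch of the drain protocol above using a plain pthread mutex and hypothetical names: the depot index is flipped to an invalid sentinel while the lock is held, the expensive drain runs with the lock dropped, and the index is restored afterwards, so concurrent fast paths simply treat the depot as unavailable for the duration of the drain.

#include <pthread.h>

#define TOY_DEPOT_INVALID (-1)

struct toy_depot {
	pthread_mutex_t lock;
	int             index;		/* number of full magazines, or TOY_DEPOT_INVALID */
};

static void toy_drain_magazine(int i) { (void)i; /* frees one magazine's elements */ }

void toy_drain_depot(struct toy_depot *d)
{
	pthread_mutex_lock(&d->lock);
	if (d->index <= 0) {			/* empty, or another drain owns it */
		pthread_mutex_unlock(&d->lock);
		return;
	}
	int n = d->index;
	d->index = TOY_DEPOT_INVALID;		/* mark the depot unavailable */
	pthread_mutex_unlock(&d->lock);

	for (int i = 0; i < n; i++) {
		toy_drain_magazine(i);		/* long-running work, lock not held */
	}

	pthread_mutex_lock(&d->lock);
	d->index = 0;				/* available (and empty) again */
	pthread_mutex_unlock(&d->lock);
}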
+
+/*
+ * zcache_free_to_cpu_cache
+ *
+ * Description: Checks per-cpu caches to free element there if possible
+ *
+ * Parameters: zone    pointer to zone for which element comes from
+ *             addr    pointer to element to free
+ *
+ * Returns: TRUE if successful, FALSE otherwise
+ *
+ * Precondition: check that caching is enabled for zone
+ */
+bool zcache_free_to_cpu_cache(zone_t zone, void *addr)
+{
+       int     curcpu;                                 /* Current cpu is used to index into array of zcc_per_cpu_cache structs */
+       struct  zone_cache *zcache;                     /* local storage of the zone's cache */
+       struct zcc_per_cpu_cache *per_cpu_cache;        /* locally store the current per_cpu_cache */
+
+       disable_preemption();
+       curcpu = current_processor()->cpu_id;
+       zcache = zone->zcache;
+       per_cpu_cache = &zcache->zcc_per_cpu_caches[curcpu];
+
+       if (zcache_mag_has_space(per_cpu_cache->current)) {
+               /* If able, free into current magazine */
+               goto free_to_current;
+       } else if (zcache_mag_has_space(per_cpu_cache->previous)) {
+               /* If able, swap current and previous magazine and retry */
+               zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current);
+               goto free_to_current;
+       } else{
+               lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock));
+               if (zcache_depot_available(zcache) && (zcache->zcc_depot_index < depot_element_count)) {
+                       /* If able, rotate in a new empty magazine from the depot and retry */
+                       zcache_mag_depot_swap_for_free(zcache, per_cpu_cache);
+                       lck_mtx_unlock(&(zcache->zcc_depot_lock));
+                       goto free_to_current;
+               }
+               lck_mtx_unlock(&(zcache->zcc_depot_lock));
+               /* Attempt to free an entire magazine of elements */
+               zcache_mag_drain(zone, per_cpu_cache->current);
+               if(zcache_mag_has_space(per_cpu_cache->current)){
+                       goto free_to_current;
+               }
+       }
+
+       /* If not able to use cache return FALSE and fall through to zfree */
+       enable_preemption();
+       return FALSE;
+
+free_to_current:
+       assert(zcache_mag_has_space(per_cpu_cache->current));
+       zcache_canary_add(zone, addr);
+       zcache_mag_push(per_cpu_cache->current, addr);
+
+#if KASAN_ZALLOC
+       kasan_poison_range((vm_offset_t)addr, zone->elem_size, ASAN_HEAP_FREED);
+#endif
+
+       enable_preemption();
+       return TRUE;
+}
+
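
A sketch of the order of attempts in the free path above, with hypothetical toy_* names and the depot/drain steps reduced to a comment: the current magazine is tried first, then a cheap swap with the previous magazine; only when both are full does the kernel reach for the depot, and only when that also fails does the free fall through to the regular zfree() path.

#include <stdbool.h>
#include <stddef.h>

#define TOY_MAG_CAPACITY 8

struct toy_mag {
	size_t index;
	void  *elems[TOY_MAG_CAPACITY];
};

static bool toy_mag_has_space(const struct toy_mag *m) { return m->index < TOY_MAG_CAPACITY; }
static void toy_mag_push(struct toy_mag *m, void *e)   { m->elems[m->index++] = e; }
static void toy_swap(struct toy_mag **a, struct toy_mag **b)
{
	struct toy_mag *t = *a; *a = *b; *b = t;
}

bool toy_cache_free(struct toy_mag **cur, struct toy_mag **prev, void *elem)
{
	if (!toy_mag_has_space(*cur) && toy_mag_has_space(*prev)) {
		toy_swap(cur, prev);		/* cheap reload before touching the depot */
	}
	if (toy_mag_has_space(*cur)) {
		toy_mag_push(*cur, elem);
		return true;			/* cached; the zone freelist is not touched */
	}
	/* the kernel would now try to swap in an empty depot magazine, then
	 * drain the current magazine; here we just report failure */
	return false;				/* caller falls back to the ordinary free */
}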
+
+/*
+ * zcache_alloc_from_cpu_cache
+ *
+ * Description: Checks per-cpu caches to allocate element from there if possible
+ *
+ * Parameters: zone    pointer to zone for which element will come from
+ *
+ * Returns: pointer to usable element
+ *
+ * Precondition: check that caching is enabled for zone
+ */
+vm_offset_t zcache_alloc_from_cpu_cache(zone_t zone)
+{
+       int curcpu;                                     /* Current cpu is used to index into array of zcc_per_cpu_cache structs */
+       void *ret = NULL;                               /* Points to the element which will be returned */
+       struct  zone_cache *zcache;                     /* local storage of the zone's cache */
+       struct zcc_per_cpu_cache *per_cpu_cache;        /* locally store the current per_cpu_cache */
+
+       disable_preemption();
+       curcpu = current_processor()->cpu_id;
+       zcache = zone->zcache;
+       per_cpu_cache = &zcache->zcc_per_cpu_caches[curcpu];
+
+       if (zcache_mag_has_elements(per_cpu_cache->current)) {
+               /* If able, allocate from current magazine */
+               goto allocate_from_current;
+       } else if (zcache_mag_has_elements(per_cpu_cache->previous)) {
+               /* If able, swap current and previous magazine and retry */
+               zcache_swap_magazines(&per_cpu_cache->previous, &per_cpu_cache->current);
+               goto allocate_from_current;
+       } else {
+               lck_mtx_lock_spin_always(&(zcache->zcc_depot_lock));
+               if (zcache_depot_available(zcache) && (zcache->zcc_depot_index > 0)) {
+                       /* If able, rotate in a full magazine from the depot */
+                       zcache_mag_depot_swap_for_alloc(zcache, per_cpu_cache);
+                       lck_mtx_unlock(&(zcache->zcc_depot_lock));
+                       goto allocate_from_current;
+               }
+               lck_mtx_unlock(&(zcache->zcc_depot_lock));
+               /* Attempt to allocate an entire magazine of elements */
+               if (zcache_mag_fill(zone, per_cpu_cache->current)) {
+                       goto allocate_from_current;
+               }
+       }
+
+       /* If unable to allocate from cache return NULL and fall through to zalloc */
+       enable_preemption();
+       return (vm_offset_t) NULL;
+
+allocate_from_current:
+       ret = zcache_mag_pop(per_cpu_cache->current);
+       assert(ret != NULL);
+       zcache_canary_validate(zone, ret);
+
+#if KASAN_ZALLOC
+       kasan_poison_range((vm_offset_t)ret, zone->elem_size, ASAN_VALID);
+#endif
+
+       enable_preemption();
+       return (vm_offset_t) ret;
+}
+
+
+/*
+ * zcache_mag_init
+ *
+ * Description: initializes fields in a zcc_magazine struct
+ *
+ * Parameters: mag     pointer to magazine to initialize
+ *             count   capacity of the magazine, in elements
+ *
+ */
+void zcache_mag_init(struct zcc_magazine *mag, int count)
+{
+       mag->zcc_magazine_index = 0;
+       mag->zcc_magazine_capacity = count;
+}
+
+
+/*
+ * zcache_mag_fill
+ *
+ * Description: fills a magazine with as many elements as the zone can give
+ *             without blocking to carve out more memory
+ *
+ * Parameters: zone    zone from which to allocate
+ *             mag     pointer to magazine to fill
+ *
+ * Return:     TRUE if able to allocate elements, FALSE if mag is still empty
+ */
+bool zcache_mag_fill(zone_t zone, struct zcc_magazine *mag)
+{
+       assert(mag->zcc_magazine_index == 0);
+       void* elem = NULL;
+       uint32_t i;
+       lock_zone(zone);
+       for (i = mag->zcc_magazine_index; i < mag->zcc_magazine_capacity; i++) {
+               elem = zalloc_attempt(zone);
+               if (elem) {
+                       zcache_canary_add(zone, elem);
+                       zcache_mag_push(mag, elem);
+#if KASAN_ZALLOC
+                       kasan_poison_range((vm_offset_t)elem, zone->elem_size, ASAN_HEAP_FREED);
+#endif
+               } else {
+                       break;
+               }
+       }
+       unlock_zone(zone);
+       if (i == 0) {
+               return FALSE;
+       }
+       return TRUE;
+}
+
+/*
+ * zcache_mag_drain
+ *
+ * Description: frees all elements in a magazine
+ *
+ * Parameters: zone    zone to which elements will be freed
+ *             mag     pointer to magazine to empty
+ *
+ */
+void zcache_mag_drain(zone_t zone, struct zcc_magazine *mag)
+{
+       assert(mag->zcc_magazine_index == mag->zcc_magazine_capacity);
+       lock_zone(zone);
+       while (mag->zcc_magazine_index > 0) {
+               uint32_t index = --mag->zcc_magazine_index;
+               zcache_canary_validate(zone, mag->zcc_elements[index]);
+               zfree_direct(zone, (vm_offset_t)mag->zcc_elements[index]);
+               mag->zcc_elements[mag->zcc_magazine_index] = 0;
+       }
+       unlock_zone(zone);
+}
+
+/*
+ * zcache_mag_pop
+ *
+ * Description: removes the last element from the magazine in stack-pop fashion.
+ *             zcc_magazine_index represents the number of elements on the
+ *             stack, so it is the index at which the next element is saved;
+ *             when full, it is 1 past the last index of the array
+ *
+ * Parameters: mag     pointer to magazine from which to remove element
+ *
+ * Returns: pointer to element removed from magazine
+ *
+ * Precondition: must check that magazine is not empty before calling
+ */
+void *zcache_mag_pop(struct zcc_magazine *mag)
+{
+       void    *elem;
+       assert(zcache_mag_has_elements(mag));
+       elem =  mag->zcc_elements[--mag->zcc_magazine_index];
+       /* Ensure pointer to element cannot be accessed after we pop it */
+       mag->zcc_elements[mag->zcc_magazine_index] = NULL;
+       assert(elem != NULL);
+       return elem;
+}
+
+
+/*
+ * zcache_mag_push
+ *
+ * Description: adds element to magazine and increments zcc_magazine_index.
+ *             zcc_magazine_index represents the number of elements on the
+ *             stack, so it is the index at which the next element is saved;
+ *             when full, it is 1 past the last index of the array
+ *
+ * Parameters: mag     pointer to magazine to which to add the element
+ *             elem    pointer to element to add
+ *
+ * Precondition: must check that magazine is not full before calling
+ */
+void zcache_mag_push(struct zcc_magazine *mag, void *elem)
+{
+       assert(zcache_mag_has_space(mag));
+       mag->zcc_elements[mag->zcc_magazine_index++] = elem;
+}
+
+
+/*
+ * zcache_mag_has_space
+ *
+ * Description: checks if magazine still has capacity
+ *
+ * Parameters: mag     pointer to magazine to check
+ *
+ * Returns: true if the magazine has space for at least one more element
+ *
+ */
+bool zcache_mag_has_space(struct zcc_magazine *mag)
+{
+       return (mag->zcc_magazine_index < mag->zcc_magazine_capacity);
+}
+
+
+/*
+ * zcache_mag_has_elements
+ *
+ * Description: checks if magazine contains any elements
+ *
+ * Parameters: mag     pointer to magazine to check
+ *
+ * Returns: true if the magazine has at least one element
+ *
+ */
+bool zcache_mag_has_elements(struct zcc_magazine *mag)
+{
+       return (mag->zcc_magazine_index > 0);
+}
+
+
+/*
+ * zcache_swap_magazines
+ *
+ * Description: swaps two magazine pointers in place
+ *
+ * Parameters: a               pointer to first pointer
+ *             b               pointer to second pointer
+ */
+void zcache_swap_magazines(struct zcc_magazine **a, struct zcc_magazine **b)
+{
+       struct zcc_magazine *temp = *a;
+       *a = *b;
+       *b = temp;
+}
+
+
+/*
+ * zcache_mag_depot_swap_for_alloc
+ *
+ * Description: Swaps a full magazine into the current position
+ *
+ * Parameters: zcache                  pointer to the zone_cache to access the depot
+ *             cache                   pointer to the current per-cpu cache
+ *
+ * Precondition: Check that the depot has at least one full magazine
+ */
+void zcache_mag_depot_swap_for_alloc(struct zone_cache *zcache, struct zcc_per_cpu_cache *cache)
+{
+       /* Loads a full magazine from which we can allocate */
+       assert(zcache_depot_available(zcache));
+       assert(zcache->zcc_depot_index > 0);
+       zcache->zcc_depot_index--;
+       zcache_swap_magazines(&cache->current, &zcache->zcc_depot_list[zcache->zcc_depot_index]);
+}
+
+
+/*
+ * zcache_mag_depot_swap_for_free
+ *
+ * Description: Swaps an empty magazine into the current position
+ *
+ * Parameters: zcache                  pointer to the zone_cache to access the depot
+ *             cache                   pointer to the current per-cpu cache
+ *
+ * Precondition: Check that the depot has at least one empty magazine
+ */
+void zcache_mag_depot_swap_for_free(struct zone_cache *zcache, struct zcc_per_cpu_cache *cache)
+{
+       /* Loads an empty magazine into which we can free */
+       assert(zcache_depot_available(zcache));
+       assert(zcache->zcc_depot_index < depot_element_count);
+       zcache_swap_magazines(&cache->current, &zcache->zcc_depot_list[zcache->zcc_depot_index]);
+       zcache->zcc_depot_index++;
+}
+
+/*
+ * zcache_canary_add
+ *
+ * Description: Adds a canary to an element by putting zcache_canary at the first 
+ *             and last location of the element
+ *
+ * Parameters: zone    zone for the element
+ *             element element address to add the canary to
+ *
+ */
+void zcache_canary_add(zone_t zone, void *element)
+{
+       vm_offset_t *primary = (vm_offset_t *)element;
+       vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + zone->elem_size - sizeof(vm_offset_t));
+       *primary = *backup = (zcache_canary ^ (uintptr_t)element);
+}
+
+/*
+ * zcache_canary_validate
+ *
+ * Description: Validates an element of the zone cache to make sure it still contains the zone 
+ *             caching canary.
+ *
+ * Parameters: zone    zone for the element
+ *             element element address to validate
+ *
+ */
+void zcache_canary_validate(zone_t zone, void *element)
+{
+       vm_offset_t *primary = (vm_offset_t *)element;
+       vm_offset_t *backup = (vm_offset_t *)((vm_offset_t)primary + zone->elem_size - sizeof(vm_offset_t));
+
+       vm_offset_t primary_value = (*primary ^ (uintptr_t)element);
+       if (primary_value != zcache_canary) {
+               panic("Zone cache element was used after free! Element %p was corrupted at beginning; Expected %p but found %p; canary %p",
+                       element, (void *)(zcache_canary ^ (uintptr_t)element) , (void *)(*primary), (void *)zcache_canary);
+       }
+       
+       vm_offset_t backup_value = (*backup ^ (uintptr_t)element);
+       if (backup_value != zcache_canary) {
+               panic("Zone cache element was used after free! Element %p was corrupted at end; Expected %p but found %p; canary %p",
+                       element, (void *)(zcache_canary ^ (uintptr_t)element), (void *)(*backup), (void *)zcache_canary);
+       }
+}
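
The two canary helpers above stamp zcache_canary, XOR'd with the element's own
address, into the first and last word of every element while it sits in a
magazine, so a write-after-free at either end is caught the next time the
element is handed out. Below is a minimal user-space sketch of the same idea;
demo_canary and DEMO_ELEM_SIZE are stand-ins for zcache_canary and
zone->elem_size, not kernel symbols.

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

static const uintptr_t demo_canary = 0x5a5aa5a5UL;  /* stand-in for zcache_canary */
#define DEMO_ELEM_SIZE 64                            /* stand-in for zone->elem_size */

static void
demo_canary_add(void *element)
{
        uintptr_t *primary = (uintptr_t *)element;
        uintptr_t *backup = (uintptr_t *)((char *)element + DEMO_ELEM_SIZE - sizeof(uintptr_t));
        /* XOR with the element address so no two elements share the same pattern */
        *primary = *backup = demo_canary ^ (uintptr_t)element;
}

static int
demo_canary_ok(void *element)
{
        uintptr_t *primary = (uintptr_t *)element;
        uintptr_t *backup = (uintptr_t *)((char *)element + DEMO_ELEM_SIZE - sizeof(uintptr_t));
        return ((*primary ^ (uintptr_t)element) == demo_canary) &&
            ((*backup ^ (uintptr_t)element) == demo_canary);
}

int
main(void)
{
        void *elem = malloc(DEMO_ELEM_SIZE);
        if (elem == NULL) {
                return 1;
        }
        demo_canary_add(elem);
        assert(demo_canary_ok(elem));   /* untouched element validates */
        ((char *)elem)[0] = 0;          /* simulate a use-after-free write */
        assert(!demo_canary_ok(elem));  /* corruption at the front is detected */
        free(elem);
        return 0;
}
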
diff --git a/osfmk/kern/zcache.h b/osfmk/kern/zcache.h
new file mode 100644 (file)
index 0000000..6919aa5
--- /dev/null
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+/*
+ *     Below is a diagram of the caching system. This design is based on the
+ * paper "Magazines and Vmem: Extending the Slab Allocator to Many CPUs and
+ * Arbitrary Resources" by Jeff Bonwick and Jonathan Adams. It is divided into 3
+ * layers: the Per-cpu Layer, the Depot Layer, and the Zone Allocator. The
+ * Per-CPU and Depot layers store elements using arrays we call magazines.
+ *
+ *     Magazines function like a stack (we push and pop elements) and can be
+ *  moved around for bulk operations.
+ *  _________         _________         _________
+ * |  CPU 1  |       |  CPU 2  |       |  CPU 3  |
+ * |  _   _  |       |  _   _  |       |  _   _  |
+ * | |#| | | |       | | | |#| |       | |#| |#| |        Per-CPU Layer
+ * | |#| |_| |       | |_| |#| |       | |#| |#| |
+ * |_________|       |_________|       |_________|
+ *
+ *  ______________________________________________
+ * |            _   _   _   _   _   _             |
+ * |           |#| |#| |#| | | | | | |            |     Depot Layer
+ * |           |#| |#| |#| |_| |_| |_|            |
+ * |______________________________________________|
+ *
+ *  _______________________________________________
+ * | # | # | # | # | # | # | # | # | # | # | # | # |   Zone Allocator
+ * |_______________________________________________|
+ *
+ *     The top layer is the per-cpu cache and consists of a current and
+ * previous magazine for each CPU. The current magazine is the one we always try
+ * to allocate from and free to first. Only if we are unable do we check the
+ * previous magazine. If the previous magazine can satisfy the allocate or free,
+ * then we switch the two and allocate from the new current magazine. This layer
+ * requires no locking, so we can access multiple CPUs' caches concurrently.
+ * This is the main source of the speedup.
+ *
+ *     We have two magazines here to prevent thrashing when swapping magazines
+ * with the depot layer. If a certain pattern of allocations and frees occurs,
+ * we can waste a lot of time swapping magazines to and from the depot layer. We
+ * prevent this by dividing the per-cpu cache into two separate magazines.
+ *
+ *     The middle layer is the magazine depot. This layer consists of a
+ * collection of full and empty magazines. These are used to reload the per-cpu
+ * caches when needed. This is implemented as an array of magazines which are
+ * initially all empty and as we fill up magazines we increment the index to
+ * point at the first empty magazine. Since this layer is per-zone, it allows us
+ * to balance the cache across CPUs, but does require taking a lock.
+ *
+ *     When neither the current nor previous magazine for a given CPU can
+ * satisfy the free or allocation, we look to the depot layer. If there are
+ * magazines in the depot that can satisfy the free or allocation we swap
+ * that magazine into the current position. In the example below, to allocate on
+ * the given CPU we must lock the depot layer and swap magazine A with magazine
+ * B and decrement the depot index.
+ *
+ *      _____________________       _______________________________________
+ *     |    Per-CPU Cache    |     |              Depot Layer              |
+ *     |                     |     |                                       |
+ *     |   A___      ____    |     |   ____      B___      ____      ____  |
+ *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
+ *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
+ *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
+ *     |  |    |    |    |   |     |  | ## |    | ## |    |    |    |    | |
+ *     |  |____|    |____|   |     |  |_##_|    |_##_|    |____|    |____| |
+ *     | Current   Previous  |     |                                       |
+ *     |_____________________|     |_______________________________________|
+ *
+ *     The bottom layer is the Zone Allocator. This is already implemented in
+ *  XNU and will remain mostly unchanged. Implementation for this can be found
+ * in zalloc.c and zalloc.h. We will only use the zone if all other layers are
+ * unable to satisfy the allocation or free. When we do use the zone, we will
+ * try to allocate an entire magazine of elements or free an entire magazine of
+ * elements at once.
+ *
+ *     Caching must be enabled explicitly, by calling zone_change() with the
+ * Z_CACHING_ENABLED flag, for every zone you want to cache elements for. Zones
+ * which are good candidates for this are ones with highly contended zone locks.
+ *
+ * Some good potential candidates are kalloc.16, kalloc.48, VM objects, VM map
+ * entries, IPC vouchers, and IPC ports.
+ *
+ *
+ * Some factors can be tuned by boot-arg:
+ *  zcc_enable_for_zone_name   name of a single zone to enable caching for
+ *                             (replace space characters with '.')
+ *
+ *  zcc_magazine_element_count integer value for magazine size used for all
+ *                             zones (default 8 is used if not specified)
+ *
+ *  zcc_depot_element_count    integer value for how many full and empty
+ *                             magazines to store in the depot, if N specified
+ *                             depot will have N full and N empty magazines
+ *                             (default 16 used if not specified)
+ */
+#include <kern/kern_types.h>
+#include <vm/vm_kern.h>
+
+
+/*
+ * zcache_ready
+ *
+ * Description: returns whether or not the zone caches are ready to use
+ *
+ */
+bool           zcache_ready(void);
+
+
+/*
+ * zcache_bootstrap
+ *
+ * Description: initializes zone to allocate magazines from
+ *
+ */
+void           zcache_bootstrap(void);
+
+
+/*
+ * zcache_init
+ *
+ * Description: Initializes all parts of the per-cpu caches for a given zone
+ *
+ * Parameters: zone    pointer to zone on which to initialize caching
+ *
+ */
+void           zcache_init(zone_t zone);
+
+
+/*
+ * zcache_free_to_cpu_cache
+ *
+ * Description: Tries to free the element to the per-CPU cache, if possible
+ *
+ * Parameters: zone    pointer to zone the element comes from
+ *             addr    pointer to element to free
+ *
+ * Returns: TRUE if successful, FALSE otherwise
+ *
+ * Precondition: check that caching is enabled for zone
+ */
+bool           zcache_free_to_cpu_cache(zone_t zone, void *addr);
+
+
+/*
+ * zcache_alloc_from_cpu_cache
+ *
+ * Description: Tries to allocate an element from the per-CPU cache, if possible
+ *
+ * Parameters: zone    pointer to zone the element will come from
+ *
+ * Returns: pointer to usable element
+ *
+ * Precondition: check that caching is enabled for zone
+ */
+vm_offset_t    zcache_alloc_from_cpu_cache(zone_t zone);
+
+/*
+ * zcache_drain_depot
+ *
+ * Description: Frees all the full magazines from the depot layer to the zone allocator
+ *              Invoked by zone_gc()
+ *
+ * Parameters:  zone    pointer to zone for which the depot layer needs to be drained
+ *
+ * Returns: None
+ *
+ */
+void           zcache_drain_depot(zone_t zone);
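
As the header comment notes, caching is opt-in per zone, via zone_change() with
the Z_CACHING_ENABLED flag or the zcc_enable_for_zone_name boot-arg. A hedged
sketch of what that opt-in might look like from a kernel subsystem; struct
my_obj, the zone sizing, and the zone name are illustrative, not existing
kernel code.

#include <stdint.h>
#include <kern/zalloc.h>

struct my_obj {
        uint64_t id;
        uint64_t state;
};

static zone_t my_obj_zone;

void
my_subsystem_init(void)
{
        my_obj_zone = zinit(sizeof(struct my_obj),
            8192 * sizeof(struct my_obj),       /* max memory the zone may use */
            sizeof(struct my_obj), "my objects");
        /* opt this zone into the per-CPU magazine cache described above */
        zone_change(my_obj_zone, Z_CACHING_ENABLED, TRUE);
}

The same effect can be had at boot for a single zone with
zcc_enable_for_zone_name=my.objects (spaces in the zone name replaced by '.'),
and the magazine and depot sizes can be tuned with zcc_magazine_element_count
and zcc_depot_element_count.
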
index 385bc0520289a7d4c3371e80f776e24db9c0a6ce..467e337741a0a202604d80d028f5efd6995ab17f 100644 (file)
@@ -13,6 +13,7 @@ EXPORT_ONLY_FILES = \
                        kperfbsd.h       \
                        kperf_timer.h    \
                        kdebug_trigger.h \
+                       lazy.h           \
                        pet.h
 
 EXPORT_MI_DIR = kperf
index ab33ded44a5600134a979e61314fef7f435c0b29..94afda3423530d199590fbfd5c43472693e1d456 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
@@ -38,6 +38,7 @@
 #include <kern/thread.h>
 #include <sys/errno.h>
 #include <sys/vm.h>
+#include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
 
@@ -57,8 +58,7 @@
 #define ACTION_MAX (32)
 
 /* the list of different actions to take */
-struct action
-{
+struct action {
        uint32_t sample;
        uint32_t ucallstack_depth;
        uint32_t kcallstack_depth;
@@ -67,14 +67,14 @@ struct action
 };
 
 /* the list of actions */
-static unsigned actionc = 0;
+static unsigned int actionc = 0;
 static struct action *actionv = NULL;
 
 /* should emit tracepoint on context switch */
 int kperf_kdebug_cswitch = 0;
 
 bool
-kperf_sample_has_non_system(unsigned actionid)
+kperf_action_has_non_system(unsigned int actionid)
 {
        if (actionid > actionc) {
                return false;
@@ -87,6 +87,26 @@ kperf_sample_has_non_system(unsigned actionid)
        }
 }
 
+bool
+kperf_action_has_task(unsigned int actionid)
+{
+       if (actionid > actionc) {
+               return false;
+       }
+
+       return (actionv[actionid - 1].sample & SAMPLER_TASK_MASK);
+}
+
+bool
+kperf_action_has_thread(unsigned int actionid)
+{
+       if (actionid > actionc) {
+               return false;
+       }
+
+       return (actionv[actionid - 1].sample & SAMPLER_THREAD_MASK);
+}
+
 static void
 kperf_system_memory_log(void)
 {
@@ -94,6 +114,10 @@ kperf_system_memory_log(void)
                        (uintptr_t)vm_page_wire_count, (uintptr_t)vm_page_external_count,
                        (uintptr_t)(vm_page_active_count + vm_page_inactive_count +
                        vm_page_speculative_count));
+       BUF_DATA(PERF_MI_SYS_DATA_2, (uintptr_t)vm_page_anonymous_count,
+                       (uintptr_t)vm_page_internal_count,
+                       (uintptr_t)vm_pageout_vminfo.vm_pageout_compressions,
+                       (uintptr_t)VM_PAGE_COMPRESSOR_COUNT);
 }
 
 static kern_return_t
@@ -106,6 +130,7 @@ kperf_sample_internal(struct kperf_sample *sbuf,
        int pended_th_dispatch = 0;
        bool on_idle_thread = false;
        uint32_t userdata = actionid;
+       bool task_only = false;
 
        /* not much point continuing here, but what to do ? return
         * Shutdown? cut a tracepoint and continue?
@@ -123,8 +148,20 @@ kperf_sample_internal(struct kperf_sample *sbuf,
                sample_what &= SAMPLER_SYS_MEM;
        }
 
-       context->cur_thread->kperf_pet_gen = kperf_pet_gen;
-       boolean_t is_kernel = (context->cur_pid == 0);
+       assert((sample_flags & (SAMPLE_FLAG_THREAD_ONLY | SAMPLE_FLAG_TASK_ONLY))
+                       != (SAMPLE_FLAG_THREAD_ONLY | SAMPLE_FLAG_TASK_ONLY));
+       if (sample_flags & SAMPLE_FLAG_THREAD_ONLY) {
+               sample_what &= SAMPLER_THREAD_MASK;
+       }
+       if (sample_flags & SAMPLE_FLAG_TASK_ONLY) {
+               task_only = true;
+               sample_what &= SAMPLER_TASK_MASK;
+       }
+
+       if (!task_only) {
+               context->cur_thread->kperf_pet_gen = kperf_pet_gen;
+       }
+       bool is_kernel = (context->cur_pid == 0);
 
        if (actionid && actionid <= actionc) {
                sbuf->kcallstack.nframes = actionv[actionid - 1].kcallstack_depth;
@@ -175,13 +212,13 @@ kperf_sample_internal(struct kperf_sample *sbuf,
                }
        }
        if (sample_what & SAMPLER_TK_SNAPSHOT) {
-               kperf_task_snapshot_sample(&(sbuf->tk_snapshot), context);
+               kperf_task_snapshot_sample(context->cur_task, &(sbuf->tk_snapshot));
        }
 
        /* sensitive ones */
        if (!is_kernel) {
                if (sample_what & SAMPLER_MEMINFO) {
-                       kperf_meminfo_sample(&(sbuf->meminfo), context);
+                       kperf_meminfo_sample(context->cur_task, &(sbuf->meminfo));
                }
 
                if (sample_flags & SAMPLE_FLAG_PEND_USER) {
@@ -257,6 +294,9 @@ log_sample:
        if (sample_what & SAMPLER_TK_SNAPSHOT) {
                kperf_task_snapshot_log(&(sbuf->tk_snapshot));
        }
+       if (sample_what & SAMPLER_TK_INFO) {
+               kperf_task_info_log(context);
+       }
 
        /* dump user stuff */
        if (!is_kernel) {
@@ -331,7 +371,6 @@ void
 kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp)
 {
        uint32_t sample_flags = SAMPLE_FLAG_PEND_USER;
-       struct kperf_context ctx;
        struct kperf_sample *sample = NULL;
        kern_return_t kr = KERN_SUCCESS;
        int s;
@@ -342,10 +381,15 @@ kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp)
 
        BUF_VERB(PERF_KDBG_HNDLR | DBG_FUNC_START, debugid);
 
-       ctx.cur_thread = current_thread();
-       ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread));
-       ctx.trigger_type = TRIGGER_TYPE_KDEBUG;
-       ctx.trigger_id = 0;
+       thread_t thread = current_thread();
+       task_t task = get_threadtask(thread);
+       struct kperf_context ctx = {
+               .cur_thread = thread,
+               .cur_task = task,
+               .cur_pid = task_pid(task),
+               .trigger_type = TRIGGER_TYPE_KDEBUG,
+               .trigger_id = 0,
+       };
 
        s = ml_set_interrupts_enabled(0);
 
@@ -385,9 +429,11 @@ kperf_thread_ast_handler(thread_t thread)
        }
 
        /* make a context, take a sample */
-       struct kperf_context ctx;
-       ctx.cur_thread = thread;
-       ctx.cur_pid = task_pid(task);
+       struct kperf_context ctx = {
+               .cur_thread = thread,
+               .cur_task = task,
+               .cur_pid = task_pid(task),
+       };
 
        /* decode the flags to determine what to sample */
        unsigned int sample_what = 0;
index f4e2e72bd7bd13ab6275a754e66523fa94357620..be150c40104b749b33f944b6b11cf0dc16b78608 100644 (file)
@@ -30,6 +30,7 @@
 #define KPERF_ACTION_H
 
 #include <mach/kern_return.h>
+#include <stdint.h>
 #include <stdbool.h>
 
 /* fwd decl */
@@ -50,6 +51,13 @@ struct kperf_context;
 #define SAMPLER_TK_SNAPSHOT   (1U << 10)
 #define SAMPLER_SYS_MEM       (1U << 11)
 #define SAMPLER_TH_INSCYC     (1U << 12)
+#define SAMPLER_TK_INFO       (1U << 13)
+
+#define SAMPLER_TASK_MASK (SAMPLER_MEMINFO | SAMPLER_TK_SNAPSHOT | \
+               SAMPLER_TK_INFO)
+#define SAMPLER_THREAD_MASK (SAMPLER_TH_INFO | SAMPLER_TH_SNAPSHOT | \
+               SAMPLER_KSTACK | SAMPLER_USTACK | SAMPLER_PMC_THREAD | \
+               SAMPLER_TH_SCHEDULING | SAMPLER_TH_DISPATCH | SAMPLER_TH_INSCYC)
 
 /* flags for sample calls */
 
@@ -67,6 +75,10 @@ struct kperf_context;
 #define SAMPLE_FLAG_SYSTEM          (1U << 5)
 /* sample should not include non-system samplers */
 #define SAMPLE_FLAG_ONLY_SYSTEM     (1U << 6)
+/* sample should only include task samplers */
+#define SAMPLE_FLAG_TASK_ONLY       (1U << 7)
+/* sample should only include thread samplers */
+#define SAMPLE_FLAG_THREAD_ONLY     (1U << 8)
 
 /*  Take a sample into "sbuf" using current thread "cur_thread" */
 kern_return_t kperf_sample(struct kperf_sample *sbuf,
@@ -75,7 +87,9 @@ kern_return_t kperf_sample(struct kperf_sample *sbuf,
                            unsigned sample_flags);
 
 /* Whether the action provided samples non-system values. */
-bool kperf_sample_has_non_system(unsigned actionid);
+bool kperf_action_has_non_system(unsigned actionid);
+bool kperf_action_has_thread(unsigned int actionid);
+bool kperf_action_has_task(unsigned int actionid);
 
 /* return codes from taking a sample
  * either keep trigger, or something went wrong (or we're shutting down)
@@ -105,4 +119,11 @@ int kperf_action_get_kcallstack_depth(unsigned int actionid, uint32_t * depth_ou
 int kperf_action_set_filter(unsigned int actionid, int pid);
 int kperf_action_get_filter(unsigned int actionid, int *pid_out);
 
+void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp);
+
+/* whether to output tracepoints on context-switch */
+extern int kperf_kdebug_cswitch;
+int kperf_kdbg_cswitch_get(void);
+int kperf_kdbg_cswitch_set(int newval);
+
 #endif /* !defined(KPERF_ACTION_H) */
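
With the new SAMPLER_TASK_MASK / SAMPLER_THREAD_MASK split, a trigger can ask
kperf_sample() to run only the task-level or only the thread-level samplers. A
hedged sketch of how a caller might pick the flag from an action's configured
samplers; demo_take_sample() is illustrative, while kperf_sample(), the
kperf_action_has_*() helpers, and the flags are the declarations above (the
assert in kperf_sample_internal() forbids setting both "only" flags at once).

static void
demo_take_sample(struct kperf_sample *sbuf, struct kperf_context *ctx,
    unsigned int actionid)
{
        unsigned int flags = 0;

        if (kperf_action_has_task(actionid) && !kperf_action_has_thread(actionid)) {
                /* nothing thread-specific configured, so skip the thread samplers */
                flags |= SAMPLE_FLAG_TASK_ONLY;
        } else if (kperf_action_has_thread(actionid) && !kperf_action_has_task(actionid)) {
                flags |= SAMPLE_FLAG_THREAD_ONLY;
        }

        (void)kperf_sample(sbuf, ctx, actionid, flags);
}
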
diff --git a/osfmk/kperf/arm/kperf_meminfo.c b/osfmk/kperf/arm/kperf_meminfo.c
deleted file mode 100644 (file)
index e9d6b10..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2015 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <mach/mach_types.h>
-#include <kern/task.h> /* task_ledgers */
-#include <kern/thread.h>
-#include <kern/ledger.h>
-
-#include <kperf/kperf_arch.h>
-
-kern_return_t
-kperf_get_phys_footprint(task_t task, uint64_t *phys_footprint_out)
-{
-       kern_return_t kr;
-       ledger_amount_t credit, debit;
-       uint64_t phys_footprint;
-
-       kr = ledger_get_entries(task->ledger, task_ledgers.phys_footprint,
-                               &credit, &debit);
-       if (kr == KERN_SUCCESS) {
-               phys_footprint = credit - debit;
-       } else {
-               return kr;
-       }
-
-       *phys_footprint_out = phys_footprint;
-       return KERN_SUCCESS;
-}
-
index 8dbe06fbcc2d92ee421eae11a2e7c8cd3c71de6a..fb24d409785c94f536b7434c4b2c41e06f2eebbe 100644 (file)
@@ -43,7 +43,7 @@
 #define PERF_KPC        (6)
 #define PERF_KDBG       (7)
 #define PERF_TASK       (8)
-/* 9 unused */
+#define PERF_LAZY       (9)
 #define PERF_MEMINFO    (10)
 
 /* helpers for 32-bit */
@@ -78,6 +78,8 @@
 #define PERF_TI_INSCYCDATA_32   PERF_TI_CODE(18)
 #define PERF_TI_SCHEDDATA_2     PERF_TI_CODE(19)
 #define PERF_TI_SCHEDDATA2_32_2 PERF_TI_CODE(20)
+#define PERF_TI_SCHEDDATA3_32   PERF_TI_CODE(21)
+#define PERF_TI_SCHEDDATA_3     PERF_TI_CODE(22)
 
 #define PERF_CS_CODE(code) PERF_CODE(PERF_CALLSTACK, code)
 #define PERF_CS_KSAMPLE    PERF_CS_CODE(0)
 #define PERF_TK_SNAP_DATA     PERF_TK_CODE(1)
 #define PERF_TK_SNAP_DATA1_32 PERF_TK_CODE(2)
 #define PERF_TK_SNAP_DATA2_32 PERF_TK_CODE(3)
+#define PERF_TK_INFO_DATA     PERF_TK_CODE(4)
+
+#define PERF_LZ_CODE(code) PERF_CODE(PERF_LAZY, code)
+#define PERF_LZ_MKRUNNABLE PERF_LZ_CODE(0)
+#define PERF_LZ_WAITSAMPLE PERF_LZ_CODE(1)
+#define PERF_LZ_CPUSAMPLE  PERF_LZ_CODE(2)
 
 #define PERF_MI_CODE(code) PERF_CODE(PERF_MEMINFO, code)
 #define PERF_MI_SAMPLE     PERF_MI_CODE(0)
 #define PERF_MI_DATA       PERF_MI_CODE(1)
 #define PERF_MI_SYS_DATA   PERF_MI_CODE(2)
+#define PERF_MI_SYS_DATA_2 PERF_MI_CODE(3)
 
 /* error sub-codes for trace data */
 enum
index 2fe676882634e2daa2a957ad8ffd64f3364179fe..7c93e8137f44ef0a66473ec0e269aba621d738b6 100644 (file)
@@ -662,6 +662,8 @@ chudxnu_thread_get_callstack64_kperf(
        return chudxnu_thread_get_callstack64_internal( thread, callStack, count, user_only, 0 );
 }
 #elif __arm64__
+
+
 // chudxnu_thread_get_callstack gathers a raw callstack along with any information needed to
 // fix it up later (in case we stopped program as it was saving values into prev stack frame, etc.)
 // after sampling has finished.
index 14eadfe7aeed8b0fc635ab9c8ceeca18f8dd84f1..d0fd4c290cfdc61ade0c3fb4f77c0bf81bf9a201 100644 (file)
@@ -36,11 +36,12 @@ struct kperf_context {
        /* who was running during the event */
        int cur_pid;
        thread_t cur_thread;
+       task_t cur_task;
        uintptr_t *starting_fp;
 
        /* who caused the event */
-       unsigned trigger_type;
-       unsigned trigger_id;
+       unsigned int trigger_type;
+       unsigned int trigger_id;
 };
 
 #endif /* !defined(KPERF_CONTEXT_H) */
index 7c343631b83eebec8ac7fe6b92b302ceec553e24..b649888ac9b16770f3c604dcb12a649d494cd579 100644 (file)
@@ -157,14 +157,14 @@ kperf_kdebug_set_filter(user_addr_t user_filter, uint32_t user_size)
                return err;
        }
 
+       n_debugids_provided = (uint32_t)KPERF_KDEBUG_N_DEBUGIDS(user_size);
+
        /* detect disabling the filter completely */
-       if (user_filter == USER_ADDR_NULL || user_size == 0) {
+       if (n_debugids_provided == 0) {
                bzero(kperf_kdebug_filter, sizeof(*kperf_kdebug_filter));
                goto out;
        }
 
-       n_debugids_provided = (uint32_t)KPERF_KDEBUG_N_DEBUGIDS(user_size);
-
        if ((err = kperf_kdebug_set_n_debugids(n_debugids_provided))) {
                goto out;
        }
index 19f7d870423c39165b7f2d86e553064d5bff0975..831f3afd86563b8b641abc6a545694a38dfdc31a 100644 (file)
@@ -38,6 +38,7 @@
 #include <kperf/kdebug_trigger.h>
 #include <kperf/kperf.h>
 #include <kperf/kperf_timer.h>
+#include <kperf/lazy.h>
 #include <kperf/pet.h>
 #include <kperf/sample.h>
 
@@ -62,6 +63,9 @@ static boolean_t kperf_initted = FALSE;
 /* whether or not to callback to kperf on context switch */
 boolean_t kperf_on_cpu_active = FALSE;
 
+unsigned int kperf_thread_blocked_action;
+unsigned int kperf_cpu_sample_action;
+
 struct kperf_sample *
 kperf_intr_sample_buffer(void)
 {
@@ -140,6 +144,7 @@ kperf_reset(void)
        (void)kperf_sampling_disable();
 
        /* cleanup miscellaneous configuration first */
+       kperf_lazy_reset();
        (void)kperf_kdbg_cswitch_set(0);
        (void)kperf_set_lightweight_pet(0);
        kperf_kdebug_reset();
@@ -209,12 +214,17 @@ kperf_kernel_configure(const char *config)
                }
        } while (*(config++) == ',');
 
-       kperf_sampling_enable();
+       int error = kperf_sampling_enable();
+       if (error) {
+               kprintf("kperf: cannot enable sampling at boot: %d", error);
+       }
 
 out:
        ktrace_end_single_threaded();
 }
 
+void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation,
+               uintptr_t *starting_fp);
 void
 kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation,
                       uintptr_t *starting_fp)
@@ -222,19 +232,22 @@ kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation,
        if (kperf_kdebug_cswitch) {
                /* trace the new thread's PID for Instruments */
                int pid = task_pid(get_threadtask(thread));
-
                BUF_DATA(PERF_TI_CSWITCH, thread_tid(thread), pid);
        }
        if (kperf_lightweight_pet_active) {
                kperf_pet_on_cpu(thread, continuation, starting_fp);
        }
+       if (kperf_lazy_wait_action != 0) {
+               kperf_lazy_wait_sample(thread, continuation, starting_fp);
+       }
 }
 
 void
 kperf_on_cpu_update(void)
 {
        kperf_on_cpu_active = kperf_kdebug_cswitch ||
-                             kperf_lightweight_pet_active;
+                             kperf_lightweight_pet_active ||
+                             kperf_lazy_wait_action != 0;
 }
 
 /* random misc-ish functions */
@@ -321,21 +334,16 @@ kperf_thread_set_dirty(thread_t thread, boolean_t dirty)
 int
 kperf_port_to_pid(mach_port_name_t portname)
 {
-       task_t task;
-       int pid;
-
        if (!MACH_PORT_VALID(portname)) {
                return -1;
        }
 
-       task = port_name_to_task(portname);
-
+       task_t task = port_name_to_task(portname);
        if (task == TASK_NULL) {
                return -1;
        }
-
-       pid = task_pid(task);
-
+       pid_t pid = task_pid(task);
+       /* drop the ref taken by port_name_to_task */
        task_deallocate_internal(task);
 
        return pid;
index 040d032c1e6260c979206e4ccef710955474447d..673a02cd36953d68cbeadd16e07ac691ed8e73ae 100644 (file)
@@ -1,8 +1,8 @@
 /*
- * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
 extern lck_grp_t kperf_lck_grp;
 
 /* the trigger types supported by kperf */
-#define TRIGGER_TYPE_TIMER  (0)
-#define TRIGGER_TYPE_PMI    (1)
-#define TRIGGER_TYPE_KDEBUG (2)
+#define TRIGGER_TYPE_TIMER     (0)
+#define TRIGGER_TYPE_PMI       (1)
+#define TRIGGER_TYPE_KDEBUG    (2)
+#define TRIGGER_TYPE_LAZY_WAIT (3)
+#define TRIGGER_TYPE_LAZY_CPU  (3)
 
 /* helpers to get and set AST flags on a thread */
 uint32_t kperf_get_thread_flags(thread_t thread);
@@ -69,51 +71,78 @@ extern int kperf_sampling_disable(void);
 struct kperf_sample *kperf_intr_sample_buffer(void);
 
 /*
- * kperf AST handler
+ * Callbacks into kperf from other systems.
  */
-extern __attribute__((noinline)) void kperf_thread_ast_handler(thread_t thread);
 
 /*
- * thread on core callback
+ * kperf AST handler
+ *
+ * Prevent inlining, since the sampling function allocates on the stack and
+ * branches calling ast_taken (but never on a kperf AST) may blow their stacks.
  */
-
-/* controls whether the callback is called on context switch */
-extern boolean_t kperf_on_cpu_active;
+extern __attribute__((noinline)) void kperf_thread_ast_handler(thread_t thread);
 
 /* update whether the callback is set */
 void kperf_on_cpu_update(void);
 
-/* handle a thread being switched on */
-void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation,
-                           uintptr_t *starting_fp);
-
-/* for scheduler threads switching threads on */
+/* for scheduler switching threads on */
 static inline void
 kperf_on_cpu(thread_t thread, thread_continue_t continuation,
              uintptr_t *starting_fp)
 {
+       extern boolean_t kperf_on_cpu_active;
+       void kperf_on_cpu_internal(thread_t thread, thread_continue_t continuation,
+                       uintptr_t *starting_fp);
+
        if (__improbable(kperf_on_cpu_active)) {
                kperf_on_cpu_internal(thread, continuation, starting_fp);
        }
 }
 
-/*
- * kdebug callback
- */
+/* for scheduler switching threads off */
+static inline void
+kperf_off_cpu(thread_t thread)
+{
+       extern unsigned int kperf_lazy_cpu_action;
+       void kperf_lazy_off_cpu(thread_t thread);
+
+       if (__improbable(kperf_lazy_cpu_action != 0)) {
+               kperf_lazy_off_cpu(thread);
+       }
+}
 
-/* controls whether the kdebug callback is called */
-extern boolean_t kperf_kdebug_active;
+/* for scheduler making threads runnable */
+static inline void
+kperf_make_runnable(thread_t thread, int interrupt)
+{
+       extern unsigned int kperf_lazy_cpu_action;
+       void kperf_lazy_make_runnable(thread_t thread, bool interrupt);
 
-/* handle the kdebug event */
-void kperf_kdebug_callback_internal(uint32_t debugid);
+       if (__improbable(kperf_lazy_cpu_action != 0)) {
+               kperf_lazy_make_runnable(thread, interrupt);
+       }
+}
 
-/* handle a kdebug event */
-void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp);
+/* for interrupt handler epilogue */
+static inline void
+kperf_interrupt(void)
+{
+       extern unsigned int kperf_lazy_cpu_action;
+       extern void kperf_lazy_cpu_sample(thread_t thread, unsigned int flags,
+                       bool interrupt);
+
+       if (__improbable(kperf_lazy_cpu_action != 0)) {
+               kperf_lazy_cpu_sample(current_thread(), 0, true);
+       }
+}
 
-/* called inside of kernel_debug_internal */
+/* for kdebug on every traced event */
 static inline void
 kperf_kdebug_callback(uint32_t debugid, uintptr_t *starting_fp)
 {
+       extern boolean_t kperf_kdebug_active;
+       void kperf_kdebug_handler(uint32_t debugid, uintptr_t *starting_fp);
+
        if (__improbable(kperf_kdebug_active)) {
                kperf_kdebug_handler(debugid, starting_fp);
        }
@@ -129,21 +158,11 @@ extern void kperf_reset(void);
  */
 void kperf_kernel_configure(const char *config);
 
-/* get and set whether we're recording stacks on interesting kdebug events */
-extern int kperf_kdbg_get_stacks(void);
-extern int kperf_kdbg_set_stacks(int);
-
-extern int kperf_kdebug_cswitch;
+/* given a task port, find out its pid */
+int kperf_port_to_pid(mach_port_name_t portname);
 
 #if DEVELOPMENT || DEBUG
 extern _Atomic long long kperf_pending_ipis;
 #endif /* DEVELOPMENT || DEBUG */
 
-/* get and set whether to output tracepoints on context-switch */
-extern int kperf_kdbg_cswitch_get(void);
-extern int kperf_kdbg_cswitch_set(int newval);
-
-/* given a task port, find out its pid */
-int kperf_port_to_pid(mach_port_name_t portname);
-
 #endif /* !defined(KPERF_H) */
index c8399219128cc829598f44867a17935422238ad4..6c84d89f579d1c7010531ce7698fabd5486400b0 100644 (file)
@@ -32,6 +32,5 @@ struct kperf_timer;
 bool kperf_mp_broadcast_other_running(struct kperf_timer *trigger);
 
 void kperf_signal_handler(unsigned int cpu_number);
-kern_return_t kperf_get_phys_footprint(task_t, uint64_t *);
 
 #endif /* KPERF_ARCH_H */
index 86ed35d8711638d55de4a80599e1654dce7e77f4..49c16419c64700c6a9957ec01f1b80178b4186d8 100644 (file)
@@ -118,12 +118,15 @@ kperf_sample_cpu(struct kperf_timer *timer, bool system_sample,
 #endif /* DEVELOPMENT || DEBUG */
 
        /* On a timer, we can see the "real" current thread */
+       thread_t thread = current_thread();
+       task_t task = get_threadtask(thread);
        struct kperf_context ctx = {
-               .cur_thread = current_thread(),
+               .cur_thread = thread,
+               .cur_task = task,
+               .cur_pid = task_pid(task),
                .trigger_type = TRIGGER_TYPE_TIMER,
                .trigger_id = (unsigned int)(timer - kperf_timerv),
        };
-       ctx.cur_pid = task_pid(get_threadtask(ctx.cur_thread));
 
        if (ctx.trigger_id == pet_timer_id && ncpu < machine_info.logical_cpu_max) {
                kperf_tid_on_cpus[ncpu] = thread_tid(ctx.cur_thread);
@@ -192,7 +195,7 @@ kperf_timer_handler(void *param0, __unused void *param1)
        /*
         * IPI other cores only if the action has non-system samplers.
         */
-       if (kperf_sample_has_non_system(timer->actionid)) {
+       if (kperf_action_has_non_system(timer->actionid)) {
                /*
                 * If the core that's handling the timer is not scheduling
                 * threads, only run system samplers.
index fcc64221720fd60f940a51c4af6e1746d9ce0aac..3d5b91cad729bfbe48b48b5a8115595699f93c45 100644 (file)
@@ -66,23 +66,23 @@ void kperf_ipi_handler(void *param);
 #if defined(__x86_64__)
 
 #define KP_MIN_PERIOD_NS        (20 * NSEC_PER_USEC)
-#define KP_MIN_PERIOD_BG_NS     (10 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_BG_NS     (1 * NSEC_PER_MSEC)
 #define KP_MIN_PERIOD_PET_NS    (2 * NSEC_PER_MSEC)
-#define KP_MIN_PERIOD_PET_BG_NS (10 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_PET_BG_NS (5 * NSEC_PER_MSEC)
 
 #elif defined(__arm64__)
 
 #define KP_MIN_PERIOD_NS        (50 * NSEC_PER_USEC)
-#define KP_MIN_PERIOD_BG_NS     (20 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_BG_NS     (1 * NSEC_PER_MSEC)
 #define KP_MIN_PERIOD_PET_NS    (2 * NSEC_PER_MSEC)
-#define KP_MIN_PERIOD_PET_BG_NS (50 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_PET_BG_NS (10 * NSEC_PER_MSEC)
 
 #elif defined(__arm__)
 
 #define KP_MIN_PERIOD_NS        (100 * NSEC_PER_USEC)
-#define KP_MIN_PERIOD_BG_NS     (50 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_BG_NS     (10 * NSEC_PER_MSEC)
 #define KP_MIN_PERIOD_PET_NS    (2 * NSEC_PER_MSEC)
-#define KP_MIN_PERIOD_PET_BG_NS (100 * NSEC_PER_MSEC)
+#define KP_MIN_PERIOD_PET_BG_NS (50 * NSEC_PER_MSEC)
 
 #else /* defined(__x86_64__) */
 #error "unsupported architecture"
index b89125126fe1adb8e8a334e31323735c59885cb6..6fe1b5c291c3d6c8930699b68a346e988e16ba10 100644 (file)
 #include <kperf/kperfbsd.h>
 #include <kperf/kperf_timer.h>
 #include <kperf/pet.h>
+#include <kperf/lazy.h>
 
 #include <sys/ktrace.h>
 
-/* IDs for dispatch from SYSCTL macros */
-#define REQ_SAMPLING                (1)
-#define REQ_ACTION_COUNT            (2)
-#define REQ_ACTION_SAMPLERS         (3)
-#define REQ_TIMER_COUNT             (4)
-#define REQ_TIMER_PERIOD            (5)
-#define REQ_TIMER_PET               (6)
-#define REQ_TIMER_ACTION            (7)
-#define REQ_BLESS                   (8)
-#define REQ_ACTION_USERDATA         (9)
-#define REQ_ACTION_FILTER_BY_TASK   (10)
-#define REQ_ACTION_FILTER_BY_PID    (11)
-/* 12 unused */
-#define REQ_PET_IDLE_RATE           (13)
-#define REQ_BLESS_PREEMPT           (14)
-#define REQ_KDBG_CSWITCH            (15)
-#define REQ_RESET                   (16)
-/* 17 unused */
-#define REQ_ACTION_UCALLSTACK_DEPTH (18)
-#define REQ_ACTION_KCALLSTACK_DEPTH (19)
-#define REQ_LIGHTWEIGHT_PET         (20)
-#define REQ_KDEBUG_ACTION           (21)
-#define REQ_KDEBUG_FILTER           (22)
+/* Requests from kperf sysctls. */
+enum kperf_request {
+       REQ_SAMPLING,
+       REQ_RESET,
+
+       REQ_ACTION_COUNT,
+       REQ_ACTION_SAMPLERS,
+       REQ_ACTION_USERDATA,
+       REQ_ACTION_FILTER_BY_TASK,
+       REQ_ACTION_FILTER_BY_PID,
+       REQ_ACTION_UCALLSTACK_DEPTH,
+       REQ_ACTION_KCALLSTACK_DEPTH,
+
+       REQ_TIMER_COUNT,
+       REQ_TIMER_PERIOD,
+       REQ_TIMER_PET,
+       REQ_TIMER_ACTION,
+
+       REQ_KDBG_CSWITCH,
+
+       REQ_BLESS,
+       REQ_BLESS_PREEMPT,
+
+       REQ_PET_IDLE_RATE,
+       REQ_LIGHTWEIGHT_PET,
+
+       REQ_KDEBUG_FILTER,
+       REQ_KDEBUG_ACTION,
+
+       REQ_LAZY_WAIT_TIME_THRESHOLD,
+       REQ_LAZY_WAIT_ACTION,
+       REQ_LAZY_CPU_TIME_THRESHOLD,
+       REQ_LAZY_CPU_ACTION,
+};
 
 int kperf_debug_level = 0;
 
@@ -78,7 +90,7 @@ _Atomic long long kperf_pending_ipis = 0;
 #endif /* DEVELOPMENT || DEBUG */
 
 /*
- * kperf has a different sysctl model than others.
+ * kperf has unique requirements from sysctl.
  *
  * For simple queries like the number of actions, the normal sysctl style
  * of get/set works well.
@@ -137,6 +149,28 @@ kperf_sysctl_get_set_int(struct sysctl_req *req,
        return set(value);
 }
 
+static int
+kperf_sysctl_get_set_uint64(struct sysctl_req *req,
+       uint64_t (*get)(void), int (*set)(uint64_t))
+{
+       assert(req != NULL);
+       assert(get != NULL);
+       assert(set != NULL);
+
+       uint64_t value = 0;
+       if (req->oldptr) {
+               value = get();
+       }
+
+       int error = sysctl_io_number(req, value, sizeof(value), &value, NULL);
+
+       if (error || !req->newptr) {
+               return error;
+       }
+
+       return set(value);
+}
+
 static int
 kperf_sysctl_get_set_unsigned_uint32(struct sysctl_req *req,
        int (*get)(unsigned int, uint32_t *), int (*set)(unsigned int, uint32_t))
@@ -311,9 +345,11 @@ sysctl_kdebug_filter(struct sysctl_req *req)
                }
 
                return SYSCTL_OUT(req, filter, filter_size);
+       } else if (req->newptr != USER_ADDR_NULL) {
+               return kperf_kdebug_set_filter(req->newptr, (uint32_t)req->newlen);
+       } else {
+               return EINVAL;
        }
-
-       return kperf_kdebug_set_filter(req->newptr, (uint32_t)req->newlen);
 }
 
 static int
@@ -407,12 +443,40 @@ sysctl_kdbg_cswitch(struct sysctl_req *req)
                kperf_kdbg_cswitch_set);
 }
 
+static int
+sysctl_lazy_wait_time_threshold(struct sysctl_req *req)
+{
+       return kperf_sysctl_get_set_uint64(req, kperf_lazy_get_wait_time_threshold,
+               kperf_lazy_set_wait_time_threshold);
+}
+
+static int
+sysctl_lazy_wait_action(struct sysctl_req *req)
+{
+       return kperf_sysctl_get_set_int(req, kperf_lazy_get_wait_action,
+               kperf_lazy_set_wait_action);
+}
+
+static int
+sysctl_lazy_cpu_time_threshold(struct sysctl_req *req)
+{
+       return kperf_sysctl_get_set_uint64(req, kperf_lazy_get_cpu_time_threshold,
+               kperf_lazy_set_cpu_time_threshold);
+}
+
+static int
+sysctl_lazy_cpu_action(struct sysctl_req *req)
+{
+       return kperf_sysctl_get_set_int(req, kperf_lazy_get_cpu_action,
+               kperf_lazy_set_cpu_action);
+}
+
 static int
 kperf_sysctl SYSCTL_HANDLER_ARGS
 {
 #pragma unused(oidp, arg2)
        int ret;
-       uintptr_t type = (uintptr_t)arg1;
+       enum kperf_request type = (enum kperf_request)arg1;
 
        ktrace_lock();
 
@@ -487,6 +551,18 @@ kperf_sysctl SYSCTL_HANDLER_ARGS
        case REQ_LIGHTWEIGHT_PET:
                ret = sysctl_lightweight_pet(req);
         break;
+       case REQ_LAZY_WAIT_TIME_THRESHOLD:
+               ret = sysctl_lazy_wait_time_threshold(req);
+        break;
+       case REQ_LAZY_WAIT_ACTION:
+               ret = sysctl_lazy_wait_action(req);
+        break;
+       case REQ_LAZY_CPU_TIME_THRESHOLD:
+               ret = sysctl_lazy_cpu_time_threshold(req);
+        break;
+       case REQ_LAZY_CPU_ACTION:
+               ret = sysctl_lazy_cpu_action(req);
+        break;
        default:
                ret = ENOENT;
                break;
@@ -552,7 +628,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, action, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
             "action");
 
 SYSCTL_PROC(_kperf_action, OID_AUTO, count,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED |
+            CTLFLAG_MASKED,
             (void *)REQ_ACTION_COUNT,
             sizeof(int), kperf_sysctl, "I", "Number of actions");
 
@@ -598,7 +675,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, timer, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
             "timer");
 
 SYSCTL_PROC(_kperf_timer, OID_AUTO, count,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_TIMER_COUNT,
             sizeof(int), kperf_sysctl, "I", "Number of time triggers");
 
@@ -615,7 +693,8 @@ SYSCTL_PROC(_kperf_timer, OID_AUTO, action,
             "Timer number and actionid");
 
 SYSCTL_PROC(_kperf_timer, OID_AUTO, pet_timer,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_TIMER_PET,
             sizeof(int), kperf_sysctl, "I", "Which timer ID does PET");
 
@@ -625,7 +704,8 @@ SYSCTL_NODE(_kperf, OID_AUTO, kdebug, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
             "kdebug");
 
 SYSCTL_PROC(_kperf_kdebug, OID_AUTO, action,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void*)REQ_KDEBUG_ACTION,
             sizeof(int), kperf_sysctl, "I", "ID of action to trigger on kdebug events");
 
@@ -634,10 +714,40 @@ SYSCTL_PROC(_kperf_kdebug, OID_AUTO, filter,
             (void*)REQ_KDEBUG_FILTER,
             sizeof(int), kperf_sysctl, "P", "The filter that determines which kdebug events trigger a sample");
 
+/* lazy sampling */
+
+SYSCTL_NODE(_kperf, OID_AUTO, lazy, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
+            "lazy");
+
+SYSCTL_PROC(_kperf_lazy, OID_AUTO, wait_time_threshold,
+            CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+            (void *)REQ_LAZY_WAIT_TIME_THRESHOLD,
+            sizeof(uint64_t), kperf_sysctl, "UQ",
+            "How many ticks a thread must wait to take a sample");
+
+SYSCTL_PROC(_kperf_lazy, OID_AUTO, wait_action,
+            CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+            (void *)REQ_LAZY_WAIT_ACTION,
+            sizeof(uint64_t), kperf_sysctl, "UQ",
+            "Which action to fire when a thread waits longer than threshold");
+
+SYSCTL_PROC(_kperf_lazy, OID_AUTO, cpu_time_threshold,
+            CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+            (void *)REQ_LAZY_CPU_TIME_THRESHOLD,
+            sizeof(uint64_t), kperf_sysctl, "UQ",
+            "Minimum number of ticks a CPU must run between samples");
+
+SYSCTL_PROC(_kperf_lazy, OID_AUTO, cpu_action,
+            CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_MASKED | CTLFLAG_LOCKED,
+            (void *)REQ_LAZY_CPU_ACTION,
+            sizeof(uint64_t), kperf_sysctl, "UQ",
+            "Which action to fire for lazy CPU samples");
+
 /* misc */
 
 SYSCTL_PROC(_kperf, OID_AUTO, sampling,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_SAMPLING,
             sizeof(int), kperf_sysctl, "I", "Sampling running");
 
@@ -647,29 +757,34 @@ SYSCTL_PROC(_kperf, OID_AUTO, reset,
             0, kperf_sysctl, "-", "Reset kperf");
 
 SYSCTL_PROC(_kperf, OID_AUTO, blessed_pid,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, /* must be root */
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED /* must be root */
+            | CTLFLAG_MASKED,
             (void *)REQ_BLESS,
             sizeof(int), kperf_sysctl_bless_handler, "I", "Blessed pid");
 
 SYSCTL_PROC(_kperf, OID_AUTO, blessed_preempt,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED |
+            CTLFLAG_MASKED,
             (void *)REQ_BLESS_PREEMPT,
             sizeof(int), kperf_sysctl, "I", "Blessed preemption");
 
 SYSCTL_PROC(_kperf, OID_AUTO, kdbg_cswitch,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_KDBG_CSWITCH,
             sizeof(int), kperf_sysctl, "I", "Generate context switch info");
 
 SYSCTL_PROC(_kperf, OID_AUTO, pet_idle_rate,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_PET_IDLE_RATE,
             sizeof(int), kperf_sysctl, "I",
             "Rate at which unscheduled threads are forced to be sampled in "
             "PET mode");
 
 SYSCTL_PROC(_kperf, OID_AUTO, lightweight_pet,
-            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED,
+            CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_ANYBODY | CTLFLAG_LOCKED
+            | CTLFLAG_MASKED,
             (void *)REQ_LIGHTWEIGHT_PET,
             sizeof(int), kperf_sysctl, "I",
             "Status of lightweight PET mode");
@@ -679,16 +794,18 @@ SYSCTL_PROC(_kperf, OID_AUTO, lightweight_pet,
 SYSCTL_NODE(_kperf, OID_AUTO, limits, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
             "limits");
 
-#define REQ_LIM_PERIOD_NS (1)
-#define REQ_LIM_BG_PERIOD_NS (2)
-#define REQ_LIM_PET_PERIOD_NS (3)
-#define REQ_LIM_BG_PET_PERIOD_NS (4)
+enum kperf_limit_request {
+       REQ_LIM_PERIOD_NS,
+       REQ_LIM_BG_PERIOD_NS,
+       REQ_LIM_PET_PERIOD_NS,
+       REQ_LIM_BG_PET_PERIOD_NS,
+};
 
 static int
 kperf_sysctl_limits SYSCTL_HANDLER_ARGS
 {
 #pragma unused(oidp, arg2)
-       int type = (int)arg1;
+       enum kperf_limit_request type = (enum kperf_limit_request)arg1;
        uint64_t limit = 0;
 
        switch (type) {
index 16bfb7c91419e0840274af716832da91bb67d750..2e71d403c1868acdd038923216e4798469abd946 100644 (file)
@@ -29,9 +29,7 @@
 #ifndef __KPERF_BSD_H__
 #define __KPERF_BSD_H__
 
-/* bless a process to allow kperf access to a non-root process
- */
-extern int kperf_bless_pid(pid_t newpid);
-
+/* bless a process to allow kperf access to a non-root process */
+int kperf_bless_pid(pid_t newpid);
 
 #endif /* __KPERF_BSD_H__ */
diff --git a/osfmk/kperf/lazy.c b/osfmk/kperf/lazy.c
new file mode 100644 (file)
index 0000000..78e01b2
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <stdint.h>
+
+#include <kern/thread.h>
+
+#include <kperf/action.h>
+#include <kperf/buffer.h>
+#include <kperf/kperf.h>
+#include <kperf/lazy.h>
+#include <kperf/sample.h>
+
+unsigned int kperf_lazy_wait_action = 0;
+unsigned int kperf_lazy_cpu_action = 0;
+uint64_t kperf_lazy_wait_time_threshold = 0;
+uint64_t kperf_lazy_cpu_time_threshold = 0;
+
+void
+kperf_lazy_reset(void)
+{
+       kperf_lazy_wait_action = 0;
+       kperf_lazy_wait_time_threshold = 0;
+       kperf_lazy_cpu_action = 0;
+       kperf_lazy_cpu_time_threshold = 0;
+       kperf_on_cpu_update();
+}
+
+void
+kperf_lazy_off_cpu(thread_t thread)
+{
+       /* try to lazily sample the CPU if the thread was pre-empted */
+       if ((thread->reason & AST_SCHEDULING) != 0) {
+               kperf_lazy_cpu_sample(thread, 0, 0);
+       }
+}
+
+void
+kperf_lazy_make_runnable(thread_t thread, bool in_interrupt)
+{
+       assert(thread->last_made_runnable_time != THREAD_NOT_RUNNABLE);
+       /* ignore threads that race between waiting and waking up */
+       if (thread->last_run_time > thread->last_made_runnable_time) {
+               return;
+       }
+
+       uint64_t wait_time = thread_get_last_wait_duration(thread);
+       if (wait_time > kperf_lazy_wait_time_threshold) {
+               BUF_DATA(PERF_LZ_MKRUNNABLE, (uintptr_t)thread_tid(thread),
+                               thread->sched_pri, in_interrupt ? 1 : 0);
+       }
+}
+
+void
+kperf_lazy_wait_sample(thread_t thread, thread_continue_t continuation,
+               uintptr_t *starting_fp)
+{
+       /* ignore idle threads */
+       if (thread->last_made_runnable_time == THREAD_NOT_RUNNABLE) {
+               return;
+       }
+       /* ignore invalid made runnable times */
+       if (thread->last_made_runnable_time < thread->last_run_time) {
+               return;
+       }
+
+       /* take a sample if thread was waiting for longer than threshold */
+       uint64_t wait_time = thread_get_last_wait_duration(thread);
+       if (wait_time > kperf_lazy_wait_time_threshold) {
+               uint64_t time_now = mach_absolute_time();
+               timer_update(&thread->runnable_timer, time_now);
+               timer_update(&thread->system_timer, time_now);
+
+               uint64_t runnable_time = timer_grab(&thread->runnable_timer);
+               uint64_t running_time = timer_grab(&thread->user_timer) +
+                               timer_grab(&thread->system_timer);
+
+               BUF_DATA(PERF_LZ_WAITSAMPLE, wait_time, runnable_time, running_time);
+
+               task_t task = get_threadtask(thread);
+               struct kperf_context ctx = {
+                       .cur_thread = thread,
+                       .cur_task = task,
+                       .cur_pid = task_pid(task),
+                       .trigger_type = TRIGGER_TYPE_LAZY_WAIT,
+                       .starting_fp = starting_fp,
+               };
+
+               struct kperf_sample *sample = kperf_intr_sample_buffer();
+               if (!sample) {
+                       return;
+               }
+
+               unsigned int flags = SAMPLE_FLAG_PEND_USER;
+               flags |= continuation ? SAMPLE_FLAG_CONTINUATION : 0;
+               flags |= !ml_at_interrupt_context() ? SAMPLE_FLAG_NON_INTERRUPT : 0;
+
+               kperf_sample(sample, &ctx, kperf_lazy_wait_action, flags);
+       }
+}
+
+void
+kperf_lazy_cpu_sample(thread_t thread, unsigned int flags, bool interrupt)
+{
+       assert(ml_get_interrupts_enabled() == FALSE);
+
+       /* take a sample if this CPU's last sample time is beyond the threshold */
+       processor_t processor = current_processor();
+       uint64_t time_now = mach_absolute_time();
+       uint64_t since_last_sample = time_now - processor->kperf_last_sample_time;
+       if (since_last_sample > kperf_lazy_cpu_time_threshold) {
+               processor->kperf_last_sample_time = time_now;
+               timer_update(&thread->runnable_timer, time_now);
+               timer_update(&thread->system_timer, time_now);
+
+               uint64_t runnable_time = timer_grab(&thread->runnable_timer);
+               uint64_t running_time = timer_grab(&thread->user_timer) +
+                               timer_grab(&thread->system_timer);
+
+               BUF_DATA(PERF_LZ_CPUSAMPLE, running_time, runnable_time,
+                               thread->sched_pri, interrupt ? 1 : 0);
+
+               task_t task = get_threadtask(thread);
+               struct kperf_context ctx = {
+                       .cur_thread = thread,
+                       .cur_task = task,
+                       .cur_pid = task_pid(task),
+                       .trigger_type = TRIGGER_TYPE_LAZY_CPU,
+                       .starting_fp = 0,
+               };
+
+               struct kperf_sample *sample = kperf_intr_sample_buffer();
+               if (!sample) {
+                       return;
+               }
+
+               kperf_sample(sample, &ctx, kperf_lazy_cpu_action,
+                               SAMPLE_FLAG_PEND_USER | flags);
+       }
+}
+
+/*
+ * Accessors for configuration.
+ */
+
+int kperf_lazy_get_wait_action(void) { return kperf_lazy_wait_action; }
+
+int
+kperf_lazy_set_wait_action(int action_id)
+{
+       if (action_id < 0 || (unsigned int)action_id > kperf_action_get_count()) {
+               return 1;
+       }
+
+       kperf_lazy_wait_action = action_id;
+       kperf_on_cpu_update();
+       return 0;
+}
+
+uint64_t
+kperf_lazy_get_wait_time_threshold(void)
+{
+       return kperf_lazy_wait_time_threshold;
+}
+
+int
+kperf_lazy_set_wait_time_threshold(uint64_t threshold)
+{
+       kperf_lazy_wait_time_threshold = threshold;
+       return 0;
+}
+
+int kperf_lazy_get_cpu_action(void) { return kperf_lazy_cpu_action; }
+
+int
+kperf_lazy_set_cpu_action(int action_id)
+{
+       if (action_id < 0 || (unsigned int)action_id > kperf_action_get_count()) {
+               return 1;
+       }
+
+       kperf_lazy_cpu_action = action_id;
+       return 0;
+}
+
+uint64_t
+kperf_lazy_get_cpu_time_threshold(void)
+{
+       return kperf_lazy_cpu_time_threshold;
+}
+
+int
+kperf_lazy_set_cpu_time_threshold(uint64_t threshold)
+{
+       kperf_lazy_cpu_time_threshold = threshold;
+       return 0;
+}
+
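The kperf.lazy sysctls registered earlier are the intended user-space knobs for this new sampler. Below is a minimal sketch of configuring it from user space; the "kperf.lazy.*" names and the values are assumptions (the node name is inferred from the _kperf_lazy parent above), and the caller is assumed to satisfy whatever access policy kperf_sysctl enforces (root or a blessed pid).

#include <stdint.h>
#include <stdio.h>
#include <sys/sysctl.h>

int main(void)
{
    /* Hypothetical values: thresholds are in mach absolute time units. */
    uint64_t wait_threshold = 10000;  /* sample waits longer than this */
    uint64_t wait_action = 1;         /* an action ID configured elsewhere */

    if (sysctlbyname("kperf.lazy.wait_time_threshold", NULL, NULL,
                     &wait_threshold, sizeof(wait_threshold)) != 0) {
        perror("wait_time_threshold");
        return 1;
    }
    if (sysctlbyname("kperf.lazy.wait_action", NULL, NULL,
                     &wait_action, sizeof(wait_action)) != 0) {
        perror("wait_action");
        return 1;
    }
    return 0;
}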
diff --git a/osfmk/kperf/lazy.h b/osfmk/kperf/lazy.h
new file mode 100644 (file)
index 0000000..c09fabe
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef KPERF_LAZY_H
+#define KPERF_LAZY_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <kern/thread.h>
+
+extern unsigned int kperf_lazy_wait_action;
+extern unsigned int kperf_lazy_cpu_action;
+
+void kperf_lazy_reset(void);
+void kperf_lazy_off_cpu(thread_t thread);
+void kperf_lazy_make_runnable(thread_t thread, bool in_interrupt);
+void kperf_lazy_wait_sample(thread_t thread,
+               thread_continue_t continuation, uintptr_t *starting_fp);
+void kperf_lazy_cpu_sample(thread_t thread, unsigned int flags, bool interrupt);
+
+/* accessors for configuration */
+int kperf_lazy_get_wait_action(void);
+int kperf_lazy_get_cpu_action(void);
+int kperf_lazy_set_wait_action(int action_id);
+int kperf_lazy_set_cpu_action(int action_id);
+uint64_t kperf_lazy_get_wait_time_threshold(void);
+uint64_t kperf_lazy_get_cpu_time_threshold(void);
+int kperf_lazy_set_wait_time_threshold(uint64_t threshold);
+int kperf_lazy_set_cpu_time_threshold(uint64_t threshold);
+
+#endif /* !defined(KPERF_LAZY_H) */
index 15de26436b7f7e19fc2f18f116cf0808342c52c9..03616d0851862af648d40f7a34a117356df422f9 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2011 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
 #include <mach/mach_types.h>
 #include <kern/task.h> /* task_ledgers */
-#include <kern/thread.h>
 #include <kern/ledger.h>
 
 #include <kperf/kperf.h>
-#include <kperf/kperf_arch.h>
 
 #include <kperf/buffer.h>
 #include <kperf/context.h>
 
 /* collect current memory info */
 void
-kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context)
+kperf_meminfo_sample(task_t task, struct meminfo *mi)
 {
-       task_t task;
        ledger_amount_t credit, debit;
-       uint64_t phys_footprint;
        kern_return_t kr;
 
-       assert(mi);
-       assert(context);
-
-       thread_t thread = context->cur_thread;
+       assert(mi != NULL);
 
-       BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread));
+       BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_START);
 
-       task = get_threadtask(thread);
-
-       kr = kperf_get_phys_footprint(task, &phys_footprint);
-       if (kr == KERN_SUCCESS) {
-               mi->phys_footprint = phys_footprint;
-       } else {
-               mi->phys_footprint = UINT64_MAX;
-       }
+       mi->phys_footprint = get_task_phys_footprint(task);
 
        kr = ledger_get_entries(task->ledger, task_ledgers.purgeable_volatile,
                                &credit, &debit);
@@ -80,7 +66,7 @@ kperf_meminfo_sample(struct meminfo *mi, struct kperf_context *context)
                mi->purgeable_volatile_compressed = UINT64_MAX;
        }
 
-       BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_END, (uintptr_t)thread_tid(thread));
+       BUF_INFO(PERF_MI_SAMPLE | DBG_FUNC_END);
 }
 
 /* log an existing sample into the buffer */
@@ -90,4 +76,3 @@ kperf_meminfo_log(struct meminfo *mi)
        BUF_DATA(PERF_MI_DATA, mi->phys_footprint, mi->purgeable_volatile,
                 mi->purgeable_volatile_compressed);
 }
-
index 5103e1ef4b8144ad219912cd3c968994bc28b5b1..a51c1794f8e759c77c44dfda6227b84138f4baf3 100644 (file)
@@ -31,6 +31,7 @@
 
 #include <mach/mach_types.h>
 #include <kern/ledger.h>
+#include <kern/task.h>
 
 struct meminfo {
        uint64_t phys_footprint;
@@ -39,7 +40,7 @@ struct meminfo {
 };
 
 struct kperf_context;
-extern void kperf_meminfo_sample(struct meminfo *, struct kperf_context *);
+extern void kperf_meminfo_sample(task_t, struct meminfo *);
 extern void kperf_meminfo_log(struct meminfo *mi);
 
 #endif /* __KPERF_MEMINFO_H__ */
index 5af12821cb205e855e40837075de2d90fd01f70b..0bfb626cef9c436180bd2adb37065dc23903e69b 100644 (file)
@@ -120,7 +120,8 @@ static kern_return_t pet_threads_prepare(task_t task);
 
 static void pet_sample_all_tasks(uint32_t idle_rate);
 static void pet_sample_task(task_t task, uint32_t idle_rate);
-static void pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate);
+static void pet_sample_thread(int pid, task_t task, thread_t thread,
+               uint32_t idle_rate);
 
 /* functions called by other areas of kperf */
 
@@ -161,9 +162,11 @@ kperf_pet_on_cpu(thread_t thread, thread_continue_t continuation,
        if (thread->kperf_pet_gen != kperf_pet_gen) {
                BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START, kperf_pet_gen, thread->kperf_pet_gen);
 
+               task_t task = get_threadtask(thread);
                struct kperf_context ctx = {
                        .cur_thread = thread,
-                       .cur_pid = task_pid(get_threadtask(thread)),
+                       .cur_task = task,
+                       .cur_pid = task_pid(task),
                        .starting_fp = starting_fp,
                };
                /*
@@ -345,17 +348,18 @@ pet_thread_loop(void *param, wait_result_t wr)
 /* sampling */
 
 static void
-pet_sample_thread(int pid, thread_t thread, uint32_t idle_rate)
+pet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
 {
        lck_mtx_assert(pet_lock, LCK_MTX_ASSERT_OWNED);
 
-       uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS;
+       uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS | SAMPLE_FLAG_THREAD_ONLY;
 
        BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);
 
        /* work out the context */
        struct kperf_context ctx = {
                .cur_thread = thread,
+               .cur_task = task,
                .cur_pid = pid,
        };
 
@@ -441,21 +445,51 @@ pet_sample_task(task_t task, uint32_t idle_rate)
 
        BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);
 
-       kern_return_t kr = pet_threads_prepare(task);
-       if (kr != KERN_SUCCESS) {
-               BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
-               BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
+       int pid = task_pid(task);
+       if (kperf_action_has_task(pet_action_id)) {
+               struct kperf_context ctx = {
+                       .cur_task = task,
+                       .cur_pid = pid,
+               };
+
+               kperf_sample(pet_sample, &ctx, pet_action_id, SAMPLE_FLAG_TASK_ONLY);
+       }
+
+       if (!kperf_action_has_thread(pet_action_id)) {
+               BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
                return;
        }
 
-       int pid = task_pid(task);
+       kern_return_t kr = KERN_SUCCESS;
+
+       /*
+        * Suspend the task to see an atomic snapshot of all its threads.  This
+        * is expensive, and disruptive.
+        */
+       bool needs_suspend = task != kernel_task;
+       if (needs_suspend) {
+               kr = task_suspend_internal(task);
+               if (kr != KERN_SUCCESS) {
+                       BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
+                       return;
+               }
+               needs_suspend = true;
+       }
+
+       kr = pet_threads_prepare(task);
+       if (kr != KERN_SUCCESS) {
+               BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
+               goto out;
+       }
 
        for (unsigned int i = 0; i < pet_threads_count; i++) {
                thread_t thread = pet_threads[i];
-               int cpu;
-               assert(thread);
+               assert(thread != THREAD_NULL);
 
-               /* do not sample the thread if it was on a CPU during the IPI. */
+               /*
+                * Do not sample the thread if it was on a CPU when the timer fired.
+                */
+               int cpu = 0;
                for (cpu = 0; cpu < machine_info.logical_cpu_max; cpu++) {
                        if (kperf_tid_on_cpus[cpu] == thread_tid(thread)) {
                                break;
@@ -464,12 +498,17 @@ pet_sample_task(task_t task, uint32_t idle_rate)
 
                /* the thread was not on a CPU */
                if (cpu == machine_info.logical_cpu_max) {
-                       pet_sample_thread(pid, thread, idle_rate);
+                       pet_sample_thread(pid, task, thread, idle_rate);
                }
 
                thread_deallocate(pet_threads[i]);
        }
 
+out:
+       if (needs_suspend) {
+               task_resume_internal(task);
+       }
+
        BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, pet_threads_count);
 }
 
@@ -556,18 +595,7 @@ pet_sample_all_tasks(uint32_t idle_rate)
        for (unsigned int i = 0; i < pet_tasks_count; i++) {
                task_t task = pet_tasks[i];
 
-               if (task != kernel_task) {
-                       kr = task_suspend_internal(task);
-                       if (kr != KERN_SUCCESS) {
-                               continue;
-                       }
-               }
-
                pet_sample_task(task, idle_rate);
-
-               if (task != kernel_task) {
-                       task_resume_internal(task);
-               }
        }
 
        for(unsigned int i = 0; i < pet_tasks_count; i++) {
index f976518d4ec50eb7cfc401366f5ab9f4cfec9947..3d521b782988fd30d499f5097cc7aa0371afc517 100644 (file)
 
 #include <kern/task.h>
 
-extern boolean_t memorystatus_proc_is_dirty_unsafe(void *v);
+extern void memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty,
+               boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
 
 void
-kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn,
-                           struct kperf_context *ctx)
+kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn)
 {
-       thread_t thread;
-       task_t task;
-
        BUF_INFO(PERF_TK_SNAP_SAMPLE | DBG_FUNC_START);
 
        assert(tksn != NULL);
-       assert(ctx != NULL);
-
-       thread = ctx->cur_thread;
-       task = get_threadtask(thread);
 
        tksn->kptksn_flags = 0;
        if (task->effective_policy.tep_darwinbg) {
@@ -61,9 +54,17 @@ kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn,
                tksn->kptksn_flags |= KPERF_TASK_FLAG_BOOSTED;
        }
 #if CONFIG_MEMORYSTATUS
-       if (memorystatus_proc_is_dirty_unsafe(task->bsd_info)) {
+       boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE;
+       memorystatus_proc_flags_unsafe(task->bsd_info, &dirty, &dirty_tracked, &allow_idle_exit);
+       if (dirty) {
                tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY;
        }
+       if (dirty_tracked) {
+               tksn->kptksn_flags |= KPERF_TASK_FLAG_DIRTY_TRACKED;
+       }
+       if (allow_idle_exit) {
+               tksn->kptksn_flags |= KPERF_TASK_ALLOW_IDLE_EXIT;
+       }
 #endif
 
        tksn->kptksn_suspend_count = task->suspend_count;
@@ -96,3 +97,11 @@ kperf_task_snapshot_log(struct kperf_task_snapshot *tksn)
                                        LOWER_32(tksn->kptksn_system_time_in_terminated_threads));
 #endif /* defined(__LP64__) */
 }
+
+void
+kperf_task_info_log(struct kperf_context *ctx)
+{
+       assert(ctx != NULL);
+
+       BUF_DATA(PERF_TK_INFO_DATA, ctx->cur_pid);
+}
index ebebeb5522949af22136fcc6742679492cf30470..d47b15bca2390649b6f05fa43a243bc5a373fdc5 100644 (file)
@@ -30,6 +30,7 @@
 #define KPERF_TASK_SAMPLERS_H
 
 #include <kperf/context.h>
+#include <kern/task.h>
 
 struct kperf_task_snapshot {
        uint64_t kptksn_flags;
@@ -46,9 +47,11 @@ struct kperf_task_snapshot {
 #define KPERF_TASK_FLAG_WQ_FLAGS_VALID          (1U << 4)
 #define KPERF_TASK_FLAG_WQ_EXCEEDED_TOTAL       (1U << 5)
 #define KPERF_TASK_FLAG_WQ_EXCEEDED_CONSTRAINED (1U << 6)
+#define KPERF_TASK_FLAG_DIRTY_TRACKED           (1U << 7)
+#define KPERF_TASK_ALLOW_IDLE_EXIT              (1U << 8)
 
-void kperf_task_snapshot_sample(struct kperf_task_snapshot *tksn,
-                                struct kperf_context *ctx);
+void kperf_task_snapshot_sample(task_t task, struct kperf_task_snapshot *tksn);
 void kperf_task_snapshot_log(struct kperf_task_snapshot *tksn);
+void kperf_task_info_log(struct kperf_context *ctx);
 
 #endif /* !defined(KPERF_TASK_SAMPLERS_H) */
index 176520f0c102c412ecc739c0a6b13d1f7e897b9b..36e2196fe53fb4c54ab27f4aa33ec048be0a22dd 100644 (file)
@@ -140,8 +140,8 @@ kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc,
 
        BUF_INFO(PERF_TI_SCHEDSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread));
 
-       thsc->kpthsc_user_time = timer_grab(&(thread->user_timer));
-       uint64_t system_time = timer_grab(&(thread->system_timer));
+       thsc->kpthsc_user_time = timer_grab(&thread->user_timer);
+       uint64_t system_time = timer_grab(&thread->system_timer);
 
        if (thread->precise_user_kernel_time) {
                thsc->kpthsc_system_time = system_time;
@@ -150,12 +150,14 @@ kperf_thread_scheduling_sample(struct kperf_thread_scheduling *thsc,
                thsc->kpthsc_system_time = 0;
        }
 
+       thsc->kpthsc_runnable_time = timer_grab(&thread->runnable_timer);
        thsc->kpthsc_state = thread->state;
        thsc->kpthsc_base_priority = thread->base_pri;
        thsc->kpthsc_sched_priority = thread->sched_pri;
        thsc->kpthsc_effective_qos = thread->effective_policy.thep_qos;
        thsc->kpthsc_requested_qos = thread->requested_policy.thrp_qos;
-       thsc->kpthsc_requested_qos_override = thread->requested_policy.thrp_qos_override;
+       thsc->kpthsc_requested_qos_override = MAX(thread->requested_policy.thrp_qos_override,
+                       thread->requested_policy.thrp_qos_workq_override);
        thsc->kpthsc_requested_qos_promote = thread->requested_policy.thrp_qos_promote;
        thsc->kpthsc_requested_qos_ipc_override = thread->requested_policy.thrp_qos_ipc_override;
        thsc->kpthsc_requested_qos_sync_ipc_override = thread->requested_policy.thrp_qos_sync_ipc_override;
@@ -183,6 +185,7 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc)
                        | ((uint64_t)thsc->kpthsc_requested_qos_ipc_override << 55)
                        | ((uint64_t)thsc->kpthsc_requested_qos_sync_ipc_override << 52)
                        );
+       BUF_DATA(PERF_TI_SCHEDDATA_3, thsc->kpthsc_runnable_time);
 #else
        BUF_DATA(PERF_TI_SCHEDDATA1_32, UPPER_32(thsc->kpthsc_user_time),
                        LOWER_32(thsc->kpthsc_user_time),
@@ -200,6 +203,8 @@ kperf_thread_scheduling_log(struct kperf_thread_scheduling *thsc)
                        | ((uint32_t)thsc->kpthsc_requested_qos_ipc_override << 23)
                        | ((uint32_t)thsc->kpthsc_requested_qos_sync_ipc_override << 20)
                        );
+       BUF_DATA(PERF_TI_SCHEDDATA3_32, UPPER_32(thsc->kpthsc_runnable_time),
+                       LOWER_32(thsc->kpthsc_runnable_time));
 #endif /* defined(__LP64__) */
 }
 
@@ -282,7 +287,7 @@ kperf_thread_dispatch_sample(struct kperf_thread_dispatch *thdi,
        BUF_INFO(PERF_TI_DISPSAMPLE | DBG_FUNC_START, (uintptr_t)thread_tid(thread));
 
        task_t task = thread->task;
-       boolean_t task_64 = task_has_64BitAddr(task);
+       boolean_t task_64 = task_has_64Bit_addr(task);
        size_t user_addr_size = task_64 ? 8 : 4;
 
        assert(thread->task != kernel_task);
@@ -364,7 +369,9 @@ kperf_thread_inscyc_log(struct kperf_context *context)
        BUF_DATA(PERF_TI_INSCYCDATA_32, 0, 0, UPPER_32(counts[MT_CORE_CYCLES]),
                        LOWER_32(counts[MT_CORE_CYCLES]));
 #endif /* !defined(__LP64__) */
-#else /* MONOTONIC */
+
+#else
 #pragma unused(context)
-#endif /* !MONOTONIC */
+#endif /* MONOTONIC */
+
 }
index e5a9eaeff5f9120e8e95da47a37897db8f67f9b7..f443be7dd03ea886e66305f4b61750b48688d363 100644 (file)
@@ -47,6 +47,7 @@ void kperf_thread_info_log(struct kperf_thread_info *);
 struct kperf_thread_scheduling {
        uint64_t kpthsc_user_time;
        uint64_t kpthsc_system_time;
+       uint64_t kpthsc_runnable_time;
        unsigned int kpthsc_state;
        uint16_t kpthsc_base_priority;
        uint16_t kpthsc_sched_priority;
diff --git a/osfmk/kperf/x86_64/kperf_meminfo.c b/osfmk/kperf/x86_64/kperf_meminfo.c
deleted file mode 100644 (file)
index 9ed5acc..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2015 Apple Computer, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <mach/mach_types.h>
-#include <kern/task.h> /* task_ledgers */
-#include <kern/thread.h>
-#include <kern/ledger.h>
-
-#include <kperf/kperf_arch.h>
-
-kern_return_t
-kperf_get_phys_footprint(task_t task, uint64_t *phys_footprint_out)
-{
-       kern_return_t kr;
-       ledger_amount_t credit, debit;
-       uint64_t phys_footprint;
-
-       kr = ledger_get_entries(task->ledger, task_ledgers.internal,
-                               &credit, &debit);
-       if (kr == KERN_SUCCESS) {
-               phys_footprint = credit - debit;
-       } else {
-               return kr;
-       }
-
-       kr = ledger_get_entries(task->ledger, task_ledgers.internal_compressed,
-                               &credit, &debit);
-       if (kr == KERN_SUCCESS) {
-               phys_footprint += credit - debit;
-       } else {
-               return kr;
-       }
-
-       *phys_footprint_out = phys_footprint;
-       return KERN_SUCCESS;
-}
-
index 9c293a5ce8f11b0f513ee33398d3ed377ff184bf..e728f0a4f4c0b879dd5ba8d6fdc6f8dc9e1b71d9 100644 (file)
@@ -39,6 +39,7 @@ MIG_DEFS =    \
        mach_vm.defs \
        mach_voucher.defs \
        mach_voucher_attr_control.defs \
+       memory_entry.defs \
        notify.defs \
        processor.defs \
        processor_set.defs \
@@ -86,6 +87,7 @@ MIG_UUHDRS = \
        mach_vm.h \
        mach_voucher.h \
        mach_voucher_attr_control.h \
+       memory_entry.h \
        memory_object_control.h \
        processor.h \
        processor_set.h \
@@ -166,7 +168,6 @@ INSTALL_MI_LIST     = \
 # installed into System.framework's PrivateHeaders/mach subdirectory
 PRIVATE_DATAFILES = \
        bootstrap.h \
-       branch_predicates.h \
        coalition.h \
        coalition_notification.defs \
        host_info.h \
@@ -206,7 +207,6 @@ INSTALL_MI_GEN_LIST =
 INSTALL_MI_DIR = mach
 
 EXPORT_MI_LIST = \
-       branch_predicates.h \
        coalition.h \
        mach_interface.h \
        resource_monitors.h \
@@ -303,6 +303,7 @@ MIG_KSHDRS = \
        mach_vm_server.h \
        mach_voucher_server.h \
        mach_voucher_attr_control_server.h \
+       memory_entry_server.h \
        memory_object_control_server.h \
        memory_object_default_server.h \
        processor_server.h \
@@ -327,6 +328,7 @@ MIG_KSSRC = \
        mach_vm_server.c \
        mach_voucher_server.c \
        mach_voucher_attr_control_server.c \
+       memory_entry_server.c \
        memory_object_control_server.c \
        memory_object_default_server.c \
        processor_server.c \
index b7db3d2b6e9950c40898a72dee00776818fc3a0f..2392307600b5cb7ccc8bc7323bae412dc46bc8bb 100644 (file)
@@ -112,11 +112,58 @@ _STRUCT_ARM_THREAD_STATE64
        __uint64_t    fp;               /* Frame pointer x29 */
        __uint64_t    lr;               /* Link register x30 */
        __uint64_t    sp;               /* Stack pointer x31 */
-       __uint64_t    pc;               /* Program counter */
+       __uint64_t    pc;               /* Program counter */
        __uint32_t    cpsr;             /* Current program status register */
        __uint32_t    __pad;    /* Same size for 32-bit or 64-bit clients */
 };
 #endif /* __DARWIN_UNIX03 */
+#if !defined(KERNEL)
+#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__)
+#if __DARWIN_UNIX03
+#define __darwin_arm_thread_state64_get_pc(ts) \
+               ((ts).__pc)
+#define __darwin_arm_thread_state64_get_pc_fptr(ts) \
+               ((void*)(uintptr_t)((ts).__pc))
+#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
+               ((ts).__pc = (uintptr_t)(fptr))
+#define __darwin_arm_thread_state64_get_lr(ts) \
+               ((ts).__lr)
+#define __darwin_arm_thread_state64_get_lr_fptr(ts) \
+               ((void*)(uintptr_t)((ts).__lr))
+#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
+               ((ts).__lr = (uintptr_t)(fptr))
+#define __darwin_arm_thread_state64_get_sp(ts) \
+               ((ts).__sp)
+#define __darwin_arm_thread_state64_set_sp(ts, ptr) \
+               ((ts).__sp = (uintptr_t)(ptr))
+#define __darwin_arm_thread_state64_get_fp(ts) \
+               ((ts).__fp)
+#define __darwin_arm_thread_state64_set_fp(ts, ptr) \
+               ((ts).__fp = (uintptr_t)(ptr))
+#else /* !__DARWIN_UNIX03 */
+#define __darwin_arm_thread_state64_get_pc(ts) \
+               ((ts).pc)
+#define __darwin_arm_thread_state64_get_pc_fptr(ts) \
+               ((void*)(uintptr_t)((ts).pc))
+#define __darwin_arm_thread_state64_set_pc_fptr(ts, fptr) \
+               ((ts).pc = (uintptr_t)(fptr))
+#define __darwin_arm_thread_state64_get_lr(ts) \
+               ((ts).lr)
+#define __darwin_arm_thread_state64_get_lr_fptr(ts) \
+               ((void*)(uintptr_t)((ts).lr))
+#define __darwin_arm_thread_state64_set_lr_fptr(ts, fptr) \
+               ((ts).lr = (uintptr_t)(fptr))
+#define __darwin_arm_thread_state64_get_sp(ts) \
+               ((ts).sp)
+#define __darwin_arm_thread_state64_set_sp(ts, ptr) \
+               ((ts).sp = (uintptr_t)(ptr))
+#define __darwin_arm_thread_state64_get_fp(ts) \
+               ((ts).fp)
+#define __darwin_arm_thread_state64_set_fp(ts, ptr) \
+               ((ts).fp = (uintptr_t)(ptr))
+#endif /* __DARWIN_UNIX03 */
+#endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */
+#endif /* !defined(KERNEL) */
 
 #if __DARWIN_UNIX03
 #define _STRUCT_ARM_VFP_STATE          struct __darwin_arm_vfp_state
index 318134d1f955daa0a36b7f90cd33d92bad5f697a..0751024a2945c7679ff2a534ee8f7e0b45857a3f 100644 (file)
 
 #define DTRACE_LABEL(p, n)                     \
        ".pushsection __DATA, __data\n\t"       \
-        ".globl " DTRACE_LAB(p, n) "\n\t"      \
-        DTRACE_LAB(p, n) ":" ".long 1f""\n\t"  \
+       ".p2align       2\n\t"                  \
+       ".globl " DTRACE_LAB(p, n) "\n\t"       \
+       DTRACE_LAB(p, n) ":" ".long 1f""\n\t"   \
        ".popsection" "\n\t"                    \
        "1:"
 #else /* __arm64__ */
 #define DTRACE_LAB(p, n)                                                                        \
-    "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n)
+       "__dtrace_probe$" DTRACE_TOSTRING(%=__LINE__) DTRACE_STRINGIFY(_##p##___##n)
 
 #define DTRACE_LABEL(p, n)                     \
        ".pushsection __DATA, __data\n\t"       \
-        ".globl " DTRACE_LAB(p, n) "\n\t"      \
-        DTRACE_LAB(p, n) ":" ".quad 1f""\n\t"  \
+       ".p2align       3\n\t"                  \
+       ".globl " DTRACE_LAB(p, n) "\n\t"       \
+       DTRACE_LAB(p, n) ":" ".quad 1f""\n\t"   \
        ".popsection" "\n\t"                    \
        "1:"
 #endif
index 7f4ac7d0482e5ed1a2feaf379bc2edacdb5854d7..8bdbe8a9e13dcfe8949f8ceb54c9e69956d70f69 100644 (file)
@@ -96,6 +96,31 @@ typedef _STRUCT_ARM_THREAD_STATE             arm_thread_state_t;
 typedef _STRUCT_ARM_THREAD_STATE               arm_thread_state32_t;
 typedef _STRUCT_ARM_THREAD_STATE64             arm_thread_state64_t;
 
+#if !defined(KERNEL)
+#if __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__)
+#define arm_thread_state64_get_pc(ts) \
+               __darwin_arm_thread_state64_get_pc(ts)
+#define arm_thread_state64_get_pc_fptr(ts) \
+               __darwin_arm_thread_state64_get_pc_fptr(ts)
+#define arm_thread_state64_set_pc_fptr(ts, fptr) \
+               __darwin_arm_thread_state64_set_pc_fptr(ts, fptr)
+#define arm_thread_state64_get_lr(ts) \
+               __darwin_arm_thread_state64_get_lr(ts)
+#define arm_thread_state64_get_lr_fptr(ts) \
+               __darwin_arm_thread_state64_get_lr_fptr(ts)
+#define arm_thread_state64_set_lr_fptr(ts, fptr) \
+               __darwin_arm_thread_state64_set_lr_fptr(ts, fptr)
+#define arm_thread_state64_get_sp(ts) \
+               __darwin_arm_thread_state64_get_sp(ts)
+#define arm_thread_state64_set_sp(ts, ptr) \
+               __darwin_arm_thread_state64_set_sp(ts, ptr)
+#define arm_thread_state64_get_fp(ts) \
+               __darwin_arm_thread_state64_get_fp(ts)
+#define arm_thread_state64_set_fp(ts, ptr) \
+               __darwin_arm_thread_state64_set_fp(ts, ptr)
+#endif /* __DARWIN_C_LEVEL >= __DARWIN_C_FULL && defined(__arm64__) */
+#endif /* !defined(KERNEL) */
+
 struct arm_unified_thread_state {
        arm_state_hdr_t ash;
        union {
@@ -364,12 +389,12 @@ typedef struct arm_saved_state32_tagged arm_saved_state32_tagged_t;
                (sizeof (arm_saved_state32_t)/sizeof(unsigned int)))
 
 struct arm_saved_state64 {
-       uint64_t    x[29];              /* General purpose registers x0-x28 */
-       uint64_t    fp;                 /* Frame pointer x29 */
-       uint64_t    lr;                 /* Link register x30 */
-       uint64_t    sp;                 /* Stack pointer x31 */
-       uint64_t    pc;                 /* Program counter */
-       uint32_t    cpsr;               /* Current program status register */
+       uint64_t        x[29];          /* General purpose registers x0-x28 */
+       uint64_t        fp;                     /* Frame pointer x29 */
+       uint64_t        lr;                     /* Link register x30 */
+       uint64_t        sp;                     /* Stack pointer x31 */
+       uint64_t        pc;                     /* Program counter */
+       uint32_t        cpsr;           /* Current program status register */
        uint32_t        reserved;       /* Reserved padding */
        uint64_t        far;            /* Virtual fault address */
        uint32_t        esr;            /* Exception syndrome register */
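The arm_thread_state64_get_*/_set_* wrappers introduced above let user code read a captured thread state without caring whether the fields are spelled pc or __pc. A brief usage sketch for arm64 targets, assuming a thread port obtained and suspended elsewhere; error handling is kept minimal.

#include <stdio.h>
#include <mach/mach.h>
#include <mach/thread_status.h>

/* Print the pc and sp of a thread using the new accessor macros. */
static void
print_thread_state(thread_act_t thread)
{
    arm_thread_state64_t state;
    mach_msg_type_number_t count = ARM_THREAD_STATE64_COUNT;

    kern_return_t kr = thread_get_state(thread, ARM_THREAD_STATE64,
                                        (thread_state_t)&state, &count);
    if (kr != KERN_SUCCESS) {
        return;
    }

    /* Works under both __DARWIN_UNIX03 spellings of the struct fields. */
    printf("pc = 0x%llx  sp = 0x%llx\n",
           (unsigned long long)arm_thread_state64_get_pc(state),
           (unsigned long long)arm_thread_state64_get_sp(state));
}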
index 915e237e6a1b99379cf09e57d3e493544594aab1..c0ed53a8e574e189033de28a76a434858442daea 100644 (file)
@@ -166,6 +166,7 @@ extern unsigned             PAGE_SHIFT_CONST;
 #ifdef KERNEL
 
 #if defined (__arm__)
+#define VM_KERNEL_POINTER_SIGNIFICANT_BITS  32
 #define VM_MIN_KERNEL_ADDRESS  ((vm_address_t) 0x80000000)
 #define VM_MAX_KERNEL_ADDRESS  ((vm_address_t) 0xFFFEFFFF)
 #define VM_HIGH_KERNEL_WINDOW  ((vm_address_t) 0xFFFE0000)
@@ -174,6 +175,7 @@ extern unsigned             PAGE_SHIFT_CONST;
  * The minimum and maximum kernel address; some configurations may
  * constrain the address space further.
  */
+#define VM_KERNEL_POINTER_SIGNIFICANT_BITS  37
 #define VM_MIN_KERNEL_ADDRESS  ((vm_address_t) 0xffffffe000000000ULL)
 #define VM_MAX_KERNEL_ADDRESS  ((vm_address_t) 0xfffffff3ffffffffULL)
 #else
@@ -183,8 +185,11 @@ extern unsigned            PAGE_SHIFT_CONST;
 #define VM_MIN_KERNEL_AND_KEXT_ADDRESS \
                                VM_MIN_KERNEL_ADDRESS
 
-#define VM_KERNEL_ADDRESS(va)  ((((vm_address_t)(va))>=VM_MIN_KERNEL_ADDRESS) && \
-                               (((vm_address_t)(va))<=VM_MAX_KERNEL_ADDRESS))
+#define VM_KERNEL_STRIP_PTR(_v) (_v)
+
+#define VM_KERNEL_ADDRESS(_va) \
+       ((((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) >= VM_MIN_KERNEL_ADDRESS) && \
+        (((vm_address_t)VM_KERNEL_STRIP_PTR(_va)) <= VM_MAX_KERNEL_ADDRESS))
 
 #ifdef  MACH_KERNEL_PRIVATE
 /*
@@ -193,22 +198,41 @@ extern unsigned           PAGE_SHIFT_CONST;
 extern unsigned long           gVirtBase, gPhysBase, gPhysSize;
 
 #define isphysmem(a)           (((vm_address_t)(a) - gPhysBase) < gPhysSize)
-#define phystokv(a)            ((vm_address_t)(a) - gPhysBase + gVirtBase)
 
 #if KASAN
 /* Increase the stack sizes to account for the redzones that get added to every
  * stack object. */
 # define KERNEL_STACK_SIZE     (4*4*4096)
-# define INTSTACK_SIZE         (4*4*4096)
 #else
 # define KERNEL_STACK_SIZE     (4*4096)
-# define INTSTACK_SIZE         (4*4096)
+#endif
+
+#define INTSTACK_SIZE          (4*4096)
+
+#ifdef __arm64__
+#define EXCEPSTACK_SIZE                (4*4096)
+#else
+#define FIQSTACK_SIZE          (4096)
 #endif
 
 #if defined (__arm__)
 #define HIGH_EXC_VECTORS       ((vm_address_t) 0xFFFF0000)
 #endif
 
+/*
+ * TODO: We're hardcoding the expected virtual TEXT base here;
+ * that gives us an ugly dependency on a linker argument in
+ * the make files.  Clean this up, so we don't hardcode it
+ * twice; this is nothing but trouble.
+ */
+#if defined (__arm__)
+#define VM_KERNEL_LINK_ADDRESS  ((vm_address_t) 0x80000000)
+#elif defined (__arm64__)
+#define VM_KERNEL_LINK_ADDRESS  ((vm_address_t) 0xFFFFFFF007004000)
+#else
+#error architecture not supported
+#endif
+
 #endif /* MACH_KERNEL_PRIVATE */
 #endif /* KERNEL */
 
diff --git a/osfmk/mach/branch_predicates.h b/osfmk/mach/branch_predicates.h
deleted file mode 100644 (file)
index a551970..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-
-#ifndef        _MACH_BRANCH_PREDICATES_H
-#define        _MACH_BRANCH_PREDICATES_H
-
-#define        __probable(x)   __builtin_expect(!!((long)(x)), 1L)
-
-#define        __improbable(x) __builtin_expect(!!((long)(x)), 0L)
-
-#endif /* _MACH_BRANCH_PREDICATES_H */
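osfmk/mach/branch_predicates.h is deleted outright; its remaining users presumably pick the hints up from another header. For reference, the deleted macros are thin wrappers over __builtin_expect, as in this small self-contained example:

#include <stdio.h>

/* The same definitions the deleted header carried. */
#define __probable(x)   __builtin_expect(!!((long)(x)), 1L)
#define __improbable(x) __builtin_expect(!!((long)(x)), 0L)

static long
checked_double(long v)
{
    if (__improbable(v < 0)) {   /* error path, hinted as unlikely */
        return -1;
    }
    return v * 2;                /* hot path, laid out as fall-through */
}

int main(void)
{
    printf("%ld\n", checked_double(21));
    return 0;
}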
index c412d319265bfb05f09a350f1f1c7388948886b4..734e7408fea00b96523a9348aa38195ef823a09e 100644 (file)
  */
 
 subsystem
+#if    KERNEL_SERVER
+         KernelServer
+#endif /* KERNEL_SERVER */
+
 #if    KERNEL_USER
          KernelUser
 #endif
@@ -80,6 +84,14 @@ routine              exception_raise(
                        task            : mach_port_t;
                        exception       : exception_type_t;
                        code            : exception_data_t
+#if EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
                        );
 
 routine                exception_raise_state(
@@ -88,7 +100,16 @@ routine             exception_raise_state(
                        code            : exception_data_t, const;
                  inout flavor          : int;
                        old_state       : thread_state_t, const;
-                   out new_state       : thread_state_t);
+                   out new_state       : thread_state_t
+#if EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
+                       );
 
 routine                exception_raise_state_identity(
                        exception_port  : mach_port_t;
@@ -98,6 +119,15 @@ routine             exception_raise_state_identity(
                        code            : exception_data_t;
                  inout flavor          : int;
                        old_state       : thread_state_t;
-                   out new_state       : thread_state_t);
+                   out new_state       : thread_state_t
+#if EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
+                       );
 
 /* vim: set ft=c : */
index 9339fad376de14cfa58c8d621ddc65eba2a7b033..716c17960b01c1ef4b8c929585b306355e59db74 100644 (file)
@@ -99,7 +99,8 @@ typedef       integer_t       host_flavor_t;
 #define HOST_MACH_MSG_TRAP     8       /* Has mach_msg_trap */
 #define HOST_VM_PURGABLE       9       /* purg'e'able memory info */
 #define HOST_DEBUG_INFO_INTERNAL 10    /* Used for kernel internal development tests only */
-#define HOST_CAN_HAS_DEBUGGER  11 
+#define HOST_CAN_HAS_DEBUGGER  11
+#define HOST_PREFERRED_USER_ARCH 12    /* Get the preferred user-space architecture */
 
 #ifdef MACH_KERNEL_PRIVATE
 struct host_basic_info_old {
@@ -260,6 +261,16 @@ typedef struct host_cpu_load_info  *host_cpu_load_info_t;
 #define HOST_CPU_LOAD_INFO_COUNT ((mach_msg_type_number_t) \
                (sizeof (host_cpu_load_info_data_t) / sizeof (integer_t)))
 
+struct host_preferred_user_arch {
+       cpu_type_t      cpu_type;       /* Preferred user-space cpu type */
+       cpu_subtype_t   cpu_subtype;    /* Preferred user-space cpu subtype */
+};
+
+typedef struct host_preferred_user_arch        host_preferred_user_arch_data_t;
+typedef struct host_preferred_user_arch        *host_preferred_user_arch_t;
+#define HOST_PREFERRED_USER_ARCH_COUNT ((mach_msg_type_number_t) \
+               (sizeof(host_preferred_user_arch_data_t)/sizeof(integer_t)))
+
 #ifdef PRIVATE
 /*
  * CPU Statistics information
index 82ed8d003b292ff467072d038e5eb220bff3d29c..8c97b882efce748c088ed749f376540a766aefee 100644 (file)
@@ -69,6 +69,8 @@
  */
 #define HOST_SECURITY_PORT               0
 
+#define HOST_MIN_SPECIAL_PORT            HOST_SECURITY_PORT
+
 /*
  * Always provided by kernel (cannot be set from user-space).
  */
@@ -77,6 +79,8 @@
 #define HOST_IO_MASTER_PORT              3
 #define HOST_MAX_SPECIAL_KERNEL_PORT     7 /* room to grow */
 
+#define HOST_LAST_SPECIAL_KERNEL_PORT    HOST_IO_MASTER_PORT
+
 /*
  * Not provided by kernel
  */
 #define HOST_NODE_PORT                 (19 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_RESOURCE_NOTIFY_PORT      (20 + HOST_MAX_SPECIAL_KERNEL_PORT)
 #define HOST_CLOSURED_PORT             (21 + HOST_MAX_SPECIAL_KERNEL_PORT)
+#define HOST_SYSPOLICYD_PORT           (22 + HOST_MAX_SPECIAL_KERNEL_PORT)
 
-#define HOST_MAX_SPECIAL_PORT          HOST_CLOSURED_PORT
-                                        /* MAX = last since rdar://19421223 */
+#define HOST_MAX_SPECIAL_PORT          HOST_SYSPOLICYD_PORT
+                                        /* MAX = last since rdar://35861175 */
 
 /* obsolete name */
 #define HOST_CHUD_PORT HOST_LAUNCHCTL_PORT
 #define host_set_closured_port(host, port)     \
        (host_set_special_port((host), HOST_CLOSURED_PORT, (port)))
 
+#define host_get_syspolicyd_port(host, port)   \
+       (host_get_special_port((host),                          \
+       HOST_LOCAL_NODE, HOST_SYSPOLICYD_PORT, (port)))
+#define host_set_syspolicyd_port(host, port)   \
+       (host_set_special_port((host), HOST_SYSPOLICYD_PORT, (port)))
+
 /* HOST_RESOURCE_NOTIFY_PORT doesn't #define these conveniences.
    All lookups go through send_resource_violation()
  */
index 040472dfeaf86834bf974b66241b01c7355e2b2c..18a8fcab88a6d8a278fe4ab71199d68067384179 100644 (file)
 
 
 #define KERNEL_IMAGE_TO_PHYS(x) (x)
+#define VM_KERNEL_POINTER_SIGNIFICANT_BITS 39
 #define VM_MIN_KERNEL_ADDRESS          ((vm_offset_t) 0xFFFFFF8000000000UL)
 #define VM_MIN_KERNEL_PAGE             ((ppnum_t)0)
 #define VM_MIN_KERNEL_AND_KEXT_ADDRESS (VM_MIN_KERNEL_ADDRESS - 0x80000000ULL)
 #define KEXT_ALLOC_BASE(x)  ((x) - KEXT_ALLOC_MAX_OFFSET)
 #define KEXT_ALLOC_SIZE(x)  (KEXT_ALLOC_MAX_OFFSET - (x))
 
+#define VM_KERNEL_STRIP_PTR(_v) (_v)
+
 #define VM_KERNEL_ADDRESS(va)  ((((vm_address_t)(va))>=VM_MIN_KERNEL_AND_KEXT_ADDRESS) && \
                                (((vm_address_t)(va))<=VM_MAX_KERNEL_ADDRESS))
 
  * stack object. */
 # define INTSTACK_SIZE (I386_PGBYTES*4*4)
 # define KERNEL_STACK_SIZE (I386_PGBYTES*4*4)
+#elif DEBUG
+# define INTSTACK_SIZE (I386_PGBYTES*4)
+# define KERNEL_STACK_SIZE (I386_PGBYTES*6)
 #else
 # define INTSTACK_SIZE (I386_PGBYTES*4)
 # define KERNEL_STACK_SIZE (I386_PGBYTES*4)
        MACRO_END
 
 #define IS_USERADDR64_CANONICAL(addr)                  \
-       ((addr) < (VM_MAX_USER_PAGE_ADDRESS + PAGE_SIZE))
+       ((addr) < (VM_MAX_USER_PAGE_ADDRESS))
 
 #endif /* MACH_KERNEL_PRIVATE */
 
index 99449b7bf37339e19c12fee709875dcd9e147504..41224659243378033de2fc6be0519a89b027c668 100644 (file)
@@ -175,6 +175,7 @@ extern void kmod_panic_dump(vm_offset_t * addr, unsigned int dump_cnt);
  * flag overrides system mode in dtrace_modload().
  */
 #define KMOD_DTRACE_FORCE_INIT 0x01
+#define KMOD_DTRACE_STATIC_KEXT        0x02
 #endif /* CONFIG_DTRACE */
 
 #endif    /* KERNEL_PRIVATE */
index a2a7669dab333bee791eb5c4b9b51bc9855487d4..5ce6427bc7b09c2ca5d149b44f58bc5367fd1849 100644 (file)
@@ -64,6 +64,10 @@ subsystem
 #if    KERNEL_USER
          KernelUser
 #endif
+#if    KERNEL_SERVER
+         KernelServer
+#endif /* KERNEL_SERVER */
+
                     mach_exc 2405;
 
 #include <mach/std_types.defs>
@@ -80,6 +84,14 @@ routine              mach_exception_raise(
                        task            : mach_port_t;
                        exception       : exception_type_t;
                        code            : mach_exception_data_t
+#if MACH_EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if MACH_EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
                        );
 
 routine                mach_exception_raise_state(
@@ -88,7 +100,16 @@ routine             mach_exception_raise_state(
                        code            : mach_exception_data_t, const;
                  inout flavor          : int;
                        old_state       : thread_state_t, const;
-                   out new_state       : thread_state_t);
+                   out new_state       : thread_state_t
+#if MACH_EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if MACH_EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
+                       );
 
 routine                mach_exception_raise_state_identity(
                        exception_port  : mach_port_t;
@@ -98,6 +119,15 @@ routine             mach_exception_raise_state_identity(
                        code            : mach_exception_data_t;
                  inout flavor          : int;
                        old_state       : thread_state_t;
-                   out new_state       : thread_state_t);
+                   out new_state       : thread_state_t
+#if MACH_EXC_SERVER_SECTOKEN
+ ;
+ ServerSecToken stoken : security_token_t
+#endif
+#if MACH_EXC_SERVER_AUDITTOKEN
+ ;
+ ServerAuditToken atoken: audit_token_t
+#endif
+                       );
 
 /* vim: set ft=c : */
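The ServerSecToken/ServerAuditToken clauses added here (and to exc.defs above) make MIG append the sender's token to the server-side routine when the corresponding *_SERVER_SECTOKEN / *_SERVER_AUDITTOKEN macro is defined. The sketch below shows the rough handler shape that implies; the exact generated signature is an assumption, not something this commit shows.

#include <mach/mach.h>
#include <mach/exception_types.h>

/* Hypothetical server routine generated for mach_exception_raise when
 * MACH_EXC_SERVER_AUDITTOKEN is defined; the trailing audit_token_t is the
 * kernel-supplied identity of the sender. */
kern_return_t
catch_mach_exception_raise(
        mach_port_t             exception_port,
        mach_port_t             thread,
        mach_port_t             task,
        exception_type_t        exception,
        mach_exception_data_t   code,
        mach_msg_type_number_t  codeCnt,
        audit_token_t           atoken);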
index 5ca0e125e30c1138e9f11182dd909d07776e4f2c..83d485388596f539b7ca2a603eaa67b72fe8188b 100644 (file)
@@ -375,4 +375,31 @@ routine mach_zone_info_for_largest_zone(
 skip;
 #endif
 
+#ifdef PRIVATE
+/*
+ * Returns names of zones that have zlog enabled.
+ */
+routine mach_zone_get_zlog_zones(
+               host            : host_priv_t;
+       out     names           : mach_zone_name_array_t,
+                                       Dealloc);
+#else
+skip;
+#endif
+
+#ifdef PRIVATE
+/*
+ * Returns BTLog records for a specific zone.
+ * The zone name is passed in via the argument name,
+ * recs returns an array of zone_btrecord_t's.
+ */
+routine mach_zone_get_btlog_records(
+               host            : host_priv_t;
+               name            : mach_zone_name_t;
+       out     recs            : zone_btrecord_array_t,
+                                       Dealloc);
+#else
+skip;
+#endif
+
 /* vim: set ft=c : */
index dcb43c76e3d47fffc9d9a41dfd1a4d1c4f0ac948..5bc503421dfbbd14c92f51293ed3f6c7e668d3ef 100644 (file)
@@ -611,4 +611,16 @@ routine mach_port_space_basic_info(
                task            : ipc_space_inspect_t;
        out     basic_info      : ipc_info_space_basic_t);
 
+#if KERNEL || !LIBSYSCALL_INTERFACE
+/*
+ *     Returns sync ipc turnstile link status
+ *     for special reply ports.
+ */
+routine mach_port_special_reply_port_reset_link(
+               task            : ipc_space_t;
+               name            : mach_port_name_t;
+       out     srp_lost_link   : boolean_t);
+#else
+skip;
+#endif
 /* vim: set ft=c : */
index 462d7972f1b8301e90cbebe8ef47c1e16d924559..9e712b9b11c1d021e4abc9c4e7ff3149f69ba877 100644 (file)
@@ -222,6 +222,14 @@ extern kern_return_t _kernelrpc_mach_port_insert_right_trap(
                                mach_msg_type_name_t polyPoly
 );
 
+extern kern_return_t _kernelrpc_mach_port_get_attributes_trap(
+                               mach_port_name_t target,
+                               mach_port_name_t name,
+                               mach_port_flavor_t flavor,
+                               mach_port_info_t port_info_out,
+                               mach_msg_type_number_t *port_info_outCnt
+);
+
 extern kern_return_t _kernelrpc_mach_port_insert_member_trap(
                                mach_port_name_t target,
                                mach_port_name_t name,
@@ -714,6 +722,16 @@ struct _kernelrpc_mach_port_insert_right_args {
 extern kern_return_t _kernelrpc_mach_port_insert_right_trap(
                                struct _kernelrpc_mach_port_insert_right_args *args);
 
+struct _kernelrpc_mach_port_get_attributes_args {
+       PAD_ARG_(mach_port_name_t, target);
+       PAD_ARG_(mach_port_name_t, name);
+       PAD_ARG_(mach_port_flavor_t, flavor);
+       PAD_ARG_(user_addr_t, info);
+       PAD_ARG_(user_addr_t, count);
+};
+extern kern_return_t _kernelrpc_mach_port_get_attributes_trap(
+                               struct _kernelrpc_mach_port_get_attributes_args *args);
+
 struct _kernelrpc_mach_port_insert_member_args {
        PAD_ARG_(mach_port_name_t, target);
        PAD_ARG_(mach_port_name_t, name);
index 87c1ba64f8612edecca450cd72a9de1c94209a2e..b7a5d36b6c24bd4ecea7686f414518fcd9489eec 100644 (file)
@@ -127,8 +127,8 @@ __END_DECLS
 /*
  * Capability bits used in the definition of cpu_type.
  */
-#define        CPU_ARCH_MASK   0xff000000              /* mask for architecture bits */
-#define CPU_ARCH_ABI64 0x01000000              /* 64 bit ABI */
+#define        CPU_ARCH_MASK           0xff000000      /* mask for architecture bits */
+#define CPU_ARCH_ABI64         0x01000000      /* 64 bit ABI */
 
 /*
  *     Machine types known by all.
@@ -151,7 +151,7 @@ __END_DECLS
 #define CPU_TYPE_MC98000       ((cpu_type_t) 10)
 #define CPU_TYPE_HPPA           ((cpu_type_t) 11)
 #define CPU_TYPE_ARM           ((cpu_type_t) 12)
-#define CPU_TYPE_ARM64          (CPU_TYPE_ARM | CPU_ARCH_ABI64)
+#define CPU_TYPE_ARM64         (CPU_TYPE_ARM | CPU_ARCH_ABI64)
 #define CPU_TYPE_MC88000       ((cpu_type_t) 13)
 #define CPU_TYPE_SPARC         ((cpu_type_t) 14)
 #define CPU_TYPE_I860          ((cpu_type_t) 15)
@@ -368,6 +368,7 @@ __END_DECLS
 #define CPU_SUBTYPE_ARM64_ALL           ((cpu_subtype_t) 0)
 #define CPU_SUBTYPE_ARM64_V8            ((cpu_subtype_t) 1)
 
+
 #endif /* !__ASSEMBLER__ */
 
 /*
@@ -407,6 +408,7 @@ __END_DECLS
 #define CPUFAMILY_ARM_TYPHOON          0x2c91a47e
 #define CPUFAMILY_ARM_TWISTER          0x92fb37c8
 #define CPUFAMILY_ARM_HURRICANE                0x67ceee93
+#define CPUFAMILY_ARM_MONSOON_MISTRAL  0xe81e7ef6
 
 /* The following synonyms are deprecated: */
 #define CPUFAMILY_INTEL_6_23   CPUFAMILY_INTEL_PENRYN
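CPUFAMILY_ARM_MONSOON_MISTRAL is the newly added family constant. User space usually discovers the running family through the hw.cpufamily sysctl and compares it against these values; a short sketch follows (the sysctl name is the long-standing one, but treat the check itself as illustrative).

#include <stdint.h>
#include <stdio.h>
#include <sys/sysctl.h>

int main(void)
{
    uint32_t family = 0;
    size_t len = sizeof(family);

    if (sysctlbyname("hw.cpufamily", &family, &len, NULL, 0) != 0) {
        perror("hw.cpufamily");
        return 1;
    }
    /* 0xe81e7ef6 is CPUFAMILY_ARM_MONSOON_MISTRAL from the header above. */
    printf("cpu family 0x%x%s\n", family,
           family == 0xe81e7ef6 ? " (Monsoon/Mistral)" : "");
    return 0;
}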
diff --git a/osfmk/mach/memory_entry.defs b/osfmk/mach/memory_entry.defs
new file mode 100644 (file)
index 0000000..07e8fa4
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+subsystem
+#if KERNEL_SERVER
+    KernelServer
+#endif /* KERNEL_SERVER */
+       memory_entry 4900;
+
+#if !KERNEL && !LIBSYSCALL_INTERFACE
+    UserPrefix _kernelrpc_;
+#endif
+
+#include <mach/std_types.defs>
+#include <mach/mach_types.defs>
+#include <mach_debug/mach_debug_types.defs>
+
+routine mach_memory_entry_purgable_control(
+               mem_entry       : mem_entry_name_port_t;
+               control         : vm_purgable_t;
+       inout   state           : int);
+
+routine mach_memory_entry_access_tracking(
+               mem_entry               : mem_entry_name_port_t;
+       inout   access_tracking         : int;
+       out     access_tracking_reads   : uint32_t;
+       out     access_tracking_writes  : uint32_t);
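
When this .defs file is processed by MIG, each routine becomes a C function in the user header plus a matching server-side handler in the kernel; with the UserPrefix above, the raw user stubs outside Libsyscall are named _kernelrpc_mach_memory_entry_*. A sketch of roughly what the generated user prototypes look like, assumed from standard MIG conventions (inout and out parameters become pointers):

    #include <mach/mach_types.h>

    /* approximate MIG-generated user prototypes for the routines above */
    kern_return_t mach_memory_entry_purgable_control(
            mem_entry_name_port_t mem_entry,
            vm_purgable_t         control,
            int                   *state);                   /* inout */

    kern_return_t mach_memory_entry_access_tracking(
            mem_entry_name_port_t mem_entry,
            int                   *access_tracking,          /* inout */
            uint32_t              *access_tracking_reads,    /* out */
            uint32_t              *access_tracking_writes);  /* out */
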
index 13481e1bdfa09dc7c73ee44dd94cd0e0898ef043..edc86250353236cf9fe86673fbb4c33bb1dbaa57 100644 (file)
@@ -721,7 +721,9 @@ typedef integer_t mach_msg_option_t;
 #define MACH_SEND_NODENAP      MACH_SEND_NOIMPORTANCE
 #define MACH_SEND_IMPORTANCE   0x00080000      /* msg carries importance - kernel only */
 #define MACH_SEND_SYNC_OVERRIDE        0x00100000      /* msg should do sync ipc override */
-
+#define MACH_SEND_PROPAGATE_QOS  0x00200000    /* IPC should propagate the caller's QoS */
+#define MACH_SEND_SYNC_USE_THRPRI      MACH_SEND_PROPAGATE_QOS /* obsolete name */
+#define MACH_SEND_KERNEL    0x00400000  /* full send from kernel space - kernel only */
 
 #define MACH_RCV_TIMEOUT       0x00000100      /* timeout value applies to receive */  
 #define MACH_RCV_NOTIFY                0x00000200      /* reserved - legacy */
@@ -740,7 +742,7 @@ typedef integer_t mach_msg_option_t;
  * If more than one thread attempts to MACH_PEEK_MSG on a port or set, one of
  * the threads may miss messages (in fact, it may never wake up).
  */
-#define MACH_PEEK_MSG          0x00100000      /* receive, but leave msgs queued */
+#define MACH_PEEK_MSG          0x80000000      /* receive, but leave msgs queued */
 
 #endif
 
@@ -772,7 +774,7 @@ typedef integer_t mach_msg_option_t;
 #define MACH_SEND_USER (MACH_SEND_MSG | MACH_SEND_TIMEOUT | \
                                                MACH_SEND_NOTIFY | MACH_SEND_OVERRIDE | \
                                                MACH_SEND_TRAILER | MACH_SEND_NOIMPORTANCE | \
-                                               MACH_SEND_SYNC_OVERRIDE)
+                                               MACH_SEND_SYNC_OVERRIDE | MACH_SEND_PROPAGATE_QOS)
 
 #define MACH_RCV_USER (MACH_RCV_MSG | MACH_RCV_TIMEOUT | \
                                           MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY | \
@@ -1016,8 +1018,6 @@ extern kern_return_t              mach_voucher_deallocate(
 
 extern mach_msg_return_t       mach_msg_receive_results(mach_msg_size_t *size);
 
-extern mach_msg_priority_t mach_msg_priority_combine(mach_msg_priority_t msg_qos,
-                                                     mach_msg_priority_t recv_qos);
 #endif /* KERNEL */
 
 __END_DECLS
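
MACH_SEND_PROPAGATE_QOS is also added to the MACH_SEND_USER mask above, so user code may pass it straight to mach_msg(); per the flag's comment, the kernel then propagates the caller's QoS with the message instead of requiring an explicit override. A minimal send-only sketch (port setup omitted; dest is assumed to hold a valid send right):

    #include <mach/mach.h>

    mach_msg_return_t send_with_qos(mach_port_t dest)
    {
        mach_msg_header_t msg = {
            .msgh_bits        = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0),
            .msgh_size        = sizeof(msg),
            .msgh_remote_port = dest,
            .msgh_local_port  = MACH_PORT_NULL,
            .msgh_id          = 0x1234,
        };
        return mach_msg(&msg, MACH_SEND_MSG | MACH_SEND_PROPAGATE_QOS,
                        sizeof(msg), 0, MACH_PORT_NULL,
                        MACH_MSG_TIMEOUT_NONE, MACH_PORT_NULL);
    }
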
index 30b55a1b09f0adad7b3ecf1507d5a771eaadd298..763e6f94f58699246c986069b9919aba93802282 100644 (file)
@@ -405,11 +405,25 @@ typedef mach_port_options_t *mach_port_options_ptr_t;
 
 /* Reasons for exception for a guarded mach port */
 enum mach_port_guard_exception_codes {
-       kGUARD_EXC_DESTROY              = 1u << 0,
-       kGUARD_EXC_MOD_REFS             = 1u << 1,
-       kGUARD_EXC_SET_CONTEXT          = 1u << 2,
-       kGUARD_EXC_UNGUARDED            = 1u << 3,
-       kGUARD_EXC_INCORRECT_GUARD      = 1u << 4
+       kGUARD_EXC_DESTROY                       = 1u << 0,
+       kGUARD_EXC_MOD_REFS                      = 1u << 1,
+       kGUARD_EXC_SET_CONTEXT               = 1u << 2,
+       kGUARD_EXC_UNGUARDED                 = 1u << 3,
+       kGUARD_EXC_INCORRECT_GUARD           = 1u << 4,
+        /* start of non-fatal guards */
+       kGUARD_EXC_INVALID_RIGHT         = 1u << 8,
+       kGUARD_EXC_INVALID_NAME          = 1u << 9,
+       kGUARD_EXC_INVALID_VALUE         = 1u << 10,
+       kGUARD_EXC_INVALID_ARGUMENT      = 1u << 11,
+       kGUARD_EXC_RIGHT_EXISTS          = 1u << 12,
+       kGUARD_EXC_KERN_NO_SPACE         = 1u << 13,
+       kGUARD_EXC_KERN_FAILURE          = 1u << 14,
+       kGUARD_EXC_KERN_RESOURCE         = 1u << 15,
+       kGUARD_EXC_SEND_INVALID_REPLY    = 1u << 16,
+       kGUARD_EXC_SEND_INVALID_VOUCHER  = 1u << 16,
+       kGUARD_EXC_SEND_INVALID_RIGHT    = 1u << 17,
+       kGUARD_EXC_RCV_INVALID_NAME      = 1u << 18,
+       kGUARD_EXC_RCV_INVALID_NOTIFY    = 1u << 19
 };
 
 #if    !__DARWIN_UNIX03 && !defined(_NO_PORT_T_FROM_MACH)
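
The comment splits the codes at bit 8: bits 0 through 4 remain the original fatal guard exceptions, while bits 8 and up are the new non-fatal codes raised for soft port-right errors. A small helper sketch built only on the constants above (illustrative, not an existing kernel API):

    #include <mach/mach.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* fatal guard exceptions occupy bits 0-4; non-fatal ones start at bit 8 */
    static bool
    mach_port_guard_code_is_fatal(uint64_t code)
    {
        return (code & (kGUARD_EXC_DESTROY | kGUARD_EXC_MOD_REFS |
                        kGUARD_EXC_SET_CONTEXT | kGUARD_EXC_UNGUARDED |
                        kGUARD_EXC_INCORRECT_GUARD)) != 0;
    }
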
index 19351b07f302fbe54b58570ff060b4b2450e10b2..d5799dc2eb06524b09533c0d6a62646f7ec389ad 100644 (file)
 #define SHARED_REGION_NESTING_MIN_ARM          ?
 #define SHARED_REGION_NESTING_MAX_ARM          ?
 
+
 #ifdef XNU_KERNEL_PRIVATE
 /* ARM64_TODO: move to higher memory */
 #endif
 #define SHARED_REGION_BASE_ARM64               0x180000000ULL
-#define SHARED_REGION_SIZE_ARM64               0x40000000ULL
+#define SHARED_REGION_SIZE_ARM64               0x100000000ULL
 #define SHARED_REGION_NESTING_BASE_ARM64       0x180000000ULL
-#define SHARED_REGION_NESTING_SIZE_ARM64       0x40000000ULL
+#define SHARED_REGION_NESTING_SIZE_ARM64       0x100000000ULL
 #define SHARED_REGION_NESTING_MIN_ARM64                ?
 #define SHARED_REGION_NESTING_MAX_ARM64                ?
 
 #define SHARED_REGION_NESTING_SIZE             SHARED_REGION_NESTING_SIZE_ARM
 #define SHARED_REGION_NESTING_MIN              SHARED_REGION_NESTING_MIN_ARM
 #define SHARED_REGION_NESTING_MAX              SHARED_REGION_NESTING_MAX_ARM
-#elif defined(__arm64__)
+#elif defined(__arm64__) && defined(__LP64__)
 #define SHARED_REGION_BASE                     SHARED_REGION_BASE_ARM64
 #define SHARED_REGION_SIZE                     SHARED_REGION_SIZE_ARM64
 #define SHARED_REGION_NESTING_BASE             SHARED_REGION_NESTING_BASE_ARM64
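
The arm64 shared region (and its nesting size) grows from 0x40000000 (1 GiB) to 0x100000000 (4 GiB), so the region now spans 0x180000000 through 0x280000000; the selection above also becomes conditional on __LP64__, presumably so non-LP64 arm64 variants fall through to the 32-bit ARM values. A compile-time sanity check sketch of the new bounds:

    #include <assert.h>

    #define SHARED_REGION_BASE_ARM64 0x180000000ULL
    #define SHARED_REGION_SIZE_ARM64 0x100000000ULL   /* 4 GiB, previously 1 GiB */

    /* 0x180000000 + 0x100000000 == 0x280000000 */
    static_assert(SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 == 0x280000000ULL,
                  "arm64 shared region ends at 0x280000000");
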
index 11277d0da7fac242cf166306d7a7c40a135fae78..239d11baf02fc50a8b96e047b33b817d86e76150 100644 (file)
@@ -49,8 +49,15 @@ typedef int sync_policy_t;
  *     These options provide addition (kernel-private) behaviors
  */
 
-#define SYNC_POLICY_PREPOST            0x4
-#define SYNC_POLICY_DISABLE_IRQ                0x8
+#define SYNC_POLICY_PREPOST             0x4
+#define SYNC_POLICY_DISABLE_IRQ         0x8
+
+/*
+ * If the waitq is IRQ safe, 0x10 suggests it's a waitq embedded in turnstile.
+ * If the waitq is not IRQ safe, 0x10 suggests it's a waitq of a port and should use it's turnstile safeq.
+ */
+#define SYNC_POLICY_TURNSTILE           0x10
+#define SYNC_POLICY_PORT                0x10
 
 #endif /* KERNEL_PRIVATE */
 
index 381bfc510b0b16d62c5de0def8a4ed798cd8e55a..b8c1c4d3224f5f816bfec47fe2e3be05c85fc9d1 100644 (file)
@@ -114,6 +114,7 @@ kernel_trap(semaphore_wait_signal_trap,-37,2)
 kernel_trap(semaphore_timedwait_trap,-38,3)
 kernel_trap(semaphore_timedwait_signal_trap,-39,4)
 
+kernel_trap(_kernelrpc_mach_port_get_attributes_trap,-40,5)
 kernel_trap(_kernelrpc_mach_port_guard_trap,-41,5)
 kernel_trap(_kernelrpc_mach_port_unguard_trap,-42,4)
 kernel_trap(mach_generate_activity_id, -43, 3)
index 58e4e75549b58122e9911ff68db44f3350e550fa..62824b760ed816f8632add7a547ad794fa9f5de6 100644 (file)
@@ -479,7 +479,8 @@ typedef struct task_flags_info * task_flags_info_t;
 #define TASK_FLAGS_INFO_COUNT  ((mach_msg_type_number_t) \
                (sizeof(task_flags_info_data_t) / sizeof (natural_t)))
 
-#define TF_LP64                 0x00000001                              /* task has 64-bit addressing */
+#define TF_LP64                0x00000001 /* task has 64-bit addressing */
+#define TF_64B_DATA    0x00000002 /* task has 64-bit data registers */
 
 #define TASK_DEBUG_INFO_INTERNAL    29 /* Used for kernel internal development tests. */
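
TF_64B_DATA marks tasks whose threads use 64-bit data registers, as distinct from TF_LP64's full 64-bit addressing. A user-space sketch querying both bits through the TASK_FLAGS_INFO flavor (assuming task_flags_info_data_t exposes a flags field, per this header):

    #include <mach/mach.h>
    #include <stdio.h>

    int main(void)
    {
        task_flags_info_data_t info;
        mach_msg_type_number_t count = TASK_FLAGS_INFO_COUNT;
        kern_return_t kr = task_info(mach_task_self(), TASK_FLAGS_INFO,
                                     (task_info_t)&info, &count);
        if (kr == KERN_SUCCESS) {
            printf("64-bit addressing: %s, 64-bit data: %s\n",
                   (info.flags & TF_LP64)     ? "yes" : "no",
                   (info.flags & TF_64B_DATA) ? "yes" : "no");
        }
        return 0;
    }
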
 
index 9ad6d0798282c6ba5b2c9be2ea06fe7f9eb543c6..1c58d60675d02c46e5f01007bd0830096df7e843 100644 (file)
@@ -112,15 +112,16 @@ kern_return_t     task_policy_get(
 
 
 enum task_role {
-       TASK_RENICED = -1,
-       TASK_UNSPECIFIED = 0,
-       TASK_FOREGROUND_APPLICATION,
-       TASK_BACKGROUND_APPLICATION,
-       TASK_CONTROL_APPLICATION,
-       TASK_GRAPHICS_SERVER,
-       TASK_THROTTLE_APPLICATION,
-       TASK_NONUI_APPLICATION,
-       TASK_DEFAULT_APPLICATION
+       TASK_RENICED                    = -1,
+       TASK_UNSPECIFIED                = 0,
+       TASK_FOREGROUND_APPLICATION     = 1,
+       TASK_BACKGROUND_APPLICATION     = 2,
+       TASK_CONTROL_APPLICATION        = 3,
+       TASK_GRAPHICS_SERVER            = 4,
+       TASK_THROTTLE_APPLICATION       = 5,
+       TASK_NONUI_APPLICATION          = 6,
+       TASK_DEFAULT_APPLICATION        = 7,
+       TASK_DARWINBG_APPLICATION       = 8,
 };
 
 typedef integer_t      task_role_t;
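
The roles now carry explicit values and gain TASK_DARWINBG_APPLICATION (8), which is why the trp_role/tep_role bitfields below widen from 3 to 4 bits. Role assignment still goes through the TASK_CATEGORY_POLICY flavor; a minimal sketch:

    #include <mach/mach.h>
    #include <mach/task_policy.h>

    kern_return_t make_background_app(task_t task)
    {
        task_category_policy_data_t policy = { .role = TASK_BACKGROUND_APPLICATION };
        return task_policy_set(task, TASK_CATEGORY_POLICY,
                               (task_policy_t)&policy, TASK_CATEGORY_POLICY_COUNT);
    }
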
@@ -193,7 +194,7 @@ typedef struct task_qos_policy *task_qos_policy_t;
  * When they do, we will update TASK_POLICY_INTERNAL_STRUCT_VERSION.
  */
 
-#define TASK_POLICY_INTERNAL_STRUCT_VERSION 1
+#define TASK_POLICY_INTERNAL_STRUCT_VERSION 2
 
 struct task_requested_policy {
        uint64_t        trp_int_darwinbg        :1,     /* marked as darwinbg via setpriority */
@@ -209,7 +210,7 @@ struct task_requested_policy {
 
                        trp_apptype             :3,     /* What apptype did launchd tell us this was (inherited) */
                        trp_boosted             :1,     /* Has a non-zero importance assertion count */
-                       trp_role                :3,     /* task's system role */
+                       trp_role                :4,     /* task's system role */
                        trp_tal_enabled         :1,     /* TAL mode is enabled */
                        trp_over_latency_qos    :3,     /* Timer latency QoS override */
                        trp_over_through_qos    :3,     /* Computation throughput QoS override */
@@ -225,7 +226,7 @@ struct task_requested_policy {
                        trp_sup_cpu             :1,     /* Wants suppressed CPU priority (MAXPRI_SUPPRESSED) */
                        trp_sup_bg_sockets      :1,     /* Wants background sockets */
 
-                       trp_reserved            :18;
+                       trp_reserved            :17;
 };
 
 struct task_effective_policy {
@@ -244,14 +245,14 @@ struct task_effective_policy {
                        tep_tal_engaged         :1,     /* TAL mode is in effect */
                        tep_watchers_bg         :1,     /* watchers are BG-ed */
                        tep_sup_active          :1,     /* suppression behaviors are in effect */
-                       tep_role                :3,     /* task's system role */
+                       tep_role                :4,     /* task's system role */
                        tep_suppressed_cpu      :1,     /* cpu priority == MAXPRI_SUPPRESSED (trumped by lowpri_cpu) */
                        tep_sfi_managed         :1,     /* SFI Managed task */
                        tep_live_donor          :1,     /* task is a live importance boost donor */
                        tep_qos_clamp           :3,     /* task qos clamp (applies to qos-disabled threads too) */
                        tep_qos_ceiling         :3,     /* task qos ceiling (applies to only qos-participating threads) */
 
-                       tep_reserved            :32;
+                       tep_reserved            :31;
 };
 
 #endif /* PRIVATE */
index 66fd7ed051c84b3d6d6977e9828ca347caad1d5a..9080a451e42392cc41a59bdc6d6a9013fd685d82 100644 (file)
@@ -92,6 +92,8 @@ typedef       int     task_special_port_t;
 
 #define TASK_RESOURCE_NOTIFY_PORT   11 /* overrides host special RN port */
 
+#define TASK_MAX_SPECIAL_PORT TASK_RESOURCE_NOTIFY_PORT
+
 /*
  *     Definitions for ease of use
  */
index 716026ac5c9a580f9800cf990c3e9c6a3f7d56e0..205fff541d69511aac32e627677e8f9192c2f903 100644 (file)
@@ -96,7 +96,12 @@ routine thread_terminate(
  *     may be stale.  [Flavor THREAD_STATE_FLAVOR_LIST provides a
  *     list of valid flavors for the target thread.]
  */
-routine act_get_state(
+routine
+#ifdef KERNEL_SERVER
+act_get_state_to_user(
+#else
+act_get_state(
+#endif
                target_act      : thread_act_t;
                flavor          : int;
        out     old_state       : thread_state_t, CountInOut);
@@ -125,7 +130,12 @@ act_set_state(
  *     may be stale.  [Flavor THREAD_STATE_FLAVOR_LIST provides a
  *     list of valid flavors for the target thr_act.]
  */
-routine thread_get_state(
+routine
+#ifdef KERNEL_SERVER
+thread_get_state_to_user(
+#else
+thread_get_state(
+#endif
                target_act      : thread_act_t;
                flavor          : thread_state_flavor_t;
        out     old_state       : thread_state_t, CountInOut);
@@ -191,7 +201,12 @@ routine thread_abort_safely(
                target_act      : thread_act_t);
 
 
-routine thread_depress_abort(
+routine
+#ifdef KERNEL_SERVER
+thread_depress_abort_from_user(
+#else
+thread_depress_abort(
+#endif
                thread          : thread_act_t);
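
The KERNEL_SERVER conditionals rename only the kernel-side implementation: user callers keep calling act_get_state(), thread_get_state(), and thread_depress_abort(), while the MIG server stubs now dispatch to *_to_user / *_from_user entry points inside the kernel, presumably so the kernel can post-process the state before it is copied out. Roughly the resulting split for thread_get_state, assuming standard MIG codegen for a CountInOut array:

    #include <mach/mach.h>

    /* user-facing prototype, unchanged (from the !KERNEL_SERVER branch) */
    kern_return_t thread_get_state(thread_act_t target_act,
                                   thread_state_flavor_t flavor,
                                   thread_state_t old_state,            /* out array */
                                   mach_msg_type_number_t *old_stateCnt);

    /* kernel-side handler the server stub now calls (KERNEL_SERVER branch) */
    kern_return_t thread_get_state_to_user(thread_act_t target_act,
                                           thread_state_flavor_t flavor,
                                           thread_state_t old_state,
                                           mach_msg_type_number_t *old_stateCnt);
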
 
 
index 915425333e23ffcd89a2ce21ae34cbe888d7a5a9..626e0d3f1d8417aeeb069676124d73e53b42d766 100644 (file)
@@ -295,6 +295,7 @@ typedef struct thread_policy_state          *thread_policy_state_t;
 #define THREAD_QOS_POLICY               9
 #define THREAD_QOS_POLICY_OVERRIDE      10
 
+typedef uint8_t thread_qos_t;
 #define THREAD_QOS_UNSPECIFIED          0
 #define THREAD_QOS_DEFAULT              THREAD_QOS_UNSPECIFIED  /* Temporary rename */
 #define THREAD_QOS_MAINTENANCE          1
@@ -336,18 +337,6 @@ typedef struct thread_policy_state         *thread_policy_state_t;
  * either be a memory allocation in userspace, or the pthread_t of the
  * overrider if no allocation was used.
  *
- * THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE are used to
- * override the QoS of a thread currently draining a serial dispatch
- * queue, so that it can get to a block of higher QoS than its
- * predecessors. The override is applied by a thread enqueueing work
- * with resource=&queue, and reset by the thread that was overriden
- * once it has drained the queue. Since the ++ and reset are
- * asynchronous, there is the possibility of a ++ after the target
- * thread has issued a reset, in which case the workqueue thread may
- * issue a reset-all in its outermost scope before deciding whether it
- * should return to dequeueing work from the global concurrent queues,
- * or return to the kernel.
- *
  * THREAD_QOS_OVERRIDE_TYPE_WILDCARD is a catch-all which will reset every
  * resource matching the resource value.  Passing
  * THREAD_QOS_OVERRIDE_RESOURCE_WILDCARD as well will reset everything.
@@ -357,7 +346,6 @@ typedef struct thread_policy_state          *thread_policy_state_t;
 #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_MUTEX                  (1)
 #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_RWLOCK                 (2)
 #define THREAD_QOS_OVERRIDE_TYPE_PTHREAD_EXPLICIT_OVERRIDE      (3)
-#define THREAD_QOS_OVERRIDE_TYPE_DISPATCH_ASYNCHRONOUS_OVERRIDE (4)
 #define THREAD_QOS_OVERRIDE_TYPE_WILDCARD                       (5)
 
 /* A special resource value to indicate a resource wildcard */
@@ -385,7 +373,7 @@ typedef struct thread_qos_policy      *thread_qos_policy_t;
  * When they do, we will update THREAD_POLICY_INTERNAL_STRUCT_VERSION.
  */
 
-#define THREAD_POLICY_INTERNAL_STRUCT_VERSION 4
+#define THREAD_POLICY_INTERNAL_STRUCT_VERSION 5
 
 struct thread_requested_policy {
        uint64_t        thrp_int_darwinbg       :1,     /* marked as darwinbg via setpriority */
@@ -404,9 +392,10 @@ struct thread_requested_policy {
                        thrp_qos_promote        :3,     /* thread qos class from promotion */
                        thrp_qos_ipc_override   :3,     /* thread qos class from ipc override */
                        thrp_terminated         :1,     /* heading for termination */
-                       thrp_qos_sync_ipc_override:3,   /* thread qos class from sync ipc override */
+                       thrp_qos_sync_ipc_override:3,   /* now unused */
+                       thrp_qos_workq_override :3,     /* thread qos class override (workq) */
 
-                       thrp_reserved           :29;
+                       thrp_reserved           :26;
 };
 
 struct thread_effective_policy {
index 96bd1f445072699287d89d820a1648ddfc12bcba..f6709419adcf6ec93ddda55ceb5e2c6b1016423f 100644 (file)
@@ -269,12 +269,14 @@ extern vm_offset_t                vm_kernel_base;
 extern vm_offset_t             vm_kernel_top;
 extern vm_offset_t             vm_hib_base;
 
-#define VM_KERNEL_IS_SLID(_o)                                                 \
-               (((vm_offset_t)(_o) >= vm_kernel_slid_base) &&                 \
-                ((vm_offset_t)(_o) <  vm_kernel_slid_top))
+extern vm_offset_t             vm_kernel_builtinkmod_text;
+extern vm_offset_t             vm_kernel_builtinkmod_text_end;
 
-#define VM_KERNEL_SLIDE(_u)                                                   \
-               ((vm_offset_t)(_u) + vm_kernel_slide)
+#define VM_KERNEL_IS_SLID(_o)                                            \
+       (((vm_offset_t)VM_KERNEL_STRIP_PTR(_o) >= vm_kernel_slid_base) && \
+        ((vm_offset_t)VM_KERNEL_STRIP_PTR(_o) <  vm_kernel_slid_top))
+
+#define VM_KERNEL_SLIDE(_u) ((vm_offset_t)(_u) + vm_kernel_slide)
 
 /*
  * The following macros are to be used when exposing kernel addresses to
@@ -319,20 +321,20 @@ __BEGIN_DECLS
 extern vm_offset_t vm_kernel_addrhash(vm_offset_t addr);
 __END_DECLS
 
-#define __DO_UNSLIDE(_v) ((vm_offset_t)(_v) - vm_kernel_slide)
+#define __DO_UNSLIDE(_v) ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) - vm_kernel_slide)
 
 #if DEBUG || DEVELOPMENT
-# define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)(_v))
+#define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)VM_KERNEL_STRIP_PTR(_v))
 #else
-# define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)0)
-#endif
+#define VM_KERNEL_ADDRHIDE(_v) (VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : (vm_address_t)0)
+#endif /* DEBUG || DEVELOPMENT */
 
 #define VM_KERNEL_ADDRHASH(_v) vm_kernel_addrhash((vm_offset_t)(_v))
 
 #define VM_KERNEL_UNSLIDE_OR_PERM(_v) ({ \
                VM_KERNEL_IS_SLID(_v) ? __DO_UNSLIDE(_v) : \
-               VM_KERNEL_ADDRESS(_v) ? ((vm_offset_t)(_v) + vm_kernel_addrperm) : \
-               (vm_offset_t)(_v); \
+               VM_KERNEL_ADDRESS(_v) ? ((vm_offset_t)VM_KERNEL_STRIP_PTR(_v) + vm_kernel_addrperm) : \
+               (vm_offset_t)VM_KERNEL_STRIP_PTR(_v); \
        })
 
 #define VM_KERNEL_UNSLIDE(_v) ({ \
index 04fafe640077e5f1a7230ff5dccede0b6926c5d5..e4552b60fa415890718e459b1f7a43cb3348de03 100644 (file)
@@ -350,6 +350,15 @@ typedef struct pmap_statistics     *pmap_statistics_t;
 #define VM_FLAGS_SUPERPAGE_SIZE_2MB (SUPERPAGE_SIZE_2MB<<VM_FLAGS_SUPERPAGE_SHIFT)
 #endif
 
+/*
+ * EXC_GUARD definitions for virtual memory.
+ */
+#define GUARD_TYPE_VIRT_MEMORY 0x5
+
+/* Reasons for exception for virtual memory */
+enum virtual_memory_guard_exception_codes {
+       kGUARD_EXC_DEALLOC_GAP  = 1u << 0
+};
 
 #ifdef KERNEL_PRIVATE
 typedef struct {
@@ -368,7 +377,9 @@ typedef struct {
                vmkf_fourk:1,
                vmkf_overwrite_immutable:1,
                vmkf_remap_prot_copy:1,
-               __vmkf_unused:18;
+               vmkf_cs_enforcement_override:1,
+               vmkf_cs_enforcement:1,
+               __vmkf_unused:16;
 } vm_map_kernel_flags_t;
 #define VM_MAP_KERNEL_FLAGS_NONE (vm_map_kernel_flags_t) {             \
        .vmkf_atomic_entry = 0, /* keep entry atomic (no coalescing) */ \
@@ -385,6 +396,8 @@ typedef struct {
        .vmkf_fourk = 0,        /* use fourk pager */                   \
        .vmkf_overwrite_immutable = 0,  /* can overwrite immutable mappings */ \
        .vmkf_remap_prot_copy = 0, /* vm_remap for VM_PROT_COPY */      \
+       .vmkf_cs_enforcement_override = 0, /* override CS_ENFORCEMENT */ \
+       .vmkf_cs_enforcement = 0,  /* new value for CS_ENFORCEMENT */   \
        .__vmkf_unused = 0                                              \
 }
 #endif /* KERNEL_PRIVATE */
@@ -538,6 +551,23 @@ typedef struct {
 
 #define VM_MEMORY_VIDEOBITSTREAM 91
 
+/* memory allocated by CoreMedia */
+#define VM_MEMORY_CM_XPC 92
+
+#define VM_MEMORY_CM_RPC 93
+
+#define VM_MEMORY_CM_MEMORYPOOL 94
+
+#define VM_MEMORY_CM_READCACHE 95
+
+#define VM_MEMORY_CM_CRABS 96
+
+/* memory allocated for QuickLookThumbnailing */
+#define VM_MEMORY_QUICKLOOK_THUMBNAILS 97
+
+/* memory allocated by Accounts framework */
+#define VM_MEMORY_ACCOUNTS 98
+
 /* Reserve 240-255 for application */
 #define VM_MEMORY_APPLICATION_SPECIFIC_1 240
 #define VM_MEMORY_APPLICATION_SPECIFIC_16 255
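
These tags let tools such as vmmap attribute anonymous memory to the owning subsystem. User code passes a tag through VM_MAKE_TAG(); a sketch using the new Accounts tag (illustrative only, in practice the framework itself applies its tag):

    #include <sys/mman.h>
    #include <mach/vm_statistics.h>
    #include <stddef.h>

    void *alloc_tagged(size_t size)
    {
        /* for MAP_ANON mappings the fd argument carries the VM tag */
        return mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_ANON | MAP_PRIVATE,
                    VM_MAKE_TAG(VM_MEMORY_ACCOUNTS), 0);
    }
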
index 75843897347e1d91cecfead9907143aa85df9603..9a5ad0ee4bd1284021d6540a84fcfe90e1136b36 100644 (file)
@@ -177,6 +177,9 @@ typedef struct vm_allocation_site vm_allocation_site_t;
        static vm_allocation_site_t site __attribute__((section("__DATA, __data"))) \
         = { .refcount = 2, .tag = (itag), .flags = (iflags) };
 
+extern int vmrtf_extract(uint64_t, boolean_t, int, void *, int *);
+extern unsigned int vmrtfaultinfo_bufsz(void);
+
 #endif /* XNU_KERNEL_PRIVATE */
 
 #ifdef  KERNEL_PRIVATE
@@ -208,5 +211,15 @@ typedef mach_port_t                vm_named_entry_t;
 
 #define UPL_NULL               ((upl_t) 0)
 #define VM_NAMED_ENTRY_NULL    ((vm_named_entry_t) 0)
-
+#ifdef PRIVATE
+typedef struct {
+       uint64_t rtfabstime; // mach_continuous_time at start of fault
+       uint64_t rtfduration; // fault service duration
+       uint64_t rtfaddr; // fault address
+       uint64_t rtfpc; // userspace program counter of thread incurring the fault
+       uint64_t rtftid; // thread ID
+       uint64_t rtfupid; // process identifier
+       uint64_t rtftype; // fault type
+} vm_rtfault_record_t;
+#endif
 #endif /* _MACH_VM_TYPES_H_ */
index be15db509c128609ef5e02093057358942ddbc88..d5cfc89c3ebac8b6693be91c82006fca6e0f84e4 100644 (file)
@@ -79,6 +79,9 @@ type mach_zone_info_array_t = array[] of mach_zone_info_t;
 type task_zone_info_t = struct[11] of uint64_t;                                /* deprecated */
 type task_zone_info_array_t = array[] of task_zone_info_t;     /* deprecated */
 
+type zone_btrecord_t = struct[16] of uint64_t;
+type zone_btrecord_array_t = array[] of zone_btrecord_t;
+
 type hash_info_bucket_t = struct[1] of natural_t;
 type hash_info_bucket_array_t = array[] of hash_info_bucket_t;
 
index 81bd7badf6f0ba8f1e22be9e738c56563b9bcc52..9544b454fc15aab5d66b24be070ac794a533e71f 100644 (file)
@@ -167,4 +167,35 @@ typedef struct mach_memory_info {
 
 typedef mach_memory_info_t *mach_memory_info_array_t;
 
+/*
+ * MAX_ZTRACE_DEPTH configures how deep of a stack trace is taken on each zalloc in the zone of interest.  15
+ * levels is usually enough to get past all the layers of code in kalloc and IOKit and see who the actual
+ * caller is up above these lower levels.
+ *
+ * This is used both for the zone leak detector and the zone corruption log. Make sure this isn't greater than
+ * BTLOG_MAX_DEPTH defined in btlog.h. Also make sure to update the definition of zone_btrecord_t in
+ * mach_debug_types.defs if this changes.
+ */
+
+#define MAX_ZTRACE_DEPTH       15
+
+/*
+ * Opcodes for the btlog operation field:
+ */
+
+#define ZOP_ALLOC      1
+#define ZOP_FREE       0
+
+/*
+ * Structure used to copy out btlog records to userspace, via the MIG call
+ * mach_zone_get_btlog_records().
+ */
+typedef struct zone_btrecord {
+       uint32_t    ref_count;                                  /* no. of active references on the record */
+       uint32_t        operation_type;                         /* operation type (alloc/free) */
+       uint64_t        bt[MAX_ZTRACE_DEPTH];           /* backtrace */
+} zone_btrecord_t;
+
+typedef zone_btrecord_t *zone_btrecord_array_t;
+
 #endif /* _MACH_DEBUG_ZONE_INFO_H_ */
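
Each record pairs a reference count and an alloc/free opcode with a fixed-depth backtrace. A short consumer sketch over an array already copied out by the mach_zone_get_btlog_records() MIG call mentioned above (its exact signature is not shown here; only the struct defined in this header is used):

    #include <mach_debug/zone_info.h>
    #include <stdio.h>

    static void dump_btrecords(const zone_btrecord_t *recs, unsigned int count)
    {
        for (unsigned int i = 0; i < count; i++) {
            printf("%s refs=%u\n",
                   recs[i].operation_type == ZOP_ALLOC ? "alloc" : "free ",
                   recs[i].ref_count);
            for (int d = 0; d < MAX_ZTRACE_DEPTH && recs[i].bt[d] != 0; d++)
                printf("  frame[%d] = 0x%llx\n", d, (unsigned long long)recs[i].bt[d]);
        }
    }
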
index b200f93632c9e63ce3328d2cb4b77109c991b482..c9aeda0eadd967812f6f1303cb97cb5ec2b731f4 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2015 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
@@ -22,7 +22,7 @@
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
 
 #include <stdatomic.h>
 
+#define _os_atomic_c11_atomic(p) \
+               ((typeof(*(p)) _Atomic *)(p))
+
+#define _os_atomic_basetypeof(p) \
+               typeof(atomic_load(((typeof(*(p)) _Atomic *)(p))))
+
+#define _os_atomic_c11_op_orig(p, v, m, o) \
+               atomic_##o##_explicit(_os_atomic_c11_atomic(p), v, \
+               memory_order_##m)
+
+#define _os_atomic_c11_op(p, v, m, o, op) \
+               ({ typeof(v) _v = (v); _os_atomic_c11_op_orig(p, v, m, o) op _v; })
+
+#define os_atomic_thread_fence(m)  atomic_thread_fence(memory_order_##m)
+
+#define os_atomic_load(p, m) \
+               atomic_load_explicit(_os_atomic_c11_atomic(p), memory_order_##m)
+#define os_atomic_store(p, v, m)    _os_atomic_c11_op_orig(p, v, m, store)
+
+#define os_atomic_add_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_add)
+#define os_atomic_add(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_add, +)
+
+#define os_atomic_inc_orig(p, m)    _os_atomic_c11_op_orig(p, 1, m, fetch_add)
+#define os_atomic_inc(p, m)         _os_atomic_c11_op(p, 1, m, fetch_add, +)
+
+#define os_atomic_sub_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_sub)
+#define os_atomic_sub(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_sub, -)
+
+#define os_atomic_dec_orig(p, m)    _os_atomic_c11_op_orig(p, 1, m, fetch_sub)
+#define os_atomic_dec(p, m)         _os_atomic_c11_op(p, 1, m, fetch_sub, -)
+
+#define os_atomic_and_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_and)
+#define os_atomic_and(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_and, &)
+
+#define os_atomic_or_orig(p, v, m)  _os_atomic_c11_op_orig(p, v, m, fetch_or)
+#define os_atomic_or(p, v, m)       _os_atomic_c11_op(p, v, m, fetch_or, |)
+
+#define os_atomic_xor_orig(p, v, m) _os_atomic_c11_op_orig(p, v, m, fetch_xor)
+#define os_atomic_xor(p, v, m)      _os_atomic_c11_op(p, v, m, fetch_xor, ^)
+
+#define os_atomic_xchg(p, v, m)     _os_atomic_c11_op_orig(p, v, m, exchange)
+
+#define os_atomic_cmpxchg(p, e, v, m) \
+               ({ _os_atomic_basetypeof(p) _r = (e); \
+               atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
+               &_r, v, memory_order_##m, memory_order_relaxed); })
+#define os_atomic_cmpxchgv(p, e, v, g, m) \
+               ({ _os_atomic_basetypeof(p) _r = (e); int _b = \
+               atomic_compare_exchange_strong_explicit(_os_atomic_c11_atomic(p), \
+               &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r; _b; })
+#define os_atomic_cmpxchgvw(p, e, v, g, m) \
+               ({ _os_atomic_basetypeof(p) _r = (e); int _b = \
+               atomic_compare_exchange_weak_explicit(_os_atomic_c11_atomic(p), \
+               &_r, v, memory_order_##m, memory_order_relaxed); *(g) = _r;  _b; })
+
+#define os_atomic_rmw_loop(p, ov, nv, m, ...)  ({ \
+               bool _result = false; \
+               typeof(p) _p = (p); \
+               ov = os_atomic_load(_p, relaxed); \
+               do { \
+                       __VA_ARGS__; \
+                       _result = os_atomic_cmpxchgvw(_p, ov, nv, &ov, m); \
+               } while (!_result); \
+               _result; \
+       })
+
+#define os_atomic_rmw_loop_give_up_with_fence(m, expr) \
+               ({ os_atomic_thread_fence(m); expr; __builtin_unreachable(); })
+#define os_atomic_rmw_loop_give_up(expr) \
+               os_atomic_rmw_loop_give_up_with_fence(relaxed, expr)
+
+#define os_atomic_force_dependency_on(p, e) (p)
+#define os_atomic_load_with_dependency_on(p, e) \
+               os_atomic_load(os_atomic_force_dependency_on(p, e), relaxed)
+
 #if defined (__x86_64__)
 #include "i386/atomic.h"
 #elif defined (__arm__) || defined (__arm64__)
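
These wrappers give kernel code a uniform C11-atomics spelling with explicit memory orders; os_atomic_rmw_loop builds a compare-exchange retry loop around an arbitrary update expression, and os_atomic_rmw_loop_give_up bails out of it without storing. A usage sketch, assuming the macros above are in scope (e.g. via the machine atomic header):

    #include <stdbool.h>
    #include <stdint.h>

    static uint32_t refcount;

    /* take a reference only while the object is still live (refcount != 0) */
    static bool
    try_retain(void)
    {
        uint32_t ov, nv;
        return os_atomic_rmw_loop(&refcount, ov, nv, acquire, {
            if (ov == 0) {
                /* object already dead: abandon the loop without storing */
                os_atomic_rmw_loop_give_up(return false);
            }
            nv = ov + 1;
        });
    }
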
index b3e75c8d3587a4fd6db2ee71586d8f88a626eee3..9de057044a7408885e2232b05542c02a78cbc82f 100644 (file)
@@ -67,4 +67,12 @@ uint64_t mt_core_snap(unsigned int ctr);
 void mt_core_set_snap(unsigned int ctr, uint64_t snap);
 void mt_mtc_set_snap(struct mt_cpu *mtc, unsigned int ctr, uint64_t snap);
 
+typedef void (*mt_pmi_fn)(bool user_mode, void *ctx);
+extern bool mt_microstackshots;
+extern unsigned int mt_microstackshot_ctr;
+extern mt_pmi_fn mt_microstackshot_pmi_handler;
+extern void *mt_microstackshot_ctx;
+extern uint64_t mt_core_reset_values[MT_CORE_NFIXED];
+int mt_microstackshot_start_arch(uint64_t period);
+
 #endif /* !defined(MACHINE_MONOTONIC_H) */
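
The new hooks let the microstackshot machinery install a PMI callback and arm a sampling period against a fixed counter. A hypothetical handler matching the mt_pmi_fn typedef (everything except the typedef and the externs above is illustrative):

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t user_pmis, kernel_pmis;

    /* invoked on counter overflow once installed via mt_microstackshot_pmi_handler
       and armed with mt_microstackshot_start_arch(period) */
    static void
    sample_on_pmi(bool user_mode, void *ctx)
    {
        (void)ctx;   /* would receive mt_microstackshot_ctx */
        if (user_mode)
            user_pmis++;
        else
            kernel_pmis++;
    }
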
diff --git a/osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h b/osfmk/prng/YarrowCoreLib/include/WindowsTypesForMac.h
deleted file mode 100644 (file)
index 06f6a5b..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           WindowsTypesForMac.h
-
-       Contains:       Define common Windows data types in mac terms.
-
-       Written by:     Doug Mitchell
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created.
-*/
-
-#ifndef        _WINDOWS_TYPES_FOR_MAC_H_
-#define _WINDOWS_TYPES_FOR_MAC_H_
-
-#include <stdint.h>
-
-typedef u_int8_t       UCHAR;
-typedef int8_t         CHAR;
-typedef u_int8_t       BYTE;
-typedef char   TCHAR;
-typedef int16_t        WORD;
-typedef int32_t        DWORD;
-typedef u_int16_t      USHORT;
-typedef u_int32_t      ULONG;
-typedef int32_t        LONG;
-typedef u_int32_t      UINT;
-typedef int64_t        LONGLONG;
-typedef u_int8_t       *LPBYTE;
-typedef int8_t         *LPSTR;
-typedef int16_t        *LPWORD;
-typedef        int8_t  *LPCTSTR;               /* ??? */
-typedef        int8_t  *LPCSTR;                /* ??? */
-typedef void   *LPVOID;
-typedef void   *HINSTANCE;
-typedef        void    *HANDLE;
-
-#define WINAPI
-
-#endif /* _WINDOWS_TYPES_FOR_MAC_H_*/
-
diff --git a/osfmk/prng/YarrowCoreLib/include/yarrow.h b/osfmk/prng/YarrowCoreLib/include/yarrow.h
deleted file mode 100644 (file)
index 282da76..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           yarrow.h
-
-       Contains:       Public header file for Counterpane's Yarrow Pseudo-random 
-                               number generator.
-
-       Written by:     Counterpane, Inc. 
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created, based on Counterpane source.
-*/
-/*
-       yarrow.h
-
-       Main header file for Counterpane's Yarrow Pseudo-random number generator.
-*/
-
-#ifndef __YARROW_H__
-#define __YARROW_H__
-
-#if            defined(macintosh) || defined(__APPLE__)
-#include "WindowsTypesForMac.h"
-#endif
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* Error Codes */
-typedef enum prng_error_status {
-       PRNG_SUCCESS = 0,
-       PRNG_ERR_REINIT,
-       PRNG_ERR_WRONG_CALLER,
-       PRNG_ERR_NOT_READY,
-       PRNG_ERR_NULL_POINTER,
-       PRNG_ERR_LOW_MEMORY,
-       PRNG_ERR_OUT_OF_BOUNDS,
-       PRNG_ERR_COMPRESSION,
-       PRNG_ERR_NOT_ENOUGH_ENTROPY,
-       PRNG_ERR_MUTEX,
-       PRNG_ERR_TIMEOUT,
-       PRNG_ERR_PROGRAM_FLOW
-} prng_error_status;
-
-/*
- * Entropy sources
- */
-enum user_sources {
-       CLIENT_SOURCE = 0,
-       ENTROPY_FILE_SOURCE,
-       SYSTEM_SOURCE,
-       USER_SOURCES  /* Leave as last source */
-};
-
-
-/* Declare YARROWAPI as __declspec(dllexport) before
-   including this file in the actual DLL */
-#ifndef YARROWAPI 
-#if            defined(macintosh) || defined(__APPLE__)
-#define YARROWAPI
-#else
-#define YARROWAPI __declspec(dllimport)
-#endif
-#endif
-
-/* Public function forward declarations */
-
-#if            defined(macintosh) || defined(__APPLE__)
-/* 
- * Mac changes:
- *   1. PrngRef context for all functions. Thus no global variables.
- *   2. Strong enum typing (prng_error_status instead of int return).
- */
-struct PRNG;
-typedef struct PRNG *PrngRef;
-
-YARROWAPI prng_error_status 
-prngInitialize(
-       PrngRef *prng);
-YARROWAPI prng_error_status 
-prngDestroy(
-       PrngRef prng);
-YARROWAPI prng_error_status 
-prngOutput(
-       PrngRef prng, 
-       BYTE *outbuf,
-       UINT outbuflen);
-/* this one has no context */
-YARROWAPI prng_error_status 
-prngStretch(
-       BYTE *inbuf,
-       UINT inbuflen,
-       BYTE *outbuf,
-       UINT outbuflen);
-YARROWAPI prng_error_status 
-prngInput(
-       PrngRef prng, 
-       BYTE *inbuf,
-       UINT inbuflen,
-       UINT poolnum,
-       UINT estbits);
-YARROWAPI prng_error_status 
-prngForceReseed(
-       PrngRef prng, 
-       LONGLONG ticks);
-YARROWAPI prng_error_status 
-prngAllowReseed(
-       PrngRef prng, 
-       LONGLONG ticks);
-YARROWAPI prng_error_status 
-prngProcessSeedBuffer(
-       PrngRef prng, 
-       BYTE *buf,
-       LONGLONG ticks);
-YARROWAPI prng_error_status 
-prngSlowPoll(
-       PrngRef prng, 
-       UINT pollsize);
-#else
-/* original Counterpane API */
-YARROWAPI int prngOutput(BYTE *outbuf,UINT outbuflen);
-YARROWAPI int prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen);
-YARROWAPI int prngInput(BYTE *inbuf,UINT inbuflen,UINT poolnum,UINT estbits);
-YARROWAPI int prngForceReseed(LONGLONG ticks);
-YARROWAPI int prngAllowReseed(LONGLONG ticks);
-YARROWAPI int prngProcessSeedBuffer(BYTE *buf,LONGLONG ticks);
-YARROWAPI int prngSlowPoll(UINT pollsize);
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif
diff --git a/osfmk/prng/YarrowCoreLib/include/yarrowUtils.h b/osfmk/prng/YarrowCoreLib/include/yarrowUtils.h
deleted file mode 100644 (file)
index 95a43f5..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           yarrowUtils.h
-
-       Contains:       Misc. utility functions.
-
-       Written by:     Doug Mitchell
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/29/00        dpm             Created.
-*/
-
-#ifndef        _YARROW_UTILS_H_
-#define _YARROW_UTILS_H_
-
-#include <prng/YarrowCoreLib/include/yarrow.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- * Wipe a piece of memory clean.
- */
-void trashMemory(void* mem, int len);
-
-#ifdef __cplusplus
-} 
-#endif
-
-#endif /* _YARROW_UTILS_H_*/
diff --git a/osfmk/prng/YarrowCoreLib/port/smf.c b/osfmk/prng/YarrowCoreLib/port/smf.c
deleted file mode 100644 (file)
index 5cb4a36..0000000
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           smf.c
-
-       Contains:       platform-dependent malloc/free
-*/
-
-#include <prng/YarrowCoreLib/src/smf.h>
-#include <kern/kalloc.h>
-#include <stddef.h>
-
-/* Shim emulating _MALLOC */
-
-SMFAPI void mmInit( void )
-{
-       return;
-}
-
-SMFAPI MMPTR mmMalloc(DWORD request)
-{
-       void *addr;
-
-       addr = (void *) kalloc(request);
-       if (addr == NULL)
-               return NULL;
-    
-       return (MMPTR) addr;
-}
-
-SMFAPI void mmFree(MMPTR ptrnum)
-{
-       kfree_addr(ptrnum);
-}
-
-SMFAPI LPVOID mmGetPtr(MMPTR ptrnum)
-{
-       return (LPVOID)ptrnum;
-}
-
-SMFAPI void mmReturnPtr(__unused MMPTR ptrnum)
-{
-       /* nothing */
-       return;
-}
-
diff --git a/osfmk/prng/YarrowCoreLib/src/assertverify.h b/osfmk/prng/YarrowCoreLib/src/assertverify.h
deleted file mode 100644 (file)
index 7f2c35a..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#ifndef ASSERT_VERIFY_H
-#define ASSERT_VERIFY_H
-
-/******************************************************************************
-Written by: Jeffrey Richter
-Notices: Copyright (c) 1995-1997 Jeffrey Richter
-Purpose: Common header file containing handy macros and definitions used
-         throughout all the applications in the book.
-******************************************************************************/
-
-/* These header functions were copied from the cmnhdr.h file that accompanies 
-   Advanced Windows 3rd Edition by Jeffrey Richter */
-
-//////////////////////////// Assert/Verify Macros /////////////////////////////
-
-#if            defined(macintosh) || defined(__APPLE__)
-/* TBD */
-#define chFAIL(szMSG)                                          
-#define chASSERTFAIL(file,line,expr) 
-#else
-#define chFAIL(szMSG) {                                                   \
-      MessageBox(GetActiveWindow(), szMSG,                                \
-         __TEXT("Assertion Failed"), MB_OK | MB_ICONERROR);               \
-      DebugBreak();                                                       \
-   }
-
-/* Put up an assertion failure message box. */
-#define chASSERTFAIL(file,line,expr) {                                    \
-      TCHAR sz[128];                                                      \
-      wsprintf(sz, __TEXT("File %hs, line %d : %hs"), file, line, expr);  \
-      chFAIL(sz);                                                         \
-   }
-
-#endif /* macintosh */
-
-/* Put up a message box if an assertion fails in a debug build. */
-#ifdef _DEBUG
-#define chASSERT(x) if (!(x)) chASSERTFAIL(__FILE__, __LINE__, #x)
-#else
-#define chASSERT(x)
-#endif
-
-/* Assert in debug builds, but don't remove the code in retail builds. */
-#ifdef _DEBUG
-#define chVERIFY(x) chASSERT(x)
-#else
-#define chVERIFY(x) (x)
-#endif
-
-#endif /* ASSERT_VERIFY_H */
diff --git a/osfmk/prng/YarrowCoreLib/src/comp.c b/osfmk/prng/YarrowCoreLib/src/comp.c
deleted file mode 100644 (file)
index 28f3c07..0000000
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           comp.c
-
-       Contains:       NULL compression. Kernel version of Yarrow assumes
-                               incoming seed data is truly random.
-*/
-#include "prng/YarrowCoreLib/include/WindowsTypesForMac.h"
-#include "comp.h"
-
-#ifdef YARROW_KERNEL
-
-/* null compression */
-comp_error_status comp_init(__unused COMP_CTX* ctx)
-{
-       return COMP_SUCCESS;
-}
-
-
-comp_error_status comp_add_data( __unused COMP_CTX* ctx, 
-                                                                __unused Bytef* inp, 
-                                                                __unused uInt inplen )
-{
-       return COMP_SUCCESS;
-}
-
-comp_error_status comp_get_ratio( __unused COMP_CTX* ctx,float* out )
-{
-       *out = 1.0;
-       return COMP_SUCCESS;
-}
-
-comp_error_status comp_end( __unused COMP_CTX* ctx )
-{
-       return COMP_SUCCESS;
-}
-
-#else
-
-/* original Yarrow compression, must be linked with zlib */
-
-#if            defined(macintosh) || defined(__APPLE__)
-#include "WindowsTypesForMac.h"
-#include "yarrowUtils.h"
-#include <string.h>
-#include <stdlib.h>
-#else
-#include <windows.h>
-#endif
-#include <math.h>
-#include "comp.h"
-
-/* Check that the pointer is not NULL */
-#define PCHECK(ptr)  if(ptr==NULL) {return COMP_ERR_NULL_POINTER;}
-#define MMPCHECK(mmptr) if(mmptr==MM_NULL) {return COMP_ERR_NULL_POINTER;}
-/* Check that the important parts of the context are ok */
-#define CTXCHECK(ctx) \
-PCHECK(ctx)                            \
-MMPCHECK(ctx->buf)
-
-/* Might want to vary these by context */
-#define BUFSIZE  16384 /* 16K */
-#define OUTBUFSIZE 16800 /* = inbufsize*1.01 + 12 (See zlib docs) */
-#define SHIFTSIZE 4096 /* BUFSIZE/4 */
-
-#define _MIN(a,b) (((a)<(b))?(a):(b))
-
-
-/* Initialize these routines */
-comp_error_status comp_init(COMP_CTX* ctx)
-{
-       ctx->buf = mmMalloc(BUFSIZE);
-       if(ctx->buf == MM_NULL) {goto cleanup_comp_init;}
-       ctx->spaceused = 0;
-
-       return COMP_SUCCESS;
-
-cleanup_comp_init:
-       mmFree(ctx->buf);
-
-       return COMP_ERR_LOW_MEMORY;
-}
-
-
-comp_error_status comp_add_data(COMP_CTX* ctx,Bytef* inp,uInt inplen)
-{
-       uInt shifts;
-       uInt blocksize;
-       BYTE* buf;
-
-       CTXCHECK(ctx);
-       PCHECK(inp);
-
-       buf = (BYTE*)mmGetPtr(ctx->buf);
-
-       if(inplen+SHIFTSIZE>BUFSIZE)
-       {
-               blocksize = _MIN(inplen,BUFSIZE);
-               memmove(buf,inp,blocksize);
-               ctx->spaceused = blocksize;
-       }
-       else
-       {
-               if(inplen+ctx->spaceused>BUFSIZE) 
-               {
-                       shifts = (uInt)ceil((inplen+ctx->spaceused-BUFSIZE)/(float)SHIFTSIZE);
-                       blocksize = _MIN(shifts*SHIFTSIZE,ctx->spaceused);
-                       memmove(buf,buf+blocksize,BUFSIZE-blocksize);
-                       ctx->spaceused = ctx->spaceused - blocksize;
-               }
-               memmove(buf+ctx->spaceused,inp,inplen);
-               ctx->spaceused += inplen;
-       }
-
-       return COMP_SUCCESS;
-}
-
-comp_error_status comp_get_ratio(COMP_CTX* ctx,float* out)
-{
-       Bytef *inbuf,*outbuf;
-       uLong insize,outsize;
-       int resp;
-
-       *out = 0;
-
-       CTXCHECK(ctx);
-       PCHECK(out);
-
-       if(ctx->spaceused == 0) {return COMP_SUCCESS;}
-
-       inbuf = (Bytef*)mmGetPtr(ctx->buf);
-       outbuf = (Bytef*)malloc(OUTBUFSIZE);
-       if(outbuf==NULL) {return COMP_ERR_LOW_MEMORY;}
-
-       insize = ctx->spaceused;
-       outsize = OUTBUFSIZE;
-
-       resp = compress(outbuf,&outsize,inbuf,insize);
-       if(resp==Z_MEM_ERROR) {return COMP_ERR_LOW_MEMORY;}
-       if(resp==Z_BUF_ERROR) {return COMP_ERR_LIB;}
-
-       *out = (float)outsize/(float)insize;
-
-       /* Thrash the memory and free it */
-       trashMemory(outbuf, OUTBUFSIZE);
-       free(outbuf);
-
-       return COMP_SUCCESS;
-}
-
-comp_error_status comp_end(COMP_CTX* ctx)
-{
-       if(ctx == NULL) {return COMP_SUCCESS;} /* Since nothing is left undone */
-
-       mmFree(ctx->buf);
-       ctx->buf = MM_NULL;
-
-       return COMP_SUCCESS;
-}
-
-#endif /* YARROW_KERNEL */
-
diff --git a/osfmk/prng/YarrowCoreLib/src/comp.h b/osfmk/prng/YarrowCoreLib/src/comp.h
deleted file mode 100644 (file)
index 0cbeca3..0000000
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           comp.h
-
-       Contains:       Glue between core prng code to the Zlib library.
-
-       Written by:     Counterpane, Inc. 
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created, based on Counterpane source.
-*/
-/* comp.h
-
-   Header for the compression routines added to the Counterpane PRNG. 
-*/
-
-#ifndef __YARROW_COMP_H__
-#define __YARROW_COMP_H__
-
-#include "smf.h"
-
-/*
- * Kernel version does NULL compression....
- */
-#define YARROW_KERNEL
-
-#ifdef YARROW_KERNEL
-/* 
- * Shrink this down to almost nothing to simplify kernel port;
- * with additional hacking on prng.c, this could go away entirely
- */
-typedef char COMP_CTX;
-
-/* and define some type3s normally picked up from zlib */
-typedef unsigned char Bytef;
-typedef unsigned uInt;
-
-#else
-
-#include "zlib.h"
-
-/* Top level compression context */
-typedef struct{
-       MMPTR buf;
-       uInt spaceused;
-} COMP_CTX;
-#endif /* YARROW_KERNEL */
-
-typedef enum comp_error_status {
-       COMP_SUCCESS = 0,
-       COMP_ERR_NULL_POINTER,
-       COMP_ERR_LOW_MEMORY,
-       COMP_ERR_LIB
-} comp_error_status;
-
-/* Exported functions from compress.c */
-comp_error_status comp_init(COMP_CTX* ctx);
-comp_error_status comp_add_data(COMP_CTX* ctx,Bytef* inp,uInt inplen);
-comp_error_status comp_end(COMP_CTX* ctx);
-comp_error_status comp_get_ratio(COMP_CTX* ctx,float* out);
-
-#endif
diff --git a/osfmk/prng/YarrowCoreLib/src/entropysources.h b/osfmk/prng/YarrowCoreLib/src/entropysources.h
deleted file mode 100644 (file)
index 1821acc..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/* entropysources.h */
-/* This files contain the defination of the entropy sources */
-
-#ifndef __YARROW_ENTROPY_SOURCES_H__
-#define __YARROW_ENTROPY_SOURCES_H__
-
-#if            defined(macintosh) || defined(__APPLE__)
-/* 
- * In our implementation, all sources are user sources.
- */
-enum entropy_sources {
-       ENTROPY_SOURCES = 0
-};
-#else
-enum entropy_sources {
-       KEYTIMESOURCE = 0,
-       MOUSETIMESOURCE,
-       MOUSEMOVESOURCE,
-       SLOWPOLLSOURCE,
-       ENTROPY_SOURCES,        /* Leave as second to last source */
-       MSG_CLOSE_PIPE          /* Leave as last source */
-};
-#endif
-
-#endif
diff --git a/osfmk/prng/YarrowCoreLib/src/macOnly.h b/osfmk/prng/YarrowCoreLib/src/macOnly.h
deleted file mode 100644 (file)
index 4586b02..0000000
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           macOnly.h
-
-       Contains:       Mac-specific #defines for Yarrow.
-
-       Written by:     Doug Mitchell
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created.
-*/
-
-#if            !defined(macintosh) && !defined(__APPLE__)
-#error Hey, why are you including macOnly for a non-Mac build!?
-#endif
-
-#ifndef        _MAC_ONLY_H_
-#define _MAC_ONLY_H_
-
-#include "prng/YarrowCoreLib/include/WindowsTypesForMac.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/*
- * No "slow poll" for Mac. 
- */
-#define SLOW_POLL_ENABLE       0
-#if            SLOW_POLL_ENABLE
-extern DWORD prng_slow_poll(BYTE* buf,UINT bufsize);
-#endif /* SLOW_POLL_ENABLE */
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* _MAC_ONLY_H_*/
diff --git a/osfmk/prng/YarrowCoreLib/src/prng.c b/osfmk/prng/YarrowCoreLib/src/prng.c
deleted file mode 100644 (file)
index 5c1d6ad..0000000
+++ /dev/null
@@ -1,638 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2016 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           prng.c
-
-       Contains:       Core routines for the Counterpane Yarrow PRNG.
-
-       Written by:     Counterpane, Inc. 
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created, based on Counterpane source.
-*/
-/*
-       prng.c
-
-       Core routines for the Counterpane PRNG
-*/
-#include "userdefines.h"
-#include "assertverify.h"
-#include "prng/YarrowCoreLib/include/yarrowUtils.h"
-
-#if            defined(macintosh) || defined(__APPLE__)
-/* FIXME - this file needs to be in a platform-independent place */
-
-#include "macOnly.h"
-#endif /* macintosh */
-#include "smf.h"
-#include "sha1mod.h"
-#include "entropysources.h"
-#include "comp.h"
-#include "prng/YarrowCoreLib/include/yarrow.h"
-#include "prng.h"
-#include "prngpriv.h"
-
-
-#define _MAX(a,b) (((a)>(b))?(a):(b))
-#define _MIN(a,b) (((a)<(b))?(a):(b))
-
-#if            defined(macintosh) || defined(__APPLE__)
-/*
- * No mutexes in this module for Macintosh/OSX. We handle the
- * required locking elsewhere. 
- */
-#define MUTEX_ENABLE   0
-
-#include <string.h>            /* memcpy, etc. */
-#if            TARGET_API_MAC_OSX
-       #include <sys/time.h>           /* for timespec */
-#elif  TARGET_API_MAC_CARBON
-       #include <Timer.h>                              /* Microseconds */
-       #include <Math64.h>
-#elif  KERNEL_BUILD
-       #include <sys/time.h>
-#elif  MACH_KERNEL_PRIVATE
-       #include <mach/mach_time.h>
-       #include <mach/clock_types.h>
-#else
-       #error Unknown TARGET_API
-#endif /* TARGET_API */
-#else
-#define MUTEX_ENABLE   1
-#endif /* macintosh */
-
-#if            MUTEX_ENABLE
-static HANDLE Statmutex = NULL;
-static DWORD mutexCreatorId = 0;
-#endif
-
-#if 0
-#pragma mark -
-#pragma mark * * * Static Utility functions * * * 
-#endif
-
-/* All error checking should be done in the function that calls these */
-
-/*
- * out := SHA1(IV | out) 
- */
-static void 
-prng_do_SHA1(GEN_CTX *ctx) 
-{
-       YSHA1_CTX sha;
-
-       YSHA1Init(&sha);
-       YSHA1Update(&sha,ctx->IV,20);
-       YSHA1Update(&sha,ctx->out,20);
-       YSHA1Final(ctx->out,&sha);
-       ctx->index = 0;
-}
-
-/*
- * IV  := newState
- * out := SHA1(IV)
- *
- * Called from init, prngForceReseed(), and prngOutput()
- * as anti-backtracking mechanism.
- */
-static void 
-prng_make_new_state(GEN_CTX *ctx,BYTE *newState) 
-{
-       YSHA1_CTX sha;
-
-       memcpy(ctx->IV,newState,20);
-       YSHA1Init(&sha);
-       YSHA1Update(&sha,ctx->IV,20);
-       YSHA1Final(ctx->out,&sha);
-       ctx->numout = 0;
-       ctx->index = 0;
-}
-
-#if            SLOW_POLL_ENABLE
-
-
-/* Initialize the secret state with a slow poll */
-/* Currently only called from prngInitialize */
-
-#define SPLEN 65536  /* 64K */
-
-static void 
-prng_slow_init(PRNG *p)
-/* This fails silently and must be fixed. */
-{
-       YSHA1_CTX* ctx = NULL;
-       MMPTR mmctx = MM_NULL;
-       BYTE* bigbuf = NULL;
-       MMPTR mmbigbuf = MM_NULL;
-       BYTE* buf = NULL;
-       MMPTR mmbuf = MM_NULL;
-       DWORD polllength;
-
-       mmbigbuf = mmMalloc(SPLEN);
-       if(mmbigbuf == MM_NULL) {goto cleanup_slow_init;}
-       bigbuf = (BYTE*)mmGetPtr(mmbigbuf);
-
-       mmbuf = mmMalloc(20);
-       if(mmbuf == MM_NULL) {goto cleanup_slow_init;}
-       buf = (BYTE*)mmGetPtr(mmbuf);
-
-       mmctx = mmMalloc(sizeof(YSHA1_CTX));
-       if(mmctx == MM_NULL) {goto cleanup_slow_init;}
-       ctx = (YSHA1_CTX*)mmGetPtr(mmctx);
-
-
-       /* Initialize the secret state. */
-       /* Init entropy pool */
-       YSHA1Init(&p->pool);
-       /* Init output generator */
-       polllength = prng_slow_poll(bigbuf,SPLEN);
-       YSHA1Init(ctx);
-       YSHA1Update(ctx,bigbuf,polllength);
-       YSHA1Final(buf,ctx);
-       prng_make_new_state(&p->outstate, buf);
-
-cleanup_slow_init:
-       mmFree(mmctx);
-       mmFree(mmbigbuf);
-       mmFree(mmbuf);
-
-       return;
-}
-
-#endif /* SLOW_POLL_ENABLE */
-
-/* In-place modified bubble sort (sorts into descending order) */
-static void 
-bubbleSort( UINT *data, LONG len ) 
-{
-       LONG    i,last,newlast;
-       UINT    temp;
-
-       last = len-1; 
-       while(last!=-1) 
-       {
-               newlast = -1;
-               for(i=0;i<last;i++) 
-               {
-                       if(data[i+1] > data[i]) 
-                       {
-                               newlast = i;
-                               temp = data[i];
-                               data[i] = data[i+1];
-                               data[i+1] = temp;
-                       }
-               }
-               last = newlast;
-       }               
-}
-
-#if 0
-#pragma mark -
-#pragma mark * * * Public functions * * * 
-#endif
-
-/* Set up the PRNG */
-prng_error_status
-prngInitialize(PrngRef *prng) 
-{
-       UINT i;
-       comp_error_status resp;
-       prng_error_status retval = PRNG_ERR_LOW_MEMORY;
-       MMPTR   mmp;
-       PRNG    *p;
-       
-       mmInit();
-       
-       #if     MUTEX_ENABLE
-       /* Create the mutex */
-       /* NOTE: on return the mutex should be held, since our caller (prngInitialize)
-        * will release it. 
-        */
-       if(mutexCreatorId!=0) {return PRNG_ERR_REINIT;}
-       Statmutex = CreateMutex(NULL,TRUE,NULL);
-       if(Statmutex == NULL) {mutexCreatorId = 0; return PRNG_ERR_MUTEX;}
-       DuplicateHandle(GetCurrentProcess(),Statmutex,GetCurrentProcess(),&mutex,SYNCHRONIZE,FALSE,0);
-       mutexCreatorId = GetCurrentProcessId();
-       #endif  /* MUTEX_ENABLE */
-       
-       /* Assign memory */
-       mmp = mmMalloc(sizeof(PRNG));
-       if(mmp==MM_NULL)
-       {
-               goto cleanup_init;
-       }
-       else
-       {
-               p = (PRNG*)mmGetPtr(mmp);
-               memset(p, 0, sizeof(PRNG));
-       }
-
-       /* Initialize Variables */
-       for(i=0;i<TOTAL_SOURCES;i++) 
-       {
-               p->poolSize[i] = 0;
-               p->poolEstBits[i] = 0;
-       }
-
-#ifdef WIN_NT
-       /* Set up security on the registry so that remote users cannot predict the slow pool */
-       prng_set_NT_security();
-#endif
-
-       /* Initialize the secret state. */
-       /* FIXME - might want to make this an option here and have the caller
-        * do it after we return....? */
-       YSHA1Init(&p->pool);
-#if            SLOW_POLL_ENABLE
-       prng_slow_init(p);      /* Does a slow poll and then calls prng_make_state(...) */
-#else  
-       /* NULL init */
-       prng_do_SHA1(&p->outstate);
-       prng_make_new_state(&p->outstate, p->outstate.out);
-#endif /* SLOW_POLL_ENABLE */
-
-       /* Initialize compression routines */
-       for(i=0;i<COMP_SOURCES;i++) 
-       {
-               resp = comp_init((p->comp_state)+i);
-               if(resp!=COMP_SUCCESS) {retval = PRNG_ERR_COMPRESSION; goto cleanup_init;}
-       }
-       
-       p->ready = PRNG_READY;
-       *prng = (PrngRef)p;
-       
-       return PRNG_SUCCESS;
-
-cleanup_init:
-       /* Program failed on one of the mmMalloc calls */
-       mmFree(mmp);
-       mmp = MM_NULL;
-       
-       #if             MUTEX_ENABLE
-       CloseHandle(Statmutex);
-       Statmutex = NULL;
-       mutexCreatorId = 0;
-       #endif
-       
-       return retval; /* default PRNG_ERR_LOW_MEMORY */
-}
-
-/* Provide output */
-prng_error_status
-prngOutput(PRNG *p, BYTE *outbuf,UINT outbuflen) 
-{
-       UINT i;
-       GEN_CTX *ctx = &p->outstate;
-       
-       CHECKSTATE(p);
-       GENCHECK(p);
-       PCHECK(outbuf);
-       chASSERT(BACKTRACKLIMIT > 0);
-
-       for(i=0;i<outbuflen;i++,ctx->index++,ctx->numout++) 
-       {
-               /* Check backtracklimit */
-               if(ctx->numout > BACKTRACKLIMIT) 
-               {
-                       prng_do_SHA1(ctx);      
-                       prng_make_new_state(ctx, ctx->out);
-               }
-               /* Check position in IV */
-               if(ctx->index>=20) 
-               {
-                       prng_do_SHA1(ctx);
-               }
-               /* Output data */
-               outbuf[i] = (ctx->out)[ctx->index];
-       }
-
-       return PRNG_SUCCESS;
-}
-
-
-/* Cause the PRNG to reseed now regardless of entropy pool */ 
-/* Should this be public? */
-prng_error_status
-prngForceReseed(PRNG *p, LONGLONG ticks) 
-{
-       int i;
-#ifdef WIN_NT
-       FILETIME a,b,c,usertime;
-#endif
-       BYTE buf[64];
-       BYTE dig[20];
-#if    defined(macintosh) || defined(__APPLE__)
-       #if             (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD))
-               struct timeval  tv;             
-               int64_t                 endTime, curTime;
-       #elif           defined(MACH_KERNEL_PRIVATE)
-               int64_t                 endTime, curTime;
-       #else   /* TARGET_API_MAC_CARBON */
-               UnsignedWide    uwide;          /* struct needed for Microseconds() */
-               LONGLONG                start;
-               LONGLONG                now;
-       #endif
-#endif
-
-       CHECKSTATE(p);
-       POOLCHECK(p);
-       ZCHECK(ticks);
-       
-       /* Set up start and end times */
-       #if             defined(macintosh) || defined(__APPLE__)
-               #if             (defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD))
-                       /* note we can't loop for more than a million microseconds */
-            #ifdef KERNEL_BUILD
-                microuptime (&tv);
-            #else
-                gettimeofday(&tv, NULL);
-            #endif
-                       endTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec + ticks;
-               #elif           defined(MACH_KERNEL_PRIVATE)
-                       endTime = mach_absolute_time() + (ticks*NSEC_PER_USEC);
-               #else   /* TARGET_API_MAC_CARBON */
-                       Microseconds(&uwide);
-                       start = UnsignedWideToUInt64(uwide);
-               #endif  /* TARGET_API_xxx */
-       #endif  /* macintosh */
-       do
-       {
-               /* Do a couple of iterations between time checks */
-               prngOutput(p, buf,64);
-               YSHA1Update(&p->pool,buf,64);
-               prngOutput(p, buf,64);
-               YSHA1Update(&p->pool,buf,64);
-               prngOutput(p, buf,64);
-               YSHA1Update(&p->pool,buf,64);
-               prngOutput(p, buf,64);
-               YSHA1Update(&p->pool,buf,64);
-               prngOutput(p, buf,64);
-               YSHA1Update(&p->pool,buf,64);
-
-#if            defined(macintosh) || defined(__APPLE__)
-       #if             defined(TARGET_API_MAC_OSX) || defined(KERNEL_BUILD)
-        #ifdef TARGET_API_MAC_OSX
-            gettimeofday(&tv, NULL);
-        #else
-            microuptime (&tv);
-           curTime = (int64_t)tv.tv_sec*1000000LL + (int64_t)tv.tv_usec;
-        #endif
-       } while(curTime < endTime);
-       #elif           defined(MACH_KERNEL_PRIVATE)
-           curTime = mach_absolute_time();     
-       } while(curTime < endTime);
-       #else
-               Microseconds(&uwide);
-               now = UnsignedWideToUInt64(uwide);
-       } while ( (now-start) < ticks) ;
-       #endif
-#else
-       } while ( (now-start) < ticks) ;
-#endif
-       YSHA1Final(dig,&p->pool);
-       YSHA1Update(&p->pool,dig,20); 
-       YSHA1Final(dig,&p->pool);
-
-       /* Reset secret state */
-       YSHA1Init(&p->pool);
-       prng_make_new_state(&p->outstate,dig);
-
-       /* Clear counter variables */
-       for(i=0;i<TOTAL_SOURCES;i++) 
-       {
-               p->poolSize[i] = 0;
-               p->poolEstBits[i] = 0;
-       }
-
-       /* Cleanup memory */
-       trashMemory(dig,20*sizeof(char));
-       trashMemory(buf,64*sizeof(char));
-
-       return PRNG_SUCCESS;
-}
-
-
-/* Input a state into the PRNG */
-prng_error_status
-prngProcessSeedBuffer(PRNG *p, BYTE *buf,LONGLONG ticks) 
-{
-       CHECKSTATE(p);
-       GENCHECK(p);
-       PCHECK(buf);
-
-       /* Put the data into the entropy pool, add some data from the unknown state, reseed */
-       YSHA1Update(&p->pool,buf,20);                   /* Put it into the entropy pool */
-       prng_do_SHA1(&p->outstate);                             /* Output 20 more bytes and     */
-       YSHA1Update(&p->pool,p->outstate.out,20);/* add it to the pool as well.  */
-       prngForceReseed(p, ticks);                              /* Do a reseed */
-       return prngOutput(p, buf,20); /* Return the first 20 bytes of output in buf */
-}
-
-
-/* Take some "random" data and make more "random-looking" data from it */
-/* note: this routine has no context, no mutex wrapper */
-prng_error_status
-prngStretch(BYTE *inbuf,UINT inbuflen,BYTE *outbuf,UINT outbuflen) {
-       long int left,prev;
-       YSHA1_CTX ctx;
-       BYTE dig[20];
-
-       PCHECK(inbuf);
-       PCHECK(outbuf);
-
-       if(inbuflen >= outbuflen) 
-       {
-               memcpy(outbuf,inbuf,outbuflen);
-               return PRNG_SUCCESS;
-       }
-       else  /* Extend using SHA1 hash of inbuf */
-       {
-               YSHA1Init(&ctx);
-               YSHA1Update(&ctx,inbuf,inbuflen);
-               YSHA1Final(dig,&ctx);
-               for(prev=0,left=outbuflen;left>0;prev+=20,left-=20) 
-               {
-                       YSHA1Update(&ctx,dig,20);
-                       YSHA1Final(dig,&ctx);
-                       memcpy(outbuf+prev,dig,(left>20)?20:left);
-               }
-               trashMemory(dig,20*sizeof(BYTE));
-               
-               return PRNG_SUCCESS;
-       }
-}
-
-
-/* Add entropy to the PRNG from a source */
-prng_error_status
-prngInput(PRNG *p, BYTE *inbuf,UINT inbuflen,UINT poolnum, __unused UINT estbits)
-{
-       #ifndef YARROW_KERNEL
-       comp_error_status resp;
-       #endif
-       
-       CHECKSTATE(p);
-       POOLCHECK(p);
-       PCHECK(inbuf);
-       if(poolnum >= TOTAL_SOURCES) {return PRNG_ERR_OUT_OF_BOUNDS;}
-
-       /* Add to entropy pool */
-       YSHA1Update(&p->pool,inbuf,inbuflen);
-       
-       #ifndef YARROW_KERNEL
-       /* skip this step for the kernel */
-       
-       /* Update pool size, pool user estimate and pool compression context */
-       p->poolSize[poolnum] += inbuflen;
-       p->poolEstBits[poolnum] += estbits;
-       if(poolnum<COMP_SOURCES)
-       {
-               resp = comp_add_data((p->comp_state)+poolnum,inbuf,inbuflen);
-               if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;}
-       }
-       #endif  /* YARROW_KERNEL */
-       
-       return PRNG_SUCCESS;
-}
-
-
-
-/* If we have enough entropy, allow a reseed of the system */
-prng_error_status
-prngAllowReseed(PRNG *p, LONGLONG ticks) 
-{
-       UINT temp[TOTAL_SOURCES];
-       LONG i;
-       UINT sum;
-#ifndef KERNEL_BUILD
-       float ratio;
-#endif
-
-#ifndef KERNEL_BUILD
-       comp_error_status resp;
-#endif
-
-       CHECKSTATE(p);
-
-       for(i=0;i<ENTROPY_SOURCES;i++)
-       {
-               /* Make sure that compression-based entropy estimates are current */
-#ifndef KERNEL_BUILD // floating point in a kernel is BAD!
-               resp = comp_get_ratio((p->comp_state)+i,&ratio);
-               if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;}
-               /* Use 4 instead of 8 to halve the compression estimate */
-               temp[i] = (int)(ratio*p->poolSize[i]*4);
-#else
-        temp[i] = p->poolSize[i] * 4;
-#endif
-
-       }
-       /* Use the minimum of the user and compression estimates for compressed sources */
-       for(i=ENTROPY_SOURCES;i<COMP_SOURCES;i++)
-       {
-#ifndef KERNEL_BUILD
-               /* Make sure that compression-based entropy estimates are current */
-               resp = comp_get_ratio((p->comp_state)+i,&ratio);
-               if(resp!=COMP_SUCCESS) {return PRNG_ERR_COMPRESSION;}
-               /* Use 4 instead of 8 to halve the compression estimate */
-               temp[i] = _MIN((int)(ratio*p->poolSize[i]*4),(int)p->poolEstBits[i]);
-#else
-        temp[i] = _MIN (p->poolSize[i] * 4, p->poolEstBits[i]);
-#endif
-
-       }
-       /* Use user estimate for remaining sources */
-       for(i=COMP_SOURCES;i<TOTAL_SOURCES;i++) {temp[i] = p->poolEstBits[i];}
-
-       if(K > 0) {
-               /* pointless if we're not ignoring any sources */
-               bubbleSort(temp,TOTAL_SOURCES);
-       }
-       for(i=K,sum=0;i<TOTAL_SOURCES;sum+=temp[i++]); /* Stupid C trick */
-       if(sum>THRESHOLD) 
-               return prngForceReseed(p, ticks);
-       else 
-               return PRNG_ERR_NOT_ENOUGH_ENTROPY;
-}
-
-#if            SLOW_POLL_ENABLE
-/* Call a slow poll and insert the data into the entropy pool */
-static prng_error_status
-prngSlowPoll(PRNG *p, UINT pollsize)
-{
-       BYTE *buf;
-       DWORD len;
-       prng_error_status retval;
-
-       CHECKSTATE(p);
-
-       buf = (BYTE*)malloc(pollsize);
-       if(buf==NULL) {return PRNG_ERR_LOW_MEMORY;}
-       len = prng_slow_poll(buf,pollsize);     /* OS specific call */
-       retval = prngInput(p, buf,len,SLOWPOLLSOURCE, len * 8);
-       trashMemory(buf,pollsize);
-       free(buf);
-
-       return retval;
-}
-#endif /* SLOW_POLL_ENABLE */
-
-
-/* Delete the PRNG */
-prng_error_status
-prngDestroy(PRNG *p) 
-{
-       UINT i;
-
-       #if     MUTEX_ENABLE
-       if(GetCurrentProcessId()!=mutexCreatorId) {return PRNG_ERR_WRONG_CALLER;}
-       #endif
-       if(p==NULL) {return PRNG_SUCCESS;} /* Well, there is nothing to destroy... */
-
-       p->ready = PRNG_NOT_READY;
-       
-       for(i=0;i<COMP_SOURCES;i++)
-       {
-               comp_end((p->comp_state)+i);
-       }
-
-       #if     MUTEX_ENABLE
-       CloseHandle(Statmutex);
-       Statmutex = NULL;
-       mutexCreatorId = 0;
-       #endif
-       
-       return PRNG_SUCCESS;
-}
-
-
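
prngAllowReseed above only triggers a reseed when the summed per-source entropy
estimate, after ignoring the K largest contributors, exceeds THRESHOLD (K = 0 and
THRESHOLD = 100 with the defaults in userdefines.h). A minimal standalone sketch
of just that decision rule, using made-up estimates:

/* Illustration of the prngAllowReseed decision; the estimates are invented. */
#include <stdio.h>

#define K         0     /* sources to ignore (userdefines.h default) */
#define THRESHOLD 100   /* bits required before a reseed is allowed */

int main(void)
{
    unsigned est[] = { 60, 30, 25 };    /* per-source estimates, in bits, descending */
    unsigned n = sizeof(est) / sizeof(est[0]);
    unsigned sum = 0;
    unsigned i;

    /* bubbleSort() leaves the estimates in descending order, so skipping
     * the first K entries skips the K largest sources. */
    for (i = K; i < n; i++) {
        sum += est[i];
    }

    printf("%u bits vs. threshold %u: %s\n", sum, THRESHOLD,
        sum > THRESHOLD ? "reseed" : "not enough entropy");
    return 0;
}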
diff --git a/osfmk/prng/YarrowCoreLib/src/prng.h b/osfmk/prng/YarrowCoreLib/src/prng.h
deleted file mode 100644 (file)
index 7d80758..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           prng.h
-
-       Contains:       Core routines for the Counterpane Yarrow PRNG.
-
-       Written by:     Counterpane, Inc. 
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created, based on Counterpane source.
-*/
-/*
-       prng.h
-
-       Main private header for the Counterpane PRNG. Use this to access the
-       initialization and destruction routines from the DLL.
-*/
-
-#ifndef __YARROW_PRNG_H__
-#define __YARROW_PRNG_H__
-
-#if            defined(macintosh) || defined(__APPLE__)
-#include "prng/YarrowCoreLib/include/yarrow.h"
-/* Private function forward declarations */
-// this is in yarrow.h...YARROWAPI prng_error_status prngInitialize(void);
-// ditto....             YARROWAPI prng_error_status prngDestroy(void);
-YARROWAPI prng_error_status prngInputEntropy(PrngRef prng, BYTE *inbuf,UINT inbuflen,UINT poolnum);
-#else  /* original yarrow code */
-/* Declare YARROWAPI as __declspec(dllexport) before
-   including this file in the actual DLL */
-#ifndef YARROWAPI 
-#define YARROWAPI __declspec(dllimport)
-#endif
-
-/* Private function forward declarations */
-YARROWAPI int prngInitialize(void);
-YARROWAPI int prngDestroy(void);
-YARROWAPI int prngInputEntropy(BYTE *inbuf,UINT inbuflen,UINT poolnum);
-
-#endif /* macintosh */
-#endif /* __YARROW_PRNG_H__ */
diff --git a/osfmk/prng/YarrowCoreLib/src/prngpriv.h b/osfmk/prng/YarrowCoreLib/src/prngpriv.h
deleted file mode 100644 (file)
index 3014b4f..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           prngpriv.h
-
-       Contains:       Private typedefs and #defines for Counterpane Yarrow PRNG.
-
-       Written by:     Counterpane, Inc. 
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/99        dpm             Created, based on Counterpane source.
-*/
-/*
-       prngpriv.h
-
-       Completely private header for the Counterpane PRNG. Should only be included by prng.c
-*/
-
-#ifndef __YARROW_PRNG_PRIV_H__
-#define __YARROW_PRNG_PRIV_H__
-
-#include "userdefines.h"
-#include "prng/YarrowCoreLib/include/yarrow.h"
-#include "entropysources.h"
-#include "comp.h"
-#include "sha1mod.h"
-#include "smf.h"
-
-#define TOTAL_SOURCES ENTROPY_SOURCES+USER_SOURCES
-
-#ifdef COMPRESSION_ON
-#define COMP_SOURCES TOTAL_SOURCES
-#else
-#define COMP_SOURCES ENTROPY_SOURCES
-#endif
-
-/* Error numbers */
-typedef enum prng_ready_status {
-       PRNG_READY = 33,        /* Compiler will initialize to either 0 or random if allowed to */
-       PRNG_NOT_READY = 0
-} prng_ready_status;
-
-/* Top level output state */
-typedef struct{
-       BYTE IV[20];
-       BYTE out[20];
-       UINT index;                     /* current byte to output */
-       UINT numout;            /* bytes since last prng_make_new_state */ 
-} GEN_CTX;
-
-/* PRNG state structure */
-struct PRNG {
-       /* Output State */
-       GEN_CTX outstate;
-
-       /* Entropy Pools (somewhat unlike a gene pool) */
-       YSHA1_CTX pool;
-       UINT poolSize[TOTAL_SOURCES];                   /* Note that size is in bytes and est in bits */
-       UINT poolEstBits[TOTAL_SOURCES];
-       COMP_CTX comp_state[COMP_SOURCES];
-
-       /* Status Flags */
-       prng_ready_status ready;
-};
-
-/*
- * Clients see an opaque PrngRef; internal code uses the 
- * following typedef.
- */
-typedef struct PRNG PRNG;
-
-
-/* Test Macros */
-#define CHECKSTATE(p) \
-if(p==NULL) {return PRNG_ERR_NOT_READY;} /* Does the state exist? */   \
-if(p->ready != PRNG_READY) {return PRNG_ERR_NOT_READY;}        /* Set error state and return */
-/* To make sure that a pointer isn't NULL */
-#define PCHECK(ptr)  if(ptr==NULL) {return PRNG_ERR_NULL_POINTER;}
-/* To make sure that malloc returned a valid value */
-#define MCHECK(ptr)  if(ptr==NULL) {return PRNG_ERR_LOW_MEMORY;}
-/* To make sure that a given value is non-negative */
-#if            defined(macintosh) || defined(__APPLE__)
-/* original looks like a bogon */
-#define ZCHECK(val)  if(val<0) {return PRNG_ERR_OUT_OF_BOUNDS;}
-#else
-#define ZCHECK(val)  if(p<0) {return PRNG_ERR_OUT_OF_BOUNDS;}
-#endif /* macintosh */
-/* To make sure that the generator state is valid */
-#define GENCHECK(p) if(p->outstate.index>20) {return PRNG_ERR_OUT_OF_BOUNDS;} /* index is unsigned */
-/* To make sure that the entropy pool is valid */
-#define POOLCHECK(p) /* */
-
-
-#endif
diff --git a/osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt b/osfmk/prng/YarrowCoreLib/src/readme-prnguser.txt
deleted file mode 100644 (file)
index c7b41ff..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-12345678901234567890123456789012345678901234567890123456789012345678901234567890
-
-Description of User Routines in Prngcore
-----------------------------------------------
-
-This file describes the routines in prngcore that are designed to be called by
-the user (i.e., client apps). Those interested in the details of the library are 
-directed to readme-prngcoder.
-
-Files of interest in this directory
------------------------------------
-
-yarrow.h
-
-Main header file (and the only one needed) for client apps.
-
-userdefines.h
-
-Header file with macros that can be defined to specify the system that this
-code is being compiled on, as well as other details of the prng operation.
-
-usersources.h
-
-Header file containing the names of the various user sources of entropic data.
-You can add/delete/rename sources by altering the entries in the enumeration.
-
-
-PRNG Client Routines
---------------------
-
-All major routines return the success/error value for their operation.
-
-prngOutput(outbuf,outbuflen)
-
-Writes outbuflen bytes of "random" data to outbuf. This routine has
-backtracking protection, but you should call prngAllowReseed whenever you can
-spare the cycles to guarantee good output. 
-
-prngStretch(inbuf,inbuflen,outbuf,outbuflen)
-
-Takes inbuflen bytes of data from inbuf and turns it into outbuflen bytes of 
-data stored in outbuf.
-
-prngInput(inbuf,inbuflen,poolnum,estbits)
-
-Takes inbuflen bytes of data from inbuf and places it in entropy pool poolnum.  
-The user entropy pool names can be found in usersources.h (see above).
-
-prngForceReseed(ticks)
-
-Forces a reseed that lasts about ticks ticks. Be very careful when using
-this function to ensure that you do not produce a poor output state.  It is 
-suggested that you instead use prngAllowReseed.
-
-prngAllowReseed(ticks)
-
-Will force a reseed if there is enough entropy. A reseed (of length ticks) 
-will be done if the total entropy estimate, ignoring the K greatest sources,
-is greater than THRESHOLD. Currently, K = 0 (a bad idea) and THRESHOLD = 100
-(likely to remain so). These values can be found and edited in userdefines.h.
-Will return PRNG_ERR_NOT_ENOUGH_ENTROPY if there is not enough entropy in the
-pool at this time.
-
-prngProcessSeedBuffer(buf,ticks)
-
-Takes 20 bytes of data from buf and churns it into the entropy pool, and then
-forces a reseed of length ticks. The first 20 bytes of output are then
-returned in buf for future use with this function.  It is recommended that data
-used with this function be stored very securely.
-
-prngSlowPoll(pollsize)
-
-Does a slow poll to collect a large amount of vaguely random data from the OS
-itself.  The poll will collect at most pollsize bytes, and this parameter can
-be used to control (approximately) the length of the poll. The collected data
-is fed into the entropy pool.  After calling this function you may either
-allow a reseed (recommended) or force one, if desired.
-
---------
-
-Any questions can be directed to the programmer (me), Ari Benbasat, at 
-pigsfly@unixg.ubc.ca.  Comments would be greatly appreciated.  Please cc: all
-e-mail to Bruce Schneier, John Kelsey and Chris Hall 
-{schneier,kelsey,hall}@counterpane.com.  
-
-Thank you.
-
-
-
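
Taken together, a minimal usage sketch for the Mac/kernel build looks roughly
like the following. The signatures match the definitions in prng.c above; the
client-facing prototypes actually live in yarrow.h (not shown in this hunk), and
the seed bytes, pool number and bit estimate here are placeholders.

/* Hypothetical caller of the Yarrow client routines -- illustration only. */
#include "prng/YarrowCoreLib/include/yarrow.h"

static void yarrow_example(void)
{
    PrngRef prng;
    BYTE    seed[20] = { 0 };   /* entropy gathered elsewhere */
    BYTE    out[16];

    if (prngInitialize(&prng) != PRNG_SUCCESS)
        return;

    /* Feed the bytes into user pool 0, claiming 160 bits of entropy for them. */
    prngInput(prng, seed, sizeof(seed), 0, 160);

    /* Reseed only if the pools hold enough entropy, then draw 16 output bytes. */
    (void)prngAllowReseed(prng, 100 /* reseed duration, in ticks */);
    prngOutput(prng, out, sizeof(out));

    prngDestroy(prng);
}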
diff --git a/osfmk/prng/YarrowCoreLib/src/sha1mod.c b/osfmk/prng/YarrowCoreLib/src/sha1mod.c
deleted file mode 100644 (file)
index 3f308d9..0000000
+++ /dev/null
@@ -1,239 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-SHA-1 in C
-By Steve Reid <steve@edmweb.com>
-100% Public Domain
-*/
-/* Header portion split from main code for convenience (AYB 3/02/98) */
-#include "sha1mod.h"
-#ifdef SHA1HANDSOFF
-#include <string.h>
-#endif
-
-#define rol(value, bits) (((value) << (bits)) | ((value) >> (32 - (bits))))
-
-/*
- * Apple change...
- */
-#if    defined(macintosh) || defined (__APPLE__)
-#undef LITTLE_ENDIAN
-#endif
-
-/* blk0() and blk() perform the initial expand. */
-/* I got the idea of expanding during the round function from SSLeay */
-#ifdef LITTLE_ENDIAN
-#define blk0(i) (block->l[i] = (rol(block->l[i],24)&0xFF00FF00) \
-    |(rol(block->l[i],8)&0x00FF00FF))
-#else
-#define blk0(i) block->l[i]
-#endif
-#define blk(i) (block->l[i&15] = rol(block->l[(i+13)&15]^block->l[(i+8)&15] \
-    ^block->l[(i+2)&15]^block->l[i&15],1))
-
-/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
-#define R0(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk0(i)+0x5A827999+rol(v,5);w=rol(w,30);
-#define R1(v,w,x,y,z,i) z+=((w&(x^y))^y)+blk(i)+0x5A827999+rol(v,5);w=rol(w,30);
-#define R2(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0x6ED9EBA1+rol(v,5);w=rol(w,30);
-#define R3(v,w,x,y,z,i) z+=(((w|x)&y)|(w&x))+blk(i)+0x8F1BBCDC+rol(v,5);w=rol(w,30);
-#define R4(v,w,x,y,z,i) z+=(w^x^y)+blk(i)+0xCA62C1D6+rol(v,5);w=rol(w,30);
-
-
-/* Hash a single 512-bit block. This is the core of the algorithm. */
-
-__private_extern__ void
-YSHA1Transform(u_int32_t state[5], const unsigned char buffer[64])
-{
-u_int32_t a, b, c, d, e;
-typedef union {
-    unsigned char c[64];
-    u_int32_t l[16];
-} CHAR64LONG16;
-CHAR64LONG16* block;
-#ifdef SHA1HANDSOFF
-static unsigned char workspace[64];
-    block = (CHAR64LONG16*)workspace;
-    memcpy(block, buffer, 64);
-#else
-    block = (CHAR64LONG16*)buffer;
-#endif
-    /* Copy context->state[] to working vars */
-    a = state[0];
-    b = state[1];
-    c = state[2];
-    d = state[3];
-    e = state[4];
-    /* 4 rounds of 20 operations each. Loop unrolled. */
-    R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
-    R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
-    R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
-    R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
-    R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
-    R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
-    R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
-    R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
-    R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
-    R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
-    R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
-    R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
-    R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
-    R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
-    R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
-    R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
-    R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
-    R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
-    R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
-    R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
-    /* Add the working vars back into context.state[] */
-    state[0] += a;
-    state[1] += b;
-    state[2] += c;
-    state[3] += d;
-    state[4] += e;
-    /* Wipe variables */
-    a = b = c = d = e = 0;
-}
-
-
-/* YSHA1Init - Initialize new context */
-
-__private_extern__ void
-YSHA1Init(YSHA1_CTX* context)
-{
-    /* SHA1 initialization constants */
-    context->state[0] = 0x67452301;
-    context->state[1] = 0xEFCDAB89;
-    context->state[2] = 0x98BADCFE;
-    context->state[3] = 0x10325476;
-    context->state[4] = 0xC3D2E1F0;
-    context->count[0] = context->count[1] = 0;
-}
-
-
-/* Run your data through this. */
-
-__private_extern__ void
-YSHA1Update(YSHA1_CTX* context, const unsigned char* data, unsigned int len)
-{
-unsigned int i, j;
-
-    j = (context->count[0] >> 3) & 63;
-    if ((context->count[0] += len << 3) < (len << 3)) context->count[1]++;
-    context->count[1] += (len >> 29);
-    if ((j + len) > 63) {
-        memcpy(&context->buffer[j], data, (i = 64-j));
-        YSHA1Transform(context->state, context->buffer);
-        for ( ; i + 63 < len; i += 64) {
-            YSHA1Transform(context->state, &data[i]);
-        }
-        j = 0;
-    }
-    else i = 0;
-    memcpy(&context->buffer[j], &data[i], len - i);
-}
-
-
-/* Add padding and return the message digest. */
-
-__private_extern__ void
-YSHA1Final(unsigned char digest[20], YSHA1_CTX* context)
-{
-u_int32_t i, j;
-unsigned char finalcount[8];
-
-    for (i = 0; i < 8; i++) {
-        finalcount[i] = (unsigned char)((context->count[(i >= 4 ? 0 : 1)]
-         >> ((3-(i & 3)) * 8) ) & 255);  /* Endian independent */
-    }
-    YSHA1Update(context, (const unsigned char *)"\200", 1);
-    while ((context->count[0] & 504) != 448) {
-        YSHA1Update(context, (const unsigned char *)"\0", 1);
-    }
-    YSHA1Update(context, finalcount, 8);  /* Should cause a YSHA1Transform() */
-    for (i = 0; i < 20; i++) {
-        digest[i] = (unsigned char)
-         ((context->state[i>>2] >> ((3-(i & 3)) * 8) ) & 255);
-    }
-    /* Wipe variables */
-    i = j = 0;
-    memset(context->buffer, 0, 64);
-    memset(context->state, 0, 20);
-    memset(context->count, 0, 8);
-    memset(finalcount, 0, 8);
-#ifdef SHA1HANDSOFF  /* make YSHA1Transform overwrite its own static vars */
-    YSHA1Transform(context->state, context->buffer);
-#endif
-}
-
-
-/*************************************************************/
-
-/* Test Code */
-
-#if 0
-
-int main(int argc, char** argv)
-{
-int i, j;
-YSHA1_CTX context;
-unsigned char digest[20], buffer[16384];
-FILE* file;
-
-    if (argc > 2) {
-        puts("Public domain SHA-1 implementation - by Steve Reid <steve@edmweb.com>");
-        puts("Produces the SHA-1 hash of a file, or stdin if no file is specified.");
-        exit(0);
-    }
-    if (argc < 2) {
-        file = stdin;
-    }
-    else {
-        if (!(file = fopen(argv[1], "rb"))) {
-            fputs("Unable to open file.", stderr);
-            exit(-1);
-        }
-    } 
-    YSHA1Init(&context);
-    while (!feof(file)) {  /* note: what if ferror(file) */
-        i = fread(buffer, 1, 16384, file);
-        YSHA1Update(&context, buffer, i);
-    }
-    YSHA1Final(digest, &context);
-    fclose(file);
-    for (i = 0; i < 5; i++) {
-        for (j = 0; j < 4; j++) {
-            printf("%02X", digest[i*4+j]);
-        }
-        putchar(' ');
-    }
-    putchar('\n');
-    exit(0);
-}
-
-#endif
diff --git a/osfmk/prng/YarrowCoreLib/src/sha1mod.h b/osfmk/prng/YarrowCoreLib/src/sha1mod.h
deleted file mode 100644 (file)
index d969f2c..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-SHA-1 in C
-By Steve Reid <steve@edmweb.com>
-100% Public Domain
-*/
-/* Header portion split from main code for convenience (AYB 3/02/98) */
-
-#ifndef __SHA1_H__
-
-#define __SHA1_H__
-
-#include <stdint.h>
-
-/*
-Test Vectors (from FIPS PUB 180-1)
-"abc"
-  A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D
-"abcdbcdecdefdefgefghfghighijhijkijkljklmklmnlmnomnopnopq"
-  84983E44 1C3BD26E BAAE4AA1 F95129E5 E54670F1
-A million repetitions of "a"
-  34AA973C D4C4DAA4 F61EEB2B DBAD2731 6534016F
-*/
-
-/* Apple change - define this in the source file which uses it */
-/* #define LITTLE_ENDIAN  This should be #define'd if true. */
-#define SHA1HANDSOFF /* Copies data before messing with it. */
-
-//Context declaration
-typedef struct {
-    u_int32_t state[5];
-    u_int32_t count[2];
-    unsigned char buffer[64];
-} YSHA1_CTX;
-
-//Function forward declarations
-__private_extern__ void YSHA1Transform(u_int32_t state[5],
-    const unsigned char buffer[64]);
-__private_extern__ void YSHA1Init(YSHA1_CTX* context);
-__private_extern__ void YSHA1Update(YSHA1_CTX* context,
-    const unsigned char* data, unsigned int len);
-__private_extern__ void YSHA1Final(unsigned char digest[20],
-    YSHA1_CTX* context);
-
-#endif /* __SHA1_H__ */
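
The test vectors quoted in the header make it easy to sanity-check the YSHA1
wrappers. A minimal, hosted-environment sketch (printf is obviously not available
to the kernel build) that checks the "abc" vector; compile it together with
sha1mod.c:

/* Expected output: A9993E36 4706816A BA3E2571 7850C26C 9CD0D89D */
#include <stdio.h>
#include <sys/types.h>      /* for u_int32_t used by sha1mod.h */
#include "sha1mod.h"

int main(void)
{
    YSHA1_CTX       ctx;
    unsigned char   digest[20];
    int             i;

    YSHA1Init(&ctx);
    YSHA1Update(&ctx, (const unsigned char *)"abc", 3);
    YSHA1Final(digest, &ctx);

    for (i = 0; i < 20; i++)
        printf("%02X%s", digest[i], (i % 4 == 3) ? " " : "");
    putchar('\n');
    return 0;
}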
diff --git a/osfmk/prng/YarrowCoreLib/src/smf.h b/osfmk/prng/YarrowCoreLib/src/smf.h
deleted file mode 100644 (file)
index b152732..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           smf.h
-
-       Contains:       Secure malloc/free API.
-
-       Written by:     Doug Mitchell
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/10/00        dpm             Created, based on Counterpane's Yarrow code. 
-*/
-
-#ifndef _YARROW_SMF_H_
-#define _YARROW_SMF_H_
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-/* smf.h */
-
-       /*  
-       Header file for secure malloc and free routines used by the Counterpane
-       PRNG. Use this code to set up a memory-mapped file out of the system 
-       paging file, allocate and free memory from it, and then return
-       the memory to the system registry after having securely overwritten it.
-       Details of the secure overwrite can be found in Gutmann 1996 (Usenix).
-       Trying to explain it here will cause my head to begin to hurt.
-       Ari Benbasat (pigsfly@unixg.ubc.ca)
-       */
-
-
-
-#if            defined(macintosh) || defined(__APPLE__)
-#include "macOnly.h"
-#define MMPTR  void *
-
-#ifndef SMFAPI 
-#define SMFAPI 
-#endif
-
-#else  /* original Yarrow */
-
-/* Declare HOOKSAPI as __declspec(dllexport) before
-   including this file in the actual DLL */
-#ifndef SMFAPI 
-#define SMFAPI __declspec(dllimport)
-#endif
-#define MMPTR  BYTE
-
-#endif /* macintosh */
-
-
-#define MM_NULL        ((void *)0)
-
-/* Function forward declarations */
-SMFAPI void mmInit( void );
-SMFAPI MMPTR mmMalloc(DWORD request);
-SMFAPI void mmFree(MMPTR ptrnum);
-SMFAPI LPVOID mmGetPtr(MMPTR ptrnum);
-SMFAPI void mmReturnPtr(MMPTR ptrnum);
-#if    0
-SMFAPI void mmFreePtr(LPVOID ptr);
-#endif
-
-#if defined(__cplusplus)
-}
-#endif
-
-#endif /* _YARROW_SMF_H_*/
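
The smf layer hands out opaque MMPTR handles rather than raw pointers, which is
why prng.c always pairs mmMalloc() with mmGetPtr() before touching the memory.
A minimal sketch of that allocate / resolve / free pattern, mirroring the use in
prng_slow_init(); the 20-byte size and the write to buf[0] are illustrative only:

/* Illustration of the mmMalloc/mmGetPtr/mmFree pattern -- not library code. */
#include "smf.h"

static int smf_example(void)
{
    MMPTR   handle;
    BYTE   *buf;

    mmInit();

    handle = mmMalloc(20);
    if (handle == MM_NULL)
        return -1;                      /* allocation failed */

    buf = (BYTE*)mmGetPtr(handle);      /* resolve the handle to a pointer */
    buf[0] = 0;                         /* ... use the memory ... */

    mmFree(handle);                     /* smf is responsible for scrubbing */
    return 0;
}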
diff --git a/osfmk/prng/YarrowCoreLib/src/userdefines.h b/osfmk/prng/YarrowCoreLib/src/userdefines.h
deleted file mode 100644 (file)
index 3d76b4b..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       userdefines.h
-
-       Header file that contains the major user-definable quantities for the Counterpane PRNG.
-*/
-#ifndef __YARROW_USER_DEFINES_H__
-#define __YARROW_USER_DEFINES_H__
-
-/* User-alterable define statements */
-#define STRICT                         /* Define to force strict type checking */
-#define K 0                                    /* How many sources should we ignore when calculating total entropy? */
-#define THRESHOLD 100          /* Minimum amount of entropy for a reseed */
-#define BACKTRACKLIMIT 500     /* Number of output bytes after which to generate a new state */
-#define COMPRESSION_ON         /* Define this variable to add on-the-fly compression (recommended) */
-                                                       /* for user sources */
-#if            !defined(macintosh) && !defined(__APPLE__)
-#define WIN_95                         /* Choose an OS: WIN_95, WIN_NT */
-#endif
-
-/* Set up the Microsoft flag for NT4.0 */
-#ifdef WIN_NT
-#define _WIN32_WINNT 0x0400
-#endif
-
-#endif /* __YARROW_USER_DEFINES_H__ */
diff --git a/osfmk/prng/YarrowCoreLib/src/yarrowUtils.c b/osfmk/prng/YarrowCoreLib/src/yarrowUtils.c
deleted file mode 100644 (file)
index 68b6ce0..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 1999, 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       File:           yarrowUtils.c
-
-       Contains:       Misc. utility functions.
-
-       Written by:     Doug Mitchell
-
-       Copyright: (c) 2000 by Apple Computer, Inc., all rights reserved.
-
-       Change History (most recent first):
-
-               02/29/00        dpm             Created.
-*/
-
-#include "prng/YarrowCoreLib/include/yarrowUtils.h"
-#include <string.h>
-
-void 
-trashMemory(void* mem, int len)
-/* This function should only be used on data in RAM */
-{
-       if(len == 0) {
-               /* some memsets really don't like this */
-               return;
-       }
-       
-       /* Cycle a bit just in case it is one of those weird memory units */
-       /* No, I don't know which units those would be */
-       memset(mem,0x00,len);
-       memset(mem,0xFF,len);
-       memset(mem,0x00,len);
-}
-
-
diff --git a/osfmk/prng/fips_sha1.c b/osfmk/prng/fips_sha1.c
deleted file mode 100644 (file)
index 93a0068..0000000
+++ /dev/null
@@ -1,386 +0,0 @@
-/*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
- * This SHA1 code is based on the basic framework from the reference
- * implementation for MD5.  That implementation is Copyright (C)
- * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved.
- *
- * License to copy and use this software is granted provided that it
- * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
- * Algorithm" in all material mentioning or referencing this software
- * or this function.
- *
- * License is also granted to make and use derivative works provided
- * that such works are identified as "derived from the RSA Data
- * Security, Inc. MD5 Message-Digest Algorithm" in all material
- * mentioning or referencing the derived work.
- *
- * RSA Data Security, Inc. makes no representations concerning either
- * the merchantability of this software or the suitability of this
- * software for any particular purpose. It is provided "as is"
- * without express or implied warranty of any kind.
- *
- * These notices must be retained in any copies of any part of this
- * documentation and/or software.
- *
- * Based on the FIPS 180-1: Secure Hash Algorithm (SHA-1) available at
- * http://www.itl.nist.gov/div897/pubs/fip180-1.htm
- */
-
-/*
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-       
-       THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR.
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN,
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THROUGH FIPS ACCEPTANCE AGAIN,
-       DON'T MESS WITH THIS FILE.
-
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-*/
-
-#include <stdint.h>
-#include <string.h>
-
-#include "fips_sha1.h"
-
-typedef int Boolean;
-
-/* Internal mappings to the legacy sha1_ctxt structure. */
-#define        state   h.b32
-#define        bcount  c.b32
-#define        buffer  m.b8
-
-/*
- * The digest algorithm interprets the input message as a sequence of 32-bit
- * big-endian words.  We must reverse bytes in each word on x86/64 platforms,
- * but not on big-endian ones such as PPC.  For performance, we take advantage
- * of the bswap instruction on x86/64 to perform byte-reversal.  On PPC, we
- * could do a 4-byte load if the address is 4-byte aligned, which should further
- * improve the performance.  But for code simplicity, we punt and do 1-byte
- * loads instead.
- */
-#if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__)
-#define        FETCH_32(p) ({                                                  \
-       u_int32_t l = (u_int32_t)*((const u_int32_t *)(p));     \
-       __asm__ __volatile__("bswap %0" : "=r" (l) : "0" (l));          \
-       l;                                                              \
-})
-#else
-#define        FETCH_32(p)                                                     \
-       (((u_int32_t)*((const u_int8_t *)(p) + 3)) |                    \
-       (((u_int32_t)*((const u_int8_t *)(p) + 2)) << 8) |              \
-       (((u_int32_t)*((const u_int8_t *)(p) + 1)) << 16) |             \
-       (((u_int32_t)*((const u_int8_t *)(p))) << 24))
-#endif /* __i386__ || __x86_64__ */
-
-/*
- * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
- * a multiple of 4. This is not compatible with memcpy().
- */
-static void
-Encode(unsigned char *output, u_int32_t *input, unsigned int len)
-{
-       unsigned int i, j;
-
-       for (i = 0, j = 0; j < len; i++, j += 4) {
-               output[j + 3] = input[i] & 0xff;
-               output[j + 2] = (input[i] >> 8) & 0xff;
-               output[j + 1] = (input[i] >> 16) & 0xff;
-               output[j] = (input[i] >> 24) & 0xff;
-       }
-}
-
-static unsigned char PADDING[64] = { 0x80, /* zeros */ };
-
-/* Constants from FIPS 180-1 */
-#define        K_00_19         0x5a827999UL
-#define        K_20_39         0x6ed9eba1UL
-#define        K_40_59         0x8f1bbcdcUL
-#define        K_60_79         0xca62c1d6UL
-
-/* F, G and H are the basic SHA1 functions. */
-#define        F(b, c, d)      ((((c) ^ (d)) & (b)) ^ (d))
-#define        G(b, c, d)      ((b) ^ (c) ^ (d))
-#define        H(b, c, d)      (((b) & (c)) | (((b) | (c)) & (d)))
-
-/* ROTATE_LEFT rotates x left n bits. */
-#define        ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
-
-/* R, R1-R4 are macros used during each transformation round. */
-#define R(f, k, v, w, x, y, z, i) {                            \
-       (v) = ROTATE_LEFT(w, 5) + f(x, y, z) + (v) + (i) + (k); \
-       (x) = ROTATE_LEFT(x, 30);                               \
-}
-
-#define        R1(v, w, x, y, z, i)    R(F, K_00_19, v, w, x, y, z, i)
-#define        R2(v, w, x, y, z, i)    R(G, K_20_39, v, w, x, y, z, i)
-#define        R3(v, w, x, y, z, i)    R(H, K_40_59, v, w, x, y, z, i)
-#define        R4(v, w, x, y, z, i)    R(G, K_60_79, v, w, x, y, z, i)
-
-/* WUPDATE represents the Wt variable that gets updated for steps 16-79 */
-#define        WUPDATE(p, q, r, s) {           \
-       (p) = ((q) ^ (r) ^ (s) ^ (p));  \
-       (p) = ROTATE_LEFT(p, 1);        \
-}
-
-static void SHA1Transform(u_int32_t, u_int32_t, u_int32_t, u_int32_t,
-    u_int32_t, const u_int8_t *, SHA1_CTX *);
-
-/*
- * SHA1 initialization. Begins a SHA1 operation, writing a new context.
- */
-void
-FIPS_SHA1Init(SHA1_CTX *context)
-{
-       context->bcount[0] = context->bcount[1] = 0;
-       context->count = 0;
-
-       /* Load magic initialization constants.  */
-       context->state[0] = 0x67452301UL;
-       context->state[1] = 0xefcdab89UL;
-       context->state[2] = 0x98badcfeUL;
-       context->state[3] = 0x10325476UL;
-       context->state[4] = 0xc3d2e1f0UL;
-}
-
-/*
- * SHA1 block update operation. Continues a SHA1 message-digest
- * operation, processing another message block, and updating the
- * context.
- */
-void FIPS_SHA1Update(SHA1_CTX *context, const void *inpp, size_t inputLen)
-{
-       u_int32_t i, index, partLen;
-       const unsigned char *input = (const unsigned char *)inpp;
-
-       if (inputLen == 0)
-               return;
-
-       /* Compute number of bytes mod 64 */
-       index = (context->bcount[1] >> 3) & 0x3F;
-
-       /* Update number of bits */
-       if ((context->bcount[1] += (inputLen << 3)) < (inputLen << 3))
-               context->bcount[0]++;
-       context->bcount[0] += (inputLen >> 29);
-
-       partLen = 64 - index;
-
-       /* Transform as many times as possible. */
-       i = 0;
-       if (inputLen >= partLen) {
-               if (index != 0) {
-                       memcpy(&context->buffer[index], input, partLen);
-                       SHA1Transform(context->state[0], context->state[1],
-                           context->state[2], context->state[3],
-                           context->state[4], context->buffer, context);
-                       i = partLen;
-               }
-
-               for (; i + 63 < inputLen; i += 64)
-                       SHA1Transform(context->state[0], context->state[1],
-                           context->state[2], context->state[3],
-                           context->state[4], &input[i], context);
-
-               if (inputLen == i)
-                       return;
-
-               index = 0;
-       }
-
-       /* Buffer remaining input */
-       memcpy(&context->buffer[index], &input[i], inputLen - i);
-}
-
-
-
-
-/*
- * This function is only called from the pagefault path or from page_copy(),
- * so we assume that we can safely convert the virtual address to the physical address and use it.
- * Assumptions: the passed-in address (inpp) is a kernel virtual address
- * and a physical page has been faulted in.
- * The inputLen passed in should always be less than or equal to a page size (4096)
- * and inpp should be on a page boundary.
- * "performSHA1WithinKernelOnly" is initialized only when the hardware driver exists and is ready.
- */
-
-
-
-/*
- * SHA1 finalization. Ends an SHA1 message-digest operation, writing the
- * message digest and zeroizing the context.
- */
-void
-FIPS_SHA1Final(void *digest, SHA1_CTX *context)
-{
-       unsigned char bits[8];
-       u_int32_t index = (context->bcount[1] >> 3) & 0x3f;
-
-       /* Save number of bits */
-       Encode(bits, context->bcount, 8);
-
-       /* Pad out to 56 mod 64. */
-       FIPS_SHA1Update(context, PADDING, ((index < 56) ? 56 : 120) - index);
-
-       /* Append length (before padding) */
-       FIPS_SHA1Update(context, bits, 8);
-
-       /* Store state in digest */
-       Encode(digest, context->state, 20);
-
-       /* Zeroize sensitive information. */
-       memset(context, 0, sizeof (*context));
-}
-
-/*
- * SHA1 basic transformation. Transforms state based on block.
- */
-static void
-SHA1Transform(u_int32_t a, u_int32_t b, u_int32_t c, u_int32_t d,
-    u_int32_t e, const u_int8_t block[64], SHA1_CTX *context)
-{
-       /* Register (instead of array) is a win in most cases */
-       u_int32_t w0, w1, w2, w3, w4, w5, w6, w7;
-       u_int32_t w8, w9, w10, w11, w12, w13, w14, w15;
-
-       w15 = FETCH_32(block + 60);
-       w14 = FETCH_32(block + 56);
-       w13 = FETCH_32(block + 52);
-       w12 = FETCH_32(block + 48);
-       w11 = FETCH_32(block + 44);
-       w10 = FETCH_32(block + 40);
-       w9  = FETCH_32(block + 36);
-       w8  = FETCH_32(block + 32);
-       w7  = FETCH_32(block + 28);
-       w6  = FETCH_32(block + 24);
-       w5  = FETCH_32(block + 20);
-       w4  = FETCH_32(block + 16);
-       w3  = FETCH_32(block + 12);
-       w2  = FETCH_32(block +  8);
-       w1  = FETCH_32(block +  4);
-       w0  = FETCH_32(block +  0);
-
-       /* Round 1 */
-                                       R1(e, a, b, c, d,  w0);         /*  0 */
-                                       R1(d, e, a, b, c,  w1);         /*  1 */
-                                       R1(c, d, e, a, b,  w2);         /*  2 */
-                                       R1(b, c, d, e, a,  w3);         /*  3 */
-                                       R1(a, b, c, d, e,  w4);         /*  4 */
-                                       R1(e, a, b, c, d,  w5);         /*  5 */
-                                       R1(d, e, a, b, c,  w6);         /*  6 */
-                                       R1(c, d, e, a, b,  w7);         /*  7 */
-                                       R1(b, c, d, e, a,  w8);         /*  8 */
-                                       R1(a, b, c, d, e,  w9);         /*  9 */
-                                       R1(e, a, b, c, d, w10);         /* 10 */
-                                       R1(d, e, a, b, c, w11);         /* 11 */
-                                       R1(c, d, e, a, b, w12);         /* 12 */
-                                       R1(b, c, d, e, a, w13);         /* 13 */
-                                       R1(a, b, c, d, e, w14);         /* 14 */
-                                       R1(e, a, b, c, d, w15);         /* 15 */
-       WUPDATE( w0, w13,  w8,  w2);    R1(d, e, a, b, c,  w0);         /* 16 */
-       WUPDATE( w1, w14,  w9,  w3);    R1(c, d, e, a, b,  w1);         /* 17 */
-       WUPDATE( w2, w15, w10,  w4);    R1(b, c, d, e, a,  w2);         /* 18 */
-       WUPDATE( w3,  w0, w11,  w5);    R1(a, b, c, d, e,  w3);         /* 19 */
-
-       /* Round 2 */
-       WUPDATE( w4,  w1, w12,  w6);    R2(e, a, b, c, d,  w4);         /* 20 */
-       WUPDATE( w5,  w2, w13,  w7);    R2(d, e, a, b, c,  w5);         /* 21 */
-       WUPDATE( w6,  w3, w14,  w8);    R2(c, d, e, a, b,  w6);         /* 22 */
-       WUPDATE( w7,  w4, w15,  w9);    R2(b, c, d, e, a,  w7);         /* 23 */
-       WUPDATE( w8,  w5,  w0, w10);    R2(a, b, c, d, e,  w8);         /* 24 */
-       WUPDATE( w9,  w6,  w1, w11);    R2(e, a, b, c, d,  w9);         /* 25 */
-       WUPDATE(w10,  w7,  w2, w12);    R2(d, e, a, b, c, w10);         /* 26 */
-       WUPDATE(w11,  w8,  w3, w13);    R2(c, d, e, a, b, w11);         /* 27 */
-       WUPDATE(w12,  w9,  w4, w14);    R2(b, c, d, e, a, w12);         /* 28 */
-       WUPDATE(w13, w10,  w5, w15);    R2(a, b, c, d, e, w13);         /* 29 */
-       WUPDATE(w14, w11,  w6,  w0);    R2(e, a, b, c, d, w14);         /* 30 */
-       WUPDATE(w15, w12,  w7,  w1);    R2(d, e, a, b, c, w15);         /* 31 */
-       WUPDATE( w0, w13,  w8,  w2);    R2(c, d, e, a, b,  w0);         /* 32 */
-       WUPDATE( w1, w14,  w9,  w3);    R2(b, c, d, e, a,  w1);         /* 33 */
-       WUPDATE( w2, w15, w10,  w4);    R2(a, b, c, d, e,  w2);         /* 34 */
-       WUPDATE( w3,  w0, w11,  w5);    R2(e, a, b, c, d,  w3);         /* 35 */
-       WUPDATE( w4,  w1, w12,  w6);    R2(d, e, a, b, c,  w4);         /* 36 */
-       WUPDATE( w5,  w2, w13,  w7);    R2(c, d, e, a, b,  w5);         /* 37 */
-       WUPDATE( w6,  w3, w14,  w8);    R2(b, c, d, e, a,  w6);         /* 38 */
-       WUPDATE( w7,  w4, w15,  w9);    R2(a, b, c, d, e,  w7);         /* 39 */
-
-       /* Round 3 */
-       WUPDATE( w8,  w5,  w0, w10);    R3(e, a, b, c, d,  w8);         /* 40 */
-       WUPDATE( w9,  w6,  w1, w11);    R3(d, e, a, b, c,  w9);         /* 41 */
-       WUPDATE(w10,  w7,  w2, w12);    R3(c, d, e, a, b, w10);         /* 42 */
-       WUPDATE(w11,  w8,  w3, w13);    R3(b, c, d, e, a, w11);         /* 43 */
-       WUPDATE(w12,  w9,  w4, w14);    R3(a, b, c, d, e, w12);         /* 44 */
-       WUPDATE(w13, w10,  w5, w15);    R3(e, a, b, c, d, w13);         /* 45 */
-       WUPDATE(w14, w11,  w6,  w0);    R3(d, e, a, b, c, w14);         /* 46 */
-       WUPDATE(w15, w12,  w7,  w1);    R3(c, d, e, a, b, w15);         /* 47 */
-       WUPDATE( w0, w13,  w8,  w2);    R3(b, c, d, e, a,  w0);         /* 48 */
-       WUPDATE( w1, w14,  w9,  w3);    R3(a, b, c, d, e,  w1);         /* 49 */
-       WUPDATE( w2, w15, w10,  w4);    R3(e, a, b, c, d,  w2);         /* 50 */
-       WUPDATE( w3,  w0, w11,  w5);    R3(d, e, a, b, c,  w3);         /* 51 */
-       WUPDATE( w4,  w1, w12,  w6);    R3(c, d, e, a, b,  w4);         /* 52 */
-       WUPDATE( w5,  w2, w13,  w7);    R3(b, c, d, e, a,  w5);         /* 53 */
-       WUPDATE( w6,  w3, w14,  w8);    R3(a, b, c, d, e,  w6);         /* 54 */
-       WUPDATE( w7,  w4, w15,  w9);    R3(e, a, b, c, d,  w7);         /* 55 */
-       WUPDATE( w8,  w5,  w0, w10);    R3(d, e, a, b, c,  w8);         /* 56 */
-       WUPDATE( w9,  w6,  w1, w11);    R3(c, d, e, a, b,  w9);         /* 57 */
-       WUPDATE(w10,  w7,  w2, w12);    R3(b, c, d, e, a, w10);         /* 58 */
-       WUPDATE(w11,  w8,  w3, w13);    R3(a, b, c, d, e, w11);         /* 59 */
-
-       WUPDATE(w12,  w9,  w4, w14);    R4(e, a, b, c, d, w12);         /* 60 */
-       WUPDATE(w13, w10,  w5, w15);    R4(d, e, a, b, c, w13);         /* 61 */
-       WUPDATE(w14, w11,  w6,  w0);    R4(c, d, e, a, b, w14);         /* 62 */
-       WUPDATE(w15, w12,  w7,  w1);    R4(b, c, d, e, a, w15);         /* 63 */
-       WUPDATE( w0, w13,  w8,  w2);    R4(a, b, c, d, e,  w0);         /* 64 */
-       WUPDATE( w1, w14,  w9,  w3);    R4(e, a, b, c, d,  w1);         /* 65 */
-       WUPDATE( w2, w15, w10,  w4);    R4(d, e, a, b, c,  w2);         /* 66 */
-       WUPDATE( w3,  w0, w11,  w5);    R4(c, d, e, a, b,  w3);         /* 67 */
-       WUPDATE( w4,  w1, w12,  w6);    R4(b, c, d, e, a,  w4);         /* 68 */
-       WUPDATE( w5,  w2, w13,  w7);    R4(a, b, c, d, e,  w5);         /* 69 */
-       WUPDATE( w6,  w3, w14,  w8);    R4(e, a, b, c, d,  w6);         /* 70 */
-       WUPDATE( w7,  w4, w15,  w9);    R4(d, e, a, b, c,  w7);         /* 71 */
-       WUPDATE( w8,  w5,  w0, w10);    R4(c, d, e, a, b,  w8);         /* 72 */
-       WUPDATE( w9,  w6,  w1, w11);    R4(b, c, d, e, a,  w9);         /* 73 */
-       WUPDATE(w10,  w7,  w2, w12);    R4(a, b, c, d, e, w10);         /* 74 */
-       WUPDATE(w11,  w8,  w3, w13);    R4(e, a, b, c, d, w11);         /* 75 */
-       WUPDATE(w12,  w9,  w4, w14);    R4(d, e, a, b, c, w12);         /* 76 */
-       WUPDATE(w13, w10,  w5, w15);    R4(c, d, e, a, b, w13);         /* 77 */
-       WUPDATE(w14, w11,  w6,  w0);    R4(b, c, d, e, a, w14);         /* 78 */
-       WUPDATE(w15, w12,  w7,  w1);    R4(a, b, c, d, e, w15);         /* 79 */
-
-       context->state[0] += a;
-       context->state[1] += b;
-       context->state[2] += c;
-       context->state[3] += d;
-       context->state[4] += e;
-
-       /* Zeroize sensitive information. */
-       w15 = w14 = w13 = w12 = w11 = w10 = w9 = w8 = 0;
-       w7 = w6 = w5 = w4 = w3 = w2 = w1 = w0 = 0;
-}
diff --git a/osfmk/prng/fips_sha1.h b/osfmk/prng/fips_sha1.h
deleted file mode 100644 (file)
index 092c48b..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-/*
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-       
-       THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR.
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN,
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THROUGH FIPS ACCEPTANCE AGAIN,
-       DON'T MESS WITH THIS FILE.
-
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-*/
-
-#ifndef _CRYPTO_FIPS_SHA1_H_
-#define        _CRYPTO_FIPS_SHA1_H_
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#include <stdint.h>
-
-#define        SHA_DIGEST_LENGTH       20
-#define        SHA1_RESULTLEN          SHA_DIGEST_LENGTH
-
-typedef struct sha1_ctxt {
-       union {
-               u_int8_t        b8[20];
-               u_int32_t       b32[5]; /* state (ABCDE) */
-       } h;
-       union {
-               u_int8_t        b8[8];
-               u_int32_t       b32[2];
-               u_int64_t       b64[1]; /* # of bits, modulo 2^64 (msb first) */
-       } c;
-       union {
-               u_int8_t        b8[64];
-               u_int32_t       b32[16]; /* input buffer */
-       } m;
-       u_int8_t        count;          /* unused; for compatibility only */
-} SHA1_CTX;
-
-extern void FIPS_SHA1Init(SHA1_CTX *);
-extern void FIPS_SHA1Update(SHA1_CTX *, const void *, size_t);
-extern void FIPS_SHA1Final(void *, SHA1_CTX *);
-
-#ifdef  __cplusplus
-}
-#endif
-
-#endif /* _CRYPTO_FIPS_SHA1_H_ */
diff --git a/osfmk/prng/prng_random.c b/osfmk/prng/prng_random.c
new file mode 100644 (file)
index 0000000..ec90cc7
--- /dev/null
@@ -0,0 +1,652 @@
+/*
+ * Copyright (c) 2013 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/cpu_data.h>
+#include <kern/cpu_number.h>
+#include <kern/kalloc.h>
+#include <kern/machine.h>
+#include <kern/misc_protos.h>
+#include <kern/processor.h>
+#include <kern/sched.h>
+#include <kern/startup.h>
+#include <kern/thread.h>
+#include <kern/thread_call.h>
+#include <mach/machine.h>
+#include <mach/processor.h>
+#include <machine/cpu_data.h>
+#include <machine/simple_lock.h>
+#include <sys/errno.h>
+#include <sys/kdebug.h>
+#include <sys/random.h>
+#include <vm/pmap.h>
+#include <vm/vm_page.h>
+
+#include <corecrypto/ccdigest.h>
+#include <corecrypto/ccdrbg.h>
+#include <corecrypto/cckprng.h>
+#include <corecrypto/ccsha1.h>
+#include <corecrypto/ccsha2.h>
+#include <prng/random.h>
+
+#include <IOKit/IOPlatformExpert.h>
+#include <console/serial_protos.h>
+#include <pexpert/pexpert.h>
+
+#include <libkern/section_keywords.h>
+
+#if defined(__arm__) || defined(__arm64__)
+#include <arm/cpu_data_internal.h> // For MAX_CPUS
+#endif
+
+#if defined(__x86_64__)
+#include <i386/cpuid.h>
+
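+/*
+ * RDSEED/RDRAND helpers.  Each *_step() issues a single instruction; the
+ * carry flag (captured with setc) indicates whether a value was actually
+ * returned, so callers retry a bounded number of times.
+ */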
+static int
+rdseed_step(uint64_t * seed)
+{
+       uint8_t ok;
+
+       asm volatile("rdseed %0; setc %1" : "=r"(*seed), "=qm"(ok));
+
+       return (int)ok;
+}
+
+static int
+rdseed_retry(uint64_t * seed, size_t nretries)
+{
+       size_t i;
+
+       for (i = 0; i < nretries; i += 1) {
+               if (rdseed_step(seed)) {
+                       return 1;
+               } else {
+                       asm volatile("pause");
+               }
+       }
+
+       return 0;
+}
+
+static size_t
+rdseed_seed(void * buf, size_t nwords)
+{
+       uint64_t * buf_words;
+       size_t i;
+
+       if (nwords > 8) {
+               nwords = 8;
+       }
+
+       buf_words = buf;
+       for (i = 0; i < nwords; i += 1) {
+               if (!rdseed_retry(buf_words + i, 10)) {
+                       return i;
+               }
+       }
+
+       return nwords;
+}
+
+static int
+rdrand_step(uint64_t * rand)
+{
+       uint8_t ok;
+
+       asm volatile("rdrand %0; setc %1" : "=r"(*rand), "=qm"(ok));
+
+       return (int)ok;
+}
+
+static int
+rdrand_retry(uint64_t * rand, size_t nretries)
+{
+       size_t i;
+
+       for (i = 0; i < nretries; i += 1) {
+               if (rdrand_step(rand)) {
+                       return 1;
+               }
+       }
+
+       return 0;
+}
+
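+/*
+ * RDRAND returns DRBG output rather than raw seed material, so condition
+ * 1023 RDRAND samples through SHA-256 and expose at most two 64-bit words
+ * of the digest as seed data.
+ */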
+static size_t
+rdrand_seed(void * buf, size_t nwords)
+{
+       size_t i;
+       uint64_t w;
+       uint8_t hash[CCSHA256_OUTPUT_SIZE];
+       const struct ccdigest_info * di = &ccsha256_ltc_di;
+
+       ccdigest_di_decl(di, ctx);
+       ccdigest_init(di, ctx);
+
+       for (i = 0; i < 1023; i += 1) {
+               if (!rdrand_retry(&w, 10)) {
+                       nwords = 0;
+                       goto out;
+               }
+               ccdigest_update(di, ctx, sizeof w, &w);
+       }
+
+       ccdigest_final(di, ctx, hash);
+
+       if (nwords > 2) {
+               nwords = 2;
+       }
+
+       memcpy(buf, hash, nwords * sizeof(uint64_t));
+
+out:
+       ccdigest_di_clear(di, ctx);
+       bzero(hash, sizeof hash);
+       bzero(&w, sizeof w);
+
+       return nwords;
+}
+
+static void
+intel_entropysource(void * buf, size_t * nbytes)
+{
+       size_t nwords;
+
+       /* only handle complete words */
+       assert(*nbytes % sizeof(uint64_t) == 0);
+
+       nwords = (*nbytes) / sizeof(uint64_t);
+       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) {
+               nwords  = rdseed_seed(buf, nwords);
+               *nbytes = nwords * sizeof(uint64_t);
+       } else if (cpuid_features() & CPUID_FEATURE_RDRAND) {
+               nwords  = rdrand_seed(buf, nwords);
+               *nbytes = nwords * sizeof(uint64_t);
+       } else {
+               *nbytes = 0;
+       }
+}
+
+#endif /* defined(__x86_64__) */
+
+void entropy_buffer_read(void * buffer, size_t * count);
+
+typedef void (*entropysource)(void * buf, size_t * nbytes);
+
+static const entropysource entropysources[] = {
+    entropy_buffer_read,
+#if defined(__x86_64__)
+    intel_entropysource,
+#endif
+};
+
+static const size_t nsources = sizeof entropysources / sizeof entropysources[0];
+
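+/*
+ * Read from every registered entropy source.  The caller supplies a buffer
+ * with room for nbytes_persource bytes per source; each source fills its own
+ * fixed-size slot (any shortfall is zero-filled) and the total number of
+ * bytes actually gathered is returned.
+ */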
+static size_t
+entropy_readall(void * buf, size_t nbytes_persource)
+{
+       uint8_t * buf_bytes = buf;
+       size_t i;
+       size_t nbytes_total = 0;
+
+       for (i = 0; i < nsources; i += 1) {
+               size_t nbytes = nbytes_persource;
+               entropysources[i](buf_bytes, &nbytes);
+               bzero(buf_bytes + nbytes, nbytes_persource - nbytes);
+               nbytes_total += nbytes;
+               buf_bytes += nbytes_persource;
+       }
+
+       return nbytes_total;
+}
+
+static struct {
+       struct cckprng_ctx ctx;
+       struct {
+               lck_grp_t * group;
+               lck_attr_t * attrs;
+               lck_grp_attr_t * group_attrs;
+               lck_mtx_t * mutex;
+       } lock;
+} prng;
+
+static SECURITY_READ_ONLY_LATE(prng_fns_t) prng_fns = NULL;
+
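+/*
+ * Thin wrappers around the registered PRNG operations: a CCKPRNG_ABORT from
+ * the underlying implementation is treated as unrecoverable and panics.
+ */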
+static int
+prng_init(cckprng_ctx_t ctx, size_t nbytes, const void * seed)
+{
+       int err = prng_fns->init(ctx, nbytes, seed);
+       if (err == CCKPRNG_ABORT) {
+               panic("prng_init");
+       }
+       return err;
+}
+
+#define PERMIT_WRITE_RANDOM 0
+
+#if PERMIT_WRITE_RANDOM
+static int
+prng_reseed(cckprng_ctx_t ctx, size_t nbytes, const void * seed)
+{
+       int err = prng_fns->reseed(ctx, nbytes, seed);
+       if (err == CCKPRNG_ABORT) {
+               panic("prng_reseed");
+       }
+       return err;
+}
+#endif
+
+static int
+prng_addentropy(cckprng_ctx_t ctx, size_t nbytes, const void * entropy)
+{
+       int err = prng_fns->addentropy(ctx, nbytes, entropy);
+       if (err == CCKPRNG_ABORT) {
+               panic("prng_addentropy");
+       }
+       return err;
+}
+
+static int
+prng_generate(cckprng_ctx_t ctx, size_t nbytes, void * out)
+{
+       int err = prng_fns->generate(ctx, nbytes, out);
+       if (err == CCKPRNG_ABORT) {
+               panic("prng_generate");
+       }
+       return err;
+}
+
+entropy_data_t EntropyData = {.index_ptr = EntropyData.buffer};
+
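+/*
+ * State backing early_random(): per-source seed material, a statically
+ * allocated DRBG state for the boot processor, dynamically allocated DRBG
+ * states for the other processors, and the NIST HMAC DRBG configuration
+ * (SHA-1, non-strict FIPS).
+ */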
+static struct {
+       uint8_t seed[nsources][EARLY_RANDOM_SEED_SIZE];
+       int seedset;
+       uint8_t master_drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE];
+       struct ccdrbg_state * drbg_states[MAX_CPUS];
+       struct ccdrbg_info drbg_info;
+       const struct ccdrbg_nisthmac_custom drbg_custom;
+} erandom = {.drbg_custom = {
+                 .di         = &ccsha1_eay_di,
+                 .strictFIPS = 0,
+             }};
+
+static void read_erandom(void * buf, uint32_t nbytes);
+
+void
+entropy_buffer_read(void * buffer, size_t * count)
+{
+       boolean_t current_state;
+       unsigned int i, j;
+
+       if (!erandom.seedset) {
+               panic("early_random was never invoked");
+       }
+
+       if (*count > ENTROPY_BUFFER_BYTE_SIZE) {
+               *count = ENTROPY_BUFFER_BYTE_SIZE;
+       }
+
+       current_state = ml_set_interrupts_enabled(FALSE);
+
+       memcpy(buffer, EntropyData.buffer, *count);
+
+       /* Consider removing this mixing step rdar://problem/31668239 */
+       for (i = 0, j = (ENTROPY_BUFFER_SIZE - 1); i < ENTROPY_BUFFER_SIZE; j = i, i++)
+               EntropyData.buffer[i] = EntropyData.buffer[i] ^ EntropyData.buffer[j];
+
+       (void)ml_set_interrupts_enabled(current_state);
+
+#if DEVELOPMENT || DEBUG
+       uint32_t * word = buffer;
+       /* Good for both 32-bit and 64-bit kernels. */
+       for (i = 0; i < ENTROPY_BUFFER_SIZE; i += 4)
+               /*
+                * We use "EARLY" here so that we can grab early entropy on
+                * ARM, where tracing is not started until after PRNG is
+                * initialized.
+                */
+               KERNEL_DEBUG_EARLY(ENTROPY_READ(i / 4), word[i + 0], word[i + 1], word[i + 2], word[i + 3]);
+#endif
+}
+
+/*
+ * Return a uniformly distributed 64-bit random number.
+ *
+ * This interface should have minimal dependencies on kernel
+ * services, and thus be available very early in the life
+ * of the kernel.
+ * This provides cryptographically secure randomness.
+ * Each processor has its own generator instance.
+ * It is seeded (lazily) with entropy provided by the Booter.
+ *
+ * For <rdar://problem/17292592> the algorithm switched from LCG to
+ * NIST HMAC DRBG as follows:
+ *  - When first called (on OSX this is very early while page tables are being
+ *    built) early_random() calls ccdrbg_factory_nisthmac() to set up a ccdrbg
+ *    info structure.
+ *  - The boot processor's ccdrbg state structure is a statically allocated area
+ *    which is then initialized by calling the ccdrbg_init method.
+ *    The initial entropy is 16 bytes of boot entropy.
+ *    The nonce is a timestamp from ml_get_timebase().
+ *    The personalization data is the cpu number (0 for the boot processor).
+ *  - The first 64-bit random value is returned on the boot processor from
+ *    an invocation of the ccdrbg_generate method.
+ *  - Non-boot processors' DRBG state structures are allocated dynamically
+ *    from early_random_cpu_init(). Each is initialized with the same 16 bytes
+ *    of entropy, but with a different timestamped nonce and cpu number as
+ *    personalization.
+ *  - Subsequent calls to early_random() pass through to read_erandom() to
+ *    generate an 8-byte random value.  read_erandom() ensures that pre-emption
+ *    is disabled and selects the DRBG state of the current processor.
+ *    The ccdrbg_generate method is called for the required random output.
+ *    If this method returns CCDRBG_STATUS_NEED_RESEED, the erandom.seed buffer
+ *    is re-filled with kernel-harvested entropy and the ccdrbg_reseed method is
+ *    called with this new entropy. The kernel panics if a reseed fails.
+ */
+uint64_t
+early_random(void)
+{
+       uint32_t cnt = 0;
+       uint64_t result;
+       uint64_t nonce;
+       int rc;
+       int ps;
+       struct ccdrbg_state * state;
+
+       if (!erandom.seedset) {
+               erandom.seedset = 1;
+               cnt             = PE_get_random_seed((unsigned char *)EntropyData.buffer, sizeof(EntropyData.buffer));
+
+               if (cnt < sizeof(EntropyData.buffer)) {
+                       /*
+                        * Insufficient entropy is fatal.  We must fill the
+                        * entire entropy buffer during initialization.
+                        */
+                       panic("EntropyData needed %lu bytes, but got %u.\n", sizeof(EntropyData.buffer), cnt);
+               }
+
+               entropy_readall(&erandom.seed, EARLY_RANDOM_SEED_SIZE);
+
+               /* Init DRBG for NIST HMAC */
+               ccdrbg_factory_nisthmac(&erandom.drbg_info, &erandom.drbg_custom);
+               assert(erandom.drbg_info.size <= sizeof(erandom.master_drbg_state));
+               state                           = (struct ccdrbg_state *)erandom.master_drbg_state;
+               erandom.drbg_states[master_cpu] = state;
+
+               /*
+                * Init our DRBG from the boot entropy and a timestamp as nonce
+                * and the cpu number as personalization.
+                */
+               assert(sizeof(erandom.seed) > sizeof(nonce));
+               nonce = ml_get_timebase();
+               ps    = 0; /* boot cpu */
+               rc    = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(ps), &ps);
+               cc_clear(sizeof(nonce), &nonce);
+               if (rc != CCDRBG_STATUS_OK)
+                       panic("ccdrbg_init() returned %d", rc);
+
+               /* Generate output */
+               rc = ccdrbg_generate(&erandom.drbg_info, state, sizeof(result), &result, 0, NULL);
+               if (rc != CCDRBG_STATUS_OK)
+                       panic("ccdrbg_generate() returned %d", rc);
+
+               return result;
+       };
+
+       read_erandom(&result, sizeof(result));
+
+       return result;
+}
+
+static void
+read_erandom(void * buffer, u_int numBytes)
+{
+       int cpu;
+       int rc;
+       size_t nbytes;
+       struct ccdrbg_state * state;
+
+       mp_disable_preemption();
+       cpu   = cpu_number();
+       state = erandom.drbg_states[cpu];
+       assert(state);
+       for (;;) {
+               /* Generate output */
+               rc = ccdrbg_generate(&erandom.drbg_info, state, numBytes, buffer, 0, NULL);
+               if (rc == CCDRBG_STATUS_OK)
+                       break;
+               if (rc == CCDRBG_STATUS_NEED_RESEED) {
+                       /* It's time to reseed. Get more entropy */
+                       nbytes = entropy_readall(erandom.seed, EARLY_RANDOM_SEED_SIZE);
+                       assert(nbytes >= EARLY_RANDOM_SEED_SIZE);
+                       rc = ccdrbg_reseed(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, 0, NULL);
+                       cc_clear(sizeof(erandom.seed), erandom.seed);
+                       if (rc == CCDRBG_STATUS_OK)
+                               continue;
+                       panic("read_erandom reseed error %d\n", rc);
+               }
+               panic("read_erandom ccdrbg error %d\n", rc);
+       }
+       mp_enable_preemption();
+}
+
+void
+read_frandom(void * buffer, u_int numBytes)
+{
+       uint8_t * buffer_bytes = buffer;
+       int nbytes;
+
+       /*
+        * Split up into requests for blocks smaller than
+        * the DRBG request limit. This limit is private, but
+        * for NISTHMAC it is known to be greater than 4096.
+        */
+       while (numBytes) {
+               nbytes = MIN(numBytes, PAGE_SIZE);
+               read_erandom(buffer_bytes, nbytes);
+               buffer_bytes += nbytes;
+               numBytes -= nbytes;
+       }
+}
+
+void
+early_random_cpu_init(int cpu)
+{
+       uint64_t nonce;
+       int rc;
+       struct ccdrbg_state * state;
+
+       /*
+        * Allocate and initialize the DRBG state for early_random()
+        * on this processor.
+        */
+       assert(cpu != master_cpu);
+       assert(erandom.drbg_states[cpu] == NULL);
+
+       state = kalloc(erandom.drbg_info.size);
+       if (state == NULL) {
+               panic("prng_init kalloc failed\n");
+       }
+       erandom.drbg_states[cpu] = state;
+
+       /*
+        * Init our DRBG from the boot entropy, a timestamp as nonce,
+        * and the cpu number as the personalization parameter.
+        */
+       nonce = ml_get_timebase();
+       rc    = ccdrbg_init(&erandom.drbg_info, state, sizeof(erandom.seed), erandom.seed, sizeof(nonce), &nonce, sizeof(cpu), &cpu);
+       cc_clear(sizeof(nonce), &nonce);
+       if (rc != CCDRBG_STATUS_OK)
+               panic("ccdrbg_init() returned %d", rc);
+}
+
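+/*
+ * Register the kernel PRNG operations and perform the initial seeding from
+ * all entropy sources.  Called once, on the master cpu, before read_random()
+ * can be used.
+ */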
+void
+register_and_init_prng(prng_fns_t fns)
+{
+       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
+       size_t nbytes;
+
+       assert(cpu_number() == master_cpu);
+       assert(prng_fns == NULL);
+
+       prng_fns = fns;
+
+       /* make a mutex to control access */
+       prng.lock.group_attrs = lck_grp_attr_alloc_init();
+       prng.lock.group       = lck_grp_alloc_init("random", prng.lock.group_attrs);
+       prng.lock.attrs       = lck_attr_alloc_init();
+       prng.lock.mutex       = lck_mtx_alloc_init(prng.lock.group, prng.lock.attrs);
+
+       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
+       (void)prng_init(&prng.ctx, nbytes, buf);
+       cc_clear(sizeof(buf), buf);
+}
+
+static void
+Reseed(void)
+{
+       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
+       size_t nbytes;
+
+       lck_mtx_assert(prng.lock.mutex, LCK_MTX_ASSERT_OWNED);
+
+       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
+       PRNG_CCKPRNG((void)prng_addentropy(&prng.ctx, nbytes, buf));
+       cc_clear(sizeof(buf), buf);
+}
+
+/* export good random numbers to the rest of the kernel */
+void
+read_random(void * buffer, u_int numbytes)
+{
+       int err;
+
+       lck_mtx_lock(prng.lock.mutex);
+
+       /*
+        * Call PRNG, reseeding and retrying if requested.
+        */
+       for (;;) {
+               PRNG_CCKPRNG(err = prng_generate(&prng.ctx, numbytes, buffer));
+               if (err == CCKPRNG_OK)
+                       break;
+               if (err == CCKPRNG_NEED_ENTROPY) {
+                       Reseed();
+                       continue;
+               }
+               panic("read_random() error %d\n", err);
+       }
+
+       lck_mtx_unlock(prng.lock.mutex);
+}
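+
+/*
+ * Hypothetical usage sketch (not part of this file): a kernel subsystem that
+ * needs key material would typically do
+ *
+ *     uint8_t key[32];
+ *     read_random(key, sizeof(key));
+ *
+ * while read_frandom()/early_random() serve callers that run before the
+ * kernel PRNG is registered or that cannot take the PRNG mutex.
+ */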
+
+int
+write_random(void * buffer, u_int numbytes)
+{
+#if PERMIT_WRITE_RANDOM
+       int err;
+
+       lck_mtx_lock(prng.lock.mutex);
+       err = prng_reseed(&prng.ctx, numbytes, buffer);
+       lck_mtx_unlock(prng.lock.mutex);
+
+       return err ? EIO : 0;
+#else
+#pragma unused(buffer, numbytes)
+       return 0;
+#endif
+}
+
+/*
+ * Boolean PRNG for generating booleans to randomize order of elements
+ * in certain kernel data structures. The algorithm is a
+ * modified version of the KISS RNG proposed in the paper:
+ * http://stat.fsu.edu/techreports/M802.pdf
+ * The modifications have been documented in the technical paper
+ * paper from UCL:
+ * http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
+ */
+
+/* Initialize the PRNG structures. */
+void
+random_bool_init(struct bool_gen * bg)
+{
+       /* Seed the random boolean generator */
+       for (int i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) {
+               bg->seed[i] = (unsigned int)early_random();
+       }
+       bg->state = 0;
+       simple_lock_init(&bg->lock, 0);
+}
+
+/* Generate random bits and add them to an entropy pool. */
+void
+random_bool_gen_entropy(struct bool_gen * bg, unsigned int * buffer, int count)
+{
+       simple_lock(&bg->lock);
+       int i, t;
+       for (i = 0; i < count; i++) {
+               /* xorshift sub-generator */
+               bg->seed[1] ^= (bg->seed[1] << 5);
+               bg->seed[1] ^= (bg->seed[1] >> 7);
+               bg->seed[1] ^= (bg->seed[1] << 22);
+               /* add-with-carry sub-generator; bg->state holds the carry */
+               t           = bg->seed[2] + bg->seed[3] + bg->state;
+               bg->seed[2] = bg->seed[3];
+               bg->state   = t < 0;
+               bg->seed[3] = t & 2147483647;
+               /* constant-increment (Weyl) sub-generator */
+               bg->seed[0] += 1411392427;
+               /* combine the three sub-generators */
+               buffer[i] = (bg->seed[0] + bg->seed[1] + bg->seed[3]);
+       }
+       simple_unlock(&bg->lock);
+}
+
+/* Get some number of bits from the entropy pool, refilling if necessary. */
+unsigned int
+random_bool_gen_bits(struct bool_gen * bg, unsigned int * buffer, unsigned int count, unsigned int numbits)
+{
+       unsigned int index = 0;
+       unsigned int rbits = 0;
+       for (unsigned int bitct = 0; bitct < numbits; bitct++) {
+               /*
+                * Find a portion of the buffer that hasn't been emptied.
+                * We might have emptied our last index in the previous iteration.
+                */
+               while (index < count && buffer[index] == 0)
+                       index++;
+
+               /* If we've exhausted the pool, refill it. */
+               if (index == count) {
+                       random_bool_gen_entropy(bg, buffer, count);
+                       index = 0;
+               }
+
+               /* Collect-a-bit */
+               unsigned int bit = buffer[index] & 1;
+               buffer[index]    = buffer[index] >> 1;
+               rbits            = bit | (rbits << 1);
+       }
+       return rbits;
+}
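+
+/*
+ * Hypothetical usage sketch (not part of this file): to pick one of eight
+ * buckets at random, a caller holding a small pool might do
+ *
+ *     unsigned int pool[4];
+ *     random_bool_gen_entropy(bg, pool, 4);
+ *     unsigned int bucket = random_bool_gen_bits(bg, pool, 4, 3);
+ *
+ * which returns 3 random bits, i.e. a value in 0..7.
+ */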
diff --git a/osfmk/prng/prng_yarrow.c b/osfmk/prng/prng_yarrow.c
deleted file mode 100644 (file)
index b5f4144..0000000
+++ /dev/null
@@ -1,413 +0,0 @@
-/*
- * Copyright (c) 1999-2013 Apple, Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-
-#include <string.h>
-#include <kern/cpu_number.h>
-#include <kern/cpu_data.h>
-#include <kern/misc_protos.h>
-#include <kern/thread.h>
-#include <sys/random.h>
-
-#include <corecrypto/ccdrbg.h>
-
-#include <prng/YarrowCoreLib/include/yarrow.h>
-
-#include <libkern/OSByteOrder.h>
-#include <libkern/OSAtomic.h>
-
-#include <mach/mach_time.h>
-
-#include <prng/random.h>
-
-#include "fips_sha1.h"
-
-
-/*
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-       
-       THIS FILE IS NEEDED TO PASS FIPS ACCEPTANCE FOR THE RANDOM NUMBER GENERATOR.
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THOUGH FIPS ACCEPTANCE AGAIN,
-       IF YOU ALTER IT IN ANY WAY, WE WILL NEED TO GO THROUGH FIPS ACCEPTANCE AGAIN,
-       DON'T MESS WITH THIS FILE.
-
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-*/
-/*
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-
-       ANY CODE PROTECTED UNDER "#ifdef __arm__" IS SERIOUSLY SUPPOSED TO BE THERE!
-       IF YOU REMOVE ARM CODE, RANDOM WILL NOT MEAN ANYTHING FOR iPHONES ALL OVER.
-       PLEASE DON'T TOUCH __arm__ CODE IN THIS FILE!
-
-       WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!
-*/
-
-
-#define RESEED_TICKS 50 /* how long a reseed operation can take */
-
-
-typedef u_int8_t BlockWord;
-enum {kBSize = 20};
-typedef BlockWord Block[kBSize];
-enum {kBlockSize = sizeof(Block)};
-
-struct YarrowContext {
-       PrngRef         PrngRef;
-       Block           xkey;
-       Block           random_data;
-       int             bytes_used;
-       unsigned char   SelfTestInitialized;
-       u_int32_t       LastBlockChecksum;
-       uint64_t        bytes_since_reseed;
-};
-typedef struct YarrowContext *YarrowContextp;
-
-/* define prototypes to keep the compiler happy... */
-
-void add_blocks(Block a, Block b, BlockWord carry);
-void fips_initialize(YarrowContextp yp);
-void random_block(YarrowContextp yp, Block b, int addOptional);
-u_int32_t CalculateCRC(u_int8_t* buffer, size_t length);
-
-/*
- * Get 120 bits from yarrow
- */
-
-/*
- * add block b to block a
- */
-void
-add_blocks(Block a, Block b, BlockWord carry)
-{
-       int i = kBlockSize - 1;
-       while (i >= 0)
-       {
-               u_int32_t c = (u_int32_t)carry +
-                                         (u_int32_t)a[i] +
-                                         (u_int32_t)b[i];
-               a[i] = c & 0xff;
-               carry = c >> 8;
-               i -= 1;
-       }
-}
-
-
-
-static char zeros[(512 - kBSize * 8) / 8];
-
-static const u_int32_t g_crc_table[] =
-{
-       0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
-       0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
-       0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
-       0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
-       0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
-       0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
-       0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
-       0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
-       0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
-       0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
-       0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
-       0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
-       0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
-       0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
-       0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
-       0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
-       0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
-       0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
-       0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
-       0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
-       0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
-       0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
-       0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
-       0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
-       0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
-       0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
-       0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
-       0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
-       0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
-       0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
-       0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
-       0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D,
-};
-
-/*
- * Setup for fips compliance
- */
-
-/*
- * calculate a crc-32 checksum
- */
-u_int32_t CalculateCRC(u_int8_t* buffer, size_t length)
-{
-       u_int32_t crc = 0;
-       
-       size_t i;
-       for (i = 0; i < length; ++i)
-       {
-               u_int32_t temp = (crc ^ ((u_int32_t) buffer[i])) & 0xFF;
-               crc = (crc >> 8) ^ g_crc_table[temp];
-       }
-       
-       return crc;
-}
-
-/*
- * get a random block of data per fips 186-2
- */
-void
-random_block(YarrowContextp pp, Block b, int addOptional)
-{
-       SHA1_CTX sha1_ctx;
-
-       int repeatCount = 0;
-       do
-       {
-               // do one iteration
-               
-               if (addOptional)
-               {
-                       // create an xSeed to add.
-                       Block xSeed;
-                       prngOutput (pp->PrngRef, (BYTE*) &xSeed, sizeof (xSeed));
-                       
-                       // add the seed to the previous value of xkey
-                       add_blocks (pp->xkey, xSeed, 0);
-               }
-               
-               // initialize the value of H
-               FIPS_SHA1Init(&sha1_ctx);
-               
-               // to stay compatible with the FIPS specification, we need to flip the bytes in
-               // xkey to little endian byte order.  In our case, this makes exactly no difference
-               // (random is random), but we need to do it anyway to keep FIPS happy
-               
-               // compute "G"
-               FIPS_SHA1Update(&sha1_ctx, pp->xkey, kBlockSize);
-               
-               // add zeros to fill the internal SHA-1 buffer
-               FIPS_SHA1Update (&sha1_ctx, (const u_int8_t *)zeros, sizeof (zeros));
-               
-               // we have to do a byte order correction here because the sha1 math is being done internally
-               // as u_int32_t, not a stream of bytes.  Since we maintain our data as a byte stream, we need
-               // to convert
-               
-               u_int32_t* finger = (u_int32_t*) b;
-               
-               unsigned j;
-               for (j = 0; j < kBlockSize / sizeof (u_int32_t); ++j)
-               {
-                       *finger++ = OSSwapHostToBigInt32(sha1_ctx.h.b32[j]);
-               }               
-               
-               // calculate the CRC-32 of the block
-               u_int32_t new_crc = CalculateCRC(sha1_ctx.h.b8, sizeof (Block));
-               
-               // make sure we don't repeat
-               int cmp = new_crc == pp->LastBlockChecksum;
-               pp->LastBlockChecksum = new_crc;
-               if (!pp->SelfTestInitialized)
-               {
-                       pp->SelfTestInitialized = 1;
-                       return;
-               }
-               else if (!cmp)
-               {
-                       return;
-               }
-               
-               repeatCount += 1;
-               
-               // fix up the next value of xkey
-               add_blocks (pp->xkey, b, 1);
-       } while (repeatCount < 2);
-       
-       /*
-        * If we got here, three successive checksums of the random number
-        * generator have been the same.  Since the odds of this happening are
-        * 1 in 18,446,744,073,709,551,616 (1 in 18 quintillion), one of the following has
-        * most likely happened:
-        *
-        * 1: There is a significant bug in this code.
-        * 2: There has been a massive system failure.
-        * 3: The universe has ceased to exist.
-        *
-        * There is no good way to recover from any of these cases. We
-        * therefore panic.
-        */
-        
-        panic("FIPS random self-test failed.");
-}
-
-const Block kKnownAnswer = {0x92, 0xb4, 0x04, 0xe5, 0x56, 0x58, 0x8c, 0xed, 0x6c, 0x1a, 0xcd, 0x4e, 0xbf, 0x05, 0x3f, 0x68, 0x09, 0xf7, 0x3a, 0x93};
-
-void
-fips_initialize(YarrowContextp yp)
-{
-       /* So that we can do the self test, set the seed to zero */
-       memset(&yp->xkey, 0, sizeof(yp->xkey));
-       
-       /* other initializations */
-       memset (zeros, 0, sizeof (zeros));
-       yp->bytes_used = 0;
-       random_block(yp, yp->random_data, FALSE);
-       
-       // check here to see if we got the initial data we were expecting
-       if (memcmp(kKnownAnswer, yp->random_data, kBlockSize) != 0)
-       {
-               panic("FIPS random self test failed");
-       }
-       
-       // now do the random block again to make sure that userland doesn't get predictable data
-       random_block(yp, yp->random_data, TRUE);
-}
-
-
-static int
-yarrow_init(
-       const struct ccdrbg_info *info,
-       struct ccdrbg_state *drbg,
-       unsigned long entropyLength, const void* entropy,
-       unsigned long nonceLength, const void* nonce,
-       unsigned long psLength, const void* ps)
-{
-#pragma unused(info)
-#pragma unused(nonceLength)
-#pragma unused(nonce)
-#pragma unused(psLength)
-#pragma unused(ps)
-       YarrowContextp          yp = (YarrowContextp) drbg;
-       prng_error_status       perr;
-       char                    buffer[16];
-
-       yp->SelfTestInitialized = 0;
-
-       /* create a Yarrow object */
-       perr = prngInitialize(&yp->PrngRef);
-       if (perr != 0) {
-               panic("Couldn't initialize Yarrow, /dev/random will not work.");
-       }
-
-       perr = prngInput(yp->PrngRef, __DECONST(BYTE*, entropy), (UINT) entropyLength,
-                       SYSTEM_SOURCE, (UINT) entropyLength * 8);
-       if (perr != 0) {
-               /* an error, complain */
-               panic("Couldn't seed Yarrow.\n");
-       }
-
-       /* turn the data around */
-       perr = prngOutput(yp->PrngRef, (BYTE*) buffer, (UINT) sizeof(buffer));
-
-       /* and scramble it some more */
-       perr = prngForceReseed(yp->PrngRef, RESEED_TICKS);
-
-       fips_initialize(yp);
-
-       yp->bytes_since_reseed = 0;
-
-       return perr;
-}
-
-static int
-yarrow_generate(
-       struct ccdrbg_state *prng,
-       unsigned long outlen, void *out,
-       unsigned long inlen, const void *in)
-{
-#pragma unused(inlen)
-#pragma unused(in)
-       YarrowContextp  yp = (YarrowContextp) prng;
-       int             bytes_read = 0;
-       int             bytes_remaining = (int) outlen;
-
-       yp->bytes_since_reseed += outlen;
-       /* Reseed needed? But allow any length immediately after reseeding. */
-       if (yp->bytes_since_reseed != outlen &&
-           yp->bytes_since_reseed > RESEED_BYTES)
-               return CCDRBG_STATUS_NEED_RESEED;
-       
-       while (bytes_remaining > 0) {
-               int bytes_to_read = MIN(bytes_remaining,
-                                       kBlockSize - yp->bytes_used);
-               if (bytes_to_read == 0) {
-                       random_block(yp, yp->random_data, TRUE);
-                       yp->bytes_used = 0;
-                       bytes_to_read = MIN(bytes_remaining, kBlockSize);
-               }
-               
-               memmove((u_int8_t*) out + bytes_read,
-                       ((u_int8_t*)yp->random_data) + yp->bytes_used,
-                       bytes_to_read);
-               yp->bytes_used += bytes_to_read;
-               bytes_read += bytes_to_read;
-               bytes_remaining -= bytes_to_read;
-       }
-
-       return CCDRBG_STATUS_OK;
-}
-
-static int
-yarrow_reseed(
-       struct ccdrbg_state *prng,
-       unsigned long entropylen, const void *entropy,
-       unsigned long inlen, const void *in)
-{
-#pragma unused(inlen)
-#pragma unused(in)
-       YarrowContextp  yp = (YarrowContextp) prng;
-
-       (void) prngInput(yp->PrngRef, __DECONST(BYTE*, entropy), (UINT) entropylen,
-                        SYSTEM_SOURCE, (UINT) entropylen * 8);
-       (void) prngForceReseed(yp->PrngRef, RESEED_TICKS);
-
-       yp->bytes_since_reseed = 0;
-
-       return CCDRBG_STATUS_OK;
-}
-
-static void
-yarrow_destroy(
-       struct ccdrbg_state *prng)
-{
-#pragma unused(prng)
-}
-
-
-void
-ccdrbg_factory_yarrow(
-       struct ccdrbg_info      *info,
-       const void              *custom)
-{
-       info->size = sizeof(struct YarrowContext);
-       info->init = yarrow_init;
-       info->generate = yarrow_generate;
-       info->reseed = yarrow_reseed;
-       info->done = yarrow_destroy;
-       info->custom = custom;
-}
diff --git a/osfmk/prng/random.c b/osfmk/prng/random.c
deleted file mode 100644 (file)
index 5dc056f..0000000
+++ /dev/null
@@ -1,704 +0,0 @@
-/*
- * Copyright (c) 2013 Apple Inc. All rights reserved.
- *
- * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
- * This file contains Original Code and/or Modifications of Original Code
- * as defined in and that are subject to the Apple Public Source License
- * Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. The rights granted to you under the License
- * may not be used to create, or enable the creation or redistribution of,
- * unlawful or unlicensed copies of an Apple operating system, or to
- * circumvent, violate, or enable the circumvention or violation of, any
- * terms of an Apple operating system software license agreement.
- * 
- * Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
- * The Original Code and all software distributed under the License are
- * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
- * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
- * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
- * Please see the License for the specific language governing rights and
- * limitations under the License.
- * 
- * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
- */
-
-#include <mach/machine.h>
-#include <mach/processor.h>
-#include <kern/processor.h>
-#include <kern/cpu_data.h>
-#include <kern/cpu_number.h>
-#include <kern/kalloc.h>
-#include <kern/machine.h>
-#include <kern/misc_protos.h>
-#include <kern/startup.h>
-#include <kern/sched.h>
-#include <kern/thread.h>
-#include <kern/thread_call.h>
-#include <machine/cpu_data.h>
-#include <machine/simple_lock.h>
-#include <vm/pmap.h>
-#include <vm/vm_page.h>
-#include <sys/kdebug.h>
-#include <sys/random.h>
-
-#include <prng/random.h>
-#include <corecrypto/ccdrbg.h>
-#include <corecrypto/ccsha1.h>
-#include <corecrypto/ccdigest.h>
-#include <corecrypto/ccsha2.h>
-
-#include <pexpert/pexpert.h>
-#include <console/serial_protos.h>
-#include <IOKit/IOPlatformExpert.h>
-
-#if defined(__x86_64__)
-#include <i386/cpuid.h>
-
-static int rdseed_step(uint64_t *seed)
-{
-       uint8_t ok;
-       
-       asm volatile ("rdseed %0; setc %1" : "=r" (*seed), "=qm" (ok));
-       
-       return (int) ok;
-}
-
-static int rdseed_retry(uint64_t *seed, size_t nretries)
-{
-       size_t i;
-       
-       for (i = 0; i < nretries; i += 1) {
-               if (rdseed_step(seed)) {
-                       return 1;
-               } else {
-                       asm volatile ("pause");
-               }
-       }
-       
-       return 0;
-}
-
-static size_t rdseed_seed(void *buf, size_t nwords)
-{
-       uint64_t *buf_words;
-       size_t i;
-       
-       if (nwords > 8) {
-               nwords = 8;
-       }
-       
-       buf_words = buf;
-       for (i = 0; i < nwords; i += 1) {
-               if (!rdseed_retry(buf_words + i, 10)) {
-                       return i;
-               }
-       }
-       
-       return nwords;
-}
-
-static int rdrand_step(uint64_t *rand)
-{
-       uint8_t ok;
-       
-       asm volatile ("rdrand %0; setc %1" : "=r" (*rand), "=qm" (ok));
-       
-       return (int) ok;
-}
-
-static int rdrand_retry(uint64_t *rand, size_t nretries)
-{
-       size_t i;
-       
-       for (i = 0; i < nretries; i += 1) {
-               if (rdrand_step(rand)) {
-                       return 1;
-               }
-       }
-       
-       return 0;
-}
-
-static size_t rdrand_seed(void *buf, size_t nwords)
-{
-       size_t i;
-       uint64_t w;
-       uint8_t hash[CCSHA256_OUTPUT_SIZE];
-       const struct ccdigest_info *di = &ccsha256_ltc_di;
-       
-       ccdigest_di_decl(di, ctx);
-       ccdigest_init(di, ctx);
-       
-       for (i = 0; i < 1023; i += 1) {
-               if (!rdrand_retry(&w, 10)) {
-                       nwords = 0;
-                       goto out;
-               }
-               ccdigest_update(di, ctx, sizeof w, &w);
-       }
-       
-       ccdigest_final(di, ctx, hash);
-       
-       if (nwords > 2) {
-               nwords = 2;
-       }
-       
-       memcpy(buf, hash, nwords * sizeof (uint64_t));
-       
-out:
-       ccdigest_di_clear(di, ctx);
-       bzero(hash, sizeof hash);
-       bzero(&w, sizeof w);
-       
-       return nwords;
-}
-
-static void intel_entropysource(void *buf, size_t *nbytes)
-{
-       size_t nwords;
-       
-       /* only handle complete words */
-       assert(*nbytes % sizeof (uint64_t) == 0);
-       
-       nwords = (*nbytes) / sizeof (uint64_t);
-       if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_RDSEED) {
-               nwords = rdseed_seed(buf, nwords);
-               *nbytes = nwords * sizeof (uint64_t);
-       } else if (cpuid_features() & CPUID_FEATURE_RDRAND) {
-               nwords = rdrand_seed(buf, nwords);
-               *nbytes = nwords * sizeof (uint64_t);
-       } else {
-               *nbytes = 0;
-       }
-}
-
-#endif
-
-typedef void (*entropysource)(void *buf, size_t *nbytes);
-
-static const entropysource entropysources[] = {
-       entropy_buffer_read,
-#if defined(__x86_64__)
-       intel_entropysource,
-#endif
-};
-
-static const size_t nsources = sizeof entropysources / sizeof entropysources[0];
-
-static size_t entropy_readall(void *buf, size_t nbytes_persource)
-{
-       uint8_t *buf_bytes = buf;
-       size_t i;
-       size_t nbytes_total = 0;
-       
-       for (i = 0; i < nsources; i += 1) {
-               size_t nbytes = nbytes_persource;
-               entropysources[i](buf_bytes, &nbytes);
-               bzero(buf_bytes + nbytes, nbytes_persource - nbytes);
-               nbytes_total += nbytes;
-               buf_bytes += nbytes_persource;
-       }
-       
-       return nbytes_total;
-}
-
-static struct {
-       lck_grp_t *group;
-       lck_attr_t *attrs;
-       lck_grp_attr_t *group_attrs;
-       lck_mtx_t *mutex;
-} lock;
-
-typedef struct prngContext {
-       struct ccdrbg_info *infop;
-       struct ccdrbg_state *statep;
-       uint64_t bytes_generated;
-       uint64_t bytes_reseeded;
-} *prngContextp;
-
-ccdrbg_factory_t prng_ccdrbg_factory = NULL;
-
-entropy_data_t EntropyData = {
-       .index_ptr = EntropyData.buffer
-};
-
-static struct {
-       uint8_t seed[nsources][EARLY_RANDOM_SEED_SIZE];
-       size_t seedset;
-       uint8_t master_drbg_state[EARLY_RANDOM_STATE_STATIC_SIZE];
-       struct ccdrbg_state *drbg_states[MAX_CPUS];
-       struct ccdrbg_info drbg_info;
-       const struct ccdrbg_nisthmac_custom drbg_custom;
-} erandom = {
-       .drbg_custom = {
-               .di = &ccsha1_eay_di,
-               .strictFIPS = 0,
-       }
-};
-
-static void read_erandom(void *buf, uint32_t nbytes);
-
-void 
-entropy_buffer_read(void *buffer, size_t *count)
-{
-       boolean_t current_state;
-       unsigned int i, j;
-
-       if (!erandom.seedset) {
-               panic("early_random was never invoked");
-       }
-
-       if (*count > ENTROPY_BUFFER_BYTE_SIZE) {
-               *count = ENTROPY_BUFFER_BYTE_SIZE;
-       }
-
-       current_state = ml_set_interrupts_enabled(FALSE);
-
-       memcpy(buffer, EntropyData.buffer, *count);
-
-       /* Consider removing this mixing step rdar://problem/31668239 */
-       for (i = 0, j = (ENTROPY_BUFFER_SIZE - 1); i < ENTROPY_BUFFER_SIZE; j = i, i++)
-               EntropyData.buffer[i] = EntropyData.buffer[i] ^ EntropyData.buffer[j];
-
-       (void) ml_set_interrupts_enabled(current_state);
-
-#if DEVELOPMENT || DEBUG
-       uint32_t *word = buffer;
-       /* Good for both 32-bit and 64-bit kernels. */
-       for (i = 0; i < ENTROPY_BUFFER_SIZE; i += 4)
-               /*
-                * We use "EARLY" here so that we can grab early entropy on
-                * ARM, where tracing is not started until after PRNG is
-                * initialized.
-               */
-               KERNEL_DEBUG_EARLY(ENTROPY_READ(i/4),
-                       word[i+0], word[i+1], word[i+2], word[i+3]);
-#endif
-}
-
-/*
- * Return a uniformly distributed 64-bit random number.
- *
- * This interface should have minimal dependencies on kernel
- * services, and thus be available very early in the life
- * of the kernel.
- * This provides cryptographically secure randomness.
- * Each processor has its own generator instance.
- * It is seeded (lazily) with entropy provided by the Booter.
- *
- * For <rdar://problem/17292592> the algorithm switched from LCG to
- * NIST HMAC DRBG as follows:
- *  - When first called (on OSX this is very early while page tables are being
- *    built) early_random() calls ccdrbg_factory_nisthmac() to set up a ccdrbg_info
- *    structure.
- *  - The boot processor's ccdrbg state structure is a statically allocated area
- *    which is then initialized by calling the ccdrbg_init method.
- *    The initial entropy is 16 bytes of boot entropy.
- *    The nonce is the first 8 bytes of entropy xor'ed with a timestamp
- *    from ml_get_timebase().
- *    The personalization data provided is null.
- *  - The first 64-bit random value is returned on the boot processor from
- *    an invocation of the ccdrbg_generate method.
- *  - Non-boot processors' DRBG state structures are allocated dynamically
- *    from prng_init(). Each is initialized with the same 16 bytes of entropy
- *    but with a different timestamped nonce and cpu number as personalization.
- *  - Subsequent calls to early_random() pass to read_erandom() to generate
- *    an 8-byte random value.  read_erandom() ensures that pre-emption is
- *    disabled and selects the DRBG state for the current processor.
- *    The ccdrbg_generate method is called for the required random output.
- *    If this method returns CCDRBG_STATUS_NEED_RESEED, the erandom.seed buffer
- *    is re-filled with kernel-harvested entropy and the ccdrbg_reseed method is
- *    called with this new entropy. The kernel panics if a reseed fails.
- */
-uint64_t
-early_random(void)
-{
-       uint32_t        cnt = 0;
-       uint64_t        result;
-       uint64_t        nonce;
-       int             rc;
-       int             ps;
-       struct ccdrbg_state *state;
-
-       if (!erandom.seedset) {
-               erandom.seedset = 1;
-               cnt = PE_get_random_seed((unsigned char *) EntropyData.buffer,
-                                        sizeof(EntropyData.buffer));
-
-               if (cnt < sizeof(EntropyData.buffer)) {
-                       /*
-                        * Insufficient entropy is fatal.  We must fill the
-                        * entire entropy buffer during initialization.
-                        */
-                       panic("EntropyData needed %lu bytes, but got %u.\n",
-                               sizeof(EntropyData.buffer), cnt);
-               }               
-
-               entropy_readall(&erandom.seed, EARLY_RANDOM_SEED_SIZE);
-
-               /* Init DRBG for NIST HMAC */
-               ccdrbg_factory_nisthmac(&erandom.drbg_info, &erandom.drbg_custom);
-               assert(erandom.drbg_info.size <= sizeof(erandom.master_drbg_state));
-               state = (struct ccdrbg_state *) erandom.master_drbg_state;
-               erandom.drbg_states[master_cpu] = state;
-
-               /*
-                * Init our DRBG from the boot entropy and a timestamp as nonce
-                * and the cpu number as personalization.
-                */
-               assert(sizeof(erandom.seed) > sizeof(nonce));
-               nonce = ml_get_timebase();
-               ps = 0;                         /* boot cpu */
-               rc = ccdrbg_init(&erandom.drbg_info, state,
-                                sizeof(erandom.seed), erandom.seed,
-                                sizeof(nonce), &nonce,
-                                sizeof(ps), &ps);
-               cc_clear(sizeof(nonce), &nonce);
-               if (rc != CCDRBG_STATUS_OK)
-                       panic("ccdrbg_init() returned %d", rc);
-
-               /* Generate output */
-               rc = ccdrbg_generate(&erandom.drbg_info, state,
-                                        sizeof(result), &result,
-                                        0, NULL);
-               if (rc != CCDRBG_STATUS_OK)
-                       panic("ccdrbg_generate() returned %d", rc);
-       
-               return result;
-       };
-
-       read_erandom(&result, sizeof(result));
-
-       return result;
-}
-
-static void
-read_erandom(void *buffer, u_int numBytes)
-{
-       int             cpu;
-       int             rc;
-       size_t nbytes;
-       struct ccdrbg_state *state;
-
-       mp_disable_preemption();
-       cpu = cpu_number();
-       state = erandom.drbg_states[cpu];
-       assert(state);
-       for (;;) {
-               /* Generate output */
-               rc = ccdrbg_generate(&erandom.drbg_info, state,
-                                        numBytes, buffer,
-                                        0, NULL);
-               if (rc == CCDRBG_STATUS_OK)
-                       break;
-               if (rc == CCDRBG_STATUS_NEED_RESEED) {
-                       /* It's time to reseed. Get more entropy */
-                       nbytes = entropy_readall(erandom.seed, EARLY_RANDOM_SEED_SIZE);
-                       assert(nbytes >= EARLY_RANDOM_SEED_SIZE);
-                       rc = ccdrbg_reseed(&erandom.drbg_info, state,
-                                          sizeof(erandom.seed), erandom.seed,
-                                          0, NULL);
-                       cc_clear(sizeof(erandom.seed), erandom.seed);
-                       if (rc == CCDRBG_STATUS_OK)
-                               continue;
-                       panic("read_erandom reseed error %d\n", rc);
-               }
-               panic("read_erandom ccdrbg error %d\n", rc);
-       }
-       mp_enable_preemption();
-}
-
-void
-read_frandom(void *buffer, u_int numBytes)
-{
-       uint8_t *buffer_bytes = buffer;
-       int nbytes;
-
-       /*
-        * Split up into requests for blocks smaller than
-        * the DRBG request limit. This limit is private but
-        * for NISTHMAC it's known to be greater than 4096.
-        */
-       while (numBytes) {
-               nbytes = MIN(numBytes, PAGE_SIZE);
-               read_erandom(buffer_bytes, nbytes);
-               buffer_bytes += nbytes;
-               numBytes -= nbytes;
-       }
-}
-
-/*
- * Register a DRBG factory routine to be used in constructing the kernel PRNG.
- * XXX to be called from the corecrypto kext.
- */
-void
-prng_factory_register(ccdrbg_factory_t factory)
-{
-       prng_ccdrbg_factory = factory;
-       thread_wakeup((event_t) &prng_ccdrbg_factory);
-}
-
-void
-prng_cpu_init(int cpu)
-{      
-       uint64_t        nonce;
-       int             rc;
-       struct ccdrbg_state *state;
-       prngContextp    pp;
-
-       /*
-        * Allocate state and initialize DRBG state for early_random()
-        * for this processor, if necessary.
-        */
-       if (erandom.drbg_states[cpu] == NULL) {
-               
-               state = kalloc(erandom.drbg_info.size);
-               if (state == NULL) {
-                       panic("prng_init kalloc failed\n");
-               }
-               erandom.drbg_states[cpu] = state;
-
-               /*
-                * Init our DRBG from boot entropy, nonce as timestamp
-                * and use the cpu number as the personalization parameter.
-                */
-               nonce = ml_get_timebase();
-               rc = ccdrbg_init(&erandom.drbg_info, state,
-                                sizeof(erandom.seed), erandom.seed,
-                                sizeof(nonce), &nonce,
-                                sizeof(cpu), &cpu);
-               cc_clear(sizeof(nonce), &nonce);
-               if (rc != CCDRBG_STATUS_OK)
-                       panic("ccdrbg_init() returned %d", rc);
-       }
-
-       /* Non-boot cpus use the master cpu's global context */
-       if (cpu != master_cpu) {
-               cpu_datap(cpu)->cpu_prng = master_prng_context();
-               return;
-       }
-
-       assert(lock.mutex == NULL);             /* Once only, please */
-
-       /* make a mutex to control access */
-       lock.group_attrs = lck_grp_attr_alloc_init();
-       lock.group = lck_grp_alloc_init("random", lock.group_attrs);
-       lock.attrs = lck_attr_alloc_init();
-       lock.mutex = lck_mtx_alloc_init(lock.group, lock.attrs);
-
-       pp = kalloc(sizeof(*pp));
-       if (pp == NULL)
-               panic("Unable to allocate prng context");
-       pp->bytes_generated = 0;
-       pp->bytes_reseeded = 0;
-       pp->infop = NULL;
-
-       /* XXX Temporary registration */
-       prng_factory_register(ccdrbg_factory_yarrow);
-
-       master_prng_context() = pp;
-}
-
-static struct ccdrbg_info *
-prng_infop(prngContextp pp)
-{
-       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
-       size_t nbytes;
-       
-       lck_mtx_assert(lock.mutex, LCK_MTX_ASSERT_OWNED);
-
-       /* Usual case: the info is all set */
-       if (pp->infop)
-               return pp->infop;
-
-       /*
-        * Possibly wait for the CCDRBG factory routine to be registered
-        * by corecrypto. But panic after waiting for more than 10 seconds.
-        */
-       while (prng_ccdrbg_factory == NULL ) {
-               wait_result_t   wait_result;
-               assert_wait_timeout((event_t) &prng_ccdrbg_factory, TRUE,
-                                       10, NSEC_PER_USEC);
-               lck_mtx_unlock(lock.mutex);
-               wait_result = thread_block(THREAD_CONTINUE_NULL);
-               if (wait_result == THREAD_TIMED_OUT)
-                       panic("prng_ccdrbg_factory registration timeout");
-               lck_mtx_lock(lock.mutex);
-       }
-       /* Check we didn't lose the set-up race */
-       if (pp->infop)
-               return pp->infop;
-
-       pp->infop = (struct ccdrbg_info *) kalloc(sizeof(struct ccdrbg_info));
-       if (pp->infop == NULL)
-               panic("Unable to allocate prng info");
-
-       prng_ccdrbg_factory(pp->infop, NULL);
-
-       pp->statep = kalloc(pp->infop->size);
-       if (pp->statep == NULL)
-               panic("Unable to allocate prng state");
-
-       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
-
-       (void) ccdrbg_init(pp->infop, pp->statep,
-                          nbytes, buf,
-                          0, NULL,
-                          0, NULL);
-       cc_clear(sizeof (buf), buf);
-       return pp->infop;
-}
-
-static void
-Reseed(prngContextp pp)
-{
-       uint8_t buf[nsources][ENTROPY_BUFFER_BYTE_SIZE];
-       size_t nbytes;
-       
-       nbytes = entropy_readall(buf, ENTROPY_BUFFER_BYTE_SIZE);
-
-       PRNG_CCDRBG((void) ccdrbg_reseed(pp->infop, pp->statep,
-                                        nbytes, buf,
-                                        0, NULL)); 
-
-       cc_clear(sizeof (buf), buf);
-       pp->bytes_reseeded = pp->bytes_generated;
-}
-
-
-/* export good random numbers to the rest of the kernel */
-void
-read_random(void* buffer, u_int numbytes)
-{
-       prngContextp pp;
-       struct ccdrbg_info *infop;
-       int ccdrbg_err;
-
-       lck_mtx_lock(lock.mutex);
-
-       pp = current_prng_context();
-       infop = prng_infop(pp);
-
-       /*
-        * Call DRBG, reseeding and retrying if requested.
-        */
-       for (;;) {
-               PRNG_CCDRBG(
-                       ccdrbg_err = ccdrbg_generate(infop, pp->statep,
-                                                        numbytes, buffer,
-                                                        0, NULL));
-               if (ccdrbg_err == CCDRBG_STATUS_OK)
-                       break;
-               if (ccdrbg_err == CCDRBG_STATUS_NEED_RESEED) {
-                       Reseed(pp);
-                       continue;
-               }
-               panic("read_random ccdrbg error %d\n", ccdrbg_err);
-       }
-
-       pp->bytes_generated += numbytes;
-       lck_mtx_unlock(lock.mutex);
-}
-
-int
-write_random(void* buffer, u_int numbytes)
-{
-#if 0
-       int             retval = 0;
-       prngContextp    pp;
-
-       lck_mtx_lock(lock.mutex);
-
-       pp = current_prng_context();
-
-       if (ccdrbg_reseed(prng_infop(pp), pp->statep,
-                         bytesToInput, rdBuffer, 0, NULL) != 0)
-               retval = EIO;
-
-       lck_mtx_unlock(lock.mutex);
-       return retval;
-#else
-#pragma unused(buffer, numbytes)
-       return 0;
-#endif
-}
-
-
-/*
- * Boolean PRNG for generating booleans to randomize order of elements
- * in certain kernel data structures. The algorithm is a
- * modified version of the KISS RNG proposed in the paper:
- * http://stat.fsu.edu/techreports/M802.pdf
- * The modifications have been documented in the technical paper
- * from UCL:
- * http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf
- */
-
-/* Initialize the PRNG structures. */
-void random_bool_init(struct bool_gen *bg)
-{
-       /* Seed the random boolean generator */
-       for (int i = 0; i < RANDOM_BOOL_GEN_SEED_COUNT; i++) {
-               bg->seed[i] = (unsigned int)early_random();
-       }
-       bg->state = 0;
-       simple_lock_init(&bg->lock, 0);
-}
-
-/* Generate random bits and add them to an entropy pool. */
-void random_bool_gen_entropy(
-               struct bool_gen *bg,
-               unsigned int    *buffer,
-               int             count)
-{
-
-       simple_lock(&bg->lock);
-       int i, t;
-       for (i = 0; i < count; i++) {
-               bg->seed[1] ^= (bg->seed[1] << 5);
-               bg->seed[1] ^= (bg->seed[1] >> 7);
-               bg->seed[1] ^= (bg->seed[1] << 22);
-               t = bg->seed[2] + bg->seed[3] + bg->state;
-               bg->seed[2] = bg->seed[3];
-               bg->state = t < 0;
-               bg->seed[3] = t & 2147483647;
-               bg->seed[0] += 1411392427;
-               buffer[i] = (bg->seed[0] + bg->seed[1] + bg->seed[3]);
-       }
-       simple_unlock(&bg->lock);
-}
-
-/* Get some number of bits from the entropy pool, refilling if necessary. */
-unsigned int random_bool_gen_bits(
-               struct bool_gen *bg,
-               unsigned int    *buffer,
-               unsigned int    count,
-               unsigned int    numbits)
-{
-       unsigned int index = 0;
-       unsigned int rbits = 0;
-       for (unsigned int bitct = 0; bitct < numbits; bitct++) {
-               /*
-                * Find a portion of the buffer that hasn't been emptied.
-                * We might have emptied our last index in the previous iteration.
-                */
-               while (index < count && buffer[index] == 0)
-                       index++;
-
-               /* If we've exhausted the pool, refill it. */
-               if (index == count) {
-                       random_bool_gen_entropy(bg, buffer, count);
-                       index = 0;
-               }
-
-               /* Collect-a-bit */
-               unsigned int bit = buffer[index] & 1;
-               buffer[index] = buffer[index] >> 1;
-               rbits = bit | (rbits << 1);
-       }
-       return rbits;
-}
index 7ba5f00e18d68c117d13631c8ba0c13bd3e92f11..a49b6c7301dc3c91d6c6e9c158cf1be81a0d48b8 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2013 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 
-#ifndef        _PRNG_RANDOM_H_
-#define        _PRNG_RANDOM_H_
+#ifndef _PRNG_RANDOM_H_
+#define _PRNG_RANDOM_H_
 
 __BEGIN_DECLS
 
@@ -42,46 +42,24 @@ typedef struct entropy_data {
         * TODO: Should index_ptr be volatile?  Are we exposed to any races that
         * we care about if it is not?
         */
-       uint32_t *index_ptr;
+       uint32_t * index_ptr;
        uint32_t buffer[ENTROPY_BUFFER_SIZE];
 } entropy_data_t;
 
 extern entropy_data_t EntropyData;
 
 /* Trace codes for DBG_SEC_KERNEL: */
-#define ENTROPY_READ(n)        SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 3 */
+#define ENTROPY_READ(n) SECURITYDBG_CODE(DBG_SEC_KERNEL, n) /* n: 0 .. 3 */
 
 /*
  * Early_random implementation params: */
-#define        EARLY_RANDOM_SEED_SIZE (16)
-#define        EARLY_RANDOM_STATE_STATIC_SIZE (264)
-
-#if defined (__x86_64__)
-#define current_prng_context() (current_cpu_datap()->cpu_prng)
-#define master_prng_context()  (cpu_datap(master_cpu)->cpu_prng)
-#elif defined (__arm__) || defined(__arm64__)
-#include <arm/cpu_data_internal.h>             // For MAX_CPUS
-#define current_prng_context()  (getCpuDatap()->cpu_prng)
-#define master_prng_context()  (cpu_datap(master_cpu)->cpu_prng)
-#else
-#error architecture unknown
-#endif
+#define EARLY_RANDOM_SEED_SIZE (16)
+#define EARLY_RANDOM_STATE_STATIC_SIZE (264)
 
-#include <corecrypto/ccdrbg.h>
-#include <corecrypto/ccsha1.h>
-
-typedef void (*ccdrbg_factory_t)(struct ccdrbg_info *info, const void *custom);
-
-extern void    ccdrbg_factory_yarrow(struct ccdrbg_info *info, const void *custom);
-
-void prng_factory_register(ccdrbg_factory_t factory);
-void prng_cpu_init(int cpu);
-
-void entropy_buffer_read(void *buffer, size_t *count);
-void entropy_boot_trace(void);
+void early_random_cpu_init(int cpu);
 
 /*
- * Wrapper for requesting a CCDRBG operation.
+ * Wrapper for requesting a CCKPRNG operation.
  * This macro makes the DRBG call with pre-emption disabled to ensure that
  * any attempt to block will cause a panic. And the operation is timed and
  * cannot exceed 10msec (for development kernels).
@@ -89,31 +67,39 @@ void entropy_boot_trace(void);
  */
 #define YARROW 1
 #if YARROW
-#define PRNG_CCDRBG(op)                                        \
-MACRO_BEGIN                                            \
-       op;                                             \
-MACRO_END
+#define PRNG_CCKPRNG(op) \
+       MACRO_BEGIN          \
+       op;                  \
+       MACRO_END
 #else
-#define PRNG_CCDRBG(op)                                        \
-MACRO_BEGIN                                            \
-       uint64_t        start;                          \
-       uint64_t        stop;                           \
-       disable_preemption();                           \
-       start = mach_absolute_time();                   \
-       op;                                             \
-       stop = mach_absolute_time();                    \
-       enable_preemption();                            \
-       assert(stop - start < 10*NSEC_PER_MSEC ||       \
-              machine_timeout_suspended());            \
-       (void) start;                                   \
-       (void) stop;                                    \
-MACRO_END
+#define PRNG_CCKPRNG(op)                                                      \
+       MACRO_BEGIN                                                               \
+       uint64_t start;                                                           \
+       uint64_t stop;                                                            \
+       disable_preemption();                                                     \
+       start = mach_absolute_time();                                             \
+       op;                                                                       \
+       stop = mach_absolute_time();                                              \
+       enable_preemption();                                                      \
+       assert(stop - start < 10 * NSEC_PER_MSEC || machine_timeout_suspended()); \
+       (void)start;                                                              \
+       (void)stop;                                                               \
+       MACRO_END
 #endif
 
 #endif /* XNU_KERNEL_PRIVATE */
 
-/* /dev/random's PRNG is reseeded after generating this many bytes: */
-#define        RESEED_BYTES (17597)
+#include <corecrypto/cckprng.h>
+
+/* kernel prng */
+typedef const struct prng_fns {
+       int (*init)(cckprng_ctx_t ctx, size_t nbytes, const void * seed);
+       int (*reseed)(cckprng_ctx_t ctx, size_t nbytes, const void * seed);
+       int (*addentropy)(cckprng_ctx_t ctx, size_t nbytes, const void * entropy);
+       int (*generate)(cckprng_ctx_t ctx, size_t nbytes, void * out);
+} * prng_fns_t;
+
+void register_and_init_prng(prng_fns_t fns);
 
 #include <kern/simple_lock.h>
 /* Definitions for boolean PRNG */
@@ -124,18 +110,11 @@ struct bool_gen {
        decl_simple_lock_data(, lock)
 };
 
-extern void random_bool_init(struct bool_gen *bg);
+extern void random_bool_init(struct bool_gen * bg);
 
-extern void random_bool_gen_entropy(
-               struct bool_gen *bg,
-               unsigned int *buffer,
-               int count);
+extern void random_bool_gen_entropy(struct bool_gen * bg, unsigned int * buffer, int count);
 
-extern unsigned int random_bool_gen_bits(
-               struct bool_gen *bg,
-               unsigned int *buffer,
-               unsigned int count,
-               unsigned int numbits);
+extern unsigned int random_bool_gen_bits(struct bool_gen * bg, unsigned int * buffer, unsigned int count, unsigned int numbits);
 
 __END_DECLS
 
diff --git a/osfmk/tests/Makefile b/osfmk/tests/Makefile
new file mode 100644 (file)
index 0000000..7e3492e
--- /dev/null
@@ -0,0 +1,19 @@
+export MakeInc_cmd=${SRCROOT}/makedefs/MakeInc.cmd
+export MakeInc_def=${SRCROOT}/makedefs/MakeInc.def
+export MakeInc_rule=${SRCROOT}/makedefs/MakeInc.rule
+export MakeInc_dir=${SRCROOT}/makedefs/MakeInc.dir
+
+include $(MakeInc_cmd)
+include $(MakeInc_def)
+
+EXPORT_FILES = xnupost.h ktest.h
+
+EXPORT_MI_LIST = ${EXPORT_FILES}
+
+EXPORT_MI_DIR = tests
+
+INSTALL_KF_MI_LCL_LIST =
+INSTALL_KF_MI_LIST =
+
+include $(MakeInc_rule)
+include $(MakeInc_dir)
diff --git a/osfmk/tests/README.md b/osfmk/tests/README.md
new file mode 100644 (file)
index 0000000..34d753b
--- /dev/null
@@ -0,0 +1,125 @@
+# Kernel Power On Self Tests (POST)
+
+The test directories osfmk/tests and bsd/tests include a set of tests that run in the kernel at boot time. The primary objective of these tests is to verify the functionality of various subsystems like memory allocators, scheduling, VM, IPC ports, etc. Following are some tips and guidelines for creating and running tests.
+
+## Features:
+  * Compiled out of RELEASE kernels.
+  * Enabled with the boot-arg kernPOST [0x1: for on-desk testing, 0x3: for BATs testing].
+  * Automatically skips tests that are designed to panic the kernel during on-desk testing, but runs them in the BATs environment.
+  * Does not require a complete install on the device to run. Just the kernelcache is enough.
+  * Ability to check for assertions and the panic path as well.
+
+## How to run kernel POST
+
+  * Start usbterm and set up your target machine/device in iBoot.
+  * Set boot-args to include "```kernPOST=0x1```" to enable kernel testing on boot.
+  * Load the kernelcache using "```usb get /path/to/kc```".
+  * Boot the image with "```bootx```".
+  * Watch for nanokdp serial output with tags like "```[KTEST] <test> logs```".
+
+## How do I configure to run just test #8?
+
+Kernel POST supports configuring tests through boot-args. For example, if you want to run only test #8 (say you are tweaking it to do more testing), just set "```kernPOST_config=8```" and only your test will be run. The configuration also takes ranges, as follows:
+```
+-> kernPOST_config=1_3,5_9999  # skip test#4. Will run tests 1,2,3 and 5,6 and onwards.
+  
+-> kernPOST_config=1_3,4_9999  # will not skip anything. lower_upper are both inclusive.
+  
+```
+
+## How do I add a new test?
+Adding a new kernel POST test is very simple. Here are a few steps and guidelines for adding tests.
+
+  * There are two locations ```osfmk/tests/``` and ```bsd/tests``` where you can add tests based on your area of testing.
+  * If you wish to add a new *.c* file for your tests, use ```#include <xnupost.h>``` to pull in the required functions and macros for testing. Remember to add file_name.c to ```osfmk/conf/files``` or ```bsd/conf/files``` as
+  
+  ```osfmk/tests/my_tests.c   optional config_xnupost```
+  * To add a test function, just declare a function with a prototype such as
+  
+  ```kern_return_t my_sample_tests(void); ```
+  * And add it to the struct xnupost_test array in osfmk/tests/kernel_tests.c or bsd/tests/bsd_tests.c as
+
+```
+struct xnupost_test kernel_post_tests[] = {
+    XNUPOST_TEST_CONFIG_BASIC(my_sample_tests),       // simple test
+    XNUPOST_TEST_CONFIG_TEST_PANIC(panic_test)        // test that is expected to panic
+};
+```
+  * And you are set. Return KERN_SUCCESS to report a successful run and any other error for failure. Here is an example using some of the available macros.
+  
+```
+kern_return_t my_sample_tests() {
+    uint64_t test_begin_timestamp = 0;
+    uint64_t cur_timestamp = 0, tmp;
+
+    T_SETUPBEGIN;
+    test_begin_timestamp = mach_absolute_time();
+    T_ASSERT_NOTNULL(test_begin_timestamp, "mach_absolute_time returned 0.");
+    T_SETUPEND;
+
+    T_LOG("Testing mach_absolute_time for 100 iterations");
+    for (int i = 0; i < 100; i++) {
+        tmp = mach_absolute_time();
+        T_EXPECT_TRUE((cur_timestamp <= tmp), "Time went backwards");
+        cur_timestamp = tmp;
+    }
+
+    T_LOG("Completed mach_absolute_time tests.");
+    return KERN_SUCCESS;
+}
+```
+
+  * There are many `T_*` macros available for your convenience.
+  * **Note**: Please make sure your test does a proper cleanup of state. The kernel is expected to continue to boot after testing. If you are unable to clean up and require a reboot, then use the XNUPOST_TEST_CONFIG_TEST_PANIC type and panic at the end of the function (see the sketch below). This will make sure the test controller reboots and runs the next test in automation.
+
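+A minimal hedged sketch of such a destructive test (the test name and the corrupting operation are hypothetical; only the XNUPOST_TEST_CONFIG_TEST_PANIC registration shown above is assumed):
+
+```
+kern_return_t my_destructive_test(void); /* hypothetical; register it with XNUPOST_TEST_CONFIG_TEST_PANIC */
+
+kern_return_t my_destructive_test(void) {
+    /* ... exercise something that leaves state we cannot restore ... */
+
+    /* End in a deliberate panic so the test controller reboots the
+     * device and automation continues with the next test. */
+    panic("my_destructive_test: done, reboot required for cleanup");
+
+    return KERN_SUCCESS; /* not reached on a normal run */
+}
+```
+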
+## What is the difference between T_EXPECT and T_ASSERT macros?
+
+  * T_ASSERT macros check a condition and, upon failure, return with KERN_FAILURE. This ensures that no further test code is executed.
+  * T_EXPECT will just report the failure of that check, but will continue to run further test code. The sketch below shows both side by side.
+
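+A minimal hedged sketch of the difference (the helper `compute_something()` is hypothetical; the `T_*` macros are the ones used throughout this document):
+
+```
+extern int compute_something(void); /* hypothetical helper under test */
+
+kern_return_t t_macro_difference_demo(void) {
+    int value = compute_something();
+
+    /* T_ASSERT: on failure this reports the failure and returns
+     * KERN_FAILURE immediately, so nothing below runs with a bad value. */
+    T_ASSERT(value >= 0, "value must be non-negative");
+
+    /* T_EXPECT: on failure these only record a failed expectation
+     * and execution continues to the next check. */
+    T_EXPECT_TRUE(value < 100, "value should stay below 100");
+    T_EXPECT_TRUE((value % 2) == 0, "value should be even");
+
+    return KERN_SUCCESS;
+}
+```
+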
+## How do I run my tests in BATs?
+
+BATs has a new test type **kernel_POST** that runs Lean test environment tests. You can run the following command to kick off POST testing.
+
+```
+~osdev/tat/dev/bin/bats  build  -b <build>  -t darwinLTE  -p  xnu:<branch> -r <radarnum>
+```
+
+## How do I test for panic/assertions?
+
+The xnupost subsystem provides a mechanism for setting up a `panic widget`. This widget can check for some condition and report test case SUCCESS/FAILURE. See xnupost.h for the `XT_RET*`-style return values. There are convenience macros for registering for generic panics and for assertion handling. For example, if you wish to check an API like ```foo(int arg) { assert(arg > 0); ... }```, then a test case could look like:
+
+```
+kern_return_t test_foo_arg_assertion(void) {
+       void * assert_retval = NULL;
+       kern_return_t kr = T_REGISTER_ASSERT_CHECK("arg > 0", &assert_retval);
+       T_ASSERT(kr == KERN_SUCCESS, "register assertion handler");
+
+       foo(-1); /* this will cause assert to fire */
+
+       T_ASSERT(assert_retval == (void *)XT_RET_W_SUCCESS, "verify assertion was hit");
+
+       return KERN_SUCCESS;
+}
+
+```
+
+## How do XNUPOST panic widgets work?
+
+On debug/development kernels, the `panic()` code is modified to call out to the XNUPOST system's `xnupost_process_panic()`. This callout can then determine whether testing was enabled and a widget is registered for checking panics. If so, the corresponding widget function is called, and its return value determines what action is taken. For example, a widget could return any of the following values:
+
+```
+  XT_PANIC_UNRELATED    /* not related. continue panic */
+  XT_RET_W_FAIL         /* report FAILURE and return from panic */
+  XT_RET_W_SUCCESS      /* report SUCCESS and return from panic */
+  XT_PANIC_W_FAIL       /* report FAILURE and continue to panic */
+  XT_PANIC_W_SUCCESS    /* report SUCCESS and continue to panic */
+```
+
+The panic widget data is saved in an internal data array where each entry is of type:
+
+```
+struct xnupost_panic_widget {
+       void * xtp_context_p;        /* a context pointer for callbacks to track */
+       void ** xtp_outval_p;        /* an out param for the function to return a value to the running test */
+       const char * xtp_func_name;  /* widget name for tracking in serial output */
+       xt_panic_widget_func xtp_func;
+};
+```
+
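+As a hedged illustration only (the widget name and the substring are made up; the return values, struct fields, and the `xnupost_register_panic_widget()` signature are the ones shown in this document), a minimal widget that matches a substring of the panic string might look like:
+
+```
+static xt_panic_return_t
+my_substring_widget(const char * panic_str, void * context, void ** outval)
+{
+    /* 'context' is the pointer passed at registration time; here it is
+     * assumed to be the substring we expect in the panic string. */
+    if (strnstr(__DECONST(char *, panic_str), (char *)context, strlen(panic_str)) != NULL) {
+        if (outval)
+            *outval = (void *)(uintptr_t)XT_RET_W_SUCCESS;
+        return XT_RET_W_SUCCESS; /* report SUCCESS and return from the panic */
+    }
+    return XT_PANIC_UNRELATED;   /* not ours; let the panic continue */
+}
+```
+
+A test would register it before triggering the condition, e.g. `xnupost_register_panic_widget(my_substring_widget, "my_substring_widget", (void *)"expected substring", &outval);`, and then check `outval` afterwards, much like the T_REGISTER_ASSERT_CHECK example above.
+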
+There is an example use case for writing a widget in `osfmk/tests/kernel_tests.c: check_panic_test() and panic_test()`.
+For a basic assertion check, see the example in `osfmk/tests/kernel_tests.c: kcdata_api_assert_tests()`.
+
diff --git a/osfmk/tests/bitmap_test.c b/osfmk/tests/bitmap_test.c
new file mode 100644 (file)
index 0000000..121d92e
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+
+#if DEVELOPMENT || DEBUG
+
+#include <tests/xnupost.h>
+#include <kern/kalloc.h>
+#include <kern/bits.h>
+
+extern void dump_bitmap_next(bitmap_t *map, uint nbits);
+extern void dump_bitmap_lsb(bitmap_t *map, uint nbits);
+extern void test_bitmap(void);
+extern kern_return_t bitmap_post_test(void);
+
+void
+dump_bitmap_next(bitmap_t *map, uint nbits)
+{
+       for (int i = bitmap_first(map, nbits); i >= 0; i = bitmap_next(map, i)) {
+               printf(" %d", i);
+       }
+       printf("\n");
+}
+
+void
+dump_bitmap_lsb(bitmap_t *map, uint nbits)
+{
+       for (int i = bitmap_lsb_first(map, nbits); i >= 0; i = bitmap_lsb_next(map, nbits, i)) {
+               printf(" %d", i);
+       }
+       printf("\n");
+}
+
+#ifdef NOTDEF
+#ifdef assert
+#undef assert
+#endif
+#define assert(x)      T_ASSERT(x, NULL)
+#endif
+
+void
+test_bitmap(void)
+{
+       uint start = 60;
+       for (uint nbits = start; nbits <= 192; nbits++) {
+               bitmap_t *map = bitmap_alloc(nbits);
+
+               for (uint i = 0; i < nbits; i++) {
+                       bitmap_set(map, i);
+               }
+
+               int expected_result = nbits - 1;
+               for (int i = bitmap_first(map, nbits); i >= 0; i = bitmap_next(map, i)) {
+                       assert(i == expected_result);
+                       expected_result--;
+               }
+               assert(expected_result == -1);
+
+               expected_result = 0;
+               for (int i = bitmap_lsb_first(map, nbits); i >= 0; i = bitmap_lsb_next(map, nbits, i)) {
+                       assert(i == expected_result);
+                       expected_result++;
+               }
+               assert(expected_result == (int)nbits);
+
+               for (uint i = 0; i < nbits; i++) {
+                       bitmap_clear(map, i);
+               }
+               assert(bitmap_first(map, nbits) == -1);
+               assert(bitmap_lsb_first(map, nbits) == -1);
+
+               bitmap_free(map, nbits);
+       }
+}
+
+kern_return_t
+bitmap_post_test(void)
+{
+       test_bitmap();
+
+       kern_return_t ret = KERN_SUCCESS;
+
+       T_ASSERT(ret == KERN_SUCCESS, NULL);
+
+       return ret;
+}
+#endif
diff --git a/osfmk/tests/kernel_tests.c b/osfmk/tests/kernel_tests.c
new file mode 100644 (file)
index 0000000..9bac2ab
--- /dev/null
@@ -0,0 +1,920 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <kern/kern_types.h>
+#include <kern/assert.h>
+#include <kern/host.h>
+#include <kern/macro_help.h>
+#include <kern/sched.h>
+#include <kern/locks.h>
+#include <kern/sched_prim.h>
+#include <kern/misc_protos.h>
+#include <kern/thread_call.h>
+#include <kern/zalloc.h>
+#include <kern/kalloc.h>
+#include <tests/ktest.h>
+#include <sys/errno.h>
+#include <sys/random.h>
+#include <kern/kern_cdata.h>
+#include <machine/lowglobals.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <kern/priority_queue.h>
+
+#if !(DEVELOPMENT || DEBUG)
+#error "Testing is not enabled on RELEASE configurations"
+#endif
+
+#include <tests/xnupost.h>
+
+extern boolean_t get_range_bounds(char * c, int64_t * lower, int64_t * upper);
+__private_extern__ void qsort(void * a, size_t n, size_t es, int (*cmp)(const void *, const void *));
+
+uint32_t total_post_tests_count = 0;
+void xnupost_reset_panic_widgets(void);
+
+/* test declarations */
+kern_return_t zalloc_test(void);
+kern_return_t RandomULong_test(void);
+kern_return_t kcdata_api_test(void);
+kern_return_t priority_queue_test(void);
+
+#if defined(__arm__) || defined(__arm64__)
+kern_return_t pmap_coredump_test(void);
+#endif
+
+extern kern_return_t console_serial_test(void);
+extern kern_return_t console_serial_alloc_rel_tests(void);
+extern kern_return_t console_serial_parallel_log_tests(void);
+extern kern_return_t test_os_log(void);
+extern kern_return_t test_os_log_parallel(void);
+extern kern_return_t bitmap_post_test(void);
+
+#ifdef __arm64__
+extern kern_return_t arm64_munger_test(void);
+extern kern_return_t ex_cb_test(void);
+#if __ARM_PAN_AVAILABLE__
+extern kern_return_t arm64_pan_test(void);
+#endif
+#endif /* __arm64__ */
+
+extern kern_return_t test_thread_call(void);
+
+
+struct xnupost_panic_widget xt_panic_widgets = {NULL, NULL, NULL, NULL};
+
+struct xnupost_test kernel_post_tests[] = {XNUPOST_TEST_CONFIG_BASIC(zalloc_test),
+                                           XNUPOST_TEST_CONFIG_BASIC(RandomULong_test),
+                                           XNUPOST_TEST_CONFIG_BASIC(test_os_log),
+                                           XNUPOST_TEST_CONFIG_BASIC(test_os_log_parallel),
+#ifdef __arm64__
+                                           XNUPOST_TEST_CONFIG_BASIC(arm64_munger_test),
+                                           XNUPOST_TEST_CONFIG_BASIC(ex_cb_test),
+#if __ARM_PAN_AVAILABLE__
+                                           XNUPOST_TEST_CONFIG_BASIC(arm64_pan_test),
+#endif
+#endif /* __arm64__ */
+                                           XNUPOST_TEST_CONFIG_BASIC(kcdata_api_test),
+                                           XNUPOST_TEST_CONFIG_BASIC(console_serial_test),
+                                           XNUPOST_TEST_CONFIG_BASIC(console_serial_alloc_rel_tests),
+                                           XNUPOST_TEST_CONFIG_BASIC(console_serial_parallel_log_tests),
+#if defined(__arm__) || defined(__arm64__)
+                                           XNUPOST_TEST_CONFIG_BASIC(pmap_coredump_test),
+#endif
+                                           XNUPOST_TEST_CONFIG_BASIC(bitmap_post_test),
+                                         //XNUPOST_TEST_CONFIG_TEST_PANIC(kcdata_api_assert_tests)
+                                           XNUPOST_TEST_CONFIG_BASIC(test_thread_call),
+                                           XNUPOST_TEST_CONFIG_BASIC(priority_queue_test),
+};
+
+uint32_t kernel_post_tests_count = sizeof(kernel_post_tests) / sizeof(xnupost_test_data_t);
+
+#define POSTARGS_RUN_TESTS 0x1
+#define POSTARGS_CONTROLLER_AVAILABLE 0x2
+#define POSTARGS_CUSTOM_TEST_RUNLIST 0x4
+uint64_t kernel_post_args = 0x0;
+
+/* static variables to hold state */
+static kern_return_t parse_config_retval = KERN_INVALID_CAPABILITY;
+static char kernel_post_test_configs[256];
+boolean_t xnupost_should_run_test(uint32_t test_num);
+
+kern_return_t
+xnupost_parse_config()
+{
+       if (parse_config_retval != KERN_INVALID_CAPABILITY)
+               return parse_config_retval;
+       PE_parse_boot_argn("kernPOST", &kernel_post_args, sizeof(kernel_post_args));
+
+       if (PE_parse_boot_argn("kernPOST_config", &kernel_post_test_configs[0], sizeof(kernel_post_test_configs)) == TRUE) {
+               kernel_post_args |= POSTARGS_CUSTOM_TEST_RUNLIST;
+       }
+
+       if (kernel_post_args != 0) {
+               parse_config_retval = KERN_SUCCESS;
+               goto out;
+       }
+       parse_config_retval = KERN_NOT_SUPPORTED;
+out:
+       return parse_config_retval;
+}
+
+boolean_t
+xnupost_should_run_test(uint32_t test_num)
+{
+       if (kernel_post_args & POSTARGS_CUSTOM_TEST_RUNLIST) {
+               int64_t begin = 0, end = 999999;
+               char * b = kernel_post_test_configs;
+               while (*b) {
+                       get_range_bounds(b, &begin, &end);
+                       if (test_num >= begin && test_num <= end) {
+                               return TRUE;
+                       }
+
+                       /* skip to the next "," */
+                       while (*b != ',') {
+                               if (*b == '\0')
+                                       return FALSE;
+                               b++;
+                       }
+                       /* skip past the ',' */
+                       b++;
+               }
+               return FALSE;
+       }
+       return TRUE;
+}
+
+kern_return_t
+xnupost_list_tests(xnupost_test_t test_list, uint32_t test_count)
+{
+       if (KERN_SUCCESS != xnupost_parse_config())
+               return KERN_FAILURE;
+
+       xnupost_test_t testp;
+       for (uint32_t i = 0; i < test_count; i++) {
+               testp = &test_list[i];
+               if (testp->xt_test_num == 0) {
+                       testp->xt_test_num = ++total_post_tests_count;
+               }
+               /* make sure the boot-arg based test run list is honored */
+               if (kernel_post_args & POSTARGS_CUSTOM_TEST_RUNLIST) {
+                       testp->xt_config |= XT_CONFIG_IGNORE;
+                       if (xnupost_should_run_test(testp->xt_test_num)) {
+                               testp->xt_config &= ~(XT_CONFIG_IGNORE);
+                               testp->xt_config |= XT_CONFIG_RUN;
+                               printf("\n[TEST] #%u is marked as ignored", testp->xt_test_num);
+                       }
+               }
+               printf("\n[TEST] TOC#%u name: %s expected: %d config: %x\n", testp->xt_test_num, testp->xt_name, testp->xt_expected_retval,
+                      testp->xt_config);
+       }
+
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+xnupost_run_tests(xnupost_test_t test_list, uint32_t test_count)
+{
+       uint32_t i = 0;
+       int retval = KERN_SUCCESS;
+
+       if ((kernel_post_args & POSTARGS_RUN_TESTS) == 0) {
+               printf("No POST boot-arg set.\n");
+               return retval;
+       }
+
+       T_START;
+       xnupost_test_t testp;
+       for (; i < test_count; i++) {
+               xnupost_reset_panic_widgets();
+               testp = &test_list[i];
+               T_BEGIN(testp->xt_name);
+               testp->xt_begin_time = mach_absolute_time();
+               testp->xt_end_time   = testp->xt_begin_time;
+
+               /*
+                * If test is designed to panic and controller
+                * is not available then mark as SKIPPED
+                */
+               if ((testp->xt_config & XT_CONFIG_EXPECT_PANIC) && !(kernel_post_args & POSTARGS_CONTROLLER_AVAILABLE)) {
+                       T_SKIP(
+                           "Test expects panic but "
+                           "no controller is present");
+                       testp->xt_test_actions = XT_ACTION_SKIPPED;
+                       continue;
+               }
+
+               if ((testp->xt_config & XT_CONFIG_IGNORE)) {
+                       T_SKIP("Test is marked as XT_CONFIG_IGNORE");
+                       testp->xt_test_actions = XT_ACTION_SKIPPED;
+                       continue;
+               }
+
+               testp->xt_func();
+               T_END;
+               testp->xt_retval = T_TESTRESULT;
+               testp->xt_end_time = mach_absolute_time();
+               if (testp->xt_retval == testp->xt_expected_retval) {
+                       testp->xt_test_actions = XT_ACTION_PASSED;
+               } else {
+                       testp->xt_test_actions = XT_ACTION_FAILED;
+               }
+       }
+       T_FINISH;
+       return retval;
+}
+
+kern_return_t
+kernel_list_tests()
+{
+       return xnupost_list_tests(kernel_post_tests, kernel_post_tests_count);
+}
+
+kern_return_t
+kernel_do_post()
+{
+       return xnupost_run_tests(kernel_post_tests, kernel_post_tests_count);
+}
+
+kern_return_t
+xnupost_register_panic_widget(xt_panic_widget_func funcp, const char * funcname, void * context, void ** outval)
+{
+       if (xt_panic_widgets.xtp_context_p != NULL || xt_panic_widgets.xtp_func != NULL)
+               return KERN_RESOURCE_SHORTAGE;
+
+       xt_panic_widgets.xtp_context_p = context;
+       xt_panic_widgets.xtp_func      = funcp;
+       xt_panic_widgets.xtp_func_name = funcname;
+       xt_panic_widgets.xtp_outval_p  = outval;
+
+       return KERN_SUCCESS;
+}
+
+void
+xnupost_reset_panic_widgets()
+{
+       bzero(&xt_panic_widgets, sizeof(xt_panic_widgets));
+}
+
+kern_return_t
+xnupost_process_kdb_stop(const char * panic_s)
+{
+       xt_panic_return_t retval         = 0;
+       struct xnupost_panic_widget * pw = &xt_panic_widgets;
+       const char * name = "unknown";
+       if (xt_panic_widgets.xtp_func_name) {
+               name = xt_panic_widgets.xtp_func_name;
+       }
+
+       /* bail early on if kernPOST is not set */
+       if (kernel_post_args == 0) {
+               return KERN_INVALID_CAPABILITY;
+       }
+
+       if (xt_panic_widgets.xtp_func) {
+               T_LOG("%s: Calling out to widget: %s", __func__, xt_panic_widgets.xtp_func_name);
+               retval = pw->xtp_func(panic_s, pw->xtp_context_p, pw->xtp_outval_p);
+       } else {
+               return KERN_INVALID_CAPABILITY;
+       }
+
+       switch (retval) {
+       case XT_RET_W_SUCCESS:
+               T_EXPECT_EQ_INT(retval, XT_RET_W_SUCCESS, "%s reported successful handling. Returning from kdb_stop.", name);
+               /* KERN_SUCCESS means return from panic/assertion */
+               return KERN_SUCCESS;
+
+       case XT_RET_W_FAIL:
+               T_FAIL("%s reported XT_RET_W_FAIL: Returning from kdb_stop", name);
+               return KERN_SUCCESS;
+
+       case XT_PANIC_W_FAIL:
+               T_FAIL("%s reported XT_PANIC_W_FAIL: Continuing to kdb_stop", name);
+               return KERN_FAILURE;
+
+       case XT_PANIC_W_SUCCESS:
+               T_EXPECT_EQ_INT(retval, XT_PANIC_W_SUCCESS, "%s reported successful testcase. But continuing to kdb_stop.", name);
+               return KERN_FAILURE;
+
+       case XT_PANIC_UNRELATED:
+       default:
+               T_LOG("UNRELATED: Continuing to kdb_stop.");
+               return KERN_FAILURE;
+       }
+}
+
+xt_panic_return_t
+_xt_generic_assert_check(const char * s, void * str_to_match, void ** outval)
+{
+       xt_panic_return_t ret = XT_PANIC_UNRELATED;
+
+       if (NULL != strnstr(__DECONST(char *, s), (char *)str_to_match, strlen(s))) {
+               T_LOG("%s: kdb_stop string: '%s' MATCHED string: '%s'", __func__, s, (char *)str_to_match);
+               ret = XT_RET_W_SUCCESS;
+       }
+
+       if (outval)
+               *outval = (void *)(uintptr_t)ret;
+       return ret;
+}
+
+kern_return_t
+xnupost_reset_tests(xnupost_test_t test_list, uint32_t test_count)
+{
+       uint32_t i = 0;
+       xnupost_test_t testp;
+       for (; i < test_count; i++) {
+               testp                  = &test_list[i];
+               testp->xt_begin_time   = 0;
+               testp->xt_end_time     = 0;
+               testp->xt_test_actions = XT_ACTION_NONE;
+               testp->xt_retval       = -1;
+       }
+       return KERN_SUCCESS;
+}
+
+
+kern_return_t
+zalloc_test()
+{
+       zone_t test_zone;
+       void * test_ptr;
+
+       T_SETUPBEGIN;
+       test_zone = zinit(sizeof(uint64_t), 100 * sizeof(uint64_t), sizeof(uint64_t), "test_uint64_zone");
+       T_ASSERT_NOTNULL(test_zone, NULL);
+
+       T_ASSERT_EQ_INT(zone_free_count(test_zone), 0, NULL);
+       T_SETUPEND;
+
+       T_ASSERT_NOTNULL(test_ptr = zalloc(test_zone), NULL);
+
+       zfree(test_zone, test_ptr);
+
+       /* A sample report for perfdata */
+       T_PERF("num_threads_at_ktest", threads_count, "count", "# of threads in system at zalloc_test");
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * Function used for comparison by qsort()
+ */
+static int
+compare_numbers_ascending(const void * a, const void * b)
+{
+       const uint64_t x = *(const uint64_t *)a;
+       const uint64_t y = *(const uint64_t *)b;
+       if (x < y) {
+               return -1;
+       } else if (x > y) {
+               return 1;
+       } else {
+               return 0;
+       }
+}
+
+/*
+ * Function used for comparison by qsort()
+ */
+static int
+compare_numbers_descending(const void * a, const void * b)
+{
+       const uint32_t x = *(const uint32_t *)a;
+       const uint32_t y = *(const uint32_t *)b;
+       if (x > y) {
+               return -1;
+       } else if (x < y) {
+               return 1;
+       } else {
+               return 0;
+       }
+}
+
+/* Node structure for the priority queue tests */
+struct priority_queue_test_node {
+       struct priority_queue_entry     link;
+       priority_queue_key_t            node_key;
+};
+
+static void
+priority_queue_test_queue(struct priority_queue *pq, int type,
+               priority_queue_compare_fn_t cmp_fn)
+{
+       /* Configuration for the test */
+#define PRIORITY_QUEUE_NODES   7
+       static uint32_t priority_list[] = { 20, 3, 7, 6, 50, 2, 8};
+       uint32_t increase_pri = 100;
+       uint32_t decrease_pri = 90;
+       struct priority_queue_test_node *result;
+       uint32_t key = 0;
+       boolean_t update_result = false;
+
+       struct priority_queue_test_node *node = NULL;
+       /* Add all priorities to the first priority queue */
+       for (int i = 0; i < PRIORITY_QUEUE_NODES; i++) {
+               node = kalloc(sizeof(struct priority_queue_test_node));
+               T_ASSERT_NOTNULL(node, NULL);
+
+               priority_queue_entry_init(&(node->link));
+               node->node_key = priority_list[i];
+               key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : priority_list[i];
+               priority_queue_insert(pq, &(node->link), key, cmp_fn);
+       }
+
+       T_ASSERT_NOTNULL(node, NULL);
+       key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? node->node_key : priority_queue_entry_key(pq, &(node->link));
+       T_ASSERT((key == node->node_key), "verify node stored key correctly");
+
+       /* Test the priority increase operation by updating the last node added (8) */
+       T_ASSERT_NOTNULL(node, NULL);
+       node->node_key = increase_pri;
+       key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : node->node_key;
+       update_result = priority_queue_entry_increase(pq, &node->link, key, cmp_fn);
+       T_ASSERT((update_result == true), "increase key updated root");
+       result = priority_queue_max(pq, struct priority_queue_test_node, link);
+       T_ASSERT((result->node_key == increase_pri), "verify priority_queue_entry_increase() operation");
+
+
+       /* Test the priority decrease operation by updating the last node added */
+       T_ASSERT((result == node), NULL);
+       node->node_key = decrease_pri;
+       key = (type == PRIORITY_QUEUE_GENERIC_KEY) ? PRIORITY_QUEUE_KEY_NONE : node->node_key;
+       update_result = priority_queue_entry_decrease(pq, &node->link, key, cmp_fn);
+       T_ASSERT((update_result == true), "decrease key updated root");
+       result = priority_queue_max(pq, struct priority_queue_test_node, link);
+       T_ASSERT((result->node_key == decrease_pri), "verify priority_queue_entry_decrease() operation");
+
+       /* Update our local priority list as well */
+       priority_list[PRIORITY_QUEUE_NODES - 1] = decrease_pri;
+
+       /* Sort the local list in descending order */
+       qsort(priority_list, PRIORITY_QUEUE_NODES, sizeof(priority_list[0]), compare_numbers_descending);
+
+       /* Test the maximum operation by comparing max node with local list */
+       result = priority_queue_max(pq, struct priority_queue_test_node, link);
+       T_ASSERT((result->node_key == priority_list[0]), "(heap (%u) == qsort (%u)) priority queue max node lookup", 
+               (uint32_t)result->node_key, priority_list[0]);
+
+       /* Remove all remaining elements and verify they match local list */
+       for (int i = 0; i < PRIORITY_QUEUE_NODES; i++) {
+               result = priority_queue_remove_max(pq, struct priority_queue_test_node, link, cmp_fn);
+               T_ASSERT((result->node_key == priority_list[i]), "(heap (%u) == qsort (%u)) priority queue max node removal", 
+                       (uint32_t)result->node_key, priority_list[i]);
+       }
+
+       priority_queue_destroy(pq, struct priority_queue_test_node, link, ^(void *n) {
+               kfree(n, sizeof(struct priority_queue_test_node));
+       });
+}
+
+kern_return_t
+priority_queue_test(void)
+{
+       /*
+        * Initialize two priority queues
+        * - One which uses the key comparator
+        * - Other which uses the node comparator
+        */
+       static struct priority_queue pq;
+       static struct priority_queue pq_nodes;
+
+       T_SETUPBEGIN;
+
+       priority_queue_init(&pq, PRIORITY_QUEUE_BUILTIN_KEY | PRIORITY_QUEUE_MAX_HEAP);
+       priority_queue_init(&pq_nodes, PRIORITY_QUEUE_GENERIC_KEY | PRIORITY_QUEUE_MAX_HEAP);
+
+       T_SETUPEND;
+
+       priority_queue_test_queue(&pq, PRIORITY_QUEUE_BUILTIN_KEY,
+                       PRIORITY_QUEUE_SCHED_PRI_MAX_HEAP_COMPARE);
+
+       priority_queue_test_queue(&pq_nodes, PRIORITY_QUEUE_GENERIC_KEY,
+                       priority_heap_make_comparator(a, b, struct priority_queue_test_node, link, {
+                               return (a->node_key > b->node_key) ? 1 : ((a->node_key == b->node_key) ? 0 : -1);
+                       }));
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * Count the number of bits that are set in a number.
+ * The implementation defers to the compiler's builtin popcount.
+ */
+static int
+count_bits(uint64_t number)
+{
+       return __builtin_popcountll(number);
+}
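+
+/*
+ * Illustrative note (not part of the original test): the entropy checks in
+ * RandomULong_test() below measure how many bits differ between consecutive
+ * samples by XOR-ing them and counting the set bits, e.g.
+ *
+ *     count_bits(0xF0F0 ^ 0xF00F) == count_bits(0x00FF) == 8
+ *
+ * so two samples that share most of their bit pattern yield a small value.
+ */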
+
+kern_return_t
+RandomULong_test()
+{
+/*
+ * Randomness test for RandomULong()
+ *
+ * This test verifies that:
+ *  a. RandomULong works
+ *  b. The generated numbers match the following entropy criteria:
+ *     For a thousand iterations, verify:
+ *          1. mean entropy > 12 bits
+ *          2. min entropy > 4 bits
+ *          3. No Duplicate
+ *          4. No incremental/decremental pattern in a window of 3
+ *          5. No Zero
+ *          6. No -1
+ *
+ * <rdar://problem/22526137> Add test to increase code coverage for /dev/random
+ */
+
+#define CONF_MIN_ENTROPY 4
+#define CONF_MEAN_ENTROPY 12
+#define CONF_ITERATIONS 1000
+#define CONF_WINDOW_SIZE 3
+#define CONF_WINDOW_TREND_LIMIT ((CONF_WINDOW_SIZE / 2) + (CONF_WINDOW_SIZE & 1)) >> 0
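+
+/*
+ * For reference (illustrative arithmetic only): with CONF_WINDOW_SIZE == 3 the
+ * trend limit evaluates to (3 / 2) + (3 & 1) == 1 + 1 == 2, i.e.
+ * ceil(CONF_WINDOW_SIZE / 2); the trailing ">> 0" is a no-op.
+ */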
+
+       int i;
+       uint32_t min_bit_entropy, max_bit_entropy, bit_entropy;
+       uint32_t aggregate_bit_entropy = 0;
+       uint32_t mean_bit_entropy      = 0;
+       uint64_t numbers[CONF_ITERATIONS];
+       min_bit_entropy = UINT32_MAX;
+       max_bit_entropy = 0;
+
+       /*
+        * TEST 1: Number generation and basic validation
+        * Check for non-zero (no bits set), -1 (all bits set) and error
+        */
+       for (i = 0; i < CONF_ITERATIONS; i++) {
+               read_random(&numbers[i], sizeof(numbers[i]));
+               if (numbers[i] == 0) {
+                       T_ASSERT_NE_ULLONG(numbers[i], 0, "read_random returned zero value.");
+               }
+               if (numbers[i] == UINT64_MAX) {
+                       T_ASSERT_NE_ULLONG(numbers[i], UINT64_MAX, "read_random returned -1.");
+               }
+       }
+       T_PASS("Generated %d non-zero random numbers with at least one bit reset.", CONF_ITERATIONS);
+
+       /*
+        * TEST 2: Mean and Min Bit Entropy
+        * Check the bit entropy and its mean over the generated numbers.
+        */
+       for (i = 1; i < CONF_ITERATIONS; i++) {
+               bit_entropy = count_bits(numbers[i - 1] ^ numbers[i]);
+               if (bit_entropy < min_bit_entropy)
+                       min_bit_entropy = bit_entropy;
+               if (bit_entropy > max_bit_entropy)
+                       max_bit_entropy = bit_entropy;
+
+               if (bit_entropy < CONF_MIN_ENTROPY) {
+                       T_EXPECT_GE_UINT(bit_entropy, CONF_MIN_ENTROPY,
+                                        "Number of differing bits in consecutive numbers does not satisfy the minimum criterion.");
+               }
+
+               aggregate_bit_entropy += bit_entropy;
+       }
+       T_PASS("Passed the min bit entropy expectation of %d bits", CONF_MIN_ENTROPY);
+
+       mean_bit_entropy = aggregate_bit_entropy / CONF_ITERATIONS;
+       T_EXPECT_GE_UINT(mean_bit_entropy, CONF_MEAN_ENTROPY, "Test criteria for mean number of differing bits.");
+       T_PASS("Mean bit entropy criteria satisfied (Required %d, Actual: %d).", CONF_MEAN_ENTROPY, mean_bit_entropy);
+       T_LOG("{PERFORMANCE} iterations: %d, min_bit_entropy: %d, mean_bit_entropy: %d, max_bit_entropy: %d", CONF_ITERATIONS,
+             min_bit_entropy, mean_bit_entropy, max_bit_entropy);
+       T_PERF("min_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), min_bit_entropy, "bits", "minimum bit entropy in RNG. High is better");
+       T_PERF("mean_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), mean_bit_entropy, "bits", "mean bit entropy in RNG. High is better");
+       T_PERF("max_bit_entropy_" T_TOSTRING(CONF_ITERATIONS), max_bit_entropy, "bits", "max bit entropy in RNG. High is better");
+
+       /*
+        * TEST 3: Incremental Pattern Search
+        * Check that incremental/decremental pattern does not exist in the given window
+        */
+       int window_start, window_end, trend;
+       window_start = window_end = trend = 0;
+
+       do {
+               /*
+                * Set the window
+                */
+               window_end = window_start + CONF_WINDOW_SIZE - 1;
+               if (window_end >= CONF_ITERATIONS)
+                       window_end = CONF_ITERATIONS - 1;
+
+               trend = 0;
+               for (i = window_start; i < window_end; i++) {
+                       if (numbers[i] < numbers[i + 1])
+                               trend++;
+                       else if (numbers[i] > numbers[i + 1])
+                               trend--;
+               }
+               /*
+                * Check that there is no increasing or decreasing trend
+                * i.e. trend <= ceil(window_size/2)
+                */
+               if (trend < 0) {
+                       trend = -trend;
+               }
+               if (trend > CONF_WINDOW_TREND_LIMIT) {
+                       T_ASSERT_LE_INT(trend, CONF_WINDOW_TREND_LIMIT, "Found increasing/decreasing trend in random numbers.");
+               }
+
+               /*
+                * Move to the next window
+                */
+               window_start++;
+
+       } while (window_start < (CONF_ITERATIONS - 1));
+       T_PASS("Did not find increasing/decreasing trends in a window of %d numbers.", CONF_WINDOW_SIZE);
+
+       /*
+        * TEST 4: Find Duplicates
+        * Check no duplicate values are generated
+        */
+       qsort(numbers, CONF_ITERATIONS, sizeof(numbers[0]), compare_numbers_ascending);
+       for (i = 1; i < CONF_ITERATIONS; i++) {
+               if (numbers[i] == numbers[i - 1]) {
+                       T_ASSERT_NE_ULLONG(numbers[i], numbers[i - 1], "read_random generated duplicate values.");
+               }
+       }
+       T_PASS("Test did not find any duplicates as expected.");
+
+       return KERN_SUCCESS;
+}
+
+
+/* KCDATA kernel api tests */
+static struct kcdata_descriptor test_kc_data;//, test_kc_data2;
+struct sample_disk_io_stats {
+       uint64_t disk_reads_count;
+       uint64_t disk_reads_size;
+       uint64_t io_priority_count[4];
+       uint64_t io_priority_size;
+} __attribute__((packed));
+
+struct kcdata_subtype_descriptor test_disk_io_stats_def[] = {
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 0 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_count"},
+    {KCS_SUBTYPE_FLAGS_NONE, KC_ST_UINT64, 1 * sizeof(uint64_t), sizeof(uint64_t), "disk_reads_size"},
+    {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, 2 * sizeof(uint64_t), KCS_SUBTYPE_PACK_SIZE(4, sizeof(uint64_t)), "io_priority_count"},
+    {KCS_SUBTYPE_FLAGS_ARRAY, KC_ST_UINT64, (2 + 4) * sizeof(uint64_t), sizeof(uint64_t), "io_priority_size"},
+};
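+
+/*
+ * Illustrative only: the offsets above mirror the packed layout of
+ * struct sample_disk_io_stats, which could be verified at compile time
+ * along these lines (not part of the original test):
+ *
+ *   _Static_assert(offsetof(struct sample_disk_io_stats, disk_reads_size)
+ *       == 1 * sizeof(uint64_t), "descriptor offset mismatch");
+ *   _Static_assert(offsetof(struct sample_disk_io_stats, io_priority_size)
+ *       == (2 + 4) * sizeof(uint64_t), "descriptor offset mismatch");
+ */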
+
+kern_return_t
+kcdata_api_test()
+{
+       kern_return_t retval = KERN_SUCCESS;
+
+       /* test for NULL input */
+       retval = kcdata_memory_static_init(NULL, (mach_vm_address_t)0, KCDATA_BUFFER_BEGIN_STACKSHOT, 100, KCFLAG_USE_MEMCOPY);
+       T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_memory_static_init with NULL struct");
+
+       /* another negative test with buffer size < 32 bytes */
+       char data[30] = "sample_disk_io_stats";
+       retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)&data, KCDATA_BUFFER_BEGIN_CRASHINFO, sizeof(data),
+                                          KCFLAG_USE_MEMCOPY);
+       T_ASSERT(retval == KERN_RESOURCE_SHORTAGE, "init with 30 bytes failed as expected with KERN_RESOURCE_SHORTAGE");
+
+       /* test with COPYOUT for 0x0 address. Should return KERN_NO_ACCESS */
+       retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)0, KCDATA_BUFFER_BEGIN_CRASHINFO, PAGE_SIZE,
+                                          KCFLAG_USE_COPYOUT);
+       T_ASSERT(retval == KERN_NO_ACCESS, "writing to 0x0 returned KERN_NO_ACCESS");
+
+       /* test with successful kcdata_memory_static_init */
+       test_kc_data.kcd_length   = 0xdeadbeef;
+       mach_vm_address_t address = (mach_vm_address_t)kalloc(PAGE_SIZE);
+       T_EXPECT_NOTNULL(address, "kalloc of PAGE_SIZE data.");
+
+       retval = kcdata_memory_static_init(&test_kc_data, (mach_vm_address_t)address, KCDATA_BUFFER_BEGIN_STACKSHOT, PAGE_SIZE,
+                                          KCFLAG_USE_MEMCOPY);
+
+       T_ASSERT(retval == KERN_SUCCESS, "successful kcdata_memory_static_init call");
+
+       T_ASSERT(test_kc_data.kcd_length == PAGE_SIZE, "kcdata length is set correctly to PAGE_SIZE.");
+       T_LOG("addr_begin 0x%llx and end 0x%llx and address 0x%llx", test_kc_data.kcd_addr_begin, test_kc_data.kcd_addr_end, address);
+       T_ASSERT(test_kc_data.kcd_addr_begin == address, "kcdata begin address is correct 0x%llx", (uint64_t)address);
+
+       /* verify we have BEGIN and END HEADERS set */
+       uint32_t * mem = (uint32_t *)address;
+       T_ASSERT(mem[0] == KCDATA_BUFFER_BEGIN_STACKSHOT, "buffer does contain KCDATA_BUFFER_BEGIN_STACKSHOT");
+       T_ASSERT(mem[4] == KCDATA_TYPE_BUFFER_END, "KCDATA_TYPE_BUFFER_END is appended as expected");
+       T_ASSERT(mem[5] == 0, "size of BUFFER_END tag is zero");
+
+       /* verify kcdata_memory_get_used_bytes() */
+       uint64_t bytes_used = 0;
+       bytes_used = kcdata_memory_get_used_bytes(&test_kc_data);
+       T_ASSERT(bytes_used == (2 * sizeof(struct kcdata_item)), "bytes_used api returned expected %llu", bytes_used);
+
+       /* test for kcdata_get_memory_addr() */
+
+       mach_vm_address_t user_addr = 0;
+       /* negative test for NULL user_addr AND/OR kcdata_descriptor */
+       retval = kcdata_get_memory_addr(NULL, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &user_addr);
+       T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_get_memory_addr with NULL struct -> KERN_INVALID_ARGUMENT");
+
+       retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), NULL);
+       T_ASSERT(retval == KERN_INVALID_ARGUMENT, "kcdata_get_memory_addr with NULL user_addr -> KERN_INVALID_ARGUMENT");
+
+       /* successful case with size 0. Yes, this is expected to succeed, since a bare item type can be used as a boolean */
+       retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_USECS_SINCE_EPOCH, 0, &user_addr);
+       T_ASSERT(retval == KERN_SUCCESS, "Successfully got kcdata entry for 0 size data");
+       T_ASSERT(user_addr == test_kc_data.kcd_addr_end, "0 sized data did not add any extra buffer space");
+
+       /* successful case with valid size. */
+       user_addr = 0xdeadbeef;
+       retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &user_addr);
+       T_ASSERT(retval == KERN_SUCCESS, "kcdata_get_memory_addr with valid values succeeded.");
+       T_ASSERT(user_addr > test_kc_data.kcd_addr_begin, "user_addr is in range of buffer");
+       T_ASSERT(user_addr < test_kc_data.kcd_addr_end, "user_addr is in range of buffer");
+
+       /* Try creating an item with really large size */
+       user_addr  = 0xdeadbeef;
+       bytes_used = kcdata_memory_get_used_bytes(&test_kc_data);
+       retval = kcdata_get_memory_addr(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, PAGE_SIZE * 4, &user_addr);
+       T_ASSERT(retval == KERN_RESOURCE_SHORTAGE, "Allocating entry with size > buffer -> KERN_RESOURCE_SHORTAGE");
+       T_ASSERT(user_addr == 0xdeadbeef, "user_addr remained unaffected with failed kcdata_get_memory_addr");
+       T_ASSERT(bytes_used == kcdata_memory_get_used_bytes(&test_kc_data), "The data structure should be unaffected");
+
+       /* verify convenience functions for uint32_with_description */
+       retval = kcdata_add_uint32_with_description(&test_kc_data, 0xbdc0ffee, "This is bad coffee");
+       T_ASSERT(retval == KERN_SUCCESS, "add uint32 with description succeeded.");
+
+       retval = kcdata_add_uint64_with_description(&test_kc_data, 0xf001badc0ffee, "another 8 byte no.");
+       T_ASSERT(retval == KERN_SUCCESS, "add uint64 with desc succeeded.");
+
+       /* verify creating an KCDATA_TYPE_ARRAY here */
+       user_addr  = 0xdeadbeef;
+       bytes_used = kcdata_memory_get_used_bytes(&test_kc_data);
+       /* save memory address where the array will come up */
+       struct kcdata_item * item_p = (struct kcdata_item *)test_kc_data.kcd_addr_end;
+
+       retval = kcdata_get_memory_addr_for_array(&test_kc_data, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), 20, &user_addr);
+       T_ASSERT(retval == KERN_SUCCESS, "Array of 20 integers should be possible");
+       T_ASSERT(user_addr != 0xdeadbeef, "user_addr is updated as expected");
+       T_ASSERT((kcdata_memory_get_used_bytes(&test_kc_data) - bytes_used) >= 20 * sizeof(uint64_t), "memory allocation is in range");
+       kcdata_iter_t iter = kcdata_iter(item_p, PAGE_SIZE - kcdata_memory_get_used_bytes(&test_kc_data));
+       T_ASSERT(kcdata_iter_array_elem_count(iter) == 20, "array count is 20");
+
+       /* FIXME add tests here for ranges of sizes and counts */
+
+       T_ASSERT(item_p->flags == (((uint64_t)KCDATA_TYPE_MACH_ABSOLUTE_TIME << 32) | 20), "flags are set correctly");
+
+       /* test adding of custom type */
+
+       retval = kcdata_add_type_definition(&test_kc_data, 0x999, data, &test_disk_io_stats_def[0],
+                                           sizeof(test_disk_io_stats_def) / sizeof(struct kcdata_subtype_descriptor));
+       T_ASSERT(retval == KERN_SUCCESS, "adding custom type succeeded.");
+
+       return KERN_SUCCESS;
+}
+
+/*
+kern_return_t
+kcdata_api_assert_tests()
+{
+       kern_return_t retval       = 0;
+       void * assert_check_retval = NULL;
+       test_kc_data2.kcd_length   = 0xdeadbeef;
+       mach_vm_address_t address = (mach_vm_address_t)kalloc(PAGE_SIZE);
+       T_EXPECT_NOTNULL(address, "kalloc of PAGE_SIZE data.");
+
+       retval = kcdata_memory_static_init(&test_kc_data2, (mach_vm_address_t)address, KCDATA_BUFFER_BEGIN_STACKSHOT, PAGE_SIZE,
+                                          KCFLAG_USE_MEMCOPY);
+
+       T_ASSERT(retval == KERN_SUCCESS, "successful kcdata_memory_static_init call");
+
+       retval = T_REGISTER_ASSERT_CHECK("KCDATA_DESC_MAXLEN", &assert_check_retval);
+       T_ASSERT(retval == KERN_SUCCESS, "registered assert widget");
+
+       // this will assert
+       retval = kcdata_add_uint32_with_description(&test_kc_data2, 0xc0ffee, "really long description string for kcdata");
+       T_ASSERT(retval == KERN_INVALID_ARGUMENT, "API param check returned KERN_INVALID_ARGUMENT correctly");
+       T_ASSERT(assert_check_retval == (void *)XT_RET_W_SUCCESS, "assertion handler verified that it was hit");
+
+       return KERN_SUCCESS;
+}
+*/
+
+#if defined(__arm__) || defined(__arm64__)
+
+#include <arm/pmap.h>
+
+#define MAX_PMAP_OBJECT_ELEMENT 100000
+
+extern struct vm_object pmap_object_store; /* store pt pages */
+extern unsigned long gPhysBase, gPhysSize, first_avail;
+
+/*
+ * Define macros to traverse the pmap object structures and extract
+ * the physical page number using only the information in the low globals.
+ * This emulates how Astris extracts information from a coredump.
+ */
+#if defined(__arm64__)
+
+static inline uintptr_t
+astris_vm_page_unpack_ptr(uintptr_t p)
+{
+       if (!p)
+               return ((uintptr_t)0);
+
+       return (p & lowGlo.lgPmapMemFromArrayMask)
+                  ? lowGlo.lgPmapMemStartAddr + (p & ~(lowGlo.lgPmapMemFromArrayMask)) * lowGlo.lgPmapMemPagesize
+                  : lowGlo.lgPmapMemPackedBaseAddr + (p << lowGlo.lgPmapMemPackedShift);
+}
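+
+/*
+ * Sketch of the decode above (field names from lowGlo, values illustrative):
+ * if the "from array" bit is set, the remaining bits are an index into the
+ * vm_pages array and the address is lgPmapMemStartAddr + index * pagesize;
+ * otherwise the packed value is a kernel address stored right-shifted by
+ * lgPmapMemPackedShift relative to lgPmapMemPackedBaseAddr, so unpacking
+ * shifts it back left and adds the base.
+ */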
+
+// assume next pointer is the first element
+#define astris_vm_page_queue_next(qc) (astris_vm_page_unpack_ptr(*((uint32_t *)(qc))))
+
+#endif
+
+#if defined(__arm__)
+
+// assume next pointer is the first element
+#define astris_vm_page_queue_next(qc) *((uintptr_t *)(qc))
+
+#endif
+
+#define astris_vm_page_queue_first(q) astris_vm_page_queue_next(q)
+
+#define astris_vm_page_queue_end(q, qe) ((q) == (qe))
+
+#define astris_vm_page_queue_iterate(head, elt)                                                           \
+       for ((elt) = (uintptr_t)astris_vm_page_queue_first((head)); !astris_vm_page_queue_end((head), (elt)); \
+            (elt) = (uintptr_t)astris_vm_page_queue_next(((elt) + (uintptr_t)lowGlo.lgPmapMemChainOffset)))
+
+#define astris_ptoa(x) ((vm_address_t)(x) << lowGlo.lgPageShift)
+
+static inline ppnum_t
+astris_vm_page_get_phys_page(uintptr_t m)
+{
+       return (m >= lowGlo.lgPmapMemStartAddr && m < lowGlo.lgPmapMemEndAddr)
+                  ? (ppnum_t)((m - lowGlo.lgPmapMemStartAddr) / lowGlo.lgPmapMemPagesize + lowGlo.lgPmapMemFirstppnum)
+                  : *((ppnum_t *)(m + lowGlo.lgPmapMemPageOffset));
+}
+
+kern_return_t
+pmap_coredump_test(void)
+{
+       int iter = 0;
+       uintptr_t p;
+
+       T_LOG("Testing coredump info for PMAP.");
+
+       T_ASSERT_GE_ULONG(lowGlo.lgStaticAddr, gPhysBase, NULL);
+       T_ASSERT_LE_ULONG(lowGlo.lgStaticAddr + lowGlo.lgStaticSize, first_avail, NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMajorVersion, 3, NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMinorVersion, 0, NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgLayoutMagic, LOWGLO_LAYOUT_MAGIC, NULL);
+
+       // check the constant values in lowGlo
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemQ, ((uint64_t) & (pmap_object_store.memq)), NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPageOffset, offsetof(struct vm_page_with_ppnum, vmp_phys_page), NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemChainOffset, offsetof(struct vm_page, vmp_listq), NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPagesize, sizeof(struct vm_page), NULL);
+
+#if defined(__arm64__)
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemFromArrayMask, VM_PACKED_FROM_VM_PAGES_ARRAY, NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPackedShift, VM_PACKED_POINTER_SHIFT, NULL);
+       T_ASSERT_EQ_ULONG(lowGlo.lgPmapMemPackedBaseAddr, VM_MIN_KERNEL_AND_KEXT_ADDRESS, NULL);
+#endif
+
+       vm_object_lock_shared(&pmap_object_store);
+       astris_vm_page_queue_iterate(lowGlo.lgPmapMemQ, p)
+       {
+               ppnum_t ppnum   = astris_vm_page_get_phys_page(p);
+               pmap_paddr_t pa = (pmap_paddr_t)astris_ptoa(ppnum);
+               T_ASSERT_GE_ULONG(pa, gPhysBase, NULL);
+               T_ASSERT_LT_ULONG(pa, gPhysBase + gPhysSize, NULL);
+               iter++;
+               T_ASSERT_LT_INT(iter, MAX_PMAP_OBJECT_ELEMENT, NULL);
+       }
+       vm_object_unlock(&pmap_object_store);
+
+       T_ASSERT_GT_INT(iter, 0, NULL);
+       return KERN_SUCCESS;
+}
+#endif
diff --git a/osfmk/tests/ktest.c b/osfmk/tests/ktest.c
new file mode 100644 (file)
index 0000000..14dcb69
--- /dev/null
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <tests/ktest.h>
+#include <tests/ktest_internal.h>
+#include <mach/mach_time.h>
+#include <kern/misc_protos.h>
+
+void
+ktest_start(void) {
+       ktest_emit_start();
+}
+
+void
+ktest_finish(void) {
+       ktest_emit_finish();
+}
+
+void
+ktest_testbegin(const char * test_name) {
+       ktest_current_time = mach_absolute_time();
+       ktest_test_name = test_name;
+       ktest_emit_testbegin(test_name);
+}
+
+void
+ktest_testend() {
+       ktest_current_time = mach_absolute_time();
+       ktest_emit_testend();
+       ktest_test_index++;
+}
+
+void
+ktest_testskip(const char * msg, ...) {
+       va_list args;
+
+       ktest_current_time = mach_absolute_time();
+
+       va_start(args, msg);
+       ktest_emit_testskip(msg, args);
+       va_end(args);
+
+}
+
+void
+ktest_log(const char * msg, ...) {
+       va_list args;
+
+       ktest_current_time = mach_absolute_time();
+
+       va_start(args, msg);
+       ktest_emit_log(msg, args);
+       va_end(args);
+}
+
+void
+ktest_perf(const char * metric, const char * unit, double value, const char * desc)
+{
+       ktest_current_time = mach_absolute_time();
+       ktest_emit_perfdata(metric, unit, value, desc);
+}
+
+void
+ktest_testcase(int success)
+{
+       ktest_current_time = mach_absolute_time();
+
+       if(success && !ktest_expectfail) {
+               /* PASS */
+               ktest_passcount++;
+               ktest_testcase_result = T_RESULT_PASS;
+       } else if(!success && !ktest_expectfail) {
+               /* FAIL */
+               ktest_failcount++;
+               ktest_testcase_result = T_RESULT_FAIL;
+       } else if(success && ktest_expectfail) {
+               /* UXPASS */
+               ktest_xpasscount++;
+               ktest_testcase_result = T_RESULT_UXPASS;
+       } else if(!success && ktest_expectfail) {
+               /* XFAIL */
+               ktest_xfailcount++;
+               ktest_testcase_result = T_RESULT_XFAIL;
+       }
+
+       ktest_update_test_result_state();
+       if(ktest_quiet == 0 ||
+          ktest_testcase_result == T_RESULT_FAIL ||
+          ktest_testcase_result == T_RESULT_UXPASS) {
+               ktest_emit_testcase();
+       }
+       ktest_expression_index++;
+
+       ktest_quiet = 0;
+       ktest_expectfail = 0;
+       ktest_output_buf[0] = '\0';
+       ktest_current_msg[0] = '\0';
+       ktest_current_expr[0] = '\0';
+       for(int i = 0; i < KTEST_MAXVARS; i++) {
+               ktest_current_var_names[i][0] = '\0';
+               ktest_current_var_values[i][0] = '\0';
+       }
+       ktest_current_var_index = 0;
+}
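+
+/*
+ * For reference, the outcome mapping implemented above:
+ *
+ *   success  expectfail  testcase_result
+ *   yes      no          T_RESULT_PASS
+ *   no       no          T_RESULT_FAIL
+ *   yes      yes         T_RESULT_UXPASS (unexpected pass)
+ *   no       yes         T_RESULT_XFAIL  (expected failure)
+ */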
+
+void
+ktest_update_test_result_state(void) {
+       ktest_test_result = ktest_test_result_statetab[ktest_test_result]
+                                                     [ktest_testcase_result]
+                                                     [ktest_testcase_mode];
+}
+
+void
+ktest_assertion_check(void) {
+       if (ktest_testcase_result == T_RESULT_FAIL || ktest_testcase_result == T_RESULT_UXPASS) {
+               ktest_testend();
+               panic("XNUPOST: Assertion failed : %s : at %s:%d", ktest_test_name, ktest_current_file, ktest_current_line);
+       }
+}
diff --git a/osfmk/tests/ktest.h b/osfmk/tests/ktest.h
new file mode 100644 (file)
index 0000000..4cc95c3
--- /dev/null
@@ -0,0 +1,646 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _TESTS_KTEST_H
+#define _TESTS_KTEST_H
+
+/* Symbol name prefix */
+#define T_SYM(sym) ktest_ ## sym
+
+#include <stdarg.h>
+
+extern unsigned int T_SYM(current_line);
+extern const char * T_SYM(current_file);
+extern const char * T_SYM(current_func);
+extern int T_SYM(testcase_mode);
+extern int T_SYM(testcase_result);
+extern int T_SYM(test_result);
+extern int T_SYM(quiet);
+
+void T_SYM(start)(void);
+void T_SYM(finish)(void);
+void T_SYM(testbegin)(const char * test_name);
+void T_SYM(testend)(void);
+void T_SYM(testskip)(const char * msg, ...);
+void T_SYM(testcase)(int expr);
+void T_SYM(log)(const char * msg, ...);
+void T_SYM(perf)(const char * metric, const char * unit, double value, const char * desc);
+void T_SYM(update_test_result_state)(void);
+void T_SYM(assertion_check)(void);
+
+void T_SYM(set_current_msg)(const char * msg, ...);
+void T_SYM(set_current_expr)(const char * expr_fmt, ...);
+void T_SYM(set_current_var)(const char * name, const char * value_fmt, ...);
+
+typedef union {
+    char _char;
+    unsigned char _uchar;
+
+    short _short;
+    unsigned short _ushort;
+
+    int _int;
+    unsigned int _uint;
+
+    long _long;
+    unsigned long _ulong;
+
+    long long _llong;
+    unsigned long long _ullong;
+
+    float _float;
+
+    double _double;
+
+    long double _ldouble;
+
+    void* _ptr;
+} T_SYM(temp);
+
+extern T_SYM(temp) T_SYM(temp1), T_SYM(temp2), T_SYM(temp3);
+
+#define T_SUCCESS 1
+#define T_FAILURE 0
+
+/* Testcase modes */
+#define T_MAIN 0
+#define T_SETUP 1
+
+/* Testcase result states */
+#define T_RESULT_PASS 0
+#define T_RESULT_FAIL 1
+#define T_RESULT_UXPASS 2
+#define T_RESULT_XFAIL 3
+
+/* Test result states */
+#define T_STATE_UNRESOLVED 0
+#define T_STATE_PASS 1
+#define T_STATE_FAIL 2
+#define T_STATE_SETUPFAIL 3
+
+/*
+ * Helpers
+ */
+
+#define T_TOSTRING_HELPER(x) #x
+#define T_TOSTRING(x) T_TOSTRING_HELPER(x)
+
+#define T_SAVEINFO do {\
+       T_SYM(current_line) = __LINE__;\
+       T_SYM(current_func) = (const char *)__func__;\
+       T_SYM(current_file) = (const char *)__FILE__;\
+} while(0)
+
+#define T_SET_AUX_VARS do {\
+       /* Only used in userspace lib for now */\
+} while(0)
+
+#define T_ASSERTION_CHECK do {\
+       T_SYM(assertion_check)();\
+} while(0)
+
+#define T_EXPECT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1).type = (lhs);\
+       T_SYM(temp2).type = (rhs);\
+       T_SYM(set_current_expr)(T_TOSTRING(lhs) " "\
+                               T_TOSTRING(cmp) " "\
+                               T_TOSTRING(rhs));\
+       T_SYM(set_current_var)(T_TOSTRING(lhs), fmt, T_SYM(temp1).type);\
+       T_SYM(set_current_var)(T_TOSTRING(rhs), fmt, T_SYM(temp2).type);\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_SYM(temp1).type cmp T_SYM(temp2).type);\
+} while(0)
+
+#define T_ASSERT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ...) do {\
+       T_EXPECT_BLOCK2(type, fmt, cmp, lhs, rhs, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
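+
+/*
+ * Usage sketch (illustrative, not from the original header): the typed
+ * wrappers further below expand through these blocks, so e.g.
+ *
+ *     T_ASSERT_EQ_INT(retval, 0, "call %d returned zero", call_no);
+ *
+ * evaluates both sides once into ktest_temp1/ktest_temp2, records the
+ * expression text and printable values, passes the comparison result to
+ * ktest_testcase(), and then panics via T_ASSERTION_CHECK if the testcase
+ * result is a FAIL or UXPASS.
+ */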
+
+/*
+ * Core functions
+ */
+
+/* Denotes start of testing. All prior output is ignored. */
+#define T_START do {\
+       T_SAVEINFO;\
+       T_SYM(start)();\
+} while(0)
+
+/* Denotes end of testing. All subsequent output is ignored. */
+#define T_FINISH do {\
+       T_SAVEINFO;\
+       T_SYM(finish)();\
+} while(0)
+
+/* Denotes beginning of a test. */
+#define T_BEGIN(name) do {\
+       T_SAVEINFO;\
+       T_SYM(testbegin)(name);\
+} while(0)
+
+/* Denotes end of current test. */
+#define T_END do {\
+       T_SAVEINFO;\
+       T_SYM(testend)();\
+} while(0)
+
+/* Denotes beginning of a setup section of the current test. */
+#define T_SETUPBEGIN do {\
+       T_SYM(testcase_mode) = T_SETUP;\
+} while(0)
+
+/* Denotes end of the current setup section of the current test. */
+#define T_SETUPEND do {\
+       T_SYM(testcase_mode) = T_MAIN;\
+} while(0)
+
+/* Denotes end of current test because test was skipped. */
+#define T_SKIP(msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(testskip)(msg, ## __VA_ARGS__);\
+} while(0)
+
+/* Returns result of latest testrun. */
+#define T_TESTRESULT (T_SYM(test_result))
+
+/* Return result of latest testcase. */
+#define T_TESTCASERESULT (T_SYM(testcase_result))
+
+/* Flags the next testcase as expected failure. */
+#define T_EXPECTFAIL do {\
+       T_SYM(expectfail) = 1;\
+} while(0)
+
+/* Only emit output for next testcase if it is a FAIL or UXPASS. */
+#define T_QUIET do {\
+       T_SYM(quiet) = 1;\
+} while(0)
+
+/* Logs a message. */
+#define T_LOG(msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(log)(msg, ## __VA_ARGS__);\
+} while(0)
+
+/* Explicit pass. */
+#define T_PASS(msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_SUCCESS);\
+} while(0)
+
+/* Explicit fail. */
+#define T_FAIL(msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_FAILURE);\
+} while(0)
+
+/* Explicit assert fail. */
+#define T_ASSERT_FAIL(msg, ...) do {\
+       T_SAVEINFO;\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_FAILURE);\
+       T_SYM(assertion_fail)();\
+} while(0)
+
+/* Generic expect. */
+#define T_EXPECT(expr, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1)._int = (int)(!!(expr));\
+       T_SYM(set_current_expr)(T_TOSTRING(expr));\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_SYM(temp1)._int);\
+} while(0)
+
+/* Generic assert. */
+#define T_ASSERT(expr, msg, ...) do {\
+       T_EXPECT(expr, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
+
+/*
+ * Convenience functions
+ */
+
+/* null */
+
+#define T_EXPECT_NOTNULL(expr, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1)._int = (int)(!!(expr));\
+       T_SYM(set_current_expr)(T_TOSTRING(expr) " != NULL");\
+       T_SYM(set_current_var)(T_TOSTRING(expr),\
+                              "%s",\
+                              T_SYM(temp1)._int ? "<NOTNULL>" : "NULL");\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_SYM(temp1)._int);\
+} while(0)
+
+#define T_EXPECT_NULL(expr, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1)._int = (int)(!(expr));\
+       T_SYM(set_current_expr)(T_TOSTRING(expr) " == NULL");\
+       T_SYM(set_current_var)(T_TOSTRING(expr),\
+                              "%s",\
+                              T_SYM(temp1)._int ? "NULL" : "<NOTNULL>");\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(T_SYM(temp1)._int);\
+} while(0)
+
+#define T_ASSERT_NOTNULL(expr, msg, ...) do {\
+       T_EXPECT_NOTNULL(expr, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
+
+#define T_ASSERT_NULL(expr, msg, ...) do {\
+       T_EXPECT_NULL(expr, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
+
+/* string */
+
+// TODO: check/truncate inputs
+#define T_EXPECT_EQ_STR(lhs, rhs, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1)._ptr = (lhs);\
+       T_SYM(temp2)._ptr = (rhs);\
+       T_SYM(set_current_expr)(T_TOSTRING(lhs) " == " T_TOSTRING(rhs));\
+       T_SYM(set_current_var)(T_TOSTRING(lhs), "%s", T_SYM(temp1)._ptr);\
+       T_SYM(set_current_var)(T_TOSTRING(rhs), "%s", T_SYM(temp2)._ptr);\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(strcmp(T_SYM(temp1)._ptr, T_SYM(temp2)._ptr) == 0);\
+} while(0)
+
+#define T_EXPECT_NE_STR(lhs, rhs, msg, ...) do {\
+       T_SAVEINFO;\
+       T_SYM(temp1)._ptr = (lhs);\
+       T_SYM(temp2)._ptr = (rhs);\
+       T_SYM(set_current_expr)(T_TOSTRING(lhs) " != " T_TOSTRING(rhs));\
+       T_SYM(set_current_var)(T_TOSTRING(lhs), "%s", T_SYM(temp1)._ptr);\
+       T_SYM(set_current_var)(T_TOSTRING(rhs), "%s", T_SYM(temp2)._ptr);\
+       T_SET_AUX_VARS;\
+       T_SYM(set_current_msg)(msg, ## __VA_ARGS__);\
+       T_SYM(testcase)(strcmp(T_SYM(temp1)._ptr, T_SYM(temp2)._ptr) != 0);\
+} while(0)
+
+#define T_ASSERT_EQ_STR(lhs, rhs, msg, ...) do {\
+       T_EXPECT_EQ_STR(lhs, rhs, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
+
+#define T_ASSERT_NE_STR(lhs, rhs, msg, ...) do {\
+       T_EXPECT_NE_STR(lhs, rhs, msg, ## __VA_ARGS__);\
+       T_ASSERTION_CHECK;\
+} while(0)
+
+/* char */
+
+#define T_EXPECT_EQ_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_CHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_char, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_CHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_char, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* unsigned char */
+
+#define T_EXPECT_EQ_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_UCHAR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uchar, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_UCHAR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uchar, "%c", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* short */
+
+#define T_EXPECT_EQ_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_SHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_short, "%hi", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_SHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_short, "%hi", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* unsigned short */
+
+#define T_EXPECT_EQ_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_USHORT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ushort, "%hu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_USHORT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ushort, "%hu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* int */
+
+#define T_EXPECT_EQ_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_INT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_int, "%d", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_INT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_int, "%d", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* unsigned int */
+
+#define T_EXPECT_EQ_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_UINT(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_uint, "%u", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_UINT(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_uint, "%u", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* long */
+
+#define T_EXPECT_EQ_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_LONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_long, "%li", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_LONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_long, "%li", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* unsigned long */
+
+#define T_EXPECT_EQ_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_ULONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ulong, "%lu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_ULONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ulong, "%lu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* long long */
+
+#define T_EXPECT_EQ_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_LLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_llong, "%lli", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_LLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_llong, "%lli", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* unsigned long long */
+
+#define T_EXPECT_EQ_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_ULLONG(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ullong, "%llu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_ULLONG(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ullong, "%llu", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/* pointer */
+
+#define T_EXPECT_EQ_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_NE_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LT_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GT_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_LE_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_EXPECT_GE_PTR(lhs, rhs, msg, ...)\
+       T_EXPECT_BLOCK2(_ptr, "%p", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+#define T_ASSERT_EQ_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", ==, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_NE_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", !=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LT_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", <, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GT_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", >, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_LE_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", <=, lhs, rhs, msg, ## __VA_ARGS__)
+#define T_ASSERT_GE_PTR(lhs, rhs, msg, ...)\
+       T_ASSERT_BLOCK2(_ptr, "%p", >=, lhs, rhs, msg, ## __VA_ARGS__)
+
+/*
+ * Log a perfdata measurement. For example:
+ * T_PERF("name_of_metric", 3234, "nsec", "time since first test run")
+ */
+#define T_PERF(metric, value, unit, desc) \
+       do {                                              \
+               T_SAVEINFO;                               \
+               T_SYM(perf)(metric, unit, value, desc);   \
+       } while (0)
+
+#endif /* _TESTS_KTEST_H */
diff --git a/osfmk/tests/ktest_accessor.c b/osfmk/tests/ktest_accessor.c
new file mode 100644 (file)
index 0000000..ab660d5
--- /dev/null
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <tests/ktest_internal.h>
+#include <kern/misc_protos.h>
+#include <kern/debug.h>
+
+int vsnprintf(char *, size_t, const char *, va_list);
+
+void
+ktest_set_current_expr(const char * expr_fmt, ...) {
+       int ret;
+       va_list args;
+
+       va_start(args, expr_fmt);
+       ret = vsnprintf(ktest_current_expr, KTEST_MAXLEN, expr_fmt, args);
+       va_end(args);
+}
+
+void
+ktest_set_current_var(const char * name, const char * value_fmt, ...) {
+       int ret;
+       va_list args;
+
+       if(ktest_current_var_index >= KTEST_MAXVARS) {
+               panic("Internal ktest error in %s", __func__);
+       }
+
+       strlcpy(ktest_current_var_names[ktest_current_var_index],
+                       name,
+                       KTEST_MAXLEN);
+
+       va_start(args, value_fmt);
+       ret = vsnprintf(ktest_current_var_values[ktest_current_var_index],
+                       KTEST_MAXLEN,
+                       value_fmt,
+                       args);
+       va_end(args);
+
+       ktest_current_var_index++;
+}
+
+void
+ktest_set_current_msg(const char * msg, ...) {
+       int ret;
+       va_list args;
+
+       if(msg == NULL) return;
+
+       va_start(args, msg);
+       ret = vsnprintf(ktest_current_msg, KTEST_MAXLEN, msg, args);
+       va_end(args);
+}
+
diff --git a/osfmk/tests/ktest_emit.c b/osfmk/tests/ktest_emit.c
new file mode 100644 (file)
index 0000000..088dd38
--- /dev/null
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <tests/ktest_internal.h>
+#include <kern/misc_protos.h>
+#include <kern/debug.h>
+
+#define EMIT(buf,size) do { \
+       console_write(buf, size); \
+       } while(0)
+
+/* TODO: intelligently truncate messages if possible */
+#define BOUNDS_CHECK_AND_UPDATE(ret, size) do {\
+       if(ret < 0 || ret >= size) {\
+               panic("Internal ktest error in %s", __func__);\
+       }\
+       size -= ret;\
+       msg += ret;\
+} while(0)
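+
+/*
+ * Emit pattern used by the helpers below (illustrative sketch): each one
+ * formats into ktest_output_buf piecewise, advancing `msg` and shrinking
+ * `size` after every snprintf/vsnprintf via BOUNDS_CHECK_AND_UPDATE, then
+ * writes the accumulated bytes to the console:
+ *
+ *     ret = snprintf(msg, size, "[KTEST]\theader...");
+ *     BOUNDS_CHECK_AND_UPDATE(ret, size);
+ *     ret = vsnprintf(msg, size, fmt, args);
+ *     BOUNDS_CHECK_AND_UPDATE(ret, size);
+ *     EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+ */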
+
+int vsnprintf(char *, size_t, const char *, va_list);
+
+void
+ktest_emit_start(void) {
+       char str[] = "\n[KTEST]\tSTART\t" KTEST_VERSION_STR "\n";
+       EMIT((char *)&str[0], sizeof(str)-1);
+}
+
+void
+ktest_emit_finish(void) {
+       char str[] = "\n[KTEST]\tFINISH\n";
+       EMIT((char *)&str[0], sizeof(str)-1);
+}
+
+void
+ktest_emit_testbegin(const char * test_name) {
+       char * msg = ktest_output_buf;
+       int size = sizeof(ktest_output_buf);
+       int ret;
+
+       /* left trim the file path for readability */
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg,
+                      size,
+                      "\n[KTEST]\t"      /* header */
+                      "TESTBEGIN\t"    /* type */
+                      "%lld\t"         /* time */
+                      "%d\t"           /* index */
+                      "%s\t"           /* file */
+                      "%d\t"           /* line */
+                      "%s\n",          /* name */
+                      ktest_current_time,
+                      ktest_test_index,
+                      fname,
+                      ktest_current_line,
+                      test_name);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+}
+
+void
+ktest_emit_testskip(const char * skip_msg, va_list args) {
+       char * msg = ktest_output_buf;
+       int size = sizeof(ktest_output_buf);
+       int ret;
+
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg,
+                      size,
+                      "\n[KTEST]\t"     /* header */
+                      "TESTSKIP\t"    /* type */
+                      "%lld\t"        /* time */
+                      "%s\t"          /* file */
+                      "%d\t",         /* line */
+                      ktest_current_time,
+                      fname,
+                      ktest_current_line);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = vsnprintf(msg, size, skip_msg, args);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = snprintf(msg, size, "\n");
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+
+}
+
+void
+ktest_emit_testend(void) {
+       char * msg = ktest_output_buf;
+       int size = sizeof(ktest_output_buf);
+       int ret;
+
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg,
+                      size,
+                      "\n[KTEST]\t"      /* header */
+                      "TESTEND\t"      /* type */
+                      "%lld\t"         /* time */
+                      "%d\t"           /* index */
+                      "%s\t"           /* file */
+                      "%d\t"           /* line */
+                      "%s\n",          /* name */
+                      ktest_current_time,
+                      ktest_test_index,
+                      fname,
+                      ktest_current_line,
+                      ktest_test_name);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+
+}
+
+void
+ktest_emit_log(const char * log_msg, va_list args) {
+       char * msg = ktest_output_buf;
+       int size = sizeof(ktest_output_buf);
+       int ret;
+
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg,
+                      size,
+                      "\n[KTEST]\t" /* header */
+                      "LOG\t"     /* type */
+                      "%lld\t"    /* time */
+                      "%s\t"      /* file */
+                      "%d\t",     /* line */
+                      ktest_current_time,
+                      fname,
+                      ktest_current_line);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = vsnprintf(msg, size, log_msg, args);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = snprintf(msg, size, "\n");
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+
+}
+
+void
+ktest_emit_perfdata(const char * metric, const char * unit, double value, const char * desc)
+{
+       static const char * perfstr = "%s\t%lld\t%s\t\"%s\"";
+       char * msg = ktest_output_buf;
+       int64_t print_value = (int64_t)value;
+       int size   = sizeof(ktest_output_buf);
+       int ret;
+
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg, size,
+                      "\n[KTEST]\t" /* header */
+                      "PERF\t"    /* type */
+                      "%lld\t"    /* time */
+                      "%s\t"      /* file */
+                      "%d\t",     /* line */
+                      ktest_current_time,
+                      fname,
+                      ktest_current_line);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = snprintf(msg, size, perfstr, metric, print_value, unit, desc);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       ret = snprintf(msg, size, "\n");
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+
+}
+
+void
+ktest_emit_testcase(void) {
+       char * msg = ktest_output_buf;
+       int size = sizeof(ktest_output_buf);
+       int ret;
+
+       char *fname = strnstr((char *)(uintptr_t)ktest_current_file, "xnu", 100);
+
+       ret = snprintf(msg,
+                      size,
+                      "\n[KTEST]\t" /* header */
+                      "%s\t"      /* type */
+                      "%lld\t"    /* time */
+                      "%d\t"      /* index */
+                      "%s\t"      /* file */
+                      "%d\t"      /* line */
+                      "%s\t"      /* message */
+                      "%s",       /* current_expr */
+                      ktest_testcase_result_tokens[ktest_testcase_mode]
+                                                  [ktest_testcase_result],
+                      ktest_current_time,
+                      ktest_expression_index,
+                      fname,
+                      ktest_current_line,
+                      ktest_current_msg,
+                      ktest_current_expr);
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       for(int i = 0; ktest_current_var_names[i][0]; i++) {
+               ret = snprintf(msg,
+                              size,
+                              "\t%s\t%s",
+                              ktest_current_var_names[i],
+                              ktest_current_var_values[i]);
+               BOUNDS_CHECK_AND_UPDATE(ret, size);
+       }
+
+       ret = snprintf(msg, size, "\n");
+       BOUNDS_CHECK_AND_UPDATE(ret, size);
+
+       EMIT(ktest_output_buf, (int)(msg - ktest_output_buf));
+}
diff --git a/osfmk/tests/ktest_global.c b/osfmk/tests/ktest_global.c
new file mode 100644 (file)
index 0000000..eee7e0a
--- /dev/null
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <tests/ktest.h>
+#include <stdint.h>
+#include <kern/misc_protos.h>
+#include <tests/ktest_internal.h>
+
+unsigned int ktest_current_line = 0;
+const char * ktest_current_file = NULL;
+const char * ktest_current_func = NULL;
+uint64_t ktest_current_time = 0;
+
+const char * ktest_test_name = "";
+
+char ktest_current_msg[KTEST_MAXLEN] = "";
+char ktest_current_expr[KTEST_MAXOUTLEN] = "";
+char ktest_current_var_names[KTEST_MAXVARS][KTEST_MAXLEN] = { "", "", ""  };
+char ktest_current_var_values[KTEST_MAXVARS][KTEST_MAXLEN] = { "", "", "" };
+unsigned int ktest_expression_index = 0;
+unsigned int ktest_current_var_index = 0;
+unsigned int ktest_test_index = 0;
+unsigned int ktest_passcount = 0;
+unsigned int ktest_failcount = 0;
+unsigned int ktest_xpasscount = 0;
+unsigned int ktest_xfailcount = 0;
+int ktest_expectfail = 0;
+int ktest_quiet = 0;
+
+int ktest_testcase_result = T_RESULT_FAIL;
+int ktest_test_result = T_STATE_UNRESOLVED;
+int ktest_testcase_mode = T_MAIN;
+
+ktest_temp ktest_temp1, ktest_temp2, ktest_temp3;
+
+char ktest_output_buf[KTEST_MAXLEN] = "";
+
+int
+ktest_test_result_statetab[KTEST_NUM_TEST_STATES]
+                         [KTEST_NUM_TESTCASE_STATES]
+                         [KTEST_NUM_TESTCASE_MODES] = {
+       [T_STATE_UNRESOLVED][T_RESULT_PASS][T_MAIN] = T_STATE_PASS,
+       [T_STATE_UNRESOLVED][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_UNRESOLVED][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_UNRESOLVED][T_RESULT_XFAIL][T_MAIN] = T_STATE_PASS,
+
+       [T_STATE_PASS][T_RESULT_PASS][T_MAIN] = T_STATE_PASS,
+       [T_STATE_PASS][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_PASS][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_PASS][T_RESULT_XFAIL][T_MAIN] = T_STATE_PASS,
+
+       [T_STATE_FAIL][T_RESULT_PASS][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_FAIL][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_UXPASS][T_MAIN] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_XFAIL][T_MAIN] = T_STATE_FAIL,
+
+       [T_STATE_SETUPFAIL][T_RESULT_PASS][T_MAIN] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_FAIL][T_MAIN] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_UXPASS][T_MAIN] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_XFAIL][T_MAIN] = T_STATE_SETUPFAIL,
+
+       [T_STATE_UNRESOLVED][T_RESULT_PASS][T_SETUP] = T_STATE_UNRESOLVED,
+       [T_STATE_UNRESOLVED][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_UNRESOLVED][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_UNRESOLVED][T_RESULT_XFAIL][T_SETUP] = T_STATE_UNRESOLVED,
+
+       [T_STATE_PASS][T_RESULT_PASS][T_SETUP] = T_STATE_PASS,
+       [T_STATE_PASS][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_PASS][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_PASS][T_RESULT_XFAIL][T_SETUP] = T_STATE_PASS,
+
+       [T_STATE_FAIL][T_RESULT_PASS][T_SETUP] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_FAIL][T_SETUP] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_UXPASS][T_SETUP] = T_STATE_FAIL,
+       [T_STATE_FAIL][T_RESULT_XFAIL][T_SETUP] = T_STATE_FAIL,
+
+       [T_STATE_SETUPFAIL][T_RESULT_PASS][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_FAIL][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_UXPASS][T_SETUP] = T_STATE_SETUPFAIL,
+       [T_STATE_SETUPFAIL][T_RESULT_XFAIL][T_SETUP] = T_STATE_SETUPFAIL,
+};
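+
+/*
+ * The table above folds each testcase result into the overall test state:
+ * a FAIL (or an unexpected pass, UXPASS) makes the test FAIL permanently,
+ * an expected failure (XFAIL) counts as passing, and any failure while in
+ * T_SETUP mode latches the SETUPFAIL state.
+ */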
+
+const char * ktest_testcase_result_tokens[KTEST_NUM_TESTCASE_MODES]
+                                        [KTEST_NUM_TESTCASE_STATES] = {
+       [T_MAIN][T_RESULT_PASS] = "PASS",
+       [T_MAIN][T_RESULT_FAIL] = "FAIL",
+       [T_MAIN][T_RESULT_UXPASS] = "UXPASS",
+       [T_MAIN][T_RESULT_XFAIL] = "XFAIL",
+       [T_SETUP][T_RESULT_PASS] = "SETUP_PASS",
+       [T_SETUP][T_RESULT_FAIL] = "SETUP_FAIL",
+       [T_SETUP][T_RESULT_UXPASS] = "SETUP_UXPASS",
+       [T_SETUP][T_RESULT_XFAIL] = "SETUP_XFAIL",
+};
+
diff --git a/osfmk/tests/ktest_internal.h b/osfmk/tests/ktest_internal.h
new file mode 100644 (file)
index 0000000..bf82d45
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _TESTS_KTEST_INTERNAL_H
+#define _TESTS_KTEST_INTERNAL_H
+
+#include <tests/ktest.h>
+#include <stdint.h>
+
+#define KTEST_VERSION 1
+#define KTEST_VERSION_STR T_TOSTRING(KTEST_VERSION)
+
+#define KTEST_MAXLEN 1024
+#define KTEST_MAXOUTLEN 4096
+#define KTEST_MAXVARS 3
+
+#define KTEST_NUM_TESTCASE_MODES 2
+#define KTEST_NUM_TESTCASE_STATES 4
+#define KTEST_NUM_TEST_STATES 4
+
+extern unsigned int ktest_current_line;
+extern const char * ktest_current_file;
+extern const char * ktest_current_func;
+extern uint64_t ktest_current_time;
+
+extern const char * ktest_test_name;
+
+extern char ktest_current_msg[KTEST_MAXLEN];
+extern char ktest_current_expr[KTEST_MAXOUTLEN];
+extern char ktest_current_var_names[KTEST_MAXVARS][KTEST_MAXLEN];
+extern char ktest_current_var_values[KTEST_MAXVARS][KTEST_MAXLEN];
+extern unsigned int ktest_expression_index;
+extern unsigned int ktest_current_var_index;
+extern unsigned int ktest_test_index;
+extern unsigned int ktest_passcount;
+extern unsigned int ktest_failcount;
+extern unsigned int ktest_xpasscount;
+extern unsigned int ktest_xfailcount;
+extern int ktest_expectfail;
+
+extern int ktest_testcase_result;
+extern int ktest_test_result;
+extern int ktest_testcase_mode;
+
+extern ktest_temp ktest_temp1, ktest_temp2, ktest_temp3;
+
+extern char ktest_output_buf[KTEST_MAXLEN];
+
+extern int ktest_test_result_statetab[KTEST_NUM_TEST_STATES]
+                                    [KTEST_NUM_TESTCASE_STATES]
+                                    [KTEST_NUM_TESTCASE_MODES];
+
+extern const char * ktest_testcase_result_tokens[KTEST_NUM_TESTCASE_MODES]
+                                               [KTEST_NUM_TESTCASE_STATES];
+
+
+void ktest_emit_start(void);
+void ktest_emit_finish(void);
+void ktest_emit_testbegin(const char * test_name);
+void ktest_emit_testskip(const char * skip_msg, va_list args);
+void ktest_emit_testend(void);
+void ktest_emit_log(const char * log_msg, va_list args);
+void ktest_emit_perfdata(const char * metric, const char * unit, double value, const char * desc);
+void ktest_emit_testcase(void);
+
+#endif /* _TESTS_KTEST_INTERNAL_H */
+
diff --git a/osfmk/tests/pmap_tests.c b/osfmk/tests/pmap_tests.c
new file mode 100644 (file)
index 0000000..d0a1164
--- /dev/null
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ * 
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ * 
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ * 
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ * 
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <vm/vm_page.h>
+#include <vm/pmap.h>
+#include <kern/ledger.h>
+#include <kern/thread.h>
+
+
+extern ledger_template_t task_ledger_template;
+
+kern_return_t test_pmap_enter_disconnect(unsigned int num_loops);
+kern_return_t test_pmap_iommu_disconnect(void);
+
+#define PMAP_TEST_VA (0xDEAD << PAGE_SHIFT)
+
+typedef struct {
+       pmap_t pmap;
+       volatile boolean_t stop;
+       ppnum_t pn;
+} pmap_test_thread_args;
+
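+/*
+ * Create a throwaway pmap for testing, backed by a ledger instantiated from
+ * the task ledger template.  The local ledger reference is dropped once
+ * pmap_create() has (presumably) taken its own reference.
+ */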
+static pmap_t
+pmap_create_wrapper(void) {
+       pmap_t new_pmap = NULL;
+       ledger_t ledger;
+       assert(task_ledger_template != NULL);
+       if ((ledger = ledger_instantiate(task_ledger_template, LEDGER_CREATE_ACTIVE_ENTRIES)) == NULL)
+               return NULL;
+       new_pmap = pmap_create(ledger, 0, FALSE);
+       ledger_dereference(ledger);
+       return new_pmap;
+}
+
+static void
+pmap_disconnect_thread(void *arg, wait_result_t __unused wres) {
+       pmap_test_thread_args *args = arg;
+       do {
+               pmap_disconnect(args->pn);
+       } while (!args->stop);
+       thread_wakeup((event_t)args);
+}
+
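+/*
+ * Race pmap_enter() against pmap_disconnect(): a helper thread repeatedly
+ * disconnects the test page while the main thread re-enters the same
+ * VA/physical page num_loops times, after which the mapping, the page and
+ * the pmap are torn down.
+ */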
+kern_return_t
+test_pmap_enter_disconnect(unsigned int num_loops)
+{
+       kern_return_t kr = KERN_SUCCESS;
+       thread_t disconnect_thread;
+       pmap_t new_pmap = pmap_create_wrapper();
+       if (new_pmap == NULL)
+               return KERN_FAILURE;
+       vm_page_t m = vm_page_grab();
+       if (m == VM_PAGE_NULL) {
+               pmap_destroy(new_pmap);
+               return KERN_FAILURE;
+       }
+       ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
+       pmap_test_thread_args args = {new_pmap, FALSE, phys_page};
+       kern_return_t res = kernel_thread_start(pmap_disconnect_thread, &args, &disconnect_thread);
+       if (res) {
+               pmap_destroy(new_pmap);
+               vm_page_lock_queues();
+               vm_page_free(m);
+               vm_page_unlock_queues();
+               return res;
+       }
+       thread_deallocate(disconnect_thread);
+
+       while (num_loops-- != 0) {
+               kr = pmap_enter(new_pmap, PMAP_TEST_VA, phys_page,
+                               VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, FALSE);
+               assert(kr == KERN_SUCCESS);
+       }
+
+       assert_wait((event_t)&args, THREAD_UNINT);
+       args.stop = TRUE;
+       thread_block(THREAD_CONTINUE_NULL);
+
+       pmap_remove(new_pmap, PMAP_TEST_VA, PMAP_TEST_VA + PAGE_SIZE);
+       vm_page_lock_queues();
+       vm_page_free(m);
+       vm_page_unlock_queues();
+       pmap_destroy(new_pmap);
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+test_pmap_iommu_disconnect(void)
+{
+       return KERN_SUCCESS;
+}
+
diff --git a/osfmk/tests/test_thread_call.c b/osfmk/tests/test_thread_call.c
new file mode 100644 (file)
index 0000000..ad37023
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2017 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#if !(DEVELOPMENT || DEBUG)
+#error "Testing is not enabled on RELEASE configurations"
+#endif
+
+#include <tests/xnupost.h>
+#include <kern/thread_call.h>
+#include <kern/locks.h>
+#include <kern/sched_prim.h>
+
+kern_return_t test_thread_call(void);
+
+lck_grp_t test_lock_grp;
+lck_mtx_t test_lock;
+
+typedef enum {
+       TEST_ARG1 = 0x1234,
+       TEST_ARG2 = 0x3456,
+} test_param;
+
+int wait_for_callback;
+int wait_for_main;
+
+int once_callback_counter = 0;
+
+static void
+test_once_callback(thread_call_param_t param0,
+                   thread_call_param_t param1)
+{
+       T_ASSERT_EQ_INT((test_param)param0, TEST_ARG1, "param0 is correct");
+       T_ASSERT_EQ_INT((test_param)param1, TEST_ARG2, "param1 is correct");
+
+       once_callback_counter++;
+
+       T_ASSERT_EQ_INT(once_callback_counter, 1, "only one callback");
+
+       lck_mtx_lock(&test_lock);
+
+       thread_wakeup(&wait_for_callback);
+
+       uint64_t deadline;
+       clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline);
+
+       kern_return_t kr;
+       /* wait for the main thread to finish, time out after 10s */
+       kr = lck_mtx_sleep_deadline(&test_lock, LCK_SLEEP_DEFAULT, &wait_for_main, THREAD_UNINT, deadline);
+       T_ASSERT_EQ_INT(kr, THREAD_AWAKENED, "callback woken by main function");
+
+       lck_mtx_unlock(&test_lock);
+
+       /* sleep for 1s to let the main thread begin the cancel and wait */
+       delay_for_interval(1, NSEC_PER_SEC);
+}
+
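+/*
+ * Exercise a THREAD_CALL_OPTIONS_ONCE call whose callback is deliberately
+ * kept blocked: while it runs, thread_call_free() must fail and the
+ * cancel/enter results asserted below must hold; once the callback is woken,
+ * thread_call_cancel_wait() must succeed and the call can be freed.
+ */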
+static void
+test_once_thread_call(void)
+{
+       lck_grp_init(&test_lock_grp, "test_thread_call", LCK_GRP_ATTR_NULL);
+       lck_mtx_init(&test_lock, &test_lock_grp, LCK_ATTR_NULL);
+
+       thread_call_t call;
+       call = thread_call_allocate_with_options(&test_once_callback,
+                                                (thread_call_param_t)TEST_ARG1,
+                                                THREAD_CALL_PRIORITY_HIGH,
+                                                THREAD_CALL_OPTIONS_ONCE);
+
+       thread_call_param_t arg2_param = (thread_call_param_t)TEST_ARG2;
+
+       lck_mtx_lock(&test_lock);
+
+       thread_call_enter1(call, arg2_param);
+
+       uint64_t deadline;
+       clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline);
+
+       kern_return_t kr;
+       /* wait for the call to execute, time out after 10s */
+       kr = lck_mtx_sleep_deadline(&test_lock, LCK_SLEEP_DEFAULT, &wait_for_callback, THREAD_UNINT, deadline);
+       T_ASSERT_EQ_INT(kr, THREAD_AWAKENED, "main function woken by callback");
+
+       lck_mtx_unlock(&test_lock);
+
+       /* at this point the callback is stuck waiting */
+
+       T_ASSERT_EQ_INT(once_callback_counter, 1, "callback fired");
+
+       boolean_t canceled, pending, freed;
+
+       canceled = thread_call_cancel(call);
+       T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed");
+
+       pending = thread_call_enter1(call, arg2_param);
+       T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending");
+
+       /* sleep for 10ms, the call should not execute */
+       delay_for_interval(10, NSEC_PER_MSEC);
+
+       canceled = thread_call_cancel(call);
+       T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel should succeed");
+
+       pending = thread_call_enter1(call, arg2_param);
+       T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending");
+
+       freed = thread_call_free(call);
+       T_ASSERT_EQ_INT(freed, FALSE, "thread_call_free should not succeed");
+
+       pending = thread_call_enter1(call, arg2_param);
+       T_ASSERT_EQ_INT(pending, TRUE, "call should be pending");
+
+       thread_wakeup(&wait_for_main);
+
+       canceled = thread_call_cancel_wait(call);
+       T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel_wait should succeed");
+
+       canceled = thread_call_cancel(call);
+       T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed");
+
+       freed = thread_call_free(call);
+       T_ASSERT_EQ_INT(freed, TRUE, "thread_call_free should succeed");
+}
+
+int signal_callback_counter = 0;
+
+static void
+test_signal_callback(__unused thread_call_param_t param0,
+                     __unused thread_call_param_t param1)
+{
+       /*
+        * ktest sometimes panics when asserting from interrupt context, and the
+        * serial logging can outrun the delay that waits for the interrupt, so
+        * don't print from this context.
+        */
+
+       signal_callback_counter++;
+}
+
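+/*
+ * Exercise a SIGNAL thread call, whose callback runs from interrupt context
+ * at its deadline: a far-off delayed enter can still be cancelled, and a
+ * short deadline should fire the callback exactly once before the call is
+ * freed.
+ */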
+static void
+test_signal_thread_call(void)
+{
+       thread_call_t call;
+       call = thread_call_allocate_with_options(&test_signal_callback,
+                                                (thread_call_param_t)TEST_ARG1,
+                                                THREAD_CALL_PRIORITY_HIGH,
+                                                THREAD_CALL_OPTIONS_ONCE|THREAD_CALL_OPTIONS_SIGNAL);
+
+       thread_call_param_t arg2_param = (thread_call_param_t)TEST_ARG2;
+
+       uint64_t deadline;
+
+       boolean_t canceled, pending, freed;
+
+       clock_interval_to_deadline(10, NSEC_PER_SEC, &deadline);
+       pending = thread_call_enter1_delayed(call, arg2_param, deadline);
+       T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending");
+
+       canceled = thread_call_cancel(call);
+       T_ASSERT_EQ_INT(canceled, TRUE, "thread_call_cancel should succeed");
+
+       clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
+       pending = thread_call_enter1_delayed(call, arg2_param, deadline);
+       T_ASSERT_EQ_INT(pending, FALSE, "call should not be pending");
+
+       /* sleep for 50ms to let the interrupt fire */
+       delay_for_interval(50, NSEC_PER_MSEC);
+
+       T_ASSERT_EQ_INT(signal_callback_counter, 1, "callback fired");
+
+       canceled = thread_call_cancel(call);
+       T_ASSERT_EQ_INT(canceled, FALSE, "thread_call_cancel should not succeed");
+
+       freed = thread_call_free(call);
+       T_ASSERT_EQ_INT(freed, TRUE, "thread_call_free should succeed");
+}
+
+kern_return_t
+test_thread_call(void)
+{
+       test_once_thread_call();
+       test_signal_thread_call();
+
+       return KERN_SUCCESS;
+}
diff --git a/osfmk/tests/xnupost.h b/osfmk/tests/xnupost.h
new file mode 100644 (file)
index 0000000..3268586
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2015 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _TESTS_XNUPOST_H
+#define _TESTS_XNUPOST_H
+
+#ifndef CONFIG_XNUPOST
+#error "xnupost testing requires CONFIG_XNUPOST to be enabled"
+#endif
+
+#include <kern/kern_types.h>
+#include <kern/assert.h>
+#include <tests/ktest.h>
+
+#define XT_CONFIG_RUN 0x0
+#define XT_CONFIG_IGNORE 0x1
+#define XT_CONFIG_EXPECT_PANIC 0x2
+
+#define XTCTL_RUN_TESTS  1
+#define XTCTL_RESET_TESTDATA 2
+
+typedef enum { XT_ACTION_NONE = 0, XT_ACTION_SKIPPED, XT_ACTION_PASSED, XT_ACTION_FAILED } xnupost_test_action_t;
+
+typedef kern_return_t (*test_function)(void);
+struct xnupost_test {
+       uint16_t xt_config;
+       uint16_t xt_test_num;
+       kern_return_t xt_retval;
+       kern_return_t xt_expected_retval;
+       uint64_t xt_begin_time;
+       uint64_t xt_end_time;
+       xnupost_test_action_t xt_test_actions;
+       test_function xt_func;
+       const char * xt_name;
+};
+
+typedef kern_return_t xt_panic_return_t;
+#define XT_PANIC_UNRELATED  0x8  /* not related. continue panic */
+#define XT_RET_W_FAIL       0x9  /* report FAILURE and return from panic */
+#define XT_RET_W_SUCCESS    0xA  /* report SUCCESS and return from panic */
+#define XT_PANIC_W_FAIL     0xB  /* report FAILURE and continue to panic */
+#define XT_PANIC_W_SUCCESS  0xC  /* report SUCCESS and continue to panic */
+
+typedef xt_panic_return_t (*xt_panic_widget_func)(const char * panicstr, void * context, void ** outval);
+struct xnupost_panic_widget {
+       void * xtp_context_p;
+       void ** xtp_outval_p;
+       const char * xtp_func_name;
+       xt_panic_widget_func xtp_func;
+};
+
+/* For internal use only; use the T_REGISTER_* macros. */
+extern xt_panic_return_t _xt_generic_assert_check(const char * s, void * str_to_match, void ** outval);
+kern_return_t xnupost_register_panic_widget(xt_panic_widget_func funcp, const char * funcname, void * context, void ** outval);
+
+#define T_REGISTER_PANIC_WIDGET(func, ctx, outval) xnupost_register_panic_widget((func), #func, (ctx), (outval))
+#define T_REGISTER_ASSERT_CHECK(assert_str, retval) \
+       T_REGISTER_PANIC_WIDGET(_xt_generic_assert_check, (void *)__DECONST(char *, assert_str), retval)
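+/*
+ * Illustrative (hypothetical) use: T_REGISTER_ASSERT_CHECK("some assert text",
+ * &outval) registers a widget so that a panic whose string matches can be
+ * intercepted and reported as a test result rather than simply halting.
+ */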
+
+typedef struct xnupost_test xnupost_test_data_t;
+typedef struct xnupost_test * xnupost_test_t;
+
+extern struct xnupost_test kernel_post_tests[];
+extern uint32_t kernel_post_tests_count;
+extern uint32_t total_post_tests_count;
+
+#define XNUPOST_TEST_CONFIG_BASIC(func)                   \
+       {                                                     \
+               XT_CONFIG_RUN, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \
+       }
+
+#define XNUPOST_TEST_CONFIG_TEST_PANIC(func)                       \
+       {                                                              \
+               XT_CONFIG_EXPECT_PANIC, 0, -1, T_STATE_PASS, 0, 0, 0, (func), "xnu."#func \
+       }
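+
+/*
+ * Illustrative (hypothetical) usage of the initializers above:
+ *
+ *   struct xnupost_test kernel_post_tests[] = {
+ *           XNUPOST_TEST_CONFIG_BASIC(test_thread_call),
+ *           XNUPOST_TEST_CONFIG_TEST_PANIC(some_panic_test),
+ *   };
+ *   uint32_t kernel_post_tests_count =
+ *           sizeof(kernel_post_tests) / sizeof(kernel_post_tests[0]);
+ *
+ * where some_panic_test is a placeholder test_function.
+ */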
+
+void xnupost_init(void);
+/*
+ * Parse boot-args specific to POST testing and set up enabled/disabled settings.
+ * Returns KERN_SUCCESS if testing is enabled.
+ */
+kern_return_t xnupost_parse_config(void);
+kern_return_t xnupost_run_tests(xnupost_test_t test_list, uint32_t test_count);
+kern_return_t xnupost_list_tests(xnupost_test_t test_list, uint32_t test_count);
+kern_return_t xnupost_reset_tests(xnupost_test_t test_list, uint32_t test_count);
+
+int xnupost_export_testdata(void * outp, uint32_t size, uint32_t * lenp);
+uint32_t xnupost_get_estimated_testdata_size(void);
+
+kern_return_t kernel_do_post(void);
+kern_return_t xnupost_process_kdb_stop(const char * panic_s);
+int xnupost_reset_all_tests(void);
+
+kern_return_t kernel_list_tests(void);
+int bsd_do_post(void);
+int bsd_list_tests(void);
+
+#endif /* _TESTS_XNUPOST_H */
index af3985f95ce6a01232588d7080e7c0d5eb590127..0999ea757692186c0875e98cc738026fb8227301 100644 (file)
@@ -177,10 +177,6 @@ int pagerdebug=0;
 
 extern int proc_resetpcontrol(int);
 
-#if DEVELOPMENT || DEBUG
-extern unsigned long vm_cs_validated_resets;
-#endif
-
 
 extern int     uiomove64(addr64_t, int, void *);
 #define        MAX_RUN 32
@@ -240,7 +236,7 @@ memory_object_control_uiomove(
                                break;
 
 
-                       if (dst_page->busy || dst_page->cleaning) {
+                       if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
                                /*
                                 * someone else is playing with the page... if we've
                                 * already collected pages into this run, go ahead
@@ -253,28 +249,28 @@ memory_object_control_uiomove(
                                PAGE_SLEEP(object, dst_page, THREAD_UNINT);
                                continue;
                        }
-                       if (dst_page->laundry)
+                       if (dst_page->vmp_laundry)
                                vm_pageout_steal_laundry(dst_page, FALSE);
 
                        if (mark_dirty) {
-                               if (dst_page->dirty == FALSE)
+                               if (dst_page->vmp_dirty == FALSE)
                                        dirty_count++;
                                SET_PAGE_DIRTY(dst_page, FALSE);
-                               if (dst_page->cs_validated && 
-                                   !dst_page->cs_tainted) {
+                               if (dst_page->vmp_cs_validated && 
+                                   !dst_page->vmp_cs_tainted) {
                                        /*
                                         * CODE SIGNING:
                                         * We're modifying a code-signed
                                         * page: force revalidate
                                         */
-                                       dst_page->cs_validated = FALSE;
-#if DEVELOPMENT || DEBUG
-                                        vm_cs_validated_resets++;
-#endif
+                                       dst_page->vmp_cs_validated = FALSE;
+
+                                       VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
+
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
                                }
                        }
-                       dst_page->busy = TRUE;
+                       dst_page->vmp_busy = TRUE;
 
                        page_run[cur_run++] = dst_page;
 
@@ -334,7 +330,7 @@ memory_object_control_uiomove(
                         * update clustered and speculative state
                         * 
                         */
-                       if (dst_page->clustered)
+                       if (dst_page->vmp_clustered)
                                VM_PAGE_CONSUME_CLUSTERED(dst_page);
 
                        PAGE_WAKEUP_DONE(dst_page);
@@ -370,6 +366,8 @@ vnode_pager_bootstrap(void)
 #if __arm64__
        fourk_pager_bootstrap();
 #endif /* __arm64__ */
+       shared_region_pager_bootstrap();
+
        return;
 }
 
@@ -476,6 +474,21 @@ vnode_pager_data_unlock(
        return KERN_FAILURE;
 }
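+
+/*
+ * Forward a dirtied byte range to the backing vnode: if the memory object is
+ * managed by the vnode pager, look up its vnode and report the
+ * s_offset..e_offset range via vnode_pager_was_dirtied().
+ */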
 
+void
+vnode_pager_dirtied(
+       memory_object_t         mem_obj,
+       vm_object_offset_t      s_offset,
+       vm_object_offset_t      e_offset)
+{
+       vnode_pager_t   vnode_object;
+
+       if (mem_obj && mem_obj->mo_pager_ops == &vnode_pager_ops) {
+
+               vnode_object = vnode_pager_lookup(mem_obj);
+               vnode_pager_was_dirtied(vnode_object->vnode_handle, s_offset, e_offset);
+       }
+}
+
 kern_return_t
 vnode_pager_get_isinuse(
        memory_object_t         mem_obj,
index 3c1d5be0a92c9b733488550518e6e3168c507664..caec717a654e932f9c68b8eb79b79e85c9879b0d 100644 (file)
@@ -419,8 +419,6 @@ size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
   return (size_t)(dst - dst_buffer); // bytes produced
 }
 
-#define likely(expr)     __builtin_expect((expr) != 0, 1)
-#define unlikely(expr)   __builtin_expect((expr) != 0, 0)
 typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
 
 int lz4_decode(uint8_t ** dst_ptr,
@@ -446,25 +444,25 @@ int lz4_decode(uint8_t ** dst_ptr,
         uint32_t matchLength = 4 + (cmd & 15); // 4..19
 
         // extra bytes for literalLength
-        if (unlikely(literalLength == 15))
+        if (__improbable(literalLength == 15))
         {
             uint8_t s;
             do {
 #if DEBUG_LZ4_DECODE_ERRORS
-                if (unlikely(src >= src_end)) printf("Truncated SRC literal length\n");
+                if (__improbable(src >= src_end)) printf("Truncated SRC literal length\n");
 #endif
-                if (unlikely(src >= src_end)) goto IN_FAIL;         // unexpected end of input (1 byte needed)
+                if (__improbable(src >= src_end)) goto IN_FAIL;         // unexpected end of input (1 byte needed)
                 s = *src++;
                 literalLength += s;
-            } while (unlikely(s == 255));
+            } while (__improbable(s == 255));
         }
 
         // copy literal
 #if DEBUG_LZ4_DECODE_ERRORS
-        if (unlikely(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
+        if (__improbable(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
 #endif
-        if (unlikely(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
-        if (unlikely(literalLength > (size_t)(dst_end - dst))) {
+        if (__improbable(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
+        if (__improbable(literalLength > (size_t)(dst_end - dst))) {
             //  literal will take us past the end of the destination buffer,
             //  so we can only copy part of it.
             literalLength = (uint32_t)(dst_end - dst);
@@ -476,11 +474,11 @@ int lz4_decode(uint8_t ** dst_ptr,
         src += literalLength;
         dst += literalLength;
 
-        if (unlikely(src >= src_end)) goto OUT_FULL;                // valid end of stream
+        if (__improbable(src >= src_end)) goto OUT_FULL;                // valid end of stream
 #if DEBUG_LZ4_DECODE_ERRORS
-        if (unlikely(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
+        if (__improbable(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
 #endif
-        if (unlikely(2 > (size_t)(src_end - src))) goto IN_FAIL;    // unexpected end of input (2 bytes needed)
+        if (__improbable(2 > (size_t)(src_end - src))) goto IN_FAIL;    // unexpected end of input (2 bytes needed)
 
        //DRKTODO: this causes an alignment increase warning (legitimate?)
        //DRKTODO: cast of char * to uint16_t*
@@ -494,29 +492,29 @@ int lz4_decode(uint8_t ** dst_ptr,
 #if DEBUG_LZ4_DECODE_ERRORS
         if (matchDistance == 0) printf("Invalid match distance D = 0\n");
 #endif
-        if (unlikely(matchDistance == 0)) goto IN_FAIL;                      // 0x0000 invalid
+        if (__improbable(matchDistance == 0)) goto IN_FAIL;                      // 0x0000 invalid
         uint8_t * ref = dst - matchDistance;
 #if DEBUG_LZ4_DECODE_ERRORS
-        if (unlikely(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end);
+        if (__improbable(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end);
 #endif
-        if (unlikely(ref < dst_begin)) goto OUT_FAIL;                        // out of range
+        if (__improbable(ref < dst_begin)) goto OUT_FAIL;                        // out of range
 
         // extra bytes for matchLength
-        if (unlikely(matchLength == 19))
+        if (__improbable(matchLength == 19))
         {
             uint8_t s;
             do {
 #if DEBUG_LZ4_DECODE_ERRORS
-                if (unlikely(src >= src_end)) printf("Truncated SRC match length\n");
+                if (__improbable(src >= src_end)) printf("Truncated SRC match length\n");
 #endif
-                if (unlikely(src >= src_end)) goto IN_FAIL;                      // unexpected end of input (1 byte needed)
+                if (__improbable(src >= src_end)) goto IN_FAIL;                      // unexpected end of input (1 byte needed)
                 s = *src++;
                 matchLength += s;
-            } while (unlikely(s == 255));
+            } while (__improbable(s == 255));
         }
       
         // copy match (may overlap)
-        if (unlikely(matchLength > (size_t)(dst_end - dst))) {
+        if (__improbable(matchLength > (size_t)(dst_end - dst))) {
             //  match will take us past the end of the destination buffer,
             //  so we can only copy part of it.
             matchLength = (uint32_t)(dst_end - dst);
index d37eb42243d5df1977a7787f8dda6207e331ff2a..864969edadbc2dcc083d876ec1390379f88780df 100644 (file)
@@ -122,15 +122,15 @@ decl_lck_mtx_data(,       memory_manager_default_lock)
  *             2. Page is precious and should_return is RETURN_ALL.
  *             3. Should_return is RETURN_ANYTHING.
  *
- *             As a side effect, m->dirty will be made consistent
+ *             As a side effect, m->vmp_dirty will be made consistent
  *             with pmap_is_modified(m), if should_return is not
  *             MEMORY_OBJECT_RETURN_NONE.
  */
 
 #define        memory_object_should_return_page(m, should_return) \
     (should_return != MEMORY_OBJECT_RETURN_NONE && \
-     (((m)->dirty || ((m)->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))) || \
-      ((m)->precious && (should_return) == MEMORY_OBJECT_RETURN_ALL) || \
+     (((m)->vmp_dirty || ((m)->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))) || \
+      ((m)->vmp_precious && (should_return) == MEMORY_OBJECT_RETURN_ALL) || \
       (should_return) == MEMORY_OBJECT_RETURN_ANYTHING))
 
 typedef        int     memory_object_lock_result_t;
@@ -171,18 +171,18 @@ memory_object_lock_page(
             m, should_return, should_flush, prot, 0);
 
 
-       if (m->busy || m->cleaning)
+       if (m->vmp_busy || m->vmp_cleaning)
                return (MEMORY_OBJECT_LOCK_RESULT_MUST_BLOCK);
 
-       if (m->laundry)
+       if (m->vmp_laundry)
                vm_pageout_steal_laundry(m, FALSE);
 
        /*
         *      Don't worry about pages for which the kernel
         *      does not have any data.
         */
-       if (m->absent || m->error || m->restart) {
-               if (m->error && should_flush && !VM_PAGE_WIRED(m)) {
+       if (m->vmp_absent || m->vmp_error || m->vmp_restart) {
+               if (m->vmp_error && should_flush && !VM_PAGE_WIRED(m)) {
                        /*
                         * dump the page, pager wants us to
                         * clean it up and there is no
@@ -192,7 +192,7 @@ memory_object_lock_page(
                }
                return (MEMORY_OBJECT_LOCK_RESULT_DONE);
        }
-       assert(!m->fictitious);
+       assert(!m->vmp_fictitious);
 
        if (VM_PAGE_WIRED(m)) {
                /*
@@ -486,10 +486,6 @@ MACRO_BEGIN                                                                \
         int                    upl_flags;                              \
        memory_object_t         pager;                                  \
                                                                        \
-       if (object->object_slid) {                                      \
-               panic("Objects with slid pages not allowed\n");         \
-       }                                                               \
-                                                                       \
        if ((pager = (object)->pager) != MEMORY_OBJECT_NULL) {          \
                vm_object_paging_begin(object);                         \
                vm_object_unlock(object);                               \
@@ -598,7 +594,7 @@ vm_object_update_extent(
                                break;
 
                        case MEMORY_OBJECT_LOCK_RESULT_MUST_FREE:
-                               if (m->dirty == TRUE)
+                               if (m->vmp_dirty == TRUE)
                                        dirty_count++;
                                dwp->dw_mask |= DW_vm_page_free;
                                break;
@@ -625,7 +621,7 @@ vm_object_update_extent(
                                                /*
                                                 * add additional state for the flush
                                                 */
-                                               m->free_when_done = TRUE;
+                                               m->vmp_free_when_done = TRUE;
                                        }
                                        /*
                                         * we use to remove the page from the queues at this
@@ -767,7 +763,7 @@ vm_object_update(
                vm_page_t               page;
                vm_page_t               top_page;
                kern_return_t           error = 0;
-               struct vm_object_fault_info fault_info;
+               struct vm_object_fault_info fault_info = {};
 
                if (copy_object != VM_OBJECT_NULL) {
                        /*
@@ -808,16 +804,11 @@ vm_object_update(
                }
                fault_info.interruptible = THREAD_UNINT;
                fault_info.behavior  = VM_BEHAVIOR_SEQUENTIAL;
-               fault_info.user_tag  = 0;
-               fault_info.pmap_options = 0;
                fault_info.lo_offset = copy_offset;
                fault_info.hi_offset = copy_size;
-               fault_info.no_cache   = FALSE;
                fault_info.stealth = TRUE;
-               fault_info.io_sync = FALSE;
-               fault_info.cs_bypass = FALSE;
-               fault_info.mark_zf_absent = FALSE;
-               fault_info.batch_pmap_op = FALSE;
+               assert(fault_info.cs_bypass == FALSE);
+               assert(fault_info.pmap_cs_associated == FALSE);
 
                vm_object_paging_begin(copy_object);
 
@@ -958,24 +949,24 @@ BYPASS_COW_COPYIN:
                m = (vm_page_t) vm_page_queue_first(&object->memq);
 
                while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) m)) {
-                       next = (vm_page_t) vm_page_queue_next(&m->listq);
+                       next = (vm_page_t) vm_page_queue_next(&m->vmp_listq);
 
-                       if ((m->offset >= start) && (m->offset < end)) {
+                       if ((m->vmp_offset >= start) && (m->vmp_offset < end)) {
                                /*
                                 * this is a page we're interested in
                                 * try to fit it into a current extent
                                 */
                                for (n = 0; n < num_of_extents; n++) {
-                                       if ((m->offset & e_mask) == extents[n].e_base) {
+                                       if ((m->vmp_offset & e_mask) == extents[n].e_base) {
                                                /*
                                                 * use (PAGE_SIZE - 1) to determine the
                                                 * max offset so that we don't wrap if
                                                 * we're at the last page of the space
                                                 */
-                                               if (m->offset < extents[n].e_min)
-                                                       extents[n].e_min = m->offset;
-                                               else if ((m->offset + (PAGE_SIZE - 1)) > extents[n].e_max)
-                                                       extents[n].e_max = m->offset + (PAGE_SIZE - 1);
+                                               if (m->vmp_offset < extents[n].e_min)
+                                                       extents[n].e_min = m->vmp_offset;
+                                               else if ((m->vmp_offset + (PAGE_SIZE - 1)) > extents[n].e_max)
+                                                       extents[n].e_max = m->vmp_offset + (PAGE_SIZE - 1);
                                                break;
                                        }
                                }
@@ -989,9 +980,9 @@ BYPASS_COW_COPYIN:
                                                 * if we still have room, 
                                                 * create a new extent
                                                 */
-                                               extents[n].e_base = m->offset & e_mask;
-                                               extents[n].e_min  = m->offset;
-                                               extents[n].e_max  = m->offset + (PAGE_SIZE - 1);
+                                               extents[n].e_base = m->vmp_offset & e_mask;
+                                               extents[n].e_min  = m->vmp_offset;
+                                               extents[n].e_max  = m->vmp_offset + (PAGE_SIZE - 1);
 
                                                num_of_extents++;
                                        } else {
@@ -1556,23 +1547,33 @@ memory_object_super_upl_request(
 }
 
 kern_return_t
-memory_object_cluster_size(memory_object_control_t control, memory_object_offset_t *start,
-                          vm_size_t *length, uint32_t *io_streaming, memory_object_fault_info_t fault_info)
+memory_object_cluster_size(
+       memory_object_control_t control,
+       memory_object_offset_t  *start,
+       vm_size_t               *length,
+       uint32_t                *io_streaming,
+       memory_object_fault_info_t mo_fault_info)
 {
        vm_object_t             object;
+       vm_object_fault_info_t  fault_info;
 
        object = memory_object_control_to_vm_object(control);
 
        if (object == VM_OBJECT_NULL || object->paging_offset > *start)
-               return (KERN_INVALID_ARGUMENT);
+               return KERN_INVALID_ARGUMENT;
 
        *start -= object->paging_offset;
 
-       vm_object_cluster_size(object, (vm_object_offset_t *)start, length, (vm_object_fault_info_t)fault_info, io_streaming);
+       fault_info = (vm_object_fault_info_t)(uintptr_t) mo_fault_info;
+       vm_object_cluster_size(object,
+                              (vm_object_offset_t *)start,
+                              length,
+                              fault_info,
+                              io_streaming);
 
        *start += object->paging_offset;
 
-       return (KERN_SUCCESS);
+       return KERN_SUCCESS;
 }
 
 
@@ -1931,7 +1932,7 @@ memory_object_is_signed(
 }
 
 boolean_t
-memory_object_is_slid(
+memory_object_is_shared_cache(
        memory_object_control_t control)
 {
        vm_object_t     object = VM_OBJECT_NULL;
@@ -1940,7 +1941,7 @@ memory_object_is_slid(
        if (object == VM_OBJECT_NULL)
                return FALSE;
 
-       return object->object_slid;
+       return object->object_is_shared_cache;
 }
 
 static zone_t mem_obj_control_zone;
index d14b5e3c96800ad0753b5e8e1709dfbda8bfa7ad..6023627ae467873245aeb37153e4a89934c0c165 100644 (file)
@@ -132,7 +132,7 @@ extern kern_return_t        memory_object_signed(
 extern boolean_t       memory_object_is_signed(
        memory_object_control_t control);
 
-extern boolean_t       memory_object_is_slid(
+extern boolean_t       memory_object_is_shared_cache(
        memory_object_control_t         control);
 
 extern void            memory_object_mark_used(
index 4916313ce81a1fc168278e660e8273016c7e46c8..e70231c438e5dfb133afd56a3bf7ad9166d1b585 100644 (file)
@@ -74,6 +74,8 @@
 #include <mach/boolean.h>
 #include <mach/vm_prot.h>
 
+#include <kern/trustcache.h>
+
 #ifdef KERNEL_PRIVATE
 
 /*
@@ -431,7 +433,7 @@ extern kern_return_t        (pmap_attribute)(       /* Get/Set special memory
        if (__obj->internal) {                                          \
                __options |= PMAP_OPTIONS_INTERNAL;                     \
        }                                                               \
-       if (__page->reusable || __obj->all_reusable) {                  \
+       if (__page->vmp_reusable || __obj->all_reusable) {              \
                __options |= PMAP_OPTIONS_REUSABLE;                     \
        }                                                               \
        result = pmap_enter_options(__pmap,                             \
@@ -460,7 +462,7 @@ extern kern_return_t        (pmap_attribute)(       /* Get/Set special memory
        if (__obj->internal) {                                          \
                __extra_options |= PMAP_OPTIONS_INTERNAL;               \
        }                                                               \
-       if (__page->reusable || __obj->all_reusable) {                  \
+       if (__page->vmp_reusable || __obj->all_reusable) {              \
                __extra_options |= PMAP_OPTIONS_REUSABLE;               \
        }                                                               \
        result = pmap_enter_options(__pmap,                             \
@@ -547,7 +549,7 @@ extern kern_return_t        (pmap_attribute)(       /* Get/Set special memory
 
 #define PMAP_ENTER_CHECK(pmap, page)                                   \
 {                                                                      \
-       if ((page)->error) {                                            \
+       if ((page)->vmp_error) {                                        \
                panic("VM page %p should not have an error\n",          \
                        (page));                                        \
        }                                                               \
@@ -688,6 +690,7 @@ extern pmap_t       kernel_pmap;                    /* The kernel's map */
                                                    * iff page was modified */
 #define PMAP_OPTIONS_PROTECT_IMMEDIATE 0x1000  /* allow protections to be
                                                 * be upgraded */
+#define PMAP_OPTIONS_CLEAR_WRITE 0x2000
 
 
 #if    !defined(__LP64__)
@@ -725,6 +728,21 @@ mach_vm_size_t pmap_query_resident(pmap_t pmap,
 /* Inform the pmap layer that there is a JIT entry in this map. */
 extern void pmap_set_jit_entitled(pmap_t pmap);
 
+/*
+ * Tell the pmap layer what range within the nested region the VM intends to
+ * use.
+ */
+extern void pmap_trim(pmap_t grand, pmap_t subord, addr64_t vstart, addr64_t nstart, uint64_t size);
+
+/*
+ * Dump page table contents into the specified buffer.  Returns the number of
+ * bytes copied, 0 if insufficient space, (size_t)-1 if unsupported.
+ * This is expected to only be called from kernel debugger context,
+ * so synchronization is not required.
+ */
+
+extern size_t pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end);
+
 /*
  * Indicates if any special policy is applied to this protection by the pmap
  * layer.
@@ -735,7 +753,7 @@ bool pmap_has_prot_policy(vm_prot_t prot);
  * Causes the pmap to return any available pages that it can return cheaply to
  * the VM.
  */
-void pmap_release_pages_fast(void);
+uint64_t pmap_release_pages_fast(void);
 
 #define PMAP_QUERY_PAGE_PRESENT                        0x01
 #define PMAP_QUERY_PAGE_REUSABLE               0x02
@@ -754,6 +772,11 @@ int pmap_pgtrace_delete_page(pmap_t pmap, vm_map_offset_t start, vm_map_offset_t
 kern_return_t pmap_pgtrace_fault(pmap_t pmap, vm_map_offset_t va, arm_saved_state_t *ss);
 #endif
 
+
+extern void pmap_ledger_alloc_init(size_t);
+extern ledger_t pmap_ledger_alloc(void);
+extern void pmap_ledger_free(ledger_t);
+
 #endif  /* KERNEL_PRIVATE */
 
 #endif /* _VM_PMAP_H_ */
index c8f3343f19ad7073b96e8a2980f9fbcabc933f67..9918cf98438f87656f62051859a719e522c2f74c 100644 (file)
@@ -547,10 +547,9 @@ vm32__task_wire(
        if (map == VM_MAP_NULL)
                return(KERN_INVALID_ARGUMENT);
 
-       if (must_wire)
-               map->wiring_required = TRUE;
-       else
-               map->wiring_required = FALSE;
+       vm_map_lock(map);
+       map->wiring_required = (must_wire == TRUE);
+       vm_map_unlock(map);
 
        return(KERN_SUCCESS);
 }
index 707c3e69591145df02544dc91f165f70f1f9f734..508211e68b3000b36ad051665cd74a7cf078b7ad 100644 (file)
@@ -354,7 +354,6 @@ apple_protect_pager_data_request(
        unsigned int            pl_count;
        vm_object_t             src_top_object, src_page_object, dst_object;
        kern_return_t           kr, retval;
-       vm_map_offset_t         kernel_mapping;
        vm_offset_t             src_vaddr, dst_vaddr;
        vm_offset_t             cur_offset;
        vm_offset_t             offset_in_page;
@@ -370,10 +369,9 @@ apple_protect_pager_data_request(
        retval = KERN_SUCCESS;
        src_top_object = VM_OBJECT_NULL;
        src_page_object = VM_OBJECT_NULL;
-       kernel_mapping = 0;
        upl = NULL;
        upl_pl = NULL;
-       fault_info = *((struct vm_object_fault_info *) mo_fault_info);
+       fault_info = *((struct vm_object_fault_info *)(uintptr_t)mo_fault_info);
        fault_info.stealth = TRUE;
        fault_info.io_sync = FALSE;
        fault_info.mark_zf_absent = FALSE;
@@ -386,6 +384,9 @@ apple_protect_pager_data_request(
 
        PAGER_DEBUG(PAGER_PAGEIN, ("apple_protect_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
 
+       fault_info.lo_offset += pager->backing_offset;
+       fault_info.hi_offset += pager->backing_offset;
+
        /*
         * Gather in a UPL all the VM pages requested by VM.
         */
@@ -409,39 +410,6 @@ apple_protect_pager_data_request(
        dst_object = mo_control->moc_object;
        assert(dst_object != VM_OBJECT_NULL);
 
-
-#if __x86_64__ || __arm__ || __arm64__
-       /* we'll use the 1-to-1 mapping of physical memory */
-       src_vaddr = 0;
-       dst_vaddr = 0;
-#else /* __x86_64__ || __arm__ || __arm64__ */
-       /*
-        * Reserve 2 virtual pages in the kernel address space to map each
-        * source and destination physical pages when it's their turn to
-        * be processed.
-        */
-       vm_map_entry_t          map_entry;
-
-       vm_object_reference(kernel_object);     /* ref. for mapping */
-       kr = vm_map_find_space(kernel_map,
-                              &kernel_mapping,
-                              2 * PAGE_SIZE_64,
-                              0,
-                              0,
-                              VM_MAP_KERNEL_FLAGS_NONE,
-                              &map_entry);
-       if (kr != KERN_SUCCESS) {
-               vm_object_deallocate(kernel_object);
-               retval = kr;
-               goto done;
-       }
-       map_entry->object.vm_object = kernel_object;
-       map_entry->offset = kernel_mapping;
-       vm_map_unlock(kernel_map);
-       src_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping);
-       dst_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping + PAGE_SIZE_64);
-#endif /* __x86_64__ || __arm__ || __arm64__ */
-
        /*
         * We'll map the encrypted data in the kernel address space from the 
         * backing VM object (itself backed by the encrypted file via
@@ -522,66 +490,42 @@ apple_protect_pager_data_request(
                              kr);
                }
                assert(src_page != VM_PAGE_NULL);
-               assert(src_page->busy);
+               assert(src_page->vmp_busy);
 
-               if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) {
+               if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
 
                        vm_page_lockspin_queues();
 
-                       if (( !VM_PAGE_NON_SPECULATIVE_PAGEABLE(src_page))) {
-                               vm_page_deactivate(src_page);
+                       if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
+                               vm_page_speculate(src_page, FALSE);
                        }
                        vm_page_unlock_queues();
                }
 
                /*
-                * Establish an explicit mapping of the source
-                * physical page.
+                * Establish pointers to the source
+                * and destination physical pages.
                 */
+               dst_pnum = (ppnum_t)
+                       upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
+                assert(dst_pnum != 0);
 #if __x86_64__
                src_vaddr = (vm_map_offset_t)
                        PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
                                     << PAGE_SHIFT);
+               dst_vaddr = (vm_map_offset_t)
+                       PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
+
 #elif __arm__ || __arm64__
                src_vaddr = (vm_map_offset_t)
                        phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
                                 << PAGE_SHIFT);
-#else
-               kr = pmap_enter(kernel_pmap,
-                               src_vaddr,
-                               VM_PAGE_GET_PHYS_PAGE(src_page),
-                               VM_PROT_READ,
-                               VM_PROT_NONE,
-                               0,
-                               TRUE);
-
-               assert(kr == KERN_SUCCESS);
-#endif
-               /*
-                * Establish an explicit pmap mapping of the destination
-                * physical page.
-                * We can't do a regular VM mapping because the VM page
-                * is "busy".
-                */
-               dst_pnum = (ppnum_t)
-                       upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
-               assert(dst_pnum != 0);
-#if __x86_64__
-               dst_vaddr = (vm_map_offset_t)
-                       PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
-#elif __arm__ || __arm64__
                dst_vaddr = (vm_map_offset_t)
                        phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
 #else
-               kr = pmap_enter(kernel_pmap,
-                               dst_vaddr,
-                               dst_pnum,
-                               VM_PROT_READ | VM_PROT_WRITE,
-                               VM_PROT_NONE,
-                               0,
-                               TRUE);
-
-               assert(kr == KERN_SUCCESS);
+#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
+               src_vaddr = 0;
+               dst_vaddr = 0;
 #endif
                src_page_object = VM_PAGE_OBJECT(src_page);
 
@@ -597,11 +541,11 @@ apple_protect_pager_data_request(
                 * ... and transfer the results to the destination page.
                 */
                UPL_SET_CS_VALIDATED(upl_pl, cur_offset / PAGE_SIZE,
-                                    src_page->cs_validated);
+                                    src_page->vmp_cs_validated);
                UPL_SET_CS_TAINTED(upl_pl, cur_offset / PAGE_SIZE,
-                                  src_page->cs_tainted);
+                                  src_page->vmp_cs_tainted);
                UPL_SET_CS_NX(upl_pl, cur_offset / PAGE_SIZE,
-                                  src_page->cs_nx);
+                                  src_page->vmp_cs_nx);
 
                /*
                 * page_decrypt() might access a mapped file, so let's release
@@ -610,7 +554,7 @@ apple_protect_pager_data_request(
                 * "paging_in_progress" reference on its object, so it's safe
                 * to unlock the object here.
                 */
-               assert(src_page->busy);
+               assert(src_page->vmp_busy);
                assert(src_page_object->paging_in_progress > 0);
                vm_object_unlock(src_page_object);
 
@@ -630,6 +574,7 @@ apple_protect_pager_data_request(
                                                     offset_in_page),
                                      (char *)(dst_vaddr + offset_in_page),
                                      4096);
+
                                if (apple_protect_pager_data_request_debug) {
                                        printf("apple_protect_data_request"
                                               "(%p,0x%llx+0x%llx+0x%04llx): "
@@ -651,9 +596,9 @@ apple_protect_pager_data_request(
                                               *(uint64_t *)(dst_vaddr+
                                                             offset_in_page+8),
                                               src_page_object->code_signed,
-                                              src_page->cs_validated,
-                                              src_page->cs_tainted,
-                                              src_page->cs_nx);
+                                              src_page->vmp_cs_validated,
+                                              src_page->vmp_cs_tainted,
+                                              src_page->vmp_cs_nx);
                                }
                                ret = 0;
                                continue;
@@ -667,6 +612,7 @@ apple_protect_pager_data_request(
                                 cur_offset +
                                 offset_in_page),
                                pager->crypt_info->crypt_ops);
+
                        if (apple_protect_pager_data_request_debug) {
                                printf("apple_protect_data_request"
                                       "(%p,0x%llx+0x%llx+0x%04llx): "
@@ -697,9 +643,9 @@ apple_protect_pager_data_request(
                                       *(uint64_t *)(dst_vaddr+offset_in_page),
                                       *(uint64_t *)(dst_vaddr+offset_in_page+8),
                                       src_page_object->code_signed,
-                                      src_page->cs_validated,
-                                      src_page->cs_tainted,
-                                      src_page->cs_nx,
+                                      src_page->vmp_cs_validated,
+                                      src_page->vmp_cs_tainted,
+                                      src_page->vmp_cs_nx,
                                       ret);
                        }
                        if (ret) {
@@ -714,53 +660,18 @@ apple_protect_pager_data_request(
                }
 
                assert(VM_PAGE_OBJECT(src_page) == src_page_object);
-               assert(src_page->busy);
+               assert(src_page->vmp_busy);
                assert(src_page_object->paging_in_progress > 0);
                vm_object_lock(src_page_object);
 
-#if __x86_64__ || __arm__ || __arm64__
-               /* we used the 1-to-1 mapping of physical memory */
-               src_vaddr = 0;
-               dst_vaddr = 0;
-#else /* __x86_64__ || __arm__ || __arm64__ */
-               /*
-                * Remove the pmap mapping of the source and destination pages
-                * in the kernel.
-                */
-               pmap_remove(kernel_pmap,
-                           (addr64_t) kernel_mapping,
-                           (addr64_t) (kernel_mapping + (2 * PAGE_SIZE_64)));
-#endif /* __x86_64__ || __arm__ || __arm64__ */
-
                /*
                 * Cleanup the result of vm_fault_page() of the source page.
                 */
-               if (retval == KERN_SUCCESS &&
-                   src_page->busy &&
-                   !VM_PAGE_WIRED(src_page) &&
-                   !src_page->dirty &&
-                   !src_page->precious &&
-                   !src_page->laundry &&
-                   !src_page->cleaning) {
-                       int refmod_state;
-
-                       refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(src_page));
-
-                       if (refmod_state & VM_MEM_MODIFIED) {
-                               SET_PAGE_DIRTY(src_page, FALSE);
-                       }
-                       if (!src_page->dirty) {
-                               vm_page_free_unlocked(src_page, TRUE);
-                               src_page = VM_PAGE_NULL;
-                       } else {
-                               PAGE_WAKEUP_DONE(src_page);
-                       }
-               } else {
-                       PAGE_WAKEUP_DONE(src_page);
-               }
+               PAGE_WAKEUP_DONE(src_page);
                src_page = VM_PAGE_NULL;
                vm_object_paging_end(src_page_object);
                vm_object_unlock(src_page_object);
+
                if (top_page != VM_PAGE_NULL) {
                        assert(VM_PAGE_OBJECT(top_page) == src_top_object);
                        vm_object_lock(src_top_object);
@@ -824,21 +735,9 @@ done:
                upl_deallocate(upl);
                upl = NULL;
        }
-       if (kernel_mapping != 0) {
-               /* clean up the mapping of the source and destination pages */
-               kr = vm_map_remove(kernel_map,
-                                  kernel_mapping,
-                                  kernel_mapping + (2 * PAGE_SIZE_64),
-                                  VM_MAP_NO_FLAGS);
-               assert(kr == KERN_SUCCESS);
-               kernel_mapping = 0;
-               src_vaddr = 0;
-               dst_vaddr = 0;
-       }
        if (src_top_object != VM_OBJECT_NULL) {
                vm_object_deallocate(src_top_object);
        }
-
        return retval;
 }
 
@@ -1125,7 +1024,7 @@ apple_protect_pager_lookup(
        apple_protect_pager_t   pager;
 
        assert(mem_obj->mo_pager_ops == &apple_protect_pager_ops);
-       pager = (apple_protect_pager_t) mem_obj;
+       pager = (apple_protect_pager_t)(uintptr_t) mem_obj;
        assert(pager->ref_count > 0);
        return pager;
 }
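The rewrite above removes the reserved kernel_mapping scratch VAs and instead reads source and destination pages through the kernel's 1-to-1 physical mapping. A minimal sketch of that pattern, assuming a physical page number already obtained from a vm_page or UPL; the helper name and its use are hypothetical, while PHYSMAP_PTOV/phystokv are the macros used above.

/* Hypothetical helper -- translate a physical page number to a kernel VA
 * through the physical aperture instead of pmap_enter()ing a scratch VA. */
static vm_offset_t
example_pnum_to_kva(ppnum_t pnum)
{
#if __x86_64__
	return (vm_offset_t)PHYSMAP_PTOV((pmap_paddr_t)pnum << PAGE_SHIFT);
#elif __arm__ || __arm64__
	return (vm_offset_t)phystokv((pmap_paddr_t)pnum << PAGE_SHIFT);
#else
#error "no 1-to-1 kernel mapping of physical memory on this architecture"
#endif
}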
index f474385b7d7da5f7b91905e1f339d8ae475ee692..0f3a37fe16d551c89ca348941749516d7d7697c1 100644 (file)
@@ -50,6 +50,8 @@
 
 #include <IOKit/IOHibernatePrivate.h>
 
+extern boolean_t vm_darkwake_mode;
+
 #if POPCOUNT_THE_COMPRESSED_DATA
 boolean_t popcount_c_segs = TRUE;
 
@@ -88,6 +90,9 @@ static inline uint32_t vmc_pop(uintptr_t ins, int sz) {
 }
 #endif
 
+#if VALIDATE_C_SEGMENTS
+boolean_t validate_c_segs = TRUE;
+#endif
 /*
  * vm_compressor_mode has a hierarchy of control to set its value.
  * boot-args are checked first, then device-tree, and finally
@@ -103,8 +108,7 @@ int         vm_compressor_mode = VM_PAGER_FREEZER_DEFAULT;
 void           *freezer_chead; /* The chead used to track c_segs allocated for the exclusive use of holding just one task's compressed memory.*/
 char           *freezer_compressor_scratch_buf = NULL;
 
-#define                VM_MAX_FREEZER_CSEG_SWAP_COUNT  64 /* The maximum number of c_segs holding just one task's compressed memory that can be swapped out to disk.*/
-extern int     c_freezer_swapout_count;           /* This count keeps track of the # of c_segs holding just one task's compressed memory on the swapout queue. This count is used during each freeze i.e. on a per-task basis.*/
+extern int     c_freezer_swapout_page_count;      /* This count keeps track of the # of compressed pages holding just one task's compressed memory on the swapout queue. This count is used during each freeze, i.e. on a per-task basis. */
 
 #else /* CONFIG_FREEZE */
 int            vm_compressor_mode = VM_PAGER_NOT_CONFIGURED;
@@ -194,8 +198,9 @@ char        *c_compressed_record_cptr;
 
 
 queue_head_t   c_age_list_head;
-queue_head_t   c_swapout_list_head;
 queue_head_t   c_swappedin_list_head;
+queue_head_t   c_swapout_list_head;
+queue_head_t   c_swapio_list_head;
 queue_head_t   c_swappedout_list_head;
 queue_head_t   c_swappedout_sparse_list_head;
 queue_head_t   c_major_list_head;
@@ -203,8 +208,9 @@ queue_head_t        c_filling_list_head;
 queue_head_t   c_bad_list_head;
 
 uint32_t       c_age_count = 0;
-uint32_t       c_swapout_count = 0;
 uint32_t       c_swappedin_count = 0;
+uint32_t       c_swapout_count = 0;
+uint32_t       c_swapio_count = 0;
 uint32_t       c_swappedout_count = 0;
 uint32_t       c_swappedout_sparse_count = 0;
 uint32_t       c_major_count = 0;
@@ -249,6 +255,11 @@ uint32_t   vm_compressor_majorcompact_threshold_divisor = 10;
 uint32_t       vm_compressor_unthrottle_threshold_divisor = 10;
 uint32_t       vm_compressor_catchup_threshold_divisor = 10;
 
+uint32_t       vm_compressor_minorcompact_threshold_divisor_overridden = 0;
+uint32_t       vm_compressor_majorcompact_threshold_divisor_overridden = 0;
+uint32_t       vm_compressor_unthrottle_threshold_divisor_overridden = 0;
+uint32_t       vm_compressor_catchup_threshold_divisor_overridden = 0;
+
 #define                C_SEGMENTS_PER_PAGE     (PAGE_SIZE / sizeof(union c_segu))
 
 
@@ -286,7 +297,6 @@ uint32_t    vm_ripe_target_age = (60 * 60 * 48);
 uint32_t       swapout_target_age = 0;
 uint32_t       age_of_decompressions_during_sample_period[DECOMPRESSION_SAMPLE_MAX_AGE];
 uint32_t       overage_decompressions_during_sample_period = 0;
-uint32_t       vm_compressor_pages_grabbed = 0;
 
 
 void           do_fastwake_warmup(queue_head_t *, boolean_t);
@@ -546,6 +556,9 @@ vm_compressor_init(void)
 #endif
 #if CHECKSUM_THE_DATA || CHECKSUM_THE_COMPRESSED_DATA
                checksum_c_segs = FALSE;
+#endif
+#if VALIDATE_C_SEGMENTS
+               validate_c_segs = FALSE;
 #endif
                write_protect_c_segs = FALSE;
        }
@@ -602,6 +615,7 @@ vm_compressor_init(void)
        queue_init(&c_major_list_head);
        queue_init(&c_filling_list_head);
        queue_init(&c_swapout_list_head);
+       queue_init(&c_swapio_list_head);
        queue_init(&c_swappedin_list_head);
        queue_init(&c_swappedout_list_head);
        queue_init(&c_swappedout_sparse_list_head);
@@ -731,7 +745,18 @@ try_again:
                compressor_scratch_bufs = kalloc_tag(compressor_cpus * vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR);
 
                kdp_compressor_scratch_buf = kalloc_tag(vm_compressor_get_decode_scratch_size(), VM_KERN_MEMORY_COMPRESSOR);
-               kdp_compressor_decompressed_page = kalloc_tag(PAGE_SIZE, VM_KERN_MEMORY_COMPRESSOR);
+
+               /*
+                * kdp_compressor_decompressed_page must be page aligned because we access
+                * it through the physical aperture by page number. kalloc() does not
+                * guarantee alignment.
+                */
+               vm_offset_t addr;
+               if (kernel_memory_allocate(kernel_map, &addr, PAGE_SIZE, 0, KMA_KOBJECT, VM_KERN_MEMORY_COMPRESSOR) != KERN_SUCCESS) {
+                       panic("vm_compressor_init: kernel_memory_allocate failed - kdp_compressor_decompressed_page\n");
+               }
+               assert((addr & PAGE_MASK) == 0);
+               kdp_compressor_decompressed_page = (void *)addr;
                kdp_compressor_decompressed_page_paddr = kvtophys((vm_offset_t)kdp_compressor_decompressed_page);
                kdp_compressor_decompressed_page_ppnum = (ppnum_t) atop(kdp_compressor_decompressed_page_paddr);
        }
@@ -783,6 +808,9 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
        uint32_t        c_size;
        c_slot_t        cs;
 
+       if (__probable(validate_c_segs == FALSE)) {
+               return;
+       }
        if (c_seg->c_firstemptyslot < c_seg->c_nextslot) {
                c_indx = c_seg->c_firstemptyslot;
                cs = C_SEG_SLOT_FROM_INDEX(c_seg, c_indx);
@@ -812,6 +840,16 @@ c_seg_validate(c_segment_t c_seg, boolean_t must_be_compact)
                        panic("Compressed data doesn't match original %p phys: 0x%llx %d %p %d %d 0x%x 0x%x", c_seg, csvphys, cs->c_offset, cs, c_indx, c_size, cs->c_hash_compressed_data, csvhash);
                }
 #endif
+#if POPCOUNT_THE_COMPRESSED_DATA
+               unsigned csvpop;
+               if (c_size) {
+                       uintptr_t csvaddr = (uintptr_t) &c_seg->c_store.c_buffer[cs->c_offset];
+                       if (cs->c_pop_cdata != (csvpop = vmc_pop(csvaddr, c_size))) {
+                               panic("Compressed data popcount doesn't match original, bit distance: %d %p (phys: %p) %p %p 0x%llx 0x%x 0x%x 0x%x", (csvpop - cs->c_pop_cdata), (void *)csvaddr, (void *) kvtophys(csvaddr), c_seg, cs, cs->c_offset, c_size, csvpop, cs->c_pop_cdata);
+                       }
+               }
+#endif
+
        }
 
        if (bytes_used != c_seg->c_bytes_used)
@@ -1053,14 +1091,20 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
 
                case C_ON_SWAPOUT_Q:
-                       assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
-                              new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY);
+                       assert(new_state == C_ON_AGE_Q || new_state == C_IS_FREE || new_state == C_IS_EMPTY || new_state == C_ON_SWAPIO_Q);
 
                        queue_remove(&c_swapout_list_head, c_seg, c_segment_t, c_age_list);
                        thread_wakeup((event_t)&compaction_swapper_running);
                        c_swapout_count--;
                        break;
 
+               case C_ON_SWAPIO_Q:
+                       assert(new_state == C_ON_SWAPPEDOUT_Q || new_state == C_ON_SWAPPEDOUTSPARSE_Q || new_state == C_ON_AGE_Q);
+
+                       queue_remove(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
+                       c_swapio_count--;
+                       break;
+
                case C_ON_SWAPPEDOUT_Q:
                        assert(new_state == C_ON_SWAPPEDIN_Q || new_state == C_ON_AGE_Q ||
                               new_state == C_ON_SWAPPEDOUTSPARSE_Q ||
@@ -1116,7 +1160,8 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
 
                case C_ON_AGE_Q:
-                       assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q || old_state == C_ON_SWAPOUT_Q ||
+                       assert(old_state == C_IS_FILLING || old_state == C_ON_SWAPPEDIN_Q ||
+                              old_state == C_ON_SWAPOUT_Q || old_state == C_ON_SWAPIO_Q ||
                               old_state == C_ON_MAJORCOMPACT_Q || old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
 
                        if (old_state == C_IS_FILLING)
@@ -1134,7 +1179,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
                
                case C_ON_SWAPPEDIN_Q:
-                       assert(c_seg->c_state == C_ON_SWAPPEDOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q);
+                       assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
 
                        if (insert_head == TRUE)
                                queue_enter_first(&c_swappedin_list_head, c_seg, c_segment_t, c_age_list);
@@ -1153,8 +1198,18 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        c_swapout_count++;
                        break;
 
+               case C_ON_SWAPIO_Q:
+                       assert(old_state == C_ON_SWAPOUT_Q);
+
+                       if (insert_head == TRUE)
+                               queue_enter_first(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
+                       else
+                               queue_enter(&c_swapio_list_head, c_seg, c_segment_t, c_age_list);
+                       c_swapio_count++;
+                       break;
+
                case C_ON_SWAPPEDOUT_Q:
-                       assert(c_seg->c_state == C_ON_SWAPOUT_Q);
+                       assert(old_state == C_ON_SWAPIO_Q);
 
                        if (insert_head == TRUE)
                                queue_enter_first(&c_swappedout_list_head, c_seg, c_segment_t, c_age_list);
@@ -1164,7 +1219,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
 
                case C_ON_SWAPPEDOUTSPARSE_Q:
-                       assert(c_seg->c_state == C_ON_SWAPOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUT_Q);
+                       assert(old_state == C_ON_SWAPIO_Q || old_state == C_ON_SWAPPEDOUT_Q);
                        
                        if (insert_head == TRUE)
                                queue_enter_first(&c_swappedout_sparse_list_head, c_seg, c_segment_t, c_age_list);
@@ -1175,7 +1230,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
 
                case C_ON_MAJORCOMPACT_Q:
-                       assert(c_seg->c_state == C_ON_AGE_Q);
+                       assert(old_state == C_ON_AGE_Q);
 
                        if (insert_head == TRUE)
                                queue_enter_first(&c_major_list_head, c_seg, c_segment_t, c_age_list);
@@ -1185,7 +1240,7 @@ c_seg_switch_state(c_segment_t c_seg, int new_state, boolean_t insert_head)
                        break;
 
                case C_ON_BAD_Q:
-                       assert(c_seg->c_state == C_ON_SWAPPEDOUT_Q || c_seg->c_state == C_ON_SWAPPEDOUTSPARSE_Q);
+                       assert(old_state == C_ON_SWAPPEDOUT_Q || old_state == C_ON_SWAPPEDOUTSPARSE_Q);
 
                        if (insert_head == TRUE)
                                queue_enter_first(&c_bad_list_head, c_seg, c_segment_t, c_age_list);
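The assertions updated above define the path a segment now takes through the intermediate C_ON_SWAPIO_Q state while its swap-out I/O is in flight. A hypothetical driver-side sketch of that flow follows; the function itself is illustrative only, while c_seg_switch_state() and the state names are from this file, and in the kernel the c_list_lock and the segment lock are held around these calls.

/*
 * Hypothetical illustration of the new state flow:
 *   C_ON_SWAPOUT_Q -> C_ON_SWAPIO_Q -> C_ON_SWAPPEDOUT_Q (or C_ON_SWAPPEDOUTSPARSE_Q),
 * with C_ON_AGE_Q as the destination if the swap-out is abandoned.
 */
static void
example_swapout_flow(c_segment_t c_seg, boolean_t swap_failed, boolean_t sparse)
{
	/* picked off c_swapout_list_head; I/O is about to be issued */
	c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE);

	if (swap_failed) {
		/* the write did not complete: back to the age queue */
		c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
	} else if (sparse) {
		c_seg_switch_state(c_seg, C_ON_SWAPPEDOUTSPARSE_Q, FALSE);
	} else {
		c_seg_switch_state(c_seg, C_ON_SWAPPEDOUT_Q, FALSE);
	}
}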
@@ -1672,8 +1727,6 @@ uint32_t compressor_thrashing_min_per_10msecs = 20;
 /* When true, reset sample data next chance we get. */
 static boolean_t       compressor_need_sample_reset = FALSE;
 
-extern uint32_t vm_page_filecache_min;
-
 
 void
 compute_swapout_target_age(void)
@@ -1802,7 +1855,8 @@ int               compaction_swapper_abort = 0;
 
 
 #if CONFIG_JETSAM
-boolean_t      memorystatus_kill_on_VM_thrashing(boolean_t);
+boolean_t      memorystatus_kill_on_VM_compressor_thrashing(boolean_t);
+boolean_t      memorystatus_kill_on_VM_compressor_space_shortage(boolean_t);
 boolean_t      memorystatus_kill_on_FC_thrashing(boolean_t);
 int            compressor_thrashing_induced_jetsam = 0;
 int            filecache_thrashing_induced_jetsam = 0;
@@ -1875,7 +1929,13 @@ compressor_needs_to_swap(void)
                        vm_compressor_thrashing_detected = TRUE;
                                
                        if (swapout_target_age || vm_compressor_low_on_space() == TRUE) {
-                               memorystatus_kill_on_VM_thrashing(TRUE /* async */);
+                               if (swapout_target_age) {
+                                       /* The compressor is thrashing. */
+                                       memorystatus_kill_on_VM_compressor_thrashing(TRUE /* async */);
+                               } else {
+                                       /* The compressor is running low on space. */
+                                       memorystatus_kill_on_VM_compressor_space_shortage(TRUE /* async */);
+                               }
                                compressor_thrashing_induced_jetsam++;
                        } else {
                                memorystatus_kill_on_FC_thrashing(TRUE /* async */);
@@ -1967,7 +2027,7 @@ vm_run_compactor(void)
        }
        if (compaction_swapper_running) {
 
-               if (vm_restricted_to_single_processor == FALSE) {
+               if (vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
                        vm_run_compactor_already_running++;
 
                        lck_mtx_unlock_always(c_list_lock);
@@ -2309,7 +2369,7 @@ vm_compressor_swap_trigger_thread(void)
        if (compaction_swapper_init_now) {
                vm_compaction_swapper_do_init();
 
-               if (vm_restricted_to_single_processor == TRUE)
+               if (vm_pageout_state.vm_restricted_to_single_processor == TRUE)
                        thread_vm_bind_group_add();
                thread_set_thread_name(current_thread(), "VM_cswap_trigger");
                compaction_swapper_init_now = 0;
@@ -2904,7 +2964,8 @@ c_seg_allocate(c_segment_t *current_chead)
 
                        if (size_to_populate > C_SEG_MAX_POPULATE_SIZE)
                                size_to_populate = C_SEG_MAX_POPULATE_SIZE;
-                       vm_compressor_pages_grabbed += size_to_populate / PAGE_SIZE;
+
+                       OSAddAtomic64(size_to_populate / PAGE_SIZE,  &vm_pageout_vminfo.vm_compressor_pages_grabbed);
 
                        kernel_memory_populate(compressor_map,
                                               (vm_offset_t) &c_seg->c_store.c_buffer[c_seg->c_populated_offset],
@@ -2933,6 +2994,7 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
        int             new_state = C_ON_AGE_Q;
        clock_sec_t     sec;
        clock_nsec_t    nsec;
+       boolean_t       head_insert = FALSE;
 
        unused_bytes = trunc_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset - c_seg->c_nextoffset));
 
@@ -2989,23 +3051,33 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
 #if CONFIG_FREEZE
        if (current_chead == (c_segment_t*)&freezer_chead &&
            VM_CONFIG_SWAP_IS_PRESENT &&
-           VM_CONFIG_FREEZER_SWAP_IS_ACTIVE &&
-           c_freezer_swapout_count < VM_MAX_FREEZER_CSEG_SWAP_COUNT) {
+           VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
                new_state = C_ON_SWAPOUT_Q;
        }
 #endif /* CONFIG_FREEZE */
 
+       if (vm_darkwake_mode == TRUE) {
+               new_state = C_ON_SWAPOUT_Q;
+               head_insert = TRUE;
+       }
+
        clock_get_system_nanotime(&sec, &nsec);
        c_seg->c_creation_ts = (uint32_t)sec;
 
        lck_mtx_lock_spin_always(c_list_lock);
 
        c_seg->c_generation_id = c_generation_id++;
-       c_seg_switch_state(c_seg, new_state, FALSE);
+       c_seg_switch_state(c_seg, new_state, head_insert);
 
 #if CONFIG_FREEZE
-       if (c_seg->c_state == C_ON_SWAPOUT_Q)
-               c_freezer_swapout_count++;
+       if (c_seg->c_state == C_ON_SWAPOUT_Q) {
+               /*
+                * Darkwake and freezer can't coexist;
+                * we'll need to fix this accounting as a start.
+                */
+               assert(vm_darkwake_mode == FALSE);
+               c_freezer_swapout_page_count += (C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset)) / PAGE_SIZE_64;
+       }
 #endif /* CONFIG_FREEZE */
 
        if (c_seg->c_state == C_ON_AGE_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE)
@@ -3013,10 +3085,8 @@ c_current_seg_filled(c_segment_t c_seg, c_segment_t *current_chead)
 
        lck_mtx_unlock_always(c_list_lock);
 
-#if CONFIG_FREEZE
        if (c_seg->c_state == C_ON_SWAPOUT_Q)
                thread_wakeup((event_t)&c_swapout_list_head);
-#endif /* CONFIG_FREEZE */
 
        *current_chead = NULL;
 }
@@ -3770,7 +3840,7 @@ bypass_busy_check:
                                        lck_mtx_lock_spin_always(&c_seg->c_lock);
                                        C_SEG_WAKEUP_DONE(c_seg);
                                }
-                               if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q)
+                               if (!c_seg->c_on_minorcompact_q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q)
                                        c_seg_need_delayed_compaction(c_seg, FALSE);
                        } else {
                                if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q) {
@@ -3790,7 +3860,8 @@ bypass_busy_check:
                        }
                } else if ( !(C_SEG_IS_ONDISK(c_seg))) {
 
-                       if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
+                       if (c_seg->c_state != C_ON_BAD_Q && c_seg->c_state != C_ON_SWAPOUT_Q && c_seg->c_state != C_ON_SWAPIO_Q &&
+                           C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE) {
                                c_seg_need_delayed_compaction(c_seg, FALSE);
                        }
                } else if (c_seg->c_state != C_ON_SWAPPEDOUTSPARSE_Q && C_SEG_ONDISK_IS_SPARSE(c_seg)) {
@@ -3810,7 +3881,7 @@ done:
        PAGE_REPLACEMENT_DISALLOWED(FALSE);
 
        if (consider_defragmenting == TRUE)
-               vm_swap_consider_defragmenting();
+               vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
 
 #if CONFIG_EMBEDDED
        if ((c_minor_count && COMPRESSOR_NEEDS_TO_MINOR_COMPACT()) || vm_compressor_needs_to_major_compact())
index 4b0883873b3293e2710f76d0ee5dc35209a5e025..0dea16c531599ac2fa1e34a7567634c761fd93ab 100644 (file)
@@ -60,7 +60,9 @@
 
 #if DEVELOPMENT || DEBUG
 
-
+#if defined(PLATFORM_WatchOS)
+#define VALIDATE_C_SEGMENTS (1)
+#endif
 #endif
 
 #endif
 #define CHECKSUM_THE_SWAP              ENABLE_SWAP_CHECKS      /* Debug swap data */
 #define CHECKSUM_THE_DATA              ENABLE_COMPRESSOR_CHECKS        /* Debug compressor/decompressor data */
 #define CHECKSUM_THE_COMPRESSED_DATA   ENABLE_COMPRESSOR_CHECKS        /* Debug compressor/decompressor compressed data */
+
+#ifndef VALIDATE_C_SEGMENTS
 #define VALIDATE_C_SEGMENTS            ENABLE_COMPRESSOR_CHECKS        /* Debug compaction */
+#endif
 
 #define RECORD_THE_COMPRESSED_DATA     0
 
@@ -117,6 +122,7 @@ struct c_slot {
 #define        C_ON_SWAPPEDIN_Q        7
 #define        C_ON_MAJORCOMPACT_Q     8
 #define        C_ON_BAD_Q              9
+#define C_ON_SWAPIO_Q          10
 
 
 struct c_segment {
@@ -222,7 +228,8 @@ extern      vm_offset_t     c_buffers;
 #define C_SEG_IS_ONDISK(cseg)          ((cseg->c_state == C_ON_SWAPPEDOUT_Q || cseg->c_state == C_ON_SWAPPEDOUTSPARSE_Q))
 #define C_SEG_IS_ON_DISK_OR_SOQ(cseg)  ((cseg->c_state == C_ON_SWAPPEDOUT_Q || \
                                          cseg->c_state == C_ON_SWAPPEDOUTSPARSE_Q || \
-                                         cseg->c_state == C_ON_SWAPOUT_Q))
+                                         cseg->c_state == C_ON_SWAPOUT_Q || \
+                                         cseg->c_state == C_ON_SWAPIO_Q))
 
 
 #define C_SEG_WAKEUP_DONE(cseg)                                \
@@ -317,7 +324,7 @@ extern void         vm_swap_decrypt(c_segment_t);
 extern int             vm_swap_low_on_space(void);
 extern kern_return_t   vm_swap_get(c_segment_t, uint64_t, uint64_t);
 extern void            vm_swap_free(uint64_t);
-extern void            vm_swap_consider_defragmenting(void);
+extern void            vm_swap_consider_defragmenting(int);
 
 extern void            c_seg_swapin_requeue(c_segment_t, boolean_t, boolean_t, boolean_t);
 extern int             c_seg_swapin(c_segment_t, boolean_t, boolean_t);
@@ -358,6 +365,12 @@ extern uint32_t    vm_compressor_minorcompact_threshold_divisor;
 extern uint32_t        vm_compressor_majorcompact_threshold_divisor;
 extern uint32_t        vm_compressor_unthrottle_threshold_divisor;
 extern uint32_t        vm_compressor_catchup_threshold_divisor;
+
+extern uint32_t        vm_compressor_minorcompact_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_majorcompact_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_unthrottle_threshold_divisor_overridden;
+extern uint32_t        vm_compressor_catchup_threshold_divisor_overridden;
+
 extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, clock_sec_t, clock_nsec_t);
 
 #define PAGE_REPLACEMENT_DISALLOWED(enable)    (enable == TRUE ? lck_rw_lock_shared(&c_master_lock) : lck_rw_done(&c_master_lock))
@@ -366,14 +379,25 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c
 
 #define AVAILABLE_NON_COMPRESSED_MEMORY                (vm_page_active_count + vm_page_inactive_count + vm_page_free_count + vm_page_speculative_count)
 #define AVAILABLE_MEMORY                       (AVAILABLE_NON_COMPRESSED_MEMORY + VM_PAGE_COMPRESSOR_COUNT)
-/* TODO, there may be a minor optimisation opportunity to replace these divisions
+
+/*
+ * TODO, there may be a minor optimisation opportunity to replace these divisions
  * with multiplies and shifts
+ *
+ * By multiplying by 10, the divisors can have more precision without resorting to floating point: a divisor specified as 25 is in reality a divide by 2.5.
+ * By multiplying by 9, you get a number ~11% smaller, which allows us to have another limit point derived from the same base.
+ * By multiplying by 11, you get a number ~10% bigger, which allows us to generate a reset limit derived from the same base, which is useful for hysteresis.
  */
 
-#define        VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD            (((AVAILABLE_MEMORY) * 10) / (vm_compressor_minorcompact_threshold_divisor ? vm_compressor_minorcompact_threshold_divisor : 1))
-#define        VM_PAGE_COMPRESSOR_SWAP_THRESHOLD               (((AVAILABLE_MEMORY) * 10) / (vm_compressor_majorcompact_threshold_divisor ? vm_compressor_majorcompact_threshold_divisor : 1))
-#define        VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD    (((AVAILABLE_MEMORY) * 10) / (vm_compressor_unthrottle_threshold_divisor ? vm_compressor_unthrottle_threshold_divisor : 1))
-#define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD      (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 1))
+#define        VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD            (((AVAILABLE_MEMORY) * 10) / (vm_compressor_minorcompact_threshold_divisor ? vm_compressor_minorcompact_threshold_divisor : 10))
+#define        VM_PAGE_COMPRESSOR_SWAP_THRESHOLD               (((AVAILABLE_MEMORY) * 10) / (vm_compressor_majorcompact_threshold_divisor ? vm_compressor_majorcompact_threshold_divisor : 10))
+
+#define        VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD    (((AVAILABLE_MEMORY) * 10) / (vm_compressor_unthrottle_threshold_divisor ? vm_compressor_unthrottle_threshold_divisor : 10))
+#define        VM_PAGE_COMPRESSOR_SWAP_RETHROTTLE_THRESHOLD    (((AVAILABLE_MEMORY) * 11) / (vm_compressor_unthrottle_threshold_divisor ? vm_compressor_unthrottle_threshold_divisor : 11))
+
+#define VM_PAGE_COMPRESSOR_SWAP_HAS_CAUGHTUP_THRESHOLD (((AVAILABLE_MEMORY) * 11) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 11))
+#define VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD      (((AVAILABLE_MEMORY) * 10) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 10))
+#define VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD      (((AVAILABLE_MEMORY) * 9) / (vm_compressor_catchup_threshold_divisor ? vm_compressor_catchup_threshold_divisor : 9))
 
 #ifdef CONFIG_EMBEDDED
 #define AVAILABLE_NON_COMPRESSED_MIN                   20000
@@ -383,11 +407,11 @@ extern uint64_t vm_compressor_compute_elapsed_msecs(clock_sec_t, clock_nsec_t, c
 #define COMPRESSOR_NEEDS_TO_SWAP()             ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_THRESHOLD) ? 1 : 0)
 #endif
 
-#define VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()                            \
-       (vm_compressor_mode == VM_PAGER_COMPRESSOR_WITH_SWAP &&         \
-        ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD) ? 1 : 0))
-#define HARD_THROTTLE_LIMIT_REACHED()          ((AVAILABLE_NON_COMPRESSED_MEMORY < (VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 2) ? 1 : 0)
+#define HARD_THROTTLE_LIMIT_REACHED()          ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_HARD_THROTTLE_THRESHOLD) ? 1 : 0)
 #define SWAPPER_NEEDS_TO_UNTHROTTLE()          ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) ? 1 : 0)
+#define SWAPPER_NEEDS_TO_RETHROTTLE()          ((AVAILABLE_NON_COMPRESSED_MEMORY > VM_PAGE_COMPRESSOR_SWAP_RETHROTTLE_THRESHOLD) ? 1 : 0)
+#define SWAPPER_NEEDS_TO_CATCHUP()             ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_SWAP_CATCHUP_THRESHOLD) ? 1 : 0)
+#define SWAPPER_HAS_CAUGHTUP()                 ((AVAILABLE_NON_COMPRESSED_MEMORY > VM_PAGE_COMPRESSOR_SWAP_HAS_CAUGHTUP_THRESHOLD) ? 1 : 0)
 #define COMPRESSOR_NEEDS_TO_MINOR_COMPACT()    ((AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0)
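A worked example of the fixed-point scheme described in the comment above, using made-up numbers; this snippet is illustrative only and not part of xnu.

/* Worked example with hypothetical values. */
static void
example_thresholds(void)
{
	uint64_t available_memory = 100000;	/* pages (hypothetical) */
	uint32_t divisor          = 10;		/* default divisor: effectively "divide by 1.0" */

	uint64_t catchup   = (available_memory * 10) / divisor;	/* 100000 */
	uint64_t caught_up = (available_memory * 11) / divisor;	/* 110000: higher reset limit, used for hysteresis */
	uint64_t hard      = (available_memory *  9) / divisor;	/*  90000: lower limit, used for the hard throttle */

	/* A divisor of 25 would instead mean "divide by 2.5": (100000 * 10) / 25 == 40000 */
	(void)catchup; (void)caught_up; (void)hard;
}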
 
 
index f4ec70ce5747307fd048635d1bcc9971d300644a..e8c1342a1986e544799486813ce69bd9fbb6dac5 100644 (file)
@@ -27,6 +27,7 @@
  */
 
 #include "vm_compressor_backing_store.h"
+#include <vm/vm_pageout.h>
 #include <vm/vm_protos.h>
 
 #include <IOKit/IOHibernatePrivate.h>
@@ -37,12 +38,12 @@ boolean_t   compressor_store_stop_compaction = FALSE;
 boolean_t      vm_swapfile_create_needed = FALSE;
 boolean_t      vm_swapfile_gc_needed = FALSE;
 
-int            swapper_throttle = -1;
-boolean_t      swapper_throttle_inited = FALSE;
+int            vm_swapper_throttle = -1;
 uint64_t       vm_swapout_thread_id;
 
 uint64_t       vm_swap_put_failures = 0;
 uint64_t       vm_swap_get_failures = 0;
+int            vm_num_swap_files_config = 0;
 int            vm_num_swap_files = 0;
 int            vm_num_pinned_swap_files = 0;
 int            vm_swapout_thread_processed_segments = 0;
@@ -110,18 +111,21 @@ static void vm_swap_do_delayed_trim(struct swapfile *);
 static void vm_swap_wait_on_trim_handling_in_progress(void);
 
 
+boolean_t vm_swap_force_defrag = FALSE, vm_swap_force_reclaim = FALSE;
+
 #if CONFIG_EMBEDDED
-/*
- * Only 1 swap file currently allowed.
- */
-#define VM_MAX_SWAP_FILE_NUM           1
+
+#if DEVELOPMENT || DEBUG
+#define VM_MAX_SWAP_FILE_NUM           100
+#else /* DEVELOPMENT || DEBUG */
+#define VM_MAX_SWAP_FILE_NUM           5
+#endif /* DEVELOPMENT || DEBUG */
+
 #define        VM_SWAPFILE_DELAYED_TRIM_MAX    4
 
-#define        VM_SWAP_SHOULD_DEFRAGMENT()     (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16) ? 1 : 0)
-#define VM_SWAP_SHOULD_RECLAIM()       FALSE
-#define VM_SWAP_SHOULD_ABORT_RECLAIM() FALSE
+#define        VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 16))) ? 1 : 0)
 #define VM_SWAP_SHOULD_PIN(_size)      FALSE
-#define VM_SWAP_SHOULD_CREATE(cur_ts)  ((vm_num_swap_files < VM_MAX_SWAP_FILE_NUM) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \
+#define VM_SWAP_SHOULD_CREATE(cur_ts)  ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \
                                         ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
 #define VM_SWAP_SHOULD_TRIM(swf)       ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)
 
@@ -130,19 +134,19 @@ static void vm_swap_wait_on_trim_handling_in_progress(void);
 #define VM_MAX_SWAP_FILE_NUM           100
 #define        VM_SWAPFILE_DELAYED_TRIM_MAX    128
 
-#define        VM_SWAP_SHOULD_DEFRAGMENT()     (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4) ? 1 : 0)
-#define VM_SWAP_SHOULD_RECLAIM()       (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS) ? 1 : 0)
-#define VM_SWAP_SHOULD_ABORT_RECLAIM() (((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS) ? 1 : 0)
+#define        VM_SWAP_SHOULD_DEFRAGMENT()     (((vm_swap_force_defrag == TRUE) || (c_swappedout_sparse_count > (vm_swapfile_total_segs_used / 4))) ? 1 : 0)
 #define VM_SWAP_SHOULD_PIN(_size)      (vm_swappin_avail > 0 && vm_swappin_avail >= (int64_t)(_size))
-#define VM_SWAP_SHOULD_CREATE(cur_ts)  ((vm_num_swap_files < VM_MAX_SWAP_FILE_NUM) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \
+#define VM_SWAP_SHOULD_CREATE(cur_ts)  ((vm_num_swap_files < vm_num_swap_files_config) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) < (unsigned int)VM_SWAPFILE_HIWATER_SEGS) && \
                                         ((cur_ts - vm_swapfile_last_failed_to_create_ts) > VM_SWAPFILE_DELAYED_CREATE) ? 1 : 0)
 #define VM_SWAP_SHOULD_TRIM(swf)       ((swf->swp_delayed_trim_count >= VM_SWAPFILE_DELAYED_TRIM_MAX) ? 1 : 0)
 
 #endif /* CONFIG_EMBEDDED */
 
+#define VM_SWAP_SHOULD_RECLAIM()       (((vm_swap_force_reclaim == TRUE) || ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) >= SWAPFILE_RECLAIM_THRESHOLD_SEGS)) ? 1 : 0)
+#define VM_SWAP_SHOULD_ABORT_RECLAIM() (((vm_swap_force_reclaim == FALSE) && ((vm_swapfile_total_segs_alloced - vm_swapfile_total_segs_used) <= SWAPFILE_RECLAIM_MINIMUM_SEGS)) ? 1 : 0)
 #define        VM_SWAPFILE_DELAYED_CREATE      15
 
-#define VM_SWAP_BUSY() ((c_swapout_count && (swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER1 || swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0)
+#define VM_SWAP_BUSY() ((c_swapout_count && (vm_swapper_throttle == THROTTLE_LEVEL_COMPRESSOR_TIER0)) ? 1 : 0)
 
 
 #if CHECKSUM_THE_SWAP
@@ -197,152 +201,149 @@ vm_swapfile_for_handle(uint64_t f_offset)
 
 #if ENCRYPTED_SWAP
 
-#include <libkern/crypto/aes.h>
-extern u_int32_t random(void); /* from <libkern/libkern.h> */
+#include <libkern/crypto/aesxts.h>
 
-#define SWAP_CRYPT_AES_KEY_SIZE 128     /* XXX 192 and 256 don't work ! */
+extern int cc_rand_generate(void *, size_t);     /* from <libkern/crypto/rand.h> */
 
-boolean_t              swap_crypt_ctx_initialized;
-void                   swap_crypt_ctx_initialize(void);
+boolean_t      swap_crypt_initialized;
+void           swap_crypt_initialize(void);
 
-aes_ctx                        swap_crypt_ctx;
-const unsigned char     swap_crypt_null_iv[AES_BLOCK_SIZE] = {0xa, };
-uint32_t                swap_crypt_key[8]; /* big enough for a 256 key */
+symmetric_xts   xts_modectx;
+uint32_t        swap_crypt_key1[8];   /* big enough for a 256 bit random key */
+uint32_t        swap_crypt_key2[8];   /* big enough for a 256 bit random key */
 
-unsigned long          vm_page_encrypt_counter;
-unsigned long          vm_page_decrypt_counter;
+#if DEVELOPMENT || DEBUG
+boolean_t      swap_crypt_xts_tested = FALSE;
+unsigned char   swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
+unsigned char   swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
+unsigned char   swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
+#endif /* DEVELOPMENT || DEBUG */
 
+unsigned long  vm_page_encrypt_counter;
+unsigned long  vm_page_decrypt_counter;
 
-#if DEBUG
-boolean_t              swap_crypt_ctx_tested = FALSE;
-unsigned char swap_crypt_test_page_ref[4096] __attribute__((aligned(4096)));
-unsigned char swap_crypt_test_page_encrypt[4096] __attribute__((aligned(4096)));
-unsigned char swap_crypt_test_page_decrypt[4096] __attribute__((aligned(4096)));
-#endif /* DEBUG */
 
-/*
- * Initialize the encryption context: key and key size.
- */
-void swap_crypt_ctx_initialize(void); /* forward */
 void
-swap_crypt_ctx_initialize(void)
+swap_crypt_initialize(void)
 {
-       unsigned int    i;
+        uint8_t  *enckey1, *enckey2;
+       int      keylen1, keylen2;
+       int      error;
 
-       /*
-        * No need for locking to protect swap_crypt_ctx_initialized
-        * because the first use of encryption will come from the
-        * pageout thread (we won't pagein before there's been a pageout)
-        * and there's only one pageout thread.
-        */
-       if (swap_crypt_ctx_initialized == FALSE) {
-               for (i = 0;
-                    i < (sizeof (swap_crypt_key) /
-                         sizeof (swap_crypt_key[0]));
-                    i++) {
-                       swap_crypt_key[i] = random();
-               }
-               aes_encrypt_key((const unsigned char *) swap_crypt_key,
-                               SWAP_CRYPT_AES_KEY_SIZE,
-                               &swap_crypt_ctx.encrypt);
-               aes_decrypt_key((const unsigned char *) swap_crypt_key,
-                               SWAP_CRYPT_AES_KEY_SIZE,
-                               &swap_crypt_ctx.decrypt);
-               swap_crypt_ctx_initialized = TRUE;
-       }
+       assert(swap_crypt_initialized == FALSE);
+
+       keylen1 = sizeof(swap_crypt_key1);
+       enckey1 = (uint8_t *)&swap_crypt_key1;
+       keylen2 = sizeof(swap_crypt_key2);
+       enckey2 = (uint8_t *)&swap_crypt_key2;
+
+       error = cc_rand_generate((void *)enckey1, keylen1);
+       assert(!error);
+
+       error = cc_rand_generate((void *)enckey2, keylen2);
+       assert(!error);
+
+       error = xts_start(0, NULL, enckey1, keylen1, enckey2, keylen2, 0, 0, &xts_modectx);
+       assert(!error);
+
+       swap_crypt_initialized = TRUE;
+
+#if DEVELOPMENT || DEBUG
+        uint8_t *encptr;
+        uint8_t *decptr;
+        uint8_t *refptr;
+       uint8_t *iv;
+       uint64_t ivnum[2];
+       int size = 0;
+       int i    = 0;
+       int rc   = 0;
+
+       assert(swap_crypt_xts_tested == FALSE);
 
-#if DEBUG
        /*
         * Validate the encryption algorithms.
+        *
+        * First initialize the test data.
         */
-       if (swap_crypt_ctx_tested == FALSE) {
-               /* initialize */
-               for (i = 0; i < 4096; i++) {
-                       swap_crypt_test_page_ref[i] = (char) i;
-               }
-               /* encrypt */
-               aes_encrypt_cbc(swap_crypt_test_page_ref,
-                               swap_crypt_null_iv,
-                               PAGE_SIZE / AES_BLOCK_SIZE,
-                               swap_crypt_test_page_encrypt,
-                               &swap_crypt_ctx.encrypt);
-               /* decrypt */
-               aes_decrypt_cbc(swap_crypt_test_page_encrypt,
-                               swap_crypt_null_iv,
-                               PAGE_SIZE / AES_BLOCK_SIZE,
-                               swap_crypt_test_page_decrypt,
-                               &swap_crypt_ctx.decrypt);
-               /* compare result with original */
-               for (i = 0; i < 4096; i ++) {
-                       if (swap_crypt_test_page_decrypt[i] !=
-                           swap_crypt_test_page_ref[i]) {
-                               panic("encryption test failed");
-                       }
+       for (i = 0; i < 4096; i++) {
+               swap_crypt_test_page_ref[i] = (char) i;
+       }
+       ivnum[0] = (uint64_t)0xaa;
+       ivnum[1] = 0;
+       iv = (uint8_t *)ivnum;
+       
+       refptr = (uint8_t *)swap_crypt_test_page_ref;
+       encptr = (uint8_t *)swap_crypt_test_page_encrypt;
+       decptr = (uint8_t *)swap_crypt_test_page_decrypt;
+       size = 4096;
+
+       /* encrypt */
+       rc = xts_encrypt(refptr, size, encptr, iv, &xts_modectx);
+       assert(!rc);
+
+       /* compare result with original - should NOT match */
+       for (i = 0; i < 4096; i ++) {
+               if (swap_crypt_test_page_encrypt[i] !=
+                   swap_crypt_test_page_ref[i]) {
+                       break;
                }
+       }
+       assert(i != 4096);
 
-               /* encrypt again */
-               aes_encrypt_cbc(swap_crypt_test_page_decrypt,
-                               swap_crypt_null_iv,
-                               PAGE_SIZE / AES_BLOCK_SIZE,
-                               swap_crypt_test_page_decrypt,
-                               &swap_crypt_ctx.encrypt);
-               /* decrypt in place */
-               aes_decrypt_cbc(swap_crypt_test_page_decrypt,
-                               swap_crypt_null_iv,
-                               PAGE_SIZE / AES_BLOCK_SIZE,
-                               swap_crypt_test_page_decrypt,
-                               &swap_crypt_ctx.decrypt);
-               for (i = 0; i < 4096; i ++) {
-                       if (swap_crypt_test_page_decrypt[i] !=
-                           swap_crypt_test_page_ref[i]) {
-                               panic("in place encryption test failed");
-                       }
-               }
+       /* decrypt */
+       rc = xts_decrypt(encptr, size, decptr, iv, &xts_modectx);
+       assert(!rc);
 
-               swap_crypt_ctx_tested = TRUE;
+       /* compare result with original */
+       for (i = 0; i < 4096; i ++) {
+               if (swap_crypt_test_page_decrypt[i] !=
+                   swap_crypt_test_page_ref[i]) {
+                       panic("encryption test failed");
+               }
+       }
+       /* encrypt in place */
+       rc = xts_encrypt(decptr, size, decptr, iv, &xts_modectx);
+       assert(!rc);
+
+       /* decrypt in place */
+       rc = xts_decrypt(decptr, size, decptr, iv, &xts_modectx);
+       assert(!rc);
+
+       for (i = 0; i < 4096; i ++) {
+               if (swap_crypt_test_page_decrypt[i] !=
+                   swap_crypt_test_page_ref[i]) {
+                       panic("in place encryption test failed");
+               }
        }
-#endif /* DEBUG */
+       swap_crypt_xts_tested = TRUE;
+#endif /* DEVELOPMENT || DEBUG */
 }
 
 
 void
 vm_swap_encrypt(c_segment_t c_seg)
 {
-       vm_offset_t     kernel_vaddr = 0;
-       uint64_t        size = 0;
+        uint8_t *ptr;
+       uint8_t *iv;
+       uint64_t ivnum[2];
+       int size = 0;
+       int rc   = 0;
+
+       if (swap_crypt_initialized == FALSE)
+               swap_crypt_initialize();
 
-       union {
-               unsigned char   aes_iv[AES_BLOCK_SIZE];
-               void            *c_seg;
-       } encrypt_iv;
-       
-       assert(swap_crypt_ctx_initialized);
-       
 #if DEVELOPMENT || DEBUG
        C_SEG_MAKE_WRITEABLE(c_seg);
 #endif
-       bzero(&encrypt_iv.aes_iv[0], sizeof (encrypt_iv.aes_iv));
-
-       encrypt_iv.c_seg = (void*)c_seg;
-
-       /* encrypt the "initial vector" */
-       aes_encrypt_cbc((const unsigned char *) &encrypt_iv.aes_iv[0],
-                       swap_crypt_null_iv,
-                       1,
-                       &encrypt_iv.aes_iv[0],
-                       &swap_crypt_ctx.encrypt);
-
-       kernel_vaddr = (vm_offset_t) c_seg->c_store.c_buffer;
+       ptr = (uint8_t *)c_seg->c_store.c_buffer;
        size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
 
-       /*
-        * Encrypt the c_segment.
-        */
-       aes_encrypt_cbc((const unsigned char *) kernel_vaddr,
-                       &encrypt_iv.aes_iv[0],
-                       (unsigned int)(size / AES_BLOCK_SIZE),
-                       (unsigned char *) kernel_vaddr,
-                       &swap_crypt_ctx.encrypt);
+       ivnum[0] = (uint64_t)c_seg;
+       ivnum[1] = 0;
+       iv = (uint8_t *)ivnum;
+
+       rc = xts_encrypt(ptr, size, ptr, iv, &xts_modectx);
+       assert(!rc);
 
        vm_page_encrypt_counter += (size/PAGE_SIZE_64);
 
@@ -354,48 +355,26 @@ vm_swap_encrypt(c_segment_t c_seg)
 void
 vm_swap_decrypt(c_segment_t c_seg)
 {
+        uint8_t *ptr;
+       uint8_t *iv;
+       uint64_t ivnum[2];
+       int size = 0;
+       int rc   = 0;
 
-       vm_offset_t     kernel_vaddr = 0;
-       uint64_t        size = 0;
-
-       union {
-               unsigned char   aes_iv[AES_BLOCK_SIZE];
-               void            *c_seg;
-       } decrypt_iv;
-       
-       
-       assert(swap_crypt_ctx_initialized);
+       assert(swap_crypt_initialized);
 
 #if DEVELOPMENT || DEBUG
        C_SEG_MAKE_WRITEABLE(c_seg);
 #endif
-       /*
-        * Prepare an "initial vector" for the decryption.
-        * It has to be the same as the "initial vector" we
-        * used to encrypt that page.
-        */
-       bzero(&decrypt_iv.aes_iv[0], sizeof (decrypt_iv.aes_iv));
-
-       decrypt_iv.c_seg = (void*)c_seg;
-
-       /* encrypt the "initial vector" */
-       aes_encrypt_cbc((const unsigned char *) &decrypt_iv.aes_iv[0],
-                       swap_crypt_null_iv,
-                       1,
-                       &decrypt_iv.aes_iv[0],
-                       &swap_crypt_ctx.encrypt);
-       
-       kernel_vaddr = (vm_offset_t) c_seg->c_store.c_buffer;
+       ptr = (uint8_t *)c_seg->c_store.c_buffer;
        size = round_page_32(C_SEG_OFFSET_TO_BYTES(c_seg->c_populated_offset));
 
-       /*
-        * Decrypt the c_segment.
-        */
-       aes_decrypt_cbc((const unsigned char *) kernel_vaddr,
-                       &decrypt_iv.aes_iv[0],
-                       (unsigned int) (size / AES_BLOCK_SIZE),
-                       (unsigned char *) kernel_vaddr,
-                       &swap_crypt_ctx.decrypt);
+       ivnum[0] = (uint64_t)c_seg;
+       ivnum[1] = 0;
+       iv = (uint8_t *)ivnum;
+
+       rc = xts_decrypt(ptr, size, ptr, iv, &xts_modectx);
+       assert(!rc);
 
        vm_page_decrypt_counter += (size/PAGE_SIZE_64);
 
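The code above replaces the old AES-CBC scheme with AES-XTS from corecrypto, encrypting each c_segment in place with a tweak derived from the segment's kernel address. A minimal round-trip sketch under the same assumptions; the function, buffer and size below are hypothetical, while xts_encrypt/xts_decrypt and the global xts_modectx are the interfaces used above.

/* Hypothetical stand-alone illustration of the in-place XTS round trip. */
static void
example_xts_roundtrip(uint8_t *buf, int size, void *tweak_owner)
{
	uint64_t ivnum[2];
	uint8_t  *iv;
	int      rc;

	/* tweak: the owning object's address in the low 64 bits, zero above */
	ivnum[0] = (uint64_t)tweak_owner;
	ivnum[1] = 0;
	iv = (uint8_t *)ivnum;

	/* encrypt in place ... */
	rc = xts_encrypt(buf, size, buf, iv, &xts_modectx);
	assert(!rc);

	/* ... and later decrypt in place with the same tweak */
	rc = xts_decrypt(buf, size, buf, iv, &xts_modectx);
	assert(!rc);
}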
@@ -428,6 +407,7 @@ vm_compressor_swap_init()
                                         BASEPRI_VM, &thread) != KERN_SUCCESS) {
                panic("vm_swapout_thread: create failed");
        }
+       thread_set_thread_name(thread, "VM_swapout");
        vm_swapout_thread_id = thread->thread_id;
 
        thread_deallocate(thread);
@@ -437,12 +417,14 @@ vm_compressor_swap_init()
                panic("vm_swapfile_create_thread: create failed");
        }
 
+       thread_set_thread_name(thread, "VM_swapfile_create");
        thread_deallocate(thread);
 
        if (kernel_thread_start_priority((thread_continue_t)vm_swapfile_gc_thread, NULL,
                                 BASEPRI_VM, &thread) != KERN_SUCCESS) {
                panic("vm_swapfile_gc_thread: create failed");
        }
+       thread_set_thread_name(thread, "VM_swapfile_gc");
        thread_deallocate(thread);
 
        proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
@@ -450,12 +432,6 @@ vm_compressor_swap_init()
        proc_set_thread_policy_with_tid(kernel_task, thread->thread_id,
                                        TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
 
-#if ENCRYPTED_SWAP
-       if (swap_crypt_ctx_initialized == FALSE) {
-               swap_crypt_ctx_initialize();
-       }
-#endif /* ENCRYPTED_SWAP */
-
 #if CONFIG_EMBEDDED
        /*
         * dummy value until the swap file gets created 
@@ -465,6 +441,9 @@ vm_compressor_swap_init()
         */
        c_overage_swapped_limit = 16;
 #endif
+
+       vm_num_swap_files_config = VM_MAX_SWAP_FILE_NUM;
+
        printf("VM Swap Subsystem is ON\n");
 }
 
@@ -521,9 +500,23 @@ vm_compaction_swapper_do_init(void)
                if (vp) {
                        
                        if (vnode_pager_isSSD(vp) == FALSE) {
-                               vm_compressor_minorcompact_threshold_divisor = 18;
-                               vm_compressor_majorcompact_threshold_divisor = 22;
-                               vm_compressor_unthrottle_threshold_divisor = 32;
+                               /*
+                                * Swap files live on an HDD, so make sure to start swapping
+                                * much earlier: we're not worried about SSD write-wear and
+                                * we have very little write bandwidth to work with.
+                                * These values were derived experimentally by running the
+                                * performance team's stock test for evaluating HDD performance
+                                * against various combinations and comparing overall results.
+                                * Note that the > relationship between these 4 values must be
+                                * maintained.
+                                */
+                               if (vm_compressor_minorcompact_threshold_divisor_overridden == 0)
+                                       vm_compressor_minorcompact_threshold_divisor = 15;
+                               if (vm_compressor_majorcompact_threshold_divisor_overridden == 0)
+                                       vm_compressor_majorcompact_threshold_divisor = 18;
+                               if (vm_compressor_unthrottle_threshold_divisor_overridden == 0)
+                                       vm_compressor_unthrottle_threshold_divisor = 24;
+                               if (vm_compressor_catchup_threshold_divisor_overridden == 0)
+                                       vm_compressor_catchup_threshold_divisor = 30;
                        }
 #if !CONFIG_EMBEDDED
                        vnode_setswapmount(vp);
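One way to read the ordering note in the new comment above is that the four divisors stay strictly increasing (15 < 18 < 24 < 30 in the HDD defaults). The following is a hedged sanity-check sketch under that assumption; the variables are the ones assigned above, but the check itself is not part of the commit.

        /* Hedged sketch: one reading of the ordering note above (not in the commit). */
        static void
        hdd_divisor_order_check_sketch(void)
        {
                assert(vm_compressor_minorcompact_threshold_divisor <
                       vm_compressor_majorcompact_threshold_divisor);
                assert(vm_compressor_majorcompact_threshold_divisor <
                       vm_compressor_unthrottle_threshold_divisor);
                assert(vm_compressor_unthrottle_threshold_divisor <
                       vm_compressor_catchup_threshold_divisor);
        }
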
@@ -542,16 +535,26 @@ vm_compaction_swapper_do_init(void)
 }
 
 
-
 void
-vm_swap_consider_defragmenting()
+vm_swap_consider_defragmenting(int flags)
 {
+       boolean_t force_defrag = (flags & VM_SWAP_FLAGS_FORCE_DEFRAG);
+       boolean_t force_reclaim = (flags & VM_SWAP_FLAGS_FORCE_RECLAIM);
+
        if (compressor_store_stop_compaction == FALSE && !VM_SWAP_BUSY() &&
-           (VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) {
+           (force_defrag || force_reclaim || VM_SWAP_SHOULD_DEFRAGMENT() || VM_SWAP_SHOULD_RECLAIM())) {
 
-               if (!vm_swapfile_gc_thread_running) {
+               if (!vm_swapfile_gc_thread_running || force_defrag || force_reclaim) {
                        lck_mtx_lock(&vm_swap_data_lock);
 
+                       if (force_defrag) {
+                               vm_swap_force_defrag = TRUE;
+                       }
+
+                       if (force_reclaim) {
+                               vm_swap_force_reclaim = TRUE;
+                       }
+
                        if (!vm_swapfile_gc_thread_running)
                                thread_wakeup((event_t) &vm_swapfile_gc_needed);
 
@@ -783,6 +786,9 @@ vm_swapfile_gc_thread(void)
                if (need_defragment == FALSE && need_reclaim == FALSE)
                        break;
 
+               vm_swap_force_defrag = FALSE;
+               vm_swap_force_reclaim = FALSE;
+
                lck_mtx_unlock(&vm_swap_data_lock);
 
                if (need_defragment == TRUE)
@@ -806,98 +812,217 @@ vm_swapfile_gc_thread(void)
 
 
 
-int      swapper_entered_T0 = 0;
-int      swapper_entered_T1 = 0;
-int      swapper_entered_T2 = 0;
+#define   VM_SWAPOUT_LIMIT_T2P  4
+#define   VM_SWAPOUT_LIMIT_T1P  4
+#define   VM_SWAPOUT_LIMIT_T0P  6
+#define   VM_SWAPOUT_LIMIT_T0   8
+#define   VM_SWAPOUT_LIMIT_MAX  8
+
+#define   VM_SWAPOUT_START      0
+#define   VM_SWAPOUT_T2_PASSIVE 1
+#define   VM_SWAPOUT_T1_PASSIVE 2
+#define   VM_SWAPOUT_T0_PASSIVE 3
+#define   VM_SWAPOUT_T0         4
+
+int vm_swapout_state = VM_SWAPOUT_START;
+int vm_swapout_limit = 1;
+
+int vm_swapper_entered_T0  = 0;
+int vm_swapper_entered_T0P = 0;
+int vm_swapper_entered_T1P = 0;
+int vm_swapper_entered_T2P = 0;
+
 
 static void
 vm_swapout_thread_throttle_adjust(void)
 {
-       int swapper_throttle_new;
 
-       if (swapper_throttle_inited == FALSE) {
-               /*
-                * force this thread to be set to the correct
-                * throttling tier
-                */
-               swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2;
-               swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1;
-               swapper_throttle_inited = TRUE;
-               swapper_entered_T2++;
-               goto done;
-       }
-       swapper_throttle_new = swapper_throttle;
+       switch(vm_swapout_state) {
+
+       case VM_SWAPOUT_START:
+         
+               vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
+               vm_swapper_entered_T2P++;
+
+               proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                               TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+               proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                               TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+               vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
+               vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
+
+               break;
 
+       case VM_SWAPOUT_T2_PASSIVE:
 
-       switch(swapper_throttle) {
+               if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
+                       vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
+                       vm_swapper_entered_T0P++;
 
-       case THROTTLE_LEVEL_COMPRESSOR_TIER2:
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
+                       vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
 
-               if (SWAPPER_NEEDS_TO_UNTHROTTLE() || swapout_target_age || hibernate_flushing == TRUE) {
-                       swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER1;
-                       swapper_entered_T1++;
                        break;
                }
+               if (swapout_target_age || hibernate_flushing == TRUE) {
+                       vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER1;
+                       vm_swapper_entered_T1P++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T1P;
+                       vm_swapout_state = VM_SWAPOUT_T1_PASSIVE;
+               }
                break;
 
-       case THROTTLE_LEVEL_COMPRESSOR_TIER1:
+       case VM_SWAPOUT_T1_PASSIVE:
+
+               if (SWAPPER_NEEDS_TO_UNTHROTTLE()) {
+                       vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER0;
+                       vm_swapper_entered_T0P++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
+                       vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
 
-               if (VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
-                       swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER0;
-                       swapper_entered_T0++;
                        break;
                }
-               if (COMPRESSOR_NEEDS_TO_SWAP() == 0 && swapout_target_age == 0 && hibernate_flushing == FALSE) {
-                       swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2;
-                       swapper_entered_T2++;
-                       break;
+               if (swapout_target_age == 0 && hibernate_flushing == FALSE) {
+
+                       vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
+                       vm_swapper_entered_T2P++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
+                       vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
                }
-               break;
+               break;
 
-       case THROTTLE_LEVEL_COMPRESSOR_TIER0:
+       case VM_SWAPOUT_T0_PASSIVE:
+
+               if (SWAPPER_NEEDS_TO_RETHROTTLE()) {
+                       vm_swapper_throttle = THROTTLE_LEVEL_COMPRESSOR_TIER2;
+                       vm_swapper_entered_T2P++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_IO, vm_swapper_throttle);
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T2P;
+                       vm_swapout_state = VM_SWAPOUT_T2_PASSIVE;
 
-               if (COMPRESSOR_NEEDS_TO_SWAP() == 0) {
-                       swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER2;
-                       swapper_entered_T2++;
                        break;
                }
-               if (SWAPPER_NEEDS_TO_UNTHROTTLE() == 0) {
-                       swapper_throttle_new = THROTTLE_LEVEL_COMPRESSOR_TIER1;
-                       swapper_entered_T1++;
-                       break;
+               if (SWAPPER_NEEDS_TO_CATCHUP()) {
+                       vm_swapper_entered_T0++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_DISABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T0;
+                       vm_swapout_state = VM_SWAPOUT_T0;
+               }
+               break;
+
+       case VM_SWAPOUT_T0:
+
+               if (SWAPPER_HAS_CAUGHTUP()) {
+                       vm_swapper_entered_T0P++;
+
+                       proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
+                                                       TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+                       vm_swapout_limit = VM_SWAPOUT_LIMIT_T0P;
+                       vm_swapout_state = VM_SWAPOUT_T0_PASSIVE;
                }
                break;
        }
-done:
-       if (swapper_throttle != swapper_throttle_new) {
-               proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
-                                               TASK_POLICY_INTERNAL, TASK_POLICY_IO, swapper_throttle_new);
-               proc_set_thread_policy_with_tid(kernel_task, vm_swapout_thread_id,
-                                               TASK_POLICY_INTERNAL, TASK_POLICY_PASSIVE_IO, TASK_POLICY_ENABLE);
+}
+
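For readability, the state machine above boils down to a small table: each state pins the swapout thread to an I/O throttle tier, toggles passive I/O, and selects how many swapout I/Os may be in flight. A hedged summary using the constants defined earlier in this hunk; the table itself is illustrative and not part of the commit.

        /* Hedged illustration: swapout state -> in-flight limit (tier / passive noted). */
        static const struct { int state; int limit; } vm_swapout_tier_sketch[] = {
                { VM_SWAPOUT_T2_PASSIVE, VM_SWAPOUT_LIMIT_T2P },  /* TIER2, passive I/O */
                { VM_SWAPOUT_T1_PASSIVE, VM_SWAPOUT_LIMIT_T1P },  /* TIER1, passive I/O */
                { VM_SWAPOUT_T0_PASSIVE, VM_SWAPOUT_LIMIT_T0P },  /* TIER0, passive I/O */
                { VM_SWAPOUT_T0,         VM_SWAPOUT_LIMIT_T0  },  /* TIER0, non-passive */
        };
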
+int vm_swapout_found_empty = 0;
+
+struct swapout_io_completion vm_swapout_ctx[VM_SWAPOUT_LIMIT_MAX];
 
-               swapper_throttle = swapper_throttle_new;
+int vm_swapout_soc_busy = 0;
+int vm_swapout_soc_done = 0;
+
+
+static struct swapout_io_completion *
+vm_swapout_find_free_soc(void)
+{       int      i;
+
+        for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
+               if (vm_swapout_ctx[i].swp_io_busy == 0)
+                       return (&vm_swapout_ctx[i]);
        }
+       assert(vm_swapout_soc_busy == VM_SWAPOUT_LIMIT_MAX);
+
+       return NULL;
 }
 
+static struct swapout_io_completion *
+vm_swapout_find_done_soc(void)
+{       int      i;
+
+        if (vm_swapout_soc_done) {
+               for (i = 0; i < VM_SWAPOUT_LIMIT_MAX; i++) {
+                       if (vm_swapout_ctx[i].swp_io_done)
+                               return (&vm_swapout_ctx[i]);
+               }
+       }
+       return NULL;
+}
+
+static void
+vm_swapout_complete_soc(struct swapout_io_completion *soc)
+{
+        kern_return_t  kr;
+
+        if (soc->swp_io_error)
+               kr = KERN_FAILURE;
+       else
+               kr = KERN_SUCCESS;
+
+       lck_mtx_unlock_always(c_list_lock);
+
+       vm_swap_put_finish(soc->swp_swf, &soc->swp_f_offset, soc->swp_io_error);
+       vm_swapout_finish(soc->swp_c_seg, soc->swp_f_offset, soc->swp_c_size, kr);
+
+       lck_mtx_lock_spin_always(c_list_lock);
+
+       soc->swp_io_done = 0;
+       soc->swp_io_busy = 0;
+
+       vm_swapout_soc_busy--;
+       vm_swapout_soc_done--;
+}
 
-int vm_swapout_found_empty = 0;
 
 static void
 vm_swapout_thread(void)
 {
-       uint64_t        f_offset = 0;
        uint32_t        size = 0;
        c_segment_t     c_seg = NULL;
        kern_return_t   kr = KERN_SUCCESS;
-       vm_offset_t     addr = 0;
+       struct swapout_io_completion *soc;
 
        current_thread()->options |= TH_OPT_VMPRIV;
 
        vm_swapout_thread_awakened++;
 
        lck_mtx_lock_spin_always(c_list_lock);
-
-       while (!queue_empty(&c_swapout_list_head)) {
+again:
+       while (!queue_empty(&c_swapout_list_head) && vm_swapout_soc_busy < vm_swapout_limit) {
                
                c_seg = (c_segment_t)queue_first(&c_swapout_list_head);
 
@@ -934,14 +1059,13 @@ vm_swapout_thread(void)
                C_SEG_BUSY(c_seg);
                c_seg->c_busy_swapping = 1;
 
-               lck_mtx_unlock_always(c_list_lock);
-
-               addr = (vm_offset_t) c_seg->c_store.c_buffer;
+               c_seg_switch_state(c_seg, C_ON_SWAPIO_Q, FALSE);
 
+               lck_mtx_unlock_always(c_list_lock);
                lck_mtx_unlock_always(&c_seg->c_lock);
 
 #if CHECKSUM_THE_SWAP  
-               c_seg->cseg_hash = hash_string((char*)addr, (int)size);
+               c_seg->cseg_hash = hash_string((char *)c_seg->c_store.c_buffer, (int)size);
                c_seg->cseg_swap_size = size;
 #endif /* CHECKSUM_THE_SWAP */
 
@@ -949,80 +1073,133 @@ vm_swapout_thread(void)
                vm_swap_encrypt(c_seg);
 #endif /* ENCRYPTED_SWAP */
 
-               vm_swapout_thread_throttle_adjust();
+               soc = vm_swapout_find_free_soc();
+               assert(soc);
 
-               kr = vm_swap_put((vm_offset_t) addr, &f_offset, size, c_seg);
+               soc->swp_upl_ctx.io_context = (void *)soc;
+               soc->swp_upl_ctx.io_done = (void *)vm_swapout_iodone;
+               soc->swp_upl_ctx.io_error = 0;
 
-               PAGE_REPLACEMENT_DISALLOWED(TRUE);
+               kr = vm_swap_put((vm_offset_t)c_seg->c_store.c_buffer, &soc->swp_f_offset, size, c_seg, soc);
 
-               if (kr == KERN_SUCCESS) {
-                       kernel_memory_depopulate(compressor_map, (vm_offset_t) addr, size, KMA_COMPRESSOR);
-               }
-#if ENCRYPTED_SWAP
-               else {
-                       vm_swap_decrypt(c_seg);
+               if (kr != KERN_SUCCESS) {
+                       if (soc->swp_io_done) {
+                               lck_mtx_lock_spin_always(c_list_lock);
+
+                               soc->swp_io_done = 0;
+                               vm_swapout_soc_done--;
+
+                               lck_mtx_unlock_always(c_list_lock);
+                       }
+                       vm_swapout_finish(c_seg, soc->swp_f_offset, size, kr);
+               } else {
+                       soc->swp_io_busy = 1;
+                       vm_swapout_soc_busy++;
                }
-#endif /* ENCRYPTED_SWAP */
+               vm_swapout_thread_throttle_adjust();
+               vm_pageout_io_throttle();
+
+c_seg_is_empty:
+               if (c_swapout_count == 0)
+                       vm_swap_consider_defragmenting(VM_SWAP_FLAGS_NONE);
+
                lck_mtx_lock_spin_always(c_list_lock);
-               lck_mtx_lock_spin_always(&c_seg->c_lock);
 
-               if (kr == KERN_SUCCESS) {
-                       int             new_state = C_ON_SWAPPEDOUT_Q;
-                       boolean_t       insert_head = FALSE;
+               if ((soc = vm_swapout_find_done_soc()))
+                       vm_swapout_complete_soc(soc);
+       }
+       if ((soc = vm_swapout_find_done_soc())) {
+               vm_swapout_complete_soc(soc);
+               goto again;
+       }
+       assert_wait((event_t)&c_swapout_list_head, THREAD_UNINT);
 
-                       if (hibernate_flushing == TRUE) {
-                               if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id &&
-                                   c_seg->c_generation_id <= last_c_segment_to_warm_generation_id)
-                                       insert_head = TRUE;
-                       } else if (C_SEG_ONDISK_IS_SPARSE(c_seg))
-                               new_state = C_ON_SWAPPEDOUTSPARSE_Q;
+       lck_mtx_unlock_always(c_list_lock);
 
-                       c_seg_switch_state(c_seg, new_state, insert_head);
+       thread_block((thread_continue_t)vm_swapout_thread);
+       
+       /* NOTREACHED */
+}
 
-                       c_seg->c_store.c_swap_handle = f_offset;
 
-                       VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT);
-                       
-                       if (c_seg->c_bytes_used)
-                               OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used);
-               } else {
-                       if (c_seg->c_overage_swap == TRUE) {
-                               c_seg->c_overage_swap = FALSE;
-                               c_overage_swapped_count--;
-                       }
-                       c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
+void
+vm_swapout_iodone(void *io_context, int error)
+{
+        struct swapout_io_completion *soc;
 
-                       if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE)
-                               c_seg_need_delayed_compaction(c_seg, TRUE);
-               }
-               assert(c_seg->c_busy_swapping);
-               assert(c_seg->c_busy);
+       soc = (struct swapout_io_completion *)io_context;
 
-               c_seg->c_busy_swapping = 0;
-               lck_mtx_unlock_always(c_list_lock);
+       lck_mtx_lock_spin_always(c_list_lock);
 
-               C_SEG_WAKEUP_DONE(c_seg);
-               lck_mtx_unlock_always(&c_seg->c_lock);
+       soc->swp_io_done = 1;
+       soc->swp_io_error = error;
+       vm_swapout_soc_done++;
+       
+       thread_wakeup((event_t)&c_swapout_list_head);
+       
+       lck_mtx_unlock_always(c_list_lock);
+}
 
-               PAGE_REPLACEMENT_DISALLOWED(FALSE);
 
-               vm_pageout_io_throttle();
-c_seg_is_empty:
-               if (c_swapout_count == 0)
-                       vm_swap_consider_defragmenting();
+static void
+vm_swapout_finish(c_segment_t c_seg, uint64_t f_offset,  uint32_t size, kern_return_t kr)
+{
 
-               lck_mtx_lock_spin_always(c_list_lock);
+       PAGE_REPLACEMENT_DISALLOWED(TRUE);
+
+       if (kr == KERN_SUCCESS) {
+               kernel_memory_depopulate(compressor_map, (vm_offset_t)c_seg->c_store.c_buffer, size, KMA_COMPRESSOR);
+       }
+#if ENCRYPTED_SWAP
+       else {
+               vm_swap_decrypt(c_seg);
        }
+#endif /* ENCRYPTED_SWAP */
+       lck_mtx_lock_spin_always(c_list_lock);
+       lck_mtx_lock_spin_always(&c_seg->c_lock);
 
-       assert_wait((event_t)&c_swapout_list_head, THREAD_UNINT);
+       if (kr == KERN_SUCCESS) {
+               int             new_state = C_ON_SWAPPEDOUT_Q;
+               boolean_t       insert_head = FALSE;
+
+               if (hibernate_flushing == TRUE) {
+                       if (c_seg->c_generation_id >= first_c_segment_to_warm_generation_id &&
+                                 c_seg->c_generation_id <= last_c_segment_to_warm_generation_id)
+                               insert_head = TRUE;
+               } else if (C_SEG_ONDISK_IS_SPARSE(c_seg))
+                       new_state = C_ON_SWAPPEDOUTSPARSE_Q;
+
+               c_seg_switch_state(c_seg, new_state, insert_head);
+
+               c_seg->c_store.c_swap_handle = f_offset;
 
+               VM_STAT_INCR_BY(swapouts, size >> PAGE_SHIFT);
+                       
+               if (c_seg->c_bytes_used)
+                       OSAddAtomic64(-c_seg->c_bytes_used, &compressor_bytes_used);
+       } else {
+               if (c_seg->c_overage_swap == TRUE) {
+                       c_seg->c_overage_swap = FALSE;
+                       c_overage_swapped_count--;
+               }
+               c_seg_switch_state(c_seg, C_ON_AGE_Q, FALSE);
+
+               if (!c_seg->c_on_minorcompact_q && C_SEG_UNUSED_BYTES(c_seg) >= PAGE_SIZE)
+                       c_seg_need_delayed_compaction(c_seg, TRUE);
+       }
+       assert(c_seg->c_busy_swapping);
+       assert(c_seg->c_busy);
+
+       c_seg->c_busy_swapping = 0;
        lck_mtx_unlock_always(c_list_lock);
 
-       thread_block((thread_continue_t)vm_swapout_thread);
-       
-       /* NOTREACHED */
+       C_SEG_WAKEUP_DONE(c_seg);
+       lck_mtx_unlock_always(&c_seg->c_lock);
+
+       PAGE_REPLACEMENT_DISALLOWED(FALSE);
 }
 
+
 boolean_t
 vm_swap_create_file()
 {
@@ -1199,7 +1376,7 @@ vm_swap_get(c_segment_t c_seg, uint64_t f_offset, uint64_t size)
        C_SEG_MAKE_WRITEABLE(c_seg);
 #endif
        file_offset = (f_offset & SWAP_SLOT_MASK);
-       retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ);
+       retval = vm_swapfile_io(swf->swp_vp, file_offset, (uint64_t)c_seg->c_store.c_buffer, (int)(size / PAGE_SIZE_64), SWAP_READ, NULL);
 
 #if DEVELOPMENT || DEBUG
        C_SEG_WRITE_PROTECT(c_seg);
@@ -1232,7 +1409,7 @@ done:
 }
 
 kern_return_t
-vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_seg)
+vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint32_t size, c_segment_t c_seg, struct swapout_io_completion *soc)
 {
        unsigned int    segidx = 0;
        struct swapfile *swf = NULL;
@@ -1246,6 +1423,7 @@ vm_swap_put(vm_offset_t addr, uint64_t *f_offset, uint64_t size, c_segment_t c_s
        int             error = 0;
        clock_sec_t     sec;
        clock_nsec_t    nsec;
+       void            *upl_ctx = NULL;
 
        if (addr == 0 || f_offset == NULL) {
                return KERN_FAILURE;
@@ -1278,8 +1456,9 @@ retry:
                                file_offset = segidx * COMPRESSED_SWAP_CHUNK_SIZE;
                                swf->swp_nseginuse++;
                                swf->swp_io_count++;
-                               swapfile_index = swf->swp_index;
+                               swf->swp_csegs[segidx] = c_seg;
 
+                               swapfile_index = swf->swp_index;
                                vm_swapfile_total_segs_used++;
 
                                clock_get_system_nanotime(&sec, &nsec);
@@ -1289,7 +1468,7 @@ retry:
 
                                lck_mtx_unlock(&vm_swap_data_lock);
                
-                               goto done;
+                               goto issue_io;
                        }
                }
                swf = (struct swapfile*) queue_next(&swf->swp_queue);
@@ -1336,32 +1515,48 @@ retry:
 
        return KERN_FAILURE;
 
-done:  
+issue_io:      
        assert(c_seg->c_busy_swapping);
        assert(c_seg->c_busy);
        assert(!c_seg->c_on_minorcompact_q);
 
-       error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE);
+       *f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset;
+
+       if (soc) {
+               soc->swp_c_seg = c_seg;
+               soc->swp_c_size = size;
 
-       lck_mtx_lock(&vm_swap_data_lock);
+               soc->swp_swf = swf;
 
-       swf->swp_csegs[segidx] = c_seg;
+               soc->swp_io_error = 0;
+               soc->swp_io_done = 0;
 
-       swf->swp_io_count--;
+               upl_ctx = (void *)&soc->swp_upl_ctx;
+       }
+       error = vm_swapfile_io(swf->swp_vp, file_offset, addr, (int) (size / PAGE_SIZE_64), SWAP_WRITE, upl_ctx);
 
-       *f_offset = (swapfile_index << SWAP_DEVICE_SHIFT) | file_offset;
+       if (error || upl_ctx == NULL)
+               return (vm_swap_put_finish(swf, f_offset, error));
+
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+vm_swap_put_finish(struct swapfile *swf, uint64_t *f_offset, int error)
+{
+       lck_mtx_lock(&vm_swap_data_lock);
+
+       swf->swp_io_count--;
 
        if ((swf->swp_flags & SWAP_WANTED) && swf->swp_io_count == 0) {
        
                swf->swp_flags &= ~SWAP_WANTED;
                thread_wakeup((event_t) &swf->swp_flags);
        }
-
        lck_mtx_unlock(&vm_swap_data_lock);
 
        if (error) {
                vm_swap_free(*f_offset);
-
                vm_swap_put_failures++;
 
                return KERN_FAILURE;
@@ -1370,7 +1565,6 @@ done:
 }
 
 
-
 static void
 vm_swap_free_now(struct swapfile *swf, uint64_t f_offset)
 {
@@ -1737,7 +1931,7 @@ ReTry_for_cseg:
 
                lck_mtx_unlock_always(&c_seg->c_lock);
 
-               if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ)) {
+               if (vm_swapfile_io(swf->swp_vp, f_offset, addr, (int)(c_size / PAGE_SIZE_64), SWAP_READ, NULL)) {
 
                        /*
                         * reading the data back in failed, so convert c_seg
@@ -1753,7 +1947,7 @@ ReTry_for_cseg:
                }
                VM_STAT_INCR_BY(swapins, c_size >> PAGE_SHIFT);
 
-               if (vm_swap_put(addr, &f_offset, c_size, c_seg)) {
+               if (vm_swap_put(addr, &f_offset, c_size, c_seg, NULL)) {
                        vm_offset_t     c_buffer;
 
                        /*
@@ -1896,9 +2090,67 @@ vm_swap_files_pinned(void)
         boolean_t result;
 
        if (vm_swappin_enabled == FALSE)
-               return(TRUE);
+               return (TRUE);
 
         result = (vm_num_pinned_swap_files == vm_num_swap_files);
 
         return (result);
 }
+
+#if CONFIG_FREEZE
+boolean_t
+vm_swap_max_budget(uint64_t *freeze_daily_budget)
+{
+       boolean_t       use_device_value = FALSE;
+       struct swapfile *swf = NULL;
+
+       if (vm_num_swap_files) {
+               lck_mtx_lock(&vm_swap_data_lock);
+
+               swf = (struct swapfile*) queue_first(&swf_global_queue);
+
+               if (swf) {
+                       while(queue_end(&swf_global_queue, (queue_entry_t)swf) == FALSE) {
+
+                               if (swf->swp_flags == SWAP_READY) {
+
+                                       assert(swf->swp_vp);
+
+                                       if (vm_swap_vol_get_budget(swf->swp_vp, freeze_daily_budget) == 0) {
+                                               use_device_value = TRUE;
+                                       }
+                                       break;
+                               }
+                               swf = (struct swapfile*) queue_next(&swf->swp_queue);
+                       }
+               }
+
+               lck_mtx_unlock(&vm_swap_data_lock);
+
+       } else {
+
+               /*
+                * This block is used for the initial budget value before any swap files
+                * are created. We create a temp swap file to get the budget.
+                */
+
+               struct vnode *temp_vp = NULL;
+
+               vm_swapfile_open(swapfilename, &temp_vp);
+
+               if (temp_vp) {
+
+                       if (vm_swap_vol_get_budget(temp_vp, freeze_daily_budget) == 0) {
+                               use_device_value = TRUE;
+                       }
+
+                       vm_swapfile_close((uint64_t)&swapfilename, temp_vp);
+                       temp_vp = NULL;
+               } else {
+                       *freeze_daily_budget = 0;
+               }
+       }
+
+       return use_device_value;
+}
+#endif /* CONFIG_FREEZE */
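vm_swap_max_budget() returns TRUE only when the swap volume reports a budget through vm_swap_vol_get_budget(); a caller is expected to fall back to a policy default otherwise. A hedged usage sketch follows; the 1 GB/day fallback value is hypothetical and not taken from this commit.

        /* Hedged caller sketch: prefer the device-reported budget, else a default. */
        uint64_t freeze_budget = 0;

        if (vm_swap_max_budget(&freeze_budget) == FALSE)
                freeze_budget = 1024ULL * 1024 * 1024;  /* hypothetical fallback */
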
index 9dda1ab7541ec7088ba4786e2a225b042768e7a1..2bad2d6f1ae1a9361cdd9f449aedeb5817ba04e2 100644 (file)
@@ -78,7 +78,29 @@ lck_mtx_t    vm_swap_data_lock;
 
 void vm_swap_init(void);
 boolean_t vm_swap_create_file(void);
-kern_return_t vm_swap_put(vm_offset_t, uint64_t*, uint64_t, c_segment_t);
+
+
+struct swapout_io_completion {
+  
+        int          swp_io_busy;
+        int          swp_io_done;
+        int          swp_io_error;
+
+        uint32_t     swp_c_size;
+        c_segment_t  swp_c_seg;
+
+        struct swapfile *swp_swf;
+        uint64_t        swp_f_offset;
+
+        struct upl_io_completion swp_upl_ctx;
+};
+void vm_swapout_iodone(void *, int);
+
+
+static void vm_swapout_finish(c_segment_t, uint64_t, uint32_t, kern_return_t);
+kern_return_t vm_swap_put_finish(struct swapfile *, uint64_t *, int);
+kern_return_t vm_swap_put(vm_offset_t, uint64_t*, uint32_t, c_segment_t, struct swapout_io_completion *);
+
 void vm_swap_flush(void);
 void vm_swap_reclaim(void);
 void vm_swap_encrypt(c_segment_t);
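The swapout_io_completion structure declared above is effectively one slot in a fixed pool (VM_SWAPOUT_LIMIT_MAX entries) tracking a single in-flight asynchronous swapout. A hedged sketch of the intended slot life cycle, with locking and the actual I/O elided; the helper name is hypothetical.

        /* Hedged sketch only: one slot's life cycle across the swapout path. */
        static void
        soc_lifecycle_sketch(struct swapout_io_completion *soc)
        {
                soc->swp_io_busy = 1;           /* claimed in vm_swapout_thread() */
                /* vm_swap_put() issues the async write via swp_upl_ctx ... */
                soc->swp_io_error = 0;
                soc->swp_io_done = 1;           /* set from vm_swapout_iodone() */
                /* vm_swapout_complete_soc() consumes the result ... */
                soc->swp_io_done = 0;
                soc->swp_io_busy = 0;           /* slot free for reuse */
        }
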
@@ -92,7 +114,12 @@ extern void vm_swapfile_close(uint64_t path, struct vnode *vp);
 extern int vm_swapfile_preallocate(struct vnode *vp, uint64_t *size, boolean_t *pin);
 extern uint64_t vm_swapfile_get_blksize(struct vnode *vp);
 extern uint64_t vm_swapfile_get_transfer_size(struct vnode *vp);
-extern int vm_swapfile_io(struct vnode *vp, uint64_t offset, uint64_t start, int npages, int flags);
+extern int vm_swapfile_io(struct vnode *vp, uint64_t offset, uint64_t start, int npages, int flags, void *upl_ctx);
+
+#if CONFIG_FREEZE
+boolean_t vm_swap_max_budget(uint64_t *);
+int vm_swap_vol_get_budget(struct vnode* vp, uint64_t *freeze_daily_budget);
+#endif /* CONFIG_FREEZE */
 
 #if RECORD_THE_COMPRESSED_DATA
 extern int vm_record_file_write(struct vnode *vp, uint64_t offset, char *buf, int size);
index ae0195c861f7e4fc9256f5eb13c942be921d832e..c8ec4fed006213c26c63c8faf0fbb908ceb889ab 100644 (file)
@@ -707,6 +707,8 @@ vm_compressor_pager_put(
 {
        compressor_pager_t      pager;
        compressor_slot_t       *slot_p;
+       unsigned int            prev_wimg = VM_WIMG_DEFAULT;
+       boolean_t               set_cache_attr = FALSE;
 
        compressor_pager_stats.put++;
 
@@ -746,11 +748,33 @@ vm_compressor_pager_put(
                vm_compressor_free(slot_p, 0);
                *compressed_count_delta_p -= 1;
        }
-       if (vm_compressor_put(ppnum, slot_p, current_chead, scratch_buf))
-               return (KERN_RESOURCE_SHORTAGE);
+
+       /*
+        * cacheability should be set to the system default (usually writeback)
+        * during compressor operations, both for performance and correctness,
+        * e.g. to avoid compressor codec faults generated by an unexpected
+        * memory type.
+        */
+       prev_wimg = pmap_cache_attributes(ppnum) & VM_WIMG_MASK;
+
+       if ((prev_wimg != VM_WIMG_DEFAULT) && (prev_wimg != VM_WIMG_USE_DEFAULT)) {
+               set_cache_attr = TRUE;
+               pmap_set_cache_attributes(ppnum, VM_WIMG_DEFAULT);
+       }
+       /*
+        * If the compressor operation succeeds, we presumably don't need to
+        * undo any previous WIMG update, as all live mappings should be
+        * disconnected.
+        */
+
+       if (vm_compressor_put(ppnum, slot_p, current_chead, scratch_buf)) {
+               if (set_cache_attr)
+                       pmap_set_cache_attributes(ppnum, prev_wimg);
+               return KERN_RESOURCE_SHORTAGE;
+       }
        *compressed_count_delta_p += 1;
 
-       return (KERN_SUCCESS);
+       return KERN_SUCCESS;
 }
 
 
@@ -796,6 +820,21 @@ vm_compressor_pager_get(
                
        if (kr == KERN_SUCCESS) {
                int     retval;
+               unsigned int prev_wimg = VM_WIMG_DEFAULT;
+               boolean_t set_cache_attr = FALSE;
+
+               /*
+                * cacheability should be set to the system default (usually writeback)
+                * during compressor operations, both for performance and correctness,
+                * e.g. to avoid compressor codec faults generated by an unexpected
+                * memory type.
+                */
+               prev_wimg = pmap_cache_attributes(ppnum) & VM_WIMG_MASK;
+
+               if ((prev_wimg != VM_WIMG_DEFAULT) && (prev_wimg != VM_WIMG_USE_DEFAULT)) {
+                       set_cache_attr = TRUE;
+                       pmap_set_cache_attributes(ppnum, VM_WIMG_DEFAULT);
+               }
 
                /* get the page from the compressor */
                retval = vm_compressor_get(ppnum, slot_p, flags);
@@ -807,6 +846,8 @@ vm_compressor_pager_get(
                        assert((flags & C_DONT_BLOCK));
                        kr = KERN_FAILURE;
                }
+               if (set_cache_attr)
+                       pmap_set_cache_attributes(ppnum, prev_wimg);
        }
 
        if (kr == KERN_SUCCESS) {
index e723c90129780d4a10778257dd7096004a3cb26f..a42d1b9ee1bc52037189540b52702d0a64d458af 100644 (file)
@@ -88,11 +88,11 @@ extern vm_external_state_t vm_compressor_pager_state_get(
                                                  (object));            \
                }                                                       \
                if (_num_pages_cleared &&                               \
-                   (object)->purgable != VM_PURGABLE_DENY &&           \
-                   (object)->vo_purgeable_owner != NULL) {             \
-                       /* less compressed purgeable pages */           \
+                   ((object)->purgable != VM_PURGABLE_DENY ||          \
+                    (object)->vo_ledger_tag)) {                        \
+                       /* less compressed purgeable/tagged pages */    \
                        assert(_num_pages_cleared == 1);                \
-                       vm_purgeable_compressed_update(                 \
+                       vm_object_owner_compressed_update(              \
                                (object),                               \
                                -_num_pages_cleared);                   \
                }                                                       \
index 256c70dfe27d158f351fa7e830850b82c816c72e..abbe202ff351edadda29587a00d4e9865c33b311 100644 (file)
@@ -139,11 +139,12 @@ uint64_t vm_hard_throttle_threshold;
 
 
 #define NEED_TO_HARD_THROTTLE_THIS_TASK()      (vm_wants_task_throttled(current_task()) ||     \
-                                                (vm_page_free_count < vm_page_throttle_limit && \
-                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) > THROTTLE_LEVEL_THROTTLED))
+                                                ((vm_page_free_count < vm_page_throttle_limit || \
+                                                  HARD_THROTTLE_LIMIT_REACHED()) && \
+                                                 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
 
 
-#define HARD_THROTTLE_DELAY    5000    /* 5000 us == 5 ms */
+#define HARD_THROTTLE_DELAY    10000   /* 10000 us == 10 ms */
 #define SOFT_THROTTLE_DELAY    200     /* 200 us == .2 ms */
 
 #define        VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
@@ -197,6 +198,10 @@ unsigned long vm_cs_revalidates = 0;
 unsigned long vm_cs_query_modified = 0;
 unsigned long vm_cs_validated_dirtied = 0;
 unsigned long vm_cs_bitmap_validated = 0;
+#if PMAP_CS
+uint64_t vm_cs_defer_to_pmap_cs = 0;
+uint64_t vm_cs_defer_to_pmap_cs_not = 0;
+#endif /* PMAP_CS */
 
 void vm_pre_fault(vm_map_offset_t);
 
@@ -204,6 +209,24 @@ extern char *kdp_compressor_decompressed_page;
 extern addr64_t        kdp_compressor_decompressed_page_paddr;
 extern ppnum_t kdp_compressor_decompressed_page_ppnum;
 
+struct vmrtfr {
+       int vmrtfr_maxi;
+       int vmrtfr_curi;
+       int64_t vmrtf_total;
+       vm_rtfault_record_t *vm_rtf_records;
+} vmrtfrs;
+#define VMRTF_DEFAULT_BUFSIZE (4096)
+#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
+int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
+
+static void vm_rtfrecord_lock(void);
+static void vm_rtfrecord_unlock(void);
+static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
+
+lck_spin_t vm_rtfr_slock;
+extern lck_grp_t vm_page_lck_grp_bucket;
+extern lck_attr_t vm_page_lck_attr;
+
 /*
  *     Routine:        vm_fault_init
  *     Purpose:
@@ -245,11 +268,20 @@ vm_fault_init(void)
                /* If no boot arg or incorrect boot arg, try device tree. */
                PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
        }
-       PE_parse_boot_argn("vm_compressor_threads", &vm_compressor_thread_count, sizeof (vm_compressor_thread_count));
-
        printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
 }
 
+void vm_rtfault_record_init(void) {
+       PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
+
+       assert(vmrtf_num_records >= 1);
+       vmrtf_num_records = MAX(vmrtf_num_records, 1);
+       size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
+       vmrtfrs.vm_rtf_records = kalloc(kallocsz);
+       bzero(vmrtfrs.vm_rtf_records, kallocsz);
+       vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
+       lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
+}
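The vmrtfrs bookkeeping above describes a fixed-size ring of vm_rtfault_record_t entries, indexed by vmrtfr_curi and bounded by vmrtfr_maxi. A hedged sketch of the wrap-around index advance such a ring implies; vm_record_rtfault() itself is not shown in this hunk, and the helper below is hypothetical.

        /* Hedged sketch: advance a ring index, wrapping after vmrtfr_maxi. */
        static int
        rtf_next_index_sketch(int curi, int maxi)
        {
                return (curi >= maxi) ? 0 : curi + 1;
        }
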
 /*
  *     Routine:        vm_fault_cleanup
  *     Purpose:
@@ -282,24 +314,6 @@ vm_fault_cleanup(
        }
 }
 
-#if    MACH_CLUSTER_STATS
-#define MAXCLUSTERPAGES 16
-struct {
-       unsigned long pages_in_cluster;
-       unsigned long pages_at_higher_offsets;
-       unsigned long pages_at_lower_offsets;
-} cluster_stats_in[MAXCLUSTERPAGES];
-#define CLUSTER_STAT(clause)   clause
-#define CLUSTER_STAT_HIGHER(x) \
-       ((cluster_stats_in[(x)].pages_at_higher_offsets)++)
-#define CLUSTER_STAT_LOWER(x)  \
-        ((cluster_stats_in[(x)].pages_at_lower_offsets)++)
-#define CLUSTER_STAT_CLUSTER(x)        \
-       ((cluster_stats_in[(x)].pages_in_cluster)++)
-#else  /* MACH_CLUSTER_STATS */
-#define CLUSTER_STAT(clause)
-#endif /* MACH_CLUSTER_STATS */
-
 #define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
 
 
@@ -530,7 +544,7 @@ vm_fault_deactivate_behind(
         for (n = 0; n < max_pages_in_run; n++) {
                m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
 
-               if (m && !m->laundry && !m->busy && !m->no_cache && (m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->fictitious && !m->absent) {
+               if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
                        page_run[pages_in_run++] = m;
 
                        /*
@@ -630,7 +644,7 @@ vm_page_throttled(boolean_t page_kept)
                                thread->t_page_creation_time = tv_sec;
                                thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
                        }
-                       ++vm_page_throttle_count;
+                       VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
 
                        thread->t_page_creation_throttled = 1;
 
@@ -664,10 +678,10 @@ no_throttle:
  * cleanup is based on being called from vm_fault_page
  *
  * object must be locked
- * object == m->object
+ * object == m->vmp_object
  */
 static vm_fault_return_t
-vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t interruptible_state, boolean_t page_throttle)
+vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
 {
        int throttle_delay;
 
@@ -688,26 +702,6 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int
 
                return (VM_FAULT_MEMORY_ERROR);
        }
-       if (vm_backing_store_low) {
-               /*
-                * are we protecting the system from
-                * backing store exhaustion.  If so
-                * sleep unless we are privileged.
-                */
-               if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
-
-                       if (m != VM_PAGE_NULL)
-                               VM_PAGE_FREE(m);
-                       vm_fault_cleanup(object, first_m);
-
-                       assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
-
-                       thread_block(THREAD_CONTINUE_NULL);
-                       thread_interrupt_level(interruptible_state);
-
-                       return (VM_FAULT_RETRY);
-               }
-       }
        if (page_throttle == TRUE) {
                if ((throttle_delay = vm_page_throttled(FALSE))) {
                        /*
@@ -739,7 +733,7 @@ vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, boolean_t int
  * do the work to zero fill a page and
  * inject it into the correct paging queue
  *
- * m->object must be locked
+ * m->vmp_object must be locked
  * page queue lock must NOT be held
  */
 static int
@@ -765,16 +759,16 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
         * sending a program into this area.  We
         * choose this approach for performance
         */
-       m->pmapped = TRUE;
+       m->vmp_pmapped = TRUE;
 
-       m->cs_validated = FALSE;
-       m->cs_tainted = FALSE;
-       m->cs_nx = FALSE;
+       m->vmp_cs_validated = FALSE;
+       m->vmp_cs_tainted = FALSE;
+       m->vmp_cs_nx = FALSE;
 
        if (no_zero_fill == TRUE) {
                my_fault = DBG_NZF_PAGE_FAULT;
 
-               if (m->absent && m->busy)
+               if (m->vmp_absent && m->vmp_busy)
                        return (my_fault);
        } else {
                vm_page_zero_fill(m);
@@ -782,9 +776,9 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
                VM_STAT_INCR(zero_fill_count);
                DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
        }
-       assert(!m->laundry);
+       assert(!m->vmp_laundry);
        assert(object != kernel_object);
-       //assert(m->pageq.next == 0 && m->pageq.prev == 0);
+       //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 
        if (!VM_DYNAMIC_PAGING_ENABLED() &&
                (object->purgable == VM_PURGABLE_DENY ||
@@ -802,8 +796,8 @@ vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
                         */
                        vm_page_queues_remove(m, TRUE);
                        vm_page_check_pageable_safe(m);
-                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
-                       m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
+                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq);
+                       m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
                        vm_page_throttled_count++;
                }
                vm_page_unlock_queues();
@@ -897,10 +891,8 @@ vm_fault_page(
        boolean_t               force_fault_retry = FALSE;
        vm_prot_t               access_required = fault_type;
        vm_prot_t               wants_copy_flag;
-       CLUSTER_STAT(int pages_at_higher_offsets;)
-       CLUSTER_STAT(int pages_at_lower_offsets;)
        kern_return_t           wait_result;
-       boolean_t               interruptible_state;
+       wait_interrupt_t        interruptible_state;
        boolean_t               data_already_requested = FALSE;
        vm_behavior_t           orig_behavior;
        vm_size_t               orig_cluster_size;
@@ -1072,7 +1064,7 @@ vm_fault_page(
 #endif
                if (m != VM_PAGE_NULL) {
 
-                       if (m->busy) {
+                       if (m->vmp_busy) {
                                /*
                                 * The page is being brought in,
                                 * wait for it and then retry.
@@ -1099,10 +1091,10 @@ vm_fault_page(
                                }
                                continue;
                        }
-                       if (m->laundry) {
-                               m->free_when_done = FALSE;
+                       if (m->vmp_laundry) {
+                               m->vmp_free_when_done = FALSE;
 
-                               if (!m->cleaning)
+                               if (!m->vmp_cleaning)
                                        vm_pageout_steal_laundry(m, FALSE);
                        }
                        if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
@@ -1116,7 +1108,7 @@ vm_fault_page(
                                         * be just to wire or unwire it.
                                         * Let's pretend it succeeded...
                                         */
-                                       m->busy = TRUE;
+                                       m->vmp_busy = TRUE;
                                        *result_page = m;
                                        assert(first_m == VM_PAGE_NULL);
                                        *top_page = first_m;
@@ -1135,7 +1127,7 @@ vm_fault_page(
                                }
                        }
 
-                       if (m->error) {
+                       if (m->vmp_error) {
                                /*
                                 * The page is in error, give up now.
                                 */
@@ -1151,7 +1143,7 @@ vm_fault_page(
 
                                return (VM_FAULT_MEMORY_ERROR);
                        }
-                       if (m->restart) {
+                       if (m->vmp_restart) {
                                /*
                                 * The pager wants us to restart
                                 * at the top of the chain,
@@ -1168,7 +1160,7 @@ vm_fault_page(
 
                                return (VM_FAULT_RETRY);
                        }
-                       if (m->absent) {
+                       if (m->vmp_absent) {
                                /*
                                 * The page isn't busy, but is absent,
                                 * therefore it's deemed "unavailable".
@@ -1238,11 +1230,11 @@ vm_fault_page(
                                                 * we're going to use the absent page we just found
                                                 * so convert it to a 'busy' page
                                                 */
-                                               m->absent = FALSE;
-                                               m->busy = TRUE;
+                                               m->vmp_absent = FALSE;
+                                               m->vmp_busy = TRUE;
                                        }
                                        if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
-                                               m->absent = TRUE;
+                                               m->vmp_absent = TRUE;
                                        /*
                                         * zero-fill the page and put it on
                                         * the correct paging queue
@@ -1258,8 +1250,8 @@ vm_fault_page(
                                                VM_PAGE_FREE(m);
                                        } else {
                                                first_m = m;
-                                               m->absent = FALSE;
-                                               m->busy = TRUE;
+                                               m->vmp_absent = FALSE;
+                                               m->vmp_busy = TRUE;
 
                                                vm_page_lockspin_queues();
                                                vm_page_queues_remove(m, FALSE);
@@ -1289,7 +1281,7 @@ vm_fault_page(
                                        continue;
                                }
                        }
-                       if ((m->cleaning)
+                       if ((m->vmp_cleaning)
                            && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
                            && (fault_type & VM_PROT_WRITE)) {
                                /*
@@ -1322,7 +1314,7 @@ vm_fault_page(
 
                                m = vm_page_lookup(object, offset);
 
-                               if (m != VM_PAGE_NULL && m->cleaning) {
+                               if (m != VM_PAGE_NULL && m->vmp_cleaning) {
                                        PAGE_ASSERT_WAIT(m, interruptible);
 
                                        vm_object_unlock(object);
@@ -1339,14 +1331,14 @@ vm_fault_page(
                                        return (VM_FAULT_RETRY);
                                }
                        }
-                       if (type_of_fault == NULL && (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
+                       if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
                            !(fault_info != NULL && fault_info->stealth)) {
                                /*
                                 * If we were passed a non-NULL pointer for
                                 * "type_of_fault", than we came from
                                 * vm_fault... we'll let it deal with
                                 * this condition, since it
-                                * needs to see m->speculative to correctly
+                                * needs to see m->vmp_speculative to correctly
                                 * account the pageins, otherwise...
                                 * take it off the speculative queue, we'll
                                 * let the caller of vm_fault_page deal
@@ -1357,7 +1349,7 @@ vm_fault_page(
                                 * the page in the speculative queue.
                                 */
                                vm_page_lockspin_queues();
-                               if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q)
+                               if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q)
                                        vm_page_queues_remove(m, FALSE);
                                vm_page_unlock_queues();
                        }
@@ -1388,10 +1380,10 @@ vm_fault_page(
                        XPR(XPR_VM_FAULT,
                            "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
                                object, offset, m, 0, 0);
-                       assert(!m->busy);
-                       assert(!m->absent);
+                       assert(!m->vmp_busy);
+                       assert(!m->vmp_absent);
 
-                       m->busy = TRUE;
+                       m->vmp_busy = TRUE;
                        break;
                }
 
@@ -1561,16 +1553,16 @@ vm_fault_page(
                                                return (VM_FAULT_MEMORY_SHORTAGE);
                                        }
 
-                                       m->absent = TRUE;
+                                       m->vmp_absent = TRUE;
                                        if (fault_info && fault_info->batch_pmap_op == TRUE) {
                                                vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
                                        } else {
                                                vm_page_insert(m, object, offset);
                                        }
                                }
-                               assert(m->busy);
+                               assert(m->vmp_busy);
 
-                               m->absent = TRUE;
+                               m->vmp_absent = TRUE;
                                pager = object->pager;
 
                                assert(object->paging_in_progress > 0);
@@ -1610,8 +1602,8 @@ vm_fault_page(
 
                                switch (rc) {
                                case KERN_SUCCESS:
-                                       m->absent = FALSE;
-                                       m->dirty = TRUE;
+                                       m->vmp_absent = FALSE;
+                                       m->vmp_dirty = TRUE;
                                        if ((object->wimg_bits &
                                             VM_WIMG_MASK) !=
                                            VM_WIMG_USE_DEFAULT) {
@@ -1624,7 +1616,7 @@ vm_fault_page(
                                                pmap_sync_page_attributes_phys(
                                                        VM_PAGE_GET_PHYS_PAGE(m));
                                        } else {
-                                               m->written_by_kernel = TRUE;
+                                               m->vmp_written_by_kernel = TRUE;
                                        }
 
                                        /*
@@ -1635,27 +1627,28 @@ vm_fault_page(
                                         * "compressed purgeable" ledger, so
                                         * update that now.
                                         */
-                                       if ((object->purgable !=
-                                            VM_PURGABLE_DENY) &&
-                                           (object->vo_purgeable_owner !=
+                                       if (((object->purgable !=
+                                             VM_PURGABLE_DENY) ||
+                                            object->vo_ledger_tag) &&
+                                           (object->vo_owner !=
                                             NULL)) {
                                                /*
                                                 * One less compressed
-                                                * purgeable page.
+                                                * purgeable/tagged page.
                                                 */
-                                               vm_purgeable_compressed_update(
+                                               vm_object_owner_compressed_update(
                                                        object,
                                                        -1);
                                        }
 
                                        break;
                                case KERN_MEMORY_FAILURE:
-                                       m->unusual = TRUE;
-                                       m->error = TRUE;
-                                       m->absent = FALSE;
+                                       m->vmp_unusual = TRUE;
+                                       m->vmp_error = TRUE;
+                                       m->vmp_absent = FALSE;
                                        break;
                                case KERN_MEMORY_ERROR:
-                                       assert(m->absent);
+                                       assert(m->vmp_absent);
                                        break;
                                default:
                                        panic("vm_fault_page(): unexpected "
@@ -1698,7 +1691,7 @@ vm_fault_page(
                         * so we can release the object lock.
                         */
 
-                       if (object->object_slid == TRUE) {
+                       if (object->object_is_shared_cache) {
                                set_thread_rwlock_boost();
                        }
 
@@ -1786,7 +1779,7 @@ vm_fault_page(
 #endif
                        vm_object_lock(object);
 
-                       if (object->object_slid == TRUE) {
+                       if (object->object_is_shared_cache) {
                                clear_thread_rwlock_boost();
                        }
 
@@ -1859,7 +1852,7 @@ dont_look_for_page:
                 * We get here if the object has no pager, or an existence map
                 * exists and indicates the page isn't present on the pager
                 * or we're unwiring a page.  If a pager exists, but there
-                * is no existence map, then the m->absent case above handles
+                * is no existence map, then the m->vmp_absent case above handles
                 * the ZF case when the pager can't provide the page
                 */
 #if TRACEFAULTPAGE
@@ -1920,7 +1913,7 @@ dont_look_for_page:
                                vm_page_insert(m, object, offset);
                        }
                        if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
-                               m->absent = TRUE;
+                               m->vmp_absent = TRUE;
 
                        my_fault = vm_fault_zero_page(m, no_zero_fill);
 
@@ -1969,10 +1962,10 @@ dont_look_for_page:
        dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m);  /* (TEST/DEBUG) */
 #endif
 #if    EXTRA_ASSERTIONS
-       assert(m->busy && !m->absent);
+       assert(m->vmp_busy && !m->vmp_absent);
        assert((first_m == VM_PAGE_NULL) ||
-              (first_m->busy && !first_m->absent &&
-               !first_m->active && !first_m->inactive && !first_m->secluded));
+              (first_m->vmp_busy && !first_m->vmp_absent &&
+               !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
 #endif /* EXTRA_ASSERTIONS */
 
        XPR(XPR_VM_FAULT,
@@ -2000,25 +1993,6 @@ dont_look_for_page:
                         */
                        assert(!must_be_resident);
 
-                       /*
-                        * are we protecting the system from
-                        * backing store exhaustion.  If so
-                        * sleep unless we are privileged.
-                        */
-                       if (vm_backing_store_low) {
-                               if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
-
-                                       RELEASE_PAGE(m);
-                                       vm_fault_cleanup(object, first_m);
-
-                                       assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
-
-                                       thread_block(THREAD_CONTINUE_NULL);
-                                       thread_interrupt_level(interruptible_state);
-
-                                       return (VM_FAULT_RETRY);
-                               }
-                       }
                        /*
                         * If we try to collapse first_object at this
                         * point, we may deadlock when we try to get
@@ -2067,14 +2041,14 @@ dont_look_for_page:
                         * access to this page, then we could
                         * avoid the pmap_disconnect() call.
                         */
-                       if (m->pmapped)
+                       if (m->vmp_pmapped)
                                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
 
-                       if (m->clustered) {
+                       if (m->vmp_clustered) {
                                VM_PAGE_COUNT_AS_PAGEIN(m);
                                VM_PAGE_CONSUME_CLUSTERED(m);
                        }
-                       assert(!m->cleaning);
+                       assert(!m->vmp_cleaning);
 
                        /*
                         * We no longer need the old page or object.
@@ -2114,7 +2088,7 @@ dont_look_for_page:
                         * and replace it with the
                         * page we just copied into
                         */
-                       assert(copy_m->busy);
+                       assert(copy_m->vmp_busy);
                        vm_page_insert(copy_m, object, offset);
                        SET_PAGE_DIRTY(copy_m, TRUE);
 
@@ -2200,7 +2174,7 @@ dont_look_for_page:
                        /*
                         * Page currently exists in the copy object
                         */
-                       if (copy_m->busy) {
+                       if (copy_m->vmp_busy) {
                                /*
                                 * If the page is being brought
                                 * in, wait for it and then retry.
@@ -2223,7 +2197,7 @@ dont_look_for_page:
                                assert(copy_object->ref_count > 0);
                                copy_m = vm_page_lookup(copy_object, copy_offset);
 
-                               if (copy_m != VM_PAGE_NULL && copy_m->busy) {
+                               if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
                                        PAGE_ASSERT_WAIT(copy_m, interruptible);
 
                                        vm_object_unlock(copy_object);
@@ -2249,32 +2223,7 @@ dont_look_for_page:
                         * for example) or it hasn't been paged out.
                         * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
                         * We must copy the page to the copy object.
-                        */
-
-                       if (vm_backing_store_low) {
-                               /*
-                                * we are protecting the system from
-                                * backing store exhaustion.  If so
-                                * sleep unless we are privileged.
-                                */
-                               if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV)) {
-                                       assert_wait((event_t)&vm_backing_store_low, THREAD_UNINT);
-
-                                       RELEASE_PAGE(m);
-                                       VM_OBJ_RES_DECR(copy_object);
-                                       vm_object_lock_assert_exclusive(copy_object);
-                                       copy_object->ref_count--;
-                                       assert(copy_object->ref_count > 0);
-
-                                       vm_object_unlock(copy_object);
-                                       vm_fault_cleanup(object, first_m);
-                                       thread_block(THREAD_CONTINUE_NULL);
-                                       thread_interrupt_level(interruptible_state);
-
-                                       return (VM_FAULT_RETRY);
-                               }
-                       }
-                       /*
+                        *
                         * Allocate a page for the copy
                         */
                        copy_m = vm_page_alloc(copy_object, copy_offset);
@@ -2304,10 +2253,10 @@ dont_look_for_page:
                         * from all pmaps.  (We can't know which
                         * pmaps use it.)
                         */
-                       if (m->pmapped)
+                       if (m->vmp_pmapped)
                                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
 
-                       if (m->clustered) {
+                       if (m->vmp_clustered) {
                                VM_PAGE_COUNT_AS_PAGEIN(m);
                                VM_PAGE_CONSUME_CLUSTERED(m);
                        }
@@ -2321,7 +2270,7 @@ dont_look_for_page:
                           ) {
 
                                vm_page_lockspin_queues();
-                               assert(!m->cleaning);
+                               assert(!m->vmp_cleaning);
                                vm_page_activate(copy_m);
                                vm_page_unlock_queues();
 
@@ -2330,8 +2279,8 @@ dont_look_for_page:
 
                        } else {
 
-                               assert(copy_m->busy == TRUE);
-                               assert(!m->cleaning);
+                               assert(copy_m->vmp_busy == TRUE);
+                               assert(!m->vmp_cleaning);
 
                                /*
                                 * dirty is protected by the object lock
@@ -2384,8 +2333,8 @@ dont_look_for_page:
                         * wait result].  Can't turn off the page's
                         * busy bit because we're not done with it.
                         */
-                       if (m->wanted) {
-                               m->wanted = FALSE;
+                       if (m->vmp_wanted) {
+                               m->vmp_wanted = FALSE;
                                thread_wakeup_with_result((event_t) m, THREAD_RESTART);
                        }
                }
@@ -2434,8 +2383,20 @@ done:
                         * state being up to date
                         */
                        vm_fault_is_sequential(object, offset, fault_info->behavior);
+                       vm_fault_deactivate_behind(object, offset, fault_info->behavior);
 
+               } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
+                       /*
+                        * we weren't called from vm_fault, so handle the
+                        * accounting here for hits in the cache
+                        */
+                       if (m->vmp_clustered) {
+                               VM_PAGE_COUNT_AS_PAGEIN(m);
+                               VM_PAGE_CONSUME_CLUSTERED(m);
+                       }
+                       vm_fault_is_sequential(object, offset, fault_info->behavior);
                        vm_fault_deactivate_behind(object, offset, fault_info->behavior);
+
                } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
 
                        VM_STAT_INCR(decompressions);
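The new else-if branch above covers callers that reach vm_fault_page() without going through vm_fault(): on a cache hit it still charges one pagein against a clustered page and updates the sequential-access state. A simplified sketch of that accounting, with illustrative names in place of VM_PAGE_COUNT_AS_PAGEIN()/VM_PAGE_CONSUME_CLUSTERED():

#include <stdbool.h>

struct page { bool clustered; };     /* stand-in for vm_page_t */
static unsigned long pageins;        /* stand-in for the pagein counter */

static void account_cache_hit(struct page *m)
{
        if (m->clustered) {
                pageins++;            /* VM_PAGE_COUNT_AS_PAGEIN() analogue   */
                m->clustered = false; /* VM_PAGE_CONSUME_CLUSTERED() analogue */
        }
}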
@@ -2477,16 +2438,16 @@ backoff:
  */
 #define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj)              \
        ((pmap) != kernel_pmap /*1*/ &&                                 \
-        !(page)->cs_tainted /*2*/ &&                                   \
+        !(page)->vmp_cs_tainted /*2*/ &&                                       \
         (page_obj)->code_signed /*3*/ &&                                       \
-        (!(page)->cs_validated || (page)->wpmapped /*4*/))
+        (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/))
 
 
 /*
  * page queue lock must NOT be held
- * m->object must be locked
+ * m->vmp_object must be locked
  *
- * NOTE: m->object could be locked "shared" only if we are called
+ * NOTE: m->vmp_object could be locked "shared" only if we are called
  * from vm_fault() as part of a soft fault.  If so, we must be
  * careful not to modify the VM object in any way that is not
  * legal under a shared lock...
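VM_FAULT_NEED_CS_VALIDATION() gates code-sign validation on four conditions: the mapping is not in the kernel pmap, the page is not already tainted, the backing object is code-signed, and the page is either unvalidated or has been writably mapped at some point. The same test spelled out as a plain function; the types here are illustrative, and is_kernel_pmap stands in for (pmap == kernel_pmap).

#include <stdbool.h>

struct page   { bool cs_tainted, cs_validated, wpmapped; };
struct object { bool code_signed; };

static bool need_cs_validation(bool is_kernel_pmap,
                               const struct page *p,
                               const struct object *obj)
{
        return !is_kernel_pmap &&                   /* 1: user mapping        */
               !p->cs_tainted &&                    /* 2: not already tainted */
               obj->code_signed &&                  /* 3: signed object       */
               (!p->cs_validated || p->wpmapped);   /* 4: unvalidated, or was */
                                                    /*    mapped writable     */
}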
@@ -2505,20 +2466,21 @@ vm_fault_enter(vm_page_t m,
               boolean_t wired,
               boolean_t change_wiring,
               vm_tag_t  wire_tag,
-              boolean_t no_cache,
-              boolean_t cs_bypass,
-              __unused int      user_tag,
-              int       pmap_options,
+              vm_object_fault_info_t fault_info,
               boolean_t *need_retry,
               int *type_of_fault)
 {
        kern_return_t   kr, pe_result;
-       boolean_t       previously_pmapped = m->pmapped;
+       boolean_t       previously_pmapped = m->vmp_pmapped;
        boolean_t       must_disconnect = 0;
        boolean_t       map_is_switched, map_is_switch_protected;
+       boolean_t       cs_violation;
        int             cs_enforcement_enabled;
        vm_prot_t       fault_type;
        vm_object_t     object;
+       boolean_t       no_cache = fault_info->no_cache;
+       boolean_t       cs_bypass = fault_info->cs_bypass;
+       int             pmap_options = fault_info->pmap_options;
 
        fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
        object = VM_PAGE_OBJECT(m);
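The signature change above replaces four discrete arguments (no_cache, cs_bypass, user_tag, pmap_options) with a single vm_object_fault_info_t pointer, and vm_fault_enter() unpacks the fields it needs locally. A small sketch of that calling convention, using a hypothetical struct rather than the kernel's; bundling the flags means later additions (such as pmap_cs_associated) do not have to touch every caller.

#include <stdbool.h>

struct fault_info {                  /* hypothetical, simplified bundle */
        bool no_cache;
        bool cs_bypass;
        int  user_tag;
        int  pmap_options;
};

/* Callee unpacks what it needs from the one pointer argument. */
static int fault_enter(const struct fault_info *fi)
{
        bool no_cache  = fi->no_cache;
        bool cs_bypass = fi->cs_bypass;
        int  pmap_opts = fi->pmap_options;

        (void)no_cache; (void)cs_bypass; (void)pmap_opts;
        return 0;
}

int main(void)
{
        struct fault_info fi = { .no_cache = false, .cs_bypass = false,
                                 .user_tag = 0, .pmap_options = 0 };
        return fault_enter(&fi);
}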
@@ -2534,7 +2496,7 @@ vm_fault_enter(vm_page_t m,
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
 
        if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
-               assert(m->fictitious);
+               assert(m->vmp_fictitious);
                return KERN_SUCCESS;
        }
 
@@ -2542,7 +2504,12 @@ vm_fault_enter(vm_page_t m,
 
                vm_object_lock_assert_exclusive(object);
 
-       } else if ((fault_type & VM_PROT_WRITE) == 0 && !m->wpmapped) {
+       } else if ((fault_type & VM_PROT_WRITE) == 0 &&
+                  (!m->vmp_wpmapped
+#if VM_OBJECT_ACCESS_TRACKING
+                   || object->access_tracking
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+                          )) {
                /*
                 * This is not a "write" fault, so we
                 * might not have taken the object lock
@@ -2562,13 +2529,13 @@ vm_fault_enter(vm_page_t m,
                        assert(cs_bypass);
                }
        }
-       if (m->pmapped == FALSE) {
+       if (m->vmp_pmapped == FALSE) {
 
-               if (m->clustered) {
+               if (m->vmp_clustered) {
                        if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
                                /*
                                 * found it in the cache, but this
-                                * is the first fault-in of the page (m->pmapped == FALSE)
+                                * is the first fault-in of the page (m->vmp_pmapped == FALSE)
                                 * so it must have come in as part of
                                 * a cluster... account 1 pagein against it
                                 */
@@ -2592,20 +2559,43 @@ vm_fault_enter(vm_page_t m,
        }
 
        /* Validate code signature if necessary. */
-       if (VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
+       if (!cs_bypass &&
+           VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
                vm_object_lock_assert_exclusive(object);
 
-               if (m->cs_validated) {
+               if (m->vmp_cs_validated) {
                        vm_cs_revalidates++;
                }
 
                /* VM map is locked, so 1 ref will remain on VM object -
                 * so no harm if vm_page_validate_cs drops the object lock */
+
+#if PMAP_CS
+               if (fault_info->pmap_cs_associated &&
+                   pmap_cs_enforced(pmap) &&
+                   !m->vmp_cs_validated &&
+                   !m->vmp_cs_tainted &&
+                   !m->vmp_cs_nx &&
+                   (prot & VM_PROT_EXECUTE) &&
+                   (caller_prot & VM_PROT_EXECUTE)) {
+                       /*
+                        * With pmap_cs, the pmap layer will validate the
+                        * code signature for any executable pmap mapping.
+                        * No need for us to validate this page too:
+                        * in pmap_cs we trust...
+                        */
+                       vm_cs_defer_to_pmap_cs++;
+               } else {
+                       vm_cs_defer_to_pmap_cs_not++;
+                       vm_page_validate_cs(m);
+               }
+#else /* PMAP_CS */
                vm_page_validate_cs(m);
+#endif /* PMAP_CS */
        }
 
-#define page_immutable(m,prot) ((m)->cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
-#define page_nx(m) ((m)->cs_nx)
+#define page_immutable(m,prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
+#define page_nx(m) ((m)->vmp_cs_nx)
 
        map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
                           (pmap == vm_map_pmap(current_thread()->map)));
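Under PMAP_CS, the VM layer skips its own signature check only when the pmap layer will validate the executable mapping itself: the mapping is pmap_cs-associated and enforced, the page is neither validated, tainted, nor marked NX, and both the granted and the requested protections include execute. A standalone sketch of that gate; the names and the PROT_EXEC constant are illustrative.

#include <stdbool.h>

enum { PROT_EXEC = 0x4 };            /* stand-in for VM_PROT_EXECUTE */

struct page { bool cs_validated, cs_tainted, cs_nx; };

static bool defer_cs_to_pmap(bool pmap_cs_associated, bool pmap_cs_enforced,
                             const struct page *p, int prot, int caller_prot)
{
        return pmap_cs_associated && pmap_cs_enforced &&
               !p->cs_validated && !p->cs_tainted && !p->cs_nx &&
               (prot & PROT_EXEC) && (caller_prot & PROT_EXEC);
}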
@@ -2621,7 +2611,7 @@ vm_fault_enter(vm_page_t m,
         * from the current map. We do that below right before we do the
         * PMAP_ENTER.
         */
-       cs_enforcement_enabled = cs_enforcement(NULL);
+       cs_enforcement_enabled = cs_process_enforcement(NULL);
 
        if(cs_enforcement_enabled && map_is_switched &&
           map_is_switch_protected && page_immutable(m, prot) &&
@@ -2636,31 +2626,6 @@ vm_fault_enter(vm_page_t m,
                return KERN_CODESIGN_ERROR;
        }
 
-       if (cs_enforcement_enabled &&
-           !m->cs_validated &&
-           (prot & VM_PROT_EXECUTE) &&
-           !(caller_prot & VM_PROT_EXECUTE)) {
-               /*
-                * FOURK PAGER:
-                * This page has not been validated and will not be
-                * allowed to be mapped for "execute".
-                * But the caller did not request "execute" access for this
-                * fault, so we should not raise a code-signing violation
-                * (and possibly kill the process) below.
-                * Instead, let's just remove the "execute" access request.
-                *
-                * This can happen on devices with a 4K page size if a 16K
-                * page contains a mix of signed&executable and
-                * unsigned&non-executable 4K pages, making the whole 16K
-                * mapping "executable".
-                */
-               if (!pmap_has_prot_policy(prot)) {
-                       prot &= ~VM_PROT_EXECUTE;
-               } else {
-                       assert(cs_bypass);
-               }
-       }
-
        /* A page could be tainted, or pose a risk of being tainted later.
         * Check whether the receiving process wants it, and make it feel
         * the consequences (that happens in cs_invalid_page()).
@@ -2671,28 +2636,52 @@ vm_fault_enter(vm_page_t m,
         *   can be changed without the kernel noticing, therefore unsigned
         *   code can be created
         */
-       if (!cs_bypass &&
-           (m->cs_tainted ||
-            (cs_enforcement_enabled &&
-             (/* The page is unsigned and wants to be executable */
-              (!m->cs_validated && (prot & VM_PROT_EXECUTE))  ||
-              /* The page should be immutable, but is in danger of being modified
-               * This is the case where we want policy from the code directory -
-               * is the page immutable or not? For now we have to assume that
-               * code pages will be immutable, data pages not.
-               * We'll assume a page is a code page if it has a code directory
-               * and we fault for execution.
-               * That is good enough since if we faulted the code page for
-               * writing in another map before, it is wpmapped; if we fault
-               * it for writing in this map later it will also be faulted for executing
-               * at the same time; and if we fault for writing in another map
-               * later, we will disconnect it from this pmap so we'll notice
-               * the change.
-               */
-             (page_immutable(m, prot) && ((prot & VM_PROT_WRITE) || m->wpmapped))
-             ))
-                   ))
-       {
+       if (cs_bypass) {
+               /* code-signing is bypassed */
+               cs_violation = FALSE;
+       } else if (m->vmp_cs_tainted) {
+               /* tainted page */
+               cs_violation = TRUE;
+       } else if (!cs_enforcement_enabled) {
+               /* no further code-signing enforcement */
+               cs_violation = FALSE;
+       } else if (page_immutable(m, prot) &&
+                  ((prot & VM_PROT_WRITE) ||
+                   m->vmp_wpmapped)) {
+               /*
+                * The page should be immutable, but is in danger of being
+                * modified.
+                * This is the case where we want policy from the code
+                * directory - is the page immutable or not? For now we have
+                * to assume that code pages will be immutable, data pages not.
+                * We'll assume a page is a code page if it has a code directory
+                * and we fault for execution.
+                * That is good enough since if we faulted the code page for
+                * writing in another map before, it is wpmapped; if we fault
+                * it for writing in this map later it will also be faulted for
+                * executing at the same time; and if we fault for writing in
+                * another map later, we will disconnect it from this pmap so
+                * we'll notice the change.
+                */
+               cs_violation = TRUE;
+       } else if (!m->vmp_cs_validated &&
+                  (prot & VM_PROT_EXECUTE)
+#if PMAP_CS
+                  /*
+                   * Executable pages will be validated by pmap_cs;
+                   * in pmap_cs we trust...
+                   * If pmap_cs is turned off, this is a code-signing
+                   * violation.
+                   */
+                  && ! (pmap_cs_enforced(pmap))
+#endif /* PMAP_CS */
+               ) {
+               cs_violation = TRUE;
+       } else {
+               cs_violation = FALSE;
+       }
+
+       if (cs_violation) {
                /* We will have a tainted page. Have to handle the special case
                 * of a switched map now. If the map is not switched, standard
                 * procedure applies - call cs_invalid_page().
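The rewrite above flattens the old nested condition into an if/else chain that computes a single cs_violation flag: bypass wins outright, a tainted page always violates, no enforcement never does, and then come the two substantive cases (an immutable page in danger of modification, and an unvalidated page being mapped executable). A compact restatement of that chain with simplified stand-in types, omitting the PMAP_CS leg:

#include <stdbool.h>

enum { PROT_WRITE = 0x2, PROT_EXEC = 0x4 };

struct page { bool cs_tainted, cs_validated, wpmapped; };

/* page_immutable() in the diff currently reduces to "validated". */
static bool page_immutable(const struct page *p) { return p->cs_validated; }

static bool is_cs_violation(bool cs_bypass, bool cs_enforced,
                            const struct page *p, int prot)
{
        if (cs_bypass)
                return false;                     /* code signing bypassed     */
        if (p->cs_tainted)
                return true;                      /* known-bad page            */
        if (!cs_enforced)
                return false;                     /* no enforcement            */
        if (page_immutable(p) && ((prot & PROT_WRITE) || p->wpmapped))
                return true;                      /* signed page being changed */
        if (!p->cs_validated && (prot & PROT_EXEC))
                return true;                      /* unsigned page made exec   */
        return false;
}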
@@ -2707,12 +2696,11 @@ vm_fault_enter(vm_page_t m,
                        reject_page = FALSE;
                } else {
                        if (cs_debug > 5)
-                               printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s slid: %s prot: 0x%x\n",
+                               printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
                                       object->code_signed ? "yes" : "no",
-                                      m->cs_validated ? "yes" : "no",
-                                      m->cs_tainted ? "yes" : "no",
-                                      m->wpmapped ? "yes" : "no",
-                                      m->slid ? "yes" : "no",
+                                      m->vmp_cs_validated ? "yes" : "no",
+                                      m->vmp_cs_tainted ? "yes" : "no",
+                                      m->vmp_wpmapped ? "yes" : "no",
                                       (int)prot);
                        reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
                }
@@ -2744,7 +2732,7 @@ vm_fault_enter(vm_page_t m,
 
                        /* get file's VM object */
                        file_object = object;
-                       file_offset = m->offset;
+                       file_offset = m->vmp_offset;
                        for (shadow = file_object->shadow,
                                     shadow_depth = 0;
                             shadow != VM_OBJECT_NULL;
@@ -2798,7 +2786,7 @@ vm_fault_enter(vm_page_t m,
                               "from offset 0x%llx in file \"%s%s%s\" "
                               "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
                               "(signed:%d validated:%d tainted:%d nx:%d "
-                              "wpmapped:%d slid:%d dirty:%d depth:%d)\n",
+                              "wpmapped:%d dirty:%d depth:%d)\n",
                               pid, procname, (addr64_t) vaddr,
                               file_offset,
                               (pathname ? pathname : "<nil>"),
@@ -2811,12 +2799,11 @@ vm_fault_enter(vm_page_t m,
                                : "!="),
                               mtime.tv_sec, mtime.tv_nsec,
                               object->code_signed,
-                              m->cs_validated,
-                              m->cs_tainted,
-                              m->cs_nx,
-                              m->wpmapped,
-                              m->slid,
-                              m->dirty,
+                              m->vmp_cs_validated,
+                              m->vmp_cs_tainted,
+                              m->vmp_cs_nx,
+                              m->vmp_wpmapped,
+                              m->vmp_dirty,
                               shadow_depth);
 
                        /*
@@ -2861,12 +2848,12 @@ vm_fault_enter(vm_page_t m,
                                                        ceri->ceri_page_modtime_secs = mtime.tv_sec;
                                                        ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
                                                        ceri->ceri_object_codesigned = (object->code_signed);
-                                                       ceri->ceri_page_codesig_validated = (m->cs_validated);
-                                                       ceri->ceri_page_codesig_tainted = (m->cs_tainted);
-                                                       ceri->ceri_page_codesig_nx = (m->cs_nx);
-                                                       ceri->ceri_page_wpmapped = (m->wpmapped);
-                                                       ceri->ceri_page_slid = (m->slid);
-                                                       ceri->ceri_page_dirty = (m->dirty);
+                                                       ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
+                                                       ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
+                                                       ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
+                                                       ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
+                                                       ceri->ceri_page_slid = 0;
+                                                       ceri->ceri_page_dirty = (m->vmp_dirty);
                                                        ceri->ceri_page_shadow_depth = shadow_depth;
                                                } else {
 #if DEBUG || DEVELOPMENT
@@ -2883,13 +2870,13 @@ vm_fault_enter(vm_page_t m,
                                set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
                        }
                        if (panic_on_cs_killed &&
-                           object->object_slid) {
+                           object->object_is_shared_cache) {
                                panic("CODE SIGNING: process %d[%s]: "
                                      "rejecting invalid page at address 0x%llx "
                                      "from offset 0x%llx in file \"%s%s%s\" "
                                      "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
                                      "(signed:%d validated:%d tainted:%d nx:%d"
-                                     "wpmapped:%d slid:%d dirty:%d depth:%d)\n",
+                                     "wpmapped:%d dirty:%d depth:%d)\n",
                                      pid, procname, (addr64_t) vaddr,
                                      file_offset,
                                      (pathname ? pathname : "<nil>"),
@@ -2902,12 +2889,11 @@ vm_fault_enter(vm_page_t m,
                                       : "!="),
                                      mtime.tv_sec, mtime.tv_nsec,
                                      object->code_signed,
-                                     m->cs_validated,
-                                     m->cs_tainted,
-                                     m->cs_nx,
-                                     m->wpmapped,
-                                     m->slid,
-                                     m->dirty,
+                                     m->vmp_cs_validated,
+                                     m->vmp_cs_tainted,
+                                     m->vmp_cs_nx,
+                                     m->vmp_wpmapped,
+                                     m->vmp_dirty,
                                      shadow_depth);
                        }
 
@@ -2922,7 +2908,7 @@ vm_fault_enter(vm_page_t m,
                } else {
                        /* proceed with the invalid page */
                        kr = KERN_SUCCESS;
-                       if (!m->cs_validated &&
+                       if (!m->vmp_cs_validated &&
                            !object->code_signed) {
                                /*
                                 * This page has not been (fully) validated but
@@ -2951,8 +2937,8 @@ vm_fault_enter(vm_page_t m,
                                 * through that code path for re-consideration
                                 * of the validity of that page.
                                 */
-                               must_disconnect = !m->cs_tainted;
-                               m->cs_tainted = TRUE;
+                               must_disconnect = !m->vmp_cs_tainted;
+                               m->vmp_cs_tainted = TRUE;
                        }
                        cs_enter_tainted_accepted++;
                }
@@ -2995,12 +2981,12 @@ MACRO_END
         * the page queues.  Change wiring
         * case is obvious.
         */
-       assert((m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
+       assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
 
 #if CONFIG_BACKGROUND_QUEUE
        vm_page_update_background_state(m);
 #endif
-       if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+       if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
                /*
                 * Compressor pages are neither wired
                 * nor pageable and should never change.
@@ -3030,10 +3016,10 @@ MACRO_END
                        __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
                        vm_page_deactivate(m);
                        /* we keep the page queues lock, if we need it later */
-               } else if (((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) ||
-                           (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
-                           (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
-                           ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
+               } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
+                           (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
+                           (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
+                           ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
                           !VM_PAGE_WIRED(m)) {
 
                        if (vm_page_local_q &&
@@ -3042,7 +3028,7 @@ MACRO_END
                                struct vpl      *lq;
                                uint32_t        lid;
 
-                               assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+                               assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
 
                                __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
                                vm_object_lock_assert_exclusive(object);
@@ -3069,9 +3055,9 @@ MACRO_END
 
                                vm_page_check_pageable_safe(m);
                                vm_page_queue_enter(&lq->vpl_queue, m,
-                                                   vm_page_t, pageq);
-                               m->vm_page_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
-                               m->local_id = lid;
+                                                   vm_page_t, vmp_pageq);
+                               m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
+                               m->vmp_local_id = lid;
                                lq->vpl_count++;
 
                                if (object->internal)
@@ -3110,11 +3096,11 @@ MACRO_END
                                 * page queue lock
                                 */
                                if (!VM_PAGE_WIRED(m)) {
-                                       if (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
+                                       if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
                                                vm_page_queues_remove(m, FALSE);
 
-                                               vm_pageout_cleaned_reactivated++;
-                                               vm_pageout_cleaned_fault_reactivated++;
+                                               VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
+                                               VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
                                        }
 
                                        if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
@@ -3134,10 +3120,10 @@ MACRO_END
 
                                                if (no_cache &&
                                                    (!previously_pmapped ||
-                                                    m->no_cache)) {
-                                                       m->no_cache = TRUE;
+                                                    m->vmp_no_cache)) {
+                                                       m->vmp_no_cache = TRUE;
 
-                                                       if (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)
+                                                       if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)
                                                                vm_page_speculate(m, FALSE);
 
                                                } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
@@ -3166,7 +3152,7 @@ MACRO_END
                 * properly serialize updating the pmapped and
                 * xpmapped bits
                 */
-               if ((prot & VM_PROT_EXECUTE) && !m->xpmapped) {
+               if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
                        ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
 
                        pmap_lock_phys_page(phys_page);
@@ -3176,11 +3162,11 @@ MACRO_END
                         * need to grab this lock a 2nd time
                         * just below
                         */
-                       m->pmapped = TRUE;
+                       m->vmp_pmapped = TRUE;
 
-                       if (!m->xpmapped) {
+                       if (!m->vmp_xpmapped) {
 
-                               m->xpmapped = TRUE;
+                               m->vmp_xpmapped = TRUE;
 
                                pmap_unlock_phys_page(phys_page);
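For an executable mapping, the pmapped/xpmapped bits are updated under a per-physical-page lock, and xpmapped is tested again after the lock is taken so that only the first executable mapping of a page pays the one-time work. The same shape, sketched with a pthread mutex standing in for pmap_lock_phys_page()/pmap_unlock_phys_page():

#include <pthread.h>
#include <stdbool.h>

struct page {
        pthread_mutex_t phys_lock;   /* stand-in for the phys-page lock */
        bool pmapped;
        bool xpmapped;
};

static void note_exec_mapping(struct page *m)
{
        pthread_mutex_lock(&m->phys_lock);
        m->pmapped = true;
        if (!m->xpmapped) {
                m->xpmapped = true;
                pthread_mutex_unlock(&m->phys_lock);
                /* ... one-time work for a newly executable page ... */
        } else {
                pthread_mutex_unlock(&m->phys_lock);
        }
}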
 
@@ -3210,44 +3196,23 @@ MACRO_END
                        } else
                                pmap_unlock_phys_page(phys_page);
                } else {
-                       if (m->pmapped == FALSE) {
+                       if (m->vmp_pmapped == FALSE) {
                                ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
 
                                pmap_lock_phys_page(phys_page);
-                               m->pmapped = TRUE;
+                               m->vmp_pmapped = TRUE;
                                pmap_unlock_phys_page(phys_page);
                        }
                }
-               if (vm_page_is_slideable(m)) {
-                       boolean_t was_busy = m->busy;
-
-                       vm_object_lock_assert_exclusive(object);
-
-                       m->busy = TRUE;
-                       kr = vm_page_slide(m, 0);
-                       assert(m->busy);
-                       if(!was_busy) {
-                               PAGE_WAKEUP_DONE(m);
-                       }
-                       if (kr != KERN_SUCCESS) {
-                               /*
-                                * This page has not been slid correctly,
-                                * do not do the pmap_enter() !
-                                * Let vm_fault_enter() return the error
-                                * so the caller can fail the fault.
-                                */
-                               goto after_the_pmap_enter;
-                       }
-               }
 
                if (fault_type & VM_PROT_WRITE) {
 
-                       if (m->wpmapped == FALSE) {
+                       if (m->vmp_wpmapped == FALSE) {
                                vm_object_lock_assert_exclusive(object);
                                if (!object->internal && object->pager) {
                                        task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
                                }
-                               m->wpmapped = TRUE;
+                               m->vmp_wpmapped = TRUE;
                        }
                        if (must_disconnect) {
                                /*
@@ -3273,6 +3238,33 @@ MACRO_END
                }
                assert(VM_PAGE_OBJECT(m) == object);
 
+#if VM_OBJECT_ACCESS_TRACKING
+               if (object->access_tracking) {
+                       DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
+                       if (fault_type & VM_PROT_WRITE) {
+                               object->access_tracking_writes++;
+                               vm_object_access_tracking_writes++;
+                       } else {
+                               object->access_tracking_reads++;
+                               vm_object_access_tracking_reads++;
+                       }
+               }
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+
+#if PMAP_CS
+               /*
+                * If CS enforcement is on, we don't ask for an executable page if the
+                * fault does not call for execution, because that can fail in
+                * situations where the caller only actually wanted read access.
+                * However, it may be better to instead retry without execute on
+                * failure, or pass a flag into pmap_enter to do the right thing.
+                */
+               // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults
+               if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) {
+                       prot &= ~VM_PROT_EXECUTE;
+               }
+#endif
+
                /* Prevent a deadlock by not
                 * holding the object lock if we need to wait for a page in
                 * pmap_enter() - <rdar://problem/7138958> */
@@ -3317,11 +3309,11 @@ MACRO_END
                         * at the level above us, so
                         * use the blocking version instead. Requires marking
                         * the page busy and unlocking the object */
-                       boolean_t was_busy = m->busy;
+                       boolean_t was_busy = m->vmp_busy;
 
                        vm_object_lock_assert_exclusive(object);
 
-                       m->busy = TRUE;
+                       m->vmp_busy = TRUE;
                        vm_object_unlock(object);
 
                        PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
@@ -3335,7 +3327,7 @@ MACRO_END
 
                        /* If the page was busy, someone else will wake it up.
                         * Otherwise, we have to do it now. */
-                       assert(m->busy);
+                       assert(m->vmp_busy);
                        if(!was_busy) {
                                PAGE_WAKEUP_DONE(m);
                        }
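When PMAP_ENTER may block, the page is marked busy and the object lock is dropped so the wait cannot deadlock against another fault on the same object; afterwards the busy bit is cleared (and waiters woken) only if this thread was the one that set it. A sketch of that pattern with a mutex in place of the VM object lock:

#include <pthread.h>
#include <stdbool.h>

struct object { pthread_mutex_t lock; };
struct page   { bool busy; };

static void blocking_enter(struct object *obj, struct page *m)
{
        /* caller holds obj->lock */
        bool was_busy = m->busy;

        m->busy = true;                    /* keep other faults off the page */
        pthread_mutex_unlock(&obj->lock);  /* drop the lock across the       */
                                           /* potentially blocking call      */

        /* ... blocking mapping operation would happen here ... */

        pthread_mutex_lock(&obj->lock);
        if (!was_busy)
                m->busy = false;           /* PAGE_WAKEUP_DONE() analogue */
}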
@@ -3449,19 +3441,22 @@ vm_fault_internal(
        vm_object_t             new_object;
        int                     type_of_fault;
        pmap_t                  pmap;
-       boolean_t               interruptible_state;
+       wait_interrupt_t        interruptible_state;
        vm_map_t                real_map = map;
        vm_map_t                original_map = map;
        boolean_t               object_locks_dropped = FALSE;
        vm_prot_t               fault_type;
        vm_prot_t               original_fault_type;
-       struct vm_object_fault_info fault_info;
+       struct vm_object_fault_info fault_info = {};
        boolean_t               need_collapse = FALSE;
        boolean_t               need_retry = FALSE;
        boolean_t               *need_retry_ptr = NULL;
        int                     object_lock_type = 0;
        int                     cur_object_lock_type;
        vm_object_t             top_object = VM_OBJECT_NULL;
+       vm_object_t             written_on_object = VM_OBJECT_NULL;
+       memory_object_t         written_on_pager = NULL;
+       vm_object_offset_t      written_on_offset = 0;
        int                     throttle_delay;
        int                     compressed_count_delta;
        int                     grab_options;
@@ -3502,6 +3497,14 @@ vm_fault_internal(
                return (KERN_FAILURE);
        }
 
+       thread_t cthread = current_thread();
+       boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
+       uint64_t fstart = 0;
+
+       if (rtfault) {
+               fstart = mach_continuous_time();
+       }
+
        interruptible_state = thread_interrupt_level(interruptible);
 
        fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
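The new rtfault path samples mach_continuous_time() at fault entry so the fault's duration can later be attributed to the real-time thread. A user-space example of timing an interval on the same continuous (sleep-inclusive) timebase; this assumes macOS, where mach_continuous_time() is declared in <mach/mach_time.h> and returns ticks in Mach timebase units:

#include <mach/mach_time.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t fstart = mach_continuous_time();

        /* ... work whose latency we want to attribute ... */

        uint64_t fend = mach_continuous_time();
        printf("elapsed: %llu ticks\n", (unsigned long long)(fend - fstart));
        return 0;
}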
@@ -3526,6 +3529,8 @@ vm_fault_internal(
                }
        }
 RetryFault:
+       assert(written_on_object == VM_OBJECT_NULL);
+
        /*
         * assume we will hit a page in the cache
         * otherwise, explicitly override with
@@ -3677,7 +3682,7 @@ RetryFault:
                if (m != VM_PAGE_NULL) {
                        m_object = cur_object;
 
-                       if (m->busy) {
+                       if (m->vmp_busy) {
                                wait_result_t   result;
 
                                /*
@@ -3726,9 +3731,9 @@ RetryFault:
                                                continue;
                                        }
                                }
-                               if ((m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
+                               if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
                                        /*
-                                        * m->busy == TRUE and the object is locked exclusively
+                                        * m->vmp_busy == TRUE and the object is locked exclusively
                                         * if m->pageout_queue == TRUE after we acquire the
                                         * queues lock, we are guaranteed that it is stable on
                                         * the pageout queue and therefore reclaimable
@@ -3740,7 +3745,7 @@ RetryFault:
 
                                        vm_page_lock_queues();
 
-                                       if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+                                       if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
                                                vm_pageout_throttle_up(m);
                                                vm_page_unlock_queues();
 
@@ -3772,7 +3777,7 @@ RetryFault:
                                goto done;
                        }
 reclaimed_from_pageout:
-                       if (m->laundry) {
+                       if (m->vmp_laundry) {
                                if (object != cur_object) {
                                        if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
                                                cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
@@ -3815,7 +3820,7 @@ reclaimed_from_pageout:
                                 */
                                break;
                        }
-                       if (m->unusual && (m->error || m->restart || m->private || m->absent)) {
+                       if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
                                /*
                                 * Unusual case... let the slow path deal with it
                                 */
@@ -3831,32 +3836,6 @@ reclaimed_from_pageout:
                                kr = KERN_MEMORY_ERROR;
                                goto done;
                        }
-                       if (vm_page_is_slideable(m)) {
-                               /*
-                                * We might need to slide this page, and so,
-                                * we want to hold the VM object exclusively.
-                                */
-                               if (object != cur_object) {
-                                       if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
-                                               vm_object_unlock(object);
-                                               vm_object_unlock(cur_object);
-
-                                               cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
-
-                                               vm_map_unlock_read(map);
-                                               if (real_map != map)
-                                                       vm_map_unlock(real_map);
-
-                                               goto RetryFault;
-                                       }
-                               } else if (object_lock_type == OBJECT_LOCK_SHARED) {
-
-                                       vm_object_unlock(object);
-                                       object_lock_type = OBJECT_LOCK_EXCLUSIVE;
-                                       vm_map_unlock_read(map);
-                                       goto RetryFault;
-                               }
-                       }
                        assert(m_object == VM_PAGE_OBJECT(m));
 
                        if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
@@ -3963,7 +3942,7 @@ FastPmapEnter:
                                 * prepare for the pmap_enter...
                                 * object and map are both locked
                                 * m contains valid data
-                                * object == m->object
+                                * object == m->vmp_object
                                 * cur_object == NULL or it's been unlocked
                                 * no paging references on either object or cur_object
                                 */
@@ -3981,10 +3960,7 @@ FastPmapEnter:
                                                            wired,
                                                            change_wiring,
                                                            wire_tag,
-                                                           fault_info.no_cache,
-                                                           fault_info.cs_bypass,
-                                                           fault_info.user_tag,
-                                                           fault_info.pmap_options,
+                                                           &fault_info,
                                                            need_retry_ptr,
                                                            &type_of_fault);
                                } else {
@@ -3996,10 +3972,7 @@ FastPmapEnter:
                                                            wired,
                                                            change_wiring,
                                                            wire_tag,
-                                                           fault_info.no_cache,
-                                                           fault_info.cs_bypass,
-                                                           fault_info.user_tag,
-                                                           fault_info.pmap_options,
+                                                           &fault_info,
                                                            need_retry_ptr,
                                                            &type_of_fault);
                                }
@@ -4009,14 +3982,14 @@ FastPmapEnter:
 
                                if (m_object->internal)
                                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
-                               else if (m_object->object_slid)
+                               else if (m_object->object_is_shared_cache)
                                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
                                else
                                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
 
-                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0);
+                               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
 
-                               DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
+                               DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
                                }
 #endif
                                if (kr == KERN_SUCCESS &&
@@ -4025,7 +3998,7 @@ FastPmapEnter:
                                        *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
                                        if (prot & VM_PROT_WRITE) {
                                                vm_object_lock_assert_exclusive(m_object);
-                                               m->dirty = TRUE;
+                                               m->vmp_dirty = TRUE;
                                        }
                                }
 
@@ -4053,16 +4026,25 @@ FastPmapEnter:
                                         * vm_fault_deactivate_behind depends on the
                                         * state being up to date
                                         */
-                                       vm_fault_is_sequential(object, cur_offset, fault_info.behavior);
+                                       vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
 
-                                       vm_fault_deactivate_behind(object, cur_offset, fault_info.behavior);
+                                       vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
                                }
                                /*
                                 * That's it, clean up and return.
                                 */
-                               if (m->busy)
+                               if (m->vmp_busy)
                                        PAGE_WAKEUP_DONE(m);
 
+                               if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
+                                       
+                                       vm_object_paging_begin(m_object);
+
+                                       assert(written_on_object == VM_OBJECT_NULL);
+                                       written_on_object = m_object;
+                                       written_on_pager = m_object->pager;
+                                       written_on_offset = m_object->paging_offset + m->vmp_offset;
+                               }
                                vm_object_unlock(object);
 
                                vm_map_unlock_read(map);
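For a write fault on an external (pager-backed) object, the fast path now records the object, its pager, and the paging offset, and takes a paging reference, so the write can be dealt with after the map and object locks are dropped (the record is asserted clear again at RetryFault). A sketch of that record-now/handle-later shape, with stand-in types and hypothetical helper names:

#include <stdint.h>
#include <stddef.h>

struct pager;                              /* opaque */
struct object {
        struct pager *pager;
        uint64_t      paging_offset;
        int           paging_in_progress;  /* paging reference count */
};

struct write_note {                        /* what the fault path remembers */
        struct object *obj;
        struct pager  *pager;
        uint64_t       offset;
};

/* Record the write while the object is still locked... */
static void remember_write(struct write_note *n, struct object *o,
                           uint64_t page_offset)
{
        o->paging_in_progress++;           /* vm_object_paging_begin() analogue */
        n->obj    = o;
        n->pager  = o->pager;
        n->offset = o->paging_offset + page_offset;
}

/* ...and deal with it later, once the locks are gone. */
static void finish_write(struct write_note *n)
{
        /* e.g. notify n->pager about the range starting at n->offset */
        n->obj->paging_in_progress--;      /* vm_object_paging_end() analogue */
        n->obj = NULL;
}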
@@ -4156,10 +4138,10 @@ FastPmapEnter:
                        /*
                         * Now cope with the source page and object
                         */
-                       if (object->ref_count > 1 && cur_m->pmapped)
+                       if (object->ref_count > 1 && cur_m->vmp_pmapped)
                                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
 
-                       if (cur_m->clustered) {
+                       if (cur_m->vmp_clustered) {
                                VM_PAGE_COUNT_AS_PAGEIN(cur_m);
                                VM_PAGE_CONSUME_CLUSTERED(cur_m);
                                vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
@@ -4349,7 +4331,7 @@ FastPmapEnter:
                                                m = VM_PAGE_NULL;
                                                break;
                                        }
-                                       m->dirty = TRUE;
+                                       m->vmp_dirty = TRUE;
 
                                        /*
                                         * If the object is purgeable, its
@@ -4377,22 +4359,25 @@ FastPmapEnter:
                                                 * no ledger update in that
                                                 * case.
                                                 */
-                                       } else if ((cur_object->purgable ==
-                                                   VM_PURGABLE_DENY) ||
-                                                  (cur_object->vo_purgeable_owner ==
+                                       } else if (((cur_object->purgable ==
+                                                    VM_PURGABLE_DENY) &&
+                                                   (!cur_object->vo_ledger_tag)) ||
+                                                  (cur_object->vo_owner ==
                                                    NULL)) {
                                                /*
                                                 * "cur_object" is not purgeable
-                                                * or is not owned, so no
-                                                * purgeable ledgers to update.
+                                                * and is not ledger-tagged, or
+                                                * there's no owner for it,
+                                                * so no owner's ledgers to
+                                                * update.
                                                 */
                                        } else {
                                                /*
                                                 * One less compressed
-                                                * purgeable page for
+                                                * purgeable/tagged page for
                                                 * cur_object's owner.
                                                 */
-                                               vm_purgeable_compressed_update(
+                                               vm_object_owner_compressed_update(
                                                        cur_object,
                                                        -1);
                                        }
@@ -4460,16 +4445,6 @@ FastPmapEnter:
                                        kr = KERN_MEMORY_ERROR;
                                        goto done;
                                }
-                               if (vm_backing_store_low) {
-                                       /*
-                                        * we are protecting the system from
-                                        * backing store exhaustion...
-                                        * must take the slow path if we're
-                                        * not privileged
-                                        */
-                                       if (!(current_task()->priv_flags & VM_BACKING_STORE_PRIV))
-                                               break;
-                               }
                                if (cur_object != object) {
                                        vm_object_unlock(cur_object);
 
@@ -4891,10 +4866,7 @@ handle_copy_delay:
                                            wired,
                                            change_wiring,
                                            wire_tag,
-                                           fault_info.no_cache,
-                                           fault_info.cs_bypass,
-                                           fault_info.user_tag,
-                                           fault_info.pmap_options,
+                                           &fault_info,
                                            NULL,
                                            &type_of_fault);
                } else {
@@ -4906,10 +4878,7 @@ handle_copy_delay:
                                            wired,
                                            change_wiring,
                                            wire_tag,
-                                           fault_info.no_cache,
-                                           fault_info.cs_bypass,
-                                           fault_info.user_tag,
-                                           fault_info.pmap_options,
+                                           &fault_info,
                                            NULL,
                                            &type_of_fault);
                }
@@ -4921,15 +4890,15 @@ handle_copy_delay:
 
                if (m_object->internal)
                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
-               else if (m_object->object_slid)
+               else if (m_object->object_is_shared_cache)
                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
                else
                        event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
 
-               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->offset, get_current_unique_pid(), 0);
+               KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
 
-               DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
-               }
+               DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
+       }
 #endif
                if (kr != KERN_SUCCESS) {
                        /* abort this page fault */
@@ -4946,7 +4915,7 @@ handle_copy_delay:
                        *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
                        if (prot & VM_PROT_WRITE) {
                                vm_object_lock_assert_exclusive(m_object);
-                               m->dirty = TRUE;
+                               m->vmp_dirty = TRUE;
                        }
                }
        } else {
@@ -4960,28 +4929,6 @@ handle_copy_delay:
                 * in the object
                 */
 
-#ifdef ppc
-               /* While we do not worry about execution protection in   */
-               /* general, certian pages may have instruction execution */
-               /* disallowed.  We will check here, and if not allowed   */
-               /* to execute, we return with a protection failure.      */
-
-               if ((fault_type & VM_PROT_EXECUTE) &&
-                       (!pmap_eligible_for_execute((ppnum_t)(object->vo_shadow_offset >> 12)))) {
-
-                       vm_map_unlock_read(map);
-
-                       if (real_map != map)
-                               vm_map_unlock(real_map);
-
-                       vm_fault_cleanup(object, top_page);
-                       vm_object_deallocate(object);
-
-                       kr = KERN_PROTECTION_FAILURE;
-                       goto done;
-               }
-#endif /* ppc */
-
                if (real_map != map)
                        vm_map_unlock(real_map);
 
@@ -5098,6 +5045,15 @@ cleanup:
        if (m != VM_PAGE_NULL) {
                assert(VM_PAGE_OBJECT(m) == m_object);
 
+               if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
+                       
+                       vm_object_paging_begin(m_object);
+
+                       assert(written_on_object == VM_OBJECT_NULL);
+                       written_on_object = m_object;
+                       written_on_pager = m_object->pager;
+                       written_on_offset = m_object->paging_offset + m->vmp_offset;
+               }
                PAGE_WAKEUP_DONE(m);
 
                vm_fault_cleanup(m_object, top_page);
@@ -5133,6 +5089,22 @@ done:
                        }
                }
        }
+
+       if (written_on_object) {
+
+               vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
+
+               vm_object_lock(written_on_object);
+               vm_object_paging_end(written_on_object);
+               vm_object_unlock(written_on_object);
+
+               written_on_object = VM_OBJECT_NULL;
+       }
+
+       if (rtfault) {
+               vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
+       }
+
        KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
                              (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
                              ((uint64_t)trace_vaddr >> 32),
@@ -5231,7 +5203,7 @@ vm_fault_unwire(
        vm_map_offset_t va;
        vm_map_offset_t end_addr = entry->vme_end;
        vm_object_t             object;
-       struct vm_object_fault_info fault_info;
+       struct vm_object_fault_info fault_info = {};
        unsigned int    unwired_pages;
 
        object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
@@ -5248,7 +5220,6 @@ vm_fault_unwire(
        fault_info.interruptible = THREAD_UNINT;
        fault_info.behavior = entry->behavior;
        fault_info.user_tag = VME_ALIAS(entry);
-       fault_info.pmap_options = 0;
        if (entry->iokit_acct ||
            (!entry->is_sub_map && !entry->use_pmap)) {
                fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
@@ -5257,10 +5228,6 @@ vm_fault_unwire(
        fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
        fault_info.no_cache = entry->no_cache;
        fault_info.stealth = TRUE;
-       fault_info.io_sync = FALSE;
-       fault_info.cs_bypass = FALSE;
-       fault_info.mark_zf_absent = FALSE;
-       fault_info.batch_pmap_op = FALSE;
 
        unwired_pages = 0;
 
@@ -5285,13 +5252,12 @@ vm_fault_unwire(
                        vm_object_t     result_object;
                        vm_fault_return_t result;
 
-                       if (end_addr - va > (vm_size_t) -1) {
-                               /* 32-bit overflow */
-                               fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
-                       } else {
-                               fault_info.cluster_size = (vm_size_t) (end_addr - va);
-                               assert(fault_info.cluster_size == end_addr - va);
+                       /* cap cluster size at maximum UPL size */
+                       upl_size_t cluster_size;
+                       if (os_sub_overflow(end_addr, va, &cluster_size)) {
+                               cluster_size = 0 - (upl_size_t)PAGE_SIZE;
                        }
+                       fault_info.cluster_size = cluster_size;
 
                        do {
                                prot = VM_PROT_NONE;
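
The hunk above (and the matching ones in vm_fault_copy() further down) replaces the hand-rolled 32-bit overflow check with the <os/overflow.h> helpers: os_sub_overflow() stores the difference and returns true only when it does not fit the destination type. A standalone sketch of the pattern, with uint32_t standing in for upl_size_t and a fixed page size (both assumptions of this example, not of the commit):

#include <stdint.h>
#include <os/overflow.h>        /* os_sub_overflow() */

#define SKETCH_PAGE_SIZE 4096u

/* Sketch only: clamp a 64-bit byte range to a 32-bit cluster size,
 * falling back to the largest page-aligned 32-bit value on overflow. */
static uint32_t
clamp_cluster_size(uint64_t end_addr, uint64_t va)
{
        uint32_t cluster_size;

        if (os_sub_overflow(end_addr, va, &cluster_size)) {
                /* end_addr - va does not fit in 32 bits: cap it */
                cluster_size = 0 - SKETCH_PAGE_SIZE;
        }
        return cluster_size;
}
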
@@ -5427,6 +5393,7 @@ vm_fault_wire_fast(
        thread_t                thread = current_thread();
        int                     type_of_fault;
        kern_return_t           kr;
+       struct vm_object_fault_info fault_info = {};
 
        VM_STAT_INCR(faults);
 
@@ -5511,12 +5478,12 @@ vm_fault_wire_fast(
         *      there's something going on, give up.
         */
        m = vm_page_lookup(object, offset);
-       if ((m == VM_PAGE_NULL) || (m->busy) ||
-           (m->unusual && ( m->error || m->restart || m->absent))) {
+       if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
+           (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) {
 
                GIVE_UP;
        }
-       if (m->fictitious &&
+       if (m->vmp_fictitious &&
            VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
                /*
                 * Guard pages are fictitious pages and are never
@@ -5538,9 +5505,9 @@ vm_fault_wire_fast(
        /*
         *      Mark page busy for other threads.
         */
-       assert(!m->busy);
-       m->busy = TRUE;
-       assert(!m->absent);
+       assert(!m->vmp_busy);
+       m->vmp_busy = TRUE;
+       assert(!m->vmp_absent);
 
        /*
         *      Give up if the page is being written and there's a copy object
@@ -5550,6 +5517,13 @@ vm_fault_wire_fast(
                GIVE_UP;
        }
 
+       fault_info.user_tag = VME_ALIAS(entry);
+       fault_info.pmap_options = 0;
+       if (entry->iokit_acct ||
+           (!entry->is_sub_map && !entry->use_pmap)) {
+               fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
+       }
+
        /*
         *      Put this page into the physical map.
         */
@@ -5562,13 +5536,7 @@ vm_fault_wire_fast(
                            TRUE,  /* wired */
                            FALSE, /* change_wiring */
                            wire_tag,
-                           FALSE, /* no_cache */
-                           FALSE, /* cs_bypass */
-                           VME_ALIAS(entry),
-                           ((entry->iokit_acct ||
-                             (!entry->is_sub_map && !entry->use_pmap))
-                            ? PMAP_OPTIONS_ALT_ACCT
-                            : 0),
+                           &fault_info,
                            NULL,
                            &type_of_fault);
        if (kr != KERN_SUCCESS) {
@@ -5588,7 +5556,7 @@ done:
                        *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
                        if (prot & VM_PROT_WRITE) {
                                vm_object_lock_assert_exclusive(object);
-                               m->dirty = TRUE;
+                               m->vmp_dirty = TRUE;
                        }
                } else {
                        *physpage_p = 0;
@@ -5699,8 +5667,8 @@ vm_fault_copy(
        vm_fault_return_t       result;
 
        vm_map_size_t           part_size;
-       struct vm_object_fault_info fault_info_src;
-       struct vm_object_fault_info fault_info_dst;
+       struct vm_object_fault_info fault_info_src = {};
+       struct vm_object_fault_info fault_info_dst = {};
 
        /*
         * In order not to confuse the clustered pageins, align
@@ -5717,29 +5685,15 @@ vm_fault_copy(
 
        fault_info_src.interruptible = interruptible;
        fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
-       fault_info_src.user_tag  = 0;
-       fault_info_src.pmap_options = 0;
        fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
        fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
-       fault_info_src.no_cache   = FALSE;
        fault_info_src.stealth = TRUE;
-       fault_info_src.io_sync = FALSE;
-       fault_info_src.cs_bypass = FALSE;
-       fault_info_src.mark_zf_absent = FALSE;
-       fault_info_src.batch_pmap_op = FALSE;
 
        fault_info_dst.interruptible = interruptible;
        fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
-       fault_info_dst.user_tag  = 0;
-       fault_info_dst.pmap_options = 0;
        fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
        fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
-       fault_info_dst.no_cache   = FALSE;
        fault_info_dst.stealth = TRUE;
-       fault_info_dst.io_sync = FALSE;
-       fault_info_dst.cs_bypass = FALSE;
-       fault_info_dst.mark_zf_absent = FALSE;
-       fault_info_dst.batch_pmap_op = FALSE;
 
        do { /* while (amount_left > 0) */
                /*
@@ -5756,13 +5710,12 @@ vm_fault_copy(
                vm_object_lock(dst_object);
                vm_object_paging_begin(dst_object);
 
-               if (amount_left > (vm_size_t) -1) {
-                       /* 32-bit overflow */
-                       fault_info_dst.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
-               } else {
-                       fault_info_dst.cluster_size = (vm_size_t) amount_left;
-                       assert(fault_info_dst.cluster_size == amount_left);
+               /* cap cluster size at maximum UPL size */
+               upl_size_t cluster_size;
+               if (os_convert_overflow(amount_left, &cluster_size)) {
+                       cluster_size = 0 - (upl_size_t)PAGE_SIZE;
                }
+               fault_info_dst.cluster_size = cluster_size;
 
                XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
                dst_page = VM_PAGE_NULL;
@@ -5851,13 +5804,11 @@ vm_fault_copy(
                                src_prot = VM_PROT_READ;
                                vm_object_paging_begin(src_object);
 
-                               if (amount_left > (vm_size_t) -1) {
-                                       /* 32-bit overflow */
-                                       fault_info_src.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
-                               } else {
-                                       fault_info_src.cluster_size = (vm_size_t) amount_left;
-                                       assert(fault_info_src.cluster_size == amount_left);
+                               /* cap cluster size at maximum UPL size */
+                               if (os_convert_overflow(amount_left, &cluster_size)) {
+                                       cluster_size = 0 - (upl_size_t)PAGE_SIZE;
                                }
+                               fault_info_src.cluster_size = cluster_size;
 
                                XPR(XPR_VM_FAULT,
                                        "vm_fault_copy(2) -> vm_fault_page\n",
@@ -5972,7 +5923,7 @@ vm_fault_copy(
                                                  dst_page,
                                                  (vm_offset_t) dst_po,
                                                  (vm_size_t)part_size);
-                               if(!dst_page->dirty){
+                               if(!dst_page->vmp_dirty){
                                        vm_object_lock(dst_object);
                                        SET_PAGE_DIRTY(dst_page, TRUE);
                                        vm_object_unlock(dst_object);
@@ -5989,7 +5940,7 @@ vm_fault_copy(
                                vm_page_copy(result_page, dst_page);
                                vm_object_unlock(result_page_object);
 
-                               if(!dst_page->dirty){
+                               if(!dst_page->vmp_dirty){
                                        vm_object_lock(dst_object);
                                        SET_PAGE_DIRTY(dst_page, TRUE);
                                        vm_object_unlock(dst_object);
@@ -6050,7 +6001,7 @@ vm_fault_classify(vm_object_t             object,
        while (TRUE) {
                m = vm_page_lookup(object, offset);
                if (m != VM_PAGE_NULL) {
-                       if (m->busy || m->error || m->restart || m->absent) {
+                       if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
                                type = VM_FAULT_TYPE_OTHER;
                                break;
                        }
@@ -6164,23 +6115,23 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
                                return 0;
                        }
 
-                       if (m->laundry || m->busy || m->free_when_done || m->absent || m->error || m->cleaning ||
-                               m->overwriting || m->restart || m->unusual) {
+                       if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
+                               m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
                                return 0;
                        }
 
-                       assert(!m->private);
-                       if (m->private) {
+                       assert(!m->vmp_private);
+                       if (m->vmp_private) {
                                return 0;
                        }
 
-                       assert(!m->fictitious);
-                       if (m->fictitious) {
+                       assert(!m->vmp_fictitious);
+                       if (m->vmp_fictitious) {
                                return 0;
                        }
 
-                       assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
-                       if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+                       assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
+                       if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
                                return 0;
                        }
 
@@ -6212,23 +6163,26 @@ kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
 
 }
 
-void
-vm_page_validate_cs_mapped(
-       vm_page_t       page,
-       const void      *kaddr)
+/*
+ * vm_page_validate_cs_fast():
+ * Performs a few quick checks to determine if the page's code signature
+ * really needs to be fully validated.  It could:
+ *     1. have been modified (i.e. automatically tainted),
+ *     2. have already been validated,
+ *     3. have already been found to be tainted,
+ *     4. no longer have a backing store.
+ * Returns FALSE if the page needs to be fully validated.
+ */
+static boolean_t
+vm_page_validate_cs_fast(
+       vm_page_t       page)
 {
-       vm_object_t             object;
-       vm_object_offset_t      offset;
-       memory_object_t         pager;
-       struct vnode            *vnode;
-       boolean_t               validated;
-       unsigned                tainted;
+       vm_object_t     object;
 
-       assert(page->busy);
        object = VM_PAGE_OBJECT(page);
-       vm_object_lock_assert_exclusive(object);
+       vm_object_lock_assert_held(object);
 
-       if (page->wpmapped && !page->cs_tainted) {
+       if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
                /*
                 * This page was mapped for "write" access sometime in the
                 * past and could still be modifiable in the future.
@@ -6236,38 +6190,76 @@ vm_page_validate_cs_mapped(
                 * [ If the page was already found to be "tainted", no
                 * need to re-validate. ]
                 */
-               page->cs_validated = TRUE;
-               page->cs_tainted = TRUE;
+               vm_object_lock_assert_exclusive(object);
+               page->vmp_cs_validated = TRUE;
+               page->vmp_cs_tainted = TRUE;
                if (cs_debug) {
-                       printf("CODESIGNING: vm_page_validate_cs: "
+                       printf("CODESIGNING: %s: "
                               "page %p obj %p off 0x%llx "
                               "was modified\n",
-                              page, object, page->offset);
+                              __FUNCTION__,
+                              page, object, page->vmp_offset);
                }
                vm_cs_validated_dirtied++;
        }
 
-       if (page->cs_validated || page->cs_tainted) {
-               return;
+       if (page->vmp_cs_validated || page->vmp_cs_tainted) {
+               return TRUE;
        }
+       vm_object_lock_assert_exclusive(object);
 
-       vm_cs_validates++;
+#if CHECK_CS_VALIDATION_BITMAP
+       kern_return_t kr;
 
-       assert(object->code_signed);
-       offset = page->offset;
+       kr = vnode_pager_cs_check_validation_bitmap(
+               object->pager,
+               page->vmp_offset + object->paging_offset,
+               CS_BITMAP_CHECK);
+       if (kr == KERN_SUCCESS) {
+               page->vmp_cs_validated = TRUE;
+               page->vmp_cs_tainted = FALSE;
+               vm_cs_bitmap_validated++;
+               return TRUE;
+       }
+#endif /* CHECK_CS_VALIDATION_BITMAP */
 
        if (!object->alive || object->terminating || object->pager == NULL) {
                /*
                 * The object is terminating and we don't have its pager
                 * so we can't validate the data...
                 */
-               return;
+               return TRUE;
        }
+
+       /* we need to really validate this page */
+       vm_object_lock_assert_exclusive(object);
+       return FALSE;
+}
+
+void
+vm_page_validate_cs_mapped_slow(
+       vm_page_t       page,
+       const void      *kaddr)
+{
+       vm_object_t             object;
+       memory_object_offset_t  mo_offset;
+       memory_object_t         pager;
+       struct vnode            *vnode;
+       boolean_t               validated;
+       unsigned                tainted;
+
+       assert(page->vmp_busy);
+       object = VM_PAGE_OBJECT(page);
+       vm_object_lock_assert_exclusive(object);
+
+       vm_cs_validates++;
+
        /*
         * Since we get here to validate a page that was brought in by
         * the pager, we know that this pager is all setup and ready
         * by now.
         */
+       assert(object->code_signed);
        assert(!object->internal);
        assert(object->pager != NULL);
        assert(object->pager_ready);
@@ -6275,26 +6267,43 @@ vm_page_validate_cs_mapped(
        pager = object->pager;
        assert(object->paging_in_progress);
        vnode = vnode_pager_lookup_vnode(pager);
+       mo_offset = page->vmp_offset + object->paging_offset;
 
        /* verify the SHA1 hash for this page */
        tainted = 0;
        validated = cs_validate_range(vnode,
                                      pager,
-                                     (object->paging_offset +
-                                      offset),
+                                     mo_offset,
                                      (const void *)((const char *)kaddr),
                                      PAGE_SIZE_64,
                                      &tainted);
 
        if (tainted & CS_VALIDATE_TAINTED) {
-               page->cs_tainted = TRUE;
+               page->vmp_cs_tainted = TRUE;
        }
        if (tainted & CS_VALIDATE_NX) {
-               page->cs_nx = TRUE;
+               page->vmp_cs_nx = TRUE;
        }
-
        if (validated) {
-               page->cs_validated = TRUE;
+               page->vmp_cs_validated = TRUE;
+       }
+
+#if CHECK_CS_VALIDATION_BITMAP
+       if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
+               vnode_pager_cs_check_validation_bitmap(object->pager,
+                                                      mo_offset,
+                                                      CS_BITMAP_SET);
+       }
+#endif /* CHECK_CS_VALIDATION_BITMAP */
+}
+
+void
+vm_page_validate_cs_mapped(
+       vm_page_t       page,
+       const void      *kaddr)
+{
+       if (!vm_page_validate_cs_fast(page)) {
+               vm_page_validate_cs_mapped_slow(page, kaddr);
        }
 }
 
@@ -6314,53 +6323,18 @@ vm_page_validate_cs(
        object = VM_PAGE_OBJECT(page);
        vm_object_lock_assert_held(object);
 
-       if (page->wpmapped && !page->cs_tainted) {
-               vm_object_lock_assert_exclusive(object);
-
-               /*
-                * This page was mapped for "write" access sometime in the
-                * past and could still be modifiable in the future.
-                * Consider it tainted.
-                * [ If the page was already found to be "tainted", no
-                * need to re-validate. ]
-                */
-               page->cs_validated = TRUE;
-               page->cs_tainted = TRUE;
-               if (cs_debug) {
-                       printf("CODESIGNING: vm_page_validate_cs: "
-                              "page %p obj %p off 0x%llx "
-                              "was modified\n",
-                              page, object, page->offset);
-               }
-               vm_cs_validated_dirtied++;
-       }
-
-       if (page->cs_validated || page->cs_tainted) {
-               return;
-       }
-
-       if (page->slid) {
-               panic("vm_page_validate_cs(%p): page is slid\n", page);
-       }
-       assert(!page->slid);
-
-#if CHECK_CS_VALIDATION_BITMAP
-       if ( vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page(page->offset + object->paging_offset), CS_BITMAP_CHECK ) == KERN_SUCCESS) {
-               page->cs_validated = TRUE;
-               page->cs_tainted = FALSE;
-               vm_cs_bitmap_validated++;
+       if (vm_page_validate_cs_fast(page)) {
                return;
        }
-#endif
        vm_object_lock_assert_exclusive(object);
 
        assert(object->code_signed);
-       offset = page->offset;
+       offset = page->vmp_offset;
 
-       busy_page = page->busy;
+       busy_page = page->vmp_busy;
        if (!busy_page) {
                /* keep page busy while we map (and unlock) the VM object */
-               page->busy = TRUE;
+               page->vmp_busy = TRUE;
        }
 
        /*
@@ -6383,19 +6357,14 @@ vm_page_validate_cs(
                                  &koffset,
                                  &need_unmap);
        if (kr != KERN_SUCCESS) {
-               panic("vm_page_validate_cs: could not map page: 0x%x\n", kr);
+               panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
        }
        kaddr = CAST_DOWN(vm_offset_t, koffset);
 
        /* validate the mapped page */
-       vm_page_validate_cs_mapped(page, (const void *) kaddr);
+       vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
 
-#if CHECK_CS_VALIDATION_BITMAP
-       if ( page->cs_validated == TRUE && page->cs_tainted == FALSE ) {
-               vnode_pager_cs_check_validation_bitmap( object->pager, trunc_page( offset + object->paging_offset), CS_BITMAP_SET );
-       }
-#endif
-       assert(page->busy);
+       assert(page->vmp_busy);
        assert(object == VM_PAGE_OBJECT(page));
        vm_object_lock_assert_exclusive(object);
 
@@ -6431,12 +6400,12 @@ vm_page_validate_cs_mapped_chunk(
        *validated_p = FALSE;
        *tainted_p = 0;
 
-       assert(page->busy);
+       assert(page->vmp_busy);
        object = VM_PAGE_OBJECT(page);
        vm_object_lock_assert_exclusive(object);
 
        assert(object->code_signed);
-       offset = page->offset;
+       offset = page->vmp_offset;
 
        if (!object->alive || object->terminating || object->pager == NULL) {
                /*
@@ -6479,3 +6448,98 @@ vm_page_validate_cs_mapped_chunk(
                *tainted_p = tainted;
        }
 }
+
+static void vm_rtfrecord_lock(void) {
+       lck_spin_lock(&vm_rtfr_slock);
+}
+
+static void vm_rtfrecord_unlock(void) {
+       lck_spin_unlock(&vm_rtfr_slock);
+}
+
+unsigned int vmrtfaultinfo_bufsz(void) {
+       return (vmrtf_num_records * sizeof(vm_rtfault_record_t));
+}
+
+#include <kern/backtrace.h>
+
+static void vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault) {
+       uint64_t fend = mach_continuous_time();
+
+       uint64_t cfpc = 0;
+       uint64_t ctid = cthread->thread_id;
+       uint64_t cupid = get_current_unique_pid();
+
+       uintptr_t bpc = 0;
+       uint32_t bfrs = 0;
+       bool u64 = false;
+
+       /* Capture a single-frame backtrace; this extracts just the program
+        * counter at the point of the fault into "bpc", and should perform no
+        * further user stack traversals, thus avoiding copyin()s and further
+        * faults.
+        */
+       int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64);
+
+       if ((btr == 0) && (bfrs > 0)) {
+               cfpc = bpc;
+       }
+
+       assert((fstart != 0) && fend >= fstart);
+       vm_rtfrecord_lock();
+       assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
+
+       vmrtfrs.vmrtf_total++;
+       vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
+
+       cvmr->rtfabstime = fstart;
+       cvmr->rtfduration = fend - fstart;
+       cvmr->rtfaddr = fault_vaddr;
+       cvmr->rtfpc = cfpc;
+       cvmr->rtftype = type_of_fault;
+       cvmr->rtfupid = cupid;
+       cvmr->rtftid = ctid;
+
+       if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
+               vmrtfrs.vmrtfr_curi = 0;
+       }
+
+       vm_rtfrecord_unlock();
+}
+
+int vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv) {
+       vm_rtfault_record_t *cvmrd = vrecords;
+       size_t residue = vrecordsz;
+       int numextracted = 0;
+       boolean_t early_exit = FALSE;
+
+       vm_rtfrecord_lock();
+
+       for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
+
+               if (residue < sizeof(vm_rtfault_record_t)) {
+                       early_exit = TRUE;
+                       break;
+               }
+
+               if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
+#if    DEVELOPMENT || DEBUG
+                       if (isroot == FALSE) {
+                               continue;
+                       }
+#else
+                       continue;
+#endif /* DEVDEBUG */
+               }
+
+               *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
+               cvmrd++;
+               residue -= sizeof(vm_rtfault_record_t);
+               numextracted++;
+       }
+
+       vm_rtfrecord_unlock();
+
+       *vmrtfrv = numextracted;
+       return (early_exit);
+}
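
vmrtf_extract() above is the read side of the ring buffer that vm_record_rtfault() fills during fault handling. A hypothetical consumer (the caller, its allocation, and the error codes are illustrative and not part of this commit) would size a buffer with vmrtfaultinfo_bufsz() and drain the records for one process:

/* Sketch only: copy out the recorded real-time faults for a process.
 * The kalloc()/errno usage and the helper itself are illustrative. */
static int
copy_rtfault_records(uint64_t upid, boolean_t isroot,
                     vm_rtfault_record_t **records_out, int *count_out)
{
        unsigned int bufsz = vmrtfaultinfo_bufsz();
        vm_rtfault_record_t *buf = kalloc(bufsz);
        int nrecords = 0;
        int truncated;

        if (buf == NULL)
                return ENOMEM;

        /* returns non-zero if the buffer filled before all records were seen */
        truncated = vmrtf_extract(upid, isroot, (int)bufsz, buf, &nrecords);

        *records_out = buf;
        *count_out = nrecords;
        return truncated ? ENOSPC : 0;
}
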
index 666b7ef52c23a1ee20d722d3fb3e30397ca10063..1dc0839e7f835c10a4324f4332942fe73158aa0b 100644 (file)
@@ -182,10 +182,7 @@ extern kern_return_t vm_fault_enter(
        boolean_t wired,
        boolean_t change_wiring,
        vm_tag_t  wire_tag,             /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
-       boolean_t no_cache,
-       boolean_t cs_bypass,
-       int       user_tag,
-       int       pmap_options,
+       vm_object_fault_info_t fault_info,
        boolean_t *need_retry,
        int *type_of_fault);
 
@@ -193,6 +190,7 @@ extern vm_offset_t kdp_lightweight_fault(
                vm_map_t map,
                vm_offset_t cur_target_addr);
 
+extern void vm_rtfault_record_init(void);
 
 #endif /* MACH_KERNEL_PRIVATE */
 
index 407cbb916d031f266b625a8ba52ada6d32380f33..d8de270773e0d8ccbad93440550c7b7ca2f24088 100644 (file)
@@ -1068,7 +1068,7 @@ fourk_pager_data_request(
                                      kr);
                        }
                        assert(src_page != VM_PAGE_NULL);
-                       assert(src_page->busy);
+                       assert(src_page->vmp_busy);
 
                        src_page_object = VM_PAGE_OBJECT(src_page);
 
@@ -1164,7 +1164,7 @@ fourk_pager_data_request(
                                       offset, cur_offset,
                                       (sub_page-sub_page_idx)*FOURK_PAGE_SIZE,
                                       src_page_object,
-                                      src_page->offset + offset_in_src_page,
+                                      src_page->vmp_offset + offset_in_src_page,
                                       *(uint64_t *)(dst_vaddr +
                                                     ((sub_page-sub_page_idx) *
                                                      FOURK_PAGE_SIZE)),
@@ -1302,7 +1302,7 @@ done:
                kr = vm_map_remove(kernel_map,
                                   kernel_mapping,
                                   kernel_mapping + (2 * PAGE_SIZE_64),
-                                  VM_MAP_NO_FLAGS);
+                                  VM_MAP_REMOVE_NO_FLAGS);
                assert(kr == KERN_SUCCESS);
                kernel_mapping = 0;
                src_vaddr = 0;
index 326ac01a517b23c380465d7f635f583123967e07..82f7ce30c68a80cfa1f6bd6cf742d6bddf7336a4 100644 (file)
@@ -113,31 +113,12 @@ vm_mem_bootstrap_log(const char *message)
  *     This is done only by the first cpu up.
  */
 
-int pacified_footprint_suspend = 0;
-int pacified_purgeable_iokit = 0;
-
 void
 vm_mem_bootstrap(void)
 {
        vm_offset_t     start, end;
        vm_size_t zsizearg;
        mach_vm_size_t zsize;
-       int pacified;
-
-       pacified = 0;
-       PE_parse_boot_argn("pacified",
-                          &pacified,
-                          sizeof (pacified));
-       if (pacified) {
-               pacified_footprint_suspend = 1;
-               pacified_purgeable_iokit = 1;
-       }
-       PE_parse_boot_argn("pacified_footprint_suspend",
-                          &pacified_footprint_suspend,
-                          sizeof (pacified_footprint_suspend));
-       PE_parse_boot_argn("pacified_purgeable_iokit",
-                          &pacified_purgeable_iokit,
-                          sizeof (pacified_purgeable_iokit));
 
        /*
         *      Initializes resident memory structures.
@@ -198,9 +179,10 @@ vm_mem_bootstrap(void)
 
        if (zsize < ZONE_MAP_MIN)
                zsize = ZONE_MAP_MIN;   /* Clamp to min */
+
 #if defined(__LP64__)
        zsize += zsize >> 1;
-#endif  /* __LP64__ */
+#endif /* __LP64__ */
        if (zsize > sane_size >> 1)
                zsize = sane_size >> 1; /* Clamp to half of RAM max */
 #if !__LP64__
@@ -208,25 +190,6 @@ vm_mem_bootstrap(void)
                zsize = ZONE_MAP_MAX;   /* Clamp to 1.5GB max for K32 */
 #endif /* !__LP64__ */
 
-#if CONFIG_EMBEDDED
-#if defined(__LP64__)
-       {
-       mach_vm_size_t max_zsize;
-
-       /*
-        * because of the limited kernel virtual space for embedded systems,
-        * we need to clamp the size of the zone map being created... replicate
-        * the above calculation for a 1Gbyte, LP64 system and use that as the
-        * maximum size for the zone map
-        */
-       max_zsize = (1024ULL * 1024ULL * 1024ULL) >> 2ULL;
-       max_zsize += max_zsize >> 1;
-
-       if (zsize > max_zsize)
-               zsize = max_zsize;
-       }
-#endif
-#endif
        vm_mem_bootstrap_log("kext_alloc_init");
        kext_alloc_init();
 
@@ -261,6 +224,11 @@ vm_mem_bootstrap(void)
        vm_paging_map_init();
 
        vm_mem_bootstrap_log("vm_mem_bootstrap done");
+
+#ifdef CONFIG_ZCACHE
+       zcache_bootstrap();
+#endif
+       vm_rtfault_record_init();
 }
 
 void
index a14a10db9a0145440bf2887056a0892f55de4c4c..8e53cbd13802039bf99dba3315ee828938eebbc6 100644 (file)
@@ -175,7 +175,7 @@ kmem_alloc_contig(
                                                VM_MAP_PAGE_MASK(map)),
                              vm_map_round_page(map_addr + map_size,
                                                VM_MAP_PAGE_MASK(map)),
-                             0);
+                             VM_MAP_REMOVE_NO_FLAGS);
                vm_object_deallocate(object);
                *addrp = 0;
                return kr;
@@ -186,7 +186,7 @@ kmem_alloc_contig(
                m = pages;
                pages = NEXT_PAGE(m);
                *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
-               m->busy = FALSE;
+               m->vmp_busy = FALSE;
                vm_page_insert(m, object, offset + i);
        }
        vm_object_unlock(object);
@@ -210,7 +210,7 @@ kmem_alloc_contig(
                                                VM_MAP_PAGE_MASK(map)), 
                              vm_map_round_page(map_addr + map_size,
                                                VM_MAP_PAGE_MASK(map)),
-                             0);
+                             VM_MAP_REMOVE_NO_FLAGS);
                vm_object_deallocate(object);
                return kr;
        }
@@ -265,6 +265,7 @@ kernel_memory_allocate(
        vm_page_t               wired_page_list = NULL;
        int                     guard_page_count = 0;
        int                     wired_page_count = 0;
+       int                     page_grab_count = 0;
        int                     i;
        int                     vm_alloc_flags;
        vm_map_kernel_flags_t   vmk_flags;
@@ -294,7 +295,8 @@ kernel_memory_allocate(
         * limit raised to 2GB with 128GB max physical limit,
         * but scaled by installed memory above this
         */
-        if ( !(flags & KMA_VAONLY) && map_size > MAX(1ULL<<31, sane_size/64)) {
+        if (!(flags & (KMA_VAONLY | KMA_PAGEABLE)) &&
+           map_size > MAX(1ULL<<31, sane_size/64)) {
                 return KERN_RESOURCE_SHORTAGE;
         }
 
@@ -340,6 +342,10 @@ kernel_memory_allocate(
        wired_page_count = (int) (fill_size / PAGE_SIZE_64);
        assert(wired_page_count * PAGE_SIZE_64 == fill_size);
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0);
+#endif
+
        for (i = 0; i < guard_page_count; i++) {
                for (;;) {
                        mem = vm_page_grab_guard();
@@ -352,11 +358,11 @@ kernel_memory_allocate(
                        }
                        vm_page_more_fictitious();
                }
-               mem->snext = guard_page_list;
+               mem->vmp_snext = guard_page_list;
                guard_page_list = mem;
        }
 
-       if (! (flags & KMA_VAONLY)) {
+       if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
        for (i = 0; i < wired_page_count; i++) {
                uint64_t        unavailable;
                
@@ -385,8 +391,9 @@ kernel_memory_allocate(
                        }
                        VM_PAGE_WAIT();
                }
+               page_grab_count++;
                if (KMA_ZERO & flags) vm_page_zero_fill(mem);
-               mem->snext = wired_page_list;
+               mem->vmp_snext = wired_page_list;
                wired_page_list = mem;
        }
        }
@@ -424,7 +431,7 @@ kernel_memory_allocate(
        VME_OBJECT_SET(entry, object);
        VME_OFFSET_SET(entry, offset);
        
-       if (object != compressor_object)
+       if (!(flags & (KMA_COMPRESSOR | KMA_PAGEABLE)))
                entry->wired_count++;
 
        if (flags & KMA_PERMANENT)
@@ -443,12 +450,12 @@ kernel_memory_allocate(
                        panic("kernel_memory_allocate: guard_page_list == NULL");
 
                mem = guard_page_list;
-               guard_page_list = mem->snext;
-               mem->snext = NULL;
+               guard_page_list = mem->vmp_snext;
+               mem->vmp_snext = NULL;
 
                vm_page_insert(mem, object, offset + pg_offset);
 
-               mem->busy = FALSE;
+               mem->vmp_busy = FALSE;
                pg_offset += PAGE_SIZE_64;
        }
 
@@ -461,7 +468,7 @@ kernel_memory_allocate(
        }
 #endif
 
-       if (flags & KMA_VAONLY) {
+       if (flags & (KMA_VAONLY | KMA_PAGEABLE)) {
                pg_offset = fill_start + fill_size;
        } else {
        for (pg_offset = fill_start; pg_offset < fill_start + fill_size; pg_offset += PAGE_SIZE_64) {
@@ -469,24 +476,24 @@ kernel_memory_allocate(
                        panic("kernel_memory_allocate: wired_page_list == NULL");
 
                mem = wired_page_list;
-               wired_page_list = mem->snext;
-               mem->snext = NULL;
+               wired_page_list = mem->vmp_snext;
+               mem->vmp_snext = NULL;
 
-               assert(mem->wire_count == 0);
-               assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+               assert(mem->vmp_wire_count == 0);
+               assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
 
-               mem->vm_page_q_state = VM_PAGE_IS_WIRED;
-               mem->wire_count++;
-               if (__improbable(mem->wire_count == 0)) {
+               mem->vmp_q_state = VM_PAGE_IS_WIRED;
+               mem->vmp_wire_count++;
+               if (__improbable(mem->vmp_wire_count == 0)) {
                        panic("kernel_memory_allocate(%p): wire_count overflow",
                              mem);
                }
 
                vm_page_insert_wired(mem, object, offset + pg_offset, tag);
 
-               mem->busy = FALSE;
-               mem->pmapped = TRUE;
-               mem->wpmapped = TRUE;
+               mem->vmp_busy = FALSE;
+               mem->vmp_pmapped = TRUE;
+               mem->vmp_wpmapped = TRUE;
 
                PMAP_ENTER_OPTIONS(kernel_pmap, map_addr + pg_offset, mem,
                                   kma_prot, VM_PROT_NONE, ((flags & KMA_KSTACK) ? VM_MEM_STACK : 0), TRUE,
@@ -517,17 +524,17 @@ kernel_memory_allocate(
                        panic("kernel_memory_allocate: guard_page_list == NULL");
 
                mem = guard_page_list;
-               guard_page_list = mem->snext;
-               mem->snext = NULL;
+               guard_page_list = mem->vmp_snext;
+               mem->vmp_snext = NULL;
 
                vm_page_insert(mem, object, offset + pg_offset);
 
-               mem->busy = FALSE;
+               mem->vmp_busy = FALSE;
        }
        if (guard_page_list || wired_page_list)
                panic("kernel_memory_allocate: non empty list\n");
 
-       if (! (flags & KMA_VAONLY)) {
+       if (!(flags & (KMA_VAONLY | KMA_PAGEABLE))) {
        vm_page_lockspin_queues();
        vm_page_wire_count += wired_page_count;
        vm_page_unlock_queues();
@@ -543,6 +550,10 @@ kernel_memory_allocate(
        else
                vm_object_deallocate(object);
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+#endif
+
        /*
         *      Return the memory, not zeroed.
         */
@@ -556,6 +567,10 @@ out:
        if (wired_page_list)
                vm_page_free_list(wired_page_list, FALSE);
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+#endif
+
        return kr;
 }
 
@@ -573,8 +588,13 @@ kernel_memory_populate(
        vm_page_t               mem;
        vm_page_t               page_list = NULL;
        int                     page_count = 0;
+       int                     page_grab_count = 0;
        int                     i;
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_START, size, 0, 0, 0);
+#endif
+
        page_count = (int) (size / PAGE_SIZE_64);
 
        assert((flags & (KMA_COMPRESSOR|KMA_KOBJECT)) != (KMA_COMPRESSOR|KMA_KOBJECT));
@@ -592,8 +612,9 @@ kernel_memory_populate(
                                
                                VM_PAGE_WAIT();
                        }
+                       page_grab_count++;
                        if (KMA_ZERO & flags) vm_page_zero_fill(mem);
-                       mem->snext = page_list;
+                       mem->vmp_snext = page_list;
                        page_list = mem;
 
                        pg_offset -= PAGE_SIZE_64;
@@ -616,16 +637,16 @@ kernel_memory_populate(
                     pg_offset += PAGE_SIZE_64) {
 
                        mem = page_list;
-                       page_list = mem->snext;
-                       mem->snext = NULL;
+                       page_list = mem->vmp_snext;
+                       mem->vmp_snext = NULL;
 
                        vm_page_insert(mem, object, offset + pg_offset);
-                       assert(mem->busy);
+                       assert(mem->vmp_busy);
 
-                       mem->busy = FALSE;
-                       mem->pmapped = TRUE;
-                       mem->wpmapped = TRUE;
-                       mem->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR;
+                       mem->vmp_busy = FALSE;
+                       mem->vmp_pmapped = TRUE;
+                       mem->vmp_wpmapped = TRUE;
+                       mem->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
                }
                vm_object_unlock(object);
 
@@ -636,6 +657,10 @@ kernel_memory_populate(
                        kasan_notify_address(addr, size);
                }
 #endif
+
+#if DEBUG || DEVELOPMENT
+               VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+#endif
                return KERN_SUCCESS;
        }
 
@@ -660,8 +685,9 @@ kernel_memory_populate(
                        }
                        VM_PAGE_WAIT();
                }
+               page_grab_count++;
                if (KMA_ZERO & flags) vm_page_zero_fill(mem);
-               mem->snext = page_list;
+               mem->vmp_snext = page_list;
                page_list = mem;
        }
        if (flags & KMA_KOBJECT) {
@@ -691,22 +717,21 @@ kernel_memory_populate(
                        panic("kernel_memory_populate: page_list == NULL");
 
                mem = page_list;
-               page_list = mem->snext;
-               mem->snext = NULL;
-
-               assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-               mem->vm_page_q_state = VM_PAGE_IS_WIRED;
-               mem->wire_count++;
-               if (__improbable(mem->wire_count == 0)) {
-                       panic("kernel_memory_populate(%p): wire_count overflow",
-                             mem);
+               page_list = mem->vmp_snext;
+               mem->vmp_snext = NULL;
+
+               assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
+               mem->vmp_q_state = VM_PAGE_IS_WIRED;
+               mem->vmp_wire_count++;
+               if (__improbable(mem->vmp_wire_count == 0)) {
+                       panic("kernel_memory_populate(%p): wire_count overflow", mem);
                }
 
                vm_page_insert_wired(mem, object, offset + pg_offset, tag);
 
-               mem->busy = FALSE;
-               mem->pmapped = TRUE;
-               mem->wpmapped = TRUE;
+               mem->vmp_busy = FALSE;
+               mem->vmp_pmapped = TRUE;
+               mem->vmp_wpmapped = TRUE;
 
                PMAP_ENTER_OPTIONS(kernel_pmap, addr + pg_offset, mem,
                                   VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE,
@@ -732,10 +757,14 @@ kernel_memory_populate(
                        pmap_set_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
                }
        }
-       vm_page_lock_queues();
+       vm_page_lockspin_queues();
        vm_page_wire_count += page_count;
        vm_page_unlock_queues();
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+#endif
+
        if (kernel_object == object) vm_tag_update_size(tag, size);
 
        vm_object_unlock(object);
@@ -753,6 +782,10 @@ out:
        if (page_list)
                vm_page_free_list(page_list, FALSE);
 
+#if DEBUG || DEVELOPMENT
+       VM_DEBUG_CONSTANT_EVENT(vm_kern_request, VM_KERN_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+#endif
+
        return kr;
 }
 
@@ -804,21 +837,21 @@ kernel_memory_depopulate(
 
                assert(mem);
                
-               if (mem->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR)
+               if (mem->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR)
                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
 
-               mem->busy = TRUE;
+               mem->vmp_busy = TRUE;
 
-               assert(mem->tabled);
+               assert(mem->vmp_tabled);
                vm_page_remove(mem, TRUE);
-               assert(mem->busy);
+               assert(mem->vmp_busy);
 
-               assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
-               assert((mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
-                      (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q));
+               assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
+               assert((mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
+                      (mem->vmp_q_state == VM_PAGE_NOT_ON_Q));
 
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
-               mem->snext = local_freeq;
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
+               mem->vmp_snext = local_freeq;
                local_freeq = mem;
        }
        vm_object_unlock(object);
@@ -977,7 +1010,7 @@ kmem_realloc(
        kr = vm_map_wire_kernel(map, newmapaddr, newmapaddr + newmapsize,
                         VM_PROT_DEFAULT, tag, FALSE);
        if (KERN_SUCCESS != kr) {
-               vm_map_remove(map, newmapaddr, newmapaddr + newmapsize, 0);
+               vm_map_remove(map, newmapaddr, newmapaddr + newmapsize, VM_MAP_REMOVE_NO_FLAGS);
                vm_object_lock(object);
                for(offset = oldsize; offset < newmapsize; offset += PAGE_SIZE) {
                        if ((mem = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
@@ -1167,7 +1200,7 @@ kmem_alloc_pages(
                VM_PAGE_WAIT();
                vm_object_lock(object);
            }
-           mem->busy = FALSE;
+           mem->vmp_busy = FALSE;
 
            alloc_size -= PAGE_SIZE;
            offset += PAGE_SIZE;
@@ -1248,7 +1281,8 @@ kmem_suballoc(
                /*
                 * See comment preceding vm_map_submap().
                 */
-               vm_map_remove(parent, map_addr, map_addr + map_size, VM_MAP_NO_FLAGS);
+               vm_map_remove(parent, map_addr, map_addr + map_size,
+                             VM_MAP_REMOVE_NO_FLAGS);
                vm_map_deallocate(map); /* also removes ref to pmap */
                vm_object_deallocate(vm_submap_object);
                return (kr);
index 8cab89ce4e6202491a272d1513049abfc12eeb3a..d63523e08656e09bc1b583f36045f4268690a234 100644 (file)
 #ifndef        _VM_VM_KERN_H_
 #define _VM_VM_KERN_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #include <mach/mach_types.h>
 #include <mach/boolean.h>
 #include <mach/kern_return.h>
@@ -98,6 +102,7 @@ extern kern_return_t kernel_memory_allocate(
 #define KMA_COMPRESSOR 0x400   /* Pages belonging to the compressor are not on the paging queues, nor are they counted as wired. */
 #define KMA_ATOMIC     0x800
 #define KMA_ZERO       0x1000
+#define KMA_PAGEABLE   0x2000
 
 extern kern_return_t kmem_alloc(
                                vm_map_t    map,
@@ -326,6 +331,7 @@ extern kern_return_t mach_vm_map_kernel(
        mach_vm_size_t  initial_size,
        mach_vm_offset_t        mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_object_offset_t      offset,
@@ -341,6 +347,7 @@ extern kern_return_t vm_map_kernel(
        vm_size_t               size,
        vm_offset_t             mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_offset_t             offset,
@@ -383,6 +390,7 @@ extern kern_return_t vm_map_64_kernel(
        vm_size_t               size,
        vm_offset_t             mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_object_offset_t      offset,
@@ -455,4 +463,9 @@ extern void vm_kernel_addrhash_external(
 extern void vm_init_before_launchd(void);
 
 #endif /* KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* _VM_VM_KERN_H_ */
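
KMA_PAGEABLE is the one genuinely new flag in this header. Judging from the kernel_memory_allocate() changes earlier in this commit, it requests a kernel mapping whose pages are neither grabbed up front nor wired, so they are populated on first touch. A hedged sketch of a caller, assuming the existing six-argument kernel_memory_allocate() signature (map, address, size, mask, flags, tag); the size and tag are illustrative:

/* Sketch only: ask for a pageable kernel range; nothing is wired or
 * populated until the memory is actually touched. */
vm_offset_t addr = 0;
kern_return_t kr;

kr = kernel_memory_allocate(kernel_map, &addr, 16 * PAGE_SIZE,
                            0,              /* mask: no alignment constraint */
                            KMA_PAGEABLE,   /* flags */
                            VM_KERN_MEMORY_KALLOC);
if (kr != KERN_SUCCESS) {
        /* allocation failed; addr is not valid */
}
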
index 573c2a34e8ba6ffd828e65758ff42831e01c3687..ad60f1693598496387fd7198ee02cd27e96ad033 100644 (file)
 #include <kern/assert.h>
 #include <kern/backtrace.h>
 #include <kern/counters.h>
+#include <kern/exc_guard.h>
 #include <kern/kalloc.h>
 #include <kern/zalloc.h>
 
 #include <vm/cpm.h>
+#include <vm/vm_compressor.h>
 #include <vm/vm_compressor_pager.h>
 #include <vm/vm_init.h>
 #include <vm/vm_fault.h>
@@ -95,6 +97,7 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <vm/vm_pageout.h>
+#include <vm/pmap.h>
 #include <vm/vm_kern.h>
 #include <ipc/ipc_port.h>
 #include <kern/sched_prim.h>
 
 #include <san/kasan.h>
 
+#include <sys/codesign.h>
+#include <libkern/section_keywords.h>
+#if DEVELOPMENT || DEBUG
+extern int proc_selfcsflags(void);
+#if CONFIG_EMBEDDED
+extern int panic_on_unsigned_execute;
+#endif /* CONFIG_EMBEDDED */
+#endif /* DEVELOPMENT || DEBUG */
+
 #if __arm64__
-extern int fourk_binary_compatibility_unsafe;
-extern int fourk_binary_compatibility_allow_wx;
+extern const int fourk_binary_compatibility_unsafe;
+extern const int fourk_binary_compatibility_allow_wx;
 #endif /* __arm64__ */
 extern int proc_selfpid(void);
 extern char *proc_name_address(void *p);
@@ -126,8 +138,8 @@ int vm_map_debug_apple_protect = 0;
 int vm_map_debug_fourk = 0;
 #endif /* VM_MAP_DEBUG_FOURK */
 
-int vm_map_executable_immutable = 0;
-int vm_map_executable_immutable_no_log = 0;
+SECURITY_READ_ONLY_LATE(int) vm_map_executable_immutable = 1;
+int vm_map_executable_immutable_verbose = 0;
 
 extern u_int32_t random(void); /* from <libkern/libkern.h> */
 /* Internal prototypes
@@ -180,6 +192,11 @@ static kern_return_t       vm_map_delete(
        int             flags,
        vm_map_t        zap_map);
 
+static void            vm_map_copy_insert(
+       vm_map_t        map,
+       vm_map_entry_t  after_where,
+       vm_map_copy_t   copy);
+
 static kern_return_t   vm_map_copy_overwrite_unaligned(
        vm_map_t        dst_map,
        vm_map_entry_t  entry,
@@ -317,6 +334,9 @@ static kern_return_t        vm_map_pageout(
        vm_map_offset_t end);
 #endif /* MACH_ASSERT */
 
+static void            vm_map_corpse_footprint_destroy(
+       vm_map_t        map);
+
 pid_t find_largest_process_vm_map_entries(void);
 
 /*
@@ -329,6 +349,34 @@ pid_t find_largest_process_vm_map_entries(void);
  * vm_map_copyout.
  */
 
+#if CONFIG_EMBEDDED
+
+/*
+ * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
+ * But for security reasons on embedded platforms, we don't want the
+ * new mapping to be "used for jit", so we always reset the flag here.
+ * Same for "pmap_cs_associated".
+ */
+#define VM_MAP_ENTRY_COPY_CODE_SIGNING(NEW,OLD)                \
+MACRO_BEGIN                                            \
+       (NEW)->used_for_jit = FALSE;                    \
+       (NEW)->pmap_cs_associated = FALSE;                              \
+MACRO_END
+
+#else /* CONFIG_EMBEDDED */
+
+/*
+ * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
+ * On macOS, the new mapping can be "used for jit".
+ */
+#define VM_MAP_ENTRY_COPY_CODE_SIGNING(NEW,OLD)                                \
+MACRO_BEGIN                                                            \
+       assert((NEW)->used_for_jit == (OLD)->used_for_jit);             \
+       assert((NEW)->pmap_cs_associated == FALSE);                             \
+MACRO_END
+
+#endif /* CONFIG_EMBEDDED */
+
 #define vm_map_entry_copy(NEW,OLD)     \
 MACRO_BEGIN                            \
 boolean_t _vmec_reserved = (NEW)->from_reserved_zone;  \
@@ -339,7 +387,7 @@ boolean_t _vmec_reserved = (NEW)->from_reserved_zone;       \
        (NEW)->wired_count = 0;         \
        (NEW)->user_wired_count = 0;    \
        (NEW)->permanent = FALSE;       \
-       (NEW)->used_for_jit = FALSE;    \
+       VM_MAP_ENTRY_COPY_CODE_SIGNING((NEW),(OLD));    \
        (NEW)->from_reserved_zone = _vmec_reserved;     \
        if ((NEW)->iokit_acct) {                        \
             assertf(!(NEW)->use_pmap, "old %p new %p\n", (OLD), (NEW)); \
@@ -708,7 +756,7 @@ vm_map_apple_protected(
                                             vm_flags,
                                             vmk_flags,
                                             VM_KERN_MEMORY_NONE,
-                                            (ipc_port_t) unprotected_mem_obj,
+                                            (ipc_port_t)(uintptr_t) unprotected_mem_obj,
                                             0,
                                             TRUE,
                                             tmp_entry.protection,
@@ -771,6 +819,14 @@ lck_grp_attr_t     vm_map_lck_grp_attr;
 lck_attr_t             vm_map_lck_attr;
 lck_attr_t             vm_map_lck_rw_attr;
 
+#if CONFIG_EMBEDDED
+int malloc_no_cow = 1;
+#define VM_PROTECT_WX_FAIL 0
+#else /* CONFIG_EMBEDDED */
+int malloc_no_cow = 0;
+#define VM_PROTECT_WX_FAIL 1
+#endif /* CONFIG_EMBEDDED */
+uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
 
 /*
  *     vm_map_init:
@@ -893,9 +949,29 @@ vm_map_init(
        PE_parse_boot_argn("vm_map_executable_immutable",
                           &vm_map_executable_immutable,
                           sizeof(vm_map_executable_immutable));
-       PE_parse_boot_argn("vm_map_executable_immutable_no_log",
-                          &vm_map_executable_immutable_no_log,
-                          sizeof(vm_map_executable_immutable_no_log));
+       PE_parse_boot_argn("vm_map_executable_immutable_verbose",
+                          &vm_map_executable_immutable_verbose,
+                          sizeof(vm_map_executable_immutable_verbose));
+
+       PE_parse_boot_argn("malloc_no_cow",
+                          &malloc_no_cow,
+                          sizeof(malloc_no_cow));
+       if (malloc_no_cow) {
+               vm_memory_malloc_no_cow_mask = 0ULL;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
+//             vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
+//             vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
+               vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
+//             vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
+               PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
+                                  &vm_memory_malloc_no_cow_mask,
+                                  sizeof(vm_memory_malloc_no_cow_mask));
+       }
 }
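The malloc_no_cow setup above boils down to building a per-tag bit mask and allowing a boot-arg to replace it wholesale. A small sketch of that idea (the tag values and parse_boot_arg() below are placeholders, not the kernel's VM_MEMORY_* constants or PE_parse_boot_argn()):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    enum { TAG_MALLOC = 1, TAG_MALLOC_SMALL = 2, TAG_MALLOC_LARGE = 3 };

    /* placeholder for a boot-arg lookup; returns false when the arg is absent */
    static bool parse_boot_arg(const char *name, uint64_t *out)
    {
        (void)name; (void)out;
        return false;
    }

    int main(void)
    {
        uint64_t no_cow_mask = 0;

        /* default set of tags that should not use copy-on-write */
        no_cow_mask |= 1ULL << TAG_MALLOC;
        no_cow_mask |= 1ULL << TAG_MALLOC_SMALL;
        no_cow_mask |= 1ULL << TAG_MALLOC_LARGE;

        /* an explicit boot-arg overrides the default mask entirely */
        parse_boot_arg("vm_memory_malloc_no_cow_mask", &no_cow_mask);

        int alias = TAG_MALLOC_SMALL;
        printf("no-COW for alias %d: %llu\n",
               alias, (unsigned long long)((no_cow_mask >> alias) & 1));
        return 0;
    }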
 
 void
@@ -954,7 +1030,7 @@ vm_map_disable_hole_optimization(vm_map_t map)
 
        if (map->holelistenabled) {
 
-               head_entry = hole_entry = (vm_map_entry_t) map->holes_list;
+               head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
 
                while (hole_entry != NULL) {
 
@@ -994,15 +1070,35 @@ vm_kernel_map_is_kernel(vm_map_t map) {
 
 vm_map_t
 vm_map_create(
-       pmap_t                  pmap,
+       pmap_t          pmap,
        vm_map_offset_t min,
        vm_map_offset_t max,
-       boolean_t               pageable)
+       boolean_t       pageable)
+{
+       int options;
+
+       options = 0;
+       if (pageable) {
+               options |= VM_MAP_CREATE_PAGEABLE;
+       }
+       return vm_map_create_options(pmap, min, max, options);
+}
+
+vm_map_t
+vm_map_create_options(
+       pmap_t          pmap,
+       vm_map_offset_t min,
+       vm_map_offset_t max,
+       int             options)
 {
-       static int              color_seed = 0;
        vm_map_t        result;
        struct vm_map_links     *hole_entry = NULL;
 
+       if (options & ~(VM_MAP_CREATE_ALL_OPTIONS)) {
+               /* unknown option */
+               return VM_MAP_NULL;
+       }
+
        result = (vm_map_t) zalloc(vm_map_zone);
        if (result == VM_MAP_NULL)
                panic("vm_map_create");
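vm_map_create() is kept as a thin shim so existing callers keep their boolean interface while new callers pass option bits that are validated up front. The same shape in a tiny self-contained form (all names here are made up for illustration):

    #include <stddef.h>
    #include <stdio.h>

    #define CREATE_PAGEABLE          0x1
    #define CREATE_CORPSE_FOOTPRINT  0x2
    #define CREATE_ALL_OPTIONS       (CREATE_PAGEABLE | CREATE_CORPSE_FOOTPRINT)

    struct map { int pageable; };
    static struct map storage;

    static struct map *map_create_options(int options)
    {
        if (options & ~CREATE_ALL_OPTIONS)
            return NULL;                          /* unknown option: refuse */
        storage.pageable = (options & CREATE_PAGEABLE) ? 1 : 0;
        return &storage;
    }

    /* legacy boolean interface, now just a wrapper */
    static struct map *map_create(int pageable)
    {
        return map_create_options(pageable ? CREATE_PAGEABLE : 0);
    }

    int main(void)
    {
        printf("pageable create ok: %d\n", map_create(1) != NULL);
        printf("unknown option rejected: %d\n", map_create_options(0x80) == NULL);
        return 0;
    }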
@@ -1010,7 +1106,11 @@ vm_map_create(
        vm_map_first_entry(result) = vm_map_to_entry(result);
        vm_map_last_entry(result)  = vm_map_to_entry(result);
        result->hdr.nentries = 0;
-       result->hdr.entries_pageable = pageable;
+       if (options & VM_MAP_CREATE_PAGEABLE) {
+               result->hdr.entries_pageable = TRUE;
+       } else {
+               result->hdr.entries_pageable = FALSE;
+       }
 
        vm_map_store_init( &(result->hdr) );
 
@@ -1022,7 +1122,7 @@ vm_map_create(
 #if __x86_64__
        result->vmmap_high_start = 0;
 #endif /* __x86_64__ */
-       result->ref_count = 1;
+       result->map_refcnt = 1;
 #if    TASK_SWAPPER
        result->res_count = 1;
        result->sw_state = MAP_SW_IN;
@@ -1042,25 +1142,30 @@ vm_map_create(
        result->highest_entry_end = 0;
        result->first_free = vm_map_to_entry(result);
        result->hint = vm_map_to_entry(result);
-       result->color_rr = (color_seed++) & vm_color_mask;
        result->jit_entry_exists = FALSE;
 
-       if (vm_map_supports_hole_optimization) {
-               hole_entry = zalloc(vm_map_holes_zone);
+       /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
+       if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
+               result->has_corpse_footprint = TRUE;
+               result->holelistenabled = FALSE;
+               result->vmmap_corpse_footprint = NULL;
+       } else {
+               result->has_corpse_footprint = FALSE;
+               if (vm_map_supports_hole_optimization) {
+                       hole_entry = zalloc(vm_map_holes_zone);
 
-               hole_entry->start = min;
+                       hole_entry->start = min;
 #if defined(__arm__) || defined(__arm64__)
-               hole_entry->end = result->max_offset;
+                       hole_entry->end = result->max_offset;
 #else
-               hole_entry->end = (max > (vm_map_offset_t)MACH_VM_MAX_ADDRESS) ? max : (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
+                       hole_entry->end = (max > (vm_map_offset_t)MACH_VM_MAX_ADDRESS) ? max : (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
 #endif
-               result->holes_list = result->hole_hint = hole_entry;
-               hole_entry->prev = hole_entry->next = (vm_map_entry_t) hole_entry;
-               result->holelistenabled = TRUE;
-
-       } else {
-
-               result->holelistenabled = FALSE;
+                       result->holes_list = result->hole_hint = hole_entry;
+                       hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
+                       result->holelistenabled = TRUE;
+               } else {
+                       result->holelistenabled = FALSE;
+               }
        }
 
        vm_map_lock_init(result);
@@ -1190,7 +1295,7 @@ void vm_map_res_reference(vm_map_t map)
 {
        /* assert map is locked */
        assert(map->res_count >= 0);
-       assert(map->ref_count >= map->res_count);
+       assert(map->map_refcnt >= map->res_count);
        if (map->res_count == 0) {
                lck_mtx_unlock(&map->s_lock);
                vm_map_lock(map);
@@ -1215,8 +1320,8 @@ void vm_map_reference_swap(vm_map_t map)
        assert(map != VM_MAP_NULL);
        lck_mtx_lock(&map->s_lock);
        assert(map->res_count >= 0);
-       assert(map->ref_count >= map->res_count);
-       map->ref_count++;
+       assert(map->map_refcnt >= map->res_count);
+       map->map_refcnt++;
        vm_map_res_reference(map);
        lck_mtx_unlock(&map->s_lock);
 }
@@ -1241,7 +1346,7 @@ void vm_map_res_deallocate(vm_map_t map)
                vm_map_unlock(map);
                lck_mtx_lock(&map->s_lock);
        }
-       assert(map->ref_count >= map->res_count);
+       assert(map->map_refcnt >= map->res_count);
 }
 #endif /* MACH_ASSERT && TASK_SWAPPER */
 
@@ -1261,6 +1366,8 @@ vm_map_destroy(
        flags |= VM_MAP_REMOVE_NO_UNNESTING;
        /* final cleanup: ok to remove immutable mappings */
        flags |= VM_MAP_REMOVE_IMMUTABLE;
+       /* final cleanup: allow gaps in range */
+       flags |= VM_MAP_REMOVE_GAPS_OK;
 
        /* clean up regular map entries */
        (void) vm_map_delete(map, map->min_offset, map->max_offset,
@@ -1272,6 +1379,8 @@ vm_map_destroy(
 #endif /* !__arm__ && !__arm64__ */
 
        vm_map_disable_hole_optimization(map);
+       vm_map_corpse_footprint_destroy(map);
+
        vm_map_unlock(map);
 
        assert(map->hdr.nentries == 0);
@@ -1590,7 +1699,7 @@ vm_map_find_space(
                VM_MAP_HIGHEST_ENTRY(map, entry, start);
        } else {
                if (map->holelistenabled) {
-                       hole_entry = (vm_map_entry_t)map->holes_list;
+                       hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
 
                        if (hole_entry == NULL) {
                                /*
@@ -1638,7 +1747,9 @@ vm_map_find_space(
                        return(KERN_NO_SPACE);
                }
                start = end;
+               assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
                end += size;
+               assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
 
                if ((end > map->max_offset) || (end < start)) {
                        vm_map_entry_dispose(map, new_entry);
@@ -1675,7 +1786,7 @@ vm_map_find_space(
                entry = next;
 
                if (map->holelistenabled) {
-                       if (entry == (vm_map_entry_t) map->holes_list) {
+                       if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                                /*
                                 * Wrapped around
                                 */
@@ -1748,6 +1859,7 @@ vm_map_find_space(
        }
 
        new_entry->used_for_jit = FALSE;
+       new_entry->pmap_cs_associated = FALSE;
        new_entry->zero_wired_pages = FALSE;
        new_entry->iokit_acct = FALSE;
        new_entry->vme_resilient_codesign = FALSE;
@@ -1763,7 +1875,7 @@ vm_map_find_space(
         *      Insert the new entry into the list
         */
 
-       vm_map_store_entry_link(map, entry, new_entry);
+       vm_map_store_entry_link(map, entry, new_entry, VM_MAP_KERNEL_FLAGS_NONE);
 
        map->size += size;
 
@@ -1804,6 +1916,7 @@ vm_map_pmap_enter(
 {
        int                     type_of_fault;
        kern_return_t           kr;
+       struct vm_object_fault_info fault_info = {};
 
        if(map->pmap == 0)
                return;
@@ -1826,8 +1939,8 @@ vm_map_pmap_enter(
 
                m = vm_page_lookup(object, offset);
 
-               if (m == VM_PAGE_NULL || m->busy || m->fictitious ||
-                   (m->unusual && ( m->error || m->restart || m->absent))) {
+               if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
+                   (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) {
                        vm_object_unlock(object);
                        return;
                }
@@ -1838,16 +1951,14 @@ vm_map_pmap_enter(
                               map, (unsigned long long)addr, object, (unsigned long long)offset);
                }
                type_of_fault = DBG_CACHE_HIT_FAULT;
-               kr = vm_fault_enter(m, map->pmap, addr, protection, protection,
-                                                   VM_PAGE_WIRED(m),
-                                                   FALSE, /* change_wiring */
-                                                   VM_KERN_MEMORY_NONE, /* tag - not wiring */
-                                                   FALSE, /* no_cache */
-                                                   FALSE, /* cs_bypass */
-                                                   0,     /* XXX need user tag / alias? */
-                                                   0,     /* pmap_options */
-                                                   NULL,  /* need_retry */
-                                                   &type_of_fault);
+               kr = vm_fault_enter(m, map->pmap,
+                                   addr, protection, protection,
+                                   VM_PAGE_WIRED(m),
+                                   FALSE, /* change_wiring */
+                                   VM_KERN_MEMORY_NONE, /* tag - not wiring */
+                                   &fault_info,
+                                   NULL,  /* need_retry */
+                                   &type_of_fault);
 
                vm_object_unlock(object);
 
@@ -1944,6 +2055,19 @@ vm_map_random_address_for_size(
        return kr;
 }
 
+static boolean_t
+vm_memory_malloc_no_cow(
+       int alias)
+{
+       uint64_t alias_mask;
+
+       alias_mask = 1ULL << alias;
+       if (alias_mask & vm_memory_malloc_no_cow_mask) {
+               return TRUE;
+       }
+       return FALSE;
+}
+
 /*
  *     Routine:        vm_map_enter
  *
@@ -1977,6 +2101,7 @@ vm_map_enter(
        vm_map_offset_t         start, tmp_start, tmp_offset;
        vm_map_offset_t         end, tmp_end;
        vm_map_offset_t         tmp2_start, tmp2_end;
+       vm_map_offset_t         desired_empty_end;
        vm_map_offset_t         step;
        kern_return_t           result = KERN_SUCCESS;
        vm_map_t                zap_old_map = VM_MAP_NULL;
@@ -2042,16 +2167,36 @@ vm_map_enter(
        }
 
 
-#if CONFIG_EMBEDDED
-       if (cur_protection & VM_PROT_WRITE){
-               if ((cur_protection & VM_PROT_EXECUTE) && !entry_for_jit){
-                       printf("EMBEDDED: %s: curprot cannot be write+execute. "
-                              "turning off execute\n",
-                              __FUNCTION__);
-                       cur_protection &= ~VM_PROT_EXECUTE;
-               }
+       if ((cur_protection & VM_PROT_WRITE) &&
+           (cur_protection & VM_PROT_EXECUTE) &&
+#if !CONFIG_EMBEDDED
+           map != kernel_map &&
+           (cs_process_global_enforcement() ||
+            (vmk_flags.vmkf_cs_enforcement_override
+             ? vmk_flags.vmkf_cs_enforcement
+             : cs_process_enforcement(NULL))) &&
+#endif /* !CONFIG_EMBEDDED */
+           !entry_for_jit) {
+               DTRACE_VM3(cs_wx,
+                          uint64_t, 0,
+                          uint64_t, 0,
+                          vm_prot_t, cur_protection);
+               printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
+#if VM_PROTECT_WX_FAIL
+                      "failing\n",
+#else /* VM_PROTECT_WX_FAIL */
+                      "turning off execute\n",
+#endif /* VM_PROTECT_WX_FAIL */
+                      proc_selfpid(),
+                      (current_task()->bsd_info
+                       ? proc_name_address(current_task()->bsd_info)
+                       : "?"),
+                      __FUNCTION__);
+               cur_protection &= ~VM_PROT_EXECUTE;
+#if VM_PROTECT_WX_FAIL
+               return KERN_PROTECTION_FAILURE;
+#endif /* VM_PROTECT_WX_FAIL */
        }
-#endif /* CONFIG_EMBEDDED */
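The rewritten check above applies one W^X policy on all platforms: a write+execute request that is not for JIT either loses VM_PROT_EXECUTE or fails outright, depending on VM_PROTECT_WX_FAIL, and on macOS it only fires when code-signing enforcement is active. A condensed sketch of that decision (the constants and the cs_enforced flag are stand-ins, not the kernel's interfaces):

    #include <stdbool.h>
    #include <stdio.h>

    #define PROT_WRITE   0x2
    #define PROT_EXECUTE 0x4
    #define WX_FAIL      1        /* embedded-style: refuse instead of downgrading */

    /* returns 0 on success, -1 when the request must be rejected */
    static int check_wx(int *prot, bool for_jit, bool cs_enforced)
    {
        if ((*prot & PROT_WRITE) && (*prot & PROT_EXECUTE) &&
            cs_enforced && !for_jit) {
            *prot &= ~PROT_EXECUTE;   /* either downgrade the protection ... */
    #if WX_FAIL
            return -1;                /* ... or reject the mapping entirely  */
    #endif
        }
        return 0;
    }

    int main(void)
    {
        int prot = PROT_WRITE | PROT_EXECUTE;
        printf("result=%d prot=0x%x\n", check_wx(&prot, false, true), prot);
        return 0;
    }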
 
        /*
         * If the task has requested executable lockdown,
@@ -2190,11 +2335,13 @@ StartAgain: ;
                map_locked = TRUE;
 
                if (entry_for_jit) {
+#if CONFIG_EMBEDDED
                        if (map->jit_entry_exists) {
                                result = KERN_INVALID_ARGUMENT;
                                goto BailOut;
                        }
                        random_address = TRUE;
+#endif /* CONFIG_EMBEDDED */
                }
 
                if (random_address) {
@@ -2236,7 +2383,7 @@ StartAgain: ;
                } else {
 
                        if (map->holelistenabled) {
-                               hole_entry = (vm_map_entry_t)map->holes_list;
+                               hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
 
                                if (hole_entry == NULL) {
                                        /*
@@ -2261,7 +2408,7 @@ StartAgain: ;
                                                }
                                                hole_entry = hole_entry->vme_next;
 
-                                       } while (hole_entry != (vm_map_entry_t) map->holes_list);
+                                       } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list));
 
                                        if (found_hole == FALSE) {
                                                result = KERN_NO_SPACE;
@@ -2338,7 +2485,10 @@ StartAgain: ;
                                                   VM_MAP_PAGE_MASK(map)));
                        end += size;
 
-                       if ((end > effective_max_offset) || (end < start)) {
+                       /* We want an entire page of empty space, but don't increase the allocation size. */
+                       desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map));
+
+                       if ((desired_empty_end > effective_max_offset) || (desired_empty_end < start)) {
                                if (map->wait_for_space) {
                                        assert(!keep_map_locked);
                                        if (size <= (effective_max_offset -
@@ -2357,7 +2507,7 @@ StartAgain: ;
                        next = entry->vme_next;
 
                        if (map->holelistenabled) {
-                               if (entry->vme_end >= end)
+                               if (entry->vme_end >= desired_empty_end)
                                        break;
                        } else {
                                /*
@@ -2372,7 +2522,7 @@ StartAgain: ;
                                if (next == vm_map_to_entry(map))
                                        break;
 
-                               if (next->vme_start >= end)
+                               if (next->vme_start >= desired_empty_end)
                                        break;
                        }
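The desired_empty_end logic above asks for a whole map page of free space past the allocation without actually growing it: end stays at start + size, and only the fit test uses the rounded-up value. A small model of that arithmetic (the 16K page mask is just an example value):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_MASK 0x3FFFULL                    /* 16K map pages, for illustration */

    static uint64_t round_page(uint64_t a) { return (a + PAGE_MASK) & ~PAGE_MASK; }

    int main(void)
    {
        uint64_t start = 0x100000, size = 0x1800;  /* a 6K request            */
        uint64_t end = start + size;               /* mapping keeps this size */
        uint64_t desired_empty_end = round_page(end);
        uint64_t next_entry_start = 0x104000;      /* start of the next entry */

        int fits = (desired_empty_end <= next_entry_start);
        printf("end=0x%llx empty_end=0x%llx fits=%d\n",
               (unsigned long long)end,
               (unsigned long long)desired_empty_end, fits);
        return 0;
    }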
 
@@ -2383,7 +2533,7 @@ StartAgain: ;
                        entry = next;
 
                        if (map->holelistenabled) {
-                               if (entry == (vm_map_entry_t) map->holes_list) {
+                               if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                                        /*
                                         * Wrapped around
                                         */
@@ -2557,12 +2707,14 @@ StartAgain: ;
         *      semantics.
         */
 
-       if (purgable || entry_for_jit) {
+       if (purgable ||
+           entry_for_jit ||
+           vm_memory_malloc_no_cow(user_alias)) {
                if (object == VM_OBJECT_NULL) {
 
                        object = vm_object_allocate(size);
                        object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
-                       object->true_share = TRUE;
+                       object->true_share = FALSE;
                        if (purgable) {
                                task_t owner;
                                object->purgable = VM_PURGABLE_NONVOLATILE;
@@ -2580,7 +2732,7 @@ StartAgain: ;
                                } else {
                                        owner = current_task();
                                }
-                               assert(object->vo_purgeable_owner == NULL);
+                               assert(object->vo_owner == NULL);
                                assert(object->resident_page_count == 0);
                                assert(object->wired_page_count == 0);
                                vm_object_lock(object);
@@ -2616,6 +2768,7 @@ StartAgain: ;
                   (!entry->map_aligned || !clear_map_aligned) &&
                   (!entry->zero_wired_pages) &&
                   (!entry->used_for_jit && !entry_for_jit) &&
+                  (!entry->pmap_cs_associated) &&
                   (entry->iokit_acct == iokit_acct) &&
                   (!entry->vme_resilient_codesign) &&
                   (!entry->vme_resilient_media) &&
@@ -2715,12 +2868,13 @@ StartAgain: ;
                        assert(!new_entry->iokit_acct);
                        if (!is_submap &&
                            object != VM_OBJECT_NULL &&
-                           object->purgable != VM_PURGABLE_DENY) {
+                           (object->purgable != VM_PURGABLE_DENY ||
+                            object->vo_ledger_tag)) {
                                assert(new_entry->use_pmap);
                                assert(!new_entry->iokit_acct);
                                /*
                                 * Turn off pmap accounting since
-                                * purgeable objects have their
+                                * purgeable (or tagged) objects have their
                                 * own ledgers.
                                 */
                                new_entry->use_pmap = FALSE;
@@ -2991,7 +3145,8 @@ BailOut:
                                        vm_map_store_entry_unlink(zap_old_map,
                                                            entry2);
                                        zap_old_map->size -= entry_size;
-                                       vm_map_store_entry_link(map, entry1, entry2);
+                                       vm_map_store_entry_link(map, entry1, entry2,
+                                                               VM_MAP_KERNEL_FLAGS_NONE);
                                        map->size += entry_size;
                                        entry1 = entry2;
                                }
@@ -3089,17 +3244,26 @@ vm_map_enter_fourk(
                return KERN_NOT_SUPPORTED;
        }
 
-#if CONFIG_EMBEDDED
-       if (cur_protection & VM_PROT_WRITE) {
-               if ((cur_protection & VM_PROT_EXECUTE) &&
-                   !entry_for_jit) {
-                       printf("EMBEDDED: %s: curprot cannot be write+execute. "
-                              "turning off execute\n",
-                              __FUNCTION__);
-                       cur_protection &= ~VM_PROT_EXECUTE;
-               }
+       if ((cur_protection & VM_PROT_WRITE) &&
+           (cur_protection & VM_PROT_EXECUTE) &&
+#if !CONFIG_EMBEDDED
+           map != kernel_map &&
+           cs_process_enforcement(NULL) &&
+#endif /* !CONFIG_EMBEDDED */
+           !entry_for_jit) {
+               DTRACE_VM3(cs_wx,
+                          uint64_t, 0,
+                          uint64_t, 0,
+                          vm_prot_t, cur_protection);
+               printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
+                      "turning off execute\n",
+                      proc_selfpid(),
+                      (current_task()->bsd_info
+                       ? proc_name_address(current_task()->bsd_info)
+                       : "?"),
+                      __FUNCTION__);
+               cur_protection &= ~VM_PROT_EXECUTE;
        }
-#endif /* CONFIG_EMBEDDED */
 
        /*
         * If the task has requested executable lockdown,
@@ -3611,7 +3775,8 @@ BailOut:
                                        vm_map_store_entry_unlink(zap_old_map,
                                                            entry2);
                                        zap_old_map->size -= entry_size;
-                                       vm_map_store_entry_link(map, entry1, entry2);
+                                       vm_map_store_entry_link(map, entry1, entry2,
+                                                               VM_MAP_KERNEL_FLAGS_NONE);
                                        map->size += entry_size;
                                        entry1 = entry2;
                                }
@@ -3828,7 +3993,7 @@ vm_map_enter_mem_object_helper(
                                              flags,
                                              vmk_flags,
                                              tag,
-                                             (vm_object_t) submap,
+                                             (vm_object_t)(uintptr_t) submap,
                                              offset,
                                              copy,
                                              cur_protection,
@@ -3970,7 +4135,7 @@ vm_map_enter_mem_object_helper(
                                        vm_map_lock(copy_submap);
                                        vm_map_reference(copy_submap);
                                        vm_map_unlock(copy_submap);
-                                       copy_object = (vm_object_t) copy_submap;
+                                       copy_object = (vm_object_t)(uintptr_t) copy_submap;
                                } else if (!copy &&
                                           copy_object != VM_OBJECT_NULL &&
                                           (copy_entry->needs_copy ||
@@ -4039,6 +4204,11 @@ vm_map_enter_mem_object_helper(
                                         */
                                        assert(!copy_entry->needs_copy);
                                }
+#if !CONFIG_EMBEDDED
+                               if (copy_entry->used_for_jit) {
+                                       vmk_remap_flags.vmkf_map_jit = TRUE;
+                               }
+#endif /* !CONFIG_EMBEDDED */
                                kr = vm_map_enter(target_map,
                                                  &copy_addr,
                                                  copy_size,
@@ -4081,7 +4251,7 @@ vm_map_enter_mem_object_helper(
                                        vm_map_remove(target_map,
                                                      map_addr,
                                                      map_addr + offset,
-                                                     0);
+                                                     VM_MAP_REMOVE_NO_FLAGS);
                                        *address += offset;
                                }
                                if (offset + map_size < named_entry->size) {
@@ -4095,7 +4265,7 @@ vm_map_enter_mem_object_helper(
                                                       offset + map_size),
                                                      (map_addr +
                                                       named_entry->size),
-                                                     0);
+                                                     VM_MAP_REMOVE_NO_FLAGS);
                                }
                        }
                        named_entry_unlock(named_entry);
@@ -4737,15 +4907,15 @@ vm_map_enter_cpm(
                pages = NEXT_PAGE(m);
                *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
 
-               assert(!m->gobbled);
-               assert(!m->wanted);
-               assert(!m->pageout);
-               assert(!m->tabled);
+               assert(!m->vmp_gobbled);
+               assert(!m->vmp_wanted);
+               assert(!m->vmp_pageout);
+               assert(!m->vmp_tabled);
                assert(VM_PAGE_WIRED(m));
-               assert(m->busy);
+               assert(m->vmp_busy);
                assert(VM_PAGE_GET_PHYS_PAGE(m)>=(avail_start>>PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m)<=(avail_end>>PAGE_SHIFT));
 
-               m->busy = FALSE;
+               m->vmp_busy = FALSE;
                vm_page_insert(m, cpm_obj, offset);
        }
        assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
@@ -4849,17 +5019,17 @@ vm_map_enter_cpm(
                if (m == VM_PAGE_NULL)
                        panic("vm_allocate_cpm:  obj %p off 0x%llx no page",
                              cpm_obj, (uint64_t)offset);
-               assert(m->tabled);
-               assert(!m->busy);
-               assert(!m->wanted);
-               assert(!m->fictitious);
-               assert(!m->private);
-               assert(!m->absent);
-               assert(!m->error);
-               assert(!m->cleaning);
-               assert(!m->laundry);
-               assert(!m->precious);
-               assert(!m->clustered);
+               assert(m->vmp_tabled);
+               assert(!m->vmp_busy);
+               assert(!m->vmp_wanted);
+               assert(!m->vmp_fictitious);
+               assert(!m->vmp_private);
+               assert(!m->vmp_absent);
+               assert(!m->vmp_error);
+               assert(!m->vmp_cleaning);
+               assert(!m->vmp_laundry);
+               assert(!m->vmp_precious);
+               assert(!m->vmp_clustered);
                if (offset != 0) {
                        if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
                                printf("start 0x%llx end 0x%llx va 0x%llx\n",
@@ -4971,7 +5141,7 @@ vm_map_clip_unnest(
        pmap_unnest(map->pmap,
                    entry->vme_start,
                    entry->vme_end - entry->vme_start);
-       if ((map->mapped_in_other_pmaps) && (map->ref_count)) {
+       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
                /* clean up parent map/maps */
                vm_map_submap_pmap_clean(
                        map, entry->vme_start,
@@ -5029,6 +5199,15 @@ vm_map_clip_start(
                if (entry->vme_atomic) {
                        panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry);
                }
+
+               DTRACE_VM5(
+                       vm_map_clip_start,
+                       vm_map_t, map,
+                       vm_map_offset_t, entry->vme_start,
+                       vm_map_offset_t, entry->vme_end,
+                       vm_map_offset_t, startaddr,
+                       int, VME_ALIAS(entry));
+
                _vm_map_clip_start(&map->hdr, entry, startaddr);
                if (map->holelistenabled) {
                        vm_map_store_update_first_free(map, NULL, FALSE);
@@ -5137,6 +5316,14 @@ vm_map_clip_end(
                if (entry->vme_atomic) {
                        panic("Attempting to clip an atomic VM entry! (map: %p, entry: %p)\n", map, entry);
                }
+               DTRACE_VM5(
+                       vm_map_clip_end,
+                       vm_map_t, map,
+                       vm_map_offset_t, entry->vme_start,
+                       vm_map_offset_t, entry->vme_end,
+                       vm_map_offset_t, endaddr,
+                       int, VME_ALIAS(entry));
+
                _vm_map_clip_end(&map->hdr, entry, endaddr);
                if (map->holelistenabled) {
                        vm_map_store_update_first_free(map, NULL, FALSE);
@@ -5381,12 +5568,6 @@ vm_map_submap(
        return(result);
 }
 
-#if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG)
-#include <sys/codesign.h>
-extern int proc_selfcsflags(void);
-extern int panic_on_unsigned_execute;
-#endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */
-
 /*
  *     vm_map_protect:
  *
@@ -5424,8 +5605,39 @@ vm_map_protect(
                        return KERN_INVALID_ADDRESS;
                }
 
+#if VM_PROTECT_WX_FAIL
+               if ((new_prot & VM_PROT_EXECUTE) &&
+                   map != kernel_map &&
+                   cs_process_enforcement(NULL)) {
+                       DTRACE_VM3(cs_wx,
+                                  uint64_t, (uint64_t) start,
+                                  uint64_t, (uint64_t) end,
+                                  vm_prot_t, new_prot);
+                       printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
+                              proc_selfpid(),
+                              (current_task()->bsd_info
+                               ? proc_name_address(current_task()->bsd_info)
+                               : "?"),
+                              __FUNCTION__);
+                       return KERN_PROTECTION_FAILURE;
+               }
+#endif /* VM_PROTECT_WX_FAIL */
+
+               /*
+                * Let vm_map_remap_extract() know that it will need to:
+                * + make a copy of the mapping
+                * + add VM_PROT_WRITE to the max protections
+                * + remove any protections that are no longer allowed from the
+                *   max protections (to avoid any WRITE/EXECUTE conflict, for
+                *   example).
+                * Note that "max_prot" is an IN/OUT parameter only for this
+                * specific (VM_PROT_COPY) case.  It's usually an OUT parameter
+                * only.
+                */
+               max_prot = new_prot & VM_PROT_ALL;
                kflags = VM_MAP_KERNEL_FLAGS_NONE;
                kflags.vmkf_remap_prot_copy = TRUE;
+               kflags.vmkf_overwrite_immutable = TRUE;
                new_start = start;
                kr = vm_map_remap(map,
                                  &new_start,
@@ -5500,14 +5712,29 @@ vm_map_protect(
                        return(KERN_PROTECTION_FAILURE);
                }
 
-#if CONFIG_EMBEDDED
-               if (new_prot & VM_PROT_WRITE) {
-                       if ((new_prot & VM_PROT_EXECUTE) && !(current->used_for_jit)) {
-                               printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__);
-                               new_prot &= ~VM_PROT_EXECUTE;
-                       }
+               if ((new_prot & VM_PROT_WRITE) &&
+                   (new_prot & VM_PROT_EXECUTE) &&
+#if !CONFIG_EMBEDDED
+                   map != kernel_map &&
+                   cs_process_enforcement(NULL) &&
+#endif /* !CONFIG_EMBEDDED */
+                   !(current->used_for_jit)) {
+                       DTRACE_VM3(cs_wx,
+                                  uint64_t, (uint64_t) current->vme_start,
+                                  uint64_t, (uint64_t) current->vme_end,
+                                  vm_prot_t, new_prot);
+                       printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
+                              proc_selfpid(),
+                              (current_task()->bsd_info
+                               ? proc_name_address(current_task()->bsd_info)
+                               : "?"),
+                              __FUNCTION__);
+                       new_prot &= ~VM_PROT_EXECUTE;
+#if VM_PROTECT_WX_FAIL
+                       vm_map_unlock(map);
+                       return KERN_PROTECTION_FAILURE;
+#endif /* VM_PROTECT_WX_FAIL */
                }
-#endif
 
                /*
                 * If the task has requested executable lockdown,
@@ -5617,8 +5844,8 @@ vm_map_protect(
 #if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG)
                        if (!(old_prot & VM_PROT_EXECUTE) &&
                            (prot & VM_PROT_EXECUTE) &&
-                           (proc_selfcsflags() & CS_KILL) &&
-                           panic_on_unsigned_execute) {
+                           panic_on_unsigned_execute &&
+                           (proc_selfcsflags() & CS_KILL)) {
                                panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?\n", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
                        }
 #endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */
@@ -5864,9 +6091,7 @@ subtract_wire_counts(
        }
 }
 
-#if CONFIG_EMBEDDED
 int cs_executable_wire = 0;
-#endif /* CONFIG_EMBEDDED */
 
 /*
  *     vm_map_wire:
@@ -6283,7 +6508,7 @@ vm_map_wire_nested(
                                        if (entry->protection & VM_PROT_WRITE) {
                                                vm_object_lock_assert_exclusive(
                                                        object);
-                                               m->dirty = TRUE;
+                                               m->vmp_dirty = TRUE;
                                        }
                                } else {
                                        /* not already wired !? */
@@ -6302,14 +6527,20 @@ vm_map_wire_nested(
                 * Unwired entry or wire request transmitted via submap
                 */
 
-#if CONFIG_EMBEDDED
                /*
                 * Wiring would copy the pages to the shadow object.
                 * The shadow object would not be code-signed so
                 * attempting to execute code from these copied pages
                 * would trigger a code-signing violation.
                 */
-               if (entry->protection & VM_PROT_EXECUTE) {
+
+               if ((entry->protection & VM_PROT_EXECUTE)
+#if !CONFIG_EMBEDDED
+                   &&
+                   map != kernel_map &&
+                   cs_process_enforcement(NULL)
+#endif /* !CONFIG_EMBEDDED */
+                       ) {
 #if MACH_ASSERT
                        printf("pid %d[%s] wiring executable range from "
                               "0x%llx to 0x%llx: rejected to preserve "
@@ -6328,8 +6559,6 @@ vm_map_wire_nested(
                        rc = KERN_PROTECTION_FAILURE;
                        goto done;
                }
-#endif /* CONFIG_EMBEDDED */
-
 
                /*
                 * Perform actions of vm_map_lookup that need the write
@@ -7044,7 +7273,7 @@ vm_map_submap_pmap_clean(
                                VME_OFFSET(entry));
                } else {
 
-                       if((map->mapped_in_other_pmaps) && (map->ref_count)
+                       if((map->mapped_in_other_pmaps) && (map->map_refcnt)
                           && (VME_OBJECT(entry) != NULL)) {
                                vm_object_pmap_protect_options(
                                        VME_OBJECT(entry),
@@ -7080,7 +7309,7 @@ vm_map_submap_pmap_clean(
                                VME_SUBMAP(entry),
                                VME_OFFSET(entry));
                } else {
-                       if((map->mapped_in_other_pmaps) && (map->ref_count)
+                       if((map->mapped_in_other_pmaps) && (map->map_refcnt)
                           && (VME_OBJECT(entry) != NULL)) {
                                vm_object_pmap_protect_options(
                                        VME_OBJECT(entry),
@@ -7104,6 +7333,87 @@ vm_map_submap_pmap_clean(
        return;
 }
 
+/*
+ *     virt_memory_guard_ast:
+ *
+ *     Handle the AST callout for a virtual memory guard:
+ *     raise an EXC_GUARD exception and terminate the task
+ *     if configured to do so.
+ */
+void
+virt_memory_guard_ast(
+       thread_t thread,
+       mach_exception_data_type_t code,
+       mach_exception_data_type_t subcode)
+{
+       task_t task = thread->task;
+       assert(task != kernel_task);
+       assert(task == current_task());
+       uint32_t behavior;
+
+       behavior = task->task_exc_guard;
+
+       /* Is delivery enabled */
+       if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
+               return;
+       }
+
+       /* If only once, make sure we're that once */
+       while (behavior & TASK_EXC_GUARD_VM_ONCE) {
+               uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
+
+               if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
+                       break;
+               }
+               behavior = task->task_exc_guard;
+               if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
+                       return;
+               }
+       }
+
+       /* Raise exception via corpse fork or synchronously */
+       if ((task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) &&
+           (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) == 0) {
+               task_violated_guard(code, subcode, NULL);
+       } else {
+               task_exception_notify(EXC_GUARD, code, subcode);
+       }
+
+       /* Terminate the task if desired */
+       if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
+               task_bsdtask_kill(current_task());
+       }
+}
+
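The TASK_EXC_GUARD_VM_ONCE loop above uses a compare-and-swap so that, even with racing threads, the DELIVER bit is cleared exactly once and only one exception goes out. The same idea in a stand-alone sketch (the flag bits and C11 atomics here are illustrative, not the kernel's OSCompareAndSwap interface):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define GUARD_DELIVER 0x1u
    #define GUARD_ONCE    0x2u

    /* returns true if this caller should deliver the exception */
    static bool claim_delivery(_Atomic uint32_t *behavior_p)
    {
        uint32_t behavior = atomic_load(behavior_p);

        if ((behavior & GUARD_DELIVER) == 0)
            return false;                     /* delivery disabled              */

        while (behavior & GUARD_ONCE) {
            uint32_t cleared = behavior & ~GUARD_DELIVER;
            if (atomic_compare_exchange_weak(behavior_p, &behavior, cleared))
                return true;                  /* we own the single delivery     */
            if ((behavior & GUARD_DELIVER) == 0)
                return false;                 /* another thread got there first */
        }
        return true;                          /* not "once": always deliver     */
    }

    int main(void)
    {
        _Atomic uint32_t behavior = GUARD_DELIVER | GUARD_ONCE;
        printf("first=%d second=%d\n",
               claim_delivery(&behavior), claim_delivery(&behavior));
        return 0;
    }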
+/*
+ *     vm_map_guard_exception:
+ *
+ *     Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
+ *
+ *     Right now, we do this when we find nothing mapped, or a
+ *     gap in the mapping when a user address space deallocate
+ *     was requested. We report the address of the first gap found.
+ */
+static void
+vm_map_guard_exception(
+       vm_map_offset_t gap_start,
+       unsigned reason)
+{
+       mach_exception_code_t code = 0;
+       unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
+       unsigned int target = 0; /* should we pass in pid associated with map? */
+       mach_exception_data_type_t subcode = (uint64_t)gap_start;
+
+       /* Can't deliver exceptions to kernel task */
+       if (current_task() == kernel_task)
+               return;
+
+       EXC_GUARD_ENCODE_TYPE(code, guard_type);
+       EXC_GUARD_ENCODE_FLAVOR(code, reason);
+       EXC_GUARD_ENCODE_TARGET(code, target);
+       thread_guard_violation(current_thread(), code, subcode);
+}
+
 /*
  *     vm_map_delete:  [ internal use only ]
  *
@@ -7130,6 +7440,16 @@ vm_map_delete(
        boolean_t               need_wakeup;
        unsigned int            last_timestamp = ~0; /* unlikely value */
        int                     interruptible;
+       vm_map_offset_t         gap_start;
+       vm_map_offset_t         save_start = start;
+       vm_map_offset_t         save_end = end;
+       const vm_map_offset_t   FIND_GAP = 1;   /* a non-page-aligned value */
+       const vm_map_offset_t   GAPS_OK = 2;    /* a different non-page-aligned value */
+
+       if (map != kernel_map && !(flags & VM_MAP_REMOVE_GAPS_OK))
+               gap_start = FIND_GAP;
+       else
+               gap_start = GAPS_OK;
 
        interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
                THREAD_ABORTSAFE : THREAD_UNINT;
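FIND_GAP and GAPS_OK above ride in the same variable that later holds the page-aligned address of the first gap; since a page-aligned offset can never be 1 or 2, the two sentinels never collide with a real result. A tiny model of that trick:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_MASK 0xFFFULL

    int main(void)
    {
        const uint64_t FIND_GAP = 1;   /* still looking for a gap          */
        const uint64_t GAPS_OK  = 2;   /* gaps are expected; don't report  */

        uint64_t gap_start = FIND_GAP;

        /* scanning entries: record the (aligned) start of the first hole */
        uint64_t hole = 0x7f0000ULL & ~PAGE_MASK;
        if (gap_start == FIND_GAP)
            gap_start = hole;

        if (gap_start != FIND_GAP && gap_start != GAPS_OK)
            printf("gap found at 0x%llx\n", (unsigned long long)gap_start);
        return 0;
    }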
@@ -7165,10 +7485,15 @@ vm_map_delete(
                                      (uint64_t)entry->vme_start,
                                      (uint64_t)entry->vme_end);
                        }
-                       if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) { /* extend request to whole entry */                           start = SUPERPAGE_ROUND_DOWN(start);
+
+                       /*
+                        * If in a superpage, extend the range to include the start of the mapping.
+                        */
+                       if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
                                start = SUPERPAGE_ROUND_DOWN(start);
                                continue;
                        }
+
                        if (start == entry->vme_start) {
                                /*
                                 * No need to clip.  We don't want to cause
@@ -7204,9 +7529,11 @@ vm_map_delete(
                         *      time through the loop.
                         */
                        SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
+
                } else {
+
                        if (map->pmap == kernel_pmap &&
-                           map->ref_count != 0) {
+                           map->map_refcnt != 0) {
                                panic("vm_map_delete(%p,0x%llx,0x%llx): "
                                      "no map entry at 0x%llx\n",
                                      map,
@@ -7215,6 +7542,8 @@ vm_map_delete(
                                      (uint64_t)start);
                        }
                        entry = first_entry->vme_next;
+                       if (gap_start == FIND_GAP)
+                               gap_start = start;
                }
                break;
        }
@@ -7315,8 +7644,27 @@ vm_map_delete(
                        } else if (flags & VM_MAP_REMOVE_IMMUTABLE) {
 //                             printf("FBDP %d[%s] removing permanent entry %p [0x%llx:0x%llx] prot 0x%x/0x%x\n", proc_selfpid(), (current_task()->bsd_info ? proc_name_address(current_task()->bsd_info) : "?"), entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, entry->protection, entry->max_protection);
                                entry->permanent = FALSE;
+#if PMAP_CS
+                       } else if ((entry->protection & VM_PROT_EXECUTE) && !pmap_cs_enforced(map->pmap)) {
+                               entry->permanent = FALSE;
+
+                               printf("%d[%s] %s(0x%llx,0x%llx): "
+                                          "pmap_cs disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
+                                          "prot 0x%x/0x%x\n",
+                                          proc_selfpid(),
+                                          (current_task()->bsd_info
+                                               ? proc_name_address(current_task()->bsd_info)
+                                               : "?"),
+                                          __FUNCTION__,
+                                          (uint64_t) start,
+                                          (uint64_t) end,
+                                          (uint64_t)entry->vme_start,
+                                          (uint64_t)entry->vme_end,
+                                          entry->protection,
+                                          entry->max_protection);
+#endif
                        } else {
-                               if (!vm_map_executable_immutable_no_log) {
+                               if (vm_map_executable_immutable_verbose) {
                                        printf("%d[%s] %s(0x%llx,0x%llx): "
                                                   "permanent entry [0x%llx:0x%llx] "
                                                   "prot 0x%x/0x%x\n",
@@ -7383,6 +7731,8 @@ vm_map_delete(
                                /*
                                 * User: use the next entry
                                 */
+                               if (gap_start == FIND_GAP)
+                                       gap_start = s;
                                entry = first_entry->vme_next;
                                s = entry->vme_start;
                        } else {
@@ -7452,6 +7802,8 @@ vm_map_delete(
                                                /*
                                                 * User: use the next entry
                                                 */
+                                               if (gap_start == FIND_GAP)
+                                                       gap_start = s;
                                                entry = first_entry->vme_next;
                                                s = entry->vme_start;
                                        } else {
@@ -7527,6 +7879,8 @@ vm_map_delete(
                                if (!vm_map_lookup_entry(map, s, &first_entry)){
                                        assert((map != kernel_map) &&
                                               (!entry->is_sub_map));
+                                       if (gap_start == FIND_GAP)
+                                               gap_start = s;
                                        first_entry = first_entry->vme_next;
                                        s = first_entry->vme_start;
                                } else {
@@ -7607,7 +7961,7 @@ vm_map_delete(
                                        entry->vme_end - entry->vme_start,
                                        pmap_flags);
 #endif /* NO_NESTED_PMAP */
-                               if ((map->mapped_in_other_pmaps) && (map->ref_count)) {
+                               if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
                                        /* clean up parent map/maps */
                                        vm_map_submap_pmap_clean(
                                                map, entry->vme_start,
@@ -7624,7 +7978,7 @@ vm_map_delete(
                } else if (VME_OBJECT(entry) != kernel_object &&
                           VME_OBJECT(entry) != compressor_object) {
                        object = VME_OBJECT(entry);
-                       if ((map->mapped_in_other_pmaps) && (map->ref_count)) {
+                       if ((map->mapped_in_other_pmaps) && (map->map_refcnt)) {
                                vm_object_pmap_protect_options(
                                        object, VME_OFFSET(entry),
                                        entry->vme_end - entry->vme_start,
@@ -7679,7 +8033,7 @@ vm_map_delete(
                next = entry->vme_next;
 
                if (map->pmap == kernel_pmap &&
-                   map->ref_count != 0 &&
+                   map->map_refcnt != 0 &&
                    entry->vme_end < end &&
                    (next == vm_map_to_entry(map) ||
                     next->vme_start != entry->vme_end)) {
@@ -7692,6 +8046,19 @@ vm_map_delete(
                              (uint64_t)entry->vme_end);
                }
 
+               /*
+                * If the desired range didn't end with "entry", then there is a gap if
+                * we wrapped around to the start of the map or if "entry" and "next"
+                * aren't contiguous.
+                *
+                * The vm_map_round_page() is needed since an entry can be less than VM_MAP_PAGE_MASK() in size:
+                * for example, on devices which have 4K hardware pages but where entry sizes are now all 16K.
+                */
+               if (gap_start == FIND_GAP &&
+                   vm_map_round_page(entry->vme_end, VM_MAP_PAGE_MASK(map)) < end &&
+                   (next == vm_map_to_entry(map) || entry->vme_end != next->vme_start)) {
+                       gap_start = entry->vme_end;
+               }
                s = next->vme_start;
                last_timestamp = map->timestamp;
 
@@ -7714,8 +8081,9 @@ vm_map_delete(
                        vm_map_store_entry_unlink(map, entry);
                        /* ... and add it to the end of the "zap_map" */
                        vm_map_store_entry_link(zap_map,
-                                         vm_map_last_entry(zap_map),
-                                         entry);
+                                               vm_map_last_entry(zap_map),
+                                               entry,
+                                               VM_MAP_KERNEL_FLAGS_NONE);
                        entry_size = entry->vme_end - entry->vme_start;
                        map->size -= entry_size;
                        zap_map->size += entry_size;
@@ -7732,17 +8100,23 @@ vm_map_delete(
                if(entry == vm_map_to_entry(map)) {
                        break;
                }
-               if (last_timestamp+1 != map->timestamp) {
+               if (last_timestamp + 1 != map->timestamp) {
                        /*
-                        * we are responsible for deleting everything
-                        * from the give space, if someone has interfered
-                        * we pick up where we left off, back fills should
-                        * be all right for anyone except map_delete and
+                        * We are responsible for deleting everything
+                        * from the given space. If someone has interfered,
+                        * we pick up where we left off. Back fills should
+                        * be all right for anyone except map_delete, and
+                        * we have to assume that the task has been fully
+                        * disabled before we get here.
                         */
                        if (!vm_map_lookup_entry(map, s, &entry)){
                                entry = entry->vme_next;
+
+                               /*
+                                * Nothing found for s. If we weren't already done, then there is a gap.
+                                */
+                               if (gap_start == FIND_GAP && s < end)
+                                       gap_start = s;
                                s = entry->vme_start;
                        } else {
                                SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
@@ -7751,7 +8125,7 @@ vm_map_delete(
                         * others can not only allocate behind us, we can
                         * also see coalesce while we don't have the map lock
                         */
-                       if(entry == vm_map_to_entry(map)) {
+                       if (entry == vm_map_to_entry(map)) {
                                break;
                        }
                }
@@ -7766,6 +8140,28 @@ vm_map_delete(
        if (need_wakeup)
                vm_map_entry_wakeup(map);
 
+       if (gap_start != FIND_GAP && gap_start != GAPS_OK) {
+               DTRACE_VM3(kern_vm_deallocate_gap,
+                   vm_map_offset_t, gap_start,
+                   vm_map_offset_t, save_start,
+                   vm_map_offset_t, save_end);
+               if (!(flags & VM_MAP_REMOVE_GAPS_OK)) {
+#if defined(DEVELOPMENT) || defined(DEBUG)
+                       /* log just once if not checking, otherwise log each one */
+                       if (!map->warned_delete_gap ||
+                           (task_exc_guard_default & TASK_EXC_GUARD_VM_ALL) != 0) {
+                               printf("vm_map_delete: map %p [%p...%p] nothing at %p\n",
+                                   (void *)map, (void *)save_start, (void *)save_end,
+                                   (void *)gap_start);
+                               if (!map->warned_delete_gap) {
+                                       map->warned_delete_gap = 1;
+                               }
+                       }
+#endif
+                       vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
+               }
+       }
+
        return KERN_SUCCESS;
 }
 
@@ -7822,6 +8218,25 @@ vm_map_remove_locked(
 }
 
 
+/*
+ *     Routine:        vm_map_copy_allocate
+ *
+ *     Description:
+ *             Allocates and initializes a map copy object.
+ */
+static vm_map_copy_t
+vm_map_copy_allocate(void)
+{
+       vm_map_copy_t new_copy;
+
+       new_copy = zalloc(vm_map_copy_zone);
+       bzero(new_copy, sizeof (*new_copy));
+       new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
+       vm_map_copy_first_entry(new_copy) = vm_map_copy_to_entry(new_copy);
+       vm_map_copy_last_entry(new_copy) = vm_map_copy_to_entry(new_copy);
+       return new_copy;
+}
+
 /*
  *     Routine:        vm_map_copy_discard
  *
@@ -7902,7 +8317,6 @@ vm_map_copy_copy(
         */
 
        new_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-       new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
        *new_copy = *copy;
 
        if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
@@ -8543,12 +8957,7 @@ start_overwrite:
                                }
                                /* otherwise copy no longer exists, it was */
                                /* destroyed after successful copy_overwrite */
-                               copy = (vm_map_copy_t)
-                                       zalloc(vm_map_copy_zone);
-                               copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
-                               vm_map_copy_first_entry(copy) =
-                                       vm_map_copy_last_entry(copy) =
-                                       vm_map_copy_to_entry(copy);
+                               copy = vm_map_copy_allocate();
                                copy->type = VM_MAP_COPY_ENTRY_LIST;
                                copy->offset = new_offset;
 
@@ -8859,14 +9268,8 @@ vm_map_copy_overwrite(
                /*
                 * Extract "head_copy" out of "copy".
                 */
-               head_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-               head_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
-               vm_map_copy_first_entry(head_copy) =
-                       vm_map_copy_to_entry(head_copy);
-               vm_map_copy_last_entry(head_copy) =
-                       vm_map_copy_to_entry(head_copy);
+               head_copy = vm_map_copy_allocate();
                head_copy->type = VM_MAP_COPY_ENTRY_LIST;
-               head_copy->cpy_hdr.nentries = 0;
                head_copy->cpy_hdr.entries_pageable =
                        copy->cpy_hdr.entries_pageable;
                vm_map_store_init(&head_copy->cpy_hdr);
@@ -8904,14 +9307,8 @@ vm_map_copy_overwrite(
                /*
                 * Extract "tail_copy" out of "copy".
                 */
-               tail_copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-               tail_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
-               vm_map_copy_first_entry(tail_copy) =
-                       vm_map_copy_to_entry(tail_copy);
-               vm_map_copy_last_entry(tail_copy) =
-                       vm_map_copy_to_entry(tail_copy);
+               tail_copy = vm_map_copy_allocate();
                tail_copy->type = VM_MAP_COPY_ENTRY_LIST;
-               tail_copy->cpy_hdr.nentries = 0;
                tail_copy->cpy_hdr.entries_pageable =
                        copy->cpy_hdr.entries_pageable;
                vm_map_store_init(&tail_copy->cpy_hdr);
@@ -9714,7 +10111,7 @@ vm_map_copyin_kernel_buffer(
                                          VM_MAP_PAGE_MASK(src_map)),
                        (VM_MAP_REMOVE_INTERRUPTIBLE |
                         VM_MAP_REMOVE_WAIT_FOR_KWIRE |
-                        ((src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : 0)));
+                        ((src_map == kernel_map) ? VM_MAP_REMOVE_KUNWIRE : VM_MAP_REMOVE_NO_FLAGS)));
        }
        *copy_result = copy;
        return KERN_SUCCESS;
@@ -9831,7 +10228,7 @@ vm_map_copyout_kernel_buffer(
                                                   vm_map_round_page(copy_size,
                                                                     VM_MAP_PAGE_MASK(map))),
                                                  VM_MAP_PAGE_MASK(map)),
-                               VM_MAP_NO_FLAGS);
+                               VM_MAP_REMOVE_NO_FLAGS);
                        *addr = 0;
                }
        } else {
@@ -9845,21 +10242,31 @@ vm_map_copyout_kernel_buffer(
 }
 
 /*
- *     Macro:          vm_map_copy_insert
+ *     Routine:        vm_map_copy_insert      [internal use only]
  *
  *     Description:
  *             Link a copy chain ("copy") into a map at the
  *             specified location (after "where").
  *     Side effects:
  *             The copy chain is destroyed.
- *     Warning:
- *             The arguments are evaluated multiple times.
  */
-#define        vm_map_copy_insert(map, where, copy)                            \
-MACRO_BEGIN                                                            \
-       vm_map_store_copy_insert(map, where, copy);       \
-       zfree(vm_map_copy_zone, copy);          \
-MACRO_END
+static void
+vm_map_copy_insert(
+       vm_map_t        map,
+       vm_map_entry_t  after_where,
+       vm_map_copy_t   copy)
+{
+       vm_map_entry_t  entry;
+
+       while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
+               entry = vm_map_copy_first_entry(copy);
+               vm_map_copy_entry_unlink(copy, entry);
+               vm_map_store_entry_link(map, after_where, entry,
+                                       VM_MAP_KERNEL_FLAGS_NONE);
+               after_where = entry;
+       }
+       zfree(vm_map_copy_zone, copy);
+}
 
 void
 vm_map_copy_remap(
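
The deleted macro warned that its arguments were evaluated multiple times (the "copy" argument was expanded once for vm_map_store_copy_insert() and again for zfree()); rewriting vm_map_copy_insert() as a static function removes that hazard while keeping the same behavior. A hypothetical caller, illustrative only and not taken from xnu, shows the class of bug the warning guarded against:

/*
 * Illustration only.  Under the old macro, an argument with a side
 * effect such as "pending[i++]" would have been expanded and evaluated
 * twice per call; the static function evaluates each argument exactly
 * once, per normal C call semantics.
 */
static void
drain_pending(vm_map_t map, vm_map_entry_t where,
              vm_map_copy_t *pending, int npending)
{
	int i = 0;

	while (i < npending) {
		vm_map_copy_insert(map, where, pending[i++]);
	}
}
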
@@ -9899,7 +10306,8 @@ vm_map_copy_remap(
                        vm_object_reference(VME_OBJECT(new_entry));
                }
                /* insert the new entry in the map */
-               vm_map_store_entry_link(map, where, new_entry);
+               vm_map_store_entry_link(map, where, new_entry,
+                                       VM_MAP_KERNEL_FLAGS_NONE);
                /* continue inserting the "copy entries" after the new entry */
                where = new_entry;
        }
@@ -10089,7 +10497,7 @@ StartAgain: ;
                last = entry;
        } else {
                if (dst_map->holelistenabled) {
-                       hole_entry = (vm_map_entry_t)dst_map->holes_list;
+                       hole_entry = CAST_TO_VM_MAP_ENTRY(dst_map->holes_list);
 
                        if (hole_entry == NULL) {
                                /*
@@ -10151,7 +10559,7 @@ StartAgain: ;
                last = next;
 
                if (dst_map->holelistenabled) {
-                       if (last == (vm_map_entry_t) dst_map->holes_list) {
+                       if (last == CAST_TO_VM_MAP_ENTRY(dst_map->holes_list)) {
                                /*
                                 * Wrapped around
                                 */
@@ -10291,6 +10699,7 @@ StartAgain: ;
 
                        while (va < entry->vme_end) {
                                vm_page_t       m;
+                               struct vm_object_fault_info fault_info = {};
 
                                /*
                                 * Look up the page in the object.
@@ -10313,7 +10722,7 @@ StartAgain: ;
 
                                m = vm_page_lookup(object, offset);
                                if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
-                                   m->absent)
+                                   m->vmp_absent)
                                        panic("vm_map_copyout: wiring %p", m);
 
                                prot = entry->protection;
@@ -10324,20 +10733,24 @@ StartAgain: ;
 
                                type_of_fault = DBG_CACHE_HIT_FAULT;
 
-                               vm_fault_enter(m, dst_map->pmap, va, prot, prot,
-                                                               VM_PAGE_WIRED(m),
-                                                               FALSE, /* change_wiring */
-                                                               VM_KERN_MEMORY_NONE, /* tag - not wiring */
-                                                               FALSE, /* no_cache */
-                                                               FALSE, /* cs_bypass */
-                                                               VME_ALIAS(entry),
-                                                               ((entry->iokit_acct ||
-                                                                (!entry->is_sub_map &&
-                                                                 !entry->use_pmap))
-                                                               ? PMAP_OPTIONS_ALT_ACCT
-                                                               : 0),  /* pmap_options */
-                                                               NULL,  /* need_retry */
-                                                               &type_of_fault);
+                               fault_info.user_tag = VME_ALIAS(entry);
+                               fault_info.pmap_options = 0;
+                               if (entry->iokit_acct ||
+                                   (!entry->is_sub_map && !entry->use_pmap)) {
+                                       fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
+                               }
+
+                               vm_fault_enter(m,
+                                              dst_map->pmap,
+                                              va,
+                                              prot,
+                                              prot,
+                                              VM_PAGE_WIRED(m),
+                                              FALSE, /* change_wiring */
+                                              VM_KERN_MEMORY_NONE, /* tag - not wiring */
+                                              &fault_info,
+                                              NULL,  /* need_retry */
+                                              &type_of_fault);
 
                                vm_object_unlock(object);
 
@@ -10566,12 +10979,8 @@ vm_map_copyin_internal(
         *      remember the endpoints prior to rounding.
         */
 
-       copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
-       vm_map_copy_first_entry(copy) =
-               vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
+       copy = vm_map_copy_allocate();
        copy->type = VM_MAP_COPY_ENTRY_LIST;
-       copy->cpy_hdr.nentries = 0;
        copy->cpy_hdr.entries_pageable = TRUE;
 #if 00
        copy->cpy_hdr.page_shift = src_map->hdr.page_shift;
@@ -10944,7 +11353,7 @@ vm_map_copyin_internal(
                        assert(new_object->ref_count == 1);
                        assert(new_object->shadow == VM_OBJECT_NULL);
                        assert(new_object->copy == VM_OBJECT_NULL);
-                       assert(new_object->vo_purgeable_owner == NULL);
+                       assert(new_object->vo_owner == NULL);
 
                        new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
                        new_object->true_share = TRUE;
@@ -11180,7 +11589,7 @@ vm_map_copyin_internal(
                        src_end,
                        ((src_map == kernel_map) ?
                         VM_MAP_REMOVE_KUNWIRE :
-                        VM_MAP_NO_FLAGS),
+                        VM_MAP_REMOVE_NO_FLAGS),
                        VM_MAP_NULL);
        } else {
                /* fix up the damage we did in the base map */
@@ -11392,12 +11801,8 @@ vm_map_copy_extract(
         *      remember the endpoints prior to rounding.
         */
 
-       copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
-       vm_map_copy_first_entry(copy) =
-               vm_map_copy_last_entry(copy) = vm_map_copy_to_entry(copy);
+       copy = vm_map_copy_allocate();
        copy->type = VM_MAP_COPY_ENTRY_LIST;
-       copy->cpy_hdr.nentries = 0;
        copy->cpy_hdr.entries_pageable = TRUE;
 
        vm_map_store_init(&copy->cpy_hdr);
@@ -11446,8 +11851,7 @@ vm_map_copyin_object(
         *      that contains the object directly.
         */
 
-       copy = (vm_map_copy_t) zalloc(vm_map_copy_zone);
-       copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
+       copy = vm_map_copy_allocate();
        copy->type = VM_MAP_COPY_OBJECT;
        copy->cpy_object = object;
        copy->offset = offset;
@@ -11699,7 +12103,8 @@ vm_map_fork_share(
         *      map.
         */
 
-       vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry);
+       vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
+                               VM_MAP_KERNEL_FLAGS_NONE);
 
        /*
         *      Update the physical map
@@ -11816,9 +12221,13 @@ vm_map_fork(
        boolean_t       new_entry_needs_copy;
        boolean_t       pmap_is64bit;
        int             vm_map_copyin_flags;
+       vm_inherit_t    old_entry_inheritance;
+       int             map_create_options;
+       kern_return_t   footprint_collect_kr;
 
        if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
-                       VM_MAP_FORK_PRESERVE_PURGEABLE)) {
+                       VM_MAP_FORK_PRESERVE_PURGEABLE |
+                       VM_MAP_FORK_CORPSE_FOOTPRINT)) {
                /* unsupported option */
                return VM_MAP_NULL;
        }
@@ -11839,10 +12248,18 @@ vm_map_fork(
        vm_map_reference_swap(old_map);
        vm_map_lock(old_map);
 
-       new_map = vm_map_create(new_pmap,
-                               old_map->min_offset,
-                               old_map->max_offset,
-                               old_map->hdr.entries_pageable);
+       map_create_options = 0;
+       if (old_map->hdr.entries_pageable) {
+               map_create_options |= VM_MAP_CREATE_PAGEABLE;
+       }
+       if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
+               map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
+               footprint_collect_kr = KERN_SUCCESS;
+       }
+       new_map = vm_map_create_options(new_pmap,
+                                       old_map->min_offset,
+                                       old_map->max_offset,
+                                       map_create_options);
        vm_map_lock(new_map);
        vm_commit_pagezero_status(new_map);
        /* inherit the parent map's page size */
@@ -11854,20 +12271,40 @@ vm_map_fork(
 
                entry_size = old_entry->vme_end - old_entry->vme_start;
 
-               switch (old_entry->inheritance) {
-               case VM_INHERIT_NONE:
+               old_entry_inheritance = old_entry->inheritance;
+               /*
+                * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
+                * share VM_INHERIT_NONE entries that are not backed by a
+                * device pager.
+                */
+               if (old_entry_inheritance == VM_INHERIT_NONE &&
+                   (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
+                   !(!old_entry->is_sub_map &&
+                     VME_OBJECT(old_entry) != NULL &&
+                     VME_OBJECT(old_entry)->pager != NULL &&
+                     is_device_pager_ops(
+                             VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
+                       old_entry_inheritance = VM_INHERIT_SHARE;
+               }
+
+               if (old_entry_inheritance != VM_INHERIT_NONE &&
+                   (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
+                   footprint_collect_kr == KERN_SUCCESS) {
                        /*
-                        * Skip making a share entry if VM_MAP_FORK_SHARE_IF_INHERIT_NONE
-                        * is not passed or it is backed by a device pager.
+                        * The corpse won't have old_map->pmap to query
+                        * footprint information, so collect that data now
+                        * and store it in new_map->vmmap_corpse_footprint
+                        * for later autopsy.
                         */
-                       if ((!(options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE)) ||
-                               (!old_entry->is_sub_map &&
-                               VME_OBJECT(old_entry) != NULL &&
-                               VME_OBJECT(old_entry)->pager != NULL &&
-                               is_device_pager_ops(VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
-                               break;
-                       }
-                       /* FALLTHROUGH */
+                       footprint_collect_kr =
+                               vm_map_corpse_footprint_collect(old_map,
+                                                               old_entry,
+                                                               new_map);
+               }
+
+               switch (old_entry_inheritance) {
+               case VM_INHERIT_NONE:
+                       break;
 
                case VM_INHERIT_SHARE:
                        vm_map_fork_share(old_map, old_entry, new_map);
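
When VM_MAP_FORK_CORPSE_FOOTPRINT is passed, the loop above calls vm_map_corpse_footprint_collect() for every inheriting entry, the epilogue calls vm_map_corpse_footprint_collect_done(), and the saved data is later read back through vm_map_corpse_footprint_query_page_info() (see the vm_map_region_walk and vm_map_page_range_info_internal hunks below). A rough sketch of that flow, with locking and error handling omitted and the surrounding fork logic simplified:

/*
 * Sketch of the corpse-footprint flow implied by this diff; the real
 * call sites are spread across vm_map_fork() and the region/page-info
 * routines shown further down.
 */
static void
corpse_footprint_flow(vm_map_t old_map, vm_map_t corpse_map, vm_map_offset_t va)
{
	vm_map_entry_t	entry;
	kern_return_t	kr = KERN_SUCCESS;
	int		disp = 0;

	/* At fork time: snapshot per-page disposition into the corpse map. */
	for (entry = vm_map_first_entry(old_map);
	     entry != vm_map_to_entry(old_map);
	     entry = entry->vme_next) {
		if (kr == KERN_SUCCESS) {
			kr = vm_map_corpse_footprint_collect(old_map, entry,
			                                     corpse_map);
		}
	}
	vm_map_corpse_footprint_collect_done(corpse_map);

	/* Later, at "autopsy" time: query the snapshot instead of a pmap. */
	if (corpse_map->has_corpse_footprint) {
		vm_map_corpse_footprint_query_page_info(corpse_map, va, &disp);
	}
}
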
@@ -11960,8 +12397,10 @@ vm_map_fork(
                         *      of the map.
                         */
 
-                       vm_map_store_entry_link(new_map, vm_map_last_entry(new_map),
-                                         new_entry);
+                       vm_map_store_entry_link(new_map,
+                                               vm_map_last_entry(new_map),
+                                               new_entry,
+                                               VM_MAP_KERNEL_FLAGS_NONE);
                        new_size += entry_size;
                        break;
 
@@ -11987,6 +12426,11 @@ vm_map_fork(
 #endif
 
        new_map->size = new_size;
+
+       if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
+               vm_map_corpse_footprint_collect_done(new_map);
+       }
+
        vm_map_unlock(new_map);
        vm_map_unlock(old_map);
        vm_map_deallocate(old_map);
@@ -12007,24 +12451,27 @@ vm_map_exec(
        task_t          task,
        boolean_t       is64bit,
        void            *fsroot,
-       cpu_type_t      cpu)
+       cpu_type_t      cpu,
+       cpu_subtype_t   cpu_subtype)
 {
        SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): ->\n",
+               ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
                 (void *)VM_KERNEL_ADDRPERM(current_task()),
                 (void *)VM_KERNEL_ADDRPERM(new_map),
                 (void *)VM_KERNEL_ADDRPERM(task),
                 (void *)VM_KERNEL_ADDRPERM(fsroot),
-                cpu));
+                cpu,
+                cpu_subtype));
        (void) vm_commpage_enter(new_map, task, is64bit);
-       (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu);
+       (void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype);
        SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x): <-\n",
+               ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
                 (void *)VM_KERNEL_ADDRPERM(current_task()),
                 (void *)VM_KERNEL_ADDRPERM(new_map),
                 (void *)VM_KERNEL_ADDRPERM(task),
                 (void *)VM_KERNEL_ADDRPERM(fsroot),
-                cpu));
+                cpu,
+                cpu_subtype));
        return KERN_SUCCESS;
 }
 
@@ -12404,17 +12851,41 @@ submap_recurse:
                        VME_OBJECT_SET(entry, copy_object);
 
                        /* propagate the submap entry's protections */
-                       entry->protection |= subentry_protection;
+                       if (entry->protection != VM_PROT_READ) {
+                               /*
+                                * Someone has already altered the top entry's
+                                * protections via vm_protect(VM_PROT_COPY).
+                                * Respect these new values and ignore the
+                                * submap entry's protections.
+                                */
+                       } else {
+                               /*
+                                * Regular copy-on-write: propagate the submap
+                                * entry's protections to the top map entry.
+                                */
+                               entry->protection |= subentry_protection;
+                       }
                        entry->max_protection |= subentry_max_protection;
 
-#if CONFIG_EMBEDDED
-                       if (entry->protection & VM_PROT_WRITE) {
-                               if ((entry->protection & VM_PROT_EXECUTE) && !(entry->used_for_jit)) {
-                                       printf("EMBEDDED: %s can't have both write and exec at the same time\n", __FUNCTION__);
-                                       entry->protection &= ~VM_PROT_EXECUTE;
-                               }
+                       if ((entry->protection & VM_PROT_WRITE) &&
+                           (entry->protection & VM_PROT_EXECUTE) &&
+#if !CONFIG_EMBEDDED
+                           map != kernel_map &&
+                           cs_process_enforcement(NULL) &&
+#endif /* !CONFIG_EMBEDDED */
+                           !(entry->used_for_jit)) {
+                               DTRACE_VM3(cs_wx,
+                                          uint64_t, (uint64_t)entry->vme_start,
+                                          uint64_t, (uint64_t)entry->vme_end,
+                                          vm_prot_t, entry->protection);
+                               printf("CODE SIGNING: %d[%s] %s can't have both write and exec at the same time\n",
+                                      proc_selfpid(),
+                                      (current_task()->bsd_info
+                                       ? proc_name_address(current_task()->bsd_info)
+                                       : "?"),
+                                      __FUNCTION__);
+                               entry->protection &= ~VM_PROT_EXECUTE;
                        }
-#endif
 
                        if(copied_slowly) {
                                VME_OFFSET_SET(entry, local_start - old_start);
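
The block above replaces the old CONFIG_EMBEDDED-only check with a general one: an entry that ends up both writable and executable loses VM_PROT_EXECUTE (with a DTrace probe and a "CODE SIGNING" log), unless it is marked used_for_jit; on non-embedded builds the demotion additionally requires a non-kernel map with cs_process_enforcement(NULL) enabled. Restated as a standalone predicate, as a paraphrase of the hunk rather than additional xnu code:

/* Paraphrase of the write+execute demotion condition in the hunk above. */
static boolean_t
should_strip_execute(vm_map_t map, vm_map_entry_t entry)
{
#if CONFIG_EMBEDDED
	(void)map;				/* always enforced on embedded */
#endif
	if ((entry->protection & (VM_PROT_WRITE | VM_PROT_EXECUTE)) !=
	    (VM_PROT_WRITE | VM_PROT_EXECUTE)) {
		return FALSE;			/* not asking for both */
	}
	if (entry->used_for_jit) {
		return FALSE;			/* JIT regions are exempt */
	}
#if !CONFIG_EMBEDDED
	if (map == kernel_map || !cs_process_enforcement(NULL)) {
		return FALSE;			/* desktop: only with CS enforcement */
	}
#endif /* !CONFIG_EMBEDDED */
	return TRUE;				/* caller clears VM_PROT_EXECUTE */
}
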
@@ -12592,6 +13063,16 @@ submap_recurse:
                } else {
                        fault_info->cs_bypass = FALSE;
                }
+               fault_info->pmap_cs_associated = FALSE;
+#if CONFIG_PMAP_CS
+               if (entry->pmap_cs_associated) {
+                       /*
+                        * The pmap layer will validate this page
+                        * before allowing it to be executed from.
+                        */
+                       fault_info->pmap_cs_associated = TRUE;
+               }
+#endif /* CONFIG_PMAP_CS */
                fault_info->mark_zf_absent = FALSE;
                fault_info->batch_pmap_op = FALSE;
        }
@@ -13089,7 +13570,7 @@ recurse_again:
                        } else {
                                extended.share_mode = SM_PRIVATE;
                        }
-                       extended.ref_count = VME_SUBMAP(curr_entry)->ref_count;
+                       extended.ref_count = VME_SUBMAP(curr_entry)->map_refcnt;
                }
        }
 
@@ -13486,9 +13967,27 @@ vm_map_region_walk(
                                int disp;
 
                                disp = 0;
-                               pmap_query_page_info(map->pmap, va, &disp);
+                               if (map->has_corpse_footprint) {
+                                       /*
+                                        * Query the page info data we saved
+                                        * while forking the corpse.
+                                        */
+                                       vm_map_corpse_footprint_query_page_info(
+                                               map,
+                                               va,
+                                               &disp);
+                               } else {
+                                       /*
+                                        * Query the pmap.
+                                        */
+                                       pmap_query_page_info(map->pmap,
+                                                            va,
+                                                            &disp);
+                               }
                                if (disp & PMAP_QUERY_PAGE_PRESENT) {
-                                       extended->pages_resident++;
+                                       if (!(disp & PMAP_QUERY_PAGE_ALTACCT)) {
+                                               extended->pages_resident++;
+                                       }
                                        if (disp & PMAP_QUERY_PAGE_REUSABLE) {
                                                extended->pages_reusable++;
                                        } else if (!(disp & PMAP_QUERY_PAGE_INTERNAL) ||
@@ -13505,7 +14004,57 @@ vm_map_region_walk(
                                        }
                                }
                                /* deal with alternate accounting */
-                               if (obj->purgable != VM_PURGABLE_DENY) {
+                               if (obj->purgable == VM_PURGABLE_NONVOLATILE &&
+                                   /* && not tagged as no-footprint? */
+                                   VM_OBJECT_OWNER(obj) != NULL &&
+                                   VM_OBJECT_OWNER(obj)->map == map) {
+                                       if ((((va
+                                              - entry->vme_start
+                                              + VME_OFFSET(entry))
+                                             / PAGE_SIZE) <
+                                            (obj->resident_page_count +
+                                             vm_compressor_pager_get_count(obj->pager)))) {
+                                               /*
+                                                * Non-volatile purgeable object owned
+                                                * by this task: report the first
+                                                * "#resident + #compressed" pages as
+                                                * "resident" (to show that they
+                                                * contribute to the footprint) but not
+                                                * "dirty" (to avoid double-counting
+                                                * with the fake "non-volatile" region
+                                                * we'll report at the end of the
+                                                * address space to account for all
+                                                * (mapped or not) non-volatile memory
+                                                * owned by this task.
+                                                */
+                                               extended->pages_resident++;
+                                       }
+                               } else if ((obj->purgable == VM_PURGABLE_VOLATILE ||
+                                           obj->purgable == VM_PURGABLE_EMPTY) &&
+                                          /* && not tagged as no-footprint? */
+                                          VM_OBJECT_OWNER(obj) != NULL &&
+                                          VM_OBJECT_OWNER(obj)->map == map) {
+                                       if ((((va
+                                              - entry->vme_start
+                                              + VME_OFFSET(entry))
+                                             / PAGE_SIZE) <
+                                            obj->wired_page_count)) {
+                                               /*
+                                                * Volatile|empty purgeable object owned
+                                                * by this task: report the first
+                                                * "#wired" pages as "resident" (to
+                                                * show that they contribute to the
+                                                * footprint) but not "dirty" (to avoid
+                                                * double-counting with the fake
+                                                * "non-volatile" region we'll report
+                                                * at the end of the address space to
+                                                * account for all (mapped or not)
+                                                * non-volatile memory owned by this
+                                                * task.
+                                                */
+                                               extended->pages_resident++;
+                                       }
+                               } else if (obj->purgable != VM_PURGABLE_DENY) {
                                        /*
                                         * Pages from purgeable objects
                                         * will be reported as dirty 
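
In other words, a page of a task-owned non-volatile purgeable object counts toward pages_resident only while its index within the mapping is below resident_page_count plus the compressed page count (and, for volatile or empty objects, below wired_page_count). A worked example with illustrative numbers: if the object has 3 resident and 2 compressed pages and VME_OFFSET(entry) is 0, then pages 0 through 4 of the mapping are reported as resident and every later page is not, so the object contributes exactly 5 pages to the footprint no matter how large the mapping is.
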
@@ -13645,11 +14194,11 @@ vm_map_region_look_for_page(
                        if (shadow && (max_refcnt == 1))
                                extended->pages_shared_now_private++;
 
-                       if (!p->fictitious &&
-                           (p->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p))))
+                       if (!p->vmp_fictitious &&
+                           (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p))))
                                extended->pages_dirtied++;
                        else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
-                               if (p->reusable || object->all_reusable) {
+                               if (p->vmp_reusable || object->all_reusable) {
                                        extended->pages_reusable++;
                                }
                        }
@@ -13784,6 +14333,7 @@ vm_map_simplify_entry(
            (prev_entry->map_aligned == this_entry->map_aligned) &&
            (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
            (prev_entry->used_for_jit == this_entry->used_for_jit) &&
+           (prev_entry->pmap_cs_associated == this_entry->pmap_cs_associated) &&
            /* from_reserved_zone: OK if that field doesn't match */
            (prev_entry->iokit_acct == this_entry->iokit_acct) &&
            (prev_entry->vme_resilient_codesign ==
@@ -13975,7 +14525,7 @@ vm_map_machine_attribute(
                                                m = vm_page_lookup(
                                                        object, offset);
 
-                                               if (m && !m->fictitious) {
+                                               if (m && !m->vmp_fictitious) {
                                                        ret =
                                                                pmap_attribute_cache_sync(
                                                                        VM_PAGE_GET_PHYS_PAGE(m),
@@ -14149,25 +14699,14 @@ vm_map_willneed(
        vm_map_entry_t                  entry;
        vm_object_t                     object;
        memory_object_t                 pager;
-       struct vm_object_fault_info     fault_info;
+       struct vm_object_fault_info     fault_info = {};
        kern_return_t                   kr;
        vm_object_size_t                len;
        vm_object_offset_t              offset;
 
-       /*
-        * Fill in static values in fault_info.  Several fields get ignored by the code
-        * we call, but we'll fill them in anyway since uninitialized fields are bad
-        * when it comes to future backwards compatibility.
-        */
-
-       fault_info.interruptible = THREAD_UNINT;                /* ignored value */
+       fault_info.interruptible = THREAD_UNINT;        /* ignored value */
        fault_info.behavior      = VM_BEHAVIOR_SEQUENTIAL;
-       fault_info.no_cache      = FALSE;                       /* ignored value */
        fault_info.stealth       = TRUE;
-       fault_info.io_sync = FALSE;
-       fault_info.cs_bypass = FALSE;
-       fault_info.mark_zf_absent = FALSE;
-       fault_info.batch_pmap_op = FALSE;
 
        /*
         * The MADV_WILLNEED operation doesn't require any changes to the
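
The field-by-field FALSE assignments above could be dropped because fault_info is now declared with an empty-brace initializer, which in the dialect xnu is compiled with zero-fills every member; only the fields with non-default values are assigned afterwards. A minimal standalone illustration of the idiom (the struct here is a hypothetical stand-in, not the real vm_object_fault_info):

#include <stdbool.h>

struct fault_info_like {
	int	interruptible;
	int	behavior;
	bool	no_cache;
	bool	stealth;
	bool	io_sync;
	bool	cs_bypass;
};

void
example(void)
{
	struct fault_info_like fi = {};	/* every member starts at 0/false */

	fi.behavior = 1;		/* assign only the non-default fields */
	fi.stealth  = true;
	/* no_cache, io_sync, cs_bypass stay false without being named */
}
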
@@ -14347,7 +14886,7 @@ vm_map_entry_is_reusable(
                return TRUE;
        }
 
-       if (entry->is_shared ||
+       if (/*entry->is_shared ||*/
            entry->is_sub_map ||
            entry->in_transition ||
            entry->protection != VM_PROT_DEFAULT ||
@@ -14385,8 +14924,9 @@ vm_map_entry_is_reusable(
            object->wired_page_count == 0 &&
            object->copy == VM_OBJECT_NULL &&
            object->shadow == VM_OBJECT_NULL &&
-           object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
            object->internal &&
+           object->purgable == VM_PURGABLE_DENY &&
+           object->copy_strategy != MEMORY_OBJECT_COPY_DELAY &&
            !object->true_share &&
            object->wimg_bits == VM_WIMG_USE_DEFAULT &&
            !object->code_signed) {
@@ -14758,7 +15298,7 @@ vm_map_pageout(
 /*
  *     Routine:        vm_map_entry_insert
  *
- *     Descritpion:    This routine inserts a new vm_entry in a locked map.
+ *     Description:    This routine inserts a new vm_entry in a locked map.
  */
 vm_map_entry_t
 vm_map_entry_insert(
@@ -14787,6 +15327,7 @@ vm_map_entry_insert(
        vm_map_entry_t  new_entry;
 
        assert(insp_entry != (vm_map_entry_t)0);
+       vm_map_lock_assert_exclusive(map);
 
 #if DEVELOPMENT || DEBUG
        vm_object_offset_t      end_offset = 0;
@@ -14853,7 +15394,10 @@ vm_map_entry_insert(
        else
                new_entry->superpage_size = FALSE;
        if (used_for_jit){
-               if (!(map->jit_entry_exists)){
+#if CONFIG_EMBEDDED
+               if (!(map->jit_entry_exists))
+#endif /* CONFIG_EMBEDDED */
+               {
                        new_entry->used_for_jit = TRUE;
                        map->jit_entry_exists = TRUE;
 
@@ -14863,6 +15407,7 @@ vm_map_entry_insert(
        } else {
                new_entry->used_for_jit = FALSE;
        }
+       new_entry->pmap_cs_associated = FALSE;
        new_entry->iokit_acct = FALSE;
        new_entry->vme_resilient_codesign = FALSE;
        new_entry->vme_resilient_media = FALSE;
@@ -14872,7 +15417,8 @@ vm_map_entry_insert(
         *      Insert the new entry into the list.
         */
 
-       vm_map_store_entry_link(map, insp_entry, new_entry);
+       vm_map_store_entry_link(map, insp_entry, new_entry,
+                               VM_MAP_KERNEL_FLAGS_NONE);
        map->size += end - start;
 
        /*
@@ -14918,6 +15464,7 @@ vm_map_remap_extract(
        boolean_t               new_entry_needs_copy;
        vm_map_entry_t          saved_src_entry;
        boolean_t               src_entry_was_wired;
+       vm_prot_t               max_prot_for_prot_copy;
 
        assert(map != VM_MAP_NULL);
        assert(size != 0);
@@ -14936,14 +15483,19 @@ vm_map_remap_extract(
        /*
         *      Initialize map_header.
         */
-       map_header->links.next = (struct vm_map_entry *)&map_header->links;
-       map_header->links.prev = (struct vm_map_entry *)&map_header->links;
+       map_header->links.next = CAST_TO_VM_MAP_ENTRY(&map_header->links);
+       map_header->links.prev = CAST_TO_VM_MAP_ENTRY(&map_header->links);
        map_header->nentries = 0;
        map_header->entries_pageable = pageable;
        map_header->page_shift = PAGE_SHIFT;
 
        vm_map_store_init( map_header );
 
+       if (copy && vmk_flags.vmkf_remap_prot_copy) {
+               max_prot_for_prot_copy = *max_protection & VM_PROT_ALL;
+       } else {
+               max_prot_for_prot_copy = VM_PROT_NONE;
+       }
        *cur_protection = VM_PROT_ALL;
        *max_protection = VM_PROT_ALL;
 
@@ -15124,6 +15676,7 @@ vm_map_remap_extract(
                         * VM_PROT_WRITE to the max protection.
                         */
                        new_entry->inheritance = src_entry->inheritance;
+                       new_entry->protection &= max_prot_for_prot_copy;
                        new_entry->max_protection |= VM_PROT_WRITE;
                } else {
                        new_entry->inheritance = inheritance;
@@ -15140,8 +15693,10 @@ vm_map_remap_extract(
                         * region to be shared across address spaces.
                         */
                        if (src_entry->used_for_jit == TRUE && !same_map) {
+#if CONFIG_EMBEDDED
                                result = KERN_INVALID_ARGUMENT;
                                break;
+#endif /* CONFIG_EMBEDDED */
                        }
                        src_entry->is_shared = TRUE;
                        new_entry->is_shared = TRUE;
@@ -15313,7 +15868,7 @@ vm_map_remap_extract(
                 * Free all allocated elements.
                 */
                for (src_entry = map_header->links.next;
-                    src_entry != (struct vm_map_entry *)&map_header->links;
+                    src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
                     src_entry = new_entry) {
                        new_entry = src_entry->vme_next;
                        _vm_map_store_entry_unlink(map_header, src_entry);
@@ -15431,14 +15986,13 @@ vm_map_remap(
                                             &insp_entry);
 
        for (entry = map_header.links.next;
-            entry != (struct vm_map_entry *)&map_header.links;
+            entry != CAST_TO_VM_MAP_ENTRY(&map_header.links);
             entry = new_entry) {
                new_entry = entry->vme_next;
                _vm_map_store_entry_unlink(&map_header, entry);
                if (result == KERN_SUCCESS) {
                        if (flags & VM_FLAGS_RESILIENT_CODESIGN) {
                                /* no codesigning -> read-only access */
-                               assert(!entry->used_for_jit);
                                entry->max_protection = VM_PROT_READ;
                                entry->protection = VM_PROT_READ;
                                entry->vme_resilient_codesign = TRUE;
@@ -15446,7 +16000,8 @@ vm_map_remap(
                        entry->vme_start += *address;
                        entry->vme_end += *address;
                        assert(!entry->map_aligned);
-                       vm_map_store_entry_link(target_map, insp_entry, entry);
+                       vm_map_store_entry_link(target_map, insp_entry, entry,
+                                               vmk_flags);
                        insp_entry = entry;
                } else {
                        if (!entry->is_sub_map) {
@@ -15474,9 +16029,37 @@ vm_map_remap(
                target_map->size += size;
                SAVE_HINT_MAP_WRITE(target_map, insp_entry);
 
-       }
-       vm_map_unlock(target_map);
-
+#if PMAP_CS
+               if (*max_protection & VM_PROT_EXECUTE) {
+                       vm_map_address_t region_start = 0, region_size = 0;
+                       struct pmap_cs_code_directory *region_cd = NULL;
+                       vm_map_address_t base = 0;
+                       struct pmap_cs_lookup_results results = {};
+                       vm_map_size_t page_addr = vm_map_trunc_page(memory_address, PAGE_MASK);
+                       vm_map_size_t assoc_size = vm_map_round_page(memory_address + size - page_addr, PAGE_MASK);
+
+                       pmap_cs_lookup(src_map->pmap, memory_address, &results);
+                       region_size = results.region_size;
+                       region_start = results.region_start;
+                       region_cd = results.region_cd_entry;
+                       base = results.base;
+
+                       if (region_cd != NULL && (page_addr != region_start || assoc_size != region_size)) {
+                               *cur_protection = VM_PROT_READ;
+                               *max_protection = VM_PROT_READ;
+                               printf("mismatched remap of executable range 0x%llx-0x%llx to 0x%llx, "
+                                          "region_start 0x%llx, region_size 0x%llx, cd_entry %sNULL, making non-executable.\n",
+                                          page_addr, page_addr+assoc_size, *address,
+                                          region_start, region_size,
+                                          region_cd != NULL ? "not " : ""              // Don't leak kernel slide
+                                       );
+                       }
+               }
+#endif
+
+       }
+       vm_map_unlock(target_map);
+
        if (result == KERN_SUCCESS && target_map->wiring_required)
                result = vm_map_wire_kernel(target_map, *address,
                                     *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
@@ -15511,13 +16094,14 @@ vm_map_remap_range_allocate(
        vm_map_size_t           size,
        vm_map_offset_t         mask,
        int                     flags,
-       __unused vm_map_kernel_flags_t  vmk_flags,
+       vm_map_kernel_flags_t   vmk_flags,
        __unused vm_tag_t       tag,
        vm_map_entry_t          *map_entry)     /* OUT */
 {
        vm_map_entry_t  entry;
        vm_map_offset_t start;
        vm_map_offset_t end;
+       vm_map_offset_t desired_empty_end;
        kern_return_t   kr;
        vm_map_entry_t          hole_entry;
 
@@ -15559,7 +16143,7 @@ StartAgain: ;
                } else {
 
                        if (map->holelistenabled) {
-                               hole_entry = (vm_map_entry_t)map->holes_list;
+                               hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
 
                                if (hole_entry == NULL) {
                                        /*
@@ -15583,7 +16167,7 @@ StartAgain: ;
                                                }
                                                hole_entry = hole_entry->vme_next;
 
-                                       } while (hole_entry != (vm_map_entry_t) map->holes_list);
+                                       } while (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list));
 
                                        if (found_hole == FALSE) {
                                                return (KERN_NO_SPACE);
@@ -15630,7 +16214,10 @@ StartAgain: ;
                        start = end;
                        end += size;
 
-                       if ((end > map->max_offset) || (end < start)) {
+                       /* We want an entire page of empty space, but don't increase the allocation size. */
+                       desired_empty_end = vm_map_round_page(end, VM_MAP_PAGE_MASK(map));
+
+                       if ((desired_empty_end > map->max_offset) || (desired_empty_end < start)) {
                                if (map->wait_for_space) {
                                        if (size <= (map->max_offset -
                                                     map->min_offset)) {
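
The new desired_empty_end only changes how much free space the search insists on; the mapped size itself is unchanged. With illustrative numbers, on a map with 16 KB pages a 0x1000-byte request that would end at 0x15000 works out as follows:

/* Illustrative numbers only; VM_MAP_PAGE_MASK(map) assumed to be 0x3FFF (16 KB pages). */
vm_map_offset_t start             = 0x14000;
vm_map_offset_t end               = start + 0x1000;			/* 0x15000 */
vm_map_offset_t desired_empty_end = vm_map_round_page(end, 0x3FFF);	/* 0x18000 */
/* The hole must extend to 0x18000, but only [0x14000, 0x15000) is allocated. */
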
@@ -15648,7 +16235,7 @@ StartAgain: ;
                        next = entry->vme_next;
 
                        if (map->holelistenabled) {
-                               if (entry->vme_end >= end)
+                               if (entry->vme_end >= desired_empty_end)
                                        break;
                        } else {
                                /*
@@ -15663,7 +16250,7 @@ StartAgain: ;
                                if (next == vm_map_to_entry(map))
                                        break;
 
-                               if (next->vme_start >= end)
+                               if (next->vme_start >= desired_empty_end)
                                        break;
                        }
 
@@ -15674,7 +16261,7 @@ StartAgain: ;
                        entry = next;
 
                        if (map->holelistenabled) {
-                               if (entry == (vm_map_entry_t) map->holes_list) {
+                               if (entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                                        /*
                                         * Wrapped around
                                         */
@@ -15726,6 +16313,7 @@ StartAgain: ;
                 */
                if (flags & VM_FLAGS_OVERWRITE) {
                        vm_map_t zap_map;
+                       int remove_flags = VM_MAP_REMOVE_SAVE_ENTRIES | VM_MAP_REMOVE_NO_MAP_ALIGN;
 
                        /*
                         * We use a "zap_map" to avoid having to unlock
@@ -15743,9 +16331,11 @@ StartAgain: ;
                        vm_map_set_page_shift(zap_map, VM_MAP_PAGE_SHIFT(map));
                        vm_map_disable_hole_optimization(zap_map);
 
+                       if (vmk_flags.vmkf_overwrite_immutable) {
+                               remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
+                       }
                        kr = vm_map_delete(map, start, end,
-                                          (VM_MAP_REMOVE_SAVE_ENTRIES |
-                                           VM_MAP_REMOVE_NO_MAP_ALIGN),
+                                          remove_flags,
                                           zap_map);
                        if (kr == KERN_SUCCESS) {
                                vm_map_destroy(zap_map,
@@ -16342,10 +16932,76 @@ vm_map_page_range_info_internal(
 
                        disposition = 0;
                        pmap_disp = 0;
-                       pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
-                       if (map_entry->iokit_acct &&
-                           object->internal &&
-                           object->purgable == VM_PURGABLE_DENY) {
+                       if (map->has_corpse_footprint) {
+                               /*
+                                * Query the page info data we saved
+                                * while forking the corpse.
+                                */
+                               vm_map_corpse_footprint_query_page_info(
+                                       map,
+                                       curr_s_offset,
+                                       &pmap_disp);
+                       } else {
+                               /*
+                                * Query the pmap.
+                                */
+                               pmap_query_page_info(map->pmap,
+                                                    curr_s_offset,
+                                                    &pmap_disp);
+                       }
+                       if (object->purgable == VM_PURGABLE_NONVOLATILE &&
+                           /* && not tagged as no-footprint? */
+                           VM_OBJECT_OWNER(object) != NULL &&
+                           VM_OBJECT_OWNER(object)->map == map) {
+                               if ((((curr_s_offset
+                                      - map_entry->vme_start
+                                      + VME_OFFSET(map_entry))
+                                     / PAGE_SIZE) <
+                                    (object->resident_page_count +
+                                     vm_compressor_pager_get_count(object->pager)))) {
+                                       /*
+                                        * Non-volatile purgeable object owned
+                                        * by this task: report the first
+                                        * "#resident + #compressed" pages as
+                                        * "resident" (to show that they
+                                        * contribute to the footprint) but not
+                                        * "dirty" (to avoid double-counting
+                                        * with the fake "non-volatile" region
+                                        * we'll report at the end of the
+                                        * address space to account for all
+                                        * (mapped or not) non-volatile memory
+                                        * owned by this task.
+                                        */
+                                       disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
+                               }
+                       } else if ((object->purgable == VM_PURGABLE_VOLATILE ||
+                                   object->purgable == VM_PURGABLE_EMPTY) &&
+                                  /* && not tagged as no-footprint? */
+                                  VM_OBJECT_OWNER(object) != NULL &&
+                                  VM_OBJECT_OWNER(object)->map == map) {
+                               if ((((curr_s_offset
+                                      - map_entry->vme_start
+                                      + VME_OFFSET(map_entry))
+                                     / PAGE_SIZE) <
+                                    object->wired_page_count)) {
+                                       /*
+                                        * Volatile|empty purgeable object owned
+                                        * by this task: report the first
+                                        * "#wired" pages as "resident" (to
+                                        * show that they contribute to the
+                                        * footprint) but not "dirty" (to avoid
+                                        * double-counting with the fake
+                                        * "non-volatile" region we'll report
+                                        * at the end of the address space to
+                                        * account for all (mapped or not)
+                                        * non-volatile memory owned by this
+                                        * task.
+                                        */
+                                       disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
+                               }
+                       } else if (map_entry->iokit_acct &&
+                                  object->internal &&
+                                  object->purgable == VM_PURGABLE_DENY) {
                                /*
                                 * Non-purgeable IOKit memory: phys_footprint
                                 * includes the entire virtual mapping.
@@ -16356,7 +17012,25 @@ vm_map_page_range_info_internal(
                        } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
                                                PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
                                /* alternate accounting */
-//                             assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
+#if CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG)
+                               if (map->pmap->footprint_was_suspended ||
+                                   /*
+                                    * XXX corpse does not know if original
+                                    * pmap had its footprint suspended...
+                                    */
+                                   map->has_corpse_footprint) {
+                                       /*
+                                        * The assertion below can fail if dyld
+                                        * suspended footprint accounting
+                                        * while doing some adjustments to
+                                        * this page;  the mapping would say
+                                        * "use pmap accounting" but the page
+                                        * would be marked "alternate
+                                        * accounting".
+                                        */
+                               } else
+#endif /* CONFIG_EMBEDDED && (DEVELOPMENT || DEBUG) */
+                                       assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
                                pmap_disp = 0;
                        } else {
                                if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
@@ -16478,25 +17152,25 @@ vm_map_page_range_info_internal(
 
                        if (m != VM_PAGE_NULL) {
 
-                               if (m->fictitious) {
+                               if (m->vmp_fictitious) {
 
                                        disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
 
                                } else {
-                                       if (m->dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))
+                                       if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m)))
                                                disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
 
-                                       if (m->reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m)))
+                                       if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m)))
                                                disposition |= VM_PAGE_QUERY_PAGE_REF;
 
-                                       if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q)
+                                       if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q)
                                                disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
 
-                                       if (m->cs_validated)
+                                       if (m->vmp_cs_validated)
                                                disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
-                                       if (m->cs_tainted)
+                                       if (m->vmp_cs_tainted)
                                                disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
-                                       if (m->cs_nx)
+                                       if (m->vmp_cs_nx)
                                                disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
                                }
                        }
@@ -16944,10 +17618,10 @@ vm_map_reference(
        lck_mtx_lock(&map->s_lock);
 #if    TASK_SWAPPER
        assert(map->res_count > 0);
-       assert(map->ref_count >= map->res_count);
+       assert(map->map_refcnt >= map->res_count);
        map->res_count++;
 #endif
-       map->ref_count++;
+       map->map_refcnt++;
        lck_mtx_unlock(&map->s_lock);
 }
 
@@ -16968,13 +17642,13 @@ vm_map_deallocate(
                return;
 
        lck_mtx_lock(&map->s_lock);
-       ref = --map->ref_count;
+       ref = --map->map_refcnt;
        if (ref > 0) {
                vm_map_res_deallocate(map);
                lck_mtx_unlock(&map->s_lock);
                return;
        }
-       assert(map->ref_count == 0);
+       assert(map->map_refcnt == 0);
        lck_mtx_unlock(&map->s_lock);
 
 #if    TASK_SWAPPER
@@ -16986,7 +17660,7 @@ vm_map_deallocate(
         */
 #endif
 
-       vm_map_destroy(map, VM_MAP_NO_FLAGS);
+       vm_map_destroy(map, VM_MAP_REMOVE_NO_FLAGS);
 }
 
 
@@ -17035,15 +17709,43 @@ vm_map_set_64bit(vm_map_t map)
 }
 
 /*
- * Expand the maximum size of an existing map.
+ * Expand the maximum size of an existing map to the maximum supported.
  */
 void
 vm_map_set_jumbo(vm_map_t map)
 {
 #if defined (__arm64__)
+       vm_map_set_max_addr(map, ~0);
+#else /* arm64 */
+       (void) map;
+#endif
+}
+
+/*
+ * Expand the maximum size of an existing map.
+ */
+void
+vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
+{
+#if defined(__arm64__)
+       vm_map_offset_t max_supported_offset = 0;
        vm_map_offset_t old_max_offset = map->max_offset;
-       map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_JUMBO);
-       if (map->holes_list->prev->vme_end == pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE)) {
+       max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
+
+       new_max_offset = trunc_page(new_max_offset);
+
+       /* The address space cannot be shrunk using this routine. */
+       if (old_max_offset >= new_max_offset) {
+               return;
+       }
+
+       if (max_supported_offset < new_max_offset) {
+               new_max_offset = max_supported_offset;
+       }
+
+       map->max_offset = new_max_offset;
+
+       if (map->holes_list->prev->vme_end == old_max_offset) {
                /*
                 * There is already a hole at the end of the map; simply make it bigger.
                 */
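
vm_map_set_jumbo() is now a thin wrapper over the new vm_map_set_max_addr(), which truncates the requested limit to a page boundary, ignores requests that would shrink the map, clamps to pmap_max_offset(..., ARM_PMAP_MAX_OFFSET_JUMBO), and then grows (or appends) the trailing hole. A hedged usage sketch, with illustrative addresses:

/* Illustrative only: raising an arm64 task's VA limit. */
vm_map_t map = current_map();

vm_map_set_jumbo(map);			/* same as vm_map_set_max_addr(map, ~0) */

/* Or ask for an explicit limit; it is page-truncated and clamped,
 * and a smaller-than-current value is silently ignored. */
vm_map_set_max_addr(map, 0x0000001000000000ULL);
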
@@ -17061,8 +17763,9 @@ vm_map_set_jumbo(vm_map_t map)
                map->holes_list->prev->links.next = (struct vm_map_entry *)new_hole;
                map->holes_list->prev = (struct vm_map_entry *)new_hole;
        }
-#else /* arm64 */
-       (void) map;
+#else
+       (void)map;
+       (void)new_max_offset;
 #endif
 }
 
@@ -17336,8 +18039,8 @@ kern_return_t vm_map_sign(vm_map_t map,
                        return KERN_FAILURE;
                }
                /* deal with special page status */
-               if (m->busy ||
-                   (m->unusual && (m->error || m->restart || m->private || m->absent))) {
+               if (m->vmp_busy ||
+                   (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
                        vm_object_unlock(object);
                        return KERN_FAILURE;
                }
@@ -17345,18 +18048,18 @@ kern_return_t vm_map_sign(vm_map_t map,
                /* Page is OK... now "validate" it */
                /* This is the place where we'll call out to create a code
                 * directory, later */
-               m->cs_validated = TRUE;
+               m->vmp_cs_validated = TRUE;
 
                /* The page is now "clean" for codesigning purposes. That means
                 * we don't consider it as modified (wpmapped) anymore. But
                 * we'll disconnect the page so we note any future modification
                 * attempts. */
-               m->wpmapped = FALSE;
+               m->vmp_wpmapped = FALSE;
                refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
 
                /* Pull the dirty status from the pmap, since we cleared the
                 * wpmapped bit */
-               if ((refmod & VM_MEM_MODIFIED) && !m->dirty) {
+               if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
                        SET_PAGE_DIRTY(m, FALSE);
                }
 
@@ -17490,24 +18193,33 @@ vm_map_disconnect_page_mappings(
 #if CONFIG_FREEZE
 
 
-int c_freezer_swapout_count;
+int c_freezer_swapout_page_count;
 int c_freezer_compression_count = 0;
 AbsoluteTime c_freezer_last_yield_ts = 0;
 
-kern_return_t vm_map_freeze(
+extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
+extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
+
+kern_return_t
+vm_map_freeze(
                vm_map_t map,
                unsigned int *purgeable_count,
                unsigned int *wired_count,
                unsigned int *clean_count,
                unsigned int *dirty_count,
                __unused unsigned int dirty_budget,
-               boolean_t *has_shared)
+                unsigned int *shared_count,
+               int          *freezer_error_code,
+               boolean_t    eval_only)
 {
        vm_map_entry_t  entry2 = VM_MAP_ENTRY_NULL;
        kern_return_t   kr = KERN_SUCCESS;
+       boolean_t       evaluation_phase = TRUE;
+       vm_object_t     cur_shared_object = NULL;
+       int             cur_shared_obj_ref_cnt = 0;
+       unsigned int    dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
 
-       *purgeable_count = *wired_count = *clean_count = *dirty_count = 0;
-       *has_shared = FALSE;
+       *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
 
        /*
         * We need the exclusive lock here so that we can
@@ -17519,12 +18231,39 @@ kern_return_t vm_map_freeze(
        assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
 
        if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
+               if (vm_compressor_low_on_space()) {
+                       *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
+               }
+
+               if (vm_swap_low_on_space()) {
+                       *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
+               }
+
                kr = KERN_NO_SPACE;
                goto done;
        }
 
-       c_freezer_compression_count = 0;
-       clock_get_uptime(&c_freezer_last_yield_ts);
+       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
+               /*
+                * In-memory compressor backing the freezer. No disk.
+                * So no need to do the evaluation phase.
+                */
+               evaluation_phase = FALSE;
+
+               if (eval_only == TRUE) {
+                       /*
+                        * We don't support 'eval_only' mode
+                        * in this non-swap config.
+                        */
+                       *freezer_error_code = FREEZER_ERROR_GENERIC;
+                       kr = KERN_INVALID_ARGUMENT;
+                       goto done;
+               }
+
+               c_freezer_compression_count = 0;
+               clock_get_uptime(&c_freezer_last_yield_ts);
+       }
+again:
 
        for (entry2 = vm_map_first_entry(map);
             entry2 != vm_map_to_entry(map);
@@ -17544,31 +18283,118 @@ kern_return_t vm_map_freeze(
                                         * Pages belonging to this object could be swapped to disk.
                                         * Make sure it's not a shared object because we could end
                                         * up just bringing it back in again.
+                                        *
+                                        * We try to optimize somewhat by checking for objects that are mapped
+                                        * more than once within our own map. But we don't do full searches;
+                                        * we just look at the entries following our current entry.
                                         */
                                        if (src_object->ref_count > 1) {
+                                               if (src_object != cur_shared_object) {
+                                                       obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
+                                                       dirty_shared_count += obj_pages_snapshot;
+
+                                                       cur_shared_object = src_object;
+                                                       cur_shared_obj_ref_cnt = 1;
+                                                       continue;
+                                               } else {
+                                                       cur_shared_obj_ref_cnt++;
+                                                       if (src_object->ref_count == cur_shared_obj_ref_cnt) {
+                                                               /*
+                                                                * Fall through to below and treat this object as private.
+                                                                * So deduct its pages from our shared total and add it to the
+                                                                * private total.
+                                                                */
+
+                                                               dirty_shared_count -= obj_pages_snapshot;
+                                                               dirty_private_count += obj_pages_snapshot;
+                                                       } else {
+                                                               continue;
+                                                       }
+                                               }
+                                       }
+
+
+                                       if (src_object->ref_count == 1) {
+                                               dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
+                                       }
+
+                                       if (evaluation_phase == TRUE) {
+
                                                continue;
                                        }
                                }
+
                                vm_object_compressed_freezer_pageout(src_object);
 
+                               *wired_count += src_object->wired_page_count;
+
                                if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
+                                       if (vm_compressor_low_on_space()) {
+                                               *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
+                                       }
+
+                                       if (vm_swap_low_on_space()) {
+                                               *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
+                                       }
+
                                        kr = KERN_NO_SPACE;
                                        break;
                                }
                        }
                }
        }
+
+       if (evaluation_phase) {
+
+               unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
+
+               if (dirty_shared_count > shared_pages_threshold) {
+                       *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
+                       kr = KERN_FAILURE;
+                       goto done;
+               }
+
+               if (dirty_shared_count &&
+                  ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
+                       *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
+                       kr = KERN_FAILURE;
+                       goto done;
+               }
+
+               evaluation_phase = FALSE;
+               dirty_shared_count = dirty_private_count = 0;
+       
+               c_freezer_compression_count = 0;
+               clock_get_uptime(&c_freezer_last_yield_ts);
+
+               if (eval_only) {
+                       kr = KERN_SUCCESS;
+                       goto done;
+               }
+
+               goto again;
+
+       } else {
+
+               kr = KERN_SUCCESS;
+               *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
+       }
+
 done:
        vm_map_unlock(map);
 
-       vm_object_compressed_freezer_done();
+       if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
+               vm_object_compressed_freezer_done();
 
-       if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
-               /*
-                * reset the counter tracking the # of swapped c_segs
-                * because we are now done with this freeze session and task.
-                */
-               c_freezer_swapout_count = 0;
+               if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
+                       /*
+                        * reset the counter tracking the # of swapped compressed pages
+                        * because we are now done with this freeze session and task.
+                        */
+
+                       *dirty_count = c_freezer_swapout_page_count; //used to track pageouts
+                       c_freezer_swapout_page_count = 0;
+               }
        }
        return kr;
 }
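/*
 * For reference, a minimal user-space sketch of the evaluation-phase checks
 * performed above (not part of this commit): the shared-page cap and the
 * private:shared ratio test.  PAGE_SIZE_MODEL and the example tunable values
 * are assumptions, not values taken from XNU.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_MODEL 4096ULL

/* returns true when a task would pass both eval-phase policies shown above */
static bool
freeze_eval_ok(uint64_t dirty_private_pages, uint64_t dirty_shared_pages,
               unsigned int shared_mb_max, unsigned int private_shared_ratio)
{
	uint64_t shared_pages_threshold =
	    ((uint64_t)shared_mb_max * 1024 * 1024) / PAGE_SIZE_MODEL;

	if (dirty_shared_pages > shared_pages_threshold)
		return false;   /* cf. FREEZER_ERROR_EXCESS_SHARED_MEMORY */
	if (dirty_shared_pages != 0 &&
	    (dirty_private_pages / dirty_shared_pages) < private_shared_ratio)
		return false;   /* cf. FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO */
	return true;
}

int
main(void)
{
	/* e.g. 10000 private vs. 3000 shared dirty pages, 16MB cap, 2:1 ratio */
	printf("%s\n", freeze_eval_ok(10000, 3000, 16, 2) ? "freezable" : "rejected");
	return 0;
}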
@@ -17915,3 +18741,841 @@ vm_map_set_high_start(
        map->vmmap_high_start = high_start;
 }
 #endif /* __x86_64__ */
+
+#if PMAP_CS
+kern_return_t
+vm_map_entry_cs_associate(
+       vm_map_t                map,
+       vm_map_entry_t          entry,
+       vm_map_kernel_flags_t   vmk_flags)
+{
+       vm_object_t cs_object, cs_shadow;
+       vm_object_offset_t cs_offset;
+       void *cs_blobs;
+       struct vnode *cs_vnode;
+       kern_return_t cs_ret;
+
+       if (map->pmap == NULL ||
+           entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
+           VME_OBJECT(entry) == VM_OBJECT_NULL ||
+           ! (entry->protection & VM_PROT_EXECUTE)) {
+               return KERN_SUCCESS;
+       }
+
+       vm_map_lock_assert_exclusive(map);
+
+       if (entry->used_for_jit) {
+               cs_ret = pmap_cs_associate(map->pmap,
+                                          PMAP_CS_ASSOCIATE_JIT,
+                                          entry->vme_start,
+                                          entry->vme_end - entry->vme_start);
+               goto done;
+       }
+
+       if (vmk_flags.vmkf_remap_prot_copy) {
+               cs_ret = pmap_cs_associate(map->pmap,
+                                          PMAP_CS_ASSOCIATE_COW,
+                                          entry->vme_start,
+                                          entry->vme_end - entry->vme_start);
+               goto done;
+       }
+
+       vm_object_lock_shared(VME_OBJECT(entry));
+       cs_offset = VME_OFFSET(entry);
+       for (cs_object = VME_OBJECT(entry);
+            (cs_object != VM_OBJECT_NULL &&
+             !cs_object->code_signed);
+            cs_object = cs_shadow) {
+               cs_shadow = cs_object->shadow;
+               if (cs_shadow != VM_OBJECT_NULL) {
+                       cs_offset += cs_object->vo_shadow_offset;
+                       vm_object_lock_shared(cs_shadow);
+               }
+               vm_object_unlock(cs_object);
+       }
+       if (cs_object == VM_OBJECT_NULL) {
+               return KERN_SUCCESS;
+       }
+
+       cs_offset += cs_object->paging_offset;
+       cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
+       cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
+                                         &cs_blobs);
+       assert(cs_ret == KERN_SUCCESS);
+       cs_ret = cs_associate_blob_with_mapping(map->pmap,
+                                               entry->vme_start,
+                                               (entry->vme_end -
+                                                entry->vme_start),
+                                               cs_offset,
+                                               cs_blobs);
+       vm_object_unlock(cs_object);
+       cs_object = VM_OBJECT_NULL;
+
+       done:
+       if (cs_ret == KERN_SUCCESS) {
+               DTRACE_VM2(vm_map_entry_cs_associate_success,
+                          vm_map_offset_t, entry->vme_start,
+                          vm_map_offset_t, entry->vme_end);
+               if (vm_map_executable_immutable) {
+                       /*
+                        * Prevent this executable
+                        * mapping from being unmapped
+                        * or modified.
+                        */
+                       entry->permanent = TRUE;
+               }
+               /*
+                * pmap says it will validate the
+                * code-signing validity of pages
+                * faulted in via this mapping, so
+                * this map entry should be marked so
+                * that vm_fault() bypasses code-signing
+                * validation for faults coming through
+                * this mapping.
+                */
+               entry->pmap_cs_associated = TRUE;
+       } else if (cs_ret == KERN_NOT_SUPPORTED) {
+               /*
+                * pmap won't check the code-signing
+                * validity of pages faulted in via
+                * this mapping, so VM should keep
+                * doing it.
+                */
+               DTRACE_VM3(vm_map_entry_cs_associate_off,
+                          vm_map_offset_t, entry->vme_start,
+                          vm_map_offset_t, entry->vme_end,
+                          int, cs_ret);
+       } else {
+               /*
+                * A real error: do not allow
+                * execution in this mapping.
+                */
+               DTRACE_VM3(vm_map_entry_cs_associate_failure,
+                          vm_map_offset_t, entry->vme_start,
+                          vm_map_offset_t, entry->vme_end,
+                          int, cs_ret);
+               entry->protection &= ~VM_PROT_EXECUTE;
+               entry->max_protection &= ~VM_PROT_EXECUTE;
+       }
+
+       return cs_ret;
+}
+#endif /* PMAP_CS */
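/*
 * For reference, a minimal user-space sketch of the "hand-over-hand" lock
 * coupling used by the shadow-chain walk above: the next object's lock is
 * taken before the current one is dropped, so the chain cannot change
 * underneath the walker.  Not part of this commit; struct and field names
 * are illustrative, with pthread mutexes standing in for VM object locks.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct obj {
	pthread_mutex_t lock;
	bool            code_signed;
	struct obj     *shadow;
};

/* returns the first code-signed object in the chain, still locked; NULL if none */
static struct obj *
find_code_signed(struct obj *o)
{
	if (o == NULL)
		return NULL;
	pthread_mutex_lock(&o->lock);
	while (o != NULL && !o->code_signed) {
		struct obj *next = o->shadow;

		if (next != NULL)
			pthread_mutex_lock(&next->lock);   /* lock next first... */
		pthread_mutex_unlock(&o->lock);            /* ...then drop current */
		o = next;
	}
	return o;
}

int
main(void)
{
	struct obj signed_obj = { PTHREAD_MUTEX_INITIALIZER, true,  NULL };
	struct obj top        = { PTHREAD_MUTEX_INITIALIZER, false, &signed_obj };
	struct obj *found = find_code_signed(&top);

	if (found != NULL)
		pthread_mutex_unlock(&found->lock);   /* caller unlocks, like cs_object */
	return (found == &signed_obj) ? 0 : 1;
}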
+
+/*
+ * FORKED CORPSE FOOTPRINT
+ *
+ * A forked corpse gets a copy of the original VM map but its pmap is mostly
+ * empty since it never ran and never got to fault in any pages.
+ * Collecting footprint info (via "sysctl vm.self_region_footprint") for
+ * a forked corpse would therefore return very little information.
+ *
+ * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
+ * to vm_map_fork() to collect footprint information from the original VM map
+ * and its pmap, and store it in the forked corpse's VM map.  That information
+ * is stored in place of the VM map's "hole list" since we'll never need to
+ * look up holes in the corpse's map.
+ *
+ * The corpse's footprint info looks like this:
+ *
+ * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
+ * as follows:
+ *                     +---------------------------------------+
+ *            header-> | cf_size                               |
+ *                     +-------------------+-------------------+
+ *                     | cf_last_region    | cf_last_zeroes    |
+ *                     +-------------------+-------------------+
+ *           region1-> | cfr_vaddr                             |
+ *                     +-------------------+-------------------+
+ *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
+ *                     +---------------------------------------+
+ *                     | d4 | d5 | ...                         |
+ *                     +---------------------------------------+
+ *                     | ...                                   |
+ *                     +-------------------+-------------------+
+ *                     | dy | dz | na | na | cfr_vaddr...      | <-region2
+ *                     +-------------------+-------------------+
+ *                     | cfr_vaddr (ctd)   | cfr_num_pages     |
+ *                     +---------------------------------------+
+ *                     | d0 | d1 ...                           |
+ *                     +---------------------------------------+
+ *                       ...
+ *                     +---------------------------------------+
+ *       last region-> | cfr_vaddr                             |
+ *                     +---------------------------------------+
+ *                     | cfr_num_pages     | d0 | d1 | d2 | d3 |
+ *                     +---------------------------------------+
+ *                       ...
+ *                     +---------------------------------------+
+ *                     | dx | dy | dz | na | na | na | na | na |
+ *                     +---------------------------------------+
+ *
+ * where:
+ *     cf_size:        total size of the buffer (rounded to page size)
+ *     cf_last_region: offset in the buffer of the last "region" sub-header
+ *     cf_last_zeroes: number of trailing "zero" dispositions at the end
+ *                     of last region
+ *     cfr_vaddr:      virtual address of the start of the covered "region"
+ *     cfr_num_pages:  number of pages in the covered "region"
+ *     d*:             disposition of the page at that virtual address
+ * Regions in the buffer are word-aligned.
+ *
+ * We estimate the size of the buffer based on the number of memory regions
+ * and the virtual size of the address space.  While copying each memory region
+ * during vm_map_fork(), we also collect the footprint info for that region
+ * and store it in the buffer, packing it as much as possible (coalescing
+ * contiguous memory regions to avoid having too many region headers and
+ * avoiding long streaks of "zero" page dispositions by splitting footprint
+ * "regions", so the number of regions in the footprint buffer might not match
+ * the number of memory regions in the address space.
+ *
+ * We also have to copy the original task's "nonvolatile" ledgers since that's
+ * part of the footprint and will need to be reported to any tool asking for
+ * the footprint information of the forked corpse.
+ */
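/*
 * For reference, a minimal user-space model of the packed buffer layout
 * described above: a header, followed by word-aligned regions, each carrying
 * one disposition byte per page.  Not part of this commit; the kernel structs
 * are declared in vm_map.h further down, and the real region struct is
 * __attribute__((packed)) (omitted here to keep the model warning-free).
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cf_header {
	size_t   cf_size;        /* allocated buffer size */
	uint32_t cf_last_region; /* offset of last region in buffer */
	uint32_t cf_last_zeroes; /* trailing "zero" dispositions in last region */
};

struct cf_region {
	uint64_t      cfr_vaddr;         /* start of the covered range */
	uint32_t      cfr_num_pages;     /* pages covered by this region */
	unsigned char cfr_disposition[]; /* one byte per page, then word padding */
};

/* offset at which the region following "r" (located at "off") would start */
static uint32_t
next_region_offset(uint32_t off, const struct cf_region *r)
{
	uint32_t next = off + (uint32_t)sizeof(*r) + r->cfr_num_pages;

	/* round up to the next word boundary, like roundup(..., sizeof(int)) */
	return (next + (uint32_t)sizeof(int) - 1) & ~((uint32_t)sizeof(int) - 1);
}

int
main(void)
{
	uint64_t storage[32] = { 0 };          /* word-aligned backing store */
	unsigned char *buf = (unsigned char *)storage;
	struct cf_header *h = (struct cf_header *)buf;
	struct cf_region *r;

	h->cf_size = sizeof(storage);
	h->cf_last_region = sizeof(*h);        /* first region right after header */
	r = (struct cf_region *)(buf + h->cf_last_region);
	r->cfr_vaddr = 0x100000000ULL;
	r->cfr_num_pages = 5;                  /* five page dispositions follow */
	memset(r->cfr_disposition, 1, r->cfr_num_pages);

	printf("next region would start at offset %u\n",
	       next_region_offset(h->cf_last_region, r));
	return 0;
}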
+
+uint64_t vm_map_corpse_footprint_count = 0;
+uint64_t vm_map_corpse_footprint_size_avg = 0;
+uint64_t vm_map_corpse_footprint_size_max = 0;
+uint64_t vm_map_corpse_footprint_full = 0;
+uint64_t vm_map_corpse_footprint_no_buf = 0;
+
+/*
+ * vm_map_corpse_footprint_new_region:
+ *     closes the current footprint "region" and creates a new one
+ *
+ * Returns NULL if there's not enough space in the buffer for a new region.
+ */
+static struct vm_map_corpse_footprint_region *
+vm_map_corpse_footprint_new_region(
+       struct vm_map_corpse_footprint_header *footprint_header)
+{
+       uintptr_t       footprint_edge;
+       uint32_t        new_region_offset;
+       struct vm_map_corpse_footprint_region *footprint_region;
+       struct vm_map_corpse_footprint_region *new_footprint_region;
+
+       footprint_edge = ((uintptr_t)footprint_header +
+                         footprint_header->cf_size);
+       footprint_region = ((struct vm_map_corpse_footprint_region *)
+                           ((char *)footprint_header +
+                            footprint_header->cf_last_region));
+       assert((uintptr_t)footprint_region + sizeof (*footprint_region) <=
+              footprint_edge);
+
+       /* get rid of trailing zeroes in the last region */
+       assert(footprint_region->cfr_num_pages >=
+              footprint_header->cf_last_zeroes);
+       footprint_region->cfr_num_pages -=
+                       footprint_header->cf_last_zeroes;
+       footprint_header->cf_last_zeroes = 0;
+
+       /* reuse this region if it's now empty */
+       if (footprint_region->cfr_num_pages == 0) {
+               return footprint_region;
+       }
+
+       /* compute offset of new region */
+       new_region_offset = footprint_header->cf_last_region;
+       new_region_offset += sizeof (*footprint_region);
+       new_region_offset += footprint_region->cfr_num_pages;
+       new_region_offset = roundup(new_region_offset, sizeof (int));
+
+       /* check if we're going over the edge */
+       if (((uintptr_t)footprint_header +
+            new_region_offset +
+            sizeof (*footprint_region)) >=
+           footprint_edge) {
+               /* over the edge: no new region */
+               return NULL;
+       }
+
+       /* adjust offset of last region in header */
+       footprint_header->cf_last_region = new_region_offset;
+
+       new_footprint_region = (struct vm_map_corpse_footprint_region *)
+               ((char *)footprint_header +
+                footprint_header->cf_last_region);
+       new_footprint_region->cfr_vaddr = 0;
+       new_footprint_region->cfr_num_pages = 0;
+       /* caller needs to initialize new region */
+
+       return new_footprint_region;
+}
+
+/*
+ * vm_map_corpse_footprint_collect:
+ *	collects footprint information for "old_entry" in "old_map" and
+ *	stores it in "new_map"'s vmmap_corpse_footprint buffer.
+ */
+kern_return_t
+vm_map_corpse_footprint_collect(
+       vm_map_t        old_map,
+       vm_map_entry_t  old_entry,
+       vm_map_t        new_map)
+{
+       vm_map_offset_t va;
+       int             disp;
+       kern_return_t   kr;
+       struct vm_map_corpse_footprint_header *footprint_header;
+       struct vm_map_corpse_footprint_region *footprint_region;
+       struct vm_map_corpse_footprint_region *new_footprint_region;
+       unsigned char   *next_disp_p;
+       uintptr_t       footprint_edge;
+       uint32_t        num_pages_tmp;
+
+       va = old_entry->vme_start;
+
+       vm_map_lock_assert_exclusive(old_map);
+       vm_map_lock_assert_exclusive(new_map);
+
+       assert(new_map->has_corpse_footprint);
+       assert(!old_map->has_corpse_footprint);
+       if (!new_map->has_corpse_footprint ||
+           old_map->has_corpse_footprint) {
+               /*
+                * This can only transfer footprint info from a
+                * map with a live pmap to a map with a corpse footprint.
+                */
+               return KERN_NOT_SUPPORTED;
+       }
+
+       if (new_map->vmmap_corpse_footprint == NULL) {
+               vm_offset_t     buf;
+               vm_size_t       buf_size;
+
+               buf = 0;
+               buf_size = (sizeof (*footprint_header) +
+                           (old_map->hdr.nentries
+                            *
+                            (sizeof (*footprint_region) +
+                             + 3)) /* potential alignment for each region */
+                           +
+                           ((old_map->size / PAGE_SIZE)
+                            *
+                            sizeof (char))); /* disposition for each page */
+//             printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
+               buf_size = round_page(buf_size);
+
+               /* limit buffer to 1 page to validate overflow detection */
+//             buf_size = PAGE_SIZE;
+
+               /* limit size to a somewhat sane amount */
+#if CONFIG_EMBEDDED
+#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE  (256*1024)      /* 256KB */
+#else /* CONFIG_EMBEDDED */
+#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE  (8*1024*1024)   /* 8MB */
+#endif /* CONFIG_EMBEDDED */
+               if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
+                       buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
+               }
+
+               /*
+                * Allocate the pageable buffer (with a trailing guard page).
+                * It will be zero-filled on demand.
+                */
+               kr = kernel_memory_allocate(kernel_map,
+                                           &buf,
+                                           (buf_size
+                                            + PAGE_SIZE), /* trailing guard page */
+                                           0, /* mask */
+                                           KMA_PAGEABLE | KMA_GUARD_LAST,
+                                           VM_KERN_MEMORY_DIAG);
+               if (kr != KERN_SUCCESS) {
+                       vm_map_corpse_footprint_no_buf++;
+                       return kr;
+               }
+
+               /* initialize header and 1st region */
+               footprint_header = (struct vm_map_corpse_footprint_header *)buf;
+               new_map->vmmap_corpse_footprint = footprint_header;
+
+               footprint_header->cf_size = buf_size;
+               footprint_header->cf_last_region =
+                       sizeof (*footprint_header);
+               footprint_header->cf_last_zeroes = 0;
+
+               footprint_region = (struct vm_map_corpse_footprint_region *)
+                       ((char *)footprint_header +
+                        footprint_header->cf_last_region);
+               footprint_region->cfr_vaddr = 0;
+               footprint_region->cfr_num_pages = 0;
+       } else {
+               /* retrieve header and last region */
+               footprint_header = (struct vm_map_corpse_footprint_header *)
+                       new_map->vmmap_corpse_footprint;
+               footprint_region = (struct vm_map_corpse_footprint_region *)
+                       ((char *)footprint_header +
+                        footprint_header->cf_last_region);
+       }
+       footprint_edge = ((uintptr_t)footprint_header +
+                         footprint_header->cf_size);
+
+       if ((footprint_region->cfr_vaddr +
+            (((vm_map_offset_t)footprint_region->cfr_num_pages) *
+             PAGE_SIZE))
+           != old_entry->vme_start) {
+               uint64_t num_pages_delta;
+               uint32_t region_offset_delta;
+
+               /*
+                * Not the next contiguous virtual address:
+                * start a new region or store "zero" dispositions for
+                * the missing pages?
+                */
+               /* size of gap in actual page dispositions */
+               num_pages_delta = (((old_entry->vme_start -
+                                    footprint_region->cfr_vaddr) / PAGE_SIZE)
+                                  - footprint_region->cfr_num_pages);
+               /* size of gap as a new footprint region header */
+               region_offset_delta =
+                       (sizeof (*footprint_region) +
+                        roundup((footprint_region->cfr_num_pages -
+                                 footprint_header->cf_last_zeroes),
+                                sizeof (int)) -
+                        (footprint_region->cfr_num_pages -
+                         footprint_header->cf_last_zeroes));
+//             printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
+               if (region_offset_delta < num_pages_delta ||
+                   os_add3_overflow(footprint_region->cfr_num_pages,
+                                    (uint32_t) num_pages_delta,
+                                    1,
+                                    &num_pages_tmp)) {
+                       /*
+                        * Storing data for this gap would take more space
+                        * than inserting a new footprint region header:
+                        * let's start a new region and save space. If it's a
+                        * tie, let's avoid using a new region, since that
+                        * would require more region hops to find the right
+                        * range during lookups.
+                        *
+                        * If the current region's cfr_num_pages would overflow
+                        * if we added "zero" page dispositions for the gap,
+                        * no choice but to start a new region.
+                        */
+//                     printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
+                       new_footprint_region =
+                               vm_map_corpse_footprint_new_region(footprint_header);
+                       /* check that we're not going over the edge */
+                       if (new_footprint_region == NULL) {
+                               goto over_the_edge;
+                       }
+                       footprint_region = new_footprint_region;
+                       /* initialize new region as empty */
+                       footprint_region->cfr_vaddr = old_entry->vme_start;
+                       footprint_region->cfr_num_pages = 0;
+               } else {
+                       /*
+                        * Store "zero" page dispositions for the missing
+                        * pages.
+                        */
+//                     printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
+                       for (; num_pages_delta > 0; num_pages_delta--) {
+                               next_disp_p =
+                                       ((unsigned char *) footprint_region +
+                                        sizeof (*footprint_region) +
+                                        footprint_region->cfr_num_pages);
+                               /* check that we're not going over the edge */
+                               if ((uintptr_t)next_disp_p >= footprint_edge) {
+                                       goto over_the_edge;
+                               }
+                               /* store "zero" disposition for this gap page */
+                               footprint_region->cfr_num_pages++;
+                               *next_disp_p = (unsigned char) 0;
+                               footprint_header->cf_last_zeroes++;
+                       }
+               }
+       }
+
+       for (va = old_entry->vme_start;
+            va < old_entry->vme_end;
+            va += PAGE_SIZE) {
+               vm_object_t     object;
+
+               object = VME_OBJECT(old_entry);
+               if (!old_entry->is_sub_map &&
+                   old_entry->iokit_acct &&
+                   object != VM_OBJECT_NULL &&
+                   object->internal &&
+                   object->purgable == VM_PURGABLE_DENY) {
+                       /*
+                        * Non-purgeable IOKit memory: phys_footprint
+                        * includes the entire virtual mapping.
+                        * Since the forked corpse's VM map entry will not
+                        * have "iokit_acct", pretend that this page's
+                        * disposition is "present & internal", so that it
+                        * shows up in the forked corpse's footprint.
+                        */
+                       disp = (PMAP_QUERY_PAGE_PRESENT |
+                               PMAP_QUERY_PAGE_INTERNAL);
+               } else {
+                       disp = 0;
+                       pmap_query_page_info(old_map->pmap,
+                                            va,
+                                            &disp);
+               }
+
+//             if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
+
+               if (disp == 0 && footprint_region->cfr_num_pages == 0) {
+                       /*
+                        * Ignore "zero" dispositions at start of
+                        * region: just move start of region.
+                        */
+                       footprint_region->cfr_vaddr += PAGE_SIZE;
+                       continue;
+               }
+
+               /* would region's cfr_num_pages overflow? */
+               if (os_add_overflow(footprint_region->cfr_num_pages, 1,
+                                   &num_pages_tmp)) {
+                       /* overflow: create a new region */
+                       new_footprint_region =
+                               vm_map_corpse_footprint_new_region(
+                                       footprint_header);
+                       if (new_footprint_region == NULL) {
+                               goto over_the_edge;
+                       }
+                       footprint_region = new_footprint_region;
+                       footprint_region->cfr_vaddr = va;
+                       footprint_region->cfr_num_pages = 0;
+               }
+
+               next_disp_p = ((unsigned char *)footprint_region +
+                              sizeof (*footprint_region) +
+                              footprint_region->cfr_num_pages);
+               /* check that we're not going over the edge */
+               if ((uintptr_t)next_disp_p >= footprint_edge) {
+                       goto over_the_edge;
+               }
+               /* store this disposition */
+               *next_disp_p = (unsigned char) disp;
+               footprint_region->cfr_num_pages++;
+
+               if (disp != 0) {
+                       /* non-zero disp: break the current zero streak */
+                       footprint_header->cf_last_zeroes = 0;
+                       /* done */
+                       continue;
+               }
+
+               /* zero disp: add to the current streak of zeroes */
+               footprint_header->cf_last_zeroes++;
+               if ((footprint_header->cf_last_zeroes +
+                    roundup((footprint_region->cfr_num_pages -
+                             footprint_header->cf_last_zeroes) &
+                            (sizeof (int) - 1),
+                            sizeof (int))) <
+                   (sizeof (*footprint_header))) {
+                       /*
+                        * There are not enough trailing "zero" dispositions
+                        * (+ the extra padding we would need for the previous
+                        * region); creating a new region would not save space
+                        * at this point, so let's keep this "zero" disposition
+                        * in this region and reconsider later.
+                        */
+                       continue;
+               }
+               /*
+                * Create a new region to avoid having too many consecutive
+                * "zero" dispositions.
+                */
+               new_footprint_region =
+                       vm_map_corpse_footprint_new_region(footprint_header);
+               if (new_footprint_region == NULL) {
+                       goto over_the_edge;
+               }
+               footprint_region = new_footprint_region;
+               /* initialize the new region as empty ... */
+               footprint_region->cfr_num_pages = 0;
+               /* ... and skip this "zero" disp */
+               footprint_region->cfr_vaddr = va + PAGE_SIZE;
+       }
+
+       return KERN_SUCCESS;
+
+over_the_edge:
+//     printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
+       vm_map_corpse_footprint_full++;
+       return KERN_RESOURCE_SHORTAGE;
+}
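/*
 * For reference, a minimal user-space model of the buffer-size estimate used
 * above when the footprint buffer is first allocated: one region header (plus
 * worst-case word padding) per map entry and one disposition byte per page,
 * rounded to a page and capped.  Not part of this commit; the page size, the
 * cap and the example inputs are assumptions.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_MODEL 4096ULL
#define FOOTPRINT_CAP   (8ULL * 1024 * 1024)   /* e.g. the non-embedded 8MB cap */

static uint64_t
footprint_buf_size(uint64_t nentries, uint64_t map_size,
                   uint64_t header_size, uint64_t region_size)
{
	uint64_t size = header_size
	    + nentries * (region_size + 3)          /* potential alignment per region */
	    + (map_size / PAGE_SIZE_MODEL);         /* one disposition byte per page */

	/* round_page, then clamp to the maximum footprint buffer size */
	size = (size + PAGE_SIZE_MODEL - 1) & ~(PAGE_SIZE_MODEL - 1);
	return (size > FOOTPRINT_CAP) ? FOOTPRINT_CAP : size;
}

int
main(void)
{
	/* e.g. 100 map entries covering 64MB of virtual space */
	printf("estimated buffer: %llu bytes\n",
	       (unsigned long long)footprint_buf_size(100, 64ULL << 20, 16, 12));
	return 0;
}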
+
+/*
+ * vm_map_corpse_footprint_collect_done:
+ *     completes the footprint collection by getting rid of any remaining
+ *     trailing "zero" dispositions and trimming the unused part of the
+ *     kernel buffer
+ */
+void
+vm_map_corpse_footprint_collect_done(
+       vm_map_t        new_map)
+{
+       struct vm_map_corpse_footprint_header *footprint_header;
+       struct vm_map_corpse_footprint_region *footprint_region;
+       vm_size_t       buf_size, actual_size;
+       kern_return_t   kr;
+
+       assert(new_map->has_corpse_footprint);
+       if (!new_map->has_corpse_footprint ||
+           new_map->vmmap_corpse_footprint == NULL) {
+               return;
+       }
+
+       footprint_header = (struct vm_map_corpse_footprint_header *)
+               new_map->vmmap_corpse_footprint;
+       buf_size = footprint_header->cf_size;
+
+       footprint_region = (struct vm_map_corpse_footprint_region *)
+               ((char *)footprint_header +
+                footprint_header->cf_last_region);
+
+       /* get rid of trailing zeroes in last region */
+       assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
+       footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
+       footprint_header->cf_last_zeroes = 0;
+
+       actual_size = (vm_size_t)(footprint_header->cf_last_region +
+                                 sizeof (*footprint_region) +
+                                 footprint_region->cfr_num_pages);
+
+//     printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
+       vm_map_corpse_footprint_size_avg = 
+               (((vm_map_corpse_footprint_size_avg *
+                  vm_map_corpse_footprint_count) +
+                 actual_size) /
+                (vm_map_corpse_footprint_count + 1));
+       vm_map_corpse_footprint_count++;
+       if (actual_size > vm_map_corpse_footprint_size_max) {
+               vm_map_corpse_footprint_size_max = actual_size;
+       }
+
+       actual_size = round_page(actual_size);
+       if (buf_size > actual_size) {
+               kr = vm_deallocate(kernel_map,
+                                  ((vm_address_t)footprint_header +
+                                   actual_size +
+                                   PAGE_SIZE), /* trailing guard page */
+                                  (buf_size - actual_size));
+               assertf(kr == KERN_SUCCESS,
+                       "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
+                       footprint_header,
+                       (uint64_t) buf_size,
+                       (uint64_t) actual_size,
+                       kr);
+               kr = vm_protect(kernel_map,
+                               ((vm_address_t)footprint_header +
+                                actual_size),
+                               PAGE_SIZE,
+                               FALSE, /* set_maximum */
+                               VM_PROT_NONE);
+               assertf(kr == KERN_SUCCESS,
+                       "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
+                       footprint_header,
+                       (uint64_t) buf_size,
+                       (uint64_t) actual_size,
+                       kr);
+       }
+
+       footprint_header->cf_size = actual_size;
+}
+
+/*
+ * vm_map_corpse_footprint_query_page_info:
+ *	retrieves the disposition of the page at virtual address "va"
+ *     in the forked corpse's VM map
+ *
+ * This is the equivalent of pmap_query_page_info() for a forked corpse.
+ */
+kern_return_t
+vm_map_corpse_footprint_query_page_info(
+       vm_map_t        map,
+       vm_map_offset_t va,
+       int             *disp)
+{
+       struct vm_map_corpse_footprint_header *footprint_header;
+       struct vm_map_corpse_footprint_region *footprint_region;
+       uint32_t        footprint_region_offset;
+       vm_map_offset_t region_start, region_end;
+       int             disp_idx;
+       kern_return_t   kr;
+
+       if (!map->has_corpse_footprint) {
+               *disp = 0;
+               kr = KERN_INVALID_ARGUMENT;
+               goto done;
+       }
+
+       footprint_header = map->vmmap_corpse_footprint;
+       if (footprint_header == NULL) {
+               *disp = 0;
+//             if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp);
+               kr = KERN_INVALID_ARGUMENT;
+               goto done;
+       }
+
+       /* start looking at the hint ("cf_hint_region") */
+       footprint_region_offset = footprint_header->cf_hint_region;
+
+lookup_again:
+       if (footprint_region_offset < sizeof (*footprint_header)) {
+               /* hint too low: start from 1st region */
+               footprint_region_offset = sizeof (*footprint_header);
+       }
+       if (footprint_region_offset >= footprint_header->cf_last_region) {
+               /* hint too high: re-start from 1st region */
+               footprint_region_offset = sizeof (*footprint_header);
+       }
+       footprint_region = (struct vm_map_corpse_footprint_region *)
+               ((char *)footprint_header + footprint_region_offset);
+       region_start = footprint_region->cfr_vaddr;
+       region_end = (region_start +
+                     ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
+                      PAGE_SIZE));
+       if (va < region_start &&
+           footprint_region_offset != sizeof (*footprint_header)) {
+               /* our range starts before the hint region */
+
+               /* reset the hint (in a racy way...) */
+               footprint_header->cf_hint_region = sizeof (*footprint_header);
+               /* lookup "va" again from 1st region */
+               footprint_region_offset = sizeof (*footprint_header);
+               goto lookup_again;
+       }
+
+       while (va >= region_end) {
+               if (footprint_region_offset >= footprint_header->cf_last_region) {
+                       break;
+               }
+               /* skip the region's header */
+               footprint_region_offset += sizeof (*footprint_region);
+               /* skip the region's page dispositions */
+               footprint_region_offset += footprint_region->cfr_num_pages;
+               /* align to next word boundary */
+               footprint_region_offset =
+                       roundup(footprint_region_offset,
+                               sizeof (int));
+               footprint_region = (struct vm_map_corpse_footprint_region *)
+                       ((char *)footprint_header + footprint_region_offset);
+               region_start = footprint_region->cfr_vaddr;
+               region_end = (region_start +
+                             ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
+                              PAGE_SIZE));
+       }
+       if (va < region_start || va >= region_end) {
+               /* page not found */
+               *disp = 0;
+//             if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp);
+               kr = KERN_SUCCESS;
+               goto done;
+       }
+
+       /* "va" found: set the lookup hint for next lookup (in a racy way...) */
+       footprint_header->cf_hint_region = footprint_region_offset;
+
+       /* get page disposition for "va" in this region */
+       disp_idx = (int) ((va - footprint_region->cfr_vaddr) / PAGE_SIZE);
+       *disp = (int) (footprint_region->cfr_disposition[disp_idx]);
+
+       kr = KERN_SUCCESS;
+done:
+//     if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disp);
+       /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
+       DTRACE_VM4(footprint_query_page_info,
+                  vm_map_t, map,
+                  vm_map_offset_t, va,
+                  int, *disp,
+                  kern_return_t, kr);
+
+       return kr;
+}
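/*
 * For reference, once the right region is found the lookup above reduces to a
 * bounds check plus an index into the packed disposition bytes.  A minimal
 * user-space model, not part of this commit; the page size and the example
 * dispositions are assumptions.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_MODEL 4096ULL

/* returns the disposition byte for "va", or 0 if "va" falls outside the region */
static int
region_disp(uint64_t region_vaddr, uint32_t num_pages,
            const unsigned char *dispositions, uint64_t va)
{
	uint64_t region_end = region_vaddr + (uint64_t)num_pages * PAGE_SIZE_MODEL;

	if (va < region_vaddr || va >= region_end)
		return 0;               /* page not covered: "zero" disposition */
	return dispositions[(va - region_vaddr) / PAGE_SIZE_MODEL];
}

int
main(void)
{
	unsigned char disp[4] = { 3, 0, 1, 3 };  /* made-up per-page dispositions */
	uint64_t base = 0x100000000ULL;

	printf("%d\n", region_disp(base, 4, disp, base + 2 * PAGE_SIZE_MODEL)); /* 1 */
	printf("%d\n", region_disp(base, 4, disp, base + 9 * PAGE_SIZE_MODEL)); /* 0 */
	return 0;
}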
+
+
+static void
+vm_map_corpse_footprint_destroy(
+       vm_map_t        map)
+{
+       if (map->has_corpse_footprint &&
+           map->vmmap_corpse_footprint != 0) {
+               struct vm_map_corpse_footprint_header *footprint_header;
+               vm_size_t buf_size;
+               kern_return_t kr;
+
+               footprint_header = map->vmmap_corpse_footprint;
+               buf_size = footprint_header->cf_size;
+               kr = vm_deallocate(kernel_map,
+                                  (vm_offset_t) map->vmmap_corpse_footprint,
+                                  ((vm_size_t) buf_size
+                                   + PAGE_SIZE)); /* trailing guard page */
+               assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
+               map->vmmap_corpse_footprint = 0;
+               map->has_corpse_footprint = FALSE;
+       }
+}
+
+/*
+ * vm_map_copy_footprint_ledgers:
+ *     copies any ledger that's relevant to the memory footprint of "old_task"
+ *     into the forked corpse's task ("new_task")
+ */
+void
+vm_map_copy_footprint_ledgers(
+       task_t  old_task,
+       task_t  new_task)
+{
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
+       vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
+	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
+}
+
+/*
+ * vm_map_copy_ledger:
+ *	copies a single ledger from "old_task" to "new_task"
+ */
+void
+vm_map_copy_ledger(
+       task_t  old_task,
+       task_t  new_task,
+       int     ledger_entry)
+{
+       ledger_amount_t old_balance, new_balance, delta;
+
+       assert(new_task->map->has_corpse_footprint);
+       if (!new_task->map->has_corpse_footprint)
+               return;
+
+       /* turn off sanity checks for the ledger we're about to mess with */
+       ledger_disable_panic_on_negative(new_task->ledger,
+                                        ledger_entry);
+
+       /* adjust "new_task" to match "old_task" */
+       ledger_get_balance(old_task->ledger,
+                          ledger_entry,
+                          &old_balance);
+       ledger_get_balance(new_task->ledger,
+                          ledger_entry,
+                          &new_balance);
+       if (new_balance == old_balance) {
+               /* new == old: done */
+       } else if (new_balance > old_balance) {
+               /* new > old ==> new -= new - old */
+               delta = new_balance - old_balance;
+               ledger_debit(new_task->ledger,
+                            ledger_entry,
+                            delta);
+       } else {
+               /* new < old ==> new += old - new */
+               delta = old_balance - new_balance;
+               ledger_credit(new_task->ledger,
+                             ledger_entry,
+                             delta);
+       }
+}
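/*
 * For reference, the balance adjustment above amounts to "apply the signed
 * difference": debit when the corpse's balance is too high, credit when it is
 * too low.  A minimal user-space model with plain integers standing in for
 * ledger_debit()/ledger_credit(); not part of this commit.
 */
#include <stdint.h>
#include <stdio.h>

typedef int64_t ledger_amount_model_t;

/* make "new_balance" match "old_balance", mirroring vm_map_copy_ledger() */
static ledger_amount_model_t
reconcile(ledger_amount_model_t new_balance, ledger_amount_model_t old_balance)
{
	if (new_balance > old_balance)
		new_balance -= new_balance - old_balance;   /* debit the excess */
	else if (new_balance < old_balance)
		new_balance += old_balance - new_balance;   /* credit the shortfall */
	return new_balance;
}

int
main(void)
{
	printf("%lld\n", (long long)reconcile(100, 350));   /* credits 250 -> 350 */
	printf("%lld\n", (long long)reconcile(500, 350));   /* debits 150 -> 350 */
	return 0;
}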
index 23592b8e4d01130c82b18f0873a3b3e45dfe31dd..44cef715dae6faaf76c4fa061d21094f14b87aa7 100644 (file)
@@ -91,11 +91,12 @@ extern vm_map_t current_map(void);
 
 /* Setup reserved areas in a new VM map */
 extern kern_return_t   vm_map_exec(
-                               vm_map_t                new_map,
-                               task_t                  task,
-                               boolean_t               is64bit,
-                               void                    *fsroot,
-                               cpu_type_t              cpu);
+       vm_map_t                new_map,
+       task_t                  task,
+       boolean_t               is64bit,
+       void                    *fsroot,
+       cpu_type_t              cpu,
+       cpu_subtype_t           cpu_subtype);
 
 __END_DECLS
 
@@ -147,6 +148,9 @@ typedef union vm_map_object {
 #define named_entry_lock_destroy(object)       lck_mtx_destroy(&(object)->Lock, &vm_object_lck_grp)
 #define named_entry_lock(object)               lck_mtx_lock(&(object)->Lock)
 #define named_entry_unlock(object)             lck_mtx_unlock(&(object)->Lock)   
+#if VM_NAMED_ENTRY_LIST
+extern queue_head_t vm_named_entry_list;
+#endif /* VM_NAMED_ENTRY_LIST */
 
 /*
  *     Type:           vm_named_entry_t [internal use only]
@@ -182,6 +186,13 @@ struct vm_named_entry {
        /* boolean_t */         internal:1,     /* ... an internal object */
        /* boolean_t */         is_sub_map:1,   /* ... a submap? */
        /* boolean_t */         is_copy:1;      /* ... a VM map copy */
+#if VM_NAMED_ENTRY_LIST
+       queue_chain_t           named_entry_list;
+       int                     named_entry_alias;
+       mach_port_t             named_entry_port;
+#define NAMED_ENTRY_BT_DEPTH 16
+       void                    *named_entry_bt[NAMED_ENTRY_BT_DEPTH];
+#endif /* VM_NAMED_ENTRY_LIST */
 };
 
 /*
@@ -323,6 +334,7 @@ struct vm_map_entry {
                                             * this entry it is being deleted
                                             * without unwiring them */
        /* boolean_t */ used_for_jit:1,
+       /* boolean_t */ pmap_cs_associated:1, /* pmap_cs will validate */
        /* boolean_t */ from_reserved_zone:1, /* Allocated from
                                               * kernel reserved zone    */
 
@@ -331,7 +343,7 @@ struct vm_map_entry {
        /* boolean_t */ vme_resilient_codesign:1,
        /* boolean_t */ vme_resilient_media:1,
        /* boolean_t */ vme_atomic:1, /* entry cannot be split/coalesced */
-               __unused:5;
+               __unused:4;
 ;
 
        unsigned short          wired_count;    /* can be paged if = 0 */
@@ -405,7 +417,7 @@ struct vm_map_header {
  *             quickly find free space.
  */
 struct _vm_map {
-       lck_rw_t                        lock;           /* map lock */
+       lck_rw_t                lock;           /* map lock */
        struct vm_map_header    hdr;            /* Map entry header */
 #define min_offset             hdr.links.start /* start of range */
 #define max_offset             hdr.links.end   /* end of range */
@@ -433,24 +445,30 @@ struct _vm_map {
        } vmu1;
 #define highest_entry_end      vmu1.vmu1_highest_entry_end
 #define lowest_unnestable_start        vmu1.vmu1_lowest_unnestable_start
-
-       int                     ref_count;      /* Reference count */
-#if    TASK_SWAPPER
-       int                     res_count;      /* Residence count (swap) */
-       int                     sw_state;       /* Swap state */
-#endif /* TASK_SWAPPER */
        decl_lck_mtx_data(,     s_lock)         /* Lock ref, res fields */
        lck_mtx_ext_t           s_lock_ext;
        vm_map_entry_t          hint;           /* hint for quick lookups */
-       struct vm_map_links*    hole_hint;      /* hint for quick hole lookups */
+       union {
+               struct vm_map_links* vmmap_hole_hint;   /* hint for quick hole lookups */
+               struct vm_map_corpse_footprint_header *vmmap_corpse_footprint;
+       } vmmap_u_1;
+#define hole_hint vmmap_u_1.vmmap_hole_hint
+#define vmmap_corpse_footprint vmmap_u_1.vmmap_corpse_footprint
        union{
                vm_map_entry_t          _first_free;    /* First free space hint */
                struct vm_map_links*    _holes;         /* links all holes between entries */
-       }f_s;                                           /* Union for free space data structures being used */
+       } f_s;                                          /* Union for free space data structures being used */
 
 #define first_free             f_s._first_free
 #define holes_list             f_s._holes
 
+       int                     map_refcnt;     /* Reference count */
+
+#if    TASK_SWAPPER
+       int                     res_count;      /* Residence count (swap) */
+       int                     sw_state;       /* Swap state */
+#endif /* TASK_SWAPPER */
+
        unsigned int            
        /* boolean_t */         wait_for_space:1, /* Should callers wait for space? */
        /* boolean_t */         wiring_required:1, /* All memory wired? */
@@ -462,14 +480,15 @@ struct _vm_map {
        /* boolean_t */         holelistenabled:1,
        /* boolean_t */         is_nested_map:1,
        /* boolean_t */         map_disallow_new_exec:1, /* Disallow new executable code */
-       /* reserved */          pad:22;
+       /* boolean_t */         jit_entry_exists:1,
+       /* boolean_t */         has_corpse_footprint:1,
+       /* boolean_t */         warned_delete_gap:1,
+       /* reserved */          pad:19;
        unsigned int            timestamp;      /* Version number */
-       unsigned int            color_rr;       /* next color (not protected by a lock) */
-
-       boolean_t               jit_entry_exists;
-} ;
+};
 
-#define vm_map_to_entry(map)   ((struct vm_map_entry *) &(map)->hdr.links)
+#define CAST_TO_VM_MAP_ENTRY(x) ((struct vm_map_entry *)(uintptr_t)(x))
+#define vm_map_to_entry(map) CAST_TO_VM_MAP_ENTRY(&(map)->hdr.links)
 #define vm_map_first_entry(map)        ((map)->hdr.links.next)
 #define vm_map_last_entry(map) ((map)->hdr.links.prev)
 
@@ -563,8 +582,7 @@ struct vm_map_copy {
  *     Useful macros for entry list copy objects
  */
 
-#define vm_map_copy_to_entry(copy)             \
-               ((struct vm_map_entry *) &(copy)->cpy_hdr.links)
+#define vm_map_copy_to_entry(copy) CAST_TO_VM_MAP_ENTRY(&(copy)->cpy_hdr.links)
 #define vm_map_copy_first_entry(copy)          \
                ((copy)->cpy_hdr.links.next)
 #define vm_map_copy_last_entry(copy)           \
@@ -745,7 +763,7 @@ MACRO_BEGIN                                 \
        if (Map) {                              \
                lck_mtx_lock(&Map->s_lock);     \
                Map->res_count++;               \
-               Map->ref_count++;               \
+               Map->map_refcnt++;              \
                lck_mtx_unlock(&Map->s_lock);   \
        }                                       \
 MACRO_END
@@ -780,7 +798,7 @@ MACRO_END
 MACRO_BEGIN                            \
        vm_map_t Map = (map);           \
        lck_mtx_lock(&Map->s_lock);     \
-       ++Map->ref_count;               \
+       ++Map->map_refcnt;              \
        vm_map_res_reference(Map);      \
        lck_mtx_unlock(&Map->s_lock);   \
 MACRO_END
@@ -799,7 +817,7 @@ MACRO_BEGIN                                 \
        vm_map_t Map = (map);                   \
        if (Map) {                              \
                lck_mtx_lock(&Map->s_lock);     \
-               Map->ref_count++;               \
+               Map->map_refcnt++;              \
                lck_mtx_unlock(&Map->s_lock);   \
        }                                       \
 MACRO_END
@@ -953,6 +971,7 @@ extern vm_map_t             vm_map_fork(
                                int                     options);
 #define VM_MAP_FORK_SHARE_IF_INHERIT_NONE      0x00000001
 #define VM_MAP_FORK_PRESERVE_PURGEABLE         0x00000002
+#define VM_MAP_FORK_CORPSE_FOOTPRINT           0x00000004
 
 /* Change inheritance */
 extern kern_return_t   vm_map_inherit(
@@ -1049,6 +1068,13 @@ extern kern_return_t vm_map_set_cache_attr(
 
 extern int override_nx(vm_map_t map, uint32_t user_tag);
 
+#if PMAP_CS
+extern kern_return_t vm_map_entry_cs_associate(
+       vm_map_t                map,
+       vm_map_entry_t          entry,
+       vm_map_kernel_flags_t   vmk_flags);
+#endif /* PMAP_CS */
+
 extern void vm_map_region_top_walk(
         vm_map_entry_t entry,
        vm_region_top_info_t top);
@@ -1062,6 +1088,46 @@ extern void vm_map_region_walk(
        boolean_t look_for_pages,
        mach_msg_type_number_t count);
 
+
+struct vm_map_corpse_footprint_header {
+       vm_size_t       cf_size;        /* allocated buffer size */
+       uint32_t        cf_last_region; /* offset of last region in buffer */
+       union {
+               uint32_t cfu_last_zeroes; /* during creation:
+                                         * number of "zero" dispositions at
+                                         * end of last region */
+               uint32_t cfu_hint_region; /* during lookup:
+                                         * offset of last looked up region */
+#define cf_last_zeroes cfu.cfu_last_zeroes
+#define cf_hint_region cfu.cfu_hint_region
+       } cfu;
+};
+struct vm_map_corpse_footprint_region {
+       vm_map_offset_t cfr_vaddr;      /* region start virtual address */
+       uint32_t        cfr_num_pages;  /* number of pages in this "region" */
+       unsigned char   cfr_disposition[0];     /* disposition of each page */
+} __attribute__((packed));
+
+extern kern_return_t vm_map_corpse_footprint_collect(
+       vm_map_t        old_map,
+       vm_map_entry_t  old_entry,
+       vm_map_t        new_map);
+extern void vm_map_corpse_footprint_collect_done(
+       vm_map_t        new_map);
+
+extern kern_return_t vm_map_corpse_footprint_query_page_info(
+       vm_map_t        map,
+       vm_map_offset_t va,
+       int             *disp);
+
+extern void vm_map_copy_footprint_ledgers(
+       task_t  old_task,
+       task_t  new_task);
+extern void vm_map_copy_ledger(
+       task_t  old_task,
+       task_t  new_task,
+       int     ledger_entry);
+
 #endif /* MACH_KERNEL_PRIVATE */
 
 __BEGIN_DECLS
@@ -1072,6 +1138,15 @@ extern vm_map_t          vm_map_create(
                                vm_map_offset_t         min_off,
                                vm_map_offset_t         max_off,
                                boolean_t               pageable);
+extern vm_map_t vm_map_create_options(
+       pmap_t                  pmap,
+       vm_map_offset_t         min_off,
+       vm_map_offset_t         max_off,
+       int                     options);
+#define VM_MAP_CREATE_PAGEABLE         0x00000001
+#define VM_MAP_CREATE_CORPSE_FOOTPRINT 0x00000002
+#define VM_MAP_CREATE_ALL_OPTIONS (VM_MAP_CREATE_PAGEABLE | \
+                                  VM_MAP_CREATE_CORPSE_FOOTPRINT)
 
 extern void            vm_map_disable_hole_optimization(vm_map_t map);
 
@@ -1320,6 +1395,9 @@ extern void               vm_map_set_32bit(
 extern void            vm_map_set_jumbo(
                                vm_map_t                map);
 
+extern void            vm_map_set_max_addr(
+                               vm_map_t                map, vm_map_offset_t new_max_offset);
+
 extern boolean_t       vm_map_has_hard_pagezero(
                                vm_map_t                map,
                                vm_map_offset_t         pagezero_size);
@@ -1479,7 +1557,7 @@ extern kern_return_t vm_map_set_page_shift(vm_map_t map, int pageshift);
 /*
  * Flags for vm_map_remove() and vm_map_delete()
  */
-#define        VM_MAP_NO_FLAGS                 0x0
+#define        VM_MAP_REMOVE_NO_FLAGS          0x0
 #define        VM_MAP_REMOVE_KUNWIRE           0x1
 #define        VM_MAP_REMOVE_INTERRUPTIBLE     0x2
 #define        VM_MAP_REMOVE_WAIT_FOR_KWIRE    0x4
@@ -1488,6 +1566,7 @@ extern kern_return_t vm_map_set_page_shift(vm_map_t map, int pageshift);
 #define VM_MAP_REMOVE_NO_MAP_ALIGN     0x20
 #define VM_MAP_REMOVE_NO_UNNESTING     0x40
 #define VM_MAP_REMOVE_IMMUTABLE                0x80
+#define VM_MAP_REMOVE_GAPS_OK          0x100
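Editor's note: the generic VM_MAP_NO_FLAGS name is retired in favour of VM_MAP_REMOVE_NO_FLAGS, and the new VM_MAP_REMOVE_GAPS_OK bit lets a caller tolerate ranges that are not fully mapped. A hedged usage sketch, assuming vm_map_remove() takes the flag word as its last argument (map, start and end are placeholders):

    /* No special behaviour. */
    (void)vm_map_remove(map, start, end, VM_MAP_REMOVE_NO_FLAGS);

    /* Unwire kernel wirings and tolerate unmapped gaps in [start, end). */
    (void)vm_map_remove(map, start, end,
                        VM_MAP_REMOVE_KUNWIRE | VM_MAP_REMOVE_GAPS_OK);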
 
 /* Support for UPLs from vm_maps */
 
@@ -1535,13 +1614,23 @@ extern int vm_map_disconnect_page_mappings(
 #if CONFIG_FREEZE
 
 extern kern_return_t vm_map_freeze(
-               vm_map_t map,
+               vm_map_t     map,
                unsigned int *purgeable_count,
                unsigned int *wired_count,
                unsigned int *clean_count,
                unsigned int *dirty_count,
                unsigned int dirty_budget,
-               boolean_t *has_shared);
+                unsigned int *shared_count,
+               int          *freezer_error_code,
+               boolean_t    eval_only);
+
+
+#define FREEZER_ERROR_GENERIC                  (-1)
+#define FREEZER_ERROR_EXCESS_SHARED_MEMORY     (-2)
+#define FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO (-3)
+#define FREEZER_ERROR_NO_COMPRESSOR_SPACE      (-4)
+#define FREEZER_ERROR_NO_SWAP_SPACE            (-5)
+
 #endif
 
 __END_DECLS
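Editor's note: vm_map_freeze() now reports a shared-page count and a specific failure reason instead of the single has_shared boolean, and eval_only allows a dry run. A hedged sketch of how a freezer caller might consume the new out-parameters (variable names are illustrative):

    unsigned int purgeable, wired, clean, dirty, shared;
    int freezer_error = 0;
    kern_return_t kr;

    kr = vm_map_freeze(map, &purgeable, &wired, &clean, &dirty,
                       dirty_budget, &shared, &freezer_error,
                       TRUE /* eval_only: just ask whether freezing would work */);

    if (kr != KERN_SUCCESS) {
            switch (freezer_error) {
            case FREEZER_ERROR_EXCESS_SHARED_MEMORY:
            case FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO:
                    /* too much shared memory for freezing to pay off */
                    break;
            case FREEZER_ERROR_NO_COMPRESSOR_SPACE:
            case FREEZER_ERROR_NO_SWAP_SPACE:
                    /* back off until compressor or swap space frees up */
                    break;
            default:
                    break;
            }
    }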
index 26b3477a4671e079a9b6a5d828ee298d0096fefe..8690d27ad8c3bbad888f0e7fa077fab490ac6468 100644 (file)
@@ -96,33 +96,6 @@ vm_map_store_update( vm_map_t map, vm_map_entry_t entry, int update_type )
        }
 }
 
-void   vm_map_store_copy_insert( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy)
-{
-       if (__improbable(vm_debug_events)) {
-               vm_map_entry_t entry;
-               for (entry = vm_map_copy_first_entry(copy); entry != vm_map_copy_to_entry(copy); entry = entry->vme_next) {
-                       DTRACE_VM4(map_entry_link_copy, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->links.start, vm_address_t, entry->links.end);
-               }
-       }
-
-       if (map->holelistenabled) {
-               vm_map_entry_t entry = NULL;
-
-               entry = vm_map_copy_first_entry(copy);
-               while (entry != vm_map_copy_to_entry(copy)) {
-                       vm_map_store_update_first_free(map, entry, TRUE);
-                       entry = entry->vme_next;
-               }
-       }
-
-       vm_map_store_copy_insert_ll(map, after_where, copy);
-#ifdef VM_MAP_STORE_USE_RB
-       if (vm_map_store_has_RB_support( &map->hdr )) {
-               vm_map_store_copy_insert_rb(map, after_where, copy);
-       }
-#endif
-}
-
 /*
  *     vm_map_entry_{un,}link:
  *
@@ -156,7 +129,11 @@ _vm_map_store_entry_link( struct vm_map_header * mapHdr, vm_map_entry_t after_wh
 }
 
 void
-vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_t entry)
+vm_map_store_entry_link(
+       vm_map_t                map,
+       vm_map_entry_t          after_where,
+       vm_map_entry_t          entry,
+       vm_map_kernel_flags_t   vmk_flags)
 {
        vm_map_t VMEL_map;
        vm_map_entry_t VMEL_entry;
@@ -174,6 +151,11 @@ vm_map_store_entry_link( vm_map_t map, vm_map_entry_t after_where, vm_map_entry_
                }
 #endif
        }
+#if PMAP_CS
+       (void) vm_map_entry_cs_associate(map, entry, vmk_flags);
+#else /* PMAP_CS */
+       (void) vmk_flags;
+#endif /* PMAP_CS */
 }
 
 void
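Editor's note: vm_map_store_entry_link() now receives the vm_map_kernel_flags_t used for the mapping so that, on PMAP_CS configurations, the new entry can be associated with code-signing state at link time; other configurations simply discard the argument. A hedged example of the updated call, assuming VM_MAP_KERNEL_FLAGS_NONE is the default initializer for callers with no special flags:

    vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;

    vm_map_store_entry_link(map, insertion_point, new_entry, vmk_flags);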
index cc8b60df488c8ad1387a892536aee4cb16f0203a..8a0641c703516285f4f8b1c56911492b1b407cf0 100644 (file)
@@ -132,11 +132,10 @@ void vm_map_store_init( struct vm_map_header*  );
 boolean_t vm_map_store_lookup_entry( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**);
 void   vm_map_store_update( struct _vm_map*, struct vm_map_entry*, int);
 void   _vm_map_store_entry_link( struct vm_map_header *, struct vm_map_entry*, struct vm_map_entry*);
-void   vm_map_store_entry_link( struct _vm_map*, struct vm_map_entry*, struct vm_map_entry*);
+void   vm_map_store_entry_link( struct _vm_map*, struct vm_map_entry*, struct vm_map_entry*, vm_map_kernel_flags_t);
 void   _vm_map_store_entry_unlink( struct vm_map_header *, struct vm_map_entry*);
 void   vm_map_store_entry_unlink( struct _vm_map*, struct vm_map_entry*);
 void   vm_map_store_update_first_free( struct _vm_map*, struct vm_map_entry*, boolean_t new_entry_creation);
-void   vm_map_store_copy_insert( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*);
 void   vm_map_store_copy_reset( struct vm_map_copy*, struct vm_map_entry*);
 #if MACH_ASSERT
 boolean_t first_free_is_valid_store( struct _vm_map*);
index c7c1afd9893ec65f32dd6c6096daadc19fe65bb4..5f33f8c0c9372f4533aeec4676ea0f28791ab14e 100644 (file)
@@ -238,12 +238,6 @@ vm_map_store_entry_unlink_ll( struct vm_map_header *mapHdr, vm_map_entry_t entry
        _vm_map_entry_unlink_ll( mapHdr, entry);
 }
 
-void
-vm_map_store_copy_insert_ll( vm_map_t map, vm_map_entry_t after_where, vm_map_copy_t copy)
-{
-       _vm_map_copy_insert_ll( map, after_where, copy);
-}
-
 void
 vm_map_store_copy_reset_ll( vm_map_copy_t copy, __unused vm_map_entry_t entry, __unused int nentries)
 {
index 0bbe00d48983df31089a7524841b6036e4224ffd..0c15b914d36cd9b3cf508472ac016e371dd805a1 100644 (file)
@@ -38,7 +38,6 @@ boolean_t vm_map_store_lookup_entry_ll( struct _vm_map*, vm_map_offset_t, struct
 void   vm_map_store_entry_link_ll( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*);
 void   vm_map_store_entry_unlink_ll( struct vm_map_header*, struct vm_map_entry*);
 void   update_first_free_ll(struct _vm_map*, struct vm_map_entry*);
-void    vm_map_store_copy_insert_ll( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*);
 void    vm_map_store_copy_reset_ll( struct vm_map_copy*, struct vm_map_entry*, int);
 
 #endif /* _VM_VM_MAP_STORE_LL_H */
index 70fb9be4c52f85f18a2fd09480dc1b871991cb3c..9485f0cb8e752c56c94e829138aab7cb3997bfd3 100644 (file)
@@ -120,33 +120,6 @@ void       vm_map_store_entry_unlink_rb( struct vm_map_header *mapHdr, vm_map_entry_t
        RB_REMOVE( rb_head, rbh, store );
 }
 
-void   vm_map_store_copy_insert_rb( vm_map_t map, __unused vm_map_entry_t after_where, vm_map_copy_t copy)
-{
-       struct vm_map_header *mapHdr = &(map->hdr);
-       struct rb_head *rbh = &(mapHdr->rb_head_store);
-       struct vm_map_store *store;
-       vm_map_entry_t entry = vm_map_copy_first_entry(copy);
-       int inserted=0, nentries = copy->cpy_hdr.nentries;
-               
-       while (entry != vm_map_copy_to_entry(copy) && nentries > 0) {           
-               vm_map_entry_t prev = entry;
-               store = &(entry->store);
-               if( RB_INSERT( rb_head, rbh, store ) != NULL){
-                       panic("VMSCIR1: INSERT FAILED: %d: %p, %p, %p, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx, 0x%lx",inserted, prev, entry, vm_map_copy_to_entry(copy), 
-                                       (uintptr_t)prev->vme_start,  (uintptr_t)prev->vme_end,  (uintptr_t)entry->vme_start,  (uintptr_t)entry->vme_end,  
-                                        (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_start,  (uintptr_t)(VME_FOR_STORE(rbh->rbh_root))->vme_end);
-               } else {
-#if MAP_ENTRY_INSERTION_DEBUG
-                       backtrace(&entry->vme_insertion_bt[0],
-                                 (sizeof (entry->vme_insertion_bt) / sizeof (uintptr_t)));
-#endif
-                       entry = entry->vme_next;
-                       inserted++;
-                       nentries--;
-               }
-       }
-}
-
 void
 vm_map_store_copy_reset_rb( vm_map_copy_t copy, vm_map_entry_t entry, int nentries )
 {
@@ -200,9 +173,9 @@ vm_map_delete_hole(vm_map_t map, vm_map_entry_t hole_entry);
 void
 vm_map_delete_hole(vm_map_t map, vm_map_entry_t hole_entry)
 {
-       if (hole_entry == (vm_map_entry_t) map->holes_list) {
+       if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
 
-               if (hole_entry->vme_next == (vm_map_entry_t) map->holes_list) {
+               if (hole_entry->vme_next == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
 
                        map->holes_list = NULL;
                        SAVE_HINT_HOLE_WRITE(map, NULL);
@@ -322,7 +295,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
 #endif /* DEBUG */
        boolean_t               create_new_hole = TRUE;
 
-       hole_entry = (vm_map_entry_t) map->hole_hint;
+       hole_entry = CAST_TO_VM_MAP_ENTRY(map->hole_hint);
 
        if (hole_entry) {
 
@@ -334,7 +307,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
 
                } else if (hole_entry->vme_start == old_entry->vme_end) {
 
-                       if (hole_entry != (vm_map_entry_t) map->holes_list) {
+                       if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
 
                                /*
                                 * Found a hole right after below our entry but
@@ -352,10 +325,10 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
                         * Useless hint. Start from the top.
                         */
 
-                       hole_entry = (vm_map_entry_t) map->holes_list;
+                       hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
                }
 
-               if (hole_entry != (vm_map_entry_t) map->holes_list) {
+               if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                        if (hole_entry->vme_start > old_entry->vme_start) {
                                panic("Hole hint failed: Hole entry start: 0x%llx, entry start: 0x%llx, map hole start: 0x%llx, map hint start: 0x%llx\n",
                                        (unsigned long long)hole_entry->vme_start,
@@ -433,7 +406,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
                                copy_hole_info(hole_entry, &old_hole_entry);
 #endif /* DEBUG */
 
-                               if (hole_entry != (vm_map_entry_t) map->holes_list) {
+                               if (hole_entry != CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                                        assert(hole_entry->vme_start != old_entry->vme_start);
                                        hole_entry = hole_entry->vme_prev;
                                }
@@ -442,7 +415,7 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
 
                        hole_entry = next_hole_entry;
 
-                       if (hole_entry == (vm_map_entry_t)map->holes_list) {
+                       if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list)) {
                                hole_entry = hole_entry->vme_prev;
                                break;
                        }
@@ -460,21 +433,21 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
                 * OR
                 * A hole that is located above the current first hole in the map?
                 */
-               if (map->holes_list == NULL || (hole_entry == (vm_map_entry_t) map->holes_list && hole_entry->vme_start > old_entry->vme_start)) {
+               if (map->holes_list == NULL || (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list) && hole_entry->vme_start > old_entry->vme_start)) {
 
                        if (map->holes_list == NULL) {
 
                                map->holes_list = new_hole_entry;
-                               new_hole_entry->prev = new_hole_entry->next = (vm_map_entry_t)map->holes_list;
+                               new_hole_entry->prev = new_hole_entry->next = CAST_TO_VM_MAP_ENTRY(map->holes_list);
                        } else {
 
-                               l_next = (vm_map_entry_t) map->holes_list;
+                               l_next = CAST_TO_VM_MAP_ENTRY(map->holes_list);
                                l_prev = map->holes_list->prev;
                                map->holes_list = new_hole_entry;
                                new_hole_entry->next = l_next;
                                new_hole_entry->prev = l_prev;
 
-                               l_prev->vme_next = l_next->vme_prev = (vm_map_entry_t) new_hole_entry;
+                               l_prev->vme_next = l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
                        }
                } else {
 
@@ -484,14 +457,14 @@ update_holes_on_entry_deletion(vm_map_t map, vm_map_entry_t old_entry)
                        new_hole_entry->prev = hole_entry;
                        new_hole_entry->next = l_next;
 
-                       hole_entry->vme_next = (vm_map_entry_t)new_hole_entry;
-                       l_next->vme_prev = (vm_map_entry_t) new_hole_entry;
+                       hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
+                       l_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
                }
 
                new_hole_entry->start = old_entry->vme_start;
                new_hole_entry->end = old_entry->vme_end;
 
-               hole_entry = (vm_map_entry_t) new_hole_entry;
+               hole_entry = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
 
                assert(new_hole_entry->start < new_hole_entry->end);
        }
@@ -529,7 +502,7 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry)
         *         This will reduce the size of the hole or delete the hole completely if it is smaller than the entry.
         */
 
-       hole_entry = (vm_map_entry_t) map->holes_list;
+       hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
        assert(hole_entry);
        next_hole_entry = hole_entry->vme_next;
 
@@ -593,8 +566,8 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry)
 
                        new_hole_entry->prev = hole_entry;
                        new_hole_entry->next = hole_entry->vme_next;
-                       hole_entry->vme_next->vme_prev = (vm_map_entry_t)new_hole_entry;
-                       hole_entry->vme_next = (vm_map_entry_t)new_hole_entry;
+                       hole_entry->vme_next->vme_prev = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
+                       hole_entry->vme_next = CAST_TO_VM_MAP_ENTRY(new_hole_entry);
 
                        new_hole_entry->start = new_entry->vme_end;
                        new_hole_entry->end = hole_entry->vme_end;
@@ -664,7 +637,7 @@ update_holes_on_entry_creation(vm_map_t map, vm_map_entry_t new_entry)
                hole_entry = next_hole_entry;
                next_hole_entry = hole_entry->vme_next;
 
-               if (hole_entry == (vm_map_entry_t)map->holes_list)
+               if (hole_entry == CAST_TO_VM_MAP_ENTRY(map->holes_list))
                        break;
        }
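Editor's note: the hole-list hunks above replace the bare (vm_map_entry_t) casts of the links-based holes_list/hole_hint pointers with CAST_TO_VM_MAP_ENTRY(). The macro itself is not shown in this diff; it presumably just centralizes the pointer conversion, along the lines of:

    /* Assumed shape of the macro (defined in vm_map.h, not in this hunk). */
    #define CAST_TO_VM_MAP_ENTRY(x) ((struct vm_map_entry *)(uintptr_t)(x))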
 
index d9506e6b4bec5b8d99ebceb2ee4342d9f9717473..82ac40321887fbebe78b014b89f2e1e8db02255c 100644 (file)
@@ -39,7 +39,6 @@ void vm_map_store_walk_rb( struct _vm_map*, struct vm_map_entry**, struct vm_map
 boolean_t vm_map_store_lookup_entry_rb( struct _vm_map*, vm_map_offset_t, struct vm_map_entry**);
 void   vm_map_store_entry_link_rb( struct vm_map_header*, struct vm_map_entry*, struct vm_map_entry*);
 void   vm_map_store_entry_unlink_rb( struct vm_map_header*, struct vm_map_entry*);
-void   vm_map_store_copy_insert_rb( struct _vm_map*, struct vm_map_entry*, struct vm_map_copy*);
 void   vm_map_store_copy_reset_rb( struct vm_map_copy*, struct vm_map_entry*, int);
 void   update_first_free_rb(struct _vm_map*, struct vm_map_entry*, boolean_t new_entry_creation);
 
index 821929f88bffb6d9b25eac90d528d52de04a2aa2..84f0ff6e83933e7e8cc6c84dc162db2e7cad0712 100644 (file)
 #include <vm/vm_phantom_cache.h>
 #endif
 
+#if VM_OBJECT_ACCESS_TRACKING
+uint64_t vm_object_access_tracking_reads = 0;
+uint64_t vm_object_access_tracking_writes = 0;
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+
 boolean_t vm_object_collapse_compressor_allowed = TRUE;
 
 struct vm_counters vm_counters;
@@ -399,6 +404,10 @@ lck_attr_t         vm_object_lck_attr;
 lck_attr_t             kernel_object_lck_attr;
 lck_attr_t             compressor_object_lck_attr;
 
+extern void vm_named_entry_init(void);
+
+int workaround_41447923 = 0;
+
 /*
  *     vm_object_bootstrap:
  *
@@ -466,8 +475,7 @@ vm_object_bootstrap(void)
        vm_object_template.res_count = 1;
 #endif /* TASK_SWAPPER */
        vm_object_template.resident_page_count = 0;
-    // static vm_object_template is zeroed
-    // vm_object_template.wired_page_count = 0;
+       vm_object_template.wired_page_count = 0;
        vm_object_template.reusable_page_count = 0;
        vm_object_template.copy = VM_OBJECT_NULL;
        vm_object_template.shadow = VM_OBJECT_NULL;
@@ -521,7 +529,7 @@ vm_object_bootstrap(void)
        /* cache bitfields */
        vm_object_template.wimg_bits = VM_WIMG_USE_DEFAULT;
        vm_object_template.set_cache_attr = FALSE;
-       vm_object_template.object_slid = FALSE;
+       vm_object_template.object_is_shared_cache = FALSE;
        vm_object_template.code_signed = FALSE;
        vm_object_template.transposed = FALSE;
        vm_object_template.mapping_in_progress = FALSE;
@@ -530,6 +538,7 @@ vm_object_bootstrap(void)
        vm_object_template.volatile_fault = FALSE;
        vm_object_template.all_reusable = FALSE;
        vm_object_template.blocked_access = FALSE;
+       vm_object_template.vo_ledger_tag = VM_OBJECT_LEDGER_TAG_NONE;
        vm_object_template.__object2_unused_bits = 0;
 #if CONFIG_IOSCHED || UPL_DEBUG
        vm_object_template.uplq.prev = NULL;
@@ -551,6 +560,10 @@ vm_object_bootstrap(void)
        vm_object_template.vo_cache_ts = 0;
 
        vm_object_template.wire_tag = VM_KERN_MEMORY_NONE;
+#if ! VM_TAG_ACTIVE_UPDATE
+       vm_object_template.wired_objq.next = NULL;
+       vm_object_template.wired_objq.prev = NULL;
+#endif /* ! VM_TAG_ACTIVE_UPDATE */
 
        vm_object_template.io_tracking = FALSE;
 
@@ -561,6 +574,12 @@ vm_object_bootstrap(void)
        vm_object_template.__object3_unused_bits = 0;
 #endif /* CONFIG_SECLUDED_MEMORY */
        
+#if VM_OBJECT_ACCESS_TRACKING
+       vm_object_template.access_tracking = FALSE;
+       vm_object_template.access_tracking_reads = 0;
+       vm_object_template.access_tracking_writes = 0;
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+
 #if DEBUG
        bzero(&vm_object_template.purgeable_owner_bt[0],
              sizeof (vm_object_template.purgeable_owner_bt));
@@ -605,6 +624,11 @@ vm_object_bootstrap(void)
         * non-zone memory.
         */
        vm_object_reference(vm_submap_object);
+
+       vm_named_entry_init();
+
+       PE_parse_boot_argn("workaround_41447923", &workaround_41447923, 
+                          sizeof (workaround_41447923));
 }
 
 #if CONFIG_IOSCHED
@@ -929,44 +953,44 @@ vm_object_page_grab(
        while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && --p_limit > 0) {
 
                p = next_p;
-               next_p = (vm_page_t)vm_page_queue_next(&next_p->listq);
+               next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq);
 
-               if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry || p->fictitious)
+               if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry || p->vmp_fictitious)
                        goto move_page_in_obj;
 
-               if (p->pmapped || p->dirty || p->precious) {
+               if (p->vmp_pmapped || p->vmp_dirty || p->vmp_precious) {
                        vm_page_lockspin_queues();
 
-                       if (p->pmapped) {
+                       if (p->vmp_pmapped) {
                                int refmod_state;
 
                                vm_object_page_grab_pmapped++;
 
-                               if (p->reference == FALSE || p->dirty == FALSE) {
+                               if (p->vmp_reference == FALSE || p->vmp_dirty == FALSE) {
 
                                        refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(p));
 
                                        if (refmod_state & VM_MEM_REFERENCED)
-                                               p->reference = TRUE;
+                                               p->vmp_reference = TRUE;
                                        if (refmod_state & VM_MEM_MODIFIED) {
                                                SET_PAGE_DIRTY(p, FALSE);
                                        }
                                }
-                               if (p->dirty == FALSE && p->precious == FALSE) {
+                               if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) {
 
                                        refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
 
                                        if (refmod_state & VM_MEM_REFERENCED)
-                                               p->reference = TRUE;
+                                               p->vmp_reference = TRUE;
                                        if (refmod_state & VM_MEM_MODIFIED) {
                                                SET_PAGE_DIRTY(p, FALSE);
                                        }
 
-                                       if (p->dirty == FALSE)
+                                       if (p->vmp_dirty == FALSE)
                                                goto take_page;
                                }
                        }
-                       if ((p->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) && p->reference == TRUE) {
+                       if ((p->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) && p->vmp_reference == TRUE) {
                                vm_page_activate(p);
 
                                VM_STAT_INCR(reactivations);
@@ -974,8 +998,8 @@ vm_object_page_grab(
                        }
                        vm_page_unlock_queues();
 move_page_in_obj:
-                       vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
-                       vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+                       vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq);
+                       vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq);
 
                        p_skipped++;
                        continue;
@@ -1115,7 +1139,6 @@ vm_object_cache_evict(
                        next_obj = (vm_object_t)queue_next(&next_obj->cached_list);
 
                        assert(object->purgable == VM_PURGABLE_DENY);
-                       assert(object->wired_page_count == 0);
                        
                        if (sec < object->vo_cache_ts) {
                                KERNEL_DEBUG(0x130020c, object, object->resident_page_count, object->vo_cache_ts, sec, 0);
@@ -1180,20 +1203,20 @@ vm_object_cache_evict(
                while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next_p) && object->vo_cache_pages_to_scan && ep_count < ep_limit) {
 
                        p = next_p;
-                       next_p = (vm_page_t)vm_page_queue_next(&next_p->listq);
+                       next_p = (vm_page_t)vm_page_queue_next(&next_p->vmp_listq);
 
                        object->vo_cache_pages_to_scan--;
 
-                       if (VM_PAGE_WIRED(p) || p->busy || p->cleaning || p->laundry) {
-                               vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
-                               vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+                       if (VM_PAGE_WIRED(p) || p->vmp_busy || p->vmp_cleaning || p->vmp_laundry) {
+                               vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq);
+                               vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq);
 
                                ep_skipped++;
                                continue;
                        }
-                       if (p->wpmapped || p->dirty || p->precious) {
-                               vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
-                               vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+                       if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) {
+                               vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq);
+                               vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq);
 
                                pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(p));
                        }
@@ -1207,9 +1230,9 @@ vm_object_cache_evict(
 
                        p = ep_array[ep_index];
 
-                       if (p->wpmapped || p->dirty || p->precious) {
-                               p->reference = FALSE;
-                               p->no_cache = FALSE;
+                       if (p->vmp_wpmapped || p->vmp_dirty || p->vmp_precious) {
+                               p->vmp_reference = FALSE;
+                               p->vmp_no_cache = FALSE;
 
                                /*
                                 * we've already filtered out pages that are in the laundry
@@ -1225,12 +1248,12 @@ vm_object_cache_evict(
 #endif
                                vm_page_free_prepare_queues(p);
 
-                               assert(p->pageq.next == 0 && p->pageq.prev == 0);
+                               assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0);
                                /*
                                 * Add this page to our list of reclaimed pages,
                                 * to be freed later.
                                 */
-                               p->snext = local_free_q;
+                               p->vmp_snext = local_free_q;
                                local_free_q = p;
 
                                ep_freed++;
@@ -1453,11 +1476,15 @@ vm_object_reap(
         * from its pager, to properly account for compressed pages.
         */
        if (object->internal &&
-           object->purgable != VM_PURGABLE_DENY) {
-               vm_purgeable_accounting(object,
-                                       object->purgable,
-                                       TRUE, /* disown */
-                                       FALSE); /* task_objq locked? */
+           (object->purgable != VM_PURGABLE_DENY ||
+            object->vo_ledger_tag)) {
+               assert(!object->alive);
+               assert(object->terminating);
+               vm_object_ownership_change(object,
+                                          object->vo_ledger_tag, /* unchanged */
+                                          NULL, /* no owner */
+                                          FALSE); /* task_objq not locked */
+               assert(object->vo_owner == NULL);
        }
 
        pager = object->pager;
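Editor's note: vm_object_reap() now disowns any owned object, whether purgeable or tagged through vo_ledger_tag, via the new vm_object_ownership_change() call instead of the purgeable-specific accounting path. Based only on this call site, the argument pattern appears to be:

    /* Hedged: argument pattern as used at this call site. */
    vm_object_ownership_change(object,
                               object->vo_ledger_tag, /* ledger tag unchanged */
                               NULL,                  /* drop the owner */
                               FALSE);                /* task_objq not locked */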
@@ -1477,9 +1504,7 @@ vm_object_reap(
         * remove from purgeable queue if it's on
         */
        if (object->internal) {
-               task_t owner;
-
-               owner = object->vo_purgeable_owner;
+               assert(VM_OBJECT_OWNER(object) == TASK_NULL);
 
                VM_OBJECT_UNWIRED(object);
 
@@ -1488,8 +1513,6 @@ vm_object_reap(
                } else if (object->purgable == VM_PURGABLE_VOLATILE) {
                        purgeable_q_t queue;
 
-                       assert(object->vo_purgeable_owner == NULL);
-
                        queue = vm_purgeable_object_remove(object);
                        assert(queue);
 
@@ -1532,7 +1555,6 @@ vm_object_reap(
                else if (object->purgable == VM_PURGABLE_NONVOLATILE ||
                         object->purgable == VM_PURGABLE_EMPTY) {
                        /* remove from nonvolatile queue */
-                       assert(object->vo_purgeable_owner == TASK_NULL);
                        vm_purgeable_nonvolatile_dequeue(object);
                } else {
                        panic("object %p in unexpected purgeable state 0x%x\n",
@@ -1622,8 +1644,8 @@ unsigned int vm_max_batch = 256;
                        vm_page_t m;                                    \
                        for (m = _local_free_q;                         \
                             m != VM_PAGE_NULL;                         \
-                            m = m->snext) {            \
-                               if (m->pmapped) {                       \
+                            m = m->vmp_snext) {                        \
+                               if (m->vmp_pmapped) {                   \
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); \
                                }                                       \
                        }                                               \
@@ -1678,7 +1700,7 @@ restart_after_sleep:
        while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) {
 
                p = next;
-               next = (vm_page_t)vm_page_queue_next(&next->listq);
+               next = (vm_page_t)vm_page_queue_next(&next->vmp_listq);
 
                if (--loop_count == 0) {
                                        
@@ -1706,7 +1728,7 @@ restart_after_sleep:
                }
                if (reap_type == REAP_DATA_FLUSH || reap_type == REAP_TERMINATE) {
 
-                       if (p->busy || p->cleaning) {
+                       if (p->vmp_busy || p->vmp_cleaning) {
 
                                vm_page_unlock_queues();
                                /*
@@ -1719,7 +1741,7 @@ restart_after_sleep:
 
                                goto restart_after_sleep;
                        }
-                       if (p->laundry)
+                       if (p->vmp_laundry)
                                vm_pageout_steal_laundry(p, TRUE);
                }
                switch (reap_type) {
@@ -1744,10 +1766,10 @@ restart_after_sleep:
                                vm_page_purged_wired++;
                                continue;
                        }
-                       if (p->laundry && !p->busy && !p->cleaning)
+                       if (p->vmp_laundry && !p->vmp_busy && !p->vmp_cleaning)
                                vm_pageout_steal_laundry(p, TRUE);
 
-                       if (p->cleaning || p->laundry || p->absent) {
+                       if (p->vmp_cleaning || p->vmp_laundry || p->vmp_absent) {
                                /*
                                 * page is being acted upon,
                                 * so don't mess with it
@@ -1755,7 +1777,7 @@ restart_after_sleep:
                                vm_page_purged_others++;
                                continue;
                        }
-                       if (p->busy) {
+                       if (p->vmp_busy) {
                                /*
                                 * We can't reclaim a busy page but we can
                                 * make it more likely to be paged (it's not wired) to make
@@ -1773,7 +1795,7 @@ restart_after_sleep:
                        /*
                         * we can discard this page...
                         */
-                       if (p->pmapped == TRUE) {
+                       if (p->vmp_pmapped == TRUE) {
                                /*
                                 * unmap the page
                                 */
@@ -1784,7 +1806,7 @@ restart_after_sleep:
                        break;
 
                case REAP_TERMINATE:
-                       if (p->absent || p->private) {
+                       if (p->vmp_absent || p->vmp_private) {
                                /*
                                 *      For private pages, VM_PAGE_FREE just
                                 *      leaves the page structure around for
@@ -1794,20 +1816,20 @@ restart_after_sleep:
                                 */
                                break;
                        }
-                       if (p->fictitious) {
+                       if (p->vmp_fictitious) {
                                assert (VM_PAGE_GET_PHYS_PAGE(p) == vm_page_guard_addr);
                                break;
                        }
-                       if (!p->dirty && p->wpmapped)
-                               p->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p));
+                       if (!p->vmp_dirty && p->vmp_wpmapped)
+                               p->vmp_dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p));
 
-                       if ((p->dirty || p->precious) && !p->error && object->alive) {
+                       if ((p->vmp_dirty || p->vmp_precious) && !p->vmp_error && object->alive) {
 
                                assert(!object->internal);
                                
-                               p->free_when_done = TRUE;
+                               p->vmp_free_when_done = TRUE;
 
-                               if (!p->laundry) {
+                               if (!p->vmp_laundry) {
                                        vm_page_queues_remove(p, TRUE);
                                        /*
                                         * flush page... page will be freed
@@ -1832,12 +1854,12 @@ restart_after_sleep:
                        break;
                }
                vm_page_free_prepare_queues(p);
-               assert(p->pageq.next == 0 && p->pageq.prev == 0);
+               assert(p->vmp_pageq.next == 0 && p->vmp_pageq.prev == 0);
                /*
                 * Add this page to our list of reclaimed pages,
                 * to be freed later.
                 */
-               p->snext = local_free_q;
+               p->vmp_snext = local_free_q;
                local_free_q = p;
        }
        vm_page_unlock_queues();
@@ -2195,8 +2217,8 @@ deactivate_pages_in_object(
 
                        MARK_PAGE_HANDLED(*chunk_state, p);
        
-                       if (( !VM_PAGE_WIRED(m)) && (!m->private) && (!m->gobbled) && (!m->busy) &&
-                           (!m->laundry) && (!m->cleaning) && !(m->free_when_done)) {
+                       if (( !VM_PAGE_WIRED(m)) && (!m->vmp_private) && (!m->vmp_gobbled) && (!m->vmp_busy) &&
+                           (!m->vmp_laundry) && (!m->vmp_cleaning) && !(m->vmp_free_when_done)) {
                                int     clear_refmod;
                                int     pmap_options;
        
@@ -2215,11 +2237,11 @@ deactivate_pages_in_object(
                                                 */
                                                pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
                                        }
-                                       m->precious = FALSE;
-                                       m->dirty = FALSE;
+                                       m->vmp_precious = FALSE;
+                                       m->vmp_dirty = FALSE;
 
                                        clear_refmod |= VM_MEM_MODIFIED;
-                                       if (m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) {
+                                       if (m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
                                                /*
                                                 * This page is now clean and
                                                 * reclaimable.  Move it out
@@ -2232,10 +2254,10 @@ deactivate_pages_in_object(
 
                                        VM_COMPRESSOR_PAGER_STATE_CLR(object, offset);
 
-                                       if (reusable_page && !m->reusable) {
+                                       if (reusable_page && !m->vmp_reusable) {
                                                assert(!all_reusable);
                                                assert(!object->all_reusable);
-                                               m->reusable = TRUE;
+                                               m->vmp_reusable = TRUE;
                                                object->reusable_page_count++;
                                                assert(object->resident_page_count >= object->reusable_page_count);
                                                reusable++;
@@ -2253,7 +2275,7 @@ deactivate_pages_in_object(
                                                          pmap_options,
                                                          (void *)pfc);
 
-                               if ((m->vm_page_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable))
+                               if ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !(reusable_page || all_reusable))
                                        dwp->dw_mask |= DW_move_page;
                                
                                if (dwp->dw_mask)
@@ -2506,12 +2528,12 @@ vm_object_reuse_pages(
 #define VM_OBJECT_REUSE_PAGE(object, m, reused)                                \
        MACRO_BEGIN                                                     \
                if ((m) != VM_PAGE_NULL &&                              \
-                   (m)->reusable) {                                    \
+                   (m)->vmp_reusable) {                                \
                        assert((object)->reusable_page_count <=         \
                               (object)->resident_page_count);          \
                        assert((object)->reusable_page_count > 0);      \
                        (object)->reusable_page_count--;                \
-                       (m)->reusable = FALSE;                          \
+                       (m)->vmp_reusable = FALSE;                      \
                        (reused)++;                                     \
                        /*                                              \
                         * Tell pmap that this page is no longer        \
@@ -2543,15 +2565,15 @@ vm_object_reuse_pages(
                        reused = object->resident_page_count;
                } else {
                        vm_page_stats_reusable.partial_reuse_calls++;
-                       vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) {
-                               if (m->offset < start_offset ||
-                                   m->offset >= end_offset) {
-                                       m->reusable = TRUE;
+                       vm_page_queue_iterate(&object->memq, m, vm_page_t, vmp_listq) {
+                               if (m->vmp_offset < start_offset ||
+                                   m->vmp_offset >= end_offset) {
+                                       m->vmp_reusable = TRUE;
                                        object->reusable_page_count++;
                                        assert(object->resident_page_count >= object->reusable_page_count);
                                        continue;
                                } else {
-                                       assert(!m->reusable);
+                                       assert(!m->vmp_reusable);
                                        reused++;
                                }
                        }
@@ -2570,12 +2592,12 @@ vm_object_reuse_pages(
                }
        } else {
                vm_page_stats_reusable.partial_reuse_calls++;
-               vm_page_queue_iterate(&object->memq, m, vm_page_t, listq) {
+               vm_page_queue_iterate(&object->memq, m, vm_page_t, vmp_listq) {
                        if (object->reusable_page_count == 0) {
                                break;
                        }
-                       if (m->offset < start_offset ||
-                           m->offset >= end_offset) {
+                       if (m->vmp_offset < start_offset ||
+                           m->vmp_offset >= end_offset) {
                                continue;
                        }
                        VM_OBJECT_REUSE_PAGE(object, m, reused);
@@ -2706,11 +2728,11 @@ vm_object_pmap_protect_options(
 
                end = offset + size;
 
-               vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
-                       if (!p->fictitious && (offset <= p->offset) && (p->offset < end)) {
+               vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) {
+                       if (!p->vmp_fictitious && (offset <= p->vmp_offset) && (p->vmp_offset < end)) {
                                vm_map_offset_t start;
 
-                               start = pmap_start + p->offset - offset;
+                               start = pmap_start + p->vmp_offset - offset;
 
                                if (pmap != PMAP_NULL)
                                        pmap_protect_options(
@@ -2745,7 +2767,7 @@ vm_object_pmap_protect_options(
                        if (p != VM_PAGE_NULL) {
                                vm_object_offset_t start;
 
-                               start = pmap_start + (p->offset - offset);
+                               start = pmap_start + (p->vmp_offset - offset);
 
                                if (pmap != PMAP_NULL)
                                        pmap_protect_options(
@@ -2801,6 +2823,8 @@ vm_object_pmap_protect_options(
        vm_object_unlock(object);
 }
 
+uint32_t vm_page_busy_absent_skipped = 0;
+
 /*
  *     Routine:        vm_object_copy_slowly
  *
@@ -2842,7 +2866,7 @@ vm_object_copy_slowly(
        vm_object_t             new_object;
        vm_object_offset_t      new_offset;
 
-       struct vm_object_fault_info fault_info;
+       struct vm_object_fault_info fault_info = {};
 
        XPR(XPR_VM_OBJECT, "v_o_c_slowly obj 0x%x off 0x%x size 0x%x\n",
            src_object, src_offset, size, 0, 0);
@@ -2876,16 +2900,9 @@ vm_object_copy_slowly(
 
        fault_info.interruptible = interruptible;
        fault_info.behavior  = VM_BEHAVIOR_SEQUENTIAL;
-       fault_info.user_tag = 0;
-       fault_info.pmap_options = 0;
        fault_info.lo_offset = src_offset;
        fault_info.hi_offset = src_offset + size;
-       fault_info.no_cache  = FALSE;
        fault_info.stealth = TRUE;
-       fault_info.io_sync = FALSE;
-       fault_info.cs_bypass = FALSE;
-       fault_info.mark_zf_absent = FALSE;
-       fault_info.batch_pmap_op = FALSE;
 
        for ( ;
            size != 0 ;
@@ -2925,40 +2942,75 @@ vm_object_copy_slowly(
 
                        if (src_object->internal &&
                            src_object->shadow == VM_OBJECT_NULL &&
-                           (vm_page_lookup(src_object,
-                                           src_offset) == VM_PAGE_NULL) &&
                            (src_object->pager == NULL ||
                             (VM_COMPRESSOR_PAGER_STATE_GET(src_object,
                                                            src_offset) ==
                              VM_EXTERNAL_STATE_ABSENT))) {
-                               /*
-                                * This page is neither resident nor compressed
-                                * and there's no shadow object below 
-                                * "src_object", so this page is really missing.
-                                * There's no need to zero-fill it just to copy
-                                * it:  let's leave it missing in "new_object"
-                                * and get zero-filled on demand.
-                                */
-                               vm_object_unlock(src_object);
-                               /* free the unused "new_page"... */
-                               vm_object_lock(new_object);
-                               VM_PAGE_FREE(new_page);
-                               new_page = VM_PAGE_NULL;
-                               vm_object_unlock(new_object);
-                               /* ...and go to next page in "src_object" */
-                               result = VM_FAULT_SUCCESS;
-                               break;
+                               boolean_t can_skip_page;
+
+                               _result_page = vm_page_lookup(src_object,
+                                                             src_offset);
+                               if (_result_page == VM_PAGE_NULL) {
+                                       /*
+                                        * This page is neither resident nor
+                                        * compressed and there's no shadow
+                                        * object below "src_object", so this
+                                        * page is really missing.
+                                        * There's no need to zero-fill it just
+                                        * to copy it:  let's leave it missing
+                                        * in "new_object" and get zero-filled
+                                        * on demand.
+                                        */
+                                       can_skip_page = TRUE;
+                               } else if (workaround_41447923 &&
+                                          src_object->pager == NULL &&
+                                          _result_page != VM_PAGE_NULL &&
+                                          _result_page->vmp_busy &&
+                                          _result_page->vmp_absent &&
+                                          src_object->purgable == VM_PURGABLE_DENY &&
+                                          !src_object->blocked_access) {
+                                       /*
+                                        * This page is "busy" and "absent"
+                                        * but not because we're waiting for
+                                        * it to be decompressed.  It must
+                                        * be because it's a "no zero fill"
+                                        * page that is currently not
+                                        * accessible until it gets overwritten
+                                        * by a device driver.
+                                        * Since its initial state would have
+                                        * been "zero-filled", let's leave the
+                                        * copy page missing and get zero-filled
+                                        * on demand.
+                                        */
+                                       assert(src_object->internal);
+                                       assert(src_object->shadow == NULL);
+                                       assert(src_object->pager == NULL);
+                                       can_skip_page = TRUE;
+                                       vm_page_busy_absent_skipped++;
+                               } else {
+                                       can_skip_page = FALSE;
+                               }
+                               if (can_skip_page) {
+                                       vm_object_unlock(src_object);
+                                       /* free the unused "new_page"... */
+                                       vm_object_lock(new_object);
+                                       VM_PAGE_FREE(new_page);
+                                       new_page = VM_PAGE_NULL;
+                                       vm_object_unlock(new_object);
+                                       /* ...and go to next page in "src_object" */
+                                       result = VM_FAULT_SUCCESS;
+                                       break;
+                               }
                        }
 
                        vm_object_paging_begin(src_object);
 
-                       if (size > (vm_size_t) -1) {
-                               /* 32-bit overflow */
-                               fault_info.cluster_size = (vm_size_t) (0 - PAGE_SIZE);
-                       } else {
-                               fault_info.cluster_size = (vm_size_t) size;
-                               assert(fault_info.cluster_size == size);
+                       /* cap size at maximum UPL size */
+                       upl_size_t cluster_size;
+                       if (os_convert_overflow(size, &cluster_size)) {
+                               cluster_size = 0 - (upl_size_t)PAGE_SIZE;
                        }
+                       fault_info.cluster_size = cluster_size;
 
                        XPR(XPR_VM_FAULT,"vm_object_copy_slowly -> vm_fault_page",0,0,0,0,0);
                        _result_page = VM_PAGE_NULL;
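Editor's note: two things change in this vm_object_copy_slowly() hunk. First, the "page really missing" test is widened so that, when the workaround_41447923 boot-arg is set (parsed in vm_object_bootstrap() above), busy+absent "no zero fill" pages in an unpaged, non-purgeable source object are also left missing in the copy. Second, the open-coded 32-bit check on fault_info.cluster_size is replaced with os_convert_overflow(), which presumably lives in <os/overflow.h> and returns non-zero when the source value does not fit the destination type. A hedged sketch of the latter:

    upl_size_t cluster_size;

    if (os_convert_overflow(size, &cluster_size)) {
            /* "size" does not fit in a upl_size_t: cap at the maximum UPL size */
            cluster_size = 0 - (upl_size_t)PAGE_SIZE;
    }
    fault_info.cluster_size = cluster_size;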
@@ -2999,8 +3051,8 @@ vm_object_copy_slowly(
                                PAGE_WAKEUP_DONE(result_page);
 
                                vm_page_lockspin_queues();
-                               if ((result_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
-                                   (result_page->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
+                               if ((result_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
+                                   (result_page->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
                                        vm_page_activate(result_page);
                                }
                                vm_page_activate(new_page);
@@ -3439,10 +3491,10 @@ vm_object_copy_delayed(
                                pmap_flush_context_init(&pmap_flush_context_storage);
                                delayed_pmap_flush = FALSE;
 
-                               vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) {
-                                       if (!p->fictitious && 
-                                           p->offset >= old_copy->vo_size && 
-                                           p->offset < copy_size) {
+                               vm_page_queue_iterate(&src_object->memq, p, vm_page_t, vmp_listq) {
+                                       if (!p->vmp_fictitious && 
+                                           p->vmp_offset >= old_copy->vo_size && 
+                                           p->vmp_offset < copy_size) {
                                                if (VM_PAGE_WIRED(p)) {
                                                        vm_object_unlock(old_copy);
                                                        vm_object_unlock(src_object);
@@ -3539,8 +3591,8 @@ vm_object_copy_delayed(
        pmap_flush_context_init(&pmap_flush_context_storage);
        delayed_pmap_flush = FALSE;
 
-       vm_page_queue_iterate(&src_object->memq, p, vm_page_t, listq) {
-               if (!p->fictitious && p->offset < copy_size) {
+       vm_page_queue_iterate(&src_object->memq, p, vm_page_t, vmp_listq) {
+               if (!p->vmp_fictitious && p->vmp_offset < copy_size) {
                        if (VM_PAGE_WIRED(p)) {
                                if (old_copy)
                                        vm_object_unlock(old_copy);
@@ -4225,9 +4277,9 @@ vm_object_do_collapse(
                
                p = (vm_page_t) vm_page_queue_first(&backing_object->memq);
                
-               new_offset = (p->offset - backing_offset);
+               new_offset = (p->vmp_offset - backing_offset);
                
-               assert(!p->busy || p->absent);
+               assert(!p->vmp_busy || p->vmp_absent);
 
                /*
                 *      If the parent has a page here, or if
@@ -4237,7 +4289,7 @@ vm_object_do_collapse(
                 *      Otherwise, move it as planned.
                 */
                
-               if (p->offset < backing_offset || new_offset >= size) {
+               if (p->vmp_offset < backing_offset || new_offset >= size) {
                        VM_PAGE_FREE(p);
                } else {
                        pp = vm_page_lookup(object, new_offset);
@@ -4262,7 +4314,7 @@ vm_object_do_collapse(
                                        vm_page_rename(p, object, new_offset);
                                }
                        } else {
-                               assert(! pp->absent);
+                               assert(! pp->vmp_absent);
 
                                /*
                                 *      Parent object has a real page.
@@ -4875,7 +4927,7 @@ retry:
                                backing_rcount = backing_object->resident_page_count;
                                p = (vm_page_t)vm_page_queue_first(&backing_object->memq);
                                do {
-                                       offset = (p->offset - backing_offset);
+                                       offset = (p->vmp_offset - backing_offset);
 
                                        if (offset < object->vo_size &&
                                            offset != hint_offset &&
@@ -4885,7 +4937,7 @@ retry:
                                                
                                                break;
                                        }
-                                       p = (vm_page_t) vm_page_queue_next(&p->listq);
+                                       p = (vm_page_t) vm_page_queue_next(&p->vmp_listq);
 
                                } while (--backing_rcount);
                                if (backing_rcount != 0 ) {
@@ -5003,8 +5055,8 @@ vm_object_page_remove(
                for (; start < end; start += PAGE_SIZE_64) {
                        p = vm_page_lookup(object, start);
                        if (p != VM_PAGE_NULL) {
-                               assert(!p->cleaning && !p->laundry);
-                               if (!p->fictitious && p->pmapped)
+                               assert(!p->vmp_cleaning && !p->vmp_laundry);
+                               if (!p->vmp_fictitious && p->vmp_pmapped)
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
                                VM_PAGE_FREE(p);
                        }
@@ -5014,10 +5066,10 @@ vm_object_page_remove(
 
                p = (vm_page_t) vm_page_queue_first(&object->memq);
                while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t) p)) {
-                       next = (vm_page_t) vm_page_queue_next(&p->listq);
-                       if ((start <= p->offset) && (p->offset < end)) {
-                               assert(!p->cleaning && !p->laundry);
-                               if (!p->fictitious && p->pmapped)
+                       next = (vm_page_t) vm_page_queue_next(&p->vmp_listq);
+                       if ((start <= p->vmp_offset) && (p->vmp_offset < end)) {
+                               assert(!p->vmp_cleaning && !p->vmp_laundry);
+                               if (!p->vmp_fictitious && p->vmp_pmapped)
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
                                VM_PAGE_FREE(p);
                        }
@@ -5162,25 +5214,25 @@ vm_object_populate_with_private(
                        m = vm_page_lookup(object, base_offset);
 
                        if (m != VM_PAGE_NULL) {
-                               if (m->fictitious) {
+                               if (m->vmp_fictitious) {
                                        if (VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr) {
 
                                                vm_page_lockspin_queues();
-                                               m->private = TRUE;
+                                               m->vmp_private = TRUE;
                                                vm_page_unlock_queues();
 
-                                               m->fictitious = FALSE;
+                                               m->vmp_fictitious = FALSE;
                                                VM_PAGE_SET_PHYS_PAGE(m, base_page);
                                        }
                                } else if (VM_PAGE_GET_PHYS_PAGE(m) != base_page) {
 
-                                       if ( !m->private) {
+                                       if ( !m->vmp_private) {
                                                /*
                                                 * we'd leak a real page... that can't be right
                                                 */
                                                panic("vm_object_populate_with_private - %p not private", m);
                                        }
-                                       if (m->pmapped) {
+                                       if (m->vmp_pmapped) {
                                                /*
                                                 * pmap call to clear old mapping
                                                 */
@@ -5197,11 +5249,11 @@ vm_object_populate_with_private(
                                 * private normally requires lock_queues but since we
                                 * are initializing the page, its not necessary here
                                 */
-                               m->private = TRUE;
-                               m->fictitious = FALSE;
+                               m->vmp_private = TRUE;
+                               m->vmp_fictitious = FALSE;
                                VM_PAGE_SET_PHYS_PAGE(m, base_page);
-                               m->unusual = TRUE;
-                               m->busy = FALSE;
+                               m->vmp_unusual = TRUE;
+                               m->vmp_busy = FALSE;
 
                                vm_page_insert(m, object, base_offset);
                        }
@@ -5564,8 +5616,8 @@ vm_object_purge(vm_object_t object, int flags)
                                                          -pgcount,
                                                          FALSE, /* shared */
                                                          object);
-                               vm_purgeable_compressed_update(object,
-                                                              -pgcount);
+                               vm_object_owner_compressed_update(object,
+                                                                 -pgcount);
                        }
                        if ( !(flags & C_DONT_BLOCK)) {
                                assert(vm_compressor_pager_get_count(object->pager)
@@ -5807,9 +5859,7 @@ vm_object_purgable_control(
                         * Transfer the object's pages from the volatile to
                         * non-volatile ledgers.
                         */
-                       vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE,
-                                               FALSE, /* disown */
-                                               FALSE); /* task_objq locked? */
+                       vm_purgeable_accounting(object, VM_PURGABLE_VOLATILE);
                }
 
                break;
@@ -5819,15 +5869,15 @@ vm_object_purgable_control(
                        vm_page_t       p;
                        int             refmod;
 
-                       vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
-                               if (p->busy ||
+                       vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) {
+                               if (p->vmp_busy ||
                                    VM_PAGE_WIRED(p) ||
-                                   p->fictitious) {
+                                   p->vmp_fictitious) {
                                        continue;
                                }
                                refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
                                if ((refmod & VM_MEM_MODIFIED) &&
-                                   !p->dirty) {
+                                   !p->vmp_dirty) {
                                        SET_PAGE_DIRTY(p, FALSE);
                                }
                        }
@@ -5938,9 +5988,7 @@ vm_object_purgable_control(
                vm_purgeable_object_add(object, queue, (*state&VM_VOLATILE_GROUP_MASK)>>VM_VOLATILE_GROUP_SHIFT );
                if (old_state == VM_PURGABLE_NONVOLATILE) {
                        vm_purgeable_accounting(object,
-                                               VM_PURGABLE_NONVOLATILE,
-                                               FALSE, /* disown */
-                                               FALSE); /* task_objq locked? */
+                                               VM_PURGABLE_NONVOLATILE);
                }
 
                assert(queue->debug_count_objects>=0);
@@ -5953,15 +6001,15 @@ vm_object_purgable_control(
                        vm_page_t       p;
                        int             refmod;
 
-                       vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
-                               if (p->busy ||
+                       vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) {
+                               if (p->vmp_busy ||
                                    VM_PAGE_WIRED(p) ||
-                                   p->fictitious) {
+                                   p->vmp_fictitious) {
                                        continue;
                                }
                                refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(p));
                                if ((refmod & VM_MEM_MODIFIED) &&
-                                   !p->dirty) {
+                                   !p->vmp_dirty) {
                                        SET_PAGE_DIRTY(p, FALSE);
                                }
                        }
@@ -5990,9 +6038,7 @@ vm_object_purgable_control(
                         * "volatile".
                         */
                        vm_purgeable_accounting(object,
-                                               VM_PURGABLE_NONVOLATILE,
-                                               FALSE, /* disown */
-                                               FALSE); /* task_objq locked? */
+                                               VM_PURGABLE_NONVOLATILE);
                        /*
                         * Set to VM_PURGABLE_EMPTY because the pages are no
                         * longer accounted in the "non-volatile" ledger
@@ -6062,15 +6108,15 @@ vm_object_get_page_counts(
 
        if (object->resident_page_count <= (size >> PAGE_SHIFT)) {
 
-               vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
+               vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) {
                
-                       if (p->offset >= cur_offset && p->offset < end_offset) {
+                       if (p->vmp_offset >= cur_offset && p->vmp_offset < end_offset) {
 
                                local_resident_count++;
 
                                if (count_dirty_pages) {
                                        
-                                       if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
+                                       if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
                                                
                                                local_dirty_count++;
                                        }
@@ -6089,7 +6135,7 @@ vm_object_get_page_counts(
 
                                if (count_dirty_pages) {
                                        
-                                       if (p->dirty || (p->wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
+                                       if (p->vmp_dirty || (p->vmp_wpmapped && pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
                                
                                                local_dirty_count++;
                                        }
@@ -6342,7 +6388,7 @@ vm_object_transpose(
                 */
                while (!vm_page_queue_empty(&object2->memq)) {
                        page = (vm_page_t) vm_page_queue_first(&object2->memq);
-                       vm_page_rename(page, object1, page->offset);
+                       vm_page_rename(page, object1, page->vmp_offset);
                }
                assert(vm_page_queue_empty(&object2->memq));
        } else if (object2->phys_contiguous || vm_page_queue_empty(&object2->memq)) {
@@ -6353,31 +6399,31 @@ vm_object_transpose(
                 */
                while (!vm_page_queue_empty(&object1->memq)) {
                        page = (vm_page_t) vm_page_queue_first(&object1->memq);
-                       vm_page_rename(page, object2, page->offset);
+                       vm_page_rename(page, object2, page->vmp_offset);
                }
                assert(vm_page_queue_empty(&object1->memq));
        } else {
                /* transfer object1's pages to tmp_object */
                while (!vm_page_queue_empty(&object1->memq)) {
                        page = (vm_page_t) vm_page_queue_first(&object1->memq);
-                       page_offset = page->offset;
+                       page_offset = page->vmp_offset;
                        vm_page_remove(page, TRUE);
-                       page->offset = page_offset;
-                       vm_page_queue_enter(&tmp_object->memq, page, vm_page_t, listq);
+                       page->vmp_offset = page_offset;
+                       vm_page_queue_enter(&tmp_object->memq, page, vm_page_t, vmp_listq);
                }
                assert(vm_page_queue_empty(&object1->memq));
                /* transfer object2's pages to object1 */
                while (!vm_page_queue_empty(&object2->memq)) {
                        page = (vm_page_t) vm_page_queue_first(&object2->memq);
-                       vm_page_rename(page, object1, page->offset);
+                       vm_page_rename(page, object1, page->vmp_offset);
                }
                assert(vm_page_queue_empty(&object2->memq));
                /* transfer tmp_object's pages to object2 */
                while (!vm_page_queue_empty(&tmp_object->memq)) {
                        page = (vm_page_t) vm_page_queue_first(&tmp_object->memq);
                        vm_page_queue_remove(&tmp_object->memq, page,
-                                            vm_page_t, listq);
-                       vm_page_insert(page, object2, page->offset);
+                                            vm_page_t, vmp_listq);
+                       vm_page_insert(page, object2, page->vmp_offset);
                }
                assert(vm_page_queue_empty(&tmp_object->memq));
        }
@@ -6401,6 +6447,9 @@ MACRO_END
 #endif
        /* "resident_page_count" was updated above when transposing pages */
        /* "wired_page_count" was updated above when transposing pages */
+#if ! VM_TAG_ACTIVE_UPDATE
+       /* "wired_objq" was dealt with along with "wired_page_count" */
+#endif /* ! VM_TAG_ACTIVE_UPDATE */
        /* "reusable_page_count" was updated above when transposing pages */
        /* there should be no "copy" */
        assert(!object1->copy);
@@ -6487,6 +6536,26 @@ MACRO_END
        __TRANSPOSE_FIELD(all_reusable);
        assert(object1->blocked_access);
        assert(object2->blocked_access);
+       __TRANSPOSE_FIELD(set_cache_attr);
+       assert(!object1->object_is_shared_cache);
+       assert(!object2->object_is_shared_cache);
+       /* ignore purgeable_queue_type and purgeable_queue_group */
+       assert(!object1->io_tracking);
+       assert(!object2->io_tracking);
+#if VM_OBJECT_ACCESS_TRACKING
+       assert(!object1->access_tracking);
+       assert(!object2->access_tracking);
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+       __TRANSPOSE_FIELD(no_tag_update);
+#if CONFIG_SECLUDED_MEMORY
+       assert(!object1->eligible_for_secluded);
+       assert(!object2->eligible_for_secluded);
+       assert(!object1->can_grab_secluded);
+       assert(!object2->can_grab_secluded);
+#else /* CONFIG_SECLUDED_MEMORY */
+       assert(object1->__object3_unused_bits == 0);
+       assert(object2->__object3_unused_bits == 0);
+#endif /* CONFIG_SECLUDED_MEMORY */
        assert(object1->__object2_unused_bits == 0);
        assert(object2->__object2_unused_bits == 0);
 #if UPL_DEBUG
@@ -6927,7 +6996,7 @@ vm_object_page_op(
                }
 
                /* Sync up on getting the busy bit */
-               if((dst_page->busy || dst_page->cleaning) && 
+               if((dst_page->vmp_busy || dst_page->vmp_cleaning) && 
                           (((ops & UPL_POP_SET) && 
                           (ops & UPL_POP_BUSY)) || (ops & UPL_POP_DUMP))) {
                        /* someone else is playing with the page, we will */
@@ -6937,7 +7006,7 @@ vm_object_page_op(
                }
 
                if (ops & UPL_POP_DUMP) {
-                       if (dst_page->pmapped == TRUE)
+                       if (dst_page->vmp_pmapped == TRUE)
                                pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
 
                        VM_PAGE_FREE(dst_page);
@@ -6950,11 +7019,11 @@ vm_object_page_op(
                        /* Get the condition of flags before requested ops */
                        /* are undertaken */
 
-                       if(dst_page->dirty) *flags |= UPL_POP_DIRTY;
-                       if(dst_page->free_when_done) *flags |= UPL_POP_PAGEOUT;
-                       if(dst_page->precious) *flags |= UPL_POP_PRECIOUS;
-                       if(dst_page->absent) *flags |= UPL_POP_ABSENT;
-                       if(dst_page->busy) *flags |= UPL_POP_BUSY;
+                       if(dst_page->vmp_dirty) *flags |= UPL_POP_DIRTY;
+                       if(dst_page->vmp_free_when_done) *flags |= UPL_POP_PAGEOUT;
+                       if(dst_page->vmp_precious) *flags |= UPL_POP_PRECIOUS;
+                       if(dst_page->vmp_absent) *flags |= UPL_POP_ABSENT;
+                       if(dst_page->vmp_busy) *flags |= UPL_POP_BUSY;
                }
 
                /* The caller should have made a call either contingent with */
@@ -6967,24 +7036,24 @@ vm_object_page_op(
                        /* because the page may already be busy.  However */
                        /* if such violations occur we will assert sooner */
                        /* or later. */
-                       assert(dst_page->busy || (ops & UPL_POP_BUSY));
+                       assert(dst_page->vmp_busy || (ops & UPL_POP_BUSY));
                        if (ops & UPL_POP_DIRTY) {
                                SET_PAGE_DIRTY(dst_page, FALSE);
                        }
-                       if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = TRUE;
-                       if (ops & UPL_POP_PRECIOUS) dst_page->precious = TRUE;
-                       if (ops & UPL_POP_ABSENT) dst_page->absent = TRUE;
-                       if (ops & UPL_POP_BUSY) dst_page->busy = TRUE;
+                       if (ops & UPL_POP_PAGEOUT) dst_page->vmp_free_when_done = TRUE;
+                       if (ops & UPL_POP_PRECIOUS) dst_page->vmp_precious = TRUE;
+                       if (ops & UPL_POP_ABSENT) dst_page->vmp_absent = TRUE;
+                       if (ops & UPL_POP_BUSY) dst_page->vmp_busy = TRUE;
                }
 
                if(ops & UPL_POP_CLR) {
-                       assert(dst_page->busy);
-                       if (ops & UPL_POP_DIRTY) dst_page->dirty = FALSE;
-                       if (ops & UPL_POP_PAGEOUT) dst_page->free_when_done = FALSE;
-                       if (ops & UPL_POP_PRECIOUS) dst_page->precious = FALSE;
-                       if (ops & UPL_POP_ABSENT) dst_page->absent = FALSE;
+                       assert(dst_page->vmp_busy);
+                       if (ops & UPL_POP_DIRTY) dst_page->vmp_dirty = FALSE;
+                       if (ops & UPL_POP_PAGEOUT) dst_page->vmp_free_when_done = FALSE;
+                       if (ops & UPL_POP_PRECIOUS) dst_page->vmp_precious = FALSE;
+                       if (ops & UPL_POP_ABSENT) dst_page->vmp_absent = FALSE;
                        if (ops & UPL_POP_BUSY) {
-                               dst_page->busy = FALSE;
+                               dst_page->vmp_busy = FALSE;
                                PAGE_WAKEUP(dst_page);
                        }
                }
@@ -6993,7 +7062,7 @@ vm_object_page_op(
                         * The physical page number will remain valid
                         * only if the page is kept busy.
                         */
-                       assert(dst_page->busy);
+                       assert(dst_page->vmp_busy);
                        *phys_entry = VM_PAGE_GET_PHYS_PAGE(dst_page);
                }
 
@@ -7054,7 +7123,7 @@ vm_object_range_op(
                dst_page = vm_page_lookup(object, offset);
                if (dst_page != VM_PAGE_NULL) {
                        if (ops & UPL_ROP_DUMP) {
-                               if (dst_page->busy || dst_page->cleaning) {
+                               if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
                                        /*
                                         * someone else is playing with the 
                                         * page, we will have to wait
@@ -7068,16 +7137,16 @@ vm_object_range_op(
                                         */
                                        continue;
                                }
-                               if (dst_page->laundry)
+                               if (dst_page->vmp_laundry)
                                        vm_pageout_steal_laundry(dst_page, FALSE);
 
-                               if (dst_page->pmapped == TRUE)
+                               if (dst_page->vmp_pmapped == TRUE)
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
 
                                VM_PAGE_FREE(dst_page);
 
                        } else if ((ops & UPL_ROP_ABSENT)
-                                          && (!dst_page->absent || dst_page->busy)) {
+                                          && (!dst_page->vmp_absent || dst_page->vmp_busy)) {
                                break;
                        }
                } else if (ops & UPL_ROP_PRESENT)
@@ -7272,9 +7341,9 @@ vm_object_change_wimg_mode(vm_object_t object, unsigned int wimg_mode)
 
        vm_object_paging_wait(object, THREAD_UNINT);
 
-       vm_page_queue_iterate(&object->memq, p, vm_page_t, listq) {
+       vm_page_queue_iterate(&object->memq, p, vm_page_t, vmp_listq) {
 
-               if (!p->fictitious)
+               if (!p->vmp_fictitious)
                        pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(p), wimg_mode);
        }
        if (wimg_mode == VM_WIMG_USE_DEFAULT)
@@ -7414,22 +7483,22 @@ vm_object_compressed_freezer_pageout(
 
                vm_page_lockspin_queues();
 
-               if (p->cleaning || p->fictitious || p->busy || p->absent || p->unusual || p->error || VM_PAGE_WIRED(p)) {
+               if (p->vmp_cleaning || p->vmp_fictitious || p->vmp_busy || p->vmp_absent || p->vmp_unusual || p->vmp_error || VM_PAGE_WIRED(p)) {
 
                        vm_page_unlock_queues();
 
                        KERNEL_DEBUG(0xe0430004 | DBG_FUNC_END, object, local_freed, 1, 0, 0);
 
-                       vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
-                       vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+                       vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq);
+                       vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq);
 
                        continue;
                }
 
-               if (p->pmapped == TRUE) {
+               if (p->vmp_pmapped == TRUE) {
                        int refmod_state, pmap_flags;
 
-                       if (p->dirty || p->precious) {
+                       if (p->vmp_dirty || p->vmp_precious) {
                                pmap_flags = PMAP_OPTIONS_COMPRESSOR;
                        } else {
                                pmap_flags = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
@@ -7441,7 +7510,7 @@ vm_object_compressed_freezer_pageout(
                        }
                }
                
-               if (p->dirty == FALSE && p->precious == FALSE) {
+               if (p->vmp_dirty == FALSE && p->vmp_precious == FALSE) {
                        /*
                         * Clean and non-precious page.
                         */
@@ -7452,7 +7521,7 @@ vm_object_compressed_freezer_pageout(
                        continue;
                }
 
-               if (p->laundry)
+               if (p->vmp_laundry)
                        vm_pageout_steal_laundry(p, TRUE);
 
                vm_page_queues_remove(p, TRUE);
@@ -7466,34 +7535,32 @@ vm_object_compressed_freezer_pageout(
                 * Make the move here while we have the object lock held.
                 */
 
-               vm_page_queue_remove(&object->memq, p, vm_page_t, listq);
-               vm_page_queue_enter(&object->memq, p, vm_page_t, listq);
+               vm_page_queue_remove(&object->memq, p, vm_page_t, vmp_listq);
+               vm_page_queue_enter(&object->memq, p, vm_page_t, vmp_listq);
 
                /*
                 * Grab an activity_in_progress here for vm_pageout_compress_page() to consume.
                 *
                 * Mark the page busy so no one messes with it while we have the object lock dropped.
                 */
-
-               p->busy = TRUE;
+               p->vmp_busy = TRUE;
 
                vm_object_activity_begin(object);
 
                vm_object_unlock(object);
 
-               /*
-                * arg3 == FALSE  tells vm_pageout_compress_page that we don't hold the object lock and the pager may not be initialized.
-                */
-               if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p, FALSE) == KERN_SUCCESS) {
+               if (vm_pageout_compress_page(&freezer_chead, freezer_compressor_scratch_buf, p) == KERN_SUCCESS) {
                        /*
                         * page has already been un-tabled from the object via 'vm_page_remove'
                         */
-                       p->snext = local_freeq;
+                       p->vmp_snext = local_freeq;
                        local_freeq = p;
                        local_freed++;
 
                        if (local_freed >= MAX_FREE_BATCH) {
-               
+
+                               OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
+
                                vm_page_free_list(local_freeq, TRUE);
                                
                                local_freeq = NULL;
@@ -7513,6 +7580,8 @@ vm_object_compressed_freezer_pageout(
        }
 
        if (local_freeq) {
+               OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);         
+
                vm_page_free_list(local_freeq, TRUE);
                                
                local_freeq = NULL;
@@ -7577,33 +7646,31 @@ ReScan:
 
        while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)next)) {
                p = next;
-               next = (vm_page_t)vm_page_queue_next(&next->listq);
+               next = (vm_page_t)vm_page_queue_next(&next->vmp_listq);
                
-               assert(p->vm_page_q_state != VM_PAGE_ON_FREE_Q);
+               assert(p->vmp_q_state != VM_PAGE_ON_FREE_Q);
                
-               if ((p->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) ||
-                   p->cleaning ||
-                   p->laundry ||
-                   p->busy ||
-                   p->absent ||
-                   p->error ||
-                   p->fictitious ||
+               if ((p->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) ||
+                   p->vmp_cleaning ||
+                   p->vmp_laundry ||
+                   p->vmp_busy ||
+                   p->vmp_absent ||
+                   p->vmp_error ||
+                   p->vmp_fictitious ||
                    VM_PAGE_WIRED(p)) {
                        /*
                         * Page is already being cleaned or can't be cleaned.
                         */
                        continue;
                }
+               if (vm_compressor_low_on_space()) {
+                       break;          
+               }
 
                /* Throw to the pageout queue */
 
                vm_page_lockspin_queues();
 
-               if (vm_compressor_low_on_space()) {
-                       vm_page_unlock_queues();
-                       break;          
-               }
-
                if (VM_PAGE_Q_THROTTLED(iq)) {
                                        
                        iq->pgo_draining = TRUE;
@@ -7619,15 +7686,15 @@ ReScan:
                        goto ReScan;
                }
 
-               assert(!p->fictitious);
-               assert(!p->busy);
-               assert(!p->absent);
-               assert(!p->unusual);
-               assert(!p->error);
+               assert(!p->vmp_fictitious);
+               assert(!p->vmp_busy);
+               assert(!p->vmp_absent);
+               assert(!p->vmp_unusual);
+               assert(!p->vmp_error);
                assert(!VM_PAGE_WIRED(p));
-               assert(!p->cleaning);
+               assert(!p->vmp_cleaning);
 
-               if (p->pmapped == TRUE) {
+               if (p->vmp_pmapped == TRUE) {
                        int refmod_state;
                        int pmap_options;
 
@@ -7637,7 +7704,7 @@ ReScan:
                         */
                        pmap_options =
                                PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
-                       if (p->dirty || p->precious) {
+                       if (p->vmp_dirty || p->vmp_precious) {
                                /*
                                 * We already know it's been modified,
                                 * so tell pmap to account for it
@@ -7653,7 +7720,7 @@ ReScan:
                        }
                }
 
-               if (!p->dirty && !p->precious) {
+               if (!p->vmp_dirty && !p->vmp_precious) {
                        vm_page_unlock_queues();
                        VM_PAGE_FREE(p);
                        continue;
@@ -7868,13 +7935,13 @@ vm_page_sleep(vm_object_t o, vm_page_t m, int interruptible)
 
        KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_START, o, m, 0, 0, 0);
        
-       if (o->io_tracking && ((m->busy == TRUE) || (m->cleaning == TRUE) || VM_PAGE_WIRED(m))) {
+       if (o->io_tracking && ((m->vmp_busy == TRUE) || (m->vmp_cleaning == TRUE) || VM_PAGE_WIRED(m))) {
                /* 
                Indicates page is busy due to an I/O. Issue a reprioritize request if necessary.
                */
                vm_page_handle_prio_inversion(o,m);
        }
-       m->wanted = TRUE;
+       m->vmp_wanted = TRUE;
        ret = thread_sleep_vm_object(o, m, interruptible);
        KERNEL_DEBUG((MACHDBG_CODE(DBG_MACH_VM, VM_PAGE_SLEEP)) | DBG_FUNC_END, o, m, 0, 0, 0);
        return ret;
@@ -7903,3 +7970,310 @@ io_reprioritize_thread(void *param __unused, wait_result_t wr __unused)
        IO_REPRIO_THREAD_CONTINUATION();
 }
 #endif
+
+#if VM_OBJECT_ACCESS_TRACKING
+void
+vm_object_access_tracking(
+       vm_object_t     object,
+       int             *access_tracking_p,
+       uint32_t        *access_tracking_reads_p,
+       uint32_t        *access_tracking_writes_p)
+{
+       int     access_tracking;
+
+       access_tracking = !!*access_tracking_p;
+
+       vm_object_lock(object);
+       *access_tracking_p = object->access_tracking;
+       if (access_tracking_reads_p) {
+               *access_tracking_reads_p = object->access_tracking_reads;
+       }
+       if (access_tracking_writes_p) {
+               *access_tracking_writes_p = object->access_tracking_writes;
+       }
+       object->access_tracking = access_tracking;
+       object->access_tracking_reads = 0;
+       object->access_tracking_writes = 0;
+       vm_object_unlock(object);
+
+       if (access_tracking) {
+               vm_object_pmap_protect_options(object,
+                                              0,
+                                              object->vo_size,
+                                              PMAP_NULL,
+                                              0,
+                                              VM_PROT_NONE,
+                                              0);
+       }
+}
+#endif /* VM_OBJECT_ACCESS_TRACKING */
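[Editor's note] The access-tracking hook added above follows a snapshot-and-reset pattern: the caller's requested enable flag is exchanged with the object's current setting, the accumulated read/write fault counts are returned and cleared, and (when tracking stays on) all protections are revoked so the next touch of any page faults and gets counted. The following is a minimal userspace sketch of that exchange only, with hypothetical stand-in types; it deliberately omits the vm_object lock and the vm_object_pmap_protect_options() call.

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for the relevant vm_object fields. */
    struct tracked_object {
        int      access_tracking;        /* is tracking currently enabled? */
        uint32_t access_tracking_reads;  /* read faults counted since last reset */
        uint32_t access_tracking_writes; /* write faults counted since last reset */
    };

    /*
     * Snapshot-and-reset: *enabled_p carries the new setting in and the old
     * setting out; the counters are returned and cleared in one exchange,
     * mirroring the shape of vm_object_access_tracking() minus locking/pmap work.
     */
    static void
    access_tracking_exchange(struct tracked_object *obj, int *enabled_p,
                             uint32_t *reads_p, uint32_t *writes_p)
    {
        int enable = !!*enabled_p;

        *enabled_p = obj->access_tracking;
        if (reads_p)  *reads_p  = obj->access_tracking_reads;
        if (writes_p) *writes_p = obj->access_tracking_writes;

        obj->access_tracking = enable;
        obj->access_tracking_reads = 0;
        obj->access_tracking_writes = 0;
        /* the kernel would now revoke protections (VM_PROT_NONE) so that
         * every future access faults and increments these counters */
    }

    int
    main(void)
    {
        struct tracked_object obj = { 1, 12, 3 };
        int enable = 1;          /* keep tracking enabled */
        uint32_t r, w;

        access_tracking_exchange(&obj, &enable, &r, &w);
        printf("was %s, %u reads, %u writes since last query\n",
               enable ? "on" : "off", r, w);
        return 0;
    }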
+
+void
+vm_object_ledger_tag_ledgers(
+       vm_object_t     object,
+       int             *ledger_idx_volatile,
+       int             *ledger_idx_nonvolatile,
+       int             *ledger_idx_volatile_compressed,
+       int             *ledger_idx_nonvolatile_compressed,
+       boolean_t       *do_footprint)
+{
+       assert(object->shadow == VM_OBJECT_NULL);
+
+       switch (object->vo_ledger_tag) {
+       case VM_OBJECT_LEDGER_TAG_NONE:
+               /* regular purgeable memory */
+               assert(object->purgable != VM_PURGABLE_DENY);
+               *ledger_idx_volatile = task_ledgers.purgeable_volatile;
+               *ledger_idx_nonvolatile = task_ledgers.purgeable_nonvolatile;
+               *ledger_idx_volatile_compressed = task_ledgers.purgeable_volatile_compressed;
+               *ledger_idx_nonvolatile_compressed = task_ledgers.purgeable_nonvolatile_compressed;
+               *do_footprint = TRUE;
+               break;
+       case VM_OBJECT_LEDGER_TAG_NETWORK:
+               *ledger_idx_volatile = task_ledgers.network_volatile;
+               *ledger_idx_volatile_compressed = task_ledgers.network_volatile_compressed;
+               *ledger_idx_nonvolatile = task_ledgers.network_nonvolatile;
+               *ledger_idx_nonvolatile_compressed = task_ledgers.network_nonvolatile_compressed;
+               *do_footprint = FALSE;
+               break;
+       case VM_OBJECT_LEDGER_TAG_MEDIA:
+       default:
+               panic("%s: object %p has unsupported ledger_tag %d\n",
+                     __FUNCTION__, object, object->vo_ledger_tag);
+       }
+}
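[Editor's note] vm_object_ledger_tag_ledgers() above is a pure dispatch: the object's small vo_ledger_tag selects which four task ledger indices (volatile/nonvolatile, plain/compressed) the object's pages are charged to, plus a flag saying whether those pages count against phys_footprint. Below is a compilable sketch of the same shape with made-up ledger index values; the real indices live in task_ledgers and the kernel panics on unsupported tags.

    #include <stdio.h>

    /* Hypothetical ledger indices; the real values come from task_ledgers. */
    enum {
        LEDGER_PURGEABLE_VOLATILE, LEDGER_PURGEABLE_NONVOLATILE,
        LEDGER_PURGEABLE_VOLATILE_COMP, LEDGER_PURGEABLE_NONVOLATILE_COMP,
        LEDGER_NETWORK_VOLATILE, LEDGER_NETWORK_NONVOLATILE,
        LEDGER_NETWORK_VOLATILE_COMP, LEDGER_NETWORK_NONVOLATILE_COMP,
    };

    #define TAG_NONE    0   /* regular purgeable memory, counted in footprint */
    #define TAG_NETWORK 1   /* network buffers, not counted in footprint */

    /* Mirrors the shape of vm_object_ledger_tag_ledgers(): tag in, indices out. */
    static int
    ledgers_for_tag(int tag, int *vol, int *nonvol, int *vol_c, int *nonvol_c,
                    int *do_footprint)
    {
        switch (tag) {
        case TAG_NONE:
            *vol      = LEDGER_PURGEABLE_VOLATILE;
            *nonvol   = LEDGER_PURGEABLE_NONVOLATILE;
            *vol_c    = LEDGER_PURGEABLE_VOLATILE_COMP;
            *nonvol_c = LEDGER_PURGEABLE_NONVOLATILE_COMP;
            *do_footprint = 1;
            return 0;
        case TAG_NETWORK:
            *vol      = LEDGER_NETWORK_VOLATILE;
            *nonvol   = LEDGER_NETWORK_NONVOLATILE;
            *vol_c    = LEDGER_NETWORK_VOLATILE_COMP;
            *nonvol_c = LEDGER_NETWORK_NONVOLATILE_COMP;
            *do_footprint = 0;
            return 0;
        default:
            return -1;  /* the kernel panics on unsupported tags */
        }
    }

    int
    main(void)
    {
        int vol, nonvol, vol_c, nonvol_c, foot;
        if (ledgers_for_tag(TAG_NETWORK, &vol, &nonvol, &vol_c, &nonvol_c, &foot) == 0)
            printf("network tag: nonvolatile ledger %d, footprint? %d\n", nonvol, foot);
        return 0;
    }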
+
+kern_return_t
+vm_object_ownership_change(
+       vm_object_t     object,
+       int             new_ledger_tag,
+       task_t          new_owner,
+       boolean_t       task_objq_locked)
+{
+       int             old_ledger_tag;
+       task_t          old_owner;
+       int             resident_count, wired_count;
+       unsigned int    compressed_count;
+       int             ledger_idx_volatile;
+       int             ledger_idx_nonvolatile;
+       int             ledger_idx_volatile_compressed;
+       int             ledger_idx_nonvolatile_compressed;
+       int             ledger_idx;
+       int             ledger_idx_compressed;
+       boolean_t       do_footprint;
+
+       vm_object_lock_assert_exclusive(object);
+       assert(object->internal);
+
+       old_ledger_tag = object->vo_ledger_tag;
+       old_owner = VM_OBJECT_OWNER(object);
+
+       resident_count = object->resident_page_count - object->wired_page_count;
+       wired_count = object->wired_page_count;
+       compressed_count = vm_compressor_pager_get_count(object->pager);
+
+       /*
+        * Deal with the old owner and/or ledger tag, if needed.
+        */
+       if (old_owner != TASK_NULL &&
+           ((old_owner != new_owner)           /* new owner ... */
+            ||                                 /* ... or ... */
+            (old_ledger_tag &&                 /* ... new ledger */
+             old_ledger_tag != new_ledger_tag))) {
+               /*
+                * Take this object off of the old owner's ledgers.
+                */
+               vm_object_ledger_tag_ledgers(object,
+                                            &ledger_idx_volatile,
+                                            &ledger_idx_nonvolatile,
+                                            &ledger_idx_volatile_compressed,
+                                            &ledger_idx_nonvolatile_compressed,
+                                            &do_footprint);
+               if (object->purgable == VM_PURGABLE_VOLATILE ||
+                   object->purgable == VM_PURGABLE_EMPTY) {
+                       ledger_idx = ledger_idx_volatile;
+                       ledger_idx_compressed = ledger_idx_volatile_compressed;
+               } else {
+                       ledger_idx = ledger_idx_nonvolatile;
+                       ledger_idx_compressed = ledger_idx_nonvolatile_compressed;
+               }
+               if (resident_count) {
+                       /*
+                        * Adjust the appropriate old owner's ledgers by the
+                        * number of resident pages.
+                        */
+                       ledger_debit(old_owner->ledger,
+                                    ledger_idx,
+                                    ptoa_64(resident_count));
+                       /* adjust old owner's footprint */
+                       if (do_footprint &&
+                           object->purgable != VM_PURGABLE_VOLATILE &&
+                           object->purgable != VM_PURGABLE_EMPTY) {
+                               ledger_debit(old_owner->ledger,
+                                            task_ledgers.phys_footprint,
+                                            ptoa_64(resident_count));
+                       }
+               }
+               if (wired_count) {
+                       /* wired pages are always nonvolatile */
+                       ledger_debit(old_owner->ledger,
+                                    ledger_idx_nonvolatile,
+                                    ptoa_64(wired_count));
+                       if (do_footprint) {
+                               ledger_debit(old_owner->ledger,
+                                            task_ledgers.phys_footprint,
+                                            ptoa_64(wired_count));
+                       }
+               }
+               if (compressed_count) {
+                       /*
+                        * Adjust the appropriate old owner's ledgers
+                        * by the number of compressed pages.
+                        */
+                       ledger_debit(old_owner->ledger,
+                                    ledger_idx_compressed,
+                                    ptoa_64(compressed_count));
+                       if (do_footprint &&
+                           object->purgable != VM_PURGABLE_VOLATILE &&
+                           object->purgable != VM_PURGABLE_EMPTY) {
+                               ledger_debit(old_owner->ledger,
+                                            task_ledgers.phys_footprint,
+                                            ptoa_64(compressed_count));
+                       }
+               }
+               if (old_owner != new_owner) {
+                       /* remove object from old_owner's list of owned objects */
+                       DTRACE_VM2(object_owner_remove,
+                                  vm_object_t, object,
+                                  task_t, new_owner);
+                       if (!task_objq_locked) {
+                               task_objq_lock(old_owner);
+                       }
+                       queue_remove(&old_owner->task_objq, object,
+                                    vm_object_t, task_objq);
+                       switch (object->purgable) {
+                       case VM_PURGABLE_NONVOLATILE:
+                       case VM_PURGABLE_EMPTY:
+                               vm_purgeable_nonvolatile_owner_update(old_owner,
+                                                                     -1);
+                               break;
+                       case VM_PURGABLE_VOLATILE:
+                               vm_purgeable_volatile_owner_update(old_owner,
+                                                                  -1);
+                               break;
+                       default:
+                               break;
+                       }
+                       if (!task_objq_locked) {
+                               task_objq_unlock(old_owner);
+                       }
+               }
+       }
+
+       /*
+        * Switch to new ledger tag and/or owner.
+        */
+       object->vo_ledger_tag = new_ledger_tag;
+       object->vo_owner = new_owner;
+
+       if (new_owner == VM_OBJECT_OWNER_DISOWNED) {
+               assert(old_owner != kernel_task);
+               new_owner = kernel_task;
+       }
+
+       /*
+        * Deal with the new owner and/or ledger tag, if needed.
+        */
+       if (new_owner != TASK_NULL &&
+           ((new_owner != old_owner)           /* new owner ... */
+            ||                                 /* ... or ... */
+            (new_ledger_tag &&                 /* ... new ledger */
+             new_ledger_tag != old_ledger_tag))) {
+               /*
+                * Add this object to the new owner's ledgers.
+                */
+               vm_object_ledger_tag_ledgers(object,
+                                            &ledger_idx_volatile,
+                                            &ledger_idx_nonvolatile,
+                                            &ledger_idx_volatile_compressed,
+                                            &ledger_idx_nonvolatile_compressed,
+                                            &do_footprint);
+               if (object->purgable == VM_PURGABLE_VOLATILE ||
+                   object->purgable == VM_PURGABLE_EMPTY) {
+                       ledger_idx = ledger_idx_volatile;
+                       ledger_idx_compressed = ledger_idx_volatile_compressed;
+               } else {
+                       ledger_idx = ledger_idx_nonvolatile;
+                       ledger_idx_compressed = ledger_idx_nonvolatile_compressed;
+               }
+               if (resident_count) {
+                       /*
+                        * Adjust the appropriate new owner's ledgers by the
+                        * number of resident pages.
+                        */
+                       ledger_credit(new_owner->ledger,
+                                     ledger_idx,
+                                     ptoa_64(resident_count));
+                       /* adjust new owner's footprint */
+                       if (do_footprint &&
+                           object->purgable != VM_PURGABLE_VOLATILE &&
+                           object->purgable != VM_PURGABLE_EMPTY) {
+                               ledger_credit(new_owner->ledger,
+                                             task_ledgers.phys_footprint,
+                                             ptoa_64(resident_count));
+                       }
+               }
+               if (wired_count) {
+                       /* wired pages are always nonvolatile */
+                       ledger_credit(new_owner->ledger,
+                                     ledger_idx_nonvolatile,
+                                     ptoa_64(wired_count));
+                       if (do_footprint) {
+                               ledger_credit(new_owner->ledger,
+                                             task_ledgers.phys_footprint,
+                                             ptoa_64(wired_count));
+                       }
+               }
+               if (compressed_count) {
+                       /*
+                        * Adjust the new owner's ledgers by the number of
+                        * compressed pages.
+                        */
+                       ledger_credit(new_owner->ledger,
+                                     ledger_idx_compressed,
+                                     ptoa_64(compressed_count));
+                       if (do_footprint &&
+                           object->purgable != VM_PURGABLE_VOLATILE &&
+                           object->purgable != VM_PURGABLE_EMPTY) {
+                               ledger_credit(new_owner->ledger,
+                                             task_ledgers.phys_footprint,
+                                             ptoa_64(compressed_count));
+                       }
+               }
+               if (new_owner != old_owner) {
+                       /* add object to new_owner's list of owned objects */
+                       DTRACE_VM2(object_owner_add,
+                                  vm_object_t, object,
+                                  task_t, new_owner);
+                       task_objq_lock(new_owner);
+                       queue_enter(&new_owner->task_objq, object,
+                                   vm_object_t, task_objq);
+                       switch (object->purgable) {
+                       case VM_PURGABLE_NONVOLATILE:
+                       case VM_PURGABLE_EMPTY:
+                               vm_purgeable_nonvolatile_owner_update(new_owner,
+                                                                     +1);
+                               break;
+                       case VM_PURGABLE_VOLATILE:
+                               vm_purgeable_volatile_owner_update(new_owner,
+                                                                  +1);
+                               break;
+                       default:
+                               break;
+                       }
+                       task_objq_unlock(new_owner);
+               }
+       }
+
+       return KERN_SUCCESS;
+}
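[Editor's note] The ownership-change routine above is a two-phase transfer: if the owner or ledger tag changes, every page class (resident, wired, compressed) is first debited from the old owner's ledgers (and footprint, where applicable) and the object is unhooked from the old owner's task_objq; then the object's vo_owner/vo_ledger_tag are switched and the same amounts are credited to the new owner. The sketch below models only the debit-then-credit accounting with toy types and a single nonvolatile ledger line; it is an assumption-laden illustration, not the kernel's ledger API.

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE_BYTES 4096ULL

    /* Minimal stand-ins for a task ledger and an owned object. */
    struct toy_task {
        const char *name;
        int64_t nonvolatile_bytes;   /* one ledger line is enough for the sketch */
        int64_t footprint_bytes;
    };

    struct toy_object {
        struct toy_task *owner;
        unsigned resident_pages;     /* non-wired resident pages */
        unsigned wired_pages;
        unsigned compressed_pages;
    };

    /* Debit-then-credit transfer, shaped like vm_object_ownership_change()
     * for a nonvolatile object whose pages count against phys_footprint. */
    static void
    change_owner(struct toy_object *obj, struct toy_task *new_owner)
    {
        struct toy_task *old = obj->owner;
        uint64_t bytes = (uint64_t)(obj->resident_pages + obj->wired_pages +
                                    obj->compressed_pages) * PAGE_SIZE_BYTES;

        if (old != NULL && old != new_owner) {          /* take it off the old ledgers */
            old->nonvolatile_bytes -= (int64_t)bytes;
            old->footprint_bytes   -= (int64_t)bytes;
        }
        obj->owner = new_owner;                         /* switch ownership */
        if (new_owner != NULL && new_owner != old) {    /* put it on the new ledgers */
            new_owner->nonvolatile_bytes += (int64_t)bytes;
            new_owner->footprint_bytes   += (int64_t)bytes;
        }
    }

    int
    main(void)
    {
        struct toy_task a = { "old", 10 * PAGE_SIZE_BYTES, 10 * PAGE_SIZE_BYTES };
        struct toy_task b = { "new", 0, 0 };
        struct toy_object obj = { &a, 6, 2, 2 };        /* 10 pages total */

        change_owner(&obj, &b);
        printf("%s footprint: %lld, %s footprint: %lld\n",
               a.name, (long long)a.footprint_bytes,
               b.name, (long long)b.footprint_bytes);
        return 0;
    }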
index 2474375718d03d4ba05a4e6b8243427f6b3e2262..ccd1de547f36b4e377f6c35f151d451a0f9ed1c1 100644 (file)
@@ -105,7 +105,6 @@ extern btlog_t *vm_object_tracking_btlog;
 #endif /* VM_OBJECT_TRACKING */
 
 struct vm_page;
-struct vm_shared_region_slide_info;
 
 /*
  *     Types defined:
@@ -126,9 +125,10 @@ struct vm_object_fault_info {
        /* boolean_t */ stealth:1,
        /* boolean_t */ io_sync:1,
        /* boolean_t */ cs_bypass:1,
+       /* boolean_t */ pmap_cs_associated:1,
        /* boolean_t */ mark_zf_absent:1,
        /* boolean_t */ batch_pmap_op:1,
-               __vm_object_fault_info_unused_bits:26;
+               __vm_object_fault_info_unused_bits:25;
        int             pmap_options;
 };
 
@@ -137,8 +137,7 @@ struct vm_object_fault_info {
 #define vo_cache_pages_to_scan         vo_un1.vou_cache_pages_to_scan
 #define vo_shadow_offset               vo_un2.vou_shadow_offset
 #define vo_cache_ts                    vo_un2.vou_cache_ts
-#define vo_purgeable_owner             vo_un2.vou_purgeable_owner
-#define vo_slide_info                  vo_un2.vou_slide_info
+#define vo_owner                       vo_un2.vou_owner
 
 struct vm_object {
        /*
@@ -171,7 +170,7 @@ struct vm_object {
        int                     ref_count;      /* Number of references */
        unsigned int            resident_page_count;
                                                /* number of resident pages */
-       const unsigned int      wired_page_count; /* number of wired pages
+       unsigned int            wired_page_count; /* number of wired pages
                                                     use VM_OBJECT_WIRED_PAGE_UPDATE macros to update */
        unsigned int            reusable_page_count;
 
@@ -189,11 +188,10 @@ struct vm_object {
                clock_sec_t     vou_cache_ts;   /* age of an external object
                                                 * present in cache
                                                 */
-               task_t          vou_purgeable_owner;    /* If the purg'a'ble bits below are set 
-                                                        * to volatile/emtpy, this is the task 
-                                                        * that owns this purgeable object.
-                                                        */
-               struct vm_shared_region_slide_info *vou_slide_info;
+               task_t          vou_owner;      /* If the object is purgeable
+                                                * or has a "ledger_tag", this
+                                                * is the task that owns it.
+                                                */
        } vo_un2;
 
        memory_object_t         pager;          /* Where to get data */
@@ -348,7 +346,7 @@ struct vm_object {
                all_reusable:1,
                blocked_access:1,
                set_cache_attr:1,
-               object_slid:1,
+               object_is_shared_cache:1,
                purgeable_queue_type:2,
                purgeable_queue_group:3,
                io_tracking:1,
@@ -359,7 +357,18 @@ struct vm_object {
 #else /* CONFIG_SECLUDED_MEMORY */
                __object3_unused_bits:2,
 #endif /* CONFIG_SECLUDED_MEMORY */
-               __object2_unused_bits:5;        /* for expansion */
+#if VM_OBJECT_ACCESS_TRACKING
+               access_tracking:1,
+#else /* VM_OBJECT_ACCESS_TRACKING */
+               __unused_access_tracking:1,
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+               vo_ledger_tag:2,
+               __object2_unused_bits:2;        /* for expansion */
+
+#if VM_OBJECT_ACCESS_TRACKING
+       uint32_t        access_tracking_reads;
+       uint32_t        access_tracking_writes;
+#endif /* VM_OBJECT_ACCESS_TRACKING */
 
        uint8_t                 scan_collisions;
         vm_tag_t               wire_tag;
@@ -387,6 +396,10 @@ struct vm_object {
         queue_chain_t          objq;      /* object queue - currently used for purgable queues */
        queue_chain_t           task_objq; /* objects owned by task - protected by task lock */
 
+#if !VM_TAG_ACTIVE_UPDATE
+       queue_chain_t           wired_objq;
+#endif /* !VM_TAG_ACTIVE_UPDATE */
+
 #if DEBUG
        void *purgeable_owner_bt[16];
        task_t vo_purgeable_volatilizer; /* who made it volatile? */
@@ -394,11 +407,26 @@ struct vm_object {
 #endif /* DEBUG */
 };
 
+/* values for object->vo_ledger_tag */
+#define VM_OBJECT_LEDGER_TAG_NONE      0
+#define VM_OBJECT_LEDGER_TAG_NETWORK   1
+#define VM_OBJECT_LEDGER_TAG_MEDIA     2
+#define VM_OBJECT_LEDGER_TAG_RESERVED  3
+
 #define VM_OBJECT_PURGEABLE_FAULT_ERROR(object)                                \
        ((object)->volatile_fault &&                                    \
         ((object)->purgable == VM_PURGABLE_VOLATILE ||                 \
          (object)->purgable == VM_PURGABLE_EMPTY))
 
+#if VM_OBJECT_ACCESS_TRACKING
+extern uint64_t vm_object_access_tracking_reads;
+extern uint64_t vm_object_access_tracking_writes;
+extern void vm_object_access_tracking(vm_object_t object,
+                                     int *access_tracking,
+                                     uint32_t *access_tracking_reads,
+                                     uint32_t *acess_tracking_writes);
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+
 extern
 vm_object_t    kernel_object;          /* the single kernel object */
 
@@ -421,30 +449,44 @@ extern lck_attr_t         vm_map_lck_attr;
 #error VM_TAG_ACTIVE_UPDATE
 #endif
 
+#if VM_TAG_ACTIVE_UPDATE
+#define VM_OBJECT_WIRED_ENQUEUE(object) panic("VM_OBJECT_WIRED_ENQUEUE")
+#define VM_OBJECT_WIRED_DEQUEUE(object) panic("VM_OBJECT_WIRED_DEQUEUE")
+#else /* VM_TAG_ACTIVE_UPDATE */
+#define VM_OBJECT_WIRED_ENQUEUE(object)                                        \
+       MACRO_BEGIN                                                     \
+       lck_spin_lock(&vm_objects_wired_lock);                          \
+       assert(!(object)->wired_objq.next);                             \
+       assert(!(object)->wired_objq.prev);                             \
+       queue_enter(&vm_objects_wired, (object),                        \
+                   vm_object_t, wired_objq);                           \
+       lck_spin_unlock(&vm_objects_wired_lock);                        \
+       MACRO_END
+#define VM_OBJECT_WIRED_DEQUEUE(object)                                        \
+       MACRO_BEGIN                                                     \
+       if ((object)->wired_objq.next) {                                \
+               lck_spin_lock(&vm_objects_wired_lock);                  \
+               queue_remove(&vm_objects_wired, (object),               \
+                            vm_object_t, wired_objq);                  \
+               lck_spin_unlock(&vm_objects_wired_lock);                \
+       }                                                               \
+       MACRO_END
+#endif /* VM_TAG_ACTIVE_UPDATE */
+
 #define VM_OBJECT_WIRED(object, tag)                                   \
     MACRO_BEGIN                                                                \
     assert(VM_KERN_MEMORY_NONE != (tag));                              \
     assert(VM_KERN_MEMORY_NONE == (object)->wire_tag);                 \
     (object)->wire_tag = (tag);                                        \
-    if (!VM_TAG_ACTIVE_UPDATE                                          \
-       && ((object)->purgable == VM_PURGABLE_DENY))                    \
-    {                                                                  \
-       lck_spin_lock(&vm_objects_wired_lock);                          \
-       assert(!(object)->objq.next);                                   \
-       assert(!(object)->objq.prev);                                   \
-       queue_enter(&vm_objects_wired, (object), vm_object_t, objq);    \
-       lck_spin_unlock(&vm_objects_wired_lock);                        \
+    if (!VM_TAG_ACTIVE_UPDATE) {                                       \
+       VM_OBJECT_WIRED_ENQUEUE((object));                              \
     }                                                                  \
     MACRO_END
 
 #define VM_OBJECT_UNWIRED(object)                                                      \
     MACRO_BEGIN                                                                                \
-    if (!VM_TAG_ACTIVE_UPDATE                                                          \
-       && ((object)->purgable == VM_PURGABLE_DENY) && (object)->objq.next)             \
-    {                                                                                  \
-       lck_spin_lock(&vm_objects_wired_lock);                                          \
-       queue_remove(&vm_objects_wired, (object), vm_object_t, objq);                   \
-       lck_spin_unlock(&vm_objects_wired_lock);                                        \
+    if (!VM_TAG_ACTIVE_UPDATE) {                                                       \
+           VM_OBJECT_WIRED_DEQUEUE((object));                                          \
     }                                                                                  \
     if (VM_KERN_MEMORY_NONE != (object)->wire_tag) {                                   \
        vm_tag_update_size((object)->wire_tag, -ptoa_64((object)->wired_page_count));   \
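[Editor's note] The new VM_OBJECT_WIRED_ENQUEUE/DEQUEUE macros in the hunk above hang every wired object on the global vm_objects_wired list whenever tag accounting is not actively updated, using a NULL wired_objq.next pointer as the "not on the list" sentinel and taking vm_objects_wired_lock around the queue surgery. Below is a rough pthread-based model of that enqueue/dequeue idiom with toy types; it is not the kernel's queue_enter/queue_remove or lck_spin API.

    #include <pthread.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Toy doubly linked list node embedded in an object, like wired_objq. */
    struct toy_obj {
        struct toy_obj *next;   /* NULL means "not on the wired list" */
        struct toy_obj *prev;
        const char *name;
    };

    static struct toy_obj wired_head = { &wired_head, &wired_head, "head" };
    static pthread_mutex_t wired_lock = PTHREAD_MUTEX_INITIALIZER;

    static void
    wired_enqueue(struct toy_obj *o)
    {
        pthread_mutex_lock(&wired_lock);
        /* like the macro's asserts: object must not already be queued */
        o->next = &wired_head;
        o->prev = wired_head.prev;
        wired_head.prev->next = o;
        wired_head.prev = o;
        pthread_mutex_unlock(&wired_lock);
    }

    static void
    wired_dequeue(struct toy_obj *o)
    {
        if (o->next == NULL)                 /* sentinel check from the macro */
            return;
        pthread_mutex_lock(&wired_lock);
        o->prev->next = o->next;
        o->next->prev = o->prev;
        o->next = o->prev = NULL;            /* back to "not on the list" */
        pthread_mutex_unlock(&wired_lock);
    }

    int
    main(void)
    {
        struct toy_obj obj = { NULL, NULL, "obj" };
        wired_enqueue(&obj);
        wired_dequeue(&obj);
        printf("%s queued? %s\n", obj.name, obj.next ? "yes" : "no");
        return 0;
    }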
@@ -462,7 +504,7 @@ extern lck_attr_t           vm_map_lck_attr;
        if (__wireddelta) {                                                             \
            boolean_t __overflow __assert_only =                                        \
            os_add_overflow((object)->wired_page_count, __wireddelta,                   \
-                           (unsigned int *)(uintptr_t)&(object)->wired_page_count);    \
+                           &(object)->wired_page_count);                               \
            assert(!__overflow);                                                        \
            if (!(object)->pageout && !(object)->no_tag_update) {                       \
                if (__wireddelta > 0) {                                                 \
@@ -487,10 +529,10 @@ extern lck_attr_t         vm_map_lck_attr;
     __wireddelta += delta; \
 
 #define VM_OBJECT_WIRED_PAGE_ADD(object, m)                     \
-    if (!m->private && !m->fictitious) __wireddelta++;
+    if (!(m)->vmp_private && !(m)->vmp_fictitious) __wireddelta++;
 
 #define VM_OBJECT_WIRED_PAGE_REMOVE(object, m)                  \
-    if (!m->private && !m->fictitious) __wireddelta--;
+    if (!(m)->vmp_private && !(m)->vmp_fictitious) __wireddelta--;
 
 
 
@@ -1140,4 +1182,27 @@ extern void      vm_object_cache_add(vm_object_t);
 extern void    vm_object_cache_remove(vm_object_t);
 extern int     vm_object_cache_evict(int, int);
 
+#define VM_OBJECT_OWNER_DISOWNED ((task_t) -1)
+#define VM_OBJECT_OWNER(object)                                                \
+       ((((object)->purgable == VM_PURGABLE_DENY &&                    \
+          (object)->vo_ledger_tag == 0) ||                             \
+         (object)->vo_owner == TASK_NULL)                              \
+        ? TASK_NULL    /* not owned */                                 \
+        : (((object)->vo_owner == VM_OBJECT_OWNER_DISOWNED)            \
+           ? kernel_task /* disowned -> kernel */                      \
+           : (object)->vo_owner)) /* explicit owner */                 \
+
+extern void    vm_object_ledger_tag_ledgers(
+       vm_object_t object,
+       int *ledger_idx_volatile,
+       int *ledger_idx_nonvolatile,
+       int *ledger_idx_volatile_compressed,
+       int *ledger_idx_nonvolatile_compressed,
+       boolean_t *do_footprint);
+extern kern_return_t vm_object_ownership_change(
+       vm_object_t object,
+       int ledger_tag,
+       task_t owner,
+       boolean_t task_objq_locked);
+
 #endif /* _VM_VM_OBJECT_H_ */
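[Editor's note] The VM_OBJECT_OWNER() macro added near the end of this header resolves three cases: not owned (non-purgeable with no ledger tag, or an explicitly NULL owner), disowned (a -1 sentinel that redirects accounting to the kernel task), and an explicit owning task. The following small sketch re-expresses that decision as a function with placeholder constants; the PURGABLE_DENY value and the task type here are stand-ins, not the kernel's definitions.

    #include <stdio.h>

    struct task { const char *name; };
    typedef struct task *task_t;

    #define TASK_NULL       ((task_t)0)
    #define OWNER_DISOWNED  ((task_t)-1)   /* mirrors VM_OBJECT_OWNER_DISOWNED */
    #define PURGABLE_DENY   3              /* placeholder value for the sketch */

    static struct task kernel_task_storage = { "kernel_task" };
    static task_t kernel_task = &kernel_task_storage;

    struct toy_object {
        int    purgable;       /* purgeable state */
        int    vo_ledger_tag;  /* 0 = none */
        task_t vo_owner;
    };

    /* Same three-way decision as VM_OBJECT_OWNER(), written as a function. */
    static task_t
    object_owner(const struct toy_object *o)
    {
        if ((o->purgable == PURGABLE_DENY && o->vo_ledger_tag == 0) ||
            o->vo_owner == TASK_NULL)
            return TASK_NULL;              /* not owned at all */
        if (o->vo_owner == OWNER_DISOWNED)
            return kernel_task;            /* disowned -> charged to the kernel */
        return o->vo_owner;                /* explicit owner */
    }

    int
    main(void)
    {
        struct toy_object o = { PURGABLE_DENY, 1, OWNER_DISOWNED };
        task_t owner = object_owner(&o);
        printf("owner: %s\n", owner ? owner->name : "none");
        return 0;
    }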
index 92781d9da7b4d1ec75d36831fbbc506e52286b08..47a456aa6d97fd6e584c9635819dc0bfffd23b4a 100644 (file)
@@ -40,4 +40,8 @@
 #define VM_OBJECT_TRACKING 0
 #define VM_SCAN_FOR_SHADOW_CHAIN (DEVELOPMENT || DEBUG)
 
+#define VM_OBJECT_ACCESS_TRACKING (DEVELOPMENT || DEBUG)
+
+#define VM_NAMED_ENTRY_LIST (DEVELOPMENT || DEBUG)
+
 #endif /* __VM_VM_OPTIONS_H__ */
index 559af3f0b126363ba0e79fc2f37e8663960f3b7d..f8fa9c025c6f9f0e4b0a8357549d33ae7501fa16 100644 (file)
@@ -158,10 +158,10 @@ extern    char    vm_page_non_speculative_pageable_states[];
 extern char    vm_page_active_or_inactive_states[];
 
 
-#define        VM_PAGE_INACTIVE(m)                     (vm_page_inactive_states[m->vm_page_q_state])
-#define VM_PAGE_PAGEABLE(m)                    (vm_page_pageable_states[m->vm_page_q_state])
-#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m)    (vm_page_non_speculative_pageable_states[m->vm_page_q_state])
-#define        VM_PAGE_ACTIVE_OR_INACTIVE(m)           (vm_page_active_or_inactive_states[m->vm_page_q_state])
+#define        VM_PAGE_INACTIVE(m)                     (vm_page_inactive_states[m->vmp_q_state])
+#define VM_PAGE_PAGEABLE(m)                    (vm_page_pageable_states[m->vmp_q_state])
+#define VM_PAGE_NON_SPECULATIVE_PAGEABLE(m)    (vm_page_non_speculative_pageable_states[m->vmp_q_state])
+#define        VM_PAGE_ACTIVE_OR_INACTIVE(m)           (vm_page_active_or_inactive_states[m->vmp_q_state])
 
 
 #define        VM_PAGE_NOT_ON_Q                0               /* page is not present on any queue, nor is it wired... mainly a transient state */
@@ -184,49 +184,45 @@ extern    char    vm_page_active_or_inactive_states[];
 #define        VM_PAGE_Q_STATE_ARRAY_SIZE      (VM_PAGE_Q_STATE_LAST_VALID_VALUE+1)
 
 
-#define        pageq   pageq_un.vm_page_pageq
-#define snext  pageq_un.vm_page_snext
+/*
+ * The structure itself. See the block comment above for what (O) and (P) mean.
+ */
+#define vmp_pageq vmp_q_un.vmp_q_pageq
+#define vmp_snext vmp_q_un.vmp_q_snext
 
 struct vm_page {
        union {
-               vm_page_queue_chain_t   vm_page_pageq;  /* queue info for FIFO queue or free list (P) */
-               struct vm_page  *vm_page_snext;
-       } pageq_un;
+               vm_page_queue_chain_t vmp_q_pageq;           /* queue info for FIFO queue or free list (P) */
+               struct vm_page        *vmp_q_snext;
+       } vmp_q_un;
 
-       vm_page_queue_chain_t   listq;  /* all pages in same object (O) */
+       vm_page_queue_chain_t         vmp_listq;           /* all pages in same object (O) */
 
 #if CONFIG_BACKGROUND_QUEUE
-        vm_page_queue_chain_t  vm_page_backgroundq;    /* anonymous pages in the background pool (P) */
+        vm_page_queue_chain_t         vmp_backgroundq;    /* anonymous pages in the background pool (P) */
 #endif
 
-       vm_object_offset_t      offset; /* offset into that object (O,P) */
-       vm_page_object_t        vm_page_object;         /* which object am I in (O&P) */
+       vm_object_offset_t            vmp_offset;          /* offset into that object (O,P) */
+       vm_page_object_t              vmp_object;          /* which object am I in (O&P) */
 
        /*
-        * The following word of flags is protected
-        * by the "page queues" lock.
+        * The following word of flags is always protected by the "page queues" lock.
         *
-        * we use the 'wire_count' field to store the local
-        * queue id if local queues are enabled...
-        * see the comments at 'vm_page_queues_remove' as to
-        * why this is safe to do
+        * We use 'vmp_wire_count' to store the local queue id if local queues are enabled.
+        * See the comments at 'vm_page_queues_remove' as to why this is safe to do.
         */
-#define local_id wire_count
-       unsigned int    wire_count:16,  /* how many wired down maps use me? (O&P) */
-                       vm_page_q_state:4,      /* which q is the page on (P) */
-
-                       vm_page_in_background:1,
-                       vm_page_on_backgroundq:1,
-       /* boolean_t */
-                       gobbled:1,      /* page used internally (P) */
-                       laundry:1,      /* page is being cleaned now (P)*/
-                       no_cache:1,     /* page is not to be cached and should
-                                        * be reused ahead of other pages (P) */
-                       private:1,      /* Page should not be returned to
-                                        *  the free list (P) */
-                       reference:1,    /* page has been used (P) */
-
-                       __unused_pageq_bits:5;  /* 5 bits available here */
+#define vmp_local_id vmp_wire_count
+       unsigned int vmp_wire_count:16,      /* how many wired down maps use me? (O&P) */
+                    vmp_q_state:4,          /* which q is the page on (P) */
+                    vmp_in_background:1,
+                    vmp_on_backgroundq:1,
+                    vmp_gobbled:1,          /* page used internally (P) */
+                    vmp_laundry:1,          /* page is being cleaned now (P)*/
+                    vmp_no_cache:1,         /* page is not to be cached and should */
+                                            /* be reused ahead of other pages (P) */
+                    vmp_private:1,          /* Page should not be returned to the free list (P) */
+                    vmp_reference:1,        /* page has been used (P) */
+                    vmp_unused_page_bits:5;
 
        /*
         * MUST keep the 2 32 bit words used as bit fields
@@ -236,62 +232,48 @@ struct vm_page {
         * they are protected by 2 different locks, this
         * is a real problem
         */
-       vm_page_packed_t next_m;        /* VP bucket link (O) */
+       vm_page_packed_t vmp_next_m;            /* VP bucket link (O) */
 
        /*
-        * The following word of flags is protected
-        * by the "VM object" lock.
-        */
-       unsigned int
-       /* boolean_t */ busy:1,         /* page is in transit (O) */
-                       wanted:1,       /* someone is waiting for page (O) */
-                       tabled:1,       /* page is in VP table (O) */
-                       hashed:1,       /* page is in vm_page_buckets[]
-                                          (O) + the bucket lock */
-                       fictitious:1,   /* Physical page doesn't exist (O) */
-       /*
-        * IMPORTANT: the "pmapped", "xpmapped" and "clustered" bits can be modified while holding the
+        * The following word of flags is protected by the "VM object" lock.
+        *
+        * IMPORTANT: the "vmp_pmapped", "vmp_xpmapped" and "vmp_clustered" bits can be modified while holding the
         * VM object "shared" lock + the page lock provided through the pmap_lock_phys_page function.
-        * This is done in vm_fault_enter and the CONSUME_CLUSTERED macro.
+        * This is done in vm_fault_enter() and the CONSUME_CLUSTERED macro.
         * It's also ok to modify them behind just the VM object "exclusive" lock.
         */
-                       clustered:1,    /* page is not the faulted page (O) or (O-shared AND pmap_page) */
-                       pmapped:1,      /* page has been entered at some
-                                                * point into a pmap (O) or (O-shared AND pmap_page) */
-                       xpmapped:1,     /* page has been entered with execute permission (O)
-                                          or (O-shared AND pmap_page) */
-
-                       wpmapped:1,     /* page has been entered at some
-                                        * point into a pmap for write (O) */
-                       free_when_done:1,       /* page is to be freed once cleaning is completed (O) */
-                       absent:1,       /* Data has been requested, but is
-                                        *  not yet available (O) */
-                       error:1,        /* Data manager was unable to provide
-                                        *  data due to error (O) */
-                       dirty:1,        /* Page must be cleaned (O) */
-                       cleaning:1,     /* Page clean has begun (O) */
-                       precious:1,     /* Page is precious; data must be
-                                        *  returned even if clean (O) */
-                       overwriting:1,  /* Request to unlock has been made
-                                        * without having data. (O)
-                                        * [See vm_fault_page_overwrite] */
-                       restart:1,      /* Page was pushed higher in shadow
-                                          chain by copy_call-related pagers;
-                                          start again at top of chain */
-                       unusual:1,      /* Page is absent, error, restart or
-                                          page locked */
-                       cs_validated:1,    /* code-signing: page was checked */ 
-                       cs_tainted:1,      /* code-signing: page is tainted */
-                       cs_nx:1,           /* code-signing: page is nx */
-                       reusable:1,
-                       lopage:1,
-                       slid:1,
-                       written_by_kernel:1,    /* page was written by kernel (i.e. decompressed) */
-                       __unused_object_bits:7;  /* 7 bits available here */
+       unsigned int    vmp_busy:1,           /* page is in transit (O) */
+                       vmp_wanted:1,         /* someone is waiting for page (O) */
+                       vmp_tabled:1,         /* page is in VP table (O) */
+                       vmp_hashed:1,         /* page is in vm_page_buckets[] (O) + the bucket lock */
+                       vmp_fictitious:1,     /* Physical page doesn't exist (O) */
+                       vmp_clustered:1,      /* page is not the faulted page (O) or (O-shared AND pmap_page) */
+                       vmp_pmapped:1,        /* page has at some time been entered into a pmap (O) or */
+                                             /* (O-shared AND pmap_page) */
+                       vmp_xpmapped:1,       /* page has been entered with execute permission (O) or */
+                                             /* (O-shared AND pmap_page) */
+                       vmp_wpmapped:1,       /* page has been entered at some point into a pmap for write (O) */
+                       vmp_free_when_done:1, /* page is to be freed once cleaning is completed (O) */
+                       vmp_absent:1,         /* Data has been requested, but is not yet available (O) */
+                       vmp_error:1,          /* Data manager was unable to provide data due to error (O) */
+                       vmp_dirty:1,          /* Page must be cleaned (O) */
+                       vmp_cleaning:1,       /* Page clean has begun (O) */
+                       vmp_precious:1,       /* Page is precious; data must be returned even if clean (O) */
+                       vmp_overwriting:1,    /* Request to unlock has been made without having data. (O) */
+                                             /* [See vm_fault_page_overwrite] */
+                       vmp_restart:1,        /* Page was pushed higher in shadow chain by copy_call-related pagers */
+                                             /* start again at top of chain */
+                       vmp_unusual:1,        /* Page is absent, error, restart or page locked */
+                       vmp_cs_validated:1,   /* code-signing: page was checked */      
+                       vmp_cs_tainted:1,     /* code-signing: page is tainted */
+                       vmp_cs_nx:1,          /* code-signing: page is nx */
+                       vmp_reusable:1,
+                       vmp_lopage:1,
+                       vmp_written_by_kernel:1, /* page was written by kernel (i.e. decompressed) */
+                       vmp_unused_object_bits:8;
 
 #if    !defined(__arm__) && !defined(__arm64__)
-       ppnum_t         phys_page;      /* Physical address of page, passed
-                                        *  to pmap_enter (read-only) */
+       ppnum_t         vmp_phys_page;        /* Physical page number of the page */
 #endif
 };
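A minimal illustration of how the renamed flags are typically consulted, assuming a caller that holds the page's VM object lock; the helper name is hypothetical and not part of the kernel sources:

	/*
	 * Illustrative sketch only -- vm_page_needs_cleaning() is a made-up
	 * helper, not an XNU function.  Every flag read here is an (O) bit,
	 * so the page's VM object lock must be held.
	 */
	static inline boolean_t
	vm_page_needs_cleaning(vm_page_t m)
	{
		if (m->vmp_busy || m->vmp_cleaning)
			return FALSE;	/* page is in transit or already being cleaned */
		return (m->vmp_dirty || m->vmp_precious);
	}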
 
@@ -309,7 +291,7 @@ extern      unsigned int vm_first_phys_ppnum;
 struct vm_page_with_ppnum {
        struct  vm_page vm_page_wo_ppnum;
 
-       ppnum_t phys_page;
+       ppnum_t vmp_phys_page;
 };
 typedef struct vm_page_with_ppnum *vm_page_with_ppnum_t;
 
@@ -319,13 +301,13 @@ static inline ppnum_t VM_PAGE_GET_PHYS_PAGE(vm_page_t m)
        if (m >= vm_page_array_beginning_addr && m < vm_page_array_ending_addr)
                return ((ppnum_t)((uintptr_t)(m - vm_page_array_beginning_addr) + vm_first_phys_ppnum));
        else
-               return (((vm_page_with_ppnum_t)m)->phys_page);
+               return (((vm_page_with_ppnum_t)m)->vmp_phys_page);
 }
 
 #define VM_PAGE_SET_PHYS_PAGE(m, ppnum)                \
        MACRO_BEGIN                             \
        if ((m) < vm_page_array_beginning_addr || (m) >= vm_page_array_ending_addr)     \
-               ((vm_page_with_ppnum_t)(m))->phys_page = ppnum; \
+               ((vm_page_with_ppnum_t)(m))->vmp_phys_page = ppnum;     \
        assert(ppnum == VM_PAGE_GET_PHYS_PAGE(m));              \
        MACRO_END
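In this configuration a page that lives inside the main vm_pages array carries no physical page number at all: VM_PAGE_GET_PHYS_PAGE() above recovers it from the page's array index plus vm_first_phys_ppnum, and only pages allocated outside the array fall through to the explicit vmp_phys_page field of the vm_page_with_ppnum wrapper. As a worked example (hypothetical values, not taken from this diff): with vm_first_phys_ppnum == 0x800, the element at index 5 of the array reports ppnum 0x805, while a fictitious page allocated outside the array reports whatever was stored through VM_PAGE_SET_PHYS_PAGE().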
 
@@ -340,10 +322,10 @@ struct vm_page_with_ppnum {
 typedef struct vm_page_with_ppnum *vm_page_with_ppnum_t;
 
 
-#define        VM_PAGE_GET_PHYS_PAGE(page)     (page)->phys_page
+#define        VM_PAGE_GET_PHYS_PAGE(page)     (page)->vmp_phys_page
 #define VM_PAGE_SET_PHYS_PAGE(page, ppnum)     \
        MACRO_BEGIN                             \
-       (page)->phys_page = ppnum;              \
+       (page)->vmp_phys_page = ppnum;          \
        MACRO_END
 
 #define VM_PAGE_GET_CLUMP(m)    ((VM_PAGE_GET_PHYS_PAGE(m)) >> vm_clump_shift)
@@ -398,13 +380,13 @@ static inline uintptr_t   vm_page_unpack_ptr(uintptr_t p)
 #define        VM_PAGE_PACK_PTR(p)     vm_page_pack_ptr((uintptr_t)(p))
 #define        VM_PAGE_UNPACK_PTR(p)   vm_page_unpack_ptr((uintptr_t)(p))
 
-#define        VM_PAGE_OBJECT(p)       ((vm_object_t)(VM_PAGE_UNPACK_PTR(p->vm_page_object)))
+#define        VM_PAGE_OBJECT(p)       ((vm_object_t)(VM_PAGE_UNPACK_PTR(p->vmp_object)))
 #define        VM_PAGE_PACK_OBJECT(o)  ((vm_page_object_t)(VM_PAGE_PACK_PTR(o)))
 
 
 #define        VM_PAGE_ZERO_PAGEQ_ENTRY(p)     \
 MACRO_BEGIN                            \
-        (p)->snext = 0;                        \
+        (p)->vmp_snext = 0;            \
 MACRO_END
 
 
@@ -560,7 +542,7 @@ MACRO_BEGIN
         __n = VM_PAGE_GET_PHYS_PAGE(elt) & vm_clump_mask;                                                \
         /* scan backward looking for a buddy page */                                                     \
         for(__i=0, __p=(elt)-1; __i<__n && __p>=vm_page_array_beginning_addr; __i++, __p--) {            \
-            if(__p->vm_page_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) {     \
+            if(__p->vmp_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) {     \
                 if(__prev == 0) __prev = (vm_page_queue_entry_t) __p;                                    \
                 __first = (vm_page_queue_entry_t) __p;                                                   \
                 __n_free++;                                                                              \
@@ -568,7 +550,7 @@ MACRO_BEGIN
         }                                                                                                \
         /* scan forward looking for a buddy page */                                                      \
         for(__i=__n+1, __p=(elt)+1; __i<vm_clump_size && __p<vm_page_array_boundary; __i++, __p++) {     \
-            if(__p->vm_page_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) {     \
+            if(__p->vmp_q_state == VM_PAGE_ON_FREE_Q && __clump_num == VM_PAGE_GET_CLUMP(__p)) {     \
                 __DEBUG_CHECK_BUDDIES(__check, __prev, __p, field);                                      \
                 if(__prev == 0) __prev = (vm_page_queue_entry_t) VM_PAGE_UNPACK_PTR(__p->field.prev);    \
                 __last = (vm_page_queue_entry_t) __p;                                                    \
@@ -841,14 +823,14 @@ MACRO_END
 #define        VM_PAGE_PACK_PTR(p)     (p)
 #define        VM_PAGE_UNPACK_PTR(p)   ((uintptr_t)(p))
 
-#define        VM_PAGE_OBJECT(p)       (vm_object_t)(p->vm_page_object)
+#define        VM_PAGE_OBJECT(p)       (vm_object_t)(p->vmp_object)
 #define        VM_PAGE_PACK_OBJECT(o)  ((vm_page_object_t)(VM_PAGE_PACK_PTR(o)))
 
 
 #define        VM_PAGE_ZERO_PAGEQ_ENTRY(p)     \
 MACRO_BEGIN                            \
-        (p)->pageq.next = 0;           \
-        (p)->pageq.prev = 0;           \
+        (p)->vmp_pageq.next = 0;               \
+        (p)->vmp_pageq.prev = 0;               \
 MACRO_END
 
 #define        VM_PAGE_CONVERT_TO_QUEUE_ENTRY(p)       ((queue_entry_t)(p))
@@ -940,9 +922,9 @@ extern      void    vm_page_add_to_backgroundq(vm_page_t mem, boolean_t first);
 extern void    vm_page_remove_from_backgroundq(vm_page_t mem);
 #endif
 
-#define VM_PAGE_WIRED(m)       ((m)->vm_page_q_state == VM_PAGE_IS_WIRED)
-#define NEXT_PAGE(m)           ((m)->snext)
-#define NEXT_PAGE_PTR(m)       (&(m)->snext)
+#define VM_PAGE_WIRED(m)       ((m)->vmp_q_state == VM_PAGE_IS_WIRED)
+#define NEXT_PAGE(m)           ((m)->vmp_snext)
+#define NEXT_PAGE_PTR(m)       (&(m)->vmp_snext)
 
 /*
  * XXX The unusual bit should not be necessary.  Most of the bit
@@ -1147,8 +1129,6 @@ unsigned int      vm_page_free_min;       /* When to wakeup pageout */
 extern
 unsigned int   vm_page_throttle_limit; /* When to throttle new page creation */
 extern
-uint32_t       vm_page_creation_throttle;      /* When to throttle new page creation */
-extern
 unsigned int   vm_page_inactive_target;/* How many do we want inactive? */
 #if CONFIG_SECLUDED_MEMORY
 extern
@@ -1157,12 +1137,8 @@ unsigned int     vm_page_secluded_target;/* How many do we want secluded? */
 extern
 unsigned int   vm_page_anonymous_min;  /* When it's ok to pre-clean */
 extern
-unsigned int   vm_page_inactive_min;   /* When to wakeup pageout */
-extern
 unsigned int   vm_page_free_reserved;  /* How many pages reserved to do pageout */
 extern
-unsigned int   vm_page_throttle_count; /* Count of page allocations throttled */
-extern
 unsigned int   vm_page_gobble_count;
 extern
 unsigned int   vm_page_stolen_count;   /* Count of stolen pages not accounted in zones */
@@ -1285,6 +1261,9 @@ extern void               vm_page_free_unlocked(
                                        vm_page_t       page,
                                        boolean_t       remove_from_hash);
 
+extern void             vm_page_balance_inactive(
+                                       int             max_to_move);
+
 extern void            vm_page_activate(
                                        vm_page_t       page);
 
@@ -1384,6 +1363,9 @@ extern void               vm_page_validate_cs(vm_page_t   page);
 extern void            vm_page_validate_cs_mapped(
        vm_page_t       page,
        const void      *kaddr);
+extern void            vm_page_validate_cs_mapped_slow(
+       vm_page_t       page,
+       const void      *kaddr);
 extern void            vm_page_validate_cs_mapped_chunk(
        vm_page_t       page,
        const void      *kaddr,
@@ -1434,29 +1416,32 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
 #endif /* CONFIG_JETSAM */
 
 /*
- *     Functions implemented as macros. m->wanted and m->busy are
- *     protected by the object lock.
+ * Functions implemented as macros. m->vmp_wanted and m->vmp_busy are
+ * protected by the object lock.
  */
 
 #if CONFIG_EMBEDDED
 #define SET_PAGE_DIRTY(m, set_pmap_modified)                           \
                MACRO_BEGIN                                             \
                vm_page_t __page__ = (m);                               \
-               if (__page__->dirty == FALSE && (set_pmap_modified)) {  \
+               if (__page__->vmp_pmapped == TRUE &&                    \
+                   __page__->vmp_wpmapped == TRUE &&                   \
+                   __page__->vmp_dirty == FALSE &&                     \
+                   (set_pmap_modified)) {                              \
                        pmap_set_modify(VM_PAGE_GET_PHYS_PAGE(__page__)); \
                }                                                       \
-               __page__->dirty = TRUE;                                 \
+               __page__->vmp_dirty = TRUE;                             \
                MACRO_END
 #else /* CONFIG_EMBEDDED */
 #define SET_PAGE_DIRTY(m, set_pmap_modified)                           \
                MACRO_BEGIN                                             \
                vm_page_t __page__ = (m);                               \
-               __page__->dirty = TRUE;                                 \
+               __page__->vmp_dirty = TRUE;                             \
                MACRO_END
 #endif /* CONFIG_EMBEDDED */
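On CONFIG_EMBEDDED kernels the macro forwards the modify bit to the pmap layer only when the page has actually been entered writable into some pmap (both vmp_pmapped and vmp_wpmapped set) and is not yet dirty; otherwise it simply sets vmp_dirty. A minimal usage sketch, with object and m as placeholder variables rather than code from this file:

	/*
	 * Illustrative usage only: the (O) bits touched by SET_PAGE_DIRTY
	 * require the page's VM object lock.
	 */
	vm_object_lock(object);
	SET_PAGE_DIRTY(m, TRUE);	/* mark dirty; let the pmap modify bit follow */
	vm_object_unlock(object);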
 
 #define PAGE_ASSERT_WAIT(m, interruptible)                     \
-               (((m)->wanted = TRUE),                          \
+               (((m)->vmp_wanted = TRUE),                      \
                 assert_wait((event_t) (m), (interruptible)))
 
 #if CONFIG_IOSCHED
@@ -1464,23 +1449,23 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
                vm_page_sleep(o, m, interruptible)
 #else
 #define PAGE_SLEEP(o, m, interruptible)                                \
-       (((m)->wanted = TRUE),                                  \
+       (((m)->vmp_wanted = TRUE),                              \
         thread_sleep_vm_object((o), (m), (interruptible)))
 #endif
 
 #define PAGE_WAKEUP_DONE(m)                                    \
                MACRO_BEGIN                                     \
-               (m)->busy = FALSE;                              \
-               if ((m)->wanted) {                              \
-                       (m)->wanted = FALSE;                    \
+               (m)->vmp_busy = FALSE;                          \
+               if ((m)->vmp_wanted) {                          \
+                       (m)->vmp_wanted = FALSE;                \
                        thread_wakeup((event_t) (m));           \
                }                                               \
                MACRO_END
 
 #define PAGE_WAKEUP(m)                                         \
                MACRO_BEGIN                                     \
-               if ((m)->wanted) {                              \
-                       (m)->wanted = FALSE;                    \
+               if ((m)->vmp_wanted) {                          \
+                       (m)->vmp_wanted = FALSE;                \
                        thread_wakeup((event_t) (m));           \
                }                                               \
                MACRO_END
@@ -1523,7 +1508,7 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
 #if DEVELOPMENT || DEBUG
 #define VM_PAGE_SPECULATIVE_USED_ADD()                         \
        MACRO_BEGIN                                             \
-       OSAddAtomic(1, &vm_page_speculative_used);      \
+       OSAddAtomic(1, &vm_page_speculative_used);              \
        MACRO_END
 #else
 #define        VM_PAGE_SPECULATIVE_USED_ADD()
@@ -1534,16 +1519,16 @@ extern void memorystatus_pages_update(unsigned int pages_avail);
        MACRO_BEGIN                                             \
        ppnum_t __phys_page;                                    \
        __phys_page = VM_PAGE_GET_PHYS_PAGE(mem);               \
-       pmap_lock_phys_page(__phys_page);       \
-       if (mem->clustered) {                                   \
+       pmap_lock_phys_page(__phys_page);                       \
+       if (mem->vmp_clustered) {                               \
                vm_object_t o;                                  \
                o = VM_PAGE_OBJECT(mem);                        \
                assert(o);                                      \
                o->pages_used++;                                \
-               mem->clustered = FALSE;                         \
+               mem->vmp_clustered = FALSE;                     \
                VM_PAGE_SPECULATIVE_USED_ADD();                 \
        }                                                       \
-       pmap_unlock_phys_page(__phys_page);     \
+       pmap_unlock_phys_page(__phys_page);                     \
        MACRO_END
 
 
@@ -1610,8 +1595,8 @@ extern unsigned int vm_max_delayed_work_limit;
 
 #define VM_PAGE_ADD_DELAYED_WORK(dwp, mem, dw_cnt)             \
        MACRO_BEGIN                                             \
-       if (mem->busy == FALSE) {                               \
-               mem->busy = TRUE;                               \
+       if (mem->vmp_busy == FALSE) {                           \
+               mem->vmp_busy = TRUE;                           \
                if ( !(dwp->dw_mask & DW_vm_page_free))         \
                        dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); \
        }                                                       \
@@ -1632,5 +1617,11 @@ extern void vm_page_enqueue_inactive(vm_page_t mem, boolean_t first);
 extern void vm_page_enqueue_active(vm_page_t mem, boolean_t first);
 extern void vm_page_check_pageable_safe(vm_page_t page);
 
+#if CONFIG_SECLUDED_MEMORY
+extern uint64_t secluded_shutoff_trigger;
+extern void start_secluded_suppression(task_t);
+extern void stop_secluded_suppression(task_t);
+#endif /* CONFIG_SECLUDED_MEMORY */
+
 
 #endif /* _VM_VM_PAGE_H_ */
osfmk/vm/vm_pageout.c
index 20eac579db3213c4f95b93b587350d47f057ebfa..bf722548b023d5a64d32add02f84b4ebf62529df 100644 (file)
@@ -2,7 +2,7 @@
  * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
- * 
+ *
  * This file contains Original Code and/or Modifications of Original Code
  * as defined in and that are subject to the Apple Public Source License
  * Version 2.0 (the 'License'). You may not use this file except in
  * unlawful or unlicensed copies of an Apple operating system, or to
  * circumvent, violate, or enable the circumvention or violation of, any
  * terms of an Apple operating system software license agreement.
- * 
+ *
  * Please obtain a copy of the License at
  * http://www.opensource.apple.com/apsl/ and read it before using this file.
- * 
+ *
  * The Original Code and all software distributed under the License are
  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  * Please see the License for the specific language governing rights and
  * limitations under the License.
- * 
+ *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  */
 /*
  * @OSF_COPYRIGHT@
  */
-/* 
+/*
  * Mach Operating System
  * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
  * All Rights Reserved.
- * 
+ *
  * Permission to use, copy, modify and distribute this software and its
  * documentation is hereby granted, provided that both the copyright
  * notice and this permission notice appear in all copies of the
  * software, derivative works or modified versions, and any portions
  * thereof, and that both notices appear in supporting documentation.
- * 
+ *
  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
- * 
+ *
  * Carnegie Mellon requests users of this software to return to
- * 
+ *
  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
  *  School of Computer Science
  *  Carnegie Mellon University
  *  Pittsburgh PA 15213-3890
- * 
+ *
  * any improvements or extensions that they make and grant Carnegie Mellon
  * the rights to redistribute these changes.
  */
 #include <vm/vm_phantom_cache.h>
 #endif
 
-extern int cs_debug;
-
 #if UPL_DEBUG
 #include <libkern/OSDebug.h>
 #endif
 
-extern void m_drain(void);
+extern int cs_debug;
+
+extern void mbuf_drain(boolean_t);
 
 #if VM_PRESSURE_EVENTS
 #if CONFIG_JETSAM
@@ -135,24 +135,14 @@ extern uint64_t memorystatus_available_pages_critical;
 
 extern unsigned int memorystatus_frozen_count;
 extern unsigned int memorystatus_suspended_count;
-
 extern vm_pressure_level_t memorystatus_vm_pressure_level;
-int memorystatus_purge_on_warning = 2;
-int memorystatus_purge_on_urgent = 5;
-int memorystatus_purge_on_critical = 8;
 
 void vm_pressure_response(void);
-boolean_t vm_pressure_thread_running = FALSE;
 extern void consider_vm_pressure_events(void);
 
 #define MEMORYSTATUS_SUSPENDED_THRESHOLD  4
 #endif /* VM_PRESSURE_EVENTS */
 
-boolean_t      vm_pressure_changed = FALSE;
-
-#ifndef VM_PAGEOUT_BURST_ACTIVE_THROTTLE   /* maximum iterations of the active queue to move pages to inactive */
-#define VM_PAGEOUT_BURST_ACTIVE_THROTTLE  100
-#endif
 
 #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE  /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
 #ifdef CONFIG_EMBEDDED
@@ -166,24 +156,20 @@ boolean_t vm_pressure_changed = FALSE;
 #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
 #endif
 
-#ifndef VM_PAGEOUT_INACTIVE_RELIEF
-#define VM_PAGEOUT_INACTIVE_RELIEF 50  /* minimum number of pages to move to the inactive q */
-#endif
-
 #ifndef        VM_PAGE_LAUNDRY_MAX
 #define        VM_PAGE_LAUNDRY_MAX     128UL   /* maximum pageouts on a given pageout queue */
 #endif /* VM_PAGEOUT_LAUNDRY_MAX */
 
 #ifndef        VM_PAGEOUT_BURST_WAIT
-#define        VM_PAGEOUT_BURST_WAIT   10      /* milliseconds */
+#define        VM_PAGEOUT_BURST_WAIT   1       /* milliseconds */
 #endif /* VM_PAGEOUT_BURST_WAIT */
 
 #ifndef        VM_PAGEOUT_EMPTY_WAIT
-#define VM_PAGEOUT_EMPTY_WAIT  200     /* milliseconds */
+#define VM_PAGEOUT_EMPTY_WAIT  50      /* milliseconds */
 #endif /* VM_PAGEOUT_EMPTY_WAIT */
 
 #ifndef        VM_PAGEOUT_DEADLOCK_WAIT
-#define VM_PAGEOUT_DEADLOCK_WAIT       300     /* milliseconds */
+#define VM_PAGEOUT_DEADLOCK_WAIT 100    /* milliseconds */
 #endif /* VM_PAGEOUT_DEADLOCK_WAIT */
 
 #ifndef        VM_PAGEOUT_IDLE_WAIT
@@ -191,22 +177,12 @@ boolean_t vm_pressure_changed = FALSE;
 #endif /* VM_PAGEOUT_IDLE_WAIT */
 
 #ifndef        VM_PAGEOUT_SWAP_WAIT
-#define VM_PAGEOUT_SWAP_WAIT   50      /* milliseconds */
+#define VM_PAGEOUT_SWAP_WAIT   10      /* milliseconds */
 #endif /* VM_PAGEOUT_SWAP_WAIT */
 
-#ifndef VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED
-#define VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED           1000    /* maximum pages considered before we issue a pressure event */
-#endif /* VM_PAGEOUT_PRESSURE_PAGES_CONSIDERED */
-
-#ifndef VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS
-#define VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS         5       /* seconds */
-#endif /* VM_PAGEOUT_PRESSURE_EVENT_MONITOR_SECS */
-
-unsigned int   vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
-unsigned int   vm_page_speculative_percentage = 5;
 
 #ifndef VM_PAGE_SPECULATIVE_TARGET
-#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_page_speculative_percentage))
+#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
 #endif /* VM_PAGE_SPECULATIVE_TARGET */
 
 
@@ -223,11 +199,7 @@ unsigned int       vm_page_speculative_percentage = 5;
  */
 
 #ifndef        VM_PAGE_INACTIVE_TARGET
-#ifdef CONFIG_EMBEDDED
-#define        VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 3)
-#else
 #define        VM_PAGE_INACTIVE_TARGET(avail)  ((avail) * 1 / 2)
-#endif
 #endif /* VM_PAGE_INACTIVE_TARGET */
 
 /*
@@ -289,6 +261,7 @@ unsigned int        vm_page_speculative_percentage = 5;
  *     we will make per call of vm_pageout_scan().
  */
 #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
+
 #ifndef        VM_PAGE_REACTIVATE_LIMIT
 #ifdef CONFIG_EMBEDDED
 #define        VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
@@ -298,18 +271,8 @@ unsigned int       vm_page_speculative_percentage = 5;
 #endif /* VM_PAGE_REACTIVATE_LIMIT */
 #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM      1000
 
-
 extern boolean_t hibernate_cleaning_in_progress;
 
-/*
- * Exported variable used to broadcast the activation of the pageout scan
- * Working Set uses this to throttle its use of pmap removes.  In this
- * way, code which runs within memory in an uncontested context does
- * not keep encountering soft faults.
- */
-
-unsigned int   vm_pageout_scan_event_counter = 0;
-
 /*
  * Forward declarations for internal routines.
  */
@@ -332,6 +295,7 @@ boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
 boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
 boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
 #endif
+
 void vm_pageout_garbage_collect(int);
 static void vm_pageout_iothread_external(void);
 static void vm_pageout_iothread_internal(struct cq *cq);
@@ -339,176 +303,48 @@ static void vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *, boolean_t
 
 extern void vm_pageout_continue(void);
 extern void vm_pageout_scan(void);
+
 void vm_tests(void); /* forward */
 
-boolean_t      vm_restricted_to_single_processor = FALSE;
 #if !CONFIG_EMBEDDED
 static boolean_t vm_pageout_waiter  = FALSE;
 static boolean_t vm_pageout_running = FALSE;
 #endif /* !CONFIG_EMBEDDED */
 
 
-static thread_t        vm_pageout_external_iothread = THREAD_NULL;
-static thread_t        vm_pageout_internal_iothread = THREAD_NULL;
-
-unsigned int vm_pageout_reserved_internal = 0;
-unsigned int vm_pageout_reserved_really = 0;
-
-unsigned int vm_pageout_swap_wait = 0;
-unsigned int vm_pageout_idle_wait = 0;         /* milliseconds */
-unsigned int vm_pageout_empty_wait = 0;                /* milliseconds */
-unsigned int vm_pageout_burst_wait = 0;                /* milliseconds */
-unsigned int vm_pageout_deadlock_wait = 0;     /* milliseconds */
-unsigned int vm_pageout_deadlock_relief = 0;
-unsigned int vm_pageout_inactive_relief = 0;
-unsigned int vm_pageout_burst_active_throttle = 0;
-unsigned int vm_pageout_burst_inactive_throttle = 0;
-
-int    vm_upl_wait_for_pages = 0;
-
-
-/*
- *     These variables record the pageout daemon's actions:
- *     how many pages it looks at and what happens to those pages.
- *     No locking needed because only one thread modifies the variables.
- */
-
-unsigned int vm_pageout_active = 0;            /* debugging */
-unsigned int vm_pageout_inactive = 0;          /* debugging */
-unsigned int vm_pageout_inactive_throttled = 0;        /* debugging */
-unsigned int vm_pageout_inactive_forced = 0;   /* debugging */
-unsigned int vm_pageout_inactive_nolock = 0;   /* debugging */
-unsigned int vm_pageout_inactive_avoid = 0;    /* debugging */
-unsigned int vm_pageout_inactive_busy = 0;     /* debugging */
-unsigned int vm_pageout_inactive_error = 0;    /* debugging */
-unsigned int vm_pageout_inactive_absent = 0;   /* debugging */
-unsigned int vm_pageout_inactive_notalive = 0; /* debugging */
-unsigned int vm_pageout_inactive_used = 0;     /* debugging */
-unsigned int vm_pageout_cache_evicted = 0;     /* debugging */
-unsigned int vm_pageout_inactive_clean = 0;    /* debugging */
-unsigned int vm_pageout_speculative_clean = 0; /* debugging */
-unsigned int vm_pageout_speculative_dirty = 0; /* debugging */
-
-unsigned int vm_pageout_freed_from_cleaned = 0;
-unsigned int vm_pageout_freed_from_speculative = 0;
-unsigned int vm_pageout_freed_from_inactive_clean = 0;
-unsigned int vm_pageout_freed_after_compression = 0;
-
-extern uint32_t vm_compressor_pages_grabbed;
-extern  uint32_t c_segment_pages_compressed;
-
-unsigned int vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
-
-unsigned int vm_pageout_cleaned_reclaimed = 0;         /* debugging; how many cleaned pages are reclaimed by the pageout scan */
-unsigned int vm_pageout_cleaned_reactivated = 0;       /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
-unsigned int vm_pageout_cleaned_reference_reactivated = 0;
-unsigned int vm_pageout_cleaned_volatile_reactivated = 0;
-unsigned int vm_pageout_cleaned_fault_reactivated = 0;
-unsigned int vm_pageout_cleaned_commit_reactivated = 0;        /* debugging; how many cleaned pages are found to be referenced on commit (and are therefore reactivated) */
-unsigned int vm_pageout_cleaned_busy = 0;
-unsigned int vm_pageout_cleaned_nolock = 0;
-
-unsigned int vm_pageout_inactive_dirty_internal = 0;   /* debugging */
-unsigned int vm_pageout_inactive_dirty_external = 0;   /* debugging */
-unsigned int vm_pageout_inactive_deactivated = 0;      /* debugging */
-unsigned int vm_pageout_inactive_anonymous = 0;        /* debugging */
-unsigned int vm_pageout_dirty_no_pager = 0;    /* debugging */
-unsigned int vm_pageout_purged_objects = 0;    /* used for sysctl vm stats */
-unsigned int vm_stat_discard = 0;              /* debugging */
-unsigned int vm_stat_discard_sent = 0;         /* debugging */
-unsigned int vm_stat_discard_failure = 0;      /* debugging */
-unsigned int vm_stat_discard_throttle = 0;     /* debugging */
-unsigned int vm_pageout_reactivation_limit_exceeded = 0;       /* debugging */
-unsigned int vm_pageout_inactive_force_reclaim = 0;    /* debugging */
-unsigned int vm_pageout_skipped_external = 0;   /* debugging */
-
-unsigned int vm_pageout_scan_reclaimed_throttled = 0;
-unsigned int vm_pageout_scan_active_throttled = 0;
-unsigned int vm_pageout_scan_inactive_throttled_internal = 0;
-unsigned int vm_pageout_scan_inactive_throttled_external = 0;
-unsigned int vm_pageout_scan_throttle = 0;                     /* debugging */
-unsigned int vm_pageout_scan_burst_throttle = 0;               /* debugging */
-unsigned int vm_pageout_scan_empty_throttle = 0;               /* debugging */
-unsigned int vm_pageout_scan_swap_throttle = 0;                /* debugging */
-unsigned int vm_pageout_scan_deadlock_detected = 0;            /* debugging */
-unsigned int vm_pageout_scan_active_throttle_success = 0;      /* debugging */
-unsigned int vm_pageout_scan_inactive_throttle_success = 0;    /* debugging */
-unsigned int vm_pageout_inactive_external_forced_jetsam_count = 0;     /* debugging */
-unsigned int vm_pageout_scan_throttle_deferred = 0;            /* debugging */
-unsigned int vm_pageout_scan_yield_unthrottled = 0;            /* debugging */
-unsigned int vm_page_speculative_count_drifts = 0;
-unsigned int vm_page_speculative_count_drift_max = 0;
-
-uint32_t vm_compressor_failed;
-
-/*
- * Backing store throttle when BS is exhausted
- */
-unsigned int   vm_backing_store_low = 0;
-
-unsigned int vm_pageout_out_of_line  = 0;
-unsigned int vm_pageout_in_place  = 0;
-
-unsigned int vm_page_steal_pageout_page = 0;
-
-struct vm_config       vm_config;
+#if DEVELOPMENT || DEBUG
+struct vm_pageout_debug vm_pageout_debug;
+#endif
+struct vm_pageout_vminfo vm_pageout_vminfo;
+struct vm_pageout_state  vm_pageout_state;
+struct vm_config         vm_config;
 
 struct vm_pageout_queue vm_pageout_queue_internal __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
 struct vm_pageout_queue vm_pageout_queue_external __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
 
-unsigned int vm_page_speculative_target = 0;
-
-vm_object_t    vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+int        vm_upl_wait_for_pages = 0;
+vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
 
 boolean_t (* volatile consider_buffer_cache_collect)(int) = NULL;
 
-#if DEVELOPMENT || DEBUG
-unsigned long vm_cs_validated_resets = 0;
-#endif
-
 int    vm_debug_events = 0;
 
 #if CONFIG_MEMORYSTATUS
-#if !CONFIG_JETSAM
-extern boolean_t memorystatus_idle_exit_from_VM(void);
-#endif
 extern boolean_t memorystatus_kill_on_VM_page_shortage(boolean_t async);
-extern void memorystatus_on_pageout_scan_end(void);
 
 uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
 uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
-#if DEVELOPMENT || DEBUG
-uint32_t vm_grab_anon_overrides = 0;
-uint32_t vm_grab_anon_nops = 0;
-#endif
 
 #endif
 
-#if MACH_CLUSTER_STATS
-unsigned long vm_pageout_cluster_dirtied = 0;
-unsigned long vm_pageout_cluster_cleaned = 0;
-unsigned long vm_pageout_cluster_collisions = 0;
-unsigned long vm_pageout_cluster_clusters = 0;
-unsigned long vm_pageout_cluster_conversions = 0;
-unsigned long vm_pageout_target_collisions = 0;
-unsigned long vm_pageout_target_page_dirtied = 0;
-unsigned long vm_pageout_target_page_freed = 0;
-#define CLUSTER_STAT(clause)   clause
-#else  /* MACH_CLUSTER_STATS */
-#define CLUSTER_STAT(clause)
-#endif /* MACH_CLUSTER_STATS */
 
 
-#if DEVELOPMENT || DEBUG
-vmct_stats_t vmct_stats;
-#endif
-
-/* 
+/*
  *     Routine:        vm_pageout_object_terminate
  *     Purpose:
  *             Destroy the pageout_object, and perform all of the
  *             required cleanup actions.
- * 
+ *
  *     In/Out conditions:
  *             The object must be locked, and will be returned locked.
  */
@@ -534,13 +370,13 @@ vm_pageout_object_terminate(
 
                p = (vm_page_t) vm_page_queue_first(&object->memq);
 
-               assert(p->private);
-               assert(p->free_when_done);
-               p->free_when_done = FALSE;
-               assert(!p->cleaning);
-               assert(!p->laundry);
+               assert(p->vmp_private);
+               assert(p->vmp_free_when_done);
+               p->vmp_free_when_done = FALSE;
+               assert(!p->vmp_cleaning);
+               assert(!p->vmp_laundry);
 
-               offset = p->offset;
+               offset = p->vmp_offset;
                VM_PAGE_FREE(p);
                p = VM_PAGE_NULL;
 
@@ -550,15 +386,15 @@ vm_pageout_object_terminate(
                if(m == VM_PAGE_NULL)
                        continue;
 
-               assert((m->dirty) || (m->precious) ||
-                               (m->busy && m->cleaning));
+               assert((m->vmp_dirty) || (m->vmp_precious) ||
+                               (m->vmp_busy && m->vmp_cleaning));
 
                /*
                 * Handle the trusted pager throttle.
                 * Also decrement the burst throttle (if external).
                 */
                vm_page_lock_queues();
-               if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
+               if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
                        vm_pageout_throttle_up(m);
 
                /*
@@ -569,15 +405,12 @@ vm_pageout_object_terminate(
                 * pages may have been modified between the selection as an
                 * adjacent page and conversion to a target.
                 */
-               if (m->free_when_done) {
-                       assert(m->busy);
-                       assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
-                       assert(m->wire_count == 1);
-                       m->cleaning = FALSE;
-                       m->free_when_done = FALSE;
-#if MACH_CLUSTER_STATS
-                       if (m->wanted) vm_pageout_target_collisions++;
-#endif
+               if (m->vmp_free_when_done) {
+                       assert(m->vmp_busy);
+                       assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
+                       assert(m->vmp_wire_count == 1);
+                       m->vmp_cleaning = FALSE;
+                       m->vmp_free_when_done = FALSE;
                        /*
                         * Revoke all access to the page. Since the object is
                         * locked, and the page is busy, this prevents the page
@@ -591,17 +424,15 @@ vm_pageout_object_terminate(
                        if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(m, FALSE);
                        } else {
-                               m->dirty = FALSE;
+                               m->vmp_dirty = FALSE;
                        }
 
-                       if (m->dirty) {
-                               CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
+                       if (m->vmp_dirty) {
                                vm_page_unwire(m, TRUE);        /* reactivates */
                                VM_STAT_INCR(reactivations);
                                PAGE_WAKEUP_DONE(m);
                        } else {
-                               CLUSTER_STAT(vm_pageout_target_page_freed++;)
-                               vm_page_free(m);/* clears busy, etc. */
+                               vm_page_free(m);  /* clears busy, etc. */
                        }
                        vm_page_unlock_queues();
                        continue;
@@ -612,19 +443,19 @@ vm_pageout_object_terminate(
                 * If prep_pin_count is nonzero, then someone is using the
                 * page, so make it active.
                 */
-               if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) && !m->private) {
-                       if (m->reference)
+               if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
+                       if (m->vmp_reference)
                                vm_page_activate(m);
                        else
                                vm_page_deactivate(m);
                }
-               if (m->overwriting) {
+               if (m->vmp_overwriting) {
                        /*
                         * the (COPY_OUT_FROM == FALSE) request_page_list case
                         */
-                       if (m->busy) {
+                       if (m->vmp_busy) {
                                /*
-                                * We do not re-set m->dirty !
+                                * We do not re-set m->vmp_dirty !
                                 * The page was busy so no extraneous activity
                                 * could have occurred. COPY_INTO is a read into the
                                 * new pages. CLEAN_IN_PLACE does actually write
@@ -634,8 +465,8 @@ vm_pageout_object_terminate(
                                 */
                                pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
 
-                               m->busy = FALSE;
-                               m->absent = FALSE;
+                               m->vmp_busy = FALSE;
+                               m->vmp_absent = FALSE;
                        } else {
                                /*
                                 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
@@ -645,28 +476,11 @@ vm_pageout_object_terminate(
                                 assert(VM_PAGE_WIRED(m));
                                 vm_page_unwire(m, TRUE);       /* reactivates */
                        }
-                       m->overwriting = FALSE;
+                       m->vmp_overwriting = FALSE;
                } else {
-                       /*
-                        * Set the dirty state according to whether or not the page was
-                        * modified during the pageout. Note that we purposefully do
-                        * NOT call pmap_clear_modify since the page is still mapped.
-                        * If the page were to be dirtied between the 2 calls, this
-                        * this fact would be lost. This code is only necessary to
-                        * maintain statistics, since the pmap module is always
-                        * consulted if m->dirty is false.
-                        */
-#if MACH_CLUSTER_STATS
-                       m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
-
-                       if (m->dirty)   vm_pageout_cluster_dirtied++;
-                       else            vm_pageout_cluster_cleaned++;
-                       if (m->wanted)  vm_pageout_cluster_collisions++;
-#else
-                       m->dirty = FALSE;
-#endif
+                       m->vmp_dirty = FALSE;
                }
-               m->cleaning = FALSE;
+               m->vmp_cleaning = FALSE;
 
                /*
                 * Wakeup any thread waiting for the page to be un-cleaning.
@@ -705,14 +519,14 @@ vm_pageclean_setup(
        vm_object_t             new_object,
        vm_object_offset_t      new_offset)
 {
-       assert(!m->busy);
+       assert(!m->vmp_busy);
 #if 0
-       assert(!m->cleaning);
+       assert(!m->vmp_cleaning);
 #endif
 
        XPR(XPR_VM_PAGEOUT,
            "vm_pageclean_setup, obj 0x%X off 0x%X page 0x%X new 0x%X new_off 0x%X\n",
-               VM_PAGE_OBJECT(m), m->offset, m, 
+               VM_PAGE_OBJECT(m), m->vmp_offset, m,
                new_m, new_offset);
 
        pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
@@ -720,19 +534,19 @@ vm_pageclean_setup(
        /*
         * Mark original page as cleaning in place.
         */
-       m->cleaning = TRUE;
+       m->vmp_cleaning = TRUE;
        SET_PAGE_DIRTY(m, FALSE);
-       m->precious = FALSE;
+       m->vmp_precious = FALSE;
 
        /*
         * Convert the fictitious page to a private shadow of
         * the real page.
         */
-       assert(new_m->fictitious);
+       assert(new_m->vmp_fictitious);
        assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
-       new_m->fictitious = FALSE;
-       new_m->private = TRUE;
-       new_m->free_when_done = TRUE;
+       new_m->vmp_fictitious = FALSE;
+       new_m->vmp_private = TRUE;
+       new_m->vmp_free_when_done = TRUE;
        VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
 
        vm_page_lockspin_queues();
@@ -740,8 +554,8 @@ vm_pageclean_setup(
        vm_page_unlock_queues();
 
        vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
-       assert(!new_m->wanted);
-       new_m->busy = FALSE;
+       assert(!new_m->vmp_wanted);
+       new_m->vmp_busy = FALSE;
 }
 
 /*
@@ -762,7 +576,7 @@ vm_pageclean_setup(
  *     Implementation:
  *             Move this page to a completely new object.
  */
-void   
+void
 vm_pageout_initialize_page(
        vm_page_t       m)
 {
@@ -778,22 +592,22 @@ vm_pageout_initialize_page(
 
        object = VM_PAGE_OBJECT(m);
 
-       assert(m->busy);
+       assert(m->vmp_busy);
        assert(object->internal);
 
        /*
         *      Verify that we really want to clean this page
         */
-       assert(!m->absent);
-       assert(!m->error);
-       assert(m->dirty);
+       assert(!m->vmp_absent);
+       assert(!m->vmp_error);
+       assert(m->vmp_dirty);
 
        /*
         *      Create a paging reference to let us play with the object.
         */
-       paging_offset = m->offset + object->paging_offset;
+       paging_offset = m->vmp_offset + object->paging_offset;
 
-       if (m->absent || m->error || m->restart || (!m->dirty && !m->precious)) {
+       if (m->vmp_absent || m->vmp_error || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
                panic("reservation without pageout?"); /* alan */
 
                VM_PAGE_FREE(m);
@@ -803,7 +617,7 @@ vm_pageout_initialize_page(
        }
 
        /*
-        * If there's no pager, then we can't clean the page.  This should 
+        * If there's no pager, then we can't clean the page.  This should
         * never happen since this should be a copy object and therefore not
         * an external object, so the pager should always be there.
         */
@@ -843,15 +657,6 @@ vm_pageout_initialize_page(
        vm_object_paging_end(object);
 }
 
-#if    MACH_CLUSTER_STATS
-#define MAXCLUSTERPAGES        16
-struct {
-       unsigned long pages_in_cluster;
-       unsigned long pages_at_higher_offsets;
-       unsigned long pages_at_lower_offsets;
-} cluster_stats[MAXCLUSTERPAGES];
-#endif /* MACH_CLUSTER_STATS */
-
 
 /*
  * vm_pageout_cluster:
@@ -867,13 +672,21 @@ struct {
  *
  * The page must not be on any pageout queue.
  */
+#if DEVELOPMENT || DEBUG
+vmct_stats_t vmct_stats;
+
 int32_t vmct_active = 0;
+uint64_t vm_compressor_epoch_start = 0;
+uint64_t vm_compressor_epoch_stop = 0;
+
 typedef enum vmct_state_t {
        VMCT_IDLE,
        VMCT_AWAKENED,
        VMCT_ACTIVE,
 } vmct_state_t;
 vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
+#endif
+
 
 void
 vm_pageout_cluster(vm_page_t m)
@@ -884,7 +697,7 @@ vm_pageout_cluster(vm_page_t m)
 
        XPR(XPR_VM_PAGEOUT,
                "vm_pageout_cluster, object 0x%X offset 0x%X page 0x%X\n",
-               object, m->offset, m, 0, 0);
+               object, m->vmp_offset, m, 0, 0);
 
        VM_PAGE_CHECK(m);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
@@ -893,9 +706,9 @@ vm_pageout_cluster(vm_page_t m)
        /*
         * Only a certain kind of page is appreciated here.
         */
-       assert((m->dirty || m->precious) && (!VM_PAGE_WIRED(m)));
-       assert(!m->cleaning && !m->laundry);
-       assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+       assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
+       assert(!m->vmp_cleaning && !m->vmp_laundry);
+       assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
 
        /*
         * protect the object from collapse or termination
@@ -905,20 +718,20 @@ vm_pageout_cluster(vm_page_t m)
        if (object->internal == TRUE) {
                assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
 
-               m->busy = TRUE;
+               m->vmp_busy = TRUE;
 
                q = &vm_pageout_queue_internal;
        } else
                q = &vm_pageout_queue_external;
 
-       /* 
+       /*
         * pgo_laundry count is tied to the laundry bit
         */
-       m->laundry = TRUE;
+       m->vmp_laundry = TRUE;
        q->pgo_laundry++;
 
-       m->vm_page_q_state = VM_PAGE_ON_PAGEOUT_Q;
-       vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, pageq);
+       m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
+       vm_page_queue_enter(&q->pgo_pending, m, vm_page_t, vmp_pageq);
 
        if (q->pgo_idle == TRUE) {
                q->pgo_idle = FALSE;
@@ -928,10 +741,8 @@ vm_pageout_cluster(vm_page_t m)
 }
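A sketch of a conforming call site, given purely as an illustration (the variable m and the surrounding checks are not from this file): the caller holds the page's VM object lock exclusive plus the page-queues lock, and the page is dirty or precious, unwired, unlaundered, and on no queue, exactly as the assertions above demand.

	/* Illustrative call sequence only. */
	vm_page_lockspin_queues();
	if ((m->vmp_dirty || m->vmp_precious) && !VM_PAGE_WIRED(m) &&
	    !m->vmp_cleaning && !m->vmp_laundry &&
	    m->vmp_q_state == VM_PAGE_NOT_ON_Q) {
		vm_pageout_cluster(m);
	}
	vm_page_unlock_queues();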
 
 
-unsigned long vm_pageout_throttle_up_count = 0;
-
 /*
- * A page is back from laundry or we are stealing it back from 
+ * A page is back from laundry or we are stealing it back from
  * the laundering state.  See if there are some pages waiting to
  * go to laundry and if we can let some of them go now.
  *
@@ -952,25 +763,25 @@ vm_pageout_throttle_up(
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
        vm_object_lock_assert_exclusive(m_object);
 
-       vm_pageout_throttle_up_count++;
-
        if (m_object->internal == TRUE)
                q = &vm_pageout_queue_internal;
        else
                q = &vm_pageout_queue_external;
 
-       if (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+       if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
 
-              vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, pageq);
-              m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+              vm_page_queue_remove(&q->pgo_pending, m, vm_page_t, vmp_pageq);
+              m->vmp_q_state = VM_PAGE_NOT_ON_Q;
 
               VM_PAGE_ZERO_PAGEQ_ENTRY(m);
 
               vm_object_activity_end(m_object);
+
+              VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
        }
-       if (m->laundry == TRUE) {
+       if (m->vmp_laundry == TRUE) {
 
-              m->laundry = FALSE;
+              m->vmp_laundry = FALSE;
               q->pgo_laundry--;
 
               if (q->pgo_throttled == TRUE) {
@@ -981,6 +792,7 @@ vm_pageout_throttle_up(
                       q->pgo_draining = FALSE;
                       thread_wakeup((event_t) (&q->pgo_laundry+1));
               }
+              VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
        }
 }
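VM_PAGEOUT_DEBUG() feeds the counters gathered in struct vm_pageout_debug, which is only instantiated on DEVELOPMENT || DEBUG kernels (see the declarations near the top of this file). The macro itself is defined in a header outside this hunk; a plausible shape, stated as an assumption rather than the actual definition:

	/* Assumed definition, for illustration only -- not shown in this diff. */
	#if DEVELOPMENT || DEBUG
	#define VM_PAGEOUT_DEBUG(member, value)		\
		MACRO_BEGIN				\
		vm_pageout_debug.member += (value);	\
		MACRO_END
	#else
	#define VM_PAGEOUT_DEBUG(member, value)
	#endif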
 
@@ -992,7 +804,7 @@ vm_pageout_throttle_up_batch(
 {
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
 
-       vm_pageout_throttle_up_count += batch_cnt;
+       VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
 
        q->pgo_laundry -= batch_cnt;
 
@@ -1025,20 +837,64 @@ vm_pageout_throttle_up_batch(
  * also returns the number of pages the system still needs to reclaim at this
  * moment in time.
  */
-#define VM_PAGEOUT_STAT_SIZE   31
+#if DEVELOPMENT || DEBUG
+#define VM_PAGEOUT_STAT_SIZE   (30 * 8) + 1
+#else
+#define VM_PAGEOUT_STAT_SIZE   (1 * 8) + 1
+#endif
 struct vm_pageout_stat {
-       unsigned int considered;
-       unsigned int reclaimed_clean;
+        unsigned long vm_page_active_count;
+        unsigned long vm_page_speculative_count;
+        unsigned long vm_page_inactive_count;
+        unsigned long vm_page_anonymous_count;
+
+        unsigned long vm_page_free_count;
+        unsigned long vm_page_wire_count;
+        unsigned long vm_page_compressor_count;
+
+        unsigned long vm_page_pages_compressed;
+        unsigned long vm_page_pageable_internal_count;
+        unsigned long vm_page_pageable_external_count;
+        unsigned long vm_page_xpmapped_external_count;
+
+        unsigned int pages_grabbed;
+        unsigned int pages_freed;
+
        unsigned int pages_compressed;
        unsigned int pages_grabbed_by_compressor;
+       unsigned int failed_compressions;
+
+        unsigned int pages_evicted;
+        unsigned int pages_purged;
+
+       unsigned int considered;
+        unsigned int considered_bq_internal;
+        unsigned int considered_bq_external;
+
+        unsigned int skipped_external;
+        unsigned int filecache_min_reactivations;
+
+       unsigned int freed_speculative;
+       unsigned int freed_cleaned;
+       unsigned int freed_internal;
+       unsigned int freed_external;
+
        unsigned int cleaned_dirty_external;
+        unsigned int cleaned_dirty_internal;
+
+        unsigned int inactive_referenced;
+        unsigned int inactive_nolock;
+        unsigned int reactivation_limit_exceeded;
+        unsigned int forced_inactive_reclaim;
+
        unsigned int throttled_internal_q;
        unsigned int throttled_external_q;
-       unsigned int failed_compressions;
-} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0,0,0,0,0,0,0}, };
+
+        unsigned int phantom_ghosts_found;
+        unsigned int phantom_ghosts_added;
+} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE] = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, };
 
 unsigned int vm_pageout_stat_now = 0;
-unsigned int vm_memory_pressure = 0;
 
 #define VM_PAGEOUT_STAT_BEFORE(i) \
        (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
@@ -1046,15 +902,14 @@ unsigned int vm_memory_pressure = 0;
        (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
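With stat slots now covering an eighth of a second each (consistent with the bucket-check interval expressed in eighths of a second just below and with units_of_monitor = 8 * nsecs_monitored in mach_vm_pressure_monitor() further down), the ring sizes work out to (30 * 8) + 1 = 241 slots, roughly thirty seconds of history, on DEVELOPMENT || DEBUG kernels, and (1 * 8) + 1 = 9 slots, about one second, otherwise; the extra slot is the one currently being filled at vm_pageout_stat_now.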
 
 #if VM_PAGE_BUCKETS_CHECK
-int vm_page_buckets_check_interval = 10; /* in seconds */
+int vm_page_buckets_check_interval = 80; /* in eighths of a second */
 #endif /* VM_PAGE_BUCKETS_CHECK */
 
-/*
- * Called from compute_averages().
- */
+
 void
-compute_memory_pressure(
-       __unused void *arg)
+record_memory_pressure(void);
+void
+record_memory_pressure(void)
 {
        unsigned int vm_pageout_next;
 
@@ -1066,21 +921,18 @@ compute_memory_pressure(
        }
 #endif /* VM_PAGE_BUCKETS_CHECK */
 
-       vm_memory_pressure =
-               vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].reclaimed_clean;
+       vm_pageout_state.vm_memory_pressure =
+         vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
+         vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
+         vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
+         vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
 
-       commpage_set_memory_pressure( vm_memory_pressure );
+       commpage_set_memory_pressure( (unsigned int)vm_pageout_state.vm_memory_pressure );
 
        /* move "now" forward */
        vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
-       vm_pageout_stats[vm_pageout_next].considered = 0;
-       vm_pageout_stats[vm_pageout_next].reclaimed_clean = 0;
-       vm_pageout_stats[vm_pageout_next].throttled_internal_q = 0;
-       vm_pageout_stats[vm_pageout_next].throttled_external_q = 0;
-       vm_pageout_stats[vm_pageout_next].cleaned_dirty_external = 0;
-       vm_pageout_stats[vm_pageout_next].pages_compressed = 0;
-       vm_pageout_stats[vm_pageout_next].pages_grabbed_by_compressor = 0;
-       vm_pageout_stats[vm_pageout_next].failed_compressions = 0;
+
+       bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
 
        vm_pageout_stat_now = vm_pageout_next;
 }
@@ -1089,8 +941,8 @@ compute_memory_pressure(
 /*
  * IMPORTANT
  * mach_vm_ctl_page_free_wanted() is called indirectly, via
- * mach_vm_pressure_monitor(), when taking a stackshot. Therefore, 
- * it must be safe in the restricted stackshot context. Locks and/or 
+ * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
+ * it must be safe in the restricted stackshot context. Locks and/or
  * blocking are not allowable.
  */
 unsigned int
@@ -1112,7 +964,7 @@ mach_vm_ctl_page_free_wanted(void)
 
 /*
  * IMPORTANT:
- * mach_vm_pressure_monitor() is called when taking a stackshot, with 
+ * mach_vm_pressure_monitor() is called when taking a stackshot, with
  * wait_for_pressure FALSE, so that code path must remain safe in the
  * restricted stackshot context. No blocking or locks are allowable.
  * on that code path.
@@ -1128,7 +980,9 @@ mach_vm_pressure_monitor(
        wait_result_t   wr;
        unsigned int    vm_pageout_then, vm_pageout_now;
        unsigned int    pages_reclaimed;
+       unsigned int    units_of_monitor;
 
+       units_of_monitor = 8 * nsecs_monitored;
        /*
         * We don't take the vm_page_queue_lock here because we don't want
         * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
@@ -1174,10 +1028,13 @@ mach_vm_pressure_monitor(
        for (vm_pageout_then =
                     VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
             vm_pageout_then != vm_pageout_now &&
-                    nsecs_monitored-- != 0;
+                    units_of_monitor-- != 0;
             vm_pageout_then =
                     VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
-               pages_reclaimed += vm_pageout_stats[vm_pageout_then].reclaimed_clean;
+               pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
+               pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
+               pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
+               pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
        }
        *pages_reclaimed_p = pages_reclaimed;
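Because each stats slot now covers 1/8 s, the loop above first converts the caller's monitoring window into slot units via units_of_monitor = 8 * nsecs_monitored; for example a value of 3 covers up to 24 of the 1/8-second slots, and pages_reclaimed accumulates freed_speculative + freed_cleaned + freed_internal + freed_external over at most that many completed slots.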
 
@@ -1250,7 +1107,7 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
                 */
                if (m_object != l_object) {
                        /*
-                        * the object associated with candidate page is 
+                        * the object associated with candidate page is
                         * different from the one we were just working
                         * with... dump the lock if we still own it
                         */
@@ -1266,7 +1123,7 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
                         * page queues lock, we can only 'try' for this one.
                         * if the 'try' fails, we need to do a mutex_pause
                         * to allow the owner of the object lock a chance to
-                        * run... 
+                        * run...
                         */
                        if ( !vm_object_lock_try_scan(m_object)) {
 
@@ -1287,21 +1144,21 @@ vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
 
                        l_object = m_object;
                }
-               if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
+               if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
                        /*
                         * put it back on the head of its queue
                         */
                        goto reenter_pg_on_q;
                }
-               if (m->pmapped == TRUE) {
+               if (m->vmp_pmapped == TRUE) {
 
                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
 
                        disconnected_count++;
                }
 reenter_pg_on_q:
-               vm_page_queue_remove(q, m, vm_page_t, pageq);
-               vm_page_queue_enter(q, m, vm_page_t, pageq);
+               vm_page_queue_remove(q, m, vm_page_t, vmp_pageq);
+               vm_page_queue_enter(q, m, vm_page_t, vmp_pageq);
 
                qcount--;
                try_failed_count = 0;
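Most of the mechanical churn in this hunk, and in the rest of the file below, comes from the rename of struct vm_page members to a vmp_ prefix; a few of the mappings visible here:

        m->cleaning  ->  m->vmp_cleaning
        m->busy      ->  m->vmp_busy
        m->pmapped   ->  m->vmp_pmapped
        pageq        ->  vmp_pageq   (the queue-link field passed to vm_page_queue_remove/enter)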
@@ -1383,7 +1240,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
 
 
        iq = &vm_pageout_queue_internal;
-       
+
        vm_page_lock_queues();
 
        while (qcount && !vm_page_queue_empty(q)) {
@@ -1397,12 +1254,12 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                                l_object = NULL;
                        }
                        iq->pgo_draining = TRUE;
-                                       
+
                        assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
                        vm_page_unlock_queues();
-                                       
+
                        thread_block(THREAD_CONTINUE_NULL);
-                       
+
                        vm_page_lock_queues();
                        delayed_unlock = 0;
                        continue;
@@ -1416,11 +1273,11 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                 * already got the lock
                 */
                if (m_object != l_object) {
-                       if ( !m_object->internal) 
+                       if ( !m_object->internal)
                                goto reenter_pg_on_q;
 
                        /*
-                        * the object associated with candidate page is 
+                        * the object associated with candidate page is
                         * different from the one we were just working
                         * with... dump the lock if we still own it
                         */
@@ -1436,7 +1293,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                         * page queues lock, we can only 'try' for this one.
                         * if the 'try' fails, we need to do a mutex_pause
                         * to allow the owner of the object lock a chance to
-                        * run... 
+                        * run...
                         */
                        if ( !vm_object_lock_try_scan(m_object)) {
 
@@ -1453,7 +1310,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                        }
                        l_object = m_object;
                }
-               if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error || m->free_when_done) {
+               if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error || m->vmp_free_when_done) {
                        /*
                         * page is not to be cleaned
                         * put it back on the head of its queue
@@ -1462,22 +1319,22 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                }
                phys_page = VM_PAGE_GET_PHYS_PAGE(m);
 
-               if (m->reference == FALSE && m->pmapped == TRUE) {
+               if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
                        refmod_state = pmap_get_refmod(phys_page);
-                 
+
                        if (refmod_state & VM_MEM_REFERENCED)
-                               m->reference = TRUE;
+                               m->vmp_reference = TRUE;
                        if (refmod_state & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }
-               if (m->reference == TRUE) {
-                       m->reference = FALSE;
+               if (m->vmp_reference == TRUE) {
+                       m->vmp_reference = FALSE;
                        pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
                        goto reenter_pg_on_q;
                }
-               if (m->pmapped == TRUE) {
-                       if (m->dirty || m->precious) {
+               if (m->vmp_pmapped == TRUE) {
+                       if (m->vmp_dirty || m->vmp_precious) {
                                pmap_options = PMAP_OPTIONS_COMPRESSOR;
                        } else {
                                pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
@@ -1487,7 +1344,8 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }
-               if ( !m->dirty && !m->precious) {
+
+               if ( !m->vmp_dirty && !m->vmp_precious) {
                        vm_page_unlock_queues();
                        VM_PAGE_FREE(m);
                        vm_page_lock_queues();
@@ -1496,7 +1354,7 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                        goto next_pg;
                }
                if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL)  {
-                       
+
                        if (!m_object->pager_initialized) {
 
                                vm_page_unlock_queues();
@@ -1531,8 +1389,8 @@ vm_pageout_page_queue(vm_page_queue_head_t *q, int qcount)
                goto next_pg;
 
 reenter_pg_on_q:
-               vm_page_queue_remove(q, m, vm_page_t, pageq);
-               vm_page_queue_enter(q, m, vm_page_t, pageq);
+               vm_page_queue_remove(q, m, vm_page_t, vmp_pageq);
+               vm_page_queue_enter(q, m, vm_page_t, vmp_pageq);
 next_pg:
                qcount--;
                try_failed_count = 0;
@@ -1572,11 +1430,11 @@ extern void vm_pageout_io_throttle(void);
          * "partially re-used", which could be expensive.               \
          */                                                             \
        assert(VM_PAGE_OBJECT((m)) == (obj));                           \
-        if ((m)->reusable ||                                            \
+        if ((m)->vmp_reusable ||                                        \
             (obj)->all_reusable) {                                     \
                vm_object_reuse_pages((obj),                            \
-                                      (m)->offset,                      \
-                                      (m)->offset + PAGE_SIZE_64,       \
+                                      (m)->vmp_offset,                  \
+                                      (m)->vmp_offset + PAGE_SIZE_64,   \
                                       FALSE);                           \
         }                                                               \
         MACRO_END
@@ -1594,27 +1452,19 @@ struct flow_control {
         mach_timespec_t        ts;
 };
 
+
 #if CONFIG_BACKGROUND_QUEUE
-uint64_t vm_pageout_skipped_bq_internal = 0;
-uint64_t vm_pageout_considered_bq_internal = 0;
-uint64_t vm_pageout_considered_bq_external = 0;
 uint64_t vm_pageout_rejected_bq_internal = 0;
 uint64_t vm_pageout_rejected_bq_external = 0;
+uint64_t vm_pageout_skipped_bq_internal = 0;
 #endif
 
-uint32_t vm_pageout_no_victim = 0;
-uint32_t vm_pageout_considered_page = 0;
-uint32_t vm_page_filecache_min = 0;
-
 #define ANONS_GRABBED_LIMIT    2
 
-#if CONFIG_SECLUDED_MEMORY
-extern vm_page_t vm_page_grab_secluded(void);
-uint64_t vm_pageout_secluded_burst_count = 0;
-#endif /* CONFIG_SECLUDED_MEMORY */
-
 
+#if 0
 static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
+#endif
 static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
 
 #define        VM_PAGEOUT_PB_NO_ACTION                         0
@@ -1622,20 +1472,21 @@ static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *
 #define        VM_PAGEOUT_PB_THREAD_YIELD                      2
 
 
+#if 0
 static void
 vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
 {
        if (*local_freeq) {
                vm_page_unlock_queues();
 
-               VM_DEBUG_EVENT(
+               VM_DEBUG_CONSTANT_EVENT(
                        vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
-                       vm_page_free_count, *local_freed, 0, 1);
+                       vm_page_free_count, 0, 0, 1);
 
                vm_page_free_list(*local_freeq, TRUE);
 
-               VM_DEBUG_EVENT(vm_pageout_freelist,VM_PAGEOUT_FREELIST, DBG_FUNC_END,
-                              vm_page_free_count, 0, 0, 1);
+               VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist,VM_PAGEOUT_FREELIST, DBG_FUNC_END,
+                              vm_page_free_count, *local_freed, 0, 1);
 
                *local_freeq = NULL;
                *local_freed = 0;
@@ -1646,6 +1497,7 @@ vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *loca
        }
        *delayed_unlock = 1;
 }
+#endif
 
 
 static void
@@ -1658,17 +1510,9 @@ vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
                vm_object_unlock(*object);
                *object = NULL;
        }
-       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
-
        if (*local_freeq) {
 
-               VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
-                              vm_page_free_count, *local_freed, 0, 2);
-
                vm_page_free_list(*local_freeq, TRUE);
-                                       
-               VM_DEBUG_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
-                              vm_page_free_count, 0, 0, 2);
 
                *local_freeq = NULL;
                *local_freed = 0;
@@ -1691,130 +1535,259 @@ vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
 }
 
 
-int    last_vm_pageout_freed_from_inactive_clean = 0;
-int    last_vm_pageout_freed_from_cleaned = 0;
-int    last_vm_pageout_freed_from_speculative = 0;
-int    last_vm_pageout_freed_after_compression = 0;
-int    last_vm_pageout_enqueued_cleaned_from_inactive_dirty = 0;
-int    last_vm_pageout_inactive_force_reclaim = 0;
-int    last_vm_pageout_scan_inactive_throttled_external = 0;
-int    last_vm_pageout_scan_inactive_throttled_internal = 0;
-int    last_vm_pageout_reactivation_limit_exceeded = 0;
-int    last_vm_pageout_considered_page = 0;
-int    last_vm_compressor_pages_grabbed = 0;
-int    last_vm_compressor_failed = 0;
-int     last_vm_pageout_skipped_external = 0;
+static struct vm_pageout_vminfo last;
+
+uint64_t last_vm_page_pages_grabbed = 0;
+
+extern  uint32_t c_segment_pages_compressed;
 
+extern uint64_t shared_region_pager_reclaimed;
+extern struct memory_object_pager_ops shared_region_pager_ops;
 
 void update_vm_info(void)
 {
-        int    tmp1, tmp2, tmp3, tmp4;
+        uint64_t tmp;
 
-       if (!kdebug_enable)
-               return;
-       
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
-                             vm_page_active_count,
-                             vm_page_speculative_count,
-                             vm_page_inactive_count,
-                             vm_page_anonymous_count,
-                             0);
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
-                             vm_page_free_count,
-                             vm_page_wire_count,
-                             VM_PAGE_COMPRESSOR_COUNT,
-                             0, 0);
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
-                             c_segment_pages_compressed, 
-                             vm_page_internal_count,
-                             vm_page_external_count,
-                             vm_page_xpmapped_external_count,
-                             0);
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
+       vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
+
+
+       tmp = vm_pageout_vminfo.vm_pageout_considered_page;
+       vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
+       last.vm_pageout_considered_page = tmp;
+
+       tmp = vm_pageout_vminfo.vm_pageout_compressions;
+       vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp - last.vm_pageout_compressions);
+       last.vm_pageout_compressions = tmp;
+
+       tmp = vm_pageout_vminfo.vm_compressor_failed;
+       vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
+       last.vm_compressor_failed = tmp;
+
+       tmp = vm_pageout_vminfo.vm_compressor_pages_grabbed;
+       vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp - last.vm_compressor_pages_grabbed);
+       last.vm_compressor_pages_grabbed = tmp;
+
+       tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
+       vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
+       last.vm_phantom_cache_found_ghost = tmp;
+
+       tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
+       vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
+       last.vm_phantom_cache_added_ghost = tmp;
+
+       tmp = get_pages_grabbed_count();
+       vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp - last_vm_page_pages_grabbed);
+       last_vm_page_pages_grabbed = tmp;
+
+       tmp = vm_pageout_vminfo.vm_page_pages_freed;
+       vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
+       last.vm_page_pages_freed = tmp;
+
+
+       if (vm_pageout_stats[vm_pageout_stat_now].considered) {
 
+               tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
+               vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
+               last.vm_pageout_pages_evicted = tmp;
 
-       if ((vm_pageout_considered_page - last_vm_pageout_considered_page) == 0 &&
-           (vm_pageout_enqueued_cleaned_from_inactive_dirty - last_vm_pageout_enqueued_cleaned_from_inactive_dirty == 0) &&
-           (vm_pageout_freed_after_compression - last_vm_pageout_freed_after_compression == 0))
-               return;
+               tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
+               vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
+               last.vm_pageout_pages_purged = tmp;
 
+               tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
+               vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
+               last.vm_pageout_freed_speculative = tmp;
 
-       tmp1 = vm_pageout_considered_page;
-       tmp2 = vm_pageout_freed_from_speculative;
-       tmp3 = vm_pageout_freed_from_inactive_clean;
+               tmp = vm_pageout_vminfo.vm_pageout_freed_external;
+               vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
+               last.vm_pageout_freed_external = tmp;
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
-                             tmp1 - last_vm_pageout_considered_page,
-                             tmp2 - last_vm_pageout_freed_from_speculative,
-                             tmp3 - last_vm_pageout_freed_from_inactive_clean,
-                             0, 0);
-       
-       last_vm_pageout_considered_page = tmp1;
-       last_vm_pageout_freed_from_speculative = tmp2;
-       last_vm_pageout_freed_from_inactive_clean = tmp3;
+               tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
+               vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
+               last.vm_pageout_inactive_referenced = tmp;
 
+               tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
+               vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
+               last.vm_pageout_scan_inactive_throttled_external = tmp;
 
-       tmp1 = vm_pageout_scan_inactive_throttled_external;
-       tmp2 = vm_pageout_enqueued_cleaned_from_inactive_dirty;
-       tmp3 = vm_pageout_freed_from_cleaned;
+               tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
+               vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
+               last.vm_pageout_inactive_dirty_external = tmp;
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
-                             tmp1 - last_vm_pageout_scan_inactive_throttled_external,
-                             tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty,
-                             tmp3 - last_vm_pageout_freed_from_cleaned,
-                             0, 0);
+               tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
+               vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
+               last.vm_pageout_freed_cleaned = tmp;
 
-       vm_pageout_stats[vm_pageout_stat_now].throttled_external_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_external);
-       vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external += (tmp2 - last_vm_pageout_enqueued_cleaned_from_inactive_dirty);
+               tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
+               vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
+               last.vm_pageout_inactive_nolock = tmp;
 
-       last_vm_pageout_scan_inactive_throttled_external = tmp1;
-       last_vm_pageout_enqueued_cleaned_from_inactive_dirty = tmp2;
-       last_vm_pageout_freed_from_cleaned = tmp3;
+               tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
+               vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
+               last.vm_pageout_scan_inactive_throttled_internal = tmp;
 
+               tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
+               vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
+               last.vm_pageout_skipped_external = tmp;
 
-       tmp1 = vm_pageout_scan_inactive_throttled_internal;
-       tmp2 = vm_pageout_freed_after_compression;
-       tmp3 = vm_compressor_pages_grabbed;
-       tmp4 = vm_pageout_skipped_external;
+               tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
+               vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
+               last.vm_pageout_reactivation_limit_exceeded = tmp;
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
-                             tmp1 - last_vm_pageout_scan_inactive_throttled_internal,
-                             tmp2 - last_vm_pageout_freed_after_compression,
-                             tmp3 - last_vm_compressor_pages_grabbed,
-                             tmp4 - last_vm_pageout_skipped_external,
+               tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
+               vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
+               last.vm_pageout_inactive_force_reclaim = tmp;
+
+               tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
+               vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
+               last.vm_pageout_freed_internal = tmp;
+
+               tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
+               vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
+               last.vm_pageout_considered_bq_internal = tmp;
+
+               tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
+               vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
+               last.vm_pageout_considered_bq_external = tmp;
+
+               tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
+               vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
+               last.vm_pageout_filecache_min_reactivated = tmp;
+
+               tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
+               vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
+               last.vm_pageout_inactive_dirty_internal = tmp;
+       }
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
+                             0);
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
+                             0,
+                             0);
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
+                             vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
+                             0);
+
+       if (vm_pageout_stats[vm_pageout_stat_now].considered ||
+           vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
+           vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
+
+               KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
+                                     vm_pageout_stats[vm_pageout_stat_now].considered,
+                                     vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
+                                     vm_pageout_stats[vm_pageout_stat_now].freed_external,
+                                     vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
+                                     0);
+
+               KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
+                                     vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
+                                     vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
+                                     vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
+                                     vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
+                                     0);
+
+               KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
+                                     vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
+                                     vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
+                                     vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
+                                     vm_pageout_stats[vm_pageout_stat_now].skipped_external,
+                                     0);
+
+               KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
+                                     vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
+                                     vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
+                                     vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
+                                     vm_pageout_stats[vm_pageout_stat_now].freed_internal,
+                                     0);
+
+               KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
+                                     vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
+                                     vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
+                                     vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
+                                     vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
+                                     0);
+
+       }
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
+                             vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
+                             vm_pageout_stats[vm_pageout_stat_now].pages_freed,
+                             vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
+                             vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
                              0);
-                             
-       vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q += (tmp1 - last_vm_pageout_scan_inactive_throttled_internal);
-       vm_pageout_stats[vm_pageout_stat_now].pages_compressed += (tmp2 - last_vm_pageout_freed_after_compression);
-       vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor += (tmp3 - last_vm_compressor_pages_grabbed);
 
-       last_vm_pageout_scan_inactive_throttled_internal = tmp1;
-       last_vm_pageout_freed_after_compression = tmp2;
-       last_vm_compressor_pages_grabbed = tmp3;
-       last_vm_pageout_skipped_external = tmp4;
+       record_memory_pressure();
+}
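Each counter handled in update_vm_info() follows the same snapshot/delta pattern: read the cumulative value, subtract the snapshot taken at the previous tick, store the difference into the current stats slot, and advance the snapshot. A condensed sketch of that pattern (the SNAP_DELTA helper is hypothetical and only illustrates the repeated code above):

        /* hypothetical helper, not part of this source: turns one cumulative
         * counter from vm_pageout_vminfo into one per-tick delta in the
         * current vm_pageout_stats slot, updating the 'last' snapshot */
        #define SNAP_DELTA(stat_field, info_field)                              \
        MACRO_BEGIN                                                             \
                uint64_t _tmp = vm_pageout_vminfo.info_field;                   \
                vm_pageout_stats[vm_pageout_stat_now].stat_field =              \
                        (unsigned int)(_tmp - last.info_field);                 \
                last.info_field = _tmp;                                         \
        MACRO_END

        /* the first few deltas above are then equivalent to: */
        SNAP_DELTA(considered,           vm_pageout_considered_page);
        SNAP_DELTA(pages_compressed,     vm_pageout_compressions);
        SNAP_DELTA(failed_compressions,  vm_compressor_failed);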
+
+
+void
+vm_page_balance_inactive(int max_to_move)
+{
+        vm_page_t m;
+
+       LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
+
+       vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
+                                                         vm_page_inactive_count +
+                                                         vm_page_speculative_count);
 
+       while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
 
-       if ((vm_pageout_reactivation_limit_exceeded - last_vm_pageout_reactivation_limit_exceeded) == 0 &&
-           (vm_pageout_inactive_force_reclaim - last_vm_pageout_inactive_force_reclaim) == 0 &&
-           (vm_compressor_failed - last_vm_compressor_failed) == 0)
-               return;
+               VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
 
-       tmp1 = vm_pageout_reactivation_limit_exceeded;
-       tmp2 = vm_pageout_inactive_force_reclaim;
-       tmp3 = vm_compressor_failed;
+               m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
+
+               assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
+               assert(!m->vmp_laundry);
+               assert(VM_PAGE_OBJECT(m) != kernel_object);
+               assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
-                             tmp1 - last_vm_pageout_reactivation_limit_exceeded,
-                             tmp2 - last_vm_pageout_inactive_force_reclaim,
-                             tmp3 - last_vm_compressor_failed,
-                             0, 0);
+               DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
 
-       vm_pageout_stats[vm_pageout_stat_now].failed_compressions += (tmp3 - last_vm_compressor_failed);
+               /*
+                * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
+                *
+                * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
+                * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
+                * new reference happens. If no further references happen on the page after that remote TLB flushes
+                * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
+                * by pageout_scan, which is just fine since the last reference would have happened quite far
+                * in the past (TLB caches don't hang around for very long), and of course could just as easily
+                * have happened before we moved the page
+                */
+               if (m->vmp_pmapped == TRUE)
+                       pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
 
-       last_vm_pageout_reactivation_limit_exceeded = tmp1;
-       last_vm_pageout_inactive_force_reclaim = tmp2;
-       last_vm_compressor_failed = tmp3;
+               /*
+                * The page might be absent or busy,
+                * but vm_page_deactivate can handle that.
+                * FALSE indicates that we don't want a H/W clear reference
+                */
+               vm_page_deactivate_internal(m, FALSE);
+       }
 }
 
 
@@ -1828,7 +1801,6 @@ vm_pageout_scan(void)
 {
        unsigned int loop_count = 0;
        unsigned int inactive_burst_count = 0;
-       unsigned int active_burst_count = 0;
        unsigned int reactivated_this_call;
        unsigned int reactivate_limit;
        vm_page_t   local_freeq = NULL;
@@ -1842,7 +1814,6 @@ vm_pageout_scan(void)
         struct vm_speculative_age_q *sq;
        struct  flow_control    flow_control = { 0, { 0, 0 } };
         boolean_t inactive_throttled = FALSE;
-       boolean_t try_failed;
        mach_timespec_t ts;
        unsigned        int msecs = 0;
        vm_object_t     object = NULL;
@@ -1858,7 +1829,9 @@ vm_pageout_scan(void)
 #endif
        int             cache_evict_throttle = 0;
        uint32_t        vm_pageout_inactive_external_forced_reactivate_limit = 0;
+       uint32_t        inactive_external_count;
        int             force_purge = 0;
+       int             divisor;
 #define        DELAY_SPECULATIVE_AGE   1000
        int             delay_speculative_age = 0;
        vm_object_t     m_object = VM_OBJECT_NULL;
@@ -1868,8 +1841,10 @@ vm_pageout_scan(void)
 #endif /* VM_PRESSURE_EVENTS */
 
        VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
-                      vm_pageout_speculative_clean, vm_pageout_inactive_clean,
-                      vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
+                               vm_pageout_vminfo.vm_pageout_freed_speculative,
+                               vm_pageout_state.vm_pageout_inactive_clean,
+                               vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
+                               vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
 
        flow_control.state = FCS_IDLE;
        iq = &vm_pageout_queue_internal;
@@ -1880,9 +1855,12 @@ vm_pageout_scan(void)
         XPR(XPR_VM_PAGEOUT, "vm_pageout_scan\n", 0, 0, 0, 0, 0);
 
        /* Ask the pmap layer to return any pages it no longer needs. */
-       pmap_release_pages_fast();
+       uint64_t pmap_wired_pages_freed = pmap_release_pages_fast();
 
        vm_page_lock_queues();
+
+       vm_page_wire_count -= pmap_wired_pages_freed;
+
        delayed_unlock = 1;
 
        /*
@@ -1897,12 +1875,6 @@ vm_pageout_scan(void)
        vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
 
        /*
-        *      We want to gradually dribble pages from the active queue
-        *      to the inactive queue.  If we let the inactive queue get
-        *      very small, and then suddenly dump many pages into it,
-        *      those pages won't get a sufficient chance to be referenced
-        *      before we start taking them from the inactive queue.
-        *
         *      We must limit the rate at which we send pages to the pagers
         *      so that we don't tie up too many pages in the I/O queues.
         *      We implement a throttling mechanism using the laundry count
@@ -1913,37 +1885,20 @@ vm_pageout_scan(void)
         *      stalled waiting for memory, which only we can provide.
         */
 
-
 Restart:
 
        assert(object == NULL);
        assert(delayed_unlock != 0);
-       
-       /*
-        *      Recalculate vm_page_inactivate_target.
-        */
-       vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
-                                                         vm_page_inactive_count +
-                                                         vm_page_speculative_count);
 
        vm_page_anonymous_min = vm_page_inactive_target / 20;
 
+       if (vm_pageout_state.vm_page_speculative_percentage > 50)
+               vm_pageout_state.vm_page_speculative_percentage = 50;
+       else if (vm_pageout_state.vm_page_speculative_percentage <= 0)
+               vm_pageout_state.vm_page_speculative_percentage = 1;
 
-       /*
-        * don't want to wake the pageout_scan thread up everytime we fall below
-        * the targets... set a low water mark at 0.25% below the target
-        */
-       vm_page_inactive_min = vm_page_inactive_target - (vm_page_inactive_target / 400);
-
-       if (vm_page_speculative_percentage > 50)
-               vm_page_speculative_percentage = 50;
-       else if (vm_page_speculative_percentage <= 0)
-               vm_page_speculative_percentage = 1;
-
-       vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
-                                                               vm_page_inactive_count);
-
-       try_failed = FALSE;
+       vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
+                                                                                vm_page_inactive_count);
 
        for (;;) {
                vm_page_t m;
@@ -1963,53 +1918,33 @@ Restart:
                 * Deal with secluded_q overflow.
                 */
                if (vm_page_secluded_count > vm_page_secluded_target) {
-                       unsigned int secluded_overflow;
                        vm_page_t secluded_page;
 
-                       if (object != NULL) {
-                               vm_object_unlock(object);
-                               object = NULL;
-                               vm_pageout_scan_wants_object = VM_OBJECT_NULL;
-                       }
                        /*
                         * SECLUDED_AGING_BEFORE_ACTIVE:
                         * Excess secluded pages go to the active queue and
                         * will later go to the inactive queue.
                         */
-                       active_burst_count = MIN(vm_pageout_burst_active_throttle,
-                                                vm_page_secluded_count_inuse);
-                       secluded_overflow = (vm_page_secluded_count -
-                                            vm_page_secluded_target);
-                       while (secluded_overflow-- > 0 &&
-                              vm_page_secluded_count > vm_page_secluded_target) {
-                               assert((vm_page_secluded_count_free +
-                                       vm_page_secluded_count_inuse) ==
-                                      vm_page_secluded_count);
-                               secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
-                               assert(secluded_page->vm_page_q_state ==
-                                      VM_PAGE_ON_SECLUDED_Q);
-                               vm_page_queues_remove(secluded_page, FALSE);
-                               assert(!secluded_page->fictitious);
-                               assert(!VM_PAGE_WIRED(secluded_page));
-                               if (secluded_page->vm_page_object == 0) {
-                                       /* transfer to free queue */
-                                       assert(secluded_page->busy);
-                                       secluded_page->snext = local_freeq;
-                                       local_freeq = secluded_page;
-                                       local_freed++;
-                               } else {
-                                       /* transfer to head of active queue */
-                                       vm_page_enqueue_active(secluded_page, FALSE);
-                                       if (active_burst_count-- == 0) {
-                                               vm_pageout_secluded_burst_count++;
-                                               break;
-                                       }
-                               }
+                       assert((vm_page_secluded_count_free +
+                               vm_page_secluded_count_inuse) ==
+                               vm_page_secluded_count);
+                       secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
+                       assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
+
+                       vm_page_queues_remove(secluded_page, FALSE);
+                       assert(!secluded_page->vmp_fictitious);
+                       assert(!VM_PAGE_WIRED(secluded_page));
+
+                       if (secluded_page->vmp_object == 0) {
+                               /* transfer to free queue */
+                               assert(secluded_page->vmp_busy);
+                               secluded_page->vmp_snext = local_freeq;
+                               local_freeq = secluded_page;
+                               local_freed++;
+                       } else {
+                               /* transfer to head of active queue */
+                               vm_page_enqueue_active(secluded_page, FALSE);
                                secluded_page = VM_PAGE_NULL;
-
-                               if (delayed_unlock++ > delayed_unlock_limit) {
-                                       vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq);
-                               }
                        }
                }
 #endif /* CONFIG_SECLUDED_MEMORY */
@@ -2017,72 +1952,10 @@ Restart:
                assert(delayed_unlock);
 
                /*
-                * Move pages from active to inactive if we're below the target
-                */
-               if ((vm_page_inactive_count + vm_page_speculative_count) >= vm_page_inactive_target)
-                       goto done_moving_active_pages;
-
-               if (object != NULL) {
-                       vm_object_unlock(object);
-                       object = NULL;
-                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
-               }
-               /*
-                * Don't sweep through active queue more than the throttle
-                * which should be kept relatively low
+                * maintain our balance
                 */
-               active_burst_count = MIN(vm_pageout_burst_active_throttle, vm_page_active_count);
-
-               VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_START,
-                              vm_pageout_inactive, vm_pageout_inactive_used, vm_page_free_count, local_freed);
-
-               VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_NONE,
-                              vm_pageout_speculative_clean, vm_pageout_inactive_clean,
-                              vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
-               memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_START);
-
-
-               while (!vm_page_queue_empty(&vm_page_queue_active) && active_burst_count--) {
-
-                       vm_pageout_active++;
+               vm_page_balance_inactive(1);
 
-                       m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
-
-                       assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
-                       assert(!m->laundry);
-                       assert(VM_PAGE_OBJECT(m) != kernel_object);
-                       assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
-
-                       DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
-
-                       /*
-                        * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
-                        *
-                        * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
-                        * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
-                        * new reference happens. If no futher references happen on the page after that remote TLB flushes
-                        * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
-                        * by pageout_scan, which is just fine since the last reference would have happened quite far
-                        * in the past (TLB caches don't hang around for very long), and of course could just as easily
-                        * have happened before we moved the page
-                        */
-                       pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
-
-                       /*
-                        * The page might be absent or busy,
-                        * but vm_page_deactivate can handle that.
-                        * FALSE indicates that we don't want a H/W clear reference
-                        */
-                       vm_page_deactivate_internal(m, FALSE);
-
-                       if (delayed_unlock++ > delayed_unlock_limit) {
-                               vm_pageout_delayed_unlock(&delayed_unlock, &local_freed, &local_freeq);
-                       }
-               }
-
-               VM_DEBUG_EVENT(vm_pageout_balance, VM_PAGEOUT_BALANCE, DBG_FUNC_END,
-                              vm_page_active_count, vm_page_inactive_count, vm_page_speculative_count, vm_page_inactive_target);
-               memoryshot(VM_PAGEOUT_BALANCE, DBG_FUNC_END);
 
                /**********************************************************************
                 * above this point we're playing with the active and secluded queues
@@ -2090,15 +1963,15 @@ Restart:
                 * and the inactive queue
                 **********************************************************************/
 
-done_moving_active_pages:
-
                if (vm_page_free_count + local_freed >= vm_page_free_target)
                {
+                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+
                        vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
                                                    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
                        /*
                         * make sure the pageout I/O threads are running
-                        * throttled in case there are still requests 
+                        * throttled in case there are still requests
                         * in the laundry... since we have met our targets
                         * we don't need the laundry to be cleaned in a timely
                         * fashion... so let's avoid interfering with foreground
@@ -2106,22 +1979,6 @@ done_moving_active_pages:
                         */
                        vm_pageout_adjust_eq_iothrottle(eq, TRUE);
 
-                       /*
-                        * recalculate vm_page_inactivate_target
-                        */
-                       vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
-                                                                         vm_page_inactive_count +
-                                                                         vm_page_speculative_count);
-#ifndef        CONFIG_EMBEDDED
-                       if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) &&
-                           !vm_page_queue_empty(&vm_page_queue_active)) {
-                               /*
-                                * inactive target still not met... keep going
-                                * until we get the queues balanced...
-                                */
-                               continue;
-                       }
-#endif
                        lck_mtx_lock(&vm_page_queue_free_lock);
 
                        if ((vm_page_free_count >= vm_page_free_target) &&
@@ -2134,24 +1991,27 @@ return_from_scan:
                                assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
 
                                VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
-                                              vm_pageout_inactive, vm_pageout_inactive_used, 0, 0);
+                                                       vm_pageout_state.vm_pageout_inactive,
+                                                       vm_pageout_state.vm_pageout_inactive_used, 0, 0);
                                VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
-                                              vm_pageout_speculative_clean, vm_pageout_inactive_clean,
-                                              vm_pageout_inactive_dirty_internal, vm_pageout_inactive_dirty_external);
+                                                       vm_pageout_vminfo.vm_pageout_freed_speculative,
+                                                       vm_pageout_state.vm_pageout_inactive_clean,
+                                                       vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
+                                                       vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
 
                                return;
                        }
                        lck_mtx_unlock(&vm_page_queue_free_lock);
                }
-               
+
                /*
-                * Before anything, we check if we have any ripe volatile 
+                * Before anything, we check if we have any ripe volatile
                 * objects around. If so, try to purge the first object.
                 * If the purge fails, fall through to reclaim a page instead.
                 * If the purge succeeds, go back to the top and reevaluate
                 * the new memory situation.
                 */
-               
+
                assert (available_for_purge>=0);
                force_purge = 0; /* no force-purging */
 
@@ -2161,11 +2021,11 @@ return_from_scan:
                if (pressure_level > kVMPressureNormal) {
 
                        if (pressure_level >= kVMPressureCritical) {
-                               force_purge = memorystatus_purge_on_critical;
+                               force_purge = vm_pageout_state.memorystatus_purge_on_critical;
                        } else if (pressure_level >= kVMPressureUrgent) {
-                               force_purge = memorystatus_purge_on_urgent;
+                               force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
                        } else if (pressure_level >= kVMPressureWarning) {
-                               force_purge = memorystatus_purge_on_warning;
+                               force_purge = vm_pageout_state.memorystatus_purge_on_warning;
                        }
                }
 #endif /* VM_PRESSURE_EVENTS */
@@ -2181,7 +2041,7 @@ return_from_scan:
 
                        VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
                        if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
-                               vm_pageout_purged_objects++;
+                               VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
                                VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
                                memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
                                continue;
@@ -2199,7 +2059,7 @@ return_from_scan:
                        struct vm_speculative_age_q     *aq;
                        boolean_t       can_steal = FALSE;
                        int num_scanned_queues;
-                      
+
                        aq = &vm_page_queue_speculative[speculative_steal_index];
 
                        num_scanned_queues = 0;
@@ -2210,7 +2070,7 @@ return_from_scan:
 
                                if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q)
                                        speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
-                               
+
                                aq = &vm_page_queue_speculative[speculative_steal_index];
                        }
 
@@ -2220,33 +2080,26 @@ return_from_scan:
                                 * queues but still haven't found one
                                 * that is not empty, even though
                                 * vm_page_speculative_count is not 0.
-                                *
-                                * report the anomaly...
                                 */
-                               printf("vm_pageout_scan: "
-                                      "all speculative queues empty "
-                                      "but count=%d.  Re-adjusting.\n",
-                                      vm_page_speculative_count);
-                               if (vm_page_speculative_count > vm_page_speculative_count_drift_max)
-                                       vm_page_speculative_count_drift_max = vm_page_speculative_count;
-                               vm_page_speculative_count_drifts++;
+                               if (!vm_page_queue_empty(&sq->age_q))
+                                       continue;
 #if DEVELOPMENT || DEBUG
                                panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
-#endif /* DEVELOPMENT || DEBUG */
+#endif
                                /* readjust... */
                                vm_page_speculative_count = 0;
                                /* ... and continue */
                                continue;
                        }
 
-                       if (vm_page_speculative_count > vm_page_speculative_target || force_speculative_aging == TRUE)
+                       if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE)
                                can_steal = TRUE;
                        else {
                                if (!delay_speculative_age) {
                                        mach_timespec_t ts_fully_aged;
 
-                                       ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) / 1000;
-                                       ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms) % 1000)
+                                       ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
+                                       ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
                                                * 1000 * NSEC_PER_USEC;
 
                                        ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
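The two assignments above simply convert a millisecond age limit (number of speculative queues times the per-queue age) into a mach_timespec. A standalone sketch of that arithmetic, with illustrative names standing in for the kernel types:

    #include <stdint.h>

    struct ts_sketch { uint32_t tv_sec; uint32_t tv_nsec; };

    /* max_age_q * q_age_ms total milliseconds -> seconds + nanoseconds */
    static struct ts_sketch fully_aged_limit(uint32_t max_age_q, uint32_t q_age_ms)
    {
            uint64_t total_ms = (uint64_t)max_age_q * q_age_ms;
            struct ts_sketch ts;

            ts.tv_sec  = (uint32_t)(total_ms / 1000);
            ts.tv_nsec = (uint32_t)((total_ms % 1000) * 1000 * 1000);  /* ms -> ns */
            return ts;
    }

The kernel spells the nanosecond factor as 1000 * NSEC_PER_USEC, which is the same 1,000,000.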
@@ -2272,27 +2125,26 @@ return_from_scan:
                }
                force_speculative_aging = FALSE;
 
-#if CONFIG_BACKGROUND_QUEUE
-               if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0 &&
-                   ((vm_page_background_mode == VM_PAGE_BG_DISABLED) || (vm_page_background_count <= vm_page_background_target)))
-#else
-               if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0)
-#endif
-               {
+               if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
+
                        int     pages_evicted;
 
                        if (object != NULL) {
                                vm_object_unlock(object);
                                object = NULL;
                        }
+                       KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
+
                        pages_evicted = vm_object_cache_evict(100, 10);
 
+                       KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
+
                        if (pages_evicted) {
 
-                               vm_pageout_cache_evicted += pages_evicted;
+                               vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
 
                                VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
-                                              vm_page_free_count, pages_evicted, vm_pageout_cache_evicted, 0);
+                                              vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
                                memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
 
                                /*
@@ -2307,6 +2159,8 @@ return_from_scan:
                if  (cache_evict_throttle)
                        cache_evict_throttle--;
 
+               divisor = vm_pageout_state.vm_page_filecache_min_divisor;
+
 #if CONFIG_JETSAM
                /*
                 * don't let the filecache_min fall below 15% of available memory
@@ -2319,22 +2173,24 @@ return_from_scan:
                 * throttled queue (which isn't counted as available) which
                 * effectively disables this filter
                 */
-               if (vm_compressor_low_on_space())
-                       vm_page_filecache_min = 0;
+               if (vm_compressor_low_on_space() || divisor == 0)
+                       vm_pageout_state.vm_page_filecache_min = 0;
                else
-                       vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 7);
+                       vm_pageout_state.vm_page_filecache_min =
+                         ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
 #else
-               if (vm_compressor_out_of_space())
-                       vm_page_filecache_min = 0;
+               if (vm_compressor_out_of_space() || divisor == 0)
+                       vm_pageout_state.vm_page_filecache_min = 0;
                else {
                        /*
-                        * don't let the filecache_min fall below 33% of available memory...
+                        * don't let the filecache_min fall below the specified critical level
                         */
-                       vm_page_filecache_min = (AVAILABLE_NON_COMPRESSED_MEMORY / 3);
+                       vm_pageout_state.vm_page_filecache_min =
+                         ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
                }
 #endif
                if (vm_page_free_count < (vm_page_free_reserved / 4))
-                       vm_page_filecache_min = 0;
+                       vm_pageout_state.vm_page_filecache_min = 0;
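The fixed / 7 and / 3 fractions become a tunable divisor: vm_page_filecache_min_divisor behaves as a denominator scaled by ten, so a value of 70 reproduces the old roughly-15% JETSAM floor and 30 reproduces the old 33% floor, and 0 disables the floor entirely (the same pattern reappears later for vm_page_xpmapped_min). A small, self-contained illustration of the arithmetic with hypothetical numbers:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t available = 1000000;   /* stand-in for AVAILABLE_NON_COMPRESSED_MEMORY */
            unsigned divisor   = 30;        /* stand-in for vm_page_filecache_min_divisor   */

            /* divisor == 0 disables the floor, matching the code above */
            uint64_t filecache_min = divisor ? (available * 10) / divisor : 0;

            printf("filecache_min = %llu pages (%.1f%% of available)\n",
                   (unsigned long long)filecache_min,
                   100.0 * (double)filecache_min / (double)available);
            return 0;
    }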
 
                exceeded_burst_throttle = FALSE;
                /*
@@ -2346,28 +2202,23 @@ return_from_scan:
                 */
                if (vm_page_queue_empty(&vm_page_queue_inactive) &&
                    vm_page_queue_empty(&vm_page_queue_anonymous) &&
+                   vm_page_queue_empty(&vm_page_queue_cleaned) &&
                    vm_page_queue_empty(&sq->age_q)) {
-                       vm_pageout_scan_empty_throttle++;
-                       msecs = vm_pageout_empty_wait;
+                       VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
+                       msecs = vm_pageout_state.vm_pageout_empty_wait;
                        goto vm_pageout_scan_delay;
 
-               } else if (inactive_burst_count >= 
-                          MIN(vm_pageout_burst_inactive_throttle,
+               } else if (inactive_burst_count >=
+                          MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
                               (vm_page_inactive_count +
                                vm_page_speculative_count))) {
-                       vm_pageout_scan_burst_throttle++;
-                       msecs = vm_pageout_burst_wait;
+                       VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
+                       msecs = vm_pageout_state.vm_pageout_burst_wait;
 
                        exceeded_burst_throttle = TRUE;
                        goto vm_pageout_scan_delay;
 
-               } else if (vm_page_free_count > (vm_page_free_reserved / 4) &&
-                          VM_PAGEOUT_SCAN_NEEDS_TO_THROTTLE()) {
-                       vm_pageout_scan_swap_throttle++;
-                       msecs = vm_pageout_swap_wait;
-                       goto vm_pageout_scan_delay;
-
-               } else if (VM_PAGE_Q_THROTTLED(iq) && 
+               } else if (VM_PAGE_Q_THROTTLED(iq) &&
                                  VM_DYNAMIC_PAGING_ENABLED()) {
                        clock_sec_t sec;
                        clock_nsec_t nsec;
@@ -2375,36 +2226,36 @@ return_from_scan:
                        switch (flow_control.state) {
 
                        case FCS_IDLE:
-                               if ((vm_page_free_count + local_freed) < vm_page_free_target) {
+                               if ((vm_page_free_count + local_freed) < vm_page_free_target &&
+                                   vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
+                                       /*
+                                        * since the compressor is running independently of vm_pageout_scan
+                                        * let's not wait for it just yet... as long as we have a healthy supply
+                                        * of filecache pages to work with, let's keep stealing those.
+                                        */
+                                       inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
 
-                                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
-                                                                   VM_PAGEOUT_PB_THREAD_YIELD);
-                                       if (!VM_PAGE_Q_THROTTLED(iq)) {
-                                               vm_pageout_scan_yield_unthrottled++;
-                                               continue;
-                                       }
-                                       if (vm_page_pageable_external_count > vm_page_filecache_min &&
-                                           !vm_page_queue_empty(&vm_page_queue_inactive)) {
-                                               anons_grabbed = ANONS_GRABBED_LIMIT;
-                                               vm_pageout_scan_throttle_deferred++;
+                                       if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
+                                           (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
+                                               anons_grabbed = ANONS_GRABBED_LIMIT;
+                                               VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
                                                goto consider_inactive;
                                        }
-                                       if (((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) && vm_page_active_count)
-                                               continue;
                                }
 reset_deadlock_timer:
-                               ts.tv_sec = vm_pageout_deadlock_wait / 1000;
-                               ts.tv_nsec = (vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
+                               ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
+                               ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
                                clock_get_system_nanotime(&sec, &nsec);
                                flow_control.ts.tv_sec = (unsigned int) sec;
                                flow_control.ts.tv_nsec = nsec;
                                ADD_MACH_TIMESPEC(&flow_control.ts, &ts);
-                               
+
                                flow_control.state = FCS_DELAYED;
-                               msecs = vm_pageout_deadlock_wait;
+                               msecs = vm_pageout_state.vm_pageout_deadlock_wait;
 
+                               vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
                                break;
-                                       
+
                        case FCS_DELAYED:
                                clock_get_system_nanotime(&sec, &nsec);
                                ts.tv_sec = (unsigned int) sec;
@@ -2413,7 +2264,7 @@ reset_deadlock_timer:
                                if (CMP_MACH_TIMESPEC(&ts, &flow_control.ts) >= 0) {
                                        /*
                                         * the pageout thread for the default pager is potentially
-                                        * deadlocked since the 
+                                        * deadlocked since the
                                         * default pager queue has been throttled for more than the
                                         * allowable time... we need to move some clean pages or dirty
                                         * pages belonging to the external pagers if they aren't throttled
@@ -2421,14 +2272,15 @@ reset_deadlock_timer:
                                         * blocked waiting for pages... we'll move one page for each of
                                         * these plus a fixed amount to break the logjam... once we're done
                                         * moving this number of pages, we'll re-enter the FCS_DELAYED state
-                                        * with a new timeout target since we have no way of knowing 
+                                        * with a new timeout target since we have no way of knowing
                                         * whether we've broken the deadlock except through observation
                                         * of the queue associated with the default pager... we need to
                                         * stop moving pages and allow the system to run to see what
                                         * state it settles into.
                                         */
-                                       vm_pageout_deadlock_target = vm_pageout_deadlock_relief + vm_page_free_wanted + vm_page_free_wanted_privileged;
-                                       vm_pageout_scan_deadlock_detected++;
+                                       vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
+                                                                    vm_page_free_wanted + vm_page_free_wanted_privileged;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
                                        flow_control.state = FCS_DEADLOCK_DETECTED;
                                        thread_wakeup((event_t) &vm_pageout_garbage_collect);
                                        goto consider_inactive;
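The FCS_IDLE / FCS_DELAYED / FCS_DEADLOCK_DETECTED handling is a small state machine: arm a deadlock timer when the internal pageout queue is throttled, drop back to idle if the laundry drains, and escalate if the timer expires first. A compressed, user-space sketch of that shape (the names mirror the code above, but the types and transitions are deliberately simplified):

    enum fcs_state { FCS_IDLE, FCS_DELAYED, FCS_DEADLOCK_DETECTED };

    /* One step of the flow-control logic; 'throttled' and 'timer_expired'
     * stand in for VM_PAGE_Q_THROTTLED(iq) and the mach_timespec comparison. */
    static enum fcs_state fcs_step(enum fcs_state s, int throttled, int timer_expired)
    {
            switch (s) {
            case FCS_IDLE:
                    return throttled ? FCS_DELAYED : FCS_IDLE;       /* arm deadlock timer */
            case FCS_DELAYED:
                    if (!throttled)
                            return FCS_IDLE;                         /* laundry drained */
                    return timer_expired ? FCS_DEADLOCK_DETECTED : FCS_DELAYED;
            case FCS_DEADLOCK_DETECTED:
                    return FCS_DELAYED;      /* move a relief quota of pages, then re-arm */
            }
            return FCS_IDLE;
    }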
@@ -2439,7 +2291,7 @@ reset_deadlock_timer:
                                 * awakened immediately upon a laundry completion,
                                 * so we won't wait any longer than necessary
                                 */
-                               msecs = vm_pageout_idle_wait;
+                               msecs = vm_pageout_state.vm_pageout_idle_wait;
                                break;
 
                        case FCS_DEADLOCK_DETECTED:
@@ -2449,28 +2301,24 @@ reset_deadlock_timer:
 
                        }
 vm_pageout_scan_delay:
-                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed, 
+                       vm_pageout_scan_wants_object = VM_OBJECT_NULL;
+
+                       vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
                                                    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
 
-                       if (flow_control.state == FCS_DELAYED &&
-                           !VM_PAGE_Q_THROTTLED(iq)) {
-                               flow_control.state = FCS_IDLE;
-                               goto consider_inactive;
-                       }
-                       
                        if (vm_page_free_count >= vm_page_free_target) {
                                /*
                                 * we're here because
                                 *  1) someone else freed up some pages while we had
                                 *     the queues unlocked above
-                                * and we've hit one of the 3 conditions that 
+                                * and we've hit one of the 3 conditions that
                                 * cause us to pause the pageout scan thread
                                 *
                                 * since we already have enough free pages,
                                 * let's avoid stalling and return normally
                                 *
                                 * before we return, make sure the pageout I/O threads
-                                * are running throttled in case there are still requests 
+                                * are running throttled in case there are still requests
                                 * in the laundry... since we have enough free pages
                                 * we don't need the laundry to be cleaned in a timely
                                 * fashion... so let's avoid interfering with foreground
@@ -2493,7 +2341,7 @@ vm_pageout_scan_delay:
                                goto return_from_scan;
                        }
                        lck_mtx_unlock(&vm_page_queue_free_lock);
-                       
+
                        if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
                                /*
                                 * we're most likely about to block due to one of
@@ -2523,20 +2371,26 @@ vm_pageout_scan_delay:
                                flow_control.state = FCS_IDLE;
                                goto consider_inactive;
                        }
+                       if (flow_control.state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
+                               flow_control.state = FCS_IDLE;
+                               goto consider_inactive;
+                       }
+
                        VM_CHECK_MEMORYSTATUS;
 
                        if (flow_control.state != FCS_IDLE)
-                               vm_pageout_scan_throttle++;
-                       iq->pgo_throttled = TRUE;
+                               VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
 
+                       iq->pgo_throttled = TRUE;
                        assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000*NSEC_PER_USEC);
+
                        counter(c_vm_pageout_scan_block++);
 
                        vm_page_unlock_queues();
 
                        assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
 
-                       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, 
+                       VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
                                       iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
                        memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
 
@@ -2561,18 +2415,16 @@ vm_pageout_scan_delay:
 
                flow_control.state = FCS_IDLE;
 consider_inactive:
-               vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), 
+               vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
                                                                            vm_pageout_inactive_external_forced_reactivate_limit);
                loop_count++;
                inactive_burst_count++;
-               vm_pageout_inactive++;
-
+               vm_pageout_state.vm_pageout_inactive++;
 
                /*
                 * Choose a victim.
                 */
                while (1) {
-                       uint32_t        inactive_external_count;
 
 #if CONFIG_BACKGROUND_QUEUE
                        page_from_bg_q = FALSE;
@@ -2580,7 +2432,7 @@ consider_inactive:
 
                        m = NULL;
                        m_object = VM_OBJECT_NULL;
-                       
+
                        if (VM_DYNAMIC_PAGING_ENABLED()) {
                                assert(vm_page_throttled_count == 0);
                                assert(vm_page_queue_empty(&vm_page_queue_throttled));
@@ -2593,9 +2445,9 @@ consider_inactive:
                         */
                        if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
                                m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
-                    
-                               assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
-                    
+
+                               assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
+
                                break;
                        }
 
@@ -2606,9 +2458,9 @@ consider_inactive:
                        if (!vm_page_queue_empty(&sq->age_q)) {
                                m = (vm_page_t) vm_page_queue_first(&sq->age_q);
 
-                               assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
+                               assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
 
-                               if (!m->dirty || force_anonymous == FALSE)
+                               if (!m->vmp_dirty || force_anonymous == FALSE)
                                        break;
                                else
                                        m = NULL;
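Victim selection in the loop that follows walks the queues in a fixed priority order: already-cleaned pages first, then aged speculative pages, then (when eligible) the background queue, and finally file-backed inactive pages versus anonymous pages depending on the filecache floor. A condensed sketch of that ordering, with stub flags standing in for the real queue checks:

    enum victim_q { Q_NONE, Q_CLEANED, Q_SPECULATIVE, Q_BACKGROUND,
                    Q_FILE_INACTIVE, Q_ANONYMOUS };

    static enum victim_q
    pick_victim_queue(int cleaned_empty, int spec_empty, int bg_eligible,
                      int grab_anonymous, int file_inactive_empty, int anon_empty)
    {
            if (!cleaned_empty)
                    return Q_CLEANED;           /* cheapest: already cleaned in place */
            if (!spec_empty)
                    return Q_SPECULATIVE;       /* aged speculative pages next */
            if (bg_eligible)
                    return Q_BACKGROUND;        /* background queue, when allowed */
            if (!grab_anonymous || anon_empty) {
                    if (!file_inactive_empty)
                            return Q_FILE_INACTIVE;   /* favor the filecache */
            }
            if (!anon_empty)
                    return Q_ANONYMOUS;
            return Q_NONE;                      /* no victim: rebalance or panic */
    }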
@@ -2636,35 +2488,35 @@ consider_inactive:
                                } else if (force_anonymous == FALSE || bg_m_object->internal) {
 
                                        if (bg_m_object->internal &&
-                                           ((vm_compressor_out_of_space() == TRUE) ||
-                                            (vm_page_free_count < (vm_page_free_reserved / 4)))) {
-                                               
-                                               vm_pageout_skipped_bq_internal++;
+                                           (VM_PAGE_Q_THROTTLED(iq) ||
+                                            vm_compressor_out_of_space() == TRUE ||
+                                            vm_page_free_count < (vm_page_free_reserved / 4))) {
+
+                                               vm_pageout_skipped_bq_internal++;
                                        } else {
                                                page_from_bg_q = TRUE;
-                               
+
                                                if (bg_m_object->internal)
-                                                       vm_pageout_considered_bq_internal++;
+                                                       vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
                                                else
-                                                       vm_pageout_considered_bq_external++;
-                                       
+                                                       vm_pageout_vminfo.vm_pageout_considered_bq_external++;
                                                break;
                                        }
                                }
                        }
 #endif
-
-                       grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
                        inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
 
-                       if ((vm_page_pageable_external_count < vm_page_filecache_min || force_anonymous == TRUE) ||
-                           ((inactive_external_count < vm_page_anonymous_count) && (inactive_external_count < (vm_page_pageable_external_count / 3)))) {
+                       if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
+                           (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
                                grab_anonymous = TRUE;
                                anons_grabbed = 0;
 
-                               vm_pageout_skipped_external++;
+                               vm_pageout_vminfo.vm_pageout_skipped_external++;
                                goto want_anonymous;
                        }
+                       grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
+
 #if CONFIG_JETSAM
                        /* If the file-backed pool has accumulated
                         * significantly more pages than the jetsam
@@ -2677,22 +2529,19 @@ consider_inactive:
                         */
                        if (grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
                                if (vm_page_pageable_external_count >
-                                   vm_page_filecache_min) {
+                                   vm_pageout_state.vm_page_filecache_min) {
                                        if ((vm_page_pageable_external_count *
                                                vm_pageout_memorystatus_fb_factor_dr) >
                                            (memorystatus_available_pages_critical *
                                            vm_pageout_memorystatus_fb_factor_nr)) {
                                                grab_anonymous = FALSE;
-#if DEVELOPMENT || DEBUG
-                                               vm_grab_anon_overrides++;
-#endif
+
+                                               VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
                                        }
                                }
-#if DEVELOPMENT || DEBUG
                                if (grab_anonymous) {
-                                       vm_grab_anon_nops++;
+                                       VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
                                }
-#endif
                        }
 #endif /* CONFIG_JETSAM */
 
@@ -2701,18 +2550,23 @@ want_anonymous:
 
                                if ( !vm_page_queue_empty(&vm_page_queue_inactive) ) {
                                        m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
-                               
-                                       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
+
+                                       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
                                        anons_grabbed = 0;
 
-                                       if (vm_page_pageable_external_count < vm_page_filecache_min) {
-                                               if ((++reactivated_this_call % 100))
-                                                       goto must_activate_page;
-                                               /*
-                                                * steal 1% of the file backed pages even if
-                                                * we are under the limit that has been set
-                                                * for a healthy filecache
-                                                */
+                                       if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
+
+                                             if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
+                                                   if ((++reactivated_this_call % 100)) {
+                                                         vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
+                                                         goto must_activate_page;
+                                                   }
+                                                   /*
+                                                    * steal 1% of the file backed pages even if
+                                                    * we are under the limit that has been set
+                                                    * for a healthy filecache
+                                                    */
+                                             }
                                        }
                                        break;
                                }
@@ -2720,7 +2574,7 @@ want_anonymous:
                        if ( !vm_page_queue_empty(&vm_page_queue_anonymous) ) {
                                m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
 
-                               assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
+                               assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
                                anons_grabbed++;
 
                                break;
@@ -2729,16 +2583,13 @@ want_anonymous:
                        /*
                         * if we've gotten here, we have no victim page.
                         * check to see if we've not finished balancing the queues
-                        * or we have a page on the aged speculative queue that we 
+                        * or we have a page on the aged speculative queue that we
                         * skipped due to force_anonymous == TRUE.. or we have
                        * speculative pages that we can prematurely age... in
                         * one of these cases we'll keep going, else panic
                         */
                        force_anonymous = FALSE;
-                       vm_pageout_no_victim++;
-
-                       if ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target)
-                               goto done_with_inactivepage;
+                       VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
 
                        if (!vm_page_queue_empty(&sq->age_q))
                                goto done_with_inactivepage;
@@ -2748,14 +2599,14 @@ want_anonymous:
                                goto done_with_inactivepage;
                        }
                        panic("vm_pageout: no victim");
-                       
+
                        /* NOTREACHED */
                }
                assert(VM_PAGE_PAGEABLE(m));
                m_object = VM_PAGE_OBJECT(m);
                force_anonymous = FALSE;
-               
-               page_prev_q_state = m->vm_page_q_state;
+
+               page_prev_q_state = m->vmp_q_state;
                /*
                 * we just found this page on one of our queues...
                 * it can't also be on the pageout queue, so safe
@@ -2763,14 +2614,13 @@ want_anonymous:
                 */
                vm_page_queues_remove(m, TRUE);
 
-               assert(!m->laundry);
-               assert(!m->private);
-               assert(!m->fictitious);
+               assert(!m->vmp_laundry);
+               assert(!m->vmp_private);
+               assert(!m->vmp_fictitious);
                assert(m_object != kernel_object);
                assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
 
-               vm_pageout_stats[vm_pageout_stat_now].considered++;
-               vm_pageout_considered_page++;
+               vm_pageout_vminfo.vm_pageout_considered_page++;
 
                DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
 
@@ -2781,14 +2631,13 @@ want_anonymous:
                 */
                if (m_object != object) {
                        /*
-                        * the object associated with candidate page is 
+                        * the object associated with candidate page is
                         * different from the one we were just working
                         * with... dump the lock if we still own it
                         */
                        if (object != NULL) {
                                vm_object_unlock(object);
                                object = NULL;
-                               vm_pageout_scan_wants_object = VM_OBJECT_NULL;
                        }
                        /*
                         * Try to lock object; since we've already got the
@@ -2803,86 +2652,74 @@ want_anonymous:
                        if (!vm_object_lock_try_scan(m_object)) {
                                vm_page_t m_want = NULL;
 
-                               vm_pageout_inactive_nolock++;
+                               vm_pageout_vminfo.vm_pageout_inactive_nolock++;
 
                                if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                                       vm_pageout_cleaned_nolock++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
 
                                pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
-                               m->reference = FALSE;
-
-#if !CONFIG_EMBEDDED
-                               /*
-                                * m->object must be stable since we hold the page queues lock...
-                                * we can update the scan_collisions field sans the object lock
-                                * since it is a separate field and this is the only spot that does
-                                * a read-modify-write operation and it is never executed concurrently...
-                                * we can asynchronously set this field to 0 when creating a UPL, so it
-                                * is possible for the value to be a bit non-determistic, but that's ok
-                                * since it's only used as a hint
-                                */
 
-                               /*
-                                * This is not used on EMBEDDED because having this variable set *could* lead
-                                * us to self-cannibalize pages from m_object to fill a UPL for a pagein.
-                                * And, there's a high probability that the object that vm_pageout_scan
-                                * wants and collides on is a very popular object e.g. the shared cache on EMBEDDED.
-                                * The older pages that we cannibalize from the shared cache could be really
-                                * important text pages e.g. the system call stubs.
-                                */
-                               m_object->scan_collisions = 1;
-#endif /* !CONFIG_EMBEDDED */
+                               m->vmp_reference = FALSE;
 
-                               if ( !vm_page_queue_empty(&sq->age_q) )
-                                       m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
-                               else if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
+                               if ( !m_object->object_is_shared_cache) {
+                                       /*
+                                        * don't apply this optimization if this is the shared cache
+                                        * object, it's too easy to get rid of very hot and important
+                                        * pages...
+                                        * m->vmp_object must be stable since we hold the page queues lock...
+                                        * we can update the scan_collisions field sans the object lock
+                                        * since it is a separate field and this is the only spot that does
+                                        * a read-modify-write operation and it is never executed concurrently...
+                                        * we can asynchronously set this field to 0 when creating a UPL, so it
+                                        * is possible for the value to be a bit non-deterministic, but that's ok
+                                        * since it's only used as a hint
+                                        */
+                                       m_object->scan_collisions = 1;
+                               }
+                               if ( !vm_page_queue_empty(&vm_page_queue_cleaned))
                                        m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
-                               else if ( !vm_page_queue_empty(&vm_page_queue_inactive) &&
-                                         (anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)))
-                                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
+                               else if ( !vm_page_queue_empty(&sq->age_q))
+                                       m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
+                               else if ( (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT ||
+                                          vm_page_queue_empty(&vm_page_queue_anonymous)) &&
+                                         !vm_page_queue_empty(&vm_page_queue_inactive))
+                                       m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
                                else if ( !vm_page_queue_empty(&vm_page_queue_anonymous))
                                        m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
 
                                /*
                                 * this is the next object we're going to be interested in
-                                * try to make sure its available after the mutex_yield
+                                * try to make sure it's available after the mutex_pause
                                 * returns control
                                 */
                                if (m_want)
                                        vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
 
-                               /*
-                                * force us to dump any collected free pages
-                                * and to pause before moving on
-                                */
-                               try_failed = TRUE;
-
                                goto requeue_page;
                        }
                        object = m_object;
                        vm_pageout_scan_wants_object = VM_OBJECT_NULL;
-
-                       try_failed = FALSE;
                }
                assert(m_object == object);
                assert(VM_PAGE_OBJECT(m) == m_object);
 
-               if (m->busy) {
+               if (m->vmp_busy) {
                        /*
                         *      Somebody is already playing with this page.
                         *      Put it back on the appropriate queue
                         *
                         */
-                       vm_pageout_inactive_busy++;
+                       VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
 
                        if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                               vm_pageout_cleaned_busy++;
+                               VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
 requeue_page:
                        if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
                                vm_page_enqueue_inactive(m, FALSE);
                        else
                                vm_page_activate(m);
 #if CONFIG_BACKGROUND_QUEUE
+#if DEVELOPMENT || DEBUG
                        if (page_from_bg_q == TRUE) {
                                if (m_object->internal)
                                        vm_pageout_rejected_bq_internal++;
@@ -2890,6 +2727,31 @@ requeue_page:
                                        vm_pageout_rejected_bq_external++;
                        }
 #endif
+#endif
+                       goto done_with_inactivepage;
+               }
+
+               /*
+                *   if (m->vmp_cleaning && !m->vmp_free_when_done)
+                *      If already cleaning this page in place
+                *      just leave if off the paging queues.
+                *      We can leave the page mapped, and upl_commit_range
+                *      will put it on the clean queue.
+                *
+                *   if (m->vmp_free_when_done && !m->vmp_cleaning)
+                *      an msync INVALIDATE is in progress...
+                *      this page has been marked for destruction
+                *      after it has been cleaned,
+                *      but not yet gathered into a UPL
+                *      where 'cleaning' will be set...
+                *      just leave it off the paging queues
+                *
+                *   if (m->vmp_free_when_done && m->vmp_cleaning)
+                *      an msync INVALIDATE is in progress
+                *      and the UPL has already gathered this page...
+                *      just leave it off the paging queues
+                */
+               if (m->vmp_free_when_done || m->vmp_cleaning) {
                        goto done_with_inactivepage;
                }
 
@@ -2901,17 +2763,17 @@ requeue_page:
                 *      from reclaiming it - busy or cleaning - that we've already
                 *      dealt with
                 */
-               if (m->absent || m->error || !object->alive) {
+               if (m->vmp_absent || m->vmp_error || !object->alive) {
 
-                       if (m->absent)
-                               vm_pageout_inactive_absent++;
+                       if (m->vmp_absent)
+                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
                        else if (!object->alive)
-                               vm_pageout_inactive_notalive++;
+                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
                        else
-                               vm_pageout_inactive_error++;
-reclaim_page:                  
+                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
+reclaim_page:
                        if (vm_pageout_deadlock_target) {
-                               vm_pageout_scan_inactive_throttle_success++;
+                               VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
                                vm_pageout_deadlock_target--;
                        }
 
@@ -2922,10 +2784,16 @@ reclaim_page:
                        } else {
                                DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
                        }
-                       assert(!m->cleaning);
-                       assert(!m->laundry);
+                       assert(!m->vmp_cleaning);
+                       assert(!m->vmp_laundry);
+
+                       if (!object->internal &&
+                           object->pager != NULL &&
+                           object->pager->mo_pager_ops == &shared_region_pager_ops) {
+                               shared_region_pager_reclaimed++;
+                       }
 
-                       m->busy = TRUE;
+                       m->vmp_busy = TRUE;
 
                        /*
                         * remove page from object here since we're already
@@ -2933,42 +2801,43 @@ reclaim_page:
                         * we'd normally do in vm_page_free_prepare_object
                         * until 'vm_page_free_list' is called
                         */
-                       if (m->tabled)
+                       if (m->vmp_tabled)
                                vm_page_remove(m, TRUE);
 
-                       assert(m->pageq.next == 0 && m->pageq.prev == 0);
-                       m->snext = local_freeq;
+                       assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
+                       m->vmp_snext = local_freeq;
                        local_freeq = m;
                        local_freed++;
-                       
+
                        if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
-                               vm_pageout_freed_from_speculative++;
+                               vm_pageout_vminfo.vm_pageout_freed_speculative++;
                        else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                               vm_pageout_freed_from_cleaned++;
+                               vm_pageout_vminfo.vm_pageout_freed_cleaned++;
+                       else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
+                               vm_pageout_vminfo.vm_pageout_freed_internal++;
                        else
-                               vm_pageout_freed_from_inactive_clean++;
-
-                       vm_pageout_stats[vm_pageout_stat_now].reclaimed_clean++;
+                               vm_pageout_vminfo.vm_pageout_freed_external++;
 
                        inactive_burst_count = 0;
                        goto done_with_inactivepage;
                }
-               /*
-                * If the object is empty, the page must be reclaimed even
-                * if dirty or used.
-                * If the page belongs to a volatile object, we stick it back
-                * on.
-                */
                if (object->copy == VM_OBJECT_NULL) {
+                       /*
+                        * No one else can have any interest in this page.
+                        * If this is an empty purgable object, the page can be
+                        * reclaimed even if dirty.
+                        * If the page belongs to a volatile purgable object, we
+                        * reactivate it if the compressor isn't active.
+                        */
                        if (object->purgable == VM_PURGABLE_EMPTY) {
-                               if (m->pmapped == TRUE) {
+                               if (m->vmp_pmapped == TRUE) {
                                        /* unmap the page */
                                        refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
                                        if (refmod_state & VM_MEM_MODIFIED) {
                                                SET_PAGE_DIRTY(m, FALSE);
                                        }
                                }
-                               if (m->dirty || m->precious) {
+                               if (m->vmp_dirty || m->vmp_precious) {
                                        /* we saved the cost of cleaning this page ! */
                                        vm_page_purged_count++;
                                }
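Reclaimed pages are not handed back one at a time: in the reclaim_page path earlier in this hunk, each page is pushed onto a thread-local chain through its vmp_snext field and the whole batch is released later. A minimal sketch of that intrusive local free list (the types are stand-ins, not the kernel's vm_page):

    struct page_sketch {
            struct page_sketch *snext;       /* plays the role of m->vmp_snext */
    };

    static struct page_sketch *local_freeq;  /* head of the batch being built  */
    static int                 local_freed;  /* pages accumulated so far       */

    static void local_free_push(struct page_sketch *m)
    {
            m->snext    = local_freeq;
            local_freeq = m;
            local_freed++;
    }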
@@ -2997,7 +2866,7 @@ reclaim_page:
                                reactivated_this_call++;
 
                                if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                                       vm_pageout_cleaned_volatile_reactivated++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
 
                                goto reactivate_page;
                        }
@@ -3010,66 +2879,33 @@ reclaim_page:
                 */
                refmod_state = -1;
 
-               if (m->reference == FALSE && m->pmapped == TRUE) {
+               if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
                        refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
-                 
+
                        if (refmod_state & VM_MEM_REFERENCED)
-                               m->reference = TRUE;
+                               m->vmp_reference = TRUE;
                        if (refmod_state & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }
-               
-               /*
-                *   if (m->cleaning && !m->free_when_done)
-                *      If already cleaning this page in place and it hasn't
-                *      been recently referenced, just pull off the queue.
-                *      We can leave the page mapped, and upl_commit_range
-                *      will put it on the clean queue.
-                *
-                *   if (m->free_when_done && !m->cleaning)
-                *      an msync INVALIDATE is in progress...
-                *      this page has been marked for destruction
-                *      after it has been cleaned,
-                *      but not yet gathered into a UPL
-                *      where 'cleaning' will be set...
-                *      just leave it off the paging queues
-                *
-                *   if (m->free_when_done && m->clenaing)
-                *      an msync INVALIDATE is in progress
-                *      and the UPL has already gathered this page...
-                *      just leave it off the paging queues
-                */
-               
-               /*
-                * page with m->free_when_done and still on the queues means that an
-                * MS_INVALIDATE is in progress on this page... leave it alone
-                */
-               if (m->free_when_done) {
-                       goto done_with_inactivepage;
-               }
-               
-               /* if cleaning, reactivate if referenced.  otherwise, just pull off queue */
-               if (m->cleaning) {
-                       if (m->reference == TRUE) {
-                               reactivated_this_call++;
-                               goto reactivate_page;
-                       } else {
-                               goto done_with_inactivepage;
-                       }
-               }
 
-                if (m->reference || m->dirty) {
+                if (m->vmp_reference || m->vmp_dirty) {
                         /* deal with a rogue "reusable" page */
                         VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
                 }
+               divisor = vm_pageout_state.vm_page_xpmapped_min_divisor;
 
-               if (!m->no_cache &&
+               if (divisor == 0)
+                       vm_pageout_state.vm_page_xpmapped_min = 0;
+               else
+                       vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / divisor;
+
+               if (!m->vmp_no_cache &&
 #if CONFIG_BACKGROUND_QUEUE
                    page_from_bg_q == FALSE &&
 #endif
-                   (m->reference ||
-                    (m->xpmapped && !object->internal && (vm_page_xpmapped_external_count < (vm_page_external_count / 4))))) {
+                   (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
+                                     (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
                        /*
                         * The page we pulled off the inactive list has
                         * been referenced.  It is possible for other
@@ -3079,14 +2915,16 @@ reclaim_page:
                         * reactivations.
                         */
                        if (++reactivated_this_call >= reactivate_limit) {
-                               vm_pageout_reactivation_limit_exceeded++;
+                               vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
                        } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
-                               vm_pageout_inactive_force_reclaim++;
+                               vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
                        } else {
                                uint32_t isinuse;
 
                                if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                                       vm_pageout_cleaned_reference_reactivated++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
+
+                               vm_pageout_vminfo.vm_pageout_inactive_referenced++;
 reactivate_page:
                                if ( !object->internal && object->pager != MEMORY_OBJECT_NULL &&
                                     vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
@@ -3095,7 +2933,7 @@ reactivate_page:
                                         * and it's not open via the filesystem
                                         */
                                        vm_page_deactivate(m);
-                                       vm_pageout_inactive_deactivated++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
                                } else {
 must_activate_page:
                                        /*
@@ -3106,25 +2944,27 @@ must_activate_page:
                                        inactive_burst_count = 0;
                                }
 #if CONFIG_BACKGROUND_QUEUE
+#if DEVELOPMENT || DEBUG
                                if (page_from_bg_q == TRUE) {
                                        if (m_object->internal)
                                                vm_pageout_rejected_bq_internal++;
                                        else
                                                vm_pageout_rejected_bq_external++;
                                }
+#endif
 #endif
                                if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                                       vm_pageout_cleaned_reactivated++;
-                               vm_pageout_inactive_used++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
+                               vm_pageout_state.vm_pageout_inactive_used++;
 
                                 goto done_with_inactivepage;
                        }
-                       /* 
+                       /*
                         * Make sure we call pmap_get_refmod() if it
                         * wasn't already called just above, to update
                         * the dirty bit.
                         */
-                       if ((refmod_state == -1) && !m->dirty && m->pmapped) {
+                       if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
                                refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
                                if (refmod_state & VM_MEM_MODIFIED) {
                                        SET_PAGE_DIRTY(m, FALSE);
@@ -3134,18 +2974,18 @@ must_activate_page:
 
                 XPR(XPR_VM_PAGEOUT,
                 "vm_pageout_scan, replace object 0x%X offset 0x%X page 0x%X\n",
-                object, m->offset, m, 0,0);
+                object, m->vmp_offset, m, 0,0);
 
                /*
                 * we've got a candidate page to steal...
                 *
-                * m->dirty is up to date courtesy of the
-                * preceding check for m->reference... if 
-                * we get here, then m->reference had to be
+                * m->vmp_dirty is up to date courtesy of the
+                * preceding check for m->vmp_reference... if
+                * we get here, then m->vmp_reference had to be
                 * FALSE (or possibly "reactivate_limit" was
                  * exceeded), but in either case we called
                  * pmap_get_refmod() and updated both
-                 * m->reference and m->dirty
+                 * m->vmp_reference and m->vmp_dirty
                 *
                 * if it's dirty or precious we need to
                 * see if the target queue is throttled
@@ -3155,7 +2995,7 @@ must_activate_page:
 
                inactive_throttled = FALSE;
 
-               if (m->dirty || m->precious) {
+               if (m->vmp_dirty || m->vmp_precious) {
                        if (object->internal) {
                                if (VM_PAGE_Q_THROTTLED(iq))
                                        inactive_throttled = TRUE;
@@ -3165,18 +3005,18 @@ must_activate_page:
                }
 throttle_inactive:
                if (!VM_DYNAMIC_PAGING_ENABLED() &&
-                   object->internal && m->dirty &&
+                   object->internal && m->vmp_dirty &&
                    (object->purgable == VM_PURGABLE_DENY ||
                     object->purgable == VM_PURGABLE_NONVOLATILE ||
                     object->purgable == VM_PURGABLE_VOLATILE)) {
                        vm_page_check_pageable_safe(m);
-                       assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+                       assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
                        vm_page_queue_enter(&vm_page_queue_throttled, m,
-                                           vm_page_t, pageq);
-                       m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
+                                           vm_page_t, vmp_pageq);
+                       m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
                        vm_page_throttled_count++;
 
-                       vm_pageout_scan_reclaimed_throttled++;
+                       VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
 
                        inactive_burst_count = 0;
                        goto done_with_inactivepage;
@@ -3209,12 +3049,12 @@ throttle_inactive:
                                 * that we can try to find clean pages in the active/inactive queues before
                                 * deciding to jetsam a process
                                 */
-                               vm_pageout_scan_inactive_throttled_external++;
+                               vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
 
                                vm_page_check_pageable_safe(m);
-                               assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                               vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, pageq);
-                               m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
+                               assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                               vm_page_queue_enter(&vm_page_queue_active, m, vm_page_t, vmp_pageq);
+                               m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
                                vm_page_active_count++;
                                vm_page_pageable_external_count++;
 
@@ -3238,13 +3078,13 @@ throttle_inactive:
 
                                         /* Kill first suitable process. If this call returned FALSE, we might have simply purged a process instead. */
                                        if (memorystatus_kill_on_VM_page_shortage(FALSE) == TRUE) {
-                                               vm_pageout_inactive_external_forced_jetsam_count++;
+                                               VM_PAGEOUT_DEBUG(vm_pageout_inactive_external_forced_jetsam_count, 1);
                                        }
-                                       
+
                                        VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_END,
                                                        vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
 
-                                       vm_page_lock_queues();  
+                                       vm_page_lock_queues();
                                        delayed_unlock = 1;
                                }
 #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
@@ -3253,7 +3093,6 @@ throttle_inactive:
                                inactive_burst_count = 0;
                                goto done_with_inactivepage;
                        } else {
-                               vm_pageout_scan_inactive_throttled_internal++;
                                goto must_activate_page;
                        }
                }
@@ -3264,17 +3103,17 @@ throttle_inactive:
                 * we have the up-to-date modified state
                 *
                 * if we need to do a pmap_disconnect then we
-                * need to re-evaluate m->dirty since the pmap_disconnect
-                * provides the true state atomically... the 
+                * need to re-evaluate m->vmp_dirty since the pmap_disconnect
+                * provides the true state atomically... the
                 * page was still mapped up to the pmap_disconnect
                 * and may have been dirtied at the last microsecond
                 *
                 * Note that if 'pmapped' is FALSE then the page is not
                 * and has not been in any map, so there is no point calling
-                * pmap_disconnect().  m->dirty could have been set in anticipation
+                * pmap_disconnect().  m->vmp_dirty could have been set in anticipation
                 * of likely usage of the page.
                 */
-               if (m->pmapped == TRUE) {
+               if (m->vmp_pmapped == TRUE) {
                        int pmap_options;
 
                        /*
@@ -3291,7 +3130,7 @@ throttle_inactive:
                        if ( !VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
                            object->internal == FALSE) {
                                pmap_options = 0;
-                       } else if (m->dirty || m->precious) {
+                       } else if (m->vmp_dirty || m->vmp_precious) {
                                /*
                                 * VM knows that this page is dirty (or
                                 * precious) and needs to be compressed
@@ -3319,8 +3158,9 @@ throttle_inactive:
                                SET_PAGE_DIRTY(m, FALSE);
                        }
                }
+
                /*
-                * reset our count of pages that have been reclaimed 
+                * reset our count of pages that have been reclaimed
                 * since the last page was 'stolen'
                 */
                inactive_reclaim_run = 0;
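The comment at the top of this block states the key invariant: pmap_disconnect reports the hardware ref/mod bits atomically as it tears the mappings down, so the software dirty bit has to be refreshed from its return value. A condensed sketch of that idiom, using pmap_disconnect_options() and the refmod constants as they appear elsewhere in this file (option selection and error handling omitted):

    if (m->vmp_pmapped == TRUE) {
            int refmod_state;

            /* tear down all mappings; returns the accumulated ref/mod state */
            refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
                                                   pmap_options, NULL);
            if (refmod_state & VM_MEM_MODIFIED) {
                    /* the page may have been dirtied right up to the disconnect */
                    SET_PAGE_DIRTY(m, FALSE);
            }
    }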
@@ -3328,18 +3168,10 @@ throttle_inactive:
                /*
                 *      If it's clean and not precious, we can free the page.
                 */
-               if (!m->dirty && !m->precious) {
+               if (!m->vmp_dirty && !m->vmp_precious) {
 
-                       if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
-                               vm_pageout_speculative_clean++;
-                       else {
-                               if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
-                                       vm_pageout_inactive_anonymous++;
-                               else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q)
-                                       vm_pageout_cleaned_reclaimed++;
+                       vm_pageout_state.vm_pageout_inactive_clean++;
 
-                               vm_pageout_inactive_clean++;
-                       }
                        /*
                         * OK, at this point we have found a page we are going to free.
                         */
@@ -3365,7 +3197,7 @@ throttle_inactive:
 
                if (inactive_throttled == TRUE)
                        goto throttle_inactive;
-       
+
 #if VM_PRESSURE_EVENTS
 #if CONFIG_JETSAM
 
@@ -3377,40 +3209,43 @@ throttle_inactive:
                 */
 
 #else /* CONFIG_JETSAM */
-               
+
                vm_pressure_response();
 
 #endif /* CONFIG_JETSAM */
 #endif /* VM_PRESSURE_EVENTS */
-               
+
                if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q)
-                       vm_pageout_speculative_dirty++;
-               else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q)
-                       vm_pageout_inactive_anonymous++;
+                       VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
 
                if (object->internal)
-                       vm_pageout_inactive_dirty_internal++;
+                       vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
                else
-                       vm_pageout_inactive_dirty_external++;
+                       vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
 
                /*
-                * do NOT set the pageout bit!
-                * sure, we might need free pages, but this page is going to take time to become free 
-                * anyway, so we may as well put it on the clean queue first and take it from there later
-                * if necessary.  that way, we'll ensure we don't free up too much. -mj
+                * internal pages will go to the compressor...
+                * external pages will go to the appropriate pager to be cleaned
+                * and upon completion will end up on 'vm_page_queue_cleaned' which
+                * is a preferred queue to steal from
                 */
                vm_pageout_cluster(m);
+               inactive_burst_count = 0;
 
 done_with_inactivepage:
 
-               if (delayed_unlock++ > delayed_unlock_limit || try_failed == TRUE) {
+               if (delayed_unlock++ > delayed_unlock_limit) {
+                       int freed = local_freed;
 
                        vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
                                                    VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
-                       if (try_failed == TRUE)
-                               lck_mtx_yield(&vm_page_queue_lock);
+                       if (freed == 0)
+                               lck_mtx_yield(&vm_page_queue_lock);
+               } else if (vm_pageout_scan_wants_object) {
+                       vm_page_unlock_queues();
+                       mutex_pause(0);
+                       vm_page_lock_queues();
                }
-
                /*
                 * back to top of pageout scan loop
                 */
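The rewritten epilogue above bounds how long vm_page_queue_lock is held: after delayed_unlock_limit pages the locally collected free list is flushed, and the lock is yielded only when that flush freed nothing, i.e. when no forward progress was made. A self-contained toy model of the pattern, using pthread stand-ins rather than the kernel lck_mtx API:

    #include <pthread.h>
    #include <sched.h>
    #include <stdbool.h>

    #define DELAYED_UNLOCK_LIMIT 128

    static pthread_mutex_t page_queue_lock = PTHREAD_MUTEX_INITIALIZER;

    static bool scan_one_page(void)         { return false; }  /* stand-in */
    static int  flush_local_free_list(void) { return 0; }      /* stand-in */

    static void pageout_scan_model(void)
    {
            int delayed_unlock = 1;

            pthread_mutex_lock(&page_queue_lock);
            while (scan_one_page()) {
                    if (delayed_unlock++ > DELAYED_UNLOCK_LIMIT) {
                            int freed;

                            /* flush outside the lock to bound hold time */
                            pthread_mutex_unlock(&page_queue_lock);
                            freed = flush_local_free_list();
                            pthread_mutex_lock(&page_queue_lock);
                            delayed_unlock = 1;

                            if (freed == 0) {
                                    /* no progress: give waiters a turn
                                     * (the kernel uses lck_mtx_yield()) */
                                    pthread_mutex_unlock(&page_queue_lock);
                                    sched_yield();
                                    pthread_mutex_lock(&page_queue_lock);
                            }
                    }
            }
            pthread_mutex_unlock(&page_queue_lock);
    }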
@@ -3418,8 +3253,6 @@ done_with_inactivepage:
 }
 
 
-int vm_page_free_count_init;
-
 void
 vm_page_free_reserve(
        int pages)
@@ -3439,7 +3272,7 @@ vm_page_free_reserve(
                else
                        vm_page_free_reserved += pages;
        }
-       free_after_reserve = vm_page_free_count_init - vm_page_free_reserved;
+       free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
 
        vm_page_free_min = vm_page_free_reserved +
                VM_PAGE_FREE_MIN(free_after_reserve);
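The only substantive change in vm_page_free_reserve() is that the boot-time free count now lives in vm_pageout_state.vm_page_free_count_init; the threshold layering is unchanged: subtract the reserve first, compute the minimum from what remains, then stack it back on top of the reserve. Illustrative arithmetic only (the real VM_PAGE_FREE_MIN() macro is not shown in this diff; the 10% figure below is a made-up stand-in):

    unsigned int vm_page_free_count_init = 100000;   /* pages at boot (example)   */
    unsigned int vm_page_free_reserved   = 2000;     /* held back for privileged  */
    unsigned int free_after_reserve      = vm_page_free_count_init - vm_page_free_reserved; /* 98000 */
    unsigned int vm_page_free_min        = vm_page_free_reserved + free_after_reserve / 10; /* ~11800 */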
@@ -3467,7 +3300,7 @@ void
 vm_pageout_continue(void)
 {
        DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
-       vm_pageout_scan_event_counter++;
+       VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
 
 #if !CONFIG_EMBEDDED
        lck_mtx_lock(&vm_page_queue_free_lock);
@@ -3530,11 +3363,11 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
        vm_object_offset_t offset;
        memory_object_t pager;
 
-       /* On systems without a compressor, the external IO thread clears its
+       /* On systems with a compressor, the external IO thread clears its
         * VM privileged bit to accommodate large allocations (e.g. bulk UPL
         * creation)
         */
-       if (vm_pageout_internal_iothread != THREAD_NULL)
+       if (vm_pageout_state.vm_pageout_internal_iothread != THREAD_NULL)
                current_thread()->options &= ~TH_OPT_VMPRIV;
 
        vm_page_lockspin_queues();
@@ -3542,9 +3375,9 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
         while ( !vm_page_queue_empty(&q->pgo_pending) ) {
 
                   q->pgo_busy = TRUE;
-                  vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
+                  vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, vmp_pageq);
 
-                  assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
+                  assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
                   VM_PAGE_CHECK(m);
                   /*
                    * grab a snapshot of the object and offset this
@@ -3556,12 +3389,9 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
                    * on this object which will keep it from terminating
                    */
                   object = VM_PAGE_OBJECT(m);
-                  offset = m->offset;
+                  offset = m->vmp_offset;
 
-                  if (object->object_slid) {
-                          panic("slid page %p not allowed on this path\n", m);
-                  }
-                  m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+                  m->vmp_q_state = VM_PAGE_NOT_ON_Q;
                   VM_PAGE_ZERO_PAGEQ_ENTRY(m);
 
                   vm_page_unlock_queues();
@@ -3571,7 +3401,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
                   m = vm_page_lookup(object, offset);
 
                   if (m == NULL ||
-                      m->busy || m->cleaning || !m->laundry || (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
+                      m->vmp_busy || m->vmp_cleaning || !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
                           /*
                            * it's either the same page that someone else has
                            * started cleaning (or it's finished cleaning or
@@ -3595,7 +3425,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
                            * memory_object_destroy or vm_object_destroy, and
                            * so there is nowhere for the page to go.
                            */
-                          if (m->free_when_done) {
+                          if (m->vmp_free_when_done) {
                                   /*
                                    * Just free the page... VM_PAGE_FREE takes
                                    * care of cleaning up all the state...
@@ -3607,7 +3437,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
 
                                   vm_pageout_throttle_up(m);
                                   vm_page_activate(m);
-                                  
+
                                   vm_page_unlock_queues();
 
                                   /*
@@ -3643,7 +3473,7 @@ vm_pageout_iothread_external_continue(struct vm_pageout_queue *q)
                    * any pageout clustering happens there
                    */
                   memory_object_data_return(pager,
-                                            m->offset + object->paging_offset,
+                                            m->vmp_offset + object->paging_offset,
                                             PAGE_SIZE,
                                             NULL,
                                             NULL,
@@ -3676,10 +3506,6 @@ uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
                                     */
 
 
-#if DEVELOPMENT || DEBUG
-uint64_t compressor_epoch_start, compressor_epoch_stop, compressor_threads_runtime;
-#endif
-
 void
 vm_pageout_iothread_internal_continue(struct cq *);
 void
@@ -3693,14 +3519,14 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
        vm_page_t   local_freeq = NULL;
        int         local_freed = 0;
        int         local_batch_size;
-       int     ncomps = 0;
 #if DEVELOPMENT || DEBUG
+       int       ncomps = 0;
        boolean_t marked_active = FALSE;
 #endif
        KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
 
        q = cq->q;
-       local_batch_size = q->pgo_maxlaundry / (vm_compressor_thread_count * 2);
+       local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
 
 #if RECORD_THE_COMPRESSED_DATA
        if (q->pgo_laundry)
@@ -3721,7 +3547,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
                        vmct_state[cq->id] = VMCT_ACTIVE;
                        marked_active = TRUE;
                        if (vmct_active == 1) {
-                               compressor_epoch_start = mach_absolute_time();
+                               vm_compressor_epoch_start = mach_absolute_time();
                        }
                }
 #endif
@@ -3729,17 +3555,17 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
 
                KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
 
-               while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt <  local_batch_size) {
+               while ( !vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
 
-                       vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, pageq);
-                       assert(m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q);
+                       vm_page_queue_remove_first(&q->pgo_pending, m, vm_page_t, vmp_pageq);
+                       assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
                        VM_PAGE_CHECK(m);
-                       
-                       m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+
+                       m->vmp_q_state = VM_PAGE_NOT_ON_Q;
                        VM_PAGE_ZERO_PAGEQ_ENTRY(m);
-                       m->laundry = FALSE;
+                       m->vmp_laundry = FALSE;
 
-                       m->snext = local_q;
+                       m->vmp_snext = local_q;
                        local_q = m;
                        local_cnt++;
                }
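The loop above drains up to local_batch_size pages from the shared pgo_pending queue into a private list threaded through vmp_snext, so the expensive compression work can run with the page queues unlocked. A stand-alone toy version of that dequeue (simplified types, locking not shown; the real pgo_pending is a vm_page_queue rather than a plain linked list):

    struct page {
            struct page *snext;
            /* ... payload ... */
    };

    /* Pop at most 'batch' entries from *pending onto a private LIFO list. */
    static struct page *grab_local_batch(struct page **pending, int batch)
    {
            struct page *local_q = NULL;
            int n = 0;

            while (*pending != NULL && n < batch) {
                    struct page *m = *pending;

                    *pending = m->snext;    /* unlink from the shared queue */
                    m->snext = local_q;     /* push onto the private list   */
                    local_q  = m;
                    n++;
            }
            return local_q;
    }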
@@ -3757,7 +3583,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
                vm_page_unlock_queues();
 
 #if !RECORD_THE_COMPRESSED_DATA
-               if (pages_left_on_q >= local_batch_size && cq->id < (vm_compressor_thread_count - 1)) {
+               if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
                        thread_wakeup((event_t) ((uintptr_t)&q->pgo_pending + cq->id + 1));
                }
 #endif
@@ -3768,19 +3594,25 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
                        KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
 
                        m = local_q;
-                       local_q = m->snext;
-                       m->snext = NULL;
+                       local_q = m->vmp_snext;
+                       m->vmp_snext = NULL;
 
-                       if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m, FALSE) == KERN_SUCCESS) {
+                       if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) {
+#if DEVELOPMENT || DEBUG
                                ncomps++;
-                               m->snext = local_freeq;
+#endif
+                               KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
+
+                               m->vmp_snext = local_freeq;
                                local_freeq = m;
                                local_freed++;
 
                                if (local_freed >= MAX_FREE_BATCH) {
-                                       vm_pageout_freed_after_compression += local_freed;
+
+                                       OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
 
                                        vm_page_free_list(local_freeq, TRUE);
+
                                        local_freeq = NULL;
                                        local_freed = 0;
                                }
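Note the accounting change here: the old vm_pageout_freed_after_compression was a plain global bumped per batch, while the new vm_pageout_vminfo.vm_pageout_compressions is updated with OSAddAtomic64, so several compressor threads can add to it concurrently without losing increments and without a lock. A user-space analogue of the same idiom, using C11 atomics in place of OSAddAtomic64:

    #include <stdatomic.h>

    static _Atomic long long total_compressions;

    static void flush_batch(long long local_freed)
    {
            /* one relaxed atomic add per flushed batch, not one per page */
            atomic_fetch_add_explicit(&total_compressions, local_freed,
                                      memory_order_relaxed);
    }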
@@ -3791,7 +3623,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
                                int             need_wakeup = 0;
 
                                if (local_freeq) {
-                                       vm_pageout_freed_after_compression += local_freed;
+                                       OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
 
                                        vm_page_free_list(local_freeq, TRUE);
                                        local_freeq = NULL;
@@ -3821,7 +3653,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
 #endif
                }
                if (local_freeq) {
-                       vm_pageout_freed_after_compression += local_freed;
+                       OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
 
                        vm_page_free_list(local_freeq, TRUE);
                        local_freeq = NULL;
@@ -3848,14 +3680,15 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
                vmct_state[cq->id] = VMCT_IDLE;
 
                if (vmct_active == 0) {
-                       compressor_epoch_stop = mach_absolute_time();
-                       assert(compressor_epoch_stop > compressor_epoch_start);
+                       vm_compressor_epoch_stop = mach_absolute_time();
+                       assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
+                           "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
+                           vm_compressor_epoch_start, vm_compressor_epoch_stop);
                        /* This interval includes intervals where one or more
                         * compressor threads were pre-empted
                         */
-                       vmct_stats.vmct_cthreads_total += compressor_epoch_stop - compressor_epoch_start;
+                       vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
                }
-
        }
 #endif
        vm_page_unlock_queues();
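The epoch accounting above follows a "first one in starts the clock, last one out stops it" pattern: vmct_active tracks how many compressor threads are currently busy, the epoch start is stamped when the count reaches one, and only the thread that drops it back to zero folds the interval into vmct_stats.vmct_cthreads_total. A compact model of that pattern, simplified and assuming (as the vm_page_unlock_queues() just below suggests) that the counter updates are serialized by the page-queues lock:

    #include <stdint.h>
    /* mach_absolute_time() is declared in <mach/mach_time.h> */

    static int      vmct_active_model;
    static uint64_t epoch_start, epoch_stop, cthreads_total;

    static void compressor_thread_enter(void)   /* caller holds the queues lock */
    {
            if (++vmct_active_model == 1)
                    epoch_start = mach_absolute_time();    /* first thread in  */
    }

    static void compressor_thread_exit(void)    /* caller holds the queues lock */
    {
            if (--vmct_active_model == 0) {
                    epoch_stop = mach_absolute_time();     /* last thread out  */
                    cthreads_total += epoch_stop - epoch_start;
            }
    }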
@@ -3881,7 +3714,7 @@ vm_pageout_iothread_internal_continue(struct cq *cq)
 
 
 kern_return_t
-vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, boolean_t object_locked_by_caller) 
+vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
 {
        vm_object_t     object;
        memory_object_t pager;
@@ -3890,16 +3723,13 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b
 
        object = VM_PAGE_OBJECT(m);
 
-       if (object->object_slid) {
-               panic("slid page %p not allowed on this path\n", m);
-       }
-       assert(!m->free_when_done);
-       assert(!m->laundry);
+       assert(!m->vmp_free_when_done);
+       assert(!m->vmp_laundry);
 
        pager = object->pager;
 
-       if (object_locked_by_caller == FALSE && (!object->pager_initialized || pager == MEMORY_OBJECT_NULL))  {
-                               
+       if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL)  {
+
                KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
 
                vm_object_lock(object);
@@ -3929,9 +3759,9 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b
 
                        vm_page_lockspin_queues();
                        vm_page_activate(m);
-                       vm_pageout_dirty_no_pager++;
+                       VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
                        vm_page_unlock_queues();
-                                       
+
                        /*
                         *      And we are done with it.
                         */
@@ -3941,36 +3771,31 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b
                        return KERN_FAILURE;
                }
                vm_object_unlock(object);
-                               
+
                KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
        }
        assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
-
-       if (object_locked_by_caller == FALSE)
-               assert(object->activity_in_progress > 0);
+       assert(object->activity_in_progress > 0);
 
        retval = vm_compressor_pager_put(
                pager,
-               m->offset + object->paging_offset,
+               m->vmp_offset + object->paging_offset,
                VM_PAGE_GET_PHYS_PAGE(m),
                current_chead,
                scratch_buf,
                &compressed_count_delta);
 
-       if (object_locked_by_caller == FALSE) {
-               vm_object_lock(object);
+       vm_object_lock(object);
 
-               assert(object->activity_in_progress > 0);
-               assert(VM_PAGE_OBJECT(m) == object);
-       }
+       assert(object->activity_in_progress > 0);
+       assert(VM_PAGE_OBJECT(m) == object);
+       assert( !VM_PAGE_WIRED(m));
 
        vm_compressor_pager_count(pager,
                                  compressed_count_delta,
                                  FALSE, /* shared_lock */
                                  object);
 
-       assert( !VM_PAGE_WIRED(m));
-
        if (retval == KERN_SUCCESS) {
                /*
                 * If the object is purgeable, its owner's
@@ -3979,15 +3804,16 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b
                 * contributes to the owner's memory footprint,
                 * so account for it as such.
                 */
-               if (object->purgable != VM_PURGABLE_DENY &&
-                   object->vo_purgeable_owner != NULL) {
-                       /* one more compressed purgeable page */
-                       vm_purgeable_compressed_update(object,
-                                                      +1);
+               if ((object->purgable != VM_PURGABLE_DENY ||
+                    object->vo_ledger_tag) &&
+                   object->vo_owner != NULL) {
+                       /* one more compressed purgeable/tagged page */
+                       vm_object_owner_compressed_update(object,
+                                                         +1);
                }
                VM_STAT_INCR(compressions);
-                       
-               if (m->tabled)
+
+               if (m->vmp_tabled)
                        vm_page_remove(m, TRUE);
 
        } else {
@@ -3996,14 +3822,13 @@ vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m, b
                vm_page_lockspin_queues();
 
                vm_page_activate(m);
-               vm_compressor_failed++;
+               vm_pageout_vminfo.vm_compressor_failed++;
 
                vm_page_unlock_queues();
        }
-       if (object_locked_by_caller == FALSE) {
-               vm_object_activity_end(object);
-               vm_object_unlock(object);
-       }
+       vm_object_activity_end(object);
+       vm_object_unlock(object);
+
        return retval;
 }
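With the object_locked_by_caller parameter gone, vm_pageout_compress_page() always takes and drops the object lock itself, so the compressor thread's call site reduces to the form already visible in vm_pageout_iothread_internal_continue() above (sketch only, surrounding batching elided):

    if (vm_pageout_compress_page(&cq->current_chead, cq->scratch_buf, m) == KERN_SUCCESS) {
            /* the page's contents now live in the compressor pool;
             * chain the vm_page onto the local free batch */
            m->vmp_snext = local_freeq;
            local_freeq = m;
            local_freed++;
    }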
 
@@ -4012,12 +3837,12 @@ static void
 vm_pageout_adjust_eq_iothrottle(struct vm_pageout_queue *eq, boolean_t req_lowpriority)
 {
        uint32_t        policy;
-       
+
        if (hibernate_cleaning_in_progress == TRUE)
                req_lowpriority = FALSE;
 
        if (eq->pgo_inited == TRUE && eq->pgo_lowpriority != req_lowpriority) {
-               
+
                vm_page_unlock_queues();
 
                if (req_lowpriority == TRUE) {
@@ -4078,7 +3903,7 @@ vm_pageout_iothread_internal(struct cq *cq)
 
        vm_page_unlock_queues();
 
-       if (vm_restricted_to_single_processor == TRUE)
+       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE)
                thread_vm_bind_group_add();
 
 
@@ -4092,7 +3917,7 @@ vm_pageout_iothread_internal(struct cq *cq)
 }
 
 kern_return_t
-vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) 
+vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
 {
        if (OSCompareAndSwapPtr(NULL, func, (void * volatile *) &consider_buffer_cache_collect)) {
                return KERN_SUCCESS;
@@ -4141,7 +3966,7 @@ vm_pressure_response(void)
        if (memorystatus_manual_testing_on) {
                return;
        }
-       
+
        old_level = memorystatus_vm_pressure_level;
 
        switch (memorystatus_vm_pressure_level) {
@@ -4180,17 +4005,22 @@ vm_pressure_response(void)
                default:
                        return;
        }
-               
+
        if (new_level != -1) {
                memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
 
-               if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != new_level)) {
-                       if (vm_pressure_thread_running == FALSE) {
+               if (new_level != old_level) {
+                       VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
+                                       new_level, old_level, 0, 0);
+               }
+
+               if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level)) {
+                       if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
                                thread_wakeup(&vm_pressure_thread);
                        }
 
-                       if (old_level != new_level) {
-                               thread_wakeup(&vm_pressure_changed);
+                       if (old_level != memorystatus_vm_pressure_level) {
+                               thread_wakeup(&vm_pageout_state.vm_pressure_changed);
                        }
                }
        }
@@ -4202,11 +4032,11 @@ kern_return_t
 mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) {
 
 #if CONFIG_EMBEDDED
-       
+
        return KERN_FAILURE;
 
 #elif !VM_PRESSURE_EVENTS
-       
+
        return KERN_FAILURE;
 
 #else /* VM_PRESSURE_EVENTS */
@@ -4221,7 +4051,7 @@ mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused un
                        wait_result_t           wr = 0;
 
                        while (old_level == *pressure_level) {
-                               wr = assert_wait((event_t) &vm_pressure_changed,
+                               wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
                                                 THREAD_INTERRUPTIBLE);
                                if (wr == THREAD_WAITING) {
                                        wr = thread_block(THREAD_CONTINUE_NULL);
@@ -4230,7 +4060,7 @@ mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused un
                                        return KERN_ABORTED;
                                }
                                if (wr == THREAD_AWAKENED) {
-                                       
+
                                        old_level = memorystatus_vm_pressure_level;
 
                                        if (old_level != *pressure_level) {
@@ -4256,11 +4086,12 @@ vm_pressure_thread(void) {
        static boolean_t thread_initialized = FALSE;
 
        if (thread_initialized == TRUE) {
-               vm_pressure_thread_running = TRUE;
+               vm_pageout_state.vm_pressure_thread_running = TRUE;
                consider_vm_pressure_events();
-               vm_pressure_thread_running = FALSE;
+               vm_pageout_state.vm_pressure_thread_running = FALSE;
        }
 
+       thread_set_thread_name(current_thread(), "VM_pressure");
        thread_initialized = TRUE;
        assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
        thread_block((thread_continue_t)vm_pressure_thread);
@@ -4268,17 +4099,15 @@ vm_pressure_thread(void) {
 #endif /* VM_PRESSURE_EVENTS */
 
 
-uint32_t vm_pageout_considered_page_last = 0;
-
 /*
  * called once per-second via "compute_averages"
  */
 void
 compute_pageout_gc_throttle(__unused void *arg)
 {
-       if (vm_pageout_considered_page != vm_pageout_considered_page_last) {
+       if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
 
-               vm_pageout_considered_page_last = vm_pageout_considered_page;
+               vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
 
                thread_wakeup((event_t) &vm_pageout_garbage_collect);
        }
@@ -4339,7 +4168,7 @@ vm_pageout_garbage_collect(int collect)
                        stack_collect();
 
                        consider_machine_collect();
-                       m_drain();
+                       mbuf_drain(FALSE);
 
                        do {
                                if (consider_buffer_cache_collect != NULL) {
@@ -4388,13 +4217,13 @@ vm_set_restrictions()
 
        if (hinfo.max_cpus <= 3) {
                /*
-                * on systems with a limited number of CPUS, bind the 
+                * on systems with a limited number of CPUS, bind the
                 * 4 major threads that can free memory and that tend to use
                 * a fair bit of CPU under pressured conditions to a single processor.
                 * This ensures that these threads don't hog all of the available CPUs
                 * (important for camera launch), while allowing them to run independently
                 * w/r to locks... the 4 threads are
-                * vm_pageout_scan,  vm_pageout_iothread_internal (compressor), 
+                * vm_pageout_scan,  vm_pageout_iothread_internal (compressor),
                 * vm_compressor_swap_trigger_thread (minor and major compactions),
                 * memorystatus_thread (jetsams).
                 *
@@ -4403,8 +4232,9 @@ vm_set_restrictions()
                 * thread_bind_master...  someday this should be replaced with a group
                 * scheduling mechanism and KPI.
                 */
-               vm_restricted_to_single_processor = TRUE;
-       }
+               vm_pageout_state.vm_restricted_to_single_processor = TRUE;
+       } else
+               vm_pageout_state.vm_restricted_to_single_processor = FALSE;
 }
 
 void
@@ -4428,7 +4258,7 @@ vm_pageout(void)
        if (!self->reserved_stack)
                self->reserved_stack = self->kernel_stack;
 
-       if (vm_restricted_to_single_processor == TRUE)
+       if (vm_pageout_state.vm_restricted_to_single_processor == TRUE)
                thread_vm_bind_group_add();
 
        splx(s);
@@ -4439,43 +4269,63 @@ vm_pageout(void)
         *      Initialize some paging parameters.
         */
 
-       if (vm_pageout_swap_wait == 0)
-               vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
-
-       if (vm_pageout_idle_wait == 0)
-               vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
-
-       if (vm_pageout_burst_wait == 0)
-               vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
-
-       if (vm_pageout_empty_wait == 0)
-               vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
+       vm_pageout_state.vm_pressure_thread_running = FALSE;
+       vm_pageout_state.vm_pressure_changed = FALSE;
+       vm_pageout_state.memorystatus_purge_on_warning = 2;
+       vm_pageout_state.memorystatus_purge_on_urgent = 5;
+       vm_pageout_state.memorystatus_purge_on_critical = 8;
+       vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
+       vm_pageout_state.vm_page_speculative_percentage = 5;
+       vm_pageout_state.vm_page_speculative_target = 0;
+
+       vm_pageout_state.vm_pageout_external_iothread = THREAD_NULL;
+       vm_pageout_state.vm_pageout_internal_iothread = THREAD_NULL;
+
+       vm_pageout_state.vm_pageout_swap_wait = 0;
+       vm_pageout_state.vm_pageout_idle_wait = 0;
+       vm_pageout_state.vm_pageout_empty_wait = 0;
+       vm_pageout_state.vm_pageout_burst_wait = 0;
+       vm_pageout_state.vm_pageout_deadlock_wait = 0;
+       vm_pageout_state.vm_pageout_deadlock_relief = 0;
+       vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
+
+       vm_pageout_state.vm_pageout_inactive = 0;
+       vm_pageout_state.vm_pageout_inactive_used = 0;
+       vm_pageout_state.vm_pageout_inactive_clean = 0;
+
+       vm_pageout_state.vm_memory_pressure = 0;
+        vm_pageout_state.vm_page_filecache_min = 0;
+#if CONFIG_JETSAM
+       vm_pageout_state.vm_page_filecache_min_divisor = 70;
+       vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
+#else
+       vm_pageout_state.vm_page_filecache_min_divisor = 27;
+       vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
+#endif
+       vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
 
-       if (vm_pageout_deadlock_wait == 0)
-               vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
+        vm_pageout_state.vm_pageout_considered_page_last = 0;
 
-       if (vm_pageout_deadlock_relief == 0)
-               vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
+       if (vm_pageout_state.vm_pageout_swap_wait == 0)
+               vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
 
-       if (vm_pageout_inactive_relief == 0)
-               vm_pageout_inactive_relief = VM_PAGEOUT_INACTIVE_RELIEF;
+       if (vm_pageout_state.vm_pageout_idle_wait == 0)
+               vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
 
-       if (vm_pageout_burst_active_throttle == 0)
-               vm_pageout_burst_active_throttle = VM_PAGEOUT_BURST_ACTIVE_THROTTLE;
+       if (vm_pageout_state.vm_pageout_burst_wait == 0)
+               vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
 
-       if (vm_pageout_burst_inactive_throttle == 0)
-               vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
+       if (vm_pageout_state.vm_pageout_empty_wait == 0)
+               vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
 
-       /*
-        * Set kernel task to low backing store privileged 
-        * status
-        */
-       task_lock(kernel_task);
-       kernel_task->priv_flags |= VM_BACKING_STORE_PRIV;
-       task_unlock(kernel_task);
+       if (vm_pageout_state.vm_pageout_deadlock_wait == 0)
+               vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
 
-       vm_page_free_count_init = vm_page_free_count;
+       if (vm_pageout_state.vm_pageout_deadlock_relief == 0)
+               vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
 
+       if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0)
+               vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
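The initialization block above is the only place in this diff where the new consolidated state shows its shape; the struct definition itself lives in the headers and is not part of these hunks. An inferred, deliberately partial sketch, with field names taken from the assignments above and types guessed from how they are used:

    struct vm_pageout_state {           /* inferred; not the actual declaration */
            thread_t     vm_pageout_external_iothread;
            thread_t     vm_pageout_internal_iothread;

            unsigned int vm_pageout_swap_wait;
            unsigned int vm_pageout_idle_wait;
            unsigned int vm_pageout_empty_wait;
            unsigned int vm_pageout_burst_wait;
            unsigned int vm_pageout_deadlock_wait;
            unsigned int vm_pageout_deadlock_relief;
            unsigned int vm_pageout_burst_inactive_throttle;

            unsigned int vm_page_speculative_q_age_ms;
            unsigned int vm_page_speculative_percentage;
            unsigned int vm_page_speculative_target;

            unsigned int vm_page_free_count_init;
            unsigned int vm_page_filecache_min;
            unsigned int vm_page_filecache_min_divisor;
            unsigned int vm_page_xpmapped_min_divisor;

            int          vm_compressor_thread_count;
            boolean_t    vm_restricted_to_single_processor;
            boolean_t    vm_pressure_thread_running;
            boolean_t    vm_pressure_changed;
            /* ... plus the per-scan counters referenced throughout this file ... */
    };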
        /*
         * even if we've already called vm_page_free_reserve
         * call it again here to ensure that the targets are
@@ -4514,16 +4364,16 @@ vm_pageout(void)
        /* internal pageout thread started when default pager registered first time */
        /* external pageout and garbage collection threads started here */
 
-       result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL, 
+       result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external, NULL,
                                              BASEPRI_VM,
-                                             &vm_pageout_external_iothread);
+                                             &vm_pageout_state.vm_pageout_external_iothread);
        if (result != KERN_SUCCESS)
                panic("vm_pageout_iothread_external: create failed");
 
-       thread_deallocate(vm_pageout_external_iothread);
+       thread_deallocate(vm_pageout_state.vm_pageout_external_iothread);
 
        result = kernel_thread_start_priority((thread_continue_t)vm_pageout_garbage_collect, NULL,
-                                             BASEPRI_DEFAULT, 
+                                             BASEPRI_DEFAULT,
                                              &thread);
        if (result != KERN_SUCCESS)
                panic("vm_pageout_garbage_collect: create failed");
@@ -4547,7 +4397,7 @@ vm_pageout(void)
        bzero(&vm_config, sizeof(vm_config));
 
        switch(vm_compressor_mode) {
-               
+
        case VM_PAGER_DEFAULT:
                printf("mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
 
@@ -4643,12 +4493,6 @@ vm_pageout(void)
 
 
 
-#if CONFIG_EMBEDDED
-int vm_compressor_thread_count = 1;
-#else
-int vm_compressor_thread_count = 2;
-#endif
-
 kern_return_t
 vm_pageout_internal_start(void)
 {
@@ -4664,28 +4508,39 @@ vm_pageout_internal_start(void)
 
        assert(hinfo.max_cpus > 0);
 
-       PE_parse_boot_argn("vmcomp_threads", &vm_compressor_thread_count, sizeof(vm_compressor_thread_count));
-       if (vm_compressor_thread_count >= hinfo.max_cpus)
-               vm_compressor_thread_count = hinfo.max_cpus - 1;
-       if (vm_compressor_thread_count <= 0)
-               vm_compressor_thread_count = 1;
-       else if (vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
-               vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
+#if CONFIG_EMBEDDED
+       vm_pageout_state.vm_compressor_thread_count = 1;
+#else
+       if (hinfo.max_cpus > 4)
+               vm_pageout_state.vm_compressor_thread_count = 2;
+       else
+               vm_pageout_state.vm_compressor_thread_count = 1;
+#endif
+       PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
+                          sizeof(vm_pageout_state.vm_compressor_thread_count));
+
+       if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus)
+               vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
+       if (vm_pageout_state.vm_compressor_thread_count <= 0)
+               vm_pageout_state.vm_compressor_thread_count = 1;
+       else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT)
+               vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
 
-       vm_pageout_queue_internal.pgo_maxlaundry = (vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
+       vm_pageout_queue_internal.pgo_maxlaundry = (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
 
        PE_parse_boot_argn("vmpgoi_maxlaundry", &vm_pageout_queue_internal.pgo_maxlaundry, sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
 
-       for (i = 0; i < vm_compressor_thread_count; i++) {
+       for (i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
                ciq[i].id = i;
                ciq[i].q = &vm_pageout_queue_internal;
                ciq[i].current_chead = NULL;
                ciq[i].scratch_buf = kalloc(COMPRESSOR_SCRATCH_BUF_SIZE);
 
-               result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i], BASEPRI_VM, &vm_pageout_internal_iothread);
+               result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal, (void *)&ciq[i],
+                                                     BASEPRI_VM, &vm_pageout_state.vm_pageout_internal_iothread);
 
                if (result == KERN_SUCCESS)
-                       thread_deallocate(vm_pageout_internal_iothread);
+                       thread_deallocate(vm_pageout_state.vm_pageout_internal_iothread);
                else
                        break;
        }
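The boot-arg handling above replaces the old compile-time defaults: embedded kernels get one compressor thread, other kernels get two when there are more than four CPUs, vmcomp_threads can override either choice, and the result is clamped to at most max_cpus - 1 and to [1, MAX_COMPRESSOR_THREAD_COUNT]. The internal queue's pgo_maxlaundry then defaults to thread_count * 4 * VM_PAGE_LAUNDRY_MAX, itself overridable via vmpgoi_maxlaundry. The clamp, restated as a pure function for illustration:

    static int clamp_compressor_threads(int requested, int max_cpus, int hard_max)
    {
            if (requested >= max_cpus)
                    requested = max_cpus - 1;
            if (requested <= 0)
                    requested = 1;
            else if (requested > hard_max)
                    requested = hard_max;
            return requested;
    }

    /* e.g. clamp_compressor_threads(2, 8, 8) == 2,
     *      clamp_compressor_threads(16, 4, 8) == 3 */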
@@ -4718,7 +4573,7 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl)
                  * This case should rarely happen and even if it does, it just means
                  * that we might issue a spurious expedite which the driver is expected
                  * to handle.
-                 */ 
+                 */
                 upl_unlock(src_upl);
                 return;
         }
@@ -4729,7 +4584,7 @@ upl_set_decmp_info(upl_t upl, upl_t src_upl)
         upl->decmp_io_upl = (void *)src_upl;
        upl_unlock(src_upl);
 }
-#endif /* CONFIG_IOSCHED */  
+#endif /* CONFIG_IOSCHED */
 
 #if UPL_DEBUG
 int    upl_debug_enabled = 1;
@@ -4773,11 +4628,12 @@ upl_create(int type, int flags, upl_size_t size)
        upl_lock_init(upl);
        upl->vector_upl = NULL;
        upl->associated_upl = NULL;
+       upl->upl_iodone = NULL;
 #if CONFIG_IOSCHED
        if (type & UPL_CREATE_IO_TRACKING) {
                upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
        }
-       
+
        upl->upl_reprio_info = 0;
        upl->decmp_io_upl = 0;
        if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
@@ -4786,7 +4642,7 @@ upl_create(int type, int flags, upl_size_t size)
                upl->upl_reprio_info = (uint64_t *)kalloc(sizeof(uint64_t) * atop(size));
                bzero(upl->upl_reprio_info, (sizeof(uint64_t) * atop(size)));
                upl->flags |= UPL_EXPEDITE_SUPPORTED;
-               if (curthread->decmp_upl != NULL) 
+               if (curthread->decmp_upl != NULL)
                        upl_set_decmp_info(upl, curthread->decmp_upl);
        }
 #endif
@@ -4879,7 +4735,7 @@ upl_destroy(upl_t upl)
 
        if (upl->flags & UPL_INTERNAL) {
                kfree(upl,
-                     sizeof(struct upl) + 
+                     sizeof(struct upl) +
                      (sizeof(struct upl_page_info) * (size/PAGE_SIZE))
                      + page_field_size);
        } else {
@@ -4891,13 +4747,17 @@ void
 upl_deallocate(upl_t upl)
 {
        upl_lock(upl);
+
        if (--upl->ref_count == 0) {
                if(vector_upl_is_valid(upl))
                        vector_upl_deallocate(upl);
-               upl_unlock(upl);        
+               upl_unlock(upl);
+
+               if (upl->upl_iodone)
+                       upl_callout_iodone(upl);
+
                upl_destroy(upl);
-       }
-       else
+       } else
                upl_unlock(upl);
 }
 
@@ -4908,7 +4768,7 @@ upl_mark_decmp(upl_t upl)
        if (upl->flags & UPL_TRACKED_BY_OBJECT) {
                upl->flags |= UPL_DECMP_REQ;
                upl->upl_creator->decmp_upl = (void *)upl;
-       }       
+       }
 }
 
 void
@@ -4917,7 +4777,7 @@ upl_unmark_decmp(upl_t upl)
        if(upl && (upl->flags & UPL_DECMP_REQ)) {
                upl->upl_creator->decmp_upl = NULL;
        }
-} 
+}
 
 #endif /* CONFIG_IOSCHED */
 
@@ -4937,22 +4797,9 @@ must_throttle_writes()
 }
 
 
-#if DEVELOPMENT || DEBUG
-/*
- * Statistics about UPL enforcement of copy-on-write obligations.
- */
-unsigned long upl_cow = 0;
-unsigned long upl_cow_again = 0;
-unsigned long upl_cow_pages = 0;
-unsigned long upl_cow_again_pages = 0;
-
-unsigned long iopl_cow = 0;
-unsigned long iopl_cow_pages = 0;
-#endif
-
-/*  
- *     Routine:        vm_object_upl_request 
- *     Purpose:        
+/*
+ *     Routine:        vm_object_upl_request
+ *     Purpose:
  *             Cause the population of a portion of a vm_object.
  *             Depending on the nature of the request, the pages
 *             returned may contain valid data or be uninitialized.
@@ -4963,7 +4810,7 @@ unsigned long iopl_cow_pages = 0;
  *             IMPORTANT NOTE: The caller must still respect the relationship
  *             between the vm_object and its backing memory object.  The
  *             caller MUST NOT substitute changes in the backing file
- *             without first doing a memory_object_lock_request on the 
+ *             without first doing a memory_object_lock_request on the
 *             target range unless it is known that the pages are not
  *             shared with another entity at the pager level.
  *             Copy_in_to:
@@ -4981,7 +4828,7 @@ unsigned long iopl_cow_pages = 0;
  *                     all mapped pages.  Where a page does not exist
  *                     map a zero filled one. Leave pages busy in
  *                     the original object.  If a page list structure
- *                     is not specified, this call is a no-op. 
+ *                     is not specified, this call is a no-op.
  *
  *             Note:  access of default pager objects has a rather interesting
  *             twist.  The caller of this routine, presumably the file system
@@ -4989,7 +4836,7 @@ unsigned long iopl_cow_pages = 0;
  *             against a default pager backed object.  Only the default
  *             pager will make requests on backing store related vm_objects
  *             In this way the default pager can maintain the relationship
- *             between backing store files (abstract memory objects) and 
+ *             between backing store files (abstract memory objects) and
  *             the vm_objects (cache objects), they support.
  *
  */
@@ -5013,9 +4860,6 @@ vm_object_upl_request(
        boolean_t               hw_dirty;
        upl_t                   upl = NULL;
        unsigned int            entry;
-#if MACH_CLUSTER_STATS
-       boolean_t               encountered_lrp = FALSE;
-#endif
        vm_page_t               alias_page = NULL;
         int                    refmod_state = 0;
        wpl_array_t             lite_list = NULL;
@@ -5026,7 +4870,10 @@ vm_object_upl_request(
        int                     dw_limit;
        int                     io_tracking_flag = 0;
        int                     grab_options;
+       int                     page_grab_count = 0;
        ppnum_t                 phys_page;
+       pmap_flush_context      pmap_flush_context_storage;
+       boolean_t               pmap_flushes_delayed = FALSE;
 
        if (cntrl_flags & ~UPL_VALID_FLAGS) {
                /*
@@ -5040,6 +4887,7 @@ vm_object_upl_request(
        if (object->phys_contiguous)
                panic("vm_object_upl_request: contiguous object specified\n");
 
+       VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
 
        if (size > MAX_UPL_SIZE_BYTES)
                size = MAX_UPL_SIZE_BYTES;
@@ -5063,7 +4911,7 @@ vm_object_upl_request(
 
                        user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
                        lite_list = (wpl_array_t)
-                                       (((uintptr_t)user_page_list) + 
+                                       (((uintptr_t)user_page_list) +
                                        ((size/PAGE_SIZE) * sizeof(upl_page_info_t)));
                        if (size == 0) {
                                user_page_list = NULL;
@@ -5091,7 +4939,7 @@ vm_object_upl_request(
                }
        }
        *upl_ptr = upl;
-       
+
        if (user_page_list)
                user_page_list[0].device = FALSE;
 
@@ -5157,10 +5005,9 @@ vm_object_upl_request(
                                 FALSE, /* should_return */
                                 MEMORY_OBJECT_COPY_SYNC,
                                 VM_PROT_NO_CHANGE);
-#if DEVELOPMENT || DEBUG
-               upl_cow++;
-               upl_cow_pages += size >> PAGE_SHIFT;
-#endif
+
+               VM_PAGEOUT_DEBUG(upl_cow, 1);
+               VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
        }
        /*
         * remember which copy object we synchronized with
@@ -5189,7 +5036,7 @@ vm_object_upl_request(
                vnode_pager_get_isSSD(object->pager, &isSSD);
 #endif
                vm_object_unlock(object);
-               
+
                OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
 
                if (isSSD == TRUE)
@@ -5214,12 +5061,12 @@ vm_object_upl_request(
                        upl->flags |= UPL_PAGE_SYNC_DONE;
 
                        if ( ((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
-                               dst_page->fictitious ||
-                               dst_page->absent ||
-                               dst_page->error ||
-                               dst_page->cleaning ||
+                               dst_page->vmp_fictitious ||
+                               dst_page->vmp_absent ||
+                               dst_page->vmp_error ||
+                               dst_page->vmp_cleaning ||
                                (VM_PAGE_WIRED(dst_page))) {
-                               
+
                                if (user_page_list)
                                        user_page_list[entry].phys_addr = 0;
 
@@ -5234,7 +5081,7 @@ vm_object_upl_request(
                         * anyway... so we can eliminate an extra call into
                         * the pmap layer by grabbing it here and recording it
                         */
-                       if (dst_page->pmapped)
+                       if (dst_page->vmp_pmapped)
                                refmod_state = pmap_get_refmod(phys_page);
                        else
                                refmod_state = 0;
@@ -5252,15 +5099,15 @@ vm_object_upl_request(
                                /*
                                 * we're only asking for DIRTY pages to be returned
                                 */
-                               if (dst_page->laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
+                               if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
                                        /*
                                         * if we were the page stolen by vm_pageout_scan to be
-                                        * cleaned (as opposed to a buddy being clustered in 
+                                        * cleaned (as opposed to a buddy being clustered in
                                         * or this request is not being driven by a PAGEOUT cluster
                                         * then we only need to check for the page being dirty or
                                         * precious to decide whether to return it
                                         */
-                                       if (dst_page->dirty || dst_page->precious || (refmod_state & VM_MEM_MODIFIED))
+                                       if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED))
                                                goto check_busy;
                                        goto dont_return;
                                }
@@ -5271,9 +5118,9 @@ vm_object_upl_request(
                                 * can't have been referenced recently...
                                 */
                                if ( (hibernate_cleaning_in_progress == TRUE ||
-                                     (!((refmod_state & VM_MEM_REFERENCED) || dst_page->reference) ||
-                                      (dst_page->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q))) && 
-                                    ((refmod_state & VM_MEM_MODIFIED) || dst_page->dirty || dst_page->precious) ) {
+                                     (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
+                                      (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
+                                    ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious) ) {
                                        goto check_busy;
                                }
 dont_return:
@@ -5281,7 +5128,7 @@ dont_return:
                                 * if we reach here, we're not to return
                                 * the page... go on to the next one
                                 */
-                               if (dst_page->laundry == TRUE) {
+                               if (dst_page->vmp_laundry == TRUE) {
                                        /*
                                         * if we get here, the page is not 'cleaning' (filtered out above).
                                         * since it has been referenced, remove it from the laundry
@@ -5292,7 +5139,7 @@ dont_return:
 
                                        vm_pageout_steal_laundry(dst_page, TRUE);
                                        vm_page_activate(dst_page);
-                                       
+
                                        vm_page_unlock_queues();
                                }
                                if (user_page_list)
@@ -5300,9 +5147,9 @@ dont_return:
 
                                goto try_next_page;
                        }
-check_busy:                    
-                       if (dst_page->busy) {
-                               if (cntrl_flags & UPL_NOBLOCK) {        
+check_busy:
+                       if (dst_page->vmp_busy) {
+                               if (cntrl_flags & UPL_NOBLOCK) {
                                        if (user_page_list)
                                                user_page_list[entry].phys_addr = 0;
                                        dwp->dw_mask = 0;
@@ -5317,11 +5164,11 @@ check_busy:
 
                                continue;
                        }
-                       if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+                       if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
 
                                vm_page_lockspin_queues();
 
-                               if (dst_page->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) {
+                               if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
                                        /*
                                         * we've buddied up a page for a clustered pageout
                                         * that has already been moved to the pageout
@@ -5333,24 +5180,8 @@ check_busy:
                                }
                                vm_page_unlock_queues();
                        }
-#if MACH_CLUSTER_STATS
-                       /*
-                        * pageout statistics gathering.  count
-                        * all the pages we will page out that
-                        * were not counted in the initial
-                        * vm_pageout_scan work
-                        */
-                       if (dst_page->pageout)
-                               encountered_lrp = TRUE;
-                       if ((dst_page->dirty || (object->internal && dst_page->precious))) {
-                               if (encountered_lrp)
-                                       CLUSTER_STAT(pages_at_higher_offsets++;)
-                               else
-                                       CLUSTER_STAT(pages_at_lower_offsets++;)
-                       }
-#endif
                        hw_dirty = refmod_state & VM_MEM_MODIFIED;
-                       dirty = hw_dirty ? TRUE : dst_page->dirty;
+                       dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
 
                        if (phys_page > upl->highest_page)
                                upl->highest_page = phys_page;
@@ -5364,15 +5195,23 @@ check_busy:
                                assert(pg_num == (dst_offset-offset)/PAGE_SIZE);
                                lite_list[pg_num>>5] |= 1 << (pg_num & 31);
 
-                               if (hw_dirty)
-                                       pmap_clear_modify(phys_page);
+                               if (hw_dirty) {
+                                       if (pmap_flushes_delayed == FALSE) {
+                                               pmap_flush_context_init(&pmap_flush_context_storage);
+                                               pmap_flushes_delayed = TRUE;
+                                       }
+                                       pmap_clear_refmod_options(phys_page,
+                                                                 VM_MEM_MODIFIED,
+                                                                 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
+                                                                 &pmap_flush_context_storage);
+                               }
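This hunk replaces the per-page pmap_clear_modify(), which forces an immediate TLB shootdown, with pmap_clear_refmod_options() plus PMAP_OPTIONS_NOFLUSH, accumulating the affected pages into a single pmap_flush_context. The payoff is one combined flush after the loop; the concluding pmap_flush() call is outside the hunks shown here, so the sketch below assumes it:

    pmap_flush_context pmap_flush_context_storage;

    pmap_flush_context_init(&pmap_flush_context_storage);

    /* ... for every dirty page being pulled into the UPL ... */
    pmap_clear_refmod_options(phys_page, VM_MEM_MODIFIED,
                              PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
                              &pmap_flush_context_storage);

    /* one combined shootdown once the whole range has been marked */
    pmap_flush(&pmap_flush_context_storage);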
 
                                /*
-                                * Mark original page as cleaning 
+                                * Mark original page as cleaning
                                 * in place.
                                 */
-                               dst_page->cleaning = TRUE;
-                               dst_page->precious = FALSE;
+                               dst_page->vmp_cleaning = TRUE;
+                               dst_page->vmp_precious = FALSE;
                        } else {
                                /*
                                 * use pageclean setup, it is more
@@ -5383,21 +5222,21 @@ check_busy:
                                vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
                                vm_object_unlock(upl->map_object);
 
-                               alias_page->absent = FALSE;
+                               alias_page->vmp_absent = FALSE;
                                alias_page = NULL;
                        }
                        if (dirty) {
                                SET_PAGE_DIRTY(dst_page, FALSE);
                        } else {
-                               dst_page->dirty = FALSE;
+                               dst_page->vmp_dirty = FALSE;
                        }
 
                        if (!dirty)
-                               dst_page->precious = TRUE;
+                               dst_page->vmp_precious = TRUE;
 
                        if ( !(cntrl_flags & UPL_CLEAN_IN_PLACE) ) {
                                if ( !VM_PAGE_WIRED(dst_page))
-                                       dst_page->free_when_done = TRUE;
+                                       dst_page->vmp_free_when_done = TRUE;
                        }
                } else {
                        if ((cntrl_flags & UPL_WILL_MODIFY) && object->copy != last_copy_object) {
@@ -5433,10 +5272,8 @@ check_busy:
                                                MEMORY_OBJECT_COPY_SYNC,
                                                VM_PROT_NO_CHANGE);
 
-#if DEVELOPMENT || DEBUG
-                                       upl_cow_again++;
-                                       upl_cow_again_pages += xfer_size >> PAGE_SHIFT;
-#endif
+                                       VM_PAGEOUT_DEBUG(upl_cow_again, 1);
+                                       VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
                                }
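
The upl_cow_again counters are no longer bumped inside an inline #if DEVELOPMENT || DEBUG block; the commit routes them through a VM_PAGEOUT_DEBUG(counter, amount) macro. The macro body is not shown in this diff, but it plausibly compiles to the increment on development/debug kernels and to nothing on release kernels, roughly as in this sketch (the definition below is an assumption, not the one in vm_pageout.h):

#include <stdio.h>

#ifndef DEVELOPMENT
#define DEVELOPMENT 1                  /* pretend this is a development kernel */
#endif

/* assumed shape of the macro, for illustration only */
#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_DEBUG(var, by) ((var) += (by))
#else
#define VM_PAGEOUT_DEBUG(var, by)      /* disappears on release kernels */
#endif

static unsigned long upl_cow_again;
static unsigned long upl_cow_again_pages;

int
main(void)
{
        unsigned int xfer_size = 8 * 4096, page_shift_sketch = 12;

        VM_PAGEOUT_DEBUG(upl_cow_again, 1);
        VM_PAGEOUT_DEBUG(upl_cow_again_pages, xfer_size >> page_shift_sketch);

        printf("%lu %lu\n", upl_cow_again, upl_cow_again_pages);
        return 0;
}
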
                                /*
                                 * remember the copy object we synced with
@@ -5444,7 +5281,7 @@ check_busy:
                                last_copy_object = object->copy;
                        }
                        dst_page = vm_page_lookup(object, dst_offset);
-                       
+
                        if (dst_page != VM_PAGE_NULL) {
 
                                if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
@@ -5456,11 +5293,11 @@ check_busy:
 
                                        goto try_next_page;
                                }
-                               if (dst_page->fictitious) {
+                               if (dst_page->vmp_fictitious) {
                                        panic("need corner case for fictitious page");
                                }
 
-                               if (dst_page->busy || dst_page->cleaning) {
+                               if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
                                        /*
                                         * someone else is playing with the
                                         * page.  We will have to wait.
@@ -5469,14 +5306,14 @@ check_busy:
 
                                        continue;
                                }
-                               if (dst_page->laundry)
+                               if (dst_page->vmp_laundry)
                                        vm_pageout_steal_laundry(dst_page, FALSE);
                        } else {
                                if (object->private) {
-                                       /* 
-                                        * This is a nasty wrinkle for users 
-                                        * of upl who encounter device or 
-                                        * private memory however, it is 
+                                       /*
+                                        * This is a nasty wrinkle for users
+                                        * of upl who encounter device or
+                                        * private memory however, it is
                                         * unavoidable, only a fault can
                                         * resolve the actual backing
                                         * physical page by asking the
@@ -5512,6 +5349,8 @@ check_busy:
                                         * need to allocate a page
                                         */
                                        dst_page = vm_page_grab_options(grab_options);
+                                       if (dst_page != VM_PAGE_NULL)
+                                               page_grab_count++;
                                }
                                if (dst_page == VM_PAGE_NULL) {
                                        if ( (cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
@@ -5531,7 +5370,7 @@ check_busy:
                                         * offset...
                                         */
                                        vm_object_unlock(object);
-                                       
+
                                        OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
 
                                        VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
@@ -5547,19 +5386,19 @@ check_busy:
                                }
                                vm_page_insert(dst_page, object, dst_offset);
 
-                               dst_page->absent = TRUE;
-                               dst_page->busy = FALSE;
+                               dst_page->vmp_absent = TRUE;
+                               dst_page->vmp_busy = FALSE;
 
                                if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
                                        /*
                                         * if UPL_RET_ONLY_ABSENT was specified,
                                         * than we're definitely setting up a
-                                        * upl for a clustered read/pagein 
+                                        * upl for a clustered read/pagein
                                         * operation... mark the pages as clustered
                                         * so upl_commit_range can put them on the
                                         * speculative list
                                         */
-                                       dst_page->clustered = TRUE;
+                                       dst_page->vmp_clustered = TRUE;
 
                                        if ( !(cntrl_flags & UPL_FILE_IO))
                                                VM_STAT_INCR(pageins);
@@ -5567,9 +5406,9 @@ check_busy:
                        }
                        phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
 
-                       dst_page->overwriting = TRUE;
+                       dst_page->vmp_overwriting = TRUE;
 
-                       if (dst_page->pmapped) {
+                       if (dst_page->vmp_pmapped) {
                                if ( !(cntrl_flags & UPL_FILE_IO))
                                        /*
                                         * eliminate all mappings from the
@@ -5582,7 +5421,7 @@ check_busy:
                                refmod_state = 0;
 
                        hw_dirty = refmod_state & VM_MEM_MODIFIED;
-                       dirty = hw_dirty ? TRUE : dst_page->dirty;
+                       dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
 
                        if (cntrl_flags & UPL_SET_LITE) {
                                unsigned int    pg_num;
@@ -5595,11 +5434,11 @@ check_busy:
                                        pmap_clear_modify(phys_page);
 
                                /*
-                                * Mark original page as cleaning 
+                                * Mark original page as cleaning
                                 * in place.
                                 */
-                               dst_page->cleaning = TRUE;
-                               dst_page->precious = FALSE;
+                               dst_page->vmp_cleaning = TRUE;
+                               dst_page->vmp_precious = FALSE;
                        } else {
                                /*
                                 * use pageclean setup, it is more
@@ -5610,7 +5449,7 @@ check_busy:
                                vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
                                vm_object_unlock(upl->map_object);
 
-                               alias_page->absent = FALSE;
+                               alias_page->vmp_absent = FALSE;
                                alias_page = NULL;
                        }
 
@@ -5630,17 +5469,17 @@ check_busy:
                                 */
                                upl->flags |= UPL_CLEAR_DIRTY;
                        }
-                       dst_page->dirty = dirty;
+                       dst_page->vmp_dirty = dirty;
 
                        if (!dirty)
-                               dst_page->precious = TRUE;
+                               dst_page->vmp_precious = TRUE;
 
                        if ( !VM_PAGE_WIRED(dst_page)) {
                                /*
                                 * deny access to the target page while
                                 * it is being worked on
                                 */
-                               dst_page->busy = TRUE;
+                               dst_page->vmp_busy = TRUE;
                        } else
                                dwp->dw_mask |= DW_vm_page_wire;
 
@@ -5648,8 +5487,8 @@ check_busy:
                         * We might be about to satisfy a fault which has been
                         * requested. So no need for the "restart" bit.
                         */
-                       dst_page->restart = FALSE;
-                       if (!dst_page->absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
+                       dst_page->vmp_restart = FALSE;
+                       if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
                                /*
                                 * expect the page to be used
                                 */
@@ -5658,15 +5497,15 @@ check_busy:
                        if (cntrl_flags & UPL_PRECIOUS) {
                                if (object->internal) {
                                        SET_PAGE_DIRTY(dst_page, FALSE);
-                                       dst_page->precious = FALSE;
+                                       dst_page->vmp_precious = FALSE;
                                } else {
-                                       dst_page->precious = TRUE;
+                                       dst_page->vmp_precious = TRUE;
                                }
                        } else {
-                               dst_page->precious = FALSE;
+                               dst_page->vmp_precious = FALSE;
                        }
                }
-               if (dst_page->busy)
+               if (dst_page->vmp_busy)
                        upl->flags |= UPL_HAS_BUSY;
 
                if (phys_page > upl->highest_page)
@@ -5674,19 +5513,19 @@ check_busy:
                assert (!pmap_is_noencrypt(phys_page));
                if (user_page_list) {
                        user_page_list[entry].phys_addr = phys_page;
-                       user_page_list[entry].free_when_done    = dst_page->free_when_done;
-                       user_page_list[entry].absent    = dst_page->absent;
-                       user_page_list[entry].dirty     = dst_page->dirty;
-                       user_page_list[entry].precious  = dst_page->precious;
+                       user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
+                       user_page_list[entry].absent    = dst_page->vmp_absent;
+                       user_page_list[entry].dirty     = dst_page->vmp_dirty;
+                       user_page_list[entry].precious  = dst_page->vmp_precious;
                        user_page_list[entry].device    = FALSE;
                        user_page_list[entry].needed    = FALSE;
-                       if (dst_page->clustered == TRUE)
-                               user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
+                       if (dst_page->vmp_clustered == TRUE)
+                               user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
                        else
                                user_page_list[entry].speculative = FALSE;
-                       user_page_list[entry].cs_validated = dst_page->cs_validated;
-                       user_page_list[entry].cs_tainted = dst_page->cs_tainted;
-                       user_page_list[entry].cs_nx = dst_page->cs_nx;
+                       user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
+                       user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
+                       user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
                        user_page_list[entry].mark      = FALSE;
                }
                /*
@@ -5700,9 +5539,9 @@ check_busy:
                        /*
                         * someone is explicitly grabbing this page...
                         * update clustered and speculative state
-                        * 
+                        *
                         */
-                       if (dst_page->clustered)
+                       if (dst_page->vmp_clustered)
                                VM_PAGE_CONSUME_CLUSTERED(dst_page);
                }
 try_next_page:
@@ -5729,6 +5568,8 @@ try_next_page:
        if (alias_page != NULL) {
                VM_PAGE_FREE(alias_page);
        }
+       if (pmap_flushes_delayed == TRUE)
+               pmap_flush(&pmap_flush_context_storage);
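
This pmap_flush() completes the change started in the earlier hunk (-5364/+5195): instead of flushing the MMU for every dirty page, the first dirty page lazily initializes a pmap_flush_context, each pmap_clear_refmod_options() call passes PMAP_OPTIONS_NOFLUSH so the flush is only recorded, and a single flush at the end pushes the whole batch out. A self-contained sketch of the same lazy-init-then-flush-once pattern (the flush_ctx type and helper names are invented for illustration, not the pmap API):

#include <stdbool.h>
#include <stdio.h>

/* toy stand-ins for pmap_flush_context and the pmap calls (illustrative only) */
struct flush_ctx { int pending; };

static void flush_ctx_init(struct flush_ctx *c) { c->pending = 0; }

static void record_clear_modified(struct flush_ctx *c, unsigned long page)
{
        /* the real code clears the modify bit with PMAP_OPTIONS_NOFLUSH and
         * remembers the work in the context; here we just count it */
        (void)page;
        c->pending++;
}

static void flush_all(struct flush_ctx *c)
{
        printf("one flush covering %d pages\n", c->pending);
        c->pending = 0;
}

int
main(void)
{
        struct flush_ctx ctx = { 0 };
        bool flushes_delayed = false;
        unsigned long pages[] = { 0x1000, 0x2000, 0x3000 };

        for (unsigned i = 0; i < sizeof(pages) / sizeof(pages[0]); i++) {
                bool hw_dirty = true;             /* pretend every page was modified */
                if (hw_dirty) {
                        if (!flushes_delayed) {   /* lazy one-time init */
                                flush_ctx_init(&ctx);
                                flushes_delayed = true;
                        }
                        record_clear_modified(&ctx, pages[i]);
                }
        }
        if (flushes_delayed)                      /* single flush after the loop */
                flush_all(&ctx);
        return 0;
}
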
 
        if (page_list_count != NULL) {
                if (upl->flags & UPL_INTERNAL)
@@ -5741,12 +5582,14 @@ try_next_page:
 #endif
        vm_object_unlock(object);
 
+       VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
+
        return KERN_SUCCESS;
 }
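
vm_object_upl_request() now counts the pages it pulls from the free list (the added page_grab_count++ after vm_page_grab_options() above) and reports the total in the VM_DEBUG_CONSTANT_EVENT trace point emitted with DBG_FUNC_END on the way out; a matching DBG_FUNC_START presumably opens the function, though that hunk is not shown here. A user-space sketch of the count-then-emit-at-exit pattern (the trace call is a stand-in, not kdebug):

#include <stdio.h>

#define DBG_FUNC_START 1
#define DBG_FUNC_END   2

/* stand-in for VM_DEBUG_CONSTANT_EVENT / kdebug tracing (illustrative only) */
static void trace_event(const char *name, int func_qual, int arg1)
{
        printf("%s %s arg1=%d\n", name,
               func_qual == DBG_FUNC_START ? "START" : "END", arg1);
}

static int grab_page(void) { return 1; }   /* pretend every grab succeeds */

static int
upl_request_sketch(int npages)
{
        int page_grab_count = 0;

        trace_event("vm_object_upl_request", DBG_FUNC_START, 0);
        for (int i = 0; i < npages; i++) {
                if (grab_page())
                        page_grab_count++;  /* only count real grabs */
        }
        trace_event("vm_object_upl_request", DBG_FUNC_END, page_grab_count);
        return 0;
}

int
main(void)
{
        return upl_request_sketch(16);
}
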
 
-/*  
+/*
  *     Routine:        vm_object_super_upl_request
- *     Purpose:        
+ *     Purpose:
  *             Cause the population of a portion of a vm_object
  *             in much the same way as memory_object_upl_request.
  *             Depending on the nature of the request, the pages
@@ -5904,6 +5747,7 @@ REDISCOVER_ENTRY:
        }
 
        if (!(caller_flags & UPL_COPYOUT_FROM) &&
+           !entry->is_sub_map &&
            !(entry->protection & VM_PROT_WRITE)) {
                vm_map_unlock_read(map);
                return KERN_PROTECTION_FAILURE;
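
vm_map_create_upl() gains an !entry->is_sub_map condition ahead of the write-permission test: a submap entry's own protection bits do not describe the pages, so the check is deferred to the recursive vm_map_create_upl() call made on the submap further down in this function. A minimal sketch of the reordered predicate (flag values and the struct layout are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

#define SK_PROT_WRITE    0x2   /* invented values, for the sketch only */
#define SK_COPYOUT_FROM  0x1

struct map_entry_sketch {
        bool is_sub_map;       /* entry points at a nested map, not an object */
        int  protection;       /* only meaningful for object-backed entries */
};

/* mirrors the edited check: submap entries are never rejected here */
static bool
upl_write_check_fails(int caller_flags, const struct map_entry_sketch *e)
{
        return !(caller_flags & SK_COPYOUT_FROM) &&
               !e->is_sub_map &&
               !(e->protection & SK_PROT_WRITE);
}

int
main(void)
{
        struct map_entry_sketch sub = { .is_sub_map = true,  .protection = 0 };
        struct map_entry_sketch obj = { .is_sub_map = false, .protection = 0 };

        printf("submap rejected? %d\n", upl_write_check_fails(0, &sub));  /* 0 */
        printf("object rejected? %d\n", upl_write_check_fails(0, &obj));  /* 1 */
        return 0;
}
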
@@ -6117,8 +5961,8 @@ REDISCOVER_ENTRY:
                vm_map_reference(submap);
                vm_map_unlock_read(map);
 
-               ret = vm_map_create_upl(submap, 
-                                       local_offset + (offset - local_start), 
+               ret = vm_map_create_upl(submap,
+                                       local_offset + (offset - local_start),
                                        upl_size, upl, page_list, count, flags, tag);
                vm_map_deallocate(submap);
 
@@ -6141,7 +5985,7 @@ REDISCOVER_ENTRY:
                                                ((offset - local_start) +
                                                 local_offset) +
                                                local_object->vo_shadow_offset),
-                                              *upl_size, FALSE, 
+                                              *upl_size, FALSE,
                                               MEMORY_OBJECT_DATA_SYNC,
                                               VM_PROT_NO_CHANGE);
                }
@@ -6163,7 +6007,7 @@ REDISCOVER_ENTRY:
                                        ((offset - local_start) +
                                         local_offset)),
                                       (vm_object_size_t)*upl_size,
-                                      FALSE, 
+                                      FALSE,
                                       MEMORY_OBJECT_DATA_SYNC,
                                       VM_PROT_NO_CHANGE);
 
@@ -6251,7 +6095,7 @@ REDISCOVER_ENTRY:
 
        vm_map_unlock_read(map);
 
-       ret = vm_object_iopl_request(local_object, 
+       ret = vm_object_iopl_request(local_object,
                                     ((vm_object_offset_t)
                                      ((offset - local_start) + local_offset)),
                                     *upl_size,
@@ -6267,14 +6111,14 @@ REDISCOVER_ENTRY:
 
 /*
  * Internal routine to enter a UPL into a VM map.
- * 
+ *
  * JMM - This should just be doable through the standard
  * vm_map_enter() API.
  */
 kern_return_t
 vm_map_enter_upl(
-       vm_map_t                map, 
-       upl_t                   upl, 
+       vm_map_t                map,
+       upl_t                   upl,
        vm_map_offset_t         *dst_addr)
 {
        vm_map_size_t           size;
@@ -6306,7 +6150,7 @@ vm_map_enter_upl(
                                mapped++;
                }
 
-               if(mapped) { 
+               if(mapped) {
                        if(mapped != valid_upls)
                                panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped\n", mapped, valid_upls);
                        else {
@@ -6360,7 +6204,7 @@ process_upl_to_enter:
                wpl_array_t             lite_list;
 
                if (upl->flags & UPL_INTERNAL) {
-                       lite_list = (wpl_array_t) 
+                       lite_list = (wpl_array_t)
                                ((((uintptr_t)upl) + sizeof(struct upl))
                                 + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
                } else {
@@ -6399,13 +6243,13 @@ process_upl_to_enter:
                                }
 
                                /*
-                                * Convert the fictitious page to a private 
+                                * Convert the fictitious page to a private
                                 * shadow of the real page.
                                 */
-                               assert(alias_page->fictitious);
-                               alias_page->fictitious = FALSE;
-                               alias_page->private = TRUE;
-                               alias_page->free_when_done = TRUE;
+                               assert(alias_page->vmp_fictitious);
+                               alias_page->vmp_fictitious = FALSE;
+                               alias_page->vmp_private = TRUE;
+                               alias_page->vmp_free_when_done = TRUE;
                                /*
                                 * since m is a page in the upl it must
                                 * already be wired or BUSY, so it's
@@ -6419,12 +6263,12 @@ process_upl_to_enter:
                                vm_page_lockspin_queues();
                                vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
                                vm_page_unlock_queues();
-                               
+
                                vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
 
-                               assert(!alias_page->wanted);
-                               alias_page->busy = FALSE;
-                               alias_page->absent = FALSE;
+                               assert(!alias_page->vmp_wanted);
+                               alias_page->vmp_busy = FALSE;
+                               alias_page->vmp_absent = FALSE;
                        }
                        size -= PAGE_SIZE;
                        offset += PAGE_SIZE_64;
@@ -6438,7 +6282,7 @@ process_upl_to_enter:
                offset = upl->offset - upl->map_object->paging_offset;
 
        size = upl->size;
-       
+
        vm_object_reference(upl->map_object);
 
        if(!isVectorUPL) {
@@ -6471,14 +6315,14 @@ process_upl_to_enter:
                m = vm_page_lookup(upl->map_object, offset);
 
                if (m) {
-                       m->pmapped = TRUE;
+                       m->vmp_pmapped = TRUE;
 
-                       /* CODE SIGNING ENFORCEMENT: page has been wpmapped, 
+                       /* CODE SIGNING ENFORCEMENT: page has been wpmapped,
                         * but only in kernel space. If this was on a user map,
                         * we'd have to set the wpmapped bit. */
-                       /* m->wpmapped = TRUE; */
+                       /* m->vmp_wpmapped = TRUE; */
                        assert(map->pmap == kernel_pmap);
-       
+
                        PMAP_ENTER(map->pmap, addr, m, VM_PROT_DEFAULT, VM_PROT_NONE, 0, TRUE, kr);
 
                        assert(kr == KERN_SUCCESS);
@@ -6497,7 +6341,7 @@ process_upl_to_enter:
        upl->flags |= UPL_PAGE_LIST_MAPPED;
        upl->kaddr = (vm_offset_t) *dst_addr;
        assert(upl->kaddr == *dst_addr);
-       
+
        if(isVectorUPL)
                goto process_upl_to_enter;
 
@@ -6505,7 +6349,7 @@ process_upl_to_enter:
 
        return KERN_SUCCESS;
 }
-       
+
 /*
  * Internal routine to remove a UPL mapping from a VM map.
  *
@@ -6518,7 +6362,7 @@ process_upl_to_enter:
  */
 kern_return_t
 vm_map_remove_upl(
-       vm_map_t        map, 
+       vm_map_t        map,
        upl_t           upl)
 {
        vm_address_t    addr;
@@ -6562,7 +6406,7 @@ process_upl_to_remove:
                        vm_offset_t v_upl_submap_dst_addr;
                        vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
 
-                       vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_NO_FLAGS);
+                       vm_map_remove(map, v_upl_submap_dst_addr, v_upl_submap_dst_addr + vector_upl->size, VM_MAP_REMOVE_NO_FLAGS);
                        vm_map_deallocate(v_upl_submap);
                        upl_unlock(vector_upl);
                        return KERN_SUCCESS;
@@ -6570,7 +6414,7 @@ process_upl_to_remove:
 
                upl =  vector_upl_subupl_byindex(vector_upl, curr_upl++ );
                if(upl == NULL)
-                       goto process_upl_to_remove;     
+                       goto process_upl_to_remove;
        }
 
        if (upl->flags & UPL_PAGE_LIST_MAPPED) {
@@ -6582,18 +6426,17 @@ process_upl_to_remove:
 
                upl->flags &= ~UPL_PAGE_LIST_MAPPED;
                upl->kaddr = (vm_offset_t) 0;
-               
+
                if(!isVectorUPL) {
                        upl_unlock(upl);
-               
+
                        vm_map_remove(
                                map,
                                vm_map_trunc_page(addr,
                                                  VM_MAP_PAGE_MASK(map)),
                                vm_map_round_page(addr + size,
                                                  VM_MAP_PAGE_MASK(map)),
-                               VM_MAP_NO_FLAGS);
-               
+                               VM_MAP_REMOVE_NO_FLAGS);
                        return KERN_SUCCESS;
                }
                else {
@@ -6601,7 +6444,7 @@ process_upl_to_remove:
                        * If it's a Vectored UPL, we'll be removing the entire
                        * submap anyways, so no need to remove individual UPL
                        * element mappings from within the submap
-                       */      
+                       */
                        goto process_upl_to_remove;
                }
        }
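
Both vm_map_remove() call sites in vm_map_remove_upl() switch from the generic VM_MAP_NO_FLAGS to VM_MAP_REMOVE_NO_FLAGS, which looks like part of a wider move in this release toward a dedicated VM_MAP_REMOVE_* flag namespace for the removal path. A sketch of why a dedicated flag set helps, with made-up names standing in for the real definitions:

#include <stdio.h>

/* Made-up stand-ins: a distinct flag type for removals keeps callers from
 * passing general map-creation flags where removal behaviour is meant. */
typedef enum {
        SK_MAP_REMOVE_NO_FLAGS      = 0x0,
        SK_MAP_REMOVE_KUNWIRE       = 0x1,
        SK_MAP_REMOVE_INTERRUPTIBLE = 0x2
} sk_map_remove_flags_t;

static int
sk_map_remove(unsigned long start, unsigned long end, sk_map_remove_flags_t flags)
{
        printf("remove [%#lx, %#lx) flags=%#x\n", start, end, (unsigned)flags);
        return 0;
}

int
main(void)
{
        /* mirrors the edited call: addr..addr+size with no special behaviour */
        return sk_map_remove(0x100000, 0x108000, SK_MAP_REMOVE_NO_FLAGS);
}
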
@@ -6613,13 +6456,13 @@ process_upl_to_remove:
 
 kern_return_t
 upl_commit_range(
-       upl_t                   upl, 
-       upl_offset_t            offset, 
+       upl_t                   upl,
+       upl_offset_t            offset,
        upl_size_t              size,
        int                     flags,
        upl_page_info_t         *page_list,
        mach_msg_type_number_t  count,
-       boolean_t               *empty) 
+       boolean_t               *empty)
 {
        upl_size_t              xfer_size, subupl_size = size;
        vm_object_t             shadow_object;
@@ -6685,7 +6528,7 @@ process_upl_to_commit:
 #if UPL_DEBUG
        if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
                (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
-               
+
                upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
                upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
 
@@ -6803,8 +6646,8 @@ process_upl_to_commit:
 
                        if (nxt_page != VM_PAGE_NULL) {
                                m = nxt_page;
-                               nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
-                               target_offset = m->offset;
+                               nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
+                               target_offset = m->vmp_offset;
                        }
                        pg_num = (unsigned int) (target_offset/PAGE_SIZE);
                        assert(pg_num == target_offset/PAGE_SIZE);
@@ -6820,7 +6663,7 @@ process_upl_to_commit:
                if (upl->flags & UPL_SHADOWED) {
                        if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
 
-                               t->free_when_done = FALSE;
+                               t->vmp_free_when_done = FALSE;
 
                                VM_PAGE_FREE(t);
 
@@ -6833,8 +6676,8 @@ process_upl_to_commit:
 
                m_object = VM_PAGE_OBJECT(m);
 
-               if (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
-                       assert(m->busy);
+               if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+                       assert(m->vmp_busy);
 
                        dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
                        goto commit_next_page;
@@ -6846,12 +6689,12 @@ process_upl_to_commit:
                         * Set the code signing bits according to
                         * what the UPL says they should be.
                         */
-                       m->cs_validated = page_list[entry].cs_validated;
-                       m->cs_tainted = page_list[entry].cs_tainted;
-                       m->cs_nx = page_list[entry].cs_nx;
+                       m->vmp_cs_validated = page_list[entry].cs_validated;
+                       m->vmp_cs_tainted = page_list[entry].cs_tainted;
+                       m->vmp_cs_nx = page_list[entry].cs_nx;
                }
                if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL)
-                       m->written_by_kernel = TRUE;
+                       m->vmp_written_by_kernel = TRUE;
 
                if (upl->flags & UPL_IO_WIRE) {
 
@@ -6861,10 +6704,10 @@ process_upl_to_commit:
                        if (flags & UPL_COMMIT_SET_DIRTY) {
                                SET_PAGE_DIRTY(m, FALSE);
                        } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
-                               m->dirty = FALSE;
+                               m->vmp_dirty = FALSE;
 
                                if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
-                                   m->cs_validated && !m->cs_tainted) {
+                                   m->vmp_cs_validated && !m->vmp_cs_tainted) {
                                        /*
                                         * CODE SIGNING:
                                         * This page is no longer dirty
@@ -6872,15 +6715,10 @@ process_upl_to_commit:
                                         * so it will need to be
                                         * re-validated.
                                         */
-                                       if (m->slid) {
-                                               panic("upl_commit_range(%p): page %p was slid\n",
-                                                     upl, m);
-                                       }
-                                       assert(!m->slid);
-                                       m->cs_validated = FALSE;
-#if DEVELOPMENT || DEBUG
-                                       vm_cs_validated_resets++;
-#endif
+                                       m->vmp_cs_validated = FALSE;
+
+                                       VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
+
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
                                }
                                clear_refmod |= VM_MEM_MODIFIED;
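
The slid-page panic and assert are dropped here; clearing the dirty bit on a code-signing-validated page now simply resets vmp_cs_validated, bumps the reset counter through VM_PAGEOUT_DEBUG as above, and disconnects the page's pmap mappings so the next access faults and re-runs validation. A toy sketch of that "clean implies revalidate" rule (the disconnect call is a stand-in, not the pmap interface):

#include <stdbool.h>
#include <stdio.h>

struct page_sketch {                   /* stand-in for the relevant vm_page bits */
        bool vmp_dirty;
        bool vmp_cs_validated;
        bool vmp_cs_tainted;
};

static void disconnect_mappings(struct page_sketch *p)
{
        /* the real code calls pmap_disconnect() on the physical page */
        (void)p;
        printf("mappings disconnected, next access faults\n");
}

/* commit-time cleaning, mirroring the UPL_COMMIT_CLEAR_DIRTY branch */
static void
clear_dirty_on_commit(struct page_sketch *p, bool commit_cs_validated)
{
        p->vmp_dirty = false;
        if (!commit_cs_validated && p->vmp_cs_validated && !p->vmp_cs_tainted) {
                /* page contents may have changed; force re-validation */
                p->vmp_cs_validated = false;
                disconnect_mappings(p);
        }
}

int
main(void)
{
        struct page_sketch p = { .vmp_dirty = true, .vmp_cs_validated = true };
        clear_dirty_on_commit(&p, false);
        printf("dirty=%d cs_validated=%d\n", p.vmp_dirty, p.vmp_cs_validated);
        return 0;
}
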
@@ -6896,33 +6734,33 @@ process_upl_to_commit:
                        if (fast_path_possible) {
                                assert(m_object->purgable != VM_PURGABLE_EMPTY);
                                assert(m_object->purgable != VM_PURGABLE_VOLATILE);
-                               if (m->absent) {
-                                       assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                                       assert(m->wire_count == 0);
-                                       assert(m->busy);
+                               if (m->vmp_absent) {
+                                       assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                                       assert(m->vmp_wire_count == 0);
+                                       assert(m->vmp_busy);
 
-                                       m->absent = FALSE;
+                                       m->vmp_absent = FALSE;
                                        dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
                                } else {
-                                       if (m->wire_count == 0)
+                                       if (m->vmp_wire_count == 0)
                                                panic("wire_count == 0, m = %p, obj = %p\n", m, shadow_object);
-                                       assert(m->vm_page_q_state == VM_PAGE_IS_WIRED);
+                                       assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
 
                                        /*
                                         * XXX FBDP need to update some other
                                         * counters here (purgeable_wired_count)
                                         * (ledgers), ...
                                         */
-                                       assert(m->wire_count > 0);
-                                       m->wire_count--;
+                                       assert(m->vmp_wire_count > 0);
+                                       m->vmp_wire_count--;
 
-                                       if (m->wire_count == 0) {
-                                               m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+                                       if (m->vmp_wire_count == 0) {
+                                               m->vmp_q_state = VM_PAGE_NOT_ON_Q;
                                                unwired_count++;
                                        }
                                }
-                               if (m->wire_count == 0) {
-                                       assert(m->pageq.next == 0 && m->pageq.prev == 0);
+                               if (m->vmp_wire_count == 0) {
+                                       assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
 
                                        if (last_local == VM_PAGE_NULL) {
                                                assert(first_local == VM_PAGE_NULL);
@@ -6932,22 +6770,22 @@ process_upl_to_commit:
                                        } else {
                                                assert(first_local != VM_PAGE_NULL);
 
-                                               m->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
-                                               first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
+                                               m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
+                                               first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
                                                first_local = m;
                                        }
                                        local_queue_count++;
 
                                        if (throttle_page) {
-                                               m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
+                                               m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
                                        } else {
                                                if (flags & UPL_COMMIT_INACTIVATE) {
                                                        if (shadow_object->internal)
-                                                               m->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
+                                                               m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
                                                        else
-                                                               m->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
+                                                               m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
                                                } else
-                                                       m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
+                                                       m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
                                        }
                                }
                        } else {
@@ -6955,11 +6793,11 @@ process_upl_to_commit:
                                        dwp->dw_mask |= DW_vm_page_deactivate_internal;
                                        clear_refmod |= VM_MEM_REFERENCED;
                                }
-                               if (m->absent) {
+                               if (m->vmp_absent) {
                                        if (flags & UPL_COMMIT_FREE_ABSENT)
                                                dwp->dw_mask |= DW_vm_page_free;
                                        else {
-                                               m->absent = FALSE;
+                                               m->vmp_absent = FALSE;
                                                dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
 
                                                if ( !(dwp->dw_mask & DW_vm_page_deactivate_internal))
@@ -6970,7 +6808,7 @@ process_upl_to_commit:
                        }
                        goto commit_next_page;
                }
-               assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
+               assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
 
                if (page_list)
                        page_list[entry].phys_addr = 0;
@@ -6983,18 +6821,18 @@ process_upl_to_commit:
                 * change of state
                 */
                if (flags & UPL_COMMIT_CLEAR_DIRTY) {
-                       m->dirty = FALSE;
+                       m->vmp_dirty = FALSE;
 
                        clear_refmod |= VM_MEM_MODIFIED;
                }
-               if (m->laundry)
+               if (m->vmp_laundry)
                        dwp->dw_mask |= DW_vm_pageout_throttle_up;
 
                if (VM_PAGE_WIRED(m))
-                       m->free_when_done = FALSE;
-               
+                       m->vmp_free_when_done = FALSE;
+
                if (! (flags & UPL_COMMIT_CS_VALIDATED) &&
-                   m->cs_validated && !m->cs_tainted) {
+                   m->vmp_cs_validated && !m->vmp_cs_tainted) {
                        /*
                         * CODE SIGNING:
                         * This page is no longer dirty
@@ -7002,27 +6840,22 @@ process_upl_to_commit:
                         * so it will need to be
                         * re-validated.
                         */
-                       if (m->slid) {
-                               panic("upl_commit_range(%p): page %p was slid\n",
-                                     upl, m);
-                       }
-                       assert(!m->slid);
-                       m->cs_validated = FALSE;
-#if DEVELOPMENT || DEBUG
-                       vm_cs_validated_resets++;
-#endif
+                       m->vmp_cs_validated = FALSE;
+
+                       VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
+
                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
                }
-               if (m->overwriting) {
+               if (m->vmp_overwriting) {
                        /*
                         * the (COPY_OUT_FROM == FALSE) request_page_list case
                         */
-                       if (m->busy) {
+                       if (m->vmp_busy) {
 #if CONFIG_PHANTOM_CACHE
-                               if (m->absent && !m_object->internal)
+                               if (m->vmp_absent && !m_object->internal)
                                        dwp->dw_mask |= DW_vm_phantom_cache_update;
 #endif
-                               m->absent = FALSE;
+                               m->vmp_absent = FALSE;
 
                                dwp->dw_mask |= DW_clear_busy;
                        } else {
@@ -7035,37 +6868,34 @@ process_upl_to_commit:
 
                                dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
                        }
-                       m->overwriting = FALSE;
+                       m->vmp_overwriting = FALSE;
                }
-               m->cleaning = FALSE;
+               m->vmp_cleaning = FALSE;
 
-               if (m->free_when_done) {
-                       /* 
+               if (m->vmp_free_when_done) {
+                       /*
                         * With the clean queue enabled, UPL_PAGEOUT should
-                        * no longer set the pageout bit. It's pages now go 
+                        * no longer set the pageout bit. It's pages now go
                         * to the clean queue.
                         */
                        assert(!(flags & UPL_PAGEOUT));
                        assert(!m_object->internal);
 
-                       m->free_when_done = FALSE;
-#if MACH_CLUSTER_STATS
-                       if (m->wanted) vm_pageout_target_collisions++;
-#endif
+                       m->vmp_free_when_done = FALSE;
+
                        if ((flags & UPL_COMMIT_SET_DIRTY) ||
-                           (m->pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
+                           (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
                                /*
                                 * page was re-dirtied after we started
-                                * the pageout... reactivate it since 
+                                * the pageout... reactivate it since
                                 * we don't know whether the on-disk
                                 * copy matches what is now in memory
                                 */
                                SET_PAGE_DIRTY(m, FALSE);
-                               
+
                                dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
 
                                if (upl->flags & UPL_PAGEOUT) {
-                                       CLUSTER_STAT(vm_pageout_target_page_dirtied++;)
                                        VM_STAT_INCR(reactivations);
                                        DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
                                }
@@ -7079,21 +6909,13 @@ process_upl_to_commit:
                                } else {
                                        DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
                                }
-                               m->dirty = FALSE;
-                               m->busy = TRUE;
+                               m->vmp_dirty = FALSE;
+                               m->vmp_busy = TRUE;
 
                                dwp->dw_mask |= DW_vm_page_free;
                        }
                        goto commit_next_page;
                }
-#if MACH_CLUSTER_STATS
-               if (m->wpmapped)
-                       m->dirty = pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m));
-
-               if (m->dirty)   vm_pageout_cluster_dirtied++;
-               else            vm_pageout_cluster_cleaned++;
-               if (m->wanted)  vm_pageout_cluster_collisions++;
-#endif
                /*
                 * It is a part of the semantic of COPYOUT_FROM
                 * UPLs that a commit implies cache sync
@@ -7102,24 +6924,23 @@ process_upl_to_commit:
                 * as well as clean
                 */
                if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS))
-                       m->precious = FALSE;
+                       m->vmp_precious = FALSE;
 
                if (flags & UPL_COMMIT_SET_DIRTY) {
                        SET_PAGE_DIRTY(m, FALSE);
                } else {
-                       m->dirty = FALSE;
+                       m->vmp_dirty = FALSE;
                }
 
                /* with the clean queue on, move *all* cleaned pages to the clean queue */
-               if (hibernate_cleaning_in_progress == FALSE && !m->dirty && (upl->flags & UPL_PAGEOUT)) {
+               if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
                        pgpgout_count++;
 
                        VM_STAT_INCR(pageouts);
                        DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
 
                        dwp->dw_mask |= DW_enqueue_cleaned;
-                       vm_pageout_enqueued_cleaned_from_inactive_dirty++;
-               } else if (should_be_throttled == TRUE && (m->vm_page_q_state == VM_PAGE_NOT_ON_Q)) {
+               } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
                        /*
                         * page coming back in from being 'frozen'...
                         * it was dirty before it was frozen, so keep it so
@@ -7130,14 +6951,14 @@ process_upl_to_commit:
                        dwp->dw_mask |= DW_vm_page_activate;
 
                } else {
-                       if ((flags & UPL_COMMIT_INACTIVATE) && !m->clustered && (m->vm_page_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
+                       if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
                                dwp->dw_mask |= DW_vm_page_deactivate_internal;
                                clear_refmod |= VM_MEM_REFERENCED;
                        } else if ( !VM_PAGE_PAGEABLE(m)) {
 
-                               if (m->clustered || (flags & UPL_COMMIT_SPECULATE))
+                               if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE))
                                        dwp->dw_mask |= DW_vm_page_speculate;
-                               else if (m->reference)
+                               else if (m->vmp_reference)
                                        dwp->dw_mask |= DW_vm_page_activate;
                                else {
                                        dwp->dw_mask |= DW_vm_page_deactivate_internal;
@@ -7172,13 +6993,13 @@ commit_next_page:
 
                                if (dw_count >= dw_limit) {
                                        vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
-                       
+
                                        dwp = &dw_array[0];
                                        dw_count = 0;
                                }
                        } else {
                                if (dwp->dw_mask & DW_clear_busy)
-                                       m->busy = FALSE;
+                                       m->vmp_busy = FALSE;
 
                                if (dwp->dw_mask & DW_PAGE_WAKEUP)
                                        PAGE_WAKEUP(m);
@@ -7220,11 +7041,11 @@ commit_next_page:
                                if (vm_page_queue_empty(target_queue))
                                        target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
                                else
-                                       first_target->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
+                                       first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
 
                                target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
-                               first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
-                               last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
+                               first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
+                               last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
 
                                /*
                                 * Adjust the global page counts.
@@ -7249,7 +7070,7 @@ commit_next_page:
                        } else {
                                vm_page_lockspin_queues();
                        }
-                       if (unwired_count) {    
+                       if (unwired_count) {
                                vm_page_wire_count -= unwired_count;
                                VM_CHECK_MEMORYSTATUS;
                        }
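
The fast-path hunks above first chain pages whose wire count drops to zero onto a local list (first_local/last_local, via the renamed vmp_pageq links) and then splice that whole chain onto the target page queue with a handful of pointer writes while the queue lock is held, also deducting unwired_count from vm_page_wire_count as just shown, instead of enqueuing pages one at a time. A self-contained sketch of the build-locally-then-splice idea on a plain circular doubly linked list (not the packed vm_page_queue encoding the kernel uses):

#include <stdio.h>

struct node {
        struct node *next, *prev;
        int id;
};

/* splice a non-empty local chain [first..last] onto the front of head's list */
static void
splice_front(struct node *head, struct node *first, struct node *last)
{
        last->next = head->next;
        head->next->prev = last;
        head->next = first;
        first->prev = head;
}

int
main(void)
{
        struct node head = { &head, &head, -1 };      /* empty circular queue */
        struct node pages[4];
        struct node *first_local = NULL, *last_local = NULL;
        int local_queue_count = 0;

        /* build the local chain without touching the shared queue */
        for (int i = 0; i < 4; i++) {
                pages[i].id = i;
                if (last_local == NULL) {
                        first_local = last_local = &pages[i];
                        pages[i].next = pages[i].prev = NULL;
                } else {
                        /* push onto the head of the local chain, as the commit does */
                        pages[i].next = first_local;
                        first_local->prev = &pages[i];
                        pages[i].prev = NULL;
                        first_local = &pages[i];
                }
                local_queue_count++;
        }

        /* one short critical section: attach the whole chain at once */
        splice_front(&head, first_local, last_local);

        for (struct node *n = head.next; n != &head; n = n->next)
                printf("page %d\n", n->id);
        printf("count=%d\n", local_queue_count);
        return 0;
}
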
@@ -7316,14 +7137,14 @@ commit_next_page:
        vm_object_unlock(shadow_object);
        if (object != shadow_object)
                vm_object_unlock(object);
-       
+
        if(!isVectorUPL)
                upl_unlock(upl);
        else {
-               /* 
+               /*
                 * If we completed our operations on an UPL that is
                 * part of a Vectored UPL and if empty is TRUE, then
-                * we should go ahead and deallocate this UPL element. 
+                * we should go ahead and deallocate this UPL element.
                 * Then we check if this was the last of the UPL elements
                 * within that Vectored UPL. If so, set empty to TRUE
                 * so that in ubc_upl_commit_range or ubc_upl_commit, we
@@ -7344,11 +7165,11 @@ commit_next_page:
 
 kern_return_t
 upl_abort_range(
-       upl_t                   upl, 
-       upl_offset_t            offset, 
+       upl_t                   upl,
+       upl_offset_t            offset,
        upl_size_t              size,
        int                     error,
-       boolean_t               *empty) 
+       boolean_t               *empty)
 {
        upl_page_info_t         *user_page_list = NULL;
        upl_size_t              xfer_size, subupl_size = size;
@@ -7403,7 +7224,7 @@ process_upl_to_abort:
 #if UPL_DEBUG
        if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
                (void) OSBacktrace(&upl->upl_commit_records[upl->upl_commit_index].c_retaddr[0], UPL_DEBUG_STACK_FRAMES);
-               
+
                upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
                upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
                upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
@@ -7425,13 +7246,13 @@ process_upl_to_abort:
                return KERN_FAILURE;
        }
        if (upl->flags & UPL_INTERNAL) {
-               lite_list = (wpl_array_t) 
+               lite_list = (wpl_array_t)
                        ((((uintptr_t)upl) + sizeof(struct upl))
                        + ((upl->size/PAGE_SIZE) * sizeof(upl_page_info_t)));
 
                user_page_list = (upl_page_info_t *) (((uintptr_t)upl) + sizeof(struct upl));
        } else {
-               lite_list = (wpl_array_t) 
+               lite_list = (wpl_array_t)
                        (((uintptr_t)upl) + sizeof(struct upl));
        }
        object = upl->map_object;
@@ -7494,7 +7315,7 @@ process_upl_to_abort:
                }
                if (upl->flags & UPL_SHADOWED) {
                        if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
-                               t->free_when_done = FALSE;
+                               t->vmp_free_when_done = FALSE;
 
                                VM_PAGE_FREE(t);
 
@@ -7507,9 +7328,9 @@ process_upl_to_abort:
 
                if (m != VM_PAGE_NULL) {
 
-                       assert(m->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
+                       assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
 
-                       if (m->absent) {
+                       if (m->vmp_absent) {
                                boolean_t must_free = TRUE;
 
                                /*
@@ -7518,22 +7339,22 @@ process_upl_to_abort:
                                 * be passed back to the pages customer
                                 */
                                if (error & UPL_ABORT_RESTART) {
-                                       m->restart = TRUE;
-                                       m->absent = FALSE;
-                                       m->unusual = TRUE;
+                                       m->vmp_restart = TRUE;
+                                       m->vmp_absent = FALSE;
+                                       m->vmp_unusual = TRUE;
                                        must_free = FALSE;
                                } else if (error & UPL_ABORT_UNAVAILABLE) {
-                                       m->restart = FALSE;
-                                       m->unusual = TRUE;
+                                       m->vmp_restart = FALSE;
+                                       m->vmp_unusual = TRUE;
                                        must_free = FALSE;
                                } else if (error & UPL_ABORT_ERROR) {
-                                       m->restart = FALSE;
-                                       m->absent = FALSE;
-                                       m->error = TRUE;
-                                       m->unusual = TRUE;
+                                       m->vmp_restart = FALSE;
+                                       m->vmp_absent = FALSE;
+                                       m->vmp_error = TRUE;
+                                       m->vmp_unusual = TRUE;
                                        must_free = FALSE;
                                }
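
For absent pages, upl_abort_range() translates the caller's abort code into page state: UPL_ABORT_RESTART re-arms the page (vmp_restart set, vmp_absent cleared), UPL_ABORT_UNAVAILABLE marks it unusual, UPL_ABORT_ERROR additionally sets vmp_error, and in all three cases the page is kept rather than freed, unless the clustered/needed check just below overrides that. A compact sketch of the mapping (flag values and the state struct are invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

#define SK_ABORT_RESTART     0x1      /* invented values, for the sketch only */
#define SK_ABORT_UNAVAILABLE 0x2
#define SK_ABORT_ERROR       0x4

struct absent_page {                   /* stand-in for the relevant vm_page bits */
        bool vmp_restart, vmp_absent, vmp_error, vmp_unusual;
};

/* returns whether the absent page must still be freed after the abort */
static bool
apply_abort(struct absent_page *m, int error)
{
        bool must_free = true;

        if (error & SK_ABORT_RESTART) {
                m->vmp_restart = true;  m->vmp_absent = false;
                m->vmp_unusual = true;  must_free = false;
        } else if (error & SK_ABORT_UNAVAILABLE) {
                m->vmp_restart = false; m->vmp_unusual = true;
                must_free = false;
        } else if (error & SK_ABORT_ERROR) {
                m->vmp_restart = false; m->vmp_absent = false;
                m->vmp_error = true;    m->vmp_unusual = true;
                must_free = false;
        }
        return must_free;
}

int
main(void)
{
        struct absent_page m = { .vmp_absent = true };
        bool must_free = apply_abort(&m, SK_ABORT_ERROR);

        printf("must_free=%d error=%d\n", must_free, m.vmp_error);
        return 0;
}
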
-                               if (m->clustered && needed == FALSE) {
+                               if (m->vmp_clustered && needed == FALSE) {
                                        /*
                                         * This page was a part of a speculative
                                         * read-ahead initiated by the kernel
@@ -7545,14 +7366,14 @@ process_upl_to_abort:
                                         */
                                        must_free = TRUE;
                                }
-                               m->cleaning = FALSE;
+                               m->vmp_cleaning = FALSE;
 
-                               if (m->overwriting && !m->busy) {
+                               if (m->vmp_overwriting && !m->vmp_busy) {
                                        /*
                                         * this shouldn't happen since
                                         * this is an 'absent' page, but
                                         * it doesn't hurt to check for
-                                        * the 'alternate' method of 
+                                        * the 'alternate' method of
                                         * stabilizing the page...
                                         * we will mark 'busy' to be cleared
                                         * in the following code which will
@@ -7561,7 +7382,7 @@ process_upl_to_abort:
                                         */
                                        dwp->dw_mask |= DW_vm_page_unwire;
                                }
-                               m->overwriting = FALSE;
+                               m->vmp_overwriting = FALSE;
 
                                dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
 
@@ -7570,10 +7391,10 @@ process_upl_to_abort:
                                else
                                        dwp->dw_mask |= DW_vm_page_activate;
                        } else {
-                               /*                          
+                               /*
                                 * Handle the trusted pager throttle.
-                                */                     
-                               if (m->laundry)
+                                */
+                               if (m->vmp_laundry)
                                        dwp->dw_mask |= DW_vm_pageout_throttle_up;
 
                                if (upl->flags & UPL_ACCESS_BLOCKED) {
@@ -7584,8 +7405,8 @@ process_upl_to_abort:
                                         */
                                        dwp->dw_mask |= DW_clear_busy;
                                }
-                               if (m->overwriting) {
-                                       if (m->busy)
+                               if (m->vmp_overwriting) {
+                                       if (m->vmp_busy)
                                                dwp->dw_mask |= DW_clear_busy;
                                        else {
                                                /*
@@ -7599,10 +7420,10 @@ process_upl_to_abort:
                                                 */
                                                dwp->dw_mask |= DW_vm_page_unwire;
                                        }
-                                       m->overwriting = FALSE;
+                                       m->vmp_overwriting = FALSE;
                                }
-                               m->free_when_done = FALSE;
-                               m->cleaning = FALSE;
+                               m->vmp_free_when_done = FALSE;
+                               m->vmp_cleaning = FALSE;
 
                                if (error & UPL_ABORT_DUMP_PAGES) {
                                        pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
@@ -7613,7 +7434,7 @@ process_upl_to_abort:
                                                if (error & UPL_ABORT_REFERENCE) {
                                                        /*
                                                         * we've been told to explictly
-                                                        * reference this page... for 
+                                                        * reference this page... for
                                                         * file I/O, this is done by
                                                         * implementing an LRU on the inactive q
                                                         */
@@ -7637,13 +7458,13 @@ abort_next_page:
 
                                if (dw_count >= dw_limit) {
                                        vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, &dw_array[0], dw_count);
-                               
+
                                        dwp = &dw_array[0];
                                        dw_count = 0;
                                }
                        } else {
                                if (dwp->dw_mask & DW_clear_busy)
-                                       m->busy = FALSE;
+                                       m->vmp_busy = FALSE;
 
                                if (dwp->dw_mask & DW_PAGE_WAKEUP)
                                        PAGE_WAKEUP(m);
@@ -7707,14 +7528,14 @@ abort_next_page:
        vm_object_unlock(shadow_object);
        if (object != shadow_object)
                vm_object_unlock(object);
-       
+
        if(!isVectorUPL)
                upl_unlock(upl);
        else {
-               /* 
+               /*
                * If we completed our operations on an UPL that is
                * part of a Vectored UPL and if empty is TRUE, then
-               * we should go ahead and deallocate this UPL element. 
+               * we should go ahead and deallocate this UPL element.
                * Then we check if this was the last of the UPL elements
                * within that Vectored UPL. If so, set empty to TRUE
                * so that in ubc_upl_abort_range or ubc_upl_abort, we
@@ -7785,7 +7606,7 @@ iopl_valid_data(
                panic("iopl_valid_data: object == kernel or compressor");
 
        if (object->purgable == VM_PURGABLE_VOLATILE ||
-           object->purgable == VM_PURGABLE_EMPTY) 
+           object->purgable == VM_PURGABLE_EMPTY)
                panic("iopl_valid_data: object %p purgable %d",
                      object, object->purgable);
 
@@ -7803,7 +7624,7 @@ iopl_valid_data(
 
                if (nxt_page != VM_PAGE_NULL) {
                        m = nxt_page;
-                       nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->listq);
+                       nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
                } else {
                        m = vm_page_lookup(object, offset);
                        offset += PAGE_SIZE;
@@ -7811,29 +7632,29 @@ iopl_valid_data(
                        if (m == VM_PAGE_NULL)
                                panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
                }
-               if (m->busy) {
-                       if (!m->absent)
+               if (m->vmp_busy) {
+                       if (!m->vmp_absent)
                                panic("iopl_valid_data: busy page w/o absent");
 
-                       if (m->pageq.next || m->pageq.prev)
+                       if (m->vmp_pageq.next || m->vmp_pageq.prev)
                                panic("iopl_valid_data: busy+absent page on page queue");
-                       if (m->reusable) {
+                       if (m->vmp_reusable) {
                                panic("iopl_valid_data: %p is reusable", m);
                        }
 
-                       m->absent = FALSE;
-                       m->dirty = TRUE;
-                       assert(m->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                       assert(m->wire_count == 0);
-                       m->wire_count++;
-                       assert(m->wire_count);
-                       if (m->wire_count == 1) {
-                               m->vm_page_q_state = VM_PAGE_IS_WIRED;
+                       m->vmp_absent = FALSE;
+                       m->vmp_dirty = TRUE;
+                       assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                       assert(m->vmp_wire_count == 0);
+                       m->vmp_wire_count++;
+                       assert(m->vmp_wire_count);
+                       if (m->vmp_wire_count == 1) {
+                               m->vmp_q_state = VM_PAGE_IS_WIRED;
                                wired_count++;
                        } else {
                                panic("iopl_valid_data: %p already wired\n", m);
                        }
-                       
+
                        PAGE_WAKEUP_DONE(m);
                }
                size -= PAGE_SIZE;
@@ -7874,7 +7695,7 @@ vm_object_set_pmap_cache_attr(
 
 
 boolean_t      vm_object_iopl_wire_full(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t);
-kern_return_t  vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int);
+kern_return_t  vm_object_iopl_wire_empty(vm_object_t, upl_t, upl_page_info_array_t, wpl_array_t, upl_control_flags_t, vm_tag_t, vm_object_offset_t *, int, int*);
 
 
 
@@ -7903,31 +7724,31 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us
 
        while (page_count--) {
 
-               if (dst_page->busy ||
-                   dst_page->fictitious ||
-                   dst_page->absent ||
-                   dst_page->error ||
-                   dst_page->cleaning ||
-                   dst_page->restart ||
-                   dst_page->laundry) {
+               if (dst_page->vmp_busy ||
+                   dst_page->vmp_fictitious ||
+                   dst_page->vmp_absent ||
+                   dst_page->vmp_error ||
+                   dst_page->vmp_cleaning ||
+                   dst_page->vmp_restart ||
+                   dst_page->vmp_laundry) {
                        retval = FALSE;
                        goto done;
                }
-               if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
+               if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
                        retval = FALSE;
                        goto done;
                }
-               dst_page->reference = TRUE;
+               dst_page->vmp_reference = TRUE;
 
                vm_page_wire(dst_page, tag, FALSE);
 
                if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
                        SET_PAGE_DIRTY(dst_page, FALSE);
                }
-               entry = (unsigned int)(dst_page->offset / PAGE_SIZE);
+               entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
                assert(entry >= 0 && entry < object->resident_page_count);
                lite_list[entry>>5] |= 1 << (entry & 31);
-               
+
                phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
 
                if (phys_page > upl->highest_page)
@@ -7935,10 +7756,10 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us
 
                if (user_page_list) {
                        user_page_list[entry].phys_addr = phys_page;
-                       user_page_list[entry].absent    = dst_page->absent;
-                       user_page_list[entry].dirty     = dst_page->dirty;
-                       user_page_list[entry].free_when_done   = dst_page->free_when_done;
-                       user_page_list[entry].precious  = dst_page->precious;
+                       user_page_list[entry].absent    = dst_page->vmp_absent;
+                       user_page_list[entry].dirty     = dst_page->vmp_dirty;
+                       user_page_list[entry].free_when_done   = dst_page->vmp_free_when_done;
+                       user_page_list[entry].precious  = dst_page->vmp_precious;
                        user_page_list[entry].device    = FALSE;
                        user_page_list[entry].speculative = FALSE;
                        user_page_list[entry].cs_validated = FALSE;
@@ -7953,7 +7774,7 @@ vm_object_iopl_wire_full(vm_object_t object, upl_t upl, upl_page_info_array_t us
 
                        VM_CHECK_MEMORYSTATUS;
                }
-               dst_page = (vm_page_t)vm_page_queue_next(&dst_page->listq);
+               dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
        }
 done:
        vm_page_unlock_queues();
@@ -7966,7 +7787,8 @@ done:
 
 kern_return_t
 vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t user_page_list,
-                            wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset, int page_count)
+                            wpl_array_t lite_list, upl_control_flags_t cntrl_flags, vm_tag_t tag, vm_object_offset_t *dst_offset,
+                            int page_count, int* page_grab_count)
 {
        vm_page_t       dst_page;
        boolean_t       no_zero_fill = FALSE;
@@ -8002,7 +7824,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
 #endif /* CONFIG_SECLUDED_MEMORY */
 
        while (page_count--) {
-                       
+
                while ((dst_page = vm_page_grab_options(grab_options))
                       == VM_PAGE_NULL) {
 
@@ -8017,7 +7839,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
                                OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
 
                                VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
-                               
+
                                ret = MACH_SEND_INTERRUPTED;
                                goto done;
                        }
@@ -8028,19 +7850,19 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
                if (no_zero_fill == FALSE)
                        vm_page_zero_fill(dst_page);
                else
-                       dst_page->absent = TRUE;
+                       dst_page->vmp_absent = TRUE;
 
-               dst_page->reference = TRUE;
+               dst_page->vmp_reference = TRUE;
 
                if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
-                       SET_PAGE_DIRTY(dst_page, FALSE);        
-               }
-               if (dst_page->absent == FALSE) {
-                       assert(dst_page->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                       assert(dst_page->wire_count == 0);
-                       dst_page->wire_count++;
-                       dst_page->vm_page_q_state = VM_PAGE_IS_WIRED;
-                       assert(dst_page->wire_count);
+                       SET_PAGE_DIRTY(dst_page, FALSE);
+               }
+               if (dst_page->vmp_absent == FALSE) {
+                       assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                       assert(dst_page->vmp_wire_count == 0);
+                       dst_page->vmp_wire_count++;
+                       dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
+                       assert(dst_page->vmp_wire_count);
                        pages_wired++;
                        PAGE_WAKEUP_DONE(dst_page);
                }
@@ -8049,7 +7871,7 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
                vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
 
                lite_list[entry>>5] |= 1 << (entry & 31);
-               
+
                phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
 
                if (phys_page > upl->highest_page)
@@ -8057,8 +7879,8 @@ vm_object_iopl_wire_empty(vm_object_t object, upl_t upl, upl_page_info_array_t u
 
                if (user_page_list) {
                        user_page_list[entry].phys_addr = phys_page;
-                       user_page_list[entry].absent    = dst_page->absent;
-                       user_page_list[entry].dirty     = dst_page->dirty;
+                       user_page_list[entry].absent    = dst_page->vmp_absent;
+                       user_page_list[entry].dirty     = dst_page->vmp_dirty;
                        user_page_list[entry].free_when_done    = FALSE;
                        user_page_list[entry].precious  = FALSE;
                        user_page_list[entry].device    = FALSE;
@@ -8087,25 +7909,41 @@ done:
        }
        if (delayed_ledger_update) {
                task_t          owner;
+               int             ledger_idx_volatile;
+               int             ledger_idx_nonvolatile;
+               int             ledger_idx_volatile_compressed;
+               int             ledger_idx_nonvolatile_compressed;
+               boolean_t       do_footprint;
 
-               owner = object->vo_purgeable_owner;
+               owner = VM_OBJECT_OWNER(object);
                assert(owner);
 
+               vm_object_ledger_tag_ledgers(object,
+                                            &ledger_idx_volatile,
+                                            &ledger_idx_nonvolatile,
+                                            &ledger_idx_volatile_compressed,
+                                            &ledger_idx_nonvolatile_compressed,
+                                            &do_footprint);
+
                /* more non-volatile bytes */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_nonvolatile,
-                             delayed_ledger_update);
-               /* more footprint */
-               ledger_credit(owner->ledger,
-                             task_ledgers.phys_footprint,
+                             ledger_idx_nonvolatile,
                              delayed_ledger_update);
+               if (do_footprint) {
+                       /* more footprint */
+                       ledger_credit(owner->ledger,
+                                     task_ledgers.phys_footprint,
+                                     delayed_ledger_update);
+               }
        }
+
+       assert(page_grab_count);
+       *page_grab_count = pages_inserted;
+
        return (ret);
 }
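
[Editor's note] Two changes land in vm_object_iopl_wire_empty() here. First, the deferred ledger credit no longer assumes a purgeable owner: the owner now comes from VM_OBJECT_OWNER(), vm_object_ledger_tag_ledgers() picks the ledger indices (volatile, nonvolatile and their compressed variants) appropriate to the object's ledger tag, and the phys_footprint credit is applied only when do_footprint is set. Second, the helper reports how many pages it actually grabbed through the new page_grab_count out-parameter, which its caller, vm_object_iopl_request(), feeds into the VM_IOPL_REQUEST trace events added below. A minimal standalone sketch of the same shape -- the types and helpers here are stand-ins, not the kernel's:

    #include <stdio.h>
    #include <stdbool.h>

    /* Stand-ins for the kernel's ledger machinery (illustrative only). */
    typedef int ledger_idx_t;

    static void ledger_credit(ledger_idx_t idx, long amount) {
            printf("credit ledger %d by %ld\n", idx, amount);
    }

    /* Pick ledger indices for an object and whether it counts against footprint. */
    static void pick_ledgers(int obj_tag, ledger_idx_t *nonvolatile, bool *do_footprint) {
            *nonvolatile  = obj_tag;          /* placeholder mapping */
            *do_footprint = (obj_tag != 0);
    }

    static int wire_empty(int pages_to_insert, int *page_grab_count) {
            int          pages_inserted = pages_to_insert;  /* pretend every grab succeeded */
            ledger_idx_t nonvolatile;
            bool         do_footprint;

            pick_ledgers(/*obj_tag*/ 1, &nonvolatile, &do_footprint);
            ledger_credit(nonvolatile, pages_inserted);
            if (do_footprint)
                    ledger_credit(/*phys_footprint*/ 99, pages_inserted);

            *page_grab_count = pages_inserted;  /* report work done to the caller */
            return 0;
    }

    int main(void) {
            int grabbed = 0;
            wire_empty(4, &grabbed);
            printf("grabbed %d pages\n", grabbed);
            return 0;
    }
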
 
 
-unsigned int vm_object_iopl_request_sleep_for_cleaning = 0;
-
 
 kern_return_t
 vm_object_iopl_request(
@@ -8126,10 +7964,11 @@ vm_object_iopl_request(
        wpl_array_t             lite_list = NULL;
        int                     no_zero_fill = FALSE;
        unsigned int            size_in_pages;
+       int                     page_grab_count = 0;
        u_int32_t               psize;
        kern_return_t           ret;
        vm_prot_t               prot;
-       struct vm_object_fault_info fault_info;
+       struct vm_object_fault_info fault_info = {};
        struct  vm_page_delayed_work    dw_array[DEFAULT_DELAYED_WORK_LIMIT];
        struct  vm_page_delayed_work    *dwp;
        int                     dw_count;
@@ -8162,7 +8001,7 @@ vm_object_iopl_request(
                if (object->phys_contiguous) {
                        if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address)
                                return KERN_INVALID_ADDRESS;
-             
+
                        if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address)
                                return KERN_INVALID_ADDRESS;
                }
@@ -8178,6 +8017,8 @@ vm_object_iopl_request(
        if ((!object->internal) && (object->paging_offset != 0))
                panic("vm_object_iopl_request: external object with non-zero paging offset\n");
 
+       VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
+
 #if CONFIG_IOSCHED || UPL_DEBUG
        if ((object->io_tracking && object != kernel_object) || upl_debug_enabled)
                io_tracking_flag |= UPL_CREATE_IO_TRACKING;
@@ -8218,6 +8059,14 @@ vm_object_iopl_request(
                user_page_list[0].device = FALSE;
        *upl_ptr = upl;
 
+       if (cntrl_flags & UPL_NOZEROFILLIO) {
+               DTRACE_VM4(upl_nozerofillio,
+                          vm_object_t, object,
+                          vm_object_offset_t, offset,
+                          upl_size_t, size,
+                          upl_t, upl);
+       }
+
        upl->map_object = object;
        upl->size = size;
 
@@ -8282,6 +8131,8 @@ vm_object_iopl_request(
                        else
                                *page_list_count = 1;
                }
+
+               VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
                return KERN_SUCCESS;
        }
        if (object != kernel_object && object != compressor_object) {
@@ -8338,10 +8189,8 @@ vm_object_iopl_request(
                                 FALSE, /* should_return */
                                 MEMORY_OBJECT_COPY_SYNC,
                                 VM_PROT_NO_CHANGE);
-#if DEVELOPMENT || DEBUG
-               iopl_cow++;
-               iopl_cow_pages += size >> PAGE_SHIFT;
-#endif
+               VM_PAGEOUT_DEBUG(iopl_cow, 1);
+               VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
        }
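
[Editor's note] The iopl_cow / iopl_cow_pages counters, previously bumped inline under #if DEVELOPMENT || DEBUG, now go through the VM_PAGEOUT_DEBUG() macro that this commit adds to vm_pageout.h: on development and debug kernels it adds the given value to the named field of the vm_pageout_debug struct, and on release kernels it expands to nothing, so the call site no longer needs its own preprocessor guard. A compile-away model of the macro appears after its definition in the header hunk further down.
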
        if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
            object->purgable != VM_PURGABLE_VOLATILE &&
@@ -8393,8 +8242,8 @@ vm_object_iopl_request(
                        ret = KERN_MEMORY_ERROR;
                        goto return_err;
                }
-               ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages);
-               
+               ret = vm_object_iopl_wire_empty(object, upl, user_page_list, lite_list, cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
+
                if (ret) {
                        free_wired_pages = TRUE;
                        goto return_err;
@@ -8403,13 +8252,8 @@ vm_object_iopl_request(
        }
 
        fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
-       fault_info.user_tag  = 0;
        fault_info.lo_offset = offset;
        fault_info.hi_offset = offset + xfer_size;
-       fault_info.no_cache  = FALSE;
-       fault_info.stealth = FALSE;
-       fault_info.io_sync = FALSE;
-       fault_info.cs_bypass = FALSE;
        fault_info.mark_zf_absent = TRUE;
        fault_info.interruptible = interruptible;
        fault_info.batch_pmap_op = TRUE;
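
[Editor's note] The per-field clears of fault_info dropped here are covered by the change near the top of vm_object_iopl_request(), where the local is now declared as "struct vm_object_fault_info fault_info = {};" -- the empty initializer zero-fills the whole structure, so only members that need non-zero values are assigned afterwards. A small standalone illustration of the idiom, assuming a stand-in struct (the {} form is a clang/C23 spelling; strictly conforming older C would use {0}):

    #include <assert.h>
    #include <stdbool.h>

    struct fault_info_like {                 /* stand-in, not the kernel struct */
            int  user_tag;
            bool no_cache, stealth, io_sync, cs_bypass;
            bool mark_zf_absent;
    };

    int main(void) {
            struct fault_info_like fi = {};  /* every member starts out zero/false */
            fi.mark_zf_absent = true;        /* only set what must differ from zero */

            assert(fi.user_tag == 0 && !fi.no_cache && !fi.stealth);
            assert(!fi.io_sync && !fi.cs_bypass && fi.mark_zf_absent);
            return 0;
    }
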
@@ -8437,11 +8281,11 @@ vm_object_iopl_request(
                dst_page = vm_page_lookup(object, dst_offset);
 
                if (dst_page == VM_PAGE_NULL ||
-                   dst_page->busy ||
-                   dst_page->error || 
-                   dst_page->restart ||
-                   dst_page->absent ||
-                   dst_page->fictitious) {
+                   dst_page->vmp_busy ||
+                   dst_page->vmp_error ||
+                   dst_page->vmp_restart ||
+                   dst_page->vmp_absent ||
+                   dst_page->vmp_fictitious) {
 
                   if (object == kernel_object)
                           panic("vm_object_iopl_request: missing/bad page in kernel object\n");
@@ -8484,15 +8328,16 @@ vm_object_iopl_request(
                        switch (result) {
 
                        case VM_FAULT_SUCCESS:
+                               page_grab_count++;
 
-                               if ( !dst_page->absent) {
+                               if ( !dst_page->vmp_absent) {
                                        PAGE_WAKEUP_DONE(dst_page);
                                } else {
                                        /*
                                         * we only get back an absent page if we
                                         * requested that it not be zero-filled
                                         * because we are about to fill it via I/O
-                                        * 
+                                        *
                                         * absent pages should be left BUSY
                                         * to prevent them from being faulted
                                         * into an address space before we've
@@ -8509,11 +8354,11 @@ vm_object_iopl_request(
                                        vm_object_t local_object;
 
                                        local_object = VM_PAGE_OBJECT(top_page);
-                                       
+
                                        /*
                                         * comparing 2 packed pointers
                                         */
-                                       if (top_page->vm_page_object != dst_page->vm_page_object) {
+                                       if (top_page->vmp_object != dst_page->vmp_object) {
                                                vm_object_lock(local_object);
                                                VM_PAGE_FREE(top_page);
                                                vm_object_paging_end(local_object);
@@ -8525,7 +8370,7 @@ vm_object_iopl_request(
                                }
                                vm_object_paging_end(object);
                                break;
-                       
+
                        case VM_FAULT_RETRY:
                                vm_object_lock(object);
                                break;
@@ -8576,12 +8421,12 @@ vm_object_iopl_request(
                if (upl->flags & UPL_KERNEL_OBJECT)
                        goto record_phys_addr;
 
-               if (dst_page->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
-                       dst_page->busy = TRUE;
+               if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+                       dst_page->vmp_busy = TRUE;
                        goto record_phys_addr;
                }
 
-               if (dst_page->cleaning) {
+               if (dst_page->vmp_cleaning) {
                        /*
                         * Someone else is cleaning this page in place.
                         * In theory, we should be able to  proceed and use this
@@ -8592,11 +8437,11 @@ vm_object_iopl_request(
                         * We'd better wait for the cleaning to complete and
                         * then try again.
                         */
-                       vm_object_iopl_request_sleep_for_cleaning++;
+                       VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
                        PAGE_SLEEP(object, dst_page, THREAD_UNINT);
                        continue;
                }
-               if (dst_page->laundry)
+               if (dst_page->vmp_laundry)
                        vm_pageout_steal_laundry(dst_page, FALSE);
 
                if ( (cntrl_flags & UPL_NEED_32BIT_ADDR) &&
@@ -8629,20 +8474,20 @@ vm_object_iopl_request(
                         * it after we disconnect it... we want the fault
                         * to find the new page being substituted.
                         */
-                       if (dst_page->pmapped)
+                       if (dst_page->vmp_pmapped)
                                refmod = pmap_disconnect(phys_page);
                        else
                                refmod = 0;
 
-                       if (!dst_page->absent)
+                       if (!dst_page->vmp_absent)
                                vm_page_copy(dst_page, low_page);
-                 
-                       low_page->reference = dst_page->reference;
-                       low_page->dirty     = dst_page->dirty;
-                       low_page->absent    = dst_page->absent;
+
+                       low_page->vmp_reference = dst_page->vmp_reference;
+                       low_page->vmp_dirty     = dst_page->vmp_dirty;
+                       low_page->vmp_absent    = dst_page->vmp_absent;
 
                        if (refmod & VM_MEM_REFERENCED)
-                               low_page->reference = TRUE;
+                               low_page->vmp_reference = TRUE;
                        if (refmod & VM_MEM_MODIFIED) {
                                SET_PAGE_DIRTY(low_page, FALSE);
                        }
@@ -8655,12 +8500,12 @@ vm_object_iopl_request(
                         * BUSY... we don't need a PAGE_WAKEUP_DONE
                         * here, because we've never dropped the object lock
                         */
-                       if ( !dst_page->absent)
-                               dst_page->busy = FALSE;
+                       if ( !dst_page->vmp_absent)
+                               dst_page->vmp_busy = FALSE;
 
                        phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
                }
-               if ( !dst_page->busy)
+               if ( !dst_page->vmp_busy)
                        dwp->dw_mask |= DW_vm_page_wire;
 
                if (cntrl_flags & UPL_BLOCK_ACCESS) {
@@ -8670,8 +8515,8 @@ vm_object_iopl_request(
                         * We'll also remove the mapping
                         * of all these pages before leaving this routine.
                         */
-                       assert(!dst_page->fictitious);
-                       dst_page->busy = TRUE;
+                       assert(!dst_page->vmp_fictitious);
+                       dst_page->vmp_busy = TRUE;
                }
                /*
                 * expect the page to be used
@@ -8680,15 +8525,15 @@ vm_object_iopl_request(
                dwp->dw_mask |= DW_set_reference;
 
                if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
-                       SET_PAGE_DIRTY(dst_page, TRUE); 
+                       SET_PAGE_DIRTY(dst_page, TRUE);
                }
-               if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->written_by_kernel == TRUE) {
+               if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
                        pmap_sync_page_attributes_phys(phys_page);
-                       dst_page->written_by_kernel = FALSE;
+                       dst_page->vmp_written_by_kernel = FALSE;
                }
 
 record_phys_addr:
-               if (dst_page->busy)
+               if (dst_page->vmp_busy)
                        upl->flags |= UPL_HAS_BUSY;
 
                lite_list[entry>>5] |= 1 << (entry & 31);
@@ -8698,28 +8543,28 @@ record_phys_addr:
 
                if (user_page_list) {
                        user_page_list[entry].phys_addr = phys_page;
-                       user_page_list[entry].free_when_done    = dst_page->free_when_done;
-                       user_page_list[entry].absent    = dst_page->absent;
-                       user_page_list[entry].dirty     = dst_page->dirty;
-                       user_page_list[entry].precious  = dst_page->precious;
+                       user_page_list[entry].free_when_done    = dst_page->vmp_free_when_done;
+                       user_page_list[entry].absent    = dst_page->vmp_absent;
+                       user_page_list[entry].dirty     = dst_page->vmp_dirty;
+                       user_page_list[entry].precious  = dst_page->vmp_precious;
                        user_page_list[entry].device    = FALSE;
                        user_page_list[entry].needed    = FALSE;
-                       if (dst_page->clustered == TRUE)
-                               user_page_list[entry].speculative = (dst_page->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
+                       if (dst_page->vmp_clustered == TRUE)
+                               user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
                        else
                                user_page_list[entry].speculative = FALSE;
-                       user_page_list[entry].cs_validated = dst_page->cs_validated;
-                       user_page_list[entry].cs_tainted = dst_page->cs_tainted;
-                       user_page_list[entry].cs_nx = dst_page->cs_nx;
+                       user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
+                       user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
+                       user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
                        user_page_list[entry].mark      = FALSE;
                }
                if (object != kernel_object && object != compressor_object) {
                        /*
                         * someone is explicitly grabbing this page...
                         * update clustered and speculative state
-                        * 
+                        *
                         */
-                       if (dst_page->clustered)
+                       if (dst_page->vmp_clustered)
                                VM_PAGE_CONSUME_CLUSTERED(dst_page);
                }
 skip_page:
@@ -8732,7 +8577,7 @@ skip_page:
 
                        if (dw_count >= dw_limit) {
                                vm_page_do_delayed_work(object, tag, &dw_array[0], dw_count);
-                               
+
                                dwp = &dw_array[0];
                                dw_count = 0;
                        }
@@ -8767,6 +8612,7 @@ finish:
                object->blocked_access = TRUE;
        }
 
+       VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
        return KERN_SUCCESS;
 
 return_err:
@@ -8781,7 +8627,7 @@ return_err:
                        panic("vm_object_iopl_request: Wired page missing. \n");
 
                /*
-                * if we've already processed this page in an earlier 
+                * if we've already processed this page in an earlier
                 * dw_do_work, we need to undo the wiring... we will
                 * leave the dirty and reference bits on if they
                 * were set, since we don't have a good way of knowing
@@ -8807,7 +8653,7 @@ return_err:
                }
                vm_page_lock_queues();
 
-               if (dst_page->absent || free_wired_pages == TRUE) {
+               if (dst_page->vmp_absent || free_wired_pages == TRUE) {
                        vm_page_free(dst_page);
 
                        need_unwire = FALSE;
@@ -8832,6 +8678,7 @@ return_err:
        vm_object_unlock(object);
        upl_destroy(upl);
 
+       VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
        return ret;
 }
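
[Editor's note] vm_object_iopl_request() is now bracketed by VM_DEBUG_CONSTANT_EVENT tracepoints: one DBG_FUNC_START event at entry and a DBG_FUNC_END event on both the success path and the error path, carrying page_grab_count (accumulated from vm_object_iopl_wire_empty() and from each successful fault in the lookup loop) together with the return code, so a trace consumer can pair the events to get per-request latency and page-grab totals. A minimal user-space model of the start/end pattern, with a printf stub standing in for the kernel's trace emitter:

    #include <stdio.h>

    #define DBG_FUNC_START 1
    #define DBG_FUNC_END   2

    /* Stand-in for the kernel tracepoint; only the shape is meaningful here. */
    static void trace_event(const char *code, int func_qual, long arg1, long arg2) {
            printf("%s %s arg1=%ld arg2=%ld\n", code,
                   func_qual == DBG_FUNC_START ? "START" : "END", arg1, arg2);
    }

    static int iopl_request_model(int pages_wanted) {
            int page_grab_count = 0;

            trace_event("VM_IOPL_REQUEST", DBG_FUNC_START, pages_wanted, 0);
            for (int i = 0; i < pages_wanted; i++)
                    page_grab_count++;        /* pretend each page was grabbed */
            trace_event("VM_IOPL_REQUEST", DBG_FUNC_END, page_grab_count, 0);
            return 0;
    }

    int main(void) { return iopl_request_model(3); }
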
 
@@ -8847,7 +8694,7 @@ upl_transpose(
        if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2  || ((upl1->flags & UPL_VECTOR)==UPL_VECTOR)  || ((upl2->flags & UPL_VECTOR)==UPL_VECTOR)) {
                return KERN_INVALID_ARGUMENT;
        }
-       
+
        upls_locked = FALSE;
 
        /*
@@ -8964,6 +8811,7 @@ boolean_t vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
 int            vm_paging_max_index = 0;
 int            vm_paging_page_waiter = 0;
 int            vm_paging_page_waiter_total = 0;
+
 unsigned long  vm_paging_no_kernel_page = 0;
 unsigned long  vm_paging_objects_mapped = 0;
 unsigned long  vm_paging_pages_mapped = 0;
@@ -9055,7 +8903,7 @@ vm_paging_map_object(
 #warn "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
 #endif
 
-               assert(page->busy);
+               assert(page->vmp_busy);
                /*
                 * Use one of the pre-allocated kernel virtual addresses
                 * and just enter the VM page in the kernel address space
@@ -9116,12 +8964,12 @@ vm_paging_map_object(
                        vm_paging_page_inuse[i] = TRUE;
                        simple_unlock(&vm_paging_lock);
 
-                       page->pmapped = TRUE;
+                       page->vmp_pmapped = TRUE;
 
                        /*
                         * Keep the VM object locked over the PMAP_ENTER
                         * and the actual use of the page by the kernel,
-                        * or this pmap mapping might get undone by a 
+                        * or this pmap mapping might get undone by a
                         * vm_object_pmap_protect() call...
                         */
                        PMAP_ENTER(kernel_pmap,
@@ -9134,7 +8982,7 @@ vm_paging_map_object(
                                   kr);
                        assert(kr == KERN_SUCCESS);
                        vm_paging_objects_mapped++;
-                       vm_paging_pages_mapped++; 
+                       vm_paging_pages_mapped++;
                        *address = page_map_offset;
                        *need_unmap = TRUE;
 
@@ -9218,7 +9066,7 @@ vm_paging_map_object(
                        printf("vm_paging_map_object: no page !?");
                        vm_object_unlock(object);
                        kr = vm_map_remove(kernel_map, *address, *size,
-                                          VM_MAP_NO_FLAGS);
+                                          VM_MAP_REMOVE_NO_FLAGS);
                        assert(kr == KERN_SUCCESS);
                        *address = 0;
                        *size = 0;
@@ -9226,7 +9074,7 @@ vm_paging_map_object(
                        vm_object_lock(object);
                        return KERN_MEMORY_ERROR;
                }
-               page->pmapped = TRUE;
+               page->vmp_pmapped = TRUE;
 
                //assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(page)));
                PMAP_ENTER(kernel_pmap,
@@ -9242,7 +9090,7 @@ vm_paging_map_object(
                kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
 #endif
        }
-                          
+
        vm_paging_objects_mapped_slow++;
        vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
 
@@ -9280,7 +9128,8 @@ vm_paging_unmap_object(
                if (object != VM_OBJECT_NULL) {
                        vm_object_unlock(object);
                }
-               kr = vm_map_remove(kernel_map, start, end, VM_MAP_NO_FLAGS);
+               kr = vm_map_remove(kernel_map, start, end,
+                                  VM_MAP_REMOVE_NO_FLAGS);
                if (object != VM_OBJECT_NULL) {
                        vm_object_lock(object);
                }
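
[Editor's note] The vm_map_remove() call sites in this file switch from VM_MAP_NO_FLAGS (or a bare 0, as in vm_test_collapse_compressor() below) to VM_MAP_REMOVE_NO_FLAGS, following the removal-specific flag namespace this release uses for that interface; the call shape itself is unchanged:

    kr = vm_map_remove(kernel_map, start, end, VM_MAP_REMOVE_NO_FLAGS);
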
@@ -9309,7 +9158,7 @@ vm_paging_unmap_object(
 
 
 /*
- * page->object must be locked
+ * page->vmp_object must be locked
  */
 void
 vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
@@ -9318,7 +9167,7 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
                vm_page_lockspin_queues();
        }
 
-       page->free_when_done = FALSE;
+       page->vmp_free_when_done = FALSE;
        /*
         * need to drop the laundry count...
         * we may also need to remove it
@@ -9329,8 +9178,6 @@ vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
         */
        vm_pageout_throttle_up(page);
 
-       vm_page_steal_pageout_page++;
-
        if (!queues_locked) {
                vm_page_unlock_queues();
        }
@@ -9352,11 +9199,11 @@ vector_upl_create(vm_offset_t upl_offset)
        vector_upl->invalid_upls=0;
        vector_upl->num_upls=0;
        vector_upl->pagelist = NULL;
-       
+
        for(i=0; i < MAX_VECTOR_UPL_ELEMENTS ; i++) {
                vector_upl->upl_iostates[i].size = 0;
                vector_upl->upl_iostates[i].offset = 0;
-               
+
        }
        return upl;
 }
@@ -9401,9 +9248,9 @@ vector_upl_is_valid(upl_t upl)
 boolean_t
 vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                vector_upl_t vector_upl = upl->vector_upl;
-               
+
                if(vector_upl) {
                        if(subupl) {
                                if(io_size) {
@@ -9422,12 +9269,12 @@ vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
                                        }
                                        if(i == vector_upl->num_upls)
                                                panic("Trying to remove sub-upl when none exists");
-                                       
+
                                        vector_upl->upl_elems[i] = NULL;
-                                       invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1); 
+                                       invalid_upls = hw_atomic_add(&(vector_upl)->invalid_upls, 1);
                                        if(invalid_upls == vector_upl->num_upls)
                                                return TRUE;
-                                       else 
+                                       else
                                                return FALSE;
                                }
                        }
@@ -9441,12 +9288,12 @@ vector_upl_set_subupl(upl_t upl,upl_t subupl, uint32_t io_size)
                panic("vector_upl_set_subupl was passed a NULL upl\n");
 
        return FALSE;
-}      
+}
 
 void
 vector_upl_set_pagelist(upl_t upl)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                uint32_t i=0;
                vector_upl_t vector_upl = upl->vector_upl;
 
@@ -9454,7 +9301,7 @@ vector_upl_set_pagelist(upl_t upl)
                        vm_offset_t pagelist_size=0, cur_upl_pagelist_size=0;
 
                        vector_upl->pagelist = (upl_page_info_array_t)kalloc(sizeof(struct upl_page_info)*(vector_upl->size/PAGE_SIZE));
-                       
+
                        for(i=0; i < vector_upl->num_upls; i++) {
                                cur_upl_pagelist_size = sizeof(struct upl_page_info) * vector_upl->upl_elems[i]->size/PAGE_SIZE;
                                bcopy(UPL_GET_INTERNAL_PAGE_LIST_SIMPLE(vector_upl->upl_elems[i]), (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
@@ -9475,7 +9322,7 @@ vector_upl_set_pagelist(upl_t upl)
 upl_t
 vector_upl_subupl_byindex(upl_t upl, uint32_t index)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                vector_upl_t vector_upl = upl->vector_upl;
                if(vector_upl) {
                        if(index < vector_upl->num_upls)
@@ -9490,7 +9337,7 @@ vector_upl_subupl_byindex(upl_t upl, uint32_t index)
 upl_t
 vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                uint32_t i=0;
                vector_upl_t vector_upl = upl->vector_upl;
 
@@ -9518,7 +9365,7 @@ vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_
                                        else if(i)
                                                panic("Vector UPL offset miscalculation\n");
                                        return subupl;
-                               }       
+                               }
                        }
                }
                else
@@ -9532,7 +9379,7 @@ vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst
 {
        *v_upl_submap = NULL;
 
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                vector_upl_t vector_upl = upl->vector_upl;
                if(vector_upl) {
                        *v_upl_submap = vector_upl->submap;
@@ -9548,7 +9395,7 @@ vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst
 void
 vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                vector_upl_t vector_upl = upl->vector_upl;
                if(vector_upl) {
                        vector_upl->submap = submap;
@@ -9564,7 +9411,7 @@ vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
 void
 vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                uint32_t i = 0;
                vector_upl_t vector_upl = upl->vector_upl;
 
@@ -9573,7 +9420,7 @@ vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t
                                if(vector_upl->upl_elems[i] == subupl)
                                        break;
                        }
-                       
+
                        if(i == vector_upl->num_upls)
                                panic("setting sub-upl iostate when none exists");
 
@@ -9592,7 +9439,7 @@ vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t
 void
 vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                uint32_t i = 0;
                vector_upl_t vector_upl = upl->vector_upl;
 
@@ -9601,7 +9448,7 @@ vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t
                                if(vector_upl->upl_elems[i] == subupl)
                                        break;
                        }
-                       
+
                        if(i == vector_upl->num_upls)
                                panic("getting sub-upl iostate when none exists");
 
@@ -9618,7 +9465,7 @@ vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t
 void
 vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
 {
-       if(vector_upl_is_valid(upl)) {          
+       if(vector_upl_is_valid(upl)) {
                vector_upl_t vector_upl = upl->vector_upl;
                if(vector_upl) {
                        if(index < vector_upl->num_upls) {
@@ -9693,161 +9540,14 @@ upl_set_blkno(
                int i,j;
                if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0)
                        return;
-                       
-               assert(upl->upl_reprio_info != 0);      
+
+               assert(upl->upl_reprio_info != 0);
                for(i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
                        UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
                }
 }
 #endif
 
-boolean_t
-vm_page_is_slideable(vm_page_t m)
-{
-       boolean_t result = FALSE;
-       vm_shared_region_slide_info_t si;
-       vm_object_t     m_object;
-
-       m_object = VM_PAGE_OBJECT(m);
-
-       vm_object_lock_assert_held(m_object);
-
-       /* make sure our page belongs to the one object allowed to do this */
-       if (!m_object->object_slid) {
-               goto done;
-       }
-
-       si = m_object->vo_slide_info;
-       if (si == NULL) {
-               goto done;
-       }
-
-       if(!m->slid && (si->start <= m->offset && si->end > m->offset)) {
-               result = TRUE;
-       }
-
-done:
-       return result;
-}
-
-int vm_page_slide_counter = 0;
-int vm_page_slide_errors = 0;
-kern_return_t
-vm_page_slide(
-       vm_page_t       page,
-       vm_map_offset_t kernel_mapping_offset)
-{
-       kern_return_t           kr;
-       vm_map_size_t           kernel_mapping_size;
-       boolean_t               kernel_mapping_needs_unmap;
-       vm_offset_t             kernel_vaddr;
-       uint32_t                pageIndex;
-       uint32_t                slide_chunk;
-       vm_object_t             page_object;
-
-       page_object = VM_PAGE_OBJECT(page);
-
-       assert(!page->slid);
-       assert(page_object->object_slid);
-       vm_object_lock_assert_exclusive(page_object);
-
-       if (page->error)
-               return KERN_FAILURE;
-       
-       /*
-        * Take a paging-in-progress reference to keep the object
-        * alive even if we have to unlock it (in vm_paging_map_object()
-        * for example)...
-        */
-       vm_object_paging_begin(page_object);
-
-       if (kernel_mapping_offset == 0) {
-               /*
-                * The page hasn't already been mapped in kernel space
-                * by the caller.  Map it now, so that we can access
-                * its contents and decrypt them.
-                */
-               kernel_mapping_size = PAGE_SIZE;
-               kernel_mapping_needs_unmap = FALSE;
-               kr = vm_paging_map_object(page,
-                                         page_object,
-                                         page->offset,
-                                         VM_PROT_READ | VM_PROT_WRITE,
-                                         FALSE,
-                                         &kernel_mapping_size,
-                                         &kernel_mapping_offset,
-                                         &kernel_mapping_needs_unmap);
-               if (kr != KERN_SUCCESS) {
-                       panic("vm_page_slide: "
-                             "could not map page in kernel: 0x%x\n",
-                             kr);
-               }
-       } else {
-               kernel_mapping_size = 0;
-               kernel_mapping_needs_unmap = FALSE;
-       }
-       kernel_vaddr = CAST_DOWN(vm_offset_t, kernel_mapping_offset);
-
-       /*
-        * Slide the pointers on the page.
-        */
-
-       /*assert that slide_file_info.start/end are page-aligned?*/
-
-       assert(!page->slid);
-       assert(page_object->object_slid);
-
-       pageIndex = (uint32_t)((page->offset -
-                               page_object->vo_slide_info->start) /
-                              PAGE_SIZE_FOR_SR_SLIDE);
-       for (slide_chunk = 0;
-            slide_chunk < PAGE_SIZE / PAGE_SIZE_FOR_SR_SLIDE;
-            slide_chunk++) {
-               kr = vm_shared_region_slide_page(page_object->vo_slide_info,
-                                                (kernel_vaddr +
-                                                 (slide_chunk *
-                                                  PAGE_SIZE_FOR_SR_SLIDE)),
-                                                (pageIndex + slide_chunk));
-               if (kr != KERN_SUCCESS) {
-                       break;
-               }
-       }
-
-       vm_page_slide_counter++;
-
-       /*
-        * Unmap the page from the kernel's address space,
-        */
-       if (kernel_mapping_needs_unmap) {
-               vm_paging_unmap_object(page_object,
-                                      kernel_vaddr,
-                                      kernel_vaddr + PAGE_SIZE);
-       }
-       
-       page->dirty = FALSE;
-       pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(page), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
-       
-       if (kr != KERN_SUCCESS || cs_debug > 1) {
-               printf("vm_page_slide(%p): "
-                      "obj %p off 0x%llx mobj %p moff 0x%llx\n",
-                      page,
-                      page_object, page->offset,
-                      page_object->pager,
-                      page->offset + page_object->paging_offset);
-       }
-
-       if (kr == KERN_SUCCESS) {
-               page->slid = TRUE;
-       } else {
-               page->error = TRUE;
-               vm_page_slide_errors++;
-       }
-
-       vm_object_paging_end(page_object);
-
-       return kr;
-}
-
 void inline memoryshot(unsigned int event, unsigned int control)
 {
        if (vm_debug_events) {
@@ -9917,12 +9617,12 @@ vm_countdirtypages(void)
        do {
                if (m ==(vm_page_t )0) break;
 
-               if(m->dirty) dpages++;
-               if(m->free_when_done) pgopages++;
-               if(m->precious) precpages++;
+               if(m->vmp_dirty) dpages++;
+               if(m->vmp_free_when_done) pgopages++;
+               if(m->vmp_precious) precpages++;
 
                assert(VM_PAGE_OBJECT(m) != kernel_object);
-               m = (vm_page_t) vm_page_queue_next(&m->pageq);
+               m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
                if (m ==(vm_page_t )0) break;
 
        } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
@@ -9934,10 +9634,10 @@ vm_countdirtypages(void)
                if (m ==(vm_page_t )0) break;
 
                dpages++;
-               assert(m->dirty);
-               assert(!m->free_when_done);
+               assert(m->vmp_dirty);
+               assert(!m->vmp_free_when_done);
                assert(VM_PAGE_OBJECT(m) != kernel_object);
-               m = (vm_page_t) vm_page_queue_next(&m->pageq);
+               m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
                if (m ==(vm_page_t )0) break;
 
        } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
@@ -9948,12 +9648,12 @@ vm_countdirtypages(void)
        do {
                if (m ==(vm_page_t )0) break;
 
-               if(m->dirty) dpages++;
-               if(m->free_when_done) pgopages++;
-               if(m->precious) precpages++;
+               if(m->vmp_dirty) dpages++;
+               if(m->vmp_free_when_done) pgopages++;
+               if(m->vmp_precious) precpages++;
 
                assert(VM_PAGE_OBJECT(m) != kernel_object);
-               m = (vm_page_t) vm_page_queue_next(&m->pageq);
+               m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
                if (m ==(vm_page_t )0) break;
 
        } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
@@ -9970,12 +9670,12 @@ vm_countdirtypages(void)
 
        do {
                if(m == (vm_page_t )0) break;
-               if(m->dirty) dpages++;
-               if(m->free_when_done) pgopages++;
-               if(m->precious) precpages++;
+               if(m->vmp_dirty) dpages++;
+               if(m->vmp_free_when_done) pgopages++;
+               if(m->vmp_precious) precpages++;
 
                assert(VM_PAGE_OBJECT(m) != kernel_object);
-               m = (vm_page_t) vm_page_queue_next(&m->pageq);
+               m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
                if(m == (vm_page_t )0) break;
 
        } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
@@ -9995,7 +9695,35 @@ int upl_get_cached_tier(upl_t  upl)
                return (upl->upl_priority);
        return (-1);
 }
-#endif /* CONFIG_IOSCHED */      
+#endif /* CONFIG_IOSCHED */
+
+
+void upl_callout_iodone(upl_t upl)
+{
+        struct upl_io_completion *upl_ctx = upl->upl_iodone;
+
+       if (upl_ctx) {
+               void    (*iodone_func)(void *, int) = upl_ctx->io_done;
+
+               assert(upl_ctx->io_done);
+
+               (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
+       }
+}
+
+void upl_set_iodone(upl_t upl, void *upl_iodone)
+{
+        upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
+}
+
+void upl_set_iodone_error(upl_t upl, int error)
+{
+        struct upl_io_completion *upl_ctx = upl->upl_iodone;
+
+       if (upl_ctx)
+               upl_ctx->io_error = error;
+}
+
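
[Editor's note] This hunk introduces a small I/O-completion callout for UPLs: struct upl_io_completion (declared in vm_pageout.h further down) bundles a context pointer, a done callback, and an error slot; upl_set_iodone() attaches it to a UPL, upl_set_iodone_error() records a failure, and upl_callout_iodone() fires the callback when the I/O finishes. A standalone model of the same callback plumbing, using stand-in types rather than the kernel's upl_t:

    #include <stdio.h>
    #include <stddef.h>
    #include <assert.h>

    struct upl_io_completion {        /* mirrors the fields added in vm_pageout.h */
            void  *io_context;
            void (*io_done)(void *, int);
            int    io_error;
    };

    struct upl_like {                 /* stand-in for the kernel's struct upl */
            struct upl_io_completion *upl_iodone;
    };

    static void set_iodone(struct upl_like *upl, struct upl_io_completion *ctx) {
            upl->upl_iodone = ctx;
    }

    static void set_iodone_error(struct upl_like *upl, int error) {
            if (upl->upl_iodone)
                    upl->upl_iodone->io_error = error;
    }

    static void callout_iodone(struct upl_like *upl) {
            struct upl_io_completion *ctx = upl->upl_iodone;

            if (ctx) {
                    assert(ctx->io_done);
                    ctx->io_done(ctx->io_context, ctx->io_error);
            }
    }

    static void my_done(void *context, int error) {
            printf("I/O on %s finished, error=%d\n", (const char *)context, error);
    }

    int main(void) {
            struct upl_io_completion ctx = { "request-42", my_done, 0 };
            struct upl_like          upl = { NULL };

            set_iodone(&upl, &ctx);
            set_iodone_error(&upl, 5);    /* failure recorded while the I/O is in flight */
            callout_iodone(&upl);
            return 0;
    }
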
 
 ppnum_t upl_get_highest_page(
                             upl_t                      upl)
@@ -10025,7 +9753,7 @@ struct vnode * upl_lookup_vnode(upl_t upl)
                return vnode_pager_lookup_vnode(upl->map_object->pager);
        else
                return NULL;
-}      
+}
 
 #if UPL_DEBUG
 kern_return_t  upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
@@ -10249,7 +9977,7 @@ vm_test_collapse_compressor(void)
        vm_map_remove(kernel_map,
                      backing_offset,
                      backing_offset + backing_size,
-                     0);
+                     VM_MAP_REMOVE_NO_FLAGS);
        printf("VM_TEST_COLLAPSE_COMPRESSOR: "
               "unmapped backing_object %p [0x%llx:0x%llx]\n",
               backing_object,
index a397634779ae0a34ac21f1cd4b8c5bfc7d4a29c6..2372059260b136b3d34924552e3636d7dc15a573 100644 (file)
@@ -90,7 +90,7 @@
 #define VM_PAGE_AVAILABLE_COUNT()              ((unsigned int)(vm_page_cleaned_count))
 
 /* externally manipulated counters */
-extern unsigned int vm_pageout_cleaned_reactivated, vm_pageout_cleaned_fault_reactivated, vm_pageout_cleaned_commit_reactivated;
+extern unsigned int vm_pageout_cleaned_fault_reactivated;
 
 #if CONFIG_FREEZE
 extern boolean_t memorystatus_freeze_enabled;
@@ -137,6 +137,8 @@ extern int  vm_debug_events;
 #define VM_INFO5                        0x10F
 #define VM_INFO6                        0x110
 #define VM_INFO7                        0x111
+#define VM_INFO8                        0x112
+#define VM_INFO9                        0x113
 
 #define VM_UPL_PAGE_WAIT               0x120
 #define VM_IOPL_PAGE_WAIT              0x121
@@ -148,15 +150,23 @@ extern int        vm_debug_events;
 #define VM_PAGE_EXPEDITE_NO_MEMORY      0x125
 #endif
 
+#define VM_PAGE_GRAB                   0x126
+#define VM_PAGE_RELEASE                        0x127
+
 #define VM_PRESSURE_EVENT              0x130
 #define VM_EXECVE                      0x131
 #define VM_WAKEUP_COMPACTOR_SWAPPER    0x132
+#define VM_UPL_REQUEST                 0x133
+#define VM_IOPL_REQUEST                        0x134
+#define VM_KERN_REQUEST                        0x135
 
 #define VM_DATA_WRITE                  0x140
 
+#define VM_PRESSURE_LEVEL_CHANGE       0x141
+
 #define VM_DEBUG_EVENT(name, event, control, arg1, arg2, arg3, arg4)   \
        MACRO_BEGIN                                             \
-       if (vm_debug_events) {                                  \
+       if (__improbable(vm_debug_events)) {                    \
                KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, event)) | control, arg1, arg2, arg3, arg4, 0); \
        }                                                       \
        MACRO_END
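
[Editor's note] The only functional change to VM_DEBUG_EVENT is wrapping its vm_debug_events check in __improbable(), xnu's branch-prediction hint that the condition is almost always false, which keeps the disabled-tracing fast path cheap. A minimal standalone equivalent of the hint, built on the compiler's __builtin_expect (the macro names here are stand-ins):

    #include <stdio.h>

    /* Roughly what an improbable/probable hint boils down to. */
    #define my_improbable(x) __builtin_expect(!!(x), 0)

    static int vm_debug_events_model = 0;   /* stand-in for the kernel global */

    static void debug_event(int arg) {
            if (my_improbable(vm_debug_events_model)) {   /* predicted not-taken */
                    printf("trace event %d\n", arg);
            }
    }

    int main(void) {
            debug_event(1);                 /* no output while tracing is disabled */
            vm_debug_events_model = 1;
            debug_event(2);
            return 0;
    }
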
@@ -175,6 +185,10 @@ extern int upl_get_cached_tier(
        upl_t                   upl);
 #endif
 
+extern void upl_set_iodone(upl_t, void *);
+extern void upl_set_iodone_error(upl_t, int);
+extern void upl_callout_iodone(upl_t);
+
 extern ppnum_t upl_get_highest_page(
        upl_t                   upl);
 
@@ -324,6 +338,14 @@ struct ucd {
 };
 #endif
 
+struct upl_io_completion {
+
+        void     *io_context;
+        void     (*io_done)(void *, int);
+
+        int      io_error;
+};
+
 
 struct upl {
        decl_lck_mtx_data(,     Lock)   /* Synchronization */
@@ -337,6 +359,7 @@ struct upl {
        ppnum_t         highest_page;
        void*           vector_upl;
        upl_t           associated_upl;
+        struct upl_io_completion *upl_iodone;
 #if CONFIG_IOSCHED
        int             upl_priority;
        uint64_t        *upl_reprio_info;
@@ -470,9 +493,6 @@ extern void vm_pageout_steal_laundry(
        vm_page_t page, 
        boolean_t queues_locked);
        
-extern boolean_t vm_page_is_slideable(vm_page_t m);
-
-extern kern_return_t vm_page_slide(vm_page_t page, vm_map_offset_t kernel_mapping_offset);
 #endif  /* MACH_KERNEL_PRIVATE */
 
 #if UPL_DEBUG
@@ -536,9 +556,7 @@ extern void hibernate_create_paddr_map(void);
 extern void vm_set_restrictions(void);
 
 extern int vm_compressor_mode;
-extern int vm_compressor_thread_count;
-extern boolean_t vm_restricted_to_single_processor;
-extern kern_return_t vm_pageout_compress_page(void **, char *, vm_page_t, boolean_t);
+extern kern_return_t vm_pageout_compress_page(void **, char *, vm_page_t);
 extern void vm_pageout_anonymous_pages(void);
 extern void vm_pageout_disconnect_all_pages(void);
 
@@ -574,6 +592,161 @@ extern    struct vm_config        vm_config;
 #endif /* KERNEL_PRIVATE */
 
 #ifdef XNU_KERNEL_PRIVATE
+
+struct vm_pageout_state {
+        boolean_t vm_pressure_thread_running;
+        boolean_t vm_pressure_changed;
+        boolean_t vm_restricted_to_single_processor;
+        int vm_compressor_thread_count;
+
+        unsigned int vm_page_speculative_q_age_ms;
+        unsigned int vm_page_speculative_percentage;
+        unsigned int vm_page_speculative_target;
+
+        unsigned int vm_pageout_swap_wait;
+        unsigned int vm_pageout_idle_wait;     /* milliseconds */
+        unsigned int vm_pageout_empty_wait;    /* milliseconds */
+        unsigned int vm_pageout_burst_wait;    /* milliseconds */
+        unsigned int vm_pageout_deadlock_wait;  /* milliseconds */
+        unsigned int vm_pageout_deadlock_relief;
+        unsigned int vm_pageout_burst_inactive_throttle;
+
+        unsigned int vm_pageout_inactive;
+        unsigned int vm_pageout_inactive_used; /* debugging */
+        unsigned int vm_pageout_inactive_clean;        /* debugging */
+
+        uint32_t vm_page_filecache_min;
+        uint32_t vm_page_filecache_min_divisor;
+        uint32_t vm_page_xpmapped_min;
+        uint32_t vm_page_xpmapped_min_divisor;
+        uint64_t vm_pageout_considered_page_last;
+
+        int vm_page_free_count_init;
+
+        unsigned int vm_memory_pressure;
+
+        int memorystatus_purge_on_critical;
+        int memorystatus_purge_on_warning;
+        int memorystatus_purge_on_urgent;
+
+        thread_t vm_pageout_external_iothread;
+        thread_t vm_pageout_internal_iothread;
+};
+
+extern struct vm_pageout_state vm_pageout_state;
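
[Editor's note] The pageout daemon's tuning knobs and thread handles, previously scattered file-scope globals (several are deleted elsewhere in this header, e.g. vm_compressor_thread_count and vm_restricted_to_single_processor), are collected into this single exported struct vm_pageout_state; instrumentation is split into vm_pageout_vminfo below and, on development kernels, vm_pageout_debug. A hypothetical call site after the consolidation -- the field names come from the struct above, the surrounding logic is illustrative:

    vm_pageout_state.vm_pageout_idle_wait = 10;                   /* milliseconds */
    if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
            /* ... bind the pageout threads to a single CPU ... */
    }
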
+
+/*
+ * This structure is used to track the VM_INFO instrumentation
+ */
+struct vm_pageout_vminfo {
+        unsigned long vm_pageout_considered_page;
+        unsigned long vm_pageout_considered_bq_internal;
+        unsigned long vm_pageout_considered_bq_external;
+        unsigned long vm_pageout_skipped_external;
+
+        unsigned long vm_pageout_pages_evicted;
+        unsigned long vm_pageout_pages_purged;
+        unsigned long vm_pageout_freed_cleaned;
+        unsigned long vm_pageout_freed_speculative;
+        unsigned long vm_pageout_freed_external;
+        unsigned long vm_pageout_freed_internal;
+        unsigned long vm_pageout_inactive_dirty_internal;
+        unsigned long vm_pageout_inactive_dirty_external;
+        unsigned long vm_pageout_inactive_referenced;
+        unsigned long vm_pageout_reactivation_limit_exceeded;
+        unsigned long vm_pageout_inactive_force_reclaim;
+        unsigned long vm_pageout_inactive_nolock;
+        unsigned long vm_pageout_filecache_min_reactivated;
+        unsigned long vm_pageout_scan_inactive_throttled_internal;
+        unsigned long vm_pageout_scan_inactive_throttled_external;
+
+        uint64_t      vm_pageout_compressions;
+        uint64_t      vm_compressor_pages_grabbed;
+        unsigned long vm_compressor_failed;
+
+        unsigned long vm_page_pages_freed;
+
+        unsigned long vm_phantom_cache_found_ghost;
+        unsigned long vm_phantom_cache_added_ghost;
+};
+
+extern struct vm_pageout_vminfo vm_pageout_vminfo;
+
+
+#if DEVELOPMENT || DEBUG
+
+/*
+ *     This structure records the pageout daemon's actions:
+ *     how many pages it looks at and what happens to those pages.
+ *     No locking needed because only one thread modifies the fields.
+ */
+struct vm_pageout_debug {
+        uint32_t vm_pageout_balanced;
+        uint32_t vm_pageout_scan_event_counter;
+        uint32_t vm_pageout_speculative_dirty;
+
+        uint32_t vm_pageout_inactive_busy;
+        uint32_t vm_pageout_inactive_absent;
+        uint32_t vm_pageout_inactive_notalive;
+        uint32_t vm_pageout_inactive_error;
+        uint32_t vm_pageout_inactive_deactivated;
+
+        uint32_t vm_pageout_enqueued_cleaned;
+
+        uint32_t vm_pageout_cleaned_busy;
+        uint32_t vm_pageout_cleaned_nolock;
+        uint32_t vm_pageout_cleaned_reference_reactivated;
+        uint32_t vm_pageout_cleaned_volatile_reactivated;
+        uint32_t vm_pageout_cleaned_reactivated;  /* debugging; how many cleaned pages are found to be referenced on pageout (and are therefore reactivated) */
+        uint32_t vm_pageout_cleaned_fault_reactivated;
+
+        uint32_t vm_pageout_dirty_no_pager;
+        uint32_t vm_pageout_purged_objects;
+
+        uint32_t vm_pageout_scan_throttle;
+        uint32_t vm_pageout_scan_reclaimed_throttled;
+        uint32_t vm_pageout_scan_burst_throttle;
+        uint32_t vm_pageout_scan_empty_throttle;
+        uint32_t vm_pageout_scan_swap_throttle;
+        uint32_t vm_pageout_scan_deadlock_detected;
+        uint32_t vm_pageout_scan_inactive_throttle_success;
+        uint32_t vm_pageout_scan_throttle_deferred;
+
+        uint32_t vm_pageout_inactive_external_forced_jetsam_count;
+
+        uint32_t vm_grab_anon_overrides;
+        uint32_t vm_grab_anon_nops;
+
+        uint32_t vm_pageout_no_victim;
+        unsigned long vm_pageout_throttle_up_count;
+        uint32_t vm_page_steal_pageout_page;
+
+        uint32_t vm_cs_validated_resets;
+        uint32_t vm_object_iopl_request_sleep_for_cleaning;
+        uint32_t vm_page_slide_counter;
+        uint32_t vm_page_slide_errors;
+        uint32_t vm_page_throttle_count;
+        /*
+         * Statistics about UPL enforcement of copy-on-write obligations.
+         */
+        unsigned long upl_cow;
+        unsigned long upl_cow_again;
+        unsigned long upl_cow_pages;
+        unsigned long upl_cow_again_pages;
+        unsigned long iopl_cow;
+        unsigned long iopl_cow_pages;
+};
+
+extern struct vm_pageout_debug vm_pageout_debug;
+
+#define VM_PAGEOUT_DEBUG(member, value)                        \
+       MACRO_BEGIN                                     \
+               vm_pageout_debug.member += value;       \
+       MACRO_END
+#else
+#define VM_PAGEOUT_DEBUG(member, value)
+#endif
+
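A minimal usage sketch of the VM_PAGEOUT_DEBUG macro (the member name is taken from struct vm_pageout_debug above; actual call sites are not shown in this hunk). On DEVELOPMENT/DEBUG kernels the macro increments the named field; on RELEASE kernels it compiles away to nothing:

        VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);   /* no-op unless DEVELOPMENT || DEBUG */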
 #define MAX_COMPRESSOR_THREAD_COUNT      8
 
 #if DEVELOPMENT || DEBUG
index a075f53facd96359d3d24d7f2cd3049837f58bc8..95bdaa27eb98c90c19366d52f1acd932b6d59dff 100644 (file)
@@ -39,7 +39,7 @@ uint32_t phantom_cache_thrashing_threshold_ssd = 1000;
 #if CONFIG_EMBEDDED
 uint32_t phantom_cache_thrashing_threshold = 500;
 #else
-uint32_t phantom_cache_thrashing_threshold = 100;
+uint32_t phantom_cache_thrashing_threshold = 50;
 #endif
 
 /*
@@ -102,6 +102,7 @@ struct phantom_cache_stats {
 } phantom_cache_stats;
 
 
+
 void
 vm_phantom_cache_init()
 {
@@ -173,7 +174,7 @@ vm_phantom_cache_add_ghost(vm_page_t m)
        if (vm_phantom_cache_num_entries == 0)
                return;
        
-       pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
+       pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
 
        if (object->phantom_object_id == 0) {
 
@@ -239,7 +240,7 @@ vm_phantom_cache_add_ghost(vm_page_t m)
                phantom_cache_stats.pcs_added_new_entry++;
 
        vpce->g_pages_held = pg_mask;
-       vpce->g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
+       vpce->g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
        vpce->g_obj_id = object->phantom_object_id;
 
        ghost_hash_index = vm_phantom_hash(vpce->g_obj_id, vpce->g_obj_offset);
@@ -247,6 +248,8 @@ vm_phantom_cache_add_ghost(vm_page_t m)
        vm_phantom_cache_hash[ghost_hash_index] = ghost_index;
 
 done:
+       vm_pageout_vminfo.vm_phantom_cache_added_ghost++;
+
        if (object->phantom_isssd)
                OSAddAtomic(1, &sample_period_ghost_added_count_ssd);
        else
@@ -270,7 +273,7 @@ vm_phantom_cache_lookup_ghost(vm_page_t m, uint32_t pg_mask)
                 */
                return (NULL);
        }
-       g_obj_offset = (m->offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
+       g_obj_offset = (m->vmp_offset >> (PAGE_SHIFT + VM_GHOST_PAGE_SHIFT)) & VM_GHOST_OFFSET_MASK;
 
        ghost_index = vm_phantom_cache_hash[vm_phantom_hash(g_obj_id, g_obj_offset)];
 
@@ -314,13 +317,14 @@ vm_phantom_cache_update(vm_page_t m)
        if (vm_phantom_cache_num_entries == 0)
                return;
        
-       pg_mask = pg_masks[(m->offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
+       pg_mask = pg_masks[(m->vmp_offset >> PAGE_SHIFT) & VM_GHOST_PAGE_MASK];
        
        if ( (vpce = vm_phantom_cache_lookup_ghost(m, pg_mask)) ) {
 
                vpce->g_pages_held &= ~pg_mask;
 
                phantom_cache_stats.pcs_updated_phantom_state++;
+               vm_pageout_vminfo.vm_phantom_cache_found_ghost++;
 
                if (object->phantom_isssd)
                        OSAddAtomic(1, &sample_period_ghost_found_count_ssd);
index 8044286f5838eb7bf4b494a36142209d33258521..cf4ad88ce994d854d054b4ec659bf942cbd55c3d 100644 (file)
@@ -173,6 +173,19 @@ extern memory_object_t apple_protect_pager_setup(
        vm_object_offset_t      crypto_end);
 #endif /* CONFIG_CODE_DECRYPTION */
 
+struct vm_shared_region_slide_info;
+extern kern_return_t vm_map_shared_region(
+       vm_map_t                map,
+       vm_map_offset_t         start,
+       vm_map_offset_t         end,
+       vm_object_offset_t      backing_offset,
+       struct vm_shared_region_slide_info *slide_info);
+extern void shared_region_pager_bootstrap(void);
+extern memory_object_t shared_region_pager_setup(
+       vm_object_t             backing_object,
+       vm_object_offset_t      backing_offset,
+       struct vm_shared_region_slide_info *slide_info);
+
 struct vnode;
 extern void swapfile_pager_bootstrap(void);
 extern memory_object_t swapfile_pager_setup(struct vnode *vp);
@@ -218,6 +231,11 @@ extern void *upl_get_internal_page_list(
 extern void vnode_setswapmount(struct vnode *);
 extern int64_t vnode_getswappin_avail(struct vnode *);
 
+extern void vnode_pager_was_dirtied(
+        struct vnode *,
+       vm_object_offset_t,
+       vm_object_offset_t);
+
 typedef int pager_return_t;
 extern pager_return_t  vnode_pagein(
        struct vnode *, upl_t,
@@ -294,6 +312,10 @@ extern kern_return_t vnode_pager_get_object_devvp(
         uintptr_t *);
 #endif
 
+extern void vnode_pager_dirtied(
+       memory_object_t,
+       vm_object_offset_t,
+       vm_object_offset_t);
 extern kern_return_t vnode_pager_get_isinuse(
        memory_object_t,
        uint32_t *);
@@ -462,12 +484,26 @@ extern boolean_t cs_validate_range(struct vnode *vp,
                                   const void *data,
                                   vm_size_t size,
                                   unsigned *result);
+#if PMAP_CS
+extern kern_return_t cs_associate_blob_with_mapping(
+       void *pmap,
+       vm_map_offset_t start,
+       vm_map_size_t size,
+       vm_object_offset_t offset,
+       void *blobs_p);
+#endif /* PMAP_CS */
 
 extern kern_return_t memory_entry_purgeable_control_internal(
        ipc_port_t      entry_port,
        vm_purgable_t   control,
        int             *state);
 
+extern kern_return_t memory_entry_access_tracking_internal(
+       ipc_port_t      entry_port,
+       int             *access_tracking,
+       uint32_t        *access_tracking_reads,
+       uint32_t        *access_tracking_writes);
+
 extern kern_return_t mach_memory_entry_purgable_control(
        ipc_port_t      entry_port,
        vm_purgable_t   control,
@@ -531,6 +567,11 @@ extern int proc_get_memstat_priority(struct proc*, boolean_t);
 /* returns TRUE if an object was purged, otherwise FALSE. */
 boolean_t vm_purgeable_object_purge_one_unlocked(int force_purge_below_group);
 void vm_purgeable_disown(task_t task);
+void vm_purgeable_nonvolatile_owner_update(task_t      owner,
+                                          int          delta);
+void vm_purgeable_volatile_owner_update(task_t         owner,
+                                       int             delta);
+
 
 struct trim_list {
        uint64_t        tl_offset;
@@ -597,10 +638,23 @@ extern kern_return_t mach_make_memory_entry_internal(
        ipc_port_t              *object_handle,
        ipc_port_t              parent_handle);
 
+#define        roundup(x, y)   ((((x) % (y)) == 0) ? \
+                       (x) : ((x) + ((y) - ((x) % (y)))))
+
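The roundup() macro rounds x up to the next multiple of y and leaves exact multiples unchanged; a quick check against the definition above:

        roundup(10, 4);   /* 10 % 4 == 2, so 10 + (4 - 2) == 12 */
        roundup(12, 4);   /* 12 % 4 == 0, so 12 is returned unchanged */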
 #ifdef __cplusplus
 }
 #endif
 
+/*
+ * Flags for the VM swapper/reclaimer.
+ * Used by vm_swap_consider_defragment()
+ * to force defrag/reclaim by the swap
+ * GC thread.
+ */
+#define VM_SWAP_FLAGS_NONE             0
+#define VM_SWAP_FLAGS_FORCE_DEFRAG     1
+#define VM_SWAP_FLAGS_FORCE_RECLAIM    2
+
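A hypothetical call site for these flags (the comment above names vm_swap_consider_defragment() as the consumer; its exact signature is an assumption here and is not shown in this diff):

        /* assumed prototype: void vm_swap_consider_defragment(int flags); */
        vm_swap_consider_defragment(VM_SWAP_FLAGS_FORCE_DEFRAG | VM_SWAP_FLAGS_FORCE_RECLAIM);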
 #endif /* _VM_VM_PROTOS_H_ */
 
 #endif /* XNU_KERNEL_PRIVATE */
index 4606d95911f6cdffd4f5b0ef3adaa36902a48244..5fc9671168aac4bbb9db09c1bfbdeeb58e6c58aa 100644 (file)
@@ -90,11 +90,6 @@ static token_idx_t vm_purgeable_token_remove_first(purgeable_q_t queue);
 
 static void vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int group, task_t target_task);
 
-void vm_purgeable_nonvolatile_owner_update(task_t      owner,
-                                          int          delta);
-void vm_purgeable_volatile_owner_update(task_t         owner,
-                                       int             delta);
-
 
 #if MACH_ASSERT
 static void
@@ -724,8 +719,13 @@ vm_purgeable_object_find_and_lock(
 
                object_task_importance = 0;
 
-               owner = object->vo_purgeable_owner;
-               if (owner) {
+               /*
+                * We don't want to use VM_OBJECT_OWNER() here: we want to
+                * distinguish kernel-owned and disowned objects.
+                * Disowned objects have no owner and will have no importance...
+                */
+               owner = object->vo_owner;
+               if (owner != NULL && owner != VM_OBJECT_OWNER_DISOWNED) {
 #if CONFIG_EMBEDDED
 #if CONFIG_JETSAM
                        object_task_importance = proc_get_memstat_priority((struct proc *)get_bsdtask_info(owner), TRUE);
@@ -780,7 +780,7 @@ vm_purgeable_object_find_and_lock(
        object->purgeable_queue_type = PURGEABLE_Q_TYPE_MAX;
        object->purgeable_queue_group = 0;
        /* one less volatile object for this object's owner */
-       vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, -1);
+       vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), -1);
 
 #if DEBUG
        object->vo_purgeable_volatilizer = NULL;
@@ -793,7 +793,7 @@ vm_purgeable_object_find_and_lock(
        purgeable_nonvolatile_count++;
        assert(purgeable_nonvolatile_count > 0);
        /* one more nonvolatile object for this object's owner */
-       vm_purgeable_nonvolatile_owner_update(object->vo_purgeable_owner, +1);
+       vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), +1);
 
 #if MACH_ASSERT
        queue->debug_count_objects--;
@@ -891,6 +891,11 @@ vm_purgeable_object_purge_one(
        vm_object_t     object = 0;
        purgeable_q_t   queue, queue2;
        boolean_t       forced_purge;
+       unsigned int    resident_page_count;
+
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_START,
+                             force_purge_below_group, flags, 0, 0, 0);
 
        /* Need the page queue lock since we'll be changing the token queue. */
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
@@ -984,22 +989,29 @@ vm_purgeable_object_purge_one(
          * we have objects in a purgeable state
          */
        lck_mtx_unlock(&vm_purgeable_queue_lock);
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_END,
+                             0, 0, available_for_purge, 0, 0);
+
        return FALSE;
 
 purge_now:
 
        assert(object);
        vm_page_unlock_queues();  /* Unlock for call to vm_object_purge() */
-//     printf("%sPURGING object %p task %p importance %d queue %d group %d force_purge_below_group %d memorystatus_vm_pressure_level %d\n", forced_purge ? "FORCED " : "", object, object->vo_purgeable_owner, task_importance_estimate(object->vo_purgeable_owner), i, group, force_purge_below_group, memorystatus_vm_pressure_level);
+//     printf("%sPURGING object %p task %p importance %d queue %d group %d force_purge_below_group %d memorystatus_vm_pressure_level %d\n", forced_purge ? "FORCED " : "", object, object->vo_owner, task_importance_estimate(object->vo_owner), i, group, force_purge_below_group, memorystatus_vm_pressure_level);
+       resident_page_count = object->resident_page_count;
        (void) vm_object_purge(object, flags);
        assert(object->purgable == VM_PURGABLE_EMPTY);
        /* no change in purgeable accounting */
        vm_object_unlock(object);
        vm_page_lock_queues();
 
-       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)),
+       vm_pageout_vminfo.vm_pageout_pages_purged += resident_page_count;
+
+       KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, OBJECT_PURGE)) | DBG_FUNC_END,
                              VM_KERNEL_UNSLIDE_OR_PERM(object),        /* purged object */
-                             0,
+                             resident_page_count,
                              available_for_purge,
                              0,
                              0);
@@ -1024,7 +1036,7 @@ vm_purgeable_object_add(vm_object_t object, purgeable_q_t queue, int group)
        purgeable_nonvolatile_count--;
        assert(purgeable_nonvolatile_count >= 0);
        /* one less nonvolatile object for this object's owner */
-       vm_purgeable_nonvolatile_owner_update(object->vo_purgeable_owner, -1);
+       vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), -1);
 
        if (queue->type == PURGEABLE_Q_TYPE_OBSOLETE)
                group = 0;
@@ -1035,7 +1047,7 @@ vm_purgeable_object_add(vm_object_t object, purgeable_q_t queue, int group)
        else
                queue_enter_first(&queue->objq[group], object, vm_object_t, objq);      /* first to die */
        /* one more volatile object for this object's owner */
-       vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, +1);
+       vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), +1);
 
        object->purgeable_queue_type = queue->type;
        object->purgeable_queue_group = group;
@@ -1043,7 +1055,8 @@ vm_purgeable_object_add(vm_object_t object, purgeable_q_t queue, int group)
 #if DEBUG
        assert(object->vo_purgeable_volatilizer == NULL);
        object->vo_purgeable_volatilizer = current_task();
-       OSBacktrace(&object->purgeable_volatilizer_bt[0], 16);
+       OSBacktrace(&object->purgeable_volatilizer_bt[0],
+                   ARRAY_COUNT(object->purgeable_volatilizer_bt));
 #endif /* DEBUG */
 
 #if MACH_ASSERT
@@ -1089,21 +1102,19 @@ vm_purgeable_object_remove(vm_object_t object)
        object->objq.next = NULL;
        object->objq.prev = NULL;
        /* one less volatile object for this object's owner */
-       vm_purgeable_volatile_owner_update(object->vo_purgeable_owner, -1);
+       vm_purgeable_volatile_owner_update(VM_OBJECT_OWNER(object), -1);
 #if DEBUG
        object->vo_purgeable_volatilizer = NULL;
 #endif /* DEBUG */
        /* keep queue of non-volatile objects */
        if (object->alive && !object->terminating) {
-               task_t  owner;
                queue_enter(&purgeable_nonvolatile_queue, object,
                            vm_object_t, objq);
                assert(purgeable_nonvolatile_count >= 0);
                purgeable_nonvolatile_count++;
                assert(purgeable_nonvolatile_count > 0);
                /* one more nonvolatile object for this object's owner */
-               owner = object->vo_purgeable_owner;
-               vm_purgeable_nonvolatile_owner_update(owner, +1);
+               vm_purgeable_nonvolatile_owner_update(VM_OBJECT_OWNER(object), +1);
        }
 
 #if MACH_ASSERT
@@ -1136,10 +1147,10 @@ vm_purgeable_stats_helper(vm_purgeable_stat_t *stat, purgeable_q_t queue, int gr
        for (object = (vm_object_t) queue_first(&queue->objq[group]);
             !queue_end(&queue->objq[group], (queue_entry_t) object);
             object = (vm_object_t) queue_next(&object->objq)) {
-                       if (!target_task || object->vo_purgeable_owner == target_task) {
-                               stat->count++;
-                               stat->size += (object->resident_page_count * PAGE_SIZE);
-                       }
+               if (!target_task || VM_OBJECT_OWNER(object) == target_task) {
+                       stat->count++;
+                       stat->size += (object->resident_page_count * PAGE_SIZE);
+               }
        }
        return;
 }
@@ -1184,7 +1195,7 @@ vm_purgeable_account_volatile_queue(
        for (object = (vm_object_t) queue_first(&queue->objq[group]);
            !queue_end(&queue->objq[group], (queue_entry_t) object);
            object = (vm_object_t) queue_next(&object->objq)) {
-               if (object->vo_purgeable_owner == task) {
+               if (VM_OBJECT_OWNER(object) == task) {
                        compressed_count = vm_compressor_pager_get_count(object->pager);
                        acnt_info->pvm_volatile_compressed_count += compressed_count;
                        acnt_info->pvm_volatile_count += (object->resident_page_count - object->wired_page_count);
@@ -1226,7 +1237,7 @@ vm_purgeable_account(
        for (object = (vm_object_t) queue_first(nonvolatile_q);
             !queue_end(nonvolatile_q, (queue_entry_t) object);
             object = (vm_object_t) queue_next(&object->objq)) {
-               if (object->vo_purgeable_owner == task) {
+               if (VM_OBJECT_OWNER(object) == task) {
                        state = object->purgable;
                        compressed_count =  vm_compressor_pager_get_count(object->pager);
                        if (state == VM_PURGABLE_EMPTY) {
@@ -1319,18 +1330,21 @@ again:
 #if DEBUG
                assert(object->vo_purgeable_volatilizer == NULL);
 #endif /* DEBUG */
-               assert(object->vo_purgeable_owner == task);
+               assert(object->vo_owner == task);
                if (!vm_object_lock_try(object)) {
                        lck_mtx_unlock(&vm_purgeable_queue_lock);
                        task_objq_unlock(task);
                        mutex_pause(collisions++);
                        goto again;
                }
-               vm_purgeable_accounting(object,
-                                       object->purgable,
-                                       TRUE, /* disown */
-                                       TRUE);/* task_objq_lock is locked */
-               assert(object->vo_purgeable_owner == NULL);
+               /* transfer ownership to the kernel */
+               assert(VM_OBJECT_OWNER(object) != kernel_task);
+               vm_object_ownership_change(
+                       object,
+                       object->vo_ledger_tag, /* unchanged */
+                       VM_OBJECT_OWNER_DISOWNED, /* new owner */
+                       TRUE);  /* old_owner->task_objq locked */
+               assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
                vm_object_unlock(object);
        }
 
@@ -1379,7 +1393,7 @@ look_again:
             !queue_end(&queue->objq[group], (queue_entry_t) object);
             object = (vm_object_t) queue_next(&object->objq)) {
 
-               if (object->vo_purgeable_owner != task) {
+               if (object->vo_owner != task) {
                        continue;
                }
 
@@ -1401,7 +1415,7 @@ look_again:
                object->purgeable_queue_type = PURGEABLE_Q_TYPE_MAX;
                object->purgeable_queue_group = 0;
                /* one less volatile object for this object's owner */
-               assert(object->vo_purgeable_owner == task);
+               assert(object->vo_owner == task);
                vm_purgeable_volatile_owner_update(task, -1);
 
 #if DEBUG
@@ -1413,7 +1427,7 @@ look_again:
                purgeable_nonvolatile_count++;
                assert(purgeable_nonvolatile_count > 0);
                /* one more nonvolatile object for this object's owner */
-               assert(object->vo_purgeable_owner == task);
+               assert(object->vo_owner == task);
                vm_purgeable_nonvolatile_owner_update(task, +1);
 
                /* unlock purgeable queues */
@@ -1477,45 +1491,32 @@ vm_purgeable_nonvolatile_enqueue(
        vm_object_t     object,
        task_t          owner)
 {
-       int page_count;
-
        vm_object_lock_assert_exclusive(object);
 
        assert(object->purgable == VM_PURGABLE_NONVOLATILE);
-       assert(object->vo_purgeable_owner == NULL);
+       assert(object->vo_owner == NULL);
 
        lck_mtx_lock(&vm_purgeable_queue_lock);
 
        if (owner != NULL &&
            owner->task_purgeable_disowning) {
                /* task is exiting and no longer tracking purgeable objects */
-               owner = NULL;
+               owner = VM_OBJECT_OWNER_DISOWNED;
+       }
+       if (owner == NULL) {
+               owner = kernel_task;
        }
-
-       object->vo_purgeable_owner = owner;
 #if DEBUG
+       OSBacktrace(&object->purgeable_owner_bt[0],
+                   ARRAY_COUNT(object->purgeable_owner_bt));
        object->vo_purgeable_volatilizer = NULL;
 #endif /* DEBUG */
-       if (owner != NULL) {
-               task_objq_lock(owner);
-               queue_enter(&owner->task_objq, object, vm_object_t, task_objq);
-               task_objq_unlock(owner);
-       }
 
-#if DEBUG
-       OSBacktrace(&object->purgeable_owner_bt[0], 16);
-#endif /* DEBUG */
+       vm_object_ownership_change(object,
+                                  object->vo_ledger_tag, /* tag unchanged */
+                                  owner,
+                                  FALSE);      /* task_objq_locked */
 
-       page_count = object->resident_page_count;
-       if (owner != NULL && page_count != 0) {
-               ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_nonvolatile,
-                             ptoa(page_count));
-               ledger_credit(owner->ledger,
-                             task_ledgers.phys_footprint,
-                             ptoa(page_count));
-       }
-                     
        assert(object->objq.next == NULL);
        assert(object->objq.prev == NULL);
 
@@ -1524,9 +1525,6 @@ vm_purgeable_nonvolatile_enqueue(
        assert(purgeable_nonvolatile_count >= 0);
        purgeable_nonvolatile_count++;
        assert(purgeable_nonvolatile_count > 0);
-       /* one more nonvolatile object for this object's owner */
-       assert(object->vo_purgeable_owner == owner);
-       vm_purgeable_nonvolatile_owner_update(owner, +1);
        lck_mtx_unlock(&vm_purgeable_queue_lock);
 
        vm_object_lock_assert_exclusive(object);
@@ -1540,7 +1538,7 @@ vm_purgeable_nonvolatile_dequeue(
 
        vm_object_lock_assert_exclusive(object);
 
-       owner = object->vo_purgeable_owner;
+       owner = VM_OBJECT_OWNER(object);
 #if DEBUG
        assert(object->vo_purgeable_volatilizer == NULL);
 #endif /* DEBUG */
@@ -1549,10 +1547,14 @@ vm_purgeable_nonvolatile_dequeue(
                 * Update the owner's ledger to stop accounting
                 * for this object.
                 */
-               vm_purgeable_accounting(object,
-                                       object->purgable,
-                                       TRUE, /* disown */
-                                       FALSE); /* is task_objq locked? */
+               /* transfer ownership to the kernel */
+               assert(VM_OBJECT_OWNER(object) != kernel_task);
+               vm_object_ownership_change(
+                       object,
+                       object->vo_ledger_tag,  /* unchanged */
+                       VM_OBJECT_OWNER_DISOWNED, /* new owner */
+                       FALSE); /* old_owner->task_objq locked */
+               assert(object->vo_owner == VM_OBJECT_OWNER_DISOWNED);
        }
 
        lck_mtx_lock(&vm_purgeable_queue_lock);
@@ -1573,28 +1575,32 @@ vm_purgeable_nonvolatile_dequeue(
 void
 vm_purgeable_accounting(
        vm_object_t     object,
-       vm_purgable_t   old_state,
-       boolean_t       disown,
-       boolean_t       task_objq_locked)
+       vm_purgable_t   old_state)
 {
        task_t          owner;
        int             resident_page_count;
        int             wired_page_count;
        int             compressed_page_count;
-       boolean_t       disown_on_the_fly;
+       int             ledger_idx_volatile;
+       int             ledger_idx_nonvolatile;
+       int             ledger_idx_volatile_compressed;
+       int             ledger_idx_nonvolatile_compressed;
+       boolean_t       do_footprint;
 
        vm_object_lock_assert_exclusive(object);
+       assert(object->purgable != VM_PURGABLE_DENY);
 
-       owner = object->vo_purgeable_owner;
-       if (owner == NULL)
+       owner = VM_OBJECT_OWNER(object);
+       if (owner == NULL ||
+           object->purgable == VM_PURGABLE_DENY)
                return;
 
-       if (!disown && owner->task_purgeable_disowning) {
-               /* task is disowning its purgeable objects: help it */
-               disown_on_the_fly = TRUE;
-       } else {
-               disown_on_the_fly = FALSE;
-       }
+       vm_object_ledger_tag_ledgers(object,
+                                    &ledger_idx_volatile,
+                                    &ledger_idx_nonvolatile,
+                                    &ledger_idx_volatile_compressed,
+                                    &ledger_idx_nonvolatile_compressed,
+                                    &do_footprint);
 
        resident_page_count = object->resident_page_count;
        wired_page_count = object->wired_page_count;
@@ -1610,121 +1616,57 @@ vm_purgeable_accounting(
            old_state == VM_PURGABLE_EMPTY) {
                /* less volatile bytes in ledger */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_volatile,
-                            ptoa(resident_page_count - wired_page_count));
+                            ledger_idx_volatile,
+                            ptoa_64(resident_page_count - wired_page_count));
                /* less compressed volatile bytes in ledger */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_volatile_compressed,
-                            ptoa(compressed_page_count));
-
-               if (disown || !object->alive || object->terminating) {
-                       /* wired pages were accounted as "non-volatile"... */
-                       ledger_debit(owner->ledger,
-                                    task_ledgers.purgeable_nonvolatile,
-                                    ptoa(wired_page_count));
-                       /* ... and in phys_footprint */
-                       ledger_debit(owner->ledger,
-                                    task_ledgers.phys_footprint,
-                                    ptoa(wired_page_count));
-
-                       /* no more accounting for this dead object */
-                       if (! task_objq_locked) {
-                               task_objq_lock(owner);
-                       }
-                       if (!disown_on_the_fly &&
-                           (object->purgeable_queue_type ==
-                            PURGEABLE_Q_TYPE_MAX)) {
-                               /*
-                                * Not on a volatile queue:  must be empty
-                                * or emptying.
-                                */
-                               vm_purgeable_nonvolatile_owner_update(owner,-1);
-                       } else {
-                               /* on a volatile queue */
-                               vm_purgeable_volatile_owner_update(owner, -1);
-                       }
-                       task_objq_lock_assert_owned(owner);
-                       queue_remove(&owner->task_objq, object, vm_object_t, task_objq);
-                       object->vo_purgeable_owner = NULL;
-#if DEBUG
-                       object->vo_purgeable_volatilizer = NULL;
-#endif /* DEBUG */
-                       if (! task_objq_locked) {
-                               task_objq_unlock(owner);
-                       }
-                       return;
-               }
+                            ledger_idx_volatile_compressed,
+                            ptoa_64(compressed_page_count));
 
                /* more non-volatile bytes in ledger */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_nonvolatile,
-                             ptoa(resident_page_count - wired_page_count));
+                             ledger_idx_nonvolatile,
+                             ptoa_64(resident_page_count - wired_page_count));
                /* more compressed non-volatile bytes in ledger */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_nonvolatile_compressed,
-                             ptoa(compressed_page_count));
-               /* more footprint */
-               ledger_credit(owner->ledger,
-                             task_ledgers.phys_footprint,
-                             ptoa(resident_page_count
-                                  + compressed_page_count
-                                  - wired_page_count));
+                             ledger_idx_nonvolatile_compressed,
+                             ptoa_64(compressed_page_count));
+               if (do_footprint) {
+                       /* more footprint */
+                       ledger_credit(owner->ledger,
+                                     task_ledgers.phys_footprint,
+                                     ptoa_64(resident_page_count
+                                          + compressed_page_count
+                                          - wired_page_count));
+               }
 
        } else if (old_state == VM_PURGABLE_NONVOLATILE) {
 
                /* less non-volatile bytes in ledger */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_nonvolatile,
-                            ptoa(resident_page_count - wired_page_count));
+                            ledger_idx_nonvolatile,
+                            ptoa_64(resident_page_count - wired_page_count));
                /* less compressed non-volatile bytes in ledger */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_nonvolatile_compressed,
-                            ptoa(compressed_page_count));
-               /* less footprint */
-               ledger_debit(owner->ledger,
-                            task_ledgers.phys_footprint,
-                            ptoa(resident_page_count
-                                 + compressed_page_count
-                                 - wired_page_count));
-
-               if (disown || !object->alive || object->terminating) {
-                       /* wired pages still accounted as "non-volatile" */
-                       ledger_debit(owner->ledger,
-                                    task_ledgers.purgeable_nonvolatile,
-                                    ptoa(wired_page_count));
+                            ledger_idx_nonvolatile_compressed,
+                            ptoa_64(compressed_page_count));
+               if (do_footprint) {
+                       /* less footprint */
                        ledger_debit(owner->ledger,
                                     task_ledgers.phys_footprint,
-                                    ptoa(wired_page_count));
-
-                       /* no more accounting for this dead object */
-                       if (! task_objq_locked) {
-                               task_objq_lock(owner);
-                       }
-                       /* one less "non-volatile" object for the owner */
-                       if (!disown_on_the_fly) {
-                               assert(object->purgeable_queue_type ==
-                                      PURGEABLE_Q_TYPE_MAX);
-                       }
-                       vm_purgeable_nonvolatile_owner_update(owner, -1);
-                       task_objq_lock_assert_owned(owner);
-                       queue_remove(&owner->task_objq, object, vm_object_t, task_objq);
-                       object->vo_purgeable_owner = NULL;
-#if DEBUG
-                       object->vo_purgeable_volatilizer = NULL;
-#endif /* DEBUG */
-                       if (! task_objq_locked) {
-                               task_objq_unlock(owner);
-                       }
-                       return;
+                                    ptoa_64(resident_page_count
+                                         + compressed_page_count
+                                         - wired_page_count));
                }
+
                /* more volatile bytes in ledger */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_volatile,
-                             ptoa(resident_page_count - wired_page_count));
+                             ledger_idx_volatile,
+                             ptoa_64(resident_page_count - wired_page_count));
                /* more compressed volatile bytes in ledger */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_volatile_compressed,
-                             ptoa(compressed_page_count));
+                             ledger_idx_volatile_compressed,
+                             ptoa_64(compressed_page_count));
        } else {
                panic("vm_purgeable_accounting(%p): "
                      "unexpected old_state=%d\n",
@@ -1775,53 +1717,72 @@ vm_purgeable_volatile_owner_update(
 }
 
 void
-vm_purgeable_compressed_update(
+vm_object_owner_compressed_update(
        vm_object_t     object,
        int             delta)
 {
-       task_t  owner;
+       task_t          owner;
+       int             ledger_idx_volatile;
+       int             ledger_idx_nonvolatile;
+       int             ledger_idx_volatile_compressed;
+       int             ledger_idx_nonvolatile_compressed;
+       boolean_t       do_footprint;
 
        vm_object_lock_assert_exclusive(object);
 
+       owner = VM_OBJECT_OWNER(object);
+
        if (delta == 0 ||
            !object->internal ||
-           object->purgable == VM_PURGABLE_DENY ||
-           object->vo_purgeable_owner == NULL) {
-               /* not an owned purgeable VM object: nothing to update */
+           (object->purgable == VM_PURGABLE_DENY &&
+            ! object->vo_ledger_tag) ||
+           owner == NULL) {
+               /* not an owned purgeable (or tagged) VM object: nothing to update */
                return;
        }
        
-       owner = object->vo_purgeable_owner;
+       vm_object_ledger_tag_ledgers(object,
+                                    &ledger_idx_volatile,
+                                    &ledger_idx_nonvolatile,
+                                    &ledger_idx_volatile_compressed,
+                                    &ledger_idx_nonvolatile_compressed,
+                                    &do_footprint);
        switch (object->purgable) {
        case VM_PURGABLE_DENY:
-               break;
+               /* not purgeable: must be ledger-tagged */
+               assert(object->vo_ledger_tag != VM_OBJECT_LEDGER_TAG_NONE);
+               /* fallthru */
        case VM_PURGABLE_NONVOLATILE:
                if (delta > 0) {
                        ledger_credit(owner->ledger,
-                                     task_ledgers.purgeable_nonvolatile_compressed,
-                                     ptoa(delta));
-                       ledger_credit(owner->ledger,
-                                     task_ledgers.phys_footprint,
-                                     ptoa(delta));
+                                     ledger_idx_nonvolatile_compressed,
+                                     ptoa_64(delta));
+                       if (do_footprint) {
+                               ledger_credit(owner->ledger,
+                                             task_ledgers.phys_footprint,
+                                             ptoa_64(delta));
+                       }
                } else {
                        ledger_debit(owner->ledger,
-                                    task_ledgers.purgeable_nonvolatile_compressed,
-                                    ptoa(-delta));
-                       ledger_debit(owner->ledger,
-                                    task_ledgers.phys_footprint,
-                                    ptoa(-delta));
+                                    ledger_idx_nonvolatile_compressed,
+                                    ptoa_64(-delta));
+                       if (do_footprint) {
+                               ledger_debit(owner->ledger,
+                                            task_ledgers.phys_footprint,
+                                            ptoa_64(-delta));
+                       }
                }
                break;
        case VM_PURGABLE_VOLATILE:
        case VM_PURGABLE_EMPTY:
                if (delta > 0) {
                        ledger_credit(owner->ledger,
-                                     task_ledgers.purgeable_volatile_compressed,
-                                     ptoa(delta));
+                                     ledger_idx_volatile_compressed,
+                                     ptoa_64(delta));
                } else {
                        ledger_debit(owner->ledger,
-                                    task_ledgers.purgeable_volatile_compressed,
-                                    ptoa(-delta));
+                                    ledger_idx_volatile_compressed,
+                                    ptoa_64(-delta));
                }
                break;
        default:
index 010c2ee227e622663690c7c8976cd86fadf6132b..5015ada14235cb2e052d6550adce855e02dbf87f 100644 (file)
@@ -123,11 +123,9 @@ uint64_t vm_purgeable_purge_task_owned(task_t task);
 void vm_purgeable_nonvolatile_enqueue(vm_object_t object, task_t task);
 void vm_purgeable_nonvolatile_dequeue(vm_object_t object);
 void vm_purgeable_accounting(vm_object_t       object,
-                            vm_purgable_t      old_state,
-                            boolean_t          disown,
-                            boolean_t          task_objq_locked);
-void vm_purgeable_compressed_update(vm_object_t        object,
-                                   int         delta);
+                            vm_purgable_t      old_state);
+void vm_object_owner_compressed_update(vm_object_t     object,
+                                      int              delta);
 
 #define PURGEABLE_LOOP_MAX 64
 
index b34f1b2d505715f45977528c534b303631cb69b1..748c427546b09e9f0ff73ee29c2bfa5cf0478fc1 100644 (file)
 #include <sys/kdebug.h>
 
 
+
 char   vm_page_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
 char   vm_page_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
 char   vm_page_non_speculative_pageable_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
@@ -113,6 +114,7 @@ char        vm_page_active_or_inactive_states[VM_PAGE_Q_STATE_ARRAY_SIZE];
 
 #if CONFIG_SECLUDED_MEMORY
 struct vm_page_secluded_data vm_page_secluded;
+void secluded_suppression_init(void);
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 boolean_t      hibernate_cleaning_in_progress = FALSE;
@@ -329,8 +331,6 @@ vm_locks_array_t vm_page_locks;
 decl_lck_mtx_data(,vm_page_alloc_lock)
 lck_mtx_ext_t vm_page_alloc_lock_ext;
 
-unsigned int io_throttle_zero_fill;
-
 unsigned int   vm_page_local_q_count = 0;
 unsigned int   vm_page_local_q_soft_limit = 250;
 unsigned int   vm_page_local_q_hard_limit = 500;
@@ -377,9 +377,12 @@ vm_page_queue_head_t       vm_page_queue_throttled __attribute__((aligned(VM_PACKED_PO
 
 queue_head_t   vm_objects_wired;
 
+void vm_update_darkwake_mode(boolean_t);
+
 #if CONFIG_BACKGROUND_QUEUE
 vm_page_queue_head_t   vm_page_queue_background __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
 uint32_t       vm_page_background_target;
+uint32_t       vm_page_background_target_snapshot;
 uint32_t       vm_page_background_count;
 uint64_t       vm_page_background_promoted_count;
 
@@ -430,7 +433,6 @@ unsigned int        vm_page_speculative_used = 0;
 vm_page_queue_head_t    vm_page_queue_cleaned __attribute__((aligned(VM_PACKED_POINTER_ALIGNMENT)));
 
 unsigned int   vm_page_cleaned_count = 0;
-unsigned int   vm_pageout_enqueued_cleaned = 0;
 
 uint64_t       max_valid_dma_address = 0xffffffffffffffffULL;
 ppnum_t                max_valid_low_ppnum = 0xffffffff;
@@ -450,9 +452,7 @@ unsigned int        vm_page_inactive_target = 0;
 unsigned int   vm_page_secluded_target = 0;
 #endif /* CONFIG_SECLUDED_MEMORY */
 unsigned int   vm_page_anonymous_min = 0;
-unsigned int   vm_page_inactive_min = 0;
 unsigned int   vm_page_free_reserved = 0;
-unsigned int   vm_page_throttle_count = 0;
 
 
 /*
@@ -687,57 +687,56 @@ vm_page_bootstrap(
        bzero(m, sizeof (*m));
 
 #if CONFIG_BACKGROUND_QUEUE
-       m->vm_page_backgroundq.next = 0;
-       m->vm_page_backgroundq.prev = 0;
-       m->vm_page_in_background = FALSE;
-       m->vm_page_on_backgroundq = FALSE;
+       m->vmp_backgroundq.next = 0;
+       m->vmp_backgroundq.prev = 0;
+       m->vmp_in_background = FALSE;
+       m->vmp_on_backgroundq = FALSE;
 #endif
 
        VM_PAGE_ZERO_PAGEQ_ENTRY(m);
-       m->listq.next = 0;
-       m->listq.prev = 0;
-       m->next_m = 0;
+       m->vmp_listq.next = 0;
+       m->vmp_listq.prev = 0;
+       m->vmp_next_m = 0;
 
-       m->vm_page_object = 0;                  /* reset later */
-       m->offset = (vm_object_offset_t) -1;    /* reset later */
+       m->vmp_object = 0;                      /* reset later */
+       m->vmp_offset = (vm_object_offset_t) -1;        /* reset later */
 
-       m->wire_count = 0;
-       m->vm_page_q_state = VM_PAGE_NOT_ON_Q;
-       m->laundry = FALSE;
-       m->reference = FALSE;
-       m->gobbled = FALSE;
-       m->private = FALSE;
-       m->__unused_pageq_bits = 0;
+       m->vmp_wire_count = 0;
+       m->vmp_q_state = VM_PAGE_NOT_ON_Q;
+       m->vmp_laundry = FALSE;
+       m->vmp_reference = FALSE;
+       m->vmp_gobbled = FALSE;
+       m->vmp_private = FALSE;
+       m->vmp_unused_page_bits = 0;
 
 #if    !defined(__arm__) && !defined(__arm64__)
        VM_PAGE_SET_PHYS_PAGE(m, 0);            /* reset later */
 #endif
-       m->busy = TRUE;
-       m->wanted = FALSE;
-       m->tabled = FALSE;
-       m->hashed = FALSE;
-       m->fictitious = FALSE;
-       m->pmapped = FALSE;
-       m->wpmapped = FALSE;
-       m->free_when_done = FALSE;
-       m->absent = FALSE;
-       m->error = FALSE;
-       m->dirty = FALSE;
-       m->cleaning = FALSE;
-       m->precious = FALSE;
-       m->clustered = FALSE;
-       m->overwriting = FALSE;
-       m->restart = FALSE;
-       m->unusual = FALSE;
-       m->cs_validated = FALSE;
-       m->cs_tainted = FALSE;
-       m->cs_nx = FALSE;
-       m->no_cache = FALSE;
-       m->reusable = FALSE;
-       m->slid = FALSE;
-       m->xpmapped = FALSE;
-       m->written_by_kernel = FALSE;
-       m->__unused_object_bits = 0;
+       m->vmp_busy = TRUE;
+       m->vmp_wanted = FALSE;
+       m->vmp_tabled = FALSE;
+       m->vmp_hashed = FALSE;
+       m->vmp_fictitious = FALSE;
+       m->vmp_pmapped = FALSE;
+       m->vmp_wpmapped = FALSE;
+       m->vmp_free_when_done = FALSE;
+       m->vmp_absent = FALSE;
+       m->vmp_error = FALSE;
+       m->vmp_dirty = FALSE;
+       m->vmp_cleaning = FALSE;
+       m->vmp_precious = FALSE;
+       m->vmp_clustered = FALSE;
+       m->vmp_overwriting = FALSE;
+       m->vmp_restart = FALSE;
+       m->vmp_unusual = FALSE;
+       m->vmp_cs_validated = FALSE;
+       m->vmp_cs_tainted = FALSE;
+       m->vmp_cs_nx = FALSE;
+       m->vmp_no_cache = FALSE;
+       m->vmp_reusable = FALSE;
+       m->vmp_xpmapped = FALSE;
+       m->vmp_written_by_kernel = FALSE;
+       m->vmp_unused_object_bits = 0;
 
        /*
         *      Initialize the page queues.
@@ -1112,6 +1111,7 @@ int secluded_for_filecache = 2;           /* filecache can use seclude memory */
 #if 11
 int secluded_for_fbdp = 0;
 #endif
+uint64_t secluded_shutoff_trigger = 0;
 #endif /* CONFIG_SECLUDED_MEMORY */
 
 
@@ -1238,28 +1238,52 @@ pmap_startup(
                           &secluded_for_fbdp,
                           sizeof (secluded_for_fbdp));
 #endif
-#endif /* CONFIG_SECLUDED_MEMORY */
 
-       // -debug code remove
-       if (2 == vm_himemory_mode) {
-               // free low -> high so high is preferred
-               for (i = 1; i <= pages_initialized; i++) {
-                       if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval);            /* Fill the page with a know value if requested at boot */
-                       vm_page_release_startup(&vm_pages[i - 1]);
-               }
+       /*
+        * On small devices, allow a large app to effectively suppress
+        * secluded memory until it exits.
+        */
+       if (max_mem <= 1 * 1024 * 1024 * 1024 && vm_page_secluded_target != 0) {
+
+               /*
+                * Get an amount from boot-args, else use 500MB.
+                * 500MB was chosen from a Peace daemon tentpole test which used munch
+                * to induce jetsam thrashing of false idle daemons.
+                */
+               int secluded_shutoff_mb;
+               if (PE_parse_boot_argn("secluded_shutoff_mb", &secluded_shutoff_mb,
+                   sizeof (secluded_shutoff_mb)))
+                       secluded_shutoff_trigger = (uint64_t)secluded_shutoff_mb * 1024 * 1024;
+               else
+                       secluded_shutoff_trigger = 500 * 1024 * 1024;
+
+               if (secluded_shutoff_trigger != 0)
+                       secluded_suppression_init();
        }
-       else
-       // debug code remove-
+
+#endif /* CONFIG_SECLUDED_MEMORY */
 
        /*
-        * Release pages in reverse order so that physical pages
+        * By default release pages in reverse order so that physical pages
         * initially get allocated in ascending addresses. This keeps
         * the devices (which must address physical memory) happy if
         * they require several consecutive pages.
+        *
+        * For debugging, you can reverse this ordering and/or fill
+        * all pages with a known value.
         */
-       for (i = pages_initialized; i > 0; i--) {
-               if(fill) fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i - 1]), fillval);            /* Fill the page with a know value if requested at boot */                      
-               vm_page_release_startup(&vm_pages[i - 1]);
+       if (vm_himemory_mode == 2) {
+               for (i = 0; i < pages_initialized; i++) {
+                       if (fill)
+                               fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]), fillval);
+                       vm_page_release_startup(&vm_pages[i]);
+               }
+       } else {
+               for (i = pages_initialized; i-- > 0; ) {
+                       if (fill)
+                               fillPage(VM_PAGE_GET_PHYS_PAGE(&vm_pages[i]), fillval);
+                       vm_page_release_startup(&vm_pages[i]);
+               }
        }
 
        VM_CHECK_MEMORYSTATUS;
@@ -1276,7 +1300,7 @@ pmap_startup(
                        queue_iterate(&vm_page_queue_free[i].qhead,
                                      xx,
                                      vm_page_t,
-                                     pageq) {  /* BRINGUP */
+                                     vmp_pageq) {      /* BRINGUP */
                                j++;                                                                                            /* (BRINGUP) */
                                if(j > vm_page_free_count) {                                            /* (BRINGUP) */
                                        panic("pmap_startup: too many pages, xx = %08X, xxl = %08X\n", xx, xxl);
@@ -1388,7 +1412,7 @@ vm_page_create(
                        == VM_PAGE_NULL)
                        vm_page_more_fictitious();
 
-               m->fictitious = FALSE;
+               m->vmp_fictitious = FALSE;
                pmap_clear_noencrypt(phys_page);
 
                vm_page_pages++;
@@ -1451,6 +1475,11 @@ vm_page_insert_internal(
        lck_spin_t              *bucket_lock;
        int                     hash_id;
        task_t                  owner;
+       int                     ledger_idx_volatile;
+       int                     ledger_idx_nonvolatile;
+       int                     ledger_idx_volatile_compressed;
+       int                     ledger_idx_nonvolatile_compressed;
+       boolean_t               do_footprint;
 
         XPR(XPR_VM_PAGE,
                 "vm_page_insert, object 0x%X offset 0x%X page 0x%X\n",
@@ -1465,7 +1494,7 @@ vm_page_insert_internal(
 
        assert(page_aligned(offset));
 
-       assert(!VM_PAGE_WIRED(mem) || mem->private || mem->fictitious || (tag != VM_KERN_MEMORY_NONE));
+       assert(!VM_PAGE_WIRED(mem) || mem->vmp_private || mem->vmp_fictitious || (tag != VM_KERN_MEMORY_NONE));
 
        /* the vm_submap_object is only a placeholder for submaps */
        assert(object != vm_submap_object);
@@ -1480,10 +1509,10 @@ vm_page_insert_internal(
 
        if (insert_in_hash == TRUE) {
 #if DEBUG || VM_PAGE_CHECK_BUCKETS
-               if (mem->tabled || mem->vm_page_object)
+               if (mem->vmp_tabled || mem->vmp_object)
                        panic("vm_page_insert: page %p for (obj=%p,off=0x%llx) "
                              "already in (obj=%p,off=0x%llx)",
-                             mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset);
+                             mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
 #endif
                if (object->internal && (offset >= object->vo_size)) {
                        panic("vm_page_insert_internal: (page=%p,obj=%p,off=0x%llx,size=0x%llx) inserted at offset past object bounds",
@@ -1496,8 +1525,8 @@ vm_page_insert_internal(
                 *      Record the object/offset pair in this page
                 */
 
-               mem->vm_page_object = VM_PAGE_PACK_OBJECT(object);
-               mem->offset = offset;
+               mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
+               mem->vmp_offset = offset;
 
 #if CONFIG_SECLUDED_MEMORY
                if (object->eligible_for_secluded) {
@@ -1514,7 +1543,7 @@ vm_page_insert_internal(
        
                lck_spin_lock(bucket_lock);
 
-               mem->next_m = bucket->page_list;
+               mem->vmp_next_m = bucket->page_list;
                bucket->page_list = VM_PAGE_PACK_PTR(mem);
                assert(mem == (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)));
 
@@ -1522,7 +1551,7 @@ vm_page_insert_internal(
                if (++bucket->cur_count > bucket->hi_count)
                        bucket->hi_count = bucket->cur_count;
 #endif /* MACH_PAGE_HASH_STATS */
-               mem->hashed = TRUE;
+               mem->vmp_hashed = TRUE;
                lck_spin_unlock(bucket_lock);
        }
 
@@ -1538,9 +1567,9 @@ vm_page_insert_internal(
        /*
         *      Now link into the object's list of backed pages.
         */
-       vm_page_queue_enter(&object->memq, mem, vm_page_t, listq);
+       vm_page_queue_enter(&object->memq, mem, vm_page_t, vmp_listq);
        object->memq_hint = mem;
-       mem->tabled = TRUE;
+       mem->vmp_tabled = TRUE;
 
        /*
         *      Show that the object has one more resident page.
@@ -1548,7 +1577,7 @@ vm_page_insert_internal(
 
        object->resident_page_count++;
        if (VM_PAGE_WIRED(mem)) {
-           assert(mem->wire_count > 0);
+           assert(mem->vmp_wire_count > 0);
            VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
            VM_OBJECT_WIRED_PAGE_ADD(object, mem);
            VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
@@ -1575,18 +1604,26 @@ vm_page_insert_internal(
         * a different physical page during a physically-contiguous
         * allocation.
         */
-       assert(!mem->reusable);
+       assert(!mem->vmp_reusable);
        if (object->all_reusable) {
                OSAddAtomic(+1, &vm_page_stats_reusable.reusable_count);
        }
 
-       if (object->purgable == VM_PURGABLE_DENY) {
+       if (object->purgable == VM_PURGABLE_DENY &&
+           ! object->vo_ledger_tag) {
                owner = TASK_NULL;
        } else {
-               owner = object->vo_purgeable_owner;
+               owner = VM_OBJECT_OWNER(object);
+               vm_object_ledger_tag_ledgers(object,
+                                            &ledger_idx_volatile,
+                                            &ledger_idx_nonvolatile,
+                                            &ledger_idx_volatile_compressed,
+                                            &ledger_idx_nonvolatile_compressed,
+                                            &do_footprint);
        }
        if (owner &&
            (object->purgable == VM_PURGABLE_NONVOLATILE ||
+            object->purgable == VM_PURGABLE_DENY ||
             VM_PAGE_WIRED(mem))) {
 
                if (delayed_ledger_update)
@@ -1594,12 +1631,14 @@ vm_page_insert_internal(
                else {
                        /* more non-volatile bytes */
                        ledger_credit(owner->ledger,
-                                     task_ledgers.purgeable_nonvolatile,
-                                     PAGE_SIZE);
-                       /* more footprint */
-                       ledger_credit(owner->ledger,
-                                     task_ledgers.phys_footprint,
+                                     ledger_idx_nonvolatile,
                                      PAGE_SIZE);
+                       if (do_footprint) {
+                               /* more footprint */
+                               ledger_credit(owner->ledger,
+                                             task_ledgers.phys_footprint,
+                                             PAGE_SIZE);
+                       }
                }
 
        } else if (owner &&
@@ -1608,7 +1647,7 @@ vm_page_insert_internal(
                assert(! VM_PAGE_WIRED(mem));
                /* more volatile bytes */
                ledger_credit(owner->ledger,
-                             task_ledgers.purgeable_volatile,
+                             ledger_idx_volatile,
                              PAGE_SIZE);
        }
 
@@ -1619,7 +1658,7 @@ vm_page_insert_internal(
                        OSAddAtomic(+1, &vm_page_purgeable_count);
                }
        } else if (object->purgable == VM_PURGABLE_EMPTY &&
-                  mem->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q) {
+                  mem->vmp_q_state == VM_PAGE_ON_THROTTLED_Q) {
                /*
                 * This page belongs to a purged VM object but hasn't
                 * been purged (because it was "busy").
@@ -1683,10 +1722,10 @@ vm_page_replace(
 #endif
        vm_object_lock_assert_exclusive(object);
 #if DEBUG || VM_PAGE_CHECK_BUCKETS
-       if (mem->tabled || mem->vm_page_object)
+       if (mem->vmp_tabled || mem->vmp_object)
                panic("vm_page_replace: page %p for (obj=%p,off=0x%llx) "
                      "already in (obj=%p,off=0x%llx)",
-                     mem, object, offset, VM_PAGE_OBJECT(mem), mem->offset);
+                     mem, object, offset, VM_PAGE_OBJECT(mem), mem->vmp_offset);
 #endif
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
 
@@ -1695,8 +1734,8 @@ vm_page_replace(
        /*
         *      Record the object/offset pair in this page
         */
-       mem->vm_page_object = VM_PAGE_PACK_OBJECT(object);
-       mem->offset = offset;
+       mem->vmp_object = VM_PAGE_PACK_OBJECT(object);
+       mem->vmp_offset = offset;
 
        /*
         *      Insert it into the object_object/offset hash table,
@@ -1717,29 +1756,29 @@ vm_page_replace(
                        /*
                         * compare packed object pointers
                         */
-                       if (m->vm_page_object == mem->vm_page_object && m->offset == offset) {
+                       if (m->vmp_object == mem->vmp_object && m->vmp_offset == offset) {
                                /*
                                 * Remove old page from hash list
                                 */
-                               *mp = m->next_m;
-                               m->hashed = FALSE;
-                               m->next_m = VM_PAGE_PACK_PTR(NULL);
+                               *mp = m->vmp_next_m;
+                               m->vmp_hashed = FALSE;
+                               m->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
 
                                found_m = m;
                                break;
                        }
-                       mp = &m->next_m;
+                       mp = &m->vmp_next_m;
                } while ((m = (vm_page_t)(VM_PAGE_UNPACK_PTR(*mp))));
 
-               mem->next_m = bucket->page_list;
+               mem->vmp_next_m = bucket->page_list;
        } else {
-               mem->next_m = VM_PAGE_PACK_PTR(NULL);
+               mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
        }
        /*
         * insert new page at head of hash list
         */
        bucket->page_list = VM_PAGE_PACK_PTR(mem);
-       mem->hashed = TRUE;
+       mem->vmp_hashed = TRUE;
 
        lck_spin_unlock(bucket_lock);
 
@@ -1774,18 +1813,23 @@ vm_page_remove(
        int             hash_id;
        task_t          owner;
        vm_object_t     m_object;
+       int             ledger_idx_volatile;
+       int             ledger_idx_nonvolatile;
+       int             ledger_idx_volatile_compressed;
+       int             ledger_idx_nonvolatile_compressed;
+       int             do_footprint;
 
        m_object = VM_PAGE_OBJECT(mem);
 
         XPR(XPR_VM_PAGE,
                 "vm_page_remove, object 0x%X offset 0x%X page 0x%X\n",
-                m_object, mem->offset, 
+                m_object, mem->vmp_offset, 
                mem, 0,0);
 
        vm_object_lock_assert_exclusive(m_object);
-       assert(mem->tabled);
-       assert(!mem->cleaning);
-       assert(!mem->laundry);
+       assert(mem->vmp_tabled);
+       assert(!mem->vmp_cleaning);
+       assert(!mem->vmp_laundry);
 
        if (VM_PAGE_PAGEABLE(mem)) {
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
@@ -1801,7 +1845,7 @@ vm_page_remove(
                /*
                 *      Remove from the object_object/offset hash table
                 */
-               hash_id = vm_page_hash(m_object, mem->offset);
+               hash_id = vm_page_hash(m_object, mem->vmp_offset);
                bucket = &vm_page_buckets[hash_id];
                bucket_lock = &vm_page_bucket_locks[hash_id / BUCKETS_PER_LOCK];
 
@@ -1810,21 +1854,21 @@ vm_page_remove(
                if ((this = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list))) == mem) {
                        /* optimize for common case */
 
-                       bucket->page_list = mem->next_m;
+                       bucket->page_list = mem->vmp_next_m;
                } else {
                        vm_page_packed_t        *prev;
 
-                       for (prev = &this->next_m;
+                       for (prev = &this->vmp_next_m;
                             (this = (vm_page_t)(VM_PAGE_UNPACK_PTR(*prev))) != mem;
-                            prev = &this->next_m)
+                            prev = &this->vmp_next_m)
                                continue;
-                       *prev = this->next_m;
+                       *prev = this->vmp_next_m;
                }
 #if     MACH_PAGE_HASH_STATS
                bucket->cur_count--;
 #endif /* MACH_PAGE_HASH_STATS */
-               mem->hashed = FALSE;
-               this->next_m = VM_PAGE_PACK_PTR(NULL);
+               mem->vmp_hashed = FALSE;
+               this->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
                lck_spin_unlock(bucket_lock);
        }
        /*
@@ -1851,30 +1895,32 @@ vm_page_remove(
                assert(vm_page_external_count);
                OSAddAtomic(-1, &vm_page_external_count);
 
-               if (mem->xpmapped) {
+               if (mem->vmp_xpmapped) {
                        assert(vm_page_xpmapped_external_count);
                        OSAddAtomic(-1, &vm_page_xpmapped_external_count);
                }
        }
-       if (!m_object->internal && (m_object->objq.next || m_object->objq.prev)) {
+       if (!m_object->internal &&
+           m_object->cached_list.next &&
+           m_object->cached_list.prev) {
                if (m_object->resident_page_count == 0)
                        vm_object_cache_remove(m_object);
        }
 
        if (VM_PAGE_WIRED(mem)) {
-               assert(mem->wire_count > 0);
+               assert(mem->vmp_wire_count > 0);
                VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
                VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
                VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
        }
        assert(m_object->resident_page_count >=
               m_object->wired_page_count);
-       if (mem->reusable) {
+       if (mem->vmp_reusable) {
                assert(m_object->reusable_page_count > 0);
                m_object->reusable_page_count--;
                assert(m_object->reusable_page_count <=
                       m_object->resident_page_count);
-               mem->reusable = FALSE;
+               mem->vmp_reusable = FALSE;
                OSAddAtomic(-1, &vm_page_stats_reusable.reusable_count);
                vm_page_stats_reusable.reused_remove++;
        } else if (m_object->all_reusable) {
@@ -1882,29 +1928,39 @@ vm_page_remove(
                vm_page_stats_reusable.reused_remove++;
        }
 
-       if (m_object->purgable == VM_PURGABLE_DENY) {
+       if (m_object->purgable == VM_PURGABLE_DENY &&
+           ! m_object->vo_ledger_tag) {
                owner = TASK_NULL;
        } else {
-               owner = m_object->vo_purgeable_owner;
+               owner = VM_OBJECT_OWNER(m_object);
+               vm_object_ledger_tag_ledgers(m_object,
+                                            &ledger_idx_volatile,
+                                            &ledger_idx_nonvolatile,
+                                            &ledger_idx_volatile_compressed,
+                                            &ledger_idx_nonvolatile_compressed,
+                                            &do_footprint);
        }
        if (owner &&
            (m_object->purgable == VM_PURGABLE_NONVOLATILE ||
+            m_object->purgable == VM_PURGABLE_DENY ||
             VM_PAGE_WIRED(mem))) {
                /* less non-volatile bytes */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_nonvolatile,
-                            PAGE_SIZE);
-               /* less footprint */
-               ledger_debit(owner->ledger,
-                            task_ledgers.phys_footprint,
+                            ledger_idx_nonvolatile,
                             PAGE_SIZE);
+               if (do_footprint) {
+                       /* less footprint */
+                       ledger_debit(owner->ledger,
+                                    task_ledgers.phys_footprint,
+                                    PAGE_SIZE);
+               }
        } else if (owner &&
                   (m_object->purgable == VM_PURGABLE_VOLATILE ||
                    m_object->purgable == VM_PURGABLE_EMPTY)) {
                assert(! VM_PAGE_WIRED(mem));
                /* less volatile bytes */
                ledger_debit(owner->ledger,
-                            task_ledgers.purgeable_volatile,
+                            ledger_idx_volatile,
                             PAGE_SIZE);
        }
        if (m_object->purgable == VM_PURGABLE_VOLATILE) {
@@ -1920,9 +1976,9 @@ vm_page_remove(
        if (m_object->set_cache_attr == TRUE)
                pmap_set_cache_attributes(VM_PAGE_GET_PHYS_PAGE(mem), 0);
 
-       mem->tabled = FALSE;
-       mem->vm_page_object = 0;
-       mem->offset = (vm_object_offset_t) -1;
+       mem->vmp_tabled = FALSE;
+       mem->vmp_object = 0;
+       mem->vmp_offset = (vm_object_offset_t) -1;
 }
 
 
@@ -1971,8 +2027,8 @@ kdp_vm_page_lookup(
                panic("panic: kdp_vm_page_lookup done outside of kernel debugger");
        }
 
-       vm_page_queue_iterate(&object->memq, cur_page, vm_page_t, listq) {
-               if (cur_page->offset == offset) {
+       vm_page_queue_iterate(&object->memq, cur_page, vm_page_t, vmp_listq) {
+               if (cur_page->vmp_offset == offset) {
                        return cur_page;
                }
                num_traversed++;
@@ -2014,13 +2070,13 @@ vm_page_lookup(
        if (mem != VM_PAGE_NULL) {
                assert(VM_PAGE_OBJECT(mem) == object);
 
-               if (mem->offset == offset) {
+               if (mem->vmp_offset == offset) {
 #if DEBUG_VM_PAGE_LOOKUP
                        OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint);
 #endif
                        return (mem);
                }
-               qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->listq);
+               qe = (vm_page_queue_entry_t)vm_page_queue_next(&mem->vmp_listq);
 
                if (! vm_page_queue_end(&object->memq, qe)) {
                        vm_page_t       next_page;
@@ -2028,7 +2084,7 @@ vm_page_lookup(
                        next_page = (vm_page_t)((uintptr_t)qe);
                        assert(VM_PAGE_OBJECT(next_page) == object);
 
-                       if (next_page->offset == offset) {
+                       if (next_page->vmp_offset == offset) {
                                object->memq_hint = next_page; /* new hint */
 #if DEBUG_VM_PAGE_LOOKUP
                                OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_next);
@@ -2036,7 +2092,7 @@ vm_page_lookup(
                                return (next_page);
                        }
                }
-               qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->listq);
+               qe = (vm_page_queue_entry_t)vm_page_queue_prev(&mem->vmp_listq);
 
                if (! vm_page_queue_end(&object->memq, qe)) {
                        vm_page_t prev_page;
@@ -2044,7 +2100,7 @@ vm_page_lookup(
                        prev_page = (vm_page_t)((uintptr_t)qe);
                        assert(VM_PAGE_OBJECT(prev_page) == object);
 
-                       if (prev_page->offset == offset) {
+                       if (prev_page->vmp_offset == offset) {
                                object->memq_hint = prev_page; /* new hint */
 #if DEBUG_VM_PAGE_LOOKUP
                                OSAddAtomic64(1, &vm_page_lookup_stats.vpl_hit_hint_prev);
@@ -2086,10 +2142,10 @@ vm_page_lookup(
 
                while (!vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem)) {
 
-                       if (mem->offset == offset)
+                       if (mem->vmp_offset == offset)
                                break;
 
-                       mem = (vm_page_t)vm_page_queue_next(&mem->listq);
+                       mem = (vm_page_t)vm_page_queue_next(&mem->vmp_listq);
                }
                if (vm_page_queue_end(&object->memq, (vm_page_queue_entry_t)mem))
                        mem = NULL;
@@ -2104,7 +2160,7 @@ vm_page_lookup(
 
                for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
                     mem != VM_PAGE_NULL;
-                    mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m))) {
+                    mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m))) {
 #if 0
                        /*
                         * we don't hold the page queue lock
@@ -2112,7 +2168,7 @@ vm_page_lookup(
                         */
                        VM_PAGE_CHECK(mem);
 #endif
-                       if ((mem->vm_page_object == packed_object) && (mem->offset == offset))
+                       if ((mem->vmp_object == packed_object) && (mem->vmp_offset == offset))
                                break;
                }
                lck_spin_unlock(bucket_lock);
@@ -2171,7 +2227,7 @@ vm_page_rename(
                mem, 0,0);
 
        /*
-        *      Changes to mem->object require the page lock because
+        *      Changes to mem->vmp_object require the page lock because
         *      the pageout daemon uses that lock to get the object.
         */
        vm_page_lockspin_queues();
@@ -2179,7 +2235,7 @@ vm_page_rename(
        internal_to_external = FALSE;
        external_to_internal = FALSE;
 
-       if (mem->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
+       if (mem->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q) {
                /*
                 * it's much easier to get the vm_page_pageable_xxx accounting correct
                 * if we first move the page to the active queue... it's going to end
@@ -2257,7 +2313,7 @@ vm_page_init(
         */
        pmap_clear_refmod(phys_page, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
 #endif
-       mem->lopage = lopage;
+       mem->vmp_lopage = lopage;
 }
 
 /*
@@ -2280,7 +2336,7 @@ vm_page_grab_fictitious_common(
        if ((m = (vm_page_t)zget(vm_page_zone))) {
 
                vm_page_init(m, phys_addr, FALSE);
-               m->fictitious = TRUE;
+               m->vmp_fictitious = TRUE;
 
                c_vm_page_grab_fictitious++;
        } else
@@ -2317,8 +2373,8 @@ void
 vm_page_release_fictitious(
        vm_page_t m)
 {
-       assert((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) || (m->vm_page_q_state == VM_PAGE_IS_WIRED));
-       assert(m->fictitious);
+       assert((m->vmp_q_state == VM_PAGE_NOT_ON_Q) || (m->vmp_q_state == VM_PAGE_IS_WIRED));
+       assert(m->vmp_fictitious);
        assert(VM_PAGE_GET_PHYS_PAGE(m) == vm_page_fictitious_addr ||
               VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr);
 
@@ -2421,6 +2477,58 @@ vm_pool_low(void)
        return( vm_page_free_count <= vm_page_free_reserved );
 }
 
+boolean_t vm_darkwake_mode = FALSE;
+
+/*
+ * vm_update_darkwake_mode():
+ *
+ * Tells the VM that the system is in / out of darkwake.
+ *
+ * Today, the VM only lowers/raises the background queue target
+ * so as to favor consuming more/fewer background pages when
+ * darkwake is ON/OFF.
+ *
+ * We might need to do more things in the future.
+ */
+
+void
+vm_update_darkwake_mode(boolean_t darkwake_mode)
+{
+       LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
+
+       vm_page_lockspin_queues();
+
+       if (vm_darkwake_mode == darkwake_mode) {
+               /*
+                * No change.
+                */
+               vm_page_unlock_queues();
+               return;
+       }
+
+       vm_darkwake_mode = darkwake_mode;
+
+       if (vm_darkwake_mode == TRUE) {
+#if CONFIG_BACKGROUND_QUEUE
+
+               /* save background target to restore later */
+               vm_page_background_target_snapshot = vm_page_background_target;
+
+               /* target is set to 0...no protection for background pages */
+               vm_page_background_target = 0;
+
+#endif /* CONFIG_BACKGROUND_QUEUE */
+
+       } else if (vm_darkwake_mode == FALSE) {
+#if CONFIG_BACKGROUND_QUEUE
+
+               if (vm_page_background_target_snapshot) {
+                       vm_page_background_target = vm_page_background_target_snapshot;
+               }
+#endif /* CONFIG_BACKGROUND_QUEUE */
+       }
+       vm_page_unlock_queues();
+}
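
vm_update_darkwake_mode() is an idempotent toggle: on entry to darkwake it snapshots vm_page_background_target and zeroes it (so background pages lose their protection), and on exit it restores the snapshot; repeated calls with the same mode are ignored. A standalone sketch of that snapshot/restore pattern, with the lock and globals below being stand-ins rather than the kernel's:

/*
 * Standalone model of the darkwake toggle: snapshot the target on the
 * way in, zero it, restore on the way out, ignore no-op calls.
 */
#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static bool     darkwake_mode;
static unsigned background_target = 25000;   /* pages kept for bg work */
static unsigned background_target_snapshot;

static void
update_darkwake_mode(bool mode)
{
        pthread_mutex_lock(&queue_lock);

        if (darkwake_mode == mode) {          /* no change */
                pthread_mutex_unlock(&queue_lock);
                return;
        }
        darkwake_mode = mode;

        if (mode) {
                /* save target to restore later; 0 = no protection */
                background_target_snapshot = background_target;
                background_target = 0;
        } else if (background_target_snapshot) {
                background_target = background_target_snapshot;
        }
        pthread_mutex_unlock(&queue_lock);
}

int
main(void)
{
        update_darkwake_mode(true);
        printf("in darkwake: target=%u\n", background_target);
        update_darkwake_mode(true);           /* idempotent */
        update_darkwake_mode(false);
        printf("awake again: target=%u\n", background_target);
        return 0;
}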
 
 #if CONFIG_BACKGROUND_QUEUE
 
@@ -2430,17 +2538,21 @@ vm_page_update_background_state(vm_page_t mem)
        if (vm_page_background_mode == VM_PAGE_BG_DISABLED)
                return;
 
-       if (mem->vm_page_in_background == FALSE)
+       if (mem->vmp_in_background == FALSE)
                return;
 
+       task_t  my_task = current_task();
+
+       if (my_task) {
+               if (task_get_darkwake_mode(my_task)) {
+                       return;
+               }
+       }
+
 #if BACKGROUNDQ_BASED_ON_QOS
         if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY)
                return;
 #else
-       task_t  my_task;
-
-       my_task = current_task();
-       
        if (my_task) {
                if (proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG))
                        return;
@@ -2448,7 +2560,7 @@ vm_page_update_background_state(vm_page_t mem)
 #endif
        vm_page_lockspin_queues();
 
-       mem->vm_page_in_background = FALSE;
+       mem->vmp_in_background = FALSE;
        vm_page_background_promoted_count++;
 
        vm_page_remove_from_backgroundq(mem);
@@ -2463,18 +2575,23 @@ vm_page_assign_background_state(vm_page_t mem)
        if (vm_page_background_mode == VM_PAGE_BG_DISABLED)
                return;
 
+       task_t  my_task = current_task();
+
+       if (my_task) {
+               if (task_get_darkwake_mode(my_task)) {
+                       mem->vmp_in_background = TRUE;
+                       return;
+               }
+       }
+
 #if BACKGROUNDQ_BASED_ON_QOS
         if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_QOS) <= THREAD_QOS_LEGACY)
-                mem->vm_page_in_background = TRUE;
+                mem->vmp_in_background = TRUE;
        else
-                mem->vm_page_in_background = FALSE;
+                mem->vmp_in_background = FALSE;
 #else
-       task_t  my_task;
-
-       my_task = current_task();
-
        if (my_task)
-               mem->vm_page_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG);
+               mem->vmp_in_background = proc_get_effective_task_policy(my_task, TASK_POLICY_DARWIN_BG);
 #endif
 }
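
The change above gives darkwake precedence when deciding whether a freshly grabbed page belongs on the background queue: pages faulted in by a task in darkwake mode are marked vmp_in_background regardless of QoS or DARWIN_BG policy. A simplified model of that decision, using hypothetical task and page structures:

/*
 * Model of the background-state decision: a darkwake task always tags
 * its new pages for the background queue; otherwise the decision falls
 * back to the task's DARWIN_BG policy.  All types are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

struct task { bool darkwake; bool darwin_bg; };
struct page { bool in_background; };

static void
assign_background_state(struct page *p, const struct task *t)
{
        if (t == NULL)
                return;

        if (t->darkwake) {              /* darkwake wins over policy */
                p->in_background = true;
                return;
        }
        p->in_background = t->darwin_bg;
}

int
main(void)
{
        struct task t = { .darkwake = true, .darwin_bg = false };
        struct page p = { false };

        assign_background_state(&p, &t);
        printf("in_background=%d\n", p.in_background);
        return 0;
}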
 
@@ -2487,12 +2604,12 @@ vm_page_remove_from_backgroundq(
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
 
-       if (mem->vm_page_on_backgroundq) {
-               vm_page_queue_remove(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq);
+       if (mem->vmp_on_backgroundq) {
+               vm_page_queue_remove(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq);
 
-               mem->vm_page_backgroundq.next = 0;
-               mem->vm_page_backgroundq.prev = 0;
-               mem->vm_page_on_backgroundq = FALSE;
+               mem->vmp_backgroundq.next = 0;
+               mem->vmp_backgroundq.prev = 0;
+               mem->vmp_on_backgroundq = FALSE;
                
                vm_page_background_count--;
 
@@ -2503,8 +2620,8 @@ vm_page_remove_from_backgroundq(
                else
                        vm_page_background_external_count--;
        } else {
-               assert(VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.next) == (uintptr_t)NULL &&
-                      VM_PAGE_UNPACK_PTR(mem->vm_page_backgroundq.prev) == (uintptr_t)NULL);
+               assert(VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.next) == (uintptr_t)NULL &&
+                      VM_PAGE_UNPACK_PTR(mem->vmp_backgroundq.prev) == (uintptr_t)NULL);
        }
 }
 
@@ -2521,7 +2638,7 @@ vm_page_add_to_backgroundq(
        if (vm_page_background_mode == VM_PAGE_BG_DISABLED)
                return;
 
-       if (mem->vm_page_on_backgroundq == FALSE) {
+       if (mem->vmp_on_backgroundq == FALSE) {
 
                m_object = VM_PAGE_OBJECT(mem);
 
@@ -2529,10 +2646,10 @@ vm_page_add_to_backgroundq(
                        return;
 
                if (first == TRUE)
-                       vm_page_queue_enter_first(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq);
+                       vm_page_queue_enter_first(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq);
                else
-                       vm_page_queue_enter(&vm_page_queue_background, mem, vm_page_t, vm_page_backgroundq);
-               mem->vm_page_on_backgroundq = TRUE;
+                       vm_page_queue_enter(&vm_page_queue_background, mem, vm_page_t, vmp_backgroundq);
+               mem->vmp_on_backgroundq = TRUE;
                
                vm_page_background_count++;
 
@@ -2543,7 +2660,7 @@ vm_page_add_to_backgroundq(
        }
 }
 
-#endif
+#endif /* CONFIG_BACKGROUND_QUEUE */
 
 /*
  * this is an interface to support bring-up of drivers
@@ -2576,10 +2693,10 @@ vm_page_grablo(void)
                 vm_page_queue_remove_first(&vm_lopage_queue_free,
                                    mem,
                                    vm_page_t,
-                                   pageq);
+                                   vmp_pageq);
                assert(vm_lopage_free_count);
-               assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+               assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
 
                 vm_lopage_free_count--;
                vm_lopages_allocated_q++;
@@ -2603,26 +2720,31 @@ vm_page_grablo(void)
 
                        return (VM_PAGE_NULL);
                }
-               assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+               assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
 
-               mem->busy = TRUE;
+               mem->vmp_busy = TRUE;
 
                vm_page_lockspin_queues();
                
-               mem->gobbled = FALSE;
+               mem->vmp_gobbled = FALSE;
                vm_page_gobble_count--;
                vm_page_wire_count--;
 
                vm_lopages_allocated_cpm_success++;
                vm_page_unlock_queues();
        }
-       assert(mem->busy);
-       assert(!mem->pmapped);
-       assert(!mem->wpmapped);
+       assert(mem->vmp_busy);
+       assert(!mem->vmp_pmapped);
+       assert(!mem->vmp_wpmapped);
        assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
 
        VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
 
+       disable_preemption();
+       PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
+       VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, 0, 1, 0, 0);
+       enable_preemption();
+
        return (mem);
 }
 
@@ -2672,7 +2794,7 @@ vm_page_grab_options(
 
        if ((mem = PROCESSOR_DATA(current_processor(), free_pages))) {
 return_page_from_cpu_list:
-               assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
+               assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
 
 #if HIBERNATION
                if (hibernate_rebuild_needed) {
@@ -2680,20 +2802,21 @@ return_page_from_cpu_list:
                }
 #endif /* HIBERNATION */
                PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
-               PROCESSOR_DATA(current_processor(), free_pages) = mem->snext;
+               PROCESSOR_DATA(current_processor(), free_pages) = mem->vmp_snext;
+               VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
 
                enable_preemption();
                VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
-
-               assert(mem->listq.next == 0 && mem->listq.prev == 0);
-               assert(mem->tabled == FALSE);
-               assert(mem->vm_page_object == 0);
-               assert(!mem->laundry);
-               assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)));
-               assert(mem->busy);
-               assert(!mem->pmapped);
-               assert(!mem->wpmapped);
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
+
+               assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
+               assert(mem->vmp_tabled == FALSE);
+               assert(mem->vmp_object == 0);
+               assert(!mem->vmp_laundry);
+               assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem));
+               assert(mem->vmp_busy);
+               assert(!mem->vmp_pmapped);
+               assert(!mem->vmp_wpmapped);
                assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
 
 #if CONFIG_BACKGROUND_QUEUE
@@ -2737,7 +2860,7 @@ return_page_from_cpu_list:
                /* ... but can we try and grab from the secluded queue? */
                if (vm_page_secluded_count > 0 &&
                    ((grab_options & VM_PAGE_GRAB_SECLUDED) ||
-                    task_can_use_secluded_mem(current_task()))) {
+                    task_can_use_secluded_mem(current_task(), TRUE))) {
                        mem = vm_page_grab_secluded();
                        if (grab_options & VM_PAGE_GRAB_SECLUDED) {
                                vm_page_secluded.grab_for_iokit++;
@@ -2747,6 +2870,12 @@ return_page_from_cpu_list:
                        }
                        if (mem) {
                                VM_CHECK_MEMORYSTATUS;
+
+                               disable_preemption();
+                               PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
+                               VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
+                               enable_preemption();
+
                                return mem;
                        }
                }
@@ -2807,16 +2936,16 @@ return_page_from_cpu_list:
                        vm_page_queue_remove_first_with_clump(&vm_page_queue_free[color].qhead,
                                                              mem,
                                                              vm_page_t,
-                                                             pageq,
+                                                             vmp_pageq,
                                                              clump_end);
 #else
                        vm_page_queue_remove_first(&vm_page_queue_free[color].qhead,
                                                              mem,
                                                              vm_page_t,
-                                                             pageq);
+                                                             vmp_pageq);
 #endif
 
-                       assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_Q);
+                       assert(mem->vmp_q_state == VM_PAGE_ON_FREE_Q);
 
                        VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
 
@@ -2842,20 +2971,20 @@ return_page_from_cpu_list:
                        if (head == NULL)
                                head = mem;
                        else
-                               tail->snext = mem;
+                               tail->vmp_snext = mem;
                        tail = mem;
 
-                       assert(mem->listq.next == 0 && mem->listq.prev == 0);
-                       assert(mem->tabled == FALSE);
-                       assert(mem->vm_page_object == 0);
-                       assert(!mem->laundry);
+                       assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
+                       assert(mem->vmp_tabled == FALSE);
+                       assert(mem->vmp_object == 0);
+                       assert(!mem->vmp_laundry);
 
-                       mem->vm_page_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
+                       mem->vmp_q_state = VM_PAGE_ON_FREE_LOCAL_Q;
 
-                       assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)));
-                       assert(mem->busy);
-                       assert(!mem->pmapped);
-                       assert(!mem->wpmapped);
+                       assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem));
+                       assert(mem->vmp_busy);
+                       assert(!mem->vmp_pmapped);
+                       assert(!mem->vmp_wpmapped);
                        assert(!pmap_is_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem)));
                }
 #if defined (__x86_64__) && (DEVELOPMENT || DEBUG)
@@ -2868,18 +2997,19 @@ return_page_from_cpu_list:
                        panic("%s:%d should not modify cpu->free_pages while hibernating", __FUNCTION__, __LINE__);
                }
 #endif /* HIBERNATION */
-               PROCESSOR_DATA(current_processor(), free_pages) = head->snext;
+               PROCESSOR_DATA(current_processor(), free_pages) = head->vmp_snext;
                PROCESSOR_DATA(current_processor(), start_color) = color;
 
                /*
                 * satisfy this request
                 */
                PROCESSOR_DATA(current_processor(), page_grab_count) += 1;
+               VM_DEBUG_EVENT(vm_page_grab, VM_PAGE_GRAB, DBG_FUNC_NONE, grab_options, 0, 0, 0);
                mem = head;
-               assert(mem->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
+               assert(mem->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
 
                VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
 
                enable_preemption();
        }
@@ -2893,9 +3023,7 @@ return_page_from_cpu_list:
         *      We don't have the counts locked ... if they change a little,
         *      it doesn't really matter.
         */
-       if ((vm_page_free_count < vm_page_free_min) ||
-            ((vm_page_free_count < vm_page_free_target) &&
-             ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
+       if (vm_page_free_count < vm_page_free_min)
                 thread_wakeup((event_t) &vm_page_free_wanted);
 
        VM_CHECK_MEMORYSTATUS;
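
This hunk narrows the pageout-daemon wakeup to the single vm_page_free_count < vm_page_free_min test, still reading the counters without a lock because a slightly stale value only shifts the wakeup by a few pages. A small sketch of such an unlocked low-watermark wakeup; the counter, threshold, and condition variable below are stand-ins:

/*
 * Sketch of an unlocked low-watermark check: the counter is read
 * without a lock because an off-by-a-few value only delays or advances
 * the wakeup slightly.  Names are stand-ins, not the kernel's.
 */
#include <stdatomic.h>
#include <pthread.h>
#include <stdio.h>

static atomic_long     free_count = 1000;
static const long      free_min   = 3500;
static pthread_mutex_t wake_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wake_cv    = PTHREAD_COND_INITIALIZER;

static void
maybe_wake_pageout(void)
{
        /* unlocked read: small races against the updater are tolerated */
        if (atomic_load_explicit(&free_count, memory_order_relaxed) < free_min) {
                pthread_mutex_lock(&wake_lock);
                pthread_cond_signal(&wake_cv);   /* thread_wakeup() stand-in */
                pthread_mutex_unlock(&wake_lock);
        }
}

int
main(void)
{
        maybe_wake_pageout();
        printf("free=%ld min=%ld -> wakeup %s\n",
               (long)free_count, free_min,
               (long)free_count < free_min ? "sent" : "skipped");
        return 0;
}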
@@ -2936,7 +3064,7 @@ vm_page_grab_secluded(void)
        /* can we grab from the secluded queue? */
        if (vm_page_secluded_count > vm_page_secluded_target ||
            (vm_page_secluded_count > 0 &&
-            task_can_use_secluded_mem(current_task()))) {
+            task_can_use_secluded_mem(current_task(), TRUE))) {
                /* OK */
        } else {
                /* can't grab from secluded queue... */
@@ -2955,29 +3083,29 @@ vm_page_grab_secluded(void)
        assert(!vm_page_queue_empty(&vm_page_queue_secluded));
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
        mem = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
-       assert(mem->vm_page_q_state == VM_PAGE_ON_SECLUDED_Q);
+       assert(mem->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
        vm_page_queues_remove(mem, TRUE);
 
        object = VM_PAGE_OBJECT(mem);
 
-       assert(!mem->fictitious);
+       assert(!mem->vmp_fictitious);
        assert(!VM_PAGE_WIRED(mem));
        if (object == VM_OBJECT_NULL) {
                /* free for grab! */
                vm_page_unlock_queues();
                vm_page_secluded.grab_success_free++;
 
-               assert(mem->busy);
-               assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+               assert(mem->vmp_busy);
+               assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
                assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
-               assert(mem->pageq.next == 0);
-               assert(mem->pageq.prev == 0);
-               assert(mem->listq.next == 0);
-               assert(mem->listq.prev == 0);
+               assert(mem->vmp_pageq.next == 0);
+               assert(mem->vmp_pageq.prev == 0);
+               assert(mem->vmp_listq.next == 0);
+               assert(mem->vmp_listq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-               assert(mem->vm_page_on_backgroundq == 0);
-               assert(mem->vm_page_backgroundq.next == 0);
-               assert(mem->vm_page_backgroundq.prev == 0);
+               assert(mem->vmp_on_backgroundq == 0);
+               assert(mem->vmp_backgroundq.next == 0);
+               assert(mem->vmp_backgroundq.prev == 0);
 #endif /* CONFIG_BACKGROUND_QUEUE */
                return mem;
        }
@@ -2993,24 +3121,24 @@ vm_page_grab_secluded(void)
                vm_page_unlock_queues();
                return VM_PAGE_NULL;
        }
-       if (mem->busy ||
-           mem->cleaning ||
-           mem->laundry) {
+       if (mem->vmp_busy ||
+           mem->vmp_cleaning ||
+           mem->vmp_laundry) {
                /* can't steal page in this state... */
                vm_object_unlock(object);
                vm_page_secluded.grab_failure_state++;
                goto reactivate_secluded_page;
        }
 
-       mem->busy = TRUE;
+       mem->vmp_busy = TRUE;
        refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(mem));
        if (refmod_state & VM_MEM_REFERENCED) {
-               mem->reference = TRUE;
+               mem->vmp_reference = TRUE;
        }
        if (refmod_state & VM_MEM_MODIFIED) {
                SET_PAGE_DIRTY(mem, FALSE);
        }
-       if (mem->dirty || mem->precious) {
+       if (mem->vmp_dirty || mem->vmp_precious) {
                /* can't grab a dirty page; re-activate */
 //             printf("SECLUDED: dirty page %p\n", mem);
                PAGE_WAKEUP_DONE(mem);
@@ -3018,7 +3146,7 @@ vm_page_grab_secluded(void)
                vm_object_unlock(object);
                goto reactivate_secluded_page;
        }
-       if (mem->reference) {
+       if (mem->vmp_reference) {
                /* it's been used but we do need to grab a page... */
        }
 
@@ -3029,22 +3157,22 @@ vm_page_grab_secluded(void)
        vm_object_unlock(object);
        object = VM_OBJECT_NULL;
        if (vm_page_free_verify) {
-               assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)));
+               assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem));
        }
        pmap_clear_noencrypt(VM_PAGE_GET_PHYS_PAGE(mem));
        vm_page_secluded.grab_success_other++;
 
-       assert(mem->busy);
-       assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+       assert(mem->vmp_busy);
+       assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
        assert(VM_PAGE_OBJECT(mem) == VM_OBJECT_NULL);
-       assert(mem->pageq.next == 0);
-       assert(mem->pageq.prev == 0);
-       assert(mem->listq.next == 0);
-       assert(mem->listq.prev == 0);
+       assert(mem->vmp_pageq.next == 0);
+       assert(mem->vmp_pageq.prev == 0);
+       assert(mem->vmp_listq.next == 0);
+       assert(mem->vmp_listq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-       assert(mem->vm_page_on_backgroundq == 0);
-       assert(mem->vm_page_backgroundq.next == 0);
-       assert(mem->vm_page_backgroundq.prev == 0);
+       assert(mem->vmp_on_backgroundq == 0);
+       assert(mem->vmp_backgroundq.next == 0);
+       assert(mem->vmp_backgroundq.prev == 0);
 #endif /* CONFIG_BACKGROUND_QUEUE */
 
        return mem;
@@ -3075,9 +3203,9 @@ vm_page_release(
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
        }
 
-       assert(!mem->private && !mem->fictitious);
+       assert(!mem->vmp_private && !mem->vmp_fictitious);
        if (vm_page_free_verify) {
-               assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)));
+               assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem));
        }
 //     dbgLog(VM_PAGE_GET_PHYS_PAGE(mem), vm_page_free_count, vm_page_wire_count, 5);  /* (TEST/DEBUG) */
 
@@ -3085,18 +3213,18 @@ vm_page_release(
 
        lck_mtx_lock_spin(&vm_page_queue_free_lock);
 
-       assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-       assert(mem->busy);
-       assert(!mem->laundry);
-       assert(mem->vm_page_object == 0);
-       assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
-       assert(mem->listq.next == 0 && mem->listq.prev == 0);
+       assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
+       assert(mem->vmp_busy);
+       assert(!mem->vmp_laundry);
+       assert(mem->vmp_object == 0);
+       assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
+       assert(mem->vmp_listq.next == 0 && mem->vmp_listq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-       assert(mem->vm_page_backgroundq.next == 0 &&
-              mem->vm_page_backgroundq.prev == 0 &&
-              mem->vm_page_on_backgroundq == FALSE);
+       assert(mem->vmp_backgroundq.next == 0 &&
+              mem->vmp_backgroundq.prev == 0 &&
+              mem->vmp_on_backgroundq == FALSE);
 #endif 
-       if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) &&
+       if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
            vm_lopage_free_count < vm_lopage_free_limit &&
            VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
                /*
@@ -3107,14 +3235,14 @@ vm_page_release(
                vm_page_queue_enter_first(&vm_lopage_queue_free,
                                          mem,
                                          vm_page_t,
-                                         pageq);
+                                         vmp_pageq);
                vm_lopage_free_count++;
 
                if (vm_lopage_free_count >= vm_lopage_free_limit)
                        vm_lopage_refill = FALSE;
 
-               mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
-               mem->lopage = TRUE;
+               mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
+               mem->vmp_lopage = TRUE;
 #if CONFIG_SECLUDED_MEMORY
        } else if (vm_page_free_count > vm_page_free_reserved &&
                   vm_page_secluded_count < vm_page_secluded_target &&
@@ -3131,13 +3259,13 @@ vm_page_release(
                                lck_mtx_lock_spin(&vm_page_queue_free_lock);
                        }
                }
-               mem->lopage = FALSE;
+               mem->vmp_lopage = FALSE;
                LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
                vm_page_queue_enter_first(&vm_page_queue_secluded,
                                          mem,
                                          vm_page_t,
-                                         pageq);
-               mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
+                                         vmp_pageq);
+               mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                vm_page_secluded_count++;
                vm_page_secluded_count_free++;
                if (!page_queues_locked) {
@@ -3150,20 +3278,20 @@ vm_page_release(
                }
 #endif /* CONFIG_SECLUDED_MEMORY */
        } else {
-               mem->lopage = FALSE;
-               mem->vm_page_q_state = VM_PAGE_ON_FREE_Q;
+               mem->vmp_lopage = FALSE;
+               mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
 
                color = VM_PAGE_GET_COLOR(mem);
 #if defined(__x86_64__)
                vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead,
                                          mem,
                                          vm_page_t,
-                                         pageq);
+                                         vmp_pageq);
 #else
                vm_page_queue_enter(&vm_page_queue_free[color].qhead,
                                          mem,
                                          vm_page_t,
-                                         pageq);
+                                         vmp_pageq);
 #endif
                vm_page_free_count++;
                /*
@@ -3202,6 +3330,10 @@ vm_page_release(
                        need_wakeup = 1;
                }
        }
+       vm_pageout_vminfo.vm_page_pages_freed++;
+
+       VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, 1, 0, 0, 0);
+
        lck_mtx_unlock(&vm_page_queue_free_lock);
 
        if (need_priv_wakeup)
@@ -3230,32 +3362,32 @@ vm_page_release_startup(
 
        if (vm_lopage_free_count < vm_lopage_free_limit &&
            VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
-               mem->lopage = TRUE;
-               mem->vm_page_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
+               mem->vmp_lopage = TRUE;
+               mem->vmp_q_state = VM_PAGE_ON_FREE_LOPAGE_Q;
                vm_lopage_free_count++;
                queue_free = &vm_lopage_queue_free;
 #if CONFIG_SECLUDED_MEMORY
        } else if (vm_page_secluded_count < vm_page_secluded_target) {
-               mem->lopage = FALSE;
-               mem->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
+               mem->vmp_lopage = FALSE;
+               mem->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                vm_page_secluded_count++;
                vm_page_secluded_count_free++;
                queue_free = &vm_page_queue_secluded;
 #endif /* CONFIG_SECLUDED_MEMORY */
        } else {
-               mem->lopage = FALSE;
-               mem->vm_page_q_state = VM_PAGE_ON_FREE_Q;
+               mem->vmp_lopage = FALSE;
+               mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
                vm_page_free_count++;
                queue_free = &vm_page_queue_free[VM_PAGE_GET_COLOR(mem)].qhead;
        }
-       if (mem->vm_page_q_state == VM_PAGE_ON_FREE_Q) {
+       if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
 #if defined(__x86_64__)
-               vm_page_queue_enter_clump(queue_free, mem, vm_page_t, pageq);
+               vm_page_queue_enter_clump(queue_free, mem, vm_page_t, vmp_pageq);
 #else
-               vm_page_queue_enter(queue_free, mem, vm_page_t, pageq);
+               vm_page_queue_enter(queue_free, mem, vm_page_t, vmp_pageq);
 #endif
        } else
-               vm_page_queue_enter_first(queue_free, mem, vm_page_t, pageq);
+               vm_page_queue_enter_first(queue_free, mem, vm_page_t, vmp_pageq);
 }
 
 /*
@@ -3302,7 +3434,7 @@ vm_page_wait(
                wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, interruptible);
 #if CONFIG_SECLUDED_MEMORY
        } else if (secluded_for_apps &&
-                  task_can_use_secluded_mem(current_task())) {
+                  task_can_use_secluded_mem(current_task(), FALSE)) {
 #if 00
                /* XXX FBDP: need pageq lock for this... */
                /* XXX FBDP: might wait even if pages available, */
@@ -3332,7 +3464,7 @@ vm_page_wait(
                thread_wakeup((event_t)&vm_page_free_wanted);
 
        if (wait_result == THREAD_WAITING) {
-               VM_DEBUG_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
+               VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block, VM_PAGE_WAIT_BLOCK, DBG_FUNC_START,
                               vm_page_free_wanted_privileged,
                               vm_page_free_wanted,
 #if CONFIG_SECLUDED_MEMORY
@@ -3342,8 +3474,8 @@ vm_page_wait(
 #endif /* CONFIG_SECLUDED_MEMORY */
                               0);
                wait_result = thread_block(THREAD_CONTINUE_NULL);
-               VM_DEBUG_EVENT(vm_page_wait_block,
-                              VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
+               VM_DEBUG_CONSTANT_EVENT(vm_page_wait_block,
+                                  VM_PAGE_WAIT_BLOCK, DBG_FUNC_END, 0, 0, 0, 0);
        }
 
        return (wait_result == THREAD_AWAKENED);
@@ -3435,15 +3567,15 @@ vm_page_free_prepare_queues(
 
        VM_PAGE_CHECK(mem);
 
-       assert(mem->vm_page_q_state != VM_PAGE_ON_FREE_Q);
-       assert(!mem->cleaning);
+       assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
+       assert(!mem->vmp_cleaning);
        m_object = VM_PAGE_OBJECT(mem);
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
        if (m_object) {
                vm_object_lock_assert_exclusive(m_object);
        }
-       if (mem->laundry) {
+       if (mem->vmp_laundry) {
                /*
                 * We may have to free a page while it's being laundered
                 * if we lost its pager (due to a forced unmount, for example).
@@ -3458,7 +3590,7 @@ vm_page_free_prepare_queues(
        vm_page_queues_remove(mem, TRUE);
 
        if (VM_PAGE_WIRED(mem)) {
-               assert(mem->wire_count > 0);
+               assert(mem->vmp_wire_count > 0);
 
                if (m_object) {
 
@@ -3476,10 +3608,22 @@ vm_page_free_prepare_queues(
                        }
                        if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
                             m_object->purgable == VM_PURGABLE_EMPTY) &&
-                           m_object->vo_purgeable_owner != TASK_NULL) {
-                               task_t owner;
-
-                               owner = m_object->vo_purgeable_owner;
+                           m_object->vo_owner != TASK_NULL) {
+                               task_t          owner;
+                               int             ledger_idx_volatile;
+                               int             ledger_idx_nonvolatile;
+                               int             ledger_idx_volatile_compressed;
+                               int             ledger_idx_nonvolatile_compressed;
+                               boolean_t       do_footprint;
+
+                               owner = VM_OBJECT_OWNER(m_object);
+                               vm_object_ledger_tag_ledgers(
+                                       m_object,
+                                       &ledger_idx_volatile,
+                                       &ledger_idx_nonvolatile,
+                                       &ledger_idx_volatile_compressed,
+                                       &ledger_idx_nonvolatile_compressed,
+                                       &do_footprint);
                                /*
                                 * While wired, this page was accounted
                                 * as "non-volatile" but it should now
@@ -3487,26 +3631,28 @@ vm_page_free_prepare_queues(
                                 */
                                /* one less "non-volatile"... */
                                ledger_debit(owner->ledger,
-                                            task_ledgers.purgeable_nonvolatile,
-                                            PAGE_SIZE);
-                               /* ... and "phys_footprint" */
-                               ledger_debit(owner->ledger,
-                                            task_ledgers.phys_footprint,
+                                            ledger_idx_nonvolatile,
                                             PAGE_SIZE);
+                               if (do_footprint) {
+                                       /* ... and "phys_footprint" */
+                                       ledger_debit(owner->ledger,
+                                                    task_ledgers.phys_footprint,
+                                                    PAGE_SIZE);
+                               }
                                /* one more "volatile" */
                                ledger_credit(owner->ledger,
-                                             task_ledgers.purgeable_volatile,
+                                             ledger_idx_volatile,
                                              PAGE_SIZE);
                        }
                }
-               if (!mem->private && !mem->fictitious)
+               if (!mem->vmp_private && !mem->vmp_fictitious)
                        vm_page_wire_count--;
 
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
-               mem->wire_count = 0;
-               assert(!mem->gobbled);
-       } else if (mem->gobbled) {
-               if (!mem->private && !mem->fictitious)
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
+               mem->vmp_wire_count = 0;
+               assert(!mem->vmp_gobbled);
+       } else if (mem->vmp_gobbled) {
+               if (!mem->vmp_private && !mem->vmp_fictitious)
                        vm_page_wire_count--;
                vm_page_gobble_count--;
        }
@@ -3518,27 +3664,27 @@ vm_page_free_prepare_object(
        vm_page_t       mem,
        boolean_t       remove_from_hash)
 {
-       if (mem->tabled)
+       if (mem->vmp_tabled)
                vm_page_remove(mem, remove_from_hash);  /* clears tabled, object, offset */
 
        PAGE_WAKEUP(mem);               /* clears wanted */
 
-       if (mem->private) {
-               mem->private = FALSE;
-               mem->fictitious = TRUE;
+       if (mem->vmp_private) {
+               mem->vmp_private = FALSE;
+               mem->vmp_fictitious = TRUE;
                VM_PAGE_SET_PHYS_PAGE(mem, vm_page_fictitious_addr);
        }
-       if ( !mem->fictitious) {
-               assert(mem->pageq.next == 0);
-               assert(mem->pageq.prev == 0);
-               assert(mem->listq.next == 0);
-               assert(mem->listq.prev == 0);
+       if ( !mem->vmp_fictitious) {
+               assert(mem->vmp_pageq.next == 0);
+               assert(mem->vmp_pageq.prev == 0);
+               assert(mem->vmp_listq.next == 0);
+               assert(mem->vmp_listq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-               assert(mem->vm_page_backgroundq.next == 0);
-               assert(mem->vm_page_backgroundq.prev == 0);
+               assert(mem->vmp_backgroundq.next == 0);
+               assert(mem->vmp_backgroundq.prev == 0);
 #endif /* CONFIG_BACKGROUND_QUEUE */
-               assert(mem->next_m == 0);
-               vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->lopage);
+               assert(mem->vmp_next_m == 0);
+               vm_page_init(mem, VM_PAGE_GET_PHYS_PAGE(mem), mem->vmp_lopage);
        }
 }
 
@@ -3557,7 +3703,7 @@ vm_page_free(
 {
        vm_page_free_prepare(mem);
 
-       if (mem->fictitious) {
+       if (mem->vmp_fictitious) {
                vm_page_release_fictitious(mem);
        } else {
                vm_page_release(mem,
@@ -3577,7 +3723,7 @@ vm_page_free_unlocked(
 
        vm_page_free_prepare_object(mem, remove_from_hash);
 
-       if (mem->fictitious) {
+       if (mem->vmp_fictitious) {
                vm_page_release_fictitious(mem);
        } else {
                vm_page_release(mem, FALSE); /* page queues are not locked */
@@ -3621,27 +3767,27 @@ vm_page_free_list(
                 */
                while (mem && pg_count < 64) {
 
-                       assert((mem->vm_page_q_state == VM_PAGE_NOT_ON_Q) ||
-                              (mem->vm_page_q_state == VM_PAGE_IS_WIRED));
+                       assert((mem->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
+                              (mem->vmp_q_state == VM_PAGE_IS_WIRED));
 #if CONFIG_BACKGROUND_QUEUE
-                       assert(mem->vm_page_backgroundq.next == 0 &&
-                              mem->vm_page_backgroundq.prev == 0 &&
-                              mem->vm_page_on_backgroundq == FALSE);
+                       assert(mem->vmp_backgroundq.next == 0 &&
+                              mem->vmp_backgroundq.prev == 0 &&
+                              mem->vmp_on_backgroundq == FALSE);
 #endif
-                       nxt = mem->snext;
-                       mem->snext = NULL;
-                       assert(mem->pageq.prev == 0);
+                       nxt = mem->vmp_snext;
+                       mem->vmp_snext = NULL;
+                       assert(mem->vmp_pageq.prev == 0);
 
-                       if (vm_page_free_verify && !mem->fictitious && !mem->private) {
-                               assert(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)));
+                       if (vm_page_free_verify && !mem->vmp_fictitious && !mem->vmp_private) {
+                               assertf(pmap_verify_free(VM_PAGE_GET_PHYS_PAGE(mem)), "page = 0x%llx", (uint64_t)VM_PAGE_GET_PHYS_PAGE(mem));
                        }
                        if (prepare_object == TRUE)
                                vm_page_free_prepare_object(mem, TRUE);
 
-                       if (!mem->fictitious) {
-                               assert(mem->busy);
+                       if (!mem->vmp_fictitious) {
+                               assert(mem->vmp_busy);
 
-                               if ((mem->lopage == TRUE || vm_lopage_refill == TRUE) &&
+                               if ((mem->vmp_lopage == TRUE || vm_lopage_refill == TRUE) &&
                                    vm_lopage_free_count < vm_lopage_free_limit &&
                                    VM_PAGE_GET_PHYS_PAGE(mem) < max_valid_low_ppnum) {
                                        vm_page_release(mem, FALSE); /* page queues are not locked */
@@ -3661,7 +3807,7 @@ vm_page_free_list(
                                         * cause trouble because the page is not actually
                                         * in the free queue yet...
                                         */
-                                       mem->snext = local_freeq;
+                                       mem->vmp_snext = local_freeq;
                                        local_freeq = mem;
                                        pg_count++;
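
vm_page_free_list() chains freeable pages onto a local list through vmp_snext in batches of up to 64, and only once a batch is moved onto the real free queues under the free-queue lock does the free count get bumped, which is why the comment above warns against counting a page that is not actually in the free queue yet. A toy version of that batch-then-commit pattern, with the page struct, lock, and globals as stand-ins:

/*
 * Toy sketch of batch-then-commit: collect pages on a local
 * singly-linked list, then take the lock once per batch, splice the
 * batch onto the global free list, and bump the count at the end.
 */
#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct page { struct page *snext; };

static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;
static struct page    *global_free_list;
static unsigned long   global_free_count;

static void
free_pages_batched(struct page *pages, size_t npages)
{
        struct page *local_freeq = NULL;
        unsigned     pg_count    = 0;

        for (size_t i = 0; i < npages; i++) {
                /* chain onto the local list; not yet visible globally */
                pages[i].snext = local_freeq;
                local_freeq = &pages[i];
                pg_count++;

                if (pg_count == 64 || i + 1 == npages) {
                        pthread_mutex_lock(&free_lock);
                        while (local_freeq) {
                                struct page *nxt = local_freeq->snext;
                                local_freeq->snext = global_free_list;
                                global_free_list = local_freeq;
                                local_freeq = nxt;
                        }
                        global_free_count += pg_count;   /* committed once */
                        pthread_mutex_unlock(&free_lock);
                        pg_count = 0;
                }
        }
}

int
main(void)
{
        struct page pages[100] = { { NULL } };

        free_pages_batched(pages, 100);
        printf("free count = %lu\n", global_free_count);
        return 0;
}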
 
@@ -3689,30 +3835,33 @@ vm_page_free_list(
                        while (mem) {
                                int     color;
 
-                               nxt = mem->snext;
+                               nxt = mem->vmp_snext;
 
-                               assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                               assert(mem->busy);
-                               mem->lopage = FALSE;
-                               mem->vm_page_q_state = VM_PAGE_ON_FREE_Q;
+                               assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                               assert(mem->vmp_busy);
+                               mem->vmp_lopage = FALSE;
+                               mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
 
                                color = VM_PAGE_GET_COLOR(mem);
 #if defined(__x86_64__)
                                vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead,
                                                          mem,
                                                          vm_page_t,
-                                                         pageq);
+                                                         vmp_pageq);
 #else
                                vm_page_queue_enter(&vm_page_queue_free[color].qhead,
                                                          mem,
                                                          vm_page_t,
-                                                         pageq);
+                                                         vmp_pageq);
 #endif
                                mem = nxt;
                        }
+                       vm_pageout_vminfo.vm_page_pages_freed += pg_count;
                        vm_page_free_count += pg_count;
                        avail_free_count = vm_page_free_count;
 
+                       VM_DEBUG_CONSTANT_EVENT(vm_page_release, VM_PAGE_RELEASE, DBG_FUNC_NONE, pg_count, 0, 0, 0);
+
                        if (vm_page_free_wanted_privileged > 0 && avail_free_count > 0) {
 
                                if (avail_free_count < vm_page_free_wanted_privileged) {
@@ -3825,7 +3974,7 @@ vm_page_wire(
 
        m_object = VM_PAGE_OBJECT(mem);
 
-//     dbgLog(current_thread(), mem->offset, m_object, 1);     /* (TEST/DEBUG) */
+//     dbgLog(current_thread(), mem->vmp_offset, m_object, 1); /* (TEST/DEBUG) */
 
        VM_PAGE_CHECK(mem);
        if (m_object) {
@@ -3844,13 +3993,13 @@ vm_page_wire(
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
        if ( !VM_PAGE_WIRED(mem)) {
 
-               if (mem->laundry)
+               if (mem->vmp_laundry)
                        vm_pageout_steal_laundry(mem, TRUE);
 
                vm_page_queues_remove(mem, TRUE);
 
-               assert(mem->wire_count == 0);
-               mem->vm_page_q_state = VM_PAGE_IS_WIRED;
+               assert(mem->vmp_wire_count == 0);
+               mem->vmp_q_state = VM_PAGE_IS_WIRED;
 
                if (m_object) {
 
@@ -3867,22 +4016,36 @@ vm_page_wire(
                        }
                        if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
                             m_object->purgable == VM_PURGABLE_EMPTY) &&
-                           m_object->vo_purgeable_owner != TASK_NULL) {
-                               task_t owner;
-
-                               owner = m_object->vo_purgeable_owner;
+                           m_object->vo_owner != TASK_NULL) {
+                               task_t          owner;
+                               int             ledger_idx_volatile;
+                               int             ledger_idx_nonvolatile;
+                               int             ledger_idx_volatile_compressed;
+                               int             ledger_idx_nonvolatile_compressed;
+                               boolean_t       do_footprint;
+
+                               owner = VM_OBJECT_OWNER(m_object);
+                               vm_object_ledger_tag_ledgers(
+                                       m_object,
+                                       &ledger_idx_volatile,
+                                       &ledger_idx_nonvolatile,
+                                       &ledger_idx_volatile_compressed,
+                                       &ledger_idx_nonvolatile_compressed,
+                                       &do_footprint);
                                /* less volatile bytes */
                                ledger_debit(owner->ledger,
-                                            task_ledgers.purgeable_volatile,
+                                            ledger_idx_volatile,
                                             PAGE_SIZE);
                                /* more not-quite-volatile bytes */
                                ledger_credit(owner->ledger,
-                                             task_ledgers.purgeable_nonvolatile,
-                                             PAGE_SIZE);
-                               /* more footprint */
-                               ledger_credit(owner->ledger,
-                                             task_ledgers.phys_footprint,
+                                             ledger_idx_nonvolatile,
                                              PAGE_SIZE);
+                               if (do_footprint) {
+                                       /* more footprint */
+                                       ledger_credit(owner->ledger,
+                                                     task_ledgers.phys_footprint,
+                                                     PAGE_SIZE);
+                               }
                        }
                        if (m_object->all_reusable) {
                                /*
@@ -3890,34 +4053,34 @@ vm_page_wire(
                                 * in "all_reusable" VM objects, so nothing
                                 * to do here.
                                 */
-                       } else if (mem->reusable) {
+                       } else if (mem->vmp_reusable) {
                                /*
                                 * This page is not "re-usable" when it's
                                 * wired, so adjust its state and the
                                 * accounting.
                                 */
                                vm_object_reuse_pages(m_object,
-                                                     mem->offset,
-                                                     mem->offset+PAGE_SIZE_64,
+                                                     mem->vmp_offset,
+                                                     mem->vmp_offset+PAGE_SIZE_64,
                                                      FALSE);
                        }
                }
-               assert(!mem->reusable);
+               assert(!mem->vmp_reusable);
 
-               if (!mem->private && !mem->fictitious && !mem->gobbled)
+               if (!mem->vmp_private && !mem->vmp_fictitious && !mem->vmp_gobbled)
                        vm_page_wire_count++;
-               if (mem->gobbled)
+               if (mem->vmp_gobbled)
                        vm_page_gobble_count--;
-               mem->gobbled = FALSE;
+               mem->vmp_gobbled = FALSE;
 
                if (check_memorystatus == TRUE) {
                        VM_CHECK_MEMORYSTATUS;
                }
        }
-       assert(!mem->gobbled);
-       assert(mem->vm_page_q_state == VM_PAGE_IS_WIRED);
-       mem->wire_count++;
-       if (__improbable(mem->wire_count == 0)) {
+       assert(!mem->vmp_gobbled);
+       assert(mem->vmp_q_state == VM_PAGE_IS_WIRED);
+       mem->vmp_wire_count++;
+       if (__improbable(mem->vmp_wire_count == 0)) {
                panic("vm_page_wire(%p): wire_count overflow", mem);
        }
        VM_PAGE_CHECK(mem);
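
The wiring path above no longer charges the fixed task_ledgers.purgeable_* entries directly; it asks vm_object_ledger_tag_ledgers() for the per-tag ledger indices of the owning object and only touches phys_footprint when do_footprint is set. A minimal standalone sketch of that accounting move, using hypothetical stand-in types rather than the kernel's ledger API:

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096   /* illustrative; the real value is platform-dependent */

    /* hypothetical stand-in for a task ledger: one counter per entry index */
    struct ledger { int64_t entry[8]; };

    static void ledger_credit(struct ledger *l, int idx, int64_t amt) { l->entry[idx] += amt; }
    static void ledger_debit (struct ledger *l, int idx, int64_t amt) { l->entry[idx] -= amt; }

    /*
     * Wiring one page of a volatile purgeable object: the bytes stop being
     * "volatile" and become "nonvolatile"; footprint grows only for ledger
     * tags that participate in phys_footprint (do_footprint).
     */
    static void
    wire_purgeable_page(struct ledger *owner, int idx_volatile, int idx_nonvolatile,
                        int idx_footprint, bool do_footprint)
    {
            ledger_debit(owner, idx_volatile, PAGE_SIZE);       /* less volatile bytes */
            ledger_credit(owner, idx_nonvolatile, PAGE_SIZE);   /* more not-quite-volatile bytes */
            if (do_footprint) {
                    ledger_credit(owner, idx_footprint, PAGE_SIZE); /* more footprint */
            }
    }

vm_page_unwire() below performs the mirror-image adjustment: credit the volatile entry, debit the nonvolatile one, and debit phys_footprint only when do_footprint is set.
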
@@ -3940,23 +4103,23 @@ vm_page_unwire(
 
        m_object = VM_PAGE_OBJECT(mem);
 
-//     dbgLog(current_thread(), mem->offset, m_object, 0);     /* (TEST/DEBUG) */
+//     dbgLog(current_thread(), mem->vmp_offset, m_object, 0); /* (TEST/DEBUG) */
 
        VM_PAGE_CHECK(mem);
        assert(VM_PAGE_WIRED(mem));
-       assert(mem->wire_count > 0);
-       assert(!mem->gobbled);
+       assert(mem->vmp_wire_count > 0);
+       assert(!mem->vmp_gobbled);
        assert(m_object != VM_OBJECT_NULL);
        vm_object_lock_assert_exclusive(m_object);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       if (--mem->wire_count == 0) {
+       if (--mem->vmp_wire_count == 0) {
 
-               mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+               mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
 
                VM_OBJECT_WIRED_PAGE_UPDATE_START(m_object);
                VM_OBJECT_WIRED_PAGE_REMOVE(m_object, mem);
                VM_OBJECT_WIRED_PAGE_UPDATE_END(m_object, m_object->wire_tag);
-               if (!mem->private && !mem->fictitious) {
+               if (!mem->vmp_private && !mem->vmp_fictitious) {
                        vm_page_wire_count--;
                }
 
@@ -3969,25 +4132,39 @@ vm_page_unwire(
                }
                if ((m_object->purgable == VM_PURGABLE_VOLATILE ||
                     m_object->purgable == VM_PURGABLE_EMPTY) &&
-                   m_object->vo_purgeable_owner != TASK_NULL) {
-                       task_t owner;
-
-                       owner = m_object->vo_purgeable_owner;
+                   m_object->vo_owner != TASK_NULL) {
+                       task_t          owner;
+                       int             ledger_idx_volatile;
+                       int             ledger_idx_nonvolatile;
+                       int             ledger_idx_volatile_compressed;
+                       int             ledger_idx_nonvolatile_compressed;
+                       boolean_t       do_footprint;
+
+                       owner = VM_OBJECT_OWNER(m_object);
+                       vm_object_ledger_tag_ledgers(
+                               m_object,
+                               &ledger_idx_volatile,
+                               &ledger_idx_nonvolatile,
+                               &ledger_idx_volatile_compressed,
+                               &ledger_idx_nonvolatile_compressed,
+                               &do_footprint);
                        /* more volatile bytes */
                        ledger_credit(owner->ledger,
-                                     task_ledgers.purgeable_volatile,
+                                     ledger_idx_volatile,
                                      PAGE_SIZE);
                        /* less not-quite-volatile bytes */
                        ledger_debit(owner->ledger,
-                                    task_ledgers.purgeable_nonvolatile,
-                                    PAGE_SIZE);
-                       /* less footprint */
-                       ledger_debit(owner->ledger,
-                                    task_ledgers.phys_footprint,
+                                    ledger_idx_nonvolatile,
                                     PAGE_SIZE);
+                       if (do_footprint) {
+                               /* less footprint */
+                               ledger_debit(owner->ledger,
+                                            task_ledgers.phys_footprint,
+                                            PAGE_SIZE);
+                       }
                }
                assert(m_object != kernel_object);
-               assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
+               assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
 
                if (queueit == TRUE) {
                        if (m_object->purgable == VM_PURGABLE_EMPTY) {
@@ -4042,15 +4219,15 @@ vm_page_deactivate_internal(
         *      inactive queue.  Note wired pages should not have
         *      their reference bit cleared.
         */
-       assert ( !(m->absent && !m->unusual));
+       assert ( !(m->vmp_absent && !m->vmp_unusual));
 
-       if (m->gobbled) {               /* can this happen? */
+       if (m->vmp_gobbled) {           /* can this happen? */
                assert( !VM_PAGE_WIRED(m));
 
-               if (!m->private && !m->fictitious)
+               if (!m->vmp_private && !m->vmp_fictitious)
                        vm_page_wire_count--;
                vm_page_gobble_count--;
-               m->gobbled = FALSE;
+               m->vmp_gobbled = FALSE;
        }
        /*
         * if this page is currently on the pageout queue, we can't do the
@@ -4060,29 +4237,29 @@ vm_page_deactivate_internal(
         * reference which is held on the object while the page is in the pageout queue...
         * just let the normal laundry processing proceed
         */
-       if (m->laundry || m->private || m->fictitious ||
-           (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
-           (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
+       if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
+           (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
+           (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
            VM_PAGE_WIRED(m)) {
                return;
        }
-       if (!m->absent && clear_hw_reference == TRUE)
+       if (!m->vmp_absent && clear_hw_reference == TRUE)
                pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
 
-       m->reference = FALSE;
-       m->no_cache = FALSE;
+       m->vmp_reference = FALSE;
+       m->vmp_no_cache = FALSE;
 
        if ( !VM_PAGE_INACTIVE(m)) {
                vm_page_queues_remove(m, FALSE);
 
                if (!VM_DYNAMIC_PAGING_ENABLED() &&
-                   m->dirty && m_object->internal &&
+                   m->vmp_dirty && m_object->internal &&
                    (m_object->purgable == VM_PURGABLE_DENY ||
                     m_object->purgable == VM_PURGABLE_NONVOLATILE ||
                     m_object->purgable == VM_PURGABLE_VOLATILE)) {
                        vm_page_check_pageable_safe(m);
-                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
-                       m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
+                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq);
+                       m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
                        vm_page_throttled_count++;
                } else {
                        if (m_object->named && m_object->ref_count == 1) {
@@ -4115,17 +4292,17 @@ void vm_page_enqueue_cleaned(vm_page_t m)
 
        assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       assert( !(m->absent && !m->unusual));
+       assert( !(m->vmp_absent && !m->vmp_unusual));
 
        if (VM_PAGE_WIRED(m)) {
                return;
        }
 
-       if (m->gobbled) {
-               if (!m->private && !m->fictitious)
+       if (m->vmp_gobbled) {
+               if (!m->vmp_private && !m->vmp_fictitious)
                        vm_page_wire_count--;
                vm_page_gobble_count--;
-               m->gobbled = FALSE;
+               m->vmp_gobbled = FALSE;
        }
        /*
         * if this page is currently on the pageout queue, we can't do the
@@ -4135,16 +4312,16 @@ void vm_page_enqueue_cleaned(vm_page_t m)
         * reference which is held on the object while the page is in the pageout queue...
         * just let the normal laundry processing proceed
         */
-       if (m->laundry || m->private || m->fictitious ||
-           (m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
-           (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
+       if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
+           (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
+           (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
                    return;
        }
        vm_page_queues_remove(m, FALSE);
 
        vm_page_check_pageable_safe(m);
-       vm_page_queue_enter(&vm_page_queue_cleaned, m, vm_page_t, pageq);
-       m->vm_page_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
+       vm_page_queue_enter(&vm_page_queue_cleaned, m, vm_page_t, vmp_pageq);
+       m->vmp_q_state = VM_PAGE_ON_INACTIVE_CLEANED_Q;
        vm_page_cleaned_count++;
 
        vm_page_inactive_count++;
@@ -4154,10 +4331,10 @@ void vm_page_enqueue_cleaned(vm_page_t m)
                vm_page_pageable_external_count++;
        }
 #if CONFIG_BACKGROUND_QUEUE
-       if (m->vm_page_in_background)
+       if (m->vmp_in_background)
                vm_page_add_to_backgroundq(m, TRUE);
 #endif
-       vm_pageout_enqueued_cleaned++;
+       VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
 }
 
 /*
@@ -4182,14 +4359,14 @@ vm_page_activate(
 #endif
        assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       assert( !(m->absent && !m->unusual));
+       assert( !(m->vmp_absent && !m->vmp_unusual));
 
-       if (m->gobbled) {
+       if (m->vmp_gobbled) {
                assert( !VM_PAGE_WIRED(m));
-               if (!m->private && !m->fictitious)
+               if (!m->vmp_private && !m->vmp_fictitious)
                        vm_page_wire_count--;
                vm_page_gobble_count--;
-               m->gobbled = FALSE;
+               m->vmp_gobbled = FALSE;
        }
        /*
         * if this page is currently on the pageout queue, we can't do the
@@ -4199,17 +4376,17 @@ vm_page_activate(
         * reference which is held on the object while the page is in the pageout queue...
         * just let the normal laundry processing proceed
         */
-       if (m->laundry || m->private || m->fictitious ||
-           (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
-           (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q))
+       if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
+           (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
+           (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q))
                return;
 
 #if DEBUG
-       if (m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q)
+       if (m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q)
                panic("vm_page_activate: already active");
 #endif
 
-       if (m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
+       if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
                DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
                DTRACE_VM2(pgfrec, int, 1, (uint64_t *), NULL);
        }
@@ -4219,12 +4396,12 @@ vm_page_activate(
        if ( !VM_PAGE_WIRED(m)) {
                vm_page_check_pageable_safe(m);
                if (!VM_DYNAMIC_PAGING_ENABLED() && 
-                   m->dirty && m_object->internal && 
+                   m->vmp_dirty && m_object->internal && 
                    (m_object->purgable == VM_PURGABLE_DENY ||
                     m_object->purgable == VM_PURGABLE_NONVOLATILE ||
                     m_object->purgable == VM_PURGABLE_VOLATILE)) {
-                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, pageq);
-                       m->vm_page_q_state = VM_PAGE_ON_THROTTLED_Q;
+                       vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq);
+                       m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
                        vm_page_throttled_count++;
                } else {
 #if CONFIG_SECLUDED_MEMORY
@@ -4233,8 +4410,8 @@ vm_page_activate(
                            num_tasks_can_use_secluded_mem == 0 &&
                            m_object->eligible_for_secluded) {
                                vm_page_queue_enter(&vm_page_queue_secluded, m,
-                                                   vm_page_t, pageq);
-                               m->vm_page_q_state = VM_PAGE_ON_SECLUDED_Q;
+                                                   vm_page_t, vmp_pageq);
+                               m->vmp_q_state = VM_PAGE_ON_SECLUDED_Q;
                                vm_page_secluded_count++;
                                vm_page_secluded_count_inuse++;
                                assert(!m_object->internal);
@@ -4243,8 +4420,8 @@ vm_page_activate(
 #endif /* CONFIG_SECLUDED_MEMORY */
                        vm_page_enqueue_active(m, FALSE);
                }
-               m->reference = TRUE;
-               m->no_cache = FALSE;
+               m->vmp_reference = TRUE;
+               m->vmp_no_cache = FALSE;
        }
        VM_PAGE_CHECK(m);
 }
@@ -4272,7 +4449,7 @@ vm_page_speculate(
 
        assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       assert( !(m->absent && !m->unusual));
+       assert( !(m->vmp_absent && !m->vmp_unusual));
        assert(m_object->internal == FALSE);
 
        /*
@@ -4283,9 +4460,9 @@ vm_page_speculate(
         * reference which is held on the object while the page is in the pageout queue...
         * just let the normal laundry processing proceed
         */
-       if (m->laundry || m->private || m->fictitious ||
-           (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
-           (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q))
+       if (m->vmp_laundry || m->vmp_private || m->vmp_fictitious ||
+           (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
+           (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q))
                return;
 
        vm_page_queues_remove(m, FALSE);
@@ -4309,9 +4486,8 @@ vm_page_speculate(
                        /*
                         * set the timer to begin a new group
                         */
-                       aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000;
-                       aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
-
+                       aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
+                       aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
                        ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
                } else {
                        aq = &vm_page_queue_speculative[speculative_age_index];
@@ -4333,14 +4509,13 @@ vm_page_speculate(
                                if (!vm_page_queue_empty(&aq->age_q))
                                        vm_page_speculate_ageit(aq);
 
-                               aq->age_ts.tv_sec = vm_page_speculative_q_age_ms / 1000;
-                               aq->age_ts.tv_nsec = (vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
-
+                               aq->age_ts.tv_sec = vm_pageout_state.vm_page_speculative_q_age_ms / 1000;
+                               aq->age_ts.tv_nsec = (vm_pageout_state.vm_page_speculative_q_age_ms % 1000) * 1000 * NSEC_PER_USEC;
                                ADD_MACH_TIMESPEC(&aq->age_ts, &ts);
                        }
                }
-               vm_page_enqueue_tail(&aq->age_q, &m->pageq);
-               m->vm_page_q_state = VM_PAGE_ON_SPECULATIVE_Q;
+               vm_page_enqueue_tail(&aq->age_q, &m->vmp_pageq);
+               m->vmp_q_state = VM_PAGE_ON_SPECULATIVE_Q;
                vm_page_speculative_count++;
                vm_page_pageable_external_count++;
 
@@ -4376,19 +4551,19 @@ vm_page_speculate_ageit(struct vm_speculative_age_q *aq)
                sq->age_q.prev = aq->age_q.prev;
                
                t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.next);
-               t->pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);
+               t->vmp_pageq.prev = VM_PAGE_PACK_PTR(&sq->age_q);
 
                t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
-               t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
+               t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
        } else {
                t = (vm_page_t)VM_PAGE_UNPACK_PTR(sq->age_q.prev);
-               t->pageq.next = aq->age_q.next;
+               t->vmp_pageq.next = aq->age_q.next;
                                                
                t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.next);
-               t->pageq.prev = sq->age_q.prev;
+               t->vmp_pageq.prev = sq->age_q.prev;
 
                t = (vm_page_t)VM_PAGE_UNPACK_PTR(aq->age_q.prev);
-               t->pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
+               t->vmp_pageq.next = VM_PAGE_PACK_PTR(&sq->age_q);
 
                sq->age_q.prev = aq->age_q.prev;
        }
@@ -4405,6 +4580,23 @@ vm_page_lru(
        assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
+
+       if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q) {
+               /*
+                * we don't need to do all the other work that 
+                * vm_page_queues_remove and vm_page_enqueue_inactive
+                * bring along for the ride
+                */
+               assert(!m->vmp_laundry);
+               assert(!m->vmp_private);
+               
+               m->vmp_no_cache = FALSE;
+
+               vm_page_queue_remove(&vm_page_queue_inactive, m, vm_page_t, vmp_pageq);
+               vm_page_queue_enter(&vm_page_queue_inactive, m, vm_page_t, vmp_pageq);
+
+               return;
+       }
        /*
         * if this page is currently on the pageout queue, we can't do the
         * vm_page_queues_remove (which doesn't handle the pageout queue case)
@@ -4413,13 +4605,13 @@ vm_page_lru(
         * reference which is held on the object while the page is in the pageout queue...
         * just let the normal laundry processing proceed
         */
-       if (m->laundry || m->private ||
-           (m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
-           (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
+       if (m->vmp_laundry || m->vmp_private ||
+           (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) ||
+           (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) ||
            VM_PAGE_WIRED(m))
                return;
 
-       m->no_cache = FALSE;
+       m->vmp_no_cache = FALSE;
 
        vm_page_queues_remove(m, FALSE);
 
@@ -4448,9 +4640,9 @@ vm_page_reactivate_all_throttled(void)
                /*
                 * Switch "throttled" pages to "active".
                 */
-               vm_page_queue_iterate(&vm_page_queue_throttled, m, vm_page_t, pageq) {
+               vm_page_queue_iterate(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq) {
                        VM_PAGE_CHECK(m);
-                       assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q);
+                       assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
 
                        m_object = VM_PAGE_OBJECT(m);
 
@@ -4461,10 +4653,10 @@ vm_page_reactivate_all_throttled(void)
                                extra_external_count++;
                        }
 
-                       m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
+                       m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
                        VM_PAGE_CHECK(m);
 #if CONFIG_BACKGROUND_QUEUE
-                       if (m->vm_page_in_background)
+                       if (m->vmp_in_background)
                                vm_page_add_to_backgroundq(m, FALSE);
 #endif
                }
@@ -4481,11 +4673,11 @@ vm_page_reactivate_all_throttled(void)
                if (vm_page_queue_empty(&vm_page_queue_active)) {
                        vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
                } else {
-                       first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
+                       first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_throttled);
                }
                vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_throttled);
-               first_throttled->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
-               last_throttled->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
+               first_throttled->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
+               last_throttled->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
 
 #if DEBUG
                printf("reactivated %d throttled pages\n", vm_page_throttled_count);
@@ -4541,20 +4733,20 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
                 */
                assert(!vm_page_queue_empty(&lq->vpl_queue));
 
-               vm_page_queue_iterate(&lq->vpl_queue, m, vm_page_t, pageq) {
+               vm_page_queue_iterate(&lq->vpl_queue, m, vm_page_t, vmp_pageq) {
                        VM_PAGE_CHECK(m);
                        vm_page_check_pageable_safe(m);
-                       assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
-                       assert(!m->fictitious);
+                       assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_LOCAL_Q);
+                       assert(!m->vmp_fictitious);
 
-                       if (m->local_id != lid)
+                       if (m->vmp_local_id != lid)
                                panic("vm_page_reactivate_local: found vm_page_t(%p) with wrong cpuid", m);
                        
-                       m->local_id = 0;
-                       m->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
+                       m->vmp_local_id = 0;
+                       m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
                        VM_PAGE_CHECK(m);
 #if CONFIG_BACKGROUND_QUEUE
-                       if (m->vm_page_in_background)
+                       if (m->vmp_in_background)
                                vm_page_add_to_backgroundq(m, FALSE);
 #endif
                        count++;
@@ -4572,11 +4764,11 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
                if (vm_page_queue_empty(&vm_page_queue_active)) {
                        vm_page_queue_active.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
                } else {
-                       first_active->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
+                       first_active->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
                }
                vm_page_queue_active.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
-               first_local->pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
-               last_local->pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
+               first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(&vm_page_queue_active);
+               last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_active);
 
                vm_page_queue_init(&lq->vpl_queue);
                /*
@@ -4593,6 +4785,8 @@ vm_page_reactivate_local(uint32_t lid, boolean_t force, boolean_t nolocks)
 
        if (nolocks == FALSE) {
                VPL_UNLOCK(&lq->vpl_lock);
+
+               vm_page_balance_inactive(count / 4);
                vm_page_unlock_queues();
        }
 }
@@ -4655,7 +4849,7 @@ vm_page_zero_fill(
 {
         XPR(XPR_VM_PAGE,
            "vm_page_zero_fill, object 0x%X offset 0x%X page 0x%X\n",
-           VM_PAGE_OBJECT(m), m->offset, m, 0,0);
+           VM_PAGE_OBJECT(m), m->vmp_offset, m, 0,0);
 #if 0
        /*
         * we don't hold the page queue lock
@@ -4714,8 +4908,8 @@ vm_page_copy(
 
         XPR(XPR_VM_PAGE,
            "vm_page_copy, object 0x%X offset 0x%X to object 0x%X offset 0x%X\n",
-           src_m_object, src_m->offset, 
-           VM_PAGE_OBJECT(dest_m), dest_m->offset,
+           src_m_object, src_m->vmp_offset, 
+           VM_PAGE_OBJECT(dest_m), dest_m->vmp_offset,
            0);
 #if 0
        /*
@@ -4740,33 +4934,22 @@ vm_page_copy(
 #if DEVELOPMENT || DEBUG
                DTRACE_VM4(codesigned_copy,
                           vm_object_t, src_m_object,
-                          vm_object_offset_t, src_m->offset,
-                          int, src_m->cs_validated,
-                          int, src_m->cs_tainted);
+                          vm_object_offset_t, src_m->vmp_offset,
+                          int, src_m->vmp_cs_validated,
+                          int, src_m->vmp_cs_tainted);
 #endif /* DEVELOPMENT || DEBUG */
 
        }
 
-       if (vm_page_is_slideable(src_m)) {
-               boolean_t was_busy = src_m->busy;
-               src_m->busy = TRUE;
-               (void) vm_page_slide(src_m, 0);
-               assert(src_m->busy);
-               if (!was_busy) {
-                       PAGE_WAKEUP_DONE(src_m);
-               }
-       }
-
        /*
         * Propagate the cs_tainted bit to the copy page. Do not propagate
         * the cs_validated bit.
         */
-       dest_m->cs_tainted = src_m->cs_tainted;
-       if (dest_m->cs_tainted) {
+       dest_m->vmp_cs_tainted = src_m->vmp_cs_tainted;
+       if (dest_m->vmp_cs_tainted) {
                vm_page_copy_cs_tainted++;
        }
-       dest_m->slid = src_m->slid;
-       dest_m->error = src_m->error; /* sliding src_m might have failed... */
+       dest_m->vmp_error = src_m->vmp_error; /* sliding src_m might have failed... */
        pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(src_m), VM_PAGE_GET_PHYS_PAGE(dest_m));
 }
 
@@ -4777,45 +4960,45 @@ _vm_page_print(
 {
        printf("vm_page %p: \n", p);
        printf("  pageq: next=%p prev=%p\n",
-              (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.next),
-              (vm_page_t)VM_PAGE_UNPACK_PTR(p->pageq.prev));
+              (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next),
+              (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.prev));
        printf("  listq: next=%p prev=%p\n",
-              (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.next)),
-              (vm_page_t)(VM_PAGE_UNPACK_PTR(p->listq.prev)));
-       printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m)));
-       printf("  object=%p offset=0x%llx\n",VM_PAGE_OBJECT(p), p->offset);
-       printf("  wire_count=%u\n", p->wire_count);
-       printf("  q_state=%u\n", p->vm_page_q_state);
+              (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.next)),
+              (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_listq.prev)));
+       printf("  next=%p\n", (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m)));
+       printf("  object=%p offset=0x%llx\n",VM_PAGE_OBJECT(p), p->vmp_offset);
+       printf("  wire_count=%u\n", p->vmp_wire_count);
+       printf("  q_state=%u\n", p->vmp_q_state);
 
        printf("  %slaundry, %sref, %sgobbled, %sprivate\n",
-              (p->laundry ? "" : "!"),
-              (p->reference ? "" : "!"),
-              (p->gobbled ? "" : "!"),
-              (p->private ? "" : "!"));
+              (p->vmp_laundry ? "" : "!"),
+              (p->vmp_reference ? "" : "!"),
+              (p->vmp_gobbled ? "" : "!"),
+              (p->vmp_private ? "" : "!"));
        printf("  %sbusy, %swanted, %stabled, %sfictitious, %spmapped, %swpmapped\n",
-              (p->busy ? "" : "!"),
-              (p->wanted ? "" : "!"),
-              (p->tabled ? "" : "!"),
-              (p->fictitious ? "" : "!"),
-              (p->pmapped ? "" : "!"),
-              (p->wpmapped ? "" : "!"));
+              (p->vmp_busy ? "" : "!"),
+              (p->vmp_wanted ? "" : "!"),
+              (p->vmp_tabled ? "" : "!"),
+              (p->vmp_fictitious ? "" : "!"),
+              (p->vmp_pmapped ? "" : "!"),
+              (p->vmp_wpmapped ? "" : "!"));
        printf("  %sfree_when_done, %sabsent, %serror, %sdirty, %scleaning, %sprecious, %sclustered\n",
-              (p->free_when_done ? "" : "!"),
-              (p->absent ? "" : "!"),
-              (p->error ? "" : "!"),
-              (p->dirty ? "" : "!"),
-              (p->cleaning ? "" : "!"),
-              (p->precious ? "" : "!"),
-              (p->clustered ? "" : "!"));
+              (p->vmp_free_when_done ? "" : "!"),
+              (p->vmp_absent ? "" : "!"),
+              (p->vmp_error ? "" : "!"),
+              (p->vmp_dirty ? "" : "!"),
+              (p->vmp_cleaning ? "" : "!"),
+              (p->vmp_precious ? "" : "!"),
+              (p->vmp_clustered ? "" : "!"));
        printf("  %soverwriting, %srestart, %sunusual\n",
-              (p->overwriting ? "" : "!"),
-              (p->restart ? "" : "!"),
-              (p->unusual ? "" : "!"));
+              (p->vmp_overwriting ? "" : "!"),
+              (p->vmp_restart ? "" : "!"),
+              (p->vmp_unusual ? "" : "!"));
        printf("  %scs_validated, %scs_tainted, %scs_nx, %sno_cache\n",
-              (p->cs_validated ? "" : "!"),
-              (p->cs_tainted ? "" : "!"),
-              (p->cs_nx ? "" : "!"),
-              (p->no_cache ? "" : "!"));
+              (p->vmp_cs_validated ? "" : "!"),
+              (p->vmp_cs_tainted ? "" : "!"),
+              (p->vmp_cs_nx ? "" : "!"),
+              (p->vmp_no_cache ? "" : "!"));
 
        printf("phys_page=0x%x\n", VM_PAGE_GET_PHYS_PAGE(p));
 }
@@ -4880,28 +5063,28 @@ vm_page_verify_free_list(
        vm_page_queue_iterate(vm_page_queue,
                              m,
                              vm_page_t,
-                             pageq) {
+                             vmp_pageq) {
 
                if (m == look_for_page) {
                        found_page = TRUE;
                }
-               if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev) != prev_m)
+               if ((vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev) != prev_m)
                        panic("vm_page_verify_free_list(color=%u, npages=%u): page %p corrupted prev ptr %p instead of %p\n",
-                             color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.prev), prev_m);
-               if ( ! m->busy )
+                             color, npages, m, (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.prev), prev_m);
+               if ( ! m->vmp_busy )
                        panic("vm_page_verify_free_list(color=%u, npages=%u): page %p not busy\n",
                              color, npages, m);
                if (color != (unsigned int) -1) {
                        if (VM_PAGE_GET_COLOR(m) != color)
                                panic("vm_page_verify_free_list(color=%u, npages=%u): page %p wrong color %u instead of %u\n",
                                      color, npages, m, VM_PAGE_GET_COLOR(m), color);
-                       if (m->vm_page_q_state != VM_PAGE_ON_FREE_Q)
+                       if (m->vmp_q_state != VM_PAGE_ON_FREE_Q)
                                panic("vm_page_verify_free_list(color=%u, npages=%u): page %p - expecting q_state == VM_PAGE_ON_FREE_Q, found %d\n",
-                                     color, npages, m, m->vm_page_q_state);
+                                     color, npages, m, m->vmp_q_state);
                } else {
-                       if (m->vm_page_q_state != VM_PAGE_ON_FREE_LOCAL_Q)
+                       if (m->vmp_q_state != VM_PAGE_ON_FREE_LOCAL_Q)
                                panic("vm_page_verify_free_list(npages=%u): local page %p - expecting q_state == VM_PAGE_ON_FREE_LOCAL_Q, found %d\n",
-                                     npages, m, m->vm_page_q_state);
+                                     npages, m, m->vmp_q_state);
                }
                ++npages;
                prev_m = m;
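
vm_page_verify_free_list() walks one color's free queue through the renamed vmp_pageq links, checking that each element's back pointer matches the previous element, that the page is marked busy, and that its color and queue state match what a free-queue page must have. The same invariant-checking pattern over a plain NULL-terminated doubly linked list, sketched with hypothetical types:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct page {
            struct page *next, *prev;
            bool         busy;
            unsigned     color;
    };

    /* verify back pointers, busy flag and color for every page on a free list */
    static unsigned
    verify_free_list(struct page *head, unsigned color)
    {
            unsigned     npages = 0;
            struct page *prev_m = NULL;

            for (struct page *m = head; m != NULL; prev_m = m, m = m->next, npages++) {
                    if (m->prev != prev_m) {
                            fprintf(stderr, "page %p: corrupted prev ptr\n", (void *)m);
                            abort();
                    }
                    if (!m->busy) {
                            fprintf(stderr, "page %p: not busy\n", (void *)m);
                            abort();
                    }
                    if (m->color != color) {
                            fprintf(stderr, "page %p: wrong color %u\n", (void *)m, m->color);
                            abort();
                    }
            }
            return npages;
    }
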
@@ -5150,25 +5333,25 @@ full_scan_again:
 
                                for(npages = 0; npages < contig_pages; npages++, last_idx++) {
 
-                                       assert(vm_pages[last_idx].gobbled == FALSE);
+                                       assert(vm_pages[last_idx].vmp_gobbled == FALSE);
 
-                                       vm_pages[last_idx].gobbled = TRUE;
+                                       vm_pages[last_idx].vmp_gobbled = TRUE;
                                        vm_page_gobble_count++;
 
-                                       assert(1 == vm_pages[last_idx].wire_count);
+                                       assert(1 == vm_pages[last_idx].vmp_wire_count);
                                        /*
                                         * Gobbled pages are counted as wired pages. So no need to drop
                                         * the global wired page count. Just the page's wire count is fine.
                                         */
-                                       vm_pages[last_idx].wire_count--;
-                                       vm_pages[last_idx].vm_page_q_state = VM_PAGE_NOT_ON_Q;
+                                       vm_pages[last_idx].vmp_wire_count--;
+                                       vm_pages[last_idx].vmp_q_state = VM_PAGE_NOT_ON_Q;
                                }
                        
                        }
 
                        last_idx = start_idx + contig_pages - 1;
 
-                       vm_pages[last_idx].snext = NULL;
+                       vm_pages[last_idx].vmp_snext = NULL;
 
                        printf("Using preallocated buffer: Requested size (pages):%d... index range: %d-%d...freeing %llu pages\n", contig_pages, start_idx, last_idx, PREALLOCATED_CONTIG_BUFFER_PAGES_COUNT - contig_pages);
 
@@ -5222,8 +5405,8 @@ retry:
                scanned++;
                m = &vm_pages[page_idx];
 
-               assert(!m->fictitious);
-               assert(!m->private);
+               assert(!m->vmp_fictitious);
+               assert(!m->vmp_private);
 
                if (max_pnum && VM_PAGE_GET_PHYS_PAGE(m) > max_pnum) {
                        /* no more low pages... */
@@ -5235,9 +5418,9 @@ retry:
                         */
                        RESET_STATE_OF_RUN();
 
-               } else if (VM_PAGE_WIRED(m) || m->gobbled ||
-                          m->laundry || m->wanted ||
-                          m->cleaning || m->overwriting || m->free_when_done) {
+               } else if (VM_PAGE_WIRED(m) || m->vmp_gobbled ||
+                          m->vmp_laundry || m->vmp_wanted ||
+                          m->vmp_cleaning || m->vmp_overwriting || m->vmp_free_when_done) {
                        /*
                         * page is in a transient state
                         * or a state we don't want to deal
@@ -5246,14 +5429,14 @@ retry:
                         */
                        RESET_STATE_OF_RUN();
 
-               } else if ((m->vm_page_q_state == VM_PAGE_NOT_ON_Q) ||
-                          (m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
-                          (m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
-                          (m->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
+               } else if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
+                          (m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q) ||
+                          (m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q) ||
+                          (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
                        /*
                         * page needs to be on one of our queues (other than the pageout or special free queues)
                         * or it needs to belong to the compressor pool (which is now indicated
-                        * by vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out
+                        * by vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR and falls out
                         * from the check for VM_PAGE_NOT_ON_Q)
                         * in order for it to be stable behind the
                         * locks we hold at this point...
@@ -5262,7 +5445,7 @@ retry:
                         */
                        RESET_STATE_OF_RUN();
 
-               } else if ((m->vm_page_q_state != VM_PAGE_ON_FREE_Q) && (!m->tabled || m->busy)) {
+               } else if ((m->vmp_q_state != VM_PAGE_ON_FREE_Q) && (!m->vmp_tabled || m->vmp_busy)) {
                        /*
                         * pages on the free list are always 'busy'
                         * so we couldn't test for 'busy' in the check
@@ -5292,7 +5475,7 @@ retry:
                        prevcontaddr = VM_PAGE_GET_PHYS_PAGE(m);
                        
                        VM_PAGE_CHECK(m);
-                       if (m->vm_page_q_state == VM_PAGE_ON_FREE_Q) {
+                       if (m->vmp_q_state == VM_PAGE_ON_FREE_Q) {
                                free_considered++;
                        } else {
                                /*
@@ -5305,7 +5488,7 @@ retry:
                                 * into a substitute page.
                                 */
 #if VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
-                               if (m->pmapped || m->dirty || m->precious) {
+                               if (m->vmp_pmapped || m->vmp_dirty || m->vmp_precious) {
                                        substitute_needed++;
                                }
 #else
@@ -5413,10 +5596,10 @@ did_consider:
                        m1 = &vm_pages[start_idx++];
 
 #if !VM_PAGE_FIND_CONTIGUOUS_CAN_STEAL
-                       assert(m1->vm_page_q_state == VM_PAGE_ON_FREE_Q);
+                       assert(m1->vmp_q_state == VM_PAGE_ON_FREE_Q);
 #endif
 
-                       if (m1->vm_page_q_state == VM_PAGE_ON_FREE_Q) {
+                       if (m1->vmp_q_state == VM_PAGE_ON_FREE_Q) {
                                unsigned int color;
 
                                color = VM_PAGE_GET_COLOR(m1);
@@ -5426,7 +5609,7 @@ did_consider:
                                vm_page_queue_remove(&vm_page_queue_free[color].qhead,
                                                     m1,
                                                     vm_page_t,
-                                                    pageq);
+                                                    vmp_pageq);
 
                                VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
 #if MACH_ASSERT
@@ -5437,8 +5620,8 @@ did_consider:
                                 * does not get considered for another
                                 * concurrent physically-contiguous allocation.
                                 */
-                               m1->vm_page_q_state = VM_PAGE_NOT_ON_Q;
-                               assert(m1->busy);
+                               m1->vmp_q_state = VM_PAGE_NOT_ON_Q;
+                               assert(m1->vmp_busy);
 
                                vm_page_free_count--;
                        }
@@ -5467,16 +5650,16 @@ did_consider:
                         */
                        m1 = &vm_pages[cur_idx--];
 
-                       if (m1->vm_page_object == 0) {
+                       if (m1->vmp_object == 0) {
                                /*
                                 * page has already been removed from
                                 * the free list in the 1st pass
                                 */
-                               assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                               assert(m1->offset == (vm_object_offset_t) -1);
-                               assert(m1->busy);
-                               assert(!m1->wanted);
-                               assert(!m1->laundry);
+                               assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                               assert(m1->vmp_offset == (vm_object_offset_t) -1);
+                               assert(m1->vmp_busy);
+                               assert(!m1->vmp_wanted);
+                               assert(!m1->vmp_laundry);
                        } else {
                                vm_object_t object;
                                int refmod;
@@ -5485,7 +5668,7 @@ did_consider:
                                if (abort_run == TRUE)
                                        continue;
 
-                               assert(m1->vm_page_q_state != VM_PAGE_NOT_ON_Q);
+                               assert(m1->vmp_q_state != VM_PAGE_NOT_ON_Q);
 
                                object = VM_PAGE_OBJECT(m1);
 
@@ -5498,10 +5681,10 @@ did_consider:
                                                locked_object = object;
                                }
                                if (locked_object == VM_OBJECT_NULL || 
-                                   (VM_PAGE_WIRED(m1) || m1->gobbled ||
-                                    m1->laundry || m1->wanted ||
-                                    m1->cleaning || m1->overwriting || m1->free_when_done || m1->busy) ||
-                                   (m1->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
+                                   (VM_PAGE_WIRED(m1) || m1->vmp_gobbled ||
+                                    m1->vmp_laundry || m1->vmp_wanted ||
+                                    m1->vmp_cleaning || m1->vmp_overwriting || m1->vmp_free_when_done || m1->vmp_busy) ||
+                                   (m1->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)) {
 
                                        if (locked_object) {
                                                vm_object_unlock(locked_object);
@@ -5515,11 +5698,11 @@ did_consider:
                                disconnected = FALSE;
                                reusable = FALSE;
 
-                               if ((m1->reusable ||
+                               if ((m1->vmp_reusable ||
                                     object->all_reusable) &&
-                                   (m1->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) &&
-                                   !m1->dirty &&
-                                   !m1->reference) {
+                                   (m1->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) &&
+                                   !m1->vmp_dirty &&
+                                   !m1->vmp_reference) {
                                        /* reusable page... */
                                        refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
                                        disconnected = TRUE;
@@ -5532,10 +5715,10 @@ did_consider:
                                        }
                                }
 
-                               if ((m1->pmapped &&
+                               if ((m1->vmp_pmapped &&
                                     ! reusable) ||
-                                   m1->dirty ||
-                                   m1->precious) {
+                                   m1->vmp_dirty ||
+                                   m1->vmp_precious) {
                                        vm_object_offset_t offset;
 
                                        m2 = vm_page_grab();
@@ -5550,7 +5733,7 @@ did_consider:
                                                continue;
                                        }
                                        if (! disconnected) {
-                                               if (m1->pmapped)
+                                               if (m1->vmp_pmapped)
                                                        refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m1));
                                                else
                                                        refmod = 0;
@@ -5560,32 +5743,32 @@ did_consider:
                                        pmap_copy_page(VM_PAGE_GET_PHYS_PAGE(m1), VM_PAGE_GET_PHYS_PAGE(m2));
                                        /* copy the page's state */
                                        assert(!VM_PAGE_WIRED(m1));
-                                       assert(m1->vm_page_q_state != VM_PAGE_ON_FREE_Q);
-                                       assert(m1->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q);
-                                       assert(!m1->laundry);
-                                       m2->reference   = m1->reference;
-                                       assert(!m1->gobbled);
-                                       assert(!m1->private);
-                                       m2->no_cache    = m1->no_cache;
-                                       m2->xpmapped    = 0;
-                                       assert(!m1->busy);
-                                       assert(!m1->wanted);
-                                       assert(!m1->fictitious);
-                                       m2->pmapped     = m1->pmapped; /* should flush cache ? */
-                                       m2->wpmapped    = m1->wpmapped;
-                                       assert(!m1->free_when_done);
-                                       m2->absent      = m1->absent;
-                                       m2->error       = m1->error;
-                                       m2->dirty       = m1->dirty;
-                                       assert(!m1->cleaning);
-                                       m2->precious    = m1->precious;
-                                       m2->clustered   = m1->clustered;
-                                       assert(!m1->overwriting);
-                                       m2->restart     = m1->restart;
-                                       m2->unusual     = m1->unusual;
-                                       m2->cs_validated = m1->cs_validated;
-                                       m2->cs_tainted  = m1->cs_tainted;
-                                       m2->cs_nx       = m1->cs_nx;
+                                       assert(m1->vmp_q_state != VM_PAGE_ON_FREE_Q);
+                                       assert(m1->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q);
+                                       assert(!m1->vmp_laundry);
+                                       m2->vmp_reference       = m1->vmp_reference;
+                                       assert(!m1->vmp_gobbled);
+                                       assert(!m1->vmp_private);
+                                       m2->vmp_no_cache        = m1->vmp_no_cache;
+                                       m2->vmp_xpmapped        = 0;
+                                       assert(!m1->vmp_busy);
+                                       assert(!m1->vmp_wanted);
+                                       assert(!m1->vmp_fictitious);
+                                       m2->vmp_pmapped = m1->vmp_pmapped; /* should flush cache ? */
+                                       m2->vmp_wpmapped        = m1->vmp_wpmapped;
+                                       assert(!m1->vmp_free_when_done);
+                                       m2->vmp_absent  = m1->vmp_absent;
+                                       m2->vmp_error   = m1->vmp_error;
+                                       m2->vmp_dirty   = m1->vmp_dirty;
+                                       assert(!m1->vmp_cleaning);
+                                       m2->vmp_precious        = m1->vmp_precious;
+                                       m2->vmp_clustered       = m1->vmp_clustered;
+                                       assert(!m1->vmp_overwriting);
+                                       m2->vmp_restart = m1->vmp_restart;
+                                       m2->vmp_unusual = m1->vmp_unusual;
+                                       m2->vmp_cs_validated = m1->vmp_cs_validated;
+                                       m2->vmp_cs_tainted      = m1->vmp_cs_tainted;
+                                       m2->vmp_cs_nx   = m1->vmp_cs_nx;
 
                                        /*
                                         * If m1 had really been reusable,
@@ -5594,14 +5777,13 @@ did_consider:
                                         * bit and assert that m2 is not
                                         * marked as "reusable".
                                         */
-                                       // m2->reusable = m1->reusable;
-                                       assert(!m2->reusable);
+                                       // m2->vmp_reusable     = m1->vmp_reusable;
+                                       assert(!m2->vmp_reusable);
 
-                                       // assert(!m1->lopage);
-                                       m2->slid        = m1->slid;
+                                       // assert(!m1->vmp_lopage);
 
-                                       if (m1->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR)
-                                               m2->vm_page_q_state = VM_PAGE_USED_BY_COMPRESSOR;
+                                       if (m1->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR)
+                                               m2->vmp_q_state = VM_PAGE_USED_BY_COMPRESSOR;
 
                                        /*
                                         * page may need to be flushed if
@@ -5609,7 +5791,7 @@ did_consider:
                                         * that is going to be used by a device
                                         * that doesn't support coherency
                                         */
-                                       m2->written_by_kernel = TRUE;
+                                       m2->vmp_written_by_kernel = TRUE;
 
                                        /*
                                         * make sure we clear the ref/mod state
@@ -5620,11 +5802,11 @@ did_consider:
                                        pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m2), VM_MEM_MODIFIED | VM_MEM_REFERENCED);
 
                                        if (refmod & VM_MEM_REFERENCED)
-                                               m2->reference = TRUE;
+                                               m2->vmp_reference = TRUE;
                                        if (refmod & VM_MEM_MODIFIED) {
                                                SET_PAGE_DIRTY(m2, TRUE);
                                        }
-                                       offset = m1->offset;
+                                       offset = m1->vmp_offset;
 
                                        /*
                                         * completely cleans up the state
@@ -5641,11 +5823,11 @@ did_consider:
                                         */
                                        vm_page_insert_internal(m2, locked_object, offset, VM_KERN_MEMORY_NONE, TRUE, TRUE, FALSE, FALSE, NULL);
 
-                                       if (m2->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
-                                               m2->pmapped = TRUE;
-                                               m2->wpmapped = TRUE;
+                                       if (m2->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
+                                               m2->vmp_pmapped = TRUE;
+                                               m2->vmp_wpmapped = TRUE;
 
-                                               PMAP_ENTER(kernel_pmap, m2->offset, m2,
+                                               PMAP_ENTER(kernel_pmap, m2->vmp_offset, m2,
                                                           VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE, kr);
 
                                                assert(kr == KERN_SUCCESS);
@@ -5653,7 +5835,7 @@ did_consider:
                                                compressed_pages++;
 
                                        } else {
-                                               if (m2->reference)
+                                               if (m2->vmp_reference)
                                                        vm_page_activate(m2);
                                                else
                                                        vm_page_deactivate(m2);
@@ -5661,7 +5843,7 @@ did_consider:
                                        PAGE_WAKEUP_DONE(m2);
 
                                } else {
-                                       assert(m1->vm_page_q_state != VM_PAGE_USED_BY_COMPRESSOR);
+                                       assert(m1->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
 
                                        /*
                                         * completely cleans up the state
@@ -5680,7 +5862,7 @@ did_consider:
                        vm_page_assign_background_state(m1);
 #endif
                        VM_PAGE_ZERO_PAGEQ_ENTRY(m1);
-                       m1->snext = m;
+                       m1->vmp_snext = m;
                        m = m1;
                }
                if (locked_object) {
@@ -5746,14 +5928,14 @@ did_consider:
 
                for (m1 = m; m1 != VM_PAGE_NULL; m1 = NEXT_PAGE(m1)) {
 
-                       assert(m1->vm_page_q_state == VM_PAGE_NOT_ON_Q);
-                       assert(m1->wire_count == 0);
+                       assert(m1->vmp_q_state == VM_PAGE_NOT_ON_Q);
+                       assert(m1->vmp_wire_count == 0);
 
                        if (wire == TRUE) {
-                               m1->wire_count++;
-                               m1->vm_page_q_state = VM_PAGE_IS_WIRED;
+                               m1->vmp_wire_count++;
+                               m1->vmp_q_state = VM_PAGE_IS_WIRED;
                        } else
-                               m1->gobbled = TRUE;
+                               m1->vmp_gobbled = TRUE;
                }
                if (wire == FALSE)
                        vm_page_gobble_count += npages;
@@ -5850,10 +6032,8 @@ cpm_allocate(
        /*
         * determine need for wakeups
         */
-       if ((vm_page_free_count < vm_page_free_min) ||
-            ((vm_page_free_count < vm_page_free_target) &&
-             ((vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_min)))
-                thread_wakeup((event_t) &vm_page_free_wanted);
+       if (vm_page_free_count < vm_page_free_min)
+               thread_wakeup((event_t) &vm_page_free_wanted);
                
        VM_CHECK_MEMORYSTATUS;
        
@@ -5949,18 +6129,18 @@ vm_page_do_delayed_work(
                if (dwp->dw_mask & DW_vm_page_free) {
                        vm_page_free_prepare_queues(m);
 
-                       assert(m->pageq.next == 0 && m->pageq.prev == 0);
+                       assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
                        /*
                         * Add this page to our list of reclaimed pages,
                         * to be freed later.
                         */
-                       m->snext = local_free_q;
+                       m->vmp_snext = local_free_q;
                        local_free_q = m;
                } else {
                        if (dwp->dw_mask & DW_vm_page_deactivate_internal)
                                vm_page_deactivate_internal(m, FALSE);
                        else if (dwp->dw_mask & DW_vm_page_activate) {
-                               if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q) {
+                               if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q) {
                                        vm_page_activate(m);
                                }
                        }
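
The DW_vm_page_free path above threads reclaimed pages onto a local list through vmp_snext and frees the whole batch later. A minimal stand-alone sketch of that intrusive push-then-drain pattern; the struct and names are illustrative, not the kernel's vm_page_t:

    #include <stdio.h>
    #include <stdlib.h>

    struct page {
        struct page *snext;     /* intrusive single link, in the spirit of vmp_snext */
        int          id;
    };

    /* push a page onto the head of a local reclaim list */
    static void reclaim_push(struct page **local_free_q, struct page *p)
    {
        p->snext = *local_free_q;
        *local_free_q = p;
    }

    /* drain the list in one pass, freeing every page on it */
    static void reclaim_drain(struct page **local_free_q)
    {
        struct page *p = *local_free_q;
        while (p != NULL) {
            struct page *next = p->snext;
            printf("freeing page %d\n", p->id);
            free(p);
            p = next;
        }
        *local_free_q = NULL;
    }

    int main(void)
    {
        struct page *local_free_q = NULL;
        for (int i = 0; i < 3; i++) {
            struct page *p = malloc(sizeof(*p));
            p->id = i;
            reclaim_push(&local_free_q, p);
        }
        reclaim_drain(&local_free_q);   /* frees 2, 1, 0 (LIFO order) */
        return 0;
    }

Batching the frees this way keeps the expensive queue/lock work out of the per-page loop, which is the point of the "delayed work" machinery.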
@@ -5981,30 +6161,29 @@ vm_page_do_delayed_work(
                                         * this page has been touched since it got cleaned; let's activate it
                                         * if it hasn't already been
                                         */
-                                       vm_pageout_enqueued_cleaned++;
-                                       vm_pageout_cleaned_reactivated++;
-                                       vm_pageout_cleaned_commit_reactivated++;
+                                       VM_PAGEOUT_DEBUG(vm_pageout_enqueued_cleaned, 1);
+                                       VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
 
-                                       if (m->vm_page_q_state != VM_PAGE_ON_ACTIVE_Q)
+                                       if (m->vmp_q_state != VM_PAGE_ON_ACTIVE_Q)
                                                vm_page_activate(m);
                                } else {
-                                       m->reference = FALSE;
+                                       m->vmp_reference = FALSE;
                                        vm_page_enqueue_cleaned(m);
                                }
                        }
                        else if (dwp->dw_mask & DW_vm_page_lru)
                                vm_page_lru(m);
                        else if (dwp->dw_mask & DW_VM_PAGE_QUEUES_REMOVE) {
-                               if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q)
+                               if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q)
                                        vm_page_queues_remove(m, TRUE);
                        }
                        if (dwp->dw_mask & DW_set_reference)
-                               m->reference = TRUE;
+                               m->vmp_reference = TRUE;
                        else if (dwp->dw_mask & DW_clear_reference)
-                               m->reference = FALSE;
+                               m->vmp_reference = FALSE;
 
                        if (dwp->dw_mask & DW_move_page) {
-                               if (m->vm_page_q_state != VM_PAGE_ON_PAGEOUT_Q) {
+                               if (m->vmp_q_state != VM_PAGE_ON_PAGEOUT_Q) {
                                        vm_page_queues_remove(m, FALSE);
 
                                        assert(VM_PAGE_OBJECT(m) != kernel_object);
@@ -6013,7 +6192,7 @@ vm_page_do_delayed_work(
                                }
                        }
                        if (dwp->dw_mask & DW_clear_busy)
-                               m->busy = FALSE;
+                               m->vmp_busy = FALSE;
 
                        if (dwp->dw_mask & DW_PAGE_WAKEUP)
                                PAGE_WAKEUP(m);
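
vm_page_do_delayed_work dispatches each queued operation by testing bits in dwp->dw_mask. A small stand-alone analogue of that flag-dispatch style; the flag names and handlers below are illustrative, not the kernel's DW_* set:

    #include <stdio.h>

    /* illustrative work flags, analogous in spirit to the DW_* masks */
    enum {
        WORK_FREE       = 1u << 0,
        WORK_ACTIVATE   = 1u << 1,
        WORK_CLEAR_BUSY = 1u << 2,
        WORK_WAKEUP     = 1u << 3,
    };

    struct page {
        int busy;
        int active;
    };

    static void do_delayed_work(struct page *p, unsigned mask)
    {
        if (mask & WORK_FREE) {
            printf("free page\n");
            return;                     /* freeing supersedes the other ops */
        }
        if (mask & WORK_ACTIVATE) {
            if (!p->active) {
                p->active = 1;
                printf("activated\n");
            }
        }
        if (mask & WORK_CLEAR_BUSY)
            p->busy = 0;
        if (mask & WORK_WAKEUP)
            printf("wakeup waiters\n");
    }

    int main(void)
    {
        struct page p = { .busy = 1, .active = 0 };
        do_delayed_work(&p, WORK_ACTIVATE | WORK_CLEAR_BUSY | WORK_WAKEUP);
        return 0;
    }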
@@ -6053,7 +6232,7 @@ vm_page_alloc_list(
 
                        return (KERN_RESOURCE_SHORTAGE);
                }
-               mem->snext = lo_page_list;
+               mem->vmp_snext = lo_page_list;
                lo_page_list = mem;
        }
        *list = lo_page_list;
@@ -6064,19 +6243,19 @@ vm_page_alloc_list(
 void
 vm_page_set_offset(vm_page_t page, vm_object_offset_t offset)
 {
-       page->offset = offset;
+       page->vmp_offset = offset;
 }
 
 vm_page_t
 vm_page_get_next(vm_page_t page)
 {
-       return (page->snext);
+       return (page->vmp_snext);
 }
 
 vm_object_offset_t
 vm_page_get_offset(vm_page_t page)
 {
-       return (page->offset);
+       return (page->vmp_offset);
 }
 
 ppnum_t
@@ -6261,12 +6440,12 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
                                l_object = m_object;
                        }
                }
-               if ( !m_object->alive || m->cleaning || m->laundry || m->busy || m->absent || m->error) {
+               if ( !m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || m->vmp_error) {
                        /*
                         * page is not to be cleaned
                         * put it back on the head of its queue
                         */
-                       if (m->cleaning)
+                       if (m->vmp_cleaning)
                                hibernate_stats.hibernate_skipped_cleaning++;
                        else
                                hibernate_stats.hibernate_skipped_transient++;
@@ -6282,7 +6461,7 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
                                goto reenter_pg_on_q;
                        }
                }
-               if ( !m->dirty && m->pmapped) {
+               if ( !m->vmp_dirty && m->vmp_pmapped) {
                        refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
 
                        if ((refmod_state & VM_MEM_MODIFIED)) {
@@ -6291,12 +6470,12 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
                } else
                        refmod_state = 0;
 
-               if ( !m->dirty) {
+               if ( !m->vmp_dirty) {
                        /*
                         * page is not to be cleaned
                         * put it back on the head of its queue
                         */
-                       if (m->precious)
+                       if (m->vmp_precious)
                                hibernate_stats.hibernate_skipped_precious++;
 
                        goto reenter_pg_on_q;
@@ -6380,8 +6559,8 @@ hibernate_flush_queue(vm_page_queue_head_t *q, int qcount)
                goto next_pg;
 
 reenter_pg_on_q:
-               vm_page_queue_remove(q, m, vm_page_t, pageq);
-               vm_page_queue_enter(q, m, vm_page_t, pageq);
+               vm_page_queue_remove(q, m, vm_page_t, vmp_pageq);
+               vm_page_queue_enter(q, m, vm_page_t, vmp_pageq);
 
                hibernate_stats.hibernate_reentered_on_q++;
 next_pg:
@@ -6429,7 +6608,7 @@ hibernate_flush_dirty_pages(int pass)
                vm_page_queue_iterate(&aq->age_q,
                              m,
                              vm_page_t,
-                             pageq)
+                             vmp_pageq)
                {
                        qcount++;
                }
@@ -6573,7 +6752,7 @@ hibernate_free_gobble_pages(void)
     m = (vm_page_t) hibernate_gobble_queue;
     while(m)
     {
-        next = m->snext;
+        next = m->vmp_snext;
         vm_page_free(m);
         count++;
         m = next;
@@ -6593,7 +6772,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight)
 
     do
     {
-        if (m->private)
+        if (m->vmp_private)
             panic("hibernate_consider_discard: private");
 
        object = VM_PAGE_OBJECT(m);
@@ -6607,38 +6786,38 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight)
            if (!preflight) hibernate_stats.cd_found_wired++;
             break;
        }
-        if (m->precious) {
+        if (m->vmp_precious) {
            if (!preflight) hibernate_stats.cd_found_precious++;
             break;
        }
-        if (m->busy || !object->alive) {
+        if (m->vmp_busy || !object->alive) {
            /*
             *  Somebody is playing with this page.
             */
            if (!preflight) hibernate_stats.cd_found_busy++;
             break;
        }
-        if (m->absent || m->unusual || m->error) {
+        if (m->vmp_absent || m->vmp_unusual || m->vmp_error) {
            /*
             * If it's unusual in any way, ignore it
             */
            if (!preflight) hibernate_stats.cd_found_unusual++;
             break;
        }
-        if (m->cleaning) {
+        if (m->vmp_cleaning) {
            if (!preflight) hibernate_stats.cd_found_cleaning++;
             break;
        }
-       if (m->laundry) {
+       if (m->vmp_laundry) {
            if (!preflight) hibernate_stats.cd_found_laundry++;
             break;
        }
-        if (!m->dirty)
+        if (!m->vmp_dirty)
         {
                refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
         
             if (refmod_state & VM_MEM_REFERENCED)
-                m->reference = TRUE;
+                m->vmp_reference = TRUE;
             if (refmod_state & VM_MEM_MODIFIED) {
                SET_PAGE_DIRTY(m, FALSE);
            }
@@ -6647,7 +6826,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight)
         /*
          * If it's clean or purgeable we can discard the page on wakeup.
          */
-        discard = (!m->dirty) 
+        discard = (!m->vmp_dirty) 
                    || (VM_PURGABLE_VOLATILE == object->purgable)
                    || (VM_PURGABLE_EMPTY    == object->purgable);
 
@@ -6655,7 +6834,7 @@ hibernate_consider_discard(vm_page_t m, boolean_t preflight)
         if (discard == FALSE) {
                if (!preflight)
                        hibernate_stats.cd_found_dirty++;
-        } else if (m->xpmapped && m->reference && !object->internal) {
+        } else if (m->vmp_xpmapped && m->vmp_reference && !object->internal) {
                if (hibernate_stats.cd_found_xpmapped < HIBERNATE_XPMAPPED_LIMIT) {
                        if (!preflight)
                                hibernate_stats.cd_found_xpmapped++;
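
hibernate_consider_discard wraps its per-page checks in a do { ... break; } while (FALSE) block so any disqualifying condition can bail out early through one shared exit path. A stand-alone sketch of that shape; the fields and counters are illustrative:

    #include <stdbool.h>
    #include <stdio.h>

    struct page {
        bool precious, busy, cleaning, laundry, dirty;
    };

    struct stats {
        int found_precious, found_busy, found_cleaning, found_laundry, found_dirty;
    };

    /* returns true if the page may be dropped instead of being written to the image */
    static bool consider_discard(const struct page *m, struct stats *st)
    {
        bool discard = false;

        do {
            if (m->precious)  { st->found_precious++; break; }
            if (m->busy)      { st->found_busy++;     break; }
            if (m->cleaning)  { st->found_cleaning++; break; }
            if (m->laundry)   { st->found_laundry++;  break; }
            if (m->dirty)     { st->found_dirty++;    break; }
            discard = true;   /* clean and otherwise uninteresting: safe to drop */
        } while (false);

        return discard;
    }

    int main(void)
    {
        struct stats st = {0};
        struct page clean = {0};
        struct page busy  = { .busy = true };

        printf("clean page discardable: %d\n", consider_discard(&clean, &st));
        printf("busy page discardable:  %d\n", consider_discard(&busy, &st));
        return 0;
    }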
@@ -6680,7 +6859,7 @@ hibernate_discard_page(vm_page_t m)
 {
     vm_object_t        m_object;
 
-    if (m->absent || m->unusual || m->error)
+    if (m->vmp_absent || m->vmp_unusual || m->vmp_error)
        /*
         * If it's unusual in any way, ignore
         */
@@ -6696,16 +6875,16 @@ hibernate_discard_page(vm_page_t m)
        makes sure these locks are uncontended before sleep */
 #endif /* MACH_ASSERT || DEBUG */
 
-    if (m->pmapped == TRUE) 
+    if (m->vmp_pmapped == TRUE) 
     {
        __unused int refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
     }
 
-    if (m->laundry)
+    if (m->vmp_laundry)
         panic("hibernate_discard_page(%p) laundry", m);
-    if (m->private)
+    if (m->vmp_private)
         panic("hibernate_discard_page(%p) private", m);
-    if (m->fictitious)
+    if (m->vmp_fictitious)
         panic("hibernate_discard_page(%p) fictitious", m);
 
     if (VM_PURGABLE_VOLATILE == m_object->purgable)
@@ -6873,16 +7052,16 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
            hibernate_page_bitset(page_list,       TRUE, VM_PAGE_GET_PHYS_PAGE(m));
            hibernate_page_bitset(page_list_wired, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
        }
-       m = m->snext;
+       m = m->vmp_snext;
     }
 
     if (!preflight) for( i = 0; i < real_ncpus; i++ )
     {
        if (cpu_data_ptr[i] && cpu_data_ptr[i]->cpu_processor)
        {
-       for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = m->snext)
+       for (m = PROCESSOR_DATA(cpu_data_ptr[i]->cpu_processor, free_pages); m; m = m->vmp_snext)
            {
-               assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
+               assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOCAL_Q);
 
                pages--;
                count_wire--;
@@ -6900,9 +7079,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
        vm_page_queue_iterate(&vm_page_queue_free[i].qhead,
                              m,
                              vm_page_t,
-                             pageq)
+                             vmp_pageq)
        {
-           assert(m->vm_page_q_state == VM_PAGE_ON_FREE_Q);
+           assert(m->vmp_q_state == VM_PAGE_ON_FREE_Q);
 
            pages--;
            count_wire--;
@@ -6918,9 +7097,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     vm_page_queue_iterate(&vm_lopage_queue_free,
                          m,
                          vm_page_t,
-                         pageq)
+                         vmp_pageq)
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_FREE_LOPAGE_Q);
 
        pages--;
        count_wire--;
@@ -6935,9 +7114,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
     while (m && !vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_THROTTLED_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_THROTTLED_Q);
 
-        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
        discard = FALSE;
         if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
          && hibernate_consider_discard(m, preflight))
@@ -6958,15 +7137,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
     while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
 
-        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
        discard = FALSE;
         if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
          && hibernate_consider_discard(m, preflight))
         {
            if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_inactive++;
@@ -6983,15 +7162,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
     while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
 
-        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
        discard = FALSE;
         if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
          && hibernate_consider_discard(m, preflight))
         {
            if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_cleaned++;
@@ -7008,15 +7187,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
     while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
 
-        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
        discard = FALSE;
         if ((kIOHibernateModeDiscardCleanActive & gIOHibernateMode) 
          && hibernate_consider_discard(m, preflight))
         {
            if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_active++;
@@ -7033,15 +7212,15 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
     while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
 
-        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
        discard = FALSE;
         if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
          && hibernate_consider_discard(m, preflight))
         {
            if (!preflight) hibernate_page_bitset(page_list, TRUE, VM_PAGE_GET_PHYS_PAGE(m));
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_inactive++;
@@ -7061,9 +7240,12 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
        m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
        while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m))
        {
-           assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
+           assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
+           assertf(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q,
+                   "Bad page: %p (0x%x:0x%x) on queue %d has state: %d (Discard: %d, Preflight: %d)",
+                    m, m->vmp_pageq.next, m->vmp_pageq.prev, i, m->vmp_q_state, discard, preflight);
 
-           next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->pageq.next);
+           next = (vm_page_t)VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
            discard = FALSE;
            if ((kIOHibernateModeDiscardCleanInactive & gIOHibernateMode) 
             && hibernate_consider_discard(m, preflight))
@@ -7081,9 +7263,9 @@ hibernate_page_list_setall(hibernate_page_list_t * page_list,
        }
     }
 
-    vm_page_queue_iterate(&compressor_object->memq, m, vm_page_t, listq)
+    vm_page_queue_iterate(&compressor_object->memq, m, vm_page_t, vmp_listq)
     {
-       assert(m->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR);
+       assert(m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR);
 
         count_compressor++;
        count_wire--;
@@ -7196,12 +7378,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
     while (m && !vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
 
-        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
         if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m)))
         {
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_inactive++;
@@ -7215,9 +7397,9 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
        m = (vm_page_t) vm_page_queue_first(&vm_page_queue_speculative[i].age_q);
        while (m && !vm_page_queue_end(&vm_page_queue_speculative[i].age_q, (vm_page_queue_entry_t)m))
        {
-          assert(m->vm_page_q_state == VM_PAGE_ON_SPECULATIVE_Q);
+          assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
 
-           next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next);
+           next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
            if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m)))
            {
                count_discard_speculative++;
@@ -7230,12 +7412,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
     while (m && !vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
 
-        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
         if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m)))
         {
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_inactive++;
@@ -7248,12 +7430,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
     while (m && !vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_ACTIVE_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
 
-        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
         if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m)))
         {
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_active++;
@@ -7265,12 +7447,12 @@ hibernate_page_list_discard(hibernate_page_list_t * page_list)
     m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
     while (m && !vm_page_queue_end(&vm_page_queue_cleaned, (vm_page_queue_entry_t)m))
     {
-       assert(m->vm_page_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
+       assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
 
-        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->pageq.next);
+        next = (vm_page_t) VM_PAGE_UNPACK_PTR(m->vmp_pageq.next);
         if (hibernate_page_bittst(page_list, VM_PAGE_GET_PHYS_PAGE(m)))
         {
-           if (m->dirty)
+           if (m->vmp_dirty)
                count_discard_purgeable++;
            else
                count_discard_cleaned++;
@@ -7411,17 +7593,17 @@ hibernate_hash_insert_page(vm_page_t mem)
 
        m_object = VM_PAGE_OBJECT(mem);
 
-       assert(mem->hashed);
+       assert(mem->vmp_hashed);
        assert(m_object);
-       assert(mem->offset != (vm_object_offset_t) -1);
+       assert(mem->vmp_offset != (vm_object_offset_t) -1);
 
        /*
         *      Insert it into the object_object/offset hash table
         */
-       hash_id = vm_page_hash(m_object, mem->offset);
+       hash_id = vm_page_hash(m_object, mem->vmp_offset);
        bucket = &vm_page_buckets[hash_id];
 
-       mem->next_m = bucket->page_list;
+       mem->vmp_next_m = bucket->page_list;
        bucket->page_list = VM_PAGE_PACK_PTR(mem);
 }
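
hibernate_hash_insert_page re-links a page into its (object, offset) hash bucket by pushing it onto the head of the bucket's chain. A minimal stand-alone version of that bucket insert; the hash function and types are illustrative (the kernel uses vm_page_hash() and packs its chain pointers with VM_PAGE_PACK_PTR):

    #include <stdint.h>
    #include <stdio.h>

    #define NBUCKETS 64

    struct page {
        struct page *next_m;        /* hash-chain link, like vmp_next_m */
        uintptr_t    object;        /* owning object, stands in for vm_object_t */
        uint64_t     offset;        /* offset within the object */
    };

    static struct page *buckets[NBUCKETS];

    /* illustrative hash over (object, offset) */
    static unsigned page_hash(uintptr_t object, uint64_t offset)
    {
        return (unsigned)((object ^ (offset >> 12)) % NBUCKETS);
    }

    static void hash_insert_page(struct page *mem)
    {
        unsigned h = page_hash(mem->object, mem->offset);

        mem->next_m = buckets[h];   /* push on the head of the chain */
        buckets[h]  = mem;
    }

    int main(void)
    {
        struct page p = { .next_m = 0, .object = 0x1000, .offset = 0x4000 };
        hash_insert_page(&p);
        printf("inserted into bucket %u\n", page_hash(p.object, p.offset));
        return 0;
    }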
 
@@ -7437,20 +7619,20 @@ hibernate_free_range(int sindx, int eindx)
 
                vm_page_init(mem, hibernate_lookup_paddr(sindx), FALSE);
 
-               mem->lopage = FALSE;
-               mem->vm_page_q_state = VM_PAGE_ON_FREE_Q;
+               mem->vmp_lopage = FALSE;
+               mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
 
                color = VM_PAGE_GET_COLOR(mem);
 #if defined(__x86_64__)
                vm_page_queue_enter_clump(&vm_page_queue_free[color].qhead,
                                          mem,
                                          vm_page_t,
-                                         pageq);
+                                         vmp_pageq);
 #else
                vm_page_queue_enter(&vm_page_queue_free[color].qhead,
                                          mem,
                                          vm_page_t,
-                                         pageq);
+                                         vmp_pageq);
 #endif
                vm_page_free_count++;
 
@@ -7488,18 +7670,18 @@ hibernate_rebuild_vm_structs(void)
          * Without this, random data in these vm_pages[] can trip the buddy search
         */
        for (i = hibernate_teardown_last_valid_compact_indx+1; i < eindx; ++i) 
-               vm_pages[i].vm_page_q_state = VM_PAGE_NOT_ON_Q;
+               vm_pages[i].vmp_q_state = VM_PAGE_NOT_ON_Q;
 
        for (cindx = hibernate_teardown_last_valid_compact_indx; cindx >= 0; cindx--) {
                
                mem = &vm_pages[cindx];
-               assert(mem->vm_page_q_state != VM_PAGE_ON_FREE_Q);
+               assert(mem->vmp_q_state != VM_PAGE_ON_FREE_Q);
                /*
                 * hibernate_teardown_vm_structs leaves the location where
                 * this vm_page_t must be located in "next".
                 */
-               tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m));
-               mem->next_m = VM_PAGE_PACK_PTR(NULL);
+               tmem = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
+               mem->vmp_next_m = VM_PAGE_PACK_PTR(NULL);
 
                sindx = (int)(tmem - &vm_pages[0]);
 
@@ -7511,7 +7693,7 @@ hibernate_rebuild_vm_structs(void)
                        *tmem = *mem;
                        mem = tmem;
                }
-               if (mem->hashed)
+               if (mem->vmp_hashed)
                        hibernate_hash_insert_page(mem);
                /*
                 * the 'hole' between this vm_page_t and the previous
@@ -7533,9 +7715,9 @@ hibernate_rebuild_vm_structs(void)
         * vm_page_t's that were created on the fly (i.e. fictitious)
         */
        for (mem = hibernate_rebuild_hash_list; mem; mem = mem_next) {
-               mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m));
+               mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
 
-               mem->next_m = 0;
+               mem->vmp_next_m = 0;
                hibernate_hash_insert_page(mem);
        }
        hibernate_rebuild_hash_list = NULL;
@@ -7583,12 +7765,12 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l
                bucket = &vm_page_buckets[i];
 
                for (mem = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list)); mem != VM_PAGE_NULL; mem = mem_next) {
-                       assert(mem->hashed);
+                       assert(mem->vmp_hashed);
 
-                       mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->next_m));
+                       mem_next = (vm_page_t)(VM_PAGE_UNPACK_PTR(mem->vmp_next_m));
 
                        if (mem < &vm_pages[0] || mem >= &vm_pages[vm_pages_count]) {
-                               mem->next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
+                               mem->vmp_next_m = VM_PAGE_PACK_PTR(hibernate_rebuild_hash_list);
                                hibernate_rebuild_hash_list = mem;
                        }
                }
@@ -7604,18 +7786,18 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l
 
                mem = &vm_pages[i];
 
-               if (mem->vm_page_q_state == VM_PAGE_ON_FREE_Q) {
+               if (mem->vmp_q_state == VM_PAGE_ON_FREE_Q) {
                        unsigned int color;
 
-                       assert(mem->busy);
-                       assert(!mem->lopage);
+                       assert(mem->vmp_busy);
+                       assert(!mem->vmp_lopage);
 
                        color = VM_PAGE_GET_COLOR(mem);
 
                        vm_page_queue_remove(&vm_page_queue_free[color].qhead,
                                             mem,
                                             vm_page_t,
-                                            pageq);
+                                            vmp_pageq);
 
                        VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
 
@@ -7623,7 +7805,7 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l
 
                        hibernate_teardown_found_free_pages++;
 
-                       if (vm_pages[compact_target_indx].vm_page_q_state != VM_PAGE_ON_FREE_Q)
+                       if (vm_pages[compact_target_indx].vmp_q_state != VM_PAGE_ON_FREE_Q)
                                compact_target_indx = i;
                } else {
                        /*
@@ -7632,15 +7814,15 @@ hibernate_teardown_vm_structs(hibernate_page_list_t *page_list, hibernate_page_l
                         * as an indicator to the rebuild function that
                         * we don't have to move it
                         */
-                       mem->next_m = VM_PAGE_PACK_PTR(mem);
+                       mem->vmp_next_m = VM_PAGE_PACK_PTR(mem);
 
-                       if (vm_pages[compact_target_indx].vm_page_q_state == VM_PAGE_ON_FREE_Q) {
+                       if (vm_pages[compact_target_indx].vmp_q_state == VM_PAGE_ON_FREE_Q) {
                                /*
                                 * we've got a hole to fill, so
                                 * move this vm_page_t to its new home
                                 */
                                vm_pages[compact_target_indx] = *mem;
-                               mem->vm_page_q_state = VM_PAGE_ON_FREE_Q;
+                               mem->vmp_q_state = VM_PAGE_ON_FREE_Q;
 
                                hibernate_teardown_last_valid_compact_indx = compact_target_indx;
                                compact_target_indx++;
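
The teardown/rebuild pair above compacts vm_pages[] before hibernation: free slots are skipped, and every live entry is either left in place (its link marked as "self") or copied down into the lowest free hole, with its original location recorded so the rebuild pass can walk the compacted prefix from the top and put everything back. A small stand-alone model of that scheme using array indices instead of packed pointers; all names and the 8-entry layout are illustrative:

    #include <assert.h>
    #include <stdio.h>

    #define N 8
    #define HOME_NONE (-1)

    struct rec {
        int free;   /* slot holds no live data (like VM_PAGE_ON_FREE_Q)    */
        int home;   /* original index this entry must return to on rebuild */
        int data;
    };

    static struct rec recs[N];
    static int last_valid_compact_indx = -1;

    /* compact live entries toward the front, remembering where they came from */
    static void teardown_compact(void)
    {
        int compact_target = 0;

        for (int i = 0; i < N; i++) {
            if (recs[i].free) {
                if (!recs[compact_target].free)
                    compact_target = i;          /* first hole available for filling */
                continue;
            }
            recs[i].home = i;                    /* "self" means: did not move */
            if (recs[compact_target].free) {
                recs[compact_target] = recs[i];  /* fill the hole...                */
                recs[i].free = 1;                /* ...and the old slot becomes one */
                last_valid_compact_indx = compact_target;
                compact_target++;
            } else {
                last_valid_compact_indx = i;
            }
        }
    }

    /* walk the compacted prefix from the top and put every entry back home */
    static void rebuild_expand(void)
    {
        for (int c = last_valid_compact_indx; c >= 0; c--) {
            int home = recs[c].home;
            recs[c].home = HOME_NONE;
            if (home != c) {
                recs[home] = recs[c];
                recs[c].free = 1;
            }
        }
    }

    int main(void)
    {
        for (int i = 0; i < N; i++) {
            recs[i].free = (i % 3 == 0);         /* make slots 0, 3, 6 free */
            recs[i].home = HOME_NONE;
            recs[i].data = i;
        }
        teardown_compact();
        rebuild_expand();
        for (int i = 0; i < N; i++)
            if (!recs[i].free)
                assert(recs[i].data == i);       /* every live entry is back in place */
        printf("rebuild restored all live entries\n");
        return 0;
    }

Because entries only ever move toward lower indices, processing the compacted prefix in descending order never overwrites a record that has not been put back yet, which is why the rebuild can run in place.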
@@ -7706,7 +7888,7 @@ vm_page_info(
 
                for (m = (vm_page_t)(VM_PAGE_UNPACK_PTR(bucket->page_list));
                     m != VM_PAGE_NULL;
-                    m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->next_m)))
+                    m = (vm_page_t)(VM_PAGE_UNPACK_PTR(m->vmp_next_m)))
                        bucket_count++;
 
                lck_spin_unlock(bucket_lock);
@@ -7773,22 +7955,22 @@ vm_page_buckets_check(void)
                while (p != VM_PAGE_NULL) {
                        p_object = VM_PAGE_OBJECT(p);
 
-                       if (!p->hashed) {
+                       if (!p->vmp_hashed) {
                                panic("BUCKET_CHECK: page %p (%p,0x%llx) "
                                      "hash %d in bucket %d at %p "
                                      "is not hashed\n",
-                                     p, p_object, p->offset,
+                                     p, p_object, p->vmp_offset,
                                      p_hash, i, bucket);
                        }
-                       p_hash = vm_page_hash(p_object, p->offset);
+                       p_hash = vm_page_hash(p_object, p->vmp_offset);
                        if (p_hash != i) {
                                panic("BUCKET_CHECK: corruption in bucket %d "
                                      "at %p: page %p object %p offset 0x%llx "
                                      "hash %d\n",
-                                     i, bucket, p, p_object, p->offset,
+                                     i, bucket, p, p_object, p->vmp_offset,
                                      p_hash);
                        }
-                       p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->next_m));
+                       p = (vm_page_t)(VM_PAGE_UNPACK_PTR(p->vmp_next_m));
                }
                lck_spin_unlock(bucket_lock);
        }
@@ -7828,44 +8010,44 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
 
-       if (mem->vm_page_q_state == VM_PAGE_NOT_ON_Q)
+       if (mem->vmp_q_state == VM_PAGE_NOT_ON_Q)
        {
-               assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
+               assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
                if (remove_from_backgroundq == TRUE) {
                        vm_page_remove_from_backgroundq(mem);
                }
-               if (mem->vm_page_on_backgroundq) {
-                       assert(mem->vm_page_backgroundq.next != 0);
-                       assert(mem->vm_page_backgroundq.prev != 0);
+               if (mem->vmp_on_backgroundq) {
+                       assert(mem->vmp_backgroundq.next != 0);
+                       assert(mem->vmp_backgroundq.prev != 0);
                } else {
-                       assert(mem->vm_page_backgroundq.next == 0);
-                       assert(mem->vm_page_backgroundq.prev == 0);
+                       assert(mem->vmp_backgroundq.next == 0);
+                       assert(mem->vmp_backgroundq.prev == 0);
                }
 #endif /* CONFIG_BACKGROUND_QUEUE */
                return;
        }
 
-       if (mem->vm_page_q_state == VM_PAGE_USED_BY_COMPRESSOR)
+       if (mem->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR)
        {
-               assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
+               assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-               assert(mem->vm_page_backgroundq.next == 0 &&
-                      mem->vm_page_backgroundq.prev == 0 &&
-                      mem->vm_page_on_backgroundq == FALSE);
+               assert(mem->vmp_backgroundq.next == 0 &&
+                      mem->vmp_backgroundq.prev == 0 &&
+                      mem->vmp_on_backgroundq == FALSE);
 #endif
                return;
        }
-       if (mem->vm_page_q_state == VM_PAGE_IS_WIRED) {
+       if (mem->vmp_q_state == VM_PAGE_IS_WIRED) {
                /*
                 * might put these guys on a list for debugging purposes
                 * if we do, we'll need to remove this assert
                 */
-               assert(mem->pageq.next == 0 && mem->pageq.prev == 0);
+               assert(mem->vmp_pageq.next == 0 && mem->vmp_pageq.prev == 0);
 #if CONFIG_BACKGROUND_QUEUE
-               assert(mem->vm_page_backgroundq.next == 0 &&
-                      mem->vm_page_backgroundq.prev == 0 &&
-                      mem->vm_page_on_backgroundq == FALSE);
+               assert(mem->vmp_backgroundq.next == 0 &&
+                      mem->vmp_backgroundq.prev == 0 &&
+                      mem->vmp_on_backgroundq == FALSE);
 #endif
                return;
        }
@@ -7873,19 +8055,19 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        assert(m_object != compressor_object);
        assert(m_object != kernel_object);
        assert(m_object != vm_submap_object);
-       assert(!mem->fictitious);
+       assert(!mem->vmp_fictitious);
 
-       switch(mem->vm_page_q_state) {
+       switch(mem->vmp_q_state) {
                
        case VM_PAGE_ON_ACTIVE_LOCAL_Q:
        {
                struct vpl      *lq;
 
-               lq = &vm_page_local_q[mem->local_id].vpl_un.vpl;
+               lq = &vm_page_local_q[mem->vmp_local_id].vpl_un.vpl;
                VPL_LOCK(&lq->vpl_lock);
                vm_page_queue_remove(&lq->vpl_queue,
-                                    mem, vm_page_t, pageq);
-               mem->local_id = 0;
+                                    mem, vm_page_t, vmp_pageq);
+               mem->vmp_local_id = 0;
                lq->vpl_count--;
                if (m_object->internal) {
                        lq->vpl_internal_count--;
@@ -7899,7 +8081,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        case VM_PAGE_ON_ACTIVE_Q:
        {
                vm_page_queue_remove(&vm_page_queue_active,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_page_active_count--;
                break;
        }
@@ -7910,9 +8092,11 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
 
                vm_page_inactive_count--;
                vm_page_queue_remove(&vm_page_queue_anonymous,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_page_anonymous_count--;
+
                vm_purgeable_q_advance_all();
+               vm_page_balance_inactive(3);
                break;
        }
 
@@ -7922,8 +8106,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
 
                vm_page_inactive_count--;
                vm_page_queue_remove(&vm_page_queue_inactive,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_purgeable_q_advance_all();
+               vm_page_balance_inactive(3);
                break;
        }
 
@@ -7933,8 +8118,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
 
                vm_page_inactive_count--;
                vm_page_queue_remove(&vm_page_queue_cleaned,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_page_cleaned_count--;
+               vm_page_balance_inactive(3);
                break;
        }
 
@@ -7943,7 +8129,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
                assert(m_object->internal == TRUE);
 
                vm_page_queue_remove(&vm_page_queue_throttled,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_page_throttled_count--;
                was_pageable = FALSE;
                break;
@@ -7953,8 +8139,9 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        {
                assert(m_object->internal == FALSE);
 
-                vm_page_remque(&mem->pageq);
+                vm_page_remque(&mem->vmp_pageq);
                vm_page_speculative_count--;
+               vm_page_balance_inactive(3);
                break;
        }
 
@@ -7962,7 +8149,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        case VM_PAGE_ON_SECLUDED_Q:
        {
                vm_page_queue_remove(&vm_page_queue_secluded,
-                                    mem, vm_page_t, pageq);
+                                    mem, vm_page_t, vmp_pageq);
                vm_page_secluded_count--;
                if (m_object == VM_OBJECT_NULL) {
                        vm_page_secluded_count_free--;
@@ -7980,7 +8167,7 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
        default:
        {
                /*
-                *      if (mem->vm_page_q_state == VM_PAGE_ON_PAGEOUT_Q)
+                *      if (mem->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q)
                 *              NOTE: vm_page_queues_remove does not deal with removing pages from the pageout queue...
                 *              the caller is responsible for determing if the page is on that queue, and if so, must
                 *              the caller is responsible for determining if the page is on that queue, and if so, must
@@ -7989,13 +8176,13 @@ vm_page_queues_remove(vm_page_t mem, boolean_t __unused remove_from_backgroundq)
                 *      we also don't expect to encounter VM_PAGE_ON_FREE_Q, VM_PAGE_ON_FREE_LOCAL_Q, VM_PAGE_ON_FREE_LOPAGE_Q
                 *      or any of the undefined states
                 */
-               panic("vm_page_queues_remove - bad page q_state (%p, %d)\n", mem, mem->vm_page_q_state);
+               panic("vm_page_queues_remove - bad page q_state (%p, %d)\n", mem, mem->vmp_q_state);
                break;
        }
 
        }
        VM_PAGE_ZERO_PAGEQ_ENTRY(mem);
-       mem->vm_page_q_state = VM_PAGE_NOT_ON_Q;
+       mem->vmp_q_state = VM_PAGE_NOT_ON_Q;
 
 #if CONFIG_BACKGROUND_QUEUE
        if (remove_from_backgroundq == TRUE)
@@ -8017,9 +8204,9 @@ vm_page_remove_internal(vm_page_t page)
        if (page == __object->memq_hint) {
                vm_page_t       __new_hint;
                vm_page_queue_entry_t   __qe;
-               __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->listq);
+               __qe = (vm_page_queue_entry_t)vm_page_queue_next(&page->vmp_listq);
                if (vm_page_queue_end(&__object->memq, __qe)) {
-                       __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->listq);
+                       __qe = (vm_page_queue_entry_t)vm_page_queue_prev(&page->vmp_listq);
                        if (vm_page_queue_end(&__object->memq, __qe)) {
                                __qe = NULL;
                        }
@@ -8027,7 +8214,7 @@ vm_page_remove_internal(vm_page_t page)
                __new_hint = (vm_page_t)((uintptr_t) __qe);
                __object->memq_hint = __new_hint;
        }
-       vm_page_queue_remove(&__object->memq, page, vm_page_t, listq);
+       vm_page_queue_remove(&__object->memq, page, vm_page_t, vmp_listq);
 #if CONFIG_SECLUDED_MEMORY
        if (__object->eligible_for_secluded) {
                vm_page_secluded.eligible_for_secluded--;
@@ -8043,28 +8230,28 @@ vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
        m_object = VM_PAGE_OBJECT(mem);
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       assert(!mem->fictitious);
-       assert(!mem->laundry);
-       assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+       assert(!mem->vmp_fictitious);
+       assert(!mem->vmp_laundry);
+       assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
        vm_page_check_pageable_safe(mem);
 
        if (m_object->internal) {
-               mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
+               mem->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
 
                if (first == TRUE)
-                       vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, pageq);
+                       vm_page_queue_enter_first(&vm_page_queue_anonymous, mem, vm_page_t, vmp_pageq);
                else
-                       vm_page_queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, pageq);
+                       vm_page_queue_enter(&vm_page_queue_anonymous, mem, vm_page_t, vmp_pageq);
 
                vm_page_anonymous_count++;
                vm_page_pageable_internal_count++;
        } else {
-               mem->vm_page_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
+               mem->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
 
                if (first == TRUE)
-                       vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, pageq);
+                       vm_page_queue_enter_first(&vm_page_queue_inactive, mem, vm_page_t, vmp_pageq);
                else
-                       vm_page_queue_enter(&vm_page_queue_inactive, mem, vm_page_t, pageq);
+                       vm_page_queue_enter(&vm_page_queue_inactive, mem, vm_page_t, vmp_pageq);
 
                vm_page_pageable_external_count++;
        }
@@ -8072,7 +8259,7 @@ vm_page_enqueue_inactive(vm_page_t mem, boolean_t first)
        token_new_pagecount++;
 
 #if CONFIG_BACKGROUND_QUEUE
-       if (mem->vm_page_in_background)
+       if (mem->vmp_in_background)
                vm_page_add_to_backgroundq(mem, FALSE);
 #endif
 }
@@ -8085,16 +8272,16 @@ vm_page_enqueue_active(vm_page_t mem, boolean_t first)
        m_object = VM_PAGE_OBJECT(mem);
 
        LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
-       assert(!mem->fictitious);
-       assert(!mem->laundry);
-       assert(mem->vm_page_q_state == VM_PAGE_NOT_ON_Q);
+       assert(!mem->vmp_fictitious);
+       assert(!mem->vmp_laundry);
+       assert(mem->vmp_q_state == VM_PAGE_NOT_ON_Q);
        vm_page_check_pageable_safe(mem);
 
-       mem->vm_page_q_state = VM_PAGE_ON_ACTIVE_Q;
+       mem->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
        if (first == TRUE)
-               vm_page_queue_enter_first(&vm_page_queue_active, mem, vm_page_t, pageq);
+               vm_page_queue_enter_first(&vm_page_queue_active, mem, vm_page_t, vmp_pageq);
        else
-               vm_page_queue_enter(&vm_page_queue_active, mem, vm_page_t, pageq);
+               vm_page_queue_enter(&vm_page_queue_active, mem, vm_page_t, vmp_pageq);
        vm_page_active_count++;
 
        if (m_object->internal) {
@@ -8104,9 +8291,10 @@ vm_page_enqueue_active(vm_page_t mem, boolean_t first)
        }
 
 #if CONFIG_BACKGROUND_QUEUE
-       if (mem->vm_page_in_background)
+       if (mem->vmp_in_background)
                vm_page_add_to_backgroundq(mem, FALSE);
 #endif
+       vm_page_balance_inactive(3);
 }
 
 /*
@@ -8189,7 +8377,8 @@ vm_tag_bt(void)
        retaddr = *(frameptr + 1);
 
 
-       if ((retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top))
+       if (((retaddr < vm_kernel_builtinkmod_text_end) && (retaddr >= vm_kernel_builtinkmod_text))
+               || (retaddr < vm_kernel_stext) || (retaddr > vm_kernel_top))
        {
                site = OSKextGetAllocationSiteForCaller(retaddr);
                break;
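
The hunk above widens vm_tag_bt's test so a return address that falls inside the built-in kext text range, or outside the kernel's own text, gets attributed via OSKextGetAllocationSiteForCaller. A tiny stand-alone helper showing the same kind of range classification; the bounds below are illustrative constants, not the kernel's symbols:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* illustrative text ranges; the kernel reads these from its own globals */
    static const uintptr_t kernel_stext         = 0x100000;
    static const uintptr_t kernel_top           = 0x200000;
    static const uintptr_t builtinkmod_text     = 0x180000;
    static const uintptr_t builtinkmod_text_end = 0x190000;

    /* true when the caller should be attributed to a kext rather than the core kernel */
    static bool caller_is_kext(uintptr_t retaddr)
    {
        if (retaddr >= builtinkmod_text && retaddr < builtinkmod_text_end)
            return true;                                         /* built-in kext text */
        return retaddr < kernel_stext || retaddr > kernel_top;   /* outside kernel text */
    }

    int main(void)
    {
        printf("%d\n", caller_is_kext(0x185000));   /* 1: built-in kext range */
        printf("%d\n", caller_is_kext(0x150000));   /* 0: core kernel text */
        printf("%d\n", caller_is_kext(0x300000));   /* 1: outside kernel text */
        return 0;
    }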
@@ -8544,6 +8733,7 @@ kern_allocation_name_get_vm_tag(kern_allocation_name_t allocation)
     return (vm_tag_alloc(allocation));
 }
 
+#if ! VM_TAG_ACTIVE_UPDATE
 static void 
 vm_page_count_object(mach_memory_info_t * info, unsigned int __unused num_info, vm_object_t object)
 {
@@ -8577,46 +8767,19 @@ static void
 vm_page_iterate_objects(mach_memory_info_t * info, unsigned int num_info,
                        vm_page_iterate_proc proc)
 {
-    purgeable_q_t   volatile_q;
-    queue_head_t  * nonvolatile_q;
     vm_object_t     object;
-    int             group;
 
     lck_spin_lock(&vm_objects_wired_lock);
     queue_iterate(&vm_objects_wired,
                  object,
                  vm_object_t,
-                 objq)
+                 wired_objq)
     {
                proc(info, num_info, object);
     }
     lck_spin_unlock(&vm_objects_wired_lock);
-
-    lck_mtx_lock(&vm_purgeable_queue_lock);
-    nonvolatile_q = &purgeable_nonvolatile_queue;
-    for (object = (vm_object_t) queue_first(nonvolatile_q);
-                !queue_end(nonvolatile_q, (queue_entry_t) object);
-                object = (vm_object_t) queue_next(&object->objq))
-    {
-               proc(info, num_info, object);
-    }
-
-    volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_OBSOLETE];
-    vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, 0);
-
-    volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_FIFO];
-    for (group = 0; group < NUM_VOLATILE_GROUPS; group++)
-    {
-               vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, group);
-    }
-
-    volatile_q = &purgeable_queues[PURGEABLE_Q_TYPE_LIFO];
-    for (group = 0; group < NUM_VOLATILE_GROUPS; group++)
-    {
-               vm_page_iterate_purgeable_objects(info, num_info, proc, volatile_q, group);
-    }
-    lck_mtx_unlock(&vm_purgeable_queue_lock);
 }
+#endif /* ! VM_TAG_ACTIVE_UPDATE */
 
 static uint64_t
 process_account(mach_memory_info_t * info, unsigned int num_info, uint64_t zones_collectable_bytes, boolean_t iterated)
@@ -8854,7 +9017,9 @@ vm_page_diagnose(mach_memory_info_t * info, unsigned int num_info, uint64_t zone
                vm_page_t                    page;
                int                          stackIdx, count;
 
+#if ! VM_TAG_ACTIVE_UPDATE
            vm_page_iterate_objects(info, num_info, &vm_page_count_object);
+#endif /* ! VM_TAG_ACTIVE_UPDATE */
 
            map = kernel_map;
            stackIdx = 0;
@@ -8973,3 +9138,60 @@ vm_tag_get_kext(vm_tag_t tag, char * name, vm_size_t namelen)
 
     return (kmodId);
 }
+
+
+#if CONFIG_SECLUDED_MEMORY
+/*
+ * Note that there's no locking around other accesses to vm_page_secluded_target.
+ * That should be OK, since these are the only places where it can be changed after
+ * initialization. Other users (like vm_pageout) may see the wrong value briefly,
+ * but will eventually get the correct value. This brief mismatch is OK as pageout
+ * and page freeing will auto-adjust the vm_page_secluded_count to match the target
+ * over time.
+ */
+unsigned int vm_page_secluded_suppress_cnt = 0;
+unsigned int vm_page_secluded_save_target;
+
+
+lck_grp_attr_t secluded_suppress_slock_grp_attr;
+lck_grp_t      secluded_suppress_slock_grp;
+lck_attr_t     secluded_suppress_slock_attr;
+lck_spin_t     secluded_suppress_slock;
+
+void
+secluded_suppression_init(void)
+{
+       lck_grp_attr_setdefault(&secluded_suppress_slock_grp_attr);
+       lck_grp_init(&secluded_suppress_slock_grp,
+           "secluded_suppress_slock",  &secluded_suppress_slock_grp_attr);
+       lck_attr_setdefault(&secluded_suppress_slock_attr);
+       lck_spin_init(&secluded_suppress_slock,
+           &secluded_suppress_slock_grp, &secluded_suppress_slock_attr);
+}
+
+void
+start_secluded_suppression(task_t task)
+{
+       if (task->task_suppressed_secluded)
+               return;
+       lck_spin_lock(&secluded_suppress_slock);
+       if (!task->task_suppressed_secluded && vm_page_secluded_suppress_cnt++ == 0) {
+               task->task_suppressed_secluded = TRUE;
+               vm_page_secluded_save_target = vm_page_secluded_target;
+               vm_page_secluded_target = 0;
+       }
+       lck_spin_unlock(&secluded_suppress_slock);
+}
+
+void
+stop_secluded_suppression(task_t task)
+{
+       lck_spin_lock(&secluded_suppress_slock);
+       if (task->task_suppressed_secluded && --vm_page_secluded_suppress_cnt == 0) {
+               task->task_suppressed_secluded = FALSE;
+               vm_page_secluded_target = vm_page_secluded_save_target;
+       }
+       lck_spin_unlock(&secluded_suppress_slock);
+}
+
+#endif /* CONFIG_SECLUDED_MEMORY */
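
start_secluded_suppression/stop_secluded_suppression above implement a counted override: the first suppressor saves the current target and forces it to zero, the last one restores it, all under a spin lock (with the per-task flag ensuring each task counts only once). A user-space analogue of that save/restore refcount, with a pthread mutex standing in for the kernel spin lock and illustrative names:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t suppress_lock = PTHREAD_MUTEX_INITIALIZER;
    static unsigned suppress_cnt  = 0;
    static unsigned target        = 1000;   /* the value being suppressed */
    static unsigned saved_target  = 0;

    /* first caller saves the target and forces it to zero */
    static void start_suppression(void)
    {
        pthread_mutex_lock(&suppress_lock);
        if (suppress_cnt++ == 0) {
            saved_target = target;
            target = 0;
        }
        pthread_mutex_unlock(&suppress_lock);
    }

    /* last caller restores the saved target */
    static void stop_suppression(void)
    {
        pthread_mutex_lock(&suppress_lock);
        if (--suppress_cnt == 0)
            target = saved_target;
        pthread_mutex_unlock(&suppress_lock);
    }

    int main(void)
    {
        start_suppression();
        start_suppression();
        printf("during: %u\n", target);   /* 0 */
        stop_suppression();
        stop_suppression();
        printf("after:  %u\n", target);   /* 1000 */
        return 0;
    }

Readers of target outside the lock may briefly see a stale value, which is exactly the benign race the comment above describes.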
index f7018253bff9f8b7a5b1c67ee74657a0de2bd5a2..e94960ddc8914c3c6bbe7f88e56b565df2179d67 100644 (file)
 #include <machine/commpage.h>
 #include <machine/cpu_capabilities.h>
 
+#if defined (__arm__) || defined(__arm64__)
+#include <arm/cpu_data_internal.h>
+#endif
+
+/*
+ * the following codes are used in the subclass
+ * of the DBG_MACH_SHAREDREGION class
+ */
+#define        PROCESS_SHARED_CACHE_LAYOUT 0x00
+
+
 /* "dyld" uses this to figure out what the kernel supports */
 int shared_region_version = 3;
 
@@ -124,6 +135,8 @@ int shared_region_persistence = 0;  /* no by default */
 /* delay before reclaiming an unused shared region */
 int shared_region_destroy_delay = 120; /* in seconds */
 
+struct vm_shared_region *init_task_shared_region = NULL;
+
 #ifndef CONFIG_EMBEDDED
 /* 
  * Only one cache gets to slide on Desktop, since we can't
@@ -152,11 +165,20 @@ static void vm_shared_region_reference_locked(vm_shared_region_t shared_region);
 static vm_shared_region_t vm_shared_region_create(
        void                    *root_dir,
        cpu_type_t              cputype,
+       cpu_subtype_t           cpu_subtype,
        boolean_t               is_64bit);
 static void vm_shared_region_destroy(vm_shared_region_t shared_region);
 
 static void vm_shared_region_timeout(thread_call_param_t param0,
                                     thread_call_param_t param1);
+kern_return_t vm_shared_region_slide_mapping(
+       vm_shared_region_t sr,
+       mach_vm_size_t slide_info_size,
+       mach_vm_offset_t start,
+       mach_vm_size_t size,
+       mach_vm_offset_t slid_mapping,
+       uint32_t slide,
+       memory_object_control_t); /* forward */
 
 static int __commpage_setup = 0;
 #if defined(__i386__) || defined(__x86_64__)
@@ -289,6 +311,30 @@ vm_shared_region_mem_entry(
        return shared_region->sr_mem_entry;
 }
 
+vm_map_t
+vm_shared_region_vm_map(
+       vm_shared_region_t      shared_region)
+{
+       ipc_port_t              sr_handle;
+       vm_named_entry_t        sr_mem_entry;
+       vm_map_t                sr_map;
+
+       SHARED_REGION_TRACE_DEBUG(
+               ("shared_region: -> vm_map(%p)\n",
+                (void *)VM_KERNEL_ADDRPERM(shared_region)));
+       assert(shared_region->sr_ref_count > 1);
+
+       sr_handle = shared_region->sr_mem_entry;
+       sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject;
+       sr_map = sr_mem_entry->backing.map;
+       assert(sr_mem_entry->is_sub_map);
+
+       SHARED_REGION_TRACE_DEBUG(
+               ("shared_region: vm_map(%p) <- %p\n",
+                (void *)VM_KERNEL_ADDRPERM(shared_region),
+                (void *)VM_KERNEL_ADDRPERM(sr_map)));
+       return sr_map;
+}
 uint32_t
 vm_shared_region_get_slide(
        vm_shared_region_t      shared_region)
@@ -379,15 +425,17 @@ vm_shared_region_t
 vm_shared_region_lookup(
        void            *root_dir,
        cpu_type_t      cputype,
+       cpu_subtype_t   cpu_subtype,
        boolean_t       is_64bit)
 {
        vm_shared_region_t      shared_region;
        vm_shared_region_t      new_shared_region;
 
        SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: -> lookup(root=%p,cpu=%d,64bit=%d)\n",
+               ("shared_region: -> lookup(root=%p,cpu=<%d,%d>,64bit=%d)\n",
 
-                (void *)VM_KERNEL_ADDRPERM(root_dir), cputype, is_64bit));
+                (void *)VM_KERNEL_ADDRPERM(root_dir),
+                cputype, cpu_subtype, is_64bit));
 
        shared_region = NULL;
        new_shared_region = NULL;
@@ -400,6 +448,7 @@ vm_shared_region_lookup(
                              sr_q) {
                        assert(shared_region->sr_ref_count > 0);
                        if (shared_region->sr_cpu_type == cputype &&
+                           shared_region->sr_cpu_subtype == cpu_subtype &&
                            shared_region->sr_root_dir == root_dir &&
                            shared_region->sr_64bit == is_64bit) {
                                /* found a match ! */
@@ -412,6 +461,7 @@ vm_shared_region_lookup(
                        vm_shared_region_unlock();
                        new_shared_region = vm_shared_region_create(root_dir,
                                                                    cputype,
+                                                                   cpu_subtype,
                                                                    is_64bit);
                        /* do the lookup again, in case we lost a race */
                        vm_shared_region_lock();
@@ -442,9 +492,9 @@ done:
        }
 
        SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: lookup(root=%p,cpu=%d,64bit=%d) <- %p\n",
+               ("shared_region: lookup(root=%p,cpu=<%d,%d>,64bit=%d) <- %p\n",
                 (void *)VM_KERNEL_ADDRPERM(root_dir),
-                cputype, is_64bit,
+                cputype, cpu_subtype, is_64bit,
                 (void *)VM_KERNEL_ADDRPERM(shared_region)));
 
        assert(shared_region->sr_ref_count > 0);
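
vm_shared_region_lookup drops the lock to create a new region, then re-runs the lookup in case another thread created a matching region while the lock was released. A compact stand-alone sketch of that lookup / unlock / create / re-lookup pattern with a mutex-protected list; the types and key fields are illustrative:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct region {
        struct region *next;
        int            cputype;
        int            cpu_subtype;
        int            refcnt;
    };

    static pthread_mutex_t region_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct region  *region_list = NULL;

    /* caller must hold region_lock */
    static struct region *lookup_locked(int cputype, int cpu_subtype)
    {
        for (struct region *r = region_list; r != NULL; r = r->next)
            if (r->cputype == cputype && r->cpu_subtype == cpu_subtype)
                return r;
        return NULL;
    }

    static struct region *region_lookup(int cputype, int cpu_subtype)
    {
        struct region *r, *newr = NULL;

        pthread_mutex_lock(&region_lock);
        for (;;) {
            r = lookup_locked(cputype, cpu_subtype);
            if (r != NULL)
                break;
            if (newr != NULL) {
                /* nobody raced us: publish the region we built */
                newr->next  = region_list;
                region_list = newr;
                r = newr;
                newr = NULL;
                break;
            }
            /* drop the lock for the (possibly slow) creation step */
            pthread_mutex_unlock(&region_lock);
            newr = calloc(1, sizeof(*newr));
            newr->cputype = cputype;
            newr->cpu_subtype = cpu_subtype;
            pthread_mutex_lock(&region_lock);
            /* loop: re-run the lookup in case we lost a race */
        }
        r->refcnt++;
        pthread_mutex_unlock(&region_lock);

        if (newr != NULL)
            free(newr);   /* we lost the race; discard the unused region */
        return r;
    }

    int main(void)
    {
        struct region *a = region_lookup(7, 2);
        struct region *b = region_lookup(7, 2);
        printf("same region: %d, refcnt: %d\n", a == b, a->refcnt);
        return 0;
    }

Keying the match on (root_dir, cputype, cpu_subtype, is_64bit), as the real lookup now does, is what this change is about: adding cpu_subtype to the key so sub-architectures get their own shared region.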
@@ -612,6 +662,7 @@ static vm_shared_region_t
 vm_shared_region_create(
        void                    *root_dir,
        cpu_type_t              cputype,
+       cpu_subtype_t           cpu_subtype,
        boolean_t               is_64bit)
 {
        kern_return_t           kr;
@@ -623,9 +674,10 @@ vm_shared_region_create(
        mach_vm_offset_t        base_address, pmap_nesting_start;
        mach_vm_size_t          size, pmap_nesting_size;
 
-       SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: -> create(root=%p,cpu=%d,64bit=%d)\n",
-                (void *)VM_KERNEL_ADDRPERM(root_dir), cputype, is_64bit));
+       SHARED_REGION_TRACE_INFO(
+               ("shared_region: -> create(root=%p,cpu=<%d,%d>,64bit=%d)\n",
+                (void *)VM_KERNEL_ADDRPERM(root_dir),
+                cputype, cpu_subtype, is_64bit));
 
        base_address = 0;
        size = 0;
@@ -776,6 +828,7 @@ vm_shared_region_create(
        shared_region->sr_pmap_nesting_start = pmap_nesting_start;
        shared_region->sr_pmap_nesting_size = pmap_nesting_size;
        shared_region->sr_cpu_type = cputype;
+       shared_region->sr_cpu_subtype = cpu_subtype;
        shared_region->sr_64bit = is_64bit;
        shared_region->sr_root_dir = root_dir;
 
@@ -799,17 +852,20 @@ vm_shared_region_create(
        si->slide_info_size = 0;
        si->slide_info_entry = NULL;
 
-       /* Initialize UUID */
+       /* Initialize UUID and other metadata */
        memset(&shared_region->sr_uuid, '\0', sizeof(shared_region->sr_uuid));
        shared_region->sr_uuid_copied = FALSE;
+       shared_region->sr_images_count = 0;
+       shared_region->sr_images = NULL;
 done:
        if (shared_region) {
                SHARED_REGION_TRACE_INFO(
-                       ("shared_region: create(root=%p,cpu=%d,64bit=%d,"
+                       ("shared_region: create(root=%p,cpu=<%d,%d>,64bit=%d,"
                         "base=0x%llx,size=0x%llx) <- "
                         "%p mem=(%p,%p) map=%p pmap=%p\n",
                         (void *)VM_KERNEL_ADDRPERM(root_dir),
-                        cputype, is_64bit, (long long)base_address,
+                        cputype, cpu_subtype, is_64bit,
+                        (long long)base_address,
                         (long long)size,
                         (void *)VM_KERNEL_ADDRPERM(shared_region),
                         (void *)VM_KERNEL_ADDRPERM(mem_entry_port),
@@ -818,10 +874,11 @@ done:
                         (void *)VM_KERNEL_ADDRPERM(sub_map->pmap)));
        } else {
                SHARED_REGION_TRACE_INFO(
-                       ("shared_region: create(root=%p,cpu=%d,64bit=%d,"
+                       ("shared_region: create(root=%p,cpu=<%d,%d>,64bit=%d,"
                         "base=0x%llx,size=0x%llx) <- NULL",
                         (void *)VM_KERNEL_ADDRPERM(root_dir),
-                        cputype, is_64bit, (long long)base_address,
+                        cputype, cpu_subtype, is_64bit,
+                        (long long)base_address,
                         (long long)size));
        }
        return shared_region;
@@ -839,10 +896,11 @@ vm_shared_region_destroy(
        vm_map_t                map;
 
        SHARED_REGION_TRACE_INFO(
-               ("shared_region: -> destroy(%p) (root=%p,cpu=%d,64bit=%d)\n",
+               ("shared_region: -> destroy(%p) (root=%p,cpu=<%d,%d>,64bit=%d)\n",
                 (void *)VM_KERNEL_ADDRPERM(shared_region),
                 (void *)VM_KERNEL_ADDRPERM(shared_region->sr_root_dir),
                 shared_region->sr_cpu_type,
+                shared_region->sr_cpu_subtype,
                 shared_region->sr_64bit));
 
        assert(shared_region->sr_ref_count == 0);
@@ -1091,9 +1149,12 @@ vm_shared_region_map_file(
        vm_object_size_t        obj_size;
        struct shared_file_mapping_np   *mapping_to_slide = NULL;
        mach_vm_offset_t        first_mapping = (mach_vm_offset_t) -1;
+       mach_vm_offset_t        slid_mapping = (mach_vm_offset_t) -1;
        vm_map_offset_t         lowest_unnestable_addr = 0;
        vm_map_kernel_flags_t   vmk_flags;
-
+       mach_vm_offset_t        sfm_min_address = ~0;
+       mach_vm_offset_t        sfm_max_address = 0;
+       struct _dyld_cache_header sr_cache_header;
 
 #if __arm64__
        if ((shared_region->sr_64bit ||
@@ -1170,6 +1231,14 @@ vm_shared_region_map_file(
                         mappings[i].sfm_max_prot,
                         mappings[i].sfm_init_prot));
 
+               if (mappings[i].sfm_address < sfm_min_address) {
+                       sfm_min_address = mappings[i].sfm_address;
+               }
+
+               if ((mappings[i].sfm_address + mappings[i].sfm_size) > sfm_max_address) {
+                       sfm_max_address = mappings[i].sfm_address + mappings[i].sfm_size;
+               }
+
                if (mappings[i].sfm_init_prot & VM_PROT_ZF) {
                        /* zero-filled memory */
                        map_port = MACH_PORT_NULL;
@@ -1268,6 +1337,11 @@ vm_shared_region_map_file(
                                first_mapping = target_address;
                        }
 
+                       if ((slid_mapping == (mach_vm_offset_t) -1) &&
+                               (mapping_to_slide == &mappings[i])) {
+                               slid_mapping = target_address;
+                       }
+
                        /*
                         * Record the lowest writable address in this
                         * sub map, to log any unexpected unnesting below
@@ -1343,6 +1417,7 @@ vm_shared_region_map_file(
                                            mapping_to_slide->sfm_size, 
                                            slide_start, 
                                            slide_size, 
+                                           slid_mapping,
                                            file_control);
                if (kr  != KERN_SUCCESS) {
                        SHARED_REGION_TRACE_ERROR(
@@ -1375,35 +1450,89 @@ vm_shared_region_map_file(
        vm_shared_region_lock();
        assert(shared_region->sr_ref_count > 1);
        assert(shared_region->sr_mapping_in_progress);
+
        /* set "sr_first_mapping"; dyld uses it to validate the shared cache */ 
        if (kr == KERN_SUCCESS &&
            shared_region->sr_first_mapping == (mach_vm_offset_t) -1) {
                shared_region->sr_first_mapping = first_mapping;
        }
 
-
-       /* copy in the shared region UUID to the shared region structure */
+       /*
+        * copy in the shared region UUID to the shared region structure.
+        * we do this indirectly by first copying in the shared cache header
+        * and then copying the UUID from there because we'll need to look
+        * at other content from the shared cache header.
+        */
        if (kr == KERN_SUCCESS && !shared_region->sr_uuid_copied) {
-                int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping +
-                                        offsetof(struct _dyld_cache_header, uuid)),
-                                (char *)&shared_region->sr_uuid,
-                                sizeof(shared_region->sr_uuid));
-                if (error == 0) {
+               int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping),
+                                (char *)&sr_cache_header,
+                                sizeof(sr_cache_header));
+               if (error == 0) {
+                       memcpy(&shared_region->sr_uuid, &sr_cache_header.uuid, sizeof(shared_region->sr_uuid));
                        shared_region->sr_uuid_copied = TRUE;
-                } else {
+               } else {
 #if DEVELOPMENT || DEBUG
-                       panic("shared_region: copyin_UUID(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx "
-                               "offset:0x%016llx size:0x%016llx) failed with %d\n",
+                       panic("shared_region: copyin shared_cache_header(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx "
+                               "offset:0 size:0x%016llx) failed with %d\n",
                                 (long long)shared_region->sr_base_address,
                                 (long long)shared_region->sr_first_mapping,
-                                (long long)offsetof(struct _dyld_cache_header, uuid),
-                                (long long)sizeof(shared_region->sr_uuid),
+                                (long long)sizeof(sr_cache_header),
                                 error);
 #endif /* DEVELOPMENT || DEBUG */
                        shared_region->sr_uuid_copied = FALSE;
                 }
        }
 
+       /*
+        * If the shared cache is associated with the init task (and is therefore the system shared cache),
+        * check whether it is a custom built shared cache and copy in the shared cache layout accordingly.
+        */
+       boolean_t is_init_task = (task_pid(current_task()) == 1);
+       if (shared_region->sr_uuid_copied && is_init_task) {
+               /* Copy in the shared cache layout if we're running with a locally built shared cache */
+               if (sr_cache_header.locallyBuiltCache) {
+                       KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_START);
+                       size_t image_array_length = (sr_cache_header.imagesTextCount * sizeof(struct _dyld_cache_image_text_info));
+                       struct _dyld_cache_image_text_info *sr_image_layout = kalloc(image_array_length);
+                       int error = copyin((shared_region->sr_base_address + shared_region->sr_first_mapping +
+                                       sr_cache_header.imagesTextOffset), (char *)sr_image_layout, image_array_length);
+                       if (error == 0) {
+                               shared_region->sr_images = kalloc(sr_cache_header.imagesTextCount * sizeof(struct dyld_uuid_info_64));
+                               for (size_t index = 0; index < sr_cache_header.imagesTextCount; index++) {
+                                       memcpy((char *)&shared_region->sr_images[index].imageUUID, (char *)&sr_image_layout[index].uuid,
+                                                       sizeof(shared_region->sr_images[index].imageUUID));
+                                       shared_region->sr_images[index].imageLoadAddress = sr_image_layout[index].loadAddress;
+                               }
+
+                               assert(sr_cache_header.imagesTextCount < UINT32_MAX);
+                               shared_region->sr_images_count = (uint32_t) sr_cache_header.imagesTextCount;
+                       } else {
+#if DEVELOPMENT || DEBUG
+                               panic("shared_region: copyin shared_cache_layout(sr_base_addr:0x%016llx sr_first_mapping:0x%016llx "
+                                       "offset:0x%016llx size:0x%016llx) failed with %d\n",
+                                        (long long)shared_region->sr_base_address,
+                                        (long long)shared_region->sr_first_mapping,
+                                        (long long)sr_cache_header.imagesTextOffset,
+                                        (long long)image_array_length,
+                                        error);
+#endif /* DEVELOPMENT || DEBUG */
+                       }
+                       KDBG((MACHDBG_CODE(DBG_MACH_SHAREDREGION, PROCESS_SHARED_CACHE_LAYOUT)) | DBG_FUNC_END, shared_region->sr_images_count);
+                       kfree(sr_image_layout, image_array_length);
+                       sr_image_layout = NULL;
+               }
+               init_task_shared_region = shared_region;
+       }
+
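[Editor's note] The image list captured above (sr_images / sr_images_count) is what later consumers use to attribute shared-cache addresses to individual dylibs. A hypothetical reader of that array might look like the sketch below; the function name and output are illustrative, only the sr_images fields come from this change.

	/* Hypothetical consumer of the captured shared-cache layout (illustrative only). */
	static void
	log_shared_cache_images(vm_shared_region_t sr)
	{
		for (uint32_t i = 0; i < sr->sr_images_count; i++) {
			struct dyld_uuid_info_64 *info = &sr->sr_images[i];
			/* each entry pairs an image UUID with its unslid load address */
			printf("shared cache image %u loads at 0x%llx\n",
			       i, (unsigned long long)info->imageLoadAddress);
		}
	}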
+       if (kr == KERN_SUCCESS) {
+               /*
+                * If we succeeded, we know the bounds of the shared region.
+                * Trim our pmaps to only cover this range (if applicable to
+                * this platform).
+                */
+               pmap_trim(current_map()->pmap, sr_map->pmap, sfm_min_address, sfm_min_address, sfm_max_address - sfm_min_address);
+       }
+
        /* we're done working on that shared region */
        shared_region->sr_mapping_in_progress = FALSE;
        thread_wakeup((event_t) &shared_region->sr_mapping_in_progress);
@@ -1418,6 +1547,38 @@ done:
        return kr;
 }
 
+/*
+ * Retrieve a task's shared region and grab an extra reference to
+ * make sure it doesn't disappear while the caller is using it.
+ * The caller is responsible for consuming that extra reference if
+ * necessary.
+ *
+ * This also tries to trim the pmap for the shared region.
+ */
+vm_shared_region_t
+vm_shared_region_trim_and_get(task_t task)
+{
+       vm_shared_region_t shared_region;
+       ipc_port_t sr_handle;
+       vm_named_entry_t sr_mem_entry;
+       vm_map_t sr_map;
+
+       /* Get the shared region and the map. */
+       shared_region = vm_shared_region_get(task);
+       if (shared_region == NULL) {
+               return NULL;
+       }
+
+       sr_handle = shared_region->sr_mem_entry;
+       sr_mem_entry = (vm_named_entry_t) sr_handle->ip_kobject;
+       sr_map = sr_mem_entry->backing.map;
+
+       /* Trim the pmap if possible. */
+       pmap_trim(task->map->pmap, sr_map->pmap, 0, 0, 0);
+
+       return shared_region;
+}
+
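[Editor's note] A minimal usage sketch for the new helper, assuming a hypothetical in-kernel caller (the function name below is illustrative): the extra reference returned by vm_shared_region_trim_and_get() must be dropped with vm_shared_region_deallocate() once the caller is done with the region.

	/* Hypothetical caller of vm_shared_region_trim_and_get() (illustrative only). */
	static void
	inspect_task_shared_region(task_t task)
	{
		vm_shared_region_t sr;

		sr = vm_shared_region_trim_and_get(task);
		if (sr == NULL) {
			return;         /* task has no shared region */
		}

		/* ... examine the region (UUID, image list, base address, ...) ... */

		/* consume the extra reference taken on our behalf */
		vm_shared_region_deallocate(sr);
	}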
 /*
  * Enter the appropriate shared region into "map" for "task".
  * This involves looking up the shared region (and possibly creating a new
@@ -1430,7 +1591,8 @@ vm_shared_region_enter(
        struct task             *task,
        boolean_t               is_64bit,
        void                    *fsroot,
-       cpu_type_t              cpu)
+       cpu_type_t              cpu,
+       cpu_subtype_t           cpu_subtype)
 {
        kern_return_t           kr;
        vm_shared_region_t      shared_region;
@@ -1443,29 +1605,28 @@ vm_shared_region_enter(
 
        SHARED_REGION_TRACE_DEBUG(
                ("shared_region: -> "
-                "enter(map=%p,task=%p,root=%p,cpu=%d,64bit=%d)\n",
+                "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d)\n",
                 (void *)VM_KERNEL_ADDRPERM(map),
                 (void *)VM_KERNEL_ADDRPERM(task),
-                (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit));
+                (void *)VM_KERNEL_ADDRPERM(fsroot),
+                cpu, cpu_subtype, is_64bit));
 
        /* lookup (create if needed) the shared region for this environment */
-       shared_region = vm_shared_region_lookup(fsroot, cpu, is_64bit);
+       shared_region = vm_shared_region_lookup(fsroot, cpu, cpu_subtype, is_64bit);
        if (shared_region == NULL) {
                /* this should not happen ! */
                SHARED_REGION_TRACE_ERROR(
                        ("shared_region: -> "
-                        "enter(map=%p,task=%p,root=%p,cpu=%d,64bit=%d): "
+                        "enter(map=%p,task=%p,root=%p,cpu=<%d,%d>,64bit=%d): "
                         "lookup failed !\n",
                         (void *)VM_KERNEL_ADDRPERM(map),
                         (void *)VM_KERNEL_ADDRPERM(task),
-                        (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit));
+                        (void *)VM_KERNEL_ADDRPERM(fsroot),
+                        cpu, cpu_subtype, is_64bit));
                //panic("shared_region_enter: lookup failed\n");
                return KERN_FAILURE;
        }
        
-       /* let the task use that shared region */
-       vm_shared_region_set(task, shared_region);
-
        kr = KERN_SUCCESS;
        /* no need to lock since this data is never modified */
        sr_address = shared_region->sr_base_address;
@@ -1511,23 +1672,24 @@ vm_shared_region_enter(
                        VM_INHERIT_SHARE);
                if (kr != KERN_SUCCESS) {
                        SHARED_REGION_TRACE_ERROR(
-                               ("shared_region: enter(%p,%p,%p,%d,%d): "
+                               ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                                 "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                                 (void *)VM_KERNEL_ADDRPERM(map),
                                 (void *)VM_KERNEL_ADDRPERM(task),
                                 (void *)VM_KERNEL_ADDRPERM(fsroot),
-                                cpu, is_64bit,
+                                cpu, cpu_subtype, is_64bit,
                                 (long long)target_address,
                                 (long long)mapping_size,
                                 (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
                        goto done;
                }
                SHARED_REGION_TRACE_DEBUG(
-                       ("shared_region: enter(%p,%p,%p,%d,%d): "
+                       ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                         "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                         (void *)VM_KERNEL_ADDRPERM(map),
                         (void *)VM_KERNEL_ADDRPERM(task),
-                        (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit,
+                        (void *)VM_KERNEL_ADDRPERM(fsroot),
+                        cpu, cpu_subtype, is_64bit,
                         (long long)target_address, (long long)mapping_size,
                         (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
                sr_offset += mapping_size;
@@ -1564,23 +1726,24 @@ vm_shared_region_enter(
                        VM_INHERIT_SHARE);
                if (kr != KERN_SUCCESS) {
                        SHARED_REGION_TRACE_ERROR(
-                               ("shared_region: enter(%p,%p,%p,%d,%d): "
+                               ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                                 "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                                 (void *)VM_KERNEL_ADDRPERM(map),
                                 (void *)VM_KERNEL_ADDRPERM(task),
                                 (void *)VM_KERNEL_ADDRPERM(fsroot),
-                                cpu, is_64bit,
+                                cpu, cpu_subtype, is_64bit,
                                 (long long)target_address,
                                 (long long)mapping_size,
                                 (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
                        goto done;
                }
                SHARED_REGION_TRACE_DEBUG(
-                       ("shared_region: enter(%p,%p,%p,%d,%d): "
+                       ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                         "nested vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                         (void *)VM_KERNEL_ADDRPERM(map),
                         (void *)VM_KERNEL_ADDRPERM(task),
-                        (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit,
+                        (void *)VM_KERNEL_ADDRPERM(fsroot),
+                        cpu, cpu_subtype, is_64bit,
                         (long long)target_address, (long long)mapping_size,
                         (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
        }
@@ -1604,23 +1767,24 @@ vm_shared_region_enter(
                        VM_INHERIT_SHARE);
                if (kr != KERN_SUCCESS) {
                        SHARED_REGION_TRACE_ERROR(
-                               ("shared_region: enter(%p,%p,%p,%d,%d): "
+                               ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                                 "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                                 (void *)VM_KERNEL_ADDRPERM(map),
                                 (void *)VM_KERNEL_ADDRPERM(task),
                                 (void *)VM_KERNEL_ADDRPERM(fsroot),
-                                cpu, is_64bit,
+                                cpu, cpu_subtype, is_64bit,
                                 (long long)target_address,
                                 (long long)mapping_size,
                                 (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
                        goto done;
                }
                SHARED_REGION_TRACE_DEBUG(
-                       ("shared_region: enter(%p,%p,%p,%d,%d): "
+                       ("shared_region: enter(%p,%p,%p,%d,%d,%d): "
                         "vm_map_enter(0x%llx,0x%llx,%p) error 0x%x\n",
                         (void *)VM_KERNEL_ADDRPERM(map),
                         (void *)VM_KERNEL_ADDRPERM(task),
-                        (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit,
+                        (void *)VM_KERNEL_ADDRPERM(fsroot),
+                        cpu, cpu_subtype, is_64bit,
                         (long long)target_address, (long long)mapping_size,
                         (void *)VM_KERNEL_ADDRPERM(sr_handle), kr));
                sr_offset += mapping_size;
@@ -1629,11 +1793,21 @@ vm_shared_region_enter(
        assert(sr_size == 0);
 
 done:
+       if (kr == KERN_SUCCESS) {
+               /* let the task use that shared region */
+               vm_shared_region_set(task, shared_region);
+       } else {
+               /* drop our reference since we're not using it */
+               vm_shared_region_deallocate(shared_region);
+               vm_shared_region_set(task, NULL);
+       }
+
        SHARED_REGION_TRACE_DEBUG(
-               ("shared_region: enter(%p,%p,%p,%d,%d) <- 0x%x\n",
+               ("shared_region: enter(%p,%p,%p,%d,%d,%d) <- 0x%x\n",
                 (void *)VM_KERNEL_ADDRPERM(map),
                 (void *)VM_KERNEL_ADDRPERM(task),
-                (void *)VM_KERNEL_ADDRPERM(fsroot), cpu, is_64bit, kr));
+                (void *)VM_KERNEL_ADDRPERM(fsroot),
+                cpu, cpu_subtype, is_64bit, kr));
        return kr;
 }
 
@@ -1672,32 +1846,45 @@ vm_shared_region_sliding_valid(uint32_t slide)
 }
 
 kern_return_t
-vm_shared_region_slide_init(
-               vm_shared_region_t sr,
-               mach_vm_size_t  slide_info_size,
-               mach_vm_offset_t start,
-               mach_vm_size_t size,
-               uint32_t slide,
-               memory_object_control_t sr_file_control)
+vm_shared_region_slide_mapping(
+       vm_shared_region_t      sr,
+       mach_vm_size_t          slide_info_size,
+       mach_vm_offset_t        start,
+       mach_vm_size_t          size,
+       mach_vm_offset_t        slid_mapping,
+       uint32_t                slide,
+       memory_object_control_t sr_file_control)
 {
-       kern_return_t kr = KERN_SUCCESS;
-       vm_object_t object = VM_OBJECT_NULL;
-       vm_object_offset_t offset = 0;
-       vm_shared_region_slide_info_t si = vm_shared_region_get_slide_info(sr);
-       vm_offset_t slide_info_entry;
-       
-       vm_map_t map = NULL, cur_map = NULL;
-       boolean_t       is_map_locked = FALSE;
+       kern_return_t           kr;
+       vm_object_t             object;
+       vm_shared_region_slide_info_t si;
+       vm_offset_t             slide_info_entry;
+       vm_map_entry_t          slid_entry, tmp_entry;
+       struct vm_map_entry     tmp_entry_store;
+       memory_object_t         sr_pager;
+       vm_map_t                sr_map;
+       int                     vm_flags;
+       vm_map_kernel_flags_t   vmk_flags;
+       vm_map_offset_t         map_addr;
+
+       tmp_entry = VM_MAP_ENTRY_NULL;
+       sr_pager = MEMORY_OBJECT_NULL;
+       object = VM_OBJECT_NULL;
+       slide_info_entry = 0;
 
        assert(sr->sr_slide_in_progress);
        assert(!sr->sr_slid);
-       assert(si->slide_object == NULL);
+
+       si = vm_shared_region_get_slide_info(sr);
+       assert(si->slide_object == VM_OBJECT_NULL);
        assert(si->slide_info_entry == NULL);
 
+       if (sr_file_control == MEMORY_OBJECT_CONTROL_NULL) {
+               return KERN_INVALID_ARGUMENT;
+       }
        if (slide_info_size > SANE_SLIDE_INFO_SIZE) {
                printf("Slide_info_size too large: %lx\n", (uintptr_t)slide_info_size);
-               kr = KERN_FAILURE;
-               return kr;
+               return KERN_FAILURE;
        }
 
        kr = kmem_alloc(kernel_map,
@@ -1707,93 +1894,117 @@ vm_shared_region_slide_init(
                return kr;
        }
 
-       if (sr_file_control != MEMORY_OBJECT_CONTROL_NULL) {
+       object = memory_object_control_to_vm_object(sr_file_control);
+       if (object == VM_OBJECT_NULL || object->internal) {
+               object = VM_OBJECT_NULL;
+               kr = KERN_INVALID_ADDRESS;
+               goto done;
+       }
 
-               object = memory_object_control_to_vm_object(sr_file_control);
-               vm_object_reference(object);
-               offset = start;
+       vm_object_lock(object);
+       vm_object_reference_locked(object);     /* for si->slide_object */
+       object->object_is_shared_cache = TRUE;
+       vm_object_unlock(object);
 
-               vm_object_lock(object);
-       } else {
-               /*
-                * Remove this entire "else" block and all "map" references
-                * once we get rid of the shared_region_slide_np()
-                * system call. 
-                */ 
-               vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
-               map = current_map();
-               vm_map_lock_read(map);
-               is_map_locked = TRUE;
-       Retry:
-               cur_map = map;
-               if(!vm_map_lookup_entry(map, start, &entry)) {
-                       kr = KERN_INVALID_ARGUMENT;
-               } else {
-                       vm_object_t shadow_obj = VM_OBJECT_NULL;
-        
-                       if (entry->is_sub_map == TRUE) { 
-                               map = VME_SUBMAP(entry);
-                               start -= entry->vme_start;
-                               start += VME_OFFSET(entry);
-                               vm_map_lock_read(map);
-                               vm_map_unlock_read(cur_map);
-                               goto Retry;
-                       } else {
-                               object = VME_OBJECT(entry);
-                               offset = ((start - entry->vme_start) +
-                                         VME_OFFSET(entry));
-                       }
-        
-                       vm_object_lock(object);
-                       while (object->shadow != VM_OBJECT_NULL) {
-                               shadow_obj = object->shadow;
-                               vm_object_lock(shadow_obj);
-                               vm_object_unlock(object);
-                               object = shadow_obj;            
-                       }
-               }
+       si->slide_info_entry = (vm_shared_region_slide_info_entry_t)slide_info_entry;
+       si->slide_info_size = slide_info_size;
+
+       assert(slid_mapping != (mach_vm_offset_t) -1);
+       si->slid_address = slid_mapping + sr->sr_base_address;
+       si->slide_object = object;
+       si->start = start;
+       si->end = si->start + size;
+       si->slide = slide;
+
+       /* find the shared region's map entry to slide */
+       sr_map = vm_shared_region_vm_map(sr);
+       vm_map_lock_read(sr_map);
+       if (!vm_map_lookup_entry(sr_map,
+                                slid_mapping,
+                                &slid_entry)) {
+               /* no mapping there */
+               vm_map_unlock(sr_map);
+               kr = KERN_INVALID_ARGUMENT;
+               goto done;
+       }
+       /*
+        * We might want to clip the entry to cover only the portion that
+        * needs sliding (offsets si->start to si->end in the shared cache
+        * file at the bottom of the shadow chain).
+        * In practice, it seems to cover the entire DATA segment...
+        */
+       tmp_entry_store = *slid_entry;
+       tmp_entry = &tmp_entry_store;
+       slid_entry = VM_MAP_ENTRY_NULL;
+       /* extra ref to keep object alive while map is unlocked */
+       vm_object_reference(VME_OBJECT(tmp_entry));
+       vm_map_unlock_read(sr_map);
+
+       /* create a "shared_region" sliding pager */
+       sr_pager = shared_region_pager_setup(VME_OBJECT(tmp_entry),
+                                            VME_OFFSET(tmp_entry),
+                                            si);
+       if (sr_pager == NULL) {
+               kr = KERN_RESOURCE_SHORTAGE;
+               goto done;
        }
-               
-       if (object->internal == TRUE) {
-               kr = KERN_INVALID_ADDRESS;
-       } else if (object->object_slid) {
-               /* Can only be slid once */
-               printf("%s: found vm_object %p already slid?\n", __FUNCTION__, object);
-               kr = KERN_FAILURE;
-       } else {
 
-               si->slide_info_entry = (vm_shared_region_slide_info_entry_t)slide_info_entry;
-               si->slide_info_size = slide_info_size;
-               si->slide_object = object;
-               si->start = offset;
-               si->end = si->start + size;     
-               si->slide = slide;
+       /* map that pager over the portion of the mapping that needs sliding */
+       vm_flags = VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE;
+       vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
+       vmk_flags.vmkf_overwrite_immutable = TRUE;
+       map_addr = tmp_entry->vme_start;
+       kr = vm_map_enter_mem_object(sr_map,
+                                    &map_addr,
+                                    (tmp_entry->vme_end -
+                                     tmp_entry->vme_start),
+                                    (mach_vm_offset_t) 0,
+                                    vm_flags,
+                                    vmk_flags,
+                                    VM_KERN_MEMORY_NONE,
+                                    (ipc_port_t)(uintptr_t) sr_pager,
+                                    0,
+                                    TRUE,
+                                    tmp_entry->protection,
+                                    tmp_entry->max_protection,
+                                    tmp_entry->inheritance);
+       assertf(kr == KERN_SUCCESS, "kr = 0x%x\n", kr);
+       assertf(map_addr == tmp_entry->vme_start,
+               "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
+               (uint64_t)map_addr,
+               (uint64_t) tmp_entry->vme_start,
+               tmp_entry);
+
+       /* success! */
+       kr = KERN_SUCCESS;
 
+done:
+       if (sr_pager) {
                /*
-                * If we want to have this region get deallocated/freed
-                * then we will have to make sure that we msync(..MS_INVALIDATE..)
-                * the pages associated with this shared region. Those pages would
-                * have been slid with an older slide value.
-                */
-
-               /* 
-                * Pointers in object are held without references; they
-                * are disconnected at the time that we destroy the
-                * shared region, and since the shared region holds 
-                * a reference on the object, no references in the other
-                * direction are required.
+                * Release the sr_pager reference obtained by
+                * shared_region_pager_setup().
+                * The mapping (if it succeeded) is now holding a reference on
+                * the memory object.
                 */
-               object->object_slid = TRUE;
-               object->vo_slide_info = si;
+               memory_object_deallocate(sr_pager);
+               sr_pager = MEMORY_OBJECT_NULL;
        }
-
-       vm_object_unlock(object);
-       if (is_map_locked == TRUE) {
-               vm_map_unlock_read(map);
+       if (tmp_entry) {
+               /* release extra ref on tmp_entry's VM object */
+               vm_object_deallocate(VME_OBJECT(tmp_entry));
+               tmp_entry = VM_MAP_ENTRY_NULL;
        }
 
        if (kr != KERN_SUCCESS) {
-               kmem_free(kernel_map, slide_info_entry, slide_info_size);
+               /* cleanup */
+               if (slide_info_entry) {
+                       kmem_free(kernel_map, slide_info_entry, slide_info_size);
+                       slide_info_entry = 0;
+               }
+               if (si->slide_object) {
+                       vm_object_deallocate(si->slide_object);
+                       si->slide_object = VM_OBJECT_NULL;
+               }
        }
        return kr;
 }
@@ -1858,6 +2069,66 @@ vm_shared_region_slide_sanity_check_v2(vm_shared_region_slide_info_entry_v2_t s_
        return KERN_SUCCESS;
 }
 
+static kern_return_t
+vm_shared_region_slide_sanity_check_v3(vm_shared_region_slide_info_entry_v3_t s_info, mach_vm_size_t slide_info_size)
+{
+       if (s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE) {
+               printf("vm_shared_region_slide_sanity_check_v3: s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE 0x%llx != 0x%llx\n", (uint64_t)s_info->page_size, (uint64_t)PAGE_SIZE_FOR_SR_SLIDE);
+               return KERN_FAILURE;
+       }
+
+       uint32_t page_starts_count = s_info->page_starts_count;
+       mach_vm_size_t num_trailing_entries = page_starts_count;
+       mach_vm_size_t trailing_size = num_trailing_entries << 1;
+       mach_vm_size_t required_size = sizeof(*s_info) + trailing_size;
+       if (required_size < sizeof(*s_info)) {
+               printf("vm_shared_region_slide_sanity_check_v3: required_size < sizeof(*s_info) 0x%llx < 0x%llx\n", (uint64_t)required_size, (uint64_t)sizeof(*s_info));
+               return KERN_FAILURE;
+       }
+
+       if (required_size > slide_info_size) {
+               printf("vm_shared_region_slide_sanity_check_v3: required_size > slide_info_size 0x%llx > 0x%llx\n", (uint64_t)required_size, (uint64_t)slide_info_size);
+               return KERN_FAILURE;
+       }
+
+       return KERN_SUCCESS;
+}
+
+static kern_return_t
+vm_shared_region_slide_sanity_check_v4(vm_shared_region_slide_info_entry_v4_t s_info, mach_vm_size_t slide_info_size)
+{
+    if (s_info->page_size != PAGE_SIZE_FOR_SR_SLIDE) {
+        return KERN_FAILURE;
+    }
+
+    /* Ensure that the slide info doesn't reference any data outside of its bounds. */
+
+    uint32_t page_starts_count = s_info->page_starts_count;
+    uint32_t page_extras_count = s_info->page_extras_count;
+    mach_vm_size_t num_trailing_entries = page_starts_count + page_extras_count;
+    if (num_trailing_entries < page_starts_count) {
+        return KERN_FAILURE;
+    }
+
+    /* Scale by sizeof(uint16_t). Hard-coding the size simplifies the overflow check. */
+    mach_vm_size_t trailing_size = num_trailing_entries << 1;
+    if (trailing_size >> 1 != num_trailing_entries) {
+        return KERN_FAILURE;
+    }
+
+    mach_vm_size_t required_size = sizeof(*s_info) + trailing_size;
+    if (required_size < sizeof(*s_info)) {
+        return KERN_FAILURE;
+    }
+
+    if (required_size > slide_info_size) {
+        return KERN_FAILURE;
+    }
+
+    return KERN_SUCCESS;
+}
+
+
 kern_return_t
 vm_shared_region_slide_sanity_check(vm_shared_region_t sr)
 {
@@ -1880,6 +2151,10 @@ vm_shared_region_slide_sanity_check(vm_shared_region_t sr)
                kr = vm_shared_region_slide_sanity_check_v1(&s_info->v1);
        } else if (s_info->version == 2) {
                kr = vm_shared_region_slide_sanity_check_v2(&s_info->v2, si->slide_info_size);
+       } else if (s_info->version == 3) {
+               kr = vm_shared_region_slide_sanity_check_v3(&s_info->v3, si->slide_info_size);
+       } else if (s_info->version == 4) {
+               kr = vm_shared_region_slide_sanity_check_v4(&s_info->v4, si->slide_info_size);
        } else {
                goto fail;
        }
@@ -1894,11 +2169,6 @@ fail:
                          (vm_offset_t) si->slide_info_entry,
                          (vm_size_t) si->slide_info_size);
                
-               vm_object_lock(si->slide_object);
-               si->slide_object->object_slid = FALSE;
-               si->slide_object->vo_slide_info = NULL;
-               vm_object_unlock(si->slide_object);
-               
                vm_object_deallocate(si->slide_object);
                si->slide_object        = NULL;
                si->start = 0;
@@ -1918,7 +2188,7 @@ vm_shared_region_slide_page_v1(vm_shared_region_slide_info_t si, vm_offset_t vad
        uint32_t i=0, j=0;
        uint8_t b = 0;
        uint32_t slide = si->slide;
-       int is_64 = task_has_64BitAddr(current_task());
+       int is_64 = task_has_64Bit_addr(current_task());
 
        vm_shared_region_slide_info_entry_v1_t s_info = &si->slide_info_entry->v1;
        toc = (uint16_t*)((uintptr_t)s_info + s_info->toc_offset);
@@ -2148,14 +2418,206 @@ vm_shared_region_slide_page_v2(vm_shared_region_slide_info_t si, vm_offset_t vad
        return KERN_SUCCESS;
 }
 
+
+static kern_return_t
+vm_shared_region_slide_page_v3(vm_shared_region_slide_info_t si, vm_offset_t vaddr, __unused mach_vm_offset_t uservaddr, uint32_t pageIndex)
+{
+       vm_shared_region_slide_info_entry_v3_t s_info = &si->slide_info_entry->v3;
+       const uint32_t slide_amount = si->slide;
+
+       uint8_t *page_content = (uint8_t *)vaddr;
+       uint16_t page_entry;
+
+       if (pageIndex >= s_info->page_starts_count) {
+               printf("vm_shared_region_slide_page() did not find page start in slide info: pageIndex=%u, count=%u\n",
+                          pageIndex, s_info->page_starts_count);
+               return KERN_FAILURE;
+       }
+       page_entry = s_info->page_starts[pageIndex];
+
+       if (page_entry == DYLD_CACHE_SLIDE_V3_PAGE_ATTR_NO_REBASE) {
+               return KERN_SUCCESS;
+       }
+
+       uint8_t* rebaseLocation = page_content;
+       uint64_t delta = page_entry;
+       do {
+               rebaseLocation += delta;
+               uint64_t value;
+               memcpy(&value, rebaseLocation, sizeof(value));
+               delta = ( (value & 0x3FF8000000000000) >> 51) * sizeof(uint64_t);
+
+               // A pointer is one of :
+               // {
+               //       uint64_t pointerValue : 51;
+               //       uint64_t offsetToNextPointer : 11;
+               //       uint64_t isBind : 1 = 0;
+               //       uint64_t authenticated : 1 = 0;
+               // }
+               // {
+               //       uint32_t offsetFromSharedCacheBase;
+               //       uint16_t diversityData;
+               //       uint16_t hasAddressDiversity : 1;
+               //       uint16_t hasDKey : 1;
+               //       uint16_t hasBKey : 1;
+               //       uint16_t offsetToNextPointer : 11;
+               //       uint16_t isBind : 1;
+               //       uint16_t authenticated : 1 = 1;
+               // }
+
+               bool isBind = (value & (1ULL << 62)) != 0;
+               if (isBind) {
+                       return KERN_FAILURE;
+               }
+
+               bool isAuthenticated = (value & (1ULL << 63)) != 0;
+
+               if (isAuthenticated) {
+                       // The new value for a rebase is the low 32-bits of the threaded value plus the slide.
+                       value = (value & 0xFFFFFFFF) + slide_amount;
+                       // Add in the offset from the mach_header
+                       const uint64_t value_add = s_info->value_add;
+                       value += value_add;
+
+               } else {
+                       // The new value for a rebase is the low 51-bits of the threaded value plus the slide.
+                       // Regular pointer which needs to fit in 51-bits of value.
+                       // C++ RTTI uses the top bit, so we'll allow the whole top-byte
+                       // and the bottom 43-bits to fit into 51-bits.
+                       uint64_t top8Bits = value & 0x0007F80000000000ULL;
+                       uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL;
+                       uint64_t targetValue = ( top8Bits << 13 ) | bottom43Bits;
+                       value = targetValue + slide_amount;
+               }
+
+               memcpy(rebaseLocation, &value, sizeof(value));
+       } while (delta != 0);
+
+       return KERN_SUCCESS;
+}
+
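[Editor's note] For reference, the rebase arithmetic in the loop above reduces to the following for a plain (non-bind, non-authenticated) pointer. This is an extracted illustration of the same bit manipulation, not part of the kernel change.

	#include <stdint.h>

	/* Illustrative only: the v3 slide of one non-authenticated pointer. */
	static uint64_t
	slide_v3_plain_pointer(uint64_t value, uint32_t slide_amount)
	{
		uint64_t top8Bits     = value & 0x0007F80000000000ULL;  /* top byte of the target */
		uint64_t bottom43Bits = value & 0x000007FFFFFFFFFFULL;  /* low 43 bits of the target */
		return ((top8Bits << 13) | bottom43Bits) + slide_amount;
	}

	/*
	 * The 11-bit offsetToNextPointer field (bits 51..61) is scaled by
	 * sizeof(uint64_t) to find the next entry in the page's rebase chain:
	 *   delta = ((value & 0x3FF8000000000000ULL) >> 51) * sizeof(uint64_t);
	 * a delta of 0 terminates the chain.
	 */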
+static kern_return_t
+rebase_chainv4(
+    uint8_t *page_content,
+    uint16_t start_offset,
+    uint32_t slide_amount,
+    vm_shared_region_slide_info_entry_v4_t s_info)
+{
+    const uint32_t last_page_offset = PAGE_SIZE_FOR_SR_SLIDE - sizeof(uint32_t);
+
+    const uint32_t delta_mask = (uint32_t)(s_info->delta_mask);
+    const uint32_t value_mask = ~delta_mask;
+    const uint32_t value_add = (uint32_t)(s_info->value_add);
+    const uint32_t delta_shift = __builtin_ctzll(delta_mask) - 2;
+
+    uint32_t page_offset = start_offset;
+    uint32_t delta = 1;
+
+    while (delta != 0 && page_offset <= last_page_offset) {
+        uint8_t *loc;
+        uint32_t value;
+
+        loc = page_content + page_offset;
+        memcpy(&value, loc, sizeof(value));
+        delta = (value & delta_mask) >> delta_shift;
+        value &= value_mask;
+
+        if ( (value & 0xFFFF8000) == 0 ) {
+            // small positive non-pointer, use as-is
+        } else if ( (value & 0x3FFF8000) == 0x3FFF8000 ) {
+            // small negative non-pointer
+            value |= 0xC0000000;
+        } else {
+            // pointer that needs rebasing
+            value += value_add;
+            value += slide_amount;
+        }
+        memcpy(loc, &value, sizeof(value));
+        page_offset += delta;
+    }
+
+    /* If the offset went past the end of the page, then the slide data is invalid. */
+    if (page_offset > last_page_offset) {
+        return KERN_FAILURE;
+    }
+    return KERN_SUCCESS;
+}
+
+static kern_return_t
+vm_shared_region_slide_page_v4(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex)
+{
+    vm_shared_region_slide_info_entry_v4_t s_info = &si->slide_info_entry->v4;
+    const uint32_t slide_amount = si->slide;
+
+    const uint16_t *page_starts = (uint16_t *)((uintptr_t)s_info + s_info->page_starts_offset);
+    const uint16_t *page_extras = (uint16_t *)((uintptr_t)s_info + s_info->page_extras_offset);
+
+    uint8_t *page_content = (uint8_t *)vaddr;
+    uint16_t page_entry;
+
+    if (pageIndex >= s_info->page_starts_count) {
+        printf("vm_shared_region_slide_page() did not find page start in slide info: pageIndex=%u, count=%u\n",
+               pageIndex, s_info->page_starts_count);
+        return KERN_FAILURE;
+    }
+    page_entry = page_starts[pageIndex];
+
+    if (page_entry == DYLD_CACHE_SLIDE4_PAGE_NO_REBASE) {
+        return KERN_SUCCESS;
+    }
+
+    if (page_entry & DYLD_CACHE_SLIDE4_PAGE_USE_EXTRA) {
+        uint16_t chain_index = page_entry & DYLD_CACHE_SLIDE4_PAGE_INDEX;
+        uint16_t info;
+
+        do {
+            uint16_t page_start_offset;
+            kern_return_t kr;
+
+            if (chain_index >= s_info->page_extras_count) {
+                printf("vm_shared_region_slide_page() out-of-bounds extras index: index=%u, count=%u\n",
+                       chain_index, s_info->page_extras_count);
+                return KERN_FAILURE;
+            }
+            info = page_extras[chain_index];
+            page_start_offset = (info & DYLD_CACHE_SLIDE4_PAGE_INDEX) << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT;
+
+            kr = rebase_chainv4(page_content, page_start_offset, slide_amount, s_info);
+            if (kr != KERN_SUCCESS) {
+                return KERN_FAILURE;
+            }
+
+            chain_index++;
+        } while (!(info & DYLD_CACHE_SLIDE4_PAGE_EXTRA_END));
+    } else {
+        const uint32_t page_start_offset = page_entry << DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT;
+        kern_return_t kr;
+
+        kr = rebase_chainv4(page_content, page_start_offset, slide_amount, s_info);
+        if (kr != KERN_SUCCESS) {
+            return KERN_FAILURE;
+        }
+    }
+
+    return KERN_SUCCESS;
+}
+
+
+
 kern_return_t
-vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, uint32_t pageIndex)
+vm_shared_region_slide_page(vm_shared_region_slide_info_t si, vm_offset_t vaddr, mach_vm_offset_t uservaddr, uint32_t pageIndex)
 {
        if (si->slide_info_entry->version == 1) {
                return vm_shared_region_slide_page_v1(si, vaddr, pageIndex);
-       } else {
+       } else if (si->slide_info_entry->version == 2) {
                return vm_shared_region_slide_page_v2(si, vaddr, pageIndex);
-       }
+       } else if (si->slide_info_entry->version == 3) {
+               return vm_shared_region_slide_page_v3(si, vaddr, uservaddr, pageIndex);
+       } else if (si->slide_info_entry->version == 4) {
+               return vm_shared_region_slide_page_v4(si, vaddr, pageIndex);
+       } else {
+               return KERN_FAILURE;
+       }
 }
 
 /******************************************************************************/
@@ -2452,6 +2914,7 @@ vm_shared_region_slide(uint32_t slide,
                        mach_vm_size_t          entry_size,
                        mach_vm_offset_t        slide_start,
                        mach_vm_size_t          slide_size,
+                       mach_vm_offset_t        slid_mapping,
                        memory_object_control_t sr_file_control)
 {
        void *slide_info_entry = NULL;
@@ -2496,7 +2959,14 @@ vm_shared_region_slide(uint32_t slide,
        sr->sr_slide_in_progress = TRUE;
        vm_shared_region_unlock();
 
-       if((error = vm_shared_region_slide_init(sr, slide_size, entry_start_address, entry_size, slide, sr_file_control))) {
+       error = vm_shared_region_slide_mapping(sr,
+                                              slide_size,
+                                              entry_start_address,
+                                              entry_size,
+                                              slid_mapping,
+                                              slide,
+                                              sr_file_control);
+       if (error) {
                printf("slide_info initialization failed with kr=%d\n", error);
                goto done;
        }
index 34becaefb98f2dcb60e09dc08bfe8ee914a8a76c..f57b3c891028be72ee552078cd10ade6abff39af 100644 (file)
@@ -57,6 +57,9 @@ extern int shared_region_debug;
 #endif /* DEBUG */
 
 extern int shared_region_trace_level;
+
+extern struct vm_shared_region *init_task_shared_region;
+
 #define SHARED_REGION_TRACE_NONE_LVL           0 /* no trace */
 #define SHARED_REGION_TRACE_ERROR_LVL          1 /* trace abnormal events */
 #define SHARED_REGION_TRACE_INFO_LVL           2 /* trace all events */
@@ -136,15 +139,52 @@ struct vm_shared_region_slide_info_entry_v2 {
 #define DYLD_CACHE_SLIDE_PAGE_VALUE            0x3FFF  // bitwise negation of DYLD_CACHE_SLIDE_PAGE_ATTRS
 #define DYLD_CACHE_SLIDE_PAGE_OFFSET_SHIFT     2
 
+typedef struct vm_shared_region_slide_info_entry_v3 *vm_shared_region_slide_info_entry_v3_t;
+struct vm_shared_region_slide_info_entry_v3
+{
+       uint32_t        version;                        // currently 3
+       uint32_t        page_size;                      // currently 4096 (may also be 16384)
+       uint32_t        page_starts_count;
+       uint64_t        value_add;
+       uint16_t        page_starts[/* page_starts_count */];
+};
+
+#define DYLD_CACHE_SLIDE_V3_PAGE_ATTR_NO_REBASE        0xFFFF  // page has no rebasing
+
+
+typedef struct vm_shared_region_slide_info_entry_v4 *vm_shared_region_slide_info_entry_v4_t;
+struct vm_shared_region_slide_info_entry_v4 {
+    uint32_t    version;            // currently 4
+    uint32_t    page_size;          // currently 4096 (may also be 16384)
+    uint32_t    page_starts_offset;
+    uint32_t    page_starts_count;
+    uint32_t    page_extras_offset;
+    uint32_t    page_extras_count;
+    uint64_t    delta_mask;        // which (contiguous) set of bits contains the delta to the next rebase location (0xC0000000)
+    uint64_t    value_add;         // base address of cache
+    // uint16_t    page_starts[page_starts_count];
+    // uint16_t    page_extras[page_extras_count];
+};
+
+#define DYLD_CACHE_SLIDE4_PAGE_NO_REBASE           0xFFFF  // page has no rebasing
+#define DYLD_CACHE_SLIDE4_PAGE_INDEX               0x7FFF  // index into starts or extras
+#define DYLD_CACHE_SLIDE4_PAGE_USE_EXTRA           0x8000  // index is into extras array (not starts array)
+#define DYLD_CACHE_SLIDE4_PAGE_EXTRA_END           0x8000  // last chain entry for page
+
+
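[Editor's note] As a concrete reading of the delta_mask/value_add comments above (using the 0xC0000000 mask they mention), the sketch below mirrors how rebase_chainv4() in vm_shared_region.c extracts the chain delta from a 32-bit word; it is illustrative only.

	#include <stdint.h>

	/* Illustrative only: delta extraction as done by rebase_chainv4(). */
	static uint32_t
	v4_chain_delta(uint32_t raw_value, uint64_t delta_mask)
	{
		/* the delta lives in the masked bits, scaled by sizeof(uint32_t) */
		uint32_t delta_shift = __builtin_ctzll(delta_mask) - 2;
		return (uint32_t)((raw_value & delta_mask) >> delta_shift);
	}

	/*
	 * Example with delta_mask = 0xC0000000: delta_shift = 28, so a stored
	 * word of 0x40001234 yields delta = 4 (next rebase 4 bytes later) and a
	 * pointer payload of 0x00001234 once the delta bits are masked off.
	 */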
+
 typedef union vm_shared_region_slide_info_entry *vm_shared_region_slide_info_entry_t;
 union vm_shared_region_slide_info_entry {
        uint32_t        version;
        struct vm_shared_region_slide_info_entry_v1     v1;
        struct vm_shared_region_slide_info_entry_v2     v2;
+       struct vm_shared_region_slide_info_entry_v3     v3;
+       struct vm_shared_region_slide_info_entry_v4     v4;
 };
 
 typedef struct vm_shared_region_slide_info *vm_shared_region_slide_info_t;
 struct vm_shared_region_slide_info {
+       mach_vm_address_t       slid_address;
        mach_vm_offset_t        start;
        mach_vm_offset_t        end;
        uint32_t                slide;
@@ -159,6 +199,7 @@ struct vm_shared_region {
        queue_chain_t           sr_q;
        void                    *sr_root_dir;
        cpu_type_t              sr_cpu_type;
+       cpu_subtype_t           sr_cpu_subtype;
        boolean_t               sr_64bit;
        boolean_t               sr_mapping_in_progress;
        boolean_t               sr_slide_in_progress;
@@ -174,10 +215,13 @@ struct vm_shared_region {
        struct vm_shared_region_slide_info sr_slide_info;
        uuid_t                  sr_uuid;
        boolean_t               sr_uuid_copied;
+       uint32_t                sr_images_count;
+       struct dyld_uuid_info_64 *sr_images;
 };
 
 extern kern_return_t vm_shared_region_slide_page(vm_shared_region_slide_info_t si,
-       vm_offset_t     vaddr, 
+       vm_offset_t     vaddr,
+       mach_vm_offset_t uservaddr,
        uint32_t pageIndex);
 extern vm_shared_region_slide_info_t vm_shared_region_get_slide_info(vm_shared_region_t sr);
 #else  /* !MACH_KERNEL_PRIVATE */
@@ -195,12 +239,15 @@ extern kern_return_t vm_shared_region_enter(
        struct task             *task,
        boolean_t               is_64bit,
        void                    *fsroot,
-       cpu_type_t              cpu);
+       cpu_type_t              cpu,
+       cpu_subtype_t           cpu_subtype);
 extern kern_return_t vm_shared_region_remove(
        struct _vm_map          *map,
        struct task             *task);
 extern vm_shared_region_t vm_shared_region_get(
        struct task             *task);
+extern vm_shared_region_t vm_shared_region_trim_and_get(
+       struct task             *task);
 extern void vm_shared_region_deallocate(
        struct vm_shared_region *shared_region);
 extern mach_vm_offset_t vm_shared_region_base_address(
@@ -209,6 +256,8 @@ extern mach_vm_size_t vm_shared_region_size(
        struct vm_shared_region *shared_region);
 extern ipc_port_t vm_shared_region_mem_entry(
        struct vm_shared_region *shared_region);
+extern vm_map_t vm_shared_region_vm_map(
+       struct vm_shared_region *shared_region);
 extern uint32_t vm_shared_region_get_slide(
        vm_shared_region_t      shared_region);
 extern void vm_shared_region_set(
@@ -217,6 +266,7 @@ extern void vm_shared_region_set(
 extern vm_shared_region_t vm_shared_region_lookup(
        void                    *root_dir,
        cpu_type_t              cpu,
+       cpu_subtype_t           cpu_subtype,
        boolean_t               is_64bit);
 extern kern_return_t vm_shared_region_start_address(
        struct vm_shared_region *shared_region,
@@ -238,12 +288,6 @@ extern kern_return_t vm_shared_region_map_file(
        user_addr_t             slide_size);
 extern kern_return_t vm_shared_region_sliding_valid(uint32_t slide);
 extern kern_return_t vm_shared_region_slide_sanity_check(vm_shared_region_t sr);
-extern kern_return_t vm_shared_region_slide_init(vm_shared_region_t sr,
-               mach_vm_size_t slide_info_size,
-               mach_vm_offset_t start,
-               mach_vm_size_t size,
-               uint32_t slide,
-               memory_object_control_t);
 extern void* vm_shared_region_get_slide_info_entry(vm_shared_region_t sr);
 extern void vm_commpage_init(void);
 extern void vm_commpage_text_init(void);
@@ -259,6 +303,7 @@ int vm_shared_region_slide(uint32_t,
        mach_vm_size_t, 
        mach_vm_offset_t, 
        mach_vm_size_t, 
+       mach_vm_offset_t,
        memory_object_control_t);
 
 #endif /* KERNEL_PRIVATE */
diff --git a/osfmk/vm/vm_shared_region_pager.c b/osfmk/vm/vm_shared_region_pager.c
new file mode 100644 (file)
index 0000000..773233d
--- /dev/null
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <sys/errno.h>
+
+#include <mach/mach_types.h>
+#include <mach/mach_traps.h>
+#include <mach/host_priv.h>
+#include <mach/kern_return.h>
+#include <mach/memory_object_control.h>
+#include <mach/memory_object_types.h>
+#include <mach/port.h>
+#include <mach/policy.h>
+#include <mach/upl.h>
+#include <mach/thread_act.h>
+#include <mach/mach_vm.h>
+
+#include <kern/host.h>
+#include <kern/kalloc.h>
+#include <kern/queue.h>
+#include <kern/thread.h>
+#include <kern/ipc_kobject.h>
+
+#include <ipc/ipc_port.h>
+#include <ipc/ipc_space.h>
+
+#include <vm/memory_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_fault.h>
+#include <vm/vm_map.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_protos.h>
+#include <vm/vm_shared_region.h>
+
+
+/*
+ * SHARED REGION MEMORY PAGER
+ *
+ * This external memory manager (EMM) handles mappings of a dyld shared cache
+ * in shared regions, applying any necessary modifications (sliding,
+ * pointer signing, ...).
+ *
+ * It mostly handles page-in requests (from memory_object_data_request()) by
+ * getting the original data from its backing VM object, itself backed by
+ * the dyld shared cache file, modifying it if needed and providing it to VM.
+ *
+ * The modified pages will never be dirtied, so the memory manager doesn't
+ * need to handle page-out requests (from memory_object_data_return()).  The
+ * pages need to be mapped copy-on-write, so that the originals stay clean.
+ *
+ * We don't expect to have to handle a large number of shared cache files,
+ * so the data structures are very simple (simple linked list) for now.
+ */
+
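[Editor's note] For orientation, this is how the pager is wired up by its only current client, vm_shared_region_slide_mapping() (shown earlier in this change); the variable names below are placeholders for the caller's locals.

	/* Sketch of the caller side (see vm_shared_region_slide_mapping()). */
	memory_object_t sr_pager;

	sr_pager = shared_region_pager_setup(backing_object,   /* dyld cache file VM object  */
					     backing_offset,    /* offset of the slid mapping */
					     slide_info);       /* vm_shared_region_slide_info_t */
	if (sr_pager == MEMORY_OBJECT_NULL) {
		return KERN_RESOURCE_SHORTAGE;
	}

	/* map the pager copy-on-write over the range that needs sliding */
	kr = vm_map_enter_mem_object(sr_map, &map_addr, size, 0,
				     VM_FLAGS_FIXED | VM_FLAGS_OVERWRITE, vmk_flags,
				     VM_KERN_MEMORY_NONE,
				     (ipc_port_t)(uintptr_t)sr_pager, 0, TRUE,
				     prot, max_prot, inheritance);

	/* drop the setup reference; the mapping now keeps the pager alive */
	memory_object_deallocate(sr_pager);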
+/* forward declarations */
+void shared_region_pager_reference(memory_object_t mem_obj);
+void shared_region_pager_deallocate(memory_object_t mem_obj);
+kern_return_t shared_region_pager_init(memory_object_t mem_obj,
+                                      memory_object_control_t control,
+                                      memory_object_cluster_size_t pg_size);
+kern_return_t shared_region_pager_terminate(memory_object_t mem_obj);
+kern_return_t shared_region_pager_data_request(memory_object_t mem_obj,
+                                              memory_object_offset_t offset,
+                                              memory_object_cluster_size_t length,
+                                              vm_prot_t protection_required,
+                                              memory_object_fault_info_t fault_info);
+kern_return_t shared_region_pager_data_return(memory_object_t mem_obj,
+                                             memory_object_offset_t offset,
+                                             memory_object_cluster_size_t      data_cnt,
+                                             memory_object_offset_t *resid_offset,
+                                             int *io_error,
+                                             boolean_t dirty,
+                                             boolean_t kernel_copy,
+                                             int upl_flags);
+kern_return_t shared_region_pager_data_initialize(memory_object_t mem_obj,
+                                                 memory_object_offset_t offset,
+                                                 memory_object_cluster_size_t data_cnt);
+kern_return_t shared_region_pager_data_unlock(memory_object_t mem_obj,
+                                             memory_object_offset_t offset,
+                                             memory_object_size_t size,
+                                             vm_prot_t desired_access);
+kern_return_t shared_region_pager_synchronize(memory_object_t mem_obj,
+                                             memory_object_offset_t offset,
+                                             memory_object_size_t length,
+                                             vm_sync_t sync_flags);
+kern_return_t shared_region_pager_map(memory_object_t mem_obj,
+                                     vm_prot_t prot);
+kern_return_t shared_region_pager_last_unmap(memory_object_t mem_obj);
+
+/*
+ * Vector of VM operations for this EMM.
+ * These routines are invoked by VM via the memory_object_*() interfaces.
+ */
+const struct memory_object_pager_ops shared_region_pager_ops = {
+       shared_region_pager_reference,
+       shared_region_pager_deallocate,
+       shared_region_pager_init,
+       shared_region_pager_terminate,
+       shared_region_pager_data_request,
+       shared_region_pager_data_return,
+       shared_region_pager_data_initialize,
+       shared_region_pager_data_unlock,
+       shared_region_pager_synchronize,
+       shared_region_pager_map,
+       shared_region_pager_last_unmap,
+       NULL, /* data_reclaim */
+       "shared_region"
+};
+
+/*
+ * The "shared_region_pager" describes a memory object backed by
+ * the "shared_region" EMM.
+ */
+typedef struct shared_region_pager {
+       /* mandatory generic header */
+       struct memory_object sc_pgr_hdr;
+
+       /* pager-specific data */
+       queue_chain_t           pager_queue;    /* next & prev pagers */
+       unsigned int            ref_count;      /* reference count */
+       boolean_t               is_ready;       /* is this pager ready ? */
+       boolean_t               is_mapped;      /* is this mem_obj mapped ? */
+       vm_object_t             backing_object; /* VM obj for shared cache */
+       vm_object_offset_t      backing_offset;
+       struct vm_shared_region_slide_info *scp_slide_info;
+} *shared_region_pager_t;
+#define        SHARED_REGION_PAGER_NULL        ((shared_region_pager_t) NULL)
+
+/*
+ * List of memory objects managed by this EMM.
+ * The list is protected by the "shared_region_pager_lock" lock.
+ */
+int shared_region_pager_count = 0;             /* number of pagers */
+int shared_region_pager_count_mapped = 0;      /* number of mapped pagers */
+queue_head_t shared_region_pager_queue;
+decl_lck_mtx_data(,shared_region_pager_lock)
+
+/*
+ * Maximum number of unmapped pagers we're willing to keep around.
+ */
+int shared_region_pager_cache_limit = 0;
+
+/*
+ * Statistics & counters.
+ */
+int shared_region_pager_count_max = 0;
+int shared_region_pager_count_unmapped_max = 0;
+int shared_region_pager_num_trim_max = 0;
+int shared_region_pager_num_trim_total = 0;
+
+
+lck_grp_t      shared_region_pager_lck_grp;
+lck_grp_attr_t shared_region_pager_lck_grp_attr;
+lck_attr_t     shared_region_pager_lck_attr;
+
+uint64_t shared_region_pager_copied = 0;
+uint64_t shared_region_pager_slid = 0;
+uint64_t shared_region_pager_slid_error = 0;
+uint64_t shared_region_pager_reclaimed = 0;
+
+/* internal prototypes */
+shared_region_pager_t shared_region_pager_create(
+       vm_object_t backing_object,
+       vm_object_offset_t backing_offset,
+       struct vm_shared_region_slide_info *slide_info);
+shared_region_pager_t shared_region_pager_lookup(memory_object_t mem_obj);
+void shared_region_pager_dequeue(shared_region_pager_t pager);
+void shared_region_pager_deallocate_internal(shared_region_pager_t pager,
+                                            boolean_t locked);
+void shared_region_pager_terminate_internal(shared_region_pager_t pager);
+void shared_region_pager_trim(void);
+
+
+#if DEBUG
+int shared_region_pagerdebug = 0;
+#define PAGER_ALL              0xffffffff
+#define        PAGER_INIT              0x00000001
+#define        PAGER_PAGEIN            0x00000002
+
+#define PAGER_DEBUG(LEVEL, A)                                          \
+       MACRO_BEGIN                                                     \
+       if ((shared_region_pagerdebug & (LEVEL)) == (LEVEL)) {          \
+               printf A;                                               \
+       }                                                               \
+       MACRO_END
+#else
+#define PAGER_DEBUG(LEVEL, A)
+#endif
+
+
+void
+shared_region_pager_bootstrap(void)
+{
+       lck_grp_attr_setdefault(&shared_region_pager_lck_grp_attr);
+       lck_grp_init(&shared_region_pager_lck_grp, "shared_region", &shared_region_pager_lck_grp_attr);
+       lck_attr_setdefault(&shared_region_pager_lck_attr);
+       lck_mtx_init(&shared_region_pager_lock, &shared_region_pager_lck_grp, &shared_region_pager_lck_attr);
+       queue_init(&shared_region_pager_queue);
+}
+
+/*
+ * shared_region_pager_init()
+ *
+ * Initializes the memory object and makes it ready to be used and mapped.
+ */
+kern_return_t
+shared_region_pager_init(
+       memory_object_t         mem_obj,
+       memory_object_control_t control,
+#if !DEBUG
+       __unused
+#endif
+       memory_object_cluster_size_t pg_size)
+{
+       shared_region_pager_t   pager;
+       kern_return_t           kr;
+       memory_object_attr_info_data_t  attributes;
+
+       PAGER_DEBUG(PAGER_ALL,
+                   ("shared_region_pager_init: %p, %p, %x\n",
+                    mem_obj, control, pg_size));
+
+       if (control == MEMORY_OBJECT_CONTROL_NULL)
+               return KERN_INVALID_ARGUMENT;
+
+       pager = shared_region_pager_lookup(mem_obj);
+
+       memory_object_control_reference(control);
+
+       pager->sc_pgr_hdr.mo_control = control;
+
+       attributes.copy_strategy = MEMORY_OBJECT_COPY_DELAY;
+       /* attributes.cluster_size = (1 << (CLUSTER_SHIFT + PAGE_SHIFT));*/
+       attributes.cluster_size = (1 << (PAGE_SHIFT));
+       attributes.may_cache_object = FALSE;
+       attributes.temporary = TRUE;
+
+       kr = memory_object_change_attributes(
+                                       control,
+                                       MEMORY_OBJECT_ATTRIBUTE_INFO,
+                                       (memory_object_info_t) &attributes,
+                                       MEMORY_OBJECT_ATTR_INFO_COUNT);
+       if (kr != KERN_SUCCESS)
+               panic("shared_region_pager_init: "
+                     "memory_object_change_attributes() failed");
+
+#if CONFIG_SECLUDED_MEMORY
+       if (secluded_for_filecache) {
+#if 00
+               /*
+                * XXX FBDP do we want this in the secluded pool?
+                * Ideally, we'd want the shared region used by Camera to
+                * NOT be in the secluded pool, but all other shared regions
+                * in the secluded pool...
+                */
+               memory_object_mark_eligible_for_secluded(control, TRUE);
+#endif /* 00 */
+       }
+#endif /* CONFIG_SECLUDED_MEMORY */
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * shared_region_pager_data_return()
+ *
+ * Handles page-out requests from VM.  This should never happen since
+ * the pages provided by this EMM are not supposed to be dirty or dirtied
+ * and VM should simply discard the contents and reclaim the pages if it
+ * needs to.
+ */
+kern_return_t
+shared_region_pager_data_return(
+        __unused memory_object_t       mem_obj,
+        __unused memory_object_offset_t        offset,
+        __unused memory_object_cluster_size_t          data_cnt,
+        __unused memory_object_offset_t        *resid_offset,
+       __unused int                    *io_error,
+       __unused boolean_t              dirty,
+       __unused boolean_t              kernel_copy,
+       __unused int                    upl_flags)
+{
+       panic("shared_region_pager_data_return: should never get called");
+       return KERN_FAILURE;
+}
+
+kern_return_t
+shared_region_pager_data_initialize(
+       __unused memory_object_t        mem_obj,
+       __unused memory_object_offset_t offset,
+       __unused memory_object_cluster_size_t           data_cnt)
+{
+       panic("shared_region_pager_data_initialize: should never get called");
+       return KERN_FAILURE;
+}
+
+kern_return_t
+shared_region_pager_data_unlock(
+       __unused memory_object_t        mem_obj,
+       __unused memory_object_offset_t offset,
+       __unused memory_object_size_t           size,
+       __unused vm_prot_t              desired_access)
+{
+       return KERN_FAILURE;
+}
+
+/*
+ * shared_region_pager_data_request()
+ *
+ * Handles page-in requests from VM.
+ */
+int shared_region_pager_data_request_debug = 0;
+kern_return_t
+shared_region_pager_data_request(
+       memory_object_t         mem_obj,
+       memory_object_offset_t  offset,
+       memory_object_cluster_size_t            length,
+#if !DEBUG
+       __unused
+#endif
+       vm_prot_t               protection_required,
+       memory_object_fault_info_t mo_fault_info)
+{
+       shared_region_pager_t   pager;
+       memory_object_control_t mo_control;
+       upl_t                   upl;
+       int                     upl_flags;
+       upl_size_t              upl_size;
+       upl_page_info_t         *upl_pl;
+       unsigned int            pl_count;
+       vm_object_t             src_top_object, src_page_object, dst_object;
+       kern_return_t           kr, retval;
+       vm_offset_t             src_vaddr, dst_vaddr;
+       vm_offset_t             cur_offset;
+       vm_offset_t             offset_in_page;
+       kern_return_t           error_code;
+       vm_prot_t               prot;
+       vm_page_t               src_page, top_page;
+       int                     interruptible;
+       struct vm_object_fault_info     fault_info;
+       mach_vm_offset_t        slide_start_address;
+
+       PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_data_request: %p, %llx, %x, %x\n", mem_obj, offset, length, protection_required));
+
+       retval = KERN_SUCCESS;
+       src_top_object = VM_OBJECT_NULL;
+       src_page_object = VM_OBJECT_NULL;
+       upl = NULL;
+       upl_pl = NULL;
+       fault_info = *((struct vm_object_fault_info *)(uintptr_t)mo_fault_info);
+       fault_info.stealth = TRUE;
+       fault_info.io_sync = FALSE;
+       fault_info.mark_zf_absent = FALSE;
+       fault_info.batch_pmap_op = FALSE;
+       interruptible = fault_info.interruptible;
+
+       pager = shared_region_pager_lookup(mem_obj);
+       assert(pager->is_ready);
+       assert(pager->ref_count > 1); /* pager is alive and mapped */
+
+       PAGER_DEBUG(PAGER_PAGEIN, ("shared_region_pager_data_request: %p, %llx, %x, %x, pager %p\n", mem_obj, offset, length, protection_required, pager));
+
+       /*
+        * Gather in a UPL all the VM pages requested by VM.
+        */
+       mo_control = pager->sc_pgr_hdr.mo_control;
+
+       upl_size = length;
+       upl_flags =
+               UPL_RET_ONLY_ABSENT |
+               UPL_SET_LITE |
+               UPL_NO_SYNC |
+               UPL_CLEAN_IN_PLACE |    /* triggers UPL_CLEAR_DIRTY */
+               UPL_SET_INTERNAL;
+       pl_count = 0;
+       kr = memory_object_upl_request(mo_control,
+                                      offset, upl_size,
+                                      &upl, NULL, NULL, upl_flags, VM_KERN_MEMORY_SECURITY);
+       if (kr != KERN_SUCCESS) {
+               retval = kr;
+               goto done;
+       }
+       dst_object = mo_control->moc_object;
+       assert(dst_object != VM_OBJECT_NULL);
+
+       /*
+        * We'll map the original data in the kernel address space from the
+        * backing VM object (itself backed by the shared cache file via
+        * the vnode pager).
+        */
+       src_top_object = pager->backing_object;
+       assert(src_top_object != VM_OBJECT_NULL);
+       vm_object_reference(src_top_object); /* keep the source object alive */
+
+       slide_start_address = pager->scp_slide_info->slid_address;
+
+       fault_info.lo_offset += pager->backing_offset;
+       fault_info.hi_offset += pager->backing_offset;
+
+       /*
+        * Fill in the contents of the pages requested by VM.
+        */
+       upl_pl = UPL_GET_INTERNAL_PAGE_LIST(upl);
+       pl_count = length / PAGE_SIZE;
+       for (cur_offset = 0;
+            retval == KERN_SUCCESS && cur_offset < length;
+            cur_offset += PAGE_SIZE) {
+               ppnum_t dst_pnum;
+
+               if (!upl_page_present(upl_pl, (int)(cur_offset / PAGE_SIZE))) {
+                       /* this page is not in the UPL: skip it */
+                       continue;
+               }
+
+               /*
+                * Map the source (dyld shared cache) page in the kernel's
+                * virtual address space.
+                * We already hold a reference on the src_top_object.
+                */
+       retry_src_fault:
+               vm_object_lock(src_top_object);
+               vm_object_paging_begin(src_top_object);
+               error_code = 0;
+               prot = VM_PROT_READ;
+               src_page = VM_PAGE_NULL;
+               kr = vm_fault_page(src_top_object,
+                                  pager->backing_offset + offset + cur_offset,
+                                  VM_PROT_READ,
+                                  FALSE,
+                                  FALSE, /* src_page not looked up */
+                                  &prot,
+                                  &src_page,
+                                  &top_page,
+                                  NULL,
+                                  &error_code,
+                                  FALSE,
+                                  FALSE,
+                                  &fault_info);
+               switch (kr) {
+               case VM_FAULT_SUCCESS:
+                       break;
+               case VM_FAULT_RETRY:
+                       goto retry_src_fault;
+               case VM_FAULT_MEMORY_SHORTAGE:
+                       if (vm_page_wait(interruptible)) {
+                               goto retry_src_fault;
+                       }
+                       /* fall thru */
+               case VM_FAULT_INTERRUPTED:
+                       retval = MACH_SEND_INTERRUPTED;
+                       goto done;
+               case VM_FAULT_SUCCESS_NO_VM_PAGE:
+                       /* success but no VM page: fail */
+                       vm_object_paging_end(src_top_object);
+                       vm_object_unlock(src_top_object);
+                       /*FALLTHROUGH*/
+               case VM_FAULT_MEMORY_ERROR:
+                       /* the page is not there ! */
+                       if (error_code) {
+                               retval = error_code;
+                       } else {
+                               retval = KERN_MEMORY_ERROR;
+                       }
+                       goto done;
+               default:
+                       panic("shared_region_pager_data_request: "
+                             "vm_fault_page() unexpected error 0x%x\n",
+                             kr);
+               }
+               assert(src_page != VM_PAGE_NULL);
+               assert(src_page->vmp_busy);
+
+               if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
+                       vm_page_lockspin_queues();
+                       if (src_page->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
+                               vm_page_speculate(src_page, FALSE);
+                       }
+                       vm_page_unlock_queues();
+               }
+
+               /*
+                * Establish pointers to the source
+                * and destination physical pages.
+                */
+               dst_pnum = (ppnum_t)
+                       upl_phys_page(upl_pl, (int)(cur_offset / PAGE_SIZE));
+                assert(dst_pnum != 0);
+#if __x86_64__
+               src_vaddr = (vm_map_offset_t)
+                       PHYSMAP_PTOV((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
+                                    << PAGE_SHIFT);
+               dst_vaddr = (vm_map_offset_t)
+                       PHYSMAP_PTOV((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
+
+#elif __arm__ || __arm64__
+               src_vaddr = (vm_map_offset_t)
+                       phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(src_page)
+                                << PAGE_SHIFT);
+               dst_vaddr = (vm_map_offset_t)
+                       phystokv((pmap_paddr_t)dst_pnum << PAGE_SHIFT);
+#else
+#error "vm_paging_map_object: no 1-to-1 kernel mapping of physical memory..."
+               src_vaddr = 0;
+               dst_vaddr = 0;
+#endif
+               src_page_object = VM_PAGE_OBJECT(src_page);
+
+               /*
+                * Validate the original page...
+                */
+               if (src_page_object->code_signed) {
+                       vm_page_validate_cs_mapped(
+                               src_page,
+                               (const void *) src_vaddr);
+               }
+               /*
+                * ... and transfer the results to the destination page.
+                */
+               UPL_SET_CS_VALIDATED(upl_pl, cur_offset / PAGE_SIZE,
+                                    src_page->vmp_cs_validated);
+               UPL_SET_CS_TAINTED(upl_pl, cur_offset / PAGE_SIZE,
+                                  src_page->vmp_cs_tainted);
+               UPL_SET_CS_NX(upl_pl, cur_offset / PAGE_SIZE,
+                                  src_page->vmp_cs_nx);
+
+               /*
+                * The page provider might access a mapped file, so let's
+                * release the object lock for the source page to avoid a
+                * potential deadlock.
+                * The source page is kept busy and we have a
+                * "paging_in_progress" reference on its object, so it's safe
+                * to unlock the object here.
+                */
+               assert(src_page->vmp_busy);
+               assert(src_page_object->paging_in_progress > 0);
+               vm_object_unlock(src_page_object);
+
+               /*
+                * Process the original contents of the source page
+                * into the destination page.
+                */
+               for (offset_in_page = 0;
+                    offset_in_page < PAGE_SIZE;
+                    offset_in_page += PAGE_SIZE_FOR_SR_SLIDE) {
+                       vm_object_offset_t chunk_offset;
+                       vm_object_offset_t offset_in_backing_object;
+                       vm_object_offset_t offset_in_sliding_range;
+
+                       chunk_offset = offset + cur_offset + offset_in_page;
+
+                       bcopy((const char *)(src_vaddr +
+                                            offset_in_page),
+                             (char *)(dst_vaddr + offset_in_page),
+                             PAGE_SIZE_FOR_SR_SLIDE);
+
+                       offset_in_backing_object = (chunk_offset +
+                                                   pager->backing_offset);
+                       if ((offset_in_backing_object < pager->scp_slide_info->start) ||
+                           (offset_in_backing_object >= pager->scp_slide_info->end)) {
+                               /* chunk is outside of sliding range: done */
+                               shared_region_pager_copied++;
+                               continue;
+                       }
+
+                       offset_in_sliding_range =
+                               (offset_in_backing_object -
+                                pager->scp_slide_info->start);
+                       kr = vm_shared_region_slide_page(
+                               pager->scp_slide_info,
+                               dst_vaddr + offset_in_page,
+                               (mach_vm_offset_t) (offset_in_sliding_range +
+                                                   slide_start_address),
+                               (uint32_t) (offset_in_sliding_range /
+                                           PAGE_SIZE_FOR_SR_SLIDE));
+                       if (shared_region_pager_data_request_debug) {
+                               printf("shared_region_pager_data_request"
+                                      "(%p,0x%llx+0x%llx+0x%04llx): 0x%llx "
+                                      "in sliding range [0x%llx:0x%llx]: "
+                                      "SLIDE offset 0x%llx="
+                                      "(0x%llx+0x%llx+0x%llx+0x%04llx)"
+                                      "[0x%016llx 0x%016llx] "
+                                      "code_signed=%d "
+                                      "cs_validated=%d "
+                                      "cs_tainted=%d "
+                                      "cs_nx=%d "
+                                      "kr=0x%x\n",
+                                      pager,
+                                      offset,
+                                      (uint64_t) cur_offset,
+                                      (uint64_t) offset_in_page,
+                                      chunk_offset,
+                                      pager->scp_slide_info->start,
+                                      pager->scp_slide_info->end,
+                                      (pager->backing_offset +
+                                       offset +
+                                       cur_offset +
+                                       offset_in_page),
+                                      pager->backing_offset,
+                                      offset,
+                                      (uint64_t) cur_offset,
+                                      (uint64_t) offset_in_page,
+                                      *(uint64_t *)(dst_vaddr+offset_in_page),
+                                      *(uint64_t *)(dst_vaddr+offset_in_page+8),
+                                      src_page_object->code_signed,
+                                      src_page->vmp_cs_validated,
+                                      src_page->vmp_cs_tainted,
+                                      src_page->vmp_cs_nx,
+                                      kr);
+                       }
+                       if (kr != KERN_SUCCESS) {
+                               shared_region_pager_slid_error++;
+                               break;
+                       }
+                       shared_region_pager_slid++;
+               }
+
+               assert(VM_PAGE_OBJECT(src_page) == src_page_object);
+               assert(src_page->vmp_busy);
+               assert(src_page_object->paging_in_progress > 0);
+               vm_object_lock(src_page_object);
+
+               /*
+                * Cleanup the result of vm_fault_page() of the source page.
+                */
+               PAGE_WAKEUP_DONE(src_page);
+               src_page = VM_PAGE_NULL;
+               vm_object_paging_end(src_page_object);
+               vm_object_unlock(src_page_object);
+
+               if (top_page != VM_PAGE_NULL) {
+                       assert(VM_PAGE_OBJECT(top_page) == src_top_object);
+                       vm_object_lock(src_top_object);
+                       VM_PAGE_FREE(top_page);
+                       vm_object_paging_end(src_top_object);
+                       vm_object_unlock(src_top_object);
+               }
+       }
+
+done:
+       if (upl != NULL) {
+               /* clean up the UPL */
+
+               /*
+                * The pages are currently dirty because we've just been
+                * writing on them, but as far as we're concerned, they're
+                * clean since they contain their "original" contents as
+                * provided by us, the pager.
+                * Tell the UPL to mark them "clean".
+                */
+               upl_clear_dirty(upl, TRUE);
+
+               /* abort or commit the UPL */
+               if (retval != KERN_SUCCESS) {
+                       upl_abort(upl, 0);
+               } else {
+                       boolean_t empty;
+                       upl_commit_range(upl, 0, upl->size,
+                                        UPL_COMMIT_CS_VALIDATED | UPL_COMMIT_WRITTEN_BY_KERNEL,
+                                        upl_pl, pl_count, &empty);
+               }
+
+               /* and deallocate the UPL */
+               upl_deallocate(upl);
+               upl = NULL;
+       }
+       if (src_top_object != VM_OBJECT_NULL) {
+               vm_object_deallocate(src_top_object);
+       }
+       return retval;
+}
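
To make the offset bookkeeping in the chunk loop above concrete, here is a small worked example with hypothetical numbers:

    /* Hypothetical values for one chunk:
     *   pager->backing_offset  = 0x100000  (where this pager's data starts in the backing object)
     *   offset                 = 0x4000    (start of the page-in request, relative to the pager)
     *   cur_offset             = 0x1000    (second page of the request)
     *   offset_in_page         = 0x0       (first slide chunk of that page)
     *
     *   chunk_offset             = offset + cur_offset + offset_in_page  = 0x5000
     *   offset_in_backing_object = chunk_offset + pager->backing_offset  = 0x105000
     *
     * If scp_slide_info->start <= 0x105000 < scp_slide_info->end, the chunk is slid:
     *   offset_in_sliding_range = 0x105000 - scp_slide_info->start
     * and vm_shared_region_slide_page() rebases it as if it were mapped at
     *   slide_start_address + offset_in_sliding_range.
     */
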
+
+/*
+ * shared_region_pager_reference()
+ *
+ * Get a reference on this memory object.
+ * For external usage only.  Assumes that the initial reference count is not 0,
+ * i.e. one should not "revive" a dead pager this way.
+ */
+void
+shared_region_pager_reference(
+       memory_object_t         mem_obj)
+{
+       shared_region_pager_t   pager;
+
+       pager = shared_region_pager_lookup(mem_obj);
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       assert(pager->ref_count > 0);
+       pager->ref_count++;
+       lck_mtx_unlock(&shared_region_pager_lock);
+}
+
+
+/*
+ * shared_region_pager_dequeue:
+ *
+ * Removes a pager from the list of pagers.
+ *
+ * The caller must hold "shared_region_pager_lock".
+ */
+void
+shared_region_pager_dequeue(
+       shared_region_pager_t pager)
+{
+       assert(!pager->is_mapped);
+
+       queue_remove(&shared_region_pager_queue,
+                    pager,
+                    shared_region_pager_t,
+                    pager_queue);
+       pager->pager_queue.next = NULL;
+       pager->pager_queue.prev = NULL;
+
+       shared_region_pager_count--;
+}
+
+/*
+ * shared_region_pager_terminate_internal:
+ *
+ * Trigger the asynchronous termination of the memory object associated
+ * with this pager.
+ * When the memory object is terminated, there will be one more call
+ * to memory_object_deallocate() (i.e. shared_region_pager_deallocate())
+ * to finish the clean up.
+ *
+ * "shared_region_pager_lock" should not be held by the caller.
+ * We don't need the lock because the pager has already been removed from
+ * the pagers' list and is now ours exclusively.
+ */
+void
+shared_region_pager_terminate_internal(
+       shared_region_pager_t pager)
+{
+       assert(pager->is_ready);
+       assert(!pager->is_mapped);
+
+       if (pager->backing_object != VM_OBJECT_NULL) {
+               vm_object_deallocate(pager->backing_object);
+               pager->backing_object = VM_OBJECT_NULL;
+       }
+       /* trigger the destruction of the memory object */
+       memory_object_destroy(pager->sc_pgr_hdr.mo_control, 0);
+}
+
+/*
+ * shared_region_pager_deallocate_internal()
+ *
+ * Release a reference on this pager and free it when the last
+ * reference goes away.
+ * Can be called with shared_region_pager_lock held or not but always returns
+ * with it unlocked.
+ */
+void
+shared_region_pager_deallocate_internal(
+       shared_region_pager_t   pager,
+       boolean_t               locked)
+{
+       boolean_t       needs_trimming;
+       int             count_unmapped;
+
+       if (! locked) {
+               lck_mtx_lock(&shared_region_pager_lock);
+       }
+
+       count_unmapped = (shared_region_pager_count -
+                         shared_region_pager_count_mapped);
+       if (count_unmapped > shared_region_pager_cache_limit) {
+               /* we have too many unmapped pagers:  trim some */
+               needs_trimming = TRUE;
+       } else {
+               needs_trimming = FALSE;
+       }
+
+       /* drop a reference on this pager */
+       pager->ref_count--;
+
+       if (pager->ref_count == 1) {
+               /*
+                * Only the "named" reference is left, which means that
+                * no one is really holding on to this pager anymore.
+                * Terminate it.
+                */
+               shared_region_pager_dequeue(pager);
+               /* the pager is all ours: no need for the lock now */
+               lck_mtx_unlock(&shared_region_pager_lock);
+               shared_region_pager_terminate_internal(pager);
+       } else if (pager->ref_count == 0) {
+               /*
+                * Dropped the existence reference;  the memory object has
+                * been terminated.  Do some final cleanup and release the
+                * pager structure.
+                */
+               lck_mtx_unlock(&shared_region_pager_lock);
+               if (pager->sc_pgr_hdr.mo_control != MEMORY_OBJECT_CONTROL_NULL) {
+                       memory_object_control_deallocate(pager->sc_pgr_hdr.mo_control);
+                       pager->sc_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
+               }
+               kfree(pager, sizeof (*pager));
+               pager = SHARED_REGION_PAGER_NULL;
+       } else {
+               /* there are still plenty of references:  keep going... */
+               lck_mtx_unlock(&shared_region_pager_lock);
+       }
+
+       if (needs_trimming) {
+               shared_region_pager_trim();
+       }
+       /* caution: lock is not held on return... */
+}
+
+/*
+ * shared_region_pager_deallocate()
+ *
+ * Release a reference on this pager and free it when the last
+ * reference goes away.
+ */
+void
+shared_region_pager_deallocate(
+       memory_object_t         mem_obj)
+{
+       shared_region_pager_t   pager;
+
+       PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_deallocate: %p\n", mem_obj));
+       pager = shared_region_pager_lookup(mem_obj);
+       shared_region_pager_deallocate_internal(pager, FALSE);
+}
+
+/*
+ *
+ */
+kern_return_t
+shared_region_pager_terminate(
+#if !DEBUG
+       __unused
+#endif
+       memory_object_t mem_obj)
+{
+       PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_terminate: %p\n", mem_obj));
+
+       return KERN_SUCCESS;
+}
+
+/*
+ *
+ */
+kern_return_t
+shared_region_pager_synchronize(
+       __unused memory_object_t                mem_obj,
+       __unused memory_object_offset_t offset,
+       __unused memory_object_size_t           length,
+       __unused vm_sync_t              sync_flags)
+{
+       panic("shared_region_pager_synchronize: memory_object_synchronize no longer supported\n");
+       return KERN_FAILURE;
+}
+
+/*
+ * shared_region_pager_map()
+ *
+ * This allows VM to let us, the EMM, know that this memory object
+ * is currently mapped one or more times.  This is called by VM each time
+ * the memory object gets mapped and we take one extra reference on the
+ * memory object to account for all its mappings.
+ */
+kern_return_t
+shared_region_pager_map(
+       memory_object_t         mem_obj,
+       __unused vm_prot_t      prot)
+{
+       shared_region_pager_t   pager;
+
+       PAGER_DEBUG(PAGER_ALL, ("shared_region_pager_map: %p\n", mem_obj));
+
+       pager = shared_region_pager_lookup(mem_obj);
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       assert(pager->is_ready);
+       assert(pager->ref_count > 0); /* pager is alive */
+       if (pager->is_mapped == FALSE) {
+               /*
+                * First mapping of this pager:  take an extra reference
+                * that will remain until all the mappings of this pager
+                * are removed.
+                */
+               pager->is_mapped = TRUE;
+               pager->ref_count++;
+               shared_region_pager_count_mapped++;
+       }
+       lck_mtx_unlock(&shared_region_pager_lock);
+
+       return KERN_SUCCESS;
+}
+
+/*
+ * shared_region_pager_last_unmap()
+ *
+ * This is called by VM when this memory object is no longer mapped anywhere.
+ */
+kern_return_t
+shared_region_pager_last_unmap(
+       memory_object_t         mem_obj)
+{
+       shared_region_pager_t   pager;
+       int                     count_unmapped;
+
+       PAGER_DEBUG(PAGER_ALL,
+                   ("shared_region_pager_last_unmap: %p\n", mem_obj));
+
+       pager = shared_region_pager_lookup(mem_obj);
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       if (pager->is_mapped) {
+               /*
+                * All the mappings are gone, so let go of the one extra
+                * reference that represents all the mappings of this pager.
+                */
+               shared_region_pager_count_mapped--;
+               count_unmapped = (shared_region_pager_count -
+                                 shared_region_pager_count_mapped);
+               if (count_unmapped > shared_region_pager_count_unmapped_max) {
+                       shared_region_pager_count_unmapped_max = count_unmapped;
+               }
+               pager->is_mapped = FALSE;
+               shared_region_pager_deallocate_internal(pager, TRUE);
+               /* caution: deallocate_internal() released the lock ! */
+       } else {
+               lck_mtx_unlock(&shared_region_pager_lock);
+       }
+
+       return KERN_SUCCESS;
+}
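
Taken together, the routines above imply a reference-count lifecycle along the following lines (a sketch of one plausible ordering, not a guaranteed sequence):

    /*
     * shared_region_pager_create()         ref_count = 2  (existence ref + caller's ref)
     * shared_region_pager_map()            ref_count = 3  (first mapping of the memory object)
     * memory_object_deallocate()           ref_count = 2  (caller drops its ref via
     *                                                      shared_region_pager_deallocate())
     * shared_region_pager_last_unmap()     ref_count = 1  (mapping ref dropped; only the
     *                                                      "named" ref is left, so the pager
     *                                                      is dequeued and terminated)
     * shared_region_pager_deallocate()     ref_count = 0  (final ref dropped once the memory
     *                                                      object is terminated; struct freed)
     */
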
+
+
+/*
+ *
+ */
+shared_region_pager_t
+shared_region_pager_lookup(
+       memory_object_t  mem_obj)
+{
+       shared_region_pager_t   pager;
+
+       assert(mem_obj->mo_pager_ops == &shared_region_pager_ops);
+       pager = (shared_region_pager_t)(uintptr_t) mem_obj;
+       assert(pager->ref_count > 0);
+       return pager;
+}
+
+shared_region_pager_t
+shared_region_pager_create(
+       vm_object_t             backing_object,
+       vm_object_offset_t      backing_offset,
+       struct vm_shared_region_slide_info *slide_info)
+{
+       shared_region_pager_t   pager;
+       memory_object_control_t control;
+       kern_return_t           kr;
+
+       pager = (shared_region_pager_t) kalloc(sizeof (*pager));
+       if (pager == SHARED_REGION_PAGER_NULL) {
+               return SHARED_REGION_PAGER_NULL;
+       }
+
+       /*
+        * The vm_map call takes both named entry ports and raw memory
+        * objects in the same parameter.  We need to make sure that
+        * vm_map does not see this object as a named entry port.  So,
+        * we reserve the first word in the object for a fake ip_kotype
+        * setting - that will tell vm_map to use it as a memory object.
+        */
+       pager->sc_pgr_hdr.mo_ikot = IKOT_MEMORY_OBJECT;
+       pager->sc_pgr_hdr.mo_pager_ops = &shared_region_pager_ops;
+       pager->sc_pgr_hdr.mo_control = MEMORY_OBJECT_CONTROL_NULL;
+
+       pager->is_ready = FALSE;/* not ready until it has a "name" */
+       pager->ref_count = 1;   /* existence reference (for the cache) */
+       pager->ref_count++;     /* for the caller */
+       pager->is_mapped = FALSE;
+       pager->backing_object = backing_object;
+       pager->backing_offset = backing_offset;
+       pager->scp_slide_info = slide_info;
+
+       vm_object_reference(backing_object);
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       /* enter new pager at the head of our list of pagers */
+       queue_enter_first(&shared_region_pager_queue,
+                         pager,
+                         shared_region_pager_t,
+                         pager_queue);
+       shared_region_pager_count++;
+       if (shared_region_pager_count > shared_region_pager_count_max) {
+               shared_region_pager_count_max = shared_region_pager_count;
+       }
+       lck_mtx_unlock(&shared_region_pager_lock);
+
+       kr = memory_object_create_named((memory_object_t) pager,
+                                       0,
+                                       &control);
+       assert(kr == KERN_SUCCESS);
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       /* the new pager is now ready to be used */
+       pager->is_ready = TRUE;
+       lck_mtx_unlock(&shared_region_pager_lock);
+
+       /* wakeup anyone waiting for this pager to be ready */
+       thread_wakeup(&pager->is_ready);
+
+       return pager;
+}
+
+/*
+ * shared_region_pager_setup()
+ *
+ * Provide the caller with a memory object backed by the provided
+ * "backing_object" VM object.
+ */
+memory_object_t
+shared_region_pager_setup(
+       vm_object_t             backing_object,
+       vm_object_offset_t      backing_offset,
+       struct vm_shared_region_slide_info *slide_info)
+{
+       shared_region_pager_t   pager;
+
+       /* create new pager */
+       pager = shared_region_pager_create(
+               backing_object,
+               backing_offset,
+               slide_info);
+       if (pager == SHARED_REGION_PAGER_NULL) {
+               /* could not create a new pager */
+               return MEMORY_OBJECT_NULL;
+       }
+
+       lck_mtx_lock(&shared_region_pager_lock);
+       while (!pager->is_ready) {
+               lck_mtx_sleep(&shared_region_pager_lock,
+                       LCK_SLEEP_DEFAULT,
+                       &pager->is_ready,
+                       THREAD_UNINT);
+       }
+       lck_mtx_unlock(&shared_region_pager_lock);
+
+       return (memory_object_t) pager;
+}
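
As a hedged sketch of how a caller might use this entry point (the actual caller lives in the shared-region mapping code and is not shown in this file; every name below other than shared_region_pager_setup() is illustrative):

    memory_object_t mo;

    /* cache_object/cache_offset/slide_info would describe the range of the
     * dyld shared cache that needs sliding in this shared region */
    mo = shared_region_pager_setup(cache_object, cache_offset, slide_info);
    if (mo == MEMORY_OBJECT_NULL) {
            /* pager creation failed: the caller has to bail out */
    }
    /* "mo" can then be mapped copy-on-write into the shared region; pages
     * faulted through it are copied from the cache and slid on the fly */
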
+
+void
+shared_region_pager_trim(void)
+{
+       shared_region_pager_t   pager, prev_pager;
+       queue_head_t            trim_queue;
+       int                     num_trim;
+       int                     count_unmapped;
+
+       lck_mtx_lock(&shared_region_pager_lock);
+
+       /*
+        * We have too many pagers; try to trim some unused ones,
+        * starting with the oldest pager at the end of the queue.
+        */
+       queue_init(&trim_queue);
+       num_trim = 0;
+
+       for (pager = (shared_region_pager_t)
+                    queue_last(&shared_region_pager_queue);
+            !queue_end(&shared_region_pager_queue,
+                       (queue_entry_t) pager);
+            pager = prev_pager) {
+               /* get prev elt before we dequeue */
+               prev_pager = (shared_region_pager_t)
+                       queue_prev(&pager->pager_queue);
+
+               if (pager->ref_count == 2 &&
+                   pager->is_ready &&
+                   !pager->is_mapped) {
+                       /* this pager can be trimmed */
+                       num_trim++;
+                       /* remove this pager from the main list ... */
+                       shared_region_pager_dequeue(pager);
+                       /* ... and add it to our trim queue */
+                       queue_enter_first(&trim_queue,
+                                         pager,
+                                         shared_region_pager_t,
+                                         pager_queue);
+
+                       count_unmapped = (shared_region_pager_count -
+                                         shared_region_pager_count_mapped);
+                       if (count_unmapped <= shared_region_pager_cache_limit) {
+                               /* we have enough pagers to trim */
+                               break;
+                       }
+               }
+       }
+       if (num_trim > shared_region_pager_num_trim_max) {
+               shared_region_pager_num_trim_max = num_trim;
+       }
+       shared_region_pager_num_trim_total += num_trim;
+
+       lck_mtx_unlock(&shared_region_pager_lock);
+
+       /* terminate the trimmed pagers */
+       while (!queue_empty(&trim_queue)) {
+               queue_remove_first(&trim_queue,
+                                  pager,
+                                  shared_region_pager_t,
+                                  pager_queue);
+               pager->pager_queue.next = NULL;
+               pager->pager_queue.prev = NULL;
+               assert(pager->ref_count == 2);
+               /*
+                * We can't call deallocate_internal() because the pager
+                * has already been dequeued, but we still need to remove
+                * a reference.
+                */
+               pager->ref_count--;
+               shared_region_pager_terminate_internal(pager);
+       }
+}
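
Note that with shared_region_pager_cache_limit initialized to 0 earlier in this file, the practical effect (as read from the code, not a documented guarantee) is that unmapped pagers are not cached at all:

    /* count_unmapped = shared_region_pager_count - shared_region_pager_count_mapped;
     * with shared_region_pager_cache_limit == 0, any count_unmapped > 0 makes
     * deallocate_internal() call shared_region_pager_trim(), which terminates every
     * ready, unmapped pager that has exactly two references left. */
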
index f24307d741ca112c5ad777f8a03298f4b9260442..489297724c2430eef99e012ce3262eeffe82206c 100644 (file)
@@ -466,7 +466,7 @@ done:
                kr = vm_map_remove(kernel_map,
                                   kernel_mapping,
                                   kernel_mapping + PAGE_SIZE_64,
-                                  VM_MAP_NO_FLAGS);
+                                  VM_MAP_REMOVE_NO_FLAGS);
                assert(kr == KERN_SUCCESS);
                kernel_mapping = 0;
                dst_vaddr = 0;
index a1e2c51c32e0f4c226cf8b75a3a7676f9d05058c..93e1374e6133131725ec8c11d34f6e6e77c08a2f 100644 (file)
 
 #include <mach/host_priv_server.h>
 #include <mach/mach_vm_server.h>
+#include <mach/memory_entry_server.h>
 #include <mach/vm_map_server.h>
 
 #include <kern/host.h>
 
 #include <san/kasan.h>
 
+#include <libkern/OSDebug.h>
+
 vm_size_t        upl_offset_to_pagelist = 0;
 
 #if    VM_CPM
@@ -318,12 +321,12 @@ mach_vm_deallocate(
        if (size == (mach_vm_offset_t) 0)
                return(KERN_SUCCESS);
 
-       return(vm_map_remove(map,
+       return vm_map_remove(map,
                             vm_map_trunc_page(start,
                                               VM_MAP_PAGE_MASK(map)),
                             vm_map_round_page(start+size,
                                               VM_MAP_PAGE_MASK(map)),
-                            VM_MAP_NO_FLAGS));
+                            VM_MAP_REMOVE_NO_FLAGS);
 }
 
 /*
@@ -344,12 +347,12 @@ vm_deallocate(
        if (size == (vm_offset_t) 0)
                return(KERN_SUCCESS);
 
-       return(vm_map_remove(map,
+       return vm_map_remove(map,
                             vm_map_trunc_page(start,
                                               VM_MAP_PAGE_MASK(map)),
                             vm_map_round_page(start+size,
                                               VM_MAP_PAGE_MASK(map)),
-                            VM_MAP_NO_FLAGS));
+                            VM_MAP_REMOVE_NO_FLAGS);
 }
 
 /*
@@ -594,7 +597,8 @@ vm_read(
        if (map == VM_MAP_NULL)
                return(KERN_INVALID_ARGUMENT);
 
-       if (size > (unsigned)(mach_msg_type_number_t) -1) {
+       mach_msg_type_number_t dsize;
+       if (os_convert_overflow(size, &dsize)) {
                /*
                 * The kernel could handle a 64-bit "size" value, but
                 * it could not return the size of the data in "*data_size"
@@ -612,7 +616,7 @@ vm_read(
 
        if (KERN_SUCCESS == error) {
                *data = (pointer_t) ipc_address;
-               *data_size = (mach_msg_type_number_t) size;
+               *data_size = dsize;
                assert(*data_size == size);
        }
        return(error);
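
The hunk above replaces the open-coded range check in vm_read() with an overflow-checked narrowing conversion. A minimal sketch of the pattern, assuming os_convert_overflow() behaves as used above (returning non-zero when the value does not fit in the destination type); get_requested_size() is a hypothetical stand-in for the 64-bit input:

    uint64_t size = get_requested_size();   /* hypothetical 64-bit input */
    mach_msg_type_number_t dsize;

    if (os_convert_overflow(size, &dsize)) {
            /* size does not fit in mach_msg_type_number_t: reject the request */
            return KERN_INVALID_ARGUMENT;
    }
    /* dsize now holds the same value, safely narrowed */
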
@@ -938,7 +942,7 @@ kern_return_t
 mach_vm_map_external(
        vm_map_t                target_map,
        mach_vm_offset_t        *address,
-       mach_vm_size_t  initial_size,
+       mach_vm_size_t          initial_size,
        mach_vm_offset_t        mask,
        int                     flags,
        ipc_port_t              port,
@@ -951,8 +955,11 @@ mach_vm_map_external(
        vm_tag_t tag;
 
        VM_GET_FLAGS_ALIAS(flags, tag);
-       return (mach_vm_map_kernel(target_map, address, initial_size, mask, flags, tag, port,
-                                       offset, copy, cur_protection, max_protection, inheritance));
+       return (mach_vm_map_kernel(target_map, address, initial_size, mask,
+                                  flags, VM_MAP_KERNEL_FLAGS_NONE, tag,
+                                  port, offset, copy,
+                                  cur_protection, max_protection,
+                                  inheritance));
 }
 
 kern_return_t
@@ -962,6 +969,7 @@ mach_vm_map_kernel(
        mach_vm_size_t  initial_size,
        mach_vm_offset_t        mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_object_offset_t      offset,
@@ -984,7 +992,7 @@ mach_vm_map_kernel(
                                     initial_size,
                                     mask,
                                     flags,
-                                    VM_MAP_KERNEL_FLAGS_NONE,
+                                    vmk_flags,
                                     tag,
                                     port,
                                     offset,
@@ -1022,8 +1030,11 @@ vm_map_64_external(
        vm_tag_t tag;
 
        VM_GET_FLAGS_ALIAS(flags, tag);
-       return (vm_map_64_kernel(target_map, address, size, mask, flags, tag, port, offset,
-                                   copy, cur_protection, max_protection, inheritance));
+       return (vm_map_64_kernel(target_map, address, size, mask,
+                                flags, VM_MAP_KERNEL_FLAGS_NONE,
+                                tag, port, offset, copy,
+                                cur_protection, max_protection,
+                                inheritance));
 }
 
 kern_return_t
@@ -1033,6 +1044,7 @@ vm_map_64_kernel(
        vm_size_t               size,
        vm_offset_t             mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_object_offset_t      offset,
@@ -1050,9 +1062,10 @@ vm_map_64_kernel(
        map_size = (mach_vm_size_t)size;
        map_mask = (mach_vm_offset_t)mask;
 
-       kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, flags, tag,
-                        port, offset, copy, 
-                        cur_protection, max_protection, inheritance);
+       kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask,
+                               flags, vmk_flags, tag,
+                               port, offset, copy,
+                               cur_protection, max_protection, inheritance);
        *address = CAST_DOWN(vm_offset_t, map_addr);
        return kr;
 }
@@ -1075,7 +1088,10 @@ vm_map_external(
        vm_tag_t tag;
 
        VM_GET_FLAGS_ALIAS(flags, tag);
-       return (vm_map_kernel(target_map, address, size, mask, flags, tag,  port, offset, copy, cur_protection, max_protection, inheritance));
+       return (vm_map_kernel(target_map, address, size, mask,
+                             flags, VM_MAP_KERNEL_FLAGS_NONE, tag,
+                             port, offset, copy,
+                             cur_protection, max_protection, inheritance));
 }
 
 kern_return_t
@@ -1085,6 +1101,7 @@ vm_map_kernel(
        vm_size_t               size,
        vm_offset_t             mask,
        int                     flags,
+       vm_map_kernel_flags_t   vmk_flags,
        vm_tag_t                tag,
        ipc_port_t              port,
        vm_offset_t             offset,
@@ -1104,9 +1121,10 @@ vm_map_kernel(
        map_mask = (mach_vm_offset_t)mask;
        obj_offset = (vm_object_offset_t)offset;
 
-       kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask, flags, tag,
-                        port, obj_offset, copy, 
-                        cur_protection, max_protection, inheritance);
+       kr = mach_vm_map_kernel(target_map, &map_addr, map_size, map_mask,
+                               flags, vmk_flags, tag,
+                               port, obj_offset, copy,
+                               cur_protection, max_protection, inheritance);
        *address = CAST_DOWN(vm_offset_t, map_addr);
        return kr;
 }
@@ -2208,8 +2226,6 @@ mach_make_memory_entry_64(
                                               parent_handle);
 }
 
-extern int pacified_purgeable_iokit;
-
 kern_return_t
 mach_make_memory_entry_internal(
        vm_map_t                target_map,
@@ -2231,7 +2247,7 @@ mach_make_memory_entry_internal(
        boolean_t               iskernel;
        vm_object_offset_t      obj_off;
        vm_prot_t               prot;
-       struct vm_object_fault_info     fault_info;
+       struct vm_object_fault_info     fault_info = {};
        vm_object_t             object;
        vm_object_t             shadow_object;
 
@@ -2259,6 +2275,9 @@ mach_make_memory_entry_internal(
        boolean_t               force_shadow = FALSE;
        boolean_t               use_data_addr;
        boolean_t               use_4K_compat;
+#if VM_NAMED_ENTRY_LIST
+       int                     alias = -1;
+#endif /* VM_NAMED_ENTRY_LIST */
 
        if ((permission & MAP_MEM_FLAGS_MASK) & ~MAP_MEM_FLAGS_ALL) {
                /*
@@ -2267,7 +2286,7 @@ mach_make_memory_entry_internal(
                return KERN_INVALID_VALUE;
        }
 
-       if (parent_handle != IP_NULL &&
+       if (IP_VALID(parent_handle) &&
            ip_kotype(parent_handle) == IKOT_NAMED_ENTRY) {
                parent_entry = (vm_named_entry_t) parent_handle->ip_kobject;
        } else {
@@ -2356,6 +2375,8 @@ mach_make_memory_entry_internal(
                assert(object != VM_OBJECT_NULL);
 
                if (permission & MAP_MEM_PURGABLE) {
+                       task_t owner;
+
                        if (! (permission & VM_PROT_WRITE)) {
                                /* if we can't write, we can't purge */
                                vm_object_deallocate(object);
@@ -2366,27 +2387,34 @@ mach_make_memory_entry_internal(
                        if (permission & MAP_MEM_PURGABLE_KERNEL_ONLY) {
                                object->purgeable_only_by_kernel = TRUE;
                        }
-                       assert(object->vo_purgeable_owner == NULL);
+                       assert(object->vo_owner == NULL);
                        assert(object->resident_page_count == 0);
                        assert(object->wired_page_count == 0);
                        vm_object_lock(object);
-                       if (pacified_purgeable_iokit) {
-                               if (permission & MAP_MEM_LEDGER_TAG_NETWORK) {
-                                       vm_purgeable_nonvolatile_enqueue(object,
-                                                                        kernel_task);
-                               } else {
-                                       vm_purgeable_nonvolatile_enqueue(object,
-                                                                        current_task());
-                               }
-                       } else {
-                               if (object->purgeable_only_by_kernel) {
-                                       vm_purgeable_nonvolatile_enqueue(object,
-                                                                        kernel_task);
-                               } else {
-                                       vm_purgeable_nonvolatile_enqueue(object,
-                                                                        current_task());
-                               }
+                       owner = current_task();
+#if __arm64__
+                       if (owner->task_legacy_footprint) {
+                               /*
+                                * For ios11, we failed to account for
+                                * this memory.  Keep doing that for
+                                * legacy apps (built before ios12),
+                                * for backwards compatibility's sake...
+                                */
+                               owner = kernel_task;
                        }
+#endif /* __arm64__ */
+                       vm_purgeable_nonvolatile_enqueue(object, owner);
+                       vm_object_unlock(object);
+               }
+
+               if (permission & MAP_MEM_LEDGER_TAG_NETWORK) {
+                       /* make this object owned by the calling task */
+                       vm_object_lock(object);
+                       vm_object_ownership_change(
+                               object,
+                               VM_OBJECT_LEDGER_TAG_NETWORK,
+                               current_task(), /* new owner */
+                               FALSE); /* task_objq locked? */
                        vm_object_unlock(object);
                }
 
@@ -2740,6 +2768,10 @@ redo_lookup:
                   }
                }
 
+#if VM_NAMED_ENTRY_LIST
+               alias = VME_ALIAS(map_entry);
+#endif /* VM_NAMED_ENTRY_LIST */
+
                /*
                 * We found the VM map entry, lock the VM object again.
                 */
@@ -2869,7 +2901,8 @@ redo_lookup:
                                object->vo_size >
                                vm_map_round_page(total_size,
                                                  VM_MAP_PAGE_MASK(target_map)))))
-                            && !object->true_share)) {
+                            && !object->true_share
+                            && object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC)) {
                                /*
                                 * We have to unlock the VM object before
                                 * trying to upgrade the VM map lock, to
@@ -3089,6 +3122,9 @@ redo_lookup:
                user_entry->protection = protections;
                SET_MAP_MEM(GET_MAP_MEM(permission), user_entry->protection);
                user_entry->size = map_size;
+#if VM_NAMED_ENTRY_LIST
+               user_entry->named_entry_alias = alias;
+#endif /* VM_NAMED_ENTRY_LIST */
 
                /* user_object pager and internal fields are not used */
                /* when the object field is filled in.                */
@@ -3173,7 +3209,7 @@ redo_lookup:
                if(parent_entry->is_sub_map) {
                   user_entry->backing.map = parent_entry->backing.map;
                   vm_map_lock(user_entry->backing.map);
-                  user_entry->backing.map->ref_count++;
+                  user_entry->backing.map->map_refcnt++;
                   vm_map_unlock(user_entry->backing.map);
                } else {
                   object = parent_entry->backing.object;
@@ -3277,10 +3313,9 @@ task_wire(
        if (map == VM_MAP_NULL)
                return(KERN_INVALID_ARGUMENT);
 
-       if (must_wire)
-               map->wiring_required = TRUE;
-       else
-               map->wiring_required = FALSE;
+       vm_map_lock(map);
+       map->wiring_required = (must_wire == TRUE);
+       vm_map_unlock(map);
 
        return(KERN_SUCCESS);
 }
@@ -3299,6 +3334,27 @@ vm_map_exec_lockdown(
        return(KERN_SUCCESS);
 }
 
+#if VM_NAMED_ENTRY_LIST
+queue_head_t   vm_named_entry_list;
+int            vm_named_entry_count = 0;
+lck_mtx_t      vm_named_entry_list_lock_data;
+lck_mtx_ext_t  vm_named_entry_list_lock_data_ext;
+#endif /* VM_NAMED_ENTRY_LIST */
+
+void vm_named_entry_init(void);
+void
+vm_named_entry_init(void)
+{
+#if VM_NAMED_ENTRY_LIST
+       queue_init(&vm_named_entry_list);
+       vm_named_entry_count = 0;
+       lck_mtx_init_ext(&vm_named_entry_list_lock_data,
+                        &vm_named_entry_list_lock_data_ext,
+                        &vm_object_lck_grp,
+                        &vm_object_lck_attr);
+#endif /* VM_NAMED_ENTRY_LIST */
+}
+
 __private_extern__ kern_return_t
 mach_memory_entry_allocate(
        vm_named_entry_t        *user_entry_p,
@@ -3311,6 +3367,7 @@ mach_memory_entry_allocate(
        user_entry = (vm_named_entry_t) kalloc(sizeof *user_entry);
        if (user_entry == NULL)
                return KERN_FAILURE;
+       bzero(user_entry, sizeof (*user_entry));
 
        named_entry_lock_init(user_entry);
 
@@ -3325,10 +3382,6 @@ mach_memory_entry_allocate(
        user_handle->ip_sorights++;
        ip_reference(user_handle);
 
-       user_handle->ip_destination = IP_NULL;
-       user_handle->ip_receiver_name = MACH_PORT_NULL;
-       user_handle->ip_receiver = ipc_space_kernel;
-
        /* make a send right */
         user_handle->ip_mscount++;
         user_handle->ip_srights++;
@@ -3353,6 +3406,21 @@ mach_memory_entry_allocate(
        *user_entry_p = user_entry;
        *user_handle_p = user_handle;
 
+#if VM_NAMED_ENTRY_LIST
+       /* keep a loose (no reference) pointer to the Mach port, for debugging only */
+       user_entry->named_entry_port = user_handle;
+       /* backtrace at allocation time, for debugging only */
+       OSBacktrace(&user_entry->named_entry_bt[0],
+                   NAMED_ENTRY_BT_DEPTH);
+
+       /* add this new named entry to the global list */
+       lck_mtx_lock_spin(&vm_named_entry_list_lock_data);
+       queue_enter(&vm_named_entry_list, user_entry,
+                   vm_named_entry_t, named_entry_list);
+       vm_named_entry_count++;
+       lck_mtx_unlock(&vm_named_entry_list_lock_data);
+#endif /* VM_NAMED_ENTRY_LIST */
+
        return KERN_SUCCESS;
 }
 
@@ -3454,7 +3522,7 @@ memory_entry_purgeable_control_internal(
        vm_named_entry_t        mem_entry;
        vm_object_t             object;
 
-       if (entry_port == IP_NULL ||
+       if (!IP_VALID(entry_port) ||
            ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -3503,6 +3571,69 @@ memory_entry_purgeable_control_internal(
        return kr;
 }
 
+kern_return_t
+mach_memory_entry_access_tracking(
+       ipc_port_t      entry_port,
+       int             *access_tracking,
+       uint32_t        *access_tracking_reads,
+       uint32_t        *access_tracking_writes)
+{
+       return memory_entry_access_tracking_internal(entry_port,
+                                                    access_tracking,
+                                                    access_tracking_reads,
+                                                    access_tracking_writes);
+}
+
+kern_return_t
+memory_entry_access_tracking_internal(
+       ipc_port_t      entry_port,
+       int             *access_tracking,
+       uint32_t        *access_tracking_reads,
+       uint32_t        *access_tracking_writes)
+{
+       vm_named_entry_t        mem_entry;
+       vm_object_t             object;
+       kern_return_t           kr;
+
+       if (!IP_VALID(entry_port) ||
+           ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       mem_entry = (vm_named_entry_t) entry_port->ip_kobject;
+
+       named_entry_lock(mem_entry);
+
+       if (mem_entry->is_sub_map ||
+           mem_entry->is_copy) {
+               named_entry_unlock(mem_entry);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+       object = mem_entry->backing.object;
+       if (object == VM_OBJECT_NULL) {
+               named_entry_unlock(mem_entry);
+               return KERN_INVALID_ARGUMENT;
+       }
+
+#if VM_OBJECT_ACCESS_TRACKING
+       vm_object_access_tracking(object,
+                                 access_tracking,
+                                 access_tracking_reads,
+                                 access_tracking_writes);
+       kr = KERN_SUCCESS;
+#else /* VM_OBJECT_ACCESS_TRACKING */
+       (void) access_tracking;
+       (void) access_tracking_reads;
+       (void) access_tracking_writes;
+       kr = KERN_NOT_SUPPORTED;
+#endif /* VM_OBJECT_ACCESS_TRACKING */
+
+       named_entry_unlock(mem_entry);
+
+       return kr;
+}
+
 kern_return_t
 mach_memory_entry_get_page_counts(
        ipc_port_t      entry_port,
@@ -3515,7 +3646,7 @@ mach_memory_entry_get_page_counts(
        vm_object_offset_t      offset;
        vm_object_size_t        size;
 
-       if (entry_port == IP_NULL ||
+       if (!IP_VALID(entry_port) ||
            ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -3603,6 +3734,15 @@ mach_destroy_memory_entry(
                named_entry_unlock(named_entry);
                named_entry_lock_destroy(named_entry);
 
+#if VM_NAMED_ENTRY_LIST
+               lck_mtx_lock_spin(&vm_named_entry_list_lock_data);
+               queue_remove(&vm_named_entry_list, named_entry,
+                            vm_named_entry_t, named_entry_list);
+               assert(vm_named_entry_count > 0);
+               vm_named_entry_count--;
+               lck_mtx_unlock(&vm_named_entry_list_lock_data);
+#endif /* VM_NAMED_ENTRY_LIST */
+
                kfree((void *) port->ip_kobject,
                      sizeof (struct vm_named_entry));
        } else
@@ -3624,7 +3764,7 @@ mach_memory_entry_page_op(
        vm_object_t             object;
        kern_return_t           kr;
 
-       if (entry_port == IP_NULL ||
+       if (!IP_VALID(entry_port) ||
            ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -3677,7 +3817,7 @@ mach_memory_entry_range_op(
        vm_object_t             object;
        kern_return_t           kr;
 
-       if (entry_port == IP_NULL ||
+       if (!IP_VALID(entry_port) ||
            ip_kotype(entry_port) != IKOT_NAMED_ENTRY) {
                return KERN_INVALID_ARGUMENT;
        }
@@ -4173,7 +4313,10 @@ vm_map(
        vm_tag_t tag;
 
        VM_GET_FLAGS_ALIAS(flags, tag);
-       return (vm_map_kernel(target_map, address, size, mask, flags, tag,  port, offset, copy, cur_protection, max_protection, inheritance));
+       return vm_map_kernel(target_map, address, size, mask,
+                            flags, VM_MAP_KERNEL_FLAGS_NONE, tag,
+                            port, offset, copy,
+                            cur_protection, max_protection, inheritance);
 }
 
 #endif /* __x86_64__ */
index baf70d7bada7f191cdb1ecb402dcb856c1d002d8..c9cd2e80754ce2f16e4bbe98be039537e189fee6 100644 (file)
 #include <sys/kdebug.h>
 #include <IOKit/IOBSD.h>
 #include <mach/mach_voucher_attr_control.h>
+#include <pthread/priority_private.h>
 
 ipc_voucher_attr_control_t  ipc_pthread_priority_voucher_attr_control;    /* communication channel from PTHPRIORITY to voucher system */
 
 #define IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(x) ((mach_voucher_attr_value_handle_t)(x))
 #define HANDLE_TO_IPC_PTHREAD_PRIORITY_VALUE(x) ((ipc_pthread_priority_value_t)(x))
 
-extern unsigned long pthread_priority_canonicalize(unsigned long priority, boolean_t for_propagation);
-
 kern_return_t
 ipc_pthread_priority_release_value(
        ipc_voucher_attr_manager_t __assert_only manager,
@@ -200,8 +199,8 @@ ipc_pthread_priority_get_value(
                }
 
                /* Callout to pthread kext to get the canonicalized value */
-               canonicalize_priority_value = (ipc_pthread_priority_value_t) pthread_priority_canonicalize(
-                                               (unsigned long)ipc_pthread_priority_value, true);
+               canonicalize_priority_value = (ipc_pthread_priority_value_t)
+                               _pthread_priority_normalize_for_ipc((unsigned long)ipc_pthread_priority_value);
 
                *out_value = IPC_PTHREAD_PRIORITY_VALUE_TO_HANDLE(canonicalize_priority_value);
                *out_flags = MACH_VOUCHER_ATTR_VALUE_FLAGS_PERSIST;
index dd4b936702bb1a1c6a75482b07f1d349a2e1ac86..aae293b6c113fb5b9d935cbd143c8c1359b3654c 100644 (file)
@@ -44,7 +44,8 @@
 
 #include <kern/copyout_shim.h>
 
-
+#undef copyin
+#undef copyout
 
 static int copyio(int, user_addr_t, char *, vm_size_t, vm_size_t *, int);
 static int copyio_phys(addr64_t, addr64_t, vm_size_t, int);
@@ -80,6 +81,8 @@ extern int _bcopy(const void *, void *, vm_size_t);
 extern int _bcopystr(const void *, void *, vm_size_t, vm_size_t *);
 extern int _copyin_word(const char *src, uint64_t *dst, vm_size_t len);
 
+/* On by default, optionally disabled by boot-arg */
+extern boolean_t copyio_zalloc_check;
 
 /*
  * Types of copies:
@@ -166,6 +169,7 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
        int             debug_type = 0xeff70010;
        debug_type += (copy_type << 2);
 #endif
+       vm_size_t kernel_buf_size = 0;
 
        if (__improbable(nbytes > copysize_limit_panic))
                panic("%s(%p, %p, %lu) - transfer too large", __func__,
@@ -177,13 +181,19 @@ copyio(int copy_type, user_addr_t user_addr, char *kernel_addr,
        if (__improbable(nbytes == 0))
                goto out;
 
-        pmap = thread->map->pmap;
-       boolean_t nopagezero = pmap->pagezero_accessible;
+       pmap = thread->map->pmap;
+       boolean_t nopagezero = thread->map->pmap->pagezero_accessible;
 
-       if (__improbable((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS) && ((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS))) {
-               panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr);
+       if ((copy_type != COPYINPHYS) && (copy_type != COPYOUTPHYS)) {
+               if (__improbable((vm_offset_t)kernel_addr < VM_MIN_KERNEL_AND_KEXT_ADDRESS))
+                       panic("Invalid copy parameter, copy type: %d, kernel address: %p", copy_type, kernel_addr);
+               if (__probable(copyio_zalloc_check)) {
+                       kernel_buf_size = zone_element_size(kernel_addr, NULL);
+                       if (__improbable(kernel_buf_size && kernel_buf_size < nbytes))
+                               panic("copyio: kernel buffer %p has size %lu < nbytes %lu", kernel_addr, kernel_buf_size, nbytes);
+               }
        }
-
+       
        /* Sanity and security check for addresses to/from a user */
 
        if (__improbable(((pmap != kernel_pmap) && (use_kernel_map == 0)) &&
@@ -371,7 +381,7 @@ copyinmsg(const user_addr_t user_addr, char *kernel_addr, mach_msg_size_t nbytes
 }    
 
 int
-copyin(const user_addr_t user_addr, char *kernel_addr, vm_size_t nbytes)
+copyin(const user_addr_t user_addr, void *kernel_addr, vm_size_t nbytes)
 {
     return copyio(COPYIN, user_addr, kernel_addr, nbytes, NULL, 0);
 }
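The copyio() hunk above adds a sanity check, gated by the copyio_zalloc_check flag that the comment notes is on by default: when the kernel-side buffer is zone-backed, zone_element_size() reports the element size (and, judging by the check, 0 for non-zone memory), and a copy larger than that element would overflow the allocation, so the kernel panics instead. A minimal user-space analogue of the check; element_size_of() and the 64-byte "zone" are illustrative stand-ins:

#include <assert.h>
#include <stdio.h>
#include <string.h>

static char zone_buf[64];   /* pretend this came from a 64-byte zone element */

/* Stand-in for zone_element_size(): the element size when the buffer is
 * zone-backed, 0 otherwise (the copyio check only fires for non-zero sizes). */
static size_t element_size_of(const void *buf)
{
        return (buf == (const void *)zone_buf) ? sizeof(zone_buf) : 0;
}

static void checked_copy(void *kernel_addr, const void *src, size_t nbytes)
{
        size_t kernel_buf_size = element_size_of(kernel_addr);

        if (kernel_buf_size != 0 && kernel_buf_size < nbytes) {
                /* copyio() panics here; user space can only abort */
                fprintf(stderr, "buffer %p has size %zu < nbytes %zu\n",
                        kernel_addr, kernel_buf_size, nbytes);
                assert(0 && "copy would overflow the backing allocation");
        }
        memcpy(kernel_addr, src, nbytes);
}

int main(void)
{
        char src[128] = "hello";
        checked_copy(zone_buf, src, 32);    /* fits: 32 <= 64 */
        checked_copy(zone_buf, src, 128);   /* too large: trips the check */
        return 0;
}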
index c0574bf8c645e956f2d7f5658b1c702942f7b997..cb72f459e30f897538cb9abbfde84d929f6e18c5 100644 (file)
@@ -118,7 +118,7 @@ Entry(Switch_context)
 
 
 Entry(Thread_continue)
-       movq    %rax, %rdi                      /* load thread argument */
+       movq    %rax, %rdi                      /* this is the old thread from Switch_context */
        xorq    %rbp,%rbp                       /* zero frame pointer */
        call    *%rbx                           /* call real continuation */
 
index 54a43ec98f2aca958837461a41d017026a2fd450..b1a1ada4c894b0d45f61a2cf53de10b92ac0fbd4 100644 (file)
 .section       __HIB, __desc
 .globl EXT(idt64_hndl_table0)
 EXT(idt64_hndl_table0):
-       .quad EXT(ks_dispatch)
-       .quad EXT(ks_64bit_return)
-       .quad 0 /* Populated with CPU shadow displacement*/
-       .quad EXT(ks_return)
+/* 0x00 */     .quad EXT(ks_dispatch)
+/* 0x08 */     .quad EXT(ks_64bit_return)
+/* 0x10 */     .quad 0 /* Populated with CPU shadow displacement*/
+/* 0x18 */     .quad EXT(ks_return)
+#define        TBL0_OFF_DISP_USER_WITH_POPRAX  0x20
+/* 0x20 */     .quad EXT(ks_dispatch_user_with_pop_rax)
+#define        TBL0_OFF_DISP_KERN_WITH_POPRAX  0x28
+/* 0x28 */     .quad EXT(ks_dispatch_kernel_with_pop_rax)
+#define        TBL0_OFF_PTR_KERNEL_STACK_MASK  0x30
+/* 0x30 */     .quad 0 /* &kernel_stack_mask */
 
 EXT(idt64_hndl_table1):
        .quad   EXT(hndl_allintrs)
@@ -217,19 +223,19 @@ Entry(idt64_unix_scall)
        pushq   %rax                    /* save system call number */
        pushq   $(HNDL_UNIX_SCALL)
        pushq   $(UNIX_INT)
-       jmp     L_dispatch
+       jmp     L_u64bit_entry_check
        
 Entry(idt64_mach_scall)
        pushq   %rax                    /* save system call number */
        pushq   $(HNDL_MACH_SCALL)
        pushq   $(MACH_INT)
-       jmp     L_dispatch
+       jmp     L_u64bit_entry_check
        
 Entry(idt64_mdep_scall)
        pushq   %rax                    /* save system call number */
        pushq   $(HNDL_MDEP_SCALL)
        pushq   $(MACHDEP_INT)
-       jmp     L_dispatch
+       jmp     L_u64bit_entry_check
 
 /*
  * For GP/NP/SS faults, we use the IST1 stack.
@@ -283,29 +289,95 @@ Entry(idt64_mc)
  * Machine checks, doublefaults and similar use IST1
  */
 Entry(idt64_nmi)
-       /* Synthesize common interrupt stack frame */
-       pushq   $0
-       pushq   $(HNDL_ALLINTRS)
-       pushq   $(T_NMI)
-       /* Spill prior to RDMSR */
        push    %rax
        push    %rcx
        push    %rdx
+       testb   $3, ISF64_CS(%rsp)
+       jz      1f
+
+       /* From user-space: copy interrupt state to user PCB */
+       swapgs
+
+       leaq    EXT(idt64_hndl_table0)(%rip), %rax
+       mov     16(%rax), %rax /* Offset of per-CPU shadow */
+       mov     %gs:CPU_TASK_CR3(%rax), %rax
+       mov     %rax, %cr3                      /* note that SMAP is enabled in L_common_dispatch (on Broadwell+) */
+
+       mov     %gs:CPU_UBER_ISF, %rcx          /* PCB stack addr */
+       add     $(ISF64_SIZE), %rcx             /* adjust to base of ISF */
+
+       leaq    TBL0_OFF_DISP_USER_WITH_POPRAX+EXT(idt64_hndl_table0)(%rip), %rax               /* ks_dispatch_user_with_pop_rax */
+       jmp     4f                                              /* Copy state to PCB */
+
+1:
+       /*
+        * From kernel-space:
+        * Determine whether the kernel or user GS is set.
+        * Sets the high 32 bits of the return CS to 1 to ensure that we'll swapgs back correctly at IRET.
+        */
        mov     $(MSR_IA32_GS_BASE), %ecx
-       rdmsr                                   /* Check contents of GSBASE MSR */
-       test    $0x80000000, %edx               /* MSB set? Already swapped to kernel's */
-       jnz     44f
-       swapgs                                  /* Either direct from user or within trampolines */
-44:
-       pop     %rdx
-       pop     %rcx
+       rdmsr                                   /* read kernel gsbase */
+       test    $0x80000000, %edx               /* test MSB of address */
+       jnz     2f
+       swapgs                                  /* so swap */
+       movl    $1, ISF64_CS+4(%rsp)            /* and set flag in CS slot */
+2:
 
        leaq    EXT(idt64_hndl_table0)(%rip), %rax
        mov     16(%rax), %rax /* Offset of per-CPU shadow */
-       mov     %gs:CPU_KERNEL_CR3(%rax), %rax
+       mov     %cr3, %rdx
+       mov     %gs:CPU_TASK_CR3(%rax), %rax
        mov     %rax, %cr3 /* Unconditionally switch to primary kernel pagetables */
-       leaq    EXT(idt64_hndl_table0)(%rip), %rax
-       jmp     *(%rax)
+
+       /*
+        * Determine whether we're on the kernel or interrupt stack
+        * when the NMI hit.
+        */
+       mov     ISF64_RSP(%rsp), %rcx
+       mov     %gs:CPU_KERNEL_STACK, %rax
+       xor     %rcx, %rax
+       movq    TBL0_OFF_PTR_KERNEL_STACK_MASK+EXT(idt64_hndl_table0)(%rip), %rdx
+       mov     (%rdx), %rdx            /* Load kernel_stack_mask */
+       and     %rdx, %rax
+       test    %rax, %rax              /* are we on the kernel stack? */
+       jz      3f                      /* yes */
+
+       mov     %gs:CPU_INT_STACK_TOP, %rax
+       cmp     %rcx, %rax              /* are we on the interrupt stack? */
+       jb      5f                      /* no */
+       leaq    -INTSTACK_SIZE(%rax), %rax
+       cmp     %rcx, %rax
+       jb      3f                      /* yes */
+5:
+       mov    %gs:CPU_KERNEL_STACK, %rcx
+3:
+       /* 16-byte-align kernel/interrupt stack for state push */
+       and     $0xFFFFFFFFFFFFFFF0, %rcx
+
+       leaq    TBL0_OFF_DISP_KERN_WITH_POPRAX+EXT(idt64_hndl_table0)(%rip), %rax               /* ks_dispatch_kernel_with_pop_rax */
+4:
+       /*
+        * Copy state from NMI stack (RSP) to the save area (RCX) which is
+        * the PCB for user or kernel/interrupt stack from kernel.
+        * ISF64_ERR(RSP)    saved RAX
+        * ISF64_TRAPFN(RSP) saved RCX
+        * ISF64_TRAPNO(RSP) saved RDX
+        */
+       xchg    %rsp, %rcx                      /* set for pushes */
+       push    ISF64_SS(%rcx)
+       push    ISF64_RSP(%rcx)
+       push    ISF64_RFLAGS(%rcx)
+       push    ISF64_CS(%rcx)
+       push    ISF64_RIP(%rcx)
+       /* Synthesize common interrupt stack frame */
+       push    $(0)                            /* error code 0 */
+       push    $(HNDL_ALLINTRS)                /* trapfn allintrs */
+       push    $(T_NMI)                        /* trapno T_NMI */
+       push    ISF64_ERR(%rcx)                 /* saved %rax is popped in ks_dispatch_{kernel|user}_with_pop_rax */
+       mov     ISF64_TRAPNO(%rcx), %rdx
+       mov     ISF64_TRAPFN(%rcx), %rcx
+
+       jmp     *(%rax)         /* ks_dispatch_{kernel|user}_with_pop_rax */
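The rewritten idt64_nmi path above decides where to spill the interrupted state: the user PCB when the NMI arrived from user mode, otherwise whichever kernel or interrupt stack the CPU was already on, falling back to the top of the kernel stack and 16-byte-aligning it. The "already on the kernel stack?" test is the xor-and-mask trick from the assembly; a small C rendering of both stack tests, with the mask, stack base and sizes as illustrative values only:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define INTSTACK_SIZE (4 * 4096)    /* illustrative */

/* Same test as "xor %rcx,%rax; and mask,%rax; test": two addresses are on the
 * same stack when they agree in every bit above the stack-size mask. */
static bool on_kernel_stack(uint64_t rsp, uint64_t kernel_stack, uint64_t kernel_stack_mask)
{
        return ((rsp ^ kernel_stack) & kernel_stack_mask) == 0;
}

/* The cmp/jb pair checks int_stack_top - INTSTACK_SIZE < rsp <= int_stack_top. */
static bool on_interrupt_stack(uint64_t rsp, uint64_t int_stack_top)
{
        return rsp <= int_stack_top && rsp > int_stack_top - INTSTACK_SIZE;
}

int main(void)
{
        uint64_t kstack = 0xffffff8012340000ull;    /* illustrative CPU_KERNEL_STACK value */
        uint64_t mask   = ~(uint64_t)(16384 - 1);   /* illustrative kernel_stack_mask */
        printf("same stack:  %d\n", on_kernel_stack(kstack + 0x100, kstack, mask));   /* 1 */
        printf("other stack: %d\n", on_kernel_stack(kstack + 0x10000, kstack, mask)); /* 0 */
        printf("int stack:   %d\n", on_interrupt_stack(kstack, kstack + 0x2000));     /* 1 */
        return 0;
}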
 
 Entry(idt64_double_fault)
        pushq   $(HNDL_DOUBLE_FAULT)
@@ -375,7 +447,7 @@ L_sysenter_continue:
        pushq   $(HNDL_SYSENTER)
        pushq   $(T_SYSENTER)
        orl     $(EFL_IF), ISF64_RFLAGS(%rsp)
-       jmp L_dispatch
+       jmp     L_u64bit_entry_check
 
 /*
  * Common dispatch point.
@@ -394,15 +466,15 @@ L_dispatch:
        swapgs
        leaq    EXT(idt64_hndl_table0)(%rip), %rax
        mov     16(%rax), %rax
-
+L_dispatch_kgsb:
        mov     %gs:CPU_TASK_CR3(%rax), %rax 
        mov     %rax, %cr3
 #if    DEBUG
        mov     %rax, %gs:CPU_ENTRY_CR3
 #endif
 1:
-       /* The text/data relationship here must be preserved in the doublemap, and the contents must be remapped */
        leaq    EXT(idt64_hndl_table0)(%rip), %rax
+       /* The text/data relationship here must be preserved in the doublemap, and the contents must be remapped */
        /* Indirect branch to non-doublemapped trampolines */
        jmp *(%rax)
 /* User return: register restoration and address space switch sequence */
@@ -461,6 +533,28 @@ L_sysret:
        pop     %r11
        pop     %rsp
        sysretq                         /* return from system call */
+
+L_u64bit_entry_check:
+       /*
+        * Check we're not a confused 64-bit user.
+        */
+       pushq   %rax
+       swapgs
+       leaq    EXT(idt64_hndl_table0)(%rip), %rax
+       mov     16(%rax), %rax
+
+       cmpl    $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP(%rax)
+       jne     L_64bit_entry_reject
+       jmp     L_dispatch_kgsb
+
+L_64bit_entry_reject:
+       /*
+        * Here for a 64-bit user attempting an invalid kernel entry.
+        */
+       movq    $(HNDL_ALLTRAPS), 8+ISF64_TRAPFN(%rsp)
+       movq    $(T_INVALID_OPCODE), 8+ISF64_TRAPNO(%rsp)
+       jmp     L_dispatch_kgsb
+
 /* End of double-mapped TEXT */
 .text
 
@@ -489,8 +583,12 @@ Entry(ks_dispatch)
        mov     %gs:CPU_UBER_TMP, %rax
        jmp     EXT(ks_dispatch_user)
 
+Entry(ks_dispatch_user_with_pop_rax)
+       pop     %rax
+       jmp     EXT(ks_dispatch_user)
+
 Entry (ks_return)
-     jmp .
+       jmp     .
 
 Entry(ks_dispatch_user)
        cmpl    $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP
@@ -503,6 +601,10 @@ L_dispatch_U64:
        mov     %gs:CPU_KERNEL_STACK, %rsp
        jmp     L_dispatch_64bit
 
+Entry(ks_dispatch_kernel_with_pop_rax)
+       pop     %rax
+       jmp     EXT(ks_dispatch_kernel)
+
 Entry(ks_dispatch_kernel)
        subq    $(ISS64_OFFSET), %rsp
        mov     %r15, R64_R15(%rsp)
@@ -517,8 +619,8 @@ L_dispatch_64bit:
        /*
         * Save segment regs - for completeness since they're not used.
         */
-       movl    %fs, R64_FS(%r15)
-       movl    %gs, R64_GS(%r15)
+       mov     %fs, R64_FS(%r15)
+       mov     %gs, R64_GS(%r15)
 
        /* Save general-purpose registers */
        mov     %rax, R64_RAX(%r15)
@@ -557,22 +659,6 @@ L_dispatch_64bit:
 
        jmp     L_common_dispatch
 
-L_64bit_entry_reject:
-       /*
-        * Here for a 64-bit user attempting an invalid kernel entry.
-        */
-       movq    $(HNDL_ALLTRAPS), ISF64_TRAPFN(%rsp)
-       movq    $(T_INVALID_OPCODE), ISF64_TRAPNO(%rsp)
-       jmp     L_dispatch_U64
-       
-Entry(ks_32bit_entry_check)
-       /*
-        * Check we're not a confused 64-bit user.
-        */
-       cmpl    $(TASK_MAP_32BIT), %gs:CPU_TASK_MAP
-       jne     L_64bit_entry_reject
-       /* fall through to 32-bit handler: */
-
 L_dispatch_U32: /* 32-bit user task */
        subq    $(ISS64_OFFSET), %rsp
        mov     %rsp, %r15
@@ -582,10 +668,10 @@ L_dispatch_U32: /* 32-bit user task */
        /*
         * Save segment regs
         */
-       movl    %ds, R32_DS(%r15)
-       movl    %es, R32_ES(%r15)
-       movl    %fs, R32_FS(%r15)
-       movl    %gs, R32_GS(%r15)
+       mov     %ds, R32_DS(%r15)
+       mov     %es, R32_ES(%r15)
+       mov     %fs, R32_FS(%r15)
+       mov     %gs, R32_GS(%r15)
 
        /*
         * Save general 32-bit registers
@@ -1113,73 +1199,6 @@ L_32bit_fault_set_seg:
        jmp     L_dispatch_U32_after_fault
 
 
-Entry(ks_idt64_nmi_kernel)
-       /* From user-space: copy interrupt state to user PCB */
-       swapgs
-       mov     %gs:CPU_UBER_ISF, %rcx          /* PCB stack addr */
-       add     $(ISF64_SIZE), %rcx             /* adjust to base of ISF */     
-       swapgs                                  /* swap back for L_dispatch */
-       jmp     4f                              /* Copy state to PCB */
-
-1:
-       /*
-       * From kernel-space:
-        * Determine whether the kernel or user GS is set.
-        * Set the kernel and ensure that we'll swap back correctly at IRET.
-        */
-       mov     $(MSR_IA32_GS_BASE), %ecx
-       rdmsr                                   /* read kernel gsbase */
-       test    $0x80000000, %edx               /* test MSB of address */
-       jne     2f
-       swapgs                                  /* so swap */
-       movl    $1, ISF64_CS+4(%rsp)            /* and set flag in CS slot */
-2:
-       /*
-        * Determine whether we're on the kernel or interrupt stack
-        * when the NMI hit.
-        */
-       mov     ISF64_RSP(%rsp), %rcx
-       mov     %gs:CPU_KERNEL_STACK, %rax
-       xor     %rcx, %rax
-       and     EXT(kernel_stack_mask)(%rip), %rax
-       test    %rax, %rax              /* are we on the kernel stack? */
-       je      3f                      /* yes */
-
-       mov     %gs:CPU_INT_STACK_TOP, %rax
-       dec     %rax                    /* intr stack top is byte above max */
-       xor     %rcx, %rax
-       and     EXT(kernel_stack_mask)(%rip), %rax
-       test    %rax, %rax              /* are we on the interrupt stack? */
-       je      3f                      /* yes */
-
-       mov    %gs:CPU_KERNEL_STACK, %rcx
-3:
-       /* 16-byte-align kernel/interrupt stack for state push */
-       and     $0xFFFFFFFFFFFFFFF0, %rcx
-
-4:
-       /*
-        * Copy state from NMI stack (RSP) to the save area (RCX) which is
-        * the PCB for user or kernel/interrupt stack from kernel.
-        * ISF64_ERR(RSP)    saved RAX
-        * ISF64_TRAPFN(RSP) saved RCX
-        * ISF64_TRAPNO(RSP) saved RDX
-        */
-       xchg    %rsp, %rcx                      /* set for pushes */
-       push    ISF64_SS(%rcx)
-       push    ISF64_RSP(%rcx)
-       push    ISF64_RFLAGS(%rcx)
-       push    ISF64_CS(%rcx)
-       push    ISF64_RIP(%rcx)
-       push    $(0)                            /* error code 0 */
-       push    $(HNDL_ALLINTRS)                /* trapfn allintrs */
-       push    $(T_NMI)                        /* trapno T_NMI */
-       mov     ISF64_ERR(%rcx), %rax
-       mov     ISF64_TRAPNO(%rcx), %rdx
-       mov     ISF64_TRAPFN(%rcx), %rcx
-       jmp     L_dispatch
-
-
 /* All 'exceptions' enter hndl_alltraps, with:
  *     r15     x86_saved_state_t address
  *     rsp     kernel stack if user-space, otherwise interrupt or kernel stack
index a34b77fb3c8bc30dee4c32fcd61c5f01fe3fd300..f24bbfa318655c25d97a709d5a298ed9379c2b56 100644 (file)
@@ -583,7 +583,14 @@ kpc_set_period_arch( struct kpc_config_remote *mp_config )
 void
 kpc_arch_init(void)
 {
-       /* No-op */
+       i386_cpu_info_t *info = cpuid_info();
+       uint8_t version_id = info->cpuid_arch_perf_leaf.version;
+       /*
+        * kpc only supports Intel PMU versions 2 and above.
+        */
+       if (version_id < 2) {
+               kpc_supported = false;
+       }
 }
 
 uint32_t
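Both the new kpc_arch_init() check above and mt_early_init() later in this commit gate on the Intel architectural PMU version, which CPUID leaf 0xA reports in EAX[7:0]; the deleted mt_init() read the same field directly via do_cpuid(0xA, ...) and cpuinfo[0] & 0xff. A small stand-alone sketch of that query for x86-64, using the GCC/Clang cpuid helper, illustrative only:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
        unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;

        if (!__get_cpuid(0xA, &eax, &ebx, &ecx, &edx)) {
                printf("CPUID leaf 0xA not supported\n");
                return 1;
        }
        unsigned int version = eax & 0xff;    /* architectural PMU version ID */
        printf("PMU version %u -> kpc/monotonic %s\n", version,
               version >= 2 ? "supported" : "unsupported");
        return 0;
}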
index af962f2f44238dc46b88b3fe701941950163045e..c0a1bdc90cde088a4d5a3440406f126ec61ed2ca 100644 (file)
@@ -175,18 +175,40 @@ ENTRY(_rtc_tsc_to_nanoseconds)
        shrdq   $32,%rdx,%rax                   /* %rdx:%rax >>= 32 */
        ret
     
-    
+
+/*
+ *  typedef void (*thread_continue_t)(void *param, wait_result_t)
+ *
+ *     void call_continuation( thread_continue_t continuation,
+ *                             void *param,
+ *                             wait_result_t wresult,
+ *                             bool enable_interrupts)

+ */
 
 Entry(call_continuation)
-       movq    %rdi,%rcx                       /* get continuation */
-       movq    %rsi,%rdi                       /* continuation param */
-       movq    %rdx,%rsi                       /* wait result */
+
+       movq    %rdi, %r12  /* continuation */
+    movq    %rsi, %r13  /* continuation param */
+    movq    %rdx, %r14  /* wait result */
+
        movq    %gs:CPU_KERNEL_STACK,%rsp       /* set the stack */
        xorq    %rbp,%rbp                       /* zero frame pointer */
+
+    test    %ecx, %ecx
+    jz 1f
+    mov     $1, %edi
+    call   _ml_set_interrupts_enabled
+1:
+
+       movq    %r12,%rcx                       /* continuation */
+       movq    %r13,%rdi                       /* continuation param */
+       movq    %r14,%rsi                       /* wait result */
+
        call    *%rcx                           /* call continuation */
        movq    %gs:CPU_ACTIVE_THREAD,%rdi
        call    EXT(thread_terminate)
 
+
 Entry(x86_init_wrapper)
        xor     %rbp, %rbp
        movq    %rsi, %rsp
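In C terms, the reworked call_continuation trampoline above now does roughly the following before jumping into the continuation. This is a pseudo-C sketch only: the externs are the kernel routines the assembly calls (prototypes simplified), and the stack switch itself cannot be expressed in C:

/* Pseudo-C sketch of Entry(call_continuation); not a real implementation. */
extern int  ml_set_interrupts_enabled(int enable);
extern void thread_terminate(void *thread);
extern void *current_thread(void);

typedef void (*thread_continue_t)(void *param, int wait_result);

void call_continuation_sketch(thread_continue_t continuation, void *param,
                              int wresult, int enable_interrupts)
{
        /* (the real code first switches %rsp to CPU_KERNEL_STACK and zeroes %rbp) */
        if (enable_interrupts) {
                ml_set_interrupts_enabled(1);        /* the new, optional step */
        }
        continuation(param, wresult);                /* continuations do not return... */
        thread_terminate(current_thread());          /* ...but if one does, end the thread */
}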
index 12c5c4b7627ca97a62af4ffedfcc78b2b8612e76..72056014868ed3493ff7098d3b38a0d8b904364d 100644 (file)
@@ -29,6 +29,7 @@
 #include <i386/cpu_data.h>
 #include <i386/cpuid.h>
 #include <i386/lapic.h>
+#include <i386/mp.h>
 #include <i386/proc_reg.h>
 #include <kern/assert.h> /* static_assert, assert */
 #include <kern/monotonic.h>
@@ -89,7 +90,7 @@ mt_core_snap(unsigned int ctr)
                return __builtin_ia32_rdpmc(PMC2_RD);
        default:
                panic("monotonic: invalid core counter read: %u", ctr);
-               __builtin_trap();
+               __builtin_unreachable();
        }
 }
 
@@ -112,7 +113,7 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
                break;
        default:
                panic("monotonic: invalid core counter write: %u", ctr);
-               __builtin_trap();
+               __builtin_unreachable();
        }
 }
 
@@ -131,7 +132,8 @@ mt_core_set_snap(unsigned int ctr, uint64_t count)
  * Fixed counters are enabled in all rings, so hard-code this register state to
  * enable in all rings and deliver PMIs.
  */
-#define FIXED_CTR_CTRL_INIT (0x888 | 0x333)
+#define FIXED_CTR_CTRL_INIT (0x888)
+#define FIXED_CTR_CTRL_ENABLE (0x333)
 
 /*
  * GLOBAL_CTRL controls which counters are enabled -- the high 32-bits control
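Splitting the old FIXED_CTR_CTRL_INIT into INIT (0x888) and ENABLE (0x333) lets later code program the PMI-delivery bits and the ring-enable bits of the fixed-counter control MSR separately: mt_microstackshot_start_remote() below first writes INIT alone to quiesce the fixed counters, reprograms them, then writes INIT | ENABLE to resume counting, while core_up() always writes the combined value. A sketch of how the two constants decompose, per fixed counter i one 4-bit field, with the field meanings taken from the Intel SDM description of IA32_FIXED_CTR_CTRL (assumption, not shown in this diff):

#include <stdint.h>
#include <stdio.h>

/* IA32_FIXED_CTR_CTRL: 4 control bits per fixed counter i at bits [4i+3:4i]. */
#define FIXED_EN_OS(i)   (UINT64_C(1) << (4 * (i)))   /* count in ring 0  */
#define FIXED_EN_USR(i)  (UINT64_C(2) << (4 * (i)))   /* count in ring 3  */
#define FIXED_PMI(i)     (UINT64_C(8) << (4 * (i)))   /* PMI on overflow  */

int main(void)
{
        uint64_t init = 0, enable = 0;
        for (int i = 0; i < 3; i++) {                 /* 3 fixed counters */
                init   |= FIXED_PMI(i);
                enable |= FIXED_EN_OS(i) | FIXED_EN_USR(i);
        }
        printf("INIT   = 0x%llx\n", (unsigned long long)init);    /* 0x888 */
        printf("ENABLE = 0x%llx\n", (unsigned long long)enable);  /* 0x333 */
        return 0;
}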
@@ -184,7 +186,7 @@ core_up(cpu_data_t *cpu)
        for (int i = 0; i < MT_CORE_NFIXED; i++) {
                mt_core_set_snap(i, mtc->mtc_snaps[i]);
        }
-       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT);
+       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE);
        wrmsr64(GLOBAL_CTRL, GLOBAL_CTRL_FIXED_EN);
 }
 
@@ -208,7 +210,6 @@ mt_pmi_x86_64(x86_saved_state_t *state)
 {
        uint64_t status;
        struct mt_cpu *mtc;
-       bool fixed_ovf = false;
 
        assert(ml_get_interrupts_enabled() == FALSE);
        mtc = mt_cur_cpu();
@@ -216,18 +217,28 @@ mt_pmi_x86_64(x86_saved_state_t *state)
 
        (void)atomic_fetch_add_explicit(&mt_pmis, 1, memory_order_relaxed);
 
-       for (int i = 0; i < MT_CORE_NFIXED; i++) {
+       for (unsigned int i = 0; i < MT_CORE_NFIXED; i++) {
                if (status & CTR_FIX_POS(i)) {
-                       fixed_ovf = true;
-                       uint64_t prior;
-
-                       prior = CTR_MAX - mtc->mtc_snaps[i];
+                       uint64_t prior = CTR_MAX - mtc->mtc_snaps[i];
                        assert(prior <= CTR_MAX);
                        prior += 1; /* wrapped */
 
-                       mtc->mtc_counts[i] += prior;
-                       mtc->mtc_snaps[i] = 0;
-                       mt_mtc_update_count(mtc, i);
+                       uint64_t delta = mt_mtc_update_count(mtc, i);
+                       mtc->mtc_counts[i] += delta;
+
+                       if (mt_microstackshots && mt_microstackshot_ctr == i) {
+                               x86_saved_state64_t *state64 = saved_state64(state);
+                               bool user_mode = (state64->isf.cs & 0x3) ? true : false;
+                               KDBG_RELEASE(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 1),
+                                               mt_microstackshot_ctr, user_mode);
+                               mt_microstackshot_pmi_handler(user_mode, mt_microstackshot_ctx);
+                       } else if (mt_debug) {
+                               KDBG(KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_DEBUG, 2),
+                                               mt_microstackshot_ctr, i);
+                       }
+
+                       mtc->mtc_snaps[i] = mt_core_reset_values[i];
+                       mt_core_set_snap(i, mt_core_reset_values[i]);
                }
        }
 
@@ -239,34 +250,61 @@ mt_pmi_x86_64(x86_saved_state_t *state)
        return 0;
 }
 
-void
-mt_init(void)
+static void
+mt_microstackshot_start_remote(__unused void *arg)
+{
+       struct mt_cpu *mtc = mt_cur_cpu();
+
+       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT);
+
+       for (int i = 0; i < MT_CORE_NFIXED; i++) {
+               uint64_t delta = mt_mtc_update_count(mtc, i);
+               mtc->mtc_counts[i] += delta;
+               mt_core_set_snap(i, mt_core_reset_values[i]);
+               mtc->mtc_snaps[i] = mt_core_reset_values[i];
+       }
+
+       wrmsr64(FIXED_CTR_CTRL, FIXED_CTR_CTRL_INIT | FIXED_CTR_CTRL_ENABLE);
+}
+
+int
+mt_microstackshot_start_arch(uint64_t period)
 {
-       uint32_t cpuinfo[4];
+       if (!mt_core_supported) {
+               return ENOTSUP;
+       }
 
-       do_cpuid(0xA, cpuinfo);
+       mt_core_reset_values[mt_microstackshot_ctr] = CTR_MAX - period;
+       mp_cpus_call(CPUMASK_ALL, ASYNC, mt_microstackshot_start_remote,
+                       NULL);
+       return 0;
+}
 
-       if ((cpuinfo[0] & 0xff) >= 2) {
+void
+mt_early_init(void)
+{
+       i386_cpu_info_t *info = cpuid_info();
+       if (info->cpuid_arch_perf_leaf.version >= 2) {
                lapic_set_pmi_func((i386_intr_func_t)mt_pmi_x86_64);
                mt_core_supported = true;
        }
 }
 
 static int
-core_init(void)
+core_init(__unused mt_device_t dev)
 {
        return ENOTSUP;
 }
 
 #pragma mark common hooks
 
-const struct monotonic_dev monotonic_devs[] = {
+struct mt_device mt_devices[] = {
        [0] = {
-               .mtd_name = "monotonic/core",
+               .mtd_name = "core",
                .mtd_init = core_init
        }
 };
 
 static_assert(
-               (sizeof(monotonic_devs) / sizeof(monotonic_devs[0])) == MT_NDEVS,
-               "MT_NDEVS macro should be same as the length of monotonic_devs");
+               (sizeof(mt_devices) / sizeof(mt_devices[0])) == MT_NDEVS,
+               "MT_NDEVS macro should be same as the length of mt_devices");
index 852b7618dc9ef197734c5b3c528c9fa4467b82dc..8be1ce0deafbf2d15c95094038431cf9665272ac 100644 (file)
@@ -1402,6 +1402,7 @@ pmap_create_options(
        }
 
 #if MACH_ASSERT
+       p->pmap_stats_assert = TRUE;
        p->pmap_pid = 0;
        strlcpy(p->pmap_procname, "<nil>", sizeof (p->pmap_procname));
 #endif /* MACH_ASSERT */
@@ -1512,6 +1513,34 @@ struct {
        int             purgeable_nonvolatile_compressed_under;
        ledger_amount_t purgeable_nonvolatile_compressed_under_total;
        ledger_amount_t purgeable_nonvolatile_compressed_under_max;
+
+       int             network_volatile_over;
+       ledger_amount_t network_volatile_over_total;
+       ledger_amount_t network_volatile_over_max;
+       int             network_volatile_under;
+       ledger_amount_t network_volatile_under_total;
+       ledger_amount_t network_volatile_under_max;
+
+       int             network_nonvolatile_over;
+       ledger_amount_t network_nonvolatile_over_total;
+       ledger_amount_t network_nonvolatile_over_max;
+       int             network_nonvolatile_under;
+       ledger_amount_t network_nonvolatile_under_total;
+       ledger_amount_t network_nonvolatile_under_max;
+
+       int             network_volatile_compressed_over;
+       ledger_amount_t network_volatile_compressed_over_total;
+       ledger_amount_t network_volatile_compressed_over_max;
+       int             network_volatile_compressed_under;
+       ledger_amount_t network_volatile_compressed_under_total;
+       ledger_amount_t network_volatile_compressed_under_max;
+
+       int             network_nonvolatile_compressed_over;
+       ledger_amount_t network_nonvolatile_compressed_over_total;
+       ledger_amount_t network_nonvolatile_compressed_over_max;
+       int             network_nonvolatile_compressed_under;
+       ledger_amount_t network_nonvolatile_compressed_under_total;
+       ledger_amount_t network_nonvolatile_compressed_under_max;
 } pmap_ledgers_drift;
 static void pmap_check_ledgers(pmap_t pmap);
 #else /* MACH_ASSERT */
@@ -1633,7 +1662,15 @@ pmap_protect(
 /*
  *     Set the physical protection on the
  *     specified range of this map as requested.
- *     Will not increase permissions.
+ *
+ * VERY IMPORTANT: Will *NOT* increase permissions.
+ *     pmap_protect_options() should protect the range against any access types
+ *     that are not in "prot" but it should never grant extra access.
+ *     For example, if "prot" is READ|EXECUTE, that means "remove write
+ *     access" but it does *not* mean "add read and execute" access.
+ *     VM relies on getting soft-faults to enforce extra checks (code
+ *     signing, for example).
+ *     New access permissions are granted via pmap_enter() only.
  */
 void
 pmap_protect_options(
@@ -1698,26 +1735,26 @@ pmap_protect_options(
                                        continue;
 
                                if (is_ept) {
-                                       if (prot & VM_PROT_READ)
-                                               pmap_update_pte(spte, 0, PTE_READ(is_ept));
-                                       else
+                                       if (! (prot & VM_PROT_READ)) {
                                                pmap_update_pte(spte, PTE_READ(is_ept), 0);
+                                       }
                                }
-                               if (prot & VM_PROT_WRITE)
-                                       pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
-                               else
+                               if (! (prot & VM_PROT_WRITE)) {
                                        pmap_update_pte(spte, PTE_WRITE(is_ept), 0);
+                               }
+#if DEVELOPMENT || DEBUG
+                               else if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) &&
+                                        map == kernel_pmap) {
+                                       pmap_update_pte(spte, 0, PTE_WRITE(is_ept));
+                               }
+#endif /* DEVELOPMENT || DEBUG */
 
                                if (set_NX) {
-                                       if (!is_ept)
+                                       if (!is_ept) {
                                                pmap_update_pte(spte, 0, INTEL_PTE_NX);
-                                       else
+                                       } else {
                                                pmap_update_pte(spte, INTEL_EPT_EX, 0);
-                               } else {
-                                       if (!is_ept)
-                                               pmap_update_pte(spte, INTEL_PTE_NX, 0);
-                                       else
-                                               pmap_update_pte(spte, 0, INTEL_EPT_EX);
+                                       }
                                }
                                num_found++;
                        }
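The reworked loop above only ever clears PTE permission bits: read, write and execute are removed when absent from "prot", and, outside of the DEVELOPMENT/DEBUG kernel-pmap special case, nothing is ever added, matching the strengthened comment that new access is granted via pmap_enter() only. A condensed C sketch of that subtract-only rule; the bit names are illustrative stand-ins for the real PTE_READ/PTE_WRITE/NX handling:

#include <stdint.h>
#include <stdio.h>

#define P_READ   0x1u
#define P_WRITE  0x2u
#define P_EXEC   0x4u

/* Remove any access not present in prot; never add access. */
static uint32_t pte_downgrade(uint32_t pte, uint32_t prot)
{
        if (!(prot & P_READ))  pte &= ~P_READ;
        if (!(prot & P_WRITE)) pte &= ~P_WRITE;
        if (!(prot & P_EXEC))  pte &= ~P_EXEC;
        return pte;                 /* the PTE can only lose bits here */
}

int main(void)
{
        uint32_t pte = P_READ;                        /* page currently read-only */
        uint32_t out = pte_downgrade(pte, P_READ | P_WRITE);
        printf("0x%x -> 0x%x\n", pte, out);           /* still 0x1: write was NOT added */
        return 0;
}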
@@ -2434,9 +2471,11 @@ pmap_switch(pmap_t tpmap)
 {
         spl_t  s;
 
+       PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(tpmap));
        s = splhigh();          /* Make sure interruptions are disabled */
        set_dirbase(tpmap, current_thread(), cpu_number());
        splx(s);
+       PMAP_TRACE_CONSTANT(PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
 }
 
 
@@ -2498,7 +2537,7 @@ pmap_flush(
 {
        unsigned int    my_cpu;
        unsigned int    cpu;
-       unsigned int    cpu_bit;
+       cpumask_t       cpu_bit;
        cpumask_t       cpus_to_respond = 0;
        cpumask_t       cpus_to_signal = 0;
        cpumask_t       cpus_signaled = 0;
@@ -2629,7 +2668,7 @@ void
 pmap_flush_tlbs(pmap_t pmap, vm_map_offset_t startv, vm_map_offset_t endv, int options, pmap_flush_context *pfc)
 {
        unsigned int    cpu;
-       unsigned int    cpu_bit;
+       cpumask_t       cpu_bit;
        cpumask_t       cpus_to_signal = 0;
        unsigned int    my_cpu = cpu_number();
        pmap_paddr_t    pmap_cr3 = pmap->pm_cr3;
@@ -2954,6 +2993,8 @@ pmap_permissions_verify(pmap_t ipmap, vm_map_t ivmmap, vm_offset_t sv, vm_offset
 
 #if MACH_ASSERT
 extern int pmap_ledgers_panic;
+extern int pmap_ledgers_panic_leeway;
+
 static void
 pmap_check_ledgers(
        pmap_t pmap)
@@ -2985,248 +3026,57 @@ pmap_check_ledgers(
 
        pmap_ledgers_drift.num_pmaps_checked++;
 
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.phys_footprint,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"phys_footprint\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.phys_footprint_over++;
-                       pmap_ledgers_drift.phys_footprint_over_total += bal;
-                       if (bal > pmap_ledgers_drift.phys_footprint_over_max) {
-                               pmap_ledgers_drift.phys_footprint_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.phys_footprint_under++;
-                       pmap_ledgers_drift.phys_footprint_under_total += bal;
-                       if (bal < pmap_ledgers_drift.phys_footprint_under_max) {
-                               pmap_ledgers_drift.phys_footprint_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.internal,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"internal\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.internal_over++;
-                       pmap_ledgers_drift.internal_over_total += bal;
-                       if (bal > pmap_ledgers_drift.internal_over_max) {
-                               pmap_ledgers_drift.internal_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.internal_under++;
-                       pmap_ledgers_drift.internal_under_total += bal;
-                       if (bal < pmap_ledgers_drift.internal_under_max) {
-                               pmap_ledgers_drift.internal_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.internal_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"internal_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.internal_compressed_over++;
-                       pmap_ledgers_drift.internal_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.internal_compressed_over_max) {
-                               pmap_ledgers_drift.internal_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.internal_compressed_under++;
-                       pmap_ledgers_drift.internal_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.internal_compressed_under_max) {
-                               pmap_ledgers_drift.internal_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.iokit_mapped,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"iokit_mapped\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.iokit_mapped_over++;
-                       pmap_ledgers_drift.iokit_mapped_over_total += bal;
-                       if (bal > pmap_ledgers_drift.iokit_mapped_over_max) {
-                               pmap_ledgers_drift.iokit_mapped_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.iokit_mapped_under++;
-                       pmap_ledgers_drift.iokit_mapped_under_total += bal;
-                       if (bal < pmap_ledgers_drift.iokit_mapped_under_max) {
-                               pmap_ledgers_drift.iokit_mapped_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.alternate_accounting,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"alternate_accounting\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.alternate_accounting_over++;
-                       pmap_ledgers_drift.alternate_accounting_over_total += bal;
-                       if (bal > pmap_ledgers_drift.alternate_accounting_over_max) {
-                               pmap_ledgers_drift.alternate_accounting_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.alternate_accounting_under++;
-                       pmap_ledgers_drift.alternate_accounting_under_total += bal;
-                       if (bal < pmap_ledgers_drift.alternate_accounting_under_max) {
-                               pmap_ledgers_drift.alternate_accounting_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.alternate_accounting_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"alternate_accounting_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.alternate_accounting_compressed_over++;
-                       pmap_ledgers_drift.alternate_accounting_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.alternate_accounting_compressed_over_max) {
-                               pmap_ledgers_drift.alternate_accounting_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.alternate_accounting_compressed_under++;
-                       pmap_ledgers_drift.alternate_accounting_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.alternate_accounting_compressed_under_max) {
-                               pmap_ledgers_drift.alternate_accounting_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.page_table,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"page_table\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.page_table_over++;
-                       pmap_ledgers_drift.page_table_over_total += bal;
-                       if (bal > pmap_ledgers_drift.page_table_over_max) {
-                               pmap_ledgers_drift.page_table_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.page_table_under++;
-                       pmap_ledgers_drift.page_table_under_total += bal;
-                       if (bal < pmap_ledgers_drift.page_table_under_max) {
-                               pmap_ledgers_drift.page_table_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_volatile,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_volatile\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_volatile_over++;
-                       pmap_ledgers_drift.purgeable_volatile_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_volatile_over_max) {
-                               pmap_ledgers_drift.purgeable_volatile_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_volatile_under++;
-                       pmap_ledgers_drift.purgeable_volatile_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_volatile_under_max) {
-                               pmap_ledgers_drift.purgeable_volatile_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_nonvolatile,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_nonvolatile\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_nonvolatile_over++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_nonvolatile_over_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_nonvolatile_under++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_nonvolatile_under_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_volatile_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_volatile_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_volatile_compressed_over++;
-                       pmap_ledgers_drift.purgeable_volatile_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_volatile_compressed_over_max) {
-                               pmap_ledgers_drift.purgeable_volatile_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_volatile_compressed_under++;
-                       pmap_ledgers_drift.purgeable_volatile_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_volatile_compressed_under_max) {
-                               pmap_ledgers_drift.purgeable_volatile_compressed_under_max = bal;
-                       }
-               }
-       }
-       ledger_get_balance(pmap->ledger,
-                          task_ledgers.purgeable_nonvolatile_compressed,
-                          &bal);
-       if (bal != 0) {
-               do_panic = TRUE;
-               printf("LEDGER BALANCE proc %d (%s) "
-                      "\"purgeable_nonvolatile_compressed\" = %lld\n",
-                      pid, procname, bal);
-               if (bal > 0) {
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_over++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_total += bal;
-                       if (bal > pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_compressed_over_max = bal;
-                       }
-               } else {
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_under++;
-                       pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_total += bal;
-                       if (bal < pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max) {
-                               pmap_ledgers_drift.purgeable_nonvolatile_compressed_under_max = bal;
-                       }
-               }
-       }
+#define LEDGER_CHECK_BALANCE(__LEDGER)                                 \
+MACRO_BEGIN                                                            \
+       int panic_on_negative = TRUE;                                   \
+       ledger_get_balance(pmap->ledger,                                \
+                          task_ledgers.__LEDGER,                       \
+                          &bal);                                       \
+       ledger_get_panic_on_negative(pmap->ledger,                      \
+                                    task_ledgers.__LEDGER,             \
+                                    &panic_on_negative);               \
+       if (bal != 0) {                                                 \
+               if (panic_on_negative ||                                \
+                   (pmap_ledgers_panic &&                              \
+                    pmap_ledgers_panic_leeway > 0 &&                   \
+                    (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) ||  \
+                     bal < (pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
+                       do_panic = TRUE;                                \
+               }                                                       \
+               printf("LEDGER BALANCE proc %d (%s) "                   \
+                      "\"%s\" = %lld\n",                               \
+                      pid, procname, #__LEDGER, bal);                  \
+               if (bal > 0) {                                          \
+                       pmap_ledgers_drift.__LEDGER##_over++;           \
+                       pmap_ledgers_drift.__LEDGER##_over_total += bal; \
+                       if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
+                               pmap_ledgers_drift.__LEDGER##_over_max = bal; \
+                       }                                               \
+               } else if (bal < 0) {                                   \
+                       pmap_ledgers_drift.__LEDGER##_under++;          \
+                       pmap_ledgers_drift.__LEDGER##_under_total += bal; \
+                       if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
+                               pmap_ledgers_drift.__LEDGER##_under_max = bal; \
+                       }                                               \
+               }                                                       \
+       }                                                               \
+MACRO_END
+
+       LEDGER_CHECK_BALANCE(phys_footprint);
+       LEDGER_CHECK_BALANCE(internal);
+       LEDGER_CHECK_BALANCE(internal_compressed);
+       LEDGER_CHECK_BALANCE(iokit_mapped);
+       LEDGER_CHECK_BALANCE(alternate_accounting);
+       LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
+       LEDGER_CHECK_BALANCE(page_table);
+       LEDGER_CHECK_BALANCE(purgeable_volatile);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
+       LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
+       LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
+       LEDGER_CHECK_BALANCE(network_volatile);
+       LEDGER_CHECK_BALANCE(network_nonvolatile);
+       LEDGER_CHECK_BALANCE(network_volatile_compressed);
+       LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
 
        if (do_panic) {
                if (pmap_ledgers_panic) {
@@ -3254,7 +3104,8 @@ pmap_check_ledgers(
            pmap->stats.external != 0 ||
            pmap->stats.reusable != 0 ||
            pmap->stats.compressed != 0) {
-               if (pmap_stats_assert) {
+               if (pmap_stats_assert &&
+                   pmap->pmap_stats_assert) {
                        panic("pmap_destroy(%p) %d[%s] imbalanced stats: resident=%d wired=%d device=%d internal=%d external=%d reusable=%d compressed=%lld",
                              pmap, pid, procname,
                              pmap->stats.resident_count,
@@ -3289,6 +3140,32 @@ pmap_set_process(
 
        pmap->pmap_pid = pid;
        strlcpy(pmap->pmap_procname, procname, sizeof (pmap->pmap_procname));
+       if (pmap_ledgers_panic_leeway) {
+               /*
+                * XXX FBDP
+                * Some processes somehow trigger some issues that make
+                * the pmap stats and ledgers go off track, causing
+                * some assertion failures and ledger panics.
+                * Turn off the sanity checks if we allow some ledger leeway
+                * because of that.  We'll still do a final check in
+                * pmap_check_ledgers() for discrepancies larger than the
+                * allowed leeway after the address space has been fully
+                * cleaned up.
+                */
+               pmap->pmap_stats_assert = FALSE;
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.phys_footprint);
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.internal);
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.internal_compressed);
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.iokit_mapped);
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.alternate_accounting);
+               ledger_disable_panic_on_negative(pmap->ledger,
+                                                task_ledgers.alternate_accounting_compressed);
+       }
 }
 #endif /* MACH_ASSERT */
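The LEDGER_CHECK_BALANCE macro introduced above replaces eleven near-identical per-ledger blocks by token-pasting the ledger name into the drift-tracking field names (phys_footprint_over, phys_footprint_over_total, and so on), which is also what makes it cheap to add the four new network_* ledgers. A minimal demonstration of the same ## pasting technique; the struct fields and ledger names here are illustrative:

#include <stdio.h>

struct drift {
        int   internal_over;
        long  internal_over_total;
        int   phys_footprint_over;
        long  phys_footprint_over_total;
} drift;

/* Same shape as LEDGER_CHECK_BALANCE: one macro body, many ledgers. */
#define CHECK_BALANCE(__LEDGER, __BAL)                              \
        do {                                                        \
                long bal = (__BAL);                                 \
                if (bal > 0) {                                      \
                        drift.__LEDGER##_over++;                    \
                        drift.__LEDGER##_over_total += bal;         \
                        printf(#__LEDGER " drifted by %ld\n", bal); \
                }                                                   \
        } while (0)

int main(void)
{
        CHECK_BALANCE(internal, 4096);
        CHECK_BALANCE(phys_footprint, 0);
        printf("internal_over=%d total=%ld\n",
               drift.internal_over, drift.internal_over_total);
        return 0;
}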
 
@@ -3326,3 +3203,4 @@ void pmap_verify_noncacheable(uintptr_t vaddr) {
                return;
        panic("pmap_verify_noncacheable: IO read from a cacheable address? address: 0x%lx, PTE: %p, *PTE: 0x%llx", vaddr, ptep, *ptep);
 }
+
index 3cf4a0e49a2f01c7527268775a934fe0c21f0da6..506d684ce32bcc222ac3fa663b86bbbb3d89073f 100644 (file)
@@ -33,7 +33,6 @@
 #include <vm/vm_map.h>
 #include <i386/pmap_internal.h>
 #include <i386/pmap_pcid.h>
-#include <mach/branch_predicates.h>
 
 /*
  * PCID (Process context identifier) aka tagged TLB support.
index 63eb8929f49893810e7183cf2bc89894de6efc5a..4e734fbe62ad618918580bb51e50b8a810db425e 100644 (file)
@@ -97,6 +97,11 @@ pe_identify_machine(boot_args * bootArgs)
                pclk = hclk / 2;
                tclk = 100000;  /* timer is at 100khz */
 
+       } else if (!strcmp(gPESoCDeviceType, "bcm2837-io")) {
+               mclk = 1200000000;
+               hclk = mclk / 4;
+               pclk = hclk / 2;
+               tclk = 1000000;
        } else
                use_dt = 1;
 
@@ -297,10 +302,18 @@ static struct tbd_ops    t8010_funcs = {NULL, NULL, NULL};
 static struct tbd_ops    t8011_funcs = {NULL, NULL, NULL};
 #endif /* defined(ARM_BOARD_CLASS_T8011) */
 
+#if defined(ARM_BOARD_CLASS_T8015)
+static struct tbd_ops    t8015_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_T8015) */
+
+
 
 
 
 
+#if defined(ARM_BOARD_CLASS_BCM2837)
+static struct tbd_ops    bcm2837_funcs = {NULL, NULL, NULL};
+#endif /* defined(ARM_BOARD_CLASS_BCM2837) */
 
 vm_offset_t    gPicBase;
 vm_offset_t    gTimerBase;
@@ -320,7 +333,7 @@ typedef enum
 static panic_trace_t bootarg_panic_trace;
 
 // The command buffer contains the converted commands from the device tree for commanding cpu_halt, enable_trace, etc.
-#define DEBUG_COMMAND_BUFFER_SIZE 100
+#define DEBUG_COMMAND_BUFFER_SIZE 256
 typedef struct command_buffer_element{
        uintptr_t address;
        uint16_t destination_cpu_selector;
@@ -659,6 +672,16 @@ pe_arm_init_timer(void *args)
        if (!strcmp(gPESoCDeviceType, "t8011-io")) {
                tbd_funcs = &t8011_funcs;
        } else
+#endif
+#if defined(ARM_BOARD_CLASS_T8015)
+       if (!strcmp(gPESoCDeviceType, "t8015-io")) {
+               tbd_funcs = &t8015_funcs;
+       } else
+#endif
+#if defined(ARM_BOARD_CLASS_BCM2837)
+       if (!strcmp(gPESoCDeviceType, "bcm2837-io")) {
+               tbd_funcs = &bcm2837_funcs;
+       } else
 #endif
                return 0;
 
index 70d5e54f8c24a24d389b2b60590819a81115950b..6e65a0ab9886533c953eb2c3a90418211a2aa38e 100644 (file)
 #include <kern/debug.h>
 #include <libkern/section_keywords.h>
 
+#if defined __arm__
+#include <pexpert/arm/board_config.h>
+#elif defined __arm64__
+#include <pexpert/arm64/board_config.h>
+#endif
+
+
 /* extern references */
 extern void     pe_identify_machine(boot_args *bootArgs);
 
@@ -24,7 +31,7 @@ extern void     pe_identify_machine(boot_args *bootArgs);
 static void    pe_prepare_images(void);
 
 /* private globals */
-PE_state_t      PE_state;
+SECURITY_READ_ONLY_LATE(PE_state_t) PE_state;
 #define FW_VERS_LEN 128
 char            firmware_version[FW_VERS_LEN];
 
@@ -60,9 +67,16 @@ static boolean_t panic_console_available = FALSE;
 
 extern uint32_t crc32(uint32_t crc, const void *buf, size_t size);
 
+void PE_slide_devicetree(vm_offset_t);
+
 static void
 check_for_panic_log(void)
 {
+#ifdef PLATFORM_PANIC_LOG_PADDR
+       gPanicBase = ml_io_map_wcomb(PLATFORM_PANIC_LOG_PADDR, PLATFORM_PANIC_LOG_SIZE);
+       panic_text_len = PLATFORM_PANIC_LOG_SIZE - sizeof(struct embedded_panic_header);
+       gPanicSize = PLATFORM_PANIC_LOG_SIZE;
+#else
        DTEntry entry, chosen;
        unsigned int size;
        uintptr_t *reg_prop;
@@ -93,6 +107,7 @@ check_for_panic_log(void)
        /* Deduct the size of the panic header from the panic region size */
        panic_text_len = panic_region_length[0] - sizeof(struct embedded_panic_header);
        gPanicSize = panic_region_length[0];
+#endif
        panic_info = (struct embedded_panic_header *)gPanicBase;
 
        /* Check if a shared memory console is running in the panic buffer */
@@ -279,6 +294,14 @@ PE_init_iokit(void)
        StartIOKit(PE_state.deviceTreeHead, PE_state.bootArgs, (void *) 0, (void *) 0);
 }
 
+void
+PE_slide_devicetree(vm_offset_t slide)
+{
+       assert(PE_state.initialized);
+       PE_state.deviceTreeHead += slide;
+       DTInit(PE_state.deviceTreeHead);
+}
+
 void
 PE_init_platform(boolean_t vm_initialized, void *args)
 {
@@ -471,6 +494,16 @@ PE_i_can_has_debugger(uint32_t *debug_flags)
        return (debug_enabled);
 }
 
+/*
+ * This routine returns TRUE if the device is configured
+ * with panic debugging enabled.
+ */
+boolean_t
+PE_panic_debugging_enabled()
+{
+       return panicDebugging;
+}
+
 void
 PE_save_buffer_to_vram(unsigned char *buf, unsigned int *size)
 {
index 6b9000117a1cdd805f0419fb59ee8866a9d80507..18c746773170a5d4b883d18c44fd5667653c68a3 100644 (file)
@@ -11,6 +11,7 @@
 #include <kern/debug.h>
 #include <libkern/OSBase.h>
 #include <mach/mach_time.h>
+#include <machine/atomic.h>
 #include <machine/machine_routines.h>
 #include <pexpert/pexpert.h>
 #include <pexpert/protos.h>
@@ -42,7 +43,6 @@ static int    uart_initted = 0;       /* 1 if init'ed */
 
 static vm_offset_t     uart_base;
 
-
 /*****************************************************************************/
 
 #ifdef S3CUART
@@ -645,8 +645,7 @@ static void dockchannel_uart_init(void)
        rDOCKCHANNELS_DEV_DRAIN_CFG(DOCKCHANNEL_UART_CHANNEL) = max_dockchannel_drain_period;
 
        // Drain timer doesn't get loaded with value from drain period register if fifo
-       // is already full. Drop a character from the fifo.  See chapter 8 of the Cayman
-       // DockChannels specification for more details.
+       // is already full. Drop a character from the fifo. 
        rDOCKCHANNELS_DOCK_RDATA1(DOCKCHANNEL_UART_CHANNEL);
 }
 
@@ -662,8 +661,91 @@ static struct pe_serial_functions dockchannel_uart_serial_functions =
 
 #endif /* DOCKCHANNEL_UART */
 
-/*****************************************************************************/
+/****************************************************************************/
+#ifdef         PI3_UART
+vm_offset_t pi3_gpio_base_vaddr;
+vm_offset_t pi3_aux_base_vaddr;
+static int pi3_uart_tr0(void)
+{
+               return (int) BCM2837_GET32(BCM2837_AUX_MU_LSR_REG_V) & 0x20;
+}
+
+static void pi3_uart_td0(int c)
+{
+               BCM2837_PUT32(BCM2837_AUX_MU_IO_REG_V, (uint32_t) c);
+}
+
+static int pi3_uart_rr0(void)
+{      
+               return (int) BCM2837_GET32(BCM2837_AUX_MU_LSR_REG_V) & 0x01;
+}
+
+static int pi3_uart_rd0(void)
+{
+               return (int) BCM2837_GET32(BCM2837_AUX_MU_IO_REG_V) & 0xff;
+}
+
+static void pi3_uart_init(void)
+{
+       // Scratch variable
+       uint32_t i;
+
+       // Reset mini uart registers
+       BCM2837_PUT32(BCM2837_AUX_ENABLES_V, 1);
+       BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 0);
+       BCM2837_PUT32(BCM2837_AUX_MU_LCR_REG_V, 3);
+       BCM2837_PUT32(BCM2837_AUX_MU_MCR_REG_V, 0);
+       BCM2837_PUT32(BCM2837_AUX_MU_IER_REG_V, 0);
+       BCM2837_PUT32(BCM2837_AUX_MU_IIR_REG_V, 0xC6);
+       BCM2837_PUT32(BCM2837_AUX_MU_BAUD_REG_V, 270);
+
+        i = BCM2837_FSEL_REG(14);
+       // Configure GPIOs 14 & 15 for alternate function 5
+       i &= ~(BCM2837_FSEL_MASK(14));
+       i |= (BCM2837_FSEL_ALT5 << BCM2837_FSEL_OFFS(14));
+       i &= ~(BCM2837_FSEL_MASK(15));
+       i |= (BCM2837_FSEL_ALT5 << BCM2837_FSEL_OFFS(15));
+
+       BCM2837_PUT32(BCM2837_FSEL_REG(14), i);
+
+       BCM2837_PUT32(BCM2837_GPPUD_V, 0);
+
+       // Barrier before AP spinning for 150 cycles
+       __builtin_arm_isb(ISB_SY);
+
+       for(i = 0; i < 150; i++) {
+               asm volatile("add x0, x0, xzr");
+       }
+
+       __builtin_arm_isb(ISB_SY);
 
+       BCM2837_PUT32(BCM2837_GPPUDCLK0_V,(1 << 14) | (1 << 15));
+
+       __builtin_arm_isb(ISB_SY);
+
+       for(i = 0; i < 150; i++) {
+               asm volatile("add x0, x0, xzr");
+       }
+
+       __builtin_arm_isb(ISB_SY);
+
+       BCM2837_PUT32(BCM2837_GPPUDCLK0_V, 0);
+
+       BCM2837_PUT32(BCM2837_AUX_MU_CNTL_REG_V, 3);
+}
+
+static struct pe_serial_functions pi3_uart_serial_functions =
+{
+       .uart_init = pi3_uart_init,
+       .uart_set_baud_rate = NULL,
+       .tr0 = pi3_uart_tr0,
+       .td0 = pi3_uart_td0,
+       .rr0 = pi3_uart_rr0,
+       .rd0 = pi3_uart_rd0
+};
+
+#endif /* PI3_UART */
+/*****************************************************************************/
 int
 serial_init(void)
 {
@@ -682,12 +764,16 @@ serial_init(void)
 #ifdef DOCKCHANNEL_UART
        uint32_t        no_dockchannel_uart;
 #endif
+#ifdef PI3_UART
+       uint32_t        is_pi3;
+#endif
 
-       if (uart_initted) {
+       if (uart_initted && gPESF) {
                gPESF->uart_init();
                kprintf("reinit serial\n");
                return 1;
        }
+
        dccmode = 0;
        if (PE_parse_boot_argn("dcc", &dccmode, sizeof (dccmode))) {
                gPESF = &dcc_serial_functions;
@@ -704,6 +790,19 @@ serial_init(void)
        }
 #endif /* SHMCON */
 
+#ifdef PI3_UART
+#pragma unused(prop_value)
+       is_pi3 = 0;
+       if (PE_parse_boot_argn("-pi3", &is_pi3, sizeof(is_pi3))) { // FIXME: remove the not operator after boot args are set up.
+               pi3_gpio_base_vaddr = ml_io_map((vm_offset_t)BCM2837_GPIO_BASE, BCM2837_GPIO_SIZE);
+               pi3_aux_base_vaddr = ml_io_map((vm_offset_t)BCM2837_AUX_BASE, BCM2837_AUX_SIZE);
+               gPESF = &pi3_uart_serial_functions;
+               gPESF->uart_init();
+               uart_initted = 1;
+               return 1;
+       }
+#endif /* PI3_UART */
+
        soc_base = pe_arm_get_soc_base_phys();
 
        if (soc_base == 0)
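
Note: the pi3_uart_serial_functions table added above plugs into the same generic console path as the other back ends. The sketch below, which assumes a simplified copy of struct pe_serial_functions, mirrors the uart_putc()/uart_getc() polling pattern visible in the x86 serial diff further down.

/*
 * Minimal sketch of how a pe_serial_functions table such as
 * pi3_uart_serial_functions is driven by the generic console code.
 */
struct pe_serial_functions_example {      /* simplified stand-in */
    void (*uart_init)(void);
    int  (*tr0)(void);                    /* nonzero when TX can accept a byte */
    void (*td0)(int c);                   /* transmit one byte */
    int  (*rr0)(void);                    /* nonzero when RX has a byte */
    int  (*rd0)(void);                    /* read one byte */
};

static void
console_putc(const struct pe_serial_functions_example *fns, char c)
{
    while (!fns->tr0()) {
        /* spin until the transmit FIFO/holding register has room */
    }
    fns->td0(c);
}

static int
console_getc(const struct pe_serial_functions_example *fns)
{
    if (!fns->rr0()) {
        return -1;                        /* nothing pending */
    }
    return fns->rd0();
}
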
index 5e130b16177c6266eb2574269f4dfb09faa5e244..5515f47b2ab1143526f1199c737865ec19bcb739 100644 (file)
@@ -48,6 +48,8 @@ static uint32_t gPEKernelConfigurationBitmask;
 
 int32_t gPESerialBaud = -1;
 
+int debug_cpu_performance_degradation_factor = 1;
+
 void pe_init_debug(void)
 {
        boolean_t boot_arg_value;
@@ -86,6 +88,21 @@ void pe_init_debug(void)
 #endif
        gPEKernelConfigurationBitmask |= (boot_arg_value ? kPEICanHasDiagnosticAPI : 0);
 
+
+       int factor = 1;
+       boolean_t have_bootarg = PE_parse_boot_argn("cpu-factor", &factor, sizeof (factor));
+       if (have_bootarg) {
+               debug_cpu_performance_degradation_factor = factor;
+       } else {
+               DTEntry         root;
+               if (DTLookupEntry(NULL, "/", &root) == kSuccess) {
+                       void *prop = NULL;
+                       uint32_t size = 0;
+                       if (DTGetProperty(root, "target-is-fpga", &prop, &size) == kSuccess) {
+                               debug_cpu_performance_degradation_factor = 10;
+                       }
+               }
+       }
 }
 
 void PE_enter_debugger(const char *cause)
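
Note: debug_cpu_performance_degradation_factor defaults to 1, is overridden by the cpu-factor= boot-arg, and jumps to 10 when the device tree advertises target-is-fpga. A hypothetical consumer (the helper name and the idea of scaling a deadline are illustrative, not taken from the source) would simply widen its timeouts by that factor:

/* Hypothetical consumer: widen a deadline when running on a slow FPGA target. */
#include <stdint.h>

extern int debug_cpu_performance_degradation_factor;

static inline uint64_t
example_scaled_timeout_ns(uint64_t base_timeout_ns)
{
    return base_timeout_ns * (uint64_t)debug_cpu_performance_degradation_factor;
}
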
index 94818a6148a5d0c38c0f6f33d84f0c3bcde7ad05..e35457dde65f46c5a62b9861ee371f57078274c8 100644 (file)
@@ -54,6 +54,8 @@ static boolean_t lpss_uart_supported = 0; /* 1 if LPSS UART is supported on plat
 static unsigned int lpss_uart_enabled = 0; /* 1 if it is LPSS UART is in D0 state */
 static void lpss_uart_re_init (void);
 
+static boolean_t pcie_uart_enabled = 0; /* 1 if PCIe UART is supported on platform */
+
 #define DEFAULT_UART_BAUD_RATE 115200
 
 static unsigned uart_baud_rate = DEFAULT_UART_BAUD_RATE;
@@ -433,6 +435,131 @@ static struct pe_serial_functions mmio_uart_serial_functions = {
     .rd0 = mmio_uart_rd0
 };
 
+// =============================================================================
+// PCIE_MMIO UART 
+// =============================================================================
+
+#define PCIE_MMIO_UART_BASE         0xFE410000
+
+#define PCIE_MMIO_WRITE(r, v)  ml_phys_write_byte(pcie_mmio_uart_base + PCIE_MMIO_UART_##r, v)
+#define PCIE_MMIO_READ(r)      ml_phys_read_byte(pcie_mmio_uart_base + PCIE_MMIO_UART_##r)
+
+enum {
+    PCIE_MMIO_UART_RBR = 0x0,   /* receive buffer Register   (R) */
+    PCIE_MMIO_UART_THR = 0x0,   /* transmit holding register (W) */
+    PCIE_MMIO_UART_IER = 0x1,   /* interrupt enable register     */
+    PCIE_MMIO_UART_FCR = 0x2,   /* fifo control register (W)     */
+    PCIE_MMIO_UART_LCR = 0x4,   /* line control register         */
+    PCIE_MMIO_UART_MCR = 0x4,  /* modem control register        */
+    PCIE_MMIO_UART_LSR = 0x5,  /* line status register          */
+    PCIE_MMIO_UART_DLL = 0x8,   /* DLAB = 1, divisor latch (LSB) */
+    PCIE_MMIO_UART_DLM = 0x9,   /* DLAB = 1, divisor latch (MSB) */
+    PCIE_MMIO_UART_SCR = 0x30,   /* scratch register              */
+};
+
+static vm_offset_t pcie_mmio_uart_base = 0;
+static int
+pcie_mmio_uart_present( void )
+{
+
+    PCIE_MMIO_WRITE( SCR, 0x5a );
+    if (PCIE_MMIO_READ(SCR) != 0x5a) return 0;
+    PCIE_MMIO_WRITE( SCR, 0xa5 );
+    if (PCIE_MMIO_READ(SCR) != 0xa5) return 0;
+
+    return 1;
+}
+
+static int
+pcie_mmio_uart_probe( void )
+{
+    unsigned new_pcie_mmio_uart_base = 0;
+
+    // if specified, pcie_mmio_uart overrides all probing
+    if (PE_parse_boot_argn("pcie_mmio_uart", &new_pcie_mmio_uart_base, sizeof (new_pcie_mmio_uart_base)))
+    {
+        // pcie_mmio_uart=0 will disable pcie_mmio_uart support
+        if (new_pcie_mmio_uart_base == 0) {
+            return 0;
+        }
+        pcie_mmio_uart_base = new_pcie_mmio_uart_base;
+        return 1;
+    }
+
+    pcie_mmio_uart_base = PCIE_MMIO_UART_BASE;
+    if (pcie_mmio_uart_present()) {
+      return 1;
+    }
+
+    // no pcie_mmio uart found
+    return 0;
+}
+
+static void
+pcie_mmio_uart_set_baud_rate( __unused int unit, __unused uint32_t baud_rate )
+{
+    const unsigned char lcr = PCIE_MMIO_READ( LCR );
+    unsigned long       div;
+
+    if (baud_rate == 0) baud_rate = 9600;
+    div = LEGACY_UART_CLOCK / 16 / baud_rate;
+
+    PCIE_MMIO_WRITE( LCR, lcr | UART_LCR_DLAB );
+    PCIE_MMIO_WRITE( DLM, (unsigned char)(div >> 8) );
+    PCIE_MMIO_WRITE( DLL, (unsigned char) div );
+    PCIE_MMIO_WRITE( LCR, lcr & ~UART_LCR_DLAB);
+}
+
+static int
+pcie_mmio_uart_tr0( void )
+{
+    return (PCIE_MMIO_READ(LSR) & UART_LSR_THRE);
+}
+
+static void
+pcie_mmio_uart_td0( int c )
+{
+    PCIE_MMIO_WRITE( THR, c );
+}
+
+static void
+pcie_mmio_uart_init( void )
+{
+    uart_initted = 1;
+}
+
+static int
+pcie_mmio_uart_rr0( void ) 
+{
+    unsigned char lsr;
+
+    lsr = PCIE_MMIO_READ( LSR );
+
+    if ( lsr & (UART_LSR_FE | UART_LSR_PE | UART_LSR_OE) )
+    {
+        PCIE_MMIO_READ( RBR ); /* discard */
+        return 0;
+    }
+    
+    return (lsr & UART_LSR_DR);
+}
+
+static int
+pcie_mmio_uart_rd0( void ) 
+{
+    return PCIE_MMIO_READ( RBR );
+}
+
+static struct pe_serial_functions pcie_mmio_uart_serial_functions = {
+    .uart_init = pcie_mmio_uart_init,
+    .uart_set_baud_rate = pcie_mmio_uart_set_baud_rate,
+    .tr0 = pcie_mmio_uart_tr0,
+    .td0 = pcie_mmio_uart_td0,
+    .rr0 = pcie_mmio_uart_rr0,
+    .rd0 = pcie_mmio_uart_rd0
+};
+
 // =============================================================================
 // Generic serial support below
 // =============================================================================
@@ -465,6 +592,13 @@ serial_init( void )
         legacy_uart_enabled = 1;
         return 1;
     }
+    else if ( pcie_mmio_uart_probe() )
+    {
+        gPESF = &pcie_mmio_uart_serial_functions;
+        gPESF->uart_init();
+        pcie_uart_enabled = 1;
+        return 1;
+    }
     else
     {
         return 0;
@@ -475,7 +609,7 @@ serial_init( void )
 static void
 uart_putc(char c)
 {
-       if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled)) {
+       if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled || pcie_uart_enabled)) {
         while (!gPESF->tr0());  /* Wait until THR is empty. */
         gPESF->td0(c);
     }
@@ -484,7 +618,7 @@ uart_putc(char c)
 static int
 uart_getc(void)
 {
-    if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled)) {
+    if (uart_initted && (legacy_uart_enabled || lpss_uart_enabled || pcie_uart_enabled)) {
         if (!gPESF->rr0())
             return -1;
         return gPESF->rd0();
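
Note: pcie_mmio_uart_set_baud_rate() above programs a 16550-style divisor latch (DLL/DLM behind the DLAB bit in LCR). The sketch below reproduces the divisor arithmetic; the reference clock is an assumption, since LEGACY_UART_CLOCK is not shown in this hunk and legacy COM-port hardware conventionally uses 1.8432 MHz.

/* Sketch of the divisor-latch calculation used above (clock value assumed). */
#include <stdint.h>

#define EXAMPLE_UART_CLOCK 1843200UL   /* assumed reference clock, Hz */

static inline unsigned long
example_uart_divisor(uint32_t baud_rate)
{
    if (baud_rate == 0) {
        baud_rate = 9600;              /* same fallback as the code above */
    }
    /* divisor = clock / (16 * baud); 1843200 / 16 / 115200 == 1 */
    return EXAMPLE_UART_CLOCK / 16 / baud_rate;
}
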
index 2e7f3d8be1e564b9713ed055d0e087452f8cd3f8..4dccef9457f6fbc2f55378aaffd2c905c1316b77 100644 (file)
@@ -5,17 +5,23 @@
 #ifndef _PEXPERT_ARM_AMCC_H
 #define _PEXPERT_ARM_AMCC_H
 
+#include <pexpert/arm64/board_config.h>
+
 /*
  * AMCC registers for KTRR/RoRegion related lockdown in early kernel bootstrap.
  * amcc_base must be retrieved from device tree before using.
  */
 
-//#if defined(KERNEL_INTEGRITY_KTRR)
+#if defined(KERNEL_INTEGRITY_KTRR)
+#define AMCC_PGSHIFT 14
+#define AMCC_PGSIZE (1 << AMCC_PGSHIFT)
+#define AMCC_PGMASK (AMCC_PGSIZE - 1)
+
 #define rMCCGEN        (*(volatile uint32_t *) (amcc_base + 0x780))
 #define rRORGNBASEADDR (*(volatile uint32_t *) (amcc_base + 0x7e4))
 #define rRORGNENDADDR  (*(volatile uint32_t *) (amcc_base + 0x7e8))
 #define rRORGNLOCK     (*(volatile uint32_t *) (amcc_base + 0x7ec))
-//#endif
+#endif
 
 
 #endif /* _PEXPERT_ARM_AMCC_H */
diff --git a/pexpert/pexpert/arm64/BCM2837.h b/pexpert/pexpert/arm64/BCM2837.h
new file mode 100644 (file)
index 0000000..5de0920
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016 Apple Inc. All rights reserved.
+ */
+
+#ifndef _PEXPERT_ARM_BCM2837_H
+#define _PEXPERT_ARM_BCM2837_H
+
+#ifdef BCM2837
+#include "arm64_common.h"
+#endif
+#define NO_MONITOR 1
+#define NO_ECORE 1
+
+#ifndef ASSEMBLER
+
+#define PI3_UART
+
+#define PI3_BREAK                              asm volatile("brk #0");
+
+#define BCM2837_GPIO_BASE      0x3F200000
+#define BCM2837_GPIO_SIZE      0xA0
+#define BCM2837_GPFSEL0                0x3F200000
+#define BCM2837_GPSET0         0x3F20001C
+#define BCM2837_GPCLR0         0x3F200028
+#define BCM2837_GPPUD          0x3F200094
+#define BCM2837_GPPUDCLK0      0x3F200098
+
+#define BCM2837_AUX_BASE       0x3F215000
+#define BCM2837_AUX_SIZE       0x70
+#define BCM2837_AUX_ENABLES    0x3F215004
+#define BCM2837_AUX_MU_IO_REG  0x3F215040
+#define BCM2837_AUX_MU_IER_REG 0x3F215044
+#define BCM2837_AUX_MU_IIR_REG 0x3F215048
+#define BCM2837_AUX_MU_LCR_REG 0x3F21504C
+#define BCM2837_AUX_MU_MCR_REG 0x3F215050
+#define BCM2837_AUX_MU_LSR_REG 0x3F215054
+#define BCM2837_AUX_MU_MSR_REG 0x3F215058
+#define BCM2837_AUX_MU_SCRATCH 0x3F21505C
+#define BCM2837_AUX_MU_CNTL_REG        0x3F215060
+#define BCM2837_AUX_MU_STAT_REG        0x3F215064
+#define BCM2837_AUX_MU_BAUD_REG        0x3F215068
+
+#define BCM2837_GPFSEL0_V              (pi3_gpio_base_vaddr + 0x0)
+#define BCM2837_GPSET0_V               (pi3_gpio_base_vaddr + 0x1C)
+#define BCM2837_GPCLR0_V               (pi3_gpio_base_vaddr + 0x28)
+#define BCM2837_GPPUD_V                (pi3_gpio_base_vaddr + 0x94)
+#define BCM2837_GPPUDCLK0_V            (pi3_gpio_base_vaddr + 0x98)
+
+#define BCM2837_FSEL_INPUT              0x0
+#define BCM2837_FSEL_OUTPUT             0x1
+#define BCM2837_FSEL_ALT0               0x4
+#define BCM2837_FSEL_ALT1               0x5
+#define BCM2837_FSEL_ALT2               0x6
+#define BCM2837_FSEL_ALT3               0x7
+#define BCM2837_FSEL_ALT4               0x3
+#define BCM2837_FSEL_ALT5               0x2
+
+#define BCM2837_FSEL_NFUNCS             54
+#define BCM2837_FSEL_REG(func)          (BCM2837_GPFSEL0_V + (4 * ((func) / 10)))
+#define BCM2837_FSEL_OFFS(func)         (((func) % 10) * 3)
+#define BCM2837_FSEL_MASK(func)         (0x7 << BCM2837_FSEL_OFFS(func))
+
+#define BCM2837_AUX_ENABLES_V          (pi3_aux_base_vaddr + 0x4)
+#define BCM2837_AUX_MU_IO_REG_V                (pi3_aux_base_vaddr + 0x40)
+#define BCM2837_AUX_MU_IER_REG_V       (pi3_aux_base_vaddr + 0x44)
+#define BCM2837_AUX_MU_IIR_REG_V       (pi3_aux_base_vaddr + 0x48)
+#define BCM2837_AUX_MU_LCR_REG_V       (pi3_aux_base_vaddr + 0x4C)
+#define BCM2837_AUX_MU_MCR_REG_V       (pi3_aux_base_vaddr + 0x50)
+#define BCM2837_AUX_MU_LSR_REG_V       (pi3_aux_base_vaddr + 0x54)
+#define BCM2837_AUX_MU_MSR_REG_V       (pi3_aux_base_vaddr + 0x58)
+#define BCM2837_AUX_MU_SCRATCH_V       (pi3_aux_base_vaddr + 0x5C)
+#define BCM2837_AUX_MU_CNTL_REG_V      (pi3_aux_base_vaddr + 0x60)
+#define BCM2837_AUX_MU_STAT_REG_V      (pi3_aux_base_vaddr + 0x64)
+#define BCM2837_AUX_MU_BAUD_REG_V      (pi3_aux_base_vaddr + 0x68)
+#define BCM2837_PUT32(addr, value) do { *((volatile uint32_t *) addr) = value; } while(0)
+#define BCM2837_GET32(addr) *((volatile uint32_t *) addr)
+
+#define PLATFORM_PANIC_LOG_PADDR       0x3c0fc000
+#define PLATFORM_PANIC_LOG_SIZE                16384        // 16kb
+#endif /* ! ASSEMBLER */
+
+#endif /* ! _PEXPERT_ARM_BCM2837_H */
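
Note: the GPFSELn macros above pack ten pins at three bits per pin into each 32-bit function-select register. The sketch below reproduces that arithmetic for the mini-UART pins (illustrative only; the real macros add pi3_gpio_base_vaddr to form virtual register addresses).

/* GPIO function-select arithmetic for the BCM2837 (illustrative only). */
#define EXAMPLE_FSEL_REG_BYTE_OFFSET(pin) (4u * ((pin) / 10))    /* which GPFSELn  */
#define EXAMPLE_FSEL_BIT_OFFSET(pin)      (((pin) % 10) * 3)     /* field position */
#define EXAMPLE_FSEL_MASK(pin)            (0x7u << EXAMPLE_FSEL_BIT_OFFSET(pin))

_Static_assert(EXAMPLE_FSEL_REG_BYTE_OFFSET(14) == 4,  "pin 14 is in GPFSEL1");
_Static_assert(EXAMPLE_FSEL_BIT_OFFSET(14)      == 12, "pin 14 field starts at bit 12");
_Static_assert(EXAMPLE_FSEL_BIT_OFFSET(15)      == 15, "pin 15 field starts at bit 15");

/* Writing BCM2837_FSEL_ALT5 (0x2) into those fields routes pins 14/15 to the
 * mini-UART TXD/RXD, which is what pi3_uart_init() above does. */
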
index 6bdb8fc40ee3e98cd49858024fff9ecbc9c4c535..49f2b889e51e2c5e509109bcc7814e188a92c7dc 100644 (file)
@@ -8,6 +8,7 @@ include $(MakeInc_def)
 
 DATAFILES = \
        AIC.h \
+       AMCC.h \
        arm64_common.h \
        board_config.h \
        boot.h \
@@ -19,7 +20,8 @@ DATAFILES = \
        cyclone.h \
        typhoon.h \
        twister.h \
-       hurricane.h
+       hurricane.h \
+       BCM2837.h
 
 
 INSTALL_MD_LIST        = ${DATAFILES}
index 7b24690b9c3f1eef1743b8eb48b6f86be4ba4c59..ac3c6d32073bb49eebf97f132b258b6f556a9f9f 100644 (file)
 
 #define ARM64_REG_HID1                                         S3_0_c15_c1_0
 #define ARM64_REG_HID1_disCmpBrFusion                          (1<<14)
+#define ARM64_REG_HID1_rccForceAllIexL3ClksOn                  (1<<23)
 #define ARM64_REG_HID1_rccDisStallInactiveIexCtl               (1<<24)
 #define ARM64_REG_HID1_disLspFlushWithContextSwitch            (1<<25)
 #define ARM64_REG_HID1_disAESFuseAcrossGrp                     (1<<44)
+#define ARM64_REG_HID1_enaBrKillLimit                          (1ULL << 60)
 
 #define ARM64_REG_HID2                                         S3_0_c15_c2_0
 #define ARM64_REG_HID2_disMMUmtlbPrefetch                      (1<<13)
 
 #define ARM64_REG_HID3                                         S3_0_c15_c3_0
-#define ARM64_REG_HID3_DisDcZvaCmdOnly                 (1<<25)
+#define ARM64_REG_HID3_DisDcZvaCmdOnly                         (1<<25)
 #define ARM64_REG_HID3_DisXmonSnpEvictTriggerL2StarvationMode  (1<<54)
+#define ARM64_REG_HID3_DisColorOpt                             (1<<2)
 
 #define ARM64_REG_EHID3                                                S3_0_c15_c3_1
-#define ARM64_REG_EHID3_DisDcZvaCmdOnly                        (1<<25)
+#define ARM64_REG_EHID3_DisColorOpt                            (1<<2)
+#define ARM64_REG_EHID3_DisDcZvaCmdOnly                                (1<<25)
 
 #define ARM64_REG_HID4                                         S3_0_c15_c4_0
 #define ARM64_REG_EHID4                                                S3_0_c15_c4_1
@@ -45,6 +49,7 @@
 #define ARM64_REG_HID5_DisHwpLd                                        (1<<44)
 #define ARM64_REG_HID5_DisHwpSt                                        (1<<45)
 #define ARM64_REG_HID5_DisFullLineWr                           (1ULL << 57)
+#define ARM64_REG_HID5_EnableDnFIFORdStall                     (1ULL << 54)
 #define ARM64_REG_HID5_CrdEdbSnpRsvd_mask                      (3ULL << 14)
 #define ARM64_REG_HID5_CrdEdbSnpRsvd_VALUE                     (2ULL << 14)
 
 #define ARM64_REG_HID10                                                S3_0_c15_c10_0
 #define ARM64_REG_HID10_DisHwpGups                             (1ULL << 0)
 
+#define ARM64_REG_EHID10                                               S3_0_c15_c10_1
+#define ARM64_REG_EHID10_rccDisPwrSavePrfClkOff        (1ULL << 19)
+
 #if defined(APPLECYCLONE) || defined(APPLETYPHOON) || defined(APPLETWISTER)
 #define ARM64_REG_HID11                                                S3_0_c15_c13_0
 #else
 #define ARM64_REG_HID11                                                S3_0_c15_c11_0
 #endif
+#define ARM64_REG_HID11_DisX64NTLnchOpt                                (1ULL << 1)
 #define ARM64_REG_HID11_DisFillC1BubOpt                                (1<<7)
 #define ARM64_REG_HID11_DisFastDrainOpt                                (1ULL << 23)
 
 #define ARM64_REG_CYC_CFG_deepSleep                            (1ULL<<24)
 #else
 #define ARM64_REG_ACC_OVRD                                     S3_5_c15_c6_0
+#if defined(APPLEMONSOON)
+#define ARM64_REG_ACC_EBLK_OVRD                                        S3_5_c15_c6_1   // EBLK_OVRD on Zephyr
+#endif
 #define ARM64_REG_ACC_OVRD_enDeepSleep                         (1ULL << 34)
-
-
+#define ARM64_REG_ACC_OVRD_disPioOnWfiCpu                      (1ULL << 32)
 #define ARM64_REG_ACC_OVRD_dsblClkDtr                          (1ULL << 29)
 #define ARM64_REG_ACC_OVRD_cpmWakeUp_mask                      (3ULL << 27)
 #define ARM64_REG_ACC_OVRD_cpmWakeUp_force                     (3ULL << 27)
 #define ARM64_REG_CYC_OVRD                                     S3_5_c15_c5_0
 #define ARM64_REG_CYC_OVRD_ok2pwrdn_force_up                   (2<<24)
 #define ARM64_REG_CYC_OVRD_ok2pwrdn_force_down                 (3<<24)
+#define ARM64_REG_CYC_OVRD_disWfiRetn                          (1<<0)
 
+#if defined(APPLEMONSOON)
+#define ARM64_REG_CYC_OVRD_dsblSnoopTime_mask                  (3ULL << 30)
+#define ARM64_REG_CYC_OVRD_dsblSnoopPTime                      (1ULL << 31)    /// Don't fetch the timebase from the P-block
+#endif /* APPLEMONSOON */
 
 #define ARM64_REG_LSU_ERR_STS                          S3_3_c15_c0_0
 #define ARM64_REG_LSU_ERR_STS_L1DTlbMultiHitEN (1ULL<<54)
  *      0=>not a p-core, non-zero=>p-core
  */
 .macro ARM64_IS_PCORE
+#if defined(APPLEMONSOON) || HAS_CLUSTER
+       mrs             $0, MPIDR_EL1
+       and             $0, $0, #(MPIDR_PNE)
+#endif
 .endmacro
 
 /*
  * arg3: SPR to use for p-core or non-AMP architecture
  */
 .macro ARM64_READ_EP_SPR
+#if defined(APPLEMONSOON) || HAS_CLUSTER
+       cbnz            $0, 1f
+// e-core
+       mrs             $1, $2
+       b               2f
+// p-core
+1:
+#endif
        mrs             $1, $3
 2:
 .endmacro
  * arg3: SPR to use for p-core or non-AMP architecture
  */
 .macro ARM64_WRITE_EP_SPR
+#if defined(APPLEMONSOON) || HAS_CLUSTER
+       cbnz            $0, 1f
+// e-core
+       msr             $2, $1
+       b               2f
+// p-core
+1:
+#endif
        msr             $3, $1
 2:
 .endmacro
index 0aaefb89856c575060f04ef5c2d54873753d85d2..c7c434d2af4bb498fa5fc7494dee63866d8ddc69 100644 (file)
@@ -5,6 +5,8 @@
 #ifndef _PEXPERT_ARM_BOARD_CONFIG_H
 #define _PEXPERT_ARM_BOARD_CONFIG_H
 
+#include <mach/machine.h>
+
 #ifdef ARM64_BOARD_CONFIG_S5L8960X
 #define APPLE_ARM64_ARCH_FAMILY  1
 #define APPLECYCLONE
@@ -15,6 +17,7 @@
 #define ARM_BOARD_CLASS_S5L8960X
 #define KERNEL_INTEGRITY_WT 1
 #define PEXPERT_NO_3X_IMAGES   1
+#define CORE_NCTRS 8
 #endif  /* ARM64_BOARD_CONFIG_S5L8960X */
 
 #ifdef ARM64_BOARD_CONFIG_T7000
@@ -26,6 +29,7 @@
 #define ARM_BOARD_WFE_TIMEOUT_NS 1000
 #define ARM_BOARD_CLASS_T7000
 #define KERNEL_INTEGRITY_WT 1
+#define CORE_NCTRS 8
 #endif  /* ARM64_BOARD_CONFIG_T7000 */
 
 #ifdef ARM64_BOARD_CONFIG_T7001
@@ -38,6 +42,7 @@
 #define ARM_BOARD_CLASS_T7000
 #define KERNEL_INTEGRITY_WT 1
 #define CPU_COUNT 3
+#define CORE_NCTRS 8
 #endif  /* ARM64_BOARD_CONFIG_T7001 */
 
 #ifdef ARM64_BOARD_CONFIG_S8000
@@ -55,6 +60,7 @@
 #define ARM_BOARD_WFE_TIMEOUT_NS 1000
 #define ARM_BOARD_CLASS_S8000
 #define KERNEL_INTEGRITY_WT 1
+#define CORE_NCTRS 8
 #endif  /* ARM64_BOARD_CONFIG_S8000 */
 
 #ifdef ARM64_BOARD_CONFIG_S8001
@@ -72,6 +78,7 @@
 #define ARM_BOARD_WFE_TIMEOUT_NS 1000
 #define ARM_BOARD_CLASS_S8000
 #define KERNEL_INTEGRITY_WT 1
+#define CORE_NCTRS 8
 #endif  /* ARM64_BOARD_CONFIG_S8001 */
 
 #ifdef ARM64_BOARD_CONFIG_T8010
 #define APPLE_ARM64_ARCH_FAMILY  1
 #define APPLEHURRICANE
 #define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_KTRR
 #include <pexpert/arm64/T8010.h>
 #define __ARM_L2CACHE_SIZE_LOG__ 22
 #define ARM_BOARD_WFE_TIMEOUT_NS 1000
 #define ARM_BOARD_CLASS_T8010
+#define CORE_NCTRS 10
+#if DEVELOPMENT || DEBUG
+#define PMAP_CS                  1
+#define PMAP_CS_ENABLE           0
+#endif
 #endif  /* ARM64_BOARD_CONFIG_T8010 */
 
 #ifdef ARM64_BOARD_CONFIG_T8011
 #define APPLE_ARM64_ARCH_FAMILY  1
 #define APPLEHURRICANE
 #define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_KTRR
 #include <pexpert/arm64/T8010.h>
 #define __ARM_L2CACHE_SIZE_LOG__ 23
 #define ARM_BOARD_WFE_TIMEOUT_NS 1000
 #define ARM_BOARD_CLASS_T8011
 #define CPU_COUNT 3
+#define CORE_NCTRS 10
+#if DEVELOPMENT || DEBUG
+#define PMAP_CS                  1
+#define PMAP_CS_ENABLE           0
+#endif
 #endif  /* ARM64_BOARD_CONFIG_T8011 */
 
+#ifdef ARM64_BOARD_CONFIG_T8015
+/*
+ * The LLC size for monsoon is 8MB, but the L2E exposed to mistral is
+ * only 1MB.  We use the larger cache size here.  The expectation is
+ * that this may cause flushes from mistral to be less efficient
+ * (cycles will be wasted on unnecessary way/set operations), but it
+ * will be technically correct... the best kind of correct.
+ *
+ * And is an explicit flush from L2E to LLC something we'll ever want
+ * to do?
+ */
+#define APPLE_ARM64_ARCH_FAMILY  1
+#define APPLEMONSOON
+#define ARM_ARCH_TIMER
+#define KERNEL_INTEGRITY_KTRR
+#include <pexpert/arm64/T8015.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 23
+#define ARM_BOARD_WFE_TIMEOUT_NS 1000
+#define ARM_BOARD_CLASS_T8015
+#define CPU_COUNT 6
+#define BROKEN_FRIGGING_SLEEP 1 /* Spurious wake: See rdar://problem/29762505 */
+#define HAS_UNCORE_CTRS 1
+#define UNCORE_VERSION 1
+#define UNCORE_PER_CLUSTER 0
+#define UNCORE_NCTRS 8
+#define CORE_NCTRS 10
+#if DEVELOPMENT || DEBUG
+#define PMAP_CS                  1
+#define PMAP_CS_ENABLE           0
+#endif
+#endif  /* ARM64_BOARD_CONFIG_T8015 */
+
 
 
 
 
 
+#ifdef ARM64_BOARD_CONFIG_BCM2837
+#define BCM2837
+#define BCM2837_BRINGUP
+#define ARM_ARCH_TIMER
+#include <pexpert/arm64/BCM2837.h>
+#define __ARM_L2CACHE_SIZE_LOG__ 19
+#define ARM_BOARD_CLASS_BCM2837
+#define CPU_COUNT 4
+#endif  /* ARM64_BOARD_CONFIG_BCM2837 */
+
 #endif /* ! _PEXPERT_ARM_BOARD_CONFIG_H */
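
Note: __ARM_L2CACHE_SIZE_LOG__ is the log2 of the last-level cache size that the cache-maintenance way/set loops are sized from (see the Monsoon LLC comment above), so the configs here decode as in this sketch: T8010 gets 1 << 22 = 4 MiB, T8011 and T8015 get 1 << 23 = 8 MiB, and BCM2837 gets 1 << 19 = 512 KiB.

/* Illustrative decoding of __ARM_L2CACHE_SIZE_LOG__ values used above. */
#include <stdint.h>

static inline uint64_t
example_l2cache_bytes(unsigned log2_size)
{
    return 1ULL << log2_size;
}

_Static_assert((1ULL << 23) == 8u * 1024 * 1024, "T8011/T8015 LLC: 8 MiB");
_Static_assert((1ULL << 22) == 4u * 1024 * 1024, "T8010 L2: 4 MiB");
_Static_assert((1ULL << 19) == 512u * 1024,      "BCM2837 L2: 512 KiB");
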
index c665c47214e30b3a37bba32d707b5e82bcd2cf3f..653b8252d358b0186bc5def1bd7921a92fef3468 100644 (file)
@@ -33,7 +33,7 @@ struct Boot_Video {
 #define kBootVideoDepthRotateShift     (8)
 #define kBootVideoDepthScaleShift      (16)
 
-#define kBootFlagsDarkBoot             (1 << 0)
+#define kBootFlagsDarkBoot             (1ULL << 0)
 
 typedef struct Boot_Video      Boot_Video;
 
index eac7336cf0e37f6724836a186d3031deaf7beca8..df4dd8db749029baaa1704d218584892b98a7f1a 100644 (file)
@@ -92,6 +92,8 @@ uint32_t PE_i_can_has_debugger(
        uint32_t *);
 
 #if defined(__arm__) || defined(__arm64__)
+boolean_t PE_panic_debugging_enabled(void);
+
 void PE_mark_hwaccess(uint64_t thread);
 #endif /* defined(__arm__) || defined(__arm64__) */
 
@@ -173,6 +175,8 @@ struct clock_frequency_info_t {
   unsigned long long fix_frequency_hz;
 };
 
+extern int debug_cpu_performance_degradation_factor;
+
 typedef struct clock_frequency_info_t clock_frequency_info_t;
 
 extern clock_frequency_info_t gPEClockFrequencyInfo;
@@ -396,6 +400,8 @@ extern void PE_arm_debug_enable_trace(void);
 extern uint8_t PE_smc_stashed_x86_power_state;
 extern uint8_t PE_smc_stashed_x86_efi_boot_state;
 extern uint8_t PE_smc_stashed_x86_system_state;
+extern uint8_t PE_smc_stashed_x86_shutdown_cause;
+extern uint64_t PE_smc_stashed_x86_prev_power_transitions;
 extern uint32_t PE_pcie_stashed_link_state;
 #endif
 
index 864ea94246035386b25b5b315b49bb4725754969..4372d6af71e3455183791c99b7f33bdd4b8698fd 100644 (file)
@@ -99,6 +99,7 @@ ___asan_version_mismatch_check_v8
 ___asan_version_mismatch_check_apple_802
 ___asan_version_mismatch_check_apple_900
 ___asan_version_mismatch_check_apple_902
+___asan_version_mismatch_check_apple_1000
 ___asan_init
 ___asan_memcpy
 ___asan_memmove
index 75a98ec4cc98c45629f0907055f9d1be6b2cdcd3..e8c09216728238a988b5c3c496088b6b0b417357 100644 (file)
@@ -31,7 +31,7 @@ EXPORT_MI_DIR = san
 COMP_SUBDIRS = conf
 
 .DELETE_ON_ERROR:
-$(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/kasan-blacklist-%
+$(OBJROOT)/san/kasan-blacklist-%: $(SOURCE)/kasan-blacklist $(SOURCE)/ubsan-blacklist $(SOURCE)/kasan-blacklist-%
        @echo "$(ColorH)GENERATING$(Color0)    $(ColorLF)$(notdir $@)$(Color0)"
        $(_v)sed -e 's,^src:\./,src:'"$(SRCROOT)/," $^ > $@
        $(_v)$(SOURCE)/tools/validate_blacklist.sh "$@"
diff --git a/san/conf/Makefile.arm b/san/conf/Makefile.arm
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/san/conf/Makefile.arm64 b/san/conf/Makefile.arm64
new file mode 100644 (file)
index 0000000..e69de29
index 5edceed1713dbe255a1253edd53eb0a7c4849fee..8c60bc15bd5ba9c4efffaf2b948c8c1c0c4b140d 100644 (file)
@@ -47,11 +47,17 @@ COMP_SUBDIRS =
 # Rebuild if per-file overrides change
 ${OBJS}: $(firstword $(MAKEFILE_LIST))
 
-ifneq ($(KASAN),1)
-# nothing to build for non-KASAN
+# set file list manually
 OBJS =
-COBJS =
-SOBJS =
+
+ifeq ($(KASAN),1)
+OBJS += kasan.o kasan-fakestack.o kasan-memintrinsics.o kasan_dynamic_blacklist.o
+OBJS += kasan-$(CURRENT_ARCH_CONFIG_LC).o
+OBJS += kasan-test.o kasan-test-$(CURRENT_ARCH_CONFIG_LC).o
+endif
+
+ifeq ($(UBSAN),1)
+OBJS += ubsan.o ubsan_log.o
 endif
 
 # Rebuild if global compile flags change
index 30036fb3e6c13296402ca1bfc73f96697b227447..0c312a11ff87b04e0bb97b8588b73bf1207b5e8e 100644 (file)
@@ -3,3 +3,5 @@ san/kasan-fakestack.c standard
 san/kasan-test.c standard
 san/kasan-memintrinsics.c standard
 san/kasan_dynamic_blacklist.c standard
+san/ubsan.c standard
+san/ubsan_log.c standard
diff --git a/san/conf/files.arm b/san/conf/files.arm
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/san/conf/files.arm64 b/san/conf/files.arm64
new file mode 100644 (file)
index 0000000..4303b85
--- /dev/null
@@ -0,0 +1,3 @@
+# KASAN
+san/kasan-arm64.c standard
+san/kasan-test-arm64.s standard
index 77ee449a803c5ba6623a6a8085c1b36092d03706..056e531c44e716a14ad2db2a43f9b32ab6f108c0 100644 (file)
@@ -51,7 +51,9 @@
 
 extern uint64_t *cpu_tte;
 extern unsigned long gVirtBase, gPhysBase;
-#define phystokv(a) ((vm_address_t)(a) - gPhysBase + gVirtBase)
+
+typedef uint64_t pmap_paddr_t;
+extern vm_map_address_t phystokv(pmap_paddr_t pa);
 
 vm_offset_t physmap_vbase;
 vm_offset_t physmap_vtop;
@@ -111,10 +113,12 @@ align_to_page(vm_offset_t *addrp, vm_offset_t *sizep)
 static void
 kasan_map_shadow_internal(vm_offset_t address, vm_size_t size, bool is_zero, bool back_page)
 {
+       size = (size + 0x7UL) & ~0x7UL;
        vm_offset_t shadow_base = vm_map_trunc_page(SHADOW_FOR_ADDRESS(address), ARM_PGMASK);
        vm_offset_t shadow_top = vm_map_round_page(SHADOW_FOR_ADDRESS(address + size), ARM_PGMASK);
 
        assert(shadow_base >= KASAN_SHADOW_MIN && shadow_top <= KASAN_SHADOW_MAX);
+       assert((size & 0x7) == 0);
 
        for (; shadow_base < shadow_top; shadow_base += ARM_PGBYTES) {
                uint64_t *base = cpu_tte;
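
Note: KASan describes memory at an 8-byte granule (one shadow byte per eight bytes of address space), so the rounding added above guarantees that the new (size & 0x7) == 0 assertion holds. A minimal sketch of the rounding:

/* Round a mapping size up to the 8-byte shadow granule (illustrative only). */
#include <stdint.h>

static inline uint64_t
example_round_to_shadow_granule(uint64_t size)
{
    return (size + 0x7ULL) & ~0x7ULL;
}

_Static_assert(((1 + 0x7ULL) & ~0x7ULL) == 8,  "1 byte rounds up to 8");
_Static_assert(((8 + 0x7ULL) & ~0x7ULL) == 8,  "8 bytes stays 8");
_Static_assert(((9 + 0x7ULL) & ~0x7ULL) == 16, "9 bytes rounds up to 16");
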
index cbef48e41e1a7fb10bdcee9aaf51f853fd6e5adc..48ce86d46c0dccb2aa04344f649e31c124c810ac 100644 (file)
@@ -3,6 +3,17 @@
 # the AddressSanitizer runtime itself, or because the code executes before
 # the runtime has been initialized.
 
+[.*]
+
+# Blanket ignore non-sanitized functions
+fun:__nosan_*
+
+# Try really hard to avoid panicing while debugging
+src:./osfmk/kdp/*
+src:./osfmk/kern/debug.c
+
+[address]
+
 # Exclude linker sets
 type:struct linker_set_entry
 type:linker_set_entry
@@ -14,13 +25,6 @@ src:./san/kasan-x86_64.c
 src:./san/kasan-memintrinsics.c
 src:./san/kasan_dynamic_blacklist.c
 
-# Blanket ignore non-sanitized functions
-fun:__nosan_*
-
-# Try really hard to avoid panicing while debugging
-src:./osfmk/kdp/*
-src:./osfmk/kern/debug.c
-
 # Exclude dtrace function that does weird stack manipulations
 fun:fbt_perfCallback
 
@@ -30,4 +34,5 @@ fun:_ZL18IOTrackingLeakScanPv
 # Exclude KASAN dependencies
 # XXX: could this be relaxed since fakestack is reentrant?
 src:./osfmk/kern/zalloc.c
+src:./osfmk/kern/zcache.c
 
index 9ef0c3aabc8b8cd65ee42b84ef6396a3049ef0e7..6f1fe4f8bca38c567e351659a8720c9b9e557089 100644 (file)
@@ -1,5 +1,7 @@
 # ARM64 specific blacklist
 
+[address]
+
 # Exclude KASan runtime
 src:./san/kasan-arm64.c
 
index bd1704d3030521fff10330a9d88877cc970d2183..517bce143eb906abecae06c5903e99e94dc2b4ec 100644 (file)
@@ -1,5 +1,7 @@
 # x86_64 specific blacklist
 
+[address]
+
 # Early boot AUTOGEN
 src:./bsd/kern/kdebug.c
 src:./bsd/kern/kern_csr.c
@@ -11,6 +13,7 @@ src:./osfmk/i386/acpi.c
 src:./osfmk/i386/cpu.c
 src:./osfmk/i386/i386_init.c
 src:./osfmk/i386/locks_i386.c
+src:./osfmk/i386/locks_i386_opt.c
 src:./osfmk/i386/machine_routines.c
 src:./osfmk/i386/mp.c
 src:./osfmk/i386/mtrr.c
@@ -19,7 +22,7 @@ src:./osfmk/i386/panic_hooks.c
 src:./osfmk/i386/rtclock.c
 src:./osfmk/i386/vmx/vmx_cpu.c
 src:./osfmk/kern/locks.c
-src:./osfmk/prng/random.c
+src:./osfmk/prng/prng_random.c
 src:./osfmk/x86_64/loose_ends.c
 src:./pexpert/gen/bootargs.c
 src:./pexpert/gen/device_tree.c
index b023ded1c948529a759ea00ed8039671c119f505..0680f08584194040070b6b888058e984cab7a5eb 100644 (file)
@@ -47,8 +47,9 @@ int fakestack_enabled = 0;
 #define FAKESTACK_HEADER_SZ 64
 #define FAKESTACK_NUM_SZCLASS 7
 
-#define FAKESTACK_FREED     0 /* forced by clang */
+#define FAKESTACK_UNUSED    0 /* waiting to be collected at next gc - forced by clang */
 #define FAKESTACK_ALLOCATED 1
+#define FAKESTACK_FREED     2
 
 #if FAKESTACK
 
@@ -120,29 +121,38 @@ ptr_is_on_stack(uptr ptr)
 }
 
 /* free all unused fakestack entries */
-static void NOINLINE
+void
 kasan_fakestack_gc(thread_t thread)
 {
        struct fakestack_header *cur, *tmp;
        LIST_HEAD(, fakestack_header) tofree = LIST_HEAD_INITIALIZER(tofree);
 
-       /* move all the freed elements off the main list */
+       boolean_t flags;
+       if (!thread_enter_fakestack(&flags)) {
+               panic("expected success entering fakestack\n");
+       }
+
+       /* move the unused objects off the per-thread list... */
        struct fakestack_header_list *head = &kasan_get_thread_data(thread)->fakestack_head;
        LIST_FOREACH_SAFE(cur, head, list, tmp) {
-               if (cur->flag == FAKESTACK_FREED) {
+               if (cur->flag == FAKESTACK_UNUSED) {
                        LIST_REMOVE(cur, list);
                        LIST_INSERT_HEAD(&tofree, cur, list);
+                       cur->flag = FAKESTACK_FREED;
                }
        }
 
+       kasan_unlock(flags);
+
        /* ... then actually free them */
        LIST_FOREACH_SAFE(cur, &tofree, list, tmp) {
+               LIST_REMOVE(cur, list);
+
                zone_t zone = fakestack_zones[cur->sz_class];
                size_t sz = (fakestack_min << cur->sz_class) + FAKESTACK_HEADER_SZ;
-               LIST_REMOVE(cur, list);
 
                void *ptr = (void *)cur;
-               kasan_free_internal(&ptr, &sz, KASAN_HEAP_FAKESTACK, &zone, cur->realsz, 1, FAKESTACK_QUARANTINE);
+               kasan_free_internal(&ptr, &sz, KASAN_HEAP_FAKESTACK, &zone, cur->realsz, 0, FAKESTACK_QUARANTINE);
                if (ptr) {
                        zfree(zone, ptr);
                }
@@ -179,8 +189,6 @@ kasan_fakestack_alloc(int sz_class, size_t realsz)
                return 0;
        }
 
-       kasan_fakestack_gc(current_thread()); /* XXX: optimal? */
-
        ret = (uptr)zget(zone);
 
        if (ret) {
@@ -241,7 +249,7 @@ kasan_fakestack_free(int sz_class, uptr dst, size_t realsz)
 }
 
 void NOINLINE
-kasan_unpoison_fakestack(thread_t thread)
+kasan_fakestack_drop(thread_t thread)
 {
        boolean_t flags;
        if (!thread_enter_fakestack(&flags)) {
@@ -252,11 +260,10 @@ kasan_unpoison_fakestack(thread_t thread)
        struct fakestack_header *cur;
        LIST_FOREACH(cur, head, list) {
                if (cur->flag == FAKESTACK_ALLOCATED) {
-                       cur->flag = FAKESTACK_FREED;
+                       cur->flag = FAKESTACK_UNUSED;
                }
        }
 
-       kasan_fakestack_gc(thread);
        kasan_unlock(flags);
 }
 
index 6dc379c1a0b4d9b2f718cb26dc6bb191a350f970..672a6645e1ece0114fabfca0991896f634f60759 100644 (file)
@@ -34,6 +34,7 @@
 #include <kern/kalloc.h>
 #include <kern/simple_lock.h>
 #include <kern/debug.h>
+#include <kern/thread.h>
 #include <mach/mach_vm.h>
 #include <mach/vm_param.h>
 #include <libkern/libkern.h>
@@ -434,13 +435,13 @@ static int test_strncat(struct kasan_test *t)
 }
 
 /* we ignore the top *two* frames in backtrace - so add an extra one */
-static int __attribute__((noinline))
+static int OS_NOINLINE
 test_blacklist_helper(void)
 {
        return kasan_is_blacklisted(TYPE_TEST);
 }
 
-static int __attribute__((noinline))
+static int OS_NOINLINE
 test_blacklist(struct kasan_test *t)
 {
        TEST_START(t);
@@ -449,7 +450,7 @@ test_blacklist(struct kasan_test *t)
        return 0;
 }
 
-static int __attribute__((noinline))
+static int OS_NOINLINE
 test_blacklist_str(struct kasan_test *t)
 {
        TEST_START(t);
@@ -482,6 +483,50 @@ static int test_strnlen(struct kasan_test *t)
 }
 #endif
 
+static void OS_NOINLINE
+force_fakestack(char *x)
+{
+       __asm__ __volatile__("" :: "r" (x) : "memory");
+}
+
+OS_NOINLINE
+static int
+test_fakestack_helper(struct kasan_test *t, char *x)
+{
+       TEST_START(t);
+
+       x[0] = 0x55;
+
+       /* ensure that 'x' is on the fakestack */
+       uintptr_t base = dtrace_get_kernel_stack(current_thread());
+       uintptr_t p = (uintptr_t)x;
+       if (p >= base && p < base + kernel_stack_size) {
+               return 1;
+       }
+
+       __asan_handle_no_return();
+
+       /* x better still be accessible */
+       TEST_NOFAULT(t);
+       if (x[0] != 0x55) {
+               TEST_DONE(t, 1);
+       }
+
+       TEST_DONE(t, 0);
+       return 0;
+}
+
+static int
+test_fakestack(struct kasan_test *t)
+{
+       char x[8];
+       if (!fakestack_enabled) {
+               return 1;
+       }
+       force_fakestack(x);
+       return test_fakestack_helper(t, x);
+}
+
 int *uaf_ptr;
 static int * NOINLINE
 stack_uaf_helper(void)
@@ -524,6 +569,7 @@ static struct kasan_test xnu_tests[] = {
        DECLARE_TEST(test_strncat,         "strncat"),
        DECLARE_TEST(test_blacklist,       "blacklist"),
        DECLARE_TEST(test_blacklist_str,   "blacklist_str"),
+       DECLARE_TEST(test_fakestack,       "fakestack"),
        // DECLARE_TEST(test_strnlen,         "strnlen"),
 };
 static int num_xnutests = sizeof(xnu_tests)/sizeof(xnu_tests[0]);
@@ -557,11 +603,6 @@ kasan_run_test(struct kasan_test *test_list, int testno, int fail)
                        status = TEST_FAIL_NOFAULT;
                }
        } else {
-               /* Triggering a KASan violation will return here by longjmp, bypassing
-                * stack unpoisoning, so do it here explicitly. We just hope that
-                * fakestack free will happen later... */
-               kasan_unpoison_curstack(true);
-
                if (t->result) {
                        /* faulted, but at the wrong place */
                        printf("KASan: test.%02d FAIL %d (%s)\n", testno, t->result, t->name);
index e2cb6d3bdd0beb85e1921a720a501498edd43436..4b685e67ff4f930d256b36223d8aec9c2968e4cd 100644 (file)
@@ -194,9 +194,12 @@ kasan_map_shadow_superpage_zero(vm_offset_t address, vm_size_t size)
 void
 kasan_map_shadow(vm_offset_t address, vm_size_t size, bool is_zero)
 {
+       size = (size + 0x7UL) & ~0x7UL;
        vm_offset_t shadow_base = vm_map_trunc_page(SHADOW_FOR_ADDRESS(address), PAGE_MASK);
        vm_offset_t shadow_top = vm_map_round_page(SHADOW_FOR_ADDRESS(address + size), PAGE_MASK);
 
+       assert((size & 0x7) == 0);
+
        for (; shadow_base < shadow_top; shadow_base += I386_PGBYTES) {
 
                split_addr_t addr = split_address(shadow_base);
index 01faa38014c090cfc79c03e74026f98d94827215..a34d479aa235c0fbc8f3b206d161e2399ae95787 100644 (file)
@@ -299,7 +299,7 @@ kasan_check_range(const void *x, size_t sz, access_t access)
 /*
  * Return true if [base, base+sz) is unpoisoned or has given shadow value.
  */
-static bool
+bool
 kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow)
 {
        sz -= 8 - (base % 8);
@@ -371,7 +371,7 @@ kasan_shadow_crashlog(uptr p, char *buf, size_t len)
 
        uptr shadow = (uptr)SHADOW_FOR_ADDRESS(p);
        uptr shadow_p = shadow;
-       uptr shadow_page = vm_map_round_page(shadow_p, PAGE_MASK);
+       uptr shadow_page = vm_map_round_page(shadow_p, HW_PAGE_MASK);
 
        /* rewind to start of context block */
        shadow &= ~((uptr)0xf);
@@ -381,7 +381,7 @@ kasan_shadow_crashlog(uptr p, char *buf, size_t len)
                        " Shadow             0  1  2  3  4  5  6  7  8  9  a  b  c  d  e  f\n");
 
        for (i = 0; i < 1 + before + after; i++, shadow += 16) {
-               if ((vm_map_round_page(shadow, PAGE_MASK) != shadow_page) && !kasan_is_shadow_mapped(shadow)) {
+               if ((vm_map_round_page(shadow, HW_PAGE_MASK) != shadow_page) && !kasan_is_shadow_mapped(shadow)) {
                        /* avoid unmapped shadow when crossing page boundaries */
                        continue;
                }
@@ -518,7 +518,12 @@ void NOINLINE
 __asan_handle_no_return(void)
 {
        kasan_unpoison_curstack(false);
-       kasan_unpoison_fakestack(current_thread());
+
+       /*
+        * No need to free any fakestack objects because they must stay alive until
+        * we drop the real stack, at which point we can drop the entire fakestack
+        * anyway.
+        */
 }
 
 bool NOINLINE
@@ -1258,17 +1263,17 @@ kasan_traverse_mappings(pmap_traverse_callback cb, void *ctx)
 {
        uintptr_t shadow_base = (uintptr_t)SHADOW_FOR_ADDRESS(VM_MIN_KERNEL_AND_KEXT_ADDRESS);
        uintptr_t shadow_top = (uintptr_t)SHADOW_FOR_ADDRESS(VM_MAX_KERNEL_ADDRESS);
-       shadow_base = vm_map_trunc_page(shadow_base, PAGE_MASK);
-       shadow_top = vm_map_round_page(shadow_top, PAGE_MASK);
+       shadow_base = vm_map_trunc_page(shadow_base, HW_PAGE_MASK);
+       shadow_top = vm_map_round_page(shadow_top, HW_PAGE_MASK);
 
        uintptr_t start = 0, end = 0;
 
-       for (uintptr_t addr = shadow_base; addr < shadow_top; addr += PAGE_SIZE) {
+       for (uintptr_t addr = shadow_base; addr < shadow_top; addr += HW_PAGE_SIZE) {
                if (kasan_is_shadow_mapped(addr)) {
                        if (start == 0) {
                                start = addr;
                        }
-                       end = addr + PAGE_SIZE;
+                       end = addr + HW_PAGE_SIZE;
                } else if (start && end) {
                        cb(start, end, ctx);
                        start = end = 0;
@@ -1307,6 +1312,7 @@ UNUSED_ABI(__asan_version_mismatch_check_v8, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_802, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_900, void);
 UNUSED_ABI(__asan_version_mismatch_check_apple_902, void);
+UNUSED_ABI(__asan_version_mismatch_check_apple_1000, void);
 
 void UNSUPPORTED_API(__asan_init_v5, void);
 void UNSUPPORTED_API(__asan_register_globals, uptr a, uptr b);
index 102a31468ab6e630537e401194baf9af857c5527..4682692a8a5db2a5a9e9fcae00ed7eb5e7014d2d 100644 (file)
@@ -46,6 +46,7 @@ typedef uintptr_t uptr;
 
 #if KASAN
 
+#define KASAN_DEBUG  0
 #define KASAN_KALLOC 1
 #define KASAN_ZALLOC 1
 #define KASAN_DYNAMIC_BLACKLIST 1
@@ -101,8 +102,10 @@ void kasan_notify_address(vm_offset_t address, vm_size_t size);
 void kasan_notify_address_nopoison(vm_offset_t address, vm_size_t size);
 void kasan_unpoison_stack(vm_offset_t stack, vm_size_t size);
 void kasan_unpoison_curstack(bool whole_stack);
-void kasan_unpoison_fakestack(thread_t thread);
+bool kasan_check_shadow(vm_address_t base, vm_size_t sz, uint8_t shadow);
 
+void kasan_fakestack_drop(thread_t thread); /* mark all fakestack entries for thread as unused */
+void kasan_fakestack_gc(thread_t thread);   /* free and poison all unused fakestack objects for thread */
 void kasan_fakestack_suspend(void);
 void kasan_fakestack_resume(void);
 
@@ -126,6 +129,7 @@ extern unsigned shadow_stolen_idx;
 extern vm_offset_t shadow_pnext, shadow_ptop;
 #endif
 #endif
+
 /*
  * Allocator hooks
  */
index f4ad0fa059969246e6e6cf9c8313ba3248524a78..983b83576aea0eddb76f00298435a6fc601abe0d 100644 (file)
@@ -350,7 +350,7 @@ addr_to_func(uintptr_t addr, const kernel_mach_header_t *mh)
        return cur_name;
 }
 
-bool __attribute__((noinline))
+bool OS_NOINLINE
 kasan_is_blacklisted(access_t type)
 {
        uint32_t nframes = 0;
index c696abe927c1828e9266e5fa0689e48a8650c4dd..f593fbbbae5dd6ea3e2350fa4e04cc04e9d9da59 100644 (file)
@@ -40,7 +40,6 @@ typedef uintptr_t uptr;
 /*
  * KASAN features and config
  */
-#define KASAN_DEBUG   0
 #define FAKESTACK     1
 /* KASAN_KALLOC defined in kasan.h */
 /* KASAN_ZALLOC defined in kasan.h */
@@ -57,9 +56,13 @@ typedef uintptr_t uptr;
 /* Works out at about 25% of 512 MiB and 15% of 3GiB system */
 # define STOLEN_MEM_PERCENT  13UL
 # define STOLEN_MEM_BYTES    MiB(62)
+# define HW_PAGE_SIZE        (ARM_PGBYTES)
+# define HW_PAGE_MASK        (ARM_PGMASK)
 #else
 # define STOLEN_MEM_PERCENT  25UL
 # define STOLEN_MEM_BYTES    0
+# define HW_PAGE_SIZE        (PAGE_SIZE)
+# define HW_PAGE_MASK        (PAGE_MASK)
 #endif
 
 /* boot-args */
@@ -81,7 +84,7 @@ typedef uintptr_t uptr;
 #define SHADOW_FOR_ADDRESS(x) (uint8_t *)(((x) >> 3) + KASAN_SHIFT)
 
 #if KASAN_DEBUG
-# define NOINLINE __attribute__ ((noinline))
+# define NOINLINE OS_NOINLINE
 #else
 # define NOINLINE
 #endif
@@ -191,7 +194,7 @@ struct asan_global {
 #endif
 
 typedef int jmp_buf[_JBLEN];
-void _longjmp(jmp_buf env, int val);
-int _setjmp(jmp_buf env);
+void _longjmp(jmp_buf env, int val) OS_NORETURN;
+int _setjmp(jmp_buf env) __attribute__((returns_twice));
 
 #endif /* _KASAN_INTERNAL_H_ */
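
Note: SHADOW_FOR_ADDRESS(x) above maps every 8-byte granule of kernel address space to a single shadow byte at (x >> 3) + KASAN_SHIFT. A standalone sketch with a placeholder shift value (the real KASAN_SHIFT is per-architecture):

/* Shadow-address computation (illustrative only; placeholder shift). */
#include <stdint.h>

#define EXAMPLE_KASAN_SHIFT 0x1000000000ULL   /* placeholder offset */

static inline uint8_t *
example_shadow_for_address(uintptr_t addr)
{
    return (uint8_t *)((addr >> 3) + EXAMPLE_KASAN_SHIFT);
}

/* Two addresses in the same 8-byte granule share a shadow byte:
 * example_shadow_for_address(x) == example_shadow_for_address(x + 7)
 * whenever (x & 7) == 0. */
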
diff --git a/san/ubsan-blacklist b/san/ubsan-blacklist
new file mode 100644 (file)
index 0000000..2e48edf
--- /dev/null
@@ -0,0 +1,9 @@
+[.*]
+src:./san/ubsan*
+
+[alignment]
+
+src:./libsa/bootstrap.cpp
+src:./bsd/net/necp_client.c
+src:./pexpert/arm/pe_identify_machine.c
+
diff --git a/san/ubsan.c b/san/ubsan.c
new file mode 100644 (file)
index 0000000..0364a41
--- /dev/null
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <stdatomic.h>
+#include <kern/debug.h>
+#include <libkern/libkern.h>
+#include "ubsan.h"
+
+static const bool ubsan_print = false;
+static const uint32_t line_acquired = 0x80000000UL;
+
+static size_t
+format_loc(struct san_src_loc *loc, char *dst, size_t sz)
+{
+       return snprintf(dst, sz, "  loc: %s:%d:%d\n",
+                       loc->filename,
+                       loc->line & ~line_acquired,
+                       loc->col
+               );
+}
+
+/*
+ * return true for the first visit to this loc, false every subsequent time
+ */
+static bool
+ubsan_loc_acquire(struct san_src_loc *loc)
+{
+       uint32_t line = loc->line;
+       if (line & line_acquired) {
+               return false;
+       }
+       uint32_t acq = line | line_acquired;
+       return atomic_compare_exchange_strong((_Atomic uint32_t *)&loc->line, &line, acq);
+}
+
+static const char *const
+overflow_str[] = {
+       NULL,
+       "add",
+       "sub",
+       "mul",
+       "divrem",
+       "negate",
+       NULL
+};
+
+static size_t
+format_overflow(struct ubsan_violation *v, char *buf, size_t sz)
+{
+       struct san_type_desc *ty = v->overflow->ty;
+       return snprintf(buf, sz,
+                       "%s overflow, op = %s, ty = %s, width = %d, lhs = 0x%llx, rhs = 0x%llx\n",
+                       ty->issigned ? "signed" : "unsigned",
+                       overflow_str[v->ubsan_type],
+                       ty->name,
+                       1 << ty->width,
+                       v->lhs,
+                       v->rhs
+               );
+}
+
+static size_t
+format_shift(struct ubsan_violation *v, char *buf, size_t sz)
+{
+       size_t n = 0;
+       struct san_type_desc *l = v->shift->lhs_t;
+       struct san_type_desc *r = v->shift->rhs_t;
+
+       n += snprintf(buf+n, sz-n, "bad shift\n");
+       n += snprintf(buf+n, sz-n, "  lhs: 0x%llx, ty = %s, signed = %d, width = %d\n", v->lhs, l->name, l->issigned, 1 << l->width);
+       n += snprintf(buf+n, sz-n, "  rhs: 0x%llx, ty = %s, signed = %d, width = %d\n", v->rhs, r->name, r->issigned, 1 << r->width);
+
+       return n;
+}
+
+static const char *const
+align_kinds[] = {
+       "load",
+       "store",
+       "<unknown>",
+       "member access",
+       "<unknown>",
+};
+
+static size_t
+format_alignment(struct ubsan_violation *v, char *buf, size_t sz)
+{
+       size_t n = 0;
+       struct san_type_desc *ty = v->align->ty;
+
+       n += snprintf(buf+n, sz-n, "mis-aligned %s of 0x%llx\n", align_kinds[v->align->kind], v->lhs);
+       n += snprintf(buf+n, sz-n, "  expected %d-byte alignment, type = %s\n",
+                       1 << v->align->align, ty->name);
+       return n;
+}
+
+static size_t
+format_oob(struct ubsan_violation *v, char *buf, size_t sz)
+{
+       size_t n = 0;
+       struct san_type_desc *aty = v->oob->array_ty;
+       struct san_type_desc *ity = v->oob->index_ty;
+       uintptr_t idx = v->lhs;
+
+       n += snprintf(buf+n, sz-n, "OOB array access\n");
+       n += snprintf(buf+n, sz-n, "  idx %ld\n", idx);
+       n += snprintf(buf+n, sz-n, "  aty: ty = %s, signed = %d, width = %d\n", aty->name, aty->issigned, 1 << aty->width);
+       n += snprintf(buf+n, sz-n, "  ity: ty = %s, signed = %d, width = %d\n", ity->name, ity->issigned, 1 << ity->width);
+
+       return n;
+}
+
+size_t
+ubsan_format(struct ubsan_violation *v, char *buf, size_t sz)
+{
+       size_t n = 0;
+
+       switch (v->ubsan_type) {
+       case UBSAN_OVERFLOW_add ... UBSAN_OVERFLOW_negate:
+               n += format_overflow(v, buf+n, sz-n);
+               break;
+       case UBSAN_UNREACHABLE:
+               n += snprintf(buf+n, sz-n, "unreachable\n");
+               break;
+       case UBSAN_SHIFT:
+               n += format_shift(v, buf+n, sz-n);
+               break;
+       case UBSAN_ALIGN:
+               n += format_alignment(v, buf+n, sz-n);
+               break;
+       case UBSAN_POINTER_OVERFLOW:
+               n += snprintf(buf+n, sz-n, "pointer overflow, before = 0x%llx, after = 0x%llx\n", v->lhs, v->rhs);
+               break;
+       case UBSAN_OOB:
+               n += format_oob(v, buf+n, sz-n);
+               break;
+       default:
+               panic("unknown violation");
+       }
+
+       n += format_loc(v->loc, buf+n, sz-n);
+
+       return n;
+}
+
+static void
+ubsan_handle(struct ubsan_violation *v, bool fatal)
+{
+       const size_t sz = 256;
+       static char buf[sz];
+       size_t n = 0;
+       buf[0] = '\0';
+
+       if (!ubsan_loc_acquire(v->loc)) {
+               /* violation site already reported */
+               return;
+       }
+
+       ubsan_log_append(v);
+
+       if (ubsan_print || fatal) {
+               n += ubsan_format(v, buf+n, sz-n);
+       }
+
+       if (ubsan_print) {
+               printf("UBSan: %s", buf);
+       }
+
+       if (fatal) {
+               panic("UBSan: %s", buf);
+       }
+}
+
+void
+__ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *desc)
+{
+       struct ubsan_violation v = { UBSAN_UNREACHABLE, 0, 0, .unreachable = desc, &desc->loc };
+       ubsan_handle(&v, true);
+}
+
+void
+__ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *desc, uint64_t lhs, uint64_t rhs)
+{
+       struct ubsan_violation v = { UBSAN_SHIFT, lhs, rhs, .shift = desc, &desc->loc };
+       ubsan_handle(&v, false);
+}
+
+void
+__ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *desc, uint64_t lhs, uint64_t rhs)
+{
+       struct ubsan_violation v = { UBSAN_SHIFT, lhs, rhs, .shift = desc, &desc->loc };
+       ubsan_handle(&v, true);
+}
+
+#define DEFINE_OVERFLOW(op) \
+       void __ubsan_handle_##op##_overflow(struct ubsan_overflow_desc *desc, uint64_t lhs, uint64_t rhs) { \
+               struct ubsan_violation v = { UBSAN_OVERFLOW_##op, lhs, rhs, .overflow = desc, &desc->loc }; \
+               ubsan_handle(&v, false); \
+       } \
+       void __ubsan_handle_##op##_overflow_abort(struct ubsan_overflow_desc *desc, uint64_t lhs, uint64_t rhs) { \
+               struct ubsan_violation v = { UBSAN_OVERFLOW_##op, lhs, rhs, .overflow = desc, &desc->loc }; \
+               ubsan_handle(&v, true); \
+       }
+
+DEFINE_OVERFLOW(add)
+DEFINE_OVERFLOW(sub)
+DEFINE_OVERFLOW(mul)
+DEFINE_OVERFLOW(divrem)
+DEFINE_OVERFLOW(negate)
+
+void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *desc, uint64_t val)
+{
+       struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc };
+       ubsan_handle(&v, false);
+}
+
+void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *desc, uint64_t val)
+{
+       struct ubsan_violation v = { UBSAN_ALIGN, val, 0, .align = desc, &desc->loc };
+       ubsan_handle(&v, true);
+}
+
+void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *desc, uint64_t before, uint64_t after)
+{
+       struct ubsan_violation v = { UBSAN_POINTER_OVERFLOW, before, after, .ptroverflow = desc, &desc->loc };
+       ubsan_handle(&v, false);
+}
+
+void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *desc, uint64_t before, uint64_t after)
+{
+       struct ubsan_violation v = { UBSAN_POINTER_OVERFLOW, before, after, .ptroverflow = desc, &desc->loc };
+       ubsan_handle(&v, true);
+}
+
+void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *desc, uint64_t idx)
+{
+       struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc };
+       ubsan_handle(&v, false);
+}
+
+void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *desc, uint64_t idx)
+{
+       struct ubsan_violation v = { UBSAN_OOB, idx, 0, .oob = desc, &desc->loc };
+       ubsan_handle(&v, true);
+}
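
Note: ubsan_handle() reports each source location at most once. ubsan_loc_acquire() claims the top bit of the 32-bit line number with a compare-and-swap, so only one racing thread wins the right to report a given site. A standalone sketch of that dedup:

/* Once-per-site reporting, mirroring ubsan_loc_acquire() above (sketch). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define EX_LINE_ACQUIRED 0x80000000UL

struct ex_src_loc {
    const char *filename;
    uint32_t    line;
    uint32_t    col;
};

static bool
ex_loc_acquire(struct ex_src_loc *loc)
{
    uint32_t line = loc->line;
    if (line & EX_LINE_ACQUIRED) {
        return false;                       /* someone already reported here */
    }
    uint32_t acq = line | EX_LINE_ACQUIRED;
    return atomic_compare_exchange_strong((_Atomic uint32_t *)&loc->line,
                                          &line, acq);
}

/* First call for a given loc returns true; every later call returns false,
 * so repeated hits on the same line:column are logged only once. */
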
diff --git a/san/ubsan.h b/san/ubsan.h
new file mode 100644 (file)
index 0000000..e78dace
--- /dev/null
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#ifndef _UBSAN_H_
+#define _UBSAN_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+struct san_type_desc {
+       uint16_t type; // 0: integer, 1: float
+       union {
+               struct {
+                       uint16_t issigned : 1;
+                       uint16_t width    : 15;
+               }; /* int descriptor */
+               struct {
+                       uint16_t float_desc;
+               }; /* float descriptor */
+       };
+       const char name[];
+};
+
+struct san_src_loc {
+       const char *filename;
+       uint32_t line;
+       uint32_t col;
+};
+
+struct ubsan_overflow_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *ty;
+};
+
+struct ubsan_unreachable_desc {
+       struct san_src_loc loc;
+};
+
+struct ubsan_shift_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *lhs_t;
+       struct san_type_desc *rhs_t;
+};
+
+struct ubsan_align_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *ty;
+       uint8_t align;
+       uint8_t kind;
+};
+
+struct ubsan_ptroverflow_desc {
+       struct san_src_loc loc;
+};
+
+struct ubsan_oob_desc {
+       struct san_src_loc loc;
+       struct san_type_desc *array_ty;
+       struct san_type_desc *index_ty;
+};
+
+enum {
+       UBSAN_OVERFLOW_add = 1,
+       UBSAN_OVERFLOW_sub,
+       UBSAN_OVERFLOW_mul,
+       UBSAN_OVERFLOW_divrem,
+       UBSAN_OVERFLOW_negate,
+       UBSAN_UNREACHABLE,
+       UBSAN_SHIFT,
+       UBSAN_ALIGN,
+       UBSAN_POINTER_OVERFLOW,
+       UBSAN_OOB,
+       UBSAN_VIOLATION_MAX,
+};
+
+struct ubsan_violation {
+       uint8_t ubsan_type;
+       uint64_t lhs;
+       uint64_t rhs;
+       union {
+               struct ubsan_overflow_desc *overflow;
+               struct ubsan_unreachable_desc *unreachable;
+               struct ubsan_shift_desc *shift;
+               struct ubsan_align_desc *align;
+               struct ubsan_ptroverflow_desc *ptroverflow;
+               struct ubsan_oob_desc *oob;
+       };
+       struct san_src_loc *loc;
+};
+
+void ubsan_log_append(struct ubsan_violation *);
+size_t ubsan_format(struct ubsan_violation *, char *buf, size_t sz);
+
+/*
+ * UBSan ABI
+ */
+
+void __ubsan_handle_add_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_sub_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_mul_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_divrem_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_negate_overflow(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_add_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_sub_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_mul_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_divrem_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_negate_overflow_abort(struct ubsan_overflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_builtin_unreachable(struct ubsan_unreachable_desc *);
+void __ubsan_handle_shift_out_of_bounds(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_shift_out_of_bounds_abort(struct ubsan_shift_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_type_mismatch_v1(struct ubsan_align_desc *, uint64_t val);
+void __ubsan_handle_type_mismatch_v1_abort(struct ubsan_align_desc *, uint64_t val);
+void __ubsan_handle_pointer_overflow(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_pointer_overflow_abort(struct ubsan_ptroverflow_desc *, uint64_t lhs, uint64_t rhs);
+void __ubsan_handle_out_of_bounds(struct ubsan_oob_desc *, uint64_t idx);
+void __ubsan_handle_out_of_bounds_abort(struct ubsan_oob_desc *, uint64_t idx);
+
+#endif /* _UBSAN_H_ */
diff --git a/san/ubsan_log.c b/san/ubsan_log.c
new file mode 100644 (file)
index 0000000..dc06cd7
--- /dev/null
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+#include <stdatomic.h>
+#include <kern/kalloc.h>
+#include <libkern/libkern.h>
+#include <sys/sysctl.h>
+#include "ubsan.h"
+
+/*
+ * To dump the violation log:
+ *   $ sysctl kern.ubsan.log
+ *
+ * To reset:
+ *   $ sysctl kern.ubsan.logentries=0
+ */
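+
+/*
+ * Illustrative session (output shape only; the exact text of each log line
+ * comes from ubsan_format() and depends on the violations recorded):
+ *
+ *   $ sysctl kern.ubsan.logentries
+ *   kern.ubsan.logentries: 2
+ *   $ sysctl kern.ubsan.log
+ *   kern.ubsan.log: <one formatted line per recorded violation>
+ *   $ sysctl kern.ubsan.logentries=0
+ *   kern.ubsan.logentries: 2 -> 0
+ */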
+
+static const size_t ubsan_log_size = 2048;
+struct ubsan_violation ubsan_log[ubsan_log_size];
+
+_Atomic size_t ubsan_log_head = 0; /* one past the last published entry (writer) */
+_Atomic size_t ubsan_log_tail = 0; /* first unread entry (reader) */
+_Atomic size_t ubsan_log_next = 0; /* next slot to reserve (writer) */
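+
+/*
+ * The log is a fixed-size ring buffer supporting lock-free appends from
+ * multiple threads: a writer reserves a slot by advancing ubsan_log_next
+ * with a compare-and-swap, fills it in, and then publishes it by advancing
+ * ubsan_log_head. The sysctl readers below only look at entries in
+ * [tail, head). One slot is always left unused so that a full buffer can be
+ * distinguished from an empty one.
+ */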
+
+static const bool ubsan_logging = true;
+
+static inline size_t
+next_entry(size_t x)
+{
+       return (x + 1) % ubsan_log_size;
+}
+
+void
+ubsan_log_append(struct ubsan_violation *e)
+{
+       if (!ubsan_logging) {
+               return;
+       }
+
+       /* reserve a slot */
+       size_t i = atomic_load(&ubsan_log_next);
+       size_t n;
+       do {
+               n = next_entry(i);
+               if (n == ubsan_log_tail) {
+                       return; /* full */
+               }
+       } while (!atomic_compare_exchange_weak(&ubsan_log_next, &i, n));
+
+       ubsan_log[i] = *e;
+
+       /* make the entry available */
+       size_t prev;
+       do {
+               prev = i;
+       } while (!atomic_compare_exchange_weak(&ubsan_log_head, &prev, n));
+}
+
+static int
+sysctl_ubsan_log_dump SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       const size_t sz = ubsan_log_size * 256;
+       size_t start = atomic_load(&ubsan_log_tail);
+       size_t end = atomic_load(&ubsan_log_head);
+
+       char *buf;
+       size_t n = 0;
+       int err;
+
+       if (start == end) {
+               return 0; /* log is empty */
+       }
+
+       buf = kalloc(sz);
+       if (!buf) {
+               return 0;
+       }
+       buf[0] = '\0';
+
+       for (size_t i = start; i != end; i = next_entry(i)) {
+               n += ubsan_format(&ubsan_log[i], buf+n, sz-n);
+       }
+
+       err = SYSCTL_OUT(req, buf, n);
+
+       kfree(buf, sz);
+       return err;
+}
+
+static int
+sysctl_ubsan_log_entries SYSCTL_HANDLER_ARGS
+{
+#pragma unused(oidp, arg1, arg2)
+       int ch, err, val;
+
+       int nentries;
+       if (ubsan_log_head >= ubsan_log_tail) {
+               nentries = ubsan_log_head - ubsan_log_tail;
+       } else {
+               nentries = ubsan_log_size - (ubsan_log_tail - ubsan_log_head);
+       }
+
+       err = sysctl_io_number(req, nentries, sizeof(nentries), &val, &ch);
+       if (err == 0 && ch) {
+               if (val != 0) {
+                       err = EINVAL;
+               } else {
+                       ubsan_log_tail = ubsan_log_head;
+               }
+       }
+
+       return err;
+}
+
+SYSCTL_DECL(ubsan);
+SYSCTL_NODE(_kern, OID_AUTO, ubsan, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "");
+
+SYSCTL_COMPAT_UINT(_kern_ubsan, OID_AUTO, logsize, CTLFLAG_RD, NULL, (unsigned)ubsan_log_size, "");
+
+SYSCTL_PROC(_kern_ubsan, OID_AUTO, logentries,
+               CTLTYPE_INT | CTLFLAG_RW,
+               0, 0, sysctl_ubsan_log_entries, "I", "");
+
+SYSCTL_PROC(_kern_ubsan, OID_AUTO, log,
+               CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MASKED,
+               0, 0, sysctl_ubsan_log_dump, "A", "");
index 2bd03b8eb1cd155aabc3813cd83243ba9a234961..0cc9f0b0dd89b8674938912bea461d1e9fef7042 100644 (file)
@@ -2007,25 +2007,6 @@ int mac_iokit_check_hid_control(kauth_cred_t cred __unused)
         return 0;
 }
 
-
-int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused);
-int mac_iokit_check_nvram_delete(kauth_cred_t cred __unused, const char *name __unused)
-{
-       return 0;
-}
-
-int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused);
-int mac_iokit_check_nvram_get(kauth_cred_t cred __unused, const char *name __unused)
-{
-       return 0;
-}
-
-int mac_iokit_check_nvram_set(kauth_cred_t cred __unused, const char *name __unused, io_object_t value __unused);
-int mac_iokit_check_nvram_set(kauth_cred_t cred __unused, const char *name __unused, io_object_t value __unused)
-{
-       return 0;
-}
-
 int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused);
 int mac_vnode_check_trigger_resolve(vfs_context_t ctx __unused, struct vnode *dvp __unused, struct componentname *cnp __unused)
 {
index 81a4839c81132dac436e31ae09b0a4779015e2ac..d735e01242c58ccec74b1d6f394aacf2b354837e 100644 (file)
@@ -238,9 +238,6 @@ int mac_iokit_check_set_properties(kauth_cred_t cred, io_object_t registry_entry
 int    mac_iokit_check_filter_properties(kauth_cred_t cred, io_object_t registry_entry);
 int    mac_iokit_check_get_property(kauth_cred_t cred, io_object_t registry_entry, const char *name);
 int    mac_iokit_check_hid_control(kauth_cred_t cred);
-int    mac_iokit_check_nvram_delete(kauth_cred_t cred, const char *name);
-int    mac_iokit_check_nvram_get(kauth_cred_t cred, const char *name);
-int    mac_iokit_check_nvram_set(kauth_cred_t cred, const char *name, io_object_t value);
 void   mac_ipq_label_associate(struct mbuf *fragment, struct ipq *ipq);
 int    mac_ipq_label_compare(struct mbuf *fragment, struct ipq *ipq);
 void   mac_ipq_label_destroy(struct ipq *ipq);
index d9dff9460d8bdaa069d7bd0950ae61f2a1fe7060..fd41b7538256e936594d94f631ed09166453fc4e 100644 (file)
@@ -119,31 +119,3 @@ mac_iokit_check_hid_control(kauth_cred_t cred)
        return (error);
 }
 
-int
-mac_iokit_check_nvram_delete(kauth_cred_t cred, const char *name)
-{
-       int error;
-
-       MAC_CHECK(iokit_check_nvram_delete, cred, name);
-       return (error);
-}
-
-int
-mac_iokit_check_nvram_get(kauth_cred_t cred, const char *name)
-{
-       int error;
-
-       MAC_CHECK(iokit_check_nvram_get, cred, name);
-       return (error);
-}
-
-int
-mac_iokit_check_nvram_set(kauth_cred_t cred, const char *name, io_object_t value)
-{
-       int error;
-
-       MAC_CHECK(iokit_check_nvram_set, cred, name, value);
-       return (error);
-}
-
-
index 5669e0daf28afc85d2bfb3659ee29dae22bb04ee..4ae38066562cc78d0ded636a44d686c32168bf48 100644 (file)
@@ -149,6 +149,21 @@ mac_thread_userret(struct thread *td)
        MAC_PERFORM(thread_userret, td);
 }
 
+void
+mac_proc_notify_exec_complete(struct proc *proc)
+{
+       thread_t thread = current_thread();
+
+       /*
+        * Since this MAC hook was designed to support upcalls, make sure the hook
+        * is called with kernel importance propagation enabled so that any
+        * daemons involved receive the appropriate importance donations.
+        */
+       thread_enable_send_importance(thread, TRUE);
+       MAC_PERFORM(proc_notify_exec_complete, proc);
+       thread_enable_send_importance(thread, FALSE);
+}
+
 /**** Exception Policy
  *
  * Note that the functions below do not fully follow the usual convention for mac policy functions
index 4849bfabdc329ce5dc3ae10d8061d14fd0d447b9..df3bae67bc9aa5860e5a5ab94611281a9d957599 100644 (file)
@@ -99,6 +99,8 @@ int mac_exc_update_task_crash_label(struct task *task, struct label *newlabel);
 
 int mac_exc_action_check_exception_send(struct task *victim_task, struct exception_action *action);
 
+void mac_proc_notify_exec_complete(struct proc *proc);
+
 struct label *mac_exc_create_label_for_proc(struct proc *proc);
 struct label *mac_exc_create_label_for_current_proc(void);
 
index 5cae6252936d22f4ac3ef4b2f5524c01f98a63ee..a36ebe953124aeb679d88ed9f19bf8c57e9c8543 100644 (file)
@@ -4549,6 +4549,19 @@ typedef int mpo_proc_check_run_cs_invalid_t(
        struct proc *p
 );
 
+/**
+ @brief Notification that a process has finished exec and will jump to userspace
+ @param p Object process
+
+ Notifies all MAC policies that a process has completed an exec and is about to
+ jump to userspace to continue execution. This may result in process termination
+ via signals. The hook is designed to hold no or minimal locks so that it can
+ be used for any necessary upcalls.
+ */
+typedef void mpo_proc_notify_exec_complete_t(
+       struct proc *p
+);
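+
+/*
+ * Sketch of how a policy module might install this hook; the names below
+ * are illustrative only and not part of the MAC framework:
+ *
+ *   static void
+ *   example_proc_notify_exec_complete(struct proc *p)
+ *   {
+ *           // e.g. notify a userspace daemon that the exec has completed
+ *   }
+ *
+ *   static struct mac_policy_ops example_policy_ops = {
+ *           .mpo_proc_notify_exec_complete = example_proc_notify_exec_complete,
+ *   };
+ */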
+
 /**
   @brief Perform MAC-related events when a thread returns to user space
   @param thread Mach (not BSD) thread that is returning
@@ -5390,6 +5403,7 @@ typedef int mpo_vnode_check_setutimes_t(
   @brief Access control check after determining the code directory hash
   @param vp vnode vnode to combine into proc
   @param label label associated with the vnode
+  @param cpu_type cpu type of the signature being checked
   @param cs_blob the code signature to check
   @param cs_flags update code signing flags if needed
   @param signer_type output parameter for the code signature's signer type
@@ -5403,6 +5417,7 @@ typedef int mpo_vnode_check_setutimes_t(
 typedef int mpo_vnode_check_signature_t(
        struct vnode *vp,
        struct label *label,
+       cpu_type_t cpu_type,
        struct cs_blob *cs_blob,
        unsigned int *cs_flags,
        unsigned int *signer_type,
@@ -6262,56 +6277,6 @@ typedef int mpo_kext_check_query_t(
        kauth_cred_t cred
 );
 
-/**
-  @brief Access control check for getting NVRAM variables.
-  @param cred Subject credential
-  @param name NVRAM variable to get
-
-  Determine whether the subject identifier by the credential can get the
-  value of the named NVRAM variable.
-
-  @return Return 0 if access is granted, otherwise an appropriate value for
-  errno should be returned.  Suggested failure: EPERM for lack of privilege.
-*/
-typedef int mpo_iokit_check_nvram_get_t(
-       kauth_cred_t cred,
-       const char *name
-);
-
-/**
-  @brief Access control check for setting NVRAM variables.
-  @param cred Subject credential
-  @param name NVRAM variable to set
-  @param value The new value for the NVRAM variable
-
-  Determine whether the subject identifier by the credential can set the
-  value of the named NVRAM variable.
-
-  @return Return 0 if access is granted, otherwise an appropriate value for
-  errno should be returned.  Suggested failure: EPERM for lack of privilege.
-*/
-typedef int mpo_iokit_check_nvram_set_t(
-       kauth_cred_t cred,
-       const char *name,
-       io_object_t value
-);
-
-/**
-  @brief Access control check for deleting NVRAM variables.
-  @param cred Subject credential
-  @param name NVRAM variable to delete
-
-  Determine whether the subject identifier by the credential can delete the
-  named NVRAM variable.
-
-  @return Return 0 if access is granted, otherwise an appropriate value for
-  errno should be returned.  Suggested failure: EPERM for lack of privilege.
-*/
-typedef int mpo_iokit_check_nvram_delete_t(
-       kauth_cred_t cred,
-       const char *name
-);
-
 /*
  * Placeholder for future events that may need mac hooks.
  */
@@ -6323,7 +6288,7 @@ typedef void mpo_reserved_hook_t(void);
  * Please note that this should be kept in sync with the check assumptions
  * policy in bsd/kern/policy_check.c (policy_ops struct).
  */
-#define MAC_POLICY_OPS_VERSION 53 /* inc when new reserved slots are taken */
+#define MAC_POLICY_OPS_VERSION 55 /* inc when new reserved slots are taken */
 struct mac_policy_ops {
        mpo_audit_check_postselect_t            *mpo_audit_check_postselect;
        mpo_audit_check_preselect_t             *mpo_audit_check_preselect;
@@ -6462,9 +6427,9 @@ struct mac_policy_ops {
        mpo_proc_check_inherit_ipc_ports_t      *mpo_proc_check_inherit_ipc_ports;
        mpo_vnode_check_rename_t                *mpo_vnode_check_rename;
        mpo_kext_check_query_t                  *mpo_kext_check_query;
-       mpo_iokit_check_nvram_get_t             *mpo_iokit_check_nvram_get;
-       mpo_iokit_check_nvram_set_t             *mpo_iokit_check_nvram_set;
-       mpo_iokit_check_nvram_delete_t          *mpo_iokit_check_nvram_delete;
+       mpo_proc_notify_exec_complete_t         *mpo_proc_notify_exec_complete;
+       mpo_reserved_hook_t                     *mpo_reserved5;
+       mpo_reserved_hook_t                     *mpo_reserved6;
        mpo_proc_check_expose_task_t            *mpo_proc_check_expose_task;
        mpo_proc_check_set_host_special_port_t  *mpo_proc_check_set_host_special_port;
        mpo_proc_check_set_host_exception_port_t *mpo_proc_check_set_host_exception_port;
index 9c9b36c66279e1cfa23e6a54e7ef6b024c8ce4e9..b18fc092cd8363fce0a7ed7b9bef59f833bb212b 100644 (file)
@@ -1111,6 +1111,7 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob,
 
         char *vn_path = NULL;
         vm_size_t vn_pathlen = MAXPATHLEN;
+        cpu_type_t cpu_type = (imgp == NULL) ? CPU_TYPE_ANY : imgp->ip_origcputype;
 
 
 #if SECURITY_MAC_CHECK_ENFORCE
@@ -1119,7 +1120,7 @@ mac_vnode_check_signature(struct vnode *vp, struct cs_blob *cs_blob,
                  return 0;
 #endif
 
-        MAC_CHECK(vnode_check_signature, vp, vp->v_label, cs_blob,
+        MAC_CHECK(vnode_check_signature, vp, vp->v_label, cpu_type, cs_blob,
                           cs_flags, signer_type, flags, &fatal_failure_desc, &fatal_failure_desc_len);
 
         if (fatal_failure_desc_len) {
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644 (file)
index 0000000..019b194
--- /dev/null
@@ -0,0 +1,239 @@
+PROJECT := xnu/darwintests
+
+# When building as part of xnu_tests, we get passed a DSTROOT that's got the
+# unit test path in it already.  But, BASEDSTROOT doesn't, so use that instead.
+ifdef BASEDSTROOT
+override DSTROOT = $(BASEDSTROOT)
+endif
+
+ENABLE_LTE_TESTS=YES
+
+OTHER_LTE_INCLUDE_FILES += \
+       /System/Library/PrivateFrameworks/LoggingSupport.framework, \
+       /System/Library/PrivateFrameworks/MobileKeyBag.framework, \
+       /usr/local/lib/libdarwintest_utils.dylib, \
+       /usr/lib/libapple_crypto.dylib,
+
+DEVELOPER_DIR ?= $(shell xcode-select -p)
+
+# the xnu build system will only ever call us with the default target
+.DEFAULT_GOAL := install
+
+include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common
+
+OTHER_CFLAGS  = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability
+OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command
+OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused -Wno-covered-switch-default -Wno-nullability-extension
+OTHER_CFLAGS += -Wno-gnu-empty-initializer -Wno-unused-macros
+OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
+OTHER_CFLAGS += -UT_NAMESPACE_PREFIX -DT_NAMESPACE_PREFIX=xnu
+OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks
+
+CODESIGN:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign)
+CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign_allocate)
+
+# To add custom compiler flags to a particular test target:
+# target: OTHER_CFLAGS += <my flags>
+
+atm_diagnostic_flag: OTHER_CFLAGS += drop_priv.c
+
+avx: INVALID_ARCHS = i386
+avx: OTHER_CFLAGS += -mavx512f -mavx512bw -mavx512vl
+avx: OTHER_CFLAGS += -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
+avx: CONFIG_FLAGS := $(filter-out -O%,$(CONFIG_FLAGS))
+# Level 2 optimization must be used to prevent compiler from generating
+# invalid instructions when compiling with AVX-512 flags.
+avx: CONFIG_FLAGS += -O2
+# Disable vzeroupper insertion to work around rdar://problem/35035096
+avx: CONFIG_FLAGS += -mllvm -x86-use-vzeroupper=0
+ifneq (osx,$(TARGET_NAME))
+EXCLUDED_SOURCES += avx.c
+endif
+
+backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
+
+data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit
+
+kdebug: INVALID_ARCHS = i386
+kdebug: OTHER_LDFLAGS = -framework ktrace -ldarwintest_utils -framework kperf
+
+EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c
+
+ifeq ($(PLATFORM),iPhoneOS)
+CONFIG_FREEZE_DEFINE:= -DCONFIG_FREEZE
+else
+CONFIG_FREEZE_DEFINE:=
+EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c
+endif
+
+perf_compressor: OTHER_CFLAGS += $(CONFIG_FREEZE_DEFINE)
+perf_compressor: OTHER_LDFLAGS += -ldarwintest_utils
+perf_compressor: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
+
+memorystatus_freeze_test: OTHER_CFLAGS += $(CONFIG_FREEZE_DEFINE)
+memorystatus_freeze_test: OTHER_LDFLAGS += -ldarwintest_utils
+
+stackshot: OTHER_CFLAGS += -Wno-objc-messaging-id
+stackshot: OTHER_LDFLAGS += -lkdd -framework Foundation
+stackshot: INVALID_ARCHS = i386
+
+telemetry: OTHER_LDFLAGS = -framework ktrace
+telemetry: INVALID_ARCHS = i386
+
+memorystatus_zone_test: INVALID_ARCHS = i386
+memorystatus_zone_test: OTHER_CFLAGS += -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
+memorystatus_zone_test: OTHER_LDFLAGS += -framework ktrace
+memorystatus_zone_test: OTHER_LDFLAGS += -ldarwintest_utils
+
+kpc: OTHER_LDFLAGS += -framework kperf
+
+kperf: INVALID_ARCHS = i386
+kperf: OTHER_CFLAGS += kperf_helpers.c
+kperf: OTHER_LDFLAGS += -framework kperf -framework kperfdata -framework ktrace -ldarwintest_utils
+
+kperf_backtracing: INVALID_ARCHS = i386
+kperf_backtracing: OTHER_CFLAGS += kperf_helpers.c
+kperf_backtracing: OTHER_LDFLAGS += -framework kperf -framework kperfdata -framework ktrace
+kperf_backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
+
+kevent_qos: OTHER_CFLAGS += -Wno-unused-macros
+kevent_qos: OTHER_CFLAGS += -I $(OBJROOT)/
+
+mach_get_times: OTHER_LDFLAGS += -ldarwintest_utils
+
+monotonic_core: OTHER_LDFLAGS += -framework ktrace
+monotonic_core: INVALID_ARCHS = i386
+
+perf_exit: perf_exit_proc
+perf_exit: OTHER_LDFLAGS = -framework ktrace
+perf_exit: INVALID_ARCHS = i386
+perf_exit: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
+
+perf_spawn_fork: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
+
+os_thread_self_restrict: os_thread_self_restrict.c os_thread_self_restrict-entitlements.plist
+os_thread_self_restrict: CODE_SIGN_ENTITLEMENTS=os_thread_self_restrict-entitlements.plist
+
+task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements
+task_inspect: OTHER_CFLAGS += -DENTITLED=1
+
+turnstile_multihop: OTHER_CFLAGS += -Wno-unused-macros
+turnstile_multihop: OTHER_CFLAGS += -I $(OBJROOT)/
+
+CUSTOM_TARGETS += perf_exit_proc
+
+perf_exit_proc:
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) perf_exit_proc.c -o $(SYMROOT)/perf_exit_proc
+
+install-perf_exit_proc: perf_exit_proc
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/perf_exit_proc $(INSTALLDIR)/
+
+perf_kdebug: INVALID_ARCHS = i386
+
+stackshot_idle_25570396: INVALID_ARCHS = i386
+stackshot_idle_25570396: OTHER_LDFLAGS += -lkdd -framework Foundation
+
+stackshot_block_owner_14362384: INVALID_ARCHS = i386
+stackshot_block_owner_14362384: OTHER_LDFLAGS += -framework Foundation -lpthread -lkdd
+ifeq ($(PLATFORM),MacOSX)
+stackshot_block_owner_14362384: OTHER_LDFLAGS += -lpcre
+endif
+
+all: $(DSTROOT)/usr/local/bin/kcdata
+
+$(DSTROOT)/usr/local/bin/kcdata: $(SRCROOT)/../tools/lldbmacros/kcdata.py
+       mkdir -p $(dir $@)
+       cp $< $@
+       chmod a+x $@
+
+xnu_quick_test: OTHER_CFLAGS += xnu_quick_test_helpers.c
+
+xnu_quick_test_entitled: CODE_SIGN_ENTITLEMENTS = xnu_quick_test.entitlements
+
+CUSTOM_TARGETS += vm_set_max_addr_helper
+
+vm_set_max_addr_helper: vm_set_max_addr_helper.c
+       $(CC) $(OTHER_CFLAGS) $(CFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) vm_set_max_addr_helper.c -o $(SYMROOT)/vm_set_max_addr_helper; \
+       echo $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@; \
+       env CODESIGN_ALLOCATE=$(CODESIGN_ALLOCATE) $(CODESIGN) --force --sign - --timestamp=none $(SYMROOT)/$@;
+
+install-vm_set_max_addr_helper: vm_set_max_addr_helper
+       mkdir -p $(INSTALLDIR)
+       cp $(SYMROOT)/vm_set_max_addr_helper $(INSTALLDIR)/
+
+ifeq ($(PLATFORM),iPhoneOS)
+OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled
+jumbo_va_spaces_28530648: CODE_SIGN_ENTITLEMENTS = jumbo_va_spaces_28530648.entitlements
+jumbo_va_spaces_28530648: OTHER_CFLAGS += -DENTITLED=1
+jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils
+
+jumbo_va_spaces_28530648_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
+jumbo_va_spaces_28530648_unentitled: jumbo_va_spaces_28530648.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+endif
+
+task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
+
+proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
+proc_info: OTHER_LDFLAGS += -ldarwintest_utils
+
+proc_info_list_kthreads: CODE_SIGN_ENTITLEMENTS = ./proc_info_list_kthreads.entitlements
+
+disk_mount_conditioner: disk_mount_conditioner*
+disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist
+disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils
+
+OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled
+disk_mount_conditioner_unentitled: OTHER_CFLAGS += -DTEST_UNENTITLED
+disk_mount_conditioner_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
+disk_mount_conditioner_unentitled: disk_mount_conditioner.c
+       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
+
+work_interval_test: CODE_SIGN_ENTITLEMENTS = work_interval_test.entitlements
+work_interval_test: OTHER_CFLAGS += -DENTITLED=1
+
+settimeofday_29193041: OTHER_CFLAGS += drop_priv.c
+
+settimeofday_29193041_entitled: CODE_SIGN_ENTITLEMENTS = settimeofday_29193041.entitlements
+settimeofday_29193041_entitled: OTHER_CFLAGS += drop_priv.c
+
+thread_group_set_32261625: OTHER_LDFLAGS = -framework ktrace
+thread_group_set_32261625: INVALID_ARCHS = i386
+
+task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
+
+socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
+socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
+
+net_tuntests: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
+
+ifneq (osx,$(TARGET_NAME))
+EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c
+endif
+
+no32exec_35914211_helper:  INVALID_ARCHS = x86_64
+no32exec_35914211:  INVALID_ARCHS = i386
+
+MIG:=SDKROOT=$(SDKROOT) $(shell xcrun -sdk "$(TARGETSDK)" -find mig)
+
+CUSTOM_TARGETS += excserver
+
+excserver:
+       $(MIG) $(CFLAGS) \
+               -sheader $(OBJROOT)/excserver.h \
+               -server $(OBJROOT)/excserver.c \
+               -header /dev/null -user /dev/null \
+               excserver.defs
+install-excserver: ;
+
+exc_resource_threads: excserver
+exc_resource_threads: OTHER_CFLAGS += $(OBJROOT)/excserver.c -I $(OBJROOT)
+
+ifneq ($(PLATFORM),BridgeOS)
+EXCLUDED_SOURCES += remote_time.c
+else
+remote_time: INVALID_ARCHS = armv7 armv7s arm64_32
+endif
+
+include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets
diff --git a/tests/atm_diagnostic_flag.c b/tests/atm_diagnostic_flag.c
new file mode 100644 (file)
index 0000000..864ffd6
--- /dev/null
@@ -0,0 +1,78 @@
+#include <darwintest.h>
+
+#include <mach/mach_error.h>
+#include <mach/mach_host.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging"));
+
+/*
+ * The low 8 bits may be in use, so modify one
+ * of the upper 8 bits to ensure round-tripping.
+ */
+#define LIBTRACE_PRIVATE_DATA  0x01000000
+
+extern void drop_priv(void);
+
+static bool _needs_reset;
+static uint32_t _original;
+
+static uint32_t
+_save_atm_diagnostic_flag(void)
+{
+    kern_return_t kr;
+    kr = host_get_atm_diagnostic_flag(mach_host_self(), &_original);
+    T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_get_atm_diagnostic_flag()");
+    T_LOG("Original ATM diagnostic flag: 0x%08x", _original);
+    return _original;
+}
+
+static kern_return_t
+_mutate_atm_diagnostic_flag(uint32_t v)
+{
+    T_LOG("Try to set ATM diagnostic flag to: 0x%08x", v);
+    kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), v);
+    if (kr == KERN_SUCCESS) _needs_reset = true;
+    return kr;
+}
+
+static void
+_reset_atm_diagnostic_flag(void)
+{
+    if (!_needs_reset) return;
+    T_LOG("Reset ATM diagnostic flag to: 0x%08x", _original);
+    kern_return_t kr;
+    kr = host_set_atm_diagnostic_flag(mach_host_self(), _original);
+    if (kr != KERN_SUCCESS) {
+        T_ASSERT_FAIL("host_set_atm_diagnostic_flag() failed: %s",
+                mach_error_string(kr));
+    }
+}
+
+T_DECL(toggle_atm_diagnostic_flag,
+        "change the atm_diagnostic_flag, which should use the commpage",
+        T_META_ASROOT(true))
+{
+    T_ATEND(_reset_atm_diagnostic_flag);
+    uint32_t f = _save_atm_diagnostic_flag();
+    f ^= LIBTRACE_PRIVATE_DATA;
+    kern_return_t kr = _mutate_atm_diagnostic_flag(f);
+    if (kr == KERN_NOT_SUPPORTED) {
+        T_SKIP("Seems ATM is disabled on this platform. "
+                "Ignoring host_set_atm_diagnostic_flag functionality. "
+                "Bailing gracefully.");
+    }
+    T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag");
+}
+
+T_DECL(unprivileged_atm_diagnostic_flag,
+        "expect to fail to set the atm_diagnostic_flag",
+        T_META_ASROOT(false))
+{
+    drop_priv();
+    T_ATEND(_reset_atm_diagnostic_flag);
+    uint32_t f = _save_atm_diagnostic_flag();
+    f ^= LIBTRACE_PRIVATE_DATA;
+    kern_return_t kr = _mutate_atm_diagnostic_flag(f);
+    T_EXPECT_MACH_ERROR(KERN_INVALID_ARGUMENT, kr,
+            "Deny change to atm_diagnostic_flag");
+}
diff --git a/tests/avx.c b/tests/avx.c
new file mode 100644 (file)
index 0000000..0041e99
--- /dev/null
@@ -0,0 +1,736 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/time.h>
+#include <sys/mman.h>
+#include <immintrin.h>
+#include <mach/mach.h>
+#include <stdio.h>
+#include <string.h>
+#include <err.h>
+#include <i386/cpu_capabilities.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.intel"),
+       T_META_CHECK_LEAKS(false)
+);
+
+#define NORMAL_RUN_TIME  (10)
+#define LONG_RUN_TIME    (10*60)
+#define TIMEOUT_OVERHEAD (10)
+
+volatile boolean_t checking = true;
+char vec_str_buf[8196];
+char karray_str_buf[1024];
+
+/*
+ * ymm defines/globals/prototypes
+ */
+#define        STOP_COOKIE_256 0x01234567
+#if defined(__x86_64__)
+#define YMM_MAX                        16
+#define X86_AVX_STATE_T                x86_avx_state64_t
+#define X86_AVX_STATE_COUNT    x86_AVX_STATE64_COUNT
+#define X86_AVX_STATE_FLAVOR   x86_AVX_STATE64
+#define        MCONTEXT_SIZE_256       sizeof(struct __darwin_mcontext_avx64)
+#else
+#define YMM_MAX                        8
+#define X86_AVX_STATE_T                x86_avx_state32_t
+#define X86_AVX_STATE_COUNT    x86_AVX_STATE32_COUNT
+#define X86_AVX_STATE_FLAVOR   x86_AVX_STATE32
+#define        MCONTEXT_SIZE_256       sizeof(struct __darwin_mcontext_avx32)
+#endif
+#define VECTOR256 __m256
+#define VEC256ALIGN __attribute ((aligned(32)))
+static inline void populate_ymm(void);
+static inline void check_ymm(void);
+VECTOR256      vec256array0[YMM_MAX] VEC256ALIGN;
+VECTOR256      vec256array1[YMM_MAX] VEC256ALIGN;
+VECTOR256      vec256array2[YMM_MAX] VEC256ALIGN;
+VECTOR256      vec256array3[YMM_MAX] VEC256ALIGN;
+
+/*
+ * zmm defines/globals/prototypes
+ */
+#define STOP_COOKIE_512 0x0123456789abcdefULL
+#if defined(__x86_64__)
+#define ZMM_MAX                        32
+#define X86_AVX512_STATE_T     x86_avx512_state64_t
+#define X86_AVX512_STATE_COUNT x86_AVX512_STATE64_COUNT
+#define X86_AVX512_STATE_FLAVOR        x86_AVX512_STATE64
+#define        MCONTEXT_SIZE_512       sizeof(struct __darwin_mcontext_avx512_64)
+#else
+#define ZMM_MAX                        8
+#define X86_AVX512_STATE_T     x86_avx512_state32_t
+#define X86_AVX512_STATE_COUNT x86_AVX512_STATE32_COUNT
+#define X86_AVX512_STATE_FLAVOR        x86_AVX512_STATE32
+#define        MCONTEXT_SIZE_512       sizeof(struct __darwin_mcontext_avx512_32)
+#endif
+#define VECTOR512 __m512
+#define VEC512ALIGN __attribute ((aligned(64)))
+#define OPMASK uint64_t
+#define KARRAY_MAX              8
+static inline void populate_zmm(void);
+static inline void populate_opmask(void);
+static inline void check_zmm(void);
+VECTOR512      vec512array0[ZMM_MAX] VEC512ALIGN;
+VECTOR512      vec512array1[ZMM_MAX] VEC512ALIGN;
+VECTOR512      vec512array2[ZMM_MAX] VEC512ALIGN;
+VECTOR512      vec512array3[ZMM_MAX] VEC512ALIGN;
+OPMASK karray0[8];
+OPMASK karray1[8];
+OPMASK karray2[8];
+OPMASK karray3[8];
+
+
+/*
+ * Common functions
+ */
+
+int
+memcmp_unoptimized(const void *s1, const void *s2, size_t n) {
+       if (n != 0) {
+               const unsigned char *p1 = s1, *p2 = s2;
+               do {
+                       if (*p1++ != *p2++)
+                               return (*--p1 - *--p2);
+               } while (--n != 0);
+       }
+       return (0);
+}
+
+void
+start_timer(int seconds, void (*handler)(int, siginfo_t *, void *)) {
+       struct sigaction sigalrm_action = {
+               .sa_sigaction = handler,
+               .sa_flags = SA_RESTART,
+               .sa_mask = 0
+       };
+       struct itimerval timer = {
+               .it_value.tv_sec = seconds,
+               .it_value.tv_usec = 0,
+               .it_interval.tv_sec = 0,
+               .it_interval.tv_usec = 0
+       };
+       T_QUIET; T_WITH_ERRNO;
+       T_ASSERT_NE(sigaction(SIGALRM, &sigalrm_action, NULL), -1, NULL);
+       T_QUIET; T_WITH_ERRNO;
+       T_ASSERT_NE(setitimer(ITIMER_REAL, &timer, NULL), -1, NULL);
+}
+
+void
+require_avx(void) {
+       if((_get_cpu_capabilities() & kHasAVX1_0) != kHasAVX1_0) {
+               T_SKIP("AVX not supported on this system");
+       }
+}
+
+void
+require_avx512(void) {
+       if((_get_cpu_capabilities() & kHasAVX512F) != kHasAVX512F) {
+               T_SKIP("AVX-512 not supported on this system");
+       }
+}
+
+/*
+ * ymm functions
+ */
+
+static inline void
+store_ymm(VECTOR256 *vec256array) {
+       int i = 0;
+           __asm__ volatile("vmovaps  %%ymm0, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm1, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm2, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm3, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm4, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm5, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm6, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm7, %0" :"=m" (vec256array[i]));
+#if defined(__x86_64__)
+       i++;__asm__ volatile("vmovaps  %%ymm8, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm9, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm10, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm11, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm12, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm13, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm14, %0" :"=m" (vec256array[i]));
+       i++;__asm__ volatile("vmovaps  %%ymm15, %0" :"=m" (vec256array[i]));
+#endif
+}
+
+static inline void
+populate_ymm(void) {
+       int j;
+       uint32_t p[8] VEC256ALIGN;
+
+       for (j = 0; j < (int) (sizeof(p)/sizeof(p[0])); j++)
+               p[j] = getpid();
+
+       p[0] = 0x22222222;
+       p[7] = 0x77777777;
+       __asm__ volatile("vmovaps  %0, %%ymm0" :: "m" (*(__m256i*)p) : "ymm0");
+       __asm__ volatile("vmovaps  %0, %%ymm1" :: "m" (*(__m256i*)p) : "ymm1");
+       __asm__ volatile("vmovaps  %0, %%ymm2" :: "m" (*(__m256i*)p) : "ymm2");
+       __asm__ volatile("vmovaps  %0, %%ymm3" :: "m" (*(__m256i*)p) : "ymm3");
+
+       p[0] = 0x44444444;
+       p[7] = 0xEEEEEEEE;
+       __asm__ volatile("vmovaps  %0, %%ymm4" :: "m" (*(__m256i*)p) : "ymm4");
+       __asm__ volatile("vmovaps  %0, %%ymm5" :: "m" (*(__m256i*)p) : "ymm5");
+       __asm__ volatile("vmovaps  %0, %%ymm6" :: "m" (*(__m256i*)p) : "ymm6");
+       __asm__ volatile("vmovaps  %0, %%ymm7" :: "m" (*(__m256i*)p) : "ymm7");
+
+#if defined(__x86_64__)
+       p[0] = 0x88888888;
+       p[7] = 0xAAAAAAAA;
+       __asm__ volatile("vmovaps  %0, %%ymm8" :: "m" (*(__m256i*)p) : "ymm8");
+       __asm__ volatile("vmovaps  %0, %%ymm9" :: "m" (*(__m256i*)p) : "ymm9");
+       __asm__ volatile("vmovaps  %0, %%ymm10" :: "m" (*(__m256i*)p) : "ymm10");
+       __asm__ volatile("vmovaps  %0, %%ymm11" :: "m" (*(__m256i*)p) : "ymm11");
+
+       p[0] = 0xBBBBBBBB;
+       p[7] = 0xCCCCCCCC;
+       __asm__ volatile("vmovaps  %0, %%ymm12" :: "m" (*(__m256i*)p) : "ymm12");
+       __asm__ volatile("vmovaps  %0, %%ymm13" :: "m" (*(__m256i*)p) : "ymm13");
+       __asm__ volatile("vmovaps  %0, %%ymm14" :: "m" (*(__m256i*)p) : "ymm14");
+       __asm__ volatile("vmovaps  %0, %%ymm15" :: "m" (*(__m256i*)p) : "ymm15");
+#endif
+
+       store_ymm(vec256array0);
+}
+
+void
+vec256_to_string(VECTOR256 *vec, char *buf) {
+       unsigned int vec_idx = 0;
+       unsigned int buf_idx = 0;
+       int ret = 0;
+
+       for (vec_idx = 0; vec_idx < YMM_MAX; vec_idx++) {
+               uint64_t a[4];
+               bcopy(&vec[vec_idx], &a[0], sizeof(a));
+               ret = sprintf(
+                       buf + buf_idx,
+                       "0x%016llx:%016llx:%016llx:%016llx\n",
+                       a[0], a[1], a[2], a[3]
+               );
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
+               buf_idx += ret;
+       }
+}
+
+void
+assert_ymm_eq(void *a, void *b, int c) {
+       if(memcmp_unoptimized(a, b, c)) {
+               vec256_to_string(a, vec_str_buf);
+               T_LOG("Compare failed, vector A:\n%s", vec_str_buf);
+               vec256_to_string(b, vec_str_buf);
+               T_LOG("Compare failed, vector B:\n%s", vec_str_buf);
+               T_ASSERT_FAIL("vectors not equal");
+       }
+}
+
+void
+check_ymm(void)  {
+       uint32_t *p = (uint32_t *) &vec256array1[7];
+       store_ymm(vec256array1);
+       if (p[0] == STOP_COOKIE_256) {
+               return;
+       }
+       assert_ymm_eq(vec256array0, vec256array1, sizeof(vec256array0));
+}
+
+static void
+copy_ymm_state_to_vector(X86_AVX_STATE_T *sp,  VECTOR256 *vp) {
+       int     i;
+       struct  __darwin_xmm_reg *xmm  = &sp->__fpu_xmm0;
+       struct  __darwin_xmm_reg *ymmh = &sp->__fpu_ymmh0;
+
+       for (i = 0; i < YMM_MAX; i++ ) {
+               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
+               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
+       }
+}
+
+static void
+ymm_sigalrm_handler(int signum __unused, siginfo_t *info __unused, void *ctx)
+{
+       ucontext_t *contextp = (ucontext_t *) ctx;
+       mcontext_t mcontext = contextp->uc_mcontext;
+       X86_AVX_STATE_T *avx_state = (X86_AVX_STATE_T *) &mcontext->__fs;
+       uint32_t *xp = (uint32_t *) &avx_state->__fpu_xmm7;
+       uint32_t *yp = (uint32_t *) &avx_state->__fpu_ymmh7;
+
+       T_LOG("Got SIGALRM");
+
+       /* Check for AVX state */
+       T_QUIET;
+       T_ASSERT_GE(contextp->uc_mcsize, MCONTEXT_SIZE_256, "check context size");
+
+       /* Check that the state in the context is what's set and expected */
+       copy_ymm_state_to_vector(avx_state, vec256array3);
+       assert_ymm_eq(vec256array3, vec256array0, sizeof(vec256array1));
+
+       /* Change the context and break the main loop */
+       xp[0] = STOP_COOKIE_256;
+       yp[0] = STOP_COOKIE_256;
+       checking = FALSE;
+}
+
+void
+ymm_integrity(int time) {
+       mach_msg_type_number_t avx_count = X86_AVX_STATE_COUNT;
+       kern_return_t kret;
+       X86_AVX_STATE_T avx_state, avx_state2;
+       mach_port_t ts = mach_thread_self();
+
+       bzero(&avx_state, sizeof(avx_state));
+       bzero(&avx_state2, sizeof(avx_state));
+
+       kret = thread_get_state(
+               ts, X86_AVX_STATE_FLAVOR, (thread_state_t)&avx_state, &avx_count
+       );
+
+       store_ymm(vec256array2);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
+       vec256_to_string(vec256array2, vec_str_buf);
+       T_LOG("Initial state:\n%s", vec_str_buf);
+
+       copy_ymm_state_to_vector(&avx_state, vec256array1);
+       assert_ymm_eq(vec256array2, vec256array1, sizeof(vec256array1));
+
+       populate_ymm();
+
+       kret = thread_get_state(
+               ts, X86_AVX_STATE_FLAVOR, (thread_state_t)&avx_state2, &avx_count
+       );
+
+       store_ymm(vec256array2);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
+       vec256_to_string(vec256array2, vec_str_buf);
+       T_LOG("Populated state:\n%s", vec_str_buf);
+
+       copy_ymm_state_to_vector(&avx_state2, vec256array1);
+       assert_ymm_eq(vec256array2, vec256array1, sizeof(vec256array0));
+
+       T_LOG("Running for %ds…", time);
+       start_timer(time, ymm_sigalrm_handler);
+
+       /* re-populate because printing mucks up XMMs */
+       populate_ymm();
+
+       /* Check state until timer fires */
+       while(checking) {
+               check_ymm();
+       }
+
+       /* Check that the sig handler changed our AVX state */
+       store_ymm(vec256array1);
+
+       uint32_t *p = (uint32_t *) &vec256array1[7];
+       if (p[0] != STOP_COOKIE_256 ||
+           p[4] != STOP_COOKIE_256) {
+               vec256_to_string(vec256array1, vec_str_buf);
+               T_LOG("State:\n%s", vec_str_buf);
+               T_ASSERT_FAIL("sigreturn failed to stick");
+       }
+
+       T_LOG("Ran for %ds", time);
+       T_PASS("No ymm register corruption occurred");
+}
+
+/*
+ * zmm functions
+ */
+
+static inline void
+store_opmask(OPMASK k[]) {
+       __asm__ volatile("kmovq %%k0, %0" :"=m" (k[0]));
+       __asm__ volatile("kmovq %%k1, %0" :"=m" (k[1]));
+       __asm__ volatile("kmovq %%k2, %0" :"=m" (k[2]));
+       __asm__ volatile("kmovq %%k3, %0" :"=m" (k[3]));
+       __asm__ volatile("kmovq %%k4, %0" :"=m" (k[4]));
+       __asm__ volatile("kmovq %%k5, %0" :"=m" (k[5]));
+       __asm__ volatile("kmovq %%k6, %0" :"=m" (k[6]));
+       __asm__ volatile("kmovq %%k7, %0" :"=m" (k[7]));
+}
+
+static inline void
+store_zmm(VECTOR512 *vecarray) {
+       int i = 0;
+           __asm__ volatile("vmovaps  %%zmm0, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm1, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm2, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm3, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm4, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm5, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm6, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm7, %0" :"=m" (vecarray[i]));
+#if defined(__x86_64__)
+       i++;__asm__ volatile("vmovaps  %%zmm8, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm9, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm10, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm11, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm12, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm13, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm14, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm15, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm16, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm17, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm18, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm19, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm20, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm21, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm22, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm23, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm24, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm25, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm26, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm27, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm28, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm29, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm30, %0" :"=m" (vecarray[i]));
+       i++;__asm__ volatile("vmovaps  %%zmm31, %0" :"=m" (vecarray[i]));
+#endif
+}
+
+static inline void
+populate_opmask(void) {
+       uint64_t k[8];
+
+       for (int j = 0; j < 8; j++)
+               k[j] = ((uint64_t) getpid() << 32) + (0x11111111 * j);
+
+       __asm__ volatile("kmovq %0, %%k0" : :"m" (k[0]));
+       __asm__ volatile("kmovq %0, %%k1" : :"m" (k[1]));
+       __asm__ volatile("kmovq %0, %%k2" : :"m" (k[2]));
+       __asm__ volatile("kmovq %0, %%k3" : :"m" (k[3]));
+       __asm__ volatile("kmovq %0, %%k4" : :"m" (k[4]));
+       __asm__ volatile("kmovq %0, %%k5" : :"m" (k[5]));
+       __asm__ volatile("kmovq %0, %%k6" : :"m" (k[6]));
+       __asm__ volatile("kmovq %0, %%k7" : :"m" (k[7]));
+
+       store_opmask(karray0);
+}
+
+static inline void
+populate_zmm(void) {
+       int j;
+       uint64_t p[8] VEC512ALIGN;
+
+       for (j = 0; j < (int) (sizeof(p)/sizeof(p[0])); j++)
+               p[j] = ((uint64_t) getpid() << 32) + getpid();
+
+       p[0] = 0x0000000000000000ULL;
+       p[2] = 0x4444444444444444ULL;
+       p[4] = 0x8888888888888888ULL;
+       p[7] = 0xCCCCCCCCCCCCCCCCULL;
+       __asm__ volatile("vmovaps  %0, %%zmm0" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm1" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm2" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm3" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm4" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm5" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm6" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm7" :: "m" (*(__m512i*)p) );
+
+#if defined(__x86_64__)
+       p[0] = 0x1111111111111111ULL;
+       p[2] = 0x5555555555555555ULL;
+       p[4] = 0x9999999999999999ULL;
+       p[7] = 0xDDDDDDDDDDDDDDDDULL;
+       __asm__ volatile("vmovaps  %0, %%zmm8" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm9" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm10" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm11" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm12" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm13" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm14" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm15" :: "m" (*(__m512i*)p) );
+
+       p[0] = 0x2222222222222222ULL;
+       p[2] = 0x6666666666666666ULL;
+       p[4] = 0xAAAAAAAAAAAAAAAAULL;
+       p[7] = 0xEEEEEEEEEEEEEEEEULL;
+       __asm__ volatile("vmovaps  %0, %%zmm16" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm17" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm18" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm19" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm20" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm21" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm22" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm23" :: "m" (*(__m512i*)p) );
+
+       p[0] = 0x3333333333333333ULL;
+       p[2] = 0x7777777777777777ULL;
+       p[4] = 0xBBBBBBBBBBBBBBBBULL;
+       p[7] = 0xFFFFFFFFFFFFFFFFULL;
+       __asm__ volatile("vmovaps  %0, %%zmm24" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm25" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm26" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm27" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm28" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm29" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm30" :: "m" (*(__m512i*)p) );
+       __asm__ volatile("vmovaps  %0, %%zmm31" :: "m" (*(__m512i*)p) );
+#endif
+
+       store_zmm(vec512array0);
+}
+
+void
+vec512_to_string(VECTOR512 *vec, char *buf) {
+       unsigned int vec_idx = 0;
+       unsigned int buf_idx = 0;
+       int ret = 0;
+
+       for (vec_idx = 0; vec_idx < ZMM_MAX; vec_idx++) {
+               uint64_t a[8];
+               bcopy(&vec[vec_idx], &a[0], sizeof(a));
+               ret = sprintf(
+                       buf + buf_idx,
+                       "0x%016llx:%016llx:%016llx:%016llx:"
+                       "%016llx:%016llx:%016llx:%016llx%s",
+                       a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
+                       vec_idx < ZMM_MAX - 1 ? "\n" : ""
+               );
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
+               buf_idx += ret;
+       }
+}
+
+void
+opmask_to_string(OPMASK *karray, char *buf) {
+       unsigned int karray_idx = 0;
+       unsigned int buf_idx = 0;
+       int ret = 0;
+
+       for(karray_idx = 0; karray_idx < KARRAY_MAX; karray_idx++) {
+               ret = sprintf(
+                       buf + buf_idx,
+                       "k%d: 0x%016llx%s",
+                       karray_idx, karray[karray_idx],
+                       karray_idx < KARRAY_MAX - 1 ? "\n" : ""
+               );
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
+               buf_idx += ret;
+       }
+}
+
+static void
+assert_zmm_eq(void *a, void *b, int c) {
+       if(memcmp_unoptimized(a, b, c)) {
+               vec512_to_string(a, vec_str_buf);
+               T_LOG("Compare failed, vector A:\n%s", vec_str_buf);
+               vec512_to_string(b, vec_str_buf);
+               T_LOG("Compare failed, vector B:\n%s", vec_str_buf);
+               T_ASSERT_FAIL("Vectors not equal");
+       }
+}
+
+static void
+assert_opmask_eq(OPMASK *a, OPMASK *b) {
+       for (int i = 0; i < KARRAY_MAX; i++) {
+               if (a[i] != b[i]) {
+                       opmask_to_string(a, karray_str_buf);
+                       T_LOG("Compare failed, opmask A:\n%s", karray_str_buf);
+                       opmask_to_string(b, karray_str_buf);
+                       T_LOG("Compare failed, opmask B:\n%s", karray_str_buf);
+                       T_ASSERT_FAIL("opmasks not equal");
+               }
+       }
+}
+
+void
+check_zmm(void)  {
+       uint64_t *p = (uint64_t *) &vec512array1[7];
+       store_opmask(karray1);
+       store_zmm(vec512array1);
+       if (p[0] == STOP_COOKIE_512) {
+               return;
+       }
+
+       assert_zmm_eq(vec512array0, vec512array1, sizeof(vec512array0));
+       assert_opmask_eq(karray0, karray1);
+}
+
+static void copy_state_to_opmask(X86_AVX512_STATE_T *sp, OPMASK *op) {
+       OPMASK *k = (OPMASK *) &sp->__fpu_k0;
+       for (int i = 0; i < KARRAY_MAX; i++) {
+               bcopy(&k[i], &op[i], sizeof(*op));
+       }
+}
+
+static void copy_zmm_state_to_vector(X86_AVX512_STATE_T *sp,  VECTOR512 *vp) {
+       int     i;
+       struct  __darwin_xmm_reg *xmm  = &sp->__fpu_xmm0;
+       struct  __darwin_xmm_reg *ymmh = &sp->__fpu_ymmh0;
+       struct  __darwin_ymm_reg *zmmh = &sp->__fpu_zmmh0;
+#if defined(__x86_64__)
+       struct  __darwin_zmm_reg *zmm  = &sp->__fpu_zmm16;
+
+       for (i = 0; i < ZMM_MAX/2; i++ ) {
+               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
+               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
+               bcopy(&zmmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*zmmh)), sizeof(*zmmh));
+               bcopy(&zmm[i], &vp[(ZMM_MAX/2)+i], sizeof(*zmm));
+       }
+#else
+       for (i = 0; i < ZMM_MAX; i++ ) {
+               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
+               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
+               bcopy(&zmmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*zmmh)), sizeof(*zmmh));
+       }
+#endif
+}
+
+static void
+zmm_sigalrm_handler(int signum __unused, siginfo_t *info __unused, void *ctx)
+{
+       ucontext_t *contextp = (ucontext_t *) ctx;
+       mcontext_t mcontext = contextp->uc_mcontext;
+       X86_AVX512_STATE_T *avx_state = (X86_AVX512_STATE_T *) &mcontext->__fs;
+       uint64_t *xp = (uint64_t *) &avx_state->__fpu_xmm7;
+       uint64_t *yp = (uint64_t *) &avx_state->__fpu_ymmh7;
+       uint64_t *zp = (uint64_t *) &avx_state->__fpu_zmmh7;
+       uint64_t *kp = (uint64_t *) &avx_state->__fpu_k0;
+
+       /* Check for AVX512 state */
+       T_QUIET;
+       T_ASSERT_GE(contextp->uc_mcsize, MCONTEXT_SIZE_512, "check context size");
+
+       /* Check that the state in the context is what's set and expected */
+       copy_zmm_state_to_vector(avx_state, vec512array3);
+       assert_zmm_eq(vec512array3, vec512array0, sizeof(vec512array1));
+       copy_state_to_opmask(avx_state, karray3);
+       assert_opmask_eq(karray3, karray0);
+
+       /* Change the context and break the main loop */
+       xp[0] = STOP_COOKIE_512;
+       yp[0] = STOP_COOKIE_512;
+       zp[0] = STOP_COOKIE_512;
+       kp[7] = STOP_COOKIE_512;
+       checking = FALSE;
+}
+
+void
+zmm_integrity(int time) {
+       mach_msg_type_number_t avx_count = X86_AVX512_STATE_COUNT;
+       kern_return_t kret;
+       X86_AVX512_STATE_T avx_state, avx_state2;
+       mach_port_t ts = mach_thread_self();
+
+       bzero(&avx_state, sizeof(avx_state));
+       bzero(&avx_state2, sizeof(avx_state));
+
+       store_zmm(vec512array2);
+       store_opmask(karray2);
+
+       kret = thread_get_state(
+               ts, X86_AVX512_STATE_FLAVOR, (thread_state_t)&avx_state, &avx_count
+       );
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
+       vec512_to_string(vec512array2, vec_str_buf);
+       opmask_to_string(karray2, karray_str_buf);
+       T_LOG("Initial state:\n%s\n%s", vec_str_buf, karray_str_buf);
+
+       copy_zmm_state_to_vector(&avx_state, vec512array1);
+       assert_zmm_eq(vec512array2, vec512array1, sizeof(vec512array1));
+       copy_state_to_opmask(&avx_state, karray1);
+       assert_opmask_eq(karray2, karray1);
+
+       populate_zmm();
+       populate_opmask();
+
+       kret = thread_get_state(
+               ts, X86_AVX512_STATE_FLAVOR, (thread_state_t)&avx_state2, &avx_count
+       );
+
+       store_zmm(vec512array2);
+       store_opmask(karray2);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
+       vec512_to_string(vec512array2, vec_str_buf);
+       opmask_to_string(karray2, karray_str_buf);
+       T_LOG("Populated state:\n%s\n%s", vec_str_buf, karray_str_buf);
+
+       copy_zmm_state_to_vector(&avx_state2, vec512array1);
+       assert_zmm_eq(vec512array2, vec512array1, sizeof(vec512array1));
+       copy_state_to_opmask(&avx_state2, karray1);
+       assert_opmask_eq(karray2, karray1);
+
+       T_LOG("Running for %ds…", time);
+       start_timer(time, zmm_sigalrm_handler);
+
+       /* re-populate because printing mucks up XMMs */
+       populate_zmm();
+       populate_opmask();
+
+       /* Check state until timer fires */
+       while(checking) {
+               check_zmm();
+       }
+
+       /* Check that the sig handler changed our AVX state */
+       store_zmm(vec512array1);
+       store_opmask(karray1);
+
+       uint64_t *p = (uint64_t *) &vec512array1[7];
+       if (p[0] != STOP_COOKIE_512 ||
+           p[2] != STOP_COOKIE_512 ||
+           p[4] != STOP_COOKIE_512 ||
+           karray1[7] != STOP_COOKIE_512) {
+               vec512_to_string(vec512array1, vec_str_buf);
+               opmask_to_string(karray1, karray_str_buf);
+               T_LOG("State:\n%s\n%s", vec_str_buf, karray_str_buf);
+               T_ASSERT_FAIL("sigreturn failed to stick");
+       }
+
+       T_LOG("Ran for %ds", time);
+       T_PASS("No zmm register corruption occurred");
+}
+
+/*
+ * Main test declarations
+ */
+T_DECL(ymm_integrity,
+       "Quick soak test to verify that AVX "
+       "register state is maintained correctly",
+       T_META_TIMEOUT(NORMAL_RUN_TIME + TIMEOUT_OVERHEAD)) {
+       require_avx();
+       ymm_integrity(NORMAL_RUN_TIME);
+}
+
+T_DECL(ymm_integrity_stress,
+       "Extended soak test to verify that AVX "
+       "register state is maintained correctly",
+       T_META_TIMEOUT(LONG_RUN_TIME + TIMEOUT_OVERHEAD),
+       T_META_ENABLED(false)) {
+       require_avx();
+       ymm_integrity(LONG_RUN_TIME);
+}
+
+T_DECL(zmm_integrity,
+       "Quick soak test to verify that AVX-512 "
+       "register state is maintained correctly",
+       T_META_TIMEOUT(NORMAL_RUN_TIME + TIMEOUT_OVERHEAD)) {
+       require_avx512();
+       zmm_integrity(NORMAL_RUN_TIME);
+}
+
+T_DECL(zmm_integrity_stress,
+       "Extended soak test to verify that AVX-512 "
+       "register state is maintained correctly",
+       T_META_TIMEOUT(LONG_RUN_TIME + TIMEOUT_OVERHEAD),
+       T_META_ENABLED(false)) {
+       require_avx512();
+       zmm_integrity(LONG_RUN_TIME);
+}
+
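
The zmm/ymm tests above gate on the require_avx()/require_avx512() helpers defined earlier in tests/avx.c. As a minimal stand-alone sketch of such a gate (assuming the hw.optional.avx512f sysctl name; not taken from this change), a userspace feature probe could look like:

    #include <stdbool.h>
    #include <sys/sysctl.h>

    static bool
    cpu_has_avx512f(void)
    {
            int supported = 0;
            size_t size = sizeof(supported);

            /* hw.optional.avx512f is assumed to report AVX-512 Foundation support. */
            if (sysctlbyname("hw.optional.avx512f", &supported, &size, NULL, 0) != 0) {
                    return false;   /* sysctl not present: treat as unsupported */
            }
            return supported != 0;
    }

A caller would typically skip, rather than fail, when such a probe returns false.
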
diff --git a/tests/backtracing.c b/tests/backtracing.c
new file mode 100644 (file)
index 0000000..614ec12
--- /dev/null
@@ -0,0 +1,172 @@
+#include <CoreSymbolication/CoreSymbolication.h>
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <execinfo.h>
+#include <pthread.h>
+#include <sys/sysctl.h>
+
+#define USER_FRAMES (12)
+
+#define NON_RECURSE_FRAMES (5)
+
+static const char *user_bt[USER_FRAMES] = {
+    NULL, NULL,
+    "backtrace_thread",
+    "recurse_a", "recurse_b", "recurse_a", "recurse_b",
+    "recurse_a", "recurse_b", "recurse_a",
+    "expect_stack", NULL
+};
+
+static void
+expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
+    unsigned long addr, unsigned int bt_idx, unsigned int max_frames)
+{
+    const char *name;
+    unsigned int frame_idx = max_frames - bt_idx - 1;
+
+    if (bt[frame_idx] == NULL) {
+        T_LOG("frame %2u: skipping system frame", frame_idx);
+        return;
+    }
+
+    if (CSIsNull(symbol)) {
+        T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
+        return;
+    }
+
+    if (frame_idx >= bt_len) {
+        T_FAIL("unexpected frame '%s' (%#lx) at index %u",
+            CSSymbolGetName(symbol), addr, frame_idx);
+        return;
+    }
+
+    name = CSSymbolGetName(symbol);
+    T_QUIET; T_ASSERT_NOTNULL(name, NULL);
+    T_EXPECT_EQ_STR(name, bt[frame_idx],
+        "frame %2u: saw '%s', expected '%s'",
+        frame_idx, name, bt[frame_idx]);
+}
+
+static void __attribute__((noinline,not_tail_called))
+expect_stack(void)
+{
+    uint64_t bt[USER_FRAMES] = { 0 };
+    unsigned int bt_len = USER_FRAMES;
+    int err;
+    size_t bt_filled;
+
+    static dispatch_once_t expect_stacks_once;
+    static bool k64;
+    static CSSymbolicatorRef user_symb;
+
+    dispatch_once(&expect_stacks_once, ^(void) {
+        int errb;
+        int mib[] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, 0 /* kernproc */ };
+
+        struct kinfo_proc kp;
+        size_t len;
+
+        len = sizeof(kp);
+        errb = sysctl(mib, sizeof(mib) / sizeof(mib[0]), &kp, &len, NULL, 0);
+        T_QUIET; T_ASSERT_POSIX_SUCCESS(errb,
+            "sysctl({ CTL_KERN, KERN_PROC, KERN_PROC_PID, 0})");
+
+        k64 = kp.kp_proc.p_flag & P_LP64;
+        T_LOG("executing with a %s-bit kernel", k64 ? "64" : "32");
+
+        user_symb = CSSymbolicatorCreateWithTask(mach_task_self());
+        T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL);
+        T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL);
+    });
+
+    bt_filled = USER_FRAMES;
+    err = sysctlbyname("kern.backtrace.user", bt, &bt_filled, NULL, 0);
+    if (err == ENOENT) {
+        T_SKIP("release kernel: kern.backtrace.user sysctl returned ENOENT");
+    }
+    T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(\"kern.backtrace.user\")");
+
+    bt_len = (unsigned int)bt_filled;
+    T_EXPECT_EQ(bt_len, (unsigned int)USER_FRAMES,
+        "%u frames should be present in backtrace", (unsigned int)USER_FRAMES);
+
+    for (unsigned int i = 0; i < bt_len; i++) {
+        uintptr_t addr;
+#if !defined(__LP64__)
+        /*
+         * Backtrace frames come out as kernel words; convert them back to user
+         * uintptr_t for 32-bit processes.
+         */
+        if (k64) {
+            addr = (uintptr_t)(bt[i]);
+        } else {
+            addr = (uintptr_t)(((uint32_t *)bt)[i]);
+        }
+#else /* defined(__LP64__) */
+        addr = (uintptr_t)bt[i];
+#endif /* !defined(__LP64__) */
+
+        CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime(
+            user_symb, addr, kCSNow);
+        expect_frame(user_bt, USER_FRAMES, symbol, addr, i, bt_len);
+    }
+}
+
+static int __attribute__((noinline,not_tail_called))
+recurse_a(unsigned int frames);
+static int __attribute__((noinline,not_tail_called))
+recurse_b(unsigned int frames);
+
+static int __attribute__((noinline,not_tail_called))
+recurse_a(unsigned int frames)
+{
+    if (frames == 1) {
+        expect_stack();
+        getpid();
+        return 0;
+    }
+
+    return recurse_b(frames - 1) + 1;
+}
+
+static int __attribute__((noinline,not_tail_called))
+recurse_b(unsigned int frames)
+{
+    if (frames == 1) {
+        expect_stack();
+        getpid();
+        return 0;
+    }
+
+    return recurse_a(frames - 1) + 1;
+}
+
+static void *
+backtrace_thread(void *arg)
+{
+#pragma unused(arg)
+    unsigned int calls;
+
+    /*
+     * backtrace_thread, recurse_a, recurse_b, ..., __sysctlbyname
+     *
+     * Always make one less call for this frame (backtrace_thread).
+     */
+    calls = USER_FRAMES - NON_RECURSE_FRAMES;
+
+    T_LOG("backtrace thread calling into %d frames (already at %d frames)",
+       calls, NON_RECURSE_FRAMES);
+    (void)recurse_a(calls);
+    return NULL;
+}
+
+T_DECL(backtrace_user, "test that the kernel can backtrace user stacks",
+    T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
+{
+    pthread_t thread;
+
+    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread,
+        NULL), "create additional thread to backtrace");
+
+    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_join(thread, NULL), NULL);
+}
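
backtracing.c asks the kernel to walk the calling thread's user stack (the kern.backtrace.user sysctl) and then symbolicates the returned addresses with CoreSymbolication. For comparison, a purely in-process capture of the same stack can be sketched with the execinfo.h API the test already includes (illustrative only; the test itself validates the kernel-collected frames):

    #include <execinfo.h>
    #include <stdio.h>
    #include <stdlib.h>

    static void
    log_user_stack(void)
    {
        void *frames[12];
        int nframes = backtrace(frames, 12);                 /* collect return addresses */
        char **names = backtrace_symbols(frames, nframes);   /* best-effort symbol names */

        for (int i = 0; i < nframes; i++) {
            printf("frame %2d: %s\n", i, names != NULL ? names[i] : "<unknown>");
        }
        free(names);
    }

The sysctl path is what the test exercises, since the point is to check the kernel's view of the user stack; the libc sketch above only shows the same frame names as seen from userspace.
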
diff --git a/tests/contextswitch.c b/tests/contextswitch.c
new file mode 100644 (file)
index 0000000..3969ead
--- /dev/null
@@ -0,0 +1,285 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <errno.h>
+#include <err.h>
+#include <string.h>
+#include <assert.h>
+#include <sysexits.h>
+#include <getopt.h>
+#include <spawn.h>
+#include <stdbool.h>
+#include <sys/sysctl.h>
+#include <mach/mach_time.h>
+#include <mach/mach.h>
+#include <mach/semaphore.h>
+#include <TargetConditionals.h>
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <stdatomic.h>
+
+#define MAX_THREADS    32
+#define SPIN_SECS      6
+#define THR_SPINNER_PRI        63
+#define THR_MANAGER_PRI        62
+#define WARMUP_ITERATIONS 100
+#define POWERCTRL_SUCCESS_STR "Factor1: 1.000000"
+
+static mach_timebase_info_data_t timebase_info;
+static semaphore_t semaphore;
+static semaphore_t worker_sem;
+static uint32_t g_numcpus;
+static _Atomic uint32_t keep_going = 1;
+static dt_stat_time_t s;
+
+static struct {
+    pthread_t thread;
+    bool measure_thread;
+} threads[MAX_THREADS];
+
+static uint64_t 
+nanos_to_abs(uint64_t nanos) 
+{ 
+    return nanos * timebase_info.denom / timebase_info.numer;
+}
+
+extern char **environ;
+
+static void
+csw_perf_test_init(void)
+{
+    int spawn_ret, pid;
+    char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL};
+    spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
+    waitpid(pid, &spawn_ret, 0);
+}
+
+static void
+csw_perf_test_cleanup(void)
+{
+    int spawn_ret, pid;
+    char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-d", NULL};
+    spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
+    waitpid(pid, &spawn_ret, 0);
+}
+
+static pthread_t
+create_thread(uint32_t thread_id, uint32_t priority, bool fixpri, 
+        void *(*start_routine)(void *))
+{
+    int rv;
+    pthread_t new_thread;
+    struct sched_param param = { .sched_priority = (int)priority };
+    pthread_attr_t attr;
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_init(&attr), "pthread_attr_init");
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_setschedparam(&attr, &param),
+            "pthread_attr_setschedparam");
+
+    if (fixpri) {
+        T_ASSERT_POSIX_ZERO(pthread_attr_setschedpolicy(&attr, SCHED_RR),
+                "pthread_attr_setschedpolicy");
+    }
+
+    T_ASSERT_POSIX_ZERO(pthread_create(&new_thread, &attr, start_routine,
+            (void*)(uintptr_t)thread_id), "pthread_create");
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_destroy(&attr), "pthread_attr_destroy");
+
+    threads[thread_id].thread = new_thread;
+
+    return new_thread;
+}
+
+/* Spin until a specified number of seconds elapses */
+static void
+spin_for_duration(uint32_t seconds)
+{
+    uint64_t duration       = nanos_to_abs((uint64_t)seconds * NSEC_PER_SEC);
+    uint64_t current_time   = mach_absolute_time();
+    uint64_t timeout        = duration + current_time;
+
+    uint64_t spin_count = 0;
+
+    while (mach_absolute_time() < timeout && atomic_load_explicit(&keep_going,
+               memory_order_relaxed)) {
+        spin_count++;
+    }
+}
+
+static void *
+spin_thread(void *arg)
+{
+    uint32_t thread_id = (uint32_t)(uintptr_t) arg;
+    char name[30] = "";
+
+    snprintf(name, sizeof(name), "spin thread %2d", thread_id);
+    pthread_setname_np(name);
+    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem),
+           "semaphore_wait_signal");
+    spin_for_duration(SPIN_SECS);
+    return NULL;
+}
+
+static void *
+thread(void *arg)
+{
+    uint32_t thread_id = (uint32_t)(uintptr_t) arg;
+    char name[30] = "";
+
+    snprintf(name, sizeof(name), "thread %2d", thread_id);
+    pthread_setname_np(name);
+    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem), "semaphore_wait");
+
+    if (threads[thread_id].measure_thread) {
+        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
+            thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0);
+        }
+        T_STAT_MEASURE_LOOP(s) {
+            if(thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0))
+                T_ASSERT_FAIL("thread_switch");
+        }
+        atomic_store_explicit(&keep_going, 0, memory_order_relaxed);
+    } else {
+        while (atomic_load_explicit(&keep_going, memory_order_relaxed)) {
+            if (thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0))
+                T_ASSERT_FAIL("thread_switch");
+        }
+    }
+    return NULL;
+}
+
+void check_device_temperature(void)
+{
+    char buffer[256];
+    FILE *pipe = popen("powerctrl Factor1", "r");
+    
+    if (pipe == NULL) {
+        T_FAIL("Failed to check device temperature");
+        T_END;
+    }
+
+    fgets(buffer, sizeof(buffer), pipe);
+    
+    if (strncmp(POWERCTRL_SUCCESS_STR, buffer, strlen(POWERCTRL_SUCCESS_STR))) {
+        T_PERF("temperature", 0.0, "factor", "device temperature");
+    } else {
+        T_PASS("Device temperature check pass");
+        T_PERF("temperature", 1.0, "factor", "device temperature");
+    }
+    pclose(pipe);
+}
+
+void record_perfcontrol_stats(const char *sysctlname, const char *units, const char *info)
+{
+    int data = 0;
+    size_t data_size = sizeof(data);
+    T_ASSERT_POSIX_ZERO(sysctlbyname(sysctlname,
+           &data, &data_size, NULL, 0), 
+           "%s", sysctlname);
+    T_PERF(info, data, units, info);
+}
+
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"));
+
+/* Disable the test on macOS for now */
+T_DECL(perf_csw, "context switch performance", T_META_TAG_PERF, T_META_CHECK_LEAKS(false), T_META_ASROOT(true))
+{
+
+#if !CONFIG_EMBEDDED
+    T_SKIP("Not supported on macOS");
+    return;
+#endif /* CONFIG_EMBEDDED */
+    check_device_temperature();
+
+    T_ATEND(csw_perf_test_cleanup);
+
+    csw_perf_test_init();
+    pthread_setname_np("main thread");
+
+    T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebase_info), "mach_timebase_info");
+
+    struct sched_param param = {.sched_priority = 48};
+
+    T_ASSERT_POSIX_ZERO(pthread_setschedparam(pthread_self(), SCHED_FIFO, &param),
+            "pthread_setschedparam");
+
+    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &semaphore,
+            SYNC_POLICY_FIFO, 0), "semaphore_create");
+
+    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &worker_sem,
+            SYNC_POLICY_FIFO, 0), "semaphore_create");
+    
+    size_t ncpu_size = sizeof(g_numcpus);
+    T_ASSERT_POSIX_ZERO(sysctlbyname("hw.ncpu", &g_numcpus, &ncpu_size, NULL, 0),
+            "sysctlbyname hw.ncpu");
+
+    printf("hw.ncpu: %d\n", g_numcpus);
+    uint32_t n_spinners = g_numcpus - 1;
+
+    int mt_supported = 0;
+    size_t mt_supported_size = sizeof(mt_supported);
+    T_ASSERT_POSIX_ZERO(sysctlbyname("kern.monotonic.supported", &mt_supported,
+            &mt_supported_size, NULL, 0), "sysctlbyname kern.monotonic.supported");
+
+    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
+        threads[thread_id].thread = create_thread(thread_id, THR_SPINNER_PRI,
+                true, &spin_thread);
+    }
+
+    s = dt_stat_time_create("context switch time");
+
+    create_thread(n_spinners, THR_MANAGER_PRI, true, &thread);
+    threads[n_spinners].measure_thread = true;
+    create_thread(n_spinners + 1, THR_MANAGER_PRI, true, &thread);
+
+    /* Allow the context switch threads to get into sem_wait() */
+    for (uint32_t thread_id = 0; thread_id < n_spinners + 2; thread_id++) {
+        T_ASSERT_MACH_SUCCESS(semaphore_wait(worker_sem), "semaphore_wait");
+    }
+    
+    int enable_callout_stats = 1;
+    size_t enable_size = sizeof(enable_callout_stats);
+
+    if (mt_supported) {
+        /* Enable callout stat collection */
+        T_ASSERT_POSIX_ZERO(sysctlbyname("kern.perfcontrol_callout.stats_enabled",
+                NULL, 0, &enable_callout_stats, enable_size),
+                "sysctlbyname kern.perfcontrol_callout.stats_enabled");
+    }
+    
+    T_ASSERT_MACH_SUCCESS(semaphore_signal_all(semaphore), "semaphore_signal");
+
+
+    for (uint32_t thread_id = 0; thread_id < n_spinners + 2; thread_id++) {
+        T_ASSERT_POSIX_ZERO(pthread_join(threads[thread_id].thread, NULL),
+                "pthread_join %d", thread_id);
+    }
+
+    if (mt_supported) {
+        record_perfcontrol_stats("kern.perfcontrol_callout.oncore_instr",
+                "instructions", "oncore.instructions");
+        record_perfcontrol_stats("kern.perfcontrol_callout.offcore_instr",
+                "instructions", "offcore.instructions");
+        record_perfcontrol_stats("kern.perfcontrol_callout.oncore_cycles",
+                "cycles", "oncore.cycles");
+        record_perfcontrol_stats("kern.perfcontrol_callout.offcore_cycles",
+                "cycles", "offcore.cycles");
+
+        /* Disable callout stat collection */
+        enable_callout_stats = 0;
+        T_ASSERT_POSIX_ZERO(sysctlbyname("kern.perfcontrol_callout.stats_enabled",
+                NULL, 0, &enable_callout_stats, enable_size),
+                "sysctlbyname kern.perfcontrol_callout.stats_enabled");
+    }
+
+    check_device_temperature();
+    dt_stat_finalize(s);
+}
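
contextswitch.c converts a nanosecond spin duration into mach_absolute_time() units with nanos_to_abs() before timing thread_switch() calls. The inverse conversion, useful when reading raw absolute-time deltas back as nanoseconds, uses the same timebase arithmetic; a minimal sketch (standard Mach timebase usage, not taken from this change):

    #include <stdint.h>
    #include <mach/mach_time.h>

    static uint64_t
    abs_to_nanos(uint64_t abs_delta)
    {
        static mach_timebase_info_data_t tb;

        if (tb.denom == 0) {
            (void)mach_timebase_info(&tb);   /* numer/denom scale abs ticks to nanoseconds */
        }
        return abs_delta * tb.numer / tb.denom;
    }

For example, with a timebase of numer = 125 and denom = 3 (common on Apple arm64 hardware), 24 ticks correspond to 24 * 125 / 3 = 1000 ns.
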
diff --git a/tests/cpucount.c b/tests/cpucount.c
new file mode 100644 (file)
index 0000000..47159c1
--- /dev/null
@@ -0,0 +1,266 @@
+/*
+ * Test to validate that we can schedule threads on all hw.ncpus cores according to _os_cpu_number
+ *
+ * <rdar://problem/29545645>
+ *
+xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -g -Weverything
+xcrun -sdk iphoneos.internal clang -arch arm64 -o cpucount-ios cpucount.c -ldarwintest -g -Weverything
+ */
+
+#include <darwintest.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdalign.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <err.h>
+#include <errno.h>
+#include <sysexits.h>
+#include <sys/sysctl.h>
+#include <stdatomic.h>
+
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+
+#include <os/tsd.h> /* private header for _os_cpu_number */
+
+/* const variables aren't constants, but enums are */
+enum { max_threads = 40 };
+
+#define CACHE_ALIGNED __attribute__((aligned(128)))
+
+static _Atomic CACHE_ALIGNED uint64_t g_ready_threads = 0;
+
+static _Atomic CACHE_ALIGNED bool g_cpu_seen[max_threads];
+
+static _Atomic CACHE_ALIGNED bool g_bail = false;
+
+static uint32_t g_threads; /* set by sysctl hw.ncpu */
+
+static uint64_t g_spin_ms = 50; /* it takes ~50ms of spinning for CLPC to deign to give us all cores */
+
+/*
+ * sometimes pageout scan can eat all of CPU 0 long enough to fail the test,
+ * so we run the test at RT priority
+ */
+static uint32_t g_thread_pri = 97;
+
+/*
+ * add in some extra low-pri threads to convince the amp scheduler to use E-cores consistently
+ * works around <rdar://problem/29636191>
+ */
+static uint32_t g_spin_threads = 2;
+static uint32_t g_spin_threads_pri = 20;
+
+static semaphore_t g_readysem, g_go_sem;
+
+static mach_timebase_info_data_t timebase_info;
+
+static uint64_t nanos_to_abs(uint64_t nanos) { return nanos * timebase_info.denom / timebase_info.numer; }
+
+static void set_realtime(pthread_t thread) {
+       kern_return_t kr;
+       thread_time_constraint_policy_data_t pol;
+
+       mach_port_t target_thread = pthread_mach_thread_np(thread);
+       T_QUIET; T_ASSERT_NOTNULL(target_thread, "pthread_mach_thread_np");
+
+       /* 1s 100ms 10ms */
+       pol.period      = (uint32_t)nanos_to_abs(1000000000);
+       pol.constraint  = (uint32_t)nanos_to_abs(100000000);
+       pol.computation = (uint32_t)nanos_to_abs(10000000);
+
+       pol.preemptible = 0; /* Ignored by OS */
+       kr = thread_policy_set(target_thread, THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol,
+                              THREAD_TIME_CONSTRAINT_POLICY_COUNT);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_set(THREAD_TIME_CONSTRAINT_POLICY)");
+}
+
+static pthread_t
+create_thread(void *(*start_routine)(void *), uint32_t priority)
+{
+       int rv;
+       pthread_t new_thread;
+       pthread_attr_t attr;
+
+       struct sched_param param = { .sched_priority = (int)priority };
+
+       rv = pthread_attr_init(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
+
+       rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setdetachstate");
+
+       rv = pthread_attr_setschedparam(&attr, &param);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setschedparam");
+
+       rv = pthread_create(&new_thread, &attr, start_routine, NULL);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
+
+       if (priority == 97)
+               set_realtime(new_thread);
+
+       rv = pthread_attr_destroy(&attr);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
+
+       return new_thread;
+}
+
+static void *
+thread_fn(__unused void *arg)
+{
+       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
+
+       kern_return_t kr;
+
+       kr = semaphore_wait_signal(g_go_sem, g_readysem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+
+       /* atomic inc to say hello */
+       g_ready_threads++;
+
+       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
+
+       /*
+        * spin to force the other threads to spread out across the cores
+        * may take some time if cores are masked and CLPC needs to warm up to unmask them
+        */
+       while (g_ready_threads < g_threads && mach_absolute_time() < timeout);
+
+       T_QUIET; T_ASSERT_GE(timeout, mach_absolute_time(), "waiting for all threads took too long");
+
+       timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
+
+       int iteration = 0;
+       uint32_t cpunum = 0;
+
+       /* search for new CPUs for the duration */
+       while (mach_absolute_time() < timeout) {
+               cpunum = _os_cpu_number();
+
+               assert(cpunum < max_threads);
+
+               g_cpu_seen[cpunum] = true;
+
+               if (iteration++ % 10000) {
+                       uint32_t cpus_seen = 0;
+
+                       for (uint32_t i = 0 ; i < g_threads; i++) {
+                               if (g_cpu_seen[i])
+                                       cpus_seen++;
+                       }
+
+                       /* bail out early if we saw all CPUs */
+                       if (cpus_seen == g_threads)
+                               break;
+               }
+       }
+
+       g_bail = true;
+
+       printf("thread cpunum: %d\n", cpunum);
+
+       kr = semaphore_wait_signal(g_go_sem, g_readysem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+
+       return NULL;
+}
+
+static void *
+spin_fn(__unused void *arg)
+{
+       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
+
+       kern_return_t kr;
+
+       kr = semaphore_wait_signal(g_go_sem, g_readysem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+
+       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC * 2) + mach_absolute_time();
+
+       /*
+        * run and sleep a bit to force some scheduler churn to get all the cores active
+        * needed to work around bugs in the amp scheduler
+        */
+       while (mach_absolute_time() < timeout && g_bail == false) {
+               usleep(500);
+
+               uint64_t inner_timeout = nanos_to_abs(1 * NSEC_PER_MSEC) + mach_absolute_time();
+
+               while (mach_absolute_time() < inner_timeout && g_bail == false);
+       }
+
+       kr = semaphore_wait_signal(g_go_sem, g_readysem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
+
+       return NULL;
+}
+
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wgnu-flexible-array-initializer"
+T_DECL(count_cpus, "Tests we can schedule threads on all hw.ncpus cores according to _os_cpu_number",
+       T_META_CHECK_LEAKS(false), T_META_ENABLED(false))
+#pragma clang diagnostic pop
+{
+       setvbuf(stdout, NULL, _IONBF, 0);
+       setvbuf(stderr, NULL, _IONBF, 0);
+
+       int rv;
+       kern_return_t kr;
+       kr = mach_timebase_info(&timebase_info);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
+
+       kr = semaphore_create(mach_task_self(), &g_readysem, SYNC_POLICY_FIFO, 0);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
+
+       kr = semaphore_create(mach_task_self(), &g_go_sem, SYNC_POLICY_FIFO, 0);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
+
+       size_t ncpu_size = sizeof(g_threads);
+       rv = sysctlbyname("hw.ncpu", &g_threads, &ncpu_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
+
+       printf("hw.ncpu: %2d\n", g_threads);
+
+       assert(g_threads < max_threads);
+
+       for (uint32_t i = 0; i < g_threads; i++)
+               create_thread(&thread_fn, g_thread_pri);
+
+       for (uint32_t i = 0; i < g_spin_threads; i++)
+               create_thread(&spin_fn, g_spin_threads_pri);
+
+       for (uint32_t i = 0 ; i < g_threads + g_spin_threads; i++) {
+               kr = semaphore_wait(g_readysem);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
+       }
+
+       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
+
+       /* spin to warm up CLPC :) */
+       while (mach_absolute_time() < timeout);
+
+       kr = semaphore_signal_all(g_go_sem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");
+
+       for (uint32_t i = 0 ; i < g_threads + g_spin_threads; i++) {
+               kr = semaphore_wait(g_readysem);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
+       }
+
+       uint32_t cpus_seen = 0;
+
+       for (uint32_t i = 0 ; i < g_threads; i++) {
+               if (g_cpu_seen[i])
+                       cpus_seen++;
+
+               printf("cpu %2d: %d\n", i, g_cpu_seen[i]);
+       }
+
+       T_ASSERT_EQ(cpus_seen, g_threads, "test should have run threads on all CPUS");
+}
+
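
cpucount.c releases all of its workers at once through a ready/go semaphore pair: each thread parks in semaphore_wait_signal(g_go_sem, g_readysem), the main thread collects one g_readysem signal per worker, and semaphore_signal_all(g_go_sem) then starts everyone together. A minimal sketch of that rendezvous pattern, with hypothetical names (illustrative only):

    #include <mach/mach.h>
    #include <pthread.h>

    static semaphore_t ready_sem, go_sem;

    static void *
    worker(void *arg __attribute__((unused)))
    {
        /* Announce readiness and block until the coordinator releases everyone. */
        (void)semaphore_wait_signal(go_sem, ready_sem);
        /* ... timed work happens here ... */
        return NULL;
    }

    static void
    run_workers(uint32_t nthreads)
    {
        (void)semaphore_create(mach_task_self(), &ready_sem, SYNC_POLICY_FIFO, 0);
        (void)semaphore_create(mach_task_self(), &go_sem, SYNC_POLICY_FIFO, 0);

        for (uint32_t i = 0; i < nthreads; i++) {
            pthread_t t;
            (void)pthread_create(&t, NULL, worker, NULL);
            (void)pthread_detach(t);
        }
        for (uint32_t i = 0; i < nthreads; i++) {
            (void)semaphore_wait(ready_sem);   /* one signal per parked worker */
        }
        (void)semaphore_signal_all(go_sem);    /* release all workers together */
    }

The test reuses the same pair at the end of the run to detect when every thread has finished its search loop.
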
diff --git a/tests/data_protection.c b/tests/data_protection.c
new file mode 100644 (file)
index 0000000..c9a69fe
--- /dev/null
@@ -0,0 +1,1130 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <IOKit/IOKitLib.h>
+#include <Kernel/IOKit/crypto/AppleKeyStoreDefs.h>
+#include <Kernel/sys/content_protection.h>
+
+#define CPT_IO_SIZE      4096
+#define CPT_AKS_BUF_SIZE 256
+#define CPT_MAX_PASS_LEN 64
+
+#define GET_PROT_CLASS(fd) \
+       fcntl((fd), F_GETPROTECTIONCLASS)
+
+#define SET_PROT_CLASS(fd, prot_class) \
+       fcntl((fd), F_SETPROTECTIONCLASS, (prot_class))
+
+#define KEYSTORECTL_PATH  "/usr/local/bin/keystorectl"
+#define KEYBAGDTEST_PATH  "/usr/local/bin/keybagdTest"
+#define TEMP_DIR_TEMPLATE "/tmp/data_protection_test.XXXXXXXX"
+#define TEST_PASSCODE     "IAmASecurePassword"
+
+int g_fd           = -1;
+int g_dir_fd       = -1;
+int g_subdir_fd    = -1;
+int g_passcode_set = 0;
+
+char g_test_tempdir[PATH_MAX] = TEMP_DIR_TEMPLATE;
+char g_filepath[PATH_MAX]     = "";
+char g_dirpath[PATH_MAX]      = "";
+char g_subdirpath[PATH_MAX]   = "";
+
+int apple_key_store(
+       uint32_t command,
+       uint64_t * inputs,
+       uint32_t input_count,
+       void * input_structs,
+       size_t input_struct_count,
+       uint64_t * outputs,
+       uint32_t * output_count
+);
+int spawn_proc(char * const command[]);
+int supports_content_prot(void);
+char* dp_class_num_to_string(int num);
+int lock_device(void);
+int unlock_device(char * passcode);
+int set_passcode(char * new_passcode, char * old_passcode);
+int clear_passcode(char * passcode);
+int has_passcode(void);
+void setup(void);
+void cleanup(void);
+
+T_DECL(data_protection,
+       "Verify behavior of the various data protection classes") {
+       int local_result = -1;
+       int new_prot_class = -1;
+       int old_prot_class = -1;
+       int current_byte = 0;
+       char rd_buffer[CPT_IO_SIZE];
+       char wr_buffer[CPT_IO_SIZE];
+
+       setup();
+
+       /*
+        * Ensure we can freely read and change
+        * protection classes when unlocked.
+        */
+       for(
+               new_prot_class = PROTECTION_CLASS_A;
+               new_prot_class <= PROTECTION_CLASS_F;
+               new_prot_class++
+       ) {
+               T_ASSERT_NE(
+                       old_prot_class = GET_PROT_CLASS(g_fd),
+                       -1,
+                       "Get protection class when locked"
+               );
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       SET_PROT_CLASS(g_fd, new_prot_class),
+                       -1,
+                       "Should be able to change protection "
+                       "from %s to %s while unlocked",
+                       dp_class_num_to_string(old_prot_class),
+                       dp_class_num_to_string(new_prot_class)
+               );
+       }
+
+       /* Query the filesystem for the default CP level (Is it C?) */
+#ifndef F_GETDEFAULTPROTLEVEL
+#define F_GETDEFAULTPROTLEVEL 79
+#endif
+
+       T_WITH_ERRNO;
+       T_ASSERT_NE(
+               old_prot_class = fcntl(g_fd, F_GETDEFAULTPROTLEVEL),
+               -1,
+               "Get default protection level for filesystem"
+       );
+
+       /* XXX: Do we want to do anything with the level? What should it be? */
+
+       /*
+        * files are allowed to move into F, but not out of it. They can also
+        * only do so when they do not have content.
+        */
+       close(g_fd);
+       unlink(g_filepath);
+
+       /* re-create the file */
+       T_WITH_ERRNO;
+       T_ASSERT_GE(
+               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0777),
+               0,
+               "Recreate test file"
+       );
+
+       /* Try making a class A file while locked. */
+       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_A),
+               -1,
+               "Should not be able to change protection "
+               "from class D to class A when locked"
+       );
+       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
+
+       /* Attempt opening/IO to a class A file while unlocked. */
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_A),
+               0,
+               "Should be able to change protection "
+               "from class D to class A when unlocked"
+       );
+
+       close(g_fd);
+
+       T_WITH_ERRNO;
+       T_ASSERT_GE(
+               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
+               0,
+               "Should be able to open a class A file when unlocked");
+
+       /*
+        * TODO: Write specific data we can check for. If we're going to do
+        * that, the write scheme should be deliberately ugly.
+        */
+       current_byte = 0;
+
+       while(current_byte < CPT_IO_SIZE) {
+               local_result = pwrite(
+                       g_fd,
+                       &wr_buffer[current_byte],
+                       CPT_IO_SIZE - current_byte,
+                       current_byte
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       local_result,
+                       -1,
+                       "Should be able to write to "
+                       "a class A file when unlocked"
+               );
+
+               current_byte += local_result;
+       }
+
+       current_byte = 0;
+
+       while(current_byte < CPT_IO_SIZE) {
+               local_result = pread(
+                       g_fd,
+                       &rd_buffer[current_byte],
+                       CPT_IO_SIZE - current_byte,
+                       current_byte
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       local_result,
+                       -1,
+                       "Should be able to read from "
+                       "a class A file when unlocked"
+               );
+
+               current_byte += local_result;
+       }
+
+       /*
+        * Again, but now while locked; and try to change the file class
+        * as well.
+        */
+       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
+
+       T_ASSERT_LE(
+               pread(g_fd, rd_buffer, CPT_IO_SIZE, 0),
+               0,
+               "Should not be able to read from a class A file when locked"
+       );
+
+       T_ASSERT_LE(
+               pwrite(g_fd, wr_buffer, CPT_IO_SIZE, 0),
+               0,
+               "Should not be able to write to a class A file when locked"
+       );
+
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_D),
+               -1,
+               "Should not be able to change protection "
+               "from class A to class D when locked"
+       );
+
+       /* Try to open and truncate the file. */
+       close(g_fd);
+
+       T_ASSERT_EQ(
+               g_fd = open(g_filepath, O_RDWR|O_TRUNC|O_CLOEXEC),
+               -1,
+               "Should not be able to open and truncate "
+               "a class A file when locked"
+       );
+
+       /* Try to open the file */
+       T_ASSERT_EQ(
+               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
+               -1,
+               "Should not be able to open a class A file when locked"
+       );
+
+       /* What about class B files? */
+       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
+
+       T_ASSERT_GE(
+               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
+               0,
+               "Should be able to open a class A file when unlocked"
+       );
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_D),
+               0,
+               "Should be able to change protection "
+               "class from A to D when unlocked"
+       );
+
+       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
+
+       /* Can we create a class B file while locked? */
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_B),
+               0,
+               "Should be able to change protection "
+               "class from D to B when locked"
+       );
+
+       T_ASSERT_EQ(
+               GET_PROT_CLASS(g_fd),
+               PROTECTION_CLASS_B,
+               "File should now have class B protection"
+       );
+
+       /*
+        * We should also be able to read/write to the
+        * file descriptor while it is open.
+        */
+       current_byte = 0;
+
+       while(current_byte < CPT_IO_SIZE) {
+               local_result = pwrite(
+                       g_fd,
+                       &wr_buffer[current_byte],
+                       CPT_IO_SIZE - current_byte,
+                       current_byte
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       local_result,
+                       -1,
+                       "Should be able to write to a "
+                       "new class B file when locked"
+               );
+
+               current_byte += local_result;
+       }
+
+       current_byte = 0;
+
+       while(current_byte < CPT_IO_SIZE) {
+               local_result = pread(
+                       g_fd,
+                       &rd_buffer[current_byte],
+                       CPT_IO_SIZE - current_byte,
+                       current_byte
+               );
+
+               T_ASSERT_NE(
+                       local_result,
+                       -1,
+                       "Should be able to read from a "
+                       "new class B file when locked"
+               );
+
+               current_byte += local_result;
+       }
+
+       /* We should not be able to open a class B file under lock. */
+       close(g_fd);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
+               -1,
+               "Should not be able to open a class B file when locked"
+       );
+
+       unlink(g_filepath);
+
+       /* We still need to test directory semantics. */
+       T_WITH_ERRNO;
+       T_ASSERT_NE(
+               mkdir(g_dirpath, 0777),
+               -1,
+               "Should be able to create a new directory when locked"
+       );
+
+       /* The newly created directory should not have a protection class. */
+       T_ASSERT_NE(
+               g_dir_fd = open(g_dirpath, O_RDONLY|O_CLOEXEC),
+               -1,
+               "Should be able to open an unclassed directory when locked"
+       );
+
+       T_ASSERT_TRUE(
+               GET_PROT_CLASS(g_dir_fd) == PROTECTION_CLASS_D ||
+               GET_PROT_CLASS(g_dir_fd) == PROTECTION_CLASS_DIR_NONE,
+               "Directory protection class should be D or NONE"
+       );
+
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_A),
+               0,
+               "Should be able to change a directory from "
+               "class D to class A while locked"
+       );
+
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_D),
+               0,
+               "Should be able to change a directory from "
+               "class A to class D while locked"
+       );
+
+       /*
+        * Do all files created in the directory properly inherit the
+        * directory's protection class?
+        */
+       T_SETUPBEGIN;
+       T_ASSERT_LT(
+               strlcpy(g_filepath, g_dirpath, PATH_MAX),
+               PATH_MAX,
+               "Construct path for file in the directory"
+       );
+       T_ASSERT_LT(
+               strlcat(g_filepath, "test_file", PATH_MAX),
+               PATH_MAX,
+               "Construct path for file in the directory"
+       );
+       T_SETUPEND;
+
+       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
+
+       for(
+               new_prot_class = PROTECTION_CLASS_A;
+               new_prot_class <= PROTECTION_CLASS_D;
+               new_prot_class++
+       ) {
+               int getclass_dir;
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       old_prot_class = GET_PROT_CLASS(g_dir_fd),
+                       -1,
+                       "Get protection class for the directory"
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_EQ(
+                       SET_PROT_CLASS(g_dir_fd, new_prot_class),
+                       0,
+                       "Should be able to change directory "
+                       "protection from %s to %s",
+                       dp_class_num_to_string(old_prot_class),
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_EXPECT_EQ(
+                       getclass_dir = GET_PROT_CLASS(g_dir_fd),
+                       new_prot_class,
+                       "Get protection class for the directory"
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_GE(
+                       g_fd = open(g_filepath, O_CREAT|O_EXCL|O_CLOEXEC, 0777),
+                       0,
+                       "Should be able to create file in "
+                       "%s directory when unlocked",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       local_result = GET_PROT_CLASS(g_fd),
+                       -1,
+                       "Get the new file's protection class"
+               );
+
+               T_ASSERT_EQ(
+                       local_result,
+                       new_prot_class,
+                       "File should have %s protection",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               close(g_fd);
+               unlink(g_filepath);
+       }
+
+       /* Do we disallow creation of a class F directory? */
+       T_ASSERT_NE(
+               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_F),
+               0,
+               "Should not be able to create class F directory"
+       );
+
+       /*
+        * Are class A and class B semantics followed for when
+        * we create these files during lock?
+        */
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_A),
+               0,
+               "Should be able to change protection "
+               "from class D to class A when unlocked"
+       );
+
+       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
+
+       T_ASSERT_EQ(
+               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_CLOEXEC, 0777),
+               -1,
+               "Should not be able to create a new file "
+               "in a class A directory when locked"
+       );
+
+       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(
+               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_B),
+               0,
+               "Should be able to change directory "
+               "from class A to class B when unlocked"
+       );
+
+       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
+
+       T_ASSERT_GE(
+               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0777),
+               0,
+               "Should be able to create a new file "
+               "in class B directory when locked"
+       );
+
+       T_ASSERT_NE(
+               local_result = GET_PROT_CLASS(g_fd),
+               -1,
+               "Get the new file's protection class"
+       );
+
+       T_ASSERT_EQ(
+               local_result,
+               PROTECTION_CLASS_B,
+               "File should inherit protection class of class B directory"
+       );
+
+       /* What happens when we try to create new subdirectories? */
+       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
+
+       for(
+               new_prot_class = PROTECTION_CLASS_A;
+               new_prot_class <= PROTECTION_CLASS_D;
+               new_prot_class++
+       ) {
+               T_WITH_ERRNO;
+               T_ASSERT_EQ(
+                       SET_PROT_CLASS(g_dir_fd, new_prot_class),
+                       0,
+                       "Change directory to %s",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       mkdir(g_subdirpath, 0777),
+                       -1,
+                       "Create subdirectory in %s directory",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_WITH_ERRNO;
+               T_ASSERT_NE(
+                       g_subdir_fd = open(g_subdirpath, O_RDONLY|O_CLOEXEC),
+                       -1,
+                       "Should be able to open subdirectory in %s directory",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_ASSERT_NE(
+                       local_result = GET_PROT_CLASS(g_subdir_fd),
+                       -1,
+                       "Get protection class of new subdirectory "
+                       "of %s directory",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               T_ASSERT_EQ(
+                       local_result,
+                       new_prot_class,
+                       "New subdirectory should have same class as %s parent",
+                       dp_class_num_to_string(new_prot_class)
+               );
+
+               close(g_subdir_fd);
+               rmdir(g_subdirpath);
+       }
+}
+
+void
+setup(void) {
+       int ret = 0;
+       int local_result = -1;
+
+       T_SETUPBEGIN;
+
+       T_ATEND(cleanup);
+
+       T_WITH_ERRNO;
+       T_ASSERT_NOTNULL(
+               mkdtemp(g_test_tempdir),
+               "Create temporary directory for test"
+       );
+       T_LOG("Test temp dir: %s", g_test_tempdir);
+
+       T_ASSERT_NE(
+               local_result = supports_content_prot(),
+               -1,
+               "Get content protection support status"
+       );
+
+       if(local_result == 0) {
+               T_SKIP("Data protection not supported on this system");
+       }
+
+       T_ASSERT_EQ(
+               has_passcode(),
+               0,
+               "Device should not have existing passcode"
+       );
+
+       T_ASSERT_EQ(
+               set_passcode(TEST_PASSCODE, NULL),
+               0,
+               "Set test passcode"
+       );
+
+       bzero(g_filepath, PATH_MAX);
+       bzero(g_dirpath, PATH_MAX);
+       bzero(g_subdirpath, PATH_MAX);
+
+       ret |= (strlcat(g_filepath, g_test_tempdir, PATH_MAX) == PATH_MAX);
+       ret |= (strlcat(g_filepath, "/", PATH_MAX) == PATH_MAX);
+       ret |= (strlcpy(g_dirpath, g_filepath, PATH_MAX) == PATH_MAX);
+       ret |= (strlcat(g_filepath, "test_file", PATH_MAX) == PATH_MAX);
+       ret |= (strlcat(g_dirpath, "test_dir/", PATH_MAX) == PATH_MAX);
+       ret |= (strlcpy(g_subdirpath, g_dirpath, PATH_MAX) == PATH_MAX);
+       ret |= (strlcat(g_subdirpath, "test_subdir/", PATH_MAX) == PATH_MAX);
+
+       T_QUIET;
+       T_ASSERT_EQ(ret, 0, "Initialize test path strings");
+
+       T_WITH_ERRNO;
+       T_ASSERT_GE(
+               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0777),
+               0,
+               "Create test file"
+       );
+
+       T_SETUPEND;
+}
+
+void
+cleanup(void) {
+       T_LOG("Cleaning up…");
+
+       if(g_subdir_fd >= 0) {
+               T_LOG("Cleanup: closing fd %d", g_subdir_fd);
+               close(g_subdir_fd);
+       }
+
+       if(g_subdirpath[0]) {
+               T_LOG("Cleanup: removing %s", g_subdirpath);
+               rmdir(g_subdirpath);
+       }
+
+       if(g_fd >= 0) {
+               T_LOG("Cleanup: closing fd %d", g_fd);
+               close(g_fd);
+       }
+
+       if(g_filepath[0]) {
+               T_LOG("Cleanup: removing %s", g_filepath);
+               unlink(g_filepath);
+       }
+
+       if(g_dir_fd >= 0) {
+               T_LOG("Cleanup: closing fd %d", g_dir_fd);
+               close(g_dir_fd);
+       }
+
+       if(g_dirpath[0]) {
+               T_LOG("Cleanup: removing %s", g_dirpath);
+               rmdir(g_dirpath);
+       }
+
+       if(strcmp(g_test_tempdir, TEMP_DIR_TEMPLATE)) {
+               T_LOG("Cleanup: removing %s", g_test_tempdir);
+               rmdir(g_test_tempdir);
+       }
+
+       if(g_passcode_set) {
+               T_LOG("Cleanup: unlocking device");
+               if(unlock_device(TEST_PASSCODE)) {
+                       T_LOG("Warning: failed to unlock device in cleanup");
+               }
+
+               T_LOG("Cleanup: clearing passcode");
+               if(clear_passcode(TEST_PASSCODE)) {
+                       T_LOG("Warning: failed to clear passcode in cleanup");
+               }
+       }
+}
+
+int
+set_passcode(char * new_passcode, char * old_passcode) {
+       int result = -1;
+
+#ifdef KEYBAG_ENTITLEMENTS
+       /* If we're entitled, we can set the passcode ourselves. */
+       uint64_t inputs[] = {device_keybag_handle};
+       uint32_t input_count = (sizeof(inputs) / sizeof(*inputs));
+       void * input_structs = NULL;
+       size_t input_struct_count = 0;
+       char buffer[CPT_AKS_BUF_SIZE];
+       char * buffer_ptr = buffer;
+       uint32_t old_passcode_len = 0;
+       uint32_t new_passcode_len = 0;
+
+       T_LOG("%s(): using keybag entitlements", __func__);
+
+       /* Guard against NULL passcodes before measuring their length. */
+       old_passcode_len = (old_passcode != NULL) ? strnlen(old_passcode, CPT_MAX_PASS_LEN) : 0;
+       new_passcode_len = (new_passcode != NULL) ? strnlen(new_passcode, CPT_MAX_PASS_LEN) : 0;
+
+       if((old_passcode == NULL) || (old_passcode_len == CPT_MAX_PASS_LEN)) {
+               old_passcode = "";
+               old_passcode_len = 0;
+       }
+       if((new_passcode == NULL) || (new_passcode_len == CPT_MAX_PASS_LEN)) {
+               new_passcode = "";
+               new_passcode_len = 0;
+       }
+
+       *((uint32_t *) buffer_ptr) = ((uint32_t) 2);
+       buffer_ptr += sizeof(uint32_t);
+
+       *((uint32_t *) buffer_ptr) = old_passcode_len;
+       buffer_ptr += sizeof(uint32_t);
+
+       memcpy(buffer_ptr, old_passcode, old_passcode_len);
+       buffer_ptr += ((old_passcode_len + sizeof(uint32_t) - 1) &
+               ~(sizeof(uint32_t) - 1));
+
+       *((uint32_t *) buffer_ptr) = new_passcode_len;
+       buffer_ptr += sizeof(uint32_t);
+
+       memcpy(buffer_ptr, new_passcode, new_passcode_len);
+       buffer_ptr += ((new_passcode_len + sizeof(uint32_t) - 1) &
+               ~(sizeof(uint32_t) - 1));
+
+       input_structs = buffer;
+       input_struct_count = (buffer_ptr - buffer);
+
+       result = apple_key_store(
+               kAppleKeyStoreKeyBagSetPasscode,
+               inputs,
+               input_count,
+               input_structs,
+               input_struct_count,
+               NULL,
+               NULL
+       );
+#else
+       /*
+        * If we aren't entitled, we'll need to use
+        * keystorectl to set the passcode.
+        */
+       T_LOG("%s(): using keystorectl", __func__);
+
+       if(
+               (old_passcode == NULL) ||
+               (strnlen(old_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
+       ) {
+               old_passcode = "";
+       }
+
+       if(
+               (new_passcode == NULL) ||
+               (strnlen(new_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
+       ) {
+               new_passcode = "";
+       }
+
+       char * const keystorectl_args[] = {
+               KEYSTORECTL_PATH,
+               "change-password",
+               old_passcode,
+               new_passcode,
+               NULL
+       };
+       result = spawn_proc(keystorectl_args);
+#endif /* KEYBAG_ENTITLEMENTS */
+       if(result == 0 && new_passcode != NULL) {
+               g_passcode_set = 1;
+       } else if(result == 0 && new_passcode == NULL) {
+               g_passcode_set = 0;
+       }
+
+       return(result);
+}
+
+int
+clear_passcode(char * passcode) {
+       /*
+        * For the moment, this will set the passcode to the empty string
+        * (a known value); this will most likely need to change, or running
+        * this test may ruin everything™
+        */
+       return set_passcode(NULL, passcode);
+}
+
+int
+has_passcode(void) {
+       return set_passcode(NULL, NULL);
+}
+
+int
+lock_device(void) {
+       int result = -1;
+
+       /*
+        * Pass in the path to keybagdTest instead. By doing this, we bypass
+        * the shortcut to get in to the keybag via IOKit and instead use the
+        * pre-existing command line tool.
+        *
+        * This also goes through the normal "lock → locking (10s) → locked"
+        * flow that would normally occur during system runtime when the
+        * lock button is depressed. To ensure that our single threaded test
+        * works properly in this case, poll until we can't create a class A
+        * file to be safe.
+        */
+       char * const kbd_args[] = {KEYBAGDTEST_PATH, "lock", NULL};
+       result = spawn_proc(kbd_args);
+       if(result) {
+               return result;
+       }
+
+       /*
+        * Delete the file if it is present. Note that this may fail if the
+        * file is actually not there. So don't bomb out if we can't delete
+        * this file right now.
+        */
+       (void) unlink("/private/var/foo_test_file");
+
+       while(1) {
+               int dp_fd;
+
+               dp_fd = open_dprotected_np(
+                       "/private/var/foo_test_file",
+                       O_RDWR|O_CREAT,
+                       PROTECTION_CLASS_A,
+                       0
+               );
+
+               if(dp_fd >= 0) {
+                       /* delete it and sleep */
+                       close(dp_fd);
+                       result = unlink("/private/var/foo_test_file");
+
+                       if(result) {
+                               return result;
+                       }
+
+                       sync();
+                       sleep(1);
+               } else {
+                       /* drop out of our polling loop. */
+                       break;
+               }
+       }
+
+       /*
+        * Note that our loop breakout condition is whether or not we can
+        * create a class A file, so that loop may execute up to 10 times
+        * (due to the 10s grace period). By the time we get here, we assume
+        * that we didn't hit any of the error cases above.
+        */
+
+       return 0;
+}
+
+int
+unlock_device(char * passcode) {
+       int result = -1;
+
+#ifdef  KEYBAG_ENTITLEMENTS
+       /* If we're entitled, we can unlock the device ourselves. */
+       uint64_t inputs[] = {device_keybag_handle};
+       uint32_t input_count = (sizeof(inputs) / sizeof(*inputs));
+       size_t input_struct_count = 0;
+
+       T_LOG("%s(): using keybag entitlements", __func__);
+
+       input_struct_count = (passcode != NULL) ? strnlen(passcode, CPT_MAX_PASS_LEN) : 0;
+       if((passcode == NULL) || (input_struct_count == CPT_MAX_PASS_LEN)) {
+               passcode = "";
+               input_struct_count = 0;
+       }
+
+       result = apple_key_store(
+               kAppleKeyStoreKeyBagUnlock,
+               inputs,
+               input_count,
+               passcode,
+               input_struct_count,
+               NULL,
+               NULL
+       );
+#else
+       /*
+        * If we aren't entitled, we'll need to use
+        * keystorectl to unlock the device.
+        */
+       T_LOG("%s(): using keystorectl", __func__);
+
+       if(
+               (passcode == NULL) ||
+               (strnlen(passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
+       ) {
+               passcode = "";
+       }
+
+       char * const keystorectl_args[] = {
+               KEYSTORECTL_PATH, "unlock", passcode, NULL
+       };
+
+       result = spawn_proc(keystorectl_args);
+#endif /* KEYBAG_ENTITLEMENTS */
+
+       return(result);
+}
+
+/*
+ * Code based on Mobile Key Bag; specifically
+ * MKBDeviceSupportsContentProtection and
+ * MKBDeviceFormattedForContentProtection.
+ *
+ * We want to verify that we support content protection, and that
+ * we are formatted for it.
+ */
+int
+supports_content_prot(void) {
+       int local_result = -1;
+       int result = -1;
+       uint32_t buffer_size = 1;
+       char buffer[buffer_size];
+       io_registry_entry_t defaults = IO_OBJECT_NULL;
+       kern_return_t k_result = KERN_FAILURE;
+       struct statfs statfs_results;
+
+       defaults = IORegistryEntryFromPath(
+               kIOMasterPortDefault,
+               kIODeviceTreePlane ":/defaults"
+       );
+
+       if(defaults == IO_OBJECT_NULL) {
+               /* Assume data protection is unsupported */
+               T_LOG(
+                       "%s(): no defaults entry in IORegistry",
+                       __func__
+               );
+               return 0;
+       }
+
+       k_result = IORegistryEntryGetProperty(
+               defaults,
+               "content-protect",
+               buffer,
+               &buffer_size
+       );
+
+       if(k_result != KERN_SUCCESS) {
+               /* Assume data protection is unsupported */
+               T_LOG(
+                       "%s(): no content-protect property in IORegistry",
+                       __func__
+               );
+               return 0;
+       }
+
+       /*
+        * At this point, we SUPPORT content protection… but are we
+        * formatted for it? This is ugly; we should be testing the file
+        * system we'll be testing in, not just /tmp/.
+        */
+       local_result = statfs(g_test_tempdir, &statfs_results);
+
+       if(local_result == -1) {
+               T_LOG(
+                       "%s(): failed to statfs the test directory, errno = %s",
+                       __func__, strerror(errno)
+               );
+               return -1;
+       } else if(statfs_results.f_flags & MNT_CPROTECT) {
+               return 1;
+       } else {
+               T_LOG(
+                       "%s(): filesystem not formatted for data protection",
+                       __func__
+               );
+               return 0;
+       }
+}
+
+/*
+ * Shamelessly ripped from keystorectl routines;
+ * a wrapper for invoking the AKS user client.
+ */
+int
+apple_key_store(uint32_t command,
+                uint64_t * inputs,
+                uint32_t input_count,
+                void * input_structs,
+                size_t input_struct_count,
+                uint64_t * outputs,
+                uint32_t * output_count) {
+       int result = -1;
+       io_connect_t connection = IO_OBJECT_NULL;
+       io_registry_entry_t apple_key_bag_service = IO_OBJECT_NULL;
+       kern_return_t k_result = KERN_FAILURE;
+       IOReturn io_result = kIOReturnError;
+
+       apple_key_bag_service = IOServiceGetMatchingService(
+               kIOMasterPortDefault,
+               IOServiceMatching(kAppleKeyStoreServiceName)
+       );
+       if(apple_key_bag_service == IO_OBJECT_NULL) {
+               T_LOG(
+                       "%s: failed to match kAppleKeyStoreServiceName",
+                       __func__
+               );
+               goto end;
+       }
+
+       k_result = IOServiceOpen(
+               apple_key_bag_service,
+               mach_task_self(),
+               0,
+               &connection
+       );
+       if(k_result != KERN_SUCCESS) {
+               T_LOG(
+                       "%s: failed to open AppleKeyStore: "
+                       "IOServiceOpen() returned %d",
+                       __func__, k_result
+               );
+               goto end;
+       }
+
+       k_result = IOConnectCallMethod(
+               connection,
+               kAppleKeyStoreUserClientOpen,
+               NULL, 0, NULL, 0, NULL, NULL, NULL, NULL
+       );
+       if(k_result != KERN_SUCCESS) {
+               T_LOG(
+                       "%s: call to AppleKeyStore method "
+                       "kAppleKeyStoreUserClientOpen failed",
+                       __func__
+               );
+               goto close;
+       }
+
+       io_result = IOConnectCallMethod(
+               connection, command, inputs, input_count, input_structs,
+               input_struct_count, outputs, output_count, NULL, NULL
+       );
+       if(io_result != kIOReturnSuccess) {
+               T_LOG("%s: call to AppleKeyStore method %d failed", __func__, command);
+               goto close;
+       }
+
+       result = 0;
+
+close:
+       IOServiceClose(connection);
+end:
+       return(result);
+}
+
+/*
+ * Helper function for launching tools
+ */
+int
+spawn_proc(char * const command[]) {
+       pid_t pid           = 0;
+       int launch_tool_ret = 0;
+       bool waitpid_ret    = true;
+       int status          = 0;
+       int signal          = 0;
+       int timeout         = 30;
+
+       launch_tool_ret = dt_launch_tool(&pid, command, false, NULL, NULL);
+       T_EXPECT_EQ(launch_tool_ret, 0, "launch tool: %s", command[0]);
+       if(launch_tool_ret != 0) {
+               return 1;
+       }
+
+       waitpid_ret = dt_waitpid(pid, &status, &signal, timeout);
+       T_EXPECT_TRUE(waitpid_ret, "%s should succeed", command[0]);
+       if(waitpid_ret == false) {
+               if(status != 0) {
+                       T_LOG("%s exited %d", command[0], status);
+               }
+               if(signal != 0) {
+                       T_LOG("%s received signal %d", command[0], signal);
+               }
+               return 1;
+       }
+
+       return 0;
+}
+
+char*
+dp_class_num_to_string(int num) {
+       switch(num) {
+               case 0:
+                       return "unclassed";
+               case PROTECTION_CLASS_A:
+                       return "class A";
+               case PROTECTION_CLASS_B:
+                       return "class B";
+               case PROTECTION_CLASS_C:
+                       return "class C";
+               case PROTECTION_CLASS_D:
+                       return "class D";
+               case PROTECTION_CLASS_E:
+                       return "class E";
+               case PROTECTION_CLASS_F:
+                       return "class F";
+               default:
+                       return "<unknown class>";
+       }
+}
+
+#if 0
+int device_lock_state(void) {
+       /*
+        * TODO: Actually implement this.
+        *
+        * We fail if a passcode already exists, and the methods being used
+        * to lock/unlock the device in this test appear to be synchronous…
+        * do we need this function?
+        */
+       int result = -1;
+
+       return(result);
+}
+
+/* Determines if we will try to test class C semantics. */
+int unlocked_since_boot() {
+       /*
+        * TODO: Actually implement this.
+        *
+        * The actual semantics for CP mean that even with this primitive,
+        * we would need to set a passcode and then reboot the device in
+        * order to test this; this function will probably be rather
+        * worthless as a result.
+        */
+       int result = 1;
+
+       return(result);
+}
+#endif
+
diff --git a/tests/disk_mount_conditioner-entitlements.plist b/tests/disk_mount_conditioner-entitlements.plist
new file mode 100644 (file)
index 0000000..95d2141
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.dmc.set</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/disk_mount_conditioner.c b/tests/disk_mount_conditioner.c
new file mode 100644 (file)
index 0000000..fc3db9f
--- /dev/null
@@ -0,0 +1,515 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <System/sys/fsctl.h>
+#include <paths.h>
+
+static char *mktempdir(void);
+static char *mktempmount(void);
+
+#ifndef TEST_UNENTITLED
+static int system_legal(const char *command);
+static char *mkramdisk(void);
+static uint64_t time_for_read(int fd, const char *expected);
+static void perf_setup(char **path, int *fd);
+
+#define READSIZE 1024L
+#endif /* !TEST_UNENTITLED */
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vfs.dmc"),
+       T_META_ASROOT(true)
+);
+
+#pragma mark Entitled Tests
+
+#ifndef TEST_UNENTITLED
+T_DECL(fsctl_get_uninitialized,
+       "Initial fsctl.get should return zeros",
+       T_META_ASROOT(false))
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info = {0};
+       disk_conditioner_info expected_info = {0};
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+       T_SETUPEND;
+
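+       /* Pre-set a couple of fields so the memcmp below proves the GET actually zeroes the struct. */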
+       info.enabled = true;
+       info.is_ssd = true;
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)");
+
+       err = memcmp(&info, &expected_info, sizeof(info));
+       T_ASSERT_EQ_INT(0, err, "initial DMC info is zeroed");
+}
+
+T_DECL(fsctl_set,
+       "fsctl.set should succeed and fsctl.get should verify")
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info = {0};
+       disk_conditioner_info expected_info = {0};
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       info.enabled = 1;
+       info.access_time_usec = 10;
+       info.read_throughput_mbps = 40;
+       info.write_throughput_mbps = 40;
+       info.is_ssd = 0;
+       info.ioqueue_depth = 8;
+       info.maxreadcnt = 8;
+       info.maxwritecnt = 8;
+       info.segreadcnt = 8;
+       info.segwritecnt = 8;
+       expected_info = info;
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET) after SET");
+
+       err = memcmp(&info, &expected_info, sizeof(info));
+       T_ASSERT_EQ_INT(0, err, "fsctl.get is the info configured by fsctl.set");
+}
+
+static void
+verify_mount_fallback_values(const char *mount_path, disk_conditioner_info *info)
+{
+       int err;
+       disk_conditioner_info newinfo = {0};
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &newinfo, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET) after SET");
+
+       // without querying the drive for the expected values, the best we can do is
+       // assert that they are greater than zero (zero is impossible) and below UINT32_MAX (which is unlikely)
+       T_ASSERT_GT(newinfo.ioqueue_depth, 0u, "ioqueue_depth is the value from the mount");
+       T_ASSERT_GT(newinfo.maxreadcnt, 0u, "maxreadcnt is value from the mount");
+       T_ASSERT_GT(newinfo.maxwritecnt, 0u, "maxwritecnt is value from the mount");
+       T_ASSERT_GT(newinfo.segreadcnt, 0u, "segreadcnt is value from the mount");
+       T_ASSERT_GT(newinfo.segwritecnt, 0u, "segwritecnt is value from the mount");
+       T_ASSERT_LT(newinfo.ioqueue_depth, UINT32_MAX, "ioqueue_depth is the value from the mount");
+       T_ASSERT_LT(newinfo.maxreadcnt, UINT32_MAX, "maxreadcnt is value from the mount");
+       T_ASSERT_LT(newinfo.maxwritecnt, UINT32_MAX, "maxwritecnt is value from the mount");
+       T_ASSERT_LT(newinfo.segreadcnt, UINT32_MAX, "segreadcnt is value from the mount");
+       T_ASSERT_LT(newinfo.segwritecnt, UINT32_MAX, "segwritecnt is value from the mount");
+}
+
+T_DECL(fsctl_set_zero,
+       "fsctl.set zero values should fall back to original mount settings")
+{
+       char *mount_path;
+       disk_conditioner_info info = {0};
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+
+       info.enabled = 1;
+       /* everything else is 0 */
+
+       T_SETUPEND;
+
+       verify_mount_fallback_values(mount_path, &info);
+}
+
+T_DECL(fsctl_set_out_of_bounds,
+       "fsctl.set out-of-bounds values should fall back to original mount settings")
+{
+       char *mount_path;
+       disk_conditioner_info info;
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+
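+       /* memset stores only the low byte (0xff), so every 32-bit field ends up as UINT32_MAX */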
+       memset(&info, UINT32_MAX, sizeof(info));
+       info.enabled = 1;
+       info.access_time_usec = 0;
+       info.read_throughput_mbps = 0;
+       info.write_throughput_mbps = 0;
+       /* everything else is UINT32_MAX */
+
+       T_SETUPEND;
+
+       verify_mount_fallback_values(mount_path, &info);
+}
+
+T_DECL(fsctl_restore_mount_fields,
+       "fsctl.set should restore fields on mount_t that it temporarily overrides")
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info;
+       disk_conditioner_info mount_fields;
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       /* first set out-of-bounds values to retrieve the original mount_t fields */
+       memset(&info, UINT32_MAX, sizeof(info));
+       info.enabled = 1;
+       info.access_time_usec = 0;
+       info.read_throughput_mbps = 0;
+       info.write_throughput_mbps = 0;
+       /* everything else is UINT32_MAX */
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &mount_fields, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)");
+
+       /* now turn off the disk conditioner which should restore fields on the mount_t */
+       memset(&info, 1, sizeof(info));
+       info.enabled = 0;
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
+
+       /* and finally set out-of-bounds values again to retrieve the new mount_t fields which should not have changed */
+       memset(&info, UINT32_MAX, sizeof(info));
+       info.enabled = 0;
+       info.access_time_usec = 0;
+       info.read_throughput_mbps = 0;
+       info.write_throughput_mbps = 0;
+       /* everything else is UINT32_MAX */
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)");
+
+       T_ASSERT_EQ(info.maxreadcnt, mount_fields.maxreadcnt, "mount_t maxreadcnt restored");
+       T_ASSERT_EQ(info.maxwritecnt, mount_fields.maxwritecnt, "mount_t maxwritecnt restored");
+       T_ASSERT_EQ(info.segreadcnt, mount_fields.segreadcnt, "mount_t segreadcnt restored");
+       T_ASSERT_EQ(info.segwritecnt, mount_fields.segwritecnt, "mount_t segwritecnt restored");
+       T_ASSERT_EQ(info.ioqueue_depth, mount_fields.ioqueue_depth, "mount_t ioqueue_depth restored");
+}
+
+T_DECL(fsctl_get_nonroot,
+       "fsctl.get should not require root",
+       T_META_ASROOT(false))
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info;
+
+       T_SETUPBEGIN;
+       // make sure we're not root
+       if (0 == geteuid()) {
+               seteuid(5000);
+       }
+
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl.get without root");
+}
+
+T_DECL(fsctl_set_nonroot,
+       "fsctl.set should require root",
+       T_META_ASROOT(false))
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info = {0};
+       disk_conditioner_info expected_info = {0};
+
+       T_SETUPBEGIN;
+       // make sure we're not root
+       if (0 == geteuid()) {
+               seteuid(5000);
+       }
+
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       // save original info
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &expected_info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "Get original DMC info");
+
+       info.enabled = 1;
+       info.access_time_usec = 10;
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_NE_INT(0, err, "fsctl.set returns error without root");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl.get after nonroot fsctl.set");
+
+       err = memcmp(&info, &expected_info, sizeof(info));
+       T_ASSERT_EQ_INT(0, err, "fsctl.set should not change info without root");
+}
+
+T_DECL(fsctl_delays,
+       "Validate I/O delays when DMC is enabled")
+{
+       char *path;
+       int fd;
+       int err;
+       uint64_t elapsed_nsec, expected_nsec;
+       disk_conditioner_info info = {0};
+       char buf[READSIZE];
+
+       T_SETUPBEGIN;
+       perf_setup(&path, &fd);
+       memset(buf, 0xFF, sizeof(buf));
+       T_ASSERT_EQ_LONG((long)sizeof(buf), write(fd, buf, sizeof(buf)), "write random data to temp file");
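+       /* Push the written data all the way to the device before timing uncached reads. */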
+       fcntl(fd, F_FULLFSYNC);
+       T_SETUPEND;
+
+       expected_nsec = NSEC_PER_SEC / 2;
+
+       // measure delay before setting parameters (should be none)
+       elapsed_nsec = time_for_read(fd, buf);
+       T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "DMC disabled read(%ld) from %s is reasonably fast", READSIZE, path);
+
+       // measure delay after setting parameters
+       info.enabled = 1;
+       info.access_time_usec = expected_nsec / NSEC_PER_USEC;
+       info.read_throughput_mbps = 40;
+       info.write_throughput_mbps = 40;
+       info.is_ssd = 1; // is_ssd will ensure we get constant access_time delays rather than scaled
+       err = fsctl(path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET) delay");
+
+       elapsed_nsec = time_for_read(fd, buf);
+       T_ASSERT_GT_ULLONG(elapsed_nsec, expected_nsec, "DMC enabled read(%ld) from %s is at least the expected delay", READSIZE, path);
+       T_ASSERT_LT_ULLONG(elapsed_nsec, 2 * expected_nsec, "DMC enabled read(%ld) from %s is no more than twice the expected delay", READSIZE, path);
+
+       // measure delay after resetting parameters (should be none)
+       info.enabled = 0;
+       err = fsctl(path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET) reset delay");
+
+       usleep(USEC_PER_SEC / 2); // might still be other I/O inflight
+       elapsed_nsec = time_for_read(fd, buf);
+       T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "After disabling DMC read(%ld) from %s is reasonably fast", READSIZE, path);
+}
+
+#else /* TEST_UNENTITLED */
+
+#pragma mark Unentitled Tests
+
+T_DECL(fsctl_get_unentitled,
+       "fsctl.get should not require entitlement")
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info;
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl.get without entitlement");
+}
+
+T_DECL(fsctl_set_unentitled,
+       "fsctl.set should require entitlement")
+{
+       int err;
+       char *mount_path;
+       disk_conditioner_info info = {0};
+       disk_conditioner_info expected_info = {0};
+
+       T_SETUPBEGIN;
+       mount_path = mktempmount();
+       T_SETUPEND;
+
+       // save original info
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &expected_info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "Get original DMC info");
+
+       info.enabled = 1;
+       info.access_time_usec = 10;
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_NE_INT(0, err, "fsctl.set returns error without entitlement");
+
+       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, err, "fsctl.get after unentitled fsctl.set");
+
+       err = memcmp(&info, &expected_info, sizeof(info));
+       T_ASSERT_EQ_INT(0, err, "fsctl.set should not change info without entitlement");
+}
+
+#endif /* TEST_UNENTITLED */
+
+#pragma mark Helpers
+
+static char *mktempdir(void) {
+       char *path = malloc(PATH_MAX);
+       strcpy(path, "/tmp/dmc.XXXXXXXX");
+       atexit_b(^{ free(path); });
+
+       // create a temporary directory to use as the mount point for the fsctl tests
+       T_WITH_ERRNO;
+       T_ASSERT_NOTNULL(mkdtemp(path), "Create temporary directory");
+       atexit_b(^{ remove(path); });
+
+       return path;
+}
+
+/*
+ * Return the path to a temporary mount with no usable
+ * filesystem, but one that can still be configured by
+ * the disk conditioner.
+ *
+ * Faster than creating a ram disk when access to the
+ * filesystem is not necessary.
+ */
+static char *mktempmount(void) {
+       char *mount_path = mktempdir();
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(0, mount("devfs", mount_path, MNT_RDONLY, NULL), "Create temporary devfs mount");
+       atexit_b(^{ unmount(mount_path, MNT_FORCE); });
+
+       return mount_path;
+}
+
+#ifndef TEST_UNENTITLED
+
+/*
+ * Wrapper around dt_launch_tool/dt_waitpid
+ * that works like libc's system()
+ */
+static int system_legal(const char *command) {
+       pid_t pid = -1;
+       int exit_status = 0;
+       const char *argv[] = {
+               _PATH_BSHELL,
+               "-c",
+               command,
+               NULL
+       };
+
+       int rc = dt_launch_tool(&pid, (char **)(void *)argv, false, NULL, NULL);
+       if (rc != 0) {
+               return -1;
+       }
+       if (!dt_waitpid(pid, &exit_status, NULL, 30)) {
+               if (exit_status != 0) {
+                       return exit_status;
+               }
+               return -1;
+       }
+
+       return exit_status;
+}
+
+/*
+ * Return the path to a temporary mount
+ * that contains a usable HFS+ filesystem
+ * mounted via a ram disk
+ */
+static char *mkramdisk(void) {
+       char cmd[1024];
+       char *mount_path = mktempdir();
+       char *dev_disk_file = malloc(256);
+       atexit_b(^{ free(dev_disk_file); });
+       strcpy(dev_disk_file, "/tmp/dmc.ramdisk.XXXXXXXX");
+
+       T_WITH_ERRNO;
+       T_ASSERT_NOTNULL(mktemp(dev_disk_file), "Create temporary file to store dev disk for ramdisk");
+       atexit_b(^{ remove(dev_disk_file); });
+
+       // create the RAM disk device
+       snprintf(cmd, sizeof(cmd), "hdik -nomount ram://10000 > %s", dev_disk_file);
+       T_ASSERT_EQ_INT(0, system_legal(cmd), "Create ramdisk");
+
+       atexit_b(^{
+               char eject_cmd[1024];
+               unmount(mount_path, MNT_FORCE);
+               snprintf(eject_cmd, sizeof(eject_cmd), "hdik -e `cat %s`", dev_disk_file);
+               system_legal(eject_cmd);
+               remove(dev_disk_file);
+       });
+
+       // initialize as an HFS volume
+       snprintf(cmd, sizeof(cmd), "newfs_hfs `cat %s`", dev_disk_file);
+       T_ASSERT_EQ_INT(0, system_legal(cmd), "Initialize ramdisk as HFS");
+
+       // mount it
+       snprintf(cmd, sizeof(cmd), "mount -t hfs `cat %s` %s", dev_disk_file, mount_path);
+       T_ASSERT_EQ_INT(0, system_legal(cmd), "Mount ramdisk");
+
+       return mount_path;
+}
+
+static uint64_t time_for_read(int fd, const char *expected) {
+       int err;
+       ssize_t ret;
+       char buf[READSIZE];
+       uint64_t start, stop;
+
+       bzero(buf, sizeof(buf));
+       lseek(fd, 0, SEEK_SET);
+
+       start = dt_nanoseconds();
+       ret = read(fd, buf, READSIZE);
+       stop = dt_nanoseconds();
+
+       T_ASSERT_GE_LONG(ret, 0L, "read from temporary file");
+       T_ASSERT_EQ_LONG(ret, READSIZE, "read %ld bytes from temporary file", READSIZE);
+       err = memcmp(buf, expected, sizeof(buf));
+       T_ASSERT_EQ_INT(0, err, "read expected contents from temporary file");
+
+       return (stop - start);
+}
+
+static void perf_setup(char **path, int *fd) {
+       int temp_fd;
+       char *temp_path;
+
+       char *mount_path = mkramdisk();
+       temp_path = *path = malloc(PATH_MAX);
+       snprintf(temp_path, PATH_MAX, "%s/dmc.XXXXXXXX", mount_path);
+       atexit_b(^{ free(temp_path); });
+
+       T_ASSERT_NOTNULL(mktemp(temp_path), "Create temporary file");
+       atexit_b(^{ remove(temp_path); });
+
+       temp_fd = *fd = open(temp_path, O_RDWR | O_CREAT);
+       T_WITH_ERRNO;
+       T_ASSERT_GE_INT(temp_fd, 0, "Open temporary file for read/write");
+       atexit_b(^{ close(temp_fd); });
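+       /* Bypass the buffer cache so the timed reads actually hit the (conditioned) device. */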
+       fcntl(temp_fd, F_NOCACHE, 1);
+}
+#endif /* !TEST_UNENTITLED */
diff --git a/tests/drop_priv.c b/tests/drop_priv.c
new file mode 100644 (file)
index 0000000..7bb499c
--- /dev/null
@@ -0,0 +1,59 @@
+#include <darwintest.h>
+
+#include <TargetConditionals.h>
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/errno.h>
+#include <unistd.h>
+
+#if !TARGET_OS_OSX
+#include <pwd.h>
+#include <sys/types.h>
+#include <uuid/uuid.h>
+#endif
+
+#if TARGET_OS_OSX
+#define INVOKER_UID "SUDO_UID"
+#define INVOKER_GID "SUDO_GID"
+#define ID_MAX (unsigned long)UINT_MAX
+static unsigned
+_get_sudo_invoker(const char *var)
+{
+    char *value_str = getenv(var);
+    T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(value_str,
+            "Not running under sudo, getenv(\"%s\") failed", var);
+    T_QUIET; T_ASSERT_NE_CHAR(*value_str, '\0',
+            "getenv(\"%s\") returned an empty string", var);
+
+    char *endp;
+    unsigned long value = strtoul(value_str, &endp, 10);
+    T_QUIET; T_WITH_ERRNO; T_ASSERT_EQ_CHAR(*endp, '\0',
+            "strtoul(\"%s\") not called on a valid number", value_str);
+    T_QUIET; T_WITH_ERRNO; T_ASSERT_NE_ULONG(value, ULONG_MAX,
+            "strtoul(\"%s\") overflow", value_str);
+
+    T_QUIET; T_ASSERT_NE_ULONG(value, 0ul, "%s invalid", var);
+    T_QUIET; T_ASSERT_LT_ULONG(value, ID_MAX, "%s invalid", var);
+    return (unsigned)value;
+}
+#endif /* TARGET_OS_OSX */
+
+void
+drop_priv(void);
+void
+drop_priv(void)
+{
+#if TARGET_OS_OSX
+    uid_t lower_uid = _get_sudo_invoker(INVOKER_UID);
+    gid_t lower_gid = _get_sudo_invoker(INVOKER_GID);
+#else
+    struct passwd *pw = getpwnam("mobile");
+    T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(pw, "getpwnam(\"mobile\")");
+    uid_t lower_uid = pw->pw_uid;
+    gid_t lower_gid = pw->pw_gid;
+#endif
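+    /* Drop the group first: once the uid is no longer root, setgid() to an arbitrary group would fail. */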
+    T_ASSERT_POSIX_SUCCESS(setgid(lower_gid), "Change group to %u", lower_gid);
+    T_ASSERT_POSIX_SUCCESS(setuid(lower_uid), "Change user to %u", lower_uid);
+}
diff --git a/tests/exc_resource_threads.c b/tests/exc_resource_threads.c
new file mode 100644 (file)
index 0000000..4b247c6
--- /dev/null
@@ -0,0 +1,175 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <stdlib.h>
+#include <spawn.h>
+#include <spawn_private.h>
+
+#include <mach-o/dyld.h>
+#include <mach/mach.h>
+#include <mach/task.h>
+
+#include <signal.h>
+#include <sys/sysctl.h>
+#include <sys/syslimits.h>
+
+#include <excserver.h>
+
+static dispatch_semaphore_t sync_sema;
+
+kern_return_t
+catch_mach_exception_raise(mach_port_t exception_port,
+                           mach_port_t thread,
+                           mach_port_t task,
+                           exception_type_t exception,
+                           mach_exception_data_t code,
+                           mach_msg_type_number_t code_count)
+{
+#pragma unused(exception_port, thread, task, code, code_count)
+       pid_t pid;
+       pid_for_task(task, &pid);
+       T_ASSERT_EQ(exception, EXC_CORPSE_NOTIFY, "exception type");
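+       /* The helper keeps running after the corpse notification is delivered, so terminate it explicitly. */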
+       T_ASSERT_POSIX_ZERO(kill(pid, SIGKILL), "kill");
+       dispatch_semaphore_signal(sync_sema);
+       return KERN_SUCCESS;
+}
+
+kern_return_t
+catch_mach_exception_raise_state(mach_port_t exception_port,
+                                 exception_type_t exception,
+                                 const mach_exception_data_t code,
+                                 mach_msg_type_number_t code_count,
+                                 int * flavor,
+                                 const thread_state_t old_state,
+                                 mach_msg_type_number_t old_state_count,
+                                 thread_state_t new_state,
+                                 mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state");
+       return KERN_NOT_SUPPORTED;
+}
+
+kern_return_t
+catch_mach_exception_raise_state_identity(mach_port_t exception_port,
+                                          mach_port_t thread,
+                                          mach_port_t task,
+                                          exception_type_t exception,
+                                          mach_exception_data_t code,
+                                          mach_msg_type_number_t code_count,
+                                          int * flavor,
+                                          thread_state_t old_state,
+                                          mach_msg_type_number_t old_state_count,
+                                          thread_state_t new_state,
+                                          mach_msg_type_number_t * new_state_count)
+{
+#pragma unused(exception_port, thread, task, exception, code, code_count, flavor, old_state, old_state_count, new_state, new_state_count)
+       T_FAIL("Unsupported catch_mach_exception_raise_state_identity");
+       return KERN_NOT_SUPPORTED;
+}
+
+
+/*
+ * Set up an exception handling port for EXC_CORPSE_NOTIFY and
+ * run mach_msg_server once to receive exception messages from the kernel.
+ */
+static void *
+exc_handler(void * arg)
+{
+#pragma unused(arg)
+       kern_return_t kret;
+       mach_port_t exception_port;
+
+       kret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &exception_port);
+       if (kret != KERN_SUCCESS)
+               T_FAIL("mach_port_allocate: %s (%d)", mach_error_string(kret), kret);
+
+       kret = mach_port_insert_right(mach_task_self(), exception_port, exception_port, MACH_MSG_TYPE_MAKE_SEND);
+       if (kret != KERN_SUCCESS)
+               T_FAIL("mach_port_insert_right: %s (%d)", mach_error_string(kret), kret);
+
+       kret = task_set_exception_ports(mach_task_self(), EXC_MASK_CRASH | EXC_MASK_CORPSE_NOTIFY, exception_port,
+                                       (exception_behavior_t)(EXCEPTION_DEFAULT | MACH_EXCEPTION_CODES), 0);
+       if (kret != KERN_SUCCESS)
+               T_FAIL("task_set_exception_ports: %s (%d)", mach_error_string(kret), kret);
+
+       dispatch_semaphore_signal(sync_sema);
+
+       kret = mach_msg_server(mach_exc_server, MACH_MSG_SIZE_RELIABLE, exception_port, 0);
+       if (kret != KERN_SUCCESS)
+               T_FAIL("mach_msg_server: %s (%d)", mach_error_string(kret), kret);
+
+       return NULL;
+}
+
+static void*
+dummy_thread(void *arg) {
+#pragma unused(arg)
+       while (1) {
+               sleep(60);
+       }
+}
+
+#define THREAD_LIMIT 2
+
+T_HELPER_DECL(exc_resource_helper, "exc_resource helper")
+{
+       pthread_t tid;
+       for (int i = 0; i < THREAD_LIMIT; i++) {
+               T_QUIET;
+               T_EXPECT_POSIX_SUCCESS(pthread_create(&tid, NULL, dummy_thread, NULL), "pthread_create");
+       }
+       while (1) {
+               sleep(60);
+       }
+}
+
+static void
+check_exc_resource_threads_enabled()
+{
+       int err;
+       int enabled;
+       size_t enabled_size = sizeof(enabled);
+       err = sysctlbyname("kern.exc_resource_threads_enabled", &enabled, &enabled_size, NULL, 0);
+
+       if (err || !enabled)
+               T_SKIP("EXC_RESOURCE RESOURCE_TYPE_THREADS not enabled on this system");
+
+}
+
+T_DECL(exc_resource_threads, "Ensures that a process with a thread_limit set will receive an exc_resource when it crosses its thread limit",
+       T_META_ASROOT(true),
+       T_META_CHECK_LEAKS(false))
+{
+       pthread_t handle_thread;
+
+       check_exc_resource_threads_enabled();
+
+       sync_sema = dispatch_semaphore_create(0);
+
+       T_ASSERT_POSIX_ZERO(pthread_create(&handle_thread, NULL, exc_handler, NULL), "pthread_create");
+       dispatch_semaphore_wait(sync_sema, DISPATCH_TIME_FOREVER);
+
+       pid_t helper_pid;
+       char path[PATH_MAX];
+       uint32_t path_size = sizeof(path);
+
+       T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+
+       char *args[] = { path, "-n", "exc_resource_helper", NULL };
+
+       posix_spawnattr_t attr;
+       T_ASSERT_POSIX_ZERO(posix_spawnattr_init(&attr), "posix_spawnattr_init");
+
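+       /* The helper creates THREAD_LIMIT extra threads on top of its main thread, so this limit will be exceeded. */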
+       T_EXPECT_POSIX_ZERO(posix_spawnattr_set_threadlimit_ext(&attr, THREAD_LIMIT), "posix_spawnattr_set_threadlimit_ext");
+
+       T_EXPECT_POSIX_ZERO(posix_spawn(&helper_pid, args[0], NULL, &attr, args, NULL), "posix_spawn");
+
+       T_ASSERT_POSIX_ZERO(posix_spawnattr_destroy(&attr), "posix_spawnattr_destroy");
+
+       dispatch_semaphore_wait(sync_sema, DISPATCH_TIME_FOREVER);
+}
diff --git a/tests/excserver.defs b/tests/excserver.defs
new file mode 100644 (file)
index 0000000..e528df4
--- /dev/null
@@ -0,0 +1 @@
+#include <mach/mach_exc.defs>
diff --git a/tests/freebsd_waitpid_nohang.c b/tests/freebsd_waitpid_nohang.c
new file mode 100644 (file)
index 0000000..9aa55e1
--- /dev/null
@@ -0,0 +1,62 @@
+/*-
+ * Copyright (c) 2016 Jilles Tjoelker
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/wait.h>
+
+#include <darwintest.h>
+#include <signal.h>
+#include <unistd.h>
+
+T_DECL(waitpid_nohang, "FreeBSDarwin--waitpid_nohang")
+{
+       pid_t child, pid;
+       int status, r;
+       siginfo_t siginfo;
+
+       child = fork();
+       T_ASSERT_POSIX_SUCCESS(child, "child forked successfully");
+       if (child == 0) {
+               sleep(10);
+               _exit(1);
+       }
+
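+       /* While the child is still running, WNOHANG should return 0 and leave status untouched. */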
+       status = 42;
+       pid = waitpid(child, &status, WNOHANG);
+       T_ASSERT_POSIX_ZERO(pid, "waitpid call is successful");
+       T_EXPECT_EQ(status, 42, "status is unaffected as expected");
+
+       r = kill(child, SIGTERM);
+       T_ASSERT_POSIX_ZERO(r, "signal sent successfully");
+       r = waitid(P_PID, (id_t)child, &siginfo, WEXITED | WNOWAIT);
+       T_ASSERT_POSIX_SUCCESS(r, "waitid call successful");
+
+       status = -1;
+       pid = waitpid(child, &status, WNOHANG);
+       T_ASSERT_EQ(pid, child, "waitpid returns correct pid");
+       T_EXPECT_EQ(WIFSIGNALED(status), true, "child was signaled"); 
+       T_EXPECT_EQ(WTERMSIG(status), SIGTERM, "child was sent SIGTERM");
+}
diff --git a/tests/gettimeofday.c b/tests/gettimeofday.c
new file mode 100644 (file)
index 0000000..e2b8c3a
--- /dev/null
@@ -0,0 +1,50 @@
+#include <unistd.h>
+#include <sys/time.h>
+#include <mach/mach_time.h>
+
+#include <darwintest.h>
+
+extern int __gettimeofday(struct timeval *, struct timezone *);
+
+T_DECL(gettimeofday, "gettimeofday()",
+          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       struct timeval tv_a, tv_b, tv_c;
+
+       T_ASSERT_POSIX_ZERO(gettimeofday(&tv_a, NULL), NULL);
+       T_ASSERT_GT(tv_a.tv_sec, 0L, NULL);
+
+       sleep(1);
+
+       T_ASSERT_POSIX_ZERO(__gettimeofday(&tv_b, NULL), NULL);
+       T_ASSERT_GE(tv_b.tv_sec, tv_a.tv_sec, NULL);
+
+       sleep(1);
+
+       T_ASSERT_POSIX_ZERO(gettimeofday(&tv_c, NULL), NULL);
+       T_ASSERT_GE(tv_c.tv_sec, tv_b.tv_sec, NULL);
+}
+
+#if 0 // This symbol isn't exported so we can't test with stock libsyscall
+extern int __gettimeofday_with_mach(struct timeval *, struct timezone *, uint64_t *mach_time);
+
+T_DECL(gettimeofday_with_mach, "gettimeofday_with_mach()",
+          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
+{
+       struct timeval gtod_ts;
+
+       uint64_t mach_time_before, mach_time, mach_time_after;
+
+       mach_time_before = mach_absolute_time();
+
+       T_ASSERT_POSIX_ZERO(__gettimeofday_with_mach(&gtod_ts, NULL, &mach_time), NULL);
+       T_ASSERT_GT(gtod_ts.tv_sec, 0L, NULL);
+
+       mach_time_after = mach_absolute_time();
+
+       T_LOG("%llx > %llx > %llx", mach_time_before, mach_time, mach_time_after);
+
+       T_ASSERT_LT(mach_time_before, mach_time, NULL);
+       T_ASSERT_GT(mach_time_after, mach_time, NULL);
+}
+#endif // 0
diff --git a/tests/gettimeofday_29192647.c b/tests/gettimeofday_29192647.c
new file mode 100644 (file)
index 0000000..f580c2f
--- /dev/null
@@ -0,0 +1,49 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <mach/mach_time.h>
+#include <sys/time.h>
+
+#include <darwintest.h>
+#include <darwintest_perf.h>
+
+T_GLOBAL_META(T_META_TAG_PERF);
+
+T_DECL(gettimeofday_tl, "gettimeofday performance in tight loop") {
+       {
+               struct timeval time;
+               dt_stat_time_t s = dt_stat_time_create("gettimeofday tight loop");
+               T_STAT_MEASURE_LOOP(s){
+                       gettimeofday(&time, NULL);
+               }
+               dt_stat_finalize(s);
+       }
+}
+
+extern int __gettimeofday(struct timeval *, struct timezone *);
+T_DECL(__gettimeofday_tl, "__gettimeofday performance in tight loop") {
+       {
+               struct timeval time;
+
+               dt_stat_time_t s = dt_stat_time_create("__gettimeofday tight loop");
+               T_STAT_MEASURE_LOOP(s){
+                       __gettimeofday(&time, NULL);
+               }
+               dt_stat_finalize(s);
+       }
+}
+
+T_DECL(gettimeofday_sl, "gettimeofday performance in loop with sleep") {
+       {
+               struct timeval time;
+               dt_stat_time_t s = dt_stat_time_create("gettimeofday loop with sleep");
+               while (!dt_stat_stable(s)) {
+                       T_STAT_MEASURE_BATCH(s){
+                               gettimeofday(&time, NULL);
+                       }
+                       sleep(1);
+               }
+               dt_stat_finalize(s);
+       }
+}
diff --git a/tests/host_notifications.c b/tests/host_notifications.c
new file mode 100644 (file)
index 0000000..c4463b3
--- /dev/null
@@ -0,0 +1,49 @@
+#include <sys/time.h>
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+
+#include <darwintest.h>
+
+static void do_test(int notify_type, void (^trigger_block)(void)){
+       mach_port_t port;
+       T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), NULL);
+
+       T_ASSERT_MACH_SUCCESS(host_request_notification(mach_host_self(), notify_type, port), NULL);
+
+       trigger_block();
+
+       struct {
+               mach_msg_header_t hdr;
+               mach_msg_trailer_t trailer;
+       } message = { .hdr = {
+               .msgh_bits = 0,
+               .msgh_size = sizeof(mach_msg_header_t),
+               .msgh_remote_port = MACH_PORT_NULL,
+               .msgh_local_port = port,
+               .msgh_voucher_port = MACH_PORT_NULL,
+               .msgh_id = 0,
+       }};
+
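+       /* The receive buffer is deliberately too small, so a delivered notification is reported as MACH_RCV_TOO_LARGE. */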
+       T_ASSERT_EQ(MACH_RCV_TOO_LARGE, mach_msg_receive(&message.hdr), NULL);
+       mach_msg_destroy(&message.hdr);
+}
+
+T_DECL(host_notify_calendar_change, "host_request_notification(HOST_NOTIFY_CALENDAR_CHANGE)", T_META_CHECK_LEAKS(false), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       do_test(HOST_NOTIFY_CALENDAR_CHANGE, ^{
+               struct timeval tm;
+               if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){
+                       T_SKIP("Unable to settimeofday()");
+               }
+       });
+}
+
+T_DECL(host_notify_calendar_set, "host_request_notification(HOST_NOTIFY_CALENDAR_SET)", T_META_CHECK_LEAKS(false), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       do_test(HOST_NOTIFY_CALENDAR_SET, ^{
+               struct timeval tm;
+               if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){
+                       T_SKIP("Unable to settimeofday()");
+               }
+       });
+}
diff --git a/tests/host_statistics_rate_limiting.c b/tests/host_statistics_rate_limiting.c
new file mode 100644 (file)
index 0000000..8376db7
--- /dev/null
@@ -0,0 +1,191 @@
+#include <unistd.h>
+#include <stdint.h>
+#include <sys/time.h>
+#include <System/sys/codesign.h>
+#include <mach/mach_time.h>
+#include <mach/mach.h>
+#include <darwintest.h>
+#include <stdlib.h>
+
+#if !defined(CS_OPS_CLEARPLATFORM)
+#define CS_OPS_CLEARPLATFORM 13
+#endif
+
+#define WINDOW 1 /* seconds */
+#define MAX_ATTEMP_PER_SEC 10
+#define ITER 30
+#define RETRY 5
+
+static int
+remove_platform_binary(void){
+       int ret;
+       uint32_t my_csflags;
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(csops(getpid(), CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)), NULL);
+
+       if (!(my_csflags & CS_PLATFORM_BINARY)) {
+               return 0;
+       }
+
+       ret = csops(getpid(), CS_OPS_CLEARPLATFORM, NULL, 0);
+       if (ret) {
+               switch (errno) {
+               case ENOTSUP:
+                       T_LOG("clearing platform binary not supported, skipping test");
+                       return -1;
+               default:
+                       T_LOG("csops failed with flag CS_OPS_CLEARPLATFORM");
+                       return -1;
+               }
+       }
+
+       my_csflags = 0;
+       T_QUIET; T_ASSERT_POSIX_ZERO(csops(getpid(), CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)), NULL);
+
+       if (my_csflags & CS_PLATFORM_BINARY) {
+               T_LOG("platform binary flag still set");
+               return -1;
+       }
+
+       return 0;
+}
+
+struct all_host_info {
+       vm_statistics64_data_t host_vm_info64_rev0;
+       vm_statistics64_data_t host_vm_info64_rev1;
+       vm_extmod_statistics_data_t host_extmod_info64;
+       host_load_info_data_t host_load_info;
+       vm_statistics_data_t host_vm_info_rev0;
+       vm_statistics_data_t host_vm_info_rev1;
+       vm_statistics_data_t host_vm_info_rev2;
+       host_cpu_load_info_data_t host_cpu_load_info;
+       task_power_info_v2_data_t host_expired_task_info;
+       task_power_info_v2_data_t host_expired_task_info2;
+};
+
+static void
+check_host_info(struct all_host_info* data, unsigned long iter, char lett){
+       char* datap;
+       unsigned long i,j;
+
+       /* check that, for the shorter revisions, no data is copied into the bytes by which they differ from the longer revisions */
+       for ( j = 0 ; j < iter; j++) {
+               datap = (char*) &data[j].host_vm_info64_rev0;
+               for ( i = (HOST_VM_INFO64_REV0_COUNT * sizeof(int)); i< (HOST_VM_INFO64_REV1_COUNT * sizeof(int)); i++) {
+                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO64_REV0 byte %lu iter %lu", i, j);
+               }
+
+               datap = (char*) &data[j].host_vm_info_rev0;
+               for ( i = (HOST_VM_INFO_REV0_COUNT * sizeof(int)); i< (HOST_VM_INFO_REV2_COUNT * sizeof(int)); i++) {
+                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO_REV0 byte %lu iter %lu", i, j);
+               }
+
+               datap = (char*) &data[j].host_vm_info_rev1;
+               for ( i = (HOST_VM_INFO_REV1_COUNT * sizeof(int)); i< (HOST_VM_INFO_REV2_COUNT * sizeof(int)); i++) {
+                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO_REV1 byte %lu iter %lu", i, j);
+               }
+
+               datap = (char*) &data[j].host_expired_task_info;
+               for ( i = (TASK_POWER_INFO_COUNT * sizeof(int)); i< (TASK_POWER_INFO_V2_COUNT * sizeof(int)); i++) {
+                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "TASK_POWER_INFO_COUNT byte %lu iter %lu", i, j);
+               }
+       }
+       T_LOG("No data overflow");
+
+       datap = (char*) data;
+
+       /* check that all copies from attempt MAX_ATTEMP_PER_SEC onward are identical, i.e. served from the cache */
+       for ( i = 0 ; i < sizeof(struct all_host_info) ; i++ )
+               for ( j = MAX_ATTEMP_PER_SEC - 1 ; j < iter - 1; j++) {
+                       T_QUIET; T_ASSERT_EQ(datap[i+(j * sizeof(struct all_host_info))], datap[i+((j+1) * sizeof(struct all_host_info))], "all_host_info iter %lu does not match iter %lu", j, j+1);
+               }
+
+       T_LOG("Data was cached");
+}
+
+static void
+get_host_info(struct all_host_info* data, host_t self, int iter){
+       int i;
+       unsigned int count;
+       for (i = 0; i < iter; i++){
+               count = HOST_VM_INFO64_REV0_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev0, &count), NULL);
+               count = HOST_VM_INFO64_REV1_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev1, &count), NULL);
+               count = HOST_EXTMOD_INFO64_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_EXTMOD_INFO64, (host_info64_t)&data[i].host_extmod_info64, &count), NULL);
+               count = HOST_LOAD_INFO_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_LOAD_INFO, (host_info_t)&data[i].host_load_info, &count), NULL);
+               count = HOST_VM_INFO_REV0_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev0, &count), NULL);
+               count = HOST_VM_INFO_REV1_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev1, &count), NULL);
+               count = HOST_VM_INFO_REV2_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev2, &count), NULL);
+               count = HOST_CPU_LOAD_INFO_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_CPU_LOAD_INFO, (host_info_t)&data[i].host_cpu_load_info, &count), NULL);
+               count = TASK_POWER_INFO_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info, &count), NULL);
+               count = TASK_POWER_INFO_V2_COUNT;
+               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info2, &count), NULL);
+
+       }
+
+}
+
+T_DECL(test_host_statistics, "testing rate limit for host_statistics",
+          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
+{
+
+       unsigned long long start, end, window;
+       int retry = 0;
+       host_t self;
+       char lett = 'a';
+       struct all_host_info* data;
+       mach_timebase_info_data_t timebaseInfo = { 0, 0 };
+
+       if (remove_platform_binary())
+               T_SKIP("Failed to remove platform binary");
+
+       data = malloc(ITER * sizeof(struct all_host_info));
+       T_QUIET;T_ASSERT_NE(data, NULL, "malloc");
+
+       /* check the size of each data structure against the byte count implied by its COUNT constant */
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_vm_info64_rev0), HOST_VM_INFO64_COUNT * sizeof(int), "HOST_VM_INFO64_COUNT");
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_extmod_info64), HOST_EXTMOD_INFO64_COUNT * sizeof(int), "HOST_EXTMOD_INFO64_COUNT");
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_load_info), HOST_LOAD_INFO_COUNT * sizeof(int), "HOST_LOAD_INFO_COUNT");
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_vm_info_rev0), HOST_VM_INFO_COUNT * sizeof(int), "HOST_VM_INFO_COUNT");
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_cpu_load_info), HOST_CPU_LOAD_INFO_COUNT * sizeof(int), "HOST_CPU_LOAD_INFO_COUNT");
+       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_expired_task_info2), TASK_POWER_INFO_V2_COUNT * sizeof(int), "TASK_POWER_INFO_V2_COUNT");
+
+       /* check that the latest revision is the COUNT */
+       T_QUIET;T_ASSERT_EQ(HOST_VM_INFO64_REV1_COUNT, HOST_VM_INFO64_COUNT, "HOST_VM_INFO64_REV1_COUNT");
+        T_QUIET;T_ASSERT_EQ(HOST_VM_INFO_REV2_COUNT, HOST_VM_INFO_COUNT, "HOST_VM_INFO_REV2_COUNT");
+
+       /* check that the previous revision are smaller than the latest */
+       T_QUIET;T_ASSERT_LE(HOST_VM_INFO64_REV0_COUNT, HOST_VM_INFO64_REV1_COUNT, "HOST_VM_INFO64_REV0");
+        T_QUIET;T_ASSERT_LE(HOST_VM_INFO_REV0_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV0_COUNT");
+        T_QUIET;T_ASSERT_LE(HOST_VM_INFO_REV1_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV1_COUNT");
+        T_QUIET;T_ASSERT_LE(TASK_POWER_INFO_COUNT,TASK_POWER_INFO_V2_COUNT, "TASK_POWER_INFO_COUNT");
+
+       memset(data, lett, ITER * sizeof(struct all_host_info));
+       self = mach_host_self();
+
+       T_QUIET;T_ASSERT_EQ(mach_timebase_info(&timebaseInfo), KERN_SUCCESS, NULL);
+       window = (WINDOW * NSEC_PER_SEC * timebaseInfo.denom) / timebaseInfo.numer;
+       retry = 0;
+
+       /* try to get ITER copies of host_info within the window, so that we are certain to hit a cached copy */
+       do {
+               start = mach_continuous_time();
+               get_host_info(data, self, ITER);
+               end = mach_continuous_time();
+               retry++;
+       } while( (end - start > window) && retry <= RETRY);
+
+       if (retry <= RETRY)
+               check_host_info(data, ITER, lett);
+       else
+               T_SKIP("Failed to find window for test");
+}
+
diff --git a/tests/ioperf.c b/tests/ioperf.c
new file mode 100644 (file)
index 0000000..1eb2e8c
--- /dev/null
@@ -0,0 +1,256 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <errno.h>
+#include <err.h>
+#include <string.h>
+#include <assert.h>
+#include <sysexits.h>
+#include <getopt.h>
+#include <spawn.h>
+#include <stdbool.h>
+#include <sys/sysctl.h>
+#include <mach/mach_time.h>
+#include <mach/mach.h>
+#include <mach/semaphore.h>
+#include <TargetConditionals.h>
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <stdatomic.h>
+
+#define MAX_THREADS         32
+#define SPIN_SECS           6
+#define THR_SPINNER_PRI     63
+#define THR_MANAGER_PRI     62
+#define WARMUP_ITERATIONS   100
+#define FILE_SIZE           (16384 * 4096)
+#define IO_SIZE             4096
+#define IO_COUNT            2500
+
+static mach_timebase_info_data_t timebase_info;
+static semaphore_t semaphore;
+static semaphore_t worker_sem;
+static uint32_t g_numcpus;
+static _Atomic uint32_t keep_going = 1;
+int test_file_fd = 0;
+char *data_buf = NULL;
+extern char **environ;
+
+static struct {
+    pthread_t thread;
+} threads[MAX_THREADS];
+
+static uint64_t 
+nanos_to_abs(uint64_t nanos) 
+{ 
+    return nanos * timebase_info.denom / timebase_info.numer;
+}
+
+static void
+io_perf_test_io_init(void)
+{
+    int spawn_ret, pid;
+    char *const mount_args[] = {"/usr/local/sbin/mount_nand.sh", NULL};
+    spawn_ret = posix_spawn(&pid, mount_args[0], NULL, NULL, mount_args, environ);
+    if (spawn_ret < 0) {
+       T_SKIP("NAND mounting in LTE not possible on this device. Skipping test!");
+    }
+    waitpid(pid, &spawn_ret, 0);
+    if (WIFEXITED(spawn_ret) && !WEXITSTATUS(spawn_ret)) {
+        T_PASS("NAND mounted successfully");
+    } else {
+        T_SKIP("Unable to mount NAND. Skipping test!");
+    }
+
+    /* Mark the main thread as fixed priority */
+    struct sched_param param = {.sched_priority = THR_MANAGER_PRI};
+    T_ASSERT_POSIX_ZERO(pthread_setschedparam(pthread_self(), SCHED_FIFO, &param),
+            "pthread_setschedparam");
+
+    /* Set I/O Policy to Tier 0 */
+    T_ASSERT_POSIX_ZERO(setiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_PROCESS,
+            IOPOL_IMPORTANT), "setiopolicy");
+
+    /* Create data buffer */
+    data_buf = malloc(IO_SIZE * 16);
+    T_ASSERT_NOTNULL(data_buf, "Data buffer allocation");
+
+    int rndfd = open("/dev/urandom", O_RDONLY, S_IRUSR);
+    T_ASSERT_POSIX_SUCCESS(rndfd, "Open /dev/urandom");
+    T_ASSERT_GE_INT((int)read(rndfd, data_buf, IO_SIZE * 16), 0, "read /dev/urandom");
+    close(rndfd);
+
+    /* Create test file */
+    int fd = open("/mnt2/test", O_CREAT | O_WRONLY, S_IRUSR);
+    T_ASSERT_POSIX_SUCCESS(fd, "Open /mnt2/test for writing!");
+
+    T_ASSERT_POSIX_ZERO(fcntl(fd, F_NOCACHE, 1), "fcntl F_NOCACHE enable");
+    for (int size = 0; size < FILE_SIZE;) {
+        T_QUIET;
+        T_ASSERT_GE_INT((int)write(fd, data_buf, IO_SIZE * 16), 0, "write test file");
+        size += (IO_SIZE * 16);
+    }
+    close(fd);
+    sync();
+
+}
+
+static pthread_t
+create_thread(uint32_t thread_id, uint32_t priority, bool fixpri, 
+        void *(*start_routine)(void *))
+{
+    int rv;
+    pthread_t new_thread;
+    struct sched_param param = { .sched_priority = (int)priority };
+    pthread_attr_t attr;
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_init(&attr), "pthread_attr_init");
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_setschedparam(&attr, &param),
+            "pthread_attr_setschedparam");
+
+    if (fixpri) {
+        T_ASSERT_POSIX_ZERO(pthread_attr_setschedpolicy(&attr, SCHED_RR),
+                "pthread_attr_setschedpolicy");
+    }
+
+    T_ASSERT_POSIX_ZERO(pthread_create(&new_thread, &attr, start_routine,
+            (void*)(uintptr_t)thread_id), "pthread_create");
+
+    T_ASSERT_POSIX_ZERO(pthread_attr_destroy(&attr), "pthread_attr_destroy");
+
+    threads[thread_id].thread = new_thread;
+
+    return new_thread;
+}
+
+/* Spin until a specified number of seconds elapses */
+static void
+spin_for_duration(uint32_t seconds)
+{
+    uint64_t duration       = nanos_to_abs((uint64_t)seconds * NSEC_PER_SEC);
+    uint64_t current_time   = mach_absolute_time();
+    uint64_t timeout        = duration + current_time;
+
+    uint64_t spin_count = 0;
+
+    while (mach_absolute_time() < timeout && atomic_load_explicit(&keep_going,
+               memory_order_relaxed)) {
+        spin_count++;
+    }
+}
+
+static void *
+spin_thread(void *arg)
+{
+    uint32_t thread_id = (uint32_t)(uintptr_t)arg;
+    char name[30] = "";
+
+    snprintf(name, sizeof(name), "spin thread %2d", thread_id);
+    pthread_setname_np(name);
+    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem),
+            "semaphore_wait_signal");
+    spin_for_duration(SPIN_SECS);
+    return NULL;
+}
+
+void
+perform_io(dt_stat_time_t stat)
+{
+    /* Open the test data file */
+    int test_file_fd = open("/mnt2/test", O_RDONLY);
+    T_WITH_ERRNO;
+    T_ASSERT_POSIX_SUCCESS(test_file_fd, "Open test data file");
+
+    /* Disable caching and read-ahead for the file */
+    T_ASSERT_POSIX_ZERO(fcntl(test_file_fd, F_NOCACHE, 1), "fcntl F_NOCACHE enable");
+    T_ASSERT_POSIX_ZERO(fcntl(test_file_fd, F_RDAHEAD, 0), "fcntl F_RDAHEAD disable");
+
+    uint32_t count = 0;
+    int ret;
+
+    for (int i=0; i < WARMUP_ITERATIONS; i++) {
+        /* Warmup loop */
+        read(test_file_fd, data_buf, IO_SIZE);
+    }
+    
+    do {
+        T_STAT_MEASURE(stat) {
+            ret = read(test_file_fd, data_buf, IO_SIZE);
+        }
+        if (ret == 0) {
+            T_QUIET;
+            T_ASSERT_POSIX_SUCCESS(lseek(test_file_fd, 0, SEEK_SET), "lseek begin");
+        } else if (ret < 0) {
+            T_FAIL("read failure");
+            T_END;
+        }
+        count++;
+    } while(count < IO_COUNT);
+    close(test_file_fd);
+}
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.io"), T_META_TAG_PERF);
+
+/* Disable the test on MacOS for now */
+T_DECL(read_perf, "Sequential Uncached Read Performance", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO), T_META_ASROOT(YES), T_META_LTEPHASE(LTE_POSTINIT))
+{
+
+#if !CONFIG_EMBEDDED
+    T_SKIP("Not supported on MacOS");
+#endif /* !CONFIG_EMBEDDED */
+
+    io_perf_test_io_init();
+    pthread_setname_np("main thread");
+
+    T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebase_info), "mach_timebase_info");
+
+    dt_stat_time_t seq_noload = dt_stat_time_create("sequential read latency (CPU idle)");
+    perform_io(seq_noload);
+    dt_stat_finalize(seq_noload);
+
+    /* 
+     * We create spinner threads for this test so that all other cores are 
+     * busy. That way the I/O issue thread has to context switch to the 
+     * IOWorkLoop thread and back for the I/O. 
+     */
+    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &semaphore,
+            SYNC_POLICY_FIFO, 0), "semaphore_create");
+
+    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &worker_sem,
+            SYNC_POLICY_FIFO, 0), "semaphore_create");
+    
+    size_t ncpu_size = sizeof(g_numcpus);
+    T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &g_numcpus, &ncpu_size, NULL, 0),
+            "sysctlbyname(hw.ncpu)");
+
+    T_LOG("hw.ncpu: %d\n", g_numcpus);
+    uint32_t n_spinners = g_numcpus - 1;
+
+    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
+        threads[thread_id].thread = create_thread(thread_id, THR_SPINNER_PRI,
+                true, &spin_thread);
+    }
+
+    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
+        T_ASSERT_MACH_SUCCESS(semaphore_wait(worker_sem), "semaphore_wait");
+    }
+
+    T_ASSERT_MACH_SUCCESS(semaphore_signal_all(semaphore), "semaphore_signal_all");
+    
+    dt_stat_time_t seq_load = dt_stat_time_create("sequential read latency (Single CPU)");
+    perform_io(seq_load);
+    dt_stat_finalize(seq_load);
+    
+    atomic_store_explicit(&keep_going, 0, memory_order_relaxed);
+    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
+        T_ASSERT_POSIX_ZERO(pthread_join(threads[thread_id].thread, NULL),
+                "pthread_join %d", thread_id);
+    }
+}
diff --git a/tests/jumbo_va_spaces_28530648.c b/tests/jumbo_va_spaces_28530648.c
new file mode 100644 (file)
index 0000000..aa081f3
--- /dev/null
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+
+#define GB (1ULL * 1024 * 1024 * 1024)
+
+/*
+ * This test expects the entitlement to be the enabling factor for a process to
+ * allocate at least this many GB of VA space. i.e. with the entitlement, n GB
+ * must be allocatable; whereas without it, it must be less.
+ */
+#define ALLOC_TEST_GB 54
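+/*
+ * The probe below only reserves address space: PROT_NONE | MAP_ANON mappings
+ * consume VA but no physical pages, so attempting up to 2 * ALLOC_TEST_GB
+ * (108 GB) is cheap.  The loop stops at the first failure, which is expected
+ * to be ENOMEM at the edge of the usable VA space.
+ */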
+
+T_DECL(jumbo_va_spaces_28530648,
+       "Verify that the \"dynamic-codesigning\" entitlement is required to utilize an extra-large "
+       "VA space on arm64",
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false))
+{
+       int     i;
+       void    *res;
+
+       if (!dt_64_bit_kernel()) {
+               T_SKIP("This test is only applicable to arm64");
+       }
+
+       T_LOG("Attempting to allocate VA space in 1 GB chunks.");
+
+       for (i = 0; i < (ALLOC_TEST_GB * 2); i++) {
+               res = mmap(NULL, 1 * GB, PROT_NONE, MAP_PRIVATE | MAP_ANON, 0, 0);
+               if (res == MAP_FAILED) {
+                       if (errno != ENOMEM) {
+                               T_WITH_ERRNO;
+                               T_LOG("mmap failed: stopped at %d of %d GB allocated", i, ALLOC_TEST_GB);
+                       }
+                       break;
+               } else {
+                       T_LOG("%d: %p\n", i, res);
+               }
+       }
+
+#if defined(ENTITLED)
+       T_EXPECT_GE_INT(i, ALLOC_TEST_GB, "Allocate at least %d GB of VA space", ALLOC_TEST_GB);
+#else
+       T_EXPECT_LT_INT(i, ALLOC_TEST_GB, "Not permitted to allocate %d GB of VA space", ALLOC_TEST_GB);
+#endif
+}
diff --git a/tests/jumbo_va_spaces_28530648.entitlements b/tests/jumbo_va_spaces_28530648.entitlements
new file mode 100644 (file)
index 0000000..9a1d0fb
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>dynamic-codesigning</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/kdebug.c b/tests/kdebug.c
new file mode 100644 (file)
index 0000000..6be5164
--- /dev/null
@@ -0,0 +1,1101 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <dispatch/dispatch.h>
+#include <inttypes.h>
+#include <ktrace/session.h>
+#include <ktrace/private.h>
+#include <kperf/kperf.h>
+#include <mach/clock_types.h>
+#include <mach/dyld_kernel.h>
+#include <mach/host_info.h>
+#include <mach/mach.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <os/assumes.h>
+#include <stdlib.h>
+#include <sys/kdebug.h>
+#include <sys/kdebug_signpost.h>
+#include <sys/sysctl.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.ktrace"),
+               T_META_ASROOT(true));
+
+#define KDBG_TEST_MACROS    1
+#define KDBG_TEST_OLD_TIMES 2
+
+static void
+assert_kdebug_test(unsigned int flavor)
+{
+       size_t size = flavor; /* the test flavor is handed to the kernel via the sysctl's oldlen argument */
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST };
+       T_ASSERT_POSIX_SUCCESS(sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL,
+                       &size, NULL, 0), "KERN_KDTEST sysctl");
+}
+
+#pragma mark kdebug syscalls
+
+#define TRACE_DEBUGID (0xfedfed00U)
+
+T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){});
+
+       __block int events_seen = 0;
+       ktrace_events_single(s, TRACE_DEBUGID, ^void(struct trace_point *tp) {
+               events_seen++;
+               T_PASS("saw traced event");
+
+               T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct");
+               T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct");
+               T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct");
+               T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct");
+
+               ktrace_end(s, 1);
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               T_EXPECT_GE(events_seen, 1, NULL);
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       ktrace_filter_pid(s, getpid());
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+       T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL);
+       ktrace_end(s, 0);
+
+       dispatch_main();
+}
+
+#define SIGNPOST_SINGLE_CODE (0x10U)
+#define SIGNPOST_PAIRED_CODE (0x20U)
+
+T_DECL(kdebug_signpost_syscall,
+               "test that kdebug_signpost(2) emits correct events")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       __block int single_seen = 0;
+       __block int paired_seen = 0;
+
+       /* make sure to get enough events for the KDBUFWAIT to trigger */
+       // ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){});
+       ktrace_events_single(s,
+                       APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_SINGLE_CODE),
+                       ^(struct trace_point *tp) {
+               single_seen++;
+               T_PASS("single signpost is traced");
+
+               T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct");
+               T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct");
+       });
+
+       ktrace_events_single_paired(s,
+                       APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_PAIRED_CODE),
+                       ^(struct trace_point *start, struct trace_point *end) {
+               paired_seen++;
+               T_PASS("paired signposts are traced");
+
+               T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct");
+               T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct");
+               T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct");
+               T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct");
+
+               T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct");
+               T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct");
+               T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct");
+               T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct");
+
+               T_EXPECT_EQ(single_seen, 1,
+                               "signposts are traced in the correct order");
+
+               ktrace_end(s, 1);
+       });
+
+       ktrace_set_completion_handler(s, ^(void) {
+               T_QUIET; T_EXPECT_NE(single_seen, 0,
+                               "did not see single tracepoint before timeout");
+               T_QUIET; T_EXPECT_NE(paired_seen, 0,
+                               "did not see paired tracepoint before timeout");
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       ktrace_filter_pid(s, getpid());
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
+                       "started tracing");
+
+       T_EXPECT_POSIX_SUCCESS(kdebug_signpost(SIGNPOST_SINGLE_CODE, 1, 2, 3, 4),
+                       "emitted single signpost");
+       T_EXPECT_POSIX_SUCCESS(
+                       kdebug_signpost_start(SIGNPOST_PAIRED_CODE, 5, 6, 7, 8),
+                       "emitted start signpost");
+       T_EXPECT_POSIX_SUCCESS(
+                       kdebug_signpost_end(SIGNPOST_PAIRED_CODE, 9, 10, 11, 12),
+                       "emitted end signpost");
+       ktrace_end(s, 0);
+
+       dispatch_main();
+}
+
+#pragma mark kdebug behaviors
+
+#define WRAPPING_EVENTS_COUNT     (150000)
+#define TRACE_ITERATIONS          (5000)
+#define WRAPPING_EVENTS_THRESHOLD (100)
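+/*
+ * Each pass of the wait loop below sleeps one second and then emits
+ * TRACE_ITERATIONS (5,000) events into a buffer sized for
+ * WRAPPING_EVENTS_COUNT (150,000) events, so the buffer should wrap within
+ * roughly 30 passes; the computed timeout adds 5 seconds of slack on top.
+ */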
+
+T_DECL(wrapping,
+               "ensure that a wrapped trace reports lost events and contains no events from before the wrap",
+               T_META_CHECK_LEAKS(false))
+{
+       int mib[4];
+       kbufinfo_t buf_info;
+       int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5;
+       int current_secs = wait_wrapping_secs;
+
+       /* use sysctls manually to bypass libktrace assumptions */
+
+       mib[0] = CTL_KERN; mib[1] = KERN_KDEBUG; mib[2] = KERN_KDSETUP; mib[3] = 0;
+       size_t needed = 0;
+       T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0),
+                       "KERN_KDSETUP");
+
+       mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT;
+       T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF");
+
+       mib[2] = KERN_KDENABLE; mib[3] = 1;
+       T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE");
+
+       /* wrapping is on by default */
+
+       /* wait until wrapped */
+       T_LOG("waiting for trace to wrap");
+       mib[2] = KERN_KDGETBUF;
+       needed = sizeof(buf_info);
+       do {
+               sleep(1);
+               for (int i = 0; i < TRACE_ITERATIONS; i++) {
+                       T_QUIET;
+                       T_ASSERT_POSIX_SUCCESS(kdebug_trace(0xfefe0000, 0, 0, 0, 0), NULL);
+               }
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, &buf_info, &needed, NULL, 0),
+                               NULL);
+       } while (!(buf_info.flags & KDBG_WRAPPED) && --current_secs > 0);
+
+       T_ASSERT_TRUE(buf_info.flags & KDBG_WRAPPED,
+                       "trace wrapped (after %d seconds within %d second timeout)",
+                       wait_wrapping_secs - current_secs, wait_wrapping_secs);
+
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_ASSERT_NOTNULL(s, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(s), NULL);
+
+       __block int events = 0;
+
+       ktrace_events_all(s, ^(struct trace_point *tp) {
+               if (events == 0) {
+                       T_EXPECT_EQ(tp->debugid, (unsigned int)TRACE_LOST_EVENTS,
+                                       "first event's debugid 0x%08x (%s) should be TRACE_LOST_EVENTS",
+                                       tp->debugid,
+                                       ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK));
+               } else {
+                       T_QUIET;
+                       T_EXPECT_NE(tp->debugid, (unsigned int)TRACE_LOST_EVENTS,
+                                       "event debugid 0x%08x (%s) should not be TRACE_LOST_EVENTS",
+                                       tp->debugid,
+                                       ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK));
+               }
+
+               events++;
+               if (events > WRAPPING_EVENTS_THRESHOLD) {
+                       ktrace_end(s, 1);
+               }
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
+                       "started tracing");
+
+       dispatch_main();
+}
+
+T_DECL(reject_old_events,
+               "ensure that kdebug rejects events from before tracing began",
+               T_META_CHECK_LEAKS(false))
+{
+       __block uint64_t event_horizon_ts;
+
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       __block int events = 0;
+       ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
+                       KDBG_EVENTID(DBG_BSD + 1, 0, 0), ^(struct trace_point *tp) {
+               events++;
+               T_EXPECT_GT(tp->timestamp, event_horizon_ts,
+                               "events in trace should be from after tracing began");
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               T_EXPECT_EQ(events, 2, "should see only two events");
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       event_horizon_ts = mach_absolute_time();
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+       /* first, try an old event at the beginning of trace */
+       assert_kdebug_test(KDBG_TEST_OLD_TIMES);
+       /* after a good event has been traced, old events should be rejected */
+       assert_kdebug_test(KDBG_TEST_OLD_TIMES);
+       ktrace_end(s, 0);
+
+       dispatch_main();
+}
+
+#define ORDERING_TIMEOUT_SEC 5
+
+T_DECL(ascending_time_order,
+               "ensure that kdebug events are in ascending order based on time",
+               T_META_CHECK_LEAKS(false))
+{
+       __block uint64_t prev_ts = 0;
+       __block uint32_t prev_debugid = 0;
+       __block unsigned int prev_cpu = 0;
+       __block bool in_order = true;
+
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       ktrace_events_all(s, ^(struct trace_point *tp) {
+               if (tp->timestamp < prev_ts) {
+                       in_order = false;
+                       T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)",
+                                       prev_ts, prev_debugid, prev_cpu);
+                       T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)",
+                                       tp->timestamp, tp->debugid, tp->cpuid);
+                       ktrace_end(s, 1);
+               }
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               T_EXPECT_TRUE(in_order, "event timestamps were in-order");
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()),
+                       "started tracing");
+
+       /* try to inject old timestamps into trace */
+       assert_kdebug_test(KDBG_TEST_OLD_TIMES);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ORDERING_TIMEOUT_SEC * NSEC_PER_SEC),
+                       dispatch_get_main_queue(), ^{
+               T_LOG("ending test after timeout");
+               ktrace_end(s, 1);
+       });
+
+       dispatch_main();
+}
+
+#pragma mark dyld tracing
+
+__attribute__((aligned(8)))
+static const char map_uuid[16] = "map UUID";
+
+__attribute__((aligned(8)))
+static const char unmap_uuid[16] = "unmap UUID";
+
+__attribute__((aligned(8)))
+static const char sc_uuid[16] = "shared UUID";
+
+static fsid_t map_fsid = { .val = { 42, 43 } };
+static fsid_t unmap_fsid = { .val = { 44, 45 } };
+static fsid_t sc_fsid = { .val = { 46, 47 } };
+
+static fsobj_id_t map_fsobjid = { .fid_objno = 42, .fid_generation = 43 };
+static fsobj_id_t unmap_fsobjid = { .fid_objno = 44, .fid_generation = 45 };
+static fsobj_id_t sc_fsobjid = { .fid_objno = 46, .fid_generation = 47 };
+
+#define MAP_LOAD_ADDR   0xabadcafe
+#define UNMAP_LOAD_ADDR 0xfeedface
+#define SC_LOAD_ADDR    0xfedfaced
+
+__unused
+static void
+expect_dyld_image_info(struct trace_point *tp, const uint64_t *exp_uuid,
+               uint64_t exp_load_addr, fsid_t *exp_fsid, fsobj_id_t *exp_fsobjid,
+               int order)
+{
+#if defined(__LP64__) || defined(__arm64__)
+       if (order == 0) {
+               uint64_t uuid[2];
+               uint64_t load_addr;
+               fsid_t fsid;
+
+               uuid[0] = (uint64_t)tp->arg1;
+               uuid[1] = (uint64_t)tp->arg2;
+               load_addr = (uint64_t)tp->arg3;
+               fsid.val[0] = (int32_t)(tp->arg4 & UINT32_MAX);
+               fsid.val[1] = (int32_t)((uint64_t)tp->arg4 >> 32);
+
+               T_QUIET; T_EXPECT_EQ(uuid[0], exp_uuid[0], NULL);
+               T_QUIET; T_EXPECT_EQ(uuid[1], exp_uuid[1], NULL);
+               T_QUIET; T_EXPECT_EQ(load_addr, exp_load_addr, NULL);
+               T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL);
+               T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL);
+       } else if (order == 1) {
+               fsobj_id_t fsobjid;
+
+               fsobjid.fid_objno = (uint32_t)(tp->arg1 & UINT32_MAX);
+               fsobjid.fid_generation = (uint32_t)((uint64_t)tp->arg1 >> 32);
+
+               T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL);
+               T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation,
+                               exp_fsobjid->fid_generation, NULL);
+       } else {
+               T_ASSERT_FAIL("unrecognized order of events %d", order);
+       }
+#else /* defined(__LP64__) || defined(__arm64__) */
+       if (order == 0) {
+               uint32_t uuid[4];
+
+               uuid[0] = (uint32_t)tp->arg1;
+               uuid[1] = (uint32_t)tp->arg2;
+               uuid[2] = (uint32_t)tp->arg3;
+               uuid[3] = (uint32_t)tp->arg4;
+
+               T_QUIET; T_EXPECT_EQ(uuid[0], (uint32_t)exp_uuid[0], NULL);
+               T_QUIET; T_EXPECT_EQ(uuid[1], (uint32_t)(exp_uuid[0] >> 32), NULL);
+               T_QUIET; T_EXPECT_EQ(uuid[2], (uint32_t)exp_uuid[1], NULL);
+               T_QUIET; T_EXPECT_EQ(uuid[3], (uint32_t)(exp_uuid[1] >> 32), NULL);
+       } else if (order == 1) {
+               uint32_t load_addr;
+               fsid_t fsid;
+               fsobj_id_t fsobjid;
+
+               load_addr = (uint32_t)tp->arg1;
+               fsid.val[0] = (int32_t)tp->arg2;
+               fsid.val[1] = (int32_t)tp->arg3;
+               fsobjid.fid_objno = (uint32_t)tp->arg4;
+
+               T_QUIET; T_EXPECT_EQ(load_addr, (uint32_t)exp_load_addr, NULL);
+               T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL);
+               T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL);
+               T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL);
+       } else if (order == 2) {
+               fsobj_id_t fsobjid;
+
+               fsobjid.fid_generation = tp->arg1;
+
+               T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation,
+                               exp_fsobjid->fid_generation, NULL);
+       } else {
+               T_ASSERT_FAIL("unrecognized order of events %d", order);
+       }
+#endif /* defined(__LP64__) || defined(__arm64__) */
+}
+
+#if defined(__LP64__) || defined(__arm64__)
+#define DYLD_CODE_OFFSET (0)
+#define DYLD_EVENTS      (2)
+#else
+#define DYLD_CODE_OFFSET (2)
+#define DYLD_EVENTS      (3)
+#endif
+
+static void
+expect_dyld_events(ktrace_session_t s, const char *name, uint32_t base_code,
+               const char *exp_uuid, uint64_t exp_load_addr, fsid_t *exp_fsid,
+               fsobj_id_t *exp_fsobjid, uint8_t *saw_events)
+{
+       for (int i = 0; i < DYLD_EVENTS; i++) {
+               ktrace_events_single(s, KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID,
+                               base_code + DYLD_CODE_OFFSET + (unsigned int)i),
+                               ^(struct trace_point *tp) {
+                       T_LOG("checking %s event %c", name, 'A' + i);
+                       expect_dyld_image_info(tp, (const void *)exp_uuid, exp_load_addr,
+                                       exp_fsid, exp_fsobjid, i);
+                       *saw_events |= (1U << i);
+               });
+       }
+}
+
+T_DECL(dyld_events, "test that dyld registering libraries emits events")
+{
+       dyld_kernel_image_info_t info;
+
+       /*
+        * Use pointers instead of __block variables in order to use these variables
+        * in the completion block below _and_ pass pointers to them to the
+        * expect_dyld_events function.
+        */
+       uint8_t saw_events[3] = { 0 };
+       uint8_t *saw_mapping = &(saw_events[0]);
+       uint8_t *saw_unmapping = &(saw_events[1]);
+       uint8_t *saw_shared_cache = &(saw_events[2]);
+
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()),
+                       "filtered to current process");
+
+       expect_dyld_events(s, "mapping", DBG_DYLD_UUID_MAP_A, map_uuid,
+                       MAP_LOAD_ADDR, &map_fsid, &map_fsobjid, saw_mapping);
+       expect_dyld_events(s, "unmapping", DBG_DYLD_UUID_UNMAP_A, unmap_uuid,
+                       UNMAP_LOAD_ADDR, &unmap_fsid, &unmap_fsobjid, saw_unmapping);
+       expect_dyld_events(s, "shared cache", DBG_DYLD_UUID_SHARED_CACHE_A,
+                       sc_uuid, SC_LOAD_ADDR, &sc_fsid, &sc_fsobjid, saw_shared_cache);
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+
+               T_EXPECT_EQ(__builtin_popcount(*saw_mapping), DYLD_EVENTS, NULL);
+               T_EXPECT_EQ(__builtin_popcount(*saw_unmapping), DYLD_EVENTS, NULL);
+               T_EXPECT_EQ(__builtin_popcount(*saw_shared_cache), DYLD_EVENTS, NULL);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+       info.load_addr = MAP_LOAD_ADDR;
+       memcpy(info.uuid, map_uuid, sizeof(info.uuid));
+       info.fsid = map_fsid;
+       info.fsobjid = map_fsobjid;
+       T_EXPECT_MACH_SUCCESS(task_register_dyld_image_infos(mach_task_self(),
+                       &info, 1), "registered dyld image info");
+
+       info.load_addr = UNMAP_LOAD_ADDR;
+       memcpy(info.uuid, unmap_uuid, sizeof(info.uuid));
+       info.fsid = unmap_fsid;
+       info.fsobjid = unmap_fsobjid;
+       T_EXPECT_MACH_SUCCESS(task_unregister_dyld_image_infos(mach_task_self(),
+                       &info, 1), "unregistered dyld image info");
+
+       info.load_addr = SC_LOAD_ADDR;
+       memcpy(info.uuid, sc_uuid, sizeof(info.uuid));
+       info.fsid = sc_fsid;
+       info.fsobjid = sc_fsobjid;
+       T_EXPECT_MACH_SUCCESS(task_register_dyld_shared_cache_image_info(
+                       mach_task_self(), info, FALSE, FALSE),
+                       "registered dyld shared cache image info");
+
+       ktrace_end(s, 0);
+
+       dispatch_main();
+}
+
+#pragma mark kdebug kernel macros
+
+#define EXP_KERNEL_EVENTS 5U
+
+static const uint32_t dev_evts[EXP_KERNEL_EVENTS] = {
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 0),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 1),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 2),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 3),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 4),
+};
+
+static const uint32_t rel_evts[EXP_KERNEL_EVENTS] = {
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 5),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 6),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 7),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 8),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 9),
+};
+
+static const uint32_t filt_evts[EXP_KERNEL_EVENTS] = {
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 10),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 11),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 12),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 13),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 14),
+};
+
+static const uint32_t noprocfilt_evts[EXP_KERNEL_EVENTS] = {
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 15),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 16),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 17),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 18),
+       BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 19),
+};
+
+static bool
+is_development_kernel(void)
+{
+       static dispatch_once_t is_development_once;
+       static bool is_development;
+
+       dispatch_once(&is_development_once, ^{
+               int dev;
+               size_t dev_size = sizeof(dev);
+
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev,
+                                       &dev_size, NULL, 0), NULL);
+               is_development = (dev != 0);
+       });
+
+       return is_development;
+}
+
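+/*
+ * expect_event() counts how many of the given test events have been seen and
+ * checks their arguments.  The assertions encode the convention that the Nth
+ * matching event carries 1, 2, ..., N in its first N arguments and zeros in
+ * the rest, which is how the in-kernel KDBG_TEST_MACROS flavor is presumed to
+ * emit them.
+ */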
+static void
+expect_event(struct trace_point *tp, const char *name, unsigned int *events,
+               const uint32_t *event_ids, size_t event_ids_len)
+{
+       unsigned int event_idx = *events;
+       bool event_found = false;
+       size_t i;
+       for (i = 0; i < event_ids_len; i++) {
+               if (event_ids[i] == (tp->debugid & KDBG_EVENTID_MASK)) {
+                       T_LOG("found %s event 0x%x", name, tp->debugid);
+                       event_found = true;
+               }
+       }
+
+       if (!event_found) {
+               return;
+       }
+
+       *events += 1;
+       for (i = 0; i < event_idx; i++) {
+               T_QUIET; T_EXPECT_EQ(((uint64_t *)&tp->arg1)[i], (uint64_t)i + 1,
+                               NULL);
+       }
+       for (; i < 4; i++) {
+               T_QUIET; T_EXPECT_EQ(((uint64_t *)&tp->arg1)[i], (uint64_t)0, NULL);
+       }
+}
+
+static void
+expect_release_event(struct trace_point *tp, unsigned int *events)
+{
+       expect_event(tp, "release", events, rel_evts,
+                       sizeof(rel_evts) / sizeof(rel_evts[0]));
+}
+
+static void
+expect_development_event(struct trace_point *tp, unsigned int *events)
+{
+       expect_event(tp, "dev", events, dev_evts, sizeof(dev_evts) / sizeof(dev_evts[0]));
+}
+
+static void
+expect_filtered_event(struct trace_point *tp, unsigned int *events)
+{
+       expect_event(tp, "filtered", events, filt_evts,
+                       sizeof(filt_evts) / sizeof(filt_evts[0]));
+}
+
+static void
+expect_noprocfilt_event(struct trace_point *tp, unsigned int *events)
+{
+       expect_event(tp, "noprocfilt", events, noprocfilt_evts,
+                       sizeof(noprocfilt_evts) / sizeof(noprocfilt_evts[0]));
+}
+
+static void
+expect_kdbg_test_events(ktrace_session_t s, bool use_all_callback,
+               void (^cb)(unsigned int dev_seen, unsigned int rel_seen,
+               unsigned int filt_seen, unsigned int noprocfilt_seen))
+{
+       __block unsigned int dev_seen = 0;
+       __block unsigned int rel_seen = 0;
+       __block unsigned int filt_seen = 0;
+       __block unsigned int noprocfilt_seen = 0;
+
+       void (^evtcb)(struct trace_point *tp) = ^(struct trace_point *tp) {
+               expect_development_event(tp, &dev_seen);
+               expect_release_event(tp, &rel_seen);
+               expect_filtered_event(tp, &filt_seen);
+               expect_noprocfilt_event(tp, &noprocfilt_seen);
+       };
+
+       if (use_all_callback) {
+               ktrace_events_all(s, evtcb);
+       } else {
+               ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
+                               KDBG_EVENTID(DBG_BSD + 1, 0, 0), evtcb);
+       }
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               cb(dev_seen, rel_seen, filt_seen, noprocfilt_seen);
+               T_END;
+       });
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+       assert_kdebug_test(KDBG_TEST_MACROS);
+
+       ktrace_end(s, 0);
+}
+
+T_DECL(kernel_events, "ensure kernel macros work")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()),
+                       "filtered events to current process");
+
+       expect_kdbg_test_events(s, false,
+                       ^(unsigned int dev_seen, unsigned int rel_seen,
+                       unsigned int filt_seen, unsigned int noprocfilt_seen) {
+               /*
+                * Development-only events are only filtered if running on an embedded
+                * OS.
+                */
+               unsigned int dev_exp;
+#if TARGET_OS_EMBEDDED
+               dev_exp = is_development_kernel() ? EXP_KERNEL_EVENTS : 0U;
+#else
+               dev_exp = EXP_KERNEL_EVENTS;
+#endif
+
+               T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS,
+                               "release and development events seen");
+               T_EXPECT_EQ(dev_seen, dev_exp, "development-only events %sseen",
+                               dev_exp ? "" : "not ");
+               T_EXPECT_EQ(filt_seen, dev_exp, "filter-only events seen");
+               T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS,
+                               "process filter-agnostic events seen");
+       });
+
+       dispatch_main();
+}
+
+T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()),
+                       "filtered events to current process");
+
+       expect_kdbg_test_events(s, true,
+                       ^(unsigned int dev_seen, unsigned int rel_seen,
+                       unsigned int filt_seen, unsigned int noprocfilt_seen) {
+               T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL);
+#if defined(__arm__) || defined(__arm64__)
+               T_EXPECT_EQ(dev_seen, is_development_kernel() ? EXP_KERNEL_EVENTS : 0U,
+                               NULL);
+#else
+               T_EXPECT_EQ(dev_seen, EXP_KERNEL_EVENTS,
+                               "development-only events seen");
+#endif /* defined(__arm__) || defined(__arm64__) */
+               T_EXPECT_EQ(filt_seen, 0U, "no filter-only events seen");
+               T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS,
+                               "process filter-agnostic events seen");
+       });
+
+       dispatch_main();
+}
+
+T_DECL(kernel_events_noprocfilt,
+               "ensure that the no process filter kernel macros work")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       /*
+        * Only allow launchd events through.
+        */
+       T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, 1), "filtered events to launchd");
+       for (size_t i = 0; i < sizeof(noprocfilt_evts) / sizeof(noprocfilt_evts[0]); i++) {
+               T_QUIET;
+               T_ASSERT_POSIX_ZERO(ktrace_ignore_process_filter_for_event(s,
+                               noprocfilt_evts[i]),
+                               "ignored process filter for noprocfilt event");
+       }
+
+       expect_kdbg_test_events(s, false,
+                       ^(unsigned int dev_seen, unsigned int rel_seen,
+                       unsigned int filt_seen, unsigned int noprocfilt_seen) {
+               T_EXPECT_EQ(rel_seen, 0U, "release and development events not seen");
+               T_EXPECT_EQ(dev_seen, 0U, "development-only events not seen");
+               T_EXPECT_EQ(filt_seen, 0U, "filter-only events not seen");
+
+               T_EXPECT_EQ(noprocfilt_seen, EXP_KERNEL_EVENTS,
+                               "process filter-agnostic events seen");
+       });
+
+       dispatch_main();
+}
+
+static volatile bool continue_abuse = true;
+
+#define STRESS_DEBUGID (0xfeedfac0)
+#define ABUSE_SECS (10)
+#define TIMER_NS (100 * NSEC_PER_USEC)
+/*
+ * Use the quantum as the gap threshold.
+ */
+#define GAP_THRESHOLD_NS (10 * NSEC_PER_MSEC)
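+/*
+ * A gap longer than one quantum in the recovered trace file is tolerated only
+ * if the event ending it is an interrupt, CPU-idle, interrupt-controller, or
+ * decrementer event, or if it is a TRACE_LOST_EVENTS marker; the read-back
+ * completion handler below enforces this.
+ */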
+
+static void *
+kdebug_abuser_thread(void *ctx)
+{
+       unsigned int id = (unsigned int)(uintptr_t)ctx;
+       uint64_t i = 0;
+       while (continue_abuse) {
+               kdebug_trace(STRESS_DEBUGID, id, i, 0, 0);
+               i++;
+       }
+
+       return NULL;
+}
+
+T_DECL(stress, "emit events on all but one CPU with a small buffer",
+               T_META_CHECK_LEAKS(false))
+{
+       T_SETUPBEGIN;
+       ktrace_session_t s = ktrace_session_create();
+       T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+       /* Let's not waste any time with pleasantries. */
+       ktrace_set_uuid_map_enabled(s, KTRACE_FEATURE_DISABLED);
+
+       /* Ouch. */
+       ktrace_events_all(s, ^(__unused struct trace_point *tp) {});
+       ktrace_set_vnode_paths_enabled(s, KTRACE_FEATURE_ENABLED);
+       (void)atexit_b(^{ kperf_reset(); });
+       (void)kperf_action_count_set(1);
+       (void)kperf_timer_count_set(1);
+       int kperror = kperf_timer_period_set(0, kperf_ns_to_ticks(TIMER_NS));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_timer_period_set %llu ns",
+                       TIMER_NS);
+       kperror = kperf_timer_action_set(0, 1);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_timer_action_set");
+       kperror = kperf_action_samplers_set(1, KPERF_SAMPLER_TINFO |
+                       KPERF_SAMPLER_TH_SNAPSHOT | KPERF_SAMPLER_KSTACK |
+                       KPERF_SAMPLER_USTACK | KPERF_SAMPLER_MEMINFO |
+                       KPERF_SAMPLER_TINFO_SCHED | KPERF_SAMPLER_TH_DISPATCH |
+                       KPERF_SAMPLER_TK_SNAPSHOT | KPERF_SAMPLER_SYS_MEM |
+                       KPERF_SAMPLER_TH_INSTRS_CYCLES);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperror, "kperf_action_samplers_set");
+       /* You monster... */
+
+       /* The coup-de-grace. */
+       ktrace_set_buffer_size(s, 10);
+
+       char filepath_arr[MAXPATHLEN] = "";
+       strlcpy(filepath_arr, dt_tmpdir(), sizeof(filepath_arr));
+       strlcat(filepath_arr, "/stress.ktrace", sizeof(filepath_arr));
+       char *filepath = filepath_arr;
+
+       int ncpus = 0;
+       size_t ncpus_size = sizeof(ncpus);
+       int ret = sysctlbyname("hw.logicalcpu_max", &ncpus, &ncpus_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(\"hw.logicalcpu_max\")");
+       T_QUIET; T_ASSERT_GT(ncpus, 0, "realistic number of CPUs");
+
+       pthread_t *threads = calloc((unsigned int)ncpus - 1, sizeof(pthread_t));
+       T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(threads, "calloc(%d threads)",
+                       ncpus - 1);
+
+       ktrace_set_completion_handler(s, ^{
+               T_SETUPBEGIN;
+               ktrace_session_destroy(s);
+
+               T_LOG("trace ended, searching for gaps");
+
+               ktrace_session_t sread = ktrace_session_create();
+               T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(sread, "ktrace_session_create");
+
+               int error = ktrace_set_file(sread, filepath);
+               T_QUIET; T_ASSERT_POSIX_ZERO(error, "ktrace_set_file %s", filepath);
+
+               ktrace_file_t f = ktrace_file_open(filepath, false);
+               T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(f, "ktrace_file_open %s",
+                               filepath);
+               uint64_t first_timestamp = 0;
+               error = ktrace_file_earliest_timestamp(f, &first_timestamp);
+               T_QUIET; T_ASSERT_POSIX_ZERO(error, "ktrace_file_earliest_timestamp");
+
+               uint64_t last_timestamp = 0;
+               (void)ktrace_file_latest_timestamp(f, &last_timestamp);
+
+               __block uint64_t prev_timestamp = 0;
+               __block uint64_t nevents = 0;
+               ktrace_events_all(sread, ^(struct trace_point *tp) {
+                       nevents++;
+                       uint64_t delta_ns = 0;
+                       T_QUIET; T_EXPECT_GE(tp->timestamp, prev_timestamp,
+                                       "timestamps are monotonically increasing");
+                       int converror = ktrace_convert_timestamp_to_nanoseconds(sread,
+                                       tp->timestamp - prev_timestamp, &delta_ns);
+                       T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns");
+                       if (prev_timestamp && delta_ns > GAP_THRESHOLD_NS) {
+                               if (tp->debugname) {
+                                       T_LOG("gap: %gs at %llu - %llu on %d: %s (%#08x)",
+                                                       (double)delta_ns / 1e9, prev_timestamp,
+                                                       tp->timestamp, tp->cpuid, tp->debugname, tp->debugid);
+                               } else {
+                                       T_LOG("gap: %gs at %llu - %llu on %d: %#x",
+                                                       (double)delta_ns / 1e9, prev_timestamp,
+                                                       tp->timestamp, tp->cpuid, tp->debugid);
+                               }
+
+                               /*
+                                * These gaps are ok -- they appear after CPUs are brought back
+                                * up.
+                                */
+#define INTERRUPT (0x1050000)
+#define PERF_CPU_IDLE (0x27001000)
+#define INTC_HANDLER (0x5000004)
+#define DECR_TRAP (0x1090000)
+                               uint32_t eventid = tp->debugid & KDBG_EVENTID_MASK;
+                               if (eventid != INTERRUPT && eventid != PERF_CPU_IDLE &&
+                                               eventid != INTC_HANDLER && eventid != DECR_TRAP) {
+                                       unsigned int lost_events = TRACE_LOST_EVENTS;
+                                       T_QUIET; T_EXPECT_EQ(tp->debugid, lost_events,
+                                                       "gaps should end with lost events");
+                               }
+                       }
+
+                       prev_timestamp = tp->timestamp;
+               });
+               ktrace_events_single(sread, TRACE_LOST_EVENTS, ^(struct trace_point *tp){
+                       T_LOG("lost: %llu on %d (%lu)", tp->timestamp, tp->cpuid, tp->arg1);
+               });
+
+               __block uint64_t last_write = 0;
+               ktrace_events_single_paired(sread, TRACE_WRITING_EVENTS,
+                               ^(struct trace_point *start, struct trace_point *end) {
+                       uint64_t delta_ns;
+                       int converror = ktrace_convert_timestamp_to_nanoseconds(sread,
+                                       start->timestamp - last_write, &delta_ns);
+                       T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns");
+
+                       uint64_t dur_ns;
+                       converror = ktrace_convert_timestamp_to_nanoseconds(sread,
+                                       end->timestamp - start->timestamp, &dur_ns);
+                       T_QUIET; T_ASSERT_POSIX_ZERO(converror, "convert timestamp to ns");
+
+                       T_LOG("write: %llu (+%gs): %gus on %d: %lu events", start->timestamp,
+                                       (double)delta_ns / 1e9, (double)dur_ns / 1e3, end->cpuid, end->arg1);
+                       last_write = end->timestamp;
+               });
+               ktrace_set_completion_handler(sread, ^{
+                       uint64_t duration_ns = 0;
+                       if (last_timestamp) {
+                               int converror = ktrace_convert_timestamp_to_nanoseconds(sread,
+                                               last_timestamp - first_timestamp, &duration_ns);
+                               T_QUIET; T_ASSERT_POSIX_ZERO(converror,
+                                               "convert timestamp to ns");
+                               T_LOG("file was %gs long, %llu events: %g events/msec/cpu",
+                                               (double)duration_ns / 1e9, nevents,
+                                               (double)nevents / ((double)duration_ns / 1e6) / ncpus);
+                       }
+                       (void)unlink(filepath);
+                       ktrace_session_destroy(sread);
+                       T_END;
+               });
+
+               int starterror = ktrace_start(sread, dispatch_get_main_queue());
+               T_QUIET; T_ASSERT_POSIX_ZERO(starterror, "ktrace_start read session");
+
+               T_SETUPEND;
+       });
+
+/* Just kidding... for now. */
+#if 0
+       kperror = kperf_sample_set(1);
+       T_ASSERT_POSIX_SUCCESS(kperror,
+                       "started kperf timer sampling every %llu ns", TIMER_NS);
+#endif
+
+       for (int i = 0; i < (ncpus - 1); i++) {
+               int error = pthread_create(&threads[i], NULL, kdebug_abuser_thread,
+                               (void *)(uintptr_t)i);
+               T_QUIET; T_ASSERT_POSIX_ZERO(error,
+                               "pthread_create abuser thread %d", i);
+       }
+
+       int error = ktrace_start_writing_file(s, filepath,
+                       ktrace_compression_none, NULL, NULL);
+       T_ASSERT_POSIX_ZERO(error, "started writing ktrace to %s", filepath);
+
+       T_SETUPEND;
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ABUSE_SECS * NSEC_PER_SEC),
+                       dispatch_get_main_queue(), ^{
+               T_LOG("ending trace");
+               ktrace_end(s, 1);
+
+               continue_abuse = false;
+               for (int i = 0; i < (ncpus - 1); i++) {
+                       int joinerror = pthread_join(threads[i], NULL);
+                       T_QUIET; T_EXPECT_POSIX_ZERO(joinerror, "pthread_join thread %d",
+                                       i);
+               }
+       });
+
+       dispatch_main();
+}
+
+#define ROUND_TRIP_PERIOD UINT64_C(10 * 1000)
+#define ROUND_TRIPS_THRESHOLD UINT64_C(25)
+#define ROUND_TRIPS_TIMEOUT_SECS (2 * 60)
+#define COLLECTION_INTERVAL_MS 100
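+/*
+ * With a ROUND_TRIP_PERIOD of 10,000 events and a threshold of 25 round-trip
+ * markers, the test has to observe on the order of 250,000 events before it
+ * ends; draining those through the deliberately tiny trace buffer configured
+ * below forces many reads from the kernel, which is the behavior under test.
+ */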
+
+/*
+ * Test a sustained tracing session, involving multiple round-trips to the
+ * kernel.
+ *
+ * Trace all events, and every `ROUND_TRIP_PERIOD` events, emit an event that's
+ * unlikely to be emitted elsewhere.  Look for this event, too, and make sure we
+ * see as many of them as we emitted.
+ *
+ * After seeing `ROUND_TRIPS_THRESHOLD` of the unlikely events, end tracing.
+ * In the failure mode, we won't see any of these, so set a timeout of
+ * `ROUND_TRIPS_TIMEOUT_SECS` to prevent hanging, waiting for events that we'll
+ * never see.
+ */
+T_DECL(round_trips,
+               "test sustained tracing with multiple round-trips through the kernel")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       /*
+        * Set a small buffer and collection interval to increase the number of
+        * round-trips.
+        */
+       ktrace_set_buffer_size(s, 50);
+       ktrace_set_collection_interval(s, COLLECTION_INTERVAL_MS);
+
+       __block uint64_t events = 0;
+       __block uint64_t emitted = 0;
+       __block uint64_t seen = 0;
+       ktrace_events_all(s, ^(__unused struct trace_point *tp) {
+               events++;
+               if (events % ROUND_TRIP_PERIOD == 0) {
+                       T_LOG("emitting round-trip event %" PRIu64, emitted);
+                       kdebug_trace(TRACE_DEBUGID, events, 0, 0, 0);
+                       emitted++;
+               }
+       });
+
+       ktrace_events_single(s, TRACE_DEBUGID, ^(__unused struct trace_point *tp) {
+               T_LOG("saw round-trip event after %" PRIu64 " events", events);
+               seen++;
+               if (seen >= ROUND_TRIPS_THRESHOLD) {
+                       T_LOG("ending trace after seeing %" PRIu64 " events, "
+                                       "emitting %" PRIu64, seen, emitted);
+                       ktrace_end(s, 1);
+               }
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               T_EXPECT_GE(emitted, ROUND_TRIPS_THRESHOLD,
+                               "emitted %" PRIu64 " round-trip events", emitted);
+               T_EXPECT_GE(seen, ROUND_TRIPS_THRESHOLD,
+                               "saw %" PRIu64 " round-trip events", seen);
+               ktrace_session_destroy(s);
+               T_END;
+       });
+
+       int error = ktrace_start(s, dispatch_get_main_queue());
+       T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW,
+                       ROUND_TRIPS_TIMEOUT_SECS * NSEC_PER_SEC), dispatch_get_main_queue(),
+                       ^{
+               T_LOG("ending trace after %d seconds", ROUND_TRIPS_TIMEOUT_SECS);
+               ktrace_end(s, 0);
+       });
+
+       dispatch_main();
+}
+
+#define HEARTBEAT_INTERVAL_SECS 2
+#define HEARTBEAT_COUNT 20
+
+/*
+ * Ensure we see events periodically, checking for recent events on a
+ * heart-beat.
+ */
+T_DECL(event_coverage, "ensure events appear up to the end of tracing")
+{
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "created session");
+
+       __block uint64_t current_timestamp = 0;
+       __block uint64_t events = 0;
+       ktrace_events_all(s, ^(struct trace_point *tp) {
+               current_timestamp = tp->timestamp;
+               events++;
+       });
+
+       ktrace_set_buffer_size(s, 20);
+       ktrace_set_collection_interval(s, COLLECTION_INTERVAL_MS);
+
+       __block uint64_t last_timestamp = 0;
+       __block uint64_t last_events = 0;
+       __block unsigned int heartbeats = 0;
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               T_QUIET; T_EXPECT_GT(events, 0ULL, "should have seen some events");
+               T_END;
+       });
+
+       dispatch_source_t timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER,
+                       0, 0, dispatch_get_main_queue());
+       dispatch_source_set_timer(timer, dispatch_time(DISPATCH_TIME_NOW,
+                       HEARTBEAT_INTERVAL_SECS * NSEC_PER_SEC),
+                       HEARTBEAT_INTERVAL_SECS * NSEC_PER_SEC, 0);
+       dispatch_source_set_cancel_handler(timer, ^{
+               dispatch_release(timer);
+       });
+
+       dispatch_source_set_event_handler(timer, ^{
+               heartbeats++;
+
+               T_LOG("heartbeat %u at time %lld, seen %" PRIu64 " events, "
+                               "current event time %lld", heartbeats, mach_absolute_time(),
+                               events, current_timestamp);
+
+               if (current_timestamp > 0) {
+                       T_EXPECT_GT(current_timestamp, last_timestamp,
+                                       "event timestamps should be increasing");
+                       T_QUIET; T_EXPECT_GT(events, last_events,
+                                       "number of events should be increasing");
+               }
+
+               last_timestamp = current_timestamp;
+               last_events = events;
+
+               if (heartbeats >= HEARTBEAT_COUNT) {
+                       T_LOG("ending trace after %u heartbeats", HEARTBEAT_COUNT);
+                       ktrace_end(s, 0);
+               }
+       });
+
+       int error = ktrace_start(s, dispatch_get_main_queue());
+       T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+       dispatch_activate(timer);
+
+       dispatch_main();
+}
diff --git a/tests/kernel_mtx_perf.c b/tests/kernel_mtx_perf.c
new file mode 100644 (file)
index 0000000..396104f
--- /dev/null
@@ -0,0 +1,306 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+#include <darwintest_utils.h>
+#include <pthread.h>
+#include <launch.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#include <spawn.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.kernel_mtx_perf_test"));
+
+#define ITER 100000
+#define TEST_MTX_MAX_STATS             8
+
+#define TEST_MTX_LOCK_STATS            0
+#define TEST_MTX_UNLOCK_MTX_STATS      6
+
+static void
+test_from_kernel_lock_unlock_contended(void)
+{
+       int i, ret, name_size;
+       uint64_t avg, run, tot;
+       size_t size;
+       char iter[35];
+       char *buff, *buff_p, *avg_p, *name, *end_name;
+
+       T_LOG("Testing locking/unlocking mutex from kernel with contention.\n");
+       T_LOG("Requesting test with %d iterations\n", ITER);
+
+       size = 1000;
+       buff = calloc(size, sizeof(char));
+       T_QUIET;T_ASSERT_NOTNULL(buff, "Allocating buffer for sysctl");
+
+       snprintf(iter, sizeof(iter), "%d", ITER);
+       ret = sysctlbyname("kern.test_mtx_contended", buff, &size, iter, sizeof(iter));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_contended");
+
+       T_LOG("%s stats:\n%s\n", __func__, buff);
+
+       /* first line is "STATS INNER LOOP" */
+       buff_p = buff;
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /*
+        * Sequence of statistic lines like
+        * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS
+        * for all TEST_MTX_MAX_STATS statistics
+        */
+       for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
+               avg_p = strstr(buff_p, "avg ");
+
+               /* contended test records statistics only for lock/unlock for now */
+               if (i == TEST_MTX_LOCK_STATS || i == TEST_MTX_UNLOCK_MTX_STATS ) {
+                       T_QUIET;T_ASSERT_NOTNULL(avg_p, "contended %i average not found", i);
+                       sscanf(avg_p, "avg %llu", &avg);
+
+                       name = strstr(buff_p, "TEST_MTX_");
+                       end_name = strstr(buff_p, "_STATS");
+                       name_size = end_name - name - strlen("TEST_MTX_") + 1;
+
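+                       /*
+                        * The snprintf size argument below is chosen so that copying
+                        * stops just before the trailing "_STATS", turning e.g.
+                        * "TEST_MTX_LOCK_STATS" into the metric name "contended LOCK".
+                        */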
+                       char name_string[40];
+                       char avg_name_string[50];
+                       char *pre_string = "contended ";
+                       snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+                       pre_string = "avg contended ";
+                       snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+                       T_PERF(name_string, avg, "ns", avg_name_string);
+               }
+
+               buff_p = avg_p;
+               while( *buff_p != '\n' ) buff_p++;
+               buff_p++;
+
+       }
+
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /* next line is "STATS OUTER LOOP" */
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /* contended test records statistics only for lock/unlock for now */
+       avg_p = strstr(buff_p, "run time ");
+       T_QUIET;T_ASSERT_NOTNULL(avg_p, "contended %d loop run time not found", 0);
+       sscanf(avg_p, "run time %llu", &run);
+
+       avg_p = strstr(buff_p, "total time ");
+       T_QUIET;T_ASSERT_NOTNULL(avg_p, "contended %d loop total time not found", 0);
+       sscanf(avg_p, "total time %llu", &tot);
+
+       if (run < tot)
+               avg = run;
+       else
+               avg = tot;
+
+       name = strstr(buff_p, "TEST_MTX_");
+       end_name = strstr(buff_p, "_STATS");
+       name_size = end_name - name - strlen("TEST_MTX_") + 1;
+
+       char name_string[50];
+       char avg_name_string[60];
+       char *pre_string = "contended loop ";
+       snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+       pre_string = "avg time contended loop ";
+       snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+       T_PERF(name_string, avg/ITER, "ns", avg_name_string);
+
+       free(buff);
+}
+
+static void
+test_from_kernel_lock_unlock_uncontended(void)
+{
+       int i, ret, name_size;
+       uint64_t avg, run, tot;
+       size_t size;
+       char iter[35];
+       char *buff, *buff_p, *avg_p, *name, *end_name;
+
+       T_LOG("Testing locking/unlocking mutex from kernel without contention.\n");
+       T_LOG("Requesting test with %d iterations\n", ITER);
+
+       size = 2000;
+       buff = calloc(size, sizeof(char));
+       T_QUIET;T_ASSERT_NOTNULL(buff, "Allocating buffer for sysctl");
+
+       snprintf(iter, sizeof(iter), "%d", ITER);
+       ret = sysctlbyname("kern.test_mtx_uncontended", buff, &size, iter, sizeof(iter));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname kern.test_mtx_uncontended");
+
+       T_LOG("%s stats:\n%s\n", __func__, buff);
+
+       /* first line is "STATS INNER LOOP" */
+       buff_p = buff;
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /*
+        * Sequence of statistic lines like
+        * { samples 100000, tot 3586175 ns, avg 35 ns, max 3997 ns, min 33 ns } TEST_MTX_LOCK_STATS
+        * for all TEST_MTX_MAX_STATS statistics
+        */
+       for (i = 0; i < TEST_MTX_MAX_STATS; i++) {
+               avg_p = strstr(buff_p, "avg ");
+               T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %i average not found", i);
+               sscanf(avg_p, "avg %llu", &avg);
+
+               name = strstr(buff_p, "TEST_MTX_");
+               end_name = strstr(buff_p, "_STATS");
+               name_size = end_name - name - strlen("TEST_MTX_") + 1;
+
+               char name_string[40];
+               char avg_name_string[50];
+               char *pre_string = "uncontended ";
+               snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+               pre_string = "avg time uncontended ";
+               snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+               T_PERF(name_string, avg, "ns", avg_name_string);
+
+               buff_p = avg_p;
+               while( *buff_p != '\n' ) buff_p++;
+               buff_p++;
+       }
+
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /* next line is "STATS OUTER LOOP" */
+       while( *buff_p != '\n' ) buff_p++;
+       buff_p++;
+
+       /*
+        * Sequence of statistic lines like
+        * total time 4040673 ns total run time 3981080 ns TEST_MTX_LOCK_STATS
+        * for all TEST_MTX_MAX_STATS statistics except UNLOCK
+        */
+       for (i = 0; i < TEST_MTX_MAX_STATS - 2; i++) {
+               avg_p = strstr(buff_p, "run time ");
+               T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %d loop run time not found", i);
+               sscanf(avg_p, "run time %llu", &run);
+
+               avg_p = strstr(buff_p, "total time ");
+               T_QUIET;T_ASSERT_NOTNULL(avg_p, "uncontended %d loop total time not found", i);
+               sscanf(avg_p, "total time %llu", &tot);
+
+               if (run < tot)
+                       avg = run;
+               else
+                       avg = tot;
+
+               name = strstr(buff_p, "TEST_MTX_");
+               end_name = strstr(buff_p, "_STATS");
+               name_size = end_name - name - strlen("TEST_MTX_") + 1;
+
+               char name_string[50];
+               char avg_name_string[60];
+               char *pre_string = "uncontended loop ";
+               snprintf(name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+               pre_string = "avg time uncontended loop ";
+               snprintf(avg_name_string, name_size + strlen(pre_string), "%s%s", pre_string, &name[strlen("TEST_MTX_")]);
+               T_PERF(name_string, avg/ITER, "ns", avg_name_string);
+
+               buff_p = avg_p;
+               while( *buff_p != '\n' ) buff_p++;
+               buff_p++;
+
+       }
+       free(buff);
+}
+
+extern char **environ;
+static void
+fix_cpu_frequency(void)
+{
+#if CONFIG_EMBEDDED
+       int spawn_ret, pid;
+       char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL};
+
+       T_LOG("Setting cpu frequency to %d\n", 5000);
+
+       spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
+       waitpid(pid, &spawn_ret, 0);
+
+#else /*CONFIG_EMBEDDED*/
+
+       int spawn_ret, pid;
+       int ret, nom_freq;
+       size_t len;
+       float val;
+       char scale;
+       char *buffer, *cpu_freq;
+       char str_val[10];
+
+       ret = sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0);
+       T_QUIET;T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string");
+
+       buffer = malloc(len+2);
+       ret = sysctlbyname("machdep.cpu.brand_string", buffer, &len, NULL, 0);
+       T_QUIET;T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname machdep.cpu.brand_string");
+       buffer[len+1] = '\0';
+
+       cpu_freq = strstr(buffer, "CPU @ ");
+       if (cpu_freq == NULL) {
+               T_LOG("Could not fix frequency, %s field not present\n", "CPU @ ");
+               goto out;
+       }
+
+       if (strstr(cpu_freq, "Hz") != NULL) {
+               sscanf(cpu_freq, "CPU @ %f%cHz", &val, &scale);
+       } else {
+               if (strstr(cpu_freq, "hz") != NULL) {
+                       sscanf(cpu_freq, "CPU @ %f%chz", &val, &scale);
+               } else {
+                       T_LOG("Could not fix frequency, %s field not present\n", "Hz");
+                       goto out;
+               }
+       }
+
+       switch(scale){
+       case 'M':
+       case 'm':
+               nom_freq = (int) val;
+               break;
+       case 'G':
+       case 'g':
+               nom_freq = (int) (val*1000);
+               break;
+       default:
+               T_LOG("Could not fix frequency, scale field is %c\n", scale);
+               goto out;
+       }
+
+       snprintf(str_val, 10, "%d", nom_freq);
+       T_LOG("Setting min and max cpu frequency to %d (%s)\n", nom_freq, str_val);
+       char *xcpm_args[] = {"/usr/local/bin/xcpm", "limits", str_val, str_val, NULL};
+       spawn_ret = posix_spawn(&pid, xcpm_args[0], NULL, NULL, xcpm_args, environ);
+       waitpid(pid, &spawn_ret, 0);
+
+out:
+       free(buffer);
+       return;
+#endif /*CONFIG_EMBEDDED*/
+}
+
+T_DECL(kernel_mtx_perf_test,
+       "Kernel mutex performance test",
+       T_META_ASROOT(YES), T_META_CHECK_LEAKS(NO))
+{
+       fix_cpu_frequency();
+
+       test_from_kernel_lock_unlock_uncontended();
+       test_from_kernel_lock_unlock_contended();
+}
+
diff --git a/tests/kernel_uuid_match.c b/tests/kernel_uuid_match.c
new file mode 100644 (file)
index 0000000..f5f32d4
--- /dev/null
@@ -0,0 +1,192 @@
+#include <darwintest.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <uuid/uuid.h>
+#include <sys/sysctl.h>
+#include <TargetConditionals.h>
+#include <glob.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <mach-o/loader.h>
+#include <mach-o/dyld.h>
+#include <mach-o/swap.h>
+#include <libkern/OSByteOrder.h>
+
+#define MAX_LEN 1024
+
+#if TARGET_OS_MAC && !TARGET_OS_EMBEDDED
+       //running on macOS
+       #define KERNEL_SEARCH_DIR "/System/Library/Kernels/*"
+#else
+       //running on a different OS (e.g. iOS, watchOS, etc.)
+       #define KERNEL_SEARCH_DIR "/*"
+#endif
+
+#define SWAP32(v)              v = OSSwapInt32(v)
+
+
+/* opens and maps the file at [path] in memory,
+ * sets the length in [len] and returns a pointer
+ * to the beginning of the memory region or NULL
+ * if unable to open and map the file
+ */
+static void *open_file(char *path, size_t *len) {
+       int fd;
+       if ((fd = open(path, O_RDONLY)) < 0) {
+               return NULL;
+       }
+       *len = (size_t)lseek(fd, (off_t)0, SEEK_END);
+       void *p = mmap(NULL, *len, PROT_READ, MAP_PRIVATE, fd, 0);
+       close(fd);
+       if (p == MAP_FAILED) {
+               return NULL;
+       }
+       return p;
+}
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+static void __swap_mach_header(struct mach_header *header) {
+       SWAP32(header->magic);
+       SWAP32(header->cputype);
+       SWAP32(header->cpusubtype);
+       SWAP32(header->filetype);
+       SWAP32(header->ncmds);
+       SWAP32(header->sizeofcmds);
+       SWAP32(header->flags);
+}
+
+static void __swap_mach_header_64(struct mach_header_64 *header) {
+       SWAP32(header->magic);
+       SWAP32(header->cputype);
+       SWAP32(header->cpusubtype);
+       SWAP32(header->filetype);
+       SWAP32(header->ncmds);
+       SWAP32(header->sizeofcmds);
+       SWAP32(header->flags);
+}
+#pragma clang diagnostic pop
+
+/* parses the uuid from the file at [path] and sets the uuid in [uuid]
+ * returns true if successfully parses the file, returns false otherwise
+ * (e.g. the file is not a Mach-O binary)
+ */
+static bool parse_binary_uuid(char *path, uuid_t uuid) {
+       size_t len = 0;
+       bool should_swap = false;
+       unsigned int ncmds = 0;
+       struct load_command *lc = NULL;
+       bool ret = false;
+
+       struct mach_header *h = open_file(path, &len);
+       if (!h) {
+               return false;
+       }
+       if (h->magic == MH_MAGIC || h->magic == MH_CIGAM) {
+               //32-bit header
+               struct mach_header *header = h;
+               if (header->magic == MH_CIGAM) {
+                       __swap_mach_header(header);
+                       should_swap = true;
+               }
+               ncmds = header->ncmds;
+               //the first load command is after the header
+               lc = (struct load_command *)(header + 1);
+       } else if (h->magic == MH_MAGIC_64 || h->magic == MH_CIGAM_64) {
+               //64-bit header
+               struct mach_header_64 *header = (struct mach_header_64 *)h;
+               if (header->magic == MH_CIGAM_64) {
+                       __swap_mach_header_64(header);
+                       should_swap = true;
+               }
+               ncmds = header->ncmds;
+               lc = (struct load_command *)(header + 1);
+       } else {
+               //this is not a Mach-O binary, or it is a FAT binary
+               munmap(h, len);
+               return false;
+       }
+       for (unsigned int i = 0; i < ncmds; i++) {
+               uint32_t cmd = lc->cmd;
+               uint32_t cmdsize = lc->cmdsize;
+               if (should_swap) {
+                       SWAP32(cmd);
+                       SWAP32(cmdsize);
+               }
+               if (cmd == LC_UUID) {
+                       struct uuid_command *uuid_cmd =
+                                       (struct uuid_command *)lc;
+                       uuid_copy(uuid, uuid_cmd->uuid);
+                       uuid_string_t tuuid_str;
+                       uuid_unparse(uuid, tuuid_str);
+                       T_LOG("Trying test UUID %s", tuuid_str);
+                       ret = true;
+                       break;
+               }
+               lc = (struct load_command *)((uintptr_t)lc + cmdsize);
+       }
+       munmap(h, len);
+       return ret;
+}
+
+/* uses the kern.uuid sysctl to get the uuid
+ * of the currently running kernel
+ */
+static void get_system_kernel_uuid(uuid_t kuuid) {
+       char kuuid_line[MAX_LEN];
+       memset(kuuid_line, 0, sizeof(kuuid_line));
+       size_t len = sizeof(kuuid_line);
+       int ret = sysctlbyname("kern.uuid", kuuid_line, &len, NULL, 0);
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.uuid");
+
+       T_ASSERT_TRUE(uuid_parse(kuuid_line, kuuid) == 0,
+                       "Parse running kernel uuid");
+}
+
+/* compares [kuuid] to the uuid in each of the kernel binaries on OSes
+ * other than macOS (there can be multiple kernel binaries if the mastering
+ * process doesn't remove all of the irrelevant binaries)
+ */
+static void find_and_compare_test_uuids(char *search_path, uuid_t kuuid) {
+       glob_t g;
+       int ret = glob(search_path, 0, NULL, &g);
+       T_WITH_ERRNO; T_ASSERT_EQ(ret, 0, "glob %s", search_path);
+
+       bool pass = false;
+       for (int i = 0; i < g.gl_matchc; i++) {
+               char *path = g.gl_pathv[i];
+
+               //check that [path] is the path for a file (not a directory, device, etc.)
+               struct stat s;
+               int ret = stat(path, &s);
+               T_ASSERT_POSIX_SUCCESS(ret, "stat %s", path);
+               if ((s.st_mode & S_IFREG) == 0) {
+                       continue;
+               }
+
+               T_LOG("Reading file at path: %s", path);
+               uuid_t tuuid;
+               if (parse_binary_uuid(path, tuuid) &&
+                               uuid_compare(kuuid, tuuid) == 0) {
+                       pass = true;
+                       break;
+               }
+       }
+       globfree(&g);
+       T_EXPECT_TRUE(pass, "The sources match");
+}
+
+T_DECL(uuid_match, "Compare the running kernel UUID to kernel binaries.")
+{
+       uuid_t kuuid;
+       uuid_clear(kuuid);
+       get_system_kernel_uuid(kuuid);
+       uuid_string_t kuuid_str;
+       uuid_unparse(kuuid, kuuid_str);
+       T_LOG("Got running kernel UUID %s", kuuid_str);
+       find_and_compare_test_uuids(KERNEL_SEARCH_DIR, kuuid);
+}
diff --git a/tests/kevent_continuous_time.c b/tests/kevent_continuous_time.c
new file mode 100644 (file)
index 0000000..607cce6
--- /dev/null
@@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <unistd.h>
+
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#include <sys/time.h>
+#include <spawn.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+#include <sys/event.h>
+
+#include <darwintest.h>
+
+extern char **environ;
+
+static mach_timebase_info_data_t tb_info;
+static const uint64_t one_mil = 1000LL*1000LL;
+
+#define tick_to_ns(ticks) (((ticks) * tb_info.numer) / (tb_info.denom))
+#define tick_to_ms(ticks) (tick_to_ns(ticks)/one_mil)
+
+#define ns_to_tick(ns) ((ns) * tb_info.denom / tb_info.numer)
+#define ms_to_tick(ms) (ns_to_tick((ms) * one_mil))
+
+static uint64_t time_delta_ms(void){
+       uint64_t abs_now = mach_absolute_time();
+       uint64_t cnt_now = mach_continuous_time();
+       return tick_to_ms(cnt_now) - tick_to_ms(abs_now);
+}
+
+static int run_sleep_tests = 0;
+
+static int trigger_sleep(int for_secs) {
+       if(!run_sleep_tests) return 0;
+
+       // schedule a wake for_secs seconds from now, then put the system to sleep
+       char buf[10];
+       snprintf(buf, 10, "%d", for_secs);
+
+       T_LOG("Sleeping for %s seconds...", buf);
+
+       int spawn_ret, pid;
+       char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", buf, NULL};
+       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL);
+       
+       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL);
+       T_ASSERT_EQ(spawn_ret, 0, NULL);
+
+       char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL};
+       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL);
+       
+       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL);
+       T_ASSERT_EQ(spawn_ret, 0, NULL);
+
+       return 0;
+}
+
+// waits up to 30 seconds for system to sleep
+// returns number of seconds it took for sleep to be entered
+// or -1 if sleep wasn't accomplished
+static int wait_for_sleep() {
+       if(!run_sleep_tests) return 0;
+
+       uint64_t before_diff = time_delta_ms();
+       
+       for(int i = 0; i < 30; i++) {
+               uint64_t after_diff = time_delta_ms();
+
+               // on macOS, there's enough latency between calls to MCT and MAT
+               // while the system is going down for sleep for the values to diverge by a few ms
+               if(llabs((int64_t)before_diff - (int64_t)after_diff) > 2) {
+                       return i + 1;
+               }
+               
+               sleep(1);
+               T_LOG("waited %d seconds for sleep...", i+1);
+       }
+       return -1;
+}
+
+T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME)", T_META_LTEPHASE(LTE_POSTINIT)){
+       mach_timebase_info(&tb_info);
+       int kq;
+       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
+
+       struct kevent64_s kev = {
+               .ident = 1,
+               .filter = EVFILT_TIMER,
+               .flags = EV_ADD | EV_RECEIPT,
+               .fflags = NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME,
+               .data = 4,
+       };
+       T_LOG("EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);");
+
+       T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL);
+       T_ASSERT_EQ(0ll, kev.data, "No error returned");
+
+       uint64_t abs_then = mach_absolute_time();
+       uint64_t cnt_then = mach_continuous_time();
+
+       trigger_sleep(1);
+       int sleep_secs = wait_for_sleep();
+
+       T_WITH_ERRNO; T_ASSERT_EQ(kevent64(kq, NULL, 0, &kev, 1, 0, NULL), 1, "kevent() should have returned one event");
+       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", kev.ident, kev.filter, kev.flags, kev.fflags, kev.data, kev.udata);
+       T_ASSERT_EQ(kev.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", kev.flags & EV_ERROR ? strerror((int)kev.data) : "no error");
+
+       uint64_t abs_now = mach_absolute_time();
+       uint64_t cnt_now = mach_continuous_time();
+       uint64_t ct_ms_progressed = tick_to_ms(cnt_now - cnt_then);
+       uint64_t ab_ms_progressed = tick_to_ms(abs_now - abs_then);
+
+       T_LOG("ct progressed %llu ms, abs progressed %llu ms", ct_ms_progressed, tick_to_ms(abs_now - abs_then));
+
+       if (run_sleep_tests) {
+               T_ASSERT_GT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 500LL, "should have > 500ms difference between MCT and MAT");
+       } else {
+               T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 10LL, "should have < 10ms difference between MCT and MAT");
+       }
+
+       if (sleep_secs < 4) {
+               T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - 4000), 100LL, "mach_continuous_time should progress ~4 seconds (+/- 100ms) between sleeps");
+       }
+
+       sleep(1);
+
+       kev = (struct kevent64_s){
+               .ident = 1,
+               .filter = EVFILT_TIMER,
+               .flags = EV_DELETE | EV_RECEIPT,
+       };
+       T_LOG("EV_SET(&kev, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0);");
+       T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL);
+       T_ASSERT_EQ(0ll, kev.data, "No error returned");
+
+       T_ASSERT_POSIX_ZERO(close(kq), NULL);
+}
+
+T_DECL(kevent_continuous_time_absolute, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME and NOTE_ABSOLUTE)", T_META_LTEPHASE(LTE_POSTINIT)){
+       mach_timebase_info(&tb_info);
+
+       int kq;
+       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
+
+       struct timeval tv;
+       gettimeofday(&tv, NULL);
+       int64_t nowus   = (int64_t)tv.tv_sec * USEC_PER_SEC + (int64_t)tv.tv_usec;
+       int64_t fire_at = (3*USEC_PER_SEC) + nowus;
+
+       uint64_t cnt_now = mach_continuous_time();
+       uint64_t cnt_then = cnt_now + ms_to_tick(3000);
+
+       T_LOG("currently is %lld, firing at %lld", nowus, fire_at);
+
+       struct kevent64_s kev = {
+               .ident = 2,
+               .filter = EVFILT_TIMER,
+               .flags = EV_ADD | EV_RECEIPT,
+               .fflags = NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS,
+               .data = fire_at,
+       };
+       T_LOG("EV_SET(&kev, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0);");
+
+       T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL);
+       T_ASSERT_EQ(0ll, kev.data, "No error returned");
+
+       T_LOG("testing NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE between sleep");
+
+       trigger_sleep(1);
+
+       struct timespec timeout = {
+               .tv_sec = 10,
+               .tv_nsec = 0,
+       };
+       struct kevent64_s event = {0};
+       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 1, "kevent() should have returned one event");
+       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
+       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
+
+       uint64_t elapsed_ms = tick_to_ms(mach_continuous_time() - cnt_now);
+       int64_t missed_by  = tick_to_ns((int64_t)mach_continuous_time() - (int64_t)cnt_then) / 1000000;
+
+       // ~1/2 second is about as good as we'll get
+       T_ASSERT_LT(llabs(missed_by), 500LL, "timer should pop 3 sec in the future, popped after %lldms", elapsed_ms);
+
+       T_ASSERT_EQ(event.data, 1LL, NULL);
+
+       T_ASSERT_EQ(event.ident, 2ULL, NULL);
+
+       // try getting a periodic tick out of kq
+       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 0, NULL);
+       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
+
+       T_ASSERT_POSIX_ZERO(close(kq), NULL);
+}
+
+T_DECL(kevent_continuous_time_pops, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME with multiple pops)", T_META_LTEPHASE(LTE_POSTINIT)){
+       // have to throttle rate at which pmset is called
+       sleep(2);
+
+       mach_timebase_info(&tb_info);
+
+       int kq;
+       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
+
+       // test that periodic ticks accumulate while asleep
+       struct kevent64_s kev = {
+               .ident = 3,
+               .filter = EVFILT_TIMER,
+               .flags = EV_ADD | EV_RECEIPT,
+               .fflags = NOTE_MACH_CONTINUOUS_TIME,
+               .data = 100,
+       };
+       T_LOG("EV_SET(&kev, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0);");
+
+       // wait for first pop, then sleep
+       T_ASSERT_EQ(kevent64(kq, &kev, 1, &kev, 1, 0, NULL), 1, NULL);
+       T_ASSERT_EQ(0ll, kev.data, "No error returned");
+
+       struct kevent64_s event = {0};
+       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event");
+       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
+       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
+       T_ASSERT_EQ(event.ident, 3ULL, NULL);
+
+       uint64_t cnt_then = mach_continuous_time();
+       trigger_sleep(2);
+
+       int sleep_secs = 0;
+       if(run_sleep_tests) {
+               sleep_secs = wait_for_sleep();
+       }
+       else {
+               // simulate 2 seconds of system "sleep"
+               sleep(2);
+       }
+
+       uint64_t cnt_now = mach_continuous_time();
+
+       uint64_t ms_elapsed = tick_to_ms(cnt_now - cnt_then);
+       if(run_sleep_tests) {
+               T_ASSERT_LT(llabs((int64_t)ms_elapsed - 2000LL), 500LL, "slept for %llums, expected 2000ms (astris is connected?)", ms_elapsed);
+       }
+
+       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event");
+       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
+       T_ASSERT_EQ(event.ident, 3ULL, NULL);
+
+       uint64_t expected_pops = ms_elapsed / 100;
+       uint64_t got_pops      = (uint64_t)event.data;
+
+       T_ASSERT_GE(got_pops, expected_pops - 1, "tracking pops while asleep");
+       T_ASSERT_POSIX_ZERO(close(kq), NULL);
+}
diff --git a/tests/kevent_pty.c b/tests/kevent_pty.c
new file mode 100644 (file)
index 0000000..2fad75e
--- /dev/null
@@ -0,0 +1,299 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif /* T_NAMESPACE */
+
+#include <Block.h>
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <err.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <util.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.kevent"),
+               T_META_CHECK_LEAKS(false));
+
+#define TIMEOUT_SECS 10
+
+static int child_ready[2];
+
+static void
+child_tty_client(void)
+{
+       dispatch_source_t src;
+       char buf[16] = "";
+       ssize_t bytes_wr;
+
+       src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ,
+                       (uintptr_t)STDIN_FILENO, 0, NULL);
+       if (!src) {
+               exit(1);
+       }
+       dispatch_source_set_event_handler(src, ^{});
+
+       dispatch_activate(src);
+
+       close(child_ready[0]);
+       snprintf(buf, sizeof(buf), "%ds", getpid());
+       bytes_wr = write(child_ready[1], buf, strlen(buf));
+       if (bytes_wr < 0) {
+               err(1, "failed to write on child ready pipe");
+       }
+
+       dispatch_main();
+}
+
+static void
+pty_master(void)
+{
+       pid_t child_pid;
+       int ret;
+
+       child_pid = fork();
+       if (child_pid == 0) {
+               child_tty_client();
+       }
+       ret = setpgid(child_pid, child_pid);
+       if (ret < 0) {
+               exit(1);
+       }
+       ret = tcsetpgrp(STDIN_FILENO, child_pid);
+       if (ret < 0) {
+               exit(1);
+       }
+
+       sleep(TIMEOUT_SECS);
+       exit(1);
+}
+
+T_DECL(pty_master_teardown,
+               "try removing a TTY master out from under a PTY slave holding a kevent",
+               T_META_ASROOT(true))
+{
+       __block pid_t master_pid;
+       char buf[16] = "";
+       char *end;
+       ssize_t bytes_rd;
+       size_t buf_len = 0;
+       unsigned long slave_pid;
+       int master_fd;
+       char pty_filename[PATH_MAX];
+       int status;
+
+       T_SETUPBEGIN;
+       T_ASSERT_POSIX_SUCCESS(pipe(child_ready), NULL);
+
+       master_pid = forkpty(&master_fd, pty_filename, NULL, NULL);
+       if (master_pid == 0) {
+               pty_master();
+               __builtin_unreachable();
+       }
+       T_ASSERT_POSIX_SUCCESS(master_pid,
+                       "forked child master PTY with pid %d, at pty %s", master_pid,
+                       pty_filename);
+
+       close(child_ready[1]);
+
+       end = buf;
+       do {
+               bytes_rd = read(child_ready[0], end, sizeof(buf) - buf_len);
+               T_ASSERT_POSIX_SUCCESS(bytes_rd, "read on pipe between master and runner");
+               buf_len += (size_t)bytes_rd;
+               T_LOG("runner read %zd bytes", bytes_rd);
+               end += bytes_rd;
+       } while (bytes_rd != 0 && *(end - 1) != 's');
+
+       slave_pid = strtoul(buf, &end, 0);
+       if (buf == end) {
+               T_ASSERT_FAIL("could not parse child PID from master pipe");
+       }
+
+       T_LOG("got pid %lu for slave process from master", slave_pid);
+       T_SETUPEND;
+
+       T_LOG("sending fatal signal to master");
+       T_ASSERT_POSIX_SUCCESS(kill(master_pid, SIGKILL), NULL);
+
+       T_LOG("sending fatal signal to slave");
+       (void)kill((int)slave_pid, SIGKILL);
+
+       T_ASSERT_POSIX_SUCCESS(waitpid(master_pid, &status, 0), NULL);
+       T_ASSERT_TRUE(WIFSIGNALED(status), "master PID was signaled");
+       (void)waitpid((int)slave_pid, &status, 0);
+}
+
+volatile static bool writing = true;
+
+static void *
+reader_thread(void *arg)
+{
+       int fd = (int)(uintptr_t)arg;
+       char c;
+
+       T_SETUPBEGIN;
+       T_QUIET;
+       T_ASSERT_GT(fd, 0, "reader thread received valid fd");
+       T_SETUPEND;
+
+       for (;;) {
+               ssize_t rdsize = read(fd, &c, sizeof(c));
+               if (rdsize == -1) {
+                       if (errno == EINTR) {
+                               continue;
+                       } else if (errno == EBADF) {
+                               T_LOG("reader got an error (%s), shutting down", strerror(errno));
+                               return NULL;
+                       } else {
+                               T_ASSERT_POSIX_SUCCESS(rdsize, "read on PTY");
+                       }
+               } else if (rdsize == 0) {
+                       return NULL;
+               }
+       }
+
+       return NULL;
+}
+
+static void *
+writer_thread(void *arg)
+{
+       int fd = (int)(uintptr_t)arg;
+       char c[4096];
+       memset(c, 'a', sizeof(c));
+
+       T_SETUPBEGIN;
+       T_QUIET;
+       T_ASSERT_GT(fd, 0, "writer thread received valid fd");
+       T_SETUPEND;
+
+       while (writing) {
+               ssize_t wrsize = write(fd, c, sizeof(c));
+               if (wrsize == -1) {
+                       if (errno == EINTR) {
+                               continue;
+                       } else {
+                               T_LOG("writer got an error (%s), shutting down", strerror(errno));
+                               return NULL;
+                       }
+               }
+       }
+
+       return NULL;
+}
+
+#define ATTACH_ITERATIONS 10000
+
+static int attach_master, attach_slave;
+static pthread_t reader, writer;
+
+static void
+join_threads(void)
+{
+       close(attach_slave);
+       close(attach_master);
+       writing = false;
+       pthread_join(reader, NULL);
+       pthread_join(writer, NULL);
+}
+
+static void
+redispatch(dispatch_group_t grp, dispatch_source_type_t type, int fd)
+{
+       __block int iters = 0;
+
+       __block void (^redispatch_blk)(void) = Block_copy(^{
+               if (iters++ > ATTACH_ITERATIONS) {
+                       return;
+               } else if (iters == ATTACH_ITERATIONS) {
+                       dispatch_group_leave(grp);
+                       T_PASS("created %d %s sources on busy PTY", iters,
+                                       type == DISPATCH_SOURCE_TYPE_READ ? "read" : "write");
+               }
+
+               dispatch_source_t src = dispatch_source_create(
+                               type, (uintptr_t)fd, 0,
+                               dispatch_get_main_queue());
+
+               dispatch_source_set_event_handler(src, ^{
+                       dispatch_cancel(src);
+               });
+
+               dispatch_source_set_cancel_handler(src, redispatch_blk);
+
+               dispatch_activate(src);
+       });
+
+       dispatch_group_enter(grp);
+       dispatch_async(dispatch_get_main_queue(), redispatch_blk);
+}
+
+T_DECL(attach_while_tty_wakeups,
+               "try to attach knotes while a TTY is getting wakeups")
+{
+       dispatch_group_t grp = dispatch_group_create();
+
+       T_SETUPBEGIN;
+       T_ASSERT_POSIX_SUCCESS(openpty(&attach_master, &attach_slave, NULL, NULL,
+                       NULL), NULL);
+
+       T_ASSERT_POSIX_ZERO(pthread_create(&reader, NULL, reader_thread,
+                               (void *)(uintptr_t)attach_master), NULL);
+       T_ASSERT_POSIX_ZERO(pthread_create(&writer, NULL, writer_thread,
+                               (void *)(uintptr_t)attach_slave), NULL);
+       T_ATEND(join_threads);
+       T_SETUPEND;
+
+       redispatch(grp, DISPATCH_SOURCE_TYPE_READ, attach_master);
+       redispatch(grp, DISPATCH_SOURCE_TYPE_WRITE, attach_slave);
+
+       dispatch_group_notify(grp, dispatch_get_main_queue(), ^{
+               T_LOG("both reader and writer sources cleaned up");
+               T_END;
+       });
+
+       dispatch_main();
+}
+
+T_DECL(master_read_data_set,
+               "check that the data is set on read sources of master fds")
+{
+       int master = -1, slave = -1;
+
+       T_SETUPBEGIN;
+       T_ASSERT_POSIX_SUCCESS(openpty(&master, &slave, NULL, NULL, NULL), NULL);
+       T_QUIET; T_ASSERT_GE(master, 0, "master fd is valid");
+       T_QUIET; T_ASSERT_GE(slave, 0, "slave fd is valid");
+
+       dispatch_source_t src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ,
+                       (uintptr_t)master, 0, dispatch_get_main_queue());
+
+       dispatch_source_set_event_handler(src, ^{
+               unsigned long len = dispatch_source_get_data(src);
+               T_EXPECT_GT(len, (unsigned long)0,
+                               "the amount of data to read was set for the master source");
+               dispatch_cancel(src);
+       });
+
+       dispatch_source_set_cancel_handler(src, ^{
+               dispatch_release(src);
+               T_END;
+       });
+
+       dispatch_activate(src);
+       T_SETUPEND;
+
+       // Let's not fill up the TTY's buffer, otherwise write(2) will block.
+       char buf[512] = "";
+
+       int ret = 0;
+       while ((ret = write(slave, buf, sizeof(buf))) == -1 && errno == EAGAIN);
+       T_ASSERT_POSIX_SUCCESS(ret, "slave wrote data");
+
+       dispatch_main();
+}
diff --git a/tests/kevent_qos.c b/tests/kevent_qos.c
new file mode 100644 (file)
index 0000000..df021e3
--- /dev/null
@@ -0,0 +1,1767 @@
+/*
+ * kevent_qos: Tests Synchronous IPC QOS override.
+ */
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+
+#include <dispatch/dispatch.h>
+#include <pthread.h>
+#include <launch.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <mach/mach_voucher.h>
+#include <pthread/workqueue_private.h>
+#include <voucher/ipc_pthread_priority_types.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <mach/mach_port.h>
+#include <mach/mach_sync_ipc.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.kevent_qos"));
+
+#define ARRAYLEN(arr) (sizeof(arr) / sizeof(arr[0]))
+
+#define INTERMITTENT_TIMEOUT_SEC (3)
+#define RECV_TIMEOUT_SECS   (4)
+#define SEND_TIMEOUT_SECS   (6)
+#define HELPER_TIMEOUT_SECS (15)
+
+#define ENV_VAR_QOS (3)
+static const char *qos_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_BO",  "XNU_TEST_QOS_QO", "XNU_TEST_QOS_AO"};
+static const char *qos_name_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_NAME_BO", "XNU_TEST_QOS_NAME_QO", "XNU_TEST_QOS_NAME_AO"};
+
+#define ENV_VAR_FUNCTION (1)
+static const char *wl_function_name = "XNU_TEST_WL_FUNCTION";
+
+static qos_class_t g_expected_qos[ENV_VAR_QOS];
+static const char *g_expected_qos_name[ENV_VAR_QOS];
+
+#define ENV_QOS_BEFORE_OVERRIDE (0)
+#define ENV_QOS_QUEUE_OVERRIDE  (1)
+#define ENV_QOS_AFTER_OVERRIDE  (2)
+
+struct test_msg {
+       mach_msg_header_t header;
+       mach_msg_body_t body;
+       mach_msg_port_descriptor_t port_descriptor;
+       mach_msg_option_t opts;
+       mach_msg_priority_t qos;
+};
+
+#pragma mark pthread callbacks
+
+static void
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *));
+static void
+send(mach_port_t send_port, mach_port_t reply_port, mach_port_t msg_port, mach_msg_priority_t qos, mach_msg_option_t options);
+static void
+enable_kevent(uint64_t *workloop_id, unsigned long long port);
+static void
+populate_kevent(struct kevent_qos_s *kev, unsigned long long port);
+
+static void
+worker_cb(pthread_priority_t __unused priority)
+{
+       T_FAIL("a worker thread was created");
+}
+
+static void
+event_cb(void ** __unused events, int * __unused nevents)
+{
+       T_FAIL("a kevent routine was called instead of workloop");
+}
+
+static uint32_t
+get_user_promotion_basepri(void)
+{
+       mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT;
+       struct thread_policy_state thread_policy;
+       boolean_t get_default = FALSE;
+       mach_port_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE,
+                       (thread_policy_t)&thread_policy, &count, &get_default);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get");
+       return thread_policy.thps_user_promotion_basepri;
+}
+
+#define EXPECT_QOS_EQ(qos, ...) do { \
+               if ((qos) == QOS_CLASS_USER_INTERACTIVE) { \
+                       T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED, __VA_ARGS__); \
+                       T_EXPECT_EQ(47u, get_user_promotion_basepri(), __VA_ARGS__); \
+               } else { \
+                       T_EXPECT_EFFECTIVE_QOS_EQ(qos, __VA_ARGS__); \
+               } \
+       } while (0)
+
+#define EXPECT_TEST_MSG(_ke)  do { \
+               struct kevent_qos_s *ke = _ke; \
+               mach_msg_header_t *hdr = (mach_msg_header_t *)ke->ext[0]; \
+               T_ASSERT_NOTNULL(hdr, "has a message"); \
+               T_ASSERT_EQ(hdr->msgh_size, (uint32_t)sizeof(struct test_msg), "of the right size"); \
+               struct test_msg *tmsg = (struct test_msg *)hdr; \
+               if (tmsg->opts & MACH_SEND_PROPAGATE_QOS) { \
+                       T_EXPECT_EQ(tmsg->qos, ((uint32_t)(ke->ext[2] >> 32)), \
+                                       "propagation works"); \
+               } \
+       } while (0)
+
+/*
+ * Basic WL handler callback: it sleeps for n seconds and then checks the
+ * effective QoS of the servicer thread.
+ */
+static void
+workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist, int *events)
+{
+       T_LOG("Workloop handler workloop_cb_test_intransit called. "
+               "Will wait for %d seconds to make sure client enqueues the sync msg \n",
+               2 * RECV_TIMEOUT_SECS);
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       /* Wait for the client to send the high priority message to override the qos */
+       sleep(2 * RECV_TIMEOUT_SECS);
+
+       /* Skip the test if we can't check Qos */
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       /* The effective Qos should be the one expected after override */
+       EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       *events = 0;
+       T_END;
+}
+
+/*
+ * WL handler which checks whether the servicer thread has the correct QoS.
+ */
+static void
+workloop_cb_test_sync_send(uint64_t *workloop_id __unused, void **eventslist, int *events)
+{
+       T_LOG("Workloop handler workloop_cb_test_sync_send called");
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       /* The effective Qos should be the one expected after override */
+       EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       *events = 0;
+       T_END;
+}
+
+/*
+ * WL handler which checks the overridden QoS, then enables the knote and checks
+ * the QoS again to verify that doing so dropped the sync IPC override.
+ */
+static void
+workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+       unsigned override_priority;
+       unsigned reenable_priority;
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable called");
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       /* The effective Qos should be the one expected after override */
+       EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       /* Snapshot the current override priority */
+       override_priority = get_user_promotion_basepri();
+
+       /* Enable the knote */
+       struct kevent_qos_s *kev = *eventslist;
+       enable_kevent(workloop_id, kev->ident);
+
+       /*
+        * Check if the override has been dropped, check for priority instead of qos since
+        * there will be async qos push.
+        */
+       reenable_priority = get_user_promotion_basepri();
+       T_EXPECT_LT(reenable_priority, override_priority,
+               "thread's current override priority %d should be less than override priority prior to enabling knote %d",
+               reenable_priority, override_priority);
+
+       *events = 0;
+       T_END;
+}
+
+/*
+ * WL handler which receives the first message and checks the sync IPC override, then enables
+ * the knote, receives the 2nd message, and checks its sync IPC override.
+ */
+static int send_two_sync_handler_called = 0;
+static void
+workloop_cb_test_send_two_sync(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+       T_LOG("Workloop handler workloop_cb_test_send_two_sync called for %d time", send_two_sync_handler_called + 1);
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_LOG("Number of events received is %d\n", *events);
+
+       if (send_two_sync_handler_called == 0) {
+               /* The effective Qos should be the one expected after override */
+               EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+               /* Enable the knote to get 2nd message */
+               struct kevent_qos_s *kev = *eventslist;
+               uint64_t port = kev->ident;
+               populate_kevent(kev, port);
+               *events = 1;
+       } else {
+               EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]);
+               *events = 0;
+               T_END;
+       }
+       send_two_sync_handler_called++;
+}
+
+/*
+ * Checks the sync IPC override, then waits for the client to destroy the
+ * special reply port and checks whether that removes the sync IPC override.
+ */
+static boolean_t two_send_and_destroy_test_passed = FALSE;
+static int two_send_and_destroy_handler = 0;
+static void
+workloop_cb_test_two_send_and_destroy(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist __unused, int *events)
+{
+       T_LOG("Workloop handler workloop_cb_test_two_send_and_destroy called %d times", two_send_and_destroy_handler + 1);
+
+       EXPECT_TEST_MSG(*eventslist);
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       if (two_send_and_destroy_handler == 0) {
+               /* Sleep to make sure the mqueue gets full */
+               sleep(RECV_TIMEOUT_SECS);
+
+               /* The effective Qos should be the one expected after override */
+               EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+               sleep(SEND_TIMEOUT_SECS);
+
+               /* Special reply port should have been destroyed, check Qos again */
+               EXPECT_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
+                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]);
+
+               two_send_and_destroy_test_passed = TRUE;
+       } else {
+               if (two_send_and_destroy_test_passed) {
+                       T_END;
+               }
+       }
+
+       /* Enable the knote to get next message */
+       struct kevent_qos_s *kev = *eventslist;
+       uint64_t port = kev->ident;
+       populate_kevent(kev, port);
+       *events = 1;
+       two_send_and_destroy_handler++;
+       T_LOG("Handler returning \n");
+}
+
+static mach_port_type_t
+get_reply_port(struct kevent_qos_s *kev)
+{
+       mach_msg_header_t *hdr;
+       mach_port_t reply_port;
+       mach_port_type_t type;
+       kern_return_t kr;
+
+       hdr = (void*)kev->ext[0];
+       T_QUIET; T_ASSERT_NOTNULL(hdr, "msg hdr");
+
+       reply_port = hdr->msgh_remote_port;
+       T_QUIET;T_ASSERT_TRUE(MACH_PORT_VALID(reply_port), "reply port valid");
+       kr = mach_port_type(mach_task_self(), reply_port, &type);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_type");
+       T_QUIET; T_ASSERT_TRUE(type & MACH_PORT_TYPE_SEND_ONCE, "send once received");
+
+       return reply_port;
+}
+
+static void
+send_reply(mach_port_t reply_port)
+{
+       kern_return_t kr;
+
+       struct {
+               mach_msg_header_t header;
+       } send_msg = {
+           .header = {
+                   .msgh_remote_port = reply_port,
+                   .msgh_local_port  = MACH_PORT_NULL,
+                   .msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_MOVE_SEND_ONCE, 0),
+                   .msgh_id          = 0x100,
+                   .msgh_size        = sizeof(send_msg),
+               },
+       };
+
+       kr = mach_msg(&(send_msg.header),
+               MACH_SEND_MSG,
+               send_msg.header.msgh_size,
+               0,
+               MACH_PORT_NULL,
+               0,
+               0);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server mach_msg");
+}
+
+static void
+populate_kevent(struct kevent_qos_s *kev, unsigned long long port)
+{
+
+       memset(kev, 0, sizeof(struct kevent_qos_s));
+       kev->ident = port;
+       kev->filter = EVFILT_MACHPORT;
+       kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
+       kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
+               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0));
+       kev->data = 1;
+
+}
+
+static void
+enable_kevent(uint64_t *workloop_id, unsigned long long port)
+{
+       kern_return_t kr;
+       struct kevent_qos_s kev;
+
+       populate_kevent(&kev, port);
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+
+       kr = kevent_id(*workloop_id, &kev, 1, kev_err, 1, NULL,
+                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kr, "kevent_id");
+}
+
+/*
+ * WL handler which sends a msg to the client from the handler.
+ */
+static void
+workloop_cb_test_sync_send_reply(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_reply called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       /* send reply */
+       send_reply(get_reply_port(*eventslist));
+
+       *events = 0;
+}
+
+/*
+ * WL handler which deallocates the reply port.
+ */
+static void
+workloop_cb_test_sync_send_deallocate(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+       mach_port_t reply_port;
+       kern_return_t kr;
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_deallocate called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       reply_port = get_reply_port(*eventslist);
+
+       /* deallocate port */
+       kr = mach_port_deallocate(mach_task_self(), reply_port);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_deallocate");
+
+       *events = 0;
+
+       T_LOG("Handler returning \n");
+}
+
+
+/*
+ * WL handler which sends a msg to the client before enabling the event from the handler.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT(((*eventslist)->filter), EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+
+       /* send reply */
+       send_reply(get_reply_port(kev));
+
+       /* Enable the knote */
+       enable_kevent(workloop_id, kev->ident);
+
+       *events = 0;
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which sends a msg to the client before enabling the event from pthread.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent_pthread(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent_pthread called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+
+       /* send reply */
+       send_reply(get_reply_port(kev));
+
+       populate_kevent(kev, kev->ident);
+
+       *events = 1;
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which sends a msg to the client after reenabling the event.
+ */
+static void
+workloop_cb_test_sync_send_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+       mach_port_t reply_port = get_reply_port(*eventslist);
+
+       /* Enable the knote */
+       enable_kevent(workloop_id, kev->ident);
+
+       /* send reply */
+       send_reply(reply_port);
+
+       *events = 0;
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler that does nothing.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_do_nothing called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       /* do nothing */
+
+       *events = 0;
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler that returns the event to reenable.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing_kevent_pthread(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_do_nothing_kevent_pthread called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+       populate_kevent(kev, kev->ident);
+
+       *events = 1;
+
+       T_LOG("handler returning \n");
+}
+
+/*
+ * WL handler that exits.
+ */
+static void
+workloop_cb_test_sync_send_do_nothing_exit(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, __unused int *events)
+{
+
+       T_LOG("workloop handler workloop_cb_test_sync_send_do_nothing_exit called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       /* call exit */
+       exit(0);
+}
+
+/*
+ * WL handler which:
+ * on the first sync send, replies to the client and then reenables the kevent;
+ * on the second sync send, replies and then reenables the kevent.
+ */
+static void
+workloop_cb_test_sync_send_reply_kevent_reply_kevent(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
+{
+
+       T_LOG("Workloop handler workloop_cb_test_sync_send_reply_kevent_reply_kevent called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+
+       /* send reply */
+       send_reply(get_reply_port(kev));
+
+       populate_kevent(kev, kev->ident);
+
+       *events = 1;
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which:
+ * on the first sync send, reenables the kevent and then replies with a msg;
+ * on the second sync send, replies with a msg and then reenables the kevent.
+ */
+static int workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called = 0;
+static void
+workloop_cb_test_sync_send_kevent_reply_reply_kevent(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+       T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply_reply_kevent called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+       mach_port_t reply_port = get_reply_port(kev);
+
+       if (workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called == 0) {
+               workloop_cb_test_sync_send_kevent_reply_reply_kevent_handler_called = 1;
+
+               /* Enable the knote */
+               enable_kevent(workloop_id, kev->ident);
+
+               /* send reply */
+               send_reply(reply_port);
+
+               *events = 0;
+
+       } else {
+               /* send reply */
+               send_reply(reply_port);
+
+               /* Enable the knote */
+               enable_kevent(workloop_id, kev->ident);
+
+               *events = 0;
+       }
+
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which:
+ * on the first sync send, reenables the kevent and then replies with a msg;
+ * on the second sync send, reenables the kevent and then replies with a msg.
+ */
+static void
+workloop_cb_test_sync_send_kevent_reply_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+       T_LOG("workloop handler workloop_cb_test_sync_send_kevent_reply_kevent_reply called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+       mach_port_t reply_port = get_reply_port(kev);
+
+       /* Enable the knote */
+       enable_kevent(workloop_id, kev->ident);
+
+       /* send reply */
+       send_reply(reply_port);
+
+       *events = 0;
+       T_LOG("Handler returning \n");
+}
+
+/*
+ * WL handler which:
+ * on the first sync send, replies with a msg and then reenables the kevent;
+ * on the second sync send, reenables the kevent and then replies with a msg.
+ */
+static int workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called = 0;
+static void
+workloop_cb_test_sync_send_reply_kevent_kevent_reply(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
+{
+       T_LOG("workloop handler workloop_cb_test_sync_send_reply_kevent_kevent_reply called");
+
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       T_QUIET; T_ASSERT_EQ_INT(*events, 1, "events received");
+       T_QUIET; T_ASSERT_EQ_INT((*eventslist)->filter, EVFILT_MACHPORT, "received EVFILT_MACHPORT");
+
+       struct kevent_qos_s *kev = *eventslist;
+       mach_port_t reply_port = get_reply_port(kev);
+
+       if (workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called == 0) {
+               workloop_cb_test_sync_send_reply_kevent_kevent_reply_handler_called = 1;
+
+               /* send reply */
+               send_reply(reply_port);
+
+               populate_kevent(kev, kev->ident);
+
+               *events = 1;
+
+       } else {
+
+               /* Enable the knote */
+               enable_kevent(workloop_id, kev->ident);
+               /* send reply */
+               send_reply(reply_port);
+
+               *events = 0;
+       }
+
+       T_LOG("Handler returning \n");
+}
+#pragma mark Mach receive
+
+#define KEVENT_QOS_SERVICE_NAME "com.apple.xnu.test.kevent_qos"
+
+static mach_port_t
+get_server_port(void)
+{
+       mach_port_t port;
+       kern_return_t kr = bootstrap_check_in(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in");
+       return port;
+}
+
+static void
+env_set_qos(char **env, qos_class_t qos[], const char *qos_name[], const char *wl_function)
+{
+       int i;
+       char *qos_str, *qos_name_str;
+       for (i = 0; i < ENV_VAR_QOS; i++) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&qos_str, "%s=%d", qos_env[i] , qos[i]),
+                       NULL);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(
+                       asprintf(&qos_name_str, "%s=%s", qos_name_env[i], qos_name[i]), NULL);
+               env[2 * i] = qos_str;
+               env[2 * i + 1] = qos_name_str;
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env[2 * i], "%s=%s", wl_function_name, wl_function),
+                       NULL);
+       env[2 * i + 1] = NULL;
+}
+
+static void
+environ_get_qos(qos_class_t qos[], const char *qos_name[], const char **wl_function)
+{
+       char *qos_str;
+       char *qos_end;
+       int i;
+
+       for (i = 0; i < ENV_VAR_QOS; i++) {
+               qos_str = getenv(qos_env[i]);
+               T_QUIET; T_ASSERT_NOTNULL(qos_str, "getenv(%s)", qos_env[i]);
+
+               unsigned long qos_l = strtoul(qos_str, &qos_end, 10);
+               T_QUIET; T_ASSERT_EQ(*qos_end, '\0', "getenv(%s) = '%s' should be an "
+                               "integer", qos_env[i], qos_str);
+
+               T_QUIET; T_ASSERT_LT(qos_l, (unsigned long)100, "getenv(%s) = '%s' should "
+                               "be less than 100", qos_env[i], qos_str);
+
+               qos[i] = (qos_class_t)qos_l;
+               qos_name[i] = getenv(qos_name_env[i]);
+               T_QUIET; T_ASSERT_NOTNULL(qos_name[i], "getenv(%s)", qos_name_env[i]);
+       }
+       *wl_function = getenv(wl_function_name);
+       T_QUIET; T_ASSERT_NOTNULL(*wl_function, "getenv(%s)", wl_function_name);
+}
+
+static mach_voucher_t
+create_pthpriority_voucher(mach_msg_priority_t qos)
+{
+       char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)];
+
+       mach_voucher_t voucher = MACH_PORT_NULL;
+       kern_return_t ret;
+       ipc_pthread_priority_value_t ipc_pthread_priority_value =
+                       (ipc_pthread_priority_value_t)qos;
+
+       mach_voucher_attr_raw_recipe_array_t recipes;
+       mach_voucher_attr_raw_recipe_size_t recipe_size = 0;
+       mach_voucher_attr_recipe_t recipe =
+               (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size];
+
+       recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY;
+       recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE;
+       recipe->previous_voucher = MACH_VOUCHER_NULL;
+       memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value));
+       recipe->content_size = sizeof(ipc_pthread_priority_value_t);
+       recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size;
+
+       recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0];
+
+       ret = host_create_mach_voucher(mach_host_self(),
+                               recipes,
+                               recipe_size,
+                               &voucher);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher");
+       return voucher;
+}
+
+static void
+send(
+       mach_port_t send_port,
+       mach_port_t reply_port,
+       mach_port_t msg_port,
+       mach_msg_priority_t qos,
+       mach_msg_option_t options)
+{
+       kern_return_t ret = 0;
+
+       struct test_msg send_msg = {
+           .header = {
+                   .msgh_remote_port = send_port,
+                   .msgh_local_port  = reply_port,
+                   .msgh_bits        = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND,
+                                       reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0,
+                                       MACH_MSG_TYPE_MOVE_SEND,
+                                       MACH_MSGH_BITS_COMPLEX),
+                   .msgh_id          = 0x100,
+                   .msgh_size        = sizeof(send_msg),
+               },
+           .body = {
+                   .msgh_descriptor_count = 1,
+               },
+           .port_descriptor = {
+                   .name        = msg_port,
+                       .disposition = MACH_MSG_TYPE_MOVE_RECEIVE,
+                       .type        = MACH_MSG_PORT_DESCRIPTOR,
+               },
+               .opts = options,
+       };
+
+       if (msg_port == MACH_PORT_NULL) {
+               send_msg.body.msgh_descriptor_count = 0;
+       }
+
+       if ((options & MACH_SEND_PROPAGATE_QOS) == 0) {
+               send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos);
+               send_msg.qos = qos;
+       } else {
+               qos_class_t qc;
+               int relpri;
+               pthread_get_qos_class_np(pthread_self(), &qc, &relpri);
+               send_msg.qos = (uint32_t)_pthread_qos_class_encode(qc, relpri, 0);
+       }
+
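+       /*
+        * Only add MACH_SEND_SYNC_OVERRIDE when a reply port is supplied, i.e.
+        * when this is a synchronous send that should override the receiver.
+        */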
+       ret = mach_msg(&(send_msg.header),
+               MACH_SEND_MSG |
+               MACH_SEND_TIMEOUT |
+               MACH_SEND_OVERRIDE |
+               ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options),
+               send_msg.header.msgh_size,
+               0,
+               MACH_PORT_NULL,
+               10000,
+               0);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg");
+}
+
+static kern_return_t
+receive(
+       mach_port_t rcv_port,
+       mach_port_t notify_port)
+{
+       kern_return_t ret = 0;
+
+       struct test_msg rcv_msg = {
+           .header = {
+                   .msgh_remote_port = MACH_PORT_NULL,
+                   .msgh_local_port  = rcv_port,
+                   .msgh_size        = sizeof(rcv_msg),
+               },
+       };
+
+       T_LOG("Client: Starting sync receive\n");
+
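+       /*
+        * MACH_RCV_SYNC_WAIT passes notify_port as the port this thread is
+        * synchronously waiting on, so the kernel can link the special reply
+        * port to it.
+        */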
+       ret = mach_msg(&(rcv_msg.header),
+               MACH_RCV_MSG |
+               MACH_RCV_TIMEOUT |
+               MACH_RCV_SYNC_WAIT,
+               0,
+               rcv_msg.header.msgh_size,
+               rcv_port,
+               SEND_TIMEOUT_SECS * 1000,
+               notify_port);
+
+       return ret;
+}
+
+T_HELPER_DECL(qos_get_special_reply_port,
+               "Test get_special_reply_port and its corner cases.")
+{
+       mach_port_t special_reply_port;
+       mach_port_t new_special_reply_port;
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       new_special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(new_special_reply_port), "get_thread_special_reply_port");
+
+       mach_port_destroy(mach_task_self(), special_reply_port);
+       mach_port_destroy(mach_task_self(), new_special_reply_port);
+
+       new_special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(new_special_reply_port), "get_thread_special_reply_port");
+
+       T_END;
+}
+
+static void *
+qos_client_send_to_intransit(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t msg_port;
+       mach_port_t special_reply_port;
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       /* Create a rcv right to send in a msg */
+       kr = mach_port_allocate(mach_task_self(),
+                       MACH_PORT_RIGHT_RECEIVE,
+                       &msg_port);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate");
+
+       kr = mach_port_insert_right(mach_task_self(),
+                       msg_port,
+                       msg_port,
+                       MACH_MSG_TYPE_MAKE_SEND);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right");
+
+       /* Send an empty msg on the port to fire the WL thread */
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+
+       /* Sleep 3 seconds for the server to start */
+       sleep(3);
+
+       /* Send the message with msg port as in-transit port, this msg will not be dequeued */
+       send(qos_send_port, MACH_PORT_NULL, msg_port,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+
+       /* Send 5 messages to msg port to make sure the port is full */
+       for (int i = 0; i < 5; i++) {
+               send(msg_port, MACH_PORT_NULL, MACH_PORT_NULL,
+                       (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+       }
+
+       T_LOG("Sent 5 msgs, now trying to send sync ipc message, which will block with a timeout\n");
+       /* Send the message to the in-transit port, it should block and override the rcv's workloop */
+       send(msg_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+       T_LOG("Client done sending messages, now waiting for server to end the test");
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_to_intransit_with_thr_pri,
+               "Send synchronous messages from a thread at a given priority to an in-transit port")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_to_intransit);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static void
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
+{
+       qos_class_t qos_thread;
+       pthread_t thread;
+       pthread_attr_t attr;
+       int ret;
+
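+       /*
+        * Opt the process into the UI-focal role first, so that the high QoS
+        * classes requested below are not clamped for a non-UI test process.
+        */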
+       ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL);
+       if (ret != 0) {
+               T_LOG("set priority failed\n");
+       }
+
+       pthread_attr_init(&attr);
+       pthread_attr_set_qos_class_np(&attr, qos, 0);
+       pthread_create(&thread, &attr, function, NULL);
+
+       T_LOG("pthread created\n");
+       pthread_get_qos_class_np(thread, &qos_thread, NULL);
+       T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL);
+}
+
+static void *
+qos_send_and_sync_rcv(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       T_LOG("Client: from created thread\n");
+       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       "pthread QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       /* enqueue two messages to make sure that mqueue is not empty */
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0);
+
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0), 0);
+
+       sleep(SEND_TIMEOUT_SECS);
+
+       /* sync wait on msg port */
+       receive(special_reply_port, qos_send_port);
+
+       T_LOG("Client done doing sync rcv, now waiting for server to end the test");
+       sleep(SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_sync_and_sync_rcv,
+               "Send messages and synchronously wait for rcv")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_send_and_sync_rcv);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static void *
+qos_client_send_sync_msg_and_test_link(void *arg)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+       boolean_t in_effect = FALSE;
+       kern_return_t kr;
+       unsigned long expected_result = (unsigned long) arg;
+
+       kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       /* start monitoring sync ipc link */
+       kr = mach_sync_ipc_link_monitoring_start(&special_reply_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_start");
+
+       /* Send the message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+
+       /*
+        * wait for the reply
+        * some tests do not send a msg back so the receive
+        * might fail
+        */
+       receive(special_reply_port, qos_send_port);
+
+       /* stop monitoring link */
+       kr = mach_sync_ipc_link_monitoring_stop(special_reply_port, &in_effect);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_stop");
+
+       if (!in_effect)
+               T_LOG("Link was broken");
+       else
+               T_LOG("Link correct");
+
+       if (expected_result == 1)
+               T_ASSERT_TRUE(in_effect, "special reply port link after rcv");
+       else
+               T_ASSERT_FALSE(in_effect, "special reply port link after rcv");
+       T_END;
+}
+
+static void *
+qos_client_send_2sync_msg_and_test_link(void *arg)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+       boolean_t in_effect = FALSE;
+       kern_return_t kr;
+       unsigned long expected_result = (unsigned long) arg;
+
+       kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       /* start monitoring sync ipc link */
+       kr = mach_sync_ipc_link_monitoring_start(&special_reply_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_start");
+
+       /* Send the first message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+
+       /* wait for the reply */
+       kr = receive(special_reply_port, qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive");
+
+       /* Send the second message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+
+       /* wait for the reply */
+       kr = receive(special_reply_port, qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "receive");
+
+       /* stop monitoring link */
+       kr = mach_sync_ipc_link_monitoring_stop(special_reply_port, &in_effect);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_sync_ipc_link_monitoring_stop");
+
+       if (!in_effect)
+               T_LOG("Link was broken");
+       else
+               T_LOG("Link correct");
+
+       if (expected_result == 1)
+               T_ASSERT_TRUE(in_effect, "special reply port link after rcv");
+       else
+               T_ASSERT_FALSE(in_effect, "special reply port link after rcv");
+       T_END;
+}
+
+T_HELPER_DECL(qos_client_send_sync_msg_with_link_check_correct_server,
+               "Send sync message, wait for reply and check sync ipc link")
+{
+       pthread_t thread;
+       pthread_attr_t attr;
+       unsigned long expected_result = 1;
+
+       pthread_attr_init(&attr);
+       pthread_create(&thread, &attr, qos_client_send_sync_msg_and_test_link, (void *)expected_result);
+
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+T_HELPER_DECL(qos_client_send_sync_msg_with_link_check_incorrect_server,
+               "Send sync message, wait for reply and check sync ipc link")
+{
+       pthread_t thread;
+       pthread_attr_t attr;
+       unsigned long expected_result = 0;
+
+       pthread_attr_init(&attr);
+       pthread_create(&thread, &attr, qos_client_send_sync_msg_and_test_link, (void *)expected_result);
+
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+T_HELPER_DECL(qos_client_send_2sync_msg_with_link_check_correct_server,
+               "Send sync message, wait for reply and check sync ipc link")
+{
+       pthread_t thread;
+       pthread_attr_t attr;
+       unsigned long expected_result = 1;
+
+       pthread_attr_init(&attr);
+       pthread_create(&thread, &attr, qos_client_send_2sync_msg_and_test_link, (void *)expected_result);
+
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+T_HELPER_DECL(qos_client_send_2sync_msg_with_link_check_incorrect_server,
+               "Send sync message, wait for reply and check sync ipc link")
+{
+       pthread_t thread;
+       pthread_attr_t attr;
+       unsigned long expected_result = 0;
+
+       pthread_attr_init(&attr);
+       pthread_create(&thread, &attr, qos_client_send_2sync_msg_and_test_link, (void *)expected_result);
+
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static void *
+qos_client_send_sync_msg(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       /* Send the message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+
+       /* wait for the reply */
+       receive(special_reply_port, qos_send_port);
+
+       T_LOG("Client done sending messages, now waiting for server to end the test");
+       sleep(2 * SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_sync_msg_with_pri,
+               "Send sync message and wait for reply")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_sync_msg);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static void *
+qos_client_send_two_sync_msg_high_qos(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       /* Send the message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0), 0);
+
+       /* wait for the reply */
+       receive(special_reply_port, qos_send_port);
+
+       T_LOG("Client done sending messages, now waiting for server to end the test");
+       sleep(SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+static void *
+qos_client_send_two_sync_msg_low_qos(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       /* Send the message to msg port */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+
+       /* wait for the reply */
+       receive(special_reply_port, qos_send_port);
+
+       T_LOG("Client done sending messages, now waiting for server to end the test");
+       sleep(SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_two_sync_msg_with_thr_pri,
+               "Send sync msgs from 2 threads at a given thread priority")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_send_two_sync_msg_high_qos);
+       sleep(INTERMITTENT_TIMEOUT_SEC);
+       thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], qos_client_send_two_sync_msg_low_qos);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static mach_port_t other_thread_reply_port = MACH_PORT_NULL;
+static void *
+qos_client_destroy_other_threads_port(void *arg __unused)
+{
+       T_LOG("Waiting 6 seconds before destroying other thread's reply port");
+       sleep(SEND_TIMEOUT_SECS);
+
+       T_LOG("Destroying other thread's special reply port ");
+       mach_port_destroy(mach_task_self(), other_thread_reply_port);
+
+       T_LOG("Other thread done destroying ");
+       sleep(3 * SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+static void *
+qos_client_create_sepcial_reply_and_spawn_thread(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "get_thread_special_reply_port");
+
+       other_thread_reply_port = special_reply_port;
+
+       /* Send an async message */
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+
+       /* Send the sync ipc message */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0), 0);
+
+       /* Create a new thread that will destroy this thread's special reply port */
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_destroy_other_threads_port);
+
+       /* Client starting to receive message */
+       receive(special_reply_port, qos_send_port);
+
+       sleep(3 * SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_two_msg_and_destroy,
+               "Send a message with another thread's special reply port while that thread destroys the port")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_client_create_sepcial_reply_and_spawn_thread);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static mach_port_t send_complex_connection_port = MACH_PORT_NULL;
+
+static void *
+qos_client_send_complex_msg_to_service_port(void *arg __unused)
+{
+       mach_port_t svc_port, tsr_port, conn_port;
+       kern_return_t kr;
+
+       kr = bootstrap_look_up(bootstrap_port, KEVENT_QOS_SERVICE_NAME, &svc_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       tsr_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(tsr_port), "get_thread_special_reply_port");
+
+       conn_port = send_complex_connection_port;
+
+       T_LOG("Sending to the service port with a sync IPC");
+       send(svc_port, tsr_port, conn_port,
+                       (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0),
+                       MACH_SEND_PROPAGATE_QOS);
+
+       receive(tsr_port, svc_port);
+
+       sleep(3 * SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+static void *
+qos_client_send_to_connection_then_service_port(void *arg __unused)
+{
+       mach_port_t tsr_port, conn_port;
+       mach_port_options_t opts = {
+               .flags = MPO_INSERT_SEND_RIGHT,
+       };
+       kern_return_t kr;
+
+       kr = mach_port_construct(mach_task_self(), &opts, 0ull, &conn_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_port_construct");
+       send_complex_connection_port = conn_port;
+
+       tsr_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(tsr_port), "get_thread_special_reply_port");
+
+       T_LOG("Sending to the connection port with a sync IPC");
+       send(conn_port, tsr_port, MACH_PORT_NULL,
+                       (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0),
+                       MACH_SEND_PROPAGATE_QOS);
+
+       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
+                       qos_client_send_complex_msg_to_service_port);
+
+       receive(tsr_port, conn_port);
+
+       sleep(3 * SEND_TIMEOUT_SECS);
+
+       T_ASSERT_FAIL("client timed out");
+       return NULL;
+}
+
+T_HELPER_DECL(qos_client_send_complex_msg_with_pri,
+               "Send a message with several ports causing links")
+{
+       thread_create_at_qos(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
+                       qos_client_send_to_connection_then_service_port);
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+static void
+run_client_server(const char *server_name, const char *client_name, qos_class_t qos[],
+               const char *qos_name[], const char *wl_function)
+{
+       char *env[2 * ENV_VAR_QOS + ENV_VAR_FUNCTION + 1];
+       env_set_qos(env, qos, qos_name, wl_function);
+
+       for (int i = 0; i < ENV_VAR_QOS; i++) {
+               g_expected_qos[i] = qos[i];
+               g_expected_qos_name[i] = qos_name[i];
+       }
+
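+       /*
+        * Launch the server under launchd with the test plist (which provides
+        * the service port the clients look up) and fork the client helper.
+        */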
+       dt_helper_t helpers[] = {
+               dt_launchd_helper_env("com.apple.xnu.test.kevent_qos.plist",
+                               server_name, env),
+               dt_fork_helper(client_name)
+       };
+       dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS);
+}
+
+#pragma mark Mach receive - kevent_qos
+
+static void
+expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[], const char *wl_function)
+{
+       int r;
+
+       /* Qos expected by workloop thread */
+       for (int i = 0; i < ENV_VAR_QOS; i++) {
+               g_expected_qos[i] = qos[i];
+               g_expected_qos_name[i] = qos_name[i];
+       }
+
+       if (strcmp(wl_function, "workloop_cb_test_intransit") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_send_two_sync, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_two_send_and_destroy") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_two_send_and_destroy, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_deallocate") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_deallocate, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent_pthread") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_pthread, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing_kevent_pthread") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing_kevent_pthread, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_do_nothing_exit") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_do_nothing_exit, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent_reply_kevent") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_reply_kevent, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply_reply_kevent") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply_reply_kevent, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_kevent_reply_kevent_reply") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_kevent_reply_kevent_reply, 0, 0), NULL);
+       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_reply_kevent_kevent_reply") == 0) {
+               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+                       worker_cb, event_cb,
+                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_reply_kevent_kevent_reply, 0, 0), NULL);
+       } else {
+               T_ASSERT_FAIL("no workloop function specified \n");
+       }
+
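+       /*
+        * Register a MACHPORT knote on workloop 25: MACH_RCV_MSG lets the
+        * kevent receive the message itself, and .qos carries the expected
+        * queue-override QoS.
+        */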
+       struct kevent_qos_s kev[] = {{
+               .ident = port,
+               .filter = EVFILT_MACHPORT,
+               .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+               .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
+                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)),
+               .data = 1,
+               .qos = (int32_t)_pthread_qos_class_encode(qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0)
+       }};
+
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+
+       /* Setup workloop for mach msg rcv */
+       r = kevent_id(25, kev, 1, kev_err, 1, NULL,
+                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
+       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
+       sleep(HELPER_TIMEOUT_SECS);
+}
+
+T_HELPER_DECL(server_kevent_id,
+               "Reply with the QoS that a dispatch source event handler ran with")
+{
+       qos_class_t qos[ENV_VAR_QOS];
+       const char *qos_name[ENV_VAR_QOS];
+       const char *wl_function;
+       environ_get_qos(qos, qos_name, &wl_function);
+
+       expect_kevent_id_recv(get_server_port(), qos, qos_name, wl_function);
+       sleep(HELPER_TIMEOUT_SECS);
+       T_ASSERT_FAIL("should receive a message within %d seconds",
+                       RECV_TIMEOUT_SECS);
+}
+
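+/*
+ * TEST_QOS generates one T_DECL per scenario: it pairs a server and a client
+ * helper with the three expected QoS values (before override, queue override,
+ * after override) and their printable names.
+ */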
+#define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \
+       T_DECL(server_kevent_id_##name, \
+                       "Event delivery at " qos_ao_name " QoS using a kevent_id", \
+                       T_META_ASROOT(YES)) \
+       { \
+               qos_class_t qos_array[ENV_VAR_QOS] = {qos_bo, qos_qo, qos_ao};  \
+               const char *qos_name_array[ENV_VAR_QOS] = {qos_bo_name, qos_qo_name, qos_ao_name}; \
+               run_client_server(server_name, client_name, qos_array, qos_name_array, wl_function_name); \
+       }
+/*
+ * Test 1: Test special reply port SPI
+ *
+ * Create the thread special reply port and check that subsequent calls also
+ * return a valid port, including after the existing port has been destroyed.
+ */
+TEST_QOS("server_kevent_id", "qos_get_special_reply_port", special_reply_port, "workloop_cb_test_intransit",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+/*
+ * Test 2: Test sync ipc send to an in-transit port
+ *
+ * Send a sync ipc message (at IN qos) to an in-transit port enqueued in a port
+ * attached to a workloop. Test that the servicer of the workloop gets
+ * sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_to_intransit_with_thr_pri", transit_IN, "workloop_cb_test_intransit",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INITIATED, "user initiated")
+
+/*
+ * Test 3: Test sync ipc send to an in-transit port
+ *
+ * Send a sync ipc message (at UI qos) to an in-transit port enqueued in a port
+ * attached to a workloop. Test that the servicer of the workloop gets
+ * sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_to_intransit_with_thr_pri", transit_UI, "workloop_cb_test_intransit",
+       QOS_CLASS_USER_INITIATED, "user initiated",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 4: Test starting a sync rcv overrides the servicer
+ *
+ * Send an async message to a port and then start waiting on
+ * the port in mach msg rcv (at IN qos) with sync wait and test if the
+ * servicer of the workloop gets sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_IN, "workloop_cb_test_intransit",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INITIATED, "user initiated")
+
+/*
+ * Test 5: Test starting a sync rcv overrides the servicer
+ *
+ * Send an async message to a port and then start waiting on
+ * the port in mach msg rcv (at UI qos) with sync wait and test if the
+ * servicer of the workloop gets sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_UI, "workloop_cb_test_intransit",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 6: test sending sync ipc message (at IN qos) to port will override the servicer
+ *
+ * Send a message with sync ipc override to a port and check if the servicer
+ * of the workloop on other side gets sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_IN, "workloop_cb_test_sync_send",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INITIATED, "user initiated")
+
+/*
+ * Test 7: test sending sync ipc message (at UI qos) to port will override the servicer
+ *
+ * Send a message with sync ipc override to a port and check if the servicer
+ * of the workloop on other side gets sync ipc override.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_UI, "workloop_cb_test_sync_send",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 8: test enabling a knote in workloop handler will drop the sync ipc override of delivered message
+ *
+ * Send a sync ipc message to port and check the servicer of the workloop
+ * on other side gets sync ipc override and once the handler enables the knote,
+ * that sync ipc override is dropped.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_pri", send_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 9: test returning to begin processing drops sync ipc override of delivered message
+ *
+ * Send a sync ipc message and check if enabling the knote clears the override of
+ * the delivered message, but should still have the override of an enqueued message.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_two_sync_msg_with_thr_pri", send_two_sync_UI, "workloop_cb_test_send_two_sync",
+       QOS_CLASS_BACKGROUND, "background",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 10: test destroying the special reply port drops the override
+ *
+ * Send an async message and a sync ipc message; the workloop handler
+ * should get a sync ipc override, now test if destroying the special
+ * reply port drops the sync ipc override on the servicer.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_and_destroy, "workloop_cb_test_two_send_and_destroy",
+       QOS_CLASS_BACKGROUND, "background",
+       QOS_CLASS_MAINTENANCE, "maintenance",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+/*
+ * Test 11: test sending two ports with chaining
+ *
+ * Send a sync IPC to a connection port, which itself is embedded in a message
+ * sent as a sync IPC to a service port.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_complex_msg_with_pri", send_complex_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable",
+       QOS_CLASS_USER_INITIATED, "user initiated",
+       QOS_CLASS_USER_INITIATED, "user initiated",
+       QOS_CLASS_USER_INTERACTIVE, "user interactive with 47 basepri promotion")
+
+/*
+ * Test 12 - 19
+ *
+ * Test single sync ipc link with server that breaks/preserves the link in different ways.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_s, "workloop_cb_test_sync_send_reply",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_d, "workloop_cb_test_sync_send_deallocate",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_sk, "workloop_cb_test_sync_send_reply_kevent",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_skp, "workloop_cb_test_sync_send_reply_kevent_pthread",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_incorrect_server", send_sync_link_incorrect_server_ks, "workloop_cb_test_sync_send_kevent_reply",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_n, "workloop_cb_test_sync_send_do_nothing",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_incorrect_server", send_sync_link_incorrect_server_kp, "workloop_cb_test_sync_send_do_nothing_kevent_pthread",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_sync_msg_with_link_check_correct_server", send_sync_link_correct_server_e, "workloop_cb_test_sync_send_do_nothing_exit",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+/*
+ * Test 20 - 23
+ *
+ * Test sequential sync ipc link with server that breaks/preserves the link.
+ */
+TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_correct_server", send_2sync_link_correct_server_sksk, "workloop_cb_test_sync_send_reply_kevent_reply_kevent",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_kssk, "workloop_cb_test_sync_send_kevent_reply_reply_kevent",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_ksks, "workloop_cb_test_sync_send_kevent_reply_kevent_reply",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
+
+TEST_QOS("server_kevent_id", "qos_client_send_2sync_msg_with_link_check_incorrect_server", send_2sync_link_incorrect_server_skks, "workloop_cb_test_sync_send_reply_kevent_kevent_reply",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default",
+       QOS_CLASS_DEFAULT, "default")
diff --git a/tests/kpc.c b/tests/kpc.c
new file mode 100644 (file)
index 0000000..5200950
--- /dev/null
@@ -0,0 +1,68 @@
+#include <darwintest.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#include <kperf/kpc.h>
+
+T_DECL(fixed_counters,
+               "test that fixed counters return monotonically increasing values",
+               T_META_ASROOT(YES))
+{
+       T_SKIP("unimplemented");
+}
+
+T_DECL(fixed_thread_counters,
+               "test that fixed thread counters return monotonically increasing values",
+               T_META_ASROOT(YES))
+{
+       int err;
+       uint32_t ctrs_cnt;
+       uint64_t *ctrs_a;
+       uint64_t *ctrs_b;
+
+       T_SETUPBEGIN;
+
+       ctrs_cnt = kpc_get_counter_count(KPC_CLASS_FIXED_MASK);
+       if (ctrs_cnt == 0) {
+               T_SKIP("no fixed counters available");
+       }
+       T_LOG("device has %" PRIu32 " fixed counters", ctrs_cnt);
+
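+       /* Take ownership of the counters, then enable fixed and per-thread counting. */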
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kpc_force_all_ctrs_set(1), NULL);
+       T_ASSERT_POSIX_SUCCESS(kpc_set_counting(KPC_CLASS_FIXED_MASK),
+                       "kpc_set_counting");
+       T_ASSERT_POSIX_SUCCESS(kpc_set_thread_counting(KPC_CLASS_FIXED_MASK),
+                       "kpc_set_thread_counting");
+
+       T_SETUPEND;
+
+       ctrs_a = malloc(ctrs_cnt * sizeof(uint64_t));
+       T_QUIET; T_ASSERT_NOTNULL(ctrs_a, NULL);
+
+       err = kpc_get_thread_counters(0, ctrs_cnt, ctrs_a);
+       T_ASSERT_POSIX_SUCCESS(err, "kpc_get_thread_counters");
+
+       for (uint32_t i = 0; i < ctrs_cnt; i++) {
+               T_LOG("checking counter %d with value %" PRIu64 " > 0", i, ctrs_a[i]);
+               T_QUIET;
+               T_EXPECT_GT(ctrs_a[i], UINT64_C(0), "counter %d is non-zero", i);
+       }
+
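+       /* Read the fixed counters again; each one should have advanced. */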
+       ctrs_b = malloc(ctrs_cnt * sizeof(uint64_t));
+       T_QUIET; T_ASSERT_NOTNULL(ctrs_b, NULL);
+
+       err = kpc_get_thread_counters(0, ctrs_cnt, ctrs_b);
+       T_ASSERT_POSIX_SUCCESS(err, "kpc_get_thread_counters");
+
+       for (uint32_t i = 0; i < ctrs_cnt; i++) {
+               T_LOG("checking counter %d with value %" PRIu64
+                               " > previous value %" PRIu64, i, ctrs_b[i], ctrs_a[i]);
+               T_QUIET;
+               T_EXPECT_GT(ctrs_b[i], UINT64_C(0), "counter %d is non-zero", i);
+               T_QUIET; T_EXPECT_LT(ctrs_a[i], ctrs_b[i],
+                               "counter %d is increasing", i);
+       }
+
+       free(ctrs_a);
+       free(ctrs_b);
+}
diff --git a/tests/kperf.c b/tests/kperf.c
new file mode 100644 (file)
index 0000000..81e3e4d
--- /dev/null
@@ -0,0 +1,558 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif /* defined(T_NAMESPACE) */
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <dispatch/dispatch.h>
+#include <inttypes.h>
+#include <ktrace/session.h>
+#include <ktrace/private.h>
+#include <System/sys/kdebug.h>
+#include <kperf/kperf.h>
+#include <kperfdata/kpdecode.h>
+#include <os/assumes.h>
+#include <stdint.h>
+#include <sys/sysctl.h>
+
+#include "kperf_helpers.h"
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.kperf"),
+               T_META_CHECK_LEAKS(false));
+
+#define MAX_CPUS    64
+#define MAX_THREADS 64
+
+volatile static bool running_threads = true;
+
+static void *
+spinning_thread(void *semp)
+{
+       T_QUIET;
+       T_ASSERT_NOTNULL(semp, "semaphore passed to thread should not be NULL");
+       dispatch_semaphore_signal(*(dispatch_semaphore_t *)semp);
+
+       while (running_threads);
+       return NULL;
+}
+
+#define PERF_STK_KHDR  UINT32_C(0x25020014)
+#define PERF_STK_UHDR  UINT32_C(0x25020018)
+#define PERF_TMR_FIRE  KDBG_EVENTID(DBG_PERF, 3, 0)
+#define PERF_TMR_HNDLR KDBG_EVENTID(DBG_PERF, 3, 2)
+#define PERF_TMR_PEND  KDBG_EVENTID(DBG_PERF, 3, 3)
+#define PERF_TMR_SKIP  KDBG_EVENTID(DBG_PERF, 3, 4)
+
+#define SCHED_HANDOFF KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, \
+               MACH_STACK_HANDOFF)
+#define SCHED_SWITCH  KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, MACH_SCHED)
+#define SCHED_IDLE    KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, MACH_IDLE)
+
+#define MP_CPUS_CALL UINT32_C(0x1900004)
+
+#define DISPATCH_AFTER_EVENT UINT32_C(0xfefffffc)
+#define TIMEOUT_SECS 10
+
+#define TIMER_PERIOD_NS (1 * NSEC_PER_MSEC)
+
+static void
+reset_ktrace(void)
+{
+       kperf_reset();
+}
+
+/*
+ * Ensure that kperf is correctly IPIing CPUs that are actively scheduling by
+ * bringing up threads and ensuring that threads on-core are sampled by each
+ * timer fire.
+ */
+
+T_DECL(ipi_active_cpus,
+               "make sure that kperf IPIs all active CPUs",
+               T_META_ASROOT(true))
+{
+       int ncpus = dt_ncpu();
+       T_QUIET;
+       T_ASSERT_LT(ncpus, MAX_CPUS,
+                       "only supports up to %d CPUs", MAX_CPUS);
+       T_LOG("found %d CPUs", ncpus);
+
+       int nthreads = ncpus - 1;
+       T_QUIET;
+       T_ASSERT_LT(nthreads, MAX_THREADS,
+                       "only supports up to %d threads", MAX_THREADS);
+
+       static pthread_t threads[MAX_THREADS];
+
+       /*
+        * TODO options to write this to a file and reinterpret a file...
+        */
+
+       /*
+        * Create threads to bring up all of the CPUs.
+        */
+
+       dispatch_semaphore_t thread_spinning = dispatch_semaphore_create(0);
+
+       for (int i = 0; i < nthreads; i++) {
+               T_QUIET;
+               T_ASSERT_POSIX_ZERO(
+                               pthread_create(&threads[i], NULL, &spinning_thread,
+                               &thread_spinning), NULL);
+               dispatch_semaphore_wait(thread_spinning, DISPATCH_TIME_FOREVER);
+       }
+
+       T_LOG("spun up %d thread%s", nthreads, nthreads == 1 ? "" : "s");
+
+       ktrace_session_t s = ktrace_session_create();
+       T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+       dispatch_queue_t q = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
+
+       /*
+        * Only set the timeout after we've seen an event that was traced by us.
+        * This helps set a reasonable timeout after we're guaranteed to get a
+        * few events.
+        */
+
+       ktrace_events_single(s, DISPATCH_AFTER_EVENT,
+                       ^(__unused struct trace_point *tp)
+       {
+               dispatch_after(dispatch_time(DISPATCH_TIME_NOW,
+                               TIMEOUT_SECS * NSEC_PER_SEC), q, ^{
+                       ktrace_end(s, 0);
+               });
+       });
+
+       __block uint64_t nfires = 0;
+       __block uint64_t nsamples = 0;
+       static uint64_t idle_tids[MAX_CPUS] = { 0 };
+       __block int nidles = 0;
+
+       ktrace_set_completion_handler(s, ^{
+               T_LOG("stopping threads");
+
+               running_threads = false;
+
+               for (int i = 0; i < nthreads; i++) {
+                       T_QUIET;
+                       T_ASSERT_POSIX_ZERO(pthread_join(threads[i], NULL), NULL);
+               }
+
+               for (int i = 0; i < nidles; i++) {
+                       T_LOG("CPU %d idle thread: %#" PRIx64, i, idle_tids[i]);
+               }
+
+               T_LOG("saw %" PRIu64 " timer fires, %" PRIu64 " samples, "
+                               "%g samples/fire", nfires, nsamples,
+                               (double)nsamples / (double)nfires);
+
+               T_END;
+       });
+
+       /*
+        * Track which threads are running on each CPU.
+        */
+
+       static uint64_t tids_on_cpu[MAX_CPUS] = { 0 };
+
+       void (^switch_cb)(struct trace_point *) = ^(struct trace_point *tp) {
+               uint64_t new_thread = tp->arg2;
+               // uint64_t old_thread = tp->threadid;
+
+               for (int i = 0; i < nidles; i++) {
+                       if (idle_tids[i] == new_thread) {
+                               return;
+                       }
+               }
+
+               tids_on_cpu[tp->cpuid] = new_thread;
+       };
+
+       ktrace_events_single(s, SCHED_SWITCH, switch_cb);
+       ktrace_events_single(s, SCHED_HANDOFF, switch_cb);
+
+       /*
+        * Determine the thread IDs of the idle threads on each CPU.
+        */
+
+       ktrace_events_single(s, SCHED_IDLE, ^(struct trace_point *tp) {
+               uint64_t idle_thread = tp->threadid;
+
+               tids_on_cpu[tp->cpuid] = 0;
+
+               for (int i = 0; i < nidles; i++) {
+                       if (idle_tids[i] == idle_thread) {
+                               return;
+                       }
+               }
+
+               idle_tids[nidles++] = idle_thread;
+       });
+
+       /*
+        * On each timer fire, go through all the cores and mark any threads
+        * that should be sampled.
+        */
+
+       __block int last_fire_cpu = -1;
+       __block uint64_t sample_missing = 0;
+       static uint64_t tids_snap[MAX_CPUS] = { 0 };
+       __block int nexpected = 0;
+#if defined(__x86_64__)
+       __block int xcall_from_cpu = -1;
+#endif /* defined(__x86_64__) */
+       __block uint64_t xcall_mask = 0;
+
+       ktrace_events_single(s, PERF_TMR_FIRE, ^(struct trace_point *tp) {
+               int last_expected = nexpected;
+               nfires++;
+
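+               /*
+                * Bit i of sample_missing is set while CPU i had a thread on-core at
+                * the previous fire but its handler sample has not been seen yet.
+                */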
+               nexpected = 0;
+               for (int i = 0; i < ncpus; i++) {
+                       uint64_t i_bit = UINT64_C(1) << i;
+                       if (sample_missing & i_bit) {
+                               T_LOG("missed sample on CPU %d for thread %#llx from timer on CPU %d (xcall mask = %llx, expected %d samples)",
+                                               tp->cpuid, tids_snap[i], last_fire_cpu,
+                                               xcall_mask, last_expected);
+                               sample_missing &= ~i_bit;
+                       }
+
+                       if (tids_on_cpu[i] != 0) {
+                               tids_snap[i] = tids_on_cpu[i];
+                               sample_missing |= i_bit;
+                               nexpected++;
+                       }
+               }
+
+               T_QUIET;
+               T_ASSERT_LT((int)tp->cpuid, ncpus, "timer fire should not occur on an IOP");
+               last_fire_cpu = (int)tp->cpuid;
+#if defined(__x86_64__)
+               xcall_from_cpu = (int)tp->cpuid;
+#endif /* defined(__x86_64__) */
+       });
+
+#if defined(__x86_64__)
+       /*
+        * Watch for the cross-call on Intel, make sure they match what kperf
+        * should be doing.
+        */
+
+       ktrace_events_single(s, MP_CPUS_CALL, ^(struct trace_point *tp) {
+               if (xcall_from_cpu != (int)tp->cpuid) {
+                       return;
+               }
+
+               xcall_mask = tp->arg1;
+               xcall_from_cpu = -1;
+       });
+#endif /* defined(__x86_64__) */
+
+       /*
+        * On the timer handler for each CPU, unset the missing sample bitmap.
+        */
+
+       ktrace_events_single(s, PERF_TMR_HNDLR, ^(struct trace_point *tp) {
+               nsamples++;
+               if ((int)tp->cpuid > ncpus) {
+                       /* skip IOPs; they're not scheduling our threads */
+                       return;
+               }
+
+               sample_missing &= ~(UINT64_C(1) << tp->cpuid);
+       });
+
+       /*
+        * Configure kperf and ktrace.
+        */
+
+       (void)kperf_action_count_set(1);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, KPERF_SAMPLER_KSTACK),
+                       NULL);
+       (void)kperf_timer_count_set(1);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
+                       kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
+
+       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling");
+       T_ATEND(reset_ktrace);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s,
+                       dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)),
+                       "start ktrace");
+
+       kdebug_trace(DISPATCH_AFTER_EVENT, 0, 0, 0, 0);
+
+       dispatch_main();
+}
+
+#pragma mark kdebug triggers
+
+#define KDEBUG_TRIGGER_TIMEOUT_NS (10 * NSEC_PER_SEC)
+
+#define NON_TRIGGER_CLASS    UINT8_C(0xfd)
+#define NON_TRIGGER_SUBCLASS UINT8_C(0xff)
+#define NON_TRIGGER_CODE     UINT8_C(0xff)
+
+#define NON_TRIGGER_EVENT \
+               (KDBG_EVENTID(NON_TRIGGER_CLASS, NON_TRIGGER_SUBCLASS, \
+               NON_TRIGGER_CODE))
+
+static void
+expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids,
+               unsigned int n_debugids)
+{
+       __block int missing_kernel_stacks = 0;
+       __block int missing_user_stacks = 0;
+       ktrace_session_t s;
+       kperf_kdebug_filter_t filter;
+
+       s = ktrace_session_create();
+       T_QUIET; T_ASSERT_NOTNULL(s, NULL);
+
+       ktrace_events_single(s, PERF_STK_KHDR, ^(struct trace_point *tp) {
+                       missing_kernel_stacks--;
+                       T_LOG("saw kernel stack with %lu frames, flags = %#lx", tp->arg2,
+                                       tp->arg1);
+                       });
+       ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
+                       missing_user_stacks--;
+                       T_LOG("saw user stack with %lu frames, flags = %#lx", tp->arg2,
+                                       tp->arg1);
+                       });
+
+       for (unsigned int i = 0; i < n_debugids; i++) {
+               ktrace_events_single(s, debugids[i], ^(struct trace_point *tp) {
+                               missing_kernel_stacks++;
+                               missing_user_stacks++;
+                               T_LOG("saw event with debugid 0x%" PRIx32, tp->debugid);
+                               });
+       }
+
+       ktrace_events_single(s, NON_TRIGGER_EVENT,
+                       ^(__unused struct trace_point *tp)
+                       {
+                       ktrace_end(s, 0);
+                       });
+
+       ktrace_set_completion_handler(s, ^{
+                       T_EXPECT_LE(missing_kernel_stacks, 0, NULL);
+                       T_EXPECT_LE(missing_user_stacks, 0, NULL);
+
+                       ktrace_session_destroy(s);
+                       T_END;
+                       });
+
+       /* configure kperf */
+
+       kperf_reset();
+
+       (void)kperf_action_count_set(1);
+       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
+                               KPERF_SAMPLER_KSTACK | KPERF_SAMPLER_USTACK), NULL);
+
+       filter = kperf_kdebug_filter_create();
+       T_ASSERT_NOTNULL(filter, NULL);
+
+       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL);
+       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_desc(filter, filter_desc),
+                       NULL);
+       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL);
+       kperf_kdebug_filter_destroy(filter);
+
+       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+       /* trace the triggering debugids */
+
+       for (unsigned int i = 0; i < n_debugids; i++) {
+               T_ASSERT_POSIX_SUCCESS(kdebug_trace(debugids[i], 0, 0, 0, 0), NULL);
+       }
+
+       T_ASSERT_POSIX_SUCCESS(kdebug_trace(NON_TRIGGER_EVENT, 0, 0, 0, 0), NULL);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, KDEBUG_TRIGGER_TIMEOUT_NS),
+                       dispatch_get_main_queue(), ^(void)
+                       {
+                       ktrace_end(s, 1);
+                       });
+}
+
+#define TRIGGER_CLASS     UINT8_C(0xfe)
+#define TRIGGER_CLASS_END UINT8_C(0xfd)
+#define TRIGGER_SUBCLASS  UINT8_C(0xff)
+#define TRIGGER_CODE      UINT8_C(0)
+#define TRIGGER_DEBUGID \
+               (KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, TRIGGER_CODE))
+
+T_DECL(kdebug_trigger_classes,
+               "test that kdebug trigger samples on classes",
+               T_META_ASROOT(true))
+{
+       const uint32_t class_debugids[] = {
+               KDBG_EVENTID(TRIGGER_CLASS, 1, 1),
+               KDBG_EVENTID(TRIGGER_CLASS, 2, 1),
+               KDBG_EVENTID(TRIGGER_CLASS_END, 1, 1) | DBG_FUNC_END,
+               KDBG_EVENTID(TRIGGER_CLASS_END, 2, 1) | DBG_FUNC_END,
+       };
+
+       expect_kdebug_trigger("C0xfe,C0xfdr", class_debugids,
+                       sizeof(class_debugids) / sizeof(class_debugids[0]));
+       dispatch_main();
+}
+
+T_DECL(kdebug_trigger_subclasses,
+               "test that kdebug trigger samples on subclasses",
+               T_META_ASROOT(true))
+{
+       const uint32_t subclass_debugids[] = {
+               KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 0),
+               KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 1),
+               KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 0) | DBG_FUNC_END,
+               KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 1) | DBG_FUNC_END
+       };
+
+       expect_kdebug_trigger("S0xfeff,S0xfdffr", subclass_debugids,
+                       sizeof(subclass_debugids) / sizeof(subclass_debugids[0]));
+       dispatch_main();
+}
+
+T_DECL(kdebug_trigger_debugids,
+               "test that kdebug trigger samples on debugids",
+               T_META_ASROOT(true))
+{
+       const uint32_t debugids[] = {
+               TRIGGER_DEBUGID
+       };
+
+       expect_kdebug_trigger("D0xfeff0000", debugids,
+                       sizeof(debugids) / sizeof(debugids[0]));
+       dispatch_main();
+}
+
+/*
+ * TODO Set a single function specifier filter, and expect it not to trigger
+ * on all events from that class.
+ */
+
+T_DECL(kdbg_callstacks,
+               "test that the kdbg_callstacks samples on syscalls",
+               T_META_ASROOT(true))
+{
+       ktrace_session_t s;
+       __block bool saw_user_stack = false;
+
+       s = ktrace_session_create();
+       T_ASSERT_NOTNULL(s, NULL);
+
+       /*
+        * Make sure BSD events are traced in order to trigger samples on syscalls.
+        */
+       ktrace_events_class(s, DBG_BSD, ^void(__unused struct trace_point *tp) {});
+
+       ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) {
+               saw_user_stack = true;
+               ktrace_end(s, 1);
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+
+               T_EXPECT_TRUE(saw_user_stack,
+                               "saw user stack after configuring kdbg_callstacks");
+               T_END;
+       });
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+       T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(1), NULL);
+#pragma clang diagnostic pop
+       T_ATEND(kperf_reset);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC),
+                       dispatch_get_main_queue(), ^(void) {
+               ktrace_end(s, 1);
+       });
+
+       dispatch_main();
+}
+
+#pragma mark PET
+
+#define STACKS_WAIT_DURATION_NS (3 * NSEC_PER_SEC)
+
+static void
+expect_stacks_traced(void (^cb)(void))
+{
+       ktrace_session_t s;
+
+       s = ktrace_session_create();
+       T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+       __block unsigned int user_stacks = 0;
+       __block unsigned int kernel_stacks = 0;
+
+       ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) {
+                       user_stacks++;
+                       });
+       ktrace_events_single(s, PERF_STK_KHDR, ^(__unused struct trace_point *tp) {
+                       kernel_stacks++;
+                       });
+
+       ktrace_set_completion_handler(s, ^(void) {
+                       ktrace_session_destroy(s);
+                       T_EXPECT_GT(user_stacks, 0U, NULL);
+                       T_EXPECT_GT(kernel_stacks, 0U, NULL);
+                       cb();
+                       });
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, STACKS_WAIT_DURATION_NS),
+                       dispatch_get_main_queue(), ^(void)
+                       {
+                       kperf_reset();
+                       ktrace_end(s, 0);
+                       });
+}
+
+T_DECL(pet, "test that PET mode samples kernel and user stacks",
+               T_META_ASROOT(true))
+{
+       configure_kperf_stacks_timer(-1, 10);
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL);
+
+       expect_stacks_traced(^(void) {
+                       T_END;
+                       });
+
+       dispatch_main();
+}
+
+T_DECL(lightweight_pet,
+               "test that lightweight PET mode samples kernel and user stacks",
+               T_META_ASROOT(true))
+{
+       int set = 1;
+
+       configure_kperf_stacks_timer(-1, 10);
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kperf.lightweight_pet", NULL, NULL,
+                               &set, sizeof(set)), NULL);
+       T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL);
+
+       expect_stacks_traced(^(void) {
+                       T_END;
+                       });
+
+       dispatch_main();
+}
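
The kdebug-trigger and PET tests above all drive the private kperf interface
through the same call sequence. A condensed, illustrative sketch of that
sequence follows (the helper name is invented; the real tests wrap each call
in T_ASSERT_* checks):

/*
 * Sketch: configure action 1 to sample kernel and user stacks whenever a
 * kdebug event matching the filter is emitted, then enable sampling.
 */
static void
configure_kdebug_trigger_sketch(void)
{
	kperf_kdebug_filter_t filter;

	kperf_reset();

	(void)kperf_action_count_set(1);
	(void)kperf_action_samplers_set(1,
			KPERF_SAMPLER_KSTACK | KPERF_SAMPLER_USTACK);
	(void)kperf_kdebug_action_set(1);

	filter = kperf_kdebug_filter_create();
	(void)kperf_kdebug_filter_add_debugid(filter, TRIGGER_DEBUGID);
	(void)kperf_kdebug_filter_set(filter);
	kperf_kdebug_filter_destroy(filter);

	(void)kperf_sample_set(1);
}
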
diff --git a/tests/kperf_backtracing.c b/tests/kperf_backtracing.c
new file mode 100644 (file)
index 0000000..1d3d46d
--- /dev/null
@@ -0,0 +1,449 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <CoreSymbolication/CoreSymbolication.h>
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <kperf/kperf.h>
+#include <ktrace/session.h>
+#include <System/sys/kdebug.h>
+#include <pthread.h>
+
+#include "kperf_helpers.h"
+
+#define PERF_STK_KHDR  UINT32_C(0x25020014)
+#define PERF_STK_UHDR  UINT32_C(0x25020018)
+#define PERF_STK_KDATA UINT32_C(0x2502000c)
+#define PERF_STK_UDATA UINT32_C(0x25020010)
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.kperf"),
+               T_META_CHECK_LEAKS(false));
+
+static void
+expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
+    unsigned long addr, unsigned int bt_idx, unsigned int max_frames)
+{
+    const char *name;
+    unsigned int frame_idx = max_frames - bt_idx - 1;
+
+    if (!bt[frame_idx]) {
+        T_LOG("frame %2u: skipping system frame", frame_idx);
+        return;
+    }
+
+    if (CSIsNull(symbol)) {
+        T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
+        return;
+    }
+
+    if (frame_idx >= bt_len) {
+        T_FAIL("unexpected frame '%s' (%#lx) at index %u",
+            CSSymbolGetName(symbol), addr, frame_idx);
+        return;
+    }
+
+    name = CSSymbolGetName(symbol);
+    T_QUIET; T_ASSERT_NOTNULL(name, NULL);
+    T_EXPECT_EQ_STR(name, bt[frame_idx],
+        "frame %2u: saw '%s', expected '%s'",
+        frame_idx, name, bt[frame_idx]);
+}
+
+/*
+ * Expect to see either user or kernel stacks on thread with ID `tid` with a
+ * signature of `bt` of length `bt_len`.  Updates `stacks_seen` when stack
+ * is found.
+ *
+ * Can also allow stacks to be larger than the signature -- additional frames
+ * near the current PC will be ignored.  This allows stacks to potentially be
+ * in the middle of a signalling system call (which signals that it is safe to
+ * start sampling).
+ */
+static void
+expect_backtrace(ktrace_session_t s, uint64_t tid, unsigned int *stacks_seen,
+    bool kern, const char **bt, unsigned int bt_len, unsigned int allow_larger_by)
+{
+    CSSymbolicatorRef symb;
+    uint32_t hdr_debugid;
+    uint32_t data_debugid;
+    __block unsigned int stacks = 0;
+    __block unsigned int frames = 0;
+    __block unsigned int hdr_frames = 0;
+    __block unsigned int allow_larger = allow_larger_by;
+
+    if (kern) {
+        static CSSymbolicatorRef kern_symb;
+        static dispatch_once_t kern_symb_once;
+
+        hdr_debugid = PERF_STK_KHDR;
+        data_debugid = PERF_STK_KDATA;
+
+        dispatch_once(&kern_symb_once, ^(void) {
+            kern_symb = CSSymbolicatorCreateWithMachKernel();
+            T_QUIET; T_ASSERT_FALSE(CSIsNull(kern_symb), NULL);
+        });
+        symb = kern_symb;
+    } else {
+        static CSSymbolicatorRef user_symb;
+        static dispatch_once_t user_symb_once;
+
+        hdr_debugid = PERF_STK_UHDR;
+        data_debugid = PERF_STK_UDATA;
+
+        dispatch_once(&user_symb_once, ^(void) {
+            user_symb = CSSymbolicatorCreateWithTask(mach_task_self());
+            T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL);
+            T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL);
+        });
+        symb = user_symb;
+    }
+
+    ktrace_events_single(s, hdr_debugid, ^(struct trace_point *tp) {
+        if (tid != 0 && tid != tp->threadid) {
+            return;
+        }
+
+        T_LOG("found stack from thread %#lx", tp->threadid);
+        stacks++;
+        if (!(tp->arg1 & 1)) {
+            T_FAIL("invalid %s stack on thread %#lx", kern ? "kernel" : "user",
+                tp->threadid);
+            return;
+        }
+
+        hdr_frames = (unsigned int)tp->arg2;
+        /* ignore extra link register or value pointed to by stack pointer */
+        hdr_frames -= 1;
+
+        T_QUIET; T_EXPECT_GE(hdr_frames, bt_len,
+            "number of frames in header");
+        T_QUIET; T_EXPECT_LE(hdr_frames, bt_len + allow_larger,
+            "number of frames in header");
+        if (hdr_frames > bt_len && allow_larger > 0) {
+            allow_larger = hdr_frames - bt_len;
+            hdr_frames = bt_len;
+        }
+
+        T_LOG("%s stack seen", kern ? "kernel" : "user");
+        frames = 0;
+    });
+
+    ktrace_events_single(s, data_debugid, ^(struct trace_point *tp) {
+        if (tid != 0 && tid != tp->threadid) {
+            return;
+        }
+
+        int i = 0;
+
+        if (frames == 0 && hdr_frames > bt_len) {
+            /* skip frames near the PC */
+            i = (int)allow_larger;
+            allow_larger -= 4;
+        }
+
+        for (; i < 4 && frames < hdr_frames; i++, frames++) {
+            unsigned long addr = (&tp->arg1)[i];
+            CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime(
+                symb, addr, kCSNow);
+
+            expect_frame(bt, bt_len, symbol, addr, frames, hdr_frames);
+        }
+
+        /* saw the end of the user stack */
+        if (hdr_frames == frames) {
+            *stacks_seen += 1;
+            if (!kern) {
+                ktrace_end(s, 1);
+            }
+        }
+    });
+}
+
+#define TRIGGERING_DEBUGID (0xfeff0f00)
+
+/*
+ * These functions must return an int to avoid the function prologue being
+ * hoisted out of the path to the spin (which would break getting a good
+ * backtrace).
+ */
+static int __attribute__((noinline,not_tail_called))
+recurse_a(dispatch_semaphore_t spinning, unsigned int frames);
+static int __attribute__((noinline,not_tail_called))
+recurse_b(dispatch_semaphore_t spinning, unsigned int frames);
+
+static int __attribute__((noinline,not_tail_called))
+recurse_a(dispatch_semaphore_t spinning, unsigned int frames)
+{
+    if (frames == 0) {
+        if (spinning) {
+            dispatch_semaphore_signal(spinning);
+            for (;;);
+        } else {
+            kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0);
+            return 0;
+        }
+    }
+
+    return recurse_b(spinning, frames - 1) + 1;
+}
+
+static int __attribute__((noinline,not_tail_called))
+recurse_b(dispatch_semaphore_t spinning, unsigned int frames)
+{
+    if (frames == 0) {
+        if (spinning) {
+            dispatch_semaphore_signal(spinning);
+            for (;;);
+        } else {
+            kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0);
+            return 0;
+        }
+    }
+
+    return recurse_a(spinning, frames - 1) + 1;
+}
+
+#define USER_FRAMES       (12)
+
+#if defined(__x86_64__)
+#define RECURSE_START_OFFSET (4)
+#else /* defined(__x86_64__) */
+#define RECURSE_START_OFFSET (3)
+#endif /* defined(__x86_64__) */
+
+static const char *user_bt[USER_FRAMES] = {
+#if defined(__x86_64__)
+    NULL,
+#endif /* defined(__x86_64__) */
+    NULL, NULL,
+    "backtrace_thread",
+    "recurse_a", "recurse_b", "recurse_a", "recurse_b",
+    "recurse_a", "recurse_b", "recurse_a",
+#if !defined(__x86_64__)
+    "recurse_b",
+#endif /* !defined(__x86_64__) */
+    NULL
+};
+
+#if defined(__arm__)
+
+#define KERNEL_FRAMES (2)
+static const char *kernel_bt[KERNEL_FRAMES] = {
+    "unix_syscall", "kdebug_trace64"
+};
+
+#elif defined(__arm64__)
+
+#define KERNEL_FRAMES (4)
+static const char *kernel_bt[KERNEL_FRAMES] = {
+    "fleh_synchronous", "sleh_synchronous", "unix_syscall", "kdebug_trace64"
+};
+
+#elif defined(__x86_64__)
+
+#define KERNEL_FRAMES (2)
+static const char *kernel_bt[KERNEL_FRAMES] = {
+    "unix_syscall64", "kdebug_trace64"
+};
+
+#else
+#error "architecture unsupported"
+#endif /* defined(__arm__) */
+
+static dispatch_once_t backtrace_once;
+static dispatch_semaphore_t backtrace_started;
+static dispatch_semaphore_t backtrace_go;
+
+/*
+ * Another thread to run with a known backtrace.
+ *
+ * Take a semaphore that will be signalled when the thread is spinning at the
+ * correct frame.  If the semaphore is NULL, don't spin and instead make a
+ * kdebug_trace system call, which can trigger a deterministic backtrace itself.
+ */
+static void *
+backtrace_thread(void *arg)
+{
+    dispatch_semaphore_t notify_spinning;
+    unsigned int calls;
+
+    notify_spinning = (dispatch_semaphore_t)arg;
+
+    dispatch_semaphore_signal(backtrace_started);
+    if (!notify_spinning) {
+        dispatch_semaphore_wait(backtrace_go, DISPATCH_TIME_FOREVER);
+    }
+
+    /*
+     * backtrace_thread, recurse_a, recurse_b, ...[, __kdebug_trace64]
+     *
+     * Always make one less call for this frame (backtrace_thread).
+     */
+    calls = USER_FRAMES - RECURSE_START_OFFSET - 1 /* backtrace_thread */;
+    if (notify_spinning) {
+        /*
+         * Spinning doesn't end up calling __kdebug_trace64.
+         */
+        calls -= 1;
+    }
+
+    T_LOG("backtrace thread calling into %d frames (already at %d frames)",
+        calls, RECURSE_START_OFFSET);
+    (void)recurse_a(notify_spinning, calls);
+    return NULL;
+}
+
+static uint64_t
+create_backtrace_thread(dispatch_semaphore_t notify_spinning)
+{
+    pthread_t thread = NULL;
+    uint64_t tid;
+
+    dispatch_once(&backtrace_once, ^{
+        backtrace_started = dispatch_semaphore_create(0);
+        T_QUIET; T_ASSERT_NOTNULL(backtrace_started, NULL);
+
+        if (!notify_spinning) {
+            backtrace_go = dispatch_semaphore_create(0);
+            T_QUIET; T_ASSERT_NOTNULL(backtrace_go, NULL);
+        }
+    });
+
+    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread,
+        (void *)notify_spinning), NULL);
+    T_QUIET; T_ASSERT_NOTNULL(thread, "backtrace thread created");
+    dispatch_semaphore_wait(backtrace_started, DISPATCH_TIME_FOREVER);
+
+    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_threadid_np(thread, &tid), NULL);
+    T_QUIET; T_ASSERT_NE(tid, UINT64_C(0),
+        "backtrace thread created does not have ID 0");
+
+    T_LOG("starting thread with ID 0x%" PRIx64, tid);
+
+    return tid;
+}
+
+static void
+start_backtrace_thread(void)
+{
+    T_QUIET; T_ASSERT_NOTNULL(backtrace_go,
+        "thread to backtrace created before starting it");
+    dispatch_semaphore_signal(backtrace_go);
+}
+
+#if TARGET_OS_WATCH
+#define TEST_TIMEOUT_NS (30 * NSEC_PER_SEC)
+#else /* TARGET_OS_WATCH */
+#define TEST_TIMEOUT_NS (5 * NSEC_PER_SEC)
+#endif /* !TARGET_OS_WATCH */
+
+T_DECL(backtraces_kdebug_trigger,
+    "test that backtraces from kdebug trigger are correct",
+    T_META_ASROOT(true))
+{
+    static unsigned int stacks_seen = 0;
+    ktrace_session_t s;
+    kperf_kdebug_filter_t filter;
+    uint64_t tid;
+
+    s = ktrace_session_create();
+    T_ASSERT_NOTNULL(s, "ktrace session was created");
+
+    T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
+
+    tid = create_backtrace_thread(NULL);
+    expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES, 0);
+    expect_backtrace(s, tid, &stacks_seen, true, kernel_bt, KERNEL_FRAMES, 0);
+
+    /*
+     * The triggering event must be traced (and thus registered with libktrace)
+     * to get backtraces.
+     */
+    ktrace_events_single(s, TRIGGERING_DEBUGID,
+        ^(__unused struct trace_point *tp){ });
+
+    ktrace_set_completion_handler(s, ^(void) {
+        T_EXPECT_GE(stacks_seen, 2U, "saw both kernel and user stacks");
+        ktrace_session_destroy(s);
+        kperf_reset();
+        T_END;
+    });
+
+    filter = kperf_kdebug_filter_create();
+    T_ASSERT_NOTNULL(filter, "kperf kdebug filter was created");
+
+    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_debugid(filter,
+        TRIGGERING_DEBUGID), NULL);
+    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL);
+    (void)kperf_action_count_set(1);
+    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
+        KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL);
+    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL);
+    kperf_kdebug_filter_destroy(filter);
+
+    T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+    start_backtrace_thread();
+
+    dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
+        dispatch_get_main_queue(), ^(void)
+    {
+        T_LOG("ending test after timeout");
+        ktrace_end(s, 0);
+    });
+
+    dispatch_main();
+}
+
+T_DECL(backtraces_user_timer,
+    "test that user backtraces on a timer are correct",
+    T_META_ASROOT(true))
+{
+    static unsigned int stacks_seen = 0;
+    ktrace_session_t s;
+    uint64_t tid;
+    dispatch_semaphore_t wait_for_spinning = dispatch_semaphore_create(0);
+
+    s = ktrace_session_create();
+    T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+    ktrace_filter_pid(s, getpid());
+
+    configure_kperf_stacks_timer(getpid(), 10);
+
+    tid = create_backtrace_thread(wait_for_spinning);
+    /* potentially calling dispatch function and system call */
+    expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES - 1, 2);
+
+    ktrace_set_completion_handler(s, ^(void) {
+        T_EXPECT_GE(stacks_seen, 1U, "saw at least one stack");
+        ktrace_session_destroy(s);
+        kperf_reset();
+        T_END;
+    });
+
+    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
+
+    /* wait until the thread that will be backtraced is spinning */
+    dispatch_semaphore_wait(wait_for_spinning, DISPATCH_TIME_FOREVER);
+
+    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
+
+    dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
+        dispatch_get_main_queue(), ^(void)
+    {
+        T_LOG("ending test after timeout");
+        ktrace_end(s, 0);
+    });
+
+    dispatch_main();
+}
+
+/* TODO test kernel stacks in all modes */
+/* TODO legacy PET mode backtracing */
+/* TODO test deep stacks, further than 128 frames, make sure they are truncated */
+/* TODO test constrained stacks */
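
For orientation, the handlers above decode the stack trace points as follows:
a PERF_STK_KHDR or PERF_STK_UHDR event carries a flags word in arg1 (bit 0 set
means the stack is valid) and the frame count in arg2; the following
PERF_STK_KDATA/PERF_STK_UDATA events each pack up to four return addresses into
arg1..arg4. A minimal consumer sketch for user stacks (the function name is
illustrative):

static void
decode_user_stack_events_sketch(ktrace_session_t s)
{
	__block unsigned int remaining = 0;

	ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
		if (tp->arg1 & 1) {                         /* bit 0: stack is valid */
			remaining = (unsigned int)tp->arg2; /* total frame count */
		}
	});

	ktrace_events_single(s, PERF_STK_UDATA, ^(struct trace_point *tp) {
		for (int i = 0; i < 4 && remaining > 0; i++, remaining--) {
			unsigned long addr = (&tp->arg1)[i]; /* frames packed 4 per event */
			(void)addr;                          /* symbolicate or record here */
		}
	});
}
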
diff --git a/tests/kperf_helpers.c b/tests/kperf_helpers.c
new file mode 100644 (file)
index 0000000..bf64f6b
--- /dev/null
@@ -0,0 +1,25 @@
+#include "kperf_helpers.h"
+
+#include <darwintest.h>
+#include <kperf/kperf.h>
+#include <unistd.h>
+
+void
+configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms)
+{
+    kperf_reset();
+
+    (void)kperf_action_count_set(1);
+    (void)kperf_timer_count_set(1);
+
+    T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
+        KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL);
+
+    if (pid != -1) {
+        T_ASSERT_POSIX_SUCCESS(kperf_action_filter_set_by_pid(1, pid), NULL);
+    }
+
+    T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
+    T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
+        kperf_ns_to_ticks(period_ms * NSEC_PER_MSEC)), NULL);
+}
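
As used by the kperf tests above, this helper configures the timer/action pair
and the caller then enables sampling, e.g.:

	/* sample this process's stacks every 10ms */
	configure_kperf_stacks_timer(getpid(), 10);
	T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
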
diff --git a/tests/kperf_helpers.h b/tests/kperf_helpers.h
new file mode 100644 (file)
index 0000000..466f3d9
--- /dev/null
@@ -0,0 +1,8 @@
+#ifndef KPERF_HELPERS_H
+#define KPERF_HELPERS_H
+
+#include <unistd.h>
+
+void configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms);
+
+#endif /* !defined(KPERF_HELPERS_H) */
diff --git a/tests/kqueue_add_and_trigger.c b/tests/kqueue_add_and_trigger.c
new file mode 100644 (file)
index 0000000..15243a7
--- /dev/null
@@ -0,0 +1,37 @@
+#include <unistd.h>
+#include <errno.h>
+#include <sys/event.h>
+#include <darwintest.h>
+
+/* <rdar://problem/28139044> EVFILT_USER doesn't properly support add&fire atomic combination
+ *
+ * Check that using EV_ADD and NOTE_TRIGGER on an EVFILT_USER event actually triggers the event just added.
+ *
+ */
+
+T_DECL(kqueue_add_and_trigger_evfilt_user, "Add and trigger EVFILT_USER events with kevent ")
+{
+       int kq_fd, ret;
+       struct kevent ret_kev;
+       const struct kevent kev = {
+               .ident = 1,
+               .filter = EVFILT_USER,
+               .flags = EV_ADD|EV_CLEAR,
+               .fflags = NOTE_TRIGGER,
+       };
+       const struct timespec timeout = {
+               .tv_sec = 1,
+               .tv_nsec = 0,
+       };
+
+       T_ASSERT_POSIX_SUCCESS((kq_fd = kqueue()), NULL);
+       ret = kevent(kq_fd, &kev, 1, &ret_kev, 1, &timeout);
+
+       T_ASSERT_POSIX_SUCCESS(ret, "kevent");
+
+       T_ASSERT_EQ(ret, 1, "kevent with add and trigger, ret");
+       T_ASSERT_EQ(ret_kev.ident, 1, "kevent with add and trigger, ident");
+       T_ASSERT_EQ(ret_kev.filter, EVFILT_USER, "kevent with add and trigger, filter");
+
+}
+
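
For contrast with the atomic add-and-fire tested above, the conventional form
splits registration and triggering into two kevent() calls. A minimal sketch
(the helper name is invented; headers and error handling are as in the test
above):

static int
add_then_trigger_sketch(int kq_fd)
{
	struct kevent kev;

	/* step 1: register the EVFILT_USER knote */
	EV_SET(&kev, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
	if (kevent(kq_fd, &kev, 1, NULL, 0, NULL) == -1) {
		return -1;
	}

	/* step 2: fire it with a second call */
	EV_SET(&kev, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
	return kevent(kq_fd, &kev, 1, NULL, 0, NULL);
}
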
diff --git a/tests/kqueue_close.c b/tests/kqueue_close.c
new file mode 100644 (file)
index 0000000..3682d91
--- /dev/null
@@ -0,0 +1,77 @@
+#include <unistd.h>
+#include <pthread.h>
+#include <errno.h>
+
+#include <sys/event.h>
+
+#include <darwintest.h>
+
+/*
+ * <rdar://problem/30231213> close() of kqueue FD races with kqueue_scan park
+ *
+ * When a close() concurrent with a poll goes wrong, the close hangs
+ * and the kevent never receives any more events.
+ */
+
+/* Both events should fire at about the same time */
+static uint32_t timeout_ms = 10;
+
+static void *
+poll_kqueue(void *arg)
+{
+       int fd = (int)arg;
+
+       struct kevent kev = {
+               .filter = EVFILT_TIMER,
+               .flags  = EV_ADD,
+               .data   = timeout_ms,
+       };
+
+       int rv = kevent(fd, &kev, 1, NULL, 0, NULL);
+
+       if (rv == -1 && errno == EBADF) {
+               /* The close may race with this thread spawning */
+               T_LOG("kqueue already closed?");
+               return NULL;
+       } else {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kevent");
+       }
+
+       while ((rv = kevent(fd, NULL, 0, &kev, 1, NULL)) == 1) {
+               T_LOG("poll\n");
+       }
+
+       if (rv != -1 || errno != EBADF) {
+               T_ASSERT_POSIX_SUCCESS(rv, "fd should be closed");
+       }
+
+       return NULL;
+}
+
+static void
+run_test()
+{
+       int fd = kqueue();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(fd, "kqueue");
+
+       pthread_t thread;
+       int rv = pthread_create(&thread, NULL, poll_kqueue,
+                               (void *)(uintptr_t)fd);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
+
+       usleep(timeout_ms * 1000);
+
+       rv = close(fd);
+       T_ASSERT_POSIX_SUCCESS(rv, "close");
+
+       rv = pthread_join(thread, NULL);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join");
+}
+
+T_DECL(kqueue_close_race, "Races kqueue close with kqueue process",
+       T_META_LTEPHASE(LTE_POSTINIT), T_META_TIMEOUT(5))
+{
+       for (uint32_t i = 1 ; i < 100 ; i++) {
+               run_test();
+       }
+}
diff --git a/tests/kqueue_fifo_18776047.c b/tests/kqueue_fifo_18776047.c
new file mode 100644 (file)
index 0000000..fe45758
--- /dev/null
@@ -0,0 +1,153 @@
+/*
+ * testname: kqueue_fifo
+ */
+
+#include <darwintest.h>
+#include <fcntl.h>
+#include <sys/event.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+
+#include <TargetConditionals.h>
+
+#define TMP_FILE_PATH "/tmp/test_kqueue_fifo_18776047"
+
+#define READ_BUFFER_LEN 256
+
+#if TARGET_OS_WATCH
+#define TOTAL_ITERATIONS 5000
+#else
+#define TOTAL_ITERATIONS 10000
+#endif
+
+/* prototypes */
+int write_some_data(int fd);
+int read_data(int fd);
+void create_fifo(const char * filepath);
+void kevent_one_shot(int kq, int fd, int filter);
+
+int
+write_some_data(int fd)
+{
+       int retval  = 0;
+       int count   = 0;
+       int len     = 5;
+       char * data = "ABCDE";
+       while (true) {
+               errno  = 0;
+               retval = (int)write(fd, data, (size_t)len);
+               if (retval < 0) {
+                       if (errno == EAGAIN) {
+                               if (len == 1)
+                                       return count;
+                               else
+                                       len--;
+                       } else {
+                               T_ASSERT_FAIL("write to fd %d of %s of len %d failed.", fd, data, len);
+                               abort();
+                       }
+               } else {
+                       count += retval;
+               }
+       }
+}
+
+int
+read_data(int fd)
+{
+       int retval, count = 0;
+       char databuffer[READ_BUFFER_LEN];
+       while (true) {
+               errno  = 0;
+               retval = (int)read(fd, databuffer, READ_BUFFER_LEN);
+               if (retval < 0) {
+                       if (errno == EAGAIN) {
+                               return count;
+                       } else {
+                               T_ASSERT_FAIL("read from fd %d failed.", fd);
+                               abort();
+                       }
+               }
+               count += retval;
+       }
+}
+
+void
+create_fifo(const char * filepath)
+{
+       struct stat f_stat;
+       int ret = 0;
+       errno   = 0;
+       ret = stat(filepath, &f_stat);
+       if (ret == 0) {
+		/* if the file exists, make sure it's a fifo */
+               T_ASSERT_TRUE(S_ISFIFO(f_stat.st_mode), "ensure %s is a fifo", filepath);
+       } else if (errno == ENOENT) {
+               ret = mkfifo(filepath, 0777);
+               T_ASSERT_POSIX_ZERO(ret, "creating a fifo at path %s", filepath);
+       } else {
+               T_ASSERT_FAIL("stat operation on %s", filepath);
+       }
+}
+
+void
+kevent_one_shot(int kq, int fd, int filter)
+{
+       int retval             = 0;
+       struct timespec t_zero = {0, 0};
+       struct kevent kev[1];
+
+       T_QUIET;
+       T_ASSERT_GE(kq, 0, "ensure kq is valid");
+       T_LOG("kevent doing ONESHOT %s", filter == EVFILT_READ ? "read" : "write");
+
+       EV_SET(kev, fd, filter, EV_ADD | EV_ONESHOT, 0, 0, NULL);
+       retval = kevent(kq, kev, 1, NULL, 0, &t_zero);
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(retval, "ONESHOT kevent for fd %d, filter %d", fd, filter);
+}
+
+T_DECL(kqueue_fifo_18776047, "Tests kqueue, kevent for watching a fifo.", T_META_LTEPHASE(LTE_POSTINIT))
+{
+       struct kevent kev[1];
+       int read_fd, write_fd, kq;
+       int retval         = 0;
+       int iter           = 0;
+       const char * fpath = TMP_FILE_PATH;
+       T_SETUPBEGIN;
+       create_fifo(fpath);
+
+       kq = kqueue();
+       T_ASSERT_GE(kq, 0, "create a kqueue");
+
+       read_fd = open(fpath, O_RDONLY | O_APPEND | O_NONBLOCK);
+       T_ASSERT_POSIX_SUCCESS(read_fd, "opening read fd on fifo.");
+
+       write_fd = open(fpath, O_WRONLY | O_APPEND | O_NONBLOCK);
+       T_ASSERT_POSIX_SUCCESS(write_fd, "opening write fd on fifo.");
+
+       T_SETUPEND;
+
+       kevent_one_shot(kq, write_fd, EVFILT_WRITE);
+       kevent_one_shot(kq, read_fd, EVFILT_READ);
+
+       while (iter++ < TOTAL_ITERATIONS) {
+               retval = kevent(kq, NULL, 0, kev, 1, NULL);
+               T_QUIET;
+               T_ASSERT_GE(retval, 0, "kevent on kq %d", kq);
+
+               if (kev[0].ident == (uintptr_t)write_fd) {
+                       retval = write_some_data(write_fd);
+                       T_LOG("writer ready iter: %d wrote %d bytes", iter, retval);
+                       kevent_one_shot(kq, write_fd, EVFILT_WRITE);
+               } else if (kev[0].ident == (uintptr_t)read_fd) {
+                       retval = read_data(read_fd);
+                       T_LOG("reader ready iter: %d read %d bytes", iter, retval);
+                       kevent_one_shot(kq, read_fd, EVFILT_READ);
+               }
+       }
+       T_PASS("kqueue_fifo_18776047 PASSED");
+}
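
The loop above depends on EV_ONESHOT semantics: a one-shot registration is
consumed by its first delivery, so each serviced event must be re-added before
the next kevent() wait. A minimal sketch of that re-arm pattern (illustrative
helper, error handling omitted):

static void
oneshot_rearm_sketch(int kq, int fd)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);  /* arm */

	(void)kevent(kq, NULL, 0, &kev, 1, NULL);  /* delivered once, then disarmed */

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD | EV_ONESHOT, 0, 0, NULL);
	(void)kevent(kq, &kev, 1, NULL, 0, NULL);  /* re-arm to be woken again */
}
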
diff --git a/tests/kqueue_file_tests.c b/tests/kqueue_file_tests.c
new file mode 100644 (file)
index 0000000..dcd2c47
--- /dev/null
@@ -0,0 +1,1837 @@
+#include <string.h>
+#include <errno.h>
+#include <pwd.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <poll.h>
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/xattr.h>
+#include <sys/file.h>
+
+#include <TargetConditionals.h>
+#include <darwintest.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.kevent")
+               );
+
+#define PDIR   "/tmp"
+#define DIR1   PDIR "/dir1"
+#define DOTDOT ".."
+#define DIR2   PDIR "/dir2"
+#define FILE1  PDIR "/file1"
+#define FILE2  PDIR "/file2"
+
+#define KEY    "somekey"
+#define VAL    "someval"
+
+#define NOSLEEP                0
+#define SLEEP          1
+#define NO_EVENT       0
+#define YES_EVENT      1
+
+
+#define OUTPUT_LEVEL   0
+#define RESULT_LEVEL   3
+
+#define TEST_STRING    "Some text!!! Yes indeed, some of that very structure which has passed on man's knowledge for generations."
+#define HELLO_WORLD    "Hello, World!"
+#define USLEEP_TIME    5000
+#define WAIT_TIME      (4l)
+#define LENGTHEN_SIZE  500
+#define FIFO_SPACE     8192    /* FIFOS have 8K of buffer space */
+
+/*
+ * These two variables are the non local memory for holding the return
+ * values from functions with which pthread_create is called.
+ */
+int thread_status;
+int fifo_read_fd;
+
+/*
+ * Types of actions for setup, cleanup, and execution of tests
+ */
+typedef enum {CREAT, MKDIR, READ, WRITE, WRITEFD, FILLFD, UNLINK, LSKEE, RMDIR, MKFIFO, LENGTHEN, TRUNC,
+       SYMLINK, CHMOD, CHOWN, EXCHANGEDATA, RENAME, LSEEK, OPEN, MMAP, NOTHING,
+       SETXATTR, UTIMES, STAT, HARDLINK, REVOKE, FUNLOCK} action_id_t;
+
+/* 
+ * Directs an action as mentioned above
+ */
+typedef struct _action {
+       int             act_dosleep;
+       action_id_t     act_id;
+       void            *act_args[5];
+       int             act_fd;
+} action_t;
+
+/*
+ * A test case.  Specifies setup, an event to look for, an action to take to
+ * cause (or not cause) that event, and cleanup.
+ */
+typedef struct _test {
+       char *t_testname;
+       
+       /* Is this test an expected failure? */
+       int t_known_failure;
+
+       /* Is this test behaving non-deterministically? */
+       int t_nondeterministic;
+
+       /* Test kevent() or poll() */
+       int     t_is_poll_test; 
+       
+       /* Actions for setting up test */
+       int      t_n_prep_actions;
+       action_t t_prep_actions[5];
+       
+       /* Actions for cleaning up test */
+       int      t_n_cleanup_actions;
+       action_t t_cleanup_actions[5];
+       
+	/* Action for thread to take while we wait */
+       action_t t_helpthreadact;
+       
+       /* File to look for event on */
+       char     *t_watchfile;  /* set event ident IN TEST (can't know fd beforehand)*/
+       int      t_file_is_fifo;/* FIFOs are handled in a special manner */
+       
+       /* Different parameters for poll() vs kevent() */
+       union { 
+               struct kevent   tu_kev;
+               short           tu_pollevents;
+       } t_union;
+       
+       /* Do we expect results? */
+       int      t_want_event;
+       
+       /* Not always used--how much data should we find (EVFILT_{READ,WRITE}) */
+       int      t_nbytes;
+       
+       /* Hacks for FILT_READ and pipes */
+       int      t_read_to_end_first;   /* Consume all data in file before waiting for event */
+       int      t_write_some_data;     /* Write some data to file before waiting for event (FIFO hack) */
+       int      t_extra_sleep_hack;    /* Sleep before waiting, to let a fifo fill up with data */
+} test_t;
+
+char *
+get_action_name(action_id_t a)
+{
+       switch (a) {
+       case CREAT:
+               return "CREAT";
+       case MKDIR:
+               return "MKDIR";
+       case READ:
+               return "READ";
+       case WRITE:
+               return "WRITE";
+       case WRITEFD:
+               return "WRITEFD";
+       case FILLFD:
+               return "FILLFD";
+       case UNLINK:
+               return "UNLINK";
+       case LSKEE:
+               return "LSKEE";
+       case RMDIR:
+               return "RMDIR";
+       case MKFIFO:
+               return "MKFIFO";
+       case LENGTHEN:
+               return "LENGTHEN";
+       case TRUNC:
+               return "TRUNC";
+       case SYMLINK:
+               return "SYMLINK";
+       case CHMOD:
+               return "CHMOD";
+       case CHOWN:
+               return "CHOWN";
+       case EXCHANGEDATA:
+               return "EXCHANGEDATA";
+       case RENAME:
+               return "RENAME";
+       case LSEEK:
+               return "LSEEK";
+       case OPEN:
+               return "OPEN";
+       case MMAP:
+               return "MMAP";
+       case NOTHING:
+               return "NOTHING";
+       case SETXATTR:
+               return "SETXATTR";
+       case UTIMES:
+               return "UTIMES";
+       case STAT:
+               return "STAT";
+       case HARDLINK:
+               return "HARDLINK";
+       case REVOKE:
+               return "REVOKE";
+       case FUNLOCK:
+               return "FUNLOCK";
+       }
+       return "Unknown";
+}
+/*
+ * Initialize an action struct.  Whether to sleep, what action to take,
+ * and arguments for that action.
+ */
+void 
+init_action(action_t *act, int sleep, action_id_t call, int nargs, ...) 
+{
+       int i;
+       va_list ap;
+       va_start(ap, nargs);
+       act->act_dosleep = sleep;
+       act->act_id = call;
+       
+       for (i = 0; i < nargs; i++)
+       {
+               act->act_args[i] = va_arg(ap, void*);
+       }
+       
+       va_end(ap);
+       
+}
+
+/*
+ * Opening a fifo is complicated: need to open both sides at once 
+ */
+void *
+open_fifo_readside(void *arg) 
+{
+       if ((fifo_read_fd = open((char*)arg, O_RDONLY)) == -1) {
+               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", arg, errno, strerror(errno));
+       }
+       return (&fifo_read_fd);
+}
+
+/*
+ * Open a fifo, setting read and write descriptors.  Return 0 for success, -1 for failure.
+ * Only set FD args upon success; they will be unmodified on failure.
+ */
+int 
+open_fifo(const char *path, int *readfd, int *writefd) 
+{
+       pthread_t thread;
+       int waitres;
+       int res;
+       int *tmpreadfd, tmpwritefd;
+       
+       fifo_read_fd = -1;
+       res = pthread_create(&thread, 0, open_fifo_readside, (void*)path);
+       if (res == 0) {
+               if ((tmpwritefd = open(path, O_WRONLY)) == -1) {
+                       T_LOG("open(%s, O_WRONLY) failed: %d (%s)\n", path, errno, strerror(errno));
+                       return (-1);
+               }
+               waitres = pthread_join(thread, (void**) &tmpreadfd);
+               
+               fcntl(tmpwritefd, F_SETFL, O_WRONLY | O_NONBLOCK);
+               
+               if ((waitres == 0) && (tmpwritefd >= 0) && (*tmpreadfd >= 0)) {
+                       *readfd = *tmpreadfd;
+                       *writefd = tmpwritefd;
+               } else {
+                       res = -1;       
+               }
+       }
+       
+       return res;
+}
+
+/*
+ * Just concatenate a directory and a filename, sticking a "/" betwixt them
+ */
+void 
+makepath(char *buf, const char *dir, const char *file) 
+{
+       strcpy(buf, dir);
+       strcat(buf, "/");
+       strcat(buf, file);
+}
+
+
+/* Execute a prep, cleanup, or test action; specific tricky notes below.
+ *
+ * CREAT:	the file is created and given length 1
+ * READ:       try to read one char
+ * WRITE:      try to write TEST_STRING to file
+ * LENGTHEN:   make longer by LENGTHEN_SIZE
+ * MMAP:       mmap first 20 bytes of file, write HELLO_WORLD in
+ * SETXATTR:   set the KEY attribute to value VAL
+ * WRITEFD:    instead of opening fresh, take an FD in the action struct (FIFOs)
+ * FILLFD:	write to a file until you no longer can.  For filling FIFOs.
+ *
+ * * Several of these have hard-coded sizes.
+ */
+void* 
+execute_action(void *actionptr) 
+{
+       action_t *act = (action_t*)actionptr;
+       void **args = act->act_args;
+       char c;
+       int res = -1, tmpfd, tmpfd2;
+       static int lastfd;
+       void *addr;
+       struct timeval tv;
+       struct stat sstat;
+       
+       T_LOG("Beginning action of type %d: %s\n", act->act_id, get_action_name(act->act_id));
+       
+       /* Let other thread get into kevent() sleep */
+       if(SLEEP == act->act_dosleep) {
+               usleep(USLEEP_TIME);
+       }
+       switch(act->act_id) {
+               case NOTHING:
+                       res = 0;
+                       break;
+               case CREAT:
+                       if ((tmpfd = creat((char*)args[0], 0755)) == -1) {
+                               T_LOG("creat() failed on \"%s\": %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       ftruncate(tmpfd, 1); /* So that mmap() doesn't fool us */
+                       close(tmpfd);
+                       res = 0;
+                       break;
+               case MKDIR:
+                       res = mkdir((char*)args[0], 0755);
+                       break;
+               case READ:
+                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
+                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       res = read(tmpfd, &c, 1);
+                       res = (res == 1 ? 0 : -1);
+                       close(tmpfd);
+                       break;
+               case WRITE:
+                       if ((tmpfd = open((char*)args[0], O_RDWR)) == -1) {
+                               T_LOG("open(%s, O_RDWR) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       res = write(tmpfd, TEST_STRING, strlen(TEST_STRING));
+                       if (res == strlen(TEST_STRING)) {
+                               res = 0;
+                       } else {
+                               res = -1;
+                       }
+                       close(tmpfd);
+                       break;
+               case WRITEFD:
+                       res = write((int)act->act_fd, TEST_STRING, strlen(TEST_STRING));
+                       if (res == strlen(TEST_STRING)) {
+                               res = 0;
+                       } else {
+                               res = -1;
+                       }
+                       break;
+               case FILLFD:
+                       while (write((int)act->act_fd, "a", 1) > 0);
+                       res = 0;
+                       break;
+               case UNLINK:
+                       res = unlink((char*)args[0]);
+                       break;
+               case LSEEK:
+                       res = lseek((int)act->act_fd, (int)args[0], SEEK_SET);
+                       res = (res == (int)args[0] ? 0 : -1);
+                       break;
+               case RMDIR:
+                       res = rmdir((char*)args[0]);
+                       break;
+               case MKFIFO:
+                       res = mkfifo((char*)args[0], 0755);
+                       break;
+               case LENGTHEN:
+                       res = truncate((char*)args[0], LENGTHEN_SIZE);
+                       break;
+               case TRUNC:
+                       res = truncate((char*)args[0], 0);
+                       break;
+               case SYMLINK:
+                       res = symlink((char*)args[0], (char*)args[1]);
+                       break;
+               case CHMOD:
+                       res = chmod((char*)args[0], (int)args[1]);
+                       break;
+               case CHOWN:
+                       /* path, uid, gid */
+                       res = chown((char*)args[0], (int) args[1], (int) args[2]);
+                       break;
+               case EXCHANGEDATA:
+                       res = exchangedata((char*)args[0], (char*)args[1], 0);
+                       break;
+               case RENAME:
+                       res = rename((char*)args[0], (char*)args[1]);
+                       break;
+               case OPEN:
+                       if ((tmpfd = open((char*)args[0], O_RDONLY | O_CREAT)) == -1) {
+                               T_LOG("open(%s, O_RDONLY | O_CREAT) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       res = close(tmpfd);
+                       break;
+               case MMAP:
+                       /* It had best already exist with nonzero size */
+                       if ((tmpfd = open((char*)args[0], O_RDWR)) == -1) {
+                               T_LOG("open(%s, O_RDWR) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       addr = mmap(0, 20, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, tmpfd, 0);
+                       if (addr != ((void*)-1)) {
+                               res = 0;
+                               if ((int)args[1]) {
+                                       strcpy((char*)addr, HELLO_WORLD);
+                                       msync(addr, 20, MS_SYNC);
+                               }
+                       }
+                       close(tmpfd);
+                       munmap(addr, 20);
+                       break;
+               case SETXATTR:
+                       res = setxattr((char*)args[0], KEY, (void*)VAL, strlen(VAL),
+                                                  0, 0);
+                       break;
+               case UTIMES:
+                       tv.tv_sec = time(NULL);
+                       tv.tv_usec = 0;
+                       res = utimes((char*)args[0], &tv); 
+                       break;
+               case STAT:
+                       res = lstat((char*)args[0], &sstat);
+                       break;
+               case HARDLINK:
+                       res = link((char*)args[0], (char*)args[1]);
+                       break;
+               case REVOKE:
+                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
+                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }
+                       res = revoke((char*)args[0]);
+                       close(tmpfd);
+                       break;
+               case FUNLOCK:
+                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
+                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
+                               res = -1;
+                               break;
+                       }                               
+                       if ((res = flock(tmpfd, LOCK_EX)) == -1) {
+                               T_LOG("flock() LOCK_EX failed: %d (%s)\n", errno, strerror(errno));
+                               close(tmpfd);
+                               break;
+                       }
+                       if ((res = flock(tmpfd, LOCK_UN)) == -1) {
+                               T_LOG("flock() LOCK_UN failed: %d (%s)\n", errno, strerror(errno));
+                               close(tmpfd);
+                               break;
+                       }
+                       close(tmpfd);
+                       break;
+               default:
+                       res = -1;
+                       break;
+       }
+
+       thread_status = res;
+       return (&thread_status);
+}
+
+/*
+ * Read until the end of a file, for EVFILT_READ purposes (considers file position)
+ */
+void 
+read_to_end(int fd) 
+{
+       char buf[50];
+       while (read(fd, buf, sizeof(buf)) > 0);
+}
+
+/*
+ * Helper for setup and cleanup; just execute every action in an array
+ * of actions.  "failout" parameter indicates whether to stop if one fails.
+ */
+int
+execute_action_list(action_t *actions, int nactions, int failout) 
+{
+       int i, res;
+       for (i = 0, res = 0; (0 == res || (!failout)) && (i < nactions); i++) {
+               T_LOG("Starting prep action %d\n", i);
+               res = *((int *) execute_action(&(actions[i])));
+               if(res != 0) {
+                       T_LOG("Action list failed on step %d. res = %d errno = %d (%s)\n", i, res,
+                               errno, strerror(errno));
+               } else {
+                       T_LOG("Action list work succeeded on step %d.\n", i);
+               }
+       }
+
+       return res;
+}
+
+/*
+ * Execute a full test, return success value.
+ */
+int
+execute_test(test_t *test)
+{
+       int i, kqfd, filefd = -1, res2, res, cnt, writefd = -1;
+       int retval = -1;
+       pthread_t thr;
+       struct kevent evlist;
+       struct timespec ts = {WAIT_TIME, 0l};
+       int *status;
+
+       memset(&evlist, 0, sizeof(evlist));
+       
+       T_LOG("[BEGIN] %s\n", test->t_testname);
+
+       T_LOG(test->t_want_event ? "Expecting an event.\n" : "Not expecting events.\n");
+       
+       res = execute_action_list(test->t_prep_actions, test->t_n_prep_actions, 1);
+       
+       /* If prep succeeded */
+       if (0 == res) {
+               /* Create kqueue for kqueue tests*/
+               if (!test->t_is_poll_test) {
+                       if ((kqfd = kqueue()) == -1) {
+                               T_LOG("kqueue() failed: %d (%s)\n", errno, strerror(errno));
+                       }
+               }
+               
+               if ((test->t_is_poll_test) || kqfd >= 0) {
+                       
+                       /* Open the file we're to monitor.  Fifos get special handling */
+                       if (test->t_file_is_fifo) {
+                               filefd = -1;
+                               open_fifo(test->t_watchfile, &filefd, &writefd);
+                       } else {
+                               if ((filefd = open(test->t_watchfile, O_RDONLY | O_SYMLINK)) == -1) {
+                                       T_LOG("open() of watchfile %s failed: %d (%s)\n", test->t_watchfile,
+                                             errno, strerror(errno));
+                               }
+                       }
+                       
+                       if (filefd >= 0) {
+                               T_LOG("Opened file to monitor.\n");
+                               
+                               /* 
+                                * Fill in the fd to monitor once you know it 
+                                * If it's a fifo test, then the helper is definitely going to want the write end.
+                                */
+                               test->t_helpthreadact.act_fd = (writefd >= 0 ? writefd : filefd);
+                               
+                               if (test->t_read_to_end_first) {
+                                       read_to_end(filefd);
+                               } else if (test->t_write_some_data) {
+                                       action_t dowr;
+                                       init_action(&dowr, NOSLEEP, WRITEFD, 0);
+                                       dowr.act_fd = writefd;
+                                       (void)execute_action(&dowr);
+                               }
+                               
+                               /* Helper modifies the file that we're listening on (sleeps first, in general) */
+                               thread_status = 0;
+                               res = pthread_create(&thr, NULL, execute_action, (void*) &test->t_helpthreadact);
+                               if (0 == res) {
+                                       T_LOG("Created helper thread.\n");
+                                       
+                                       /* This is ugly business to hack on filling up a FIFO */
+                                       if (test->t_extra_sleep_hack) {
+                                               usleep(USLEEP_TIME);
+                                       }
+                                       
+                                       if (test->t_is_poll_test) {
+                                               struct pollfd pl;
+                                               pl.fd = filefd;
+                                               pl.events = test->t_union.tu_pollevents;
+                                               cnt = poll(&pl, 1, WAIT_TIME);
+                                               T_LOG("Finished poll() call.\n");
+                                               if ((cnt < 0)) {
+                                                       T_LOG("error is in errno, %s\n", strerror(errno));
+                                                       res = cnt;
+                                               }
+                                       } else {
+                                               test->t_union.tu_kev.ident = filefd; 
+                                               cnt = kevent(kqfd, &test->t_union.tu_kev, 1, &evlist, 1,  &ts);
+                                               T_LOG("Finished kevent() call.\n");
+                                               
+                                               if ((cnt < 0) || (evlist.flags & EV_ERROR))  {
+                                                       T_LOG("kevent() call failed.\n");
+                                                       if (cnt < 0) {
+                                                               T_LOG("error is in errno, %s\n", strerror(errno));
+                                                       } else {
+                                                               T_LOG("error is in data, %s\n", strerror(evlist.data));
+                                                       }
+                                                       res = cnt;
+                                               }
+                                       }
+                                       
+                                       /* Success only if you've succeeded to this point AND joined AND other thread is happy*/
+                                       status = NULL;
+                                       res2 = pthread_join(thr, (void **)&status);
+                                       if (res2 != 0) {
+                                               T_LOG("Couldn't join helper thread: %d (%s).\n", res2,
+                                                       strerror(res2));
+                                       } else if (*status) {
+                                               T_LOG("Helper action had result %d\n", *status);
+                                       }
+                                       res = ((res == 0) && (res2 == 0) && (*status == 0)) ? 0 : -1;
+                               } else {
+                                       T_LOG("Couldn't start thread: %d (%s).\n", res, strerror(res));
+                               }
+                               
+                               close(filefd);
+                               if (test->t_file_is_fifo) {
+                                       close(writefd);
+                               }
+                       } else {
+				T_LOG("Couldn't open test file %s to monitor: %d (%s)\n", test->t_watchfile,
+				      errno, strerror(errno));
+                               res = -1;
+                       }
+                       if (!test->t_is_poll_test) {
+                               close(kqfd);
+                       }
+               } else {
+                       T_LOG("Couldn't open kqueue.\n");
+                       res = -1;
+               }
+       }
+       
+       /* Cleanup work */
+       execute_action_list(test->t_cleanup_actions, test->t_n_cleanup_actions, 0);
+       
+       /* Success if nothing failed and we either received or did not receive event,
+        * as expected 
+        */
+       if (0 == res) {
+               T_LOG(cnt > 0 ? "Got an event.\n" : "Did not get an event.\n");
+               if (((cnt > 0) && (test->t_want_event)) || ((cnt == 0) && (!test->t_want_event))) {
+                       if ((!test->t_is_poll_test) && (test->t_union.tu_kev.filter == EVFILT_READ || test->t_union.tu_kev.filter == EVFILT_WRITE)
+                               && (test->t_nbytes) && (test->t_nbytes != evlist.data)) {
+				T_LOG("Read wrong number of bytes available.  Wanted %d, got %lld\n", test->t_nbytes, (long long)evlist.data);
+                               retval = -1;
+                       } else {
+                               retval = 0;
+                       }
+                       
+               } else {
+                       T_LOG("Got unexpected event or lack thereof.\n");
+                       retval = -1;
+               }
+       } else {
+               T_LOG("Failed to execute test. res = %d\n", res);
+               retval = -1;
+       }
+
+       if (test->t_nondeterministic) {
+               T_LOG("XXX non-deterministic test result = %d (%s)\n", retval,
+                       (retval == 0) ? "pass" : "fail");
+               T_MAYFAIL;
+       } else {
+               if (test->t_known_failure) {
+                       // Signal to harness that this test is expected to fail.
+                       T_EXPECTFAIL;
+               }
+       }
+
+       if (retval == 0) {
+               T_PASS("%s", test->t_testname);
+       } else {
+               T_FAIL("%s", test->t_testname);
+       }
+
+       T_LOG("Test %s done with result %d.\n", test->t_testname, retval);
+       return (retval);
+}
+
+
+
+void
+init_test_common(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want, int ispoll)
+{
+       memset(tst, 0, sizeof(test_t));
+       tst->t_testname = testname;
+       tst->t_known_failure = 0;
+       tst->t_nondeterministic = 0;
+       tst->t_watchfile = watchfile;
+       tst->t_n_prep_actions = nprep;
+       tst->t_n_cleanup_actions = nclean;
+       tst->t_want_event = (want > 0);
+       
+       if (ispoll) {
+               tst->t_is_poll_test = 1;
+               tst->t_union.tu_pollevents = (short)event;
+       } else {
+               /* Can do this because filter is negative, notes are positive */
+               if (event == EVFILT_READ || event == EVFILT_WRITE) {
+                       EV_SET(&tst->t_union.tu_kev, 0, event, EV_ADD | EV_ENABLE, 0, 0, NULL);
+                       tst->t_nbytes = want;
+               } else {
+                       EV_SET(&tst->t_union.tu_kev, 0, EVFILT_VNODE, EV_ADD | EV_ENABLE, event, 0, NULL);
+               }
+       }
+}
+
+/*
+ * Initialize a test case, not including its actions.  Meaning: a name for it, what filename to watch,
+ * counts of prep and cleanup actions, what event to watch for, and whether you want an event/how many bytes read.
+ *
+ * "want" does double duty as whether you want an event and how many bytes you might want to read
+ * "event" is either an event flag (e.g. NOTE_WRITE) or EVFILT_READ
+ */    
+void 
+init_test(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want) 
+{
+       init_test_common(tst, testname, watchfile, nprep, nclean, event, want, 0);
+}
+
+/*
+ * Same as above, but for a poll() test
+ */
+void
+init_poll_test(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want) 
+{
+       init_test_common(tst, testname, watchfile, nprep, nclean, event, want, 1);
+}
+
+void 
+run_note_delete_tests() 
+{
+       test_t test;
+       
+       init_test(&test, "1.1.2: unlink a file", FILE1, 1, 0, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.1.3: rmdir a dir", DIR1, 1, 0, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.1.4: rename one file over another", FILE2, 2, 1, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.1.5: rename one dir over another", DIR2, 2, 1, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+       execute_test(&test);
+       
+       /* FIFO variants of the delete tests */
+       init_test(&test, "1.1.6: make a fifo, unlink it", FILE1, 1, 0, NOTE_DELETE, YES_EVENT);
+       test.t_file_is_fifo = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+       
+       init_test(&test, "1.1.7: rename a file over a fifo", FILE1, 2, 1, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       test.t_file_is_fifo = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE2, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.1.8: unlink a symlink to a file", FILE2, 2, 1, NOTE_DELETE, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       /* ================= */
+       
+       init_test(&test, "1.2.1: Straight-up rename file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.2.2: Straight-up rename dir", DIR1, 1, 1, NOTE_DELETE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.2.3: Null action on file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 2, NULL, NULL); /* The null action */
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.2.4: Rename one file over another: watch the file that lives", FILE1, 2, 1, NOTE_DELETE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "1.2.5: Rename one dir over another, watch the dir that lives", DIR1, 2, 1, NOTE_DELETE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+}
+
+static bool
+path_on_apfs(const char *path)
+{
+       struct statfs sfs = {};
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(path, &sfs), NULL);
+       return (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) == 0);
+}
+
+void 
+run_note_write_tests()
+{
+       char pathbuf[50];
+       char otherpathbuf[50];
+       
+       test_t test;
+       
+       init_test(&test, "2.1.1: Straight-up write to a file", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.2: creat() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.3: open() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.4: unlink a file from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       makepath(otherpathbuf, DIR1, FILE2);
+       init_test(&test, "2.1.5: rename a file in a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.6: rename a file to outside of a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.7: rename a file into a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.9: unlink a fifo from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKFIFO, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.10: make symlink in a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.1.12: write to a FIFO", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, WRITEFD, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "2.1.13: delete a symlink in a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+
+       /* exchangedata is not supported on APFS volumes */
+       if (!path_on_apfs(PDIR)) {
+               /* This actually should not generate an event, though it's in this section */
+               makepath(pathbuf, DIR1, FILE1);
+               makepath(otherpathbuf, DIR1, FILE2);
+               init_test(&test, "2.1.14: exchangedata two files in a dir", DIR1, 3, 3, NOTE_WRITE, NO_EVENT);
+               test.t_known_failure = 1;
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+               init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+               init_action(&(test.t_prep_actions[2]), NOSLEEP, CREAT, 2, (void*)otherpathbuf, (void*)NULL);
+               init_action(&test.t_helpthreadact, SLEEP, EXCHANGEDATA, 2, (void*)pathbuf, (void*)otherpathbuf);
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+               init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL);
+               init_action(&test.t_cleanup_actions[2], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+               execute_test(&test);
+       }
+
+       init_test(&test, "2.1.15: Change a file with mmap()", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, MMAP, 2, (void*)FILE1, (void*)1); /* 1 -> "modify it"*/
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       /*================= no-event tests ==================*/
+       init_test(&test, "2.2.1: just open and close existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.2: read from existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, READ, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.3: rename existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.4: just open and close dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       /* There are no tests 2.2.5 or 2.2.6 */
+       
+       init_test(&test, "2.2.7: rename a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.8: rename a fifo", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       test.t_file_is_fifo = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.9: unlink a fifo", FILE1, 1, 0, NOTE_WRITE, NO_EVENT);
+       test.t_file_is_fifo = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+       
+       init_test(&test, "2.2.10: chmod a file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       struct passwd *pwd = getpwnam("local");
+
+       if (pwd != NULL) {
+               init_test(&test, "2.2.11: chown a file", FILE1, 2, 1, NOTE_WRITE, NO_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid());
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+               execute_test(&test);
+       }
+       
+       init_test(&test, "2.2.12: chmod a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       if (pwd != NULL) {
+               init_test(&test, "2.2.13: chown a dir", DIR1, 2, 1, NOTE_WRITE, NO_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid());
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+               execute_test(&test);
+       }
+       
+       T_LOG("MMAP will never give a notification on HFS.\n");
+       init_test(&test, "2.2.14: mmap() a file but do not change it", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, MMAP, 2, (void*)FILE1, (void*)0); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+}
+
+void
+run_note_extend_tests()
+{
+       test_t test;
+       char pathbuf[50];
+       
+       T_LOG("THESE TESTS MAY FAIL ON HFS\n");
+       
+       init_test(&test, "3.1.1: write beyond the end of a file", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       /*
+        * We won't concern ourselves with lengthening directories: commenting these out  
+        *
+        
+        makepath(pathbuf, DIR1, FILE1);
+        init_test(&test, "3.1.2: add a file to a directory with creat()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
+        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+        init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
+        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+        execute_test(&test);
+        
+        makepath(pathbuf, DIR1, FILE1);
+        init_test(&test, "3.1.3: add a file to a directory with open()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
+        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+        init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
+        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+        execute_test(&test);
+        
+        makepath(pathbuf, DIR1, FILE1);
+        init_test(&test, "3.1.4: add a file to a directory with rename()", DIR1, 2, 2, NOTE_EXTEND, YES_EVENT);
+        init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+        init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+        init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf); 
+        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+        execute_test(&test);
+        */
+       
+       /* 3.1.5: a placeholder for a potential kernel test */
+       /*
+        makepath(pathbuf, DIR1, DIR2);
+        init_test(&test, "3.1.6: add a file to a directory with mkdir()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
+        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+        init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); 
+        init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
+        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+        execute_test(&test);
+        */
+       init_test(&test, "3.1.7: lengthen a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 2, FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       
+       /** ========== NO EVENT SECTION ============== **/
+       init_test(&test, "3.2.1: setxattr() a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "3.2.2: chmod a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       struct passwd *pwd = getpwnam("local");
+       if (pwd != NULL) {
+               init_test(&test, "3.2.3: chown a file", FILE1, 2, 1, NOTE_EXTEND, NO_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid());
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+               execute_test(&test);
+       } else {
+               T_LOG("Couldn't getpwnam for user \"local\"\n");
+       }
+       
+       init_test(&test, "3.2.4: chmod a dir", DIR1, 1, 1, NOTE_EXTEND, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       if (pwd != NULL) {
+               init_test(&test, "3.2.5: chown a dir", DIR1, 2, 1, NOTE_EXTEND, NO_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid());
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+               execute_test(&test);
+       }
+       
+       init_test(&test, "3.2.6: TRUNC a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, TRUNC, 2, FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+}
+
+void
+run_note_attrib_tests()
+{
+       test_t test;
+       char pathbuf[50];
+       
+       init_test(&test, "4.1.1: chmod a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, FILE1, (void*)0700); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       struct passwd *pwd = getpwnam("local");
+       if (pwd != NULL) {
+               init_test(&test, "4.1.2: chown a file", FILE1, 2, 1, NOTE_ATTRIB, YES_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+               init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, FILE1, (void*)getuid(), (void*)pwd->pw_gid);
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+               execute_test(&test);
+       }
+
+       init_test(&test, "4.1.3: chmod a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_helpthreadact), SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       if (pwd != NULL) {
+               init_test(&test, "4.1.4: chown a dir", DIR1, 2, 1, NOTE_ATTRIB, YES_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+               init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)DIR1, (void*) pwd->pw_uid, (void*)pwd->pw_gid);
+               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, DIR1, (void*)getuid(), (void*)getgid());
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+               execute_test(&test);
+       }
+       
+       init_test(&test, "4.1.5: setxattr on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.1.6: setxattr on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)DIR1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+
+       /* exchangedata is not supported on APFS volumes */
+       if (!path_on_apfs(PDIR)) {
+               init_test(&test, "4.1.7: exchangedata", FILE1, 2, 2, NOTE_ATTRIB, YES_EVENT);
+               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+               init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+               init_action(&test.t_helpthreadact, SLEEP, EXCHANGEDATA, 2, (void*)FILE1, (void*)FILE2); 
+               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+               init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
+               execute_test(&test);
+       }
+
+       init_test(&test, "4.1.8: utimes on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.1.9: utimes on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)DIR1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       
+       /* ====== NO EVENT TESTS ========== */
+       
+       init_test(&test, "4.2.1: rename a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.2: open (do not change) a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.3: stat a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, STAT, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.4: unlink a file", FILE1, 1, 0, NOTE_ATTRIB, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.5: write to a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       T_LOG("EXPECT SPURIOUS NOTE_ATTRIB EVENTS FROM DIRECTORY OPERATIONS on HFS.\n");
+       init_test(&test, "4.2.6: add a file to a directory with creat()", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, FILE1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.7: mkdir in a dir", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, DIR2);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.8: add a symlink to a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, FILE1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.9: rename into a dir()", DIR1, 2, 2, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, FILE1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.10: unlink() file from dir", DIR1, 2, 1, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, FILE1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       init_test(&test, "4.2.11: mkfifo in a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
+       test.t_known_failure = 1;
+       makepath(pathbuf, DIR1, FILE1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, MKFIFO, 1, (void*)pathbuf); 
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       
+}
+
+
+void 
+run_note_link_tests()
+{
+       test_t test;
+       char pathbuf[50];
+       char otherpathbuf[50];
+       
+       T_LOG("HFS DOES NOT HANDLE UNLINK CORRECTLY...\n");
+       init_test(&test, "5.1.1: unlink() a file", FILE1, 1, 0, NOTE_LINK, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
+       execute_test(&test);
+       
+       
+       init_test(&test, "5.1.1.5: link A to B, watch A, remove B", FILE1, 2, 1, NOTE_LINK, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "5.1.2: link() to a file", FILE1, 1, 2, NOTE_LINK, YES_EVENT);
+#if TARGET_OS_WATCH
+       test.t_nondeterministic = 1;
+#endif
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, DIR2);
+       init_test(&test, "5.1.3: make one dir in another", DIR1, 1, 2, NOTE_LINK, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, DIR2);
+       init_test(&test, "5.1.4: rmdir a dir from within another", DIR1, 2, 1, NOTE_LINK, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, DIR2);
+       makepath(otherpathbuf, DIR1, DIR1);
+       init_test(&test, "5.1.5: rename dir A over dir B inside dir C", DIR1, 3, 2, NOTE_LINK, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&(test.t_prep_actions[2]), NOSLEEP, MKDIR, 2, (void*)otherpathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)otherpathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       T_LOG("HFS bypasses hfs_makenode to create in target, so misses knote.\n");
+       makepath(pathbuf, DIR1, DIR2);
+       init_test(&test, "5.1.6: rename one dir into another", DIR1, 2, 2, NOTE_LINK, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR2, (void*)pathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       T_LOG("HFS bypasses hfs_removedir to remove from source, so misses knote.\n");
+       makepath(pathbuf, DIR1, DIR2);
+       init_test(&test, "5.1.7: rename one dir out of another", DIR1, 2, 2, NOTE_LINK, YES_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "5.1.8: rmdir a dir", DIR1, 1, 0, NOTE_LINK, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
+       execute_test(&test);
+       
+       /* ============= NO EVENT SECTION ============== */
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "5.2.1: make a file in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "5.2.2: unlink a file in a dir", DIR1, 2, 1, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       makepath(otherpathbuf, DIR1, FILE2);
+       init_test(&test, "5.2.3: rename a file within a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "5.2.4: rename a file into a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       makepath(pathbuf, DIR1, FILE1);
+       init_test(&test, "5.2.5: make a symlink in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "5.2.6: make a symlink to a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
+       test.t_known_failure = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DIR1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "5.2.7: make a symlink to a file", FILE1, 1, 2, NOTE_LINK, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+}
+
+void
+run_note_rename_tests() 
+{
+       test_t test;
+       
+       init_test(&test, "6.1.1: rename a file", FILE1, 1, 1, NOTE_RENAME, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.1.2: rename a dir", DIR1, 1, 1, NOTE_RENAME, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.1.3: rename one file over another", FILE1, 2, 1, NOTE_RENAME, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.1.4: rename one dir over another", DIR1, 2, 1, NOTE_RENAME, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+       execute_test(&test);
+       
+       /* ========= NO EVENT SECTION =========== */
+       
+       init_test(&test, "6.2.1: unlink a file", FILE1, 1, 0, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.2: rmdir a dir", DIR1, 1, 0, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.3: link() to a file", FILE1, 1, 2, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.4: rename one file over another: watch deceased", 
+                         FILE2, 2, 1, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.5: rename one dir over another: watch deceased", 
+                         DIR2, 2, 1, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.6: rename a file to itself", FILE1, 1, 1, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "6.2.7: rename a dir to itself", DIR1, 1, 1, NOTE_RENAME, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
+       execute_test(&test);
+}
+
+void 
+run_note_revoke_tests() 
+{
+       test_t test;
+       init_test(&test, "7.1.1: revoke file", FILE1, 1, 1, NOTE_REVOKE, YES_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, REVOKE, 1, (void*)FILE1);
+       init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+       
+       init_test(&test, "7.2.1: delete file", FILE1, 1, 0, NOTE_REVOKE, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+}
+
+
+void
+run_evfilt_read_tests() 
+{
+       test_t test;
+       init_test(&test, "8.1.1: how much data in file of length LENGTHEN_SIZE?", FILE1, 2, 1, EVFILT_READ, LENGTHEN_SIZE);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, LENGTHEN, 2, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "8.1.2: block, then write to file", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING));
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "8.1.3: block, then extend", FILE1, 2, 1, EVFILT_READ, LENGTHEN_SIZE);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "8.1.4: block, then seek to beginning", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING));
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
+       test.t_read_to_end_first = 1; /* hack: read to EOF first, so we're already at EOF when we block */
+       init_action(&test.t_helpthreadact, SLEEP, LSEEK, 1, (void*)0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       
+       init_test(&test, "8.1.5: block, then write to fifo", FILE1, 1, 1, EVFILT_READ, strlen(TEST_STRING));
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       /* No result section... */
+       init_test(&test, "8.2.1: just rename", FILE1, 2, 1, EVFILT_READ, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "8.2.2: delete file", FILE1, 2, 0, EVFILT_READ, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
+       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+       
+       init_test(&test, "8.2.3: write to beginning", FILE1, 2, 1, EVFILT_READ, NO_EVENT);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
+       test.t_read_to_end_first = 1; /* hack: read to EOF first, so we're already at EOF when we block */
+       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 1, (void*)FILE1);
+       execute_test(&test);
+       
+       init_test(&test, "8.2.4: block, then seek to current location", FILE1, 2, 1, EVFILT_READ, 0);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
+       test.t_read_to_end_first = 1; /* hack: read to EOF first, so we're already at EOF when we block */
+       init_action(&test.t_helpthreadact, SLEEP, LSEEK, 1, (void*)strlen(TEST_STRING));
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "8.2.5: trying to read from empty fifo", FILE1, 1, 1, EVFILT_READ, 0);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 1, (void*)0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+}
+
+
+
+void*
+read_from_fd(void *arg)
+{
+       char buf[50];
+       int fd = (int) arg;
+       usleep(USLEEP_TIME);
+       return (void*) read(fd, buf, sizeof(buf));
+}
+
+void*
+write_to_fd(void *arg)
+{
+       char buf[50];
+       int fd = (int) arg;
+       usleep(USLEEP_TIME);
+       return (void*) write(fd, buf, sizeof(buf));
+}
+
+/*
+ * We don't (in principle) support EVFILT_WRITE for vnodes; thus, only FIFOs are tested here.
+ */
+void 
+run_evfilt_write_tests()
+{
+       
+       test_t test;
+       init_test(&test, "9.1.1: how much space in empty fifo?", FILE1, 1, 1, EVFILT_WRITE, FIFO_SPACE);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "9.1.2: how much space in slightly written fifo?", FILE1, 1, 1, EVFILT_WRITE, FIFO_SPACE - strlen(TEST_STRING));
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       test.t_write_some_data = 1;
+       init_action(&(test.t_helpthreadact), NOSLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_test(&test, "9.2.1: how much space in a full fifo?", FILE1, 1, 1, EVFILT_WRITE, 0);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_nondeterministic = 1;
+       test.t_file_is_fifo = 1;
+       test.t_extra_sleep_hack = 1;
+       init_action(&(test.t_helpthreadact), NOSLEEP, FILLFD, 1, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+}
+
+void
+run_poll_tests()
+{
+       test_t test;
+       init_poll_test(&test, "10.1.1: does poll say I can write a regular file?", FILE1, 1, 1, POLLWRNORM, 1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_poll_test(&test, "10.1.2: does poll say I can write an empty FIFO?", FILE1, 1, 1, POLLWRNORM, 1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_poll_test(&test, "10.1.3: does poll say I can read a nonempty FIFO?", FILE1, 1, 1, POLLRDNORM, 1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       test.t_write_some_data = 1;
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_poll_test(&test, "10.1.4: does poll say I can read a nonempty regular file?", FILE1, 2, 1, POLLRDNORM, 1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
+       init_action(&(test.t_prep_actions[1]), NOSLEEP, LENGTHEN, 1, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_poll_test(&test, "10.1.5: does poll say I can read an empty file?", FILE1, 1, 1, POLLRDNORM, 1);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       
+       
+       
+       init_poll_test(&test, "10.2.2: does poll say I can read an empty FIFO?", FILE1, 1, 1, POLLRDNORM, 0);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_file_is_fifo = 1;
+       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       execute_test(&test);
+       
+       init_poll_test(&test, "10.2.3: does poll say I can write a full FIFO?", FILE1, 1, 1, POLLWRNORM, 0);
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
+       test.t_nondeterministic = 1;
+       test.t_file_is_fifo = 1;
+       test.t_extra_sleep_hack = 1;
+       init_action(&(test.t_helpthreadact), NOSLEEP, FILLFD, 1, (void*)FILE1, (void*)NULL);
+       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
+       test.t_known_failure = 1;
+       execute_test(&test);
+}
+
+void
+run_note_funlock_tests()
+{
+       test_t test;
+       init_test(&test, "11.1.1: unlock file", FILE1, 1, 1, NOTE_FUNLOCK, YES_EVENT);
+       test.t_nondeterministic = 1;
+       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void *)NULL);
+       init_action(&test.t_helpthreadact, SLEEP, FUNLOCK, 2, (void*)FILE1, (void *)NULL);
+       init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 2, (void*)FILE1, (void *)NULL);
+       execute_test(&test);
+}
+
+void
+run_all_tests() 
+{
+       run_note_delete_tests();
+       run_note_write_tests();
+       run_note_extend_tests();
+       run_note_attrib_tests();
+       run_note_link_tests();
+       run_note_rename_tests();
+#if 0
+       run_note_revoke_tests(); /* Can no longer revoke a regular file--need an unmount test */
+#endif /* 0 */
+       run_evfilt_read_tests();
+       run_evfilt_write_tests();
+       run_poll_tests();
+       run_note_funlock_tests();
+}
+
+T_DECL(kqueue_file_tests,
+       "Tests assorted kqueue operations for file-related events")
+{
+       char *which = NULL;
+       if (argc > 1) {
+               which = argv[1];
+       }
+       
+       T_SETUPBEGIN;
+       rmdir(DIR1);
+       rmdir(DIR2);
+       T_SETUPEND;
+
+       if ((!which) || (strcmp(which, "all") == 0))
+               run_all_tests();
+       else if (strcmp(which, "delete") == 0) 
+               run_note_delete_tests();
+       else if (strcmp(which, "write") == 0)
+               run_note_write_tests();
+       else if (strcmp(which, "extend") == 0)
+               run_note_extend_tests();
+       else if (strcmp(which, "attrib") == 0)
+               run_note_attrib_tests();
+       else if (strcmp(which, "link") == 0)
+               run_note_link_tests();
+       else if (strcmp(which, "rename") == 0)
+               run_note_rename_tests();
+       else if (strcmp(which, "revoke") == 0)
+               run_note_revoke_tests();
+       else if (strcmp(which, "evfiltread") == 0)
+               run_evfilt_read_tests();
+       else if (strcmp(which, "evfiltwrite") == 0)
+               run_evfilt_write_tests();
+       else if (strcmp(which, "poll") == 0)
+               run_poll_tests();
+       else if (strcmp(which, "funlock") == 0)
+               run_note_funlock_tests();
+       else {
+               fprintf(stderr, "Valid options are:\n\tdelete, write, extend, "
+                               "attrib, link, rename, revoke, evfiltread, "
+                               "fifo, all, evfiltwrite, funlock<none>\n");
+               exit(1);
+       }
+}
+
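For orientation, the init_test()/init_action()/execute_test() harness used throughout this file wraps an ordinary EVFILT_VNODE registration. The sketch below shows, under stated assumptions (hypothetical path /tmp/ftest1, no error handling), roughly what a single test boils down to; it is illustrative only and not part of this commit.

    #include <sys/event.h>
    #include <fcntl.h>
    #include <unistd.h>

    int
    main(void)
    {
            /* Watch a file for the vnode notes exercised by these tests (sketch only). */
            int fd = open("/tmp/ftest1", O_RDONLY);  /* hypothetical target file */
            int kq = kqueue();
            struct kevent change, event;

            EV_SET(&change, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
                NOTE_DELETE | NOTE_WRITE | NOTE_EXTEND | NOTE_ATTRIB |
                NOTE_LINK | NOTE_RENAME, 0, NULL);
            kevent(kq, &change, 1, NULL, 0, NULL);   /* register the filter */
            kevent(kq, NULL, 0, &event, 1, NULL);    /* block until one of the notes fires */

            close(fd);
            close(kq);
            return 0;
    }

EV_CLEAR makes the registration edge-triggered, so each kevent() wait reports only activity that happened since the previous call.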
diff --git a/tests/kqueue_timer_tests.c b/tests/kqueue_timer_tests.c
new file mode 100644 (file)
index 0000000..e02deb4
--- /dev/null
@@ -0,0 +1,437 @@
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <mach/mach.h>
+#include <mach/task.h>
+
+#include <TargetConditionals.h>
+#include <darwintest.h>
+
+#ifndef NOTE_MACHTIME
+#define NOTE_MACHTIME   0x00000100
+#endif
+
+static mach_timebase_info_data_t timebase_info;
+
+static uint64_t nanos_to_abs(uint64_t nanos) { return nanos * timebase_info.denom / timebase_info.numer; }
+static uint64_t abs_to_nanos(uint64_t abs)   { return abs * timebase_info.numer / timebase_info.denom; }
+
+static int kq, passed, failed;
+
+static struct timespec failure_timeout = { .tv_sec = 10, .tv_nsec = 0 };
+
+/*
+ * Wait for given kevent, which should return in 'expected' usecs.
+ */
+static int
+do_simple_kevent(struct kevent64_s *kev, uint64_t expected)
+{
+       int ret;
+       int64_t elapsed_usecs;
+       uint64_t delta_usecs;
+       struct timespec timeout;
+       struct timeval before, after;
+
+       /* time out after 1 sec extra delay */
+       timeout.tv_sec = (expected / USEC_PER_SEC) + 1;
+       timeout.tv_nsec = (expected % USEC_PER_SEC) * 1000;
+
+       T_SETUPBEGIN;
+
+       /* measure time for the kevent */
+       gettimeofday(&before, NULL);
+       ret = kevent64(kq, kev, 1, kev, 1, 0, &timeout);
+       gettimeofday(&after, NULL);
+
+       if (ret < 1 || (kev->flags & EV_ERROR)) {
+               T_LOG("%s() failure: kevent returned %d, error %d\n", __func__, ret,
+                               (ret == -1 ? errno : (int) kev->data));
+               return 0;
+       }
+
+       T_SETUPEND;
+
+       /* did it work? */
+       elapsed_usecs = (after.tv_sec - before.tv_sec) * (int64_t)USEC_PER_SEC +
+               (after.tv_usec - before.tv_usec);
+       delta_usecs = (uint64_t)llabs(elapsed_usecs - ((int64_t)expected));
+
+       /* failure if we're 30% off, or 50 usec late */
+       if (delta_usecs > (30 * expected / 100.0) && delta_usecs > 50) {
+               T_LOG("\tfailure: expected %lld usec, measured %lld usec.\n",
+                               expected, elapsed_usecs);
+               return 0;
+       } else {
+               T_LOG("\tsuccess, measured %lld usec.\n", elapsed_usecs);
+               return 1;
+       }
+}
+
+static void
+test_absolute_kevent(int time, int scale)
+{
+       struct timeval tv;
+       struct kevent64_s kev;
+       uint64_t nowus, expected, timescale = 0;
+       int ret;
+       int64_t deadline;
+
+       gettimeofday(&tv, NULL);
+       nowus = (uint64_t)tv.tv_sec * USEC_PER_SEC + (uint64_t)tv.tv_usec;
+
+       T_SETUPBEGIN;
+
+       switch (scale) {
+       case NOTE_MACHTIME:
+               T_LOG("Testing %d MATUs absolute timer...\n", time);
+               break;
+       case NOTE_SECONDS:
+               T_LOG("Testing %d sec absolute timer...\n", time);
+               timescale = USEC_PER_SEC;
+               break;
+       case NOTE_USECONDS:
+               T_LOG("Testing %d usec absolute timer...\n", time);
+               timescale = 1;
+               break;
+       case 0:
+               T_LOG("Testing %d msec absolute timer...\n", time);
+               timescale = 1000;
+               break;
+       default:
+               T_FAIL("Failure: scale 0x%x not recognized.\n", scale);
+               return;
+       }
+
+       T_SETUPEND;
+
+       if (scale == NOTE_MACHTIME) {
+               expected = abs_to_nanos((uint64_t)time) / NSEC_PER_USEC;
+               deadline = (int64_t)mach_absolute_time() + time;
+       } else {
+               expected = (uint64_t)time * timescale;
+               deadline = (int64_t)(nowus / timescale) + time;
+       }
+
+       /* deadlines in the past should fire immediately */
+       if (time < 0)
+               expected = 0;
+
+       EV_SET64(&kev, 1, EVFILT_TIMER, EV_ADD,
+                       NOTE_ABSOLUTE | scale, deadline, 0,0,0);
+       ret = do_simple_kevent(&kev, expected);
+
+       if (ret) {
+               passed++;
+               T_PASS("%s time:%d, scale:0x%x", __func__, time, scale);
+       } else {
+               failed++;
+               T_FAIL("%s time:%d, scale:0x%x", __func__, time, scale);
+       }
+}
+
+static void
+test_oneshot_kevent(int time, int scale)
+{
+       int ret;
+       uint64_t expected = 0;
+       struct kevent64_s kev;
+
+       T_SETUPBEGIN;
+
+       switch (scale) {
+       case NOTE_MACHTIME:
+               T_LOG("Testing %d MATUs interval timer...\n", time);
+               expected = abs_to_nanos((uint64_t)time) / NSEC_PER_USEC;
+               break;
+       case NOTE_SECONDS:
+               T_LOG("Testing %d sec interval timer...\n", time);
+               expected = (uint64_t)time * USEC_PER_SEC;
+               break;
+       case NOTE_USECONDS:
+               T_LOG("Testing %d usec interval timer...\n", time);
+               expected = (uint64_t)time;
+               break;
+       case NOTE_NSECONDS:
+               T_LOG("Testing %d nsec interval timer...\n", time);
+               expected = (uint64_t)time / 1000;
+               break;
+       case 0:
+               T_LOG("Testing %d msec interval timer...\n", time);
+               expected = (uint64_t)time * 1000;
+               break;
+       default:
+               T_FAIL("Failure: scale 0x%x not recognized.\n", scale);
+               return;
+       }
+
+       T_SETUPEND;
+
+       /* deadlines in the past should fire immediately */
+       if (time < 0)
+               expected = 0;
+
+       EV_SET64(&kev, 2, EVFILT_TIMER, EV_ADD | EV_ONESHOT, scale, time,
+                       0, 0, 0);
+       ret = do_simple_kevent(&kev, expected);
+
+       if (ret) {
+               passed++;
+               T_PASS("%s time:%d, scale:0x%x", __func__, time, scale);
+       } else {
+               failed++;
+               T_FAIL("%s time:%d, scale:0x%x", __func__, time, scale);
+       }
+}
+
+/* Test that the timer goes ding multiple times */
+static void
+test_interval_kevent(int usec)
+{
+       struct kevent64_s kev;
+       int ret;
+
+       T_SETUPBEGIN;
+
+       uint64_t test_duration_us = USEC_PER_SEC; /* 1 second */
+       uint64_t expected_pops;
+
+       if (usec < 0)
+               expected_pops = 1; /* TODO: test 'and only once' */
+       else
+               expected_pops = test_duration_us / (uint64_t)usec;
+
+       T_LOG("Testing interval kevent at %d usec intervals (%lld pops/second)...\n",
+               usec, expected_pops);
+
+       EV_SET64(&kev, 3, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, usec, 0, 0, 0);
+       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
+       if (ret != 0 || (kev.flags & EV_ERROR)) {
+               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
+               failed++;
+               return;
+       }
+
+       T_SETUPEND;
+
+       struct timeval before, after;
+       uint64_t elapsed_usecs;
+
+       gettimeofday(&before, NULL);
+
+       uint64_t pops = 0;
+
+       for (uint32_t i = 0; i < expected_pops; i++) {
+               ret = kevent64(kq, NULL, 0, &kev, 1, 0, &failure_timeout);
+               if (ret != 1) {
+                       T_FAIL("%s() failure: kevent64 returned %d\n", __func__, ret);
+                       failed++;
+                       return;
+               }
+
+               //T_LOG("\t ding: %lld\n", kev.data);
+
+               pops += (uint64_t)kev.data;
+               gettimeofday(&after, NULL);
+               elapsed_usecs = (uint64_t)((after.tv_sec - before.tv_sec) * (int64_t)USEC_PER_SEC +
+                       (after.tv_usec - before.tv_usec));
+
+               if (elapsed_usecs > test_duration_us)
+                       break;
+       }
+
+       /* check how many times the timer fired: within 5%? */
+       if (pops > expected_pops + (expected_pops / 20) ||
+               pops < expected_pops - (expected_pops / 20)) {
+               T_FAIL("%s() usec:%d (saw %lld of %lld expected pops)", __func__, usec, pops, expected_pops);
+               failed++;
+       } else {
+               T_PASS("%s() usec:%d (saw %lld pops)", __func__, usec, pops);
+               passed++;
+       }
+
+       EV_SET64(&kev, 3, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0);
+       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
+       if (ret != 0) {
+               T_LOG("\tfailed to stop repeating timer: %d\n", ret);
+       }
+}
+
+/* Test that the repeating timer repeats even while not polling in kqueue */
+static void
+test_repeating_kevent(int usec)
+{
+       struct kevent64_s kev;
+       int ret;
+
+       T_SETUPBEGIN;
+
+       uint64_t test_duration_us = USEC_PER_SEC; /* 1 second */
+
+       uint64_t expected_pops = test_duration_us / (uint64_t)usec;
+       T_LOG("Testing repeating kevent at %d usec intervals (%lld pops/second)...\n",
+               usec, expected_pops);
+
+       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, usec, 0, 0, 0);
+       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
+       if (ret != 0) {
+               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
+               failed++;
+               return;
+       }
+
+       usleep((useconds_t)test_duration_us);
+
+       ret = kevent64(kq, NULL, 0, &kev, 1, 0, &failure_timeout);
+       if (ret != 1 || (kev.flags & EV_ERROR)) {
+               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
+               failed++;
+               return;
+       }
+
+       T_SETUPEND;
+
+       uint64_t pops = (uint64_t) kev.data;
+
+       /* check how many times the timer fired: within 5%? */
+       if (pops > expected_pops + (expected_pops / 20) ||
+               pops < expected_pops - (expected_pops / 20)) {
+               T_FAIL("%s() usec:%d (saw %lld of %lld expected pops)", __func__, usec, pops, expected_pops);
+               failed++;
+       } else {
+               T_PASS("%s() usec:%d (saw %lld pops)", __func__, usec, pops);
+               passed++;
+       }
+
+       EV_SET64(&kev, 4, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0);
+       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
+       if (ret != 0) {
+               T_LOG("\tfailed to stop repeating timer: %d\n", ret);
+       }
+}
+
+
+static void
+test_updated_kevent(int first, int second)
+{
+       struct kevent64_s kev;
+       int ret;
+
+       T_LOG("Testing update from %d to %d msecs...\n", first, second);
+
+       T_SETUPBEGIN;
+
+       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ADD|EV_ONESHOT, 0, first, 0, 0, 0);
+       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
+       if (ret != 0) {
+               T_FAIL("%s() failure: initial kevent returned %d\n", __func__, ret);
+               failed++;
+               return;
+       }
+
+       T_SETUPEND;
+
+       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ONESHOT, 0, second, 0, 0, 0);
+
+       uint64_t expected_us = (uint64_t)second * 1000;
+
+       if (second < 0)
+               expected_us = 0;
+
+       ret = do_simple_kevent(&kev, expected_us);
+
+       if (ret) {
+               passed++;
+               T_PASS("%s() %d, %d", __func__, first, second);
+       } else {
+               failed++;
+               T_FAIL("%s() %d, %d", __func__, first, second);
+       }
+}
+
+static void
+disable_timer_coalescing(void)
+{
+       struct task_qos_policy  qosinfo;
+       kern_return_t           kr;
+
+       T_SETUPBEGIN;
+
+       qosinfo.task_latency_qos_tier = LATENCY_QOS_TIER_0;
+       qosinfo.task_throughput_qos_tier = THROUGHPUT_QOS_TIER_0;
+
+       kr = task_policy_set(mach_task_self(), TASK_OVERRIDE_QOS_POLICY, (task_policy_t)&qosinfo,
+                            TASK_QOS_POLICY_COUNT);
+       if (kr != KERN_SUCCESS) {
+               T_FAIL("task_policy_set(... TASK_OVERRIDE_QOS_POLICY ...) failed: %d (%s)", kr, mach_error_string(kr));
+       }
+
+       T_SETUPEND;
+}
+
+T_DECL(kqueue_timer_tests,
+       "Tests assorted kqueue operations for timer-related events")
+{
+       /*
+        * Since we're trying to test timers here, disable timer coalescing
+        * to improve the accuracy of timer fires for this process.
+        */
+       disable_timer_coalescing();
+
+       mach_timebase_info(&timebase_info);
+
+       kq = kqueue();
+       assert(kq > 0);
+       passed = 0;
+       failed = 0;
+
+       test_absolute_kevent(100, 0);
+       test_absolute_kevent(200, 0);
+       test_absolute_kevent(300, 0);
+       test_absolute_kevent(1000, 0);
+       T_MAYFAIL;
+       test_absolute_kevent(500, NOTE_USECONDS);
+       T_MAYFAIL;
+       test_absolute_kevent(100, NOTE_USECONDS);
+       T_MAYFAIL;
+       test_absolute_kevent(2, NOTE_SECONDS);
+       T_MAYFAIL;
+       test_absolute_kevent(-1000, 0);
+
+       T_MAYFAIL;
+       test_absolute_kevent((int)nanos_to_abs(10 * NSEC_PER_MSEC), NOTE_MACHTIME);
+
+       test_oneshot_kevent(1, NOTE_SECONDS);
+       T_MAYFAIL;
+       test_oneshot_kevent(10, 0);
+       T_MAYFAIL;
+       test_oneshot_kevent(200, NOTE_USECONDS);
+       T_MAYFAIL;
+       test_oneshot_kevent(300000, NOTE_NSECONDS);
+       T_MAYFAIL;
+       test_oneshot_kevent(-1, NOTE_SECONDS);
+
+       T_MAYFAIL;
+       test_oneshot_kevent((int)nanos_to_abs(10 * NSEC_PER_MSEC), NOTE_MACHTIME);
+
+       test_interval_kevent(250 * 1000);
+       T_MAYFAIL;
+       test_interval_kevent(5 * 1000);
+       T_MAYFAIL;
+       test_interval_kevent(200);
+       T_MAYFAIL;
+       test_interval_kevent(50);
+
+       test_interval_kevent(-1000);
+
+       test_repeating_kevent(10000); /* 10ms */
+
+       test_updated_kevent(1000, 2000);
+       test_updated_kevent(2000, 1000);
+       test_updated_kevent(1000, -1);
+
+}
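As a quick reference for the timer cases above, a bare-bones one-shot EVFILT_TIMER registration through kevent64() looks roughly like the following sketch (the 100 ms value and the lack of error checking are assumptions for illustration, not part of this commit):

    #include <sys/event.h>
    #include <unistd.h>

    int
    main(void)
    {
            int kq = kqueue();
            struct kevent64_s kev;

            /* Arm a one-shot timer that fires ~100 ms from now. */
            EV_SET64(&kev, 1, EVFILT_TIMER, EV_ADD | EV_ONESHOT,
                NOTE_USECONDS, 100 * 1000, 0, 0, 0);
            kevent64(kq, &kev, 1, NULL, 0, 0, NULL);   /* register */

            /* Blocks for roughly 100 ms, then returns the fired event. */
            kevent64(kq, NULL, 0, &kev, 1, 0, NULL);

            close(kq);
            return 0;
    }

NOTE_ABSOLUTE, NOTE_MACHTIME, and the other scale flags exercised by the tests go in the fflags argument in place of NOTE_USECONDS and change how the data field is interpreted.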
diff --git a/tests/launchd_plists/com.apple.xnu.test.kevent_qos.plist b/tests/launchd_plists/com.apple.xnu.test.kevent_qos.plist
new file mode 100644 (file)
index 0000000..bf3c2f4
--- /dev/null
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>Label</key>
+       <string>com.apple.xnu.test.kevent_qos</string>
+       <key>MachServices</key>
+       <dict>
+               <key>com.apple.xnu.test.kevent_qos</key>
+               <true/>
+       </dict>
+       <key>ThrottleInterval</key>
+       <integer>1</integer>
+       <key>UserName</key>
+       <string>root</string>
+       <key>ProcessType</key>
+       <string>Adaptive</string>
+       <key>EnvironmentVariables</key>
+       <dict>
+               <key>MallocNanoZone</key>
+               <string>1</string>
+       </dict>
+</dict>
+</plist>
diff --git a/tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist b/tests/launchd_plists/com.apple.xnu.test.turnstile_multihop.plist
new file mode 100644 (file)
index 0000000..e4d4241
--- /dev/null
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>Label</key>
+       <string>com.apple.xnu.test.turnstile_multihop</string>
+       <key>MachServices</key>
+       <dict>
+               <key>com.apple.xnu.test.turnstile_multihop</key>
+               <true/>
+       </dict>
+       <key>ThrottleInterval</key>
+       <integer>1</integer>
+       <key>UserName</key>
+       <string>root</string>
+       <key>ProcessType</key>
+       <string>Adaptive</string>
+       <key>EnvironmentVariables</key>
+       <dict>
+               <key>MallocNanoZone</key>
+               <string>1</string>
+       </dict>
+</dict>
+</plist>
diff --git a/tests/ltable_exhaustion_test.c b/tests/ltable_exhaustion_test.c
new file mode 100644 (file)
index 0000000..9bfeba8
--- /dev/null
@@ -0,0 +1,35 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <mach/mach.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+#define ITER 100
+
+T_DECL(ltable_exhaustion_test,
+       "check if allocating not used ltable entries can panic the system",
+       T_META_ASROOT(true))
+{
+       int n_ltable_entries,n_ltable_entries_after;
+       size_t len = sizeof(int);
+       int i;
+       mach_port_name_t portset;
+
+       /*
+        * Get how many ltable entries are allocated right now.
+        */
+       T_EXPECT_POSIX_SUCCESS(sysctlbyname("kern.n_ltable_entries", &n_ltable_entries, &len, NULL, 0), "kern.n_ltable_entries");
+
+       for (i = 0; i < ITER; i++) {
+               mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &portset);
+       }
+
+       /*
+        * Get how many ltable entries are allocated after the loop. Other processes in the system might have allocated entries,
+        * so don't expect the same value.
+        */
+       T_EXPECT_POSIX_SUCCESS(sysctlbyname("kern.n_ltable_entries", &n_ltable_entries_after, &len, NULL, 0), "kern.n_ltable_entries");
+
+       T_EXPECT_LE(n_ltable_entries_after, n_ltable_entries+ITER, "ltable before %d after %d iter %d", n_ltable_entries, n_ltable_entries_after, ITER);
+}
diff --git a/tests/mach_boottime_usec.c b/tests/mach_boottime_usec.c
new file mode 100644 (file)
index 0000000..ad0ac32
--- /dev/null
@@ -0,0 +1,20 @@
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <mach/mach_time.h>
+
+#include <darwintest.h>
+
+T_DECL(mach_boottime_usec, "mach_boottime_usec()",
+               T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       uint64_t bt_usec = mach_boottime_usec();
+
+       struct timeval bt_tv;
+       size_t len = sizeof(bt_tv);
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.boottime", &bt_tv, &len, NULL, 0), NULL);
+
+       T_EXPECT_EQ((uint64_t)bt_tv.tv_sec * USEC_PER_SEC + (uint64_t)bt_tv.tv_usec, bt_usec, NULL);
+}
diff --git a/tests/mach_continuous_time.c b/tests/mach_continuous_time.c
new file mode 100644 (file)
index 0000000..a7d773b
--- /dev/null
@@ -0,0 +1,367 @@
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#include <mach/clock_types.h>
+#include <sys/time.h>
+#include <spawn.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <time.h>
+#include <errno.h>
+
+#include <darwintest.h>
+
+#if (defined(__arm__) || defined(__arm64__))
+#define HAS_KERNEL_TIME_TRAPS
+
+extern uint64_t mach_absolute_time_kernel(void);
+extern uint64_t mach_continuous_time_kernel(void);
+
+#endif
+extern char **environ;
+
+static const int64_t one_mil = 1000*1000;
+
+#define to_ns(ticks) (((ticks) * tb_info.numer) / (tb_info.denom))
+#define to_ms(ticks) (to_ns(ticks)/one_mil)
+
+static mach_timebase_info_data_t tb_info;
+
+static void
+update(uint64_t *a, uint64_t *c) {
+       mach_get_times(a,c,NULL);
+}
+
+T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic values",
+               T_META_ALL_VALID_ARCHS(true))
+{
+       mach_timebase_info(&tb_info);
+#ifdef HAS_KERNEL_TIME_TRAPS
+       bool kernel = false;
+#endif
+
+       volatile uint64_t multiple_test = to_ms(mach_continuous_time());
+       for(int i = 0; i < 20; i++) {
+               uint64_t tmp;
+               const char *test_type = "user";
+#ifdef HAS_KERNEL_TIME_TRAPS
+               if (kernel) {
+                       test_type = "kernel";
+                       tmp = mach_continuous_time_kernel();
+               } else
+                       tmp = mach_continuous_time();
+               kernel = !kernel;
+#else
+               tmp = mach_continuous_time();
+#endif
+               tmp = to_ms(tmp);
+               T_ASSERT_GE(tmp, multiple_test, "mach_continuous_time (%s) must be monotonic", test_type);
+
+               // each successive call shouldn't be more than 100ms in the future
+               T_ASSERT_LE(tmp - multiple_test, 100ULL, "mach_continuous_time (%s) should not jump forward too fast", test_type);
+
+               multiple_test = tmp;
+       }
+}
+
+T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values",
+               T_META_ALL_VALID_ARCHS(true))
+{
+       mach_timebase_info(&tb_info);
+#ifdef HAS_KERNEL_TIME_TRAPS
+       bool kernel = false;
+#endif
+
+       volatile uint64_t multiple_test = to_ms(mach_absolute_time());
+       for(int i = 0; i < 20; i++) {
+               uint64_t tmp;
+               const char *test_type = "user";
+#ifdef HAS_KERNEL_TIME_TRAPS
+               if (kernel) {
+                       test_type = "kernel";
+                       tmp = mach_absolute_time_kernel();
+               } else
+                       tmp = mach_absolute_time();
+               kernel = !kernel;
+#else
+               tmp = mach_absolute_time();
+#endif
+               tmp = to_ms(tmp);
+               T_ASSERT_GE(tmp, multiple_test, "mach_absolute_time (%s) must be monotonic", test_type);
+
+               // each successive call shouldn't be more than 100ms in the future
+               T_ASSERT_LE(tmp - multiple_test, 100ULL, "mach_absolute_time (%s) should not jump forward too fast", test_type);
+
+               multiple_test = tmp;
+       }
+}
+
+T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge")
+{
+       mach_timebase_info(&tb_info);
+
+       uint64_t abs_now;
+       uint64_t cnt_now;
+       int before_diff, after_diff;
+
+       update(&abs_now, &cnt_now);
+       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+       sleep(1);
+
+       update(&abs_now, &cnt_now);
+       after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+       T_ASSERT_LE(abs(after_diff - before_diff), 1, "mach_continuous_time and mach_absolute_time should not diverge");
+}
+
+#ifdef HAS_KERNEL_TIME_TRAPS
+static void update_kern(uint64_t *abs, uint64_t *cont)
+{
+       uint64_t abs1, abs2, cont1, cont2;
+       do {
+               abs1 = mach_absolute_time_kernel();
+               cont1 = mach_continuous_time_kernel();
+               abs2 = mach_absolute_time_kernel();
+               cont2 = mach_continuous_time_kernel();
+       } while (to_ms(abs2 - abs1) || to_ms(cont2 - cont1));
+       *abs = abs2;
+       *cont = cont2;
+}
+#endif
+
+#ifdef HAS_KERNEL_TIME_TRAPS
+T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge")
+{
+       mach_timebase_info(&tb_info);
+
+       uint64_t abs_now;
+       uint64_t cnt_now;
+       int before_diff, after_diff;
+
+       update_kern(&abs_now, &cnt_now);
+       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+       sleep(1);
+
+       update_kern(&abs_now, &cnt_now);
+       after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+       T_ASSERT_LE(abs(after_diff - before_diff), 1, "mach_continuous_time_kernel and mach_absolute_time_kernel should not diverge");
+}
+#endif
+
+T_DECL(mct_sleep, "Testing mach_continuous_time behavior over system sleep"){
+#ifndef MCT_SLEEP_TEST
+       T_SKIP("Skipping test that sleeps the device; compile with MCT_SLEEP_TEST define to enable.");
+#endif
+
+       mach_timebase_info(&tb_info);
+
+       uint64_t abs_now;
+       uint64_t cnt_now;
+       int before_diff, after_diff = 0;
+
+       T_LOG("Testing mach_continuous_time is ~5 seconds ahead of mach_absolute_time after 5 second sleep");
+       update(&abs_now, &cnt_now);
+       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+       // performs:
+       // pmset relative wake 5
+       // pmset sleepnow
+
+       pid_t pid;
+       int spawn_ret = 0;
+       time_t before_sleep = time(NULL);
+       int ct_ms_before_sleep = (int)to_ms(cnt_now);
+       int ab_ms_before_sleep = (int)to_ms(abs_now);
+
+       char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", "5", NULL};
+       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL);
+
+       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+       T_ASSERT_EQ(spawn_ret, 0, "pmset relative wait 5 failed");
+
+       char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL};
+       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL);
+
+       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
+       T_ASSERT_EQ(spawn_ret, 0, "pmset relative wait 5 failed");
+
+       // wait for device to sleep (up to 30 seconds)
+       for(int i = 0; i < 30; i++) {
+               update(&abs_now, &cnt_now);
+               after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
+
+               // on OSX, there's enough latency between calls to MCT and MAT
+               // when the system is going down for sleep for values to diverge a few ms
+               if(abs(before_diff - after_diff) > 2) {
+                       break;
+               }
+
+               sleep(1);
+               T_LOG("waited %d seconds for sleep...", i+1);
+       }
+
+       if((after_diff - before_diff) < 4000) {
+               T_LOG("Device slept for less than 4 seconds, did it really sleep? (%d ms change between abs and cont)",
+                       after_diff - before_diff);
+       }
+
+       time_t after_sleep = time(NULL);
+
+       int cal_sleep_diff = (int)difftime(after_sleep, before_sleep);
+       int ct_sleep_diff = ((int)to_ms(cnt_now) - ct_ms_before_sleep)/1000;
+       int ab_sleep_diff = ((int)to_ms(abs_now) - ab_ms_before_sleep)/1000;
+
+       T_LOG("Calendar progressed: %d sec; continuous time progressed: %d sec; absolute time progressed %d sec",
+               cal_sleep_diff, ct_sleep_diff, ab_sleep_diff);
+
+       T_ASSERT_LE(abs(ct_sleep_diff - cal_sleep_diff), 2,
+               "continuous time should progress at ~ same rate as calendar");
+}
+
+T_DECL(mct_settimeofday, "Testing mach_continuous_time behavior over settimeofday"){
+       if (geteuid() != 0){
+               T_SKIP("The settimeofday() test requires root privileges to run.");
+       }
+       mach_timebase_info(&tb_info);
+
+       struct timeval saved_tv;
+       struct timezone saved_tz;
+       int before, after;
+
+       T_ASSERT_POSIX_ZERO(gettimeofday(&saved_tv, &saved_tz), NULL);
+
+       struct timeval forward_tv = saved_tv;
+       // move time forward by two minutes, ensure mach_continuous_time keeps
+       // chugging along with mach_absolute_time
+       forward_tv.tv_sec += 2*60;
+
+       before = (int)to_ms(mach_continuous_time());
+       T_ASSERT_POSIX_ZERO(settimeofday(&forward_tv, &saved_tz), NULL);
+
+       after = (int)to_ms(mach_continuous_time());
+       T_ASSERT_POSIX_ZERO(settimeofday(&saved_tv, &saved_tz), NULL);
+
+       T_ASSERT_LT(abs(before - after), 1000, "mach_continuous_time should not jump more than 1s");
+}
+
+#ifdef HAS_KERNEL_TIME_TRAPS
+T_DECL(mct_settimeofday_kern, "Testing kernel mach_continuous_time behavior over settimeofday"){
+       if (geteuid() != 0){
+               T_SKIP("The settimeofday() test requires root privileges to run.");
+       }
+       mach_timebase_info(&tb_info);
+
+       struct timeval saved_tv;
+       struct timezone saved_tz;
+       int before, after;
+
+       T_ASSERT_POSIX_ZERO(gettimeofday(&saved_tv, &saved_tz), NULL);
+
+       struct timeval forward_tv = saved_tv;
+       // move time forward by two minutes, ensure mach_continuous_time keeps
+       // chugging along with mach_absolute_time
+       forward_tv.tv_sec += 2*60;
+
+       before = (int)to_ms(mach_continuous_time_kernel());
+       T_ASSERT_POSIX_ZERO(settimeofday(&forward_tv, &saved_tz), NULL);
+
+       after = (int)to_ms(mach_continuous_time_kernel());
+       T_ASSERT_POSIX_ZERO(settimeofday(&saved_tv, &saved_tz), NULL);
+
+       T_ASSERT_LT(abs(before - after), 1000, "mach_continuous_time_kernel should not jump more than 1s");
+}
+#endif
+
+T_DECL(mct_approximate, "Testing mach_continuous_approximate_time()",
+               T_META_ALL_VALID_ARCHS(true))
+{
+       mach_timebase_info(&tb_info);
+
+       uint64_t absolute = to_ns(mach_continuous_time());
+       uint64_t approximate = to_ns(mach_continuous_approximate_time());
+
+       T_EXPECT_LE(llabs((long long)absolute - (long long)approximate), (long long)(25*NSEC_PER_MSEC), NULL);
+}
+
+T_DECL(mach_time_perf, "mach_time performance") {
+       {
+               dt_stat_time_t s = dt_stat_time_create("mach_absolute_time");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_absolute_time();
+               }
+               dt_stat_finalize(s);
+       }
+       {
+               dt_stat_time_t s = dt_stat_time_create("mach_continuous_time");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_continuous_time();
+               }
+               dt_stat_finalize(s);
+       }
+}
+
+T_DECL(mach_time_perf_instructions, "instructions retired for mach_time", T_META_TYPE_PERF, T_META_ASROOT(YES)) {
+       {
+               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_absolute_time");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_absolute_time();
+               }
+               dt_stat_finalize(s);
+       }
+       {
+               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_continuous_time");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_continuous_time();
+               }
+               dt_stat_finalize(s);
+       }
+}
+
+#ifdef HAS_KERNEL_TIME_TRAPS
+T_DECL(mach_time_perf_kern, "kernel mach_time performance") {
+       {
+               dt_stat_time_t s = dt_stat_time_create("mach_absolute_time_kernel");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_absolute_time_kernel();
+               }
+               dt_stat_finalize(s);
+       }
+       {
+               dt_stat_time_t s = dt_stat_time_create("mach_continuous_time_kernel");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_continuous_time_kernel();
+               }
+               dt_stat_finalize(s);
+       }
+}
+
+T_DECL(mach_time_perf_instructions_kern, "instructions retired for kernel mach_time", T_META_TYPE_PERF, T_META_ASROOT(YES)) {
+       {
+               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_absolute_time_kernel");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_absolute_time_kernel();
+               }
+               dt_stat_finalize(s);
+       }
+       {
+               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_continuous_time_kernel");
+               T_STAT_MEASURE_LOOP(s) {
+                       uint64_t t;
+                       t = mach_continuous_time_kernel();
+               }
+               dt_stat_finalize(s);
+       }
+}
+#endif
+
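The to_ns()/to_ms() macros in this file implement the standard mach timebase conversion (ticks * numer / denom). A small worked sketch with hypothetical timebase values (for example numer = 125, denom = 3 on some arm64 devices; Intel machines typically report 1/1):

    #include <mach/mach_time.h>
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
            mach_timebase_info_data_t tb;
            mach_timebase_info(&tb);

            /* With numer = 125 and denom = 3 (hypothetical values),
             * 24000 ticks converts to exactly 1 ms: 24000 * 125 / 3 = 1,000,000 ns. */
            uint64_t ticks = 24000;
            uint64_t ns = ticks * tb.numer / tb.denom;
            printf("%llu ticks = %llu ns\n", ticks, ns);
            return 0;
    }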
diff --git a/tests/mach_get_times.c b/tests/mach_get_times.c
new file mode 100644 (file)
index 0000000..c40bada
--- /dev/null
@@ -0,0 +1,44 @@
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <mach/mach_time.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#define T_LOG_VERBOSE(...)
+
+#define timespec2nanosec(ts) ((uint64_t)((ts)->tv_sec) * NSEC_PER_SEC + (uint64_t)((ts)->tv_nsec))
+
+T_DECL(mach_get_times, "mach_get_times()",
+          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
+{
+       const int ITERATIONS = 500000 * dt_ncpu();
+       struct timespec gtod_ts;
+
+       uint64_t last_absolute, last_continuous, last_gtod;
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&last_absolute, &last_continuous, &gtod_ts), KERN_SUCCESS, NULL);
+       last_gtod = timespec2nanosec(&gtod_ts);
+
+       for (int i = 0; i < ITERATIONS; i++) {
+               uint64_t absolute, continuous, gtod;
+               T_QUIET; T_ASSERT_EQ(mach_get_times(&absolute, &continuous, &gtod_ts), KERN_SUCCESS, NULL);
+               gtod = timespec2nanosec(&gtod_ts);
+
+               T_LOG_VERBOSE("[%d] abs: %llu.%09llu(+%llu)\tcont: %llu.%09llu(+%llu)\tgtod:%llu.%09llu(+%llu)", i,
+                               absolute / NSEC_PER_SEC, absolute % NSEC_PER_SEC, absolute - last_absolute,
+                               continuous / NSEC_PER_SEC, continuous % NSEC_PER_SEC, continuous - last_continuous,
+                               gtod / NSEC_PER_SEC, gtod % NSEC_PER_SEC, gtod - last_gtod);
+
+               T_QUIET; T_EXPECT_EQ(absolute - last_absolute, continuous - last_continuous, NULL);
+
+               int64_t gtod_diff = (int64_t)gtod - (int64_t)last_gtod;
+               T_QUIET; T_ASSERT_LE((uint64_t)llabs(gtod_diff), NSEC_PER_SEC, NULL);
+
+               last_absolute = absolute;
+               last_continuous = continuous;
+               last_gtod = gtod;
+
+               gtod_ts.tv_sec = 0; gtod_ts.tv_nsec = 0;
+       }
+}
diff --git a/tests/mach_port_deallocate_21692215.c b/tests/mach_port_deallocate_21692215.c
new file mode 100644 (file)
index 0000000..4b84428
--- /dev/null
@@ -0,0 +1,38 @@
+#define T_NAMESPACE "xnu.ipc"
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#define NR_PORTS 4
+
+T_DECL(mach_port_deallocate, "mach_port_deallocate also deallocates a PORT_SET"){
+       mach_port_t port_set;
+       mach_port_t port[NR_PORTS];
+       int i,ret;
+
+       ret= mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &port_set);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_PORT_SET");
+
+       for(i=0;i<NR_PORTS;i++){
+               ret= mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port[i]);
+               T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE");
+
+               ret= mach_port_move_member(mach_task_self(), port[i], port_set);
+               T_ASSERT_MACH_SUCCESS(ret, "mach_port_move_member");
+       }
+
+       T_LOG("Ports created");
+
+       /* do something */
+
+       for(i=0;i<NR_PORTS;i++){
+               ret= mach_port_mod_refs(mach_task_self(), port[i], MACH_PORT_RIGHT_RECEIVE, -1);
+               T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs -1 RIGHT_RECEIVE");
+       }
+
+       ret= mach_port_deallocate(mach_task_self(), port_set);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_deallocate PORT_SET");
+
+       T_LOG("Ports erased");
+}
diff --git a/tests/mach_port_insert_right.c b/tests/mach_port_insert_right.c
new file mode 100644 (file)
index 0000000..f422892
--- /dev/null
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <darwintest.h>
+
+T_DECL(mach_port_insert_right,"insert send right for an existing right", T_META_CHECK_LEAKS(false))
+{
+       mach_port_t port = MACH_PORT_NULL;
+       mach_port_t port2 = MACH_PORT_NULL;
+       kern_return_t retval;
+
+       mach_port_t task = mach_task_self();
+
+       retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port);
+       T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port);
+
+       mach_port_name_t name = 123;
+
+       retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_ERROR(retval, KERN_FAILURE, "insert a send right for port=[%d] with name=[%d]", port, name);
+
+       name = port + 1;
+       retval = mach_port_insert_right(task, name, port, MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_ERROR(retval, KERN_FAILURE, "insert a send right for port=[%d] with name=[%d]", port, name);
+
+       retval = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &port2);
+       T_ASSERT_MACH_SUCCESS(retval, "allocate a port=[%d]", port2);
+
+       name = port;
+       retval = mach_port_insert_right(task, name, port2, MACH_MSG_TYPE_MAKE_SEND);
+       T_ASSERT_MACH_ERROR(retval, KERN_RIGHT_EXISTS, "insert a send right for port=[%d] with name=[%d]", port2, name);
+}
diff --git a/tests/mach_port_mod_refs.c b/tests/mach_port_mod_refs.c
new file mode 100644 (file)
index 0000000..3e5d2f3
--- /dev/null
@@ -0,0 +1,92 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#define T_NAMESPACE "xnu.ipc"
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+
+T_DECL(mach_port_mod_refs, "mach_port_mod_refs"){
+       mach_port_t port_set;
+       mach_port_t port;
+       int ret;
+
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &port_set);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_PORT_SET");
+
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE");
+
+
+       /*
+        * Test all known variants of port rights on each type of port
+        */
+
+       /* can't subtract a send right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND: -1 on a RECV right");
+
+       /* can't subtract a send once right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND_ONCE, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND_ONCE: -1 on a RECV right");
+
+       /* can't subtract a PORT SET right if it's not a port set */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_PORT_SET, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs PORT_SET: -1 on a RECV right");
+
+       /* can't subtract a dead name right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_DEAD_NAME, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs DEAD_NAME: -1 on a RECV right");
+
+       /* can't subtract a LABELH right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_LABELH, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs LABELH: -1 on a RECV right");
+
+       /* can't subtract an invalid right-type */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_NUMBER, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER: -1 on a RECV right");
+
+       /* can't subtract an invalid right-type */
+       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_NUMBER + 1, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER+1: -1 on a RECV right");
+
+
+       /* can't subtract a send right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_SEND, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND: -1 on a PORT_SET right");
+
+       /* can't subtract a send once right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_SEND_ONCE, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND_ONCE: -1 on a PORT_SET right");
+
+       /* can't subtract a receive right if it's a port set */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_RECEIVE, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs RECV: -1 on a PORT_SET right");
+
+       /* can't subtract a dead name right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_DEAD_NAME, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs DEAD_NAME: -1 on a PORT_SET right");
+
+       /* can't subtract a LABELH right if it doesn't exist */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_LABELH, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs LABELH: -1 on a PORT_SET right");
+
+       /* can't subtract an invalid right-type */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_NUMBER, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER: -1 on a PORT_SET right");
+
+       /* can't subtract an invalid right-type */
+       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_NUMBER + 1, -1);
+       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER+1: -1 on a PORT_SET right");
+
+       /*
+        * deallocate the ports/sets
+        */
+       ret= mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_PORT_SET, -1);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs(PORT_SET, -1)");
+
+       ret= mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1);
+       T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs(RECV_RIGHT, -1)");
+}
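A hedged aside on the cleanup pattern in this test: the user-reference count a name holds for a given right can be read back with mach_port_get_refs() before dropping it via mach_port_mod_refs(). A minimal sketch (illustrative only, not part of this commit):

    #include <mach/mach.h>
    #include <stdio.h>

    int
    main(void)
    {
            mach_port_t port = MACH_PORT_NULL;
            mach_port_urefs_t refs = 0;

            mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);

            /* A receive right carries a single reference. */
            mach_port_get_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, &refs);
            printf("receive refs: %u\n", refs);

            /* Dropping that reference destroys the right, as in the test's cleanup. */
            mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1);
            return 0;
    }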
diff --git a/tests/mach_timebase_info.c b/tests/mach_timebase_info.c
new file mode 100644 (file)
index 0000000..51f3bb4
--- /dev/null
@@ -0,0 +1,20 @@
+#include <mach/mach_time.h>
+
+#include <darwintest.h>
+
+extern kern_return_t mach_timebase_info_trap(mach_timebase_info_t info);
+
+T_DECL(mach_timebase_info, "mach_timebase_info(_trap)",
+               T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       mach_timebase_info_data_t a, b, c;
+
+       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&a), NULL);
+       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&b), NULL);
+       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info_trap(&c), NULL);
+
+       T_EXPECT_EQ(a.numer, b.numer, NULL);
+       T_EXPECT_EQ(a.denom, b.denom, NULL);
+       T_EXPECT_EQ(a.numer, c.numer, NULL);
+       T_EXPECT_EQ(a.denom, c.denom, NULL);
+}
diff --git a/tests/memorystatus_freeze_test.c b/tests/memorystatus_freeze_test.c
new file mode 100644 (file)
index 0000000..d41c664
--- /dev/null
@@ -0,0 +1,270 @@
+#include <stdio.h>
+#include <signal.h>
+#include <sys/sysctl.h>
+#include <sys/kern_memorystatus.h>
+#include <mach-o/dyld.h>
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false)
+);
+
+#define MEM_SIZE_MB                    10
+#define NUM_ITERATIONS         5
+
+#define CREATE_LIST(X) \
+       X(SUCCESS) \
+       X(TOO_FEW_ARGUMENTS) \
+       X(SYSCTL_VM_PAGESIZE_FAILED) \
+       X(VM_PAGESIZE_IS_ZERO) \
+       X(SYSCTL_VM_FREEZE_ENABLED_FAILED) \
+       X(FREEZER_DISABLED) \
+       X(DISPATCH_SOURCE_CREATE_FAILED) \
+       X(INITIAL_SIGNAL_TO_PARENT_FAILED) \
+       X(SIGNAL_TO_PARENT_FAILED) \
+       X(MEMORYSTATUS_CONTROL_FAILED) \
+       X(IS_FREEZABLE_NOT_AS_EXPECTED) \
+       X(MEMSTAT_PRIORITY_CHANGE_FAILED) \
+       X(EXIT_CODE_MAX)
+
+#define EXIT_CODES_ENUM(VAR) VAR,
+enum exit_codes_num {
+       CREATE_LIST(EXIT_CODES_ENUM)
+};
+
+#define EXIT_CODES_STRING(VAR) #VAR,
+static const char *exit_codes_str[] = {
+       CREATE_LIST(EXIT_CODES_STRING)
+};
+
+
+static pid_t pid = -1;
+static int freeze_count = 0;
+
+void move_to_idle_band(void);
+void run_freezer_test(int size_mb);
+void freeze_helper_process(void);
+
+
+void move_to_idle_band(void) {
+
+       memorystatus_priority_properties_t props;
+       /*
+        * Freezing a process also moves it to an elevated jetsam band in order to protect it from idle exits.
+        * So we move the child process to the idle band to mirror the typical 'idle app being frozen' scenario.
+        */
+       props.priority = JETSAM_PRIORITY_IDLE;
+       props.user_data = 0;
+
+       /*
+        * This requires us to run as root (in the absence of entitlement).
+        * Hence the T_META_ASROOT(true) in the T_HELPER_DECL.
+        */
+       if (memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), 0, &props, sizeof(props))) {
+               exit(MEMSTAT_PRIORITY_CHANGE_FAILED);
+       }
+}
+
+void freeze_helper_process(void) {
+       int ret;
+
+       T_LOG("Freezing child pid %d", pid);
+       ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
+       sleep(1);
+
+       if (freeze_count % 2 == 0) {
+               /*
+                * The child process toggles its freezable state on each iteration.
+                * So a failure for every alternate freeze is expected.
+                */
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed");
+               T_LOG("Freeze succeeded. Thawing child pid %d", pid);
+               ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid));
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed");
+       } else {
+               T_QUIET; T_ASSERT_TRUE(ret != KERN_SUCCESS, "Freeze should have failed");
+               T_LOG("Freeze failed as expected");
+       }
+
+       freeze_count++;
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process");
+}
+
+void run_freezer_test(int size_mb) {
+       int ret;
+       char sz_str[50];
+       char **launch_tool_args;
+       char testpath[PATH_MAX];
+       uint32_t testpath_buf_size;
+       dispatch_source_t ds_freeze, ds_proc;
+
+#ifndef CONFIG_FREEZE
+       T_SKIP("Task freeze not supported.");
+#endif
+
+       signal(SIGUSR1, SIG_IGN);
+       ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)");
+
+       dispatch_source_set_event_handler(ds_freeze, ^{
+               if (freeze_count < NUM_ITERATIONS) {
+                       freeze_helper_process();
+               } else {
+                       kill(pid, SIGKILL);
+                       dispatch_source_cancel(ds_freeze);
+               }
+       });
+       dispatch_activate(ds_freeze);
+
+       testpath_buf_size = sizeof(testpath);
+       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
+       T_LOG("Executable path: %s", testpath);
+
+       sprintf(sz_str, "%d", size_mb);
+       launch_tool_args = (char *[]){
+               testpath,
+               "-n",
+               "allocate_pages",
+               "--",
+               sz_str,
+               NULL
+       };
+
+       /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */
+       ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL);
+       if (ret != 0) {
+               T_LOG("dt_launch tool returned %d with error code %d", ret, errno);
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
+
+       ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)");
+
+       dispatch_source_set_event_handler(ds_proc, ^{
+               int status = 0, code = 0;
+               pid_t rc = waitpid(pid, &status, 0);
+               T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid");
+               code = WEXITSTATUS(status);
+
+               if (code == 0) {
+                       T_END;
+               } else if (code > 0 && code < EXIT_CODE_MAX) {
+                       T_ASSERT_FAIL("Child exited with %s", exit_codes_str[code]);
+               } else {
+                       T_ASSERT_FAIL("Child exited with unknown exit code %d", code);
+               }
+       });
+       dispatch_activate(ds_proc);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process");
+       dispatch_main();
+}
+
+T_HELPER_DECL(allocate_pages,
+               "allocates pages to freeze",
+               T_META_ASROOT(true)) {
+       int i, j, temp, ret, size_mb, vmpgsize;
+       size_t len;
+       char val;
+       __block int num_pages, num_iter = 0;
+       __block char **buf;
+       dispatch_source_t ds_signal;
+
+       len = sizeof(vmpgsize);
+       ret = sysctlbyname("vm.pagesize", &vmpgsize, &len, NULL, 0);
+       if (ret != 0) {
+               exit(SYSCTL_VM_PAGESIZE_FAILED);
+       }
+       if (vmpgsize == 0) {
+               exit(VM_PAGESIZE_IS_ZERO);
+       }
+
+       if (argc < 1) {
+               exit(TOO_FEW_ARGUMENTS);
+       }
+
+       len = sizeof(temp);
+       ret = sysctlbyname("vm.freeze_enabled", &temp, &len, NULL, 0);
+       if (ret != 0) {
+               exit(SYSCTL_VM_FREEZE_ENABLED_FAILED);
+       }
+       if (temp == 0) {
+               exit(FREEZER_DISABLED);
+       }
+
+       size_mb = atoi(argv[0]);
+       num_pages = size_mb * 1024 * 1024 / vmpgsize;
+       buf = (char**)malloc(sizeof(char*) * (size_t)num_pages);
+
+       /* Gives us the compression ratio we see in the typical case (~2.7) */
+       for (j = 0; j < num_pages; j++) {
+               buf[j] = (char*)malloc((size_t)vmpgsize * sizeof(char));
+               val = 0;
+               for (i = 0; i < vmpgsize; i += 16) {
+                       memset(&buf[j][i], val, 16);
+                       if (i < 3400 * (vmpgsize / 4096)) {
+                               val++;
+                       }
+               }
+       }
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{
+               /* Signal to the parent that we're done allocating and it's ok to freeze us */
+               printf("Sending initial signal to parent to begin freezing\n");
+               if (kill(getppid(), SIGUSR1) != 0) {
+                       exit(INITIAL_SIGNAL_TO_PARENT_FAILED);
+               }
+       });
+
+       signal(SIGUSR1, SIG_IGN);
+       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       if (ds_signal == NULL) {
+               exit(DISPATCH_SOURCE_CREATE_FAILED);
+       }
+
+       dispatch_source_set_event_handler(ds_signal, ^{
+               int current_state, new_state;
+               volatile int tmp;
+
+               /* Make sure all the pages are accessed before trying to freeze again */
+               for (int x = 0; x < num_pages; x++) {
+                       tmp = buf[x][0];
+               }
+
+               current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);
+
+               /* Toggle freezable state */
+               new_state = (current_state) ? 0: 1;
+               printf("Changing state from %s to %s\n", (current_state) ? "freezable": "unfreezable", (new_state) ? "freezable": "unfreezable");
+               if (memorystatus_control(MEMORYSTATUS_CMD_SET_PROCESS_IS_FREEZABLE, getpid(), (uint32_t)new_state, NULL, 0) != KERN_SUCCESS) {
+                       exit(MEMORYSTATUS_CONTROL_FAILED);
+               }
+
+               /* Verify that the state has been set correctly */
+               current_state = memorystatus_control(MEMORYSTATUS_CMD_GET_PROCESS_IS_FREEZABLE, getpid(), 0, NULL, 0);
+               if (new_state != current_state) {
+                       exit(IS_FREEZABLE_NOT_AS_EXPECTED);
+               }
+               num_iter++;
+
+               if (kill(getppid(), SIGUSR1) != 0) {
+                       exit(SIGNAL_TO_PARENT_FAILED);
+               }
+       });
+       dispatch_activate(ds_signal);
+       move_to_idle_band();
+
+       dispatch_main();
+}
+
+T_DECL(freeze, "VM freezer test") {
+       run_freezer_test(MEM_SIZE_MB);
+}
diff --git a/tests/memorystatus_vm_map_fork.c b/tests/memorystatus_vm_map_fork.c
new file mode 100644 (file)
index 0000000..e321bea
--- /dev/null
@@ -0,0 +1,467 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <spawn.h>
+#include <spawn_private.h>
+#include <stdint.h>
+#include <sys/sysctl.h>
+#include <sys/spawn_internal.h>
+#include <sys/kern_memorystatus.h>
+#include <mach-o/dyld.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false)
+);
+
+extern char **environ;
+
+/*
+ * This test file contains two sub-tests which verify whether a corpse is
+ * allowed or not allowed to be taken for crashreporter when a task exceeds
+ * its memory allocation limit. vm_map_fork() is the kernel routine used to
+ * generate a corpse task.
+ *
+ * A corpse is allowed to be taken if the memory resource limit that the
+ * task exceeded is less than 1/4 of the system wide task limit.
+ * If the amount exceeds 1/4 of the system wide limit, then the corpse is disallowed.
+ *
+ * If the device under test is already under memory pressure, the test
+ * could fail due to jetsam cutting in and killing the parent, child, or
+ * other necessary testing processes.
+ */
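+
+/*
+ * For reference, the kernel-side decision can be sketched roughly as follows
+ * (identifiers here are illustrative, not the actual kernel names):
+ *
+ *     corpse_allowed = (exceeded_limit_mb < system_wide_task_limit_mb / 4);
+ *
+ * The two sub-tests below drive both sides of that comparison by choosing
+ * the child's memory limit relative to kern.max_task_pmem.
+ */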
+
+/* Test variants */
+#define TEST_ALLOWED    0x1
+#define TEST_NOT_ALLOWED 0x2
+
+/*
+ * Values which the kernel OR's into the PID when a corpse
+ * is either allowed or disallowed for the
+ * kern.memorystatus_vm_map_fork_pidwatch sysctl.
+ */
+#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED       0x100000000ul
+#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000ul
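+
+/*
+ * So, for example (PID value illustrative), a pidwatch value read back as
+ *
+ *     0x100000000ul | (uint64_t)child_pid
+ *
+ * means the corpse was allowed for that child, while the 0x2... flag means it
+ * was not allowed. The test composes its expected value the same way below.
+ */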
+
+/*
+ * The memory allocation happens in a child process, this
+ * is stuff to deal with creating and managing the child.
+ * The child will only execute the T_HELPER_DECL.
+ */
+static char testpath[PATH_MAX];
+static uint32_t testpath_size = sizeof(testpath);
+#define LIMIT_DELTA_MB 5 /* an arbitrary limit delta */
+#define MEGABYTE       (1024 * 1024)
+
+/*
+ * The child process communicates back to parent via an exit() code.
+ */
+enum child_exits {
+       NORMAL_EXIT = 0,
+       NO_MEMSIZE_ARG,
+       INVALID_MEMSIZE,
+       MALLOC_FAILED,
+       NUM_CHILD_EXIT
+};
+static char *child_exit_why[] = {
+       "normal exit",
+       "no memsize argument to child",
+       "invalid memsize argument to child",
+       "malloc() failed",
+};
+
+/*
+ * Corpse collection only happens in development kernels.
+ * So we need this to detect if the test is relevant.
+ */
+static boolean_t
+is_development_kernel(void)
+{
+       int ret;
+       int dev = 0;
+       size_t dev_size = sizeof(dev);
+
+       ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0);
+       if (ret != 0) {
+               return FALSE;
+       }
+
+       return (dev != 0);
+}
+
+/*
+ * Set/Get the sysctl used to determine if corpse collection occurs.
+ * This is done by the kernel checking for a specific PID.
+ */
+static void
+set_memorystatus_vm_map_fork_pidwatch(pid_t pid)
+{
+       uint64_t new_value = (uint64_t)pid;
+       size_t new_len = sizeof(new_value);
+       int err;
+
+       err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", NULL, NULL, &new_value, new_len);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(err, "set sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
+       return;
+}
+
+static uint64_t
+get_memorystatus_vm_map_fork_pidwatch(void)
+{
+       uint64_t value = 0;
+       size_t val_len = sizeof(value);
+       int err;
+
+       err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", &value, &val_len, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(err, "get sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
+
+       return value;
+}
+
+/*
+ * We want to avoid jetsam giving us bad results, if possible. So check if there's
+ * enough memory for the test to run, waiting briefly for some to free up.
+ */
+static void
+wait_for_free_mem(int need_mb)
+{
+       int64_t         memsize;
+       int             memorystatus_level;
+       size_t          size;
+       int64_t         avail;
+       int             err;
+       int             try;
+
+       /*
+        * get amount of memory in the machine
+        */
+       size = sizeof(memsize);
+       err = sysctlbyname("hw.memsize", &memsize, &size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(hw.memsize...) failed");
+
+       /*
+        * Use a loop to briefly sleep and recheck if short on memory.
+        */
+       try = 1;
+       for (;;) {
+
+               /*
+                * memorystatus_level is a percentage of memory available. For example 20 means 1/5 of memory.
+                * It currently doesn't exist on macOS but neither does jetsam, so pass the test there.
+                */
+               size = sizeof(memorystatus_level);
+               if (sysctlbyname("kern.memorystatus_level", &memorystatus_level, &size, NULL, 0) != 0)
+                       return;
+               T_QUIET; T_ASSERT_LE(memorystatus_level, 100, "memorystatus_level too high");
+               T_QUIET; T_ASSERT_GT(memorystatus_level, 0, "memorystatus_level negative");
+
+               /*
+                * jetsam kicks in at memory status level of 15%, so subtract that much out of what's available.
+                */
+               avail = MAX(0, (memsize * (memorystatus_level - 15)) / 100);
+
+               /*
+                * We're good to go if there's more than enough available.
+                */
+               if ((int64_t)need_mb * MEGABYTE < avail)
+                       return;
+
+               /*
+                * issue a message to log and sleep briefly to see if we can get more memory
+                */
+               if (try-- == 0)
+                       break;
+               T_LOG("Need %d MB, only %d MB available. sleeping 5 seconds for more to free. memorystatus_level %d",
+                   need_mb, (int)(avail / MEGABYTE), memorystatus_level);
+               sleep(5);
+       }
+       T_SKIP("Needed %d MB, but only %d MB available. Skipping test to avoid jetsam issues.",
+           need_mb, (int)(avail / MEGABYTE));
+}
+
+
+/*
+ * The main test calls this to spawn a child process which will run and
+ * exceed some memory limit. The child is initially suspended so that
+ * we can do the sysctl calls before it runs.
+ * Since this is a libdarwintest, the "-n" names the T_HELPER_DECL() that
+ * we want to run. The arguments specific to the test follow a "--".
+ */
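+/*
+ * For example (binary name and size are illustrative only), the spawn below
+ * is equivalent to invoking:
+ *
+ *     ./memorystatus_vm_map_fork -n child_process -- 50
+ *
+ * where 50 is the child's memory limit in MB.
+ */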
+static pid_t
+spawn_child_process(
+       char * const executable,
+       char * const memlimit,
+       short flags,
+       int priority,
+       int active_limit_mb,
+       int inactive_limit_mb)
+{
+       posix_spawnattr_t spawn_attrs;
+       int err;
+       pid_t child_pid;
+       char * const argv_child[] = { executable, "-n", "child_process", "--", memlimit, NULL };
+
+       err = posix_spawnattr_init(&spawn_attrs);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_init() failed");
+
+       err = posix_spawnattr_setflags(&spawn_attrs, POSIX_SPAWN_START_SUSPENDED);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_setflags() failed");
+
+       err = posix_spawnattr_setjetsam_ext(&spawn_attrs, flags, priority, active_limit_mb, inactive_limit_mb);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_setjetsam_ext() failed");
+
+       err = posix_spawn(&child_pid, executable, NULL, &spawn_attrs, argv_child, environ);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawn() failed");
+
+       return child_pid;
+}
+
+
+/*
+ * The parent calls this to continue the suspended child, then waits for its result.
+ * We collect its resource usage to verify the expected amount was allocated.
+ */
+static void
+test_child_process(pid_t child_pid, int *status, struct rusage *ru)
+{
+       int err = 0;
+       pid_t got_pid;
+
+       T_LOG("  continuing child[%d]\n", child_pid);
+
+       err = kill(child_pid, SIGCONT);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  kill(%d, SIGCONT) failed", child_pid);
+
+       T_LOG("  waiting for child[%d] to exit", child_pid);
+
+       got_pid = wait4(child_pid, status, 0, ru);
+       T_QUIET; T_ASSERT_EQ(child_pid, got_pid, "  wait4(%d, ...) returned %d", child_pid, got_pid);
+}
+
+/*
+ * The child process executes this code. With the given darwintest infrastructure,
+ * the easiest way for it to return information is via its exit status.
+ */
+T_HELPER_DECL(child_process, "child allocates memory to failure")
+{
+#define BYTESPERALLOC  MEGABYTE
+#define BYTESINEXCESS  (2 * MEGABYTE) /* 2 MB - arbitrary */
+       char *limit;
+       long limit_mb = 0;
+       long max_bytes_to_munch, bytes_remaining, bytes_this_munch;
+       void *mem = NULL;
+
+       /*
+        * This helper is run in a child process. The helper sees one argument
+        * as a string which is the amount of memory in megabytes to allocate.
+        */
+       if (argc != 1)
+               exit(NO_MEMSIZE_ARG);
+
+       limit = argv[0];
+       errno = 0;
+       limit_mb = strtol(limit, NULL, 10);
+       if (errno != 0 || limit_mb <= 0)
+               exit(INVALID_MEMSIZE);
+
+       /* Compute in excess of assigned limit */
+       max_bytes_to_munch = limit_mb * MEGABYTE;
+       max_bytes_to_munch += BYTESINEXCESS;
+
+       for (bytes_remaining = max_bytes_to_munch; bytes_remaining > 0; bytes_remaining -= bytes_this_munch) {
+               bytes_this_munch = MIN(bytes_remaining, BYTESPERALLOC);
+
+               mem = malloc((size_t)bytes_this_munch);
+               if (mem == NULL)
+                       exit(MALLOC_FAILED);
+               arc4random_buf(mem, (size_t)bytes_this_munch);
+       }
+
+       /* We chewed up all the memory we were asked to. */
+       exit(NORMAL_EXIT);
+}
+
+
+/*
+ * Actual test body.
+ */
+static void
+memorystatus_vm_map_fork_parent(int test_variant)
+{
+       int             max_task_pmem = 0; /* MB */
+       size_t          size = 0;
+       int             active_limit_mb = 0;
+       int             inactive_limit_mb = 0;
+       short           flags = 0;
+       char            memlimit_str[16];
+       pid_t           child_pid;
+       int             child_status;
+       uint64_t        kernel_pidwatch_val;
+       uint64_t        expected_pidwatch_val;
+       int             ret;
+       struct rusage   ru;
+       enum child_exits exit_val;
+
+       /*
+        * The code to set/get the pidwatch sysctl is only in
+        * development kernels. Skip the test if not on one.
+        */
+       if (!is_development_kernel()) {
+               T_SKIP("Can't test on release kernel");
+       }
+
+       /*
+	 * Determine a memory limit based on whether the system has one configured or not.
+        */
+       size = sizeof(max_task_pmem);
+       (void)sysctlbyname("kern.max_task_pmem", &max_task_pmem, &size, NULL, 0);
+       if (max_task_pmem <= 0)
+               max_task_pmem = 0;
+
+       if (test_variant == TEST_ALLOWED) {
+               
+               /*
+                * Tell the child to allocate less than 1/4 the system wide limit.
+                */
+               if (max_task_pmem / 4 - LIMIT_DELTA_MB <= 0) {
+                       active_limit_mb = LIMIT_DELTA_MB;
+               } else {
+                       active_limit_mb = max_task_pmem / 4 - LIMIT_DELTA_MB;
+               }
+               expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
+
+       } else { /* TEST_NOT_ALLOWED */
+
+               /*
+                * Tell the child to allocate more than 1/4 the system wide limit.
+                */
+               active_limit_mb = (max_task_pmem / 4) + LIMIT_DELTA_MB;
+               if (max_task_pmem == 0) {
+                       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
+               } else {
+                       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED;
+               }
+
+       }
+       inactive_limit_mb = active_limit_mb;
+       T_LOG("using limit of %d Meg", active_limit_mb);
+
+       /*
+        * When run as part of a larger suite, a previous test
+        * may have left the system temporarily with too little
+        * memory to run this test. We try to detect if there is
+        * enough free memory to proceed, waiting a little bit
+        * for memory to free up.
+        */
+       wait_for_free_mem(active_limit_mb);
+
+#if defined(__x86_64__)
+       /*
+        * vm_map_fork() is always allowed on desktop.
+        */
+       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
+#endif
+
+       /*
+        * Prepare the arguments needed to spawn the child process.
+        */
+	memset(memlimit_str, 0, sizeof(memlimit_str));
+       (void)sprintf(memlimit_str, "%d", active_limit_mb);
+
+       ret = _NSGetExecutablePath(testpath, &testpath_size);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "_NSGetExecutablePath(%s, ...)", testpath);
+
+       /*
+        * We put the child process in FOREGROUND to try and keep jetsam's hands off it.
+        */
+       child_pid = spawn_child_process(testpath, memlimit_str, flags,
+           JETSAM_PRIORITY_FOREGROUND, active_limit_mb, inactive_limit_mb);
+
+       expected_pidwatch_val |= (uint64_t)child_pid;
+
+       /*
+	 * We only reach here if the parent successfully spawned the child process.
+        */
+       T_LOG("  spawned child_pid[%d] with memlimit %s (%d)MB\n",
+           child_pid, memlimit_str, active_limit_mb);
+
+       /*
+        * Set the kernel's pidwatch to look for the child.
+        */
+       (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
+       (void)set_memorystatus_vm_map_fork_pidwatch(child_pid);
+
+       /*
+        * Let the child run and wait for it to finish.
+        */
+       test_child_process(child_pid, &child_status, &ru);
+       T_LOG("Child exited with max_rss of %ld", ru.ru_maxrss);
+
+       /*
+        * Retrieve the kernel's pidwatch value. This should now indicate
+        * if the corpse was allowed or not.
+        */
+       kernel_pidwatch_val = get_memorystatus_vm_map_fork_pidwatch();
+       (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
+
+       /*
+        * If the child died abnormally, the test is invalid.
+        */
+       if (!WIFEXITED(child_status)) {
+               if (WIFSIGNALED(child_status)) {
+                       /* jetsam kills a process with SIGKILL */
+                       if (WTERMSIG(child_status) == SIGKILL)
+                               T_LOG("Child appears to have been a jetsam victim");
+                       T_SKIP("Child terminated by signal %d test result invalid", WTERMSIG(child_status));
+               }
+               T_SKIP("child did not exit normally (status=%d) test result invalid", child_status);
+       }
+
+       /*
+        * We don't expect the child to exit for any other reason than success
+        */
+       exit_val = (enum child_exits)WEXITSTATUS(child_status);
+       T_QUIET; T_ASSERT_EQ(exit_val, NORMAL_EXIT, "child exit due to: %s", 
+           (0 < exit_val && exit_val < NUM_CHILD_EXIT) ? child_exit_why[exit_val] : "unknown");
+
+       /*
+        * If the kernel aborted generating a corpse for other reasons, the test is invalid.
+        */
+       if (kernel_pidwatch_val == -1ull) {
+               T_SKIP("corpse generation was aborted by kernel");
+       }
+
+       /*
+        * We should always have made it through the vm_map_fork() checks in the kernel for this test.
+        */
+       T_QUIET; T_ASSERT_NE_ULLONG(kernel_pidwatch_val, (uint64_t)child_pid, "child didn't trigger corpse generation");
+
+       T_EXPECT_EQ(kernel_pidwatch_val, expected_pidwatch_val, "kernel value 0x%llx - expected 0x%llx",
+           kernel_pidwatch_val, expected_pidwatch_val);
+}
+
+/*
+ * The order of these 2 test functions is important. They will be executed by the test framework in order.
+ *
+ * We test "not allowed" first, then "allowed". If it were the other way around, the corpse from the "allowed"
+ * test would likely cause memory pressure and jetsam would likely kill the "not allowed" test.
+ */
+T_DECL(memorystatus_vm_map_fork_test_not_allowed, "test that corpse generation was not allowed")
+{
+       memorystatus_vm_map_fork_parent(TEST_NOT_ALLOWED);
+}
+
+T_DECL(memorystatus_vm_map_fork_test_allowed, "test corpse generation allowed")
+{
+
+       memorystatus_vm_map_fork_parent(TEST_ALLOWED);
+}
diff --git a/tests/memorystatus_zone_test.c b/tests/memorystatus_zone_test.c
new file mode 100644 (file)
index 0000000..007970e
--- /dev/null
@@ -0,0 +1,591 @@
+#include <stdio.h>
+#include <mach/mach_vm.h>
+#include <mach/mach_port.h>
+#include <mach/mach_host.h>
+#include <mach/mach_error.h>
+#include <mach-o/dyld.h>
+#include <sys/sysctl.h>
+#include <sys/kdebug.h>
+#include <sys/mman.h>
+#include <sys/kern_memorystatus.h>
+#include <ktrace/session.h>
+#include <dispatch/private.h>
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false)
+);
+
+#define TIMEOUT_SECS                                   1500
+
+#if TARGET_OS_EMBEDDED
+#define ALLOCATION_SIZE_VM_REGION                              (16*1024)               /* 16 KB */
+#define ALLOCATION_SIZE_VM_OBJECT                              ALLOCATION_SIZE_VM_REGION
+#else
+#define ALLOCATION_SIZE_VM_REGION                              (1024*1024*100) /* 100 MB */
+#define ALLOCATION_SIZE_VM_OBJECT                              (16*1024)               /* 16 KB */
+#endif
+#define MAX_CHILD_PROCS                                100
+
+#define ZONEMAP_JETSAM_LIMIT_SYSCTL                    "kern.zone_map_jetsam_limit=60"
+
+#define VME_ZONE_TEST_OPT                              "allocate_vm_regions"
+#define VM_OBJECTS_ZONE_TEST_OPT                       "allocate_vm_objects"
+#define GENERIC_ZONE_TEST_OPT                          "allocate_from_generic_zone"
+
+#define VME_ZONE                                                               "VM map entries"
+#define VMOBJECTS_ZONE                                                 "vm objects"
+#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO   98
+
+#define VM_TAG1                                                                        100
+#define VM_TAG2                                                                        101
+
+enum {
+	VME_ZONE_TEST = 0,
+	VM_OBJECTS_ZONE_TEST,
+	GENERIC_ZONE_TEST,
+};
+
+typedef struct test_config_struct {
+       int test_index;
+       int num_zones;
+       const char *helper_func;
+       mach_zone_name_array_t zone_names;
+} test_config_struct;
+
+static test_config_struct current_test;
+static int num_children = 0;
+static bool test_ending = false;
+static bool within_dispatch_signal_handler = false;
+static bool within_dispatch_timer_handler = false;
+static dispatch_source_t ds_signal = NULL;
+static dispatch_source_t ds_timer = NULL;
+static ktrace_session_t session = NULL;
+
+static mach_zone_info_array_t zone_info_array = NULL;
+static mach_zone_name_t largest_zone_name;
+static mach_zone_info_t largest_zone_info;
+
+static char testpath[PATH_MAX];
+static pid_t child_pids[MAX_CHILD_PROCS];
+static pthread_mutex_t test_ending_mtx;
+
+static void allocate_vm_regions(void);
+static void allocate_vm_objects(void);
+static void allocate_from_generic_zone(void);
+static void cleanup_and_end_test(void);
+static void setup_ktrace_session(void);
+static void spawn_child_process(void);
+static void run_test(void);
+static bool verify_generic_jetsam_criteria(void);
+static bool vme_zone_compares_to_vm_objects(void);
+static void print_zone_map_size(void);
+static void query_zone_info(void);
+static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi);
+
+extern void mach_zone_force_gc(host_t host);
+extern kern_return_t mach_zone_info_for_largest_zone(
+       host_priv_t host,
+       mach_zone_name_t *name,
+       mach_zone_info_t *info
+);
+
+static void allocate_vm_regions(void)
+{
+       uint64_t alloc_size = ALLOCATION_SIZE_VM_REGION, i = 0;
+
+       printf("[%d] Allocating VM regions, each of size %lld KB\n", getpid(), (alloc_size>>10));
+       for (i = 0; ; i++) {
+               mach_vm_address_t addr = (mach_vm_address_t)NULL;
+
+               /* Alternate VM tags between consecutive regions to prevent coalescing */
+               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
+
+               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
+                       break;
+               }
+       }
+       printf("[%d] Number of allocations: %lld\n", getpid(), i);
+
+       /* Signal to the parent that we're done allocating */
+       kill(getppid(), SIGUSR1);
+
+       while (1) {
+               pause();
+       }
+}
+
+static void allocate_vm_objects(void)
+{
+       uint64_t alloc_size = ALLOCATION_SIZE_VM_OBJECT, i = 0;
+
+       printf("[%d] Allocating VM regions, each of size %lld KB, each backed by a VM object\n", getpid(), (alloc_size>>10));
+       for (i = 0; ; i++) {
+               mach_vm_address_t addr = (mach_vm_address_t)NULL;
+
+               /* Alternate VM tags between consecutive regions to prevent coalescing */
+               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
+
+               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
+                       break;
+               }
+               /* Touch the region so the VM object can actually be created */
+               *((int *)addr) = 0;
+               /* OK to free this page. Keeps us from holding a lot of dirty pages */
+               madvise((void *)addr, (size_t)alloc_size, MADV_FREE);
+       }
+       printf("[%d] Number of allocations: %lld\n", getpid(), i);
+
+       /* Signal to the parent that we're done allocating */
+       kill(getppid(), SIGUSR1);
+
+       while (1) {
+               pause();
+       }
+}
+
+static void allocate_from_generic_zone(void)
+{
+       uint64_t i = 0;
+
+       printf("[%d] Allocating mach_ports\n", getpid());
+       for (i = 0; ; i++) {
+               mach_port_t port;
+
+               if ((mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port)) != KERN_SUCCESS) {
+                       break;
+               }
+       }
+       printf("[%d] Number of allocations: %lld\n", getpid(), i);
+
+       /* Signal to the parent that we're done allocating */
+       kill(getppid(), SIGUSR1);
+
+       while (1) {
+               pause();
+       }
+}
+
+static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi)
+{
+       T_LOG("ZONE NAME: %-35sSIZE: %-25lluELEMENTS: %llu",
+                       zn->mzn_name, zi->mzi_cur_size, zi->mzi_count);
+}
+
+static void query_zone_info(void)
+{
+       int i;
+       kern_return_t kr;
+       static uint64_t num_calls = 0;
+
+       for (i = 0; i < current_test.num_zones; i++) {
+               kr = mach_zone_info_for_zone(mach_host_self(), current_test.zone_names[i], &(zone_info_array[i]));
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_zone(%s) returned %d [%s]", current_test.zone_names[i].mzn_name, kr, mach_error_string(kr));
+       }
+       kr = mach_zone_info_for_largest_zone(mach_host_self(), &largest_zone_name, &largest_zone_info);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_largest_zone returned %d [%s]", kr, mach_error_string(kr));
+
+       num_calls++;
+       if (num_calls % 10 != 0) {
+               return;
+       }
+
+       /* Print out size and element count for zones relevant to the test */
+       for (i = 0; i < current_test.num_zones; i++) {
+               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
+       }
+}
+
+static bool vme_zone_compares_to_vm_objects(void)
+{
+       int i;
+       uint64_t vm_object_element_count = 0, vm_map_entry_element_count = 0;
+
+       T_LOG("Comparing element counts of \"VM map entries\" and \"vm objects\" zones");
+       for (i = 0; i < current_test.num_zones; i++) {
+               if (!strcmp(current_test.zone_names[i].mzn_name, VME_ZONE)) {
+                       vm_map_entry_element_count = zone_info_array[i].mzi_count;
+               } else if (!strcmp(current_test.zone_names[i].mzn_name, VMOBJECTS_ZONE)) {
+                       vm_object_element_count = zone_info_array[i].mzi_count;
+               }
+               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
+       }
+
+       T_LOG("# VM map entries as percentage of # vm objects = %llu", (vm_map_entry_element_count * 100)/ vm_object_element_count);
+       if (vm_map_entry_element_count >= ((vm_object_element_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
+               T_LOG("Number of VM map entries is comparable to vm objects\n\n");
+               return true;
+       }
+       T_LOG("Number of VM map entries is NOT comparable to vm objects\n\n");
+       return false;
+}
+
+static bool verify_generic_jetsam_criteria(void)
+{
+       T_LOG("Largest zone info");
+       print_zone_info(&largest_zone_name, &largest_zone_info);
+
+       /* If VM map entries is not the largest zone */
+       if (strcmp(largest_zone_name.mzn_name, VME_ZONE)) {
+               /* If vm objects is the largest zone and the VM map entries zone had comparable # of elements, return false */
+               if (!strcmp(largest_zone_name.mzn_name, VMOBJECTS_ZONE) && vme_zone_compares_to_vm_objects()) {
+                       return false;
+               }
+               return true;
+       }
+       return false;
+}
+
+static void cleanup_and_end_test(void)
+{
+       int i;
+
+       /*
+        * The atend handler executes on a different dispatch queue.
+        * We want to do the cleanup only once.
+        */
+       pthread_mutex_lock(&test_ending_mtx);
+       if (test_ending) {
+               pthread_mutex_unlock(&test_ending_mtx);
+               return;
+       }
+       test_ending = true;
+       pthread_mutex_unlock(&test_ending_mtx);
+
+       T_LOG("Number of processes spawned: %d", num_children);
+       T_LOG("Cleaning up...");
+
+       /* Disable the timer that queries and prints zone info periodically */
+       if (ds_timer != NULL && !within_dispatch_timer_handler) {
+               dispatch_source_cancel(ds_timer);
+       }
+
+       /* Disable signal handler that spawns child processes, only if we're not in the event handler's context */
+       if (ds_signal != NULL && !within_dispatch_signal_handler) {
+               dispatch_source_cancel_and_wait(ds_signal);
+       }
+
+       /* Kill all the child processes that were spawned */
+       for (i = 0; i < num_children; i++) {
+               kill(child_pids[i], SIGKILL);
+               /*
+                * Sleep between kills to avoid hogging the VM map entries zone lock (on the task_terminate path).
+                * Without this we were seeing hw_lock_bit timeouts in BATS.
+                */
+               sleep(1);
+       }
+       for (i = 0; i < num_children; i++) {
+               int status = 0;
+               if (waitpid(child_pids[i], &status, 0) < 0) {
+                       T_LOG("waitpid returned status %d", status);
+               }
+       }
+       sleep(1);
+
+       /* Force zone_gc before starting test for another zone or exiting */
+       mach_zone_force_gc(mach_host_self());
+
+       /* End ktrace session */
+       if (session != NULL) {
+               ktrace_end(session, 1);
+       }
+
+       for (i = 0; i < current_test.num_zones; i++) {
+               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
+       }
+}
+
+static void setup_ktrace_session(void)
+{
+       int ret = 0;
+
+       T_LOG("Setting up ktrace session...");
+       session = ktrace_session_create();
+       T_QUIET; T_ASSERT_NOTNULL(session, "ktrace_session_create");
+
+       ktrace_set_interactive(session);
+
+       ktrace_set_completion_handler(session, ^{
+               ktrace_session_destroy(session);
+               T_END;
+       });
+
+       /* Listen for memorystatus_do_kill trace events */
+       ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, ^(ktrace_event_t event) {
+               int i;
+               bool received_jetsam_event = false;
+
+               /* We don't care about jetsams for any other reason except zone-map-exhaustion */
+               if (event->arg2 == kMemorystatusKilledZoneMapExhaustion) {
+                       cleanup_and_end_test();
+                       T_LOG("[memorystatus_do_kill] jetsam reason: zone-map-exhaustion, pid: %lu\n\n", event->arg1);
+                       if (current_test.test_index == VME_ZONE_TEST || current_test.test_index == VM_OBJECTS_ZONE_TEST) {
+                               /*
+                                * For the VM map entries zone we try to kill the leaking process.
+                                * Verify that we jetsammed one of the processes we spawned.
+                                *
+                                * For the vm objects zone we pick the leaking process via the VM map entries
+                                * zone, if the number of vm objects and VM map entries are comparable.
+                                * The test simulates this scenario, we should see a targeted jetsam for the
+                                * vm objects zone too.
+                                */
+                               for (i = 0; i < num_children; i++) {
+                                       if (child_pids[i] == (pid_t)event->arg1) {
+                                               received_jetsam_event = true;
+                                               break;
+                                       }
+                               }
+                               /*
+                                * If we didn't see a targeted jetsam, verify that the largest zone actually
+                                * fulfilled the criteria for generic jetsams.
+                                */
+                               if (!received_jetsam_event && verify_generic_jetsam_criteria()) {
+                                       received_jetsam_event = true;
+                               }
+                       } else {
+                               received_jetsam_event = true;
+                       }
+
+                       T_ASSERT_TRUE(received_jetsam_event, "Received zone-map-exhaustion jetsam event as expected");
+               }
+       });
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_single");
+
+       ret = ktrace_start(session, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start");
+}
+
+static void print_zone_map_size(void)
+{
+       int ret;
+       uint64_t zstats[2];
+       size_t zstats_size = sizeof(zstats);
+
+       ret = sysctlbyname("kern.zone_map_size_and_capacity", &zstats, &zstats_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_size_and_capacity failed");
+
+       T_LOG("Zone map capacity: %-30lldZone map size: %lld [%lld%% full]", zstats[1], zstats[0], (zstats[0] * 100)/zstats[1]);
+}
+
+static void spawn_child_process(void)
+{
+       pid_t pid = -1;
+       char helper_func[50];
+       char *launch_tool_args[4];
+
+       T_QUIET; T_ASSERT_LT(num_children, MAX_CHILD_PROCS, "Spawned %d children. Timing out...", MAX_CHILD_PROCS);
+
+       strlcpy(helper_func, current_test.helper_func, sizeof(helper_func));
+       launch_tool_args[0] = testpath;
+       launch_tool_args[1] = "-n";
+       launch_tool_args[2] = helper_func;
+       launch_tool_args[3] = NULL;
+
+       /* Spawn the child process */
+       int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
+       if (rc != 0) {
+		T_LOG("dt_launch_tool returned %d with error code %d", rc, errno);
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
+
+       child_pids[num_children++] = pid;
+}
+
+static void run_test(void)
+{
+       uint64_t mem;
+       uint32_t testpath_buf_size, pages;
+       int ret, dev, pgsz;
+       size_t sysctl_size;
+
+       T_ATEND(cleanup_and_end_test);
+       T_SETUPBEGIN;
+
+       dev = 0;
+       sysctl_size = sizeof(dev);
+       ret = sysctlbyname("kern.development", &dev, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.development failed");
+       if (dev == 0) {
+               T_SKIP("Skipping test on release kernel");
+       }
+
+       testpath_buf_size = sizeof(testpath);
+       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
+       T_LOG("Executable path: %s", testpath);
+
+       sysctl_size = sizeof(mem);
+       ret = sysctlbyname("hw.memsize", &mem, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl hw.memsize failed");
+       T_LOG("hw.memsize: %llu", mem);
+
+       sysctl_size = sizeof(pgsz);
+       ret = sysctlbyname("vm.pagesize", &pgsz, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pagesize failed");
+       T_LOG("vm.pagesize: %d", pgsz);
+
+       sysctl_size = sizeof(pages);
+       ret = sysctlbyname("vm.pages", &pages, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pages failed");
+       T_LOG("vm.pages: %d", pages);
+
+       zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array);
+
+       print_zone_map_size();
+
+       /*
+        * If the timeout specified by T_META_TIMEOUT is hit, the atend handler does not get called.
+        * So we're queueing a dispatch block to fire after TIMEOUT_SECS seconds, so we can exit cleanly.
+        */
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TIMEOUT_SECS * NSEC_PER_SEC), dispatch_get_main_queue(), ^{
+               T_ASSERT_FAIL("Timed out after %d seconds", TIMEOUT_SECS);
+       });
+
+       /*
+        * Create a dispatch source for the signal SIGUSR1. When a child is done allocating zone memory, it
+        * sends SIGUSR1 to the parent. Only then does the parent spawn another child. This prevents us from
+        * spawning many children at once and creating a lot of memory pressure.
+        */
+       signal(SIGUSR1, SIG_IGN);
+       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create: signal");
+
+       dispatch_source_set_event_handler(ds_signal, ^{
+               within_dispatch_signal_handler = true;
+               print_zone_map_size();
+
+               /* Wait a few seconds before spawning another child. Keeps us from allocating too aggressively */
+               sleep(5);
+               spawn_child_process();
+               within_dispatch_signal_handler = false;
+       });
+       dispatch_activate(ds_signal);
+
+       /* Timer to query jetsam-relevant zone info every second. Print it every 10 seconds. */
+       ds_timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, dispatch_queue_create("timer_queue", NULL));
+       T_QUIET; T_ASSERT_NOTNULL(ds_timer, "dispatch_source_create: timer");
+	dispatch_source_set_timer(ds_timer, dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), NSEC_PER_SEC, 0);
+
+       dispatch_source_set_event_handler(ds_timer, ^{
+               within_dispatch_timer_handler = true;
+               query_zone_info();
+               within_dispatch_timer_handler = false;
+	});
+       dispatch_activate(ds_timer);
+
+       /* Set up a ktrace session to listen for jetsam events */
+       setup_ktrace_session();
+
+       T_SETUPEND;
+
+       /* Spawn the first child process */
+       T_LOG("Spawning child processes to allocate zone memory...\n\n");
+       spawn_child_process();
+
+       dispatch_main();
+}
+
+static void move_to_idle_band(void)
+{
+       memorystatus_priority_properties_t props;
+
+       /*
+        * We want to move the processes we spawn into the idle band, so that jetsam can target them first.
+	 * This prevents other important BATS tasks from getting killed, especially in LTE where we have very few
+        * processes running.
+        *
+        * This is only needed for tests which (are likely to) lead us down the generic jetsam path.
+        */
+       props.priority = JETSAM_PRIORITY_IDLE;
+       props.user_data = 0;
+
+       if (memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), 0, &props, sizeof(props))) {
+               printf("memorystatus call to change jetsam priority failed\n");
+               exit(-1);
+       }
+}
+
+T_HELPER_DECL(allocate_vm_regions, "allocates VM regions")
+{
+       allocate_vm_regions();
+}
+
+T_HELPER_DECL(allocate_vm_objects, "allocates VM objects and VM regions")
+{
+       move_to_idle_band();
+       allocate_vm_objects();
+}
+
+T_HELPER_DECL(allocate_from_generic_zone, "allocates from a generic zone")
+{
+       move_to_idle_band();
+       allocate_from_generic_zone();
+}
+
+/*
+ * T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL) changes the zone_map_jetsam_limit to a
+ * lower value, so that the test can complete faster.
+ * The test allocates zone memory pretty aggressively which can cause the system to panic
+ * if the jetsam limit is quite high; a lower value keeps us from panicking.
+ */
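+/*
+ * Roughly equivalent to lowering the limit by hand before running, e.g.
+ * (illustrative):
+ *
+ *     sysctl kern.zone_map_jetsam_limit=60
+ *
+ * except that the T_META_SYSCTL_INT meta arranges for the value to be
+ * applied while the test runs.
+ */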
+T_DECL(        memorystatus_vme_zone_test,
+               "allocates elements from the VM map entries zone, verifies zone-map-exhaustion jetsams",
+               T_META_ASROOT(true),
+               T_META_TIMEOUT(1800),
+/*             T_META_LTEPHASE(LTE_POSTINIT),
+ */
+               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
+{
+       current_test = (test_config_struct) {
+               .test_index = VME_ZONE_TEST,
+               .helper_func = VME_ZONE_TEST_OPT,
+               .num_zones = 1,
+               .zone_names = (mach_zone_name_t []){
+                       { .mzn_name = VME_ZONE }
+               }
+       };
+       run_test();
+}
+
+T_DECL(        memorystatus_vm_objects_zone_test,
+               "allocates elements from the VM objects and the VM map entries zones, verifies zone-map-exhaustion jetsams",
+               T_META_ASROOT(true),
+               T_META_TIMEOUT(1800),
+/*             T_META_LTEPHASE(LTE_POSTINIT),
+ */
+               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
+{
+       current_test = (test_config_struct) {
+               .test_index = VM_OBJECTS_ZONE_TEST,
+               .helper_func = VM_OBJECTS_ZONE_TEST_OPT,
+               .num_zones = 2,
+               .zone_names = (mach_zone_name_t []){
+                       { .mzn_name = VME_ZONE },
+                       { .mzn_name = VMOBJECTS_ZONE}
+               }
+       };
+       run_test();
+}
+
+T_DECL(        memorystatus_generic_zone_test,
+               "allocates elements from a zone that doesn't have an optimized jetsam path, verifies zone-map-exhaustion jetsams",
+               T_META_ASROOT(true),
+               T_META_TIMEOUT(1800),
+/*             T_META_LTEPHASE(LTE_POSTINIT),
+ */
+               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
+{
+       current_test = (test_config_struct) {
+               .test_index = GENERIC_ZONE_TEST,
+               .helper_func = GENERIC_ZONE_TEST_OPT,
+               .num_zones = 0,
+               .zone_names = NULL
+       };
+       run_test();
+}
diff --git a/tests/mktimer_kobject.c b/tests/mktimer_kobject.c
new file mode 100644 (file)
index 0000000..54b24a0
--- /dev/null
@@ -0,0 +1,50 @@
+#include <stdint.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <mach/mach.h>
+#include <mach/mk_timer.h>
+
+#include <darwintest.h>
+
+T_DECL(mktimer_kobject, "mktimer_kobject()", T_META_ALL_VALID_ARCHS(true))
+{
+       mach_port_t timer_port = MACH_PORT_NULL;
+       mach_port_t notify_port = MACH_PORT_NULL;
+
+       kern_return_t kr = KERN_SUCCESS;
+
+       // timer port
+       // This is a receive right which is also a kobject
+       timer_port = mk_timer_create();
+       T_ASSERT_NE(timer_port, (mach_port_t)MACH_PORT_NULL, "mk_timer_create: %s", mach_error_string(kr));
+
+	kr = mach_port_set_context(mach_task_self(), timer_port, (mach_port_context_t) 0x1);
+       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(timer_port): %s", mach_error_string(kr));
+
+       // notification port for the mk_timer port to come back on
+       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &notify_port);
+       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr));
+
+       kr = mach_port_set_context(mach_task_self(), notify_port, (mach_port_context_t) 0x2);
+       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(notify_port): %s", mach_error_string(kr));
+
+       T_LOG("timer: 0x%x, notify: 0x%x", timer_port, notify_port);
+
+       mach_port_t previous = MACH_PORT_NULL;
+
+       // request a port-destroyed notification on the timer port
+       kr = mach_port_request_notification(mach_task_self(), timer_port, MACH_NOTIFY_PORT_DESTROYED,
+                                           0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous);
+       // this should fail!
+       T_ASSERT_NE(kr, KERN_SUCCESS, "notifications should NOT work on mk_timer ports!");
+
+       // destroy the timer port to send the notification
+       mach_port_mod_refs(mach_task_self(), timer_port, MACH_PORT_RIGHT_RECEIVE, -1);
+
+       // destroy the notification port
+       mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1);
+
+       T_LOG("done");
+}
+
diff --git a/tests/monotonic_core.c b/tests/monotonic_core.c
new file mode 100644 (file)
index 0000000..3feaeba
--- /dev/null
@@ -0,0 +1,237 @@
+/*
+ * Must come before including darwintest.h
+ */
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif /* defined(T_NAMESPACE) */
+
+#include <darwintest.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#ifndef PRIVATE
+/*
+ * Need new CPU families.
+ */
+#define PRIVATE
+#include <mach/machine.h>
+#undef PRIVATE
+#else /* !defined(PRIVATE) */
+#include <mach/machine.h>
+#endif /* defined(PRIVATE) */
+#include <ktrace.h>
+#include <mach/mach.h>
+#include <stdint.h>
+#include <System/sys/guarded.h>
+#include <System/sys/monotonic.h>
+#include <sys/ioctl.h>
+#include <sys/kdebug.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.monotonic"),
+               T_META_CHECK_LEAKS(false)
+);
+
+static void
+skip_if_unsupported(void)
+{
+       int r;
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+
+       r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size,
+                       NULL, 0);
+       if (r < 0) {
+               T_WITH_ERRNO;
+               T_SKIP("could not find \"kern.monotonic.supported\" sysctl");
+       }
+
+       if (!supported) {
+               T_SKIP("monotonic is not supported on this platform");
+       }
+}
+
+static void
+check_fixed_counts(uint64_t counts[2][2])
+{
+       T_QUIET;
+       T_EXPECT_GT(counts[0][0], UINT64_C(0), "instructions are larger than 0");
+       T_QUIET;
+       T_EXPECT_GT(counts[0][1], UINT64_C(0), "cycles are larger than 0");
+
+       T_EXPECT_GT(counts[1][0], counts[0][0], "instructions increase monotonically");
+       T_EXPECT_GT(counts[1][1], counts[0][1], "cycles increase monotonically");
+}
+
+T_DECL(core_fixed_thread_self, "check the current thread's fixed counters",
+               T_META_ASROOT(true))
+{
+       int err;
+       extern int thread_selfcounts(int type, void *buf, size_t nbytes);
+       uint64_t counts[2][2];
+
+       T_SETUPBEGIN;
+       skip_if_unsupported();
+       T_SETUPEND;
+
+       err = thread_selfcounts(1, &counts[0], sizeof(counts[0]));
+       T_ASSERT_POSIX_ZERO(err, "thread_selfcounts");
+       err = thread_selfcounts(1, &counts[1], sizeof(counts[1]));
+       T_ASSERT_POSIX_ZERO(err, "thread_selfcounts");
+
+       check_fixed_counts(counts);
+}
+
+T_DECL(core_fixed_task, "check that task counting is working",
+               T_META_ASROOT(true))
+{
+       task_t task = mach_task_self();
+       kern_return_t kr;
+       mach_msg_type_number_t size = TASK_INSPECT_BASIC_COUNTS_COUNT;
+       uint64_t counts[2][2];
+
+       skip_if_unsupported();
+
+       kr = task_inspect(task, TASK_INSPECT_BASIC_COUNTS,
+                       (task_inspect_info_t)&counts[0], &size);
+       T_ASSERT_MACH_SUCCESS(kr,
+                       "task_inspect(... TASK_INSPECT_BASIC_COUNTS ...)");
+
+       size = TASK_INSPECT_BASIC_COUNTS_COUNT;
+       kr = task_inspect(task, TASK_INSPECT_BASIC_COUNTS,
+                       (task_inspect_info_t)&counts[1], &size);
+       T_ASSERT_MACH_SUCCESS(kr,
+                       "task_inspect(... TASK_INSPECT_BASIC_COUNTS ...)");
+
+       check_fixed_counts(counts);
+}
+
+T_DECL(core_fixed_kdebug, "check that the kdebug macros for monotonic work",
+               T_META_ASROOT(true))
+{
+       __block bool saw_events = false;
+       ktrace_session_t s;
+       int r;
+       int set = 1;
+
+       T_SETUPBEGIN;
+       skip_if_unsupported();
+
+       s = ktrace_session_create();
+       T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
+
+       ktrace_events_single_paired(s,
+                       KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_TMPCPU, 0x3fff),
+                       ^(struct trace_point *start, struct trace_point *end)
+       {
+               uint64_t counts[2][2];
+
+               saw_events = true;
+
+               counts[0][0] = start->arg1;
+               counts[0][1] = start->arg2;
+               counts[1][0] = end->arg1;
+               counts[1][1] = end->arg2;
+
+               check_fixed_counts(counts);
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               T_ASSERT_TRUE(saw_events, "should see monotonic kdebug events");
+               T_END;
+       });
+       T_SETUPEND;
+
+       T_ASSERT_POSIX_ZERO(ktrace_start(s,
+                       dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)), NULL);
+
+       r = sysctlbyname("kern.monotonic.kdebug_test", NULL, NULL, &set,
+                       sizeof(set));
+       T_ASSERT_POSIX_SUCCESS(r,
+                       "sysctlbyname(\"kern.monotonic.kdebug_test\", ...)");
+
+       ktrace_end(s, 0);
+       dispatch_main();
+}
+
+static void
+perf_sysctl_deltas(const char *sysctl_name, const char *stat_name)
+{
+       uint64_t deltas[2];
+       size_t deltas_size;
+       int r;
+
+       T_SETUPBEGIN;
+       skip_if_unsupported();
+
+       dt_stat_t instrs = dt_stat_create("instructions", "%s_instrs",
+                       stat_name);
+       dt_stat_t cycles = dt_stat_create("cycles", "%s_cycles", stat_name);
+       T_SETUPEND;
+
+       while (!dt_stat_stable(instrs) || !dt_stat_stable(cycles)) {
+               deltas_size = sizeof(deltas);
+               r = sysctlbyname(sysctl_name, deltas, &deltas_size, NULL, 0);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"%s\", ...)", sysctl_name);
+               dt_stat_add(instrs, (double)deltas[0]);
+               dt_stat_add(cycles, (double)deltas[1]);
+       }
+
+       dt_stat_finalize(instrs);
+       dt_stat_finalize(cycles);
+}
+
+T_DECL(perf_core_fixed_cpu, "test the performance of fixed CPU counter access",
+               T_META_ASROOT(true), T_META_TAG_PERF)
+{
+       perf_sysctl_deltas("kern.monotonic.fixed_cpu_perf", "fixed_cpu_counters");
+}
+
+T_DECL(perf_core_fixed_thread, "test the performance of fixed thread counter access",
+               T_META_ASROOT(true), T_META_TAG_PERF)
+{
+       perf_sysctl_deltas("kern.monotonic.fixed_thread_perf",
+                       "fixed_thread_counters");
+}
+
+T_DECL(perf_core_fixed_task, "test the performance of fixed task counter access",
+               T_META_ASROOT(true), T_META_TAG_PERF)
+{
+       perf_sysctl_deltas("kern.monotonic.fixed_task_perf", "fixed_task_counters");
+}
+
+T_DECL(perf_core_fixed_thread_self, "test the performance of thread self counts",
+               T_META_TAG_PERF)
+{
+       extern int thread_selfcounts(int type, void *buf, size_t nbytes);
+       uint64_t counts[2][2];
+
+       T_SETUPBEGIN;
+       dt_stat_t instrs = dt_stat_create("fixed_thread_self_instrs", "instructions");
+       dt_stat_t cycles = dt_stat_create("fixed_thread_self_cycles", "cycles");
+
+       skip_if_unsupported();
+       T_SETUPEND;
+
+       while (!dt_stat_stable(instrs) || !dt_stat_stable(cycles)) {
+               int r1, r2;
+
+               r1 = thread_selfcounts(1, &counts[0], sizeof(counts[0]));
+               r2 = thread_selfcounts(1, &counts[1], sizeof(counts[1]));
+               T_QUIET; T_ASSERT_POSIX_ZERO(r1, "__thread_selfcounts");
+               T_QUIET; T_ASSERT_POSIX_ZERO(r2, "__thread_selfcounts");
+
+               T_QUIET; T_ASSERT_GT(counts[1][0], counts[0][0],
+                               "instructions increase monotonically");
+               dt_stat_add(instrs, counts[1][0] - counts[0][0]);
+
+               T_QUIET; T_ASSERT_GT(counts[1][1], counts[0][1],
+                               "cycles increase monotonically");
+               dt_stat_add(cycles, counts[1][1] - counts[0][1]);
+       }
+
+       dt_stat_finalize(instrs);
+       dt_stat_finalize(cycles);
+}
diff --git a/tests/net_tun_pr_35136664.c b/tests/net_tun_pr_35136664.c
new file mode 100644 (file)
index 0000000..366f066
--- /dev/null
@@ -0,0 +1,63 @@
+
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/kern_control.h>
+#include <sys/sys_domain.h>
+
+#include <net/if_utun.h>
+#include <net/if_ipsec.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.net"));
+
+T_DECL(PR_35136664_utun,
+	"This binds a utun and closes it without connecting")
+{
+       int tunsock;
+       struct ctl_info kernctl_info;
+       struct sockaddr_ctl kernctl_addr;
+
+       T_ASSERT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
+
+       memset(&kernctl_info, 0, sizeof(kernctl_info));
+       strlcpy(kernctl_info.ctl_name, UTUN_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
+       T_ASSERT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
+
+       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
+       kernctl_addr.sc_len = sizeof(kernctl_addr);
+       kernctl_addr.sc_family = AF_SYSTEM;
+       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
+       kernctl_addr.sc_id = kernctl_info.ctl_id;
+       kernctl_addr.sc_unit = 0;
+
+       T_ASSERT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
+
+       T_ASSERT_POSIX_ZERO(close(tunsock), NULL);
+}
+
+T_DECL(PR_35136664_ipsec,
+	"This binds an ipsec and closes it without connecting")
+{
+       int tunsock;
+       struct ctl_info kernctl_info;
+       struct sockaddr_ctl kernctl_addr;
+
+       T_ASSERT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
+
+       memset(&kernctl_info, 0, sizeof(kernctl_info));
+       strlcpy(kernctl_info.ctl_name, IPSEC_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
+       T_ASSERT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
+
+       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
+       kernctl_addr.sc_len = sizeof(kernctl_addr);
+       kernctl_addr.sc_family = AF_SYSTEM;
+       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
+       kernctl_addr.sc_id = kernctl_info.ctl_id;
+       kernctl_addr.sc_unit = 0;
+
+       T_ASSERT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
+
+       T_ASSERT_POSIX_ZERO(close(tunsock), NULL);
+}
diff --git a/tests/net_tuntests.c b/tests/net_tuntests.c
new file mode 100644 (file)
index 0000000..91363ab
--- /dev/null
@@ -0,0 +1,536 @@
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <string.h>
+#include <unistd.h>
+#include <poll.h>
+#include <sys/event.h>
+#include <uuid/uuid.h>
+#include <arpa/inet.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/kern_control.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <sys/kern_control.h>
+#include <sys/sys_domain.h>
+
+#include <net/if.h>
+#include <net/if_ipsec.h>
+#include <net/if_utun.h>
+#include <netinet/in.h>
+#include <netinet/in_var.h>
+#include <net/pfkeyv2.h>
+#include <netinet6/ipsec.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include <skywalk/os_skywalk_private.h> // for SK_FEATURE_*
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.net.tun"));
+
+#if 0
+static void
+log_hexdump(const void *inp, size_t len)
+{
+       unsigned i, off = 0;
+       char buf[9+16*3+1];
+       for (i = 0; i < len; i++) {
+               if (i % 16 == 0)
+                       off = (unsigned)snprintf(buf, sizeof(buf), "%08x:", i);
+               off += (unsigned)snprintf(buf+off, sizeof(buf)-off, " %02x", (((const uint8_t *)inp)[i]) & 0xff);
+               if (i % 16 == 15)
+                       T_LOG("%s", buf);
+	}
+	if (len % 16)
+		T_LOG("%s", buf);
+}
+#endif
+
+static uint64_t
+get_skywalk_features(void)
+{
+       uint64_t features = 0;
+       size_t len = sizeof(features);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("kern.skywalk.features", &features, &len, NULL, 0), NULL);
+       T_QUIET; T_ASSERT_EQ(len, sizeof(features), NULL);
+       T_QUIET; T_ASSERT_TRUE(features & SK_FEATURE_SKYWALK, NULL);
+       return features;
+}
+
+static bool g_is_ipsec_test;
+static bool g_is_utun_test;
+static int g_OPT_ENABLE_NETIF = -1;
+static int g_OPT_ENABLE_FLOWSWITCH = -1;
+static int g_OPT_ENABLE_CHANNEL = -1;
+static int g_OPT_GET_CHANNEL_UUID = -1;
+static int g_OPT_IFNAME = -1;
+static char *g_CONTROL_NAME = NULL;
+
+static void
+setup_ipsec_test(void)
+{
+       T_LOG("Configuring for ipsec tests");
+       g_OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF;
+       g_OPT_ENABLE_FLOWSWITCH = IPSEC_OPT_ENABLE_FLOWSWITCH;
+       g_OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL;
+       g_OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID;
+       g_OPT_IFNAME = IPSEC_OPT_IFNAME;
+       g_CONTROL_NAME = IPSEC_CONTROL_NAME;
+       g_is_ipsec_test = true;
+}
+
+static void
+setup_utun_test(void)
+{
+       T_LOG("Configuring for utun tests");
+       g_OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF;
+       g_OPT_ENABLE_FLOWSWITCH = UTUN_OPT_ENABLE_FLOWSWITCH;
+       g_OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL;
+       g_OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID;
+       g_OPT_IFNAME = UTUN_OPT_IFNAME;
+       g_CONTROL_NAME = UTUN_CONTROL_NAME;
+       g_is_utun_test = true;
+}
+
+static void
+check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_channel, uuid_t uuid)
+{
+       int scratch;
+       socklen_t scratchlen, uuidlen;
+       uuid_t scratchuuid;
+       if (!uuid) {
+               uuid = scratchuuid;
+       }
+
+       //T_LOG("checking tunsock %d", tunsock);
+
+       scratchlen = sizeof(scratch);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+                       &scratch, &scratchlen), NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
+       T_QUIET; T_EXPECT_EQ(scratch, enable_netif, NULL);
+
+       scratchlen = sizeof(scratch);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                       &scratch, &scratchlen), NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
+       if (get_skywalk_features() & SK_FEATURE_NETNS) {
+               if (enable_netif) {
+                       T_QUIET; T_EXPECT_EQ(scratch, enable_flowswitch, NULL);
+               } else {
+                       T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
+               }
+       } else {
+               T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
+       }
+
+       scratchlen = sizeof(scratch);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                       &scratch, &scratchlen), NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
+       if (g_is_ipsec_test && !enable_netif) {
+               T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
+       } else {
+               T_QUIET; T_EXPECT_EQ(scratch, enable_channel, NULL);
+       }
+
+       if (scratch) {
+               uuid_clear(uuid);
+               uuidlen = sizeof(uuid_t);
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                               uuid, &uuidlen), NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+               T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
+       } else {
+               uuid_clear(uuid);
+               uuidlen = sizeof(uuid_t);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                               uuid, &uuidlen), ENXIO, NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+       }
+}
+
+static void
+tunsock_get_ifname(int s, char ifname[IFXNAMSIZ])
+{
+       socklen_t optlen = IFXNAMSIZ;
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_ZERO(getsockopt(s, SYSPROTO_CONTROL, g_OPT_IFNAME, ifname, &optlen), NULL);
+       T_QUIET; T_ASSERT_TRUE(optlen > 0, NULL);
+       T_QUIET; T_ASSERT_TRUE(ifname[optlen-1] == '\0', NULL);
+       T_QUIET; T_ASSERT_TRUE(strlen(ifname)+1 == optlen, "got ifname \"%s\" len %zd expected %u", ifname, strlen(ifname), optlen);
+}
+
+static short
+ifnet_get_flags(int s, const char ifname[IFNAMSIZ])
+{
+       struct ifreq    ifr;
+       memset(&ifr, 0, sizeof(ifr));
+       strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(s, SIOCGIFFLAGS, (caddr_t)&ifr), NULL);
+       return ifr.ifr_flags;
+}
+
+static void
+ifnet_add_addr4(const char ifname[IFNAMSIZ], struct in_addr *addr, struct in_addr *mask, struct in_addr *broadaddr)
+{
+       struct sockaddr_in *sin;
+       struct in_aliasreq ifra;
+       int s;
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(s = socket(AF_INET, SOCK_DGRAM, 0), NULL);
+
+       memset(&ifra, 0, sizeof(ifra));
+       strlcpy(ifra.ifra_name, ifname, sizeof(ifra.ifra_name));
+
+       if (addr != NULL) {
+               sin = &ifra.ifra_addr;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = *addr;
+       }
+
+       if (mask != NULL) {
+               sin = &ifra.ifra_mask;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = *mask;
+       }
+
+       if (broadaddr != NULL || (addr != NULL &&
+                 (ifnet_get_flags(s, ifname) & IFF_POINTOPOINT) != 0)) {
+               sin = &ifra.ifra_broadaddr;
+               sin->sin_len = sizeof(*sin);
+               sin->sin_family = AF_INET;
+               sin->sin_addr = (broadaddr != NULL) ? *broadaddr : *addr;
+       }
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(s, SIOCAIFADDR, &ifra), NULL);
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(s), NULL);
+}
+
+static int g_pfkeyso = -1;
+static struct in_addr g_addr1, g_addr2;
+
+static void
+create_sa(const char ifname[IFXNAMSIZ], uint8_t type, uint32_t spi, struct in_addr *src, struct in_addr *dst)
+{
+       if (g_pfkeyso == -1) {
+               T_QUIET; T_EXPECT_POSIX_SUCCESS(g_pfkeyso = socket(PF_KEY, SOCK_RAW, PF_KEY_V2), NULL);
+       }
+
+       /*
+               <base, SA, (lifetime(HS),) address(SD), (address(P),)
+               key(AE), (identity(SD),) (sensitivity)>
+       */
+
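+       /*
+        * One PF_KEY message with its extensions laid out inline; each
+        * extension is 8-byte aligned and its length is expressed in
+        * PFKEY_UNIT64 (8-byte) units.
+        */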
+       struct {
+               struct sadb_msg msg __attribute((aligned(sizeof (uint64_t))));
+               struct sadb_key key  __attribute((aligned(sizeof (uint64_t))));
+               struct sadb_sa sa  __attribute((aligned(sizeof (uint64_t))));
+               struct sadb_x_sa2 sa2  __attribute((aligned(sizeof (uint64_t))));
+               struct sadb_x_ipsecif ipsecif __attribute((aligned(sizeof (uint64_t))));
+               struct {
+                       struct sadb_address addr __attribute((aligned(sizeof (uint64_t))));
+                       struct sockaddr_in saddr __attribute((aligned(sizeof (uint64_t))));
+               } src;
+               struct {
+                       struct sadb_address addr __attribute((aligned(sizeof (uint64_t))));
+                       struct sockaddr_in saddr __attribute((aligned(sizeof (uint64_t))));
+               } dst;
+       } addcmd;
+
+       memset(&addcmd, 0, sizeof(addcmd));
+
+       addcmd.msg.sadb_msg_version = PF_KEY_V2;
+       addcmd.msg.sadb_msg_type = type;
+       addcmd.msg.sadb_msg_errno = 0;
+       addcmd.msg.sadb_msg_satype = SADB_SATYPE_ESP;
+       addcmd.msg.sadb_msg_len = PFKEY_UNIT64(sizeof(addcmd));
+       addcmd.msg.sadb_msg_reserved = 0;
+       addcmd.msg.sadb_msg_seq = 0;
+       addcmd.msg.sadb_msg_pid = (unsigned)getpid();
+
+       addcmd.key.sadb_key_len = PFKEY_UNIT64(sizeof(addcmd.key));
+       addcmd.key.sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
+       addcmd.key.sadb_key_bits = 0;
+       addcmd.key.sadb_key_reserved = 0;
+
+       addcmd.sa.sadb_sa_len = PFKEY_UNIT64(sizeof(addcmd.sa));
+       addcmd.sa.sadb_sa_exttype = SADB_EXT_SA;
+       addcmd.sa.sadb_sa_spi = htonl(spi);
+       addcmd.sa.sadb_sa_replay = 0;
+       addcmd.sa.sadb_sa_state = 0;
+       addcmd.sa.sadb_sa_auth = SADB_AALG_NONE;
+       addcmd.sa.sadb_sa_encrypt = SADB_EALG_NULL;
+       addcmd.sa.sadb_sa_flags = SADB_X_EXT_CYCSEQ;
+
+       addcmd.sa2.sadb_x_sa2_len = PFKEY_UNIT64(sizeof(addcmd.sa2));
+       addcmd.sa2.sadb_x_sa2_exttype = SADB_X_EXT_SA2;
+       addcmd.sa2.sadb_x_sa2_mode = IPSEC_MODE_ANY;
+       addcmd.sa2.sadb_x_sa2_alwaysexpire = 1;
+       addcmd.sa2.sadb_x_sa2_flags = SADB_X_EXT_SA2_DELETE_ON_DETACH;
+       addcmd.sa2.sadb_x_sa2_sequence = 0;
+       addcmd.sa2.sadb_x_sa2_reqid = 0;
+
+       addcmd.ipsecif.sadb_x_ipsecif_len = PFKEY_UNIT64(sizeof(addcmd.ipsecif));
+       addcmd.ipsecif.sadb_x_ipsecif_exttype = SADB_X_EXT_IPSECIF;
+       memset(addcmd.ipsecif.sadb_x_ipsecif_internal_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_internal_if));
+       memset(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if));
+       strlcpy(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if, ifname, sizeof(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if));
+       addcmd.ipsecif.sadb_x_ipsecif_init_disabled = 0;
+       addcmd.ipsecif.reserved = 0;
+
+       addcmd.src.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.src));
+       addcmd.src.addr.sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
+       addcmd.src.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
+       addcmd.src.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; /* prefix length is in bits: a full /32 host */
+       addcmd.src.addr.sadb_address_reserved = 0;
+       addcmd.src.saddr.sin_len = sizeof(addcmd.src.saddr);
+       addcmd.src.saddr.sin_family = AF_INET;
+       addcmd.src.saddr.sin_port = htons(0);
+       addcmd.src.saddr.sin_addr = *src;
+
+       addcmd.dst.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.dst));
+       addcmd.dst.addr.sadb_address_exttype = SADB_EXT_ADDRESS_DST;
+       addcmd.dst.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
+       addcmd.dst.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; /* prefix length is in bits: a full /32 host */
+       addcmd.dst.addr.sadb_address_reserved = 0;
+       addcmd.dst.saddr.sin_len = sizeof(addcmd.dst.saddr);
+       addcmd.dst.saddr.sin_family = AF_INET;
+       addcmd.dst.saddr.sin_port = htons(0);
+       addcmd.dst.saddr.sin_addr = *dst;
+
+       //log_hexdump(&addcmd, sizeof(addcmd));
+
+       ssize_t slen;
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(slen = send(g_pfkeyso, &addcmd, sizeof(addcmd), 0), NULL);
+       T_QUIET; T_EXPECT_EQ(slen, (ssize_t)sizeof(addcmd), NULL);
+}
+
+static int
+create_tunsock(int enable_netif, int enable_flowswitch, int enable_channel)
+{
+       int tunsock;
+       struct ctl_info kernctl_info;
+       struct sockaddr_ctl kernctl_addr;
+       uuid_t uuid;
+       socklen_t uuidlen;
+
+startover:
+
+       T_QUIET; T_EXPECT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
+
+       memset(&kernctl_info, 0, sizeof(kernctl_info));
+       strlcpy(kernctl_info.ctl_name, g_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
+
+       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
+       kernctl_addr.sc_len = sizeof(kernctl_addr);
+       kernctl_addr.sc_family = AF_SYSTEM;
+       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
+       kernctl_addr.sc_id = kernctl_info.ctl_id;
+       kernctl_addr.sc_unit = 0;
+
+       //T_LOG("enable_netif = %d, enable_flowswitch = %d, enable_channel = %d",
+       //enable_netif, enable_channel, enable_flowswitch);
+
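+       /* Before bind(), the enable options and the channel UUID query are expected to fail with EINVAL. */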
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+                       &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                       &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                       &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
+       uuid_clear(uuid);
+       uuidlen = sizeof(uuid_t);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                       uuid, &uuidlen), EINVAL, NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
+
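+       /* After bind() but before connect(), only the netif enable can be set; flowswitch and channel are still rejected. */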
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+                               &enable_netif, sizeof(enable_netif)), NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                       &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                       &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
+       uuid_clear(uuid);
+       uuidlen = sizeof(uuid_t);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                       uuid, &uuidlen), ENXIO, NULL);
+       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+
+       int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
+       if (error == -1 && errno == EBUSY) {
+               /* XXX remove this retry nonsense when this is fixed:
+                * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
+                */
+               close(tunsock);
+               T_LOG("connect got EBUSY, sleeping 1 second before retry");
+               sleep(1);
+               goto startover;
+       }
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(error, "connect()");
+
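+       /* Once connected, the netif enable can no longer be changed. */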
+       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
+                       &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
+
+       if (get_skywalk_features() & SK_FEATURE_NETNS) {
+               if (enable_netif) {
+                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                                       &enable_flowswitch, sizeof(enable_flowswitch)), NULL);
+               } else {
+                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                                       &enable_flowswitch, sizeof(enable_flowswitch)), ENOENT, NULL);
+               }
+       } else {
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
+                               &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL);
+       }
+
+       if (enable_channel) {
+               if (g_is_ipsec_test && !enable_netif) {
+                       /* ipsec doesn't support channels without a netif */
+                       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                                       &enable_channel, sizeof(enable_channel)), EOPNOTSUPP, NULL);
+                       uuid_clear(uuid);
+                       uuidlen = sizeof(uuid_t);
+                       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                                       uuid, &uuidlen), ENXIO, NULL);
+                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+               } else {
+                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                                       &enable_channel, sizeof(enable_channel)), NULL);
+                       uuid_clear(uuid);
+                       uuidlen = sizeof(uuid_t);
+                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                                       uuid, &uuidlen), NULL);
+                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+                       T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
+               }
+       } else {
+               T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
+                               &enable_channel, sizeof(enable_channel)), ENXIO, NULL);
+               uuid_clear(uuid);
+               uuidlen = sizeof(uuid_t);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
+                               uuid, &uuidlen), ENXIO, NULL);
+               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
+               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
+       }
+
+       check_enables(tunsock, enable_netif, enable_flowswitch, enable_channel, uuid);
+
+       //T_LOG("Returning tunsock %d", tunsock);
+
+       return tunsock;
+}
+
+#if 0
+static void
+ipsec_stats(void)
+{
+       struct ifmibdata ifmd;
+       size_t len;
+       int interesting_row = 0; /* ifmib row of the interface of interest */
+       int name[6] = { CTL_NET, PF_LINK, NETLINK_GENERIC, 0, 0, 0 };
+
+       len = sizeof(struct ifmibdata);
+       name[3] = IFMIB_IFDATA;
+       name[4] = interesting_row;
+       name[5] = IFDATA_GENERAL;
+       if (sysctl(name, 6, &ifmd, &len, (void *)0, 0) == -1)
+               err(1, "sysctl IFDATA_GENERAL %d", interesting_row);
+}
+#endif
+
+static void
+permute_enables(void)
+{
+       int tunsock;
+       T_EXPECT_GE(tunsock = create_tunsock(false, false, false), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, false, true), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, true, false), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(false, true, true), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, false, false), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, false, true), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, true, false), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+       T_EXPECT_GE(tunsock = create_tunsock(true, true, true), 0, NULL);
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
+}
+
+T_DECL(ipsec_enables, "This test checks combinations of netif/channel/flowswitch on ipsec")
+{
+       setup_ipsec_test();
+       permute_enables();
+}
+
+T_DECL(utun_enables, "This test checks combinations of netif/channel/flowswitch on utun")
+{
+       setup_utun_test();
+       permute_enables();
+}
+
+static int g_tunsock = -1;
+
+static void
+cleanup_tunsock(void)
+{
+       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(g_tunsock), NULL);
+       T_QUIET; T_EXPECT_POSIX_FAILURE(close(g_tunsock), EBADF, NULL);
+       if (g_is_ipsec_test) {
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(g_pfkeyso), NULL);
+               T_QUIET; T_EXPECT_POSIX_FAILURE(close(g_pfkeyso), EBADF, NULL);
+       }
+}
+
+static void
+setup_tunsock(void)
+{
+       T_ASSERT_GE(g_tunsock = create_tunsock(true, false, true), 0, NULL);
+       T_ATEND(cleanup_tunsock);
+
+       char ifname[IFXNAMSIZ];
+       tunsock_get_ifname(g_tunsock, ifname);
+
+       T_LOG("Created interface %s", ifname);
+
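+       /* Derive a 10.x.y.160 address from the pid; g_addr2 (the next address) is passed as the point-to-point destination. */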
+       uint32_t ifaddr = (10 << 24) | ((unsigned)getpid()&0xffff) << 8 | 160;
+       struct in_addr mask;
+       g_addr1.s_addr = htonl(ifaddr);
+       g_addr2.s_addr = htonl(ifaddr+1);
+       mask.s_addr = htonl(0xffffffff);
+
+       ifnet_add_addr4(ifname, &g_addr1, &mask, &g_addr2);
+
+       if (g_is_ipsec_test) {
+               create_sa(ifname, SADB_ADD, 12345, &g_addr1, &g_addr2);
+               create_sa(ifname, SADB_ADD, 12346, &g_addr2, &g_addr1);
+       }
+}
+
+T_DECL(setup_ipsec, "This test sets up an ipsec interface")
+{
+       setup_ipsec_test();
+       setup_tunsock();
+}
+
+T_DECL(setup_utun, "This test sets up a utun interface")
+{
+       setup_utun_test();
+       setup_tunsock();
+}
diff --git a/tests/netbsd_utimensat.c b/tests/netbsd_utimensat.c
new file mode 100644 (file)
index 0000000..c14f92a
--- /dev/null
@@ -0,0 +1,198 @@
+/*     $NetBSD: t_utimensat.c,v 1.6 2017/01/10 15:13:56 christos Exp $ */
+
+/*-
+ * Copyright (c) 2012 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Emmanuel Dreyfus.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__RCSID("$NetBSD: t_utimensat.c,v 1.6 2017/01/10 15:13:56 christos Exp $");
+
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <paths.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#define DIRPATH "dir"
+#define FILEPATH "dir/utimensat"
+#define BASEFILE "utimensat"
+#define LINK "dir/symlink"
+#define BASELINK "symlink"
+#define FILEERR "dir/symlink"
+
+static const struct timespec tptr[] = { 
+       { 0x12345678, 987654321 },
+       { 0x15263748, 123456789 },
+};
+
+static void chtmpdir(void)
+{
+       T_SETUPBEGIN;
+       T_ASSERT_POSIX_ZERO(chdir(dt_tmpdir()), NULL);
+
+       // <rdar://problem/31780295> dt_tmpdir() should guarantee a clean directory for each run
+       unlink(FILEPATH);
+       unlink(LINK);
+       rmdir(DIRPATH);
+
+       // Skip the test if the current working directory is not on APFS.
+       struct statfs sfs = { 0 };
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(".", &sfs), NULL);
+       if (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) != 0) {
+               T_SKIP("utimensat is APFS-only, but working directory is non-APFS");
+       }
+
+       T_SETUPEND;
+}
+
+T_DECL(netbsd_utimensat_fd, "See that utimensat works with fd")
+{
+       chtmpdir();
+
+       int dfd;
+       int fd;
+       struct stat st;
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
+       T_ASSERT_POSIX_ZERO(close(fd), NULL);
+
+       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
+       T_ASSERT_POSIX_ZERO(utimensat(dfd, BASEFILE, tptr, 0), NULL);
+       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
+
+       T_ASSERT_POSIX_ZERO(stat(FILEPATH, &st), NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
+}
+
+T_DECL(netbsd_utimensat_fdcwd, "See that utimensat works with fd as AT_FDCWD")
+{
+       chtmpdir();
+
+       int fd;
+       struct stat st;
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
+       T_ASSERT_POSIX_ZERO(close(fd), NULL);
+
+       T_ASSERT_POSIX_ZERO(chdir(DIRPATH), NULL);
+       T_ASSERT_POSIX_ZERO(utimensat(AT_FDCWD, BASEFILE, tptr, 0), NULL);
+
+       T_ASSERT_POSIX_ZERO(stat(BASEFILE, &st), NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
+}
+
+T_DECL(netbsd_utimensat_fdcwderr, "See that utimensat fails with fd as AT_FDCWD and bad path")
+{
+       chtmpdir();
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_EQ(utimensat(AT_FDCWD, FILEERR, tptr, 0), -1, NULL);
+}
+
+T_DECL(netbsd_utimensat_fderr1, "See that utimensat fails with bad path")
+{
+       chtmpdir();
+
+       int dfd;
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
+       T_ASSERT_EQ(utimensat(dfd, FILEERR, tptr, 0), -1, NULL);
+       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
+}
+
+T_DECL(netbsd_utimensat_fderr2, "See that utimensat fails with bad fdat")
+{
+       chtmpdir();
+
+       int dfd;
+       int fd;
+       char cwd[MAXPATHLEN];
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
+       T_ASSERT_POSIX_ZERO(close(fd), NULL);
+
+       T_ASSERT_POSIX_SUCCESS((dfd = open(getcwd(cwd, MAXPATHLEN), O_RDONLY, 0)), NULL);
+       T_ASSERT_EQ(utimensat(dfd, BASEFILE, tptr, 0), -1, NULL);
+       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
+}
+
+T_DECL(netbsd_utimensat_fderr3, "See that utimensat fails with fd as -1")
+{
+       chtmpdir();
+
+       int fd;
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
+       T_ASSERT_POSIX_ZERO(close(fd), NULL);
+
+       T_ASSERT_EQ(utimensat(-1, FILEPATH, tptr, 0), -1, NULL);
+}
+
+T_DECL(netbsd_utimensat_fdlink, "See that utimensat works on symlink")
+{
+       chtmpdir();
+
+       int dfd;
+       struct stat st;
+
+       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
+       T_ASSERT_POSIX_ZERO(symlink(FILEPATH, LINK), NULL); /* NB: the symlink target does not exist */
+
+       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
+
+       T_ASSERT_EQ(utimensat(dfd, BASELINK, tptr, 0), -1, NULL);
+       T_ASSERT_EQ(errno, ENOENT, NULL);
+
+       T_ASSERT_POSIX_ZERO(utimensat(dfd, BASELINK, tptr, AT_SYMLINK_NOFOLLOW), NULL);
+
+       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
+
+       T_ASSERT_POSIX_ZERO(lstat(LINK, &st), NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
+       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
+}
diff --git a/tests/network_entitlements.plist b/tests/network_entitlements.plist
new file mode 100644 (file)
index 0000000..83c92ca
--- /dev/null
@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.security.network.client</key>
+       <true/>
+       <key>com.apple.security.network.server</key>
+       <true/>
+       <key>com.apple.private.skywalk.register-kernel-pipe</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/no32exec_35914211.c b/tests/no32exec_35914211.c
new file mode 100644 (file)
index 0000000..ea36703
--- /dev/null
@@ -0,0 +1,23 @@
+#include <spawn.h>
+#include <sys/wait.h>
+#include <darwintest.h>
+#include <mach-o/dyld.h>
+#include <errno.h>
+
+T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BOOTARGS_SET("-no32exec"))
+{
+       int spawn_ret, pid;
+       char path[1024];
+       uint32_t size = sizeof(path);
+
+       T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
+       T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL);
+
+       spawn_ret = posix_spawn(&pid, path, NULL, NULL, NULL, NULL);
+       if (spawn_ret == 0) {
+               int wait_ret = 0;
+               waitpid(pid, &wait_ret, 0);
+               T_ASSERT_FALSE(WIFEXITED(wait_ret), "i386 helper should not run");
+       }
+       T_ASSERT_EQ(spawn_ret, EBADARCH, NULL);
+}
diff --git a/tests/no32exec_35914211_helper.c b/tests/no32exec_35914211_helper.c
new file mode 100644 (file)
index 0000000..99fb6be
--- /dev/null
@@ -0,0 +1,6 @@
+#include <darwintest.h>
+
+T_DECL(null_test, "nothing to see here")
+{
+       T_SKIP("nothing to see here");
+}
diff --git a/tests/ntp_adjtime_29192647.c b/tests/ntp_adjtime_29192647.c
new file mode 100644 (file)
index 0000000..2866385
--- /dev/null
@@ -0,0 +1,371 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <mach/clock_types.h>
+#include <sys/timex.h>
+#include <mach/mach.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+
+#define DAY 86400 /*1 day in sec*/
+#define ERROR 2 /*2 us of error tolerance*/
+
+T_DECL(settimeofday_29192647,
+       "Verify that the syscall settimeofday is effective",
+       T_META_ASROOT(true), T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       struct timeval time;
+       long new_time;
+
+       if (geteuid() != 0) {
+               T_SKIP("settimeofday_29192647 test requires root privileges to run.");
+       }
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* increment the time of one day */
+       new_time = time.tv_sec + DAY;
+
+       time.tv_sec = new_time;
+       time.tv_usec = 0;
+
+       T_LOG("Attempting to set the time one day ahead.");
+
+       T_WITH_ERRNO;
+       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* expect to be past new_time */
+       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed");
+
+       /* set the time back to previous value */
+       if (time.tv_sec >= new_time) {
+               time.tv_sec = time.tv_sec - DAY;
+               time.tv_usec = 0;
+
+               T_WITH_ERRNO;
+               T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+       }
+}
+
+static void get_abs_to_us_scale_factor(uint64_t* numer, uint64_t* denom){
+       struct timespec time;
+       uint64_t old_abstime, new_abstime;
+       uint64_t old_time_usec, new_time_usec;
+       uint64_t time_conv1, diff;
+       mach_timebase_info_data_t timebaseInfo = { 0, 0 };
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&old_abstime, NULL, &time), KERN_SUCCESS, NULL);
+
+       old_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       sleep(1);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&new_abstime, NULL, &time), KERN_SUCCESS, NULL);
+
+       new_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       /* mach_timebase_info() gives the conversion factors from abs time to nanoseconds */
+       T_ASSERT_EQ(mach_timebase_info(&timebaseInfo), KERN_SUCCESS, NULL);
+
+       new_time_usec -= old_time_usec;
+       new_abstime -= old_abstime;
+
+       time_conv1 = new_abstime;
+       time_conv1 *= timebaseInfo.numer;
+       time_conv1 /= timebaseInfo.denom * 1000;
+
+       if (time_conv1 > new_time_usec)
+               diff = time_conv1 - new_time_usec;
+       else
+               diff = new_time_usec - time_conv1;
+
+       T_EXPECT_LE_ULLONG(diff, (unsigned long long)ERROR, "Check scale factor time base (%u/%u) delta read usec %llu delta converted %llu delta abs %llu", timebaseInfo.numer, timebaseInfo.denom, time_conv1, new_time_usec, new_abstime);
+
+       *numer = (uint64_t)timebaseInfo.numer;
+       *denom = (uint64_t)timebaseInfo.denom * 1000;
+}
+
+
+#define ADJSTMENT 3333 /*3333 us*/
+#define ADJTIME_OFFSET_PER_SEC 500
+
+T_DECL(adjtime_29192647,
+       "Verify that the syscall adjtime is effective",
+       T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT), T_META_ASROOT(true))
+{
+       struct timespec time;
+       struct timeval adj;
+       uint64_t old_abstime, new_abstime, abs_delta;
+       uint64_t old_time_usec, new_time_usec, us_delta, num, den;
+       unsigned int sleep_time;
+       long diff;
+       const char * lterdos_env = NULL;
+
+#if defined(__i386__) || defined(__x86_64__)
+       T_SKIP("adjtime_29192647 test requires LTE to run.");
+#endif
+
+       if (geteuid() != 0) {
+               T_SKIP("adjtime_29192647 test requires root privileges to run.");
+       }
+
+       lterdos_env = getenv("LTERDOS");
+
+       if (lterdos_env != NULL){
+               if (!(strcmp(lterdos_env, "YES") == 0)) {
+                    T_SKIP("adjtime_29192647 test requires LTE to run.");
+               }
+       }
+       else {
+               T_SKIP("adjtime_29192647 test requires LTE to run.");
+       }
+
+       /*
+        * Calibrate scale factor for converting from abs time to usec
+        */
+       get_abs_to_us_scale_factor(&num, &den);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&old_abstime, NULL, &time), KERN_SUCCESS, NULL);
+
+       old_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       adj.tv_sec = 0;
+       adj.tv_usec = ADJSTMENT;
+
+       T_LOG("Attempting to adjust the time by %d us", ADJSTMENT);
+
+       /*
+        * If the adjustment is more than one second, the system slews at a
+        * rate of 5ms/s, otherwise at 500us/s, until the final < 500 usecs
+        * of the last second have been slewed.
+        */
+       T_WITH_ERRNO;
+       T_ASSERT_POSIX_ZERO(adjtime(&adj, NULL),NULL);
+
+       /*
+        * Wait until the full adjustment has been applied.
+        * Note: add 2 more secs to account for division error and to let
+        * the last block of the adjustment fully elapse.
+        */
+       sleep_time = (ADJSTMENT)/(ADJTIME_OFFSET_PER_SEC)+2;
+
+       T_LOG("Waiting for %u sec\n", sleep_time);
+       sleep(sleep_time);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&new_abstime, NULL, &time), KERN_SUCCESS, NULL);
+
+       new_time_usec =  (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       us_delta = new_time_usec - old_time_usec;
+       us_delta -= ADJSTMENT;
+
+       /* abs time is not affected by adjtime */
+       abs_delta = new_abstime - old_abstime;
+
+       abs_delta *= num;
+       abs_delta /= den;
+
+       diff = (long) us_delta - (long) abs_delta;
+
+       /* expect that us_delta == abs_delta */
+       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
+
+       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
+
+}
+
+#define FREQ_PPM 222 /*222 PPM(us/s)*/
+#define SHIFT_PLL 4
+#define OFFSET_US 123 /*123us*/
+
+T_DECL(ntp_adjtime_29192647,
+       "Verify that the syscall ntp_adjtime is effective",
+       T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT), T_META_ASROOT(true))
+{
+       struct timespec time;
+       struct timex ntptime;
+       uint64_t abstime1, abstime2, abs_delta, num, den, time_delta;
+       uint64_t time1_usec, time2_usec, time_conv, us_delta, app;
+       int64_t offset;
+       long diff, freq;
+       unsigned int sleep_time;
+       const char * lterdos_env = NULL;
+
+#if defined(__i386__) || defined(__x86_64__)
+       T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
+#endif
+
+       if (geteuid() != 0) {
+               T_SKIP("ntp_adjtime_29192647 test requires root privileges to run.");
+       }
+
+       lterdos_env = getenv("LTERDOS");
+
+       if (lterdos_env != NULL){
+               if (!(strcmp(lterdos_env, "YES") == 0)) {
+                       T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
+               }
+       }
+       else {
+               T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
+       }
+
+       /*
+        * Calibrate scale factor for converting from abs time to usec
+        */
+       get_abs_to_us_scale_factor(&num, &den);
+
+       /*
+        * scale frequency using ntp_adjtime;
+        */
+       memset(&ntptime, 0, sizeof(ntptime));
+
+       ntptime.modes = MOD_STATUS;
+       ntptime.status = TIME_OK;
+       /* ntp input freq is in ppm (us/s) * 2^16, max freq is 500 ppm */
+       freq = (FREQ_PPM) * 65536;
+       ntptime.modes |= MOD_FREQUENCY;
+       ntptime.freq = freq;
+
+       T_LOG("Attempting to change the calendar frequency by %d ppm", FREQ_PPM);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntptime.freq, freq, NULL);
+
+       sleep(2);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime1, NULL, &time), KERN_SUCCESS, NULL);
+
+       time1_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       sleep(1);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime2, NULL, &time), KERN_SUCCESS, NULL);
+
+       time2_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       abs_delta = abstime2 - abstime1;
+       us_delta = time2_usec - time1_usec;
+
+       time_conv = abs_delta;
+       time_conv *= num;
+       time_conv /= den;
+
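+       /* Expected calendar delta: the abs-time delta plus FREQ_PPM extra microseconds for every elapsed second. */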
+       app = time_conv/USEC_PER_SEC; //sec elapsed
+
+       time_delta = time_conv;
+       time_delta += app * (FREQ_PPM);
+
+       app = time_conv%USEC_PER_SEC;
+
+       time_delta += (app*(FREQ_PPM))/USEC_PER_SEC;
+
+       diff = (long) us_delta - (long) time_delta;
+
+       /* expect that us_delta == time_delta */
+       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
+
+       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
+
+       memset(&ntptime, 0, sizeof(ntptime));
+
+       /* reset freq to zero */
+       freq = 0;
+       ntptime.modes = MOD_STATUS;
+       ntptime.status = TIME_OK;
+       ntptime.modes |= MOD_FREQUENCY;
+       ntptime.freq = freq;
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntptime.freq, freq, NULL);
+
+       sleep(1);
+
+       /*
+        * adjust the phase using ntp_adjtime;
+        */
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes |= MOD_STATUS;
+       ntptime.status = TIME_OK;
+       ntptime.status |= STA_PLL|STA_FREQHOLD;
+
+       /* ntp input phase can be either ns or us (MOD_MICRO); max offset is 500 ms */
+       ntptime.offset = OFFSET_US;
+       ntptime.modes |= MOD_OFFSET|MOD_MICRO;
+
+       /*
+        * Each second the system slews by:
+        * slew = ntp.offset >> (SHIFT_PLL + time_constant);
+        * ntp.offset -= slew;
+        */
+       offset= (OFFSET_US) * 1000;
+       sleep_time = 2;
+
+       while((offset>>SHIFT_PLL)>0){
+               offset -= offset >> SHIFT_PLL;
+               sleep_time++;
+       }
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime1, NULL, &time), KERN_SUCCESS, NULL);
+
+       time1_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       T_LOG("Attempting to change the calendar phase by %d us", OFFSET_US);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntptime.offset, (long) OFFSET_US, NULL);
+
+       T_LOG("Waiting for %u sec\n", sleep_time);
+       sleep(sleep_time);
+
+       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime2, NULL, &time), KERN_SUCCESS, NULL);
+
+       time2_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
+
+       abs_delta = abstime2 - abstime1;
+       us_delta = time2_usec - time1_usec;
+
+       abs_delta *= num;
+       abs_delta /= den;
+
+       us_delta -= OFFSET_US;
+
+       diff = (long) us_delta - (long) abs_delta;
+
+       /* expect that us_delta == abs_delta */
+       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
+
+       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
+
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes = MOD_STATUS;
+       ntptime.status = TIME_OK;
+       ntptime.modes |= MOD_FREQUENCY;
+       ntptime.freq = 0;
+
+       ntptime.status |= STA_PLL;
+       ntptime.offset = 0;
+       ntptime.modes |= MOD_OFFSET;
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+
+}
+
+
diff --git a/tests/perf_compressor.c b/tests/perf_compressor.c
new file mode 100644 (file)
index 0000000..1a8a57f
--- /dev/null
@@ -0,0 +1,334 @@
+#include <stdio.h>
+#include <signal.h>
+#include <sys/sysctl.h>
+#include <mach-o/dyld.h>
+#include <perfcheck_keys.h>
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm.perf"),
+       T_META_CHECK_LEAKS(false),
+       T_META_TAG_PERF
+);
+
+enum {
+       ALL_ZEROS,
+       MOSTLY_ZEROS,
+       RANDOM,
+       TYPICAL
+};
+
+#define CREATE_LIST(X) \
+       X(SUCCESS) \
+       X(TOO_FEW_ARGUMENTS) \
+       X(SYSCTL_VM_PAGESIZE_FAILED) \
+       X(VM_PAGESIZE_IS_ZERO) \
+       X(UNKNOWN_PAGE_TYPE) \
+       X(DISPATCH_SOURCE_CREATE_FAILED) \
+       X(INITIAL_SIGNAL_TO_PARENT_FAILED) \
+       X(SIGNAL_TO_PARENT_FAILED) \
+       X(EXIT_CODE_MAX)
+
+#define EXIT_CODES_ENUM(VAR) VAR,
+enum exit_codes_num {
+       CREATE_LIST(EXIT_CODES_ENUM)
+};
+
+#define EXIT_CODES_STRING(VAR) #VAR,
+static const char *exit_codes_str[] = {
+       CREATE_LIST(EXIT_CODES_STRING)
+};
+
+
+static pid_t pid = -1;
+static dt_stat_t r;
+static dt_stat_time_t s;
+
+void allocate_zero_pages(char **buf, int num_pages, int vmpgsize);
+void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize);
+void allocate_random_pages(char **buf, int num_pages, int vmpgsize);
+void allocate_representative_pages(char **buf, int num_pages, int vmpgsize);
+void run_compressor_test(int size_mb, int page_type);
+void freeze_helper_process(void);
+
+void allocate_zero_pages(char **buf, int num_pages, int vmpgsize) {
+       int i;
+
+       for (i = 0; i < num_pages; i++) {
+               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
+               memset(buf[i], 0, vmpgsize);
+       }
+}
+
+void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize) {
+       int i, j;
+
+       for (i = 0; i < num_pages; i++) {
+               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
+               memset(buf[i], 0, vmpgsize);
+               for (j = 0; j < 40; j++) {
+                       buf[i][j] = (char)(j+1);
+               }
+       }
+}
+
+void allocate_random_pages(char **buf, int num_pages, int vmpgsize) {
+       int i;
+
+       for (i = 0; i < num_pages; i++) {
+               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
+               arc4random_buf((void*)buf[i], (size_t)vmpgsize);
+       }
+}
+
+// Gives us the compression ratio we see in the typical case (~2.7)
+void allocate_representative_pages(char **buf, int num_pages, int vmpgsize) {
+       int i, j;
+       char val;
+
+       for (j = 0; j < num_pages; j++) {
+               buf[j] = (char*)malloc((size_t)vmpgsize * sizeof(char));
+               val = 0;
+               for (i = 0; i < vmpgsize; i += 16) {
+                       memset(&buf[j][i], val, 16);
+                       if (i < 3400 * (vmpgsize / 4096)) {
+                               val++;
+                       }
+               }
+       }
+}
+
+void freeze_helper_process(void) {
+       int ret;
+       int64_t compressed_before, compressed_after, input_before, input_after;
+       size_t length;
+
+       length = sizeof(compressed_before);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_compressed_bytes", &compressed_before, &length, NULL, 0),
+                       "failed to query vm.compressor_compressed_bytes");
+       length = sizeof(input_before);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_before, &length, NULL, 0),
+                       "failed to query vm.compressor_input_bytes");
+
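+       /* Time only the freeze sysctl itself; the compression ratio is derived from the before/after compressor counters. */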
+       T_STAT_MEASURE(s) {
+               ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
+       };
+
+       length = sizeof(compressed_after);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_compressed_bytes", &compressed_after, &length, NULL, 0),
+                       "failed to query vm.compressor_compressed_bytes");
+       length = sizeof(input_after);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_after, &length, NULL, 0),
+                       "failed to query vm.compressor_input_bytes");
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed");
+
+       dt_stat_add(r, (double)(input_after - input_before)/(double)(compressed_after - compressed_before));
+
+       ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid));
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed");
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process");
+}
+
+void run_compressor_test(int size_mb, int page_type) {
+       int ret;
+       char sz_str[50];
+       char pt_str[50];
+       char **launch_tool_args;
+       char testpath[PATH_MAX];
+       uint32_t testpath_buf_size;
+       dispatch_source_t ds_freeze, ds_proc;
+
+#ifndef CONFIG_FREEZE
+       T_SKIP("Task freeze not supported.");
+#endif
+
+       r = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio");
+       s = dt_stat_time_create("compressor_latency");
+       // This sets the A/B failure threshold at 50% of baseline for compressor_latency
+       dt_stat_set_variable(s, kPCFailureThresholdPctVar, 50.0);
+
+       signal(SIGUSR1, SIG_IGN);
+       ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)");
+
+       dispatch_source_set_event_handler(ds_freeze, ^{
+               if (!dt_stat_stable(s)) {
+                       freeze_helper_process();
+               } else {
+                       dt_stat_finalize(s);
+                       dt_stat_finalize(r);
+
+                       kill(pid, SIGKILL);
+                       dispatch_source_cancel(ds_freeze);
+               }
+       });
+       dispatch_activate(ds_freeze);
+
+       testpath_buf_size = sizeof(testpath);
+       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
+       T_LOG("Executable path: %s", testpath);
+
+       sprintf(sz_str, "%d", size_mb);
+       sprintf(pt_str, "%d", page_type);
+       launch_tool_args = (char *[]){
+               testpath,
+               "-n",
+               "allocate_pages",
+               "--",
+               sz_str,
+               pt_str,
+               NULL
+       };
+
+       /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */
+       ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL);
+       if (ret != 0) {
+               T_LOG("dt_launch tool returned %d with error code %d", ret, errno);
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
+
+       ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
+       T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)");
+
+       dispatch_source_set_event_handler(ds_proc, ^{
+               int status = 0, code = 0;
+               pid_t rc = waitpid(pid, &status, 0);
+               T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid");
+               code = WEXITSTATUS(status);
+
+               if (code == 0) {
+                       T_END;
+               } else if (code > 0 && code < EXIT_CODE_MAX) {
+                       T_ASSERT_FAIL("Child exited with %s", exit_codes_str[code]);
+               } else {
+                       T_ASSERT_FAIL("Child exited with unknown exit code %d", code);
+               }
+       });
+       dispatch_activate(ds_proc);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process");
+       dispatch_main();
+}
+
+T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
+       int i, j, ret, size_mb, page_type, vmpgsize;
+       size_t vmpgsize_length;
+       __block int num_pages;
+       __block char **buf;
+       dispatch_source_t ds_signal;
+
+       vmpgsize_length = sizeof(vmpgsize);
+       ret = sysctlbyname("vm.pagesize", &vmpgsize, &vmpgsize_length, NULL, 0);
+       if (ret != 0) {
+               exit(SYSCTL_VM_PAGESIZE_FAILED);
+       }
+       if (vmpgsize == 0) {
+               exit(VM_PAGESIZE_IS_ZERO);
+       }
+
+       if (argc < 2) {
+               exit(TOO_FEW_ARGUMENTS);
+       }
+
+       size_mb = atoi(argv[0]);
+       page_type = atoi(argv[1]);
+       num_pages = size_mb * 1024 * 1024 / vmpgsize;
+       buf = (char**)malloc(sizeof(char*) * (size_t)num_pages);
+
+       // Switch on the type of page requested
+       switch(page_type) {
+               case ALL_ZEROS:
+                       allocate_zero_pages(buf, num_pages, vmpgsize);
+                       break;
+               case MOSTLY_ZEROS:
+                       allocate_mostly_zero_pages(buf, num_pages, vmpgsize);
+                       break;
+               case RANDOM:
+                       allocate_random_pages(buf, num_pages, vmpgsize);
+                       break;
+               case TYPICAL:
+                       allocate_representative_pages(buf, num_pages, vmpgsize);
+                       break;
+               default:
+                       exit(UNKNOWN_PAGE_TYPE);
+       }
+
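+       /* Touch the first byte of every page so they are resident before the parent starts freezing us. */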
+       for (j = 0; j < num_pages; j++) {
+               i = buf[j][0];
+       }
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{
+               /* Signal to the parent that we're done allocating and it's ok to freeze us */
+               printf("Sending initial signal to parent to begin freezing\n");
+               if (kill(getppid(), SIGUSR1) != 0) {
+                       exit(INITIAL_SIGNAL_TO_PARENT_FAILED);
+               }
+       });
+
+       signal(SIGUSR1, SIG_IGN);
+       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
+       if (ds_signal == NULL) {
+               exit(DISPATCH_SOURCE_CREATE_FAILED);
+       }
+
+       dispatch_source_set_event_handler(ds_signal, ^{
+               volatile int tmp;
+
+               /* Make sure all the pages are accessed before trying to freeze again */
+               for (int x = 0; x < num_pages; x++) {
+                       tmp = buf[x][0];
+               }
+               if (kill(getppid(), SIGUSR1) != 0) {
+                       exit(SIGNAL_TO_PARENT_FAILED);
+               }
+       });
+       dispatch_activate(ds_signal);
+
+       dispatch_main();
+}
+
+// Numbers for 10MB and above are fairly reproducible. Anything smaller shows a lot of variation.
+
+// Keeping just the 100MB version for iOSMark
+#ifndef DT_IOSMARK
+T_DECL(compr_10MB_zero, "Compressor latencies") {
+       run_compressor_test(10, ALL_ZEROS);
+}
+
+T_DECL(compr_10MB_mostly_zero, "Compressor latencies") {
+       run_compressor_test(10, MOSTLY_ZEROS);
+}
+
+T_DECL(compr_10MB_random, "Compressor latencies") {
+       run_compressor_test(10, RANDOM);
+}
+
+T_DECL(compr_10MB_typical, "Compressor latencies") {
+       run_compressor_test(10, TYPICAL);
+}
+
+T_DECL(compr_100MB_zero, "Compressor latencies") {
+       run_compressor_test(100, ALL_ZEROS);
+}
+
+T_DECL(compr_100MB_mostly_zero, "Compressor latencies") {
+       run_compressor_test(100, MOSTLY_ZEROS);
+}
+
+T_DECL(compr_100MB_random, "Compressor latencies") {
+       run_compressor_test(100, RANDOM);
+}
+#endif
+
+T_DECL(compr_100MB_typical, "Compressor latencies") {
+       run_compressor_test(100, TYPICAL);
+}
+
diff --git a/tests/perf_exit.c b/tests/perf_exit.c
new file mode 100644 (file)
index 0000000..1dba37c
--- /dev/null
@@ -0,0 +1,190 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include <sys/kdebug.h>
+#include <ktrace/session.h>
+#include <spawn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdatomic.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.perf"),
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_SINGLEUSER),
+       T_META_TAG_PERF
+);
+#if TARGET_OS_WATCH
+#define TEST_TIMEOUT 3600 * (NSEC_PER_SEC)
+#else
+#define TEST_TIMEOUT 1800 * (NSEC_PER_SEC)
+#endif
+// From bsd/sys/proc_internal.h
+#define PID_MAX 99999
+
+#define EXIT_BINARY "perf_exit_proc"
+#define EXIT_BINARY_PATH "./" EXIT_BINARY
+
+#define NEXT_CASE_EVENTID (0xfedcbb00)
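+/* Custom kdebug event posted to move the trace consumer on to the next test case. */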
+
+struct test_case {
+       int wired_mem;
+       int threads;
+};
+
+static struct test_case test_cases[] = {
+       {0, 0},
+       {0, 10},
+       {1000000, 0},
+#if !TARGET_OS_WATCH
+       {10000000, 0}
+#endif
+};
+
+#define TEST_CASES_COUNT (sizeof(test_cases) / sizeof(struct test_case))
+
+static _Atomic int producer_i, consumer_i;
+
+static ktrace_session_t session;
+
+static dispatch_queue_t spawn_queue, processing_queue;
+
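+/* begin_ts is indexed by pid and records the timestamp of each child's exit(2) entry tracepoint. */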
+static uint64_t *begin_ts;
+static dt_stat_time_t s;
+static _Atomic bool tracing_on = false;
+
+void run_exit_test(int proc_wired_mem, int nthreads);
+
+static void cleanup(void) {
+       free(begin_ts);
+       dispatch_release(spawn_queue);
+       dispatch_release(processing_queue);
+       if (tracing_on) {
+               ktrace_end(session, 1);
+       }
+}
+
+static dt_stat_time_t
+create_stat(int proc_wired_mem, int nthreads)
+{
+       dt_stat_time_t dst = dt_stat_time_create("time");
+       T_ASSERT_NOTNULL(dst, "created time statistic");
+
+       dt_stat_set_variable((dt_stat_t)dst, "proc_threads", nthreads);
+       dt_stat_set_variable((dt_stat_t)dst, "proc_wired_mem", proc_wired_mem);
+
+       return dst;
+}
+
+T_DECL(exit, "exit(2) time from syscall start to end", T_META_TIMEOUT(TEST_TIMEOUT)) {
+       s = create_stat(test_cases[consumer_i].wired_mem, test_cases[consumer_i].threads);
+
+       begin_ts = malloc(sizeof(uint64_t) * PID_MAX);
+       T_ASSERT_NOTNULL(begin_ts, "created pid array");
+
+       T_ATEND(cleanup);
+
+       session = ktrace_session_create();
+       T_ASSERT_NOTNULL(session, "created a trace session");
+
+       spawn_queue = dispatch_queue_create("com.apple.perf_exit.spawn_queue", NULL);
+       processing_queue = dispatch_queue_create("com.apple.perf_exit.processing_queue", NULL);
+
+       ktrace_set_completion_handler(session, ^{
+               T_ASSERT_EQ(consumer_i, TEST_CASES_COUNT, "ran all the test cases");
+               dispatch_sync(spawn_queue, ^(void) {
+                       tracing_on = false;
+               });
+               ktrace_session_destroy(session);
+               T_END;
+       });
+
+       ktrace_set_signal_handler(session);
+       ktrace_set_execnames_enabled(session, KTRACE_FEATURE_ENABLED);
+
+       // We are only interested in the processes we launched and ourselves
+       ktrace_filter_process(session, EXIT_BINARY);
+       ktrace_filter_process(session, "perf_exit");
+
+       ktrace_events_single(session, NEXT_CASE_EVENTID, ^(__unused ktrace_event_t e) {
+               consumer_i++;
+               dt_stat_finalize(s);
+               if (consumer_i >= TEST_CASES_COUNT) {
+                       ktrace_end(session, 1);
+               } else {
+                       s = create_stat(test_cases[consumer_i].wired_mem, test_cases[consumer_i].threads);
+               }
+       });
+
+       ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_EXCP_SC, 1) | DBG_FUNC_START), ^(ktrace_event_t e) {
+               T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "pid %d is valid in start tracepoint", e->pid);
+               begin_ts[e->pid] = e->timestamp;
+       });
+
+       ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END), ^(ktrace_event_t e) {
+               T_ASSERT_LE(e->pid, PID_MAX, "pid %d is valid in end tracepoint", e->pid);
+
+               if (begin_ts[e->pid] == 0) {
+                       return;
+               }
+
+               T_QUIET; T_ASSERT_LE(begin_ts[e->pid], e->timestamp, "timestamps are monotonically increasing");
+               dt_stat_mach_time_add(s, e->timestamp - begin_ts[e->pid]);
+
+               if (dt_stat_stable(s) && producer_i == consumer_i) {
+                       dispatch_sync(spawn_queue, ^(void) {
+                               producer_i++;
+                               T_ASSERT_POSIX_ZERO(kdebug_trace(NEXT_CASE_EVENTID, producer_i, 0, 0, 0), "kdebug_trace returns 0");
+                       });
+               }
+       });
+
+       int ret = ktrace_start(session, processing_queue);
+       T_ASSERT_POSIX_ZERO(ret, "starting trace");
+       tracing_on = true;
+
+       // Spawn processes continuously until the test is over
+
+       __block void (^spawn_process)(void) = Block_copy(^(void) {
+               char nthreads_buf[32], mem_buf[32];
+
+               if (producer_i >= TEST_CASES_COUNT || !tracing_on) {
+                       return;
+               }
+
+               snprintf(nthreads_buf, 32, "%d", test_cases[producer_i].threads);
+               snprintf(mem_buf, 32, "%d", test_cases[producer_i].wired_mem);
+
+               char *args[] = {EXIT_BINARY_PATH, nthreads_buf, mem_buf, NULL};
+               int status;
+
+               pid_t pid;
+               int bret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
+               T_ASSERT_POSIX_ZERO(bret, "spawned process with pid %d (threads=%s mem=%s)", pid, nthreads_buf, mem_buf);
+
+               bret = waitpid(pid, &status, 0);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(bret, "waited for process %d", pid);
+
+               if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
+                       T_ASSERT_FAIL("child process failed to run");
+
+               // Avoid saturating the CPU with new processes
+               usleep(1000);
+
+               dispatch_async(spawn_queue, spawn_process);
+       });
+
+       dispatch_async(spawn_queue, spawn_process);
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT), dispatch_get_main_queue(), ^{
+               ktrace_end(session, 0);
+       });
+
+       dispatch_main();
+}
+
diff --git a/tests/perf_exit_proc.c b/tests/perf_exit_proc.c
new file mode 100644 (file)
index 0000000..b8bb88a
--- /dev/null
@@ -0,0 +1,86 @@
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include <mach/mach.h>
+#include <mach/mach_vm.h>
+
+static void* loop(__attribute__ ((unused)) void *arg) {
+       while (1) {
+
+       }
+}
+
+
+static int run_additional_threads(int nthreads) {
+       for (int i = 0; i < nthreads; i++) {
+               pthread_t pthread;
+               int err;
+               
+               err = pthread_create(&pthread, NULL, loop, NULL);
+               if (err) {
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
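+/*
+ * Allocate 'size' bytes, make them readable/writable, and wire them down via the
+ * host privileged port (which requires the root privileges the spawning test runs with).
+ */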
+static int allocate_and_wire_memory(mach_vm_size_t size) {
+       int err;
+       task_t task = mach_task_self();
+       mach_vm_address_t addr;
+
+       if (size <= 0)
+               return 0;
+
+       err = mach_vm_allocate(task, &addr, size, VM_FLAGS_ANYWHERE);
+       if (err != KERN_SUCCESS) {
+               printf("mach_vm_allocate returned non-zero: %s\n", mach_error_string(err));
+               return err;
+       }
+       err = mach_vm_protect(task, addr, size, 0, VM_PROT_READ | VM_PROT_WRITE);
+       if (err != KERN_SUCCESS) {
+               printf("mach_vm_protect returned non-zero: %s\n", mach_error_string(err));
+               return err;
+       }
+       host_t host_priv_port;
+       err = host_get_host_priv_port(mach_host_self(), &host_priv_port);
+       if (err != KERN_SUCCESS) {
+               printf("host_get_host_priv_port returned non-zero: %s\n", mach_error_string(err));
+               return err;
+       }
+       err = mach_vm_wire(host_priv_port, task, addr, size, VM_PROT_READ | VM_PROT_WRITE);
+       if (err != KERN_SUCCESS) {
+               printf("mach_vm_wire returned non-zero: %s\n", mach_error_string(err));
+               return err;
+       }
+
+       return 0;
+}
+
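+/*
+ * Helper binary spawned by perf_exit.c: argv[1] is the number of extra busy threads to
+ * create and argv[2] is the amount of memory to wire, after which the process exits so
+ * the parent can measure exit(2) latency.
+ */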
+int main(int argc, char *argv[]) {
+       int nthreads = 0;
+       int err;
+       mach_vm_size_t wired_mem = 0;
+
+       if (argc > 1) {
+               nthreads = (int)strtoul(argv[1], NULL, 10);
+       }
+       if (argc > 2) {
+               wired_mem = (mach_vm_size_t)strtoul(argv[2], NULL, 10);
+       }
+       
+       err = allocate_and_wire_memory(wired_mem);
+       if (err) {
+               return err;
+       }
+
+       err = run_additional_threads(nthreads);
+       if (err) {
+               return err;
+       }
+
+       return 0;
+}
diff --git a/tests/perf_kdebug.c b/tests/perf_kdebug.c
new file mode 100644 (file)
index 0000000..0b8240e
--- /dev/null
@@ -0,0 +1,168 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+
+#include <sys/kdebug.h>
+#include <sys/sysctl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.perf.kdebug"),
+       T_META_ASROOT(true),
+       T_META_CHECK_LEAKS(false),
+       T_META_TAG_PERF
+);
+
+//
+// Helper functions for direct control over the kernel trace facility.
+//
+
+static void _sysctl_reset() {
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE };
+       if (sysctl(mib, 3, NULL, NULL, NULL, 0)) {
+               T_FAIL("KERN_KDREMOVE sysctl failed");
+       }
+}
+
+static void _sysctl_setbuf(uint32_t capacity) {
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, (int)capacity };
+       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
+               T_FAIL("KERN_KDSETBUF sysctl failed");
+       }
+}
+
+static void _sysctl_setup() {
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETUP };
+       if (sysctl(mib, 3, NULL, NULL, NULL, 0)) {
+               T_FAIL("KERN_KDSETUP sysctl failed");
+       }
+}
+
+static void _sysctl_enable(int value)
+{
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, value };
+       if (sysctl(mib, 4, NULL, NULL, NULL, 0) < 0) {
+               T_FAIL("KERN_KDENABLE sysctl failed");
+       }
+}
+
+static void _sysctl_enable_typefilter(uint8_t* type_filter_bitmap) {
+       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSET_TYPEFILTER };
+       size_t needed = KDBG_TYPEFILTER_BITMAP_SIZE;
+       if (sysctl(mib, 3, type_filter_bitmap, &needed, NULL, 0)) {
+               T_FAIL("KERN_KDSET_TYPEFILTER sysctl failed");
+       }
+}
+
+static void _sysctl_nowrap(bool is_nowrap) {
+       int mib[] = { CTL_KERN, KERN_KDEBUG, is_nowrap ? KERN_KDEFLAGS : KERN_KDDFLAGS, KDBG_NOWRAP };
+       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
+               T_FAIL("KDBG_NOWRAP sysctl failed");
+       }
+}
+
+static void enable_tracing(bool value) {
+       _sysctl_enable(value ? KDEBUG_ENABLE_TRACE : 0);
+}
+
+static void enable_typefilter_all_reject() {
+       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
+       memset(type_filter_bitmap, 0, sizeof(type_filter_bitmap));
+       _sysctl_enable_typefilter(type_filter_bitmap);
+}
+
+static void enable_typefilter_all_pass() {
+       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
+       memset(type_filter_bitmap, 0xff, sizeof(type_filter_bitmap));
+       _sysctl_enable_typefilter(type_filter_bitmap);
+}
+
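+// Each timed batch covers 1000 kdebug_trace() calls: 100 loop iterations with 10
+// unrolled calls each, recorded via dt_stat_time_end_batch().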
+static void loop_kdebug_trace(dt_stat_time_t s) {
+       do {
+               dt_stat_token start = dt_stat_time_begin(s);
+               for (uint32_t i = 0; i<100; i++) {
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
+               }
+               dt_stat_time_end_batch(s, 1000, start);
+       } while (!dt_stat_stable(s));
+}
+
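+// getppid() serves as a cheap syscall to provide a baseline latency for comparison.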
+static void loop_getppid(dt_stat_time_t s) {
+       do {
+               dt_stat_token start = dt_stat_time_begin(s);
+               for (uint32_t i = 0; i<100; i++) {
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+                       getppid();
+               }
+               dt_stat_time_end_batch(s, 1000, start);
+       } while (!dt_stat_stable(s));
+}
+
+static void reset_kdebug_trace(void) {
+       _sysctl_reset();
+}
+
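+// Common harness: reset and reconfigure the trace buffers, run the per-test setup block,
+// then drive the supplied measurement loop until its statistic is stable.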
+static void test(const char* test_name, void (^pretest_setup)(void), void (*test)(dt_stat_time_t s)) {
+       T_ATEND(reset_kdebug_trace);
+       _sysctl_reset();
+       _sysctl_setbuf(1000000);
+       _sysctl_nowrap(false);
+       _sysctl_setup();
+
+       pretest_setup();
+
+       dt_stat_time_t s = dt_stat_time_create("%s", test_name);
+
+       test(s);
+
+       dt_stat_finalize(s);
+}
+
+//
+// Begin tests...
+//
+
+T_DECL(kdebug_trace_baseline_syscall,
+       "Test the latency of a syscall while kernel tracing is disabled") {
+       test("kdebug_trace_baseline_syscall", ^{ enable_tracing(false); }, loop_getppid);
+}
+
+T_DECL(kdebug_trace_kdbg_disabled,
+       "Test the latency of kdebug_trace while kernel tracing is disabled") {
+       test("kdebug_trace_kdbg_disabled", ^{ enable_tracing(false); }, loop_kdebug_trace);
+}
+
+T_DECL(kdebug_trace_kdbg_enabled,
+       "Test the latency of kdebug_trace while kernel tracing is enabled with no typefilter") {
+       test("kdebug_trace_kdbg_enabled", ^{ enable_tracing(true); }, loop_kdebug_trace);
+}
+
+T_DECL(kdebug_trace_kdbg_enabled_typefilter_pass,
+       "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that passes the event") {
+       test("kdebug_trace_kdbg_enabled_typefilter_pass", ^{ enable_tracing(true); enable_typefilter_all_pass(); }, loop_kdebug_trace);
+}
+
+T_DECL(kdebug_trace_kdbg_enabled_typefilter_reject,
+       "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that rejects the event") {
+       test("kdebug_trace_kdbg_enabled_typefilter_reject", ^{ enable_tracing(true); enable_typefilter_all_reject(); }, loop_kdebug_trace);
+}
diff --git a/tests/perf_spawn_fork.c b/tests/perf_spawn_fork.c
new file mode 100644 (file)
index 0000000..fad33b2
--- /dev/null
@@ -0,0 +1,76 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+
+#include <spawn.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.perf"),
+       T_META_CHECK_LEAKS(false),
+       T_META_TAG_PERF
+);
+
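+/*
+ * Spawns /usr/bin/true repeatedly until the statistic stabilizes. Only the posix_spawn()
+ * call itself is inside the measured region; waitpid() and the exit-status check happen
+ * outside it.
+ */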
+#define SPAWN_MEASURE_LOOP(s) \
+       char *args[] = {"/usr/bin/true", NULL}; \
+       int err; \
+       pid_t pid; \
+       int status; \
+       while (!dt_stat_stable(s)) { \
+               T_STAT_MEASURE(s) { \
+                       err = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); \
+               } \
+               if (err) { \
+                       T_FAIL("posix_spawn returned %d", err); \
+               } \
+               waitpid(pid, &status, 0); \
+               if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \
+                       T_FAIL("Child process of posix_spawn failed to run"); \
+               } \
+       }
+
+T_DECL(posix_spawn_platform_binary_latency, "posix_spawn platform binary latency") {
+       {
+               dt_stat_time_t s = dt_stat_time_create("time");
+               SPAWN_MEASURE_LOOP(s);
+               dt_stat_finalize(s);
+       }
+
+       {
+               dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on_cpu_time");
+               SPAWN_MEASURE_LOOP(s);
+               dt_stat_finalize(s);
+       }
+}
+
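+/*
+ * Forks repeatedly until the statistic stabilizes. The child exits immediately, so the
+ * measured region covers the parent's fork() call; waitpid() runs outside it.
+ */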
+#define FORK_MEASURE_LOOP(s) \
+       pid_t pid; \
+       int status; \
+       while (!dt_stat_stable(s)) { \
+               T_STAT_MEASURE(s) { \
+                       pid = fork(); \
+                       if (pid == 0) \
+                               exit(0); \
+                       else if (pid == -1) \
+                               T_FAIL("fork returned -1"); \
+               } \
+               waitpid(pid, &status, 0); \
+               if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \
+                       T_FAIL("forked process failed to exit properly"); \
+               } \
+       }
+
+T_DECL(fork, "fork latency") {
+       {
+               dt_stat_time_t s = dt_stat_time_create("time");
+               FORK_MEASURE_LOOP(s);
+               dt_stat_finalize(s);
+       }
+       {
+               dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on_cpu_time");
+               FORK_MEASURE_LOOP(s);
+               dt_stat_finalize(s);
+       }
+}
diff --git a/tests/perf_vmfault.c b/tests/perf_vmfault.c
new file mode 100644 (file)
index 0000000..e3e81f1
--- /dev/null
@@ -0,0 +1,243 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/sysctl.h>
+#include <mach/mach.h>
+#include <mach/vm_map.h>
+#include <darwintest.h>
+#include <TargetConditionals.h>
+#include <perfcheck_keys.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm.perf"),
+       T_META_CHECK_LEAKS(false),
+       T_META_TAG_PERF
+);
+
+#ifdef DT_IOSMARK
+#define MEMSIZE                        (1UL<<29)       /* 512 MB */
+#else
+#define MEMSIZE                        (1UL<<27)       /* 128 MB */
+#endif
+
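+/*
+ * SOFT_FAULT: the region is pre-faulted and then vm_remap()ed, so each access takes a
+ * read soft fault on an already-resident page.
+ * ZERO_FILL: pages are touched for the first time, so each access takes a zero-fill fault.
+ */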
+enum {
+       SOFT_FAULT,
+       ZERO_FILL,
+       NUM_TESTS
+};
+
+static int test_type;
+static int num_threads;
+static int ready_thread_count;
+static size_t pgsize;
+static size_t num_pages;
+static char *memblock;
+static char *memblock_share;
+static dt_stat_time_t t;
+static pthread_cond_t start_cvar;
+static pthread_cond_t threads_ready_cvar;
+static pthread_mutex_t ready_thread_count_lock;
+
+static void map_mem_regions(void);
+static void unmap_mem_regions(void);
+static void fault_pages(int thread_id);
+static void execute_threads(void);
+static void *thread_setup(void *arg);
+static void run_test(int test, int threads, int cpus);
+static int get_ncpu(void);
+
+static void map_mem_regions(void)
+{
+       char *ptr;
+       volatile char val;
+       vm_prot_t curprot, maxprot;
+
+       memblock = (char *)mmap(NULL, MEMSIZE, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+       T_QUIET; T_ASSERT_NE((void *)memblock, MAP_FAILED, "mmap");
+
+       if (test_type == SOFT_FAULT) {
+               /* Fault in all the pages of the original region. */
+               for (ptr = memblock; ptr < memblock + MEMSIZE; ptr += pgsize) {
+                       val = *ptr;
+               }
+               /* Remap the region so that subsequent accesses result in read soft faults. */
+               T_QUIET; T_ASSERT_MACH_SUCCESS(vm_remap(mach_task_self(), (vm_address_t *)&memblock_share,
+                                       MEMSIZE, 0, VM_FLAGS_ANYWHERE, mach_task_self(), (vm_address_t)memblock, FALSE,
+                                       &curprot, &maxprot, VM_INHERIT_DEFAULT), "vm_remap");
+       }
+}
+
+static void unmap_mem_regions(void)
+{
+       if (test_type == SOFT_FAULT) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(munmap(memblock_share, MEMSIZE), "munmap");
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(munmap(memblock, MEMSIZE), "munmap");
+}
+
+static void fault_pages(int thread_id)
+{
+       size_t region_len, region_start, region_end;
+       char *ptr, *block;
+       volatile char val;
+
+       region_len = num_pages / (size_t)num_threads;
+       region_start = region_len * (size_t)thread_id;
+
+       if ((size_t)thread_id < num_pages % (size_t)num_threads) {
+               region_start += (size_t)thread_id;
+               region_len++;
+       } else {
+               region_start += num_pages % (size_t)num_threads;
+       }
+
+       region_start *= pgsize;
+       region_len *= pgsize;
+       region_end = region_start + region_len;
+
+       block = (test_type == SOFT_FAULT) ? memblock_share : memblock;
+       for (ptr = block + region_start; ptr < block + region_end; ptr += pgsize) {
+               val = *ptr;
+       }
+}
+
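+/*
+ * Spawn the faulting threads, wait for all of them to reach the start barrier, then
+ * measure from the start broadcast until every thread has been joined.
+ */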
+static void execute_threads(void)
+{
+       int thread_index, thread_retval;
+       int *thread_indices;
+       void *thread_retval_ptr = &thread_retval;
+       pthread_t *threads;
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&threads_ready_cvar, NULL), "pthread_cond_init");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_init(&start_cvar, NULL), "pthread_cond_init");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_init(&ready_thread_count_lock, NULL), "pthread_mutex_init");
+       ready_thread_count = 0;
+
+       threads = (pthread_t *)malloc(sizeof(*threads) * (size_t)num_threads);
+       thread_indices = (int *)malloc(sizeof(*thread_indices) * (size_t)num_threads);
+       for(thread_index = 0; thread_index < num_threads; thread_index++) {
+               thread_indices[thread_index] = thread_index;
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&threads[thread_index], NULL,
+                                       thread_setup, (void *)&thread_indices[thread_index]), "pthread_create");
+       }
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock");
+       if(ready_thread_count != num_threads) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&threads_ready_cvar, &ready_thread_count_lock),
+                               "pthread_cond_wait");
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock");
+
+       T_STAT_MEASURE(t) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_broadcast(&start_cvar), "pthread_cond_broadcast");
+               for(thread_index = 0; thread_index < num_threads; thread_index++) {
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_join(threads[thread_index], &thread_retval_ptr),
+                                       "pthread_join");
+               }
+       };
+
+       free(threads);
+       free(thread_indices);
+}
+
+static void *thread_setup(void *arg)
+{
+       int my_index = *((int *)arg);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_lock(&ready_thread_count_lock), "pthread_mutex_lock");
+       ready_thread_count++;
+       if (ready_thread_count == num_threads) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_signal(&threads_ready_cvar), "pthread_cond_signal");
+       }
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_cond_wait(&start_cvar, &ready_thread_count_lock), "pthread_cond_wait");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_mutex_unlock(&ready_thread_count_lock), "pthread_mutex_unlock");
+
+       fault_pages(my_index);
+       return NULL;
+}
+
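+/*
+ * Set up the test globals, then repeatedly map, fault, and unmap the region until the
+ * Runtime statistic is stable.
+ */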
+static void run_test(int test, int threads, int cpus)
+{
+       size_t sysctl_size = sizeof(pgsize);
+       int ret = sysctlbyname("vm.pagesize", &pgsize, &sysctl_size, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pagesize failed");
+
+       test_type = test;
+       num_threads = threads;
+       num_pages = MEMSIZE / pgsize;
+
+       T_QUIET; T_ASSERT_LT(test_type, NUM_TESTS, "invalid test type");
+       T_QUIET; T_ASSERT_GT(num_threads, 0, "num_threads <= 0");
+       T_QUIET; T_ASSERT_GT((int)num_pages / num_threads, 0, "num_pages/num_threads <= 0");
+
+       T_LOG("No. of cpus:     %d", cpus);
+       T_LOG("No. of threads:  %d", num_threads);
+       T_LOG("No. of pages:    %zu", num_pages);
+       T_LOG("Pagesize:        %zu", pgsize);
+
+       t = dt_stat_time_create("Runtime");
+       // This sets the A/B failure threshold at 50% of baseline for Runtime
+       dt_stat_set_variable(t, kPCFailureThresholdPctVar, 50.0);
+       while (!dt_stat_stable(t)) {
+               map_mem_regions();
+               execute_threads();
+               unmap_mem_regions();
+       }
+
+       dt_stat_finalize(t);
+       T_END;
+}
+
+static int get_ncpu(void)
+{
+       int ncpu;
+       size_t length = sizeof(ncpu);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &ncpu, &length, NULL, 0),
+                       "failed to query hw.ncpu");
+       return ncpu;
+}
+
+T_DECL(read_soft_fault,
+               "Read soft faults (single thread)")
+{
+       run_test(SOFT_FAULT, 1, get_ncpu());
+}
+
+T_DECL(read_soft_fault_multithreaded,
+               "Read soft faults (multi-threaded)")
+{
+       char *e;
+       int nthreads;
+
+       /* iOSMark passes in the no. of threads via an env. variable */
+       if ((e = getenv("DT_STAT_NTHREADS"))) {
+               nthreads = (int)strtol(e, NULL, 0);
+       } else {
+               nthreads = get_ncpu();
+       }
+       run_test(SOFT_FAULT, nthreads, get_ncpu());
+}
+
+T_DECL(zero_fill_fault,
+               "Zero fill faults (single thread)")
+{
+       run_test(ZERO_FILL, 1, get_ncpu());
+}
+
+T_DECL(zero_fill_fault_multithreaded,
+               "Zero fill faults (multi-threaded)")
+{
+       char *e;
+       int nthreads;
+
+       /* iOSMark passes in the no. of threads via an env. variable */
+       if ((e = getenv("DT_STAT_NTHREADS"))) {
+               nthreads = (int)strtol(e, NULL, 0);
+       } else {
+               nthreads = get_ncpu();
+       }
+       run_test(ZERO_FILL, nthreads, get_ncpu());
+}
diff --git a/tests/phys_footprint_interval_max.c b/tests/phys_footprint_interval_max.c
new file mode 100644 (file)
index 0000000..846b591
--- /dev/null
@@ -0,0 +1,94 @@
+
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <darwintest.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <mach/mach_vm.h>
+#include <mach/mach_init.h>
+#include <sys/resource.h>
+#include <libproc.h>
+#include <libproc_internal.h>
+#include <TargetConditionals.h>
+
+#define ALLOC_SIZE_LARGE (5 * 1024 * 1024)
+#define ALLOC_SIZE_SMALL (2 * 1024 * 1024)
+
+int proc_rlimit_control(pid_t pid, int flavor, void *arg);
+
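+/*
+ * The lifetime max footprint only ever grows, while the interval max can be reset with
+ * proc_reset_footprint_interval(); the two should track each other until a reset, after
+ * which the interval max stays below the lifetime max.
+ */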
+T_DECL(phys_footprint_interval_max,
+       "Validate physical footprint interval tracking")
+{
+       int ret;
+       struct rusage_info_v4 ru;
+       mach_vm_address_t addr = (mach_vm_address_t)NULL;
+
+       ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage");
+       T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint,
+                   "Max footprint and interval footprint are equal prior to dirtying memory");
+
+       ret = mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)ALLOC_SIZE_LARGE, VM_FLAGS_ANYWHERE);
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(ret, "mach_vm_allocate(ALLOC_SIZE_LARGE)");
+
+       memset((void *)addr, 0xab, ALLOC_SIZE_LARGE);
+
+       ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage");
+       T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint,
+                   "Max footprint and interval footprint are equal after dirtying large memory region");
+
+       mach_vm_deallocate(mach_task_self(), addr, (mach_vm_size_t)ALLOC_SIZE_LARGE);
+
+       ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage");
+       T_ASSERT_EQ(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint,
+                   "Max footprint and interval footprint are still equal after freeing large memory region");
+
+       ret = proc_reset_footprint_interval(getpid());
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_reset_footprint_interval()");
+
+       ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage");
+       T_ASSERT_GT(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint,
+                   "Max footprint is greater than interval footprint after resetting interval");
+
+       ret = mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)ALLOC_SIZE_SMALL, VM_FLAGS_ANYWHERE);
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(ret, "mach_vm_allocate(ALLOC_SIZE_SMALL)");
+       memset((void *)addr, 0xab, ALLOC_SIZE_SMALL);
+
+       ret = proc_pid_rusage(getpid(), RUSAGE_INFO_V4, (rusage_info_t *)&ru);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_pid_rusage");
+       T_ASSERT_GT(ru.ri_lifetime_max_phys_footprint, ru.ri_interval_max_phys_footprint,
+                   "Max footprint is still greater than interval footprint after dirtying small memory region");
+}
diff --git a/tests/poll.c b/tests/poll.c
new file mode 100644 (file)
index 0000000..8ff8806
--- /dev/null
@@ -0,0 +1,129 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+
+#include <assert.h>
+#include <dispatch/dispatch.h>
+#include <fcntl.h>
+#include <mach/mach.h>
+#include <poll.h>
+#include <stdint.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"));
+
+#define SLEEP_TIME_SECS 1
+#define POLL_TIMEOUT_MS 1800
+static_assert(POLL_TIMEOUT_MS > (SLEEP_TIME_SECS * 1000),
+               "poll timeout should be longer than sleep time");
+
+/*
+ * This matches the behavior of other UNIXes, but is under-specified in POSIX.
+ *
+ * See <rdar://problem/28372390>.
+ */
+T_DECL(sleep_with_no_fds,
+               "poll() called with no fds provided should act like sleep")
+{
+       uint64_t begin_time, sleep_time, poll_time;
+       struct pollfd pfd = { 0 };
+
+       begin_time = mach_absolute_time();
+       sleep(SLEEP_TIME_SECS);
+       sleep_time = mach_absolute_time() - begin_time;
+       T_LOG("sleep(%d) ~= %llu mach absolute time units", SLEEP_TIME_SECS, sleep_time);
+
+       begin_time = mach_absolute_time();
+       T_ASSERT_POSIX_SUCCESS(poll(&pfd, 0, POLL_TIMEOUT_MS),
+                       "poll() with 0 events and timeout %d ms", POLL_TIMEOUT_MS);
+       poll_time = mach_absolute_time() - begin_time;
+
+       T_EXPECT_GT(poll_time, sleep_time,
+                       "poll(... %d) should wait longer than sleep(1)", POLL_TIMEOUT_MS);
+}
+
+#define LAUNCHD_PATH "/sbin/launchd"
+#define PIPE_DIR_TIMEOUT_SECS 1
+
+/*
+ * See <rdar://problem/28539155>.
+ */
+T_DECL(directories,
+               "poll() with directories should return an error")
+{
+       int file, dir, pipes[2];
+       struct pollfd pfd[] = {
+               { .events = POLLIN },
+               { .events = POLLIN },
+               { .events = POLLIN },
+       };
+
+       file = open(LAUNCHD_PATH, O_RDONLY | O_NONBLOCK);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(file, "open(%s)", LAUNCHD_PATH);
+       dir = open(".", O_RDONLY | O_NONBLOCK);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(dir, "open(\".\")");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(pipes), NULL);
+
+       /* just directory */
+       pfd[0].fd = dir;
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 1, -1), "poll() with a directory");
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
+                       "directory should be an invalid event");
+
+       /* file and directory */
+       pfd[0].fd = file; pfd[0].revents = 0;
+       pfd[1].fd = dir; pfd[1].revents = 0;
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
+                       "poll() with a file and directory");
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
+       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLNVAL,
+                       "directory should be an invalid event");
+
+       /* directory and file */
+       pfd[0].fd = dir; pfd[0].revents = 0;
+       pfd[1].fd = file; pfd[1].revents = 0;
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
+                       "poll() with a directory and a file");
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
+                       "directory should be an invalid event");
+       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLIN, "file should be readable");
+
+       /* file and pipe */
+       pfd[0].fd = file; pfd[0].revents = 0;
+       pfd[1].fd = pipes[0]; pfd[1].revents = 0;
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
+                       "poll() with a file and pipe");
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
+       T_QUIET; T_EXPECT_FALSE(pfd[1].revents & POLLIN,
+                       "pipe should not be readable");
+
+       /* file, directory, and pipe */
+       pfd[0].fd = file; pfd[0].revents = 0;
+       pfd[1].fd = dir; pfd[1].revents = 0;
+       pfd[2].fd = pipes[0]; pfd[2].revents = 0;
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 3, -1),
+                       "poll() with a file, directory, and pipe");
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
+       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLNVAL,
+                       "directory should be an invalid event");
+       T_QUIET; T_EXPECT_FALSE(pfd[2].revents & POLLIN, "pipe should not be readable");
+
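+       /*
+        * The directory reports POLLNVAL immediately, so poll() should return right away;
+        * the dispatch_after block below is a watchdog that fails the test if poll()
+        * blocks on the pipe instead.
+        */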
+       /* directory and pipe */
+       __block bool timed_out = true;
+       pfd[0].fd = dir; pfd[0].revents = 0;
+       pfd[1].fd = pipes[0]; pfd[1].revents = 0;
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW,
+                       PIPE_DIR_TIMEOUT_SECS * NSEC_PER_SEC),
+                       dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0), ^{
+               T_ASSERT_FALSE(timed_out, "poll timed out after %d seconds",
+                               PIPE_DIR_TIMEOUT_SECS);
+       });
+
+       T_EXPECT_POSIX_SUCCESS(poll(pfd, 3, -1),
+                       "poll() with a directory and pipe");
+       timed_out = false;
+
+       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
+                       "directory should be an invalid event");
+       T_QUIET; T_EXPECT_FALSE(pfd[1].revents & POLLIN, "pipe should not be readable");
+}
diff --git a/tests/poll_select_kevent_paired_fds.c b/tests/poll_select_kevent_paired_fds.c
new file mode 100644 (file)
index 0000000..bd9a5e7
--- /dev/null
@@ -0,0 +1,932 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <mach/mach.h>
+#include <darwintest_multiprocess.h>
+
+#include <assert.h>
+#include <dispatch/dispatch.h>
+#include <dispatch/private.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <pthread/workqueue_private.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/event.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sysexits.h>
+#include <unistd.h>
+#include <util.h>
+#include <System/sys/event.h> /* kevent_qos */
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.kevent"),
+               T_META_CHECK_LEAKS(false),
+               T_META_LTEPHASE(LTE_POSTINIT));
+
+/*
+ * Test to validate that monitoring a PTY device, FIFO, pipe, or socket pair in
+ * a dispatch source, kqueue, poll, or select delivers read events within and
+ * between processes as expected.
+ *
+ * This test catches issues with watching special devices in kqueue(),
+ * which has tricky special cases for character devices like PTYs.
+ *
+ * It also exercises the path to wake up a dispatch worker thread from the
+ * special device kqueue event, which is also a special case in kqueue().
+ *
+ * See rdar://problem/26240299&26220074&26226862&28625427 for examples and
+ * history.
+ */
+
+#define EXPECTED_STRING    "abcdefghijklmnopqrstuvwxyz. ABCDEFGHIJKLMNOPQRSTUVWXYZ. 1234567890"
+#define EXPECTED_LEN       strlen(EXPECTED_STRING)
+
+#define READ_SETUP_TIMEOUT_SECS       2
+#define WRITE_TIMEOUT_SECS            4
+#define READ_TIMEOUT_SECS             4
+#define INCREMENTAL_WRITE_SLEEP_USECS 50
+
+static mach_timespec_t READ_SETUP_timeout = {.tv_sec = READ_SETUP_TIMEOUT_SECS, .tv_nsec = 0};
+static mach_timespec_t READ_timeout = {.tv_sec = READ_TIMEOUT_SECS, .tv_nsec = 0};
+static mach_timespec_t WRITE_timeout = {.tv_sec = WRITE_TIMEOUT_SECS, .tv_nsec = 0};
+
+enum fd_pair {
+       PTY_PAIR,
+       FIFO_PAIR,
+       PIPE_PAIR,
+       SOCKET_PAIR
+};
+
+enum write_mode {
+       FULL_WRITE,
+       INCREMENTAL_WRITE,
+       KEVENT_INCREMENTAL_WRITE,
+       KEVENT64_INCREMENTAL_WRITE,
+       KEVENT_QOS_INCREMENTAL_WRITE,
+       WORKQ_INCREMENTAL_WRITE,
+       DISPATCH_INCREMENTAL_WRITE
+};
+
+enum read_mode {
+       POLL_READ,
+       SELECT_READ,
+       KEVENT_READ,
+       KEVENT64_READ,
+       KEVENT_QOS_READ,
+       WORKQ_READ,
+       DISPATCH_READ
+};
+
+union mode {
+       enum read_mode rd;
+       enum write_mode wr;
+};
+
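+/*
+ * State shared between the reader and the writer (a second thread or a helper process):
+ * the fd pair under test, the read/write modes, and the semaphores or fds used to
+ * sequence the two sides and signal completion.
+ */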
+static struct {
+       enum fd_pair fd_pair;
+       enum write_mode wr_mode;
+       int wr_fd;
+       enum read_mode rd_mode;
+       int rd_fd;
+
+       enum writer_kind {
+               THREAD_WRITER, /* sem */
+               PROCESS_WRITER /* fd */
+       } wr_kind;
+       union {
+               semaphore_t sem;
+               struct {
+                       int in_fd;
+                       int out_fd;
+               };
+       } wr_wait;
+       semaphore_t wr_finished;
+       semaphore_t rd_finished;
+} shared;
+
+static bool handle_reading(enum fd_pair fd_pair, int fd);
+static bool handle_writing(enum fd_pair fd_pair, int fd);
+static void drive_kq(bool reading, union mode mode, enum fd_pair fd_pair,
+               int fd);
+
+#pragma mark writing
+
+static void
+wake_writer(void)
+{
+       T_LOG("waking writer");
+
+       switch (shared.wr_kind) {
+       case THREAD_WRITER:
+               T_LOG("signal shared.wr_wait.sem");
+               semaphore_signal(shared.wr_wait.sem);
+               break;
+       case PROCESS_WRITER: {
+               char tmp = 'a';
+               close(shared.wr_wait.out_fd);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(write(
+                               shared.wr_wait.in_fd, &tmp, 1), NULL);
+               break;
+       }
+       }
+}
+
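+/* Block the writer until the reader signals that it has finished setting up. */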
+static void
+writer_wait(void)
+{
+       switch (shared.wr_kind) {
+       case THREAD_WRITER:
+               T_LOG("wait shared.wr_wait.sem");
+               kern_return_t kret = semaphore_timedwait(shared.wr_wait.sem, READ_SETUP_timeout);
+
+               if (kret == KERN_OPERATION_TIMED_OUT) {
+                       T_ASSERT_FAIL("THREAD_WRITER semaphore timed out after %d seconds", READ_SETUP_timeout.tv_sec);
+               }
+               T_QUIET;
+               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.wr_wait.sem");
+               break;
+
+       case PROCESS_WRITER: {
+               char tmp;
+               close(shared.wr_wait.in_fd);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(read(
+                               shared.wr_wait.out_fd, &tmp, 1), NULL);
+               break;
+       }
+       }
+
+       T_LOG("writer woken up, starting to write");
+}
+
+static bool
+handle_writing(enum fd_pair __unused fd_pair, int fd)
+{
+       static unsigned int cur_char = 0;
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(write(fd,
+                       &(EXPECTED_STRING[cur_char]), 1), NULL);
+       cur_char++;
+
+       return (cur_char < EXPECTED_LEN);
+}
+
+#define EXPECTED_QOS QOS_CLASS_USER_INITIATED
+
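+/*
+ * Re-enable the EV_UDATA_SPECIFIC | EV_DISPATCH knote for the given fd and filter so the
+ * workqueue delivers the next event at EXPECTED_QOS.
+ */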
+static void
+reenable_workq(int fd, int16_t filt)
+{
+       struct kevent_qos_s events[] = {{
+               .ident = (uint64_t)fd,
+               .filter = filt,
+               .flags = EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH,
+               .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
+                               0, 0),
+               .fflags = NOTE_LOWAT,
+               .data = 1
+       }};
+
+       int kev = kevent_qos(-1, events, 1, events, 1, NULL, NULL,
+                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "reenable workq in kevent_qos");
+}
+
+static void
+workqueue_write_fn(void ** __unused buf, int * __unused count)
+{
+       // T_MAYFAIL;
+       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
+                       // "writer thread should be woken up at correct QoS");
+       if (!handle_writing(shared.fd_pair, shared.wr_fd)) {
+               /* finished handling the fd, tear down the source */
+               T_LOG("signal shared.wr_finished");
+               semaphore_signal(shared.wr_finished);
+               return;
+       }
+
+       reenable_workq(shared.wr_fd, EVFILT_WRITE);
+}
+
+static void
+workqueue_fn(pthread_priority_t __unused priority)
+{
+       T_ASSERT_FAIL("workqueue function callback was called");
+}
+
+static void
+drive_kq(bool reading, union mode mode, enum fd_pair fd_pair, int fd)
+{
+       struct timespec timeout = { .tv_sec = READ_TIMEOUT_SECS };
+       int kev = -1;
+
+       struct kevent events;
+       EV_SET(&events, fd, reading ? EVFILT_READ : EVFILT_WRITE, EV_ADD,
+                       NOTE_LOWAT, 1, NULL);
+       struct kevent64_s events64;
+       EV_SET64(&events64, fd, reading ? EVFILT_READ : EVFILT_WRITE, EV_ADD,
+                       NOTE_LOWAT, 1, 0, 0, 0);
+       struct kevent_qos_s events_qos[] = {{
+               .ident = (uint64_t)fd,
+               .filter = reading ? EVFILT_READ : EVFILT_WRITE,
+               .flags = EV_ADD,
+               .fflags = NOTE_LOWAT,
+               .data = 1
+       }, {
+               .ident = 0,
+               .filter = EVFILT_TIMER,
+               .flags = EV_ADD,
+               .fflags = NOTE_SECONDS,
+               .data = READ_TIMEOUT_SECS
+       }};
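+       /*
+        * The EVFILT_TIMER entry above stands in for a timeout, since the kevent_qos()
+        * calls below do not take a timeout argument.
+        */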
+
+       /* determine which variant of kevent to use */
+       enum read_mode which_kevent;
+       if (reading) {
+               which_kevent = mode.rd;
+       } else {
+               if (mode.wr == KEVENT_INCREMENTAL_WRITE) {
+                       which_kevent = KEVENT_READ;
+               } else if (mode.wr == KEVENT64_INCREMENTAL_WRITE) {
+                       which_kevent = KEVENT64_READ;
+               } else if (mode.wr == KEVENT_QOS_INCREMENTAL_WRITE) {
+                       which_kevent = KEVENT_QOS_READ;
+               } else {
+                       T_ASSERT_FAIL("unexpected mode: %d", mode.wr);
+                       __builtin_unreachable();
+               }
+       }
+
+       int kq_fd = kqueue();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(kq_fd, "kqueue");
+
+       switch (which_kevent) {
+       case KEVENT_READ:
+               kev = kevent(kq_fd, &events, 1, NULL, 0, NULL);
+               break;
+       case KEVENT64_READ:
+               kev = kevent64(kq_fd, &events64, 1, NULL, 0, 0, NULL);
+               break;
+       case KEVENT_QOS_READ:
+               kev = kevent_qos(kq_fd, events_qos, 2, NULL, 0, NULL, NULL, 0);
+               break;
+       case POLL_READ: /* FALLTHROUGH */
+       case SELECT_READ: /* FALLTHROUGH */
+       case DISPATCH_READ: /* FALLTHROUGH */
+       case WORKQ_READ: /* FALLTHROUGH */
+       default:
+               T_ASSERT_FAIL("unexpected mode: %d", reading ? mode.rd : mode.wr);
+               break;
+       }
+
+       if (reading) {
+               wake_writer();
+       } else {
+               writer_wait();
+       }
+
+       for (;;) {
+               switch (which_kevent) {
+               case KEVENT_READ:
+                       kev = kevent(kq_fd, NULL, 0, &events, 1, &timeout);
+                       break;
+               case KEVENT64_READ:
+                       kev = kevent64(kq_fd, NULL, 0, &events64, 1, 0, &timeout);
+                       break;
+               case KEVENT_QOS_READ:
+                       kev = kevent_qos(kq_fd, NULL, 0, events_qos, 2, NULL, NULL, 0);
+
+                       /* check for a timeout */
+                       for (int i = 0; i < kev; i++) {
+                               if (events_qos[i].filter == EVFILT_TIMER) {
+                                       kev = 0;
+                               }
+                       }
+                       break;
+               case POLL_READ: /* FALLTHROUGH */
+               case SELECT_READ: /* FALLTHROUGH */
+               case DISPATCH_READ: /* FALLTHROUGH */
+               case WORKQ_READ: /* FALLTHROUGH */
+               default:
+                       T_ASSERT_FAIL("unexpected mode: %d", reading ? mode.rd : mode.wr);
+                       break;
+               }
+
+               if (kev == -1 && errno == EINTR) {
+                       T_LOG("kevent was interrupted");
+                       continue;
+               }
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent");
+               T_QUIET; T_ASSERT_NE(kev, 0, "kevent timed out");
+
+               if (reading) {
+                       if (!handle_reading(fd_pair, fd)) {
+                               break;
+                       }
+               } else {
+                       if (!handle_writing(fd_pair, fd)) {
+                               break;
+                       }
+               }
+       }
+
+       close(kq_fd);
+}
+
+static void *
+write_to_fd(void * __unused ctx)
+{
+       ssize_t bytes_wr = 0;
+
+       writer_wait();
+
+       switch (shared.wr_mode) {
+       case FULL_WRITE:
+               do {
+                       if (bytes_wr == -1) {
+                               T_LOG("write from child was interrupted");
+                       }
+                       bytes_wr = write(shared.wr_fd, EXPECTED_STRING,
+                                       EXPECTED_LEN);
+               } while (bytes_wr == -1 && errno == EINTR);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(bytes_wr, "write");
+               T_QUIET; T_ASSERT_EQ(bytes_wr, (ssize_t)EXPECTED_LEN,
+                               "wrote enough bytes");
+               break;
+
+       case INCREMENTAL_WRITE:
+               for (unsigned int i = 0; i < EXPECTED_LEN ; i++) {
+                       T_QUIET;
+                       T_ASSERT_POSIX_SUCCESS(write(shared.wr_fd,
+                                       &(EXPECTED_STRING[i]), 1), NULL);
+                       usleep(INCREMENTAL_WRITE_SLEEP_USECS);
+               }
+               break;
+
+       case KEVENT_INCREMENTAL_WRITE: /* FALLTHROUGH */
+       case KEVENT64_INCREMENTAL_WRITE: /* FALLTHROUGH */
+       case KEVENT_QOS_INCREMENTAL_WRITE: {
+               union mode mode = { .wr = shared.wr_mode };
+               drive_kq(false, mode, shared.fd_pair, shared.wr_fd);
+               break;
+       }
+
+       case WORKQ_INCREMENTAL_WRITE: {
+               // prohibit ourselves from going multi-threaded; see rdar://33296008
+               _dispatch_prohibit_transition_to_multithreaded(true);
+               int changes = 1;
+
+               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_finished, SYNC_POLICY_FIFO, 0),
+                                     "semaphore_create shared.wr_finished");
+
+               T_QUIET;
+               T_ASSERT_NE_UINT(shared.wr_finished, (unsigned)MACH_PORT_NULL, "wr_finished semaphore_create");
+
+               T_QUIET;
+               T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_kevent(workqueue_fn, workqueue_write_fn, 0, 0), NULL);
+
+               struct kevent_qos_s events[] = {{
+                       .ident = (uint64_t)shared.wr_fd,
+                       .filter = EVFILT_WRITE,
+                       .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+                       .fflags = NOTE_LOWAT,
+                       .data = 1,
+                       .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
+                                       0, 0)
+               }};
+
+               for (;;) {
+                       int kev = kevent_qos(-1, changes == 0 ? NULL : events, changes,
+                                       events, 1, NULL, NULL,
+                                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
+                       if (kev == -1 && errno == EINTR) {
+                               changes = 0;
+                               T_LOG("kevent_qos was interrupted");
+                               continue;
+                       }
+
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent_qos");
+                       break;
+               }
+               break;
+       }
+
+       case DISPATCH_INCREMENTAL_WRITE: {
+               dispatch_source_t write_src;
+
+               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_finished, SYNC_POLICY_FIFO, 0),
+                                     "semaphore_create shared.wr_finished");
+
+               T_QUIET;
+               T_ASSERT_NE_UINT(shared.wr_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
+
+               write_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE,
+                               (uintptr_t)shared.wr_fd, 0, NULL);
+               T_QUIET; T_ASSERT_NOTNULL(write_src,
+                               "dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE ...)");
+
+               dispatch_block_t handler = dispatch_block_create_with_qos_class(
+                               DISPATCH_BLOCK_ENFORCE_QOS_CLASS, EXPECTED_QOS, 0, ^{
+                       // T_MAYFAIL;
+                       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
+                                       // "write handler block should run at correct QoS");
+                       if (!handle_writing(shared.fd_pair, shared.wr_fd)) {
+                               /* finished handling the fd, tear down the source */
+                               dispatch_source_cancel(write_src);
+                               dispatch_release(write_src);
+                               T_LOG("signal shared.wr_finished");
+                               semaphore_signal(shared.wr_finished);
+                       }
+               });
+
+               dispatch_source_set_event_handler(write_src, handler);
+               dispatch_activate(write_src);
+
+               break;
+       }
+
+       default:
+               T_ASSERT_FAIL("unrecognized write mode: %d", shared.wr_mode);
+               break;
+       }
+
+       if (shared.wr_finished) {
+               T_LOG("wait shared.wr_finished");
+               kern_return_t kret = semaphore_timedwait(shared.wr_finished, WRITE_timeout);
+               if (kret == KERN_OPERATION_TIMED_OUT) {
+                       T_ASSERT_FAIL("write side semaphore timed out after %d seconds", WRITE_timeout.tv_sec);
+               }
+               T_QUIET;
+               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.wr_finished");
+               semaphore_destroy(mach_task_self(), shared.wr_finished);
+       }
+
+       T_LOG("writer finished, closing fd");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(close(shared.wr_fd), NULL);
+       return NULL;
+}
+
+#pragma mark reading
+
+#define BUF_LEN 1024
+static char final_string[BUF_LEN];
+static size_t final_length;
+
+/*
+ * Read from the read side of the fd pair.
+ *
+ * Returns false if EOF is encountered (or, for FIFOs, once the entire expected
+ * string has been read), and true otherwise.
+ */
+static bool
+handle_reading(enum fd_pair fd_pair, int fd)
+{
+       char read_buf[BUF_LEN] = { 0 };
+       ssize_t bytes_rd = 0;
+
+       do {
+               if (bytes_rd == -1) {
+                       T_LOG("read was interrupted, retrying");
+               }
+               bytes_rd = read(fd, read_buf, sizeof(read_buf) - 1);
+       } while (bytes_rd == -1 && errno == EINTR);
+
+       // T_LOG("read %zd bytes: '%s'", bytes_rd, read_buf);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(bytes_rd, "reading from file");
+       T_QUIET; T_ASSERT_LE(bytes_rd, (ssize_t)EXPECTED_LEN,
+                       "read too much from file");
+
+       if (bytes_rd == 0) {
+               T_LOG("read EOF from file");
+               return false;
+       }
+
+       read_buf[bytes_rd] = '\0';
+       strlcpy(&(final_string[final_length]), read_buf,
+                       sizeof(final_string) - final_length);
+       final_length += (size_t)bytes_rd;
+
+       T_QUIET; T_ASSERT_LE(final_length, EXPECTED_LEN,
+                       "should not read more from file than what can be sent");
+
+       /* FIFOs don't send EOF when the write side closes */
+       if (final_length == strlen(EXPECTED_STRING) &&
+                       (fd_pair == FIFO_PAIR))
+       {
+               T_LOG("read all expected bytes from FIFO");
+               return false;
+       }
+       return true;
+}
+
+static void
+workqueue_read_fn(void ** __unused buf, int * __unused count)
+{
+       // T_MAYFAIL;
+       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
+                       // "reader thread should be requested at correct QoS");
+       if (!handle_reading(shared.fd_pair, shared.rd_fd)) {
+               T_LOG("signal shared.rd_finished");
+               semaphore_signal(shared.rd_finished);
+       }
+
+       reenable_workq(shared.rd_fd, EVFILT_READ);
+}
+
+static void
+read_from_fd(int fd, enum fd_pair fd_pair, enum read_mode mode)
+{
+       int fd_flags;
+
+       T_LOG("reader setting up");
+
+       bzero(final_string, sizeof(final_string));
+
+       fd_flags = fcntl(fd, F_GETFL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(fd_flags, "fcntl(F_GETFL)");
+
+       if (!(fd_flags & O_NONBLOCK)) {
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_SETFL,
+                       fd_flags | O_NONBLOCK), NULL);
+       }
+
+       switch (mode) {
+       case POLL_READ: {
+               struct pollfd fds[] = { { .fd = fd, .events = POLLIN } };
+               wake_writer();
+
+               for (;;) {
+                       fds[0].revents = 0;
+                       int pol = poll(fds, 1, READ_TIMEOUT_SECS * 1000);
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(pol, "poll");
+                       T_QUIET; T_ASSERT_NE(pol, 0,
+                                       "poll should not time out after %d seconds, read %zd out "
+                                       "of %zu bytes",
+                                       READ_TIMEOUT_SECS, final_length, strlen(EXPECTED_STRING));
+                       T_QUIET; T_ASSERT_FALSE(fds[0].revents & POLLERR,
+                                       "should not see an error on the device");
+                       T_QUIET; T_ASSERT_FALSE(fds[0].revents & POLLNVAL,
+                                       "should not set up an invalid poll");
+
+                       if (!handle_reading(fd_pair, fd)) {
+                               break;
+                       }
+               }
+               break;
+       }
+
+       case SELECT_READ:
+               wake_writer();
+
+               for (;;) {
+                       struct timeval tv = { .tv_sec = READ_TIMEOUT_SECS };
+
+                       fd_set read_fd;
+                       FD_ZERO(&read_fd);
+                       FD_SET(fd, &read_fd);
+                       fd_set err_fd;
+                       FD_ZERO(&err_fd);
+                       FD_SET(fd, &err_fd);
+
+                       int sel = select(fd + 1, &read_fd, NULL, NULL/*&err_fd*/, &tv);
+                       if (sel == -1 && errno == EINTR) {
+                               T_LOG("select interrupted");
+                               continue;
+                       }
+                       (void)fd_pair;
+
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(sel, "select");
+
+                       T_QUIET; T_ASSERT_NE(sel, 0,
+                               "select waited for %d seconds and timed out",
+                               READ_TIMEOUT_SECS);
+
+                       /* didn't fail or time out, therefore data is ready */
+                       T_QUIET; T_ASSERT_NE(FD_ISSET(fd, &read_fd), 0,
+                                       "select should show reading fd as readable");
+
+                       if (!handle_reading(fd_pair, fd)) {
+                               break;
+                       }
+               }
+               break;
+
+       case KEVENT_READ: /* FALLTHROUGH */
+       case KEVENT64_READ: /* FALLTHROUGH */
+       case KEVENT_QOS_READ: {
+               union mode rd_mode = { .rd = shared.rd_mode };
+               drive_kq(true, rd_mode, fd_pair, shared.rd_fd);
+               break;
+       }
+
+       case WORKQ_READ: {
+               // prohibit ourselves from going multi-threaded; see rdar://33296008
+               _dispatch_prohibit_transition_to_multithreaded(true);
+               T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_kevent(
+                               workqueue_fn, workqueue_read_fn, 0, 0), NULL);
+
+               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.rd_finished, SYNC_POLICY_FIFO, 0),
+                                     "semaphore_create shared.rd_finished");
+
+               T_QUIET;
+               T_ASSERT_NE_UINT(shared.rd_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
+
+               int changes = 1;
+               struct kevent_qos_s events[] = {{
+                       .ident = (uint64_t)shared.rd_fd,
+                       .filter = EVFILT_READ,
+                       .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+                       .fflags = NOTE_LOWAT,
+                       .data = 1,
+                       .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
+                                       0, 0)
+               }};
+
+               for (;;) {
+                       int kev = kevent_qos(-1, changes == 0 ? NULL : events, changes,
+                                       events, 1, NULL, NULL,
+                                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
+                       if (kev == -1 && errno == EINTR) {
+                               changes = 0;
+                               T_LOG("kevent_qos was interrupted");
+                               continue;
+                       }
+
+                       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent_qos");
+                       break;
+               }
+
+               wake_writer();
+               break;
+       }
+
+       case DISPATCH_READ: {
+               dispatch_source_t read_src;
+
+               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.rd_finished, SYNC_POLICY_FIFO, 0),
+                                     "semaphore_create shared.rd_finished");
+
+               T_QUIET;
+               T_ASSERT_NE_UINT(shared.rd_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
+
+               read_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ,
+                               (uintptr_t)fd, 0, NULL);
+               T_QUIET; T_ASSERT_NOTNULL(read_src,
+                               "dispatch_source_create(DISPATCH_SOURCE_TYPE_READ)");
+
+               dispatch_block_t handler = dispatch_block_create_with_qos_class(
+                               DISPATCH_BLOCK_ENFORCE_QOS_CLASS, EXPECTED_QOS, 0, ^{
+                       // T_MAYFAIL;
+                       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
+                       //              "read handler block should run at correct QoS");
+
+                       if (!handle_reading(fd_pair, fd)) {
+                               /* finished handling the fd, tear down the source */
+                               dispatch_source_cancel(read_src);
+                               dispatch_release(read_src);
+                               T_LOG("signal shared.rd_finished");
+                               semaphore_signal(shared.rd_finished);
+                       }
+               });
+
+               dispatch_source_set_event_handler(read_src, handler);
+               dispatch_activate(read_src);
+
+               wake_writer();
+               break;
+       }
+
+       default:
+               T_ASSERT_FAIL("unrecognized read mode: %d", mode);
+               break;
+       }
+
+       if (shared.rd_finished) {
+               T_LOG("wait shared.rd_finished");
+               kern_return_t kret = semaphore_timedwait(shared.rd_finished, READ_timeout);
+               if (kret == KERN_OPERATION_TIMED_OUT) {
+                       T_ASSERT_FAIL("reading timed out after %d seconds", READ_timeout.tv_sec);
+               }
+               T_QUIET;
+               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.rd_finished");
+       }
+
+       T_EXPECT_EQ_STR(final_string, EXPECTED_STRING,
+                       "reader should receive valid string");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(close(fd), NULL);
+}
+
+#pragma mark file setup
+
+static void
+fd_pair_init(enum fd_pair fd_pair, int *rd_fd, int *wr_fd)
+{
+       switch (fd_pair) {
+       case PTY_PAIR:
+               T_ASSERT_POSIX_SUCCESS(openpty(rd_fd, wr_fd, NULL, NULL, NULL),
+                               NULL);
+               break;
+
+       case FIFO_PAIR: {
+               char fifo_path[] = "/tmp/async-io-fifo.XXXXXX";
+               T_QUIET; T_ASSERT_NOTNULL(mktemp(fifo_path), NULL);
+
+               T_ASSERT_POSIX_SUCCESS(mkfifo(fifo_path, 0700), "mkfifo(%s, 0700)",
+                               fifo_path);
+               /*
+                * Opening the read side of a FIFO will block until the write
+                * side opens -- use O_NONBLOCK.
+                */
+               *rd_fd = open(fifo_path, O_RDONLY | O_NONBLOCK);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(*rd_fd, "open(... O_RDONLY)");
+               *wr_fd = open(fifo_path, O_WRONLY | O_NONBLOCK);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(*wr_fd, "open(... O_WRONLY)");
+               break;
+       }
+
+       case PIPE_PAIR: {
+               int pipe_fds[2];
+               T_ASSERT_POSIX_SUCCESS(pipe(pipe_fds), NULL);
+               *rd_fd = pipe_fds[0];
+               *wr_fd = pipe_fds[1];
+               break;
+       }
+
+       case SOCKET_PAIR: {
+               int sock_fds[2];
+               T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds),
+                               NULL);
+               *rd_fd = sock_fds[0];
+               *wr_fd = sock_fds[1];
+               break;
+       }
+
+       default:
+               T_ASSERT_FAIL("unknown descriptor pair type: %d", fd_pair);
+               break;
+       }
+
+       T_QUIET; T_ASSERT_NE(*rd_fd, -1, "reading descriptor");
+       T_QUIET; T_ASSERT_NE(*wr_fd, -1, "writing descriptor");
+}
+
+#pragma mark single process
+
+static void
+drive_threads(enum fd_pair fd_pair, enum read_mode rd_mode,
+               enum write_mode wr_mode)
+{
+       pthread_t thread;
+
+       shared.fd_pair = fd_pair;
+       shared.rd_mode = rd_mode;
+       shared.wr_mode = wr_mode;
+       fd_pair_init(fd_pair, &(shared.rd_fd), &(shared.wr_fd));
+
+       shared.wr_kind = THREAD_WRITER;
+       T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_wait.sem, SYNC_POLICY_FIFO, 0),
+                             "semaphore_create shared.wr_wait.sem");
+
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, write_to_fd, NULL),
+                       NULL);
+       T_LOG("created writer thread");
+
+       read_from_fd(shared.rd_fd, fd_pair, rd_mode);
+
+       T_ASSERT_POSIX_ZERO(pthread_join(thread, NULL), NULL);
+
+       T_END;
+}
+
+#pragma mark multiple processes
+
+static void __attribute__((noreturn))
+drive_processes(enum fd_pair fd_pair, enum read_mode rd_mode, enum write_mode wr_mode)
+{
+       shared.fd_pair = fd_pair;
+       shared.rd_mode = rd_mode;
+       shared.wr_mode = wr_mode;
+       fd_pair_init(fd_pair, &(shared.rd_fd), &(shared.wr_fd));
+
+       shared.wr_kind = PROCESS_WRITER;
+       int fds[2];
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(fds), NULL);
+       shared.wr_wait.out_fd = fds[0];
+       shared.wr_wait.in_fd = fds[1];
+
+       T_LOG("starting subprocesses");
+       dt_helper_t helpers[2] = {
+               dt_fork_helper("reader_helper"),
+               dt_fork_helper("writer_helper")
+       };
+
+       close(shared.rd_fd);
+       close(shared.wr_fd);
+
+       dt_run_helpers(helpers, 2, 50000);
+}
+
+T_HELPER_DECL(reader_helper, "Read asynchronously")
+{
+       close(shared.wr_fd);
+       read_from_fd(shared.rd_fd, shared.fd_pair, shared.rd_mode);
+       T_END;
+}
+
+T_HELPER_DECL(writer_helper, "Write asynchronously")
+{
+       close(shared.rd_fd);
+       write_to_fd(NULL);
+}
+
+#pragma mark tests
+
+#define WR_DECL_PROCESSES(desc_name, fd_pair, write_name, write_str, \
+                               write_mode, read_name, read_mode) \
+               T_DECL(desc_name##_r##read_name##_w##write_name##_procs, "read changes to a " \
+                               #desc_name " with " #read_name " and writing " #write_str \
+                               " across two processes") \
+               { \
+                       drive_processes(fd_pair, read_mode, write_mode); \
+               }
+#define WR_DECL_THREADS(desc_name, fd_pair, write_name, write_str, \
+                               write_mode, read_name, read_mode) \
+               T_DECL(desc_name##_r##read_name##_w##write_name##_thds, "read changes to a " \
+                               #desc_name " with " #read_name " and writing " #write_str) \
+               { \
+                       drive_threads(fd_pair, read_mode, write_mode); \
+               }
+
+#define WR_DECL(desc_name, fd_pair, write_name, write_str, write_mode, \
+               read_name, read_mode) \
+               WR_DECL_PROCESSES(desc_name, fd_pair, write_name, write_str, \
+                               write_mode, read_name, read_mode) \
+               WR_DECL_THREADS(desc_name, fd_pair, write_name, write_str, \
+                               write_mode, read_name, read_mode)
+
+#define RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
+               WR_DECL(desc_name, fd_pair, full, "the full string", FULL_WRITE, \
+                               read_name, read_mode) \
+               WR_DECL(desc_name, fd_pair, inc, "incrementally", \
+                               INCREMENTAL_WRITE, read_name, read_mode)
+
+#define RD_DECL_DISPATCH_ONLY(suffix, desc_name, fd_pair, read_name, \
+                               read_mode) \
+               WR_DECL##suffix(desc_name, fd_pair, inc_dispatch, \
+                               "incrementally with a dispatch source", \
+                               DISPATCH_INCREMENTAL_WRITE, read_name, read_mode)
+#define RD_DECL_WORKQ_ONLY(suffix, desc_name, fd_pair, read_name, \
+                               read_mode) \
+               WR_DECL##suffix(desc_name, fd_pair, inc_workq, \
+                               "incrementally with the workqueue", \
+                               WORKQ_INCREMENTAL_WRITE, read_name, read_mode)
+
+#define RD_DECL(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_DISPATCH_ONLY(, desc_name, fd_pair, read_name, read_mode)
+               // RD_DECL_WORKQ_ONLY(, desc_name, fd_pair, read_name, read_mode)
+
+/*
+ * dispatch_source tests cannot share the same process as other workqueue
+ * tests.
+ */
+#define RD_DECL_DISPATCH(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_DISPATCH_ONLY(, desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_WORKQ_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
+                               read_mode)
+
+/*
+ * Workqueue tests cannot share the same process as other workqueue or
+ * dispatch_source tests.
+#define RD_DECL_WORKQ(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
+               RD_DECL_DISPATCH_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
+                               read_mode) \
+               RD_DECL_WORKQ_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
+                               read_mode)
+ */
+
+#define PAIR_DECL(desc_name, fd_pair) \
+       RD_DECL(desc_name, fd_pair, poll, POLL_READ) \
+       RD_DECL(desc_name, fd_pair, select, SELECT_READ) \
+       RD_DECL(desc_name, fd_pair, kevent, KEVENT_READ) \
+       RD_DECL(desc_name, fd_pair, kevent64, KEVENT64_READ) \
+       RD_DECL(desc_name, fd_pair, kevent_qos, KEVENT_QOS_READ) \
+       RD_DECL_DISPATCH(desc_name, fd_pair, dispatch_source, DISPATCH_READ)
+       // RD_DECL_WORKQ(desc_name, fd_pair, workq, WORKQ_READ)
+
+PAIR_DECL(tty, PTY_PAIR)
+PAIR_DECL(pipe, PIPE_PAIR)
+PAIR_DECL(fifo, FIFO_PAIR)
+PAIR_DECL(socket, SOCKET_PAIR)
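+
+/*
+ * For reference, each PAIR_DECL above expands into a matrix of T_DECLs.  For
+ * example, PAIR_DECL(tty, PTY_PAIR) combined with the poll reader and the
+ * full-string writer yields (among others):
+ *
+ *     T_DECL(tty_rpoll_wfull_thds, ...)  { drive_threads(PTY_PAIR, POLL_READ, FULL_WRITE); }
+ *     T_DECL(tty_rpoll_wfull_procs, ...) { drive_processes(PTY_PAIR, POLL_READ, FULL_WRITE); }
+ */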
diff --git a/tests/port_descriptions.c b/tests/port_descriptions.c
new file mode 100644 (file)
index 0000000..a42ab29
--- /dev/null
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018 Apple Computer, Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+#include <darwintest.h>
+#include <mach/port_descriptions.h>
+
+static void
+expect_special_port_description(const char *(*fn)(mach_port_t),
+               mach_port_t port, const char *namestr)
+{
+       const char *desc = fn(port);
+       T_EXPECT_NOTNULL(desc, "%s is %s", namestr, desc);
+       if (desc) {
+               T_QUIET; T_EXPECT_GT(strlen(desc), strlen(""),
+                               "%s's description string is not empty", namestr);
+       }
+}
+
+T_DECL(host_special_port_descriptions,
+               "verify that host special ports can be described")
+{
+#define TEST_HSP(portdef) \
+               expect_special_port_description(mach_host_special_port_description, \
+               portdef, #portdef)
+
+       TEST_HSP(HOST_PORT);
+       TEST_HSP(HOST_PRIV_PORT);
+       TEST_HSP(HOST_IO_MASTER_PORT);
+       TEST_HSP(HOST_DYNAMIC_PAGER_PORT);
+       TEST_HSP(HOST_AUDIT_CONTROL_PORT);
+       TEST_HSP(HOST_USER_NOTIFICATION_PORT);
+       TEST_HSP(HOST_AUTOMOUNTD_PORT);
+       TEST_HSP(HOST_LOCKD_PORT);
+       TEST_HSP(HOST_KTRACE_BACKGROUND_PORT);
+       TEST_HSP(HOST_SEATBELT_PORT);
+       TEST_HSP(HOST_KEXTD_PORT);
+       TEST_HSP(HOST_LAUNCHCTL_PORT);
+       TEST_HSP(HOST_UNFREED_PORT);
+       TEST_HSP(HOST_AMFID_PORT);
+       TEST_HSP(HOST_GSSD_PORT);
+       TEST_HSP(HOST_TELEMETRY_PORT);
+       TEST_HSP(HOST_ATM_NOTIFICATION_PORT);
+       TEST_HSP(HOST_COALITION_PORT);
+       TEST_HSP(HOST_SYSDIAGNOSE_PORT);
+       TEST_HSP(HOST_XPC_EXCEPTION_PORT);
+       TEST_HSP(HOST_CONTAINERD_PORT);
+       TEST_HSP(HOST_NODE_PORT);
+       TEST_HSP(HOST_RESOURCE_NOTIFY_PORT);
+       TEST_HSP(HOST_CLOSURED_PORT);
+       TEST_HSP(HOST_SYSPOLICYD_PORT);
+
+#undef TEST_HSP
+
+       T_EXPECT_EQ(HOST_SYSPOLICYD_PORT, HOST_MAX_SPECIAL_PORT,
+                       "checked all of the ports");
+
+       const char *invalid_hsp =
+                       mach_host_special_port_description(HOST_MAX_SPECIAL_PORT + 1);
+       T_EXPECT_NULL(invalid_hsp,
+                       "invalid host special port description should be NULL");
+}
+
+T_DECL(task_special_port_descriptions,
+               "verify that task special ports can be described")
+{
+#define TEST_TSP(portdef) \
+               expect_special_port_description(mach_task_special_port_description, \
+               portdef, #portdef)
+
+       TEST_TSP(TASK_KERNEL_PORT);
+       TEST_TSP(TASK_HOST_PORT);
+       TEST_TSP(TASK_NAME_PORT);
+       TEST_TSP(TASK_BOOTSTRAP_PORT);
+       TEST_TSP(TASK_SEATBELT_PORT);
+       TEST_TSP(TASK_ACCESS_PORT);
+       TEST_TSP(TASK_DEBUG_CONTROL_PORT);
+       TEST_TSP(TASK_RESOURCE_NOTIFY_PORT);
+
+#undef TEST_TSP
+
+       T_EXPECT_EQ(TASK_RESOURCE_NOTIFY_PORT, TASK_MAX_SPECIAL_PORT,
+                       "checked all of the ports");
+
+       const char *invalid_tsp =
+                       mach_task_special_port_description(TASK_MAX_SPECIAL_PORT + 1);
+       T_EXPECT_NULL(invalid_tsp,
+                       "invalid task special port description should be NULL");
+}
+
+static void
+expect_special_port_id(int (*fn)(const char *id), int port, const char *portid)
+{
+       int observed_port = fn(portid);
+       T_WITH_ERRNO;
+       T_EXPECT_EQ(observed_port, port, "%s is %d", portid, observed_port);
+}
+
+T_DECL(host_special_port_mapping,
+               "verify that host special port names can be mapped to numbers")
+{
+#define TEST_HSP(portdef) \
+               expect_special_port_id(mach_host_special_port_for_id, \
+               portdef, #portdef)
+
+       TEST_HSP(HOST_PORT);
+       TEST_HSP(HOST_PRIV_PORT);
+       TEST_HSP(HOST_IO_MASTER_PORT);
+       TEST_HSP(HOST_DYNAMIC_PAGER_PORT);
+       TEST_HSP(HOST_AUDIT_CONTROL_PORT);
+       TEST_HSP(HOST_USER_NOTIFICATION_PORT);
+       TEST_HSP(HOST_AUTOMOUNTD_PORT);
+       TEST_HSP(HOST_LOCKD_PORT);
+       TEST_HSP(HOST_KTRACE_BACKGROUND_PORT);
+       TEST_HSP(HOST_SEATBELT_PORT);
+       TEST_HSP(HOST_KEXTD_PORT);
+       TEST_HSP(HOST_LAUNCHCTL_PORT);
+       TEST_HSP(HOST_UNFREED_PORT);
+       TEST_HSP(HOST_AMFID_PORT);
+       TEST_HSP(HOST_GSSD_PORT);
+       TEST_HSP(HOST_TELEMETRY_PORT);
+       TEST_HSP(HOST_ATM_NOTIFICATION_PORT);
+       TEST_HSP(HOST_COALITION_PORT);
+       TEST_HSP(HOST_SYSDIAGNOSE_PORT);
+       TEST_HSP(HOST_XPC_EXCEPTION_PORT);
+       TEST_HSP(HOST_CONTAINERD_PORT);
+       TEST_HSP(HOST_NODE_PORT);
+       TEST_HSP(HOST_RESOURCE_NOTIFY_PORT);
+       TEST_HSP(HOST_CLOSURED_PORT);
+       TEST_HSP(HOST_SYSPOLICYD_PORT);
+
+#undef TEST_HSP
+
+       int invalid_hsp = mach_host_special_port_for_id("BOGUS_SPECIAL_PORT_NAME");
+       T_EXPECT_EQ(invalid_hsp, -1,
+                       "invalid host special port IDs should return -1");
+}
+
+T_DECL(task_special_port_mapping,
+               "verify that task special port names can be mapped to numbers")
+{
+#define TEST_TSP(portdef) \
+               expect_special_port_id(mach_task_special_port_for_id, \
+               portdef, #portdef)
+
+       TEST_TSP(TASK_KERNEL_PORT);
+       TEST_TSP(TASK_HOST_PORT);
+       TEST_TSP(TASK_NAME_PORT);
+       TEST_TSP(TASK_BOOTSTRAP_PORT);
+       TEST_TSP(TASK_SEATBELT_PORT);
+       TEST_TSP(TASK_ACCESS_PORT);
+       TEST_TSP(TASK_DEBUG_CONTROL_PORT);
+       TEST_TSP(TASK_RESOURCE_NOTIFY_PORT);
+
+#undef TEST_TSP
+
+       int invalid_tsp = mach_task_special_port_for_id("BOGUS_SPECIAL_PORT_NAME");
+       T_EXPECT_EQ(invalid_tsp, -1,
+                       "invalid task special port IDs should return -1");
+}
diff --git a/tests/private_entitlement.plist b/tests/private_entitlement.plist
new file mode 100644 (file)
index 0000000..6f5cece
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.entitlement-1</key>
+       <string>something</string>
+</dict>
+</plist>
diff --git a/tests/proc_core_name_24152432.c b/tests/proc_core_name_24152432.c
new file mode 100644 (file)
index 0000000..11317c6
--- /dev/null
@@ -0,0 +1,197 @@
+#include <darwintest.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/event.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <sys/resource.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <TargetConditionals.h>
+#include <unistd.h>
+#include <dirent.h>
+
+#define BUFFLEN  2048
+#define EVILLEN  19
+#define TIMEOUT  420 /* Timeout in seconds to wait for coredumps to appear */
+
+static const char corefile_ctl[]     = "kern.corefile";
+static const char coredump_ctl[]     = "kern.coredump";
+/* The directory where coredumps will be */
+static const char dump_dir[]        = "/cores";
+/* The default coredump location if the kern.coredump ctl is invalid */
+static const char default_dump_fmt[] = "/cores/core.%d";
+/* The coredump location when we set kern.coredump ctl to something valid */
+static const char valid_dump_fmt[]   = "/cores/test-core.%d";
+static const char ls_path[]          = "/bin/ls";
+
+/* /cores/core.%(null), then BORK immediately after. */
+static char evil[] = "/cores/core.%\0BORK";
+/* A valid coredump location to test. */
+static char valid_dump_loc[]   = "/cores/test-core.%P";
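+/*
+ * The kernel expands %P in kern.corefile to the pid of the crashing process,
+ * so dumps written with valid_dump_loc should match valid_dump_fmt above once
+ * the child's pid is substituted for %d.
+ */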
+
+static const struct rlimit lim_infty = {
+       RLIM_INFINITY,
+       RLIM_INFINITY
+};
+
+static volatile int stop_looking = 0;
+
+static const struct timespec timeout = {
+       TIMEOUT,
+       0
+};
+
+#if TARGET_OS_OSX
+static int fork_and_wait_for_segfault(void);
+
+static void sigalrm_handler(int sig)
+{
+       (void)sig;
+       stop_looking = 1;
+       return;
+}
+
+static void list_coredump_files()
+{
+       int ret;
+       char buf[BUFFLEN] = { 0 };
+
+       T_LOG("Contents of %s:", dump_dir);
+       snprintf(buf, BUFFLEN, "%s %s", ls_path, dump_dir);
+       ret = system(buf);
+       T_ASSERT_POSIX_SUCCESS(ret, "Listing contents of cores directory");
+       return;
+}
+
+static int fork_and_wait_for_segfault(void)
+{
+       int pid, ret;
+       pid = fork();
+       if (pid == 0) {
+               unsigned int *ptr = NULL; /* Cause a segfault so that we get a coredump */
+               *ptr = 0xdeadd00d;
+               T_FAIL("Expected segmentation fault on write to NULL pointer");
+       }
+       T_ASSERT_TRUE(pid != -1, "Checking fork success in parent");
+
+       ret = wait(NULL);
+       T_ASSERT_TRUE(ret != -1, "Waited for child to segfault and dump core");
+       return pid;
+}
+
+static int setup_coredump_kevent(struct kevent *kev, int dir)
+{
+       int ret;
+       int kqfd;
+
+       EV_SET(kev, dir, EVFILT_VNODE, EV_ADD, NOTE_WRITE, 0, NULL);
+       kqfd = kqueue();
+       T_ASSERT_POSIX_SUCCESS(kqfd, "kqueue: get kqueue for coredump monitoring");
+
+       ret = kevent(kqfd, kev, 1, NULL, 0, NULL);
+       T_ASSERT_POSIX_SUCCESS(ret, "kevent: setup directory monitoring for coredump");
+       return kqfd;
+}
+
+static void look_for_coredump(const char *format, int pid, int kqfd, struct kevent *kev)
+{
+       int ret = 0;
+       int i = 0;
+       char buf[BUFFLEN];
+       memset(buf, 0, BUFFLEN);
+       /*
+        * Something else might touch this directory. If we get notified and don't see
+        * anything, try a few more times before failing.
+        */
+       alarm(TIMEOUT);
+       while (!stop_looking) {
+               /* Wait for kevent to tell us the coredump folder was modified */
+               ret = kevent(kqfd, NULL, 0, kev, 1, &timeout);
+               T_ASSERT_POSIX_SUCCESS(ret, "kevent: Waiting for coredump to appear");
+
+               snprintf(buf, BUFFLEN, format, pid);
+               ret = remove(buf);
+
+               if (ret != -1)
+                       break;
+
+               T_LOG("Couldn't find coredump file (try #%d).", i+1);
+               i++;
+       }
+       alarm(0);
+
+       if (ret == -1) {
+               /* Couldn't find the coredump -- list contents of /cores */
+               list_coredump_files();
+       }
+       T_ASSERT_POSIX_SUCCESS(ret, "Removing coredump file (should be at %s)", buf);
+}
+
+static void sysctl_enable_coredumps(void)
+{
+       int ret;
+       int enable_core_dump = 1;
+       size_t oldlen = BUFFLEN;
+       char buf[BUFFLEN];
+       memset(buf, 0, BUFFLEN);
+
+       ret = sysctlbyname(coredump_ctl, buf, &oldlen, &enable_core_dump, sizeof(int));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: enable core dumps");
+
+       ret = setrlimit(RLIMIT_CORE, &lim_infty);
+       T_ASSERT_POSIX_SUCCESS(ret, "setrlimit: remove limit on maximum coredump size");
+}
+#endif
+
+T_DECL(
+       proc_core_name_24152432,
+       "Tests behavior of core dump when kern.corefile ends in %, e.g., /cores/core.%",
+       T_META_ASROOT(true),
+       T_META_IGNORECRASHES("proc_core_name_24152432.*"))
+{
+#if TARGET_OS_OSX
+       DIR *dirp;
+       int ret, pid, dir;
+       char buf[BUFFLEN];
+       memset(buf, 0, BUFFLEN);
+       size_t oldlen = BUFFLEN;
+       struct kevent kev;
+       sig_t sig;
+       int kqfd;
+
+       sig = signal(SIGALRM, sigalrm_handler);
+       T_WITH_ERRNO; T_EXPECT_NE(sig, SIG_ERR, "signal: set sigalrm handler");
+
+       dirp = opendir(dump_dir);
+       T_ASSERT_NOTNULL(dirp, "opendir: opening coredump directory");
+       dir = dirfd(dirp);
+       T_ASSERT_POSIX_SUCCESS(dir, "dirfd: getting file descriptor for coredump directory");
+       kqfd = setup_coredump_kevent(&kev, dir);
+
+       sysctl_enable_coredumps();
+
+       ret = sysctlbyname(corefile_ctl, buf, &oldlen, evil, EVILLEN);
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set bad core dump location, old value was %s", buf);
+       memset(buf, 0, BUFFLEN);
+       oldlen = BUFFLEN;
+
+       pid = fork_and_wait_for_segfault();
+       look_for_coredump(default_dump_fmt, pid, kqfd, &kev);
+
+       ret = sysctlbyname(corefile_ctl, buf, &oldlen, valid_dump_loc, strlen(valid_dump_loc));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set valid core dump location, old value was %s", buf);
+       memset(buf, 0, BUFFLEN);
+
+       pid = fork_and_wait_for_segfault();
+       look_for_coredump(valid_dump_fmt, pid, kqfd, &kev);
+
+       closedir(dirp);
+       close(kqfd);
+#else
+       T_LOG("proc_core_name is supported on macOS only, skipping test.");
+#endif
+       T_PASS("proc_core_name_24152432 PASSED");
+}
diff --git a/tests/proc_info.c b/tests/proc_info.c
new file mode 100644 (file)
index 0000000..cb5799d
--- /dev/null
@@ -0,0 +1,2178 @@
+#define PRIVATE
+#include <System/sys/kdebug.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <dispatch/dispatch.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <libproc.h>
+#include <limits.h>
+#include <mach/mach.h>
+#include <mach/policy.h>
+#include <mach/vm_param.h>
+#include <os/assumes.h>
+#include <os/overflow.h>
+#include <pthread.h>
+#include <pthread/qos_private.h>
+#include <signal.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/event.h>
+#include <sys/mman.h>
+#include <sys/proc_info.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <unistd.h>
+#undef PRIVATE
+
+#define ACT_CHANGE_UID 1
+#define ACT_CHANGE_RUID 2
+#define ACT_EXIT 127
+
+#define ACT_PHASE2 2
+#define ACT_PHASE3 3
+#define ACT_PHASE4 4
+#define ACT_PHASE5 5
+
+#define PIPE_IN 0
+#define PIPE_OUT 1
+
+#define CONF_THREAD_NAME "test_child_thread"
+#define CONF_CMD_NAME getprogname()
+#define CONF_PROC_COUNT 20
+#define CONF_BLK_SIZE 4096
+#define CONF_UID_VAL 999U
+#define CONF_RUID_VAL 998U
+#define CONF_GID_VAL 997U
+#define CONF_NICE_VAL 5
+#define CONF_NUM_THREADS 2
+
+#define BASEPRI_DEFAULT 31
+#define MAXPRI_USER 63
+
+#define CONF_OPN_FILE_COUNT 3
+#define CONF_TMP_FILE_PFX   "/tmp/xnu.tests.proc_info."
+static int CONF_TMP_FILE_OPEN(char path[PATH_MAX])
+{
+       static char stmp_path[PATH_MAX] = {};
+       char *nm;
+       if (path) {
+               nm = path;
+       } else {
+               nm = stmp_path;
+       }
+       strlcpy(nm, CONF_TMP_FILE_PFX "XXXXXXXXXX", PATH_MAX);
+       int fd = mkstemp(nm);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(fd, "mkstemp(" CONF_TMP_FILE_PFX "XXXXXXXXXX)");
+       return fd;
+}
+
+uint32_t get_tty_dev(void);
+
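+/*
+ * Wait until every child has acknowledged `action' over pipefd.  With a
+ * single child this spins until the child echoes the requested action; with
+ * multiple children it reads one acknowledgement per child and fails the
+ * test if any child reports a non-zero action value.
+ */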
+#define WAIT_FOR_CHILDREN(pipefd, action, child_count)                           \
+       do {                                                                         \
+               long ret;                                                                \
+               if (child_count == 1) {                                                  \
+                       int child_ret_action = 999;                                          \
+                       while (child_ret_action != action) {                                 \
+                               ret = read(pipefd, &child_ret_action, sizeof(child_ret_action)); \
+                       }                                                                    \
+               } else {                                                                 \
+                       int child_ready_count = child_count * (int)sizeof(action);           \
+                                                                                 \
+                       action = 0;                                                          \
+                       while (child_ready_count) {                                          \
+                               ret = read(pipefd, &action, (int)sizeof(action));                \
+                               if (ret != -1) {                                                 \
+                                       child_ready_count -= ret;                                    \
+                               } else {                                                         \
+                                       T_FAIL("ERROR: Could not read from pipe() : %d", errno);     \
+                               }                                                                \
+                               if (action) {                                                    \
+                                       T_FAIL("ERROR: Child action failed with error %d", action);  \
+                               }                                                                \
+                       }                                                                    \
+               }                                                                        \
+       } while (0)
+
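+/*
+ * Allocate a struct of the requested flavor, issue the corresponding
+ * PROC_INFO_CALL_PIDINFO __proc_info() call, assert that the full struct was
+ * returned, and stash the result in ret_structs for the caller to inspect.
+ */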
+#define PROC_INFO_CALL(struct_name, pid, flavor, proc_arg)                                                     \
+       do {                                                                                                       \
+               struct struct_name * struct_var = malloc(sizeof(struct struct_name));                                  \
+               T_QUIET;                                                                                               \
+               T_ASSERT_NOTNULL(struct_var, "malloc() for " #flavor);                                                 \
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, pid, flavor, (uint64_t)proc_arg, (user_addr_t)struct_var, \
+                                    (uint32_t)sizeof(struct struct_name));                                            \
+                                                                                                               \
+               T_QUIET;                                                                                               \
+               T_EXPECT_POSIX_SUCCESS(retval, "__proc_info call for " #flavor);                                       \
+               T_ASSERT_EQ_INT(retval, (int)sizeof(struct struct_name), "__proc_info call for " #flavor);             \
+               ret_structs[i] = (void *)struct_var;                                                                   \
+               i++;                                                                                                   \
+       } while (0)
+
+uint32_t
+get_tty_dev()
+{
+       struct stat buf;
+       stat(ttyname(1), &buf);
+       return ((uint32_t)buf.st_rdev);
+}
+
+/*
+ * Defined in libsyscall/wrappers/libproc/libproc.c
+ * For API test only. For normal use, please use the libproc API instead.
+ * DO NOT COPY
+ */
+extern int __proc_info(int32_t callnum, int32_t pid, uint32_t flavor, uint64_t arg, user_addr_t buffer, int32_t buffersize);
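+
+/*
+ * For comparison, the supported way to fetch the same data through libproc
+ * (declared in <libproc.h>, already included above) looks roughly like:
+ *
+ *     struct proc_bsdinfo bsdinfo;
+ *     int len = proc_pidinfo(getpid(), PROC_PIDTBSDINFO, 0,
+ *                            &bsdinfo, (int)sizeof(bsdinfo));
+ *     // len == sizeof(bsdinfo) on success, 0 or negative on failure
+ *
+ * These tests call __proc_info() directly only because they exercise the raw
+ * system call interface.
+ */
+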
+struct proc_config_s {
+       int parent_pipe[2];
+       int child_count;
+       pid_t proc_grp_id;
+       int child_pipe[CONF_PROC_COUNT][2];
+       int child_pids[CONF_PROC_COUNT];
+       void * cow_map; /* memory for cow test */
+};
+typedef struct proc_config_s * proc_config_t;
+
+typedef void (^child_action_handler_t)(proc_config_t proc_config, int child_id);
+
+enum proc_info_opt {
+       P_UNIQIDINFO    = 0x01,
+       C_UNIQIDINFO    = 0x02,
+       PBSD_OLD        = 0x04,
+       PBSD            = 0x08,
+       PBSD_SHORT      = 0x10,
+       PBSD_UNIQID     = 0x20,
+       P_TASK_INFO     = 0x40,
+       P_TASK_INFO_NEW = 0x80,
+       PALL            = 0x100,
+       THREAD_ADDR     = 0x200,
+       PTHINFO_OLD     = 0x400,
+       PTHINFO         = 0x800,
+       PTHINFO_64      = 0x1000,
+       PINFO_PATH      = 0x2000,
+       PAI             = 0x4000,
+       PREGINFO        = 0x8000,
+       PREGINFO_PATH   = 0x10000,
+       PREGINFO_PATH_2 = 0x20000,
+       PREGINFO_PATH_3 = 0x40000,
+       PVNINFO         = 0x80000
+};
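+
+/*
+ * Note: proc_info_caller() gates the child's phases with ordered comparisons
+ * (e.g. proc_info_opts >= PBSD), so the relative magnitude of these bits
+ * matters in addition to which bits are set.
+ */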
+
+static int tmp_fd = -1;
+
+static child_action_handler_t proc_info_listpids_handler = ^void(proc_config_t proc_config, int child_id) {
+  close(proc_config->parent_pipe[PIPE_IN]);
+  close(proc_config->child_pipe[child_id][PIPE_OUT]);
+  long retval      = 0;
+  int child_action = 0;
+  retval           = write(proc_config->parent_pipe[PIPE_OUT], &child_action, sizeof(child_action));
+  if (retval != -1) {
+         while (child_action != ACT_EXIT) {
+                 retval = read(proc_config->child_pipe[child_id][PIPE_IN], &child_action, sizeof(child_action));
+                 if (retval == 0 || (retval == -1 && errno == EAGAIN)) {
+                         continue;
+                 }
+                 if (retval != -1) {
+                         switch (child_action) {
+                         case ACT_CHANGE_UID:
+                                 /*
+                                  * Change uid
+                                  */
+                                 retval = setuid(CONF_UID_VAL);
+                                 break;
+                         case ACT_CHANGE_RUID:
+                                 /*
+                                  * Change ruid
+                                  */
+                                 retval = setreuid(CONF_RUID_VAL, (uid_t)-1);
+                                 break;
+                         case ACT_EXIT:
+                                 /*
+                                  * Exit
+                                  */
+                                 break;
+                         }
+                 }
+                 if (child_action != ACT_EXIT) {
+                         retval = write(proc_config->parent_pipe[PIPE_OUT], &retval, sizeof(retval));
+                         if (retval == -1)
+                                 break;
+                 }
+         }
+  }
+  close(proc_config->parent_pipe[PIPE_OUT]);
+  close(proc_config->child_pipe[child_id][PIPE_IN]);
+  exit(0);
+};
+
+static child_action_handler_t proc_info_call_pidinfo_handler = ^void(proc_config_t proc_config, int child_id) {
+  close(proc_config->parent_pipe[PIPE_IN]);
+  close(proc_config->child_pipe[child_id][PIPE_OUT]);
+  int action  = 0;
+  long retval = 0;
+  int i;
+  void * tmp_map           = NULL;
+  dispatch_queue_t q       = NULL;
+  dispatch_semaphore_t sem = NULL;
+  /*
+   * PHASE 1: Child ready and waits for parent to send next action
+   */
+  T_LOG("Child ready to accept action from parent");
+  retval = write(proc_config->parent_pipe[PIPE_OUT], &action, sizeof(action));
+  if (retval != -1) {
+         while (action != ACT_EXIT) {
+                 retval = read(proc_config->child_pipe[child_id][PIPE_IN], &action, sizeof(action));
+
+                 if (retval != -1) {
+                         retval = 0;
+                         switch (action) {
+                         case ACT_PHASE2: {
+                                 /*
+                                  * Change uid, euid, gid, rgid, nice value
+                                  * Also change the svuid and svgid
+                                  */
+                                 T_LOG("Child changing uid, euid, rgid, svuid, svgid and nice value");
+                                 retval = nice(CONF_NICE_VAL);
+                                 if (retval == -1) {
+                                         T_LOG("(child) ERROR: nice() failed");
+                                         break;
+                                 }
+                                 retval = setgid(CONF_GID_VAL);
+                                 if (retval == -1) {
+                                         T_LOG("(child) ERROR: setgid() failed");
+                                         break;
+                                 }
+                                 retval = setreuid((uid_t)-1, CONF_RUID_VAL);
+                                 if (retval == -1) {
+                                         T_LOG("(child) ERROR: setreuid() failed");
+                                         break;
+                                 }
+                                 break;
+                         }
+                         case ACT_PHASE3: {
+                                 /*
+                                  * Allocate a page of memory
+                                  * Copy on write shared memory
+                                  */
+                                 T_LOG("Child allocating a page of memory, and causing a copy-on-write");
+                                 retval  = 0;
+                                 tmp_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+                                 if (tmp_map == MAP_FAILED) {
+                                         T_LOG("(child) ERROR: mmap() failed");
+                                         retval = 1;
+                                         break;
+                                 }
+                                 /*
+                                  * Get the page allocated
+                                  */
+                                 int * map_ptr = (int *)tmp_map;
+                                 for (i = 0; i < (int)(PAGE_SIZE / sizeof(int)); i++) {
+                                         *map_ptr++ = i;
+                                 }
+                                 /*
+                                  * Cause copy on write to the page
+                                  */
+                                 *((int *)(proc_config->cow_map)) = 20;
+
+                                 break;
+                         }
+                         case ACT_PHASE4: {
+                                 T_LOG("Child spending CPU cycles and changing thread name");
+                                 retval                       = 0;
+                                 int number                   = 1000;
+                                 unsigned long long factorial = 1;
+                                 int j;
+                                 for (j = 1; j <= number; j++) {
+                                         factorial *= (unsigned long long)j;
+                                 }
+                                 sysctlbyname("kern.threadname", NULL, 0, CONF_THREAD_NAME, strlen(CONF_THREAD_NAME));
+                                 break;
+                         }
+                         case ACT_PHASE5: {
+                                 /*
+                                  * Dispatch for Workq test
+                                  */
+                                 T_LOG("Child creating a dispatch queue, and dispatching blocks on it");
+                                 q = dispatch_queue_create("com.apple.test_proc_info.workqtest",
+                                                               DISPATCH_QUEUE_CONCURRENT); // dispatch_get_global_queue(0, 0);
+                                 sem = dispatch_semaphore_create(0);
+
+                                 for (i = 0; i < CONF_NUM_THREADS; i++) {
+                                         dispatch_async(q, ^{
+                                               /*
+                                                * Block the thread, do nothing
+                                                */
+                                               dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
+                                         });
+                                 }
+                                 break;
+                         }
+                         case ACT_EXIT: {
+                                 /*
+                                  * Exit
+                                  */
+                                 if (sem) {
+                                         for (i = 0; i < CONF_NUM_THREADS; i++) {
+                                                 dispatch_semaphore_signal(sem);
+                                         }
+                                 }
+
+                                 if (tmp_map)
+                                         munmap(tmp_map, PAGE_SIZE);
+
+                                 if (proc_config->cow_map)
+                                         munmap(proc_config->cow_map, PAGE_SIZE);
+
+                                 break;
+                         }
+                         }
+                 }
+                 if (action != ACT_EXIT) {
+                         retval = write(proc_config->parent_pipe[PIPE_OUT], &action, sizeof(action));
+                         if (retval == -1)
+                                 break;
+                 }
+         }
+         close(proc_config->parent_pipe[PIPE_OUT]);
+         close(proc_config->child_pipe[child_id][PIPE_IN]);
+         exit(0);
+  }
+};
+
+static void
+free_proc_config(proc_config_t proc_config)
+{
+       free(proc_config);
+}
+
+static void
+send_action_to_child_processes(proc_config_t proc_config, int action)
+{
+       long err;
+       for (int i = 0; i < proc_config->child_count; i++) {
+               err = write(proc_config->child_pipe[i][PIPE_OUT], &action, sizeof(action));
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(err, "write() to child in send_action");
+       }
+       if (action != ACT_EXIT) {
+               WAIT_FOR_CHILDREN(proc_config->parent_pipe[PIPE_IN], action, proc_config->child_count);
+       }
+}
+
+static void
+kill_child_processes(proc_config_t proc_config)
+{
+       int ret = 0;
+       T_LOG("Killing child processes");
+       send_action_to_child_processes(proc_config, ACT_EXIT);
+       for (int child_id = 0; child_id < proc_config->child_count; child_id++) {
+               close(proc_config->child_pipe[child_id][PIPE_OUT]);
+               dt_waitpid(proc_config->child_pids[child_id], NULL, NULL, 5);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(ret, "killed child %d", child_id);
+       }
+       close(proc_config->parent_pipe[PIPE_IN]);
+       munmap(proc_config->cow_map, PAGE_SIZE);
+       T_LOG("Killed child processes");
+}
+
+static proc_config_t
+spawn_child_processes(int child_count, child_action_handler_t child_handler)
+{
+       /*
+        * Spawn procs for Tests 1.2 and 1.3
+        */
+       T_LOG("Spawning child processes...");
+       proc_config_t proc_config = malloc(sizeof(*proc_config));
+       int action                = 0;
+       int err;
+
+       setpgid(0, 0);
+       proc_config->proc_grp_id = getpgid(0);
+
+       proc_config->child_count = child_count;
+
+       err = pipe(proc_config->parent_pipe);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(err, "pipe() call");
+
+       /*
+        * Needed for ACT_PHASE3 tests
+        */
+       proc_config->cow_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+       T_QUIET;
+       T_ASSERT_NE_PTR(proc_config->cow_map, MAP_FAILED, "cow_map mmap()");
+       *((int *)(proc_config->cow_map)) = 10;
+
+       pid_t child_pid;
+       int i;
+       int child_id;
+       for (i = 0; i < child_count; i++) {
+               err = pipe(proc_config->child_pipe[i]);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(err, "pipe() call");
+
+               child_pid = fork();
+               child_id  = i;
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(child_pid, "fork() in parent process for child %d", child_id);
+
+               if (child_pid == 0) {
+                       child_handler(proc_config, child_id);
+               } else {
+                       proc_config->child_pids[child_id] = child_pid;
+               }
+               close(proc_config->child_pipe[child_id][PIPE_IN]);
+       }
+       /*
+        * Wait for the children processes to spawn
+        */
+       close(proc_config->parent_pipe[PIPE_OUT]);
+       WAIT_FOR_CHILDREN(proc_config->parent_pipe[PIPE_IN], action, child_count);
+
+       return proc_config;
+}
+
+/*
+ *  All PROC_INFO_CALL_PIDINFO __proc_info calls fire from this function.
+ *  T_DECLs require different combinations of structs and different actions
+ *  must occur in the child to get the data.  Instead of performing the setup
+ *  in each T_DECL, this function accepts a bitmap and performs the necessary setup
+ *  and cleanup work
+ */
+
+static void
+proc_info_caller(int proc_info_opts, void ** ret_structs, int * ret_child_pid)
+{
+       int retval, i = 0;
+       uint64_t * thread_addr = NULL;
+       void * map_tmp         = NULL;
+
+       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid             = proc_config->child_pids[0];
+       /*
+        * These tests only require one child.
+        * Some DECLs need to know the child pid, so we pass that back if applicable
+        */
+       if (ret_child_pid != NULL) {
+               *ret_child_pid = child_pid;
+       }
+
+       if (proc_info_opts & P_UNIQIDINFO) {
+               PROC_INFO_CALL(proc_uniqidentifierinfo, getpid(), PROC_PIDUNIQIDENTIFIERINFO, 0);
+       }
+       if (proc_info_opts & C_UNIQIDINFO) {
+               PROC_INFO_CALL(proc_uniqidentifierinfo, child_pid, PROC_PIDUNIQIDENTIFIERINFO, 0);
+       }
+       if (proc_info_opts & PBSD_OLD) {
+               PROC_INFO_CALL(proc_bsdinfo, child_pid, PROC_PIDTBSDINFO, 0);
+       }
+
+       /*
+        * Child Phase 2 Fires if opts require it
+        * Small nap after call to give child time to receive and execute the action
+        */
+
+       if (proc_info_opts >= PBSD) {
+               send_action_to_child_processes(proc_config, ACT_PHASE2);
+       }
+
+       if (proc_info_opts & PBSD) {
+               PROC_INFO_CALL(proc_bsdinfo, child_pid, PROC_PIDTBSDINFO, 0);
+       }
+
+       if (proc_info_opts & PBSD_SHORT) {
+               PROC_INFO_CALL(proc_bsdshortinfo, child_pid, PROC_PIDT_SHORTBSDINFO, 0);
+       }
+
+       if (proc_info_opts & PBSD_UNIQID) {
+               PROC_INFO_CALL(proc_bsdinfowithuniqid, child_pid, PROC_PIDT_BSDINFOWITHUNIQID, 0);
+       }
+       if (proc_info_opts & P_TASK_INFO) {
+               PROC_INFO_CALL(proc_taskinfo, child_pid, PROC_PIDTASKINFO, 0);
+       }
+
+       /*
+        * Child Phase 3 Fires
+        */
+       if (proc_info_opts >= P_TASK_INFO_NEW) {
+               send_action_to_child_processes(proc_config, ACT_PHASE3);
+       }
+
+       if (proc_info_opts & P_TASK_INFO_NEW) {
+               PROC_INFO_CALL(proc_taskinfo, child_pid, PROC_PIDTASKINFO, 0);
+       }
+
+       if (proc_info_opts & PALL) {
+               PROC_INFO_CALL(proc_taskallinfo, child_pid, PROC_PIDTASKALLINFO, 0);
+       }
+       /*
+        * This case breaks the pattern in that its proc_info call requires PALL,
+        * its value is required in some other proc_info calls
+        * and we never put the retval into our ret_structs
+        */
+       if (proc_info_opts & THREAD_ADDR || proc_info_opts & PTHINFO_OLD || proc_info_opts & PTHINFO || proc_info_opts & PINFO_PATH) {
+               struct proc_taskallinfo * pall = malloc(sizeof(struct proc_taskallinfo));
+               T_QUIET;
+               T_ASSERT_NOTNULL(pall, "malloc() for PROC_TASKALLINFO");
+
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDTASKALLINFO, (uint32_t)0, (user_addr_t)pall,
+                                    (uint32_t)sizeof(struct proc_taskallinfo));
+               T_QUIET;
+               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_taskallinfo), "__proc_info call for PROC_PIDTASKALLINFO in THREAD_ADDR");
+
+               thread_addr = malloc(sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1));
+               T_QUIET;
+               T_ASSERT_NOTNULL(thread_addr, "malloc() for PROC_PIDLISTTHREADS");
+               memset(thread_addr, 0, sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1));
+
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDLISTTHREADS, (uint32_t)0, (user_addr_t)thread_addr,
+                                    (int32_t)(sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1)));
+               T_LOG("(int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE: %d",
+                     (int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE));
+               T_ASSERT_GE_INT((int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE), pall->ptinfo.pti_threadnum,
+                               "__proc_info call for PROC_PIDLISTTHREADS");
+
+               free(pall);
+       }
+       if (proc_info_opts & PTHINFO_OLD) {
+               PROC_INFO_CALL(proc_threadinfo, child_pid, PROC_PIDTHREADINFO, thread_addr[0]);
+       }
+
+       /*
+        * Child Phase 4 Fires
+        */
+       if (proc_info_opts >= PTHINFO) {
+               send_action_to_child_processes(proc_config, ACT_PHASE4);
+       }
+
+       if (proc_info_opts & PTHINFO) {
+               PROC_INFO_CALL(proc_threadinfo, child_pid, PROC_PIDTHREADINFO, thread_addr[0]);
+       }
+       if (proc_info_opts & PTHINFO_64) {
+               mach_port_name_t child_task  = MACH_PORT_NULL;
+               thread_array_t child_threads = NULL;
+               mach_msg_type_number_t child_thread_count;
+               thread_identifier_info_data_t child_thread_threadinfo;
+               mach_msg_type_number_t thread_info_count = THREAD_IDENTIFIER_INFO_COUNT;
+               struct proc_threadinfo * pthinfo_64      = malloc(sizeof(struct proc_threadinfo));
+               T_QUIET;
+               T_ASSERT_NOTNULL(pthinfo_64, "malloc() for PROC_THREADINFO");
+
+               retval = task_for_pid(mach_task_self(), child_pid, &child_task);
+               T_ASSERT_EQ_INT(retval, 0, "task_for_pid for PROC_PIDTHREADID64INFO");
+
+               retval = task_threads(child_task, &child_threads, &child_thread_count);
+               T_ASSERT_MACH_SUCCESS(retval, "task_threads() call for PROC_PIDTHREADID64INFO");
+
+               retval = thread_info(child_threads[0], THREAD_IDENTIFIER_INFO, (thread_info_t)&child_thread_threadinfo, &thread_info_count);
+               T_ASSERT_MACH_SUCCESS(retval, "thread_info call for PROC_PIDTHREADID64INFO");
+
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDTHREADID64INFO, (uint64_t)child_thread_threadinfo.thread_id,
+                                    (user_addr_t)pthinfo_64, (uint32_t)sizeof(struct proc_threadinfo));
+               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_threadinfo), "__proc_info call for PROC_PIDTHREADID64INFO");
+
+               ret_structs[i] = (void *)pthinfo_64;
+               i++;
+
+               mach_port_deallocate(mach_task_self(), child_task);
+               mach_port_deallocate(mach_task_self(), child_threads[0]);
+               child_threads[0] = MACH_PORT_NULL;
+               child_task       = MACH_PORT_NULL;
+       }
+       if (proc_info_opts & PINFO_PATH) {
+               PROC_INFO_CALL(proc_threadwithpathinfo, child_pid, PROC_PIDTHREADPATHINFO, thread_addr[0]);
+       }
+
+       if (proc_info_opts & PAI) {
+               PROC_INFO_CALL(proc_archinfo, getpid(), PROC_PIDARCHINFO, 0);
+       }
+
+       vm_map_size_t map_tmp_sz = 0;
+       if ((proc_info_opts & PREGINFO) | (proc_info_opts & PREGINFO_PATH) | (proc_info_opts & PREGINFO_PATH_2) |
+           (proc_info_opts & PREGINFO_PATH_3)) {
+               static char tmp_path[PATH_MAX] = {};
+               tmp_fd = CONF_TMP_FILE_OPEN(tmp_path);
+
+               /*
+                * subsequent checks assume that this data does *not* stay
+                * resident in the buffer cache, so set F_NOCACHE for direct
+                * to storage writing. NOTE: this works if the writes are
+                * page-aligned and > 2 pages in length.
+                */
+               retval = fcntl(tmp_fd, F_NOCACHE, 1);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(retval, "fcntl(%d, F_NOCACHE) failed", tmp_fd);
+
+               int npages_to_write = 10;
+               map_tmp_sz = (vm_map_size_t)npages_to_write * (vm_map_size_t)PAGE_SIZE;
+
+               /*
+                * To make sure we don't go through the cached write paths in
+                * the VM, we allocate a PAGE-aligned buffer that is > 2
+                * pages, and perform a write of the entire buffer (not in
+                * small page-aligned chunks).
+                */
+               char *buf = valloc((size_t)map_tmp_sz);
+               T_QUIET;
+               T_ASSERT_NOTNULL(buf, "valloc(%d) failed", (int)map_tmp_sz);
+
+               memset(buf, 0x5, map_tmp_sz);
+               ssize_t bw = write(tmp_fd, buf, (size_t)map_tmp_sz);
+               T_QUIET;
+               T_ASSERT_GT_INT((int)bw, 0, "write(%d, buf, %d) failed", tmp_fd, (int)map_tmp_sz);
+
+               free(buf);
+
+               map_tmp_sz -= PAGE_SIZE;
+               map_tmp = mmap(0, (size_t)map_tmp_sz, PROT_WRITE, MAP_PRIVATE, tmp_fd, (off_t)PAGE_SIZE);
+               T_ASSERT_NE_PTR(map_tmp, MAP_FAILED, "mmap() for PROC_PIDREGIONINFO");
+
+               T_LOG("file: %s is opened as fd %d and mapped at %llx with size %lu", tmp_path, tmp_fd, (uint64_t)map_tmp,
+                     (unsigned long)map_tmp_sz);
+
+               /*
+                * unlink() the file to be nice, but do it _after_ we've
+                * already flushed and mapped the file. This will ensure that
+                * we don't end up writing to the buffer cache because the
+                * file is unlinked.
+                */
+               if (!(proc_info_opts & PREGINFO_PATH_3)) {
+                       retval = unlink(tmp_path);
+                       T_QUIET;
+                       T_ASSERT_POSIX_SUCCESS(retval, "unlink(%s) failed", tmp_path);
+               }
+       }
+
+       if (proc_info_opts & PREGINFO) {
+               PROC_INFO_CALL(proc_regioninfo, getpid(), PROC_PIDREGIONINFO, map_tmp);
+               ret_structs[i] = map_tmp;
+               i++;
+               ret_structs[i] = (void *)(uintptr_t)map_tmp_sz;
+               i++;
+       }
+       if (proc_info_opts & PREGINFO_PATH) {
+               PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO, map_tmp);
+               ret_structs[i] = map_tmp;
+               i++;
+               ret_structs[i] = (void *)(uintptr_t)map_tmp_sz;
+               i++;
+       }
+       if (proc_info_opts & PREGINFO_PATH_2) {
+               PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO2, map_tmp);
+               ret_structs[i] = map_tmp;
+               i++;
+               ret_structs[i] = (void *)(uintptr_t)map_tmp_sz;
+               i++;
+       }
+
+       if (proc_info_opts & PREGINFO_PATH_3) {
+               struct proc_regionwithpathinfo * preginfo_path = malloc(sizeof(struct proc_regionwithpathinfo));
+
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDREGIONPATHINFO2, (uint64_t)map_tmp,
+                                    (user_addr_t)preginfo_path, (uint32_t)sizeof(struct proc_regionwithpathinfo));
+
+               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_regionwithpathinfo), "__proc_info call for PROC_PIDREGIONPATHINFO2");
+
+               T_LOG("preginfo_path.prp_vip.vip_vi.vi_fsid.val 0: %d", preginfo_path->prp_vip.vip_vi.vi_fsid.val[0]);
+               T_LOG("preginfo_path.prp_vip.vip_vi.vi_fsid.val 1: %d", preginfo_path->prp_vip.vip_vi.vi_fsid.val[1]);
+
+               retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDREGIONPATHINFO3,
+                                    (uint64_t)(*(uint32_t *)(preginfo_path->prp_vip.vip_vi.vi_fsid.val)), (user_addr_t)preginfo_path,
+                                    (uint32_t)sizeof(struct proc_regionwithpathinfo));
+               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_regionwithpathinfo), "__proc_info call for PROC_PIDREGIONPATHWITHINFO3");
+               ret_structs[i] = (void *)preginfo_path;
+               i++;
+               ret_structs[i] = (void *)map_tmp;
+               i++;
+               ret_structs[i] = (void *)(uintptr_t)map_tmp_sz;
+               i++;
+
+               retval = unlink(preginfo_path->prp_vip.vip_path);
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(retval, "unlink(%s) failed", preginfo_path->prp_vip.vip_path);
+       }
+
+       if (proc_info_opts & PVNINFO) {
+               PROC_INFO_CALL(proc_vnodepathinfo, getpid(), PROC_PIDVNODEPATHINFO, 0);
+       }
+
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(thread_addr);
+       thread_addr = NULL;
+       close(tmp_fd);
+       tmp_fd = -1;
+}
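+
+/*
+ * Illustrative (hypothetical) use of proc_info_caller() from a T_DECL: to
+ * fetch proc_uniqidentifierinfo for both the parent and the spawned child,
+ * one would do roughly the following and release the results with
+ * free_proc_info():
+ *
+ *     void *structs[2];
+ *     int child_pid;
+ *     proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, structs, &child_pid);
+ *     struct proc_uniqidentifierinfo *p_uniq = structs[0];
+ *     struct proc_uniqidentifierinfo *c_uniq = structs[1];
+ *     ...
+ *     free_proc_info(structs, 2);
+ */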
+
+static void
+free_proc_info(void ** proc_info, int num)
+{
+       for (int i = 0; i < num; i++) {
+               free(proc_info[i]);
+       }
+
+       return;
+}
+
+/*
+ *     Start DECLs
+ */
+
+T_DECL(proc_info_listpids_all_pids,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       /*
+        * Get the value of nprocs with no buffer sent in
+        */
+       int num_procs;
+       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)0, (uint32_t)0);
+       T_ASSERT_GE_INT(num_procs, 1, "verify valid value for nprocs: %d", num_procs);
+
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+
+       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)0, (uint32_t)0);
+
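+       /*
+        * With no buffer supplied, the call returns the size in bytes of the
+        * buffer needed to hold every pid; convert that to a pid count and clamp
+        * it to our spawned children plus the parent before allocating.
+        */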
+       int proc_count     = num_procs / (int)sizeof(pid_t);
+       int proc_count_all = num_procs / (int)sizeof(pid_t);
+       if (proc_count > (CONF_PROC_COUNT + 1)) {
+               proc_count = CONF_PROC_COUNT + 1;
+       }
+       pid_t * proc_ids = malloc(sizeof(pid_t) * (unsigned long)proc_count);
+       num_procs        = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
+                               (int32_t)(proc_count * (int)sizeof(pid_t)));
+       num_procs = num_procs / (int)sizeof(pid_t);
+       T_ASSERT_GE_INT(num_procs, proc_count, "Valid number of pids obtained for PROC_ALL_PIDS.");
+
+       free(proc_ids);
+
+       /*
+        * Grab list of all procs and make sure our spawned children are in the list.
+        */
+
+       proc_ids  = malloc(sizeof(pid_t) * (unsigned long)proc_count_all);
+       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
+                               (int32_t)(proc_count_all * (int)sizeof(pid_t)));
+       num_procs = num_procs / (int)sizeof(pid_t);
+
+       int pid_match = 1;
+
+       for (int i = 0; i < (CONF_PROC_COUNT - 1); i++) {
+               for (int j = 0; j < num_procs; j++) {
+                       if (proc_ids[j] == proc_config->child_pids[i]) {
+                               break;
+                       } else if (j == (num_procs - 1)) {
+                               pid_match = 0;
+                               break;
+                       }
+               }
+
+               if (!pid_match) {
+                       break;
+               }
+       }
+
+       T_ASSERT_EQ(pid_match, 1, "PROC_INFO_CALL_LISTPIDS contains our spawned children's pids");
+
+       free(proc_ids);
+
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+
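+       /*
+        * Undersized-buffer case: with buffersize < sizeof(pid_t) the call is
+        * expected to fail with ENOMEM before the (already freed) proc_ids
+        * pointer is ever written to.
+        */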
+       errno     = 0;
+       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
+                               (uint32_t)(sizeof(pid_t) - 1));
+       T_EXPECT_POSIX_ERROR(errno, ENOMEM, "Valid proc_info behavior when bufsize < sizeof(pid_t).");
+}
+
+T_DECL(proc_info_listpids_pgrp_only,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+       T_LOG("Test to verify PROC_PGRP_ONLY returns correct value");
+       /*
+        * The number of pids obtained depends on the size of the buffer:
+        * count = child count + 1 (the parent).
+        * We size the buffer for one more than that to catch any overshoot.
+        */
+       int proc_count   = CONF_PROC_COUNT + 2;
+       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
+       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_PGRP_ONLY, (uint32_t)proc_config->proc_grp_id, (uint32_t)0,
+                                   (user_addr_t)proc_ids, (int32_t)(proc_count * (int)sizeof(*proc_ids)));
+       num_procs = num_procs / (int)sizeof(pid_t);
+       T_ASSERT_EQ_INT(num_procs, CONF_PROC_COUNT + 1, "Valid number of pids obtained for PROC_PGRP_ONLY.");
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(proc_ids);
+}
+
+T_DECL(proc_info_listpids_ppid_only,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+       T_LOG("Test to verify PROC_PPID_ONLY returns correct value");
+       /*
+        * Pass in the same (larger) buffer, but expect only the pids whose ppid is the pid of the current process.
+        */
+       int proc_count   = CONF_PROC_COUNT + 2;
+       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
+       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_PPID_ONLY, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
+                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
+       num_procs = num_procs / (int)sizeof(pid_t);
+       T_ASSERT_EQ_INT(num_procs, CONF_PROC_COUNT, "Valid number of pids obtained for PROC_PPID_ONLY.");
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(proc_ids);
+}
+
+T_DECL(proc_info_listpids_uid_only,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+       T_LOG("Test to verify PROC_UID_ONLY returns correct value");
+       int proc_count   = CONF_PROC_COUNT + 2;
+       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
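+       /* Ask the children to switch their uid to CONF_UID_VAL, then give them a moment to do so. */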
+       send_action_to_child_processes(proc_config, ACT_CHANGE_UID);
+       usleep(10000);
+       int num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_UID_ONLY, CONF_UID_VAL, (uint32_t)0, (user_addr_t)proc_ids,
+                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
+       T_ASSERT_GE_ULONG((unsigned long)num_procs / sizeof(pid_t), (unsigned long)CONF_PROC_COUNT,
+                         "Valid number of pids obtained for PROC_UID_ONLY.");
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(proc_ids);
+}
+
+T_DECL(proc_info_listpids_ruid_only,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+       T_LOG("Test to verify PROC_RUID_ONLY returns correct value");
+       int proc_count   = CONF_PROC_COUNT + 2;
+       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
+       send_action_to_child_processes(proc_config, ACT_CHANGE_RUID);
+       usleep(10000);
+       int num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_RUID_ONLY, CONF_RUID_VAL, (uint32_t)0, (user_addr_t)proc_ids,
+                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
+       T_ASSERT_GE_ULONG((unsigned long)num_procs / sizeof(pid_t), (unsigned long)CONF_PROC_COUNT,
+                         "Valid number of pids obtained for PROC_RUID_ONLY.");
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(proc_ids);
+}
+
+T_DECL(proc_info_listpids_tty_only,
+       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       int ret = isatty(STDOUT_FILENO);
+       if (ret != 1) {
+               T_SKIP("Not connected to tty...skipping test");
+       }
+
+       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
+
+       T_LOG("Test to verify PROC_TTY_ONLY returns correct value");
+       int proc_count   = CONF_PROC_COUNT + 2;
+       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
+       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_TTY_ONLY, get_tty_dev(), (uint32_t)0, (user_addr_t)proc_ids,
+                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
+       num_procs = num_procs / (int)sizeof(pid_t);
+       T_ASSERT_GE_INT(num_procs, 0, "Valid number of pids returned by PROC_TTY_ONLY.");
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+       free(proc_ids);
+}
+
+/*
+ * Most of the following PROC_INFO_CALL_PIDINFO tests rely on a helper function (proc_info_caller) to make the necessary proc_info
+ * calls on their behalf.
+ * In a previous iteration, these tests were all in one giant T_DECL; the helper function handles inter-DECL dependencies such as
+ * a proc_info call relying on the results of a previous proc_info call, or on an assumed state that a child should be in.
+ */
+
+T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo,
+       "Test to identify PROC_PIDUNIQIDENTIFIERINFO returns correct unique identifiers for process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, proc_info, NULL);
+       struct proc_uniqidentifierinfo * p_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[0];
+       struct proc_uniqidentifierinfo * c_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[1];
+
+       T_EXPECT_NE_ULLONG(c_uniqidinfo->p_uniqueid, p_uniqidinfo->p_uniqueid, "p_uniqueid is unique to each process");
+
+       for (size_t i = 0; i < 16; i++) {
+               T_EXPECT_EQ_UCHAR(c_uniqidinfo->p_uuid[i], p_uniqidinfo->p_uuid[i], "p_uuid should be the same unique id");
+       }
+       T_EXPECT_EQ_ULLONG(c_uniqidinfo->p_puniqueid, p_uniqidinfo->p_uniqueid,
+                          "p_puniqueid of child should be same as p_uniqueid for parent");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_pidinfo_proc_pidtbsdinfo,
+       "Test to verify PROC_PIDTBSDINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       int child_pid = 0;
+       proc_info_caller(PBSD_OLD | PBSD, proc_info, &child_pid);
+       struct proc_bsdinfo * pbsd_old = (struct proc_bsdinfo *)proc_info[0];
+       struct proc_bsdinfo * pbsd     = (struct proc_bsdinfo *)proc_info[1];
+
+       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDTBSDINFO shows correct status");
+       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDTBSDINFO shows correct xstatus (exit status)");
+       T_EXPECT_EQ_UINT(pbsd->pbi_pid, (unsigned int)child_pid, "PROC_PIDTBSDINFO returns valid pid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_ppid, (unsigned int)getpid(), "PROC_PIDTBSDINFO returns valid ppid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_uid, CONF_RUID_VAL, "PROC_PIDTBSDINFO returns valid uid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_gid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid gid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_ruid, 0U, "PROC_PIDTBSDINFO returns valid ruid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_rgid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid rgid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_svuid, CONF_RUID_VAL, "PROC_PIDTBSDINFO returns valid svuid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_svgid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid svgid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_nice, CONF_NICE_VAL, "PROC_PIDTBSDINFO returns valid nice value");
+       T_EXPECT_EQ_STR(pbsd->pbi_comm, CONF_CMD_NAME, "PROC_PIDTBSDINFO returns valid p_comm name");
+       T_EXPECT_EQ_STR(pbsd->pbi_name, CONF_CMD_NAME, "PROC_PIDTBSDINFO returns valid p_name name");
+       T_EXPECT_EQ_UINT(pbsd->pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID), "PROC_PIDTBSDINFO returns valid flags");
+       T_EXPECT_EQ_UINT(pbsd->pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDTBSDINFO returned valid pbi_nfiles");
+       T_EXPECT_EQ_UINT(pbsd->pbi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDTBSDINFO returned valid pbi_pgid");
+       T_EXPECT_EQ_UINT(pbsd->pbi_pjobc, pbsd_old->pbi_pjobc, "PROC_PIDTBSDINFO returned valid pbi_pjobc");
+       T_EXPECT_NE_UINT(pbsd->e_tdev, 0U, "PROC_PIDTBSDINFO returned valid e_tdev");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_pidt_shortbsdinfo,
+       "Test to verify PROC_PIDT_SHORTBSDINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       int child_pid = 0;
+       proc_info_caller(PBSD | PBSD_SHORT, proc_info, &child_pid);
+       struct proc_bsdinfo * pbsd            = (struct proc_bsdinfo *)proc_info[0];
+       struct proc_bsdshortinfo * pbsd_short = (struct proc_bsdshortinfo *)proc_info[1];
+
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_pid, (unsigned int)child_pid, "PROC_PIDT_SHORTBSDINFO returns valid pid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_ppid, (unsigned int)getpid(), "PROC_PIDT_SHORTBSDINFO returns valid ppid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDT_SHORTBSDINFO returned valid pbi_pgid");
+       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd_short->pbsi_status, "PROC_PIDT_SHORTBSDINFO shows correct status");
+       T_EXPECT_EQ_STR(pbsd_short->pbsi_comm, CONF_CMD_NAME, "PROC_PIDT_SHORTBSDINFO returns valid p_comm name");
+       /*
+        * The short variant returns all flags except the session (controlling-tty) flag, so it is masked out of the comparison here.
+        */
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_flags, (pbsd->pbi_flags & (unsigned int)(~PROC_FLAG_CTTY)),
+                        "PROC_PIDT_SHORTBSDINFO returns valid flags");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_uid, CONF_RUID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid uid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_gid, CONF_GID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid gid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_ruid, 0U, "PROC_PIDT_SHORTBSDINFO returns valid ruid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_svuid, CONF_RUID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid svuid");
+       T_EXPECT_EQ_UINT(pbsd_short->pbsi_svgid, CONF_GID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid svgid");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_pidt_bsdinfowithuniqid,
+       "Test to verify PROC_PIDT_BSDINFOWITHUNIQID returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[4];
+       int child_pid = 0;
+       proc_info_caller(P_UNIQIDINFO | PBSD_OLD | PBSD | PBSD_UNIQID, proc_info, &child_pid);
+       struct proc_uniqidentifierinfo * p_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[0];
+       struct proc_bsdinfo * pbsd_old                = (struct proc_bsdinfo *)proc_info[1];
+       struct proc_bsdinfo * pbsd                    = (struct proc_bsdinfo *)proc_info[2];
+       struct proc_bsdinfowithuniqid * pbsd_uniqid   = (struct proc_bsdinfowithuniqid *)proc_info[3];
+
+       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDT_BSDINFOWITHUNIQID shows correct status");
+       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDT_BSDINFOWITHUNIQID shows correct xstatus");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pid, (unsigned int)child_pid, "PROC_PIDT_BSDINFOWITHUNIQID returns valid pid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_ppid, (unsigned int)getpid(), "PROC_PIDT_BSDINFOWITHUNIQID returns valid ppid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_uid, CONF_RUID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid uid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_gid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid gid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_ruid, 0U, "PROC_PIDT_BSDINFOWITHUNIQID returns valid ruid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_rgid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid rgid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_svuid, CONF_RUID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid svuid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_svgid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid svgid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_nice, CONF_NICE_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid nice value");
+       T_EXPECT_EQ_STR(pbsd_uniqid->pbsd.pbi_comm, CONF_CMD_NAME, "PROC_PIDT_BSDINFOWITHUNIQID returns valid p_comm name");
+       T_EXPECT_EQ_STR(pbsd_uniqid->pbsd.pbi_name, CONF_CMD_NAME, "PROC_PIDT_BSDINFOWITHUNIQID returns valid p_name name");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID),
+                        "PROC_PIDT_BSDINFOWITHUNIQID returns valid flags");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_nfiles");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pgid, (uint32_t)getpgid(getpid()),
+                        "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_pgid");
+       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pjobc, pbsd->pbi_pjobc, "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_pjobc");
+       T_EXPECT_NE_UINT(pbsd_uniqid->pbsd.e_tdev, 0U, "PROC_PIDT_BSDINFOWITHUNIQID returned valid e_tdev");
+       T_EXPECT_NE_ULLONG(pbsd_uniqid->p_uniqidentifier.p_uniqueid, p_uniqidinfo->p_uniqueid,
+                          "PROC_PIDT_BSDINFOWITHUNIQID returned valid p_uniqueid");
+       for (int i = 0; i < 16; i++) {
+               T_EXPECT_EQ_UCHAR(pbsd_uniqid->p_uniqidentifier.p_uuid[i], p_uniqidinfo->p_uuid[i],
+                                 "PROC_PIDT_BSDINFOWITHUNIQID reported valid p_uuid");
+       }
+       T_EXPECT_EQ_ULLONG(pbsd_uniqid->p_uniqidentifier.p_puniqueid, p_uniqidinfo->p_uniqueid,
+                          "p_puniqueid of child should be same as p_uniqueid for parent");
+
+       free_proc_info(proc_info, 4);
+}
+
+T_DECL(proc_info_proc_pidtask_info,
+       "Test to verify PROC_PIDTASKINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       proc_info_caller(P_TASK_INFO | P_TASK_INFO_NEW, proc_info, NULL);
+       struct proc_taskinfo * p_task_info     = (struct proc_taskinfo *)proc_info[0];
+       struct proc_taskinfo * p_task_info_new = (struct proc_taskinfo *)proc_info[1];
+
+       T_EXPECT_GE_ULLONG((p_task_info_new->pti_virtual_size - p_task_info->pti_virtual_size), (unsigned long long)PAGE_SIZE,
+                          "PROC_PIDTASKINFO returned valid value for pti_virtual_size");
+       T_EXPECT_GE_ULLONG((p_task_info_new->pti_resident_size - p_task_info->pti_resident_size), (unsigned long long)PAGE_SIZE,
+                          "PROC_PIDTASKINFO returned valid value for pti_resident_size");
+       T_EXPECT_EQ_INT(p_task_info_new->pti_policy, POLICY_TIMESHARE, "PROC_PIDTASKINFO returned valid value for pti_policy");
+       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_user, 1ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_user");
+#if defined(__arm__) || defined(__arm64__)
+       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_system, 0ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_system");
+       T_EXPECT_GE_ULLONG((p_task_info_new->pti_total_system - p_task_info->pti_total_system), 0ULL,
+                          "PROC_PIDTASKINFO returned valid value for pti_total_system");
+#else
+       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_system, 1ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_system");
+       T_EXPECT_GT_ULLONG((p_task_info_new->pti_total_system - p_task_info->pti_total_system), 0ULL,
+                          "PROC_PIDTASKINFO returned valid value for pti_total_system");
+#endif
+       T_EXPECT_GT_ULLONG((p_task_info_new->pti_total_user - p_task_info->pti_total_user), 0ULL,
+                          "PROC_PIDTASKINFO returned valid value for pti_total_user");
+       T_EXPECT_GE_INT((p_task_info_new->pti_faults - p_task_info->pti_faults), 1,
+                       "PROC_PIDTASKINFO returned valid value for pti_faults");
+       T_EXPECT_GE_INT((p_task_info_new->pti_cow_faults - p_task_info->pti_cow_faults), 1,
+                       "PROC_PIDTASKINFO returned valid value for pti_cow_faults");
+       T_EXPECT_GE_INT((p_task_info_new->pti_syscalls_mach - p_task_info->pti_syscalls_mach), 0,
+                       "PROC_PIDTASKINFO returned valid value for pti_syscalls_mach");
+       T_EXPECT_GE_INT((p_task_info_new->pti_syscalls_unix - p_task_info->pti_syscalls_unix), 2,
+                       "PROC_PIDTASKINFO returned valid value for pti_syscalls_unix");
+       T_EXPECT_EQ_INT((p_task_info_new->pti_messages_sent - p_task_info->pti_messages_sent), 0,
+                       "PROC_PIDTASKINFO returned valid value for pti_messages_sent");
+       T_EXPECT_EQ_INT((p_task_info_new->pti_messages_received - p_task_info->pti_messages_received), 0,
+                       "PROC_PIDTASKINFO returned valid value for pti_messages_received");
+       T_EXPECT_EQ_INT(p_task_info_new->pti_priority, p_task_info->pti_priority,
+                       "PROC_PIDTASKINFO returned valid value for pti_priority");
+       T_EXPECT_GE_INT(p_task_info_new->pti_threadnum, 1, "PROC_PIDTASKINFO returned valid value for pti_threadnum");
+
+       if (p_task_info_new->pti_threadnum > 1) {
+               T_LOG("WARN: PROC_PIDTASKINFO returned threadnum greater than 1");
+       }
+       T_EXPECT_GE_INT(p_task_info_new->pti_numrunning, 0, "PROC_PIDTASKINFO returned valid value for pti_numrunning");
+       T_EXPECT_GE_INT(p_task_info_new->pti_pageins, 0, "PROC_PIDTASKINFO returned valid value for pti_pageins");
+
+       if (p_task_info_new->pti_pageins > 0) {
+               T_LOG("WARN: PROC_PIDTASKINFO returned pageins greater than 0");
+       }
+
+       T_EXPECT_GE_INT(p_task_info_new->pti_csw, p_task_info->pti_csw, "PROC_PIDTASKINFO returned valid value for pti_csw");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_proc_pidtaskallinfo,
+       "Test to verify PROC_PIDTASKALLINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[4];
+       int child_pid = 0;
+       proc_info_caller(PBSD | PBSD_OLD | P_TASK_INFO | PALL, proc_info, &child_pid);
+       struct proc_bsdinfo * pbsd         = (struct proc_bsdinfo *)proc_info[0];
+       struct proc_bsdinfo * pbsd_old     = (struct proc_bsdinfo *)proc_info[1];
+       struct proc_taskinfo * p_task_info = (struct proc_taskinfo *)proc_info[2];
+       struct proc_taskallinfo * pall     = (struct proc_taskallinfo *)proc_info[3];
+
+       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDTASKALLINFO shows correct status");
+       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDTASKALLINFO shows correct xstatus");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pid, (unsigned int)child_pid, "PROC_PIDTASKALLINFO returns valid pid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_ppid, (unsigned int)getpid(), "PROC_PIDTASKALLINFO returns valid ppid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_uid, CONF_RUID_VAL, "PROC_PIDTASKALLINFO returns valid uid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_gid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid gid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_ruid, 0U, "PROC_PIDTASKALLINFO returns valid ruid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_rgid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid rgid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_svuid, CONF_RUID_VAL, "PROC_PIDTASKALLINFO returns valid svuid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_svgid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid svgid");
+       T_EXPECT_EQ_INT(pall->pbsd.pbi_nice, CONF_NICE_VAL, "PROC_PIDTASKALLINFO returns valid nice value");
+       T_EXPECT_EQ_STR(pall->pbsd.pbi_comm, CONF_CMD_NAME, "PROC_PIDTASKALLINFO returns valid p_comm name");
+       T_EXPECT_EQ_STR(pall->pbsd.pbi_name, CONF_CMD_NAME, "PROC_PIDTASKALLINFO returns valid p_name name");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID), "PROC_PIDTASKALLINFO returns valid flags");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDTASKALLINFO returned valid pbi_nfiles");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDTASKALLINFO returned valid pbi_pgid");
+       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pjobc, pbsd->pbi_pjobc, "PROC_PIDTASKALLINFO returned valid pbi_pjobc");
+       T_EXPECT_NE_UINT(pall->pbsd.e_tdev, 0U, "PROC_PIDTASKALLINFO returned valid e_tdev");
+
+#if defined(__arm__) || defined(__arm64__)
+       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_system, 0ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_system");
+       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_total_system - p_task_info->pti_total_system), 0ULL,
+                          "PROC_PIDTASKALLINFO returned valid value for pti_total_system");
+#else
+       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_system, 1ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_system");
+       T_EXPECT_GT_ULLONG((pall->ptinfo.pti_total_system - p_task_info->pti_total_system), 0ULL,
+                          "PROC_PIDTASKALLINFO returned valid value for pti_total_system");
+#endif /* ARM */
+
+       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_virtual_size - p_task_info->pti_virtual_size), (unsigned long long)PAGE_SIZE,
+                          "PROC_PIDTASKALLINFO returned valid value for pti_virtual_size");
+       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_resident_size - p_task_info->pti_resident_size), (unsigned long long)PAGE_SIZE,
+                          "PROC_PIDTASKALLINFO returned valid value for pti_resident_size");
+       T_EXPECT_EQ_INT(pall->ptinfo.pti_policy, POLICY_TIMESHARE, "PROC_PIDTASKALLINFO returned valid value for pti_policy");
+       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_user, 1ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_user");
+       T_EXPECT_GT_ULLONG((pall->ptinfo.pti_total_user - p_task_info->pti_total_user), 0ULL,
+                          "PROC_PIDTASKALLINFO returned valid value for pti_total_user");
+       T_EXPECT_GE_INT((pall->ptinfo.pti_faults - p_task_info->pti_faults), 1,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_faults");
+       T_EXPECT_GE_INT((pall->ptinfo.pti_cow_faults - p_task_info->pti_cow_faults), 1,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_cow_faults");
+       T_EXPECT_GE_INT((pall->ptinfo.pti_syscalls_mach - p_task_info->pti_syscalls_mach), 0,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_syscalls_mach");
+       T_EXPECT_GE_INT((pall->ptinfo.pti_syscalls_unix - p_task_info->pti_syscalls_unix), 2,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_syscalls_unix");
+       T_EXPECT_EQ_INT((pall->ptinfo.pti_messages_sent - p_task_info->pti_messages_sent), 0,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_messages_sent");
+       T_EXPECT_EQ_INT((pall->ptinfo.pti_messages_received - p_task_info->pti_messages_received), 0,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_messages_received");
+       T_EXPECT_EQ_INT(pall->ptinfo.pti_priority, p_task_info->pti_priority,
+                       "PROC_PIDTASKALLINFO returned valid value for pti_priority");
+       T_EXPECT_GE_INT(pall->ptinfo.pti_threadnum, 1, "PROC_PIDTASKALLINFO returned valid value for pti_threadnum");
+       if (pall->ptinfo.pti_threadnum > 1) {
+               T_LOG("WARN: PROC_PIDTASKALLINFO returned threadnum greater than 1");
+       }
+       T_EXPECT_GE_INT(pall->ptinfo.pti_numrunning, 0, "PROC_PIDTASKALLINFO returned valid value for pti_numrunning");
+       T_EXPECT_GE_INT(pall->ptinfo.pti_pageins, 0, "PROC_PIDTASKALLINFO returned valid value for pti_pageins");
+       if (pall->ptinfo.pti_pageins > 0) {
+               T_LOG("WARN: PROC_PIDTASKALLINFO returned pageins greater than 0");
+       }
+       T_EXPECT_GE_INT(pall->ptinfo.pti_csw, p_task_info->pti_csw, "PROC_PIDTASKALLINFO returned valid value for pti_csw");
+
+       free_proc_info(proc_info, 4);
+}
+
+T_DECL(proc_info_proc_pidlistthreads,
+       "Test to verify PROC_PIDLISTTHREADS returns valid information about process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[1];
+       proc_info_caller(THREAD_ADDR, proc_info, NULL);
+}
+
+T_DECL(proc_info_proc_pidthreadinfo,
+       "Test to verify PROC_PIDTHREADINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       int child_pid = 0;
+       proc_info_caller(PTHINFO_OLD | PTHINFO, proc_info, &child_pid);
+       struct proc_threadinfo * pthinfo_old = (struct proc_threadinfo *)proc_info[0];
+       struct proc_threadinfo * pthinfo     = (struct proc_threadinfo *)proc_info[1];
+
+       T_EXPECT_GT_ULLONG((pthinfo->pth_user_time - pthinfo_old->pth_user_time), 0ULL,
+                          "PROC_PIDTHREADINFO returns valid value for pth_user_time");
+       T_EXPECT_GE_ULLONG((pthinfo->pth_system_time - pthinfo_old->pth_system_time), 0ULL,
+                          "PROC_PIDTHREADINFO returns valid value for pth_system_time");
+       /*
+        * This is the scaled CPU usage percentage; since we are not running a
+        * long CPU-bound task, it is (nearly) zero.
+        */
+       T_EXPECT_GE_INT(pthinfo->pth_cpu_usage, 0, "PROC_PIDTHREADINFO returns valid value for pth_cpu_usage");
+       T_EXPECT_EQ_INT(pthinfo->pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADINFO returns valid value for pth_policy");
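+       /*
+        * If the thread is neither waiting nor running, force a failing
+        * expectation (pth_run_state is never -1) so the unexpected state shows
+        * up in the test output.
+        */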
+       if (!(pthinfo->pth_run_state == TH_STATE_WAITING) && !(pthinfo->pth_run_state == TH_STATE_RUNNING)) {
+               T_EXPECT_EQ_INT(pthinfo->pth_run_state, -1, "PROC_PIDTHREADINFO returns valid value for pth_run_state");
+       }
+       /*
+        * This value is hardcoded to 0 in the source, so it is always
+        * returned as 0.
+        */
+       T_EXPECT_EQ_INT(pthinfo->pth_sleep_time, 0, "PROC_PIDTHREADINFO returns valid value for pth_sleep_time");
+       T_EXPECT_LE_INT(pthinfo->pth_curpri, (BASEPRI_DEFAULT - CONF_NICE_VAL),
+                       "PROC_PIDTHREADINFO returns valid value for pth_curpri");
+       T_EXPECT_EQ_INT(pthinfo->pth_priority, (BASEPRI_DEFAULT - CONF_NICE_VAL),
+                       "PROC_PIDTHREADINFO returns valid value for pth_priority");
+       T_EXPECT_EQ_INT(pthinfo->pth_maxpriority, MAXPRI_USER, "PROC_PIDTHREADINFO returns valid value for pth_maxpriority");
+       T_EXPECT_EQ_STR(pthinfo->pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADINFO returns valid value for pth_name");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_proc_threadid64info,
+       "Test to verify PROC_PIDTHREADID64INFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       proc_info_caller(PTHINFO | PTHINFO_64, proc_info, NULL);
+       struct proc_threadinfo pthinfo    = *((struct proc_threadinfo *)proc_info[0]);
+       struct proc_threadinfo pthinfo_64 = *((struct proc_threadinfo *)proc_info[1]);
+       T_EXPECT_GE_ULLONG(pthinfo_64.pth_user_time, pthinfo.pth_user_time,
+                          "PROC_PIDTHREADID64INFO returns valid value for pth_user_time");
+       T_EXPECT_GE_ULLONG(pthinfo_64.pth_system_time, pthinfo.pth_system_time,
+                          "PROC_PIDTHREADID64INFO returns valid value for pth_system_time");
+       T_EXPECT_GE_INT(pthinfo_64.pth_cpu_usage, pthinfo.pth_cpu_usage,
+                       "PROC_PIDTHREADID64INFO returns valid value for pth_cpu_usage");
+       T_EXPECT_EQ_INT(pthinfo_64.pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADID64INFO returns valid value for pth_policy");
+       if (!(pthinfo_64.pth_run_state == TH_STATE_WAITING) && !(pthinfo_64.pth_run_state == TH_STATE_RUNNING)) {
+               T_EXPECT_EQ_INT(pthinfo_64.pth_run_state, -1, "PROC_PIDTHREADID64INFO returns valid value for pth_run_state");
+       }
+       T_EXPECT_EQ_INT(pthinfo_64.pth_sleep_time, 0, "PROC_PIDTHREADID64INFO returns valid value for pth_sleep_time");
+       T_EXPECT_EQ_INT(pthinfo_64.pth_curpri, pthinfo.pth_curpri, "PROC_PIDTHREADID64INFO returns valid value for pth_curpri");
+       T_EXPECT_EQ_INT(pthinfo_64.pth_priority, pthinfo.pth_priority, "PROC_PIDTHREADID64INFO returns valid value for pth_priority");
+       T_EXPECT_EQ_INT(pthinfo_64.pth_maxpriority, pthinfo.pth_maxpriority,
+                       "PROC_PIDTHREADID64INFO returns valid value for pth_maxpriority");
+       T_EXPECT_EQ_STR(pthinfo_64.pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADID64INFO returns valid value for pth_name");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_proc_pidthreadpathinfo,
+       "Test to verify PROC_PIDTHREADPATHINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[2];
+       proc_info_caller(PTHINFO | PINFO_PATH, proc_info, NULL);
+       struct proc_threadinfo pthinfo            = *((struct proc_threadinfo *)proc_info[0]);
+       struct proc_threadwithpathinfo pinfo_path = *((struct proc_threadwithpathinfo *)proc_info[1]);
+
+       T_EXPECT_GE_ULLONG(pinfo_path.pt.pth_user_time, pthinfo.pth_user_time,
+                          "PROC_PIDTHREADPATHINFO returns valid value for pth_user_time");
+       T_EXPECT_GE_ULLONG(pinfo_path.pt.pth_system_time, pthinfo.pth_system_time,
+                          "PROC_PIDTHREADPATHINFO returns valid value for pth_system_time");
+       T_EXPECT_GE_INT(pinfo_path.pt.pth_cpu_usage, pthinfo.pth_cpu_usage,
+                       "PROC_PIDTHREADPATHINFO returns valid value for pth_cpu_usage");
+       T_EXPECT_EQ_INT(pinfo_path.pt.pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADPATHINFO returns valid value for pth_policy");
+       if (!(pinfo_path.pt.pth_run_state == TH_STATE_WAITING) && !(pinfo_path.pt.pth_run_state == TH_STATE_RUNNING)) {
+               T_EXPECT_EQ_INT(pinfo_path.pt.pth_run_state, -1, "PROC_PIDTHREADPATHINFO returns valid value for pth_run_state");
+       }
+       T_EXPECT_EQ_INT(pinfo_path.pt.pth_sleep_time, 0, "PROC_PIDTHREADPATHINFO returns valid value for pth_sleep_time");
+       T_EXPECT_EQ_INT(pinfo_path.pt.pth_curpri, pthinfo.pth_curpri, "PROC_PIDTHREADPATHINFO returns valid value for pth_curpri");
+       T_EXPECT_EQ_INT(pinfo_path.pt.pth_priority, pthinfo.pth_priority,
+                       "PROC_PIDTHREADPATHINFO returns valid value for pth_priority");
+       T_EXPECT_EQ_INT(pinfo_path.pt.pth_maxpriority, pthinfo.pth_maxpriority,
+                       "PROC_PIDTHREADPATHINFO returns valid value for pth_maxpriority");
+       T_EXPECT_EQ_STR(pinfo_path.pt.pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADPATHINFO returns valid value for pth_name");
+       T_EXPECT_EQ_INT(pinfo_path.pvip.vip_vi.vi_type, VNON, "PROC_PIDTHREADPATHINFO valid vnode information");
+
+       free_proc_info(proc_info, 2);
+}
+
+T_DECL(proc_info_proc_pidarchinfo,
+       "Test to verify PROC_PIDARCHINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[1];
+       proc_info_caller(PAI, proc_info, NULL);
+       struct proc_archinfo pai = *((struct proc_archinfo *)proc_info[0]);
+
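+       /*
+        * On either architecture, an unexpected p_cputype triggers a deliberately
+        * failing expectation so that the bad value is reported.
+        */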
+#if defined(__arm__) || defined(__arm64__)
+       if (!((pai.p_cputype & CPU_TYPE_ARM) == CPU_TYPE_ARM) && !((pai.p_cputype & CPU_TYPE_ARM64) == CPU_TYPE_ARM64)) {
+               T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_ARM, "PROC_PIDARCHINFO returned valid value for p_cputype");
+       }
+       T_EXPECT_EQ_INT((pai.p_cpusubtype & CPU_SUBTYPE_ARM_ALL), CPU_SUBTYPE_ARM_ALL,
+                       "PROC_PIDARCHINFO returned valid value for p_cpusubtype");
+#else
+       if (!((pai.p_cputype & CPU_TYPE_X86) == CPU_TYPE_X86) && !((pai.p_cputype & CPU_TYPE_X86_64) == CPU_TYPE_X86_64)) {
+               T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_X86, "PROC_PIDARCHINFO returned valid value for p_cputype");
+       }
+#endif
+       free_proc_info(proc_info, 1);
+}
+
+T_DECL(proc_info_proc_pidregioninfo,
+       "Test to verify PROC_PIDREGIONINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[3];
+       proc_info_caller(PREGINFO, proc_info, NULL);
+
+       struct proc_regioninfo preginfo = *((struct proc_regioninfo *)proc_info[0]);
+       /*
+        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
+        */
+       void *map_tmp = proc_info[1];
+       vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2];
+
+       T_EXPECT_EQ_ULLONG(preginfo.pri_offset, (unsigned long long)PAGE_SIZE, "PROC_PIDREGIONINFO returns valid value for pri_offset");
+       T_EXPECT_EQ_UINT((preginfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_protection, expected read/write only");
+       T_EXPECT_EQ_UINT((preginfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)), (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
+                        "PROC_PIDREGIONINFO returns valid value for pri_max_protection");
+       T_EXPECT_EQ_UINT((preginfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_inheritance");
+       T_EXPECT_EQ_UINT((preginfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U, "PROC_PIDREGIONINFO returns valid value for pri_behavior");
+       T_EXPECT_EQ_UINT(preginfo.pri_user_wired_count, 0U, "PROC_PIDREGIONINFO returns valid value for pri_user_wired_count");
+       T_EXPECT_EQ_UINT(preginfo.pri_user_tag, 0U, "PROC_PIDREGIONINFO returns valid value for pri_user_tag");
+       T_EXPECT_NE_UINT((preginfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_flags");
+       T_EXPECT_EQ_UINT(preginfo.pri_pages_resident, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo.pri_pages_shared_now_private, 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_pages_shared_now_private");
+       T_EXPECT_EQ_UINT(preginfo.pri_pages_swapped_out, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_swapped_out");
+       T_EXPECT_EQ_UINT(preginfo.pri_pages_dirtied, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_dirtied");
+       T_EXPECT_EQ_UINT(preginfo.pri_ref_count, 2U, "PROC_PIDREGIONINFO returns valid value for pri_ref_count");
+       T_EXPECT_EQ_UINT(preginfo.pri_shadow_depth, 1U, "PROC_PIDREGIONINFO returns valid value for pri_shadow_depth");
+       T_EXPECT_EQ_UINT(preginfo.pri_share_mode, (unsigned int)SM_COW, "PROC_PIDREGIONINFO returns valid value for pri_share_mode");
+       T_EXPECT_EQ_UINT(preginfo.pri_private_pages_resident, 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_private_pages_resident");
+       T_EXPECT_GE_UINT(preginfo.pri_shared_pages_resident, 0U,
+                        "PROC_PIDREGIONINFO returns valid value for pri_shared_pages_resident");
+       T_EXPECT_EQ_ULLONG(preginfo.pri_address, (uint64_t)map_tmp, "PROC_PIDREGIONINFO returns valid value for pri_addr");
+       T_EXPECT_NE_UINT(preginfo.pri_obj_id, 0U, "PROC_PIDREGIONINFO returns valid value for pri_obj_id");
+       T_EXPECT_EQ_ULLONG(preginfo.pri_size, (unsigned long long)map_tmp_sz, "PROC_PIDREGIONINFO returns valid value for pri_size");
+       T_EXPECT_EQ_UINT(preginfo.pri_depth, 0U, "PROC_PIDREGIONINFO returns valid value for pri_depth");
+
+       int ret = 0;
+       ret     = munmap(map_tmp, (size_t)map_tmp_sz);
+       T_QUIET;
+       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
+       free_proc_info(proc_info, 1);
+}
+
+T_DECL(proc_info_proc_pidregionpathinfo,
+       "Test to verify PROC_PIDREGIONPATHINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+{
+       void * proc_info[3];
+       proc_info_caller(PREGINFO_PATH, proc_info, NULL);
+
+       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
+       /*
+        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
+        */
+       void *map_tmp = proc_info[1];
+       vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2];
+
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
+                          "PROC_PIDREGIONPATHINFO returns valid value for pri_offset");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_protection, expected read/write only");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)),
+                        (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_max_protection");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_inheritance");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_behavior");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_user_wired_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_user_tag");
+       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_flags");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_shared_now_private");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_swapped_out");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_dirtied");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 2U, "PROC_PIDREGIONPATHINFO returns valid value for pri_ref_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 1U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_shadow_depth");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, (unsigned int)SM_COW,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_share_mode");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_private_pages_resident");
+       T_EXPECT_GE_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO returns valid value for pri_shared_pages_resident");
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp,
+                          "PROC_PIDREGIONPATHINFO returns valid value for pri_addr");
+       T_EXPECT_NE_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_obj_id");
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)map_tmp_sz,
+                          "PROC_PIDREGIONPATHINFO returns valid value for pri_size");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_depth");
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO returns valid value for vi_type");
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO returns valid value for vi_pad");
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
+                       "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[0]");
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
+                       "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[1]");
+       T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PFX)), NULL,
+                       "PROC_PIDREGIONPATHINFO returns valid value for vi_path");
+       /*
+        * Basic sanity checks for vnode stat returned by the API
+        */
+       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_dev");
+       T_EXPECT_EQ_INT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
+                       "PROC_PIDREGIONPATHINFO returns valid value for vst_mode");
+       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)0, /* the file was unlink()'d! */
+                          "PROC_PIDREGIONPATHINFO returns valid value for vst_nlink");
+       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
+                          "PROC_PIDREGIONPATHINFO returns valid value for vst_ino");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_uid");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_gid");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
+                         "PROC_PIDREGIONPATHINFO returns valid value for vst_size");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
+                         "PROC_PIDREGIONPATHINFO returns valid value for vst_blocks");
+       T_EXPECT_GE_INT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
+                       "PROC_PIDREGIONPATHINFO returns valid value for vst_blksize");
+
+       int ret = 0;
+       ret     = munmap(map_tmp, (size_t)map_tmp_sz);
+       T_QUIET;
+       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
+       free_proc_info(proc_info, 1);
+}
+
+T_DECL(proc_info_proc_pidregionpathinfo2,
+       "Test to verify PROC_PIDREGIONPATHINFO2 returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+{
+       void * proc_info[3];
+       proc_info_caller(PREGINFO_PATH_2, proc_info, NULL);
+
+       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
+       /*
+        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
+        */
+       void *map_tmp = proc_info[1];
+       vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2];
+
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
+                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_offset");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_protection, expected read/write only");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)),
+                        (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_max_protection");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_inheritance");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_behavior");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_user_wired_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_user_tag");
+       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_flags");
+       /*
+        * Following values are hard-coded to be zero in source
+        */
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_shared_now_private");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_swapped_out");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_dirtied");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_ref_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_shadow_depth");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_share_mode");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_private_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_shared_pages_resident");
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp,
+                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_addr");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_obj_id");
+       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (unsigned long long)map_tmp_sz,
+                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_size");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_depth");
+
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_type");
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_pad");
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
+                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_fsid.val[0]:%d",
+                       preginfo_path.prp_vip.vip_vi.vi_fsid.val[0]);
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
+                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_fsid.val[1]:%d",
+                       preginfo_path.prp_vip.vip_vi.vi_fsid.val[1]);
+       T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PFX)), NULL,
+                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_path");
+       /*
+        * Basic sanity checks for vnode stat returned by the API
+        */
+       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_dev");
+       T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for vst_mode");
+       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)0, /* the file was unlink()'d! */
+                          "PROC_PIDREGIONPATHINFO2 returns valid value for vst_nlink");
+       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
+                          "PROC_PIDREGIONPATHINFO2 returns valid value for vst_ino");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_uid");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_gid");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
+                         "PROC_PIDREGIONPATHINFO2 returns valid value for vst_size");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
+                         "PROC_PIDREGIONPATHINFO2 returns valid value for vst_blocks");
+       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
+                        "PROC_PIDREGIONPATHINFO2 returns valid value for vst_blksize");
+
+       int ret = 0;
+       ret     = munmap(map_tmp, (size_t)map_tmp_sz);
+       T_QUIET;
+       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
+       free_proc_info(proc_info, 1);
+}
+
+T_DECL(proc_info_proc_pidregionpathinfo3,
+       "Test to verify PROC_PIDREGIONPATHINFO3 returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
+{
+       void * proc_info[3];
+       proc_info_caller(PREGINFO_PATH_3, proc_info, NULL);
+
+       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
+       void *map_tmp = proc_info[1];
+       vm_map_size_t map_tmp_sz = (vm_map_size_t)(uintptr_t)proc_info[2];
+
+       T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
+                          "PROC_PIDREGIONPATHINFO3 returns valid value for pri_offset");
+       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_WRITE | VM_PROT_EXECUTE)), 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_protection");
+#if defined(__arm__) || defined(__arm64__)
+       T_EXPECT_GT_UINT(preginfo_path.prp_prinfo.pri_max_protection, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_max_protection");
+#else
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection ^ VM_PROT_ALL), 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_max_protection");
+#endif
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_inheritance");
+       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_behavior");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_user_wired_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_user_tag");
+       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_flags");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_shared_now_private");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_swapped_out");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_dirtied");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_ref_count");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_shadow_depth");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_share_mode");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_private_pages_resident");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_shared_pages_resident");
+       T_EXPECT_NE_ULLONG(preginfo_path.prp_prinfo.pri_address, 0ULL, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_addr");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_obj_id");
+       T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)map_tmp_sz,
+                          "PROC_PIDREGIONPATHINFO3 returns valid value for pri_size");
+       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_depth");
+
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO3 returns valid value for vi_type");
+       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO3 returns valid value for vi_pad");
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
+                       "PROC_PIDREGIONPATHINFO3 returns valid value for vi_fsid.val[0]");
+       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
+                       "PROC_PIDREGIONPATHINFO3 returns valid value for vi_fsid.val[1]");
+       /*
+        * Basic sanity checks for vnode stat returned by the API
+        */
+       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_dev");
+       T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for vst_mode");
+       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1, /* the file was unlink()'d _after_ calling proc_info */
+                          "PROC_PIDREGIONPATHINFO3 returns valid value for vst_nlink");
+       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
+                          "PROC_PIDREGIONPATHINFO3 returns valid value for vst_ino");
+       /*
+        * No way to confirm actual ownership or binary. Just log the value
+        */
+       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_uid");
+       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_gid");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
+                         "PROC_PIDREGIONPATHINFO3 returns valid value for vst_size");
+       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
+                         "PROC_PIDREGIONPATHINFO3 returns valid value for vst_blocks");
+       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
+                        "PROC_PIDREGIONPATHINFO3 returns valid value for vst_blksize");
+
+       int ret = 0;
+       ret     = munmap(map_tmp, (size_t)map_tmp_sz);
+       T_QUIET;
+       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
+       free_proc_info(proc_info, 1);
+}
+
+T_DECL(proc_info_proc_pidvnodepathinfo,
+       "Test to verify PROC_PIDVNODEPATHINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       void * proc_info[1];
+       proc_info_caller(PVNINFO, proc_info, NULL);
+       struct proc_vnodepathinfo pvninfo = *((struct proc_vnodepathinfo *)proc_info[0]);
+
+       T_EXPECT_EQ_INT(pvninfo.pvi_cdir.vip_vi.vi_type, VDIR, "PROC_PIDVNODEPATHINFO returns valid value for vi_type");
+       T_EXPECT_EQ_INT(pvninfo.pvi_cdir.vip_vi.vi_pad, 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_pad");
+       T_EXPECT_NE_INT(pvninfo.pvi_cdir.vip_vi.vi_fsid.val[0], 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_fsid.val[0]");
+       T_EXPECT_NE_INT(pvninfo.pvi_cdir.vip_vi.vi_fsid.val[1], 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_fsid.val[1]");
+       /*
+        * Basic sanity checks for vnode stat returned by the API
+        */
+       T_EXPECT_NE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_dev");
+       T_EXPECT_EQ_INT(((pvninfo.pvi_cdir.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFDIR), 0,
+                       "PROC_PIDVNODEPATHINFO returns valid value for vst_mode");
+       T_EXPECT_GE_USHORT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_nlink, (unsigned short)2,
+                          "PROC_PIDVNODEPATHINFO returns valid value for vst_nlink");
+       T_EXPECT_NE_ULLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDVNODEPATHINFO returns valid value for vst_ino");
+       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_uid");
+       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_gid");
+       T_EXPECT_GT_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_size, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for vst_size");
+       T_EXPECT_GE_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blocks, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for vst_blocks");
+       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
+                        "PROC_PIDVNODEPATHINFO returns valid value for vst_blksize");
+
+       free_proc_info(proc_info, 1);
+}
+/*
+ * The remaining tests break from the pattern of the other PROC_INFO_CALL_PIDINFO tests.
+ * We call proc_info directly as it's more efficient
+ */
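+/*
+ * All of the calls below go through the raw wrapper as
+ * __proc_info(callnum, pid, flavor, arg, buffer, buffersize).
+ */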
+
+T_DECL(proc_info_pidinfo_proc_pidlistfds,
+       "proc_info API tests to verify PROC_INFO_CALL_PIDINFO/PROC_PIDLISTFDS",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       int retval;
+       int orig_nfiles              = 0;
+       struct proc_fdinfo * fd_info = NULL;
+
+       T_LOG("Test to verify PROC_PIDLISTFDS returns sane number of open files");
+       retval      = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFDS, (uint32_t)0, (user_addr_t)0, (uint32_t)0);
+       orig_nfiles = retval / (int)sizeof(struct proc_fdinfo);
+       T_EXPECT_GE_INT(orig_nfiles, CONF_OPN_FILE_COUNT, "The number of open files is lower than expected.");
+
+       /*
+        * Allocate a buffer with one extra slot to ensure that the API still
+        * returns the expected number of fds (3 already open + 1 opened here = 4).
+        */
+       T_LOG("Test to verify PROC_PIDLISTFDS returns valid fd information");
+       fd_info = malloc(sizeof(*fd_info) * 5);
+       tmp_fd = CONF_TMP_FILE_OPEN(NULL);
+       T_LOG("tmp_fd val:%d", tmp_fd);
+       T_QUIET;
+       T_EXPECT_POSIX_SUCCESS(tmp_fd, "open() for PROC_PIDLISTFDS");
+
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFDS, (uint32_t)0, (user_addr_t)fd_info,
+                            (uint32_t)(sizeof(*fd_info) * 5));
+       retval = retval / (int)sizeof(struct proc_fdinfo);
+
+       close(tmp_fd);
+
+       for (int i = 0; i < retval; i++) {
+               /*
+                * Check only for the fd that we control.
+                */
+               if (tmp_fd != fd_info[i].proc_fd) {
+                       continue;
+               }
+               T_EXPECT_EQ_UINT(fd_info[i].proc_fdtype, (unsigned int)PROX_FDTYPE_VNODE, "Correct proc_fdtype for returned fd");
+       }
+
+       T_EXPECT_GE_INT(retval, 4, "Correct number of fds was returned.");
+
+       tmp_fd = -1;
+       free(fd_info);
+       fd_info = NULL;
+}
+
+T_DECL(proc_info_proc_pidpathinfo,
+       "Test to verify PROC_PIDPATHINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       char * pid_path = NULL;
+       pid_path        = malloc(sizeof(char) * PROC_PIDPATHINFO_MAXSIZE);
+       T_EXPECT_NOTNULL(pid_path, "malloc for PROC_PIDPATHINFO");
+       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDPATHINFO, (uint64_t)0, (user_addr_t)pid_path,
+                                (uint32_t)PROC_PIDPATHINFO_MAXSIZE);
+       T_EXPECT_EQ_INT(retval, 0, "__proc_info call for PROC_PIDPATHINFO");
+
+       T_EXPECT_NE_PTR((void *)(strcasestr(pid_path, CONF_CMD_NAME)), NULL, "PROC_PIDPATHINFO returns valid value for pid_path");
+       free(pid_path);
+       pid_path = NULL;
+}
+
+T_DECL(proc_info_proc_pidlistfileports,
+       "Test to verify PROC_PIDLISTFILEPORTS returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       struct proc_fileportinfo * fileport_info = NULL;
+       mach_port_t tmp_file_port                = MACH_PORT_NULL;
+       proc_config_t proc_config                = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid                            = proc_config->child_pids[0];
+
+       /*
+        * Create a file port
+        */
+       tmp_fd     = CONF_TMP_FILE_OPEN(NULL);
+       int retval = fileport_makeport(tmp_fd, &tmp_file_port);
+       T_EXPECT_POSIX_SUCCESS(retval, "fileport_makeport() for PROC_PIDLISTFILEPORTS");
+
+       /*
+        * Like the other APIs, this returns the actual count + 20. Hence we expect it to be at least 1 (the fileport we created)
+        */
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
+       T_EXPECT_GE_INT(retval / (int)sizeof(*fileport_info), 1,
+                       "__proc_info call for PROC_PIDLISTFILEPORTS to get total ports in parent");
+
+       /*
+        * Child doesn't have any fileports, should return zero
+        */
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
+       T_EXPECT_EQ_INT(retval / (int)sizeof(*fileport_info), 0,
+                       "__proc_info call for PROC_PIDLISTFILEPORTS to get total ports in child");
+
+       fileport_info = malloc(sizeof(*fileport_info) * (size_t)retval);
+       retval        = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)fileport_info,
+                            (uint32_t)sizeof(*fileport_info));
+       T_EXPECT_EQ_INT(retval, (int)sizeof(*fileport_info), "__proc_info call for PROC_PIDLISTFILEPORTS");
+
+       T_EXPECT_NE_UINT(fileport_info->proc_fileport, (uint32_t)0, "PROC_PIDLISTFILEPORTS returns valid value for proc_fileport");
+       T_EXPECT_EQ_UINT(fileport_info->proc_fdtype, (uint32_t)PROX_FDTYPE_VNODE,
+                        "PROC_PIDLISTFILEPORTS returns valid value for proc_fdtype");
+
+       /*
+        * Cleanup for the fileport
+        */
+       mach_port_deallocate(mach_task_self(), tmp_file_port);
+       tmp_file_port = MACH_PORT_NULL;
+       free(fileport_info);
+       fileport_info = NULL;
+       close(tmp_fd);
+       tmp_fd = -1;
+       free_proc_config(proc_config);
+}
+
+T_DECL(proc_info_proc_pidcoalitioninfo,
+       "Test to verify PROC_PIDCOALITIONINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid             = proc_config->child_pids[0];
+
+       struct proc_pidcoalitioninfo pci_parent;
+       struct proc_pidcoalitioninfo pci_child;
+       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDCOALITIONINFO, (uint64_t)0, (user_addr_t)&pci_parent,
+                                (uint32_t)sizeof(pci_parent));
+       T_EXPECT_EQ_INT(retval, (int)sizeof(pci_parent), "__proc_info call for PROC_PIDCOALITIONINFO (parent)");
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDCOALITIONINFO, (uint64_t)0, (user_addr_t)&pci_child,
+                            (uint32_t)sizeof(pci_child));
+       T_EXPECT_EQ_INT(retval, (int)sizeof(pci_child), "__proc_info call for PROC_PIDCOALITIONINFO (child)");
+
+       /*
+        * Coalition IDs should match for child and parent
+        */
+       for (int i = 0; i < COALITION_NUM_TYPES; i++) {
+               T_EXPECT_EQ_ULLONG(pci_parent.coalition_id[i], pci_child.coalition_id[i],
+                                  "PROC_PIDCOALITIONINFO returns valid value for coalition_id");
+       }
+
+       free_proc_config(proc_config);
+}
+
+T_DECL(proc_info_proc_pidworkqueueinfo,
+       "Test to verify PROC_PIDWORKQUEUEINFO returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid             = proc_config->child_pids[0];
+       send_action_to_child_processes(proc_config, ACT_PHASE5);
+
+       struct proc_workqueueinfo pwqinfo;
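+       /* give the child's workqueue threads a moment to start after ACT_PHASE5 */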
+       usleep(10000);
+       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDWORKQUEUEINFO, (uint64_t)0, (user_addr_t)&pwqinfo,
+                                (uint32_t)sizeof(pwqinfo));
+       T_EXPECT_EQ_INT(retval, (int)sizeof(pwqinfo), "__proc_info call for PROC_PIDWORKQUEUEINFO");
+
+       int ncpu         = 0;
+       size_t ncpu_size = sizeof(ncpu);
+       retval           = sysctlbyname("hw.ncpu", (void *)&ncpu, &ncpu_size, NULL, 0);
+       T_EXPECT_EQ_INT(retval, 0, "sysctl() for PROC_PIDWORKQUEUEINFO");
+       T_EXPECT_GE_UINT(pwqinfo.pwq_nthreads, (uint32_t)1, "PROC_PIDWORKQUEUEINFO returns valid value for pwq_nthreads");
+       T_EXPECT_GE_UINT(pwqinfo.pwq_blockedthreads + pwqinfo.pwq_runthreads, (uint32_t)1,
+                        "PROC_PIDWORKQUEUEINFO returns valid value for pwqinfo.pwq_runthreads/pwq_blockedthreads");
+       T_EXPECT_EQ_UINT(pwqinfo.pwq_state, (uint32_t)0, "PROC_PIDWORKQUEUEINFO returns valid value for pwq_state");
+
+       kill_child_processes(proc_config);
+       free_proc_config(proc_config);
+}
+T_DECL(proc_info_proc_pidnoteexit,
+       "Test to verify PROC_PIDNOTEEXIT returns valid information about the process",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       /*
+        * Ask the child to close pipe and quit, cleanup pipes for parent
+        */
+       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid             = proc_config->child_pids[0];
+       send_action_to_child_processes(proc_config, ACT_EXIT);
+
+       uint32_t exit_data = 0;
+       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDNOTEEXIT, (uint64_t)(NOTE_EXITSTATUS | NOTE_EXIT_DETAIL),
+                                (user_addr_t)&exit_data, (uint32_t)sizeof(exit_data));
+       T_EXPECT_EQ_INT(retval, (int)sizeof(exit_data), "__proc_info call for PROC_PIDNOTEEXIT");
+
+       T_EXPECT_EQ_UINT(exit_data, 0U, "PROC_PIDNOTEEXIT returned valid value for exit_data");
+
+       free_proc_config(proc_config);
+}
+
+T_DECL(proc_info_negative_tests,
+       "Test to validate PROC_INFO_CALL_PIDINFO for invalid arguments",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
+       int child_pid             = proc_config->child_pids[0];
+       uint32_t exit_data        = 0;
+
+       int retval =
+           __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDNOTEEXIT, (uint64_t)0, (user_addr_t)&exit_data, (uint32_t)0);
+       T_EXPECT_EQ_INT(errno, ENOMEM, "PROC_INFO_CALL_PIDINFO call should fail with ENOMEM if buffersize is zero");
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDPATHINFO, (uint64_t)0, (user_addr_t)&exit_data,
+                            (uint32_t)PROC_PIDPATHINFO_MAXSIZE + 1);
+       T_EXPECT_EQ_INT(errno, EOVERFLOW,
+                       "PROC_INFO_CALL_PIDINFO call should fail with EOVERFLOW if buffersize is larger than PROC_PIDPATHINFO_MAXSIZE");
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, -1, PROC_PIDNOTEEXIT, (uint64_t)0, (user_addr_t)&exit_data,
+                            (uint32_t)sizeof(exit_data));
+       T_EXPECT_EQ_INT(errno, ESRCH, "PROC_INFO_CALL_PIDINFO call should fail with ESRCH for invalid process id");
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, -1U, (uint64_t)0, (user_addr_t)&exit_data, (uint32_t)sizeof(exit_data));
+       T_EXPECT_EQ_INT(errno, EINVAL, "PROC_INFO_CALL_PIDINFO call should fail with EINVAL for invalid flavor");
+       retval = __proc_info(PROC_INFO_CALL_PIDINFO, 0, PROC_PIDWORKQUEUEINFO, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
+       T_EXPECT_EQ_INT(errno, EINVAL,
+                       "PROC_INFO_CALL_PIDINFO call should fail with EINVAL if flavor is PROC_PIDWORKQUEUEINFO and pid=0");
+
+       free_proc_config(proc_config);
+}
+
+/*
+ * END PROC_INFO_CALL_PIDINFO DECLs
+ */
+
+#pragma mark proc_list_uptrs
+
+#define NUPTRS 4
+static uint64_t uptrs[NUPTRS] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL, 0xaabbaaddccaaffeeULL, 0xcc000011ccaa7755ULL};
+
+static const char * uptr_names[NUPTRS];
+
+static void
+print_uptrs(int argc, char * const * argv)
+{
+       for (int i = 0; i < argc; i++) {
+               char * end;
+               unsigned long pid = strtoul(argv[i], &end, 0);
+               if (pid > INT_MAX) {
+                       printf("error: pid '%lu' would overflow an integer\n", pid);
+                       continue;
+               }
+               if (end == argv[i]) {
+                       printf("error: could not parse '%s' as a pid\n", argv[i]);
+                       continue;
+               }
+               int uptrs_count = proc_list_uptrs((int)pid, NULL, 0);
+               if (uptrs_count == 0) {
+                       printf("no uptrs for process %d\n", (int)pid);
+                       return;
+               }
+
+               /* extra space */
+               unsigned int uptrs_len = (unsigned int)uptrs_count + 32;
+
+               uint64_t * uptrs_alloc = malloc(sizeof(uint64_t) * uptrs_len);
+               os_assert(uptrs_alloc != NULL);
+
+               uptrs_count = proc_list_uptrs((int)pid, uptrs_alloc, (uint32_t)(sizeof(uint64_t) * uptrs_len));
+               printf("process %d has %d uptrs:\n", (int)pid, uptrs_count);
+               if (uptrs_count > (int)uptrs_len) {
+                       uptrs_count = (int)uptrs_len;
+               }
+               for (int j = 0; j < uptrs_count; j++) {
+                       printf("%#17" PRIx64 "\n", uptrs_alloc[j]);
+               }
+       }
+}
+
+T_DECL(proc_list_uptrs, "the kernel should return any up-pointers it knows about", T_META_ALL_VALID_ARCHS(YES))
+{
+       if (argc > 0) {
+               print_uptrs(argc, argv);
+               T_SKIP("command line invocation of tool, not test");
+       }
+
+       unsigned int cur_uptr = 0;
+
+       int kq = kqueue();
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(kq, "kqueue");
+
+       /*
+        * Should find uptrs on file-type knotes and generic knotes (two
+        * different search locations, internally).
+        */
+       struct kevent64_s events[2];
+       memset(events, 0, sizeof(events));
+
+       uptr_names[cur_uptr] = "kqueue file-backed knote";
+       events[0].filter     = EVFILT_WRITE;
+       events[0].ident      = STDOUT_FILENO;
+       events[0].flags      = EV_ADD;
+       events[0].udata      = uptrs[cur_uptr++];
+
+       uptr_names[cur_uptr] = "kqueue non-file-backed knote";
+       events[1].filter     = EVFILT_USER;
+       events[1].ident      = 1;
+       events[1].flags      = EV_ADD;
+       events[1].udata      = uptrs[cur_uptr++];
+
+       int kev_err = kevent64(kq, events, sizeof(events) / sizeof(events[0]), NULL, 0, KEVENT_FLAG_IMMEDIATE, NULL);
+       T_ASSERT_POSIX_SUCCESS(kev_err, "register events with kevent64");
+
+       /*
+        * Should find uptrs both on a kevent_id kqueue and in a workloop
+        * kqueue's knote's udata field.
+        */
+       uptr_names[cur_uptr] = "dynamic kqueue non-file-backed knote";
+       struct kevent_qos_s events_id[] = {{
+               .filter = EVFILT_USER,
+               .ident = 1,
+               .flags = EV_ADD,
+               .qos = (int)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0),
+               .udata = uptrs[cur_uptr++]
+       }};
+
+       uptr_names[cur_uptr] = "dynamic kqueue ID";
+       kev_err = kevent_id(uptrs[cur_uptr++], events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE);
+       T_ASSERT_POSIX_SUCCESS(kev_err, "register event with kevent_id");
+
+       errno           = 0;
+       int uptrs_count = proc_list_uptrs(getpid(), NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(uptrs_count, "proc_list_uptrs");
+       T_QUIET;
+       T_EXPECT_EQ(uptrs_count, NUPTRS, "should see correct number of up-pointers");
+
+       uint64_t uptrs_obs[NUPTRS] = {0};
+       uptrs_count                = proc_list_uptrs(getpid(), uptrs_obs, sizeof(uptrs_obs));
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(uptrs_count, "proc_list_uptrs");
+
+       for (int i = 0; i < uptrs_count; i++) {
+               int found = -1;
+               for (int j = 0; j < NUPTRS; j++) {
+                       if (uptrs_obs[i] == uptrs[j]) {
+                               found = j;
+                               goto next;
+                       }
+               }
+               T_FAIL("unexpected up-pointer found: %#" PRIx64, uptrs_obs[i]);
+       next:;
+               if (found != -1) {
+                       T_PASS("found up-pointer for %s", uptr_names[found]);
+               }
+       }
+
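+       /*
+        * Pass a buffer size that is not a multiple of sizeof(uint64_t) and check
+        * that the kernel does not write past the first element.
+        */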
+       uint64_t up_overflow[2] = {0};
+       uptrs_count = proc_list_uptrs(getpid(), up_overflow, sizeof(uint64_t)+1);
+       T_ASSERT_EQ(up_overflow[1], (uint64_t)0 , "overflow check");
+}
+
+#pragma mark dynamic kqueue info
+
+#define EXPECTED_ID UINT64_C(0x1122334455667788)
+#define EXPECTED_UDATA UINT64_C(0x99aabbccddeeff00)
+#ifndef KQ_WORKLOOP
+#define KQ_WORKLOOP 0x80
+#endif
+
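+/*
+ * Attach a single EVFILT_USER knote to a workloop kqueue identified by 'id' so
+ * that the dynamic-kqueue queries below have a known kqueue to find.
+ */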
+static void
+setup_kevent_id(kqueue_id_t id)
+{
+       struct kevent_qos_s events_id[] = {{
+               .filter = EVFILT_USER,
+               .ident = 1,
+               .flags = EV_ADD,
+               .qos = (int)_pthread_qos_class_encode(QOS_CLASS_DEFAULT, 0, 0),
+               .udata = EXPECTED_UDATA
+       }};
+
+       int err = kevent_id(id, events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE);
+       T_ASSERT_POSIX_SUCCESS(err, "register event with kevent_id");
+}
+
+static kqueue_id_t *
+list_kqids(pid_t pid, int * nkqids_out)
+{
+       int kqids_len = 256;
+       int nkqids;
+       kqueue_id_t * kqids = NULL;
+       uint32_t kqids_size;
+
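+       /*
+        * Start with room for 256 IDs and keep doubling, capped at
+        * PROC_PIDDYNKQUEUES_MAX, until the kernel's answer fits.
+        */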
+retry:
+       if (os_mul_overflow(sizeof(kqueue_id_t), kqids_len, &kqids_size)) {
+               T_QUIET;
+               T_ASSERT_GT(kqids_len, PROC_PIDDYNKQUEUES_MAX, NULL);
+               kqids_len = PROC_PIDDYNKQUEUES_MAX;
+               goto retry;
+       }
+       if (!kqids) {
+               kqids = malloc(kqids_size);
+               T_QUIET;
+               T_ASSERT_NOTNULL(kqids, "malloc(%" PRIu32 ")", kqids_size);
+       }
+
+       nkqids = proc_list_dynkqueueids(pid, kqids, kqids_size);
+       if (nkqids > kqids_len && kqids_len < PROC_PIDDYNKQUEUES_MAX) {
+               kqids_len *= 2;
+               if (kqids_len > PROC_PIDDYNKQUEUES_MAX) {
+                       kqids_len = PROC_PIDDYNKQUEUES_MAX;
+               }
+               free(kqids);
+               kqids = NULL;
+               goto retry;
+       }
+
+       *nkqids_out = nkqids;
+       return kqids;
+}
+
+T_DECL(list_dynamic_kqueues, "the kernel should list IDs of dynamic kqueues", T_META_ALL_VALID_ARCHS(true))
+{
+       int nkqids;
+       bool found = false;
+
+       setup_kevent_id(EXPECTED_ID);
+       kqueue_id_t * kqids = list_kqids(getpid(), &nkqids);
+       T_ASSERT_GE(nkqids, 1, "at least one dynamic kqueue is listed");
+       for (int i = 0; i < nkqids; i++) {
+               if (kqids[i] == EXPECTED_ID) {
+                       found = true;
+                       T_PASS("found expected dynamic kqueue ID");
+               } else {
+                       T_LOG("found another dynamic kqueue with ID %#" PRIx64, kqids[i]);
+               }
+       }
+
+       if (!found) {
+               T_FAIL("could not find dynamic ID of kqueue created");
+       }
+
+       free(kqids);
+}
+
+T_DECL(dynamic_kqueue_basic_info, "the kernel should report valid basic dynamic kqueue info", T_META_ALL_VALID_ARCHS(true))
+{
+       struct kqueue_info kqinfo;
+       int ret;
+
+       setup_kevent_id(EXPECTED_ID);
+       ret = proc_piddynkqueueinfo(getpid(), PROC_PIDDYNKQUEUE_INFO, EXPECTED_ID, &kqinfo, sizeof(kqinfo));
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_piddynkqueueinfo(... PROC_PIDDYNKQUEUE_INFO ...)");
+       T_QUIET;
+       T_ASSERT_GE(ret, (int)sizeof(kqinfo), "PROC_PIDDYNKQUEUE_INFO should return the right size");
+
+       T_EXPECT_NE(kqinfo.kq_state & KQ_WORKLOOP, 0U, "kqueue info should be for a workloop kqueue");
+       T_EXPECT_EQ(kqinfo.kq_stat.vst_ino, EXPECTED_ID, "inode field should be the kqueue's ID");
+}
+
+T_DECL(dynamic_kqueue_extended_info, "the kernel should report valid extended dynamic kqueue info", T_META_ALL_VALID_ARCHS(true))
+{
+       struct kevent_extinfo kqextinfo[1];
+       int ret;
+
+       setup_kevent_id(EXPECTED_ID);
+       ret = proc_piddynkqueueinfo(getpid(), PROC_PIDDYNKQUEUE_EXTINFO, EXPECTED_ID, kqextinfo, sizeof(kqextinfo));
+       T_ASSERT_POSIX_SUCCESS(ret, "proc_piddynkqueueinfo(... PROC_PIDDYNKQUEUE_EXTINFO ...)");
+       T_QUIET;
+       T_ASSERT_EQ(ret, 1, "PROC_PIDDYNKQUEUE_EXTINFO should return a single knote");
+
+       T_EXPECT_EQ(kqextinfo[0].kqext_kev.ident, 1ULL, "kevent identifier matches what was configured");
+       T_EXPECT_EQ(kqextinfo[0].kqext_kev.filter, (short)EVFILT_USER, "kevent filter matches what was configured");
+       T_EXPECT_EQ(kqextinfo[0].kqext_kev.udata, EXPECTED_UDATA, "kevent udata matches what was configured");
+}
+
+#pragma mark proc_listpids
+
+T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug", T_META_ASROOT(YES))
+{
+       int mib[4] = {CTL_KERN, KERN_KDEBUG};
+       int npids;
+       int pids[1];
+       int ret;
+       kd_regtype reg;
+       size_t regsize = sizeof(reg);
+
+       mib[2] = KERN_KDREMOVE;
+       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl");
+
+       mib[2] = KERN_KDSETBUF;
+       mib[3] = 100000;
+       ret    = sysctl(mib, 4, NULL, NULL, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDSETBUF sysctl");
+
+       mib[2] = KERN_KDSETUP;
+       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDSETUP sysctl");
+
+       npids = proc_listpids(PROC_KDBG_ONLY, 0, pids, sizeof(pids));
+       T_EXPECT_EQ(npids, 0, "no processes should be filtered initially");
+
+       reg.type   = KDBG_TYPENONE;
+       reg.value1 = (unsigned int)getpid();
+       reg.value2 = 1; /* set the pid in the filter */
+       mib[2]     = KERN_KDPIDTR;
+       ret        = sysctl(mib, 3, &reg, &regsize, NULL, 0);
+       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDPIDTR sysctl to set a pid in the filter");
+
+       npids = proc_listpids(PROC_KDBG_ONLY, 0, pids, sizeof(pids));
+       npids /= (int)sizeof(pids[0]); /* proc_listpids() returns a byte count */
+       T_EXPECT_EQ(npids, 1, "a process should be filtered");
+       T_EXPECT_EQ(pids[0], getpid(), "process filtered should be the one that was set");
+
+       mib[2] = KERN_KDREMOVE;
+       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl");
+}
diff --git a/tests/proc_info_list_kthreads.c b/tests/proc_info_list_kthreads.c
new file mode 100644 (file)
index 0000000..f7c4105
--- /dev/null
@@ -0,0 +1,110 @@
+/*
+ * proc_info_list_kthreads
+ *
+ * list 64 bit thread ids of kernel_task
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <err.h>
+
+#include <libproc.h>
+#include <strings.h>
+#include <darwintest.h>
+#include <TargetConditionals.h>
+
+#define MAX_TRIES 20
+#define EXTRA_THREADS 15
+
+#if TARGET_OS_OSX
+T_DECL(proc_info_list_kthreads,
+       "Test to verify PROC_PIDLISTTHREADIDS returns kernel thread IDs for pid 0",
+       T_META_ASROOT(true),
+       T_META_CHECK_LEAKS(false))
+#else
+T_DECL(proc_info_list_kthreads,
+       "Test to verify PROC_PIDLISTTHREADIDS returns kernel thread IDs for pid 0",
+       T_META_ASROOT(false),
+       T_META_CHECK_LEAKS(false))
+#endif /* TARGET_OS_OSX */
+{
+       int buf_used = 0;
+
+       int thread_count = 0;
+       uint64_t *thread_list = NULL;
+
+       /*
+        * To use PROC_PIDLISTTHREADIDS, we must pass a buffer of uint64_t's for each thread ID.
+        * However, there is a TOCTOU race between asking for the thread count
+        * and asking for the array of identifiers.
+        *
+        * Because the process could have allocated more threads since last we asked
+        * how many threads there are, we instead pass an extra slot in the array,
+        * and try again if it used that slot.
+        */
+
+       int attempt = 1;
+       while (!thread_count && (attempt < MAX_TRIES)) {
+               struct proc_taskinfo ti;
+
+               buf_used = proc_pidinfo(0, PROC_PIDTASKINFO, 0, &ti, sizeof(ti));
+
+               T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(buf_used, 0, "proc_pidinfo(PROC_PIDTASKINFO) returned a value > 0");
+               T_QUIET; T_ASSERT_EQ(buf_used, (int)sizeof(ti), "proc_pidinfo(PROC_PIDTASKINFO) returned size %d == %lu", buf_used, sizeof(ti));
+
+               T_LOG("The kernel says it has %d threads", ti.pti_threadnum);
+
+               int expected_size  = ti.pti_threadnum * (int)sizeof(uint64_t);
+               /* tack on EXTRA_THREADS extra slots to detect newly allocated threads */
+               int allocated_size = expected_size + EXTRA_THREADS*(int)sizeof(uint64_t);
+               uint64_t *thread_list_tmp = malloc((size_t)allocated_size);
+               T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list_tmp, "malloc(size = %d) failed", allocated_size);
+
+               buf_used = proc_pidinfo(0, PROC_PIDLISTTHREADIDS, 0, thread_list_tmp, (int)allocated_size);
+               T_LOG("proc_pidinfo(PROC_PIDLISTTHREADIDS) buf_used = %d, expected_size = %d", buf_used, expected_size);
+
+               if (buf_used == 0) {
+                       T_WITH_ERRNO; T_ASSERT_FAIL("proc_pidinfo(PROC_PIDLISTTHREADIDS) failed");
+               }
+               if (buf_used == expected_size) {
+                       /* success, we found the expected number of threads */
+                       thread_list = thread_list_tmp;
+                       thread_count = expected_size / (int)sizeof(uint64_t);
+               } else if (buf_used < expected_size) {
+                       /* there were fewer threads than we expected, fix up the allocation */
+                       thread_list = realloc(thread_list_tmp, (size_t)buf_used);
+                       thread_count = buf_used / (int)sizeof(uint64_t);
+                       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list, "realloc(size = %d) failed", buf_used);
+               } else if (buf_used > expected_size) {
+                       if (buf_used < allocated_size) {
+                               thread_list = realloc(thread_list_tmp, (size_t)buf_used);
+                               thread_count = buf_used / (int)sizeof(uint64_t);
+                               T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(thread_list, "realloc(size = %d) failed", buf_used);
+                       } else {
+                               /*
+                                * it used all the extra slots, meaning there are more
+                                * threads than we thought, try again!
+                                */
+                               T_LOG("expected %d threads, but saw an extra thread: %d",
+                                      expected_size / (int)sizeof(uint64_t), buf_used / (int)sizeof(uint64_t));
+                               free(thread_list_tmp);
+                       }
+               }
+               attempt++;
+       }
+       T_QUIET; T_ASSERT_LE(attempt, MAX_TRIES, "attempt <= MAX_TRIES");
+       T_QUIET; T_ASSERT_NOTNULL(thread_list, "thread_list != NULL");
+       T_QUIET; T_ASSERT_GT(thread_count, 0, "thread_count > 0");
+
+       struct proc_threadinfo pthinfo_64;
+       for (int i = 0 ; i < thread_count ; i++) {
+               bzero(&pthinfo_64, sizeof(struct proc_threadinfo));
+               int retval = proc_pidinfo(0, PROC_PIDTHREADID64INFO, thread_list[i],
+                                         (void *)&pthinfo_64, (uint32_t)sizeof(pthinfo_64));
+               T_QUIET; T_WITH_ERRNO; T_EXPECT_GT(retval, 0, "proc_pidinfo(PROC_PIDTHREADID64INFO) returned %d", retval);
+               T_QUIET; T_EXPECT_EQ(retval, (int)sizeof(pthinfo_64), "proc_pidinfo(PROC_PIDTHREADID64INFO) returned size %d == %lu",
+                                    retval, sizeof(pthinfo_64));
+       }
+}
+
diff --git a/tests/proc_info_list_kthreads.entitlements b/tests/proc_info_list_kthreads.entitlements
new file mode 100644 (file)
index 0000000..a333f47
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.kernel.global-proc-info</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/proc_info_udata.c b/tests/proc_info_udata.c
new file mode 100644 (file)
index 0000000..3a37cbf
--- /dev/null
@@ -0,0 +1,47 @@
+#include <darwintest.h>
+#include "../bsd/sys/proc_info.h"
+#include "../libsyscall/wrappers/libproc/libproc.h"
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+
+T_DECL(proc_udata_info, "Get and set a proc udata token")
+{
+       uint64_t token = mach_absolute_time();
+       proc_info_udata_t udata;
+       int ret;
+
+       udata = token;
+       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_SET, &udata, sizeof (udata));
+
+#if CONFIG_EMBEDDED
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET returns error on non-macOS");
+       T_SKIP("Remaining tests are only supported on macOS");
+#endif /* CONFIG_EMBEDDED */
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, 0, "proc_udata_info PROC_UDATA_INFO_SET");
+
+       T_LOG("udata set to %#llx", udata);
+
+       bzero(&udata, sizeof (udata));
+       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_GET, &udata, sizeof (udata));
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, 0, "proc_udata_info PROC_UDATA_INFO_GET");
+
+       T_ASSERT_EQ_ULLONG(token, udata, "proc_udata_info(): retrieved value matches token");
+
+       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_SET, &udata, sizeof (uint32_t));
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET with invalid size returned -1");
+       T_ASSERT_EQ_INT(errno, EINVAL, "proc_udata_info PROC_UDATA_INFO_SET with invalid size returned EINVAL");
+
+       ret = proc_udata_info(getppid(), PROC_UDATA_INFO_GET, &udata, sizeof (udata));
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_GET returned -1 on attempt against non-self pid");
+       T_ASSERT_EQ_INT(errno, EACCES, "proc_udata_info PROC_UDATA_INFO_GET set errno to EACCES on attempt against non-self pid");
+
+       ret = proc_udata_info(getppid(), PROC_UDATA_INFO_SET, &udata, sizeof (udata));
+       T_WITH_ERRNO;
+       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET returned -1 on attempt against non-self pid");
+       T_ASSERT_EQ_INT(errno, EACCES, "proc_udata_info PROC_UDATA_INFO_SET set errno to EACCES on attempt against non-self pid");
+}
diff --git a/tests/proc_uuid_policy_26567533.c b/tests/proc_uuid_policy_26567533.c
new file mode 100644 (file)
index 0000000..470d5ca
--- /dev/null
@@ -0,0 +1,42 @@
+#include <darwintest.h>
+#include <uuid/uuid.h>
+#include <System/sys/proc_uuid_policy.h>
+#include <stdint.h>
+#include <errno.h>
+
+#define NUM_PROC_UUID_POLICY_FLAGS 4
+
+T_DECL(proc_uuid_policy_26567533, "Tests passing a NULL uuid in (uap->uuid).", T_META_LTEPHASE(LTE_POSTINIT))
+{
+       int i, ret;
+       uuid_t null_uuid;
+       memset(null_uuid, 0, sizeof(uuid_t));
+
+       uint32_t policy_flags[] = {
+               PROC_UUID_POLICY_FLAGS_NONE,
+               PROC_UUID_NO_CELLULAR,
+               PROC_UUID_NECP_APP_POLICY,
+               PROC_UUID_ALT_DYLD_POLICY
+       };
+
+       for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) {
+               T_LOG("Testing policy add with flag value 0x%x", policy_flags[i]);
+
+               /* Since UUID is null, this call should fail with errno = EINVAL. */
+               ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_ADD, null_uuid, sizeof(uuid_t), policy_flags[i]);
+
+               T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret);
+               T_WITH_ERRNO;
+               T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno);
+       }
+
+       for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) {
+               T_LOG("Testing policy remove with flag value 0x%x", policy_flags[i]);
+
+               /* Since UUID is null, this call should fail with errno = EINVAL. */
+               ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_REMOVE, null_uuid, sizeof(uuid_t), policy_flags[i]);
+
+               T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret);
+               T_WITH_ERRNO;
+               T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno);
+       }
+}
diff --git a/tests/pwrite_avoid_sigxfsz_28581610.c b/tests/pwrite_avoid_sigxfsz_28581610.c
new file mode 100644 (file)
index 0000000..9c39e55
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * testname: pwrite_avoid_sigxfsz_28581610
+ */
+
+#include <darwintest.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <setjmp.h>
+#include <signal.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+
+#define TMP_FILE_PATH "/tmp/test_pwrite"
+
+static sigjmp_buf xfsz_jmpbuf;
+
+void xfsz_signal(int);
+
+void
+xfsz_signal(__unused int signo)
+{
+       siglongjmp(xfsz_jmpbuf, 1);
+}
+
+T_DECL(pwrite, "Tests avoiding SIGXFSZ with pwrite and odd offsets",
+               T_META_ASROOT(true))
+{
+       int fd, x;
+       off_t ret;
+       struct stat f_stat;
+       struct rlimit crl;
+       static const int offs[] = { -1, -1 * 1024, -1 * 1024 * 16, -1 * 1024 * 1024 * 16, 0 };
+       static unsigned char buffer[1048576];
+
+       T_SETUPBEGIN;
+       /* We expect zero SIGXFSZ signals because we have no file size limits */
+       crl.rlim_cur = crl.rlim_max = RLIM_INFINITY;
+       ret = setrlimit(RLIMIT_FSIZE, &crl);
+       T_ASSERT_POSIX_SUCCESS(ret, "setting infinite file size limit");
+
+       /* we just needed root to setup unlimited file size */
+       remove(TMP_FILE_PATH);
+       setuid(5000);
+
+       /* We just want an empty regular file to test with */
+       fd = open(TMP_FILE_PATH, O_RDWR | O_CREAT | O_EXCL, 0777);
+       T_ASSERT_POSIX_SUCCESS(fd, "opening fd on temp file %s.", TMP_FILE_PATH);
+
+       /* sanity check that this new file is really zero bytes in size */
+       ret = fstat(fd, &f_stat);
+       T_ASSERT_POSIX_SUCCESS(ret, "stat() fd on temp file.");
+       T_ASSERT_TRUE(0 == f_stat.st_size, "ensure %s is empty", TMP_FILE_PATH);
+
+       /* sanity check that ftruncate() considers negative offsets an error */
+       for (x = 0; offs[x] != 0; x++) {
+               ret = ftruncate(fd, offs[x]);
+               T_ASSERT_TRUE(((ret == -1) && (errno == EINVAL)),
+                               "negative offset %d", offs[x]);
+       }
+
+       T_SETUPEND;
+
+       /* we want to get the EFBIG errno but without a SIGXFSZ signal */
+       T_EXPECTFAIL;
+       if (!sigsetjmp(xfsz_jmpbuf, 1)) {
+               signal(SIGXFSZ, xfsz_signal);
+               ret = pwrite(fd, buffer, sizeof buffer, LONG_MAX);
+               T_ASSERT_TRUE(((ret == -1) && (errno == EFBIG)),
+                               "large offset %ld", LONG_MAX);
+       } else {
+               signal(SIGXFSZ, SIG_DFL);
+               T_FAIL("%s unexpected SIGXFSZ with offset %lX",
+                "<rdar://problem/28581610>", LONG_MAX);
+       }
+
+       /* Negative offsets are invalid, no SIGXFSZ signals required */
+       for (x = 0; offs[x] != 0; x++) {
+               /* only -1 gives the correct result */
+               if (-1 != offs[x]) {
+                       T_EXPECTFAIL;
+               }
+
+               if (!sigsetjmp(xfsz_jmpbuf, 1)) {
+                       signal(SIGXFSZ, xfsz_signal);
+                       ret = pwrite(fd, buffer, sizeof buffer, offs[x]);
+                       T_ASSERT_TRUE(((ret == -1) && (errno == EINVAL)),
+                                       "negative offset %d", offs[x]);
+               } else {
+                       signal(SIGXFSZ, SIG_DFL);
+                       T_FAIL("%s unexpected SIGXFSZ with negative offset %d",
+                   "<rdar://problem/28581610>", offs[x]);
+               }
+       }
+
+       remove(TMP_FILE_PATH);
+}
diff --git a/tests/quiesce_counter.c b/tests/quiesce_counter.c
new file mode 100644 (file)
index 0000000..563d13d
--- /dev/null
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
+ */
+
+/*
+ * Test to validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER ticks at least once per second
+ *
+ * <rdar://problem/42433973>
+ */
+
+#include <System/machine/cpu_capabilities.h>
+
+#include <darwintest.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <sys/sysctl.h>
+
+#ifndef _COMM_PAGE_CPU_QUIESCENT_COUNTER
+
+T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments",
+       T_META_CHECK_LEAKS(false))
+{
+       T_SKIP("_COMM_PAGE_CPU_QUIESCENT_COUNTER doesn't exist on this system");
+}
+
+#else /* _COMM_PAGE_CPU_QUIESCENT_COUNTER */
+
+T_DECL(test_quiescent_counter, "Validate that _COMM_PAGE_CPU_QUIESCENT_COUNTER increments",
+       T_META_CHECK_LEAKS(false))
+{
+       int rv;
+
+       uint32_t cpu_checkin_min_interval = 0; /* read below via sysctl kern.cpu_checkin_interval */
+
+       size_t value_size = sizeof(cpu_checkin_min_interval);
+       rv = sysctlbyname("kern.cpu_checkin_interval", &cpu_checkin_min_interval, &value_size, NULL, 0);
+       T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(kern.cpu_checkin_interval)");
+
+       T_LOG("kern.cpu_checkin_interval is %d", cpu_checkin_min_interval);
+
+       T_ASSERT_GT(cpu_checkin_min_interval, 0, "kern.cpu_checkin_interval should be > 0");
+
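+       /*
+        * The commpage is mapped at a fixed address in every process, so the
+        * counter can be read by dereferencing the constant directly.
+        */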
+       uint64_t* commpage_addr = (uint64_t *)(uintptr_t)_COMM_PAGE_CPU_QUIESCENT_COUNTER;
+
+       T_LOG("address of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %p", (void*) commpage_addr);
+
+       uint64_t counter = *commpage_addr;
+       uint64_t last_counter = counter;
+       T_LOG("first value of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %llu", counter);
+
+       for (int i = 0 ; i < 10 ; i++)
+       {
+               sleep(1);
+
+               last_counter = counter;
+               counter = *commpage_addr;
+
+               T_LOG("value of _COMM_PAGE_CPU_QUIESCENT_COUNTER is %llu", counter);
+
+               T_ASSERT_GT(counter, last_counter, "_COMM_PAGE_CPU_QUIESCENT_COUNTER must monotonically increase at least once per second");
+       }
+}
+
+#endif /* _COMM_PAGE_CPU_QUIESCENT_COUNTER */
+
diff --git a/tests/regression_17272465.c b/tests/regression_17272465.c
new file mode 100644 (file)
index 0000000..ed2dc10
--- /dev/null
@@ -0,0 +1,25 @@
+#include <darwintest.h>
+#include <stdio.h>
+#include <mach/mach.h>
+#include <mach/host_priv.h>
+
+
+T_DECL(regression_17272465,
+       "Test for host_set_special_port Mach port over-release, rdr: 17272465", T_META_CHECK_LEAKS(false))
+{
+       kern_return_t kr;
+       mach_port_t port = MACH_PORT_NULL;
+
+       T_SETUPBEGIN;
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), NULL);
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND), NULL);
+       T_SETUPEND;
+
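+       /*
+        * rdar://17272465: setting the same special port repeatedly used to
+        * over-release the send right; a few back-to-back calls reproduce it.
+        */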
+       (void)host_set_special_port(mach_host_self(), 30, port);
+       (void)host_set_special_port(mach_host_self(), 30, port);
+       (void)host_set_special_port(mach_host_self(), 30, port);
+
+       T_PASS("No panic occurred");
+}
diff --git a/tests/remote_time.c b/tests/remote_time.c
new file mode 100644 (file)
index 0000000..cd028a9
--- /dev/null
@@ -0,0 +1,22 @@
+#include <darwintest.h>
+#include <System/kern/remote_time.h>
+#include <mach/mach_time.h>
+#include <stdint.h>
+#include <sys/sysctl.h>
+#include <TargetConditionals.h>
+extern uint64_t __mach_bridge_remote_time(uint64_t);
+
+T_DECL(remote_time_syscall, "test mach_bridge_remote_time syscall",
+       T_META_CHECK_LEAKS(false))
+{
+#if TARGET_OS_BRIDGE
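+       /* the mach_bridge_remote_time() wrapper should agree with the raw syscall stub */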
+       uint64_t local_time = mach_absolute_time();
+       uint64_t remote_time1 = mach_bridge_remote_time(local_time);
+       uint64_t remote_time2 = __mach_bridge_remote_time(local_time);
+       T_LOG("local_time = %llu, remote_time1 = %llu, remote_time2 = %llu",
+               local_time, remote_time1, remote_time2);
+       T_ASSERT_EQ(remote_time1, remote_time2, "syscall works");
+#else
+       T_SKIP("Skipping test");
+#endif /* TARGET_OS_BRIDGE */
+}
diff --git a/tests/settimeofday_29193041.c b/tests/settimeofday_29193041.c
new file mode 100644 (file)
index 0000000..6bb495d
--- /dev/null
@@ -0,0 +1,158 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <mach/clock_types.h>
+#include <sys/mman.h>
+#include <sys/timex.h>
+#include <spawn.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#if CONFIG_EMBEDDED
+#include <sys/types.h>
+#include <pwd.h>
+#include <uuid/uuid.h>
+#endif
+
+/*
+ * This test expects the entitlement or root privileges for a process to
+ * set the time using settimeofday syscall.
+ */
+
+#define DAY 86400 //1 day in sec
+
+T_DECL(settime_32089962_not_entitled_root,
+       "Verify that root privileges can allow to change the time",
+       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       struct timeval settimeofdaytime;
+       struct timeval adj_time;
+       struct timex ntptime;
+
+       if (geteuid() != 0){
+                T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
+        }
+
+       /* test settimeofday */
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
+       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
+
+       /* test adjtime */
+       adj_time.tv_sec = 1;
+       adj_time.tv_usec = 0;
+       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
+
+       /* test ntp_adjtime */
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes |= MOD_STATUS;
+       ntptime.status = TIME_OK;
+
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+}
+
+T_DECL(settime_32089962_not_entitled_not_root,
+       "Verify that the \"com.apple.settime\" entitlement can allow to change the time",
+       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
+{
+       struct timeval settimeofdaytime;
+       struct timeval adj_time;
+       struct timex ntptime;
+       int res;
+
+       if (geteuid() == 0){
+                T_SKIP("settimeofday_29193041 test requires no root privileges to run.");
+        }
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
+
+       /* test settimeofday */
+#if TARGET_OS_EMBEDDED
+       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
+#else
+       res = settimeofday(&settimeofdaytime, NULL);
+       T_ASSERT_EQ(res, -1, NULL);
+#endif
+
+       /* test adjtime */
+       adj_time.tv_sec = 1;
+       adj_time.tv_usec = 0;
+       res = adjtime(&adj_time, NULL);
+       T_ASSERT_EQ(res, -1, NULL);
+
+       /* test ntp_adjtime */
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes |= MOD_STATUS;
+       ntptime.status = TIME_OK;
+       res = ntp_adjtime(&ntptime);
+       T_ASSERT_EQ(res, -1, NULL);
+}
+
+T_DECL(settimeofday_29193041_not_entitled_root,
+       "Verify that root privileges can allow to change the time",
+       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       struct timeval time;
+       long new_time;
+
+       if (geteuid() != 0){
+                T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
+        }
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* increment the time of one day */
+       new_time = time.tv_sec + DAY;
+
+       time.tv_sec = new_time;
+       time.tv_usec = 0;
+
+       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* expect to be past new_time */
+       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time changed with root and without entitlement");
+
+       time.tv_sec -= DAY;
+       T_QUIET;T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+}
+
+T_DECL(settimeofday_29193041_not_entitled_not_root,
+       "Verify that the \"com.apple.settime\" entitlement can allow to change the time",
+       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
+{
+       struct timeval time;
+       long new_time;
+
+       if (geteuid() == 0){
+                T_SKIP("settimeofday_29193041 test requires no root privileges to run.");
+        }
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* increment the time of one day */
+       new_time = time.tv_sec + DAY;
+
+       time.tv_sec = new_time;
+       time.tv_usec = 0;
+
+#if TARGET_OS_EMBEDDED
+       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+#else
+       int res = settimeofday(&time, NULL);
+       T_ASSERT_EQ(res, -1, NULL);
+#endif
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+#if TARGET_OS_EMBEDDED
+       /* expect to be past new_time */
+       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and without entitlement");
+       time.tv_sec -= DAY;
+       T_QUIET; T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+#else
+       T_EXPECT_LT_LONG(time.tv_sec, new_time, "Not permitted to change time without root and without entitlement");
+#endif
+
+}
diff --git a/tests/settimeofday_29193041.entitlements b/tests/settimeofday_29193041.entitlements
new file mode 100644 (file)
index 0000000..fafc6c9
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.settime</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/settimeofday_29193041_entitled.c b/tests/settimeofday_29193041_entitled.c
new file mode 100644 (file)
index 0000000..51ca5a5
--- /dev/null
@@ -0,0 +1,143 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <mach/clock_types.h>
+#include <sys/timex.h>
+#include <spawn.h>
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#if CONFIG_EMBEDDED
+#include <sys/types.h>
+#include <pwd.h>
+#include <uuid/uuid.h>
+#endif
+
+/*
+ * This test expects the entitlement or root privileges for a process to
+ * set the time using settimeofday syscall.
+ */
+
+#define DAY 86400 //1 day in sec
+
+T_DECL(settime_32089962_entitled_root,
+       "Verify that root privileges can allow to change the time",
+       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       struct timeval settimeofdaytime;
+       struct timeval adj_time;
+       struct timex ntptime;
+
+       if (geteuid() != 0){
+                T_SKIP("settime_32089962_entitled_root test requires root privileges to run.");
+        }
+
+       /* test settimeofday */
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
+       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
+
+       /* test adjtime */
+       adj_time.tv_sec = 1;
+       adj_time.tv_usec = 0;
+       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
+
+       /* test ntp_adjtime */
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes |= MOD_STATUS;
+       ntptime.status = TIME_OK;
+
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+}
+
+T_DECL(settime_32089962_entitled_not_root,
+       "Verify that the \"com.apple.settime\" entitlement can allow to change the time",
+       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
+{
+
+       struct timeval settimeofdaytime;
+       struct timeval adj_time;
+       struct timex ntptime;
+
+       if (geteuid() == 0){
+                T_SKIP("settime_32089962_entitled_root test requires no root privileges to run.");
+        }
+
+       /* test settimeofday */
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
+       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
+
+       /* test adjtime */
+       adj_time.tv_sec = 1;
+       adj_time.tv_usec = 0;
+       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
+
+       /* test ntp_adjtime */
+       memset(&ntptime, 0, sizeof(ntptime));
+       ntptime.modes |= MOD_STATUS;
+       ntptime.status = TIME_OK;
+
+       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
+
+}
+
+T_DECL(settimeofday_29193041_entitled_root,
+       "Verify that root privileges can allow to change the time",
+       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
+{
+       struct timeval time;
+       long new_time;
+
+       if (geteuid() != 0){
+                T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
+        }
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* increment the time of one day */
+       new_time = time.tv_sec + DAY;
+
+       time.tv_sec = new_time;
+       time.tv_usec = 0;
+
+       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+       /* expect to be past new_time */
+       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time changed with root and entitlement");
+
+       time.tv_sec -= DAY;
+       T_QUIET;T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+}
+
+T_DECL(settimeofday_29193041_entitled_not_root,
+       "Verify that the \"com.apple.settime\" entitlement can allow to change the time",
+       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
+{
+       struct timeval time;
+       long new_time;
+
+	if (geteuid() == 0) {
+		T_SKIP("settimeofday_29193041_entitled_not_root test must run without root privileges.");
+	}
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+	/* increment the time by one day */
+       new_time = time.tv_sec + DAY;
+
+       time.tv_sec = new_time;
+       time.tv_usec = 0;
+
+       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
+
+	/* expect to be past new_time */
+       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and with entitlement");
+
+       time.tv_sec -= DAY;
+       T_QUIET; T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
+}
diff --git a/tests/sigchld_return.c b/tests/sigchld_return.c
new file mode 100644 (file)
index 0000000..6a3cc6b
--- /dev/null
@@ -0,0 +1,50 @@
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <darwintest.h>
+
+
+static int exitcode = 0x6789BEEF;
+static volatile sig_atomic_t should_exit = 0;
+
+void handler (int sig, siginfo_t *sip, __unused void *uconp)
+{
+        /* Should handle the SIGCHLD signal */
+        T_ASSERT_EQ_INT(sig, SIGCHLD, "Captured signal returns 0x%x, expected SIGCHLD (0x%x).", sig, SIGCHLD);
+        T_QUIET; T_ASSERT_NOTNULL(sip, "siginfo_t returned NULL but should have returned data.");
+        T_ASSERT_EQ_INT(sip->si_code, CLD_EXITED, "si_code returns 0x%x, expected CLD_EXITED (0x%x).", sip->si_code, CLD_EXITED);
+        T_ASSERT_EQ_INT(sip->si_status, exitcode, "si_status returns 0x%08X, expected the child's exit code (0x%08X).", sip->si_status, exitcode);
+        should_exit = 1;
+}
+
+
+T_DECL(sigchldreturn, "checks that a child process exited with an exitcode returns correctly to parent", T_META_CHECK_LEAKS(false))
+{
+        struct sigaction act;
+        pid_t pid;
+
+        act.sa_sigaction = handler;
+        act.sa_flags = SA_SIGINFO;
+        sigemptyset(&act.sa_mask);
+
+        /* Set action for signal */
+        T_QUIET; T_ASSERT_POSIX_SUCCESS(sigaction (SIGCHLD, &act, NULL), "Calling sigaction() failed for SIGCHLD");
+
+        /* Now fork a child that just exits */
+        pid = fork();
+        T_QUIET; T_ASSERT_NE_INT(pid, -1, "fork() failed!");
+
+        if (pid == 0) {
+                /* Child process! */
+                exit (exitcode);
+        }
+
+        /* Main program that did the fork */
+        /* We should process the signal, then exit */
+        while (!should_exit) {
+                sleep(1);
+        }
+}
+
diff --git a/tests/sigcont_return.c b/tests/sigcont_return.c
new file mode 100644 (file)
index 0000000..606caa9
--- /dev/null
@@ -0,0 +1,28 @@
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include <darwintest.h>
+
+T_DECL(sigcontreturn, "checks that a call to waitid() for a child that is stopped and then continued returns correctly")
+{
+        pid_t           pid;
+        siginfo_t       siginfo;
+        pid = fork();
+        T_QUIET; T_ASSERT_NE_INT(pid, -1, "fork() failed!");
+
+        if (pid == 0) {
+                while(1){}
+        }
+
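+        /* Stop and then continue the child so waitid(WCONTINUED) has a continued state to report. */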
+        kill(pid, SIGSTOP);
+        kill(pid, SIGCONT);
+        sleep(1);
+
+        T_QUIET; T_ASSERT_POSIX_SUCCESS(waitid(P_PID, pid, &siginfo, WCONTINUED), "Calling waitid() failed for pid %d", pid);
+
+        T_ASSERT_EQ_INT(siginfo.si_status, SIGCONT, "A call to waitid() for stopped and continued child returns 0x%x, expected SIGCONT (0x%x)", siginfo.si_status, SIGCONT );
+        kill(pid, SIGKILL);
+}
diff --git a/tests/socket_bind_35243417.c b/tests/socket_bind_35243417.c
new file mode 100644 (file)
index 0000000..cb44aa5
--- /dev/null
@@ -0,0 +1,107 @@
+/* -*- Mode: c; tab-width: 8; indent-tabs-mode: 1; c-basic-offset: 8; -*- */
+
+#include <darwintest.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+
+static int
+sockv6_open(void)
+{
+       int     s;
+
+       s = socket(AF_INET6, SOCK_DGRAM, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(s, "socket(AF_INET6, SOCK_DGRAM, 0)");
+       return (s);
+}
+
+static int
+sockv6_bind(int s, in_port_t port)
+{
+       struct sockaddr_in6     sin6;
+
+       bzero(&sin6, sizeof(sin6));
+       sin6.sin6_len = sizeof(sin6);
+       sin6.sin6_family = AF_INET6;
+       sin6.sin6_port = port;
+       return (bind(s, (const struct sockaddr *)&sin6, sizeof(sin6)));
+}
+
+static void
+sockv6_set_v6only(int s)
+{
+       int             on = 1;
+       int             ret;
+
+       ret = setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
+       T_QUIET;
+	T_ASSERT_POSIX_SUCCESS(ret, "setsockopt(%d, IPV6_V6ONLY)", s);
+}
+
+static bool
+alloc_and_bind_ports(in_port_t port_start, in_port_t port_end,
+                    int bind_attempts)
+{
+       int     bound_count = 0;
+       bool    success = true;
+
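+	/*
+	 * For each port that an IPV6_V6ONLY socket can claim, verify that a
+	 * second IPv6 socket cannot bind the same port, then rebind that second
+	 * socket to the next port; the rebind is the step that used to panic
+	 * (see the comment below).
+	 */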
+       for (in_port_t i = port_start; success && i <= port_end; i++) {
+               int     s6 = -1;
+               int     s6_other = -1;
+               int     ret;
+
+               s6 = sockv6_open();
+               sockv6_set_v6only(s6);
+               if (sockv6_bind(s6, i) != 0) {
+                       /* find the next available port */
+                       goto loop_done;
+               }
+               s6_other = sockv6_open();
+               ret = sockv6_bind(s6_other, i);
+               T_WITH_ERRNO;
+               T_QUIET;
+               T_ASSERT_TRUE(ret != 0, "socket %d bind %d", s6_other, i);
+               /*
+                * After bind fails, try binding to a different port.
+                * For non-root user, this will panic without the fix for
+                * <rdar://problem/35243417>.
+                */
+               if (sockv6_bind(s6_other, i + 1) == 0) {
+                       bound_count++;
+                       if (bound_count >= bind_attempts) {
+                               break;
+                       }
+               }
+       loop_done:
+               if (s6 >= 0) {
+                       close(s6);
+               }
+               if (s6_other >= 0) {
+                       close(s6_other);
+               }
+       }
+       T_ASSERT_TRUE(bound_count == bind_attempts,
+                     "number of successful binds %d (out of %d)",
+                     bound_count, bind_attempts);
+       return (success);
+}
+
+
+T_DECL(socket_bind_35243417,
+	"bind an IPv6-only UDP socket, then bind a second IPv6 UDP socket to the same port.",
+       T_META_ASROOT(false),
+       T_META_CHECK_LEAKS(false))
+{
+       alloc_and_bind_ports(1, 65534, 10);
+}
+
+T_DECL(socket_bind_35243417_root,
+	"bind an IPv6-only UDP socket, then bind a second IPv6 UDP socket to the same port, as root.",
+       T_META_ASROOT(true))
+{
+       alloc_and_bind_ports(1, 65534, 10);
+}
diff --git a/tests/socket_bind_35685803.c b/tests/socket_bind_35685803.c
new file mode 100644 (file)
index 0000000..d0e22a9
--- /dev/null
@@ -0,0 +1,205 @@
+/* -*- Mode: c; tab-width: 8; indent-tabs-mode: 1; c-basic-offset: 8; -*- */
+
+#include <darwintest.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <errno.h>
+#include <pthread.h>
+#include <stdbool.h>
+
+static bool debug;
+
+static int
+sock_open_common(int pf, int type)
+{
+       int     s;
+
+       s = socket(pf, type, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(s, "socket(%d, %d, 0)", pf, type);
+       return (s);
+}
+
+static int
+sock_open(int type)
+{
+       return (sock_open_common(PF_INET, type));
+}
+
+static int
+sock_bind(int s, int port)
+{
+       struct sockaddr_in      sin = {
+               .sin_len = sizeof(sin),
+               .sin_family = AF_INET,
+       };
+
+       sin.sin_port = htons(port);
+       return (bind(s, (const struct sockaddr *)&sin, sizeof(sin)));
+}
+
+static int
+sockv6_open(int type)
+{
+       return (sock_open_common(PF_INET6, type));
+}
+
+static int
+sockv6_bind(int s, int port)
+{
+       struct sockaddr_in6             sin6 = {
+               .sin6_len = sizeof(sin6),
+               .sin6_family = AF_INET6,
+       };
+
+       sin6.sin6_port = htons(port);
+       return (bind(s, (const struct sockaddr *)&sin6, sizeof(sin6)));
+}
+
+static uint16_t
+sock_get_port(int sockfd)
+{
+       int                             error;
+       uint16_t                        p;
+       union sockaddr_in_4_6   sin;
+       socklen_t                       sin_len;
+
+       sin_len = sizeof(sin);
+       bzero(&sin, sin_len);
+       error = getsockname(sockfd, (struct sockaddr *)&sin, &sin_len);
+       T_QUIET;
+       T_EXPECT_POSIX_ZERO(error, "getsockname(%d)", sockfd);
+       if (error != 0) {
+               return (0);
+       }
+       switch (sin.sa.sa_family) {
+       case AF_INET:
+               p = sin.sin.sin_port;
+               break;
+       case AF_INET6:
+               p = sin.sin6.sin6_port;
+               break;
+       default:
+               T_ASSERT_FAIL("unknown address family %d\n",
+                             sin.sa.sa_family);
+               p = 0;
+               break;
+       }
+       return (p);
+}
+
+typedef struct {
+       bool    v6;
+       int             socket_count;
+       int *   socket_list;
+} SocketInfo, * SocketInfoRef;
+
+static void
+bind_sockets(SocketInfoRef info, const char * msg)
+{
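+	/*
+	 * Bind every socket to port 0 so the kernel picks an ephemeral port;
+	 * this runs concurrently on the main thread and the second thread over
+	 * the same descriptors.
+	 */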
+       for (int i = 0; i < info->socket_count; i++) {
+               int             error;
+               uint16_t        port;
+
+               if (info->v6) {
+                       error = sockv6_bind(info->socket_list[i], 0);
+               }
+               else {
+                       error = sock_bind(info->socket_list[i], 0);
+               }
+               port = sock_get_port(info->socket_list[i]);
+               if (debug) {
+                       T_LOG( "%s: fd %d port is %d error %d",
+                              msg, info->socket_list[i], ntohs(port), error);
+               }
+       }
+       return;
+}
+
+static void *
+second_thread(void * arg)
+{
+       SocketInfoRef   info = (SocketInfoRef)arg;
+
+       bind_sockets(info, "second");
+       return (NULL);
+}
+
+static void
+multithreaded_bind_test(bool v6, int socket_count)
+{
+       int             error;
+       SocketInfo      info;
+       int     socket_list[socket_count];
+       pthread_t       thread;
+
+       info.v6 = v6;
+       for (int i = 0; i < socket_count; i++) {
+               if (v6) {
+                       socket_list[i] = sockv6_open(SOCK_STREAM);
+               } else {
+                       socket_list[i] = sock_open(SOCK_STREAM);
+               }
+       }
+       info.socket_count = socket_count;
+       info.socket_list = socket_list;
+       error = pthread_create(&thread, NULL, second_thread, &info);
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(error, "pthread_create");
+
+       /* compete with second thread */
+       bind_sockets(&info, "main");
+       error = pthread_join(thread, NULL);
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(error, "pthread_join");
+
+       for (int i = 0; i < socket_count; i++) {
+               error = close(socket_list[i]);
+               T_QUIET;
+               T_ASSERT_POSIX_ZERO(error, "close socket %d", socket_list[i]);
+       }
+}
+
+static void
+run_multithreaded_bind_test(int number_of_runs, bool v6, int socket_count)
+{
+       for (int i = 0; i < number_of_runs; i++) {
+               multithreaded_bind_test(v6, socket_count);
+       }
+       T_PASS("multithreaded_bind_test %s", v6 ? "IPv6" : "IPv4");
+}
+
+T_DECL(socket_bind_35685803,
+	"multithreaded bind IPv4 socket",
+       T_META_ASROOT(false),
+       T_META_CHECK_LEAKS(false))
+{
+       run_multithreaded_bind_test(100, false, 100);
+}
+
+T_DECL(socket_bind_35685803_root,
+	"multithreaded bind IPv4 socket as root",
+       T_META_ASROOT(true))
+{
+       run_multithreaded_bind_test(100, false, 100);
+}
+
+T_DECL(socket_bind_35685803_v6,
+	"multithreaded bind IPv6 socket",
+       T_META_ASROOT(false),
+       T_META_CHECK_LEAKS(false))
+{
+       run_multithreaded_bind_test(100, true, 100);
+}
+
+T_DECL(socket_bind_35685803_v6_root,
+	"multithreaded bind IPv6 socket as root",
+       T_META_ASROOT(true))
+{
+       run_multithreaded_bind_test(100, true, 100);
+}
diff --git a/tests/socket_poll_close_25786011.c b/tests/socket_poll_close_25786011.c
new file mode 100644 (file)
index 0000000..b39b936
--- /dev/null
@@ -0,0 +1,35 @@
+#include <darwintest.h>
+#include <poll.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+T_DECL(socket_poll_close_25786011, "Tests an invalid poll() call on a socket followed by a close().", T_META_LTEPHASE(LTE_POSTINIT))
+{
+       int my_socket, ret;
+
+       my_socket = socket(PF_LOCAL, SOCK_STREAM, 0);
+       T_WITH_ERRNO; T_ASSERT_TRUE(my_socket > 0, "create socket");
+
+       /*
+        * Setup a pollfd that we know will return an error when we try
+        * to create a knote for it. We specify a BSD vnode specific event
+        * for a socket.
+        */
+       struct pollfd my_pollfd = {
+               .fd = my_socket,
+               .events = POLLEXTEND
+       };
+
+       /*
+        * Previously the call to kevent_register() in the kernel from this call
+        * would leak an iocount reference on the fileproc, which would cause any
+        * subsequent calls to close() on the associated fd to block indefinitely.
+        */
+       ret = poll(&my_pollfd, 1, 0);
+       T_WITH_ERRNO; T_ASSERT_TRUE(ret == 1, "poll returned %d", ret);
+
+       ret = close(my_socket);
+       T_ASSERT_POSIX_ZERO(ret, "close on socket with fd %d\n", my_socket);
+
+       T_PASS("socket_poll_close_25786011 PASSED");
+}
diff --git a/tests/stackshot.m b/tests/stackshot.m
new file mode 100644 (file)
index 0000000..7aef17c
--- /dev/null
@@ -0,0 +1,1022 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <kern/debug.h>
+#include <kern/kern_cdata.h>
+#include <kdd.h>
+#include <libproc.h>
+#include <mach-o/dyld.h>
+#include <mach-o/dyld_priv.h>
+#include <sys/syscall.h>
+#include <sys/stackshot.h>
+
+/*
+ * mirrors the dyld_cache_header struct defined in dyld_cache_format.h from dyld source code
+ * TODO: remove once rdar://42361850 is in the build
+ */
+struct dyld_cache_header
+{
+    char       magic[16];                              // e.g. "dyld_v0    i386"
+    uint32_t   mappingOffset;          // file offset to first dyld_cache_mapping_info
+    uint32_t    mappingCount;           // number of dyld_cache_mapping_info entries
+    uint32_t    imagesOffset;           // file offset to first dyld_cache_image_info
+    uint32_t    imagesCount;            // number of dyld_cache_image_info entries
+    uint64_t    dyldBaseAddress;        // base address of dyld when cache was built
+    uint64_t    codeSignatureOffset;    // file offset of code signature blob
+    uint64_t    codeSignatureSize;             // size of code signature blob (zero means to end of file)
+    uint64_t    slideInfoOffset;        // file offset of kernel slid info
+    uint64_t    slideInfoSize;          // size of kernel slid info
+    uint64_t    localSymbolsOffset;     // file offset of where local symbols are stored
+    uint64_t    localSymbolsSize;       // size of local symbols information
+    uint8_t     uuid[16];               // unique value for each shared cache file
+    uint64_t    cacheType;              // 0 for development, 1 for production
+    uint32_t    branchPoolsOffset;      // file offset to table of uint64_t pool addresses
+    uint32_t    branchPoolsCount;       // number of uint64_t entries
+    uint64_t    accelerateInfoAddr;     // (unslid) address of optimization info
+    uint64_t    accelerateInfoSize;     // size of optimization info
+    uint64_t    imagesTextOffset;       // file offset to first dyld_cache_image_text_info
+    uint64_t    imagesTextCount;        // number of dyld_cache_image_text_info entries
+    uint64_t    dylibsImageGroupAddr;   // (unslid) address of ImageGroup for dylibs in this cache
+    uint64_t    dylibsImageGroupSize;   // size of ImageGroup for dylibs in this cache
+    uint64_t    otherImageGroupAddr;    // (unslid) address of ImageGroup for other OS dylibs
+    uint64_t    otherImageGroupSize;    // size of ImageGroup for other OS dylibs
+    uint64_t    progClosuresAddr;       // (unslid) address of list of program launch closures
+    uint64_t    progClosuresSize;       // size of list of program launch closures
+    uint64_t    progClosuresTrieAddr;   // (unslid) address of trie of indexes into program launch closures
+    uint64_t    progClosuresTrieSize;   // size of trie of indexes into program launch closures
+    uint32_t    platform;               // platform number (macOS=1, etc)
+    uint32_t    formatVersion        : 8,  // dyld3::closure::kFormatVersion
+                dylibsExpectedOnDisk : 1,  // dyld should expect the dylib exists on disk and to compare inode/mtime to see if cache is valid
+                simulator            : 1,  // for simulator of specified platform
+                locallyBuiltCache    : 1,  // 0 for B&I built cache, 1 for locally built cache
+                padding              : 21; // TBD
+};
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.stackshot"),
+               T_META_CHECK_LEAKS(false),
+               T_META_ASROOT(true)
+               );
+
+static const char *current_process_name(void);
+static void verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count);
+static void parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid);
+static void parse_thread_group_stackshot(void **sbuf, size_t sslen);
+static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen);
+static void initialize_thread(void);
+
+#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024)
+#define MAX_STACKSHOT_BUFFER_SIZE     (6 * 1024 * 1024)
+
+/* bit flags for parse_stackshot */
+#define PARSE_STACKSHOT_DELTA 0x1
+#define PARSE_STACKSHOT_ZOMBIE 0x2
+#define PARSE_STACKSHOT_SHAREDCACHE_LAYOUT 0x4
+
+T_DECL(microstackshots, "test the microstackshot syscall")
+{
+       void *buf = NULL;
+       unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE;
+
+       while (1) {
+               buf = malloc(size);
+               T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer");
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+               int len = syscall(SYS_microstackshot, buf, size,
+                               STACKSHOT_GET_MICROSTACKSHOT);
+#pragma clang diagnostic pop
+		if (len == -1 && errno == ENOSYS) {
+                       T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY");
+               }
+               if (len == -1 && errno == ENOSPC) {
+                       /* syscall failed because buffer wasn't large enough, try again */
+                       free(buf);
+                       buf = NULL;
+                       size *= 2;
+                       T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE,
+                                       "growing stackshot buffer to sane size");
+                       continue;
+               }
+               T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall");
+               break;
+	}
+
+       T_EXPECT_EQ(*(uint32_t *)buf,
+                       (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC,
+                       "magic value for microstackshot matches");
+
+       free(buf);
+}
+
+struct scenario {
+       const char *name;
+       uint32_t flags;
+       bool should_fail;
+       bool maybe_unsupported;
+       pid_t target_pid;
+       uint64_t since_timestamp;
+       uint32_t size_hint;
+       dt_stat_time_t timer;
+};
+
+static void
+quiet(struct scenario *scenario)
+{
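+	/* During performance runs (timer attached), mark the next assertion quiet to cut logging noise. */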
+       if (scenario->timer) {
+               T_QUIET;
+       }
+}
+
+static void
+take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size))
+{
+       initialize_thread();
+
+       void *config = stackshot_config_create();
+       quiet(scenario);
+       T_ASSERT_NOTNULL(config, "created stackshot config");
+
+       int ret = stackshot_config_set_flags(config, scenario->flags);
+       quiet(scenario);
+       T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags);
+
+       if (scenario->size_hint > 0) {
+               ret = stackshot_config_set_size_hint(config, scenario->size_hint);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config",
+                               scenario->size_hint);
+       }
+
+       if (scenario->target_pid > 0) {
+               ret = stackshot_config_set_pid(config, scenario->target_pid);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config",
+                               scenario->target_pid);
+       }
+
+       if (scenario->since_timestamp > 0) {
+               ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp);
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config",
+                               scenario->since_timestamp);
+       }
+
+       int retries_remaining = 5;
+
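+	/*
+	 * stackshot_capture_with_config() can fail transiently with EBUSY or
+	 * ETIMEDOUT (e.g. when another stackshot is already in progress), so
+	 * retry a few times before giving up.
+	 */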
+retry: ;
+       uint64_t start_time = mach_absolute_time();
+       ret = stackshot_capture_with_config(config);
+       uint64_t end_time = mach_absolute_time();
+
+       if (scenario->should_fail) {
+               T_EXPECTFAIL;
+               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+               return;
+       }
+
+       if (ret == EBUSY || ret == ETIMEDOUT) {
+               if (retries_remaining > 0) {
+                       if (!scenario->timer) {
+                               T_LOG("stackshot_capture_with_config failed with %s (%d), retrying",
+                                               strerror(ret), ret);
+                       }
+
+                       retries_remaining--;
+                       goto retry;
+               } else {
+                       T_ASSERT_POSIX_ZERO(ret,
+                                       "called stackshot_capture_with_config (no retries remaining)");
+               }
+       } else if ((ret == ENOTSUP) && scenario->maybe_unsupported) {
+               T_SKIP("kernel indicated this stackshot configuration is not supported");
+       } else {
+               quiet(scenario);
+               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+       }
+
+       if (scenario->timer) {
+               dt_stat_mach_time_add(scenario->timer, end_time - start_time);
+       }
+       void *buf = stackshot_config_get_stackshot_buffer(config);
+       size_t size = stackshot_config_get_stackshot_size(config);
+       if (scenario->name) {
+               char sspath[MAXPATHLEN];
+               strlcpy(sspath, scenario->name, sizeof(sspath));
+               strlcat(sspath, ".kcdata", sizeof(sspath));
+               T_QUIET; T_ASSERT_POSIX_ZERO(dt_resultfile(sspath, sizeof(sspath)),
+                               "create result file path");
+
+               T_LOG("writing stackshot to %s", sspath);
+
+               FILE *f = fopen(sspath, "w");
+               T_WITH_ERRNO; T_QUIET; T_ASSERT_NOTNULL(f,
+                               "open stackshot output file");
+
+               size_t written = fwrite(buf, size, 1, f);
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(written, "wrote stackshot to file");
+
+               fclose(f);
+       }
+       cb(buf, size);
+
+       ret = stackshot_config_dealloc(config);
+       T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
+}
+
+T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed")
+{
+       struct scenario scenario = {
+               .name = "kcdata",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS |
+                               STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking kcdata stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+       });
+}
+
+T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed")
+{
+       struct scenario scenario = {
+               .name = "faulting",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
+                               | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING),
+       };
+
+       T_LOG("taking faulting stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+       });
+}
+
+T_DECL(bad_flags, "test a poorly-formed stackshot syscall")
+{
+       struct scenario scenario = {
+               .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */,
+               .should_fail = true,
+       };
+
+       T_LOG("attempting to take stackshot with kernel-only flag");
+       take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) {
+               T_ASSERT_FAIL("stackshot data callback called");
+       });
+}
+
+T_DECL(delta, "test delta stackshots")
+{
+       struct scenario scenario = {
+               .name = "delta",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking full stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
+
+               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
+
+               parse_stackshot(0, ssbuf, sslen, -1);
+
+               struct scenario delta_scenario = {
+                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
+                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
+                       .since_timestamp = stackshot_time
+               };
+
+               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
+                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
+               });
+       });
+}
+
+T_DECL(shared_cache_layout, "test stackshot inclusion of shared cache layout")
+{
+       struct scenario scenario = {
+               .name = "shared_cache_layout",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT |
+                               STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT),
+       };
+
+       T_LOG("taking stackshot with STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT set");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(PARSE_STACKSHOT_SHAREDCACHE_LAYOUT, ssbuf, sslen, -1);
+       });
+}
+
+static void *stuck_sysctl_thread(void *arg) {
+       int val = 1;
+       dispatch_semaphore_t child_thread_started = *(dispatch_semaphore_t *)arg;
+
+       dispatch_semaphore_signal(child_thread_started);
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.wedge_thread", NULL, NULL, &val, sizeof(val)), "wedge child thread");
+
+       return NULL;
+}
+
+T_HELPER_DECL(zombie_child, "child process to sample as a zombie")
+{
+       pthread_t pthread;
+       dispatch_semaphore_t child_thread_started = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_thread_started, "zombie child thread semaphore");
+
+       /* spawn another thread to get stuck in the kernel, then call exit() to become a zombie */
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pthread_create(&pthread, NULL, stuck_sysctl_thread, &child_thread_started), "pthread_create");
+
+       dispatch_semaphore_wait(child_thread_started, DISPATCH_TIME_FOREVER);
+
+       /* sleep for a bit in the hope of ensuring that the other thread has called the sysctl before we signal the parent */
+       usleep(100);
+       T_ASSERT_POSIX_SUCCESS(kill(getppid(), SIGUSR1), "signaled parent to take stackshot");
+
+       exit(0);
+}
+
+T_DECL(zombie, "tests a stackshot of a zombie task with a thread stuck in the kernel")
+{
+       char path[PATH_MAX];
+       uint32_t path_size = sizeof(path);
+       T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+       char *args[] = { path, "-n", "zombie_child", NULL };
+
+       dispatch_source_t child_sig_src;
+       dispatch_semaphore_t child_ready_sem = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(child_ready_sem, "zombie child semaphore");
+
+       dispatch_queue_t signal_processing_q = dispatch_queue_create("signal processing queue", NULL);
+	T_QUIET; T_ASSERT_NOTNULL(signal_processing_q, "signal processing queue");
+
+       pid_t pid;
+
+       T_LOG("spawning a child");
+
+       signal(SIGUSR1, SIG_IGN);
+       child_sig_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, signal_processing_q);
+       T_QUIET; T_ASSERT_NOTNULL(child_sig_src, "dispatch_source_create (child_sig_src)");
+
+       dispatch_source_set_event_handler(child_sig_src, ^{ dispatch_semaphore_signal(child_ready_sem); });
+       dispatch_activate(child_sig_src);
+
+       int sp_ret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
+
+       dispatch_semaphore_wait(child_ready_sem, DISPATCH_TIME_FOREVER);
+
+       T_LOG("received signal from child, capturing stackshot");
+
+       struct proc_bsdshortinfo bsdshortinfo;
+       int retval, iterations_to_wait = 10;
+
+       while (iterations_to_wait > 0) {
+               retval = proc_pidinfo(pid, PROC_PIDT_SHORTBSDINFO, 0, &bsdshortinfo, sizeof(bsdshortinfo));
+               if ((retval == 0) && errno == ESRCH) {
+                       T_LOG("unable to find child using proc_pidinfo, assuming zombie");
+                       break;
+               }
+
+               T_QUIET; T_WITH_ERRNO; T_ASSERT_GT(retval, 0, "proc_pidinfo(PROC_PIDT_SHORTBSDINFO) returned a value > 0");
+               T_QUIET; T_ASSERT_EQ(retval, (int)sizeof(bsdshortinfo), "proc_pidinfo call for PROC_PIDT_SHORTBSDINFO returned expected size");
+
+               if (bsdshortinfo.pbsi_flags & PROC_FLAG_INEXIT) {
+                       T_LOG("child proc info marked as in exit");
+                       break;
+               }
+
+               iterations_to_wait--;
+               if (iterations_to_wait == 0) {
+                       /*
+                        * This will mark the test as failed but let it continue so we
+                        * don't leave a process stuck in the kernel.
+                        */
+                       T_FAIL("unable to discover that child is marked as exiting");
+               }
+
+               /* Give the child a few more seconds to make it to exit */
+               sleep(5);
+       }
+
+       /* Give the child some more time to make it through exit */
+       sleep(10);
+
+       struct scenario scenario = {
+               .name = "zombie",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       take_stackshot(&scenario, ^( void *ssbuf, size_t sslen) {
+               /* First unwedge the child so we can reap it */
+               int val = 1, status;
+               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.unwedge_thread", NULL, NULL, &val, sizeof(val)), "unwedge child");
+
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(pid, &status, 0), "waitpid on zombie child");
+
+               parse_stackshot(PARSE_STACKSHOT_ZOMBIE, ssbuf, sslen, pid);
+       });
+}
+
+static void
+expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen)
+{
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+
+       bool in_task = false;
+       bool in_thread = false;
+       bool saw_instrs_cycles = false;
+       iter = kcdata_iter_next(iter);
+
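+	/* Every task and thread container in the stackshot is expected to carry a STACKSHOT_KCTYPE_INSTRS_CYCLES record. */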
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_CONTAINER_BEGIN:
+                       switch (kcdata_iter_container_type(iter)) {
+                       case STACKSHOT_KCCONTAINER_TASK:
+                               in_task = true;
+                               saw_instrs_cycles = false;
+                               break;
+
+                       case STACKSHOT_KCCONTAINER_THREAD:
+                               in_thread = true;
+                               saw_instrs_cycles = false;
+                               break;
+
+                       default:
+                               break;
+                       }
+                       break;
+
+               case STACKSHOT_KCTYPE_INSTRS_CYCLES:
+                       saw_instrs_cycles = true;
+                       break;
+
+               case KCDATA_TYPE_CONTAINER_END:
+                       if (in_thread) {
+                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
+                                               "saw instructions and cycles in thread");
+                               in_thread = false;
+                       } else if (in_task) {
+                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles,
+                                               "saw instructions and cycles in task");
+                               in_task = false;
+                       }
+
+               default:
+                       break;
+               }
+       }
+}
+
+static void
+skip_if_monotonic_unsupported(void)
+{
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+       int ret = sysctlbyname("kern.monotonic.supported", &supported,
+                       &supported_size, 0, 0);
+       if (ret < 0 || !supported) {
+               T_SKIP("monotonic is unsupported");
+       }
+}
+
+T_DECL(instrs_cycles, "test getting instructions and cycles in a stackshot")
+{
+       skip_if_monotonic_unsupported();
+
+       struct scenario scenario = {
+               .name = "instrs-cycles",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("attempting to take stackshot with instructions and cycles");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_stackshot(0, ssbuf, sslen, -1);
+               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
+       });
+}
+
+T_DECL(delta_instrs_cycles,
+               "test delta stackshots with instructions and cycles")
+{
+       skip_if_monotonic_unsupported();
+
+       struct scenario scenario = {
+               .name = "delta-instrs-cycles",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("taking full stackshot");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
+
+               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
+
+               parse_stackshot(0, ssbuf, sslen, -1);
+               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
+
+               struct scenario delta_scenario = {
+                       .name = "delta-instrs-cycles-next",
+                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
+                                       | STACKSHOT_KCDATA_FORMAT
+                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
+                       .since_timestamp = stackshot_time,
+               };
+
+               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
+                       parse_stackshot(PARSE_STACKSHOT_DELTA, dssbuf, dsslen, -1);
+                       expect_instrs_cycles_in_stackshot(dssbuf, dsslen);
+               });
+       });
+}
+
+static void
+check_thread_groups_supported()
+{
+       int err;
+       int supported = 0;
+       size_t supported_size = sizeof(supported);
+       err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0);
+
+       if (err || !supported)
+               T_SKIP("thread groups not supported on this system");
+}
+
+T_DECL(thread_groups, "test getting thread groups in stackshot")
+{
+       check_thread_groups_supported();
+
+       struct scenario scenario = {
+               .name = "thread-groups",
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP
+                               | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       T_LOG("attempting to take stackshot with thread group flag");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_thread_group_stackshot(ssbuf, sslen);
+       });
+}
+
+static void
+parse_page_table_asid_stackshot(void **ssbuf, size_t sslen)
+{
+       bool seen_asid = false;
+       bool seen_page_table_snapshot = false;
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                       "buffer provided is a stackshot");
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_PAGE_TABLES) {
+                               continue;
+                       }
+
+                       T_ASSERT_FALSE(seen_page_table_snapshot, "check that we haven't yet seen a page table snapshot");
+                       seen_page_table_snapshot = true;
+
+                       T_ASSERT_EQ((size_t) kcdata_iter_array_elem_size(iter), sizeof(uint64_t),
+                               "check that each element of the pagetable dump is the expected size");
+
+                       uint64_t *pt_array = kcdata_iter_payload(iter);
+                       uint32_t elem_count = kcdata_iter_array_elem_count(iter);
+                       uint32_t j;
+                       bool nonzero_tte = false;
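+			/*
+			 * The page table dump is a sequence of segments: a four-word
+			 * header (physical address, entry count, start VA, end VA)
+			 * followed by the raw table entries.
+			 */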
+                       for (j = 0; j < elem_count;) {
+                               T_QUIET; T_ASSERT_LE(j + 4, elem_count, "check for valid page table segment header");
+                               uint64_t pa = pt_array[j];
+                               uint64_t num_entries = pt_array[j + 1];
+                               uint64_t start_va = pt_array[j + 2];
+                               uint64_t end_va = pt_array[j + 3];
+
+                               T_QUIET; T_ASSERT_NE(pa, (uint64_t) 0, "check that the pagetable physical address is non-zero");
+                               T_QUIET; T_ASSERT_EQ(pa % (num_entries * sizeof(uint64_t)), (uint64_t) 0, "check that the pagetable physical address is correctly aligned");
+                               T_QUIET; T_ASSERT_NE(num_entries, (uint64_t) 0, "check that a pagetable region has more than 0 entries");
+                               T_QUIET; T_ASSERT_LE(j + 4 + num_entries, (uint64_t) elem_count, "check for sufficient space in page table array");
+                               T_QUIET; T_ASSERT_GT(end_va, start_va, "check for valid VA bounds in page table segment header");
+
+                               for (uint32_t k = j + 4; k < (j + 4 + num_entries); ++k) {
+                                       if (pt_array[k] != 0) {
+                                               nonzero_tte = true;
+                                               T_QUIET; T_ASSERT_EQ((pt_array[k] >> 48) & 0xf, (uint64_t) 0, "check that bits[48:51] of arm64 TTE are clear");
+                                               // L0-L2 table and non-compressed L3 block entries should always have bit 1 set; assumes L0-L2 blocks will not be used outside the kernel
+                                               bool table = ((pt_array[k] & 0x2) != 0);
+                                               if (table) {
+                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & ((1ULL << 48) - 1) & ~((1ULL << 12) - 1), (uint64_t) 0, "check that arm64 TTE physical address is non-zero");
+                                               } else { // should be a compressed PTE
+                                                       T_QUIET; T_ASSERT_NE(pt_array[k] & 0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has at least one of bits [63:62] set");
+                                                       T_QUIET; T_ASSERT_EQ(pt_array[k] & ~0xC000000000000000ULL, (uint64_t) 0, "check that compressed PTE has no other bits besides [63:62] set");
+                                               }
+                                       }
+                               }
+
+                               j += (4 + num_entries);
+                       }
+                       T_ASSERT_TRUE(nonzero_tte, "check that we saw at least one non-empty TTE");
+                       T_ASSERT_EQ(j, elem_count, "check that page table dump size matches extent of last header"); 
+                       break;
+               }
+               case STACKSHOT_KCTYPE_ASID: {
+                       T_ASSERT_FALSE(seen_asid, "check that we haven't yet seen an ASID");
+                       seen_asid = true;
+               }
+               }
+       }
+       T_ASSERT_TRUE(seen_page_table_snapshot, "check that we have seen a page table snapshot");
+       T_ASSERT_TRUE(seen_asid, "check that we have seen an ASID");
+}
+
+T_DECL(dump_page_tables, "test stackshot page table dumping support")
+{
+       struct scenario scenario = {
+               .name = "asid-page-tables",
+               .flags = (STACKSHOT_KCDATA_FORMAT | STACKSHOT_ASID | STACKSHOT_PAGE_TABLES),
+               .size_hint = (1ULL << 23), // 8 MB
+               .target_pid = getpid(),
+               .maybe_unsupported = true,
+       };
+
+       T_LOG("attempting to take stackshot with ASID and page table flags");
+       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+               parse_page_table_asid_stackshot(ssbuf, sslen);
+       });
+}
+
+#pragma mark performance tests
+
+#define SHOULD_REUSE_SIZE_HINT 0x01
+#define SHOULD_USE_DELTA       0x02
+#define SHOULD_TARGET_SELF     0x04
+
+static void
+stackshot_perf(unsigned int options)
+{
+       struct scenario scenario = {
+               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
+                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
+       };
+
+       dt_stat_t size = dt_stat_create("bytes", "size");
+       dt_stat_time_t duration = dt_stat_time_create("duration");
+       scenario.timer = duration;
+
+       if (options & SHOULD_TARGET_SELF) {
+               scenario.target_pid = getpid();
+       }
+
+       while (!dt_stat_stable(duration) || !dt_stat_stable(size)) {
+               __block uint64_t last_time = 0;
+               __block uint32_t size_hint = 0;
+               take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
+                       dt_stat_add(size, (double)sslen);
+                       last_time = stackshot_timestamp(ssbuf, sslen);
+                       size_hint = (uint32_t)sslen;
+               });
+               if (options & SHOULD_USE_DELTA) {
+                       scenario.since_timestamp = last_time;
+                       scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
+               }
+               if (options & SHOULD_REUSE_SIZE_HINT) {
+                       scenario.size_hint = size_hint;
+               }
+       }
+
+       dt_stat_finalize(duration);
+       dt_stat_finalize(size);
+}
+
+T_DECL(perf_no_size_hint, "test stackshot performance with no size hint",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(0);
+}
+
+T_DECL(perf_size_hint, "test stackshot performance with size hint",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT);
+}
+
+T_DECL(perf_process, "test stackshot performance targeted at process",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF);
+}
+
+T_DECL(perf_delta, "test delta stackshot performance",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA);
+}
+
+T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process",
+               T_META_TAG_PERF)
+{
+       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF);
+}
+
+static uint64_t
+stackshot_timestamp(void *ssbuf, size_t sslen)
+{
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+
+       uint32_t type = kcdata_iter_type(iter);
+       if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) {
+               T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter));
+       }
+
+       iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME);
+       T_QUIET;
+       T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot");
+
+       return *(uint64_t *)kcdata_iter_payload(iter);
+}
+
+#define TEST_THREAD_NAME "stackshot_test_thread"
+
+static void
+parse_thread_group_stackshot(void **ssbuf, size_t sslen)
+{
+       bool seen_thread_group_snapshot = false;
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                       "buffer provided is a stackshot");
+
+       NSMutableSet *thread_groups = [[NSMutableSet alloc] init];
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) {
+                               continue;
+                       }
+
+                       seen_thread_group_snapshot = true;
+
+                       if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) {
+                               struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter);
+                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
+                                       struct thread_group_snapshot_v2 *tgs = tgs_array + j;
+                                       [thread_groups addObject:@(tgs->tgs_id)];
+                               }
+
+                       }
+                       else {
+                               struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter);
+                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
+                                       struct thread_group_snapshot *tgs = tgs_array + j;
+                                       [thread_groups addObject:@(tgs->tgs_id)];
+                               }
+                       }
+                       break;
+               }
+               }
+       }
+       KCDATA_ITER_FOREACH(iter) {
+               NSError *error = nil;
+
+               switch (kcdata_iter_type(iter)) {
+
+               case KCDATA_TYPE_CONTAINER_BEGIN: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
+                                       "checked that container is valid");
+
+                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) {
+                               break;
+                       }
+
+                       NSDictionary *container = parseKCDataContainer(&iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
+
+                       int tg = [container[@"thread_snapshots"][@"thread_group"] intValue];
+
+                       T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists");
+
+                       break;
+               };
+
+               }
+       }
+       T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot");
+}
+
+static void
+verify_stackshot_sharedcache_layout(struct dyld_uuid_info_64 *uuids, uint32_t uuid_count)
+{
+       uuid_t cur_shared_cache_uuid;
+       __block uint32_t lib_index = 0, libs_found = 0;
+
+       _dyld_get_shared_cache_uuid(cur_shared_cache_uuid);
+       int result = dyld_shared_cache_iterate_text(cur_shared_cache_uuid, ^(const dyld_shared_cache_dylib_text_info* info) {
+                       T_QUIET; T_ASSERT_LT(lib_index, uuid_count, "dyld_shared_cache_iterate_text exceeded number of libraries returned by kernel");
+
+                       libs_found++;
+                       struct dyld_uuid_info_64 *cur_stackshot_uuid_entry = &uuids[lib_index];
+                       T_QUIET; T_ASSERT_EQ(memcmp(info->dylibUuid, cur_stackshot_uuid_entry->imageUUID, sizeof(info->dylibUuid)), 0,
+                                       "dyld returned UUID doesn't match kernel returned UUID");
+                       T_QUIET; T_ASSERT_EQ(info->loadAddressUnslid, cur_stackshot_uuid_entry->imageLoadAddress,
+                                       "dyld returned load address doesn't match kernel returned load address");
+                       lib_index++;
+               });
+
+       T_ASSERT_EQ(result, 0, "iterate shared cache layout");
+       T_ASSERT_EQ(libs_found, uuid_count, "dyld iterator returned same number of libraries as kernel");
+
+       T_LOG("verified %d libraries from dyld shared cache", libs_found);
+}
+
+static void
+parse_stackshot(uint64_t stackshot_parsing_flags, void *ssbuf, size_t sslen, int child_pid)
+{
+       bool delta = (stackshot_parsing_flags & PARSE_STACKSHOT_DELTA);
+       bool expect_zombie_child = (stackshot_parsing_flags & PARSE_STACKSHOT_ZOMBIE);
+       bool expect_shared_cache_layout = false;
+       bool expect_shared_cache_uuid = !delta;
+       bool found_zombie_child = false, found_shared_cache_layout = false, found_shared_cache_uuid = false;
+
+       if (stackshot_parsing_flags & PARSE_STACKSHOT_SHAREDCACHE_LAYOUT) {
+               size_t shared_cache_length = 0;
+               const struct dyld_cache_header *cache_header = NULL;
+               cache_header = _dyld_get_shared_cache_range(&shared_cache_length);
+               T_QUIET; T_ASSERT_NOTNULL(cache_header, "current process running with shared cache");
+		T_QUIET; T_ASSERT_GT(shared_cache_length, sizeof(struct dyld_cache_header), "valid shared cache length populated by _dyld_get_shared_cache_range");
+
+               if (cache_header->locallyBuiltCache) {
+                       T_LOG("device running with locally built shared cache, expect shared cache layout");
+                       expect_shared_cache_layout = true;
+               } else {
+                       T_LOG("device running with B&I built shared-cache, no shared cache layout expected");
+               }
+       }
+
+       if (expect_zombie_child) {
+               T_QUIET; T_ASSERT_GT(child_pid, 0, "child pid greater than zero");
+       }
+
+       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
+       if (delta) {
+               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+                               "buffer provided is a delta stackshot");
+       } else {
+               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
+                               "buffer provided is a stackshot");
+       }
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter) {
+               NSError *error = nil;
+
+               switch (kcdata_iter_type(iter)) {
+               case KCDATA_TYPE_ARRAY: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
+                                       "checked that array is valid");
+
+                       NSMutableDictionary *array = parseKCDataArray(iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array");
+
+                       if (kcdata_iter_array_elem_type(iter) == STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT) {
+                               struct dyld_uuid_info_64 *shared_cache_uuids = kcdata_iter_payload(iter);
+                               uint32_t uuid_count = kcdata_iter_array_elem_count(iter);
+                               T_ASSERT_NOTNULL(shared_cache_uuids, "parsed shared cache layout array");
+                               T_ASSERT_GT(uuid_count, 0, "returned valid number of UUIDs from shared cache");
+                               verify_stackshot_sharedcache_layout(shared_cache_uuids, uuid_count);
+                               found_shared_cache_layout = true;
+                       }
+
+                       break;
+               }
+
+               case KCDATA_TYPE_CONTAINER_BEGIN: {
+                       T_QUIET;
+                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
+                                       "checked that container is valid");
+
+                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
+                               break;
+                       }
+
+                       NSDictionary *container = parseKCDataContainer(&iter, &error);
+                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
+                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
+
+                       int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue];
+                       if (expect_zombie_child && (pid == child_pid)) {
+				found_zombie_child = true;
+
+				uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
+				T_ASSERT_TRUE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "child zombie marked as terminated");
+
+				continue;
+                       } else if (pid != getpid()) {
+                               break;
+                       }
+
+                       T_EXPECT_EQ_STR(current_process_name(),
+                                       [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String],
+                                       "current process name matches in stackshot");
+
+                       uint64_t task_flags = [container[@"task_snapshots"][@"task_snapshot"][@"ts_ss_flags"] unsignedLongLongValue];
+                       T_ASSERT_FALSE((task_flags & kTerminatedSnapshot) == kTerminatedSnapshot, "current process not marked as terminated");
+
+                       T_QUIET;
+                       T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue],
+                                       "unique pid is greater than pid");
+
+                       bool found_main_thread = false;
+                       for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
+                               NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
+                               NSDictionary *thread_snap = thread[@"thread_snapshot"];
+
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0,
+                                               "thread ID of thread in current task is valid");
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0,
+                                               "base priority of thread in current task is valid");
+                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0,
+                                               "scheduling priority of thread in current task is valid");
+
+                               NSString *pth_name = thread[@"pth_name"];
+                               if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) {
+                                       found_main_thread = true;
+
+                                       T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0,
+                                                       "total syscalls of current thread is valid");
+
+                                       NSDictionary *cpu_times = thread[@"cpu_times"];
+                                       T_EXPECT_GE([cpu_times[@"runnable_time"] intValue],
+                                                       [cpu_times[@"system_time"] intValue] +
+                                                       [cpu_times[@"user_time"] intValue],
+                                                       "runnable time of current thread is valid");
+                               }
+                       }
+                       T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot");
+                       break;
+               }
+               case STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO: {
+                       struct dyld_uuid_info_64_v2 *shared_cache_info = kcdata_iter_payload(iter);
+                       uuid_t shared_cache_uuid;
+                       T_QUIET; T_ASSERT_TRUE(_dyld_get_shared_cache_uuid(shared_cache_uuid), "retrieve current shared cache UUID");
+                       T_QUIET; T_ASSERT_EQ(memcmp(shared_cache_info->imageUUID, shared_cache_uuid, sizeof(shared_cache_uuid)), 0,
+                                       "dyld returned UUID matches kernel returned UUID for system shared cache");
+                       found_shared_cache_uuid = true;
+                       break;
+               }
+               }
+       }
+
+       if (expect_zombie_child) {
+               T_QUIET; T_ASSERT_TRUE(found_zombie_child, "found zombie child in kcdata");
+       }
+
+       if (expect_shared_cache_layout) {
+               T_QUIET; T_ASSERT_TRUE(found_shared_cache_layout, "shared cache layout found in kcdata");
+       }
+
+       if (expect_shared_cache_uuid) {
+               T_QUIET; T_ASSERT_TRUE(found_shared_cache_uuid, "shared cache UUID found in kcdata");
+       }
+
+       T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata");
+}
+
+static const char *
+current_process_name(void)
+{
+       static char name[64];
+
+       if (!name[0]) {
+               int ret = proc_name(getpid(), name, sizeof(name));
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(ret, "proc_name for current process");
+       }
+
+       return name;
+}
+
+static void
+initialize_thread(void)
+{
+       int ret = pthread_setname_np(TEST_THREAD_NAME);
+       T_QUIET;
+       T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME);
+}
diff --git a/tests/stackshot_block_owner_14362384.m b/tests/stackshot_block_owner_14362384.m
new file mode 100644 (file)
index 0000000..aabe544
--- /dev/null
@@ -0,0 +1,913 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+#include <darwintest.h>
+
+#include <kdd.h>
+#include <kern/kcdata.h>
+#include <kern/debug.h>
+#include <kern/block_hint.h>
+#include <mach/mach.h>
+#include <mach/mach_init.h>
+#include <mach/mach_traps.h>
+#include <mach/message.h>
+#include <mach/port.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <os/lock.h>
+#include <pthread.h>
+#include <signal.h>
+#include <sys/sysctl.h>
+#include <sys/stackshot.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <TargetConditionals.h>
+
+#if !TARGET_OS_EMBEDDED
+#include <pcre.h>
+#endif
+
+
+T_GLOBAL_META(
+        T_META_NAMESPACE("xnu.scheduler"),
+        T_META_ASROOT(true)
+);
+
+#include <Foundation/Foundation.h>
+
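+/* SENDS_TO_BLOCK is presumably chosen to exceed the receive port's default
+ * message-queue limit (MACH_PORT_QLIMIT_DEFAULT), so that the final send in
+ * msg_blocking_thread() blocks in mach_msg() until the main thread receives. */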
+#define SENDS_TO_BLOCK 6
+#define NUMRETRIES 5
+#define KRWLCK_STORES_EXCL_OWNER 0
+
+#define KMUTEX_SYSCTL_CHECK_EXISTS   0
+#define KMUTEX_SYSCTL_ACQUIRE_WAIT   1
+#define KMUTEX_SYSCTL_ACQUIRE_NOWAIT 2
+#define KMUTEX_SYSCTL_SIGNAL         3
+#define KMUTEX_SYSCTL_TEARDOWN       4
+
+#define KRWLCK_SYSCTL_CHECK_EXISTS    0
+#define KRWLCK_SYSCTL_RACQUIRE_NOWAIT 1
+#define KRWLCK_SYSCTL_RACQUIRE_WAIT   2
+#define KRWLCK_SYSCTL_WACQUIRE_NOWAIT 3
+#define KRWLCK_SYSCTL_WACQUIRE_WAIT   4
+#define KRWLCK_SYSCTL_SIGNAL          5
+#define KRWLCK_SYSCTL_TEARDOWN        6
+
+static const char kmutex_ctl[] = "debug.test_MutexOwnerCtl";
+static const char krwlck_ctl[] = "debug.test_RWLockOwnerCtl";
+
+static mach_port_t send = MACH_PORT_NULL;
+static mach_port_t recv = MACH_PORT_NULL;
+
+static void *
+take_stackshot(uint32_t extra_flags, uint64_t since_timestamp)
+{
+       void * stackshot = NULL;
+       int ret = 0;
+       uint32_t stackshot_flags = STACKSHOT_SAVE_LOADINFO |
+                                       STACKSHOT_GET_GLOBAL_MEM_STATS |
+                                       STACKSHOT_SAVE_IMP_DONATION_PIDS |
+                                       STACKSHOT_KCDATA_FORMAT;
+
+       if (since_timestamp != 0)
+               stackshot_flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
+
+       stackshot_flags |= extra_flags;
+
+       stackshot = stackshot_config_create();
+       T_QUIET; T_ASSERT_NOTNULL(stackshot, "Allocating stackshot config");
+
+       ret = stackshot_config_set_flags(stackshot, stackshot_flags);
+       T_ASSERT_POSIX_ZERO(ret, "Setting flags on stackshot config");
+
+       ret = stackshot_config_set_pid(stackshot, getpid());
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Setting target pid on stackshot config");
+
+       if (since_timestamp != 0) {
+               ret = stackshot_config_set_delta_timestamp(stackshot, since_timestamp);
+               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Setting prev snapshot time on stackshot config");
+       }
+
+       for (int retries = NUMRETRIES; retries > 0; retries--) {
+               ret = stackshot_capture_with_config(stackshot);
+               T_QUIET; T_ASSERT_TRUE(ret == 0 || ret == EBUSY || ret == ETIMEDOUT,
+                               "Attempting to take stackshot (error %d)...", ret);
+               if (retries == 1 && (ret == EBUSY || ret == ETIMEDOUT))
+                       T_ASSERT_FAIL("Failed to take stackshot after %d retries: got %d (%s)", NUMRETRIES, ret, strerror(ret));
+               if (ret == 0)
+                       break;
+       }
+       return stackshot;
+}
+
+static void
+save_stackshot(void *stackshot, const char *filename)
+{
+       void *buf = stackshot_config_get_stackshot_buffer(stackshot);
+       T_QUIET; T_ASSERT_NOTNULL(buf, "buf");
+       size_t size = stackshot_config_get_stackshot_size(stackshot);
+       FILE *f = fopen(filename, "w");
+       T_QUIET; T_ASSERT_NOTNULL(f, "f");
+       fwrite(buf, size, 1, f);
+       fclose(f);
+}
+
+static
+void check_python(void *stackshot, const char *fmt, ...)
+{
+       save_stackshot(stackshot, "/tmp/ss");
+
+#if !TARGET_OS_EMBEDDED
+       va_list args;
+       va_start(args, fmt);
+       char *re_string = NULL;
+       vasprintf(&re_string, fmt, args);
+       va_end(args);
+       T_QUIET; T_ASSERT_NOTNULL(re_string, "vasprintf");
+
+       const char *pcreErrorStr;
+       int pcreErrorOffset;
+       pcre *re = pcre_compile(re_string, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
+       T_QUIET; T_ASSERT_NOTNULL(re, "pcre_compile");
+
+       bool found = false;
+       FILE *p = popen("/usr/local/bin/kcdata --pretty /tmp/ss", "r");
+       T_QUIET; T_ASSERT_NOTNULL(p, "popen");
+       while (1) {
+               char *line = NULL;
+               size_t linecap = 0;
+               ssize_t linesize = getline(&line, &linecap, p);
+               if (linesize < 0) {
+                       if (line)
+                               free(line);
+                       break;
+               }
+               int pcre_ret = pcre_exec(re, NULL, line, strlen(line), 0, 0, NULL, 0);
+               if (pcre_ret == 0){
+                       T_LOG("line: %s", line);
+                       found = true;
+               }
+               free(line);
+       }
+       T_EXPECT_TRUE(found, "found the waitinfo in kcdata.py output");
+       pclose(p);
+       pcre_free(re);
+       free(re_string);
+#endif
+}
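+
+/*
+ * Illustrative usage of check_python() (a sketch, not a statement of the kcdata
+ * tool's interface): the format string is a PCRE pattern, so regex metacharacters
+ * are double-escaped, and the check assumes the kcdata pretty-printer is installed
+ * at /usr/local/bin/kcdata; on embedded targets the body is compiled out and only
+ * the raw stackshot is saved to /tmp/ss.
+ *
+ *     check_python(stackshot, "thread \\d+: semaphore port \\w+ owned by pid %d", (int)getpid());
+ */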
+
+
+// waitinfo can be NULL, but len must be non-null and point to the length of the waitinfo array.
+// when the function returns, len will be set to the number of waitinfo structs found in the stackshot.
+static void
+find_blocking_info(void * stackshot, struct stackshot_thread_waitinfo *waitinfo, int *len)
+{
+       void *buf = NULL;
+       uint32_t t = 0;
+       uint32_t buflen = 0;
+       NSError *error = nil;
+       NSMutableDictionary *parsed_container = nil;
+       NSArray *parsed_waitinfo = nil;
+
+       T_QUIET; T_ASSERT_NOTNULL(len, "Length pointer shouldn't be NULL");
+       int oldlen = *len;
+       *len = 0;
+
+       buf = stackshot_config_get_stackshot_buffer(stackshot);
+       T_QUIET; T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
+       buflen = stackshot_config_get_stackshot_size(stackshot);
+
+       kcdata_iter_t iter = kcdata_iter(buf, buflen);
+
+       T_QUIET; T_ASSERT_TRUE(kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_STACKSHOT ||
+                       kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+                       "Checking start of stackshot buffer");
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter)
+       {
+               t = kcdata_iter_type(iter);
+
+               if (t != KCDATA_TYPE_CONTAINER_BEGIN) {
+                       continue;
+               }
+
+               if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
+                       continue;
+               }
+
+               parsed_container = parseKCDataContainer(&iter, &error);
+               T_QUIET; T_ASSERT_TRUE(!error, "Error while parsing container: %d (%s)",
+                               (int)error.code, [error.domain UTF8String]);
+               T_QUIET; T_ASSERT_TRUE(parsed_container && !error, "Parsing container");
+
+               parsed_waitinfo = parsed_container[@"task_snapshots"][@"thread_waitinfo"];
+               for (id elem in parsed_waitinfo) {
+                       /* record every thread that the stackshot reports as blocked (wait_type != kThreadWaitNone) */
+                       uint8_t type = [elem[@"wait_type"] unsignedCharValue];
+                       if (type != kThreadWaitNone) {
+                               if (waitinfo && *len < oldlen) {
+                                       struct stackshot_thread_waitinfo *curr = &waitinfo[*len];
+                                       curr->wait_type = type;
+                                       curr->owner     = [elem[@"owner"] unsignedLongLongValue];
+                                       curr->waiter    = [elem[@"waiter"] unsignedLongLongValue];
+                                       curr->context   = [elem[@"context"] unsignedLongLongValue];
+                               }
+                               (*len)++;
+                       }
+               }
+               [parsed_container release];
+       }
+}
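+
+/*
+ * Illustrative two-pass calling pattern (a sketch; the tests below instead pass
+ * fixed-size stack arrays with len preset to their capacity). Passing a NULL
+ * waitinfo array still counts the blocked threads, so a caller could size a
+ * buffer first and then fetch the records:
+ *
+ *     int count = 0;
+ *     find_blocking_info(stackshot, NULL, &count);
+ *     struct stackshot_thread_waitinfo *info = calloc((size_t)count, sizeof(*info));
+ *     find_blocking_info(stackshot, info, &count);
+ */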
+
+/* perform various actions with a mutex in kernel memory. note that, since we aren't allowed
+ * to go to user space while still holding a mutex, the lock-acquiring actions in this kernel
+ * sysctl will either lock and immediately release the lock, or lock and wait until a semaphore
+ * is signalled, then unlock. if called with CHECK_EXISTS, returns whether or not the sysctl
+ * exist in the kernel (to determine if we're running with CONFIG_XNUPOST defined). Else,
+ * returns 1. */
+static int kmutex_action(int action)
+{
+       int ret = 0;
+       if (action == KMUTEX_SYSCTL_CHECK_EXISTS) {
+               ret = sysctlbyname(kmutex_ctl, NULL, NULL, NULL, 0);
+               return !(ret == -1);
+       }
+
+       char * action_name = "";
+       switch(action) {
+               case KMUTEX_SYSCTL_ACQUIRE_WAIT:
+                       action_name = "lock (and wait)";
+                       break;
+               case KMUTEX_SYSCTL_ACQUIRE_NOWAIT:
+                       action_name = "lock";
+                       break;
+               case KMUTEX_SYSCTL_SIGNAL:
+                       action_name = "signal to holder of";
+                       break;
+               case KMUTEX_SYSCTL_TEARDOWN:
+                       action_name = "tear down";
+                       break;
+               default:
+                       T_ASSERT_FAIL("Somebody passed the wrong argument to kmutex_action: %d", action);
+                       break;
+       }
+
+       ret = sysctlbyname(kmutex_ctl, NULL, NULL, &action, sizeof(int));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: %s kernel mutex", action_name);
+       return 1;
+}
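+
+/*
+ * How the actions above are combined by test_kmutex_blocking() later in this
+ * file (a sketch of this test's ordering, not a kernel API contract):
+ *
+ *     kmutex_action(KMUTEX_SYSCTL_CHECK_EXISTS);    // skip the test on kernels built without the sysctl
+ *     kmutex_action(KMUTEX_SYSCTL_ACQUIRE_WAIT);    // thread A: take the kernel mutex, wait on a kernel semaphore
+ *     kmutex_action(KMUTEX_SYSCTL_ACQUIRE_NOWAIT);  // thread B: blocks until A releases the mutex
+ *     kmutex_action(KMUTEX_SYSCTL_SIGNAL);          // main thread: wake A so both threads can finish
+ *     kmutex_action(KMUTEX_SYSCTL_TEARDOWN);        // destroy the kernel-side test state
+ */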
+
+static void
+sysctl_kmutex_test_match(uint64_t context)
+{
+       int ret = 0;
+       unsigned long long unslid_kmutex_address = 0;
+       size_t addrsize = sizeof(unslid_kmutex_address);
+
+       ret = sysctlbyname(kmutex_ctl, &unslid_kmutex_address, &addrsize, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Getting unslid location of kernel mutex. Size is %llu",
+                       (unsigned long long)addrsize);
+       T_EXPECT_EQ(context, unslid_kmutex_address,
+                       "Context should match unslid location of mutex in kernel memory");
+}
+
+/* We don't really care what goes into these messages, we're just sending something to a port. */
+static void
+msg_send_helper(mach_port_t remote_port)
+{
+       int ret;
+       mach_msg_header_t * msg = NULL;
+
+       ret = vm_allocate(mach_task_self(),
+                       (vm_address_t *)&msg,
+                       PAGE_SIZE,
+                       VM_MAKE_TAG(VM_MEMORY_MACH_MSG) | VM_FLAGS_ANYWHERE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating vm page %p", (void*)msg);
+
+       msg->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, 0, 0);
+       msg->msgh_size = PAGE_SIZE;
+       msg->msgh_remote_port = remote_port;
+       msg->msgh_local_port = MACH_PORT_NULL;
+       msg->msgh_voucher_port = MACH_PORT_NULL;
+
+       ret = mach_msg(msg,
+                       MACH_SEND_MSG | MACH_MSG_OPTION_NONE,
+                       PAGE_SIZE,
+                       0,
+                       MACH_PORT_NULL,
+                       MACH_MSG_TIMEOUT_NONE,
+                       MACH_PORT_NULL);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Sending message to port %d", remote_port);
+
+       ret = vm_deallocate(mach_task_self(), (vm_address_t)msg, PAGE_SIZE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Deallocating vm page %p", (void*)msg);
+}
+
+static void
+msg_recv_helper(mach_port_t local_port)
+{
+       int ret = 0;
+       mach_msg_size_t size = 2*PAGE_SIZE;
+       mach_msg_header_t * msg = NULL;
+       ret = vm_allocate(mach_task_self(),
+                       (vm_address_t *)&msg,
+                       size,
+                       VM_MAKE_TAG(VM_MEMORY_MACH_MSG) | VM_FLAGS_ANYWHERE);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating page %p for message", (void*)msg);
+
+       ret = mach_msg(msg,
+                       MACH_RCV_MSG,
+                       0,
+                       size,
+                       local_port,
+                       MACH_MSG_TIMEOUT_NONE,
+                       MACH_PORT_NULL);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Received message on port %d", local_port);
+       ret = vm_deallocate(mach_task_self(), (vm_address_t)msg, size);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Deallocating page %p", (void*)msg);
+}
+
+/* perform various actions with a rwlock in kernel memory. note that, since we aren't allowed
+ * to go to user space while still holding a rwlock, the lock-acquiring actions in this kernel
+ * sysctl will either lock and immediately release the lock, or lock and wait until a semaphore
+ * is signalled, then unlock. if called with CHECK_EXISTS, returns whether or not the sysctl
+ * exist in the kernel (to determine if we're running with CONFIG_XNUPOST defined). Else,
+ * returns 1. */
+static int
+krwlck_action(int action)
+{
+       int ret = 0;
+       if (action == KRWLCK_SYSCTL_CHECK_EXISTS) {
+               ret = sysctlbyname(krwlck_ctl, NULL, NULL, NULL, 0);
+               return !(ret == -1);
+       }
+
+       char * action_name = "";
+       switch(action) {
+               case KRWLCK_SYSCTL_RACQUIRE_NOWAIT:
+                       action_name = "shared lock";
+                       break;
+               case KRWLCK_SYSCTL_RACQUIRE_WAIT:
+                       action_name = "shared lock (and wait)";
+                       break;
+               case KRWLCK_SYSCTL_WACQUIRE_NOWAIT:
+                       action_name = "exclusive lock";
+                       break;
+               case KRWLCK_SYSCTL_WACQUIRE_WAIT:
+                       action_name = "exclusive lock (and wait)";
+                       break;
+               case KRWLCK_SYSCTL_SIGNAL:
+                       action_name = "signal to holder of";
+                       break;
+               case KRWLCK_SYSCTL_TEARDOWN:
+                       action_name = "tear down";
+                       break;
+               default:
+                       T_ASSERT_FAIL("Somebody passed the wrong argument to krwlck_action: %d", action);
+                       break;
+       }
+
+       ret = sysctlbyname(krwlck_ctl, NULL, NULL, &action, sizeof(int));
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: %s kernel rwlock", action_name);
+       return 1;
+}
+
+static void
+sysctl_krwlck_test_match(uint64_t context)
+{
+       int ret = 0;
+       unsigned long long unslid_krwlck_address = 0;
+       size_t addrsize = sizeof(unslid_krwlck_address);
+
+       ret = sysctlbyname(krwlck_ctl, &unslid_krwlck_address, &addrsize, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Getting unslid location of kernel rwlock");
+       T_EXPECT_EQ(context, unslid_krwlck_address, "Context should match unslid location of rwlock in kernel memory");
+}
+
+/* "Grabbing" threads: only purpose is to grab a sync primitive and hang. */
+
+static void *
+kmutex_grabbing_thread(void * arg)
+{
+       (void)arg;
+       kmutex_action(KMUTEX_SYSCTL_ACQUIRE_NOWAIT);
+       return NULL;
+}
+
+static void *
+kmutex_grab_and_wait_thread(void * arg)
+{
+       (void)arg;
+       kmutex_action(KMUTEX_SYSCTL_ACQUIRE_WAIT);
+       return NULL;
+}
+
+static void *
+sem_grabbing_thread(void * arg)
+{
+       semaphore_t *sem = (semaphore_t *)arg;
+       semaphore_wait(*sem);
+       return NULL;
+}
+
+static void *
+msg_blocking_thread(void * arg)
+{
+       (void)arg;
+       msg_recv_helper(send);
+
+       for (int i = 0; i < SENDS_TO_BLOCK; i++)
+               msg_send_helper(recv); // will block on send until message is received
+       return NULL;
+}
+
+static void *
+ulock_blocking_thread(void * arg)
+{
+       os_unfair_lock_t oul = (os_unfair_lock_t)arg;
+       os_unfair_lock_lock(oul);
+       os_unfair_lock_unlock(oul);
+       return NULL;
+}
+
+// acquires a kernel rwlock for writing, and then waits on a kernel semaphore.
+static void *
+krwlck_write_waiting_thread(void * arg)
+{
+       (void)arg;
+       krwlck_action(KRWLCK_SYSCTL_WACQUIRE_WAIT);
+       return NULL;
+}
+
+// attempts to acquire a kernel rwlock for reading, and doesn't wait on a semaphore afterwards.
+static void *
+krwlck_read_grabbing_thread(void * arg)
+{
+       (void)arg;
+       krwlck_action(KRWLCK_SYSCTL_RACQUIRE_NOWAIT);
+       return NULL;
+}
+
+static void *
+pthread_mutex_blocking_thread(void * arg)
+{
+       pthread_mutex_t *mtx = (pthread_mutex_t *)arg;
+       pthread_mutex_lock(mtx);
+       pthread_mutex_unlock(mtx);
+       return NULL;
+}
+
+static void *
+pthread_rwlck_blocking_thread(void * arg)
+{
+       pthread_rwlock_t *rwlck = (pthread_rwlock_t *)arg;
+       pthread_rwlock_rdlock(rwlck);
+       pthread_rwlock_unlock(rwlck);
+       return NULL;
+}
+
+static void *
+pthread_cond_blocking_thread(void * arg)
+{
+       pthread_mutex_t mtx  = PTHREAD_MUTEX_INITIALIZER;
+       pthread_cond_t *cond = (pthread_cond_t *)arg;
+       pthread_mutex_lock(&mtx); /* POSIX requires the mutex to be held when calling pthread_cond_wait */
+       pthread_cond_wait(cond, &mtx);
+       pthread_mutex_unlock(&mtx);
+       return NULL;
+}
+
+static void *
+waitpid_blocking_thread(void * arg)
+{
+       pid_t pid = (pid_t)arg;
+
+       int ret = waitpid(pid, NULL, 0);
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Reaping child.");
+       return NULL;
+}
+
+/*
+ * Uses a debug sysctl to initialize a kernel mutex.
+ *
+ * The 'waiting' thread grabs this kernel mutex, and immediately waits on a kernel semaphore.
+ * The 'grabbing' thread just attempts to lock the kernel mutex.
+ * When the semaphore is signalled, the 'waiting' thread will unlock the kernel mutex,
+ * giving the opportunity for the 'grabbing' thread to lock it and then immediately unlock it.
+ * This allows us to create a situation in the kernel where we know a thread to be blocked
+ * on a kernel mutex.
+ */
+static void
+test_kmutex_blocking(void)
+{
+       int ret = 0;
+       int len = 2;
+       struct stackshot_thread_waitinfo waitinfo[2] = { { 0 }, { 0 } };
+       uint64_t thread_id = 0;
+       pthread_t grabbing, waiting;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       ret = pthread_create(&waiting, NULL, kmutex_grab_and_wait_thread, NULL); // thread will block until we signal it
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Spawning grab and wait thread");
+       sleep(1); // give time for thread to block
+       ret = pthread_create(&grabbing, NULL, kmutex_grabbing_thread, NULL); // thread should immediately block
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Spawning waiting thread");
+       sleep(3); // give (lots of) time for thread to give up spinning on lock
+
+       void * stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       ret = pthread_threadid_np(waiting, &thread_id); // this is the thread that currently holds the kernel mutex
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
+
+       check_python(stackshot, "thread \\d+: semaphore port \\w+ with unknown owner");
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+
+       T_EXPECT_EQ(len, 2, "There should only be two blocking threads");
+       for (int i = 0; i < len; i++) {
+               struct stackshot_thread_waitinfo *curr = &waitinfo[i];
+               if (curr->wait_type == kThreadWaitSemaphore)
+                       continue;
+               T_EXPECT_EQ(curr->wait_type, kThreadWaitKernelMutex, "Wait type should match expected KernelMutex value");
+               T_EXPECT_EQ(curr->owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
+               sysctl_kmutex_test_match(curr->context);
+
+               check_python(stackshot, "thread \\d+: kernel mutex %llx owned by thread %lld", curr->context, thread_id);
+       }
+
+       kmutex_action(KMUTEX_SYSCTL_SIGNAL); // waiting thread should now unblock.
+       ret = pthread_join(waiting, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on waiting thread");
+       ret = pthread_join(grabbing, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabber thread");
+       kmutex_action(KMUTEX_SYSCTL_TEARDOWN);
+       stackshot_config_dealloc(stackshot);
+}
+
+/* Initialize a userspace semaphore, and spawn a thread to block on it. */
+static void
+test_semaphore_blocking(void)
+{
+       int ret = 0;
+       semaphore_t sem;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       int len = 1;
+       uint64_t pid = 0;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       ret = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Creating semaphore");
+       pthread_t tid;
+       ret = pthread_create(&tid, NULL, sem_grabbing_thread, (void*)&sem); // thread should immediately block
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating semaphore grabbing thread");
+
+       sleep(1); // give time for thread to block
+
+       void * stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitSemaphore, "Wait type should match expected Semaphore value");
+
+       pid = (uint64_t)getpid();
+       T_EXPECT_EQ(waitinfo.owner, pid, "Owner value should match process ID");
+
+       check_python(stackshot, "thread \\d+: semaphore port \\w+ owned by pid %d", (int)pid);
+
+       ret = semaphore_signal(sem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Signalling semaphore");
+       ret = pthread_join(tid, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabber thread");
+       ret = semaphore_destroy(mach_task_self(), sem);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Destroying semaphore");
+       stackshot_config_dealloc(stackshot);
+}
+
+/* Spawn a process to send a message to, and block while both sending and receiving in different contexts. */
+static void
+test_mach_msg_blocking(void)
+{
+       int ret = 0;
+       pthread_t tid;
+       void *stackshot = NULL;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       int len = 1;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &send);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating send port");
+       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &recv);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating recv port");
+       ret = mach_port_insert_right(mach_task_self(), send, send, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Getting send right to send port");
+       ret = mach_port_insert_right(mach_task_self(), recv, recv, MACH_MSG_TYPE_MAKE_SEND);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Getting send right to recv port");
+
+       ret = pthread_create(&tid, NULL, msg_blocking_thread, (void*)&send); // thread should block on recv soon
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating message blocking thread");
+
+       sleep(1); // give time for thread to block
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPortReceive, "Wait type should match expected PortReceive value");
+
+       check_python(stackshot, "thread \\d+: mach_msg receive on port \\w+ name %llx", (long long)send);
+
+       stackshot_config_dealloc(stackshot);
+
+       msg_send_helper(send); // ping! msg_blocking_thread will now try to send us stuff, and block until we receive.
+
+       sleep(1); // give time for thread to block
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPortSend, "Wait type should match expected PortSend value");
+
+       check_python(stackshot, "thread \\d+: mach_msg send on port \\w+ owned by pid %d", (int)getpid());
+
+       stackshot_config_dealloc(stackshot);
+
+       msg_recv_helper(recv); // thread should block until we receive one of its messages
+       ret = pthread_join(tid, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
+}
+
+static void
+test_ulock_blocking(void)
+{
+       int ret = 0;
+       void *stackshot = NULL;
+       uint64_t thread_id = 0;
+       pthread_t tid;
+       struct os_unfair_lock_s ouls = OS_UNFAIR_LOCK_INIT;
+       os_unfair_lock_t oul = &ouls;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       int len = 1;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       os_unfair_lock_lock(oul);
+       ret = pthread_create(&tid, NULL, ulock_blocking_thread, (void*)oul);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating ulock blocking thread");
+       sleep(3); // give time for thread to spawn, fall back to kernel for contention, and block
+
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitUserLock, "Wait type should match expected UserLock value");
+
+       os_unfair_lock_unlock(oul);
+       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
+
+       ret = pthread_threadid_np(NULL, &thread_id); // this thread is the "owner" of the ulock
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
+       T_EXPECT_EQ(waitinfo.owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
+
+       check_python(stackshot, "thread \\d+: unfair lock \\w+ owned by thread %lld", thread_id);
+       stackshot_config_dealloc(stackshot);
+       return;
+}
+
+static void
+test_krwlock_blocking(void)
+{
+       int ret = 0;
+       void *stackshot = NULL;
+       uint64_t thread_id = 0;
+       pthread_t waiting, grabbing;
+       int len = 2;
+       struct stackshot_thread_waitinfo waitinfo[2] = { { 0 }, { 0 } };
+
+       T_LOG("Starting %s", __FUNCTION__);
+       // this thread should spawn, acquire a kernel rwlock for write, and then wait on a semaphore
+       ret = pthread_create(&waiting, NULL, krwlck_write_waiting_thread, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating krwlck write waiting thread");
+       sleep(1); // give time for thread to block
+       // this thread should spawn and try to acquire the same kernel rwlock for read, but block
+       ret = pthread_create(&grabbing, NULL, krwlck_read_grabbing_thread, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating krwlck read grabbing thread");
+       sleep(1); // give time for thread to block
+
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       check_python(stackshot, "thread \\d+: semaphore port \\w+ with unknown owner");
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+
+       T_EXPECT_EQ(len, 2, "There should only be two blocking threads");
+       for (int i = 0; i < len; i++) {
+               struct stackshot_thread_waitinfo *curr = &waitinfo[i];
+               if (curr->wait_type == kThreadWaitSemaphore)
+                       continue;
+               T_EXPECT_EQ(curr->wait_type, kThreadWaitKernelRWLockRead, "Wait type should match expected KRWLockRead value");
+               sysctl_krwlck_test_match(curr->context);
+
+               check_python(stackshot, "thread \\d+: krwlock %llx for reading", curr->context);
+
+#if KRWLCK_STORES_EXCL_OWNER /* A future planned enhancement */
+               ret = pthread_threadid_np(waiting, &thread_id); // this is the thread that currently holds the kernel mutex
+               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
+               T_EXPECT_EQ(curr->owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
+#else
+               (void)thread_id; // suppress compiler warning about unused variable
+#endif /* KRWLCK_STORES_EXCL_OWNER */
+       }
+
+       krwlck_action(KRWLCK_SYSCTL_SIGNAL); // pthread should now unblock & finish
+       ret = pthread_join(waiting, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on waiting thread");
+       ret = pthread_join(grabbing, NULL);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabbing thread");
+       krwlck_action(KRWLCK_SYSCTL_TEARDOWN);
+       stackshot_config_dealloc(stackshot);
+}
+
+
+static void
+test_pthread_mutex_blocking(void)
+{
+       int ret = 0;
+       void *stackshot = NULL;
+       uint64_t thread_id = 0;
+       pthread_t tid;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+       int len = 1;
+
+       T_LOG("Starting %s", __FUNCTION__);
+
+       ret = pthread_threadid_np(NULL, &thread_id); // this thread is the "owner" of the mutex
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
+
+       pthread_mutex_lock(&mtx);
+       ret = pthread_create(&tid, NULL, pthread_mutex_blocking_thread, (void*)&mtx);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread mutex blocking thread");
+       sleep(2); // give time for thread to block
+
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       check_python(stackshot, "thread \\d+: pthread mutex %llx owned by thread %lld", (long long)&mtx, thread_id);
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadMutex,
+                       "Wait type should match expected PThreadMutex value");
+       stackshot_config_dealloc(stackshot);
+
+       pthread_mutex_unlock(&mtx);
+       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
+
+       T_EXPECT_EQ(waitinfo.owner, thread_id,
+                       "Thread ID of blocking thread should match 'owner' field in stackshot");
+       T_EXPECT_EQ(waitinfo.context, (uint64_t)&mtx,
+                       "Userspace address of mutex should match 'context' field in stackshot");
+}
+
+static void
+test_pthread_rwlck_blocking(void)
+{
+       int ret = 0;
+       void *stackshot = NULL;
+       pthread_t tid;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       pthread_rwlock_t rwlck = PTHREAD_RWLOCK_INITIALIZER;
+       int len = 1;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       pthread_rwlock_wrlock(&rwlck);
+       ret = pthread_create(&tid, NULL, pthread_rwlck_blocking_thread, (void*)&rwlck);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread rwlck blocking thread");
+       sleep(2);
+
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       check_python(stackshot, "thread \\d+: pthread rwlock %llx for reading", (long long)&rwlck);
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadRWLockRead,
+                       "Wait type should match expected PThreadRWLockRead value");
+       stackshot_config_dealloc(stackshot);
+
+       pthread_rwlock_unlock(&rwlck);
+       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
+       T_EXPECT_EQ(waitinfo.context, (uint64_t)&rwlck,
+                       "Userspace address of rwlck should match 'context' field in stackshot");
+}
+
+
+
+static void
+test_pthread_cond_blocking(void)
+{
+       int ret = 0;
+       void *stackshot = NULL;
+       pthread_t tid;
+       pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       int len = 1;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       ret = pthread_create(&tid, NULL, pthread_cond_blocking_thread, (void*)&cond);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread condvar blocking thread");
+       sleep(2);
+
+       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+
+       check_python(stackshot, "thread \\d+: pthread condvar %llx", (long long)&cond);
+
+       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadCondVar,
+                       "Wait type should match expected PThreadCondVar value");
+       stackshot_config_dealloc(stackshot);
+
+       pthread_cond_signal(&cond);
+       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
+       T_EXPECT_EQ(waitinfo.context, (uint64_t)&cond,
+                       "Userspace address of condvar should match 'context' field in stackshot");
+       pthread_cond_destroy(&cond);
+}
+
+static void
+test_waitpid_blocking(void)
+{
+       int ret = 0;
+       pid_t pid = 0;
+       void *stackshot = NULL;
+       struct stackshot_thread_waitinfo waitinfo = { 0 };
+       int len = 1;
+       pthread_t tid;
+
+       T_LOG("Starting %s", __FUNCTION__);
+       if ((pid = fork()) == 0) {
+               pause();
+       } else {
+               T_ASSERT_POSIX_SUCCESS(pid, "Running in parent. Child pid is %d", pid);
+
+               sleep(1); // allow enough time for child to run & sleep
+               ret = pthread_create(&tid, NULL, waitpid_blocking_thread, (void*)pid);
+               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating waitpid blocking thread");
+
+               sleep(1); // allow enough time for reaping thread to waitpid & block
+               stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
+               find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
+               T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
+               T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitOnProcess,
+                               "Wait type should match expected WaitOnProcess value");
+
+               check_python(stackshot, "thread \\d+: waitpid, for pid %d", (int)pid);
+
+               stackshot_config_dealloc(stackshot);
+               T_EXPECT_EQ(waitinfo.owner, pid,
+                       "Process ID of blocking process should match 'owner' field in stackshot");
+
+               ret = kill(pid, SIGUSR1); // wake up child so waitpid thread can reap it & exit
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Send SIGUSR1 to child process");
+               ret = pthread_join(tid, NULL);
+               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Join on waitpid thread");
+       }
+}
+
+/*
+ *
+ * Test declarations
+ *
+ */
+
+T_DECL(stackshot_block_owner_klocks, "tests stackshot block owner for kernel locks") {
+       /* check to see if kmutex sysctl exists before running kmutex test */
+       if (kmutex_action(KMUTEX_SYSCTL_CHECK_EXISTS))
+               test_kmutex_blocking();
+       /* check to see if krwlck sysctl exists before running krwlck test */
+       if (krwlck_action(KRWLCK_SYSCTL_CHECK_EXISTS))
+               test_krwlock_blocking();
+       test_ulock_blocking();
+}
+
+T_DECL(stackshot_block_owner_pthread_mutex, "tests stackshot block owner: pthread mutex") {
+       test_pthread_mutex_blocking();
+}
+
+T_DECL(stackshot_block_owner_pthread_rwlck, "tests stackshot block owner: pthread rw locks") {
+       test_pthread_rwlck_blocking();
+}
+
+T_DECL(stackshot_block_owner_pthread_condvar, "tests stackshot block owner: pthread condvar") {
+       test_pthread_cond_blocking();
+}
+
+T_DECL(stackshot_block_owner_semaphore, "tests stackshot block owner: semaphore") {
+       test_semaphore_blocking();
+}
+
+T_DECL(stackshot_block_owner_mach_msg, "tests stackshot block owner: mach messaging") {
+       test_mach_msg_blocking();
+}
+
+T_DECL(stackshot_block_owner_waitpid, "tests stackshot block owner: waitpid") {
+       test_waitpid_blocking();
+}
diff --git a/tests/stackshot_idle_25570396.m b/tests/stackshot_idle_25570396.m
new file mode 100644 (file)
index 0000000..87ec2d0
--- /dev/null
@@ -0,0 +1,264 @@
+/* This program tests that kThreadIdleWorker is being set properly, so
+ * that idle and active threads can be appropriately identified.
+ */
+
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <kdd.h>
+#include <kern/kcdata.h>
+#include <kern/debug.h>
+#include <mach/mach_init.h>
+#include <mach/mach_traps.h>
+#include <mach/semaphore.h>
+#include <mach/task.h>
+#include <pthread.h>
+#include <sys/stackshot.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <Foundation/Foundation.h>
+
+#define NUMRETRIES  5  // number of times to retry a stackshot
+#define NUMENQUEUES 16 // number of blocking jobs to enqueue
+#define NUMTHREADS  (NUMENQUEUES + 2) // total number of threads (the workers, plus the main thread and one spinning pthread)
+
+volatile static int spin_threads = 1;
+
+static void *
+take_stackshot(uint32_t extra_flags, uint64_t since_timestamp)
+{
+       void * stackshot;
+       int ret, retries;
+       uint32_t stackshot_flags = STACKSHOT_SAVE_LOADINFO |
+                                       STACKSHOT_GET_GLOBAL_MEM_STATS |
+                                       STACKSHOT_SAVE_IMP_DONATION_PIDS |
+                                       STACKSHOT_KCDATA_FORMAT;
+
+       if (since_timestamp != 0)
+               stackshot_flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
+
+       stackshot_flags |= extra_flags;
+
+       stackshot = stackshot_config_create();
+       T_ASSERT_NOTNULL(stackshot, "Allocating stackshot config");
+
+       ret = stackshot_config_set_flags(stackshot, stackshot_flags);
+       T_ASSERT_POSIX_ZERO(ret, "Setting flags on stackshot config");
+
+       ret = stackshot_config_set_pid(stackshot, getpid());
+       T_ASSERT_POSIX_ZERO(ret, "Setting target pid on stackshot config");
+
+       if (since_timestamp != 0) {
+               ret = stackshot_config_set_delta_timestamp(stackshot, since_timestamp);
+               T_ASSERT_POSIX_ZERO(ret, "Setting prev snapshot time on stackshot config");
+       }
+
+       for (retries = NUMRETRIES; retries > 0; retries--) {
+               ret = stackshot_capture_with_config(stackshot);
+               T_ASSERT_TRUE(ret == 0 || ret == EBUSY || ret == ETIMEDOUT, "Attempting to take stackshot (error %d)...", ret);
+               if (retries == 1 && (ret == EBUSY || ret == ETIMEDOUT))
+                       T_ASSERT_FAIL("Failed to take stackshot after %d retries: got %d (%s)", NUMRETRIES, ret, strerror(ret));
+               if (ret == 0)
+                       break;
+       }
+       return stackshot;
+}
+
+static uint64_t get_stackshot_timestamp(void * stackshot)
+{
+       kcdata_iter_t iter;
+       void * buf;
+       uint64_t default_time = 0;
+       uint32_t t, buflen;
+
+       buf = stackshot_config_get_stackshot_buffer(stackshot);
+       T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
+       buflen = stackshot_config_get_stackshot_size(stackshot);
+
+       iter = kcdata_iter(buf, buflen);
+       t    = kcdata_iter_type(iter);
+
+       T_ASSERT_TRUE(t == KCDATA_BUFFER_BEGIN_STACKSHOT || t == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+               "Making sure stackshot data begins with \"begin\" flag");
+       T_ASSERT_TRUE(kcdata_iter_valid(iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME)),
+               "Getting stackshot timestamp");
+       default_time = *(uint64_t *)kcdata_iter_payload(iter);
+       return default_time;
+}
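+
+/*
+ * Sketch of how this timestamp is consumed (see take_and_verify_delta_stackshot()
+ * below): the mach_absolute_time value recorded in a full stackshot becomes the
+ * baseline for a delta stackshot, which only reports what changed since then:
+ *
+ *     uint64_t since = get_stackshot_timestamp(full_stackshot);
+ *     void *delta_stackshot = take_stackshot(0, since); // adds STACKSHOT_COLLECT_DELTA_SNAPSHOT
+ */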
+
+static void
+get_thread_statuses(void * stackshot, int * num_idles, int * num_nonidles)
+{
+       void *buf;
+       uint32_t t, buflen;
+       uint64_t thread_snap_flags;
+       NSError *error = nil;
+       NSMutableDictionary *parsed_container, *parsed_threads;
+
+       *num_idles = 0;
+       *num_nonidles = 0;
+
+       buf = stackshot_config_get_stackshot_buffer(stackshot);
+       T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
+       buflen = stackshot_config_get_stackshot_size(stackshot);
+
+       kcdata_iter_t iter = kcdata_iter(buf, buflen);
+       T_ASSERT_TRUE(kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_STACKSHOT ||
+                       kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
+                       "Checking start of stackshot buffer");
+
+       iter = kcdata_iter_next(iter);
+       KCDATA_ITER_FOREACH(iter)
+       {
+               t = kcdata_iter_type(iter);
+
+               if (t != KCDATA_TYPE_CONTAINER_BEGIN) {
+                       continue;
+               }
+
+               if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
+                       continue;
+               }
+
+               parsed_container = parseKCDataContainer(&iter, &error);
+               T_ASSERT_TRUE(parsed_container && !error, "Parsing container");
+
+               parsed_threads = parsed_container[@"task_snapshots"][@"thread_snapshots"];
+               for (id th_key in parsed_threads) {
+                       /* check to see that tid matches expected idle status */
+                       thread_snap_flags = [parsed_threads[th_key][@"thread_snapshot"][@"ths_ss_flags"] unsignedLongLongValue];
+                       (thread_snap_flags & kThreadIdleWorker) ? (*num_idles)++ : (*num_nonidles)++;
+               }
+               [parsed_container release];
+       }
+
+}
+
+/* Dispatch NUMENQUEUES jobs to a concurrent queue that immediately wait on a
+ * shared semaphore. This should spin up plenty of threads! */
+static void
+warm_up_threadpool(dispatch_queue_t q)
+{
+       int i;
+       dispatch_semaphore_t thread_wait = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(thread_wait, "Initializing work queue semaphore");
+       dispatch_semaphore_t main_wait = dispatch_semaphore_create(0);
+       T_QUIET; T_ASSERT_NOTNULL(main_wait, "Initializing main thread semaphore");
+
+       for (i = 0; i < NUMENQUEUES; i++) {
+               dispatch_async(q, ^{
+                       dispatch_semaphore_wait(thread_wait, DISPATCH_TIME_FOREVER);
+                       dispatch_semaphore_signal(main_wait);
+               });
+       }
+
+       sleep(1); // give worker threads enough time to block
+
+       for (i = 0; i < NUMENQUEUES; i++) {
+               dispatch_semaphore_signal(thread_wait);
+               dispatch_semaphore_wait(main_wait, DISPATCH_TIME_FOREVER);
+       }
+
+       dispatch_release(thread_wait);
+       dispatch_release(main_wait);
+
+       // Give enough time for worker threads to go idle again
+       sleep(1);
+}
+
+/* Dispatch NUMENQUEUES jobs to a concurrent queue that spin in a tight loop.
+ * Isn't guaranteed to occupy every worker thread, but it's enough so
+ * that a thread will go from idle to nonidle.
+ */
+static void
+fill_threadpool_with_spinning(dispatch_queue_t q)
+{
+       int i;
+       for (i = 0; i < NUMENQUEUES; i++) {
+               dispatch_async(q, ^{
+                       while(spin_threads); // should now appear as non-idle in delta shot
+               });
+       }
+       sleep(1); // wait for jobs to enqueue
+}
+
+/* Take stackshot, count the number of idle and nonidle threads the stackshot records.
+ * Where this is called, there should be NUMENQUEUES idle threads (thanks to warm_up_threadpool)
+ * and 2 nonidle threads (the main thread, and the spinning pthread).
+ */
+static void
+take_and_verify_initial_stackshot(uint64_t * since_time)
+{
+       void *stackshot;
+       int num_init_idle_threads, num_init_nonidle_threads;
+
+       stackshot = take_stackshot(0, 0);
+       *since_time = get_stackshot_timestamp(stackshot);
+       get_thread_statuses(stackshot, &num_init_idle_threads, &num_init_nonidle_threads);
+
+       T_EXPECT_EQ(num_init_idle_threads, NUMENQUEUES,
+                       "Idle count of %d should match expected value of %d...",
+                       num_init_idle_threads, NUMENQUEUES);
+       T_EXPECT_EQ(num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES,
+                       "Non-idle count of %d should match expected value of %d...",
+                       num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES);
+       stackshot_config_dealloc(stackshot);
+}
+
+/* Take a stackshot and a delta stackshot, measuring what changed since the previous
+ * stackshot. Where this is called, the blocking jobs have been cleared from the work queue,
+ * and the work queue has NUMENQUEUES tight-spinning jobs on it. Make sure that
+ * no new idle threads appear in the delta, and make sure that the delta shot isn't
+ * ignoring the worker threads that have become active.
+ */
+static void
+take_and_verify_delta_stackshot(uint64_t since_time)
+{
+       void *stackshot;
+       void *delta_stackshot;
+
+       int num_delta_idles, num_delta_nonidles, num_curr_idles, num_curr_nonidles;
+
+       stackshot = take_stackshot(0, 0);
+       delta_stackshot = take_stackshot(0, since_time); /* Threads should appear in delta stackshot as non-idle */
+
+       get_thread_statuses(stackshot, &num_curr_idles, &num_curr_nonidles);
+       get_thread_statuses(delta_stackshot, &num_delta_idles, &num_delta_nonidles);
+
+       T_EXPECT_EQ(num_delta_idles, 0, "Making sure there are no idles in delta shot");
+       T_EXPECT_EQ(num_delta_nonidles + num_curr_idles, NUMTHREADS,
+                       "Making sure delta shot isn't ignoring newly active threads");
+       stackshot_config_dealloc(stackshot);
+       stackshot_config_dealloc(delta_stackshot);
+}
+
+static void *
+spinning_non_work_queue_thread(void * ignored)
+{
+       (void)ignored;
+       while(spin_threads);
+       return NULL;
+}
+
+T_DECL(stackshot_idle_25570396, "Tests that stackshot can properly recognize idle and non-idle threads", T_META_ASROOT(true))
+{
+       int ret;
+       uint64_t initial_stackshot_time;
+       pthread_t spinning_thread;
+       dispatch_queue_t q;
+
+       ret = pthread_create(&spinning_thread, NULL, spinning_non_work_queue_thread, NULL);
+       T_ASSERT_POSIX_ZERO(ret, "Spinning up non-work-queue thread");
+
+       q = dispatch_queue_create("com.apple.kernel.test.waiting_semaphores", DISPATCH_QUEUE_CONCURRENT);
+
+       warm_up_threadpool(q);
+       take_and_verify_initial_stackshot(&initial_stackshot_time);
+
+       fill_threadpool_with_spinning(q);
+       take_and_verify_delta_stackshot(initial_stackshot_time);
+
+       spin_threads = 0; /* pthread-made thread should now exit */
+       ret = pthread_join(spinning_thread, NULL);
+       T_ASSERT_POSIX_ZERO(ret, "Joining on non-work-queue thread");
+}
diff --git a/tests/stackshot_spawn_exit_stress.c b/tests/stackshot_spawn_exit_stress.c
new file mode 100644 (file)
index 0000000..2a0be2b
--- /dev/null
@@ -0,0 +1,131 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#include <dispatch/dispatch.h>
+#include <kern/debug.h>
+#include <libproc.h>
+#include <mach-o/dyld.h>
+#include <sys/syscall.h>
+#include <sys/stackshot.h>
+#include <spawn.h>
+
+T_GLOBAL_META(
+               T_META_NAMESPACE("xnu.stackshot"),
+               T_META_CHECK_LEAKS(false),
+               T_META_ASROOT(true)
+               );
+
+#if TARGET_OS_WATCH
+#define SPAWN_ITERATIONS 1999
+#elif TARGET_OS_IPHONE
+#define SPAWN_ITERATIONS 4999
+#else
+#define SPAWN_ITERATIONS 9999
+#endif
+
+#define REAP_INTERVAL 10
+
+static void* loop(__attribute__ ((unused)) void *arg) {
+       exit(0);
+}
+
+T_HELPER_DECL(spawn_children_helper, "spawn_children helper")
+{
+       pthread_t pthread;
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&pthread, NULL, loop, NULL), "pthread_create");
+
+       while (1) { ; }
+}
+
+static void
+take_stackshot(void)
+{
+       uint32_t stackshot_flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS |
+                               STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT);
+
+       void *config = stackshot_config_create();
+       T_QUIET; T_ASSERT_NOTNULL(config, "created stackshot config");
+
+       int ret = stackshot_config_set_flags(config, stackshot_flags);
+       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "set flags on stackshot config");
+
+       int retries_remaining = 5;
+
+retry:
+       ret = stackshot_capture_with_config(config);
+
+       if (ret == EBUSY || ret == ETIMEDOUT) {
+               if (retries_remaining > 0) {
+                       retries_remaining--;
+                       goto retry;
+               } else {
+                       T_QUIET; T_ASSERT_POSIX_ZERO(ret,
+                                       "called stackshot_capture_with_config (no retries remaining)");
+               }
+       } else {
+                T_QUIET; T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
+       }
+
+       ret = stackshot_config_dealloc(config);
+       T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
+}
+
+T_DECL(stackshot_spawn_exit, "tests taking many stackshots while children processes are spawning+exiting")
+{
+       char path[PATH_MAX];
+       uint32_t path_size = sizeof(path);
+       T_ASSERT_POSIX_ZERO(_NSGetExecutablePath(path, &path_size), "_NSGetExecutablePath");
+       char *args[] = { path, "-n", "spawn_children_helper", NULL };
+
+       dispatch_queue_t stackshot_queue = dispatch_queue_create("stackshot_queue", NULL);
+       dispatch_async(stackshot_queue, ^(void) {
+               int num_stackshots = 0;
+
+               while (1) {
+                       take_stackshot();
+                       num_stackshots++;
+                       if ((num_stackshots % 100) == 0) {
+                               T_LOG("completed %d stackshots", num_stackshots);
+                       }
+
+                       // Sleep between each stackshot
+                       usleep(100);
+               }
+       });
+
+       // <rdar://problem/39739547> META option for T_HELPER_DECL to not output test begin on start
+       posix_spawn_file_actions_t actions;
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(posix_spawn_file_actions_init(&actions), "create spawn actions");
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(posix_spawn_file_actions_addopen(&actions, STDOUT_FILENO, "/dev/null", O_WRONLY, 0),
+                       "redirect stdout of child to /dev/null");
+
+       int children_unreaped = 0, status;
+       for (int iterations_remaining = SPAWN_ITERATIONS; iterations_remaining > 0; iterations_remaining--) {
+               pid_t pid;
+
+               int sp_ret = posix_spawn(&pid, args[0], &actions, NULL, args, NULL);
+               T_QUIET; T_ASSERT_POSIX_ZERO(sp_ret, "spawned process '%s' with PID %d", args[0], pid);
+
+               children_unreaped++;
+
+               if (children_unreaped >= REAP_INTERVAL) {
+                       while (children_unreaped) {
+                               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(-1, &status, 0), "waitpid returned child pid");
+                               children_unreaped--;
+                       }
+               }
+
+               if ((iterations_remaining % 100) == 0) {
+                       T_LOG("spawned %d children thus far", (SPAWN_ITERATIONS - iterations_remaining));
+               }
+       }
+
+       while (children_unreaped) {
+               T_QUIET; T_ASSERT_POSIX_SUCCESS(waitpid(-1, &status, 0), "waitpid returned child pid");
+               children_unreaped--;
+       }
+}
diff --git a/tests/suspended_spawn_26184412.c b/tests/suspended_spawn_26184412.c
new file mode 100644 (file)
index 0000000..977e96d
--- /dev/null
@@ -0,0 +1,101 @@
+
+
+#include <darwintest.h>
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <assert.h>
+#include <signal.h>
+#include <spawn.h>
+#include <stdint.h>
+#include <sys/sysctl.h>
+#include <stdbool.h>
+#include <sysexits.h>
+#include <err.h>
+
+/*
+ * Test to validate that suspended-spawn DTRTs when a SIGKILL is received
+ * while the process is waiting for SIGCONT.
+ *
+ * Also test that suspended-spawn correctly looks like a SIGSTOP while it's suspended.
+ *
+ * <rdar://problem/26184412> posix_spawn non-exec with POSIX_SPAWN_START_SUSPENDED, then killing instead of SIGCONT-ing causes unkillable hung processes
+ */
+
+static void
+spawn_and_signal(int signal)
+{
+       /* do not buffer output to stdout */
+       setvbuf(stdout, NULL, _IONBF, 0);
+
+       int ret;
+       posix_spawnattr_t attr;
+
+       ret = posix_spawnattr_init(&attr);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
+
+       ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_START_SUSPENDED);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags");
+
+       char * const    prog = "/usr/bin/true";
+       char * const    argv_child[] = { prog, NULL };
+       pid_t           child_pid;
+       extern char   **environ;
+
+       ret = posix_spawn(&child_pid, prog, NULL, &attr, argv_child, environ);
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
+
+       printf("parent: spawned child with pid %d\n", child_pid);
+
+       ret = posix_spawnattr_destroy(&attr);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy");
+
+       int status = 0;
+       int waitpid_result = waitpid(child_pid, &status, WUNTRACED|WNOHANG);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+
+       T_ASSERT_EQ(WIFEXITED(status), 0, "before SIGCONT: must not have exited");
+       T_ASSERT_EQ(WIFSTOPPED(status), 1, "before SIGCONT: must be stopped");
+
+       printf("parent: sending signal %d to child process\n", signal);
+
+       ret = kill(child_pid, signal);
+       T_ASSERT_POSIX_SUCCESS(ret, "kill(signal)");
+
+       printf("parent: waiting for child process\n");
+
+       status = 0;
+       waitpid_result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
+
+       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
+
+       if (signal == SIGKILL) {
+               T_ASSERT_EQ(WIFSIGNALED(status), 1, "child should have exited due to signal");
+               T_ASSERT_EQ(WTERMSIG(status), SIGKILL, "child should have exited due to SIGKILL");
+       } else {
+               T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
+               T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
+       }
+
+       printf("wait returned with pid %d, status %d\n", waitpid_result, status);
+}
+
+T_DECL(suspended_spawn_continue, "Tests spawning a suspended process and continuing it", T_META_TIMEOUT(2))
+{
+       spawn_and_signal(SIGCONT);
+}
+
+T_DECL(suspended_spawn_kill, "Tests spawning a suspended process and killing it", T_META_TIMEOUT(2))
+{
+       spawn_and_signal(SIGKILL);
+}
+
diff --git a/tests/task_for_pid_entitlement.plist b/tests/task_for_pid_entitlement.plist
new file mode 100644 (file)
index 0000000..2398d67
--- /dev/null
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+        <key>com.apple.system-task-ports</key>
+        <true/>
+        <key>task_for_pid-allow</key>
+        <true/>
+</dict>
+</plist>
diff --git a/tests/task_info.c b/tests/task_info.c
new file mode 100644 (file)
index 0000000..c440036
--- /dev/null
@@ -0,0 +1,1135 @@
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <errno.h>
+#include <mach/mach.h>
+#include <mach/mach_error.h>
+#include <mach/policy.h>
+#include <mach/task_info.h>
+#include <mach/thread_info.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+/* *************************************************************************************
+ * Test the task_info API.
+ *
+ * This is a functional test of the following APIs:
+ * TASK_BASIC_INFO_32
+ * TASK_BASIC2_INFO_32
+ * TASK_BASIC_INFO_64
+ * TASK_BASIC_INFO_64_2
+ * TASK_POWER_INFO_V2
+ * TASK_FLAGS_INFO
+ * TASK_AFFINITY_TAG_INFO
+ * TASK_THREAD_TIMES_INFO
+ * TASK_ABSOLUTE_TIME_INFO
+ * <rdar://problem/22242021> Add tests to increase code coverage for the task_info API
+ * *************************************************************************************
+ */
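+
+/*
+ * All of these flavors share the same calling convention, sketched here
+ * (a sketch only; the count is in/out and the kernel may return fewer
+ * integers than were passed in):
+ *
+ *   task_basic_info_64_data_t info;
+ *   mach_msg_type_number_t count = TASK_BASIC_INFO_64_COUNT;
+ *   kern_return_t kr = task_info(mach_task_self(), TASK_BASIC_INFO_64,
+ *                                (task_info_t)&info, &count);
+ *   // passing a count smaller than the flavor requires is expected to
+ *   // fail with KERN_INVALID_ARGUMENT (the negative cases below)
+ */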
+#define TESTPHYSFOOTPRINTVAL 5
+#define CANARY 0x0f0f0f0f0f0f0f0fULL
+#if !defined(CONFIG_EMBEDDED)
+#define ABSOLUTE_MIN_USER_TIME_DIFF 150
+#define ABSOLUTE_MIN_SYSTEM_TIME_DIFF 300
+#endif
+
+enum info_kind { INFO_32, INFO_64, INFO_32_2, INFO_64_2, INFO_MACH, INFO_MAX };
+
+enum info_get { GET_SUSPEND_COUNT, GET_RESIDENT_SIZE, GET_VIRTUAL_SIZE, GET_USER_TIME, GET_SYS_TIME, GET_POLICY, GET_MAX_RES };
+
+/*
+ * This function uses CPU cycles by doing a factorial computation.
+ */
+static void do_factorial_task(void);
+
+void test_task_basic_info_32(void);
+void test_task_basic_info_64(void);
+void task_basic_info_32_debug(void);
+void task_basic2_info_32_warmup(void);
+static int is_development_kernel(void);
+void test_task_basic_info(enum info_kind kind);
+uint64_t info_get(enum info_kind kind, enum info_get get, void * data);
+
+T_DECL(task_vm_info, "tests task vm info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       kern_return_t err;
+       task_vm_info_data_t vm_info;
+
+       mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info return value !=0 for virtual_size\n");
+
+       T_EXPECT_NE(vm_info.phys_footprint, 0ULL, "task_info return value !=0 for phys_footprint\n");
+
+       /*
+        * Test the REV0 version of TASK_VM_INFO. It should not change the value of phys_footprint.
+        */
+
+       count                  = TASK_VM_INFO_REV0_COUNT;
+       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
+       vm_info.min_address    = CANARY;
+       vm_info.max_address    = CANARY;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV0_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV0_COUNT", count);
+
+       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev0 call does not return 0 for virtual_size");
+
+       T_EXPECT_EQ(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
+                   "task_info --rev0 call returned value %llu for vm_info.phys_footprint.  Expected %u since this value should not be "
+                   "modified by rev0",
+                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
+
+       T_EXPECT_EQ(vm_info.min_address, CANARY,
+                   "task_info --rev0 call returned value 0x%llx for vm_info.min_address. Expected 0x%llx since this value should not "
+                   "be modified by rev0",
+                   vm_info.min_address, CANARY);
+
+       T_EXPECT_EQ(vm_info.max_address, CANARY,
+                   "task_info --rev0 call returned value 0x%llx for vm_info.max_address. Expected 0x%llx since this value should not "
+                   "be modified by rev0",
+                   vm_info.max_address, CANARY);
+
+       /*
+        * Test the REV1 version of TASK_VM_INFO.
+        */
+
+       count                  = TASK_VM_INFO_REV1_COUNT;
+       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
+       vm_info.min_address    = CANARY;
+       vm_info.max_address    = CANARY;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV1_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV1_COUNT", count);
+
+       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev1 call does not return 0 for virtual_size");
+
+       T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
+                   "task_info --rev1 call returned value %llu for vm_info.phys_footprint.  Expected anything other than %u since this "
+                   "value should be modified by rev1",
+                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
+
+       T_EXPECT_EQ(vm_info.min_address, CANARY,
+                   "task_info --rev1 call returned value 0x%llx for vm_info.min_address. Expected 0x%llx since this value should not "
+                   "be modified by rev1",
+                   vm_info.min_address, CANARY);
+
+       T_EXPECT_EQ(vm_info.max_address, CANARY,
+                   "task_info --rev1 call returned value 0x%llx for vm_info.max_address. Expected 0x%llx since this value should not "
+                   "be modified by rev1",
+                   vm_info.max_address, CANARY);
+
+       /*
+        * Test the REV2 version of TASK_VM_INFO.
+        */
+
+       count                  = TASK_VM_INFO_REV2_COUNT;
+       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
+       vm_info.min_address    = CANARY;
+       vm_info.max_address    = CANARY;
+
+       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_EXPECT_EQ(count, TASK_VM_INFO_REV2_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV2_COUNT\n", count);
+
+       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev2 call does not return 0 for virtual_size\n");
+
+       T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
+                   "task_info --rev2 call returned value %llu for vm_info.phys_footprint.  Expected anything other than %u since this "
+                   "value should be modified by rev2",
+                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
+
+       T_EXPECT_NE(vm_info.min_address, CANARY,
+                   "task_info --rev2 call returned value 0x%llx for vm_info.min_address. Expected anything other than 0x%llx since "
+                   "this value should be modified by rev2",
+                   vm_info.min_address, CANARY);
+
+       T_EXPECT_NE(vm_info.max_address, CANARY,
+                   "task_info --rev2 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since "
+                   "this value should be modified by rev2",
+                   vm_info.max_address, CANARY);
+}
+
+T_DECL(host_debug_info, "tests host debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       mach_port_t host;
+       host_debug_info_internal_data_t debug_info;
+       mach_msg_type_number_t count = HOST_DEBUG_INFO_INTERNAL_COUNT;
+       host                         = mach_host_self();
+       err                          = host_info(host, HOST_DEBUG_INFO_INTERNAL, (host_info_t)&debug_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify host_info call succeeded");
+}
+
+T_DECL(task_debug_info, "tests task debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       task_debug_info_internal_data_t debug_info;
+
+       mach_msg_type_number_t count = TASK_DEBUG_INFO_INTERNAL_COUNT;
+
+       err = task_info(mach_task_self(), TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+}
+
+T_DECL(thread_debug_info, "tests thread debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       thread_debug_info_internal_data_t debug_info;
+
+       mach_msg_type_number_t count = THREAD_DEBUG_INFO_INTERNAL_COUNT;
+
+       err = thread_info(mach_thread_self(), THREAD_DEBUG_INFO_INTERNAL, (thread_info_t)&debug_info, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+}
+
+static void
+do_factorial_task()
+{
+       int number    = 20;
+       int factorial = 1;
+       int i;
+       for (i = 1; i <= number; i++) {
+               factorial *= i;
+       }
+
+       return;
+}
+
+T_DECL(task_thread_times_info, "tests task thread times info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       task_thread_times_info_data_t thread_times_info_data;
+       task_thread_times_info_data_t thread_times_info_data_new;
+       mach_msg_type_number_t count = TASK_THREAD_TIMES_INFO_COUNT;
+
+       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       do_factorial_task();
+
+       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data_new, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       /*
+        * The difference is observed to be less than 30 microseconds for user_time
+        * and less than 50 microseconds for system_time. This observation was done for over
+        * 1000 runs.
+        */
+
+       T_EXPECT_FALSE((thread_times_info_data_new.user_time.seconds - thread_times_info_data.user_time.seconds) != 0 ||
+                          (thread_times_info_data_new.system_time.seconds - thread_times_info_data.system_time.seconds) != 0,
+                      "verify the difference between thread times stays within the allowed limit");
+
+       /*
+        * This is a negative case.
+        */
+
+       count--;
+       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data, &count);
+       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
+}
+
+T_DECL(task_absolutetime_info, "tests task absolute time info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       uint64_t user_time_diff, system_time_diff;
+       task_absolutetime_info_data_t absolute_time_info_data;
+       task_absolutetime_info_data_t absolute_time_info_data_new;
+       mach_msg_type_number_t count = TASK_ABSOLUTETIME_INFO_COUNT;
+
+       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       do_factorial_task();
+
+       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data_new, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       user_time_diff   = absolute_time_info_data_new.total_user - absolute_time_info_data.total_user;
+       system_time_diff = absolute_time_info_data_new.total_system - absolute_time_info_data.total_system;
+
+#if !(defined(__arm__) || defined(__arm64__))
+       /*
+        * On embedded devices the difference is always zero.
+        * On non-embedded devices the difference occurs in this range. This was observed over ~10000 runs.
+        */
+
+       T_EXPECT_FALSE(user_time_diff < ABSOLUTE_MIN_USER_TIME_DIFF || system_time_diff < ABSOLUTE_MIN_SYSTEM_TIME_DIFF,
+                      "verify the absolute time differences meet the expected minimums");
+#endif
+
+       if (absolute_time_info_data.threads_user <= 0) {
+               int precise_time_val = 0;
+               size_t len           = sizeof(precise_time_val);
+
+               T_LOG("User threads time is zero. This should only happen rarely and when precise_user_time is off");
+
+               err = sysctlbyname("kern.precise_user_kernel_time", &precise_time_val, &len, NULL, 0);
+
+               T_EXPECT_POSIX_SUCCESS(err, "performing sysctl to check precise_user_time");
+
+               T_LOG("kern.precise_user_kernel_time val = %d", precise_time_val);
+
+               T_EXPECT_FALSE(precise_time_val, "user thread time should only be zero when precise_user_kernel_time is disabled");
+       } else {
+               T_PASS("task_info should return non-zero value for user threads time = %llu", absolute_time_info_data.threads_user);
+       }
+
+#if !(defined(__arm__) || defined(__arm64__))
+       /*
+        * On iOS, system threads are always zero. On OS X this value can be some large positive number.
+        * There is no real way to estimate the exact amount.
+        */
+       T_EXPECT_NE(absolute_time_info_data.threads_system, 0ULL,
+                   "task_info should return non-zero value for system threads time = %llu", absolute_time_info_data.threads_system);
+#endif
+
+       /*
+        * This is a negative case.
+        */
+       count--;
+       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data_new, &count);
+       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
+}
+
+T_DECL(task_affinity_tag_info, "tests task_affinity_tag_info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       task_affinity_tag_info_data_t affinity_tag_info_data;
+       mach_msg_type_number_t count = TASK_AFFINITY_TAG_INFO_COUNT;
+
+       err = task_info(mach_task_self(), TASK_AFFINITY_TAG_INFO, (task_info_t)&affinity_tag_info_data, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       /*
+        * The affinity is not set by default, hence expecting a zero value.
+        */
+       T_ASSERT_FALSE(affinity_tag_info_data.min != 0 || affinity_tag_info_data.max != 0,
+                      "task_info call returns non-zero min or max value");
+
+       /*
+       * This is a negative case.
+       */
+       count--;
+       err = task_info(mach_task_self(), TASK_AFFINITY_TAG_INFO, (task_info_t)&affinity_tag_info_data, &count);
+       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
+}
+
+T_DECL(task_flags_info, "tests task_flags_info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       task_flags_info_data_t flags_info_data;
+       mach_msg_type_number_t count = TASK_FLAGS_INFO_COUNT;
+
+       err = task_info(mach_task_self(), TASK_FLAGS_INFO, (task_info_t)&flags_info_data, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       /* Change for 32-bit arch possibility? */
+       T_ASSERT_EQ((flags_info_data.flags & (unsigned int)(~(TF_LP64 | TF_64B_DATA))), 0U,
+                   "task_info should only give out 64-bit addr/data flags");
+
+       /*
+        * This is a negative case.
+        */
+
+       count--;
+       err = task_info(mach_task_self(), TASK_FLAGS_INFO, (task_info_t)&flags_info_data, &count);
+       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
+}
+
+T_DECL(task_power_info_v2, "tests task_power_info_v2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       kern_return_t err;
+       task_power_info_v2_data_t power_info_data_v2;
+       task_power_info_v2_data_t power_info_data_v2_new;
+       mach_msg_type_number_t count = TASK_POWER_INFO_V2_COUNT;
+
+       sleep(1);
+
+       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       T_ASSERT_LE(power_info_data_v2.gpu_energy.task_gpu_utilisation, 0ULL,
+                   "verify task_info call shows zero GPU utilization for non-GPU task");
+
+       do_factorial_task();
+
+       /*
+        * Verify the cpu_energy parameters.
+        */
+       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2_new, &count);
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+#if !(defined(__arm__) || defined(__arm64__))
+       /*
+        * iOS does not have system_time.
+        */
+       T_ASSERT_GT(power_info_data_v2_new.cpu_energy.total_user, power_info_data_v2.cpu_energy.total_user,
+                   "task_info call returns valid user time");
+       T_ASSERT_GT(power_info_data_v2_new.cpu_energy.total_system, power_info_data_v2.cpu_energy.total_system,
+                   "task_info call returns valid system time");
+#endif
+
+       T_ASSERT_GE(power_info_data_v2.cpu_energy.task_interrupt_wakeups, 1ULL,
+                   "verify task_info call returns non-zero value for interrupt_wakeup (ret value = %llu)",
+                   power_info_data_v2.cpu_energy.task_interrupt_wakeups);
+
+#if !(defined(__arm__) || defined(__arm64__))
+       if (power_info_data_v2.cpu_energy.task_platform_idle_wakeups != 0) {
+               T_LOG("task_info call returned %llu for platform_idle_wakeup", power_info_data_v2.cpu_energy.task_platform_idle_wakeups);
+       }
+#endif
+
+       count = TASK_POWER_INFO_V2_COUNT_OLD;
+       err   = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
+
+       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
+
+       /*
+        * This is a negative case.
+        */
+       count--;
+       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
+
+       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API. Call "
+                           "returns errno %d:%s",
+                           err, mach_error_string(err));
+}
+
+T_DECL(test_task_basic_info_32, "tests TASK_BASIC_INFO_32", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       test_task_basic_info(INFO_32);
+}
+
+T_DECL(test_task_basic_info_32_2, "tests TASK_BASIC_INFO_32_2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       test_task_basic_info(INFO_32_2);
+}
+
+#if defined(__arm__) || defined(__arm64__)
+T_DECL(test_task_basic_info_64i_2, "tests TASK_BASIC_INFO_64_2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       test_task_basic_info(INFO_64_2);
+}
+#else
+T_DECL(test_task_basic_info_64, "tests TASK_BASIC_INFO_64", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       test_task_basic_info(INFO_64);
+}
+#endif /* defined(__arm__) || defined(__arm64__) */
+
+T_DECL(test_mach_task_basic_info, "tests MACH_TASK_BASIC_INFO", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
+{
+       test_task_basic_info(INFO_MACH);
+}
+
+void
+test_task_basic_info(enum info_kind kind)
+{
+#define BEFORE 0
+#define AFTER 1
+
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       task_info_t info_data[2];
+       task_basic_info_32_data_t basic_info_32_data[2];
+#if defined(__arm__) || defined(__arm64__)
+       task_basic_info_64_2_data_t basic_info_64_2_data[2];
+#else
+       task_basic_info_64_data_t basic_info_64_data[2];
+#endif /* defined(__arm__) || defined(__arm64__) */
+       mach_task_basic_info_data_t mach_basic_info_data[2];
+
+       kern_return_t kr;
+       mach_msg_type_number_t count;
+       task_flavor_t flavor = 0;
+       integer_t suspend_count;
+       uint64_t resident_size_diff;
+       uint64_t virtual_size_diff;
+
+       void * tmp_map = NULL;
+       pid_t child_pid;
+       mach_port_name_t child_task;
+       /*for dt_waitpid*/
+       int timeout     = 10; // change to max timeout
+       int exit_status = 0;
+
+       switch (kind) {
+       case INFO_32:
+       case INFO_32_2:
+               info_data[BEFORE] = (task_info_t)&basic_info_32_data[BEFORE];
+               info_data[AFTER]  = (task_info_t)&basic_info_32_data[AFTER];
+               count             = TASK_BASIC_INFO_32_COUNT;
+               flavor            = TASK_BASIC_INFO_32;
+
+               if (kind == INFO_32_2) {
+                       flavor = TASK_BASIC2_INFO_32;
+               }
+
+               break;
+#if defined(__arm__) || defined(__arm64__)
+       case INFO_64:
+               T_ASSERT_FAIL("invalid basic info kind");
+               break;
+
+       case INFO_64_2:
+               info_data[BEFORE] = (task_info_t)&basic_info_64_2_data[BEFORE];
+               info_data[AFTER]  = (task_info_t)&basic_info_64_2_data[AFTER];
+               count             = TASK_BASIC_INFO_64_2_COUNT;
+               flavor            = TASK_BASIC_INFO_64_2;
+               break;
+
+#else
+       case INFO_64:
+               info_data[BEFORE] = (task_info_t)&basic_info_64_data[BEFORE];
+               info_data[AFTER]  = (task_info_t)&basic_info_64_data[AFTER];
+               count             = TASK_BASIC_INFO_64_COUNT;
+               flavor            = TASK_BASIC_INFO_64;
+               break;
+
+       case INFO_64_2:
+               T_ASSERT_FAIL("invalid basic info kind");
+               break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+       case INFO_MACH:
+               info_data[BEFORE] = (task_info_t)&mach_basic_info_data[BEFORE];
+               info_data[AFTER]  = (task_info_t)&mach_basic_info_data[AFTER];
+               count             = MACH_TASK_BASIC_INFO_COUNT;
+               flavor            = MACH_TASK_BASIC_INFO;
+               break;
+       case INFO_MAX:
+       default:
+               T_ASSERT_FAIL("invalid basic info kind");
+               break;
+       }
+
+       kr = task_info(mach_task_self(), flavor, info_data[BEFORE], &count);
+
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info succeeded");
+
+       do_factorial_task();
+
+       /*
+        * Allocate virtual and resident memory.
+        */
+       tmp_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
+
+       T_WITH_ERRNO;
+       T_EXPECT_NE(tmp_map, MAP_FAILED, "verify mmap call is successful");
+
+       memset(tmp_map, 'm', PAGE_SIZE);
+
+       child_pid = fork();
+
+       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked");
+
+       if (child_pid == 0) {
+               /*
+                * This will suspend the child process.
+                */
+               kr = task_suspend(mach_task_self());
+               exit(kr);
+       }
+
+       /*
+        * Wait for the child process to suspend itself.
+        */
+       sleep(1);
+
+       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
+
+       /*
+        * Verify the suspend_count for child and resume it.
+        */
+
+       kr = task_info(child_task, flavor, info_data[AFTER], &count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       suspend_count = (integer_t)(info_get(kind, GET_SUSPEND_COUNT, info_data[AFTER]));
+       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct suspend_count");
+
+       kr = task_resume(child_task);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_resume succeeded");
+
+       /*
+        * reap kr from task_suspend call in child
+        */
+       if (dt_waitpid(child_pid, &exit_status, NULL, timeout)) {
+               T_ASSERT_MACH_SUCCESS(exit_status, "verify child task_suspend is successful");
+       } else {
+               T_FAIL("dt_waitpid failed");
+       }
+
+       kr = task_info(mach_task_self(), flavor, info_data[AFTER], &count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       resident_size_diff = info_get(kind, GET_RESIDENT_SIZE, info_data[AFTER]) - info_get(kind, GET_RESIDENT_SIZE, info_data[BEFORE]);
+       virtual_size_diff  = info_get(kind, GET_VIRTUAL_SIZE, info_data[AFTER]) - info_get(kind, GET_VIRTUAL_SIZE, info_data[BEFORE]);
+
+       /*
+        * INFO_32_2 gets the max resident size instead of the current resident size
+        * 32 KB tolerance built into test.  The returned value is generally between 0 and 16384
+        *
+        * max resident size is a discrete field in INFO_MACH, so it's handled differently
+        */
+       if (kind == INFO_32_2) {
+               T_EXPECT_EQ(resident_size_diff % 4096, 0ULL, "verify task_info returns valid max resident_size");
+               T_EXPECT_GE(resident_size_diff, 0ULL, "verify task_info returns non-negative max resident_size");
+               T_EXPECT_GE(virtual_size_diff, (unsigned long long)PAGE_SIZE, "verify task_info returns valid virtual_size");
+       } else {
+               T_EXPECT_GE(resident_size_diff, (unsigned long long)PAGE_SIZE, "task_info returns valid resident_size");
+               T_EXPECT_GE(virtual_size_diff, (unsigned long long)PAGE_SIZE, "task_info returns valid virtual_size");
+       }
+
+       if (kind == INFO_MACH) {
+               resident_size_diff = info_get(kind, GET_MAX_RES, info_data[AFTER]) - info_get(kind, GET_MAX_RES, info_data[BEFORE]);
+               T_EXPECT_EQ(resident_size_diff % 4096, 0ULL, "verify task_info returns valid max resident_size");
+               T_EXPECT_GE(resident_size_diff, 0ULL, "verify task_info returns non-negative max resident_size");
+               T_EXPECT_GE(info_get(kind, GET_MAX_RES, info_data[AFTER]), info_get(kind, GET_RESIDENT_SIZE, info_data[AFTER]),
+                           "verify max resident size is greater than or equal to curr resident size");
+       }
+
+       do_factorial_task();
+
+       /*
+        * These counters give time for threads that have terminated. We don't have any, so expect zero.
+        */
+
+       time_value_t * user_tv = (time_value_t *)(info_get(kind, GET_USER_TIME, info_data[BEFORE]));
+       T_EXPECT_EQ((user_tv->seconds + user_tv->microseconds / 1000000), 0, "verify task_info shows valid user time");
+
+       time_value_t * sys_tv = (time_value_t *)(info_get(kind, GET_SYS_TIME, info_data[BEFORE]));
+       T_EXPECT_EQ(sys_tv->seconds + (sys_tv->microseconds / 1000000), 0, "verify task_info shows valid system time");
+
+       /*
+        * The default value for non-kernel tasks is TIMESHARE.
+        */
+
+       policy_t pt = (policy_t)info_get(kind, GET_POLICY, info_data[BEFORE]);
+
+       T_EXPECT_EQ(pt, POLICY_TIMESHARE, "verify task_info shows valid policy");
+
+       /*
+        * This is a negative case.
+        */
+
+       count--;
+       kr = task_info(mach_task_self(), flavor, info_data[AFTER], &count);
+
+       T_ASSERT_MACH_ERROR(kr, KERN_INVALID_ARGUMENT,
+                           "Negative test case: task_info should verify that count is at least equal to what is defined in API");
+
+       /*
+        * deallocate memory
+        */
+       munmap(tmp_map, PAGE_SIZE);
+
+       return;
+
+#undef BEFORE
+#undef AFTER
+}
+
+T_DECL(test_sigcont_task_suspend_resume,
+       "test to verify that SIGCONT on task_suspend()-ed process works",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       mach_task_basic_info_data_t mach_basic_info_data;
+       task_info_t info_data = (task_info_t)&mach_basic_info_data;
+
+       task_debug_info_internal_data_t debug_info;
+       mach_msg_type_number_t debug_count = TASK_DEBUG_INFO_INTERNAL_COUNT;
+
+       kern_return_t kr;
+       int posix_ret;
+       mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
+       task_flavor_t flavor         = MACH_TASK_BASIC_INFO;
+       integer_t suspend_count;
+       integer_t debug_suspend_count;
+       pid_t child_pid = 0;
+       mach_port_name_t child_task;
+       /*for dt_waitpid*/
+       int timeout     = 5;
+       int exit_status = 0;
+       int signal_no   = 0;
+
+       child_pid = fork();
+
+       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked");
+
+       if (child_pid == 0) {
+               /*
+                * This will suspend the child process.
+                */
+               kr = task_suspend(mach_task_self());
+
+               /*
+                * When child resumes, it exits immediately
+                */
+
+               exit(kr);
+       }
+
+       /*
+        * Wait for the child process to suspend itself.
+        */
+       sleep(1);
+
+       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
+
+       /*
+        * Verify the suspend_count for child and resume it.
+        */
+
+       kr = task_info(child_task, flavor, info_data, &count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
+       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct suspend_count (1) (actually the user stop count)");
+
+       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       debug_suspend_count = debug_info.suspend_count;
+       T_ASSERT_EQ(debug_info.suspend_count, 1, "verify debug_info shows correct suspend_count(1)");
+
+       posix_ret = kill(child_pid, SIGCONT);
+       T_ASSERT_POSIX_SUCCESS(posix_ret, "verify signal call succeeded");
+
+       /*
+        * reap kr from task_suspend call in child
+        */
+       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
+
+       T_ASSERT_EQ(signal_no, 0, "child should be resumed and exit without signal");
+       T_ASSERT_EQ(exit_status, 0, "child should exit with 0");
+
+}
+
+T_DECL(test_sigcont_task_suspend2_resume,
+       "test to verify that SIGCONT on task_suspend2()-ed process doesn't work",
+       T_META_ASROOT(true),
+       T_META_LTEPHASE(LTE_POSTINIT))
+{
+       T_SETUPBEGIN;
+       int is_dev = is_development_kernel();
+       T_QUIET;
+       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
+       T_SETUPEND;
+
+       mach_task_basic_info_data_t mach_basic_info_data;
+       task_info_t info_data = (task_info_t)&mach_basic_info_data;
+
+       task_debug_info_internal_data_t debug_info;
+       mach_msg_type_number_t debug_count = TASK_DEBUG_INFO_INTERNAL_COUNT;
+
+       kern_return_t kr;
+       int posix_ret;
+       mach_msg_type_number_t count  = MACH_TASK_BASIC_INFO_COUNT;
+       task_flavor_t flavor          = MACH_TASK_BASIC_INFO;
+       integer_t suspend_count       = 0;
+       integer_t debug_suspend_count = 0;
+       pid_t child_pid               = 0;
+       mach_port_name_t child_task;
+       task_suspension_token_t child_token = 0xFFFFF;
+
+       /*
+        * for dt_waitpid
+        * We expect the test to fail right now, so I've set timeout to
+        * be shorter than we may want it to be when the issue is fixed
+        */
+       int timeout     = 1;
+       int exit_status = 0;
+       int signal_no   = 0;
+
+       /* for pipe */
+       int fd[2];
+       pipe(fd);
+       int pipe_msg = 0;
+
+       child_pid = fork();
+
+       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked %d", child_pid);
+
+       if (child_pid == 0) {
+               close(fd[1]);
+               T_LOG("Waiting to read from parent...");
+               read(fd[0], &pipe_msg, sizeof(pipe_msg));
+               T_LOG("Done reading from parent, about to exit...");
+               exit(0);
+       }
+       /*
+        * Wait for child to fork and block on read
+        */
+       sleep(1);
+
+       close(fd[0]);
+
+       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
+
+       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       debug_suspend_count = debug_info.suspend_count;
+       T_EXPECT_EQ(debug_suspend_count, 0, "verify debug_info shows correct (true) suspend_count(0)");
+
+       kr = task_suspend2(child_task, &child_token);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_suspend2 call succeeded");
+
+       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       debug_suspend_count = debug_info.suspend_count;
+       T_ASSERT_EQ(debug_suspend_count, 1, "verify debug_info shows correct (true) suspend_count(1)");
+
+       /*
+        * Verify the suspend_count for child and resume it.
+        */
+
+       kr = task_info(child_task, flavor, info_data, &count);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
+       T_EXPECT_EQ(suspend_count, 1, "verify task_info shows correct (user_stop_count) suspend_count (1)");
+
+       posix_ret = kill(child_pid, SIGCONT);
+       T_ASSERT_POSIX_SUCCESS(posix_ret, "verify signal call succeeded");
+
+       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
+       T_EXPECT_MACH_SUCCESS(kr, "verify task_info call succeeded");
+
+       debug_suspend_count = debug_info.suspend_count;
+       T_EXPECTFAIL_WITH_RADAR(33166654);
+       T_EXPECT_EQ(debug_suspend_count, 1, "verify debug_info shows correct (true) suspend_count (1)");
+
+       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
+       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct (user_stop_count) suspend_count (1) after SIGCONT");
+
+       kr = task_resume(child_task);
+       T_EXPECTFAIL_WITH_RADAR(33166654);
+       T_EXPECT_MACH_SUCCESS(kr, "verify task_resume succeeded");
+
+       /*
+        * reap kr from task_suspend call in child
+        */
+
+       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
+
+       T_ASSERT_EQ(signal_no, SIG_DT_TIMEOUT, "dt_waitpid timed out as expected");
+
+       // Resume properly using token and then wait
+
+       kr = task_resume2(child_token);
+       T_EXPECTFAIL_WITH_RADAR(33166654);
+       T_ASSERT_MACH_SUCCESS(kr, "verify task_resume2 succeeded");
+
+       write(fd[1], &pipe_msg, sizeof(pipe_msg));
+
+       /*
+        * reap kr from task_suspend call in child
+        */
+       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
+
+       T_ASSERT_EQ(signal_no, 0, "child should be resumed and no signal should be returned");
+       T_ASSERT_EQ(exit_status, 0, "child should exit with 0");
+
+}
+
+uint64_t
+info_get(enum info_kind kind, enum info_get get, void * data)
+{
+       switch (get) {
+       case GET_SUSPEND_COUNT:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t)(((task_basic_info_32_t)data)->suspend_count);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t)(((task_basic_info_64_2_t)data)->suspend_count);
+#else
+               case INFO_64:
+                       return (uint64_t)(((task_basic_info_64_t)data)->suspend_count);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t)(((mach_task_basic_info_t)data)->suspend_count);
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_RESIDENT_SIZE:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t)(((task_basic_info_32_t)data)->resident_size);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t)(((task_basic_info_64_2_t)data)->resident_size);
+#else
+               case INFO_64:
+                       return (uint64_t)(((task_basic_info_64_t)data)->resident_size);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t)(((mach_task_basic_info_t)data)->resident_size);
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_VIRTUAL_SIZE:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t)(((task_basic_info_32_t)data)->virtual_size);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t)(((task_basic_info_64_2_t)data)->virtual_size);
+#else
+               case INFO_64:
+                       return (uint64_t)(((task_basic_info_64_t)data)->virtual_size);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t)(((mach_task_basic_info_t)data)->virtual_size);
+
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_USER_TIME:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t) & (((task_basic_info_32_t)data)->user_time);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t) & (((task_basic_info_64_2_t)data)->user_time);
+#else
+               case INFO_64:
+                       return (uint64_t) & (((task_basic_info_64_t)data)->user_time);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t) & (((mach_task_basic_info_t)data)->user_time);
+
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_SYS_TIME:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t) & (((task_basic_info_32_t)data)->system_time);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t) & (((task_basic_info_64_2_t)data)->system_time);
+#else
+               case INFO_64:
+                       return (uint64_t) & (((task_basic_info_64_t)data)->system_time);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t) & (((mach_task_basic_info_t)data)->system_time);
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_POLICY:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+                       return (uint64_t)(((task_basic_info_32_t)data)->policy);
+#if defined(__arm__) || defined(__arm64__)
+               case INFO_64:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+
+               case INFO_64_2:
+                       return (uint64_t)(((task_basic_info_64_2_t)data)->policy);
+#else
+               case INFO_64:
+                       return (uint64_t)(((task_basic_info_64_t)data)->policy);
+
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+                       break;
+#endif /* defined(__arm__) || defined(__arm64__) */
+               case INFO_MACH:
+                       return (uint64_t)(((mach_task_basic_info_t)data)->policy);
+
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       case GET_MAX_RES:
+               switch (kind) {
+               case INFO_32:
+               case INFO_32_2:
+               case INFO_64:
+               case INFO_64_2:
+                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
+               case INFO_MACH:
+                       return (uint64_t)(((mach_task_basic_info_t)data)->resident_size_max);
+               case INFO_MAX:
+               default:
+                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
+               }
+       }
+
+       __builtin_unreachable();
+}
+
+/*
+ * Determines whether we're running on a development kernel
+ */
+static int
+is_development_kernel(void)
+{
+#define NOTSET -1
+
+       static int is_dev = NOTSET;
+
+       if (is_dev == NOTSET) {
+               int dev;
+               size_t dev_size = sizeof(dev);
+
+               T_QUIET;
+               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, &dev_size, NULL, 0), NULL);
+               is_dev = (dev != 0);
+
+               return is_dev;
+       } else {
+               return is_dev;
+       }
+#undef NOTSET
+}
diff --git a/tests/task_info_28439149.c b/tests/task_info_28439149.c
new file mode 100644 (file)
index 0000000..9102ba6
--- /dev/null
@@ -0,0 +1,81 @@
+#include <darwintest.h>
+#include <mach/host_priv.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/processor_set.h>
+#include <mach/task.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+#include <mach-o/dyld.h>
+#include <mach-o/dyld_images.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+static void
+do_child(int *pipefd)
+{
+       int exit = 0;
+
+       close(pipefd[1]);
+       read(pipefd[0], &exit, sizeof(int));
+       T_QUIET; T_EXPECT_EQ_INT(exit, 1, "exit");
+       close(pipefd[0]);
+}
+
+T_DECL(task_info_28439149, "ensure that task_info has the correct permission",
+                T_META_CHECK_LEAKS(false), T_META_ASROOT(true))
+{
+       int pipefd[2];
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(pipefd), "pipe");
+
+       int pid = fork();
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork");
+
+       if (pid == 0) {
+               do_child(pipefd);
+               return;
+       }
+
+       close(pipefd[0]);
+
+       int exit;
+       mach_msg_type_number_t count;
+       struct task_basic_info_64 ti;
+       task_dyld_info_data_t di;
+
+       task_t self = mach_task_self();
+       task_t other_name;
+       task_t other;
+       int ret;
+
+       T_EXPECT_MACH_SUCCESS(task_for_pid(self, pid, &other), NULL);
+       T_EXPECT_MACH_SUCCESS(task_name_for_pid(self, pid, &other_name), NULL);
+
+       count = TASK_BASIC_INFO_64_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(self, TASK_BASIC_INFO_64, (task_info_t)&ti,
+                               &count), "task_info(self, TASK_BASIC_INFO_64 ...)");
+       count = TASK_BASIC_INFO_64_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(other, TASK_BASIC_INFO_64, (task_info_t)&ti,
+                               &count), "task_info(other, TASK_BASIC_INFO_64 ...)");
+       count = TASK_BASIC_INFO_64_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(other_name, TASK_BASIC_INFO_64, (task_info_t)&ti,
+                               &count), "task_info(other_name, TASK_BASIC_INFO_64 ...)");
+
+
+       count = TASK_DYLD_INFO_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(self, TASK_DYLD_INFO, (task_info_t)&di,
+                               &count), "task_info(self, TASK_DYLD_INFO ...)");
+       count = TASK_DYLD_INFO_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(other, TASK_DYLD_INFO, (task_info_t)&di,
+                               &count), "task_info(other, TASK_DYLD_INFO ...)");
+       count = TASK_DYLD_INFO_COUNT;
+       ret = task_info(other_name, TASK_DYLD_INFO, (task_info_t)&di, &count);
+       T_EXPECT_EQ_INT(ret, KERN_INVALID_ARGUMENT, "task_info TASK_DYLD_INFO should fail on a task name port");
+
+       exit = 1;
+       write(pipefd[1], &exit, sizeof(int));
+       close(pipefd[1]);
+
+       wait(NULL);
+}
+
diff --git a/tests/task_inspect.c b/tests/task_inspect.c
new file mode 100644 (file)
index 0000000..f16064a
--- /dev/null
@@ -0,0 +1,145 @@
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+
+#include <mach/host_priv.h>
+#include <mach/mach.h>
+#include <mach/mach_types.h>
+#include <mach/mach_vm.h>
+#include <mach/processor_set.h>
+#include <mach/task.h>
+#include <sys/sysctl.h>
+#include <unistd.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"));
+
+/*
+ * Attempt to inspect kernel_task using a task_inspect_t.  Interact with the
+ * kernel in the same way top(1) and lsmp(1) do.
+ */
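+
+/*
+ * A sketch of the enumeration performed below, the same calls top(1) uses
+ * to find kernel_task (which reports pid 0):
+ *
+ *   host_processor_sets(mach_host_self(), &psets, &pset_count);
+ *   host_processor_set_priv(mach_host_self(), psets[i], &pset);
+ *   processor_set_tasks(pset, &tasks, &task_count);
+ *   pid_for_task(tasks[j], &pid);   // kernel_task reports pid 0
+ */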
+
+static void
+check_secure_kernel(void)
+{
+       int secure_kern = 0;
+       size_t secure_kern_size = sizeof(secure_kern);
+
+       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern,
+                       &secure_kern_size, NULL, 0), NULL);
+
+       if (secure_kern) {
+               T_SKIP("secure kernel: processor_set_tasks will not return kernel_task");
+       }
+}
+
+static void
+attempt_kernel_inspection(task_t task)
+{
+       pid_t pid = (pid_t)-1;
+       mach_msg_type_number_t i, count, thcnt;
+       struct task_basic_info_64 ti;
+       thread_act_array_t threads;
+
+       T_QUIET;
+       T_EXPECT_MACH_SUCCESS(pid_for_task(task, &pid), NULL);
+       T_LOG("Checking pid %d", pid);
+
+       if (pid != 0) {
+               return;
+       }
+
+       T_LOG("found kernel_task, attempting to inspect");
+
+       count = TASK_BASIC_INFO_64_COUNT;
+       T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti,
+                                       &count), "task_info(... TASK_BASIC_INFO_64 ...)");
+
+       T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads");
+       T_LOG("Found %d kernel threads.", thcnt);
+       for (i = 0; i < thcnt; i++) {
+               kern_return_t kr;
+               thread_basic_info_data_t basic_info;
+               mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT;
+
+               kr = thread_info(threads[i], THREAD_BASIC_INFO,
+                               (thread_info_t)&basic_info, &bi_count);
+               /*
+                * Ignore threads that have gone away.
+                */
+               if (kr == MACH_SEND_INVALID_DEST) {
+                       T_LOG("ignoring thread that has been destroyed");
+                       continue;
+               }
+               T_EXPECT_MACH_SUCCESS(kr, "thread_info(... THREAD_BASIC_INFO ...)");
+               (void)mach_port_deallocate(mach_task_self(), threads[i]);
+       }
+       mach_vm_deallocate(mach_task_self(),
+                          (mach_vm_address_t)(uintptr_t)threads,
+                          thcnt * sizeof(*threads));
+
+       ipc_info_space_basic_t basic_info;
+       T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info");
+
+       ipc_info_space_t info_space;
+       ipc_info_name_array_t table;
+       ipc_info_tree_name_array_t tree;
+       mach_msg_type_number_t tblcnt = 0, treecnt = 0;
+       T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table,
+                                                  &tblcnt, &tree, &treecnt), "mach_port_space_info");
+       if (tblcnt > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                                  (mach_vm_address_t)(uintptr_t)table,
+                                  tblcnt * sizeof(*table));
+       }
+       if (treecnt > 0) {
+               mach_vm_deallocate(mach_task_self(),
+                                  (mach_vm_address_t)(uintptr_t)tree,
+                                  treecnt * sizeof(*tree));
+       }
+
+       T_END;
+}
+
+T_DECL(inspect_kernel_task,
+               "ensure that kernel task can be inspected",
+               T_META_CHECK_LEAKS(false),
+               T_META_ASROOT(true))
+{
+       processor_set_name_array_t psets;
+       processor_set_t pset;
+       task_array_t tasks;
+       mach_msg_type_number_t i, j, tcnt, pcnt = 0;
+       mach_port_t self = mach_host_self();
+
+       check_secure_kernel();
+
+       T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt),
+                       NULL);
+
+       for (i = 0; i < pcnt; i++) {
+               T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL);
+               T_LOG("Checking pset %d/%d", i, pcnt - 1);
+
+               tcnt = 0;
+               T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL);
+
+               for (j = 0; j < tcnt; j++) {
+                       attempt_kernel_inspection(tasks[j]);
+                       mach_port_deallocate(self, tasks[j]);
+               }
+
+               /* free tasks array */
+               mach_vm_deallocate(mach_task_self(),
+                                  (mach_vm_address_t)(uintptr_t)tasks,
+                                  tcnt * sizeof(*tasks));
+               mach_port_deallocate(mach_task_self(), pset);
+               mach_port_deallocate(mach_task_self(), psets[i]);
+       }
+       mach_vm_deallocate(mach_task_self(),
+                          (mach_vm_address_t)(uintptr_t)psets,
+                          pcnt * sizeof(*psets));
+
+       T_FAIL("could not find kernel_task in list of tasks returned");
+}
diff --git a/tests/task_inspect.entitlements b/tests/task_inspect.entitlements
new file mode 100644 (file)
index 0000000..eaaf1de
--- /dev/null
@@ -0,0 +1,10 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.system-task-ports</key>
+       <true/>
+       <key>task_for_pid-allow</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/telemetry.c b/tests/telemetry.c
new file mode 100644 (file)
index 0000000..ab45d14
--- /dev/null
@@ -0,0 +1,185 @@
+#include <darwintest.h>
+#include <dispatch/dispatch.h>
+#include <ktrace/ktrace.h>
+#include <kern/debug.h>
+#include <sys/kdebug.h>
+#include <TargetConditionals.h>
+
+enum telemetry_pmi {
+       TELEMETRY_PMI_NONE,
+       TELEMETRY_PMI_INSTRS,
+       TELEMETRY_PMI_CYCLES,
+};
+#define TELEMETRY_CMD_PMI_SETUP 3
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging.telemetry"),
+               T_META_CHECK_LEAKS(false),
+               T_META_ASROOT(true));
+
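+/* Private telemetry syscall; there is no public prototype, so declare the stub here. */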
+extern int __telemetry(uint64_t cmd, uint64_t deadline, uint64_t interval,
+               uint64_t leeway, uint64_t arg4, uint64_t arg5);
+
+static void
+telemetry_cleanup(void)
+{
+       int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_NONE, 0, 0, 0, 0);
+       T_EXPECT_POSIX_SUCCESS(ret, "telemetry(... NONE ...)");
+}
+
+volatile static bool spinning = true;
+static void *
+thread_spin(__unused void *arg)
+{
+       while (spinning) {
+       }
+       return NULL;
+}
+
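+/* Tracepoints to watch: the monotonic PMI event and microstackshot record events. */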
+#define MT_MICROSTACKSHOT KDBG_EVENTID(DBG_MONOTONIC, 2, 1)
+#define MS_RECORD MACHDBG_CODE(DBG_MACH_STACKSHOT, \
+               MICROSTACKSHOT_RECORD)
+#if defined(__arm64__) || defined(__arm__)
+#define INSTRS_PERIOD (100ULL * 1000 * 1000)
+#else /* defined(__arm64__) || defined(__arm__) */
+#define INSTRS_PERIOD (1ULL * 1000 * 1000 * 1000)
+#endif /* !defined(__arm64__) && !defined(__arm__) */
+#define SLEEP_SECS 10
+
+T_DECL(microstackshot_pmi, "attempt to configure microstackshots on PMI")
+{
+#if TARGET_OS_WATCH
+       T_SKIP("unsupported platform");
+#endif /* TARGET_OS_WATCH */
+
+       T_SETUPBEGIN;
+       ktrace_session_t s = ktrace_session_create();
+       T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "session create");
+
+       __block int pmi_events = 0;
+       __block int microstackshot_record_events = 0;
+       __block int pmi_records = 0;
+       __block int io_records = 0;
+       __block int interrupt_records = 0;
+       __block int timer_arm_records = 0;
+       __block int unknown_records = 0;
+       __block int multi_records = 0;
+
+       ktrace_events_single(s, MT_MICROSTACKSHOT, ^(__unused struct trace_point *tp) {
+               pmi_events++;
+       });
+       ktrace_events_single_paired(s, MS_RECORD,
+                       ^(struct trace_point *start, __unused struct trace_point *end) {
+               if (start->arg1 & kPMIRecord) {
+                       pmi_records++;
+               }
+               if (start->arg1 & kIORecord) {
+                       io_records++;
+               }
+               if (start->arg1 & kInterruptRecord) {
+                       interrupt_records++;
+               }
+               if (start->arg1 & kTimerArmingRecord) {
+                       timer_arm_records++;
+               }
+
+               const uint8_t any_record = kPMIRecord | kIORecord | kInterruptRecord |
+                               kTimerArmingRecord;
+               if ((start->arg1 & any_record) == 0) {
+                       unknown_records++;
+               }
+               if (__builtin_popcount(start->arg1 & any_record) != 1) {
+                       multi_records++;
+               }
+
+               microstackshot_record_events++;
+       });
+
+       ktrace_set_completion_handler(s, ^{
+               ktrace_session_destroy(s);
+               T_EXPECT_GT(pmi_events, 0,
+                               "saw non-zero PMIs (%g/sec)", pmi_events / (double)SLEEP_SECS);
+               T_EXPECT_GT(pmi_records, 0, "saw non-zero PMI record events (%g/sec)",
+                               pmi_records / (double)SLEEP_SECS);
+               T_EXPECT_EQ(unknown_records, 0, "saw zero unknown record events");
+               T_EXPECT_EQ(multi_records, 0, "saw zero multiple record events");
+               T_EXPECT_GT(microstackshot_record_events, 0,
+                               "saw non-zero microstackshot record events (%g/sec)",
+                               microstackshot_record_events / (double)SLEEP_SECS);
+
+               if (interrupt_records > 0) {
+                       T_LOG("saw %g interrupt records per second",
+                                       interrupt_records / (double)SLEEP_SECS);
+               } else {
+                       T_LOG("saw no interrupt records");
+               }
+               if (io_records > 0) {
+                       T_LOG("saw %g I/O records per second",
+                                       io_records / (double)SLEEP_SECS);
+               } else {
+                       T_LOG("saw no I/O records");
+               }
+               if (timer_arm_records > 0) {
+                       T_LOG("saw %g timer arming records per second",
+                                       timer_arm_records / (double)SLEEP_SECS);
+               } else {
+                       T_LOG("saw no timer arming records");
+               }
+
+               T_END;
+       });
+
+       T_SETUPEND;
+
+       /*
+        * Start sampling via telemetry on the instructions PMI.
+        */
+       int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS,
+                       INSTRS_PERIOD, 0, 0, 0);
+       if (ret < 0 && errno == EBUSY) {
+               T_PASS("telemetry is busy/active, maybe the events will be seen");
+       } else {
+               T_ASSERT_POSIX_SUCCESS(ret,
+                               "telemetry syscall succeeded, started microstackshots");
+               T_LOG("installing cleanup handler");
+               T_ATEND(telemetry_cleanup);
+       }
+
+       pthread_t thread;
+       int error = pthread_create(&thread, NULL, thread_spin, NULL);
+       T_ASSERT_POSIX_ZERO(error, "started thread to spin");
+
+       error = ktrace_start(s, dispatch_get_main_queue());
+       T_ASSERT_POSIX_ZERO(error, "started tracing");
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, SLEEP_SECS * NSEC_PER_SEC),
+                       dispatch_get_main_queue(), ^{
+               spinning = false;
+               ktrace_end(s, 0);
+               (void)pthread_join(thread, NULL);
+               T_LOG("ending trace session after %d seconds", SLEEP_SECS);
+       });
+
+       dispatch_main();
+}
+
+T_DECL(error_handling,
+               "ensure that error conditions for the telemetry syscall are observed")
+{
+       int ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS,
+                       1, 0, 0, 0);
+       T_EXPECT_EQ(ret, -1, "telemetry shouldn't allow PMI every instruction");
+
+       ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_INSTRS,
+                       1000 * 1000, 0, 0, 0);
+       T_EXPECT_EQ(ret, -1,
+                       "telemetry shouldn't allow PMI every million instructions");
+
+       ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_CYCLES,
+                       1, 0, 0, 0);
+       T_EXPECT_EQ(ret, -1, "telemetry shouldn't allow PMI every cycle");
+
+       ret = __telemetry(TELEMETRY_CMD_PMI_SETUP, TELEMETRY_PMI_CYCLES,
+                       1000 * 1000, 0, 0, 0);
+       T_EXPECT_EQ(ret, -1,
+                       "telemetry shouldn't allow PMI every million cycles");
+}
diff --git a/tests/thread_group_set_32261625.c b/tests/thread_group_set_32261625.c
new file mode 100644 (file)
index 0000000..1c7eb3f
--- /dev/null
@@ -0,0 +1,64 @@
+#include <darwintest.h>
+#include <ktrace.h>
+#include <sys/kdebug.h>
+
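+/* Arbitrary tracepoint ID emitted by the new thread so the trace session can spot it. */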
+#define TEST_EVENTID (0xfedcbb00)
+
+static void*
+newthread(void *arg)
+{
+#pragma unused(arg)
+       while (1) {
+               kdebug_trace(TEST_EVENTID, 0, 0, 0, 0);
+               sleep(1);
+       }
+}
+
+#define TEST_TIMEOUT (15 * NSEC_PER_SEC)
+
+T_DECL(thread_group_set, "Checks that new threads get a THREAD_GROUP_SET tracepoint with a non-zero tid") {
+       pthread_t thread;
+       __block int seen_new_thread = 0, seen_thread_group_set = 0;
+
+       ktrace_machine_t machine = ktrace_machine_create_current();
+       T_WITH_ERRNO; T_ASSERT_NOTNULL(machine, "ktrace_get_machine");
+
+       bool has_tg = false;
+       if (ktrace_machine_has_thread_groups(machine, &has_tg) || !has_tg) {
+               T_SKIP("thread groups not supported on this system");
+       }
+       ktrace_machine_destroy(machine);
+
+       ktrace_session_t session = ktrace_session_create();
+       T_WITH_ERRNO; T_ASSERT_NOTNULL(session, "ktrace_session_create");
+
+       ktrace_set_interactive(session);
+
+       ktrace_set_completion_handler(session, ^{
+               ktrace_session_destroy(session);
+               T_ASSERT_TRUE(seen_new_thread, "seen new thread tracepoint");
+               T_END;
+       });
+
+       ktrace_events_single(session, TEST_EVENTID, ^(__unused ktrace_event_t e) {
+               T_EXPECT_TRUE(seen_thread_group_set, "seen THREAD_GROUP_SET tracepoint");
+               seen_new_thread = 1;
+               ktrace_end(session, 1);
+       });
+
+       ktrace_events_single(session, MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_SET), ^(ktrace_event_t e) {
+               T_EXPECT_GT(e->arg3, (uintptr_t)0, "tid on THREAD_GROUP_SET");
+               seen_thread_group_set = 1;
+       });
+
+       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT), dispatch_get_main_queue(), ^{
+               ktrace_end(session, 0);
+       });
+
+       T_ASSERT_POSIX_SUCCESS(ktrace_start(session, dispatch_get_main_queue()), "ktrace_start");
+
+       T_EXPECT_POSIX_SUCCESS(pthread_create(&thread, NULL, newthread, NULL), "pthread_create");
+       T_EXPECT_POSIX_SUCCESS(pthread_detach(thread), "pthread_detach");
+
+       dispatch_main();
+}
diff --git a/tests/tty_hang.c b/tests/tty_hang.c
new file mode 100644 (file)
index 0000000..19dc4d2
--- /dev/null
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <util.h>
+#include <syslog.h>
+#include <termios.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+#include <darwintest_multiprocess.h>
+
+#define TEST_TIMEOUT   10
+
+/*
+ * Receiving SIGTTIN (from the blocked read) is the passing condition; we just
+ * catch it so that we don't get terminated when it is delivered.
+ */
+void
+handle_sigttin(int signal)
+{
+       return;
+}
+
+/*
+ * Because of the way dt_fork_helpers work, we have to ensure that any children
+ * created by this function call exit() themselves instead of falling into the
+ * fork helpers' exit handling.
+ */
+int
+get_new_session_and_terminal_and_fork_child_to_read(char *pty_name)
+{
+       int sock_fd[2];
+       int pty_fd;
+       pid_t pid;
+       char buf[10];
+
+       /*
+        * We use this to handshake certain actions between this process and its
+        * child.
+        */
+       T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fd),
+          NULL);
+       
+       /*
+        * New session, lose any existing controlling terminal and become
+        * session leader.
+        */
+       T_ASSERT_POSIX_SUCCESS(setsid(), NULL);
+       
+       /* now open pty, become controlling terminal of new session */
+       T_ASSERT_POSIX_SUCCESS(pty_fd = open(pty_name, O_RDWR), NULL);
+       
+       T_ASSERT_POSIX_SUCCESS(pid = fork(), NULL);
+
+       if (pid == 0) { /* child */
+               int pty_fd_child;
+               char buf[10];
+               
+               T_ASSERT_POSIX_SUCCESS(close(sock_fd[0]), NULL);
+               T_ASSERT_POSIX_SUCCESS(close(pty_fd), NULL);
+
+               /* Make a new process group for ourselves */
+               T_ASSERT_POSIX_SUCCESS(setpgid(0, 0), NULL);
+
+               T_ASSERT_POSIX_SUCCESS(pty_fd_child = open(pty_name, O_RDWR),
+                   NULL);
+
+               /* now let parent know we've done open and setpgid */
+               write(sock_fd[1], "done", sizeof("done"));
+
+               /* wait for parent to set us to the foreground process group */
+               read(sock_fd[1], buf, sizeof(buf));
+
+               /*
+                * We are the foreground process group now so we can read
+                * without getting a SIGTTIN.
+                *
+                * Once we are blocked though (we have a crude 1 second sleep on
+                * the parent to "detect" this), our parent is going to change
+                * us to be in the background.
+                *
+                * We'll be blocked until we get a signal; if that signal is
+                * SIGTTIN, the test has passed, otherwise it has failed.
+                */
+               signal(SIGTTIN, handle_sigttin);
+               (void)read(pty_fd_child, buf, sizeof(buf));
+               /*
+                * If we get here, we passed; any signal other than SIGTTIN
+                * would have terminated us before reaching this point.
+                */
+               exit(0);
+       }
+       
+       T_ASSERT_POSIX_SUCCESS(close(sock_fd[1]), NULL);
+       
+       /* wait for child to open slave side and set its pgid to its pid */
+       T_ASSERT_POSIX_SUCCESS(read(sock_fd[0], buf, sizeof(buf)), NULL);
+       
+       /*
+        * We need the following to happen, in the order shown:
+        *
+        * parent (pgid = pid)                  child (child_pgid = child_pid)
+        *
+        * 1 - tcsetpgrp(child_pgid)
+        * 2 -                                      block in read()
+        * 3 - tcsetpgrp(pgid)
+        *
+        * Making sure 2 happens after 1 is easy; we use a sleep(1) in the
+        * parent to try to ensure 3 happens after 2.
+        */
+
+       T_ASSERT_POSIX_SUCCESS(tcsetpgrp(pty_fd, pid), NULL);
+       
+       /* let child know you have set it to be the foreground process group */
+       T_ASSERT_POSIX_SUCCESS(write(sock_fd[0], "done", sizeof("done")), NULL);
+       
+       /*
+        * give it a second to do the read of the terminal in response.
+        *
+        * XXX : Find a way to detect that the child is blocked in read(2).
+        */
+       sleep(1);
+       
+       /*
+        * Now change the foreground process group to ourselves. Note that we
+        * are now in the background process group, so we need to ignore
+        * SIGTTOU for this call to succeed.
+        *
+        * Hopefully the child has gotten to run and blocked for read on the
+        * terminal in the 1 second we slept.
+        */
+       signal(SIGTTOU, SIG_IGN);
+       T_ASSERT_POSIX_SUCCESS(tcsetpgrp(pty_fd, getpid()), NULL);
+
+       return (0);
+}
+
+/*
+ * We're running in a "fork helper", so we can't do a waitpid on the helper's
+ * child directly because the fork helper unhelpfully hides the pid of the child
+ * and kills itself in its exit handling. We will instead fork first and wait on
+ * our own child. The test is checking for a deadlock in proc_exit of that child
+ * (caused by a background read in the "grandchild"): if the child is able to
+ * get past the hang and exit, the test passes, and if it doesn't, the test fails.
+ */
+void
+run_test(int do_revoke)
+{
+       int master_fd;
+       char *slave_pty;
+       pid_t pid;
+
+       T_WITH_ERRNO;
+       T_QUIET;
+
+       T_SETUPBEGIN;
+       
+       slave_pty = NULL;
+       T_ASSERT_POSIX_SUCCESS(master_fd = posix_openpt(O_RDWR | O_NOCTTY),
+           NULL);
+       (void)fcntl(master_fd, F_SETFL, O_NONBLOCK);
+       T_ASSERT_POSIX_SUCCESS(grantpt(master_fd), NULL);
+       T_ASSERT_POSIX_SUCCESS(unlockpt(master_fd), NULL);
+       slave_pty = ptsname(master_fd);
+       T_ASSERT_NOTNULL(slave_pty, NULL);
+       T_LOG("slave pty is %s\n", slave_pty);
+
+       T_SETUPEND;
+       
+       /*
+        * We get the stdin and stdout redirection but we don't have visibility
+        * into the child (nor can we wait for it). To get around that, we fork
+        * and only let the parent return to the caller; the child exits before
+        * it would return to the caller.
+        */
+       T_ASSERT_POSIX_SUCCESS(pid = fork(), NULL);
+       
+       if (pid == 0) { /* child */
+               T_ASSERT_POSIX_SUCCESS(close(master_fd), NULL);
+               get_new_session_and_terminal_and_fork_child_to_read(slave_pty);
+
+               /*
+                * These tests are for testing revoke and read hangs. The
+                * revoke can be explicit, via a revoke(2) system call (test 2),
+                * or happen as part of exit(2) of the session leader (test 1).
+                * The exit hang is the common hang and can be fixed
+                * independently, but fixing the revoke(2) hang requires us to
+                * make changes in the tcsetpgrp path (which also fixes the exit
+                * hang). In essence, we have 2 fixes: one which only addresses
+                * the exit hang and one which fixes both.
+                */
+               if (do_revoke) {
+                       /* This should not hang for the test to pass .. */
+                       T_ASSERT_POSIX_SUCCESS(revoke(slave_pty), NULL);
+               }
+               /*
+                * This child has the same dt_helper variables as its parent.
+                * The way dt_fork_helpers work, if we don't exit() from here
+                * we will end up killing the parent, so we have to exit() and
+                * not let the dt_fork_helpers continue.
+                * If we didn't do the revoke(2), this test passes if this exit
+                * doesn't hang waiting for its child to finish reading.
+                */
+               exit(0);
+       }
+
+       int status;
+       int sig;
+
+       dt_waitpid(pid, &status, &sig, 0);
+       if (sig) {
+               T_FAIL("Test failed because child received signal %s\n",
+                      strsignal(sig));
+       } else if (status) {
+               T_FAIL("Test failed because child exited with status %d\n",
+                      status);
+       } else {
+               T_PASS("test_passed\n");
+       }
+       /*
+        * We can let this process proceed with the regular darwintest process
+        * termination and cleanup.
+        */
+}
+
+
+/*************************** TEST 1 ********************************/
+T_HELPER_DECL(create_new_session_and_exit, "create_new_session_and_exit") {
+       run_test(0);
+}
+
+T_DECL(tty_exit_bgread_hang_test, "test for background read hang on ttys with proc exit")
+{
+       dt_helper_t helpers[1];
+       
+       helpers[0] = dt_fork_helper("create_new_session_and_exit");
+       dt_run_helpers(helpers, 1, TEST_TIMEOUT);
+}
+/***********************  END TEST 1  ********************************/
+
+/************************** TEST 2 ***********************************/
+T_HELPER_DECL(create_new_session_and_revoke_terminal, "create_new_session_and_revoke_terminal") {
+       run_test(1);
+}
+
+T_DECL(tty_revoke_bgread_hang_test, "test for background read hang on ttys with revoke")
+{
+       dt_helper_t helpers[1];
+       
+       helpers[0] = dt_fork_helper("create_new_session_and_revoke_terminal");
+       dt_run_helpers(helpers, 1, TEST_TIMEOUT);
+}
+/***********************  END TEST 2 *********************************/
+
diff --git a/tests/turnstile_multihop.c b/tests/turnstile_multihop.c
new file mode 100644 (file)
index 0000000..339cfe8
--- /dev/null
@@ -0,0 +1,813 @@
+/*
+ * turnstile_multihop: Tests turnstile and multi-hop priority propagation.
+ */
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+
+#include <dispatch/dispatch.h>
+#include <pthread.h>
+#include <launch.h>
+#include <mach/mach.h>
+#include <mach/message.h>
+#include <mach/mach_voucher.h>
+#include <pthread/workqueue_private.h>
+#include <voucher/ipc_pthread_priority_types.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <libkern/OSAtomic.h>
+#include <sys/wait.h>
+
+#include "turnstile_multihop_helper.h"
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstile_multihop"));
+
+#define HELPER_TIMEOUT_SECS (3000)
+
+static boolean_t spin_for_ever = false;
+
+static void
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *));
+static uint64_t
+nanoseconds_to_absolutetime(uint64_t nanoseconds);
+static int
+sched_create_load_at_qos(qos_class_t qos, void **load_token);
+static int
+sched_terminate_load(void *load_token) __unused;
+static void do_work(int num);
+static void
+dispatch_sync_cancel(mach_port_t owner_thread, qos_class_t promote_qos);
+
+static void *sched_load_thread(void *);
+
+struct load_token_context {
+       volatile int threads_should_exit;
+       int thread_count;
+       qos_class_t qos;
+       pthread_t *threads;
+};
+
+static struct mach_timebase_info sched_mti;
+static pthread_once_t sched_mti_once_control = PTHREAD_ONCE_INIT;
+
+static void sched_mti_init(void)
+{
+       mach_timebase_info(&sched_mti);
+}
+uint64_t
+nanoseconds_to_absolutetime(uint64_t nanoseconds)
+{
+       pthread_once(&sched_mti_once_control, sched_mti_init);
+
+       return (uint64_t)(nanoseconds * (((double)sched_mti.denom) / ((double)sched_mti.numer)));
+}
+
+static int
+sched_create_load_at_qos(qos_class_t qos, void **load_token)
+{
+       struct load_token_context *context = NULL;
+       int ret;
+       int ncpu;
+       size_t ncpu_size = sizeof(ncpu);
+       int nthreads;
+       int i;
+       pthread_attr_t attr;
+
+       ret = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
+       if (ret == -1) {
+               T_LOG("sysctlbyname(hw.ncpu)");
+               return errno;
+       }
+
+       T_QUIET; T_LOG("%s: Detected %d CPUs\n", __FUNCTION__, ncpu);
+
+       nthreads = ncpu;
+       T_QUIET; T_LOG("%s: Will create %d threads\n", __FUNCTION__, nthreads);
+
+       ret = pthread_attr_init(&attr);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_init");
+
+       if (&pthread_attr_set_qos_class_np) {
+               ret = pthread_attr_set_qos_class_np(&attr, qos, 0);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_set_qos_class_np");
+       }
+
+       context = calloc(1, sizeof(*context));
+       if (context == NULL) { T_QUIET; T_LOG("calloc returned error"); return ENOMEM; }
+
+       context->threads_should_exit = 0;
+       context->thread_count = nthreads;
+       context->qos = qos;
+       context->threads = calloc((unsigned int)nthreads, sizeof(pthread_t));
+
+       OSMemoryBarrier();
+
+       for (i=0; i < nthreads; i++) {
+               ret = pthread_create(&context->threads[i], &attr, sched_load_thread, context);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_create");
+               T_QUIET; T_LOG("%s: Created thread %d (%p)\n", __FUNCTION__, i, (void *)context->threads[i]);
+       }
+
+       ret = pthread_attr_destroy(&attr);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_attr_destroy");
+
+       *load_token = context;
+
+       return 0;
+}
+
+static void *
+sched_load_thread(void *arg)
+{
+       struct load_token_context *context = (struct load_token_context *)arg;
+
+       T_QUIET; T_LOG("%s: Thread started %p\n", __FUNCTION__, (void *)pthread_self());
+
+       while (!context->threads_should_exit) {
+               uint64_t start = mach_absolute_time();
+               uint64_t end = start + nanoseconds_to_absolutetime(900ULL * NSEC_PER_MSEC);
+
+               while ((mach_absolute_time() < end) && !context->threads_should_exit);
+       }
+
+       T_QUIET; T_LOG("%s: Thread terminating %p\n", __FUNCTION__, (void *)pthread_self());
+
+       return NULL;
+}
+
+static int
+sched_terminate_load(void *load_token)
+{
+       int ret;
+       int i;
+       struct load_token_context *context = (struct load_token_context *)load_token;
+
+       context->threads_should_exit = 1;
+       OSMemoryBarrier();
+
+       for (i=0; i < context->thread_count; i++) {
+               T_QUIET; T_LOG("%s: Joining thread %d (%p)\n", __FUNCTION__, i, (void *)context->threads[i]);
+               ret = pthread_join(context->threads[i], NULL);
+               T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_join");
+       }
+
+       free(context->threads);
+       free(context);
+
+       return 0;
+}
+
+
+// Find the first num primes, simply as a means of doing work
+static void do_work(int num)
+{
+       volatile int i = 3, count, c;
+
+       for (count = 2; count <= num; ) {
+               for (c = 2; c <= i; c++) {
+                       if (i % c == 0) {
+                               break;
+                       }
+               }
+               if (c == i) {
+                       count++;
+               }
+               i++;
+       }
+}
+
+#pragma mark pthread callbacks
+
+static void
+worker_cb(pthread_priority_t __unused priority)
+{
+       T_FAIL("a worker thread was created");
+}
+
+static void
+event_cb(void ** __unused events, int * __unused nevents)
+{
+       T_FAIL("a kevent routine was called instead of workloop");
+}
+
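+/* Read the user promotion base priority currently applied to this thread via THREAD_POLICY_STATE. */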
+static uint32_t
+get_user_promotion_basepri(void)
+{
+       mach_msg_type_number_t count = THREAD_POLICY_STATE_COUNT;
+       struct thread_policy_state thread_policy;
+       boolean_t get_default = FALSE;
+       mach_port_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       kern_return_t kr = thread_policy_get(thread_port, THREAD_POLICY_STATE,
+                       (thread_policy_t)&thread_policy, &count, &get_default);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_get");
+       return thread_policy.thps_user_promotion_basepri;
+}
+
+static int messages_received = 0;
+/*
+ * Basic workloop handler callback; it checks the
+ * effective QoS of the servicer thread.
+ */
+static void
+workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events)
+{
+       messages_received++;
+       T_LOG("Workloop handler workloop_cb_test_intransit called. Received message no %d",
+               messages_received);
+
+
+       /* Skip the test if we can't check Qos */
+       if (geteuid() != 0) {
+               T_SKIP("kevent_qos test requires root privileges to run.");
+       }
+
+       if (messages_received == 1) {
+
+               sleep(5);
+               T_LOG("Do some CPU work.");
+               do_work(5000);
+
+               /* Check if the override now is IN + 60 boost */
+               T_EXPECT_EFFECTIVE_QOS_EQ(QOS_CLASS_USER_INITIATED,
+                               "dispatch_source event handler QoS should be QOS_CLASS_USER_INITIATED");
+               T_EXPECT_EQ(get_user_promotion_basepri(), 60u,
+                               "dispatch_source event handler should be overridden at 60");
+
+               /* Enable the knote to get 2nd message */
+               struct kevent_qos_s *kev = *eventslist;
+               kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
+               kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
+                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
+                               MACH_RCV_VOUCHER);
+               *events = 1;
+       } else {
+               *events = 0;
+               exit(0);
+       }
+}
+
+static void
+run_client_server(const char *server_name, const char *client_name)
+{
+       dt_helper_t helpers[] = {
+               dt_launchd_helper_domain("com.apple.xnu.test.turnstile_multihop.plist",
+                               server_name, NULL, LAUNCH_SYSTEM_DOMAIN),
+               dt_fork_helper(client_name)
+       };
+       dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS);
+}
+
+#pragma mark Mach receive
+
+#define TURNSTILE_MULTIHOP_SERVICE_NAME "com.apple.xnu.test.turnstile_multihop"
+
+static mach_port_t
+get_server_port(void)
+{
+       mach_port_t port;
+       kern_return_t kr = bootstrap_check_in(bootstrap_port,
+                       TURNSTILE_MULTIHOP_SERVICE_NAME, &port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in");
+       return port;
+}
+
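+/* Build a mach voucher carrying an IPC pthread-priority attribute for the given QoS. */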
+static mach_voucher_t
+create_pthpriority_voucher(mach_msg_priority_t qos)
+{
+       char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)];
+
+       mach_voucher_t voucher = MACH_PORT_NULL;
+       kern_return_t ret;
+       ipc_pthread_priority_value_t ipc_pthread_priority_value =
+                       (ipc_pthread_priority_value_t)qos;
+
+       mach_voucher_attr_raw_recipe_array_t recipes;
+       mach_voucher_attr_raw_recipe_size_t recipe_size = 0;
+       mach_voucher_attr_recipe_t recipe =
+               (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size];
+
+       recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY;
+       recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE;
+       recipe->previous_voucher = MACH_VOUCHER_NULL;
+       memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value));
+       recipe->content_size = sizeof(ipc_pthread_priority_value_t);
+       recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size;
+
+       recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0];
+
+       ret = host_create_mach_voucher(mach_host_self(),
+                               recipes,
+                               recipe_size,
+                               &voucher);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher");
+       return voucher;
+}
+
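+/* Send a message at the given QoS; when a reply port is supplied, the send carries a sync override. */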
+static void
+send(
+       mach_port_t send_port,
+       mach_port_t reply_port,
+       mach_port_t msg_port,
+       mach_msg_priority_t qos,
+       mach_msg_option_t options)
+{
+       kern_return_t ret = 0;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+       } send_msg = {
+           .header = {
+                   .msgh_remote_port = send_port,
+                   .msgh_local_port  = reply_port,
+                   .msgh_bits        = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND,
+                                       reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0,
+                                       MACH_MSG_TYPE_MOVE_SEND,
+                                       MACH_MSGH_BITS_COMPLEX),
+                   .msgh_id          = 0x100,
+                   .msgh_size        = sizeof(send_msg),
+               },
+           .body = {
+                   .msgh_descriptor_count = 1,
+               },
+           .port_descriptor = {
+                   .name        = msg_port,
+                       .disposition = MACH_MSG_TYPE_MOVE_RECEIVE,
+                       .type        = MACH_MSG_PORT_DESCRIPTOR,
+               },
+       };
+
+       if (options & MACH_SEND_SYNC_USE_THRPRI) {
+               send_msg.header.msgh_voucher_port = create_pthpriority_voucher(qos);
+       }
+
+       if (msg_port == MACH_PORT_NULL) {
+               send_msg.body.msgh_descriptor_count = 0;
+       }
+
+       ret = mach_msg(&(send_msg.header),
+               MACH_SEND_MSG |
+               MACH_SEND_TIMEOUT |
+               MACH_SEND_OVERRIDE|
+               ((reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) | options),
+               send_msg.header.msgh_size,
+               0,
+               MACH_PORT_NULL,
+               10000,
+               0);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg");
+}
+
+static void
+receive(
+       mach_port_t rcv_port,
+       mach_port_t notify_port)
+{
+       kern_return_t ret = 0;
+
+       struct {
+               mach_msg_header_t header;
+               mach_msg_body_t body;
+               mach_msg_port_descriptor_t port_descriptor;
+       } rcv_msg = {
+           .header =
+               {
+                   .msgh_remote_port = MACH_PORT_NULL,
+                   .msgh_local_port  = rcv_port,
+                   .msgh_size        = sizeof(rcv_msg),
+               },
+       };
+
+       T_LOG("Client: Starting sync receive\n");
+
+       ret = mach_msg(&(rcv_msg.header),
+               MACH_RCV_MSG |
+               MACH_RCV_SYNC_WAIT,
+               0,
+               rcv_msg.header.msgh_size,
+               rcv_port,
+               0,
+               notify_port);
+}
+
+static lock_t lock_DEF;
+static lock_t lock_IN;
+static lock_t lock_UI;
+
+static mach_port_t main_thread_port;
+static mach_port_t def_thread_port;
+static mach_port_t in_thread_port;
+static mach_port_t ui_thread_port;
+static mach_port_t sixty_thread_port;
+
+static uint64_t dispatch_sync_owner;
+
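+/* Return the thread's current scheduling priority, including any boosts. */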
+static int get_pri(thread_t thread_port) {
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                          (thread_info_t)&extended_info, &count);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+       return extended_info.pth_curpri;
+}
+
+static void
+set_thread_name(const char *fn_name)
+{
+       char name[50] = "";
+
+       thread_t thread_port = pthread_mach_thread_np(pthread_self());
+
+       int pri = get_pri(thread_port);
+
+       snprintf(name, sizeof(name), "%s at pri %2d", fn_name, pri);
+       pthread_setname_np(name);
+}
+
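+/* Yield until the target thread is blocked (waiting) in the kernel. */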
+static void
+thread_wait_to_block(mach_port_t thread_port)
+{
+       thread_extended_info_data_t extended_info;
+       kern_return_t kr;
+
+       while (1) {
+               mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+               kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                                  (thread_info_t)&extended_info, &count);
+
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+               if (extended_info.pth_run_state == TH_STATE_WAITING) {
+                       T_LOG("Target thread blocked\n");
+                       break;
+               }
+               thread_switch(thread_port, SWITCH_OPTION_DEPRESS, 0);
+       }
+}
+
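+/* Yield to yield_thread until the target thread's priority reaches at least the given value. */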
+static void
+thread_wait_to_boost(mach_port_t thread_port, mach_port_t yield_thread, int priority)
+{
+       thread_extended_info_data_t extended_info;
+       kern_return_t kr;
+
+       while (1) {
+               mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+               kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                                  (thread_info_t)&extended_info, &count);
+
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+
+               if (extended_info.pth_priority >= priority) {
+                       T_LOG("Target thread boosted\n");
+                       break;
+               }
+               thread_switch(yield_thread, SWITCH_OPTION_DEPRESS, 0);
+       }
+}
+
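+/* Fake a dispatch sync wait on a workloop owned by owner_thread, using an EVFILT_WORKLOOP kevent with NOTE_WL_SYNC_WAIT | NOTE_WL_DISCOVER_OWNER. */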
+static void
+dispatch_sync_wait(mach_port_t owner_thread, qos_class_t promote_qos)
+{
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+       uint32_t fflags = 0;
+       uint64_t mask = 0;
+       uint16_t action = 0;
+       int r;
+
+       action = EV_ADD | EV_DISABLE;
+       fflags = NOTE_WL_SYNC_WAIT | NOTE_WL_DISCOVER_OWNER;
+
+       dispatch_sync_owner = owner_thread;
+
+       struct kevent_qos_s kev[] =  {{
+               .ident = mach_thread_self(),
+               .filter = EVFILT_WORKLOOP,
+               .flags = action,
+               .fflags = fflags,
+               .udata = (uintptr_t) &def_thread_port,
+               .qos = (int32_t)_pthread_qos_class_encode(promote_qos, 0, 0),
+               .ext[EV_EXTIDX_WL_MASK] = mask,
+               .ext[EV_EXTIDX_WL_VALUE] = dispatch_sync_owner,
+               .ext[EV_EXTIDX_WL_ADDR] = (uint64_t)&dispatch_sync_owner,
+       }};
+
+       /* Setup workloop to fake dispatch sync wait on a workloop */
+       r = kevent_id(30, kev, 1, kev_err, 1, NULL,
+                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+       T_QUIET; T_LOG("dispatch_sync_wait returned\n");
+}
+
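+/* Wake the faked dispatch sync waiter and end the workloop ownership with NOTE_WL_SYNC_WAKE | NOTE_WL_END_OWNERSHIP. */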
+static void
+dispatch_sync_cancel(mach_port_t owner_thread, qos_class_t promote_qos)
+{
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+       uint32_t fflags = 0;
+       uint64_t mask = 0;
+       uint16_t action = 0;
+       int r;
+
+       action = EV_DELETE | EV_ENABLE;
+       fflags = NOTE_WL_SYNC_WAKE | NOTE_WL_END_OWNERSHIP;
+
+       dispatch_sync_owner = owner_thread;
+
+       struct kevent_qos_s kev[] =  {{
+               .ident = def_thread_port,
+               .filter = EVFILT_WORKLOOP,
+               .flags = action,
+               .fflags = fflags,
+               .udata = (uintptr_t) &def_thread_port,
+               .qos = (int32_t)_pthread_qos_class_encode(promote_qos, 0, 0),
+               .ext[EV_EXTIDX_WL_MASK] = mask,
+               .ext[EV_EXTIDX_WL_VALUE] = dispatch_sync_owner,
+               .ext[EV_EXTIDX_WL_ADDR] = (uint64_t)&dispatch_sync_owner,
+       }};
+
+       /* Setup workloop to fake dispatch sync wake on a workloop */
+       r = kevent_id(30, kev, 1, kev_err, 1, NULL,
+                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+       T_QUIET; T_LOG("dispatch_sync_cancel returned\n");
+
+}
+
+static void *
+thread_at_sixty(void *arg __unused)
+{
+       int policy;
+       struct sched_param param;
+       int ret;
+       void *load_token;
+       uint64_t before_lock_time, after_lock_time;
+
+       sixty_thread_port = mach_thread_self();
+
+       set_thread_name(__FUNCTION__);
+
+       /* Change our priority to 60 */
+       ret = pthread_getschedparam(pthread_self(), &policy, &param);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_getschedparam");
+
+       param.sched_priority = 60;
+
+       ret = pthread_setschedparam(pthread_self(), policy, &param);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_setschedparam");
+
+       ret = pthread_getschedparam(pthread_self(), &policy, &param);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "pthread_getschedparam");
+
+       T_LOG("My priority is %d", param.sched_priority);
+
+       thread_wait_to_boost(in_thread_port, ui_thread_port, 46);
+
+       if (spin_for_ever) {
+               /* Schedule load at Default */
+               sched_create_load_at_qos(QOS_CLASS_DEFAULT, &load_token);
+       }
+
+       T_LOG("Thread at priority 60 trying to acquire UI lock");
+
+       before_lock_time = mach_absolute_time();
+       ull_lock(&lock_UI, 3, UL_UNFAIR_LOCK, 0);
+       after_lock_time = mach_absolute_time();
+
+       T_QUIET; T_LOG("The time for priority 60 thread to acquire lock was %llu \n",
+               (after_lock_time - before_lock_time));
+       exit(0);
+}
+
+static void *
+thread_at_ui(void *arg __unused)
+{
+       ui_thread_port = mach_thread_self();
+
+       set_thread_name(__FUNCTION__);
+
+       /* Grab the first ulock */
+       ull_lock(&lock_UI, 2, UL_UNFAIR_LOCK, 0);
+
+       thread_wait_to_boost(def_thread_port, in_thread_port, 37);
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, thread_at_sixty);
+
+       T_LOG("Thread at UI priority trying to acquire IN lock");
+       ull_lock(&lock_IN, 2, UL_UNFAIR_LOCK, 0);
+       ull_unlock(&lock_UI, 2, UL_UNFAIR_LOCK, 0);
+       return NULL;
+}
+
+static void *
+thread_at_in(void *arg __unused)
+{
+       in_thread_port = mach_thread_self();
+
+       set_thread_name(__FUNCTION__);
+
+       /* Grab the first ulock */
+       ull_lock(&lock_IN, 2, UL_UNFAIR_LOCK, 0);
+
+       T_LOG("Thread at IN priority got first lock ");
+
+       thread_wait_to_boost(main_thread_port, def_thread_port, 31);
+
+       /* Create a new thread at QOS_CLASS_USER_INTERACTIVE qos */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, thread_at_ui);
+
+       T_LOG("Thread at IN priority trying to acquire default lock");
+       ull_lock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0);
+       ull_unlock(&lock_IN, 2, UL_UNFAIR_LOCK, 0);
+       return NULL;
+}
+
+static void *
+thread_at_default(void *arg __unused)
+{
+       def_thread_port = mach_thread_self();
+
+       set_thread_name(__FUNCTION__);
+
+       /* Grab the first ulock */
+       ull_lock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0);
+
+       T_LOG("Thread at DEFAULT priority got first lock ");
+
+       thread_wait_to_block(main_thread_port);
+
+       /* Create a new thread at QOS_CLASS_USER_INITIATED qos */
+       thread_create_at_qos(QOS_CLASS_USER_INITIATED, thread_at_in);
+
+       T_LOG("Thread at Default priority trying to wait on dispatch sync for maintenance thread");
+       dispatch_sync_wait(main_thread_port, QOS_CLASS_DEFAULT);
+       ull_unlock(&lock_DEF, 1, UL_UNFAIR_LOCK, 0);
+       return NULL;
+}
+
+static void *
+thread_at_maintenance(void *arg __unused)
+{
+       mach_port_t qos_send_port;
+       mach_port_t special_reply_port;
+
+       main_thread_port = mach_thread_self();
+
+       set_thread_name(__FUNCTION__);
+
+       kern_return_t kr = bootstrap_look_up(bootstrap_port,
+                       TURNSTILE_MULTIHOP_SERVICE_NAME, &qos_send_port);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
+
+       special_reply_port = thread_get_special_reply_port();
+       T_QUIET; T_ASSERT_TRUE(MACH_PORT_VALID(special_reply_port), "thread_get_special_reply_port");
+
+       /* Become the dispatch sync owner, dispatch_sync_owner will be set in dispatch_sync_wait function */
+
+       /* Send an async message */
+       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
+                       (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0);
+
+       /* Send a sync message */
+       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
+                       (uint32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0), 0);
+
+       /* Create a new thread at QOS_CLASS_DEFAULT qos */
+       thread_create_at_qos(QOS_CLASS_DEFAULT, thread_at_default);
+
+       /* Block on Sync IPC */
+       receive(special_reply_port, qos_send_port);
+
+       dispatch_sync_cancel(def_thread_port, QOS_CLASS_DEFAULT);
+       return NULL;
+}
+
+T_HELPER_DECL(three_ulock_sync_ipc_hop,
+               "Create chain of 4 threads with 3 ulocks and 1 sync IPC at different qos")
+{
+       dt_stat_time_t roundtrip_stat = dt_stat_time_create("multihop_lock_acquire");
+
+       T_STAT_MEASURE_LOOP(roundtrip_stat) {
+               if (fork() == 0) {
+                       thread_create_at_qos(QOS_CLASS_MAINTENANCE, thread_at_maintenance);
+                       sigsuspend(0);
+                       exit(0);
+               }
+               wait(NULL);
+       }
+
+       dt_stat_finalize(roundtrip_stat);
+       T_END;
+}
+
+static void
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
+{
+       qos_class_t qos_thread;
+       pthread_t thread;
+        pthread_attr_t attr;
+       int ret;
+
+       ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL);
+       if (ret != 0) {
+               T_LOG("set priority failed\n");
+       }
+
+        pthread_attr_init(&attr);
+        pthread_attr_set_qos_class_np(&attr, qos, 0);
+        pthread_create(&thread, &attr, function, NULL);
+
+       T_LOG("pthread created\n");
+       pthread_get_qos_class_np(thread, &qos_thread, NULL);
+}
+
+#pragma mark Mach receive - kevent_qos
+
+static void
+expect_kevent_id_recv(mach_port_t port)
+{
+       int r;
+
+       T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
+               worker_cb, event_cb,
+               (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL);
+
+       struct kevent_qos_s kev[] = {{
+               .ident = port,
+               .filter = EVFILT_MACHPORT,
+               .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
+               .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
+                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
+                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
+                               MACH_RCV_VOUCHER),
+               .data = 1,
+               .qos = (int32_t)_pthread_qos_class_encode(QOS_CLASS_MAINTENANCE, 0, 0)
+       }};
+
+       struct kevent_qos_s kev_err[] = {{ 0 }};
+
+       /* Setup workloop for mach msg rcv */
+       r = kevent_id(25, kev, 1, kev_err, 1, NULL,
+                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
+
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
+       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
+}
+
+T_HELPER_DECL(server_kevent_id,
+               "Reply with the QoS that a dispatch source event handler ran with")
+{
+       expect_kevent_id_recv(get_server_port());
+       sigsuspend(0);
+       T_ASSERT_FAIL("should receive a message");
+}
+
+#define TEST_MULTIHOP(server_name, client_name, name) \
+       T_DECL(server_kevent_id_##name, \
+                       "Event delivery using a kevent_id", \
+                       T_META_ASROOT(YES)) \
+       { \
+               run_client_server(server_name, client_name); \
+       }
+
+#define TEST_MULTIHOP_SPIN(server_name, client_name, name) \
+       T_DECL(server_kevent_id_##name, \
+                       "Event delivery using a kevent_id", \
+                       T_META_ASROOT(YES), T_META_ENABLED(FALSE)) \
+       { \
+               spin_for_ever = true; \
+               run_client_server(server_name, client_name); \
+               spin_for_ever = false; \
+       }
+
+/*
+ * Test 1: Test multihop priority boosting with ulocks, dispatch sync and sync IPC.
+ *
+ * Create threads at different QoS; each acquires a ulock and blocks on the next ulock/dispatch
+ * sync, creating a sync chain. The last hop of the chain is blocked on sync IPC.
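+ *
+ * The resulting chain, highest priority first: the priority 60 thread blocks on lock_UI (held
+ * by the UI thread), which blocks on lock_IN (held by the IN thread), which blocks on lock_DEF
+ * (held by the DEFAULT thread), which waits in a faked dispatch sync owned by the maintenance
+ * thread, which is itself blocked in sync IPC to the server.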
+ */
+TEST_MULTIHOP("server_kevent_id", "three_ulock_sync_ipc_hop", three_ulock_sync_ipc_hop)
+
+/*
+ * Test 2: Test multihop priority boosting with ulocks, dispatch sync and sync IPC.
+ *
+ * Create threads at different QoS; each acquires a ulock and blocks on the next ulock/dispatch
+ * sync, creating a sync chain. The last hop of the chain is blocked on sync IPC.
+ * Before the priority 60 thread blocks on its ulock, it also starts spin-forever load threads
+ * at priority 31 (QOS_CLASS_DEFAULT).
+ */
+TEST_MULTIHOP_SPIN("server_kevent_id", "three_ulock_sync_ipc_hop", three_ulock_sync_ipc_hop_spin)
diff --git a/tests/turnstile_multihop_helper.h b/tests/turnstile_multihop_helper.h
new file mode 100644 (file)
index 0000000..0652b27
--- /dev/null
@@ -0,0 +1,203 @@
+// vim:noexpandtab
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/ulock.h>
+
+#include "turnstile_multihop_types.h"
+
+typedef _Atomic(u32) lock_t;
+
+__inline static void
+yield(void)
+{
+#if !defined(__x86_64__) && !defined(__i386__)
+       __asm volatile("yield");
+#else
+       __asm volatile("pause");
+#endif
+}
+
+__inline static void
+wfe(void)
+{
+#if !defined(__x86_64__) && !defined(__i386__)
+       __asm volatile("wfe");
+#else
+       __asm volatile("pause");
+#endif
+}
+
+__inline static void
+wfi(void)
+{
+#if !defined(__x86_64__) && !defined(__i386__)
+       __asm volatile("wfi");
+#else
+       __asm volatile("pause");
+#endif
+}
+
+__inline static void
+sev(void)
+{
+#if !defined(__x86_64__) && !defined(__i386__)
+       __asm volatile("sev");
+#endif
+}
+
+#include <os/tsd.h>
+
+#ifndef __TSD_MACH_THREAD_SELF
+#define __TSD_MACH_THREAD_SELF 3
+#endif
+
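+/* Read the cached mach thread self name from TSD, avoiding a mach_thread_self() trap. */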
+__inline static mach_port_name_t
+_os_get_self(void)
+{
+       mach_port_name_t self = (mach_port_name_t)(uintptr_t)(void *)_os_tsd_get_direct(__TSD_MACH_THREAD_SELF);
+       return self;
+}
+
+#define ULL_WAITERS    1U
+
+static uint32_t lock_no_wait[4] = { 0, 0, 0, 0};
+static uint32_t lock_wait[4] = { 0, 0, 0, 0};
+
+static mach_port_name_t main_thread_name = 0;
+
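+/* Acquire a userspace unfair lock built directly on __ulock_wait(): CAS to take the lock, set ULL_WAITERS when contended, then block in the kernel. */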
+__inline static void
+ull_lock(lock_t *lock, int id, uint opcode, uint flags)
+{
+       u32 thread_id = _os_get_self() & ~0x3u;
+       u32 ull_locked = (opcode == UL_UNFAIR_LOCK) ? thread_id : 4u;
+       u32 mach_id = _os_get_self() >> 2;
+       u32 prev;
+       bool succeeded = false;
+       bool waiters = false;
+       bool called_wait = false;
+       u32 count = 0;
+
+       do {
+               count++;
+               if ((count % 100000) == 0) {
+                       printf("[%d,%d]%s>top of loop count=%d\n", id, mach_id, __FUNCTION__, count);
+               }
+               u32 new = waiters ? (ULL_WAITERS|ull_locked) : ull_locked;
+               prev = 0;
+               __c11_atomic_compare_exchange_strong(lock, &prev, new, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
+               if (prev == 0) {
+                       /* Was unlocked, now locked */
+                       succeeded = true;
+                       break;
+               }
+
+               u32 value = prev;
+               if (!(value & ULL_WAITERS)) {
+                       new = value | ULL_WAITERS;
+                       __c11_atomic_compare_exchange_strong(lock, &prev, new, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+                       if (prev == value) {
+                               /* succeeded in setting ULL_WAITERS */
+                               value = new;
+                       } else if (prev & ULL_WAITERS) {
+                               /* Didn't succeed, but someone else already set ULL_WAITERS */
+                               value = prev;
+                       } else {
+                               /* Something changed under us, so try again */
+                               if (count % 100000 == 0) {
+                                       printf("[%d,%d]%s>Something changed under us, prev=%d\n", id, mach_id, __FUNCTION__, prev);
+                               }
+                               continue;
+                       }
+               }
+               /* Locked with waiters indication, so block */
+               int ret = __ulock_wait(flags | opcode, lock, value, 0);
+               called_wait = true;
+               if (ret < 0) {
+                       if (flags & ULF_NO_ERRNO) {
+                               errno = -ret;
+                       }
+                       if (errno == EFAULT) {
+                               continue;
+                       }
+                       printf("[%d,%d]%s>ull_wait() error: %s\n", id, mach_id, __FUNCTION__, strerror(errno));
+                       exit(1);
+               }
+               waiters = (ret > 0);
+
+               if (count % 100000 == 0) {
+                       printf("[%d,%d]%s>bottom of loop prev=%d\n", id, mach_id, __FUNCTION__, prev);
+               }
+       } while (!succeeded);
+
+       if (called_wait) {
+               lock_wait[id]++;
+       } else {
+               lock_no_wait[id]++;
+       }
+}
+
+static uint32_t unlock_no_waiters[4] = { 0, 0, 0, 0};
+static uint32_t unlock_waiters[4] =  { 0, 0, 0, 0 };
+static uint32_t unlock_waiters_gone[4] =  { 0, 0, 0, 0 };
+static uint32_t unlock_waiters_wake_thread[4] =  { 0, 0, 0, 0 };
+
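+/* Release the lock; if waiters were indicated, wake them via __ulock_wake(). */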
+__inline static void
+ull_unlock(lock_t *lock, int id, uint opcode, uint flags)
+{
+       u32 thread_id = _os_get_self() & ~0x3u;
+       u32 ull_locked = (opcode == UL_UNFAIR_LOCK) ? thread_id : 4u;
+       u32 mach_id = _os_get_self() >> 2;
+       u32 prev = ull_locked;
+       __c11_atomic_compare_exchange_strong(lock, &prev, 0, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
+       if (prev == ull_locked) {
+               unlock_no_waiters[id]++;
+               return;
+       }
+
+       if (prev == 0) {
+               printf("%s>already unlocked\n", __FUNCTION__);
+               exit(1);
+       }
+
+       if (prev == (ULL_WAITERS|ull_locked)) {
+               /* locked with waiters */
+               *lock = 0;
+               __c11_atomic_thread_fence(__ATOMIC_ACQ_REL);
+
+               if ((flags & ULF_WAKE_THREAD) && (_os_get_self() == main_thread_name)) {
+                       flags &= ~(uint)ULF_WAKE_THREAD;
+               }
+               int ret = __ulock_wake((flags | opcode), lock, main_thread_name);
+               if ((ret < 0) && (flags & ULF_NO_ERRNO)) {
+                       errno = -ret;
+               }
+               if ((flags & ULF_WAKE_THREAD) && (ret < 0) && (errno == EALREADY)) {
+                       flags &= ~(uint)ULF_WAKE_THREAD;
+                       ret = __ulock_wake((flags | opcode), lock, 0);
+                       if ((ret < 0) && (flags & ULF_NO_ERRNO)) {
+                               errno = -ret;
+                       }
+               } else if ((flags & ULF_WAKE_THREAD) && (ret == 0)) {
+                       unlock_waiters_wake_thread[id]++;
+               }
+               if (ret < 0) {
+                       if (errno == ENOENT) {
+                               unlock_waiters_gone[id]++;
+                       } else {
+                               printf("[%d,%d]%s>ull_wake() error: %s\n", id, mach_id, __FUNCTION__, strerror(errno));
+                               exit(1);
+                       }
+               }
+               unlock_waiters[id]++;
+       } else {
+               printf("%s>unexpected lock value %d\n", __FUNCTION__, prev);
+               exit(1);
+       }
+}
diff --git a/tests/turnstile_multihop_types.h b/tests/turnstile_multihop_types.h
new file mode 100644 (file)
index 0000000..fc21b00
--- /dev/null
@@ -0,0 +1,32 @@
+// vim:noexpandtab
+#ifndef __TYPES_H__
+#define __TYPES_H__
+
+#include <stdint.h>
+#include <stdbool.h>
+
+typedef        signed char     s8;
+typedef        unsigned char   u8;
+typedef uint16_t       u16;
+typedef int16_t                s16;
+typedef uint32_t       u32;
+typedef uint64_t       u64;
+typedef int32_t                s32;
+typedef int64_t                s64;
+
+#if defined(__arm64__) || defined(__x86_64__)
+typedef u64    un;
+typedef s64    sn;
+#else
+typedef u32    un;
+typedef s32    sn;
+#endif
+
+#ifndef __DRT_H__
+typedef u32    uint;
+#endif
+
+#define volatile_read(atom)            (*((volatile typeof(*(atom)) *)(atom)))
+#define volatile_write(atom, value)    (*((volatile typeof(*(atom)) *)(atom)) = value)
+
+#endif
diff --git a/tests/turnstiles_test.c b/tests/turnstiles_test.c
new file mode 100644 (file)
index 0000000..0494ba1
--- /dev/null
@@ -0,0 +1,258 @@
+/*
+ * turnstiles_test: Tests turnstile kernel primitive.
+ */
+
+#ifdef T_NAMESPACE
+#undef T_NAMESPACE
+#endif
+
+#include <darwintest.h>
+#include <darwintest_multiprocess.h>
+
+#include <pthread.h>
+#include <launch.h>
+#include <servers/bootstrap.h>
+#include <stdlib.h>
+#include <sys/event.h>
+#include <unistd.h>
+#include <crt_externs.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+
+#define SYSCTL_TURNSTILE_TEST_DEFAULT                   1
+#define SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE          2
+
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.turnstiles_test"));
+
+static void
+thread_create_at_qos(qos_class_t qos, void * (*function)(void *), int type)
+{
+       qos_class_t qos_thread;
+       pthread_t thread;
+        pthread_attr_t attr;
+       int ret;
+
+       ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL);
+       if (ret != 0) {
+               T_LOG("set priority failed\n");
+       }
+
+        pthread_attr_init(&attr);
+        pthread_attr_set_qos_class_np(&attr, qos, 0);
+        pthread_create(&thread, &attr, function, (void *)type);
+
+       T_LOG("pthread created\n");
+       pthread_get_qos_class_np(thread, &qos_thread, NULL);
+        T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL);
+}
+
+static int
+get_pri(thread_t thread_port) {
+       kern_return_t kr;
+
+       thread_extended_info_data_t extended_info;
+       mach_msg_type_number_t count = THREAD_EXTENDED_INFO_COUNT;
+       kr = thread_info(thread_port, THREAD_EXTENDED_INFO,
+                          (thread_info_t)&extended_info, &count);
+
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
+       return extended_info.pth_curpri;
+}
+
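+/* Take the kernel-side test lock for the given turnstile test type via the kern.turnstiles_test_lock sysctl. */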
+static void
+turnstile_prim_lock(int type)
+{
+       int ret;
+       uint64_t tid;
+       int in_val = type;
+       pthread_threadid_np(NULL, &tid);
+       T_LOG("sysctlbyname lock called from thread %llu \n", tid);
+       ret = sysctlbyname("kern.turnstiles_test_lock", NULL, 0, &in_val, sizeof(in_val));
+       T_LOG("sysctlbyname lock returned from thread %llu with value %d \n", tid, ret);
+}
+
+static void
+turnstile_prim_unlock(int type)
+{
+       int ret;
+       uint64_t tid;
+       int in_val = type;
+       pthread_threadid_np(NULL, &tid);
+       T_LOG("sysctlbyname unlock called from thread %llu \n", tid);
+       ret = sysctlbyname("kern.turnstiles_test_unlock", NULL, 0, &in_val, sizeof(in_val));
+       T_LOG("sysctlbyname unlock returned from thread %llu with value %d \n", tid, ret);
+}
+
+static void *
+take_lock_check_priority(void * arg)
+{
+       int old_pri = get_pri(mach_thread_self());
+       int unboosted_pri;
+       int boosted_pri;
+       int after_unlock_pri;
+       uint64_t tid;
+       int type = (int)arg;
+
+       pthread_threadid_np(NULL, &tid);
+
+       T_ASSERT_EQ(old_pri, 37, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
+
+       /* Take the test lock */
+       turnstile_prim_lock(type);
+
+       unboosted_pri =  get_pri(mach_thread_self());
+       T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri);
+
+       sleep(8);
+
+       /* Check for elevated priority */
+       boosted_pri =  get_pri(mach_thread_self());
+       T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri);
+
+       /* Drop the lock */
+       turnstile_prim_unlock(type);
+
+       /* Check for regular priority */
+       after_unlock_pri =  get_pri(mach_thread_self());
+       T_ASSERT_EQ(after_unlock_pri, 37, "thread(%llu) priority after dropping lock is %d\n", tid, after_unlock_pri);
+
+       return NULL;
+}
+
+static void *
+try_to_take_lock_and_unlock(void *arg)
+{
+       uint64_t tid;
+       int type = (int)arg;
+
+       pthread_threadid_np(NULL, &tid);
+       sleep(4);
+
+       int old_pri = get_pri(mach_thread_self());
+       T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
+
+       /* Try taking the test lock */
+       turnstile_prim_lock(type);
+       sleep (2);
+       turnstile_prim_unlock(type);
+       return NULL;
+}
+
+static void *
+take_lock_and_exit(void * arg)
+{
+       int old_pri = get_pri(mach_thread_self());
+       int unboosted_pri;
+       int boosted_pri;
+       uint64_t tid;
+       int type = (int)arg;
+
+       pthread_threadid_np(NULL, &tid);
+
+       T_ASSERT_EQ(old_pri, 37, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
+
+       /* Take the test lock */
+       turnstile_prim_lock(type);
+
+       unboosted_pri =  get_pri(mach_thread_self());
+       T_ASSERT_EQ(unboosted_pri, 37, "thread(%llu) priority after acquiring the lock (uncontended) is %d\n", tid, unboosted_pri);
+
+       sleep(8);
+
+       /* Check for elevated priority */
+       boosted_pri =  get_pri(mach_thread_self());
+       T_ASSERT_EQ(boosted_pri, 47, "thread(%llu) priority after contention by 47 thread is %d\n", tid, boosted_pri);
+
+       /* return without unlocking the lock */
+       return NULL;
+}
+
+static void *
+unlock_an_owner_exited_lock(void *arg)
+{
+       uint64_t tid;
+       int type = (int)arg;
+
+       pthread_threadid_np(NULL, &tid);
+       sleep(12);
+
+       int old_pri = get_pri(mach_thread_self());
+       T_ASSERT_EQ(old_pri, 47, "thread(%llu) priority before acquiring the lock is %d\n", tid, old_pri);
+
+       /* Unlock the test lock causing the turnstile code to call thread_deallocate_safe */
+       turnstile_prim_unlock(type);
+       return NULL;
+}
+
+/*
+ * Test 1: test if lock contended by a UI thread boosts the owner to UI qos.
+ */
+static void
+test1(int type)
+{
+       T_LOG("Test 1: test if lock contended by a UI thread boosts the owner to UI qos");
+
+       /* Create a thread at IN and take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_check_priority, type);
+
+       /* Create a thread at UI and try to take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type);
+
+       sleep(12);
+       return;
+}
+
+/*
+ * Test 2: test if a lock contended by 2 UI threads boosts the owner to UI qos.
+ */
+static void
+test2(int type)
+{
+       T_LOG("Test 2: test if a lock contended by 2 UI threads boosts the owner to UI qos");
+
+       /* Create a thread at IN and take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_check_priority, type);
+
+       /* Create a thread at UI and try to take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type);
+
+       /* Create a thread at UI and try to take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type);
+
+       sleep(16);
+       return;
+}
+
+/*
+ * Test 3: test if lock owner thread exiting without unlocking allows turnstile to work correctly.
+ */
+static void
+test3(int type)
+{
+       T_LOG("Test 3: test if lock owner thread exiting without unlocking allows turnstile to work correctly");
+
+       /* Create a thread at IN and take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INITIATED, &take_lock_and_exit, type);
+
+       /* Create a thread at UI and try to take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &try_to_take_lock_and_unlock, type);
+
+       /* Create a thread at UI and try to take lock */
+       thread_create_at_qos(QOS_CLASS_USER_INTERACTIVE, &unlock_an_owner_exited_lock, type);
+
+       sleep(16);
+       return;
+}
+
+T_DECL(turnstile_test, "Turnstile test", T_META_ASROOT(YES))
+{
+       test1(SYSCTL_TURNSTILE_TEST_DEFAULT);
+       test2(SYSCTL_TURNSTILE_TEST_DEFAULT);
+       test3(SYSCTL_TURNSTILE_TEST_DEFAULT);
+
+       test1(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
+       test2(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
+       test3(SYSCTL_TURNSTILE_TEST_GLOBAL_HASHTABLE);
+       
+}
diff --git a/tests/utimensat.c b/tests/utimensat.c
new file mode 100644 (file)
index 0000000..d5baad6
--- /dev/null
@@ -0,0 +1,83 @@
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <paths.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+#define FILENAME "utimensat"
+
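+/*
+ * Each entry below is an { atime, mtime } pair handed to utimensat(): explicit
+ * timestamps, UTIME_NOW, UTIME_OMIT, and mixed combinations, so every branch of
+ * the verification loop in the test body is exercised.
+ */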
+static const struct timespec tptr[][2] = {
+       { { 0x12345678, 987654321 }, { 0x15263748, 123456789 }, },
+
+       { { 0, UTIME_NOW }, { 0x15263748, 123456789 }, },
+       { { 0x12345678, 987654321 }, { 0, UTIME_NOW }, },
+       { { 0, UTIME_NOW }, { 0, UTIME_NOW }, },
+
+       { { 0, UTIME_OMIT }, { 0x15263748, 123456789 }, },
+       { { 0x12345678, 987654321 }, { 0, UTIME_OMIT }, },
+       { { 0, UTIME_OMIT }, { 0, UTIME_OMIT }, },
+
+       { { 0, UTIME_NOW }, { 0, UTIME_OMIT }, },
+       { { 0, UTIME_OMIT }, { 0, UTIME_NOW }, },
+};
+
+T_DECL(utimensat, "Try various versions of utimensat")
+{
+       T_SETUPBEGIN;
+       T_ASSERT_POSIX_ZERO(chdir(dt_tmpdir()), NULL);
+       // Skip the test if the current working directory is not on APFS.
+       struct statfs sfs = { 0 };
+       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(".", &sfs), NULL);
+       if (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) != 0) {
+               T_SKIP("utimensat is APFS-only, but working directory is non-APFS");
+       }
+       T_SETUPEND;
+
+       struct stat pre_st, post_st;
+       int fd;
+
+       T_ASSERT_POSIX_SUCCESS((fd = open(FILENAME, O_CREAT|O_RDWR, 0644)), NULL);
+       T_ASSERT_POSIX_ZERO(close(fd), NULL);
+
+       for (size_t i = 0; i < sizeof(tptr)/sizeof(tptr[0]); i++) {
+               T_LOG("=== {%ld, %ld} {%ld, %ld} ===", 
+                               tptr[i][0].tv_sec, tptr[i][0].tv_nsec,
+                               tptr[i][1].tv_sec, tptr[i][1].tv_nsec);
+
+               struct timespec now;
+               clock_gettime(CLOCK_REALTIME, &now);
+
+               T_ASSERT_POSIX_ZERO(stat(FILENAME, &pre_st), NULL);
+               T_ASSERT_POSIX_ZERO(utimensat(AT_FDCWD, FILENAME, tptr[i], 0), NULL);
+               T_ASSERT_POSIX_ZERO(stat(FILENAME, &post_st), NULL);
+
+               if (tptr[i][0].tv_nsec == UTIME_NOW) {
+                       T_ASSERT_GE(post_st.st_atimespec.tv_sec, now.tv_sec, NULL);
+               } else if (tptr[i][0].tv_nsec == UTIME_OMIT) {
+                       T_ASSERT_EQ(post_st.st_atimespec.tv_sec, pre_st.st_atimespec.tv_sec, NULL);
+                       T_ASSERT_EQ(post_st.st_atimespec.tv_nsec, pre_st.st_atimespec.tv_nsec, NULL);
+               } else {
+                       T_ASSERT_EQ(post_st.st_atimespec.tv_sec, tptr[i][0].tv_sec, NULL);
+                       T_ASSERT_EQ(post_st.st_atimespec.tv_nsec, tptr[i][0].tv_nsec, NULL);
+               }
+
+               if (tptr[i][1].tv_nsec == UTIME_NOW) {
+                       T_ASSERT_GE(post_st.st_mtimespec.tv_sec, now.tv_sec, NULL);
+               } else if (tptr[i][1].tv_nsec == UTIME_OMIT) {
+                       T_ASSERT_EQ(post_st.st_mtimespec.tv_sec, pre_st.st_mtimespec.tv_sec, NULL);
+                       T_ASSERT_EQ(post_st.st_mtimespec.tv_nsec, pre_st.st_mtimespec.tv_nsec, NULL);
+               } else {
+                       T_ASSERT_EQ(post_st.st_mtimespec.tv_sec, tptr[i][1].tv_sec, NULL);
+                       T_ASSERT_EQ(post_st.st_mtimespec.tv_nsec, tptr[i][1].tv_nsec, NULL);
+               }
+       }
+}
diff --git a/tests/verify_kalloc_config.c b/tests/verify_kalloc_config.c
new file mode 100644 (file)
index 0000000..14ce3c9
--- /dev/null
@@ -0,0 +1,68 @@
+#include <string.h>
+#include <stdlib.h>
+#include <mach/mach.h>
+#include <mach_debug/mach_debug.h>
+#include <darwintest.h>
+
+T_GLOBAL_META(
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false)
+);
+
+static void run_test(void);
+
+static void run_test(void)
+{
+       kern_return_t kr;
+       uint64_t size, i;
+       mach_zone_name_t *name = NULL;
+       unsigned int nameCnt = 0;
+       mach_zone_info_t *info = NULL;
+       unsigned int infoCnt = 0;
+       mach_memory_info_t *wiredInfo = NULL;
+       unsigned int wiredInfoCnt = 0;
+       const char kalloc_str[] = "kalloc.";
+
+       kr = mach_memory_info(mach_host_self(),
+                       &name, &nameCnt, &info, &infoCnt,
+                       &wiredInfo, &wiredInfoCnt);
+       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_memory_info");
+       T_QUIET; T_ASSERT_EQ(nameCnt, infoCnt, "zone name and info counts don't match");
+
+       /* Match the names of the kalloc zones against their element sizes. */
+       for (i = 0; i < nameCnt; i++) {
+               if (strncmp(name[i].mzn_name, kalloc_str, strlen(kalloc_str)) == 0) {
+                       size = strtoul(&(name[i].mzn_name[strlen(kalloc_str)]), NULL, 10);
+                       T_LOG("ZONE NAME: %-25s ELEMENT SIZE: %llu", name[i].mzn_name, size);
+                       T_QUIET; T_ASSERT_EQ(size, info[i].mzi_elem_size, "kalloc zone name and element size don't match");
+               }
+       }
+
+       if ((name != NULL) && (nameCnt != 0)) {
+               kr = vm_deallocate(mach_task_self(), (vm_address_t) name,
+                               (vm_size_t) (nameCnt * sizeof *name));
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate name");
+       }
+
+       if ((info != NULL) && (infoCnt != 0)) {
+               kr = vm_deallocate(mach_task_self(), (vm_address_t) info,
+                               (vm_size_t) (infoCnt * sizeof *info));
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate info");
+       }
+
+       if ((wiredInfo != NULL) && (wiredInfoCnt != 0)) {
+               kr = vm_deallocate(mach_task_self(), (vm_address_t) wiredInfo,
+                               (vm_size_t) (wiredInfoCnt * sizeof *wiredInfo));
+               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate wiredInfo");
+       }
+
+       T_END;
+}
+
+T_DECL( verify_kalloc_config,
+               "verifies that the kalloc zones are configured correctly",
+               T_META_ASROOT(true))
+{
+       run_test();
+}
+
diff --git a/tests/vm_set_max_addr_helper.c b/tests/vm_set_max_addr_helper.c
new file mode 100644 (file)
index 0000000..5a06a3e
--- /dev/null
@@ -0,0 +1,18 @@
+#include <mach/mach_init.h>
+#include <mach/mach_vm.h>
+#include <stdlib.h>
+
+int main(void)
+{
+       kern_return_t kr;
+       mach_vm_address_t addr = 50ULL * 1024ULL * 1024ULL * 1024ULL;
+
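+       /*
+        * 50 GB sits above the default maximum user VM address for a normally
+        * spawned process, so this fixed allocation is expected to fail unless
+        * the parent raised the limit with posix_spawnattr_set_max_addr_np()
+        * (see vm_set_max_addr_test.c).
+        */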
+       kr = mach_vm_allocate(current_task(), &addr, 4096, VM_FLAGS_FIXED);
+
+       if (kr == KERN_SUCCESS) {
+               return 0;
+       } else {
+               return 1;
+       }
+}
+
diff --git a/tests/vm_set_max_addr_test.c b/tests/vm_set_max_addr_test.c
new file mode 100644 (file)
index 0000000..325227d
--- /dev/null
@@ -0,0 +1,57 @@
+#include <sys/wait.h>
+#include <spawn.h>
+#include <spawn_private.h>
+
+#include <mach/mach_init.h>
+#include <mach/mach_vm.h>
+
+#include <darwintest.h>
+#include <darwintest_utils.h>
+
+extern char * testpath;
+
+T_DECL(set_max_addr,
+       "Verify that posix_spawnattr_set_max_addr_np() raises the maximum VM address of a spawned child",
+       T_META_NAMESPACE("xnu.vm"),
+       T_META_CHECK_LEAKS(false))
+{
+#if (defined(__arm64__) && defined(__LP64__))
+       int result = 0;
+       int code = 0;
+       int child_pid = 0;
+       int status = 0;
+       char * command_path = "./vm_set_max_addr_helper";
+       char * command_args[] = { command_path, NULL };
+       posix_spawnattr_t attrp;
+
+       result = posix_spawnattr_init(&attrp);
+       T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_init");
+
+       result = posix_spawn(&child_pid, command_path, NULL, &attrp, command_args, NULL);
+       T_ASSERT_POSIX_SUCCESS(result, "posix_spawn");
+
+       result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(result, "waitpid");
+
+       code = WEXITSTATUS(status);
+       T_ASSERT_NE_INT(code, 0, "Child should have failed");
+
+       result = posix_spawnattr_set_max_addr_np(&attrp, ~0ULL);
+       T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_set_max_addr_np");
+
+       result = posix_spawn(&child_pid, command_path, NULL, &attrp, command_args, NULL);
+       T_ASSERT_POSIX_SUCCESS(result, "posix_spawn");
+
+       result = waitpid(child_pid, &status, 0);
+       T_ASSERT_POSIX_SUCCESS(result, "waitpid");
+
+       code = WEXITSTATUS(status);
+       T_ASSERT_EQ_INT(code, 0, "Child should have succeeded");
+
+       posix_spawnattr_destroy(&attrp);
+       T_ASSERT_POSIX_SUCCESS(result, "posix_spawnattr_destroy");
+#else /* !defined(__arm64__) || !defined(__LP64__) */
+       T_SKIP("Not supported on this architecture");
+#endif /* (defined(__arm64__) && defined(__LP64__)) */
+}
+
diff --git a/tests/voucher_entry_18826844.c b/tests/voucher_entry_18826844.c
new file mode 100644 (file)
index 0000000..24e246a
--- /dev/null
@@ -0,0 +1,78 @@
+/*
+ * Test that sending a message whose destination port and voucher port are the same
+ * voucher, when that voucher holds only a single send right and the move-send
+ * disposition is processed before the copy-send, doesn't panic.
+ *
+ * clang -o voucherentry voucherentry.c -ldarwintest -Weverything -Wno-gnu-flexible-array-initializer
+ *
+ * <rdar://problem/18826844>
+ */
+
+#include <mach/mach.h>
+#include <darwintest.h>
+
+T_DECL(voucher_entry, "voucher_entry", T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
+{
+       kern_return_t kr        = KERN_SUCCESS;
+       mach_voucher_t voucher  = MACH_VOUCHER_NULL;
+
+       /*
+        * The bank voucher already exists in this process, so using it doesn't
+        * actually test the problem. Use an importance voucher instead.
+        */
+       mach_voucher_attr_recipe_data_t recipe = {
+               .key                = MACH_VOUCHER_ATTR_KEY_IMPORTANCE,
+               .command            = MACH_VOUCHER_ATTR_IMPORTANCE_SELF,
+               .previous_voucher   = MACH_VOUCHER_NULL,
+               .content_size       = 0,
+       };
+
+       kr = host_create_mach_voucher(mach_host_self(),
+                                     (mach_voucher_attr_raw_recipe_array_t)&recipe,
+                                     sizeof(recipe), &voucher);
+
+       T_ASSERT_MACH_SUCCESS(kr, "host_create_mach_voucher");
+
+       T_ASSERT_NOTNULL(voucher, "voucher must not be null");
+
+       mach_port_urefs_t refs = 0;
+
+       kr = mach_port_get_refs(mach_task_self(), voucher, MACH_PORT_RIGHT_SEND, &refs);
+
+       T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_refs");
+
+       T_ASSERT_EQ(refs, (mach_port_urefs_t)1, "voucher must have only one ref");
+
+       /* First, try with two moves (must fail because there's only one ref) */
+       mach_msg_header_t request_msg_1 = {
+               .msgh_remote_port   = voucher,
+               .msgh_local_port    = MACH_PORT_NULL,
+               .msgh_voucher_port  = voucher,
+               .msgh_bits          = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSG_TYPE_MOVE_SEND, 0),
+               .msgh_id            = 0xDEAD,
+               .msgh_size          = sizeof(request_msg_1),
+       };
+
+       kr = mach_msg_send(&request_msg_1);
+
+       T_ASSERT_MACH_ERROR(MACH_SEND_INVALID_DEST, kr, "send with two moves should fail with invalid dest");
+
+       /* Next, try with a move and a copy (will succeed and destroy the last ref) */
+       mach_msg_header_t request_msg_2 = {
+               .msgh_remote_port   = voucher,
+               .msgh_local_port    = MACH_PORT_NULL,
+               .msgh_voucher_port  = voucher,
+               .msgh_bits          = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSG_TYPE_COPY_SEND, 0),
+               .msgh_id            = 0xDEAD,
+               .msgh_size          = sizeof(request_msg_2),
+       };
+
+       /* panic happens here */
+       kr = mach_msg_send(&request_msg_2);
+
+       T_ASSERT_MACH_SUCCESS(kr, "send with move and copy succeeds");
+
+       kr = mach_port_get_refs(mach_task_self(), voucher, MACH_PORT_RIGHT_SEND, &refs);
+
+       T_ASSERT_MACH_ERROR(KERN_INVALID_NAME, kr, "voucher should now be invalid name");
+}
+
diff --git a/tests/voucher_traps.c b/tests/voucher_traps.c
new file mode 100644 (file)
index 0000000..f3e5a0a
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * Test voucher trap APIs.
+ * There was an unfortunate bug in the trap interface that used the user space
+ * _address_ of a trap parameter as a copyin size. This test validates there
+ * are no other kernel panics in the voucher create and voucher attribute
+ * extraction mach traps.
+ *
+ * clang -o voucher_traps voucher_traps.c -ldarwintest -Weverything -Wno-gnu-flexible-array-initializer
+ *
+ * <rdar://problem/29379175>
+ */
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <mach/mach.h>
+#include <mach/mach_vm.h>
+#include <mach/mach_traps.h>
+
+#include <atm/atm_types.h>
+
+#include <darwintest.h>
+
+
+static mach_port_t get_atm_voucher(void)
+{
+       mach_voucher_attr_recipe_data_t r = {
+               .key = MACH_VOUCHER_ATTR_KEY_ATM,
+               .command = MACH_VOUCHER_ATTR_ATM_CREATE
+       };
+       mach_port_t port = MACH_PORT_NULL;
+       kern_return_t kr = host_create_mach_voucher(mach_host_self(),
+                                                   (mach_voucher_attr_raw_recipe_array_t)&r,
+                                                   sizeof(r), &port);
+       T_ASSERT_MACH_SUCCESS(kr, "Create ATM voucher: 0x%x", (unsigned int)port);
+
+       return port;
+}
+
+
+T_DECL(voucher_extract_attr_recipe, "voucher_extract_attr_recipe")
+{
+       kern_return_t kr;
+       mach_vm_size_t alloc_sz;
+       mach_port_t port;
+       mach_vm_address_t alloc_addr;
+
+       /* map at least a page of memory at some arbitrary location */
+       alloc_sz = (mach_vm_size_t)round_page(MACH_VOUCHER_TRAP_STACK_LIMIT + 1);
+
+       /*
+        * We could theoretically ask for a fixed location, but this is more
+        * reliable, and we're not actually trying to exploit anything - a
+        * kernel panic on failure should suffice :-)
+        */
+       alloc_addr = (mach_vm_address_t)round_page(MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE + 1);
+       kr = mach_vm_allocate(mach_task_self(), &alloc_addr,
+                             alloc_sz, VM_FLAGS_ANYWHERE);
+
+       /*
+        * Make sure that the address of the allocation is larger than the
+        * maximum recipe size: this will test for the bug that was fixed in
+        * <rdar://problem/29379175>.
+        */
+       T_ASSERT_GT_ULLONG((uint64_t)alloc_addr,
+                          (uint64_t)MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE,
+                          "Recipe addr (%llu bytes): 0x%llx > max recipe sz: %llu",
+                          (uint64_t)alloc_sz, (uint64_t)alloc_addr,
+                          (uint64_t)MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE);
+
+       /* make the allocation look like a pointer to an int */
+       mach_msg_type_number_t *recipe_size;
+       recipe_size = (mach_msg_type_number_t *)((uintptr_t)alloc_addr);
+       bzero(recipe_size, (unsigned long)alloc_sz);
+       if (alloc_sz > MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE)
+               *recipe_size = MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE;
+       else
+               *recipe_size = (mach_msg_type_number_t)alloc_sz;
+
+       /* recipe buffer on the heap: memset it so panics show up loudly */
+       size_t size = (size_t)(10 * 1024 * 1024);
+       void *recipe = malloc(size);
+       memset(recipe, 0x41, size);
+
+       port = get_atm_voucher();
+
+       /*
+        * This should try to extract the ATM attribute using a buffer on the
+        * kernel heap (probably zone memory).
+        */
+       kr = mach_voucher_extract_attr_recipe_trap(port, MACH_VOUCHER_ATTR_KEY_ATM,
+                                                  recipe, recipe_size);
+       T_ASSERT_MACH_SUCCESS(kr, "Extract attribute data with recipe: heap");
+
+       /* reset the recipe memory */
+       memset(recipe, 0x41, size);
+       /* reduce the size to get an allocation on the kernel stack */
+       *recipe_size = MACH_VOUCHER_TRAP_STACK_LIMIT - 1;
+
+       /*
+        * This should try to extract the ATM attribute using a buffer on the
+        * kernel stack.
+        */
+       kr = mach_voucher_extract_attr_recipe_trap(port, MACH_VOUCHER_ATTR_KEY_ATM,
+                                                  recipe, recipe_size);
+       T_ASSERT_MACH_SUCCESS(kr, "Extract attribute data with recipe: stack");
+
+       /* cleanup */
+
+       free(recipe);
+       kr = mach_vm_deallocate(mach_task_self(), alloc_addr, alloc_sz);
+       T_ASSERT_MACH_SUCCESS(kr, "Deallocate recipe buffers");
+}
diff --git a/tests/wired_mem_bench.c b/tests/wired_mem_bench.c
new file mode 100644 (file)
index 0000000..91fe03a
--- /dev/null
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2015-2018 Apple Inc. All rights reserved.
+ *
+ * @APPLE_LICENSE_HEADER_START@
+ *
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
+ * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
+ *
+ * @APPLE_LICENSE_HEADER_END@
+ */
+
+#include <darwintest.h>
+
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <mach/mach.h>
+#include <sys/utsname.h>
+#include <TargetConditionals.h>
+
+#define WIRED_MEM_THRESHOLD_PERCENTAGE 30
+
+T_DECL(wired_mem_bench,
+       "report the amount of wired memory consumed by the booted OS; guard against egregious or unexpected regressions",
+       T_META_CHECK_LEAKS(false),
+       T_META_ASROOT(true),
+       T_META_REQUIRES_REBOOT(true)) // Help reduce noise by asking for a clean boot
+//     T_META_TAG_PERF)
+{
+       vm_statistics64_data_t  stat;
+       uint64_t                memsize;
+       vm_size_t               page_size = 0;
+       unsigned int            count = HOST_VM_INFO64_COUNT;
+       kern_return_t           ret;
+       int                     wired_mem_pct;
+       struct utsname          uname_vers;
+
+       T_SETUPBEGIN;
+       ret = uname(&uname_vers);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "uname()");
+
+       if (strnstr(uname_vers.version, "KASAN", sizeof(uname_vers.version)) != NULL) {
+               T_SKIP("wired memory metrics are not meaningful on KASAN kernels.");
+       }
+
+       ret = host_statistics64(mach_host_self(), HOST_VM_INFO64, (host_info64_t)&stat, &count);
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(ret, "wired memory query via host_statistics64()");
+
+       size_t s = sizeof(memsize);
+       ret = sysctlbyname("hw.memsize", &memsize, &s, NULL, 0);
+       T_QUIET;
+       T_ASSERT_POSIX_SUCCESS(ret, "sysctlbyname(\"hw.memsize\")");
+
+       T_QUIET;
+       T_EXPECT_NE(memsize, 0ULL, "hw.memsize sysctl failed to provide device DRAM size");
+
+       ret = host_page_size(mach_host_self(), &page_size);
+       T_QUIET;
+       T_ASSERT_MACH_SUCCESS(ret, "page size query via host_page_size()");
+
+       T_SETUPEND;
+
+       T_PERF("wired_memory", (double)(stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 10), "kB",
+              "Wired memory at boot");
+
+       T_LOG("\nwired memory: %llu kB (%llu MB)\n", stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 10,
+              stat.wire_count * (mach_vm_size_t)vm_kernel_page_size >> 20);
+
+#if TARGET_OS_IOS || TARGET_OS_OSX
+       // zprint is not mastered onto other platforms.
+       int r;
+       if ((r = system("zprint")) != 0) {
+               T_FAIL("couldn't run zprint: %d", r);
+       }
+#endif
+       /*
+        * Poor-man's wired memory regression test: validate that wired memory consumes
+        * no more than some outrageously high fixed percentage of total device memory.
+        */
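+       /* For example, 2 GB wired on an 8 GB device comes out to 25%, just under the 30% threshold. */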
+       wired_mem_pct = (int)((stat.wire_count * page_size * 100ULL) / memsize);
+       T_PERF("wired_memory_percentage", wired_mem_pct, "%", "Wired memory as percentage of device DRAM size");
+
+       T_ASSERT_LT(wired_mem_pct, WIRED_MEM_THRESHOLD_PERCENTAGE,
+                   "Wired memory percentage is below allowable threshold (%llu bytes / %u pages / %llu total device memory)",
+                   (uint64_t)stat.wire_count * page_size, stat.wire_count, memsize);
+}
diff --git a/tests/work_interval_test.c b/tests/work_interval_test.c
new file mode 100644 (file)
index 0000000..cc69250
--- /dev/null
@@ -0,0 +1,122 @@
+
+/* test that the header doesn't implicitly depend on others */
+#include <sys/work_interval.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <errno.h>
+#include <err.h>
+#include <string.h>
+#include <pthread.h>
+
+#include <mach/mach.h>
+
+#include <darwintest.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"));
+
+static mach_port_t port = MACH_PORT_NULL;
+
+static void *
+joining_thread_fn(__unused void *arg)
+{
+       int ret = 0;
+       kern_return_t kr = KERN_SUCCESS;
+
+       ret = work_interval_join_port(port);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, another thread");
+
+       kr = mach_port_deallocate(mach_task_self(), port);
+       T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate of port, another thread");
+
+       /* deliberately exit with joined work interval */
+       return NULL;
+}
+
+T_DECL(work_interval, "work interval interface")
+{
+       int ret = 0;
+       work_interval_t handle = NULL;
+       uint64_t now = mach_absolute_time();
+       kern_return_t kr = KERN_SUCCESS;
+
+       ret = work_interval_create(NULL, 0);
+       T_ASSERT_EQ(errno, EINVAL, "create with null errno EINVAL");
+       T_ASSERT_EQ(ret, -1, "create with null returns -1");
+
+       /* Binary must be entitled for this to succeed */
+       ret = work_interval_create(&handle, 0);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, no flags");
+
+       ret = work_interval_copy_port(handle, &port);
+       T_ASSERT_EQ(errno, EINVAL, "work_interval_copy_port on non-joinable interval errno EINVAL");
+       T_ASSERT_EQ(ret, -1, "work_interval_copy_port on non-joinable interval returns -1");
+
+       ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, no flags");
+
+       ret = work_interval_destroy(handle);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy, no flags");
+
+       uint32_t flags[] = {
+               WORK_INTERVAL_FLAG_JOINABLE,
+               WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP,
+       };
+
+       for (uint32_t i = 0 ; i < sizeof(flags) / sizeof(flags[0]) ; i++) {
+               ret = work_interval_create(&handle, flags[i]);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, joinable");
+
+               ret = work_interval_copy_port(handle, &port);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_copy_port, joinable");
+
+               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
+               T_ASSERT_EQ(ret, -1, "work_interval_notify on non-joined thread returns -1");
+               T_ASSERT_EQ(errno, EINVAL, "work_interval_notify on non-joined thread errno EINVAL");
+
+               ret = work_interval_join_port(port);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, joinable");
+
+               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, on joined thread");
+
+               ret = work_interval_join_port(port);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, join the same interval after destroy");
+
+               kr = mach_port_deallocate(mach_task_self(), port);
+               T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate of port");
+
+               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, on joined thread after destroy");
+
+               ret = work_interval_destroy(handle);
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy, joinable, on joined thread");
+
+               ret = work_interval_leave();
+               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_leave, on destroyed work interval");
+       }
+
+       ret = work_interval_create(&handle, WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, joinable");
+
+       ret = work_interval_copy_port(handle, &port);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_copy_port, joinable");
+
+       ret = work_interval_join_port(port);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, join before handing to another thread");
+
+       pthread_t joining_thread;
+
+       T_ASSERT_POSIX_ZERO(pthread_create(&joining_thread, NULL, joining_thread_fn, NULL), "pthread_create");
+
+       T_ASSERT_POSIX_ZERO(pthread_join(joining_thread, NULL), "pthread_join");
+
+       ret = work_interval_leave();
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_leave");
+
+       ret = work_interval_destroy(handle);
+       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy");
+
+}
+
diff --git a/tests/work_interval_test.entitlements b/tests/work_interval_test.entitlements
new file mode 100644 (file)
index 0000000..5726ec2
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.private.kernel.work-interval</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/workq_sigprof.c b/tests/workq_sigprof.c
new file mode 100644 (file)
index 0000000..6ea38a8
--- /dev/null
@@ -0,0 +1,70 @@
+#include <pthread.h>
+#include <stdbool.h>
+#include <signal.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <mach/mach_time.h>
+#include <dispatch/dispatch.h>
+
+#include <darwintest.h>
+
+#if !TARGET_OS_IPHONE
+
+static pthread_t workq_thread;
+static bool signal_received;
+
+static void signal_handler(int sig __unused, siginfo_t *b __unused, void* unused __unused) {
+    if (pthread_self() == workq_thread) {
+        signal_received = true;
+    }
+}
+
+static void workq_block(void *unused __unused) {
+    workq_thread = pthread_self();
+
+    /*
+    sigset_t set;
+    sigemptyset(&set);
+    sigaddset(&set, SIGPROF);
+    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
+    */
+
+    uint64_t spin_start = mach_absolute_time();
+    while (mach_absolute_time() - spin_start < 30 * NSEC_PER_SEC) {
+        if (signal_received) {
+            T_PASS("Got SIGPROF!");
+            T_END;
+        }
+    }
+}
+
+T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof")
+{
+    struct sigaction sa = {
+        .sa_sigaction = signal_handler
+    };
+    sigfillset(&sa.sa_mask);
+    T_ASSERT_POSIX_ZERO(sigaction(SIGPROF, &sa, NULL), NULL);
+
+    dispatch_queue_t q = dispatch_get_global_queue(0, 0);
+    dispatch_async_f(q, NULL, workq_block);
+
+    struct itimerval timerval = {
+        .it_interval = {.tv_usec = 10000},
+        .it_value = {.tv_usec = 10000}
+    };
+    T_ASSERT_POSIX_ZERO(setitimer(ITIMER_PROF, &timerval, NULL), NULL);
+
+    dispatch_main();
+}
+
+#else //!TARGET_OS_IPHONE
+
+T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof")
+{
+    T_EXPECTFAIL;
+    T_FAIL("<rdar://problem/25864196> setitimer/sigprof doesn't seem to be delivered on embeded platforms");
+}
+
+#endif //!TARGET_OS_IPHONE
diff --git a/tests/xnu_quick_test.c b/tests/xnu_quick_test.c
new file mode 100644 (file)
index 0000000..7698b3f
--- /dev/null
@@ -0,0 +1,118 @@
+#include <darwintest.h>
+#include "xnu_quick_test_helpers.h"
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <mach/mach.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/sysctl.h>
+#include <sys/wait.h>
+
+T_GLOBAL_META (T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+char g_target_path[ PATH_MAX ];
+
+/*  **************************************************************************************************************
+ *     Test the syscall system call.
+ *  **************************************************************************************************************
+ */
+T_DECL(syscall,
+       "xnu_quick_test for syscall", T_META_CHECK_LEAKS(NO))
+{
+       int                             my_fd = -1;
+       char *                  my_pathp;
+       kern_return_t   my_kr;
+
+       T_SETUPBEGIN;
+
+       create_target_directory(TEST_DIRECTORY);
+       
+       T_SETUPEND;
+
+       my_kr = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t*)&my_pathp, 
+               PATH_MAX, VM_FLAGS_ANYWHERE);
+       T_ASSERT_MACH_SUCCESS(my_kr, "Allocating vm to path %s", my_pathp);
+
+       *my_pathp = 0x00;
+       strcpy( my_pathp, &g_target_path[0] );
+       strcat( my_pathp, "/" );
+
+       /* create a test file */
+       
+       T_ASSERT_MACH_SUCCESS( create_random_name( my_pathp, 1), "Create random test file" );
+       /* use an indirect system call to open our test file.
+        * I picked open since it uses a path pointer which grows to 64 bits in an LP64 environment.
+        */
+       T_EXPECT_NE(my_fd = syscall( SYS_open, my_pathp, (O_RDWR | O_EXCL), 0 ),
+               -1, "Attempt to open file using indirect syscall %s", my_pathp);
+
+       if (my_fd != -1)
+               close(my_fd);
+       
+       if (my_pathp != NULL) {
+               remove(my_pathp);       
+               vm_deallocate(mach_task_self(), (vm_address_t)my_pathp, PATH_MAX);
+       }
+
+       T_ATEND(remove_target_directory);
+}
+
+/*  **************************************************************************************************************
+ *     Test fork wait4, and exit system calls.
+ *  **************************************************************************************************************
+ */
+T_DECL(fork_wait4_exit, 
+       "Tests forking off a process and waiting for the child to exit", T_META_CHECK_LEAKS(false))
+{
+       int                             my_err, my_status;
+    pid_t                      my_pid, my_wait_pid;
+       struct rusage   my_usage;
+       
+       strncpy(g_target_path, "/", 2);
+
+       /* spin off another process */
+       T_ASSERT_NE(my_pid = fork(), -1, "Fork off a process");
+       
+       if ( my_pid == 0 ) {
+               struct stat             my_sb;
+               
+               /* child process does very little then exits */
+               my_err = stat( &g_target_path[0], &my_sb );
+               T_WITH_ERRNO;
+        T_ASSERT_TRUE(my_err == 0, "stat call with path: \"%s\" returned \"%d\"", &g_target_path[0], errno);
+               exit( 44 );
+       }
+       
+       /* parent process waits for child to exit */
+       T_ASSERT_NE(my_wait_pid = wait4( my_pid, &my_status, 0, &my_usage ), -1,
+               "Wait for child to exit\n");
+
+       /* wait4 should return our child's pid when it exits */
+       T_ASSERT_EQ(my_wait_pid, my_pid, 
+               "wait4 should return our child's pid when it exits");
+       
+       /* kind of just guessing on these values so if this fails we should take a closer 
+        * look at the returned rusage structure. 
+        */
+        T_ASSERT_FALSE(( my_usage.ru_utime.tv_sec > 1 || 
+               my_usage.ru_stime.tv_sec > 1 || my_usage.ru_majflt > 1000 ||
+               my_usage.ru_msgsnd > 100 ), "wait4 returned rusage structure");
+
+       T_ASSERT_TRUE(( WIFEXITED( my_status ) && WEXITSTATUS( my_status ) == 44 ),
+               "check if wait4 returns right exit status");
+}
+
+T_DECL (getrusage, "Sanity check of getrusage")
+{
+        struct rusage   my_rusage;
+        
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(getrusage( RUSAGE_SELF, &my_rusage ), 0, NULL);
+       T_LOG("Checking that getrusage returned sane values");
+       T_EXPECT_LT(my_rusage.ru_msgrcv, 1000, NULL);
+       T_EXPECT_GE(my_rusage.ru_msgrcv, 0, NULL);
+       T_EXPECT_LT(my_rusage.ru_nsignals, 1000, NULL);
+       T_EXPECT_GE(my_rusage.ru_nsignals, 0, NULL);
+}
+
diff --git a/tests/xnu_quick_test.entitlements b/tests/xnu_quick_test.entitlements
new file mode 100644 (file)
index 0000000..ada01fb
--- /dev/null
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+       <key>com.apple.rootless.datavault.controller.internal</key>
+       <true/>
+</dict>
+</plist>
diff --git a/tests/xnu_quick_test_entitled.c b/tests/xnu_quick_test_entitled.c
new file mode 100644 (file)
index 0000000..ec1252f
--- /dev/null
@@ -0,0 +1,81 @@
+#include <darwintest.h>
+
+#include <fcntl.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/disk.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+
+#if !TARGET_OS_EMBEDDED
+#include <sys/csr.h>
+#endif
+
+T_GLOBAL_META (T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+
+
+/*  **************************************************************************************************************
+ *     Test ioctl system calls.
+ *  **************************************************************************************************************
+ */
+T_DECL(ioctl, "Sanity check of ioctl by exercising DKIOCGETBLOCKCOUNT and DKIOCGETBLOCKSIZE",
+       T_META_ASROOT(true))
+{
+       int                                     my_err;
+       int                                     my_fd = -1;
+       struct statfs *         my_infop;
+       char *                          my_ptr;
+       int                                     my_blksize;
+       long long                       my_block_count;
+       char                            my_name[ MAXPATHLEN ];
+
+#if !TARGET_OS_EMBEDDED
+       /*
+        * this test won't be able to open the root disk device unless CSR is
+        * disabled or in AppleInternal mode
+        */
+       if (csr_check( CSR_ALLOW_UNRESTRICTED_FS ) &&
+               csr_check( CSR_ALLOW_APPLE_INTERNAL ) ) {
+               T_SKIP("System Integrity Protection is enabled");
+       }
+#endif
+
+       T_SETUPBEGIN;
+
+       T_WITH_ERRNO;
+       T_ASSERT_GT(getmntinfo( &my_infop, MNT_NOWAIT ), 0, "getmntinfo");
+
+       /* make this a raw device */
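+       /* e.g. "/dev/disk0s2" becomes "/dev/rdisk0s2" */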
+       strlcpy( &my_name[0], &my_infop->f_mntfromname[0], sizeof(my_name) );
+       if ( (my_ptr = strrchr( &my_name[0], '/' )) != 0 ) {
+               if ( my_ptr[1] != 'r' ) {
+                       my_ptr[ strlen( my_ptr ) ] = 0x00;
+                       memmove( &my_ptr[2], &my_ptr[1], (strlen( &my_ptr[1] ) + 1) );
+                       my_ptr[1] = 'r';
+               }
+       }
+
+       T_ASSERT_POSIX_SUCCESS(my_fd = open( &my_name[0], O_RDONLY ), "open");
+
+       T_SETUPEND;
+
+       /* obtain the size of the media (in blocks) */
+       T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKCOUNT, &my_block_count ),
+                                                  "ioctl DKIOCGETBLOCKCOUNT");
+
+       /* obtain the block size of the media */
+       T_EXPECT_POSIX_SUCCESS(my_err = ioctl( my_fd, DKIOCGETBLOCKSIZE, &my_blksize ),
+                                                  "ioctl DKIOCGETBLOCKSIZE");
+
+       T_LOG( "my_block_count %qd my_blksize %d \n", my_block_count, my_blksize );
+
+       if (my_err != -1) {
+               /* make sure the returned data looks somewhat valid */
+               T_EXPECT_GE(my_blksize, 0, NULL);
+               T_EXPECT_LE(my_blksize, 1024 * 1000, NULL);
+       }
+
+       close( my_fd );
+}
diff --git a/tests/xnu_quick_test_getsetpriority.c b/tests/xnu_quick_test_getsetpriority.c
new file mode 100644 (file)
index 0000000..ec62af5
--- /dev/null
@@ -0,0 +1,40 @@
+#include <darwintest.h>
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+
+T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
+
+T_DECL(getpriority_setpriority, "Tests getpriority and setpriority system calls", T_META_ASROOT(true))
+{
+       int my_priority;
+       int my_new_priority;
+
+       /* getpriority returns scheduling priority so -1 is a valid value */
+       errno       = 0;
+       my_priority = getpriority(PRIO_PROCESS, 0);
+
+       T_WITH_ERRNO;
+       T_ASSERT_FALSE(my_priority == -1 && errno != 0, "Verify getpriority is successful", NULL);
+
+       /* change scheduling priority*/
+       my_new_priority = (my_priority == PRIO_MIN) ? (my_priority + 10) : (PRIO_MIN);
+
+       T_WITH_ERRNO;
+       T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_PROCESS, 0, my_new_priority), "Change scheduling priority", NULL);
+
+       /* verify change */
+       errno       = 0;
+       my_priority = getpriority(PRIO_PROCESS, 0);
+       T_WITH_ERRNO;
+       T_ASSERT_FALSE(my_priority == -1 && errno != 0, "Verify getpriority change is successful", NULL);
+
+       T_WITH_ERRNO;
+       T_ASSERT_EQ(my_priority, my_new_priority, "Verify setpriority correctly set scheduling priority", NULL);
+
+       /* reset scheduling priority */
+       T_WITH_ERRNO;
+       T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_PROCESS, 0, 0), "Reset scheduling priority", NULL);
+}
diff --git a/tests/xnu_quick_test_helpers.c b/tests/xnu_quick_test_helpers.c
new file mode 100644 (file)
index 0000000..08670d8
--- /dev/null
@@ -0,0 +1,114 @@
+#include <darwintest.h>
+
+#include "xnu_quick_test_helpers.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+void create_target_directory( const char * the_targetp )
+{
+    int             err;
+    const char *    my_targetp;
+
+    my_targetp = getenv("TMPDIR");
+    if ( my_targetp == NULL )
+        my_targetp = "/tmp";
+
+    T_ASSERT_LT( strlen( the_targetp ), (unsigned long)( PATH_MAX - 1 ),
+        "check target path too long - \"%s\"", the_targetp );
+
+    for ( ;; ) {
+        int         my_rand;
+        char        my_name[64];
+        
+        my_rand = rand( );
+        sprintf( &my_name[0], "xnu_quick_test-%d", my_rand );
+        T_ASSERT_LT( strlen( &my_name[0] ) + strlen( the_targetp ) + 2, (unsigned long)PATH_MAX,
+            "check target path plus our test directory name is too long: "
+            "target path - \"%s\" test directory name - \"%s\"",
+            the_targetp, &my_name[0] );
+
+        /* append generated directory name onto our path */
+        g_target_path[0] = 0x00;
+        strcat( &g_target_path[0], the_targetp );
+        if ( g_target_path[ (strlen(the_targetp) - 1) ] != '/' ) {
+            strcat( &g_target_path[0], "/" );
+        }
+        strcat( &g_target_path[0], &my_name[0] );
+        
+        /* try to create the test directory */
+        err = mkdir( &g_target_path[0], (S_IRWXU | S_IRWXG | S_IROTH) );
+        if ( err == 0 ) {
+            break;
+        }
+        err = errno;
+        if ( EEXIST != err ) {
+            T_ASSERT_FAIL( "test directory creation failed - \"%s\" \n"
+                "mkdir call failed with error %d - \"%s\"", 
+                &g_target_path[0], errno, strerror( err) );
+        }
+    }
+
+} /* create_target_directory */
+
+/*
+ * create_random_name - creates a file with a random / unique name in the given directory.
+ * when do_open is true we create a file, else we generate a name that does not exist in the
+ * given directory (we do not create anything when do_open is 0).
+ * WARNING - caller provides enough space in path buffer for longest possible name.
+ * WARNING - assumes caller has appended a trailing '/' on the path passed to us.
+ * RAND_MAX is currently 2147483647 (ten characters plus one for a slash)
+ */
+int create_random_name( char *the_pathp, int do_open ) {
+    int     i, my_err;
+    int     my_fd = -1;
+    
+    for ( i = 0; i < 1; i++ ) {
+        int         my_rand;
+        char        *myp;
+        char        my_name[32];
+        
+        my_rand = rand( );
+        sprintf( &my_name[0], "%d", my_rand );
+        T_ASSERT_LT_ULONG((strlen( &my_name[0] ) + strlen( the_pathp ) + 2), (unsigned long)PATH_MAX,
+            "check if path to test file is less than PATH_MAX");
+
+        // append generated file name onto our path
+        myp = strrchr( the_pathp, '/' );
+        *(myp + 1) = 0x00;
+        strcat( the_pathp, &my_name[0] );
+        if ( do_open ) {
+            /* create a file with this name */
+            my_fd = open( the_pathp, (O_RDWR | O_CREAT | O_EXCL),
+                            (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) );
+            T_EXPECT_TRUE((my_fd != -1 || errno == EEXIST), "open file with name %s", the_pathp);
+            
+            if( errno == EEXIST )
+                continue;
+        }
+        else {
+            /* make sure the name is unique */
+            struct stat     my_sb;
+            my_err = stat( the_pathp, &my_sb );
+            T_EXPECT_TRUE((my_err == 0 || errno == ENOENT), "make sure the name is unique");
+            
+            if(errno == ENOENT) break;
+            /* name already exists, try another */
+            i--;
+            continue;
+        }
+    }
+    
+    if ( my_fd != -1 )
+        close( my_fd );
+
+    if(do_open && my_fd == -1)
+        return 1;
+
+    return 0;
+} /* create_random_name */
+
+void remove_target_directory() {
+    rmdir(&g_target_path[0]);
+}
+
diff --git a/tests/xnu_quick_test_helpers.h b/tests/xnu_quick_test_helpers.h
new file mode 100644 (file)
index 0000000..b6a25ed
--- /dev/null
@@ -0,0 +1,16 @@
+#ifndef XNU_QUICK_TEST_HELPERS_H
+#define XNU_QUICK_TEST_HELPERS_H
+
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/syslimits.h>
+
+#define TEST_DIRECTORY "/tmp"
+
+extern char g_target_path[ PATH_MAX ];
+
+int create_random_name( char *the_pathp, int do_open );
+void create_target_directory( const char * the_targetp );
+void remove_target_directory( void );
+
+#endif
index 30383a3dbe1bc891bf253fd8cfd18a8fc2a6341d..26e79ffb4effd7bddff7d97a1bd9982bd48c2083 100644 (file)
@@ -14,6 +14,9 @@ LLDBMACROS_SOURCE:=$(SRCROOT)/tools/lldbmacros/
 LLDBMACROS_BOOTSTRAP_DEST:=$(OBJPATH)/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)
 LLDBMACROS_DEST:=$(LLDBMACROS_BOOTSTRAP_DEST)/lldbmacros/
 LLDBMACROS_USERDEBUG_FILES=
+ifeq ($(BUILD_STATIC_LINK),1)
+KERNEL_STATIC_DSYM_LLDBMACROS := $(OBJPATH)/$(KERNEL_FILE_NAME).link/$(KERNEL_FILE_NAME).dSYM/$(DSYMLLDBMACROSDIR)/lldbmacros
+endif
 
 LLDBMACROS_USERDEBUG_FILES:= \
        usertaskdebugging/__init__.py \
@@ -37,7 +40,9 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \
        plugins/zprint_perf_log.py \
        atm.py \
        bank.py \
+       turnstile.py \
        kevent.py \
+       workqueue.py \
        xnu.py \
        xnudefines.py \
        ktrace.py \
@@ -55,6 +60,7 @@ LLDBMACROS_PYTHON_FILES = $(LLDBMACROS_USERDEBUG_FILES) \
        memory.py \
        mbufs.py \
        net.py \
+       skywalk.py \
        ioreg.py \
        utils.py \
        kdp.py \
@@ -77,21 +83,21 @@ ifneq ($(PLATFORM),MacOSX)
                plugins/iosspeedtracer.sh
 endif
 
+include $(MakeInc_rule)
+include $(MakeInc_dir)
 
 INSTALL_LLDBMACROS_PYTHON_FILES=$(addprefix $(LLDBMACROS_DEST), $(LLDBMACROS_PYTHON_FILES))
+$(eval $(call INSTALLPYTHON_RULE_template,$(INSTALL_LLDBMACROS_PYTHON_FILES),$(LLDBMACROS_SOURCE)%,pydir,$(DATA_UNIFDEF),$(LLDBMACROS_DEST)))
+$(eval $(call INSTALLPYTHON_RULE_template,$(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME),$(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py,kbpydir,$(DATA_UNIFDEF),$(LLDBMACROS_BOOTSTRAP_DEST)/))
 
-$(INSTALL_LLDBMACROS_PYTHON_FILES): $(LLDBMACROS_DEST)% : $(LLDBMACROS_SOURCE)%
-       $(_v)$(MKDIR) $(dir $@)
-       $(_v)$(PYTHON) $(LLDBMACROS_SOURCE)/core/syntax_checker.py $< $(_vstdout)
-       $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
-       $(_v)$(TOUCH) $(LLDBMACROS_DEST)
-
-$(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME): $(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py
-       $(_v)$(MKDIR) $(dir $@)
-       $(_v)$(PYTHON) $(LLDBMACROS_SOURCE)/core/syntax_checker.py $< $(_vstdout)
-       $(_v)$(INSTALL) $(DATA_INSTALL_FLAGS) $< $@
+ifeq ($(BUILD_STATIC_LINK),1)
+INSTALL_STATIC_DSYM_LLDBMACROS_PYTHON_FILES=$(addprefix $(KERNEL_STATIC_DSYM_LLDBMACROS), $(LLDBMACROS_PYTHON_FILES))
+$(eval $(call INSTALLPYTHON_RULE_template,$(INSTALL_STATIC_DSYM_LLDBMACROS_PYTHON_FILES),$(LLDBMACROS_SOURCE)%,sdpydir,$(DATA_UNIFDEF),$(KERNEL_STATIC_DSYM_LLDBMACROS)))
+$(eval $(call INSTALLPYTHON_RULE_template,$(KERNEL_STATIC_DSYM_LLDBMACROS)/../$(KERNEL_LLDBBOOTSTRAP_NAME),$(LLDBMACROS_SOURCE)/core/xnu_lldb_init.py,kbsdpydir,$(DATA_UNIFDEF),$(KERNEL_STATIC_DSYM_LLDBMACROS)/../))
+endif
 
 lldbmacros_install: $(INSTALL_LLDBMACROS_PYTHON_FILES) $(LLDBMACROS_BOOTSTRAP_DEST)/$(KERNEL_LLDBBOOTSTRAP_NAME)
-
-include $(MakeInc_rule)
-include $(MakeInc_dir)
+       $(_v)$(MKDIR) $(LLDBMACROS_DEST)/builtinkexts
+ifeq ($(BUILD_STATIC_LINK),1)
+       $(_v)$(MKDIR) $(KERNEL_STATIC_DSYM_LLDBMACROS)/builtinkexts
+endif
index 0941f7530f4d2f785a1618d6233ea0e40fa0379e..3b1c4eadd580b657a86f390d32c4fee3d03ba7a1 100755 (executable)
@@ -416,6 +416,18 @@ def cast(obj, target_type):
         print "ERROR: You cannot cast an 'int' to %s, please use kern.GetValueFromAddress() for such purposes." % str(target_type) 
     raise TypeError("object of type %s cannot be casted to %s" % (str(type(obj)), str(target_type)))
 
+def containerof(obj, target_type, field_name):
+    """ Type cast an object to another C type from a pointer to a field.
+        params:
+            obj - core.value  object representing some C construct in lldb
+            target_type - str : ex 'struct thread'
+                        - lldb.SBType :
+            field_name - the field name within the target_type obj is a pointer to
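+        example (illustrative; the struct and field names are hypothetical):
+            thread = containerof(waitq_elem, 'struct thread', 'wait_links')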
+    """
+    addr = int(obj) - getfieldoffset(target_type, field_name)
+    obj = value(obj.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr)))
+    return cast(obj, target_type + " *")
+
 
 _value_types_cache={}
 
index 580887584d78b5dc23c40fca3adb86e2653ef108..43a3bd864b0cb5b1dfebf60e8359b1879ba2f531 100755 (executable)
@@ -6,6 +6,7 @@
 from cvalue import *
 from lazytarget import *
 from configuration import *
+from utils import *
 import caching
 import lldb
 
@@ -222,6 +223,30 @@ def IterateRBTreeEntry(element, element_type, field_name):
                 elt = cast(elt, element_type)
 
 
+def IteratePriorityQueueEntry(root, element_type, field_name):
+    """ iterate over a priority queue as defined with struct priority_queue from osfmk/kern/priority_queue.h
+            root         - value : Value object for the priority queue
+            element_type - str   : Type of the link element
+            field_name   - str   : Name of the field in link element's structure
+        returns:
+            A generator does not return. It is used for iterating
+            value  : an object that's of type (element_type). Always a pointer object
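+        example usage (illustrative; the type and field names are hypothetical):
+            for entry in IteratePriorityQueueEntry(root, 'struct foo', 'foo_pq_links'):
+                print entry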
+    """
+    def _make_pqe(addr):
+        return value(root.GetSBValue().CreateValueFromExpression(None,'(struct priority_queue_entry *)'+str(addr)))
+
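+    # The low two bits of pq_root_packed are used as flag bits; masking them off
+    # recovers the address of the root priority_queue_entry.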
+    queue = [unsigned(root.pq_root_packed) & ~3]
+
+    while len(queue):
+        elt = _make_pqe(queue.pop())
+
+        while elt:
+            yield containerof(elt, element_type, field_name)
+            addr = unsigned(elt.child)
+            if addr: queue.append(addr)
+            elt = elt.next
+
+
 class KernelTarget(object):
     """ A common kernel object that provides access to kernel objects and information.
         The class holds global lists for  task, terminated_tasks, procs, zones, zombroc etc.
@@ -399,9 +424,19 @@ class KernelTarget(object):
         val = ((addr + size) & (unsigned(self.GetGlobalVariable("page_size"))-1))
         return (val < size and val > 0)
 
+
+    def PhysToKVARM64(self, addr):
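+        """ Translate a physical address to a kernel virtual address on arm64:
+            use the ptov_table entry covering the address when one exists,
+            otherwise fall back to the fixed gPhysBase/gVirtBase offset.
+        """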
+        ptov_table = self.GetGlobalVariable('ptov_table')
+        for i in range(0, self.GetGlobalVariable('ptov_index')):
+            if (addr >= long(unsigned(ptov_table[i].pa))) and (addr < (long(unsigned(ptov_table[i].pa)) + long(unsigned(ptov_table[i].len)))):
+                return (addr - long(unsigned(ptov_table[i].pa)) + long(unsigned(ptov_table[i].va)))
+        return (addr - unsigned(self.GetGlobalVariable("gPhysBase")) + unsigned(self.GetGlobalVariable("gVirtBase")))
+
     def PhysToKernelVirt(self, addr):
         if self.arch == 'x86_64':
             return (addr + unsigned(self.GetGlobalVariable('physmap_base')))
+        elif self.arch.startswith('arm64'):
+            return self.PhysToKVARM64(addr)
         elif self.arch.startswith('arm'):
             return (addr - unsigned(self.GetGlobalVariable("gPhysBase")) + unsigned(self.GetGlobalVariable("gVirtBase")))
         else:
@@ -548,7 +583,7 @@ class KernelTarget(object):
             self._ptrsize = caching.GetStaticCacheData("kern.ptrsize", None)
             if self._ptrsize != None : return self._ptrsize
             arch = LazyTarget.GetTarget().triple.split('-')[0]
-            if arch in ('x86_64', 'arm64'):
+            if arch == 'x86_64' or arch.startswith('arm64'):
                 self._ptrsize = 8
             else:
                 self._ptrsize = 4
@@ -558,7 +593,7 @@ class KernelTarget(object):
         if name == 'VM_MIN_KERNEL_ADDRESS':
             if self.arch == 'x86_64':
                 return unsigned(0xFFFFFF8000000000)
-            elif self.arch == 'arm64':
+            elif self.arch.startswith('arm64'):
                 return unsigned(0xffffffe000000000)
             else:
                 return unsigned(0x80000000)
index c7f49ea188728df458061c6dc24ab843e516e596..e7f494b96e40dd1d4aa3776ad5ef5dac6c15d314 100755 (executable)
@@ -103,5 +103,23 @@ def __lldb_init_module(debugger, internal_dict):
         if source_map_cmd:
             print source_map_cmd
             debugger.HandleCommand(source_map_cmd)
+
+        load_kexts = True
+        if "XNU_LLDBMACROS_NOBUILTINKEXTS" in os.environ and len(os.environ['XNU_LLDBMACROS_NOBUILTINKEXTS']) > 0:
+            load_kexts = False
+        builtinkexts_path = os.path.join(os.path.dirname(self_path), "lldbmacros", "builtinkexts")
+        if os.access(builtinkexts_path, os.F_OK):
+            kexts = os.listdir(builtinkexts_path)
+            if len(kexts) > 0:
+                print "\nBuiltin kexts: %s\n" % kexts
+                if load_kexts == False:
+                    print "XNU_LLDBMACROS_NOBUILTINKEXTS is set, not loading:\n"
+                for kextdir in kexts:
+                    script = os.path.join(builtinkexts_path, kextdir, kextdir.split('.')[-1] + ".py")
+                    import_kext_cmd = "command script import \"%s\"" % script
+                    print "%s" % import_kext_cmd
+                    if load_kexts:
+                        debugger.HandleCommand(import_kext_cmd)
+
     print "\n"
 
index 74d2e3baadc06432218c8d654e90250ad5b44b88..e2bdaf20ebc29eca34a2cddeeae3edfd4abd8ad9 100755 (executable)
@@ -1,5 +1,6 @@
 from xnu import *
 from utils import *
+from kdp import *
 import sys
 
 ######################################
index 9e5c482150a0b1ef5e55703b9d124cc886cb3d02..81090bbd8c2615e610c7b9f353577719a59916fe 100755 (executable)
@@ -11,8 +11,8 @@ from waitq import *
 from ioreg import *
 import xnudefines
 
-@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <15s}".format("task", "pid", '#acts', "tablesize", "command"))
-def GetTaskIPCSummary(task):
+@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <20s}".format("task", "pid", '#acts', "tablesize", "command"))
+def GetTaskIPCSummary(task, show_busy = False):
     """ Display a task's ipc summary. 
         params:
             task : core.value representing a Task in kernel
@@ -20,12 +20,45 @@ def GetTaskIPCSummary(task):
             str - string of ipc info for the task
     """
     out_string = ''
-    format_string = "{0: <#020x} {1: <6d} {2: <6d} {3: <10d} {4: <15s}"
+    format_string = "{0: <#020x} {1: <6d} {2: <6d} {3: <10d} {4: <20s}"
+    busy_format = " {0: <10d} {1: <6d}"
+    proc_name = ''
+    if not task.active:
+        proc_name = 'terminated: '
+    if task.halting:
+        proc_name += 'halting: '
     pval = Cast(task.bsd_info, 'proc *')
+    if int(pval) != 0:
+        proc_name += str(pval.p_comm)
+    elif int(task.task_imp_base) != 0 and hasattr(task.task_imp_base, 'iit_procname'):
+        proc_name += str(task.task_imp_base.iit_procname)
     table_size = int(task.itk_space.is_table_size)
-    proc_name = str(pval.p_comm)
     out_string += format_string.format(task, pval.p_pid, task.thread_count, table_size, proc_name)
-    return out_string
+    if show_busy:
+        nbusy, nmsgs = GetTaskBusyPortsSummary(task)
+        out_string += busy_format.format(nbusy, nmsgs)
+        return (out_string, table_size, nbusy, nmsgs)
+    return (out_string, table_size)
+
+@header("{0: <20s} {1: <6s} {2: <6s} {3: <10s} {4: <20s} {5: <10s} {6: <6s}".format("task", "pid", '#acts', "tablesize", "command", "#busyports", "#kmsgs"))
+def GetTaskBusyIPCSummary(task):
+    return GetTaskIPCSummary(task, True)
+
+def GetTaskBusyPortsSummary(task):
+    isp = task.itk_space
+    i = 0
+    nbusy = 0
+    nmsgs = 0
+    while i < isp.is_table_size:
+        iep = addressof(isp.is_table[i])
+        if iep.ie_bits & 0x00020000:
+            port = Cast(iep.ie_object, 'ipc_port_t')
+            if port.ip_messages.data.port.msgcount > 0:
+                nbusy += 1
+                nmsgs += port.ip_messages.data.port.msgcount
+        i = i + 1
+    return (nbusy, nmsgs)
+
 
 @header("{0: <20s} {1: <28s} {2: <12s} {3: <6s} {4: <4s}  {5: <20s} {6: <4s}\n".format(
             "port", "mqueue", "recvname", "flags", "refs", "recvname", "dest"))
@@ -87,6 +120,56 @@ def GetPortDestProc(portp):
     
     return out_str
 
+
+def GetPortDispositionString(disp):
+    if (disp < 0): ## use negative numbers for request ports
+        portname = 'notify'
+        if disp == -1:
+            disp_str = 'reqNS'
+        elif disp == -2:
+            disp_str = 'reqPD'
+        elif disp == -3:
+            disp_str = 'reqSPa'
+        elif disp == -4:
+            disp_str = 'reqSPr'
+        elif disp == -5:
+            disp_str = 'reqSPra'
+        else:
+            disp_str = '-X'
+    ## These dispositions should match those found in osfmk/mach/message.h
+    elif disp == 16:
+        disp_str = 'R'  ## receive
+    elif disp == 24:
+        disp_str = 'dR' ## dispose receive
+    elif disp == 17:
+        disp_str = 'S'  ## (move) send
+    elif disp == 19:
+        disp_str = 'cS' ## copy send
+    elif disp == 20:
+        disp_str = 'mS' ## make send
+    elif disp == 25:
+        disp_str = 'dS' ## dispose send
+    elif disp == 18:
+        disp_str = 'O'  ## send-once
+    elif disp == 21:
+        disp_str = 'mO' ## make send-once
+    elif disp == 26:
+        disp_str = 'dO' ## dispose send-once
+    ## faux dispositions used to string-ify IPC entry types
+    elif disp == 100:
+        disp_str = 'PS' ## port set
+    elif disp == 101:
+        disp_str = 'dead' ## dead name
+    elif disp == 102:
+        disp_str = 'L' ## LABELH
+    elif disp == 103:
+        disp_str = 'V' ## Thread voucher (thread->ith_voucher->iv_port)
+    ## Catch-all
+    else:
+        disp_str = 'X'  ## invalid
+    return disp_str
+
+
 @header("{:<20s} {:<28s} {:<12s} {:<8s} {:<6s} {:<19s} {:<26s} {:<26s}\n".format(
             "", "kmsg", "msgid", "disp", "size", "reply-port", "source", "destination"))
 def GetKMsgSummary(kmsgp, prefix_str=""):
@@ -164,8 +247,8 @@ def GetKMsgSummary(kmsgp, prefix_str=""):
                     GetKMsgSrc(kmsgp), dest_proc_name)
     
     if kmsgh.msgh_bits & 0x80000000:
-        out_string += prefix_str + "\t" + GetKMsgBody.header + "\n"
-        out_string += prefix_str + "\t" + GetKMsgBody(kmsgp, prefix_str + "\t") + "\n"
+        out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc.header + "\n"
+        out_string += prefix_str + "\t" + GetKMsgComplexBodyDesc(kmsgp, prefix_str + "\t") + "\n"
     
     return out_string
 
@@ -177,31 +260,56 @@ def GetMachMsgOOLDescriptorSummary(desc):
     out_string = format_string.format(desc, desc.address, desc.size)
     return out_string
 
+
+def GetKmsgDescriptors(kmsgp):
+    """ Get a list of descriptors in a complex message
+    """
+    kmsghp = kmsgp.ikm_header
+    kmsgh = dereference(kmsghp)
+    if not (kmsgh.msgh_bits & 0x80000000):
+        return []
+    ## Something in the python/lldb types is not getting alignment correct here.
+    ## I'm grabbing a pointer to the body manually, and using tribal knowledge
+    ## of the location of the descriptor count to get this correct
+    body = Cast(addressof(Cast(addressof(kmsgh), 'char *')[sizeof(kmsgh)]), 'mach_msg_body_t *')
+    #dsc_count = body.msgh_descriptor_count
+    dsc_count = dereference(Cast(body, 'uint32_t *'))
+    #dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *')
+    dschead = Cast(addressof(Cast(addressof(body[0]), 'char *')[sizeof('uint32_t')]), 'mach_msg_descriptor_t *')
+    dsc_list = []
+    for i in range(dsc_count):
+        dsc_list.append(dschead[i])
+    return (body, dschead, dsc_list)
+
+
 @header("{: <20s} {: <8s} {: <20s} {: <10s} {: <20s}".format("kmsgheader", "size", "body", "ds_count", "dsc_head"))
-def GetKMsgBody(kmsgp, prefix_str=""):
+def GetKMsgComplexBodyDesc(kmsgp, prefix_str=""):
     """ Routine that prints a complex kmsg's body
     """
     kmsghp = kmsgp.ikm_header
     kmsgh = dereference(kmsghp)
+    if not (kmsgh.msgh_bits & 0x80000000):
+        return ""
     format_string = "{: <#020x} {: <#08x} {: <#020x} {: <#010x} {: <#020x}"
     out_string = ""
-    body = Cast(addressof(kmsghp[1]), 'mach_msg_body_t *')
-    dsc_count = body.msgh_descriptor_count
 
-    dschead = Cast(addressof(body[1]), 'mach_msg_descriptor_t *')
-    out_string += format_string.format(kmsghp, sizeof(dereference(kmsghp)), body, unsigned(dsc_count), dschead)
-    
-    for i in range(dsc_count):
-        dsc = dschead[i]        
-        out_string += "\n" + prefix_str + "Descriptor: " + xnudefines.mach_msg_type_descriptor_strings[unsigned(dsc.type.type)]
-        if unsigned(dsc.type.type) == 0:
-            # its a port.
-            p = dsc.port.name
-            out_string += " name: {: <#20x}".format(p)
-        elif unsigned(dsc.type.type) in (1,3):
-            # its OOL DESCRIPTOR or OOL VOLATILE DESCRIPTOR
-            ool = dsc.out_of_line
-            out_string += " " + GetMachMsgOOLDescriptorSummary(addressof(ool))
+    (body, dschead, dsc_list) = GetKmsgDescriptors(kmsgp)
+    out_string += format_string.format(kmsghp, sizeof(dereference(kmsghp)), body, len(dsc_list), dschead)
+    for dsc in dsc_list:
+        try:
+            dsc_type = unsigned(dsc.type.type)
+            out_string += "\n" + prefix_str + "Descriptor: " + xnudefines.mach_msg_type_descriptor_strings[dsc_type]
+            if dsc_type == 0:
+                # its a port.
+                p = dsc.port.name
+                dstr = GetPortDispositionString(dsc.port.disposition)
+                out_string += " disp:{:s}, name:{: <#20x}".format(dstr, p)
+            elif unsigned(dsc.type.type) in (1,3):
+                # its OOL DESCRIPTOR or OOL VOLATILE DESCRIPTOR
+                ool = dsc.out_of_line
+                out_string += " " + GetMachMsgOOLDescriptorSummary(addressof(ool))
+        except:
+            out_string += "\n" + prefix_str + "Invalid Descriptor: {}".format(dsc)
     return out_string 
 
 def GetKMsgSrc(kmsgp):
@@ -348,8 +456,9 @@ def ShowTaskIPC(cmd_args=None):
     print GetTaskSummary.header + " " + GetProcSummary.header
     pval = Cast(tval.bsd_info, 'proc *')
     print GetTaskSummary(tval) + " " + GetProcSummary(pval)
-    print GetTaskIPCSummary.header
-    print GetTaskIPCSummary(tval)
+    print GetTaskBusyIPCSummary.header
+    (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(tval)
+    print summary
 
 # EndMacro: showtaskipc
 
@@ -376,8 +485,15 @@ def ShowIPCSummary(cmd_args=None):
         tasks that are candidates for further investigation.
     """
     print GetTaskIPCSummary.header
+    ipc_table_size = 0
     for t in kern.tasks:
-        print GetTaskIPCSummary(t)
+        (summary, table_size) = GetTaskIPCSummary(t)
+        ipc_table_size += table_size
+        print summary
+    for t in kern.terminated_tasks:
+        (summary, table_size) = GetTaskIPCSummary(t)
+        ipc_table_size += table_size
+    print "Total Table size: {:d}".format(ipc_table_size)
     return
 
 def GetKObjectFromPort(portval):
@@ -461,7 +577,7 @@ def GetPortDestinationSummary(port):
     return out_str
     
 @lldb_type_summary(['ipc_entry_t'])
-@header("{: <20s} {: <20s} {: <8s} {: <8s} {: <8s} {: <8s} {: <20s} {: <20s}".format("object", "name","rite", "urefs", "nsets", "nmsgs", "destname", "destination"))
+@header("{: <20s} {: <12s} {: <8s} {: <8s} {: <8s} {: <8s} {: <20s} {: <20s}".format("object", "name", "rite", "urefs", "nsets", "nmsgs", "destname", "destination"))
 def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
     """ Get summary of a ipc entry.
         params:
@@ -477,19 +593,19 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
             'R'     : Receive right
             'O'     : Send-once right
         types of notifications:
+            'd'     : Dead-Name notification requested
             's'     : Send-Possible notification armed
-            'd'     : Send-Possible notification requested
-            'n'     : Dead-Name notification requested
-            'c'     : ???
-            'x'     : No-Senders notification requested
+            'r'     : Send-Possible notification requested
+            'n'     : No-Senders notification requested
+            'x'     : Port-destroy notification requested
     """
-    out_str = ''    
+    out_str = ''
     entry_ptr = int(hex(entry), 16)
     format_string = "{: <#020x} {: <12s} {: <8s} {: <8d} {: <8d} {: <8d} {: <20s} {: <20s}"
     right_str = ''
     destname_str = ''
     destination_str = ''
-    
+
     ie_object = entry.ie_object
     ie_bits = int(entry.ie_bits)
     urefs = int(ie_bits & 0xffff)
@@ -523,16 +639,31 @@ def GetIPCEntrySummary(entry, ipc_name='', rights_filter=0):
             sorightval = requestsval[int(entry.index.request)].notify.port
             soright_ptr = unsigned(sorightval)
             if soright_ptr != 0:
-                 # send-possible armed
-                 if soright_ptr & 0x1 : right_str +='s'
-                 # send-possible requested
-                 elif soright_ptr & 0x2 : right_str +='d'
-                 # dead-name notification requested
-                 else : right_str +='n'
-        # XXX: What does this bit mean?
-        if ie_bits & 0x00800000 : right_str +='c'
+                # dead-name notification requested
+                right_str += 'd'
+                # send-possible armed
+                if soright_ptr & 0x1 : right_str +='s'
+                # send-possible requested
+                if soright_ptr & 0x2 : right_str +='r'
         # No-senders notification requested
-        if portval.ip_nsrequest != 0: right_str +='x'
+        if portval.ip_nsrequest != 0: right_str += 'n'
+        # port-destroy notification requested
+        if portval.ip_pdrequest != 0: right_str += 'x'
+
+        # early-out if the rights-filter doesn't match
+        if rights_filter != 0 and rights_filter != right_str:
+            return ''
+
+        # append the generation to the name value
+        # (from osfmk/ipc/ipc_entry.h)
+        # bits    rollover period
+        # 0 0     64
+        # 0 1     48
+        # 1 0     32
+        # 1 1     16
+        ie_gen_roll = { 0:'.64', 1:'.48', 2:'.32', 3:'.16' }
+        ipc_name = '{:s}{:s}'.format(strip(ipc_name), ie_gen_roll[(ie_bits & 0x00c00000) >> 22])
+
         # now show the port destination part
         destname_str = GetPortDestinationSummary(Cast(ie_object, 'ipc_port_t'))
         # Get the number of sets to which this port belongs
@@ -620,12 +751,12 @@ def ShowRights(cmd_args=None, cmd_options={}):
                     'S'     : Send right
                     'R'     : Receive right
                     'O'     : Send-once right
-                types of notifications (append to rights type string):
+                types of notifications:
+                    'd'     : Dead-Name notification requested
                     's'     : Send-Possible notification armed
-                    'd'     : Send-Possible notification requested
-                    'n'     : Dead-Name notification requested
-                    'c'     : ???
-                    'x'     : No-Senders notification requested
+                    'r'     : Send-Possible notification requested
+                    'n'     : No-Senders notification requested
+                    'x'     : Port-destroy notification requested
     """
     if not cmd_args:
         print "No arguments passed"
@@ -655,12 +786,12 @@ def ShowTaskRights(cmd_args=None, cmd_options={}):
                    'S'     : Send right
                    'R'     : Receive right
                    'O'     : Send-once right
-               types of notifications (append to rights type string):
+               types of notifications:
+                   'd'     : Dead-Name notification requested
                    's'     : Send-Possible notification armed
-                   'd'     : Send-Possible notification requested
-                   'n'     : Dead-Name notification requested
-                   'c'     : ???
-                   'x'     : No-Senders notification requested
+                   'r'     : Send-Possible notification requested
+                   'n'     : No-Senders notification requested
+                   'x'     : Port-destroy notification requested
     """
     if cmd_args == None:
         print "No arguments passed"
@@ -693,12 +824,12 @@ def ShowTaskRightsBt(cmd_args=None, cmd_options={}):
                    'S'     : Send right
                    'R'     : Receive right
                    'O'     : Send-once right
-               types of notifications (append to rights type string):
+               types of notifications:
+                   'd'     : Dead-Name notification requested
                    's'     : Send-Possible notification armed
-                   'd'     : Send-Possible notification requested
-                   'n'     : Dead-Name notification requested
-                   'c'     : ???
-                   'x'     : No-Senders notification requested
+                   'r'     : Send-Possible notification requested
+                   'n'     : No-Senders notification requested
+                   'x'     : Port-destroy notification requested
     """
     if cmd_args == None:
         print "No arguments passed"
@@ -733,12 +864,12 @@ def ShowAllRights(cmd_args=None, cmd_options={}):
                     'S'     : Send right
                     'R'     : Receive right
                     'O'     : Send-once right
-                types of notifications (append to rights type string):
+                types of notifications:
+                    'd'     : Dead-Name notification requested
                     's'     : Send-Possible notification armed
-                    'd'     : Send-Possible notification requested
-                    'n'     : Dead-Name notification requested
-                    'c'     : ???
-                    'x'     : No-Senders notification requested
+                    'r'     : Send-Possible notification requested
+                    'n'     : No-Senders notification requested
+                    'x'     : Port-destroy notification requested
     """
     rights_type = 0
     if "-R" in cmd_options:
@@ -757,6 +888,525 @@ def ShowAllRights(cmd_args=None, cmd_options={}):
 
 # EndMacro: showallrights
 
+
+def GetInTransitPortSummary(port, disp, holding_port, holding_kmsg):
+    """ String-ify the in-transit disposition of a port.
+    """
+    ## This should match the summary generated by GetIPCEntrySummary
+    ##              "object"   "name"   "rite"  "urefs" "nsets" "nmsgs" "destname" "destination"
+    format_str = "\t{: <#20x} {: <12} {: <8s} {: <8d} {: <8d} {: <8d} p:{: <#19x} k:{: <#19x}"
+    portname = 'intransit'
+
+    disp_str = GetPortDispositionString(disp)
+
+    out_str = format_str.format(unsigned(port), 'in-transit', disp_str, 0, 0, port.ip_messages.data.port.msgcount, unsigned(holding_port), unsigned(holding_kmsg))
+    return out_str
+
+
+def GetDispositionFromEntryType(entry_bits):
+    """ Translate an IPC entry type into an in-transit disposition. This allows
+        the GetInTransitPortSummary function to be re-used to string-ify IPC
+        entry types.
+    """
+    ebits = int(entry_bits)
+    if (ebits & 0x003f0000) == 0:
+        return 0
+
+    if (ebits & 0x00010000) != 0:
+        return 17 ## MACH_PORT_RIGHT_SEND
+    elif (ebits & 0x00020000) != 0:
+        return 16 ## MACH_PORT_RIGHT_RECEIVE
+    elif (ebits & 0x00040000) != 0:
+        return 18 ## MACH_PORT_RIGHT_SEND_ONCE
+    elif (ebits & 0x00080000) != 0:
+        return 100 ## MACH_PORT_RIGHT_PORT_SET
+    elif (ebits & 0x00100000) != 0:
+        return 101 ## MACH_PORT_RIGHT_DEAD_NAME
+    elif (ebits & 0x00200000) != 0:
+        return 102 ## MACH_PORT_RIGHT_LABELH
+    else:
+        return 0
+
+def GetDispositionFromVoucherPort(th_vport):
+    """ Translate a thread's voucher port into a 'disposition'
+    """
+    if unsigned(th_vport) > 0:
+        return 103  ## Voucher type
+    return 0
+
+
+g_kmsg_prog = 0
+g_progmeter = {
+    0 : '*',
+    1 : '-',
+    2 : '\\',
+    3 : '|',
+    4 : '/',
+    5 : '-',
+    6 : '\\',
+    7 : '|',
+    8 : '/',
+}
+
+def PrintProgressForKmsg():
+    global g_kmsg_prog
+    global g_progmeter
+    sys.stderr.write(" {:<1s}\r".format(g_progmeter[g_kmsg_prog % 9]))
+    g_kmsg_prog += 1
+
+
+def CollectPortsForAnalysis(port, disposition):
+    """ Yield (port, disposition) tuples for 'port' itself and for any
+        notification ports (no-senders, port-death, send-possible) it references.
+    """
+    p = Cast(port, 'struct ipc_port *')
+    yield (p, disposition)
+
+    # no-senders notification port
+    if unsigned(p.ip_nsrequest) != 0:
+        PrintProgressForKmsg()
+        yield (Cast(p.ip_nsrequest, 'struct ipc_port *'), -1)
+
+    # port-death notification port
+    if unsigned(p.ip_pdrequest) != 0:
+        PrintProgressForKmsg()
+        yield (Cast(p.ip_pdrequest, 'struct ipc_port *'), -2)
+
+    ## ports can have many send-possible notifications armed: go through the table!
+    if unsigned(p.ip_requests) != 0:
+        table = Cast(p.ip_requests, 'struct ipc_port_request *')
+        table_sz = int(table.name.size.its_size)
+        for i in range(table_sz):
+            if i == 0:
+                continue
+            ipr = table[i]
+            if unsigned(ipr.name.name) != 0:
+                ipr_bits = unsigned(ipr.notify.port) & 3
+                ipr_port = kern.GetValueFromAddress(int(ipr.notify.port) & ~3, 'struct ipc_port *')
+                ipr_disp = 0
+                if ipr_bits & 3: ## send-possible armed and requested
+                    ipr_disp = -5
+                elif ipr_bits & 2: ## send-possible requested
+                    ipr_disp = -4
+                elif ipr_bits & 1: ## send-possible armed
+                    ipr_disp = -3
+                PrintProgressForKmsg()
+                yield (ipr_port, ipr_disp)
+    return
+
+def CollectKmsgPorts(task, task_port, kmsgp):
+    """ Look through a message, 'kmsgp' destined for 'task'
+        (enqueued on task_port). Collect any port descriptors,
+        remote, local, voucher, or other port references
+        into a (ipc_port_t, disposition) list.
+    """
+    kmsgh = dereference(kmsgp.ikm_header)
+
+    p_list = []
+
+    PrintProgressForKmsg()
+    if kmsgh.msgh_remote_port and unsigned(kmsgh.msgh_remote_port) != unsigned(task_port):
+        disp = kmsgh.msgh_bits & 0x1f
+        p_list += list(CollectPortsForAnalysis(kmsgh.msgh_remote_port, disp))
+
+    if kmsgh.msgh_local_port and unsigned(kmsgh.msgh_local_port) != unsigned(task_port) \
+       and unsigned(kmsgh.msgh_local_port) != unsigned(kmsgh.msgh_remote_port):
+        disp = (kmsgh.msgh_bits & 0x1f00) >> 8
+        p_list += list(CollectPortsForAnalysis(kmsgh.msgh_local_port, disp))
+
+    if kmsgp.ikm_voucher:
+        p_list += list(CollectPortsForAnalysis(kmsgp.ikm_voucher, 0))
+
+    if kmsgh.msgh_bits & 0x80000000:
+        ## Complex message - look for descriptors
+        PrintProgressForKmsg()
+        (body, dschead, dsc_list) = GetKmsgDescriptors(kmsgp)
+        for dsc in dsc_list:
+            PrintProgressForKmsg()
+            dsc_type = unsigned(dsc.type.type)
+            if dsc_type == 0 or dsc_type == 2: ## 0 == port, 2 == ool port
+                if dsc_type == 0:
+                    ## its a port descriptor
+                    dsc_disp = dsc.port.disposition
+                    p_list += list(CollectPortsForAnalysis(dsc.port.name, dsc_disp))
+                else:
+                    ## it's an ool_ports descriptor which is an array of ports
+                    dsc_disp = dsc.ool_ports.disposition
+                    dispdata = Cast(dsc.ool_ports.address, 'struct ipc_port *')
+                    for pidx in range(dsc.ool_ports.count):
+                        PrintProgressForKmsg()
+                        p_list += list(CollectPortsForAnalysis(dispdata[pidx], dsc_disp))
+    return p_list
+
+def CollectKmsgPortRefs(task, task_port, kmsgp, p_refs):
+    """ Recursively collect all references to ports inside the kmsg 'kmsgp'
+        into the set 'p_refs'
+    """
+    p_list = CollectKmsgPorts(task, task_port, kmsgp)
+
+    ## Iterate over the ports we've collected to see if they
+    ## have messages on them, and then recurse!
+    for p, pdisp in p_list:
+        ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16
+        p_refs.add((p, pdisp, ptype))
+        if ptype != 0: ## don't bother with port sets
+            continue
+        ## If the port that's in-transit has messages already enqueued,
+        ## go through each of those messages and look for more ports!
+        if p.ip_messages.data.port.msgcount > 0:
+            p_kmsgp = Cast(p.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t')
+            kmsgheadp = p_kmsgp
+            while unsigned(p_kmsgp) > 0:
+                CollectKmsgPortRefs(task, p, p_kmsgp, p_refs)
+                p_kmsgp = p_kmsgp.ikm_next
+                if p_kmsgp == kmsgheadp:
+                    break
+
+
+def FindKmsgPortRefs(instr, task, task_port, kmsgp, qport):
+    """ Look through a message, 'kmsgp' destined for 'task'. If we find
+        any port descriptors, remote, local, voucher, or other port that
+        matches 'qport', return a short description
+        which should match the format of GetIPCEntrySummary.
+    """
+
+    out_str = instr
+    p_list = CollectKmsgPorts(task, task_port, kmsgp)
+
+    ## Run through all ports we've collected looking for 'qport'
+    for p, pdisp in p_list:
+        PrintProgressForKmsg()
+        if unsigned(p) == unsigned(qport):
+            ## the port we're looking for was found in this message!
+            if len(out_str) > 0:
+                out_str += '\n'
+            out_str += GetInTransitPortSummary(p, pdisp, task_port, kmsgp)
+
+        ptype = (p.ip_object.io_bits & 0x7fff0000) >> 16
+        if ptype != 0: ## don't bother with port sets
+            continue
+
+        ## If the port that's in-transit has messages already enqueued,
+        ## go through each of those messages and look for more ports!
+        if p.ip_messages.data.port.msgcount > 0:
+            p_kmsgp = Cast(p.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t')
+            kmsgheadp = p_kmsgp
+            while unsigned(p_kmsgp) > 0:
+                out_str = FindKmsgPortRefs(out_str, task, p, p_kmsgp, qport)
+                p_kmsgp = p_kmsgp.ikm_next
+                if p_kmsgp == kmsgheadp:
+                    break
+    return out_str
+
+
+port_iteration_do_print_taskname = False
+registeredport_idx = -10
+excports_idx = -20
+intransit_idx = -1000
+taskports_idx = -2000
+thports_idx = -3000
+
+def IterateAllPorts(tasklist, func, ctx, include_psets, follow_busyports, should_log):
+    """ Iterate over all ports in the system, calling 'func'
+        for each entry in each task's IPC space.
+    """
+    global port_iteration_do_print_taskname
+    global intransit_idx, taskports_idx, thports_idx, registeredport_idx, excports_idx
+
+    ## XXX: also host special ports
+
+    entry_port_type_mask = 0x00070000
+    if include_psets:
+        entry_port_type_mask = 0x000f0000
+
+    if tasklist is None:
+        tasklist = kern.tasks
+        tasklist += kern.terminated_tasks
+
+    tidx = 1
+
+    for t in tasklist:
+        # Write a progress line.  Using stderr avoids automatic newline when
+        # writing to stdout from lldb.  Blank spaces at the end clear out long
+        # lines.
+        if should_log:
+            procname = ""
+            if not t.active:
+                procname = 'terminated: '
+            if t.halting:
+                procname += 'halting: '
+            t_p = Cast(t.bsd_info, 'proc *')
+            if unsigned(t_p) != 0:
+                procname += str(t_p.p_name)
+            elif unsigned(t.task_imp_base) != 0 and hasattr(t.task_imp_base, 'iit_procname'):
+                procname += str(t.task_imp_base.iit_procname)
+            sys.stderr.write("  checking {:s} ({}/{})...{:50s}\r".format(procname, tidx, len(tasklist), ''))
+        tidx += 1
+
+        port_iteration_do_print_taskname = True
+        space = t.itk_space
+        num_entries = int(space.is_table_size)
+        is_tableval = space.is_table
+        idx = 0
+        while idx < num_entries:
+            entry_val = GetObjectAtIndexFromArray(is_tableval, idx)
+            entry_bits= unsigned(entry_val.ie_bits)
+            entry_obj = 0
+            entry_str = ''
+            entry_name = "{:x}".format( (idx << 8 | entry_bits >> 24) )
+
+            entry_disp = GetDispositionFromEntryType(entry_bits)
+
+            ## If the entry in the table represents a port of some sort,
+            ## then make the callback provided
+            if int(entry_bits) & entry_port_type_mask:
+                eport = Cast(entry_val.ie_object, 'ipc_port_t')
+                ## Make the callback
+                func(t, space, ctx, idx, entry_val, eport, entry_disp)
+
+                ## if the port has pending messages, look through
+                ## each message for ports (and recurse)
+                if follow_busyports and unsigned(eport) > 0 and eport.ip_messages.data.port.msgcount > 0:
+                    ## collect all port references from all messages
+                    kmsgp = Cast(eport.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t')
+                    kmsgheadp = kmsgp
+                    while unsigned(kmsgp) > 0:
+                        p_refs = set()
+                        CollectKmsgPortRefs(t, eport, kmsgp, p_refs)
+                        for (port, pdisp, ptype) in p_refs:
+                            func(t, space, ctx, intransit_idx, None, port, pdisp)
+                        kmsgp = kmsgp.ikm_next
+                        if kmsgp == kmsgheadp:
+                            break
+
+            idx = idx + 1
+        ## while (idx < num_entries)
+
+        ## Task ports (send rights)
+        if unsigned(t.itk_sself) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_sself, 17)
+        if unsigned(t.itk_host) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_host, 17)
+        if unsigned(t.itk_bootstrap) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_bootstrap, 17)
+        if unsigned(t.itk_seatbelt) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_seatbelt, 17)
+        if unsigned(t.itk_gssd) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_gssd, 17)
+        if unsigned(t.itk_debug_control) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_debug_control, 17)
+        if unsigned(t.itk_task_access) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_task_access, 17)
+
+        ## Task name port (not a send right, just a naked ref)
+        if unsigned(t.itk_nself) > 0:
+            func(t, space, ctx, taskports_idx, 0,t.itk_nself, 0)
+
+        ## task resume port is a receive right to resume the task
+        if unsigned(t.itk_resume) > 0:
+            func(t, space, ctx, taskports_idx, 0, t.itk_resume, 16)
+
+        ## registered task ports (all send rights)
+        tr_idx = 0
+        tr_max = sizeof(t.itk_registered) / sizeof(t.itk_registered[0])
+        while tr_idx < tr_max:
+            tport = t.itk_registered[tr_idx]
+            if unsigned(tport) > 0:
+                try:
+                    func(t, space, ctx, registeredport_idx, 0, tport, 17)
+                except Exception, e:
+                    print("\texception looking through registered port {:d}/{:d} in {:s}".format(tr_idx,tr_max,t))
+                    pass
+            tr_idx += 1
+
+        ## Task exception ports
+        exidx = 0
+        exmax = sizeof(t.exc_actions) / sizeof(t.exc_actions[0])
+        while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h
+            export = t.exc_actions[exidx].port ## send right
+            if unsigned(export) > 0:
+                try:
+                    func(t, space, ctx, excports_idx, 0, export, 17)
+                except Exception, e:
+                    print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t))
+                    pass
+            exidx += 1
+
+        ## XXX: any  ports still valid after clearing IPC space?!
+
+        for thval in IterateQueue(t.threads, 'thread *', 'task_threads'):
+            ## XXX: look at block reason to see if it's in mach_msg_receive - then look at saved state / message
+
+            ## Thread port (send right)
+            if unsigned(thval.ith_sself) > 0:
+                thport = thval.ith_sself
+                func(t, space, ctx, thports_idx, 0, thport, 17) ## see: osfmk/mach/message.h
+            ## Thread special reply port (send-once right)
+            if unsigned(thval.ith_special_reply_port) > 0:
+                thport = thval.ith_special_reply_port
+                func(t, space, ctx, thports_idx, 0, thport, 18) ## see: osfmk/mach/message.h
+            ## Thread voucher port
+            if unsigned(thval.ith_voucher) > 0:
+                vport = thval.ith_voucher.iv_port
+                if unsigned(vport) > 0:
+                    vdisp = GetDispositionFromVoucherPort(vport)
+                    func(t, space, ctx, thports_idx, 0, vport, vdisp)
+            ## Thread exception ports
+            if unsigned(thval.exc_actions) > 0:
+                exidx = 0
+                while exidx < exmax: ## see: osfmk/mach/[arm|i386]/exception.h
+                    export = thval.exc_actions[exidx].port ## send right
+                    if unsigned(export) > 0:
+                        try:
+                            func(t, space, ctx, excports_idx, 0, export, 17)
+                        except Exception, e:
+                            print("\texception looking through exception port {:d}/{:d} in {:s}".format(exidx,exmax,t))
+                            pass
+                    exidx += 1
+            ## XXX: the message on a thread (that's currently being received)
+        ## for (thval in t.threads)
+    ## for (t in tasklist)
+
+
+# Macro: findportrights
+def FindPortRightsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp):
+    """ Callback which uses 'ctx' as the (port,rights_types) tuple for which
+        a caller is seeking references. This should *not* be used from a
+        recursive call to IterateAllPorts.
+    """
+    global port_iteration_do_print_taskname
+
+    (qport, rights_type) = ctx
+    entry_name = ''
+    entry_str = ''
+    if unsigned(ipc_entry) != 0:
+        entry_bits = unsigned(ipc_entry.ie_bits)
+        entry_name = "{:x}".format( (entry_idx << 8 | entry_bits >> 24) )
+        if (int(entry_bits) & 0x001f0000) != 0 and unsigned(ipc_entry.ie_object) == unsigned(qport):
+            ## it's a valid entry, and it points to the port
+            entry_str = '\t' + GetIPCEntrySummary(ipc_entry, entry_name, rights_type)
+
+    procname = GetProcNameForTask(task)
+    if unsigned(ipc_port) != 0 and ipc_port.ip_messages.data.port.msgcount > 0:
+        sys.stderr.write("  checking {:s} busy-port {}:{:#x}...{:30s}\r".format(procname, entry_name, unsigned(ipc_port), ''))
+        ## Search through busy ports to find descriptors which could
+        ## contain the only reference to this port!
+        kmsgp = Cast(ipc_port.ip_messages.data.port.messages.ikmq_base, 'ipc_kmsg_t')
+        kmsgheadp = kmsgp
+        while unsigned(kmsgp):
+            entry_str = FindKmsgPortRefs(entry_str, task, ipc_port, kmsgp, qport)
+            kmsgp = kmsgp.ikm_next
+            if kmsgp == kmsgheadp:
+                break
+    if len(entry_str) > 0:
+        sys.stderr.write("{:80s}\r".format(''))
+        if port_iteration_do_print_taskname:
+            print "Task: {0: <#x} {1: <s}".format(task, procname)
+            print '\t' + GetIPCEntrySummary.header
+            port_iteration_do_print_taskname = False
+        print entry_str
+
+@lldb_command('findportrights', 'R:S:')
+def FindPortRights(cmd_args=None, cmd_options={}):
+    """  Routine to locate and print all extant rights to a given port
+         Usage: findportrights [-R rights_type] [-S <ipc_space_t>] <ipc_port_t>
+                -S ipc_space    : only search the specified ipc space
+                -R rights_type  : only display rights matching the string 'rights_type'
+
+                types of rights:
+                    'Dead'  : Dead name
+                    'Set'   : Port set
+                    'S'     : Send right
+                    'R'     : Receive right
+                    'O'     : Send-once right
+                types of notifications:
+                    'd'     : Dead-Name notification requested
+                    's'     : Send-Possible notification armed
+                    'r'     : Send-Possible notification requested
+                    'n'     : No-Senders notification requested
+                    'x'     : Port-destroy notification requested
+    """
+    if not cmd_args:
+        raise ArgumentError("no port address provided")
+    port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *')
+
+    rights_type = 0
+    if "-R" in cmd_options:
+        rights_type = cmd_options["-R"]
+
+    tasklist = None
+    if "-S" in cmd_options:
+        space = kern.GetValueFromAddress(cmd_options["-S"], 'struct ipc_space *')
+        tasklist = [ space.is_task ]
+
+    ## Don't include port sets
+    ## Don't recurse on busy ports (we do that manually)
+    ## DO log progress
+    IterateAllPorts(tasklist, FindPortRightsCallback, (port, rights_type), False, False, True)
+    sys.stderr.write("{:120s}\r".format(' '))
+
+    print "Done."
+    return
+# EndMacro: findportrights
+
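A usage sketch, mirroring how ShowPortSendRights now delegates to this macro; from the lldb console this is equivalent to typing "findportrights -R S <port address>" (the address below is a placeholder):

    # Find all extant send rights to a given port, from the lldb script console.
    FindPortRights(cmd_args=['0xffffff801234beef'], cmd_options={'-R': 'S'})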
+# Macro: countallports
+
+def CountPortsCallback(task, space, ctx, entry_idx, ipc_entry, ipc_port, port_disp):
+    """ Callback which uses 'ctx' as the set of all ports found in the
+        iteration. This should *not* be used from a recursive
+        call to IterateAllPorts.
+    """
+    global intransit_idx
+
+    (p_set, p_intransit, p_bytask) = ctx
+
+    ## Add the port address to the set of all port addresses
+    p_set.add(unsigned(ipc_port))
+
+    if entry_idx == intransit_idx:
+        p_intransit.add(unsigned(ipc_port))
+
+    if task.active or (task.halting and not task.active):
+        pname = str(Cast(task.bsd_info, 'proc *').p_name)
+        if not pname in p_bytask.keys():
+            p_bytask[pname] = { 'transit':0, 'table':0, 'other':0 }
+        if entry_idx == intransit_idx:
+            p_bytask[pname]['transit'] += 1
+        elif entry_idx >= 0:
+            p_bytask[pname]['table'] += 1
+        else:
+            p_bytask[pname]['other'] += 1
+
+
+@lldb_command('countallports', 'P')
+def CountAllPorts(cmd_args=None, cmd_options={}):
+    """ Routine to search for as many references to ipc_port structures in the kernel
+        as we can find.
+        Usage: countallports [-P]
+                -P : include port sets in the count (default: NO)
+    """
+    p_set = set()
+    p_intransit = set()
+    p_bytask = {}
+
+    find_psets = False
+    if "-P" in cmd_options:
+        find_psets = True
+
+    ## optionally include port sets
+    ## DO recurse on busy ports
+    ## DO log progress
+    IterateAllPorts(None, CountPortsCallback, (p_set, p_intransit, p_bytask), find_psets, True, True)
+    sys.stderr.write("{:120s}\r".format(' '))
+
+    print "Total ports found: {:d}".format(len(p_set))
+    print "In Transit: {:d}".format(len(p_intransit))
+    print "By Task:"
+    for pname in sorted(p_bytask.keys()):
+        count = p_bytask[pname]
+        print "\t{: <20s}: table={: <5d}, transit={: <5d}, other={: <5d}".format(pname, count['table'], count['transit'], count['other'])
+    return
+# EndMacro: countallports
+
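As with findportrights, this can be driven from the lldb console ("countallports", optionally with -P) or invoked directly; a minimal sketch:

    # Count every ipc_port reference reachable from live and terminated tasks,
    # including port sets.
    CountAllPorts(cmd_args=[], cmd_options={'-P': True})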
 # Macro: showpipestats
 @lldb_command('showpipestats')
 def ShowPipeStats(cmd_args=None):
@@ -816,6 +1466,35 @@ def ShowAllBusyPorts(cmd_args=None):
     return
 # EndMacro: showallbusyports
 
+# Macro: showbusyportsummary
+@lldb_command('showbusyportsummary')
+def ShowBusyPortSummary(cmd_args=None):
+    """ Routine to print a summary of information about all receive rights
+        on the system that have enqueued messages.
+    """
+    task_queue_head = kern.globals.tasks
+
+    ipc_table_size = 0
+    ipc_busy_ports = 0
+    ipc_msgs = 0
+
+    print GetTaskBusyIPCSummary.header
+    for tsk in kern.tasks:
+        (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(tsk)
+        ipc_table_size += table_size
+        ipc_busy_ports += nbusy
+        ipc_msgs += nmsgs
+        print summary
+    for t in kern.terminated_tasks:
+        (summary, table_size, nbusy, nmsgs) = GetTaskBusyIPCSummary(t)
+        ipc_table_size += table_size
+        ipc_busy_ports += nbusy
+        ipc_msgs += nmsgs
+        print summary
+    print "Total Table Size: {:d}, Busy Ports: {:d}, Messages in-flight: {:d}".format(ipc_table_size, ipc_busy_ports, ipc_msgs)
+    return
+# EndMacro: showbusyportsummary
+
 # Macro: showport:
 @lldb_command('showport','K')
 def ShowPort(cmd_args=None, cmd_options={}):
@@ -1372,21 +2051,8 @@ def ShowPortSendRights(cmd_args=[], cmd_options={}):
     port = kern.GetValueFromAddress(cmd_args[0], 'struct ipc_port *')
     i = 1
 
-    for t in kern.tasks:
-        # Write a progress line.  Using stderr avoids automatic newline when
-        # writing to stdout from lldb.  Blank spaces at the end clear out long
-        # lines.
-        sys.stderr.write("checking {:s} ({}/{})...{:30s}\r".format(Cast(t.bsd_info, 'proc_t').p_name, i, len(kern.tasks), ''))
-        i += 1
-        entries = GetSpaceSendRightEntries(t.itk_space, port)
-
-        if entries:
-            print GetTaskIPCSummary.header
-            print GetTaskIPCSummary(t)
-            print '\t' + GetIPCEntrySummary.header
+    return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'})
 
-        for entry in entries:
-            print "\t" + GetIPCEntrySummary(entry)
 
 @lldb_command('showtasksuspenders')
 def ShowTaskSuspenders(cmd_args=[], cmd_options={}):
@@ -1411,4 +2077,4 @@ def ShowTaskSuspenders(cmd_args=[], cmd_options={}):
         print "task {:#x} ({:s}) is suspended but no resume port exists".format(unsigned(task), Cast(task.bsd_info, 'proc_t').p_name)
         return
 
-    return ShowPortSendRights(cmd_args=[unsigned(port)], cmd_options=cmd_options)
+    return FindPortRights(cmd_args=[unsigned(port)], cmd_options={'-R':'S'})
index d4bec35c4cdd4da59fa888c792dd24c48d92117e..d924521fe640e0874cd4f50f338bfff3c862e2a1 100755 (executable)
@@ -109,7 +109,7 @@ def print_alloc_free_entry(addr, orig_ptr):
             print " #{:}: {}".format(btframes-i-1, GetSourceInformationForAddress(fr))
 
     print "",
-    print_hexdump(addr, asz, 0)
+    print_hexdump(addr, asz, 1)
 
 alloc_header_sz = 16
 
@@ -177,7 +177,7 @@ def print_alloc_info(_addr):
                     print " #{:}: {}".format(btframes-i-1, GetSourceInformationForAddress(fr))
 
                 print "",
-                print_hexdump(base, asz, 0)
+                print_hexdump(base, asz, 1)
             return
 
         elif magic_for_addr(addr, 0xf233) == unsigned(freeh.magic):
@@ -202,8 +202,14 @@ def print_whatis(_addr, ctx):
     rightrz = None
     extra = "Live"
 
-    shbyte = get_shadow_byte(shadow_for_address(addr, shift))
-    maxsearch = 4096 * 2
+    shaddr = shadow_for_address(addr, shift)
+    try:
+        shbyte = get_shadow_byte(shaddr)
+    except:
+        print "Unmapped shadow 0x{:x} for address 0x{:x}".format(shaddr, addr)
+        return
+
+    maxsearch = 8*4096
 
     if shbyte in [0xfa, 0xfb, 0xfd, 0xf5]:
         print_alloc_info(_addr)
@@ -266,9 +272,12 @@ def print_whatis(_addr, ctx):
     print "Valid range: 0x{:x} -- 0x{:x} ({} bytes)".format(base, base+total_size-1, total_size)
     print "Offset:      {} bytes".format(_addr - base)
     print "",
-    print_hexdump(base, total_size, 0)
+    print_hexdump(base, total_size, 1)
 
 def print_hexdump(base, size, ctx):
+    if size < 16:
+        size = 16
+    base -= base % 16
     start = base - 16*ctx
     size += size % 16
     size = min(size + 16*2*ctx, 256)
@@ -294,7 +303,7 @@ def kasan_subcommand(cmd, args, opts):
         print("0x{:02x} @ 0x{:016x} [{}]\n\n".format(sb, shadow, shadow_byte_to_string(sb)))
         ctx = long(opts.get("-C", 5))
         print_shadow_context(addr, ctx)
-    elif cmd == 'legend':
+    elif cmd == 'key' or cmd == 'legend':
         print_legend()
     elif cmd == 'info':
         pages_used = unsigned(kern.globals.shadow_pages_used)
@@ -308,6 +317,8 @@ def kasan_subcommand(cmd, args, opts):
         print_whatis(addr, ctx)
     elif cmd == 'alloc' or cmd == 'heap':
         print_alloc_info(addr)
+    else:
+        print "Unknown subcommand: `{}'".format(cmd)
 
 @lldb_command('kasan', 'C:')
 def Kasan(cmd_args=None, cmd_options={}):
index 12a996c6935f35fb0e8910ada76d7a0495997ac1..f6db996ccb42d078b06a6646206d59f23992ab98 100755 (executable)
@@ -30,9 +30,9 @@ kcdata_type_def = {
     'KCDATA_TYPE_INT64_DESC':           0x5,
     'KCDATA_TYPE_BINDATA_DESC':         0x6,
     'KCDATA_TYPE_ARRAY':                0x11,
-    'KCDATA_TYPE_TYPEDEFINTION':        0x12,
+    'KCDATA_TYPE_TYPEDEFINITION':       0x12,
     'KCDATA_TYPE_CONTAINER_BEGIN':      0x13,
-    'KCDATA_TYPE_CONTIANER_END':        0x14,
+    'KCDATA_TYPE_CONTAINER_END':        0x14,
 
     'KCDATA_TYPE_ARRAY_PAD0':           0x20,
     'KCDATA_TYPE_ARRAY_PAD1':           0x21,
@@ -74,9 +74,6 @@ kcdata_type_def = {
     'STACKSHOT_KCTYPE_KERN_PAGE_SIZE':  0x910,
     'STACKSHOT_KCTYPE_JETSAM_LEVEL':    0x911,
     'STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP': 0x912,
-    'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940,
-    'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941,
-
     'STACKSHOT_KCTYPE_KERN_STACKLR':  0x913,
     'STACKSHOT_KCTYPE_KERN_STACKLR64':  0x914,
     'STACKSHOT_KCTYPE_USER_STACKLR':  0x915,
@@ -92,9 +89,16 @@ kcdata_type_def = {
     'STACKSHOT_KCTYPE_THREAD_GROUP' : 0x91f,
     'STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT' : 0x920,
     'STACKSHOT_KCTYPE_JETSAM_COALITION' : 0x921,
+    'STACKSHOT_KCTYPE_THREAD_POLICY_VERSION': 0x922,
     'STACKSHOT_KCTYPE_INSTRS_CYCLES' : 0x923,
+    'STACKSHOT_KCTYPE_USER_STACKTOP' : 0x924,
+    'STACKSHOT_KCTYPE_ASID' : 0x925,
+    'STACKSHOT_KCTYPE_PAGE_TABLES' : 0x926,
+    'STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT' : 0x927,
+
+    'STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT': 0x940,
+    'STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT': 0x941,
 
-    'STACKSHOT_KCTYPE_THREAD_POLICY_VERSION': 0x922,
 
 
     'KCDATA_TYPE_BUFFER_END':      0xF19158ED,
@@ -291,7 +295,7 @@ class KCSubTypeElement(object):
 
 
 class KCTypeDescription(object):
-    def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, legacy_size=None, merge=False):
+    def __init__(self, t_type_id, t_elements=[], t_name='anon', custom_repr=None, legacy_size=None, merge=False, naked=False):
         self.type_id = t_type_id
         self.elements = t_elements
         self.name = t_name
@@ -300,6 +304,7 @@ class KCTypeDescription(object):
         if legacy_size:
             self.legacy_size = legacy_size
         self.merge = merge
+        self.naked = naked
         for e in self.elements:
             self.totalsize += e.GetTotalSize()
 
@@ -336,7 +341,10 @@ class KCTypeDescription(object):
             base_data = base_data[:self.legacy_size]
         if self.custom_JsonRepr:
             return self.custom_JsonRepr([e.GetValue(base_data) for e in self.elements])
-        o = ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements if not e.ShouldSkip(base_data)])
+        if self.naked:
+            o = ", ".join([e.GetJsonRepr(base_data) for e in self.elements if not e.ShouldSkip(base_data)])
+        else:
+            o = ", ".join(['"%s": %s' % (e.GetName(), e.GetJsonRepr(base_data)) for e in self.elements if not e.ShouldSkip(base_data)])
         if not self.merge:
             o = '{' + o + '}'
         return o
@@ -427,7 +435,7 @@ class KCObject(object):
         return KCObject(kcitem.i_type, kcitem.i_data, kcitem.i_offset, kcitem.i_flags)
 
     def IsContainerEnd(self):
-        return self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END')
+        return self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_END')
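For reference, the positive codes are the MACH_MSG_TYPE_* values from osfmk/mach/message.h (16 = move receive, 17 = move send, 18 = move send-once, 19 = copy send, and so on), while the negative codes are internal markers for notification-request ports. A quick self-check of the mapping above (assuming this module is loaded):

    # Expected short strings for a few representative dispositions.
    for disp, expect in [(16, 'R'), (19, 'cS'), (18, 'O'), (-1, 'reqNS')]:
        assert GetPortDispositionString(disp) == expect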
 
     def IsBufferEnd(self):
         return self.i_type == GetTypeForName('KCDATA_TYPE_BUFFER_END')
@@ -469,7 +477,7 @@ class KCObject(object):
             self.obj['typeID'] = self.i_type
             logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name))
 
-        elif self.i_type == GetTypeForName('KCDATA_TYPE_CONTIANER_END'):
+        elif self.i_type == GetTypeForName('KCDATA_TYPE_CONTAINER_END'):
             self.obj['uniqID'] = self.i_flags
             logging.info("0x%08x: %sEND" % (self.offset, INDENT(end=True)))
 
@@ -491,7 +499,7 @@ class KCObject(object):
             self.obj = u_d[1]
             logging.info("0x%08x: %s%s" % (self.offset, INDENT(), self.i_name))
 
-        elif self.i_type == GetTypeForName('KCDATA_TYPE_TYPEDEFINTION'):
+        elif self.i_type == GetTypeForName('KCDATA_TYPE_TYPEDEFINITION'):
             self.is_naked_type = True
             u_d = struct.unpack_from('II32s', self.i_data)
             self.obj['name'] = u_d[2].split(chr(0))[0]
@@ -854,7 +862,12 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT')] =
 
 KNOWN_TYPES_COLLECTION[0x909] = KCSubTypeElement('pth_name', KCSUBTYPE_TYPE.KC_ST_CHAR, KCSubTypeElement.GetSizeForArray(64, 1), 0, 1)
 
-
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT'), (
+    KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0),
+    KCSubTypeElement('imageUUID', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(16, 1), 8, 1)
+),
+    'system_shared_cache_layout'
+)
 
 KNOWN_TYPES_COLLECTION[GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64')] = KCTypeDescription(GetTypeForName('KCDATA_TYPE_LIBRARY_LOADINFO64'), (
     KCSubTypeElement('imageLoadAddress', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0),
@@ -1014,6 +1027,18 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_INSTRS_CYCLES')] = KCTyp
             ),
             'instrs_cycles_snapshot')
 
+def set_type(name, *args):
+    typ = GetTypeForName(name)
+    KNOWN_TYPES_COLLECTION[typ] = KCTypeDescription(GetTypeForName(typ), *args)
+
+
+set_type('STACKSHOT_KCTYPE_USER_STACKTOP',
+         (
+             KCSubTypeElement.FromBasicCtype('sp', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
+             KCSubTypeElement('stack_contents', KCSUBTYPE_TYPE.KC_ST_UINT8, KCSubTypeElement.GetSizeForArray(8, 1), 8, 1),
+         ),
+         'user_stacktop')
+
 #KNOWN_TYPES_COLLECTION[0x907] = KCSubTypeElement('donating_pids', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0, KCSubTypeElement._get_naked_element_value)
 KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PID')] = KCSubTypeElement('pid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0)
 KNOWN_TYPES_COLLECTION[GetTypeForName('TASK_CRASHINFO_PPID')] = KCSubTypeElement('ppid', KCSUBTYPE_TYPE.KC_ST_INT32, 4, 0, 0)
@@ -1095,6 +1120,7 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_CPU_TIMES')] = KCTypeDes
     (
         KCSubTypeElement.FromBasicCtype('user_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 0),
         KCSubTypeElement.FromBasicCtype('system_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 8),
+        KCSubTypeElement.FromBasicCtype('runnable_usec', KCSUBTYPE_TYPE.KC_ST_UINT64, 16),
     ), 'cpu_times')
 
 KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_STACKSHOT_DURATION'),
@@ -1150,6 +1176,16 @@ KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_WORKLOOP_ID')] = (
 KNOWN_TYPES_COLLECTION[GetTypeForName('EXIT_REASON_DISPATCH_QUEUE_NO')] = (
         KCSubTypeElement('exit_reason_dispatch_queue_no', KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value))
 
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_ASID')] = (
+    KCSubTypeElement('ts_asid', KCSUBTYPE_TYPE.KC_ST_UINT32, 4, 0, 0))
+
+KNOWN_TYPES_COLLECTION[GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES')] = KCTypeDescription(GetTypeForName('STACKSHOT_KCTYPE_PAGE_TABLES'), (
+    KCSubTypeElement(None, KCSUBTYPE_TYPE.KC_ST_UINT64, 8, 0, 0, KCSubTypeElement._get_naked_element_value), ),
+    'ts_pagetable',
+    merge=True,
+    naked=True
+)
+
 def GetSecondsFromMATime(mat, tb):
     return (float(mat) * tb['numer']) / tb['denom']
 
@@ -1194,6 +1230,7 @@ def GetStateDescription(s):
     TH_UNINT = 0x08
     TH_TERMINATE = 0x10
     TH_TERMINATE2 = 0x20
+    TH_WAIT_REPORT = 0x40
     TH_IDLE = 0x80
     if (s & TH_WAIT):
         retval.append("TH_WAIT")
@@ -1207,6 +1244,8 @@ def GetStateDescription(s):
         retval.append("TH_TERMINATE")
     if (s & TH_TERMINATE2):
         retval.append("TH_TERMINATE2")
+    if (s & TH_WAIT_REPORT):
+        retval.append("TH_WAIT_REPORT")
     if (s & TH_IDLE):
         retval.append("TH_IDLE")
     return retval
@@ -1232,6 +1271,7 @@ kThreadWaitPThreadRWLockWrite   = 0x0d
 kThreadWaitPThreadCondVar       = 0x0e
 kThreadWaitParkedWorkQueue      = 0x0f
 kThreadWaitWorkloopSyncWait     = 0x10
+kThreadWaitOnProcess            = 0x11
 
 
 UINT64_MAX = 0xffffffffffffffff
@@ -1329,6 +1369,9 @@ def formatWaitInfo(info):
         else:
             s += ", unknown owner"
         s += ", workloop id %x" % context
+    elif type == kThreadWaitOnProcess:
+        s += "waitpid, for pid %d" % owner
+
     else:
         s += "unknown type %d (owner %d, context %x)" % (type, owner, context)
 
@@ -1367,9 +1410,9 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete):
         return
 
     dsc_common = [format_uuid(ss.get('shared_cache_dyld_load_info')['imageUUID']),
-                  shared_cache_base_addr,
-                  "S"
-                 ]
+                  shared_cache_base_addr, "S" ]
+
+    dsc_layout = ss.get('system_shared_cache_layout')
 
     dsc_libs = []
     print "Shared cache UUID found from the binary data is <%s> " % str(dsc_common[0])
@@ -1381,11 +1424,17 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete):
         for i in dsc_libs_arr:
             _uuid = i[2].lower().replace('-','').strip()
             _addr = int(i[0], 16) + _load_addr
-            dsc_libs.append([_uuid, _addr, "P"])
+            dsc_libs.append([_uuid, _addr, "C"])
             #print "adding ", [_uuid, _addr, "C"]
     elif dsc_uuid:
         print "Provided shared cache UUID does not match. Skipping writing report."
         return
+    elif dsc_layout:
+        print "Found in memory system shared cache layout with {} images".format(len(dsc_layout))
+        slide = ss.get('shared_cache_dyld_load_info')['imageLoadAddress']
+
+        for image in dsc_layout:
+            dsc_libs.append([format_uuid(image['imageUUID']), image['imageLoadAddress'] + slide, "C"])
 
     AllImageCatalog = []
     obj = {}
@@ -1452,6 +1501,15 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete):
             continue
         tasksnap = piddata['task_snapshot']
         tsnap["pid"] = tasksnap["ts_pid"]
+        if 'ts_asid' in piddata:
+            tsnap["asid"] = piddata["ts_asid"]
+
+        if 'ts_pagetable' in piddata:
+            pagetables = []
+            for tte in piddata["ts_pagetable"]:
+                pagetables.append(tte)
+            tsnap["pageTables"] = pagetables
+
         tsnap["residentMemoryBytes"] = tasksnap["ts_task_size"]
         tsnap["timesDidThrottle"] = tasksnap["ts_did_throttle"]
         tsnap["systemTimeTask"] = GetSecondsFromMATime(tasksnap["ts_system_time_in_terminated_th"], timebase)
@@ -1493,6 +1551,11 @@ def SaveStackshotReport(j, outfile_name, dsc_uuid, dsc_libs_arr, incomplete):
                 for f in thdata["user_stack_frames"]:
                     uframes.append(GetSymbolInfoForFrame(AllImageCatalog, pr_libs, f['lr']))
                 thsnap["userFrames"] = uframes
+
+            if "user_stacktop" in thdata:
+                (address,) = struct.unpack("<Q", struct.pack("B"*8, *thdata["user_stacktop"]["stack_contents"]))
+                thsnap["userStacktop"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, address)
+
             if threadsnap['ths_wait_event']:
                 thsnap["waitEvent"] = GetSymbolInfoForFrame(AllImageCatalog, pr_libs, threadsnap['ths_wait_event'])
 
@@ -1654,8 +1717,14 @@ def prettify(data):
                 value = '%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X' % tuple(value)
             elif 'address' in key.lower() and isinstance(value, (int, long)):
                 value = '0x%X' % value
+            elif key == 'lr':
+                value = '0x%X' % value
             elif key == 'thread_waitinfo':
                 value = map(formatWaitInfo, value)
+            elif key == 'stack_contents':
+                print value
+                (address,) = struct.unpack("<Q", struct.pack("B"*8, *value))
+                value = '0x%X' % address
             else:
                 value = prettify(value);
             newdata[key] = value
index 8f9f7c9513d7d2d1d1fe04991a576930a5d2ce5a..1fb875628748e23d261d26061f461639b3539c27 100755 (executable)
@@ -99,7 +99,7 @@ def GetKnoteKqueue(kn):
             kn - the knote object
         returns: kq - the kqueue corresponding to the knote
     """
-    return kern.GetValueFromAddress(kn.kn_kq_packed + kern.VM_MIN_KERNEL_AND_KEXT_ADDRESS, 'struct kqueue *')
+    return kern.GetValueFromAddress(int(kn.kn_kq_packed), 'struct kqueue *')
 
 @lldb_type_summary(['knote *'])
 @header('{:<20s} {:<20s} {:<10s} {:<20s} {:<20s} {:<30s} {:<10} {:<10} {:<10} {:<30s}'.format('knote', 'ident', 'kev_flags', 'kqueue', 'udata', 'filtops', 'qos_use', 'qos_req', 'qos_ovr', 'status'))
@@ -148,7 +148,7 @@ def IterateKqueueKnotes(kq):
         yield kn
 
 @lldb_type_summary(['struct kqrequest *'])
-@header('{:<20s} {:<20s} {:<5s} {:<5s} {:<5s} {:<5s} {:s}'.format('kqrequest', 'thread', 'qos', 'ovr_qos', 'w_qos', 'sa_qos', 'state'))
+@header('{:<20s} {:<20s} {:<5s} {:<5s} {:<5s} {:s}'.format('kqrequest', 'thread', 'qos', 'ovr_qos', 'sa_qos', 'state'))
 def GetKqrequestSummary(kqr):
     """ Summarize kqrequest information
 
@@ -156,12 +156,11 @@ def GetKqrequestSummary(kqr):
             kqr - the kqrequest object
         returns: str - summary of kqrequest
     """
-    fmt = '{kqrp: <#020x} {kqr.kqr_bound.kqrb_thread: <#020x} {qos: <5s} {ovr_qos: <5s} {w_qos: <5s} {sa_qos: <5s} {state_str:<s}'
+    fmt = '{kqrp: <#020x} {kqr.kqr_thread: <#020x} {qos: <5s} {ovr_qos: <5s} {sa_qos: <5s} {state_str:<s}'
     return fmt.format(kqrp=int(kqr),
             kqr=kqr,
             qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)],
             ovr_qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_override_index)],
-            w_qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_dsync_waiters_qos)],
             sa_qos=xnudefines.thread_qos_short_strings[int(kqr.kqr_stayactive_qos)],
             state_str=xnudefines.GetStateString(xnudefines.kqrequest_state_strings, kqr.kqr_state))
 
@@ -280,7 +279,7 @@ def GetKqworkloopSummary(kqwl):
             dyn_id=kqwl.kqwl_dynamicid,
             kqr_state=xnudefines.GetStateString(xnudefines.kqrequest_state_strings, kqwl.kqwl_request.kqr_state),
             st_str=xnudefines.GetStateString(xnudefines.kq_state_strings, state),
-            servicer=int(kqwl.kqwl_request.kqr_bound.kqrb_thread),
+            servicer=int(kqwl.kqwl_request.kqr_thread),
             owner=int(kqwl.kqwl_owner)
             )
 
index 548f9870fec75cac11e2400afeb5d5e8a227c048..9affaa7d2dc3ef3f9ee116d867844880a2003ced 100755 (executable)
@@ -780,3 +780,12 @@ def McacheShowCache(cmd_args=None):
     out_string += "Total # of objects cached:\t\t" + str(total) + "\n"
     print out_string
 # EndMacro: mcache_showcache
+
+# Macro: mbuf_wdlog
+@lldb_command('mbuf_wdlog')
+def MbufShowWdlog(cmd_args=None):
+    """Display the watchdog log
+    """
+    lldb_run_command('settings set max-string-summary-length 4096')
+    print('%s' % lldb_run_command('p/s mbwdog_logging').replace("\\n","\n"))
+# EndMacro: mbuf_wdlog
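[Editor's note] Usage note for the macro above: it raises LLDB's string-summary limit so the whole watchdog buffer is shown, then prints the mbwdog_logging global with its escaped newlines turned back into real line breaks. Invocation from an LLDB kernel-debug session, plus a sketch of the newline fix-up (the sample string is illustrative):

    (lldb) mbuf_wdlog

    # "p/s" output escapes newlines as the two characters backslash + n;
    # the macro restores them before printing.
    raw = 'first entry\\nsecond entry\\nthird entry'
    print(raw.replace("\\n", "\n"))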
index a0fdab4af0c777c742da8e0f544900a3c2a5c4a9..a39837ff230a764321ad06689b5dfb7b50594cc0 100755 (executable)
@@ -60,13 +60,13 @@ def CalculateLedgerPeak(phys_footprint_entry):
     """
     now = kern.globals.sched_tick / 20
     ledger_peak = phys_footprint_entry.le_credit - phys_footprint_entry.le_debit
-    if (now - phys_footprint_entry._le.le_maxtracking.le_peaks[0].le_time <= 1) and (phys_footprint_entry._le.le_maxtracking.le_peaks[0].le_max > ledger_peak):
-        ledger_peak = phys_footprint_entry._le.le_maxtracking.le_peaks[0].le_max
+    if hasattr(phys_footprint_entry._le._le_max, 'le_interval_max') and (phys_footprint_entry._le._le_max.le_interval_max > ledger_peak):
+        ledger_peak = phys_footprint_entry._le._le_max.le_interval_max
     return ledger_peak
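[Editor's note] The rewritten peak calculation reads the interval maximum from the new _le_max sub-structure, and the hasattr() guard keeps the macro usable against cores whose ledger entries carry no interval tracking. A reduced sketch of that guard (the stand-in class and values are illustrative):

    class LedgerMax(object):
        def __init__(self, interval_max):
            self.le_interval_max = interval_max

    def ledger_peak(balance, le_max):
        # Start from the current balance (credit - debit) and only adopt the
        # tracked interval maximum when the field exists and is larger.
        peak = balance
        if hasattr(le_max, 'le_interval_max') and le_max.le_interval_max > peak:
            peak = le_max.le_interval_max
        return peak

    print(ledger_peak(100, LedgerMax(250)))   # 250
    print(ledger_peak(100, object()))         # 100 (no interval tracking available)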
 
 @header("{: >8s} {: >12s} {: >12s} {: >10s} {: >12s} {: >14s} {: >10s} {: >12s} {: >10s} {: >10s} {: >10s}  {: <20s}\n".format(
 'pid', 'effective', 'requested', 'state', 'user_data', 'physical', 'iokit', 'footprint',
-'spike', 'lifemax', 'limit', 'command'))
+'recent peak', 'lifemax', 'limit', 'command'))
 def GetMemoryStatusNode(proc_val):
     """ Internal function to get memorystatus information from the given proc
         params: proc - value representing struct proc *
@@ -87,7 +87,7 @@ def GetMemoryStatusNode(proc_val):
     phys_footprint_limit = task_phys_footprint_ledger_entry.le_limit / page_size
     ledger_peak = CalculateLedgerPeak(task_phys_footprint_ledger_entry)
     phys_footprint_spike = ledger_peak / page_size
-    phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le.le_maxtracking.le_lifetime_max / page_size
+    phys_footprint_lifetime_max = task_phys_footprint_ledger_entry._le._le_max.le_lifetime_max / page_size
 
     format_string = '{0: >8d} {1: >12d} {2: >12d} {3: #011x} {4: #011x} {5: >12d} {6: >10d} {7: >13d}'
     out_str += format_string.format(proc_val.p_pid, proc_val.p_memstat_effectivepriority,
@@ -232,11 +232,109 @@ def WhatIsHelper(cmd_args=None):
         pass
     return
 
+# Macro: showzcache
+
+@lldb_type_summary(['zone','zone_t'])
+@header("{:^18s} {:<40s} {:>10s} {:>10s} {:>10s} {:>10s}".format(
+'ZONE', 'NAME', 'CACHE_ELTS', 'DEP_VALID', 'DEP_EMPTY','DEP_FULL'))
+
+def GetZoneCacheSummary(zone):
+    """ Summarize a zone's cache with important information.
+        params:
+          zone: value - obj representing a zone in kernel
+        returns:
+          str - summary of the zone's cache contents
+    """
+    out_string = ""
+    format_string = '{:#018x} {:<40s} {:>10d} {:>10s} {:>10d} {:>10d}'
+    cache_elem_count = 0
+    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
+    depot_capacity = kern.GetGlobalVariable('depot_element_count')
+
+
+    if zone.__getattr__('cpu_cache_enabled') :
+        for i in range(0, kern.globals.machine_info.physical_cpu):
+            cache = zone.zcache[0].zcc_per_cpu_caches[i]
+            cache_elem_count += cache.current.zcc_magazine_index
+            cache_elem_count += cache.previous.zcc_magazine_index
+        
+        if zone.zcache[0].zcc_depot_index != -1:
+            cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity
+            out_string += format_string.format(zone, zone.zone_name, cache_elem_count, "Y", depot_capacity - zone.zcache[0].zcc_depot_index, zone.zcache[0].zcc_depot_index)
+        else:
+            out_string += format_string.format(zone, zone.zone_name, cache_elem_count, "N", 0, 0)
+
+    return out_string
+
+@lldb_command('showzcache')
+def ZcachePrint(cmd_args=None):
+    """ Routine to print a summary listing of all the kernel zones cache contents
+    All columns are printed in decimal
+    """
+    global kern
+    print GetZoneCacheSummary.header
+    for zval in kern.zones:
+        if zval.__getattr__('cpu_cache_enabled') :
+            print GetZoneCacheSummary(zval)
+
+# EndMacro: showzcache
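[Editor's note] The CACHE_ELTS column computed above is the number of elements in every CPU's current and previous magazines, plus the full magazines parked in the depot, each worth magazine_element_count elements. A self-contained sketch of that arithmetic (the argument shapes are illustrative stand-ins for the zcache structures):

    def cached_element_count(per_cpu_magazines, depot_index, magazine_capacity):
        # per_cpu_magazines: one (current, previous) element-count pair per CPU.
        count = sum(cur + prev for (cur, prev) in per_cpu_magazines)
        if depot_index != -1:
            # depot_index full magazines are sitting in the depot
            count += depot_index * magazine_capacity
        return count

    print(cached_element_count([(3, 7), (0, 5)], depot_index=2, magazine_capacity=8))
    # 3+7+0+5 + 2*8 = 31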
+
+# Macro: showzcachecpu
+
+@lldb_type_summary(['zone','zone_t'])
+@header("{:^18s} {:40s} {:>10s} {:>10s}".format(
+'ZONE', 'NAME', 'CACHE_ELTS', 'CPU_INFO'))
+
+def GetZoneCacheCPUSummary(zone):
+    """ Summarize a zone's cache broken up per cpu
+        params:
+          zone: value - obj representing a zone in kernel
+        returns:
+          str - summary of the zone's per CPU cache contents
+    """
+    out_string = ""
+    format_string = '{:#018x} {:40s} {:10d} {cpuinfo:s}'
+    cache_elem_count = 0
+    cpu_info = ""
+    per_cpu_count = 0
+    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
+    depot_capacity = kern.GetGlobalVariable('depot_element_count')
+
+
+    if zone.__getattr__('cpu_cache_enabled') :
+        for i in range(0, kern.globals.machine_info.physical_cpu):
+            if i != 0:
+                cpu_info += ", "
+            cache = zone.zcache[0].zcc_per_cpu_caches[i]
+            per_cpu_count = cache.current.zcc_magazine_index
+            per_cpu_count += cache.previous.zcc_magazine_index
+            cache_elem_count += per_cpu_count
+            cpu_info += "CPU {:d}: {:5}".format(i,per_cpu_count)
+        if zone.zcache[0].zcc_depot_index != -1:
+            cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity
+
+    out_string += format_string.format(zone, zone.zone_name, cache_elem_count, cpuinfo=cpu_info)
+
+    return out_string
+
+@lldb_command('showzcachecpu')
+def ZcacheCPUPrint(cmd_args=None):
+    """ Routine to print a summary listing of all the kernel zones cache contents
+    All columns are printed in decimal
+    """
+    global kern
+    print GetZoneCacheCPUSummary.header
+    for zval in kern.zones:
+        if zval.__getattr__('cpu_cache_enabled') :
+            print GetZoneCacheCPUSummary(zval)
+
+# EndMacro: showzcachecpu
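[Editor's note] Both zone-cache macros take no arguments; showzcache prints one totals line per zone with per-CPU caching enabled, while showzcachecpu breaks the same counts out per CPU in the CPU_INFO column. Typical invocations:

    (lldb) showzcache
    (lldb) showzcachecpu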
+
 # Macro: zprint
 
 @lldb_type_summary(['zone','zone_t'])
-@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s}({:>6s} {:>6s} {:>6s}) {:^15s} {:<20s}".format(
-'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', 'ELTS', 'PGS', 'WASTE', 'FLAGS', 'NAME'))
+@header("{:^18s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:^6s} {:^6s} {:^6s} {:>10s} {:^15s} {:<20s}".format(
+'ZONE', 'TOT_SZ', 'PAGE_COUNT', 'ALLOC_ELTS', 'FREE_ELTS', 'FREE_SZ', 'ALL_FREE_PGS', 'ELT_SZ', 'ALLOC', '(ELTS', 'PGS', 'WASTE)', 'CACHE_ELTS', 'FLAGS', 'NAME'))
 def GetZoneSummary(zone):
     """ Summarize a zone with important information. See help zprint for description of each field
         params:
@@ -245,11 +343,12 @@ def GetZoneSummary(zone):
           str - summary of the zone
     """
     out_string = ""
-    format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d}  {markings} {name:s} '
+    format_string = '{:#018x} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:10d} {:6d} {:6d} {:6d} {:10d}  {markings} {name:s} '
     pagesize = kern.globals.page_size
 
     free_elements = zone.countfree
     free_size = free_elements * zone.elem_size
+    mag_capacity = kern.GetGlobalVariable('magazine_element_count')
 
     alloc_pages = zone.alloc_size / pagesize
     alloc_count = zone.alloc_size / zone.elem_size
@@ -267,7 +366,8 @@ def GetZoneSummary(zone):
             ["zleak_on",                    "L"],
             ["doing_alloc_without_vm_priv", "A"],
             ["doing_alloc_with_vm_priv",    "S"],
-            ["waiting",                     "W"]
+            ["waiting",                     "W"],
+            ["cpu_cache_enabled",           "E"]
             ]
     if kern.arch == 'x86_64':
         marks.append(["gzalloc_exempt",     "M"])
@@ -281,10 +381,19 @@ def GetZoneSummary(zone):
             markings+=mark[1]
         else:
             markings+=" "
+    cache_elem_count = 0
+    if zone.__getattr__('cpu_cache_enabled') :
+        for i in range(0, kern.globals.machine_info.physical_cpu):
+            cache = zone.zcache[0].zcc_per_cpu_caches[i]
+            cache_elem_count += cache.current.zcc_magazine_index
+            cache_elem_count += cache.previous.zcc_magazine_index
+        if zone.zcache[0].zcc_depot_index != -1:
+            cache_elem_count += zone.zcache[0].zcc_depot_index * mag_capacity
+
     out_string += format_string.format(zone, zone.cur_size, zone.page_count,
                     zone.count, free_elements, free_size, zone.count_all_free_pages,
                     zone.elem_size, zone.alloc_size, alloc_count,
-                    alloc_pages, alloc_waste, name = zone.zone_name, markings=markings)
+                    alloc_pages, alloc_waste, cache_elem_count, name = zone.zone_name, markings=markings)
 
     if zone.exhaustible :
             out_string += "(max: {:d})".format(zone.max_size)
@@ -309,6 +418,7 @@ def Zprint(cmd_args=None):
         A - currently trying to allocate more backing memory from kernel_memory_allocate without VM priv
         S - currently trying to allocate more backing memory from kernel_memory_allocate with VM priv
         W - another thread is waiting for more memory
+        E - Per-cpu caching is enabled for this zone
         L - zone is being monitored by zleaks
         G - currently running GC
         I - zone was destroyed and is no longer valid
@@ -1117,10 +1227,10 @@ def ShowAllVMStats(cmd_args=None):
     vmstats.compressed_lifetime = 0
     vmstats.error = ''
 
-    hdr_format = "{0: >10s} {1: <20s} {2: >6s} {3: >10s} {4: >10s} {5: >10s} {6: >10s} {7: >10s} {8: >10s} {9: >10s} {10: >10s} {11: >10s} {12: >10s} {13: >10s} {14:}"
-    print hdr_format.format('pid', 'command', '#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'compressed', 'compressed', 'compressed', '')
-    print hdr_format.format('', '', '', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(current)', '(peak)', '(lifetime)', '')
-    entry_format = "{p.p_pid: >10d} {p.p_comm: <20s} {m.hdr.nentries: >6d} {s.wired_count: >10d} {vsize: >10d} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} {s.compressed_lifetime: >10d} {s.error}"
+    hdr_format = "{:>6s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:>10s} {:<20s} {:1s}"
+    print hdr_format.format('#ents', 'wired', 'vsize', 'rsize', 'NEW RSIZE', 'max rsize', 'internal', 'external', 'reusable', 'compressed', 'compressed', 'compressed', 'pid', 'command', '')
+    print hdr_format.format('', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(pages)', '(current)', '(peak)', '(lifetime)', '', '', '')
+    entry_format = "{m.hdr.nentries: >6d} {s.wired_count: >10d} {vsize: >10d} {s.resident_count: >10d} {s.new_resident_count: >10d} {s.resident_max: >10d} {s.internal: >10d} {s.external: >10d} {s.reusable: >10d} {s.compressed: >10d} {s.compressed_peak: >10d} {s.compressed_lifetime: >10d} {p.p_pid: >10d} {p.p_comm: <20s} {s.error}"
 
     for task in kern.tasks:
         proc = Cast(task.bsd_info, 'proc *')
@@ -1195,7 +1305,7 @@ def ShowMapVME(cmd_args=None):
         usage: showmapvme <vm_map>
     """
     if cmd_args == None or len(cmd_args) < 1:
-        print "Invalid argument.", ShowMap.__doc__
+        print "Invalid argument.", ShowMapVME.__doc__
         return
     map_val = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t')
     print GetVMMapSummary.header
@@ -1217,7 +1327,7 @@ def GetVMMapSummary(vmmap):
     resident_pages = 0
     if vmmap.pmap != 0: resident_pages = int(vmmap.pmap.stats.resident_count)
     first_free = 0
-    if int(vmmap.holelistenabled) == 0: first_free = vmmap.f_s.first_free
+    if int(vmmap.holelistenabled) == 0: first_free = vmmap.f_s._first_free
     out_string += format_string.format(vmmap, vmmap.pmap, vm_size, vmmap.hdr.nentries, resident_pages, vmmap.hint, first_free)
     return out_string
 
@@ -1485,6 +1595,7 @@ def AddKextAddr(cmd_args=[]):
 
     addr = ArgumentStringToInt(cmd_args[0])
     all_kexts_info = GetKextLoadInformation()
+    kernel_uuid = str(kern.globals.kernel_uuid_string).lower()
     found_kinfo = None
     found_segment = None
     for kinfo in all_kexts_info:
@@ -1493,14 +1604,17 @@ def AddKextAddr(cmd_args=[]):
             print GetKextSummary.header
             print GetKextSummary(kinfo[7]) + " segment: {} offset = {:#0x}".format(segment.name, (addr - segment.vmaddr))
             cur_uuid = kinfo[0].lower()
-            print "Fetching dSYM for %s" % cur_uuid
-            info = dsymForUUID(cur_uuid)
-            if info and 'DBGSymbolRichExecutable' in info:
-                print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable'])
-                addDSYM(cur_uuid, info)
-                loadDSYM(cur_uuid, int(kinfo[1],16), kinfo[4])
+            if (kernel_uuid == cur_uuid):
+                print "(builtin)"
             else:
-                print "Failed to get symbol info for %s" % cur_uuid
+                print "Fetching dSYM for %s" % cur_uuid
+                info = dsymForUUID(cur_uuid)
+                if info and 'DBGSymbolRichExecutable' in info:
+                    print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable'])
+                    addDSYM(cur_uuid, info)
+                    loadDSYM(cur_uuid, int(kinfo[1],16), kinfo[4])
+                else:
+                    print "Failed to get symbol info for %s" % cur_uuid
             return
 
 
@@ -1594,6 +1708,7 @@ def AddKextSyms(cmd_args=[], cmd_options={}):
         return True
 
     all_kexts_info = GetKextLoadInformation()
+    kernel_uuid = str(kern.globals.kernel_uuid_string).lower()
 
     if "-N" in cmd_options:
         kext_name = cmd_options["-N"]
@@ -1608,14 +1723,17 @@ def AddKextSyms(cmd_args=[], cmd_options={}):
             for x in all_kexts_info:
                 if cur_knm == x[2]:
                     cur_uuid = x[0].lower()
-                    print "Fetching dSYM for {:s}".format(cur_uuid)
-                    info = dsymForUUID(cur_uuid)
-                    if info and 'DBGSymbolRichExecutable' in info:
-                        print "Adding dSYM ({0:s}) for {1:s}".format(cur_uuid, info['DBGSymbolRichExecutable'])
-                        addDSYM(cur_uuid, info)
-                        loadDSYM(cur_uuid, int(x[1],16), x[4])
+                    if (kernel_uuid == cur_uuid):
+                        print "(builtin)"
                     else:
-                        print "Failed to get symbol info for {:s}".format(cur_uuid)
+                        print "Fetching dSYM for {:s}".format(cur_uuid)
+                        info = dsymForUUID(cur_uuid)
+                        if info and 'DBGSymbolRichExecutable' in info:
+                            print "Adding dSYM ({0:s}) for {1:s}".format(cur_uuid, info['DBGSymbolRichExecutable'])
+                            addDSYM(cur_uuid, info)
+                            loadDSYM(cur_uuid, int(x[1],16), x[4])
+                        else:
+                            print "Failed to get symbol info for {:s}".format(cur_uuid)
                     break
         kern.symbolicator = None
         return
@@ -1635,14 +1753,15 @@ def AddKextSyms(cmd_args=[], cmd_options={}):
     for k_info in all_kexts_info:
         cur_uuid = k_info[0].lower()
         if load_all_kexts or (uuid == cur_uuid):
-            print "Fetching dSYM for %s" % cur_uuid
-            info = dsymForUUID(cur_uuid)
-            if info and 'DBGSymbolRichExecutable' in info:
-                print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable'])
-                addDSYM(cur_uuid, info)
-                loadDSYM(cur_uuid, int(k_info[1],16), k_info[4])
-            else:
-                print "Failed to get symbol info for %s" % cur_uuid
+            if (kernel_uuid != cur_uuid):
+                print "Fetching dSYM for %s" % cur_uuid
+                info = dsymForUUID(cur_uuid)
+                if info and 'DBGSymbolRichExecutable' in info:
+                    print "Adding dSYM (%s) for %s" % (cur_uuid, info['DBGSymbolRichExecutable'])
+                    addDSYM(cur_uuid, info)
+                    loadDSYM(cur_uuid, int(k_info[1],16), k_info[4])
+                else:
+                    print "Failed to get symbol info for %s" % cur_uuid
         #end of for loop
     kern.symbolicator = None
     return True
@@ -2259,10 +2378,10 @@ def GetMutexLockSummary(mtx):
         out_str += "Pri                 : {mtx.lck_mtx_pri:#x}\n".format(mtx=mtx)
         out_str += "Spin                : {mtx.lck_mtx_spin:#x}\n".format(mtx=mtx)
         out_str += "Ext                 : {mtx.lck_mtx_is_ext:#x}\n".format(mtx=mtx)
-        if mtx.lck_mtxd_pad32 == 0xFFFFFFFF :
-            out_str += "Canary (valid)      : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx)
+        if mtx.lck_mtx_pad32 == 0xFFFFFFFF :
+            out_str += "Canary (valid)      : {mtx.lck_mtx_pad32:#x}\n".format(mtx=mtx)
         else:
-            out_str += "Canary (INVALID)    : {mtx.lck_mtxd_pad32:#x}\n".format(mtx=mtx)
+            out_str += "Canary (INVALID)    : {mtx.lck_mtx_pad32:#x}\n".format(mtx=mtx)
         return out_str
 
     out_str = "Lock Type\t\t: MUTEX\n"
@@ -2319,33 +2438,34 @@ def ShowLock(cmd_args=None, cmd_options={}):
         return
 
     summary_str = ""
-    lock = kern.GetValueFromAddress(cmd_args[0], 'uintptr_t *')
-
-    if kern.arch == "x86_64" and lock:
+    addr = cmd_args[0]
+    # from osfmk/arm/locks.h
+    LCK_SPIN_TYPE = 0x11
+    LCK_MTX_TYPE = 0x22
+    if kern.arch == "x86_64":
         if "-M" in cmd_options:
-            lock_mtx = kern.GetValueFromAddress(lock, 'lck_mtx_t *')
+            lock_mtx = kern.GetValueFromAddress(addr, 'lck_mtx_t *')
             summary_str = GetMutexLockSummary(lock_mtx)
         elif "-S" in cmd_options:
-            lock_spin = kern.GetValueFromAddress(lock, 'lck_spin_t *')
+            lock_spin = kern.GetValueFromAddress(addr, 'lck_spin_t *')
             summary_str = GetSpinLockSummary(lock_spin)
         else:
             summary_str = "Please specify supported lock option(-M/-S)"
 
         print summary_str
-        return
-
-    if lock:
-        lock_mtx = Cast(lock, 'lck_mtx_t*')
-        if lock_mtx.lck_mtx_type == 0x22:
-            summary_str = GetMutexLockSummary(lock_mtx)
-
-        lock_spin = Cast(lock, 'lck_spin_t*')
-        if lock_spin.type == 0x11:
-            summary_str = GetSpinLockSummary(lock_spin)
-
-    if summary_str == "":
-        summary_str = "Lock Type\t\t: INVALID LOCK" 
-    print summary_str
+    else:
+        lock = kern.GetValueFromAddress(addr, 'uintptr_t *')
+        if lock:
+            lock_mtx = Cast(lock, 'lck_mtx_t*')
+            if lock_mtx.lck_mtx_type == LCK_MTX_TYPE:
+                summary_str = GetMutexLockSummary(lock_mtx)
+
+            lock_spin = Cast(lock, 'lck_spin_t*')
+            if lock_spin.type == LCK_SPIN_TYPE:
+                summary_str = GetSpinLockSummary(lock_spin)
+        if summary_str == "":
+            summary_str = "Lock Type\t\t: INVALID LOCK"
+        print summary_str
 
 #EndMacro: showlock
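[Editor's note] With the restructuring above, showlock on x86_64 still needs an explicit -M or -S to choose the summary, while on other architectures the type byte stored in the lock word (0x22 mutex, 0x11 spinlock, per osfmk/arm/locks.h) selects it automatically. Example invocations (the addresses are placeholders):

    (lldb) showlock -M 0xffffff8012345678    # x86_64: decode as lck_mtx_t
    (lldb) showlock -S 0xffffff8012345678    # x86_64: decode as lck_spin_t
    (lldb) showlock 0xfffffff012345678       # arm/arm64: type read from the lock itself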
 
@@ -2451,10 +2571,10 @@ def ShowAllPurgeableNonVolatileVmObjects(cmd_args=None):
     queue_len = kern.globals.purgeable_nonvolatile_count
     queue_head = kern.globals.purgeable_nonvolatile_queue
 
-    print 'purgeable_nonvolatile_queue:{:#018x}  purgeable_volatile_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('purgeable_nonvolatile_queue'),queue_len)
+    print 'purgeable_nonvolatile_queue:{: <#018x}  purgeable_volatile_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('purgeable_nonvolatile_queue'),queue_len)
     print 'N:non-volatile  V:volatile  E:empty  D:deny\n'
 
-    print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process")
+    print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:>3s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tag","owner","pid","process")
     idx = 0
     for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'):
         idx += 1
@@ -2487,14 +2607,14 @@ def ShowPurgeableNonVolatileVmObject(object, idx, queue_len, nonvolatile_total):
         compressor_pager = Cast(object.pager, 'compressor_pager *')
         compressed_count = compressor_pager.cpgr_num_slots_occupied
 
-    print "{:>6d}/{:<6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d}   {:#018x} {:>6d} {:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner))
+    print "{:>6d}/{:<6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d}  {:>3d} {: <#018x} {:>6d} {:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner))
 
     nonvolatile_total.objects += 1
     nonvolatile_total.vsize += object.vo_un1.vou_size/page_size
     nonvolatile_total.rsize += object.resident_page_count
     nonvolatile_total.wsize += object.wired_page_count
     nonvolatile_total.csize += compressed_count
-    if object.vo_un2.vou_purgeable_owner == 0:
+    if object.vo_un2.vou_owner == 0:
         nonvolatile_total.disowned_objects += 1
         nonvolatile_total.disowned_vsize += object.vo_un1.vou_size/page_size
         nonvolatile_total.disowned_rsize += object.resident_page_count
@@ -2561,7 +2681,7 @@ def ShowPurgeableGroup(qhead, volatile_total):
     for object in IterateQueue(qhead, 'struct vm_object *', 'objq'):
         if idx == 0:
 #            print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:18s} {:>6s} {:<20s} {:18s} {:>6s} {:<20s} {:s}\n".format("#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process","volatilizer","pid","process","")
-            print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:18s} {:>6s} {:<20s}\n".format("#","object","P","refcnt","size (pages)","resid","wired","compressed","owner","pid","process")
+            print "{:>6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:>3s} {:18s} {:>6s} {:<20s}\n".format("#","object","P","refcnt","size (pages)","resid","wired","compressed","tag","owner","pid","process")
         idx += 1
         ShowPurgeableVolatileVmObject(object, idx, volatile_total)
 
@@ -2572,7 +2692,7 @@ def ShowPurgeableVolatileVmObject(object, idx, volatile_total):
         returns:
             None
     """
-##   if int(object.vo_un2.vou_purgeable_owner) != int(object.vo_purgeable_volatilizer):
+##   if int(object.vo_un2.vou_owner) != int(object.vo_purgeable_volatilizer):
 #        diff=" !="
 ##    else:
 #        diff="  "
@@ -2592,14 +2712,14 @@ def ShowPurgeableVolatileVmObject(object, idx, volatile_total):
     else:
         compressor_pager = Cast(object.pager, 'compressor_pager *')
         compressed_count = compressor_pager.cpgr_num_slots_occupied
-#    print "{:>6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:#018x} {:>6d} {:<20s}   {:#018x} {:>6d} {:<20s} {:s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count,object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner),object.vo_purgeable_volatilizer,GetProcPIDForTask(object.vo_purgeable_volatilizer),GetProcNameForTask(object.vo_purgeable_volatilizer),diff)
-    print "{:>6d} {:#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d}   {:#018x} {:>6d} {:<20s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_un2.vou_purgeable_owner,GetProcPIDForTask(object.vo_un2.vou_purgeable_owner),GetProcNameForTask(object.vo_un2.vou_purgeable_owner))
+#    print "{:>6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {: <#018x} {:>6d} {:<20s}   {: <#018x} {:>6d} {:<20s} {:s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count,object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner),object.vo_purgeable_volatilizer,GetProcPIDForObjectOwner(object.vo_purgeable_volatilizer),GetProcNameForObjectOwner(object.vo_purgeable_volatilizer),diff)
+    print "{:>6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d}   {:>3d} {: <#018x} {:>6d} {:<20s}\n".format(idx,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner))
     volatile_total.objects += 1
     volatile_total.vsize += object.vo_un1.vou_size/page_size
     volatile_total.rsize += object.resident_page_count
     volatile_total.wsize += object.wired_page_count
     volatile_total.csize += compressed_count
-    if object.vo_un2.vou_purgeable_owner == 0:
+    if object.vo_un2.vou_owner == 0:
         volatile_total.disowned_objects += 1
         volatile_total.disowned_vsize += object.vo_un1.vou_size/page_size
         volatile_total.disowned_rsize += object.resident_page_count
@@ -2651,48 +2771,150 @@ def ShowTaskVMEntries(task, show_pager_info, show_all_shadows):
     if not task.map:
         print "Task {0: <#020x} has map = 0x0"
         return None
-    showmapvme(task.map, show_pager_info, show_all_shadows)
+    showmapvme(task.map, 0, 0, show_pager_info, show_all_shadows, False)
 
-@lldb_command("showmapvme", "PS")
+@lldb_command("showmapvme", "A:B:PRST")
 def ShowMapVME(cmd_args=None, cmd_options={}):
     """Routine to print out info about the specified vm_map and its vm entries
-        usage: showmapvme <vm_map>
+        usage: showmapvme <vm_map> [-A start] [-B end] [-S] [-P]
+        Use -A <start> flag to start at virtual address <start>
+        Use -B <end> flag to end at virtual address <end>
         Use -S flag to show VM object shadow chains
         Use -P flag to show pager info (mapped file, compressed pages, ...)
+        Use -R flag to reverse order
+        Use -T to show red-black tree pointers
     """
     if cmd_args == None or len(cmd_args) < 1:
-        print "Invalid argument.", ShowMap.__doc__
+        print "Invalid argument.", ShowMapVME.__doc__
         return
     show_pager_info = False
     show_all_shadows = False
+    show_rb_tree = False
+    start_vaddr = 0
+    end_vaddr = 0
+    reverse_order = False
+    if "-A" in cmd_options:
+        start_vaddr = unsigned(int(cmd_options['-A'], 16))
+    if "-B" in cmd_options:
+        end_vaddr = unsigned(int(cmd_options['-B'], 16))
     if "-P" in cmd_options:
         show_pager_info = True
     if "-S" in cmd_options:
         show_all_shadows = True
+    if "-R" in cmd_options:
+        reverse_order = True
+    if "-T" in cmd_options:
+        show_rb_tree = True
     map = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t')
-    showmapvme(map, show_pager_info, show_all_shadows)
+    showmapvme(map, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree)
+
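[Editor's note] With the new options, showmapvme can restrict the walk to a virtual-address window, reverse the iteration, and dump the red-black-tree linkage of each entry. Example invocations (the map address and range are placeholders):

    (lldb) showmapvme 0xffffff8011223344 -A 0x100000000 -B 0x140000000 -S -P
    (lldb) showmapvme 0xffffff8011223344 -R    # walk entries in reverse address order
    (lldb) showmapvme 0xffffff8011223344 -T    # also print rbe_left/rbe_right/rbe_parent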
+@lldb_command("showvmobject", "A:B:PRST")
+def ShowVMObject(cmd_args=None, cmd_options={}):
+    """Routine to print out a VM object and its shadow chain
+        usage: showvmobject <vm_object> [-S] [-P]
+        -S: show VM object shadow chain
+        -P: show pager info (mapped file, compressed pages, ...)
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        print "Invalid argument.", ShowMapVME.__doc__
+        return
+    show_pager_info = False
+    show_all_shadows = False
+    if "-P" in cmd_options:
+        show_pager_info = True
+    if "-S" in cmd_options:
+        show_all_shadows = True
+    object = kern.GetValueFromAddress(cmd_args[0], 'vm_object_t')
+    showvmobject(object, 0, 0, show_pager_info, show_all_shadows)
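[Editor's note] showvmobject shares the shadow-chain walker that showmapvme uses, factored out below as showvmobject()/showmaphdrvme(). Example invocation (the object address is a placeholder):

    (lldb) showvmobject 0xffffff801fe0d000 -S -P   # dump the object and its whole shadow chain with pager info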
 
-def showmapvme(map, show_pager_info, show_all_shadows):
+def showvmobject(object, offset=0, size=0, show_pager_info=False, show_all_shadows=False):
     page_size = kern.globals.page_size
     vnode_pager_ops = kern.globals.vnode_pager_ops
     vnode_pager_ops_addr = unsigned(addressof(vnode_pager_ops))
+    depth = 0
+    if size == 0 and object != 0 and object.internal:
+        size = object.vo_un1.vou_size
+    while object != 0:
+        depth += 1
+        if show_all_shadows == False and depth != 1 and object.shadow != 0:
+            offset += unsigned(object.vo_un2.vou_shadow_offset)
+            object = object.shadow
+            continue
+        if object.copy_strategy == 0:
+            copy_strategy="N"
+        elif object.copy_strategy == 2:
+            copy_strategy="D"
+        elif object.copy_strategy == 4:
+            copy_strategy="S"
+
+        else:
+            copy_strategy=str(object.copy_strategy)
+        if object.internal:
+            internal = "internal"
+        else:
+            internal = "external"
+        purgeable = "NVED"[int(object.purgable)]
+        pager_string = ""
+        if object.phys_contiguous:
+            pager_string = pager_string + "phys_contig {:#018x}:{:#018x} ".format(unsigned(object.vo_un2.vou_shadow_offset), unsigned(object.vo_un1.vou_size))
+        pager = object.pager
+        if show_pager_info and pager != 0:
+            if object.internal:
+                pager_string = pager_string + "-> compressed:{:d}".format(GetCompressedPagesForObject(object))
+            elif unsigned(pager.mo_pager_ops) == vnode_pager_ops_addr:
+                vnode_pager = Cast(pager,'vnode_pager *')
+                pager_string = pager_string + "-> " + GetVnodePath(vnode_pager.vnode_handle)
+            else:
+                pager_string = pager_string + "-> {:s}:{: <#018x}".format(pager.mo_pager_ops.memory_object_pager_name, pager)
+        print "{:>18d} {:#018x}:{:#018x} {: <#018x} ref:{:<6d} ts:{:1d} strat:{:1s} purg:{:1s} {:s} wtag:{:d} ({:d} {:d} {:d}) {:s}".format(depth,offset,offset+size,object,object.ref_count,object.true_share,copy_strategy,purgeable,internal,object.wire_tag,unsigned(object.vo_un1.vou_size)/page_size,object.resident_page_count,object.wired_page_count,pager_string)
+#       print "        #{:<5d} obj {: <#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} size:{:<10d} wired:{:<10d} resident:{:<10d} reusable:{:<10d}".format(depth,object,object.ref_count,object.true_share,copy_strategy,internal,object.vo_un1.vou_size/page_size,object.wired_page_count,object.resident_page_count,object.reusable_page_count)
+        offset += unsigned(object.vo_un2.vou_shadow_offset)
+        object = object.shadow
+
+def showmapvme(map, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order=False, show_rb_tree=False):
     rsize = 0
     if map.pmap != 0:
         rsize = int(map.pmap.stats.resident_count)
     print "{:<18s} {:<18s} {:<18s} {:>10s} {:>18s} {:>18s}:{:<18s}".format("vm_map","pmap","size","#ents","rsize","start","end")
-    print "{:#018x} {:#018x} {:#018x} {:>10d} {:>18d} {:#018x}:{:#018x}".format(map,map.pmap,unsigned(map.size),map.hdr.nentries,rsize,map.hdr.links.start,map.hdr.links.end)
-    vme_list_head = map.hdr.links
+    print "{: <#018x} {: <#018x} {:#018x} {:>10d} {:>18d} {:#018x}:{:#018x}".format(map,map.pmap,unsigned(map.size),map.hdr.nentries,rsize,map.hdr.links.start,map.hdr.links.end)
+    showmaphdrvme(map.hdr, map.pmap, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree)
+
+def showmapcopyvme(mapcopy, start_vaddr=0, end_vaddr=0, show_pager_info=True, show_all_shadows=True, reverse_order=False, show_rb_tree=False):
+    print "{:<18s} {:<18s} {:<18s} {:>10s} {:>18s} {:>18s}:{:<18s}".format("vm_map_copy","pmap","size","#ents","rsize","start","end")
+    print "{: <#018x} {:#018x} {:#018x} {:>10d} {:>18d} {:#018x}:{:#018x}".format(mapcopy,0,0,mapcopy.c_u.hdr.nentries,0,mapcopy.c_u.hdr.links.start,mapcopy.c_u.hdr.links.end)
+    showmaphdrvme(mapcopy.c_u.hdr, 0, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree)
+
+def showmaphdrvme(maphdr, pmap, start_vaddr, end_vaddr, show_pager_info, show_all_shadows, reverse_order, show_rb_tree):
+    page_size = kern.globals.page_size
+    vnode_pager_ops = kern.globals.vnode_pager_ops
+    vnode_pager_ops_addr = unsigned(addressof(vnode_pager_ops))
+    if hasattr(kern.globals, 'compressor_object'):
+        compressor_object = kern.globals.compressor_object
+    else:
+        compressor_object = -1
+    vme_list_head = maphdr.links
     vme_ptr_type = GetType('vm_map_entry *')
-    print "{:<18s} {:>18s}:{:<18s} {:>10s} {:<8s} {:<10s} {:<18s} {:<18s}".format("entry","start","end","#pgs","tag.kmod","prot&flags","object","offset")
-    last_end = unsigned(map.hdr.links.start)
-    for vme in IterateQueue(vme_list_head, vme_ptr_type, "links"):
+    print "{:<18s} {:>18s}:{:<18s} {:>10s} {:<8s} {:<16s} {:<18s} {:<18s}".format("entry","start","end","#pgs","tag.kmod","prot&flags","object","offset")
+    last_end = unsigned(maphdr.links.start)
+    skipped_entries = 0
+    for vme in IterateQueue(vme_list_head, vme_ptr_type, "links", reverse_order):
+        if start_vaddr != 0 and end_vaddr != 0:
+            if unsigned(vme.links.start) > end_vaddr:
+                break
+            if unsigned(vme.links.end) <= start_vaddr:
+                last_end = unsigned(vme.links.end)
+                skipped_entries = skipped_entries + 1
+                continue
+            if skipped_entries != 0:
+                print "... skipped {:d} entries ...".format(skipped_entries)
+                skipped_entries = 0
         if unsigned(vme.links.start) != last_end:
             print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,vme.links.start,(unsigned(vme.links.start) - last_end)/page_size)
         last_end = unsigned(vme.links.end)
         size = unsigned(vme.links.end) - unsigned(vme.links.start)
         object = vme.vme_object.vmo_object
         if object == 0:
-            object_str = "{:<#018x}".format(object)
+            object_str = "{: <#018x}".format(object)
         elif vme.is_sub_map:
             if object == kern.globals.bufferhdr_map:
                 object_str = "BUFFERHDR_MAP"
@@ -2717,70 +2939,73 @@ def showmapvme(map, show_pager_info, show_all_shadows):
             elif hasattr(kern.globals, 'vector_upl_submap') and object == kern.globals.vector_upl_submap:
                 object_str = "VECTOR_UPL_SUBMAP"
             else:
-                object_str = "submap:{:<#018x}".format(object)
+                object_str = "submap:{: <#018x}".format(object)
         else:
             if object == kern.globals.kernel_object:
                 object_str = "KERNEL_OBJECT"
             elif object == kern.globals.vm_submap_object:
                 object_str = "VM_SUBMAP_OBJECT"
-            elif object == kern.globals.compressor_object:
+            elif object == compressor_object:
                 object_str = "COMPRESSOR_OBJECT"
             else:
-                object_str = "{:<#018x}".format(object)
+                object_str = "{: <#018x}".format(object)
         offset = unsigned(vme.vme_offset) & ~0xFFF
         tag = unsigned(vme.vme_offset & 0xFFF)
+        protection = ""
+        if vme.protection & 0x1:
+            protection +="r"
+        else:
+            protection += "-"
+        if vme.protection & 0x2:
+            protection += "w"
+        else:
+            protection += "-"
+        if vme.protection & 0x4:
+            protection += "x"
+        else:
+            protection += "-"
+        max_protection = ""
+        if vme.max_protection & 0x1:
+            max_protection +="r"
+        else:
+            max_protection += "-"
+        if vme.max_protection & 0x2:
+            max_protection += "w"
+        else:
+            max_protection += "-"
+        if vme.max_protection & 0x4:
+            max_protection += "x"
+        else:
+            max_protection += "-"
         vme_flags = ""
         if vme.is_sub_map:
             vme_flags += "s"
         if vme.needs_copy:
             vme_flags += "n"
-        if vme.is_sub_map and vme.use_pmap:
+        if vme.use_pmap:
             vme_flags += "p"
+        if vme.wired_count:
+            vme_flags += "w"
+        if vme.used_for_jit:
+            vme_flags += "j"
         tagstr = ""
-        if map.pmap == kern.globals.kernel_pmap:
+        if pmap == kern.globals.kernel_pmap:
             xsite = Cast(kern.globals.vm_allocation_sites[tag],'OSKextAccount *')
             if xsite and xsite.site.flags & 0x0200:
                 tagstr = ".{:<3d}".format(xsite.loadTag)
-        print "{:#018x} {:#018x}:{:#018x} {:>10d} {:>3d}{:<4s}  {:1d}{:1d}{:<8s} {:<18s} {:<#18x}".format(vme,vme.links.start,vme.links.end,(unsigned(vme.links.end)-unsigned(vme.links.start))/page_size,tag,tagstr,vme.protection,vme.max_protection,vme_flags,object_str,offset)
+        rb_info = ""
+        if show_rb_tree:
+            rb_info = "l={: <#018x} r={: <#018x} p={: <#018x}".format(vme.store.entry.rbe_left, vme.store.entry.rbe_right, vme.store.entry.rbe_parent)
+        print "{: <#018x} {:#018x}:{:#018x} {:>10d} {:>3d}{:<4s}  {:3s}/{:3s}/{:<8s} {:<18s} {:<#18x} {:s}".format(vme,vme.links.start,vme.links.end,(unsigned(vme.links.end)-unsigned(vme.links.start))/page_size,tag,tagstr,protection,max_protection,vme_flags,object_str,offset, rb_info)
         if (show_pager_info or show_all_shadows) and vme.is_sub_map == 0 and vme.vme_object.vmo_object != 0:
             object = vme.vme_object.vmo_object
         else:
             object = 0
-        depth = 0
-        while object != 0:
-            depth += 1
-            if show_all_shadows == False and depth != 1 and object.shadow != 0:
-                offset += unsigned(object.vo_un2.vou_shadow_offset)
-                object = object.shadow
-                continue
-            if object.copy_strategy == 0:
-                copy_strategy="N"
-            elif object.copy_strategy == 2:
-                copy_strategy="D"
-            elif object.copy_strategy == 4:
-                copy_strategy="S"
-            else:
-                copy_strategy=str(object.copy_strategy)
-            if object.internal:
-                internal = "internal"
-            else:
-                internal = "external"
-            pager_string = ""
-            pager = object.pager
-            if show_pager_info and pager != 0:
-                if object.internal:
-                    pager_string = "-> compressed:{:d}".format(GetCompressedPagesForObject(object))
-                elif unsigned(pager.mo_pager_ops) == vnode_pager_ops_addr:
-                    vnode_pager = Cast(pager,'vnode_pager *')
-                    pager_string = "-> " + GetVnodePath(vnode_pager.vnode_handle)
-                else:
-                    pager_string = "-> {:s}:{:#018x}".format(pager.mo_pager_ops.memory_object_pager_name, pager.mo_pager_ops)
-            print "{:>18d} {:#018x}:{:#018x} {:#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} ({:d} {:d} {:d}) {:s}".format(depth,offset,offset+size,object,object.ref_count,object.true_share,copy_strategy,internal,unsigned(object.vo_un1.vou_size)/page_size,object.resident_page_count,object.wired_page_count,pager_string)
-#            print "        #{:<5d} obj {:#018x} ref:{:<6d} ts:{:1d} strat:{:1s} {:s} size:{:<10d} wired:{:<10d} resident:{:<10d} reusable:{:<10d}".format(depth,object,object.ref_count,object.true_share,copy_strategy,internal,object.vo_un1.vou_size/page_size,object.wired_page_count,object.resident_page_count,object.reusable_page_count)
-            offset += unsigned(object.vo_un2.vou_shadow_offset)
-            object = object.shadow
-    if unsigned(map.hdr.links.end) > last_end:
-        print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,map.hdr.links.end,(unsigned(map.hdr.links.end) - last_end)/page_size)
+        showvmobject(object, offset, size, show_pager_info, show_all_shadows)
+    if start_vaddr != 0 or end_vaddr != 0:
+        print "..."
+    elif unsigned(maphdr.links.end) > last_end:
+        print "{:18s} {:#018x}:{:#018x} {:>10d}".format("------------------",last_end,maphdr.links.end,(unsigned(maphdr.links.end) - last_end)/page_size)
     return None
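[Editor's note] showmaphdrvme assembles the cur/max protection columns bit by bit from the read/write/execute bits (0x1/0x2/0x4) tested in the hunk above. A compact standalone equivalent of that formatting:

    def prot_str(prot):
        # "rwx" string for a vm_prot_t-style bitmask: read=0x1, write=0x2, execute=0x4
        return ("r" if prot & 0x1 else "-") + \
               ("w" if prot & 0x2 else "-") + \
               ("x" if prot & 0x4 else "-")

    print(prot_str(0x5) + "/" + prot_str(0x7))   # r-x/rwx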
 
 def CountMapTags(map, tagcounts, slow):
@@ -2802,11 +3027,11 @@ def CountMapTags(map, tagcounts, slow):
                     page = _vm_page_unpack_ptr(page_list)
                     while (page != 0):
                         vmpage = kern.GetValueFromAddress(page, 'vm_page_t')
-                        if (addr == unsigned(vmpage.offset)) and (object == vm_object_t(_vm_page_unpack_ptr(vmpage.vm_page_object))):
-                            if (not vmpage.local) and (vmpage.wire_count > 0):
+                        if (addr == unsigned(vmpage.vmp_offset)) and (object == vm_object_t(_vm_page_unpack_ptr(vmpage.vmp_object))):
+                            if (not vmpage.vmp_local) and (vmpage.vmp_wire_count > 0):
                                 count += 1
                             break
-                        page = _vm_page_unpack_ptr(vmpage.next_m)
+                        page = _vm_page_unpack_ptr(vmpage.vmp_next_m)
                     addr += page_size
             tagcounts[tag] += count
         elif vme.is_sub_map:
@@ -2817,21 +3042,6 @@ def CountWiredObject(object, tagcounts):
     tagcounts[unsigned(object.wire_tag)] += object.wired_page_count
     return None
 
-def CountWiredPurgeableGroup(qhead, tagcounts):
-    for object in IterateQueue(qhead, 'struct vm_object *', 'objq'):
-        CountWiredObject(object, tagcounts)
-    return None
-
-def CountWiredPurgeableQueue(qhead, tagcounts):
-    CountWiredPurgeableGroup(qhead.objq[0], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[1], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[2], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[3], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[4], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[5], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[6], tagcounts)
-    CountWiredPurgeableGroup(qhead.objq[7], tagcounts)
-
 def GetKmodIDName(kmod_id):
     kmod_val = kern.globals.kmod
     for kmod in IterateLinkedList(kmod_val, 'next'):
@@ -2874,17 +3084,22 @@ FixedTags = {
 def GetVMKernName(tag):
     return FixedTags[tag]
 
-@lldb_command("showvmtags", "S")
+@lldb_command("showvmtags", "AS")
 def showvmtags(cmd_args=None, cmd_options={}):
     """Routine to print out info about kernel wired page allocations
         usage: showvmtags
                iterates kernel map and vm objects totaling allocations by tag.
         usage: showvmtags -S
                also iterates kernel object pages individually - slow.
+        usage: showvmtags -A
+               show all tags, even tags that have no wired count
     """
     slow = False
     if "-S" in cmd_options:
         slow = True
+    all_tags = False
+    if "-A" in cmd_options:
+        all_tags = True
     page_size = unsigned(kern.globals.page_size)
     tagcounts = []
     tagpeaks = []
@@ -2901,25 +3116,16 @@ def showvmtags(cmd_args=None, cmd_options={}):
                 tagpeaks[unsigned(tag)] = unsigned(site.peak)
     else:
         queue_head = kern.globals.vm_objects_wired
-        for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'):
+        for object in IterateQueue(queue_head, 'struct vm_object *', 'wired_objq'):
             if object != kern.globals.kernel_object:
                 CountWiredObject(object, tagcounts)
 
-        queue_head = kern.globals.purgeable_nonvolatile_queue
-        for object in IterateQueue(queue_head, 'struct vm_object *', 'objq'):
-            CountWiredObject(object, tagcounts)
-
-        purgeable_queues = kern.globals.purgeable_queues
-        CountWiredPurgeableQueue(purgeable_queues[0], tagcounts)
-        CountWiredPurgeableQueue(purgeable_queues[1], tagcounts)
-        CountWiredPurgeableQueue(purgeable_queues[2], tagcounts)
-
         CountMapTags(kern.globals.kernel_map, tagcounts, slow)
 
     total = 0
     print " {:<7s}  {:>7s}   {:>7s}  {:<50s}".format("tag.kmod","peak","size","name")
     for tag in range(256):
-        if tagcounts[tag]:
+        if all_tags or tagcounts[tag]:
             total += tagcounts[tag]
             tagstr = ""
             sitestr = ""
@@ -3023,8 +3229,8 @@ def VMPageLookup(cmd_args=None):
     page = _vm_page_unpack_ptr(page_list)
     while (page != 0) :
         pg_t = kern.GetValueFromAddress(page, 'vm_page_t')
-        print format_string.format(page, pg_t.offset, _vm_page_unpack_ptr(pg_t.vm_page_object))
-        page = _vm_page_unpack_ptr(pg_t.next_m)
+        print format_string.format(page, pg_t.vmp_offset, _vm_page_unpack_ptr(pg_t.vmp_object))
+        page = _vm_page_unpack_ptr(pg_t.vmp_next_m)
 
 
 
@@ -3044,7 +3250,7 @@ def VmPageGetPhysPage(cmd_args=None):
 
 def _vm_page_get_phys_page(page):
     if kern.arch == 'x86_64':
-        return page.phys_page
+        return page.vmp_phys_page
 
     if page == 0 :
         return 0
@@ -3181,7 +3387,7 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
     page_found = False
     pages_seen = set()
 
-    for vmp in IterateQueue(obj.memq, "vm_page_t", "listq", walk_backwards, unpack_ptr_fn=_vm_page_unpack_ptr):
+    for vmp in IterateQueue(obj.memq, "vm_page_t", "vmp_listq", walk_backwards, unpack_ptr_fn=_vm_page_unpack_ptr):
         page_count += 1
         out_string = ""
         if (page != 0 and not(page_found) and vmp == page):
@@ -3192,41 +3398,45 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
              if (page_count % 1000) == 0:
                 print "traversed %d pages ...\n" % (page_count)
         else:
-                out_string += format_string.format(page_count, res_page_count, vmp, vmp.offset, _vm_page_unpack_ptr(vmp.listq.next), _vm_page_get_phys_page(vmp), vmp.wire_count)
-                out_string += first_bitfield_format_string.format(vmp.vm_page_q_state, vmp.vm_page_in_background, vmp.vm_page_on_backgroundq, vmp.gobbled, vmp.laundry, vmp.no_cache,
-                                                                   vmp.private, vmp.reference)
+                out_string += format_string.format(page_count, res_page_count, vmp, vmp.vmp_offset, _vm_page_unpack_ptr(vmp.listq.next), _vm_page_get_phys_page(vmp), vmp.vmp_wire_count)
+                out_string += first_bitfield_format_string.format(vmp.vmp_q_state, vmp.vmp_in_background, vmp.vmp_on_backgroundq, vmp.vmp_gobbled, vmp.vmp_laundry, vmp.vmp_no_cache,
+                                                                   vmp.vmp_private, vmp.vmp_reference)
 
-                out_string += second_bitfield_format_string.format(vmp.busy, vmp.wanted, vmp.tabled, vmp.hashed, vmp.fictitious, vmp.clustered,
-                                                                    vmp.pmapped, vmp.xpmapped, vmp.wpmapped, vmp.free_when_done, vmp.absent,
-                                                                    vmp.error, vmp.dirty, vmp.cleaning, vmp.precious, vmp.overwriting,
-                                                                    vmp.restart, vmp.unusual, 0, 0,
-                                                                    vmp.cs_validated, vmp.cs_tainted, vmp.cs_nx, vmp.reusable, vmp.lopage, vmp.slid,
-                                                                    vmp.written_by_kernel)
+                if hasattr(vmp,'slid'):
+                    vmp_slid = vmp.slid
+                else:
+                    vmp_slid = 0
+                out_string += second_bitfield_format_string.format(vmp.vmp_busy, vmp.vmp_wanted, vmp.vmp_tabled, vmp.vmp_hashed, vmp.vmp_fictitious, vmp.vmp_clustered,
+                                                                    vmp.vmp_pmapped, vmp.vmp_xpmapped, vmp.vmp_wpmapped, vmp.vmp_free_when_done, vmp.vmp_absent,
+                                                                    vmp.vmp_error, vmp.vmp_dirty, vmp.vmp_cleaning, vmp.vmp_precious, vmp.vmp_overwriting,
+                                                                    vmp.vmp_restart, vmp.vmp_unusual, 0, 0,
+                                                                    vmp.vmp_cs_validated, vmp.vmp_cs_tainted, vmp.vmp_cs_nx, vmp.vmp_reusable, vmp.vmp_lopage, vmp_slid,
+                                                                    vmp.vmp_written_by_kernel)
 
         if (vmp in pages_seen):
             print out_string + "cycle detected! we've seen vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " twice. stopping...\n"
             return
 
-        if (_vm_page_unpack_ptr(vmp.vm_page_object) != unsigned(obj)):
-            print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) +  " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vm_page_object)))
+        if (_vm_page_unpack_ptr(vmp.vmp_object) != unsigned(obj)):
+            print out_string + " vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) +  " points to different vm_object_t: " + "{0: <#020x}".format(unsigned(_vm_page_unpack_ptr(vmp.vmp_object)))
             return
 
-        if (vmp.vm_page_q_state == VM_PAGE_IS_WIRED) and (vmp.wire_count == 0):
+        if (vmp.vmp_q_state == VM_PAGE_IS_WIRED) and (vmp.vmp_wire_count == 0):
             print out_string + " page in wired state with wire_count of 0\n"
             print "vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + "\n"
             print "stopping...\n"
             return
 
-        if ((vmp.__unused_pageq_bits != 0) or (vmp.__unused_object_bits != 0)):
-            print out_string + " unused bits not zero for vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " unused__pageq_bits: %d unused_object_bits : %d\n" % (vmp.__unused_pageq_bits,
-                                            vmp.__unused_object_bits)
+        if ((vmp.vmp_unused_page_bits != 0) or (vmp.vmp_unused_object_bits != 0)):
+            print out_string + " unused bits not zero for vm_page_t: " + "{0: <#020x}".format(unsigned(vmp)) + " unused__pageq_bits: %d unused_object_bits : %d\n" % (vmp.vmp_unused_page_bits,
+                                            vmp.vmp_unused_object_bits)
             print "stopping...\n"
             return
 
         pages_seen.add(vmp)
 
         if False:
-            hash_id = _calc_vm_page_hash(obj, vmp.offset)
+            hash_id = _calc_vm_page_hash(obj, vmp.vmp_offset)
             hash_page_list = kern.globals.vm_page_buckets[hash_id].page_list
             hash_page = _vm_page_unpack_ptr(hash_page_list)
             hash_page_t = 0
@@ -3235,11 +3445,11 @@ def VMObjectWalkPages(cmd_args=None, cmd_options={}):
                 hash_page_t = kern.GetValueFromAddress(hash_page, 'vm_page_t')
                 if hash_page_t == vmp:
                     break
-                hash_page = _vm_page_unpack_ptr(hash_page_t.next_m)
+                hash_page = _vm_page_unpack_ptr(hash_page_t.vmp_next_m)
 
             if (unsigned(vmp) != unsigned(hash_page_t)):
                 print out_string + "unable to find page: " + "{0: <#020x}".format(unsigned(vmp)) + " from object in kernel page bucket list\n"
-                print lldb_run_command("vm_page_info %s 0x%x" % (cmd_args[0], unsigned(vmp.offset)))
+                print lldb_run_command("vm_page_info %s 0x%x" % (cmd_args[0], unsigned(vmp.vmp_offset)))
                 return
 
         if (page_count >= limit and not(ignore_limit)):
@@ -3278,9 +3488,9 @@ def ShowAppleProtectPager(cmd_args=None):
         usage: show_apple_protect_pager <pager>
     """
     if cmd_args == None or len(cmd_args) < 1:
-        print "Invalid argument.", ShowMap.__doc__
+        print "Invalid argument.", ShowAppleProtectPager.__doc__
         return
-    pager = kern.GetValueFromAddress(cmd_ars[0], 'apple_protect_pager_t')
+    pager = kern.GetValueFromAddress(cmd_args[0], 'apple_protect_pager_t')
     show_apple_protect_pager(pager, 1, 1)
 
 def show_apple_protect_pager(pager, qcnt, idx):
@@ -3291,7 +3501,7 @@ def show_apple_protect_pager(pager, qcnt, idx):
         shadow = object.shadow
     vnode_pager = Cast(object.pager,'vnode_pager *')
     filename = GetVnodePath(vnode_pager.vnode_handle)
-    print "{:>3}/{:<3d} {:#018x} {:>5d} {:>5d} {:>6d} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{:#018x} <decrypt:{:#018x} end:{:#018x} ops:{:#018x} refs:{:<d}>\n\tvnode:{:#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename)
+    print "{:>3}/{:<3d} {: <#018x} {:>5d} {:>5d} {:>6d} {: <#018x} {: <#018x} {:#018x} {:#018x} {:#018x} {:#018x}\n\tcrypt_info:{: <#018x} <decrypt:{: <#018x} end:{:#018x} ops:{: <#018x} refs:{:<d}>\n\tvnode:{: <#018x} {:s}\n".format(idx, qcnt, pager, pager.ref_count, pager.is_ready, pager.is_mapped, pager.pager_control, pager.backing_object, pager.backing_offset, pager.crypto_backing_offset, pager.crypto_start, pager.crypto_end, pager.crypt_info, pager.crypt_info.page_decrypt, pager.crypt_info.crypt_end, pager.crypt_info.crypt_ops, pager.crypt_info.crypt_refcnt, vnode_pager.vnode_handle, filename)
 
 @lldb_command("show_console_ring")
 def ShowConsoleRingData(cmd_args=None):
@@ -3449,3 +3659,619 @@ def ShowVnodeDirtyBlocks(cmd_args=None):
     _ShowVnodeBlocks(True, cmd_args)
 
 # EndMacro: showvnodecleanblk/showvnodedirtyblk
+
+
+@lldb_command("vm_page_lookup_in_map")
+def VmPageLookupInMap(cmd_args=None):
+    """Lookup up a page at a virtual address in a VM map
+        usage: vm_page_lookup_in_map <map> <vaddr>
+    """
+    if cmd_args == None or len(cmd_args) < 2:
+        print "Invalid argument.", VmPageLookupInMap.__doc__
+        return
+    map = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t')
+    vaddr = kern.GetValueFromAddress(cmd_args[1], 'vm_map_offset_t')
+    print "vaddr {:#018x} in map {: <#018x}".format(vaddr, map)
+    vm_page_lookup_in_map(map, vaddr)
+
+def vm_page_lookup_in_map(map, vaddr):
+    vaddr = unsigned(vaddr)
+    vme_list_head = map.hdr.links
+    vme_ptr_type = GetType('vm_map_entry *')
+    for vme in IterateQueue(vme_list_head, vme_ptr_type, "links"):
+        if unsigned(vme.links.start) > vaddr:
+            break
+        if unsigned(vme.links.end) <= vaddr:
+            continue
+        offset_in_vme = vaddr - unsigned(vme.links.start)
+        print "  offset {:#018x} in map entry {: <#018x} [{:#018x}:{:#018x}] object {: <#018x} offset {:#018x}".format(offset_in_vme, vme, unsigned(vme.links.start), unsigned(vme.links.end), vme.vme_object.vmo_object, unsigned(vme.vme_offset) & ~0xFFF)
+        offset_in_object = offset_in_vme + (unsigned(vme.vme_offset) & ~0xFFF)
+        if vme.is_sub_map:
+            print "vaddr {:#018x} in map {: <#018x}".format(offset_in_object, vme.vme_object.vmo_submap)
+            vm_page_lookup_in_map(vme.vme_object.vmo_submap, offset_in_object)
+        else:
+            vm_page_lookup_in_object(vme.vme_object.vmo_object, offset_in_object)
+
+@lldb_command("vm_page_lookup_in_object")
+def VmPageLookupInObject(cmd_args=None):
+    """Lookup up a page at a given offset in a VM object
+        usage: vm_page_lookup_in_object <object> <offset>
+    """
+    if cmd_args == None or len(cmd_args) < 2:
+        print "Invalid argument.", VmPageLookupInObject.__doc__
+        return
+    object = kern.GetValueFromAddress(cmd_args[0], 'vm_object_t')
+    offset = kern.GetValueFromAddress(cmd_args[1], 'vm_object_offset_t')
+    print "offset {:#018x} in object {: <#018x}".format(offset, object)
+    vm_page_lookup_in_object(object, offset)
+
+def vm_page_lookup_in_object(object, offset):
+    offset = unsigned(offset)
+    page_size = kern.globals.page_size
+    trunc_offset = offset & ~(page_size - 1)
+    print "    offset {:#018x} in VM object {: <#018x}".format(offset, object)
+    hash_id = _calc_vm_page_hash(object, trunc_offset)
+    page_list = kern.globals.vm_page_buckets[hash_id].page_list
+    page = _vm_page_unpack_ptr(page_list)
+    while page != 0:
+        m = kern.GetValueFromAddress(page, 'vm_page_t')
+        m_object_val = _vm_page_unpack_ptr(m.vmp_object)
+        m_object = kern.GetValueFromAddress(m_object_val, 'vm_object_t')
+        if unsigned(m_object) != unsigned(object) or unsigned(m.vmp_offset) != unsigned(trunc_offset):
+            page = _vm_page_unpack_ptr(m.vmp_next_m)
+            continue
+        print "    resident page {: <#018x} phys {:#010x}".format(m, _vm_page_get_phys_page(m))
+        return
+    if object.pager and object.pager_ready:
+        offset_in_pager = trunc_offset + unsigned(object.paging_offset)
+        if not object.internal:
+            print "    offset {:#018x} in external '{:s}' {: <#018x}".format(offset_in_pager, object.pager.mo_pager_ops.memory_object_pager_name, object.pager)
+            return
+        pager = Cast(object.pager, 'compressor_pager *')
+        ret = vm_page_lookup_in_compressor_pager(pager, offset_in_pager)
+        if ret:
+            return
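+    # Not resident and not found in the pager: walk down the shadow chain,
+    # adjusting the offset by this object's shadow offset at each level.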
+    if object.shadow and not object.phys_contiguous:
+        offset_in_shadow = offset + unsigned(object.vo_un2.vou_shadow_offset)
+        vm_page_lookup_in_object(object.shadow, offset_in_shadow)
+        return
+    print "    page is absent and will be zero-filled on demand"
+    return
+
+@lldb_command("vm_page_lookup_in_compressor_pager")
+def VmPageLookupInCompressorPager(cmd_args=None):
+    """Lookup up a page at a given offset in a compressor pager
+        usage: vm_page_lookup_in_compressor_pager <pager> <offset>
+    """
+    if cmd_args == None or len(cmd_args) < 2:
+        print "Invalid argument.", VmPageLookupInCompressorPager.__doc__
+        return
+    pager = kern.GetValueFromAddress(cmd_args[0], 'compressor_pager_t')
+    offset = kern.GetValueFromAddress(cmd_args[1], 'memory_object_offset_t')
+    print "offset {:#018x} in compressor pager {: <#018x}".format(offset, pager)
+    vm_page_lookup_in_compressor_pager(pager, offset)
+
+def vm_page_lookup_in_compressor_pager(pager, offset):
+    offset = unsigned(offset)
+    page_size = unsigned(kern.globals.page_size)
+    page_num = unsigned(offset / page_size)
+    if page_num > pager.cpgr_num_slots:
+        print "      *** ERROR: vm_page_lookup_in_compressor_pager({: <#018x},{:#018x}): page_num {:#x} > num_slots {:#x}".format(pager, offset, page_num, pager.cpgr_num_slots)
+        return 0
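+    # The compressor pager keeps its slots in one of three layouts: a couple of
+    # slots embedded in the pager itself (eslots), a single direct array
+    # (dslots), or an array of chunk pointers (islots) once the slots span more
+    # than one 512-byte chunk.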
+    slots_per_chunk = 512 / sizeof ('compressor_slot_t')
+    num_chunks = unsigned((pager.cpgr_num_slots+slots_per_chunk-1) / slots_per_chunk)
+    if num_chunks > 1:
+        chunk_idx = unsigned(page_num / slots_per_chunk)
+        chunk = pager.cpgr_slots.cpgr_islots[chunk_idx]
+        slot_idx = unsigned(page_num % slots_per_chunk)
+        slot = GetObjectAtIndexFromArray(chunk, slot_idx)
+        slot_str = "islots[{:d}][{:d}]".format(chunk_idx, slot_idx)
+    elif pager.cpgr_num_slots > 2:
+        slot_idx = page_num
+        slot = GetObjectAtIndexFromArray(pager.cpgr_slots.cpgr_dslots, slot_idx)
+        slot_str = "dslots[{:d}]".format(slot_idx)
+    else:
+        slot_idx = page_num
+        slot = GetObjectAtIndexFromArray(pager.cpgr_slots.cpgr_eslots, slot_idx)
+        slot_str = "eslots[{:d}]".format(slot_idx)
+    print "      offset {:#018x} in compressor pager {: <#018x} {:s} slot {: <#018x}".format(offset, pager, slot_str, slot)
+    if slot == 0:
+        return 0
+    slot_value = dereference(slot)
+    print " value {:#010x}".format(slot_value)
+    vm_page_lookup_in_compressor(Cast(slot, 'c_slot_mapping_t'))
+    return 1
+
+@lldb_command("vm_page_lookup_in_compressor")
+def VmPageLookupInCompressor(cmd_args=None):
+    """Lookup up a page in a given compressor slot
+        usage: vm_page_lookup_in_compressor <slot>
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        print "Invalid argument.", VmPageLookupInCompressor.__doc__
+        return
+    slot = kern.GetValueFromAddress(cmd_args[0], 'compressor_slot_t *')
+    print "compressor slot {: <#018x}".format(slot)
+    vm_page_lookup_in_compressor(slot)
+
+C_SV_CSEG_ID = ((1 << 22) - 1)
+
+def vm_page_lookup_in_compressor(slot_ptr):
+    slot_ptr = Cast(slot_ptr, 'compressor_slot_t *')
+    if unsigned(slot_ptr) == 0:
+        return
+    slot_value = dereference(slot_ptr)
+    slot = Cast(slot_value, 'c_slot_mapping')
+    print slot
+    print "compressor slot {: <#018x} -> {:#010x} cseg {:d} cindx {:d}".format(unsigned(slot_ptr), unsigned(slot_value), slot.s_cseg, slot.s_cindx)
+    if slot.s_cseg == C_SV_CSEG_ID:
+        sv = kern.globals.c_segment_sv_hash_table
+        print "single value[{:#d}]: ref {:d} value {:#010x}".format(slot.s_cindx, sv[slot.s_cindx].c_sv_he_un.c_sv_he.c_sv_he_ref, sv[slot.s_cindx].c_sv_he_un.c_sv_he.c_sv_he_data)
+        return
+    if slot.s_cseg == 0 or unsigned(slot.s_cseg) > unsigned(kern.globals.c_segments_available):
+        print "*** ERROR: s_cseg {:d} is out of bounds (1 - {:d})".format(slot.s_cseg, unsigned(kern.globals.c_segments_available))
+        return
+    c_segments = kern.globals.c_segments
+    c_segments_elt = GetObjectAtIndexFromArray(c_segments, slot.s_cseg-1)
+    c_seg = c_segments_elt.c_seg
+    c_no_data = 0
+    if hasattr(c_seg, 'c_state'):
+        c_state = c_seg.c_state
+        if c_state == 0:
+            c_state_str = "C_IS_EMPTY"
+            c_no_data = 1
+        elif c_state == 1:
+            c_state_str = "C_IS_FREE"
+            c_no_data = 1
+        elif c_state == 2:
+            c_state_str = "C_IS_FILLING"
+        elif c_state == 3:
+            c_state_str = "C_ON_AGE_Q"
+        elif c_state == 4:
+            c_state_str = "C_ON_SWAPOUT_Q"
+        elif c_state == 5:
+            c_state_str = "C_ON_SWAPPEDOUT_Q"
+            c_no_data = 1
+        elif c_state == 6:
+            c_state_str = "C_ON_SWAPPEDOUTSPARSE_Q"
+            c_no_data = 1
+        elif c_state == 7:
+            c_state_str = "C_ON_SWAPPEDIN_Q"
+        elif c_state == 8:
+            c_state_str = "C_ON_MAJORCOMPACT_Q"
+        elif c_state == 9:
+            c_state_str = "C_ON_BAD_Q"
+            c_no_data = 1
+        else:
+            c_state_str = "<unknown>"
+    else:
+        c_state = -1
+        c_state_str = "<no c_state field>"
+    print "c_segments[{:d}] {: <#018x} c_seg {: <#018x} c_state {:#x}={:s}".format(slot.s_cseg-1, c_segments_elt, c_seg, c_state, c_state_str)
+    c_indx = unsigned(slot.s_cindx)
+    if hasattr(c_seg, 'c_slot_var_array'):
+        c_seg_fixed_array_len = kern.globals.c_seg_fixed_array_len
+        if c_indx < c_seg_fixed_array_len:
+            cs = c_seg.c_slot_fixed_array[c_indx]
+        else:
+            cs = GetObjectAtIndexFromArray(c_seg.c_slot_var_array, c_indx - c_seg_fixed_array_len)
+    else:
+        C_SEG_SLOT_ARRAY_SIZE = 64
+        C_SEG_SLOT_ARRAY_MASK = C_SEG_SLOT_ARRAY_SIZE - 1
+        cs = GetObjectAtIndexFromArray(c_seg.c_slots[c_indx / C_SEG_SLOT_ARRAY_SIZE], c_indx & C_SEG_SLOT_ARRAY_MASK)
+    print cs
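+    # c_packed_ptr is the back-pointer to the slot mapping, stored as a
+    # 4-byte-aligned offset from the minimum kernel/kext address; unpack it by
+    # shifting left by 2 and re-adding that base.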
+    c_slot_unpacked_ptr = (unsigned(cs.c_packed_ptr) << 2) + vm_min_kernel_and_kext_address()
+    print "c_slot {: <#018x} c_offset {:#x} c_size {:#x} c_packed_ptr {:#x} (unpacked: {: <#018x})".format(cs, cs.c_offset, cs.c_size, cs.c_packed_ptr, unsigned(c_slot_unpacked_ptr))
+    if unsigned(slot_ptr) != unsigned(c_slot_unpacked_ptr):
+        print "*** ERROR: compressor slot {: <#018x} points back to {: <#018x} instead of itself".format(slot_ptr, c_slot_unpacked_ptr)
+    if c_no_data == 0:
+        c_data = c_seg.c_store.c_buffer + (4 * cs.c_offset)
+        c_size = cs.c_size
+        cmd = "memory read {: <#018x} {: <#018x} --force".format(c_data, c_data + c_size)
+        print cmd
+        print lldb_run_command(cmd)
+    else:
+        print "<no compressed data>"
+
+def vm_min_kernel_and_kext_address(cmd_args=None):
+    if hasattr(kern.globals, 'vm_min_kernel_and_kext_address'):
+        return unsigned(kern.globals.vm_min_kernel_and_kext_address)
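+    # Fall back to fixed per-architecture values when the kernel does not
+    # export the vm_min_kernel_and_kext_address symbol.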
+    elif kern.arch == 'x86_64':
+        return unsigned(0xffffff7f80000000)
+    elif kern.arch == 'arm64':
+        return unsigned(0xffffff8000000000)
+    elif kern.arch == 'arm':
+        return unsigned(0x80000000)
+    else:
+        print "vm_min_kernel_and_kext_address(): unknown arch '{:s}'".format(kern.arch)
+        return unsigned(0)
+
+def print_hex_data(data, begin_offset=0, desc=""):
+    """ print on stdout "hexdump -C < data" like output
+        params:
+            data - bytearray or array of int where each int < 255
+            begin_offset - int offset that should be printed in left column
+            desc - str optional description to print on the first line to describe data
+    """
+    if desc:
+        print "{}:".format(desc)
+    index = 0
+    total_len = len(data)
+    hex_buf = ""
+    char_buf = ""
+    while index < total_len:
+        hex_buf += " {:02x}".format(data[index])
+        if data[index] < 0x20 or data[index] > 0x7e:
+            char_buf += "."
+        else:
+            char_buf += "{:c}".format(data[index])
+        index += 1
+        if index and index % 8 == 0:
+            hex_buf += " "
+        if index > 1 and (index % 16) == 0:
+            print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf)
+            hex_buf = ""
+            char_buf = ""
+    print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf)
+    return
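+# Example (illustrative only):
+#   print_hex_data(bytearray(b"hello world, hexdump me!"), 0x1000, "sample")
+# would print two "hexdump -C"-style lines whose offsets start at 0x00001000.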
+
+@lldb_command('vm_scan_all_pages')
+def VMScanAllPages(cmd_args=None):
+    """Scans the vm_pages[] array
+    """
+    vm_pages_count = kern.globals.vm_pages_count
+    vm_pages = kern.globals.vm_pages
+
+    free_count = 0
+    local_free_count = 0
+    active_count = 0
+    local_active_count = 0
+    inactive_count = 0
+    speculative_count = 0
+    throttled_count = 0
+    wired_count = 0
+    compressor_count = 0
+    pageable_internal_count = 0
+    pageable_external_count = 0
+    secluded_count = 0
+    secluded_free_count = 0
+    secluded_inuse_count = 0
+
+    i = 0
+    while i < vm_pages_count:
+
+        if i % 10000 == 0:
+            print "{:d}/{:d}...\n".format(i,vm_pages_count)
+
+        m = vm_pages[i]
+
+        internal = 0
+        external = 0
+        m_object_val = _vm_page_unpack_ptr(m.vmp_object)
+        m_object = 0
+        if m_object_val != 0:
+            m_object = kern.GetValueFromAddress(m_object_val, 'vm_object_t')
+
+        if m_object:
+            if m_object.internal:
+                internal = 1
+            else:
+                external = 1
+
+        if m.vmp_wire_count != 0 and m.vmp_local == 0:
+            wired_count = wired_count + 1
+            pageable = 0
+        elif m.vmp_throttled:
+            throttled_count = throttled_count + 1
+            pageable = 0
+        elif m.vmp_active:
+            active_count = active_count + 1
+            pageable = 1
+        elif m.vmp_local:
+            local_active_count = local_active_count + 1
+            pageable = 0
+        elif m.vmp_inactive:
+            inactive_count = inactive_count + 1
+            pageable = 1
+        elif m.vmp_speculative:
+            speculative_count = speculative_count + 1
+            pageable = 0
+        elif m.vmp_free:
+            free_count = free_count + 1
+            pageable = 0
+        elif m.vmp_secluded:
+            secluded_count = secluded_count + 1
+            if m_object == 0:
+                secluded_free_count = secluded_free_count + 1
+            else:
+                secluded_inuse_count = secluded_inuse_count + 1
+            pageable = 0
+        elif m_object == 0 and m.vmp_busy:
+            local_free_count = local_free_count + 1
+            pageable = 0
+        elif m.vmp_compressor:
+            compressor_count = compressor_count + 1
+            pageable = 0
+        else:
+            print "weird page vm_pages[{:d}]?\n".format(i)
+            pageable = 0
+
+        if pageable:
+            if internal:
+                pageable_internal_count = pageable_internal_count + 1
+            else:
+                pageable_external_count = pageable_external_count + 1
+        i = i + 1
+
+    print "vm_pages_count = {:d}\n".format(vm_pages_count)
+
+    print "wired_count = {:d}\n".format(wired_count)
+    print "throttled_count = {:d}\n".format(throttled_count)
+    print "active_count = {:d}\n".format(active_count)
+    print "local_active_count = {:d}\n".format(local_active_count)
+    print "inactive_count = {:d}\n".format(inactive_count)
+    print "speculative_count = {:d}\n".format(speculative_count)
+    print "free_count = {:d}\n".format(free_count)
+    print "local_free_count = {:d}\n".format(local_free_count)
+    print "compressor_count = {:d}\n".format(compressor_count)
+
+    print "pageable_internal_count = {:d}\n".format(pageable_internal_count)
+    print "pageable_external_count = {:d}\n".format(pageable_external_count)
+    print "secluded_count = {:d}\n".format(secluded_count)
+    print "secluded_free_count = {:d}\n".format(secluded_free_count)
+    print "secluded_inuse_count = {:d}\n".format(secluded_inuse_count)
+
+
+@lldb_command('show_all_vm_named_entries')
+def ShowAllVMNamedEntries(cmd_args=None):
+    """ Routine to print a summary listing of all the VM named entries
+    """
+    queue_len = kern.globals.vm_named_entry_count
+    queue_head = kern.globals.vm_named_entry_list
+
+    print 'vm_named_entry_list:{: <#018x}  vm_named_entry_count:{:d}\n'.format(kern.GetLoadAddressForSymbol('vm_named_entry_list'),queue_len)
+
+    print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s}   {:>3s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tag","owner","pid","process")
+    idx = 0
+    for entry in IterateQueue(queue_head, 'struct vm_named_entry *', 'named_entry_list'):
+        idx += 1
+        showmemoryentry(entry, idx, queue_len)
+
+@lldb_command('show_vm_named_entry')
+def ShowVMNamedEntry(cmd_args=None):
+    """ Routine to print a VM named entry
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        print "Invalid argument.", ShowMapVMNamedEntry.__doc__
+        return
+    named_entry = kern.GetValueFromAddress(cmd_args[0], 'vm_named_entry_t')
+    showmemoryentry(named_entry, 0, 0)
+
+def showmemoryentry(entry, idx=0, queue_len=0):
+    """  Routine to print out a summary a VM memory entry
+        params: 
+            entry - core.value : a object of type 'struct vm_named_entry *'
+        returns:
+            None
+    """
+    show_pager_info = True
+    show_all_shadows = True
+
+    backing = ""
+    if entry.is_sub_map == 1:
+        backing += "SUBMAP"
+    if entry.is_copy == 1:
+        backing += "COPY"
+    if entry.is_sub_map == 0 and entry.is_copy == 0:
+        backing += "OBJECT"
+    prot=""
+    if entry.protection & 0x1:
+        prot += "r"
+    else:
+        prot += "-"
+    if entry.protection & 0x2:
+        prot += "w"
+    else:
+        prot += "-"
+    if entry.protection & 0x4:
+        prot += "x"
+    else:
+        prot += "-"
+    extra_str = ""
+    if hasattr(entry, 'named_entry_alias'):
+        extra_str += " alias={:d}".format(entry.named_entry_alias)
+    if hasattr(entry, 'named_entry_port'):
+        extra_str += " port={:#016x}".format(entry.named_entry_port)
+    print "{:>6d}/{:<6d} {: <#018x} ref={:d} prot={:d}/{:s} type={:s} backing={: <#018x} offset={:#016x} dataoffset={:#016x} size={:#016x}{:s}\n".format(idx,queue_len,entry,entry.ref_count,entry.protection,prot,backing,entry.backing.object,entry.offset,entry.data_offset,entry.size,extra_str)
+    if entry.is_sub_map == 1:
+        showmapvme(entry.backing.map, 0, 0, show_pager_info, show_all_shadows)
+    if entry.is_copy == 1:
+        showmapcopyvme(entry.backing.copy, 0, 0, 0, show_pager_info, show_all_shadows, 0)
+    if entry.is_sub_map == 0 and entry.is_copy == 0:
+        showvmobject(entry.backing.object, entry.offset, entry.size, show_pager_info, show_all_shadows)
+
+
+def IterateRBTreeEntry2(element, element_type, field_name1, field_name2):
+    """ iterate over a rbtree as defined with RB_HEAD in libkern/tree.h
+            element      - value : Value object for rbh_root
+            element_type - str   : Type of the link element
+            field_name   - str   : Name of the field in link element's structure
+        returns:
+            A generator does not return. It is used for iterating
+            value  : an object thats of type (element_type) head->sle_next. Always a pointer object
+    """
+    elt = element.__getattr__('rbh_root')
+    if type(element_type) == str:
+        element_type = gettype(element_type)
+    charp_type = gettype('char *');
+
+    # Walk to find min
+    parent = elt
+    while unsigned(elt) != 0:
+        parent = elt
+        elt = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type)
+    elt = parent
+
+    # Now elt is min
+    while unsigned(elt) != 0:
+        yield elt
+        # implementation cribbed from RB_NEXT in libkern/tree.h
+        right = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_right'), element_type)
+        if unsigned(right) != 0:
+            elt = right
+            left = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type)
+            while unsigned(left) != 0:
+                elt = left
+                left = cast(elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type)
+        else:
+
+            # avoid using GetValueFromAddress
+            addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1
+            parent = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr)))
+            parent = cast(parent, element_type)
+
+            if unsigned(parent) != 0:
+                left = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_left'), element_type)
+            if (unsigned(parent) != 0) and (unsigned(elt) == unsigned(left)):
+                elt = parent
+            else:
+                if unsigned(parent) != 0:
+                    right = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_right'), element_type)
+                while unsigned(parent) != 0 and (unsigned(elt) == unsigned(right)):
+                    elt = parent
+
+                    # avoid using GetValueFromAddress
+                    addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1
+                    parent = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr)))
+                    parent = cast(parent, element_type)
+
+                    right = cast(parent.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_right'), element_type)
+
+                # avoid using GetValueFromAddress
+                addr = elt.__getattr__(field_name1).__getattr__(field_name2).__getattr__('rbe_parent')&~1
+                elt = value(elt.GetSBValue().CreateValueFromExpression(None,'(void *)'+str(addr)))
+                elt = cast(elt, element_type)
+
+
+@lldb_command("showmaprb")
+def ShowMapRB(cmd_args=None):
+    """Routine to print out a VM map's RB tree
+        usage: showmaprb <vm_map>
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        print "Invalid argument.", ShowMapRB.__doc__
+        return
+    map_val = kern.GetValueFromAddress(cmd_args[0], 'vm_map_t')
+    print GetVMMapSummary.header
+    print GetVMMapSummary(map_val)
+    vme_rb_root = map_val.hdr.rb_head_store
+    vme_ptr_type = GetType('struct vm_map_entry *')
+    print GetVMEntrySummary.header
+    for vme in IterateRBTreeEntry2(vme_rb_root, 'struct vm_map_entry *', 'store', 'entry'):
+        print GetVMEntrySummary(vme)
+    return None
+
+@lldb_command('show_all_owned_objects', 'T')
+def ShowAllOwnedObjects(cmd_args=None, cmd_options={}):
+    """ Routine to print the list of VM objects owned by each task
+        -T: show only ledger-tagged objects
+    """
+    showonlytagged = False
+    if "-T" in cmd_options:
+        showonlytagged = True
+    for task in kern.tasks:
+        ShowTaskOwnedVmObjects(task, showonlytagged)
+
+@lldb_command('show_task_owned_objects', 'T')
+def ShowTaskOwnedObjects(cmd_args=None, cmd_options={}):
+    """ Routine to print the list of VM objects owned by the specified task
+        -T: show only ledger-tagged objects
+    """
+    showonlytagged = False
+    if "-T" in cmd_options:
+        showonlytagged = True
+    task = kern.GetValueFromAddress(cmd_args[0], 'task *')
+    ShowTaskOwnedVmObjects(task, showonlytagged)
+
+def ShowTaskOwnedVmObjects(task, showonlytagged=False):
+    """  Routine to print out a summary listing of all the entries in a vm_map
+        params:
+            task - core.value : a object of type 'task *'
+        returns:
+            None
+    """
+    taskobjq_total = lambda:None
+    taskobjq_total.objects = 0
+    taskobjq_total.vsize = 0
+    taskobjq_total.rsize = 0
+    taskobjq_total.wsize = 0
+    taskobjq_total.csize = 0
+    vmo_list_head = task.task_objq
+    vmo_ptr_type = GetType('vm_object *')
+    idx = 0
+    for vmo in IterateQueue(vmo_list_head, vmo_ptr_type, "task_objq"):
+        idx += 1
+        if not showonlytagged or vmo.vo_ledger_tag != 0:
+            if taskobjq_total.objects == 0:
+                print ' \n'
+                print GetTaskSummary.header + ' ' + GetProcSummary.header
+                print GetTaskSummary(task) + ' ' + GetProcSummary(Cast(task.bsd_info, 'proc *'))
+                print '{:>6s} {:<6s} {:18s} {:1s} {:>6s} {:>16s} {:>10s} {:>10s} {:>10s} {:>2s} {:18s} {:>6s} {:<20s}\n'.format("#","#","object","P","refcnt","size (pages)","resid","wired","compressed","tg","owner","pid","process")
+            ShowOwnedVmObject(vmo, idx, 0, taskobjq_total)
+    if taskobjq_total.objects != 0:
+        print "           total:{:<10d}  [ virtual:{:<10d}  resident:{:<10d}  wired:{:<10d}  compressed:{:<10d} ]\n".format(taskobjq_total.objects, taskobjq_total.vsize, taskobjq_total.rsize, taskobjq_total.wsize, taskobjq_total.csize)
+    return None
+
+def ShowOwnedVmObject(object, idx, queue_len, taskobjq_total):
+    """  Routine to print out a VM object owned by a task
+        params:
+            object - core.value : an object of type 'struct vm_object *'
+        returns:
+            None
+    """
+    page_size = kern.globals.page_size
+    if object.purgable == 0:
+        purgable = "N"
+    elif object.purgable == 1:
+        purgable = "V"
+    elif object.purgable == 2:
+        purgable = "E"
+    elif object.purgable == 3:
+        purgable = "D"
+    else:
+        purgable = "?"
+    if object.pager == 0:
+        compressed_count = 0
+    else:
+        compressor_pager = Cast(object.pager, 'compressor_pager *')
+        compressed_count = compressor_pager.cpgr_num_slots_occupied
+
+    print "{:>6d}/{:<6d} {: <#018x} {:1s} {:>6d} {:>16d} {:>10d} {:>10d} {:>10d} {:>2d} {: <#018x} {:>6d} {:<20s}\n".format(idx,queue_len,object,purgable,object.ref_count,object.vo_un1.vou_size/page_size,object.resident_page_count,object.wired_page_count,compressed_count, object.vo_ledger_tag, object.vo_un2.vou_owner,GetProcPIDForObjectOwner(object.vo_un2.vou_owner),GetProcNameForObjectOwner(object.vo_un2.vou_owner))
+
+    taskobjq_total.objects += 1
+    taskobjq_total.vsize += object.vo_un1.vou_size/page_size
+    taskobjq_total.rsize += object.resident_page_count
+    taskobjq_total.wsize += object.wired_page_count
+    taskobjq_total.csize += compressed_count
+
+def GetProcPIDForObjectOwner(owner):
+    """ same as GetProcPIDForTask() but deals with -1 for a disowned object
+    """
+    if unsigned(Cast(owner, 'int')) == unsigned(int(0xffffffff)):
+        return -1
+    return GetProcPIDForTask(owner)
+
+def GetProcNameForObjectOwner(owner):
+    """ same as GetProcNameForTask() but deals with -1 for a disowned object
+    """
+    if unsigned(Cast(owner, 'int')) == unsigned(int(0xffffffff)):
+        return "<disowned>"
+    return GetProcNameForTask(owner)
+
+def GetDescForNamedEntry(mem_entry):
+    out_str = "\n"
+    out_str += "\t\tmem_entry {:#08x} ref:{:d} offset:{:#08x} size:{:#08x} prot{:d} backing {:#08x}".format(mem_entry, mem_entry.ref_count, mem_entry.offset, mem_entry.size, mem_entry.protection, mem_entry.backing.object)
+    if mem_entry.is_sub_map:
+        out_str += " is_sub_map"
+    elif mem_entry.is_copy:
+        out_str += " is_copy"
+    else:
+        out_str += " is_object"
+    return out_str
index 88b1d7673adc1d8954122f35e03dffeed491a4ac..fd5382f1a4f76352c3d82d8d8004b4d87372f8d2 100755 (executable)
@@ -95,7 +95,7 @@ def GetCpuDataForCpuID(cpu_id):
     if kern.arch == 'x86_64':
         cpu_data = kern.globals.cpu_data_ptr[cpu_id]
         return cpu_data
-    elif kern.arch in ['arm', 'arm64'] :
+    elif kern.arch.startswith('arm'):
         data_entries_addr = kern.GetLoadAddressForSymbol('CpuDataEntries')
         data_entries = kern.GetValueFromAddress(data_entries_addr, 'cpu_data_entry_t *')
         data_entry = data_entries[cpu_id];
@@ -689,7 +689,7 @@ def DumpRawTraceFile(cmd_args=[], cmd_options={}):
         print "Trace buffer not enabled\n"
         return
 
-    if ((kern.arch == "x86_64") or (kern.arch == "arm64")) :
+    if ((kern.arch == "x86_64") or kern.arch.startswith("arm64")) :
         lp64 = True
     elif kern.arch == "arm" :
         lp64 = False
index 6f9bbb6ecc7b8abba724a5b2c8fca68a91947af6..40529c70c2104f3cd672940a9e4b73dbc294022c 100755 (executable)
@@ -920,6 +920,10 @@ def DecodeTTE(cmd_args=None):
     else:
         raise NotImplementedError("decode_tte does not support {0}".format(kern.arch))
 
+
+PVH_HIGH_FLAGS_ARM64 = (1 << 62) | (1 << 61) | (1 << 60) | (1 << 59)
+PVH_HIGH_FLAGS_ARM32 = (1 << 31)
+
 def PVWalkARM(pa):
     """ Walk a physical-to-virtual reverse mapping list maintained by the arm pmap
         pa: physical address (NOT page number).  Does not need to be page-aligned 
@@ -928,10 +932,19 @@ def PVWalkARM(pa):
     vm_last_phys = unsigned(kern.globals.vm_last_phys)
     if pa < vm_first_phys or pa >= vm_last_phys:
         raise ArgumentError("PA {:#x} is outside range of managed physical addresses: [{:#x}, {:#x})".format(pa, vm_first_phys, vm_last_phys))
-    page_size = kern.globals.arm_hardware_page_size
+    page_size = kern.globals.page_size
     pn = (pa - unsigned(kern.globals.vm_first_phys)) / page_size
     pvh = unsigned(kern.globals.pv_head_table[pn])
     pvh_type = pvh & 0x3
+    print "PVH raw value: ({:#x})".format(pvh)
+    if kern.arch.startswith('arm64'):
+        iommu_flag = 0x4
+        iommu_table_flag = 1 << 63
+        pvh = pvh | PVH_HIGH_FLAGS_ARM64
+    else:
+        iommu_flag = 0
+        iommu_table_flag = 0 
+        pvh = pvh | PVH_HIGH_FLAGS_ARM32
     if pvh_type == 0:
         print "PVH type: NULL"
         return
@@ -940,8 +953,16 @@ def PVWalkARM(pa):
         return
     elif pvh_type == 2:
         ptep = pvh & ~0x3
+        pte_str = ''
         print "PVH type: single PTE"
-        print "PTE {:#x}: {:#x}".format(ptep, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *')))
+        if ptep & iommu_flag:
+            ptep = ptep & ~iommu_flag
+            if ptep & iommu_table_flag:
+                pte_str = ' (IOMMU table), entry'
+            else:
+                pte_str = ' (IOMMU state), descriptor'
+                ptep = ptep | iommu_table_flag
+        print "PTE {:#x}{:s}: {:#x}".format(ptep, pte_str, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *')))
     elif pvh_type == 1:
         pvep = pvh & ~0x3
         print "PVH type: PTE list"
@@ -954,6 +975,13 @@ def PVWalkARM(pa):
             current_pvep = pvep
             pvep = unsigned(pve.pve_next) & ~0x1
             ptep = unsigned(pve.pve_ptep) & ~0x3
+            if ptep & iommu_flag:
+                ptep = ptep & ~iommu_flag
+                if ptep & iommu_table_flag:
+                    pve_str = ' (IOMMU table), entry'
+                else:
+                    pve_str = ' (IOMMU state), descriptor'
+                    ptep = ptep | iommu_table_flag
             print "PVE {:#x}, PTE {:#x}{:s}: {:#x}".format(current_pvep, ptep, pve_str, dereference(kern.GetValueFromAddress(ptep, 'pt_entry_t *')))
 
 @lldb_command('pv_walk')
@@ -967,15 +995,50 @@ def PVWalk(cmd_args=None):
         raise NotImplementedError("pv_walk does not support {0}".format(kern.arch))
     PVWalkARM(kern.GetValueFromAddress(cmd_args[0], 'unsigned long'))
 
+@lldb_command('kvtophys')
+def KVToPhys(cmd_args=None):
+    """ Translate a kernel virtual address to the corresponding physical address.
+        Assumes the virtual address falls within the kernel static region.
+        Syntax: (lldb) kvtophys <kernel virtual address>
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        raise ArgumentError("Too few arguments to kvtophys.")
+    if kern.arch.startswith('arm'):
+        print "{:#x}".format(KVToPhysARM(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long')))))
+    elif kern.arch == 'x86_64':
+        print "{:#x}".format(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long'))) - unsigned(kern.globals.physmap_base))
+
+@lldb_command('phystokv')
+def PhysToKV(cmd_args=None):
+    """ Translate a physical address to the corresponding static kernel virtual address.
+        Assumes the physical address corresponds to managed DRAM.
+        Syntax: (lldb) phystokv <physical address>
+    """
+    if cmd_args == None or len(cmd_args) < 1:
+        raise ArgumentError("Too few arguments to phystokv.")
+    print "{:#x}".format(kern.PhysToKernelVirt(long(unsigned(kern.GetValueFromAddress(cmd_args[0], 'unsigned long')))))
+
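+# KVToPhysARM() consults the ptov_table on arm64 (the static region can be
+# described by several va/pa runs) and falls back to the single
+# gVirtBase/gPhysBase offset when the address is not covered by any entry.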
+def KVToPhysARM(addr):
+    if kern.arch.startswith('arm64'):
+        ptov_table = kern.globals.ptov_table
+        for i in range(0, kern.globals.ptov_index):
+            if (addr >= long(unsigned(ptov_table[i].va))) and (addr < (long(unsigned(ptov_table[i].va)) + long(unsigned(ptov_table[i].len)))):
+                return (addr - long(unsigned(ptov_table[i].va)) + long(unsigned(ptov_table[i].pa)))
+    return (addr - unsigned(kern.globals.gVirtBase) + unsigned(kern.globals.gPhysBase))
+
 def ShowPTEARM(pte):
     """ Display vital information about an ARM page table entry
         pte: kernel virtual address of the PTE.  Should be L3 PTE.  May also work with L2 TTEs for certain devices.
     """
     page_size = kern.globals.arm_hardware_page_size
-    pn = (pte - unsigned(kern.globals.gVirtBase) + unsigned(kern.globals.gPhysBase) - unsigned(kern.globals.vm_first_phys)) / page_size
-    pvh = kern.globals.pv_head_table[pn]
+    pn = (KVToPhysARM(pte) - unsigned(kern.globals.vm_first_phys)) / page_size
+    pvh = unsigned(kern.globals.pv_head_table[pn])
+    if kern.arch.startswith('arm64'):
+        pvh = pvh | PVH_HIGH_FLAGS_ARM64
+    else:
+        pvh = pvh | PVH_HIGH_FLAGS_ARM32
     pvh_type = pvh & 0x3
-    if pvh_type != 0x3 and pvh_type != 0x0:
+    if pvh_type != 0x3:
         raise ValueError("PV head {:#x} does not correspond to a page-table descriptor".format(pvh))
     ptd = kern.GetValueFromAddress(pvh & ~0x3, 'pt_desc_t *')
     print "descriptor: {:#x}".format(ptd)
@@ -1137,7 +1200,7 @@ def ShowAllMappings(cmd_args=None):
     ScanPageTables(printMatchedMapping, targetPmap)
 
 def checkPVList(pmap, level, type, tte, paddr, granule):
-    """ Checks an ARM physical-to-virtual mapping list for consistency error.
+    """ Checks an ARM physical-to-virtual mapping list for consistency errors.
         pmap: owner of the translation table
         level: translation table level.  PV lists will only be checked for L2 (arm32) or L3 (arm64) tables.
         type: unused
@@ -1147,20 +1210,22 @@ def checkPVList(pmap, level, type, tte, paddr, granule):
     """
     vm_first_phys = unsigned(kern.globals.vm_first_phys)
     vm_last_phys = unsigned(kern.globals.vm_last_phys)
-    page_size = kern.globals.arm_hardware_page_size
+    page_size = kern.globals.page_size
     if kern.arch.startswith('arm64'):
         page_offset_mask = (page_size - 1)
         page_base_mask = ((1 << ARM64_VMADDR_BITS) - 1) & (~page_offset_mask)
         paddr = paddr & page_base_mask
         max_level = 3
+        pvh_set_bits = PVH_HIGH_FLAGS_ARM64
     elif kern.arch == 'arm':
         page_base_mask = 0xFFFFF000
         paddr = paddr & page_base_mask
         max_level = 2
+        pvh_set_bits = PVH_HIGH_FLAGS_ARM32
     if level < max_level or paddr < vm_first_phys or paddr >= vm_last_phys:
         return
     pn = (paddr - vm_first_phys) / page_size
-    pvh = unsigned(kern.globals.pv_head_table[pn])
+    pvh = unsigned(kern.globals.pv_head_table[pn]) | pvh_set_bits
     pvh_type = pvh & 0x3
     if pmap is not None:
         pmap_str = "pmap: {:#x}: ".format(pmap)
@@ -1207,7 +1272,7 @@ def PVCheck(cmd_args=None, cmd_options={}):
             -P        : Interpret <addr> as a physical address rather than a PTE
     """
     if cmd_args == None or len(cmd_args) < 1:
-        raise ArgumentError("Too few arguments to showallmappings.")
+        raise ArgumentError("Too few arguments to pv_check.")
     if kern.arch == 'arm':
         level = 2
     elif kern.arch.startswith('arm64'):
index b86e1a80d694a231ec059d263bf8577fdd4a09b4..f37169a3b59ae5bda7dfc2ec1c8c4304f13e4c93 100755 (executable)
@@ -179,13 +179,15 @@ def GetASTSummary(ast):
         D - AST_DTRACE
         I - AST_TELEMETRY_IO
         E - AST_KEVENT
+        R - AST_REBALANCE
+        N - AST_UNQUIESCE
     """
     out_string = ""
     state = int(ast)
     thread_state_chars = {0x0:'', 0x1:'P', 0x2:'Q', 0x4:'U', 0x8:'H', 0x10:'Y', 0x20:'A',
                           0x40:'L', 0x80:'B', 0x100:'K', 0x200:'M',
                           0x1000:'G', 0x2000:'T', 0x4000:'T', 0x8000:'T', 0x10000:'S',
-                          0x20000: 'D', 0x40000: 'I', 0x80000: 'E'}
+                          0x20000: 'D', 0x40000: 'I', 0x80000: 'E', 0x100000: 'R', 0x200000: 'N'}
     state_str = ''
     mask = 0x1
-    while mask <= 0x80000:
+    while mask <= 0x200000:
@@ -553,7 +555,7 @@ def ShowAllCoalitions(cmd_args=None):
 
 # Macro: showallthreadgroups
 
-@lldb_type_summary(['thread_group_t', 'thread_group *'])
+@lldb_type_summary(['struct thread_group *', 'thread_group *'])
 @header("{0: <20s} {1: <5s} {2: <16s} {3: <5s} {4: <8s} {5: <20s}".format("thread_group", "id", "name", "refc", "flags", "recommendation"))
 def GetThreadGroupSummary(tg):
     if unsigned(tg) == 0:
@@ -1500,7 +1502,7 @@ def GetProcessorSummary(processor):
             preemption_disable_str)
     return out_str   
 
-def GetLedgerEntrySummary(ledger_template, ledger, i):
+def GetLedgerEntrySummary(ledger_template, ledger, i, show_footprint_interval_max=False):
     """ Internal function to get internals of a ledger entry (*not* a ledger itself)
         params: ledger_template - value representing struct ledger_template_t for the task or thread
                 ledger - value representing struct ledger_entry *
@@ -1517,14 +1519,13 @@ def GetLedgerEntrySummary(ledger_template, ledger, i):
     out_str += "{: >32s} {:<2d}:".format(ledger_template.lt_entries[i].et_key, i)
     out_str += "{: >15d} ".format(unsigned(ledger.le_credit) - unsigned(ledger.le_debit))
     if (ledger.le_flags & lf_tracking_max):
-        out_str += "{:9d} {:5d} ".format(ledger._le.le_maxtracking.le_peaks[0].le_max, now - unsigned(ledger._le.le_maxtracking.le_peaks[0].le_time))
+        if (show_footprint_interval_max):
+            out_str += "{:12d} ".format(ledger._le._le_max.le_interval_max)
+        out_str += "{:14d} ".format(ledger._le._le_max.le_lifetime_max)
     else:
-        out_str += "        -     -"
-
-    if (ledger.le_flags & lf_tracking_max):
-        out_str += "{:12d} ".format(ledger._le.le_maxtracking.le_lifetime_max)
-    else:
-        out_str += "             -"
+        if (show_footprint_interval_max):
+            out_str += "           - "
+        out_str += "             - "
     out_str += "{:12d} {:12d} ".format(unsigned(ledger.le_credit), unsigned(ledger.le_debit))
     if (unsigned(ledger.le_limit) != ledger_limit_infinity):
         out_str += "{:12d} ".format(unsigned(ledger.le_limit))
@@ -1569,11 +1570,7 @@ def GetThreadLedgerSummary(thread_val):
             i = i + 1
     return out_str
 
-@header("{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >9s} {5: >6s} {6: >12s} {7: >11s} \
-    {8: >7s} {9: >13s}   {10: <15s} {11: <8s} {12: <9s} {13: <6s} {14: >6s}".format(
-    "task [thread]", "entry", "#", "balance", "peakA", "(age)", "lifemax", "credit",
-     "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags"))
-def GetTaskLedgers(task_val):
+def GetTaskLedgers(task_val, show_footprint_interval_max=False):
     """ Internal function to get summary of ledger entries from the task and its threads
         params: task_val - value representing struct task *
         return: str - formatted output information for ledger entries of the input task
@@ -1588,7 +1585,7 @@ def GetTaskLedgers(task_val):
     else:
         out_str += "Invalid process:\n"
     while i != task_ledgerp.l_template.lt_cnt:
-        out_str += GetLedgerEntrySummary(kern.globals.task_ledger_template, task_ledgerp.l_entries[i], i)
+        out_str += GetLedgerEntrySummary(kern.globals.task_ledger_template, task_ledgerp.l_entries[i], i, show_footprint_interval_max)
         i = i + 1
 
     # Now walk threads
@@ -1599,11 +1596,14 @@ def GetTaskLedgers(task_val):
 
 # Macro: showtaskledgers
 
-@lldb_command('showtaskledgers', 'F:') 
+@lldb_command('showtaskledgers', 'F:I') 
 def ShowTaskLedgers(cmd_args=None, cmd_options={}):
     """  Routine to print a summary  of ledger entries for the task and all of its threads
-         Usage: showtaskledgers <address of task>
-         or   : showtaskledgers -F <name of task>
+         Usage: showtaskledgers [ -I ] [ -F ] <task>
+         options:
+            -I: show footprint interval max (DEV/DEBUG only)
+            -F: specify task via name instead of address
     """
     if "-F" in cmd_options:
         task_list = FindTasksByName(cmd_options["-F"])
@@ -1614,24 +1614,34 @@ def ShowTaskLedgers(cmd_args=None, cmd_options={}):
     
     if not cmd_args:
         raise ArgumentError("No arguments passed.")
+    show_footprint_interval_max = False
+    if "-I" in cmd_options:
+        show_footprint_interval_max = True
     tval = kern.GetValueFromAddress(cmd_args[0], 'task *')
     if not tval:
         raise ArgumentError("unknown arguments: %r" %cmd_args)
-    print GetTaskLedgers.header
-    print GetTaskLedgers(tval)
+    if (show_footprint_interval_max):
+        print "{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >12s} {5: >14s} {6: >12s} {7: >12s} {8: >12s}   {9: <15s} {10: <8s} {11: <9s} {12: <6s} {13: >6s}".format(
+        "task [thread]", "entry", "#", "balance", "intrvl_max", "lifetime_max", "credit",
+        "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags")
+    else:
+        print "{0: <15s} {1: >16s} {2: <2s} {3: >15s} {4: >14s} {5: >12s} {6: >12s} {7: >12s}   {8: <15s} {9: <8s} {10: <9s} {11: <6s} {12: >6s}".format(
+        "task [thread]", "entry", "#", "balance", "lifetime_max", "credit",
+        "debit", "limit", "refill period", "lim pct", "warn pct", "over?", "flags")
+    print GetTaskLedgers(tval, show_footprint_interval_max)
 
 # EndMacro: showtaskledgers
 
 # Macro: showalltaskledgers
 
 @lldb_command('showalltaskledgers') 
-def ShowAllTaskLedgers(cmd_args=None):
+def ShowAllTaskLedgers(cmd_args=None, cmd_options={}):
     """  Routine to print a summary  of ledger entries for all tasks and respective threads
          Usage: showalltaskledgers
     """
     for t in kern.tasks:
         task_val = unsigned(t)
-        ShowTaskLedgers([task_val])
+        ShowTaskLedgers([task_val], cmd_options=cmd_options)
     
 # EndMacro: showalltaskledgers
 
index d60dd0e4e22da5f886366146b2b0fdfe7dde40cf..36a37c32884cc2f257c949926d29aa72e49e3c3c 100755 (executable)
@@ -147,18 +147,13 @@ def ShowCurremtAbsTime(cmd_args=None):
          Usage: showcurrentabstime
     """
     pset = addressof(kern.globals.pset0)
+    processor_array = kern.globals.processor_array
     cur_abstime = 0
 
     while unsigned(pset) != 0:
-        for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"):
-            if unsigned(processor.last_dispatch) > cur_abstime:
-                cur_abstime = unsigned(processor.last_dispatch)
-
-        for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"):
-            if unsigned(processor.last_dispatch) > cur_abstime:
-                cur_abstime = unsigned(processor.last_dispatch)
-
-        for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"):
+        cpu_bitmap = int(pset.cpu_bitmask)
+        for cpuid in IterateBitmap(cpu_bitmap):
+            processor = processor_array[cpuid]
             if unsigned(processor.last_dispatch) > cur_abstime:
                 cur_abstime = unsigned(processor.last_dispatch)
 
@@ -377,20 +372,22 @@ def ShowSchedHistory(cmd_args=None, cmd_options=None):
     run_count      = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')]
     fixpri_count   = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')]
     share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')]
+    share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')]
     share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')]
     share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')]
 
     sched_pri_shifts = kern.globals.sched_run_buckets
 
     share_fg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')]
+    share_df_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')]
     share_ut_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')]
     share_bg_shift = sched_pri_shifts[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')]
 
 
     print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals)
-    print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, share_bg_count)
+    print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count)
     print "Mach factor: {g.sched_mach_factor:d} Load factor: {g.sched_load_average:d} Sched tick: {g.sched_tick:d} timestamp: {g.sched_tick_last_abstime:d} interval:{g.sched_tick_interval:d}\n".format(g=kern.globals)
-    print "Fixed shift: {g.sched_fixed_shift:d} FG shift: {:d} UT shift: {:d} BG shift: {:d}\n".format(share_fg_shift, share_ut_shift, share_bg_shift, g=kern.globals)
+    print "Fixed shift: {g.sched_fixed_shift:d} FG shift: {:d} DF shift: {:d} UT shift: {:d} BG shift: {:d}\n".format(share_fg_shift, share_df_shift, share_ut_shift, share_bg_shift, g=kern.globals)
     print "sched_pri_decay_band_limit: {g.sched_pri_decay_band_limit:d} sched_decay_usage_age_factor: {g.sched_decay_usage_age_factor:d}\n".format(g=kern.globals)
 
     if kern.arch == 'x86_64':
@@ -572,11 +569,12 @@ def ShowScheduler(cmd_args=None):
     run_count      = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_RUN')]
     fixpri_count   = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_FIXPRI')]
     share_fg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_FG')]
+    share_df_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_DF')]
     share_ut_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_UT')]
     share_bg_count = run_buckets[GetEnumValue('sched_bucket_t::TH_BUCKET_SHARE_BG')]
 
     print "Processors: {g.processor_avail_count:d} Runnable threads: {:d} Fixpri threads: {:d}\n".format(run_count, fixpri_count, g=kern.globals)
-    print "FG Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_ut_count, share_bg_count)
+    print "FG Timeshare threads: {:d} DF Timeshare threads: {:d} UT Timeshare threads: {:d} BG Timeshare threads: {:d}\n".format(share_fg_count, share_df_count, share_ut_count, share_bg_count)
 
     if show_group_pset_runq:
         if hasattr(kern.globals, "multiq_sanity_check"):
@@ -620,41 +618,69 @@ def ShowScheduler(cmd_args=None):
                         print "Group {: <#012x} Task {: <#012x}\n".format(unsigned(group), unsigned(task))
                         ShowRunQSummary(group.runq)
             print " \n"
+            
+            processor_array = kern.globals.processor_array
 
             print "Active Processors:\n"
-            for processor in ParanoidIterateLinkageChain(pset.active_queue, "processor_t", "processor_queue"):
-                print "    " + GetProcessorSummary(processor)
-                ShowActiveThread(processor)
-                ShowNextThread(processor)
-
-                if show_priority_runq:
-                    runq = processor.runq
-                    ShowRunQSummary(runq)
-                if show_grrr:
-                    grrr_runq = processor.grrr_runq
-                    ShowGrrrSummary(grrr_runq)
+            active_bitmap = int(pset.cpu_state_map[5]) | int(pset.cpu_state_map[6])
+            for cpuid in IterateBitmap(active_bitmap):
+                processor = processor_array[cpuid]
+                if processor != 0:
+                    print "    " + GetProcessorSummary(processor)
+                    ShowActiveThread(processor)
+                    ShowNextThread(processor)
+
+                    if show_priority_runq:
+                        runq = processor.runq
+                        ShowRunQSummary(runq)
+                    if show_grrr:
+                        grrr_runq = processor.grrr_runq
+                        ShowGrrrSummary(grrr_runq)
             print " \n"
 
 
             print "Idle Processors:\n"
-            for processor in ParanoidIterateLinkageChain(pset.idle_queue, "processor_t", "processor_queue"):
-                print "    " + GetProcessorSummary(processor)
-                ShowActiveThread(processor)
-                ShowNextThread(processor)
-
-                if show_priority_runq:
-                    ShowRunQSummary(processor.runq)
+            idle_bitmap = int(pset.cpu_state_map[4]) & int(pset.primary_map)
+            for cpuid in IterateBitmap(idle_bitmap):
+                processor = processor_array[cpuid]
+                if processor != 0:
+                    print "    " + GetProcessorSummary(processor)
+                    ShowActiveThread(processor)
+                    ShowNextThread(processor)
+
+                    if show_priority_runq:
+                        ShowRunQSummary(processor.runq)
             print " \n"
 
 
             print "Idle Secondary Processors:\n"
-            for processor in ParanoidIterateLinkageChain(pset.idle_secondary_queue, "processor_t", "processor_queue"):
-                print "    " + GetProcessorSummary(processor)
-                ShowActiveThread(processor)
-                ShowNextThread(processor)
+            idle_bitmap = int(pset.cpu_state_map[4]) & ~(int(pset.primary_map))
+            for cpuid in IterateBitmap(idle_bitmap):
+                processor = processor_array[cpuid]
+                if processor != 0:
+                    print "    " + GetProcessorSummary(processor)
+                    ShowActiveThread(processor)
+                    ShowNextThread(processor)
+
+                    if show_priority_runq:
+                        print ShowRunQSummary(processor.runq)
+            print " \n"
 
-                if show_priority_runq:
-                    print ShowRunQSummary(processor.runq)
+
+            print "Other Processors:\n"
+            other_bitmap = 0
+            for i in range(0, 4):
+                other_bitmap |= int(pset.cpu_state_map[i])
+            other_bitmap &= int(pset.cpu_bitmask)
+            for cpuid in IterateBitmap(other_bitmap):
+                processor = processor_array[cpuid]
+                if processor != 0:
+                    print "    " + GetProcessorSummary(processor)
+                    ShowActiveThread(processor)
+                    ShowNextThread(processor)
+
+                    if show_priority_runq:
+                        ShowRunQSummary(processor.runq)
             print " \n"
 
 
@@ -791,6 +817,32 @@ def ParanoidIterateLinkageChain(queue_head, element_type, field_name, field_ofst
 ParanoidIterateLinkageChain.enable_paranoia = True
 ParanoidIterateLinkageChain.enable_debug = False
 
+def bit_first(bitmap):
+    return bitmap.bit_length() - 1
+
+def lsb_first(bitmap):
+    bitmap = bitmap & -bitmap
+    return bit_first(bitmap)
+
+def IterateBitmap(bitmap):
+    """ Iterate over a bitmap, returning the index of set bits starting from 0
+
+        params:
+            bitmap       - value       : bitmap
+        returns:
+            A generator does not return. It is used for iterating.
+            value  : index of a set bit
+        example usage:
+            for cpuid in IterateBitmap(running_bitmap):
+                print processor_array[cpuid]
+    """
+    i = lsb_first(bitmap)
+    while (i >= 0):
+        yield i
+        bitmap = bitmap & ~((1 << (i + 1)) - 1)
+        i = lsb_first(bitmap)
+
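+# Example: a bitmap of 0b1011 yields cpu ids 0, 1 and 3 in that order:
+# lsb_first() returns the lowest set bit, and the loop then clears all bits at
+# or below it before looking for the next one.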
+
 # Macro: showallcallouts
 
 def ShowThreadCall(prefix, call):
diff --git a/tools/lldbmacros/skywalk.py b/tools/lldbmacros/skywalk.py
new file mode 100755 (executable)
index 0000000..2119bc0
--- /dev/null
@@ -0,0 +1,566 @@
+
+""" Please make sure you read the README COMPLETELY BEFORE reading anything below.
+    It is very critical that you read the coding guidelines in Section E of the README file.
+"""
+
+from xnu import *
+from utils import *
+from string import *
+
+import xnudefines
+
+def IterateProcChannels(proc):
+    """ Iterate through all channels in the given process
+
+        params:
+            proc - the proc object
+        returns: nothing, this is meant to be used as a generator function
+            kc - yields each kern_channel in the process
+    """
+
+    proc_filedesc = proc.p_fd
+    proc_lastfile = unsigned(proc_filedesc.fd_lastfile)
+    proc_ofiles = proc_filedesc.fd_ofiles
+
+    count = 0
+    while count <= proc_lastfile:
+        if unsigned(proc_ofiles[count]) != 0:
+            proc_fd_fglob = proc_ofiles[count].f_fglob
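+            # a fileops type of 10 marks a skywalk channel descriptor; its
+            # fg_data points at the kern_channel.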
+            if (unsigned(proc_fd_fglob.fg_ops.fo_type) == 10):
+                yield Cast(proc_fd_fglob.fg_data, 'kern_channel *')
+        count += 1
+
+def IterateKernChannelRings(kc, kind):
+    """ Iterate through all rings on a given channel
+    """
+
+    NR_RX = 0
+    NR_TX = 1
+    NR_A  = 2
+    NR_F  = 3
+
+    if kind == NR_RX:
+        rings = kc.ch_na.na_rx_rings
+    elif kind == NR_TX :
+        rings = kc.ch_na.na_tx_rings
+    elif kind == NR_A :
+        rings = kc.ch_na.na_alloc_rings
+    else :
+        rings = kc.ch_na.na_free_rings
+
+    # note that ch_last is actually one greater than the last
+    # as per the comment in ch_connect
+    for i in xrange(kc.ch_first[kind], kc.ch_last[kind]):
+        yield addressof(rings[i])
+
+# Note this is broken if you have type summaries enabled
+# because we are summarizing the pointer to the structure
+# and not the structure itself.  Unfortunately, that's
+# the pattern used elsewhere.
+# Trying to actually use the type summary will blow up
+# because it has a linked list pointer to itself
+#
+@lldb_type_summary(['kern_channel_t', 'kern_channel *'])
+@header('{:<20s} {:<36s}'.format('kern_channel', 'uuid'))
+def GetKernChannelSummary(kc):
+    """ Summarizes a kern_channel and related information
+
+        returns: str - summary of kern_channel
+    """
+
+    format_string = '{o: <#020x} {u: <36s}'
+    return format_string.format(
+        o=kc,
+        u=GetUUIDSummary(kc.ch_info.cinfo_ch_id))
+
+@lldb_type_summary(['__kern_channel_ring *'])
+@header('{:<20s} {:<65s} {:>10s} | {:<5s} {:<5s} | {:<5s} {:<5s} {:<5s} | {:<5s} {:<5s} {:<5s}'.format(
+        'kernchannelring', 'name', 'flags', 'kc', 'kt', 'rc', 'rh', 'rt', 'c', 'h', 't'))
+def GetKernChannelRingSummary(kring):
+    """ Summarizes a __kern_channel_ring and related information
+
+        returns: str - summary of kern_channel_ring
+    """
+
+    format_string = '{o: <#020x} "{name: <63s}" {flags: >#010x} | {kh: <5d} {kt: <5d} | {rh: <5d} {rt: <5d} | {h: <5d} {t: <5d}'
+    return format_string.format(
+        o=kring,
+        name=kring.ckr_name,
+        flags=kring.ckr_flags,
+        kh=kring.ckr_khead,
+        kt=kring.ckr_ktail,
+        rh=kring.ckr_rhead,
+        rt=kring.ckr_rtail,
+        h=kring.ckr_ring.ring_head,
+        t=kring.ckr_ring.ring_tail)
+
+@lldb_command('showprocchannels')
+def ShowProcChannels(cmd_args=None):
+    """ Show the skywalk channels for a given process.
+
+        usage: showprocchannels <proc_t>
+    """
+
+    if not cmd_args:
+        raise ArgumentError('missing struct proc * argument')
+
+    proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t')
+
+    print GetKernChannelSummary.header
+    for kc in IterateProcChannels(proc):
+        print GetKernChannelSummary(kc)
+
+@lldb_command('showchannelrings')
+def ShowChannelRings(cmd_args=None):
+    """ Show the skywalk rings for a given channel.
+
+        usage: showchannelrings <struct kern_channel *>
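+        example (address is hypothetical): showchannelrings 0xffffff8012345678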
+    """
+
+    if not cmd_args:
+        raise ArgumentError('missing struct kern_channel * argument')
+
+    kc = kern.GetValueFromAddress(cmd_args[0], 'kern_channel *')
+
+    print "RX rings:"
+    print GetKernChannelRingSummary.header
+    for ring in IterateKernChannelRings(kc, 0) :
+        print GetKernChannelRingSummary(ring)
+
+    print "TX rings:"
+    print GetKernChannelRingSummary.header
+    for ring in IterateKernChannelRings(kc, 1) :
+        print GetKernChannelRingSummary(ring)
+
+    print "ALLOC rings:"
+    print GetKernChannelRingSummary.header
+    for ring in IterateKernChannelRings(kc, 2) :
+        print GetKernChannelRingSummary(ring)
+
+    print "FREE rings:"
+    print GetKernChannelRingSummary.header
+    for ring in IterateKernChannelRings(kc, 3) :
+        print GetKernChannelRingSummary(ring)
+
+def SkmemCacheModeAsString(mode) :
+    out_string = ""
+    SKM_MODE_NOCACHE = 0x1
+    SKM_MODE_AUDIT   = 0x2
+
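+    # Builds a two-character flag string from skm_mode: "n" if SKM_MODE_NOCACHE
+    # is set, "a" if SKM_MODE_AUDIT is set, "-" for each unset flag.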
+    if (mode & SKM_MODE_NOCACHE) :
+        out_string += "n"
+    else :
+        out_string += "-"
+    if (mode & SKM_MODE_AUDIT) :
+        out_string += "a"
+    else :
+        out_string += "-"
+
+    return out_string
+
+@lldb_command('showskmemcache')
+def ShowSkmemCache(cmd_args=None) :
+    """ Show the global list of skmem caches
+    """
+
+    format_string = "{:<4s}  {:<18s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<4s} {:<s}"
+    print format_string.format("", "ADDR", "BUFI", "BUFM", "RESC", "SLCR", "SLDE", "SLAL", "SLFR", "DECO", "MODE", "NAME")
+
+    i = 1
+    skmhead = kern.globals.skmem_cache_head
+
+    for skm in IterateTAILQ_HEAD(skmhead, "skm_link") :
+        format_string = "{:>4d}: 0x{:<08x} {:<4d} {:<4d} {:<4d} {:<4d} {:<4d} {:<4d} {:<4d} {:<4d} {:<4s} \"{:<s}\""
+        print format_string.format(i, skm, skm.skm_bufinuse, skm.skm_bufmax, skm.skm_rescale, skm.skm_sl_create, skm.skm_sl_destroy, skm.skm_sl_alloc, skm.skm_sl_free, skm.skm_depot_contention, SkmemCacheModeAsString(skm.skm_mode), str(skm.skm_name))
+        i += 1
+
+@lldb_command('showskmemslab')
+def ShowBufCtl(cmd_args=None) :
+    """ Show slabs and bufctls of a skmem cache
+    """
+
+    if (cmd_args == None or len(cmd_args) == 0) :
+        print "Missing argument 0 (skmem_cache address)."
+        return
+
+    skm = kern.GetValueFromAddress(cmd_args[0], 'skmem_cache *')
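+    # Walks the partial and empty slab lists; each line shows the slab address,
+    # sl_refcnt, sl_base and sl_basem.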
+
+    for slab in IterateTAILQ_HEAD(skm.skm_sl_partial, "sl_link") :
+        format_string = "{:<08x} {:<4d} 0x{:<08x} 0x{:08x}"
+        print format_string.format(slab, slab.sl_refcnt, slab.sl_base, slab.sl_basem)
+
+    for slab in IterateTAILQ_HEAD(skm.skm_sl_empty, "sl_link") :
+        format_string = "{:<08x} {:<4d} 0x{:<08x} 0x{:08x}"
+        print format_string.format(slab, slab.sl_refcnt, slab.sl_base, slab.sl_basem)
+
+def SkmemArenaTypeAsString(type) :
+    out_string = ""
+    SKMEM_ARENA_TYPE_NEXUS  = 0
+    SKMEM_ARENA_TYPE_NECP   = 1
+    SKMEM_ARENA_TYPE_SYSTEM = 2
+
+    if (type == SKMEM_ARENA_TYPE_NEXUS) :
+        out_string += "NEXUS"
+    elif (type == SKMEM_ARENA_TYPE_NECP) :
+        out_string += "NECP"
+    elif (type == SKMEM_ARENA_TYPE_SYSTEM) :
+        out_string += "SYSTEM"
+    else :
+        out_string += "?"
+
+    return out_string
+
+@lldb_command('showskmemarena')
+def ShowSkmemArena(cmd_args=None) :
+    """ Show the global list of skmem arenas
+    """
+
+    i = 1
+    arhead = kern.globals.skmem_arena_head
+
+    for ar in IterateTAILQ_HEAD(arhead, "ar_link") :
+        format_string = "{:>4d}: 0x{:<08x} {:<6s} {:>5d} KB \"{:<s}\""
+        print format_string.format(i, ar, SkmemArenaTypeAsString(ar.ar_type), ar.ar_mapsize >> 10, str(ar.ar_name))
+        i += 1
+
+@lldb_command('showskmemregion')
+def ShowSkmemRegion(cmd_args=None) :
+    """ Show the global list of skmem regions
+    """
+
+    i = 1
+    skrhead = kern.globals.skmem_region_head
+
+    for skr in IterateTAILQ_HEAD(skrhead, "skr_link") :
+        format_string = "{:>4d}: 0x{:<08x} \"{:<s}\""
+        print format_string.format(i, skr, str(skr.skr_name))
+        i += 1
+
+@lldb_command('showchannelupphash')
+def ShowChannelUppHash(cmd_args=None) :
+    """ Show channel user packet pool hash chain
+    """
+
+    if (cmd_args == None or len(cmd_args) == 0) :
+        print "Missing argument 0 (skmem_cache address)."
+        return
+
+    ch = kern.GetValueFromAddress(cmd_args[0], 'kern_channel *')
+    KERN_CHANNEL_UPP_HTBL_SIZE = 256
+
+    for i in range(KERN_CHANNEL_UPP_HTBL_SIZE) :
+        bkt = addressof(ch.ch_upp_hash_table[i])
+        format_string = "{:>4d} 0x{:<08x}"
+        print format_string.format(i, bkt)
+        for kqum in IterateListEntry(bkt.upp_head, 'struct __kern_quantum *',
+                                      'qum_upp_link', list_prefix='s') :
+            format_string = "0x{:<08x}"
+            print format_string.format(kqum)
+
+@lldb_type_summary(['struct ns *'])
+@header('{:<20s} {:<5s} {:<48s} {:<4s}'.format('ns', 'proto', 'addr', 'nreservations'))
+def GetStructNsSummary(ns):
+    """ Summarizes a struct ns from the netns
+
+        returns: str - summary of struct ns
+    """
+
+    if (ns.ns_proto == IPPROTO_TCP):
+        proto = "tcp"
+    elif (ns.ns_proto == IPPROTO_UDP):
+        proto = "udp"
+    else:
+        proto = str(ns.ns_proto)
+
+    if (ns.ns_addr_len == sizeof('struct in_addr')):
+        addr = GetInAddrAsString(addressof(ns.ns_inaddr))
+    elif (ns.ns_addr_len == sizeof('struct in6_addr')):
+        addr = GetIn6AddrAsString(ns.ns_in6addr.__u6_addr.__u6_addr8)
+    else:
+        addr = "<unknown> bad len {:d}".format(unsigned(ns.ns_addr_len))
+
+    format_string = '{o:#020x} {p:<5s} {a:<48s} {n:<4d}'
+
+    """ show ports and refs, one per line
+    """
+    ports_string = "ports & refs\n"
+    for f in IterateRBTreeEntry(ns.ns_reservations, 'struct ns_reservation *', 'nsr_link'):
+        ports_string += "\t%u" % f.nsr_port
+        ports_string += "\tlisten %d\tskywalk %d\tbsd %d\tpf %d\n" % (f.nsr_refs[0], f.nsr_refs[1], f.nsr_refs[2], f.nsr_refs[3])
+    """ show just the ports, not refs
+    offs = 0
+    ports_string = "\nports:\t"
+    for f in IterateRBTreeEntry(ns.ns_reservations, 'struct ns_reservation *', 'nsr_link'):
+        if (len(ports_string)-offs > 70):
+            ports_string += "\n\t"
+            offs = len(ports_string)
+        ports_string += " %u" % f.nsr_port
+    """
+
+    return format_string.format(
+        o=ns,
+        p=proto,
+        a=addr,
+        n=ns.ns_n_reservations) + ports_string
+
+@lldb_command('shownetns')
+def ShowNetNS(cmd_args=None):
+    """ Show the netns table
+    """
+    print"\nnetns_namespaces:"
+    print GetStructNsSummary.header
+
+    namespaces = kern.globals.netns_namespaces
+    for ns in IterateRBTreeEntry(namespaces, 'struct ns *', 'ns_link'):
+        print GetStructNsSummary(ns)
+
+    print "\nwild: (these should be duplicated above)"
+    print GetStructNsSummary.header
+    for i in range(0,4):
+        print GetStructNsSummary(kern.globals.netns_global_wild[i])
+
+    print "\nnon wild:"
+    print GetStructNsSummary.header
+    for i in range(0,4):
+        print GetStructNsSummary(kern.globals.netns_global_non_wild[i])
+
+
+@lldb_type_summary(['struct ns_token *'])
+@header('{:<20s} {:<5s} {:<48s} {:<12s} {:<8s} {:<38s} {:<38s} {:<12s}'.format('nt', 'proto', 'addr', 'port', 'owner', 'ifp', 'parent', 'flags'))
+def GetNsTokenSummary(nt):
+    """ Summarizes a struct ns from the netns
+
+        returns: str - summary of struct ns
+    """
+
+    if (nt.nt_proto == IPPROTO_TCP):
+        proto = "tcp"
+    elif (nt.nt_proto == IPPROTO_UDP):
+        proto = "udp"
+    else:
+        proto = str(nt.nt_proto)
+
+    if (nt.nt_addr_len == sizeof('struct in_addr')):
+        addr = GetInAddrAsString(addressof(nt.nt_inaddr))
+    elif (nt.nt_addr_len == sizeof('struct in6_addr')):
+        addr = GetIn6AddrAsString(nt.nt_in6addr.__u6_addr.__u6_addr8)
+    else:
+        addr = "<unknown> bad len {:d}".format(unsigned(nt.nt_addr_len))
+
+    format_string = '{o:#020x} {p:<5s} {a:<48s} {pt:<12s} {wn:<8s} {ifp:38s} {pa:38s} {f:#012x}'
+
+    ports = "%u" % nt.nt_port
+
+    ifp = "(struct ifnet *)" + hex(nt.nt_ifp)
+
+    if ((nt.nt_flags & 0x7) == 0x00):
+        owner = "LISTENER"
+        parent = "(void *)" + hex(nt.nt_parent)
+    elif ((nt.nt_flags & 0x7) == 0x01):
+        owner = "SKYWALK"
+        parent = "(struct flow_entry *)" + hex(nt.nt_parent_skywalk)
+    elif ((nt.nt_flags & 0x7) == 0x02): # XXX xnudefines?
+        owner = "BSD"
+        parent = "(struct inpcb *)" + hex(nt.nt_parent_bsd)
+    elif ((nt.nt_flags & 0x7) == 0x03): # XXX xnudefines?
+        owner = "PF"
+        parent = "(void *)" + hex(nt.nt_parent)
+    else:
+        owner = "?"
+        parent = "(void *)" + hex(nt.nt_parent)
+
+    return format_string.format(
+        o=nt,
+        p=proto,
+        a=addr,
+        pt=ports,
+        wn=owner,
+        ifp=ifp,
+        pa=parent,
+        f=nt.nt_flags)
+
+@lldb_command("showallnetnstokens")
+def ShowAllNetNSTokens(cmd_args=None):
+    """ show all netns tokens
+    """
+
+    tokenhead = kern.globals.netns_all_tokens
+    print GetNsTokenSummary.header
+    for nt in IterateListEntry(tokenhead, 'struct ns_token *', 'nt_all_link', list_prefix='s'):
+        print GetNsTokenSummary(nt)
+
+@lldb_command("shownetnstokens")
+def ShowNetNSTokens(cmd_args=None):
+    """ show netns tokens attached to an ifp
+        with no args, shows unbound tokens
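+
+        usage: shownetnstokens [<ifnet address>]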
+    """
+
+    if (cmd_args == None or len(cmd_args) == 0):
+        print "No ifp argument provided, showing unbound tokens"
+        tokenhead = kern.globals.netns_unbound_tokens
+    elif len(cmd_args) > 0:
+        ifp = kern.GetValueFromAddress(cmd_args[0], 'ifnet *')
+        print "Showing tokens for ifp %r" % ifp
+        tokenhead = ifp.if_netns_tokens
+    else:
+        print "Missing ifp argument 0 in shownetnstokens"
+        print cmd_args
+        return
+
+    print GetNsTokenSummary.header
+    for nt in IterateListEntry(tokenhead, 'struct ns_token *', 'nt_ifp_link', list_prefix='s'):
+        print GetNsTokenSummary(nt)
+
+def IterateSTAILQ_HEAD(headval, element_name):
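+    """ Iterate over a STAILQ_HEAD, yielding each element linked through the
+        given element_name field.
+    """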
+    iter_val = headval.stqh_first
+    while unsigned(iter_val) != 0 :
+        yield iter_val
+        iter_val = iter_val.__getattr__(element_name).stqe_next
+    #end of yield loop
+
+@lldb_command("shownexuschannels")
+def ShowNexusChannels(cmd_args=None):
+    """ show nexus channels
+    """
+    if (cmd_args == None or len(cmd_args) == 0):
+        print "Missing argument 0 (kern_nexus address)."
+        return
+
+    nx = kern.GetValueFromAddress(cmd_args[0], 'kern_nexus *')
+    i = 1
+
+    format_string = "{:>4s}  {:<18s} {:>4s} {:<7s} {:<7s} {:<18s} {:<18s} {:<18s} {:>8s} {:6s} {:<18s} {:>4s} {:s}"
+    print format_string.format("", "addr", "refs", "txrings", "rxrings", "arena", "ioskmap", "mapaddr", "mapsize", "maprdr", "na", "fd", "process")
+
+    for ch in IterateSTAILQ_HEAD(nx.nx_ch_head, "ch_link"):
+        format_string = "{:>4d}: 0x{:<08x} {:>4d} [{:2d},{:2d}] [{:2d},{:2d}] 0x{:<08x} 0x{:<08x} 0x{:<16x} {:>8d} {:>6d} 0x{:<08x} {:>4d} {:s}({:d})"
+        print format_string.format(i, ch, ch.ch_refcnt, ch.ch_first[0], ch.ch_last[0], ch.ch_first[1], ch.ch_last[1], ch.ch_mmap.ami_arena, ch.ch_mmap.ami_mapref, ch.ch_mmap.ami_mapaddr, ch.ch_mmap.ami_mapsize, ch.ch_mmap.ami_redirect, ch.ch_na, ch.ch_fd, ch.ch_name, ch.ch_pid)
+        i += 1
+
+    for ch in IterateSTAILQ_HEAD(nx.nx_ch_nonxref_head, "ch_link"):
+        format_string = "{:>4d}: 0x{:<08x} {:>4d} [{:2d},{:2d}] [{:2d},{:2d}] 0x{:<08x} 0x{:<08x} 0x{:<16x} {:>8d} {:>6d} 0x{:<08x} {:>4d} {:s}({:d})"
+        print format_string.format(i, ch, ch.ch_refcnt, ch.ch_first[0], ch.ch_last[0], ch.ch_first[1], ch.ch_last[1], ch.ch_mmap.ami_arena, ch.ch_mmap.ami_mapref, ch.ch_mmap.ami_mapaddr, ch.ch_mmap.ami_mapsize, ch.ch_mmap.ami_redirect, ch.ch_na, ch.ch_fd, ch.ch_name, ch.ch_pid)
+        i += 1
+
+def IterateProcNECP(proc):
+    """ Iterate through all NECP descriptors in the given process
+
+        params:
+            proc - the proc object
+        returns: nothing, this is meant to be used as a generator function
+            necp - yields each necp_fd_data in the process
+    """
+
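+    # fo_type 9 is assumed to correspond to DTYPE_NETPOLICY (an NECP descriptor)
+    # in this kernel version; all other descriptor types are skipped.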
+    proc_filedesc = proc.p_fd
+    proc_lastfile = unsigned(proc_filedesc.fd_lastfile)
+    proc_ofiles = proc_filedesc.fd_ofiles
+
+    count = 0
+    while count <= proc_lastfile:
+        if unsigned(proc_ofiles[count]) != 0:
+            proc_fd_fglob = proc_ofiles[count].f_fglob
+            if (unsigned(proc_fd_fglob.fg_ops.fo_type) == 9):
+                yield Cast(proc_fd_fglob.fg_data, 'necp_fd_data *')
+        count += 1
+
+def GetNECPClientBitFields(necp):
+    """ Return the bit fields in necp_client as string
+
+        returns: str - string representation of necp_client bit fields
+    """
+
+    bitfields_string = ''
+    if necp.result_read != 0:
+        bitfields_string += 'r'
+    else:
+        bitfields_string += '-'
+    if necp.allow_multiple_flows != 0:
+        bitfields_string += 'm'
+    else:
+        bitfields_string += '-'
+    if necp.background != 0:
+        bitfields_string += 'b'
+    else:
+        bitfields_string += '-'
+    if necp.background_update != 0:
+        bitfields_string += 'B'
+    else:
+        bitfields_string += '-'
+    if necp.platform_binary != 0:
+        bitfields_string += 'p'
+    else:
+        bitfields_string += '-'
+
+    return bitfields_string
+
+def GetNECPFlowBitFields(flow_registration):
+    """ Return the bit fields in necp_client_flow_registration as string
+
+        returns: str - string representation of necp_client_flow_registration bit fields
+    """
+
+    bitfields_string = ''
+    if flow_registration.flow_result_read != 0:
+        bitfields_string += 'r'
+    else:
+        bitfields_string += '-'
+    if flow_registration.defunct != 0:
+        bitfields_string += 'd'
+    else:
+        bitfields_string += '-'
+
+    return bitfields_string
+
+@lldb_type_summary(['necp_fd_data *'])
+@header('{:<20s} {:<8s}'.format('necp_fd_data', "flags"))
+def GetNECPSummary(necp):
+    """ Summarizes a necp_fd_data and related information
+
+        returns: str - summary of necp_fd_data
+    """
+
+    format_string = '{o: <#020x} {u:<#08x}'
+
+    stats_arenas_string = "\n\n\t%-18s %-39s %-4s %-10s\n" % ("stats_arenas", "mmap", "refs", "flags")
+    for sa in IterateListEntry(necp.stats_arena_list, 'struct necp_arena_info *', 'nai_chain'):
+        stats_arenas_string += "\t0x%016x " % sa
+        stats_arenas_string += "[0x%016x-0x%016x) " % (sa.nai_mmap.ami_mapaddr,(sa.nai_mmap.ami_mapaddr+sa.nai_mmap.ami_mapsize))
+        stats_arenas_string += "%4u " % sa.nai_use_count
+        stats_arenas_string += "0x%08x " % sa.nai_flags
+        stats_arenas_string += "\n"
+
+    clients_string = ""
+    for c in IterateRBTreeEntry(necp.clients, 'struct necp_client *', 'link'):
+        clients_string += "\n\t%-18s %-36s %-4s %-5s\n" % ("necp_clients", "client_id", "refs", "flags")
+        clients_string += "\t0x%016x " % c
+        clients_string += "%36s " % GetUUIDSummary(c.client_id)
+        clients_string += "%4u " % c.reference_count
+        clients_string += "%5s " % GetNECPClientBitFields(c)
+        count = 0;
+        for f in IterateRBTreeEntry(c.flow_registrations, 'struct necp_client_flow_registration *', 'client_link'):
+            if count == 0:
+                clients_string += "\n\t\t%-18s %-36s %-2s %-18s %-18s %-18s\n" % ("flow_registration", "registraton_id", "flags", "stats_arena", "kstats_obj", "ustats_obj")
+            clients_string += "\t\t0x%016x " % f
+            clients_string += "%36s " % GetUUIDSummary(f.registration_id)
+            clients_string += "%2s " % GetNECPFlowBitFields(f)
+            clients_string += "0x%016x " % f.stats_arena
+            clients_string += "0x%016x " % f.kstats_kaddr
+            clients_string += "0x%016x " % f.ustats_uaddr
+        clients_string += "\n"
+
+    return format_string.format(
+        o=necp,
+        u=necp.flags) + stats_arenas_string + clients_string
+
+@lldb_command('showprocnecp')
+def ShowProcNECP(cmd_args=None):
+    """ Show NECP descriptors for a given process.
+
+        usage: showprocnecp <proc_t>
+    """
+
+    if not cmd_args:
+        raise ArgumentError('missing struct proc * argument')
+
+    proc = kern.GetValueFromAddress(cmd_args[0], 'proc_t')
+
+    print GetNECPSummary.header
+    for kc in IterateProcNECP(proc):
+        print GetNECPSummary(kc)
diff --git a/tools/lldbmacros/turnstile.py b/tools/lldbmacros/turnstile.py
new file mode 100755 (executable)
index 0000000..372e13e
--- /dev/null
@@ -0,0 +1,147 @@
+from xnu import *
+import sys, shlex
+from utils import *
+from waitq import *
+import xnudefines
+
+@lldb_type_summary(['struct turnstile *'])
+@header("{0: <20s} {1: <5s} {2: <20s} {3: <8s} {4: <8s} {5: <23s} {6: <20s} {7: <16s} {8: <20s} {9: <20s}".format("turnstile", "pri", "waitq", "type", "state", "inheritor", "proprietor", "gen count", "thread", "prev_thread"))
+def GetTurnstileSummary(turnstile):
+    """ Summarizes the turnstile
+        params: turnstile = value of the object of type struct turnstile *
+        returns: String with summary of the type.
+    """
+
+    type_and_gencount = Cast(addressof(turnstile.ts_type_gencount), 'union turnstile_type_gencount *')
+    turnstile_type = ""
+
+    if type_and_gencount.ts_type == 0:
+      turnstile_type = "none   "
+    elif type_and_gencount.ts_type == 1:
+      turnstile_type = "knl_mtx"
+    elif type_and_gencount.ts_type == 2:
+      turnstile_type = "ulock  "
+    elif type_and_gencount.ts_type == 3:
+      turnstile_type = "pth_mtx"
+    elif type_and_gencount.ts_type == 4:
+      turnstile_type = "syn_ipc"
+    elif type_and_gencount.ts_type == 5:
+      turnstile_type = "kqwl   "
+    elif type_and_gencount.ts_type == 6:
+      turnstile_type = "workq  "
+    elif type_and_gencount.ts_type == 7:
+      turnstile_type = "knote  "
+
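+    # ts_state bit meanings (assumed): 0x1 = on a thread (T), 0x2 = on a
+    # freelist (F), 0x4 = on the hash table (H), 0x8 = held by a proprietor (P)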
+    turnstile_state = ""
+    if turnstile.ts_state & 0x1:
+        turnstile_state += "T"
+    elif turnstile.ts_state & 0x2:
+        turnstile_state += "F"
+    elif turnstile.ts_state & 0x4:
+        turnstile_state += "H"
+    elif turnstile.ts_state & 0x8:
+        turnstile_state += "P"
+
+    if turnstile.ts_inheritor_flags & 0x4:
+        inheritor_type = "th"
+    elif turnstile.ts_inheritor_flags & 0x8:
+        inheritor_type = "ts"
+    elif turnstile.ts_inheritor_flags & 0x40:
+        inheritor_type = "wq"
+    else:
+        inheritor_type = "--"
+
+    format_str = "{0: <#020x} {1: <5d} {2: <#020x} {3: <8s} {4: <8s} {6: <2s}:{5: <#020x} {7: <#020x} {8: <16d}"
+    out_string = format_str.format(turnstile, turnstile.ts_priority, addressof(turnstile.ts_waitq),
+            turnstile_type, turnstile_state, turnstile.ts_inheritor, inheritor_type,
+            turnstile.ts_proprietor, type_and_gencount.ts_gencount)
+
+    #if DEVELOPMENT
+    format_str = " {0: <#020x} {1: <#020x}"
+    if hasattr(turnstile, 'ts_thread'):
+      out_string += format_str.format(turnstile.ts_thread, turnstile.ts_prev_thread)
+    #endif
+    return out_string
+
+def PrintTurnstile(turnstile):
+    """ print turnstile and it's free list.
+        params:
+            turnstile - turnstile to print
+    """
+    print GetTurnstileSummary(turnstile)
+
+    """ print turnstile freelist if its not on a thread or freelist """
+    if turnstile.ts_state & 0x3 == 0:
+      needsHeader = True
+      for free_turnstile in IterateListEntry(turnstile.ts_free_turnstiles, 'struct turnstile *', 'ts_free_elm', 's'):
+        if needsHeader:
+          print "    Turnstile free List"
+          header_str = "    " + GetTurnstileSummary.header
+          print header_str
+          needsHeader = False
+        print "    " + GetTurnstileSummary(free_turnstile)
+        print ""
+    return
+
+# Macro: showturnstile
+@lldb_command('showturnstile')
+def ShowTurnstile(cmd_args=None, cmd_options={}):
+    """ show the turnstile and all free turnstiles hanging off the turnstile.
+        Usage: (lldb)showturnstile <struct turnstile *>
+    """
+    if not cmd_args:
+      raise ArgumentError("Please provide arguments")
+
+    turnstile = kern.GetValueFromAddress(cmd_args[0], 'struct turnstile *')
+    print GetTurnstileSummary.header
+    PrintTurnstile(turnstile)
+    return
+# EndMacro: showturnstile
+
+@lldb_command('showturnstilehashtable')
+def ShowTurnstileHashTable(cmd_args=None, cmd_options={}):
+    """ show the global hash table for turnstiles.
+        Usage: (lldb)showturnstilehashtable
+    """
+    print GetTurnstileSummary.header
+    turnstile_htable_buckets = kern.globals.ts_htable_buckets
+    for index in range(0, turnstile_htable_buckets):
+        turnstile_bucket = GetObjectAtIndexFromArray(kern.globals.turnstile_htable, index)
+        for turnstile in IterateQueue(turnstile_bucket.ts_ht_bucket_list, 'struct turnstile *', 'ts_htable_link'):
+            PrintTurnstile(turnstile)
+    return True
+
+#if DEVELOPMENT
+# Macro: showallturnstiles
+@lldb_command('showallturnstiles')
+def ShowAllTurnstiles(cmd_args=None, cmd_options={}):
+    """ A DEVELOPMENT macro that walks the list of all allocated turnstile objects
+        and prints them.
+        usage: (lldb) showallturnstiles
+    """
+    if not hasattr(kern.globals, 'turnstiles_list'):
+      print "It seems you are running a build of kernel that does not have the list of all turnstiles."
+      return False
+    print GetTurnstileSummary.header
+    for turnstile in IterateQueue(kern.globals.turnstiles_list, 'struct turnstile *', 'ts_global_elm'):
+        PrintTurnstile(turnstile)
+    return True
+# EndMacro showallturnstiles
+
+# Macro: showallbusyturnstiles
+@lldb_command('showallbusyturnstiles')
+def ShowAllBusyTurnstiles(cmd_args=None, cmd_options={}):
+    """ A DEVELOPMENT macro that walks the list of all allocated turnstile objects
+        and prints only the ones currently in use (neither on a thread nor on a freelist).
+        usage: (lldb) showallbusyturnstiles
+    """
+    if not hasattr(kern.globals, 'turnstiles_list'):
+      print "It seems you are running a build of kernel that does not have the list of all turnstiles."
+      return False
+    print GetTurnstileSummary.header
+    for turnstile in IterateQueue(kern.globals.turnstiles_list, 'struct turnstile *', 'ts_global_elm'):
+      if turnstile.ts_state & 0x3 == 0:
+        PrintTurnstile(turnstile)
+    return True
+# EndMacro showallbusyturnstiles
+#endif
index 88a8858fc5a49dfb43a1e0bca91cefab6b0d8451..3413fff961331a0201d26d132274dc7f76200449 100755 (executable)
@@ -195,7 +195,6 @@ def PrintUserspaceData(cmd_args=None, cmd_options={}):
 
     return True
 
-
 @lldb_command('showtaskuserargs')
 def ShowTaskUserArgs(cmd_args=None, cmd_options={}):
     """ Read the process argv, env, and apple strings from the user stack
@@ -208,8 +207,9 @@ def ShowTaskUserArgs(cmd_args=None, cmd_options={}):
 
     task = kern.GetValueFromAddress(cmd_args[0], 'task *')
     proc = Cast(task.bsd_info, 'proc *')
+    ptrsize = 8 if int(task.t_flags) & 0x1 else 4
 
-    format_string = "Q" if kern.ptrsize == 8 else "I"
+    format_string = "Q" if ptrsize == 8 else "I"
 
     string_area_size = proc.p_argslen
     string_area_addr = proc.user_stack - string_area_size
@@ -220,7 +220,7 @@ def ShowTaskUserArgs(cmd_args=None, cmd_options={}):
         return False
 
     i = 0
-    pos = string_area_addr - kern.ptrsize
+    pos = string_area_addr - ptrsize
 
     for name in ["apple", "env", "argv"] :
         while True:
@@ -229,9 +229,9 @@ def ShowTaskUserArgs(cmd_args=None, cmd_options={}):
                     break
                 i += 1
 
-            pos -= kern.ptrsize
+            pos -= ptrsize
 
-            user_data_string = GetUserDataAsString(task, pos, kern.ptrsize)
+            user_data_string = GetUserDataAsString(task, pos, ptrsize)
             ptr = struct.unpack(format_string, user_data_string)[0]          
 
             if ptr == 0:
index 5a5079e7b876b36737a95082d6034a944f5cf0f6..74e54223e0d49b65278c01464ebfc4730bd16501 100755 (executable)
@@ -11,17 +11,8 @@ CPU_TYPE_X86_64 = 0x01000007
 CPU_TYPE_ARM = 0x0000000c
 CPU_TYPE_ARM64 = 0x0100000c
 
-
-CPU_SUBTYPE_X86_64_ALL = 3
-CPU_SUBTYPE_X86_64_H = 8
-CPU_SUBTYPE_ARMV8 = 13
-CPU_SUBTYPE_ARM_V7 = 9
-CPU_SUBTYPE_ARM_V7S = 11
-CPU_SUBTYPE_ARM_V7K = 12
-
-
 def GetRegisterSetForCPU(cputype, subtype):
-    if cputype ==  CPU_TYPE_ARM64:
+    if cputype == CPU_TYPE_ARM64:
         retval = Armv8_RegisterSet
     elif cputype == CPU_TYPE_ARM:
         retval = Armv7_RegisterSet
@@ -37,13 +28,12 @@ def GetRegisterSetForCPU(cputype, subtype):
 
 class UserThreadObject(object):
     """representation of userspace thread"""
-    def __init__(self, thr_obj, cputype, cpusubtype, kern_cputype):
+    def __init__(self, thr_obj, cputype, cpusubtype, is_kern_64bit):
         super(UserThreadObject, self).__init__()
         self.thread = thr_obj
         self.registerset = GetRegisterSetForCPU(cputype, cpusubtype)
         self.thread_id = unsigned(self.thread.thread_id)
         self.is64Bit = bool(cputype & 0x01000000)
-        isKern64Bit = bool(kern_cputype & 0x01000000)
 
         if self.is64Bit:
             if cputype == CPU_TYPE_X86_64:
@@ -58,12 +48,13 @@ class UserThreadObject(object):
                 self.saved_state = Cast(self.thread.machine.iss, 'x86_saved_state_t *').uss.ss_32
             if cputype == CPU_TYPE_ARM:
                 self.reg_type = "arm"
-                if not isKern64Bit:
+                if not is_kern_64bit:
                     self.saved_state = self.thread.machine.PcbData
                 else:
                     self.saved_state = self.thread.machine.contextData.ss.uss.ss_32
-        logging.debug("created thread id 0x%x of type %s, kern_cputype 0x%x cputype 0x%x"
-                      % (self.thread_id, self.reg_type, kern_cputype, cputype))
+
+        logging.debug("created thread id 0x%x of type %s, is_kern_64bit 0x%x cputype 0x%x"
+                      % (self.thread_id, self.reg_type, is_kern_64bit, cputype))
 
     def getRegisterValueByName(self, name):
         if self.reg_type == 'arm64':
@@ -108,30 +99,21 @@ class UserProcess(target.Process):
         if task.t_flags & 0x1:
             ptrsize = 8
         if task.t_flags & 0x2:
-            dataregisters64bit = 8
-
-        cputype = CPU_TYPE_X86_64
-        cpusubtype = CPU_SUBTYPE_X86_64_ALL
+            dataregisters64bit = True
 
+        is_kern_64bit = kern.arch in ['x86_64', 'x86_64h', 'arm64']
 
-        """ these computations should come out of the macho header i think """
-        """ where does kern.arch come from? what's kern.arch == armv8?? """ 
-        if kern.arch in ('arm'):
-            cputype = CPU_TYPE_ARM
-            cpusubtype = CPU_SUBTYPE_ARM_V7
-        elif kern.arch in ('armv8', 'arm64'):
-            cputype = CPU_TYPE_ARM64
-            cpusubtype = CPU_SUBTYPE_ARMV8
+        self.cputype = unsigned(self.proc.p_cputype)
+        self.cpusubtype = unsigned(self.proc.p_cpusubtype)
 
-        super(UserProcess, self).__init__(cputype, cpusubtype, ptrsize)
+        super(UserProcess, self).__init__(self.cputype, self.cpusubtype, ptrsize)
 
         self.hinfo['ostype'] = 'macosx'
-        if cputype != CPU_TYPE_X86_64:
+        if self.cputype != CPU_TYPE_X86_64 and self.cputype != CPU_TYPE_I386:
             self.hinfo['ostype'] = 'ios'
 
-        self.cputype = unsigned(self.proc.p_cputype)
-        self.cpusubtype = unsigned(self.proc.p_cpusubtype)
-        self.registerset = GetRegisterSetForCPU(cputype, cpusubtype)
+        self.registerset = GetRegisterSetForCPU(self.cputype, self.cpusubtype)
         logging.debug("process %s is64bit: %d ptrsize: %d cputype: %d  cpusubtype:%d",
                       hex(self.proc), int(dataregisters64bit), ptrsize,
                       self.cputype, self.cpusubtype
@@ -140,7 +122,7 @@ class UserProcess(target.Process):
         self.threads_ids_list = []
         logging.debug("iterating over threads in process")
         for thval in IterateQueue(task.threads, 'thread *', 'task_threads'):
-            self.threads[unsigned(thval.thread_id)] = UserThreadObject(thval, self.cputype, self.cpusubtype, cputype)
+            self.threads[unsigned(thval.thread_id)] = UserThreadObject(thval, self.cputype, self.cpusubtype, is_kern_64bit)
             self.threads_ids_list.append(unsigned(thval.thread_id))
 
     def getRegisterDataForThread(self, th_id, reg_num):
index eaa3bcc00b362d3cc96478fee972550c5b942c1d..33d601f8d2e2681672f207c58d0221163fa9a966 100755 (executable)
@@ -140,7 +140,16 @@ def Cast(obj, target_type):
     """
     return cast(obj, target_type)
 
-    
+def ContainerOf(obj, target_type, field_name):
+    """ Type cast an object to another C type from a pointer to a field.
+        params:
+            obj - core.value  object representing some C construct in lldb
+            target_type - str : ex 'struct thread'
+                        - lldb.SBType :
+            field_name - the field name within the target_type obj is a pointer to
+    """
+    return containerof(obj, target_type, field_name)
+
 def loadLLDB():
     """ Util function to load lldb python framework in case not available in common include paths.
     """
@@ -461,3 +470,7 @@ def print_hex_data(data, begin_offset=0, desc=""):
             char_buf = ""
     print "{:08x} {: <50s} |{: <16s}|".format(begin_offset + index - 16, hex_buf, char_buf)
     return
+
+def Ones(x):
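+    """ Return a bitmask with the low x bits set, e.g. Ones(3) == 0b111 == 7 """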
+    return (1 << x)-1
+
index 8bfa6391903d99cee429b5a4f44fcabb34f25192..6768635c0a68df17270a58a5822d66812e682894 100755 (executable)
@@ -832,6 +832,10 @@ def GetWaitqPreposts(waitq):
     wqset = Cast(waitq, 'waitq_set *')
     if wqset.wqset_prepost_id == 0:
         return []
+    if not wqset.wqset_q.waitq_prepost:
+        # If the "prepost_id" is non-zero, but the 'waitq_prepost' bit is
+        # *not* set, then this waitq actually has a prepost hook!
+        return [ "{0: <#18x}:{1: <18s}".format(wqset.wqset_prepost_id, "<hook>") ]
     return GetPrepostChain(wqset.wqset_prepost_id)
 
 
diff --git a/tools/lldbmacros/workqueue.py b/tools/lldbmacros/workqueue.py
new file mode 100755 (executable)
index 0000000..dae699f
--- /dev/null
@@ -0,0 +1,176 @@
+from xnu import *
+from scheduler import GetRecentTimestamp
+import xnudefines
+
+def GetProcWorkqueue(proc):
+    wq = proc.p_wqptr;
+    if unsigned(wq):
+        return Cast(wq, "struct workqueue *");
+    return None
+
+@header("{:<20s} {:<20s} {:<20s} {:<10s} {:<10s} {:<10s} {:<10s} {:<10s} {:<10s} {:<30s}".format(
+    'task', 'proc', 'wq', 'sched', 'pending', 'idle', 'dying', 'creations', 'fulfilled', 'wq_flags'))
+def GetWorkqueueSummary(proc, wq):
+    wq_flags = []
+    if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_EXITING"):
+        wq_flags.append("EXITING")
+    if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_PROC_SUSPENDED"):
+        wq_flags.append("PROC_SUSPENDED")
+    if wq.wq_flags & GetEnumValue("workq_state_flags_t::WQ_DEATH_CALL_SCHEDULED"):
+        wq_flags.append("DEATH_CALL")
+
+    scheduled = GetEnumValue("workq_state_flags_t::WQ_DELAYED_CALL_SCHEDULED")
+    pended = GetEnumValue("workq_state_flags_t::WQ_DELAYED_CALL_PENDED")
+    if wq.wq_flags & (scheduled | pended):
+        s = "DELAYED_CALL["
+        if wq.wq_flags & scheduled: s += 'S'
+        if wq.wq_flags & pended: s += 'P'
+        s += ']'
+        wq_flags.append(s)
+
+    scheduled = GetEnumValue("workq_state_flags_t::WQ_IMMEDIATE_CALL_SCHEDULED")
+    pended = GetEnumValue("workq_state_flags_t::WQ_IMMEDIATE_CALL_PENDED")
+    if wq.wq_flags & (scheduled | pended):
+        s = "IMMEDIATE_CALL["
+        if wq.wq_flags & scheduled: s += 'S'
+        if wq.wq_flags & pended: s += 'P'
+        s += ']'
+        wq_flags.append(s)
+
+    return "{p.task: <#020x} {p: <#020x} {wq: <#020x} {wq.wq_threads_scheduled: <10d} {wq.wq_reqcount: <10d} {wq.wq_thidlecount: <10d} {wq.wq_thdying_count: <10d} {wq.wq_creations: <10d} {wq.wq_fulfilled: <10d} {wq_flags: <30s}".format(p=proc, wq=wq, wq_flags=" ".join(wq_flags));
+
+@header("{:<20s} {:<20s} {:>10s}  {:9s} {:<20s} {:<10s} {:<30s}".format(
+    'thread', 'uthread', 'thport', 'kind', 'kqueue', 'idle (ms)', 'uu_workq_flags'))
+def GetWQThreadSummary(th, uth):
+    p = Cast(th.task.bsd_info, 'proc *')
+    wq = p.p_wqptr
+
+    uu_workq_flags = []
+    if uth.uu_workq_flags & 0x01: uu_workq_flags.append("NEW")
+    if uth.uu_workq_flags & 0x02:
+        uu_workq_flags.append("RUNNING")
+        if wq.wq_creator == uth:
+            kind = "creator"
+        else:
+            kind = "workq"
+        idle = ""
+    else:
+        ts = kern.GetNanotimeFromAbstime(GetRecentTimestamp() - uth.uu_save.uus_workq_park_data.idle_stamp) / 1e9
+        kind = "idle"
+        idle = "%#.03f" % (ts)
+    if uth.uu_workq_flags & 0x04: uu_workq_flags.append("DYING")
+    if uth.uu_workq_flags & 0x08: uu_workq_flags.append("OVERCOMMIT")
+    if uth.uu_workq_flags & 0x10: uu_workq_flags.append("OUTSIDE_QOS")
+    if uth.uu_workq_flags & 0x20: uu_workq_flags.append("IDLE_CLEANUP")
+    if uth.uu_workq_flags & 0x40: uu_workq_flags.append("EARLY_BOUND")
+    if uth.uu_workq_flags & 0x80: uu_workq_flags.append("CPU%")
+
+    kqr = uth.uu_kqr_bound
+    if not kqr:
+        kq = 0
+    elif kqr.kqr_state & 0x1: # workloop
+        kq = ContainerOf(kqr, 'struct kqworkloop', 'kqwl_request')
+        kind = "workloop"
+    else:
+        kq = p.p_fd.fd_wqkqueue
+        kind = "kqwq[%s]" % (xnudefines.thread_qos_short_strings[int(kqr.kqr_qos_index)])
+
+    return "{th: <#020x} {uth: <#020x} {thport: >#010x}  {kind: <9s} {kq: <#020x} {idle: <10s} {uu_workq_flags: <30s}".format(th=th, uth=uth, thport=uth.uu_workq_thport, kind=kind, kq=kq, idle=idle, uu_workq_flags=" ".join(uu_workq_flags))
+
+@header("{:<20s} {:<20s} {:<10s} {:<3s} {:<4s} {:<30s}".format(
+    'request', 'kqueue', 'state', '#', 'qos', 'tr_flags'))
+def GetWorkqueueThreadRequestSummary(proc, req):
+    kq = 0
+    tr_flags = []
+
+    if req.tr_flags & 0x01:
+        tr_flags.append("KEVENT")
+        kq = proc.p_fd.fd_wqkqueue
+    if req.tr_flags & 0x02:
+        tr_flags.append("WORKLOOP")
+        kq = ContainerOf(req, 'struct kqworkloop', 'kqwl_request.kqr_req')
+    if req.tr_flags & 0x04: tr_flags.append("OVERCOMMIT")
+    if req.tr_flags & 0x08: tr_flags.append("PARAMS")
+    if req.tr_flags & 0x10: tr_flags.append("OUTSIDE_QOS")
+
+    state = {0: "IDLE", 1: "NEW", 2: "QUEUED", 4: "BINDING" }[int(req.tr_state)]
+
+    qos = int(req.tr_qos)
+    if qos == 8:
+        qos = "MG"
+    elif qos == 7:
+        qos = "SP"
+    else:
+        qos = xnudefines.thread_qos_short_strings[qos]
+
+    return "{req: <#020x} {kq: <#020x} {state: <10s} {req.tr_count: <3d} {qos: <4s} {tr_flags: <30s}".format(req=req, kq=kq, state=state, qos=qos, tr_flags=" ".join(tr_flags))
+
+@lldb_command('showwqthread')
+def ShowWQThread(cmd_args=None):
+    """ Shows info about a workqueue thread
+
+        usage: showwqthread <thread_t>
+    """
+
+    if not cmd_args:
+        raise ArgumentError('missing struct thread * argument')
+
+    th = kern.GetValueFromAddress(cmd_args[0], "struct thread *")
+    if not (th.thread_tag & 0x20):
+        raise ArgumentError('not a workqueue thread')
+
+    print GetWQThreadSummary.header
+    print GetWQThreadSummary(th, Cast(th.uthread, 'struct uthread *'))
+
+
+@lldb_command('showprocworkqueue')
+def ShowProcWorkqueue(cmd_args=None):
+    """ Shows the process workqueue
+
+        usage: showprocworkqueue <proc_t>
+    """
+
+    if not cmd_args:
+        raise ArgumentError('missing struct proc * argument')
+
+    proc = kern.GetValueFromAddress(cmd_args[0], "proc_t")
+    wq = Cast(proc.p_wqptr, "struct workqueue *");
+    if wq:
+        print GetWorkqueueSummary.header
+        print GetWorkqueueSummary(proc, wq)
+
+        if wq.wq_reqcount:
+            print "    "
+            print "    " + GetWorkqueueThreadRequestSummary.header
+            if wq.wq_event_manager_threadreq:
+                print "    " + GetWorkqueueThreadRequestSummary(proc, wq.wq_event_manager_threadreq)
+            for req in IteratePriorityQueueEntry(wq.wq_overcommit_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
+            for req in IteratePriorityQueueEntry(wq.wq_constrained_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
+            for req in IteratePriorityQueueEntry(wq.wq_special_queue, 'struct workq_threadreq_s', 'tr_entry'):
+                print "    " + GetWorkqueueThreadRequestSummary(proc, req)
+
+        print "    "
+        print "    " + GetWQThreadSummary.header
+        for uth in IterateTAILQ_HEAD(wq.wq_thrunlist, "uu_workq_entry"):
+            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+        for uth in IterateTAILQ_HEAD(wq.wq_thidlelist, "uu_workq_entry"):
+            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+        for uth in IterateTAILQ_HEAD(wq.wq_thnewlist, "uu_workq_entry"):
+            print "    " + GetWQThreadSummary(Cast(uth.uu_thread, 'struct thread *'), uth)
+
+@lldb_command('showallworkqueues')
+def ShowAllWorkqueues(cmd_args=None):
+    """ Display a summary of all the workqueues in the system
+
+        usage: showallworkqueues
+    """
+
+    print GetWorkqueueSummary.header
+
+    for t in kern.tasks:
+        proc = Cast(t.bsd_info, 'proc *')
+        wq = Cast(proc.p_wqptr, "struct workqueue *");
+        if wq:
+            print GetWorkqueueSummary(proc, wq)
index 1806464fd72bbf38b64da88bf739cbb16312a764..1d30ad890498cb161028e7c001e8014b6e60f3e5 100755 (executable)
@@ -802,6 +802,7 @@ from pmap import *
 from ioreg import *
 from mbufs import *
 from net import *
+from skywalk import *
 from kdp import *
 from userspace import *
 from pci import *
@@ -812,6 +813,7 @@ from atm import *
 from structanalyze import *
 from ipcimportancedetail import *
 from bank import *
+from turnstile import *
 from kasan import *
 from kauth import *
 from waitq import *
@@ -820,6 +822,6 @@ from ktrace import *
 from pgtrace import *
 from xnutriage import *
 from kevent import *
+from workqueue import *
 from ntstat import *
 from zonetriage import *
-
index 9db538d38fe351c17a93d08de7e8afccf26f0cda..9ae4701736f9ca4cf8819274021a56a1ad8ea012 100755 (executable)
@@ -88,7 +88,7 @@ kn_state_strings = { 0x0000: '',
                      0x0002: 'QUEUED',
                      0x0004: 'DISABLED',
                      0x0008: 'DROPPING',
-                     0x0010: 'USERWAIT',
+                     0x0010: 'LOCKED',
                      0x0020: 'ATTACHING',
                      0x0040: 'STAYACTIVE',
                      0x0080: 'DEFERDROP',
@@ -96,17 +96,16 @@ kn_state_strings = { 0x0000: '',
                      0x0200: 'DISPATCH',
                      0x0400: 'UDATASPEC',
                      0x0800: 'SUPPRESS',
-                     0x1000: 'STOLENDROP',
+                     0x1000: 'MERGE_QOS',
                      0x2000: 'REQVANISH',
                      0x4000: 'VANISHED' }
 
-kqrequest_state_strings = { 0x01: 'PROCESSING',
+kqrequest_state_strings = { 0x01: 'WORKLOOP',
                             0x02: 'THREQUESTED',
                             0x04: 'WAKEUP',
-                            0x08: 'BOUND',
-                            0x20: 'THOVERCOMMIT',
-                            0x40: 'DRAIN' }
-
+                            0x08: 'THOVERCOMMIT',
+                            0x10: 'R2K_ARMED',
+                            0x20: 'ALLOC_TURNSTILE' }
 thread_qos_short_strings = { 0: '--',
                              1: 'MT',
                              2: 'BG',
@@ -118,7 +117,7 @@ thread_qos_short_strings = { 0: '--',
 
 KQ_WORKQ = 0x40
 KQ_WORKLOOP = 0x80
-KQWQ_NBUCKETS = 22
+KQWQ_NBUCKETS = 8
 KQWL_NBUCKETS = 8
 
 DTYPE_VNODE = 1
@@ -182,7 +181,8 @@ proc_flag_explain_strings = ["!0x00000004 - process is 32 bit",  #only exception
 # string representations for Kobject types
 kobject_types = ['', 'THREAD', 'TASK', 'HOST', 'HOST_PRIV', 'PROCESSOR', 'PSET', 'PSET_NAME', 'TIMER', 'PAGER_REQ', 'DEVICE', 'XMM_OBJECT', 'XMM_PAGER', 'XMM_KERNEL', 'XMM_REPLY', 
                      'NOTDEF 15', 'NOTDEF 16', 'HOST_SEC', 'LEDGER', 'MASTER_DEV', 'TASK_NAME', 'SUBSYTEM', 'IO_DONE_QUE', 'SEMAPHORE', 'LOCK_SET', 'CLOCK', 'CLOCK_CTRL' , 'IOKIT_SPARE', 
-                      'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'IKOT_WORK_INTERVAL']
+                      'NAMED_MEM', 'IOKIT_CON', 'IOKIT_OBJ', 'UPL', 'MEM_OBJ_CONTROL', 'AU_SESSIONPORT', 'FILEPORT', 'LABELH', 'TASK_RESUME', 'VOUCHER', 'VOUCHER_ATTR_CONTROL', 'WORK_INTERVAL',
+                      'UX_HANDLER']
 
 def populate_kobject_types(xnu_dir_path):
     """ Function to read data from header file xnu/osfmk/kern/ipc_kobject.h
@@ -196,6 +196,9 @@ def populate_kobject_types(xnu_dir_path):
         kobject_found_types.append(v[0])
     return kobject_found_types
 
+FSHIFT = 11
+FSCALE = 1 << FSHIFT
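+# Fixed-point scale used for load averages (presumably mirroring FSHIFT/FSCALE in
+# the kernel's sys/param.h); averages are stored as the real value times FSCALE (2048).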
+
 KDBG_BFINIT         = 0x80000000
 KDBG_WRAPPED        = 0x008
 KDCOPYBUF_COUNT     = 8192
index d241564602fc8debfdc79797e5b64801e25c1f68..369fbcace7331d5fe3d0835810a8b412d2a0e7e2 100644 (file)
@@ -17,9 +17,15 @@ ifdef RC_ARCHS
   endif
 endif
 
-ARCH_32 := $(filter-out %64, $(ARCHS))
+# These are convenience functions for filtering based on substrings, as the
+# normal filter functions only accept one wildcard.
+FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string))))
+FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),)))
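+# Example: $(call FILTER_SUBSTRING,64,i386 x86_64 arm64) expands to "x86_64 arm64";
+# FILTER_OUT_SUBSTRING with the same arguments expands to "i386".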
+
+ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS))
+ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS))
+
 ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32))
-ARCH_64 := $(filter %64, $(ARCHS))
 ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64))
 
 DSTROOT?=$(shell /bin/pwd)
index 0cc99e0dcd1fad94c8c5a899b78a7b2eadb0477d..09d13dfc4a0c5325b6ded74ecd6fa206a2eb8356 100644 (file)
@@ -25,12 +25,11 @@ COMMON_TARGETS = unit_tests \
                MPMMTest                \
                packetdrill             \
                affinity                \
-               execperf                \
                superpages              \
                zero-to-n               \
                jitter                  \
                perf_index              \
-               darwintests             \
+               personas                \
                unixconf                \
                testkext/pgokext.kext
 
index c4d1a9bc4910e8010f1d391f338cfca27473dff5..5f45973ab8924f237e43f99e4b27d03cd0957912 100644 (file)
@@ -14,9 +14,15 @@ ifdef RC_ARCHS
   endif
 endif
 
-ARCH_32 := $(filter-out %64, $(ARCHS))
+# These are convenience functions for filtering based on substrings, as the
+# normal filter functions only accept one wildcard.
+FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string))))
+FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),)))
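+# Example: $(call FILTER_SUBSTRING,64,i386 x86_64 arm64) expands to "x86_64 arm64";
+# FILTER_OUT_SUBSTRING with the same arguments expands to "i386".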
+
+ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS))
+ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS))
+
 ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32))
-ARCH_64 := $(filter %64, $(ARCHS))
 ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64))
 
 CFLAGS :=-g -isysroot $(SDKROOT) -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
diff --git a/tools/tests/darwintests/Makefile b/tools/tests/darwintests/Makefile
deleted file mode 100644 (file)
index 24560ab..0000000
+++ /dev/null
@@ -1,195 +0,0 @@
-PROJECT := xnu/darwintests
-
-# When building as part of xnu_tests, we get passed a DSTROOT that's got the
-# unit test path in it already.  But, BASEDSTROOT doesn't, so use that instead.
-ifdef BASEDSTROOT
-override DSTROOT = $(BASEDSTROOT)
-endif
-
-ENABLE_LTE_TESTS=YES
-
-OTHER_LTE_INCLUDE_FILES += \
-       /System/Library/PrivateFrameworks/LoggingSupport.framework, \
-       /System/Library/PrivateFrameworks/MobileKeyBag.framework, \
-       /usr/local/lib/libdarwintest_utils.dylib, \
-       /usr/lib/libapple_crypto.dylib,
-
-DEVELOPER_DIR ?= /Applications/Xcode.app/Contents/Developer/
-
-# the xnu build system will only ever call us with the default target
-.DEFAULT_GOAL := install
-
-include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.common
-
-OTHER_CFLAGS  = -Weverything -Wno-gnu-union-cast -Wno-missing-field-initializers -Wno-partial-availability
-OTHER_CFLAGS += -Wno-missing-noreturn -Wno-vla -Wno-reserved-id-macro -Wno-documentation-unknown-command
-OTHER_CFLAGS += -Wno-padded -Wno-used-but-marked-unused -Wno-covered-switch-default -Wno-nullability-extension
-OTHER_CFLAGS += --std=gnu11 -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
-OTHER_CFLAGS += -DT_NAMESPACE_PREFIX=xnu
-OTHER_CFLAGS += -F $(SDKROOT)/System/Library/PrivateFrameworks
-
-CODESIGN:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign)
-CODESIGN_ALLOCATE:=$(shell xcrun -sdk "$(TARGETSDK)" -find codesign_allocate)
-
-# to have custom compiler flags to
-# target: OTHER_CFLAGS += <my flags>
-
-atm_diagnostic_flag: OTHER_CFLAGS += drop_priv.c
-
-avx: INVALID_ARCHS = i386
-avx: OTHER_CFLAGS += -mavx512f -mavx512bw -mavx512vl
-avx: OTHER_CFLAGS += -I$(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
-avx: CONFIG_FLAGS := $(filter-out -O%,$(CONFIG_FLAGS))
-# Level 2 optimization must be used to prevent compiler from generating
-# invalid instructions when compiling with AVX-512 flags.
-avx: CONFIG_FLAGS += -O2
-# Disable vzeroupper insertion to work around rdar://problem/35035096
-avx: CONFIG_FLAGS += -mllvm -x86-use-vzeroupper=0
-ifneq (osx,$(TARGET_NAME))
-EXCLUDED_SOURCES += avx.c
-endif
-
-backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
-
-data_protection: OTHER_LDFLAGS += -ldarwintest_utils -framework IOKit
-
-kdebug: INVALID_ARCHS = i386
-kdebug: OTHER_LDFLAGS = -framework ktrace
-
-EXCLUDED_SOURCES += drop_priv.c kperf_helpers.c xnu_quick_test_helpers.c
-
-ifeq ($(PLATFORM),iPhoneOS)
-CONFIG_FREEZE_DEFINE:= -DCONFIG_FREEZE
-else
-CONFIG_FREEZE_DEFINE:=
-EXCLUDED_SOURCES += jumbo_va_spaces_28530648.c
-endif
-
-perf_compressor: OTHER_CFLAGS += $(CONFIG_FREEZE_DEFINE)
-perf_compressor: OTHER_LDFLAGS += -ldarwintest_utils
-perf_compressor: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
-
-stackshot: OTHER_LDFLAGS += -lkdd -framework Foundation
-stackshot: INVALID_ARCHS = i386
-
-memorystatus_zone_test: INVALID_ARCHS = i386
-memorystatus_zone_test: OTHER_CFLAGS += -isystem $(SDKROOT)/System/Library/Frameworks/System.framework/PrivateHeaders
-memorystatus_zone_test: OTHER_LDFLAGS += -framework ktrace
-memorystatus_zone_test: OTHER_LDFLAGS += -ldarwintest_utils
-
-kpc: OTHER_LDFLAGS += -framework kperf
-
-kperf: INVALID_ARCHS = i386
-kperf: OTHER_CFLAGS += kperf_helpers.c
-kperf: OTHER_LDFLAGS += -framework kperf -framework kperfdata -framework ktrace -ldarwintest_utils
-
-kperf_backtracing: INVALID_ARCHS = i386
-kperf_backtracing: OTHER_CFLAGS += kperf_helpers.c
-kperf_backtracing: OTHER_LDFLAGS += -framework kperf -framework kperfdata -framework ktrace
-kperf_backtracing: OTHER_LDFLAGS += -framework CoreSymbolication
-
-kevent_qos: OTHER_CFLAGS += -Wno-unused-macros
-kevent_qos: OTHER_CFLAGS += -I $(OBJROOT)/
-
-mach_get_times: OTHER_LDFLAGS += -ldarwintest_utils
-
-monotonic_core: OTHER_LDFLAGS += -framework ktrace
-monotonic_core: INVALID_ARCHS = i386
-
-perf_exit: OTHER_LDFLAGS = -framework ktrace
-perf_exit: INVALID_ARCHS = i386
-perf_exit: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
-
-perf_spawn_fork: CODE_SIGN_ENTITLEMENTS=./private_entitlement.plist
-
-os_thread_self_restrict: os_thread_self_restrict.c os_thread_self_restrict-entitlements.plist
-os_thread_self_restrict: CODE_SIGN_ENTITLEMENTS=os_thread_self_restrict-entitlements.plist
-
-task_inspect: CODE_SIGN_ENTITLEMENTS = task_inspect.entitlements
-task_inspect: OTHER_CFLAGS += -DENTITLED=1
-
-CUSTOM_TARGETS += perf_exit_proc
-
-perf_exit_proc:
-       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) perf_exit_proc.c -o $(SYMROOT)/perf_exit_proc
-
-install-perf_exit_proc: perf_exit_proc
-       mkdir -p $(INSTALLDIR)
-       cp $(SYMROOT)/perf_exit_proc $(INSTALLDIR)/
-
-perf_kdebug: INVALID_ARCHS = i386
-
-stackshot_idle_25570396: INVALID_ARCHS = i386
-stackshot_idle_25570396: OTHER_LDFLAGS += -lkdd -framework Foundation
-
-stackshot_block_owner_14362384: INVALID_ARCHS = i386
-stackshot_block_owner_14362384: OTHER_LDFLAGS += -framework Foundation -lpthread -lkdd
-ifeq ($(PLATFORM),MacOSX)
-stackshot_block_owner_14362384: OTHER_LDFLAGS += -lpcre
-endif
-
-all: $(DSTROOT)/usr/local/bin/kcdata
-
-$(DSTROOT)/usr/local/bin/kcdata: $(SRCROOT)/../../lldbmacros/kcdata.py
-       mkdir -p $(dir $@)
-       cp $< $@
-       chmod a+x $@
-
-xnu_quick_test: OTHER_CFLAGS += xnu_quick_test_helpers.c
-
-ifeq ($(PLATFORM),iPhoneOS)
-OTHER_TEST_TARGETS += jumbo_va_spaces_28530648_unentitled
-jumbo_va_spaces_28530648: CODE_SIGN_ENTITLEMENTS = jumbo_va_spaces_28530648.entitlements
-jumbo_va_spaces_28530648: OTHER_CFLAGS += -DENTITLED=1
-jumbo_va_spaces_28530648: OTHER_LDFLAGS += -ldarwintest_utils
-
-jumbo_va_spaces_28530648_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
-jumbo_va_spaces_28530648_unentitled: jumbo_va_spaces_28530648.c
-       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
-endif
-
-task_info_28439149: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
-
-proc_info: CODE_SIGN_ENTITLEMENTS = ./task_for_pid_entitlement.plist
-proc_info: OTHER_LDFLAGS += -ldarwintest_utils
-
-disk_mount_conditioner: disk_mount_conditioner*
-disk_mount_conditioner: CODE_SIGN_ENTITLEMENTS=./disk_mount_conditioner-entitlements.plist
-disk_mount_conditioner: OTHER_LDFLAGS += -ldarwintest_utils
-
-OTHER_TEST_TARGETS += disk_mount_conditioner_unentitled
-disk_mount_conditioner_unentitled: OTHER_CFLAGS += -DTEST_UNENTITLED
-disk_mount_conditioner_unentitled: OTHER_LDFLAGS += -ldarwintest_utils
-disk_mount_conditioner_unentitled: disk_mount_conditioner.c
-       $(CC) $(DT_CFLAGS) $(OTHER_CFLAGS) $(CFLAGS) $(DT_LDFLAGS) $(OTHER_LDFLAGS) $(LDFLAGS) $< -o $(SYMROOT)/$@
-
-work_interval_test: CODE_SIGN_ENTITLEMENTS = work_interval_test.entitlements
-work_interval_test: OTHER_CFLAGS += -DENTITLED=1
-
-settimeofday_29193041: OTHER_CFLAGS += drop_priv.c
-
-settimeofday_29193041_entitled: CODE_SIGN_ENTITLEMENTS = settimeofday_29193041.entitlements
-settimeofday_29193041_entitled: OTHER_CFLAGS += drop_priv.c
-
-thread_group_set_32261625: OTHER_LDFLAGS = -framework ktrace
-thread_group_set_32261625: INVALID_ARCHS = i386
-
-task_info: CODE_SIGN_ENTITLEMENTS = task_for_pid_entitlement.plist
-
-socket_bind_35243417: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
-socket_bind_35685803: CODE_SIGN_ENTITLEMENTS = network_entitlements.plist
-
-ifneq (osx,$(TARGET_NAME))
-EXCLUDED_SOURCES += no32exec_35914211.c no32exec_35914211_helper.c
-endif
-
-no32exec_35914211_helper:  INVALID_ARCHS = x86_64
-no32exec_35914211:  INVALID_ARCHS = i386
-
-ifneq ($(PLATFORM),BridgeOS)
-EXCLUDED_SOURCES += remote_time.c
-else
-remote_time: INVALID_ARCHS = armv7 armv7s arm64_32
-endif
-
-include $(DEVELOPER_DIR)/AppleInternal/Makefiles/darwintest/Makefile.targets
diff --git a/tools/tests/darwintests/atm_diagnostic_flag.c b/tools/tests/darwintests/atm_diagnostic_flag.c
deleted file mode 100644 (file)
index 864ffd6..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-#include <darwintest.h>
-
-#include <mach/mach_error.h>
-#include <mach/mach_host.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.debugging"));
-
-/*
- * The low 8 bits may be in use, so modify one
- * of the upper 8 bits to ensure round-tripping. 
- */
-#define LIBTRACE_PRIVATE_DATA  0x01000000
-
-extern void drop_priv(void);
-
-static bool _needs_reset;
-static uint32_t _original;
-
-static uint32_t
-_save_atm_diagnostic_flag(void)
-{
-    kern_return_t kr;
-    kr = host_get_atm_diagnostic_flag(mach_host_self(), &_original);
-    T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_get_atm_diagnostic_flag()");
-    T_LOG("Original ATM diagnostic flag: 0x%08x", _original);
-    return _original;
-}
-
-static kern_return_t
-_mutate_atm_diagnostic_flag(uint32_t v)
-{
-    T_LOG("Try to set ATM diagnostic flag to: 0x%08x", v);
-    kern_return_t kr = host_set_atm_diagnostic_flag(mach_host_self(), v);
-    if (kr == KERN_SUCCESS) _needs_reset = true;
-    return kr;
-}
-
-static void
-_reset_atm_diagnostic_flag(void)
-{
-    if (!_needs_reset) return;
-    T_LOG("Reset ATM diagnostic flag to: 0x%08x", _original);
-    kern_return_t kr;
-    kr = host_set_atm_diagnostic_flag(mach_host_self(), _original);
-    if (kr != KERN_SUCCESS) {
-        T_ASSERT_FAIL("host_set_atm_diagnostic_flag() failed: %s",
-                mach_error_string(kr));
-    }
-}
-
-T_DECL(toggle_atm_diagnostic_flag,
-        "change the atm_diagnostic_flag, which should use the commpage",
-        T_META_ASROOT(true))
-{
-    T_ATEND(_reset_atm_diagnostic_flag);
-    uint32_t f = _save_atm_diagnostic_flag();
-    f ^= LIBTRACE_PRIVATE_DATA;
-    kern_return_t kr = _mutate_atm_diagnostic_flag(f);
-    if (kr == KERN_NOT_SUPPORTED) {
-        T_SKIP("Seems ATM is disabled on this platform. "
-                "Ignoring host_set_atm_diagnostic_flag functionality. "
-                "Bailing gracefully.");
-    }
-    T_EXPECT_MACH_SUCCESS(kr, "Set atm_diagnostic_flag");
-}
-
-T_DECL(unprivileged_atm_diagnostic_flag,
-        "expect to fail to set the atm_diagnostic_flag",
-        T_META_ASROOT(false))
-{
-    drop_priv();
-    T_ATEND(_reset_atm_diagnostic_flag);
-    uint32_t f = _save_atm_diagnostic_flag();
-    f ^= LIBTRACE_PRIVATE_DATA;
-    kern_return_t kr = _mutate_atm_diagnostic_flag(f);
-    T_EXPECT_MACH_ERROR(KERN_INVALID_ARGUMENT, kr,
-            "Deny change to atm_diagnostic_flag");
-}
diff --git a/tools/tests/darwintests/avx.c b/tools/tests/darwintests/avx.c
deleted file mode 100644 (file)
index 0041e99..0000000
+++ /dev/null
@@ -1,736 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/time.h>
-#include <sys/mman.h>
-#include <immintrin.h>
-#include <mach/mach.h>
-#include <stdio.h>
-#include <string.h>
-#include <err.h>
-#include <i386/cpu_capabilities.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.intel"),
-       T_META_CHECK_LEAKS(false)
-);
-
-#define NORMAL_RUN_TIME  (10)
-#define LONG_RUN_TIME    (10*60)
-#define TIMEOUT_OVERHEAD (10)
-
-volatile boolean_t checking = true;
-char vec_str_buf[8196];
-char karray_str_buf[1024];
-
-/*
- * ymm defines/globals/prototypes
- */
-#define        STOP_COOKIE_256 0x01234567
-#if defined(__x86_64__)
-#define YMM_MAX                        16
-#define X86_AVX_STATE_T                x86_avx_state64_t
-#define X86_AVX_STATE_COUNT    x86_AVX_STATE64_COUNT
-#define X86_AVX_STATE_FLAVOR   x86_AVX_STATE64
-#define        MCONTEXT_SIZE_256       sizeof(struct __darwin_mcontext_avx64)
-#else
-#define YMM_MAX                        8
-#define X86_AVX_STATE_T                x86_avx_state32_t
-#define X86_AVX_STATE_COUNT    x86_AVX_STATE32_COUNT
-#define X86_AVX_STATE_FLAVOR   x86_AVX_STATE32
-#define        MCONTEXT_SIZE_256       sizeof(struct __darwin_mcontext_avx32)
-#endif
-#define VECTOR256 __m256
-#define VEC256ALIGN __attribute ((aligned(32)))
-static inline void populate_ymm(void);
-static inline void check_ymm(void);
-VECTOR256      vec256array0[YMM_MAX] VEC256ALIGN;
-VECTOR256      vec256array1[YMM_MAX] VEC256ALIGN;
-VECTOR256      vec256array2[YMM_MAX] VEC256ALIGN;
-VECTOR256      vec256array3[YMM_MAX] VEC256ALIGN;
-
-/*
- * zmm defines/globals/prototypes
- */
-#define STOP_COOKIE_512 0x0123456789abcdefULL
-#if defined(__x86_64__)
-#define ZMM_MAX                        32
-#define X86_AVX512_STATE_T     x86_avx512_state64_t
-#define X86_AVX512_STATE_COUNT x86_AVX512_STATE64_COUNT
-#define X86_AVX512_STATE_FLAVOR        x86_AVX512_STATE64
-#define        MCONTEXT_SIZE_512       sizeof(struct __darwin_mcontext_avx512_64)
-#else
-#define ZMM_MAX                        8
-#define X86_AVX512_STATE_T     x86_avx512_state32_t
-#define X86_AVX512_STATE_COUNT x86_AVX512_STATE32_COUNT
-#define X86_AVX512_STATE_FLAVOR        x86_AVX512_STATE32
-#define        MCONTEXT_SIZE_512       sizeof(struct __darwin_mcontext_avx512_32)
-#endif
-#define VECTOR512 __m512
-#define VEC512ALIGN __attribute ((aligned(64)))
-#define OPMASK uint64_t
-#define KARRAY_MAX              8
-static inline void populate_zmm(void);
-static inline void populate_opmask(void);
-static inline void check_zmm(void);
-VECTOR512      vec512array0[ZMM_MAX] VEC512ALIGN;
-VECTOR512      vec512array1[ZMM_MAX] VEC512ALIGN;
-VECTOR512      vec512array2[ZMM_MAX] VEC512ALIGN;
-VECTOR512      vec512array3[ZMM_MAX] VEC512ALIGN;
-OPMASK karray0[8];
-OPMASK karray1[8];
-OPMASK karray2[8];
-OPMASK karray3[8];
-
-
-/*
- * Common functions
- */
-
-int
-memcmp_unoptimized(const void *s1, const void *s2, size_t n) {
-       if (n != 0) {
-               const unsigned char *p1 = s1, *p2 = s2;
-               do {
-                       if (*p1++ != *p2++)
-                               return (*--p1 - *--p2);
-               } while (--n != 0);
-       }
-       return (0);
-}
-
-void
-start_timer(int seconds, void (*handler)(int, siginfo_t *, void *)) {
-       struct sigaction sigalrm_action = {
-               .sa_sigaction = handler,
-               .sa_flags = SA_RESTART,
-               .sa_mask = 0
-       };
-       struct itimerval timer = {
-               .it_value.tv_sec = seconds,
-               .it_value.tv_usec = 0,
-               .it_interval.tv_sec = 0,
-               .it_interval.tv_usec = 0
-       };
-       T_QUIET; T_WITH_ERRNO;
-       T_ASSERT_NE(sigaction(SIGALRM, &sigalrm_action, NULL), -1, NULL);
-       T_QUIET; T_WITH_ERRNO;
-       T_ASSERT_NE(setitimer(ITIMER_REAL, &timer, NULL), -1, NULL);
-}
-
-void
-require_avx(void) {
-       if((_get_cpu_capabilities() & kHasAVX1_0) != kHasAVX1_0) {
-               T_SKIP("AVX not supported on this system");
-       }
-}
-
-void
-require_avx512(void) {
-       if((_get_cpu_capabilities() & kHasAVX512F) != kHasAVX512F) {
-               T_SKIP("AVX-512 not supported on this system");
-       }
-}
-
-/*
- * ymm functions
- */
-
-static inline void
-store_ymm(VECTOR256 *vec256array) {
-       int i = 0;
-           __asm__ volatile("vmovaps  %%ymm0, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm1, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm2, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm3, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm4, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm5, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm6, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm7, %0" :"=m" (vec256array[i]));
-#if defined(__x86_64__)
-       i++;__asm__ volatile("vmovaps  %%ymm8, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm9, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm10, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm11, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm12, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm13, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm14, %0" :"=m" (vec256array[i]));
-       i++;__asm__ volatile("vmovaps  %%ymm15, %0" :"=m" (vec256array[i]));
-#endif
-}
-
-static inline void
-populate_ymm(void) {
-       int j;
-       uint32_t p[8] VEC256ALIGN;
-
-       for (j = 0; j < (int) (sizeof(p)/sizeof(p[0])); j++)
-               p[j] = getpid();
-
-       p[0] = 0x22222222;
-       p[7] = 0x77777777;
-       __asm__ volatile("vmovaps  %0, %%ymm0" :: "m" (*(__m256i*)p) : "ymm0");
-       __asm__ volatile("vmovaps  %0, %%ymm1" :: "m" (*(__m256i*)p) : "ymm1");
-       __asm__ volatile("vmovaps  %0, %%ymm2" :: "m" (*(__m256i*)p) : "ymm2");
-       __asm__ volatile("vmovaps  %0, %%ymm3" :: "m" (*(__m256i*)p) : "ymm3");
-
-       p[0] = 0x44444444;
-       p[7] = 0xEEEEEEEE;
-       __asm__ volatile("vmovaps  %0, %%ymm4" :: "m" (*(__m256i*)p) : "ymm4");
-       __asm__ volatile("vmovaps  %0, %%ymm5" :: "m" (*(__m256i*)p) : "ymm5");
-       __asm__ volatile("vmovaps  %0, %%ymm6" :: "m" (*(__m256i*)p) : "ymm6");
-       __asm__ volatile("vmovaps  %0, %%ymm7" :: "m" (*(__m256i*)p) : "ymm7");
-
-#if defined(__x86_64__)
-       p[0] = 0x88888888;
-       p[7] = 0xAAAAAAAA;
-       __asm__ volatile("vmovaps  %0, %%ymm8" :: "m" (*(__m256i*)p) : "ymm8");
-       __asm__ volatile("vmovaps  %0, %%ymm9" :: "m" (*(__m256i*)p) : "ymm9");
-       __asm__ volatile("vmovaps  %0, %%ymm10" :: "m" (*(__m256i*)p) : "ymm10");
-       __asm__ volatile("vmovaps  %0, %%ymm11" :: "m" (*(__m256i*)p) : "ymm11");
-
-       p[0] = 0xBBBBBBBB;
-       p[7] = 0xCCCCCCCC;
-       __asm__ volatile("vmovaps  %0, %%ymm12" :: "m" (*(__m256i*)p) : "ymm12");
-       __asm__ volatile("vmovaps  %0, %%ymm13" :: "m" (*(__m256i*)p) : "ymm13");
-       __asm__ volatile("vmovaps  %0, %%ymm14" :: "m" (*(__m256i*)p) : "ymm14");
-       __asm__ volatile("vmovaps  %0, %%ymm15" :: "m" (*(__m256i*)p) : "ymm15");
-#endif
-
-       store_ymm(vec256array0);
-}
-
-void
-vec256_to_string(VECTOR256 *vec, char *buf) {
-       unsigned int vec_idx = 0;
-       unsigned int buf_idx = 0;
-       int ret = 0;
-
-       for (vec_idx = 0; vec_idx < YMM_MAX; vec_idx++) {
-               uint64_t a[4];
-               bcopy(&vec[vec_idx], &a[0], sizeof(a));
-               ret = sprintf(
-                       buf + buf_idx,
-                       "0x%016llx:%016llx:%016llx:%016llx\n",
-                       a[0], a[1], a[2], a[3]
-               );
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
-               buf_idx += ret;
-       }
-}
-
-void
-assert_ymm_eq(void *a, void *b, int c) {
-       if(memcmp_unoptimized(a, b, c)) {
-               vec256_to_string(a, vec_str_buf);
-               T_LOG("Compare failed, vector A:\n%s", vec_str_buf);
-               vec256_to_string(b, vec_str_buf);
-               T_LOG("Compare failed, vector B:\n%s", vec_str_buf);
-               T_ASSERT_FAIL("vectors not equal");
-       }
-}
-
-void
-check_ymm(void)  {
-       uint32_t *p = (uint32_t *) &vec256array1[7];
-       store_ymm(vec256array1);
-       if (p[0] == STOP_COOKIE_256) {
-               return;
-       }
-       assert_ymm_eq(vec256array0, vec256array1, sizeof(vec256array0));
-}
-
-static void
-copy_ymm_state_to_vector(X86_AVX_STATE_T *sp,  VECTOR256 *vp) {
-       int     i;
-       struct  __darwin_xmm_reg *xmm  = &sp->__fpu_xmm0;
-       struct  __darwin_xmm_reg *ymmh = &sp->__fpu_ymmh0;
-
-       for (i = 0; i < YMM_MAX; i++ ) {
-               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
-               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
-       }
-}
-
-static void
-ymm_sigalrm_handler(int signum __unused, siginfo_t *info __unused, void *ctx)
-{
-       ucontext_t *contextp = (ucontext_t *) ctx;
-       mcontext_t mcontext = contextp->uc_mcontext;
-       X86_AVX_STATE_T *avx_state = (X86_AVX_STATE_T *) &mcontext->__fs;
-       uint32_t *xp = (uint32_t *) &avx_state->__fpu_xmm7;
-       uint32_t *yp = (uint32_t *) &avx_state->__fpu_ymmh7;
-
-       T_LOG("Got SIGALRM");
-
-       /* Check for AVX state */
-       T_QUIET;
-       T_ASSERT_GE(contextp->uc_mcsize, MCONTEXT_SIZE_256, "check context size");
-
-       /* Check that the state in the context is what's set and expected */
-       copy_ymm_state_to_vector(avx_state, vec256array3);
-       assert_ymm_eq(vec256array3, vec256array0, sizeof(vec256array1));
-
-       /* Change the context and break the main loop */
-       xp[0] = STOP_COOKIE_256;
-       yp[0] = STOP_COOKIE_256;
-       checking = FALSE;
-}
-
-void
-ymm_integrity(int time) {
-       mach_msg_type_number_t avx_count = X86_AVX_STATE_COUNT;
-       kern_return_t kret;
-       X86_AVX_STATE_T avx_state, avx_state2;
-       mach_port_t ts = mach_thread_self();
-
-       bzero(&avx_state, sizeof(avx_state));
-       bzero(&avx_state2, sizeof(avx_state));
-
-       kret = thread_get_state(
-               ts, X86_AVX_STATE_FLAVOR, (thread_state_t)&avx_state, &avx_count
-       );
-
-       store_ymm(vec256array2);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
-       vec256_to_string(vec256array2, vec_str_buf);
-       T_LOG("Initial state:\n%s", vec_str_buf);
-
-       copy_ymm_state_to_vector(&avx_state, vec256array1);
-       assert_ymm_eq(vec256array2, vec256array1, sizeof(vec256array1));
-
-       populate_ymm();
-
-       kret = thread_get_state(
-               ts, X86_AVX_STATE_FLAVOR, (thread_state_t)&avx_state2, &avx_count
-       );
-
-       store_ymm(vec256array2);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
-       vec256_to_string(vec256array2, vec_str_buf);
-       T_LOG("Populated state:\n%s", vec_str_buf);
-
-       copy_ymm_state_to_vector(&avx_state2, vec256array1);
-       assert_ymm_eq(vec256array2, vec256array1, sizeof(vec256array0));
-
-       T_LOG("Running for %ds…", time);
-       start_timer(time, ymm_sigalrm_handler);
-
-       /* re-populate because printing mucks up XMMs */
-       populate_ymm();
-
-       /* Check state until timer fires */
-       while(checking) {
-               check_ymm();
-       }
-
-       /* Check that the sig handler changed our AVX state */
-       store_ymm(vec256array1);
-
-       uint32_t *p = (uint32_t *) &vec256array1[7];
-       if (p[0] != STOP_COOKIE_256 ||
-           p[4] != STOP_COOKIE_256) {
-               vec256_to_string(vec256array1, vec_str_buf);
-               T_ASSERT_FAIL("sigreturn failed to stick");
-               T_LOG("State:\n%s", vec_str_buf);
-       }
-
-       T_LOG("Ran for %ds", time);
-       T_PASS("No ymm register corruption occurred");
-}
-
-/*
- * zmm functions
- */
-
-static inline void
-store_opmask(OPMASK k[]) {
-       __asm__ volatile("kmovq %%k0, %0" :"=m" (k[0]));
-       __asm__ volatile("kmovq %%k1, %0" :"=m" (k[1]));
-       __asm__ volatile("kmovq %%k2, %0" :"=m" (k[2]));
-       __asm__ volatile("kmovq %%k3, %0" :"=m" (k[3]));
-       __asm__ volatile("kmovq %%k4, %0" :"=m" (k[4]));
-       __asm__ volatile("kmovq %%k5, %0" :"=m" (k[5]));
-       __asm__ volatile("kmovq %%k6, %0" :"=m" (k[6]));
-       __asm__ volatile("kmovq %%k7, %0" :"=m" (k[7]));
-}
-
-static inline void
-store_zmm(VECTOR512 *vecarray) {
-       int i = 0;
-           __asm__ volatile("vmovaps  %%zmm0, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm1, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm2, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm3, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm4, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm5, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm6, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm7, %0" :"=m" (vecarray[i]));
-#if defined(__x86_64__)
-       i++;__asm__ volatile("vmovaps  %%zmm8, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm9, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm10, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm11, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm12, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm13, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm14, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm15, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm16, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm17, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm18, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm19, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm20, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm21, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm22, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm23, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm24, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm25, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm26, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm27, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm28, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm29, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm30, %0" :"=m" (vecarray[i]));
-       i++;__asm__ volatile("vmovaps  %%zmm31, %0" :"=m" (vecarray[i]));
-#endif
-}
-
-static inline void
-populate_opmask(void) {
-       uint64_t k[8];
-
-       for (int j = 0; j < 8; j++)
-               k[j] = ((uint64_t) getpid() << 32) + (0x11111111 * j);
-
-       __asm__ volatile("kmovq %0, %%k0" : :"m" (k[0]));
-       __asm__ volatile("kmovq %0, %%k1" : :"m" (k[1]));
-       __asm__ volatile("kmovq %0, %%k2" : :"m" (k[2]));
-       __asm__ volatile("kmovq %0, %%k3" : :"m" (k[3]));
-       __asm__ volatile("kmovq %0, %%k4" : :"m" (k[4]));
-       __asm__ volatile("kmovq %0, %%k5" : :"m" (k[5]));
-       __asm__ volatile("kmovq %0, %%k6" : :"m" (k[6]));
-       __asm__ volatile("kmovq %0, %%k7" : :"m" (k[7]));
-
-       store_opmask(karray0);
-}
-
-static inline void
-populate_zmm(void) {
-       int j;
-       uint64_t p[8] VEC512ALIGN;
-
-       for (j = 0; j < (int) (sizeof(p)/sizeof(p[0])); j++)
-               p[j] = ((uint64_t) getpid() << 32) + getpid();
-
-       p[0] = 0x0000000000000000ULL;
-       p[2] = 0x4444444444444444ULL;
-       p[4] = 0x8888888888888888ULL;
-       p[7] = 0xCCCCCCCCCCCCCCCCULL;
-       __asm__ volatile("vmovaps  %0, %%zmm0" :: "m" (*(__m256i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm1" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm2" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm3" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm4" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm5" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm6" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm7" :: "m" (*(__m512i*)p) );
-
-#if defined(__x86_64__)
-       p[0] = 0x1111111111111111ULL;
-       p[2] = 0x5555555555555555ULL;
-       p[4] = 0x9999999999999999ULL;
-       p[7] = 0xDDDDDDDDDDDDDDDDULL;
-       __asm__ volatile("vmovaps  %0, %%zmm8" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm9" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm10" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm11" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm12" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm13" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm14" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm15" :: "m" (*(__m512i*)p) );
-
-       p[0] = 0x2222222222222222ULL;
-       p[2] = 0x6666666666666666ULL;
-       p[4] = 0xAAAAAAAAAAAAAAAAULL;
-       p[7] = 0xEEEEEEEEEEEEEEEEULL;
-       __asm__ volatile("vmovaps  %0, %%zmm16" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm17" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm18" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm19" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm20" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm21" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm22" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm23" :: "m" (*(__m512i*)p) );
-
-       p[0] = 0x3333333333333333ULL;
-       p[2] = 0x7777777777777777ULL;
-       p[4] = 0xBBBBBBBBBBBBBBBBULL;
-       p[7] = 0xFFFFFFFFFFFFFFFFULL;
-       __asm__ volatile("vmovaps  %0, %%zmm24" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm25" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm26" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm27" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm28" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm29" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm30" :: "m" (*(__m512i*)p) );
-       __asm__ volatile("vmovaps  %0, %%zmm31" :: "m" (*(__m512i*)p) );
-#endif
-
-       store_zmm(vec512array0);
-}
-
-void
-vec512_to_string(VECTOR512 *vec, char *buf) {
-       unsigned int vec_idx = 0;
-       unsigned int buf_idx = 0;
-       int ret = 0;
-
-       for (vec_idx = 0; vec_idx < ZMM_MAX; vec_idx++) {
-               uint64_t a[8];
-               bcopy(&vec[vec_idx], &a[0], sizeof(a));
-               ret = sprintf(
-                       buf + buf_idx,
-                       "0x%016llx:%016llx:%016llx:%016llx:"
-                       "%016llx:%016llx:%016llx:%016llx%s",
-                       a[0], a[1], a[2], a[3], a[4], a[5], a[6], a[7],
-                       vec_idx < ZMM_MAX - 1 ? "\n" : ""
-               );
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
-               buf_idx += ret;
-       }
-}
-
-void
-opmask_to_string(OPMASK *karray, char *buf) {
-       unsigned int karray_idx = 0;
-       unsigned int buf_idx = 0;
-       int ret = 0;
-
-       for(karray_idx = 0; karray_idx < KARRAY_MAX; karray_idx++) {
-               ret = sprintf(
-                       buf + buf_idx,
-                       "k%d: 0x%016llx%s",
-                       karray_idx, karray[karray_idx],
-                       karray_idx < KARRAY_MAX - 1 ? "\n" : ""
-               );
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sprintf()");
-               buf_idx += ret;
-       }
-}
-
-static void
-assert_zmm_eq(void *a, void *b, int c) {
-       if(memcmp_unoptimized(a, b, c)) {
-               vec512_to_string(a, vec_str_buf);
-               T_LOG("Compare failed, vector A:\n%s", vec_str_buf);
-               vec512_to_string(b, vec_str_buf);
-               T_LOG("Compare failed, vector B:\n%s", vec_str_buf);
-               T_ASSERT_FAIL("Vectors not equal");
-       }
-}
-
-static void
-assert_opmask_eq(OPMASK *a, OPMASK *b) {
-       for (int i = 0; i < KARRAY_MAX; i++) {
-               if (a[i] != b[i]) {
-                       opmask_to_string(a, karray_str_buf);
-                       T_LOG("Compare failed, opmask A:\n%s", karray_str_buf);
-                       opmask_to_string(b, karray_str_buf);
-                       T_LOG("Compare failed, opmask B:\n%s", karray_str_buf);
-                       T_ASSERT_FAIL("opmasks not equal");
-               }
-       }
-}
-
-void
-check_zmm(void)  {
-       uint64_t *p = (uint64_t *) &vec512array1[7];
-       store_opmask(karray1);
-       store_zmm(vec512array1);
-       if (p[0] == STOP_COOKIE_512) {
-               return;
-       }
-
-       assert_zmm_eq(vec512array0, vec512array1, sizeof(vec512array0));
-       assert_opmask_eq(karray0, karray1);
-}
-
-static void copy_state_to_opmask(X86_AVX512_STATE_T *sp, OPMASK *op) {
-       OPMASK *k = (OPMASK *) &sp->__fpu_k0;
-       for (int i = 0; i < KARRAY_MAX; i++) {
-               bcopy(&k[i], &op[i], sizeof(*op));
-       }
-}
-
-static void copy_zmm_state_to_vector(X86_AVX512_STATE_T *sp,  VECTOR512 *vp) {
-       int     i;
-       struct  __darwin_xmm_reg *xmm  = &sp->__fpu_xmm0;
-       struct  __darwin_xmm_reg *ymmh = &sp->__fpu_ymmh0;
-       struct  __darwin_ymm_reg *zmmh = &sp->__fpu_zmmh0;
-#if defined(__x86_64__)
-       struct  __darwin_zmm_reg *zmm  = &sp->__fpu_zmm16;
-
-       for (i = 0; i < ZMM_MAX/2; i++ ) {
-               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
-               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
-               bcopy(&zmmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*zmmh)), sizeof(*zmmh));
-               bcopy(&zmm[i], &vp[(ZMM_MAX/2)+i], sizeof(*zmm));
-       }
-#else
-       for (i = 0; i < ZMM_MAX; i++ ) {
-               bcopy(&xmm[i],  &vp[i], sizeof(*xmm));
-               bcopy(&ymmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*ymmh)), sizeof(*ymmh));
-               bcopy(&zmmh[i], (void *) ((uint64_t)&vp[i] + sizeof(*zmmh)), sizeof(*zmmh));
-       }
-#endif
-}
-
-static void
-zmm_sigalrm_handler(int signum __unused, siginfo_t *info __unused, void *ctx)
-{
-       ucontext_t *contextp = (ucontext_t *) ctx;
-       mcontext_t mcontext = contextp->uc_mcontext;
-       X86_AVX512_STATE_T *avx_state = (X86_AVX512_STATE_T *) &mcontext->__fs;
-       uint64_t *xp = (uint64_t *) &avx_state->__fpu_xmm7;
-       uint64_t *yp = (uint64_t *) &avx_state->__fpu_ymmh7;
-       uint64_t *zp = (uint64_t *) &avx_state->__fpu_zmmh7;
-       uint64_t *kp = (uint64_t *) &avx_state->__fpu_k0;
-
-       /* Check for AVX512 state */
-       T_QUIET;
-       T_ASSERT_GE(contextp->uc_mcsize, MCONTEXT_SIZE_512, "check context size");
-
-       /* Check that the state in the context is what's set and expected */
-       copy_zmm_state_to_vector(avx_state, vec512array3);
-       assert_zmm_eq(vec512array3, vec512array0, sizeof(vec512array1));
-       copy_state_to_opmask(avx_state, karray3);
-       assert_opmask_eq(karray3, karray0);
-
-       /* Change the context and break the main loop */
-       xp[0] = STOP_COOKIE_512;
-       yp[0] = STOP_COOKIE_512;
-       zp[0] = STOP_COOKIE_512;
-       kp[7] = STOP_COOKIE_512;
-       checking = FALSE;
-}
-
-void
-zmm_integrity(int time) {
-       mach_msg_type_number_t avx_count = X86_AVX512_STATE_COUNT;
-       kern_return_t kret;
-       X86_AVX512_STATE_T avx_state, avx_state2;
-       mach_port_t ts = mach_thread_self();
-
-       bzero(&avx_state, sizeof(avx_state));
-       bzero(&avx_state2, sizeof(avx_state));
-
-       store_zmm(vec512array2);
-       store_opmask(karray2);
-
-       kret = thread_get_state(
-               ts, X86_AVX512_STATE_FLAVOR, (thread_state_t)&avx_state, &avx_count
-       );
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
-       vec512_to_string(vec512array2, vec_str_buf);
-       opmask_to_string(karray2, karray_str_buf);
-       T_LOG("Initial state:\n%s\n%s", vec_str_buf, karray_str_buf);
-
-       copy_zmm_state_to_vector(&avx_state, vec512array1);
-       assert_zmm_eq(vec512array2, vec512array1, sizeof(vec512array1));
-       copy_state_to_opmask(&avx_state, karray1);
-       assert_opmask_eq(karray2, karray1);
-
-       populate_zmm();
-       populate_opmask();
-
-       kret = thread_get_state(
-               ts, X86_AVX512_STATE_FLAVOR, (thread_state_t)&avx_state2, &avx_count
-       );
-
-       store_zmm(vec512array2);
-       store_opmask(karray2);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kret, "thread_get_state()");
-       vec512_to_string(vec512array2, vec_str_buf);
-       opmask_to_string(karray2, karray_str_buf);
-       T_LOG("Populated state:\n%s\n%s", vec_str_buf, karray_str_buf);
-
-       copy_zmm_state_to_vector(&avx_state2, vec512array1);
-       assert_zmm_eq(vec512array2, vec512array1, sizeof(vec512array1));
-       copy_state_to_opmask(&avx_state2, karray1);
-       assert_opmask_eq(karray2, karray1);
-
-       T_LOG("Running for %ds…", time);
-       start_timer(time, zmm_sigalrm_handler);
-
-       /* re-populate because printing mucks up XMMs */
-       populate_zmm();
-       populate_opmask();
-
-       /* Check state until timer fires */
-       while(checking) {
-               check_zmm();
-       }
-
-       /* Check that the sig handler changed our AVX state */
-       store_zmm(vec512array1);
-       store_opmask(karray1);
-
-       uint64_t *p = (uint64_t *) &vec512array1[7];
-       if (p[0] != STOP_COOKIE_512 ||
-           p[2] != STOP_COOKIE_512 ||
-           p[4] != STOP_COOKIE_512 ||
-           karray1[7] != STOP_COOKIE_512) {
-               vec512_to_string(vec512array1, vec_str_buf);
-               opmask_to_string(karray1, karray_str_buf);
-               T_ASSERT_FAIL("sigreturn failed to stick");
-               T_LOG("State:\n%s\n%s", vec_str_buf, karray_str_buf);
-       }
-
-       T_LOG("Ran for %ds", time);
-       T_PASS("No zmm register corruption occurred");
-}
-
-/*
- * Main test declarations
- */
-T_DECL(ymm_integrity,
-       "Quick soak test to verify that AVX "
-       "register state is maintained correctly",
-       T_META_TIMEOUT(NORMAL_RUN_TIME + TIMEOUT_OVERHEAD)) {
-       require_avx();
-       ymm_integrity(NORMAL_RUN_TIME);
-}
-
-T_DECL(ymm_integrity_stress,
-       "Extended soak test to verify that AVX "
-       "register state is maintained correctly",
-       T_META_TIMEOUT(LONG_RUN_TIME + TIMEOUT_OVERHEAD),
-       T_META_ENABLED(false)) {
-       require_avx();
-       ymm_integrity(LONG_RUN_TIME);
-}
-
-T_DECL(zmm_integrity,
-       "Quick soak test to verify that AVX-512 "
-       "register state is maintained correctly",
-       T_META_TIMEOUT(NORMAL_RUN_TIME + TIMEOUT_OVERHEAD)) {
-       require_avx512();
-       zmm_integrity(NORMAL_RUN_TIME);
-}
-
-T_DECL(zmm_integrity_stress,
-       "Extended soak test to verify that AVX-512 "
-       "register state is maintained correctly",
-       T_META_TIMEOUT(LONG_RUN_TIME + TIMEOUT_OVERHEAD),
-       T_META_ENABLED(false)) {
-       require_avx512();
-       zmm_integrity(LONG_RUN_TIME);
-}
-
diff --git a/tools/tests/darwintests/backtracing.c b/tools/tests/darwintests/backtracing.c
deleted file mode 100644 (file)
index 614ec12..0000000
+++ /dev/null
@@ -1,172 +0,0 @@
-#include <CoreSymbolication/CoreSymbolication.h>
-#include <darwintest.h>
-#include <dispatch/dispatch.h>
-#include <execinfo.h>
-#include <pthread.h>
-#include <sys/sysctl.h>
-
-#define USER_FRAMES (12)
-
-#define NON_RECURSE_FRAMES (5)
-
-static const char *user_bt[USER_FRAMES] = {
-    NULL, NULL,
-    "backtrace_thread",
-    "recurse_a", "recurse_b", "recurse_a", "recurse_b",
-    "recurse_a", "recurse_b", "recurse_a",
-    "expect_stack", NULL
-};
-
-static void
-expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
-    unsigned long addr, unsigned int bt_idx, unsigned int max_frames)
-{
-    const char *name;
-    unsigned int frame_idx = max_frames - bt_idx - 1;
-
-    if (bt[frame_idx] == NULL) {
-        T_LOG("frame %2u: skipping system frame", frame_idx);
-        return;
-    }
-
-    if (CSIsNull(symbol)) {
-        T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
-        return;
-    }
-
-    if (frame_idx >= bt_len) {
-        T_FAIL("unexpected frame '%s' (%#lx) at index %u",
-            CSSymbolGetName(symbol), addr, frame_idx);
-        return;
-    }
-
-    name = CSSymbolGetName(symbol);
-    T_QUIET; T_ASSERT_NOTNULL(name, NULL);
-    T_EXPECT_EQ_STR(name, bt[frame_idx],
-        "frame %2u: saw '%s', expected '%s'",
-        frame_idx, name, bt[frame_idx]);
-}
-
-static void __attribute__((noinline,not_tail_called))
-expect_stack(void)
-{
-    uint64_t bt[USER_FRAMES] = { 0 };
-    unsigned int bt_len = USER_FRAMES;
-    int err;
-    size_t bt_filled;
-
-    static dispatch_once_t expect_stacks_once;
-    static bool k64;
-    static CSSymbolicatorRef user_symb;
-
-    dispatch_once(&expect_stacks_once, ^(void) {
-        int errb;
-        int mib[] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, 0 /* kernproc */ };
-
-        struct kinfo_proc kp;
-        size_t len;
-
-        len = sizeof(kp);
-        errb = sysctl(mib, sizeof(mib) / sizeof(mib[0]), &kp, &len, NULL, 0);
-        T_QUIET; T_ASSERT_POSIX_SUCCESS(errb,
-            "sysctl({ CTL_KERN, KERN_PROC, KERN_PROC_PID, 0})");
-
-        k64 = kp.kp_proc.p_flag & P_LP64;
-        T_LOG("executing with a %s-bit kernel", k64 ? "64" : "32");
-
-        user_symb = CSSymbolicatorCreateWithTask(mach_task_self());
-        T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL);
-        T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL);
-    });
-
-    bt_filled = USER_FRAMES;
-    err = sysctlbyname("kern.backtrace.user", bt, &bt_filled, NULL, 0);
-    if (err == ENOENT) {
-        T_SKIP("release kernel: kern.backtrace.user sysctl returned ENOENT");
-    }
-    T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(\"kern.backtrace.user\")");
-
-    bt_len = (unsigned int)bt_filled;
-    T_EXPECT_EQ(bt_len, (unsigned int)USER_FRAMES,
-        "%u frames should be present in backtrace", (unsigned int)USER_FRAMES);
-
-    for (unsigned int i = 0; i < bt_len; i++) {
-        uintptr_t addr;
-#if !defined(__LP64__)
-        /*
-         * Backtrace frames come out as kernel words; convert them back to user
-         * uintptr_t for 32-bit processes.
-         */
-        if (k64) {
-            addr = (uintptr_t)(bt[i]);
-        } else {
-            addr = (uintptr_t)(((uint32_t *)bt)[i]);
-        }
-#else /* defined(__LP64__) */
-        addr = (uintptr_t)bt[i];
-#endif /* !defined(__LP64__) */
-
-        CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime(
-            user_symb, addr, kCSNow);
-        expect_frame(user_bt, USER_FRAMES, symbol, addr, i, bt_len);
-    }
-}
-
-static int __attribute__((noinline,not_tail_called))
-recurse_a(unsigned int frames);
-static int __attribute__((noinline,not_tail_called))
-recurse_b(unsigned int frames);
-
-static int __attribute__((noinline,not_tail_called))
-recurse_a(unsigned int frames)
-{
-    if (frames == 1) {
-        expect_stack();
-        getpid();
-        return 0;
-    }
-
-    return recurse_b(frames - 1) + 1;
-}
-
-static int __attribute__((noinline,not_tail_called))
-recurse_b(unsigned int frames)
-{
-    if (frames == 1) {
-        expect_stack();
-        getpid();
-        return 0;
-    }
-
-    return recurse_a(frames - 1) + 1;
-}
-
-static void *
-backtrace_thread(void *arg)
-{
-#pragma unused(arg)
-    unsigned int calls;
-
-    /*
-     * backtrace_thread, recurse_a, recurse_b, ..., __sysctlbyname
-     *
-     * Always make one less call for this frame (backtrace_thread).
-     */
-    calls = USER_FRAMES - NON_RECURSE_FRAMES;
-
-    T_LOG("backtrace thread calling into %d frames (already at %d frames)",
-       calls, NON_RECURSE_FRAMES);
-    (void)recurse_a(calls);
-    return NULL;
-}
-
-T_DECL(backtrace_user, "test that the kernel can backtrace user stacks",
-    T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
-{
-    pthread_t thread;
-
-    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread,
-        NULL), "create additional thread to backtrace");
-
-    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_join(thread, NULL), NULL);
-}
diff --git a/tools/tests/darwintests/contextswitch.c b/tools/tests/darwintests/contextswitch.c
deleted file mode 100644 (file)
index b059be9..0000000
+++ /dev/null
@@ -1,285 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <pthread.h>
-#include <errno.h>
-#include <err.h>
-#include <string.h>
-#include <assert.h>
-#include <sysexits.h>
-#include <getopt.h>
-#include <spawn.h>
-#include <stdbool.h>
-#include <sys/sysctl.h>
-#include <mach/mach_time.h>
-#include <mach/mach.h>
-#include <mach/semaphore.h>
-#include <TargetConditionals.h>
-
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-#include <stdatomic.h>
-
-#define MAX_THREADS    32
-#define SPIN_SECS      6
-#define THR_SPINNER_PRI        63
-#define THR_MANAGER_PRI        62
-#define WARMUP_ITERATIONS 100
-#define POWERCTRL_SUCCESS_STR "Factor1: 1.000000"
-
-static mach_timebase_info_data_t timebase_info;
-static semaphore_t semaphore;
-static semaphore_t worker_sem;
-static uint32_t g_numcpus;
-static _Atomic uint32_t keep_going = 1;
-static dt_stat_time_t s;
-
-static struct {
-    pthread_t thread;
-    bool measure_thread;
-} threads[MAX_THREADS];
-
-static uint64_t 
-nanos_to_abs(uint64_t nanos) 
-{ 
-    return nanos * timebase_info.denom / timebase_info.numer;
-}
-
-extern char **environ;
-
-static void
-csw_perf_test_init(void)
-{
-    int spawn_ret, pid;
-    char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-f", "5000", NULL};
-    spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
-    waitpid(pid, &spawn_ret, 0);
-}
-
-static void
-csw_perf_test_cleanup(void)
-{
-    int spawn_ret, pid;
-    char *const clpcctrl_args[] = {"/usr/local/bin/clpcctrl", "-d", NULL};
-    spawn_ret = posix_spawn(&pid, clpcctrl_args[0], NULL, NULL, clpcctrl_args, environ);
-    waitpid(pid, &spawn_ret, 0);
-}
-
-static pthread_t
-create_thread(uint32_t thread_id, uint32_t priority, bool fixpri, 
-        void *(*start_routine)(void *))
-{
-    int rv;
-    pthread_t new_thread;
-    struct sched_param param = { .sched_priority = (int)priority };
-    pthread_attr_t attr;
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_init(&attr), "pthread_attr_init");
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_setschedparam(&attr, &param),
-            "pthread_attr_setschedparam");
-
-    if (fixpri) {
-        T_ASSERT_POSIX_ZERO(pthread_attr_setschedpolicy(&attr, SCHED_RR),
-                "pthread_attr_setschedpolicy");
-    }
-
-    T_ASSERT_POSIX_ZERO(pthread_create(&new_thread, &attr, start_routine,
-            (void*)(uintptr_t)thread_id), "pthread_create");
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_destroy(&attr), "pthread_attr_destroy");
-
-    threads[thread_id].thread = new_thread;
-
-    return new_thread;
-}
-
-/* Spin until a specified number of seconds elapses */
-static void
-spin_for_duration(uint32_t seconds)
-{
-    uint64_t duration       = nanos_to_abs((uint64_t)seconds * NSEC_PER_SEC);
-    uint64_t current_time   = mach_absolute_time();
-    uint64_t timeout        = duration + current_time;
-
-    uint64_t spin_count = 0;
-
-    while (mach_absolute_time() < timeout && atomic_load_explicit(&keep_going,
-               memory_order_relaxed)) {
-        spin_count++;
-    }
-}
-
-static void *
-spin_thread(void *arg)
-{
-    uint32_t thread_id = (uint32_t) arg;
-    char name[30] = "";
-
-    snprintf(name, sizeof(name), "spin thread %2d", thread_id);
-    pthread_setname_np(name);
-    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem),
-           "semaphore_wait_signal");
-    spin_for_duration(SPIN_SECS);
-    return NULL;
-}
-
-static void *
-thread(void *arg)
-{
-    uint32_t thread_id = (uint32_t) arg;
-    char name[30] = "";
-
-    snprintf(name, sizeof(name), "thread %2d", thread_id);
-    pthread_setname_np(name);
-    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem), "semaphore_wait");
-
-    if (threads[thread_id].measure_thread) {
-        for (int i = 0; i < WARMUP_ITERATIONS; i++) {
-            thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0);
-        }
-        T_STAT_MEASURE_LOOP(s) {
-            if(thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0))
-                T_ASSERT_FAIL("thread_switch");
-        }
-        atomic_store_explicit(&keep_going, 0, memory_order_relaxed);
-    } else {
-        while (atomic_load_explicit(&keep_going, memory_order_relaxed)) {
-            if (thread_switch(THREAD_NULL, SWITCH_OPTION_NONE, 0))
-                T_ASSERT_FAIL("thread_switch");
-        }
-    }
-    return NULL;
-}
-
-void check_device_temperature(void)
-{
-    char buffer[256];
-    FILE *pipe = popen("powerctrl Factor1", "r");
-    
-    if (pipe == NULL) {
-        T_FAIL("Failed to check device temperature");
-        T_END;
-    }
-
-    fgets(buffer, sizeof(buffer), pipe);
-    
-    if (strncmp(POWERCTRL_SUCCESS_STR, buffer, strlen(POWERCTRL_SUCCESS_STR))) {
-        T_PERF("temperature", 0.0, "factor", "device temperature");
-    } else {
-        T_PASS("Device temperature check pass");
-        T_PERF("temperature", 1.0, "factor", "device temperature");
-    }
-    pclose(pipe);
-}
-
-void record_perfcontrol_stats(const char *sysctlname, const char *units, const char *info)
-{
-    int data = 0;
-    size_t data_size = sizeof(data);
-    T_ASSERT_POSIX_ZERO(sysctlbyname(sysctlname,
-           &data, &data_size, NULL, 0), 
-           "%s", sysctlname);
-    T_PERF(info, data, units, info);
-}
-
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"));
-
-/* Disable the test on macOS for now */
-T_DECL(perf_csw, "context switch performance", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO), T_META_ASROOT(YES))
-{
-
-#if !CONFIG_EMBEDDED
-    T_SKIP("Not supported on MacOS");
-    return;
-#endif /* CONFIG_EMBEDDED */
-    check_device_temperature();
-
-    T_ATEND(csw_perf_test_cleanup);
-
-    csw_perf_test_init();
-    pthread_setname_np("main thread");
-
-    T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebase_info), "mach_timebase_info");
-
-    struct sched_param param = {.sched_priority = 48};
-
-    T_ASSERT_POSIX_ZERO(pthread_setschedparam(pthread_self(), SCHED_FIFO, &param),
-            "pthread_setschedparam");
-
-    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &semaphore,
-            SYNC_POLICY_FIFO, 0), "semaphore_create");
-
-    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &worker_sem,
-            SYNC_POLICY_FIFO, 0), "semaphore_create");
-    
-    size_t ncpu_size = sizeof(g_numcpus);
-    T_ASSERT_POSIX_ZERO(sysctlbyname("hw.ncpu", &g_numcpus, &ncpu_size, NULL, 0),
-            "sysctlbyname hw.ncpu");
-
-    printf("hw.ncpu: %d\n", g_numcpus);
-    uint32_t n_spinners = g_numcpus - 1;
-
-    int mt_supported = 0;
-    size_t mt_supported_size = sizeof(mt_supported);
-    T_ASSERT_POSIX_ZERO(sysctlbyname("kern.monotonic.supported", &mt_supported,
-            &mt_supported_size, NULL, 0), "sysctlbyname kern.monotonic.supported");
-
-    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
-        threads[thread_id].thread = create_thread(thread_id, THR_SPINNER_PRI,
-                true, &spin_thread);
-    }
-
-    s = dt_stat_time_create("context switch time");
-
-    create_thread(n_spinners, THR_MANAGER_PRI, true, &thread);
-    threads[n_spinners].measure_thread = true;
-    create_thread(n_spinners + 1, THR_MANAGER_PRI, true, &thread);
-
-    /* Allow the context switch threads to get into sem_wait() */
-    for (uint32_t thread_id = 0; thread_id < n_spinners + 2; thread_id++) {
-        T_ASSERT_MACH_SUCCESS(semaphore_wait(worker_sem), "semaphore_wait");
-    }
-    
-    int enable_callout_stats = 1;
-    size_t enable_size = sizeof(enable_callout_stats);
-
-    if (mt_supported) {
-        /* Enable callout stat collection */
-        T_ASSERT_POSIX_ZERO(sysctlbyname("kern.perfcontrol_callout.stats_enabled",
-                NULL, 0, &enable_callout_stats, enable_size),
-                "sysctlbyname kern.perfcontrol_callout.stats_enabled");
-    }
-    
-    T_ASSERT_MACH_SUCCESS(semaphore_signal_all(semaphore), "semaphore_signal");
-
-
-    for (uint32_t thread_id = 0; thread_id < n_spinners + 2; thread_id++) {
-        T_ASSERT_POSIX_ZERO(pthread_join(threads[thread_id].thread, NULL),
-                "pthread_join %d", thread_id);
-    }
-
-    if (mt_supported) {
-        record_perfcontrol_stats("kern.perfcontrol_callout.oncore_instr",
-                "instructions", "oncore.instructions");
-        record_perfcontrol_stats("kern.perfcontrol_callout.offcore_instr",
-                "instructions", "offcore.instructions");
-        record_perfcontrol_stats("kern.perfcontrol_callout.oncore_cycles",
-                "cycles", "oncore.cycles");
-        record_perfcontrol_stats("kern.perfcontrol_callout.offcore_cycles",
-                "cycles", "offcore.cycles");
-
-        /* Disable callout stat collection */
-        enable_callout_stats = 0;
-        T_ASSERT_POSIX_ZERO(sysctlbyname("kern.perfcontrol_callout.stats_enabled",
-                NULL, 0, &enable_callout_stats, enable_size),
-                "sysctlbyname kern.perfcontrol_callout.stats_enabled");
-    }
-
-    check_device_temperature();
-    dt_stat_finalize(s);
-}
diff --git a/tools/tests/darwintests/cpucount.c b/tools/tests/darwintests/cpucount.c
deleted file mode 100644 (file)
index bd0548a..0000000
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Test to validate that we can schedule threads on all hw.ncpus cores according to _os_cpu_number
- *
- * <rdar://problem/29545645>
- *
-xcrun -sdk macosx.internal clang -o cpucount cpucount.c -ldarwintest -g -Weverything
-xcrun -sdk iphoneos.internal clang -arch arm64 -o cpucount-ios cpucount.c -ldarwintest -g -Weverything
- */
-
-#include <darwintest.h>
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <stdalign.h>
-#include <unistd.h>
-#include <assert.h>
-#include <pthread.h>
-#include <err.h>
-#include <errno.h>
-#include <sysexits.h>
-#include <sys/sysctl.h>
-#include <stdatomic.h>
-
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-
-#include <os/tsd.h> /* private header for _os_cpu_number */
-
-/* const variables aren't constants, but enums are */
-enum { max_threads = 40 };
-
-#define CACHE_ALIGNED __attribute__((aligned(128)))
-
-static _Atomic CACHE_ALIGNED uint64_t g_ready_threads = 0;
-
-static _Atomic CACHE_ALIGNED bool g_cpu_seen[max_threads];
-
-static _Atomic CACHE_ALIGNED bool g_bail = false;
-
-static uint32_t g_threads; /* set by sysctl hw.ncpu */
-
-static uint64_t g_spin_ms = 50; /* it takes ~50ms of spinning for CLPC to deign to give us all cores */
-
-/*
- * sometimes pageout scan can eat all of CPU 0 long enough to fail the test,
- * so we run the test at RT priority
- */
-static uint32_t g_thread_pri = 97;
-
-/*
- * add in some extra low-pri threads to convince the amp scheduler to use E-cores consistently
- * works around <rdar://problem/29636191>
- */
-static uint32_t g_spin_threads = 2;
-static uint32_t g_spin_threads_pri = 20;
-
-static semaphore_t g_readysem, g_go_sem;
-
-static mach_timebase_info_data_t timebase_info;
-
-static uint64_t nanos_to_abs(uint64_t nanos) { return nanos * timebase_info.denom / timebase_info.numer; }
-
-static void set_realtime(pthread_t thread) {
-       kern_return_t kr;
-       thread_time_constraint_policy_data_t pol;
-
-       mach_port_t target_thread = pthread_mach_thread_np(thread);
-       T_QUIET; T_ASSERT_NOTNULL(target_thread, "pthread_mach_thread_np");
-
-       /* 1s 100ms 10ms */
-       pol.period      = (uint32_t)nanos_to_abs(1000000000);
-       pol.constraint  = (uint32_t)nanos_to_abs(100000000);
-       pol.computation = (uint32_t)nanos_to_abs(10000000);
-
-       pol.preemptible = 0; /* Ignored by OS */
-       kr = thread_policy_set(target_thread, THREAD_TIME_CONSTRAINT_POLICY, (thread_policy_t) &pol,
-                              THREAD_TIME_CONSTRAINT_POLICY_COUNT);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_policy_set(THREAD_TIME_CONSTRAINT_POLICY)");
-}
-
-static pthread_t
-create_thread(void *(*start_routine)(void *), uint32_t priority)
-{
-       int rv;
-       pthread_t new_thread;
-       pthread_attr_t attr;
-
-       struct sched_param param = { .sched_priority = (int)priority };
-
-       rv = pthread_attr_init(&attr);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
-
-       rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setdetachstate");
-
-       rv = pthread_attr_setschedparam(&attr, &param);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_setschedparam");
-
-       rv = pthread_create(&new_thread, &attr, start_routine, NULL);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
-
-       if (priority == 97)
-               set_realtime(new_thread);
-
-       rv = pthread_attr_destroy(&attr);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
-
-       return new_thread;
-}
-
-static void *
-thread_fn(__unused void *arg)
-{
-       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
-
-       kern_return_t kr;
-
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
-
-       /* atomic inc to say hello */
-       g_ready_threads++;
-
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
-
-       /*
-        * spin to force the other threads to spread out across the cores
-        * may take some time if cores are masked and CLPC needs to warm up to unmask them
-        */
-       while (g_ready_threads < g_threads && mach_absolute_time() < timeout);
-
-       T_QUIET; T_ASSERT_GE(timeout, mach_absolute_time(), "waiting for all threads took too long");
-
-       timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
-
-       int iteration = 0;
-       uint32_t cpunum = 0;
-
-       /* search for new CPUs for the duration */
-       while (mach_absolute_time() < timeout) {
-               cpunum = _os_cpu_number();
-
-               assert(cpunum < max_threads);
-
-               g_cpu_seen[cpunum] = true;
-
-               if (iteration++ % 10000) {
-                       uint32_t cpus_seen = 0;
-
-                       for (uint32_t i = 0 ; i < g_threads; i++) {
-                               if (g_cpu_seen[i])
-                                       cpus_seen++;
-                       }
-
-                       /* bail out early if we saw all CPUs */
-                       if (cpus_seen == g_threads)
-                               break;
-               }
-       }
-
-       g_bail = true;
-
-       printf("thread cpunum: %d\n", cpunum);
-
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
-
-       return NULL;
-}
-
-static void *
-spin_fn(__unused void *arg)
-{
-       T_QUIET; T_EXPECT_TRUE(true, "initialize darwintest on this thread");
-
-       kern_return_t kr;
-
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
-
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC * 2) + mach_absolute_time();
-
-       /*
-        * run and sleep a bit to force some scheduler churn to get all the cores active
-        * needed to work around bugs in the amp scheduler
-        */
-       while (mach_absolute_time() < timeout && g_bail == false) {
-               usleep(500);
-
-               uint64_t inner_timeout = nanos_to_abs(1 * NSEC_PER_MSEC) + mach_absolute_time();
-
-               while (mach_absolute_time() < inner_timeout && g_bail == false);
-       }
-
-       kr = semaphore_wait_signal(g_go_sem, g_readysem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
-
-       return NULL;
-}
-
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wgnu-flexible-array-initializer"
-T_DECL(count_cpus, "Tests we can schedule threads on all hw.ncpus cores according to _os_cpu_number",
-       T_META_CHECK_LEAKS(NO))
-#pragma clang diagnostic pop
-{
-       setvbuf(stdout, NULL, _IONBF, 0);
-       setvbuf(stderr, NULL, _IONBF, 0);
-
-       int rv;
-       kern_return_t kr;
-       kr = mach_timebase_info(&timebase_info);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
-
-       kr = semaphore_create(mach_task_self(), &g_readysem, SYNC_POLICY_FIFO, 0);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
-
-       kr = semaphore_create(mach_task_self(), &g_go_sem, SYNC_POLICY_FIFO, 0);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
-
-       size_t ncpu_size = sizeof(g_threads);
-       rv = sysctlbyname("hw.ncpu", &g_threads, &ncpu_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
-
-       printf("hw.ncpu: %2d\n", g_threads);
-
-       assert(g_threads < max_threads);
-
-       for (uint32_t i = 0; i < g_threads; i++)
-               create_thread(&thread_fn, g_thread_pri);
-
-       for (uint32_t i = 0; i < g_spin_threads; i++)
-               create_thread(&spin_fn, g_spin_threads_pri);
-
-       for (uint32_t i = 0 ; i < g_threads + g_spin_threads; i++) {
-               kr = semaphore_wait(g_readysem);
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
-       }
-
-       uint64_t timeout = nanos_to_abs(g_spin_ms * NSEC_PER_MSEC) + mach_absolute_time();
-
-       /* spin to warm up CLPC :) */
-       while (mach_absolute_time() < timeout);
-
-       kr = semaphore_signal_all(g_go_sem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");
-
-       for (uint32_t i = 0 ; i < g_threads + g_spin_threads; i++) {
-               kr = semaphore_wait(g_readysem);
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
-       }
-
-       uint32_t cpus_seen = 0;
-
-       for (uint32_t i = 0 ; i < g_threads; i++) {
-               if (g_cpu_seen[i])
-                       cpus_seen++;
-
-               printf("cpu %2d: %d\n", i, g_cpu_seen[i]);
-       }
-
-       T_ASSERT_EQ(cpus_seen, g_threads, "test should have run threads on all CPUS");
-}
-
diff --git a/tools/tests/darwintests/data_protection.c b/tools/tests/darwintests/data_protection.c
deleted file mode 100644 (file)
index c9a69fe..0000000
+++ /dev/null
@@ -1,1130 +0,0 @@
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <sys/mount.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <IOKit/IOKitLib.h>
-#include <Kernel/IOKit/crypto/AppleKeyStoreDefs.h>
-#include <Kernel/sys/content_protection.h>
-
-#define CPT_IO_SIZE      4096
-#define CPT_AKS_BUF_SIZE 256
-#define CPT_MAX_PASS_LEN 64
-
-#define GET_PROT_CLASS(fd) \
-       fcntl((fd), F_GETPROTECTIONCLASS)
-
-#define SET_PROT_CLASS(fd, prot_class) \
-       fcntl((fd), F_SETPROTECTIONCLASS, (prot_class))
-
-#define KEYSTORECTL_PATH  "/usr/local/bin/keystorectl"
-#define KEYBAGDTEST_PATH  "/usr/local/bin/keybagdTest"
-#define TEMP_DIR_TEMPLATE "/tmp/data_protection_test.XXXXXXXX"
-#define TEST_PASSCODE     "IAmASecurePassword"
-
-int g_fd           = -1;
-int g_dir_fd       = -1;
-int g_subdir_fd    = -1;
-int g_passcode_set = 0;
-
-char g_test_tempdir[PATH_MAX] = TEMP_DIR_TEMPLATE;
-char g_filepath[PATH_MAX]     = "";
-char g_dirpath[PATH_MAX]      = "";
-char g_subdirpath[PATH_MAX]   = "";
-
-int apple_key_store(
-       uint32_t command,
-       uint64_t * inputs,
-       uint32_t input_count,
-       void * input_structs,
-       size_t input_struct_count,
-       uint64_t * outputs,
-       uint32_t * output_count
-);
-int spawn_proc(char * const command[]);
-int supports_content_prot(void);
-char* dp_class_num_to_string(int num);
-int lock_device(void);
-int unlock_device(char * passcode);
-int set_passcode(char * new_passcode, char * old_passcode);
-int clear_passcode(char * passcode);
-int has_passcode(void);
-void setup(void);
-void cleanup(void);
-
-T_DECL(data_protection,
-       "Verify behavior of the various data protection classes") {
-       int local_result = -1;
-       int new_prot_class = -1;
-       int old_prot_class = -1;
-       int current_byte = 0;
-       char rd_buffer[CPT_IO_SIZE];
-       char wr_buffer[CPT_IO_SIZE];
-
-       setup();
-
-       /*
-        * Ensure we can freely read and change
-        * protection classes when unlocked.
-        */
-       for(
-               new_prot_class = PROTECTION_CLASS_A;
-               new_prot_class <= PROTECTION_CLASS_F;
-               new_prot_class++
-       ) {
-               T_ASSERT_NE(
-                       old_prot_class = GET_PROT_CLASS(g_fd),
-                       -1,
-                       "Get protection class when locked"
-               );
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       SET_PROT_CLASS(g_fd, new_prot_class),
-                       -1,
-                       "Should be able to change protection "
-                       "from %s to %s while unlocked",
-                       dp_class_num_to_string(old_prot_class),
-                       dp_class_num_to_string(new_prot_class)
-               );
-       }
-
-       /* Query the filesystem for the default CP level (Is it C?) */
-#ifndef F_GETDEFAULTPROTLEVEL
-#define F_GETDEFAULTPROTLEVEL 79
-#endif
-
-       T_WITH_ERRNO;
-       T_ASSERT_NE(
-               old_prot_class = fcntl(g_fd, F_GETDEFAULTPROTLEVEL),
-               -1,
-               "Get default protection level for filesystem"
-       );
-
-       /* XXX: Do we want to do anything with the level? What should it be? */
-
-       /*
-        * files are allowed to move into F, but not out of it. They can also
-        * only do so when they do not have content.
-        */
-       close(g_fd);
-       unlink(g_filepath);
-
-       /* re-create the file */
-       T_WITH_ERRNO;
-       T_ASSERT_GE(
-               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0666),
-               0,
-               "Recreate test file"
-       );
-
-       /* Try making a class A file while locked. */
-       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_A),
-               -1,
-               "Should not be able to change protection "
-               "from class D to class A when locked"
-       );
-       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
-
-       /* Attempt opening/IO to a class A file while unlocked. */
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_A),
-               0,
-               "Should be able to change protection "
-               "from class D to class A when unlocked"
-       );
-
-       close(g_fd);
-
-       T_WITH_ERRNO;
-       T_ASSERT_GE(
-               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
-               0,
-               "Should be able to open a class A file when unlocked");
-
-       /*
-        * TODO: Write specific data we can check for. If we're going to do
-        * that, the write scheme should be deliberately ugly.
-        */
-       current_byte = 0;
-
-       while(current_byte < CPT_IO_SIZE) {
-               local_result = pwrite(
-                       g_fd,
-                       &wr_buffer[current_byte],
-                       CPT_IO_SIZE - current_byte,
-                       current_byte
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       local_result,
-                       -1,
-                       "Should be able to write to "
-                       "a class A file when unlocked"
-               );
-
-               current_byte += local_result;
-       }
-
-       current_byte = 0;
-
-       while(current_byte < CPT_IO_SIZE) {
-               local_result = pread(
-                       g_fd,
-                       &rd_buffer[current_byte],
-                       CPT_IO_SIZE - current_byte,
-                       current_byte
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       local_result,
-                       -1,
-                       "Should be able to read from "
-                       "a class A file when unlocked"
-               );
-
-               current_byte += local_result;
-       }
-
-       /*
-        * Again, but now while locked; and try to change the file class
-        * as well.
-        */
-       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
-
-       T_ASSERT_LE(
-               pread(g_fd, rd_buffer, CPT_IO_SIZE, 0),
-               0,
-               "Should not be able to read from a class A file when locked"
-       );
-
-       T_ASSERT_LE(
-               pwrite(g_fd, wr_buffer, CPT_IO_SIZE, 0),
-               0,
-               "Should not be able to write to a class A file when locked"
-       );
-
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_D),
-               -1,
-               "Should not be able to change protection "
-               "from class A to class D when locked"
-       );
-
-       /* Try to open and truncate the file. */
-       close(g_fd);
-
-       T_ASSERT_EQ(
-               g_fd = open(g_filepath, O_RDWR|O_TRUNC|O_CLOEXEC),
-               -1,
-               "Should not be able to open and truncate "
-               "a class A file when locked"
-       );
-
-       /* Try to open the file */
-       T_ASSERT_EQ(
-               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
-               -1,
-               "Should not be able to open a class A file when locked"
-       );
-
-       /* What about class B files? */
-       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
-
-       T_ASSERT_GE(
-               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
-               0,
-               "Should be able to open a class A file when unlocked"
-       );
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_D),
-               0,
-               "Should be able to change protection "
-               "class from A to D when unlocked"
-       );
-
-       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
-
-       /* Can we change the file to class B while locked? */
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_fd, PROTECTION_CLASS_B),
-               0,
-               "Should be able to change protection "
-               "class from D to B when locked"
-       );
-
-       T_ASSERT_EQ(
-               GET_PROT_CLASS(g_fd),
-               PROTECTION_CLASS_B,
-               "File should now have class B protection"
-       );
-
-       /*
-        * We should also be able to read/write to the
-        * file descriptor while it is open.
-        */
-       current_byte = 0;
-
-       while(current_byte < CPT_IO_SIZE) {
-               local_result = pwrite(
-                       g_fd,
-                       &wr_buffer[current_byte],
-                       CPT_IO_SIZE - current_byte,
-                       current_byte
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       local_result,
-                       -1,
-                       "Should be able to write to a "
-                       "new class B file when locked"
-               );
-
-               current_byte += local_result;
-       }
-
-       current_byte = 0;
-
-       while(current_byte < CPT_IO_SIZE) {
-               local_result = pread(
-                       g_fd,
-                       &rd_buffer[current_byte],
-                       CPT_IO_SIZE - current_byte,
-                       current_byte
-               );
-
-               T_ASSERT_NE(
-                       local_result,
-                       -1,
-                       "Should be able to read from a "
-                       "new class B file when locked"
-               );
-
-               current_byte += local_result;
-       }
-
-       /* We should not be able to open a class B file under lock. */
-       close(g_fd);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               g_fd = open(g_filepath, O_RDWR|O_CLOEXEC),
-               -1,
-               "Should not be able to open a class B file when locked"
-       );
-
-       unlink(g_filepath);
-
-       /* We still need to test directory semantics. */
-       T_WITH_ERRNO;
-       T_ASSERT_NE(
-               mkdir(g_dirpath, 0777),
-               -1,
-               "Should be able to create a new directory when locked"
-       );
-
-       /* The newly created directory should not have a protection class. */
-       T_ASSERT_NE(
-               g_dir_fd = open(g_dirpath, O_RDONLY|O_CLOEXEC),
-               -1,
-               "Should be able to open an unclassed directory when locked"
-       );
-
-       T_ASSERT_TRUE(
-               GET_PROT_CLASS(g_dir_fd) == PROTECTION_CLASS_D ||
-               GET_PROT_CLASS(g_dir_fd) == PROTECTION_CLASS_DIR_NONE,
-               "Directory protection class should be D or NONE"
-       );
-
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_A),
-               0,
-               "Should be able to change a directory from "
-               "class D to class A while locked"
-       );
-
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_D),
-               0,
-               "Should be able to change a directory from "
-               "class A to class D while locked"
-       );
-
-       /*
-        * Do all files created in the directory properly inherit the
-        * directory's protection class?
-        */
-       T_SETUPBEGIN;
-       T_ASSERT_LT(
-               strlcpy(g_filepath, g_dirpath, PATH_MAX),
-               PATH_MAX,
-               "Construct path for file in the directory"
-       );
-       T_ASSERT_LT(
-               strlcat(g_filepath, "test_file", PATH_MAX),
-               PATH_MAX,
-               "Construct path for file in the directory"
-       );
-       T_SETUPEND;
-
-       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
-
-       for(
-               new_prot_class = PROTECTION_CLASS_A;
-               new_prot_class <= PROTECTION_CLASS_D;
-               new_prot_class++
-       ) {
-               int getclass_dir;
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       old_prot_class = GET_PROT_CLASS(g_dir_fd),
-                       -1,
-                       "Get protection class for the directory"
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_EQ(
-                       SET_PROT_CLASS(g_dir_fd, new_prot_class),
-                       0,
-                       "Should be able to change directory "
-                       "protection from %s to %s",
-                       dp_class_num_to_string(old_prot_class),
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_EXPECT_EQ(
-                       getclass_dir = GET_PROT_CLASS(g_dir_fd),
-                       new_prot_class,
-                       "Get protection class for the directory"
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_GE(
-                       g_fd = open(g_filepath, O_CREAT|O_EXCL|O_CLOEXEC, 0777),
-                       0,
-                       "Should be able to create file in "
-                       "%s directory when unlocked",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       local_result = GET_PROT_CLASS(g_fd),
-                       -1,
-                       "Get the new file's protection class"
-               );
-
-               T_ASSERT_EQ(
-                       local_result,
-                       new_prot_class,
-                       "File should have %s protection",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               close(g_fd);
-               unlink(g_filepath);
-       }
-
-       /* Do we disallow marking a directory as class F? */
-       T_ASSERT_NE(
-               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_F),
-               0,
-               "Should not be able to mark a directory as class F"
-       );
-
-       /*
-        * Are class A and class B semantics honored when we
-        * create files in those directories while locked?
-        */
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_A),
-               0,
-               "Should be able to change protection "
-               "from class D to class A when unlocked"
-       );
-
-       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
-
-       T_ASSERT_EQ(
-               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_CLOEXEC, 0777),
-               -1,
-               "Should not be able to create a new file "
-               "in a class A directory when locked"
-       );
-
-       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(
-               SET_PROT_CLASS(g_dir_fd, PROTECTION_CLASS_B),
-               0,
-               "Should be able to change directory "
-               "from class A to class B when unlocked"
-       );
-
-       T_ASSERT_EQ(lock_device(), 0, "*** Lock device ***");
-
-       T_ASSERT_GE(
-               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0777),
-               0,
-               "Should be able to create a new file "
-               "in class B directory when locked"
-       );
-
-       T_ASSERT_NE(
-               local_result = GET_PROT_CLASS(g_fd),
-               -1,
-               "Get the new file's protection class"
-       );
-
-       T_ASSERT_EQ(
-               local_result,
-               PROTECTION_CLASS_B,
-               "File should inherit protection class of class B directory"
-       );
-
-       /* What happens when we try to create new subdirectories? */
-       T_ASSERT_EQ(unlock_device(TEST_PASSCODE), 0, "*** Unlock device ***");
-
-       for(
-               new_prot_class = PROTECTION_CLASS_A;
-               new_prot_class <= PROTECTION_CLASS_D;
-               new_prot_class++
-       ) {
-               T_WITH_ERRNO;
-               T_ASSERT_EQ(
-                       SET_PROT_CLASS(g_dir_fd, new_prot_class),
-                       0,
-                       "Change directory to %s",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       mkdir(g_subdirpath, 0777),
-                       -1,
-                       "Create subdirectory in %s directory",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_WITH_ERRNO;
-               T_ASSERT_NE(
-                       g_subdir_fd = open(g_subdirpath, O_RDONLY|O_CLOEXEC),
-                       -1,
-                       "Should be able to open subdirectory in %s directory",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_ASSERT_NE(
-                       local_result = GET_PROT_CLASS(g_subdir_fd),
-                       -1,
-                       "Get protection class of new subdirectory "
-                       "of %s directory",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               T_ASSERT_EQ(
-                       local_result,
-                       new_prot_class,
-                       "New subdirectory should have same class as %s parent",
-                       dp_class_num_to_string(new_prot_class)
-               );
-
-               close(g_subdir_fd);
-               rmdir(g_subdirpath);
-       }
-}
-
-void
-setup(void) {
-       int ret = 0;
-       int local_result = -1;
-
-       T_SETUPBEGIN;
-
-       T_ATEND(cleanup);
-
-       T_WITH_ERRNO;
-       T_ASSERT_NOTNULL(
-               mkdtemp(g_test_tempdir),
-               "Create temporary directory for test"
-       );
-       T_LOG("Test temp dir: %s", g_test_tempdir);
-
-       T_ASSERT_NE(
-               local_result = supports_content_prot(),
-               -1,
-               "Get content protection support status"
-       );
-
-       if(local_result == 0) {
-               T_SKIP("Data protection not supported on this system");
-       }
-
-       T_ASSERT_EQ(
-               has_passcode(),
-               0,
-               "Device should not have existing passcode"
-       );
-
-       T_ASSERT_EQ(
-               set_passcode(TEST_PASSCODE, NULL),
-               0,
-               "Set test passcode"
-       );
-
-       bzero(g_filepath, PATH_MAX);
-       bzero(g_dirpath, PATH_MAX);
-       bzero(g_subdirpath, PATH_MAX);
-
-       ret |= (strlcat(g_filepath, g_test_tempdir, PATH_MAX) == PATH_MAX);
-       ret |= (strlcat(g_filepath, "/", PATH_MAX) == PATH_MAX);
-       ret |= (strlcpy(g_dirpath, g_filepath, PATH_MAX) == PATH_MAX);
-       ret |= (strlcat(g_filepath, "test_file", PATH_MAX) == PATH_MAX);
-       ret |= (strlcat(g_dirpath, "test_dir/", PATH_MAX) == PATH_MAX);
-       ret |= (strlcpy(g_subdirpath, g_dirpath, PATH_MAX) == PATH_MAX);
-       ret |= (strlcat(g_subdirpath, "test_subdir/", PATH_MAX) == PATH_MAX);
-
-       T_QUIET;
-       T_ASSERT_EQ(ret, 0, "Initialize test path strings");
-
-       T_WITH_ERRNO;
-       T_ASSERT_GE(
-               g_fd = open(g_filepath, O_CREAT|O_EXCL|O_RDWR|O_CLOEXEC, 0777),
-               0,
-               "Create test file"
-       );
-
-       T_SETUPEND;
-}
-
-void
-cleanup(void) {
-       T_LOG("Cleaning up…");
-
-       if(g_subdir_fd >= 0) {
-               T_LOG("Cleanup: closing fd %d", g_subdir_fd);
-               close(g_subdir_fd);
-       }
-
-       if(g_subdirpath[0]) {
-               T_LOG("Cleanup: removing %s", g_subdirpath);
-               rmdir(g_subdirpath);
-       }
-
-       if(g_fd >= 0) {
-               T_LOG("Cleanup: closing fd %d", g_fd);
-               close(g_fd);
-       }
-
-       if(g_filepath[0]) {
-               T_LOG("Cleanup: removing %s", g_filepath);
-               unlink(g_filepath);
-       }
-
-       if(g_dir_fd >= 0) {
-               T_LOG("Cleanup: closing fd %d", g_dir_fd);
-               close(g_dir_fd);
-       }
-
-       if(g_dirpath[0]) {
-               T_LOG("Cleanup: removing %s", g_dirpath);
-               rmdir(g_dirpath);
-       }
-
-       if(strcmp(g_test_tempdir, TEMP_DIR_TEMPLATE)) {
-               T_LOG("Cleanup: removing %s", g_test_tempdir);
-               rmdir(g_test_tempdir);
-       }
-
-       if(g_passcode_set) {
-               T_LOG("Cleanup: unlocking device");
-               if(unlock_device(TEST_PASSCODE)) {
-                       T_LOG("Warning: failed to unlock device in cleanup");
-               }
-
-               T_LOG("Cleanup: clearing passcode");
-               if(clear_passcode(TEST_PASSCODE)) {
-                       T_LOG("Warning: failed to clear passcode in cleanup");
-               }
-       }
-}
-
-int
-set_passcode(char * new_passcode, char * old_passcode) {
-       int result = -1;
-
-#ifdef KEYBAG_ENTITLEMENTS
-       /* If we're entitled, we can set the passcode ourselves. */
-       uint64_t inputs[] = {device_keybag_handle};
-       uint32_t input_count = (sizeof(inputs) / sizeof(*inputs));
-       void * input_structs = NULL;
-       size_t input_struct_count = 0;
-       char buffer[CPT_AKS_BUF_SIZE];
-       char * buffer_ptr = buffer;
-       uint32_t old_passcode_len = 0;
-       uint32_t new_passcode_len = 0;
-
-       T_LOG("%s(): using keybag entitlements", __func__);
-
-       /* strnlen() must not be called on a NULL passcode. */
-       if(
-               (old_passcode == NULL) ||
-               ((old_passcode_len = strnlen(old_passcode, CPT_MAX_PASS_LEN)) ==
-                       CPT_MAX_PASS_LEN)
-       ) {
-               old_passcode = "";
-               old_passcode_len = 0;
-       }
-
-       if(
-               (new_passcode == NULL) ||
-               ((new_passcode_len = strnlen(new_passcode, CPT_MAX_PASS_LEN)) ==
-                       CPT_MAX_PASS_LEN)
-       ) {
-               new_passcode = "";
-               new_passcode_len = 0;
-       }
-
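-       /*
-        * Pack the input struct consumed by kAppleKeyStoreKeyBagSetPasscode:
-        * a leading uint32_t of 2 (presumably the number of passcodes that
-        * follow), then each passcode as a 4-byte length, its bytes, and
-        * padding up to the next 4-byte boundary.
-        */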
-       *((uint32_t *) buffer_ptr) = ((uint32_t) 2);
-       buffer_ptr += sizeof(uint32_t);
-
-       *((uint32_t *) buffer_ptr) = old_passcode_len;
-       buffer_ptr += sizeof(uint32_t);
-
-       memcpy(buffer_ptr, old_passcode, old_passcode_len);
-       buffer_ptr += ((old_passcode_len + sizeof(uint32_t) - 1) &
-               ~(sizeof(uint32_t) - 1));
-
-       *((uint32_t *) buffer_ptr) = new_passcode_len;
-       buffer_ptr += sizeof(uint32_t);
-
-       memcpy(buffer_ptr, new_passcode, new_passcode_len);
-       buffer_ptr += ((new_passcode_len + sizeof(uint32_t) - 1) &
-               ~(sizeof(uint32_t) - 1));
-
-       input_structs = buffer;
-       input_struct_count = (buffer_ptr - buffer);
-
-       result = apple_key_store(
-               kAppleKeyStoreKeyBagSetPasscode,
-               inputs,
-               input_count,
-               input_structs,
-               input_struct_count,
-               NULL,
-               NULL
-       );
-#else
-       /*
-        * If we aren't entitled, we'll need to use
-        * keystorectl to set the passcode.
-        */
-       T_LOG("%s(): using keystorectl", __func__);
-
-       if(
-               (old_passcode == NULL) ||
-               (strnlen(old_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
-       ) {
-               old_passcode = "";
-       }
-
-       if(
-               (new_passcode == NULL) ||
-               (strnlen(new_passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
-       ) {
-               new_passcode = "";
-       }
-
-       char * const keystorectl_args[] = {
-               KEYSTORECTL_PATH,
-               "change-password",
-               old_passcode,
-               new_passcode,
-               NULL
-       };
-       result = spawn_proc(keystorectl_args);
-#endif /* KEYBAG_ENTITLEMENTS */
-       if(result == 0 && new_passcode != NULL) {
-               g_passcode_set = 1;
-       } else if(result == 0 && new_passcode == NULL) {
-               g_passcode_set = 0;
-       }
-
-       return(result);
-}
-
-int
-clear_passcode(char * passcode) {
-       /*
-        * For the moment, this will set the passcode to the empty string
-        * (a known value); this will most likely need to change, or running
-        * this test may ruin everything™
-        */
-       return set_passcode(NULL, passcode);
-}
-
-int
-has_passcode(void) {
-       return set_passcode(NULL, NULL);
-}
-
-int
-lock_device(void) {
-       int result = -1;
-
-       /*
-        * Pass in the path to keybagdTest instead. By doing this, we bypass
-        * the shortcut to get in to the keybag via IOKit and instead use the
-        * pre-existing command line tool.
-        *
-        * This also goes through the normal "lock → locking (10s) → locked"
-        * flow that would normally occur during system runtime when the
-        * lock button is depressed. To ensure that our single threaded test
-        * works properly in this case, poll until we can't create a class A
-        * file to be safe.
-        */
-       char * const kbd_args[] = {KEYBAGDTEST_PATH, "lock", NULL};
-       result = spawn_proc(kbd_args);
-       if(result) {
-               return result;
-       }
-
-       /*
-        * Delete the file if it is present. Note that this may fail if the
-        * file is actually not there. So don't bomb out if we can't delete
-        * this file right now.
-        */
-       (void) unlink("/private/var/foo_test_file");
-
-       while(1) {
-               int dp_fd;
-
-               dp_fd = open_dprotected_np(
-                       "/private/var/foo_test_file",
-                       O_RDWR|O_CREAT,
-                       PROTECTION_CLASS_A,
-                       0
-               );
-
-               if(dp_fd >= 0) {
-                       /* delete it and sleep */
-                       close(dp_fd);
-                       result = unlink("/private/var/foo_test_file");
-
-                       if(result) {
-                               return result;
-                       }
-
-                       sync();
-                       sleep(1);
-               } else {
-                       /* drop out of our polling loop. */
-                       break;
-               }
-       }
-
-       /*
-        * Note that our loop breakout condition is whether or not we can
-        * create a class A file, so that loop may execute up to 10 times
-        * (due to the 10s grace period). By the time we get here, we assume
-        * that we didn't hit any of the error cases above.
-        */
-
-       return 0;
-}
-
-int
-unlock_device(char * passcode) {
-       int result = -1;
-
-#ifdef  KEYBAG_ENTITLEMENTS
-       /* If we're entitled, we can unlock the device ourselves. */
-       uint64_t inputs[] = {device_keybag_handle};
-       uint32_t input_count = (sizeof(inputs) / sizeof(*inputs));
-       size_t input_struct_count = 0;
-
-       T_LOG("%s(): using keybag entitlements", __func__);
-
-       /* strnlen() must not be called on a NULL passcode. */
-       if(
-               (passcode == NULL) ||
-               ((input_struct_count = strnlen(passcode, CPT_MAX_PASS_LEN)) ==
-                       CPT_MAX_PASS_LEN)
-       ) {
-               passcode = "";
-               input_struct_count = 0;
-       }
-
-       result = apple_key_store(
-               kAppleKeyStoreKeyBagUnlock,
-               inputs,
-               input_count,
-               passcode,
-               input_struct_count,
-               NULL,
-               NULL
-       );
-#else
-       /*
-        * If we aren't entitled, we'll need to use
-        * keystorectl to unlock the device.
-        */
-       T_LOG("%s(): using keystorectl", __func__);
-
-       if(
-               (passcode == NULL) ||
-               (strnlen(passcode, CPT_MAX_PASS_LEN) == CPT_MAX_PASS_LEN)
-       ) {
-               passcode = "";
-       }
-
-       char * const keystorectl_args[] = {
-               KEYSTORECTL_PATH, "unlock", passcode, NULL
-       };
-
-       result = spawn_proc(keystorectl_args);
-#endif /* KEYBAG_ENTITLEMENTS */
-
-       return(result);
-}
-
-/*
- * Code based on Mobile Key Bag; specifically
- * MKBDeviceSupportsContentProtection and
- * MKBDeviceFormattedForContentProtection.
- *
- * We want to verify that we support content protection, and that
- * we are formatted for it.
- */
-int
-supports_content_prot(void) {
-       int local_result = -1;
-       int result = -1;
-       uint32_t buffer_size = 1;
-       char buffer[buffer_size];
-       io_registry_entry_t defaults = IO_OBJECT_NULL;
-       kern_return_t k_result = KERN_FAILURE;
-       struct statfs statfs_results;
-
-       defaults = IORegistryEntryFromPath(
-               kIOMasterPortDefault,
-               kIODeviceTreePlane ":/defaults"
-       );
-
-       if(defaults == IO_OBJECT_NULL) {
-               /* Assume data protection is unsupported */
-               T_LOG(
-                       "%s(): no defaults entry in IORegistry",
-                       __func__
-               );
-               return 0;
-       }
-
-       k_result = IORegistryEntryGetProperty(
-               defaults,
-               "content-protect",
-               buffer,
-               &buffer_size
-       );
-
-       if(k_result != KERN_SUCCESS) {
-               /* Assume data protection is unsupported */
-               T_LOG(
-                       "%s(): no content-protect property in IORegistry",
-                       __func__
-               );
-               return 0;
-       }
-
-       /*
-        * At this point, we SUPPORT content protection… but are we
-        * formatted for it? This is ugly; we should be testing the file
-        * system we'll be testing in, not just /tmp/.
-        */
-       local_result = statfs(g_test_tempdir, &statfs_results);
-
-       if(local_result == -1) {
-               T_LOG(
-                       "%s(): failed to statfs the test directory, errno = %s",
-                       __func__, strerror(errno)
-               );
-               return -1;
-       } else if(statfs_results.f_flags & MNT_CPROTECT) {
-               return 1;
-       } else {
-               T_LOG(
-                       "%s(): filesystem not formatted for data protection",
-                       __func__
-               );
-               return 0;
-       }
-}
-
-/*
- * Shamelessly ripped from keystorectl routines;
- * a wrapper for invoking the AKS user client.
- */
-int
-apple_key_store(uint32_t command,
-                uint64_t * inputs,
-                uint32_t input_count,
-                void * input_structs,
-                size_t input_struct_count,
-                uint64_t * outputs,
-                uint32_t * output_count) {
-       int result = -1;
-       io_connect_t connection = IO_OBJECT_NULL;
-       io_registry_entry_t apple_key_bag_service = IO_OBJECT_NULL;
-       kern_return_t k_result = KERN_FAILURE;
-       IOReturn io_result = kIOReturnError;
-
-       apple_key_bag_service = IOServiceGetMatchingService(
-               kIOMasterPortDefault,
-               IOServiceMatching(kAppleKeyStoreServiceName)
-       );
-       if(apple_key_bag_service == IO_OBJECT_NULL) {
-               T_LOG(
-                       "%s: failed to match kAppleKeyStoreServiceName",
-                       __func__
-               );
-               goto end;
-       }
-
-       k_result = IOServiceOpen(
-               apple_key_bag_service,
-               mach_task_self(),
-               0,
-               &connection
-       );
-       if(k_result != KERN_SUCCESS) {
-               T_LOG(
-                       "%s: failed to open AppleKeyStore: "
-                       "IOServiceOpen() returned %d",
-                       __func__, k_result
-               );
-               goto end;
-       }
-
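-       /* The AppleKeyStore user client must be opened before any keybag command can be issued. */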
-       k_result = IOConnectCallMethod(
-               connection,
-               kAppleKeyStoreUserClientOpen,
-               NULL, 0, NULL, 0, NULL, NULL, NULL, NULL
-       );
-       if(k_result != KERN_SUCCESS) {
-               T_LOG(
-                       "%s: call to AppleKeyStore method "
-                       "kAppleKeyStoreUserClientOpen failed",
-                       __func__
-               );
-               goto close;
-       }
-
-       io_result = IOConnectCallMethod(
-               connection, command, inputs, input_count, input_structs,
-               input_struct_count, outputs, output_count, NULL, NULL
-       );
-       if(io_result != kIOReturnSuccess) {
-               T_LOG("%s: call to AppleKeyStore method %u failed", __func__, command);
-               goto close;
-       }
-
-       result = 0;
-
-close:
-       IOServiceClose(connection);
-end:
-       return(result);
-}
-
-/*
- * Helper function for launching tools
- */
-int
-spawn_proc(char * const command[]) {
-       pid_t pid           = 0;
-       int launch_tool_ret = 0;
-       bool waitpid_ret    = true;
-       int status          = 0;
-       int signal          = 0;
-       int timeout         = 30;
-
-       launch_tool_ret = dt_launch_tool(&pid, command, false, NULL, NULL);
-       T_EXPECT_EQ(launch_tool_ret, 0, "launch tool: %s", command[0]);
-       if(launch_tool_ret != 0) {
-               return 1;
-       }
-
-       waitpid_ret = dt_waitpid(pid, &status, &signal, timeout);
-       T_EXPECT_TRUE(waitpid_ret, "%s should succeed", command[0]);
-       if(waitpid_ret == false) {
-               if(status != 0) {
-                       T_LOG("%s exited %d", command[0], status);
-               }
-               if(signal != 0) {
-                       T_LOG("%s received signal %d", command[0], signal);
-               }
-               return 1;
-       }
-
-       return 0;
-}
-
-char*
-dp_class_num_to_string(int num) {
-       switch(num) {
-               case 0:
-                       return "unclassed";
-               case PROTECTION_CLASS_A:
-                       return "class A";
-               case PROTECTION_CLASS_B:
-                       return "class B";
-               case PROTECTION_CLASS_C:
-                       return "class C";
-               case PROTECTION_CLASS_D:
-                       return "class D";
-               case PROTECTION_CLASS_E:
-                       return "class E";
-               case PROTECTION_CLASS_F:
-                       return "class F";
-               default:
-                       return "<unknown class>";
-       }
-}
-
-#if 0
-int device_lock_state(void) {
-       /*
-        * TODO: Actually implement this.
-        *
-        * We fail if a passcode already exists, and the methods being used
-        * to lock/unlock the device in this test appear to be synchronous…
-        * do we need this function?
-        */
-       int result = -1;
-
-       return(result);
-}
-
-/* Determines if we will try to test class C semantics. */
-int unlocked_since_boot() {
-       /*
-        * TODO: Actually implement this.
-        *
-        * The actual semantics for CP mean that even with this primitive,
-        * we would need to set a passcode and then reboot the device in
-        * order to test this; this function will probably be rather
-        * worthless as a result.
-        */
-       int result = 1;
-
-       return(result);
-}
-#endif
-
diff --git a/tools/tests/darwintests/disk_mount_conditioner-entitlements.plist b/tools/tests/darwintests/disk_mount_conditioner-entitlements.plist
deleted file mode 100644 (file)
index 95d2141..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.dmc.set</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/disk_mount_conditioner.c b/tools/tests/darwintests/disk_mount_conditioner.c
deleted file mode 100644 (file)
index 5847149..0000000
+++ /dev/null
@@ -1,388 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <System/sys/fsctl.h>
-#include <paths.h>
-
-static char *mktempdir(void);
-static char *mktempmount(void);
-
-#ifndef TEST_UNENTITLED
-static int system_legal(const char *command);
-static char *mkramdisk(void);
-static uint64_t time_for_read(int fd, const char *expected);
-static void perf_setup(char **path, int *fd);
-
-#define READSIZE 1024L
-#endif /* !TEST_UNENTITLED */
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.vfs.dmc"),
-       T_META_ASROOT(true)
-);
-
-#pragma mark Entitled Tests
-
-#ifndef TEST_UNENTITLED
-T_DECL(fsctl_get_uninitialized,
-       "Initial fsctl.get should return zeros",
-       T_META_ASROOT(false))
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info = {0};
-       disk_conditioner_info expected_info = {0};
-
-       T_SETUPBEGIN;
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       info.enabled = true;
-       info.is_ssd = true;
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET)");
-
-       err = memcmp(&info, &expected_info, sizeof(info));
-       T_ASSERT_EQ_INT(0, err, "initial DMC info is zeroed");
-}
-
-T_DECL(fsctl_set,
-       "fsctl.set should succeed and fsctl.get should verify")
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info = {0};
-       disk_conditioner_info expected_info = {0};
-
-       T_SETUPBEGIN;
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       info.enabled = 1;
-       info.access_time_usec = 10;
-       info.read_throughput_mbps = 40;
-       info.write_throughput_mbps = 40;
-       info.is_ssd = 0;
-       expected_info = info;
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET)");
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_GET) after SET");
-
-       err = memcmp(&info, &expected_info, sizeof(info));
-       T_ASSERT_EQ_INT(0, err, "fsctl.get is the info configured by fsctl.set");
-}
-
-T_DECL(fsctl_get_nonroot,
-       "fsctl.get should not require root",
-       T_META_ASROOT(false))
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info;
-
-       T_SETUPBEGIN;
-       // make sure we're not root
-       if (0 == geteuid()) {
-               seteuid(5000);
-       }
-
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl.get without root");
-}
-
-T_DECL(fsctl_set_nonroot,
-       "fsctl.set should require root",
-       T_META_ASROOT(false))
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info = {0};
-       disk_conditioner_info expected_info = {0};
-
-       T_SETUPBEGIN;
-       // make sure we're not root
-       if (0 == geteuid()) {
-               seteuid(5000);
-       }
-
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       // save original info
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &expected_info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "Get original DMC info");
-
-       info.enabled = 1;
-       info.access_time_usec = 10;
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_NE_INT(0, err, "fsctl.set returns error without root");
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl.get after nonroot fsctl.set");
-
-       err = memcmp(&info, &expected_info, sizeof(info));
-       T_ASSERT_EQ_INT(0, err, "fsctl.set should not change info without root");
-}
-
-T_DECL(fsctl_delays,
-       "Validate I/O delays when DMC is enabled")
-{
-       char *path;
-       int fd;
-       int err;
-       uint64_t elapsed_nsec, expected_nsec;
-       disk_conditioner_info info;
-       char buf[READSIZE];
-
-       T_SETUPBEGIN;
-       perf_setup(&path, &fd);
-       memset(buf, 0xFF, sizeof(buf));
-       T_ASSERT_EQ_LONG((long)sizeof(buf), write(fd, buf, sizeof(buf)), "write known data to temp file");
-       fcntl(fd, F_FULLFSYNC);
-       T_SETUPEND;
-
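-       // inject half a second of access time: well above a normal uncontended read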
-       expected_nsec = NSEC_PER_SEC / 2;
-
-       // measure delay before setting parameters (should be none)
-       elapsed_nsec = time_for_read(fd, buf);
-       T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "DMC disabled read(%ld) from %s is reasonably fast", READSIZE, path);
-
-       // measure delay after setting parameters
-       info.enabled = 1;
-       info.access_time_usec = expected_nsec / NSEC_PER_USEC;
-       info.read_throughput_mbps = 40;
-       info.write_throughput_mbps = 40;
-       info.is_ssd = 1; // is_ssd will ensure we get constant access_time delays rather than scaled
-       err = fsctl(path, DISK_CONDITIONER_IOC_SET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET) delay");
-
-       elapsed_nsec = time_for_read(fd, buf);
-       T_ASSERT_GT_ULLONG(elapsed_nsec, expected_nsec, "DMC enabled read(%ld) from %s is at least the expected delay", READSIZE, path);
-       T_ASSERT_LT_ULLONG(elapsed_nsec, 2 * expected_nsec, "DMC enabled read(%ld) from %s is no more than twice the expected delay", READSIZE, path);
-
-       // measure delay after resetting parameters (should be none)
-       info.enabled = 0;
-       err = fsctl(path, DISK_CONDITIONER_IOC_SET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl(DISK_CONDITIONER_IOC_SET) reset delay");
-
-       usleep(USEC_PER_SEC / 2); // might still be other I/O inflight
-       elapsed_nsec = time_for_read(fd, buf);
-       T_ASSERT_LT_ULLONG(elapsed_nsec, expected_nsec, "After disabling DMC read(%ld) from %s is reasonably fast", READSIZE, path);
-}
-
-#else /* TEST_UNENTITLED */
-
-#pragma mark Unentitled Tests
-
-T_DECL(fsctl_get_unentitled,
-       "fsctl.get should not require entitlement")
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info;
-
-       T_SETUPBEGIN;
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl.get without entitlement");
-}
-
-T_DECL(fsctl_set_unentitled,
-       "fsctl.set should require entitlement")
-{
-       int err;
-       char *mount_path;
-       disk_conditioner_info info = {0};
-       disk_conditioner_info expected_info = {0};
-
-       T_SETUPBEGIN;
-       mount_path = mktempmount();
-       T_SETUPEND;
-
-       // save original info
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &expected_info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "Get original DMC info");
-
-       info.enabled = 1;
-       info.access_time_usec = 10;
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_SET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_NE_INT(0, err, "fsctl.set returns error without entitlement");
-
-       err = fsctl(mount_path, DISK_CONDITIONER_IOC_GET, &info, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, err, "fsctl.get after unentitled fsctl.set");
-
-       err = memcmp(&info, &expected_info, sizeof(info));
-       T_ASSERT_EQ_INT(0, err, "fsctl.set should not change info without entitlement");
-}
-
-#endif /* TEST_UNENTITLED */
-
-#pragma mark Helpers
-
-static char *mktempdir(void) {
-       char *path = malloc(PATH_MAX);
-       strcpy(path, "/tmp/dmc.XXXXXXXX");
-       atexit_b(^{ free(path); });
-
-       // create a temporary mount to run the fsctl on
-       T_WITH_ERRNO;
-       T_ASSERT_NOTNULL(mkdtemp(path), "Create temporary directory");
-       atexit_b(^{ remove(path); });
-
-       return path;
-}
-
-/*
- * Return the path to a temporary mount that has no usable
- * filesystem but can still be configured by the disk conditioner.
- *
- * This is faster than creating a RAM disk when access to the
- * filesystem itself is not necessary.
- */
-static char *mktempmount(void) {
-       char *mount_path = mktempdir();
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(0, mount("devfs", mount_path, MNT_RDONLY, NULL), "Create temporary devfs mount");
-       atexit_b(^{ unmount(mount_path, MNT_FORCE); });
-
-       return mount_path;
-}
-
-#ifndef TEST_UNENTITLED
-
-/*
- * Wrapper around dt_launch_tool/dt_waitpid
- * that works like libc's system()
- */
-static int system_legal(const char *command) {
-       pid_t pid = -1;
-       int exit_status = 0;
-       const char *argv[] = {
-               _PATH_BSHELL,
-               "-c",
-               command,
-               NULL
-       };
-
-       int rc = dt_launch_tool(&pid, (char **)(void *)argv, false, NULL, NULL);
-       if (rc != 0) {
-               return -1;
-       }
-       if (!dt_waitpid(pid, &exit_status, NULL, 30)) {
-               if (exit_status != 0) {
-                       return exit_status;
-               }
-               return -1;
-       }
-
-       return exit_status;
-}
-
-/*
- * Return the path to a temporary mount
- * that contains a usable HFS+ filesystem
- * mounted via a ram disk
- */
-static char *mkramdisk(void) {
-       char cmd[1024];
-       char *mount_path = mktempdir();
-       char *dev_disk_file = malloc(256);
-       atexit_b(^{ free(dev_disk_file); });
-       strcpy(dev_disk_file, "/tmp/dmc.ramdisk.XXXXXXXX");
-
-       T_WITH_ERRNO;
-       T_ASSERT_NOTNULL(mktemp(dev_disk_file), "Create temporary file to store dev disk for ramdisk");
-       atexit_b(^{ remove(dev_disk_file); });
-
-       // create the RAM disk device
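-       // (ram://10000 asks hdik for 10000 512-byte sectors, roughly 5 MB)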
-       snprintf(cmd, sizeof(cmd), "hdik -nomount ram://10000 > %s", dev_disk_file);
-       T_ASSERT_EQ_INT(0, system_legal(cmd), "Create ramdisk");
-
-       atexit_b(^{
-               char eject_cmd[1024];
-               unmount(mount_path, MNT_FORCE);
-               snprintf(eject_cmd, sizeof(eject_cmd), "hdik -e `cat %s`", dev_disk_file);
-               system_legal(eject_cmd);
-               remove(dev_disk_file);
-       });
-
-       // initialize as an HFS volume
-       snprintf(cmd, sizeof(cmd), "newfs_hfs `cat %s`", dev_disk_file);
-       T_ASSERT_EQ_INT(0, system_legal(cmd), "Initialize ramdisk as HFS");
-
-       // mount it
-       snprintf(cmd, sizeof(cmd), "mount -t hfs `cat %s` %s", dev_disk_file, mount_path);
-       T_ASSERT_EQ_INT(0, system_legal(cmd), "Mount ramdisk");
-
-       return mount_path;
-}
-
-static uint64_t time_for_read(int fd, const char *expected) {
-       int err;
-       ssize_t ret;
-       char buf[READSIZE];
-       uint64_t start, stop;
-
-       bzero(buf, sizeof(buf));
-       lseek(fd, 0, SEEK_SET);
-
-       start = dt_nanoseconds();
-       ret = read(fd, buf, READSIZE);
-       stop = dt_nanoseconds();
-
-       T_ASSERT_GE_LONG(ret, 0L, "read from temporary file");
-       T_ASSERT_EQ_LONG(ret, READSIZE, "read %ld bytes from temporary file", READSIZE);
-       err = memcmp(buf, expected, sizeof(buf));
-       T_ASSERT_EQ_INT(0, err, "read expected contents from temporary file");
-
-       return (stop - start);
-}
-
-static void perf_setup(char **path, int *fd) {
-       int temp_fd;
-       char *temp_path;
-
-       char *mount_path = mkramdisk();
-       temp_path = *path = malloc(PATH_MAX);
-       snprintf(temp_path, PATH_MAX, "%s/dmc.XXXXXXXX", mount_path);
-       atexit_b(^{ free(temp_path); });
-
-       T_ASSERT_NOTNULL(mktemp(temp_path), "Create temporary file");
-       atexit_b(^{ remove(temp_path); });
-
-       temp_fd = *fd = open(temp_path, O_RDWR | O_CREAT, 0644);
-       T_WITH_ERRNO;
-       T_ASSERT_GE_INT(temp_fd, 0, "Open temporary file for read/write");
-       atexit_b(^{ close(temp_fd); });
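-       // bypass the buffer cache so every timed read actually goes to the (conditioned) device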
-       fcntl(temp_fd, F_NOCACHE, 1);
-}
-#endif /* !TEST_UNENTITLED */
diff --git a/tools/tests/darwintests/drop_priv.c b/tools/tests/darwintests/drop_priv.c
deleted file mode 100644 (file)
index 7bb499c..0000000
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <darwintest.h>
-
-#include <TargetConditionals.h>
-#include <limits.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/errno.h>
-#include <unistd.h>
-
-#if !TARGET_OS_OSX
-#include <pwd.h>
-#include <sys/types.h>
-#include <uuid/uuid.h>
-#endif
-
-#if TARGET_OS_OSX
-#define INVOKER_UID "SUDO_UID"
-#define INVOKER_GID "SUDO_GID"
-#define ID_MAX (unsigned long)UINT_MAX
-static unsigned
-_get_sudo_invoker(const char *var)
-{
-    char *value_str = getenv(var);
-    T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(value_str,
-            "Not running under sudo, getenv(\"%s\") failed", var);
-    T_QUIET; T_ASSERT_NE_CHAR(*value_str, '\0',
-            "getenv(\"%s\") returned an empty string", var);
-
-    char *endp;
-    unsigned long value = strtoul(value_str, &endp, 10);
-    T_QUIET; T_WITH_ERRNO; T_ASSERT_EQ_CHAR(*endp, '\0',
-            "strtoul(\"%s\") not called on a valid number", value_str);
-    T_QUIET; T_WITH_ERRNO; T_ASSERT_NE_ULONG(value, ULONG_MAX,
-            "strtoul(\"%s\") overflow", value_str);
-
-    T_QUIET; T_ASSERT_NE_ULONG(value, 0ul, "%s invalid", var);
-    T_QUIET; T_ASSERT_LT_ULONG(value, ID_MAX, "%s invalid", var);
-    return (unsigned)value;
-}
-#endif /* TARGET_OS_OSX */
-
-void
-drop_priv(void);
-void
-drop_priv(void)
-{
-#if TARGET_OS_OSX
-    uid_t lower_uid = _get_sudo_invoker(INVOKER_UID);
-    gid_t lower_gid = _get_sudo_invoker(INVOKER_GID);
-#else
-    struct passwd *pw = getpwnam("mobile");
-    T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(pw, "getpwnam(\"mobile\")");
-    uid_t lower_uid = pw->pw_uid;
-    gid_t lower_gid = pw->pw_gid;
-#endif
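-    /* Drop the gid first: once setuid() succeeds the process can no longer change its gid. */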
-    T_ASSERT_POSIX_SUCCESS(setgid(lower_gid), "Change group to %u", lower_gid);
-    T_ASSERT_POSIX_SUCCESS(setuid(lower_uid), "Change user to %u", lower_uid);
-}
diff --git a/tools/tests/darwintests/freebsd_waitpid_nohang.c b/tools/tests/darwintests/freebsd_waitpid_nohang.c
deleted file mode 100644 (file)
index 9aa55e1..0000000
+++ /dev/null
@@ -1,62 +0,0 @@
-/*-
- * Copyright (c) 2016 Jilles Tjoelker
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <sys/cdefs.h>
-#include <sys/wait.h>
-
-#include <darwintest.h>
-#include <signal.h>
-#include <unistd.h>
-
-T_DECL(waitpid_nohang, "FreeBSDarwin--waitpid_nohang")
-{
-       pid_t child, pid;
-       int status, r;
-       siginfo_t siginfo;
-
-       child = fork();
-       T_ASSERT_POSIX_SUCCESS(child, "child forked successfully");
-       if (child == 0) {
-               sleep(10);
-               _exit(1);
-       }
-
-       status = 42;
-       pid = waitpid(child, &status, WNOHANG);
-       T_ASSERT_POSIX_ZERO(pid, "waitpid call is successful");
-       T_EXPECT_EQ(status, 42, "status is unaffected as expected");
-
-       r = kill(child, SIGTERM);
-       T_ASSERT_POSIX_ZERO(r, "signal sent successfully");
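-       /* WEXITED | WNOWAIT waits for the child to exit without reaping it, so the waitpid() below can still collect its status. */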
-       r = waitid(P_PID, (id_t)child, &siginfo, WEXITED | WNOWAIT);
-       T_ASSERT_POSIX_SUCCESS(r, "waitid call successful");
-
-       status = -1;
-       pid = waitpid(child, &status, WNOHANG);
-       T_ASSERT_EQ(pid, child, "waitpid returns correct pid");
-       T_EXPECT_EQ(WIFSIGNALED(status), true, "child was signaled"); 
-       T_EXPECT_EQ(WTERMSIG(status), SIGTERM, "child was sent SIGTERM");
-}
diff --git a/tools/tests/darwintests/gettimeofday.c b/tools/tests/darwintests/gettimeofday.c
deleted file mode 100644 (file)
index e2b8c3a..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <unistd.h>
-#include <sys/time.h>
-#include <mach/mach_time.h>
-
-#include <darwintest.h>
-
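-/* __gettimeofday is the raw syscall stub exported by libsyscall; the libc gettimeofday() wrapper may be serviced from the commpage instead. */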
-extern int __gettimeofday(struct timeval *, struct timezone *);
-
-T_DECL(gettimeofday, "gettimeofday()",
-          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       struct timeval tv_a, tv_b, tv_c;
-
-       T_ASSERT_POSIX_ZERO(gettimeofday(&tv_a, NULL), NULL);
-       T_ASSERT_GT(tv_a.tv_sec, 0L, NULL);
-
-       sleep(1);
-
-       T_ASSERT_POSIX_ZERO(__gettimeofday(&tv_b, NULL), NULL);
-       T_ASSERT_GE(tv_b.tv_sec, tv_a.tv_sec, NULL);
-
-       sleep(1);
-
-       T_ASSERT_POSIX_ZERO(gettimeofday(&tv_c, NULL), NULL);
-       T_ASSERT_GE(tv_c.tv_sec, tv_b.tv_sec, NULL);
-}
-
-#if 0 // This symbol isn't exported so we can't test with stock libsyscall
-extern int __gettimeofday_with_mach(struct timeval *, struct timezone *, uint64_t *mach_time);
-
-T_DECL(gettimeofday_with_mach, "gettimeofday_with_mach()",
-          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
-{
-       struct timeval gtod_ts;
-
-       uint64_t mach_time_before, mach_time, mach_time_after;
-
-       mach_time_before = mach_absolute_time();
-
-       T_ASSERT_POSIX_ZERO(__gettimeofday_with_mach(&gtod_ts, NULL, &mach_time), NULL);
-       T_ASSERT_GT(gtod_ts.tv_sec, 0L, NULL);
-
-       mach_time_after = mach_absolute_time();
-
-       T_LOG("%llx > %llx > %llx", mach_time_before, mach_time, mach_time_after);
-
-       T_ASSERT_LT(mach_time_before, mach_time, NULL);
-       T_ASSERT_GT(mach_time_after, mach_time, NULL);
-}
-#endif // 0
diff --git a/tools/tests/darwintests/gettimeofday_29192647.c b/tools/tests/darwintests/gettimeofday_29192647.c
deleted file mode 100644 (file)
index bd7b661..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <mach/mach_time.h>
-#include <sys/time.h>
-
-#include <darwintest.h>
-#include <darwintest_perf.h>
-
-T_DECL(gettimeofday_tl, "gettimeofday performance in tight loop") {
-       {
-               struct timeval time;
-               dt_stat_time_t s = dt_stat_time_create("gettimeofday tight loop");
-               T_STAT_MEASURE_LOOP(s){
-                       gettimeofday(&time, NULL);
-               }
-               dt_stat_finalize(s);
-       }
-}
-
-extern int __gettimeofday(struct timeval *, struct timezone *);
-T_DECL(__gettimeofday_tl, "__gettimeofday performance in tight loop") {
-       {
-               struct timeval time;
-
-               dt_stat_time_t s = dt_stat_time_create("__gettimeofday tight loop");
-               T_STAT_MEASURE_LOOP(s){
-                       __gettimeofday(&time, NULL);
-               }
-               dt_stat_finalize(s);
-       }
-}
-
-T_DECL(gettimeofday_sl, "gettimeofday performance in loop with sleep") {
-       {
-               struct timeval time;
-               dt_stat_time_t s = dt_stat_time_create("gettimeofday loop with sleep");
-               while (!dt_stat_stable(s)) {
-                       T_STAT_MEASURE_BATCH(s){
-                               gettimeofday(&time, NULL);
-                       }
-                       sleep(1);
-               }
-               dt_stat_finalize(s);
-       }
-}
diff --git a/tools/tests/darwintests/host_notifications.c b/tools/tests/darwintests/host_notifications.c
deleted file mode 100644 (file)
index c4463b3..0000000
+++ /dev/null
@@ -1,49 +0,0 @@
-#include <sys/time.h>
-#include <mach/mach.h>
-#include <mach/mach_host.h>
-
-#include <darwintest.h>
-
-static void do_test(int notify_type, void (^trigger_block)(void)){
-       mach_port_t port;
-       T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), NULL);
-
-       T_ASSERT_MACH_SUCCESS(host_request_notification(mach_host_self(), notify_type, port), NULL);
-
-       trigger_block();
-
-       struct {
-               mach_msg_header_t hdr;
-               mach_msg_trailer_t trailer;
-       } message = { .hdr = {
-               .msgh_bits = 0,
-               .msgh_size = sizeof(mach_msg_header_t),
-               .msgh_remote_port = MACH_PORT_NULL,
-               .msgh_local_port = port,
-               .msgh_voucher_port = MACH_PORT_NULL,
-               .msgh_id = 0,
-       }};
-
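-       /* The receive buffer only has room for a header and trailer, so a queued notification is reported as MACH_RCV_TOO_LARGE. */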
-       T_ASSERT_EQ(MACH_RCV_TOO_LARGE, mach_msg_receive(&message.hdr), NULL);
-       mach_msg_destroy(&message.hdr);
-}
-
-T_DECL(host_notify_calendar_change, "host_request_notification(HOST_NOTIFY_CALENDAR_CHANGE)", T_META_CHECK_LEAKS(false), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       do_test(HOST_NOTIFY_CALENDAR_CHANGE, ^{
-               struct timeval tm;
-               if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){
-                       T_SKIP("Unable to settimeofday()");
-               }
-       });
-}
-
-T_DECL(host_notify_calendar_set, "host_request_notification(HOST_NOTIFY_CALENDAR_SET)", T_META_CHECK_LEAKS(false), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       do_test(HOST_NOTIFY_CALENDAR_SET, ^{
-               struct timeval tm;
-               if (gettimeofday(&tm, NULL) != 0 || settimeofday(&tm, NULL) != 0){
-                       T_SKIP("Unable to settimeofday()");
-               }
-       });
-}
diff --git a/tools/tests/darwintests/host_statistics_rate_limiting.c b/tools/tests/darwintests/host_statistics_rate_limiting.c
deleted file mode 100644 (file)
index 8376db7..0000000
+++ /dev/null
@@ -1,191 +0,0 @@
-#include <unistd.h>
-#include <stdint.h>
-#include <sys/time.h>
-#include <System/sys/codesign.h>
-#include <mach/mach_time.h>
-#include <mach/mach.h>
-#include <darwintest.h>
-#include <stdlib.h>
-
-#if !defined(CS_OPS_CLEARPLATFORM)
-#define CS_OPS_CLEARPLATFORM 13
-#endif
-
-#define WINDOW 1 /* seconds */
-#define MAX_ATTEMP_PER_SEC 10
-#define ITER 30
-#define RETRY 5
-
-static int
-remove_platform_binary(void){
-       int ret;
-       uint32_t my_csflags;
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(csops(getpid(), CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)), NULL);
-
-       if (!(my_csflags & CS_PLATFORM_BINARY)) {
-               return 0;
-       }
-
-       ret = csops(getpid(), CS_OPS_CLEARPLATFORM, NULL, 0);
-       if (ret) {
-               switch (errno) {
-               case ENOTSUP:
-                       T_LOG("clearing platform binary not supported, skipping test");
-                       return -1;
-               default:
-                       T_LOG("csops failed with flag CS_OPS_CLEARPLATFORM");
-                       return -1;
-               }
-       }
-
-       my_csflags = 0;
-       T_QUIET; T_ASSERT_POSIX_ZERO(csops(getpid(), CS_OPS_STATUS, &my_csflags, sizeof(my_csflags)), NULL);
-
-       if (my_csflags & CS_PLATFORM_BINARY) {
-               T_LOG("platform binary flag still set");
-               return -1;
-       }
-
-       return 0;
-}
-
-struct all_host_info {
-       vm_statistics64_data_t host_vm_info64_rev0;
-       vm_statistics64_data_t host_vm_info64_rev1;
-       vm_extmod_statistics_data_t host_extmod_info64;
-       host_load_info_data_t host_load_info;
-       vm_statistics_data_t host_vm_info_rev0;
-       vm_statistics_data_t host_vm_info_rev1;
-       vm_statistics_data_t host_vm_info_rev2;
-       host_cpu_load_info_data_t host_cpu_load_info;
-       task_power_info_v2_data_t host_expired_task_info;
-       task_power_info_v2_data_t host_expired_task_info2;
-};
-
-static void
-check_host_info(struct all_host_info* data, unsigned long iter, char lett){
-       char* datap;
-       unsigned long i,j;
-
-       /* check that for the shorter revisions, no data is copied into the bytes that only exist in the longer revisions */
-       for ( j = 0 ; j < iter; j++) {
-               datap = (char*) &data[j].host_vm_info64_rev0;
-               for ( i = (HOST_VM_INFO64_REV0_COUNT * sizeof(int)); i< (HOST_VM_INFO64_REV1_COUNT * sizeof(int)); i++) {
-                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO64_REV0 byte %lu iter %lu", i, j);
-               }
-
-               datap = (char*) &data[j].host_vm_info_rev0;
-               for ( i = (HOST_VM_INFO_REV0_COUNT * sizeof(int)); i< (HOST_VM_INFO_REV2_COUNT * sizeof(int)); i++) {
-                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO_REV0 byte %lu iter %lu", i, j);
-               }
-
-               datap = (char*) &data[j].host_vm_info_rev1;
-               for ( i = (HOST_VM_INFO_REV1_COUNT * sizeof(int)); i< (HOST_VM_INFO_REV2_COUNT * sizeof(int)); i++) {
-                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "HOST_VM_INFO_REV1 byte %lu iter %lu", i, j);
-               }
-
-               datap = (char*) &data[j].host_expired_task_info;
-               for ( i = (TASK_POWER_INFO_COUNT * sizeof(int)); i< (TASK_POWER_INFO_V2_COUNT * sizeof(int)); i++) {
-                       T_QUIET;T_ASSERT_EQ(datap[i], lett, "TASK_POWER_INFO_COUNT byte %lu iter %lu", i, j);
-               }
-       }
-       T_LOG("No data overflow");
-
-       datap = (char*) data;
-
-       /* check that after MAX_ATTEMP_PER_SEC attempts the returned data are all identical (i.e. served from the cache) */
-       for ( i = 0 ; i < sizeof(struct all_host_info) ; i++ )
-               for ( j = MAX_ATTEMP_PER_SEC - 1 ; j < iter - 1; j++) {
-                       T_QUIET; T_ASSERT_EQ(datap[i+(j * sizeof(struct all_host_info))], datap[i+((j+1) * sizeof(struct all_host_info))], "all_host_info iter %lu does not match iter %lu", j, j+1);
-               }
-
-       T_LOG("Data was cached");
-}
-
-static void
-get_host_info(struct all_host_info* data, host_t self, int iter){
-       int i;
-       unsigned int count;
-       for (i = 0; i < iter; i++){
-               count = HOST_VM_INFO64_REV0_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev0, &count), NULL);
-               count = HOST_VM_INFO64_REV1_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_VM_INFO64, (host_info64_t)&data[i].host_vm_info64_rev1, &count), NULL);
-               count = HOST_EXTMOD_INFO64_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics64(self, HOST_EXTMOD_INFO64, (host_info64_t)&data[i].host_extmod_info64, &count), NULL);
-               count = HOST_LOAD_INFO_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_LOAD_INFO, (host_info_t)&data[i].host_load_info, &count), NULL);
-               count = HOST_VM_INFO_REV0_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev0, &count), NULL);
-               count = HOST_VM_INFO_REV1_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev1, &count), NULL);
-               count = HOST_VM_INFO_REV2_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_VM_INFO, (host_info_t)&data[i].host_vm_info_rev2, &count), NULL);
-               count = HOST_CPU_LOAD_INFO_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_CPU_LOAD_INFO, (host_info_t)&data[i].host_cpu_load_info, &count), NULL);
-               count = TASK_POWER_INFO_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info, &count), NULL);
-               count = TASK_POWER_INFO_V2_COUNT;
-               T_QUIET;T_ASSERT_POSIX_ZERO(host_statistics(self, HOST_EXPIRED_TASK_INFO, (host_info_t)&data[i].host_expired_task_info2, &count), NULL);
-
-       }
-
-}
-
-T_DECL(test_host_statistics, "testing rate limit for host_statistics",
-          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
-{
-
-       unsigned long long start, end, window;
-       int retry = 0;
-       host_t self;
-       char lett = 'a';
-       struct all_host_info* data;
-       mach_timebase_info_data_t timebaseInfo = { 0, 0 };
-
-       if (remove_platform_binary())
-               T_SKIP("Failed to remove platform binary");
-
-       data = malloc(ITER * sizeof(struct all_host_info));
-       T_QUIET;T_ASSERT_NE(data, NULL, "malloc");
-
-       /* check the size of the data structures against the bytes in COUNT */
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_vm_info64_rev0), HOST_VM_INFO64_COUNT * sizeof(int), "HOST_VM_INFO64_COUNT");
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_extmod_info64), HOST_EXTMOD_INFO64_COUNT * sizeof(int), "HOST_EXTMOD_INFO64_COUNT");
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_load_info), HOST_LOAD_INFO_COUNT * sizeof(int), "HOST_LOAD_INFO_COUNT");
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_vm_info_rev0), HOST_VM_INFO_COUNT * sizeof(int), "HOST_VM_INFO_COUNT");
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_cpu_load_info), HOST_CPU_LOAD_INFO_COUNT * sizeof(int), "HOST_CPU_LOAD_INFO_COUNT");
-       T_QUIET;T_ASSERT_EQ(sizeof(data[0].host_expired_task_info2), TASK_POWER_INFO_V2_COUNT * sizeof(int), "TASK_POWER_INFO_V2_COUNT");
-
-       /* check that the latest revision's count matches the generic COUNT */
-       T_QUIET;T_ASSERT_EQ(HOST_VM_INFO64_REV1_COUNT, HOST_VM_INFO64_COUNT, "HOST_VM_INFO64_REV1_COUNT");
-        T_QUIET;T_ASSERT_EQ(HOST_VM_INFO_REV2_COUNT, HOST_VM_INFO_COUNT, "HOST_VM_INFO_REV2_COUNT");
-
-       /* check that the previous revisions are smaller than the latest */
-       T_QUIET;T_ASSERT_LE(HOST_VM_INFO64_REV0_COUNT, HOST_VM_INFO64_REV1_COUNT, "HOST_VM_INFO64_REV0");
-        T_QUIET;T_ASSERT_LE(HOST_VM_INFO_REV0_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV0_COUNT");
-        T_QUIET;T_ASSERT_LE(HOST_VM_INFO_REV1_COUNT, HOST_VM_INFO_REV2_COUNT, "HOST_VM_INFO_REV1_COUNT");
-        T_QUIET;T_ASSERT_LE(TASK_POWER_INFO_COUNT,TASK_POWER_INFO_V2_COUNT, "TASK_POWER_INFO_COUNT");
-
-       memset(data, lett, ITER * sizeof(struct all_host_info));
-       self = mach_host_self();
-
-       T_QUIET;T_ASSERT_EQ(mach_timebase_info(&timebaseInfo), KERN_SUCCESS, NULL);
-       window = (WINDOW * NSEC_PER_SEC * timebaseInfo.denom) / timebaseInfo.numer;
-       retry = 0;
-
-       /* try to get ITER copies of host_info within the window, so that we are guaranteed to hit a cached copy */
-       do {
-               start = mach_continuous_time();
-               get_host_info(data, self, ITER);
-               end = mach_continuous_time();
-               retry++;
-       } while( (end - start > window) && retry <= RETRY);
-
-       if (retry <= RETRY)
-               check_host_info(data, ITER, lett);
-       else
-               T_SKIP("Failed to find window for test");
-}
-
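
A minimal standalone sketch of the mechanism the removed test exercised: per the test's premise, host_statistics() replies to a rate-limited (non-platform) caller are cached by the kernel, so two back-to-back HOST_VM_INFO queries issued inside the rate-limit window should come back byte-identical. Only public Mach APIs are used; this is an illustration, not part of the deleted file.

    #include <stdio.h>
    #include <string.h>
    #include <mach/mach.h>

    int main(void)
    {
        vm_statistics_data_t a, b;
        mach_msg_type_number_t count;
        host_t host = mach_host_self();

        count = HOST_VM_INFO_COUNT;
        if (host_statistics(host, HOST_VM_INFO, (host_info_t)&a, &count) != KERN_SUCCESS)
            return 1;

        count = HOST_VM_INFO_COUNT;
        if (host_statistics(host, HOST_VM_INFO, (host_info_t)&b, &count) != KERN_SUCCESS)
            return 1;

        /* A cached second reply compares equal byte for byte. */
        printf("replies identical: %d\n", memcmp(&a, &b, sizeof(a)) == 0);
        return 0;
    }
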
diff --git a/tools/tests/darwintests/ioperf.c b/tools/tests/darwintests/ioperf.c
deleted file mode 100644 (file)
index c2586ac..0000000
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <errno.h>
-#include <err.h>
-#include <string.h>
-#include <assert.h>
-#include <sysexits.h>
-#include <getopt.h>
-#include <spawn.h>
-#include <stdbool.h>
-#include <sys/sysctl.h>
-#include <mach/mach_time.h>
-#include <mach/mach.h>
-#include <mach/semaphore.h>
-#include <TargetConditionals.h>
-
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-#include <stdatomic.h>
-
-#define MAX_THREADS         32
-#define SPIN_SECS           6
-#define THR_SPINNER_PRI     63
-#define THR_MANAGER_PRI     62
-#define WARMUP_ITERATIONS   100
-#define FILE_SIZE           (16384 * 4096)
-#define IO_SIZE             4096
-#define IO_COUNT            2500
-
-static mach_timebase_info_data_t timebase_info;
-static semaphore_t semaphore;
-static semaphore_t worker_sem;
-static uint32_t g_numcpus;
-static _Atomic uint32_t keep_going = 1;
-int test_file_fd = 0;
-char *data_buf = NULL;
-extern char **environ;
-
-static struct {
-    pthread_t thread;
-} threads[MAX_THREADS];
-
-static uint64_t 
-nanos_to_abs(uint64_t nanos) 
-{ 
-    return nanos * timebase_info.denom / timebase_info.numer;
-}
-
-static void
-io_perf_test_io_init(void)
-{
-    int spawn_ret, pid;
-    char *const mount_args[] = {"/usr/local/sbin/mount_nand.sh", NULL};
-    spawn_ret = posix_spawn(&pid, mount_args[0], NULL, NULL, mount_args, environ);
-    if (spawn_ret < 0) {
-       T_SKIP("NAND mounting in LTE not possible on this device. Skipping test!");
-    }
-    waitpid(pid, &spawn_ret, 0);
-    if (WIFEXITED(spawn_ret) && !WEXITSTATUS(spawn_ret)) {
-        T_PASS("NAND mounted successfully");
-    } else {
-        T_SKIP("Unable to mount NAND. Skipping test!");
-    }
-
-    /* Mark the main thread as fixed priority */
-    struct sched_param param = {.sched_priority = THR_MANAGER_PRI};
-    T_ASSERT_POSIX_ZERO(pthread_setschedparam(pthread_self(), SCHED_FIFO, &param),
-            "pthread_setschedparam");
-
-    /* Set I/O Policy to Tier 0 */
-    T_ASSERT_POSIX_ZERO(setiopolicy_np(IOPOL_TYPE_DISK, IOPOL_SCOPE_PROCESS,
-            IOPOL_IMPORTANT), "setiopolicy");
-
-    /* Create data buffer */
-    data_buf = malloc(IO_SIZE * 16);
-    T_ASSERT_NOTNULL(data_buf, "Data buffer allocation");
-
-    int rndfd = open("/dev/urandom", O_RDONLY, S_IRUSR);
-    T_ASSERT_POSIX_SUCCESS(rndfd, "Open /dev/urandom");
-    T_ASSERT_GE_INT((int)read(rndfd, data_buf, IO_SIZE * 16), 0, "read /dev/urandom");
-    close(rndfd);
-
-    /* Create test file */
-    int fd = open("/mnt2/test", O_CREAT | O_WRONLY, S_IRUSR);
-    T_ASSERT_POSIX_SUCCESS(fd, "Open /mnt2/test for writing!");
-
-    T_ASSERT_POSIX_ZERO(fcntl(fd, F_NOCACHE, 1), "fcntl F_NOCACHE enable");
-    for (int size = 0; size < FILE_SIZE;) {
-        T_QUIET;
-        T_ASSERT_GE_INT((int)write(fd, data_buf, IO_SIZE * 16), 0, "write test file");
-        size += (IO_SIZE * 16);
-    }
-    close(fd);
-    sync();
-
-}
-
-static pthread_t
-create_thread(uint32_t thread_id, uint32_t priority, bool fixpri, 
-        void *(*start_routine)(void *))
-{
-    int rv;
-    pthread_t new_thread;
-    struct sched_param param = { .sched_priority = (int)priority };
-    pthread_attr_t attr;
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_init(&attr), "pthread_attr_init");
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_setschedparam(&attr, &param),
-            "pthread_attr_setschedparam");
-
-    if (fixpri) {
-        T_ASSERT_POSIX_ZERO(pthread_attr_setschedpolicy(&attr, SCHED_RR),
-                "pthread_attr_setschedpolicy");
-    }
-
-    T_ASSERT_POSIX_ZERO(pthread_create(&new_thread, &attr, start_routine,
-            (void*)(uintptr_t)thread_id), "pthread_create");
-
-    T_ASSERT_POSIX_ZERO(pthread_attr_destroy(&attr), "pthread_attr_destroy");
-
-    threads[thread_id].thread = new_thread;
-
-    return new_thread;
-}
-
-/* Spin until a specified number of seconds elapses */
-static void
-spin_for_duration(uint32_t seconds)
-{
-    uint64_t duration       = nanos_to_abs((uint64_t)seconds * NSEC_PER_SEC);
-    uint64_t current_time   = mach_absolute_time();
-    uint64_t timeout        = duration + current_time;
-
-    uint64_t spin_count = 0;
-
-    while (mach_absolute_time() < timeout && atomic_load_explicit(&keep_going,
-               memory_order_relaxed)) {
-        spin_count++;
-    }
-}
-
-static void *
-spin_thread(void *arg)
-{
-    uint32_t thread_id = (uint32_t) arg;
-    char name[30] = "";
-
-    snprintf(name, sizeof(name), "spin thread %2d", thread_id);
-    pthread_setname_np(name);
-    T_ASSERT_MACH_SUCCESS(semaphore_wait_signal(semaphore, worker_sem),
-            "semaphore_wait_signal");
-    spin_for_duration(SPIN_SECS);
-    return NULL;
-}
-
-void
-perform_io(dt_stat_time_t stat)
-{
-    /* Open the test data file */
-    int test_file_fd = open("/mnt2/test", O_RDONLY);
-    T_WITH_ERRNO;
-    T_ASSERT_POSIX_SUCCESS(test_file_fd, "Open test data file");
-
-    /* Disable caching and read-ahead for the file */
-    T_ASSERT_POSIX_ZERO(fcntl(test_file_fd, F_NOCACHE, 1), "fcntl F_NOCACHE enable");
-    T_ASSERT_POSIX_ZERO(fcntl(test_file_fd, F_RDAHEAD, 0), "fcntl F_RDAHEAD disable");
-
-    uint32_t count = 0;
-    int ret;
-
-    for (int i=0; i < WARMUP_ITERATIONS; i++) {
-        /* Warmup loop */
-        read(test_file_fd, data_buf, IO_SIZE);
-    }
-    
-    do {
-        T_STAT_MEASURE(stat) {
-            ret = read(test_file_fd, data_buf, IO_SIZE);
-        }
-        if (ret == 0) {
-            T_QUIET;
-            T_ASSERT_POSIX_SUCCESS(lseek(test_file_fd, 0, SEEK_SET), "lseek begin");
-        } else if (ret < 0) {
-            T_FAIL("read failure");
-            T_END;
-        }
-        count++;
-    } while(count < IO_COUNT);
-    close(test_file_fd);
-}
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.io"));
-
-/* Disable the test on MacOS for now */
-T_DECL(read_perf, "Sequential Uncached Read Performance", T_META_TYPE_PERF, T_META_CHECK_LEAKS(NO), T_META_ASROOT(YES), T_META_LTEPHASE(LTE_POSTINIT))
-{
-
-#if !CONFIG_EMBEDDED
-    T_SKIP("Not supported on MacOS");
-#endif /* !CONFIG_EMBEDDED */
-
-    io_perf_test_io_init();
-    pthread_setname_np("main thread");
-
-    T_ASSERT_MACH_SUCCESS(mach_timebase_info(&timebase_info), "mach_timebase_info");
-
-    dt_stat_time_t seq_noload = dt_stat_time_create("sequential read latency (CPU idle)");
-    perform_io(seq_noload);
-    dt_stat_finalize(seq_noload);
-
-    /* 
-     * We create spinner threads for this test so that all other cores are 
-     * busy. That way the I/O issue thread has to context switch to the 
-     * IOWorkLoop thread and back for the I/O. 
-     */
-    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &semaphore,
-            SYNC_POLICY_FIFO, 0), "semaphore_create");
-
-    T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &worker_sem,
-            SYNC_POLICY_FIFO, 0), "semaphore_create");
-    
-    size_t ncpu_size = sizeof(g_numcpus);
-    T_ASSERT_POSIX_SUCCESS(sysctlbyname("hw.ncpu", &g_numcpus, &ncpu_size, NULL, 0),
-            "sysctlbyname(hw.ncpu)");
-
-    T_LOG("hw.ncpu: %d\n", g_numcpus);
-    uint32_t n_spinners = g_numcpus - 1;
-
-    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
-        threads[thread_id].thread = create_thread(thread_id, THR_SPINNER_PRI,
-                true, &spin_thread);
-    }
-
-    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
-        T_ASSERT_MACH_SUCCESS(semaphore_wait(worker_sem), "semaphore_wait");
-    }
-
-    T_ASSERT_MACH_SUCCESS(semaphore_signal_all(semaphore), "semaphore_signal");
-    
-    dt_stat_time_t seq_load = dt_stat_time_create("sequential read latency (Single CPU)");
-    perform_io(seq_load);
-    dt_stat_finalize(seq_load);
-    
-    atomic_store_explicit(&keep_going, 0, memory_order_relaxed);
-    for (uint32_t thread_id = 0; thread_id < n_spinners; thread_id++) {
-        T_ASSERT_POSIX_ZERO(pthread_join(threads[thread_id].thread, NULL),
-                "pthread_join %d", thread_id);
-    }
-}
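
As a hedged illustration of the measurement core of the removed test, the sketch below times a single uncached, non-readahead 4 KiB read with mach_absolute_time(). The file path is taken from the command line as a placeholder; it is not the /mnt2/test file the deleted test created, and the darwintest harness and spinner threads are omitted.

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <mach/mach_time.h>

    int main(int argc, char *argv[])
    {
        if (argc < 2)
            return 1;
        int fd = open(argv[1], O_RDONLY);
        if (fd < 0)
            return 1;

        /* Bypass the buffer cache and read-ahead, as the deleted test did. */
        fcntl(fd, F_NOCACHE, 1);
        fcntl(fd, F_RDAHEAD, 0);

        char buf[4096];
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        uint64_t t0 = mach_absolute_time();
        ssize_t n = read(fd, buf, sizeof(buf));
        uint64_t t1 = mach_absolute_time();
        close(fd);

        if (n < 0)
            return 1;
        printf("read %zd bytes in %llu ns\n", n, (t1 - t0) * tb.numer / tb.denom);
        return 0;
    }
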
diff --git a/tools/tests/darwintests/jumbo_va_spaces_28530648.c b/tools/tests/darwintests/jumbo_va_spaces_28530648.c
deleted file mode 100644 (file)
index aa081f3..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-
-#define GB (1ULL * 1024 * 1024 * 1024)
-
-/*
- * This test expects the entitlement to be the enabling factor for a process to
- * allocate at least this many GB of VA space. i.e. with the entitlement, n GB
- * must be allocatable; whereas without it, it must be less.
- */
-#define ALLOC_TEST_GB 54
-
-T_DECL(jumbo_va_spaces_28530648,
-       "Verify that the \"dynamic-codesigning\" entitlement is required to utilize an extra-large "
-       "VA space on arm64",
-       T_META_NAMESPACE("xnu.vm"),
-       T_META_CHECK_LEAKS(false))
-{
-       int     i;
-       void    *res;
-
-       if (!dt_64_bit_kernel()) {
-               T_SKIP("This test is only applicable to arm64");
-       }
-
-       T_LOG("Attempting to allocate VA space in 1 GB chunks.");
-
-       for (i = 0; i < (ALLOC_TEST_GB * 2); i++) {
-               res = mmap(NULL, 1 * GB, PROT_NONE, MAP_PRIVATE | MAP_ANON, 0, 0);
-               if (res == MAP_FAILED) {
-                       if (errno != ENOMEM) {
-                               T_WITH_ERRNO;
-                               T_LOG("mmap failed: stopped at %d of %d GB allocated", i, ALLOC_TEST_GB);
-                       }
-                       break;
-               } else {
-                       T_LOG("%d: %p\n", i, res);
-               }
-       }
-
-#if defined(ENTITLED)
-       T_EXPECT_GE_INT(i, ALLOC_TEST_GB, "Allocate at least %d GB of VA space", ALLOC_TEST_GB);
-#else
-       T_EXPECT_LT_INT(i, ALLOC_TEST_GB, "Not permitted to allocate %d GB of VA space", ALLOC_TEST_GB);
-#endif
-}
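
A minimal standalone sketch of the probing technique used above: reserve PROT_NONE 1 GB chunks until mmap() fails, report the count, and unmap everything. The 256-chunk cap is an arbitrary illustration value and is unrelated to the 54 GB entitlement threshold in the deleted test.

    #include <stdio.h>
    #include <sys/mman.h>

    #define GB (1ULL * 1024 * 1024 * 1024)

    int main(void)
    {
        void *chunks[256];
        int n = 0;

        while (n < 256) {
            void *p = mmap(NULL, 1 * GB, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
            if (p == MAP_FAILED)
                break;
            chunks[n++] = p;
        }
        printf("reserved %d GB of VA space\n", n);

        /* Unlike the test, release the reservations before exiting. */
        for (int i = 0; i < n; i++)
            munmap(chunks[i], 1 * GB);
        return 0;
    }
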
diff --git a/tools/tests/darwintests/jumbo_va_spaces_28530648.entitlements b/tools/tests/darwintests/jumbo_va_spaces_28530648.entitlements
deleted file mode 100644 (file)
index 9a1d0fb..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>dynamic-codesigning</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/kdebug.c b/tools/tests/darwintests/kdebug.c
deleted file mode 100644 (file)
index 3cc0e22..0000000
+++ /dev/null
@@ -1,690 +0,0 @@
-#include <darwintest.h>
-#include <dispatch/dispatch.h>
-#include <inttypes.h>
-#include <ktrace/session.h>
-#include <ktrace/private.h>
-#include <mach/dyld_kernel.h>
-#include <mach/host_info.h>
-#include <mach/mach.h>
-#include <mach/mach_init.h>
-#include <mach/task.h>
-#include <os/assumes.h>
-#include <sys/kdebug.h>
-#include <sys/kdebug_signpost.h>
-#include <sys/sysctl.h>
-
-#define KDBG_TEST_MACROS    1
-#define KDBG_TEST_OLD_TIMES 2
-
-static void
-assert_kdebug_test(unsigned int flavor)
-{
-    size_t size = flavor;
-    int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDTEST };
-    T_ASSERT_POSIX_SUCCESS(
-        sysctl(mib, sizeof(mib) / sizeof(mib[0]), NULL, &size, NULL, 0),
-        "KERN_KDTEST sysctl");
-}
-
-#pragma mark kdebug syscalls
-
-#define TRACE_DEBUGID (0xfedfed00U)
-
-T_DECL(kdebug_trace_syscall, "test that kdebug_trace(2) emits correct events",
-       T_META_ASROOT(true))
-{
-    ktrace_session_t s;
-    __block int events_seen = 0;
-
-    s = ktrace_session_create();
-    os_assert(s != NULL);
-
-    ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){});
-    ktrace_events_single(s, TRACE_DEBUGID, ^void(struct trace_point *tp) {
-        events_seen++;
-        T_PASS("saw traced event");
-
-        T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of traced event is correct");
-        T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of traced event is correct");
-        T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of traced event is correct");
-        T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of traced event is correct");
-
-        ktrace_end(s, 1);
-    });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        T_EXPECT_GE(events_seen, 1, NULL);
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    ktrace_filter_pid(s, getpid());
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-    T_ASSERT_POSIX_SUCCESS(kdebug_trace(TRACE_DEBUGID, 1, 2, 3, 4), NULL);
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
-
-#define SIGNPOST_SINGLE_CODE  (0x10U)
-#define SIGNPOST_PAIRED_CODE  (0x20U)
-
-T_DECL(kdebug_signpost_syscall,
-    "test that kdebug_signpost(2) emits correct events",
-    T_META_ASROOT(true))
-{
-    ktrace_session_t s;
-    __block int single_seen = 0;
-    __block int paired_seen = 0;
-
-    s = ktrace_session_create();
-    T_ASSERT_NOTNULL(s, NULL);
-
-    /* make sure to get enough events for the KDBUFWAIT to trigger */
-    // ktrace_events_class(s, DBG_MACH, ^(__unused struct trace_point *tp){});
-    ktrace_events_single(s,
-        APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_SINGLE_CODE),
-        ^void(struct trace_point *tp)
-    {
-        single_seen++;
-        T_PASS("single signpost is traced");
-
-        T_EXPECT_EQ(tp->arg1, 1UL, "argument 1 of single signpost is correct");
-        T_EXPECT_EQ(tp->arg2, 2UL, "argument 2 of single signpost is correct");
-        T_EXPECT_EQ(tp->arg3, 3UL, "argument 3 of single signpost is correct");
-        T_EXPECT_EQ(tp->arg4, 4UL, "argument 4 of single signpost is correct");
-    });
-
-    ktrace_events_single_paired(s,
-        APPSDBG_CODE(DBG_APP_SIGNPOST, SIGNPOST_PAIRED_CODE),
-        ^void(struct trace_point *start, struct trace_point *end)
-    {
-        paired_seen++;
-        T_PASS("paired signposts are traced");
-
-        T_EXPECT_EQ(start->arg1, 5UL, "argument 1 of start signpost is correct");
-        T_EXPECT_EQ(start->arg2, 6UL, "argument 2 of start signpost is correct");
-        T_EXPECT_EQ(start->arg3, 7UL, "argument 3 of start signpost is correct");
-        T_EXPECT_EQ(start->arg4, 8UL, "argument 4 of start signpost is correct");
-
-        T_EXPECT_EQ(end->arg1, 9UL, "argument 1 of end signpost is correct");
-        T_EXPECT_EQ(end->arg2, 10UL, "argument 2 of end signpost is correct");
-        T_EXPECT_EQ(end->arg3, 11UL, "argument 3 of end signpost is correct");
-        T_EXPECT_EQ(end->arg4, 12UL, "argument 4 of end signpost is correct");
-
-        T_EXPECT_EQ(single_seen, 1,
-            "signposts are traced in the correct order");
-
-        ktrace_end(s, 1);
-    });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        if (single_seen == 0) {
-            T_FAIL("did not see single tracepoint before timeout");
-        }
-        if (paired_seen == 0) {
-            T_FAIL("did not see paired tracepoints before timeout");
-        }
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    ktrace_filter_pid(s, getpid());
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    T_EXPECT_POSIX_SUCCESS(kdebug_signpost(
-        SIGNPOST_SINGLE_CODE, 1, 2, 3, 4), NULL);
-    T_EXPECT_POSIX_SUCCESS(kdebug_signpost_start(
-        SIGNPOST_PAIRED_CODE, 5, 6, 7, 8), NULL);
-    T_EXPECT_POSIX_SUCCESS(kdebug_signpost_end(
-        SIGNPOST_PAIRED_CODE, 9, 10, 11, 12), NULL);
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
-
-#pragma mark kdebug behaviors
-
-#define WRAPPING_EVENTS_COUNT     (150000)
-#define TRACE_ITERATIONS          (5000)
-#define WRAPPING_EVENTS_THRESHOLD (100)
-
-T_DECL(wrapping,
-    "ensure that a wrapped trace reports lost events and contains no events from before the wrap",
-    T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-    ktrace_session_t s;
-    __block int events = 0;
-    int mib[4];
-    size_t needed;
-    kbufinfo_t buf_info;
-    int wait_wrapping_secs = (WRAPPING_EVENTS_COUNT / TRACE_ITERATIONS) + 5;
-    int current_secs = wait_wrapping_secs;
-
-    /* use sysctls manually to bypass libktrace assumptions */
-
-    mib[0] = CTL_KERN; mib[1] = KERN_KDEBUG; mib[2] = KERN_KDSETUP; mib[3] = 0;
-    needed = 0;
-    T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, NULL, &needed, NULL, 0),
-        "KERN_KDSETUP");
-
-    mib[2] = KERN_KDSETBUF; mib[3] = WRAPPING_EVENTS_COUNT;
-    T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDSETBUF");
-
-    mib[2] = KERN_KDENABLE; mib[3] = 1;
-    T_ASSERT_POSIX_SUCCESS(sysctl(mib, 4, NULL, 0, NULL, 0), "KERN_KDENABLE");
-
-    /* wrapping is on by default */
-
-    /* wait until wrapped */
-    T_LOG("waiting for trace to wrap");
-    mib[2] = KERN_KDGETBUF;
-    needed = sizeof(buf_info);
-    do {
-        sleep(1);
-        for (int i = 0; i < TRACE_ITERATIONS; i++) {
-            T_QUIET;
-            T_ASSERT_POSIX_SUCCESS(kdebug_trace(0xfefe0000, 0, 0, 0, 0), NULL);
-        }
-        T_QUIET;
-        T_ASSERT_POSIX_SUCCESS(sysctl(mib, 3, &buf_info, &needed, NULL, 0),
-            NULL);
-    } while (!(buf_info.flags & KDBG_WRAPPED) && --current_secs > 0);
-
-    T_ASSERT_TRUE(buf_info.flags & KDBG_WRAPPED,
-        "trace wrapped (after %d seconds within %d second timeout)",
-        wait_wrapping_secs - current_secs, wait_wrapping_secs);
-
-    s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, NULL);
-    T_QUIET; T_ASSERT_POSIX_ZERO(ktrace_set_use_existing(s), NULL);
-
-    ktrace_events_all(s, ^void(struct trace_point *tp) {
-        if (events == 0) {
-            T_EXPECT_EQ(tp->debugid, (unsigned int)TRACE_LOST_EVENTS,
-                "first event's debugid 0x%08x (%s) should be TRACE_LOST_EVENTS",
-                tp->debugid,
-                ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK));
-        } else {
-            T_QUIET;
-            T_EXPECT_NE(tp->debugid, (unsigned int)TRACE_LOST_EVENTS,
-                "event debugid 0x%08x (%s) should not be TRACE_LOST_EVENTS",
-                tp->debugid,
-                ktrace_name_for_eventid(s, tp->debugid & KDBG_EVENTID_MASK));
-        }
-
-        events++;
-        if (events > WRAPPING_EVENTS_THRESHOLD) {
-            ktrace_end(s, 1);
-        }
-    });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    dispatch_main();
-}
-
-T_DECL(reject_old_events,
-        "ensure that kdebug rejects events from before tracing began",
-        T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-    __block uint64_t event_horizon_ts;
-    __block int events = 0;
-
-    ktrace_session_t s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-    ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
-        KDBG_EVENTID(DBG_BSD + 1, 0, 0),
-        ^(struct trace_point *tp)
-    {
-        events++;
-        T_EXPECT_GT(tp->timestamp, event_horizon_ts,
-                "events in trace should be from after tracing began");
-    });
-
-    ktrace_set_completion_handler(s, ^{
-        T_EXPECT_EQ(events, 2, "should see only two events");
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    event_horizon_ts = mach_absolute_time();
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-    /* first, try an old event at the beginning of trace */
-    assert_kdebug_test(KDBG_TEST_OLD_TIMES);
-    /* after a good event has been traced, old events should be rejected */
-    assert_kdebug_test(KDBG_TEST_OLD_TIMES);
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
-
-#define ORDERING_TIMEOUT_SEC 5
-
-T_DECL(ascending_time_order,
-        "ensure that kdebug events are in ascending order based on time",
-        T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-    __block uint64_t prev_ts = 0;
-    __block uint32_t prev_debugid = 0;
-    __block unsigned int prev_cpu = 0;
-    __block bool in_order = true;
-
-    ktrace_session_t s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-    ktrace_events_all(s, ^(struct trace_point *tp) {
-        if (tp->timestamp < prev_ts) {
-            in_order = false;
-            T_FAIL("found timestamps out of order");
-            T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)",
-                    prev_ts, prev_debugid, prev_cpu);
-            T_LOG("%" PRIu64 ": %#" PRIx32 " (cpu %d)",
-                    tp->timestamp, tp->debugid, tp->cpuid);
-        }
-    });
-
-    ktrace_set_completion_handler(s, ^{
-        ktrace_session_destroy(s);
-        T_EXPECT_TRUE(in_order, "event timestamps were in-order");
-        T_END;
-    });
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    /* try to inject old timestamps into trace */
-    assert_kdebug_test(KDBG_TEST_OLD_TIMES);
-
-    dispatch_after(dispatch_time(DISPATCH_TIME_NOW, ORDERING_TIMEOUT_SEC * NSEC_PER_SEC),
-            dispatch_get_main_queue(), ^{
-        T_LOG("ending test after timeout");
-        ktrace_end(s, 1);
-    });
-
-    dispatch_main();
-
-}
-
-#pragma mark dyld tracing
-
-__attribute__((aligned(8)))
-static const char map_uuid[16] = "map UUID";
-
-__attribute__((aligned(8)))
-static const char unmap_uuid[16] = "unmap UUID";
-
-__attribute__((aligned(8)))
-static const char sc_uuid[16] = "shared UUID";
-
-static fsid_t map_fsid = { .val = { 42, 43 } };
-static fsid_t unmap_fsid = { .val = { 44, 45 } };
-static fsid_t sc_fsid = { .val = { 46, 47 } };
-
-static fsobj_id_t map_fsobjid = { .fid_objno = 42, .fid_generation = 43 };
-static fsobj_id_t unmap_fsobjid = { .fid_objno = 44, .fid_generation = 45 };
-static fsobj_id_t sc_fsobjid = { .fid_objno = 46, .fid_generation = 47 };
-
-#define MAP_LOAD_ADDR   0xabadcafe
-#define UNMAP_LOAD_ADDR 0xfeedface
-#define SC_LOAD_ADDR    0xfedfaced
-
-__unused
-static void
-expect_dyld_image_info(struct trace_point *tp, const uint64_t *exp_uuid,
-    uint64_t exp_load_addr, fsid_t *exp_fsid, fsobj_id_t *exp_fsobjid,
-    int order)
-{
-#if defined(__LP64__)
-    if (order == 0) {
-        uint64_t uuid[2];
-        uint64_t load_addr;
-        fsid_t fsid;
-
-        uuid[0] = (uint64_t)tp->arg1;
-        uuid[1] = (uint64_t)tp->arg2;
-        load_addr = (uint64_t)tp->arg3;
-        fsid.val[0] = (int32_t)(tp->arg4 & UINT32_MAX);
-        fsid.val[1] = (int32_t)((uint64_t)tp->arg4 >> 32);
-
-        T_QUIET; T_EXPECT_EQ(uuid[0], exp_uuid[0], NULL);
-        T_QUIET; T_EXPECT_EQ(uuid[1], exp_uuid[1], NULL);
-        T_QUIET; T_EXPECT_EQ(load_addr, exp_load_addr, NULL);
-        T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL);
-        T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL);
-    } else if (order == 1) {
-        fsobj_id_t fsobjid;
-
-        fsobjid.fid_objno = (uint32_t)(tp->arg1 & UINT32_MAX);
-        fsobjid.fid_generation = (uint32_t)((uint64_t)tp->arg1 >> 32);
-
-        T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL);
-        T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation,
-            exp_fsobjid->fid_generation, NULL);
-    } else {
-        T_ASSERT_FAIL("unrecognized order of events %d", order);
-    }
-#else /* defined(__LP64__) */
-    if (order == 0) {
-        uint32_t uuid[4];
-
-        uuid[0] = (uint32_t)tp->arg1;
-        uuid[1] = (uint32_t)tp->arg2;
-        uuid[2] = (uint32_t)tp->arg3;
-        uuid[3] = (uint32_t)tp->arg4;
-
-        T_QUIET; T_EXPECT_EQ(uuid[0], (uint32_t)exp_uuid[0], NULL);
-        T_QUIET; T_EXPECT_EQ(uuid[1], (uint32_t)(exp_uuid[0] >> 32), NULL);
-        T_QUIET; T_EXPECT_EQ(uuid[2], (uint32_t)exp_uuid[1], NULL);
-        T_QUIET; T_EXPECT_EQ(uuid[3], (uint32_t)(exp_uuid[1] >> 32), NULL);
-    } else if (order == 1) {
-        uint32_t load_addr;
-        fsid_t fsid;
-        fsobj_id_t fsobjid;
-
-        load_addr = (uint32_t)tp->arg1;
-        fsid.val[0] = (int32_t)tp->arg2;
-        fsid.val[1] = (int32_t)tp->arg3;
-        fsobjid.fid_objno = (uint32_t)tp->arg4;
-
-        T_QUIET; T_EXPECT_EQ(load_addr, (uint32_t)exp_load_addr, NULL);
-        T_QUIET; T_EXPECT_EQ(fsid.val[0], exp_fsid->val[0], NULL);
-        T_QUIET; T_EXPECT_EQ(fsid.val[1], exp_fsid->val[1], NULL);
-        T_QUIET; T_EXPECT_EQ(fsobjid.fid_objno, exp_fsobjid->fid_objno, NULL);
-    } else if (order == 2) {
-        fsobj_id_t fsobjid;
-
-        fsobjid.fid_generation = tp->arg1;
-
-        T_QUIET; T_EXPECT_EQ(fsobjid.fid_generation,
-            exp_fsobjid->fid_generation, NULL);
-    } else {
-        T_ASSERT_FAIL("unrecognized order of events %d", order);
-    }
-#endif /* defined(__LP64__) */
-}
-
-#if defined(__LP64__)
-#define DYLD_CODE_OFFSET (0)
-#define DYLD_EVENTS      (2)
-#else
-#define DYLD_CODE_OFFSET (2)
-#define DYLD_EVENTS      (3)
-#endif
-
-static void
-expect_dyld_events(ktrace_session_t s, const char *name, uint32_t base_code,
-    const char *exp_uuid, uint64_t exp_load_addr, fsid_t *exp_fsid,
-    fsobj_id_t *exp_fsobjid, uint8_t *saw_events)
-{
-    for (int i = 0; i < DYLD_EVENTS; i++) {
-        ktrace_events_single(s,
-            KDBG_EVENTID(DBG_DYLD, DBG_DYLD_UUID,
-            base_code + DYLD_CODE_OFFSET + (unsigned int)i),
-            ^(struct trace_point *tp)
-        {
-            T_LOG("checking %s event %c", name, 'A' + i);
-            expect_dyld_image_info(tp, (const void *)exp_uuid, exp_load_addr,
-                exp_fsid, exp_fsobjid, i);
-            *saw_events |= (1U << i);
-        });
-    }
-}
-
-T_DECL(dyld_events, "test that dyld registering libraries emits events",
-    T_META_ASROOT(true))
-{
-    ktrace_session_t s;
-    dyld_kernel_image_info_t info;
-
-    /*
-     * Use pointers instead of __block variables in order to use these variables
-     * in the completion block below _and_ pass pointers to them to the
-     * expect_dyld_events function.
-     */
-    uint8_t saw_events[3] = { 0 };
-    uint8_t *saw_mapping = &(saw_events[0]);
-    uint8_t *saw_unmapping = &(saw_events[1]);
-    uint8_t *saw_shared_cache = &(saw_events[2]);
-
-    s = ktrace_session_create();
-    T_ASSERT_NOTNULL(s, NULL);
-    T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
-
-    expect_dyld_events(s, "mapping", DBG_DYLD_UUID_MAP_A, map_uuid,
-        MAP_LOAD_ADDR, &map_fsid, &map_fsobjid, saw_mapping);
-    expect_dyld_events(s, "unmapping", DBG_DYLD_UUID_UNMAP_A, unmap_uuid,
-        UNMAP_LOAD_ADDR, &unmap_fsid, &unmap_fsobjid, saw_unmapping);
-    expect_dyld_events(s, "shared cache", DBG_DYLD_UUID_SHARED_CACHE_A,
-        sc_uuid, SC_LOAD_ADDR, &sc_fsid, &sc_fsobjid, saw_shared_cache);
-
-    ktrace_set_completion_handler(s, ^(void) {
-        T_EXPECT_EQ(__builtin_popcount(*saw_mapping), DYLD_EVENTS, NULL);
-        T_EXPECT_EQ(__builtin_popcount(*saw_unmapping), DYLD_EVENTS, NULL);
-        T_EXPECT_EQ(__builtin_popcount(*saw_shared_cache), DYLD_EVENTS, NULL);
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    info.load_addr = MAP_LOAD_ADDR;
-    memcpy(info.uuid, map_uuid, sizeof(info.uuid));
-    info.fsid = map_fsid;
-    info.fsobjid = map_fsobjid;
-    T_EXPECT_MACH_SUCCESS(task_register_dyld_image_infos(mach_task_self(),
-        &info, 1), NULL);
-
-    info.load_addr = UNMAP_LOAD_ADDR;
-    memcpy(info.uuid, unmap_uuid, sizeof(info.uuid));
-    info.fsid = unmap_fsid;
-    info.fsobjid = unmap_fsobjid;
-    T_EXPECT_MACH_SUCCESS(task_unregister_dyld_image_infos(mach_task_self(),
-        &info, 1), NULL);
-
-    info.load_addr = SC_LOAD_ADDR;
-    memcpy(info.uuid, sc_uuid, sizeof(info.uuid));
-    info.fsid = sc_fsid;
-    info.fsobjid = sc_fsobjid;
-    T_EXPECT_MACH_SUCCESS(task_register_dyld_shared_cache_image_info(
-        mach_task_self(), info, FALSE, FALSE), NULL);
-
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
-
-#pragma mark kdebug kernel macros
-
-#define EXP_KERNEL_EVENTS 5U
-
-static const uint32_t dev_evts[EXP_KERNEL_EVENTS] = {
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 0),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 1),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 2),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 3),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 4),
-};
-
-static const uint32_t rel_evts[EXP_KERNEL_EVENTS] = {
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 5),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 6),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 7),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 8),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 9),
-};
-
-static const uint32_t filt_evts[EXP_KERNEL_EVENTS] = {
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 10),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 11),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 12),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 13),
-    BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, 14),
-};
-
-static bool
-is_development_kernel(void)
-{
-    static dispatch_once_t is_development_once;
-    static bool is_development;
-
-    dispatch_once(&is_development_once, ^(void) {
-        int dev;
-        size_t dev_size = sizeof(dev);
-
-        T_QUIET;
-        T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev,
-                                            &dev_size, NULL, 0), NULL);
-        is_development = (dev != 0);
-    });
-
-    return is_development;
-}
-
-static void
-expect_event(struct trace_point *tp, unsigned int *events,
-    const uint32_t *event_ids, size_t event_ids_len)
-{
-    unsigned int event_idx = *events;
-    bool event_found = false;
-    size_t i;
-    for (i = 0; i < event_ids_len; i++) {
-        if (event_ids[i] == (tp->debugid & KDBG_EVENTID_MASK)) {
-            T_LOG("found event 0x%x", tp->debugid);
-            event_found = true;
-        }
-    }
-
-    if (!event_found) {
-        return;
-    }
-
-    *events += 1;
-    for (i = 0; i < event_idx; i++) {
-        T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)i + 1,
-            NULL);
-    }
-    for (; i < 4; i++) {
-        T_QUIET; T_EXPECT_EQ(((uintptr_t *)&tp->arg1)[i], (uintptr_t)0, NULL);
-    }
-}
-
-static void
-expect_release_event(struct trace_point *tp, unsigned int *events)
-{
-    expect_event(tp, events, rel_evts,
-        sizeof(rel_evts) / sizeof(rel_evts[0]));
-}
-
-static void
-expect_development_event(struct trace_point *tp, unsigned int *events)
-{
-    expect_event(tp, events, dev_evts,
-        sizeof(dev_evts) / sizeof(dev_evts[0]));
-}
-
-static void
-expect_filtered_event(struct trace_point *tp, unsigned int *events)
-{
-    expect_event(tp, events, filt_evts,
-        sizeof(filt_evts) / sizeof(filt_evts[0]));
-}
-
-T_DECL(kernel_events, "ensure kernel macros work",
-    T_META_ASROOT(true))
-{
-    ktrace_session_t s;
-
-
-    s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, NULL);
-
-    __block unsigned int dev_seen = 0;
-    __block unsigned int rel_seen = 0;
-    __block unsigned int filt_seen = 0;
-    ktrace_events_range(s, KDBG_EVENTID(DBG_BSD, DBG_BSD_KDEBUG_TEST, 0),
-        KDBG_EVENTID(DBG_BSD + 1, 0, 0),
-        ^(struct trace_point *tp)
-    {
-        expect_development_event(tp, &dev_seen);
-        expect_release_event(tp, &rel_seen);
-        expect_filtered_event(tp, &filt_seen);
-    });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        /*
-         * Development-only events are only filtered if running on an embedded
-         * OS.
-         */
-        unsigned int dev_exp;
-#if TARGET_OS_EMBEDDED
-        dev_exp = is_development_kernel() ? EXP_KERNEL_EVENTS : 0U;
-#else
-        dev_exp = EXP_KERNEL_EVENTS;
-#endif
-
-        T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS,
-                "release and development events seen");
-        T_EXPECT_EQ(dev_seen, dev_exp, "development-only events seen/not seen");
-        T_EXPECT_EQ(filt_seen, dev_exp, "filter-only events seen");
-        ktrace_session_destroy(s);
-        T_END;
-    });
-
-    ktrace_filter_pid(s, getpid());
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-    assert_kdebug_test(KDBG_TEST_MACROS);
-
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
-
-T_DECL(kernel_events_filtered, "ensure that the filtered kernel macros work",
-    T_META_ASROOT(true))
-{
-    ktrace_session_t s;
-
-    s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, NULL);
-
-    __block unsigned int dev_seen = 0;
-    __block unsigned int rel_seen = 0;
-    __block unsigned int filt_seen = 0;
-    ktrace_events_all(s, ^(struct trace_point *tp) {
-        expect_development_event(tp, &dev_seen);
-        expect_release_event(tp, &rel_seen);
-        /* to make sure no filtered events are emitted */
-        expect_filtered_event(tp, &filt_seen);
-    });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        ktrace_session_destroy(s);
-
-        T_EXPECT_EQ(rel_seen, EXP_KERNEL_EVENTS, NULL);
-#if defined(__arm__) || defined(__arm64__)
-        T_EXPECT_EQ(dev_seen, is_development_kernel() ? EXP_KERNEL_EVENTS : 0U,
-            NULL);
-#else
-        T_EXPECT_EQ(dev_seen, EXP_KERNEL_EVENTS, NULL);
-#endif /* defined(__arm__) || defined(__arm64__) */
-        T_EXPECT_EQ(filt_seen, 0U, NULL);
-        T_END;
-    });
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-    assert_kdebug_test(KDBG_TEST_MACROS);
-
-    ktrace_end(s, 0);
-
-    dispatch_main();
-}
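
For reference, a minimal sketch of the user-space emitters these tests exercised: one kdebug_trace(2) event and a paired signpost bracketing a region. The debugid and code are arbitrary example values, and the events only show up in a trace if tracing has already been enabled (for example by ktrace or a ktrace_session_t as above).

    #include <unistd.h>
    #include <sys/kdebug.h>
    #include <sys/kdebug_signpost.h>

    int main(void)
    {
        /* A single trace point carrying four payload arguments. */
        kdebug_trace(0xfedfed00U, 1, 2, 3, 4);

        /* A start/end signpost pair bracketing the work being measured. */
        kdebug_signpost_start(0x20, 5, 6, 7, 8);
        usleep(1000);                       /* the region of interest */
        kdebug_signpost_end(0x20, 9, 10, 11, 12);
        return 0;
    }
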
diff --git a/tools/tests/darwintests/kevent_continuous_time.c b/tools/tests/darwintests/kevent_continuous_time.c
deleted file mode 100644 (file)
index 93015cd..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-#include <sys/time.h>
-#include <spawn.h>
-#include <sys/wait.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <time.h>
-#include <errno.h>
-#include <sys/event.h>
-
-#include <darwintest.h>
-
-extern char **environ;
-
-static mach_timebase_info_data_t tb_info;
-static const uint64_t one_mil = 1000LL*1000LL;
-
-#define tick_to_ns(ticks) (((ticks) * tb_info.numer) / (tb_info.denom))
-#define tick_to_ms(ticks) (tick_to_ns(ticks)/one_mil)
-
-#define ns_to_tick(ns) ((ns) * tb_info.denom / tb_info.numer)
-#define ms_to_tick(ms) (ns_to_tick((ms) * one_mil))
-
-static uint64_t time_delta_ms(void){
-       uint64_t abs_now = mach_absolute_time();
-       uint64_t cnt_now = mach_continuous_time();
-       return tick_to_ms(cnt_now) - tick_to_ms(abs_now);
-}
-
-static int run_sleep_tests = 0;
-
-static int trigger_sleep(int for_secs) {
-       if(!run_sleep_tests) return 0;
-
-       // schedule a wake in for_secs seconds, then put the system to sleep
-       char buf[10];
-       snprintf(buf, 10, "%d", for_secs);
-
-       T_LOG("Sleeping for %s seconds...", buf);
-
-       int spawn_ret, pid;
-       char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", buf, NULL};
-       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL);
-       
-       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL);
-       T_ASSERT_EQ(spawn_ret, 0, NULL);
-
-       char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL};
-       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL);
-       
-       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, NULL);
-       T_ASSERT_EQ(spawn_ret, 0, NULL);
-
-       return 0;
-}
-
-// waits up to 30 seconds for system to sleep
-// returns number of seconds it took for sleep to be entered
-// or -1 if sleep wasn't accomplished
-static int wait_for_sleep() {
-       if(!run_sleep_tests) return 0;
-
-       uint64_t before_diff = time_delta_ms();
-       
-       for(int i = 0; i < 30; i++) {
-               uint64_t after_diff = time_delta_ms();
-
-               // on macOS, there's enough latency between calls to MCT and MAT
-               // while the system is going down for sleep for the values to diverge by a few ms
-               if(llabs((int64_t)before_diff - (int64_t)after_diff) > 2) {
-                       return i + 1;
-               }
-               
-               sleep(1);
-               T_LOG("waited %d seconds for sleep...", i+1);
-       }
-       return -1;
-}
-
-T_DECL(kevent_continuous_time_periodic_tick, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME)", T_META_LTEPHASE(LTE_POSTINIT)){
-       mach_timebase_info(&tb_info);
-       int kq;
-       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
-
-       struct kevent64_s change = {0};
-       EV_SET64(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);
-       T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_ADD, NOTE_SECONDS | NOTE_MACH_CONTINUOUS_TIME, 4, 0, 0, 0);");
-
-       T_ASSERT_POSIX_ZERO(kevent64(kq, &change, 1, NULL, 0, 0, NULL), NULL);
-
-       uint64_t abs_then = mach_absolute_time();
-       uint64_t cnt_then = mach_continuous_time();
-
-       trigger_sleep(1);
-       int sleep_secs = wait_for_sleep();
-
-       struct kevent64_s event = {0};
-       T_WITH_ERRNO; T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event");
-       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
-       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
-
-       uint64_t abs_now = mach_absolute_time();
-       uint64_t cnt_now = mach_continuous_time();
-       uint64_t ct_ms_progressed = tick_to_ms(cnt_now - cnt_then);
-       uint64_t ab_ms_progressed = tick_to_ms(abs_now - abs_then);
-
-       T_LOG("ct progressed %llu ms, abs progressed %llu ms", ct_ms_progressed, tick_to_ms(abs_now - abs_then));
-
-       if (run_sleep_tests) {
-               T_ASSERT_GT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 500LL, "should have > 500ms difference between MCT and MAT");
-       } else {
-               T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - (int64_t)ab_ms_progressed), 10LL, "should have < 10ms difference between MCT and MAT");
-       }
-
-       if (sleep_secs < 4) {
-               T_ASSERT_LT(llabs((int64_t)ct_ms_progressed - 4000), 100LL, "mach_continuous_time should progress ~4 seconds (+/- 100ms) between sleeps");
-       }
-
-       sleep(1);
-
-       EV_SET64(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0);
-       T_LOG("EV_SET(&change, 1, EVFILT_TIMER, EV_DELETE, 0, 0, 0);");
-       T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL);
-
-       T_ASSERT_POSIX_ZERO(close(kq), NULL);
-}
-
-T_DECL(kevent_continuous_time_absolute, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME and NOTE_ABSOLUTE)", T_META_LTEPHASE(LTE_POSTINIT)){
-       mach_timebase_info(&tb_info);
-
-       int kq;
-       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
-
-       struct timeval tv;
-       gettimeofday(&tv, NULL);
-       uint64_t nowus   = (uint64_t)tv.tv_sec * USEC_PER_SEC + (uint64_t)tv.tv_usec;
-       uint64_t fire_at = (3*USEC_PER_SEC) + nowus;
-
-       uint64_t cnt_now = mach_continuous_time();
-       uint64_t cnt_then = cnt_now + ms_to_tick(3000);
-
-       T_LOG("current time is %llu us, firing at %llu us", nowus, fire_at);
-
-       struct kevent64_s change = {0};
-       EV_SET64(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0, 0, 0);
-       T_LOG("EV_SET(&change, 2, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE | NOTE_USECONDS, fire_at, 0);");
-
-       T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL);
-
-       T_LOG("testing NOTE_MACH_CONTINUOUS_TIME | NOTE_ABSOLUTE between sleep");
-
-       trigger_sleep(1);
-
-       struct timespec timeout = {
-               .tv_sec = 10,
-               .tv_nsec = 0 
-       };
-       struct kevent64_s event = {0};
-       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 1, "kevent() should have returned one event");
-       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %lld}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
-       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
-
-       uint64_t elapsed_ms = tick_to_ms(mach_continuous_time() - cnt_now);
-       int64_t missed_by  = tick_to_ns((int64_t)mach_continuous_time() - (int64_t)cnt_then) / 1000000;
-
-       // ~1/2 second is about as good as we'll get
-       T_ASSERT_LT(llabs(missed_by), 500LL, "timer should pop 3 sec in the future, popped after %lldms", elapsed_ms);
-
-       T_ASSERT_EQ(event.data, 1LL, NULL);
-
-       T_ASSERT_EQ(event.ident, 2ULL, NULL);
-
-       // try getting a periodic tick out of kq
-       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, &timeout), 0, NULL);
-       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "event should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
-
-       T_ASSERT_POSIX_ZERO(close(kq), NULL);
-}
-
-T_DECL(kevent_continuous_time_pops, "kevent(EVFILT_TIMER with NOTE_MACH_CONTINUOUS_TIME with multiple pops)", T_META_LTEPHASE(LTE_POSTINIT)){
-       // have to throttle rate at which pmset is called
-       sleep(2);
-
-       mach_timebase_info(&tb_info);
-
-       int kq;
-       T_ASSERT_POSIX_SUCCESS((kq = kqueue()), NULL);
-
-       // test that periodic ticks accumulate while asleep
-       struct kevent64_s change = {0};
-       EV_SET64(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0, 0, 0); // tick every 100 ms
-       T_LOG("EV_SET(&change, 3, EVFILT_TIMER, EV_ADD, NOTE_MACH_CONTINUOUS_TIME, 100, 0);");
-
-       // wait for first pop, then sleep
-       T_ASSERT_EQ(kevent64(kq, &change, 1, NULL, 0, 0, NULL), 0, NULL);
-
-       struct kevent64_s event = {0};
-       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event");
-       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
-       T_ASSERT_EQ(event.flags & EV_ERROR, 0, "should not have EV_ERROR set: %s", event.flags & EV_ERROR ? strerror((int)event.data) : "no error");
-       T_ASSERT_EQ(event.ident, 3ULL, NULL);
-
-       uint64_t cnt_then = mach_continuous_time();
-       trigger_sleep(2);
-
-       int sleep_secs = 0;
-       if(run_sleep_tests) {
-               sleep_secs = wait_for_sleep();
-       }
-       else {
-               // simulate 2 seconds of system "sleep"
-               sleep(2);
-       }
-
-       uint64_t cnt_now = mach_continuous_time();
-
-       uint64_t ms_elapsed = tick_to_ms(cnt_now - cnt_then);
-       if(run_sleep_tests) {
-               T_ASSERT_LT(llabs((int64_t)ms_elapsed - 2000LL), 500LL, "slept for %llums, expected 2000ms (astris is connected?)", ms_elapsed);
-       }
-
-       T_ASSERT_EQ(kevent64(kq, NULL, 0, &event, 1, 0, NULL), 1, "kevent() should have returned one event");
-       T_LOG("event = {.ident = %llx, .filter = %d, .flags = %d, .fflags = %d, .data = %lld, .udata = %llu}", event.ident, event.filter, event.flags, event.fflags, event.data, event.udata);
-       T_ASSERT_EQ(event.ident, 3ULL, NULL);
-
-       uint64_t expected_pops = ms_elapsed / 100;
-       uint64_t got_pops      = (uint64_t)event.data;
-
-       T_ASSERT_GE(got_pops, expected_pops - 1, "tracking pops while asleep");
-       T_ASSERT_POSIX_ZERO(close(kq), NULL);
-}
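
A minimal standalone version of the wait_for_sleep() heuristic above: mach_continuous_time() keeps advancing across system sleep while mach_absolute_time() does not, so a growing gap between the two (converted through the timebase) indicates that the system slept between the two samples.

    #include <stdio.h>
    #include <stdint.h>
    #include <mach/mach_time.h>

    static uint64_t ticks_to_ms(uint64_t ticks, mach_timebase_info_data_t tb)
    {
        return (ticks * tb.numer / tb.denom) / (1000ULL * 1000ULL);
    }

    int main(void)
    {
        mach_timebase_info_data_t tb;
        mach_timebase_info(&tb);

        uint64_t gap_before = ticks_to_ms(mach_continuous_time(), tb) -
                              ticks_to_ms(mach_absolute_time(), tb);

        /* ... the system may sleep and wake here ... */

        uint64_t gap_after = ticks_to_ms(mach_continuous_time(), tb) -
                             ticks_to_ms(mach_absolute_time(), tb);

        printf("slept for roughly %llu ms\n", gap_after - gap_before);
        return 0;
    }
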
diff --git a/tools/tests/darwintests/kevent_pty.c b/tools/tests/darwintests/kevent_pty.c
deleted file mode 100644 (file)
index a64c48d..0000000
+++ /dev/null
@@ -1,259 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif /* T_NAMESPACE */
-
-#include <Block.h>
-#include <darwintest.h>
-#include <dispatch/dispatch.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <signal.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <stdint.h>
-#include <unistd.h>
-#include <util.h>
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.kevent"),
-               T_META_CHECK_LEAKS(false));
-
-#define TIMEOUT_SECS 10
-
-static int child_ready[2];
-
-static void
-child_tty_client(void)
-{
-       dispatch_source_t src;
-       char buf[16] = "";
-       ssize_t bytes_wr;
-
-       src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ,
-                       (uintptr_t)STDIN_FILENO, 0, NULL);
-       if (!src) {
-               exit(1);
-       }
-       dispatch_source_set_event_handler(src, ^{});
-
-       dispatch_activate(src);
-
-       close(child_ready[0]);
-       snprintf(buf, sizeof(buf), "%ds", getpid());
-       bytes_wr = write(child_ready[1], buf, strlen(buf));
-       if (bytes_wr < 0) {
-               exit(1);
-       }
-
-       dispatch_main();
-}
-
-static void
-pty_master(void)
-{
-       pid_t child_pid;
-       int ret;
-
-       child_pid = fork();
-       if (child_pid == 0) {
-               child_tty_client();
-       }
-       ret = setpgid(child_pid, child_pid);
-       if (ret < 0) {
-               exit(1);
-       }
-       ret = tcsetpgrp(STDIN_FILENO, child_pid);
-       if (ret < 0) {
-               exit(1);
-       }
-
-       sleep(TIMEOUT_SECS);
-       exit(1);
-}
-
-T_DECL(pty_master_teardown,
-               "try removing a TTY master out from under a PTY slave holding a kevent",
-               T_META_ASROOT(true))
-{
-       __block pid_t master_pid;
-       char buf[16] = "";
-       char *end;
-       ssize_t bytes_rd;
-       size_t buf_len = 0;
-       unsigned long slave_pid;
-       int master_fd;
-       char pty_filename[PATH_MAX];
-       int status;
-
-       T_SETUPBEGIN;
-       T_ASSERT_POSIX_SUCCESS(pipe(child_ready), NULL);
-
-       master_pid = forkpty(&master_fd, pty_filename, NULL, NULL);
-       if (master_pid == 0) {
-               pty_master();
-               __builtin_unreachable();
-       }
-       T_ASSERT_POSIX_SUCCESS(master_pid,
-                       "forked child master PTY with pid %d, at pty %s", master_pid,
-                       pty_filename);
-
-       close(child_ready[1]);
-
-       end = buf;
-       do {
-               bytes_rd = read(child_ready[0], end, sizeof(buf) - buf_len);
-               T_ASSERT_POSIX_SUCCESS(bytes_rd, "read on pipe between master and runner");
-               buf_len += (size_t)bytes_rd;
-               T_LOG("runner read %zd bytes", bytes_rd);
-               end += bytes_rd;
-       } while (bytes_rd != 0 && *(end - 1) != 's');
-
-       slave_pid = strtoul(buf, &end, 0);
-       if (buf == end) {
-               T_ASSERT_FAIL("could not parse child PID from master pipe");
-       }
-
-       T_LOG("got pid %lu for slave process from master", slave_pid);
-       T_SETUPEND;
-
-       T_LOG("sending fatal signal to master");
-       T_ASSERT_POSIX_SUCCESS(kill(master_pid, SIGKILL), NULL);
-
-       T_LOG("sending fatal signal to slave");
-       (void)kill((int)slave_pid, SIGKILL);
-
-       T_ASSERT_POSIX_SUCCESS(waitpid(master_pid, &status, 0), NULL);
-       T_ASSERT_TRUE(WIFSIGNALED(status), "master PID was signaled");
-       (void)waitpid((int)slave_pid, &status, 0);
-}
-
-volatile static bool writing = true;
-
-static void *
-reader_thread(void *arg)
-{
-       int fd = (int)arg;
-       char c;
-
-       T_SETUPBEGIN;
-       T_QUIET;
-       T_ASSERT_GT(fd, 0, "reader thread received valid fd");
-       T_SETUPEND;
-
-       for (;;) {
-               ssize_t rdsize = read(fd, &c, sizeof(c));
-               if (rdsize == -1) {
-                       if (errno == EINTR) {
-                               continue;
-                       } else if (errno == EBADF) {
-                               T_LOG("reader got an error (%s), shutting down", strerror(errno));
-                               return NULL;
-                       } else {
-                               T_ASSERT_POSIX_SUCCESS(rdsize, "read on PTY");
-                       }
-               } else if (rdsize == 0) {
-                       return NULL;
-               }
-       }
-
-       return NULL;
-}
-
-static void *
-writer_thread(void *arg)
-{
-       int fd = (int)arg;
-       char c[4096];
-
-       T_SETUPBEGIN;
-       T_QUIET;
-       T_ASSERT_GT(fd, 0, "writer thread received valid fd");
-       memset(c, 'a', sizeof(c));
-       T_SETUPEND;
-
-       while (writing) {
-               ssize_t wrsize = write(fd, c, sizeof(c));
-               if (wrsize == -1) {
-                       if (errno == EINTR) {
-                               continue;
-                       } else {
-                               T_LOG("writer got an error (%s), shutting down", strerror(errno));
-                               return NULL;
-                       }
-               }
-       }
-
-       return NULL;
-}
-
-#define ATTACH_ITERATIONS 10000
-
-static int master, slave;
-static pthread_t reader, writer;
-
-static void
-join_threads(void)
-{
-       close(slave);
-       close(master);
-       writing = false;
-       pthread_join(reader, NULL);
-       pthread_join(writer, NULL);
-}
-
-static void
-redispatch(dispatch_group_t grp, dispatch_source_type_t type, int fd)
-{
-       __block int iters = 0;
-
-       __block void (^redispatch_blk)(void) = Block_copy(^{
-               if (iters++ > ATTACH_ITERATIONS) {
-                       return;
-               } else if (iters == ATTACH_ITERATIONS) {
-                       dispatch_group_leave(grp);
-                       T_PASS("created %d %s sources on busy PTY", iters,
-                                       type == DISPATCH_SOURCE_TYPE_READ ? "read" : "write");
-               }
-
-               dispatch_source_t src = dispatch_source_create(
-                               type, (uintptr_t)fd, 0,
-                               dispatch_get_main_queue());
-
-               dispatch_source_set_event_handler(src, ^{
-                       dispatch_cancel(src);
-               });
-
-               dispatch_source_set_cancel_handler(src, redispatch_blk);
-
-               dispatch_activate(src);
-       });
-
-       dispatch_group_enter(grp);
-       dispatch_async(dispatch_get_main_queue(), redispatch_blk);
-}
-
-T_DECL(attach_while_tty_wakeups,
-               "try to attach knotes while a TTY is getting wakeups")
-{
-       dispatch_group_t grp = dispatch_group_create();
-
-       T_SETUPBEGIN;
-       T_ASSERT_POSIX_SUCCESS(openpty(&master, &slave, NULL, NULL, NULL), NULL);
-
-       T_ASSERT_POSIX_ZERO(pthread_create(&reader, NULL, reader_thread,
-                               (void *)(uintptr_t)master), NULL);
-       T_ASSERT_POSIX_ZERO(pthread_create(&writer, NULL, writer_thread,
-                               (void *)(uintptr_t)slave), NULL);
-       T_ATEND(join_threads);
-       T_SETUPEND;
-
-       redispatch(grp, DISPATCH_SOURCE_TYPE_READ, master);
-       redispatch(grp, DISPATCH_SOURCE_TYPE_WRITE, slave);
-
-       dispatch_group_notify(grp, dispatch_get_main_queue(), ^{
-               T_LOG("both reader and writer sources cleaned up");
-               T_END;
-       });
-
-       dispatch_main();
-}
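
For reference, a minimal sketch of the attach pattern the removed test stressed: open a PTY pair with openpty() and attach a dispatch read source (backed by an EVFILT_READ knote) to the master side, then wake it by writing to the slave. This is an illustration only and omits the reader/writer stress threads.

    #include <dispatch/dispatch.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>
    #include <util.h>

    int main(void)
    {
        int master, slave;
        if (openpty(&master, &slave, NULL, NULL, NULL) < 0) {
            perror("openpty");
            return 1;
        }

        dispatch_source_t src = dispatch_source_create(
                DISPATCH_SOURCE_TYPE_READ, (uintptr_t)master, 0,
                dispatch_get_main_queue());
        dispatch_source_set_event_handler(src, ^{
            char c;
            (void)read(master, &c, sizeof(c));
            printf("master became readable\n");
            exit(0);
        });
        dispatch_activate(src);

        /* Writing to the slave side wakes the knote on the master. */
        (void)write(slave, "x", 1);
        dispatch_main();
    }
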
diff --git a/tools/tests/darwintests/kevent_qos.c b/tools/tests/darwintests/kevent_qos.c
deleted file mode 100644 (file)
index 823bf1a..0000000
+++ /dev/null
@@ -1,908 +0,0 @@
-/*
- * kevent_qos: Tests Synchronous IPC QOS override.
- */
-
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-#include <darwintest_multiprocess.h>
-
-#include <dispatch/dispatch.h>
-#include <pthread.h>
-#include <launch.h>
-#include <mach/mach.h>
-#include <mach/message.h>
-#include <mach/mach_voucher.h>
-#include <pthread/workqueue_private.h>
-#include <voucher/ipc_pthread_priority_types.h>
-#include <servers/bootstrap.h>
-#include <stdlib.h>
-#include <sys/event.h>
-#include <unistd.h>
-#include <crt_externs.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.kevent_qos"));
-
-#define ARRAYLEN(arr) (sizeof(arr) / sizeof(arr[0]))
-
-#define RECV_TIMEOUT_SECS   (4)
-#define SEND_TIMEOUT_SECS   (6)
-#define HELPER_TIMEOUT_SECS (15)
-
-#define ENV_VAR_QOS (3)
-static const char *qos_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_BO",  "XNU_TEST_QOS_QO", "XNU_TEST_QOS_AO"};
-static const char *qos_name_env[ENV_VAR_QOS] = {"XNU_TEST_QOS_NAME_BO", "XNU_TEST_QOS_NAME_QO", "XNU_TEST_QOS_NAME_AO"};
-
-#define ENV_VAR_FUNCTION (1)
-static const char *wl_function_name = "XNU_TEST_WL_FUNCTION";
-
-static qos_class_t g_expected_qos[ENV_VAR_QOS];
-static const char *g_expected_qos_name[ENV_VAR_QOS];
-
-#define ENV_QOS_BEFORE_OVERRIDE (0)
-#define ENV_QOS_QUEUE_OVERRIDE  (1)
-#define ENV_QOS_AFTER_OVERRIDE  (2)
-
-#pragma mark pthread callbacks
-
-static void
-worker_cb(pthread_priority_t __unused priority)
-{
-       T_FAIL("a worker thread was created");
-}
-
-static void
-event_cb(void ** __unused events, int * __unused nevents)
-{
-       T_FAIL("a kevent routine was called instead of workloop");
-}
-
-/*
- * Basic WL handler callback: it sleeps for n seconds and then checks the
- * effective QoS of the servicer thread.
- */
-static void
-workloop_cb_test_intransit(uint64_t *workloop_id __unused, void **eventslist __unused, int *events)
-{
-       T_LOG("Workloop handler workloop_cb_test_intransit called. "
-               "Will wait for %d seconds to make sure client enqueues the sync msg \n",
-               2 * RECV_TIMEOUT_SECS);
-
-       /* Wait for the client to send the high priority message to override the qos */
-       sleep(2 * RECV_TIMEOUT_SECS);
-
-       /* Skip the test if we can't check Qos */
-       if (geteuid() != 0) {
-               T_SKIP("kevent_qos test requires root privileges to run.");
-       }
-
-       /* The effective Qos should be the one expected after override */
-       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-       T_END;
-       *events = 0;
-}
-
-/*
- * WL handler which checks that the servicer thread has the correct QoS.
- */
-static void
-workloop_cb_test_sync_send(uint64_t *workloop_id __unused, void **eventslist __unused, int *events)
-{
-       T_LOG("Workloop handler workloop_cb_test_sync_send called");
-
-       if (geteuid() != 0) {
-               T_SKIP("kevent_qos test requires root privileges to run.");
-       }
-
-       /* The effective Qos should be the one expected after override */
-       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-       T_END;
-       *events = 0;
-}
-
-/*
- * WL handler which checks the overridden QoS, then enables the knote and
- * checks the QoS again to verify that enabling it dropped the sync ipc
- * override.
- */
-static void
-workloop_cb_test_sync_send_and_enable(uint64_t *workloop_id, struct kevent_qos_s **eventslist, int *events)
-{
-       int r;
-       T_LOG("Workloop handler workloop_cb_test_sync_send_and_enable called");
-
-       if (geteuid() != 0) {
-               T_SKIP("kevent_qos test requires root privileges to run.");
-       }
-
-       /* The effective Qos should be the one expected after override */
-       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-       /* Enable the knote */
-       struct kevent_qos_s *kev = *eventslist;
-       kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
-       struct kevent_qos_s kev_err[] = {{ 0 }};
-
-       r = kevent_id(*workloop_id, kev, 1, kev_err, 1, NULL,
-                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS | KEVENT_FLAG_DYNAMIC_KQ_MUST_EXIST);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
-
-       /* Sync override should have been removed */
-       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]);
-
-       T_END;
-       *events = 0;
-}
-
-/*
- * WL handler which receives the first message and checks its sync ipc
- * override, then enables the knote, receives the 2nd message, and checks its
- * sync ipc override.
- */
-static int send_two_sync_handler_called = 0;
-static void
-workloop_cb_test_send_two_sync(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist, int *events)
-{
-       T_LOG("Workloop handler workloop_cb_test_send_two_sync called for %d time", send_two_sync_handler_called + 1);
-
-       if (geteuid() != 0) {
-               T_SKIP("kevent_qos test requires root privileges to run.");
-       }
-
-       T_LOG("Number of events received is %d\n", *events);
-
-       if (send_two_sync_handler_called == 0) {
-               /* The effective Qos should be the one expected after override */
-               T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-               /* Enable the knote to get 2nd message */
-               struct kevent_qos_s *kev = *eventslist;
-               kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
-               kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
-                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
-                               MACH_RCV_VOUCHER);
-               *events = 1;
-       } else {
-               T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]);
-               T_END;
-               *events = 0;
-       }
-       send_two_sync_handler_called++;
-}
-
-/*
- * WL handler which checks the sync ipc override, waits for the client to
- * destroy the special reply port, and then checks that doing so removed the
- * sync ipc override.
- */
-static boolean_t two_send_and_destroy_test_passed = FALSE;
-static int two_send_and_destroy_handler = 0;
-static void
-workloop_cb_test_two_send_and_destroy(uint64_t *workloop_id __unused, struct kevent_qos_s **eventslist __unused, int *events)
-{
-       T_LOG("Workloop handler workloop_cb_test_two_send_and_destroy called %d times", two_send_and_destroy_handler + 1);
-
-       if (geteuid() != 0) {
-               T_SKIP("kevent_qos test requires root privileges to run.");
-       }
-
-       if (two_send_and_destroy_handler == 0) {
-               /* The effective Qos should be the one expected after override */
-               T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-               sleep(2 * RECV_TIMEOUT_SECS);
-
-               /* Special reply port should have been destroyed, check Qos again */
-               T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE],
-                       "dispatch_source event handler QoS should be %s", g_expected_qos_name[ENV_QOS_BEFORE_OVERRIDE]);
-
-               two_send_and_destroy_test_passed = TRUE;
-       } else {
-               if (two_send_and_destroy_test_passed) {
-                       T_END;
-               }
-       }
-
-       /* Enable the knote to get next message */
-       struct kevent_qos_s *kev = *eventslist;
-       kev->flags = EV_ADD | EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED;
-       kev->fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
-                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
-                               MACH_RCV_VOUCHER);
-       *events = 1;
-       two_send_and_destroy_handler++;
-       T_LOG("Handler returning \n");
-}
-
-#pragma mark Mach receive
-
-#define KEVENT_QOS_SERVICE_NAME "com.apple.xnu.test.kevent_qos"
-
-static mach_port_t
-get_server_port(void)
-{
-       mach_port_t port;
-       kern_return_t kr = bootstrap_check_in(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "server bootstrap_check_in");
-       return port;
-}
-
-static void
-env_set_qos(char **env, qos_class_t qos[], const char *qos_name[], const char *wl_function)
-{
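-       /*
-        * env[] is laid out as QoS value/name pairs for each of the three QoS
-        * slots, followed by the workloop function name and a NULL terminator.
-        */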
-       int i;
-       char *qos_str, *qos_name_str;
-       for (i = 0; i < ENV_VAR_QOS; i++) {
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&qos_str, "%s=%d", qos_env[i] , qos[i]),
-                       NULL);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(
-                       asprintf(&qos_name_str, "%s=%s", qos_name_env[i], qos_name[i]), NULL);
-               env[2 * i] = qos_str;
-               env[2 * i + 1] = qos_name_str;
-       }
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(asprintf(&env[2 * i], "%s=%s", wl_function_name, wl_function),
-                       NULL);
-       env[2 * i + 1] = NULL;
-}
-
-static void
-environ_get_qos(qos_class_t qos[], const char *qos_name[], const char **wl_function)
-{
-       char *qos_str;
-       char *qos_end;
-       int i;
-
-       for (i = 0; i < ENV_VAR_QOS; i++) {
-               qos_str = getenv(qos_env[i]);
-               T_QUIET; T_ASSERT_NOTNULL(qos_str, "getenv(%s)", qos_env[i]);
-
-               unsigned long qos_l = strtoul(qos_str, &qos_end, 10);
-               T_QUIET; T_ASSERT_EQ(*qos_end, '\0', "getenv(%s) = '%s' should be an "
-                               "integer", qos_env[i], qos_str);
-
-               T_QUIET; T_ASSERT_LT(qos_l, (unsigned long)100, "getenv(%s) = '%s' should "
-                               "be less than 100", qos_env[i], qos_str);
-
-               qos[i] = (qos_class_t)qos_l;
-               qos_name[i] = getenv(qos_name_env[i]);
-               T_QUIET; T_ASSERT_NOTNULL(qos_name[i], "getenv(%s)", qos_name_env[i]);
-       }
-       *wl_function = getenv(wl_function_name);
-       T_QUIET; T_ASSERT_NOTNULL(*wl_function, "getenv(%s)", wl_function_name);
-}
-
-static mach_voucher_t
-create_pthpriority_voucher(mach_msg_priority_t qos)
-{
-       char voucher_buf[sizeof(mach_voucher_attr_recipe_data_t) + sizeof(ipc_pthread_priority_value_t)];
-
-       mach_voucher_t voucher = MACH_PORT_NULL;
-       kern_return_t ret;
-       ipc_pthread_priority_value_t ipc_pthread_priority_value =
-                       (ipc_pthread_priority_value_t)qos;
-
-       mach_voucher_attr_raw_recipe_array_t recipes;
-       mach_voucher_attr_raw_recipe_size_t recipe_size = 0;
-       mach_voucher_attr_recipe_t recipe =
-               (mach_voucher_attr_recipe_t)&voucher_buf[recipe_size];
-
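-       /*
-        * Build a single PTHPRIORITY recipe in the stack buffer; the resulting
-        * voucher carries the encoded pthread priority that the test attaches
-        * to the messages it sends.
-        */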
-       recipe->key = MACH_VOUCHER_ATTR_KEY_PTHPRIORITY;
-       recipe->command = MACH_VOUCHER_ATTR_PTHPRIORITY_CREATE;
-       recipe->previous_voucher = MACH_VOUCHER_NULL;
-       memcpy((char *)&recipe->content[0], &ipc_pthread_priority_value, sizeof(ipc_pthread_priority_value));
-       recipe->content_size = sizeof(ipc_pthread_priority_value_t);
-       recipe_size += sizeof(mach_voucher_attr_recipe_data_t) + recipe->content_size;
-
-       recipes = (mach_voucher_attr_raw_recipe_array_t)&voucher_buf[0];
-
-       ret = host_create_mach_voucher(mach_host_self(),
-                               recipes,
-                               recipe_size,
-                               &voucher);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client host_create_mach_voucher");
-       return voucher;
-}
-
-static void
-send(
-       mach_port_t send_port,
-       mach_port_t reply_port,
-       mach_port_t msg_port,
-       mach_msg_priority_t qos)
-{
-       kern_return_t ret = 0;
-
-       struct {
-               mach_msg_header_t header;
-               mach_msg_body_t body;
-               mach_msg_port_descriptor_t port_descriptor;
-       } send_msg = {
-           .header =
-               {
-                   .msgh_remote_port = send_port,
-                   .msgh_local_port  = reply_port,
-                   .msgh_bits        = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND,
-                       reply_port ? MACH_MSG_TYPE_MAKE_SEND_ONCE : 0,
-                       MACH_MSG_TYPE_MOVE_SEND,
-                       MACH_MSGH_BITS_COMPLEX),
-                   .msgh_id          = 0x100,
-                   .msgh_size        = sizeof(send_msg),
-                   .msgh_voucher_port = create_pthpriority_voucher(qos),
-               },
-           .body =
-               {
-                   .msgh_descriptor_count = 1,
-               },
-           .port_descriptor =
-               {
-                   .name = msg_port, .disposition = MACH_MSG_TYPE_MOVE_RECEIVE, .type = MACH_MSG_PORT_DESCRIPTOR,
-               },
-       };
-
-       if (msg_port == MACH_PORT_NULL) {
-               send_msg.body.msgh_descriptor_count = 0;
-       }
-
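-       /*
-        * MACH_SEND_SYNC_OVERRIDE is only passed when a reply port is supplied,
-        * marking the send as part of a synchronous IPC so the receiver can be
-        * boosted with a sync ipc override.
-        */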
-       ret = mach_msg(&(send_msg.header),
-               MACH_SEND_MSG |
-               MACH_SEND_TIMEOUT |
-               MACH_SEND_OVERRIDE|
-               (reply_port ? MACH_SEND_SYNC_OVERRIDE : 0) ,
-               send_msg.header.msgh_size,
-               0,
-               MACH_PORT_NULL,
-               0,
-               0);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "client mach_msg");
-}
-
-static void
-receive(
-       mach_port_t rcv_port,
-       mach_port_t notify_port)
-{
-       kern_return_t ret = 0;
-
-       struct {
-               mach_msg_header_t header;
-               mach_msg_body_t body;
-               mach_msg_port_descriptor_t port_descriptor;
-       } rcv_msg = {
-           .header =
-               {
-                   .msgh_remote_port = MACH_PORT_NULL,
-                   .msgh_local_port  = rcv_port,
-                   .msgh_size        = sizeof(rcv_msg),
-               },
-       };
-
-       T_LOG("Client: Starting sync receive\n");
-
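-       /*
-        * MACH_RCV_SYNC_WAIT marks this as a synchronous receive and passes
-        * notify_port as the port being waited on, which is what lets the
-        * kernel apply a sync ipc override to its servicer.
-        */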
-       ret = mach_msg(&(rcv_msg.header),
-               MACH_RCV_MSG |
-               MACH_RCV_TIMEOUT |
-               MACH_RCV_SYNC_WAIT,
-               0,
-               rcv_msg.header.msgh_size,
-               rcv_port,
-               SEND_TIMEOUT_SECS * 1000,
-               notify_port);
-
-       if (!(ret == MACH_RCV_TIMED_OUT || ret == MACH_MSG_SUCCESS)) {
-               T_ASSERT_FAIL("Sync rcv failed \n");
-       }
-}
-
-T_HELPER_DECL(qos_get_special_reply_port,
-               "Test get_special_reply_port and it's corner cases.")
-{
-       mach_port_t special_reply_port;
-       mach_port_t new_special_reply_port;
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       new_special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(new_special_reply_port , "get_thread_special_reply_port");
-
-       mach_port_destroy(mach_task_self(), special_reply_port);
-       mach_port_destroy(mach_task_self(), new_special_reply_port);
-
-       new_special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(new_special_reply_port , "get_thread_special_reply_port");
-
-       T_END;
-}
-
-T_HELPER_DECL(qos_client_send_to_intransit,
-               "Send synchronous messages to an intransit port")
-{
-       mach_port_t qos_send_port;
-       mach_port_t msg_port;
-       mach_port_t special_reply_port;
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* Create a rcv right to send in a msg */
-       kr = mach_port_allocate(mach_task_self(),
-                       MACH_PORT_RIGHT_RECEIVE,
-                       &msg_port);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate");
-
-       kr = mach_port_insert_right(mach_task_self(),
-                       msg_port,
-                       msg_port,
-                       MACH_MSG_TYPE_MAKE_SEND);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right");
-
-       /* Send an empty msg on the port to fire the WL thread */
-       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0));
-
-       sleep(SEND_TIMEOUT_SECS);
-
-       /* Send the message with msg port as in-transit port, this msg will not be dequeued */
-       send(qos_send_port, MACH_PORT_NULL, msg_port,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0));
-
-       /* Send the message to the in-transit port, it should override the rcv's workloop */
-       send(msg_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-       T_LOG("Client done sending messages, now waiting for server to end the test");
-       sleep(2 * SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-}
-
-T_HELPER_DECL(qos_client_send_sync_and_enqueue_rcv,
-               "Send synchronous messages and enqueue the rcv right")
-{
-       mach_port_t qos_send_port;
-       mach_port_t msg_port;
-       mach_port_t special_reply_port;
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* Create a rcv right to send in a msg */
-       kr = mach_port_allocate(mach_task_self(),
-                       MACH_PORT_RIGHT_RECEIVE,
-                       &msg_port);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_allocate");
-
-       kr = mach_port_insert_right(mach_task_self(),
-                       msg_port,
-                       msg_port,
-                       MACH_MSG_TYPE_MAKE_SEND);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client mach_port_insert_right");
-
-       /* Send the message to msg port */
-       send(msg_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-
-       /* Send the message with msg port as in-transit port, copyin of in-transit will cause sync override */
-       send(qos_send_port, MACH_PORT_NULL, msg_port,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0));
-
-       T_LOG("Client done sending messages, now waiting for server to end the test");
-       sleep(3 * SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-}
-
-static void
-thread_create_at_qos(qos_class_t qos, void * (*function)(void *))
-{
-       qos_class_t qos_thread;
-       pthread_t thread;
-       pthread_attr_t attr;
-       int ret;
-
-       ret = setpriority(PRIO_DARWIN_ROLE, 0, PRIO_DARWIN_ROLE_UI_FOCAL);
-       if (ret != 0) {
-               T_LOG("set priority failed\n");
-       }
-
-       pthread_attr_init(&attr);
-       pthread_attr_set_qos_class_np(&attr, qos, 0);
-       pthread_create(&thread, &attr, function, NULL);
-
-       T_LOG("pthread created\n");
-       pthread_get_qos_class_np(thread, &qos_thread, NULL);
-       T_EXPECT_EQ(qos_thread, (qos_class_t)qos, NULL);
-}
-
-static void *
-qos_send_and_sync_rcv(void *arg __unused)
-{
-       mach_port_t qos_send_port;
-       mach_port_t special_reply_port;
-
-       T_LOG("Client: from created thread\n");
-
-       T_EXPECT_EFFECTIVE_QOS_EQ(g_expected_qos[ENV_QOS_AFTER_OVERRIDE],
-                       "pthread QoS should be %s", g_expected_qos_name[ENV_QOS_AFTER_OVERRIDE]);
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* enqueue two messages to make sure that mqueue is not empty */
-       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0));
-
-       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0));
-
-       sleep(SEND_TIMEOUT_SECS);
-
-       /* sync wait on msg port */
-       receive(special_reply_port, qos_send_port);
-
-       T_LOG("Client done doing sync rcv, now waiting for server to end the test");
-       sleep(SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-       return 0;
-}
-
-T_HELPER_DECL(qos_client_send_sync_and_sync_rcv,
-               "Send messages and syncronously wait for rcv")
-{
-       thread_create_at_qos(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], qos_send_and_sync_rcv);
-       sleep(HELPER_TIMEOUT_SECS);
-}
-
-T_HELPER_DECL(qos_client_send_sync_msg,
-               "Send synchronous messages")
-{
-       mach_port_t qos_send_port;
-       mach_port_t special_reply_port;
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* Send the message to msg port */
-       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-
-       T_LOG("Client done sending messages, now waiting for server to end the test");
-       sleep(2 * SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-}
-
-T_HELPER_DECL(qos_client_send_two_sync_msg,
-               "Send two synchronous messages at different qos")
-{
-       mach_port_t qos_send_port;
-       mach_port_t special_reply_port;
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* Send the message to msg port */
-       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-
-       /* Send the message to msg port */
-       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_BEFORE_OVERRIDE], 0, 0));
-
-       T_LOG("Client done sending messages, now waiting for server to end the test");
-       sleep(SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-}
-
-T_HELPER_DECL(qos_client_send_two_msg_and_destroy,
-               "Send two messages with 2nd one as sync and then destory the special reply port")
-{
-       mach_port_t qos_send_port;
-       mach_port_t special_reply_port;
-
-       kern_return_t kr = bootstrap_look_up(bootstrap_port,
-                       KEVENT_QOS_SERVICE_NAME, &qos_send_port);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "client bootstrap_look_up");
-
-       special_reply_port = thread_get_special_reply_port();
-       T_QUIET; T_ASSERT_NOTNULL(special_reply_port , "get_thread_special_reply_port");
-
-       /* Send an async message to msg port */
-       send(qos_send_port, MACH_PORT_NULL, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-
-       /* Send the message to msg port */
-       send(qos_send_port, special_reply_port, MACH_PORT_NULL,
-               (uint32_t)_pthread_qos_class_encode(g_expected_qos[ENV_QOS_AFTER_OVERRIDE], 0, 0));
-
-       T_LOG("Client done sending messages, waiting for destroy the special reply_port");
-       sleep(SEND_TIMEOUT_SECS);
-
-       mach_port_destroy(mach_task_self(), special_reply_port);
-       sleep(SEND_TIMEOUT_SECS);
-
-       T_ASSERT_FAIL("client timed out");
-}
-
-static void
-run_client_server(const char *server_name, const char *client_name, qos_class_t qos[],
-               const char *qos_name[], const char *wl_function)
-{
-       char *env[2 * ENV_VAR_QOS + ENV_VAR_FUNCTION + 1];
-       env_set_qos(env, qos, qos_name, wl_function);
-
-       for (int i = 0; i < ENV_VAR_QOS; i++) {
-               g_expected_qos[i] = qos[i];
-               g_expected_qos_name[i] = qos_name[i];
-       }
-
-       dt_helper_t helpers[] = {
-               dt_launchd_helper_env("com.apple.xnu.test.kevent_qos.plist",
-                               server_name, env),
-               dt_fork_helper(client_name)
-       };
-       dt_run_helpers(helpers, 2, HELPER_TIMEOUT_SECS);
-}
-
-#pragma mark Mach receive - kevent_qos
-
-
-static void
-expect_kevent_id_recv(mach_port_t port, qos_class_t qos[], const char *qos_name[], const char *wl_function)
-{
-       int r;
-
-       /* Qos expected by workloop thread */
-       for (int i = 0; i < ENV_VAR_QOS; i++) {
-               g_expected_qos[i] = qos[i];
-               g_expected_qos_name[i] = qos_name[i];
-       }
-
-       if (strcmp(wl_function, "workloop_cb_test_intransit") == 0) {
-               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
-                       worker_cb, event_cb,
-                       (pthread_workqueue_function_workloop_t)workloop_cb_test_intransit, 0, 0), NULL);
-       } else if (strcmp(wl_function, "workloop_cb_test_sync_send") == 0) {
-               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
-                       worker_cb, event_cb,
-                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send, 0, 0), NULL);
-       } else if (strcmp(wl_function, "workloop_cb_test_sync_send_and_enable") == 0) {
-               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
-                       worker_cb, event_cb,
-                       (pthread_workqueue_function_workloop_t)workloop_cb_test_sync_send_and_enable, 0, 0), NULL);
-       } else if (strcmp(wl_function, "workloop_cb_test_send_two_sync") == 0) {
-               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
-                       worker_cb, event_cb,
-                       (pthread_workqueue_function_workloop_t)workloop_cb_test_send_two_sync, 0, 0), NULL);
-       } else if (strcmp(wl_function, "workloop_cb_test_two_send_and_destroy") == 0) {
-               T_QUIET; T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_workloop(
-                       worker_cb, event_cb,
-                       (pthread_workqueue_function_workloop_t)workloop_cb_test_two_send_and_destroy, 0, 0), NULL);
-       } else {
-               T_ASSERT_FAIL("no workloop function specified \n");
-       }
-
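-       /*
-        * Register the mach port with the dynamic workloop (id 25 in this
-        * test): EVFILT_MACHPORT plus MACH_RCV_MSG in fflags asks the kernel to
-        * receive messages directly on the servicer thread, and .qos carries
-        * the test's queue-override QoS.
-        */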
-       struct kevent_qos_s kev[] = {{
-               .ident = port,
-               .filter = EVFILT_MACHPORT,
-               .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
-               .fflags = (MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_LARGE_IDENTITY |
-                               MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_CTX) |
-                               MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0) |
-                               MACH_RCV_VOUCHER),
-               .data = 1,
-               .qos = (int32_t)_pthread_qos_class_encode(qos[ENV_QOS_QUEUE_OVERRIDE], 0, 0)
-       }};
-
-       struct kevent_qos_s kev_err[] = {{ 0 }};
-
-       /* Setup workloop for mach msg rcv */
-       r = kevent_id(25, kev, 1, kev_err, 1, NULL,
-                       NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_ERROR_EVENTS);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(r, "kevent_id");
-       T_QUIET; T_ASSERT_EQ(r, 0, "no errors returned from kevent_id");
-       sleep(HELPER_TIMEOUT_SECS);
-}
-
-T_HELPER_DECL(server_kevent_id,
-               "Reply with the QoS that a dispatch source event handler ran with")
-{
-       qos_class_t qos[ENV_VAR_QOS];
-       const char *qos_name[ENV_VAR_QOS];
-       const char *wl_function;
-       environ_get_qos(qos, qos_name, &wl_function);
-
-       expect_kevent_id_recv(get_server_port(), qos, qos_name, wl_function);
-       sleep(HELPER_TIMEOUT_SECS);
-       T_ASSERT_FAIL("should receive a message within %d seconds",
-                       RECV_TIMEOUT_SECS);
-}
-
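-/*
- * TEST_QOS expands to a T_DECL that runs the given server and client helpers
- * with three QoS values: qos_bo (expected before the override), qos_qo (the
- * queue-override QoS used for the knote), and qos_ao (expected after the
- * override).
- */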
-#define TEST_QOS(server_name, client_name, name, wl_function_name, qos_bo, qos_bo_name, qos_qo, qos_qo_name, qos_ao, qos_ao_name) \
-       T_DECL(server_kevent_id_##name, \
-                       "Event delivery at " qos_ao_name " QoS using a kevent_id", \
-                       T_META_ASROOT(YES)) \
-       { \
-               qos_class_t qos_array[ENV_VAR_QOS] = {qos_bo, qos_qo, qos_ao};  \
-               const char *qos_name_array[ENV_VAR_QOS] = {qos_bo_name, qos_qo_name, qos_ao_name}; \
-               run_client_server(server_name, client_name, qos_array, qos_name_array, wl_function_name); \
-       }
-
-/*
- * Test 1: Test special reply port SPI
- *
- * Create a thread special reply port and check that any subsequent calls to
- * the same SPI return MACH_PORT_NULL, unless the reply port is destroyed.
- */
-TEST_QOS("server_kevent_id", "qos_get_special_reply_port", special_reply_port, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_DEFAULT, "default")
-
-/*
- * Test 2: Test sync ipc send to an in-transit port
- *
- * Send a sync ipc message (at IN qos) to an in-transit port enqueued in a port
- * attached to a workloop. Test that the servicer of the workloop gets
- * sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_to_intransit", transit_IN, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INITIATED, "user initiated")
-
-/*
- * Test 3: Test sync ipc send to an in-transit port
- *
- * Send a sync ipc message (at UI qos) to an in-transit port enqueued in a port
- * attached to a workloop. Test that the servicer of the workloop gets
- * sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_to_intransit", transit_UI, "workloop_cb_test_intransit",
-       QOS_CLASS_USER_INITIATED, "user initiated",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 4: Test enqueue of a receive right having sync ipc override
- *
- * Enqueue a receive right which has a sync ipc override (at IN qos)
- * and test that the servicer of the workloop on the other side gets a
- * sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_and_enqueue_rcv", enqueue_IN, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INITIATED, "user initiated")
-
-/*
- * Test 5: Test enqueue of a receive right having sync ipc override
- *
- * Enqueue a receive right which has a sync ipc override (at UI qos)
- * and test that the servicer of the workloop on the other side gets a
- * sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_and_enqueue_rcv", enqueue_UI, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 6: Test starting a sync rcv overrides the servicer
- *
- * Send an async message to a port and then start waiting on
- * the port in mach msg rcv (at IN qos) with sync wait and test if the
- * servicer of the workloop gets sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_IN, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INITIATED, "user initiated")
-
-/*
- * Test 7: Test starting a sync rcv overrides the servicer
- *
- * Send an async message to a port and then start waiting on
- * the port in mach msg rcv (at UI qos) with sync wait and test if the
- * servicer of the workloop gets sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_and_sync_rcv", rcv_UI, "workloop_cb_test_intransit",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 8: test that sending a sync ipc message (at IN qos) to a port overrides the servicer
- *
- * Send a message with sync ipc override to a port and check if the servicer
- * of the workloop on other side gets sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_IN, "workloop_cb_test_sync_send",
-       QOS_CLASS_DEFAULT, "default",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INITIATED, "user initiated")
-
-/*
- * Test 9: test that sending a sync ipc message (at UI qos) to a port overrides the servicer
- *
- * Send a message with sync ipc override to a port and check if the servicer
- * of the workloop on other side gets sync ipc override.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_UI, "workloop_cb_test_sync_send",
-       QOS_CLASS_USER_INITIATED, "user initiated",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 10: test that enabling a knote in the workloop handler drops the sync ipc override of the delivered message
- *
- * Send a sync ipc message to port and check the servicer of the workloop
- * on other side gets sync ipc override and once the handler enables the knote,
- * that sync ipc override is dropped.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_sync_msg", send_sync_UI_and_enable, "workloop_cb_test_sync_send_and_enable",
-       QOS_CLASS_USER_INITIATED, "user initiated",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 11: test that returning to begin processing drops the sync ipc override of the delivered message
- *
- * Send a sync ipc message and check that enabling the knote clears the override of
- * the delivered message, while the override of an enqueued message remains.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_two_sync_msg", send_two_sync_UI, "workloop_cb_test_send_two_sync",
-       QOS_CLASS_USER_INITIATED, "user initiated",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
-
-/*
- * Test 12: test that destroying the special reply port drops the override
- *
- * Send an async message and a sync ipc message; the workloop handler
- * should get a sync ipc override, now test if destroying the special
- * reply port drops the sync ipc override on the servicer.
- */
-TEST_QOS("server_kevent_id", "qos_client_send_two_msg_and_destroy", send_two_UI_and_destroy, "workloop_cb_test_two_send_and_destroy",
-       QOS_CLASS_USER_INITIATED, "user initiated",
-       QOS_CLASS_MAINTENANCE, "maintenance",
-       QOS_CLASS_USER_INTERACTIVE, "user interactive")
diff --git a/tools/tests/darwintests/kpc.c b/tools/tests/darwintests/kpc.c
deleted file mode 100644 (file)
index 5200950..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <darwintest.h>
-#include <inttypes.h>
-#include <stdint.h>
-
-#include <kperf/kpc.h>
-
-T_DECL(fixed_counters,
-               "test that fixed counters return monotonically increasing values",
-               T_META_ASROOT(YES))
-{
-       T_SKIP("unimplemented");
-}
-
-T_DECL(fixed_thread_counters,
-               "test that fixed thread counters return monotonically increasing values",
-               T_META_ASROOT(YES))
-{
-       int err;
-       uint32_t ctrs_cnt;
-       uint64_t *ctrs_a;
-       uint64_t *ctrs_b;
-
-       T_SETUPBEGIN;
-
-       ctrs_cnt = kpc_get_counter_count(KPC_CLASS_FIXED_MASK);
-       if (ctrs_cnt == 0) {
-               T_SKIP("no fixed counters available");
-       }
-       T_LOG("device has %" PRIu32 " fixed counters", ctrs_cnt);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kpc_force_all_ctrs_set(1), NULL);
-       T_ASSERT_POSIX_SUCCESS(kpc_set_counting(KPC_CLASS_FIXED_MASK),
-                       "kpc_set_counting");
-       T_ASSERT_POSIX_SUCCESS(kpc_set_thread_counting(KPC_CLASS_FIXED_MASK),
-                       "kpc_set_thread_counting");
-
-       T_SETUPEND;
-
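-       /*
-        * Take two snapshots of the fixed thread counters and check that each
-        * counter is non-zero and strictly increasing between the snapshots.
-        */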
-       ctrs_a = malloc(ctrs_cnt * sizeof(uint64_t));
-       T_QUIET; T_ASSERT_NOTNULL(ctrs_a, NULL);
-
-       err = kpc_get_thread_counters(0, ctrs_cnt, ctrs_a);
-       T_ASSERT_POSIX_SUCCESS(err, "kpc_get_thread_counters");
-
-       for (uint32_t i = 0; i < ctrs_cnt; i++) {
-               T_LOG("checking counter %d with value %" PRIu64 " > 0", i, ctrs_a[i]);
-               T_QUIET;
-               T_EXPECT_GT(ctrs_a[i], UINT64_C(0), "counter %d is non-zero", i);
-       }
-
-       ctrs_b = malloc(ctrs_cnt * sizeof(uint64_t));
-       T_QUIET; T_ASSERT_NOTNULL(ctrs_b, NULL);
-
-       err = kpc_get_thread_counters(0, ctrs_cnt, ctrs_b);
-       T_ASSERT_POSIX_SUCCESS(err, "kpc_get_thread_counters");
-
-       for (uint32_t i = 0; i < ctrs_cnt; i++) {
-               T_LOG("checking counter %d with value %" PRIu64
-                               " > previous value %" PRIu64, i, ctrs_b[i], ctrs_a[i]);
-               T_QUIET;
-               T_EXPECT_GT(ctrs_b[i], UINT64_C(0), "counter %d is non-zero", i);
-               T_QUIET; T_EXPECT_LT(ctrs_a[i], ctrs_b[i],
-                               "counter %d is increasing", i);
-       }
-
-       free(ctrs_a);
-       free(ctrs_b);
-}
diff --git a/tools/tests/darwintests/kperf.c b/tools/tests/darwintests/kperf.c
deleted file mode 100644 (file)
index 81e3e4d..0000000
+++ /dev/null
@@ -1,558 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif /* defined(T_NAMESPACE) */
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <dispatch/dispatch.h>
-#include <inttypes.h>
-#include <ktrace/session.h>
-#include <ktrace/private.h>
-#include <System/sys/kdebug.h>
-#include <kperf/kperf.h>
-#include <kperfdata/kpdecode.h>
-#include <os/assumes.h>
-#include <stdint.h>
-#include <sys/sysctl.h>
-
-#include "kperf_helpers.h"
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.kperf"),
-               T_META_CHECK_LEAKS(false));
-
-#define MAX_CPUS    64
-#define MAX_THREADS 64
-
-volatile static bool running_threads = true;
-
-static void *
-spinning_thread(void *semp)
-{
-       T_QUIET;
-       T_ASSERT_NOTNULL(semp, "semaphore passed to thread should not be NULL");
-       dispatch_semaphore_signal(*(dispatch_semaphore_t *)semp);
-
-       while (running_threads);
-       return NULL;
-}
-
-#define PERF_STK_KHDR  UINT32_C(0x25020014)
-#define PERF_STK_UHDR  UINT32_C(0x25020018)
-#define PERF_TMR_FIRE  KDBG_EVENTID(DBG_PERF, 3, 0)
-#define PERF_TMR_HNDLR KDBG_EVENTID(DBG_PERF, 3, 2)
-#define PERF_TMR_PEND  KDBG_EVENTID(DBG_PERF, 3, 3)
-#define PERF_TMR_SKIP  KDBG_EVENTID(DBG_PERF, 3, 4)
-
-#define SCHED_HANDOFF KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, \
-               MACH_STACK_HANDOFF)
-#define SCHED_SWITCH  KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, MACH_SCHED)
-#define SCHED_IDLE    KDBG_EVENTID(DBG_MACH, DBG_MACH_SCHED, MACH_IDLE)
-
-#define MP_CPUS_CALL UINT32_C(0x1900004)
-
-#define DISPATCH_AFTER_EVENT UINT32_C(0xfefffffc)
-#define TIMEOUT_SECS 10
-
-#define TIMER_PERIOD_NS (1 * NSEC_PER_MSEC)
-
-static void
-reset_ktrace(void)
-{
-       kperf_reset();
-}
-
-/*
- * Ensure that kperf is correctly IPIing CPUs that are actively scheduling by
- * bringing up threads and ensuring that threads on-core are sampled by each
- * timer fire.
- */
-
-T_DECL(ipi_active_cpus,
-               "make sure that kperf IPIs all active CPUs",
-               T_META_ASROOT(true))
-{
-       int ncpus = dt_ncpu();
-       T_QUIET;
-       T_ASSERT_LT(ncpus, MAX_CPUS,
-                       "only supports up to %d CPUs", MAX_CPUS);
-       T_LOG("found %d CPUs", ncpus);
-
-       int nthreads = ncpus - 1;
-       T_QUIET;
-       T_ASSERT_LT(nthreads, MAX_THREADS,
-                       "only supports up to %d threads", MAX_THREADS);
-
-       static pthread_t threads[MAX_THREADS];
-
-       /*
-        * TODO options to write this to a file and reinterpret a file...
-        */
-
-       /*
-        * Create threads to bring up all of the CPUs.
-        */
-
-       dispatch_semaphore_t thread_spinning = dispatch_semaphore_create(0);
-
-       for (int i = 0; i < nthreads; i++) {
-               T_QUIET;
-               T_ASSERT_POSIX_ZERO(
-                               pthread_create(&threads[i], NULL, &spinning_thread,
-                               &thread_spinning), NULL);
-               dispatch_semaphore_wait(thread_spinning, DISPATCH_TIME_FOREVER);
-       }
-
-       T_LOG("spun up %d thread%s", nthreads, nthreads == 1 ? "" : "s");
-
-       ktrace_session_t s = ktrace_session_create();
-       T_WITH_ERRNO; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-       dispatch_queue_t q = dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0);
-
-       /*
-        * Only set the timeout after we've seen an event that was traced by us.
-        * This helps set a reasonable timeout after we're guaranteed to get a
-        * few events.
-        */
-
-       ktrace_events_single(s, DISPATCH_AFTER_EVENT,
-                       ^(__unused struct trace_point *tp)
-       {
-               dispatch_after(dispatch_time(DISPATCH_TIME_NOW,
-                               TIMEOUT_SECS * NSEC_PER_SEC), q, ^{
-                       ktrace_end(s, 0);
-               });
-       });
-
-       __block uint64_t nfires = 0;
-       __block uint64_t nsamples = 0;
-       static uint64_t idle_tids[MAX_CPUS] = { 0 };
-       __block int nidles = 0;
-
-       ktrace_set_completion_handler(s, ^{
-               T_LOG("stopping threads");
-
-               running_threads = false;
-
-               for (int i = 0; i < nthreads; i++) {
-                       T_QUIET;
-                       T_ASSERT_POSIX_ZERO(pthread_join(threads[i], NULL), NULL);
-               }
-
-               for (int i = 0; i < nidles; i++) {
-                       T_LOG("CPU %d idle thread: %#" PRIx64, i, idle_tids[i]);
-               }
-
-               T_LOG("saw %" PRIu64 " timer fires, %" PRIu64 " samples, "
-                               "%g samples/fire", nfires, nsamples,
-                               (double)nsamples / (double)nfires);
-
-               T_END;
-       });
-
-       /*
-        * Track which threads are running on each CPU.
-        */
-
-       static uint64_t tids_on_cpu[MAX_CPUS] = { 0 };
-
-       void (^switch_cb)(struct trace_point *) = ^(struct trace_point *tp) {
-               uint64_t new_thread = tp->arg2;
-               // uint64_t old_thread = tp->threadid;
-
-               for (int i = 0; i < nidles; i++) {
-                       if (idle_tids[i] == new_thread) {
-                               return;
-                       }
-               }
-
-               tids_on_cpu[tp->cpuid] = new_thread;
-       };
-
-       ktrace_events_single(s, SCHED_SWITCH, switch_cb);
-       ktrace_events_single(s, SCHED_HANDOFF, switch_cb);
-
-       /*
-        * Determine the thread IDs of the idle threads on each CPU.
-        */
-
-       ktrace_events_single(s, SCHED_IDLE, ^(struct trace_point *tp) {
-               uint64_t idle_thread = tp->threadid;
-
-               tids_on_cpu[tp->cpuid] = 0;
-
-               for (int i = 0; i < nidles; i++) {
-                       if (idle_tids[i] == idle_thread) {
-                               return;
-                       }
-               }
-
-               idle_tids[nidles++] = idle_thread;
-       });
-
-       /*
-        * On each timer fire, go through all the cores and mark any threads
-        * that should be sampled.
-        */
-
-       __block int last_fire_cpu = -1;
-       __block uint64_t sample_missing = 0;
-       static uint64_t tids_snap[MAX_CPUS] = { 0 };
-       __block int nexpected = 0;
-#if defined(__x86_64__)
-       __block int xcall_from_cpu = -1;
-#endif /* defined(__x86_64__) */
-       __block uint64_t xcall_mask = 0;
-
-       ktrace_events_single(s, PERF_TMR_FIRE, ^(struct trace_point *tp) {
-               int last_expected = nexpected;
-               nfires++;
-
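-               /*
-                * Any bit still set in sample_missing means the thread snapped
-                * on that CPU was not sampled by the previous timer fire.
-                */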
-               nexpected = 0;
-               for (int i = 0; i < ncpus; i++) {
-                       uint64_t i_bit = UINT64_C(1) << i;
-                       if (sample_missing & i_bit) {
-                               T_LOG("missed sample on CPU %d for thread %#llx from timer on CPU %d (xcall mask = %llx, expected %d samples)",
-                                               tp->cpuid, tids_snap[i], last_fire_cpu,
-                                               xcall_mask, last_expected);
-                               sample_missing &= ~i_bit;
-                       }
-
-                       if (tids_on_cpu[i] != 0) {
-                               tids_snap[i] = tids_on_cpu[i];
-                               sample_missing |= i_bit;
-                               nexpected++;
-                       }
-               }
-
-               T_QUIET;
-               T_ASSERT_LT((int)tp->cpuid, ncpus, "timer fire should not occur on an IOP");
-               last_fire_cpu = (int)tp->cpuid;
-#if defined(__x86_64__)
-               xcall_from_cpu = (int)tp->cpuid;
-#endif /* defined(__x86_64__) */
-       });
-
-#if defined(__x86_64__)
-       /*
-        * Watch for the cross-call on Intel, make sure they match what kperf
-        * should be doing.
-        */
-
-       ktrace_events_single(s, MP_CPUS_CALL, ^(struct trace_point *tp) {
-               if (xcall_from_cpu != (int)tp->cpuid) {
-                       return;
-               }
-
-               xcall_mask = tp->arg1;
-               xcall_from_cpu = -1;
-       });
-#endif /* defined(__x86_64__) */
-
-       /*
-        * On the timer handler for each CPU, unset the missing sample bitmap.
-        */
-
-       ktrace_events_single(s, PERF_TMR_HNDLR, ^(struct trace_point *tp) {
-               nsamples++;
-               if ((int)tp->cpuid > ncpus) {
-                       /* skip IOPs; they're not scheduling our threads */
-                       return;
-               }
-
-               sample_missing &= ~(UINT64_C(1) << tp->cpuid);
-       });
-
-       /*
-        * Configure kperf and ktrace.
-        */
-
-       (void)kperf_action_count_set(1);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1, KPERF_SAMPLER_KSTACK),
-                       NULL);
-       (void)kperf_timer_count_set(1);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
-                       kperf_ns_to_ticks(TIMER_PERIOD_NS)), NULL);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
-
-       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), "start kperf sampling");
-       T_ATEND(reset_ktrace);
-
-       T_ASSERT_POSIX_ZERO(ktrace_start(s,
-                       dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)),
-                       "start ktrace");
-
-       kdebug_trace(DISPATCH_AFTER_EVENT, 0, 0, 0, 0);
-
-       dispatch_main();
-}
-
-#pragma mark kdebug triggers
-
-#define KDEBUG_TRIGGER_TIMEOUT_NS (10 * NSEC_PER_SEC)
-
-#define NON_TRIGGER_CLASS    UINT8_C(0xfd)
-#define NON_TRIGGER_SUBCLASS UINT8_C(0xff)
-#define NON_TRIGGER_CODE     UINT8_C(0xff)
-
-#define NON_TRIGGER_EVENT \
-               (KDBG_EVENTID(NON_TRIGGER_CLASS, NON_TRIGGER_SUBCLASS, \
-               NON_TRIGGER_CODE))
-
-static void
-expect_kdebug_trigger(const char *filter_desc, const uint32_t *debugids,
-               unsigned int n_debugids)
-{
-       __block int missing_kernel_stacks = 0;
-       __block int missing_user_stacks = 0;
-       ktrace_session_t s;
-       kperf_kdebug_filter_t filter;
-
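-       /*
-        * Each triggering debugid increments both expected-stack counters and
-        * each observed kernel or user stack decrements one, so at completion
-        * both counters must be <= 0.
-        */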
-       s = ktrace_session_create();
-       T_QUIET; T_ASSERT_NOTNULL(s, NULL);
-
-       ktrace_events_single(s, PERF_STK_KHDR, ^(struct trace_point *tp) {
-                       missing_kernel_stacks--;
-                       T_LOG("saw kernel stack with %lu frames, flags = %#lx", tp->arg2,
-                                       tp->arg1);
-                       });
-       ktrace_events_single(s, PERF_STK_UHDR, ^(struct trace_point *tp) {
-                       missing_user_stacks--;
-                       T_LOG("saw user stack with %lu frames, flags = %#lx", tp->arg2,
-                                       tp->arg1);
-                       });
-
-       for (unsigned int i = 0; i < n_debugids; i++) {
-               ktrace_events_single(s, debugids[i], ^(struct trace_point *tp) {
-                               missing_kernel_stacks++;
-                               missing_user_stacks++;
-                               T_LOG("saw event with debugid 0x%" PRIx32, tp->debugid);
-                               });
-       }
-
-       ktrace_events_single(s, NON_TRIGGER_EVENT,
-                       ^(__unused struct trace_point *tp)
-                       {
-                       ktrace_end(s, 0);
-                       });
-
-       ktrace_set_completion_handler(s, ^{
-                       T_EXPECT_LE(missing_kernel_stacks, 0, NULL);
-                       T_EXPECT_LE(missing_user_stacks, 0, NULL);
-
-                       ktrace_session_destroy(s);
-                       T_END;
-                       });
-
-       /* configure kperf */
-
-       kperf_reset();
-
-       (void)kperf_action_count_set(1);
-       T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
-                               KPERF_SAMPLER_KSTACK | KPERF_SAMPLER_USTACK), NULL);
-
-       filter = kperf_kdebug_filter_create();
-       T_ASSERT_NOTNULL(filter, NULL);
-
-       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL);
-       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_desc(filter, filter_desc),
-                       NULL);
-       T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL);
-       kperf_kdebug_filter_destroy(filter);
-
-       T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
-
-       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-       /* trace the triggering debugids */
-
-       for (unsigned int i = 0; i < n_debugids; i++) {
-               T_ASSERT_POSIX_SUCCESS(kdebug_trace(debugids[i], 0, 0, 0, 0), NULL);
-       }
-
-       T_ASSERT_POSIX_SUCCESS(kdebug_trace(NON_TRIGGER_EVENT, 0, 0, 0, 0), NULL);
-
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, KDEBUG_TRIGGER_TIMEOUT_NS),
-                       dispatch_get_main_queue(), ^(void)
-                       {
-                       ktrace_end(s, 1);
-                       });
-}
-
-#define TRIGGER_CLASS     UINT8_C(0xfe)
-#define TRIGGER_CLASS_END UINT8_C(0xfd)
-#define TRIGGER_SUBCLASS  UINT8_C(0xff)
-#define TRIGGER_CODE      UINT8_C(0)
-#define TRIGGER_DEBUGID \
-               (KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, TRIGGER_CODE))
-
-T_DECL(kdebug_trigger_classes,
-               "test that kdebug trigger samples on classes",
-               T_META_ASROOT(true))
-{
-       const uint32_t class_debugids[] = {
-               KDBG_EVENTID(TRIGGER_CLASS, 1, 1),
-               KDBG_EVENTID(TRIGGER_CLASS, 2, 1),
-               KDBG_EVENTID(TRIGGER_CLASS_END, 1, 1) | DBG_FUNC_END,
-               KDBG_EVENTID(TRIGGER_CLASS_END, 2, 1) | DBG_FUNC_END,
-       };
-
-       expect_kdebug_trigger("C0xfe,C0xfdr", class_debugids,
-                       sizeof(class_debugids) / sizeof(class_debugids[0]));
-       dispatch_main();
-}
-
-T_DECL(kdebug_trigger_subclasses,
-               "test that kdebug trigger samples on subclasses",
-               T_META_ASROOT(true))
-{
-       const uint32_t subclass_debugids[] = {
-               KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 0),
-               KDBG_EVENTID(TRIGGER_CLASS, TRIGGER_SUBCLASS, 1),
-               KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 0) | DBG_FUNC_END,
-               KDBG_EVENTID(TRIGGER_CLASS_END, TRIGGER_SUBCLASS, 1) | DBG_FUNC_END
-       };
-
-       expect_kdebug_trigger("S0xfeff,S0xfdffr", subclass_debugids,
-                       sizeof(subclass_debugids) / sizeof(subclass_debugids[0]));
-       dispatch_main();
-}
-
-T_DECL(kdebug_trigger_debugids,
-               "test that kdebug trigger samples on debugids",
-               T_META_ASROOT(true))
-{
-       const uint32_t debugids[] = {
-               TRIGGER_DEBUGID
-       };
-
-       expect_kdebug_trigger("D0xfeff0000", debugids,
-                       sizeof(debugids) / sizeof(debugids[0]));
-       dispatch_main();
-}
-
-/*
- * TODO Set a single function specifier filter and expect it not to trigger on all
- * events from that class.
- */
-
-T_DECL(kdbg_callstacks,
-               "test that the kdbg_callstacks samples on syscalls",
-               T_META_ASROOT(true))
-{
-       ktrace_session_t s;
-       __block bool saw_user_stack = false;
-
-       s = ktrace_session_create();
-       T_ASSERT_NOTNULL(s, NULL);
-
-       /*
-        * Make sure BSD events are traced in order to trigger samples on syscalls.
-        */
-       ktrace_events_class(s, DBG_BSD, ^void(__unused struct trace_point *tp) {});
-
-       ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) {
-               saw_user_stack = true;
-               ktrace_end(s, 1);
-       });
-
-       ktrace_set_completion_handler(s, ^{
-               ktrace_session_destroy(s);
-
-               T_EXPECT_TRUE(saw_user_stack,
-                               "saw user stack after configuring kdbg_callstacks");
-               T_END;
-       });
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-       T_ASSERT_POSIX_SUCCESS(kperf_kdbg_callstacks_set(1), NULL);
-#pragma clang diagnostic pop
-       T_ATEND(kperf_reset);
-
-       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, 10 * NSEC_PER_SEC),
-                       dispatch_get_main_queue(), ^(void) {
-               ktrace_end(s, 1);
-       });
-
-       dispatch_main();
-}
-
-#pragma mark PET
-
-#define STACKS_WAIT_DURATION_NS (3 * NSEC_PER_SEC)
-
-static void
-expect_stacks_traced(void (^cb)(void))
-{
-       ktrace_session_t s;
-
-       s = ktrace_session_create();
-       T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-       __block unsigned int user_stacks = 0;
-       __block unsigned int kernel_stacks = 0;
-
-       ktrace_events_single(s, PERF_STK_UHDR, ^(__unused struct trace_point *tp) {
-                       user_stacks++;
-                       });
-       ktrace_events_single(s, PERF_STK_KHDR, ^(__unused struct trace_point *tp) {
-                       kernel_stacks++;
-                       });
-
-       ktrace_set_completion_handler(s, ^(void) {
-                       ktrace_session_destroy(s);
-                       T_EXPECT_GT(user_stacks, 0U, NULL);
-                       T_EXPECT_GT(kernel_stacks, 0U, NULL);
-                       cb();
-                       });
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
-
-       T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, STACKS_WAIT_DURATION_NS),
-                       dispatch_get_main_queue(), ^(void)
-                       {
-                       kperf_reset();
-                       ktrace_end(s, 0);
-                       });
-}
-
-T_DECL(pet, "test that PET mode samples kernel and user stacks",
-               T_META_ASROOT(true))
-{
-       configure_kperf_stacks_timer(-1, 10);
-       T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL);
-
-       expect_stacks_traced(^(void) {
-                       T_END;
-                       });
-
-       dispatch_main();
-}
-
-T_DECL(lightweight_pet,
-               "test that lightweight PET mode samples kernel and user stacks",
-               T_META_ASROOT(true))
-{
-       int set = 1;
-
-       configure_kperf_stacks_timer(-1, 10);
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kperf.lightweight_pet", NULL, NULL,
-                               &set, sizeof(set)), NULL);
-       T_ASSERT_POSIX_SUCCESS(kperf_timer_pet_set(0), NULL);
-
-       expect_stacks_traced(^(void) {
-                       T_END;
-                       });
-
-       dispatch_main();
-}
diff --git a/tools/tests/darwintests/kperf_backtracing.c b/tools/tests/darwintests/kperf_backtracing.c
deleted file mode 100644 (file)
index 1d3d46d..0000000
+++ /dev/null
@@ -1,449 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <CoreSymbolication/CoreSymbolication.h>
-#include <darwintest.h>
-#include <dispatch/dispatch.h>
-#include <kperf/kperf.h>
-#include <ktrace/session.h>
-#include <System/sys/kdebug.h>
-#include <pthread.h>
-
-#include "kperf_helpers.h"
-
-#define PERF_STK_KHDR  UINT32_C(0x25020014)
-#define PERF_STK_UHDR  UINT32_C(0x25020018)
-#define PERF_STK_KDATA UINT32_C(0x2502000c)
-#define PERF_STK_UDATA UINT32_C(0x25020010)
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.kperf"),
-               T_META_CHECK_LEAKS(false));
-
-static void
-expect_frame(const char **bt, unsigned int bt_len, CSSymbolRef symbol,
-    unsigned long addr, unsigned int bt_idx, unsigned int max_frames)
-{
-    const char *name;
-    unsigned int frame_idx = max_frames - bt_idx - 1;
-
-    if (!bt[frame_idx]) {
-        T_LOG("frame %2u: skipping system frame", frame_idx);
-        return;
-    }
-
-    if (CSIsNull(symbol)) {
-        T_FAIL("invalid symbol for address %#lx at frame %d", addr, frame_idx);
-        return;
-    }
-
-    if (frame_idx >= bt_len) {
-        T_FAIL("unexpected frame '%s' (%#lx) at index %u",
-            CSSymbolGetName(symbol), addr, frame_idx);
-        return;
-    }
-
-    name = CSSymbolGetName(symbol);
-    T_QUIET; T_ASSERT_NOTNULL(name, NULL);
-    T_EXPECT_EQ_STR(name, bt[frame_idx],
-        "frame %2u: saw '%s', expected '%s'",
-        frame_idx, name, bt[frame_idx]);
-}
-
-/*
- * Expect to see either user or kernel stacks on thread with ID `tid` with a
- * signature of `bt` of length `bt_len`.  Updates `stacks_seen` when stack
- * is found.
- *
- * Can also allow stacks to be larger than the signature -- additional frames
- * near the current PC will be ignored.  This allows stacks to potentially be
- * in the middle of a signalling system call (which signals that it is safe to
- * start sampling).
- */
-static void
-expect_backtrace(ktrace_session_t s, uint64_t tid, unsigned int *stacks_seen,
-    bool kern, const char **bt, unsigned int bt_len, unsigned int allow_larger_by)
-{
-    CSSymbolicatorRef symb;
-    uint32_t hdr_debugid;
-    uint32_t data_debugid;
-    __block unsigned int stacks = 0;
-    __block unsigned int frames = 0;
-    __block unsigned int hdr_frames = 0;
-    __block unsigned int allow_larger = allow_larger_by;
-
-    if (kern) {
-        static CSSymbolicatorRef kern_symb;
-        static dispatch_once_t kern_symb_once;
-
-        hdr_debugid = PERF_STK_KHDR;
-        data_debugid = PERF_STK_KDATA;
-
-        dispatch_once(&kern_symb_once, ^(void) {
-            kern_symb = CSSymbolicatorCreateWithMachKernel();
-            T_QUIET; T_ASSERT_FALSE(CSIsNull(kern_symb), NULL);
-        });
-        symb = kern_symb;
-    } else {
-        static CSSymbolicatorRef user_symb;
-        static dispatch_once_t user_symb_once;
-
-        hdr_debugid = PERF_STK_UHDR;
-        data_debugid = PERF_STK_UDATA;
-
-        dispatch_once(&user_symb_once, ^(void) {
-            user_symb = CSSymbolicatorCreateWithTask(mach_task_self());
-            T_QUIET; T_ASSERT_FALSE(CSIsNull(user_symb), NULL);
-            T_QUIET; T_ASSERT_TRUE(CSSymbolicatorIsTaskValid(user_symb), NULL);
-        });
-        symb = user_symb;
-    }
-
-    ktrace_events_single(s, hdr_debugid, ^(struct trace_point *tp) {
-        if (tid != 0 && tid != tp->threadid) {
-            return;
-        }
-
-        T_LOG("found stack from thread %#lx", tp->threadid);
-        stacks++;
-        if (!(tp->arg1 & 1)) {
-            T_FAIL("invalid %s stack on thread %#lx", kern ? "kernel" : "user",
-                tp->threadid);
-            return;
-        }
-
-        hdr_frames = (unsigned int)tp->arg2;
-        /* ignore extra link register or value pointed to by stack pointer */
-        hdr_frames -= 1;
-
-        T_QUIET; T_EXPECT_GE(hdr_frames, bt_len,
-            "number of frames in header");
-        T_QUIET; T_EXPECT_LE(hdr_frames, bt_len + allow_larger,
-            "number of frames in header");
-        if (hdr_frames > bt_len && allow_larger > 0) {
-            allow_larger = hdr_frames - bt_len;
-            hdr_frames = bt_len;
-        }
-
-        T_LOG("%s stack seen", kern ? "kernel" : "user");
-        frames = 0;
-    });
-
-    ktrace_events_single(s, data_debugid, ^(struct trace_point *tp) {
-        if (tid != 0 && tid != tp->threadid) {
-            return;
-        }
-
-        int i = 0;
-
-        if (frames == 0 && hdr_frames > bt_len) {
-            /* skip frames near the PC */
-            i = (int)allow_larger;
-            allow_larger -= 4;
-        }
-
-        for (; i < 4 && frames < hdr_frames; i++, frames++) {
-            unsigned long addr = (&tp->arg1)[i];
-            CSSymbolRef symbol = CSSymbolicatorGetSymbolWithAddressAtTime(
-                symb, addr, kCSNow);
-
-            expect_frame(bt, bt_len, symbol, addr, frames, hdr_frames);
-        }
-
-        /* saw the end of the user stack */
-        if (hdr_frames == frames) {
-            *stacks_seen += 1;
-            if (!kern) {
-                ktrace_end(s, 1);
-            }
-        }
-    });
-}
-
-#define TRIGGERING_DEBUGID (0xfeff0f00)
-
-/*
- * These functions must return an int to avoid the function prologue being
- * hoisted out of the path to the spin (breaking being able to get a good
- * backtrace).
- */
-static int __attribute__((noinline,not_tail_called))
-recurse_a(dispatch_semaphore_t spinning, unsigned int frames);
-static int __attribute__((noinline,not_tail_called))
-recurse_b(dispatch_semaphore_t spinning, unsigned int frames);
-
-static int __attribute__((noinline,not_tail_called))
-recurse_a(dispatch_semaphore_t spinning, unsigned int frames)
-{
-    if (frames == 0) {
-        if (spinning) {
-            dispatch_semaphore_signal(spinning);
-            for (;;);
-        } else {
-            kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0);
-            return 0;
-        }
-    }
-
-    return recurse_b(spinning, frames - 1) + 1;
-}
-
-static int __attribute__((noinline,not_tail_called))
-recurse_b(dispatch_semaphore_t spinning, unsigned int frames)
-{
-    if (frames == 0) {
-        if (spinning) {
-            dispatch_semaphore_signal(spinning);
-            for (;;);
-        } else {
-            kdebug_trace(TRIGGERING_DEBUGID, 0, 0, 0, 0);
-            return 0;
-        }
-    }
-
-    return recurse_a(spinning, frames - 1) + 1;
-}
-
-#define USER_FRAMES       (12)
-
-#if defined(__x86_64__)
-#define RECURSE_START_OFFSET (4)
-#else /* defined(__x86_64__) */
-#define RECURSE_START_OFFSET (3)
-#endif /* defined(__x86_64__) */
-
-static const char *user_bt[USER_FRAMES] = {
-#if defined(__x86_64__)
-    NULL,
-#endif /* defined(__x86_64__) */
-    NULL, NULL,
-    "backtrace_thread",
-    "recurse_a", "recurse_b", "recurse_a", "recurse_b",
-    "recurse_a", "recurse_b", "recurse_a",
-#if !defined(__x86_64__)
-    "recurse_b",
-#endif /* !defined(__x86_64__) */
-    NULL
-};
-
-#if defined(__arm__)
-
-#define KERNEL_FRAMES (2)
-static const char *kernel_bt[KERNEL_FRAMES] = {
-    "unix_syscall", "kdebug_trace64"
-};
-
-#elif defined(__arm64__)
-
-#define KERNEL_FRAMES (4)
-static const char *kernel_bt[KERNEL_FRAMES] = {
-    "fleh_synchronous", "sleh_synchronous", "unix_syscall", "kdebug_trace64"
-};
-
-#elif defined(__x86_64__)
-
-#define KERNEL_FRAMES (2)
-static const char *kernel_bt[KERNEL_FRAMES] = {
-    "unix_syscall64", "kdebug_trace64"
-};
-
-#else
-#error "architecture unsupported"
-#endif /* defined(__arm__) */
-
-static dispatch_once_t backtrace_once;
-static dispatch_semaphore_t backtrace_started;
-static dispatch_semaphore_t backtrace_go;
-
-/*
- * Another thread to run with a known backtrace.
- *
- * Take a semaphore that will be signalled when the thread is spinning at the
- * correct frame.  If the semaphore is NULL, don't spin and instead make a
- * kdebug_trace system call, which can trigger a deterministic backtrace itself.
- */
-static void *
-backtrace_thread(void *arg)
-{
-    dispatch_semaphore_t notify_spinning;
-    unsigned int calls;
-
-    notify_spinning = (dispatch_semaphore_t)arg;
-
-    dispatch_semaphore_signal(backtrace_started);
-    if (!notify_spinning) {
-        dispatch_semaphore_wait(backtrace_go, DISPATCH_TIME_FOREVER);
-    }
-
-    /*
-     * backtrace_thread, recurse_a, recurse_b, ...[, __kdebug_trace64]
-     *
-     * Always make one less call for this frame (backtrace_thread).
-     */
-    calls = USER_FRAMES - RECURSE_START_OFFSET - 1 /* backtrace_thread */;
-    if (notify_spinning) {
-        /*
-         * Spinning doesn't end up calling __kdebug_trace64.
-         */
-        calls -= 1;
-    }
-
-    T_LOG("backtrace thread calling into %d frames (already at %d frames)",
-        calls, RECURSE_START_OFFSET);
-    (void)recurse_a(notify_spinning, calls);
-    return NULL;
-}
-
-static uint64_t
-create_backtrace_thread(dispatch_semaphore_t notify_spinning)
-{
-    pthread_t thread = NULL;
-    uint64_t tid;
-
-    dispatch_once(&backtrace_once, ^{
-        backtrace_started = dispatch_semaphore_create(0);
-        T_QUIET; T_ASSERT_NOTNULL(backtrace_started, NULL);
-
-        if (!notify_spinning) {
-            backtrace_go = dispatch_semaphore_create(0);
-            T_QUIET; T_ASSERT_NOTNULL(backtrace_go, NULL);
-        }
-    });
-
-    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, backtrace_thread,
-        (void *)notify_spinning), NULL);
-    T_QUIET; T_ASSERT_NOTNULL(thread, "backtrace thread created");
-    dispatch_semaphore_wait(backtrace_started, DISPATCH_TIME_FOREVER);
-
-    T_QUIET; T_ASSERT_POSIX_ZERO(pthread_threadid_np(thread, &tid), NULL);
-    T_QUIET; T_ASSERT_NE(tid, UINT64_C(0),
-        "backtrace thread created does not have ID 0");
-
-    T_LOG("starting thread with ID 0x%" PRIx64, tid);
-
-    return tid;
-}
-
-static void
-start_backtrace_thread(void)
-{
-    T_QUIET; T_ASSERT_NOTNULL(backtrace_go,
-        "thread to backtrace created before starting it");
-    dispatch_semaphore_signal(backtrace_go);
-}
-
-#if TARGET_OS_WATCH
-#define TEST_TIMEOUT_NS (30 * NSEC_PER_SEC)
-#else /* TARGET_OS_WATCH */
-#define TEST_TIMEOUT_NS (5 * NSEC_PER_SEC)
-#endif /* !TARGET_OS_WATCH */
-
-T_DECL(backtraces_kdebug_trigger,
-    "test that backtraces from kdebug trigger are correct",
-    T_META_ASROOT(true))
-{
-    static unsigned int stacks_seen = 0;
-    ktrace_session_t s;
-    kperf_kdebug_filter_t filter;
-    uint64_t tid;
-
-    s = ktrace_session_create();
-    T_ASSERT_NOTNULL(s, "ktrace session was created");
-
-    T_ASSERT_POSIX_ZERO(ktrace_filter_pid(s, getpid()), NULL);
-
-    tid = create_backtrace_thread(NULL);
-    expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES, 0);
-    expect_backtrace(s, tid, &stacks_seen, true, kernel_bt, KERNEL_FRAMES, 0);
-
-    /*
-     * The triggering event must be traced (and thus registered with libktrace)
-     * to get backtraces.
-     */
-    ktrace_events_single(s, TRIGGERING_DEBUGID,
-        ^(__unused struct trace_point *tp){ });
-
-    ktrace_set_completion_handler(s, ^(void) {
-        T_EXPECT_GE(stacks_seen, 2U, "saw both kernel and user stacks");
-        ktrace_session_destroy(s);
-        kperf_reset();
-        T_END;
-    });
-
-    filter = kperf_kdebug_filter_create();
-    T_ASSERT_NOTNULL(filter, "kperf kdebug filter was created");
-
-    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_add_debugid(filter,
-        TRIGGERING_DEBUGID), NULL);
-    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_filter_set(filter), NULL);
-    (void)kperf_action_count_set(1);
-    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
-        KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL);
-    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_kdebug_action_set(1), NULL);
-    kperf_kdebug_filter_destroy(filter);
-
-    T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    start_backtrace_thread();
-
-    dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
-        dispatch_get_main_queue(), ^(void)
-    {
-        T_LOG("ending test after timeout");
-        ktrace_end(s, 0);
-    });
-
-    dispatch_main();
-}
-
-T_DECL(backtraces_user_timer,
-    "test that user backtraces on a timer are correct",
-    T_META_ASROOT(true))
-{
-    static unsigned int stacks_seen = 0;
-    ktrace_session_t s;
-    uint64_t tid;
-    dispatch_semaphore_t wait_for_spinning = dispatch_semaphore_create(0);
-
-    s = ktrace_session_create();
-    T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-    ktrace_filter_pid(s, getpid());
-
-    configure_kperf_stacks_timer(getpid(), 10);
-
-    tid = create_backtrace_thread(wait_for_spinning);
-    /* potentially calling dispatch function and system call */
-    expect_backtrace(s, tid, &stacks_seen, false, user_bt, USER_FRAMES - 1, 2);
-
-    ktrace_set_completion_handler(s, ^(void) {
-        T_EXPECT_GE(stacks_seen, 1U, "saw at least one stack");
-        ktrace_session_destroy(s);
-        kperf_reset();
-        T_END;
-    });
-
-    T_QUIET; T_ASSERT_POSIX_SUCCESS(kperf_sample_set(1), NULL);
-
-    /* wait until the thread that will be backtraced is spinning */
-    dispatch_semaphore_wait(wait_for_spinning, DISPATCH_TIME_FOREVER);
-
-    T_ASSERT_POSIX_ZERO(ktrace_start(s, dispatch_get_main_queue()), NULL);
-
-    dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT_NS),
-        dispatch_get_main_queue(), ^(void)
-    {
-        T_LOG("ending test after timeout");
-        ktrace_end(s, 0);
-    });
-
-    dispatch_main();
-}
-
-/* TODO test kernel stacks in all modes */
-/* TODO legacy PET mode backtracing */
-/* TODO test deep stacks, further than 128 frames, make sure they are truncated */
-/* TODO test constrained stacks */
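The PERF_STK_* debug IDs hard-coded near the top of this deleted file follow the standard kdebug event-ID layout (class, subclass, code). A small sketch, assuming the KDBG_EVENTID macro from <sys/kdebug.h>, shows how the same constants could be composed instead of spelled out as magic numbers; the MY_ names are illustrative only:

    #include <sys/kdebug.h>

    /* DBG_PERF is class 0x25; the kperf call-stack events use subclass 0x02.
     * KDBG_EVENTID(0x25, 0x02, 5) == 0x25020014 (kernel stack header),
     * KDBG_EVENTID(0x25, 0x02, 6) == 0x25020018 (user stack header). */
    #define MY_PERF_STK_KHDR KDBG_EVENTID(0x25, 0x02, 5)
    #define MY_PERF_STK_UHDR KDBG_EVENTID(0x25, 0x02, 6)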
diff --git a/tools/tests/darwintests/kperf_helpers.c b/tools/tests/darwintests/kperf_helpers.c
deleted file mode 100644 (file)
index bf64f6b..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#include "kperf_helpers.h"
-
-#include <darwintest.h>
-#include <kperf/kperf.h>
-#include <unistd.h>
-
-void
-configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms)
-{
-    kperf_reset();
-
-    (void)kperf_action_count_set(1);
-    (void)kperf_timer_count_set(1);
-
-    T_ASSERT_POSIX_SUCCESS(kperf_action_samplers_set(1,
-        KPERF_SAMPLER_USTACK | KPERF_SAMPLER_KSTACK), NULL);
-
-    if (pid != -1) {
-        T_ASSERT_POSIX_SUCCESS(kperf_action_filter_set_by_pid(1, pid), NULL);
-    }
-
-    T_ASSERT_POSIX_SUCCESS(kperf_timer_action_set(0, 1), NULL);
-    T_ASSERT_POSIX_SUCCESS(kperf_timer_period_set(0,
-        kperf_ns_to_ticks(period_ms * NSEC_PER_MSEC)), NULL);
-}
diff --git a/tools/tests/darwintests/kperf_helpers.h b/tools/tests/darwintests/kperf_helpers.h
deleted file mode 100644 (file)
index 466f3d9..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef KPERF_HELPERS_H
-#define KPERF_HELPERS_H
-
-#include <unistd.h>
-
-void configure_kperf_stacks_timer(pid_t pid, unsigned int period_ms);
-
-#endif /* !defined(KPERF_HELPERS_H) */
diff --git a/tools/tests/darwintests/kqueue_add_and_trigger.c b/tools/tests/darwintests/kqueue_add_and_trigger.c
deleted file mode 100644 (file)
index 15243a7..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <unistd.h>
-#include <errno.h>
-#include <sys/event.h>
-#include <darwintest.h>
-
-/* <rdar://problem/28139044> EVFILT_USER doesn't properly support add&fire atomic combination
- *
- * Check that using EV_ADD together with NOTE_TRIGGER on an EVFILT_USER event actually triggers the event just added.
- *
- */
-
-T_DECL(kqueue_add_and_trigger_evfilt_user, "Add and trigger EVFILT_USER events with kevent()")
-{
-       int kq_fd, ret;
-       struct kevent ret_kev;
-       const struct kevent kev = {
-               .ident = 1,
-               .filter = EVFILT_USER,
-               .flags = EV_ADD|EV_CLEAR,
-               .fflags = NOTE_TRIGGER,
-       };
-       const struct timespec timeout = {
-               .tv_sec = 1,
-               .tv_nsec = 0,
-       };
-
-       T_ASSERT_POSIX_SUCCESS((kq_fd = kqueue()), NULL);
-       ret = kevent(kq_fd, &kev, 1, &ret_kev, 1, &timeout);
-
-       T_ASSERT_POSIX_SUCCESS(ret, "kevent");
-
-       T_ASSERT_EQ(ret, 1, "kevent with add and trigger, ret");
-       T_ASSERT_EQ(ret_kev.ident, 1, "kevent with add and trigger, ident");
-       T_ASSERT_EQ(ret_kev.filter, EVFILT_USER, "kevent with add and trigger, filter");
-
-}
-
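The test above deliberately collapses registration and firing of the EVFILT_USER event into a single kevent() call. For contrast, a minimal sketch of the conventional two-step usage (register first, trigger later, possibly from another thread) might look like the following; the kq descriptor is assumed to come from kqueue():

    struct kevent reg, trig;

    /* Register a user event with ident 1; EV_CLEAR resets it after delivery. */
    EV_SET(&reg, 1, EVFILT_USER, EV_ADD | EV_CLEAR, 0, 0, NULL);
    (void)kevent(kq, &reg, 1, NULL, 0, NULL);

    /* Later: fire it by setting NOTE_TRIGGER in fflags. */
    EV_SET(&trig, 1, EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL);
    (void)kevent(kq, &trig, 1, NULL, 0, NULL);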
diff --git a/tools/tests/darwintests/kqueue_close.c b/tools/tests/darwintests/kqueue_close.c
deleted file mode 100644 (file)
index 3682d91..0000000
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <unistd.h>
-#include <pthread.h>
-#include <errno.h>
-
-#include <sys/event.h>
-
-#include <darwintest.h>
-
-/*
- * <rdar://problem/30231213> close() of kqueue FD races with kqueue_scan park
- *
- * When close concurrent with poll goes wrong, the close hangs
- * and the kevent never gets any more events.
- */
-
-/* Both events should fire at about the same time */
-static uint32_t timeout_ms = 10;
-
-static void *
-poll_kqueue(void *arg)
-{
-	int fd = (int)(uintptr_t)arg;
-
-       struct kevent kev = {
-               .filter = EVFILT_TIMER,
-               .flags  = EV_ADD,
-               .data   = timeout_ms,
-       };
-
-       int rv = kevent(fd, &kev, 1, NULL, 0, NULL);
-
-       if (rv == -1 && errno == EBADF) {
-               /* The close may race with this thread spawning */
-               T_LOG("kqueue already closed?");
-               return NULL;
-       } else {
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kevent");
-       }
-
-       while ((rv = kevent(fd, NULL, 0, &kev, 1, NULL)) == 1) {
-               T_LOG("poll\n");
-       }
-
-       if (rv != -1 || errno != EBADF) {
-               T_ASSERT_POSIX_SUCCESS(rv, "fd should be closed");
-       }
-
-       return NULL;
-}
-
-static void
-run_test()
-{
-       int fd = kqueue();
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(fd, "kqueue");
-
-       pthread_t thread;
-       int rv = pthread_create(&thread, NULL, poll_kqueue,
-                               (void *)(uintptr_t)fd);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
-
-       usleep(timeout_ms * 1000);
-
-       rv = close(fd);
-       T_ASSERT_POSIX_SUCCESS(rv, "close");
-
-       rv = pthread_join(thread, NULL);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join");
-}
-
-T_DECL(kqueue_close_race, "Races kqueue close with kqueue process",
-       T_META_LTEPHASE(LTE_POSTINIT), T_META_TIMEOUT(5))
-{
-       for (uint32_t i = 1 ; i < 100 ; i++) {
-               run_test();
-       }
-}
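The poller above relies on the EVFILT_TIMER default unit: when fflags is 0, the data field is interpreted as milliseconds. If a different resolution were wanted, the unit can be selected explicitly; a brief sketch (not part of the deleted test) using nanoseconds:

    struct kevent kev;
    /* NOTE_SECONDS, NOTE_USECONDS, or NOTE_NSECONDS in fflags select the unit. */
    EV_SET(&kev, 1, EVFILT_TIMER, EV_ADD, NOTE_NSECONDS, 10 * 1000 * 1000 /* 10 ms */, NULL);
    (void)kevent(fd, &kev, 1, NULL, 0, NULL);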
diff --git a/tools/tests/darwintests/kqueue_fifo_18776047.c b/tools/tests/darwintests/kqueue_fifo_18776047.c
deleted file mode 100644 (file)
index fe45758..0000000
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * testname: kqueue_fifo
- */
-
-#include <darwintest.h>
-#include <fcntl.h>
-#include <sys/event.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <errno.h>
-
-#include <TargetConditionals.h>
-
-#define TMP_FILE_PATH "/tmp/test_kqueue_fifo_18776047"
-
-#define READ_BUFFER_LEN 256
-
-#if TARGET_OS_WATCH
-#define TOTAL_ITERATIONS 5000
-#else
-#define TOTAL_ITERATIONS 10000
-#endif
-
-/* prototypes */
-int write_some_data(int fd);
-int read_data(int fd);
-void create_fifo(const char * filepath);
-void kevent_one_shot(int kq, int fd, int filter);
-
-int
-write_some_data(int fd)
-{
-       int retval  = 0;
-       int count   = 0;
-       int len     = 5;
-       char * data = "ABCDE";
-       while (true) {
-               errno  = 0;
-               retval = (int)write(fd, data, (size_t)len);
-               if (retval < 0) {
-                       if (errno == EAGAIN) {
-                               if (len == 1)
-                                       return count;
-                               else
-                                       len--;
-                       } else {
-                               T_ASSERT_FAIL("write to fd %d of %s of len %d failed.", fd, data, len);
-                               abort();
-                       }
-               } else {
-                       count += retval;
-               }
-       }
-}
-
-int
-read_data(int fd)
-{
-       int retval, count = 0;
-       char databuffer[READ_BUFFER_LEN];
-       while (true) {
-               errno  = 0;
-               retval = (int)read(fd, databuffer, READ_BUFFER_LEN);
-               if (retval < 0) {
-                       if (errno == EAGAIN) {
-                               return count;
-                       } else {
-                               T_ASSERT_FAIL("read from fd %d failed.", fd);
-                               abort();
-                       }
-               }
-               count += retval;
-       }
-}
-
-void
-create_fifo(const char * filepath)
-{
-       struct stat f_stat;
-       int ret = 0;
-       errno   = 0;
-       ret = stat(filepath, &f_stat);
-       if (ret == 0) {
-		/* if the file exists, make sure it's a fifo */
-               T_ASSERT_TRUE(S_ISFIFO(f_stat.st_mode), "ensure %s is a fifo", filepath);
-       } else if (errno == ENOENT) {
-               ret = mkfifo(filepath, 0777);
-               T_ASSERT_POSIX_ZERO(ret, "creating a fifo at path %s", filepath);
-       } else {
-               T_ASSERT_FAIL("stat operation on %s", filepath);
-       }
-}
-
-void
-kevent_one_shot(int kq, int fd, int filter)
-{
-       int retval             = 0;
-       struct timespec t_zero = {0, 0};
-       struct kevent kev[1];
-
-       T_QUIET;
-       T_ASSERT_GE(kq, 0, "ensure kq is valid");
-       T_LOG("kevent doing ONESHOT %s", filter == EVFILT_READ ? "read" : "write");
-
-       EV_SET(kev, fd, filter, EV_ADD | EV_ONESHOT, 0, 0, NULL);
-       retval = kevent(kq, kev, 1, NULL, 0, &t_zero);
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(retval, "ONESHOT kevent for fd %d, filter %d", fd, filter);
-}
-
-T_DECL(kqueue_fifo_18776047, "Tests kqueue, kevent for watching a fifo.", T_META_LTEPHASE(LTE_POSTINIT))
-{
-       struct kevent kev[1];
-       int read_fd, write_fd, kq;
-       int retval         = 0;
-       int iter           = 0;
-       const char * fpath = TMP_FILE_PATH;
-       T_SETUPBEGIN;
-       create_fifo(fpath);
-
-       kq = kqueue();
-       T_ASSERT_GE(kq, 0, "create a kqueue");
-
-       read_fd = open(fpath, O_RDONLY | O_APPEND | O_NONBLOCK);
-       T_ASSERT_POSIX_SUCCESS(read_fd, "opening read fd on fifo.");
-
-       write_fd = open(fpath, O_WRONLY | O_APPEND | O_NONBLOCK);
-       T_ASSERT_POSIX_SUCCESS(write_fd, "opening write fd on fifo.");
-
-       T_SETUPEND;
-
-       kevent_one_shot(kq, write_fd, EVFILT_WRITE);
-       kevent_one_shot(kq, read_fd, EVFILT_READ);
-
-       while (iter++ < TOTAL_ITERATIONS) {
-               retval = kevent(kq, NULL, 0, kev, 1, NULL);
-               T_QUIET;
-               T_ASSERT_GE(retval, 0, "kevent on kq %d", kq);
-
-               if (kev[0].ident == (uintptr_t)write_fd) {
-                       retval = write_some_data(write_fd);
-                       T_LOG("writer ready iter: %d wrote %d bytes", iter, retval);
-                       kevent_one_shot(kq, write_fd, EVFILT_WRITE);
-               } else if (kev[0].ident == (uintptr_t)read_fd) {
-                       retval = read_data(read_fd);
-                       T_LOG("reader ready iter: %d read %d bytes", iter, retval);
-                       kevent_one_shot(kq, read_fd, EVFILT_READ);
-               }
-       }
-       T_PASS("kqueue_fifo_18776047 PASSED");
-}
diff --git a/tools/tests/darwintests/kqueue_file_tests.c b/tools/tests/darwintests/kqueue_file_tests.c
deleted file mode 100644 (file)
index dcd2c47..0000000
+++ /dev/null
@@ -1,1837 +0,0 @@
-#include <string.h>
-#include <errno.h>
-#include <pwd.h>
-#include <stdarg.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <poll.h>
-#include <sys/types.h>
-#include <sys/event.h>
-#include <sys/time.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/param.h>
-#include <sys/mount.h>
-#include <sys/xattr.h>
-#include <sys/file.h>
-
-#include <TargetConditionals.h>
-#include <darwintest.h>
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.kevent")
-               );
-
-#define PDIR   "/tmp"
-#define DIR1   PDIR "/dir1"
-#define DOTDOT ".."
-#define DIR2   PDIR "/dir2"
-#define FILE1  PDIR "/file1"
-#define FILE2  PDIR "/file2"
-
-#define KEY    "somekey"
-#define VAL    "someval"
-
-#define NOSLEEP                0
-#define SLEEP          1
-#define NO_EVENT       0
-#define YES_EVENT      1
-
-
-#define OUTPUT_LEVEL   0
-#define RESULT_LEVEL   3
-
-#define TEST_STRING    "Some text!!! Yes indeed, some of that very structure which has passed on man's knowledge for generations."
-#define HELLO_WORLD    "Hello, World!"
-#define USLEEP_TIME    5000
-#define WAIT_TIME      (4l)
-#define LENGTHEN_SIZE  500
-#define FIFO_SPACE     8192    /* FIFOS have 8K of buffer space */
-
-/*
- * These two variables provide non-local storage for the return values of
- * the functions that are run via pthread_create().
- */
-int thread_status;
-int fifo_read_fd;
-
-/*
- * Types of actions for setup, cleanup, and execution of tests
- */
-typedef enum {CREAT, MKDIR, READ, WRITE, WRITEFD, FILLFD, UNLINK, LSKEE, RMDIR, MKFIFO, LENGTHEN, TRUNC,
-       SYMLINK, CHMOD, CHOWN, EXCHANGEDATA, RENAME, LSEEK, OPEN, MMAP, NOTHING,
-       SETXATTR, UTIMES, STAT, HARDLINK, REVOKE, FUNLOCK} action_id_t;
-
-/* 
- * Directs an action as mentioned above
- */
-typedef struct _action {
-       int             act_dosleep;
-       action_id_t     act_id;
-       void            *act_args[5];
-       int             act_fd;
-} action_t;
-
-/*
- * A test case.  Specifies setup, an event to look for, an action to take to
- * cause (or not cause) that event, and cleanup.
- */
-typedef struct _test {
-       char *t_testname;
-       
-       /* Is this test an expected failure? */
-       int t_known_failure;
-
-       /* Is this test behaving non-deterministically? */
-       int t_nondeterministic;
-
-       /* Test kevent() or poll() */
-       int     t_is_poll_test; 
-       
-       /* Actions for setting up test */
-       int      t_n_prep_actions;
-       action_t t_prep_actions[5];
-       
-       /* Actions for cleaning up test */
-       int      t_n_cleanup_actions;
-       action_t t_cleanup_actions[5];
-       
-	/* Action for the helper thread to take while we wait */
-       action_t t_helpthreadact;
-       
-       /* File to look for event on */
-       char     *t_watchfile;  /* set event ident IN TEST (can't know fd beforehand)*/
-       int      t_file_is_fifo;/* FIFOs are handled in a special manner */
-       
-       /* Different parameters for poll() vs kevent() */
-       union { 
-               struct kevent   tu_kev;
-               short           tu_pollevents;
-       } t_union;
-       
-       /* Do we expect results? */
-       int      t_want_event;
-       
-       /* Not always used--how much data should we find (EVFILT_{READ,WRITE}) */
-       int      t_nbytes;
-       
-       /* Hacks for FILT_READ and pipes */
-       int      t_read_to_end_first;   /* Consume all data in file before waiting for event */
-       int      t_write_some_data;     /* Write some data to file before waiting for event (FIFO hack) */
-       int      t_extra_sleep_hack;    /* Sleep before waiting, to let a fifo fill up with data */
-} test_t;
-
-char *
-get_action_name(action_id_t a)
-{
-       switch (a) {
-       case CREAT:
-               return "CREAT";
-       case MKDIR:
-               return "MKDIR";
-       case READ:
-               return "READ";
-       case WRITE:
-               return "WRITE";
-       case WRITEFD:
-               return "WRITEFD";
-       case FILLFD:
-               return "FILLFD";
-       case UNLINK:
-               return "UNLINK";
-       case LSKEE:
-               return "LSKEE";
-       case RMDIR:
-               return "RMDIR";
-       case MKFIFO:
-               return "MKFIFO";
-       case LENGTHEN:
-               return "LENGTHEN";
-       case TRUNC:
-               return "TRUNC";
-       case SYMLINK:
-               return "SYMLINK";
-       case CHMOD:
-               return "CHMOD";
-       case CHOWN:
-               return "CHOWN";
-       case EXCHANGEDATA:
-               return "EXCHANGEDATA";
-       case RENAME:
-               return "RENAME";
-       case LSEEK:
-               return "LSEEK";
-       case OPEN:
-               return "OPEN";
-       case MMAP:
-               return "MMAP";
-       case NOTHING:
-               return "NOTHING";
-       case SETXATTR:
-               return "SETXATTR";
-       case UTIMES:
-               return "UTIMES";
-       case STAT:
-               return "STAT";
-       case HARDLINK:
-               return "HARDLINK";
-       case REVOKE:
-               return "REVOKE";
-       case FUNLOCK:
-               return "FUNLOCK";
-       }
-       return "Unknown";
-}
-/*
- * Initialize an action struct.  Whether to sleep, what action to take,
- * and arguments for that action.
- */
-void 
-init_action(action_t *act, int sleep, action_id_t call, int nargs, ...) 
-{
-       int i;
-       va_list ap;
-       va_start(ap, nargs);
-       act->act_dosleep = sleep;
-       act->act_id = call;
-       
-       for (i = 0; i < nargs; i++)
-       {
-               act->act_args[i] = va_arg(ap, void*);
-       }
-       
-       va_end(ap);
-       
-}
-
-/*
- * Opening a fifo is complicated: need to open both sides at once 
- */
-void *
-open_fifo_readside(void *arg) 
-{
-       if ((fifo_read_fd = open((char*)arg, O_RDONLY)) == -1) {
-               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", arg, errno, strerror(errno));
-       }
-       return (&fifo_read_fd);
-}
-
-/*
- * Open a fifo, setting read and write descriptors.  Return 0 for success, -1 for failure.
- * Only set FD args upon success; they will be unmodified on failure.
- */
-int 
-open_fifo(const char *path, int *readfd, int *writefd) 
-{
-       pthread_t thread;
-       int waitres;
-       int res;
-       int *tmpreadfd, tmpwritefd;
-       
-       fifo_read_fd = -1;
-       res = pthread_create(&thread, 0, open_fifo_readside, (void*)path);
-       if (res == 0) {
-               if ((tmpwritefd = open(path, O_WRONLY)) == -1) {
-                       T_LOG("open(%s, O_WRONLY) failed: %d (%s)\n", path, errno, strerror(errno));
-                       return (-1);
-               }
-               waitres = pthread_join(thread, (void**) &tmpreadfd);
-               
-               fcntl(tmpwritefd, F_SETFL, O_WRONLY | O_NONBLOCK);
-               
-               if ((waitres == 0) && (tmpwritefd >= 0) && (*tmpreadfd >= 0)) {
-                       *readfd = *tmpreadfd;
-                       *writefd = tmpwritefd;
-               } else {
-                       res = -1;       
-               }
-       }
-       
-       return res;
-}
-
-/*
- * Just concatenate a directory and a filename, sticking a "/" betwixt them
- */
-void 
-makepath(char *buf, const char *dir, const char *file) 
-{
-       strcpy(buf, dir);
-       strcat(buf, "/");
-       strcat(buf, file);
-}
-
-
-/* Execute a prep, cleanup, or test action; specific tricky notes below.
- *
- * CREAT:      create the file and give it length 1
- * READ:       try to read one char
- * WRITE:      try to write TEST_STRING to file
- * LENGTHEN:   make longer by LENGTHEN_SIZE
- * MMAP:       mmap first 20 bytes of file, write HELLO_WORLD in
- * SETXATTR:   set the KEY attribute to value VAL
- * WRITEFD:    instead of opening fresh, take an FD in the action struct (FIFOs)
- * FILLFD:     write to a file until no more can be written; used to fill FIFOs.
- *
- * * Several of these have hard-coded sizes.
- */
-void* 
-execute_action(void *actionptr) 
-{
-       action_t *act = (action_t*)actionptr;
-       void **args = act->act_args;
-       char c;
-       int res = -1, tmpfd, tmpfd2;
-       static int lastfd;
-       void *addr;
-       struct timeval tv;
-       struct stat sstat;
-       
-       T_LOG("Beginning action of type %d: %s\n", act->act_id, get_action_name(act->act_id));
-       
-       /* Let other thread get into kevent() sleep */
-       if(SLEEP == act->act_dosleep) {
-               usleep(USLEEP_TIME);
-       }
-       switch(act->act_id) {
-               case NOTHING:
-                       res = 0;
-                       break;
-               case CREAT:
-                       if ((tmpfd = creat((char*)args[0], 0755)) == -1) {
-                               T_LOG("creat() failed on \"%s\": %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       ftruncate(tmpfd, 1); /* So that mmap() doesn't fool us */
-                       close(tmpfd);
-                       res = 0;
-                       break;
-               case MKDIR:
-                       res = mkdir((char*)args[0], 0755);
-                       break;
-               case READ:
-                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
-                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       res = read(tmpfd, &c, 1);
-                       res = (res == 1 ? 0 : -1);
-                       close(tmpfd);
-                       break;
-               case WRITE:
-                       if ((tmpfd = open((char*)args[0], O_RDWR)) == -1) {
-                               T_LOG("open(%s, O_RDWR) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       res = write(tmpfd, TEST_STRING, strlen(TEST_STRING));
-                       if (res == strlen(TEST_STRING)) {
-                               res = 0;
-                       } else {
-                               res = -1;
-                       }
-                       close(tmpfd);
-                       break;
-               case WRITEFD:
-                       res = write((int)act->act_fd, TEST_STRING, strlen(TEST_STRING));
-                       if (res == strlen(TEST_STRING)) {
-                               res = 0;
-                       } else {
-                               res = -1;
-                       }
-                       break;
-               case FILLFD:
-                       while (write((int)act->act_fd, "a", 1) > 0);
-                       res = 0;
-                       break;
-               case UNLINK:
-                       res = unlink((char*)args[0]);
-                       break;
-               case LSEEK:
-                       res = lseek((int)act->act_fd, (int)args[0], SEEK_SET);
-                       res = (res == (int)args[0] ? 0 : -1);
-                       break;
-               case RMDIR:
-                       res = rmdir((char*)args[0]);
-                       break;
-               case MKFIFO:
-                       res = mkfifo((char*)args[0], 0755);
-                       break;
-               case LENGTHEN:
-                       res = truncate((char*)args[0], LENGTHEN_SIZE);
-                       break;
-               case TRUNC:
-                       res = truncate((char*)args[0], 0);
-                       break;
-               case SYMLINK:
-                       res = symlink((char*)args[0], (char*)args[1]);
-                       break;
-               case CHMOD:
-                       res = chmod((char*)args[0], (int)args[1]);
-                       break;
-               case CHOWN:
-                       /* path, uid, gid */
-                       res = chown((char*)args[0], (int) args[1], (int) args[2]);
-                       break;
-               case EXCHANGEDATA:
-                       res = exchangedata((char*)args[0], (char*)args[1], 0);
-                       break;
-               case RENAME:
-                       res = rename((char*)args[0], (char*)args[1]);
-                       break;
-               case OPEN:
-                       if ((tmpfd = open((char*)args[0], O_RDONLY | O_CREAT)) == -1) {
-                               T_LOG("open(%s, O_RDONLY | O_CREAT) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       res = close(tmpfd);
-                       break;
-               case MMAP:
-                       /* It had best already exist with nonzero size */
-                       if ((tmpfd = open((char*)args[0], O_RDWR)) == -1) {
-                               T_LOG("open(%s, O_RDWR) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       addr = mmap(0, 20, PROT_WRITE | PROT_READ, MAP_FILE | MAP_SHARED, tmpfd, 0);
-                       if (addr != ((void*)-1)) {
-                               res = 0;
-                               if ((int)args[1]) {
-                                       strcpy((char*)addr, HELLO_WORLD);
-                                       msync(addr, 20, MS_SYNC);
-                               }
-                       }
-                       close(tmpfd);
-                       munmap(addr, 20);
-                       break;
-               case SETXATTR:
-                       res = setxattr((char*)args[0], KEY, (void*)VAL, strlen(VAL),
-                                                  0, 0);
-                       break;
-               case UTIMES:
-                       tv.tv_sec = time(NULL);
-                       tv.tv_usec = 0;
-                       res = utimes((char*)args[0], &tv); 
-                       break;
-               case STAT:
-                       res = lstat((char*)args[0], &sstat);
-                       break;
-               case HARDLINK:
-                       res = link((char*)args[0], (char*)args[1]);
-                       break;
-               case REVOKE:
-                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
-                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }
-                       res = revoke((char*)args[0]);
-                       close(tmpfd);
-                       break;
-               case FUNLOCK:
-                       if ((tmpfd = open((char*)args[0], O_RDONLY)) == -1) {
-                               T_LOG("open(%s, O_RDONLY) failed: %d (%s)\n", args[0], errno, strerror(errno));
-                               res = -1;
-                               break;
-                       }                               
-                       if ((res = flock(tmpfd, LOCK_EX)) == -1) {
-                               T_LOG("flock() LOCK_EX failed: %d (%s)\n", errno, strerror(errno));
-                               close(tmpfd);
-                               break;
-                       }
-                       if ((res = flock(tmpfd, LOCK_UN)) == -1) {
-                               T_LOG("flock() LOCK_UN failed: %d (%s)\n", errno, strerror(errno));
-                               close(tmpfd);
-                               break;
-                       }
-                       close(tmpfd);
-                       break;
-               default:
-                       res = -1;
-                       break;
-       }
-
-       thread_status = res;
-       return (&thread_status);
-}
-
-/*
- * Read until the end of a file, for EVFILT_READ purposes (considers file position)
- */
-void 
-read_to_end(int fd) 
-{
-       char buf[50];
-       while (read(fd, buf, sizeof(buf)) > 0);
-}
-
-/*
- * Helper for setup and cleanup; just execute every action in an array
- * of actions.  "failout" parameter indicates whether to stop if one fails.
- */
-int
-execute_action_list(action_t *actions, int nactions, int failout) 
-{
-       int i, res;
-       for (i = 0, res = 0; (0 == res || (!failout)) && (i < nactions); i++) {
-               T_LOG("Starting prep action %d\n", i);
-               res = *((int *) execute_action(&(actions[i])));
-               if(res != 0) {
-                       T_LOG("Action list failed on step %d. res = %d errno = %d (%s)\n", i, res,
-                               errno, strerror(errno));
-               } else {
-                       T_LOG("Action list work succeeded on step %d.\n", i);
-               }
-       }
-
-       return res;
-}
-
-/*
- * Execute a full test, return success value.
- */
-int
-execute_test(test_t *test)
-{
-       int i, kqfd, filefd = -1, res2, res, cnt, writefd = -1;
-       int retval = -1;
-       pthread_t thr;
-       struct kevent evlist;
-       struct timespec ts = {WAIT_TIME, 0l};
-       int *status;
-
-       memset(&evlist, 0, sizeof(evlist));
-       
-       T_LOG("[BEGIN] %s\n", test->t_testname);
-
-       T_LOG(test->t_want_event ? "Expecting an event.\n" : "Not expecting events.\n");
-       
-       res = execute_action_list(test->t_prep_actions, test->t_n_prep_actions, 1);
-       
-       /* If prep succeeded */
-       if (0 == res) {
-               /* Create kqueue for kqueue tests*/
-               if (!test->t_is_poll_test) {
-                       if ((kqfd = kqueue()) == -1) {
-                               T_LOG("kqueue() failed: %d (%s)\n", errno, strerror(errno));
-                       }
-               }
-               
-               if ((test->t_is_poll_test) || kqfd >= 0) {
-                       
-                       /* Open the file we're to monitor.  Fifos get special handling */
-                       if (test->t_file_is_fifo) {
-                               filefd = -1;
-                               open_fifo(test->t_watchfile, &filefd, &writefd);
-                       } else {
-                               if ((filefd = open(test->t_watchfile, O_RDONLY | O_SYMLINK)) == -1) {
-                                       T_LOG("open() of watchfile %s failed: %d (%s)\n", test->t_watchfile,
-                                             errno, strerror(errno));
-                               }
-                       }
-                       
-                       if (filefd >= 0) {
-                               T_LOG("Opened file to monitor.\n");
-                               
-                               /* 
-                                * Fill in the fd to monitor once you know it 
-                                * If it's a fifo test, then the helper is definitely going to want the write end.
-                                */
-                               test->t_helpthreadact.act_fd = (writefd >= 0 ? writefd : filefd);
-                               
-                               if (test->t_read_to_end_first) {
-                                       read_to_end(filefd);
-                               } else if (test->t_write_some_data) {
-                                       action_t dowr;
-                                       init_action(&dowr, NOSLEEP, WRITEFD, 0);
-                                       dowr.act_fd = writefd;
-                                       (void)execute_action(&dowr);
-                               }
-                               
-                               /* Helper modifies the file that we're listening on (sleeps first, in general) */
-                               thread_status = 0;
-                               res = pthread_create(&thr, NULL, execute_action, (void*) &test->t_helpthreadact);
-                               if (0 == res) {
-                                       T_LOG("Created helper thread.\n");
-                                       
-                                       /* This is ugly business to hack on filling up a FIFO */
-                                       if (test->t_extra_sleep_hack) {
-                                               usleep(USLEEP_TIME);
-                                       }
-                                       
-                                       if (test->t_is_poll_test) {
-                                               struct pollfd pl;
-                                               pl.fd = filefd;
-                                               pl.events = test->t_union.tu_pollevents;
-                                               cnt = poll(&pl, 1, WAIT_TIME);
-                                               T_LOG("Finished poll() call.\n");
-                                               if ((cnt < 0)) {
-                                                       T_LOG("error is in errno, %s\n", strerror(errno));
-                                                       res = cnt;
-                                               }
-                                       } else {
-                                               test->t_union.tu_kev.ident = filefd; 
-                                               cnt = kevent(kqfd, &test->t_union.tu_kev, 1, &evlist, 1,  &ts);
-                                               T_LOG("Finished kevent() call.\n");
-                                               
-                                               if ((cnt < 0) || (evlist.flags & EV_ERROR))  {
-                                                       T_LOG("kevent() call failed.\n");
-                                                       if (cnt < 0) {
-                                                               T_LOG("error is in errno, %s\n", strerror(errno));
-                                                       } else {
-                                                               T_LOG("error is in data, %s\n", strerror(evlist.data));
-                                                       }
-                                                       res = cnt;
-                                               }
-                                       }
-                                       
-                                       /* Success only if you've succeeded to this point AND joined AND other thread is happy*/
-                                       status = NULL;
-                                       res2 = pthread_join(thr, (void **)&status);
-                                       if (res2 != 0) {
-                                               T_LOG("Couldn't join helper thread: %d (%s).\n", res2,
-                                                       strerror(res2));
-                                       } else if (*status) {
-                                               T_LOG("Helper action had result %d\n", *status);
-                                       }
-                                       res = ((res == 0) && (res2 == 0) && (*status == 0)) ? 0 : -1;
-                               } else {
-                                       T_LOG("Couldn't start thread: %d (%s).\n", res, strerror(res));
-                               }
-                               
-                               close(filefd);
-                               if (test->t_file_is_fifo) {
-                                       close(writefd);
-                               }
-                       } else {
-				T_LOG("Couldn't open test file %s to monitor: %d (%s)\n", test->t_watchfile, errno, strerror(errno));
-                               res = -1;
-                       }
-                       if (!test->t_is_poll_test) {
-                               close(kqfd);
-                       }
-               } else {
-                       T_LOG("Couldn't open kqueue.\n");
-                       res = -1;
-               }
-       }
-       
-       /* Cleanup work */
-       execute_action_list(test->t_cleanup_actions, test->t_n_cleanup_actions, 0);
-       
-       /* Success if nothing failed and we either received or did not receive event,
-        * as expected 
-        */
-       if (0 == res) {
-               T_LOG(cnt > 0 ? "Got an event.\n" : "Did not get an event.\n");
-               if (((cnt > 0) && (test->t_want_event)) || ((cnt == 0) && (!test->t_want_event))) {
-                       if ((!test->t_is_poll_test) && (test->t_union.tu_kev.filter == EVFILT_READ || test->t_union.tu_kev.filter == EVFILT_WRITE)
-                               && (test->t_nbytes) && (test->t_nbytes != evlist.data)) {
-                               T_LOG("Read wrong number of bytes available.  Wanted %d, got %d\n", test->t_nbytes, evlist.data);
-                               retval = -1;
-                       } else {
-                               retval = 0;
-                       }
-                       
-               } else {
-                       T_LOG("Got unexpected event or lack thereof.\n");
-                       retval = -1;
-               }
-       } else {
-               T_LOG("Failed to execute test. res = %d\n", res);
-               retval = -1;
-       }
-
-       if (test->t_nondeterministic) {
-               T_LOG("XXX non-deterministic test result = %d (%s)\n", retval,
-                       (retval == 0) ? "pass" : "fail");
-               T_MAYFAIL;
-       } else {
-               if (test->t_known_failure) {
-                       // Signal to harness that this test is expected to fail.
-                       T_EXPECTFAIL;
-               }
-       }
-
-       if (retval == 0) {
-               T_PASS("%s", test->t_testname);
-       } else {
-               T_FAIL("%s", test->t_testname);
-       }
-
-       T_LOG("Test %s done with result %d.\n", test->t_testname, retval);
-       return (retval);
-}
-
-
-
-void
-init_test_common(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want, int ispoll)
-{
-       memset(tst, 0, sizeof(test_t));
-       tst->t_testname = testname;
-       tst->t_known_failure = 0;
-       tst->t_nondeterministic = 0;
-       tst->t_watchfile = watchfile;
-       tst->t_n_prep_actions = nprep;
-       tst->t_n_cleanup_actions = nclean;
-       tst->t_want_event = (want > 0);
-       
-       if (ispoll) {
-               tst->t_is_poll_test = 1;
-               tst->t_union.tu_pollevents = (short)event;
-       } else {
-               /* Can do this because filter is negative, notes are positive */
-               if (event == EVFILT_READ || event == EVFILT_WRITE) {
-                       EV_SET(&tst->t_union.tu_kev, 0, event, EV_ADD | EV_ENABLE, 0, 0, NULL);
-                       tst->t_nbytes = want;
-               } else {
-                       EV_SET(&tst->t_union.tu_kev, 0, EVFILT_VNODE, EV_ADD | EV_ENABLE, event, 0, NULL);
-               }
-       }
-}
-
-/*
- * Initialize a test case, not including its actions.  Meaning: a name for it, what filename to watch,
- * counts of prep and cleanup actions, what event to watch for, and whether you want an event/how many bytes read.
- *
- * "want" does double duty as whether you want an event and how many bytes you might want to read
- * "event" is either an event flag (e.g. NOTE_WRITE) or EVFILT_READ
- */    
-void 
-init_test(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want) 
-{
-       init_test_common(tst, testname, watchfile, nprep, nclean, event, want, 0);
-}
-
-/*
- * Same as above, but for a poll() test
- */
-void
-init_poll_test(test_t *tst, char *testname, char *watchfile, int nprep, int nclean, int event, int want) 
-{
-       init_test_common(tst, testname, watchfile, nprep, nclean, event, want, 1);
-}
-
-void 
-run_note_delete_tests() 
-{
-       test_t test;
-       
-       init_test(&test, "1.1.2: unlink a file", FILE1, 1, 0, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.1.3: rmdir a dir", DIR1, 1, 0, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.1.4: rename one file over another", FILE2, 2, 1, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.1.5: rename one dir over another", DIR2, 2, 1, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-       execute_test(&test);
-       
-       /* Do FIFO stuff here */
-       init_test(&test, "1.1.6: make a fifo, unlink it", FILE1, 1, 0, NOTE_DELETE, YES_EVENT);
-       test.t_file_is_fifo = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-       
-       init_test(&test, "1.1.7: rename a file over a fifo", FILE1, 2, 1, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       test.t_file_is_fifo = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE2, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.1.8: unlink a symlink to a file", FILE2, 2, 1, NOTE_DELETE, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       /* ================= */
-       
-       init_test(&test, "1.2.1: Straight-up rename file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.2.2: Straight-up rename dir", DIR1, 1, 1, NOTE_DELETE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.2.3: Null action on file", FILE1, 1, 1, NOTE_DELETE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 2, NULL, NULL); /* The null action */
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.2.4: Rename one file over another: watch the file that lives", FILE1, 2, 1, NOTE_DELETE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "1.2.5: Rename one dir over another, watch the dir that lives", DIR1, 2, 1, NOTE_DELETE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-}
-
-static bool
-path_on_apfs(const char *path)
-{
-       struct statfs sfs = {};
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(path, &sfs), NULL);
-       return (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) == 0);
-}
-
-void 
-run_note_write_tests()
-{
-       char pathbuf[50];
-       char otherpathbuf[50];
-       
-       test_t test;
-       
-       init_test(&test, "2.1.1: Straight-up write to a file", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.2: creat() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.3: open() file inside a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.4: unlink a file from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       makepath(otherpathbuf, DIR1, FILE2);
-       init_test(&test, "2.1.5: rename a file in a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.6: rename a file to outside of a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.7: rename a file into a dir", DIR1, 2, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.9: unlink a fifo from a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKFIFO, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.10: make symlink in a dir", DIR1, 1, 2, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.1.12: write to a FIFO", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, WRITEFD, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "2.1.13: delete a symlink in a dir", DIR1, 2, 1, NOTE_WRITE, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-
-       /* exchangedata is not supported on APFS volumes */
-       if (!path_on_apfs(PDIR)) {
-               /* This actually should not generate an event, though it's in this section */
-               makepath(pathbuf, DIR1, FILE1);
-               makepath(otherpathbuf, DIR1, FILE2);
-               init_test(&test, "2.1.14: exchangedata two files in a dir", DIR1, 3, 3, NOTE_WRITE, NO_EVENT);
-               test.t_known_failure = 1;
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-               init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-               init_action(&(test.t_prep_actions[2]), NOSLEEP, CREAT, 2, (void*)otherpathbuf, (void*)NULL);
-               init_action(&test.t_helpthreadact, SLEEP, EXCHANGEDATA, 2, (void*)pathbuf, (void*)otherpathbuf);
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-               init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, (void*)NULL);
-               init_action(&test.t_cleanup_actions[2], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-               execute_test(&test);
-       }
-
-       init_test(&test, "2.1.15: Change a file with mmap()", FILE1, 1, 1, NOTE_WRITE, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, MMAP, 2, (void*)FILE1, (void*)1); /* 1 -> "modify it"*/
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       /*================= no-event tests ==================*/
-       init_test(&test, "2.2.1: just open and close existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.2: read from existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, READ, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.3: rename existing file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.4: just open and close dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       /* There are no tests 2.2.5 or 2.2.6 */
-       
-       init_test(&test, "2.2.7: rename a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.8: rename a fifo", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       test.t_file_is_fifo = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.9: unlink a fifo", FILE1, 1, 0, NOTE_WRITE, NO_EVENT);
-       test.t_file_is_fifo = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-       
-       init_test(&test, "2.2.10: chmod a file", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       struct passwd *pwd = getpwnam("local");
-
-       if (pwd != NULL) {
-               init_test(&test, "2.2.11: chown a file", FILE1, 2, 1, NOTE_WRITE, NO_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid());
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-               execute_test(&test);
-       }
-       
-       init_test(&test, "2.2.12: chmod a dir", DIR1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       if (pwd != NULL) {
-               init_test(&test, "2.2.13: chown a dir", DIR1, 2, 1, NOTE_WRITE, NO_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid());
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-               execute_test(&test);
-       }
-       
-       T_LOG("MMAP will never give a notification on HFS.\n");
-       init_test(&test, "2.1.14: mmap() a file but do not change it", FILE1, 1, 1, NOTE_WRITE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, MMAP, 2, (void*)FILE1, (void*)0); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-}
-
-void
-run_note_extend_tests()
-{
-       test_t test;
-       char pathbuf[50];
-       
-       T_LOG("THESE TESTS MAY FAIL ON HFS\n");
-       
-       init_test(&test, "3.1.1: write beyond the end of a file", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       /*
-        * We won't concern ourselves with lengthening directories: commenting these out  
-        *
-        
-        makepath(pathbuf, DIR1, FILE1);
-        init_test(&test, "3.1.2: add a file to a directory with creat()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
-        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-        init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
-        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-        execute_test(&test);
-        
-        makepath(pathbuf, DIR1, FILE1);
-        init_test(&test, "3.1.3: add a file to a directory with open()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
-        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-        init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
-        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-        execute_test(&test);
-        
-        makepath(pathbuf, DIR1, FILE1);
-        init_test(&test, "3.1.4: add a file to a directory with rename()", DIR1, 2, 2, NOTE_EXTEND, YES_EVENT);
-        init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-        init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-        init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf); 
-        init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-        execute_test(&test);
-        */
-       
-       /* 3.1.5: a placeholder for a potential kernel test */
-       /*
-        makepath(pathbuf, DIR1, DIR2);
-        init_test(&test, "3.1.6: add a file to a directory with mkdir()", DIR1, 1, 2, NOTE_EXTEND, YES_EVENT);
-        init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-        init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); 
-        init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
-        init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-        execute_test(&test);
-        */
-       init_test(&test, "3.1.7: lengthen a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 2, FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       
-       /** ========== NO EVENT SECTION ============== **/
-       init_test(&test, "3.2.1: setxattr() a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "3.2.2: chmod a file", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)FILE1, (void*)0700);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       struct passwd *pwd = getpwnam("local");
-       if (pwd != NULL) {
-               init_test(&test, "3.2.3: chown a file", FILE1, 2, 1, NOTE_EXTEND, NO_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)FILE1, (void*)getuid(), (void*)getgid());
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-               execute_test(&test);
-       } else {
-               T_LOG("Couldn't getpwnam for user \"local\"\n");
-       }
-       
-       init_test(&test, "3.2.4: chmod a dir", DIR1, 1, 1, NOTE_EXTEND, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       if (pwd != NULL) {
-               init_test(&test, "3.2.5: chown a dir", DIR1, 2, 1, NOTE_EXTEND, NO_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-               init_action(&test.t_prep_actions[1], NOSLEEP, CHOWN, 3, (void*)DIR1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, (void*)DIR1, (void*)getuid(), (void*)getgid());
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-               execute_test(&test);
-       }
-       
-       init_test(&test, "3.2.6: TRUNC a file with truncate()", FILE1, 1, 1, NOTE_EXTEND, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, TRUNC, 2, FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-}
-
-void
-run_note_attrib_tests()
-{
-       test_t test;
-       char pathbuf[50];
-       
-       init_test(&test, "4.1.1: chmod a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CHMOD, 2, FILE1, (void*)0700); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       struct passwd *pwd = getpwnam("local");
-       if (pwd != NULL) {
-               init_test(&test, "4.1.2: chown a file", FILE1, 2, 1, NOTE_ATTRIB, YES_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-               init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)FILE1, (void*)pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, FILE1, (void*)getuid(), (void*)pwd->pw_gid);
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-               execute_test(&test);
-       }
-
-       init_test(&test, "4.1.3: chmod a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_helpthreadact), SLEEP, CHMOD, 2, (void*)DIR1, (void*)0700);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       if (pwd != NULL) {
-               init_test(&test, "4.1.4: chown a dir", DIR1, 2, 1, NOTE_ATTRIB, YES_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-               init_action(&(test.t_prep_actions[1]), NOSLEEP, CHOWN, 3, (void*)DIR1, (void*) pwd->pw_uid, (void*)pwd->pw_gid);
-               init_action(&test.t_helpthreadact, SLEEP, CHOWN, 3, DIR1, (void*)getuid(), (void*)getgid());
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-               execute_test(&test);
-       }
-       
-       init_test(&test, "4.1.5: setxattr on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.1.6: setxattr on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SETXATTR, 2, (void*)DIR1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-
-       /* exchangedata is not supported on APFS volumes */
-       if (!path_on_apfs(PDIR)) {
-               init_test(&test, "4.1.7: exchangedata", FILE1, 2, 2, NOTE_ATTRIB, YES_EVENT);
-               init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-               init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-               init_action(&test.t_helpthreadact, SLEEP, EXCHANGEDATA, 2, (void*)FILE1, (void*)FILE2); 
-               init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-               init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
-               execute_test(&test);
-       }
-
-       init_test(&test, "4.1.8: utimes on a file", FILE1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.1.9: utimes on a dir", DIR1, 1, 1, NOTE_ATTRIB, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UTIMES, 2, (void*)DIR1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       
-       /* ====== NO EVENT TESTS ========== */
-       
-       init_test(&test, "4.2.1: rename a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.2: open (do not change) a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, OPEN, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.3: stat a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, STAT, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.4: unlink a file", FILE1, 1, 0, NOTE_ATTRIB, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.5: write to a file", FILE1, 1, 1, NOTE_ATTRIB, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 2, (void*)FILE1, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       T_LOG("EXPECT SPURIOUS NOTE_ATTRIB EVENTS FROM DIRECTORY OPERATIONS on HFS.\n");
-       init_test(&test, "4.2.6: add a file to a directory with creat()", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, FILE1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.7: mkdir in a dir", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, DIR2);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.8: add a symlink to a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, FILE1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.9: rename into a dir()", DIR1, 2, 2, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, FILE1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.10: unlink() file from dir", DIR1, 2, 1, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, FILE1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       init_test(&test, "4.2.11: mkfifo in a directory", DIR1, 1, 2, NOTE_ATTRIB, NO_EVENT);
-       test.t_known_failure = 1;
-       makepath(pathbuf, DIR1, FILE1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, MKFIFO, 1, (void*)pathbuf); 
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       
-}
-
-
-void 
-run_note_link_tests()
-{
-       test_t test;
-       char pathbuf[50];
-       char otherpathbuf[50];
-       
-       T_LOG("HFS DOES NOT HANDLE UNLINK CORRECTLY...\n");
-       init_test(&test, "5.1.1: unlink() a file", FILE1, 1, 0, NOTE_LINK, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, (void*)NULL);
-       execute_test(&test);
-       
-       
-       init_test(&test, "5.1.1.5: link A to B, watch A, remove B", FILE1, 2, 1, NOTE_LINK, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "5.1.2: link() to a file", FILE1, 1, 2, NOTE_LINK, YES_EVENT);
-#if TARGET_OS_WATCH
-       test.t_nondeterministic = 1;
-#endif
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, DIR2);
-       init_test(&test, "5.1.3: make one dir in another", DIR1, 1, 2, NOTE_LINK, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, DIR2);
-       init_test(&test, "5.1.4: rmdir a dir from within another", DIR1, 2, 1, NOTE_LINK, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, DIR2);
-       makepath(otherpathbuf, DIR1, DIR1);
-       init_test(&test, "5.1.5: rename dir A over dir B inside dir C", DIR1, 3, 2, NOTE_LINK, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&(test.t_prep_actions[2]), NOSLEEP, MKDIR, 2, (void*)otherpathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)otherpathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       T_LOG("HFS bypasses hfs_makenode to create in target, so misses knote.\n");
-       makepath(pathbuf, DIR1, DIR2);
-       init_test(&test, "5.1.6: rename one dir into another", DIR1, 2, 2, NOTE_LINK, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR2, (void*)pathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       T_LOG("HFS bypasses hfs_removedir to remove from source, so misses knote.\n");
-       makepath(pathbuf, DIR1, DIR2);
-       init_test(&test, "5.1.7: rename one dir out of another", DIR1, 2, 2, NOTE_LINK, YES_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "5.1.8: rmdir a dir", DIR1, 1, 0, NOTE_LINK, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, (void*)NULL);
-       execute_test(&test);
-       
-       /* ============= NO EVENT SECTION ============== */
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "5.2.1: make a file in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "5.2.2: unlink a file in a dir", DIR1, 2, 1, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       makepath(otherpathbuf, DIR1, FILE2);
-       init_test(&test, "5.2.3: rename a file within a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)pathbuf, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)pathbuf, (void*)otherpathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)otherpathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "5.2.4: rename a file into a dir", DIR1, 2, 2, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)pathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       makepath(pathbuf, DIR1, FILE1);
-       init_test(&test, "5.2.5: make a symlink in a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DOTDOT, (void*)pathbuf);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)pathbuf, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "5.2.6: make a symlink to a dir", DIR1, 1, 2, NOTE_LINK, NO_EVENT);
-       test.t_known_failure = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)DIR1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "5.2.7: make a symlink to a file", FILE1, 1, 2, NOTE_LINK, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, SYMLINK, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-}
-
-void
-run_note_rename_tests() 
-{
-       test_t test;
-       
-       init_test(&test, "6.1.1: rename a file", FILE1, 1, 1, NOTE_RENAME, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.1.2: rename a dir", DIR1, 1, 1, NOTE_RENAME, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.1.3: rename one file over another", FILE1, 2, 1, NOTE_RENAME, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.1.4: rename one dir over another", DIR1, 2, 1, NOTE_RENAME, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-       execute_test(&test);
-       
-       /* ========= NO EVENT SECTION =========== */
-       
-       init_test(&test, "6.2.1: unlink a file", FILE1, 1, 0, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.2: rmdir a dir", DIR1, 1, 0, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.3: link() to a file", FILE1, 1, 2, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, HARDLINK, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       init_action(&test.t_cleanup_actions[1], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.4: rename one file over another: watch deceased", 
-                         FILE2, 2, 1, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, CREAT, 2, (void*)FILE2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.5: rename one dir over another: watch deceased", 
-                         DIR2, 2, 1, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, MKDIR, 2, (void*)DIR2, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.6: rename a file to itself", FILE1, 1, 1, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "6.2.7: rename a dir to itself", DIR1, 1, 1, NOTE_RENAME, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKDIR, 2, (void*)DIR1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)DIR1, (void*)DIR1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, RMDIR, 2, (void*)DIR1, NULL);
-       execute_test(&test);
-}
-
-void 
-run_note_revoke_tests() 
-{
-       test_t test;
-       init_test(&test, "7.1.1: revoke file", FILE1, 1, 1, NOTE_REVOKE, YES_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, REVOKE, 1, (void*)FILE1);
-       init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-       
-       init_test(&test, "7.2.1: delete file", FILE1, 1, 0, NOTE_REVOKE, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-}
-
-
-void
-run_evfilt_read_tests() 
-{
-       test_t test;
-       init_test(&test, "8.1.1: how much data in file of length LENGTHEN_SIZE?", FILE1, 2, 1, EVFILT_READ, LENGTHEN_SIZE);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, LENGTHEN, 2, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "8.1.2: block, then write to file", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING));
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "8.1.3: block, then extend", FILE1, 2, 1, EVFILT_READ, LENGTHEN_SIZE);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, LENGTHEN, 1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "8.1.4: block, then seek to beginning", FILE1, 2, 1, EVFILT_READ, strlen(TEST_STRING));
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
-       test.t_read_to_end_first = 1; /* hack: read to EOF first so we are already at EOF before blocking */
-       init_action(&test.t_helpthreadact, SLEEP, LSEEK, 1, (void*)0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       
-       init_test(&test, "8.1.5: block, then write to fifo", FILE1, 1, 1, EVFILT_READ, strlen(TEST_STRING));
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       /* No result section... */
-       init_test(&test, "8.2.1: just rename", FILE1, 2, 1, EVFILT_READ, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, RENAME, 2, (void*)FILE1, (void*)FILE2);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE2, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "8.2.2: delete file", FILE1, 2, 0, EVFILT_READ, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, TRUNC, 1, (void*)FILE1);
-       init_action(&test.t_helpthreadact, SLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-       
-       init_test(&test, "8.2.3: write to beginning", FILE1, 2, 1, EVFILT_READ, NO_EVENT);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
-       test.t_read_to_end_first = 1; /* hack: read to EOF first so we are already at EOF before blocking */
-       init_action(&test.t_helpthreadact, SLEEP, WRITE, 1, (void*)FILE1);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 1, (void*)FILE1);
-       execute_test(&test);
-       
-       init_test(&test, "8.1.4: block, then seek to current location", FILE1, 2, 1, EVFILT_READ, 0);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, WRITE, 1, (void*)FILE1);
-       test.t_read_to_end_first = 1; /* hack: read to EOF first so we are already at EOF before blocking */
-       init_action(&test.t_helpthreadact, SLEEP, LSEEK, 1, (void*)strlen(TEST_STRING));
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "8.2.5: trying to read from empty fifo", FILE1, 1, 1, EVFILT_READ, 0);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 1, (void*)0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-}
-
-
-
-void*
-read_from_fd(void *arg)
-{
-       char buf[50];
-       int fd = (int) arg;
-       usleep(USLEEP_TIME);
-       return (void*) read(fd, buf, sizeof(buf));
-}
-
-void*
-write_to_fd(void *arg)
-{
-       char buf[50];
-       int fd = (int) arg;
-       usleep(USLEEP_TIME);
-       return (void*) write(fd, buf, sizeof(buf));
-}
-
-/*
- * We don't (in principle) support EVFILT_WRITE for regular vnodes, so these tests only cover FIFOs.
- */
-void 
-run_evfilt_write_tests()
-{
-       
-       test_t test;
-       init_test(&test, "9.1.1: how much space in empty fifo?", FILE1, 1, 1, EVFILT_WRITE, FIFO_SPACE);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "9.1.2: how much space in slightly written fifo?", FILE1, 1, 1, EVFILT_WRITE, FIFO_SPACE - strlen(TEST_STRING));
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       test.t_write_some_data = 1;
-       init_action(&(test.t_helpthreadact), NOSLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_test(&test, "9.2.1: how much space in a full fifo?", FILE1, 1, 1, EVFILT_WRITE, 0);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_nondeterministic = 1;
-       test.t_file_is_fifo = 1;
-       test.t_extra_sleep_hack = 1;
-       init_action(&(test.t_helpthreadact), NOSLEEP, FILLFD, 1, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-}
-
-void
-run_poll_tests()
-{
-       test_t test;
-       init_poll_test(&test, "10.1.1: does poll say I can write a regular file?", FILE1, 1, 1, POLLWRNORM, 1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_poll_test(&test, "10.1.2: does poll say I can write an empty FIFO?", FILE1, 1, 1, POLLWRNORM, 1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_poll_test(&test, "10.1.3: does poll say I can read a nonempty FIFO?", FILE1, 1, 1, POLLRDNORM, 1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       test.t_write_some_data = 1;
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_poll_test(&test, "10.1.4: does poll say I can read a nonempty regular file?", FILE1, 2, 1, POLLRDNORM, 1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
-       init_action(&(test.t_prep_actions[1]), NOSLEEP, LENGTHEN, 1, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_poll_test(&test, "10.1.5: does poll say I can read an empty file?", FILE1, 1, 1, POLLRDNORM, 1);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 1, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       
-       
-       
-       init_poll_test(&test, "10.2.2: does poll say I can read an empty FIFO?", FILE1, 1, 1, POLLRDNORM, 0);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_file_is_fifo = 1;
-       init_action(&test.t_helpthreadact, SLEEP, NOTHING, 0);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       execute_test(&test);
-       
-       init_poll_test(&test, "10.2.3: does poll say I can write a full FIFO?", FILE1, 1, 1, POLLWRNORM, 0);
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, MKFIFO, 1, (void*)FILE1, (void*)NULL);
-       test.t_nondeterministic = 1;
-       test.t_file_is_fifo = 1;
-       test.t_extra_sleep_hack = 1;
-       init_action(&(test.t_helpthreadact), NOSLEEP, FILLFD, 1, (void*)FILE1, (void*)NULL);
-       init_action(&test.t_cleanup_actions[0], NOSLEEP, UNLINK, 2, (void*)FILE1, NULL);
-       test.t_known_failure = 1;
-       execute_test(&test);
-}
-
-void
-run_note_funlock_tests()
-{
-       test_t test;
-       init_test(&test, "11.1.1: unlock file", FILE1, 1, 1, NOTE_FUNLOCK, YES_EVENT);
-       test.t_nondeterministic = 1;
-       init_action(&(test.t_prep_actions[0]), NOSLEEP, CREAT, 2, (void*)FILE1, (void *)NULL);
-       init_action(&test.t_helpthreadact, SLEEP, FUNLOCK, 2, (void*)FILE1, (void *)NULL);
-       init_action(&(test.t_cleanup_actions[0]), NOSLEEP, UNLINK, 2, (void*)FILE1, (void *)NULL);
-       execute_test(&test);
-}
-
-void
-run_all_tests() 
-{
-       run_note_delete_tests();
-       run_note_write_tests();
-       run_note_extend_tests();
-       run_note_attrib_tests();
-       run_note_link_tests();
-       run_note_rename_tests();
-#if 0
-       run_note_revoke_tests(); /* Can no longer revoke a regular file--need an unmount test */
-#endif /* 0 */
-       run_evfilt_read_tests();
-       run_evfilt_write_tests();
-       run_poll_tests();
-       run_note_funlock_tests();
-}
-
-T_DECL(kqueue_file_tests,
-       "Tests assorted kqueue operations for file-related events")
-{
-       char *which = NULL;
-       if (argc > 1) {
-               which = argv[1];
-       }
-       
-       T_SETUPBEGIN;
-       rmdir(DIR1);
-       rmdir(DIR2);
-       T_SETUPEND;
-
-       if ((!which) || (strcmp(which, "all") == 0))
-               run_all_tests();
-       else if (strcmp(which, "delete") == 0) 
-               run_note_delete_tests();
-       else if (strcmp(which, "write") == 0)
-               run_note_write_tests();
-       else if (strcmp(which, "extend") == 0)
-               run_note_extend_tests();
-       else if (strcmp(which, "attrib") == 0)
-               run_note_attrib_tests();
-       else if (strcmp(which, "link") == 0)
-               run_note_link_tests();
-       else if (strcmp(which, "rename") == 0)
-               run_note_rename_tests();
-       else if (strcmp(which, "revoke") == 0)
-               run_note_revoke_tests();
-       else if (strcmp(which, "evfiltread") == 0)
-               run_evfilt_read_tests();
-       else if (strcmp(which, "evfiltwrite") == 0)
-               run_evfilt_write_tests();
-       else if (strcmp(which, "poll") == 0)
-               run_poll_tests();
-       else if (strcmp(which, "funlock") == 0)
-               run_note_funlock_tests();
-       else {
-               fprintf(stderr, "Valid options are:\n\tdelete, write, extend, "
-                               "attrib, link, rename, revoke, evfiltread, "
-                               "evfiltwrite, poll, funlock, all, or <none>\n");
-               exit(1);
-       }
-}
-
diff --git a/tools/tests/darwintests/kqueue_timer_tests.c b/tools/tests/darwintests/kqueue_timer_tests.c
deleted file mode 100644 (file)
index e02deb4..0000000
+++ /dev/null
@@ -1,437 +0,0 @@
-#include <sys/types.h>
-#include <sys/event.h>
-#include <sys/time.h>
-#include <assert.h>
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <mach/mach.h>
-#include <mach/task.h>
-
-#include <TargetConditionals.h>
-#include <darwintest.h>
-
-#ifndef NOTE_MACHTIME
-#define NOTE_MACHTIME   0x00000100
-#endif
-
-static mach_timebase_info_data_t timebase_info;
-
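-/*
- * mach_timebase_info() gives the ratio of mach absolute-time ticks to
- * nanoseconds (ns = ticks * numer / denom); these helpers convert between
- * nanoseconds and ticks in either direction.
- */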
-static uint64_t nanos_to_abs(uint64_t nanos) { return nanos * timebase_info.denom / timebase_info.numer; }
-static uint64_t abs_to_nanos(uint64_t abs)   { return abs * timebase_info.numer / timebase_info.denom; }
-
-static int kq, passed, failed;
-
-static struct timespec failure_timeout = { .tv_sec = 10, .tv_nsec = 0 };
-
-/*
- * Wait for given kevent, which should return in 'expected' usecs.
- */
-static int
-do_simple_kevent(struct kevent64_s *kev, uint64_t expected)
-{
-       int ret;
-       int64_t elapsed_usecs;
-       uint64_t delta_usecs;
-       struct timespec timeout;
-       struct timeval before, after;
-
-       /* time out after 1 sec extra delay */
-       timeout.tv_sec = (expected / USEC_PER_SEC) + 1;
-       timeout.tv_nsec = (expected % USEC_PER_SEC) * 1000;
-
-       T_SETUPBEGIN;
-
-       /* measure time for the kevent */
-       gettimeofday(&before, NULL);
-       ret = kevent64(kq, kev, 1, kev, 1, 0, &timeout);
-       gettimeofday(&after, NULL);
-
-       if (ret < 1 || (kev->flags & EV_ERROR)) {
-               T_LOG("%s() failure: kevent returned %d, error %d\n", __func__, ret,
-                               (ret == -1 ? errno : (int) kev->data));
-               return 0;
-       }
-
-       T_SETUPEND;
-
-       /* did it work? */
-       elapsed_usecs = (after.tv_sec - before.tv_sec) * (int64_t)USEC_PER_SEC +
-               (after.tv_usec - before.tv_usec);
-       delta_usecs = (uint64_t)llabs(elapsed_usecs - ((int64_t)expected));
-
-       /* failure if we're 30% off, or 50 usec late */
-       if (delta_usecs > (30 * expected / 100.0) && delta_usecs > 50) {
-               T_LOG("\tfailure: expected %lld usec, measured %lld usec.\n",
-                               expected, elapsed_usecs);
-               return 0;
-       } else {
-               T_LOG("\tsuccess, measured %lld usec.\n", elapsed_usecs);
-               return 1;
-       }
-}
-
-static void
-test_absolute_kevent(int time, int scale)
-{
-       struct timeval tv;
-       struct kevent64_s kev;
-       uint64_t nowus, expected, timescale = 0;
-       int ret;
-       int64_t deadline;
-
-       gettimeofday(&tv, NULL);
-       nowus = (uint64_t)tv.tv_sec * USEC_PER_SEC + (uint64_t)tv.tv_usec;
-
-       T_SETUPBEGIN;
-
-       switch (scale) {
-       case NOTE_MACHTIME:
-               T_LOG("Testing %d MATUs absolute timer...\n", time);
-               break;
-       case NOTE_SECONDS:
-               T_LOG("Testing %d sec absolute timer...\n", time);
-               timescale = USEC_PER_SEC;
-               break;
-       case NOTE_USECONDS:
-               T_LOG("Testing %d usec absolute timer...\n", time);
-               timescale = 1;
-               break;
-       case 0:
-               T_LOG("Testing %d msec absolute timer...\n", time);
-               timescale = 1000;
-               break;
-       default:
-               T_FAIL("Failure: scale 0x%x not recognized.\n", scale);
-               return;
-       }
-
-       T_SETUPEND;
-
-       if (scale == NOTE_MACHTIME) {
-               expected = abs_to_nanos((uint64_t)time) / NSEC_PER_USEC;
-               deadline = (int64_t)mach_absolute_time() + time;
-       } else {
-               expected = (uint64_t)time * timescale;
-               deadline = (int64_t)(nowus / timescale) + time;
-       }
-
-       /* deadlines in the past should fire immediately */
-       if (time < 0)
-               expected = 0;
-
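-       /*
-        * With NOTE_ABSOLUTE, the timer's data field is a deadline rather
-        * than an interval, in the unit selected by 'scale' (NOTE_SECONDS,
-        * NOTE_USECONDS, NOTE_MACHTIME, or milliseconds when scale is 0).
-        */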
-       EV_SET64(&kev, 1, EVFILT_TIMER, EV_ADD,
-                       NOTE_ABSOLUTE | scale, deadline, 0,0,0);
-       ret = do_simple_kevent(&kev, expected);
-
-       if (ret) {
-               passed++;
-               T_PASS("%s time:%d, scale:0x%x", __func__, time, scale);
-       } else {
-               failed++;
-               T_FAIL("%s time:%d, scale:0x%x", __func__, time, scale);
-       }
-}
-
-static void
-test_oneshot_kevent(int time, int scale)
-{
-       int ret;
-       uint64_t expected = 0;
-       struct kevent64_s kev;
-
-       T_SETUPBEGIN;
-
-       switch (scale) {
-       case NOTE_MACHTIME:
-               T_LOG("Testing %d MATUs interval timer...\n", time);
-               expected = abs_to_nanos((uint64_t)time) / NSEC_PER_USEC;
-               break;
-       case NOTE_SECONDS:
-               T_LOG("Testing %d sec interval timer...\n", time);
-               expected = (uint64_t)time * USEC_PER_SEC;
-               break;
-       case NOTE_USECONDS:
-               T_LOG("Testing %d usec interval timer...\n", time);
-               expected = (uint64_t)time;
-               break;
-       case NOTE_NSECONDS:
-               T_LOG("Testing %d nsec interval timer...\n", time);
-               expected = (uint64_t)time / 1000;
-               break;
-       case 0:
-               T_LOG("Testing %d msec interval timer...\n", time);
-               expected = (uint64_t)time * 1000;
-               break;
-       default:
-               T_FAIL("Failure: scale 0x%x not recognized.\n", scale);
-               return;
-       }
-
-       T_SETUPEND;
-
-       /* deadlines in the past should fire immediately */
-       if (time < 0)
-               expected = 0;
-
-       EV_SET64(&kev, 2, EVFILT_TIMER, EV_ADD | EV_ONESHOT, scale, time,
-                       0, 0, 0);
-       ret = do_simple_kevent(&kev, expected);
-
-       if (ret) {
-               passed++;
-               T_PASS("%s time:%d, scale:0x%x", __func__, time, scale);
-       } else {
-               failed++;
-               T_FAIL("%s time:%d, scale:0x%x", __func__, time, scale);
-       }
-}
-
-/* Test that the timer goes ding multiple times */
-static void
-test_interval_kevent(int usec)
-{
-       struct kevent64_s kev;
-       int ret;
-
-       T_SETUPBEGIN;
-
-       uint64_t test_duration_us = USEC_PER_SEC; /* 1 second */
-       uint64_t expected_pops;
-
-       if (usec < 0)
-               expected_pops = 1; /* TODO: test 'and only once' */
-       else
-               expected_pops = test_duration_us / (uint64_t)usec;
-
-       T_LOG("Testing interval kevent at %d usec intervals (%lld pops/second)...\n",
-               usec, expected_pops);
-
-       EV_SET64(&kev, 3, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, usec, 0, 0, 0);
-       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
-       if (ret != 0 || (kev.flags & EV_ERROR)) {
-               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
-               failed++;
-               return;
-       }
-
-       T_SETUPEND;
-
-       struct timeval before, after;
-       uint64_t elapsed_usecs;
-
-       gettimeofday(&before, NULL);
-
-       uint64_t pops = 0;
-
-       for (uint32_t i = 0; i < expected_pops; i++) {
-               ret = kevent64(kq, NULL, 0, &kev, 1, 0, &failure_timeout);
-               if (ret != 1) {
-                       T_FAIL("%s() failure: kevent64 returned %d\n", __func__, ret);
-                       failed++;
-                       return;
-               }
-
-               //T_LOG("\t ding: %lld\n", kev.data);
-
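-               /*
-                * For a repeating timer, kev.data holds the number of times
-                * the timer fired since it was last collected, so accumulate
-                * it rather than counting returned events.
-                */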
-               pops += (uint64_t)kev.data;
-               gettimeofday(&after, NULL);
-               elapsed_usecs = (uint64_t)((after.tv_sec - before.tv_sec) * (int64_t)USEC_PER_SEC +
-                       (after.tv_usec - before.tv_usec));
-
-               if (elapsed_usecs > test_duration_us)
-                       break;
-       }
-
-       /* check how many times the timer fired: within 5%? */
-       if (pops > expected_pops + (expected_pops / 20) ||
-               pops < expected_pops - (expected_pops / 20)) {
-               T_FAIL("%s() usec:%d (saw %lld of %lld expected pops)", __func__, usec, pops, expected_pops);
-               failed++;
-       } else {
-               T_PASS("%s() usec:%d (saw %lld pops)", __func__, usec, pops);
-               passed++;
-       }
-
-       EV_SET64(&kev, 3, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0);
-       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
-       if (ret != 0) {
-               T_LOG("\tfailed to stop repeating timer: %d\n", ret);
-       }
-}
-
-/* Test that the repeating timer repeats even while not polling in kqueue */
-static void
-test_repeating_kevent(int usec)
-{
-       struct kevent64_s kev;
-       int ret;
-
-       T_SETUPBEGIN;
-
-       uint64_t test_duration_us = USEC_PER_SEC; /* 1 second */
-
-       uint64_t expected_pops = test_duration_us / (uint64_t)usec;
-       T_LOG("Testing repeating kevent at %d usec intervals (%lld pops/second)...\n",
-               usec, expected_pops);
-
-       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ADD, NOTE_USECONDS, usec, 0, 0, 0);
-       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
-       if (ret != 0) {
-               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
-               failed++;
-               return;
-       }
-
-       usleep((useconds_t)test_duration_us);
-
-       ret = kevent64(kq, NULL, 0, &kev, 1, 0, &failure_timeout);
-       if (ret != 1 || (kev.flags & EV_ERROR)) {
-               T_FAIL("%s() setup failure: kevent64 returned %d\n", __func__, ret);
-               failed++;
-               return;
-       }
-
-       T_SETUPEND;
-
-       uint64_t pops = (uint64_t) kev.data;
-
-       /* check how many times the timer fired: within 5%? */
-       if (pops > expected_pops + (expected_pops / 20) ||
-               pops < expected_pops - (expected_pops / 20)) {
-               T_FAIL("%s() usec:%d (saw %lld of %lld expected pops)", __func__, usec, pops, expected_pops);
-               failed++;
-       } else {
-               T_PASS("%s() usec:%d (saw %lld pops)", __func__, usec, pops);
-               passed++;
-       }
-
-       EV_SET64(&kev, 4, EVFILT_TIMER, EV_DELETE, 0, 0, 0, 0, 0);
-       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
-       if (ret != 0) {
-               T_LOG("\tfailed to stop repeating timer: %d\n", ret);
-       }
-}
-
-
-static void
-test_updated_kevent(int first, int second)
-{
-       struct kevent64_s kev;
-       int ret;
-
-       T_LOG("Testing update from %d to %d msecs...\n", first, second);
-
-       T_SETUPBEGIN;
-
-       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ADD|EV_ONESHOT, 0, first, 0, 0, 0);
-       ret = kevent64(kq, &kev, 1, NULL, 0, 0, NULL);
-       if (ret != 0) {
-               T_FAIL("%s() failure: initial kevent returned %d\n", __func__, ret);
-               failed++;
-               return;
-       }
-
-       T_SETUPEND;
-
-       EV_SET64(&kev, 4, EVFILT_TIMER, EV_ONESHOT, 0, second, 0, 0, 0);
-
-       uint64_t expected_us = (uint64_t)second * 1000;
-
-       if (second < 0)
-               expected_us = 0;
-
-       ret = do_simple_kevent(&kev, expected_us);
-
-       if (ret) {
-               passed++;
-               T_PASS("%s() %d, %d", __func__, first, second);
-       } else {
-               failed++;
-               T_FAIL("%s() %d, %d", __func__, first, second);
-       }
-}
-
-static void
-disable_timer_coalescing(void)
-{
-       struct task_qos_policy  qosinfo;
-       kern_return_t           kr;
-
-       T_SETUPBEGIN;
-
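-       /*
-        * Requesting the most latency-sensitive QoS tiers asks the kernel to
-        * apply as little timer coalescing as possible to this task.
-        */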
-       qosinfo.task_latency_qos_tier = LATENCY_QOS_TIER_0;
-       qosinfo.task_throughput_qos_tier = THROUGHPUT_QOS_TIER_0;
-
-       kr = task_policy_set(mach_task_self(), TASK_OVERRIDE_QOS_POLICY, (task_policy_t)&qosinfo,
-                            TASK_QOS_POLICY_COUNT);
-       if (kr != KERN_SUCCESS) {
-               T_FAIL("task_policy_set(... TASK_OVERRIDE_QOS_POLICY ...) failed: %d (%s)", kr, mach_error_string(kr));
-       }
-
-       T_SETUPEND;
-}
-
-T_DECL(kqueue_timer_tests,
-       "Tests assorted kqueue operations for timer-related events")
-{
-       /*
-        * Since we're trying to test timers here, disable timer coalescing
-        * to improve the accuracy of timer fires for this process.
-        */
-       disable_timer_coalescing();
-
-       mach_timebase_info(&timebase_info);
-
-       kq = kqueue();
-       assert(kq > 0);
-       passed = 0;
-       failed = 0;
-
-       test_absolute_kevent(100, 0);
-       test_absolute_kevent(200, 0);
-       test_absolute_kevent(300, 0);
-       test_absolute_kevent(1000, 0);
-       T_MAYFAIL;
-       test_absolute_kevent(500, NOTE_USECONDS);
-       T_MAYFAIL;
-       test_absolute_kevent(100, NOTE_USECONDS);
-       T_MAYFAIL;
-       test_absolute_kevent(2, NOTE_SECONDS);
-       T_MAYFAIL;
-       test_absolute_kevent(-1000, 0);
-
-       T_MAYFAIL;
-       test_absolute_kevent((int)nanos_to_abs(10 * NSEC_PER_MSEC), NOTE_MACHTIME);
-
-       test_oneshot_kevent(1, NOTE_SECONDS);
-       T_MAYFAIL;
-       test_oneshot_kevent(10, 0);
-       T_MAYFAIL;
-       test_oneshot_kevent(200, NOTE_USECONDS);
-       T_MAYFAIL;
-       test_oneshot_kevent(300000, NOTE_NSECONDS);
-       T_MAYFAIL;
-       test_oneshot_kevent(-1, NOTE_SECONDS);
-
-       T_MAYFAIL;
-       test_oneshot_kevent((int)nanos_to_abs(10 * NSEC_PER_MSEC), NOTE_MACHTIME);
-
-       test_interval_kevent(250 * 1000);
-       T_MAYFAIL;
-       test_interval_kevent(5 * 1000);
-       T_MAYFAIL;
-       test_interval_kevent(200);
-       T_MAYFAIL;
-       test_interval_kevent(50);
-
-       test_interval_kevent(-1000);
-
-       test_repeating_kevent(10000); /* 10ms */
-
-       test_updated_kevent(1000, 2000);
-       test_updated_kevent(2000, 1000);
-       test_updated_kevent(1000, -1);
-
-}
diff --git a/tools/tests/darwintests/launchd_plists/com.apple.xnu.test.kevent_qos.plist b/tools/tests/darwintests/launchd_plists/com.apple.xnu.test.kevent_qos.plist
deleted file mode 100644 (file)
index bf3c2f4..0000000
+++ /dev/null
@@ -1,24 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>Label</key>
-       <string>com.apple.xnu.test.kevent_qos</string>
-       <key>MachServices</key>
-       <dict>
-               <key>com.apple.xnu.test.kevent_qos</key>
-               <true/>
-       </dict>
-       <key>ThrottleInterval</key>
-       <integer>1</integer>
-       <key>UserName</key>
-       <string>root</string>
-       <key>ProcessType</key>
-       <string>Adaptive</string>
-       <key>EnvironmentVariables</key>
-       <dict>
-               <key>MallocNanoZone</key>
-               <string>1</string>
-       </dict>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/mach_boottime_usec.c b/tools/tests/darwintests/mach_boottime_usec.c
deleted file mode 100644 (file)
index ad0ac32..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <stdlib.h>
-#include <time.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <mach/mach_time.h>
-
-#include <darwintest.h>
-
-T_DECL(mach_boottime_usec, "mach_boottime_usec()",
-               T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       uint64_t bt_usec = mach_boottime_usec();
-
-       struct timeval bt_tv;
-       size_t len = sizeof(bt_tv);
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.boottime", &bt_tv, &len, NULL, 0), NULL);
-
-       T_EXPECT_EQ((uint64_t)bt_tv.tv_sec * USEC_PER_SEC + (uint64_t)bt_tv.tv_usec, bt_usec, NULL);
-}
diff --git a/tools/tests/darwintests/mach_continuous_time.c b/tools/tests/darwintests/mach_continuous_time.c
deleted file mode 100644 (file)
index a7d773b..0000000
+++ /dev/null
@@ -1,367 +0,0 @@
-#include <mach/mach.h>
-#include <mach/mach_time.h>
-#include <mach/clock_types.h>
-#include <sys/time.h>
-#include <spawn.h>
-#include <sys/wait.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <time.h>
-#include <errno.h>
-
-#include <darwintest.h>
-
-#if (defined(__arm__) || defined(__arm64__))
-#define HAS_KERNEL_TIME_TRAPS
-
-extern uint64_t mach_absolute_time_kernel(void);
-extern uint64_t mach_continuous_time_kernel(void);
-
-#endif
-extern char **environ;
-
-static const int64_t one_mil = 1000*1000;
-
-#define to_ns(ticks) (((ticks) * tb_info.numer) / (tb_info.denom))
-#define to_ms(ticks) (to_ns(ticks)/one_mil)
-
-static mach_timebase_info_data_t tb_info;
-
-static void
-update(uint64_t *a, uint64_t *c) {
-       mach_get_times(a,c,NULL);
-}
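-
-/*
- * mach_absolute_time() does not advance while the system is asleep, whereas
- * mach_continuous_time() does; the tests below verify that the two clocks
- * stay in lockstep except across sleep.
- */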
-
-T_DECL(mct_monotonic, "Testing mach_continuous_time returns sane, monotonic values",
-               T_META_ALL_VALID_ARCHS(true))
-{
-       mach_timebase_info(&tb_info);
-#ifdef HAS_KERNEL_TIME_TRAPS
-       bool kernel = false;
-#endif
-
-       volatile uint64_t multiple_test = to_ms(mach_continuous_time());
-       for(int i = 0; i < 20; i++) {
-               uint64_t tmp;
-               const char *test_type = "user";
-#ifdef HAS_KERNEL_TIME_TRAPS
-               if (kernel) {
-                       test_type = "kernel";
-                       tmp = mach_continuous_time_kernel();
-               } else
-                       tmp = mach_continuous_time();
-               kernel = !kernel;
-#else
-               tmp = mach_continuous_time();
-#endif
-               tmp = to_ms(tmp);
-               T_ASSERT_GE(tmp, multiple_test, "mach_continuous_time (%s) must be monotonic", test_type);
-
-               // each successive call shouldn't be more than 100ms in the future
-               T_ASSERT_LE(tmp - multiple_test, 100ULL, "mach_continuous_time (%s) should not jump forward too fast", test_type);
-
-               multiple_test = tmp;
-       }
-}
-
-T_DECL(mat_monotonic, "Testing mach_absolute_time returns sane, monotonic values",
-               T_META_ALL_VALID_ARCHS(true))
-{
-       mach_timebase_info(&tb_info);
-#ifdef HAS_KERNEL_TIME_TRAPS
-       bool kernel = false;
-#endif
-
-       volatile uint64_t multiple_test = to_ms(mach_absolute_time());
-       for(int i = 0; i < 20; i++) {
-               uint64_t tmp;
-               const char *test_type = "user";
-#ifdef HAS_KERNEL_TIME_TRAPS
-               if (kernel) {
-                       test_type = "kernel";
-                       tmp = mach_absolute_time_kernel();
-               } else
-                       tmp = mach_absolute_time();
-               kernel = !kernel;
-#else
-               tmp = mach_absolute_time();
-#endif
-               tmp = to_ms(tmp);
-               T_ASSERT_GE(tmp, multiple_test, "mach_absolute_time (%s) must be monotonic", test_type);
-
-               // each successive call shouldn't be more than 100ms in the future
-               T_ASSERT_LE(tmp - multiple_test, 100ULL, "mach_absolute_time (%s) should not jump forward too fast", test_type);
-
-               multiple_test = tmp;
-       }
-}
-
-T_DECL(mct_pause, "Testing mach_continuous_time and mach_absolute_time don't diverge")
-{
-       mach_timebase_info(&tb_info);
-
-       uint64_t abs_now;
-       uint64_t cnt_now;
-       int before_diff, after_diff;
-
-       update(&abs_now, &cnt_now);
-       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-       sleep(1);
-
-       update(&abs_now, &cnt_now);
-       after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-       T_ASSERT_LE(abs(after_diff - before_diff), 1, "mach_continuous_time and mach_absolute_time should not diverge");
-}
-
-#ifdef HAS_KERNEL_TIME_TRAPS
-static void update_kern(uint64_t *abs, uint64_t *cont)
-{
-       uint64_t abs1, abs2, cont1, cont2;
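-       /*
-        * Sample both clocks twice and retry until no measurable time (at
-        * millisecond granularity) elapsed between the samples, so the caller
-        * gets a coherent absolute/continuous pair.
-        */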
-       do {
-               abs1 = mach_absolute_time_kernel();
-               cont1 = mach_continuous_time_kernel();
-               abs2 = mach_absolute_time_kernel();
-               cont2 = mach_continuous_time_kernel();
-       } while (to_ms(abs2 - abs1) || to_ms(cont2 - cont1));
-       *abs = abs2;
-       *cont = cont2;
-}
-#endif
-
-#ifdef HAS_KERNEL_TIME_TRAPS
-T_DECL(mct_pause_kern, "Testing kernel mach_continuous_time and mach_absolute_time don't diverge")
-{
-       mach_timebase_info(&tb_info);
-
-       uint64_t abs_now;
-       uint64_t cnt_now;
-       int before_diff, after_diff;
-
-       update_kern(&abs_now, &cnt_now);
-       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-       sleep(1);
-
-       update_kern(&abs_now, &cnt_now);
-       after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-       T_ASSERT_LE(abs(after_diff - before_diff), 1, "mach_continuous_time_kernel and mach_absolute_time_kernel should not diverge");
-}
-#endif
-
-T_DECL(mct_sleep, "Testing mach_continuous_time behavior over system sleep"){
-#ifndef MCT_SLEEP_TEST
-       T_SKIP("Skipping test that sleeps the device; compile with MCT_SLEEP_TEST define to enable.");
-#endif
-
-       mach_timebase_info(&tb_info);
-
-       uint64_t abs_now;
-       uint64_t cnt_now;
-       int before_diff, after_diff = 0;
-
-       T_LOG("Testing mach_continuous_time is ~5 seconds ahead of mach_absolute_time after 5 second sleep");
-       update(&abs_now, &cnt_now);
-       before_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-       // performs:
-       // pmset relative wake 5
-       // pmset sleepnow
-
-       pid_t pid;
-       int spawn_ret = 0;
-       time_t before_sleep = time(NULL);
-       int ct_ms_before_sleep = (int)to_ms(cnt_now);
-       int ab_ms_before_sleep = (int)to_ms(abs_now);
-
-       char *const pmset1_args[] = {"/usr/bin/pmset", "relative", "wake", "5", NULL};
-       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset1_args[0], NULL, NULL, pmset1_args, environ)), NULL);
-
-       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
-       T_ASSERT_EQ(spawn_ret, 0, "pmset relative wake 5 failed");
-
-       char *const pmset2_args[] = {"/usr/bin/pmset", "sleepnow", NULL};
-       T_ASSERT_POSIX_ZERO((spawn_ret = posix_spawn(&pid, pmset2_args[0], NULL, NULL, pmset2_args, environ)), NULL);
-
-       T_ASSERT_EQ(waitpid(pid, &spawn_ret, 0), pid, "waitpid failed");
-       T_ASSERT_EQ(spawn_ret, 0, "pmset sleepnow failed");
-
-       // wait for device to sleep (up to 30 seconds)
-       for(int i = 0; i < 30; i++) {
-               update(&abs_now, &cnt_now);
-               after_diff = (int)(to_ms(cnt_now) - to_ms(abs_now));
-
-               // on OSX, there's enough latency between calls to MCT and MAT
-               // when the system is going down for sleep for values to diverge a few ms
-               if(abs(before_diff - after_diff) > 2) {
-                       break;
-               }
-
-               sleep(1);
-               T_LOG("waited %d seconds for sleep...", i+1);
-       }
-
-       if((after_diff - before_diff) < 4000) {
-               T_LOG("Device slept for less than 4 seconds, did it really sleep? (%d ms change between abs and cont)",
-                       after_diff - before_diff);
-       }
-
-       time_t after_sleep = time(NULL);
-
-       int cal_sleep_diff  = (int)(double)difftime(after_sleep, before_sleep);
-       int ct_sleep_diff = ((int)to_ms(cnt_now) - ct_ms_before_sleep)/1000;
-       int ab_sleep_diff = ((int)to_ms(abs_now) - ab_ms_before_sleep)/1000;
-
-       T_LOG("Calendar progressed: %d sec; continuous time progressed: %d sec; absolute time progressed %d sec",
-               cal_sleep_diff, ct_sleep_diff, ab_sleep_diff);
-
-       T_ASSERT_LE(abs(ct_sleep_diff - cal_sleep_diff), 2,
-               "continuous time should progress at ~ same rate as calendar");
-}
-
-T_DECL(mct_settimeofday, "Testing mach_continuous_time behavior over settimeofday"){
-       if (geteuid() != 0){
-               T_SKIP("The settimeofday() test requires root privileges to run.");
-       }
-       mach_timebase_info(&tb_info);
-
-       struct timeval saved_tv;
-       struct timezone saved_tz;
-       int before, after;
-
-       T_ASSERT_POSIX_ZERO(gettimeofday(&saved_tv, &saved_tz), NULL);
-
-       struct timeval forward_tv = saved_tv;
-       // move time forward by two minutes, ensure mach_continuous_time keeps
-       // chugging along with mach_absolute_time
-       forward_tv.tv_sec += 2*60;
-
-       before = (int)to_ms(mach_continuous_time());
-       T_ASSERT_POSIX_ZERO(settimeofday(&forward_tv, &saved_tz), NULL);
-
-       after = (int)to_ms(mach_continuous_time());
-       T_ASSERT_POSIX_ZERO(settimeofday(&saved_tv, &saved_tz), NULL);
-
-       T_ASSERT_LT(abs(before - after), 1000, "mach_continuous_time should not jump more than 1s");
-}
-
-#ifdef HAS_KERNEL_TIME_TRAPS
-T_DECL(mct_settimeofday_kern, "Testing kernel mach_continuous_time behavior over settimeofday"){
-       if (geteuid() != 0){
-               T_SKIP("The settimeofday() test requires root privileges to run.");
-       }
-       mach_timebase_info(&tb_info);
-
-       struct timeval saved_tv;
-       struct timezone saved_tz;
-       int before, after;
-
-       T_ASSERT_POSIX_ZERO(gettimeofday(&saved_tv, &saved_tz), NULL);
-
-       struct timeval forward_tv = saved_tv;
-       // move time forward by two minutes, ensure mach_continuous_time keeps
-       // chugging along with mach_absolute_time
-       forward_tv.tv_sec += 2*60;
-
-       before = (int)to_ms(mach_continuous_time_kernel());
-       T_ASSERT_POSIX_ZERO(settimeofday(&forward_tv, &saved_tz), NULL);
-
-       after = (int)to_ms(mach_continuous_time_kernel());
-       T_ASSERT_POSIX_ZERO(settimeofday(&saved_tv, &saved_tz), NULL);
-
-       T_ASSERT_LT(abs(before - after), 1000, "mach_continuous_time_kernel should not jump more than 1s");
-}
-#endif
-
-T_DECL(mct_aproximate, "Testing mach_continuous_approximate_time()",
-               T_META_ALL_VALID_ARCHS(true))
-{
-       mach_timebase_info(&tb_info);
-
-       uint64_t absolute = to_ns(mach_continuous_time());
-       uint64_t approximate = to_ns(mach_continuous_approximate_time());
-
-       T_EXPECT_LE(llabs((long long)absolute - (long long)approximate), (long long)(25*NSEC_PER_MSEC), NULL);
-}
-
-T_DECL(mach_time_perf, "mach_time performance") {
-       {
-               dt_stat_time_t s = dt_stat_time_create("mach_absolute_time");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_absolute_time();
-               }
-               dt_stat_finalize(s);
-       }
-       {
-               dt_stat_time_t s = dt_stat_time_create("mach_continuous_time");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_continuous_time();
-               }
-               dt_stat_finalize(s);
-       }
-}
-
-T_DECL(mach_time_perf_instructions, "instructions retired for mach_time", T_META_TYPE_PERF, T_META_ASROOT(YES)) {
-       {
-               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_absolute_time");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_absolute_time();
-               }
-               dt_stat_finalize(s);
-       }
-       {
-               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_continuous_time");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_continuous_time();
-               }
-               dt_stat_finalize(s);
-       }
-}
-
-#ifdef HAS_KERNEL_TIME_TRAPS
-T_DECL(mach_time_perf_kern, "kernel mach_time performance") {
-       {
-               dt_stat_time_t s = dt_stat_time_create("mach_absolute_time_kernel");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_absolute_time_kernel();
-               }
-               dt_stat_finalize(s);
-       }
-       {
-               dt_stat_time_t s = dt_stat_time_create("mach_continuous_time_kernel");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_continuous_time_kernel();
-               }
-               dt_stat_finalize(s);
-       }
-}
-
-T_DECL(mach_time_perf_instructions_kern, "instructions retired for kernel mach_time", T_META_TYPE_PERF, T_META_ASROOT(YES)) {
-       {
-               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_absolute_time_kernel");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_absolute_time_kernel();
-               }
-               dt_stat_finalize(s);
-       }
-       {
-               dt_stat_thread_instructions_t s = dt_stat_thread_instructions_create("mach_continuous_time_kernel");
-               T_STAT_MEASURE_LOOP(s) {
-                       uint64_t t;
-                       t = mach_continuous_time_kernel();
-               }
-               dt_stat_finalize(s);
-       }
-}
-#endif
-
diff --git a/tools/tests/darwintests/mach_get_times.c b/tools/tests/darwintests/mach_get_times.c
deleted file mode 100644 (file)
index c40bada..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-#include <stdlib.h>
-#include <time.h>
-#include <sys/time.h>
-#include <mach/mach_time.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#define T_LOG_VERBOSE(...)
-
-#define timespec2nanosec(ts) ((uint64_t)((ts)->tv_sec) * NSEC_PER_SEC + (uint64_t)((ts)->tv_nsec))
-
-T_DECL(mach_get_times, "mach_get_times()",
-          T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
-{
-       const int ITERATIONS = 500000 * dt_ncpu();
-       struct timespec gtod_ts;
-
-       uint64_t last_absolute, last_continuous, last_gtod;
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&last_absolute, &last_continuous, &gtod_ts), KERN_SUCCESS, NULL);
-       last_gtod = timespec2nanosec(&gtod_ts);
-
-       for (int i = 0; i < ITERATIONS; i++) {
-               uint64_t absolute, continuous, gtod;
-               T_QUIET; T_ASSERT_EQ(mach_get_times(&absolute, &continuous, &gtod_ts), KERN_SUCCESS, NULL);
-               gtod = timespec2nanosec(&gtod_ts);
-
-               T_LOG_VERBOSE("[%d] abs: %llu.%09llu(+%llu)\tcont: %llu.%09llu(+%llu)\tgtod:%llu.%09llu(+%llu)", i,
-                               absolute / NSEC_PER_SEC, absolute % NSEC_PER_SEC, absolute - last_absolute,
-                               continuous / NSEC_PER_SEC, continuous % NSEC_PER_SEC, continuous - last_continuous,
-                               gtod / NSEC_PER_SEC, gtod % NSEC_PER_SEC, gtod - last_gtod);
-
-               T_QUIET; T_EXPECT_EQ(absolute - last_absolute, continuous - last_continuous, NULL);
-
-               int64_t gtod_diff = (int64_t)gtod - (int64_t)last_gtod;
-               T_QUIET; T_ASSERT_LE((uint64_t)llabs(gtod_diff), NSEC_PER_SEC, NULL);
-
-               last_absolute = absolute;
-               last_continuous = continuous;
-               last_gtod = gtod;
-
-               gtod_ts.tv_sec = 0; gtod_ts.tv_nsec = 0;
-       }
-}
diff --git a/tools/tests/darwintests/mach_port_deallocate_21692215.c b/tools/tests/darwintests/mach_port_deallocate_21692215.c
deleted file mode 100644 (file)
index 4b84428..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-#define T_NAMESPACE "xnu.ipc"
-#include <darwintest.h>
-#include <mach/mach.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-#define NR_PORTS 4
-
-T_DECL(mach_port_deallocate, "mach_port_deallocate also deallocates the PORT_SET"){
-       mach_port_t port_set;
-       mach_port_t port[NR_PORTS];
-       int i,ret;
-
-       ret= mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &port_set);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_PORT_SET");
-
-       for(i=0;i<NR_PORTS;i++){
-               ret= mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port[i]);
-               T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE");
-
-               ret= mach_port_move_member(mach_task_self(), port[i], port_set);
-               T_ASSERT_MACH_SUCCESS(ret, "mach_port_move_member");
-       }
-
-       T_LOG("Ports created");
-
-       /* do something */
-
-       for(i=0;i<NR_PORTS;i++){
-               ret= mach_port_mod_refs(mach_task_self(), port[i], MACH_PORT_RIGHT_RECEIVE, -1);
-               T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs -1 RIGHT_RECEIVE");
-       }
-
-       ret= mach_port_deallocate(mach_task_self(), port_set);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_deallocate PORT_SET");
-
-       T_LOG("Ports erased");
-}
diff --git a/tools/tests/darwintests/mach_port_mod_refs.c b/tools/tests/darwintests/mach_port_mod_refs.c
deleted file mode 100644 (file)
index 3e5d2f3..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#define T_NAMESPACE "xnu.ipc"
-#include <darwintest.h>
-#include <mach/mach.h>
-#include <stdlib.h>
-#include <stdio.h>
-
-
-T_DECL(mach_port_mod_refs, "mach_port_mod_refs"){
-       mach_port_t port_set;
-       mach_port_t port;
-       int ret;
-
-       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_PORT_SET, &port_set);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_PORT_SET");
-
-       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_allocate MACH_PORT_RIGHT_RECEIVE");
-
-
-       /*
-        * Test all known variants of port rights on each type of port
-        */
-
-       /* can't subtract a send right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND: -1 on a RECV right");
-
-       /* can't subtract a send once right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_SEND_ONCE, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND_ONCE: -1 on a RECV right");
-
-       /* can't subtract a PORT SET right if it's not a port set */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_PORT_SET, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs PORT_SET: -1 on a RECV right");
-
-       /* can't subtract a dead name right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_DEAD_NAME, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs DEAD_NAME: -1 on a RECV right");
-
-       /* can't subtract a LABELH right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_LABELH, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs LABELH: -1 on a RECV right");
-
-       /* can't subtract an invalid right-type */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_NUMBER, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER: -1 on a RECV right");
-
-       /* can't subtract an invalid right-type */
-       ret = mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_NUMBER + 1, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER+1: -1 on a RECV right");
-
-
-       /* can't subtract a send right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_SEND, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND: -1 on a PORT_SET right");
-
-       /* can't subtract a send once right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_SEND_ONCE, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs SEND_ONCE: -1 on a PORT_SET right");
-
-       /* can't subtract a receive right if it's a port set */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_RECEIVE, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs RECV: -1 on a PORT_SET right");
-
-       /* can't subtract a dead name right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_DEAD_NAME, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs DEAD_NAME: -1 on a PORT_SET right");
-
-       /* can't subtract a LABELH right if it doesn't exist */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_LABELH, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_RIGHT, "mach_port_mod_refs LABELH: -1 on a PORT_SET right");
-
-       /* can't subtract an invalid right-type */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_NUMBER, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER: -1 on a PORT_SET right");
-
-       /* can't subtract an invalid right-type */
-       ret = mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_NUMBER + 1, -1);
-       T_ASSERT_EQ(ret, KERN_INVALID_VALUE, "mach_port_mod_refs NUMBER+1: -1 on a PORT_SET right");
-
-       /*
-        * deallocate the ports/sets
-        */
-       ret= mach_port_mod_refs(mach_task_self(), port_set, MACH_PORT_RIGHT_PORT_SET, -1);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs(PORT_SET, -1)");
-
-       ret= mach_port_mod_refs(mach_task_self(), port, MACH_PORT_RIGHT_RECEIVE, -1);
-       T_ASSERT_MACH_SUCCESS(ret, "mach_port_mod_refs(RECV_RIGHT, -1)");
-}
diff --git a/tools/tests/darwintests/mach_timebase_info.c b/tools/tests/darwintests/mach_timebase_info.c
deleted file mode 100644 (file)
index 51f3bb4..0000000
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <mach/mach_time.h>
-
-#include <darwintest.h>
-
-extern kern_return_t mach_timebase_info_trap(mach_timebase_info_t info);
-
-T_DECL(mach_timebase_info, "mach_timebase_info(_trap)",
-               T_META_ALL_VALID_ARCHS(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       mach_timebase_info_data_t a, b, c;
-
-       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&a), NULL);
-       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info(&b), NULL);
-       T_ASSERT_EQ(KERN_SUCCESS, mach_timebase_info_trap(&c), NULL);
-
-       T_EXPECT_EQ(a.numer, b.numer, NULL);
-       T_EXPECT_EQ(a.denom, b.denom, NULL);
-       T_EXPECT_EQ(a.numer, c.numer, NULL);
-       T_EXPECT_EQ(a.denom, c.denom, NULL);
-}
diff --git a/tools/tests/darwintests/memorystatus_vm_map_fork.c b/tools/tests/darwintests/memorystatus_vm_map_fork.c
deleted file mode 100644 (file)
index dc92e5c..0000000
+++ /dev/null
@@ -1,467 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <assert.h>
-#include <signal.h>
-#include <spawn.h>
-#include <spawn_private.h>
-#include <stdint.h>
-#include <sys/sysctl.h>
-#include <sys/spawn_internal.h>
-#include <sys/kern_memorystatus.h>
-#include <mach-o/dyld.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.vm"),
-       T_META_CHECK_LEAKS(false)
-);
-
-extern char **environ;
-
-/*
- * This test file contains two sub-tests which verify whether a corpse is
- * allowed to be generated for crashreporter when a task exceeds its memory
- * allocation limit. vm_map_fork() is the kernel routine used to generate a
- * corpse task.
- *
- * A corpse is allowed to be taken if the memory resource limit the task
- * exceeded is less than 1/2 of the system-wide task limit.
- * If the amount exceeds 1/2 the system-wide limit, the corpse is disallowed.
- *
- * If the device under test is already under pressure, the test
- * could fail due to jetsam cutting in and killing the parent, child or
- * other necessary testing processes.
- */
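-
-/*
- * Illustrative arithmetic, using the LIMIT_DELTA_MB = 5 delta defined below:
- * with a hypothetical 2048 MB system-wide task limit, the "allowed" variant
- * gives the child a 2048/2 - 5 = 1019 MB limit, while the "not allowed"
- * variant gives it 2048/2 + 5 = 1029 MB.
- */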
-
-/* Test variants */
-#define TEST_ALLOWED    0x1
-#define TEST_NOT_ALLOWED 0x2
-
-/*
- * Values which the kernel OR's into the PID when a corpse
- * is either allowed or disallowed for the
- * kern.memorystatus_vm_map_fork_pidwatch sysctl.
- */
-#define MEMORYSTATUS_VM_MAP_FORK_ALLOWED       0x100000000ul
-#define MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED 0x200000000ul
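-
-/*
- * The sysctl value reads back as (flag | pid).  For example, for a
- * hypothetical child pid of 1234 whose corpse was allowed, the value would
- * be 0x100000000 | 1234 = 0x1000004d2.
- */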
-
-/*
- * The memory allocation happens in a child process, this
- * is stuff to deal with creating and managing the child.
- * The child will only execute the T_HELPER_DECL.
- */
-static char testpath[PATH_MAX];
-static uint32_t testpath_size = sizeof(testpath);
-#define LIMIT_DELTA_MB 5 /* an arbitrary limit delta */
-#define MEGABYTE       (1024 * 1024)
-
-/*
- * The child process communicates back to parent via an exit() code.
- */
-enum child_exits {
-       NORMAL_EXIT = 0,
-       NO_MEMSIZE_ARG,
-       INVALID_MEMSIZE,
-       MALLOC_FAILED,
-       NUM_CHILD_EXIT
-};
-static char *child_exit_why[] = {
-       "normal exit",
-       "no memsize argument to child",
-       "invalid memsize argument to child",
-       "malloc() failed",
-};
-
-/*
- * Corpse collection only happens in development kernels.
- * So we need this to detect if the test is relevant.
- */
-static boolean_t
-is_development_kernel(void)
-{
-       int ret;
-       int dev = 0;
-       size_t dev_size = sizeof(dev);
-
-       ret = sysctlbyname("kern.development", &dev, &dev_size, NULL, 0);
-       if (ret != 0) {
-               return FALSE;
-       }
-
-       return (dev != 0);
-}
-
-/*
- * Set/Get the sysctl used to determine if corpse collection occurs.
- * This is done by the kernel checking for a specific PID.
- */
-static void
-set_memorystatus_vm_map_fork_pidwatch(pid_t pid)
-{
-       uint64_t new_value = (uint64_t)pid;
-       size_t new_len = sizeof(new_value);
-       int err;
-
-       err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", NULL, NULL, &new_value, new_len);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(err, "set sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
-       return;
-}
-
-static uint64_t
-get_memorystatus_vm_map_fork_pidwatch()
-{
-       uint64_t value = 0;
-       size_t val_len = sizeof(value);
-       int err;
-
-       err = sysctlbyname("kern.memorystatus_vm_map_fork_pidwatch", &value, &val_len, NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(err, "get sysctlbyname(kern.memorystatus_vm_map_fork_pidwatch...) failed");
-
-       return value;
-}
-
-/*
- * We want to avoid jetsam giving us bad results, if possible. So check if there's
- * enough memory for the test to run, waiting briefly for some to free up.
- */
-static void
-wait_for_free_mem(int need_mb)
-{
-       int64_t         memsize;
-       int             memorystatus_level;
-       size_t          size;
-       int64_t         avail;
-       int             err;
-       int             try;
-
-       /*
-        * get amount of memory in the machine
-        */
-       size = sizeof(memsize);
-       err = sysctlbyname("hw.memsize", &memsize, &size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(hw.memsize...) failed");
-
-       /*
-        * Use a loop to briefly sleep and recheck if short on memory.
-        */
-       try = 1;
-       for (;;) {
-
-               /*
-                * memorystatus_level is a percentage of memory available. For example 20 means 1/5 of memory.
-                * It currently doesn't exist on macOS but neither does jetsam, so pass the test there.
-                */
-               size = sizeof(memorystatus_level);
-               if (sysctlbyname("kern.memorystatus_level", &memorystatus_level, &size, NULL, 0) != 0)
-                       return;
-               T_QUIET; T_ASSERT_LE(memorystatus_level, 100, "memorystatus_level too high");
-               T_QUIET; T_ASSERT_GT(memorystatus_level, 0, "memorystatus_level negative");
-
-               /*
-                * jetsam kicks in at memory status level of 15%, so subtract that much out of what's available.
-                */
-               avail = MAX(0, (memsize * (memorystatus_level - 15)) / 100);
-
-               /*
-                * We're good to go if there's more than enough available.
-                */
-               if ((int64_t)need_mb * MEGABYTE < avail)
-                       return;
-
-               /*
-                * issue a message to log and sleep briefly to see if we can get more memory
-                */
-               if (try-- == 0)
-                       break;
-               T_LOG("Need %d MB, only %d MB available. sleeping 5 seconds for more to free. memorystatus_level %d",
-                   need_mb, (int)(avail / MEGABYTE), memorystatus_level);
-               sleep(5);
-       }
-       T_SKIP("Needed %d MB, but only %d MB available. Skipping test to avoid jetsam issues.",
-           need_mb, (int)(avail / MEGABYTE));
-}
-
-
-/*
- * The main test calls this to spawn a child process which will run and
- * exceed some memory limit. The child is initially suspended so that
- * we can do the sysctl calls before it runs.
- * Since this is a libdarwintest, the "-n" names the T_HELPER_DECL() that
- * we want to run. The arguments specific to the test follow a "--".
- */
-static pid_t
-spawn_child_process(
-       char * const executable,
-       char * const memlimit,
-       short flags,
-       int priority,
-       int active_limit_mb,
-       int inactive_limit_mb)
-{
-       posix_spawnattr_t spawn_attrs;
-       int err;
-       pid_t child_pid;
-       char * const argv_child[] = { executable, "-n", "child_process", "--", memlimit, NULL };
-
-       err = posix_spawnattr_init(&spawn_attrs);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_init() failed");
-
-       err = posix_spawnattr_setflags(&spawn_attrs, POSIX_SPAWN_START_SUSPENDED);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_setflags() failed");
-
-       err = posix_spawnattr_setjetsam_ext(&spawn_attrs, flags, priority, active_limit_mb, inactive_limit_mb);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawnattr_setjetsam_ext() failed");
-
-       err = posix_spawn(&child_pid, executable, NULL, &spawn_attrs, argv_child, environ);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  posix_spawn() failed");
-
-       return child_pid;
-}
-
-
-/*
- * The parent calls this to continue the suspended child, then wait for its result.
- * We collect its resource usage to verify that the expected amount was allocated.
- */
-static void
-test_child_process(pid_t child_pid, int *status, struct rusage *ru)
-{
-       int err = 0;
-       pid_t got_pid;
-
-       T_LOG("  continuing child[%d]\n", child_pid);
-
-       err = kill(child_pid, SIGCONT);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "  kill(%d, SIGCONT) failed", child_pid);
-
-       T_LOG("  waiting for child[%d] to exit", child_pid);
-
-       got_pid = wait4(child_pid, status, 0, ru);
-       T_QUIET; T_ASSERT_EQ(child_pid, got_pid, "  wait4(%d, ...) returned %d", child_pid, got_pid);
-}
-
-/*
- * The child process executes this code. The easiest way, with given darwintest infrastructure,
- * it has to return information is via exit status.
- */
-T_HELPER_DECL(child_process, "child allocates memory to failure")
-{
-#define BYTESPERALLOC  MEGABYTE
-#define BYTESINEXCESS  (2 * MEGABYTE) /* 2 MB - arbitrary */
-       char *limit;
-       long limit_mb = 0;
-       long max_bytes_to_munch, bytes_remaining, bytes_this_munch;
-       void *mem = NULL;
-
-       /*
-        * This helper is run in a child process. The helper sees one argument
-        * as a string which is the amount of memory in megabytes to allocate.
-        */
-       if (argc != 1)
-               exit(NO_MEMSIZE_ARG);
-
-       limit = argv[0];
-       errno = 0;
-       limit_mb = strtol(limit, NULL, 10);
-       if (errno != 0 || limit_mb <= 0)
-               exit(INVALID_MEMSIZE);
-
-       /* Compute in excess of assigned limit */
-       max_bytes_to_munch = limit_mb * MEGABYTE;
-       max_bytes_to_munch += BYTESINEXCESS;
-
-       for (bytes_remaining = max_bytes_to_munch; bytes_remaining > 0; bytes_remaining -= bytes_this_munch) {
-               bytes_this_munch = MIN(bytes_remaining, BYTESPERALLOC);
-
-               mem = malloc((size_t)bytes_this_munch);
-               if (mem == NULL)
-                       exit(MALLOC_FAILED);
-               arc4random_buf(mem, (size_t)bytes_this_munch);
-       }
-
-       /* We chewed up all the memory we were asked to. */
-       exit(NORMAL_EXIT);
-}
-
-
-/*
- * Actual test body.
- */
-static void
-memorystatus_vm_map_fork_parent(int test_variant)
-{
-       int             max_task_pmem = 0; /* MB */
-       size_t          size = 0;
-       int             active_limit_mb = 0;
-       int             inactive_limit_mb = 0;
-       short           flags = 0;
-       char            memlimit_str[16];
-       pid_t           child_pid;
-       int             child_status;
-       uint64_t        kernel_pidwatch_val;
-       uint64_t        expected_pidwatch_val;
-       int             ret;
-       struct rusage   ru;
-       enum child_exits exit_val;
-
-       /*
-        * The code to set/get the pidwatch sysctl is only in
-        * development kernels. Skip the test if not on one.
-        */
-       if (!is_development_kernel()) {
-               T_SKIP("Can't test on release kernel");
-       }
-
-       /*
-        * Determine a memory limit based on system having one or not.
-        */
-       size = sizeof(max_task_pmem);
-       (void)sysctlbyname("kern.max_task_pmem", &max_task_pmem, &size, NULL, 0);
-       if (max_task_pmem <= 0)
-               max_task_pmem = 0;
-
-       if (test_variant == TEST_ALLOWED) {
-               
-               /*
-                * Tell the child to allocate less than 1/2 the system wide limit.
-                */
-               if (max_task_pmem / 2 - LIMIT_DELTA_MB <= 0) {
-                       active_limit_mb = LIMIT_DELTA_MB;
-               } else {
-                       active_limit_mb = max_task_pmem / 2 - LIMIT_DELTA_MB;
-               }
-               expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
-
-       } else { /* TEST_NOT_ALLOWED */
-
-               /*
-                * Tell the child to allocate more than 1/2 the system wide limit.
-                */
-               active_limit_mb = (max_task_pmem / 2) + LIMIT_DELTA_MB;
-               if (max_task_pmem == 0) {
-                       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
-               } else {
-                       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_NOT_ALLOWED;
-               }
-
-       }
-       inactive_limit_mb = active_limit_mb;
-       T_LOG("using limit of %d Meg", active_limit_mb);
-
-       /*
-        * When run as part of a larger suite, a previous test
-        * may have left the system temporarily with too little
-        * memory to run this test. We try to detect if there is
-        * enough free memory to proceed, waiting a little bit
-        * for memory to free up.
-        */
-       wait_for_free_mem(active_limit_mb);
-
-#if defined(__x86_64__)
-       /*
-        * vm_map_fork() is always allowed on desktop.
-        */
-       expected_pidwatch_val = MEMORYSTATUS_VM_MAP_FORK_ALLOWED;
-#endif
-
-       /*
-        * Prepare the arguments needed to spawn the child process.
-        */
-       memset (memlimit_str, 0, sizeof(memlimit_str));
-       (void)sprintf(memlimit_str, "%d", active_limit_mb);
-
-       ret = _NSGetExecutablePath(testpath, &testpath_size);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "_NSGetExecutablePath(%s, ...)", testpath);
-
-       /*
-        * We put the child process in FOREGROUND to try and keep jetsam's hands off it.
-        */
-       child_pid = spawn_child_process(testpath, memlimit_str, flags,
-           JETSAM_PRIORITY_FOREGROUND, active_limit_mb, inactive_limit_mb);
-
-       expected_pidwatch_val |= (uint64_t)child_pid;
-
-       /*
-        * We only reach here if parent successfully spawned child process.
-        */
-       T_LOG("  spawned child_pid[%d] with memlimit %s (%d)MB\n",
-           child_pid, memlimit_str, active_limit_mb);
-
-       /*
-        * Set the kernel's pidwatch to look for the child.
-        */
-       (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
-       (void)set_memorystatus_vm_map_fork_pidwatch(child_pid);
-
-       /*
-        * Let the child run and wait for it to finish.
-        */
-       test_child_process(child_pid, &child_status, &ru);
-       T_LOG("Child exited with max_rss of %ld", ru.ru_maxrss);
-
-       /*
-        * Retrieve the kernel's pidwatch value. This should now indicate
-        * if the corpse was allowed or not.
-        */
-       kernel_pidwatch_val = get_memorystatus_vm_map_fork_pidwatch();
-       (void)set_memorystatus_vm_map_fork_pidwatch((pid_t)0);
-
-       /*
-        * If the child died abnormally, the test is invalid.
-        */
-       if (!WIFEXITED(child_status)) {
-               if (WIFSIGNALED(child_status)) {
-                       /* jetsam kills a process with SIGKILL */
-                       if (WTERMSIG(child_status) == SIGKILL)
-                               T_LOG("Child appears to have been a jetsam victim");
-                       T_SKIP("Child terminated by signal %d test result invalid", WTERMSIG(child_status));
-               }
-               T_SKIP("child did not exit normally (status=%d) test result invalid", child_status);
-       }
-
-       /*
-        * We don't expect the child to exit for any other reason than success
-        */
-       exit_val = (enum child_exits)WEXITSTATUS(child_status);
-       T_QUIET; T_ASSERT_EQ(exit_val, NORMAL_EXIT, "child exit due to: %s", 
-           (0 <= exit_val && exit_val < NUM_CHILD_EXIT) ? child_exit_why[exit_val] : "unknown");
-
-       /*
-        * If the kernel aborted generating a corpse for other reasons, the test is invalid.
-        */
-       if (kernel_pidwatch_val == -1ull) {
-               T_SKIP("corpse generation was aborted by kernel");
-       }
-
-       /*
-        * We should always have made it through the vm_map_fork() checks in the kernel for this test.
-        */
-       T_QUIET; T_ASSERT_NE_ULLONG(kernel_pidwatch_val, (uint64_t)child_pid, "child didn't trigger corpse generation");
-
-       T_EXPECT_EQ(kernel_pidwatch_val, expected_pidwatch_val, "kernel value 0x%llx - expected 0x%llx",
-           kernel_pidwatch_val, expected_pidwatch_val);
-}
-
-/*
- * The order of these two test functions is important. They will be executed by the test framework in order.
- *
- * We test "not allowed first", then "allowed". If it were the other way around, the corpse from the "allowed"
- * test would likely cause memory pressure and jetsam would likely kill the "not allowed" test.
- */
-T_DECL(memorystatus_vm_map_fork_test_not_allowed, "test that corpse generation was not allowed")
-{
-       memorystatus_vm_map_fork_parent(TEST_NOT_ALLOWED);
-}
-
-T_DECL(memorystatus_vm_map_fork_test_allowed, "test corpse generation allowed")
-{
-       memorystatus_vm_map_fork_parent(TEST_ALLOWED);
-}
diff --git a/tools/tests/darwintests/memorystatus_zone_test.c b/tools/tests/darwintests/memorystatus_zone_test.c
deleted file mode 100644 (file)
index f652725..0000000
+++ /dev/null
@@ -1,586 +0,0 @@
-#include <stdio.h>
-#include <mach/mach_vm.h>
-#include <mach/mach_port.h>
-#include <mach/mach_host.h>
-#include <mach/mach_error.h>
-#include <mach-o/dyld.h>
-#include <sys/sysctl.h>
-#include <sys/kdebug.h>
-#include <sys/mman.h>
-#include <sys/kern_memorystatus.h>
-#include <ktrace/session.h>
-#include <dispatch/private.h>
-
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.vm"),
-       T_META_CHECK_LEAKS(false)
-);
-
-#define TIMEOUT_SECS                                   1500
-
-#if TARGET_OS_EMBEDDED
-#define ALLOCATION_SIZE_VM_REGION                              (16*1024)               /* 16 KB */
-#define ALLOCATION_SIZE_VM_OBJECT                              ALLOCATION_SIZE_VM_REGION
-#else
-#define ALLOCATION_SIZE_VM_REGION                              (1024*1024*100) /* 100 MB */
-#define ALLOCATION_SIZE_VM_OBJECT                              (16*1024)               /* 16 KB */
-#endif
-#define MAX_CHILD_PROCS                                100
-
-#define ZONEMAP_JETSAM_LIMIT_SYSCTL                    "kern.zone_map_jetsam_limit=60"
-
-#define VME_ZONE_TEST_OPT                              "allocate_vm_regions"
-#define VM_OBJECTS_ZONE_TEST_OPT                       "allocate_vm_objects"
-#define GENERIC_ZONE_TEST_OPT                          "allocate_from_generic_zone"
-
-#define VME_ZONE                                                               "VM map entries"
-#define VMOBJECTS_ZONE                                                 "vm objects"
-#define VMENTRY_TO_VMOBJECT_COMPARISON_RATIO   98
-
-#define VM_TAG1                                                                        100
-#define VM_TAG2                                                                        101
-
-enum {
-       VME_ZONE_TEST = 0,
-       VM_OBJECTS_ZONE_TEST,
-       GENERIC_ZONE_TEST,
-};
-
-typedef struct test_config_struct {
-       int test_index;
-       int num_zones;
-       const char *helper_func;
-       mach_zone_name_array_t zone_names;
-} test_config_struct;
-
-static test_config_struct current_test;
-static int num_children = 0;
-static bool test_ending = false;
-static bool within_dispatch_signal_handler = false;
-static bool within_dispatch_timer_handler = false;
-static dispatch_source_t ds_signal = NULL;
-static dispatch_source_t ds_timer = NULL;
-static ktrace_session_t session = NULL;
-
-static mach_zone_info_array_t zone_info_array = NULL;
-static mach_zone_name_t largest_zone_name;
-static mach_zone_info_t largest_zone_info;
-
-static char testpath[PATH_MAX];
-static pid_t child_pids[MAX_CHILD_PROCS];
-static pthread_mutex_t test_ending_mtx;
-
-static void allocate_vm_regions(void);
-static void allocate_vm_objects(void);
-static void allocate_from_generic_zone(void);
-static void cleanup_and_end_test(void);
-static void setup_ktrace_session(void);
-static void spawn_child_process(void);
-static void run_test(void);
-static bool verify_generic_jetsam_criteria(void);
-static bool vme_zone_compares_to_vm_objects(void);
-static void print_zone_map_size(void);
-static void query_zone_info(void);
-static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi);
-
-extern void mach_zone_force_gc(host_t host);
-extern kern_return_t mach_zone_info_for_largest_zone(
-       host_priv_t host,
-       mach_zone_name_t *name,
-       mach_zone_info_t *info
-);
-
-static void allocate_vm_regions(void)
-{
-       uint64_t alloc_size = ALLOCATION_SIZE_VM_REGION, i = 0;
-
-       printf("[%d] Allocating VM regions, each of size %lld KB\n", getpid(), (alloc_size>>10));
-       for (i = 0; ; i++) {
-               mach_vm_address_t addr = (mach_vm_address_t)NULL;
-
-               /* Alternate VM tags between consecutive regions to prevent coalescing */
-               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
-
-               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
-                       break;
-               }
-       }
-       printf("[%d] Number of allocations: %lld\n", getpid(), i);
-
-       /* Signal to the parent that we're done allocating */
-       kill(getppid(), SIGUSR1);
-
-       while (1) {
-               pause();
-       }
-}
-
-static void allocate_vm_objects(void)
-{
-       uint64_t alloc_size = ALLOCATION_SIZE_VM_OBJECT, i = 0;
-
-       printf("[%d] Allocating VM regions, each of size %lld KB, each backed by a VM object\n", getpid(), (alloc_size>>10));
-       for (i = 0; ; i++) {
-               mach_vm_address_t addr = (mach_vm_address_t)NULL;
-
-               /* Alternate VM tags between consecutive regions to prevent coalescing */
-               int flags = VM_MAKE_TAG((i % 2)? VM_TAG1: VM_TAG2) | VM_FLAGS_ANYWHERE;
-
-               if ((mach_vm_allocate(mach_task_self(), &addr, (mach_vm_size_t)alloc_size, flags)) != KERN_SUCCESS) {
-                       break;
-               }
-               /* Touch the region so the VM object can actually be created */
-               *((int *)addr) = 0;
-               /* OK to free this page. Keeps us from holding a lot of dirty pages */
-               madvise((void *)addr, (size_t)alloc_size, MADV_FREE);
-       }
-       printf("[%d] Number of allocations: %lld\n", getpid(), i);
-
-       /* Signal to the parent that we're done allocating */
-       kill(getppid(), SIGUSR1);
-
-       while (1) {
-               pause();
-       }
-}
-
-static void allocate_from_generic_zone(void)
-{
-       uint64_t i = 0;
-
-       printf("[%d] Allocating mach_ports\n", getpid());
-       for (i = 0; ; i++) {
-               mach_port_t port;
-
-               if ((mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port)) != KERN_SUCCESS) {
-                       break;
-               }
-       }
-       printf("[%d] Number of allocations: %lld\n", getpid(), i);
-
-       /* Signal to the parent that we're done allocating */
-       kill(getppid(), SIGUSR1);
-
-       while (1) {
-               pause();
-       }
-}
-
-static void print_zone_info(mach_zone_name_t *zn, mach_zone_info_t *zi)
-{
-       T_LOG("ZONE NAME: %-35sSIZE: %-25lluELEMENTS: %llu",
-                       zn->mzn_name, zi->mzi_cur_size, zi->mzi_count);
-}
-
-static void query_zone_info(void)
-{
-       int i;
-       kern_return_t kr;
-       static uint64_t num_calls = 0;
-
-       for (i = 0; i < current_test.num_zones; i++) {
-               kr = mach_zone_info_for_zone(mach_host_self(), current_test.zone_names[i], &(zone_info_array[i]));
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_zone(%s) returned %d [%s]", current_test.zone_names[i].mzn_name, kr, mach_error_string(kr));
-       }
-       kr = mach_zone_info_for_largest_zone(mach_host_self(), &largest_zone_name, &largest_zone_info);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_zone_info_for_largest_zone returned %d [%s]", kr, mach_error_string(kr));
-
-       num_calls++;
-       if (num_calls % 10 != 0) {
-               return;
-       }
-
-       /* Print out size and element count for zones relevant to the test */
-       for (i = 0; i < current_test.num_zones; i++) {
-               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
-       }
-}
-
-static bool vme_zone_compares_to_vm_objects(void)
-{
-       int i;
-       uint64_t vm_object_element_count = 0, vm_map_entry_element_count = 0;
-
-       T_LOG("Comparing element counts of \"VM map entries\" and \"vm objects\" zones");
-       for (i = 0; i < current_test.num_zones; i++) {
-               if (!strcmp(current_test.zone_names[i].mzn_name, VME_ZONE)) {
-                       vm_map_entry_element_count = zone_info_array[i].mzi_count;
-               } else if (!strcmp(current_test.zone_names[i].mzn_name, VMOBJECTS_ZONE)) {
-                       vm_object_element_count = zone_info_array[i].mzi_count;
-               }
-               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
-       }
-
-       T_LOG("# VM map entries as percentage of # vm objects = %llu", (vm_map_entry_element_count * 100)/ vm_object_element_count);
-       if (vm_map_entry_element_count >= ((vm_object_element_count * VMENTRY_TO_VMOBJECT_COMPARISON_RATIO) / 100)) {
-               T_LOG("Number of VM map entries is comparable to vm objects\n\n");
-               return true;
-       }
-       T_LOG("Number of VM map entries is NOT comparable to vm objects\n\n");
-       return false;
-}
-
-static bool verify_generic_jetsam_criteria(void)
-{
-       T_LOG("Largest zone info");
-       print_zone_info(&largest_zone_name, &largest_zone_info);
-
-       /* If VM map entries is not the largest zone */
-       if (strcmp(largest_zone_name.mzn_name, VME_ZONE)) {
-               /* If vm objects is the largest zone and the VM map entries zone had comparable # of elements, return false */
-               if (!strcmp(largest_zone_name.mzn_name, VMOBJECTS_ZONE) && vme_zone_compares_to_vm_objects()) {
-                       return false;
-               }
-               return true;
-       }
-       return false;
-}
-
-static void cleanup_and_end_test(void)
-{
-       int i;
-
-       /*
-        * The atend handler executes on a different dispatch queue.
-        * We want to do the cleanup only once.
-        */
-       pthread_mutex_lock(&test_ending_mtx);
-       if (test_ending) {
-               pthread_mutex_unlock(&test_ending_mtx);
-               return;
-       }
-       test_ending = true;
-       pthread_mutex_unlock(&test_ending_mtx);
-
-       T_LOG("Number of processes spawned: %d", num_children);
-       T_LOG("Cleaning up...");
-
-       /* Disable the timer that queries and prints zone info periodically */
-       if (ds_timer != NULL && !within_dispatch_timer_handler) {
-               dispatch_source_cancel(ds_timer);
-       }
-
-       /* Disable signal handler that spawns child processes, only if we're not in the event handler's context */
-       if (ds_signal != NULL && !within_dispatch_signal_handler) {
-               dispatch_source_cancel_and_wait(ds_signal);
-       }
-
-       /* Kill all the child processes that were spawned */
-       for (i = 0; i < num_children; i++) {
-               kill(child_pids[i], SIGKILL);
-       }
-       for (i = 0; i < num_children; i++) {
-               int status = 0;
-               if (waitpid(child_pids[i], &status, 0) < 0) {
-                       T_LOG("waitpid() failed for child %d", child_pids[i]);
-               }
-       }
-       sleep(1);
-
-       /* Force zone_gc before starting test for another zone or exiting */
-       mach_zone_force_gc(mach_host_self());
-
-       /* End ktrace session */
-       if (session != NULL) {
-               ktrace_end(session, 1);
-       }
-
-       for (i = 0; i < current_test.num_zones; i++) {
-               print_zone_info(&(current_test.zone_names[i]), &(zone_info_array[i]));
-       }
-}
-
-static void setup_ktrace_session(void)
-{
-       int ret = 0;
-
-       T_LOG("Setting up ktrace session...");
-       session = ktrace_session_create();
-       T_QUIET; T_ASSERT_NOTNULL(session, "ktrace_session_create");
-
-       ktrace_set_interactive(session);
-
-       ktrace_set_completion_handler(session, ^{
-               ktrace_session_destroy(session);
-               T_END;
-       });
-
-       /* Listen for memorystatus_do_kill trace events */
-       ret = ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_DO_KILL)) | DBG_FUNC_END, ^(ktrace_event_t event) {
-               int i;
-               bool received_jetsam_event = false;
-
-               /* We don't care about jetsams for any other reason except zone-map-exhaustion */
-               if (event->arg2 == kMemorystatusKilledZoneMapExhaustion) {
-                       cleanup_and_end_test();
-                       T_LOG("[memorystatus_do_kill] jetsam reason: zone-map-exhaustion, pid: %lu\n\n", event->arg1);
-                       if (current_test.test_index == VME_ZONE_TEST || current_test.test_index == VM_OBJECTS_ZONE_TEST) {
-                               /*
-                                * For the VM map entries zone we try to kill the leaking process.
-                                * Verify that we jetsammed one of the processes we spawned.
-                                *
-                                * For the vm objects zone we pick the leaking process via the VM map entries
-                                * zone, if the number of vm objects and VM map entries are comparable.
-                                * The test simulates this scenario, we should see a targeted jetsam for the
-                                * vm objects zone too.
-                                */
-                               for (i = 0; i < num_children; i++) {
-                                       if (child_pids[i] == (pid_t)event->arg1) {
-                                               received_jetsam_event = true;
-                                               break;
-                                       }
-                               }
-                               /*
-                                * If we didn't see a targeted jetsam, verify that the largest zone actually
-                                * fulfilled the criteria for generic jetsams.
-                                */
-                               if (!received_jetsam_event && verify_generic_jetsam_criteria()) {
-                                       received_jetsam_event = true;
-                               }
-                       } else {
-                               received_jetsam_event = true;
-                       }
-
-                       T_ASSERT_TRUE(received_jetsam_event, "Received zone-map-exhaustion jetsam event as expected");
-               }
-       });
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_single");
-
-       ret = ktrace_start(session, dispatch_get_main_queue());
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start");
-}
-
-static void print_zone_map_size(void)
-{
-       int ret;
-       uint64_t zstats[2];
-       size_t zstats_size = sizeof(zstats);
-
-       ret = sysctlbyname("kern.zone_map_size_and_capacity", &zstats, &zstats_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.zone_map_size_and_capacity failed");
-
-       T_LOG("Zone map capacity: %-30lldZone map size: %lld [%lld%% full]", zstats[1], zstats[0], (zstats[0] * 100)/zstats[1]);
-}
-
-static void spawn_child_process(void)
-{
-       pid_t pid = -1;
-       char helper_func[50];
-       char *launch_tool_args[4];
-
-       T_QUIET; T_ASSERT_LT(num_children, MAX_CHILD_PROCS, "Spawned %d children. Timing out...", MAX_CHILD_PROCS);
-
-       strlcpy(helper_func, current_test.helper_func, sizeof(helper_func));
-       launch_tool_args[0] = testpath;
-       launch_tool_args[1] = "-n";
-       launch_tool_args[2] = helper_func;
-       launch_tool_args[3] = NULL;
-
-       /* Spawn the child process */
-       int rc = dt_launch_tool(&pid, launch_tool_args, false, NULL, NULL);
-       if (rc != 0) {
-               T_LOG("dt_launch_tool returned %d with error code %d", rc, errno);
-       }
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
-
-       child_pids[num_children++] = pid;
-}
-
-static void run_test(void)
-{
-       uint64_t mem;
-       uint32_t testpath_buf_size, pages;
-       int ret, dev, pgsz;
-       size_t sysctl_size;
-
-       T_ATEND(cleanup_and_end_test);
-       T_SETUPBEGIN;
-
-       dev = 0;
-       sysctl_size = sizeof(dev);
-       ret = sysctlbyname("kern.development", &dev, &sysctl_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.development failed");
-       if (dev == 0) {
-               T_SKIP("Skipping test on release kernel");
-       }
-
-       testpath_buf_size = sizeof(testpath);
-       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
-       T_LOG("Executable path: %s", testpath);
-
-       sysctl_size = sizeof(mem);
-       ret = sysctlbyname("hw.memsize", &mem, &sysctl_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl hw.memsize failed");
-       T_LOG("hw.memsize: %llu", mem);
-
-       sysctl_size = sizeof(pgsz);
-       ret = sysctlbyname("vm.pagesize", &pgsz, &sysctl_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pagesize failed");
-       T_LOG("vm.pagesize: %d", pgsz);
-
-       sysctl_size = sizeof(pages);
-       ret = sysctlbyname("vm.pages", &pages, &sysctl_size, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl vm.pages failed");
-       T_LOG("vm.pages: %d", pages);
-
-       zone_info_array = (mach_zone_info_array_t) calloc((unsigned long)current_test.num_zones, sizeof *zone_info_array);
-
-       print_zone_map_size();
-
-       /*
-        * If the timeout specified by T_META_TIMEOUT is hit, the atend handler does not get called.
- * Instead, we queue a dispatch block to fire after TIMEOUT_SECS seconds so that we can exit cleanly.
-        */
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TIMEOUT_SECS * NSEC_PER_SEC), dispatch_get_main_queue(), ^{
-               T_ASSERT_FAIL("Timed out after %d seconds", TIMEOUT_SECS);
-       });
-
-       /*
-        * Create a dispatch source for the signal SIGUSR1. When a child is done allocating zone memory, it
-        * sends SIGUSR1 to the parent. Only then does the parent spawn another child. This prevents us from
-        * spawning many children at once and creating a lot of memory pressure.
-        */
-       signal(SIGUSR1, SIG_IGN);
-       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
-       T_QUIET; T_ASSERT_NOTNULL(ds_signal, "dispatch_source_create: signal");
-
-       dispatch_source_set_event_handler(ds_signal, ^{
-               within_dispatch_signal_handler = true;
-               print_zone_map_size();
-
-               /* Wait a few seconds before spawning another child. Keeps us from allocating too aggressively */
-               sleep(5);
-               spawn_child_process();
-               within_dispatch_signal_handler = false;
-       });
-       dispatch_activate(ds_signal);
-
-       /* Timer to query jetsam-relevant zone info every second. Print it every 10 seconds. */
-       ds_timer = dispatch_source_create(DISPATCH_SOURCE_TYPE_TIMER, 0, 0, dispatch_queue_create("timer_queue", NULL));
-       T_QUIET; T_ASSERT_NOTNULL(ds_timer, "dispatch_source_create: timer");
-       dispatch_source_set_timer(ds_timer, dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), NSEC_PER_SEC, 0);
-
-       dispatch_source_set_event_handler(ds_timer, ^{
-               within_dispatch_timer_handler = true;
-               query_zone_info();
-               within_dispatch_timer_handler = false;
-       });
-       dispatch_activate(ds_timer);
-
-       /* Set up a ktrace session to listen for jetsam events */
-       setup_ktrace_session();
-
-       T_SETUPEND;
-
-       /* Spawn the first child process */
-       T_LOG("Spawning child processes to allocate zone memory...\n\n");
-       spawn_child_process();
-
-       dispatch_main();
-}
-
-static void move_to_idle_band(void)
-{
-       memorystatus_priority_properties_t props;
-
-       /*
-        * We want to move the processes we spawn into the idle band, so that jetsam can target them first.
- * This prevents other important BATS tasks from getting killed, especially in LTE where we have very few
-        * processes running.
-        *
-        * This is only needed for tests which (are likely to) lead us down the generic jetsam path.
-        */
-       props.priority = JETSAM_PRIORITY_IDLE;
-       props.user_data = 0;
-
-       if (memorystatus_control(MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES, getpid(), 0, &props, sizeof(props))) {
-               printf("memorystatus call to change jetsam priority failed\n");
-               exit(-1);
-       }
-}
-
-T_HELPER_DECL(allocate_vm_regions, "allocates VM regions")
-{
-       allocate_vm_regions();
-}
-
-T_HELPER_DECL(allocate_vm_objects, "allocates VM objects and VM regions")
-{
-       move_to_idle_band();
-       allocate_vm_objects();
-}
-
-T_HELPER_DECL(allocate_from_generic_zone, "allocates from a generic zone")
-{
-       move_to_idle_band();
-       allocate_from_generic_zone();
-}
-
-/*
- * T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL) changes the zone_map_jetsam_limit to a
- * lower value, so that the test can complete faster.
- * The test allocates zone memory pretty aggressively which can cause the system to panic
- * if the jetsam limit is quite high; a lower value keeps us from panicking.
- */
-T_DECL(        memorystatus_vme_zone_test,
-               "allocates elements from the VM map entries zone, verifies zone-map-exhaustion jetsams",
-               T_META_ASROOT(true),
-               T_META_TIMEOUT(1800),
-/*             T_META_LTEPHASE(LTE_POSTINIT),
- */
-               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
-{
-       current_test = (test_config_struct) {
-               .test_index = VME_ZONE_TEST,
-               .helper_func = VME_ZONE_TEST_OPT,
-               .num_zones = 1,
-               .zone_names = (mach_zone_name_t []){
-                       { .mzn_name = VME_ZONE }
-               }
-       };
-       run_test();
-}
-
-T_DECL(        memorystatus_vm_objects_zone_test,
-               "allocates elements from the VM objects and the VM map entries zones, verifies zone-map-exhaustion jetsams",
-               T_META_ASROOT(true),
-               T_META_TIMEOUT(1800),
-/*             T_META_LTEPHASE(LTE_POSTINIT),
- */
-               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
-{
-       current_test = (test_config_struct) {
-               .test_index = VM_OBJECTS_ZONE_TEST,
-               .helper_func = VM_OBJECTS_ZONE_TEST_OPT,
-               .num_zones = 2,
-               .zone_names = (mach_zone_name_t []){
-                       { .mzn_name = VME_ZONE },
-                       { .mzn_name = VMOBJECTS_ZONE}
-               }
-       };
-       run_test();
-}
-
-T_DECL(        memorystatus_generic_zone_test,
-               "allocates elements from a zone that doesn't have an optimized jetsam path, verifies zone-map-exhaustion jetsams",
-               T_META_ASROOT(true),
-               T_META_TIMEOUT(1800),
-/*             T_META_LTEPHASE(LTE_POSTINIT),
- */
-               T_META_SYSCTL_INT(ZONEMAP_JETSAM_LIMIT_SYSCTL))
-{
-       current_test = (test_config_struct) {
-               .test_index = GENERIC_ZONE_TEST,
-               .helper_func = GENERIC_ZONE_TEST_OPT,
-               .num_zones = 0,
-               .zone_names = NULL
-       };
-       run_test();
-}
diff --git a/tools/tests/darwintests/mktimer_kobject.c b/tools/tests/darwintests/mktimer_kobject.c
deleted file mode 100644 (file)
index 54b24a0..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <stdint.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <mach/mach.h>
-#include <mach/mk_timer.h>
-
-#include <darwintest.h>
-
-T_DECL(mktimer_kobject, "mktimer_kobject()", T_META_ALL_VALID_ARCHS(true))
-{
-       mach_port_t timer_port = MACH_PORT_NULL;
-       mach_port_t notify_port = MACH_PORT_NULL;
-
-       kern_return_t kr = KERN_SUCCESS;
-
-       // timer port
-       // This is a receive right which is also a kobject
-       timer_port = mk_timer_create();
-       T_ASSERT_NE(timer_port, (mach_port_t)MACH_PORT_NULL, "mk_timer_create returned a valid port");
-
-       kr = mach_port_set_context(mach_task_self(), timer_port, (mach_port_context_t) 0x1);
-       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(timer_port): %s", mach_error_string(kr));
-
-       // notification port for the mk_timer port to come back on
-       kr = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &notify_port);
-       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_allocate(notify_port): %s", mach_error_string(kr));
-
-       kr = mach_port_set_context(mach_task_self(), notify_port, (mach_port_context_t) 0x2);
-       T_ASSERT_EQ(kr, KERN_SUCCESS, "mach_port_set_context(notify_port): %s", mach_error_string(kr));
-
-       T_LOG("timer: 0x%x, notify: 0x%x", timer_port, notify_port);
-
-       mach_port_t previous = MACH_PORT_NULL;
-
-       // request a port-destroyed notification on the timer port
-       kr = mach_port_request_notification(mach_task_self(), timer_port, MACH_NOTIFY_PORT_DESTROYED,
-                                           0, notify_port, MACH_MSG_TYPE_MAKE_SEND_ONCE, &previous);
-       // this should fail!
-       T_ASSERT_NE(kr, KERN_SUCCESS, "notifications should NOT work on mk_timer ports!");
-
-       // destroy the timer port to send the notification
-       mach_port_mod_refs(mach_task_self(), timer_port, MACH_PORT_RIGHT_RECEIVE, -1);
-
-       // destroy the notification port
-       mach_port_mod_refs(mach_task_self(), notify_port, MACH_PORT_RIGHT_RECEIVE, -1);
-
-       T_LOG("done");
-}
-
diff --git a/tools/tests/darwintests/monotonic_core.c b/tools/tests/darwintests/monotonic_core.c
deleted file mode 100644 (file)
index 66bcc31..0000000
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Must come before including darwintest.h
- */
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif /* defined(T_NAMESPACE) */
-
-#include <darwintest.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#ifndef PRIVATE
-/*
- * Need new CPU families.
- */
-#define PRIVATE
-#include <mach/machine.h>
-#undef PRIVATE
-#else /* !defined(PRIVATE) */
-#include <mach/machine.h>
-#endif /* defined(PRIVATE) */
-#include <ktrace.h>
-#include <mach/mach.h>
-#include <stdint.h>
-#include <System/sys/guarded.h>
-#include <System/sys/monotonic.h>
-#include <sys/ioctl.h>
-#include <sys/kdebug.h>
-#include <sys/sysctl.h>
-#include <unistd.h>
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.monotonic"),
-               T_META_CHECK_LEAKS(false)
-);
-
-static void
-skip_if_unsupported(void)
-{
-       int r;
-       int supported = 0;
-       size_t supported_size = sizeof(supported);
-
-       r = sysctlbyname("kern.monotonic.supported", &supported, &supported_size,
-                       NULL, 0);
-       if (r < 0) {
-               T_WITH_ERRNO;
-               T_SKIP("could not find \"kern.monotonic.supported\" sysctl");
-       }
-
-       if (!supported) {
-               T_SKIP("monotonic is not supported on this platform");
-       }
-}
-
-static void
-check_fixed_counts(uint64_t counts[2][2])
-{
-       T_QUIET;
-       T_EXPECT_GT(counts[0][0], UINT64_C(0), "instructions are larger than 0");
-       T_QUIET;
-       T_EXPECT_GT(counts[0][1], UINT64_C(0), "cycles are larger than 0");
-
-       T_EXPECT_GT(counts[1][0], counts[0][0], "instructions increase monotonically");
-       T_EXPECT_GT(counts[1][1], counts[0][1], "cycles increase monotonically");
-}
-
-T_DECL(core_fixed_thread_self, "check the current thread's fixed counters",
-               T_META_ASROOT(true))
-{
-       int err;
-       extern int thread_selfcounts(int type, void *buf, size_t nbytes);
-       uint64_t counts[2][2];
-
-       T_SETUPBEGIN;
-       skip_if_unsupported();
-       T_SETUPEND;
-
-       err = thread_selfcounts(1, &counts[0], sizeof(counts[0]));
-       T_ASSERT_POSIX_ZERO(err, "thread_selfcounts");
-       err = thread_selfcounts(1, &counts[1], sizeof(counts[1]));
-       T_ASSERT_POSIX_ZERO(err, "thread_selfcounts");
-
-       check_fixed_counts(counts);
-}
-
-T_DECL(core_fixed_task, "check that task counting is working",
-               T_META_ASROOT(true))
-{
-       task_t task = mach_task_self();
-       kern_return_t kr;
-       mach_msg_type_number_t size = TASK_INSPECT_BASIC_COUNTS_COUNT;
-       uint64_t counts[2][2];
-
-       skip_if_unsupported();
-
-       kr = task_inspect(task, TASK_INSPECT_BASIC_COUNTS,
-                       (task_inspect_info_t)&counts[0], &size);
-       T_ASSERT_MACH_SUCCESS(kr,
-                       "task_inspect(... TASK_INSPECT_BASIC_COUNTS ...)");
-
-       size = TASK_INSPECT_BASIC_COUNTS_COUNT;
-       kr = task_inspect(task, TASK_INSPECT_BASIC_COUNTS,
-                       (task_inspect_info_t)&counts[1], &size);
-       T_ASSERT_MACH_SUCCESS(kr,
-                       "task_inspect(... TASK_INSPECT_BASIC_COUNTS ...)");
-
-       check_fixed_counts(counts);
-}
-
-T_DECL(core_fixed_kdebug, "check that the kdebug macros for monotonic work",
-               T_META_ASROOT(true))
-{
-       __block bool saw_events = false;
-       ktrace_session_t s;
-       int r;
-       int set = 1;
-
-       T_SETUPBEGIN;
-       skip_if_unsupported();
-
-       s = ktrace_session_create();
-       T_QUIET; T_ASSERT_NOTNULL(s, "ktrace_session_create");
-
-       ktrace_events_single_paired(s,
-                       KDBG_EVENTID(DBG_MONOTONIC, DBG_MT_TMPCPU, 0x3fff),
-                       ^(struct trace_point *start, struct trace_point *end)
-       {
-               uint64_t counts[2][2];
-
-               saw_events = true;
-
-               counts[0][0] = start->arg1;
-               counts[0][1] = start->arg2;
-               counts[1][0] = end->arg1;
-               counts[1][1] = end->arg2;
-
-               check_fixed_counts(counts);
-       });
-
-       ktrace_set_completion_handler(s, ^{
-               T_ASSERT_TRUE(saw_events, "should see monotonic kdebug events");
-               T_END;
-       });
-       T_SETUPEND;
-
-       T_ASSERT_POSIX_ZERO(ktrace_start(s,
-                       dispatch_get_global_queue(QOS_CLASS_USER_INITIATED, 0)), NULL);
-
-       r = sysctlbyname("kern.monotonic.kdebug_test", NULL, NULL, &set,
-                       sizeof(set));
-       T_ASSERT_POSIX_SUCCESS(r,
-                       "sysctlbyname(\"kern.monotonic.kdebug_test\", ...)");
-
-       ktrace_end(s, 0);
-       dispatch_main();
-}
-
-static void
-perf_sysctl_deltas(const char *sysctl_name, const char *stat_name)
-{
-       uint64_t deltas[2];
-       size_t deltas_size;
-       int r;
-
-       T_SETUPBEGIN;
-       skip_if_unsupported();
-
-       dt_stat_t instrs = dt_stat_create("instructions", "%s_instrs",
-                       stat_name);
-       dt_stat_t cycles = dt_stat_create("cycles", "%s_cycles", stat_name);
-       T_SETUPEND;
-
-       while (!dt_stat_stable(instrs) || !dt_stat_stable(cycles)) {
-               deltas_size = sizeof(deltas);
-               r = sysctlbyname(sysctl_name, deltas, &deltas_size, NULL, 0);
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(r, "sysctlbyname(\"%s\", ...)", sysctl_name);
-               dt_stat_add(instrs, (double)deltas[0]);
-               dt_stat_add(cycles, (double)deltas[1]);
-       }
-
-       dt_stat_finalize(instrs);
-       dt_stat_finalize(cycles);
-}
-
-T_DECL(perf_core_fixed_cpu, "test the performance of fixed CPU counter access",
-               T_META_ASROOT(true))
-{
-       perf_sysctl_deltas("kern.monotonic.fixed_cpu_perf", "fixed_cpu_counters");
-}
-
-T_DECL(perf_core_fixed_thread, "test the performance of fixed thread counter access",
-               T_META_ASROOT(true))
-{
-       perf_sysctl_deltas("kern.monotonic.fixed_thread_perf",
-                       "fixed_thread_counters");
-}
-
-T_DECL(perf_core_fixed_task, "test the performance of fixed task counter access",
-               T_META_ASROOT(true))
-{
-       perf_sysctl_deltas("kern.monotonic.fixed_task_perf", "fixed_task_counters");
-}
-
-T_DECL(perf_core_fixed_thread_self, "test the performance of thread self counts")
-{
-       extern int thread_selfcounts(int type, void *buf, size_t nbytes);
-       uint64_t counts[2][2];
-
-       T_SETUPBEGIN;
-       dt_stat_t instrs = dt_stat_create("fixed_thread_self_instrs", "instructions");
-       dt_stat_t cycles = dt_stat_create("fixed_thread_self_cycles", "cycles");
-
-       skip_if_unsupported();
-       T_SETUPEND;
-
-       while (!dt_stat_stable(instrs) || !dt_stat_stable(cycles)) {
-               int r1, r2;
-
-               r1 = thread_selfcounts(1, &counts[0], sizeof(counts[0]));
-               r2 = thread_selfcounts(1, &counts[1], sizeof(counts[1]));
-               T_QUIET; T_ASSERT_POSIX_ZERO(r1, "__thread_selfcounts");
-               T_QUIET; T_ASSERT_POSIX_ZERO(r2, "__thread_selfcounts");
-
-               T_QUIET; T_ASSERT_GT(counts[1][0], counts[0][0],
-                               "instructions increase monotonically");
-               dt_stat_add(instrs, counts[1][0] - counts[0][0]);
-
-               T_QUIET; T_ASSERT_GT(counts[1][1], counts[0][1],
-                               "cycles increase monotonically");
-               dt_stat_add(cycles, counts[1][1] - counts[0][1]);
-       }
-
-       dt_stat_finalize(instrs);
-       dt_stat_finalize(cycles);
-}
diff --git a/tools/tests/darwintests/net_tun_pr_35136664.c b/tools/tests/darwintests/net_tun_pr_35136664.c
deleted file mode 100644 (file)
index 366f066..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/kern_control.h>
-#include <sys/sys_domain.h>
-
-#include <net/if_utun.h>
-#include <net/if_ipsec.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.net"));
-
-T_DECL(PR_35136664_utun,
-       "This binds a utun and closes it without connecting")
-{
-       int tunsock;
-       struct ctl_info kernctl_info;
-       struct sockaddr_ctl kernctl_addr;
-
-       T_ASSERT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
-
-       memset(&kernctl_info, 0, sizeof(kernctl_info));
-       strlcpy(kernctl_info.ctl_name, UTUN_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
-       T_ASSERT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
-
-       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
-       kernctl_addr.sc_len = sizeof(kernctl_addr);
-       kernctl_addr.sc_family = AF_SYSTEM;
-       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
-       kernctl_addr.sc_id = kernctl_info.ctl_id;
-       kernctl_addr.sc_unit = 0;
-
-       T_ASSERT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
-
-       T_ASSERT_POSIX_ZERO(close(tunsock), NULL);
-}
-
-T_DECL(PR_35136664_ipsec,
-       "This binds an ipsec and closes it without connecting")
-{
-       int tunsock;
-       struct ctl_info kernctl_info;
-       struct sockaddr_ctl kernctl_addr;
-
-       T_ASSERT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
-
-       memset(&kernctl_info, 0, sizeof(kernctl_info));
-       strlcpy(kernctl_info.ctl_name, IPSEC_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
-       T_ASSERT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
-
-       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
-       kernctl_addr.sc_len = sizeof(kernctl_addr);
-       kernctl_addr.sc_family = AF_SYSTEM;
-       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
-       kernctl_addr.sc_id = kernctl_info.ctl_id;
-       kernctl_addr.sc_unit = 0;
-
-       T_ASSERT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
-
-       T_ASSERT_POSIX_ZERO(close(tunsock), NULL);
-}
diff --git a/tools/tests/darwintests/net_tuntests.c b/tools/tests/darwintests/net_tuntests.c
deleted file mode 100644 (file)
index 91363ab..0000000
+++ /dev/null
@@ -1,536 +0,0 @@
-
-#include <inttypes.h>
-#include <stdbool.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <time.h>
-#include <string.h>
-#include <unistd.h>
-#include <poll.h>
-#include <sys/event.h>
-#include <uuid/uuid.h>
-#include <arpa/inet.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/kern_control.h>
-#include <sys/ioctl.h>
-#include <sys/socket.h>
-#include <sys/kern_control.h>
-#include <sys/sys_domain.h>
-
-#include <net/if.h>
-#include <net/if_ipsec.h>
-#include <net/if_utun.h>
-#include <netinet/in.h>
-#include <netinet/in_var.h>
-#include <net/pfkeyv2.h>
-#include <netinet6/ipsec.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#include <skywalk/os_skywalk_private.h> // for SK_FEATURE_*
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.net.tun"));
-
-#if 0
-static void
-log_hexdump(const void *inp, size_t len)
-{
-       unsigned i, off = 0;
-       char buf[9+16*3+1];
-       for (i = 0; i < len; i++) {
-               if (i % 16 == 0)
-                       off = (unsigned)snprintf(buf, sizeof(buf), "%08x:", i);
-               off += (unsigned)snprintf(buf+off, sizeof(buf)-off, " %02x", (((const uint8_t *)inp)[i]) & 0xff);
-               if (i % 16 == 15)
-                       T_LOG("%s", buf);
-       }
-       if (len % 16)
-               T_LOG("%s", buf);
-}
-#endif
-
-static uint64_t
-get_skywalk_features(void)
-{
-       uint64_t features = 0;
-       size_t len = sizeof(features);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(sysctlbyname("kern.skywalk.features", &features, &len, NULL, 0), NULL);
-       T_QUIET; T_ASSERT_EQ(len, sizeof(features), NULL);
-       T_QUIET; T_ASSERT_TRUE(features & SK_FEATURE_SKYWALK, NULL);
-       return features;
-}
-
-static bool g_is_ipsec_test;
-static bool g_is_utun_test;
-static int g_OPT_ENABLE_NETIF = -1;
-static int g_OPT_ENABLE_FLOWSWITCH = -1;
-static int g_OPT_ENABLE_CHANNEL = -1;
-static int g_OPT_GET_CHANNEL_UUID = -1;
-static int g_OPT_IFNAME = -1;
-static char *g_CONTROL_NAME = NULL;
-
-static void
-setup_ipsec_test(void)
-{
-       T_LOG("Configuring for ipsec tests");
-       g_OPT_ENABLE_NETIF = IPSEC_OPT_ENABLE_NETIF;
-       g_OPT_ENABLE_FLOWSWITCH = IPSEC_OPT_ENABLE_FLOWSWITCH;
-       g_OPT_ENABLE_CHANNEL = IPSEC_OPT_ENABLE_CHANNEL;
-       g_OPT_GET_CHANNEL_UUID = IPSEC_OPT_GET_CHANNEL_UUID;
-       g_OPT_IFNAME = IPSEC_OPT_IFNAME;
-       g_CONTROL_NAME = IPSEC_CONTROL_NAME;
-       g_is_ipsec_test = true;
-}
-
-static void
-setup_utun_test(void)
-{
-       T_LOG("Configuring for utun tests");
-       g_OPT_ENABLE_NETIF = UTUN_OPT_ENABLE_NETIF;
-       g_OPT_ENABLE_FLOWSWITCH = UTUN_OPT_ENABLE_FLOWSWITCH;
-       g_OPT_ENABLE_CHANNEL = UTUN_OPT_ENABLE_CHANNEL;
-       g_OPT_GET_CHANNEL_UUID = UTUN_OPT_GET_CHANNEL_UUID;
-       g_OPT_IFNAME = UTUN_OPT_IFNAME;
-       g_CONTROL_NAME = UTUN_CONTROL_NAME;
-       g_is_utun_test = true;
-}
-
-static void
-check_enables(int tunsock, int enable_netif, int enable_flowswitch, int enable_channel, uuid_t uuid)
-{
-       int scratch;
-       socklen_t scratchlen, uuidlen;
-       uuid_t scratchuuid;
-       if (!uuid) {
-               uuid = scratchuuid;
-       }
-
-       //T_LOG("checking tunsock %d", tunsock);
-
-       scratchlen = sizeof(scratch);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
-                       &scratch, &scratchlen), NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
-       T_QUIET; T_EXPECT_EQ(scratch, enable_netif, NULL);
-
-       scratchlen = sizeof(scratch);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                       &scratch, &scratchlen), NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
-       if (get_skywalk_features() & SK_FEATURE_NETNS) {
-               if (enable_netif) {
-                       T_QUIET; T_EXPECT_EQ(scratch, enable_flowswitch, NULL);
-               } else {
-                       T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
-               }
-       } else {
-               T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
-       }
-
-       scratchlen = sizeof(scratch);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                       &scratch, &scratchlen), NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )scratchlen, sizeof(scratch), NULL);
-       if (g_is_ipsec_test && !enable_netif) {
-               T_QUIET; T_EXPECT_EQ(scratch, 0, NULL);
-       } else {
-               T_QUIET; T_EXPECT_EQ(scratch, enable_channel, NULL);
-       }
-
-       if (scratch) {
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
-               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                               uuid, &uuidlen), NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
-       } else {
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
-               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                               uuid, &uuidlen), ENXIO, NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
-       }
-}
-
-static void
-tunsock_get_ifname(int s, char ifname[IFXNAMSIZ])
-{
-       socklen_t optlen = IFXNAMSIZ;
-       T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_ZERO(getsockopt(s, SYSPROTO_CONTROL, g_OPT_IFNAME, ifname, &optlen), NULL);
-       T_QUIET; T_ASSERT_TRUE(optlen > 0, NULL);
-       T_QUIET; T_ASSERT_TRUE(ifname[optlen-1] == '\0', NULL);
-       T_QUIET; T_ASSERT_TRUE(strlen(ifname)+1 == optlen, "got ifname \"%s\" len %zd expected %u", ifname, strlen(ifname), optlen);
-}
-
-static short
-ifnet_get_flags(int s, const char ifname[IFNAMSIZ])
-{
-       struct ifreq    ifr;
-       memset(&ifr, 0, sizeof(ifr));
-       strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(s, SIOCGIFFLAGS, (caddr_t)&ifr), NULL);
-       return ifr.ifr_flags;
-}
-
-static void
-ifnet_add_addr4(const char ifname[IFNAMSIZ], struct in_addr *addr, struct in_addr *mask, struct in_addr *broadaddr)
-{
-       struct sockaddr_in *sin;
-       struct in_aliasreq ifra;
-       int s;
-
-       T_QUIET; T_EXPECT_POSIX_SUCCESS(s = socket(AF_INET, SOCK_DGRAM, 0), NULL);
-
-       memset(&ifra, 0, sizeof(ifra));
-       strlcpy(ifra.ifra_name, ifname, sizeof(ifra.ifra_name));
-
-       if (addr != NULL) {
-               sin = &ifra.ifra_addr;
-               sin->sin_len = sizeof(*sin);
-               sin->sin_family = AF_INET;
-               sin->sin_addr = *addr;
-       }
-
-       if (mask != NULL) {
-               sin = &ifra.ifra_mask;
-               sin->sin_len = sizeof(*sin);
-               sin->sin_family = AF_INET;
-               sin->sin_addr = *mask;
-       }
-
-       if (broadaddr != NULL || (addr != NULL &&
-                 (ifnet_get_flags(s, ifname) & IFF_POINTOPOINT) != 0)) {
-               sin = &ifra.ifra_broadaddr;
-               sin->sin_len = sizeof(*sin);
-               sin->sin_family = AF_INET;
-               sin->sin_addr = (broadaddr != NULL) ? *broadaddr : *addr;
-       }
-
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(s, SIOCAIFADDR, &ifra), NULL);
-
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(s), NULL);
-}
-
-static int g_pfkeyso = -1;
-static struct in_addr g_addr1, g_addr2;
-
-static void
-create_sa(const char ifname[IFXNAMSIZ], uint8_t type, uint32_t spi, struct in_addr *src, struct in_addr *dst)
-{
-       if (g_pfkeyso == -1) {
-               T_QUIET; T_EXPECT_POSIX_SUCCESS(g_pfkeyso = socket(PF_KEY, SOCK_RAW, PF_KEY_V2), NULL);
-       }
-
-       /*
-               <base, SA, (lifetime(HS),) address(SD), (address(P),)
-               key(AE), (identity(SD),) (sensitivity)>
-       */
-
-       struct {
-               struct sadb_msg msg __attribute((aligned(sizeof (uint64_t))));
-               struct sadb_key key  __attribute((aligned(sizeof (uint64_t))));
-               struct sadb_sa sa  __attribute((aligned(sizeof (uint64_t))));
-               struct sadb_x_sa2 sa2  __attribute((aligned(sizeof (uint64_t))));
-               struct sadb_x_ipsecif ipsecif __attribute((aligned(sizeof (uint64_t))));
-               struct {
-                       struct sadb_address addr __attribute((aligned(sizeof (uint64_t))));
-                       struct sockaddr_in saddr __attribute((aligned(sizeof (uint64_t))));
-               } src;
-               struct {
-                       struct sadb_address addr __attribute((aligned(sizeof (uint64_t))));
-                       struct sockaddr_in saddr __attribute((aligned(sizeof (uint64_t))));
-               } dst;
-       } addcmd;
-
-       memset(&addcmd, 0, sizeof(addcmd));
-
-       addcmd.msg.sadb_msg_version = PF_KEY_V2;
-       addcmd.msg.sadb_msg_type = type;
-       addcmd.msg.sadb_msg_errno = 0;
-       addcmd.msg.sadb_msg_satype = SADB_SATYPE_ESP;
-       addcmd.msg.sadb_msg_len = PFKEY_UNIT64(sizeof(addcmd));
-       addcmd.msg.sadb_msg_reserved = 0;
-       addcmd.msg.sadb_msg_seq = 0;
-       addcmd.msg.sadb_msg_pid = (unsigned)getpid();
-
-       addcmd.key.sadb_key_len = PFKEY_UNIT64(sizeof(addcmd.key));
-       addcmd.key.sadb_key_exttype = SADB_EXT_KEY_ENCRYPT;
-       addcmd.key.sadb_key_bits = 0;
-       addcmd.key.sadb_key_reserved = 0;
-
-       addcmd.sa.sadb_sa_len = PFKEY_UNIT64(sizeof(addcmd.sa));
-       addcmd.sa.sadb_sa_exttype = SADB_EXT_SA;
-       addcmd.sa.sadb_sa_spi = htonl(spi);
-       addcmd.sa.sadb_sa_replay = 0;
-       addcmd.sa.sadb_sa_state = 0;
-       addcmd.sa.sadb_sa_auth = SADB_AALG_NONE;
-       addcmd.sa.sadb_sa_encrypt = SADB_EALG_NULL;
-       addcmd.sa.sadb_sa_flags = SADB_X_EXT_CYCSEQ;
-
-       addcmd.sa2.sadb_x_sa2_len = PFKEY_UNIT64(sizeof(addcmd.sa2));
-       addcmd.sa2.sadb_x_sa2_exttype = SADB_X_EXT_SA2;
-       addcmd.sa2.sadb_x_sa2_mode = IPSEC_MODE_ANY;
-       addcmd.sa2.sadb_x_sa2_alwaysexpire = 1;
-       addcmd.sa2.sadb_x_sa2_flags = SADB_X_EXT_SA2_DELETE_ON_DETACH;
-       addcmd.sa2.sadb_x_sa2_sequence = 0;
-       addcmd.sa2.sadb_x_sa2_reqid = 0;
-
-       addcmd.ipsecif.sadb_x_ipsecif_len = PFKEY_UNIT64(sizeof(addcmd.ipsecif));
-       addcmd.ipsecif.sadb_x_ipsecif_exttype = SADB_X_EXT_IPSECIF;
-       memset(addcmd.ipsecif.sadb_x_ipsecif_internal_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_internal_if));
-       memset(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if, 0, sizeof(addcmd.ipsecif.sadb_x_ipsecif_outgoing_if));
-       strlcpy(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if, ifname, sizeof(addcmd.ipsecif.sadb_x_ipsecif_ipsec_if));
-       addcmd.ipsecif.sadb_x_ipsecif_init_disabled = 0;
-       addcmd.ipsecif.reserved = 0;
-
-       addcmd.src.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.src));
-       addcmd.src.addr.sadb_address_exttype = SADB_EXT_ADDRESS_SRC;
-       addcmd.src.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
-       addcmd.src.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
-       addcmd.src.addr.sadb_address_reserved = 0;
-       addcmd.src.saddr.sin_len = sizeof(addcmd.src.saddr);
-       addcmd.src.saddr.sin_family = AF_INET;
-       addcmd.src.saddr.sin_port = htons(0);
-       addcmd.src.saddr.sin_addr = *src;
-
-       addcmd.dst.addr.sadb_address_len = PFKEY_UNIT64(sizeof(addcmd.dst));
-       addcmd.dst.addr.sadb_address_exttype = SADB_EXT_ADDRESS_DST;
-       addcmd.dst.addr.sadb_address_proto = IPSEC_ULPROTO_ANY;
-       addcmd.dst.addr.sadb_address_prefixlen = sizeof(struct in_addr) << 3; //XXX Why?
-       addcmd.dst.addr.sadb_address_reserved = 0;
-       addcmd.dst.saddr.sin_len = sizeof(addcmd.dst.saddr);
-       addcmd.dst.saddr.sin_family = AF_INET;
-       addcmd.dst.saddr.sin_port = htons(0);
-       addcmd.dst.saddr.sin_addr = *dst;
-
-       //log_hexdump(&addcmd, sizeof(addcmd));
-
-       ssize_t slen;
-       T_QUIET; T_EXPECT_POSIX_SUCCESS(slen = send(g_pfkeyso, &addcmd, sizeof(addcmd), 0), NULL);
-       T_QUIET; T_EXPECT_EQ(slen, (ssize_t)sizeof(addcmd), NULL);
-}
-
-static int
-create_tunsock(int enable_netif, int enable_flowswitch, int enable_channel)
-{
-       int tunsock;
-       struct ctl_info kernctl_info;
-       struct sockaddr_ctl kernctl_addr;
-       uuid_t uuid;
-       socklen_t uuidlen;
-
-startover:
-
-       T_QUIET; T_EXPECT_POSIX_SUCCESS(tunsock = socket(PF_SYSTEM, SOCK_DGRAM, SYSPROTO_CONTROL), NULL);
-
-       memset(&kernctl_info, 0, sizeof(kernctl_info));
-       strlcpy(kernctl_info.ctl_name, g_CONTROL_NAME, sizeof(kernctl_info.ctl_name));
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(ioctl(tunsock, CTLIOCGINFO, &kernctl_info), NULL);
-
-       memset(&kernctl_addr, 0, sizeof(kernctl_addr));
-       kernctl_addr.sc_len = sizeof(kernctl_addr);
-       kernctl_addr.sc_family = AF_SYSTEM;
-       kernctl_addr.ss_sysaddr = AF_SYS_CONTROL;
-       kernctl_addr.sc_id = kernctl_info.ctl_id;
-       kernctl_addr.sc_unit = 0;
-
-       //T_LOG("enable_netif = %d, enable_flowswitch = %d, enable_channel = %d",
-       //enable_netif, enable_channel, enable_flowswitch);
-
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
-                       &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                       &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                       &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
-       uuid_clear(uuid);
-       uuidlen = sizeof(uuid_t);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                       uuid, &uuidlen), EINVAL, NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
-
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(bind(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr)), NULL);
-
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
-                               &enable_netif, sizeof(enable_netif)), NULL);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                       &enable_flowswitch, sizeof(enable_flowswitch)), EINVAL, NULL);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                       &enable_channel, sizeof(enable_channel)), EINVAL, NULL);
-       uuid_clear(uuid);
-       uuidlen = sizeof(uuid_t);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                       uuid, &uuidlen), ENXIO, NULL);
-       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
-
-       int error = connect(tunsock, (struct sockaddr *)&kernctl_addr, sizeof(kernctl_addr));
-       if (error == -1 && errno == EBUSY) {
-               /* XXX remove this retry nonsense when this is fixed:
-                * <rdar://problem/37340313> creating an interface without specifying specific interface name should not return EBUSY
-                */
-               close(tunsock);
-               T_LOG("connect got EBUSY, sleeping 1 second before retry");
-               sleep(1);
-               goto startover;
-       }
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(error, "connect()");
-
-       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_NETIF,
-                       &enable_netif, sizeof(enable_netif)), EINVAL, NULL);
-
-       if (get_skywalk_features() & SK_FEATURE_NETNS) {
-               if (enable_netif) {
-                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                                       &enable_flowswitch, sizeof(enable_flowswitch)), NULL);
-               } else {
-                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                                       &enable_flowswitch, sizeof(enable_flowswitch)), ENOENT, NULL);
-               }
-       } else {
-               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_FLOWSWITCH,
-                               &enable_flowswitch, sizeof(enable_flowswitch)), ENOTSUP, NULL);
-       }
-
-       if (enable_channel) {
-               if (g_is_ipsec_test && !enable_netif) {
-                       /* ipsec doesn't support channels without a netif */
-                       T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                                       &enable_channel, sizeof(enable_channel)), EOPNOTSUPP, NULL);
-                       uuid_clear(uuid);
-                       uuidlen = sizeof(uuid_t);
-                       T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                                       uuid, &uuidlen), ENXIO, NULL);
-                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-                       T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
-               } else {
-                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                                       &enable_channel, sizeof(enable_channel)), NULL);
-                       uuid_clear(uuid);
-                       uuidlen = sizeof(uuid_t);
-                       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                                       uuid, &uuidlen), NULL);
-                       T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-                       T_QUIET; T_EXPECT_FALSE(uuid_is_null(uuid), NULL);
-               }
-       } else {
-               T_QUIET; T_EXPECT_POSIX_FAILURE(setsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_ENABLE_CHANNEL,
-                               &enable_channel, sizeof(enable_channel)), ENXIO, NULL);
-               uuid_clear(uuid);
-               uuidlen = sizeof(uuid_t);
-               T_QUIET; T_EXPECT_POSIX_FAILURE(getsockopt(tunsock, SYSPROTO_CONTROL, g_OPT_GET_CHANNEL_UUID,
-                               uuid, &uuidlen), ENXIO, NULL);
-               T_QUIET; T_EXPECT_EQ_ULONG((unsigned long )uuidlen, sizeof(uuid_t), NULL);
-               T_QUIET; T_EXPECT_TRUE(uuid_is_null(uuid), NULL);
-       }
-
-       check_enables(tunsock, enable_netif, enable_flowswitch, enable_channel, uuid);
-
-       //T_LOG("Returning tunsock %d", tunsock);
-
-       return tunsock;
-}
-
-#if 0
-static void
-ipsec_stats(void)
-{
-       struct ifmibdata ifmd;
-
-       len = sizeof(struct ifmibdata);
-       name[3] = IFMIB_IFDATA;
-       name[4] = interesting_row;
-       name[5] = IFDATA_GENERAL;
-       if (sysctl(name, 6, &ifmd, &len, (void *)0, 0) == -1)
-               err(1, "sysctl IFDATA_GENERAL %d", interesting_row);
-}
-#endif
-
-static void
-permute_enables(void)
-{
-       int tunsock;
-       T_EXPECT_GE(tunsock = create_tunsock(false, false, false), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, false, true), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, true, false), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(false, true, true), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, false, false), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, false, true), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, true, false), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-       T_EXPECT_GE(tunsock = create_tunsock(true, true, true), 0, NULL);
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(tunsock), NULL);
-}
-
-T_DECL(ipsec_enables, "This test checks combinations of netif/channel/flowswitch on ipsec")
-{
-       setup_ipsec_test();
-       permute_enables();
-}
-
-T_DECL(utun_enables, "This test checks combinations of netif/channel/flowswitch on utun")
-{
-       setup_utun_test();
-       permute_enables();
-}
-
-static int g_tunsock = -1;
-
-static void
-cleanup_tunsock(void)
-{
-       T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(g_tunsock), NULL);
-       T_QUIET; T_EXPECT_POSIX_FAILURE(close(g_tunsock), EBADF, NULL);
-       if (g_is_ipsec_test) {
-               T_QUIET; T_WITH_ERRNO; T_EXPECT_POSIX_ZERO(close(g_pfkeyso), NULL);
-               T_QUIET; T_EXPECT_POSIX_FAILURE(close(g_pfkeyso), EBADF, NULL);
-       }
-}
-
-static void
-setup_tunsock(void)
-{
-       T_ASSERT_GE(g_tunsock = create_tunsock(true, false, true), 0, NULL);
-       T_ATEND(cleanup_tunsock);
-
-       char ifname[IFXNAMSIZ];
-       tunsock_get_ifname(g_tunsock, ifname);
-
-       T_LOG("Created interface %s", ifname);
-
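-       /* Build a pid-derived 10.x.y.160 address; g_addr2 is the next consecutive address. */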
-       uint32_t ifaddr = (10 << 24) | ((unsigned)getpid()&0xffff) << 8 | 160;
-       struct in_addr mask;
-       g_addr1.s_addr = htonl(ifaddr);
-       g_addr2.s_addr = htonl(ifaddr+1);
-       mask.s_addr = htonl(0xffffffff);
-
-       ifnet_add_addr4(ifname, &g_addr1, &mask, &g_addr2);
-
-       if (g_is_ipsec_test) {
-               create_sa(ifname, SADB_ADD, 12345, &g_addr1, &g_addr2);
-               create_sa(ifname, SADB_ADD, 12346, &g_addr2, &g_addr1);
-       }
-}
-
-T_DECL(setup_ipsec, "This test sets up an ipsec interface")
-{
-       setup_ipsec_test();
-       setup_tunsock();
-}
-
-T_DECL(setup_utun, "This test sets up a utun interface")
-{
-       setup_utun_test();
-       setup_tunsock();
-}
diff --git a/tools/tests/darwintests/netbsd_utimensat.c b/tools/tests/darwintests/netbsd_utimensat.c
deleted file mode 100644 (file)
index c14f92a..0000000
+++ /dev/null
@@ -1,198 +0,0 @@
-/*     $NetBSD: t_utimensat.c,v 1.6 2017/01/10 15:13:56 christos Exp $ */
-
-/*-
- * Copyright (c) 2012 The NetBSD Foundation, Inc.
- * All rights reserved.
- *
- * This code is derived from software contributed to The NetBSD Foundation
- * by Emmanuel Dreyfus.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
- * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
- * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-#include <sys/cdefs.h>
-__RCSID("$NetBSD: t_utimensat.c,v 1.6 2017/01/10 15:13:56 christos Exp $");
-
-#include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <paths.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#define DIRPATH "dir"
-#define FILEPATH "dir/utimensat"
-#define BASEFILE "utimensat"
-#define LINK "dir/symlink"
-#define BASELINK "symlink"
-#define FILEERR "dir/symlink"
-
-static const struct timespec tptr[] = { 
-       { 0x12345678, 987654321 },
-       { 0x15263748, 123456789 },
-};
-
-static void chtmpdir(void)
-{
-       T_SETUPBEGIN;
-       T_ASSERT_POSIX_ZERO(chdir(dt_tmpdir()), NULL);
-
-       // <rdar://problem/31780295> dt_tmpdir() should guarantee a clean directory for each run
-       unlink(FILEPATH);
-       unlink(LINK);
-       rmdir(DIRPATH);
-
-       // Skip the test if the current working directory is not on APFS.
-       struct statfs sfs = { 0 };
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(".", &sfs), NULL);
-       if (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) != 0) {
-               T_SKIP("utimensat is APFS-only, but working directory is non-APFS");
-       }
-
-       T_SETUPEND;
-}
-
-T_DECL(netbsd_utimensat_fd, "See that utimensat works with fd")
-{
-       chtmpdir();
-
-       int dfd;
-       int fd;
-       struct stat st;
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
-       T_ASSERT_POSIX_ZERO(close(fd), NULL);
-
-       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
-       T_ASSERT_POSIX_ZERO(utimensat(dfd, BASEFILE, tptr, 0), NULL);
-       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
-
-       T_ASSERT_POSIX_ZERO(stat(FILEPATH, &st), NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
-}
-
-T_DECL(netbsd_utimensat_fdcwd, "See that utimensat works with fd as AT_FDCWD")
-{
-       chtmpdir();
-
-       int fd;
-       struct stat st;
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
-       T_ASSERT_POSIX_ZERO(close(fd), NULL);
-
-       T_ASSERT_POSIX_ZERO(chdir(DIRPATH), NULL);
-       T_ASSERT_POSIX_ZERO(utimensat(AT_FDCWD, BASEFILE, tptr, 0), NULL);
-
-       T_ASSERT_POSIX_ZERO(stat(BASEFILE, &st), NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
-}
-
-T_DECL(netbsd_utimensat_fdcwderr, "See that utimensat fails with fd as AT_FDCWD and bad path")
-{
-       chtmpdir();
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_EQ(utimensat(AT_FDCWD, FILEERR, tptr, 0), -1, NULL);
-}
-
-T_DECL(netbsd_utimensat_fderr1, "See that utimensat fails with bad path")
-{
-       chtmpdir();
-
-       int dfd;
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
-       T_ASSERT_EQ(utimensat(dfd, FILEERR, tptr, 0), -1, NULL);
-       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
-}
-
-T_DECL(netbsd_utimensat_fderr2, "See that utimensat fails with bad fdat")
-{
-       chtmpdir();
-
-       int dfd;
-       int fd;
-       char cwd[MAXPATHLEN];
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
-       T_ASSERT_POSIX_ZERO(close(fd), NULL);
-
-       T_ASSERT_POSIX_SUCCESS((dfd = open(getcwd(cwd, MAXPATHLEN), O_RDONLY, 0)), NULL);
-       T_ASSERT_EQ(utimensat(dfd, BASEFILE, tptr, 0), -1, NULL);
-       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
-}
-
-T_DECL(netbsd_utimensat_fderr3, "See that utimensat fails with fd as -1")
-{
-       chtmpdir();
-
-       int fd;
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_SUCCESS((fd = open(FILEPATH, O_CREAT|O_RDWR, 0644)), NULL);
-       T_ASSERT_POSIX_ZERO(close(fd), NULL);
-
-       T_ASSERT_EQ(utimensat(-1, FILEPATH, tptr, 0), -1, NULL);
-}
-
-T_DECL(netbsd_utimensat_fdlink, "See that utimensat works on symlink")
-{
-       chtmpdir();
-
-       int dfd;
-       struct stat st;
-
-       T_ASSERT_POSIX_ZERO(mkdir(DIRPATH, 0755), NULL);
-       T_ASSERT_POSIX_ZERO(symlink(FILEPATH, LINK), NULL); /* NB: FILEPATH does not exist */
-
-       T_ASSERT_POSIX_SUCCESS((dfd = open(DIRPATH, O_RDONLY, 0)), NULL);
-
-       T_ASSERT_EQ(utimensat(dfd, BASELINK, tptr, 0), -1, NULL);
-       T_ASSERT_EQ(errno, ENOENT, NULL);
-
-       T_ASSERT_POSIX_ZERO(utimensat(dfd, BASELINK, tptr, AT_SYMLINK_NOFOLLOW), NULL);
-
-       T_ASSERT_POSIX_ZERO(close(dfd), NULL);
-
-       T_ASSERT_POSIX_ZERO(lstat(LINK, &st), NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_sec, tptr[0].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_atimespec.tv_nsec, tptr[0].tv_nsec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_sec, tptr[1].tv_sec, NULL);
-       T_ASSERT_EQ(st.st_mtimespec.tv_nsec, tptr[1].tv_nsec, NULL);
-}
diff --git a/tools/tests/darwintests/network_entitlements.plist b/tools/tests/darwintests/network_entitlements.plist
deleted file mode 100644 (file)
index c326c83..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.security.network.client</key>
-       <true/>
-       <key>com.apple.security.network.server</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/no32exec_35914211.c b/tools/tests/darwintests/no32exec_35914211.c
deleted file mode 100644 (file)
index ea36703..0000000
+++ /dev/null
@@ -1,23 +0,0 @@
-#include <spawn.h>
-#include <sys/wait.h>
-#include <darwintest.h>
-#include <mach-o/dyld.h>
-#include <errno.h>
-
-T_DECL(no32exec_bootarg, "make sure the no32exec boot-arg is honored", T_META_BOOTARGS_SET("-no32exec"))
-{
-       int spawn_ret, pid;
-       char path[1024];
-       uint32_t size = sizeof(path);
-
-       T_ASSERT_EQ(_NSGetExecutablePath(path, &size), 0, NULL);
-       T_ASSERT_LT(strlcat(path, "_helper", size), size, NULL);
-
-       spawn_ret = posix_spawn(&pid, path, NULL, NULL, NULL, NULL);
-       if (spawn_ret == 0) {
-               int wait_ret = 0;
-               waitpid(pid, &wait_ret, 0);
-               T_ASSERT_FALSE(WIFEXITED(wait_ret), "i386 helper should not run");
-       }
-       T_ASSERT_EQ(spawn_ret, EBADARCH, NULL);
-}
diff --git a/tools/tests/darwintests/no32exec_35914211_helper.c b/tools/tests/darwintests/no32exec_35914211_helper.c
deleted file mode 100644 (file)
index 99fb6be..0000000
+++ /dev/null
@@ -1,6 +0,0 @@
-#include <darwintest.h>
-
-T_DECL(null_test, "nothing to see here")
-{
-       T_SKIP("nothing to see here");
-}
diff --git a/tools/tests/darwintests/ntp_adjtime_29192647.c b/tools/tests/darwintests/ntp_adjtime_29192647.c
deleted file mode 100644 (file)
index 2866385..0000000
+++ /dev/null
@@ -1,371 +0,0 @@
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <mach/clock_types.h>
-#include <sys/timex.h>
-#include <mach/mach.h>
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-
-#define DAY 86400 /*1 day in sec*/
-#define ERROR 2 /*2 us of error tolerance*/
-
-T_DECL(settimeofday_29192647,
-       "Verify that the syscall settimeofday is effective",
-       T_META_ASROOT(true), T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       struct timeval time;
-       long new_time;
-
-       if (geteuid() != 0){
-                T_SKIP("settimeofday_29192647 test requires root privileges to run.");
-        }
-
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* increment the time of one day */
-       new_time = time.tv_sec + DAY;
-
-       time.tv_sec = new_time;
-       time.tv_usec = 0;
-
-       T_LOG("Attempting to set the time one day ahead.");
-
-       T_WITH_ERRNO;
-       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* expect to be past new_time */
-       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed");
-
-       /* set the time back to previous value */
-       if (time.tv_sec >= new_time) {
-               time.tv_sec = time.tv_sec - DAY;
-               time.tv_usec = 0;
-
-               T_WITH_ERRNO;
-               T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-       }
-}
-
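-/*
- * Measure the mach absolute time to microseconds conversion over a one
- * second interval, cross-check it against mach_timebase_info(), and return
- * the numerator/denominator used to convert abs-time deltas to microseconds.
- */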
-static void get_abs_to_us_scale_factor(uint64_t* numer, uint64_t* denom){
-       struct timespec time;
-       uint64_t old_abstime, new_abstime;
-       uint64_t old_time_usec, new_time_usec;
-       uint64_t time_conv1, diff;
-       mach_timebase_info_data_t timebaseInfo = { 0, 0 };
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&old_abstime, NULL, &time), KERN_SUCCESS, NULL);
-
-       old_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       sleep(1);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&new_abstime, NULL, &time), KERN_SUCCESS, NULL);
-
-       new_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       /* these are the conversion factors from abs time to nanoseconds */
-       T_ASSERT_EQ(mach_timebase_info(&timebaseInfo), KERN_SUCCESS, NULL);
-
-       new_time_usec -= old_time_usec;
-       new_abstime -= old_abstime;
-
-       time_conv1 = new_abstime;
-       time_conv1 *= timebaseInfo.numer;
-       time_conv1 /= timebaseInfo.denom * 1000;
-
-       if (time_conv1 > new_time_usec)
-               diff = time_conv1 - new_time_usec;
-       else
-               diff = new_time_usec - time_conv1;
-
-       T_EXPECT_LE_ULLONG(diff, (unsigned long long)ERROR, "Check scale factor time base (%u/%u) delta read usec %llu delta converted %llu delta abs %llu", timebaseInfo.numer, timebaseInfo.denom, time_conv1, new_time_usec, new_abstime);
-
-       *numer = (uint64_t)timebaseInfo.numer;
-       *denom = (uint64_t)timebaseInfo.denom * 1000;
-}
-
-
-#define ADJSTMENT 3333 /*3333 us*/
-#define ADJTIME_OFFSET_PER_SEC 500
-
-T_DECL(adjtime_29192647,
-       "Verify that the syscall adjtime is effective",
-       T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT), T_META_ASROOT(true))
-{
-       struct timespec time;
-       struct timeval adj;
-       uint64_t old_abstime, new_abstime, abs_delta;
-       uint64_t old_time_usec, new_time_usec, us_delta, num, den;
-       unsigned int sleep_time;
-       long diff;
-       const char * lterdos_env = NULL;
-
-#if defined(__i386__) || defined(__x86_64__)
-       T_SKIP("adjtime_29192647 test requires LTE to run.");
-#endif
-
-       if (geteuid() != 0) {
-                T_SKIP("adjtime_29192647 test requires root privileges to run.");
-        }
-
-       lterdos_env = getenv("LTERDOS");
-
-       if (lterdos_env != NULL){
-               if (!(strcmp(lterdos_env, "YES") == 0)) {
-                    T_SKIP("adjtime_29192647 test requires LTE to run.");
-               }
-       }
-       else {
-               T_SKIP("adjtime_29192647 test requires LTE to run.");
-       }
-
-       /*
-        * Calibrate scale factor for converting from abs time to usec
-        */
-       get_abs_to_us_scale_factor(&num, &den);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&old_abstime, NULL, &time), KERN_SUCCESS, NULL);
-
-       old_time_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       adj.tv_sec = 0;
-       adj.tv_usec = ADJSTMENT;
-
-       T_LOG("Attempting to adjust the time by %d us", ADJSTMENT);
-
-       /*
-        * If more than one second of adjustment
-        * the system slews at a rate of 5ms/s otherwise 500us/s
-        * until the last second is slewed the final < 500 usecs.
-        */
-       T_WITH_ERRNO;
-       T_ASSERT_POSIX_ZERO(adjtime(&adj, NULL),NULL);
-
-       /*
-        * Wait for the full adjustment to be applied.
-        * Note: add 2 more secs to account for division error
-        * and to ensure the last block of the adjustment has fully elapsed.
-        */
-       sleep_time = (ADJSTMENT)/(ADJTIME_OFFSET_PER_SEC)+2;
-
-       T_LOG("Waiting for %u sec\n", sleep_time);
-       sleep(sleep_time);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&new_abstime, NULL, &time), KERN_SUCCESS, NULL);
-
-       new_time_usec =  (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       us_delta = new_time_usec - old_time_usec;
-       us_delta -= ADJSTMENT;
-
-       /* abs time is not affected by adjtime */
-       abs_delta = new_abstime - old_abstime;
-
-       abs_delta *= num;
-       abs_delta /= den;
-
-       diff = (long) us_delta - (long) abs_delta;
-
-       /* expect that us_delta == abs_delta */
-       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
-
-       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
-
-}
-
-#define FREQ_PPM 222 /*222 PPM(us/s)*/
-#define SHIFT_PLL 4
-#define OFFSET_US 123 /*123us*/
-
-T_DECL(ntp_adjtime_29192647,
-       "Verify that the syscall ntp_adjtime is effective",
-       T_META_CHECK_LEAKS(NO), T_META_LTEPHASE(LTE_POSTINIT), T_META_ASROOT(true))
-{
-       struct timespec time;
-       struct timex ntptime;
-       uint64_t abstime1, abstime2, abs_delta, num, den, time_delta;
-       uint64_t time1_usec, time2_usec, time_conv, us_delta, app;
-       int64_t offset;
-       long diff, freq;
-       unsigned int sleep_time;
-       const char * lterdos_env = NULL;
-
-#if defined(__i386__) || defined(__x86_64__)
-       T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
-#endif
-
-       if (geteuid() != 0){
-                T_SKIP("ntp_adjtime_29192647 test requires root privileges to run.");
-        }
-
-       lterdos_env = getenv("LTERDOS");
-
-       if (lterdos_env != NULL){
-               if (!(strcmp(lterdos_env, "YES") == 0)) {
-                    T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
-               }
-       }
-       else {
-               T_SKIP("ntp_adjtime_29192647 test requires LTE to run.");
-       }
-
-       /*
-        * Calibrate scale factor for converting from abs time to usec
-        */
-       get_abs_to_us_scale_factor(&num, &den);
-
-       /*
-        * scale frequency using ntp_adjtime;
-        */
-       memset(&ntptime, 0, sizeof(ntptime));
-
-       ntptime.modes = MOD_STATUS;
-       ntptime.status = TIME_OK;
-        /* ntp input freq is in ppm (us/s) * 2^16, max freq is 500 ppm */
-        freq = (FREQ_PPM) * 65536;
-       ntptime.modes |= MOD_FREQUENCY;
-        ntptime.freq = freq;
-
-       T_LOG("Attempting to change calendar frequency by %d ppm", FREQ_PPM);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntptime.freq, freq, NULL);
-
-       sleep(2);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime1, NULL, &time), KERN_SUCCESS, NULL);
-
-       time1_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       sleep(1);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime2, NULL, &time), KERN_SUCCESS, NULL);
-
-       time2_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       abs_delta = abstime2 - abstime1;
-       us_delta = time2_usec - time1_usec;
-
-       time_conv = abs_delta;
-       time_conv *= num;
-       time_conv /= den;
-
-       app = time_conv/USEC_PER_SEC; //sec elapsed
-
-       time_delta = time_conv;
-       time_delta += app * (FREQ_PPM);
-
-       app = time_conv%USEC_PER_SEC;
-
-       time_delta += (app*(FREQ_PPM))/USEC_PER_SEC;
-
-       diff = (long) us_delta - (long) time_delta;
-
-       /* expect that us_delta == time_delta */
-       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
-
-       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
-
-       memset(&ntptime, 0, sizeof(ntptime));
-
-       /* reset freq to zero */
-       freq = 0;
-       ntptime.modes = MOD_STATUS;
-       ntptime.status = TIME_OK;
-        ntptime.modes |= MOD_FREQUENCY;
-        ntptime.freq = freq;
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntptime.freq, freq, NULL);
-
-       sleep(1);
-
-       /*
-        * adjust the phase using ntp_adjtime;
-        */
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes |= MOD_STATUS;
-       ntptime.status = TIME_OK;
-       ntptime.status |= STA_PLL|STA_FREQHOLD;
-
-       /* ntp input phase can be both ns or us (MOD_MICRO), max offset is 500 ms */
-        ntptime.offset = OFFSET_US;
-       ntptime.modes |= MOD_OFFSET|MOD_MICRO;
-
-       /*
-        * The system will slew each sec of:
-        * slew = ntp.offset >> (SHIFT_PLL + time_constant);
-        * ntp.offset -= slew;
-        */
-       offset= (OFFSET_US) * 1000;
-       sleep_time = 2;
-
-       while((offset>>SHIFT_PLL)>0){
-               offset -= offset >> SHIFT_PLL;
-               sleep_time++;
-       }
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime1, NULL, &time), KERN_SUCCESS, NULL);
-
-       time1_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       T_LOG("Attempting to change calendar phase by %d us", OFFSET_US);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntptime.offset, (long) OFFSET_US, NULL);
-
-       T_LOG("Waiting for %u sec\n", sleep_time);
-       sleep(sleep_time);
-
-       T_QUIET; T_ASSERT_EQ(mach_get_times(&abstime2, NULL, &time), KERN_SUCCESS, NULL);
-
-       time2_usec = (uint64_t)time.tv_sec * USEC_PER_SEC + (uint64_t)time.tv_nsec/1000;
-
-       abs_delta = abstime2 - abstime1;
-       us_delta = time2_usec - time1_usec;
-
-       abs_delta *= num;
-       abs_delta /= den;
-
-       us_delta -= OFFSET_US;
-
-       diff = (long) us_delta - (long) abs_delta;
-
-       /* expect that us_delta == abs_delta */
-       T_EXPECT_LE_LONG(diff, (long) ERROR, "Check abs time vs calendar time");
-
-       T_EXPECT_GE_LONG(diff, (long) -ERROR, "Check abs time vs calendar time");
-
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes = MOD_STATUS;
-       ntptime.status = TIME_OK;
-        ntptime.modes |= MOD_FREQUENCY;
-        ntptime.freq = 0;
-
-       ntptime.status |= STA_PLL;
-        ntptime.offset = 0;
-       ntptime.modes |= MOD_OFFSET;
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-
-}
-
-
diff --git a/tools/tests/darwintests/perf_compressor.c b/tools/tests/darwintests/perf_compressor.c
deleted file mode 100644 (file)
index 1d3b23d..0000000
+++ /dev/null
@@ -1,336 +0,0 @@
-#include <stdio.h>
-#include <signal.h>
-#include <sys/sysctl.h>
-#include <mach-o/dyld.h>
-
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.vm.perf"),
-       T_META_CHECK_LEAKS(false)
-);
-
-enum {
-       ALL_ZEROS,
-       MOSTLY_ZEROS,
-       RANDOM,
-       TYPICAL
-};
-
-#define CREATE_LIST(X) \
-       X(SUCCESS) \
-       X(TOO_FEW_ARGUMENTS) \
-       X(SYSCTL_VM_PAGESIZE_FAILED) \
-       X(VM_PAGESIZE_IS_ZERO) \
-       X(UNKNOWN_PAGE_TYPE) \
-       X(DISPATCH_SOURCE_CREATE_FAILED) \
-       X(INITIAL_SIGNAL_TO_PARENT_FAILED) \
-       X(SIGNAL_TO_PARENT_FAILED) \
-       X(EXIT_CODE_MAX)
-
-#define EXIT_CODES_ENUM(VAR) VAR,
-enum exit_codes_num {
-       CREATE_LIST(EXIT_CODES_ENUM)
-};
-
-#define EXIT_CODES_STRING(VAR) #VAR,
-static const char *exit_codes_str[] = {
-       CREATE_LIST(EXIT_CODES_STRING)
-};
-
-
-static pid_t pid = -1;
-static dt_stat_t r;
-static dt_stat_time_t s;
-
-void allocate_zero_pages(char **buf, int num_pages, int vmpgsize);
-void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize);
-void allocate_random_pages(char **buf, int num_pages, int vmpgsize);
-void allocate_representative_pages(char **buf, int num_pages, int vmpgsize);
-void run_compressor_test(int size_mb, int page_type);
-void freeze_helper_process(void);
-
-void allocate_zero_pages(char **buf, int num_pages, int vmpgsize) {
-       int i;
-
-       for (i = 0; i < num_pages; i++) {
-               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
-               memset(buf[i], 0, vmpgsize);
-       }
-}
-
-void allocate_mostly_zero_pages(char **buf, int num_pages, int vmpgsize) {
-       int i, j;
-
-       for (i = 0; i < num_pages; i++) {
-               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
-               memset(buf[i], 0, vmpgsize);
-               for (j = 0; j < 40; j++) {
-                       buf[i][j] = (char)(j+1);
-               }
-       }
-}
-
-void allocate_random_pages(char **buf, int num_pages, int vmpgsize) {
-       int i;
-
-       for (i = 0; i < num_pages; i++) {
-               buf[i] = (char*)malloc((size_t)vmpgsize * sizeof(char));
-               arc4random_buf((void*)buf[i], (size_t)vmpgsize);
-       }
-}
-
-// Gives us the compression ratio we see in the typical case (~2.7)
-void allocate_representative_pages(char **buf, int num_pages, int vmpgsize) {
-       int i, j;
-       char val;
-
-       for (j = 0; j < num_pages; j++) {
-               buf[j] = (char*)malloc((size_t)vmpgsize * sizeof(char));
-               val = 0;
-               for (i = 0; i < vmpgsize; i += 16) {
-                       memset(&buf[j][i], val, 16);
-                       if (i < 3400 * (vmpgsize / 4096)) {
-                               val++;
-                       }
-               }
-       }
-}
-
-void freeze_helper_process(void) {
-       int ret;
-       int64_t compressed_before, compressed_after, input_before, input_after;
-       size_t length;
-
-       /*
-        * Wait a bit after the pages have been allocated/accessed before trying to freeze.
-        * The sleeps are not strictly needed; they just separate the operations into three logical chunks:
-        * touch a few pages, freeze them, thaw them (and repeat).
-        */
-       usleep(100);
-       length = sizeof(compressed_before);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_compressed_bytes", &compressed_before, &length, NULL, 0),
-                       "failed to query vm.compressor_compressed_bytes");
-       length = sizeof(input_before);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_before, &length, NULL, 0),
-                       "failed to query vm.compressor_input_bytes");
-
-       T_STAT_MEASURE(s) {
-               ret = sysctlbyname("kern.memorystatus_freeze", NULL, NULL, &pid, sizeof(pid));
-       };
-
-       length = sizeof(compressed_after);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_compressed_bytes", &compressed_after, &length, NULL, 0),
-                       "failed to query vm.compressor_compressed_bytes");
-       length = sizeof(input_after);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(sysctlbyname("vm.compressor_input_bytes", &input_after, &length, NULL, 0),
-                       "failed to query vm.compressor_input_bytes");
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_freeze failed on pid %d", pid);
-
-       dt_stat_add(r, (double)(input_after - input_before)/(double)(compressed_after - compressed_before));
-
-       /* Wait a bit after freezing before trying to thaw */
-       usleep(100);
-       ret = sysctlbyname("kern.memorystatus_thaw", NULL, NULL, &pid, sizeof(pid));
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "sysctl kern.memorystatus_thaw failed on pid %d", pid);
-
-       /* Wait a bit after thawing before pages can be re-accessed */
-       usleep(100);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGUSR1), "failed to send SIGUSR1 to child process [%d]", pid);
-}
-
-void run_compressor_test(int size_mb, int page_type) {
-       int ret;
-       char sz_str[50];
-       char pt_str[50];
-       char **launch_tool_args;
-       char testpath[PATH_MAX];
-       uint32_t testpath_buf_size;
-       dispatch_source_t ds_freeze, ds_proc;
-
-#ifndef CONFIG_FREEZE
-       T_SKIP("Task freeze not supported.");
-#endif
-
-       r = dt_stat_create("(input bytes / compressed bytes)", "compression_ratio");
-       s = dt_stat_time_create("compressor_latency");
-
-       signal(SIGUSR1, SIG_IGN);
-       ds_freeze = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
-       T_QUIET; T_ASSERT_NOTNULL(ds_freeze, "dispatch_source_create (ds_freeze)");
-
-       dispatch_source_set_event_handler(ds_freeze, ^{
-               if (!dt_stat_stable(s)) {
-                       freeze_helper_process();
-               } else {
-                       dt_stat_finalize(s);
-                       dt_stat_finalize(r);
-
-                       kill(pid, SIGKILL);
-                       dispatch_source_cancel(ds_freeze);
-               }
-       });
-       dispatch_activate(ds_freeze);
-
-       testpath_buf_size = sizeof(testpath);
-       ret = _NSGetExecutablePath(testpath, &testpath_buf_size);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "_NSGetExecutablePath");
-       T_LOG("Executable path: %s", testpath);
-
-       sprintf(sz_str, "%d", size_mb);
-       sprintf(pt_str, "%d", page_type);
-       launch_tool_args = (char *[]){
-               testpath,
-               "-n",
-               "allocate_pages",
-               "--",
-               sz_str,
-               pt_str,
-               NULL
-       };
-
-       /* Spawn the child process. Suspend after launch until the exit proc handler has been set up. */
-       ret = dt_launch_tool(&pid, launch_tool_args, true, NULL, NULL);
-       if (ret != 0) {
-               T_LOG("dt_launch_tool returned %d with error code %d", ret, errno);
-       }
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "dt_launch_tool");
-
-       ds_proc = dispatch_source_create(DISPATCH_SOURCE_TYPE_PROC, (uintptr_t)pid, DISPATCH_PROC_EXIT, dispatch_get_main_queue());
-       T_QUIET; T_ASSERT_NOTNULL(ds_proc, "dispatch_source_create (ds_proc)");
-
-       dispatch_source_set_event_handler(ds_proc, ^{
-               int status = 0, code = 0;
-               pid_t rc = waitpid(pid, &status, 0);
-               T_QUIET; T_ASSERT_EQ(rc, pid, "waitpid");
-               code = WEXITSTATUS(status);
-
-               if (code == 0) {
-                       T_END;
-               } else if (code > 0 && code < EXIT_CODE_MAX) {
-                       T_ASSERT_FAIL("Child exited with %s", exit_codes_str[code]);
-               } else {
-                       T_ASSERT_FAIL("Child exited with unknown exit code %d", code);
-               }
-       });
-       dispatch_activate(ds_proc);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kill(pid, SIGCONT), "failed to send SIGCONT to child process [%d]", pid);
-       dispatch_main();
-}
-
-T_HELPER_DECL(allocate_pages, "allocates pages to compress") {
-       int i, j, ret, size_mb, page_type, vmpgsize;
-       size_t vmpgsize_length;
-       __block int num_pages;
-       __block char **buf;
-       dispatch_source_t ds_signal;
-
-       vmpgsize_length = sizeof(vmpgsize);
-       ret = sysctlbyname("vm.pagesize", &vmpgsize, &vmpgsize_length, NULL, 0);
-       if (ret != 0) {
-               exit(SYSCTL_VM_PAGESIZE_FAILED);
-       }
-       if (vmpgsize == 0) {
-               exit(VM_PAGESIZE_IS_ZERO);
-       }
-
-       if (argc < 2) {
-               exit(TOO_FEW_ARGUMENTS);
-       }
-
-       size_mb = atoi(argv[0]);
-       page_type = atoi(argv[1]);
-       num_pages = size_mb * 1024 * 1024 / vmpgsize;
-       buf = (char**)malloc(sizeof(char*) * (size_t)num_pages);
-
-       // Switch on the type of page requested
-       switch(page_type) {
-               case ALL_ZEROS:
-                       allocate_zero_pages(buf, num_pages, vmpgsize);
-                       break;
-               case MOSTLY_ZEROS:
-                       allocate_mostly_zero_pages(buf, num_pages, vmpgsize);
-                       break;
-               case RANDOM:
-                       allocate_random_pages(buf, num_pages, vmpgsize);
-                       break;
-               case TYPICAL:
-                       allocate_representative_pages(buf, num_pages, vmpgsize);
-                       break;
-               default:
-                       exit(UNKNOWN_PAGE_TYPE);
-       }
-
-       for (j = 0; j < num_pages; j++) {
-               i = buf[j][0];
-       }
-
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, NSEC_PER_SEC), dispatch_get_main_queue(), ^{
-               /* Signal to the parent that we're done allocating and it's ok to freeze us */
-               printf("Sending initial signal to parent to begin freezing\n");
-               if (kill(getppid(), SIGUSR1) != 0) {
-                       exit(INITIAL_SIGNAL_TO_PARENT_FAILED);
-               }
-       });
-
-       signal(SIGUSR1, SIG_IGN);
-       ds_signal = dispatch_source_create(DISPATCH_SOURCE_TYPE_SIGNAL, SIGUSR1, 0, dispatch_get_main_queue());
-       if (ds_signal == NULL) {
-               exit(DISPATCH_SOURCE_CREATE_FAILED);
-       }
-
-       dispatch_source_set_event_handler(ds_signal, ^{
-               volatile int tmp;
-
-               /* Make sure all the pages are accessed before trying to freeze again */
-               for (int x = 0; x < num_pages; x++) {
-                       tmp = buf[x][0];
-               }
-               if (kill(getppid(), SIGUSR1) != 0) {
-                       exit(SIGNAL_TO_PARENT_FAILED);
-               }
-       });
-       dispatch_activate(ds_signal);
-
-       dispatch_main();
-}
-
-// Numbers for 10MB and above are fairly reproducible. Anything smaller shows a lot of variation.
-T_DECL(compr_10MB_zero, "Compressor latencies") {
-       run_compressor_test(10, ALL_ZEROS);
-}
-
-T_DECL(compr_10MB_mostly_zero, "Compressor latencies") {
-       run_compressor_test(10, MOSTLY_ZEROS);
-}
-
-T_DECL(compr_10MB_random, "Compressor latencies") {
-       run_compressor_test(10, RANDOM);
-}
-
-T_DECL(compr_10MB_typical, "Compressor latencies") {
-       run_compressor_test(10, TYPICAL);
-}
-
-T_DECL(compr_100MB_zero, "Compressor latencies") {
-       run_compressor_test(100, ALL_ZEROS);
-}
-
-T_DECL(compr_100MB_mostly_zero, "Compressor latencies") {
-       run_compressor_test(100, MOSTLY_ZEROS);
-}
-
-T_DECL(compr_100MB_random, "Compressor latencies") {
-       run_compressor_test(100, RANDOM);
-}
-
-T_DECL(compr_100MB_typical, "Compressor latencies") {
-       run_compressor_test(100, TYPICAL);
-}
-
diff --git a/tools/tests/darwintests/perf_exit.c b/tools/tests/darwintests/perf_exit.c
deleted file mode 100644 (file)
index 0caafda..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <sys/kdebug.h>
-#include <ktrace/session.h>
-#include <spawn.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdatomic.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.perf.exit"),
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_SINGLEUSER)
-);
-
-// From osfmk/kern/sched.h
-#define BASEPRI_FOREGROUND 47
-#define BASEPRI_USER_INITIATED 37
-#define BASEPRI_UTILITY 20
-#define MAXPRI_THROTTLE 4
-
-// From bsd/sys/proc_internal.h
-#define PID_MAX 99999
-
-#define EXIT_BINARY "perf_exit_proc"
-#define EXIT_BINARY_PATH "./" EXIT_BINARY
-
-static ktrace_session_t session;
-static dispatch_queue_t spawn_queue;
-static uint64_t *begin_ts;
-static dt_stat_time_t s;
-static bool started_tracing = false;
-
-void run_exit_test(int proc_wired_mem, int thread_priority, int nthreads);
-
-static void cleanup(void) {
-       free(begin_ts);
-       dt_stat_finalize(s);
-       dispatch_release(spawn_queue);
-       if (started_tracing) {
-               ktrace_end(session, 1);
-       }
-}
-
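-/*
- * Measure exit(2) latency from syscall entry to BSD_PROC_EXIT for repeatedly
- * spawned helper processes, until the statistic stabilizes.
- */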
-void run_exit_test(int proc_wired_mem, int thread_priority, int nthreads) {
-       static atomic_bool ended = false;
-
-       s = dt_stat_time_create("time");
-       T_QUIET; T_ASSERT_NOTNULL(s, "created time statistic");
-
-       begin_ts = malloc(sizeof(uint64_t) * PID_MAX);
-       T_QUIET; T_ASSERT_NOTNULL(begin_ts, "created pid array");
-
-       T_ATEND(cleanup);
-
-       session = ktrace_session_create();
-       T_QUIET; T_ASSERT_NOTNULL(session, "created a trace session");
-
-       spawn_queue = dispatch_queue_create("spawn_queue", NULL);
-
-       ktrace_set_completion_handler(session, ^{
-               ktrace_session_destroy(session);
-               T_END;
-       });
-
-       ktrace_set_signal_handler(session);
-       ktrace_set_execnames_enabled(session, KTRACE_FEATURE_ENABLED);
-
-       // We are only interested in the process we launched
-       ktrace_filter_process(session, EXIT_BINARY);
-
-       ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_EXCP_SC, 1) | DBG_FUNC_START), ^(ktrace_event_t e) {
-               T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "valid pid for tracepoint");
-               begin_ts[e->pid] = e->timestamp;
-       });
-       ktrace_events_single(session, (BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXIT) | DBG_FUNC_END), ^(ktrace_event_t e) {
-               T_QUIET; T_ASSERT_LE(e->pid, PID_MAX, "valid pid for tracepoint");
-
-               if (begin_ts[e->pid] == 0) {
-                       return;
-               }
-               T_QUIET; T_ASSERT_LE(begin_ts[e->pid], e->timestamp, "timestamps are monotonically increasing");
-               dt_stat_mach_time_add(s, e->timestamp - begin_ts[e->pid]);
-
-               if (dt_stat_stable(s)) {
-                       ended = true;
-                       ktrace_end(session, 1);
-               }
-       });
-
-       int ret = ktrace_start(session, dispatch_get_main_queue());
-       T_ASSERT_POSIX_ZERO(ret, "starting trace");
-       started_tracing = true;
-
-       // Spawn processes continuously until the test is over
-       dispatch_async(spawn_queue, ^(void) {
-               char priority_buf[32], nthreads_buf[32], mem_buf[32];
-
-               snprintf(priority_buf, 32, "%d", thread_priority);
-               snprintf(nthreads_buf, 32, "%d", nthreads);
-               snprintf(mem_buf, 32, "%d", proc_wired_mem);
-
-               char *args[] = {EXIT_BINARY_PATH, priority_buf, nthreads_buf, mem_buf, NULL};
-               int status;
-               while (!ended) {
-                       pid_t pid;
-                       int bret = posix_spawn(&pid, args[0], NULL, NULL, args, NULL);
-                       T_QUIET; T_ASSERT_POSIX_ZERO(bret, "spawned process '%s'", args[0]);
-
-                       bret = waitpid(pid, &status, 0);
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(bret, "waited for process %d\n", pid);
-
-                       if (!WIFEXITED(status) || WEXITSTATUS(status) != 0)
-                               T_ASSERT_FAIL("child process failed to run");
-
-                       // Avoid saturating the CPU with new processes
-                       usleep(1);
-               }
-       });
-
-       dispatch_main();
-}
-
-
-T_DECL(exit, "exit(2) time from syscall start to end") {
-       run_exit_test(0, BASEPRI_FOREGROUND, 0);
-}
-
-T_DECL(exit_pri_4, "exit(2) time at priority 4 (throttled)") {
-       run_exit_test(0, MAXPRI_THROTTLE, 0);
-}
-
-T_DECL(exit_pri_20, "exit(2) time at priority 20 (utility)") {
-       run_exit_test(0, BASEPRI_UTILITY, 0);
-}
-
-T_DECL(exit_pri_37, "exit(2) time at priority 37 (user initiated)") {
-       run_exit_test(0, BASEPRI_USER_INITIATED, 0);
-}
-
-T_DECL(exit_10_threads, "exit(2) time with 10 threads") {
-       run_exit_test(0, BASEPRI_FOREGROUND, 10);
-}
-
-T_DECL(exit_1mb, "exit(2) time with 1MB of wired memory") {
-       run_exit_test(1000000, BASEPRI_FOREGROUND, 0);
-}
-
-T_DECL(exit_10mb, "exit(2) time with 10MB of wired memory") {
-       run_exit_test(10000000, BASEPRI_FOREGROUND, 0);
-}
-
-T_DECL(exit_100_threads, "exit(2) time with 100 threads", T_META_ENABLED(false), T_META_TIMEOUT(1800)) {
-       run_exit_test(0, BASEPRI_FOREGROUND, 100);
-}
-
-T_DECL(exit_1000_threads, "exit(2) time with 1000 threads", T_META_ENABLED(false), T_META_TIMEOUT(1800)) {
-       run_exit_test(0, BASEPRI_FOREGROUND, 1000);
-}
-
-T_DECL(exit_100mb, "exit(2) time with 100MB of wired memory", T_META_ENABLED(false), T_META_TIMEOUT(1800)) {
-       run_exit_test(100000000, BASEPRI_FOREGROUND, 0);
-}
diff --git a/tools/tests/darwintests/perf_exit_proc.c b/tools/tests/darwintests/perf_exit_proc.c
deleted file mode 100644 (file)
index fa157cd..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include <mach/mach.h>
-#include <mach/mach_vm.h>
-
-static void* loop(__attribute__ ((unused)) void *arg) {
-       while (1) {
-
-       }
-}
-
-
-static int run_additional_threads(int nthreads) {
-       for (int i = 0; i < nthreads; i++) {
-               pthread_t pthread;
-               int err;
-               
-               err = pthread_create(&pthread, NULL, loop, NULL);
-               if (err) {
-                       return err;
-               }
-       }
-
-       return 0;
-}
-
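-/* Allocate 'size' bytes, make them readable/writable, and wire them via the host priv port. */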
-static int allocate_and_wire_memory(mach_vm_size_t size) {
-       int err;
-       task_t task = mach_task_self();
-       mach_vm_address_t addr;
-
-       if (size <= 0)
-               return 0;
-
-       err = mach_vm_allocate(task, &addr, size, VM_FLAGS_ANYWHERE);
-       if (err != KERN_SUCCESS) {
-               printf("mach_vm_allocate returned non-zero: %s\n", mach_error_string(err));
-               return err;
-       }
-       err = mach_vm_protect(task, addr, size, 0, VM_PROT_READ | VM_PROT_WRITE);
-       if (err != KERN_SUCCESS) {
-               printf("mach_vm_protect returned non-zero: %s\n", mach_error_string(err));
-               return err;
-       }
-       host_t host_priv_port;
-       err = host_get_host_priv_port(mach_host_self(), &host_priv_port);
-       if (err != KERN_SUCCESS) {
-               printf("host_get_host_priv_port returned non-zero: %s\n", mach_error_string(err));
-               return err;
-       }
-       err = mach_vm_wire(host_priv_port, task, addr, size, VM_PROT_READ | VM_PROT_WRITE);
-       if (err != KERN_SUCCESS) {
-               printf("mach_vm_wire returned non-zero: %s\n", mach_error_string(err));
-               return err;
-       }
-
-       return 0;
-}
-
-static int set_thread_priority(int priority) {
-       struct sched_param param;
-       int policy;
-
-       int err = pthread_getschedparam(pthread_self(), &policy, &param);
-       if (err) return err;
-
-       param.sched_priority = priority;
-
-       err = pthread_setschedparam(pthread_self(), policy, &param);
-       if (err) return err;
-       
-       return 0;
-}
-
-int main(int argc, char *argv[]) {
-       int priority = 47, nthreads = 0;
-       int err;
-       mach_vm_size_t wired_mem = 0;
-
-       if (argc > 1) {
-               priority = (int)strtoul(argv[1], NULL, 10);
-       }
-       if (argc > 2) {
-               nthreads = (int)strtoul(argv[2], NULL, 10);
-       }
-       if (argc > 3) {
-               wired_mem = (mach_vm_size_t)strtoul(argv[3], NULL, 10);
-       }
-       
-       err = allocate_and_wire_memory(wired_mem);
-       if (err) {
-               return err;
-       }
-
-       err = set_thread_priority(priority);
-       if (err) {
-               return err;
-       }
-
-       err = run_additional_threads(nthreads);
-       if (err) {
-               return err;
-       }
-
-       return 0;
-}
diff --git a/tools/tests/darwintests/perf_kdebug.c b/tools/tests/darwintests/perf_kdebug.c
deleted file mode 100644 (file)
index f0f058f..0000000
+++ /dev/null
@@ -1,167 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <sys/kdebug.h>
-#include <sys/sysctl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.perf.kdebug"),
-       T_META_ASROOT(true),
-       T_META_CHECK_LEAKS(false)
-);
-
-//
-// Helper functions for direct control over the kernel trace facility.
-//
-
-static void _sysctl_reset() {
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDREMOVE };
-       if(sysctl(mib, 3, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDREMOVE sysctl failed");
-       }
-}
-
-static void _sysctl_setbuf(uint32_t capacity) {
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETBUF, (int)capacity };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDSETBUF sysctl failed");
-       }
-}
-
-static void _sysctl_setup() {
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSETUP };
-       if (sysctl(mib, 3, NULL, NULL, NULL, 0)) {
-               T_FAIL("KERN_KDSETUP sysctl failed");
-       }
-}
-
-static void _sysctl_enable(int value)
-{
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDENABLE, value };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0) < 0) {
-               T_FAIL("KERN_KDENABLE sysctl failed");
-       }
-}
-
-static void _sysctl_enable_typefilter(uint8_t* type_filter_bitmap) {
-       int mib[] = { CTL_KERN, KERN_KDEBUG, KERN_KDSET_TYPEFILTER };
-       size_t needed = KDBG_TYPEFILTER_BITMAP_SIZE;
-       if(sysctl(mib, 3, type_filter_bitmap, &needed, NULL, 0)) {
-               T_FAIL("KERN_KDSET_TYPEFILTER sysctl failed");
-       }
-}
-
-static void _sysctl_nowrap(bool is_nowrap) {
-       int mib[] = { CTL_KERN, KERN_KDEBUG, is_nowrap ? KERN_KDEFLAGS : KERN_KDDFLAGS, KDBG_NOWRAP };
-       if (sysctl(mib, 4, NULL, NULL, NULL, 0)) {
-               T_FAIL("KDBG_NOWRAP sysctl failed");
-       }
-}
-
-static void enable_tracing(bool value) {
-       _sysctl_enable(value ? KDEBUG_ENABLE_TRACE : 0);
-}
-
-static void enable_typefilter_all_reject() {
-       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
-       memset(type_filter_bitmap, 0, sizeof(type_filter_bitmap));
-       _sysctl_enable_typefilter(type_filter_bitmap);
-}
-
-static void enable_typefilter_all_pass() {
-       uint8_t type_filter_bitmap[KDBG_TYPEFILTER_BITMAP_SIZE];
-       memset(type_filter_bitmap, 0xff, sizeof(type_filter_bitmap));
-       _sysctl_enable_typefilter(type_filter_bitmap);
-}
-
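-/* Time batches of 1000 kdebug_trace() calls until the latency statistic is stable. */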
-static void loop_kdebug_trace(dt_stat_time_t s) {
-       do {
-               dt_stat_token start = dt_stat_time_begin(s);
-               for (uint32_t i = 0; i<100; i++) {
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-                       kdebug_trace(0x97000000 | DBG_FUNC_NONE, i, i, i, i);
-               }
-               dt_stat_time_end_batch(s, 1000, start);
-       } while (!dt_stat_stable(s));
-}
-
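-/* Baseline: time batches of 1000 getppid() calls using the same batching scheme. */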
-static void loop_getppid(dt_stat_time_t s) {
-       do {
-               dt_stat_token start = dt_stat_time_begin(s);
-               for (uint32_t i = 0; i<100; i++) {
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-                       getppid();
-               }
-               dt_stat_time_end_batch(s, 1000, start);
-       } while (!dt_stat_stable(s));
-}
-
-static void reset_kdebug_trace(void) {
-       _sysctl_reset();
-}
-
-static void test(const char* test_name, void (^pretest_setup)(void), void (*test)(dt_stat_time_t s)) {
-       T_ATEND(reset_kdebug_trace);
-       _sysctl_reset();
-       _sysctl_setbuf(1000000);
-       _sysctl_nowrap(false);
-       _sysctl_setup();
-
-       pretest_setup();
-
-       dt_stat_time_t s = dt_stat_time_create("%s", test_name);
-
-       test(s);
-
-       dt_stat_finalize(s);
-}
-
-//
-// Begin tests...
-//
-
-T_DECL(kdebug_trace_baseline_syscall,
-       "Test the latency of a syscall while kernel tracing is disabled") {
-       test("kdebug_trace_baseline_syscall", ^{ enable_tracing(false); }, loop_getppid);
-}
-
-T_DECL(kdebug_trace_kdbg_disabled,
-       "Test the latency of kdebug_trace while kernel tracing is disabled") {
-       test("kdebug_trace_kdbg_disabled", ^{ enable_tracing(false); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled,
-       "Test the latency of kdebug_trace while kernel tracing is enabled with no typefilter") {
-       test("kdebug_trace_kdbg_enabled", ^{ enable_tracing(true); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled_typefilter_pass,
-       "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that passes the event") {
-       test("kdebug_trace_kdbg_enabled_typefilter_pass", ^{ enable_tracing(true); enable_typefilter_all_pass(); }, loop_kdebug_trace);
-}
-
-T_DECL(kdebug_trace_kdbg_enabled_typefilter_reject,
-       "Test the latency of kdebug_trace while kernel tracing is enabled with a typefilter that rejects the event") {
-       test("kdebug_trace_kdbg_enabled_typefilter_reject", ^{ enable_tracing(true); enable_typefilter_all_reject(); }, loop_kdebug_trace);
-}
diff --git a/tools/tests/darwintests/perf_spawn_fork.c b/tools/tests/darwintests/perf_spawn_fork.c
deleted file mode 100644 (file)
index 13a85ff..0000000
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <spawn.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.perf.fork"),
-       T_META_CHECK_LEAKS(false)
-);
-
-#define SPAWN_MEASURE_LOOP(s) \
-       char *args[] = {"/usr/bin/true", NULL}; \
-       int err; \
-       pid_t pid; \
-       int status; \
-       while (!dt_stat_stable(s)) { \
-               T_STAT_MEASURE(s) { \
-                       err = posix_spawn(&pid, args[0], NULL, NULL, args, NULL); \
-               } \
-               if (err) { \
-                       T_FAIL("posix_spawn returned %d", err); \
-               } \
-               waitpid(pid, &status, 0); \
-               if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \
-                       T_FAIL("Child process of posix_spawn failed to run"); \
-               } \
-       }
-
-T_DECL(posix_spawn_platform_binary_latency, "posix_spawn platform binary latency") {
-       {
-               dt_stat_time_t s = dt_stat_time_create("time");
-               SPAWN_MEASURE_LOOP(s);
-               dt_stat_finalize(s);
-       }
-
-       {
-               dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time");
-               SPAWN_MEASURE_LOOP(s);
-               dt_stat_finalize(s);
-       }
-}
-
-#define FORK_MEASURE_LOOP(s) \
-       pid_t pid; \
-       int status; \
-       while (!dt_stat_stable(s)) { \
-               T_STAT_MEASURE(s) { \
-                       pid = fork(); \
-                       if (pid == 0) \
-                               exit(0); \
-                       else if (pid == -1) \
-                               T_FAIL("fork returned -1"); \
-               } \
-               waitpid(pid, &status, 0); \
-               if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { \
-                       T_FAIL("forked process failed to exit properly"); \
-               } \
-       }
-
-T_DECL(fork, "fork latency") {
-       {
-               dt_stat_time_t s = dt_stat_time_create("time");
-               FORK_MEASURE_LOOP(s);
-               dt_stat_finalize(s);
-       }
-       {
-               dt_stat_thread_cpu_time_t s = dt_stat_thread_cpu_time_create("on-cpu time");
-               FORK_MEASURE_LOOP(s);
-               dt_stat_finalize(s);
-       }
-}
diff --git a/tools/tests/darwintests/poll.c b/tools/tests/darwintests/poll.c
deleted file mode 100644 (file)
index 8ff8806..0000000
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <dispatch/dispatch.h>
-#include <fcntl.h>
-#include <mach/mach.h>
-#include <poll.h>
-#include <stdint.h>
-#include <unistd.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.poll"));
-
-#define SLEEP_TIME_SECS 1
-#define POLL_TIMEOUT_MS 1800
-static_assert(POLL_TIMEOUT_MS > (SLEEP_TIME_SECS * 1000),
-               "poll timeout should be longer than sleep time");
-
-/*
- * This matches the behavior of other UNIXes, but is under-specified in POSIX.
- *
- * See <rdar://problem/28372390>.
- */
-T_DECL(sleep_with_no_fds,
-               "poll() called with no fds provided should act like sleep")
-{
-       uint64_t begin_time, sleep_time, poll_time;
-       struct pollfd pfd = { 0 };
-
-       begin_time = mach_absolute_time();
-       sleep(SLEEP_TIME_SECS);
-       sleep_time = mach_absolute_time() - begin_time;
-       T_LOG("sleep(%d) ~= %llu mach absolute time units", SLEEP_TIME_SECS, sleep_time);
-
-       begin_time = mach_absolute_time();
-       T_ASSERT_POSIX_SUCCESS(poll(&pfd, 0, POLL_TIMEOUT_MS),
-                       "poll() with 0 events and timeout %d ms", POLL_TIMEOUT_MS);
-       poll_time = mach_absolute_time() - begin_time;
-
-       T_EXPECT_GT(poll_time, sleep_time,
-                       "poll(... %d) should wait longer than sleep(1)", POLL_TIMEOUT_MS);
-}
-
-#define LAUNCHD_PATH "/sbin/launchd"
-#define PIPE_DIR_TIMEOUT_SECS 1
-
-/*
- * See <rdar://problem/28539155>.
- */
-T_DECL(directories,
-               "poll() with directories should return an error")
-{
-       int file, dir, pipes[2];
-       struct pollfd pfd[] = {
-               { .events = POLLIN },
-               { .events = POLLIN },
-               { .events = POLLIN },
-       };
-
-       file = open(LAUNCHD_PATH, O_RDONLY | O_NONBLOCK);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(file, "open(%s)", LAUNCHD_PATH);
-       dir = open(".", O_RDONLY | O_NONBLOCK);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(dir, "open(\".\")");
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(pipes), NULL);
-
-       /* just directory */
-       pfd[0].fd = dir;
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 1, -1), "poll() with a directory");
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
-                       "directory should be an invalid event");
-
-       /* file and directory */
-       pfd[0].fd = file; pfd[0].revents = 0;
-       pfd[1].fd = dir; pfd[1].revents = 0;
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
-                       "poll() with a file and directory");
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
-       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLNVAL,
-                       "directory should be an invalid event");
-
-       /* directory and file */
-       pfd[0].fd = dir; pfd[0].revents = 0;
-       pfd[1].fd = file; pfd[1].revents = 0;
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
-                       "poll() with a directory and a file");
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
-                       "directory should be an invalid event");
-       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLIN, "file should be readable");
-
-       /* file and pipe */
-       pfd[0].fd = file; pfd[0].revents = 0;
-       pfd[1].fd = pipes[0]; pfd[1].revents = 0;
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
-                       "poll() with a file and pipe");
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
-       T_QUIET; T_EXPECT_FALSE(pfd[1].revents & POLLIN,
-                       "pipe should not be readable");
-
-       /* file, directory, and pipe */
-       pfd[0].fd = file; pfd[0].revents = 0;
-       pfd[1].fd = dir; pfd[1].revents = 0;
-       pfd[2].fd = pipes[0]; pfd[2].revents = 0;
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 3, -1),
-                       "poll() with a file, directory, and pipe");
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLIN, "file should be readable");
-       T_QUIET; T_EXPECT_TRUE(pfd[1].revents & POLLNVAL,
-                       "directory should be an invalid event");
-       T_QUIET; T_EXPECT_FALSE(pfd[2].revents & POLLIN, "pipe should not be readable");
-
-       /* directory and pipe */
-       __block bool timed_out = true;
-       pfd[0].fd = dir; pfd[0].revents = 0;
-       pfd[1].fd = pipes[0]; pfd[1].revents = 0;
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW,
-                       PIPE_DIR_TIMEOUT_SECS * NSEC_PER_SEC),
-                       dispatch_get_global_queue(QOS_CLASS_DEFAULT, 0), ^{
-               T_ASSERT_FALSE(timed_out, "poll timed out after %d seconds",
-                               PIPE_DIR_TIMEOUT_SECS);
-       });
-
-       T_EXPECT_POSIX_SUCCESS(poll(pfd, 2, -1),
-                       "poll() with a directory and pipe");
-       timed_out = false;
-
-       T_QUIET; T_EXPECT_TRUE(pfd[0].revents & POLLNVAL,
-                       "directory should be an invalid event");
-       T_QUIET; T_EXPECT_FALSE(pfd[1].revents & POLLIN, "pipe should not be readable");
-}
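
The deleted test above asserts that, on xnu, poll() flags a directory descriptor with POLLNVAL rather than reporting it readable. A minimal stand-alone sketch of that behaviour (illustrative only; the "/tmp" path, the 1000 ms timeout, and the program structure are arbitrary choices, and other systems may behave differently):

    #include <fcntl.h>
    #include <poll.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int dir = open("/tmp", O_RDONLY | O_NONBLOCK);
            if (dir < 0) {
                    return 1;
            }

            struct pollfd pfd = { .fd = dir, .events = POLLIN };
            int n = poll(&pfd, 1, 1000);

            /* The test above expects POLLNVAL to be set for a directory on xnu. */
            printf("poll() returned %d, revents = 0x%x\n", n, pfd.revents);

            close(dir);
            return 0;
    }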
diff --git a/tools/tests/darwintests/poll_select_kevent_paired_fds.c b/tools/tests/darwintests/poll_select_kevent_paired_fds.c
deleted file mode 100644 (file)
index bd9a5e7..0000000
+++ /dev/null
@@ -1,932 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-#include <mach/mach.h>
-#include <darwintest_multiprocess.h>
-
-#include <assert.h>
-#include <dispatch/dispatch.h>
-#include <dispatch/private.h>
-#include <err.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <pthread.h>
-#include <pthread/workqueue_private.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/event.h>
-#include <sys/socket.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sysexits.h>
-#include <unistd.h>
-#include <util.h>
-#include <System/sys/event.h> /* kevent_qos */
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.kevent"),
-               T_META_CHECK_LEAKS(false),
-               T_META_LTEPHASE(LTE_POSTINIT));
-
-/*
- * Test to validate that monitoring a PTY device, FIFO, pipe, or socket pair in
- * a dispatch source, kqueue, poll, or select delivers read events within and
- * between processes as expected.
- *
- * This test catches issues with watching special devices in kqueue(),
- * which has tricky special cases for character devices like PTYs.
- *
- * It also exercises the path to wake up a dispatch worker thread from the
- * special device kqueue event, which is also a special case in kqueue().
- *
- * See rdar://problem/26240299&26220074&26226862&28625427 for examples and
- * history.
- */
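
As a stand-alone illustration of the PTY special case described above (not part of the deleted file), the core pattern is: register the PTY master with EVFILT_READ, write on the slave side, and expect the kqueue to wake. The function name is hypothetical and the 4-second timeout mirrors READ_TIMEOUT_SECS below but is otherwise arbitrary; error handling is omitted.

    #include <sys/event.h>
    #include <sys/time.h>
    #include <unistd.h>
    #include <util.h>

    static int
    pty_read_event_sketch(void)
    {
            int master, slave;
            if (openpty(&master, &slave, NULL, NULL, NULL) == -1) {
                    return -1;
            }

            int kq = kqueue();
            struct kevent ev;
            EV_SET(&ev, master, EVFILT_READ, EV_ADD, 0, 0, NULL);
            (void)kevent(kq, &ev, 1, NULL, 0, NULL);          /* register interest */

            (void)write(slave, "x", 1);                       /* should make master readable */

            struct timespec timeout = { .tv_sec = 4, .tv_nsec = 0 };
            int nev = kevent(kq, NULL, 0, &ev, 1, &timeout);  /* expect nev == 1 */

            close(kq);
            close(master);
            close(slave);
            return nev;
    }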
-
-#define EXPECTED_STRING    "abcdefghijklmnopqrstuvwxyz. ABCDEFGHIJKLMNOPQRSTUVWXYZ. 1234567890"
-#define EXPECTED_LEN       strlen(EXPECTED_STRING)
-
-#define READ_SETUP_TIMEOUT_SECS       2
-#define WRITE_TIMEOUT_SECS            4
-#define READ_TIMEOUT_SECS             4
-#define INCREMENTAL_WRITE_SLEEP_USECS 50
-
-static mach_timespec_t READ_SETUP_timeout = {.tv_sec = READ_SETUP_TIMEOUT_SECS, .tv_nsec = 0};
-static mach_timespec_t READ_timeout = {.tv_sec = READ_TIMEOUT_SECS, .tv_nsec = 0};
-static mach_timespec_t WRITE_timeout = {.tv_sec = WRITE_TIMEOUT_SECS, .tv_nsec = 0};
-
-enum fd_pair {
-       PTY_PAIR,
-       FIFO_PAIR,
-       PIPE_PAIR,
-       SOCKET_PAIR
-};
-
-enum write_mode {
-       FULL_WRITE,
-       INCREMENTAL_WRITE,
-       KEVENT_INCREMENTAL_WRITE,
-       KEVENT64_INCREMENTAL_WRITE,
-       KEVENT_QOS_INCREMENTAL_WRITE,
-       WORKQ_INCREMENTAL_WRITE,
-       DISPATCH_INCREMENTAL_WRITE
-};
-
-enum read_mode {
-       POLL_READ,
-       SELECT_READ,
-       KEVENT_READ,
-       KEVENT64_READ,
-       KEVENT_QOS_READ,
-       WORKQ_READ,
-       DISPATCH_READ
-};
-
-union mode {
-       enum read_mode rd;
-       enum write_mode wr;
-};
-
-static struct {
-       enum fd_pair fd_pair;
-       enum write_mode wr_mode;
-       int wr_fd;
-       enum read_mode rd_mode;
-       int rd_fd;
-
-       enum writer_kind {
-               THREAD_WRITER, /* sem */
-               PROCESS_WRITER /* fd */
-       } wr_kind;
-       union {
-               semaphore_t sem;
-               struct {
-                       int in_fd;
-                       int out_fd;
-               };
-       } wr_wait;
-       semaphore_t wr_finished;
-       semaphore_t rd_finished;
-} shared;
-
-static bool handle_reading(enum fd_pair fd_pair, int fd);
-static bool handle_writing(enum fd_pair fd_pair, int fd);
-static void drive_kq(bool reading, union mode mode, enum fd_pair fd_pair,
-               int fd);
-
-#pragma mark writing
-
-static void
-wake_writer(void)
-{
-       T_LOG("waking writer");
-
-       switch (shared.wr_kind) {
-       case THREAD_WRITER:
-               T_LOG("signal shared.wr_wait.sem");
-               semaphore_signal(shared.wr_wait.sem);
-               break;
-       case PROCESS_WRITER: {
-               char tmp = 'a';
-               close(shared.wr_wait.out_fd);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(write(
-                               shared.wr_wait.in_fd, &tmp, 1), NULL);
-               break;
-       }
-       }
-}
-
-static void
-writer_wait(void)
-{
-       switch (shared.wr_kind) {
-       case THREAD_WRITER:
-               T_LOG("wait shared.wr_wait.sem");
-               kern_return_t kret = semaphore_timedwait(shared.wr_wait.sem, READ_SETUP_timeout);
-
-               if (kret == KERN_OPERATION_TIMED_OUT) {
-                       T_ASSERT_FAIL("THREAD_WRITER semaphore timed out after %d seconds", READ_SETUP_timeout.tv_sec);
-               }
-               T_QUIET;
-               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.wr_wait.sem");
-               break;
-
-       case PROCESS_WRITER: {
-               char tmp;
-               close(shared.wr_wait.in_fd);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(read(
-                               shared.wr_wait.out_fd, &tmp, 1), NULL);
-               break;
-       }
-       }
-
-       T_LOG("writer woken up, starting to write");
-}
-
-static bool
-handle_writing(enum fd_pair __unused fd_pair, int fd)
-{
-       static unsigned int cur_char = 0;
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(write(fd,
-                       &(EXPECTED_STRING[cur_char]), 1), NULL);
-       cur_char++;
-
-       return (cur_char < EXPECTED_LEN);
-}
-
-#define EXPECTED_QOS QOS_CLASS_USER_INITIATED
-
-static void
-reenable_workq(int fd, int16_t filt)
-{
-       struct kevent_qos_s events[] = {{
-               .ident = (uint64_t)fd,
-               .filter = filt,
-               .flags = EV_ENABLE | EV_UDATA_SPECIFIC | EV_DISPATCH,
-               .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
-                               0, 0),
-               .fflags = NOTE_LOWAT,
-               .data = 1
-       }};
-
-       int kev = kevent_qos(-1, events, 1, events, 1, NULL, NULL,
-                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "reenable workq in kevent_qos");
-}
-
-static void
-workqueue_write_fn(void ** __unused buf, int * __unused count)
-{
-       // T_MAYFAIL;
-       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
-                       // "writer thread should be woken up at correct QoS");
-       if (!handle_writing(shared.fd_pair, shared.wr_fd)) {
-               /* finished handling the fd, tear down the source */
-               T_LOG("signal shared.wr_finished");
-               semaphore_signal(shared.wr_finished);
-               return;
-       }
-
-       reenable_workq(shared.wr_fd, EVFILT_WRITE);
-}
-
-static void
-workqueue_fn(pthread_priority_t __unused priority)
-{
-       T_ASSERT_FAIL("workqueue function callback was called");
-}
-
-static void
-drive_kq(bool reading, union mode mode, enum fd_pair fd_pair, int fd)
-{
-       struct timespec timeout = { .tv_sec = READ_TIMEOUT_SECS };
-       int kev = -1;
-
-       struct kevent events;
-       EV_SET(&events, fd, reading ? EVFILT_READ : EVFILT_WRITE, EV_ADD,
-                       NOTE_LOWAT, 1, NULL);
-       struct kevent64_s events64;
-       EV_SET64(&events64, fd, reading ? EVFILT_READ : EVFILT_WRITE, EV_ADD,
-                       NOTE_LOWAT, 1, 0, 0, 0);
-       struct kevent_qos_s events_qos[] = {{
-               .ident = (uint64_t)fd,
-               .filter = reading ? EVFILT_READ : EVFILT_WRITE,
-               .flags = EV_ADD,
-               .fflags = NOTE_LOWAT,
-               .data = 1
-       }, {
-               .ident = 0,
-               .filter = EVFILT_TIMER,
-               .flags = EV_ADD,
-               .fflags = NOTE_SECONDS,
-               .data = READ_TIMEOUT_SECS
-       }};
-
-       /* determine which variant of kevent to use */
-       enum read_mode which_kevent;
-       if (reading) {
-               which_kevent = mode.rd;
-       } else {
-               if (mode.wr == KEVENT_INCREMENTAL_WRITE) {
-                       which_kevent = KEVENT_READ;
-               } else if (mode.wr == KEVENT64_INCREMENTAL_WRITE) {
-                       which_kevent = KEVENT64_READ;
-               } else if (mode.wr == KEVENT_QOS_INCREMENTAL_WRITE) {
-                       which_kevent = KEVENT_QOS_READ;
-               } else {
-                       T_ASSERT_FAIL("unexpected mode: %d", mode.wr);
-                       __builtin_unreachable();
-               }
-       }
-
-       int kq_fd = kqueue();
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(kq_fd, "kqueue");
-
-       switch (which_kevent) {
-       case KEVENT_READ:
-               kev = kevent(kq_fd, &events, 1, NULL, 0, NULL);
-               break;
-       case KEVENT64_READ:
-               kev = kevent64(kq_fd, &events64, 1, NULL, 0, 0, NULL);
-               break;
-       case KEVENT_QOS_READ:
-               kev = kevent_qos(kq_fd, events_qos, 2, NULL, 0, NULL, NULL, 0);
-               break;
-       case POLL_READ: /* FALLTHROUGH */
-       case SELECT_READ: /* FALLTHROUGH */
-       case DISPATCH_READ: /* FALLTHROUGH */
-       case WORKQ_READ: /* FALLTHROUGH */
-       default:
-               T_ASSERT_FAIL("unexpected mode: %d", reading ? mode.rd : mode.wr);
-               break;
-       }
-
-       if (reading) {
-               wake_writer();
-       } else {
-               writer_wait();
-       }
-
-       for (;;) {
-               switch (which_kevent) {
-               case KEVENT_READ:
-                       kev = kevent(kq_fd, NULL, 0, &events, 1, &timeout);
-                       break;
-               case KEVENT64_READ:
-                       kev = kevent64(kq_fd, NULL, 0, &events64, 1, 0, &timeout);
-                       break;
-               case KEVENT_QOS_READ:
-                       kev = kevent_qos(kq_fd, NULL, 0, events_qos, 2, NULL, NULL, 0);
-
-                       /* check for a timeout */
-                       for (int i = 0; i < kev; i++) {
-                               if (events_qos[i].filter == EVFILT_TIMER) {
-                                       kev = 0;
-                               }
-                       }
-                       break;
-               case POLL_READ: /* FALLTHROUGH */
-               case SELECT_READ: /* FALLTHROUGH */
-               case DISPATCH_READ: /* FALLTHROUGH */
-               case WORKQ_READ: /* FALLTHROUGH */
-               default:
-                       T_ASSERT_FAIL("unexpected mode: %d", reading ? mode.rd : mode.wr);
-                       break;
-               }
-
-               if (kev == -1 && errno == EINTR) {
-                       T_LOG("kevent was interrupted");
-                       continue;
-               }
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent");
-               T_QUIET; T_ASSERT_NE(kev, 0, "kevent timed out");
-
-               if (reading) {
-                       if (!handle_reading(fd_pair, fd)) {
-                               break;
-                       }
-               } else {
-                       if (!handle_writing(fd_pair, fd)) {
-                               break;
-                       }
-               }
-       }
-
-       close(kq_fd);
-}
-
-static void *
-write_to_fd(void * __unused ctx)
-{
-       ssize_t bytes_wr = 0;
-
-       writer_wait();
-
-       switch (shared.wr_mode) {
-       case FULL_WRITE:
-               do {
-                       if (bytes_wr == -1) {
-                               T_LOG("write from child was interrupted");
-                       }
-                       bytes_wr = write(shared.wr_fd, EXPECTED_STRING,
-                                       EXPECTED_LEN);
-               } while (bytes_wr == -1 && errno == EINTR);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(bytes_wr, "write");
-               T_QUIET; T_ASSERT_EQ(bytes_wr, (ssize_t)EXPECTED_LEN,
-                               "wrote enough bytes");
-               break;
-
-       case INCREMENTAL_WRITE:
-               for (unsigned int i = 0; i < EXPECTED_LEN ; i++) {
-                       T_QUIET;
-                       T_ASSERT_POSIX_SUCCESS(write(shared.wr_fd,
-                                       &(EXPECTED_STRING[i]), 1), NULL);
-                       usleep(INCREMENTAL_WRITE_SLEEP_USECS);
-               }
-               break;
-
-       case KEVENT_INCREMENTAL_WRITE: /* FALLTHROUGH */
-       case KEVENT64_INCREMENTAL_WRITE: /* FALLTHROUGH */
-       case KEVENT_QOS_INCREMENTAL_WRITE: {
-               union mode mode = { .wr = shared.wr_mode };
-               drive_kq(false, mode, shared.fd_pair, shared.wr_fd);
-               break;
-       }
-
-       case WORKQ_INCREMENTAL_WRITE: {
-               // prohibit ourselves from going multi-threaded; see rdar://33296008
-               _dispatch_prohibit_transition_to_multithreaded(true);
-               int changes = 1;
-
-               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_finished, SYNC_POLICY_FIFO, 0),
-                                     "semaphore_create shared.wr_finished");
-
-               T_QUIET;
-               T_ASSERT_NE_UINT(shared.wr_finished, (unsigned)MACH_PORT_NULL, "wr_finished semaphore_create");
-
-               T_QUIET;
-               T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_kevent(workqueue_fn, workqueue_write_fn, 0, 0), NULL);
-
-               struct kevent_qos_s events[] = {{
-                       .ident = (uint64_t)shared.wr_fd,
-                       .filter = EVFILT_WRITE,
-                       .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
-                       .fflags = NOTE_LOWAT,
-                       .data = 1,
-                       .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
-                                       0, 0)
-               }};
-
-               for (;;) {
-                       int kev = kevent_qos(-1, changes == 0 ? NULL : events, changes,
-                                       events, 1, NULL, NULL,
-                                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
-                       if (kev == -1 && errno == EINTR) {
-                               changes = 0;
-                               T_LOG("kevent_qos was interrupted");
-                               continue;
-                       }
-
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent_qos");
-                       break;
-               }
-               break;
-       }
-
-       case DISPATCH_INCREMENTAL_WRITE: {
-               dispatch_source_t write_src;
-
-               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_finished, SYNC_POLICY_FIFO, 0),
-                                     "semaphore_create shared.wr_finished");
-
-               T_QUIET;
-               T_ASSERT_NE_UINT(shared.wr_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
-
-               write_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE,
-                               (uintptr_t)shared.wr_fd, 0, NULL);
-               T_QUIET; T_ASSERT_NOTNULL(write_src,
-                               "dispatch_source_create(DISPATCH_SOURCE_TYPE_WRITE ...)");
-
-               dispatch_block_t handler = dispatch_block_create_with_qos_class(
-                               DISPATCH_BLOCK_ENFORCE_QOS_CLASS, EXPECTED_QOS, 0, ^{
-                       // T_MAYFAIL;
-                       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
-                                       // "write handler block should run at correct QoS");
-                       if (!handle_writing(shared.fd_pair, shared.wr_fd)) {
-                               /* finished handling the fd, tear down the source */
-                               dispatch_source_cancel(write_src);
-                               dispatch_release(write_src);
-                               T_LOG("signal shared.wr_finished");
-                               semaphore_signal(shared.wr_finished);
-                       }
-               });
-
-               dispatch_source_set_event_handler(write_src, handler);
-               dispatch_activate(write_src);
-
-               break;
-       }
-
-       default:
-               T_ASSERT_FAIL("unrecognized write mode: %d", shared.wr_mode);
-               break;
-       }
-
-       if (shared.wr_finished) {
-               T_LOG("wait shared.wr_finished");
-               kern_return_t kret = semaphore_timedwait(shared.wr_finished, WRITE_timeout);
-               if (kret == KERN_OPERATION_TIMED_OUT) {
-                       T_ASSERT_FAIL("write side semaphore timed out after %d seconds", WRITE_timeout.tv_sec);
-               }
-               T_QUIET;
-               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.wr_finished");
-               semaphore_destroy(mach_task_self(), shared.wr_finished);
-       }
-
-       T_LOG("writer finished, closing fd");
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(close(shared.wr_fd), NULL);
-       return NULL;
-}
-
-#pragma mark reading
-
-#define BUF_LEN 1024
-static char final_string[BUF_LEN];
-static size_t final_length;
-
-/*
- * Read from the read side of the descriptor pair (PTY master, FIFO, pipe, or socket).
- *
- * Returns false if EOF is encountered, and true otherwise.
- */
-static bool
-handle_reading(enum fd_pair fd_pair, int fd)
-{
-       char read_buf[BUF_LEN] = { 0 };
-       ssize_t bytes_rd = 0;
-
-       do {
-               if (bytes_rd == -1) {
-                       T_LOG("read was interrupted, retrying");
-               }
-               bytes_rd = read(fd, read_buf, sizeof(read_buf) - 1);
-       } while (bytes_rd == -1 && errno == EINTR);
-
-       // T_LOG("read %zd bytes: '%s'", bytes_rd, read_buf);
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(bytes_rd, "reading from file");
-       T_QUIET; T_ASSERT_LE(bytes_rd, (ssize_t)EXPECTED_LEN,
-                       "read too much from file");
-
-       if (bytes_rd == 0) {
-               T_LOG("read EOF from file");
-               return false;
-       }
-
-       read_buf[bytes_rd] = '\0';
-       strlcpy(&(final_string[final_length]), read_buf,
-                       sizeof(final_string) - final_length);
-       final_length += (size_t)bytes_rd;
-
-       T_QUIET; T_ASSERT_LE(final_length, EXPECTED_LEN,
-                       "should not read more from file than what can be sent");
-
-       /* FIFOs don't send EOF when the write side closes */
-       if (final_length == strlen(EXPECTED_STRING) &&
-                       (fd_pair == FIFO_PAIR))
-       {
-               T_LOG("read all expected bytes from FIFO");
-               return false;
-       }
-       return true;
-}
-
-static void
-workqueue_read_fn(void ** __unused buf, int * __unused count)
-{
-       // T_MAYFAIL;
-       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
-                       // "reader thread should be requested at correct QoS");
-       if (!handle_reading(shared.fd_pair, shared.rd_fd)) {
-               T_LOG("signal shared.rd_finished");
-               semaphore_signal(shared.rd_finished);
-       }
-
-       reenable_workq(shared.rd_fd, EVFILT_READ);
-}
-
-static void
-read_from_fd(int fd, enum fd_pair fd_pair, enum read_mode mode)
-{
-       int fd_flags;
-
-       T_LOG("reader setting up");
-
-       bzero(final_string, sizeof(final_string));
-
-       fd_flags = fcntl(fd, F_GETFL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(fd_flags, "fcntl(F_GETFL)");
-
-       if (!(fd_flags & O_NONBLOCK)) {
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(fcntl(fd, F_SETFL,
-                       fd_flags | O_NONBLOCK), NULL);
-       }
-
-       switch (mode) {
-       case POLL_READ: {
-               struct pollfd fds[] = { { .fd = fd, .events = POLLIN } };
-               wake_writer();
-
-               for (;;) {
-                       fds[0].revents = 0;
-                       int pol = poll(fds, 1, READ_TIMEOUT_SECS * 1000);
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(pol, "poll");
-                       T_QUIET; T_ASSERT_NE(pol, 0,
-                                       "poll should not time out after %d seconds, read %zd out "
-                                       "of %zu bytes",
-                                       READ_TIMEOUT_SECS, final_length, strlen(EXPECTED_STRING));
-                       T_QUIET; T_ASSERT_FALSE(fds[0].revents & POLLERR,
-                                       "should not see an error on the device");
-                       T_QUIET; T_ASSERT_FALSE(fds[0].revents & POLLNVAL,
-                                       "should not set up an invalid poll");
-
-                       if (!handle_reading(fd_pair, fd)) {
-                               break;
-                       }
-               }
-               break;
-       }
-
-       case SELECT_READ:
-               wake_writer();
-
-               for (;;) {
-                       struct timeval tv = { .tv_sec = READ_TIMEOUT_SECS };
-
-                       fd_set read_fd;
-                       FD_ZERO(&read_fd);
-                       FD_SET(fd, &read_fd);
-                       fd_set err_fd;
-                       FD_ZERO(&err_fd);
-                       FD_SET(fd, &err_fd);
-
-                       int sel = select(fd + 1, &read_fd, NULL, NULL/*&err_fd*/, &tv);
-                       if (sel == -1 && errno == EINTR) {
-                               T_LOG("select interrupted");
-                               continue;
-                       }
-                       (void)fd_pair;
-
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(sel, "select");
-
-                       T_QUIET; T_ASSERT_NE(sel, 0,
-                               "select waited for %d seconds and timed out",
-                               READ_TIMEOUT_SECS);
-
-                       /* didn't fail or time out, therefore data is ready */
-                       T_QUIET; T_ASSERT_NE(FD_ISSET(fd, &read_fd), 0,
-                                       "select should show reading fd as readable");
-
-                       if (!handle_reading(fd_pair, fd)) {
-                               break;
-                       }
-               }
-               break;
-
-       case KEVENT_READ: /* FALLTHROUGH */
-       case KEVENT64_READ: /* FALLTHROUGH */
-       case KEVENT_QOS_READ: {
-               union mode rd_mode = { .rd = shared.rd_mode };
-               drive_kq(true, rd_mode, fd_pair, shared.rd_fd);
-               break;
-       }
-
-       case WORKQ_READ: {
-               // prohibit ourselves from going multi-threaded; see rdar://33296008
-               _dispatch_prohibit_transition_to_multithreaded(true);
-               T_ASSERT_POSIX_ZERO(_pthread_workqueue_init_with_kevent(
-                               workqueue_fn, workqueue_read_fn, 0, 0), NULL);
-
-               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.rd_finished, SYNC_POLICY_FIFO, 0),
-                                     "semaphore_create shared.rd_finished");
-
-               T_QUIET;
-               T_ASSERT_NE_UINT(shared.rd_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
-
-               int changes = 1;
-               struct kevent_qos_s events[] = {{
-                       .ident = (uint64_t)shared.rd_fd,
-                       .filter = EVFILT_READ,
-                       .flags = EV_ADD | EV_UDATA_SPECIFIC | EV_DISPATCH | EV_VANISHED,
-                       .fflags = NOTE_LOWAT,
-                       .data = 1,
-                       .qos = (int32_t)_pthread_qos_class_encode(EXPECTED_QOS,
-                                       0, 0)
-               }};
-
-               for (;;) {
-                       int kev = kevent_qos(-1, changes == 0 ? NULL : events, changes,
-                                       events, 1, NULL, NULL,
-                                       KEVENT_FLAG_WORKQ | KEVENT_FLAG_ERROR_EVENTS);
-                       if (kev == -1 && errno == EINTR) {
-                               changes = 0;
-                               T_LOG("kevent_qos was interrupted");
-                               continue;
-                       }
-
-                       T_QUIET; T_ASSERT_POSIX_SUCCESS(kev, "kevent_qos");
-                       break;
-               }
-
-               wake_writer();
-               break;
-       }
-
-       case DISPATCH_READ: {
-               dispatch_source_t read_src;
-
-               T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.rd_finished, SYNC_POLICY_FIFO, 0),
-                                     "semaphore_create shared.rd_finished");
-
-               T_QUIET;
-               T_ASSERT_NE_UINT(shared.rd_finished, (unsigned)MACH_PORT_NULL, "semaphore_create");
-
-               read_src = dispatch_source_create(DISPATCH_SOURCE_TYPE_READ,
-                               (uintptr_t)fd, 0, NULL);
-               T_QUIET; T_ASSERT_NOTNULL(read_src,
-                               "dispatch_source_create(DISPATCH_SOURCE_TYPE_READ)");
-
-               dispatch_block_t handler = dispatch_block_create_with_qos_class(
-                               DISPATCH_BLOCK_ENFORCE_QOS_CLASS, EXPECTED_QOS, 0, ^{
-                       // T_MAYFAIL;
-                       // T_QUIET; T_ASSERT_EFFECTIVE_QOS_EQ(EXPECTED_QOS,
-                                       // "read handler block should run at correct QoS");
-
-                       if (!handle_reading(fd_pair, fd)) {
-                               /* finished handling the fd, tear down the source */
-                               dispatch_source_cancel(read_src);
-                               dispatch_release(read_src);
-                               T_LOG("signal shared.rd_finished");
-                               semaphore_signal(shared.rd_finished);
-                       }
-               });
-
-               dispatch_source_set_event_handler(read_src, handler);
-               dispatch_activate(read_src);
-
-               wake_writer();
-               break;
-       }
-
-       default:
-               T_ASSERT_FAIL("unrecognized read mode: %d", mode);
-               break;
-       }
-
-       if (shared.rd_finished) {
-               T_LOG("wait shared.rd_finished");
-               kern_return_t kret = semaphore_timedwait(shared.rd_finished, READ_timeout);
-               if (kret == KERN_OPERATION_TIMED_OUT) {
-                       T_ASSERT_FAIL("reading timed out after %d seconds", READ_timeout.tv_sec);
-               }
-               T_QUIET;
-               T_ASSERT_MACH_SUCCESS(kret, "semaphore_timedwait shared.rd_finished");
-       }
-
-       T_EXPECT_EQ_STR(final_string, EXPECTED_STRING,
-                       "reader should receive valid string");
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(close(fd), NULL);
-}
-
-#pragma mark file setup
-
-static void
-fd_pair_init(enum fd_pair fd_pair, int *rd_fd, int *wr_fd)
-{
-       switch (fd_pair) {
-       case PTY_PAIR:
-               T_ASSERT_POSIX_SUCCESS(openpty(rd_fd, wr_fd, NULL, NULL, NULL),
-                               NULL);
-               break;
-
-       case FIFO_PAIR: {
-               char fifo_path[] = "/tmp/async-io-fifo.XXXXXX";
-               T_QUIET; T_ASSERT_NOTNULL(mktemp(fifo_path), NULL);
-
-               T_ASSERT_POSIX_SUCCESS(mkfifo(fifo_path, 0700), "mkfifo(%s, 0700)",
-                               fifo_path);
-               /*
-                * Opening the read side of a pipe will block until the write
-                * side opens -- use O_NONBLOCK.
-                */
-               *rd_fd = open(fifo_path, O_RDONLY | O_NONBLOCK);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(*rd_fd, "open(... O_RDONLY)");
-               *wr_fd = open(fifo_path, O_WRONLY | O_NONBLOCK);
-               T_QUIET; T_ASSERT_POSIX_SUCCESS(*wr_fd, "open(... O_WRONLY)");
-               break;
-       }
-
-       case PIPE_PAIR: {
-               int pipe_fds[2];
-               T_ASSERT_POSIX_SUCCESS(pipe(pipe_fds), NULL);
-               *rd_fd = pipe_fds[0];
-               *wr_fd = pipe_fds[1];
-               break;
-       }
-
-       case SOCKET_PAIR: {
-               int sock_fds[2];
-               T_ASSERT_POSIX_SUCCESS(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds),
-                               NULL);
-               *rd_fd = sock_fds[0];
-               *wr_fd = sock_fds[1];
-               break;
-       }
-
-       default:
-               T_ASSERT_FAIL("unknown descriptor pair type: %d", fd_pair);
-               break;
-       }
-
-       T_QUIET; T_ASSERT_NE(*rd_fd, -1, "reading descriptor");
-       T_QUIET; T_ASSERT_NE(*wr_fd, -1, "writing descriptor");
-}
-
-#pragma mark single process
-
-static void
-drive_threads(enum fd_pair fd_pair, enum read_mode rd_mode,
-               enum write_mode wr_mode)
-{
-       pthread_t thread;
-
-       shared.fd_pair = fd_pair;
-       shared.rd_mode = rd_mode;
-       shared.wr_mode = wr_mode;
-       fd_pair_init(fd_pair, &(shared.rd_fd), &(shared.wr_fd));
-
-       shared.wr_kind = THREAD_WRITER;
-       T_ASSERT_MACH_SUCCESS(semaphore_create(mach_task_self(), &shared.wr_wait.sem, SYNC_POLICY_FIFO, 0),
-                             "semaphore_create shared.wr_wait.sem");
-
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(pthread_create(&thread, NULL, write_to_fd, NULL),
-                       NULL);
-       T_LOG("created writer thread");
-
-       read_from_fd(shared.rd_fd, fd_pair, rd_mode);
-
-       T_ASSERT_POSIX_ZERO(pthread_join(thread, NULL), NULL);
-
-       T_END;
-}
-
-#pragma mark multiple processes
-
-static void __attribute__((noreturn))
-drive_processes(enum fd_pair fd_pair, enum read_mode rd_mode, enum write_mode wr_mode)
-{
-       shared.fd_pair = fd_pair;
-       shared.rd_mode = rd_mode;
-       shared.wr_mode = wr_mode;
-       fd_pair_init(fd_pair, &(shared.rd_fd), &(shared.wr_fd));
-
-       shared.wr_kind = PROCESS_WRITER;
-       int fds[2];
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(fds), NULL);
-       shared.wr_wait.out_fd = fds[0];
-       shared.wr_wait.in_fd = fds[1];
-
-       T_LOG("starting subprocesses");
-       dt_helper_t helpers[2] = {
-               dt_fork_helper("reader_helper"),
-               dt_fork_helper("writer_helper")
-       };
-
-       close(shared.rd_fd);
-       close(shared.wr_fd);
-
-       dt_run_helpers(helpers, 2, 50000);
-}
-
-T_HELPER_DECL(reader_helper, "Read asynchronously")
-{
-       close(shared.wr_fd);
-       read_from_fd(shared.rd_fd, shared.fd_pair, shared.rd_mode);
-       T_END;
-}
-
-T_HELPER_DECL(writer_helper, "Write asynchronously")
-{
-       close(shared.rd_fd);
-       write_to_fd(NULL);
-}
-
-#pragma mark tests
-
-#define WR_DECL_PROCESSES(desc_name, fd_pair, write_name, write_str, \
-                               write_mode, read_name, read_mode) \
-               T_DECL(desc_name##_r##read_name##_w##write_name##_procs, "read changes to a " \
-                               #desc_name " with " #read_name " and writing " #write_str \
-                               " across two processes") \
-               { \
-                       drive_processes(fd_pair, read_mode, write_mode); \
-               }
-#define WR_DECL_THREADS(desc_name, fd_pair, write_name, write_str, \
-                               write_mode, read_name, read_mode) \
-               T_DECL(desc_name##_r##read_name##_w##write_name##_thds, "read changes to a " \
-                               #desc_name " with " #read_name " and writing " #write_str) \
-               { \
-                       drive_threads(fd_pair, read_mode, write_mode); \
-               }
-
-#define WR_DECL(desc_name, fd_pair, write_name, write_str, write_mode, \
-               read_name, read_mode) \
-               WR_DECL_PROCESSES(desc_name, fd_pair, write_name, write_str, \
-                               write_mode, read_name, read_mode) \
-               WR_DECL_THREADS(desc_name, fd_pair, write_name, write_str, \
-                               write_mode, read_name, read_mode)
-
-#define RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
-               WR_DECL(desc_name, fd_pair, full, "the full string", FULL_WRITE, \
-                               read_name, read_mode) \
-               WR_DECL(desc_name, fd_pair, inc, "incrementally", \
-                               INCREMENTAL_WRITE, read_name, read_mode)
-
-#define RD_DECL_DISPATCH_ONLY(suffix, desc_name, fd_pair, read_name, \
-                               read_mode) \
-               WR_DECL##suffix(desc_name, fd_pair, inc_dispatch, \
-                               "incrementally with a dispatch source", \
-                               DISPATCH_INCREMENTAL_WRITE, read_name, read_mode)
-#define RD_DECL_WORKQ_ONLY(suffix, desc_name, fd_pair, read_name, \
-                               read_mode) \
-               WR_DECL##suffix(desc_name, fd_pair, inc_workq, \
-                               "incrementally with the workqueue", \
-                               WORKQ_INCREMENTAL_WRITE, read_name, read_mode)
-
-#define RD_DECL(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_DISPATCH_ONLY(, desc_name, fd_pair, read_name, read_mode)
-               // RD_DECL_WORKQ_ONLY(, desc_name, fd_pair, read_name, read_mode)
-
-/*
- * dispatch_source tests cannot share the same process as other workqueue
- * tests.
- */
-#define RD_DECL_DISPATCH(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_DISPATCH_ONLY(, desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_WORKQ_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
-                               read_mode)
-
-/*
- * Workqueue tests cannot share the same process as other workqueue or
- * dispatch_source tests.
-#define RD_DECL_WORKQ(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_SAFE(desc_name, fd_pair, read_name, read_mode) \
-               RD_DECL_DISPATCH_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
-                               read_mode) \
-               RD_DECL_WORKQ_ONLY(_PROCESSES, desc_name, fd_pair, read_name, \
-                               read_mode)
- */
-
-#define PAIR_DECL(desc_name, fd_pair) \
-       RD_DECL(desc_name, fd_pair, poll, POLL_READ) \
-       RD_DECL(desc_name, fd_pair, select, SELECT_READ) \
-       RD_DECL(desc_name, fd_pair, kevent, KEVENT_READ) \
-       RD_DECL(desc_name, fd_pair, kevent64, KEVENT64_READ) \
-       RD_DECL(desc_name, fd_pair, kevent_qos, KEVENT_QOS_READ) \
-       RD_DECL_DISPATCH(desc_name, fd_pair, dispatch_source, DISPATCH_READ)
-       // RD_DECL_WORKQ(desc_name, fd_pair, workq, WORKQ_READ)
-
-PAIR_DECL(tty, PTY_PAIR)
-PAIR_DECL(pipe, PIPE_PAIR)
-PAIR_DECL(fifo, FIFO_PAIR)
-PAIR_DECL(socket, SOCKET_PAIR)
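
For readability, the macro stack above generates darwintest entries along the following lines; this is an approximate hand-expansion of one leaf of PAIR_DECL(tty, PTY_PAIR) (the poll reader combined with the full-string writer, threads variant), not additional source from the change:

    T_DECL(tty_rpoll_wfull_thds,
                    "read changes to a tty with poll and writing \"the full string\"")
    {
            drive_threads(PTY_PAIR, POLL_READ, FULL_WRITE);
    }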
diff --git a/tools/tests/darwintests/private_entitlement.plist b/tools/tests/darwintests/private_entitlement.plist
deleted file mode 100644 (file)
index 6f5cece..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.entitlement-1</key>
-       <string>something</string>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/proc_core_name_24152432.c b/tools/tests/darwintests/proc_core_name_24152432.c
deleted file mode 100644 (file)
index aad5ee6..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-#include <darwintest.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/sysctl.h>
-#include <sys/resource.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
-#include <TargetConditionals.h>
-#include <unistd.h>
-
-#define BUFFLEN 2048
-#define EVILLEN 19
-
-static const char corefile_ctl[]     = "kern.corefile";
-static const char coredump_ctl[]     = "kern.coredump";
-/* The default coredump location if the kern.coredump ctl is invalid */
-static const char default_dump_fmt[] = "/cores/core.%d";
-/* The coredump location when we set kern.coredump ctl to something valid */
-static const char valid_dump_fmt[]   = "/cores/test-core.%d";
-
-/* /cores/core.%(null), then BORK immediately after. */
-static char evil[] = {'/', 'c', 'o', 'r', 'e', 's', '/', 'c', 'o', 'r', 'e', '.', '%', '\0', 'B', 'O', 'R', 'K', '\0'};
-/* A valid coredump location to test. */
-static char valid_dump_loc[]   = "/cores/test-core.%P";
-
-static const struct rlimit lim_infty = {
-       RLIM_INFINITY,
-       RLIM_INFINITY
-};
-
-#if TARGET_OS_OSX
-static int fork_and_wait_for_segfault(void);
-
-static int fork_and_wait_for_segfault() {
-       int pid, ret;
-       pid = fork();
-       if (pid == 0) {
-               unsigned int *ptr = NULL; /* Cause a segfault so that we get a coredump */
-               *ptr = 0xdeadd00d;
-               T_FAIL("Expected segmentation fault on write to NULL pointer");
-       }
-       T_ASSERT_TRUE(pid != -1, "Checking fork success in parent");
-
-       ret = wait(NULL);
-       T_ASSERT_TRUE(ret != -1, "Waited for child to segfault and dump core");
-       return pid;
-}
-#endif
-
-T_DECL(
-    proc_core_name_24152432,
-    "Tests behavior of core dump when kern.corefile ends in %, e.g., /cores/core.%",
-    T_META_ASROOT(true))
-{
-#if TARGET_OS_OSX
-       int ret, pid;
-       int enable_core_dump = 1;
-       char buf[BUFFLEN];
-       memset(buf, 0, BUFFLEN);
-       size_t oldlen = BUFFLEN;
-
-       ret = sysctlbyname(coredump_ctl, buf, &oldlen, &enable_core_dump, sizeof(int));
-       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: enable core dumps");
-       memset(buf, 0, BUFFLEN);
-       oldlen = BUFFLEN;
-
-       ret = setrlimit(RLIMIT_CORE, &lim_infty);
-       T_ASSERT_POSIX_SUCCESS(ret, "setrlimit: remove limit on maximum coredump size");
-
-       ret = sysctlbyname(corefile_ctl, buf, &oldlen, evil, EVILLEN);
-       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set bad core dump location, old value was %s", buf);
-       memset(buf, 0, BUFFLEN);
-       oldlen = BUFFLEN;
-
-       pid = fork_and_wait_for_segfault();
-
-       snprintf(buf, BUFFLEN, default_dump_fmt, pid);
-       ret = remove(buf);
-       T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in fallback location)");
-       memset(buf, 0, BUFFLEN);
-
-       ret = sysctlbyname(corefile_ctl, buf, &oldlen, valid_dump_loc, strlen(valid_dump_loc));
-       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: set valid core dump location, old value was %s", buf);
-       memset(buf, 0, BUFFLEN);
-
-       pid = fork_and_wait_for_segfault();
-
-       snprintf(buf, BUFFLEN, valid_dump_fmt, pid);
-       ret = remove(buf);
-       T_ASSERT_TRUE(ret != -1, "Removing coredump file (should be in valid location)");
-#else
-       T_LOG("proc_core_name appears in OS X only, skipping test.");
-#endif
-       T_PASS("proc_core_name_24152432 PASSED");
-}
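
The test above drives kern.corefile entirely through sysctlbyname. A minimal stand-alone sketch of that get-and-set round trip is shown below; it is illustrative only, the replacement template is an arbitrary example, and writing the sysctl requires root.

    #include <stdio.h>
    #include <string.h>
    #include <sys/sysctl.h>

    int
    main(void)
    {
            char old_path[2048] = { 0 };
            size_t old_len = sizeof(old_path);
            const char new_path[] = "/cores/test-core.%P";   /* example template */

            /* Read the current core-file template and install a new one in one call. */
            if (sysctlbyname("kern.corefile", old_path, &old_len,
                            (void *)new_path, strlen(new_path)) == -1) {
                    perror("sysctlbyname(kern.corefile)");
                    return 1;
            }

            printf("kern.corefile was '%s', now '%s'\n", old_path, new_path);
            return 0;
    }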
diff --git a/tools/tests/darwintests/proc_info.c b/tools/tests/darwintests/proc_info.c
deleted file mode 100644 (file)
index 3a1e738..0000000
+++ /dev/null
@@ -1,2092 +0,0 @@
-#define PRIVATE
-#include <System/sys/kdebug.h>
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <dispatch/dispatch.h>
-#include <fcntl.h>
-#include <inttypes.h>
-#include <libproc.h>
-#include <limits.h>
-#include <mach/mach.h>
-#include <mach/policy.h>
-#include <os/assumes.h>
-#include <os/overflow.h>
-#include <pthread.h>
-#include <signal.h>
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/event.h>
-#include <sys/mman.h>
-#include <sys/proc_info.h>
-#include <sys/stat.h>
-#include <sys/sysctl.h>
-#include <sys/vnode.h>
-#include <unistd.h>
-#undef PRIVATE
-
-#define ACT_CHANGE_UID 1
-#define ACT_CHANGE_RUID 2
-#define ACT_EXIT 127
-
-#define ACT_PHASE2 2
-#define ACT_PHASE3 3
-#define ACT_PHASE4 4
-#define ACT_PHASE5 5
-
-#define PIPE_IN 0
-#define PIPE_OUT 1
-
-#define CONF_THREAD_NAME "test_child_thread"
-#define CONF_CMD_NAME getprogname()
-#define CONF_PROC_COUNT 20
-#define CONF_BLK_SIZE 4096
-#define CONF_UID_VAL 999U
-#define CONF_RUID_VAL 998U
-#define CONF_GID_VAL 997U
-#define CONF_NICE_VAL 5
-#define CONF_NUM_THREADS 2
-
-#define BASEPRI_DEFAULT 31
-#define MAXPRI_USER 63
-
-#define CONF_OPN_FILE_COUNT 3
-#define CONF_TMP_FILE_PATH "/tmp/testfile"
-
-uint32_t get_tty_dev(void);
-
-#define WAIT_FOR_CHILDREN(pipefd, action, child_count)                           \
-       do {                                                                         \
-               long ret;                                                                \
-               if (child_count == 1) {                                                  \
-                       int child_ret_action = 999;                                          \
-                       while (child_ret_action != action) {                                 \
-                               ret = read(pipefd, &child_ret_action, sizeof(child_ret_action)); \
-                       }                                                                    \
-               } else {                                                                 \
-                       int child_ready_count = child_count * (int)sizeof(action);           \
-                                                                                 \
-                       action = 0;                                                          \
-                       while (child_ready_count) {                                          \
-                               ret = read(pipefd, &action, (int)sizeof(action));                \
-                               if (ret != -1) {                                                 \
-                                       child_ready_count -= ret;                                    \
-                               } else {                                                         \
-                                       T_FAIL("ERROR: Could not read from pipe() : %d", errno);     \
-                               }                                                                \
-                               if (action) {                                                    \
-                                       T_FAIL("ERROR: Child action failed with error %d", action);  \
-                               }                                                                \
-                       }                                                                    \
-               }                                                                        \
-       } while (0)
-
-#define PROC_INFO_CALL(struct_name, pid, flavor, proc_arg)                                                     \
-       do {                                                                                                       \
-               struct struct_name * struct_var = malloc(sizeof(struct struct_name));                                  \
-               T_QUIET;                                                                                               \
-               T_ASSERT_NOTNULL(struct_var, "malloc() for " #flavor);                                                 \
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, pid, flavor, (uint64_t)proc_arg, (user_addr_t)struct_var, \
-                                    (uint32_t)sizeof(struct struct_name));                                            \
-                                                                                                               \
-               T_QUIET;                                                                                               \
-               T_EXPECT_POSIX_SUCCESS(retval, "__proc_info call for " #flavor);                                       \
-               T_ASSERT_EQ_INT(retval, (int)sizeof(struct struct_name), "__proc_info call for " #flavor);             \
-               ret_structs[i] = (void *)struct_var;                                                                   \
-               i++;                                                                                                   \
-       } while (0)
-
-uint32_t
-get_tty_dev()
-{
-       struct stat buf;
-       stat(ttyname(1), &buf);
-       return ((uint32_t)buf.st_rdev);
-}
-
-/*
- * Defined in libsyscall/wrappers/libproc/libproc.c
- * For API test only. For normal use, please use the libproc API instead.
- * DO NOT COPY
- */
-extern int __proc_info(int32_t callnum, int32_t pid, uint32_t flavor, uint64_t arg, user_addr_t buffer, int32_t buffersize);
-struct proc_config_s {
-       int parent_pipe[2];
-       int child_count;
-       pid_t proc_grp_id;
-       int child_pipe[CONF_PROC_COUNT][2];
-       int child_pids[CONF_PROC_COUNT];
-       void * cow_map; /* memory for cow test */
-};
-typedef struct proc_config_s * proc_config_t;
-
-typedef void (^child_action_handler_t)(proc_config_t proc_config, int child_id);
-
-enum proc_info_opt {
-       P_UNIQIDINFO    = 0x01,
-       C_UNIQIDINFO    = 0x02,
-       PBSD_OLD        = 0x04,
-       PBSD            = 0x08,
-       PBSD_SHORT      = 0x10,
-       PBSD_UNIQID     = 0x20,
-       P_TASK_INFO     = 0x40,
-       P_TASK_INFO_NEW = 0x80,
-       PALL            = 0x100,
-       THREAD_ADDR     = 0x200,
-       PTHINFO_OLD     = 0x400,
-       PTHINFO         = 0x800,
-       PTHINFO_64      = 0x1000,
-       PINFO_PATH      = 0x2000,
-       PAI             = 0x4000,
-       PREGINFO        = 0x8000,
-       PREGINFO_PATH   = 0x10000,
-       PREGINFO_PATH_2 = 0x20000,
-       PREGINFO_PATH_3 = 0x40000,
-       PVNINFO         = 0x80000
-};
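
These flags are combined into a bitmask and handed to the proc_info_caller() helper defined later in this file. A hypothetical call that fetches the unique-identifier info for both the calling process and the spawned child might look roughly like this (names taken from this file; the array size is chosen for illustration):

    void *ret_structs[2];
    int child_pid;

    /* P_UNIQIDINFO | C_UNIQIDINFO selects two PROC_PIDUNIQIDENTIFIERINFO calls,
     * one for getpid() and one for the forked child. */
    proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, ret_structs, &child_pid);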
-
-static int tmp_fd = -1;
-
-static child_action_handler_t proc_info_listpids_handler = ^void(proc_config_t proc_config, int child_id) {
-  close(proc_config->parent_pipe[PIPE_IN]);
-  close(proc_config->child_pipe[child_id][PIPE_OUT]);
-  long retval      = 0;
-  int child_action = 0;
-  retval           = write(proc_config->parent_pipe[PIPE_OUT], &child_action, sizeof(child_action));
-  if (retval != -1) {
-         while (child_action != ACT_EXIT) {
-                 retval = read(proc_config->child_pipe[child_id][PIPE_IN], &child_action, sizeof(child_action));
-                 if (retval == 0 || (retval == -1 && errno == EAGAIN)) {
-                         continue;
-                 }
-                 if (retval != -1) {
-                         switch (child_action) {
-                         case ACT_CHANGE_UID:
-                                 /*
-                                  * Change uid
-                                  */
-                                 retval = setuid(CONF_UID_VAL);
-                                 break;
-                         case ACT_CHANGE_RUID:
-                                 /*
-                                  * Change ruid
-                                  */
-                                 retval = setreuid(CONF_RUID_VAL, (uid_t)-1);
-                                 break;
-                         case ACT_EXIT:
-                                 /*
-                                  * Exit
-                                  */
-                                 break;
-                         }
-                 }
-                 if (child_action != ACT_EXIT) {
-                         retval = write(proc_config->parent_pipe[PIPE_OUT], &retval, sizeof(retval));
-                         if (retval == -1)
-                                 break;
-                 }
-         }
-  }
-  close(proc_config->parent_pipe[PIPE_OUT]);
-  close(proc_config->child_pipe[child_id][PIPE_IN]);
-  exit(0);
-};
-
-static child_action_handler_t proc_info_call_pidinfo_handler = ^void(proc_config_t proc_config, int child_id) {
-  close(proc_config->parent_pipe[PIPE_IN]);
-  close(proc_config->child_pipe[child_id][PIPE_OUT]);
-  int action  = 0;
-  long retval = 0;
-  int i;
-  void * tmp_map           = NULL;
-  dispatch_queue_t q       = NULL;
-  dispatch_semaphore_t sem = NULL;
-  /*
-   * PHASE 1: Child ready and waits for parent to send next action
-   */
-  T_LOG("Child ready to accept action from parent");
-  retval = write(proc_config->parent_pipe[PIPE_OUT], &action, sizeof(action));
-  if (retval != -1) {
-         while (action != ACT_EXIT) {
-                 retval = read(proc_config->child_pipe[child_id][PIPE_IN], &action, sizeof(action));
-
-                 if (retval != -1) {
-                         retval = 0;
-                         switch (action) {
-                         case ACT_PHASE2: {
-                                 /*
-                                  * Change uid, euid, guid, rgid, nice value
-                                  * Also change the svuid and svgid
-                                  */
-                                 T_LOG("Child changing uid, euid, rguid, svuid, svgid and nice value");
-                                 retval = nice(CONF_NICE_VAL);
-                                 if (retval == -1) {
-                                         T_LOG("(child) ERROR: nice() failed");
-                                         break;
-                                 }
-                                 retval = setgid(CONF_GID_VAL);
-                                 if (retval == -1) {
-                                         T_LOG("(child) ERROR: setgid() failed");
-                                         break;
-                                 }
-                                 retval = setreuid((uid_t)-1, CONF_RUID_VAL);
-                                 if (retval == -1) {
-                                         T_LOG("(child) ERROR: setreuid() failed");
-                                         break;
-                                 }
-                                 break;
-                         }
-                         case ACT_PHASE3: {
-                                 /*
-                                  * Allocate a page of memory
-                                  * Copy on write shared memory
-                                  */
-                                 T_LOG("Child allocating a page of memory, and causing a copy-on-write");
-                                 retval  = 0;
-                                 tmp_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
-                                 if (tmp_map == MAP_FAILED) {
-                                         T_LOG("(child) ERROR: mmap() failed");
-                                         retval = 1;
-                                         break;
-                                 }
-                                 /*
-                                  * Get the page allocated
-                                  */
-                                 int * map_ptr = (int *)tmp_map;
-                                 for (i = 0; i < (int)(PAGE_SIZE / sizeof(int)); i++) {
-                                         *map_ptr++ = i;
-                                 }
-                                 /*
-                                  * Cause copy on write to the page
-                                  */
-                                 *((int *)(proc_config->cow_map)) = 20;
-
-                                 break;
-                         }
-                         case ACT_PHASE4: {
-                                 T_LOG("Child spending CPU cycles and changing thread name");
-                                 retval                       = 0;
-                                 int number                   = 1000;
-                                 unsigned long long factorial = 1;
-                                 int j;
-                                 for (j = 1; j <= number; j++) {
-                                         factorial *= (unsigned long long)j;
-                                 }
-                                 sysctlbyname("kern.threadname", NULL, 0, CONF_THREAD_NAME, strlen(CONF_THREAD_NAME));
-                                 break;
-                         }
-                         case ACT_PHASE5: {
-                                 /*
-                                  * Dispatch for Workq test
-                                  */
-                                 T_LOG("Child creating a dispatch queue, and dispatching blocks on it");
-                                 q = dispatch_queue_create("com.apple.test_proc_info.workqtest",
-                                                               DISPATCH_QUEUE_CONCURRENT); // dispatch_get_global_queue(0, 0);
-                                 sem = dispatch_semaphore_create(0);
-
-                                 for (i = 0; i < CONF_NUM_THREADS; i++) {
-                                         dispatch_async(q, ^{
-                                               /*
-                                                * Block the thread, do nothing
-                                                */
-                                               dispatch_semaphore_wait(sem, DISPATCH_TIME_FOREVER);
-                                         });
-                                 }
-                                 break;
-                         }
-                         case ACT_EXIT: {
-                                 /*
-                                  * Exit
-                                  */
-                                 if (sem) {
-                                         for (i = 0; i < CONF_NUM_THREADS; i++) {
-                                                 dispatch_semaphore_signal(sem);
-                                         }
-                                 }
-
-                                 if (tmp_map)
-                                         munmap(tmp_map, PAGE_SIZE);
-
-                                 if (proc_config->cow_map)
-                                         munmap(proc_config->cow_map, PAGE_SIZE);
-
-                                 break;
-                         }
-                         }
-                 }
-                 if (action != ACT_EXIT) {
-                         retval = write(proc_config->parent_pipe[PIPE_OUT], &action, sizeof(action));
-                         if (retval == -1)
-                                 break;
-                 }
-         }
-         close(proc_config->parent_pipe[PIPE_OUT]);
-         close(proc_config->child_pipe[child_id][PIPE_IN]);
-         exit(0);
-  }
-};
-
-static void
-free_proc_config(proc_config_t proc_config)
-{
-       free(proc_config);
-}
-
-static void
-send_action_to_child_processes(proc_config_t proc_config, int action)
-{
-       long err;
-       for (int i = 0; i < proc_config->child_count; i++) {
-               err = write(proc_config->child_pipe[i][PIPE_OUT], &action, sizeof(action));
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(err, "write() to child in send_action");
-       }
-       if (action != ACT_EXIT) {
-               WAIT_FOR_CHILDREN(proc_config->parent_pipe[PIPE_IN], action, proc_config->child_count);
-       }
-}
-
-static void
-kill_child_processes(proc_config_t proc_config)
-{
-       int ret = 0;
-       T_LOG("Killing child processes");
-       send_action_to_child_processes(proc_config, ACT_EXIT);
-       for (int child_id = 0; child_id < proc_config->child_count; child_id++) {
-               close(proc_config->child_pipe[child_id][PIPE_OUT]);
-               dt_waitpid(proc_config->child_pids[child_id], NULL, NULL, 5);
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(ret, "killed child %d", child_id);
-       }
-       close(proc_config->parent_pipe[PIPE_IN]);
-       munmap(proc_config->cow_map, PAGE_SIZE);
-       T_LOG("Killed child processes");
-}
-
-static proc_config_t
-spawn_child_processes(int child_count, child_action_handler_t child_handler)
-{
-       /*
-        * Spawn procs for Tests 1.2 and 1.3
-        */
-       T_LOG("Spawning child processes...");
-       proc_config_t proc_config = malloc(sizeof(*proc_config));
-       int action                = 0;
-       int err;
-
-       setpgid(0, 0);
-       proc_config->proc_grp_id = getpgid(0);
-
-       proc_config->child_count = child_count;
-
-       err = pipe(proc_config->parent_pipe);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(err, "pipe() call");
-
-       /*
-        * Needed for ACT_PHASE3 tests
-        */
-       proc_config->cow_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
-       T_QUIET;
-       T_ASSERT_NE_PTR(proc_config->cow_map, MAP_FAILED, "cow_map mmap()");
-       *((int *)(proc_config->cow_map)) = 10;
-
-       pid_t child_pid;
-       int i;
-       int child_id;
-       for (i = 0; i < child_count; i++) {
-               err = pipe(proc_config->child_pipe[i]);
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(err, "pipe() call");
-
-               child_pid = fork();
-               child_id  = i;
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(child_pid, "fork() in parent process for child %d", child_id);
-
-               if (child_pid == 0) {
-                       child_handler(proc_config, child_id);
-               } else {
-                       proc_config->child_pids[child_id] = child_pid;
-               }
-               close(proc_config->child_pipe[child_id][PIPE_IN]);
-       }
-       /*
-        * Wait for the child processes to spawn
-        */
-       close(proc_config->parent_pipe[PIPE_OUT]);
-       WAIT_FOR_CHILDREN(proc_config->parent_pipe[PIPE_IN], action, child_count);
-
-       return proc_config;
-}
-
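The spawn/wait handshake above relies on the WAIT_FOR_CHILDREN macro and the parent_pipe/child_pipe layout defined earlier in this file (not shown in this hunk). As a rough sketch of the same pattern, and not the test's actual macro, a parent can confirm that every child has acknowledged a phase by reading one ack per child from a shared pipe; wait_for_acks below is a hypothetical helper:

    /* Minimal sketch, not the test's WAIT_FOR_CHILDREN macro: read one ack per child. */
    #include <unistd.h>

    static int
    wait_for_acks(int read_fd, int expected_action, int child_count)
    {
            int acks = 0;
            int action = 0;
            while (acks < child_count) {
                    ssize_t n = read(read_fd, &action, sizeof(action));
                    if (n != (ssize_t)sizeof(action) || action != expected_action) {
                            return -1;      /* short read or unexpected phase */
                    }
                    acks++;
            }
            return 0;
    }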
-/*
- *  All PROC_INFO_CALL_PIDINFO __proc_info calls fire from this function.
- *  T_DECLs require different combinations of structs and different actions
- *  must occur in the child to get the data.  Instead of performing the setup
- *  in each T_DECL, this function accepts a bitmap and performs the necessary setup
- *  and cleanup work
- */
-
-static void
-proc_info_caller(int proc_info_opts, void ** ret_structs, int * ret_child_pid)
-{
-       int retval, i = 0;
-       uint64_t * thread_addr = NULL;
-       void * map_tmp         = NULL;
-
-       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid             = proc_config->child_pids[0];
-       /*
-        * These tests only require one child.
-        * Some DECLs need to know the child pid, so we pass that back if applicable
-        */
-       if (ret_child_pid != NULL) {
-               *ret_child_pid = child_pid;
-       }
-
-       if (proc_info_opts & P_UNIQIDINFO) {
-               PROC_INFO_CALL(proc_uniqidentifierinfo, getpid(), PROC_PIDUNIQIDENTIFIERINFO, 0);
-       }
-       if (proc_info_opts & C_UNIQIDINFO) {
-               PROC_INFO_CALL(proc_uniqidentifierinfo, child_pid, PROC_PIDUNIQIDENTIFIERINFO, 0);
-       }
-       if (proc_info_opts & PBSD_OLD) {
-               PROC_INFO_CALL(proc_bsdinfo, child_pid, PROC_PIDTBSDINFO, 0);
-       }
-
-       /*
-        * Child Phase 2 Fires if opts require it
-        * Small nap after call to give child time to receive and execute the action
-        */
-
-       if (proc_info_opts >= PBSD) {
-               send_action_to_child_processes(proc_config, ACT_PHASE2);
-       }
-
-       if (proc_info_opts & PBSD) {
-               PROC_INFO_CALL(proc_bsdinfo, child_pid, PROC_PIDTBSDINFO, 0);
-       }
-
-       if (proc_info_opts & PBSD_SHORT) {
-               PROC_INFO_CALL(proc_bsdshortinfo, child_pid, PROC_PIDT_SHORTBSDINFO, 0);
-       }
-
-       if (proc_info_opts & PBSD_UNIQID) {
-               PROC_INFO_CALL(proc_bsdinfowithuniqid, child_pid, PROC_PIDT_BSDINFOWITHUNIQID, 0);
-       }
-       if (proc_info_opts & P_TASK_INFO) {
-               PROC_INFO_CALL(proc_taskinfo, child_pid, PROC_PIDTASKINFO, 0);
-       }
-
-       /*
-        * Child Phase 3 Fires
-        */
-       if (proc_info_opts >= P_TASK_INFO_NEW) {
-               send_action_to_child_processes(proc_config, ACT_PHASE3);
-       }
-
-       if (proc_info_opts & P_TASK_INFO_NEW) {
-               PROC_INFO_CALL(proc_taskinfo, child_pid, PROC_PIDTASKINFO, 0);
-       }
-
-       if (proc_info_opts & PALL) {
-               PROC_INFO_CALL(proc_taskallinfo, child_pid, PROC_PIDTASKALLINFO, 0);
-       }
-       /*
-        * This case breaks the pattern: its proc_info call requires PALL, its result is needed
-        * by some of the later proc_info calls, and we never put the returned struct into ret_structs.
-        */
-       if (proc_info_opts & THREAD_ADDR || proc_info_opts & PTHINFO_OLD || proc_info_opts & PTHINFO || proc_info_opts & PINFO_PATH) {
-               struct proc_taskallinfo * pall = malloc(sizeof(struct proc_taskallinfo));
-               T_QUIET;
-               T_ASSERT_NOTNULL(pall, "malloc() for PROC_TASKALLINFO");
-
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDTASKALLINFO, (uint32_t)0, (user_addr_t)pall,
-                                    (uint32_t)sizeof(struct proc_taskallinfo));
-               T_QUIET;
-               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_taskallinfo), "__proc_info call for PROC_PIDTASKALLINFO in THREAD_ADDR");
-
-               thread_addr = malloc(sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1));
-               T_QUIET;
-               T_ASSERT_NOTNULL(thread_addr, "malloc() for PROC_PIDLISTTHREADS");
-               memset(thread_addr, 0, sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1));
-
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDLISTTHREADS, (uint32_t)0, (user_addr_t)thread_addr,
-                                    (int32_t)(sizeof(uint64_t) * (unsigned long)(pall->ptinfo.pti_threadnum + 1)));
-               T_LOG("(int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE: %d",
-                     (int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE));
-               T_ASSERT_GE_INT((int)((unsigned long)retval / PROC_PIDLISTTHREADS_SIZE), pall->ptinfo.pti_threadnum,
-                               "__proc_info call for PROC_PIDLISTTHREADS");
-
-               free(pall);
-       }
-       if (proc_info_opts & PTHINFO_OLD) {
-               PROC_INFO_CALL(proc_threadinfo, child_pid, PROC_PIDTHREADINFO, thread_addr[0]);
-       }
-
-       /*
-        * Child Phase 4 Fires
-        */
-       if (proc_info_opts >= PTHINFO) {
-               send_action_to_child_processes(proc_config, ACT_PHASE4);
-       }
-
-       if (proc_info_opts & PTHINFO) {
-               PROC_INFO_CALL(proc_threadinfo, child_pid, PROC_PIDTHREADINFO, thread_addr[0]);
-       }
-       if (proc_info_opts & PTHINFO_64) {
-               mach_port_name_t child_task  = MACH_PORT_NULL;
-               thread_array_t child_threads = NULL;
-               mach_msg_type_number_t child_thread_count;
-               thread_identifier_info_data_t child_thread_threadinfo;
-               mach_msg_type_number_t thread_info_count = THREAD_IDENTIFIER_INFO_COUNT;
-               struct proc_threadinfo * pthinfo_64      = malloc(sizeof(struct proc_threadinfo));
-               T_QUIET;
-               T_ASSERT_NOTNULL(pthinfo_64, "malloc() for PROC_THREADINFO");
-
-               retval = task_for_pid(mach_task_self(), child_pid, &child_task);
-               T_ASSERT_EQ_INT(retval, 0, "task_for_pid for PROC_PIDTHREADID64INFO");
-
-               retval = task_threads(child_task, &child_threads, &child_thread_count);
-               T_ASSERT_MACH_SUCCESS(retval, "task_threads() call for PROC_PIDTHREADID64INFO");
-
-               retval = thread_info(child_threads[0], THREAD_IDENTIFIER_INFO, (thread_info_t)&child_thread_threadinfo, &thread_info_count);
-               T_ASSERT_MACH_SUCCESS(retval, "thread_info call for PROC_PIDTHREADID64INFO");
-
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDTHREADID64INFO, (uint64_t)child_thread_threadinfo.thread_id,
-                                    (user_addr_t)pthinfo_64, (uint32_t)sizeof(struct proc_threadinfo));
-               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_threadinfo), "__proc_info call for PROC_PIDTHREADID64INFO");
-
-               ret_structs[i] = (void *)pthinfo_64;
-               i++;
-
-               mach_port_deallocate(mach_task_self(), child_task);
-               mach_port_deallocate(mach_task_self(), child_threads[0]);
-               child_threads[0] = MACH_PORT_NULL;
-               child_task       = MACH_PORT_NULL;
-       }
-       if (proc_info_opts & PINFO_PATH) {
-               PROC_INFO_CALL(proc_threadwithpathinfo, child_pid, PROC_PIDTHREADPATHINFO, thread_addr[0]);
-       }
-
-       if (proc_info_opts & PAI) {
-               PROC_INFO_CALL(proc_archinfo, getpid(), PROC_PIDARCHINFO, 0);
-       }
-
-       if ((proc_info_opts & PREGINFO) | (proc_info_opts & PREGINFO_PATH) | (proc_info_opts & PREGINFO_PATH_2) |
-           (proc_info_opts & PREGINFO_PATH_3)) {
-               tmp_fd = open(CONF_TMP_FILE_PATH, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
-
-               for (int j = 0; j < 100; j++) {
-                       char buf[50];
-                       write(tmp_fd, buf, sizeof(buf));
-               }
-               retval = fsync(tmp_fd);
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(retval, "file fsync()");
-
-               map_tmp = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_PRIVATE, tmp_fd, (off_t)PAGE_SIZE);
-               T_ASSERT_NE_PTR(map_tmp, MAP_FAILED, "mmap() for PROC_PIDREGIONINFO");
-
-               T_LOG("file: %s is opened as fd %d and mapped at %llx with size %lu", CONF_TMP_FILE_PATH, tmp_fd, (uint64_t)map_tmp,
-                     (unsigned long)PAGE_SIZE);
-       }
-
-       if (proc_info_opts & PREGINFO) {
-               PROC_INFO_CALL(proc_regioninfo, getpid(), PROC_PIDREGIONINFO, map_tmp);
-               ret_structs[i] = map_tmp;
-               i++;
-       }
-       if (proc_info_opts & PREGINFO_PATH) {
-               PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO, map_tmp);
-               ret_structs[i] = map_tmp;
-               i++;
-       }
-       if (proc_info_opts & PREGINFO_PATH_2) {
-               PROC_INFO_CALL(proc_regionwithpathinfo, getpid(), PROC_PIDREGIONPATHINFO2, map_tmp);
-               ret_structs[i] = map_tmp;
-               i++;
-       }
-
-       if (proc_info_opts & PREGINFO_PATH_3) {
-               struct proc_regionwithpathinfo * preginfo_path = malloc(sizeof(struct proc_regionwithpathinfo));
-
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDREGIONPATHINFO2, (uint64_t)map_tmp,
-                                    (user_addr_t)preginfo_path, (uint32_t)sizeof(struct proc_regionwithpathinfo));
-
-               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_regionwithpathinfo), "__proc_info call for PROC_PIDREGIONPATHINFO2");
-
-               T_LOG("preginfo_path.prp_vip.vip_vi.vi_fsid.val 0: %d", preginfo_path->prp_vip.vip_vi.vi_fsid.val[0]);
-               T_LOG("preginfo_path.prp_vip.vip_vi.vi_fsid.val 1: %d", preginfo_path->prp_vip.vip_vi.vi_fsid.val[1]);
-
-               retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDREGIONPATHINFO3,
-                                    (uint64_t)(*(uint32_t *)(preginfo_path->prp_vip.vip_vi.vi_fsid.val)), (user_addr_t)preginfo_path,
-                                    (uint32_t)sizeof(struct proc_regionwithpathinfo));
-               T_ASSERT_EQ_INT(retval, (int)sizeof(struct proc_regionwithpathinfo), "__proc_info call for PROC_PIDREGIONPATHINFO3");
-               ret_structs[i] = (void *)preginfo_path;
-               i++;
-       }
-
-       if (proc_info_opts & PVNINFO) {
-               PROC_INFO_CALL(proc_vnodepathinfo, getpid(), PROC_PIDVNODEPATHINFO, 0);
-       }
-
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(thread_addr);
-       thread_addr = NULL;
-       close(tmp_fd);
-       tmp_fd = -1;
-}
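The PROC_INFO_CALL macro used throughout proc_info_caller wraps the raw __proc_info syscall so the test can check the exact byte count copied out. Outside the test, the same flavors are reachable through the public <libproc.h> wrapper proc_pidinfo(); show_bsdinfo below is a hypothetical sketch of that, not part of the test:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <unistd.h>
    #include <stdio.h>

    /* Fetch PROC_PIDTBSDINFO for a process via the public libproc wrapper. */
    static int
    show_bsdinfo(pid_t pid)
    {
            struct proc_bsdinfo bsd;
            int ret = proc_pidinfo(pid, PROC_PIDTBSDINFO, 0, &bsd, sizeof(bsd));
            if (ret != (int)sizeof(bsd)) {
                    return -1;      /* failed or short copyout */
            }
            printf("pid %u ppid %u comm %s\n", bsd.pbi_pid, bsd.pbi_ppid, bsd.pbi_comm);
            return 0;
    }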
-
-static void
-free_proc_info(void ** proc_info, int num)
-{
-       for (int i = 0; i < num; i++) {
-               free(proc_info[i]);
-       }
-
-       return;
-}
-
-/*
- *     Start DECLs
- */
-
-T_DECL(proc_info_listpids_all_pids,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       /*
-        * Get the value of nprocs with no buffer sent in
-        */
-       int num_procs;
-       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)0, (uint32_t)0);
-       T_ASSERT_GE_INT(num_procs, 1, "verify valid value for nprocs: %d", num_procs);
-
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-
-       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)0, (uint32_t)0);
-
-       int proc_count     = num_procs / (int)sizeof(pid_t);
-       int proc_count_all = num_procs / (int)sizeof(pid_t);
-       if (proc_count > (CONF_PROC_COUNT + 1)) {
-               proc_count = CONF_PROC_COUNT + 1;
-       }
-       pid_t * proc_ids = malloc(sizeof(pid_t) * (unsigned long)proc_count);
-       num_procs        = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
-                               (int32_t)(proc_count * (int)sizeof(pid_t)));
-       num_procs = num_procs / (int)sizeof(pid_t);
-       T_ASSERT_GE_INT(num_procs, proc_count, "Valid number of pids obtained for PROC_ALL_PIDS.");
-
-       free(proc_ids);
-
-       /*
-        * Grab list of all procs and make sure our spawned children are in the list.
-        */
-
-       proc_ids  = malloc(sizeof(pid_t) * (unsigned long)proc_count_all);
-       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
-                               (int32_t)(proc_count_all * (int)sizeof(pid_t)));
-       num_procs = num_procs / (int)sizeof(pid_t);
-
-       int pid_match = 1;
-
-       for (int i = 0; i < (CONF_PROC_COUNT - 1); i++) {
-               for (int j = 0; j < num_procs; j++) {
-                       if (proc_ids[j] == proc_config->child_pids[i]) {
-                               break;
-                       } else if (j == (num_procs - 1)) {
-                               pid_match = 0;
-                               break;
-                       }
-               }
-
-               if (!pid_match) {
-                       break;
-               }
-       }
-
-       T_ASSERT_EQ(pid_match, 1, "PROC_INFO_CALL_LISTPIDS contains our spawned children's pids");
-
-       free(proc_ids);
-
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-
-       errno     = 0;
-       num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_ALL_PIDS, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
-                               (uint32_t)(sizeof(pid_t) - 1));
-       T_EXPECT_POSIX_ERROR(errno, ENOMEM, "Valid proc_info behavior when bufsize < sizeof(pid_t).");
-}
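The size-query-then-fetch sequence exercised above (one call with no buffer to learn nprocs, a second call with a sized buffer) is the usual way to consume PROC_INFO_CALL_LISTPIDS. A minimal sketch of the same pattern through the public proc_listpids() wrapper, with list_all_pids as a hypothetical helper:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <sys/types.h>
    #include <stdlib.h>

    /* Returns a malloc'd pid array and stores the count; the caller frees it. */
    static pid_t *
    list_all_pids(int *count_out)
    {
            int bytes = proc_listpids(PROC_ALL_PIDS, 0, NULL, 0);  /* size query */
            if (bytes <= 0) {
                    return NULL;
            }
            pid_t *pids = malloc((size_t)bytes);
            if (pids == NULL) {
                    return NULL;
            }
            bytes = proc_listpids(PROC_ALL_PIDS, 0, pids, bytes);
            if (bytes <= 0) {
                    free(pids);
                    return NULL;
            }
            *count_out = bytes / (int)sizeof(pid_t);
            return pids;
    }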
-
-T_DECL(proc_info_listpids_pgrp_only,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-       T_LOG("Test to verify PROC_PGRP_ONLY returns correct value");
-       /*
-        * The number of pids returned depends on the buffer size.
-        * The expected count is child_count + 1 (the parent),
-        * so we size the buffer for one more than that to catch any error.
-        */
-       int proc_count   = CONF_PROC_COUNT + 2;
-       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
-       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_PGRP_ONLY, (uint32_t)proc_config->proc_grp_id, (uint32_t)0,
-                                   (user_addr_t)proc_ids, (int32_t)(proc_count * (int)sizeof(*proc_ids)));
-       num_procs = num_procs / (int)sizeof(pid_t);
-       T_ASSERT_EQ_INT(num_procs, CONF_PROC_COUNT + 1, "Valid number of pids obtained for PROC_PGRP_ONLY.");
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(proc_ids);
-}
-
-T_DECL(proc_info_listpids_ppid_only,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-       T_LOG("Test to verify PROC_PPID_ONLY returns correct value");
-       /*
-        * Pass in the same (bigger) buffer but expect only the pids where ppid is pid of current proc.
-        */
-       int proc_count   = CONF_PROC_COUNT + 2;
-       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
-       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_PPID_ONLY, (uint32_t)getpid(), (uint32_t)0, (user_addr_t)proc_ids,
-                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
-       num_procs = num_procs / (int)sizeof(pid_t);
-       T_ASSERT_EQ_INT(num_procs, CONF_PROC_COUNT, "Valid number of pids obtained for PROC_PPID_ONLY.");
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(proc_ids);
-}
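PROC_PGRP_ONLY and PROC_PPID_ONLY take the process group id or the parent pid as the typeinfo argument, which is what the two tests above pass. A short hypothetical sketch that lists the direct children of the calling process through the public wrapper (the 128-entry buffer is an assumed bound, as the tests likewise guess a buffer size):

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <stdio.h>

    static void
    print_children_of_self(void)
    {
            pid_t pids[128];        /* assumed upper bound for this sketch */
            int bytes = proc_listpids(PROC_PPID_ONLY, (uint32_t)getpid(), pids, (int)sizeof(pids));
            for (int i = 0; i < bytes / (int)sizeof(pid_t); i++) {
                    printf("child pid: %d\n", pids[i]);
            }
    }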
-
-T_DECL(proc_info_listpids_uid_only,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-       T_LOG("Test to verify PROC_UID_ONLY returns correct value");
-       int proc_count   = CONF_PROC_COUNT + 2;
-       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
-       send_action_to_child_processes(proc_config, ACT_CHANGE_UID);
-       usleep(10000);
-       int num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_UID_ONLY, CONF_UID_VAL, (uint32_t)0, (user_addr_t)proc_ids,
-                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
-       T_ASSERT_GE_ULONG((unsigned long)num_procs / sizeof(pid_t), (unsigned long)CONF_PROC_COUNT,
-                         "Valid number of pids obtained for PROC_UID_ONLY.");
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(proc_ids);
-}
-
-T_DECL(proc_info_listpids_ruid_only,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-       T_LOG("Test to verify PROC_RUID_ONLY returns correct value");
-       int proc_count   = CONF_PROC_COUNT + 2;
-       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
-       send_action_to_child_processes(proc_config, ACT_CHANGE_RUID);
-       usleep(10000);
-       int num_procs = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_RUID_ONLY, CONF_RUID_VAL, (uint32_t)0, (user_addr_t)proc_ids,
-                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
-       T_ASSERT_GE_ULONG((unsigned long)num_procs / sizeof(pid_t), (unsigned long)CONF_PROC_COUNT,
-                         "Valid number of pids obtained for PROC_RUID_ONLY.");
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(proc_ids);
-}
-
-T_DECL(proc_info_listpids_tty_only,
-       "proc_info API test to verify PROC_INFO_CALL_LISTPIDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       int ret = isatty(STDOUT_FILENO);
-       if (ret != 1) {
-               T_SKIP("Not connected to tty...skipping test");
-       }
-
-       proc_config_t proc_config = spawn_child_processes(CONF_PROC_COUNT, proc_info_listpids_handler);
-
-       T_LOG("Test to verify PROC_TTY_ONLY returns correct value");
-       int proc_count   = CONF_PROC_COUNT + 2;
-       pid_t * proc_ids = malloc(sizeof(*proc_ids) * (unsigned long)proc_count);
-       int num_procs    = __proc_info(PROC_INFO_CALL_LISTPIDS, PROC_TTY_ONLY, get_tty_dev(), (uint32_t)0, (user_addr_t)proc_ids,
-                                   (int32_t)(proc_count * (int)sizeof(*proc_ids)));
-       num_procs = num_procs / (int)sizeof(pid_t);
-       T_ASSERT_GE_INT(num_procs, 0, "Valid number of pids returned by PROC_TTY_ONLY.");
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-       free(proc_ids);
-}
-
-/*
- * Most of the following PROC_INFO_CALL_PIDINFO tests rely on a helper function (proc_info_caller) to make the necessary
- * proc_info calls on their behalf.
- * In a previous iteration these tests were all in one giant T_DECL; proc_info_caller now handles the inter-DECL dependencies,
- * such as a proc_info call relying on the results of a previous proc_info call, or on an assumed state of a child.
- */
-
-T_DECL(proc_info_pidinfo_proc_piduniqidentifierinfo,
-       "Test to identify PROC_PIDUNIQIDENTIFIERINFO returns correct unique identifiers for process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       proc_info_caller(P_UNIQIDINFO | C_UNIQIDINFO, proc_info, NULL);
-       struct proc_uniqidentifierinfo * p_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[0];
-       struct proc_uniqidentifierinfo * c_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[1];
-
-       T_EXPECT_NE_ULLONG(c_uniqidinfo->p_uniqueid, p_uniqidinfo->p_uniqueid, "p_uniqueid not unique for the process");
-
-       for (size_t i = 0; i < 16; i++) {
-               T_EXPECT_EQ_UCHAR(c_uniqidinfo->p_uuid[i], p_uniqidinfo->p_uuid[i], "p_uuid should be the same unique id");
-       }
-       T_EXPECT_EQ_ULLONG(c_uniqidinfo->p_puniqueid, p_uniqidinfo->p_uniqueid,
-                          "p_puniqueid of child should be same as p_uniqueid for parent");
-
-       free_proc_info(proc_info, 2);
-}
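Outside darwintest, the same identifiers can be read with PROC_PIDUNIQIDENTIFIERINFO through proc_pidinfo(). The hypothetical helper below mirrors the expectation above by checking that a child's p_puniqueid equals its parent's p_uniqueid:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <sys/types.h>

    /* 1 if the child's p_puniqueid matches the parent's p_uniqueid, 0 if not, -1 on error. */
    static int
    child_links_to_parent(pid_t parent, pid_t child)
    {
            struct proc_uniqidentifierinfo p_info, c_info;
            if (proc_pidinfo(parent, PROC_PIDUNIQIDENTIFIERINFO, 0, &p_info, sizeof(p_info)) != (int)sizeof(p_info) ||
                proc_pidinfo(child, PROC_PIDUNIQIDENTIFIERINFO, 0, &c_info, sizeof(c_info)) != (int)sizeof(c_info)) {
                    return -1;
            }
            return c_info.p_puniqueid == p_info.p_uniqueid;
    }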
-
-T_DECL(proc_info_pidinfo_proc_pidtbsdinfo,
-       "Test to verify PROC_PIDTBSDINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       int child_pid = 0;
-       proc_info_caller(PBSD_OLD | PBSD, proc_info, &child_pid);
-       struct proc_bsdinfo * pbsd_old = (struct proc_bsdinfo *)proc_info[0];
-       struct proc_bsdinfo * pbsd     = (struct proc_bsdinfo *)proc_info[1];
-
-       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDTBSDINFO shows Correct status");
-       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDTBSDINFO shows Correct xstatus (exit status)");
-       T_EXPECT_EQ_UINT(pbsd->pbi_pid, (unsigned int)child_pid, "PROC_PIDTBSDINFO returns valid pid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_ppid, (unsigned int)getpid(), "PROC_PIDTBSDINFO returns valid ppid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_uid, CONF_RUID_VAL, "PROC_PIDTBSDINFO returns valid uid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_gid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid gid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_ruid, 0U, "PROC_PIDTBSDINFO returns valid ruid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_rgid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid rgid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_svuid, CONF_RUID_VAL, "PROC_PIDTBSDINFO returns valid svuid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_svgid, CONF_GID_VAL, "PROC_PIDTBSDINFO returns valid svgid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_nice, CONF_NICE_VAL, "PROC_PIDTBSDINFO returns valid nice value");
-       T_EXPECT_EQ_STR(pbsd->pbi_comm, CONF_CMD_NAME, "PROC_PIDTBSDINFO returns valid p_comm name");
-       T_EXPECT_EQ_STR(pbsd->pbi_name, CONF_CMD_NAME, "PROC_PIDTBSDINFO returns valid p_name name");
-       T_EXPECT_EQ_UINT(pbsd->pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID), "PROC_PIDTBSDINFO returns valid flags");
-       T_EXPECT_EQ_UINT(pbsd->pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDTBSDINFO returned valid pbi_nfiles");
-       T_EXPECT_EQ_UINT(pbsd->pbi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDTBSDINFO returned valid pbi_pgid");
-       T_EXPECT_EQ_UINT(pbsd->pbi_pjobc, pbsd_old->pbi_pjobc, "PROC_PIDTBSDINFO returned valid pbi_pjobc");
-       T_EXPECT_NE_UINT(pbsd->e_tdev, 0U, "PROC_PIDTBSDINFO returned valid e_tdev");
-
-       free_proc_info(proc_info, 2);
-}
-
-T_DECL(proc_info_pidt_shortbsdinfo,
-       "Test to verify PROC_PIDT_SHORTBSDINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       int child_pid = 0;
-       proc_info_caller(PBSD | PBSD_SHORT, proc_info, &child_pid);
-       struct proc_bsdinfo * pbsd            = (struct proc_bsdinfo *)proc_info[0];
-       struct proc_bsdshortinfo * pbsd_short = (struct proc_bsdshortinfo *)proc_info[1];
-
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_pid, (unsigned int)child_pid, "PROC_PIDT_SHORTBSDINFO returns valid pid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_ppid, (unsigned int)getpid(), "PROC_PIDT_SHORTBSDINFO returns valid ppid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDT_SHORTBSDINFO returned valid pbi_pgid");
-       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd_short->pbsi_status, "PROC_PIDT_SHORTBSDINFO shows Correct status");
-       T_EXPECT_EQ_STR(pbsd_short->pbsi_comm, CONF_CMD_NAME, "PROC_PIDT_SHORTBSDINFO returns valid p_comm name");
-       /*
-        * The short variant returns all flags except session flags, hence ignoring them here.
-        */
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_flags, (pbsd->pbi_flags & (unsigned int)(~PROC_FLAG_CTTY)),
-                        "PROC_PIDT_SHORTBSDINFO returns valid flags");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_uid, CONF_RUID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid uid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_gid, CONF_GID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid gid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_ruid, 0U, "PROC_PIDT_SHORTBSDINFO returns valid ruid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_svuid, CONF_RUID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid svuid");
-       T_EXPECT_EQ_UINT(pbsd_short->pbsi_svgid, CONF_GID_VAL, "PROC_PIDT_SHORTBSDINFO returns valid svgid");
-
-       free_proc_info(proc_info, 2);
-}
-
-T_DECL(proc_info_pidt_bsdinfowithuniqid,
-       "Test to verify PROC_PIDT_BSDINFOWITHUNIQID returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[4];
-       int child_pid = 0;
-       proc_info_caller(P_UNIQIDINFO | PBSD_OLD | PBSD | PBSD_UNIQID, proc_info, &child_pid);
-       struct proc_uniqidentifierinfo * p_uniqidinfo = (struct proc_uniqidentifierinfo *)proc_info[0];
-       struct proc_bsdinfo * pbsd_old                = (struct proc_bsdinfo *)proc_info[1];
-       struct proc_bsdinfo * pbsd                    = (struct proc_bsdinfo *)proc_info[2];
-       struct proc_bsdinfowithuniqid * pbsd_uniqid   = (struct proc_bsdinfowithuniqid *)proc_info[3];
-
-       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDT_BSDINFOWITHUNIQID shows Correct status");
-       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDT_BSDINFOWITHUNIQID shows Correct xstatus");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pid, (unsigned int)child_pid, "PROC_PIDT_BSDINFOWITHUNIQID returns valid pid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_ppid, (unsigned int)getpid(), "PROC_PIDT_BSDINFOWITHUNIQID returns valid ppid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_uid, CONF_RUID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid uid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_gid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid gid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_ruid, 0U, "PROC_PIDT_BSDINFOWITHUNIQID returns valid ruid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_rgid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid rgid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_svuid, CONF_RUID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid svuid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_svgid, CONF_GID_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid svgid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_nice, CONF_NICE_VAL, "PROC_PIDT_BSDINFOWITHUNIQID returns valid nice value");
-       T_EXPECT_EQ_STR(pbsd_uniqid->pbsd.pbi_comm, CONF_CMD_NAME, "PROC_PIDT_BSDINFOWITHUNIQID returns valid p_comm name");
-       T_EXPECT_EQ_STR(pbsd_uniqid->pbsd.pbi_name, CONF_CMD_NAME, "PROC_PIDT_BSDINFOWITHUNIQID returns valid p_name name");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID),
-                        "PROC_PIDT_BSDINFOWITHUNIQID returns valid flags");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_nfiles");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pgid, (uint32_t)getpgid(getpid()),
-                        "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_pgid");
-       T_EXPECT_EQ_UINT(pbsd_uniqid->pbsd.pbi_pjobc, pbsd->pbi_pjobc, "PROC_PIDT_BSDINFOWITHUNIQID returned valid pbi_pjobc");
-       T_EXPECT_NE_UINT(pbsd_uniqid->pbsd.e_tdev, 0U, "PROC_PIDT_BSDINFOWITHUNIQID returned valid e_tdev");
-       T_EXPECT_NE_ULLONG(pbsd_uniqid->p_uniqidentifier.p_uniqueid, p_uniqidinfo->p_uniqueid,
-                          "PROC_PIDT_BSDINFOWITHUNIQID returned valid p_uniqueid");
-       for (int i = 0; i < 16; i++) {
-               T_EXPECT_EQ_UCHAR(pbsd_uniqid->p_uniqidentifier.p_uuid[i], p_uniqidinfo->p_uuid[i],
-                                 "PROC_PIDT_BSDINFOWITHUNIQID reported valid p_uuid");
-       }
-       T_EXPECT_EQ_ULLONG(pbsd_uniqid->p_uniqidentifier.p_puniqueid, p_uniqidinfo->p_uniqueid,
-                          "p_puniqueid of child should be same as p_uniqueid for parent");
-
-       free_proc_info(proc_info, 4);
-}
-
-T_DECL(proc_info_proc_pidtask_info,
-       "Test to verify PROC_PIDTASKINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       proc_info_caller(P_TASK_INFO | P_TASK_INFO_NEW, proc_info, NULL);
-       struct proc_taskinfo * p_task_info     = (struct proc_taskinfo *)proc_info[0];
-       struct proc_taskinfo * p_task_info_new = (struct proc_taskinfo *)proc_info[1];
-
-       T_EXPECT_GE_ULLONG((p_task_info_new->pti_virtual_size - p_task_info->pti_virtual_size), (unsigned long long)PAGE_SIZE,
-                          "PROC_PIDTASKINFO returned valid value for pti_virtual_size");
-       T_EXPECT_GE_ULLONG((p_task_info_new->pti_resident_size - p_task_info->pti_resident_size), (unsigned long long)PAGE_SIZE,
-                          "PROC_PIDTASKINFO returned valid value for pti_resident_size");
-       T_EXPECT_EQ_INT(p_task_info_new->pti_policy, POLICY_TIMESHARE, "PROC_PIDTASKINFO returned valid value for pti_policy");
-       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_user, 1ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_user");
-#if defined(__arm__) || defined(__arm64__)
-       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_system, 0ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_system");
-       T_EXPECT_GE_ULLONG((p_task_info_new->pti_total_system - p_task_info->pti_total_system), 0ULL,
-                          "PROC_PIDTASKINFO returned valid value for pti_total_system");
-#else
-       T_EXPECT_GE_ULLONG(p_task_info->pti_threads_system, 1ULL, "PROC_PIDTASKINFO returned valid value for pti_threads_system");
-       T_EXPECT_GT_ULLONG((p_task_info_new->pti_total_system - p_task_info->pti_total_system), 0ULL,
-                          "PROC_PIDTASKINFO returned valid value for pti_total_system");
-#endif
-       T_EXPECT_GT_ULLONG((p_task_info_new->pti_total_user - p_task_info->pti_total_user), 0ULL,
-                          "PROC_PIDTASKINFO returned valid value for pti_total_user");
-       T_EXPECT_GE_INT((p_task_info_new->pti_faults - p_task_info->pti_faults), 1,
-                       "PROC_PIDTASKINFO returned valid value for pti_faults");
-       T_EXPECT_GE_INT((p_task_info_new->pti_cow_faults - p_task_info->pti_cow_faults), 1,
-                       "PROC_PIDTASKINFO returned valid value for pti_cow_faults");
-       T_EXPECT_GE_INT((p_task_info_new->pti_syscalls_mach - p_task_info->pti_syscalls_mach), 0,
-                       "PROC_PIDTASKINFO returned valid value for pti_syscalls_mach");
-       T_EXPECT_GE_INT((p_task_info_new->pti_syscalls_unix - p_task_info->pti_syscalls_unix), 2,
-                       "PROC_PIDTASKINFO returned valid value for pti_syscalls_unix");
-       T_EXPECT_EQ_INT((p_task_info_new->pti_messages_sent - p_task_info->pti_messages_sent), 0,
-                       "PROC_PIDTASKINFO returned valid value for pti_messages_sent");
-       T_EXPECT_EQ_INT((p_task_info_new->pti_messages_received - p_task_info->pti_messages_received), 0,
-                       "PROC_PIDTASKINFO returned valid value for pti_messages_received");
-       T_EXPECT_EQ_INT(p_task_info_new->pti_priority, p_task_info->pti_priority,
-                       "PROC_PIDTASKINFO returned valid value for pti_priority");
-       T_EXPECT_GE_INT(p_task_info_new->pti_threadnum, 1, "PROC_PIDTASKINFO returned valid value for pti_threadnum");
-
-       if (p_task_info_new->pti_threadnum > 1) {
-               T_LOG("WARN: PROC_PIDTASKINFO returned threadnum greater than 1");
-       }
-       T_EXPECT_GE_INT(p_task_info_new->pti_numrunning, 0, "PROC_PIDTASKINFO returned valid value for pti_numrunning");
-       T_EXPECT_GE_INT(p_task_info_new->pti_pageins, 0, "PROC_PIDTASKINFO returned valid value for pti_pageins");
-
-       if (p_task_info_new->pti_pageins > 0) {
-               T_LOG("WARN: PROC_PIDTASKINFO returned pageins greater than 0");
-       }
-
-       T_EXPECT_GE_INT(p_task_info_new->pti_csw, p_task_info->pti_csw, "PROC_PIDTASKINFO returned valid value for pti_csw");
-
-       free_proc_info(proc_info, 2);
-}
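The test samples PROC_PIDTASKINFO before and after the child's phase-3 work and asserts on the deltas (virtual and resident size grow by at least a page, fault counts rise, and so on). The same two-sample pattern, sketched with the public wrapper and a hypothetical report_task_growth helper:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <sys/types.h>
    #include <stdio.h>

    static void
    report_task_growth(pid_t pid)
    {
            struct proc_taskinfo before, after;
            if (proc_pidinfo(pid, PROC_PIDTASKINFO, 0, &before, sizeof(before)) != (int)sizeof(before)) {
                    return;
            }
            /* ... let the target do some work (e.g. touch new pages) between samples ... */
            if (proc_pidinfo(pid, PROC_PIDTASKINFO, 0, &after, sizeof(after)) != (int)sizeof(after)) {
                    return;
            }
            printf("virtual growth: %llu bytes, new faults: %d\n",
                   after.pti_virtual_size - before.pti_virtual_size,
                   after.pti_faults - before.pti_faults);
    }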
-
-T_DECL(proc_info_proc_pidtaskallinfo,
-       "Test to verify PROC_PIDTASKALLINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[4];
-       int child_pid = 0;
-       proc_info_caller(PBSD | PBSD_OLD | P_TASK_INFO | PALL, proc_info, &child_pid);
-       struct proc_bsdinfo * pbsd         = (struct proc_bsdinfo *)proc_info[0];
-       struct proc_bsdinfo * pbsd_old     = (struct proc_bsdinfo *)proc_info[1];
-       struct proc_taskinfo * p_task_info = (struct proc_taskinfo *)proc_info[2];
-       struct proc_taskallinfo * pall     = (struct proc_taskallinfo *)proc_info[3];
-
-       T_EXPECT_EQ_UINT((unsigned int)SRUN, pbsd->pbi_status, "PROC_PIDTASKALLINFO shows Correct status");
-       T_EXPECT_EQ_UINT(0U, pbsd->pbi_xstatus, "PROC_PIDTASKALLINFO shows Correct xstatus");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pid, (unsigned int)child_pid, "PROC_PIDTASKALLINFO returns valid pid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_ppid, (unsigned int)getpid(), "PROC_PIDTASKALLINFO returns valid ppid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_uid, CONF_RUID_VAL, "PROC_PIDTASKALLINFO returns valid uid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_gid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid gid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_ruid, 0U, "PROC_PIDTASKALLINFO returns valid ruid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_rgid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid rgid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_svuid, CONF_RUID_VAL, "PROC_PIDTASKALLINFO returns valid svuid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_svgid, CONF_GID_VAL, "PROC_PIDTASKALLINFO returns valid svgid");
-       T_EXPECT_EQ_INT(pall->pbsd.pbi_nice, CONF_NICE_VAL, "PROC_PIDTASKALLINFO returns valid nice value");
-       T_EXPECT_EQ_STR(pall->pbsd.pbi_comm, CONF_CMD_NAME, "PROC_PIDTASKALLINFO returns valid p_comm name");
-       T_EXPECT_EQ_STR(pall->pbsd.pbi_name, CONF_CMD_NAME, "PROC_PIDTASKALLINFO returns valid p_name name");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_flags, (pbsd_old->pbi_flags | PROC_FLAG_PSUGID), "PROC_PIDTASKALLINFO returns valid flags");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_nfiles, pbsd_old->pbi_nfiles, "PROC_PIDTASKALLINFO returned valid pbi_nfiles");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pgid, (uint32_t)getpgid(getpid()), "PROC_PIDTASKALLINFO returned valid pbi_pgid");
-       T_EXPECT_EQ_UINT(pall->pbsd.pbi_pjobc, pbsd->pbi_pjobc, "PROC_PIDTASKALLINFO returned valid pbi_pjobc");
-       T_EXPECT_NE_UINT(pall->pbsd.e_tdev, 0U, "PROC_PIDTASKALLINFO returned valid e_tdev");
-
-#if defined(__arm__) || defined(__arm64__)
-       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_system, 0ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_system");
-       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_total_system - p_task_info->pti_total_system), 0ULL,
-                          "PROC_PIDTASKALLINFO returned valid value for pti_total_system");
-#else
-       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_system, 1ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_system");
-       T_EXPECT_GT_ULLONG((pall->ptinfo.pti_total_system - p_task_info->pti_total_system), 0ULL,
-                          "PROC_PIDTASKALLINFO returned valid value for pti_total_system");
-#endif /* ARM */
-
-       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_virtual_size - p_task_info->pti_virtual_size), (unsigned long long)PAGE_SIZE,
-                          "PROC_PIDTASKALLINFO returned valid value for pti_virtual_size");
-       T_EXPECT_GE_ULLONG((pall->ptinfo.pti_resident_size - p_task_info->pti_resident_size), (unsigned long long)PAGE_SIZE,
-                          "PROC_PIDTASKALLINFO returned valid value for pti_resident_size");
-       T_EXPECT_EQ_INT(pall->ptinfo.pti_policy, POLICY_TIMESHARE, "PROC_PIDTASKALLINFO returned valid value for pti_policy");
-       T_EXPECT_GE_ULLONG(pall->ptinfo.pti_threads_user, 1ULL, "PROC_PIDTASKALLINFO returned valid value for pti_threads_user ");
-       T_EXPECT_GT_ULLONG((pall->ptinfo.pti_total_user - p_task_info->pti_total_user), 0ULL,
-                          "PROC_PIDTASKALLINFO returned valid value for pti_total_user");
-       T_EXPECT_GE_INT((pall->ptinfo.pti_faults - p_task_info->pti_faults), 1,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_faults");
-       T_EXPECT_GE_INT((pall->ptinfo.pti_cow_faults - p_task_info->pti_cow_faults), 1,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_cow_faults");
-       T_EXPECT_GE_INT((pall->ptinfo.pti_syscalls_mach - p_task_info->pti_syscalls_mach), 0,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_syscalls_mach");
-       T_EXPECT_GE_INT((pall->ptinfo.pti_syscalls_unix - p_task_info->pti_syscalls_unix), 2,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_syscalls_unix");
-       T_EXPECT_EQ_INT((pall->ptinfo.pti_messages_sent - p_task_info->pti_messages_sent), 0,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_messages_sent");
-       T_EXPECT_EQ_INT((pall->ptinfo.pti_messages_received - p_task_info->pti_messages_received), 0,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_messages_received");
-       T_EXPECT_EQ_INT(pall->ptinfo.pti_priority, p_task_info->pti_priority,
-                       "PROC_PIDTASKALLINFO returned valid value for pti_priority");
-       T_EXPECT_GE_INT(pall->ptinfo.pti_threadnum, 1, "PROC_PIDTASKALLINFO returned valid value for pti_threadnum");
-       if (pall->ptinfo.pti_threadnum > 1) {
-               T_LOG("WARN: PROC_PIDTASKALLINFO returned threadnum greater than 1");
-       }
-       T_EXPECT_GE_INT(pall->ptinfo.pti_numrunning, 0, "PROC_PIDTASKALLINFO returned valid value for pti_numrunning");
-       T_EXPECT_GE_INT(pall->ptinfo.pti_pageins, 0, "PROC_PIDTASKALLINFO returned valid value for pti_pageins");
-       if (pall->ptinfo.pti_pageins > 0) {
-               T_LOG("WARN: PROC_PIDTASKALLINFO returned pageins greater than 0");
-       }
-       T_EXPECT_GE_INT(pall->ptinfo.pti_csw, p_task_info->pti_csw, "PROC_PIDTASKALLINFO returned valid value for pti_csw");
-
-       free_proc_info(proc_info, 4);
-}
-
-T_DECL(proc_info_proc_pidlistthreads,
-       "Test to verify PROC_PIDLISTTHREADS returns valid information about process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[1];
-       proc_info_caller(THREAD_ADDR, proc_info, NULL);
-}
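PROC_PIDLISTTHREADS fills a caller-supplied array of 64-bit thread handles; the returned byte count divided by PROC_PIDLISTTHREADS_SIZE gives the thread count, and each handle can then be passed as the arg of a per-thread flavor such as PROC_PIDTHREADINFO (which is how proc_info_caller obtains thread_addr[0] above). A hypothetical sketch of that two-step enumeration, with a 64-thread bound assumed:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <sys/types.h>
    #include <stdint.h>
    #include <stdio.h>

    static void
    dump_thread_names(pid_t pid)
    {
            uint64_t threads[64];   /* assumed upper bound for this sketch */
            int bytes = proc_pidinfo(pid, PROC_PIDLISTTHREADS, 0, threads, (int)sizeof(threads));
            if (bytes <= 0) {
                    return;
            }
            int count = bytes / (int)PROC_PIDLISTTHREADS_SIZE;
            for (int i = 0; i < count; i++) {
                    struct proc_threadinfo ti;
                    if (proc_pidinfo(pid, PROC_PIDTHREADINFO, threads[i], &ti, sizeof(ti)) == (int)sizeof(ti)) {
                            printf("thread %d: name '%s' run state %d\n", i, ti.pth_name, ti.pth_run_state);
                    }
            }
    }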
-
-T_DECL(proc_info_proc_pidthreadinfo,
-       "Test to verify PROC_PIDTHREADINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       int child_pid = 0;
-       proc_info_caller(PTHINFO_OLD | PTHINFO, proc_info, &child_pid);
-       struct proc_threadinfo * pthinfo_old = (struct proc_threadinfo *)proc_info[0];
-       struct proc_threadinfo * pthinfo     = (struct proc_threadinfo *)proc_info[1];
-
-       T_EXPECT_GT_ULLONG((pthinfo->pth_user_time - pthinfo_old->pth_user_time), 0ULL,
-                          "PROC_PIDTHREADINFO returns valid value for pth_user_time");
-       T_EXPECT_GE_ULLONG((pthinfo->pth_system_time - pthinfo_old->pth_system_time), 0ULL,
-                          "PROC_PIDTHREADINFO returns valid value for pth_system_time");
-       /*
-        * This is the scaled cpu usage percentage, since we are not
-        * doing a really long CPU bound task, it is (nearly) zero
-        */
-       T_EXPECT_GE_INT(pthinfo->pth_cpu_usage, 0, "PROC_PIDTHREADINFO returns valid value for pth_cpu_usage");
-       T_EXPECT_EQ_INT(pthinfo->pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADINFO returns valid value for pth_policy");
-       if (!(pthinfo->pth_run_state == TH_STATE_WAITING) && !(pthinfo->pth_run_state == TH_STATE_RUNNING)) {
-               T_EXPECT_EQ_INT(pthinfo->pth_run_state, -1, "PROC_PIDTHREADINFO returns valid value for pth_run_state");
-       }
-       /*
-        * This value is hardcoded to 0 in the source, hence it will always
-        * unconditionally return 0
-        */
-       T_EXPECT_EQ_INT(pthinfo->pth_sleep_time, 0, "PROC_PIDTHREADINFO returns valid value for pth_sleep_time");
-       T_EXPECT_LE_INT(pthinfo->pth_curpri, (BASEPRI_DEFAULT - CONF_NICE_VAL),
-                       "PROC_PIDTHREADINFO returns valid value for pth_curpri");
-       T_EXPECT_EQ_INT(pthinfo->pth_priority, (BASEPRI_DEFAULT - CONF_NICE_VAL),
-                       "PROC_PIDTHREADINFO returns valid value for pth_priority");
-       T_EXPECT_EQ_INT(pthinfo->pth_maxpriority, MAXPRI_USER, "PROC_PIDTHREADINFO returns valid value for pth_maxpriority");
-       T_EXPECT_EQ_STR(pthinfo->pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADINFO returns valid value for pth_name");
-
-       free_proc_info(proc_info, 2);
-}
-
-T_DECL(proc_info_proc_threadid64info,
-       "Test to verify PROC_PIDTHREADID64INFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       proc_info_caller(PTHINFO | PTHINFO_64, proc_info, NULL);
-       struct proc_threadinfo pthinfo    = *((struct proc_threadinfo *)proc_info[0]);
-       struct proc_threadinfo pthinfo_64 = *((struct proc_threadinfo *)proc_info[1]);
-       T_EXPECT_GE_ULLONG(pthinfo_64.pth_user_time, pthinfo.pth_user_time,
-                          "PROC_PIDTHREADID64INFO returns valid value for pth_user_time");
-       T_EXPECT_GE_ULLONG(pthinfo_64.pth_system_time, pthinfo.pth_system_time,
-                          "PROC_PIDTHREADID64INFO returns valid value for pth_system_time");
-       T_EXPECT_GE_INT(pthinfo_64.pth_cpu_usage, pthinfo.pth_cpu_usage,
-                       "PROC_PIDTHREADID64INFO returns valid value for pth_cpu_usage");
-       T_EXPECT_EQ_INT(pthinfo_64.pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADID64INFO returns valid value for pth_policy");
-       if (!(pthinfo_64.pth_run_state == TH_STATE_WAITING) && !(pthinfo_64.pth_run_state == TH_STATE_RUNNING)) {
-               T_EXPECT_EQ_INT(pthinfo_64.pth_run_state, -1, "PROC_PIDTHREADID64INFO returns valid value for pth_run_state");
-       }
-       T_EXPECT_EQ_INT(pthinfo_64.pth_sleep_time, 0, "PROC_PIDTHREADID64INFO returns valid value for pth_sleep_time");
-       T_EXPECT_EQ_INT(pthinfo_64.pth_curpri, pthinfo.pth_curpri, "PROC_PIDTHREADID64INFO returns valid value for pth_curpri");
-       T_EXPECT_EQ_INT(pthinfo_64.pth_priority, pthinfo.pth_priority, "PROC_PIDTHREADID64INFO returns valid value for pth_priority");
-       T_EXPECT_EQ_INT(pthinfo_64.pth_maxpriority, pthinfo.pth_maxpriority,
-                       "PROC_PIDTHREADID64INFO returns valid value for pth_maxpriority");
-       T_EXPECT_EQ_STR(pthinfo_64.pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADID64INFO returns valid value for pth_name");
-
-       free_proc_info(proc_info, 2);
-}
-
-T_DECL(proc_info_proc_pidthreadpathinfo,
-       "Test to verify PROC_PIDTHREADPATHINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       proc_info_caller(PTHINFO | PINFO_PATH, proc_info, NULL);
-       struct proc_threadinfo pthinfo            = *((struct proc_threadinfo *)proc_info[0]);
-       struct proc_threadwithpathinfo pinfo_path = *((struct proc_threadwithpathinfo *)proc_info[1]);
-
-       T_EXPECT_GE_ULLONG(pinfo_path.pt.pth_user_time, pthinfo.pth_user_time,
-                          "PROC_PIDTHREADPATHINFO returns valid value for pth_user_time");
-       T_EXPECT_GE_ULLONG(pinfo_path.pt.pth_system_time, pthinfo.pth_system_time,
-                          "PROC_PIDTHREADPATHINFO returns valid value for pth_system_time");
-       T_EXPECT_GE_INT(pinfo_path.pt.pth_cpu_usage, pthinfo.pth_cpu_usage,
-                       "PROC_PIDTHREADPATHINFO returns valid value for pth_cpu_usage");
-       T_EXPECT_EQ_INT(pinfo_path.pt.pth_policy, POLICY_TIMESHARE, "PROC_PIDTHREADPATHINFO returns valid value for pth_policy");
-       if (!(pinfo_path.pt.pth_run_state == TH_STATE_WAITING) && !(pinfo_path.pt.pth_run_state == TH_STATE_RUNNING)) {
-               T_EXPECT_EQ_INT(pinfo_path.pt.pth_run_state, -1, "PROC_PIDTHREADPATHINFO returns valid value for pth_run_state");
-       }
-       T_EXPECT_EQ_INT(pinfo_path.pt.pth_sleep_time, 0, "PROC_PIDTHREADPATHINFO returns valid value for pth_sleep_time");
-       T_EXPECT_EQ_INT(pinfo_path.pt.pth_curpri, pthinfo.pth_curpri, "PROC_PIDTHREADPATHINFO returns valid value for pth_curpri");
-       T_EXPECT_EQ_INT(pinfo_path.pt.pth_priority, pthinfo.pth_priority,
-                       "PROC_PIDTHREADPATHINFO returns valid value for pth_priority");
-       T_EXPECT_EQ_INT(pinfo_path.pt.pth_maxpriority, pthinfo.pth_maxpriority,
-                       "PROC_PIDTHREADPATHINFO returns valid value for pth_maxpriority");
-       T_EXPECT_EQ_STR(pinfo_path.pt.pth_name, CONF_THREAD_NAME, "PROC_PIDTHREADPATHINFO returns valid value for pth_name");
-       T_EXPECT_EQ_INT(pinfo_path.pvip.vip_vi.vi_type, VNON, "PROC_PIDTHREADPATHINFO returns valid vnode information");
-
-       free_proc_info(proc_info, 2);
-}
-
-T_DECL(proc_info_proc_pidarchinfo,
-       "Test to verify PROC_PIDARCHINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[1];
-       proc_info_caller(PAI, proc_info, NULL);
-       struct proc_archinfo pai = *((struct proc_archinfo *)proc_info[0]);
-
-#if defined(__arm__) || defined(__arm64__)
-       if (!((pai.p_cputype & CPU_TYPE_ARM) == CPU_TYPE_ARM) && !((pai.p_cputype & CPU_TYPE_ARM64) == CPU_TYPE_ARM64)) {
-               T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_ARM, "PROC_PIDARCHINFO returned valid value for p_cputype");
-       }
-       T_EXPECT_EQ_INT((pai.p_cpusubtype & CPU_SUBTYPE_ARM_ALL), CPU_SUBTYPE_ARM_ALL,
-                       "PROC_PIDARCHINFO returned valid value for p_cpusubtype");
-#else
-       if (!((pai.p_cputype & CPU_TYPE_X86) == CPU_TYPE_X86) && !((pai.p_cputype & CPU_TYPE_X86_64) == CPU_TYPE_X86_64)) {
-               T_EXPECT_EQ_INT(pai.p_cputype, CPU_TYPE_X86, "PROC_PIDARCHINFO returned valid value for p_cputype");
-       }
-#endif
-       free_proc_info(proc_info, 1);
-}
-
-T_DECL(proc_info_proc_pidregioninfo,
-       "Test to verify PROC_PIDREGIONINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[2];
-       void * map_tmp = NULL;
-       proc_info_caller(PREGINFO, proc_info, NULL);
-
-       struct proc_regioninfo preginfo = *((struct proc_regioninfo *)proc_info[0]);
-       /*
-        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
-        */
-       map_tmp = proc_info[1];
-
-       T_EXPECT_EQ_ULLONG(preginfo.pri_offset, (unsigned long long)PAGE_SIZE, "PROC_PIDREGIONINFO returns valid value for pri_offset");
-       T_EXPECT_EQ_UINT((preginfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_protection, expected read/write only");
-       T_EXPECT_EQ_UINT((preginfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)), (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
-                        "PROC_PIDREGIONINFO returns valid value for pri_max_protection");
-       T_EXPECT_EQ_UINT((preginfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_inheritance");
-       T_EXPECT_EQ_UINT((preginfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U, "PROC_PIDREGIONINFO returns valid value for pri_behavior");
-       T_EXPECT_EQ_UINT(preginfo.pri_user_wired_count, 0U, "PROC_PIDREGIONINFO returns valid value for pri_user_wired_count");
-       T_EXPECT_EQ_UINT(preginfo.pri_user_tag, 0U, "PROC_PIDREGIONINFO returns valid value for pri_user_tag");
-       T_EXPECT_NE_UINT((preginfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_flags");
-       T_EXPECT_EQ_UINT(preginfo.pri_pages_resident, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo.pri_pages_shared_now_private, 0U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_pages_shared_now_private");
-       T_EXPECT_EQ_UINT(preginfo.pri_pages_swapped_out, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_swapped_out");
-       T_EXPECT_EQ_UINT(preginfo.pri_pages_dirtied, 0U, "PROC_PIDREGIONINFO returns valid value for pri_pages_dirtied");
-       T_EXPECT_EQ_UINT(preginfo.pri_ref_count, 2U, "PROC_PIDREGIONINFO returns valid value for pri_ref_count");
-       T_EXPECT_EQ_UINT(preginfo.pri_shadow_depth, 1U, "PROC_PIDREGIONINFO returns valid value for pri_shadow_depth");
-       T_EXPECT_EQ_UINT(preginfo.pri_share_mode, (unsigned int)SM_COW, "PROC_PIDREGIONINFO returns valid value for pri_share_mode");
-       T_EXPECT_EQ_UINT(preginfo.pri_private_pages_resident, 0U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_private_pages_resident");
-       T_EXPECT_GE_UINT(preginfo.pri_shared_pages_resident, 1U,
-                        "PROC_PIDREGIONINFO returns valid value for pri_shared_pages_resident");
-       T_EXPECT_EQ_ULLONG(preginfo.pri_address, (uint64_t)map_tmp, "PROC_PIDREGIONINFO returns valid value for pri_addr");
-       T_EXPECT_NE_UINT(preginfo.pri_obj_id, 0U, "PROC_PIDREGIONINFO returns valid value for pri_obj_id");
-       T_EXPECT_EQ_ULLONG(preginfo.pri_size, (unsigned long long)PAGE_SIZE, "PROC_PIDREGIONINFO returns valid value for pri_size");
-       T_EXPECT_EQ_UINT(preginfo.pri_depth, 0U, "PROC_PIDREGIONINFO returns valid value for pri_depth");
-
-       int ret = 0;
-       ret     = munmap(map_tmp, PAGE_SIZE);
-       T_QUIET;
-       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
-       free_proc_info(proc_info, 1);
-}
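PROC_PIDREGIONINFO takes an address as its argument and reports the VM region at or above it, which is why the test passes map_tmp and then checks pri_address and pri_size against the mapping. Advancing by pri_address + pri_size walks the whole address space; walk_regions below is a hypothetical sketch of that loop:

    #include <libproc.h>
    #include <sys/proc_info.h>
    #include <sys/types.h>
    #include <stdint.h>
    #include <stdio.h>

    static void
    walk_regions(pid_t pid)
    {
            uint64_t addr = 0;
            struct proc_regioninfo ri;

            /* Each call reports the region containing addr, or the next region above it. */
            while (proc_pidinfo(pid, PROC_PIDREGIONINFO, addr, &ri, sizeof(ri)) == (int)sizeof(ri)) {
                    printf("region 0x%llx size 0x%llx prot 0x%x share mode %u\n",
                           ri.pri_address, ri.pri_size, ri.pri_protection, ri.pri_share_mode);
                    addr = ri.pri_address + ri.pri_size;
            }
    }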
-
-T_DECL(proc_info_proc_pidregionpathinfo,
-       "Test to verify PROC_PIDREGIONPATHINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
-{
-       void * proc_info[2];
-       void * map_tmp = NULL;
-       proc_info_caller(PREGINFO_PATH, proc_info, NULL);
-
-       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
-       /*
-        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
-        */
-       map_tmp = proc_info[1];
-
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO returns valid value for pri_offset");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_protection, expected read/write only");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)),
-                        (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_max_protection");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_inheritance");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_behavior");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_user_wired_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_user_tag");
-       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_flags");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_shared_now_private");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_swapped_out");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_pages_dirtied");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 2U, "PROC_PIDREGIONPATHINFO returns valid value for pri_ref_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 1U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_shadow_depth");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, (unsigned int)SM_COW,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_share_mode");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_private_pages_resident");
-       T_EXPECT_GE_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 1U,
-                        "PROC_PIDREGIONPATHINFO returns valid value for pri_shared_pages_resident");
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp,
-                          "PROC_PIDREGIONPATHINFO returns valid value for pri_addr");
-       T_EXPECT_NE_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_obj_id");
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO returns valid value for pri_size");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO returns valid value for pri_depth");
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO returns valid value for vi_type");
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO returns valid value for vi_pad");
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
-                       "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[0]");
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
-                       "PROC_PIDREGIONPATHINFO returns valid value for vi_fsid.val[1]");
-       T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PATH)), NULL,
-                       "PROC_PIDREGIONPATHINFO returns valid value for vi_path");
-       /*
-        * Basic sanity checks for vnode stat returned by the API
-        */
-       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_dev");
-       T_EXPECT_EQ_INT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
-                       "PROC_PIDREGIONPATHINFO returns valid value for vst_mode");
-       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1,
-                          "PROC_PIDREGIONPATHINFO returns valid value for vst_nlink");
-       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
-                          "PROC_PIDREGIONPATHINFO returns valid value for vst_ino");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_uid");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO returns valid value for vst_gid");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
-                         "PROC_PIDREGIONPATHINFO returns valid value for vst_size");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
-                         "PROC_PIDREGIONPATHINFO returns valid value for vst_blocks");
-       T_EXPECT_GE_INT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
-                       "PROC_PIDREGIONPATHINFO returns valid value for vst_blksize");
-
-       int ret = 0;
-       ret     = munmap(map_tmp, PAGE_SIZE);
-       T_QUIET;
-       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
-       free_proc_info(proc_info, 1);
-}
-
-T_DECL(proc_info_proc_pidregionpathinfo2,
-       "Test to verify PROC_PIDREGIONPATHINFO2 returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
-{
-       void * proc_info[2];
-       void * map_tmp = NULL;
-       proc_info_caller(PREGINFO_PATH_2, proc_info, NULL);
-
-       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
-       /*
-        *      map_tmp isn't a struct like the rest of our ret_structs, but we sneak it back because we need it
-        */
-       map_tmp = proc_info[1];
-
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_offset");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_READ | VM_PROT_WRITE)), 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_protection, expected read/write only");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection & (VM_PROT_READ | VM_PROT_WRITE)),
-                        (unsigned int)(VM_PROT_READ | VM_PROT_WRITE),
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_max_protection");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_inheritance");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_behavior");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_user_wired_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_user_tag");
-       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_flags");
-       /*
-        * Following values are hard-coded to be zero in source
-        */
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_shared_now_private");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_swapped_out");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_pages_dirtied");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_ref_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_shadow_depth");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_share_mode");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_private_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for pri_shared_pages_resident");
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_address, (uint64_t)map_tmp,
-                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_addr");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_obj_id");
-       T_EXPECT_EQ_ULLONG(preginfo_path.prp_prinfo.pri_size, (unsigned long long)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO2 returns valid value for pri_size");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for pri_depth");
-
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_type");
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO2 returns valid value for vi_pad");
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
-                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_fsid.val[0]:%d",
-                       preginfo_path.prp_vip.vip_vi.vi_fsid.val[0]);
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
-                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_fsid.val[1]:%d",
-                       preginfo_path.prp_vip.vip_vi.vi_fsid.val[1]);
-       T_EXPECT_NE_PTR((void *)(strcasestr(preginfo_path.prp_vip.vip_path, CONF_TMP_FILE_PATH)), NULL,
-                       "PROC_PIDREGIONPATHINFO2 returns valid value for vi_path");
-       /*
-        * Basic sanity checks for vnode stat returned by the API
-        */
-       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_dev");
-       T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for vst_mode");
-       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1,
-                          "PROC_PIDREGIONPATHINFO2 returns valid value for vst_nlink");
-       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
-                          "PROC_PIDREGIONPATHINFO2 returns valid value for vst_ino");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_uid");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO2 returns valid value for vst_gid");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
-                         "PROC_PIDREGIONPATHINFO2 returns valid value for vst_size");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
-                         "PROC_PIDREGIONPATHINFO2 returns valid value for vst_blocks");
-       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
-                        "PROC_PIDREGIONPATHINFO2 returns valid value for vst_blksize");
-
-       int ret = 0;
-       ret     = munmap(map_tmp, PAGE_SIZE);
-       T_QUIET;
-       T_EXPECT_POSIX_SUCCESS(ret, "munmap of map_tmp");
-       free_proc_info(proc_info, 1);
-}
-
-T_DECL(proc_info_proc_pidregionpathinfo3,
-       "Test to verify PROC_PIDREGIONPATHINFO3 returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_INSTALLEDUSEROS))
-{
-       void * proc_info[1];
-       proc_info_caller(PREGINFO_PATH_3, proc_info, NULL);
-
-       struct proc_regionwithpathinfo preginfo_path = *((struct proc_regionwithpathinfo *)proc_info[0]);
-
-       T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_offset, (uint64_t)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO3 returns valid value for pri_offset");
-       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_protection ^ (VM_PROT_WRITE | VM_PROT_EXECUTE)), 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_protection");
-#if defined(__arm__) || defined(__arm64__)
-       T_EXPECT_GT_UINT(preginfo_path.prp_prinfo.pri_max_protection, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_max_protection");
-#else
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_max_protection ^ VM_PROT_ALL), 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_max_protection");
-#endif
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_inheritance ^ VM_INHERIT_COPY), 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_inheritance");
-       T_EXPECT_EQ_UINT((preginfo_path.prp_prinfo.pri_behavior ^ VM_BEHAVIOR_DEFAULT), 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_behavior");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_wired_count, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_user_wired_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_user_tag, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_user_tag");
-       T_EXPECT_NE_UINT((preginfo_path.prp_prinfo.pri_flags ^ (PROC_REGION_SUBMAP | PROC_REGION_SHARED)), 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_flags");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_shared_now_private, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_shared_now_private");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_swapped_out, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_swapped_out");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_pages_dirtied, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_pages_dirtied");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_ref_count, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_ref_count");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shadow_depth, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_shadow_depth");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_share_mode, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_share_mode");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_private_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_private_pages_resident");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_shared_pages_resident, 0U,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for pri_shared_pages_resident");
-       T_EXPECT_NE_ULLONG(preginfo_path.prp_prinfo.pri_address, 0ULL, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_addr");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_obj_id, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_obj_id");
-       T_EXPECT_GE_ULLONG(preginfo_path.prp_prinfo.pri_size, (uint64_t)PAGE_SIZE,
-                          "PROC_PIDREGIONPATHINFO3 returns valid value for pri_size");
-       T_EXPECT_EQ_UINT(preginfo_path.prp_prinfo.pri_depth, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for pri_depth");
-
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_type, VREG, "PROC_PIDREGIONPATHINFO3 returns valid value for vi_type");
-       T_EXPECT_EQ_INT(preginfo_path.prp_vip.vip_vi.vi_pad, 0, "PROC_PIDREGIONPATHINFO3 returns valid value for vi_pad");
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[0], 0,
-                       "PROC_PIDREGIONPATHINFO3 returns valid value for vi_fsid.val[0]");
-       T_EXPECT_NE_INT(preginfo_path.prp_vip.vip_vi.vi_fsid.val[1], 0,
-                       "PROC_PIDREGIONPATHINFO3 returns valid value for vi_fsid.val[1]");
-       /*
-        * Basic sanity checks for vnode stat returned by the API
-        */
-       T_EXPECT_NE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_dev");
-       T_EXPECT_EQ_UINT(((preginfo_path.prp_vip.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFREG), 0,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for vst_mode");
-       T_EXPECT_EQ_USHORT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_nlink, (unsigned short)1,
-                          "PROC_PIDREGIONPATHINFO3 returns valid value for vst_nlink");
-       T_EXPECT_NE_ULLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_ino, 0ULL,
-                          "PROC_PIDREGIONPATHINFO3 returns valid value for vst_ino");
-       /*
-        * No way to confirm actual ownership or binary. Just log the value
-        */
-       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_uid");
-       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDREGIONPATHINFO3 returns valid value for vst_gid");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_size, (off_t)CONF_BLK_SIZE,
-                         "PROC_PIDREGIONPATHINFO3 returns valid value for vst_size");
-       T_EXPECT_GE_LLONG(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blocks, 1LL,
-                         "PROC_PIDREGIONPATHINFO3 returns valid value for vst_blocks");
-       T_EXPECT_GE_UINT(preginfo_path.prp_vip.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
-                        "PROC_PIDREGIONPATHINFO3 returns valid value for vst_blksize");
-
-       free_proc_info(proc_info, 1);
-}
-
-T_DECL(proc_info_proc_pidvnodepathinfo,
-       "Test to verify PROC_PIDVNODEPATHINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       void * proc_info[1];
-       proc_info_caller(PVNINFO, proc_info, NULL);
-       struct proc_vnodepathinfo pvninfo = *((struct proc_vnodepathinfo *)proc_info[0]);
-
-       T_EXPECT_EQ_INT(pvninfo.pvi_cdir.vip_vi.vi_type, VDIR, "PROC_PIDVNODEPATHINFO returns valid value for vi_type");
-       T_EXPECT_EQ_INT(pvninfo.pvi_cdir.vip_vi.vi_pad, 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_pad");
-       T_EXPECT_NE_INT(pvninfo.pvi_cdir.vip_vi.vi_fsid.val[0], 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_fsid.val[0]");
-       T_EXPECT_NE_INT(pvninfo.pvi_cdir.vip_vi.vi_fsid.val[1], 0, "PROC_PIDVNODEPATHINFO returns valid value for vi_fsid.val[1]");
-       /*
-        * Basic sanity checks for vnode stat returned by the API
-        */
-       T_EXPECT_NE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_dev, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_dev");
-       T_EXPECT_EQ_INT(((pvninfo.pvi_cdir.vip_vi.vi_stat.vst_mode & S_IFMT) ^ S_IFDIR), 0,
-                       "PROC_PIDVNODEPATHINFO returns valid value for vst_mode");
-       T_EXPECT_GE_USHORT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_nlink, (unsigned short)2,
-                          "PROC_PIDVNODEPATHINFO returns valid value for vst_nlink");
-       T_EXPECT_NE_ULLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_ino, 0ULL, "PROC_PIDVNODEPATHINFO returns valid value for vst_ino");
-       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_uid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_uid");
-       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_gid, 0U, "PROC_PIDVNODEPATHINFO returns valid value for vst_gid");
-       T_EXPECT_GT_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_size, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for vst_size");
-       T_EXPECT_GE_LLONG(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blocks, 0LL, "PROC_PIDVNODEPATHINFO returns valid value for vst_blocks");
-       T_EXPECT_GE_UINT(pvninfo.pvi_cdir.vip_vi.vi_stat.vst_blksize, CONF_BLK_SIZE,
-                        "PROC_PIDVNODEPATHINFO returns valid value for vst_blksize");
-
-       free_proc_info(proc_info, 1);
-}
-/*
- * The remaining tests break from the pattern of the other PROC_INFO_CALL_PIDINFO tests.
- * We call proc_info directly as it's more efficient
- */
-
-T_DECL(proc_info_pidinfo_proc_pidlistfds,
-       "proc_info API tests to verify PROC_INFO_CALL_PIDINFO/PROC_PIDLISTFDS",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       int retval;
-       int orig_nfiles              = 0;
-       struct proc_fdinfo * fd_info = NULL;
-
-       T_LOG("Test to verify PROC_PIDLISTFDS returns sane number of open files");
-       retval      = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFDS, (uint32_t)0, (user_addr_t)0, (uint32_t)0);
-       orig_nfiles = retval / (int)sizeof(struct proc_fdinfo);
-       T_EXPECT_GE_INT(orig_nfiles, CONF_OPN_FILE_COUNT, "The number of open files is lower than expected.");
-
-       /*
-        * Allocate a buffer of the expected size + 1 to ensure that
-        * the API still returns the expected size,
-        * i.e. 3 + 1 = 4 open fds
-        */
-       T_LOG("Test to verify PROC_PIDLISTFDS returns valid fd information");
-       fd_info = malloc(sizeof(*fd_info) * 5);
-       tmp_fd  = open(CONF_TMP_FILE_PATH, O_RDONLY | O_CREAT, 0644); /* O_CREAT requires an explicit mode */
-       T_LOG("tmp_fd val:%d", tmp_fd);
-       T_QUIET;
-       T_EXPECT_POSIX_SUCCESS(tmp_fd, "open() for PROC_PIDLISTFDS");
-
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFDS, (uint32_t)0, (user_addr_t)fd_info,
-                            (uint32_t)(sizeof(*fd_info) * 5));
-       retval = retval / (int)sizeof(struct proc_fdinfo);
-
-       close(tmp_fd);
-
-       for (int i = 0; i < retval; i++) {
-               /*
-                * Check only for the fd that we control.
-                */
-               if (tmp_fd != fd_info[i].proc_fd) {
-                       continue;
-               }
-               T_EXPECT_EQ_UINT(fd_info[i].proc_fdtype, (unsigned int)PROX_FDTYPE_VNODE, "Correct proc_fdtype for returned fd");
-       }
-
-       T_EXPECT_GE_INT(retval, 4, "Correct number of fds was returned.");
-
-       tmp_fd = -1;
-       free(fd_info);
-       fd_info = NULL;
-}
-
-T_DECL(proc_info_proc_pidpathinfo,
-       "Test to verify PROC_PIDPATHINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       char * pid_path = NULL;
-       pid_path        = malloc(sizeof(char) * PROC_PIDPATHINFO_MAXSIZE);
-       T_EXPECT_NOTNULL(pid_path, "malloc for PROC_PIDPATHINFO");
-       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDPATHINFO, (uint64_t)0, (user_addr_t)pid_path,
-                                (uint32_t)PROC_PIDPATHINFO_MAXSIZE);
-       T_EXPECT_EQ_INT(retval, 0, "__proc_info call for PROC_PIDPATHINFO");
-
-       T_EXPECT_NE_PTR((void *)(strcasestr(pid_path, CONF_CMD_NAME)), NULL, "PROC_PIDPATHINFO returns valid value for pid_path");
-       free(pid_path);
-       pid_path = NULL;
-}
-
-T_DECL(proc_info_proc_pidlistfileports,
-       "Test to verify PROC_PIDLISTFILEPORTS returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       struct proc_fileportinfo * fileport_info = NULL;
-       mach_port_t tmp_file_port                = MACH_PORT_NULL;
-       proc_config_t proc_config                = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid                            = proc_config->child_pids[0];
-
-       /*
-        * Create a file port
-        */
-       tmp_fd     = open(CONF_TMP_FILE_PATH, O_RDWR | O_CREAT, 0644); /* O_CREAT requires an explicit mode */
-       int retval = fileport_makeport(tmp_fd, &tmp_file_port);
-       T_EXPECT_POSIX_SUCCESS(retval, "fileport_makeport() for PROC_PIDLISTFILEPORTS");
-
-       /*
-        * Like the other APIs, this returns the actual count + 20. Hence we expect it to be at least 1 (the one that we created)
-        */
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
-       T_EXPECT_GE_INT(retval / (int)sizeof(fileport_info), 1,
-                       "__proc_info call for PROC_PIDLISTFILEPORTS to get total ports in parent");
-
-       /*
-        * Child doesn't have any fileports, should return zero
-        */
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
-       T_EXPECT_EQ_INT(retval / (int)sizeof(fileport_info), 0,
-                       "__proc_info call for PROC_PIDLISTFILEPORTS to get total ports in child");
-
-       fileport_info = malloc(sizeof(*fileport_info) * (size_t)retval);
-       retval        = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDLISTFILEPORTS, (uint64_t)0, (user_addr_t)fileport_info,
-                            (uint32_t)sizeof(*fileport_info));
-       T_EXPECT_EQ_INT(retval, (int)sizeof(*fileport_info), "__proc_info call for PROC_PIDLISTFILEPORTS");
-
-       T_EXPECT_NE_UINT(fileport_info->proc_fileport, (uint32_t)0, "PROC_PIDLISTFILEPORTS returns valid value for proc_fileport");
-       T_EXPECT_EQ_UINT(fileport_info->proc_fdtype, (uint32_t)PROX_FDTYPE_VNODE,
-                        "PROC_PIDLISTFILEPORTS returns valid value for proc_fdtype");
-
-       /*
-        * Cleanup for the fileport
-        */
-       mach_port_deallocate(mach_task_self(), tmp_file_port);
-       tmp_file_port = MACH_PORT_NULL;
-       free(fileport_info);
-       fileport_info = NULL;
-       close(tmp_fd);
-       tmp_fd = -1;
-       free_proc_config(proc_config);
-}
-
-T_DECL(proc_info_proc_pidcoalitioninfo,
-       "Test to verify PROC_PIDCOALITIONINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid             = proc_config->child_pids[0];
-
-       struct proc_pidcoalitioninfo pci_parent;
-       struct proc_pidcoalitioninfo pci_child;
-       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, getpid(), PROC_PIDCOALITIONINFO, (uint64_t)0, (user_addr_t)&pci_parent,
-                                (uint32_t)sizeof(pci_parent));
-       T_EXPECT_EQ_INT(retval, (int)sizeof(pci_parent), "__proc_info call for PROC_PIDCOALITIONINFO (parent)");
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDCOALITIONINFO, (uint64_t)0, (user_addr_t)&pci_child,
-                            (uint32_t)sizeof(pci_child));
-       T_EXPECT_EQ_INT(retval, (int)sizeof(pci_child), "__proc_info call for PROC_PIDCOALITIONINFO (child)");
-
-       /*
-        * Coalition IDs should match for child and parent
-        */
-       for (int i = 0; i < COALITION_NUM_TYPES; i++) {
-               T_EXPECT_EQ_ULLONG(pci_parent.coalition_id[i], pci_child.coalition_id[i],
-                                  "PROC_PIDCOALITIONINFO returns valid value for coalition_id");
-       }
-
-       free_proc_config(proc_config);
-}
-
-T_DECL(proc_info_proc_pidworkqueueinfo,
-       "Test to verify PROC_PIDWORKQUEUEINFO returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid             = proc_config->child_pids[0];
-       send_action_to_child_processes(proc_config, ACT_PHASE5);
-
-       struct proc_workqueueinfo pwqinfo;
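-       /* brief delay to give the child's workqueue threads a chance to start before querying */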
-       usleep(10000);
-       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDWORKQUEUEINFO, (uint64_t)0, (user_addr_t)&pwqinfo,
-                                (uint32_t)sizeof(pwqinfo));
-       T_EXPECT_EQ_INT(retval, (int)sizeof(pwqinfo), "__proc_info call for PROC_PIDWORKQUEUEINFO");
-
-       int ncpu         = 0;
-       size_t ncpu_size = sizeof(ncpu);
-       retval           = sysctlbyname("hw.ncpu", (void *)&ncpu, &ncpu_size, NULL, 0);
-       T_EXPECT_EQ_INT(retval, 0, "sysctl() for PROC_PIDWORKQUEUEINFO");
-       T_EXPECT_GE_UINT(pwqinfo.pwq_nthreads, (uint32_t)1, "PROC_PIDWORKQUEUEINFO returns valid value for pwq_nthreads");
-       T_EXPECT_GE_UINT(pwqinfo.pwq_blockedthreads + pwqinfo.pwq_runthreads, (uint32_t)1,
-                        "PROC_PIDWORKQUEUEINFO returns valid value for pwqinfo.pwq_runthreads/pwq_blockedthreads");
-       T_EXPECT_EQ_UINT(pwqinfo.pwq_state, (uint32_t)0, "PROC_PIDWORKQUEUEINFO returns valid value for pwq_state");
-
-       kill_child_processes(proc_config);
-       free_proc_config(proc_config);
-}
-T_DECL(proc_info_proc_pidnoteexit,
-       "Test to verify PROC_PIDNOTEEXIT returns valid information about the process",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       /*
-        * Ask the child to close pipe and quit, cleanup pipes for parent
-        */
-       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid             = proc_config->child_pids[0];
-       send_action_to_child_processes(proc_config, ACT_EXIT);
-
-       uint32_t exit_data = 0;
-       int retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDNOTEEXIT, (uint64_t)(NOTE_EXITSTATUS | NOTE_EXIT_DETAIL),
-                                (user_addr_t)&exit_data, (uint32_t)sizeof(exit_data));
-       T_EXPECT_EQ_INT(retval, (int)sizeof(exit_data), "__proc_info call for PROC_PIDNOTEEXIT");
-
-       T_EXPECT_EQ_UINT(exit_data, 0U, "PROC_PIDNOTEEXIT returned valid value for exit_data");
-
-       free_proc_config(proc_config);
-}
-
-T_DECL(proc_info_negative_tests,
-       "Test to validate PROC_INFO_CALL_PIDINFO for invalid arguments",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       proc_config_t proc_config = spawn_child_processes(1, proc_info_call_pidinfo_handler);
-       int child_pid             = proc_config->child_pids[0];
-       uint32_t exit_data        = 0;
-
-       int retval =
-           __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDNOTEEXIT, (uint64_t)0, (user_addr_t)&exit_data, (uint32_t)0);
-       T_EXPECT_EQ_INT(errno, ENOMEM, "PROC_INFO_CALL_PIDINFO call should fail with ENOMEM if buffersize is zero");
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, PROC_PIDPATHINFO, (uint64_t)0, (user_addr_t)&exit_data,
-                            (uint32_t)PROC_PIDPATHINFO_MAXSIZE + 1);
-       T_EXPECT_EQ_INT(errno, EOVERFLOW,
-                       "PROC_INFO_CALL_PIDINFO call should fail with EOVERFLOW if buffersize is larger than PROC_PIDPATHINFO_MAXSIZE");
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, -1, PROC_PIDNOTEEXIT, (uint64_t)0, (user_addr_t)&exit_data,
-                            (uint32_t)sizeof(exit_data));
-       T_EXPECT_EQ_INT(errno, ESRCH, "PROC_INFO_CALL_PIDINFO call should fail with ESRCH for invalid process id");
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, child_pid, -1U, (uint64_t)0, (user_addr_t)&exit_data, (uint32_t)sizeof(exit_data));
-       T_EXPECT_EQ_INT(errno, EINVAL, "PROC_INFO_CALL_PIDINFO call should fail with EINVAL for invalid flavor");
-       retval = __proc_info(PROC_INFO_CALL_PIDINFO, 0, PROC_PIDWORKQUEUEINFO, (uint64_t)0, (user_addr_t)0, (uint32_t)0);
-       T_EXPECT_EQ_INT(errno, EINVAL,
-                       "PROC_INFO_CALL_PIDINFO call should fail with EINVAL if flavor is PROC_PIDWORKQUEUEINFO and pid=0");
-
-       free_proc_config(proc_config);
-}
-
-/*
- * END PROC_INFO_CALL_PIDINFO DECLs
- */
-
-#pragma mark proc_list_uptrs
-
-#define NUPTRS 4
-static uint64_t uptrs[NUPTRS] = {0x1122334455667788ULL, 0x99aabbccddeeff00ULL, 0xaabbaaddccaaffeeULL, 0xcc000011ccaa7755ULL};
-
-static const char * uptr_names[NUPTRS];
-
-static void
-print_uptrs(int argc, char * const * argv)
-{
-       for (int i = 0; i < argc; i++) {
-               char * end;
-               unsigned long pid = strtoul(argv[i], &end, 0);
-               if (pid > INT_MAX) {
-                       printf("error: pid '%lu' would overflow an integer\n", pid);
-                       continue;
-               }
-               if (end == argv[i]) {
-                       printf("error: could not parse '%s' as a pid\n", argv[i]);
-                       continue;
-               }
-               int uptrs_count = proc_list_uptrs((int)pid, NULL, 0);
-               if (uptrs_count == 0) {
-                       printf("no uptrs for process %d\n", (int)pid);
-                       return;
-               }
-
-               /* extra space */
-               unsigned int uptrs_len = (unsigned int)uptrs_count + 32;
-
-               uint64_t * uptrs_alloc = malloc(sizeof(uint64_t) * uptrs_len);
-               os_assert(uptrs_alloc != NULL);
-
-               uptrs_count = proc_list_uptrs((int)pid, uptrs_alloc, (uint32_t)(sizeof(uint64_t) * uptrs_len));
-               printf("process %d has %d uptrs:\n", (int)pid, uptrs_count);
-               if (uptrs_count > (int)uptrs_len) {
-                       uptrs_count = (int)uptrs_len;
-               }
-               for (int j = 0; j < uptrs_count; j++) {
-                       printf("%#17" PRIx64 "\n", uptrs_alloc[j]);
-               }
-       }
-}
-
-T_DECL(proc_list_uptrs, "the kernel should return any up-pointers it knows about", T_META_ALL_VALID_ARCHS(YES))
-{
-       if (argc > 0) {
-               print_uptrs(argc, argv);
-               T_SKIP("command line invocation of tool, not test");
-       }
-
-       unsigned int cur_uptr = 0;
-
-       int kq = kqueue();
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(kq, "kqueue");
-
-       /*
-        * Should find uptrs on file-type knotes and generic knotes (two
-        * different search locations, internally).
-        */
-       struct kevent64_s events[2];
-       memset(events, 0, sizeof(events));
-
-       uptr_names[cur_uptr] = "kqueue file-backed knote";
-       events[0].filter     = EVFILT_WRITE;
-       events[0].ident      = STDOUT_FILENO;
-       events[0].flags      = EV_ADD;
-       events[0].udata      = uptrs[cur_uptr++];
-
-       uptr_names[cur_uptr] = "kqueue non-file-backed knote";
-       events[1].filter     = EVFILT_USER;
-       events[1].ident      = 1;
-       events[1].flags      = EV_ADD;
-       events[1].udata      = uptrs[cur_uptr++];
-
-       int kev_err = kevent64(kq, events, sizeof(events) / sizeof(events[0]), NULL, 0, KEVENT_FLAG_IMMEDIATE, NULL);
-       T_ASSERT_POSIX_SUCCESS(kev_err, "register events with kevent64");
-
-       /*
-        * Should find uptrs both on a kevent_id kqueue and in a workloop
-        * kqueue's knote's udata field.
-        */
-       uptr_names[cur_uptr]            = "dynamic kqueue non-file-backed knote";
-       struct kevent_qos_s events_id[] = {{.filter = EVFILT_USER, .ident = 1, .flags = EV_ADD, .udata = uptrs[cur_uptr++]}};
-
-       uptr_names[cur_uptr] = "dynamic kqueue ID";
-       kev_err = kevent_id(uptrs[cur_uptr++], events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE);
-       T_ASSERT_POSIX_SUCCESS(kev_err, "register event with kevent_id");
-
-       errno           = 0;
-       int uptrs_count = proc_list_uptrs(getpid(), NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(uptrs_count, "proc_list_uptrs");
-       T_QUIET;
-       T_EXPECT_EQ(uptrs_count, NUPTRS, "should see correct number of up-pointers");
-
-       uint64_t uptrs_obs[NUPTRS] = {0};
-       uptrs_count                = proc_list_uptrs(getpid(), uptrs_obs, sizeof(uptrs_obs));
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(uptrs_count, "proc_list_uptrs");
-
-       for (int i = 0; i < uptrs_count; i++) {
-               int found = -1;
-               for (int j = 0; j < NUPTRS; j++) {
-                       if (uptrs_obs[i] == uptrs[j]) {
-                               found = j;
-                               goto next;
-                       }
-               }
-               T_FAIL("unexpected up-pointer found: %#" PRIx64, uptrs_obs[i]);
-       next:;
-               if (found != -1) {
-                       T_PASS("found up-pointer for %s", uptr_names[found]);
-               }
-       }
-
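-       /* pass a buffer size that only fits a single entry (plus one stray byte); the kernel must not write past it */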
-       uint64_t up_overflow[2] = {0};
-       uptrs_count = proc_list_uptrs(getpid(), up_overflow, sizeof(uint64_t)+1);
-       T_ASSERT_EQ(up_overflow[1], 0, "overflow check");
-}
-
-#pragma mark dynamic kqueue info
-
-#define EXPECTED_ID UINT64_C(0x1122334455667788)
-#define EXPECTED_UDATA UINT64_C(0x99aabbccddeeff00)
-#ifndef KQ_WORKLOOP
-#define KQ_WORKLOOP 0x80
-#endif
-
-static void
-setup_kevent_id(kqueue_id_t id)
-{
-       struct kevent_qos_s events_id[] = {{.filter = EVFILT_USER, .ident = 1, .flags = EV_ADD, .udata = EXPECTED_UDATA}};
-
-       int err = kevent_id(id, events_id, 1, NULL, 0, NULL, NULL, KEVENT_FLAG_WORKLOOP | KEVENT_FLAG_IMMEDIATE);
-       T_ASSERT_POSIX_SUCCESS(err, "register event with kevent_id");
-}
-
-static kqueue_id_t *
-list_kqids(pid_t pid, int * nkqids_out)
-{
-       int kqids_len = 256;
-       int nkqids;
-       kqueue_id_t * kqids = NULL;
-       uint32_t kqids_size;
-
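-       /* grow the ID buffer until it can hold every dynamic kqueue, capped at PROC_PIDDYNKQUEUES_MAX */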
-retry:
-       if (os_mul_overflow(sizeof(kqueue_id_t), kqids_len, &kqids_size)) {
-               T_QUIET;
-               T_ASSERT_GT(kqids_len, PROC_PIDDYNKQUEUES_MAX, NULL);
-               kqids_len = PROC_PIDDYNKQUEUES_MAX;
-               goto retry;
-       }
-       if (!kqids) {
-               kqids = malloc(kqids_size);
-               T_QUIET;
-               T_ASSERT_NOTNULL(kqids, "malloc(%" PRIu32 ")", kqids_size);
-       }
-
-       nkqids = proc_list_dynkqueueids(pid, kqids, kqids_size);
-       if (nkqids > kqids_len && kqids_len < PROC_PIDDYNKQUEUES_MAX) {
-               kqids_len *= 2;
-               if (kqids_len > PROC_PIDDYNKQUEUES_MAX) {
-                       kqids_len = PROC_PIDDYNKQUEUES_MAX;
-               }
-               free(kqids);
-               kqids = NULL;
-               goto retry;
-       }
-
-       *nkqids_out = nkqids;
-       return kqids;
-}
-
-T_DECL(list_dynamic_kqueues, "the kernel should list IDs of dynamic kqueues", T_META_ALL_VALID_ARCHS(true))
-{
-       int nkqids;
-       bool found = false;
-
-       setup_kevent_id(EXPECTED_ID);
-       kqueue_id_t * kqids = list_kqids(getpid(), &nkqids);
-       T_ASSERT_GE(nkqids, 1, "at least one dynamic kqueue is listed");
-       for (int i = 0; i < nkqids; i++) {
-               if (kqids[i] == EXPECTED_ID) {
-                       found = true;
-                       T_PASS("found expected dynamic kqueue ID");
-               } else {
-                       T_LOG("found another dynamic kqueue with ID %#" PRIx64, kqids[i]);
-               }
-       }
-
-       if (!found) {
-               T_FAIL("could not find dynamic ID of kqueue created");
-       }
-
-       free(kqids);
-}
-
-T_DECL(dynamic_kqueue_basic_info, "the kernel should report valid basic dynamic kqueue info", T_META_ALL_VALID_ARCHS(true))
-{
-       struct kqueue_info kqinfo;
-       int ret;
-
-       setup_kevent_id(EXPECTED_ID);
-       ret = proc_piddynkqueueinfo(getpid(), PROC_PIDDYNKQUEUE_INFO, EXPECTED_ID, &kqinfo, sizeof(kqinfo));
-       T_ASSERT_POSIX_SUCCESS(ret, "proc_piddynkqueueinfo(... PROC_PIDDYNKQUEUE_INFO ...)");
-       T_QUIET;
-       T_ASSERT_GE(ret, (int)sizeof(kqinfo), "PROC_PIDDYNKQUEUE_INFO should return the right size");
-
-       T_EXPECT_NE(kqinfo.kq_state & KQ_WORKLOOP, 0U, "kqueue info should be for a workloop kqueue");
-       T_EXPECT_EQ(kqinfo.kq_stat.vst_ino, EXPECTED_ID, "inode field should be the kqueue's ID");
-}
-
-T_DECL(dynamic_kqueue_extended_info, "the kernel should report valid extended dynamic kqueue info", T_META_ALL_VALID_ARCHS(true))
-{
-       struct kevent_extinfo kqextinfo[1];
-       int ret;
-
-       setup_kevent_id(EXPECTED_ID);
-       ret = proc_piddynkqueueinfo(getpid(), PROC_PIDDYNKQUEUE_EXTINFO, EXPECTED_ID, kqextinfo, sizeof(kqextinfo));
-       T_ASSERT_POSIX_SUCCESS(ret, "proc_piddynkqueueinfo(... PROC_PIDDYNKQUEUE_EXTINFO ...)");
-       T_QUIET;
-       T_ASSERT_EQ(ret, 1, "PROC_PIDDYNKQUEUE_EXTINFO should return a single knote");
-
-       T_EXPECT_EQ(kqextinfo[0].kqext_kev.ident, 1ULL, "kevent identifier matches what was configured");
-       T_EXPECT_EQ(kqextinfo[0].kqext_kev.filter, (short)EVFILT_USER, "kevent filter matches what was configured");
-       T_EXPECT_EQ(kqextinfo[0].kqext_kev.udata, EXPECTED_UDATA, "kevent udata matches what was configured");
-}
-
-#pragma mark proc_listpids
-
-T_DECL(list_kdebug_pids, "the kernel should report processes that are filtered by kdebug", T_META_ASROOT(YES))
-{
-       int mib[4] = {CTL_KERN, KERN_KDEBUG};
-       int npids;
-       int pids[1];
-       int ret;
-       kd_regtype reg;
-       size_t regsize = sizeof(reg);
-
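-       /* tear down any existing kdebug state before configuring a fresh trace session */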
-       mib[2] = KERN_KDREMOVE;
-       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl");
-
-       mib[2] = KERN_KDSETBUF;
-       mib[3] = 100000;
-       ret    = sysctl(mib, 4, NULL, NULL, NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDSETBUF sysctl");
-
-       mib[2] = KERN_KDSETUP;
-       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDSETUP sysctl");
-
-       npids = proc_listpids(PROC_KDBG_ONLY, 0, pids, sizeof(pids));
-       T_EXPECT_EQ(npids, 0, "no processes should be filtered initially");
-
-       reg.type   = KDBG_TYPENONE;
-       reg.value1 = (unsigned int)getpid();
-       reg.value2 = 1; /* set the pid in the filter */
-       mib[2]     = KERN_KDPIDTR;
-       ret        = sysctl(mib, 3, &reg, &regsize, NULL, 0);
-       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDPIDTR sysctl to set a pid in the filter");
-
-       npids = proc_listpids(PROC_KDBG_ONLY, 0, pids, sizeof(pids));
-       npids /= (int)sizeof(pids[0]); /* proc_listpids returns a byte count */
-       T_EXPECT_EQ(npids, 1, "a process should be filtered");
-       T_EXPECT_EQ(pids[0], getpid(), "process filtered should be the one that was set");
-
-       mib[2] = KERN_KDREMOVE;
-       ret    = sysctl(mib, 3, NULL, NULL, NULL, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "KERN_KDREMOVE sysctl");
-}
diff --git a/tools/tests/darwintests/proc_info_udata.c b/tools/tests/darwintests/proc_info_udata.c
deleted file mode 100644 (file)
index f814be4..0000000
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <darwintest.h>
-#include "../../../bsd/sys/proc_info.h"
-#include "../../../libsyscall/wrappers/libproc/libproc.h"
-#include <stdio.h>
-#include <unistd.h>
-
-T_DECL(proc_udata_info, "Get and set a proc udata token"){
-       uint64_t token = mach_absolute_time();
-       proc_info_udata_t udata;
-       int ret;
-       
-       udata = token;
-       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_SET, &udata, sizeof (udata));
-
-#if CONFIG_EMBEDDED
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET returns error on non-macOS");
-       T_SKIP("Remaining tests are only supported on macOS");
-#endif /* CONFIG_EMBEDDED */
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, 0, "proc_udata_info PROC_UDATA_INFO_SET");
-
-       T_LOG("udata set to %#llx", udata);
-
-       bzero(&udata, sizeof (udata));
-       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_GET, &udata, sizeof (udata));
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, 0, "proc_udata_info PROC_UDATA_INFO_GET");
-
-       T_ASSERT_EQ_ULLONG(token, udata, "proc_udata_info(): retrieved value matches token");
-
-       ret = proc_udata_info(getpid(), PROC_UDATA_INFO_SET, &udata, sizeof (uint32_t));
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET with invalid size returned -1");
-       T_ASSERT_EQ_INT(errno, EINVAL, "proc_udata_info PROC_UDATA_INFO_SET with invalid size returned EINVAL");
-
-       ret = proc_udata_info(getppid(), PROC_UDATA_INFO_GET, &udata, sizeof (udata));
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_GET returned -1 on attempt against non-self pid");
-       T_ASSERT_EQ_INT(errno, EACCES, "proc_udata_info PROC_UDATA_INFO_GET set errno to EACCES on attempt against non-self pid");
-
-       ret = proc_udata_info(getppid(), PROC_UDATA_INFO_SET, &udata, sizeof (udata));
-       T_WITH_ERRNO;
-       T_ASSERT_EQ_INT(ret, -1, "proc_udata_info PROC_UDATA_INFO_SET returned -1 on attempt against non-self pid");
-       T_ASSERT_EQ_INT(errno, EACCES, "proc_udata_info PROC_UDATA_INFO_SET set errno to EACCES on attempt against non-self pid");
-}
diff --git a/tools/tests/darwintests/proc_uuid_policy_26567533.c b/tools/tests/darwintests/proc_uuid_policy_26567533.c
deleted file mode 100644 (file)
index 470d5ca..0000000
+++ /dev/null
@@ -1,42 +0,0 @@
-#include <darwintest.h>
-#include <uuid/uuid.h>
-#include <System/sys/proc_uuid_policy.h>
-#include <stdint.h>
-
-#define NUM_PROC_UUID_POLICY_FLAGS 4
-
-T_DECL(proc_uuid_policy_26567533, "Tests passing a NULL uuid in (uap->uuid).", T_META_LTEPHASE(LTE_POSTINIT))
-{
-       int i, ret;
-       uuid_t null_uuid;
-       memset(null_uuid, 0, sizeof(uuid_t));
-
-       uint32_t policy_flags[] = {
-               PROC_UUID_POLICY_FLAGS_NONE,
-               PROC_UUID_NO_CELLULAR,
-               PROC_UUID_NECP_APP_POLICY,
-               PROC_UUID_ALT_DYLD_POLICY
-       };
-
-       for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) {
-               T_LOG("Testing policy add with flag value 0x%x", policy_flags[i]);
-
-               /* Since UUID is null, this call should fail with errno = EINVAL. */
-               ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_ADD, null_uuid, sizeof(uuid_t), policy_flags[i]);
-
-               T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret);
-               T_WITH_ERRNO;
-               T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno);
-       }
-
-       for (i = 0; i < NUM_PROC_UUID_POLICY_FLAGS; i++) {
-               T_LOG("Testing policy remove with flag value 0x%x", policy_flags[i]);
-
-               /* Since UUID is null, this call should fail with errno = EINVAL. */
-               ret = proc_uuid_policy(PROC_UUID_POLICY_OPERATION_REMOVE, null_uuid, sizeof(uuid_t), policy_flags[i]);
-
-               T_ASSERT_TRUE(ret == -1, "proc_uuid_policy returned %d", ret);
-               T_WITH_ERRNO;
-               T_ASSERT_TRUE(errno == EINVAL, "errno is %d", errno);
-       }
-}
diff --git a/tools/tests/darwintests/pwrite_avoid_sigxfsz_28581610.c b/tools/tests/darwintests/pwrite_avoid_sigxfsz_28581610.c
deleted file mode 100644 (file)
index 9c39e55..0000000
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * testname: pwrite_avoid_sigxfsz_28581610
- */
-
-#include <darwintest.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <setjmp.h>
-#include <signal.h>
-#include <sys/resource.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#define TMP_FILE_PATH "/tmp/test_pwrite"
-
-static sigjmp_buf xfsz_jmpbuf;
-
-void xfsz_signal(int);
-
-void
-xfsz_signal(__unused int signo)
-{
-       siglongjmp(xfsz_jmpbuf, 1);
-}
-
-T_DECL(pwrite, "Tests avoiding SIGXFSZ with pwrite and odd offsets",
-               T_META_ASROOT(true))
-{
-       int fd, x;
-       off_t ret;
-       struct stat f_stat;
-       struct rlimit crl;
-       static const int offs[] = { -1, -1 * 1024, -1 * 1024 * 16, -1 * 1024 * 1024 * 16, 0 };
-       static unsigned char buffer[1048576];
-
-       T_SETUPBEGIN;
-       /* We expect zero SIGXFSZ signals because we have no file size limits */
-       crl.rlim_cur = crl.rlim_max = RLIM_INFINITY;
-       ret = setrlimit(RLIMIT_FSIZE, &crl);
-       T_ASSERT_POSIX_SUCCESS(ret, "setting infinite file size limit");
-
-       /* we just needed root to setup unlimited file size */
-       remove(TMP_FILE_PATH);
-       setuid(5000);
-
-       /* We just want an empty regular file to test with */
-       fd = open(TMP_FILE_PATH, O_RDWR | O_CREAT | O_EXCL, 0777);
-       T_ASSERT_POSIX_SUCCESS(fd, "opening fd on temp file %s.", TMP_FILE_PATH);
-
-       /* sanity check that this new file is really zero bytes in size */
-       ret = fstat(fd, &f_stat);
-       T_ASSERT_POSIX_SUCCESS(ret, "stat() fd on temp file.");
-       T_ASSERT_TRUE(0 == f_stat.st_size, "ensure %s is empty", TMP_FILE_PATH);
-
-       /* sanity check that ftruncate() considers negative offsets an error */
-       for (x = 0; offs[x] != 0; x++) {
-               ret = ftruncate(fd, offs[x]);
-               T_ASSERT_TRUE(((ret == -1) && (errno == EINVAL)),
-                               "negative offset %d", offs[x]);
-       }
-
-       T_SETUPEND;
-
-       /* we want to get the EFBIG errno but without a SIGXFSZ signal */
-       T_EXPECTFAIL;
-       if (!sigsetjmp(xfsz_jmpbuf, 1)) {
-               signal(SIGXFSZ, xfsz_signal);
-               ret = pwrite(fd, buffer, sizeof buffer, LONG_MAX);
-               T_ASSERT_TRUE(((ret == -1) && (errno == EFBIG)),
-                               "large offset %lX", LONG_MAX);
-       } else {
-               signal(SIGXFSZ, SIG_DFL);
-               T_FAIL("%s unexpected SIGXFSZ with offset %lX",
-                "<rdar://problem/28581610>", LONG_MAX);
-       }
-
-       /* Negative offsets are invalid, no SIGXFSZ signals required */
-       for (x = 0; offs[x] != 0; x++) {
-               /* only -1 gives the correct result */
-               if (-1 != offs[x]) {
-                       T_EXPECTFAIL;
-               }
-
-               if (!sigsetjmp(xfsz_jmpbuf, 1)) {
-                       signal(SIGXFSZ, xfsz_signal);
-                       ret = pwrite(fd, buffer, sizeof buffer, offs[x]);
-                       T_ASSERT_TRUE(((ret == -1) && (errno == EINVAL)),
-                                       "negative offset %d", offs[x]);
-               } else {
-                       signal(SIGXFSZ, SIG_DFL);
-                       T_FAIL("%s unexpected SIGXFSZ with negative offset %d",
-                   "<rdar://problem/28581610>", offs[x]);
-               }
-       }
-
-       remove(TMP_FILE_PATH);
-}
diff --git a/tools/tests/darwintests/regression_17272465.c b/tools/tests/darwintests/regression_17272465.c
deleted file mode 100644 (file)
index ed2dc10..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <darwintest.h>
-#include <stdio.h>
-#include <mach/mach.h>
-#include <mach/host_priv.h>
-
-
-T_DECL(regression_17272465,
-       "Test for host_set_special_port Mach port over-release, rdr: 17272465", T_META_CHECK_LEAKS(false))
-{
-       kern_return_t kr;
-       mach_port_t port = MACH_PORT_NULL;
-
-       T_SETUPBEGIN;
-       T_QUIET;
-       T_ASSERT_MACH_SUCCESS(mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &port), NULL);
-       T_QUIET;
-       T_ASSERT_MACH_SUCCESS(mach_port_insert_right(mach_task_self(), port, port, MACH_MSG_TYPE_MAKE_SEND), NULL);
-       T_SETUPEND;
-
-       (void)host_set_special_port(mach_host_self(), 30, port);
-       (void)host_set_special_port(mach_host_self(), 30, port);
-       (void)host_set_special_port(mach_host_self(), 30, port);
-
-       T_PASS("No panic occurred");
-}
diff --git a/tools/tests/darwintests/remote_time.c b/tools/tests/darwintests/remote_time.c
deleted file mode 100644 (file)
index cd028a9..0000000
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <darwintest.h>
-#include <System/kern/remote_time.h>
-#include <mach/mach_time.h>
-#include <stdint.h>
-#include <sys/sysctl.h>
-#include <TargetConditionals.h>
-extern uint64_t __mach_bridge_remote_time(uint64_t);
-
-T_DECL(remote_time_syscall, "test mach_bridge_remote_time syscall",
-       T_META_CHECK_LEAKS(false))
-{
-#if TARGET_OS_BRIDGE
-       uint64_t local_time = mach_absolute_time();
-       uint64_t remote_time1 = mach_bridge_remote_time(local_time);
-       uint64_t remote_time2 = __mach_bridge_remote_time(local_time);
-       T_LOG("local_time = %llu, remote_time1 = %llu, remote_time2 = %llu",
-               local_time, remote_time1, remote_time2);
-       T_ASSERT_EQ(remote_time1, remote_time2, "syscall works");
-#else
-       T_SKIP("Skipping test");
-#endif /* TARGET_OS_BRIDGE */
-}
diff --git a/tools/tests/darwintests/settimeofday_29193041.c b/tools/tests/darwintests/settimeofday_29193041.c
deleted file mode 100644 (file)
index 6bb495d..0000000
+++ /dev/null
@@ -1,158 +0,0 @@
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <mach/clock_types.h>
-#include <sys/mman.h>
-#include <sys/timex.h>
-#include <spawn.h>
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#if CONFIG_EMBEDDED
-#include <sys/types.h>
-#include <pwd.h>
-#include <uuid/uuid.h>
-#endif
-
-/*
- * This test expects the entitlement or root privileges for a process to
- * set the time using settimeofday syscall.
- */
-
-#define DAY 86400 //1 day in sec
-
-T_DECL(settime_32089962_not_entitled_root,
-       "Verify that root privileges allow changing the time",
-       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-       struct timeval settimeofdaytime;
-       struct timeval adj_time;
-       struct timex ntptime;
-
-       if (geteuid() != 0){
-                T_SKIP("settime_32089962_not_entitled_root test requires root privileges to run.");
-        }
-
-       /* test settimeofday */
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
-       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
-
-       /* test adjtime */
-       adj_time.tv_sec = 1;
-       adj_time.tv_usec = 0;
-       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
-
-       /* test ntp_adjtime */
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes |= MOD_STATUS;
-       ntptime.status = TIME_OK;
-
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-}
-
-T_DECL(settime_32089962_not_entitled_not_root,
-       "Verify that the \"com.apple.settime\" entitlement allows changing the time",
-       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
-{
-       struct timeval settimeofdaytime;
-       struct timeval adj_time;
-       struct timex ntptime;
-       int res;
-
-       if (geteuid() == 0){
-                T_SKIP("settime_32089962_not_entitled_not_root test requires no root privileges to run.");
-        }
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
-
-       /* test settimeofday */
-#if TARGET_OS_EMBEDDED
-       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
-#else
-       res = settimeofday(&settimeofdaytime, NULL);
-       T_ASSERT_EQ(res, -1, NULL);
-#endif
-
-       /* test adjtime */
-       adj_time.tv_sec = 1;
-       adj_time.tv_usec = 0;
-       res = adjtime(&adj_time, NULL);
-       T_ASSERT_EQ(res, -1, NULL);
-
-       /* test ntp_adjtime */
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes |= MOD_STATUS;
-       ntptime.status = TIME_OK;
-       res = ntp_adjtime(&ntptime);
-       T_ASSERT_EQ(res, -1, NULL);
-}
-
-T_DECL(settimeofday_29193041_not_entitled_root,
-       "Verify that root privileges allow changing the time",
-       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-       struct timeval time;
-       long new_time;
-
-       if (geteuid() != 0){
-                T_SKIP("settimeofday_29193041_not_entitled_root test requires root privileges to run.");
-        }
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* increment the time of one day */
-       new_time = time.tv_sec + DAY;
-
-       time.tv_sec = new_time;
-       time.tv_usec = 0;
-
-       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* expect to be past new_time */
-       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time changed with root and without entitlement");
-
-       time.tv_sec -= DAY;
-       T_QUIET;T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-}
-
-T_DECL(settimeofday_29193041_not_entitled_not_root,
-       "Verify that the \"com.apple.settime\" entitlement allows changing the time",
-       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
-{
-       struct timeval time;
-       long new_time;
-
-       if (geteuid() == 0){
-                T_SKIP("settimeofday_29193041_not_entitled_not_root test requires no root privileges to run.");
-        }
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* increment the time of one day */
-       new_time = time.tv_sec + DAY;
-
-       time.tv_sec = new_time;
-       time.tv_usec = 0;
-
-#if TARGET_OS_EMBEDDED
-       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-#else
-       int res = settimeofday(&time, NULL);
-       T_ASSERT_EQ(res, -1, NULL);
-#endif
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-#if TARGET_OS_EMBEDDED
-       /* expect to be past new_time */
-       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and without entitlement");
-       time.tv_sec -= DAY;
-       T_QUIET; T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-#else
-       T_EXPECT_LT_LONG(time.tv_sec, new_time, "Not permitted to change time without root and without entitlement");
-#endif
-
-}
diff --git a/tools/tests/darwintests/settimeofday_29193041.entitlements b/tools/tests/darwintests/settimeofday_29193041.entitlements
deleted file mode 100644 (file)
index fafc6c9..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.settime</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/settimeofday_29193041_entitled.c b/tools/tests/darwintests/settimeofday_29193041_entitled.c
deleted file mode 100644 (file)
index 51ca5a5..0000000
+++ /dev/null
@@ -1,143 +0,0 @@
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mman.h>
-#include <mach/clock_types.h>
-#include <sys/timex.h>
-#include <spawn.h>
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#if CONFIG_EMBEDDED
-#include <sys/types.h>
-#include <pwd.h>
-#include <uuid/uuid.h>
-#endif
-
-/*
- * This test expects the entitlement or root privileges for a process to
- * set the time using settimeofday syscall.
- */
-
-#define DAY 86400 //1 day in sec
-
-T_DECL(settime_32089962_entitled_root,
-       "Verify that root privileges allow changing the time",
-       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-       struct timeval settimeofdaytime;
-       struct timeval adj_time;
-       struct timex ntptime;
-
-       if (geteuid() != 0){
-                T_SKIP("settime_32089962_entitled_root test requires root privileges to run.");
-        }
-
-       /* test settimeofday */
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
-       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
-
-       /* test adjtime */
-       adj_time.tv_sec = 1;
-       adj_time.tv_usec = 0;
-       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
-
-       /* test ntp_adjtime */
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes |= MOD_STATUS;
-       ntptime.status = TIME_OK;
-
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-}
-
-T_DECL(settime_32089962_entitled_not_root,
-       "Verify that the \"com.apple.settime\" entitlement allows changing the time",
-       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
-{
-
-       struct timeval settimeofdaytime;
-       struct timeval adj_time;
-       struct timex ntptime;
-
-       if (geteuid() == 0){
-                T_SKIP("settime_32089962_entitled_not_root test requires no root privileges to run.");
-        }
-
-       /* test settimeofday */
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&settimeofdaytime, NULL), NULL);
-       T_ASSERT_POSIX_ZERO(settimeofday(&settimeofdaytime, NULL), NULL);
-
-       /* test adjtime */
-       adj_time.tv_sec = 1;
-       adj_time.tv_usec = 0;
-       T_ASSERT_POSIX_ZERO(adjtime(&adj_time, NULL),NULL);
-
-       /* test ntp_adjtime */
-       memset(&ntptime, 0, sizeof(ntptime));
-       ntptime.modes |= MOD_STATUS;
-       ntptime.status = TIME_OK;
-
-       T_ASSERT_EQ(ntp_adjtime(&ntptime), TIME_OK, NULL);
-
-}
-
-T_DECL(settimeofday_29193041_entitled_root,
-       "Verify that root privileges allow changing the time",
-       T_META_ASROOT(true), T_META_CHECK_LEAKS(false))
-{
-       struct timeval time;
-       long new_time;
-
-       if (geteuid() != 0){
-                T_SKIP("settimeofday_root_29193041 test requires root privileges to run.");
-        }
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* increment the time of one day */
-       new_time = time.tv_sec + DAY;
-
-       time.tv_sec = new_time;
-       time.tv_usec = 0;
-
-       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* expect to be past new_time */
-       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time changed with root and entitlement");
-
-       time.tv_sec -= DAY;
-       T_QUIET;T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-}
-
-T_DECL(settimeofday_29193041_entitled_not_root,
-       "Verify that the \"com.apple.settime\" entitlement allows changing the time",
-       T_META_ASROOT(false), T_META_CHECK_LEAKS(false))
-{
-       struct timeval time;
-       long new_time;
-
-       if (geteuid() == 0){
-                T_SKIP("settimeofday_29193041 test requires no root privileges to run.");
-        }
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* increment the time of one day */
-       new_time = time.tv_sec + DAY;
-
-       time.tv_sec = new_time;
-       time.tv_usec = 0;
-
-       T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-
-       T_QUIET; T_ASSERT_POSIX_ZERO(gettimeofday(&time, NULL), NULL);
-
-       /* expect to be past new_time */
-       T_EXPECT_GE_LONG(time.tv_sec, new_time, "Time successfully changed without root and with entitlement");
-       
-       time.tv_sec -= DAY;
-       T_QUIET; T_ASSERT_POSIX_ZERO(settimeofday(&time, NULL), NULL);
-}
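The permission model these tests exercise can be seen from any ordinary program: settimeofday(2) succeeds for root or for a process carrying the settime entitlement, and is otherwise expected to fail with EPERM on macOS (the TARGET_OS_EMBEDDED branches above assume embedded platforms permit it). A minimal standalone sketch of the same call pattern, outside the darwintest harness:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define DAY 86400 /* one day in seconds, mirroring the tests' DAY constant */

int
main(void)
{
	struct timeval now;

	if (gettimeofday(&now, NULL) != 0) {
		perror("gettimeofday");
		return 1;
	}
	now.tv_sec += DAY;
	now.tv_usec = 0;
	if (settimeofday(&now, NULL) != 0) {
		/* expected for a non-root, non-entitled caller on macOS */
		printf("settimeofday failed: %s\n", strerror(errno));
	} else {
		/* only expected with root or the com.apple.private.settime entitlement */
		now.tv_sec -= DAY;
		(void)settimeofday(&now, NULL); /* restore the clock */
		printf("settimeofday succeeded\n");
	}
	return 0;
}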
diff --git a/tools/tests/darwintests/sigchld_return.c b/tools/tests/darwintests/sigchld_return.c
deleted file mode 100644 (file)
index 6a3cc6b..0000000
+++ /dev/null
@@ -1,50 +0,0 @@
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-
-#include <darwintest.h>
-
-
-static int exitcode = 0x6789BEEF;
-int should_exit = 0;
-
-void handler (int sig, siginfo_t *sip, __unused void *uconp)
-{
-        /* Should handle the SIGCHLD signal */
-        T_ASSERT_EQ_INT(sig, SIGCHLD, "Captured signal returns 0x%x, expected SIGCHLD (0x%x).", sig, SIGCHLD);
-        T_QUIET; T_ASSERT_NOTNULL(sip, "siginfo_t returned NULL but should have returned data.");
-        T_ASSERT_EQ_INT(sip->si_code, CLD_EXITED, "si_code returns 0x%x, expected CLD_EXITED (0x%x).", sip->si_code, CLD_EXITED);
-        T_ASSERT_EQ_INT(sip->si_status, exitcode, "si_status returns 0x%08X, expected the child's exit code (0x%08X).", sip->si_status, exitcode);
-        should_exit = 1;
-}
-
-
-T_DECL(sigchldreturn, "checks that a child process exited with an exitcode returns correctly to parent", T_META_CHECK_LEAKS(false))
-{
-        struct sigaction act;
-        int pid;
-
-        act.sa_sigaction = handler;
-        act.sa_flags = SA_SIGINFO;
-
-        /* Set action for signal */
-        T_QUIET; T_ASSERT_POSIX_SUCCESS(sigaction (SIGCHLD, &act, NULL), "Calling sigaction() failed for SIGCHLD");
-
-        /* Now fork a child that just exits */
-        pid = fork();
-        T_QUIET; T_ASSERT_NE_INT(pid, -1, "fork() failed!");
-
-        if (pid == 0) {
-                /* Child process! */
-                exit (exitcode);
-        }
-
-        /* Main program that did the fork */
-        /* We should process the signal, then exit */
-        while (!should_exit) {
-                sleep(1);
-        }
-}
-
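The test above polls a plain int flag that is written from signal context. A more conventional POSIX pattern (shown here only as a general sketch, not as part of the deleted test) uses a volatile sig_atomic_t flag and sigsuspend(), so the SIGCHLD wakeup cannot be lost between checking the flag and going to sleep:

#include <signal.h>
#include <stddef.h>

static volatile sig_atomic_t child_exited; /* async-signal-safe flag type */

static void
on_sigchld(int sig, siginfo_t *sip, void *uconp)
{
	(void)sig; (void)sip; (void)uconp;
	child_exited = 1;
}

/* Hypothetical helper: block SIGCHLD, then atomically unblock and wait in
 * sigsuspend() until the handler above has run. */
static void
wait_for_sigchld(void)
{
	sigset_t block, old;

	sigemptyset(&block);
	sigaddset(&block, SIGCHLD);
	sigprocmask(SIG_BLOCK, &block, &old);
	while (!child_exited) {
		sigsuspend(&old); /* returns -1 with errno == EINTR after the handler runs */
	}
	sigprocmask(SIG_SETMASK, &old, NULL);
}

The handler would still be installed with sigaction() and SA_SIGINFO, exactly as in the test above.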
diff --git a/tools/tests/darwintests/sigcont_return.c b/tools/tests/darwintests/sigcont_return.c
deleted file mode 100644 (file)
index 606caa9..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-
-#include <darwintest.h>
-
-T_DECL(sigcontreturn, "checks that a call to waitid() for a child that is stopped and then continued returns correctly")
-{
-        pid_t           pid;
-        siginfo_t       siginfo;
-        pid = fork();
-        T_QUIET; T_ASSERT_NE_INT(pid, -1, "fork() failed!");
-
-        if (pid == 0) {
-                while(1){}
-        }
-
-        kill(pid, SIGSTOP);
-        kill(pid, SIGCONT);
-        sleep(1);
-
-        T_QUIET; T_ASSERT_POSIX_SUCCESS(waitid(P_PID, pid, &siginfo, WCONTINUED), "Calling waitid() failed for pid %d", pid);
-
-        T_ASSERT_EQ_INT(siginfo.si_status, SIGCONT, "A call to waitid() for stopped and continued child returns 0x%x, expected SIGCONT (0x%x)", siginfo.si_status, SIGCONT );
-        kill(pid, SIGKILL);
-}
diff --git a/tools/tests/darwintests/socket_bind_35243417.c b/tools/tests/darwintests/socket_bind_35243417.c
deleted file mode 100644 (file)
index cb44aa5..0000000
+++ /dev/null
@@ -1,107 +0,0 @@
-/* -*- Mode: c; tab-width: 8; indent-tabs-mode: 1; c-basic-offset: 8; -*- */
-
-#include <darwintest.h>
-#include <poll.h>
-#include <sys/socket.h>
-#include <unistd.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <errno.h>
-
-static int
-sockv6_open(void)
-{
-       int     s;
-
-       s = socket(AF_INET6, SOCK_DGRAM, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(s, "socket(AF_INET6, SOCK_DGRAM, 0)");
-       return (s);
-}
-
-static int
-sockv6_bind(int s, in_port_t port)
-{
-       struct sockaddr_in6     sin6;
-
-       bzero(&sin6, sizeof(sin6));
-       sin6.sin6_len = sizeof(sin6);
-       sin6.sin6_family = AF_INET6;
-       sin6.sin6_port = port;
-       return (bind(s, (const struct sockaddr *)&sin6, sizeof(sin6)));
-}
-
-static void
-sockv6_set_v6only(int s)
-{
-       int             on = 1;
-       int             ret;
-
-       ret = setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "setsockopt(%d, IPV6_ONLY)", s);
-}
-
-static bool
-alloc_and_bind_ports(in_port_t port_start, in_port_t port_end,
-                    int bind_attempts)
-{
-       int     bound_count = 0;
-       bool    success = true;
-
-       for (in_port_t i = port_start; success && i <= port_end; i++) {
-               int     s6 = -1;
-               int     s6_other = -1;
-               int     ret;
-
-               s6 = sockv6_open();
-               sockv6_set_v6only(s6);
-               if (sockv6_bind(s6, i) != 0) {
-                       /* find the next available port */
-                       goto loop_done;
-               }
-               s6_other = sockv6_open();
-               ret = sockv6_bind(s6_other, i);
-               T_WITH_ERRNO;
-               T_QUIET;
-               T_ASSERT_TRUE(ret != 0, "socket %d bind %d", s6_other, i);
-               /*
-                * After bind fails, try binding to a different port.
-                * For non-root user, this will panic without the fix for
-                * <rdar://problem/35243417>.
-                */
-               if (sockv6_bind(s6_other, i + 1) == 0) {
-                       bound_count++;
-                       if (bound_count >= bind_attempts) {
-                               break;
-                       }
-               }
-       loop_done:
-               if (s6 >= 0) {
-                       close(s6);
-               }
-               if (s6_other >= 0) {
-                       close(s6_other);
-               }
-       }
-       T_ASSERT_TRUE(bound_count == bind_attempts,
-                     "number of successful binds %d (out of %d)",
-                     bound_count, bind_attempts);
-       return (success);
-}
-
-
-T_DECL(socket_bind_35243417,
-       "bind IPv6 only UDP socket, then bind IPv6 socket.",
-       T_META_ASROOT(false),
-       T_META_CHECK_LEAKS(false))
-{
-       alloc_and_bind_ports(1, 65534, 10);
-}
-
-T_DECL(socket_bind_35243417_root,
-       "bind IPv6 only UDP socket, then bind IPv6 socket.",
-       T_META_ASROOT(true))
-{
-       alloc_and_bind_ports(1, 65534, 10);
-}
diff --git a/tools/tests/darwintests/socket_bind_35685803.c b/tools/tests/darwintests/socket_bind_35685803.c
deleted file mode 100644 (file)
index d0e22a9..0000000
+++ /dev/null
@@ -1,205 +0,0 @@
-/* -*- Mode: c; tab-width: 8; indent-tabs-mode: 1; c-basic-offset: 8; -*- */
-
-#include <darwintest.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <errno.h>
-#include <pthread.h>
-#include <stdbool.h>
-
-static bool debug;
-
-static int
-sock_open_common(int pf, int type)
-{
-       int     s;
-
-       s = socket(pf, type, 0);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(s, "socket(%d, %d, 0)", pf, type);
-       return (s);
-}
-
-static int
-sock_open(int type)
-{
-       return (sock_open_common(PF_INET, type));
-}
-
-static int
-sock_bind(int s, int port)
-{
-       struct sockaddr_in      sin = {
-               .sin_len = sizeof(sin),
-               .sin_family = AF_INET,
-       };
-
-       sin.sin_port = htons(port);
-       return (bind(s, (const struct sockaddr *)&sin, sizeof(sin)));
-}
-
-static int
-sockv6_open(int type)
-{
-       return (sock_open_common(PF_INET6, type));
-}
-
-static int
-sockv6_bind(int s, int port)
-{
-       struct sockaddr_in6             sin6 = {
-               .sin6_len = sizeof(sin6),
-               .sin6_family = AF_INET6,
-       };
-
-       sin6.sin6_port = htons(port);
-       return (bind(s, (const struct sockaddr *)&sin6, sizeof(sin6)));
-}
-
-static uint16_t
-sock_get_port(int sockfd)
-{
-       int                             error;
-       uint16_t                        p;
-       union sockaddr_in_4_6   sin;
-       socklen_t                       sin_len;
-
-       sin_len = sizeof(sin);
-       bzero(&sin, sin_len);
-       error = getsockname(sockfd, (struct sockaddr *)&sin, &sin_len);
-       T_QUIET;
-       T_EXPECT_POSIX_ZERO(error, "getsockname(%d)", sockfd);
-       if (error != 0) {
-               return (0);
-       }
-       switch (sin.sa.sa_family) {
-       case AF_INET:
-               p = sin.sin.sin_port;
-               break;
-       case AF_INET6:
-               p = sin.sin6.sin6_port;
-               break;
-       default:
-               T_ASSERT_FAIL("unknown address family %d\n",
-                             sin.sa.sa_family);
-               p = 0;
-               break;
-       }
-       return (p);
-}
-
-typedef struct {
-       bool    v6;
-       int             socket_count;
-       int *   socket_list;
-} SocketInfo, * SocketInfoRef;
-
-static void
-bind_sockets(SocketInfoRef info, const char * msg)
-{
-       for (int i = 0; i < info->socket_count; i++) {
-               int             error;
-               uint16_t        port;
-
-               if (info->v6) {
-                       error = sockv6_bind(info->socket_list[i], 0);
-               }
-               else {
-                       error = sock_bind(info->socket_list[i], 0);
-               }
-               port = sock_get_port(info->socket_list[i]);
-               if (debug) {
-                       T_LOG( "%s: fd %d port is %d error %d",
-                              msg, info->socket_list[i], ntohs(port), error);
-               }
-       }
-       return;
-}
-
-static void *
-second_thread(void * arg)
-{
-       SocketInfoRef   info = (SocketInfoRef)arg;
-
-       bind_sockets(info, "second");
-       return (NULL);
-}
-
-static void
-multithreaded_bind_test(bool v6, int socket_count)
-{
-       int             error;
-       SocketInfo      info;
-       int     socket_list[socket_count];
-       pthread_t       thread;
-
-       info.v6 = v6;
-       for (int i = 0; i < socket_count; i++) {
-               if (v6) {
-                       socket_list[i] = sockv6_open(SOCK_STREAM);
-               } else {
-                       socket_list[i] = sock_open(SOCK_STREAM);
-               }
-       }
-       info.socket_count = socket_count;
-       info.socket_list = socket_list;
-       error = pthread_create(&thread, NULL, second_thread, &info);
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(error, "pthread_create");
-
-       /* compete with second thread */
-       bind_sockets(&info, "main");
-       error = pthread_join(thread, NULL);
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(error, "pthread_join");
-
-       for (int i = 0; i < socket_count; i++) {
-               error = close(socket_list[i]);
-               T_QUIET;
-               T_ASSERT_POSIX_ZERO(error, "close socket %d", socket_list[i]);
-       }
-}
-
-static void
-run_multithreaded_bind_test(int number_of_runs, bool v6, int socket_count)
-{
-       for (int i = 0; i < number_of_runs; i++) {
-               multithreaded_bind_test(v6, socket_count);
-       }
-       T_PASS("multithreaded_bind_test %s", v6 ? "IPv6" : "IPv4");
-}
-
-T_DECL(socket_bind_35685803,
-       "multithreaded bind IPv4 socket",
-       T_META_ASROOT(false),
-       T_META_CHECK_LEAKS(false))
-{
-       run_multithreaded_bind_test(100, false, 100);
-}
-
-T_DECL(socket_bind_35685803_root,
-       "multithreaded bind IPv4 socket as root",
-       T_META_ASROOT(true))
-{
-       run_multithreaded_bind_test(100, false, 100);
-}
-
-T_DECL(socket_bind_35685803_v6,
-       "multithreaded bind IPv6 socket",
-       T_META_ASROOT(false),
-       T_META_CHECK_LEAKS(false))
-{
-       run_multithreaded_bind_test(100, true, 100);
-}
-
-T_DECL(socket_bind_35685803_v6_root,
-       "multithreaded bind IPv6 socket as root",
-       T_META_ASROOT(true))
-{
-       run_multithreaded_bind_test(100, true, 100);
-}
diff --git a/tools/tests/darwintests/socket_poll_close_25786011.c b/tools/tests/darwintests/socket_poll_close_25786011.c
deleted file mode 100644 (file)
index b39b936..0000000
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <darwintest.h>
-#include <poll.h>
-#include <sys/socket.h>
-#include <unistd.h>
-
-T_DECL(socket_poll_close_25786011, "Tests an invalid poll call to a socket and then calling close.", T_META_LTEPHASE(LTE_POSTINIT))
-{
-       int my_socket, ret;
-
-       my_socket = socket(PF_LOCAL, SOCK_STREAM, 0);
-       T_WITH_ERRNO; T_ASSERT_TRUE(my_socket > 0, "create socket");
-
-       /*
-        * Setup a pollfd that we know will return an error when we try
-        * to create a knote for it. We specify a BSD vnode specific event
-        * for a socket.
-        */
-       struct pollfd my_pollfd = {
-               .fd = my_socket,
-               .events = POLLEXTEND
-       };
-
-       /*
-        * Previously the call to kevent_register() in the kernel from this call
-        * would leak an iocount reference on the fileproc, which would cause any
-        * subsequent calls to close() on the associated fd to block indefinitely.
-        */
-       ret = poll(&my_pollfd, 1, 0);
-       T_WITH_ERRNO; T_ASSERT_TRUE(ret == 1, "poll returned %d", ret);
-
-       ret = close(my_socket);
-       T_ASSERT_POSIX_ZERO(ret, "close on socket with fd %d\n", my_socket);
-
-       T_PASS("socket_poll_close_25786011 PASSED");
-}
diff --git a/tools/tests/darwintests/stackshot.m b/tools/tests/darwintests/stackshot.m
deleted file mode 100644 (file)
index 2c5b37d..0000000
+++ /dev/null
@@ -1,619 +0,0 @@
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <kern/debug.h>
-#include <kern/kern_cdata.h>
-#include <kdd.h>
-#include <libproc.h>
-#include <sys/syscall.h>
-#include <sys/stackshot.h>
-
-T_GLOBAL_META(
-               T_META_NAMESPACE("xnu.stackshot"),
-               T_META_CHECK_LEAKS(false),
-               T_META_ASROOT(true)
-               );
-
-static const char *current_process_name(void);
-static void parse_stackshot(bool delta, void *ssbuf, size_t sslen);
-static void parse_thread_group_stackshot(void **sbuf, size_t sslen);
-static uint64_t stackshot_timestamp(void *ssbuf, size_t sslen);
-static void initialize_thread(void);
-
-#define DEFAULT_STACKSHOT_BUFFER_SIZE (1024 * 1024)
-#define MAX_STACKSHOT_BUFFER_SIZE     (6 * 1024 * 1024)
-
-T_DECL(microstackshots, "test the microstackshot syscall")
-{
-       void *buf = NULL;
-       unsigned int size = DEFAULT_STACKSHOT_BUFFER_SIZE;
-
-       while (1) {
-               buf = malloc(size);
-               T_QUIET; T_ASSERT_NOTNULL(buf, "allocated stackshot buffer");
-
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wdeprecated-declarations"
-               int len = syscall(SYS_microstackshot, buf, size,
-                               STACKSHOT_GET_MICROSTACKSHOT);
-#pragma clang diagnostic pop
-               if (len == -1 && errno == ENOSYS) {
-                       T_SKIP("microstackshot syscall failed, likely not compiled with CONFIG_TELEMETRY");
-               }
-               if (len == -1 && errno == ENOSPC) {
-                       /* syscall failed because buffer wasn't large enough, try again */
-                       free(buf);
-                       buf = NULL;
-                       size *= 2;
-                       T_ASSERT_LE(size, (unsigned int)MAX_STACKSHOT_BUFFER_SIZE,
-                                       "growing stackshot buffer to sane size");
-                       continue;
-               }
-               T_ASSERT_POSIX_SUCCESS(len, "called microstackshot syscall");
-               break;
-	}
-
-       T_EXPECT_EQ(*(uint32_t *)buf,
-                       (uint32_t)STACKSHOT_MICRO_SNAPSHOT_MAGIC,
-                       "magic value for microstackshot matches");
-
-       free(buf);
-}
-
-struct scenario {
-       uint32_t flags;
-       bool should_fail;
-       pid_t target_pid;
-       uint64_t since_timestamp;
-       uint32_t size_hint;
-       dt_stat_time_t timer;
-};
-
-static void
-quiet(struct scenario *scenario)
-{
-       if (scenario->timer) {
-               T_QUIET;
-       }
-}
-
-static void
-take_stackshot(struct scenario *scenario, void (^cb)(void *buf, size_t size))
-{
-       void *config = stackshot_config_create();
-       quiet(scenario);
-       T_ASSERT_NOTNULL(config, "created stackshot config");
-
-       int ret = stackshot_config_set_flags(config, scenario->flags);
-       quiet(scenario);
-       T_ASSERT_POSIX_ZERO(ret, "set flags %#x on stackshot config", scenario->flags);
-
-       if (scenario->size_hint > 0) {
-               ret = stackshot_config_set_size_hint(config, scenario->size_hint);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set size hint %" PRIu32 " on stackshot config",
-                               scenario->size_hint);
-       }
-
-       if (scenario->target_pid > 0) {
-               ret = stackshot_config_set_pid(config, scenario->target_pid);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set target pid %d on stackshot config",
-                               scenario->target_pid);
-       }
-
-       if (scenario->since_timestamp > 0) {
-               ret = stackshot_config_set_delta_timestamp(config, scenario->since_timestamp);
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "set since timestamp %" PRIu64 " on stackshot config",
-                               scenario->since_timestamp);
-       }
-
-       int retries_remaining = 5;
-
-retry: ;
-       uint64_t start_time = mach_absolute_time();
-       ret = stackshot_capture_with_config(config);
-       uint64_t end_time = mach_absolute_time();
-
-       if (scenario->should_fail) {
-               T_EXPECTFAIL;
-               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
-               return;
-       }
-
-       if (ret == EBUSY || ret == ETIMEDOUT) {
-               if (retries_remaining > 0) {
-                       if (!scenario->timer) {
-                               T_LOG("stackshot_capture_with_config failed with %s (%d), retrying",
-                                               strerror(ret), ret);
-                       }
-
-                       retries_remaining--;
-                       goto retry;
-               } else {
-                       T_ASSERT_POSIX_ZERO(ret,
-                                       "called stackshot_capture_with_config (no retries remaining)");
-               }
-       } else {
-               quiet(scenario);
-               T_ASSERT_POSIX_ZERO(ret, "called stackshot_capture_with_config");
-       }
-
-       if (scenario->timer) {
-               dt_stat_mach_time_add(scenario->timer, end_time - start_time);
-       }
-       cb(stackshot_config_get_stackshot_buffer(config), stackshot_config_get_stackshot_size(config));
-
-       ret = stackshot_config_dealloc(config);
-       T_QUIET; T_EXPECT_POSIX_ZERO(ret, "deallocated stackshot config");
-}
-
-T_DECL(kcdata, "test that kcdata stackshots can be taken and parsed")
-{
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS |
-                               STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT)
-       };
-
-       initialize_thread();
-       T_LOG("taking kcdata stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(false, ssbuf, sslen);
-       });
-}
-
-T_DECL(kcdata_faulting, "test that kcdata stackshots while faulting can be taken and parsed")
-{
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
-                               | STACKSHOT_ENABLE_BT_FAULTING | STACKSHOT_ENABLE_UUID_FAULTING),
-       };
-
-       initialize_thread();
-       T_LOG("taking faulting stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(false, ssbuf, sslen);
-       });
-}
-
-T_DECL(bad_flags, "test a poorly-formed stackshot syscall")
-{
-       struct scenario scenario = {
-               .flags = STACKSHOT_SAVE_IN_KERNEL_BUFFER /* not allowed from user space */,
-               .should_fail = true
-       };
-
-       T_LOG("attempting to take stackshot with kernel-only flag");
-       take_stackshot(&scenario, ^(__unused void *ssbuf, __unused size_t sslen) {
-               T_ASSERT_FAIL("stackshot data callback called");
-       });
-}
-
-T_DECL(delta, "test delta stackshots")
-{
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                               | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT)
-       };
-
-       initialize_thread();
-       T_LOG("taking full stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
-
-               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
-
-               parse_stackshot(false, ssbuf, sslen);
-
-               struct scenario delta_scenario = {
-                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT
-                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
-                       .since_timestamp = stackshot_time
-               };
-
-               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
-                       parse_stackshot(true, dssbuf, dsslen);
-               });
-       });
-}
-
-static void
-expect_instrs_cycles_in_stackshot(void *ssbuf, size_t sslen)
-{
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-
-       bool in_task = false;
-       bool in_thread = false;
-       bool saw_instrs_cycles = false;
-       iter = kcdata_iter_next(iter);
-
-       KCDATA_ITER_FOREACH(iter) {
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_CONTAINER_BEGIN:
-                       switch (kcdata_iter_container_type(iter)) {
-                       case STACKSHOT_KCCONTAINER_TASK:
-                               in_task = true;
-                               saw_instrs_cycles = false;
-                               break;
-
-                       case STACKSHOT_KCCONTAINER_THREAD:
-                               in_thread = true;
-                               saw_instrs_cycles = false;
-                               break;
-
-                       default:
-                               break;
-                       }
-                       break;
-
-               case STACKSHOT_KCTYPE_INSTRS_CYCLES:
-                       saw_instrs_cycles = true;
-                       break;
-
-               case KCDATA_TYPE_CONTAINER_END:
-                       if (in_thread) {
-                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, "saw instructions and cycles in thread");
-                               in_thread = false;
-                       } else if (in_task) {
-                               T_QUIET; T_EXPECT_TRUE(saw_instrs_cycles, "saw instructions and cycles in task");
-                               in_task = false;
-                       }
-
-               default:
-                       break;
-               }
-       }
-}
-
-static void
-skip_if_monotonic_unsupported(void)
-{
-       int supported = 0;
-       size_t supported_size = sizeof(supported);
-       int ret = sysctlbyname("kern.monotonic.supported", &supported, &supported_size, 0, 0);
-       if (ret < 0 || !supported) {
-               T_SKIP("monotonic is unsupported");
-       }
-}
-
-T_DECL(instrs_cycles, "test getting instructions and cycles in a stackshot")
-{
-       skip_if_monotonic_unsupported();
-
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                               | STACKSHOT_KCDATA_FORMAT)
-       };
-
-       T_LOG("attempting to take stackshot with instructions and cycles");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_stackshot(false, ssbuf, sslen);
-               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
-       });
-}
-
-T_DECL(delta_instrs_cycles, "test delta stackshots with instructions and cycles")
-{
-       skip_if_monotonic_unsupported();
-
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                               | STACKSHOT_KCDATA_FORMAT)
-       };
-
-       initialize_thread();
-       T_LOG("taking full stackshot");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               uint64_t stackshot_time = stackshot_timestamp(ssbuf, sslen);
-
-               T_LOG("taking delta stackshot since time %" PRIu64, stackshot_time);
-
-               parse_stackshot(false, ssbuf, sslen);
-               expect_instrs_cycles_in_stackshot(ssbuf, sslen);
-
-               struct scenario delta_scenario = {
-                       .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_INSTRS_CYCLES
-                                       | STACKSHOT_KCDATA_FORMAT
-                                       | STACKSHOT_COLLECT_DELTA_SNAPSHOT),
-                       .since_timestamp = stackshot_time
-               };
-
-               take_stackshot(&delta_scenario, ^(void *dssbuf, size_t dsslen) {
-                       parse_stackshot(true, dssbuf, dsslen);
-                       expect_instrs_cycles_in_stackshot(dssbuf, dsslen);
-               });
-       });
-}
-
-static void
-check_thread_groups_supported()
-{
-       int err;
-       int supported = 0;
-       size_t supported_size = sizeof(supported);
-       err = sysctlbyname("kern.thread_groups_supported", &supported, &supported_size, NULL, 0);
-
-       if (err || !supported)
-               T_SKIP("thread groups not supported on this system");
-}
-
-T_DECL(thread_groups, "test getting thread groups in stackshot")
-{
-       check_thread_groups_supported();
-
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_THREAD_GROUP
-                               | STACKSHOT_KCDATA_FORMAT)
-       };
-
-       T_LOG("attempting to take stackshot with thread group flag");
-       take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-               parse_thread_group_stackshot(ssbuf, sslen);
-       });
-
-}
-
-#pragma mark performance tests
-
-#define SHOULD_REUSE_SIZE_HINT 0x01
-#define SHOULD_USE_DELTA       0x02
-#define SHOULD_TARGET_SELF     0x04
-
-static void
-stackshot_perf(unsigned int options)
-{
-       struct scenario scenario = {
-               .flags = (STACKSHOT_SAVE_LOADINFO | STACKSHOT_GET_GLOBAL_MEM_STATS
-                       | STACKSHOT_SAVE_IMP_DONATION_PIDS | STACKSHOT_KCDATA_FORMAT),
-       };
-
-       dt_stat_t size = dt_stat_create("bytes", "size");
-       dt_stat_time_t duration = dt_stat_time_create("duration");
-       scenario.timer = duration;
-
-       if (options & SHOULD_TARGET_SELF) {
-               scenario.target_pid = getpid();
-       }
-
-       while (!dt_stat_stable(duration) || !dt_stat_stable(size)) {
-               __block uint64_t last_time = 0;
-               __block uint32_t size_hint = 0;
-               take_stackshot(&scenario, ^(void *ssbuf, size_t sslen) {
-                       dt_stat_add(size, (double)sslen);
-                       last_time = stackshot_timestamp(ssbuf, sslen);
-                       size_hint = (uint32_t)sslen;
-               });
-               if (options & SHOULD_USE_DELTA) {
-                       scenario.since_timestamp = last_time;
-                       scenario.flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
-               }
-               if (options & SHOULD_REUSE_SIZE_HINT) {
-                       scenario.size_hint = size_hint;
-               }
-       }
-
-       dt_stat_finalize(duration);
-       dt_stat_finalize(size);
-}
-
-T_DECL(perf_no_size_hint, "test stackshot performance with no size hint")
-{
-       stackshot_perf(0);
-}
-
-T_DECL(perf_size_hint, "test stackshot performance with size hint")
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT);
-}
-
-T_DECL(perf_process, "test stackshot performance targeted at process")
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_TARGET_SELF);
-}
-
-T_DECL(perf_delta, "test delta stackshot performance")
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA);
-}
-
-T_DECL(perf_delta_process, "test delta stackshot performance targeted at a process")
-{
-       stackshot_perf(SHOULD_REUSE_SIZE_HINT | SHOULD_USE_DELTA | SHOULD_TARGET_SELF);
-}
-
-static uint64_t
-stackshot_timestamp(void *ssbuf, size_t sslen)
-{
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-
-       uint32_t type = kcdata_iter_type(iter);
-       if (type != KCDATA_BUFFER_BEGIN_STACKSHOT && type != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) {
-               T_ASSERT_FAIL("invalid kcdata type %u", kcdata_iter_type(iter));
-       }
-
-       iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME);
-       T_QUIET;
-       T_ASSERT_TRUE(kcdata_iter_valid(iter), "timestamp found in stackshot");
-
-       return *(uint64_t *)kcdata_iter_payload(iter);
-}
-
-#define TEST_THREAD_NAME "stackshot_test_thread"
-
-static void
-parse_thread_group_stackshot(void **ssbuf, size_t sslen)
-{
-       bool seen_thread_group_snapshot = false;
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-       T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
-                       "buffer provided is a stackshot");
-
-       NSMutableSet *thread_groups = [[NSMutableSet alloc] init];
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter) {
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_ARRAY: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
-                                       "checked that array is valid");
-
-                       if (kcdata_iter_array_elem_type(iter) != STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT) {
-                               continue;
-                       }
-
-                       seen_thread_group_snapshot = true;
-
-                       if (kcdata_iter_array_elem_size(iter) >= sizeof(struct thread_group_snapshot_v2)) {
-                               struct thread_group_snapshot_v2 *tgs_array = kcdata_iter_payload(iter);
-                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
-                                       struct thread_group_snapshot_v2 *tgs = tgs_array + j;
-                                       [thread_groups addObject:@(tgs->tgs_id)];
-                               }
-
-                       }
-                       else {
-                               struct thread_group_snapshot *tgs_array = kcdata_iter_payload(iter);
-                               for (uint32_t j = 0; j < kcdata_iter_array_elem_count(iter); j++) {
-                                       struct thread_group_snapshot *tgs = tgs_array + j;
-                                       [thread_groups addObject:@(tgs->tgs_id)];
-                               }
-                       }
-                       break;
-               }
-               }
-       }
-       KCDATA_ITER_FOREACH(iter) {
-               NSError *error = nil;
-
-               switch (kcdata_iter_type(iter)) {
-
-               case KCDATA_TYPE_CONTAINER_BEGIN: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
-                                       "checked that container is valid");
-
-                       NSDictionary *container = parseKCDataContainer(&iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
-
-                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_THREAD) {
-                               break;
-                       }
-
-                       int tg = [container[@"thread_snapshots"][@"thread_group"] intValue];
-
-                       T_ASSERT_TRUE([thread_groups containsObject:@(tg)], "check that the thread group the thread is in exists");
-
-                       break;
-               };
-
-               }
-       }
-       T_ASSERT_TRUE(seen_thread_group_snapshot, "check that we have seen a thread group snapshot");
-}
-
-static void
-parse_stackshot(bool delta, void *ssbuf, size_t sslen)
-{
-       kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);
-       if (delta) {
-               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
-                               "buffer provided is a delta stackshot");
-       } else {
-               T_ASSERT_EQ(kcdata_iter_type(iter), KCDATA_BUFFER_BEGIN_STACKSHOT,
-                               "buffer provided is a stackshot");
-       }
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter) {
-               NSError *error = nil;
-
-               switch (kcdata_iter_type(iter)) {
-               case KCDATA_TYPE_ARRAY: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_array_valid(iter),
-                                       "checked that array is valid");
-
-                       NSMutableDictionary *array = parseKCDataArray(iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(array, "parsed array from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing array");
-                       break;
-               }
-
-               case KCDATA_TYPE_CONTAINER_BEGIN: {
-                       T_QUIET;
-                       T_ASSERT_TRUE(kcdata_iter_container_valid(iter),
-                                       "checked that container is valid");
-
-                       NSDictionary *container = parseKCDataContainer(&iter, &error);
-                       T_QUIET; T_ASSERT_NOTNULL(container, "parsed container from stackshot");
-                       T_QUIET; T_ASSERT_NULL(error, "error unset after parsing container");
-
-                       if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
-                               break;
-                       }
-                       int pid = [container[@"task_snapshots"][@"task_snapshot"][@"ts_pid"] intValue];
-                       if (pid != getpid()) {
-                               break;
-                       }
-
-                       T_EXPECT_EQ_STR(current_process_name(),
-                                       [container[@"task_snapshots"][@"task_snapshot"][@"ts_p_comm"] UTF8String],
-                                       "current process name matches in stackshot");
-
-                       T_QUIET;
-                       T_EXPECT_LE(pid, [container[@"task_snapshots"][@"task_snapshot"][@"ts_unique_pid"] intValue],
-                                       "unique pid is greater than pid");
-
-                       bool found_main_thread = 0;
-                       for (id thread_key in container[@"task_snapshots"][@"thread_snapshots"]) {
-                               NSMutableDictionary *thread = container[@"task_snapshots"][@"thread_snapshots"][thread_key];
-                               NSDictionary *thread_snap = thread[@"thread_snapshot"];
-
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_thread_id"] intValue], 0,
-                                               "thread ID of thread in current task is valid");
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_total_syscalls"] intValue], 0,
-                                               "total syscalls of thread in current task is valid");
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_base_priority"] intValue], 0,
-                                               "base priority of thread in current task is valid");
-                               T_QUIET; T_EXPECT_GT([thread_snap[@"ths_sched_priority"] intValue], 0,
-                                               "scheduling priority of thread in current task is valid");
-
-                               NSString *pth_name = thread_snap[@"pth_name"];
-                               if (pth_name != nil && [pth_name isEqualToString:@TEST_THREAD_NAME]) {
-                                       found_main_thread = true;
-                               }
-                       }
-                       T_EXPECT_TRUE(found_main_thread, "found main thread for current task in stackshot");
-                       break;
-               }
-               }
-       }
-
-       T_ASSERT_FALSE(KCDATA_ITER_FOREACH_FAILED(iter), "successfully iterated kcdata");
-}
-
-static const char *
-current_process_name(void)
-{
-       static char name[64];
-
-       if (!name[0]) {
-               int ret = proc_name(getpid(), name, sizeof(name));
-               T_QUIET;
-               T_ASSERT_POSIX_ZERO(ret, "proc_name failed for current process");
-       }
-
-       return name;
-}
-
-static void
-initialize_thread(void)
-{
-       int ret = pthread_setname_np(TEST_THREAD_NAME);
-       T_QUIET;
-       T_ASSERT_POSIX_ZERO(ret, "set thread name to %s", TEST_THREAD_NAME);
-}
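The parsing helpers above lean on the Objective-C kdd wrappers (parseKCDataArray/parseKCDataContainer); the underlying buffer can also be walked with the plain C iterator API that the same tests use for type checks. A minimal sketch, assuming the headers the tests above include, that only counts task containers in a captured buffer:

#include <stddef.h>
#include <kern/kcdata.h>

/* Count STACKSHOT_KCCONTAINER_TASK containers in a stackshot kcdata buffer.
 * Illustrative only; a real parser would descend into each container. */
static unsigned int
count_task_containers(void *ssbuf, size_t sslen)
{
	unsigned int tasks = 0;
	kcdata_iter_t iter = kcdata_iter(ssbuf, sslen);

	if (kcdata_iter_type(iter) != KCDATA_BUFFER_BEGIN_STACKSHOT &&
	    kcdata_iter_type(iter) != KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT) {
		return 0; /* not a (delta) stackshot buffer */
	}

	iter = kcdata_iter_next(iter);
	KCDATA_ITER_FOREACH(iter) {
		if (kcdata_iter_type(iter) == KCDATA_TYPE_CONTAINER_BEGIN &&
		    kcdata_iter_container_type(iter) == STACKSHOT_KCCONTAINER_TASK) {
			tasks++;
		}
	}
	return tasks;
}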
diff --git a/tools/tests/darwintests/stackshot_block_owner_14362384.m b/tools/tests/darwintests/stackshot_block_owner_14362384.m
deleted file mode 100644 (file)
index bf4f3ae..0000000
+++ /dev/null
@@ -1,858 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-#include <darwintest.h>
-
-#include <kdd.h>
-#include <kern/kcdata.h>
-#include <kern/debug.h>
-#include <kern/block_hint.h>
-#include <mach/mach.h>
-#include <mach/mach_init.h>
-#include <mach/mach_traps.h>
-#include <mach/message.h>
-#include <mach/port.h>
-#include <mach/semaphore.h>
-#include <mach/task.h>
-#include <os/lock.h>
-#include <pthread.h>
-#include <sys/sysctl.h>
-#include <sys/stackshot.h>
-#include <sys/types.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <TargetConditionals.h>
-
-#if !TARGET_OS_EMBEDDED
-#include <pcre.h>
-#endif
-
-
-T_GLOBAL_META(
-        T_META_NAMESPACE("xnu.scheduler"),
-        T_META_ASROOT(true)
-);
-
-#include <Foundation/Foundation.h>
-
-#define SENDS_TO_BLOCK 6
-#define NUMRETRIES 5
-#define KRWLCK_STORES_EXCL_OWNER 0
-
-#define KMUTEX_SYSCTL_CHECK_EXISTS   0
-#define KMUTEX_SYSCTL_ACQUIRE_WAIT   1
-#define KMUTEX_SYSCTL_ACQUIRE_NOWAIT 2
-#define KMUTEX_SYSCTL_SIGNAL         3
-#define KMUTEX_SYSCTL_TEARDOWN       4
-
-#define KRWLCK_SYSCTL_CHECK_EXISTS    0
-#define KRWLCK_SYSCTL_RACQUIRE_NOWAIT 1
-#define KRWLCK_SYSCTL_RACQUIRE_WAIT   2
-#define KRWLCK_SYSCTL_WACQUIRE_NOWAIT 3
-#define KRWLCK_SYSCTL_WACQUIRE_WAIT   4
-#define KRWLCK_SYSCTL_SIGNAL          5
-#define KRWLCK_SYSCTL_TEARDOWN        6
-
-static const char kmutex_ctl[] = "debug.test_MutexOwnerCtl";
-static const char krwlck_ctl[] = "debug.test_RWLockOwnerCtl";
-
-static mach_port_t send = MACH_PORT_NULL;
-static mach_port_t recv = MACH_PORT_NULL;
-
-static void *
-take_stackshot(uint32_t extra_flags, uint64_t since_timestamp)
-{
-       void * stackshot = NULL;
-       int ret = 0;
-       uint32_t stackshot_flags = STACKSHOT_SAVE_LOADINFO |
-                                       STACKSHOT_GET_GLOBAL_MEM_STATS |
-                                       STACKSHOT_SAVE_IMP_DONATION_PIDS |
-                                       STACKSHOT_KCDATA_FORMAT;
-
-       if (since_timestamp != 0)
-               stackshot_flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
-
-       stackshot_flags |= extra_flags;
-
-       stackshot = stackshot_config_create();
-       T_QUIET; T_ASSERT_NOTNULL(stackshot, "Allocating stackshot config");
-
-       ret = stackshot_config_set_flags(stackshot, stackshot_flags);
-       T_ASSERT_POSIX_ZERO(ret, "Setting flags on stackshot config");
-
-       ret = stackshot_config_set_pid(stackshot, getpid());
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Setting target pid on stackshot config");
-
-       if (since_timestamp != 0) {
-               ret = stackshot_config_set_delta_timestamp(stackshot, since_timestamp);
-               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Setting prev snapshot time on stackshot config");
-       }
-
-       for (int retries = NUMRETRIES; retries > 0; retries--) {
-               ret = stackshot_capture_with_config(stackshot);
-               T_QUIET; T_ASSERT_TRUE(ret == 0 || ret == EBUSY || ret == ETIMEDOUT,
-                               "Attempting to take stackshot (error %d)...", ret);
-               if (retries == 1 && (ret == EBUSY || ret == ETIMEDOUT))
-                       T_ASSERT_FAIL("Failed to take stackshot after %d retries: got %d (%s)", NUMRETRIES, ret, strerror(ret));
-               if (ret == 0)
-                       break;
-       }
-       return stackshot;
-}
-
-static void
-save_stackshot(void *stackshot, const char *filename)
-{
-       void *buf = stackshot_config_get_stackshot_buffer(stackshot);
-       T_QUIET; T_ASSERT_NOTNULL(buf, "buf");
-       size_t size = stackshot_config_get_stackshot_size(stackshot);
-       FILE *f = fopen(filename, "w");
-       T_QUIET; T_ASSERT_NOTNULL(f, "f");
-       fwrite(buf, size, 1, f);
-       fclose(f);
-}
-
-static
-void check_python(void *stackshot, const char *fmt, ...)
-{
-       save_stackshot(stackshot, "/tmp/ss");
-
-#if !TARGET_OS_EMBEDDED
-       va_list args;
-       va_start(args, fmt);
-       char *re_string = NULL;
-       vasprintf(&re_string, fmt, args);
-       va_end(args);
-       T_QUIET; T_ASSERT_NOTNULL(re_string, "vasprintf");
-
-       const char *pcreErrorStr;
-       int pcreErrorOffset;
-       pcre *re = pcre_compile(re_string, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
-       T_QUIET; T_ASSERT_NOTNULL(re, "pcre_compile");
-
-       bool found = false;
-       FILE *p = popen("/usr/local/bin/kcdata --pretty /tmp/ss", "r");
-       T_QUIET; T_ASSERT_NOTNULL(p, "popen");
-       while (1) {
-               char *line = NULL;
-               size_t linecap = 0;
-               ssize_t linesize = getline(&line, &linecap, p);
-               if (linesize < 0) {
-                       if (line)
-                               free(line);
-                       break;
-               }
-               int pcre_ret = pcre_exec(re, NULL, line, strlen(line), 0, 0, NULL, 0);
-               if (pcre_ret == 0){
-                       T_LOG("line: %s", line);
-                       found = true;
-               }
-               free(line);
-       }
-       T_EXPECT_TRUE(found, "found the waitinfo in kcdata.py output");
-       pclose(p);
-       pcre_free(re);
-       free(re_string);
-#endif
-}
-
-
-// waitinfo can be NULL, but len must be non-null and point to the length of the waitinfo array.
-// when the function returns, len will be set to the number of waitinfo structs found in the stackshot.
-static void
-find_blocking_info(void * stackshot, struct stackshot_thread_waitinfo *waitinfo, int *len)
-{
-       void *buf = NULL;
-       uint32_t t = 0;
-       uint32_t buflen = 0;
-       NSError *error = nil;
-       NSMutableDictionary *parsed_container = nil;
-       NSArray *parsed_waitinfo = nil;
-
-       T_QUIET; T_ASSERT_NOTNULL(len, "Length pointer shouldn't be NULL");
-       int oldlen = *len;
-       *len = 0;
-
-       buf = stackshot_config_get_stackshot_buffer(stackshot);
-       T_QUIET; T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
-       buflen = stackshot_config_get_stackshot_size(stackshot);
-
-       kcdata_iter_t iter = kcdata_iter(buf, buflen);
-
-       T_QUIET; T_ASSERT_TRUE(kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_STACKSHOT ||
-                       kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
-                       "Checking start of stackshot buffer");
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter)
-       {
-               t = kcdata_iter_type(iter);
-
-               if (t != KCDATA_TYPE_CONTAINER_BEGIN) {
-                       continue;
-               }
-
-               if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
-                       continue;
-               }
-
-               parsed_container = parseKCDataContainer(&iter, &error);
-               T_QUIET; T_ASSERT_TRUE(!error, "Error while parsing container: %d (%s)",
-                               (int)error.code, [error.domain UTF8String]);
-               T_QUIET; T_ASSERT_TRUE(parsed_container && !error, "Parsing container");
-
-               parsed_waitinfo = parsed_container[@"task_snapshots"][@"thread_waitinfo"];
-               for (id elem in parsed_waitinfo) {
-                       /* check to see that tid matches expected idle status */
-                       uint8_t type = [elem[@"wait_type"] unsignedCharValue];
-                       if (type != kThreadWaitNone) {
-                               if (waitinfo && *len < oldlen) {
-                                       struct stackshot_thread_waitinfo *curr = &waitinfo[*len];
-                                       curr->wait_type = type;
-                                       curr->owner     = [elem[@"owner"] unsignedLongLongValue];
-                                       curr->waiter    = [elem[@"waiter"] unsignedLongLongValue];
-                                       curr->context   = [elem[@"context"] unsignedLongLongValue];
-                               }
-                               (*len)++;
-                       }
-               }
-               [parsed_container release];
-       }
-}
-
-/* perform various actions with a mutex in kernel memory. note that, since we aren't allowed
- * to go to user space while still holding a mutex, the lock-acquiring actions in this kernel
- * sysctl will either lock and immediately release the lock, or lock and wait until a semaphore
- * is signalled, then unlock. if called with CHECK_EXISTS, returns whether or not the sysctl
- * exist in the kernel (to determine if we're running with CONFIG_XNUPOST defined). Else,
- * returns 1. */
-static int kmutex_action(int action)
-{
-       int ret = 0;
-       if (action == KMUTEX_SYSCTL_CHECK_EXISTS) {
-               ret = sysctlbyname(krwlck_ctl, NULL, NULL, NULL, 0);
-               return !(ret == -1);
-       }
-
-       char * action_name = "";
-       switch(action) {
-               case KMUTEX_SYSCTL_ACQUIRE_WAIT:
-                       action_name = "lock (and wait)";
-                       break;
-               case KMUTEX_SYSCTL_ACQUIRE_NOWAIT:
-                       action_name = "lock";
-                       break;
-               case KMUTEX_SYSCTL_SIGNAL:
-                       action_name = "signal to holder of";
-                       break;
-               case KMUTEX_SYSCTL_TEARDOWN:
-                       action_name = "tear down";
-                       break;
-               default:
-                       T_ASSERT_FAIL("Somebody passed the wrong argument to kmutex_action: %d", action);
-                       break;
-       }
-
-       ret = sysctlbyname(kmutex_ctl, NULL, NULL, &action, sizeof(int));
-       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: %s kernel mutex", action_name);
-       return 1;
-}
-
-static void
-sysctl_kmutex_test_match(uint64_t context)
-{
-       int ret = 0;
-       unsigned long long unslid_kmutex_address = 0;
-       size_t addrsize = sizeof(unslid_kmutex_address);
-
-       ret = sysctlbyname(kmutex_ctl, &unslid_kmutex_address, &addrsize, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Getting unslid location of kernel mutex. Size is %llu",
-                       (unsigned long long)addrsize);
-       T_EXPECT_EQ(context, unslid_kmutex_address,
-                       "Context should match unslid location of mutex in kernel memory");
-}
-
-/* We don't really care what goes into these messages, we're just sending something to a port. */
-static void
-msg_send_helper(mach_port_t remote_port)
-{
-       int ret;
-        mach_msg_header_t * msg = NULL;
-
-        ret = vm_allocate(mach_task_self(),
-                            (vm_address_t *)&msg,
-                            PAGE_SIZE,
-                            VM_MAKE_TAG(VM_MEMORY_MACH_MSG) | TRUE);
-
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating vm page %p", (void*)msg);
-        msg->msgh_bits = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_COPY_SEND, 0, 0, 0);
-       msg->msgh_size = PAGE_SIZE;
-        msg->msgh_remote_port = remote_port;
-        msg->msgh_local_port = MACH_PORT_NULL;
-        msg->msgh_voucher_port = MACH_PORT_NULL;
-        ret = mach_msg(msg,
-                       MACH_SEND_MSG | MACH_MSG_OPTION_NONE,
-                       PAGE_SIZE,
-                        0,
-                        MACH_PORT_NULL,
-                        MACH_MSG_TIMEOUT_NONE,
-                        MACH_PORT_NULL);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Sending message to port %d", remote_port);
-
-        vm_deallocate(mach_task_self(), (vm_address_t)msg, PAGE_SIZE);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Deallocating vm page %p", (void*)msg);
-}
-
-static void
-msg_recv_helper(mach_port_t local_port)
-{
-       int ret = 0;
-       mach_msg_size_t size = 2*PAGE_SIZE;
-       mach_msg_header_t * msg = NULL;
-        ret = vm_allocate(mach_task_self(),
-                          (vm_address_t *)&msg,
-                         size,
-                          VM_MAKE_TAG(VM_MEMORY_MACH_MSG) | TRUE );
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating page %p for message", (void*)msg);
-
-       ret = mach_msg(msg,
-                       MACH_RCV_MSG,
-                       0,
-                       size,
-                       local_port,
-                       MACH_MSG_TIMEOUT_NONE,
-                       MACH_PORT_NULL);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Received message on port %d", local_port);
-        ret = vm_deallocate(mach_task_self(), (vm_address_t)msg, PAGE_SIZE);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Deallocating page %p", (void*)msg);
-}
-
-/* perform various actions with a rwlock in kernel memory. note that, since we aren't allowed
- * to go to user space while still holding a rwlock, the lock-acquiring actions in this kernel
- * sysctl will either lock and immediately release the lock, or lock and wait until a semaphore
- * is signalled, then unlock. if called with CHECK_EXISTS, returns whether or not the sysctl
- * exist in the kernel (to determine if we're running with CONFIG_XNUPOST defined). Else,
- * returns 1. */
-static int
-krwlck_action(int action)
-{
-       int ret = 0;
-       if (action == KRWLCK_SYSCTL_CHECK_EXISTS) {
-               ret = sysctlbyname(krwlck_ctl, NULL, NULL, NULL, 0);
-               return !(ret == -1);
-       }
-
-       char * action_name = "";
-       switch(action) {
-               case KRWLCK_SYSCTL_RACQUIRE_NOWAIT:
-                       action_name = "shared lock";
-                       break;
-               case KRWLCK_SYSCTL_RACQUIRE_WAIT:
-                       action_name = "shared lock (and wait)";
-                       break;
-               case KRWLCK_SYSCTL_WACQUIRE_NOWAIT:
-                       action_name = "exclusive lock";
-                       break;
-               case KRWLCK_SYSCTL_WACQUIRE_WAIT:
-                       action_name = "exclusive lock (and wait)";
-                       break;
-               case KRWLCK_SYSCTL_SIGNAL:
-                       action_name = "signal to holder of";
-                       break;
-               case KRWLCK_SYSCTL_TEARDOWN:
-                       action_name = "tear down";
-                       break;
-               default:
-                       T_ASSERT_FAIL("Somebody passed the wrong argument to krwlck_action: %d", action);
-                       break;
-       }
-
-       ret = sysctlbyname(krwlck_ctl, NULL, NULL, &action, sizeof(int));
-       T_ASSERT_POSIX_SUCCESS(ret, "sysctl: %s kernel rwlock", action_name);
-       return 1;
-}
-
-static void
-sysctl_krwlck_test_match(uint64_t context)
-{
-       int ret = 0;
-       unsigned long long unslid_krwlck_address = 0;
-       size_t addrsize = sizeof(unslid_krwlck_address);
-
-       ret = sysctlbyname(krwlck_ctl, &unslid_krwlck_address, &addrsize, NULL, 0);
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "Getting unslid location of kernel rwlock");
-       T_EXPECT_EQ(context, unslid_krwlck_address, "Context should match unslid location of rwlock in kernel memory");
-}
-
-/* "Grabbing" threads: only purpose is to grab a sync primitive and hang. */
-
-static void *
-kmutex_grabbing_thread(void * arg)
-{
-       (void)arg;
-       kmutex_action(KMUTEX_SYSCTL_ACQUIRE_NOWAIT);
-       return NULL;
-}
-
-static void *
-kmutex_grab_and_wait_thread(void * arg)
-{
-       (void)arg;
-       kmutex_action(KMUTEX_SYSCTL_ACQUIRE_WAIT);
-       return NULL;
-}
-
-static void *
-sem_grabbing_thread(void * arg)
-{
-       semaphore_t *sem = (semaphore_t *)arg;
-       semaphore_wait(*sem);
-       return NULL;
-}
-
-static void *
-msg_blocking_thread(void * arg)
-{
-       (void)arg;
-       msg_recv_helper(send);
-
-       for (int i = 0; i < SENDS_TO_BLOCK; i++)
-               msg_send_helper(recv); // will block on send until message is received
-       return NULL;
-}
-
-static void *
-ulock_blocking_thread(void * arg)
-{
-       os_unfair_lock_t oul = (os_unfair_lock_t)arg;
-       os_unfair_lock_lock(oul);
-       os_unfair_lock_unlock(oul);
-       return NULL;
-}
-
-// acquires a kernel rwlock for writing, and then waits on a kernel semaphore.
-static void *
-krwlck_write_waiting_thread(void * arg)
-{
-       (void)arg;
-       krwlck_action(KRWLCK_SYSCTL_WACQUIRE_WAIT);
-       return NULL;
-}
-
-// attempts to acquire a kernel rwlock for reading, and doesn't wait on a semaphore afterwards.
-static void *
-krwlck_read_grabbing_thread(void * arg)
-{
-       (void)arg;
-       krwlck_action(KRWLCK_SYSCTL_RACQUIRE_NOWAIT);
-       return NULL;
-}
-
-static void *
-pthread_mutex_blocking_thread(void * arg)
-{
-       pthread_mutex_t *mtx = (pthread_mutex_t *)arg;
-       pthread_mutex_lock(mtx);
-       pthread_mutex_unlock(mtx);
-       return NULL;
-}
-
-static void *
-pthread_rwlck_blocking_thread(void * arg)
-{
-       pthread_rwlock_t *rwlck = (pthread_rwlock_t *)arg;
-       pthread_rwlock_rdlock(rwlck);
-       pthread_rwlock_unlock(rwlck);
-       return NULL;
-}
-
-static void *
-pthread_cond_blocking_thread(void * arg)
-{
-	pthread_mutex_t mtx  = PTHREAD_MUTEX_INITIALIZER;
-	pthread_cond_t *cond = (pthread_cond_t *)arg;
-	pthread_mutex_lock(&mtx); // pthread_cond_wait requires the caller to hold the mutex
-	pthread_cond_wait(cond, &mtx);
-	pthread_mutex_unlock(&mtx);
-       return NULL;
-}
-
-/*
- * Uses a debug sysctl to initialize a kernel mutex.
- *
- * The 'waiting' thread grabs this kernel mutex, and immediately waits on a kernel semaphore.
- * The 'grabbing' thread just attempts to lock the kernel mutex.
- * When the semaphore is signalled, the 'waiting' thread will unlock the kernel mutex,
- * giving the opportunity for the 'grabbing' thread to lock it and then immediately unlock it.
- * This allows us to create a situation in the kernel where we know a thread to be blocked
- * on a kernel mutex.
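- *
- * Rough timeline (illustrative):
- *   waiting thread:  lock kmutex -> wait on ksemaphore ........... signalled -> unlock kmutex
- *   grabbing thread:              try to lock kmutex (blocks) ............... lock -> unlock
- *   main thread:     spawn both -> sleep -> take stackshot -> KMUTEX_SYSCTL_SIGNAL -> join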
- */
-static void
-test_kmutex_blocking(void)
-{
-       int ret = 0;
-       int len = 2;
-       struct stackshot_thread_waitinfo waitinfo[2] = { { 0 }, { 0 } };
-       uint64_t thread_id = 0;
-       pthread_t grabbing, waiting;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       ret = pthread_create(&waiting, NULL, kmutex_grab_and_wait_thread, NULL); // thread will block until we signal it
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Spawning grab and wait thread");
-       sleep(1); // give time for thread to block
-       ret = pthread_create(&grabbing, NULL, kmutex_grabbing_thread, NULL); // thread should immediately block
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Spawning waiting thread");
-       sleep(3); // give (lots of) time for thread to give up spinning on lock
-
-       void * stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-       ret = pthread_threadid_np(waiting, &thread_id); // this is the thread that currently holds the kernel mutex
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
-
-       check_python(stackshot, "thread \\d+: semaphore port \\w+ with unknown owner");
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-
-       T_EXPECT_EQ(len, 2, "There should only be two blocking threads");
-       for (int i = 0; i < len; i++) {
-               struct stackshot_thread_waitinfo *curr = &waitinfo[i];
-               if (curr->wait_type == kThreadWaitSemaphore)
-                       continue;
-               T_EXPECT_EQ(curr->wait_type, kThreadWaitKernelMutex, "Wait type should match expected KernelMutex value");
-               T_EXPECT_EQ(curr->owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
-               sysctl_kmutex_test_match(curr->context);
-
-               check_python(stackshot, "thread \\d+: kernel mutex %llx owned by thread %lld", curr->context, thread_id);
-       }
-
-       kmutex_action(KMUTEX_SYSCTL_SIGNAL); // waiting thread should now unblock.
-       ret = pthread_join(waiting, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on waiting thread");
-       ret = pthread_join(grabbing, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabber thread");
-       kmutex_action(KMUTEX_SYSCTL_TEARDOWN);
-       stackshot_config_dealloc(stackshot);
-}
-
-/* Initialize a userspace semaphore, and spawn a thread to block on it. */
-static void
-test_semaphore_blocking(void)
-{
-       int ret = 0;
-       semaphore_t sem;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       int len = 1;
-       uint64_t pid = 0;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       ret = semaphore_create(mach_task_self(), &sem, SYNC_POLICY_FIFO, 0);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Creating semaphore");
-       pthread_t tid;
-       ret = pthread_create(&tid, NULL, sem_grabbing_thread, (void*)&sem); // thread should immediately block
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating semaphore grabbing thread");
-
-       sleep(1); // give time for thread to block
-
-       void * stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitSemaphore, "Wait type should match expected Semaphore value");
-
-       pid = (uint64_t)getpid();
-       T_EXPECT_EQ(waitinfo.owner, pid, "Owner value should match process ID");
-
-       check_python(stackshot, "thread \\d+: semaphore port \\w+ owned by pid %d", (int)pid);
-
-       ret = semaphore_signal(sem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Signalling semaphore");
-       ret = pthread_join(tid, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabber thread");
-       ret = semaphore_destroy(mach_task_self(), sem);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Destroying semaphore");
-       stackshot_config_dealloc(stackshot);
-}
-
-/* Spawn a thread to send a message to, and block while both sending and receiving in different contexts. */
-static void
-test_mach_msg_blocking(void)
-{
-       int ret = 0;
-       pthread_t tid;
-       void *stackshot = NULL;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       int len = 1;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &send);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating send port");
-       ret = mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &recv);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Allocating recv port");
-       ret = mach_port_insert_right(mach_task_self(), send, send, MACH_MSG_TYPE_MAKE_SEND);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Getting send right to send port");
-       ret = mach_port_insert_right(mach_task_self(), recv, recv, MACH_MSG_TYPE_MAKE_SEND);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(ret, "Getting send right to recv port");
-
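-	/* The helper thread first blocks receiving on 'send'; after we ping that port it starts
-	 * sending on 'recv', whose queue is not drained until the end, so it blocks in the send
-	 * path too. That lets stackshots capture both the port-receive and port-send wait states. */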
-       ret = pthread_create(&tid, NULL, msg_blocking_thread, (void*)&send); // thread should block on recv soon
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating message blocking thread");
-
-       sleep(1); // give time for thread to block
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPortReceive, "Wait type should match expected PortReceive value");
-
-       check_python(stackshot, "thread \\d+: mach_msg receive on port \\w+ name %llx", (long long)send);
-
-       stackshot_config_dealloc(stackshot);
-
-       msg_send_helper(send); // ping! msg_blocking_thread will now try to send us stuff, and block until we receive.
-
-       sleep(1); // give time for thread to block
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPortSend, "Wait type should match expected PortSend value");
-
-       check_python(stackshot, "thread \\d+: mach_msg send on port \\w+ owned by pid %d", (int)getpid());
-
-       stackshot_config_dealloc(stackshot);
-
-       msg_recv_helper(recv); // thread should block until we receive one of its messages
-       ret = pthread_join(tid, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
-}
-
-static void
-test_ulock_blocking(void)
-{
-       int ret = 0;
-       void *stackshot = NULL;
-       uint64_t thread_id = 0;
-       pthread_t tid;
-       struct os_unfair_lock_s ouls = OS_UNFAIR_LOCK_INIT;
-       os_unfair_lock_t oul = &ouls;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       int len = 1;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       os_unfair_lock_lock(oul);
-       ret = pthread_create(&tid, NULL, ulock_blocking_thread, (void*)oul);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating ulock blocking thread");
-       sleep(3); // give time for thread to spawn, fall back to kernel for contention, and block
-
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitUserLock, "Wait type should match expected UserLock value");
-
-       os_unfair_lock_unlock(oul);
-       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
-
-       ret = pthread_threadid_np(NULL, &thread_id); // this thread is the "owner" of the ulock
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
-       T_EXPECT_EQ(waitinfo.owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
-
-       check_python(stackshot, "thread \\d+: unfair lock \\w+ owned by thread %lld", thread_id);
-       stackshot_config_dealloc(stackshot);
-       return;
-}
-
-static void
-test_krwlock_blocking(void)
-{
-       int ret = 0;
-       void *stackshot = NULL;
-       uint64_t thread_id = 0;
-       pthread_t waiting, grabbing;
-       int len = 2;
-       struct stackshot_thread_waitinfo waitinfo[2] = { { 0 }, { 0 } };
-
-       T_LOG("Starting %s", __FUNCTION__);
-       // this thread should spawn, acquire a kernel rwlock for write, and then wait on a semaphore
-       ret = pthread_create(&waiting, NULL, krwlck_write_waiting_thread, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating krwlck write waiting thread");
-       sleep(1); // give time for thread to block
-       // this thread should spawn and try to acquire the same kernel rwlock for read, but block
-       ret = pthread_create(&grabbing, NULL, krwlck_read_grabbing_thread, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating krwlck read grabbing thread");
-       sleep(1); // give time for thread to block
-
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-       check_python(stackshot, "thread \\d+: semaphore port \\w+ with unknown owner");
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-
-       T_EXPECT_EQ(len, 2, "There should only be two blocking threads");
-       for (int i = 0; i < len; i++) {
-               struct stackshot_thread_waitinfo *curr = &waitinfo[i];
-               if (curr->wait_type == kThreadWaitSemaphore)
-                       continue;
-               T_EXPECT_EQ(curr->wait_type, kThreadWaitKernelRWLockRead, "Wait type should match expected KRWLockRead value");
-               sysctl_krwlck_test_match(curr->context);
-
-               check_python(stackshot, "thread \\d+: krwlock %llx for reading", curr->context);
-
-#if KRWLCK_STORES_EXCL_OWNER /* A future planned enhancement */
-		ret = pthread_threadid_np(waiting, &thread_id); // this is the thread that currently holds the kernel rwlock for writing
-               T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
-               T_EXPECT_EQ(curr->owner, thread_id, "Thread ID of blocking thread should match 'owner' field in stackshot");
-#else
-               (void)thread_id; // suppress compiler warning about unused variable
-#endif /* KRWLCK_STORES_EXCL_OWNER */
-       }
-
-       krwlck_action(KRWLCK_SYSCTL_SIGNAL); // pthread should now unblock & finish
-       ret = pthread_join(waiting, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on waiting thread");
-       ret = pthread_join(grabbing, NULL);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on grabbing thread");
-       krwlck_action(KRWLCK_SYSCTL_TEARDOWN);
-       stackshot_config_dealloc(stackshot);
-}
-
-
-static void
-test_pthread_mutex_blocking(void)
-{
-       int ret = 0;
-       void *stackshot = NULL;
-       uint64_t thread_id = 0;
-       pthread_t tid;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
-       int len = 1;
-
-       T_LOG("Starting %s", __FUNCTION__);
-
-       ret = pthread_threadid_np(NULL, &thread_id); // this thread is the "owner" of the mutex
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Getting integer value of thread id");
-
-       pthread_mutex_lock(&mtx);
-       ret = pthread_create(&tid, NULL, pthread_mutex_blocking_thread, (void*)&mtx);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread mutex blocking thread");
-       sleep(2); // give time for thread to block
-
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-	check_python(stackshot, "thread \\d+: pthread mutex %llx owned by thread %lld", (long long)&mtx, thread_id);
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadMutex,
-                       "Wait type should match expected PThreadMutex value");
-       stackshot_config_dealloc(stackshot);
-
-       pthread_mutex_unlock(&mtx);
-	ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
-	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Joining on blocking thread");
-
-       T_EXPECT_EQ(waitinfo.owner, thread_id,
-                       "Thread ID of blocking thread should match 'owner' field in stackshot");
-       T_EXPECT_EQ(waitinfo.context, (uint64_t)&mtx,
-                       "Userspace address of mutex should match 'context' field in stackshot");
-}
-
-static void
-test_pthread_rwlck_blocking(void)
-{
-       int ret = 0;
-       void *stackshot = NULL;
-       pthread_t tid;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       pthread_rwlock_t rwlck = PTHREAD_RWLOCK_INITIALIZER;
-       int len = 1;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       pthread_rwlock_wrlock(&rwlck);
-       ret = pthread_create(&tid, NULL, pthread_rwlck_blocking_thread, (void*)&rwlck);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread rwlck blocking thread");
-       sleep(2);
-
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-       check_python(stackshot, "thread \\d+: pthread rwlock %llx for reading", (long long)&rwlck);
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadRWLockRead,
-                       "Wait type should match expected PThreadRWLockRead value");
-       stackshot_config_dealloc(stackshot);
-
-       pthread_rwlock_unlock(&rwlck);
-       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
-       T_EXPECT_EQ(waitinfo.context, (uint64_t)&rwlck,
-                       "Userspace address of rwlck should match 'context' field in stackshot");
-}
-
-static void
-test_pthread_cond_blocking(void)
-{
-       int ret = 0;
-       void *stackshot = NULL;
-       pthread_t tid;
-       pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
-       struct stackshot_thread_waitinfo waitinfo = { 0 };
-       int len = 1;
-
-       T_LOG("Starting %s", __FUNCTION__);
-       ret = pthread_create(&tid, NULL, pthread_cond_blocking_thread, (void*)&cond);
-       T_QUIET; T_ASSERT_POSIX_ZERO(ret, "Creating pthread condvar blocking thread");
-       sleep(2);
-
-       stackshot = take_stackshot(STACKSHOT_THREAD_WAITINFO, 0);
-
-       check_python(stackshot, "thread \\d+: pthread condvar %llx", (long long)&cond);
-
-       find_blocking_info(stackshot, (struct stackshot_thread_waitinfo *)&waitinfo, &len);
-       T_EXPECT_EQ(len, 1, "Only one blocking thread should exist");
-       T_EXPECT_EQ(waitinfo.wait_type, kThreadWaitPThreadCondVar,
-                       "Wait type should match expected PThreadCondVar value");
-       stackshot_config_dealloc(stackshot);
-
-       pthread_cond_signal(&cond);
-       ret = pthread_join(tid, NULL); // wait for thread to unblock and exit
-       T_EXPECT_EQ(waitinfo.context, (uint64_t)&cond,
-                       "Userspace address of condvar should match 'context' field in stackshot");
-       pthread_cond_destroy(&cond);
-}
-
-/*
- *
- * Test declarations
- *
- */
-
-T_DECL(stackshot_block_owner_klocks, "tests stackshot block owner for kernel locks") {
-       /* check to see if kmutex sysctl exists before running kmutex test */
-       if (kmutex_action(KMUTEX_SYSCTL_CHECK_EXISTS))
-               test_kmutex_blocking();
-       /* check to see if krwlck sysctl exists before running krwlck test */
-       if (krwlck_action(KRWLCK_SYSCTL_CHECK_EXISTS))
-               test_krwlock_blocking();
-       test_ulock_blocking();
-}
-
-T_DECL(stackshot_block_owner_pthread_mutex, "tests stackshot block owner: pthread mutex") {
-       test_pthread_mutex_blocking();
-}
-
-T_DECL(stackshot_block_owner_pthread_rwlck, "tests stackshot block owner: pthread rw locks") {
-       test_pthread_rwlck_blocking();
-}
-
-T_DECL(stackshot_block_owner_pthread_condvar, "tests stackshot block owner: pthread condvar") {
-       test_pthread_cond_blocking();
-}
-
-T_DECL(stackshot_block_owner_semaphore, "tests stackshot block owner: semaphore") {
-       test_semaphore_blocking();
-}
-
-T_DECL(stackshot_block_owner_mach_msg, "tests stackshot block owner: mach messaging") {
-       test_mach_msg_blocking();
-}
diff --git a/tools/tests/darwintests/stackshot_idle_25570396.m b/tools/tests/darwintests/stackshot_idle_25570396.m
deleted file mode 100644 (file)
index 87ec2d0..0000000
+++ /dev/null
@@ -1,264 +0,0 @@
-/* This program tests that kThreadIdleWorker is being set properly, so
- * that idle and active threads can be appropriately identified.
- */
-
-#include <darwintest.h>
-#include <dispatch/dispatch.h>
-#include <kdd.h>
-#include <kern/kcdata.h>
-#include <kern/debug.h>
-#include <mach/mach_init.h>
-#include <mach/mach_traps.h>
-#include <mach/semaphore.h>
-#include <mach/task.h>
-#include <pthread.h>
-#include <sys/stackshot.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <Foundation/Foundation.h>
-
-#define NUMRETRIES  5  // number of times to retry a stackshot
-#define NUMENQUEUES 16 // number of blocking jobs to enqueue
-#define NUMTHREADS  (NUMENQUEUES + 2) // total number of threads (NUMENQUEUES workers + main thread + spinning pthread)
-
-volatile static int spin_threads = 1;
-
-static void *
-take_stackshot(uint32_t extra_flags, uint64_t since_timestamp)
-{
-       void * stackshot;
-       int ret, retries;
-       uint32_t stackshot_flags = STACKSHOT_SAVE_LOADINFO |
-                                       STACKSHOT_GET_GLOBAL_MEM_STATS |
-                                       STACKSHOT_SAVE_IMP_DONATION_PIDS |
-                                       STACKSHOT_KCDATA_FORMAT;
-
-       if (since_timestamp != 0)
-               stackshot_flags |= STACKSHOT_COLLECT_DELTA_SNAPSHOT;
-
-       stackshot_flags |= extra_flags;
-
-       stackshot = stackshot_config_create();
-       T_ASSERT_NOTNULL(stackshot, "Allocating stackshot config");
-
-       ret = stackshot_config_set_flags(stackshot, stackshot_flags);
-       T_ASSERT_POSIX_ZERO(ret, "Setting flags on stackshot config");
-
-       ret = stackshot_config_set_pid(stackshot, getpid());
-       T_ASSERT_POSIX_ZERO(ret, "Setting target pid on stackshot config");
-
-       if (since_timestamp != 0) {
-               ret = stackshot_config_set_delta_timestamp(stackshot, since_timestamp);
-               T_ASSERT_POSIX_ZERO(ret, "Setting prev snapshot time on stackshot config");
-       }
-
-	for (retries = NUMRETRIES; retries > 0; retries--) {
-		ret = stackshot_capture_with_config(stackshot);
-		T_ASSERT_TRUE(ret == 0 || ret == EBUSY || ret == ETIMEDOUT, "Attempting to take stackshot (error %d)...", ret);
-		if (ret == 0)
-			break;
-		if (retries == 1 && (ret == EBUSY || ret == ETIMEDOUT))
-			T_ASSERT_FAIL("Failed to take stackshot after %d retries: %s", NUMRETRIES, strerror(ret));
-	}
-       return stackshot;
-}
-
-static uint64_t get_stackshot_timestamp(void * stackshot)
-{
-       kcdata_iter_t iter;
-       void * buf;
-       uint64_t default_time = 0;
-       uint32_t t, buflen;
-
-       buf = stackshot_config_get_stackshot_buffer(stackshot);
-       T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
-       buflen = stackshot_config_get_stackshot_size(stackshot);
-
-       iter = kcdata_iter(buf, buflen);
-       t    = kcdata_iter_type(iter);
-
-       T_ASSERT_TRUE(t == KCDATA_BUFFER_BEGIN_STACKSHOT || t == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
-               "Making sure stackshot data begins with \"begin\" flag");
-       T_ASSERT_TRUE(kcdata_iter_valid(iter = kcdata_iter_find_type(iter, KCDATA_TYPE_MACH_ABSOLUTE_TIME)),
-               "Getting stackshot timestamp");
-       default_time = *(uint64_t *)kcdata_iter_payload(iter);
-       return default_time;
-}
-
-static void
-get_thread_statuses(void * stackshot, int * num_idles, int * num_nonidles)
-{
-       void *buf;
-       uint32_t t, buflen;
-       uint64_t thread_snap_flags;
-       NSError *error = nil;
-       NSMutableDictionary *parsed_container, *parsed_threads;
-
-       *num_idles = 0;
-       *num_nonidles = 0;
-
-       buf = stackshot_config_get_stackshot_buffer(stackshot);
-       T_ASSERT_NOTNULL(buf, "Getting stackshot buffer");
-       buflen = stackshot_config_get_stackshot_size(stackshot);
-
-       kcdata_iter_t iter = kcdata_iter(buf, buflen);
-       T_ASSERT_TRUE(kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_STACKSHOT ||
-                       kcdata_iter_type(iter) == KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT,
-                       "Checking start of stackshot buffer");
-
-       iter = kcdata_iter_next(iter);
-       KCDATA_ITER_FOREACH(iter)
-       {
-               t = kcdata_iter_type(iter);
-
-               if (t != KCDATA_TYPE_CONTAINER_BEGIN) {
-                       continue;
-               }
-
-               if (kcdata_iter_container_type(iter) != STACKSHOT_KCCONTAINER_TASK) {
-                       continue;
-               }
-
-               parsed_container = parseKCDataContainer(&iter, &error);
-               T_ASSERT_TRUE(parsed_container && !error, "Parsing container");
-
-               parsed_threads = parsed_container[@"task_snapshots"][@"thread_snapshots"];
-               for (id th_key in parsed_threads) {
-                       /* check to see that tid matches expected idle status */
-                       thread_snap_flags = [parsed_threads[th_key][@"thread_snapshot"][@"ths_ss_flags"] unsignedLongLongValue];
-                       (thread_snap_flags & kThreadIdleWorker) ? (*num_idles)++ : (*num_nonidles)++;
-               }
-               [parsed_container release];
-       }
-
-}
-
-/* Dispatch NUMENQUEUES jobs to a concurrent queue that immediately wait on a
- * shared semaphore. This should spin up plenty of threads! */
-static void
-warm_up_threadpool(dispatch_queue_t q)
-{
-       int i;
-       dispatch_semaphore_t thread_wait = dispatch_semaphore_create(0);
-       T_QUIET; T_ASSERT_NOTNULL(thread_wait, "Initializing work queue semaphore");
-       dispatch_semaphore_t main_wait = dispatch_semaphore_create(0);
-       T_QUIET; T_ASSERT_NOTNULL(main_wait, "Initializing main thread semaphore");
-
-       for (i = 0; i < NUMENQUEUES; i++) {
-               dispatch_async(q, ^{
-                       dispatch_semaphore_wait(thread_wait, DISPATCH_TIME_FOREVER);
-                       dispatch_semaphore_signal(main_wait);
-               });
-       }
-
-       sleep(1); // give worker threads enough time to block
-
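-	/* Release the workers one at a time; waiting on main_wait after each signal ensures that
-	 * every enqueued block has actually run before the pool is considered warmed up. */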
-       for (i = 0; i < NUMENQUEUES; i++) {
-               dispatch_semaphore_signal(thread_wait);
-               dispatch_semaphore_wait(main_wait, DISPATCH_TIME_FOREVER);
-       }
-
-       dispatch_release(thread_wait);
-       dispatch_release(main_wait);
-
-       // Give enough time for worker threads to go idle again
-       sleep(1);
-}
-
-/* Dispatch NUMENQUEUES jobs to a concurrent queue, each of which spins in a tight loop.
- * This isn't guaranteed to occupy every worker thread, but it's enough to ensure that
- * at least one thread goes from idle to non-idle.
- */
-static void
-fill_threadpool_with_spinning(dispatch_queue_t q)
-{
-       int i;
-       for (i = 0; i < NUMENQUEUES; i++) {
-               dispatch_async(q, ^{
-                       while(spin_threads); // should now appear as non-idle in delta shot
-               });
-       }
-       sleep(1); // wait for jobs to enqueue
-}
-
-/* Take stackshot, count the number of idle and nonidle threads the stackshot records.
- * Where this is called, there should be NUMENQUEUES idle threads (thanks to warm_up_threadpool)
- * and 2 nonidle threads (the main thread, and the spinning pthread).
- */
-static void
-take_and_verify_initial_stackshot(uint64_t * since_time)
-{
-       void *stackshot;
-       int num_init_idle_threads, num_init_nonidle_threads;
-
-       stackshot = take_stackshot(0, 0);
-       *since_time = get_stackshot_timestamp(stackshot);
-       get_thread_statuses(stackshot, &num_init_idle_threads, &num_init_nonidle_threads);
-
-       T_EXPECT_EQ(num_init_idle_threads, NUMENQUEUES,
-                       "Idle count of %d should match expected value of %d...",
-                       num_init_idle_threads, NUMENQUEUES);
-       T_EXPECT_EQ(num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES,
-                       "Non-idle count of %d should match expected value of %d...",
-                       num_init_nonidle_threads, NUMTHREADS - NUMENQUEUES);
-       stackshot_config_dealloc(stackshot);
-}
-
-/* Take a stackshot and a delta stackshot, measuring what changed since the previous
- * stackshot. Where this is called, the blocking jobs have been cleared from the work queue,
- * and the work queue has NUMENQUEUES tight-spinning jobs on it. Make sure that
- * no new idle threads appear in the delta, and make sure that the delta shot isn't
- * ignoring the worker threads that have become active.
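- * (Every thread is either still idle, and so is counted in the full stackshot's idle total, or
- * has become active, and so must show up as non-idle in the delta; hence the two counts should
- * sum to NUMTHREADS.)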
- */
-static void
-take_and_verify_delta_stackshot(uint64_t since_time)
-{
-       void *stackshot;
-       void *delta_stackshot;
-
-       int num_delta_idles, num_delta_nonidles, num_curr_idles, num_curr_nonidles;
-
-       stackshot = take_stackshot(0, 0);
-       delta_stackshot = take_stackshot(0, since_time); /* Threads should appear in delta stackshot as non-idle */
-
-       get_thread_statuses(stackshot, &num_curr_idles, &num_curr_nonidles);
-       get_thread_statuses(delta_stackshot, &num_delta_idles, &num_delta_nonidles);
-
-       T_EXPECT_EQ(num_delta_idles, 0, "Making sure there are no idles in delta shot");
-       T_EXPECT_EQ(num_delta_nonidles + num_curr_idles, NUMTHREADS,
-                       "Making sure delta shot isn't ignoring newly active threads");
-       stackshot_config_dealloc(stackshot);
-       stackshot_config_dealloc(delta_stackshot);
-}
-
-static void *
-spinning_non_work_queue_thread(void * ignored)
-{
-       (void)ignored;
-       while(spin_threads);
-       return NULL;
-}
-
-T_DECL(stackshot_idle_25570396, "Tests that stackshot can properly recognize idle and non-idle threads", T_META_ASROOT(true))
-{
-       int ret;
-       uint64_t initial_stackshot_time;
-       pthread_t spinning_thread;
-       dispatch_queue_t q;
-
-       ret = pthread_create(&spinning_thread, NULL, spinning_non_work_queue_thread, NULL);
-       T_ASSERT_POSIX_ZERO(ret, "Spinning up non-work-queue thread");
-
-       q = dispatch_queue_create("com.apple.kernel.test.waiting_semaphores", DISPATCH_QUEUE_CONCURRENT);
-
-       warm_up_threadpool(q);
-       take_and_verify_initial_stackshot(&initial_stackshot_time);
-
-       fill_threadpool_with_spinning(q);
-       take_and_verify_delta_stackshot(initial_stackshot_time);
-
-       spin_threads = 0; /* pthread-made thread should now exit */
-       ret = pthread_join(spinning_thread, NULL);
-       T_ASSERT_POSIX_ZERO(ret, "Joining on non-work-queue thread");
-}
diff --git a/tools/tests/darwintests/suspended_spawn_26184412.c b/tools/tests/darwintests/suspended_spawn_26184412.c
deleted file mode 100644 (file)
index 977e96d..0000000
+++ /dev/null
@@ -1,101 +0,0 @@
-
-
-#include <darwintest.h>
-
-#include <stdio.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <assert.h>
-#include <signal.h>
-#include <spawn.h>
-#include <stdint.h>
-#include <sys/sysctl.h>
-#include <stdbool.h>
-#include <sysexits.h>
-#include <err.h>
-
-/*
- * Test to validate that suspended-spawn DTRTs when a SIGKILL is received
- * while the process is waiting for SIGCONT.
- *
- * Also test that suspended-spawn correctly looks like a SIGSTOP while it's suspended.
- *
- * <rdar://problem/26184412> posix_spawn non-exec with POSIX_SPAWN_START_SUSPENDED, then killing instead of SIGCONT-ing causes unkillable hung processes
- */
-
-static void
-spawn_and_signal(int signal)
-{
-       /* do not buffer output to stdout */
-       setvbuf(stdout, NULL, _IONBF, 0);
-
-       int ret;
-       posix_spawnattr_t attr;
-
-       ret = posix_spawnattr_init(&attr);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_init");
-
-       ret = posix_spawnattr_setflags(&attr, POSIX_SPAWN_START_SUSPENDED);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_setflags");
-
-       char * const    prog = "/usr/bin/true";
-       char * const    argv_child[] = { prog, NULL };
-       pid_t           child_pid;
-       extern char   **environ;
-
-       ret = posix_spawn(&child_pid, prog, NULL, &attr, argv_child, environ);
-       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawn");
-
-       printf("parent: spawned child with pid %d\n", child_pid);
-
-       ret = posix_spawnattr_destroy(&attr);
-       T_QUIET;
-       T_ASSERT_POSIX_SUCCESS(ret, "posix_spawnattr_destroy");
-
-       int status = 0;
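-	/* WUNTRACED reports the child while it is stopped; WNOHANG makes waitpid return immediately
-	 * (with 0) if the suspended-spawned child is not actually stopped yet, which the assertions
-	 * below would catch. */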
-       int waitpid_result = waitpid(child_pid, &status, WUNTRACED|WNOHANG);
-       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
-
-       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
-
-       T_ASSERT_EQ(WIFEXITED(status), 0, "before SIGCONT: must not have exited");
-       T_ASSERT_EQ(WIFSTOPPED(status), 1, "before SIGCONT: must be stopped");
-
-	printf("parent: sending signal %d to child process\n", signal);
-
-       ret = kill(child_pid, signal);
-       T_ASSERT_POSIX_SUCCESS(ret, "kill(signal)");
-
-       printf("parent: waiting for child process\n");
-
-       status = 0;
-       waitpid_result = waitpid(child_pid, &status, 0);
-       T_ASSERT_POSIX_SUCCESS(waitpid_result, "waitpid");
-
-       T_ASSERT_EQ(waitpid_result, child_pid, "waitpid should return child we spawned");
-
-       if (signal == SIGKILL) {
-               T_ASSERT_EQ(WIFSIGNALED(status), 1, "child should have exited due to signal");
-               T_ASSERT_EQ(WTERMSIG(status), SIGKILL, "child should have exited due to SIGKILL");
-       } else {
-               T_ASSERT_EQ(WIFEXITED(status), 1, "child should have exited normally");
-               T_ASSERT_EQ(WEXITSTATUS(status), EX_OK, "child should have exited with success");
-       }
-
-	printf("wait returned with pid %d, status %d\n", waitpid_result, status);
-}
-
-T_DECL(suspended_spawn_continue, "Tests spawning a suspended process and continuing it", T_META_TIMEOUT(2))
-{
-       spawn_and_signal(SIGCONT);
-}
-
-T_DECL(suspended_spawn_kill, "Tests spawning a suspended process and killing it", T_META_TIMEOUT(2))
-{
-       spawn_and_signal(SIGKILL);
-}
-
diff --git a/tools/tests/darwintests/task_for_pid_entitlement.plist b/tools/tests/darwintests/task_for_pid_entitlement.plist
deleted file mode 100644 (file)
index 2398d67..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-        <key>com.apple.system-task-ports</key>
-        <true/>
-        <key>task_for_pid-allow</key>
-        <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/task_info.c b/tools/tests/darwintests/task_info.c
deleted file mode 100644 (file)
index cb77c30..0000000
+++ /dev/null
@@ -1,1134 +0,0 @@
-#include <darwintest.h>
-#include <darwintest_utils.h>
-#include <errno.h>
-#include <mach/mach.h>
-#include <mach/mach_error.h>
-#include <mach/policy.h>
-#include <mach/task_info.h>
-#include <mach/thread_info.h>
-#include <signal.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <sys/sysctl.h>
-#include <unistd.h>
-
-/* *************************************************************************************
- * Test the task_info API.
- *
- * This is a functional test of the following APIs:
- * TASK_BASIC_INFO_32
- * TASK_BASIC2_INFO_32
- * TASK_BASIC_INFO_64
- * TASK_BASIC_INFO_64_2
- * TASK_POWER_INFO_V2
- * TASK_FLAGS_INFO
- * TASK_AFFINITY_TAG_INFO
- * TASK_THREAD_TIMES_INFO
- * TASK_ABSOLUTE_TIME_INFO
- * <rdar://problem/22242021> Add tests to increase code coverage for the task_info API
- * *************************************************************************************
- */
-#define TESTPHYSFOOTPRINTVAL 5
-#define CANARY 0x0f0f0f0f0f0f0f0fULL
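-/* TESTPHYSFOOTPRINTVAL and CANARY are sentinel values written into the task_vm_info fields
- * below so the REV0/REV1/REV2 checks can tell which fields each revision of TASK_VM_INFO
- * actually fills in. */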
-#if !defined(CONFIG_EMBEDDED)
-#define ABSOLUTE_MIN_USER_TIME_DIFF 150
-#define ABSOLUTE_MIN_SYSTEM_TIME_DIFF 300
-#endif
-
-enum info_kind { INFO_32, INFO_64, INFO_32_2, INFO_64_2, INFO_MACH, INFO_MAX };
-
-enum info_get { GET_SUSPEND_COUNT, GET_RESIDENT_SIZE, GET_VIRTUAL_SIZE, GET_USER_TIME, GET_SYS_TIME, GET_POLICY, GET_MAX_RES };
-
-/*
- * This function uses CPU cycles by doing a factorial computation.
- */
-static void do_factorial_task(void);
-
-void test_task_basic_info_32(void);
-void test_task_basic_info_64(void);
-void task_basic_info_32_debug(void);
-void task_basic2_info_32_warmup(void);
-static int is_development_kernel(void);
-void test_task_basic_info(enum info_kind kind);
-uint64_t info_get(enum info_kind kind, enum info_get get, void * data);
-
-T_DECL(task_vm_info, "tests task vm info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       kern_return_t err;
-       task_vm_info_data_t vm_info;
-
-       mach_msg_type_number_t count = TASK_VM_INFO_COUNT;
-
-       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info return value !=0 for virtual_size\n");
-
-       T_EXPECT_NE(vm_info.phys_footprint, 0ULL, "task_info return value !=0 for phys_footprint\n");
-
-       /*
-        * Test the REV0 version of TASK_VM_INFO. It should not change the value of phys_footprint.
-        */
-
-       count                  = TASK_VM_INFO_REV0_COUNT;
-       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
-       vm_info.min_address    = CANARY;
-       vm_info.max_address    = CANARY;
-
-       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       T_EXPECT_EQ(count, TASK_VM_INFO_REV0_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV0_COUNT", count);
-
-       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev0 call does not return 0 for virtual_size");
-
-       T_EXPECT_EQ(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
-                   "task_info --rev0 call returned value %llu for vm_info.phys_footprint.  Expected %u since this value should not be "
-                   "modified by rev0",
-                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
-
-       T_EXPECT_EQ(vm_info.min_address, CANARY,
-                   "task_info --rev0 call returned value 0x%llx for vm_info.min_address. Expected 0x%llx since this value should not "
-                   "be modified by rev0",
-                   vm_info.min_address, CANARY);
-
-       T_EXPECT_EQ(vm_info.max_address, CANARY,
-                   "task_info --rev0 call returned value 0x%llx for vm_info.max_address. Expected 0x%llx since this value should not "
-                   "be modified by rev0",
-                   vm_info.max_address, CANARY);
-
-       /*
-        * Test the REV1 version of TASK_VM_INFO.
-        */
-
-       count                  = TASK_VM_INFO_REV1_COUNT;
-       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
-       vm_info.min_address    = CANARY;
-       vm_info.max_address    = CANARY;
-
-       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       T_EXPECT_EQ(count, TASK_VM_INFO_REV1_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV1_COUNT", count);
-
-       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev1 call does not return 0 for virtual_size");
-
-       T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
-                   "task_info --rev1 call returned value %llu for vm_info.phys_footprint.  Expected value is anything other than %u "
-                   "since this value should not be modified by rev1",
-                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
-
-       T_EXPECT_EQ(vm_info.min_address, CANARY,
-                   "task_info --rev1 call returned value 0x%llx for vm_info.min_address. Expected 0x%llx since this value should not "
-                   "be modified by rev1",
-                   vm_info.min_address, CANARY);
-
-       T_EXPECT_EQ(vm_info.max_address, CANARY,
-                   "task_info --rev1 call returned value 0x%llx for vm_info.max_address. Expected 0x%llx since this value should not "
-                   "be modified by rev1",
-                   vm_info.max_address, CANARY);
-
-       /*
-        * Test the REV2 version of TASK_VM_INFO.
-        */
-
-       count                  = TASK_VM_INFO_REV2_COUNT;
-       vm_info.phys_footprint = TESTPHYSFOOTPRINTVAL;
-       vm_info.min_address    = CANARY;
-       vm_info.max_address    = CANARY;
-
-       err = task_info(mach_task_self(), TASK_VM_INFO_PURGEABLE, (task_info_t)&vm_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       T_EXPECT_EQ(count, TASK_VM_INFO_REV2_COUNT, "task_info count(%d) is equal to TASK_VM_INFO_REV2_COUNT\n", count);
-
-       T_EXPECT_NE(vm_info.virtual_size, 0ULL, "task_info --rev2 call does not return 0 for virtual_size\n");
-
-       T_EXPECT_NE(vm_info.phys_footprint, (unsigned long long)TESTPHYSFOOTPRINTVAL,
-                   "task_info --rev2 call returned value %llu for vm_info.phys_footprint.  Expected anything other than %u since this "
-                   "value should be modified by rev2",
-                   vm_info.phys_footprint, TESTPHYSFOOTPRINTVAL);
-
-       T_EXPECT_NE(vm_info.min_address, CANARY,
-                   "task_info --rev2 call returned value 0x%llx for vm_info.min_address. Expected anything other than 0x%llx since "
-                   "this value should be modified by rev2",
-                   vm_info.min_address, CANARY);
-
-       T_EXPECT_NE(vm_info.max_address, CANARY,
-                   "task_info --rev2 call returned value 0x%llx for vm_info.max_address. Expected anything other than 0x%llx since "
-                   "this value should be modified by rev2",
-                   vm_info.max_address, CANARY);
-}
-
-T_DECL(host_debug_info, "tests host debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       mach_port_t host;
-       host_debug_info_internal_data_t debug_info;
-       mach_msg_type_number_t count = HOST_DEBUG_INFO_INTERNAL_COUNT;
-       host                         = mach_host_self();
-       err                          = host_info(host, HOST_DEBUG_INFO_INTERNAL, (host_info_t)&debug_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify host_info call succeeded");
-}
-
-T_DECL(task_debug_info, "tests task debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       task_debug_info_internal_data_t debug_info;
-
-       mach_msg_type_number_t count = TASK_DEBUG_INFO_INTERNAL_COUNT;
-
-       err = task_info(mach_task_self(), TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-}
-
-T_DECL(thread_debug_info, "tests thread debug info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       thread_debug_info_internal_data_t debug_info;
-
-       mach_msg_type_number_t count = THREAD_DEBUG_INFO_INTERNAL_COUNT;
-
-       err = thread_info(mach_thread_self(), THREAD_DEBUG_INFO_INTERNAL, (thread_info_t)&debug_info, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-}
-
-static void
-do_factorial_task()
-{
-       int number    = 20;
-	volatile uint64_t factorial = 1; /* volatile keeps the busy-work loop from being optimized away; uint64_t holds 20! without overflow */
-       int i;
-       for (i = 1; i <= number; i++) {
-               factorial *= i;
-       }
-
-       return;
-}
-
-T_DECL(task_thread_times_info, "tests task thread times info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       task_thread_times_info_data_t thread_times_info_data;
-       task_thread_times_info_data_t thread_times_info_data_new;
-       mach_msg_type_number_t count = TASK_THREAD_TIMES_INFO_COUNT;
-
-       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       do_factorial_task();
-
-       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data_new, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       /*
-        * The difference is observed to be less than 30 microseconds for user_time
-        * and less than 50 microseconds for system_time. This observation was done for over
-        * 1000 runs.
-        */
-
-       T_EXPECT_FALSE((thread_times_info_data_new.user_time.seconds - thread_times_info_data.user_time.seconds) != 0 ||
-                          (thread_times_info_data_new.system_time.seconds - thread_times_info_data.system_time.seconds) != 0,
-                      "Tests whether the difference between thread times is greater than the allowed limit");
-
-       /*
-        * This is a negative case.
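-	 * Passing a count smaller than TASK_THREAD_TIMES_INFO_COUNT should make task_info fail
-	 * with KERN_INVALID_ARGUMENT instead of writing a truncated structure.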
-        */
-
-       count--;
-       err = task_info(mach_task_self(), TASK_THREAD_TIMES_INFO, (task_info_t)&thread_times_info_data, &count);
-       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
-}
-
-T_DECL(task_absolutetime_info, "tests task absolute time info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       uint64_t user_time_diff, system_time_diff;
-       task_absolutetime_info_data_t absolute_time_info_data;
-       task_absolutetime_info_data_t absolute_time_info_data_new;
-       mach_msg_type_number_t count = TASK_ABSOLUTETIME_INFO_COUNT;
-
-       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       do_factorial_task();
-
-       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data_new, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       user_time_diff   = absolute_time_info_data_new.total_user - absolute_time_info_data.total_user;
-       system_time_diff = absolute_time_info_data_new.total_system - absolute_time_info_data.total_system;
-
-#if !(defined(__arm__) || defined(__arm64__))
-       /*
-        * On embedded devices the difference is always zero.
-        * On non-embedded devices the difference occurs in this range. This was observed over ~10000 runs.
-        */
-
-       T_EXPECT_FALSE(user_time_diff < ABSOLUTE_MIN_USER_TIME_DIFF || system_time_diff < ABSOLUTE_MIN_SYSTEM_TIME_DIFF,
-                      "Tests whether the difference between thread times is greater than the expected range");
-#endif
-
-       if (absolute_time_info_data.threads_user <= 0) {
-               int precise_time_val = 0;
-               size_t len           = sizeof(size_t);
-
-               T_LOG("User threads time is zero. This should only happen rarely and when precise_user_time is off");
-
-               err = sysctlbyname("kern.precise_user_kernel_time", &precise_time_val, &len, NULL, 0);
-
-               T_EXPECT_POSIX_SUCCESS(err, "performing sysctl to check precise_user_time");
-
-               T_LOG("kern.precise_user_kernel_time val = %d", precise_time_val);
-
-               T_EXPECT_FALSE(precise_time_val, "user thread time should only be zero when precise_user_kernel_time is disabled");
-       } else {
-               T_PASS("task_info should return non-zero value for user threads time = %llu", absolute_time_info_data.threads_user);
-       }
-
-#if !(defined(__arm__) || defined(__arm64__))
-       /*
-        * On iOS, system threads are always zero. On OS X this value can be some large positive number.
-        * There is no real way to estimate the exact amount.
-        */
-       T_EXPECT_NE(absolute_time_info_data.threads_system, 0ULL,
-                   "task_info should return non-zero value for system threads time = %llu", absolute_time_info_data.threads_system);
-#endif
-
-       /*
-        * This is a negative case.
-        */
-       count--;
-       err = task_info(mach_task_self(), TASK_ABSOLUTETIME_INFO, (task_info_t)&absolute_time_info_data_new, &count);
-       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
-}
-
-T_DECL(task_affinity_tag_info, "tests task_affinity_tag_info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       task_affinity_tag_info_data_t affinity_tag_info_data;
-       mach_msg_type_number_t count = TASK_AFFINITY_TAG_INFO_COUNT;
-
-       err = task_info(mach_task_self(), TASK_AFFINITY_TAG_INFO, (task_info_t)&affinity_tag_info_data, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       /*
-        * The affinity is not set by default, hence expecting a zero value.
-        */
-       T_ASSERT_FALSE(affinity_tag_info_data.min != 0 || affinity_tag_info_data.max != 0,
-                      "task_info call returns non-zero min or max value");
-
-       /*
-       * This is a negative case.
-       */
-       count--;
-       err = task_info(mach_task_self(), TASK_AFFINITY_TAG_INFO, (task_info_t)&affinity_tag_info_data, &count);
-       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
-}
-
-T_DECL(task_flags_info, "tests task_flags_info", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       task_flags_info_data_t flags_info_data;
-       mach_msg_type_number_t count = TASK_FLAGS_INFO_COUNT;
-
-       err = task_info(mach_task_self(), TASK_FLAGS_INFO, (task_info_t)&flags_info_data, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       /* Change for 32-bit arch possibility?*/
-       T_ASSERT_EQ((flags_info_data.flags & (unsigned int)(~TF_LP64)), 0U, "task_info should only give out 64-bit addr flag");
-
-       /*
-        * This is a negative case.
-        */
-
-       count--;
-       err = task_info(mach_task_self(), TASK_FLAGS_INFO, (task_info_t)&flags_info_data, &count);
-       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API.");
-}
-
-T_DECL(task_power_info_v2, "tests task_power_info_v2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       kern_return_t err;
-       task_power_info_v2_data_t power_info_data_v2;
-       task_power_info_v2_data_t power_info_data_v2_new;
-       mach_msg_type_number_t count = TASK_POWER_INFO_V2_COUNT;
-
-       sleep(1);
-
-       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       T_ASSERT_LE(power_info_data_v2.gpu_energy.task_gpu_utilisation, 0ULL,
-                   "verified task_info call shows zero GPU utilization for non-GPU task");
-
-       do_factorial_task();
-
-       /*
-        * Verify the cpu_energy parameters.
-        */
-       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2_new, &count);
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-#if !(defined(__arm__) || defined(__arm64__))
-       /*
-        * iOS does not have system_time.
-        */
-       T_ASSERT_GT(power_info_data_v2_new.cpu_energy.total_user, power_info_data_v2.cpu_energy.total_user,
-                   "task_info call returns valid user time");
-       T_ASSERT_GT(power_info_data_v2_new.cpu_energy.total_system, power_info_data_v2.cpu_energy.total_system,
-                   "task_info call returns valid system time");
-#endif
-
-       T_ASSERT_GE(power_info_data_v2.cpu_energy.task_interrupt_wakeups, 1ULL,
-                   "verify task_info call returns non-zero value for interrupt_wakeup (ret value = %llu)",
-                   power_info_data_v2.cpu_energy.task_interrupt_wakeups);
-
-#if !(defined(__arm__) || defined(__arm64__))
-       if (power_info_data_v2.cpu_energy.task_platform_idle_wakeups != 0) {
-               T_LOG("task_info call returned %llu for platform_idle_wakeup", power_info_data_v2.cpu_energy.task_platform_idle_wakeups);
-       }
-#endif
-
-       count = TASK_POWER_INFO_V2_COUNT_OLD;
-       err   = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
-
-       T_ASSERT_MACH_SUCCESS(err, "verify task_info call succeeded");
-
-       /*
-        * This is a negative case.
-        */
-       count--;
-       err = task_info(mach_task_self(), TASK_POWER_INFO_V2, (task_info_t)&power_info_data_v2, &count);
-
-       T_ASSERT_MACH_ERROR(err, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API. Call "
-                           "returns errno %d:%s",
-                           err, mach_error_string(err));
-}
-
-T_DECL(test_task_basic_info_32, "tests TASK_BASIC_INFO_32", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       test_task_basic_info(INFO_32);
-}
-
-T_DECL(test_task_basic_info_32_2, "tests TASK_BASIC_INFO_32_2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       test_task_basic_info(INFO_32_2);
-}
-
-#if defined(__arm__) || defined(__arm64__)
-T_DECL(test_task_basic_info_64i_2, "tests TASK_BASIC_INFO_64_2", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       test_task_basic_info(INFO_64_2);
-}
-#else
-T_DECL(test_task_basic_info_64, "tests TASK_BASIC_INFO_64", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       test_task_basic_info(INFO_64);
-}
-#endif /* defined(__arm__) || defined(__arm64__) */
-
-T_DECL(test_mach_task_basic_info, "tests MACH_TASK_BASIC_INFO", T_META_ASROOT(true), T_META_LTEPHASE(LTE_POSTINIT))
-{
-       test_task_basic_info(INFO_MACH);
-}
-
-void
-test_task_basic_info(enum info_kind kind)
-{
-#define BEFORE 0
-#define AFTER 1
-
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       task_info_t info_data[2];
-       task_basic_info_32_data_t basic_info_32_data[2];
-#if defined(__arm__) || defined(__arm64__)
-       task_basic_info_64_2_data_t basic_info_64_2_data[2];
-#else
-       task_basic_info_64_data_t basic_info_64_data[2];
-#endif /* defined(__arm__) || defined(__arm64__) */
-       mach_task_basic_info_data_t mach_basic_info_data[2];
-
-       kern_return_t kr;
-       mach_msg_type_number_t count;
-       task_flavor_t flavor = 0;
-       integer_t suspend_count;
-       uint64_t resident_size_diff;
-       uint64_t virtual_size_diff;
-
-       void * tmp_map = NULL;
-       pid_t child_pid;
-       mach_port_name_t child_task;
-       /*for dt_waitpid*/
-       int timeout     = 10; // change to max timeout
-       int exit_status = 0;
-
-       switch (kind) {
-       case INFO_32:
-       case INFO_32_2:
-               info_data[BEFORE] = (task_info_t)&basic_info_32_data[BEFORE];
-               info_data[AFTER]  = (task_info_t)&basic_info_32_data[AFTER];
-               count             = TASK_BASIC_INFO_32_COUNT;
-               flavor            = TASK_BASIC_INFO_32;
-
-               if (kind == INFO_32_2) {
-                       flavor = TASK_BASIC2_INFO_32;
-               }
-
-               break;
-#if defined(__arm__) || defined(__arm64__)
-       case INFO_64:
-               T_ASSERT_FAIL("invalid basic info kind");
-               break;
-
-       case INFO_64_2:
-               info_data[BEFORE] = (task_info_t)&basic_info_64_2_data[BEFORE];
-               info_data[AFTER]  = (task_info_t)&basic_info_64_2_data[AFTER];
-               count             = TASK_BASIC_INFO_64_2_COUNT;
-               flavor            = TASK_BASIC_INFO_64_2;
-               break;
-
-#else
-       case INFO_64:
-               info_data[BEFORE] = (task_info_t)&basic_info_64_data[BEFORE];
-               info_data[AFTER]  = (task_info_t)&basic_info_64_data[AFTER];
-               count             = TASK_BASIC_INFO_64_COUNT;
-               flavor            = TASK_BASIC_INFO_64;
-               break;
-
-       case INFO_64_2:
-               T_ASSERT_FAIL("invalid basic info kind");
-               break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-       case INFO_MACH:
-               info_data[BEFORE] = (task_info_t)&mach_basic_info_data[BEFORE];
-               info_data[AFTER]  = (task_info_t)&mach_basic_info_data[AFTER];
-               count             = MACH_TASK_BASIC_INFO_COUNT;
-               flavor            = MACH_TASK_BASIC_INFO;
-               break;
-       case INFO_MAX:
-       default:
-               T_ASSERT_FAIL("invalid basic info kind");
-               break;
-       }
-
-       kr = task_info(mach_task_self(), flavor, info_data[BEFORE], &count);
-
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info succeeded");
-
-       do_factorial_task();
-
-       /*
-        * Allocate virtual and resident memory.
-        */
-       tmp_map = mmap(0, PAGE_SIZE, PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
-
-       T_WITH_ERRNO;
-       T_EXPECT_NE(tmp_map, MAP_FAILED, "verify mmap call is successful");
-
-       memset(tmp_map, 'm', PAGE_SIZE);
-
-       child_pid = fork();
-
-       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked");
-
-       if (child_pid == 0) {
-               /*
-                * This will suspend the child process.
-                */
-               kr = task_suspend(mach_task_self());
-               exit(kr);
-       }
-
-       /*
-        * Wait for the child process to suspend itself.
-        */
-       sleep(1);
-
-       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
-
-       /*
-        * Verify the suspend_count for child and resume it.
-        */
-
-       kr = task_info(child_task, flavor, info_data[AFTER], &count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       suspend_count = (integer_t)(info_get(kind, GET_SUSPEND_COUNT, info_data[AFTER]));
-       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct suspend_count");
-
-       kr = task_resume(child_task);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_resume succeeded");
-
-       /*
-        * reap kr from task_suspend call in child
-        */
-       if (dt_waitpid(child_pid, &exit_status, NULL, timeout)) {
-               T_ASSERT_MACH_SUCCESS(exit_status, "verify child task_suspend is successful");
-       } else {
-               T_FAIL("dt_waitpid failed");
-       }
-
-       kr = task_info(mach_task_self(), flavor, info_data[AFTER], &count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       resident_size_diff = info_get(kind, GET_RESIDENT_SIZE, info_data[AFTER]) - info_get(kind, GET_RESIDENT_SIZE, info_data[BEFORE]);
-       virtual_size_diff  = info_get(kind, GET_VIRTUAL_SIZE, info_data[AFTER]) - info_get(kind, GET_VIRTUAL_SIZE, info_data[BEFORE]);
-
-       /*
-        * INFO_32_2 gets the max resident size instead of the current resident size
-        * 32 KB tolerance built into test.  The returned value is generally between 0 and 16384
-        *
-        * max resident size is a discrete field in INFO_MACH, so it's handled differently
-        */
-       if (kind == INFO_32_2) {
-               T_EXPECT_EQ(resident_size_diff % 4096, 0ULL, "verify task_info returns valid max resident_size");
-               T_EXPECT_GE(resident_size_diff, 0ULL, "verify task_info returns non-negative max resident_size");
-               T_EXPECT_GE(virtual_size_diff, (unsigned long long)PAGE_SIZE, "verify task_info returns valid virtual_size");
-       } else {
-               T_EXPECT_GE(resident_size_diff, (unsigned long long)PAGE_SIZE, "task_info returns valid resident_size");
-               T_EXPECT_GE(virtual_size_diff, (unsigned long long)PAGE_SIZE, "task_info returns valid virtual_size");
-       }
-
-       if (kind == INFO_MACH) {
-               resident_size_diff = info_get(kind, GET_MAX_RES, info_data[AFTER]) - info_get(kind, GET_MAX_RES, info_data[BEFORE]);
-               T_EXPECT_EQ(resident_size_diff % 4096, 0ULL, "verify task_info returns valid max resident_size");
-               T_EXPECT_GE(resident_size_diff, 0ULL, "verify task_info returns non-negative max resident_size");
-               T_EXPECT_GE(info_get(kind, GET_MAX_RES, info_data[AFTER]), info_get(kind, GET_RESIDENT_SIZE, info_data[AFTER]),
-                           "verify max resident size is greater than or equal to curr resident size");
-       }
-
-       do_factorial_task();
-
-       /*
-        * These counters give time for threads that have terminated. We don't have any, so we check for zero.
-        */
-
-       time_value_t * user_tv = (time_value_t *)(info_get(kind, GET_USER_TIME, info_data[BEFORE]));
-       T_EXPECT_EQ((user_tv->seconds + user_tv->microseconds / 1000000), 0, "verify task_info shows valid user time");
-
-       time_value_t * sys_tv = (time_value_t *)(info_get(kind, GET_SYS_TIME, info_data[BEFORE]));
-       T_EXPECT_EQ(sys_tv->seconds + (sys_tv->microseconds / 1000000), 0, "verify task_info shows valid system time");
-
-       /*
-        * The default value for non-kernel tasks is TIMESHARE.
-        */
-
-       policy_t pt = (policy_t)info_get(kind, GET_POLICY, info_data[BEFORE]);
-
-       T_EXPECT_EQ(pt, POLICY_TIMESHARE, "verify task_info shows valid policy");
-
-       /*
-        * This is a negative case.
-        */
-
-       count--;
-       kr = task_info(mach_task_self(), flavor, info_data[AFTER], &count);
-
-       T_ASSERT_MACH_ERROR(kr, KERN_INVALID_ARGUMENT,
-                           "Negative test case: task_info should verify that count is at least equal to what is defined in API");
-
-       /*
-        * deallocate memory
-        */
-       munmap(tmp_map, PAGE_SIZE);
-
-       return;
-
-#undef BEFORE
-#undef AFTER
-}
-
-T_DECL(test_sigcont_task_suspend_resume,
-       "test to verify that SIGCONT on task_suspend()-ed process works",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       mach_task_basic_info_data_t mach_basic_info_data;
-       task_info_t info_data = (task_info_t)&mach_basic_info_data;
-
-       task_debug_info_internal_data_t debug_info;
-       mach_msg_type_number_t debug_count = TASK_DEBUG_INFO_INTERNAL_COUNT;
-
-       kern_return_t kr;
-       int posix_ret;
-       mach_msg_type_number_t count = MACH_TASK_BASIC_INFO_COUNT;
-       task_flavor_t flavor         = MACH_TASK_BASIC_INFO;
-       integer_t suspend_count;
-       integer_t debug_suspend_count;
-       pid_t child_pid = 0;
-       mach_port_name_t child_task;
-       /*for dt_waitpid*/
-       int timeout     = 5;
-       int exit_status = 0;
-       int signal_no   = 0;
-
-       child_pid = fork();
-
-       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked");
-
-       if (child_pid == 0) {
-               /*
-                * This will suspend the child process.
-                */
-               kr = task_suspend(mach_task_self());
-
-               /*
-                * When child resumes, it exits immediately
-                */
-
-               exit(kr);
-       }
-
-       /*
-        * Wait for the child process to suspend itself.
-        */
-       sleep(1);
-
-       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
-
-       /*
-        * Verify the suspend_count for child and resume it.
-        */
-
-       kr = task_info(child_task, flavor, info_data, &count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
-       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct suspend_count (1) (actually user stop count) ");
-
-       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       debug_suspend_count = debug_info.suspend_count;
-       T_ASSERT_EQ(debug_info.suspend_count, 1, "verify debug_info shows correct suspend_count(1)");
-
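-       /* SIGCONT should resume the task_suspend()-ed child */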
-       posix_ret = kill(child_pid, SIGCONT);
-       T_ASSERT_POSIX_SUCCESS(posix_ret, "verify signal call succeeded");
-
-       /*
-        * reap kr from task_suspend call in child
-        */
-       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
-
-       T_ASSERT_EQ(signal_no, 0, "child should be resumed and exit without signal");
-       T_ASSERT_EQ(exit_status, 0, "child should exit with 0");
-
-}
-
-T_DECL(test_sigcont_task_suspend2_resume,
-       "test to verify that SIGCONT on task_suspend2()-ed process doesn't work",
-       T_META_ASROOT(true),
-       T_META_LTEPHASE(LTE_POSTINIT))
-{
-       T_SETUPBEGIN;
-       int is_dev = is_development_kernel();
-       T_QUIET;
-       T_ASSERT_TRUE(is_dev, "verify development kernel is running");
-       T_SETUPEND;
-
-       mach_task_basic_info_data_t mach_basic_info_data;
-       task_info_t info_data = (task_info_t)&mach_basic_info_data;
-
-       task_debug_info_internal_data_t debug_info;
-       mach_msg_type_number_t debug_count = TASK_DEBUG_INFO_INTERNAL_COUNT;
-
-       kern_return_t kr;
-       int posix_ret;
-       mach_msg_type_number_t count  = MACH_TASK_BASIC_INFO_COUNT;
-       task_flavor_t flavor          = MACH_TASK_BASIC_INFO;
-       integer_t suspend_count       = 0;
-       integer_t debug_suspend_count = 0;
-       pid_t child_pid               = 0;
-       mach_port_name_t child_task;
-       task_suspension_token_t child_token = 0xFFFFF;
-
-       /*
-        * for dt_waitpid
-        * We expect the test to fail right now, so I've set timeout to
-        * be shorter than we may want it to be when the issue is fixed
-        */
-       int timeout     = 1;
-       int exit_status = 0;
-       int signal_no   = 0;
-
-       /* for pipe */
-       int fd[2];
-       pipe(fd);
-       int pipe_msg = 0;
-
-       child_pid = fork();
-
-       T_ASSERT_POSIX_SUCCESS(child_pid, "verify process can be forked %d", child_pid);
-
-       if (child_pid == 0) {
-               close(fd[1]);
-               T_LOG("Waiting to read from parent...");
-               read(fd[0], &pipe_msg, sizeof(pipe_msg));
-               T_LOG("Done reading from parent, about to exit...");
-               exit(0);
-       }
-       /*
-        * Wait for child to fork and block on read
-        */
-       sleep(1);
-
-       close(fd[0]);
-
-       kr = task_for_pid(mach_task_self(), child_pid, &child_task);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_for_pid succeeded.  check sudo if failed");
-
-       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       debug_suspend_count = debug_info.suspend_count;
-       T_EXPECT_EQ(debug_suspend_count, 0, "verify debug_info shows correct (true) suspend_count(0)");
-
-       kr = task_suspend2(child_task, &child_token);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_suspend2 call succeeded");
-
-       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       debug_suspend_count = debug_info.suspend_count;
-       T_ASSERT_EQ(debug_suspend_count, 1, "verify debug_info shows correct (true) suspend_count(1)");
-
-       /*
-        * Verify the suspend_count for child and resume it.
-        */
-
-       kr = task_info(child_task, flavor, info_data, &count);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
-       T_EXPECT_EQ(suspend_count, 1, "verify task_info shows correct (user_stop_count) suspend_count (1)");
-
-       posix_ret = kill(child_pid, SIGCONT);
-       T_ASSERT_POSIX_SUCCESS(posix_ret, "verify signal call succeeded");
-
-       kr = task_info(child_task, TASK_DEBUG_INFO_INTERNAL, (task_info_t)&debug_info, &debug_count);
-       T_EXPECT_MACH_SUCCESS(kr, "verify task_info call succeeded");
-
-       debug_suspend_count = debug_info.suspend_count;
-       T_EXPECTFAIL_WITH_RADAR(33166654);
-       T_EXPECT_EQ(debug_suspend_count, 1, "verify debug_info shows correct (true) suspend_count (1)");
-
-       suspend_count = (integer_t)(info_get(INFO_MACH, GET_SUSPEND_COUNT, info_data));
-       T_ASSERT_EQ(suspend_count, 1, "verify task_info shows correct (user_stop_count) suspend_count (1) after SIG_CONT");
-
-       kr = task_resume(child_task);
-       T_EXPECTFAIL_WITH_RADAR(33166654);
-       T_EXPECT_MACH_SUCCESS(kr, "verify task_resume succeeded");
-
-       /*
-        * reap kr from task_suspend call in child
-        */
-
-       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
-
-       T_ASSERT_EQ(signal_no, SIG_DT_TIMEOUT, "dt_waitpid timed out as expected");
-
-       // Resume properly using token and then wait
-
-       kr = task_resume2(child_token);
-       T_EXPECTFAIL_WITH_RADAR(33166654);
-       T_ASSERT_MACH_SUCCESS(kr, "verify task_resume2 succeeded");
-
-       write(fd[1], &pipe_msg, sizeof(pipe_msg));
-
-       /*
-        * reap kr from task_suspend call in child
-        */
-       dt_waitpid(child_pid, &exit_status, &signal_no, timeout);
-
-       T_ASSERT_EQ(signal_no, 0, "child should be resumed and no signal should be returned");
-       T_ASSERT_EQ(exit_status, 0, "child should exit with 0");
-
-}
-
-uint64_t
-info_get(enum info_kind kind, enum info_get get, void * data)
-{
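-       /*
-        * Returns the requested field widened to uint64_t.  For GET_USER_TIME and
-        * GET_SYS_TIME the value returned is the address of the embedded time_value_t,
-        * which callers cast back to a time_value_t pointer.
-        */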
-       switch (get) {
-       case GET_SUSPEND_COUNT:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t)(((task_basic_info_32_t)data)->suspend_count);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t)(((task_basic_info_64_2_t)data)->suspend_count);
-#else
-               case INFO_64:
-                       return (uint64_t)(((task_basic_info_64_t)data)->suspend_count);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t)(((mach_task_basic_info_t)data)->suspend_count);
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_RESIDENT_SIZE:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t)(((task_basic_info_32_t)data)->resident_size);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t)(((task_basic_info_64_2_t)data)->resident_size);
-#else
-               case INFO_64:
-                       return (uint64_t)(((task_basic_info_64_t)data)->resident_size);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t)(((mach_task_basic_info_t)data)->resident_size);
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_VIRTUAL_SIZE:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t)(((task_basic_info_32_t)data)->virtual_size);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t)(((task_basic_info_64_2_t)data)->virtual_size);
-#else
-               case INFO_64:
-                       return (uint64_t)(((task_basic_info_64_t)data)->virtual_size);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t)(((mach_task_basic_info_t)data)->virtual_size);
-
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_USER_TIME:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t) & (((task_basic_info_32_t)data)->user_time);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t) & (((task_basic_info_64_2_t)data)->user_time);
-#else
-               case INFO_64:
-                       return (uint64_t) & (((task_basic_info_64_t)data)->user_time);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t) & (((mach_task_basic_info_t)data)->user_time);
-
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_SYS_TIME:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t) & (((task_basic_info_32_t)data)->system_time);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t) & (((task_basic_info_64_2_t)data)->system_time);
-#else
-               case INFO_64:
-                       return (uint64_t) & (((task_basic_info_64_t)data)->system_time);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t) & (((mach_task_basic_info_t)data)->system_time);
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_POLICY:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-                       return (uint64_t)(((task_basic_info_32_t)data)->policy);
-#if defined(__arm__) || defined(__arm64__)
-               case INFO_64:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-
-               case INFO_64_2:
-                       return (uint64_t)(((task_basic_info_64_2_t)data)->policy);
-#else
-               case INFO_64:
-                       return (uint64_t)(((task_basic_info_64_t)data)->policy);
-
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-                       break;
-#endif /* defined(__arm__) || defined(__arm64__) */
-               case INFO_MACH:
-                       return (uint64_t)(((mach_task_basic_info_t)data)->policy);
-
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       case GET_MAX_RES:
-               switch (kind) {
-               case INFO_32:
-               case INFO_32_2:
-               case INFO_64:
-               case INFO_64_2:
-                       T_ASSERT_FAIL("illegal info_get %d %d", kind, get);
-               case INFO_MACH:
-                       return (uint64_t)(((mach_task_basic_info_t)data)->resident_size_max);
-               case INFO_MAX:
-               default:
-                       T_ASSERT_FAIL("unhandled info_get %d %d", kind, get);
-               }
-       }
-
-       __builtin_unreachable();
-}
-
-/*
- * Determines whether we're running on a development kernel
- */
-static int
-is_development_kernel(void)
-{
-#define NOTSET -1
-
-       static int is_dev = NOTSET;
-
-       if (is_dev == NOTSET) {
-               int dev;
-               size_t dev_size = sizeof(dev);
-
-               T_QUIET;
-               T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.development", &dev, &dev_size, NULL, 0), NULL);
-               is_dev = (dev != 0);
-
-               return is_dev;
-       } else {
-               return is_dev;
-       }
-#undef NOTSET
-}
diff --git a/tools/tests/darwintests/task_info_28439149.c b/tools/tests/darwintests/task_info_28439149.c
deleted file mode 100644 (file)
index 9102ba6..0000000
+++ /dev/null
@@ -1,81 +0,0 @@
-#include <darwintest.h>
-#include <mach/host_priv.h>
-#include <mach/mach.h>
-#include <mach/mach_types.h>
-#include <mach/processor_set.h>
-#include <mach/task.h>
-#include <sys/sysctl.h>
-#include <unistd.h>
-#include <mach-o/dyld.h>
-#include <mach-o/dyld_images.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <stdlib.h>
-
-static void do_child(int *pipefd){
-       int exit = 0;
-
-       close(pipefd[1]);
-       read(pipefd[0], &exit, sizeof(int));
-       T_QUIET; T_EXPECT_EQ_INT(exit, 1, "exit");
-       close(pipefd[0]);
-}
-
-T_DECL(task_info_28439149, "ensure that task_info has the correct permission",
-                T_META_CHECK_LEAKS(false), T_META_ASROOT(true))
-{
-       int pipefd[2];
-
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pipe(pipefd), "pipe");
-
-       int pid = fork();
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(pid, "fork");
-
-       if (pid == 0) {
-               do_child(pipefd);
-               return;
-       }
-
-       close(pipefd[0]);
-
-       int exit;
-       mach_msg_type_number_t count;
-        struct task_basic_info_64 ti;
-       task_dyld_info_data_t di;
-
-       task_t self = mach_task_self();
-       task_t other_name;
-       task_t other;
-       int ret;
-
-       T_EXPECT_MACH_SUCCESS(task_for_pid(self, pid, &other), NULL);
-       T_EXPECT_MACH_SUCCESS(task_name_for_pid(self, pid, &other_name), NULL);
-
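-       /* TASK_BASIC_INFO_64 should succeed on the caller's own task, on a task control port, and on a task name port */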
-       count = TASK_BASIC_INFO_64_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(self, TASK_BASIC_INFO_64, (task_info_t)&ti,
-                               &count), "task_info(self, TASK_BASIC_INFO_64 ...)");
-       count = TASK_BASIC_INFO_64_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(other, TASK_BASIC_INFO_64, (task_info_t)&ti,
-                               &count), "task_info(other, TASK_BASIC_INFO_64 ...)");
-       count = TASK_BASIC_INFO_64_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(other_name, TASK_BASIC_INFO_64, (task_info_t)&ti,
-                               &count), "task_info(other_name, TASK_BASIC_INFO_64 ...)");
-
-
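-       /* TASK_DYLD_INFO requires a task control port; with a task name port it should fail with KERN_INVALID_ARGUMENT */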
-       count = TASK_DYLD_INFO_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(self, TASK_DYLD_INFO, (task_info_t)&di,
-                               &count), "task_info(self, TASK_DYLD_INFO ...)");
-       count = TASK_DYLD_INFO_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(other, TASK_DYLD_INFO, (task_info_t)&di,
-                               &count), "task_info(other, TASK_DYLD_INFO ...)");
-       count = TASK_DYLD_INFO_COUNT;
-       ret = task_info(other_name, TASK_DYLD_INFO, (task_info_t)&di, &count);
-       T_EXPECT_EQ_INT(ret, KERN_INVALID_ARGUMENT, "task info TASK_DYLD_INFO should fail with mach_port_name");
-
-       exit = 1;
-       write(pipefd[1], &exit, sizeof(int));
-       close(pipefd[1]);
-
-       wait(NULL);
-}
-
diff --git a/tools/tests/darwintests/task_inspect.c b/tools/tests/darwintests/task_inspect.c
deleted file mode 100644 (file)
index f16064a..0000000
+++ /dev/null
@@ -1,145 +0,0 @@
-#ifdef T_NAMESPACE
-#undef T_NAMESPACE
-#endif
-
-#include <darwintest.h>
-
-#include <mach/host_priv.h>
-#include <mach/mach.h>
-#include <mach/mach_types.h>
-#include <mach/mach_vm.h>
-#include <mach/processor_set.h>
-#include <mach/task.h>
-#include <sys/sysctl.h>
-#include <unistd.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.ipc"));
-
-/*
- * Attempt to inspect kernel_task using a task_inspect_t.  Interact with the
- * kernel in the same way top(1) and lsmp(1) do.
- */
-
-static void
-check_secure_kernel(void)
-{
-       int secure_kern = 0;
-       size_t secure_kern_size = sizeof(secure_kern);
-
-       T_ASSERT_POSIX_SUCCESS(sysctlbyname("kern.secure_kernel", &secure_kern,
-                       &secure_kern_size, NULL, 0), NULL);
-
-       if (secure_kern) {
-               T_SKIP("secure kernel: processor_set_tasks will not return kernel_task");
-       }
-}
-
-static void
-attempt_kernel_inspection(task_t task)
-{
-       pid_t pid = (pid_t)-1;
-       mach_msg_type_number_t i, count, thcnt;
-       struct task_basic_info_64 ti;
-       thread_act_array_t threads;
-
-       T_QUIET;
-       T_EXPECT_MACH_SUCCESS(pid_for_task(task, &pid), NULL);
-       T_LOG("Checking pid %d", pid);
-
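-       /* only the kernel task has pid 0; skip every other task */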
-       if (pid != 0) {
-               return;
-       }
-
-       T_LOG("found kernel_task, attempting to inspect");
-
-       count = TASK_BASIC_INFO_64_COUNT;
-       T_EXPECT_MACH_SUCCESS(task_info(task, TASK_BASIC_INFO_64, (task_info_t)&ti,
-                                       &count), "task_info(... TASK_BASIC_INFO_64 ...)");
-
-       T_EXPECT_MACH_SUCCESS(task_threads(task, &threads, &thcnt), "task_threads");
-       T_LOG("Found %d kernel threads.", thcnt);
-       for (i = 0; i < thcnt; i++) {
-               kern_return_t kr;
-               thread_basic_info_data_t basic_info;
-               mach_msg_type_number_t bi_count = THREAD_BASIC_INFO_COUNT;
-
-               kr = thread_info(threads[i], THREAD_BASIC_INFO,
-                               (thread_info_t)&basic_info, &bi_count);
-               /*
-                * Ignore threads that have gone away.
-                */
-               if (kr == MACH_SEND_INVALID_DEST) {
-                       T_LOG("ignoring thread that has been destroyed");
-                       continue;
-               }
-               T_EXPECT_MACH_SUCCESS(kr, "thread_info(... THREAD_BASIC_INFO ...)");
-               (void)mach_port_deallocate(mach_task_self(), threads[i]);
-       }
-       mach_vm_deallocate(mach_task_self(),
-                          (mach_vm_address_t)(uintptr_t)threads,
-                          thcnt * sizeof(*threads));
-
-       ipc_info_space_basic_t basic_info;
-       T_EXPECT_MACH_SUCCESS(mach_port_space_basic_info(task, &basic_info), "mach_port_space_basic_info");
-
-       ipc_info_space_t info_space;
-       ipc_info_name_array_t table;
-       ipc_info_tree_name_array_t tree;
-       mach_msg_type_number_t tblcnt = 0, treecnt = 0;
-       T_EXPECT_MACH_SUCCESS(mach_port_space_info(task, &info_space, &table,
-                                                  &tblcnt, &tree, &treecnt), "mach_port_space_info");
-       if (tblcnt > 0) {
-               mach_vm_deallocate(mach_task_self(),
-                                  (mach_vm_address_t)(uintptr_t)table,
-                                  tblcnt * sizeof(*table));
-       }
-       if (treecnt > 0) {
-               mach_vm_deallocate(mach_task_self(),
-                                  (mach_vm_address_t)(uintptr_t)tree,
-                                  treecnt * sizeof(*tree));
-       }
-
-       T_END;
-}
-
-T_DECL(inspect_kernel_task,
-               "ensure that kernel task can be inspected",
-               T_META_CHECK_LEAKS(false),
-               T_META_ASROOT(true))
-{
-       processor_set_name_array_t psets;
-       processor_set_t pset;
-       task_array_t tasks;
-       mach_msg_type_number_t i, j, tcnt, pcnt = 0;
-       mach_port_t self = mach_host_self();
-
-       check_secure_kernel();
-
-       T_ASSERT_MACH_SUCCESS(host_processor_sets(self, &psets, &pcnt),
-                       NULL);
-
-       for (i = 0; i < pcnt; i++) {
-               T_ASSERT_MACH_SUCCESS(host_processor_set_priv(self, psets[i], &pset), NULL);
-               T_LOG("Checking pset %d/%d", i, pcnt - 1);
-
-               tcnt = 0;
-               T_ASSERT_MACH_SUCCESS(processor_set_tasks(pset, &tasks, &tcnt), NULL);
-
-               for (j = 0; j < tcnt; j++) {
-                       attempt_kernel_inspection(tasks[j]);
-                       mach_port_deallocate(self, tasks[j]);
-               }
-
-               /* free tasks array */
-               mach_vm_deallocate(mach_task_self(),
-                                  (mach_vm_address_t)(uintptr_t)tasks,
-                                  tcnt * sizeof(*tasks));
-               mach_port_deallocate(mach_task_self(), pset);
-               mach_port_deallocate(mach_task_self(), psets[i]);
-       }
-       mach_vm_deallocate(mach_task_self(),
-                          (mach_vm_address_t)(uintptr_t)psets,
-                          pcnt * sizeof(*psets));
-
-       T_FAIL("could not find kernel_task in list of tasks returned");
-}
diff --git a/tools/tests/darwintests/task_inspect.entitlements b/tools/tests/darwintests/task_inspect.entitlements
deleted file mode 100644 (file)
index eaaf1de..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.system-task-ports</key>
-       <true/>
-       <key>task_for_pid-allow</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/thread_group_set_32261625.c b/tools/tests/darwintests/thread_group_set_32261625.c
deleted file mode 100644 (file)
index 1c7eb3f..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-#include <darwintest.h>
-#include <ktrace.h>
-#include <sys/kdebug.h>
-
-#define TEST_EVENTID (0xfedcbb00)
-
-static void*
-newthread(void *arg)
-{
-#pragma unused(arg)
-       while (1) {
-               kdebug_trace(TEST_EVENTID, 0, 0, 0, 0);
-               sleep(1);
-       }
-}
-
-#define TEST_TIMEOUT (15 * NSEC_PER_SEC)
-
-T_DECL(thread_group_set, "Checks that new threads get a THREAD_GROUP_SET tracepoint with a non-zero tid") {
-       pthread_t thread;
-       __block int seen_new_thread = 0, seen_thread_group_set = 0;
-
-       ktrace_machine_t machine = ktrace_machine_create_current();
-       T_WITH_ERRNO; T_ASSERT_NOTNULL(machine, "ktrace_machine_create_current");
-
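-       /* skip early if this machine does not support thread groups */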
-       bool has_tg = false;
-       if (ktrace_machine_has_thread_groups(machine, &has_tg) || !has_tg) {
-               T_SKIP("thread groups not supported on this system");
-       }
-       ktrace_machine_destroy(machine);
-
-       ktrace_session_t session = ktrace_session_create();
-       T_WITH_ERRNO; T_ASSERT_NOTNULL(session, "ktrace_session_create");
-
-       ktrace_set_interactive(session);
-
-       ktrace_set_completion_handler(session, ^{
-               ktrace_session_destroy(session);
-               T_ASSERT_TRUE(seen_new_thread, "seen new thread tracepoint");
-               T_END;
-       });
-
-       ktrace_events_single(session, TEST_EVENTID, ^(__unused ktrace_event_t e) {
-               T_EXPECT_TRUE(seen_thread_group_set, "seen THREAD_GROUP_SET tracepoint");
-               seen_new_thread = 1;
-               ktrace_end(session, 1);
-       });
-
-       ktrace_events_single(session, MACHDBG_CODE(DBG_MACH_THREAD_GROUP, MACH_THREAD_GROUP_SET), ^(ktrace_event_t e) {
-               T_EXPECT_GT(e->arg3, (uintptr_t)0, "tid on THREAD_GROUP_SET");
-               seen_thread_group_set = 1;
-       });
-
-       dispatch_after(dispatch_time(DISPATCH_TIME_NOW, TEST_TIMEOUT), dispatch_get_main_queue(), ^{
-               ktrace_end(session, 0);
-       });
-
-       T_ASSERT_POSIX_SUCCESS(ktrace_start(session, dispatch_get_main_queue()), "ktrace_start");
-
-       T_EXPECT_POSIX_SUCCESS(pthread_create(&thread, NULL, newthread, NULL), "pthread_create");
-       T_EXPECT_POSIX_SUCCESS(pthread_detach(thread), "pthread_detach");
-
-       dispatch_main();
-}
diff --git a/tools/tests/darwintests/utimensat.c b/tools/tests/darwintests/utimensat.c
deleted file mode 100644 (file)
index d5baad6..0000000
+++ /dev/null
@@ -1,83 +0,0 @@
-#include <sys/cdefs.h>
-#include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <limits.h>
-#include <paths.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-
-#include <darwintest.h>
-#include <darwintest_utils.h>
-
-#define FILENAME "utimensat"
-
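-/*
- * Each entry is an {atime, mtime} pair handed to utimensat(); the cases cover
- * explicit timestamps, UTIME_NOW, UTIME_OMIT, and combinations of the three.
- */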
-static const struct timespec tptr[][2] = {
-       { { 0x12345678, 987654321 }, { 0x15263748, 123456789 }, },
-
-       { { 0, UTIME_NOW }, { 0x15263748, 123456789 }, },
-       { { 0x12345678, 987654321 }, { 0, UTIME_NOW }, },
-       { { 0, UTIME_NOW }, { 0, UTIME_NOW }, },
-
-       { { 0, UTIME_OMIT }, { 0x15263748, 123456789 }, },
-       { { 0x12345678, 987654321 }, { 0, UTIME_OMIT }, },
-       { { 0, UTIME_OMIT }, { 0, UTIME_OMIT }, },
-
-       { { 0, UTIME_NOW }, { 0, UTIME_OMIT }, },
-       { { 0, UTIME_OMIT }, { 0, UTIME_NOW }, },
-};
-
-T_DECL(utimensat, "Try various versions of utimensat")
-{
-       T_SETUPBEGIN;
-       T_ASSERT_POSIX_ZERO(chdir(dt_tmpdir()), NULL);
-       // Skip the test if the current working directory is not on APFS.
-       struct statfs sfs = { 0 };
-       T_QUIET; T_ASSERT_POSIX_SUCCESS(statfs(".", &sfs), NULL);
-       if (memcmp(&sfs.f_fstypename[0], "apfs", strlen("apfs")) != 0) {
-               T_SKIP("utimensat is APFS-only, but working directory is non-APFS");
-       }
-       T_SETUPEND;
-
-       struct stat pre_st, post_st;
-       int fd;
-
-       T_ASSERT_POSIX_SUCCESS((fd = open(FILENAME, O_CREAT|O_RDWR, 0644)), NULL);
-       T_ASSERT_POSIX_ZERO(close(fd), NULL);
-
-       for (size_t i = 0; i < sizeof(tptr)/sizeof(tptr[0]); i++) {
-               T_LOG("=== {%ld, %ld} {%ld, %ld} ===", 
-                               tptr[i][0].tv_sec, tptr[i][0].tv_nsec,
-                               tptr[i][1].tv_sec, tptr[i][1].tv_nsec);
-
-               struct timespec now;
-               clock_gettime(CLOCK_REALTIME, &now);
-
-               T_ASSERT_POSIX_ZERO(stat(FILENAME, &pre_st), NULL);
-               T_ASSERT_POSIX_ZERO(utimensat(AT_FDCWD, FILENAME, tptr[i], 0), NULL);
-               T_ASSERT_POSIX_ZERO(stat(FILENAME, &post_st), NULL);
-
-               if (tptr[i][0].tv_nsec == UTIME_NOW) {
-                       T_ASSERT_GE(post_st.st_atimespec.tv_sec, now.tv_sec, NULL);
-               } else if (tptr[i][0].tv_nsec == UTIME_OMIT) {
-                       T_ASSERT_EQ(post_st.st_atimespec.tv_sec, pre_st.st_atimespec.tv_sec, NULL);
-                       T_ASSERT_EQ(post_st.st_atimespec.tv_nsec, pre_st.st_atimespec.tv_nsec, NULL);
-               } else {
-                       T_ASSERT_EQ(post_st.st_atimespec.tv_sec, tptr[i][0].tv_sec, NULL);
-                       T_ASSERT_EQ(post_st.st_atimespec.tv_nsec, tptr[i][0].tv_nsec, NULL);
-               }
-
-               if (tptr[i][1].tv_nsec == UTIME_NOW) {
-                       T_ASSERT_GE(post_st.st_mtimespec.tv_sec, now.tv_sec, NULL);
-               } else if (tptr[i][1].tv_nsec == UTIME_OMIT) {
-                       T_ASSERT_EQ(post_st.st_mtimespec.tv_sec, pre_st.st_mtimespec.tv_sec, NULL);
-                       T_ASSERT_EQ(post_st.st_mtimespec.tv_nsec, pre_st.st_mtimespec.tv_nsec, NULL);
-               } else {
-                       T_ASSERT_EQ(post_st.st_mtimespec.tv_sec, tptr[i][1].tv_sec, NULL);
-                       T_ASSERT_EQ(post_st.st_mtimespec.tv_nsec, tptr[i][1].tv_nsec, NULL);
-               }
-       }
-}
diff --git a/tools/tests/darwintests/verify_kalloc_config.c b/tools/tests/darwintests/verify_kalloc_config.c
deleted file mode 100644 (file)
index 14ce3c9..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <string.h>
-#include <stdlib.h>
-#include <mach/mach.h>
-#include <mach_debug/mach_debug.h>
-#include <darwintest.h>
-
-T_GLOBAL_META(
-       T_META_NAMESPACE("xnu.vm"),
-       T_META_CHECK_LEAKS(false)
-);
-
-static void run_test(void);
-
-static void run_test(void)
-{
-       kern_return_t kr;
-       uint64_t size, i;
-       mach_zone_name_t *name = NULL;
-       unsigned int nameCnt = 0;
-       mach_zone_info_t *info = NULL;
-       unsigned int infoCnt = 0;
-       mach_memory_info_t *wiredInfo = NULL;
-       unsigned int wiredInfoCnt = 0;
-       const char kalloc_str[] = "kalloc.";
-
-       kr = mach_memory_info(mach_host_self(),
-                       &name, &nameCnt, &info, &infoCnt,
-                       &wiredInfo, &wiredInfoCnt);
-       T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_memory_info");
-       T_QUIET; T_ASSERT_EQ(nameCnt, infoCnt, "zone name and info counts don't match");
-
-       /* Match the names of the kalloc zones against their element sizes. */
-       for (i = 0; i < nameCnt; i++) {
-               if (strncmp(name[i].mzn_name, kalloc_str, strlen(kalloc_str)) == 0) {
-                       size = strtoul(&(name[i].mzn_name[strlen(kalloc_str)]), NULL, 10);
-                       T_LOG("ZONE NAME: %-25s ELEMENT SIZE: %llu", name[i].mzn_name, size);
-                       T_QUIET; T_ASSERT_EQ(size, info[i].mzi_elem_size, "kalloc zone name and element size don't match");
-               }
-       }
-
-       if ((name != NULL) && (nameCnt != 0)) {
-               kr = vm_deallocate(mach_task_self(), (vm_address_t) name,
-                               (vm_size_t) (nameCnt * sizeof *name));
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate name");
-       }
-
-       if ((info != NULL) && (infoCnt != 0)) {
-               kr = vm_deallocate(mach_task_self(), (vm_address_t) info,
-                               (vm_size_t) (infoCnt * sizeof *info));
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate info");
-       }
-
-       if ((wiredInfo != NULL) && (wiredInfoCnt != 0)) {
-               kr = vm_deallocate(mach_task_self(), (vm_address_t) wiredInfo,
-                               (vm_size_t) (wiredInfoCnt * sizeof *wiredInfo));
-               T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "vm_deallocate wiredInfo");
-       }
-
-       T_END;
-}
-
-T_DECL( verify_kalloc_config,
-               "verifies that the kalloc zones are configured correctly",
-               T_META_ASROOT(true))
-{
-       run_test();
-}
-
diff --git a/tools/tests/darwintests/voucher_entry_18826844.c b/tools/tests/darwintests/voucher_entry_18826844.c
deleted file mode 100644 (file)
index 24e246a..0000000
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Test that sending a message whose destination port and voucher port are the same
- * voucher, when that voucher has only one send right and the move-send is processed
- * before the copy-send, doesn't panic.
- *
- * clang -o voucherentry voucherentry.c -ldarwintest -Weverything -Wno-gnu-flexible-array-initializer
- *
- * <rdar://problem/18826844>
- */
-
-#include <mach/mach.h>
-#include <darwintest.h>
-
-T_DECL(voucher_entry, "voucher_entry", T_META_CHECK_LEAKS(false), T_META_ALL_VALID_ARCHS(true))
-{
-       kern_return_t kr        = KERN_SUCCESS;
-       mach_voucher_t voucher  = MACH_VOUCHER_NULL;
-
-       /*
-        * The bank voucher already exists in this process, so using it doesn't
-        * actually test the problem. Use an importance voucher instead.
-        */
-       mach_voucher_attr_recipe_data_t recipe = {
-               .key                = MACH_VOUCHER_ATTR_KEY_IMPORTANCE,
-               .command            = MACH_VOUCHER_ATTR_IMPORTANCE_SELF,
-               .previous_voucher   = MACH_VOUCHER_NULL,
-               .content_size       = 0,
-       };
-
-       kr = host_create_mach_voucher(mach_host_self(),
-                                     (mach_voucher_attr_raw_recipe_array_t)&recipe,
-                                     sizeof(recipe), &voucher);
-
-       T_ASSERT_MACH_SUCCESS(kr, "host_create_mach_voucher");
-
-       T_ASSERT_NOTNULL(voucher, "voucher must not be null");
-
-       mach_port_urefs_t refs = 0;
-
-       kr = mach_port_get_refs(mach_task_self(), voucher, MACH_PORT_RIGHT_SEND, &refs);
-
-       T_ASSERT_MACH_SUCCESS(kr, "mach_port_get_refs");
-
-       T_ASSERT_EQ(refs, (mach_port_urefs_t)1, "voucher must have only one ref");
-
-       /* First, try with two moves (must fail because there's only one ref) */
-       mach_msg_header_t request_msg_1 = {
-               .msgh_remote_port   = voucher,
-               .msgh_local_port    = MACH_PORT_NULL,
-               .msgh_voucher_port  = voucher,
-               .msgh_bits          = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSG_TYPE_MOVE_SEND, 0),
-               .msgh_id            = 0xDEAD,
-               .msgh_size          = sizeof(request_msg_1),
-       };
-
-       kr = mach_msg_send(&request_msg_1);
-
-       T_ASSERT_MACH_ERROR(MACH_SEND_INVALID_DEST, kr, "send with two moves should fail with invalid dest");
-
-       /* Next, try with a move and a copy (will succeed and destroy the last ref) */
-       mach_msg_header_t request_msg_2 = {
-               .msgh_remote_port   = voucher,
-               .msgh_local_port    = MACH_PORT_NULL,
-               .msgh_voucher_port  = voucher,
-               .msgh_bits          = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MOVE_SEND, 0, MACH_MSG_TYPE_COPY_SEND, 0),
-               .msgh_id            = 0xDEAD,
-               .msgh_size          = sizeof(request_msg_2),
-       };
-
-       /* panic happens here */
-       kr = mach_msg_send(&request_msg_2);
-
-       T_ASSERT_MACH_SUCCESS(kr, "send with move and copy succeeds");
-
-       kr = mach_port_get_refs(mach_task_self(), voucher, MACH_PORT_RIGHT_SEND, &refs);
-
-       T_ASSERT_MACH_ERROR(KERN_INVALID_NAME, kr, "voucher should now be invalid name");
-}
-
diff --git a/tools/tests/darwintests/voucher_traps.c b/tools/tests/darwintests/voucher_traps.c
deleted file mode 100644 (file)
index f3e5a0a..0000000
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Test voucher trap APIs.
- * There was an unfortunate bug in the trap interface that used the user space
- * _address_ of a trap parameter as a copyin size. This test validates there
- * are no other kernel panics in the voucher create and voucher attribute
- * extraction mach traps.
- *
- * clang -o voucher_traps voucher_traps.c -ldarwintest -Weverything -Wno-gnu-flexible-array-initializer
- *
- * <rdar://problem/29379175>
- */
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <mach/mach.h>
-#include <mach/mach_vm.h>
-#include <mach/mach_traps.h>
-
-#include <atm/atm_types.h>
-
-#include <darwintest.h>
-
-
-static mach_port_t get_atm_voucher(void)
-{
-       mach_voucher_attr_recipe_data_t r = {
-               .key = MACH_VOUCHER_ATTR_KEY_ATM,
-               .command = MACH_VOUCHER_ATTR_ATM_CREATE
-       };
-       mach_port_t port = MACH_PORT_NULL;
-       kern_return_t kr = host_create_mach_voucher(mach_host_self(),
-                                                   (mach_voucher_attr_raw_recipe_array_t)&r,
-                                                   sizeof(r), &port);
-       T_ASSERT_MACH_SUCCESS(kr, "Create ATM voucher: 0x%x", (unsigned int)port);
-
-       return port;
-}
-
-
-T_DECL(voucher_extract_attr_recipe, "voucher_extract_attr_recipe")
-{
-       kern_return_t kr;
-       mach_vm_size_t alloc_sz;
-       mach_port_t port;
-       mach_vm_address_t alloc_addr;
-
-       /* map at least a page of memory at some arbitrary location */
-       alloc_sz = (mach_vm_size_t)round_page(MACH_VOUCHER_TRAP_STACK_LIMIT + 1);
-
-       /*
-        * We could theoretically ask for a fixed location, but this is more
-        * reliable, and we're not actually trying to exploit anything - a
-        * kernel panic on failure should suffice :-)
-        */
-       alloc_addr = (mach_vm_address_t)round_page(MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE + 1);
-       kr = mach_vm_allocate(mach_task_self(), &alloc_addr,
-                             alloc_sz, VM_FLAGS_ANYWHERE);
-
-       /*
-        * Make sure that the address of the allocation is larger than the
-        * maximum recipe size: this will test for the bug that was fixed in
-        * <rdar://problem/29379175>.
-        */
-       T_ASSERT_GT_ULLONG((uint64_t)alloc_addr,
-                          (uint64_t)MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE,
-                          "Recipe addr (%llu bytes): 0x%llx > max recipe sz: %llu",
-                          (uint64_t)alloc_sz, (uint64_t)alloc_addr,
-                          (uint64_t)MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE);
-
-       /* make the allocation look like a pointer to an int */
-       mach_msg_type_number_t *recipe_size;
-       recipe_size = (mach_msg_type_number_t *)((uintptr_t)alloc_addr);
-       bzero(recipe_size, (unsigned long)alloc_sz);
-       if (alloc_sz > MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE)
-               *recipe_size = MACH_VOUCHER_ATTR_MAX_RAW_RECIPE_ARRAY_SIZE;
-       else
-               *recipe_size = (mach_msg_type_number_t)alloc_sz;
-
-       /* recipe buffer on the heap: memset it so panics show up loudly */
-       size_t size = (size_t)(10 * 1024 * 1024);
-       void *recipe = malloc(size);
-       memset(recipe, 0x41, size);
-
-       port = get_atm_voucher();
-
-       /*
-        * This should try to extract the ATM attribute using a buffer on the
-        * kernel heap (probably zone memory).
-        */
-       kr = mach_voucher_extract_attr_recipe_trap(port, MACH_VOUCHER_ATTR_KEY_ATM,
-                                                  recipe, recipe_size);
-       T_ASSERT_MACH_SUCCESS(kr, "Extract attribute data with recipe: heap");
-
-       /* reset the recipe memory */
-       memset(recipe, 0x41, size);
-       /* reduce the size to get an allocation on the kernel stack */
-       *recipe_size = MACH_VOUCHER_TRAP_STACK_LIMIT - 1;
-
-       /*
-        * This should try to extract the ATM attribute using a buffer on the
-        * kernel stack.
-        */
-       kr = mach_voucher_extract_attr_recipe_trap(port, MACH_VOUCHER_ATTR_KEY_ATM,
-                                                  recipe, recipe_size);
-       T_ASSERT_MACH_SUCCESS(kr, "Extract attribute data with recipe: stack");
-
-       /* cleanup */
-
-       free(recipe);
-       kr = mach_vm_deallocate(mach_task_self(), alloc_addr, alloc_sz);
-       T_ASSERT_MACH_SUCCESS(kr, "Deallocate recipe buffers");
-}
diff --git a/tools/tests/darwintests/work_interval_test.c b/tools/tests/darwintests/work_interval_test.c
deleted file mode 100644 (file)
index cc69250..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-
-/* test that the header doesn't implicitly depend on others */
-#include <sys/work_interval.h>
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <errno.h>
-#include <err.h>
-#include <string.h>
-#include <pthread.h>
-
-#include <mach/mach.h>
-
-#include <darwintest.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"));
-
-static mach_port_t port = MACH_PORT_NULL;
-
-static void *
-joining_thread_fn(__unused void *arg)
-{
-       int ret = 0;
-       kern_return_t kr = KERN_SUCCESS;
-
-       ret = work_interval_join_port(port);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, another thread");
-
-       kr = mach_port_deallocate(mach_task_self(), port);
-       T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate of port, another thread");
-
-       /* deliberately exit with joined work interval */
-       return NULL;
-}
-
-T_DECL(work_interval, "work interval interface")
-{
-       int ret = 0;
-       work_interval_t handle = NULL;
-       uint64_t now = mach_absolute_time();
-       kern_return_t kr = KERN_SUCCESS;
-
-       ret = work_interval_create(NULL, 0);
-       T_ASSERT_EQ(errno, EINVAL, "create with null errno EINVAL");
-       T_ASSERT_EQ(ret, -1, "create with null returns -1");
-
-       /* Binary must be entitled for this to succeed */
-       ret = work_interval_create(&handle, 0);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, no flags");
-
-       ret = work_interval_copy_port(handle, &port);
-       T_ASSERT_EQ(errno, EINVAL, "work_interval_copy_port on non-joinable interval errno EINVAL");
-       T_ASSERT_EQ(ret, -1, "work_interval_copy_port on non-joinable interval returns -1");
-
-       ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, no flags");
-
-       ret = work_interval_destroy(handle);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy, no flags");
-
-       uint32_t flags[] = {
-               WORK_INTERVAL_FLAG_JOINABLE,
-               WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP,
-       };
-
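-       /* exercise the copy-port/join/notify/destroy sequence for each joinable flag combination */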
-       for (uint32_t i = 0 ; i < sizeof(flags) / sizeof(flags[0]) ; i++) {
-               ret = work_interval_create(&handle, flags[i]);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, joinable");
-
-               ret = work_interval_copy_port(handle, &port);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_copy_port, joinable");
-
-               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
-               T_ASSERT_EQ(ret, -1, "work_interval_notify on non-joined thread returns -1");
-               T_ASSERT_EQ(errno, EINVAL, "work_interval_notify on non-joined thread errno EINVAL");
-
-               ret = work_interval_join_port(port);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, joinable");
-
-               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, on joined thread");
-
-               ret = work_interval_join_port(port);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, join the same interval again");
-
-               kr = mach_port_deallocate(mach_task_self(), port);
-               T_ASSERT_MACH_SUCCESS(kr, "mach_port_deallocate of port");
-
-               ret = work_interval_notify(handle, now - 1000, now, now + 1000, now + 2000, 0);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_notify, on joined thread after port deallocate");
-
-               ret = work_interval_destroy(handle);
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy, joinable, on joined thread");
-
-               ret = work_interval_leave();
-               T_ASSERT_POSIX_SUCCESS(ret, "work_interval_leave, on destroyed work interval");
-       }
-
-       ret = work_interval_create(&handle, WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create, joinable");
-
-       ret = work_interval_copy_port(handle, &port);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_copy_port, joinable");
-
-       ret = work_interval_join_port(port);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_join_port, join before handing to another thread");
-
-       pthread_t joining_thread;
-
-       T_ASSERT_POSIX_ZERO(pthread_create(&joining_thread, NULL, joining_thread_fn, NULL), "pthread_create");
-
-       T_ASSERT_POSIX_ZERO(pthread_join(joining_thread, NULL), "pthread_join");
-
-       ret = work_interval_leave();
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_leave");
-
-       ret = work_interval_destroy(handle);
-       T_ASSERT_POSIX_SUCCESS(ret, "work_interval_destroy");
-
-}
-
diff --git a/tools/tests/darwintests/work_interval_test.entitlements b/tools/tests/darwintests/work_interval_test.entitlements
deleted file mode 100644 (file)
index 5726ec2..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-       <key>com.apple.private.kernel.work-interval</key>
-       <true/>
-</dict>
-</plist>
diff --git a/tools/tests/darwintests/workq_sigprof.c b/tools/tests/darwintests/workq_sigprof.c
deleted file mode 100644 (file)
index 6ea38a8..0000000
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <pthread.h>
-#include <stdbool.h>
-#include <signal.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <mach/mach_time.h>
-#include <dispatch/dispatch.h>
-
-#include <darwintest.h>
-
-#if !TARGET_OS_IPHONE
-
-static pthread_t workq_thread;
-static bool signal_received;
-
-static void signal_handler(int sig __unused, siginfo_t *b __unused, void* unused __unused) {
-    if (pthread_self() == workq_thread) {
-        signal_received = true;
-    }
-}
-
-static void workq_block(void *unused __unused) {
-    workq_thread = pthread_self();
-
-    /*
-    sigset_t set;
-    sigemptyset(&set);
-    sigaddset(&set, SIGPROF);
-    pthread_sigmask(SIG_UNBLOCK, &set, NULL);
-    */
-
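-    /* spin for a bounded time waiting for the signal handler to run on this workqueue thread */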
-    uint64_t spin_start = mach_absolute_time();
-    while (mach_absolute_time() - spin_start < 30 * NSEC_PER_SEC)
-        if (signal_received) {
-            T_PASS("Got SIGPROF!");
-            T_END;
-        }
-    }
-
-T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof")
-{
-    struct sigaction sa = {
-        .sa_sigaction = signal_handler
-    };
-    sigfillset(&sa.sa_mask);
-    T_ASSERT_POSIX_ZERO(sigaction(SIGPROF, &sa, NULL), NULL);
-
-    dispatch_queue_t q = dispatch_get_global_queue(0, 0);
-    dispatch_async_f(q, NULL, workq_block);
-
-    struct itimerval timerval = {
-        .it_interval = {.tv_usec = 10000},
-        .it_value = {.tv_usec = 10000}
-    };
-    T_ASSERT_POSIX_ZERO(setitimer(ITIMER_PROF, &timerval, NULL), NULL);
-
-    dispatch_main();
-}
-
-#else //!TARGET_OS_IPHONE
-
-T_DECL(workq_sigprof, "test that workqueue threads can receive sigprof")
-{
-    T_EXPECTFAIL;
-    T_FAIL("<rdar://problem/25864196> setitimer/sigprof doesn't seem to be delivered on embedded platforms");
-}
-
-#endif //!TARGET_OS_IPHONE
diff --git a/tools/tests/darwintests/xnu_quick_test.c b/tools/tests/darwintests/xnu_quick_test.c
deleted file mode 100644 (file)
index 7698b3f..0000000
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <darwintest.h>
-#include "xnu_quick_test_helpers.h"
-
-#include <fcntl.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <mach/mach.h>
-#include <sys/stat.h>
-#include <sys/syscall.h>
-#include <sys/sysctl.h>
-#include <sys/wait.h>
-
-T_GLOBAL_META (T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
-char g_target_path[ PATH_MAX ];
-
-/*  **************************************************************************************************************
- *     Test the syscall system call.
- *  **************************************************************************************************************
- */
-T_DECL(syscall,
-       "xnu_quick_test for syscall", T_META_CHECK_LEAKS(NO))
-{
-       int                             my_fd = -1;
-       char *                  my_pathp;
-       kern_return_t   my_kr;
-
-       T_SETUPBEGIN;
-
-       create_target_directory(TEST_DIRECTORY);
-       
-       T_SETUPEND;
-
-       my_kr = vm_allocate((vm_map_t) mach_task_self(), (vm_address_t*)&my_pathp, 
-               PATH_MAX, VM_FLAGS_ANYWHERE);
-       T_ASSERT_MACH_SUCCESS(my_kr, "Allocating vm to path %s", my_pathp);
-
-       *my_pathp = 0x00;
-       strcpy( my_pathp, &g_target_path[0] );
-       strcat( my_pathp, "/" );
-
-       /* create a test file */
-       
-       T_ASSERT_MACH_SUCCESS( create_random_name( my_pathp, 1), "Create random test file" );
-       /* use an indirect system call to open our test file.
-        * I picked open since it uses a path pointer which grows to 64 bits in an LP64 environment.
-        */
-       T_EXPECT_NE(my_fd = syscall( SYS_open, my_pathp, (O_RDWR | O_EXCL), 0 ),
-               -1, "Attempt to open file using indirect syscall %s", my_pathp);
-
-       if (my_fd != -1)
-               close(my_fd);
-       
-       if (my_pathp != NULL) {
-               remove(my_pathp);       
-               vm_deallocate(mach_task_self(), (vm_address_t)my_pathp, PATH_MAX);
-       }
-
-       T_ATEND(remove_target_directory);
-}
-
-/*  **************************************************************************************************************
- *     Test fork wait4, and exit system calls.
- *  **************************************************************************************************************
- */
-T_DECL(fork_wait4_exit, 
-       "Tests forking off a process and waiting for the child to exit", T_META_CHECK_LEAKS(false))
-{
-       int                             my_err, my_status;
-    pid_t                      my_pid, my_wait_pid;
-       struct rusage   my_usage;
-       
-       strncpy(g_target_path, "/", 2);
-
-       /* spin off another process */
-       T_ASSERT_NE(my_pid = fork(), -1, "Fork off a process");
-       
-       if ( my_pid == 0 ) {
-               struct stat             my_sb;
-               
-               /* child process does very little then exits */
-               my_err = stat( &g_target_path[0], &my_sb );
-               T_WITH_ERRNO;
-        T_ASSERT_TRUE(my_err == 0, "stat call with path: \"%s\" returned \"%d\"", &g_target_path[0], errno);
-               exit( 44 );
-       }
-       
-       /* parent process waits for child to exit */
-       T_ASSERT_NE(my_wait_pid = wait4( my_pid, &my_status, 0, &my_usage ), -1,
-               "Wait for child to exit\n");
-
-       /* wait4 should return our child's pid when it exits */
-       T_ASSERT_EQ(my_wait_pid, my_pid, 
-               "wait4 should return our child's pid when it exits");
-       
-       /* kind of just guessing on these values so if this fails we should take a closer 
-        * look at the returned rusage structure. 
-        */
-        T_ASSERT_FALSE(( my_usage.ru_utime.tv_sec > 1 || 
-               my_usage.ru_stime.tv_sec > 1 || my_usage.ru_majflt > 1000 ||
-               my_usage.ru_msgsnd > 100 ), "wait4 returned rusage structure");
-
-       T_ASSERT_TRUE(( WIFEXITED( my_status ) && WEXITSTATUS( my_status ) == 44 ),
-               "check if wait4 returns right exit status");
-}
-
-T_DECL (getrusage, "Sanity check of getrusage")
-{
-        struct rusage   my_rusage;
-        
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(getrusage( RUSAGE_SELF, &my_rusage ), 0, NULL);
-       T_LOG("Checking that getrusage returned sane values");
-       T_EXPECT_LT(my_rusage.ru_msgrcv, 1000, NULL);
-       T_EXPECT_GE(my_rusage.ru_msgrcv, 0, NULL);
-       T_EXPECT_LT(my_rusage.ru_nsignals, 1000, NULL);
-       T_EXPECT_GE(my_rusage.ru_nsignals, 0, NULL);
-}
-
diff --git a/tools/tests/darwintests/xnu_quick_test_getsetpriority.c b/tools/tests/darwintests/xnu_quick_test_getsetpriority.c
deleted file mode 100644 (file)
index ec62af5..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-#include <darwintest.h>
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-
-T_GLOBAL_META(T_META_NAMESPACE("xnu.quicktest"), T_META_CHECK_LEAKS(false));
-
-T_DECL(getpriority_setpriority, "Tests getpriority and setpriority system calls", T_META_ASROOT(true))
-{
-       int my_priority;
-       int my_new_priority;
-
-       /* getpriority returns scheduling priority so -1 is a valid value */
-       errno       = 0;
-       my_priority = getpriority(PRIO_PROCESS, 0);
-
-       T_WITH_ERRNO;
-       T_ASSERT_FALSE(my_priority == -1 && errno != 0, "Verify getpriority is successful", NULL);
-
-       /* change scheduling priority*/
-       my_new_priority = (my_priority == PRIO_MIN) ? (my_priority + 10) : (PRIO_MIN);
-
-       T_WITH_ERRNO;
-       T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_PROCESS, 0, my_new_priority), "Change scheduling priority", NULL);
-
-       /* verify change */
-       errno       = 0;
-       my_priority = getpriority(PRIO_PROCESS, 0);
-       T_WITH_ERRNO;
-       T_ASSERT_FALSE(my_priority == -1 && errno != 0, "Verify getpriority change is successful", NULL);
-
-       T_WITH_ERRNO;
-       T_ASSERT_EQ(my_priority, my_new_priority, "Verify setpriority correctly set scheduling priority", NULL);
-
-       /* reset scheduling priority */
-       T_WITH_ERRNO;
-       T_ASSERT_POSIX_SUCCESS(setpriority(PRIO_PROCESS, 0, 0), "Reset scheduling priority", NULL);
-}
diff --git a/tools/tests/darwintests/xnu_quick_test_helpers.c b/tools/tests/darwintests/xnu_quick_test_helpers.c
deleted file mode 100644 (file)
index 08670d8..0000000
+++ /dev/null
@@ -1,114 +0,0 @@
-#include <darwintest.h>
-
-#include "xnu_quick_test_helpers.h"
-
-#include <fcntl.h>
-#include <unistd.h>
-
-void create_target_directory( const char * the_targetp )
-{
-    int             err;
-    const char *    my_targetp;
-
-    my_targetp = getenv("TMPDIR");
-    if ( my_targetp == NULL )
-        my_targetp = "/tmp";
-
-    T_ASSERT_LT( strlen( the_targetp ), (unsigned long)( PATH_MAX - 1 ),
-        "check target path too long - \"%s\"", the_targetp );
-
-    for ( ;; ) {
-        int         my_rand;
-        char        my_name[64];
-        
-        my_rand = rand( );
-        sprintf( &my_name[0], "xnu_quick_test-%d", my_rand );
-        T_ASSERT_LT( strlen( &my_name[0] ) + strlen( the_targetp ) + 2, (unsigned long)PATH_MAX,
-            "check target path plus our test directory name is too long: "
-            "target path - \"%s\" test directory name - \"%s\"",
-            the_targetp, &my_name[0] );
-
-        /* append generated directory name onto our path */
-        g_target_path[0] = 0x00;
-        strcat( &g_target_path[0], the_targetp );
-        if ( g_target_path[ (strlen(the_targetp) - 1) ] != '/' ) {
-            strcat( &g_target_path[0], "/" );
-        }
-        strcat( &g_target_path[0], &my_name[0] );
-        
-        /* try to create the test directory */
-        err = mkdir( &g_target_path[0], (S_IRWXU | S_IRWXG | S_IROTH) );
-        if ( err == 0 ) {
-            break;
-        }
-        err = errno;
-        if ( EEXIST != err ) {
-            T_ASSERT_FAIL( "test directory creation failed - \"%s\" \n"
-                "mkdir call failed with error %d - \"%s\"", 
-                &g_target_path[0], errno, strerror( err) );
-        }
-    }
-
-} /* create_target_directory */
-
-/*
- * create_random_name - creates a file with a random / unique name in the given directory.
- * when do_open is true we create a file else we generaate a name that does not exist in the
- * given directory (we do not create anything when do_open is 0).
- * WARNING - caller provides enough space in path buffer for longest possible name.
- * WARNING - assumes caller has appended a trailing '/' on the path passed to us.
- * RAND_MAX is currently 2147483647 (ten characters plus one for a slash)
- */
-int create_random_name( char *the_pathp, int do_open ) {
-    int     i, my_err;
-    int     my_fd = -1;
-    
-    for ( i = 0; i < 1; i++ ) {
-        int         my_rand;
-        char        *myp;
-        char        my_name[32];
-        
-        my_rand = rand( );
-        sprintf( &my_name[0], "%d", my_rand );
-        T_ASSERT_LT_ULONG((strlen( &my_name[0] ) + strlen( the_pathp ) + 2), (unsigned long)PATH_MAX,
-            "check if path to test file is less than PATH_MAX");
-
-        // append generated file name onto our path
-        myp = strrchr( the_pathp, '/' );
-        *(myp + 1) = 0x00;
-        strcat( the_pathp, &my_name[0] );
-        if ( do_open ) {
-            /* create a file with this name */
-            my_fd = open( the_pathp, (O_RDWR | O_CREAT | O_EXCL),
-                            (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) );
-            T_EXPECT_TRUE((my_fd != -1 || errno == EEXIST), "open file with name %s", the_pathp);
-            
-            if( errno == EEXIST )
-                continue;
-        }
-        else {
-            /* make sure the name is unique */
-            struct stat     my_sb;
-            my_err = stat( the_pathp, &my_sb );
-            T_EXPECT_TRUE((my_err == 0 || errno == ENOENT), "make sure the name is unique");
-            
-            if(errno == ENOENT) break;
-            /* name already exists, try another */
-            i--;
-            continue;
-        }
-    }
-    
-    if ( my_fd != -1 )
-        close( my_fd );
-
-    if(do_open && my_fd == -1)
-        return 1;
-
-    return 0;
-} /* create_random_name */
-
-void remove_target_directory() {
-    rmdir(&g_target_path[0]);
-}
-
diff --git a/tools/tests/darwintests/xnu_quick_test_helpers.h b/tools/tests/darwintests/xnu_quick_test_helpers.h
deleted file mode 100644 (file)
index b6a25ed..0000000
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef XNU_QUICK_TEST_HELPERS_H
-#define XNU_QUICK_TEST_HELPERS_H
-
-#include <stdlib.h>
-#include <sys/stat.h>
-#include <sys/syslimits.h>
-
-#define TEST_DIRECTORY "/tmp"
-
-extern char g_target_path[ PATH_MAX ];
-
-int create_random_name( char *the_pathp, int do_open );
-void create_target_directory( const char * the_targetp );
-void remove_target_directory( void );
-
-#endif
index f3421742f26986d2b4e0f90c1359ce1ad7146aa8..d2c718f39379f61390b99549f34912af0fb9b264 100644 (file)
@@ -16,8 +16,13 @@ ifdef RC_ARCHS
   endif
 endif
 
-ARCH_32 := $(filter-out %64, $(ARCHS))
-ARCH_64 := $(filter %64, $(ARCHS))
+# These are convenience functions for filtering based on substrings, as the
+# normal filter functions only accept one wildcard.
+FILTER_OUT_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),,$(string))))
+FILTER_SUBSTRING=$(strip $(foreach string,$(2),$(if $(findstring $(1),$(string)),$(string),)))
+
+ARCH_32:=$(call FILTER_OUT_SUBSTRING,64,$(ARCHS))
+ARCH_64:=$(call FILTER_SUBSTRING,64,$(ARCHS))
 
 ARCH_32_FLAGS := $(patsubst %, -arch %, $(ARCH_32))
 ARCH_64_FLAGS := $(patsubst %, -arch %, $(ARCH_64))
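For illustration, with a hypothetical ARCHS list the new substring filters behave as
follows (the previous $(filter %64, ...) pattern would miss names that merely contain
"64", such as arm64e):

        ARCHS   := armv7 armv7s arm64 arm64e
        ARCH_32 := $(call FILTER_OUT_SUBSTRING,64,$(ARCHS))   # -> armv7 armv7s
        ARCH_64 := $(call FILTER_SUBSTRING,64,$(ARCHS))       # -> arm64 arm64e
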
@@ -25,7 +30,7 @@ ARCH_FLAGS := $(if $(ARCH_64), $(ARCH_64_FLAGS)) $(if $(ARCH_32), $(ARCH_32_FLAG
 
 DSTROOT?=$(shell /bin/pwd)
 
-TARGETS := persona_mgr persona_spawn
+TARGETS := persona_mgr persona_spawn persona_test_run.sh
 
 all: $(addprefix $(DSTROOT)/, $(TARGETS))
 
@@ -33,5 +38,9 @@ $(DSTROOT)/persona_%: persona_%.c persona_test.h Makefile
        ${CC} ${CFLAGS} ${ARCH_FLAGS} -o $(SYMROOT)/$(notdir $@) $<
        if [ ! -e $@ ]; then ditto $(SYMROOT)/$(notdir $@) $@; fi
 
+$(DSTROOT)/persona_test_run.sh: persona_test_run.sh
+       cp $? $@
+       chmod +x $@
+
 clean:
        rm -rf $(addprefix $(DSTROOT)/,$(TARGETS)) $(addprefix $(SYMROOT)/,$(TARGETS)) $(SYMROOT)/*.dSYM
index 5da9f5bff6a0714c1d9dbc09bb372cd8e9ee80a1..93692f3d49a9ce5d5a9774b4e297175a4dbf77f5 100644 (file)
@@ -41,7 +41,8 @@ enum {
        PERSONA_OP_CREATE  = 1,
        PERSONA_OP_DESTROY = 2,
        PERSONA_OP_LOOKUP  = 3,
-       PERSONA_OP_MAX     = 3,
+       PERSONA_OP_SUPPORT = 4,
+       PERSONA_OP_MAX     = 4,
 };
 
 static struct mgr_config {
@@ -84,7 +85,7 @@ static int persona_op_lookup(struct kpersona_info *ki, pid_t pid, uid_t uid)
 {
        int ret;
 
-       info("Looking up persona (pid:%d, uid:%d)", pid, uid);
+       info("Looking up persona (login:%s, pid:%d, uid:%d)", ki->persona_name, pid, uid);
        if (pid > 0) {
                ki->persona_info_version = PERSONA_INFO_V1;
                ret = kpersona_pidinfo(pid, ki);
@@ -118,6 +119,19 @@ static int persona_op_lookup(struct kpersona_info *ki, pid_t pid, uid_t uid)
        return ret;
 }
 
+static int persona_op_support(void)
+{
+       uid_t pna_id = -1;
+       int ret = kpersona_get(&pna_id);
+       if (ret == 0 || errno != ENOSYS) {
+               info("Persona subsystem is supported (id=%d)", pna_id);
+               return 0;
+       }
+
+       info("Persona subsystem is not supported");
+       return ENOSYS;
+}
+
 
 /* = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
  *
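The new "support" op gives callers a cheap way to probe for the persona subsystem
before exercising it. A minimal shell sketch, mirroring what persona_test_run.sh
below does by keying off the ENOSYS exit status (78):

        errno=0
        ./persona_mgr support || errno=$?
        if [ $errno -eq 78 ]; then
                # ENOSYS from persona_op_support: subsystem not built in
                echo "persona subsystem is not supported - skipping"
                exit 0
        fi
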
@@ -137,7 +151,7 @@ static void usage_main(const char *progname, const char *msg, int verbose)
        if (!verbose)
                exit(1);
 
-       printf("\t%-15s\tOne of: create | destroy | lookup\n", "[op]");
+       printf("\t%-15s\tOne of: create | destroy | lookup | support\n", "[op]");
        printf("\t%-15s\tBe verbose\n", "-v");
 
        printf("\t%-15s\tID of the persona\n", "-i id");
@@ -160,7 +174,7 @@ int main(int argc, char **argv)
        int ret;
 
        const char *op_str = NULL;
-       int persona_op = 0;
+       int persona_op = -1;
        struct kpersona_info kinfo;
        uid_t uid = (uid_t)-1;
        pid_t pid = (pid_t)-1;
@@ -184,6 +198,8 @@ int main(int argc, char **argv)
                persona_op = PERSONA_OP_DESTROY;
        else if (strcmp(op_str, "lookup") == 0)
                persona_op = PERSONA_OP_LOOKUP;
+       else if (strcmp(op_str, "support") == 0)
+               persona_op = PERSONA_OP_SUPPORT;
        else if (strcmp(op_str, "help") == 0 || strcmp(op_str, "-h") == 0)
                usage_main(argv[0], NULL, 1);
 
@@ -201,15 +217,27 @@ int main(int argc, char **argv)
                switch (ch) {
                case 'i':
                        ret = atoi(optarg);
-                       if (ret <= 0)
-                               err("Invalid Persona ID: %s", optarg);
+                       if (ret <= 0) {
+                               ret = PERSONA_ID_NONE;
+                       }
                        kinfo.persona_id = (uid_t)ret;
                        break;
                case 't':
-                       ret = atoi(optarg);
-                       if (ret <= PERSONA_INVALID || ret > PERSONA_TYPE_MAX)
-                               err("Invalid type specification: %s", optarg);
-                       kinfo.persona_type = ret;
+                       if (strncmp(optarg, "guest", 6) == 0) {
+                               kinfo.persona_type = PERSONA_GUEST;
+                       } else if (strncmp(optarg, "managed", 8) == 0) {
+                               kinfo.persona_type = PERSONA_MANAGED;
+                       } else if (strncmp(optarg, "priv", 4) == 0) { /* shortcut... */
+                               kinfo.persona_type = PERSONA_PRIV;
+                       } else if (strncmp(optarg, "system", 7) == 0) {
+                               kinfo.persona_type = PERSONA_SYSTEM;
+                       } else {
+                               ret = atoi(optarg);
+                               if (ret <= PERSONA_INVALID || ret > PERSONA_TYPE_MAX) {
+                                       err("Invalid type specification: %s", optarg);
+                               }
+                               kinfo.persona_type = ret;
+                       }
                        break;
                case 'p':
                        ret = atoi(optarg);
@@ -257,10 +285,11 @@ int main(int argc, char **argv)
                }
        }
 
-       if (uid == (uid_t)-1 && persona_op != PERSONA_OP_LOOKUP)
+       if (uid == (uid_t)-1 && persona_op != PERSONA_OP_LOOKUP) {
                uid = kinfo.persona_id;
+       }
 
-       if (kinfo.persona_gmuid && kinfo.persona_ngroups == 0) {
+       if (kinfo.persona_gmuid != KAUTH_UID_NONE && kinfo.persona_ngroups == 0) {
                /*
                 * In order to set the group membership UID, we need to set at
                 * least one group: make it equal to either the GID or UID
@@ -285,6 +314,9 @@ int main(int argc, char **argv)
        case PERSONA_OP_LOOKUP:
                ret = persona_op_lookup(&kinfo, pid, uid);
                break;
+       case PERSONA_OP_SUPPORT:
+               ret = persona_op_support();
+               break;
        default:
                err("Invalid persona op: %d", persona_op);
        }
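Taken together, the persona_mgr changes let the type be given by name ("guest",
"managed", "priv", "system") and let an ID of -1 request auto-assignment. An
illustrative invocation sequence (the login name and ID 1001 are arbitrary example
values, following the pattern persona_test_run.sh uses below):

        ./persona_mgr create  -v -l demo_login -t managed -i 1001
        ./persona_mgr lookup  -u 1001 -l demo_login
        ./persona_mgr destroy -v -i 1001
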
index b6e7782ceffc118b69faeb87bf8f7e677270c5da..521871576ec0f6cd39176bb168f0b0ee310e8332 100644 (file)
@@ -19,6 +19,7 @@
 #include <mach/mach.h>
 #include <mach/task.h>
 #include <mach/vm_param.h>
+#include <sys/kauth.h>
 #include <sys/queue.h>
 #include <sys/sysctl.h>
 #include <sys/types.h>
@@ -72,7 +73,7 @@ static pid_t spawn_child(int argc, char **argv, struct persona_args *pa)
                return -ERR_SYSTEM;
        }
 
-       if (!pa->flags & PA_HAS_ID) {
+       if (!(pa->flags & PA_HAS_ID)) {
                err_print("No persona ID specified!");
                return -ERR_SYSTEM;
        }
@@ -129,6 +130,15 @@ static pid_t spawn_child(int argc, char **argv, struct persona_args *pa)
                }
        }
 
+       if (pa->flags & PA_HAS_GROUPS) {
+               ret = posix_spawnattr_set_persona_groups_np(&attr, pa->kinfo.persona_ngroups, pa->kinfo.persona_groups, KAUTH_UID_NONE);
+               if (ret != 0) {
+                       err_print("posix_spawnattr_set_persona_groups_np (ret=%d)", ret);
+                       ret = -ERR_SPAWN_ATTR;
+                       goto out_err;
+               }
+       }
+
        ret = posix_spawn(&child->pid, argv[0], NULL, &attr, argv, environ);
        if (ret != 0) {
                err_print("posix_spawn (ret=%d)", ret);
@@ -259,6 +269,8 @@ static void usage_main(const char *progname, int verbose)
        printf("\t%-10s\tVerify persona parameters against existing persona (given by -I)\n", "-V");
        printf("\t%-10s\tOverride/verify the user ID of the new process\n", "-u uid");
        printf("\t%-10s\tOverride/verify the group ID of the new process\n", "-g gid");
+       printf("\t%-15s\tGroups to which the persona will belong\n", "-G {groupspec}");
+       printf("\t%-15s\tgroupspec: G1{,G2,G3...}\n", " ");
        printf("\t%-10s\tBe verbose\n", "-v");
        printf("\t%-10s\tDo not wait for the child process\n", "-w");
        printf("\n");
@@ -283,7 +295,12 @@ int main(int argc, char **argv)
                optind = 2;
                ret = child_main_loop(argc, argv);
                if (ret != 1)
+                       exit(ret);
+               if (strcmp(argv[optind], "spawn") != 0) {
+                       printf("child exiting (%s).\n", argv[optind]);
                        exit(0);
+               }
+               optind++;
 
                /*
                 * If we get here, then the child wants us to continue running
@@ -305,18 +322,23 @@ int main(int argc, char **argv)
        /*
         * Argument parse for default overrides:
         */
-       while ((ch = getopt(argc, argv, "Vg:I:u:vwh")) != -1) {
+       while ((ch = getopt(argc, argv, "Vg:G:I:u:vwh")) != -1) {
                switch (ch) {
                case 'V':
                        pa.flags |= PA_SHOULD_VERIFY;
                        break;
                case 'g':
                        pa.kinfo.persona_gid = atoi(optarg);
-                       if (pa.kinfo.persona_gid <= 500)
-                               err("Invalid GID: %d", pa.kinfo.persona_gid);
                        pa.flags |= PA_HAS_GID;
                        pa.flags |= PA_OVERRIDE;
                        break;
+               case 'G':
+                       ret = parse_groupspec(&pa.kinfo, optarg);
+                       if (ret < 0)
+                               err("Invalid groupspec: \"%s\"", optarg);
+                       pa.flags |= PA_HAS_GROUPS;
+                       pa.flags |= PA_OVERRIDE;
+                       break;
                case 'I':
                        pa.kinfo.persona_id = atoi(optarg);
                        if (pa.kinfo.persona_id == 0)
@@ -325,8 +347,6 @@ int main(int argc, char **argv)
                        break;
                case 'u':
                        pa.override_uid = atoi(optarg);
-                       if (pa.override_uid <= 500)
-                               err("Invalid UID: %d", pa.override_uid);
                        pa.flags |= PA_HAS_UID;
                        pa.flags |= PA_OVERRIDE;
                        break;
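The new -G option feeds the group list to posix_spawnattr_set_persona_groups_np().
A usage sketch with arbitrary example IDs; this is the same flag string that
spawn_child() in persona_test_run.sh below assembles:

        ./persona_spawn -v -I 1001 -u 1100 -g 1101 -G 1000,2000,3000 ./persona_spawn child -v -E
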
diff --git a/tools/tests/personas/persona_test_run.sh b/tools/tests/personas/persona_test_run.sh
new file mode 100755 (executable)
index 0000000..77ee923
--- /dev/null
@@ -0,0 +1,569 @@
+#!/bin/bash
+# persona_test_run.sh
+#
+# This file aims to be a comprehensive test suite for the persona subsystem.
+# It uses two tools:
+#   1. persona_mgr - create, destroy, lookup personas
+#   2. persona_spawn - spawn processes into personas with a variety of options
+# The script relies heavily on the particular output of these tools, so if you
+# are modifying / extending those tools, this file also needs to be updated to
+# properly capture the new output. Specifically, the get_persona_info function
+# needs to be maintained / updated.
+#
+# NOTE: the function get_persona_info() also needs to be kept up to date with
+# the types of personas found in bsd/sys/persona.h
+
+# be sure to bail on script errors and unexpected tool failures
+set -e
+
+PERSONA_MGR="${PWD}/persona_mgr"
+PERSONA_SPAWN="${PWD}/persona_spawn"
+
+if [ ! -d "$TMPDIR" ]; then
+       echo "Couldn't find temp directory '$TMPDIR': check permissions/environment?"
+       exit 255
+fi
+
+if [ ! -e "${PERSONA_MGR}" ] ||  [ ! -x "${PERSONA_MGR}" ]; then
+       echo "Can't find '${PERSONA_MGR}': skipping test"
+       exit 0
+fi
+if [ ! -e "${PERSONA_SPAWN}" ] || [ ! -x "${PERSONA_SPAWN}" ]; then
+       echo "Can't find '${PERSONA_SPAWN}': skipping test"
+       exit 0
+fi
+
+function check_for_persona_support() {
+       local errno=0
+       ${PERSONA_MGR} support || errno=$?
+       if [ $errno -eq 78 ]; then
+               echo "Persona subsystem is not supported - skipping tests"
+               exit 0
+       fi
+       return 0
+}
+check_for_persona_support
+
+
+## bail [failure_msg]
+#
+# exit the script with an error code that corresponds to the line number
+# from which this function was invoked. Because we want to exit with a
+# non-zero exit code, we use: 1 + (line % 254). For example, a failure at
+# script line 300 exits with status 47.
+#
+function bail() {
+       local msg="$1"
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       echo "[$line] ERROR: $msg" 1>&2
+       exit $((1 + $line % 254))
+}
+
+## check_return [message_on_failure]
+#
+# Check the return value of the previous command or script line. If the
+# value of '$?' is not 0, then call bail() with an appropriate message.
+#
+function check_return() {
+       local err=$?
+       local msg=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       echo "CHECK: $msg"
+       if [ $err -ne 0 ]; then
+               bail "e=$err: $msg" $line
+       fi
+
+       return 0
+}
+
+## expect_failure [message_on_success]
+#
+# Check the return value of the previous command or script line. If the
+# value of '$?' is 0 (success), then call bail() with a message saying
+# that we expected this previous command/line to fail.
+# 
+function expect_failure() {
+       local err=$?
+       local msg=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       if [ $err -eq 0 ]; then
+               bail "found success, expected failure: $msg" $line
+       fi
+
+       echo "EXPECT: failure: $msg"
+       return 0
+}
+
+## test_num [debug_info] [number]
+#
+# Check that a variable value is a number, bail() on error.
+#
+function test_num() {
+       local type=$1
+       local num=$2
+       local line=$3
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       if [ -z "$num" ]; then
+               bail "invalid (NULL) $type" $line
+       fi
+       [ "$num" -eq "$num" ] 2>/dev/null
+       if [ $? -ne 0 ]; then
+               bail "invalid $type: $num" $line
+       fi
+
+       return 0
+}
+
+## global variables used to return values to callers
+_ID=-1
+_TYPE="invalid"
+_LOGIN=""
+_UID=-1
+_GID=-1
+_NGROUPS=-1
+_GROUPS=""
+
+## get_persona_info {persona_id} {persona_login}
+#
+# Lookup persona info for the given ID/login. At least one of the ID/login
+# parameters must be valid
+function get_persona_info() {
+       local pna_id=${1:-1}
+       local pna_login=${2:- }
+       local line=$3
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       local largs="-u ${pna_id}"
+       if [ "${pna_login}" != " " ]; then
+               largs+=" -l ${pna_login}"
+       fi
+
+       _ID=-1
+       _TYPE=-1
+       _LOGIN=""
+       _UID=-1
+       _GID=-1
+       _NGROUPS=-1
+       _GROUPS=()
+
+       local file="${TMPDIR}/plookup"
+
+       ${PERSONA_MGR} lookup ${largs} > "${file}"
+       check_return "persona lookup of: ${largs}" $line
+
+       _ID=$(cat "${file}" | grep "+id: " | head -1 | sed 's/.*+id:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "Persona ID lookup:${largs}" "$_ID"
+
+       local type=$(cat "${file}" | grep "+type: " | head -1 | sed 's/.*+type:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "+type lookup:${largs}" "$type"
+       ##
+       ## NOTE: keep in sync with bsd/sys/persona.h types!
+       ##
+       if [ $type -eq 1 ]; then
+               _TYPE=guest
+       elif [ $type -eq 2 ]; then
+               _TYPE=managed
+       elif [ $type -eq 3 ]; then
+               _TYPE=priv
+       elif [ $type -eq 4 ]; then
+               _TYPE=system
+       else
+               _TYPE=invalid
+       fi
+
+       _LOGIN=$(cat "${file}" | grep "+login: " | head -1 | sed 's/.*+login:[ ]*"\([^"]*\)".*/\1/')
+       if [ -z "$_LOGIN" ]; then
+               bail "invalid login for pna_id:$_ID: '$_LOGIN'" $line
+       fi
+
+       # these are always the same
+       _UID=$_ID
+
+       _GID=$(cat "${file}" | grep "+gid: " | head -1 | sed 's/.*+gid:[ ]*\([0-9][0-9]*\).*/\1/')
+       test_num "GID lookup:${largs}" "$_GID"
+
+       _NGROUPS=$(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*\([0-9][0-9]*\)[ ][ ]*{.*}.*/\1/')
+       test_num "NGROUPS lookup:${largs}" "$_NGROUPS"
+
+       _GROUPS=( $(cat "${file}" | grep "ngroups: " | head -1 | sed 's/.*ngroups:[ ]*[0-9][0-9]*[ ][ ]*{[ ]*\([^ ].*\)[ ][ ]*}.*/\1/') )
+       if [ $_NGROUPS -gt 0 ]; then
+               if [ -z "${_GROUPS}" ]; then
+                       bail "lookup:${largs}: missing $_NGROUPS groups" $line
+               fi
+               if [ ${#_GROUPS[@]} -ne $_NGROUPS ]; then
+                       bail "lookup:${largs} wrong number of groups ${#_GROUPS[@]} != $_NGROUPS" $line
+               fi
+       fi
+}
+
+## validate_child_info [output_file] [persona_id] {uid} {gid} {groups}
+#
+# Parse the output of the 'persona_spawn' command and validate that
+# the new child process is in the correct persona with the correct
+# process attributes.
+#
+function validate_child_info() {
+       local file=$1
+       local pna_id=$2
+       local uid=${3:--1}
+       local gid=${4:--1}
+       local groups=${5:- }
+       local line=$6
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+       local l=( )
+
+       # get the child's PID
+       local cpid="$(cat "$file" | grep "Child: PID:" | sed 's/.*Child: PID:\([0-9][0-9]*\).*/\1/')"
+       test_num "Child PID" "$cpid" $line
+
+       # validate the child's persona
+       l=( $(cat "$file" | grep "Child: Persona:" | sed 's/.*Child: Persona: \([0-9][0-9]*\) (err:\([0-9][0-9]*\))/\1 \2/') )
+       if [ ${#l[@]} -ne 2 ]; then
+               bail "Invalid Child[$cpid] Persona line" $line
+       fi
+       test_num "Child Persona ID" "${l[0]}" $line
+       test_num "kpersona_info retval" "${l[1]}" $line
+
+       if [ ${l[0]} -ne $pna_id ]; then
+               bail "Child[$cpid] persona:${l[0]} != specified persona:$pna_id" $line
+       fi
+
+       # Validate the UID/GID
+       l=( $(cat "$file" | grep "Child: UID:" | sed 's/.*UID:\([0-9][0-9]*\), GID:\([0-9][0-9]*\).*/\1 \2/') )
+       if [ ${#l[@]} -ne 2 ]; then
+               bail "Invalid Child[$cpid] UID/GID output" $line
+       fi
+       if [ $uid -ge 0 ]; then
+               if [ $uid -ne ${l[0]} ]; then
+                       bail "Child[$cpid] UID:${l[0]} != specified UID:$uid" $line
+               fi
+       fi
+       if [ $gid -ge 0 ]; then
+               if [ $gid -ne ${l[1]} ]; then
+                       bail "Child[$cpid] GID:${l[1]} != specified GID:$gid" $line
+               fi
+       fi
+
+       # TODO: validate / verify groups?
+
+       return 0
+}
+
+
+## spawn_child [persona_id] {uid} {gid} {group_spec}
+#
+# Create a child process that is spawn'd into the persona given by
+# the first argument (pna_id). The new process can have its UID, GID,
+# and group membership properties overridden.
+#
+function spawn_child() {
+       local pna_id=$1
+       local uid=${2:--1}
+       local gid=${3:--1}
+       local groups=${4:- }
+       local line=$5
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       local file="child.${pna_id}"
+       local spawn_args="-I $pna_id"
+       if [ $uid -ge 0 ]; then
+               spawn_args+=" -u $uid"
+               file+=".u$uid"
+       fi
+       if [ $gid -ge 0 ]; then
+               spawn_args+=" -g $gid"
+               file+=".g$gid"
+       fi
+       if [ "$groups" != " " ]; then
+               spawn_args+=" -G $groups"
+               file+="._groups"
+       fi
+
+       echo "SPAWN: $file"
+       ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E > "${TMPDIR}/$file"
+       check_return "child info: $file" $line
+
+       # Grab the specified persona's info so we can
+       # verify the child's info against it.
+       # This function puts data into global variables, e.g. _ID, _GID, etc.
+       get_persona_info ${pna_id} " " $line
+       if [ $uid -lt 0 ]; then
+               uid=$_UID
+       fi
+       if [ $gid -lt 0 ]; then
+               gid=$_GID
+       fi
+       if [ "$groups" == " " ]; then
+               # convert a bash array into a comma-separated list for validation
+               local _g="${_GROUPS[@]}"
+               groups="${_g// /,}"
+       fi
+
+       validate_child_info "${TMPDIR}/$file" "$pna_id" "$uid" "$gid" "$groups" $line
+
+       # TODO: validate that the first child spawned into a persona *cannot* spawn
+       # into a different persona...
+       #if [ $uid -eq 0 ]; then
+       #       ${PERSONA_SPAWN} -v $spawn_args ${PERSONA_SPAWN} child -v -E -R -v -I 99 /bin/echo "This is running in the system persona"
+       #       expect_failure "Spawned child that re-execs into non-default persona" $line
+       #fi
+       return 0
+}
+
+## get_created_id [output_file]
+#
+# Parse the output of the 'persona_mgr' command to determine the ID
+# of the newly created persona.
+#
+function get_created_id() {
+       local file=$1
+       local o=$(cat "$file" | grep "Created persona" | sed 's/.*Created persona \([0-9][0-9]*\):/\1/')
+       echo $o
+       return 0
+}
+
+## create_persona [login_name] [persona_type] {persona_id} {gid} {group_spec}
+#
+# Create a new persona with given parameters.
+#
+# Returns: the newly created persona ID via the global variable, $_ID
+#
+function create_persona() {
+       local name=${1}
+       local type=${2}
+       local pna_id=${3:--1}
+       local gid=${4:--1}
+       local groups=${5:- }
+       local line=$6
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       if [ -z "$name" -o -z "$type" ]; then
+               bail "Invalid arguments to create_persona '$name' '$type'" $line
+       fi
+
+       local file="persona.at${line}"
+       # persona ID of '-1' is auto-assigned
+       local spawn_args="-v -l $name -i $pna_id"
+       if [ $pna_id -eq -1 ]; then
+               file+=".auto"
+       else
+               file+=".${pna_id}"
+       fi
+
+       spawn_args+=" -t $type"
+       file+=".$type"
+
+       if [ $gid -ge 0 ]; then
+               spawn_args+=" -g $gid"
+               file+=".g$gid"
+       fi
+       if [ "$groups" != " " ]; then
+               spawn_args+=" -G $groups"
+               file+="._groups"
+       fi
+
+       echo "CREATE: $file"
+       ${PERSONA_MGR} create ${spawn_args} > "${TMPDIR}/${file}"
+       check_return "persona creation: ${file}" $line
+       # test output should include persona creation output for later debugging
+       cat "${TMPDIR}/${file}"
+
+       # validate the output of the persona_mgr tool (what we think we created)
+       _ID=`get_created_id "${TMPDIR}/${file}"`
+       test_num "persona_id for $file" "$_ID" $line
+       if [ ${pna_id} -gt 0 ]; then
+               if [ $_ID -ne ${pna_id} ]; then
+                       bail "Created persona doesn't have expected ID $_ID != ${pna_id}" $line
+               fi
+       fi
+
+       # validate the entire persona information (what a kpersona_lookup says we created)
+       # This function puts data into global variables, e.g. _ID, _LOGIN, _GID, etc.
+       echo "VALIDATE: ${file}"
+       get_persona_info ${pna_id} "$name" $line
+       if [ "$name" != "$_LOGIN" ]; then
+               bail "${file}: unexpected login '$_LOGIN' != '$name'" $line
+       fi
+       if [ "$type" != "$_TYPE" ]; then
+               bail "${file}: unexpected type '$_TYPE' != '$type'" $line
+       fi
+       if [ ${pna_id} -gt 0 ]; then
+               if [ ${pna_id} -ne $_ID ]; then
+                       bail "${file}: unexpected ID '$_ID' != '${pna_id}'" $line
+               fi
+       fi
+       if [ $gid -ge 0 ]; then
+               if [ $gid -ne $_GID ]; then
+                       bail "${file}: unexpected GID '$_GID' != '$gid'" $line
+               fi
+       fi
+       if [ "$groups" != " " ]; then
+               local _g="${_GROUPS[@]}"
+               if [ "${_g// /,}" != "$groups" ]; then
+                       bail "${file}: unexpected groups '${_g// /,}' != '$groups'" $line
+               fi
+       fi
+
+       return 0
+}
+
+## destroy_persona [persona_id]
+#
+# Destroy the given persona.
+#
+function destroy_persona() {
+       local pna_id=$1
+       local line=$2
+       if [ -z "$line" ]; then
+               line=${BASH_LINENO[0]}
+       fi
+
+       echo "DESTROY: ${pna_id}"
+       ${PERSONA_MGR} destroy -v -i ${pna_id}
+       check_return "destruction of ${pna_id}" $line
+}
+
+#
+#
+# Begin Tests!
+#
+#
+echo "Running persona tests [$LINENO] ($TMPDIR)"
+
+##
+## Test Group 0: basic creation + spawn tests
+##
+
+# default group, specific ID
+create_persona "test0_1" "guest" 1001
+P0ID=$_ID
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1000,2000,3000
+spawn_child $P0ID 1100 -1 1000,2000,3000
+spawn_child $P0ID -1 -1 1000,2000,3000
+destroy_persona $P0ID
+
+# specific ID, non-default group
+create_persona "test0_2" "guest" 1002 2000
+P0ID=$_ID
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1000,2000,3000
+spawn_child $P0ID 1100 -1 1000,2000,3000
+spawn_child $P0ID -1 -1 1000,2000,3000
+destroy_persona $P0ID
+
+# non-default set of groups
+create_persona "test0_3" "guest" 1003 2000 2000,3000,4000
+P0ID=$_ID
+spawn_child $P0ID
+spawn_child $P0ID 1100
+spawn_child $P0ID 0
+spawn_child $P0ID -1 1101
+spawn_child $P0ID 1100 1101
+spawn_child $P0ID 1100 1101 1111,2222,3333
+spawn_child $P0ID 1100 -1 1111,2222,3333
+spawn_child $P0ID -1 -1 1111,2222,3333
+destroy_persona $P0ID
+
+
+##
+## Test Group 1: persona creation / re-creation
+##
+
+# Create 4 personas with auto-assigned IDs
+create_persona "test1_1" "guest"
+P1ID=$_ID
+create_persona "test1_2" "managed"
+P2ID=$_ID
+create_persona "test1_3" "priv"
+P3ID=$_ID
+create_persona "test1_4" "system"
+P4ID=$_ID
+
+D1=$(($P2ID - $P1ID))
+D2=$(($P3ID - $P2ID))
+D3=$(($P4ID - $P3ID))
+if [ $D1 -ne $D2 -o $D1 -ne $D3 -o $D2 -ne $D3 ]; then
+       bail "inconsistent automatic Persona ID increment: $D1,$D2,$D3 ($P1ID,$P2ID,$P3ID,$P4ID)"
+fi
+
+# make sure we can't re-allocate the same name / ID
+${PERSONA_MGR} create -v -l test1_1 -t guest -i -1 && expect_failure "re-create same name:test1_1 type:guest"
+${PERSONA_MGR} create -v -l test1_1 -t managed -i -1 && expect_failure "re-create same name:test1_1 type:managed"
+${PERSONA_MGR} create -v -l test1_1_new -t managed -i $P1ID && expect_failure "re-create $P1ID with new name:test1_1_new type:managed"
+
+##
+## Test Group 2: auto-assigned ID tricks
+##
+
+# Notice the difference in IDs, then try to create a persona by
+# specifying an ID that will match the next auto-assigned ID
+# (should succeed)
+P5ID_REQ=$(($P4ID + $D2))
+create_persona "test2_1" "guest" ${P5ID_REQ}
+P5ID=$_ID
+if [ ! $P5ID -eq ${P5ID_REQ} ]; then
+       bail "test2_1: ${P5ID_REQ} != $P5ID"
+fi
+
+# try to create a persona with auto-assigned ID
+# (resulting persona should have ID != P5ID)
+create_persona "test2_2" "guest"
+P6ID=$_ID
+if [ $P6ID -eq $P5ID ]; then
+       bail "created duplicate persona IDs: $P6ID == $P5ID"
+fi
+
+##
+## Test Group 3: persona destruction
+##
+
+destroy_persona $P1ID
+destroy_persona $P2ID
+destroy_persona $P3ID
+destroy_persona $P4ID
+destroy_persona $P5ID
+destroy_persona $P6ID
+
+# try to re-destroy the personas
+# (should fail)
+${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (1/2) $P1ID"
+${PERSONA_MGR} destroy -v -i $P1ID && expect_failure "re-destroy (2/2) $P1ID"
+${PERSONA_MGR} destroy -v -i $P2ID && expect_failure "re-destroy $P2ID"
+${PERSONA_MGR} destroy -v -i $P3ID && expect_failure "re-destroy $P3ID"
+${PERSONA_MGR} destroy -v -i $P4ID && expect_failure "re-destroy $P4ID"
+${PERSONA_MGR} destroy -v -i $P5ID && expect_failure "re-destroy $P5ID"
+${PERSONA_MGR} destroy -v -i $P6ID && expect_failure "re-destroy $P6ID"
+
+# cleanup
+rm -rf "${TMPDIR}"
+
+echo ""
+echo "${0##*/}: SUCCESS"
+exit 0
index 5bc38ccd72e33b1ef6d89a98f0cbfe03a3c24b79..42472547b2447bc64aef88e6889ea36a91f8f0d1 100755 (executable)
@@ -92,7 +92,7 @@ trace_codename("MACH_CLOCK_BRIDGE_RCV_TS", function(buf)
                        prefix, format_timestamp_arm(buf[1]), format_timestamp_intel(buf[2]))
        else
                local skip = ""
-               if buf[3] == 1 then
+               if buf[1] == 0 then
                        skip = "Int handler"
                end
                printf("%s ( %-10s %-10s ) %s\n",
@@ -121,9 +121,9 @@ trace_codename("MACH_CLOCK_BRIDGE_TS_MISMATCH", function(buf)
 
        local diff = (math.abs(buf[2] - buf[3]))/1000000
 
-       printf("%s ( Cur: %-10s Pred: %-10s Diff: %5.6f ms ) @ %-20s\n",
+       printf("%s ( Cur: %-10s Pred: %-10s Diff: %5.6f ms Count: %d ) @ %-20s\n",
                prefix, format_timestamp_intel(buf[2]), format_timestamp_intel(buf[3]),
-               diff, format_timestamp_arm(buf[1]))
+               diff, buf[4], format_timestamp_arm(buf[1]))
 
 end)
 
index bb5b9545f7517c3cf01fae4a399c4ba18397fd43..07cbd9fbdfe2684061ae28eac8a4fe3c42420306 100755 (executable)
@@ -75,9 +75,10 @@ function state_string(strings, state)
 end
 
 kqrequest_state_strings = {
-       ['PROCESSING'] = 0x1,
-       ['THREQUESTED'] = 0x2,
-       ['WAKEUP'] = 0x4
+       ['THREQUESTED'] = 0x02,
+       ['WAKEUP'] = 0x04,
+       ['BOUND'] = 0x08,
+       ['DRAIN'] = 0x40,
 }
 
 kqueue_state_strings = {
@@ -100,7 +101,7 @@ knote_state_strings = {
        ['QUEUED'] = 0x0002,
        ['DISABLED'] = 0x0004,
        ['DROPPING'] = 0x0008,
-       ['USEWAIT'] = 0x0010,
+       ['LOCKED'] = 0x0010,
        ['ATTACHING'] = 0x0020,
        ['STAYACTIVE'] = 0x0040,
        ['DEFERDELETE'] = 0x0080,
@@ -108,28 +109,10 @@ knote_state_strings = {
        ['DISPATCH'] = 0x0200,
        ['UDATA_SPECIFIC'] = 0x0400,
        ['SUPPRESSED'] = 0x0800,
-       ['STOLENDROP'] = 0x1000,
+       ['MERGE_QOS'] = 0x1000,
        ['REQVANISH'] = 0x2000,
        ['VANISHED'] = 0x4000,
 }
-knote_state_strings = {
-       ['ACTIVE'] = 0x0001,
-       ['QUEUED'] = 0x0002,
-       ['DISABLED'] = 0x0004,
-       ['DROPPING'] = 0x0008,
-       ['USEWAIT'] = 0x0010,
-       ['ATTACHING'] = 0x0020,
-       ['STAYACTIVE'] = 0x0040,
-       ['DEFERDELETE'] = 0x0080,
-       ['ATTACHED'] = 0x0100,
-       ['DISPATCH'] = 0x0200,
-       ['UDATA_SPECIFIC'] = 0x0400,
-       ['SUPPRESSED'] = 0x0800,
-       ['STOLENDROP'] = 0x1000,
-       ['REQVANISH'] = 0x2000,
-       ['VANISHED'] = 0x4000,
-}
-
 
 kevent_flags_strings = {
        ['ADD'] = 0x0001,
@@ -272,7 +255,7 @@ trace_eventname("KEVENT_kqwl_bind", function(buf)
                        event_prefix_string(buf, false), buf.arg2, qos_string(qos),
                        kqr_override_qos_delta,
                        state_string(kqrequest_state_strings, kqr_state),
-                       duplicate ? ", duplicate" : "")
+                       duplicate and ", duplicate" or "")
 end)
 
 trace_eventname("KEVENT_kqwl_unbind", function(buf)
diff --git a/tools/trace/wqtrace.lua b/tools/trace/wqtrace.lua
new file mode 100755 (executable)
index 0000000..ae853d4
--- /dev/null
@@ -0,0 +1,307 @@
+#!/usr/local/bin/luatrace -s
+
+trace_codename = function(codename, callback)
+       local debugid = trace.debugid(codename)
+       if debugid ~= 0 then
+               trace.single(debugid,callback)
+       else
+               printf("WARNING: Cannot locate debugid for '%s'\n", codename)
+       end
+end
+
+initial_timestamp = 0
+pid_map = {};
+get_prefix = function(buf)
+       if initial_timestamp == 0 then
+               initial_timestamp = buf.timestamp
+       end
+       local secs = trace.convert_timestamp_to_nanoseconds(buf.timestamp - initial_timestamp) / 1000000000
+
+       local prefix
+       if trace.debugid_is_start(buf.debugid) then
+               prefix = "→"
+       elseif trace.debugid_is_end(buf.debugid) then
+               prefix = "←"
+       else
+               prefix = "↔"
+       end
+
+       local proc
+       if buf.pid == buf[1] then
+               proc = buf.command
+               if pid_map[buf[1]] == nil then
+                       pid_map[buf[1]] = buf.command
+               end
+       elseif pid_map[buf[1]] ~= nil then
+               proc = pid_map[buf[1]]
+       else
+               proc = "UNKNOWN"
+       end
+
+       return string.format("%s %6.9f %-17s [%05d.%06x] %-24s",
+               prefix, secs, proc, buf.pid, buf.threadid, buf.debugname)
+end
+
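+-- Decode a pthread_priority_t value: bit 0x02000000 marks the event-manager
+-- thread, and bits 8-23 carry a one-hot QoS class bit that is mapped to a
+-- two-letter label (0x20 UI user-interactive, 0x10 IN user-initiated,
+-- 0x08 DF default, 0x04 UT utility, 0x02 BG background, 0x01 MT maintenance);
+-- the raw priority word is appended in hex for reference.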
+parse_pthread_priority = function(pri)
+       pri = pri & 0xffffffff
+       if (pri & 0x02000000) == 0x02000000 then
+               return "Manager"
+       end
+       local qos = (pri & 0x00ffff00) >> 8
+       if qos == 0x20 then
+               return string.format("UI[%x]", pri);
+       elseif qos == 0x10 then
+               return string.format("IN[%x]", pri);
+       elseif qos == 0x08 then
+               return string.format("DF[%x]", pri);
+       elseif qos == 0x04 then
+               return string.format("UT[%x]", pri);
+       elseif qos == 0x02 then
+               return string.format("BG[%x]", pri);
+       elseif qos == 0x01 then
+               return string.format("MT[%x]", pri);
+       elseif qos == 0x00 then
+               return string.format("--[%x]", pri);
+       else
+               return string.format("??[%x]", pri);
+       end
+end
+
+parse_thread_qos = function(pri)
+       if pri == 7 then
+               return string.format("MG", pri);
+       elseif pri == 6 then
+               return string.format("UI", pri);
+       elseif pri == 5 then
+               return string.format("IN", pri);
+       elseif pri == 4 then
+               return string.format("DF", pri);
+       elseif pri == 3 then
+               return string.format("UT", pri);
+       elseif pri == 2 then
+               return string.format("BG", pri);
+       elseif pri == 1 then
+               return string.format("MT", pri);
+       elseif pri == 0 then
+               return string.format("--", pri);
+       else
+               return string.format("??[%x]", pri);
+       end
+end
+
+parse_thactive_req_qos = function(pri)
+       if pri ~= 0 then
+               return parse_thread_qos(pri)
+       end
+       return "None"
+end
+
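+-- The workqueue "thactive" state arrives as two 64-bit words: each QoS bucket's
+-- active-thread count occupies a 16-bit field spread across the two words, and
+-- the top 16 bits of the high word hold the request QoS printed as "req:"
+-- (0 prints as "None").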
+get_thactive = function(low, high)
+       return string.format("req: %s, MG: %d, UI: %d, IN: %d, DE: %d, UT: %d, BG: %d, MT: %d",
+                       parse_thactive_req_qos(high >> (16 * 3)), (high >> (2 * 16)) & 0xffff,
+                       (high >> (1 * 16)) & 0xffff, (high >> (0 * 16)) & 0xffff,
+                       (low  >> (3 * 16)) & 0xffff, (low  >> (2 * 16)) & 0xffff,
+                       (low  >> (1 * 16)) & 0xffff, (low  >> (0 * 16)) & 0xffff)
+end
+
+-- workqueue lifecycle
+
+trace_codename("wq_pthread_exit", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tprocess is exiting\n",prefix)
+       else
+               printf("%s\tworkqueue marked as exiting and timer is complete\n",prefix)
+       end
+end)
+
+trace_codename("wq_workqueue_exit", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tall threads have exited, cleaning up\n",prefix)
+       else
+               printf("%s\tclean up complete\n",prefix)
+       end
+end)
+
+trace_codename("wq_start_add_timer", function(buf)
+       local prefix = get_prefix(buf)
+       printf("%s\tarming timer to fire in %d us (flags: %x, reqcount: %d)\n",
+               prefix, buf.arg4, buf.arg3, buf.arg2)
+end)
+
+trace_codename("wq_add_timer", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tadd_timer fired (flags: %x, nthreads: %d, thidlecount: %d)\n",
+                       prefix, buf.arg2, buf.arg3, buf.arg4)
+       elseif trace.debugid_is_end(buf.debugid) then
+               printf("%s\tadd_timer completed (start_timer: %x, nthreads: %d, thidlecount: %d)\n",
+                       prefix, buf.arg2, buf.arg3, buf.arg4)
+       end
+end)
+
+trace_codename("wq_select_threadreq", function(buf)
+       local prefix = get_prefix(buf)
+       if buf[2] == 0 then
+               printf("%s\tSelection failed: process exiting\n", prefix)
+       elseif buf[2] == 1 then
+               printf("%s\tSelection failed: no request\n", prefix)
+       elseif buf[2] == 2 then
+               printf("%s\tSelection failed: throttled\n", prefix)
+       end
+end)
+
+trace_codename("wq_creator_select", function(buf)
+       local prefix = get_prefix(buf)
+       if buf[2] == 1 then
+               printf("%s\t\tcreator %x overridden at %s\n", prefix, buf[3],
+                       parse_thread_qos(buf[4]))
+       elseif buf[2] == 2 then
+               printf("%s\t\tcreator %x selected at %s\n", prefix, buf[3],
+                       parse_thread_qos(buf[4]))
+       elseif buf[2] == 3 then
+               printf("%s\t\tcreator idled (%d yields)\n", prefix, buf[4])
+       elseif buf[2] == 4 then
+               printf("%s\t\tcreator removed (%d yields)\n", prefix, buf[4])
+       end
+end)
+
+trace_codename("wq_creator_yield", function(buf)
+       local prefix = get_prefix(buf)
+       local reason = "unknown"
+       if buf[2] == 1 then
+               reason = "fast steal rate"
+       elseif buf[2] == 2 then
+               reason = "above ncpu scheduled"
+       end
+       printf("%s\t\tcreator yielded (%s, current:%d snapshot:%d)\n",
+                       prefix, reason, buf[3], buf[4])
+end)
+
+trace_codename("wq_thread_logical_run", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tthread unparking (request %x)\n", prefix, buf[2])
+       else
+               printf("%s\tthread parking\n", prefix)
+       end
+end)
+
+trace.enable_thread_cputime()
+runthread_time_map = {}
+runthread_cputime_map = {}
+trace_codename("wq_runthread", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tSTART running thread\n", prefix)
+               runthread_time_map[buf.threadid] = buf.timestamp;
+               runthread_cputime_map[buf.threadid] = trace.cputime_for_thread(buf.threadid);
+       elseif runthread_time_map[buf.threadid] then
+               local time = buf.timestamp - runthread_time_map[buf.threadid]
+               local cputime = trace.cputime_for_thread(buf.threadid) - runthread_cputime_map[buf.threadid]
+
+               local time_ms = trace.convert_timestamp_to_nanoseconds(time) / 1000000
+               local cputime_ms = trace.convert_timestamp_to_nanoseconds(cputime) / 1000000
+
+               printf("%s\tDONE running thread: time = %6.6f ms, cputime = %6.6f ms\n",
+                               prefix, time_ms, cputime_ms)
+
+               runthread_time_map[buf.threadid] = 0
+               runthread_cputime_map[buf.threadid] = 0
+       elseif trace.debugid_is_end(buf.debugid) then
+               printf("%s\tDONE running thread\n", prefix)
+       end
+end)
+
+trace_codename("wq_thactive_update", function(buf)
+       local prefix = get_prefix(buf)
+       local thactive = get_thactive(buf[2], buf[3])
+       printf("%s\tthactive updated (%s)\n", prefix, thactive)
+end)
+
+trace_codename("wq_thread_block", function(buf)
+       local prefix = get_prefix(buf)
+       local req_pri = parse_thread_qos(buf[3] >> 8)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tthread blocked (activecount: %d, priority: %s, req_pri: %s, reqcount: %d, start_timer: %d)\n",
+                       prefix, buf[2], parse_thread_qos(buf[3] & 0xff), req_pri, buf[4] >> 1, buf[4] & 0x1)
+       else
+               printf("%s\tthread unblocked (activecount: %d, priority: %s, req_pri: %s, threads_scheduled: %d)\n",
+                       prefix, buf[2], parse_thread_qos(buf[3] & 0xff), req_pri, buf[4])
+       end
+end)
+
+trace_codename("wq_thread_create_failed", function(buf)
+       local prefix = get_prefix(buf)
+       if buf[3] == 0 then
+               printf("%s\tfailed to create new workqueue thread, kern_return: 0x%x\n",
+                       prefix, buf[2])
+       elseif buf[3] == 1 then
+               printf("%s\tfailed to vm_map workq thread stack: 0x%x\n", prefix, buf[2])
+       elseif buf[3] == 2 then
+               printf("%s\tfailed to vm_protect workq thread guardsize: 0x%x\n", prefix, buf[2])
+       end
+end)
+
+trace_codename("wq_thread_create", function(buf)
+       printf("%s\tcreated new workqueue thread\n", get_prefix(buf))
+end)
+
+trace_codename("wq_thread_terminate", function(buf)
+       local prefix = get_prefix(buf)
+       local what
+       if trace.debugid_is_start(buf.debugid) then
+               what = "try to terminate thread"
+       else
+               what = "terminated thread"
+       end
+       printf("%s\t%s: currently idle %d\n", prefix, what, buf[2])
+end)
+
+trace_codename("wq_wqops_reqthreads", function(buf)
+       local prefix = get_prefix(buf)
+       printf("%s\tlegacy thread request made for %d threads at %s\n", prefix, buf[2], parse_pthread_priority(buf[3]));
+end)
+
+trace_codename("wq_thread_request_initiate", function(buf)
+       local prefix = get_prefix(buf)
+       printf("%s\tthread request %x made at %s (count:%d)\n", prefix, buf[2], parse_thread_qos(buf[3]), buf[4]);
+end)
+
+trace_codename("wq_thread_request_modify", function(buf)
+       local prefix = get_prefix(buf)
+       printf("%s\tthread request %x priority updated to %s\n", prefix, buf[2], parse_thread_qos(buf[3]));
+end)
+
+trace_codename("wq_thread_request_cancel", function(buf)
+       local prefix = get_prefix(buf)
+       printf("%s\tthread request %x canceled\n", prefix, buf[2]);
+end)
+
+trace_codename("wq_constrained_admission", function(buf)
+       local prefix = get_prefix(buf)
+       if buf[2] == 1 then
+               printf("fail: %s\twq_constrained_threads_scheduled=%d >= wq_max_constrained_threads=%d\n",
+                               prefix, buf[3], buf[4])
+       elseif (buf[2] == 2) or (buf[2] == 3) then
+               local success = nil;
+               if buf[2] == 2 then success = "success"
+               else success = "fail" end
+               printf("%s\t%s\tthactive_count=%d + busycount=%d >= wq->wq_max_concurrency\n",
+                               prefix, success, buf[3], buf[4])
+       end
+end)
+
+trace_codename("wq_death_call", function(buf)
+       local prefix = get_prefix(buf)
+       if trace.debugid_is_start(buf.debugid) then
+               printf("%s\tentering death call\n", prefix);
+       elseif trace.debugid_is_end(buf.debugid) then
+               printf("%s\tleaving death call\n", prefix);
+       else
+               printf("%s\tscheduling death call\n", prefix);
+       end
+end)
+--
+-- vim:ts=4:sw=4:noet: